From b3626f0a3576152256bbbd7fedab90e063037ba1 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sat, 16 Dec 2023 18:01:45 +1100 Subject: [PATCH 01/91] linux 6.7 compat: simplify current_time() check 6.7 changed the names of the time members in struct inode, so we can't assign back to it because we don't know its name. In practice this doesn't matter though - if we're missing current_time(), then we must be on <4.9, and we know our fallback will need to return timespec. Signed-off-by: Rob Norris Sponsored-by: https://github.com/sponsors/robn --- config/kernel-current-time.m4 | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/config/kernel-current-time.m4 b/config/kernel-current-time.m4 index 3ceb5f63efa..ab7d9c5cedb 100644 --- a/config/kernel-current-time.m4 +++ b/config/kernel-current-time.m4 @@ -2,12 +2,15 @@ dnl # dnl # 4.9, current_time() added dnl # 4.18, return type changed from timespec to timespec64 dnl # +dnl # Note that we don't care about the return type in this check. If we have +dnl # to implement a fallback, we'll know we're <4.9, which was timespec. +dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_CURRENT_TIME], [ ZFS_LINUX_TEST_SRC([current_time], [ #include ], [ struct inode ip __attribute__ ((unused)); - ip.i_atime = current_time(&ip); + (void) current_time(&ip); ]) ]) From 3c13601a12b1739d09cec36eb5057b24141b4ae7 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sat, 16 Dec 2023 22:31:32 +1100 Subject: [PATCH 02/91] linux 6.7 compat: use inode atime/mtime accessors 6.6 made i_ctime inaccessible; 6.7 has done the same for i_atime and i_mtime. This extends the method used for ctime in b37f29341 to atime and mtime as well. 
Signed-off-by: Rob Norris Sponsored-by: https://github.com/sponsors/robn --- config/kernel-inode-times.m4 | 78 ++++++++++++++++++++++++++++++ include/os/linux/zfs/sys/zpl.h | 20 ++++++++ module/os/linux/zfs/zfs_ctldir.c | 4 +- module/os/linux/zfs/zfs_vnops_os.c | 33 ++++++++----- module/os/linux/zfs/zfs_znode.c | 45 +++++++++-------- module/os/linux/zfs/zpl_inode.c | 3 +- 6 files changed, 148 insertions(+), 35 deletions(-) diff --git a/config/kernel-inode-times.m4 b/config/kernel-inode-times.m4 index aae95abf172..4d861596ed0 100644 --- a/config/kernel-inode-times.m4 +++ b/config/kernel-inode-times.m4 @@ -52,6 +52,48 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_TIMES], [ memset(&ip, 0, sizeof(ip)); inode_set_ctime_to_ts(&ip, ts); ]) + + dnl # + dnl # 6.7 API change + dnl # i_atime/i_mtime no longer directly accessible, must use + dnl # inode_get_mtime(ip), inode_set_mtime*(ip) to + dnl # read/write. + dnl # + ZFS_LINUX_TEST_SRC([inode_get_atime], [ + #include + ],[ + struct inode ip; + + memset(&ip, 0, sizeof(ip)); + inode_get_atime(&ip); + ]) + ZFS_LINUX_TEST_SRC([inode_get_mtime], [ + #include + ],[ + struct inode ip; + + memset(&ip, 0, sizeof(ip)); + inode_get_mtime(&ip); + ]) + + ZFS_LINUX_TEST_SRC([inode_set_atime_to_ts], [ + #include + ],[ + struct inode ip; + struct timespec64 ts = {0}; + + memset(&ip, 0, sizeof(ip)); + inode_set_atime_to_ts(&ip, ts); + ]) + ZFS_LINUX_TEST_SRC([inode_set_mtime_to_ts], [ + #include + ],[ + struct inode ip; + struct timespec64 ts = {0}; + + memset(&ip, 0, sizeof(ip)); + inode_set_mtime_to_ts(&ip, ts); + ]) ]) AC_DEFUN([ZFS_AC_KERNEL_INODE_TIMES], [ @@ -90,4 +132,40 @@ AC_DEFUN([ZFS_AC_KERNEL_INODE_TIMES], [ ],[ AC_MSG_RESULT(no) ]) + + AC_MSG_CHECKING([whether inode_get_atime() exists]) + ZFS_LINUX_TEST_RESULT([inode_get_atime], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_INODE_GET_ATIME, 1, + [inode_get_atime() exists in linux/fs.h]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([whether inode_set_atime_to_ts() exists]) + 
ZFS_LINUX_TEST_RESULT([inode_set_atime_to_ts], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_INODE_SET_ATIME_TO_TS, 1, + [inode_set_atime_to_ts() exists in linux/fs.h]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([whether inode_get_mtime() exists]) + ZFS_LINUX_TEST_RESULT([inode_get_mtime], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_INODE_GET_MTIME, 1, + [inode_get_mtime() exists in linux/fs.h]) + ],[ + AC_MSG_RESULT(no) + ]) + + AC_MSG_CHECKING([whether inode_set_mtime_to_ts() exists]) + ZFS_LINUX_TEST_RESULT([inode_set_mtime_to_ts], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_INODE_SET_MTIME_TO_TS, 1, + [inode_set_mtime_to_ts() exists in linux/fs.h]) + ],[ + AC_MSG_RESULT(no) + ]) ]) diff --git a/include/os/linux/zfs/sys/zpl.h b/include/os/linux/zfs/sys/zpl.h index 9b729be6d74..91a4751fffb 100644 --- a/include/os/linux/zfs/sys/zpl.h +++ b/include/os/linux/zfs/sys/zpl.h @@ -273,5 +273,25 @@ extern long zpl_ioctl_fideduperange(struct file *filp, void *arg); #else #define zpl_inode_set_ctime_to_ts(ip, ts) (ip->i_ctime = ts) #endif +#ifdef HAVE_INODE_GET_ATIME +#define zpl_inode_get_atime(ip) inode_get_atime(ip) +#else +#define zpl_inode_get_atime(ip) (ip->i_atime) +#endif +#ifdef HAVE_INODE_SET_ATIME_TO_TS +#define zpl_inode_set_atime_to_ts(ip, ts) inode_set_atime_to_ts(ip, ts) +#else +#define zpl_inode_set_atime_to_ts(ip, ts) (ip->i_atime = ts) +#endif +#ifdef HAVE_INODE_GET_MTIME +#define zpl_inode_get_mtime(ip) inode_get_mtime(ip) +#else +#define zpl_inode_get_mtime(ip) (ip->i_mtime) +#endif +#ifdef HAVE_INODE_SET_MTIME_TO_TS +#define zpl_inode_set_mtime_to_ts(ip, ts) inode_set_mtime_to_ts(ip, ts) +#else +#define zpl_inode_set_mtime_to_ts(ip, ts) (ip->i_mtime = ts) +#endif #endif /* _SYS_ZPL_H */ diff --git a/module/os/linux/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c index 94e25fa0ae8..54ed70d0394 100644 --- a/module/os/linux/zfs/zfs_ctldir.c +++ b/module/os/linux/zfs/zfs_ctldir.c @@ -520,8 +520,8 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id, ip->i_uid 
= SUID_TO_KUID(0); ip->i_gid = SGID_TO_KGID(0); ip->i_blkbits = SPA_MINBLOCKSHIFT; - ip->i_atime = now; - ip->i_mtime = now; + zpl_inode_set_atime_to_ts(ip, now); + zpl_inode_set_mtime_to_ts(ip, now); zpl_inode_set_ctime_to_ts(ip, now); ip->i_fop = fops; ip->i_op = ops; diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index b464f615cdd..65d1d786ae5 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -2435,15 +2435,17 @@ top: if ((mask & ATTR_ATIME) || zp->z_atime_dirty) { zp->z_atime_dirty = B_FALSE; - ZFS_TIME_ENCODE(&ip->i_atime, atime); + inode_timespec_t tmp_atime; + tmp_atime = zpl_inode_get_atime(ZTOI(zp)); + ZFS_TIME_ENCODE(&tmp_atime, atime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, sizeof (atime)); } if (mask & (ATTR_MTIME | ATTR_SIZE)) { ZFS_TIME_ENCODE(&vap->va_mtime, mtime); - ZTOI(zp)->i_mtime = zpl_inode_timestamp_truncate( - vap->va_mtime, ZTOI(zp)); + zpl_inode_set_mtime_to_ts(ZTOI(zp), + zpl_inode_timestamp_truncate(vap->va_mtime, ZTOI(zp))); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_MTIME(zfsvfs), NULL, mtime, sizeof (mtime)); @@ -3657,7 +3659,7 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, caddr_t va; int err = 0; uint64_t mtime[2], ctime[2]; - inode_timespec_t tmp_ctime; + inode_timespec_t tmp_ts; sa_bulk_attr_t bulk[3]; int cnt = 0; struct address_space *mapping; @@ -3821,9 +3823,10 @@ zfs_putpage(struct inode *ip, struct page *pp, struct writeback_control *wbc, &zp->z_pflags, 8); /* Preserve the mtime and ctime provided by the inode */ - ZFS_TIME_ENCODE(&ip->i_mtime, mtime); - tmp_ctime = zpl_inode_get_ctime(ip); - ZFS_TIME_ENCODE(&tmp_ctime, ctime); + tmp_ts = zpl_inode_get_mtime(ip); + ZFS_TIME_ENCODE(&tmp_ts, mtime); + tmp_ts = zpl_inode_get_ctime(ip); + ZFS_TIME_ENCODE(&tmp_ts, ctime); zp->z_atime_dirty = B_FALSE; zp->z_seq++; @@ -3873,7 +3876,7 @@ zfs_dirty_inode(struct inode *ip, int flags) zfsvfs_t
*zfsvfs = ITOZSB(ip); dmu_tx_t *tx; uint64_t mode, atime[2], mtime[2], ctime[2]; - inode_timespec_t tmp_ctime; + inode_timespec_t tmp_ts; sa_bulk_attr_t bulk[4]; int error = 0; int cnt = 0; @@ -3918,10 +3921,12 @@ zfs_dirty_inode(struct inode *ip, int flags) SA_ADD_BULK_ATTR(bulk, cnt, SA_ZPL_CTIME(zfsvfs), NULL, &ctime, 16); /* Preserve the mode, mtime and ctime provided by the inode */ - ZFS_TIME_ENCODE(&ip->i_atime, atime); - ZFS_TIME_ENCODE(&ip->i_mtime, mtime); - tmp_ctime = zpl_inode_get_ctime(ip); - ZFS_TIME_ENCODE(&tmp_ctime, ctime); + tmp_ts = zpl_inode_get_atime(ip); + ZFS_TIME_ENCODE(&tmp_ts, atime); + tmp_ts = zpl_inode_get_mtime(ip); + ZFS_TIME_ENCODE(&tmp_ts, mtime); + tmp_ts = zpl_inode_get_ctime(ip); + ZFS_TIME_ENCODE(&tmp_ts, ctime); mode = ip->i_mode; zp->z_mode = mode; @@ -3964,7 +3969,9 @@ zfs_inactive(struct inode *ip) if (error) { dmu_tx_abort(tx); } else { - ZFS_TIME_ENCODE(&ip->i_atime, atime); + inode_timespec_t tmp_atime; + tmp_atime = zpl_inode_get_atime(ip); + ZFS_TIME_ENCODE(&tmp_atime, atime); mutex_enter(&zp->z_lock); (void) sa_update(zp->z_sa_hdl, SA_ZPL_ATIME(zfsvfs), (void *)&atime, sizeof (atime), tx); diff --git a/module/os/linux/zfs/zfs_znode.c b/module/os/linux/zfs/zfs_znode.c index f71026da83c..b99df188c64 100644 --- a/module/os/linux/zfs/zfs_znode.c +++ b/module/os/linux/zfs/zfs_znode.c @@ -542,7 +542,7 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, uint64_t links; uint64_t z_uid, z_gid; uint64_t atime[2], mtime[2], ctime[2], btime[2]; - inode_timespec_t tmp_ctime; + inode_timespec_t tmp_ts; uint64_t projid = ZFS_DEFAULT_PROJID; sa_bulk_attr_t bulk[12]; int count = 0; @@ -614,10 +614,12 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, if (zp->z_pflags & ZFS_XATTR) zp->z_xattr_parent = parent; - ZFS_TIME_DECODE(&ip->i_atime, atime); - ZFS_TIME_DECODE(&ip->i_mtime, mtime); - ZFS_TIME_DECODE(&tmp_ctime, ctime); - zpl_inode_set_ctime_to_ts(ip, tmp_ctime); + ZFS_TIME_DECODE(&tmp_ts, atime); + 
zpl_inode_set_atime_to_ts(ip, tmp_ts); + ZFS_TIME_DECODE(&tmp_ts, mtime); + zpl_inode_set_mtime_to_ts(ip, tmp_ts); + ZFS_TIME_DECODE(&tmp_ts, ctime); + zpl_inode_set_ctime_to_ts(ip, tmp_ts); ZFS_TIME_DECODE(&zp->z_btime, btime); ip->i_ino = zp->z_id; @@ -1197,7 +1199,7 @@ zfs_rezget(znode_t *zp) uint64_t gen; uint64_t z_uid, z_gid; uint64_t atime[2], mtime[2], ctime[2], btime[2]; - inode_timespec_t tmp_ctime; + inode_timespec_t tmp_ts; uint64_t projid = ZFS_DEFAULT_PROJID; znode_hold_t *zh; @@ -1290,10 +1292,12 @@ zfs_rezget(znode_t *zp) zfs_uid_write(ZTOI(zp), z_uid); zfs_gid_write(ZTOI(zp), z_gid); - ZFS_TIME_DECODE(&ZTOI(zp)->i_atime, atime); - ZFS_TIME_DECODE(&ZTOI(zp)->i_mtime, mtime); - ZFS_TIME_DECODE(&tmp_ctime, ctime); - zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ctime); + ZFS_TIME_DECODE(&tmp_ts, atime); + zpl_inode_set_atime_to_ts(ZTOI(zp), tmp_ts); + ZFS_TIME_DECODE(&tmp_ts, mtime); + zpl_inode_set_mtime_to_ts(ZTOI(zp), tmp_ts); + ZFS_TIME_DECODE(&tmp_ts, ctime); + zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ts); ZFS_TIME_DECODE(&zp->z_btime, btime); if ((uint32_t)gen != ZTOI(zp)->i_generation) { @@ -1401,22 +1405,24 @@ zfs_zinactive(znode_t *zp) boolean_t zfs_relatime_need_update(const struct inode *ip) { - inode_timespec_t now, tmp_ctime; + inode_timespec_t now, tmp_atime, tmp_ts; gethrestime(&now); + tmp_atime = zpl_inode_get_atime(ip); /* * In relatime mode, only update the atime if the previous atime * is earlier than either the ctime or mtime or if at least a day * has passed since the last update of atime. 
*/ - if (zfs_compare_timespec(&ip->i_mtime, &ip->i_atime) >= 0) + tmp_ts = zpl_inode_get_mtime(ip); + if (zfs_compare_timespec(&tmp_ts, &tmp_atime) >= 0) return (B_TRUE); - tmp_ctime = zpl_inode_get_ctime(ip); - if (zfs_compare_timespec(&tmp_ctime, &ip->i_atime) >= 0) + tmp_ts = zpl_inode_get_ctime(ip); + if (zfs_compare_timespec(&tmp_ts, &tmp_atime) >= 0) return (B_TRUE); - if ((hrtime_t)now.tv_sec - (hrtime_t)ip->i_atime.tv_sec >= 24*60*60) + if ((hrtime_t)now.tv_sec - (hrtime_t)tmp_atime.tv_sec >= 24*60*60) return (B_TRUE); return (B_FALSE); @@ -1439,7 +1445,7 @@ void zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], uint64_t ctime[2]) { - inode_timespec_t now, tmp_ctime; + inode_timespec_t now, tmp_ts; gethrestime(&now); @@ -1447,7 +1453,8 @@ zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], if (flag & ATTR_MTIME) { ZFS_TIME_ENCODE(&now, mtime); - ZFS_TIME_DECODE(&(ZTOI(zp)->i_mtime), mtime); + ZFS_TIME_DECODE(&tmp_ts, mtime); + zpl_inode_set_mtime_to_ts(ZTOI(zp), tmp_ts); if (ZTOZSB(zp)->z_use_fuids) { zp->z_pflags |= (ZFS_ARCHIVE | ZFS_AV_MODIFIED); @@ -1456,8 +1463,8 @@ zfs_tstamp_update_setup(znode_t *zp, uint_t flag, uint64_t mtime[2], if (flag & ATTR_CTIME) { ZFS_TIME_ENCODE(&now, ctime); - ZFS_TIME_DECODE(&tmp_ctime, ctime); - zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ctime); + ZFS_TIME_DECODE(&tmp_ts, ctime); + zpl_inode_set_ctime_to_ts(ZTOI(zp), tmp_ts); if (ZTOZSB(zp)->z_use_fuids) zp->z_pflags |= ZFS_ARCHIVE; } diff --git a/module/os/linux/zfs/zpl_inode.c b/module/os/linux/zfs/zpl_inode.c index 96f65b9e94e..ad1753f7a07 100644 --- a/module/os/linux/zfs/zpl_inode.c +++ b/module/os/linux/zfs/zpl_inode.c @@ -526,7 +526,8 @@ zpl_setattr(struct dentry *dentry, struct iattr *ia) vap->va_ctime = ia->ia_ctime; if (vap->va_mask & ATTR_ATIME) - ip->i_atime = zpl_inode_timestamp_truncate(ia->ia_atime, ip); + zpl_inode_set_atime_to_ts(ip, + zpl_inode_timestamp_truncate(ia->ia_atime, ip)); cookie = spl_fstrans_mark(); #ifdef 
HAVE_USERNS_IOPS_SETATTR From 18a9185165e2713e690e52347a37de1878e2a9fc Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sat, 16 Dec 2023 17:39:07 +1100 Subject: [PATCH 03/91] linux 6.7 compat: handle superblock shrinker member change In 6.7 the superblock shrinker member s_shrink has changed from being an embedded struct to a pointer. Detect this, and don't take a reference if it already is one. Signed-off-by: Rob Norris Sponsored-by: https://github.com/sponsors/robn --- config/kernel-shrink.m4 | 35 +++++++++++++++++++++++++++++++- module/os/linux/zfs/zfs_vfsops.c | 10 +++++++-- 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/config/kernel-shrink.m4 b/config/kernel-shrink.m4 index 0c702153e8c..1c5f753d411 100644 --- a/config/kernel-shrink.m4 +++ b/config/kernel-shrink.m4 @@ -19,12 +19,44 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_SUPER_BLOCK_S_SHRINK], [ ],[]) ]) +dnl # +dnl # 6.7 API change +dnl # s_shrink is now a pointer. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_SUPER_BLOCK_S_SHRINK_PTR], [ + ZFS_LINUX_TEST_SRC([super_block_s_shrink_ptr], [ + #include + unsigned long shrinker_cb(struct shrinker *shrink, + struct shrink_control *sc) { return 0; } + static struct shrinker shrinker = { + .count_objects = shrinker_cb, + .scan_objects = shrinker_cb, + .seeks = DEFAULT_SEEKS, + }; + static const struct super_block + sb __attribute__ ((unused)) = { + .s_shrink = &shrinker, + }; + ],[]) +]) + AC_DEFUN([ZFS_AC_KERNEL_SUPER_BLOCK_S_SHRINK], [ AC_MSG_CHECKING([whether super_block has s_shrink]) ZFS_LINUX_TEST_RESULT([super_block_s_shrink], [ AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SUPER_BLOCK_S_SHRINK, 1, + [have super_block s_shrink]) ],[ - ZFS_LINUX_TEST_ERROR([sb->s_shrink()]) + AC_MSG_RESULT(no) + AC_MSG_CHECKING([whether super_block has s_shrink pointer]) + ZFS_LINUX_TEST_RESULT([super_block_s_shrink_ptr], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SUPER_BLOCK_S_SHRINK_PTR, 1, + [have super_block s_shrink pointer]) + ],[ + AC_MSG_RESULT(no) + 
ZFS_LINUX_TEST_ERROR([sb->s_shrink()]) + ]) ]) ]) @@ -174,6 +206,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SHRINK_CONTROL_STRUCT], [ AC_DEFUN([ZFS_AC_KERNEL_SRC_SHRINKER], [ ZFS_AC_KERNEL_SRC_SUPER_BLOCK_S_SHRINK + ZFS_AC_KERNEL_SRC_SUPER_BLOCK_S_SHRINK_PTR ZFS_AC_KERNEL_SRC_SHRINK_CONTROL_HAS_NID ZFS_AC_KERNEL_SRC_SHRINKER_CALLBACK ZFS_AC_KERNEL_SRC_SHRINK_CONTROL_STRUCT diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index 2792bc02721..2015c20d734 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -1240,12 +1240,18 @@ zfs_prune_aliases(zfsvfs_t *zfsvfs, unsigned long nr_to_scan) * and inode caches. This can occur when the ARC needs to free meta data * blocks but can't because they are all pinned by entries in these caches. */ +#if defined(HAVE_SUPER_BLOCK_S_SHRINK) +#define S_SHRINK(sb) (&(sb)->s_shrink) +#elif defined(HAVE_SUPER_BLOCK_S_SHRINK_PTR) +#define S_SHRINK(sb) ((sb)->s_shrink) +#endif + int zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects) { zfsvfs_t *zfsvfs = sb->s_fs_info; int error = 0; - struct shrinker *shrinker = &sb->s_shrink; + struct shrinker *shrinker = S_SHRINK(sb); struct shrink_control sc = { .nr_to_scan = nr_to_scan, .gfp_mask = GFP_KERNEL, @@ -1257,7 +1263,7 @@ zfs_prune(struct super_block *sb, unsigned long nr_to_scan, int *objects) #if defined(HAVE_SPLIT_SHRINKER_CALLBACK) && \ defined(SHRINK_CONTROL_HAS_NID) && \ defined(SHRINKER_NUMA_AWARE) - if (sb->s_shrink.flags & SHRINKER_NUMA_AWARE) { + if (shrinker->flags & SHRINKER_NUMA_AWARE) { *objects = 0; for_each_online_node(sc.nid) { *objects += (*shrinker->scan_objects)(shrinker, &sc); From 03b84099d9c4d3f1b4d1b123abc967e81f6d15db Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Sun, 17 Dec 2023 00:36:21 +1100 Subject: [PATCH 04/91] linux 6.7 compat: rework shrinker setup for heap allocations 6.7 changes the shrinker API such that shrinkers must be allocated dynamically by the kernel. 
To accommodate this, this commit reworks spl_register_shrinker() to do something similar against earlier kernels. Signed-off-by: Rob Norris Sponsored-by: https://github.com/sponsors/robn --- config/kernel-shrink.m4 | 52 +++++++++++-- include/os/linux/spl/sys/shrinker.h | 66 +++++----------- module/Kbuild.in | 1 + module/os/linux/spl/spl-shrinker.c | 115 ++++++++++++++++++++++++++++ module/os/linux/zfs/arc_os.c | 11 ++- 5 files changed, 189 insertions(+), 56 deletions(-) create mode 100644 module/os/linux/spl/spl-shrinker.c diff --git a/config/kernel-shrink.m4 b/config/kernel-shrink.m4 index 1c5f753d411..4a529c43b5b 100644 --- a/config/kernel-shrink.m4 +++ b/config/kernel-shrink.m4 @@ -128,6 +128,25 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_SHRINKER_CALLBACK], [ ]) ]) +dnl # +dnl # 6.7 API change +dnl # register_shrinker has been replaced by shrinker_register. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_SHRINKER_REGISTER], [ + ZFS_LINUX_TEST_SRC([shrinker_register], [ + #include + unsigned long shrinker_cb(struct shrinker *shrink, + struct shrink_control *sc) { return 0; } + ],[ + struct shrinker cache_shrinker = { + .count_objects = shrinker_cb, + .scan_objects = shrinker_cb, + .seeks = DEFAULT_SEEKS, + }; + shrinker_register(&cache_shrinker); + ]) +]) + AC_DEFUN([ZFS_AC_KERNEL_SHRINKER_CALLBACK],[ dnl # dnl # 6.0 API change @@ -165,14 +184,36 @@ AC_DEFUN([ZFS_AC_KERNEL_SHRINKER_CALLBACK],[ dnl # cs->shrink() is logically split in to dnl # cs->count_objects() and cs->scan_objects() dnl # - AC_MSG_CHECKING([if cs->count_objects callback exists]) + AC_MSG_CHECKING( + [whether cs->count_objects callback exists]) ZFS_LINUX_TEST_RESULT( - [shrinker_cb_shrink_control_split],[ - AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_SPLIT_SHRINKER_CALLBACK, 1, - [cs->count_objects exists]) + [shrinker_cb_shrink_control_split],[ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SPLIT_SHRINKER_CALLBACK, 1, + [cs->count_objects exists]) ],[ + AC_MSG_RESULT(no) + + AC_MSG_CHECKING( + [whether shrinker_register exists]) +
ZFS_LINUX_TEST_RESULT([shrinker_register], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_SHRINKER_REGISTER, 1, + [shrinker_register exists]) + + dnl # We assume that the split shrinker + dnl # callback exists if + dnl # shrinker_register() exists, + dnl # because the latter is a much more + dnl # recent addition, and the macro + dnl # test for shrinker_register() only + dnl # works if the callback is split + AC_DEFINE(HAVE_SPLIT_SHRINKER_CALLBACK, + 1, [cs->count_objects exists]) + ],[ + AC_MSG_RESULT(no) ZFS_LINUX_TEST_ERROR([shrinker]) + ]) ]) ]) ]) @@ -211,6 +252,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_SHRINKER], [ ZFS_AC_KERNEL_SRC_SHRINKER_CALLBACK ZFS_AC_KERNEL_SRC_SHRINK_CONTROL_STRUCT ZFS_AC_KERNEL_SRC_REGISTER_SHRINKER_VARARG + ZFS_AC_KERNEL_SRC_SHRINKER_REGISTER ]) AC_DEFUN([ZFS_AC_KERNEL_SHRINKER], [ diff --git a/include/os/linux/spl/sys/shrinker.h b/include/os/linux/spl/sys/shrinker.h index d472754be4f..bca4c850694 100644 --- a/include/os/linux/spl/sys/shrinker.h +++ b/include/os/linux/spl/sys/shrinker.h @@ -29,12 +29,13 @@ /* * Due to frequent changes in the shrinker API the following - * compatibility wrappers should be used. They are as follows: + * compatibility wrapper should be used. * - * SPL_SHRINKER_DECLARE(varname, countfunc, scanfunc, seek_cost); + * shrinker = spl_register_shrinker(name, countfunc, scanfunc, seek_cost); + * spl_unregister_shrinker(shrinker); * - * SPL_SHRINKER_DECLARE is used to declare a shrinker with the name varname, - * which is passed to spl_register_shrinker()/spl_unregister_shrinker(). + * spl_register_shrinker is used to create and register a shrinker with the + * given name. * The countfunc returns the number of free-able objects. * The scanfunc returns the number of objects that were freed. * The callbacks can return SHRINK_STOP if further calls can't make any more @@ -57,57 +58,28 @@ * ...scan objects in the cache and reclaim them... 
* } * - * SPL_SHRINKER_DECLARE(my_shrinker, my_count, my_scan, DEFAULT_SEEKS); + * static struct shrinker *my_shrinker; * * void my_init_func(void) { - * spl_register_shrinker(&my_shrinker); + * my_shrinker = spl_register_shrinker("my-shrinker", + * my_count, my_scan, DEFAULT_SEEKS); + * } + * + * void my_fini_func(void) { + * spl_unregister_shrinker(my_shrinker); * } */ -#ifdef HAVE_REGISTER_SHRINKER_VARARG -#define spl_register_shrinker(x) register_shrinker(x, "zfs-arc-shrinker") -#else -#define spl_register_shrinker(x) register_shrinker(x) -#endif -#define spl_unregister_shrinker(x) unregister_shrinker(x) +typedef unsigned long (*spl_shrinker_cb) + (struct shrinker *, struct shrink_control *); -/* - * Linux 3.0 to 3.11 Shrinker API Compatibility. - */ -#if defined(HAVE_SINGLE_SHRINKER_CALLBACK) -#define SPL_SHRINKER_DECLARE(varname, countfunc, scanfunc, seek_cost) \ -static int \ -__ ## varname ## _wrapper(struct shrinker *shrink, struct shrink_control *sc)\ -{ \ - if (sc->nr_to_scan != 0) { \ - (void) scanfunc(shrink, sc); \ - } \ - return (countfunc(shrink, sc)); \ -} \ - \ -static struct shrinker varname = { \ - .shrink = __ ## varname ## _wrapper, \ - .seeks = seek_cost, \ -} +struct shrinker *spl_register_shrinker(const char *name, + spl_shrinker_cb countfunc, spl_shrinker_cb scanfunc, int seek_cost); +void spl_unregister_shrinker(struct shrinker *); +#ifndef SHRINK_STOP +/* 3.0-3.11 compatibility */ #define SHRINK_STOP (-1) - -/* - * Linux 3.12 and later Shrinker API Compatibility. - */ -#elif defined(HAVE_SPLIT_SHRINKER_CALLBACK) -#define SPL_SHRINKER_DECLARE(varname, countfunc, scanfunc, seek_cost) \ -static struct shrinker varname = { \ - .count_objects = countfunc, \ - .scan_objects = scanfunc, \ - .seeks = seek_cost, \ -} - -#else -/* - * Linux 2.x to 2.6.22, or a newer shrinker API has been introduced. 
- */ -#error "Unknown shrinker callback" #endif #endif /* SPL_SHRINKER_H */ diff --git a/module/Kbuild.in b/module/Kbuild.in index b9c284a2441..f1a145779dd 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -79,6 +79,7 @@ SPL_OBJS := \ spl-kstat.o \ spl-proc.o \ spl-procfs-list.o \ + spl-shrinker.o \ spl-taskq.o \ spl-thread.o \ spl-trace.o \ diff --git a/module/os/linux/spl/spl-shrinker.c b/module/os/linux/spl/spl-shrinker.c new file mode 100644 index 00000000000..d5c8da471cb --- /dev/null +++ b/module/os/linux/spl/spl-shrinker.c @@ -0,0 +1,115 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. + * Copyright (C) 2007 The Regents of the University of California. + * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + * + * Solaris Porting Layer (SPL) Shrinker Implementation. 
+ */ + +#include +#include + +#ifdef HAVE_SINGLE_SHRINKER_CALLBACK +/* 3.0-3.11: single shrink() callback, which we wrap to carry both functions */ +struct spl_shrinker_wrap { + struct shrinker shrinker; + spl_shrinker_cb countfunc; + spl_shrinker_cb scanfunc; +}; + +static int +spl_shrinker_single_cb(struct shrinker *shrinker, struct shrink_control *sc) +{ + struct spl_shrinker_wrap *sw = (struct spl_shrinker_wrap *)shrinker; + + if (sc->nr_to_scan != 0) + (void) sw->scanfunc(&sw->shrinker, sc); + return (sw->countfunc(&sw->shrinker, sc)); +} +#endif + +struct shrinker * +spl_register_shrinker(const char *name, spl_shrinker_cb countfunc, + spl_shrinker_cb scanfunc, int seek_cost) +{ + struct shrinker *shrinker; + + /* allocate shrinker */ +#if defined(HAVE_SHRINKER_REGISTER) + /* 6.7: kernel will allocate the shrinker for us */ + shrinker = shrinker_alloc(0, name); +#elif defined(HAVE_SPLIT_SHRINKER_CALLBACK) + /* 3.12-6.6: we allocate the shrinker */ + shrinker = kmem_zalloc(sizeof (struct shrinker), KM_SLEEP); +#elif defined(HAVE_SINGLE_SHRINKER_CALLBACK) + /* 3.0-3.11: allocate a wrapper */ + struct spl_shrinker_wrap *sw = + kmem_zalloc(sizeof (struct spl_shrinker_wrap), KM_SLEEP); + shrinker = &sw->shrinker; +#else + /* 2.x-2.6.22, or a newer shrinker API has been introduced. 
*/ +#error "Unknown shrinker API" +#endif + + if (shrinker == NULL) + return (NULL); + + /* set callbacks */ +#ifdef HAVE_SINGLE_SHRINKER_CALLBACK + sw->countfunc = countfunc; + sw->scanfunc = scanfunc; + shrinker->shrink = spl_shrinker_single_cb; +#else + shrinker->count_objects = countfunc; + shrinker->scan_objects = scanfunc; +#endif + + /* set params */ + shrinker->seeks = seek_cost; + + /* register with kernel */ +#if defined(HAVE_SHRINKER_REGISTER) + shrinker_register(shrinker); +#elif defined(HAVE_REGISTER_SHRINKER_VARARG) + register_shrinker(shrinker, name); +#else + register_shrinker(shrinker); +#endif + + return (shrinker); +} +EXPORT_SYMBOL(spl_register_shrinker); + +void +spl_unregister_shrinker(struct shrinker *shrinker) +{ +#if defined(HAVE_SHRINKER_REGISTER) + shrinker_free(shrinker); +#elif defined(HAVE_SPLIT_SHRINKER_CALLBACK) + unregister_shrinker(shrinker); + kmem_free(shrinker, sizeof (struct shrinker)); +#elif defined(HAVE_SINGLE_SHRINKER_CALLBACK) + unregister_shrinker(shrinker); + kmem_free(shrinker, sizeof (struct spl_shrinker_wrap)); +#else +#error "Unknown shrinker API" +#endif +} +EXPORT_SYMBOL(spl_unregister_shrinker); diff --git a/module/os/linux/zfs/arc_os.c b/module/os/linux/zfs/arc_os.c index 43ed087e2db..1fa9f3eb3f5 100644 --- a/module/os/linux/zfs/arc_os.c +++ b/module/os/linux/zfs/arc_os.c @@ -247,8 +247,7 @@ arc_shrinker_scan(struct shrinker *shrink, struct shrink_control *sc) return (sc->nr_to_scan); } -SPL_SHRINKER_DECLARE(arc_shrinker, - arc_shrinker_count, arc_shrinker_scan, DEFAULT_SEEKS); +static struct shrinker *arc_shrinker = NULL; int arc_memory_throttle(spa_t *spa, uint64_t reserve, uint64_t txg) @@ -351,14 +350,18 @@ arc_lowmem_init(void) * reclaim from the arc. This is done to prevent kswapd from * swapping out pages when it is preferable to shrink the arc. 
*/ - spl_register_shrinker(&arc_shrinker); + arc_shrinker = spl_register_shrinker("zfs-arc-shrinker", + arc_shrinker_count, arc_shrinker_scan, DEFAULT_SEEKS); + VERIFY(arc_shrinker); + arc_set_sys_free(allmem); } void arc_lowmem_fini(void) { - spl_unregister_shrinker(&arc_shrinker); + spl_unregister_shrinker(arc_shrinker); + arc_shrinker = NULL; } int From 3c502e376b77b463501c2c218f286a1c2735afb4 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 8 Dec 2023 17:31:31 -0800 Subject: [PATCH 05/91] ZTS: Disable io_uring test on CentOS 9 The io_uring test fails on CentOS 9 with the following fio error. Disable the test for the benefit of the CI until this can be fully investigated. This basic test passes as expected on newer kernels. Reviewed-by: Tony Hutter Signed-off-by: Brian Behlendorf Closes #15636 --- tests/zfs-tests/tests/functional/io/io_uring.ksh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tests/zfs-tests/tests/functional/io/io_uring.ksh b/tests/zfs-tests/tests/functional/io/io_uring.ksh index 47e439d0f4d..2fa14655635 100755 --- a/tests/zfs-tests/tests/functional/io/io_uring.ksh +++ b/tests/zfs-tests/tests/functional/io/io_uring.ksh @@ -44,6 +44,13 @@ if ! $(grep -q "CONFIG_IO_URING=y" /boot/config-$(uname -r)); then log_unsupported "Requires io_uring support" fi +if [ -e /etc/os-release ] ; then + source /etc/os-release + if [ -n "$REDHAT_SUPPORT_PRODUCT_VERSION" ] && ((floor($REDHAT_SUPPORT_PRODUCT_VERSION) == 9)) ; then + log_unsupported "Disabled on CentOS 9, fails with 'Operation not permitted'" + fi +fi + fio --ioengine=io_uring --parse-only || log_unsupported "fio io_uring support required" function cleanup From d530d5d8a567c0cf64a434f0303929dc0bb338da Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Thu, 21 Dec 2023 11:22:56 -0800 Subject: [PATCH 06/91] Linux 6.5 compat: check BLK_OPEN_EXCL is defined On some systems we already have blkdev_get_by_path() with 4 args but still the old FMODE_EXCL and not BLK_OPEN_EXCL defined. 
The vdev_bdev_mode() function was added to handle this case but there was no generic way to specify exclusive access. Reviewed-by: Brian Atkinson Signed-off-by: Brian Behlendorf Closes #15692 --- module/os/linux/zfs/vdev_disk.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 48ac55f0703..8b5aa94fe4f 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -85,7 +85,7 @@ static blk_mode_t #else static fmode_t #endif -vdev_bdev_mode(spa_mode_t spa_mode) +vdev_bdev_mode(spa_mode_t spa_mode, boolean_t exclusive) { #ifdef HAVE_BLK_MODE_T blk_mode_t mode = 0; @@ -95,6 +95,9 @@ vdev_bdev_mode(spa_mode_t spa_mode) if (spa_mode & SPA_MODE_WRITE) mode |= BLK_OPEN_WRITE; + + if (exclusive) + mode |= BLK_OPEN_EXCL; #else fmode_t mode = 0; @@ -103,6 +106,9 @@ vdev_bdev_mode(spa_mode_t spa_mode) if (spa_mode & SPA_MODE_WRITE) mode |= FMODE_WRITE; + + if (exclusive) + mode |= FMODE_EXCL; #endif return (mode); @@ -225,10 +231,10 @@ vdev_blkdev_get_by_path(const char *path, spa_mode_t mode, void *holder, { #ifdef HAVE_BLKDEV_GET_BY_PATH_4ARG return (blkdev_get_by_path(path, - vdev_bdev_mode(mode) | BLK_OPEN_EXCL, holder, hops)); + vdev_bdev_mode(mode, B_TRUE), holder, hops)); #else return (blkdev_get_by_path(path, - vdev_bdev_mode(mode) | FMODE_EXCL, holder)); + vdev_bdev_mode(mode, B_TRUE), holder)); #endif } @@ -238,7 +244,7 @@ vdev_blkdev_put(struct block_device *bdev, spa_mode_t mode, void *holder) #ifdef HAVE_BLKDEV_PUT_HOLDER return (blkdev_put(bdev, holder)); #else - return (blkdev_put(bdev, vdev_bdev_mode(mode) | FMODE_EXCL)); + return (blkdev_put(bdev, vdev_bdev_mode(mode, B_TRUE))); #endif } @@ -248,9 +254,9 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, { struct block_device *bdev; #ifdef HAVE_BLK_MODE_T - blk_mode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa)); + blk_mode_t mode = 
vdev_bdev_mode(spa_mode(v->vdev_spa), B_FALSE); #else - fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa)); + fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa), B_FALSE); #endif hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms); vdev_disk_t *vd; From db2db50e370162e0d94e21fcff0be0891c02e99e Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Wed, 25 Oct 2023 15:11:37 +1100 Subject: [PATCH 07/91] spa: make read/write queues configurable We are finding that as customers get larger and faster machines (hundreds of cores, large NVMe-backed pools) they keep hitting relatively low performance ceilings. Our profiling work almost always finds that they're running into bottlenecks on the SPA IO taskqs. Unfortunately there's often little we can advise at that point, because there are very few ways to change behaviour without patching. This commit adds two load-time parameters `zio_taskq_read` and `zio_taskq_write` that can configure the READ and WRITE IO taskqs directly. This achieves two goals: it gives operators (and those that support them) a way to tune things without requiring a custom build of OpenZFS, which is often not possible, and it lets us easily try different config variations in a variety of environments to inform the development of better defaults for these kinds of systems. Because tuning the IO taskqs really requires a fairly deep understanding of how IO in ZFS works, and generally isn't needed without a pretty serious workload and an ability to identify bottlenecks, only minimal documentation is provided. It's expected that anyone using this is going to have the source code there as well. Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: Wasabi Technology, Inc.
--- man/man4/zfs.4 | 10 ++ module/zfs/spa.c | 297 ++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 306 insertions(+), 1 deletion(-) diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 4ec52a2fb65..c12ef1387cc 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -2280,6 +2280,16 @@ If .Sy 0 , generate a system-dependent value close to 6 threads per taskq. . +.It Sy zio_taskq_read Ns = Ns Sy fixed,1,8 null scale null Pq charp +Set the queue and thread configuration for the IO read queues. +This is an advanced debugging parameter. +Don't change this unless you understand what it does. +. +.It Sy zio_taskq_write Ns = Ns Sy batch fixed,1,5 scale fixed,1,5 Pq charp +Set the queue and thread configuration for the IO write queues. +This is an advanced debugging parameter. +Don't change this unless you understand what it does. +. .It Sy zvol_inhibit_dev Ns = Ns Sy 0 Ns | Ns 1 Pq uint Do not create zvol device nodes. This may slightly improve startup time on diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 1410651c63c..32a58529219 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -151,7 +151,7 @@ static const char *const zio_taskq_types[ZIO_TASKQ_TYPES] = { * and interrupt) and then to reserve threads for ZIO_PRIORITY_NOW I/Os that * need to be handled with minimum delay. */ -static const zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { +static zio_taskq_info_t zio_taskqs[ZIO_TYPES][ZIO_TASKQ_TYPES] = { /* ISSUE ISSUE_HIGH INTR INTR_HIGH */ { ZTI_ONE, ZTI_NULL, ZTI_ONE, ZTI_NULL }, /* NULL */ { ZTI_N(8), ZTI_NULL, ZTI_SCALE, ZTI_NULL }, /* READ */ @@ -1164,6 +1164,292 @@ spa_taskqs_fini(spa_t *spa, zio_type_t t, zio_taskq_type_t q) tqs->stqs_taskq = NULL; } +#ifdef _KERNEL +/* + * The READ and WRITE rows of zio_taskqs are configurable at module load time + * by setting zio_taskq_read or zio_taskq_write. 
+ * + * Example (the defaults for READ and WRITE) + * zio_taskq_read='fixed,1,8 null scale null' + * zio_taskq_write='batch fixed,1,5 scale fixed,1,5' + * + * Each sets the entire row at a time. + * + * 'fixed' is parameterised: fixed,Q,T where Q is number of taskqs, T is number + * of threads per taskq. + * + * 'null' can only be set on the high-priority queues (queue selection for + * high-priority queues will fall back to the regular queue if the high-pri + * is NULL. + */ +static const char *const modes[ZTI_NMODES] = { + "fixed", "batch", "scale", "null" +}; + +/* Parse the incoming config string. Modifies cfg */ +static int +spa_taskq_param_set(zio_type_t t, char *cfg) +{ + int err = 0; + + zio_taskq_info_t row[ZIO_TASKQ_TYPES] = {{0}}; + + char *next = cfg, *tok, *c; + + /* + * Parse out each element from the string and fill `row`. The entire + * row has to be set at once, so any errors are flagged by just + * breaking out of this loop early. + */ + uint_t q; + for (q = 0; q < ZIO_TASKQ_TYPES; q++) { + /* `next` is the start of the config */ + if (next == NULL) + break; + + /* Eat up leading space */ + while (isspace(*next)) + next++; + if (*next == '\0') + break; + + /* Mode ends at space or end of string */ + tok = next; + next = strchr(tok, ' '); + if (next != NULL) *next++ = '\0'; + + /* Parameters start after a comma */ + c = strchr(tok, ','); + if (c != NULL) *c++ = '\0'; + + /* Match mode string */ + uint_t mode; + for (mode = 0; mode < ZTI_NMODES; mode++) + if (strcmp(tok, modes[mode]) == 0) + break; + if (mode == ZTI_NMODES) + break; + + /* Invalid canary */ + row[q].zti_mode = ZTI_NMODES; + + /* Per-mode setup */ + switch (mode) { + + /* + * FIXED is parameterised: number of queues, and number of + * threads per queue. + */ + case ZTI_MODE_FIXED: { + /* No parameters? 
*/ + if (c == NULL || *c == '\0') + break; + + /* Find next parameter */ + tok = c; + c = strchr(tok, ','); + if (c == NULL) + break; + + /* Take digits and convert */ + unsigned long long nq; + if (!(isdigit(*tok))) + break; + err = ddi_strtoull(tok, &tok, 10, &nq); + /* Must succeed and also end at the next param sep */ + if (err != 0 || tok != c) + break; + + /* Move past the comma */ + tok++; + /* Need another number */ + if (!(isdigit(*tok))) + break; + /* Remember start to make sure we moved */ + c = tok; + + /* Take digits */ + unsigned long long ntpq; + err = ddi_strtoull(tok, &tok, 10, &ntpq); + /* Must succeed, and moved forward */ + if (err != 0 || tok == c || *tok != '\0') + break; + + /* + * sanity; zero queues/threads make no sense, and + * 16K is almost certainly more than anyone will ever + * need and avoids silly numbers like UINT32_MAX + */ + if (nq == 0 || nq >= 16384 || + ntpq == 0 || ntpq >= 16384) + break; + + const zio_taskq_info_t zti = ZTI_P(ntpq, nq); + row[q] = zti; + break; + } + + case ZTI_MODE_BATCH: { + const zio_taskq_info_t zti = ZTI_BATCH; + row[q] = zti; + break; + } + + case ZTI_MODE_SCALE: { + const zio_taskq_info_t zti = ZTI_SCALE; + row[q] = zti; + break; + } + + case ZTI_MODE_NULL: { + /* + * Can only null the high-priority queues; the general- + * purpose ones have to exist. + */ + if (q != ZIO_TASKQ_ISSUE_HIGH && + q != ZIO_TASKQ_INTERRUPT_HIGH) + break; + + const zio_taskq_info_t zti = ZTI_NULL; + row[q] = zti; + break; + } + + default: + break; + } + + /* Ensure we set a mode */ + if (row[q].zti_mode == ZTI_NMODES) + break; + } + + /* Didn't get a full row, fail */ + if (q < ZIO_TASKQ_TYPES) + return (SET_ERROR(EINVAL)); + + /* Eat trailing space */ + if (next != NULL) + while (isspace(*next)) + next++; + + /* If there's anything left over then fail */ + if (next != NULL && *next != '\0') + return (SET_ERROR(EINVAL)); + + /* Success! 
Copy it into the real config */ + for (q = 0; q < ZIO_TASKQ_TYPES; q++) + zio_taskqs[t][q] = row[q]; + + return (0); +} + +static int +spa_taskq_param_get(zio_type_t t, char *buf) +{ + int pos = 0; + + /* Build paramater string from live config */ + const char *sep = ""; + for (uint_t q = 0; q < ZIO_TASKQ_TYPES; q++) { + const zio_taskq_info_t *zti = &zio_taskqs[t][q]; + if (zti->zti_mode == ZTI_MODE_FIXED) + pos += sprintf(&buf[pos], "%s%s,%u,%u", sep, + modes[zti->zti_mode], zti->zti_count, + zti->zti_value); + else + pos += sprintf(&buf[pos], "%s%s", sep, + modes[zti->zti_mode]); + sep = " "; + } + + buf[pos++] = '\n'; + buf[pos] = '\0'; + + return (pos); +} + +#ifdef __linux__ +static int +spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp) +{ + char *cfg = kmem_strdup(val); + int err = spa_taskq_param_set(ZIO_TYPE_READ, cfg); + kmem_free(cfg, strlen(val)+1); + return (-err); +} +static int +spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp) +{ + return (spa_taskq_param_get(ZIO_TYPE_READ, buf)); +} + +static int +spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp) +{ + char *cfg = kmem_strdup(val); + int err = spa_taskq_param_set(ZIO_TYPE_WRITE, cfg); + kmem_free(cfg, strlen(val)+1); + return (-err); +} +static int +spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp) +{ + return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf)); +} +#else +#include + +/* + * On FreeBSD load-time parameters can be set up before malloc() is available, + * so we have to do all the parsing work on the stack. 
+ */ +#define SPA_TASKQ_PARAM_MAX (128) + +static int +spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS) +{ + char buf[SPA_TASKQ_PARAM_MAX]; + int err = 0; + + if (req->newptr == NULL) { + int len = spa_taskq_param_get(ZIO_TYPE_READ, buf); + struct sbuf *s = sbuf_new_for_sysctl(NULL, NULL, len+1, req); + sbuf_cpy(s, buf); + err = sbuf_finish(s); + sbuf_delete(s); + return (err); + } + + err = sysctl_handle_string(oidp, buf, sizeof (buf), req); + if (err) + return (err); + return (spa_taskq_param_set(ZIO_TYPE_READ, buf)); +} + +static int +spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS) +{ + char buf[SPA_TASKQ_PARAM_MAX]; + int err = 0; + + if (req->newptr == NULL) { + int len = spa_taskq_param_get(ZIO_TYPE_WRITE, buf); + struct sbuf *s = sbuf_new_for_sysctl(NULL, NULL, len+1, req); + sbuf_cpy(s, buf); + err = sbuf_finish(s); + sbuf_delete(s); + return (err); + } + + err = sysctl_handle_string(oidp, buf, sizeof (buf), req); + if (err) + return (err); + return (spa_taskq_param_set(ZIO_TYPE_WRITE, buf)); +} +#endif +#endif /* _KERNEL */ + /* * Dispatch a task to the appropriate taskq for the ZFS I/O type and priority. 
* Note that a type may have multiple discrete taskqs to avoid lock contention @@ -10210,4 +10496,13 @@ ZFS_MODULE_PARAM(zfs_livelist_condense, zfs_livelist_condense_, new_alloc, INT, ZMOD_RW, "Whether extra ALLOC blkptrs were added to a livelist entry while it " "was being condensed"); + +#ifdef _KERNEL +ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_read, + spa_taskq_read_param_set, spa_taskq_read_param_get, ZMOD_RD, + "Configure IO queues for read IO"); +ZFS_MODULE_VIRTUAL_PARAM_CALL(zfs_zio, zio_, taskq_write, + spa_taskq_write_param_set, spa_taskq_write_param_get, ZMOD_RD, + "Configure IO queues for write IO"); +#endif /* END CSTYLED */ From 2a59b6bfa96648bc2c8c83eed0e026010e8da864 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 24 Oct 2023 17:33:58 -0400 Subject: [PATCH 08/91] ABD: Be more assertive in iterators Once we have verified the ABDs and asserted the sizes, we should never see premature ABD ends. Assert that and remove extra branches from production builds. Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc.
Closes #15428 --- module/zfs/abd.c | 104 ++++++++++++----------------------------------- 1 file changed, 26 insertions(+), 78 deletions(-) diff --git a/module/zfs/abd.c b/module/zfs/abd.c index 745ee8f02ed..d982f201c93 100644 --- a/module/zfs/abd.c +++ b/module/zfs/abd.c @@ -802,13 +802,10 @@ abd_iterate_func(abd_t *abd, size_t off, size_t size, abd_verify(abd); ASSERT3U(off + size, <=, abd->abd_size); - boolean_t gang = abd_is_gang(abd); abd_t *c_abd = abd_init_abd_iter(abd, &aiter, off); while (size > 0) { - /* If we are at the end of the gang ABD we are done */ - if (gang && !c_abd) - break; + IMPLY(abd_is_gang(abd), c_abd != NULL); abd_iter_map(&aiter); @@ -930,7 +927,6 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, { int ret = 0; struct abd_iter daiter, saiter; - boolean_t dabd_is_gang_abd, sabd_is_gang_abd; abd_t *c_dabd, *c_sabd; if (size == 0) @@ -942,16 +938,12 @@ abd_iterate_func2(abd_t *dabd, abd_t *sabd, size_t doff, size_t soff, ASSERT3U(doff + size, <=, dabd->abd_size); ASSERT3U(soff + size, <=, sabd->abd_size); - dabd_is_gang_abd = abd_is_gang(dabd); - sabd_is_gang_abd = abd_is_gang(sabd); c_dabd = abd_init_abd_iter(dabd, &daiter, doff); c_sabd = abd_init_abd_iter(sabd, &saiter, soff); while (size > 0) { - /* if we are at the end of the gang ABD we are done */ - if ((dabd_is_gang_abd && !c_dabd) || - (sabd_is_gang_abd && !c_sabd)) - break; + IMPLY(abd_is_gang(dabd), c_dabd != NULL); + IMPLY(abd_is_gang(sabd), c_sabd != NULL); abd_iter_map(&daiter); abd_iter_map(&saiter); @@ -1032,66 +1024,40 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, int i; ssize_t len, dlen; struct abd_iter caiters[3]; - struct abd_iter daiter = {0}; + struct abd_iter daiter; void *caddrs[3]; unsigned long flags __maybe_unused = 0; abd_t *c_cabds[3]; abd_t *c_dabd = NULL; - boolean_t cabds_is_gang_abd[3]; - boolean_t dabd_is_gang_abd = B_FALSE; ASSERT3U(parity, <=, 3); - for (i = 0; i < parity; i++) { - cabds_is_gang_abd[i] = 
abd_is_gang(cabds[i]); + abd_verify(cabds[i]); + ASSERT3U(csize, <=, cabds[i]->abd_size); c_cabds[i] = abd_init_abd_iter(cabds[i], &caiters[i], 0); } - if (dabd) { - dabd_is_gang_abd = abd_is_gang(dabd); + ASSERT3S(dsize, >=, 0); + if (dsize > 0) { + ASSERT(dabd); + abd_verify(dabd); + ASSERT3U(dsize, <=, dabd->abd_size); c_dabd = abd_init_abd_iter(dabd, &daiter, 0); } - ASSERT3S(dsize, >=, 0); - abd_enter_critical(flags); while (csize > 0) { - /* if we are at the end of the gang ABD we are done */ - if (dabd_is_gang_abd && !c_dabd) - break; - + len = csize; for (i = 0; i < parity; i++) { - /* - * If we are at the end of the gang ABD we are - * done. - */ - if (cabds_is_gang_abd[i] && !c_cabds[i]) - break; + IMPLY(abd_is_gang(cabds[i]), c_cabds[i] != NULL); abd_iter_map(&caiters[i]); caddrs[i] = caiters[i].iter_mapaddr; + len = MIN(caiters[i].iter_mapsize, len); } - len = csize; - - if (dabd && dsize > 0) + if (dsize > 0) { + IMPLY(abd_is_gang(dabd), c_dabd != NULL); abd_iter_map(&daiter); - - switch (parity) { - case 3: - len = MIN(caiters[2].iter_mapsize, len); - zfs_fallthrough; - case 2: - len = MIN(caiters[1].iter_mapsize, len); - zfs_fallthrough; - case 1: - len = MIN(caiters[0].iter_mapsize, len); - } - - /* must be progressive */ - ASSERT3S(len, >, 0); - - if (dabd && dsize > 0) { - /* this needs precise iter.length */ len = MIN(daiter.iter_mapsize, len); dlen = len; } else @@ -1114,7 +1080,7 @@ abd_raidz_gen_iterate(abd_t **cabds, abd_t *dabd, &caiters[i], len); } - if (dabd && dsize > 0) { + if (dsize > 0) { abd_iter_unmap(&daiter); c_dabd = abd_advance_abd_iter(dabd, c_dabd, &daiter, @@ -1153,16 +1119,16 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, struct abd_iter xiters[3]; void *caddrs[3], *xaddrs[3]; unsigned long flags __maybe_unused = 0; - boolean_t cabds_is_gang_abd[3]; - boolean_t tabds_is_gang_abd[3]; abd_t *c_cabds[3]; abd_t *c_tabds[3]; ASSERT3U(parity, <=, 3); for (i = 0; i < parity; i++) { - cabds_is_gang_abd[i] = 
abd_is_gang(cabds[i]); - tabds_is_gang_abd[i] = abd_is_gang(tabds[i]); + abd_verify(cabds[i]); + abd_verify(tabds[i]); + ASSERT3U(tsize, <=, cabds[i]->abd_size); + ASSERT3U(tsize, <=, tabds[i]->abd_size); c_cabds[i] = abd_init_abd_iter(cabds[i], &citers[i], 0); c_tabds[i] = @@ -1171,36 +1137,18 @@ abd_raidz_rec_iterate(abd_t **cabds, abd_t **tabds, abd_enter_critical(flags); while (tsize > 0) { - + len = tsize; for (i = 0; i < parity; i++) { - /* - * If we are at the end of the gang ABD we - * are done. - */ - if (cabds_is_gang_abd[i] && !c_cabds[i]) - break; - if (tabds_is_gang_abd[i] && !c_tabds[i]) - break; + IMPLY(abd_is_gang(cabds[i]), c_cabds[i] != NULL); + IMPLY(abd_is_gang(tabds[i]), c_tabds[i] != NULL); abd_iter_map(&citers[i]); abd_iter_map(&xiters[i]); caddrs[i] = citers[i].iter_mapaddr; xaddrs[i] = xiters[i].iter_mapaddr; + len = MIN(citers[i].iter_mapsize, len); + len = MIN(xiters[i].iter_mapsize, len); } - len = tsize; - switch (parity) { - case 3: - len = MIN(xiters[2].iter_mapsize, len); - len = MIN(citers[2].iter_mapsize, len); - zfs_fallthrough; - case 2: - len = MIN(xiters[1].iter_mapsize, len); - len = MIN(citers[1].iter_mapsize, len); - zfs_fallthrough; - case 1: - len = MIN(xiters[0].iter_mapsize, len); - len = MIN(citers[0].iter_mapsize, len); - } /* must be progressive */ ASSERT3S(len, >, 0); /* From c34fe8dcbcb710081d8927b76bab06dd43c20c8c Mon Sep 17 00:00:00 2001 From: Alan Somers Date: Tue, 7 Nov 2023 12:34:50 -0700 Subject: [PATCH 09/91] Update the kstat dataset_name when renaming a zvol Add a dataset_kstats_rename function, and call it when renaming a zvol on FreeBSD and Linux. 
Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Alan Somers Sponsored-by: Axcient Closes #15482 Closes #15486 --- include/sys/dataset_kstats.h | 1 + module/os/freebsd/zfs/zvol_os.c | 1 + module/os/linux/zfs/zvol_os.c | 2 ++ module/zfs/dataset_kstats.c | 12 ++++++++++++ 4 files changed, 16 insertions(+) diff --git a/include/sys/dataset_kstats.h b/include/sys/dataset_kstats.h index 40cf5258a2e..c81a07f0c11 100644 --- a/include/sys/dataset_kstats.h +++ b/include/sys/dataset_kstats.h @@ -71,6 +71,7 @@ typedef struct dataset_kstats { int dataset_kstats_create(dataset_kstats_t *, objset_t *); void dataset_kstats_destroy(dataset_kstats_t *); +void dataset_kstats_rename(dataset_kstats_t *dk, const char *); void dataset_kstats_update_write_kstats(dataset_kstats_t *, int64_t); void dataset_kstats_update_read_kstats(dataset_kstats_t *, int64_t); diff --git a/module/os/freebsd/zfs/zvol_os.c b/module/os/freebsd/zfs/zvol_os.c index 2520507b98a..b6edac434de 100644 --- a/module/os/freebsd/zfs/zvol_os.c +++ b/module/os/freebsd/zfs/zvol_os.c @@ -1333,6 +1333,7 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname) } } strlcpy(zv->zv_name, newname, sizeof (zv->zv_name)); + dataset_kstats_rename(&zv->zv_kstat, newname); } /* diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index f94ce69fb9e..8562e989738 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -1521,6 +1521,8 @@ zvol_os_rename_minor(zvol_state_t *zv, const char *newname) */ set_disk_ro(zv->zv_zso->zvo_disk, !readonly); set_disk_ro(zv->zv_zso->zvo_disk, readonly); + + dataset_kstats_rename(&zv->zv_kstat, newname); } void diff --git a/module/zfs/dataset_kstats.c b/module/zfs/dataset_kstats.c index 767a461e002..2ac058fd2c9 100644 --- a/module/zfs/dataset_kstats.c +++ b/module/zfs/dataset_kstats.c @@ -198,6 +198,18 @@ dataset_kstats_destroy(dataset_kstats_t *dk) zil_sums_fini(&dk->dk_zil_sums); } +void 
+dataset_kstats_rename(dataset_kstats_t *dk, const char *name) +{ + dataset_kstat_values_t *dkv = dk->dk_kstats->ks_data; + char *ds_name; + + ds_name = KSTAT_NAMED_STR_PTR(&dkv->dkv_ds_name); + ASSERT3S(ds_name, !=, NULL); + (void) strlcpy(ds_name, name, + KSTAT_NAMED_STR_BUFLEN(&dkv->dkv_ds_name)); +} + void dataset_kstats_update_write_kstats(dataset_kstats_t *dk, int64_t nwritten) From f13593619b074dff63f6940d32033d2f147166e3 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 7 Nov 2023 14:35:40 -0500 Subject: [PATCH 10/91] FreeBSD: Optimize large kstat outputs - Use sbuf_new_for_sysctl() to reduce double-buffering on sysctl output. - Use much faster sbuf_cat() instead of sbuf_printf("%s"). Together it reduces `sysctl kstat.zfs.misc.dbufs` time from minutes to seconds, making dbufstat almost usable. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15495 --- module/os/freebsd/spl/spl_kstat.c | 38 +++++++++++++------------------ 1 file changed, 16 insertions(+), 22 deletions(-) diff --git a/module/os/freebsd/spl/spl_kstat.c b/module/os/freebsd/spl/spl_kstat.c index 9f5f92e194e..43cd4da02e3 100644 --- a/module/os/freebsd/spl/spl_kstat.c +++ b/module/os/freebsd/spl/spl_kstat.c @@ -187,19 +187,18 @@ kstat_sysctl_dataset_string(SYSCTL_HANDLER_ARGS) static int kstat_sysctl_io(SYSCTL_HANDLER_ARGS) { - struct sbuf *sb; + struct sbuf sb; kstat_t *ksp = arg1; kstat_io_t *kip = ksp->ks_data; int rc; - sb = sbuf_new_auto(); - if (sb == NULL) - return (ENOMEM); + sbuf_new_for_sysctl(&sb, NULL, 0, req); + /* Update the aggsums before reading */ (void) ksp->ks_update(ksp, KSTAT_READ); /* though wlentime & friends are signed, they will never be negative */ - sbuf_printf(sb, + sbuf_printf(&sb, "%-8llu %-8llu %-8u %-8u %-8llu %-8llu " "%-8llu %-8llu %-8llu %-8llu %-8u %-8u\n", kip->nread, kip->nwritten, @@ -207,25 +206,21 @@ kstat_sysctl_io(SYSCTL_HANDLER_ARGS) kip->wtime, kip->wlentime, kip->wlastupdate, kip->rtime, 
kip->rlentime, kip->rlastupdate, kip->wcnt, kip->rcnt); - rc = sbuf_finish(sb); - if (rc == 0) - rc = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb)); - sbuf_delete(sb); + rc = sbuf_finish(&sb); + sbuf_delete(&sb); return (rc); } static int kstat_sysctl_raw(SYSCTL_HANDLER_ARGS) { - struct sbuf *sb; + struct sbuf sb; void *data; kstat_t *ksp = arg1; void *(*addr_op)(kstat_t *ksp, loff_t index); int n, has_header, rc = 0; - sb = sbuf_new_auto(); - if (sb == NULL) - return (ENOMEM); + sbuf_new_for_sysctl(&sb, NULL, PAGE_SIZE, req); if (ksp->ks_raw_ops.addr) addr_op = ksp->ks_raw_ops.addr; @@ -258,8 +253,10 @@ restart_headers: if (has_header) { if (rc == ENOMEM && !kstat_resize_raw(ksp)) goto restart_headers; - if (rc == 0) - sbuf_printf(sb, "\n%s", ksp->ks_raw_buf); + if (rc == 0) { + sbuf_cat(&sb, "\n"); + sbuf_cat(&sb, ksp->ks_raw_buf); + } } while ((data = addr_op(ksp, n)) != NULL) { @@ -270,22 +267,19 @@ restart: if (rc == ENOMEM && !kstat_resize_raw(ksp)) goto restart; if (rc == 0) - sbuf_printf(sb, "%s", ksp->ks_raw_buf); + sbuf_cat(&sb, ksp->ks_raw_buf); } else { ASSERT3U(ksp->ks_ndata, ==, 1); - sbuf_hexdump(sb, ksp->ks_data, + sbuf_hexdump(&sb, ksp->ks_data, ksp->ks_data_size, NULL, 0); } n++; } free(ksp->ks_raw_buf, M_TEMP); mutex_exit(ksp->ks_lock); - sbuf_trim(sb); - rc = sbuf_finish(sb); - if (rc == 0) - rc = SYSCTL_OUT(req, sbuf_data(sb), sbuf_len(sb)); - sbuf_delete(sb); + rc = sbuf_finish(&sb); + sbuf_delete(&sb); return (rc); } From a8c29a79df2dd0ca8cf31f0f2b35475d10567fdb Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 10 Nov 2023 13:34:46 -0500 Subject: [PATCH 11/91] Linux: Reclaim unused spl_kmem_cache_reclaim It is unused for 3 years since #10576. Reviewed-by: George Melikov Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #15507 --- include/os/linux/spl/sys/kmem_cache.h | 2 -- man/man4/spl.4 | 8 -------- module/os/linux/spl/spl-kmem-cache.c | 11 ----------- 3 files changed, 21 deletions(-) diff --git a/include/os/linux/spl/sys/kmem_cache.h b/include/os/linux/spl/sys/kmem_cache.h index 82d50b6034c..b159bb52d11 100644 --- a/include/os/linux/spl/sys/kmem_cache.h +++ b/include/os/linux/spl/sys/kmem_cache.h @@ -70,8 +70,6 @@ typedef enum kmem_cbrc { #define KMC_REAP_CHUNK INT_MAX #define KMC_DEFAULT_SEEKS 1 -#define KMC_RECLAIM_ONCE 0x1 /* Force a single shrinker pass */ - extern struct list_head spl_kmem_cache_list; extern struct rw_semaphore spl_kmem_cache_sem; diff --git a/man/man4/spl.4 b/man/man4/spl.4 index 82455fb5325..414a9239485 100644 --- a/man/man4/spl.4 +++ b/man/man4/spl.4 @@ -31,14 +31,6 @@ for use by the kmem caches. For the majority of systems and workloads only a small number of threads are required. . -.It Sy spl_kmem_cache_reclaim Ns = Ns Sy 0 Pq uint -When this is set it prevents Linux from being able to rapidly reclaim all the -memory held by the kmem caches. -This may be useful in circumstances where it's preferable that Linux -reclaim memory from some other subsystem first. -Setting this will increase the likelihood out of memory events on a memory -constrained system. -. .It Sy spl_kmem_cache_obj_per_slab Ns = Ns Sy 8 Pq uint The preferred number of objects per slab in the cache. In general, a larger value will increase the caches memory footprint diff --git a/module/os/linux/spl/spl-kmem-cache.c b/module/os/linux/spl/spl-kmem-cache.c index 3c30dfc577b..a2920c74667 100644 --- a/module/os/linux/spl/spl-kmem-cache.c +++ b/module/os/linux/spl/spl-kmem-cache.c @@ -76,17 +76,6 @@ module_param(spl_kmem_cache_magazine_size, uint, 0444); MODULE_PARM_DESC(spl_kmem_cache_magazine_size, "Default magazine size (2-256), set automatically (0)"); -/* - * The default behavior is to report the number of objects remaining in the - * cache. 
This allows the Linux VM to repeatedly reclaim objects from the - * cache when memory is low satisfy other memory allocations. Alternately, - * setting this value to KMC_RECLAIM_ONCE limits how aggressively the cache - * is reclaimed. This may increase the likelihood of out of memory events. - */ -static unsigned int spl_kmem_cache_reclaim = 0 /* KMC_RECLAIM_ONCE */; -module_param(spl_kmem_cache_reclaim, uint, 0644); -MODULE_PARM_DESC(spl_kmem_cache_reclaim, "Single reclaim pass (0x1)"); - static unsigned int spl_kmem_cache_obj_per_slab = SPL_KMEM_CACHE_OBJ_PER_SLAB; module_param(spl_kmem_cache_obj_per_slab, uint, 0644); MODULE_PARM_DESC(spl_kmem_cache_obj_per_slab, "Number of objects per slab"); From 2e259c6f00142165588b492fd52cb8267d7aa753 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 14 Nov 2023 16:47:57 -0500 Subject: [PATCH 12/91] L2ARC: Restrict write size to 1/4 of the device PR #15457 exposed weird logic in L2ARC write sizing. If the write size appeared bigger than the device size, instead of limiting the write it reset all the system-wide tunables to their defaults. Aside from being excessive, it did not actually help with the problem, still allowing an infinite loop to happen. This patch removes the tunables reverting logic, but instead limits L2ARC writes (or at least eviction/trim) to 1/4 of the capacity. Reviewed-by: Brian Behlendorf Reviewed-by: George Amanakis Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc.
Closes #15519 --- module/zfs/arc.c | 30 +++---------------- .../tests/functional/cache/cache_012_pos.ksh | 20 ++++--------- 2 files changed, 10 insertions(+), 40 deletions(-) diff --git a/module/zfs/arc.c b/module/zfs/arc.c index dfea15b7439..4db6c06148b 100644 --- a/module/zfs/arc.c +++ b/module/zfs/arc.c @@ -8031,9 +8031,8 @@ l2arc_write_size(l2arc_dev_t *dev) */ size = l2arc_write_max; if (size == 0) { - cmn_err(CE_NOTE, "Bad value for l2arc_write_max, value must " - "be greater than zero, resetting it to the default (%d)", - L2ARC_WRITE_SIZE); + cmn_err(CE_NOTE, "l2arc_write_max must be greater than zero, " + "resetting it to the default (%d)", L2ARC_WRITE_SIZE); size = l2arc_write_max = L2ARC_WRITE_SIZE; } @@ -8056,30 +8055,9 @@ l2arc_write_size(l2arc_dev_t *dev) * device. This is important in l2arc_evict(), otherwise infinite * iteration can occur. */ - if (size > dev->l2ad_end - dev->l2ad_start) { - cmn_err(CE_NOTE, "l2arc_write_max or l2arc_write_boost " - "plus the overhead of log blocks (persistent L2ARC, " - "%llu bytes) exceeds the size of the cache device " - "(guid %llu), resetting them to the default (%d)", - (u_longlong_t)l2arc_log_blk_overhead(size, dev), - (u_longlong_t)dev->l2ad_vdev->vdev_guid, L2ARC_WRITE_SIZE); + size = MIN(size, (dev->l2ad_end - dev->l2ad_start) / 4); - size = l2arc_write_max = l2arc_write_boost = L2ARC_WRITE_SIZE; - - if (l2arc_trim_ahead > 1) { - cmn_err(CE_NOTE, "l2arc_trim_ahead set to 1"); - l2arc_trim_ahead = 1; - } - - if (arc_warm == B_FALSE) - size += l2arc_write_boost; - - size += l2arc_log_blk_overhead(size, dev); - if (dev->l2ad_vdev->vdev_has_trim && l2arc_trim_ahead > 0) { - size += MAX(64 * 1024 * 1024, - (size * l2arc_trim_ahead) / 100); - } - } + size = P2ROUNDUP(size, 1ULL << dev->l2ad_vdev->vdev_ashift); return (size); diff --git a/tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh b/tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh index 74caa12a9cc..945db71bf11 100755 --- 
a/tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh +++ b/tests/zfs-tests/tests/functional/cache/cache_012_pos.ksh @@ -31,15 +31,13 @@ # 2. Set l2arc_write_max to a value larger than the cache device. # 3. Create a file larger than the cache device and random read # for 10 sec. -# 4. Verify that l2arc_write_max is set back to the default. -# 5. Set l2arc_write_max to a value less than the cache device size but +# 4. Set l2arc_write_max to a value less than the cache device size but # larger than the default (256MB). -# 6. Record the l2_size. -# 7. Random read for 1 sec. -# 8. Record the l2_size again. -# 9. If (6) <= (8) then we have not looped around yet. -# 10. If (6) > (8) then we looped around. Break out of the loop and test. -# 11. Destroy pool. +# 5. Record the l2_size. +# 6. Random read for 1 sec. +# 7. Record the l2_size again. +# 8. If (5) <= (7) then we have not looped around yet. +# 9. Destroy pool. # verify_runnable "global" @@ -93,10 +91,6 @@ log_must zfs set relatime=off $TESTPOOL log_must fio $FIO_SCRIPTS/mkfiles.fio log_must fio $FIO_SCRIPTS/random_reads.fio -typeset write_max2=$(get_tunable L2ARC_WRITE_MAX) - -log_must test $write_max2 -eq $write_max - log_must set_tunable32 L2ARC_WRITE_MAX $(( 256 * 1024 * 1024 )) export RUNTIME=1 @@ -108,8 +102,6 @@ while $do_once || [[ $l2_size1 -le $l2_size2 ]]; do do_once=false done -log_must test $l2_size1 -gt $l2_size2 - log_must zpool destroy $TESTPOOL log_pass "Looping around a cache device succeeds." From ad47eca195d048792a07a3d2dea05d369ad40900 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 28 Nov 2023 16:35:14 -0500 Subject: [PATCH 13/91] ZIL: Assert record sizes in different places This should make sure we have log written without overflows. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #15517 --- module/os/freebsd/zfs/zio_crypt.c | 9 +++-- module/os/linux/zfs/zio_crypt.c | 9 +++-- module/zfs/zfs_replay.c | 50 ++++++++++++++++++++++---- module/zfs/zil.c | 60 ++++++++++++++++++++----------- module/zfs/zio_checksum.c | 16 +++++---- module/zfs/zvol.c | 8 +++++ 6 files changed, 115 insertions(+), 37 deletions(-) diff --git a/module/os/freebsd/zfs/zio_crypt.c b/module/os/freebsd/zfs/zio_crypt.c index 024a931d781..b08916b317f 100644 --- a/module/os/freebsd/zfs/zio_crypt.c +++ b/module/os/freebsd/zfs/zio_crypt.c @@ -1251,7 +1251,7 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, iovec_t *dst_iovecs; zil_chain_t *zilc; lr_t *lr; - uint64_t txtype, lr_len; + uint64_t txtype, lr_len, nused; uint_t crypt_len, nr_iovecs, vec; uint_t aad_len = 0, total_len = 0; @@ -1268,7 +1268,10 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, zilc = (zil_chain_t *)src; slrp = src + sizeof (zil_chain_t); aadp = aadbuf; - blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused); + nused = ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused); + ASSERT3U(nused, >=, sizeof (zil_chain_t)); + ASSERT3U(nused, <=, datalen); + blkend = src + nused; /* * Calculate the number of encrypted iovecs we will need. 
@@ -1287,6 +1290,8 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, txtype = lr->lrc_txtype; lr_len = lr->lrc_reclen; } + ASSERT3U(lr_len, >=, sizeof (lr_t)); + ASSERT3U(lr_len, <=, blkend - slrp); nr_iovecs++; if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t)) diff --git a/module/os/linux/zfs/zio_crypt.c b/module/os/linux/zfs/zio_crypt.c index 775ab8efbcd..2114be28190 100644 --- a/module/os/linux/zfs/zio_crypt.c +++ b/module/os/linux/zfs/zio_crypt.c @@ -1405,7 +1405,7 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, boolean_t *no_crypt) { int ret; - uint64_t txtype, lr_len; + uint64_t txtype, lr_len, nused; uint_t nr_src, nr_dst, crypt_len; uint_t aad_len = 0, nr_iovecs = 0, total_len = 0; iovec_t *src_iovecs = NULL, *dst_iovecs = NULL; @@ -1432,7 +1432,10 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, zilc = (zil_chain_t *)src; slrp = src + sizeof (zil_chain_t); aadp = aadbuf; - blkend = src + ((byteswap) ? BSWAP_64(zilc->zc_nused) : zilc->zc_nused); + nused = ((byteswap) ? 
BSWAP_64(zilc->zc_nused) : zilc->zc_nused); + ASSERT3U(nused, >=, sizeof (zil_chain_t)); + ASSERT3U(nused, <=, datalen); + blkend = src + nused; /* calculate the number of encrypted iovecs we will need */ for (; slrp < blkend; slrp += lr_len) { @@ -1445,6 +1448,8 @@ zio_crypt_init_uios_zil(boolean_t encrypt, uint8_t *plainbuf, txtype = BSWAP_64(lr->lrc_txtype); lr_len = BSWAP_64(lr->lrc_reclen); } + ASSERT3U(lr_len, >=, sizeof (lr_t)); + ASSERT3U(lr_len, <=, blkend - slrp); nr_iovecs++; if (txtype == TX_WRITE && lr_len != sizeof (lr_write_t)) diff --git a/module/zfs/zfs_replay.c b/module/zfs/zfs_replay.c index 09c7be853bf..2e0af60f6db 100644 --- a/module/zfs/zfs_replay.c +++ b/module/zfs/zfs_replay.c @@ -309,6 +309,8 @@ zfs_replay_create_acl(void *arg1, void *arg2, boolean_t byteswap) uint64_t dnodesize; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lracl)); + txtype = (lr->lr_common.lrc_txtype & ~TX_CI); if (byteswap) { byteswap_uint64_array(lracl, sizeof (*lracl)); @@ -470,6 +472,8 @@ zfs_replay_create(void *arg1, void *arg2, boolean_t byteswap) uint64_t dnodesize; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr)); + txtype = (lr->lr_common.lrc_txtype & ~TX_CI); if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); @@ -613,6 +617,8 @@ zfs_replay_remove(void *arg1, void *arg2, boolean_t byteswap) int error; int vflg = 0; + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -648,6 +654,8 @@ zfs_replay_link(void *arg1, void *arg2, boolean_t byteswap) int error; int vflg = 0; + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -715,12 +723,14 @@ zfs_replay_rename(void *arg1, void *arg2, boolean_t byteswap) { zfsvfs_t *zfsvfs = arg1; lr_rename_t *lr = arg2; - char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ - char *tname = sname + strlen(sname) + 1; + + ASSERT3U(lr->lr_common.lrc_reclen, 
>, sizeof (*lr)); if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); + char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ + char *tname = sname + strlen(sname) + 1; return (do_zfs_replay_rename(zfsvfs, lr, sname, tname, 0, NULL)); } @@ -730,12 +740,14 @@ zfs_replay_rename_exchange(void *arg1, void *arg2, boolean_t byteswap) #ifdef __linux__ zfsvfs_t *zfsvfs = arg1; lr_rename_t *lr = arg2; - char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ - char *tname = sname + strlen(sname) + 1; + + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr)); if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); + char *sname = (char *)(lr + 1); /* sname and tname follow lr_rename_t */ + char *tname = sname + strlen(sname) + 1; return (do_zfs_replay_rename(zfsvfs, lr, sname, tname, RENAME_EXCHANGE, NULL)); #else @@ -750,14 +762,13 @@ zfs_replay_rename_whiteout(void *arg1, void *arg2, boolean_t byteswap) zfsvfs_t *zfsvfs = arg1; lr_rename_whiteout_t *lr = arg2; int error; - /* sname and tname follow lr_rename_whiteout_t */ - char *sname = (char *)(lr + 1); - char *tname = sname + strlen(sname) + 1; /* For the whiteout file. 
*/ xvattr_t xva; uint64_t objid; uint64_t dnodesize; + ASSERT3U(lr->lr_rename.lr_common.lrc_reclen, >, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -783,6 +794,9 @@ zfs_replay_rename_whiteout(void *arg1, void *arg2, boolean_t byteswap) if (error) return (error); + /* sname and tname follow lr_rename_whiteout_t */ + char *sname = (char *)(lr + 1); + char *tname = sname + strlen(sname) + 1; return (do_zfs_replay_rename(zfsvfs, &lr->lr_rename, sname, tname, RENAME_WHITEOUT, &xva.xva_vattr)); #else @@ -800,6 +814,8 @@ zfs_replay_write(void *arg1, void *arg2, boolean_t byteswap) int error; uint64_t eod, offset, length; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -863,6 +879,8 @@ zfs_replay_write2(void *arg1, void *arg2, boolean_t byteswap) int error; uint64_t end; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -910,6 +928,8 @@ zfs_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) flock64_t fl = {0}; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -940,6 +960,8 @@ zfs_replay_setattr(void *arg1, void *arg2, boolean_t byteswap) int error; void *start; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + xva_init(&xva); if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); @@ -1002,6 +1024,9 @@ zfs_replay_setsaxattr(void *arg1, void *arg2, boolean_t byteswap) size_t size; int error = 0; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >, sizeof (*lr) + lr->lr_size); + ASSERT(spa_feature_is_active(zfsvfs->z_os->os_spa, SPA_FEATURE_ZILSAXATTR)); if (byteswap) @@ -1079,6 +1104,10 @@ zfs_replay_acl_v0(void *arg1, void *arg2, boolean_t byteswap) znode_t *zp; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof 
(*lr) + + sizeof (ace_t) * lr->lr_aclcnt); + if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); zfs_oldace_byteswap(ace, lr->lr_aclcnt); @@ -1124,6 +1153,9 @@ zfs_replay_acl(void *arg1, void *arg2, boolean_t byteswap) znode_t *zp; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr) + lr->lr_acl_bytes); + if (byteswap) { byteswap_uint64_array(lr, sizeof (*lr)); zfs_ace_byteswap(ace, lr->lr_acl_bytes, B_FALSE); @@ -1171,6 +1203,10 @@ zfs_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap) znode_t *zp; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >=, offsetof(lr_clone_range_t, + lr_bps[lr->lr_nbps])); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); diff --git a/module/zfs/zil.c b/module/zfs/zil.c index a1188613699..37fb792f5dd 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -522,6 +522,7 @@ zil_parse(zilog_t *zilog, zil_parse_blk_func_t *parse_blk_func, lr_t *lr = (lr_t *)lrp; reclen = lr->lrc_reclen; ASSERT3U(reclen, >=, sizeof (lr_t)); + ASSERT3U(reclen, <=, end - lrp); if (lr->lrc_seq > claim_lr_seq) { arc_buf_destroy(abuf, &abuf); goto done; @@ -604,7 +605,7 @@ zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg) lr_write_t *lr = (lr_write_t *)lrc; int error; - ASSERT(lrc->lrc_txtype == TX_WRITE); + ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr)); /* * If the block is not readable, don't claim it. 
This can happen @@ -631,7 +632,9 @@ zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx) spa_t *spa; uint_t ii; - ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE); + ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lrc->lrc_reclen, >=, offsetof(lr_clone_range_t, + lr_bps[lr->lr_nbps])); if (tx == NULL) { return (0); @@ -691,7 +694,7 @@ zil_free_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t claim_txg) lr_write_t *lr = (lr_write_t *)lrc; blkptr_t *bp = &lr->lr_blkptr; - ASSERT(lrc->lrc_txtype == TX_WRITE); + ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr)); /* * If we previously claimed it, we need to free it. @@ -712,7 +715,9 @@ zil_free_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx) spa_t *spa; uint_t ii; - ASSERT(lrc->lrc_txtype == TX_CLONE_RANGE); + ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lrc->lrc_reclen, >=, offsetof(lr_clone_range_t, + lr_bps[lr->lr_nbps])); if (tx == NULL) { return (0); @@ -1794,6 +1799,7 @@ zil_lwb_write_issue(zilog_t *zilog, lwb_t *lwb) itx = list_next(&lwb->lwb_itxs, itx)) zil_lwb_commit(zilog, lwb, itx); lwb->lwb_nused = lwb->lwb_nfilled; + ASSERT3U(lwb->lwb_nused, <=, lwb->lwb_nmax); lwb->lwb_root_zio = zio_root(spa, zil_lwb_flush_vdevs_done, lwb, ZIO_FLAG_CANFAIL); @@ -2023,13 +2029,16 @@ zil_lwb_assign(zilog_t *zilog, lwb_t *lwb, itx_t *itx, list_t *ilwbs) return (lwb); } + reclen = lr->lrc_reclen; if (lr->lrc_txtype == TX_WRITE && itx->itx_wr_state == WR_NEED_COPY) { + ASSERT3U(reclen, ==, sizeof (lr_write_t)); dlen = P2ROUNDUP_TYPED( lrw->lr_length, sizeof (uint64_t), uint64_t); } else { + ASSERT3U(reclen, >=, sizeof (lr_t)); dlen = 0; } - reclen = lr->lrc_reclen; + ASSERT3U(reclen, <=, zil_max_log_data(zilog, 0)); zilog->zl_cur_used += (reclen + dlen); cont: @@ -2048,19 +2057,19 @@ cont: if (lwb == NULL) return (NULL); lwb_sp = lwb->lwb_nmax - lwb->lwb_nused; - - /* - * There must be enough space in the new, empty log block to - * hold reclen. 
For WR_COPIED, we need to fit the whole - * record in one block, and reclen is the header size + the - * data size. For WR_NEED_COPY, we can create multiple - * records, splitting the data into multiple blocks, so we - * only need to fit one word of data per block; in this case - * reclen is just the header size (no data). - */ - ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp); } + /* + * There must be enough space in the log block to hold reclen. + * For WR_COPIED, we need to fit the whole record in one block, + * and reclen is the write record header size + the data size. + * For WR_NEED_COPY, we can create multiple records, splitting + * the data into multiple blocks, so we only need to fit one + * word of data per block; in this case reclen is just the header + * size (no data). + */ + ASSERT3U(reclen + MIN(dlen, sizeof (uint64_t)), <=, lwb_sp); + dnow = MIN(dlen, lwb_sp - reclen); if (dlen > dnow) { ASSERT3U(lr->lrc_txtype, ==, TX_WRITE); @@ -2236,7 +2245,9 @@ zil_itx_create(uint64_t txtype, size_t olrsize) size_t itxsize, lrsize; itx_t *itx; + ASSERT3U(olrsize, >=, sizeof (lr_t)); lrsize = P2ROUNDUP_TYPED(olrsize, sizeof (uint64_t), size_t); + ASSERT3U(lrsize, >=, olrsize); itxsize = offsetof(itx_t, itx_lr) + lrsize; itx = zio_data_buf_alloc(itxsize); @@ -2255,6 +2266,10 @@ zil_itx_create(uint64_t txtype, size_t olrsize) static itx_t * zil_itx_clone(itx_t *oitx) { + ASSERT3U(oitx->itx_size, >=, sizeof (itx_t)); + ASSERT3U(oitx->itx_size, ==, + offsetof(itx_t, itx_lr) + oitx->itx_lr.lrc_reclen); + itx_t *itx = zio_data_buf_alloc(oitx->itx_size); memcpy(itx, oitx, oitx->itx_size); itx->itx_callback = NULL; @@ -2265,6 +2280,9 @@ zil_itx_clone(itx_t *oitx) void zil_itx_destroy(itx_t *itx) { + ASSERT3U(itx->itx_size, >=, sizeof (itx_t)); + ASSERT3U(itx->itx_lr.lrc_reclen, ==, + itx->itx_size - offsetof(itx_t, itx_lr)); IMPLY(itx->itx_lr.lrc_txtype == TX_COMMIT, itx->itx_callback == NULL); IMPLY(itx->itx_callback != NULL, itx->itx_lr.lrc_txtype != 
TX_COMMIT); @@ -2348,7 +2366,7 @@ void zil_remove_async(zilog_t *zilog, uint64_t oid) { uint64_t otxg, txg; - itx_async_node_t *ian; + itx_async_node_t *ian, ian_search; avl_tree_t *t; avl_index_t where; list_t clean_list; @@ -2375,7 +2393,8 @@ zil_remove_async(zilog_t *zilog, uint64_t oid) * Locate the object node and append its list. */ t = &itxg->itxg_itxs->i_async_tree; - ian = avl_find(t, &oid, &where); + ian_search.ia_foid = oid; + ian = avl_find(t, &ian_search, &where); if (ian != NULL) list_move_tail(&clean_list, &ian->ia_list); mutex_exit(&itxg->itxg_lock); @@ -2573,7 +2592,7 @@ void zil_async_to_sync(zilog_t *zilog, uint64_t foid) { uint64_t otxg, txg; - itx_async_node_t *ian; + itx_async_node_t *ian, ian_search; avl_tree_t *t; avl_index_t where; @@ -2603,7 +2622,8 @@ zil_async_to_sync(zilog_t *zilog, uint64_t foid) */ t = &itxg->itxg_itxs->i_async_tree; if (foid != 0) { - ian = avl_find(t, &foid, &where); + ian_search.ia_foid = foid; + ian = avl_find(t, &ian_search, &where); if (ian != NULL) { list_move_tail(&itxg->itxg_itxs->i_sync_list, &ian->ia_list); diff --git a/module/zfs/zio_checksum.c b/module/zfs/zio_checksum.c index 9de515e8767..e511b31fee6 100644 --- a/module/zfs/zio_checksum.c +++ b/module/zfs/zio_checksum.c @@ -363,11 +363,14 @@ zio_checksum_compute(zio_t *zio, enum zio_checksum checksum, zil_chain_t zilc; abd_copy_to_buf(&zilc, abd, sizeof (zil_chain_t)); - size = P2ROUNDUP_TYPED(zilc.zc_nused, ZIL_MIN_BLKSZ, - uint64_t); + uint64_t nused = P2ROUNDUP_TYPED(zilc.zc_nused, + ZIL_MIN_BLKSZ, uint64_t); + ASSERT3U(size, >=, nused); + size = nused; eck = zilc.zc_eck; eck_offset = offsetof(zil_chain_t, zc_eck); } else { + ASSERT3U(size, >=, sizeof (zio_eck_t)); eck_offset = size - sizeof (zio_eck_t); abd_copy_to_buf_off(&eck, abd, eck_offset, sizeof (zio_eck_t)); @@ -448,12 +451,13 @@ zio_checksum_error_impl(spa_t *spa, const blkptr_t *bp, return (SET_ERROR(ECKSUM)); } - if (nused > size) { + nused = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, 
uint64_t); + if (size < nused) return (SET_ERROR(ECKSUM)); - } - - size = P2ROUNDUP_TYPED(nused, ZIL_MIN_BLKSZ, uint64_t); + size = nused; } else { + if (size < sizeof (zio_eck_t)) + return (SET_ERROR(ECKSUM)); eck_offset = size - sizeof (zio_eck_t); abd_copy_to_buf_off(&eck, abd, eck_offset, sizeof (zio_eck_t)); diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 53dcb4dee44..91b2d9fcb53 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -417,6 +417,8 @@ zvol_replay_truncate(void *arg1, void *arg2, boolean_t byteswap) lr_truncate_t *lr = arg2; uint64_t offset, length; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -453,6 +455,8 @@ zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap) dmu_tx_t *tx; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + if (byteswap) byteswap_uint64_array(lr, sizeof (*lr)); @@ -506,6 +510,10 @@ zvol_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap) uint_t ii; int error; + ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); + ASSERT3U(lr->lr_common.lrc_reclen, >=, offsetof(lr_clone_range_t, + lr_bps[lr->lr_nbps])); + dmu_objset_name(os, name); cmn_err(CE_WARN, "ZFS dropping block cloning transaction for %s.", name); From e48195c816edbea0efeb41436811af353ae26a35 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 1 Dec 2023 14:50:10 -0500 Subject: [PATCH 14/91] ZIO: Add overflow checks for linear buffers Since we use a limited set of kmem caches, quite often we have unused memory after the end of the buffer. Put there up to a 512-byte canary when built with debug to detect buffer overflows at the free time. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #15553 --- lib/libspl/include/assert.h | 3 ++ module/zfs/zio.c | 57 +++++++++++++++++++++++++++++++++++-- 2 files changed, 58 insertions(+), 2 deletions(-) diff --git a/lib/libspl/include/assert.h b/lib/libspl/include/assert.h index c5bf0f0cc8f..af4957dfbaa 100644 --- a/lib/libspl/include/assert.h +++ b/lib/libspl/include/assert.h @@ -64,6 +64,9 @@ libspl_assert(const char *buf, const char *file, const char *func, int line) #undef verify #endif +#define PANIC(fmt, a...) \ + libspl_assertf(__FILE__, __FUNCTION__, __LINE__, fmt, ## a) + #define VERIFY(cond) \ (void) ((!(cond)) && \ libspl_assert(#cond, __FILE__, __FUNCTION__, __LINE__)) diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 3b3b40fa73d..d8eb075eef5 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -306,6 +306,53 @@ zio_fini(void) * ========================================================================== */ +#ifdef ZFS_DEBUG +static const ulong_t zio_buf_canary = (ulong_t)0xdeadc0dedead210b; +#endif + +/* + * Use empty space after the buffer to detect overflows. + * + * Since zio_init() creates kmem caches only for a certain set of buffer sizes, + * allocations of different sizes may have some unused space after the data. + * Filling part of that space with a known pattern on allocation and checking + * it on free should allow us to detect some buffer overflows.
+ */ +static void +zio_buf_put_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c) +{ +#ifdef ZFS_DEBUG + size_t off = P2ROUNDUP(size, sizeof (ulong_t)); + ulong_t *canary = p + off / sizeof (ulong_t); + size_t asize = (c + 1) << SPA_MINBLOCKSHIFT; + if (c + 1 < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT && + cache[c] == cache[c + 1]) + asize = (c + 2) << SPA_MINBLOCKSHIFT; + for (; off < asize; canary++, off += sizeof (ulong_t)) + *canary = zio_buf_canary; +#endif +} + +static void +zio_buf_check_canary(ulong_t *p, size_t size, kmem_cache_t **cache, size_t c) +{ +#ifdef ZFS_DEBUG + size_t off = P2ROUNDUP(size, sizeof (ulong_t)); + ulong_t *canary = p + off / sizeof (ulong_t); + size_t asize = (c + 1) << SPA_MINBLOCKSHIFT; + if (c + 1 < SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT && + cache[c] == cache[c + 1]) + asize = (c + 2) << SPA_MINBLOCKSHIFT; + for (; off < asize; canary++, off += sizeof (ulong_t)) { + if (unlikely(*canary != zio_buf_canary)) { + PANIC("ZIO buffer overflow %p (%zu) + %zu %#lx != %#lx", + p, size, (canary - p) * sizeof (ulong_t), + *canary, zio_buf_canary); + } + } +#endif +} + /* * Use zio_buf_alloc to allocate ZFS metadata. This data will appear in a * crashdump if the kernel panics, so use it judiciously. 
Obviously, it's @@ -322,7 +369,9 @@ zio_buf_alloc(size_t size) atomic_add_64(&zio_buf_cache_allocs[c], 1); #endif - return (kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE)); + void *p = kmem_cache_alloc(zio_buf_cache[c], KM_PUSHPAGE); + zio_buf_put_canary(p, size, zio_buf_cache, c); + return (p); } /* @@ -338,7 +387,9 @@ zio_data_buf_alloc(size_t size) VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); - return (kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE)); + void *p = kmem_cache_alloc(zio_data_buf_cache[c], KM_PUSHPAGE); + zio_buf_put_canary(p, size, zio_data_buf_cache, c); + return (p); } void @@ -351,6 +402,7 @@ zio_buf_free(void *buf, size_t size) atomic_add_64(&zio_buf_cache_frees[c], 1); #endif + zio_buf_check_canary(buf, size, zio_buf_cache, c); kmem_cache_free(zio_buf_cache[c], buf); } @@ -361,6 +413,7 @@ zio_data_buf_free(void *buf, size_t size) VERIFY3U(c, <, SPA_MAXBLOCKSIZE >> SPA_MINBLOCKSHIFT); + zio_buf_check_canary(buf, size, zio_data_buf_cache, c); kmem_cache_free(zio_data_buf_cache[c], buf); } From 3b8f2273622318461836dc27fa65e50126caff78 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 1 Dec 2023 18:23:20 -0500 Subject: [PATCH 15/91] ZIL: Remove TX_CLONE_RANGE replay for ZVOLs. zil_claim_clone_range() takes references on cloned blocks before ZIL replay. Later zil_free_clone_range() drops them after replay or on dataset destroy. The total balance is neutral. It means we do not need to do anything (drop the references) for not implemented yet TX_CLONE_RANGE replay for ZVOLs. This is a logical follow up to #15603. Reviewed-by: Kay Pedersen Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #15612 --- module/zfs/zvol.c | 60 +---------------------------------------------- 1 file changed, 1 insertion(+), 59 deletions(-) diff --git a/module/zfs/zvol.c b/module/zfs/zvol.c index 91b2d9fcb53..20ea71f2337 100644 --- a/module/zfs/zvol.c +++ b/module/zfs/zvol.c @@ -486,64 +486,6 @@ zvol_replay_write(void *arg1, void *arg2, boolean_t byteswap) return (error); } -/* - * Replay a TX_CLONE_RANGE ZIL transaction that didn't get committed - * after a system failure. - * - * TODO: For now we drop block cloning transations for ZVOLs as they are - * unsupported, but we still need to inform BRT about that as we - * claimed them during pool import. - * This situation can occur when we try to import a pool from a ZFS - * version supporting block cloning for ZVOLs into a system that - * has this ZFS version, that doesn't support block cloning for ZVOLs. - */ -static int -zvol_replay_clone_range(void *arg1, void *arg2, boolean_t byteswap) -{ - char name[ZFS_MAX_DATASET_NAME_LEN]; - zvol_state_t *zv = arg1; - objset_t *os = zv->zv_objset; - lr_clone_range_t *lr = arg2; - blkptr_t *bp; - dmu_tx_t *tx; - spa_t *spa; - uint_t ii; - int error; - - ASSERT3U(lr->lr_common.lrc_reclen, >=, sizeof (*lr)); - ASSERT3U(lr->lr_common.lrc_reclen, >=, offsetof(lr_clone_range_t, - lr_bps[lr->lr_nbps])); - - dmu_objset_name(os, name); - cmn_err(CE_WARN, "ZFS dropping block cloning transaction for %s.", - name); - - if (byteswap) - byteswap_uint64_array(lr, sizeof (*lr)); - - tx = dmu_tx_create(os); - error = dmu_tx_assign(tx, TXG_WAIT); - if (error) { - dmu_tx_abort(tx); - return (error); - } - - spa = os->os_spa; - - for (ii = 0; ii < lr->lr_nbps; ii++) { - bp = &lr->lr_bps[ii]; - - if (!BP_IS_HOLE(bp)) { - zio_free(spa, dmu_tx_get_txg(tx), bp); - } - } - - (void) zil_replaying(zv->zv_zilog, tx); - dmu_tx_commit(tx); - - return (0); -} - static int zvol_replay_err(void *arg1, void *arg2, boolean_t byteswap) { @@ -578,7 +520,7 @@ zil_replay_func_t *const 
zvol_replay_vector[TX_MAX_TYPE] = { zvol_replay_err, /* TX_SETSAXATTR */ zvol_replay_err, /* TX_RENAME_EXCHANGE */ zvol_replay_err, /* TX_RENAME_WHITEOUT */ - zvol_replay_clone_range /* TX_CLONE_RANGE */ + zvol_replay_err, /* TX_CLONE_RANGE */ }; /* From e11b3eb1c608754d98ecb91e2c3dc8d5700ec7a8 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 5 Dec 2023 13:58:11 -0500 Subject: [PATCH 16/91] ZIL: Do not clone blocks from the future ZIL claim can not handle block pointers cloned from the future, since they are not yet allocated at that point. It may happen either if the block was just written when it was cloned, or if the pool was frozen or somehow else rewound on import. Handle it from two sides: prevent cloning of blocks with physical birth time from not yet synced or frozen TXG, and abort ZIL claim if we still detect such blocks due to rewind or something else. While there, assert that any cloned blocks we claim are really allocated by calling metaslab_check_free(). Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15617 --- module/zfs/dmu.c | 15 +++++++++++++++ module/zfs/zil.c | 38 ++++++++++++++++++++++++++++---------- 2 files changed, 43 insertions(+), 10 deletions(-) diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 3f626031de5..63464d74742 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -2255,6 +2255,21 @@ dmu_read_l0_bps(objset_t *os, uint64_t object, uint64_t offset, uint64_t length, goto out; } + /* + * If the block was allocated in a transaction group that is not + * yet synced, we could clone it, but we couldn't write this + * operation into ZIL, or it may be impossible to replay, since + * the block may appear not yet allocated at that point.
+ */ + if (BP_PHYSICAL_BIRTH(bp) > spa_freeze_txg(os->os_spa)) { + error = SET_ERROR(EINVAL); + goto out; + } + if (BP_PHYSICAL_BIRTH(bp) > spa_last_synced_txg(os->os_spa)) { + error = SET_ERROR(EAGAIN); + goto out; + } + bps[i] = *bp; } diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 37fb792f5dd..7670e172950 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -625,11 +625,12 @@ zil_claim_write(zilog_t *zilog, const lr_t *lrc, void *tx, uint64_t first_txg) } static int -zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx) +zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx, + uint64_t first_txg) { const lr_clone_range_t *lr = (const lr_clone_range_t *)lrc; const blkptr_t *bp; - spa_t *spa; + spa_t *spa = zilog->zl_spa; uint_t ii; ASSERT3U(lrc->lrc_reclen, >=, sizeof (*lr)); @@ -644,19 +645,36 @@ zil_claim_clone_range(zilog_t *zilog, const lr_t *lrc, void *tx) * XXX: Do we need to byteswap lr? */ - spa = zilog->zl_spa; - for (ii = 0; ii < lr->lr_nbps; ii++) { bp = &lr->lr_bps[ii]; /* - * When data in embedded into BP there is no need to create - * BRT entry as there is no data block. Just copy the BP as - * it contains the data. + * When data is embedded into the BP there is no need to create + * BRT entry as there is no data block. Just copy the BP as it + * contains the data. */ - if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) { + if (BP_IS_HOLE(bp) || BP_IS_EMBEDDED(bp)) + continue; + + /* + * We can not handle block pointers from the future, since they + * are not yet allocated. It should not normally happen, but + * just in case let's be safe and just stop here now instead of + * corrupting the pool. + */ + if (BP_PHYSICAL_BIRTH(bp) >= first_txg) + return (SET_ERROR(ENOENT)); + + /* + * Assert the block is really allocated before we reference it.
+ */ + metaslab_check_free(spa, bp); + } + + for (ii = 0; ii < lr->lr_nbps; ii++) { + bp = &lr->lr_bps[ii]; + if (!BP_IS_HOLE(bp) && !BP_IS_EMBEDDED(bp)) brt_pending_add(spa, bp, tx); - } } return (0); @@ -671,7 +689,7 @@ zil_claim_log_record(zilog_t *zilog, const lr_t *lrc, void *tx, case TX_WRITE: return (zil_claim_write(zilog, lrc, tx, first_txg)); case TX_CLONE_RANGE: - return (zil_claim_clone_range(zilog, lrc, tx)); + return (zil_claim_clone_range(zilog, lrc, tx, first_txg)); default: return (0); } From 121924575e48f45b402a9fe2282055946e065c4a Mon Sep 17 00:00:00 2001 From: oromenahar Date: Tue, 5 Dec 2023 20:03:48 +0100 Subject: [PATCH 17/91] Allow block cloning across encrypted datasets When two datasets share the same master encryption key, it is safe to clone encrypted blocks. Currently only snapshots and clones of a dataset share with it the same encryption key. Added a test for: - Clone from encrypted sibling to encrypted sibling with non encrypted parent - Clone from encrypted parent to inherited encrypted child - Clone from child to sibling with encrypted parent - Clone from snapshot to the original datasets - Clone from foreign snapshot to a foreign dataset - Cloning from non-encrypted to encrypted datasets - Cloning from encrypted to non-encrypted datasets Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Original-patch-by: Pawel Jakub Dawidek Signed-off-by: Kay Pedersen Closes #15544 --- include/sys/dsl_crypt.h | 1 + man/man7/zpool-features.7 | 9 +- module/zfs/brt.c | 6 +- module/zfs/dsl_crypt.c | 34 ++++ module/zfs/zfs_vnops.c | 25 ++- tests/runfiles/linux.run | 1 + tests/test-runner/bin/zts-report.py.in | 2 + tests/zfs-tests/tests/Makefile.am | 1 + .../block_cloning/block_cloning.kshlib | 12 +- .../block_cloning_cross_enc_dataset.ksh | 170 ++++++++++++++++++ 10 files changed, 236 insertions(+), 25 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/block_cloning/block_cloning_cross_enc_dataset.ksh diff --git 
a/include/sys/dsl_crypt.h b/include/sys/dsl_crypt.h index 72716e296c9..fbcae371535 100644 --- a/include/sys/dsl_crypt.h +++ b/include/sys/dsl_crypt.h @@ -206,6 +206,7 @@ void dsl_dataset_promote_crypt_sync(dsl_dir_t *target, dsl_dir_t *origin, dmu_tx_t *tx); int dmu_objset_create_crypt_check(dsl_dir_t *parentdd, dsl_crypto_params_t *dcp, boolean_t *will_encrypt); +boolean_t dmu_objset_crypto_key_equal(objset_t *osa, objset_t *osb); void dsl_dataset_create_crypt_sync(uint64_t dsobj, dsl_dir_t *dd, struct dsl_dataset *origin, dsl_crypto_params_t *dcp, dmu_tx_t *tx); uint64_t dsl_crypto_key_create_sync(uint64_t crypt, dsl_wrapping_key_t *wkey, diff --git a/man/man7/zpool-features.7 b/man/man7/zpool-features.7 index 8ca4bd927b2..8456a9aa764 100644 --- a/man/man7/zpool-features.7 +++ b/man/man7/zpool-features.7 @@ -364,9 +364,12 @@ When this feature is enabled ZFS will use block cloning for operations like Block cloning allows to create multiple references to a single block. It is much faster than copying the data (as the actual data is neither read nor written) and takes no additional space. -Blocks can be cloned across datasets under some conditions (like disabled -encryption and equal -.Nm recordsize ) . +Blocks can be cloned across datasets under some conditions (like equal +.Nm recordsize , +the same master encryption key, etc.). +ZFS tries its best to clone across datasets including encrypted ones. +This is limited for various (nontrivial) reasons depending on the OS +and/or ZFS internals. .Pp This feature becomes .Sy active diff --git a/module/zfs/brt.c b/module/zfs/brt.c index 759bc8d2e2b..a701c70fcfb 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -157,10 +157,8 @@ * (copying the file content to the new dataset and removing the source file). * In that case Block Cloning will only be used briefly, because the BRT entries * will be removed when the source is removed. 
- * Note: currently it is not possible to clone blocks between encrypted - * datasets, even if those datasets use the same encryption key (this includes - * snapshots of encrypted datasets). Cloning blocks between datasets that use - * the same keys should be possible and should be implemented in the future. + * Block Cloning across encrypted datasets is supported as long as both + * datasets share the same master key (e.g. snapshots and clones) * * Block Cloning flow through ZFS layers. * diff --git a/module/zfs/dsl_crypt.c b/module/zfs/dsl_crypt.c index 5e6e4e3d6c3..8e1055d9bcb 100644 --- a/module/zfs/dsl_crypt.c +++ b/module/zfs/dsl_crypt.c @@ -266,6 +266,40 @@ spa_crypto_key_compare(const void *a, const void *b) return (0); } +/* + * this compares a crypto key based on zk_guid. See comment on + * spa_crypto_key_compare for more information. + */ +boolean_t +dmu_objset_crypto_key_equal(objset_t *osa, objset_t *osb) +{ + dsl_crypto_key_t *dcka = NULL; + dsl_crypto_key_t *dckb = NULL; + uint64_t obja, objb; + boolean_t equal; + spa_t *spa; + + spa = dmu_objset_spa(osa); + if (spa != dmu_objset_spa(osb)) + return (B_FALSE); + obja = dmu_objset_ds(osa)->ds_object; + objb = dmu_objset_ds(osb)->ds_object; + + if (spa_keystore_lookup_key(spa, obja, FTAG, &dcka) != 0) + return (B_FALSE); + if (spa_keystore_lookup_key(spa, objb, FTAG, &dckb) != 0) { + spa_keystore_dsl_key_rele(spa, dcka, FTAG); + return (B_FALSE); + } + + equal = (dcka->dck_key.zk_guid == dckb->dck_key.zk_guid); + + spa_keystore_dsl_key_rele(spa, dcka, FTAG); + spa_keystore_dsl_key_rele(spa, dckb, FTAG); + + return (equal); +} + static int spa_key_mapping_compare(const void *a, const void *b) { diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 3a5fa75df2e..17e990451e0 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include #include @@ -1103,6 +1104,16 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, 
znode_t *outzp, return (SET_ERROR(EXDEV)); } + /* + * Cloning across encrypted datasets is possible only if they + * share the same master key. + */ + if (inos != outos && inos->os_encrypted && + !dmu_objset_crypto_key_equal(inos, outos)) { + zfs_exit_two(inzfsvfs, outzfsvfs, FTAG); + return (SET_ERROR(EXDEV)); + } + error = zfs_verify_zp(inzp); if (error == 0) error = zfs_verify_zp(outzp); @@ -1286,20 +1297,6 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, */ break; } - /* - * Encrypted data is fine as long as it comes from the same - * dataset. - * TODO: We want to extend it in the future to allow cloning to - * datasets with the same keys, like clones or to be able to - * clone a file from a snapshot of an encrypted dataset into the - * dataset itself. - */ - if (BP_IS_PROTECTED(&bps[0])) { - if (inzfsvfs != outzfsvfs) { - error = SET_ERROR(EXDEV); - break; - } - } /* * Start a transaction. diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 8bc55a1b4b4..fb78d96fb52 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -42,6 +42,7 @@ tests = ['block_cloning_copyfilerange', 'block_cloning_copyfilerange_partial', 'block_cloning_disabled_copyfilerange', 'block_cloning_disabled_ficlone', 'block_cloning_disabled_ficlonerange', 'block_cloning_copyfilerange_cross_dataset', + 'block_cloning_cross_enc_dataset', 'block_cloning_copyfilerange_fallback_same_txg'] tags = ['functional', 'block_cloning'] diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index 4608e87522a..b188a101c25 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -305,6 +305,8 @@ elif sys.platform.startswith('linux'): ['SKIP', cfr_cross_reason], 'block_cloning/block_cloning_copyfilerange_fallback_same_txg': ['SKIP', cfr_cross_reason], + 'block_cloning/block_cloning_cross_enc_dataset': + ['SKIP', cfr_cross_reason], }) diff --git 
a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 87b50f59ca7..21b830126b2 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -451,6 +451,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/block_cloning/block_cloning_ficlone.ksh \ functional/block_cloning/block_cloning_ficlonerange.ksh \ functional/block_cloning/block_cloning_ficlonerange_partial.ksh \ + functional/block_cloning/block_cloning_cross_enc_dataset.ksh \ functional/bootfs/bootfs_001_pos.ksh \ functional/bootfs/bootfs_002_neg.ksh \ functional/bootfs/bootfs_003_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning.kshlib b/tests/zfs-tests/tests/functional/block_cloning/block_cloning.kshlib index 8e16366b4cd..526bd54a2bf 100644 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning.kshlib +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning.kshlib @@ -28,8 +28,8 @@ function have_same_content { - typeset hash1=$(cat $1 | md5sum) - typeset hash2=$(cat $2 | md5sum) + typeset hash1=$(md5digest $1) + typeset hash2=$(md5digest $2) log_must [ "$hash1" = "$hash2" ] } @@ -44,10 +44,14 @@ function have_same_content # function get_same_blocks { + KEY=$5 + if [ ${#KEY} -gt 0 ]; then + KEY="--key=$KEY" + fi typeset zdbout=${TMPDIR:-$TEST_BASE_DIR}/zdbout.$$ - zdb -vvvvv $1 -O $2 | \ + zdb $KEY -vvvvv $1 -O $2 | \ awk '/ L0 / { print l++ " " $3 " " $7 }' > $zdbout.a - zdb -vvvvv $3 -O $4 | \ + zdb $KEY -vvvvv $3 -O $4 | \ awk '/ L0 / { print l++ " " $3 " " $7 }' > $zdbout.b echo $(sort $zdbout.a $zdbout.b | uniq -d | cut -f1 -d' ') } diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_cross_enc_dataset.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_cross_enc_dataset.ksh new file mode 100755 index 00000000000..fe8f0867b90 --- /dev/null +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_cross_enc_dataset.ksh @@ -0,0 +1,170 @@ 
+#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023, Kay Pedersen +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +verify_runnable "global" + +if [[ $(linux_version) -lt $(linux_version "5.3") ]]; then + log_unsupported "copy_file_range can't copy cross-filesystem before Linux 5.3" +fi + +claim="Block cloning across encrypted datasets." + +log_assert $claim + +DS1="$TESTPOOL/encrypted1" +DS2="$TESTPOOL/encrypted2" +DS1_NC="$TESTPOOL/notcrypted1" +PASSPHRASE="top_secret" + +function prepare_enc +{ + log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $DISKS + log_must eval "echo $PASSPHRASE | zfs create -o encryption=on" \ + "-o keyformat=passphrase -o keylocation=prompt $DS1" + log_must eval "echo $PASSPHRASE | zfs create -o encryption=on" \ + "-o keyformat=passphrase -o keylocation=prompt $DS2" + log_must zfs create $DS1/child1 + log_must zfs create $DS1/child2 + log_must zfs create $DS1_NC + + log_note "Create test file" + # we must wait until the src file txg is written to the disk otherwise we + # will fallback to normal copy. 
See "dmu_read_l0_bps" in + # "zfs/module/zfs/dmu.c" and "zfs_clone_range" in + # "zfs/module/zfs/zfs_vnops.c" + log_must dd if=/dev/urandom of=/$DS1/file bs=128K count=4 + log_must dd if=/dev/urandom of=/$DS1/child1/file bs=128K count=4 + log_must dd if=/dev/urandom of=/$DS1_NC/file bs=128K count=4 + log_must sync_pool $TESTPOOL +} + +function cleanup_enc +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL +} + +function clone_and_check +{ + I_FILE="$1" + O_FILE=$2 + I_DS=$3 + O_DS=$4 + SAME_BLOCKS=$5 + # the CLONE option provides a choice between copy_file_range + # which should clone and a dd which is a copy no matter what + CLONE=$6 + SNAPSHOT=$7 + if [ ${#SNAPSHOT} -gt 0 ]; then + I_FILE=".zfs/snapshot/$SNAPSHOT/$1" + fi + if [ $CLONE ]; then + log_must clonefile -f "/$I_DS/$I_FILE" "/$O_DS/$O_FILE" 0 0 524288 + else + log_must dd if="/$I_DS/$I_FILE" of="/$O_DS/$O_FILE" bs=128K + fi + log_must sync_pool $TESTPOOL + + log_must have_same_content "/$I_DS/$I_FILE" "/$O_DS/$O_FILE" + + if [ ${#SNAPSHOT} -gt 0 ]; then + I_DS="$I_DS@$SNAPSHOT" + I_FILE="$1" + fi + typeset blocks=$(get_same_blocks \ + $I_DS $I_FILE $O_DS $O_FILE $PASSPHRASE) + log_must [ "$blocks" = "$SAME_BLOCKS" ] +} + +log_onexit cleanup_enc + +prepare_enc + +log_note "Cloning entire file with copy_file_range across different enc" \ + "roots, should fallback" +# we are expecting no same block map. +clone_and_check "file" "clone" $DS1 $DS2 "" true +log_note "check if the file is still readable and the same after" \ + "unmount and key unload, shouldn't fail" +typeset hash1=$(md5digest "/$DS1/file") +log_must zfs umount $DS1 && zfs unload-key $DS1 +typeset hash2=$(md5digest "/$DS2/clone") +log_must [ "$hash1" = "$hash2" ] + +cleanup_enc +prepare_enc + +log_note "Cloning entire file with copy_file_range across different child datasets" +# clone shouldn't work because of deriving a new master key for the child +# we are expecting no same block map. 
+clone_and_check "file" "clone" $DS1 "$DS1/child1" "" true +clone_and_check "file" "clone" "$DS1/child1" "$DS1/child2" "" true + +cleanup_enc +prepare_enc + +log_note "Copying entire file with copy_file_range across same snapshot" +log_must zfs snapshot -r $DS1@s1 +log_must sync_pool $TESTPOOL +log_must rm -f "/$DS1/file" +log_must sync_pool $TESTPOOL +clone_and_check "file" "clone" "$DS1" "$DS1" "0 1 2 3" true "s1" + +cleanup_enc +prepare_enc + +log_note "Copying entire file with copy_file_range across different snapshot" +clone_and_check "file" "file" $DS1 $DS2 "" true +log_must zfs snapshot -r $DS2@s1 +log_must sync_pool $TESTPOOL +log_must rm -f "/$DS1/file" "/$DS2/file" +log_must sync_pool $TESTPOOL +clone_and_check "file" "clone" "$DS2" "$DS1" "" true "s1" +typeset hash1=$(md5digest "/$DS1/.zfs/snapshot/s1/file") +log_note "destroy the snapshot and check if the file is still readable and" \ + "has the same content" +log_must zfs destroy -r $DS2@s1 +log_must sync_pool $TESTPOOL +typeset hash2=$(md5digest "/$DS1/file") +log_must [ "$hash1" = "$hash2" ] + +cleanup_enc +prepare_enc + +log_note "Copying with copy_file_range from non encrypted to encrypted" +clone_and_check "file" "copy" $DS1_NC $DS1 "" true + +cleanup_enc +prepare_enc + +log_note "Copying with copy_file_range from encrypted to non encrypted" +clone_and_check "file" "copy" $DS1 $DS1_NC "" true + +log_must sync_pool $TESTPOOL + +log_pass $claim From dea2d3c6cda7f61bd53deb133926bee77819f9bd Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 6 Dec 2023 15:39:12 -0500 Subject: [PATCH 18/91] zdb: Dump encrypted write and clone ZIL records Block pointers are not encrypted in TX_WRITE and TX_CLONE_RANGE records, so we can dump them, that may be useful for debugging. Related to #15543. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #15629 --- cmd/zdb/zdb_il.c | 60 ++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 58 insertions(+), 2 deletions(-) diff --git a/cmd/zdb/zdb_il.c b/cmd/zdb/zdb_il.c index 970c45c9b3b..63d95ddedc3 100644 --- a/cmd/zdb/zdb_il.c +++ b/cmd/zdb/zdb_il.c @@ -168,7 +168,7 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, const void *arg) (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset, (u_longlong_t)lr->lr_length); - if (txtype == TX_WRITE2 || verbose < 5) + if (txtype == TX_WRITE2 || verbose < 4) return; if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { @@ -178,6 +178,8 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, const void *arg) "will claim" : "won't claim"); print_log_bp(bp, tab_prefix); + if (verbose < 5) + return; if (BP_IS_HOLE(bp)) { (void) printf("\t\t\tLSIZE 0x%llx\n", (u_longlong_t)BP_GET_LSIZE(bp)); @@ -202,6 +204,9 @@ zil_prt_rec_write(zilog_t *zilog, int txtype, const void *arg) if (error) goto out; } else { + if (verbose < 5) + return; + /* data is stored after the end of the lr_write record */ data = abd_alloc(lr->lr_length, B_FALSE); abd_copy_from_buf(data, lr + 1, lr->lr_length); @@ -217,6 +222,28 @@ out: abd_free(data); } +static void +zil_prt_rec_write_enc(zilog_t *zilog, int txtype, const void *arg) +{ + (void) txtype; + const lr_write_t *lr = arg; + const blkptr_t *bp = &lr->lr_blkptr; + int verbose = MAX(dump_opt['d'], dump_opt['i']); + + (void) printf("%s(encrypted)\n", tab_prefix); + + if (verbose < 4) + return; + + if (lr->lr_common.lrc_reclen == sizeof (lr_write_t)) { + (void) printf("%shas blkptr, %s\n", tab_prefix, + !BP_IS_HOLE(bp) && + bp->blk_birth >= spa_min_claim_txg(zilog->zl_spa) ? 
+ "will claim" : "won't claim"); + print_log_bp(bp, tab_prefix); + } +} + static void zil_prt_rec_truncate(zilog_t *zilog, int txtype, const void *arg) { @@ -312,11 +339,34 @@ zil_prt_rec_clone_range(zilog_t *zilog, int txtype, const void *arg) { (void) zilog, (void) txtype; const lr_clone_range_t *lr = arg; + int verbose = MAX(dump_opt['d'], dump_opt['i']); (void) printf("%sfoid %llu, offset %llx, length %llx, blksize %llx\n", tab_prefix, (u_longlong_t)lr->lr_foid, (u_longlong_t)lr->lr_offset, (u_longlong_t)lr->lr_length, (u_longlong_t)lr->lr_blksz); + if (verbose < 4) + return; + + for (unsigned int i = 0; i < lr->lr_nbps; i++) { + (void) printf("%s[%u/%llu] ", tab_prefix, i + 1, + (u_longlong_t)lr->lr_nbps); + print_log_bp(&lr->lr_bps[i], ""); + } +} + +static void +zil_prt_rec_clone_range_enc(zilog_t *zilog, int txtype, const void *arg) +{ + (void) zilog, (void) txtype; + const lr_clone_range_t *lr = arg; + int verbose = MAX(dump_opt['d'], dump_opt['i']); + + (void) printf("%s(encrypted)\n", tab_prefix); + + if (verbose < 4) + return; + for (unsigned int i = 0; i < lr->lr_nbps; i++) { (void) printf("%s[%u/%llu] ", tab_prefix, i + 1, (u_longlong_t)lr->lr_nbps); @@ -327,6 +377,7 @@ zil_prt_rec_clone_range(zilog_t *zilog, int txtype, const void *arg) typedef void (*zil_prt_rec_func_t)(zilog_t *, int, const void *); typedef struct zil_rec_info { zil_prt_rec_func_t zri_print; + zil_prt_rec_func_t zri_print_enc; const char *zri_name; uint64_t zri_count; } zil_rec_info_t; @@ -341,7 +392,9 @@ static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = { {.zri_print = zil_prt_rec_remove, .zri_name = "TX_RMDIR "}, {.zri_print = zil_prt_rec_link, .zri_name = "TX_LINK "}, {.zri_print = zil_prt_rec_rename, .zri_name = "TX_RENAME "}, - {.zri_print = zil_prt_rec_write, .zri_name = "TX_WRITE "}, + {.zri_print = zil_prt_rec_write, + .zri_print_enc = zil_prt_rec_write_enc, + .zri_name = "TX_WRITE "}, {.zri_print = zil_prt_rec_truncate, .zri_name = "TX_TRUNCATE "}, {.zri_print = 
zil_prt_rec_setattr, .zri_name = "TX_SETATTR "}, {.zri_print = zil_prt_rec_acl, .zri_name = "TX_ACL_V0 "}, @@ -358,6 +411,7 @@ static zil_rec_info_t zil_rec_info[TX_MAX_TYPE] = { {.zri_print = zil_prt_rec_rename, .zri_name = "TX_RENAME_EXCHANGE "}, {.zri_print = zil_prt_rec_rename, .zri_name = "TX_RENAME_WHITEOUT "}, {.zri_print = zil_prt_rec_clone_range, + .zri_print_enc = zil_prt_rec_clone_range_enc, .zri_name = "TX_CLONE_RANGE "}, }; @@ -384,6 +438,8 @@ print_log_record(zilog_t *zilog, const lr_t *lr, void *arg, uint64_t claim_txg) if (txtype && verbose >= 3) { if (!zilog->zl_os->os_encrypted) { zil_rec_info[txtype].zri_print(zilog, txtype, lr); + } else if (zil_rec_info[txtype].zri_print_enc) { + zil_rec_info[txtype].zri_print_enc(zilog, txtype, lr); } else { (void) printf("%s(encrypted)\n", tab_prefix); } From 1e1d748cae2edf4afc08cb4a9d063b843dc9f396 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 6 Dec 2023 18:02:05 -0500 Subject: [PATCH 19/91] ZIL: Remove 128K into 2x68K LWB split optimization To improve 128KB block write performance in case of multiple VDEVs ZIL used to split those writes into two 64KB ones. Unfortunately it was found to cause LWB buffer overflow, trying to write maximum-sized 128KB TX_CLONE_RANGE record with 1022 block pointers into 68KB buffer, since unlike TX_WRITE ZIL code can't split it. This is a minimally-invasive temporary block cloning fix until the following more invasive prediction code refactoring. Reviewed-by: Brian Behlendorf Reviewed-by: Ameer Hamza Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc.
Closes #15634 --- module/zfs/zil.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index 7670e172950..5642f082bdb 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -1733,8 +1733,6 @@ static const struct { { 8192 + 4096, 8192 + 4096 }, /* database */ { 32768 + 4096, 32768 + 4096 }, /* NFS writes */ { 65536 + 4096, 65536 + 4096 }, /* 64KB writes */ - { 131072, 131072 }, /* < 128KB writes */ - { 131072 +4096, 65536 + 4096 }, /* 128KB writes */ { UINT64_MAX, SPA_OLD_MAXBLOCKSIZE}, /* > 128KB writes */ }; From e09356fa05ff174ed02ade8ea8e4ab98effa0ccd Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 6 Dec 2023 18:37:27 -0500 Subject: [PATCH 20/91] BRT: Limit brt_vdev_dump() to only one vdev Without this patch on pool of 60 vdevs with ZFS_DEBUG enabled clone takes much more time than copy, while heavily trashing dbgmsg for no good reason, repeatedly dumping all vdevs BRTs again and again, even unmodified ones. I am generally not sure this dumping is not excessive, but decided to keep it for now, just restricting its scope to more reasonable. Reviewed-by: Kay Pedersen Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #15625 --- module/zfs/brt.c | 80 ++++++++++++++++++++---------------------------- 1 file changed, 33 insertions(+), 47 deletions(-) diff --git a/module/zfs/brt.c b/module/zfs/brt.c index a701c70fcfb..225ddaca1e5 100644 --- a/module/zfs/brt.c +++ b/module/zfs/brt.c @@ -342,7 +342,7 @@ brt_vdev_entcount_get(const brt_vdev_t *brtvd, uint64_t idx) ASSERT3U(idx, <, brtvd->bv_size); - if (brtvd->bv_need_byteswap) { + if (unlikely(brtvd->bv_need_byteswap)) { return (BSWAP_16(brtvd->bv_entcount[idx])); } else { return (brtvd->bv_entcount[idx]); @@ -355,7 +355,7 @@ brt_vdev_entcount_set(brt_vdev_t *brtvd, uint64_t idx, uint16_t entcnt) ASSERT3U(idx, <, brtvd->bv_size); - if (brtvd->bv_need_byteswap) { + if (unlikely(brtvd->bv_need_byteswap)) { brtvd->bv_entcount[idx] = BSWAP_16(entcnt); } else { brtvd->bv_entcount[idx] = entcnt; @@ -390,55 +390,39 @@ brt_vdev_entcount_dec(brt_vdev_t *brtvd, uint64_t idx) #ifdef ZFS_DEBUG static void -brt_vdev_dump(brt_t *brt) +brt_vdev_dump(brt_vdev_t *brtvd) { - brt_vdev_t *brtvd; - uint64_t vdevid; + uint64_t idx; - if ((zfs_flags & ZFS_DEBUG_BRT) == 0) { - return; - } - - if (brt->brt_nvdevs == 0) { - zfs_dbgmsg("BRT empty"); - return; - } - - zfs_dbgmsg("BRT vdev dump:"); - for (vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { - uint64_t idx; - - brtvd = &brt->brt_vdevs[vdevid]; - zfs_dbgmsg(" vdevid=%llu/%llu meta_dirty=%d entcount_dirty=%d " - "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n", - (u_longlong_t)vdevid, (u_longlong_t)brtvd->bv_vdevid, - brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty, - (u_longlong_t)brtvd->bv_size, - (u_longlong_t)brtvd->bv_totalcount, - (u_longlong_t)brtvd->bv_nblocks, - (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks)); - if (brtvd->bv_totalcount > 0) { - zfs_dbgmsg(" entcounts:"); - for (idx = 0; idx < brtvd->bv_size; idx++) { - if (brt_vdev_entcount_get(brtvd, idx) > 0) { - zfs_dbgmsg(" [%04llu] %hu", - (u_longlong_t)idx, - brt_vdev_entcount_get(brtvd, idx)); - } + zfs_dbgmsg(" BRT vdevid=%llu 
meta_dirty=%d entcount_dirty=%d " + "size=%llu totalcount=%llu nblocks=%llu bitmapsize=%zu\n", + (u_longlong_t)brtvd->bv_vdevid, + brtvd->bv_meta_dirty, brtvd->bv_entcount_dirty, + (u_longlong_t)brtvd->bv_size, + (u_longlong_t)brtvd->bv_totalcount, + (u_longlong_t)brtvd->bv_nblocks, + (size_t)BT_SIZEOFMAP(brtvd->bv_nblocks)); + if (brtvd->bv_totalcount > 0) { + zfs_dbgmsg(" entcounts:"); + for (idx = 0; idx < brtvd->bv_size; idx++) { + uint16_t entcnt = brt_vdev_entcount_get(brtvd, idx); + if (entcnt > 0) { + zfs_dbgmsg(" [%04llu] %hu", + (u_longlong_t)idx, entcnt); } } - if (brtvd->bv_entcount_dirty) { - char *bitmap; + } + if (brtvd->bv_entcount_dirty) { + char *bitmap; - bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP); - for (idx = 0; idx < brtvd->bv_nblocks; idx++) { - bitmap[idx] = - BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.'; - } - bitmap[idx] = '\0'; - zfs_dbgmsg(" bitmap: %s", bitmap); - kmem_free(bitmap, brtvd->bv_nblocks + 1); + bitmap = kmem_alloc(brtvd->bv_nblocks + 1, KM_SLEEP); + for (idx = 0; idx < brtvd->bv_nblocks; idx++) { + bitmap[idx] = + BT_TEST(brtvd->bv_bitmap, idx) ? 'x' : '.'; } + bitmap[idx] = '\0'; + zfs_dbgmsg(" dirty: %s", bitmap); + kmem_free(bitmap, brtvd->bv_nblocks + 1); } } #endif @@ -767,7 +751,8 @@ brt_vdev_addref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre, BT_SET(brtvd->bv_bitmap, idx); #ifdef ZFS_DEBUG - brt_vdev_dump(brt); + if (zfs_flags & ZFS_DEBUG_BRT) + brt_vdev_dump(brtvd); #endif } @@ -803,7 +788,8 @@ brt_vdev_decref(brt_t *brt, brt_vdev_t *brtvd, const brt_entry_t *bre, BT_SET(brtvd->bv_bitmap, idx); #ifdef ZFS_DEBUG - brt_vdev_dump(brt); + if (zfs_flags & ZFS_DEBUG_BRT) + brt_vdev_dump(brtvd); #endif } From b13c91bb2997cb121cdf935496f92ae773672773 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 8 Dec 2023 19:43:39 -0500 Subject: [PATCH 21/91] DMU: Fix lock leak on dbuf_hold() error dmu_assign_arcbuf_by_dnode() should drop dn_struct_rwlock lock in case dbuf_hold() failed. 
I don't have reproduction for this, but it looks inconsistent with dmu_buf_hold_noread_by_dnode() and co. Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15644 --- module/zfs/dmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 63464d74742..909605aa264 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1482,9 +1482,9 @@ dmu_assign_arcbuf_by_dnode(dnode_t *dn, uint64_t offset, arc_buf_t *buf, rw_enter(&dn->dn_struct_rwlock, RW_READER); blkid = dbuf_whichblock(dn, 0, offset); db = dbuf_hold(dn, blkid, FTAG); + rw_exit(&dn->dn_struct_rwlock); if (db == NULL) return (SET_ERROR(EIO)); - rw_exit(&dn->dn_struct_rwlock); /* * We can only assign if the offset is aligned and the arc buf is the From a701548eb4a8ed47ebda9b28a23e1de80adf1b91 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 12 Dec 2023 15:53:59 -0500 Subject: [PATCH 22/91] dbuf: Handle arcbuf assignment after block cloning In some cases dbuf_assign_arcbuf() may be called on a block that was recently cloned. If it happened in current TXG we must undo the block cloning first, since the only one dirty record per TXG can't and shouldn't mean both cloning and overwrite same time. Reviewed-by: Kay Pedersen Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #15653 --- module/zfs/dbuf.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 5a7fe42b602..7691cd85f6c 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -2930,7 +2930,8 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) while (db->db_state == DB_READ || db->db_state == DB_FILL) cv_wait(&db->db_changed, &db->db_mtx); - ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED); + ASSERT(db->db_state == DB_CACHED || db->db_state == DB_UNCACHED || + db->db_state == DB_NOFILL); if (db->db_state == DB_CACHED && zfs_refcount_count(&db->db_holds) - 1 > db->db_dirtycnt) { @@ -2967,6 +2968,15 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) arc_buf_destroy(db->db_buf, db); } db->db_buf = NULL; + } else if (db->db_state == DB_NOFILL) { + /* + * We will be completely replacing the cloned block. In case + * it was cloned in this transaction group, let's undirty the + * pending clone and mark the block as uncached. This will be + * as if the clone was never done. + */ + VERIFY(!dbuf_undirty(db, tx)); + db->db_state = DB_UNCACHED; } ASSERT(db->db_buf == NULL); dbuf_set_data(db, buf); From 9c40ae02199668d1e9a07d14c4ea713b2c5e584e Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 12 Dec 2023 15:59:24 -0500 Subject: [PATCH 23/91] dbuf: Set dr_data when unoverriding after clone Block cloning normally creates dirty record without dr_data. But if the block is read after cloning, it is moved into DB_CACHED state and receives the data buffer. If after that we call dbuf_unoverride() to convert the dirty record into normal write, we should give it the data buffer from dbuf and release one. Reviewed-by: Kay Pedersen Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. 
Closes #15654 Closes #15656 --- module/zfs/dbuf.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 7691cd85f6c..e4c59b59347 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -1904,7 +1904,6 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) dmu_buf_impl_t *db = dr->dr_dbuf; blkptr_t *bp = &dr->dt.dl.dr_overridden_by; uint64_t txg = dr->dr_txg; - boolean_t release; ASSERT(MUTEX_HELD(&db->db_mtx)); /* @@ -1925,7 +1924,10 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) if (!BP_IS_HOLE(bp) && !dr->dt.dl.dr_nopwrite) zio_free(db->db_objset->os_spa, txg, bp); - release = !dr->dt.dl.dr_brtwrite; + if (dr->dt.dl.dr_brtwrite) { + ASSERT0(dr->dt.dl.dr_data); + dr->dt.dl.dr_data = db->db_buf; + } dr->dt.dl.dr_override_state = DR_NOT_OVERRIDDEN; dr->dt.dl.dr_nopwrite = B_FALSE; dr->dt.dl.dr_brtwrite = B_FALSE; @@ -1939,7 +1941,7 @@ dbuf_unoverride(dbuf_dirty_record_t *dr) * the buf thawed to save the effort of freezing & * immediately re-thawing it. */ - if (release) + if (dr->dt.dl.dr_data) arc_release(dr->dt.dl.dr_data, db); } From f71c16a66126b7a1896b3b0c59a42a7b9186e56b Mon Sep 17 00:00:00 2001 From: chrisperedun <126915832+chrisperedun@users.noreply.github.com> Date: Thu, 21 Dec 2023 14:12:30 -0500 Subject: [PATCH 24/91] Don't panic on unencrypted block in encrypted dataset While 763ca47 closes the situation of block cloning creating unencrypted records in encrypted datasets, existing data still causes panic on read. Setting zfs_recover bypasses this but at the cost of potentially ignoring more serious issues. 
Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Chris Peredun Closes #15677 --- module/zfs/dbuf.c | 2 -- module/zfs/dmu_send.c | 2 -- 2 files changed, 4 deletions(-) diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index e4c59b59347..255add6cd24 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -1619,8 +1619,6 @@ dbuf_read_impl(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags, */ if (db->db_objset->os_encrypted && !BP_USES_CRYPT(bpp)) { spa_log_error(db->db_objset->os_spa, &zb, &bpp->blk_birth); - zfs_panic_recover("unencrypted block in encrypted " - "object set %llu", dmu_objset_id(db->db_objset)); err = SET_ERROR(EIO); goto early_unlock; } diff --git a/module/zfs/dmu_send.c b/module/zfs/dmu_send.c index 2d37ed2cdfb..37c68528bf9 100644 --- a/module/zfs/dmu_send.c +++ b/module/zfs/dmu_send.c @@ -1124,8 +1124,6 @@ send_cb(spa_t *spa, zilog_t *zilog, const blkptr_t *bp, if (sta->os->os_encrypted && !BP_IS_HOLE(bp) && !BP_USES_CRYPT(bp)) { spa_log_error(spa, zb, &bp->blk_birth); - zfs_panic_recover("unencrypted block in encrypted " - "object set %llu", dmu_objset_id(sta->os)); return (SET_ERROR(EIO)); } From 976bf9b6a61919638d42ed79cd207132785d128a Mon Sep 17 00:00:00 2001 From: Shengqi Chen Date: Tue, 9 Jan 2024 08:05:24 +0800 Subject: [PATCH 25/91] Linux 6.2 compat: add check for kernel_neon_* availability This patch adds check for `kernel_neon_*` symbols on arm and arm64 platforms to address the following issues: 1. Linux 6.2+ on arm64 has exported them with `EXPORT_SYMBOL_GPL`, so license compatibility must be checked before use. 2. On both arm and arm64, the definitions of these symbols are guarded by `CONFIG_KERNEL_MODE_NEON`, but their declarations are still present. Checking in configuration phase only leads to MODPOST errors (undefined references). 
Reviewed-by: Brian Behlendorf Signed-off-by: Shengqi Chen Closes #15711 Closes #14555 Closes: #15401 --- config/kernel-fpu.m4 | 23 +++++++++++++++++--- include/os/linux/kernel/linux/simd_aarch64.h | 6 +++++ include/os/linux/kernel/linux/simd_arm.h | 6 +++++ 3 files changed, 32 insertions(+), 3 deletions(-) diff --git a/config/kernel-fpu.m4 b/config/kernel-fpu.m4 index c6efebd8cf6..edfde1a02d3 100644 --- a/config/kernel-fpu.m4 +++ b/config/kernel-fpu.m4 @@ -79,6 +79,12 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_FPU], [ __kernel_fpu_end(); ], [], [ZFS_META_LICENSE]) + ZFS_LINUX_TEST_SRC([kernel_neon], [ + #include + ], [ + kernel_neon_begin(); + kernel_neon_end(); + ], [], [ZFS_META_LICENSE]) ]) AC_DEFUN([ZFS_AC_KERNEL_FPU], [ @@ -105,9 +111,20 @@ AC_DEFUN([ZFS_AC_KERNEL_FPU], [ AC_DEFINE(KERNEL_EXPORTS_X86_FPU, 1, [kernel exports FPU functions]) ],[ - AC_MSG_RESULT(internal) - AC_DEFINE(HAVE_KERNEL_FPU_INTERNAL, 1, - [kernel fpu internal]) + dnl # + dnl # ARM neon symbols (only on arm and arm64) + dnl # could be GPL-only on arm64 after Linux 6.2 + dnl # + ZFS_LINUX_TEST_RESULT([kernel_neon_license],[ + AC_MSG_RESULT(kernel_neon_*) + AC_DEFINE(HAVE_KERNEL_NEON, 1, + [kernel has kernel_neon_* functions]) + ],[ + # catch-all + AC_MSG_RESULT(internal) + AC_DEFINE(HAVE_KERNEL_FPU_INTERNAL, 1, + [kernel fpu internal]) + ]) ]) ]) ]) diff --git a/include/os/linux/kernel/linux/simd_aarch64.h b/include/os/linux/kernel/linux/simd_aarch64.h index 16276b08c75..123a0c72bc6 100644 --- a/include/os/linux/kernel/linux/simd_aarch64.h +++ b/include/os/linux/kernel/linux/simd_aarch64.h @@ -71,9 +71,15 @@ #define ID_AA64PFR0_EL1 sys_reg(3, 0, 0, 1, 0) #define ID_AA64ISAR0_EL1 sys_reg(3, 0, 0, 6, 0) +#if (defined(HAVE_KERNEL_NEON) && defined(CONFIG_KERNEL_MODE_NEON)) #define kfpu_allowed() 1 #define kfpu_begin() kernel_neon_begin() #define kfpu_end() kernel_neon_end() +#else +#define kfpu_allowed() 0 +#define kfpu_begin() do {} while (0) +#define kfpu_end() do {} while (0) +#endif #define 
kfpu_init() (0) #define kfpu_fini() do {} while (0) diff --git a/include/os/linux/kernel/linux/simd_arm.h b/include/os/linux/kernel/linux/simd_arm.h index c432a6d4abd..bc70eaef307 100644 --- a/include/os/linux/kernel/linux/simd_arm.h +++ b/include/os/linux/kernel/linux/simd_arm.h @@ -53,9 +53,15 @@ #include #include +#if (defined(HAVE_KERNEL_NEON) && defined(CONFIG_KERNEL_MODE_NEON)) #define kfpu_allowed() 1 #define kfpu_begin() kernel_neon_begin() #define kfpu_end() kernel_neon_end() +#else +#define kfpu_allowed() 0 +#define kfpu_begin() do {} while (0) +#define kfpu_end() do {} while (0) +#endif #define kfpu_init() (0) #define kfpu_fini() do {} while (0) From 152a775eac59e026100835cb213ccafa3a163ad6 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 9 Jan 2024 12:46:43 -0500 Subject: [PATCH 26/91] Improve block sizes checks during cloning - Fail if source block is smaller than destination. We can only grow blocks, not shrink them. - Fail if we do not have full znode range lock. In that case grow is not even called. We should improve zfs_rangelock_cb() somehow to know when cloning needs to grow the block size unlike write. - Fail if we tried to resize, but failed. There are many reasons for it to fail that we can not predict at this level, so be ready for them. Unlike write, that may proceed after growth failure, block cloning can't and must return error. This fixes assertion inside dmu_brt_clone() when it sees different number of blocks held in destination than it got block pointers. Builds without ZFS_DEBUG returned EXDEV, so are not affected much. Reviewed-by: Pawel Jakub Dawidek Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc.
Closes #15724 Closes #15735 --- module/zfs/zfs_vnops.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 17e990451e0..812e42f645e 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1192,11 +1192,18 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, inblksz = inzp->z_blksz; /* - * We cannot clone into files with different block size if we can't - * grow it (block size is already bigger or more than one block). + * We cannot clone into a file with different block size if we can't + * grow it (block size is already bigger, has more than one block, or + * not locked for growth). There are other possible reasons for the + * grow to fail, but we cover what we can before opening transaction + * and the rest detect after we try to do it. */ + if (inblksz < outzp->z_blksz) { + error = SET_ERROR(EINVAL); + goto unlock; + } if (inblksz != outzp->z_blksz && (outzp->z_size > outzp->z_blksz || - outzp->z_size > inblksz)) { + outlr->lr_length != UINT64_MAX)) { error = SET_ERROR(EINVAL); goto unlock; } @@ -1315,12 +1322,24 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, } /* - * Copy source znode's block size. This only happens on the - * first iteration since zfs_rangelock_reduce() will shrink down - * lr_len to the appropriate size. + * Copy source znode's block size. This is done only if the + * whole znode is locked (see zfs_rangelock_cb()) and only + * on the first iteration since zfs_rangelock_reduce() will + * shrink down lr_length to the appropriate size. */ if (outlr->lr_length == UINT64_MAX) { zfs_grow_blocksize(outzp, inblksz, tx); + + /* + * Block growth may fail for many reasons we can not + * predict here. If it happen the cloning is doomed. 
+ */ + if (inblksz != outzp->z_blksz) { + error = SET_ERROR(EINVAL); + dmu_tx_abort(tx); + break; + } + /* * Round range lock up to the block boundary, so we * prevent appends until we are done. From ac592318b83a7d1cddb47f68d5b77789865f9768 Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Tue, 9 Jan 2024 12:48:40 -0500 Subject: [PATCH 27/91] Fix livelist assertions for dedup and cloning Two block pointers in livelist pointing to the same location may be caused not only by dedup, but also by block cloning. We should not assert D bit set in them. Two block pointers in livelist pointing to the same location may have different logical birth time in case of dedup or cloning. We should assert identical physical birth time instead. Assert identical physical block size between pointers in addition to checksum, since that is what checksums are calculated on. Reviewed-by: Matthew Ahrens Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15732 --- module/zfs/dsl_deadlist.c | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/module/zfs/dsl_deadlist.c b/module/zfs/dsl_deadlist.c index 47c234f76c4..ac30a370813 100644 --- a/module/zfs/dsl_deadlist.c +++ b/module/zfs/dsl_deadlist.c @@ -1000,8 +1000,6 @@ livelist_compare(const void *larg, const void *rarg) /* if vdevs are equal, sort by offsets. */ uint64_t l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]); uint64_t r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]); - if (l_dva0_offset == r_dva0_offset) - ASSERT3U(l->blk_birth, ==, r->blk_birth); return (TREE_CMP(l_dva0_offset, r_dva0_offset)); } @@ -1016,9 +1014,9 @@ struct livelist_iter_arg { * and used to match up ALLOC/FREE pairs. ALLOC'd blkptrs without a * corresponding FREE are stored in the supplied bplist. * - * Note that multiple FREE and ALLOC entries for the same blkptr may - * be encountered when dedup is involved. 
For this reason we keep a - * refcount for all the FREE entries of each blkptr and ensure that + * Note that multiple FREE and ALLOC entries for the same blkptr may be + * encountered when dedup or block cloning is involved. For this reason we + * keep a refcount for all the FREE entries of each blkptr and ensure that * each of those FREE entries has a corresponding ALLOC preceding it. */ static int @@ -1037,6 +1035,13 @@ dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed, livelist_entry_t node; node.le_bp = *bp; livelist_entry_t *found = avl_find(avl, &node, NULL); + if (found) { + ASSERT3U(BP_GET_PSIZE(bp), ==, BP_GET_PSIZE(&found->le_bp)); + ASSERT3U(BP_GET_CHECKSUM(bp), ==, + BP_GET_CHECKSUM(&found->le_bp)); + ASSERT3U(BP_PHYSICAL_BIRTH(bp), ==, + BP_PHYSICAL_BIRTH(&found->le_bp)); + } if (bp_freed) { if (found == NULL) { /* first free entry for this blkptr */ @@ -1046,10 +1051,10 @@ dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed, e->le_refcnt = 1; avl_add(avl, e); } else { - /* dedup block free */ - ASSERT(BP_GET_DEDUP(bp)); - ASSERT3U(BP_GET_CHECKSUM(bp), ==, - BP_GET_CHECKSUM(&found->le_bp)); + /* + * Deduped or cloned block free. We could assert D bit + * for dedup, but there is no such one for cloning. + */ ASSERT3U(found->le_refcnt + 1, >, found->le_refcnt); found->le_refcnt++; } @@ -1065,14 +1070,6 @@ dsl_livelist_iterate(void *arg, const blkptr_t *bp, boolean_t bp_freed, /* all tracked free pairs have been matched */ avl_remove(avl, found); kmem_free(found, sizeof (livelist_entry_t)); - } else { - /* - * This is definitely a deduped blkptr so - * let's validate it. 
- */ - ASSERT(BP_GET_DEDUP(bp)); - ASSERT3U(BP_GET_CHECKSUM(bp), ==, - BP_GET_CHECKSUM(&found->le_bp)); } } } From 3bd23fd78dce7b5e3b76cce5a210a2977e1e05a8 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 11 Jan 2024 19:43:38 +1100 Subject: [PATCH 28/91] freebsd: fix compile for spa_taskq_read/spa_taskq_write params Missed in #15695, backporting #15675. Signed-off-by: Rob Norris --- include/os/freebsd/spl/sys/mod_os.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/os/freebsd/spl/sys/mod_os.h b/include/os/freebsd/spl/sys/mod_os.h index 77ce75ca3f1..150e50380d8 100644 --- a/include/os/freebsd/spl/sys/mod_os.h +++ b/include/os/freebsd/spl/sys/mod_os.h @@ -91,6 +91,12 @@ #define param_set_max_auto_ashift_args(var) \ CTLTYPE_UINT, NULL, 0, param_set_max_auto_ashift, "IU" +#define spa_taskq_read_param_set_args(var) \ + CTLTYPE_STRING, NULL, 0, spa_taskq_read_param, "A" + +#define spa_taskq_write_param_set_args(var) \ + CTLTYPE_STRING, NULL, 0, spa_taskq_write_param, "A" + #define fletcher_4_param_set_args(var) \ CTLTYPE_STRING, NULL, 0, fletcher_4_param, "A" From 9181e94f0b24e3459e1e9b6b2b096ff40b9461eb Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Fri, 29 Dec 2023 10:22:58 -0500 Subject: [PATCH 29/91] spa: Fix FreeBSD sysctl handlers sbuf_cpy() resets the sbuf state, which is wrong for sbufs allocated by sbuf_new_for_sysctl(). In particular, this code triggers an assertion failure in sbuf_clear(). Simplify by just using sysctl_handle_string() for both reading and setting the tunable. 
Fixes: 6930ecbb7 ("spa: make read/write queues configurable") Reviewed-by: Rob Norris Reviewed-by: Alexander Motin Reported-by: Peter Holm Signed-off-by: Mark Johnston Closes #15719 --- module/zfs/spa.c | 30 ++++++------------------------ 1 file changed, 6 insertions(+), 24 deletions(-) diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 32a58529219..739e2cb7c2a 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1398,8 +1398,6 @@ spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp) return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf)); } #else -#include - /* * On FreeBSD load-time parameters can be set up before malloc() is available, * so we have to do all the parsing work on the stack. @@ -1410,19 +1408,11 @@ static int spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS) { char buf[SPA_TASKQ_PARAM_MAX]; - int err = 0; - - if (req->newptr == NULL) { - int len = spa_taskq_param_get(ZIO_TYPE_READ, buf); - struct sbuf *s = sbuf_new_for_sysctl(NULL, NULL, len+1, req); - sbuf_cpy(s, buf); - err = sbuf_finish(s); - sbuf_delete(s); - return (err); - } + int err; + (void) spa_taskq_param_get(ZIO_TYPE_READ, buf); err = sysctl_handle_string(oidp, buf, sizeof (buf), req); - if (err) + if (err || req->newptr == NULL) return (err); return (spa_taskq_param_set(ZIO_TYPE_READ, buf)); } @@ -1431,19 +1421,11 @@ static int spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS) { char buf[SPA_TASKQ_PARAM_MAX]; - int err = 0; - - if (req->newptr == NULL) { - int len = spa_taskq_param_get(ZIO_TYPE_WRITE, buf); - struct sbuf *s = sbuf_new_for_sysctl(NULL, NULL, len+1, req); - sbuf_cpy(s, buf); - err = sbuf_finish(s); - sbuf_delete(s); - return (err); - } + int err; + (void) spa_taskq_param_get(ZIO_TYPE_WRITE, buf); err = sysctl_handle_string(oidp, buf, sizeof (buf), req); - if (err) + if (err || req->newptr == NULL) return (err); return (spa_taskq_param_set(ZIO_TYPE_WRITE, buf)); } From a00231a3fc9909aa5ccf91af9c3a473665e9dea4 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Fri, 29 Dec 
2023 12:56:35 -0500 Subject: [PATCH 30/91] spa: Let spa_taskq_param_get()'s addition of a newline be optional For FreeBSD sysctls, we don't want the extra newline, since the sysctl(8) utility will format strings appropriately. Reviewed-by: Rob Norris Reviewed-by: Alexander Motin Reported-by: Peter Holm Signed-off-by: Mark Johnston Closes #15719 --- module/zfs/spa.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/module/zfs/spa.c b/module/zfs/spa.c index 739e2cb7c2a..d7fe96cde6a 100644 --- a/module/zfs/spa.c +++ b/module/zfs/spa.c @@ -1345,7 +1345,7 @@ spa_taskq_param_set(zio_type_t t, char *cfg) } static int -spa_taskq_param_get(zio_type_t t, char *buf) +spa_taskq_param_get(zio_type_t t, char *buf, boolean_t add_newline) { int pos = 0; @@ -1363,7 +1363,8 @@ spa_taskq_param_get(zio_type_t t, char *buf) sep = " "; } - buf[pos++] = '\n'; + if (add_newline) + buf[pos++] = '\n'; buf[pos] = '\0'; return (pos); @@ -1381,7 +1382,7 @@ spa_taskq_read_param_set(const char *val, zfs_kernel_param_t *kp) static int spa_taskq_read_param_get(char *buf, zfs_kernel_param_t *kp) { - return (spa_taskq_param_get(ZIO_TYPE_READ, buf)); + return (spa_taskq_param_get(ZIO_TYPE_READ, buf, TRUE)); } static int @@ -1395,7 +1396,7 @@ spa_taskq_write_param_set(const char *val, zfs_kernel_param_t *kp) static int spa_taskq_write_param_get(char *buf, zfs_kernel_param_t *kp) { - return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf)); + return (spa_taskq_param_get(ZIO_TYPE_WRITE, buf, TRUE)); } #else /* @@ -1410,7 +1411,7 @@ spa_taskq_read_param(ZFS_MODULE_PARAM_ARGS) char buf[SPA_TASKQ_PARAM_MAX]; int err; - (void) spa_taskq_param_get(ZIO_TYPE_READ, buf); + (void) spa_taskq_param_get(ZIO_TYPE_READ, buf, FALSE); err = sysctl_handle_string(oidp, buf, sizeof (buf), req); if (err || req->newptr == NULL) return (err); @@ -1423,7 +1424,7 @@ spa_taskq_write_param(ZFS_MODULE_PARAM_ARGS) char buf[SPA_TASKQ_PARAM_MAX]; int err; - (void) spa_taskq_param_get(ZIO_TYPE_WRITE, buf); + 
(void) spa_taskq_param_get(ZIO_TYPE_WRITE, buf, FALSE); err = sysctl_handle_string(oidp, buf, sizeof (buf), req); if (err || req->newptr == NULL) return (err); From 9ecd112dc1dcdb903ebd3f1971cb1256094e73c4 Mon Sep 17 00:00:00 2001 From: Shengqi Chen Date: Thu, 7 Dec 2023 04:37:50 +0800 Subject: [PATCH 31/91] compact: workaround for GPL-only symbols on riscv from Linux 6.2 Since Linux 6.2, the implementation of flush_dcache_page on riscv references GPL-only symbol `PageHuge`, breaking the build of zfs. This patch uses existing mechanism to override flush_dcache_page, removing the call to `PageHuge`. According to comments in kernel, it is only used to do some check against HugeTLB pages, which only exist in userspace. ZFS uses flush_dcache_page only on kernel pages, thus this patch will not introduce any behaviour change. See also: torvalds/linux@d33deda, openzfs/zfs@589f59b Reviewed-by: Brian Behlendorf Signed-off-by: Shengqi Chen Closes #14974 Closes #15627 --- config/kernel-flush_dcache_page.m4 | 5 +++-- config/kernel.m4 | 6 ++++++ include/os/linux/kernel/linux/dcache_compat.h | 15 +++++++++++++-- 3 files changed, 22 insertions(+), 4 deletions(-) diff --git a/config/kernel-flush_dcache_page.m4 b/config/kernel-flush_dcache_page.m4 index 2340c386ef5..aa916c87d53 100644 --- a/config/kernel-flush_dcache_page.m4 +++ b/config/kernel-flush_dcache_page.m4 @@ -1,7 +1,8 @@ dnl # dnl # Starting from Linux 5.13, flush_dcache_page() becomes an inline -dnl # function and may indirectly referencing GPL-only cpu_feature_keys on -dnl # powerpc +dnl # function and may indirectly referencing GPL-only symbols: +dnl # on powerpc: cpu_feature_keys +dnl # on riscv: PageHuge (added from 6.2) dnl # dnl # diff --git a/config/kernel.m4 b/config/kernel.m4 index 056517a841f..d25b65994f6 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -168,6 +168,9 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE ZFS_AC_KERNEL_SRC_FLUSH_DCACHE_PAGE ;; + riscv*) + 
ZFS_AC_KERNEL_SRC_FLUSH_DCACHE_PAGE + ;; esac AC_MSG_CHECKING([for available kernel interfaces]) @@ -310,6 +313,9 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_CPU_HAS_FEATURE ZFS_AC_KERNEL_FLUSH_DCACHE_PAGE ;; + riscv*) + ZFS_AC_KERNEL_FLUSH_DCACHE_PAGE + ;; esac ]) diff --git a/include/os/linux/kernel/linux/dcache_compat.h b/include/os/linux/kernel/linux/dcache_compat.h index 1e35204932d..ab1711b99f3 100644 --- a/include/os/linux/kernel/linux/dcache_compat.h +++ b/include/os/linux/kernel/linux/dcache_compat.h @@ -42,8 +42,8 @@ /* * Starting from Linux 5.13, flush_dcache_page() becomes an inline function * and under some configurations, may indirectly referencing GPL-only - * cpu_feature_keys on powerpc. Override this function when it is detected - * being GPL-only. + * symbols, e.g., cpu_feature_keys on powerpc and PageHuge on riscv. + * Override this function when it is detected being GPL-only. */ #if defined __powerpc__ && defined HAVE_FLUSH_DCACHE_PAGE_GPL_ONLY #include @@ -53,6 +53,17 @@ clear_bit(PG_dcache_clean, &(page)->flags); \ } while (0) #endif +/* + * For riscv implementation, the use of PageHuge can be safely removed. + * Because it handles pages allocated by HugeTLB, while flush_dcache_page + * in zfs module is only called on kernel pages. + */ +#if defined __riscv && defined HAVE_FLUSH_DCACHE_PAGE_GPL_ONLY +#define flush_dcache_page(page) do { \ + if (test_bit(PG_dcache_clean, &(page)->flags)) \ + clear_bit(PG_dcache_clean, &(page)->flags); \ + } while (0) +#endif /* * 2.6.30 API change, From 2ecc2dfe42707d8569e30f3b6a4526a0a825d479 Mon Sep 17 00:00:00 2001 From: Rob N Date: Wed, 17 Jan 2024 09:01:17 +1100 Subject: [PATCH 32/91] Linux 6.7 compat: zfs_setattr fix atime update In db4fc559c I messed up and changed this bit of code to set the inode atime to an uninitialised value, when actually it was just supposed to load the atime from the inode to be stored in the SA. This changes it to what it should have been.
Ensure times change by the right amount Previously, we only checked if the times changed at all, which missed a bug where the atime was being set to an undefined value. Now ensure the times change by two seconds (or thereabouts), ensuring we catch cases where we set the time to something bonkers Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ Closes #15762 Closes #15773 --- module/os/linux/zfs/zfs_vnops_os.c | 3 +-- tests/zfs-tests/cmd/ctime.c | 14 +++++++++++--- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 65d1d786ae5..9ea8ad5f4a6 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -2435,9 +2435,8 @@ top: if ((mask & ATTR_ATIME) || zp->z_atime_dirty) { zp->z_atime_dirty = B_FALSE; - inode_timespec_t tmp_atime; + inode_timespec_t tmp_atime = zpl_inode_get_atime(ip); ZFS_TIME_ENCODE(&tmp_atime, atime); - zpl_inode_set_atime_to_ts(ZTOI(zp), tmp_atime); SA_ADD_BULK_ATTR(bulk, count, SA_ZPL_ATIME(zfsvfs), NULL, &atime, sizeof (atime)); } diff --git a/tests/zfs-tests/cmd/ctime.c b/tests/zfs-tests/cmd/ctime.c index 0f5d81aea61..5ff1cea8a86 100644 --- a/tests/zfs-tests/cmd/ctime.c +++ b/tests/zfs-tests/cmd/ctime.c @@ -362,12 +362,20 @@ main(void) return (1); } - if (t1 == t2) { - (void) fprintf(stderr, "%s: t1(%ld) == t2(%ld)\n", + + /* + * Ideally, time change would be exactly two seconds, but allow + * a little slack in case of scheduling delays or similar. 
+ */ + long delta = (long)t2 - (long)t1; + if (delta < 2 || delta > 4) { + (void) fprintf(stderr, + "%s: BAD time change: t1(%ld), t2(%ld)\n", timetest_table[i].name, (long)t1, (long)t2); return (1); } else { - (void) fprintf(stderr, "%s: t1(%ld) != t2(%ld)\n", + (void) fprintf(stderr, + "%s: good time change: t1(%ld), t2(%ld)\n", timetest_table[i].name, (long)t1, (long)t2); } } From 07cf973fe9e4b99b9c0a89038301fc9ad26f7e95 Mon Sep 17 00:00:00 2001 From: Kevin Jin <33590050+jxdking@users.noreply.github.com> Date: Wed, 17 Jan 2024 12:03:58 -0500 Subject: [PATCH 33/91] Autotrim High Load Average Fix Switch from cv_wait() to cv_wait_idle() in vdev_autotrim_wait_kick(), which should mitigate the high load average while waiting. Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: jxdking Closes #15781 --- module/zfs/vdev_trim.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/module/zfs/vdev_trim.c b/module/zfs/vdev_trim.c index 03e17db024e..d96b75e5edf 100644 --- a/module/zfs/vdev_trim.c +++ b/module/zfs/vdev_trim.c @@ -194,7 +194,8 @@ vdev_autotrim_wait_kick(vdev_t *vd, int num_of_kick) for (int i = 0; i < num_of_kick; i++) { if (vd->vdev_autotrim_exit_wanted) break; - cv_wait(&vd->vdev_autotrim_kick_cv, &vd->vdev_autotrim_lock); + cv_wait_idle(&vd->vdev_autotrim_kick_cv, + &vd->vdev_autotrim_lock); } boolean_t exit_wanted = vd->vdev_autotrim_exit_wanted; mutex_exit(&vd->vdev_autotrim_lock); From 387f003be3052ee1ea53cef7fdbc0babd2392c68 Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Thu, 7 Dec 2023 01:18:43 +0500 Subject: [PATCH 34/91] ZTS: block_cloning: Use numeric sort for get_same_blocks Reviewed-by: Kay Pedersen Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Ameer Hamza Closes #15614 --- .../tests/functional/block_cloning/block_cloning.kshlib | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/tests/zfs-tests/tests/functional/block_cloning/block_cloning.kshlib b/tests/zfs-tests/tests/functional/block_cloning/block_cloning.kshlib index 526bd54a2bf..50f3a3d262c 100644 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning.kshlib +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning.kshlib @@ -53,6 +53,6 @@ function get_same_blocks awk '/ L0 / { print l++ " " $3 " " $7 }' > $zdbout.a zdb $KEY -vvvvv $3 -O $4 | \ awk '/ L0 / { print l++ " " $3 " " $7 }' > $zdbout.b - echo $(sort $zdbout.a $zdbout.b | uniq -d | cut -f1 -d' ') + echo $(sort -n $zdbout.a $zdbout.b | uniq -d | cut -f1 -d' ') } From d8b0b6032b5f46bf26f7796db5659d68f96485c0 Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Fri, 1 Dec 2023 01:14:56 +0500 Subject: [PATCH 35/91] ZTS: Add test cases for block cloning replay Reviewed-by: Kay Pedersen Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Ameer Hamza Closes #15614 --- tests/runfiles/linux.run | 3 +- tests/test-runner/bin/zts-report.py.in | 5 +- tests/zfs-tests/tests/Makefile.am | 2 + .../block_cloning/block_cloning_replay.ksh | 131 +++++++++++++++++ .../block_cloning_replay_encrypted.ksh | 133 ++++++++++++++++++ 5 files changed, 272 insertions(+), 2 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay.ksh create mode 100755 tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay_encrypted.ksh diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index fb78d96fb52..17ba2335242 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -43,7 +43,8 @@ tests = ['block_cloning_copyfilerange', 'block_cloning_copyfilerange_partial', 'block_cloning_disabled_ficlonerange', 'block_cloning_copyfilerange_cross_dataset', 'block_cloning_cross_enc_dataset', - 'block_cloning_copyfilerange_fallback_same_txg'] + 'block_cloning_copyfilerange_fallback_same_txg', + 'block_cloning_replay', 'block_cloning_replay_encrypted'] tags = 
['functional', 'block_cloning'] [tests/functional/chattr:Linux] diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index b188a101c25..3b5eeacb6ba 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -301,6 +301,10 @@ elif sys.platform.startswith('linux'): ['SKIP', cfr_reason], 'block_cloning/block_cloning_copyfilerange_fallback': ['SKIP', cfr_reason], + 'block_cloning/block_cloning_replay': + ['SKIP', cfr_reason], + 'block_cloning/block_cloning_replay_encrypted': + ['SKIP', cfr_reason], 'block_cloning/block_cloning_copyfilerange_cross_dataset': ['SKIP', cfr_cross_reason], 'block_cloning/block_cloning_copyfilerange_fallback_same_txg': @@ -309,7 +313,6 @@ elif sys.platform.startswith('linux'): ['SKIP', cfr_cross_reason], }) - # Not all Github actions runners have scsi_debug module, so we may skip # some tests which use it. if os.environ.get('CI') == 'true': diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 21b830126b2..88573a15ed3 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -452,6 +452,8 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/block_cloning/block_cloning_ficlonerange.ksh \ functional/block_cloning/block_cloning_ficlonerange_partial.ksh \ functional/block_cloning/block_cloning_cross_enc_dataset.ksh \ + functional/block_cloning/block_cloning_replay.ksh \ + functional/block_cloning/block_cloning_replay_encrypted.ksh \ functional/bootfs/bootfs_001_pos.ksh \ functional/bootfs/bootfs_002_neg.ksh \ functional/bootfs/bootfs_003_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay.ksh new file mode 100755 index 00000000000..1fdf379ed2d --- /dev/null +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay.ksh @@ -0,0 +1,131 @@ +#!/bin/ksh -p +# +# CDDL 
HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +# +# DESCRIPTION: +# Verify slogs are replayed correctly for cloned files. This +# test is ported from slog_replay tests for block cloning. +# +# STRATEGY: +# 1. Create an empty file system (TESTFS) +# 2. Create regular files and sync +# 3. Freeze TESTFS +# 4. Clone the file +# 5. Unmount filesystem +# +# 6. Remount TESTFS +# 7. Compare clone file with the original file +# + +verify_runnable "global" + +if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then + log_unsupported "copy_file_range not available before Linux 4.5" +fi + +export VDIR=$TEST_BASE_DIR/disk-bclone +export VDEV="$VDIR/a $VDIR/b $VDIR/c" +export LDEV="$VDIR/e $VDIR/f" +log_must rm -rf $VDIR +log_must mkdir -p $VDIR +log_must truncate -s $MINVDEVSIZE $VDEV $LDEV + +claim="The slogs are replayed correctly for cloned files." + +log_assert $claim + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL + rm -rf $TESTDIR $VDIR $VDIR2 +} + +log_onexit cleanup + +# +# 1. 
Create an empty file system (TESTFS) +# +log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $VDEV \ + log mirror $LDEV +log_must zfs create $TESTPOOL/$TESTFS + +# +# 2. TX_WRITE: Create two files and sync txg +# +log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/file1 \ + oflag=sync bs=128k count=4 +log_must zfs set recordsize=16K $TESTPOOL/$TESTFS +log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/file2 \ + oflag=sync bs=16K count=2048 +sync_pool $TESTPOOL + +# +# 3. Checkpoint for ZIL Replay +# +log_must zpool freeze $TESTPOOL + +# +# 4. TX_CLONE_RANGE: Clone the file +# +log_must clonefile -c /$TESTPOOL/$TESTFS/file1 /$TESTPOOL/$TESTFS/clone1 +log_must clonefile -c /$TESTPOOL/$TESTFS/file2 /$TESTPOOL/$TESTFS/clone2 + +# +# 5. Unmount filesystem and export the pool +# +# At this stage TESTFS is frozen, the intent log contains a complete set +# of deltas to replay for clone files. +# +log_must zfs unmount /$TESTPOOL/$TESTFS + +log_note "Verify transactions to replay:" +log_must zdb -iv $TESTPOOL/$TESTFS + +log_must zpool export $TESTPOOL + +# +# 6. Remount TESTFS +# +# Import the pool to unfreeze it and claim log blocks. It has to be +# `zpool import -f` because we can't write a frozen pool's labels! +# +log_must zpool import -f -d $VDIR $TESTPOOL + +# +# 7. 
Compare clone file with the original file +# +log_must have_same_content /$TESTPOOL/$TESTFS/file1 /$TESTPOOL/$TESTFS/clone1 +log_must have_same_content /$TESTPOOL/$TESTFS/file2 /$TESTPOOL/$TESTFS/clone2 + +typeset blocks=$(get_same_blocks $TESTPOOL/$TESTFS file1 \ + $TESTPOOL/$TESTFS clone1) +log_must [ "$blocks" = "0 1 2 3" ] + +typeset blocks=$(get_same_blocks $TESTPOOL/$TESTFS file2 \ + $TESTPOOL/$TESTFS clone2) +log_must [ "$blocks" = "$(seq -s " " 0 2047)" ] + +log_pass $claim diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay_encrypted.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay_encrypted.ksh new file mode 100755 index 00000000000..f9f687c83e5 --- /dev/null +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay_encrypted.ksh @@ -0,0 +1,133 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +# +# DESCRIPTION: +# Verify slogs are replayed correctly for encrypted cloned files. +# This test is ported from slog_replay tests for block cloning. +# +# STRATEGY: +# 1. Create an encrypted file system (TESTFS) +# 2. 
Create regular files and sync +# 3. Freeze TESTFS +# 4. Clone the file +# 5. Unmount filesystem +# +# 6. Remount encrypted TESTFS +# 7. Compare clone file with the original file +# + +verify_runnable "global" + +if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then + log_unsupported "copy_file_range not available before Linux 4.5" +fi + +export VDIR=$TEST_BASE_DIR/disk-bclone +export VDEV="$VDIR/a $VDIR/b $VDIR/c" +export LDEV="$VDIR/e $VDIR/f" +log_must rm -rf $VDIR +log_must mkdir -p $VDIR +log_must truncate -s $MINVDEVSIZE $VDEV $LDEV +export PASSPHRASE="password" + +claim="The slogs are replayed correctly for encrypted cloned files." + +log_assert $claim + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL + rm -rf $TESTDIR $VDIR $VDIR2 +} + +log_onexit cleanup + +# +# 1. Create an encrypted file system (TESTFS) +# +log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $VDEV \ + log mirror $LDEV +log_must eval "echo $PASSPHRASE | zfs create -o encryption=on" \ + "-o keyformat=passphrase -o keylocation=prompt $TESTPOOL/$TESTFS" + +# +# 2. TX_WRITE: Create two files and sync txg +# +log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/file1 \ + oflag=sync bs=128k count=4 +log_must zfs set recordsize=16K $TESTPOOL/$TESTFS +log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/file2 \ + oflag=sync bs=16K count=2048 +sync_pool $TESTPOOL + +# +# 3. Checkpoint for ZIL Replay +# +log_must zpool freeze $TESTPOOL + +# +# 4. TX_CLONE_RANGE: Clone the file +# +log_must clonefile -c /$TESTPOOL/$TESTFS/file1 /$TESTPOOL/$TESTFS/clone1 +log_must clonefile -c /$TESTPOOL/$TESTFS/file2 /$TESTPOOL/$TESTFS/clone2 + +# +# 5. Unmount filesystem and export the pool +# +# At this stage TESTFS is frozen, the intent log contains a complete set +# of deltas to replay for clone files. +# +log_must zfs unmount /$TESTPOOL/$TESTFS + +log_note "Verify transactions to replay:" +log_must zdb -iv $TESTPOOL/$TESTFS + +log_must zpool export $TESTPOOL + +# +# 6. 
Remount TESTFS +# +# Import the pool to unfreeze it and claim log blocks. It has to be +# `zpool import -f` because we can't write a frozen pool's labels! +# +log_must eval "echo $PASSPHRASE | zpool import -l -f -d $VDIR $TESTPOOL" + +# +# 7. Compare clone file with the original file +# +log_must have_same_content /$TESTPOOL/$TESTFS/file1 /$TESTPOOL/$TESTFS/clone1 +log_must have_same_content /$TESTPOOL/$TESTFS/file2 /$TESTPOOL/$TESTFS/clone2 + +typeset blocks=$(get_same_blocks $TESTPOOL/$TESTFS file1 \ + $TESTPOOL/$TESTFS clone1 $PASSPHRASE) +log_must [ "$blocks" = "0 1 2 3" ] + +typeset blocks=$(get_same_blocks $TESTPOOL/$TESTFS file2 \ + $TESTPOOL/$TESTFS clone2 $PASSPHRASE) +log_must [ "$blocks" = "$(seq -s " " 0 2047)" ] + +log_pass $claim From f94a77951dd3a1861cb39a4a386c5677d68f25a0 Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Sat, 16 Dec 2023 03:18:27 +0500 Subject: [PATCH 36/91] Test LWB buffer overflow for block cloning PR#15634 removes 128K into 2x68K LWB split optimization, since it was found to cause LWB buffer overflow while trying to write 128KB TX_CLONE_RANGE record with 1022 block pointers into 68KB buffer, with multiple VDEVs ZIL. This commit adds a test for this particular scenario by writing a maximum-sized TX_CLONE_RANGE record with 1022 block pointers into 68KB buffer, with two SLOG devices.
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: Ameer Hamza Signed-off-by: Umer Saleem Closes #15672 --- tests/runfiles/linux.run | 3 +- tests/test-runner/bin/zts-report.py.in | 2 + tests/zfs-tests/tests/Makefile.am | 1 + .../block_cloning_lwb_buffer_overflow.ksh | 89 +++++++++++++++++++ 4 files changed, 94 insertions(+), 1 deletion(-) create mode 100755 tests/zfs-tests/tests/functional/block_cloning/block_cloning_lwb_buffer_overflow.ksh diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index 17ba2335242..c7c17f27176 100644 --- a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -44,7 +44,8 @@ tests = ['block_cloning_copyfilerange', 'block_cloning_copyfilerange_partial', 'block_cloning_copyfilerange_cross_dataset', 'block_cloning_cross_enc_dataset', 'block_cloning_copyfilerange_fallback_same_txg', - 'block_cloning_replay', 'block_cloning_replay_encrypted'] + 'block_cloning_replay', 'block_cloning_replay_encrypted', + 'block_cloning_lwb_buffer_overflow'] tags = ['functional', 'block_cloning'] [tests/functional/chattr:Linux] diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index 3b5eeacb6ba..708b7be9176 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -305,6 +305,8 @@ elif sys.platform.startswith('linux'): ['SKIP', cfr_reason], 'block_cloning/block_cloning_replay_encrypted': ['SKIP', cfr_reason], + 'block_cloning/block_cloning_lwb_buffer_overflow': + ['SKIP', cfr_reason], 'block_cloning/block_cloning_copyfilerange_cross_dataset': ['SKIP', cfr_cross_reason], 'block_cloning/block_cloning_copyfilerange_fallback_same_txg': diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 88573a15ed3..7f5af6530ee 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -454,6 +454,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ 
functional/block_cloning/block_cloning_cross_enc_dataset.ksh \ functional/block_cloning/block_cloning_replay.ksh \ functional/block_cloning/block_cloning_replay_encrypted.ksh \ + functional/block_cloning/block_cloning_lwb_buffer_overflow.ksh \ functional/bootfs/bootfs_001_pos.ksh \ functional/bootfs/bootfs_002_neg.ksh \ functional/bootfs/bootfs_003_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_lwb_buffer_overflow.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_lwb_buffer_overflow.ksh new file mode 100755 index 00000000000..0ae76b7e54a --- /dev/null +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_lwb_buffer_overflow.ksh @@ -0,0 +1,89 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023 by iXsystems, Inc. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +# +# DESCRIPTION: +# Test for LWB buffer overflow with multiple VDEVs ZIL when 128KB +# block write is split into two 68KB ones, trying to write maximum +# sizes 128KB TX_CLONE_RANGE record with 1022 block pointers into +# 68KB buffer. +# +# STRATEGY: +# 1. 
Create a pool with multiple VDEVs ZIL +# 2. Write maximum sizes TX_CLONE_RANGE record with 1022 block +# pointers into 68KB buffer +# 3. Sync TXG +# 4. Clone the file +# 5. Synchronize cached writes +# + +verify_runnable "global" + +if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then + log_unsupported "copy_file_range not available before Linux 4.5" +fi + +VDIR=$TEST_BASE_DIR/disk-bclone +VDEV="$VDIR/a $VDIR/b $VDIR/c" +LDEV="$VDIR/e $VDIR/f" + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL + rm -rf $VDIR +} + +log_onexit cleanup + +log_assert "Test for LWB buffer overflow with multiple VDEVs ZIL" + +log_must rm -rf $VDIR +log_must mkdir -p $VDIR +log_must truncate -s $MINVDEVSIZE $VDEV $LDEV + +log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $VDEV \ + log mirror $LDEV +log_must zfs create -o recordsize=32K $TESTPOOL/$TESTFS +# Each ZIL log entry can fit 130816 bytes for a block cloning operation, +# so it can store 1022 block pointers. When LWB optimization is enabled, +# an assert is hit when 128KB block write is split into two 68KB ones +# for 2 SLOG devices +log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/file1 bs=32K count=1022 \ + conv=fsync +sync_pool $TESTPOOL +log_must clonefile -c /$TESTPOOL/$TESTFS/file1 /$TESTPOOL/$TESTFS/file2 +log_must sync + +sync_pool $TESTPOOL +log_must have_same_content /$TESTPOOL/$TESTFS/file1 /$TESTPOOL/$TESTFS/file2 +typeset blocks=$(get_same_blocks $TESTPOOL/$TESTFS file1 $TESTPOOL/$TESTFS file2) +log_must [ "$blocks" = "$(seq -s " " 0 1021)" ] + +log_pass "LWB buffer overflow is not triggered with multiple VDEVs ZIL" + From c16d103422806ed503cc6186fa098b1e8ee10c79 Mon Sep 17 00:00:00 2001 From: Pawel Jakub Dawidek Date: Tue, 26 Dec 2023 12:01:53 -0800 Subject: [PATCH 37/91] Block cloning tests. The tests mostly focus on testing various corner cases. The tests take a long time to run, so for the common.run runfile we randomly select a hundred tests.
To run all the bclone tests, bclone.run runfile should be used. Reviewed-by: Brian Behlendorf Signed-off-by: Pawel Jakub Dawidek Closes #15631 --- tests/Makefile.am | 1 + tests/runfiles/bclone.run | 46 +++ tests/runfiles/common.run | 18 + tests/test-runner/bin/zts-report.py.in | 73 ++-- tests/zfs-tests/cmd/Makefile.am | 2 +- tests/zfs-tests/cmd/clonefile.c | 80 +++-- tests/zfs-tests/include/commands.cfg | 3 +- tests/zfs-tests/include/math.shlib | 13 +- tests/zfs-tests/tests/Makefile.am | 21 ++ tests/zfs-tests/tests/functional/bclone/TODO | 4 + .../tests/functional/bclone/bclone.cfg | 32 ++ .../functional/bclone/bclone_common.kshlib | 280 ++++++++++++++++ .../bclone/bclone_corner_cases.kshlib | 315 ++++++++++++++++++ .../bclone/bclone_crossfs_corner_cases.ksh | 45 +++ .../bclone_crossfs_corner_cases_limited.ksh | 45 +++ .../functional/bclone/bclone_crossfs_data.ksh | 46 +++ .../bclone/bclone_crossfs_embedded.ksh | 50 +++ .../functional/bclone/bclone_crossfs_hole.ksh | 45 +++ .../bclone/bclone_diffprops_all.ksh | 86 +++++ .../bclone/bclone_diffprops_checksum.ksh | 62 ++++ .../bclone/bclone_diffprops_compress.ksh | 59 ++++ .../bclone/bclone_diffprops_copies.ksh | 59 ++++ .../bclone/bclone_diffprops_recordsize.ksh | 65 ++++ .../functional/bclone/bclone_prop_sync.ksh | 66 ++++ .../bclone/bclone_samefs_corner_cases.ksh | 42 +++ .../bclone_samefs_corner_cases_limited.ksh | 42 +++ .../functional/bclone/bclone_samefs_data.ksh | 44 +++ .../bclone/bclone_samefs_embedded.ksh | 48 +++ .../functional/bclone/bclone_samefs_hole.ksh | 44 +++ .../tests/functional/bclone/cleanup.ksh | 37 ++ .../tests/functional/bclone/setup.ksh | 45 +++ .../functional/redundancy/redundancy.kshlib | 22 -- 32 files changed, 1767 insertions(+), 73 deletions(-) create mode 100644 tests/runfiles/bclone.run create mode 100644 tests/zfs-tests/tests/functional/bclone/TODO create mode 100644 tests/zfs-tests/tests/functional/bclone/bclone.cfg create mode 100644 
tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib create mode 100644 tests/zfs-tests/tests/functional/bclone/bclone_corner_cases.kshlib create mode 100755 tests/zfs-tests/tests/functional/bclone/bclone_crossfs_corner_cases.ksh create mode 100755 tests/zfs-tests/tests/functional/bclone/bclone_crossfs_corner_cases_limited.ksh create mode 100755 tests/zfs-tests/tests/functional/bclone/bclone_crossfs_data.ksh create mode 100755 tests/zfs-tests/tests/functional/bclone/bclone_crossfs_embedded.ksh create mode 100755 tests/zfs-tests/tests/functional/bclone/bclone_crossfs_hole.ksh create mode 100755 tests/zfs-tests/tests/functional/bclone/bclone_diffprops_all.ksh create mode 100755 tests/zfs-tests/tests/functional/bclone/bclone_diffprops_checksum.ksh create mode 100755 tests/zfs-tests/tests/functional/bclone/bclone_diffprops_compress.ksh create mode 100755 tests/zfs-tests/tests/functional/bclone/bclone_diffprops_copies.ksh create mode 100755 tests/zfs-tests/tests/functional/bclone/bclone_diffprops_recordsize.ksh create mode 100755 tests/zfs-tests/tests/functional/bclone/bclone_prop_sync.ksh create mode 100755 tests/zfs-tests/tests/functional/bclone/bclone_samefs_corner_cases.ksh create mode 100755 tests/zfs-tests/tests/functional/bclone/bclone_samefs_corner_cases_limited.ksh create mode 100755 tests/zfs-tests/tests/functional/bclone/bclone_samefs_data.ksh create mode 100755 tests/zfs-tests/tests/functional/bclone/bclone_samefs_embedded.ksh create mode 100755 tests/zfs-tests/tests/functional/bclone/bclone_samefs_hole.ksh create mode 100755 tests/zfs-tests/tests/functional/bclone/cleanup.ksh create mode 100755 tests/zfs-tests/tests/functional/bclone/setup.ksh diff --git a/tests/Makefile.am b/tests/Makefile.am index 2e633041ab5..12e9c9f9daf 100644 --- a/tests/Makefile.am +++ b/tests/Makefile.am @@ -16,6 +16,7 @@ dist_scripts_test_runner_include_DATA = \ scripts_runfilesdir = $(datadir)/$(PACKAGE)/runfiles dist_scripts_runfiles_DATA = \ + %D%/runfiles/bclone.run \ 
%D%/runfiles/common.run \ %D%/runfiles/freebsd.run \ %D%/runfiles/linux.run \ diff --git a/tests/runfiles/bclone.run b/tests/runfiles/bclone.run new file mode 100644 index 00000000000..3d0f545d922 --- /dev/null +++ b/tests/runfiles/bclone.run @@ -0,0 +1,46 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# This run file contains all of the block cloning (bclone) functional +# tests, including the full corner-case tests that take too long to +# run in their entirety from the common.run file. +# +# Approximate run time: 5 hours +# + +[DEFAULT] +pre = setup +quiet = False +pre_user = root +user = root +timeout = 28800 +post_user = root +post = cleanup +failsafe_user = root +failsafe = callbacks/zfs_failsafe +outputdir = /var/tmp/test_results +tags = ['bclone'] + +[tests/functional/bclone] +tests = ['bclone_crossfs_corner_cases', + 'bclone_crossfs_data', + 'bclone_crossfs_embedded', + 'bclone_crossfs_hole', + 'bclone_diffprops_all', + 'bclone_diffprops_checksum', + 'bclone_diffprops_compress', + 'bclone_diffprops_copies', + 'bclone_diffprops_recordsize', + 'bclone_prop_sync', + 'bclone_samefs_corner_cases', + 'bclone_samefs_data', + 'bclone_samefs_embedded', + 'bclone_samefs_hole'] +tags = ['bclone'] diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index ef787c65c0f..f94a5fba9e2 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -53,6 +53,24 @@ tags = ['functional', 'arc'] tests = ['atime_001_pos', 'atime_002_neg', 'root_atime_off', 'root_atime_on'] tags = ['functional', 'atime'] +[tests/functional/bclone] +tests = ['bclone_crossfs_corner_cases_limited',
'bclone_crossfs_data', + 'bclone_crossfs_embedded', + 'bclone_crossfs_hole', + 'bclone_diffprops_all', + 'bclone_diffprops_checksum', + 'bclone_diffprops_compress', + 'bclone_diffprops_copies', + 'bclone_diffprops_recordsize', + 'bclone_prop_sync', + 'bclone_samefs_corner_cases_limited', + 'bclone_samefs_data', + 'bclone_samefs_embedded', + 'bclone_samefs_hole'] +tags = ['functional', 'bclone'] +timeout = 7200 + [tests/functional/bootfs] tests = ['bootfs_001_pos', 'bootfs_002_neg', 'bootfs_003_pos', 'bootfs_004_neg', 'bootfs_005_neg', 'bootfs_006_pos', 'bootfs_007_pos', diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index 708b7be9176..7bf4d05d542 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -263,13 +263,50 @@ if sys.platform.startswith('freebsd'): 'cli_root/zpool_import/zpool_import_012_pos': ['FAIL', known_reason], 'delegate/zfs_allow_003_pos': ['FAIL', known_reason], 'inheritance/inherit_001_pos': ['FAIL', 11829], - 'resilver/resilver_restart_001': ['FAIL', known_reason], 'pool_checkpoint/checkpoint_big_rewind': ['FAIL', 12622], 'pool_checkpoint/checkpoint_indirect': ['FAIL', 12623], + 'resilver/resilver_restart_001': ['FAIL', known_reason], 'snapshot/snapshot_002_pos': ['FAIL', '14831'], }) elif sys.platform.startswith('linux'): maybe.update({ + 'bclone/bclone_crossfs_corner_cases': ['SKIP', cfr_cross_reason], + 'bclone/bclone_crossfs_corner_cases_limited': + ['SKIP', cfr_cross_reason], + 'bclone/bclone_crossfs_data': ['SKIP', cfr_cross_reason], + 'bclone/bclone_crossfs_embedded': ['SKIP', cfr_cross_reason], + 'bclone/bclone_crossfs_hole': ['SKIP', cfr_cross_reason], + 'bclone/bclone_diffprops_all': ['SKIP', cfr_cross_reason], + 'bclone/bclone_diffprops_checksum': ['SKIP', cfr_cross_reason], + 'bclone/bclone_diffprops_compress': ['SKIP', cfr_cross_reason], + 'bclone/bclone_diffprops_copies': ['SKIP', cfr_cross_reason], + 'bclone/bclone_diffprops_recordsize': 
['SKIP', cfr_cross_reason], + 'bclone/bclone_prop_sync': ['SKIP', cfr_cross_reason], + 'bclone/bclone_samefs_corner_cases': ['SKIP', cfr_reason], + 'bclone/bclone_samefs_corner_cases_limited': ['SKIP', cfr_reason], + 'bclone/bclone_samefs_data': ['SKIP', cfr_reason], + 'bclone/bclone_samefs_embedded': ['SKIP', cfr_reason], + 'bclone/bclone_samefs_hole': ['SKIP', cfr_reason], + 'block_cloning/block_cloning_copyfilerange': + ['SKIP', cfr_reason], + 'block_cloning/block_cloning_copyfilerange_cross_dataset': + ['SKIP', cfr_cross_reason], + 'block_cloning/block_cloning_copyfilerange_fallback': + ['SKIP', cfr_reason], + 'block_cloning/block_cloning_copyfilerange_fallback_same_txg': + ['SKIP', cfr_cross_reason], + 'block_cloning/block_cloning_copyfilerange_partial': + ['SKIP', cfr_reason], + 'block_cloning/block_cloning_cross_enc_dataset': + ['SKIP', cfr_cross_reason], + 'block_cloning/block_cloning_disabled_copyfilerange': + ['SKIP', cfr_reason], + 'block_cloning/block_cloning_lwb_buffer_overflow': + ['SKIP', cfr_reason], + 'block_cloning/block_cloning_replay': + ['SKIP', cfr_reason], + 'block_cloning/block_cloning_replay_encrypted': + ['SKIP', cfr_reason], 'cli_root/zfs_rename/zfs_rename_002_pos': ['FAIL', known_reason], 'cli_root/zpool_reopen/zpool_reopen_003_pos': ['FAIL', known_reason], 'fault/auto_online_002_pos': ['FAIL', 11889], @@ -278,41 +315,21 @@ elif sys.platform.startswith('linux'): 'fault/auto_spare_multiple': ['FAIL', 11889], 'fault/auto_spare_shared': ['FAIL', 11889], 'fault/decompress_fault': ['FAIL', 11889], + 'idmap_mount/idmap_mount_001': ['SKIP', idmap_reason], + 'idmap_mount/idmap_mount_002': ['SKIP', idmap_reason], + 'idmap_mount/idmap_mount_003': ['SKIP', idmap_reason], + 'idmap_mount/idmap_mount_004': ['SKIP', idmap_reason], + 'idmap_mount/idmap_mount_005': ['SKIP', idmap_reason], 'io/io_uring': ['SKIP', 'io_uring support required'], 'limits/filesystem_limit': ['SKIP', known_reason], 'limits/snapshot_limit': ['SKIP', known_reason], 
'mmp/mmp_active_import': ['FAIL', known_reason], 'mmp/mmp_exported_import': ['FAIL', known_reason], 'mmp/mmp_inactive_import': ['FAIL', known_reason], - 'zvol/zvol_misc/zvol_misc_snapdev': ['FAIL', 12621], - 'zvol/zvol_misc/zvol_misc_volmode': ['FAIL', known_reason], 'zvol/zvol_misc/zvol_misc_fua': ['SKIP', 14872], + 'zvol/zvol_misc/zvol_misc_snapdev': ['FAIL', 12621], 'zvol/zvol_misc/zvol_misc_trim': ['SKIP', 14872], - 'idmap_mount/idmap_mount_001': ['SKIP', idmap_reason], - 'idmap_mount/idmap_mount_002': ['SKIP', idmap_reason], - 'idmap_mount/idmap_mount_003': ['SKIP', idmap_reason], - 'idmap_mount/idmap_mount_004': ['SKIP', idmap_reason], - 'idmap_mount/idmap_mount_005': ['SKIP', idmap_reason], - 'block_cloning/block_cloning_disabled_copyfilerange': - ['SKIP', cfr_reason], - 'block_cloning/block_cloning_copyfilerange': - ['SKIP', cfr_reason], - 'block_cloning/block_cloning_copyfilerange_partial': - ['SKIP', cfr_reason], - 'block_cloning/block_cloning_copyfilerange_fallback': - ['SKIP', cfr_reason], - 'block_cloning/block_cloning_replay': - ['SKIP', cfr_reason], - 'block_cloning/block_cloning_replay_encrypted': - ['SKIP', cfr_reason], - 'block_cloning/block_cloning_lwb_buffer_overflow': - ['SKIP', cfr_reason], - 'block_cloning/block_cloning_copyfilerange_cross_dataset': - ['SKIP', cfr_cross_reason], - 'block_cloning/block_cloning_copyfilerange_fallback_same_txg': - ['SKIP', cfr_cross_reason], - 'block_cloning/block_cloning_cross_enc_dataset': - ['SKIP', cfr_cross_reason], + 'zvol/zvol_misc/zvol_misc_volmode': ['FAIL', known_reason], }) # Not all Github actions runners have scsi_debug module, so we may skip diff --git a/tests/zfs-tests/cmd/Makefile.am b/tests/zfs-tests/cmd/Makefile.am index 9bdb3c20975..1b915ae98ca 100644 --- a/tests/zfs-tests/cmd/Makefile.am +++ b/tests/zfs-tests/cmd/Makefile.am @@ -2,6 +2,7 @@ scripts_zfs_tests_bindir = $(datadir)/$(PACKAGE)/zfs-tests/bin scripts_zfs_tests_bin_PROGRAMS = %D%/chg_usr_exec +scripts_zfs_tests_bin_PROGRAMS += 
%D%/clonefile scripts_zfs_tests_bin_PROGRAMS += %D%/cp_files scripts_zfs_tests_bin_PROGRAMS += %D%/ctime scripts_zfs_tests_bin_PROGRAMS += %D%/dir_rd_update @@ -119,7 +120,6 @@ scripts_zfs_tests_bin_PROGRAMS += %D%/renameat2 scripts_zfs_tests_bin_PROGRAMS += %D%/xattrtest scripts_zfs_tests_bin_PROGRAMS += %D%/zed_fd_spill-zedlet scripts_zfs_tests_bin_PROGRAMS += %D%/idmap_util -scripts_zfs_tests_bin_PROGRAMS += %D%/clonefile %C%_idmap_util_LDADD = libspl.la diff --git a/tests/zfs-tests/cmd/clonefile.c b/tests/zfs-tests/cmd/clonefile.c index 696dc471d8c..d002cd9b587 100644 --- a/tests/zfs-tests/cmd/clonefile.c +++ b/tests/zfs-tests/cmd/clonefile.c @@ -59,6 +59,10 @@ #endif #endif /* __NR_copy_file_range */ +#ifdef __FreeBSD__ +#define loff_t off_t +#endif + ssize_t copy_file_range(int, loff_t *, int, loff_t *, size_t, unsigned int) __attribute__((weak)); @@ -140,7 +144,7 @@ usage(void) " FICLONERANGE:\n" " clonefile -r \n" " copy_file_range:\n" - " clonefile -f \n" + " clonefile -f [ ]\n" " FIDEDUPERANGE:\n" " clonefile -d \n"); return (1); @@ -179,13 +183,29 @@ main(int argc, char **argv) } } - if (mode == CF_MODE_NONE || (argc-optind) < 2 || - (mode != CF_MODE_CLONE && (argc-optind) < 5)) - return (usage()); + switch (mode) { + case CF_MODE_NONE: + return (usage()); + case CF_MODE_CLONE: + if ((argc-optind) != 2) + return (usage()); + break; + case CF_MODE_CLONERANGE: + case CF_MODE_DEDUPERANGE: + if ((argc-optind) != 5) + return (usage()); + break; + case CF_MODE_COPYFILERANGE: + if ((argc-optind) != 2 && (argc-optind) != 5) + return (usage()); + break; + default: + abort(); + } loff_t soff = 0, doff = 0; - size_t len = 0; - if (mode != CF_MODE_CLONE) { + size_t len = SSIZE_MAX; + if ((argc-optind) == 5) { soff = strtoull(argv[optind+2], NULL, 10); if (soff == ULLONG_MAX) { fprintf(stderr, "invalid source offset"); @@ -196,10 +216,15 @@ main(int argc, char **argv) fprintf(stderr, "invalid dest offset"); return (1); } - len = strtoull(argv[optind+4], NULL, 10); - 
if (len == ULLONG_MAX) { - fprintf(stderr, "invalid length"); - return (1); + if (mode == CF_MODE_COPYFILERANGE && + strcmp(argv[optind+4], "all") == 0) { + len = SSIZE_MAX; + } else { + len = strtoull(argv[optind+4], NULL, 10); + if (len == ULLONG_MAX) { + fprintf(stderr, "invalid length"); + return (1); + } } } @@ -237,13 +262,15 @@ main(int argc, char **argv) abort(); } - off_t spos = lseek(sfd, 0, SEEK_CUR); - off_t slen = lseek(sfd, 0, SEEK_END); - off_t dpos = lseek(dfd, 0, SEEK_CUR); - off_t dlen = lseek(dfd, 0, SEEK_END); + if (!quiet) { + off_t spos = lseek(sfd, 0, SEEK_CUR); + off_t slen = lseek(sfd, 0, SEEK_END); + off_t dpos = lseek(dfd, 0, SEEK_CUR); + off_t dlen = lseek(dfd, 0, SEEK_END); - fprintf(stderr, "file offsets: src=%lu/%lu; dst=%lu/%lu\n", spos, slen, - dpos, dlen); + fprintf(stderr, "file offsets: src=%lu/%lu; dst=%lu/%lu\n", + spos, slen, dpos, dlen); + } close(dfd); close(sfd); @@ -254,7 +281,8 @@ main(int argc, char **argv) int do_clone(int sfd, int dfd) { - fprintf(stderr, "using FICLONE\n"); + if (!quiet) + fprintf(stderr, "using FICLONE\n"); int err = ioctl(dfd, CF_FICLONE, sfd); if (err < 0) { fprintf(stderr, "ioctl(FICLONE): %s\n", strerror(errno)); @@ -266,7 +294,8 @@ do_clone(int sfd, int dfd) int do_clonerange(int sfd, int dfd, loff_t soff, loff_t doff, size_t len) { - fprintf(stderr, "using FICLONERANGE\n"); + if (!quiet) + fprintf(stderr, "using FICLONERANGE\n"); cf_file_clone_range_t fcr = { .src_fd = sfd, .src_offset = soff, @@ -284,12 +313,22 @@ do_clonerange(int sfd, int dfd, loff_t soff, loff_t doff, size_t len) int do_copyfilerange(int sfd, int dfd, loff_t soff, loff_t doff, size_t len) { - fprintf(stderr, "using copy_file_range\n"); + if (!quiet) + fprintf(stderr, "using copy_file_range\n"); ssize_t copied = cf_copy_file_range(sfd, &soff, dfd, &doff, len, 0); if (copied < 0) { fprintf(stderr, "copy_file_range: %s\n", strerror(errno)); return (1); } + if (len == SSIZE_MAX) { + struct stat sb; + + if (fstat(sfd, &sb) < 0) 
{ + fprintf(stderr, "fstat(sfd): %s\n", strerror(errno)); + return (1); + } + len = sb.st_size; + } if (copied != len) { fprintf(stderr, "copy_file_range: copied less than requested: " "requested=%lu; copied=%lu\n", len, copied); @@ -301,7 +340,8 @@ do_copyfilerange(int sfd, int dfd, loff_t soff, loff_t doff, size_t len) int do_deduperange(int sfd, int dfd, loff_t soff, loff_t doff, size_t len) { - fprintf(stderr, "using FIDEDUPERANGE\n"); + if (!quiet) + fprintf(stderr, "using FIDEDUPERANGE\n"); char buf[sizeof (cf_file_dedupe_range_t)+ sizeof (cf_file_dedupe_range_info_t)] = {0}; diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg index 648f2203dfb..c6f74cd81a1 100644 --- a/tests/zfs-tests/include/commands.cfg +++ b/tests/zfs-tests/include/commands.cfg @@ -98,7 +98,8 @@ export SYSTEM_FILES_COMMON='awk uname uniq vmstat - wc' + wc + xargs' export SYSTEM_FILES_FREEBSD='chflags compress diff --git a/tests/zfs-tests/include/math.shlib b/tests/zfs-tests/include/math.shlib index da1e77e5fb9..2b5e60180f5 100644 --- a/tests/zfs-tests/include/math.shlib +++ b/tests/zfs-tests/include/math.shlib @@ -123,10 +123,21 @@ function verify_ne # # # $1 lower bound # $2 upper bound +# [$3 how many] function random_int_between { typeset -i min=$1 typeset -i max=$2 + typeset -i count + typeset -i i - echo $(( (RANDOM % (max - min + 1)) + min )) + if [[ -z "$3" ]]; then + count=1 + else + count=$3 + fi + + for (( i = 0; i < $count; i++ )); do + echo $(( (RANDOM % (max - min + 1)) + min )) + done } diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 7f5af6530ee..33e97d22b6c 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -90,6 +90,9 @@ nobase_dist_datadir_zfs_tests_tests_DATA += \ functional/alloc_class/alloc_class.kshlib \ functional/atime/atime.cfg \ functional/atime/atime_common.kshlib \ + functional/bclone/bclone.cfg \ + functional/bclone/bclone_common.kshlib \ + 
functional/bclone/bclone_corner_cases.kshlib \ functional/block_cloning/block_cloning.kshlib \ functional/cache/cache.cfg \ functional/cache/cache.kshlib \ @@ -438,6 +441,24 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/atime/root_atime_on.ksh \ functional/atime/root_relatime_on.ksh \ functional/atime/setup.ksh \ + functional/bclone/bclone_crossfs_corner_cases.ksh \ + functional/bclone/bclone_crossfs_corner_cases_limited.ksh \ + functional/bclone/bclone_crossfs_data.ksh \ + functional/bclone/bclone_crossfs_embedded.ksh \ + functional/bclone/bclone_crossfs_hole.ksh \ + functional/bclone/bclone_diffprops_all.ksh \ + functional/bclone/bclone_diffprops_checksum.ksh \ + functional/bclone/bclone_diffprops_compress.ksh \ + functional/bclone/bclone_diffprops_copies.ksh \ + functional/bclone/bclone_diffprops_recordsize.ksh \ + functional/bclone/bclone_prop_sync.ksh \ + functional/bclone/bclone_samefs_corner_cases.ksh \ + functional/bclone/bclone_samefs_corner_cases_limited.ksh \ + functional/bclone/bclone_samefs_data.ksh \ + functional/bclone/bclone_samefs_embedded.ksh \ + functional/bclone/bclone_samefs_hole.ksh \ + functional/bclone/cleanup.ksh \ + functional/bclone/setup.ksh \ functional/block_cloning/cleanup.ksh \ functional/block_cloning/setup.ksh \ functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh \ diff --git a/tests/zfs-tests/tests/functional/bclone/TODO b/tests/zfs-tests/tests/functional/bclone/TODO new file mode 100644 index 00000000000..7cd4ee898fc --- /dev/null +++ b/tests/zfs-tests/tests/functional/bclone/TODO @@ -0,0 +1,4 @@ +- If dedup enabled, block_cloning uses dedup. 
+- check cases where block cloning isn't supposed to work +- check block cloning between two different pools +- block cloning from a snapshot diff --git a/tests/zfs-tests/tests/functional/bclone/bclone.cfg b/tests/zfs-tests/tests/functional/bclone/bclone.cfg new file mode 100644 index 00000000000..f72d17c1bec --- /dev/null +++ b/tests/zfs-tests/tests/functional/bclone/bclone.cfg @@ -0,0 +1,32 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023 by Pawel Jakub Dawidek +# + +# TODO: We should calculate that based on ashift. +export MINBLOCKSIZE=512 + +export TESTSRCFS="$TESTPOOL/$TESTFS/src" +export TESTDSTFS="$TESTPOOL/$TESTFS/dst" +export TESTSRCDIR="$TESTDIR/src" +export TESTDSTDIR="$TESTDIR/dst" diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib b/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib new file mode 100644 index 00000000000..beba01c0ed2 --- /dev/null +++ b/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib @@ -0,0 +1,280 @@ +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License").
+# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023 by Pawel Jakub Dawidek +# + +. $STF_SUITE/tests/functional/bclone/bclone.cfg + +export RECORDSIZE=$(zfs get -Hp -o value recordsize $TESTPOOL/$TESTFS) + +MINBLKSIZE1=512 +MINBLKSIZE2=1024 + +function verify_block_cloning +{ + if is_linux && [[ $(linux_version) -lt $(linux_version "4.5") ]]; then + log_unsupported "copy_file_range not available before Linux 4.5" + fi +} + +function verify_crossfs_block_cloning +{ + if is_linux && [[ $(linux_version) -lt $(linux_version "5.3") ]]; then + log_unsupported "copy_file_range can't copy cross-filesystem before Linux 5.3" + fi +} + +# Unused. 
+function size_to_dsize +{ + typeset -r size=$1 + typeset -r dir=$2 + + typeset -r dataset=$(df $dir | tail -1 | awk '{print $1}') + typeset -r recordsize=$(get_prop recordsize $dataset) + typeset -r copies=$(get_prop copies $dataset) + typeset dsize + + if [[ $size -le $recordsize ]]; then + dsize=$(( ((size - 1) / MINBLOCKSIZE + 1) * MINBLOCKSIZE )) + else + dsize=$(( ((size - 1) / recordsize + 1) * recordsize )) + fi + dsize=$((dsize*copies)) + + echo $dsize +} + +function test_file_integrity +{ + typeset -r original_checksum=$1 + typeset -r clone=$2 + typeset -r filesize=$3 + + typeset -r clone_checksum=$(sha256digest $clone) + + if [[ $original_checksum != $clone_checksum ]]; then + log_fail "Clone $clone is corrupted with file size $filesize" + fi +} + +function verify_pool_prop_eq +{ + typeset -r prop=$1 + typeset -r expected=$2 + + typeset -r value=$(get_pool_prop $prop $TESTPOOL) + if [[ $value != $expected ]]; then + log_fail "Pool property $prop is incorrect: expected $expected, got $value" + fi +} + +function verify_pool_props +{ + typeset -r dsize=$1 + typeset -r ratio=$2 + + if [[ $dsize -eq 0 ]]; then + verify_pool_prop_eq bcloneused 0 + verify_pool_prop_eq bclonesaved 0 + verify_pool_prop_eq bcloneratio 1.00 + else + if [[ $ratio -eq 1 ]]; then + verify_pool_prop_eq bcloneused 0 + else + verify_pool_prop_eq bcloneused $dsize + fi + verify_pool_prop_eq bclonesaved $((dsize*(ratio-1))) + verify_pool_prop_eq bcloneratio "${ratio}.00" + fi +} + +# Function to test file copying and integrity check. +function bclone_test +{ + typeset -r datatype=$1 + typeset filesize=$2 + typeset -r embedded=$3 + typeset -r srcdir=$4 + typeset -r dstdir=$5 + typeset dsize + + typeset -r original="${srcdir}/original" + typeset -r clone="${dstdir}/clone" + + log_note "Testing file copy with datatype $datatype, file size $filesize, embedded $embedded" + + # Create a test file with known content. 
+ case $datatype in + random|text) + sync_pool $TESTPOOL + if [[ $datatype = "random" ]]; then + dd if=/dev/urandom of=$original bs=$filesize count=1 2>/dev/null + else + filesize=$(((filesize/4)*4)) + dd if=/dev/urandom bs=$(((filesize/4)*3)) count=1 | \ + openssl base64 -A > $original + fi + sync_pool $TESTPOOL + clonefile -f $original "${clone}-tmp" + sync_pool $TESTPOOL + # It is hard to predict block sizes that will be used, + # so just do one clone and take it from bcloneused. + filesize=$(zpool get -Hp -o value bcloneused $TESTPOOL) + if [[ $embedded = "false" ]]; then + log_must test $filesize -gt 0 + fi + rm -f "${clone}-tmp" + sync_pool $TESTPOOL + dsize=$filesize + ;; + hole) + log_must truncate_test -s $filesize -f $original + dsize=0 + ;; + *) + log_fail "Unknown datatype $datatype" + ;; + esac + if [[ $embedded = "true" ]]; then + dsize=0 + fi + + typeset -r original_checksum=$(sha256digest $original) + + sync_pool $TESTPOOL + + # Create a first clone of the entire file. + clonefile -f $original "${clone}0" + # Try to clone the clone in the same transaction group. + clonefile -f "${clone}0" "${clone}2" + + # Clone the original again... + clonefile -f $original "${clone}1" + # ...and overwrite it in the same transaction group. + clonefile -f $original "${clone}1" + + # Clone the clone... + clonefile -f "${clone}1" "${clone}3" + sync_pool $TESTPOOL + # ...and overwrite in the new transaction group. + clonefile -f "${clone}1" "${clone}3" + + sync_pool $TESTPOOL + + # Test removal of the pending clones (before they are committed to disk). + clonefile -f $original "${clone}4" + clonefile -f "${clone}4" "${clone}5" + rm -f "${clone}4" "${clone}5" + + # Clone into one file, but remove another file, but with the same data in + # the same transaction group. 
+ clonefile -f $original "${clone}5" + sync_pool $TESTPOOL + clonefile -f $original "${clone}4" + rm -f "${clone}5" + test_file_integrity $original_checksum "${clone}4" $filesize + sync_pool $TESTPOOL + test_file_integrity $original_checksum "${clone}4" $filesize + + clonefile -f "${clone}4" "${clone}5" + # Verify integrity of the cloned file before it is committed to disk. + test_file_integrity $original_checksum "${clone}5" $filesize + + sync_pool $TESTPOOL + + # Verify integrity in the new transaction group. + test_file_integrity $original_checksum "${clone}0" $filesize + test_file_integrity $original_checksum "${clone}1" $filesize + test_file_integrity $original_checksum "${clone}2" $filesize + test_file_integrity $original_checksum "${clone}3" $filesize + test_file_integrity $original_checksum "${clone}4" $filesize + test_file_integrity $original_checksum "${clone}5" $filesize + + verify_pool_props $dsize 7 + + # Clear cache and test after fresh import. + log_must zpool export $TESTPOOL + log_must zpool import $TESTPOOL + + # Cloned uncached file. + clonefile -f $original "${clone}6" + # Cloned uncached clone. + clonefile -f "${clone}6" "${clone}7" + + # Cache the file. 
+ cat $original >/dev/null + clonefile -f $original "${clone}8" + clonefile -f "${clone}8" "${clone}9" + + test_file_integrity $original_checksum "${clone}6" $filesize + test_file_integrity $original_checksum "${clone}7" $filesize + test_file_integrity $original_checksum "${clone}8" $filesize + test_file_integrity $original_checksum "${clone}9" $filesize + + sync_pool $TESTPOOL + + verify_pool_props $dsize 11 + + log_must zpool export $TESTPOOL + log_must zpool import $TESTPOOL + + test_file_integrity $original_checksum "${clone}0" $filesize + test_file_integrity $original_checksum "${clone}1" $filesize + test_file_integrity $original_checksum "${clone}2" $filesize + test_file_integrity $original_checksum "${clone}3" $filesize + test_file_integrity $original_checksum "${clone}4" $filesize + test_file_integrity $original_checksum "${clone}5" $filesize + test_file_integrity $original_checksum "${clone}6" $filesize + test_file_integrity $original_checksum "${clone}7" $filesize + test_file_integrity $original_checksum "${clone}8" $filesize + test_file_integrity $original_checksum "${clone}9" $filesize + + rm -f $original + rm -f "${clone}1" "${clone}3" "${clone}5" "${clone}7" + + sync_pool $TESTPOOL + + test_file_integrity $original_checksum "${clone}0" $filesize + test_file_integrity $original_checksum "${clone}2" $filesize + test_file_integrity $original_checksum "${clone}4" $filesize + test_file_integrity $original_checksum "${clone}6" $filesize + test_file_integrity $original_checksum "${clone}8" $filesize + test_file_integrity $original_checksum "${clone}9" $filesize + + verify_pool_props $dsize 6 + + rm -f "${clone}0" "${clone}2" "${clone}4" "${clone}8" "${clone}9" + + sync_pool $TESTPOOL + + test_file_integrity $original_checksum "${clone}6" $filesize + + verify_pool_props $dsize 1 + + rm -f "${clone}6" + + sync_pool $TESTPOOL + + verify_pool_props $dsize 1 +} diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_corner_cases.kshlib 
b/tests/zfs-tests/tests/functional/bclone/bclone_corner_cases.kshlib new file mode 100644 index 00000000000..ddfbfc999c4 --- /dev/null +++ b/tests/zfs-tests/tests/functional/bclone/bclone_corner_cases.kshlib @@ -0,0 +1,315 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023 by Pawel Jakub Dawidek +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/include/math.shlib +. $STF_SUITE/tests/functional/bclone/bclone_common.kshlib + +function first_half_checksum +{ + typeset -r file=$1 + + dd if=$file bs=$HALFRECORDSIZE count=1 2>/dev/null | sha256digest +} + +function second_half_checksum +{ + typeset -r file=$1 + + dd if=$file bs=$HALFRECORDSIZE count=1 skip=1 2>/dev/null | sha256digest +} + +function bclone_corner_cases_init +{ + typeset -r srcdir=$1 + typeset -r dstdir=$2 + + export RECORDSIZE=4096 + export HALFRECORDSIZE=$((RECORDSIZE / 2)) + + export CLONE="$dstdir/clone0" + export ORIG0="$srcdir/orig0" + export ORIG1="$srcdir/orig1" + export ORIG2="$srcdir/orig2" + + # Create source files. 
+ log_must dd if=/dev/urandom of="$ORIG0" bs=$RECORDSIZE count=1 + log_must dd if=/dev/urandom of="$ORIG1" bs=$RECORDSIZE count=1 + log_must dd if=/dev/urandom of="$ORIG2" bs=$RECORDSIZE count=1 + + export FIRST_HALF_ORIG0_CHECKSUM=$(first_half_checksum $ORIG0) + export FIRST_HALF_ORIG1_CHECKSUM=$(first_half_checksum $ORIG1) + export FIRST_HALF_ORIG2_CHECKSUM=$(first_half_checksum $ORIG2) + export SECOND_HALF_ORIG0_CHECKSUM=$(second_half_checksum $ORIG0) + export SECOND_HALF_ORIG1_CHECKSUM=$(second_half_checksum $ORIG1) + export SECOND_HALF_ORIG2_CHECKSUM=$(second_half_checksum $ORIG2) + export ZEROS_CHECKSUM=$(dd if=/dev/zero bs=$HALFRECORDSIZE count=1 | sha256digest) + export FIRST_HALF_CHECKSUM="" + export SECOND_HALF_CHECKSUM="" +} + +function cache_clone +{ + typeset -r cached=$1 + + case "$cached" in + "cached") + dd if=$CLONE of=/dev/null bs=$RECORDSIZE 2>/dev/null + ;; + "uncached") + ;; + *) + log_fail "invalid cached: $cached" + ;; + esac +} + +function create_existing +{ + typeset -r existing=$1 + + case "$existing" in + "no") + ;; + "small empty") + log_must truncate_test -s $HALFRECORDSIZE -f $CLONE + ;; + "full empty") + log_must truncate_test -s $RECORDSIZE -f $CLONE + ;; + "small data") + log_must dd if=/dev/urandom of=$CLONE bs=$HALFRECORDSIZE count=1 \ + 2>/dev/null + ;; + "full data") + log_must dd if=/dev/urandom of=$CLONE bs=$RECORDSIZE count=1 2>/dev/null + ;; + *) + log_fail "invalid existing: $existing" + ;; + esac +} + +function create_clone +{ + typeset -r clone=$1 + typeset -r file=$2 + + case "$clone" in + "no") + ;; + "yes") + clonefile -f $file $CLONE + case "$file" in + $ORIG0) + FIRST_HALF_CHECKSUM=$FIRST_HALF_ORIG0_CHECKSUM + SECOND_HALF_CHECKSUM=$SECOND_HALF_ORIG0_CHECKSUM + ;; + $ORIG2) + FIRST_HALF_CHECKSUM=$FIRST_HALF_ORIG2_CHECKSUM + SECOND_HALF_CHECKSUM=$SECOND_HALF_ORIG2_CHECKSUM + ;; + *) + log_fail "invalid file: $file" + ;; + esac + ;; + *) + log_fail "invalid clone: $clone" + ;; + esac +} + +function overwrite_clone +{ + 
typeset -r overwrite=$1 + + case "$overwrite" in + "no") + ;; + "free") + log_must truncate_test -s 0 -f $CLONE + log_must truncate_test -s $RECORDSIZE -f $CLONE + FIRST_HALF_CHECKSUM=$ZEROS_CHECKSUM + SECOND_HALF_CHECKSUM=$ZEROS_CHECKSUM + ;; + "full") + log_must dd if=$ORIG1 of=$CLONE bs=$RECORDSIZE count=1 2>/dev/null + FIRST_HALF_CHECKSUM=$FIRST_HALF_ORIG1_CHECKSUM + SECOND_HALF_CHECKSUM=$SECOND_HALF_ORIG1_CHECKSUM + ;; + "first half") + log_must dd if=$ORIG1 of=$CLONE bs=$HALFRECORDSIZE skip=0 seek=0 \ + count=1 conv=notrunc 2>/dev/null + FIRST_HALF_CHECKSUM=$FIRST_HALF_ORIG1_CHECKSUM + ;; + "second half") + log_must dd if=$ORIG1 of=$CLONE bs=$HALFRECORDSIZE skip=1 seek=1 \ + count=1 conv=notrunc 2>/dev/null + SECOND_HALF_CHECKSUM=$SECOND_HALF_ORIG1_CHECKSUM + ;; + *) + log_fail "invalid overwrite: $overwrite" + ;; + esac +} + +function checksum_compare +{ + typeset -r compare=$1 + typeset first_half_calculated_checksum second_half_calculated_checksum + + case "$compare" in + "no") + ;; + "yes") + first_half_calculated_checksum=$(first_half_checksum $CLONE) + second_half_calculated_checksum=$(second_half_checksum $CLONE) + + if [[ $first_half_calculated_checksum != $FIRST_HALF_CHECKSUM ]] || \ + [[ $second_half_calculated_checksum != $SECOND_HALF_CHECKSUM ]]; then + return 1 + fi + ;; + *) + log_fail "invalid compare: $compare" + ;; + esac +} + +function bclone_corner_cases_test +{ + typeset cached existing + typeset first_clone first_overwrite + typeset read_after read_before + typeset second_clone second_overwrite + typeset -r srcdir=$1 + typeset -r dstdir=$2 + typeset limit=$3 + typeset -i count=0 + + if [[ $srcdir != "count" ]]; then + if [[ -n "$limit" ]]; then + typeset -r total_count=$(bclone_corner_cases_test count) + limit=$(random_int_between 1 $total_count $((limit*2)) | sort -nu | head -n $limit | xargs) + fi + bclone_corner_cases_init $srcdir $dstdir + fi + + # + # (create) / (cache) / (clone) / (overwrite) / (read) / (clone) / (overwrite) / 
(read) / read next txg + # + for existing in "no" "small empty" "full empty" "small data" "full data"; do + for cached in "uncached" "cached"; do + for first_clone in "no" "yes"; do + for first_overwrite in "no" "free" "full" "first half" "second half"; do + for read_before in "no" "yes"; do + for second_clone in "no" "yes"; do + for second_overwrite in "no" "free" "full" "first half" "second half"; do + for read_after in "no" "yes"; do + if [[ $first_clone = "no" ]] && \ + [[ $second_clone = "no" ]]; then + continue + fi + if [[ $first_clone = "no" ]] && \ + [[ $read_before = "yes" ]]; then + continue + fi + if [[ $second_clone = "no" ]] && \ + [[ $read_before = "yes" ]] && \ + [[ $read_after = "yes" ]]; then + continue + fi + + count=$((count+1)) + + if [[ $srcdir = "count" ]]; then + # Just counting. + continue + fi + + if [[ -n "$limit" ]]; then + if ! echo " $limit " | grep -q " $count "; then + continue + fi + fi + + FIRST_HALF_CHECKSUM="" + SECOND_HALF_CHECKSUM="" + + log_must zpool export $TESTPOOL + log_must zpool import $TESTPOOL + + create_existing "$existing" + + log_must zpool export $TESTPOOL + log_must zpool import $TESTPOOL + + cache_clone "$cached" + + create_clone "$first_clone" "$ORIG0" + + overwrite_clone "$first_overwrite" + + if checksum_compare $read_before; then + log_note "existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before" + else + log_fail "FAIL: existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before" + fi + + create_clone "$second_clone" "$ORIG2" + + overwrite_clone "$second_overwrite" + + if checksum_compare $read_after; then + log_note "existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / read_after: $read_after" + else + log_fail "FAIL: existing: $existing / cached: $cached / 
first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / read_after: $read_after" + fi + + log_must zpool export $TESTPOOL + log_must zpool import $TESTPOOL + + if checksum_compare "yes"; then + log_note "existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / read_after: $read_after / read_next_txg" + else + log_fail "FAIL: existing: $existing / cached: $cached / first_clone: $first_clone / first_overwrite: $first_overwrite / read_before: $read_before / second_clone: $second_clone / read_after: $read_after / read_next_txg" + fi + + rm -f "$CLONE" + done + done + done + done + done + done + done + done + + if [[ $srcdir = "count" ]]; then + echo $count + fi +} diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_crossfs_corner_cases.ksh b/tests/zfs-tests/tests/functional/bclone/bclone_crossfs_corner_cases.ksh new file mode 100755 index 00000000000..35188cddb06 --- /dev/null +++ b/tests/zfs-tests/tests/functional/bclone/bclone_crossfs_corner_cases.ksh @@ -0,0 +1,45 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023 by Pawel Jakub Dawidek +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/bclone/bclone_corner_cases.kshlib + +verify_runnable "both" + +verify_block_cloning +verify_crossfs_block_cloning + +log_assert "Verify various corner cases in block cloning across datasets" + +# Disable compression to make sure we won't use embedded blocks. +log_must zfs set compress=off $TESTSRCFS +log_must zfs set recordsize=$RECORDSIZE $TESTSRCFS +log_must zfs set compress=off $TESTDSTFS +log_must zfs set recordsize=$RECORDSIZE $TESTDSTFS + +bclone_corner_cases_test $TESTSRCDIR $TESTDSTDIR + +log_pass diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_crossfs_corner_cases_limited.ksh b/tests/zfs-tests/tests/functional/bclone/bclone_crossfs_corner_cases_limited.ksh new file mode 100755 index 00000000000..1fc1bbd07fd --- /dev/null +++ b/tests/zfs-tests/tests/functional/bclone/bclone_crossfs_corner_cases_limited.ksh @@ -0,0 +1,45 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023 by Pawel Jakub Dawidek +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/bclone/bclone_corner_cases.kshlib + +verify_runnable "both" + +verify_block_cloning +verify_crossfs_block_cloning + +log_assert "Verify various corner cases in block cloning across datasets" + +# Disable compression to make sure we won't use embedded blocks. +log_must zfs set compress=off $TESTSRCFS +log_must zfs set recordsize=$RECORDSIZE $TESTSRCFS +log_must zfs set compress=off $TESTDSTFS +log_must zfs set recordsize=$RECORDSIZE $TESTDSTFS + +bclone_corner_cases_test $TESTSRCDIR $TESTDSTDIR 100 + +log_pass diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_crossfs_data.ksh b/tests/zfs-tests/tests/functional/bclone/bclone_crossfs_data.ksh new file mode 100755 index 00000000000..e2fe25d451d --- /dev/null +++ b/tests/zfs-tests/tests/functional/bclone/bclone_crossfs_data.ksh @@ -0,0 +1,46 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023 by Pawel Jakub Dawidek +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/bclone/bclone_common.kshlib + +verify_runnable "both" + +verify_block_cloning +verify_crossfs_block_cloning + +log_assert "Verify block cloning properly clones regular files across datasets" + +# Disable compression to make sure we won't use embedded blocks. +log_must zfs set compress=off $TESTSRCFS +log_must zfs set compress=off $TESTDSTFS + +for filesize in 1 107 113 511 512 513 4095 4096 4097 131071 131072 131073 \ + 1048575 1048576 1048577 4194303 4194304 4194305; do + bclone_test random $filesize false $TESTSRCDIR $TESTDSTDIR +done + +log_pass diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_crossfs_embedded.ksh b/tests/zfs-tests/tests/functional/bclone/bclone_crossfs_embedded.ksh new file mode 100755 index 00000000000..6a6fe1d309a --- /dev/null +++ b/tests/zfs-tests/tests/functional/bclone/bclone_crossfs_embedded.ksh @@ -0,0 +1,50 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023 by Pawel Jakub Dawidek +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/bclone/bclone_common.kshlib + +verify_runnable "both" + +verify_block_cloning +verify_crossfs_block_cloning + +log_assert "Verify block cloning properly clones small files (with embedded blocks) across datasets" + +# Enable ZLE compression to make sure what is the maximum amount of data we +# can store in BP. +log_must zfs set compress=zle $TESTSRCFS +log_must zfs set compress=zle $TESTDSTFS + +# Test BP_IS_EMBEDDED(). +# Maximum embedded payload size is 112 bytes, but the buffer is extended to +# 512 bytes first and then compressed. 107 random bytes followed by 405 zeros +# gives exactly 112 bytes after compression with ZLE. +for filesize in 1 2 4 8 16 32 64 96 107; do + bclone_test random $filesize true $TESTSRCDIR $TESTDSTDIR +done + +log_pass diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_crossfs_hole.ksh b/tests/zfs-tests/tests/functional/bclone/bclone_crossfs_hole.ksh new file mode 100755 index 00000000000..d4c33d6da30 --- /dev/null +++ b/tests/zfs-tests/tests/functional/bclone/bclone_crossfs_hole.ksh @@ -0,0 +1,45 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023 by Pawel Jakub Dawidek +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/bclone/bclone_common.kshlib + +verify_runnable "both" + +verify_block_cloning +verify_crossfs_block_cloning + +log_assert "Verify block cloning properly clones sparse files (files with holes) across datasets" + +# Compression doesn't matter here. + +# Test BP_IS_HOLE(). +for filesize in 1 511 512 513 4095 4096 4097 131071 131072 131073 \ + 1048575 1048576 1048577 4194303 4194304 4194305; do + bclone_test hole $filesize false $TESTSRCDIR $TESTDSTDIR +done + +log_pass diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_diffprops_all.ksh b/tests/zfs-tests/tests/functional/bclone/bclone_diffprops_all.ksh new file mode 100755 index 00000000000..a5e7282fe6a --- /dev/null +++ b/tests/zfs-tests/tests/functional/bclone/bclone_diffprops_all.ksh @@ -0,0 +1,86 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023 by Pawel Jakub Dawidek +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/include/math.shlib +. $STF_SUITE/tests/functional/bclone/bclone_common.kshlib + +verify_runnable "both" + +verify_block_cloning +verify_crossfs_block_cloning + +log_assert "Verify block cloning across datasets with different properties" + +log_must zfs set checksum=off $TESTSRCFS +log_must zfs set compress=off $TESTSRCFS +log_must zfs set copies=1 $TESTSRCFS +log_must zfs set recordsize=131072 $TESTSRCFS +log_must zfs set checksum=fletcher2 $TESTDSTFS +log_must zfs set compress=lz4 $TESTDSTFS +log_must zfs set copies=3 $TESTDSTFS +log_must zfs set recordsize=8192 $TESTDSTFS + +FILESIZE=$(random_int_between 2 32767) +FILESIZE=$((FILESIZE * 64)) +bclone_test text $FILESIZE false $TESTSRCDIR $TESTDSTDIR + +log_must zfs set checksum=sha256 $TESTSRCFS +log_must zfs set compress=zstd $TESTSRCFS +log_must zfs set copies=2 $TESTSRCFS +log_must zfs set recordsize=262144 $TESTSRCFS +log_must zfs set checksum=off $TESTDSTFS +log_must zfs set compress=off $TESTDSTFS +log_must zfs set copies=1 $TESTDSTFS +log_must zfs set recordsize=131072 $TESTDSTFS + +FILESIZE=$(random_int_between 2 32767) +FILESIZE=$((FILESIZE * 64)) +bclone_test text $FILESIZE false $TESTSRCDIR $TESTDSTDIR + +log_must zfs set checksum=sha512 $TESTSRCFS +log_must zfs set compress=gzip $TESTSRCFS +log_must zfs set copies=2 $TESTSRCFS +log_must zfs set recordsize=512 $TESTSRCFS +log_must zfs set checksum=fletcher4 $TESTDSTFS +log_must zfs set compress=lzjb $TESTDSTFS +log_must zfs set copies=3 $TESTDSTFS +log_must zfs set recordsize=16384 $TESTDSTFS + +FILESIZE=$(random_int_between 2 32767) +FILESIZE=$((FILESIZE * 64)) +bclone_test text $FILESIZE false $TESTSRCDIR $TESTDSTDIR + 
+log_must zfs inherit checksum $TESTSRCFS +log_must zfs inherit compress $TESTSRCFS +log_must zfs inherit copies $TESTSRCFS +log_must zfs inherit recordsize $TESTSRCFS +log_must zfs inherit checksum $TESTDSTFS +log_must zfs inherit compress $TESTDSTFS +log_must zfs inherit copies $TESTDSTFS +log_must zfs inherit recordsize $TESTDSTFS + +log_pass diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_diffprops_checksum.ksh b/tests/zfs-tests/tests/functional/bclone/bclone_diffprops_checksum.ksh new file mode 100755 index 00000000000..7e064a0dfd7 --- /dev/null +++ b/tests/zfs-tests/tests/functional/bclone/bclone_diffprops_checksum.ksh @@ -0,0 +1,62 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023 by Pawel Jakub Dawidek +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/include/math.shlib +. $STF_SUITE/include/properties.shlib +. 
$STF_SUITE/tests/functional/bclone/bclone_common.kshlib + +verify_runnable "both" + +verify_block_cloning +verify_crossfs_block_cloning + +log_assert "Verify block cloning across datasets with different checksum properties" + +log_must zfs set compress=off $TESTSRCFS +log_must zfs set compress=off $TESTDSTFS + +for srcprop in "${checksum_prop_vals[@]}"; do + for dstprop in "${checksum_prop_vals[@]}"; do + if [[ $srcprop == $dstprop ]]; then + continue + fi + log_must zfs set checksum=$srcprop $TESTSRCFS + log_must zfs set checksum=$dstprop $TESTDSTFS + # 15*8=120, which is greater than 113, so we are sure the data won't + # be embedded into BP. + # 32767*8=262136, which is larger than a single default recordsize of + # 131072. + FILESIZE=$(random_int_between 15 32767) + FILESIZE=$((FILESIZE * 8)) + bclone_test random $FILESIZE false $TESTSRCDIR $TESTDSTDIR + done +done + +log_must zfs inherit checksum $TESTSRCFS +log_must zfs inherit checksum $TESTDSTFS + +log_pass diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_diffprops_compress.ksh b/tests/zfs-tests/tests/functional/bclone/bclone_diffprops_compress.ksh new file mode 100755 index 00000000000..e1d6e594921 --- /dev/null +++ b/tests/zfs-tests/tests/functional/bclone/bclone_diffprops_compress.ksh @@ -0,0 +1,59 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023 by Pawel Jakub Dawidek +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/include/math.shlib +. $STF_SUITE/include/properties.shlib +. $STF_SUITE/tests/functional/bclone/bclone_common.kshlib + +verify_runnable "both" + +verify_block_cloning +verify_crossfs_block_cloning + +log_assert "Verify block cloning across datasets with different compression properties" + +for srcprop in "${compress_prop_vals[@]}"; do + for dstprop in "${compress_prop_vals[@]}"; do + if [[ $srcprop == $dstprop ]]; then + continue + fi + log_must zfs set compress=$srcprop $TESTSRCFS + log_must zfs set compress=$dstprop $TESTDSTFS + # 15*8=120, which is greater than 113, so we are sure the data won't + # be embedded into BP. + # 32767*8=262136, which is larger than a single default recordsize of + # 131072. + FILESIZE=$(random_int_between 15 32767) + FILESIZE=$((FILESIZE * 8)) + bclone_test text $FILESIZE false $TESTSRCDIR $TESTDSTDIR + done +done + +log_must zfs inherit compress $TESTSRCFS +log_must zfs inherit compress $TESTDSTFS + +log_pass diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_diffprops_copies.ksh b/tests/zfs-tests/tests/functional/bclone/bclone_diffprops_copies.ksh new file mode 100755 index 00000000000..ac823e1ec39 --- /dev/null +++ b/tests/zfs-tests/tests/functional/bclone/bclone_diffprops_copies.ksh @@ -0,0 +1,59 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. 
+# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023 by Pawel Jakub Dawidek +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/include/math.shlib +. $STF_SUITE/include/properties.shlib +. $STF_SUITE/tests/functional/bclone/bclone_common.kshlib + +verify_runnable "both" + +verify_block_cloning +verify_crossfs_block_cloning + +log_assert "Verify block cloning across datasets with different copies properties" + +log_must zfs set compress=off $TESTSRCFS +log_must zfs set compress=off $TESTDSTFS + +for srcprop in "${copies_prop_vals[@]}"; do + for dstprop in "${copies_prop_vals[@]}"; do + log_must zfs set copies=$srcprop $TESTSRCFS + log_must zfs set copies=$dstprop $TESTDSTFS + # 15*8=120, which is greater than 113, so we are sure the data won't + # be embedded into BP. + # 32767*8=262136, which is larger than a single default recordsize of + # 131072. + FILESIZE=$(random_int_between 15 32767) + FILESIZE=$((FILESIZE * 8)) + bclone_test random $FILESIZE false $TESTSRCDIR $TESTDSTDIR + done +done + +log_must zfs inherit copies $TESTSRCFS +log_must zfs inherit copies $TESTDSTFS + +log_pass diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_diffprops_recordsize.ksh b/tests/zfs-tests/tests/functional/bclone/bclone_diffprops_recordsize.ksh new file mode 100755 index 00000000000..d833e612310 --- /dev/null +++ b/tests/zfs-tests/tests/functional/bclone/bclone_diffprops_recordsize.ksh @@ -0,0 +1,65 @@ +#! 
/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023 by Pawel Jakub Dawidek +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/include/math.shlib +. $STF_SUITE/include/properties.shlib +. $STF_SUITE/tests/functional/bclone/bclone_common.kshlib + +verify_runnable "both" + +verify_block_cloning +verify_crossfs_block_cloning + +log_assert "Verify block cloning across datasets with different recordsize properties" + +log_must zfs set compress=off $TESTSRCFS +log_must zfs set compress=off $TESTDSTFS + +# recsize_prop_vals[] array contains too many entries and the tests take too +# long. Let's use only a subset of them. +typeset -a bclone_recsize_prop_vals=('512' '4096' '131072' '1048576') + +for srcprop in "${bclone_recsize_prop_vals[@]}"; do + for dstprop in "${bclone_recsize_prop_vals[@]}"; do + if [[ $srcprop == $dstprop ]]; then + continue + fi + log_must zfs set recordsize=$srcprop $TESTSRCFS + log_must zfs set recordsize=$dstprop $TESTDSTFS + # 2*64=128, which is greater than 113, so we are sure the data won't + # be embedded into BP. + # 32767*64=2097088, which is larger than the largest recordsize (1MB). 
+ FILESIZE=$(random_int_between 2 32767) + FILESIZE=$((FILESIZE * 64)) + bclone_test random $FILESIZE false $TESTSRCDIR $TESTDSTDIR + done +done + +log_must zfs inherit recordsize $TESTSRCFS +log_must zfs inherit recordsize $TESTDSTFS + +log_pass diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_prop_sync.ksh b/tests/zfs-tests/tests/functional/bclone/bclone_prop_sync.ksh new file mode 100755 index 00000000000..f8aa1c875c6 --- /dev/null +++ b/tests/zfs-tests/tests/functional/bclone/bclone_prop_sync.ksh @@ -0,0 +1,66 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023 by Pawel Jakub Dawidek +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/include/math.shlib +. $STF_SUITE/include/properties.shlib +. $STF_SUITE/tests/functional/bclone/bclone_common.kshlib + +verify_runnable "both" + +verify_block_cloning +verify_crossfs_block_cloning + +log_assert "Verify block cloning with all sync property settings" + +log_must zfs set compress=zle $TESTSRCFS +log_must zfs set compress=zle $TESTDSTFS + +for prop in "${sync_prop_vals[@]}"; do + log_must zfs set sync=$prop $TESTSRCFS + # 32767*8=262136, which is larger than a single default recordsize of + # 131072. 
+ FILESIZE=$(random_int_between 1 32767) + FILESIZE=$((FILESIZE * 8)) + bclone_test random $FILESIZE false $TESTSRCDIR $TESTSRCDIR +done + +for srcprop in "${sync_prop_vals[@]}"; do + log_must zfs set sync=$srcprop $TESTSRCFS + for dstprop in "${sync_prop_vals[@]}"; do + log_must zfs set sync=$dstprop $TESTDSTFS + # 32767*8=262136, which is larger than a single default recordsize of + # 131072. + FILESIZE=$(random_int_between 1 32767) + FILESIZE=$((FILESIZE * 8)) + bclone_test random $FILESIZE false $TESTSRCDIR $TESTDSTDIR + done +done + +log_must zfs inherit sync $TESTSRCFS +log_must zfs inherit sync $TESTDSTFS + +log_pass diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_samefs_corner_cases.ksh b/tests/zfs-tests/tests/functional/bclone/bclone_samefs_corner_cases.ksh new file mode 100755 index 00000000000..4aa2914da29 --- /dev/null +++ b/tests/zfs-tests/tests/functional/bclone/bclone_samefs_corner_cases.ksh @@ -0,0 +1,42 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023 by Pawel Jakub Dawidek +# + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/bclone/bclone_corner_cases.kshlib + +verify_runnable "both" + +verify_block_cloning + +log_assert "Verify various corner cases in block cloning within the same dataset" + +# Disable compression to make sure we won't use embedded blocks. +log_must zfs set compress=off $TESTSRCFS +log_must zfs set recordsize=$RECORDSIZE $TESTSRCFS + +bclone_corner_cases_test $TESTSRCDIR $TESTSRCDIR + +log_pass diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_samefs_corner_cases_limited.ksh b/tests/zfs-tests/tests/functional/bclone/bclone_samefs_corner_cases_limited.ksh new file mode 100755 index 00000000000..b4737700eb7 --- /dev/null +++ b/tests/zfs-tests/tests/functional/bclone/bclone_samefs_corner_cases_limited.ksh @@ -0,0 +1,42 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023 by Pawel Jakub Dawidek +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/bclone/bclone_corner_cases.kshlib + +verify_runnable "both" + +verify_block_cloning + +log_assert "Verify various corner cases in block cloning within the same dataset" + +# Disable compression to make sure we won't use embedded blocks. 
+log_must zfs set compress=off $TESTSRCFS +log_must zfs set recordsize=$RECORDSIZE $TESTSRCFS + +bclone_corner_cases_test $TESTSRCDIR $TESTSRCDIR 100 + +log_pass diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_samefs_data.ksh b/tests/zfs-tests/tests/functional/bclone/bclone_samefs_data.ksh new file mode 100755 index 00000000000..e964f7bbf64 --- /dev/null +++ b/tests/zfs-tests/tests/functional/bclone/bclone_samefs_data.ksh @@ -0,0 +1,44 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023 by Pawel Jakub Dawidek +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/bclone/bclone_common.kshlib + +verify_runnable "both" + +verify_block_cloning + +log_assert "Verify block cloning properly clones regular files within the same dataset" + +# Disable compression to make sure we won't use embedded blocks. 
+log_must zfs set compress=off $TESTSRCFS + +for filesize in 1 107 113 511 512 513 4095 4096 4097 131071 131072 131073 \ + 1048575 1048576 1048577 4194303 4194304 4194305; do + bclone_test random $filesize false $TESTSRCDIR $TESTSRCDIR +done + +log_pass diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_samefs_embedded.ksh b/tests/zfs-tests/tests/functional/bclone/bclone_samefs_embedded.ksh new file mode 100755 index 00000000000..df393a87801 --- /dev/null +++ b/tests/zfs-tests/tests/functional/bclone/bclone_samefs_embedded.ksh @@ -0,0 +1,48 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023 by Pawel Jakub Dawidek +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/bclone/bclone_common.kshlib + +verify_runnable "both" + +verify_block_cloning + +log_assert "Verify block cloning properly clones small files (with embedded blocks) within the same dataset" + +# Enable ZLE compression to make sure what is the maximum amount of data we +# can store in BP. +log_must zfs set compress=zle $TESTSRCFS + +# Test BP_IS_EMBEDDED(). +# Maximum embedded payload size is 112 bytes, but the buffer is extended to +# 512 bytes first and then compressed. 
107 random bytes followed by 405 zeros +# gives exactly 112 bytes after compression with ZLE. +for filesize in 1 2 4 8 16 32 64 96 107; do + bclone_test random $filesize true $TESTSRCDIR $TESTSRCDIR +done + +log_pass diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_samefs_hole.ksh b/tests/zfs-tests/tests/functional/bclone/bclone_samefs_hole.ksh new file mode 100755 index 00000000000..3c6e345e6e6 --- /dev/null +++ b/tests/zfs-tests/tests/functional/bclone/bclone_samefs_hole.ksh @@ -0,0 +1,44 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023 by Pawel Jakub Dawidek +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/bclone/bclone_common.kshlib + +verify_runnable "both" + +verify_block_cloning + +log_assert "Verify block cloning properly clones sparse files (files with holes) within the same dataset" + +# Compression doesn't matter here. + +# Test BP_IS_HOLE(). 
+for filesize in 1 511 512 513 4095 4096 4097 131071 131072 131073 \ + 1048575 1048576 1048577 4194303 4194304 4194305; do + bclone_test hole $filesize false $TESTSRCDIR $TESTSRCDIR +done + +log_pass diff --git a/tests/zfs-tests/tests/functional/bclone/cleanup.ksh b/tests/zfs-tests/tests/functional/bclone/cleanup.ksh new file mode 100755 index 00000000000..df6d9c08fec --- /dev/null +++ b/tests/zfs-tests/tests/functional/bclone/cleanup.ksh @@ -0,0 +1,37 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2013 by Delphix. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/bclone/bclone.cfg + +log_must zfs destroy $TESTSRCFS +log_must zfs destroy $TESTDSTFS +default_cleanup diff --git a/tests/zfs-tests/tests/functional/bclone/setup.ksh b/tests/zfs-tests/tests/functional/bclone/setup.ksh new file mode 100755 index 00000000000..c68719ee72a --- /dev/null +++ b/tests/zfs-tests/tests/functional/bclone/setup.ksh @@ -0,0 +1,45 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright 2007 Sun Microsystems, Inc. All rights reserved. +# Use is subject to license terms. +# + +# +# Copyright (c) 2023 by Pawel Jakub Dawidek +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/bclone/bclone.cfg + +if ! 
command -v clonefile > /dev/null ; then + log_unsupported "clonefile program required to test block cloning" +fi + +DISK=${DISKS%% *} + +default_setup_noexit $DISK "true" +log_must zpool set feature@block_cloning=enabled $TESTPOOL +log_must zfs create $TESTSRCFS +log_must zfs create $TESTDSTFS +log_pass diff --git a/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib b/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib index 30818050a07..297c6a073bb 100644 --- a/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib +++ b/tests/zfs-tests/tests/functional/redundancy/redundancy.kshlib @@ -44,28 +44,6 @@ function cleanup done } -# -# Get random number between min and max number. -# -# $1 Minimal value -# $2 Maximal value -# -function random -{ - typeset -i min=$1 - typeset -i max=$2 - typeset -i value - - while true; do - ((value = RANDOM % (max + 1))) - if ((value >= min)); then - break - fi - done - - echo $value -} - # # Get the number of checksum errors for the pool. 
# From 83c0ccc7cf5494090621ab7038386b8a4750e560 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 12 Jan 2024 11:57:13 -0800 Subject: [PATCH 38/91] Enable block_cloning tests on FreeBSD Reviewed-by: Brian Behlendorf Signed-off-by: Pawel Jakub Dawidek Closes #15749 --- tests/runfiles/common.run | 11 +++++++++++ tests/runfiles/linux.run | 14 +++----------- .../block_cloning/block_cloning_copyfilerange.ksh | 2 +- .../block_cloning_copyfilerange_cross_dataset.ksh | 2 +- .../block_cloning_copyfilerange_fallback.ksh | 2 +- ...ock_cloning_copyfilerange_fallback_same_txg.ksh | 2 +- .../block_cloning_copyfilerange_partial.ksh | 2 +- .../block_cloning_cross_enc_dataset.ksh | 2 +- .../block_cloning_disabled_copyfilerange.ksh | 2 +- .../block_cloning_lwb_buffer_overflow.ksh | 7 ++++--- .../block_cloning/block_cloning_replay.ksh | 9 +++++---- .../block_cloning_replay_encrypted.ksh | 9 +++++---- 12 files changed, 35 insertions(+), 29 deletions(-) diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index f94a5fba9e2..13d83128337 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -71,6 +71,17 @@ tests = ['bclone_crossfs_corner_cases_limited', tags = ['functional', 'bclone'] timeout = 7200 +[tests/functional/block_cloning] +tests = ['block_cloning_copyfilerange', 'block_cloning_copyfilerange_partial', + 'block_cloning_copyfilerange_fallback', + 'block_cloning_disabled_copyfilerange', + 'block_cloning_copyfilerange_cross_dataset', + 'block_cloning_cross_enc_dataset', + 'block_cloning_copyfilerange_fallback_same_txg', + 'block_cloning_replay', 'block_cloning_replay_encrypted', + 'block_cloning_lwb_buffer_overflow'] +tags = ['functional', 'block_cloning'] + [tests/functional/bootfs] tests = ['bootfs_001_pos', 'bootfs_002_neg', 'bootfs_003_pos', 'bootfs_004_neg', 'bootfs_005_neg', 'bootfs_006_pos', 'bootfs_007_pos', diff --git a/tests/runfiles/linux.run b/tests/runfiles/linux.run index c7c17f27176..6a4cd3fe691 100644 --- 
a/tests/runfiles/linux.run +++ b/tests/runfiles/linux.run @@ -35,17 +35,9 @@ tests = ['atime_003_pos', 'root_relatime_on'] tags = ['functional', 'atime'] [tests/functional/block_cloning:Linux] -tests = ['block_cloning_copyfilerange', 'block_cloning_copyfilerange_partial', - 'block_cloning_copyfilerange_fallback', - 'block_cloning_ficlone', 'block_cloning_ficlonerange', - 'block_cloning_ficlonerange_partial', - 'block_cloning_disabled_copyfilerange', 'block_cloning_disabled_ficlone', - 'block_cloning_disabled_ficlonerange', - 'block_cloning_copyfilerange_cross_dataset', - 'block_cloning_cross_enc_dataset', - 'block_cloning_copyfilerange_fallback_same_txg', - 'block_cloning_replay', 'block_cloning_replay_encrypted', - 'block_cloning_lwb_buffer_overflow'] +tests = ['block_cloning_ficlone', 'block_cloning_ficlonerange', + 'block_cloning_ficlonerange_partial', 'block_cloning_disabled_ficlone', + 'block_cloning_disabled_ficlonerange'] tags = ['functional', 'block_cloning'] [tests/functional/chattr:Linux] diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange.ksh index 43ea47b0ef1..0599739abee 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange.ksh @@ -29,7 +29,7 @@ verify_runnable "global" -if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then +if is_linux && [[ $(linux_version) -lt $(linux_version "4.5") ]]; then log_unsupported "copy_file_range not available before Linux 4.5" fi diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh index 74e6b04903a..43323c207a6 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh +++ 
b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh @@ -29,7 +29,7 @@ verify_runnable "global" -if [[ $(linux_version) -lt $(linux_version "5.3") ]]; then +if is_linux && [[ $(linux_version) -lt $(linux_version "5.3") ]]; then log_unsupported "copy_file_range can't copy cross-filesystem before Linux 5.3" fi diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback.ksh index 9a96eacd60a..475910be747 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback.ksh @@ -30,7 +30,7 @@ verify_runnable "global" -if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then +if is_linux && [[ $(linux_version) -lt $(linux_version "4.5") ]]; then log_unsupported "copy_file_range not available before Linux 4.5" fi diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh index e52b34ec8a5..00982f68db8 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh @@ -30,7 +30,7 @@ verify_runnable "global" -if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then +if is_linux && [[ $(linux_version) -lt $(linux_version "4.5") ]]; then log_unsupported "copy_file_range not available before Linux 4.5" fi diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_partial.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_partial.ksh index a5da0a0bd35..38c46e4741c 100755 --- 
a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_partial.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_partial.ksh @@ -29,7 +29,7 @@ verify_runnable "global" -if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then +if is_linux && [[ $(linux_version) -lt $(linux_version "4.5") ]]; then log_unsupported "copy_file_range not available before Linux 4.5" fi diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_cross_enc_dataset.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_cross_enc_dataset.ksh index fe8f0867b90..34d3d269255 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_cross_enc_dataset.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_cross_enc_dataset.ksh @@ -29,7 +29,7 @@ verify_runnable "global" -if [[ $(linux_version) -lt $(linux_version "5.3") ]]; then +if is_linux && [[ $(linux_version) -lt $(linux_version "5.3") ]]; then log_unsupported "copy_file_range can't copy cross-filesystem before Linux 5.3" fi diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_copyfilerange.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_copyfilerange.ksh index d21b6251134..3d916ab9216 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_copyfilerange.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_disabled_copyfilerange.ksh @@ -29,7 +29,7 @@ verify_runnable "global" -if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then +if is_linux && [[ $(linux_version) -lt $(linux_version "4.5") ]]; then log_unsupported "copy_file_range not available before Linux 4.5" fi diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_lwb_buffer_overflow.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_lwb_buffer_overflow.ksh index 0ae76b7e54a..919f320dea3 100755 --- 
a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_lwb_buffer_overflow.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_lwb_buffer_overflow.ksh @@ -45,7 +45,7 @@ verify_runnable "global" -if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then +if is_linux && [[ $(linux_version) -lt $(linux_version "4.5") ]]; then log_unsupported "copy_file_range not available before Linux 4.5" fi @@ -77,13 +77,14 @@ log_must zfs create -o recordsize=32K $TESTPOOL/$TESTFS log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/file1 bs=32K count=1022 \ conv=fsync sync_pool $TESTPOOL -log_must clonefile -c /$TESTPOOL/$TESTFS/file1 /$TESTPOOL/$TESTFS/file2 +log_must clonefile -f /$TESTPOOL/$TESTFS/file1 /$TESTPOOL/$TESTFS/file2 log_must sync sync_pool $TESTPOOL log_must have_same_content /$TESTPOOL/$TESTFS/file1 /$TESTPOOL/$TESTFS/file2 typeset blocks=$(get_same_blocks $TESTPOOL/$TESTFS file1 $TESTPOOL/$TESTFS file2) -log_must [ "$blocks" = "$(seq -s " " 0 1021)" ] +# FreeBSD's seq(1) leaves a trailing space, remove it with sed(1). +log_must [ "$blocks" = "$(seq -s " " 0 1021 | sed 's/ $//')" ] log_pass "LWB buffer overflow is not triggered with multiple VDEVs ZIL" diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay.ksh index 1fdf379ed2d..53015200468 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay.ksh @@ -42,7 +42,7 @@ verify_runnable "global" -if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then +if is_linux && [[ $(linux_version) -lt $(linux_version "4.5") ]]; then log_unsupported "copy_file_range not available before Linux 4.5" fi @@ -90,8 +90,8 @@ log_must zpool freeze $TESTPOOL # # 4. 
TX_CLONE_RANGE: Clone the file # -log_must clonefile -c /$TESTPOOL/$TESTFS/file1 /$TESTPOOL/$TESTFS/clone1 -log_must clonefile -c /$TESTPOOL/$TESTFS/file2 /$TESTPOOL/$TESTFS/clone2 +log_must clonefile -f /$TESTPOOL/$TESTFS/file1 /$TESTPOOL/$TESTFS/clone1 +log_must clonefile -f /$TESTPOOL/$TESTFS/file2 /$TESTPOOL/$TESTFS/clone2 # # 5. Unmount filesystem and export the pool @@ -126,6 +126,7 @@ log_must [ "$blocks" = "0 1 2 3" ] typeset blocks=$(get_same_blocks $TESTPOOL/$TESTFS file2 \ $TESTPOOL/$TESTFS clone2) -log_must [ "$blocks" = "$(seq -s " " 0 2047)" ] +# FreeBSD's seq(1) leaves a trailing space, remove it with sed(1). +log_must [ "$blocks" = "$(seq -s " " 0 2047 | sed 's/ $//')" ] log_pass $claim diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay_encrypted.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay_encrypted.ksh index f9f687c83e5..0967415b7b7 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay_encrypted.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_replay_encrypted.ksh @@ -42,7 +42,7 @@ verify_runnable "global" -if [[ $(linux_version) -lt $(linux_version "4.5") ]]; then +if is_linux && [[ $(linux_version) -lt $(linux_version "4.5") ]]; then log_unsupported "copy_file_range not available before Linux 4.5" fi @@ -92,8 +92,8 @@ log_must zpool freeze $TESTPOOL # # 4. TX_CLONE_RANGE: Clone the file # -log_must clonefile -c /$TESTPOOL/$TESTFS/file1 /$TESTPOOL/$TESTFS/clone1 -log_must clonefile -c /$TESTPOOL/$TESTFS/file2 /$TESTPOOL/$TESTFS/clone2 +log_must clonefile -f /$TESTPOOL/$TESTFS/file1 /$TESTPOOL/$TESTFS/clone1 +log_must clonefile -f /$TESTPOOL/$TESTFS/file2 /$TESTPOOL/$TESTFS/clone2 # # 5. 
Unmount filesystem and export the pool @@ -128,6 +128,7 @@ log_must [ "$blocks" = "0 1 2 3" ] typeset blocks=$(get_same_blocks $TESTPOOL/$TESTFS file2 \ $TESTPOOL/$TESTFS clone2 $PASSPHRASE) -log_must [ "$blocks" = "$(seq -s " " 0 2047)" ] +# FreeBSD's seq(1) leaves a trailing space, remove it with sed(1). +log_must [ "$blocks" = "$(seq -s " " 0 2047 | sed 's/ $//')" ] log_pass $claim From d2f7b2e55767f8b84bcca79cf508f89c0471a92a Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Wed, 17 Jan 2024 02:15:10 +0500 Subject: [PATCH 39/91] ZTS: Test for clone, mmap and write for block cloning For block cloning, if we mmap the cloned file and write from the map into the file, it triggers a panic in dbuf_redirty() on Linux. The same scenario causes data corruption on FreeBSD. Both these issues are fixed under PR#15656 and PR#15665. It would be good to add a test for this scenario in ZTS. The test program and issue was produced by @robn. Reviewed-by: Pawel Jakub Dawidek Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Reviewed-by: Ameer Hamza Signed-off-by: Umer Saleem Closes #15717 --- tests/runfiles/common.run | 2 +- tests/test-runner/bin/zts-report.py.in | 2 + tests/zfs-tests/cmd/.gitignore | 1 + tests/zfs-tests/cmd/Makefile.am | 1 + tests/zfs-tests/cmd/clone_mmap_write.c | 123 ++++++++++++++++++ tests/zfs-tests/include/commands.cfg | 1 + tests/zfs-tests/tests/Makefile.am | 1 + .../block_cloning_clone_mmap_write.ksh | 79 +++++++++++ 8 files changed, 209 insertions(+), 1 deletion(-) create mode 100644 tests/zfs-tests/cmd/clone_mmap_write.c create mode 100755 tests/zfs-tests/tests/functional/block_cloning/block_cloning_clone_mmap_write.ksh diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 13d83128337..f320c54239d 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -79,7 +79,7 @@ tests = ['block_cloning_copyfilerange', 'block_cloning_copyfilerange_partial', 'block_cloning_cross_enc_dataset', 
'block_cloning_copyfilerange_fallback_same_txg', 'block_cloning_replay', 'block_cloning_replay_encrypted', - 'block_cloning_lwb_buffer_overflow'] + 'block_cloning_lwb_buffer_overflow', 'block_cloning_clone_mmap_write'] tags = ['functional', 'block_cloning'] [tests/functional/bootfs] diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index 7bf4d05d542..c84f75cd806 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -287,6 +287,8 @@ elif sys.platform.startswith('linux'): 'bclone/bclone_samefs_data': ['SKIP', cfr_reason], 'bclone/bclone_samefs_embedded': ['SKIP', cfr_reason], 'bclone/bclone_samefs_hole': ['SKIP', cfr_reason], + 'block_cloning/block_cloning_clone_mmap_write': + ['SKIP', cfr_reason], 'block_cloning/block_cloning_copyfilerange': ['SKIP', cfr_reason], 'block_cloning/block_cloning_copyfilerange_cross_dataset': diff --git a/tests/zfs-tests/cmd/.gitignore b/tests/zfs-tests/cmd/.gitignore index 5f53b687191..a696fd38711 100644 --- a/tests/zfs-tests/cmd/.gitignore +++ b/tests/zfs-tests/cmd/.gitignore @@ -2,6 +2,7 @@ /btree_test /chg_usr_exec /clonefile +/clone_mmap_write /devname2devid /dir_rd_update /draid diff --git a/tests/zfs-tests/cmd/Makefile.am b/tests/zfs-tests/cmd/Makefile.am index 1b915ae98ca..379dc5e236c 100644 --- a/tests/zfs-tests/cmd/Makefile.am +++ b/tests/zfs-tests/cmd/Makefile.am @@ -3,6 +3,7 @@ scripts_zfs_tests_bindir = $(datadir)/$(PACKAGE)/zfs-tests/bin scripts_zfs_tests_bin_PROGRAMS = %D%/chg_usr_exec scripts_zfs_tests_bin_PROGRAMS += %D%/clonefile +scripts_zfs_tests_bin_PROGRAMS += %D%/clone_mmap_write scripts_zfs_tests_bin_PROGRAMS += %D%/cp_files scripts_zfs_tests_bin_PROGRAMS += %D%/ctime scripts_zfs_tests_bin_PROGRAMS += %D%/dir_rd_update diff --git a/tests/zfs-tests/cmd/clone_mmap_write.c b/tests/zfs-tests/cmd/clone_mmap_write.c new file mode 100644 index 00000000000..6a5cd8721c5 --- /dev/null +++ b/tests/zfs-tests/cmd/clone_mmap_write.c @@ -0,0 
+1,123 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * This program clones the file, mmaps it, and writes from the map into + * the file. This scenario triggers a panic on Linux in dbuf_redirty(), + * which is fixed under PR#15656. On FreeBSD, the same test causes data + * corruption, which is fixed by PR#15665. + * + * It would be good to test for this scenario in ZTS. This program and + * issue was initially produced by @robn.
+ */ +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __FreeBSD__ +#define loff_t off_t +#endif + +ssize_t +copy_file_range(int, loff_t *, int, loff_t *, size_t, unsigned int) + __attribute__((weak)); + +static int +open_file(const char *source) +{ + int fd; + if ((fd = open(source, O_RDWR | O_APPEND)) < 0) { + (void) fprintf(stderr, "Error opening %s\n", source); + exit(1); + } + sync(); + return (fd); +} + +static int +clone_file(int sfd, long long size, const char *dest) +{ + int dfd; + + if ((dfd = open(dest, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR)) < 0) { + (void) fprintf(stderr, "Error opening %s\n", dest); + exit(1); + } + + if (copy_file_range(sfd, 0, dfd, 0, size, 0) < 0) { + (void) fprintf(stderr, "copy_file_range failed\n"); + exit(1); + } + + return (dfd); +} + +static void * +map_file(int fd, long long size) +{ + void *p = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0); + if (p == MAP_FAILED) { + (void) fprintf(stderr, "mmap failed\n"); + exit(1); + } + + return (p); +} + +static void +map_write(void *p, int fd) +{ + if (pwrite(fd, p, 1024*128, 0) < 0) { + (void) fprintf(stderr, "write failed\n"); + exit(1); + } +} + +int +main(int argc, char **argv) +{ + int sfd, dfd; + void *p; + struct stat sb; + if (argc != 3) { + (void) printf("usage: %s " + "\n", argv[0]); + exit(1); + } + sfd = open_file(argv[1]); + if (fstat(sfd, &sb) == -1) { + (void) fprintf(stderr, "fstat failed\n"); + exit(1); + } + dfd = clone_file(sfd, sb.st_size, argv[2]); + p = map_file(dfd, sb.st_size); + map_write(p, dfd); + return (0); +} diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg index c6f74cd81a1..797078ed3ab 100644 --- a/tests/zfs-tests/include/commands.cfg +++ b/tests/zfs-tests/include/commands.cfg @@ -185,6 +185,7 @@ export ZFSTEST_FILES='badsend btree_test chg_usr_exec clonefile + clone_mmap_write devname2devid dir_rd_update draid diff --git 
a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 33e97d22b6c..aeff66627a7 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -461,6 +461,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/bclone/setup.ksh \ functional/block_cloning/cleanup.ksh \ functional/block_cloning/setup.ksh \ + functional/block_cloning/block_cloning_clone_mmap_write.ksh \ functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh \ functional/block_cloning/block_cloning_copyfilerange_fallback.ksh \ functional/block_cloning/block_cloning_copyfilerange_fallback_same_txg.ksh \ diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_clone_mmap_write.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_clone_mmap_write.ksh new file mode 100755 index 00000000000..6215b3178e7 --- /dev/null +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_clone_mmap_write.ksh @@ -0,0 +1,79 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +. $STF_SUITE/include/libtest.shlib +. 
$STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +# +# DESCRIPTION: +# A PANIC is triggered in dbuf_redirty() if we clone a file, mmap it +# and write from the map into the file. PR#15656 fixes this scenario. +# This scenario also causes data corruption on FreeBSD, which is fixed +# by PR#15665. +# +# STRATEGY: +# 1. Create a pool +# 2. Create a test file +# 3. Clone, mmap and write to the file using clone_mmap_write +# 4. Synchronize cached writes +# 5. Verify data is correctly written to the disk +# + +verify_runnable "global" + +if is_linux && [[ $(linux_version) -lt $(linux_version "4.5") ]]; then + log_unsupported "copy_file_range not available before Linux 4.5" +fi + +VDIR=$TEST_BASE_DIR/disk-bclone +VDEV="$VDIR/a" + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL + rm -rf $VDIR +} + +log_onexit cleanup + +log_assert "Test for clone, mmap and write scenario" + +log_must rm -rf $VDIR +log_must mkdir -p $VDIR +log_must truncate -s 1G $VDEV + +log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $VDEV +log_must zfs create $TESTPOOL/$TESTFS + +log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/file bs=1M count=512 +log_must clone_mmap_write /$TESTPOOL/$TESTFS/file /$TESTPOOL/$TESTFS/clone + +sync_pool $TESTPOOL +log_must sync + +log_must have_same_content /$TESTPOOL/$TESTFS/file /$TESTPOOL/$TESTFS/clone +blocks=$(get_same_blocks $TESTPOOL/$TESTFS file $TESTPOOL/$TESTFS clone) +# FreeBSD's seq(1) leaves a trailing space, remove it with sed(1). +log_must [ "$blocks" = "$(seq -s " " 1 4095 | sed 's/ $//')" ] + +log_pass "Clone, mmap and write does not cause data corruption or " \ + "trigger panic" From ef527958c6a1fc07177636465194625ef8e64083 Mon Sep 17 00:00:00 2001 From: Pawel Jakub Dawidek Date: Wed, 17 Jan 2024 08:51:07 -0800 Subject: [PATCH 40/91] Fix cloning into mmaped and cached file.
If the destination file is mmaped and the mmaped region was already read, so it is cached, we need to update mmaped pages after successful clone using update_pages(). Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Pointed out by: Ka Ho Ng Signed-off-by: Pawel Jakub Dawidek Closes #15772 --- module/zfs/zfs_vnops.c | 4 + tests/runfiles/common.run | 4 +- tests/test-runner/bin/zts-report.py.in | 1 + tests/zfs-tests/cmd/.gitignore | 1 + tests/zfs-tests/cmd/Makefile.am | 1 + tests/zfs-tests/cmd/clone_mmap_cached.c | 146 ++++++++++++++++++ tests/zfs-tests/include/commands.cfg | 1 + tests/zfs-tests/tests/Makefile.am | 1 + .../block_cloning_clone_mmap_cached.ksh | 86 +++++++++++ .../tests/functional/block_cloning/setup.ksh | 3 + 10 files changed, 247 insertions(+), 1 deletion(-) create mode 100644 tests/zfs-tests/cmd/clone_mmap_cached.c create mode 100755 tests/zfs-tests/tests/functional/block_cloning/block_cloning_clone_mmap_cached.ksh diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index 812e42f645e..aa61575a6a1 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -1355,6 +1355,10 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, break; } + if (zn_has_cached_data(outzp, outoff, outoff + size - 1)) { + update_pages(outzp, outoff, size, outos); + } + zfs_clear_setid_bits_if_necessary(outzfsvfs, outzp, cr, &clear_setid_bits_txg, tx); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index f320c54239d..33f30b00550 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -72,7 +72,9 @@ tags = ['functional', 'bclone'] timeout = 7200 [tests/functional/block_cloning] -tests = ['block_cloning_copyfilerange', 'block_cloning_copyfilerange_partial', +tests = ['block_cloning_clone_mmap_cached', + 'block_cloning_copyfilerange', + 'block_cloning_copyfilerange_partial', 'block_cloning_copyfilerange_fallback', 'block_cloning_disabled_copyfilerange', 'block_cloning_copyfilerange_cross_dataset', diff 
--git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index c84f75cd806..ae4aa627546 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -287,6 +287,7 @@ elif sys.platform.startswith('linux'): 'bclone/bclone_samefs_data': ['SKIP', cfr_reason], 'bclone/bclone_samefs_embedded': ['SKIP', cfr_reason], 'bclone/bclone_samefs_hole': ['SKIP', cfr_reason], + 'block_cloning/block_cloning_clone_mmap_cached': ['SKIP', cfr_reason], 'block_cloning/block_cloning_clone_mmap_write': ['SKIP', cfr_reason], 'block_cloning/block_cloning_copyfilerange': diff --git a/tests/zfs-tests/cmd/.gitignore b/tests/zfs-tests/cmd/.gitignore index a696fd38711..0ed0a69eb01 100644 --- a/tests/zfs-tests/cmd/.gitignore +++ b/tests/zfs-tests/cmd/.gitignore @@ -2,6 +2,7 @@ /btree_test /chg_usr_exec /clonefile +/clone_mmap_cached /clone_mmap_write /devname2devid /dir_rd_update diff --git a/tests/zfs-tests/cmd/Makefile.am b/tests/zfs-tests/cmd/Makefile.am index 379dc5e236c..23848a82ffb 100644 --- a/tests/zfs-tests/cmd/Makefile.am +++ b/tests/zfs-tests/cmd/Makefile.am @@ -3,6 +3,7 @@ scripts_zfs_tests_bindir = $(datadir)/$(PACKAGE)/zfs-tests/bin scripts_zfs_tests_bin_PROGRAMS = %D%/chg_usr_exec scripts_zfs_tests_bin_PROGRAMS += %D%/clonefile +scripts_zfs_tests_bin_PROGRAMS += %D%/clone_mmap_cached scripts_zfs_tests_bin_PROGRAMS += %D%/clone_mmap_write scripts_zfs_tests_bin_PROGRAMS += %D%/cp_files scripts_zfs_tests_bin_PROGRAMS += %D%/ctime diff --git a/tests/zfs-tests/cmd/clone_mmap_cached.c b/tests/zfs-tests/cmd/clone_mmap_cached.c new file mode 100644 index 00000000000..c1cdf796cfb --- /dev/null +++ b/tests/zfs-tests/cmd/clone_mmap_cached.c @@ -0,0 +1,146 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. 
+ * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2024 by Pawel Jakub Dawidek + */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef __FreeBSD__ +#define loff_t off_t +#endif + +ssize_t +copy_file_range(int, loff_t *, int, loff_t *, size_t, unsigned int) + __attribute__((weak)); + +static void * +mmap_file(int fd, size_t size) +{ + void *p; + + p = mmap(NULL, size, PROT_READ, MAP_SHARED, fd, 0); + if (p == MAP_FAILED) { + (void) fprintf(stderr, "mmap failed: %s\n", strerror(errno)); + exit(2); + } + + return (p); +} + +static void +usage(const char *progname) +{ + + /* + * -i cache input before copy_file_range(2). + * -o cache output before copy_file_range(2).
+ */ + (void) fprintf(stderr, "usage: %s [-io] \n", progname); + exit(3); +} + +int +main(int argc, char *argv[]) +{ + int dfd, sfd; + size_t dsize, ssize; + void *dmem, *smem, *ptr; + off_t doff, soff; + struct stat sb; + bool cache_input, cache_output; + const char *progname; + int c; + + progname = argv[0]; + cache_input = cache_output = false; + + while ((c = getopt(argc, argv, "io")) != -1) { + switch (c) { + case 'i': + cache_input = true; + break; + case 'o': + cache_output = true; + break; + default: + usage(progname); + } + } + argc -= optind; + argv += optind; + + if (argc != 2) { + usage(progname); + } + + sfd = open(argv[0], O_RDONLY); + if (fstat(sfd, &sb) == -1) { + (void) fprintf(stderr, "fstat failed: %s\n", strerror(errno)); + exit(2); + } + ssize = sb.st_size; + smem = mmap_file(sfd, ssize); + + dfd = open(argv[1], O_RDWR); + if (fstat(dfd, &sb) == -1) { + (void) fprintf(stderr, "fstat failed: %s\n", strerror(errno)); + exit(2); + } + dsize = sb.st_size; + dmem = mmap_file(dfd, dsize); + + /* + * Hopefully it won't be compiled out. + */ + if (cache_input) { + ptr = malloc(ssize); + assert(ptr != NULL); + memcpy(ptr, smem, ssize); + free(ptr); + } + if (cache_output) { + ptr = malloc(ssize); + assert(ptr != NULL); + memcpy(ptr, dmem, dsize); + free(ptr); + } + + soff = doff = 0; + if (copy_file_range(sfd, &soff, dfd, &doff, ssize, 0) < 0) { + (void) fprintf(stderr, "copy_file_range failed: %s\n", + strerror(errno)); + exit(2); + } + + exit(memcmp(smem, dmem, ssize) == 0 ? 
0 : 1); +} diff --git a/tests/zfs-tests/include/commands.cfg b/tests/zfs-tests/include/commands.cfg index 797078ed3ab..daa79455168 100644 --- a/tests/zfs-tests/include/commands.cfg +++ b/tests/zfs-tests/include/commands.cfg @@ -185,6 +185,7 @@ export ZFSTEST_FILES='badsend btree_test chg_usr_exec clonefile + clone_mmap_cached clone_mmap_write devname2devid dir_rd_update diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index aeff66627a7..1c3dfc77eac 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -461,6 +461,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/bclone/setup.ksh \ functional/block_cloning/cleanup.ksh \ functional/block_cloning/setup.ksh \ + functional/block_cloning/block_cloning_clone_mmap_cached.ksh \ functional/block_cloning/block_cloning_clone_mmap_write.ksh \ functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh \ functional/block_cloning/block_cloning_copyfilerange_fallback.ksh \ diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_clone_mmap_cached.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_clone_mmap_cached.ksh new file mode 100755 index 00000000000..b0ef8ec9953 --- /dev/null +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_clone_mmap_cached.ksh @@ -0,0 +1,86 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib + +# +# DESCRIPTION: +# When the destination file is mmaped and is already cached we need to +# update mmaped pages after successful clone. +# +# STRATEGY: +# 1. Create a pool. +# 2. Create two test files with random content. +# 3. mmap the files, read them and clone from one to the other using +# clone_mmap_cached. +# 4. clone_mmap_cached also verifies if the content of the destination +# file was updated while reading it from mmaped memory. +# + +verify_runnable "global" + +if is_linux && [[ $(linux_version) -lt $(linux_version "4.5") ]]; then + log_unsupported "copy_file_range not available before Linux 4.5" +fi + +VDIR=$TEST_BASE_DIR/disk-bclone +VDEV="$VDIR/a" + +function cleanup +{ + datasetexists $TESTPOOL && destroy_pool $TESTPOOL + rm -rf $VDIR +} + +log_onexit cleanup + +log_assert "Test for clone into mmaped and cached file" + +log_must rm -rf $VDIR +log_must mkdir -p $VDIR +log_must truncate -s 1G $VDEV + +log_must zpool create -o feature@block_cloning=enabled $TESTPOOL $VDEV +log_must zfs create $TESTPOOL/$TESTFS + +for opts in "--" "-i" "-o" "-io" +do + log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/src bs=1M count=1 + log_must dd if=/dev/urandom of=/$TESTPOOL/$TESTFS/dst bs=1M count=1 + + # Clear cache. + log_must zpool export $TESTPOOL + log_must zpool import -d $VDIR $TESTPOOL + + log_must clone_mmap_cached $opts /$TESTPOOL/$TESTFS/src /$TESTPOOL/$TESTFS/dst + + sync_pool $TESTPOOL + log_must sync + + log_must have_same_content /$TESTPOOL/$TESTFS/src /$TESTPOOL/$TESTFS/dst + blocks=$(get_same_blocks $TESTPOOL/$TESTFS src $TESTPOOL/$TESTFS dst) + # FreeBSD's seq(1) leaves a trailing space, remove it with sed(1).
+ log_must [ "$blocks" = "$(seq -s " " 0 7 | sed 's/ $//')" ] +done + +log_pass "Clone properly updates mmapped and cached pages" diff --git a/tests/zfs-tests/tests/functional/block_cloning/setup.ksh b/tests/zfs-tests/tests/functional/block_cloning/setup.ksh index 58441bf8f3a..a9b13f062a4 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/setup.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/setup.ksh @@ -30,6 +30,9 @@ if ! command -v clonefile > /dev/null ; then log_unsupported "clonefile program required to test block cloning" fi +if ! command -v clone_mmap_cached > /dev/null ; then + log_unsupported "clone_mmap_cached program required to test block cloning" +fi verify_runnable "global" From c1161e28513410a3f566a0e10b48e54b11b19e59 Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Wed, 17 Jan 2024 18:06:14 +0100 Subject: [PATCH 41/91] fix: variable type with zfs-tests/cmd/clonefile.c Compiling on arm64 freebsd-13.2 and arm64 almalinux-8 brings currently this error: ``` CC tests/zfs-tests/cmd/clonefile.o tests/zfs-tests/cmd/clonefile.c:166:43: error: result of comparison of \ constant -1 with expression of type 'char' is always true \ [-Werror,-Wtautological-constant-out-of-range-compare] while ((c = getopt(argc, argv, "crfdq")) != -1) { ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ ^ ~~ 1 error generated. gmake[2]: *** [Makefile:8675: tests/zfs-tests/cmd/clonefile.o] Error 1 ``` Fix: use correct variable type `int`. 
Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Signed-off-by: Tino Reichardt Closes #15783 --- tests/zfs-tests/cmd/clonefile.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/zfs-tests/cmd/clonefile.c b/tests/zfs-tests/cmd/clonefile.c index d002cd9b587..bc30bb7798e 100644 --- a/tests/zfs-tests/cmd/clonefile.c +++ b/tests/zfs-tests/cmd/clonefile.c @@ -162,7 +162,7 @@ main(int argc, char **argv) { cf_mode_t mode = CF_MODE_NONE; - char c; + int c; while ((c = getopt(argc, argv, "crfdq")) != -1) { switch (c) { case 'c': From 9e0304c363d7bcc2330b252299edd84a6d4dabbc Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 22 Jan 2024 16:15:03 -0800 Subject: [PATCH 42/91] ZTS: Apply zfs_bclone_enabled to bclone tests If block cloning is disabled by default then enable it when running the bclone tests. Follow up to #15529. Reviewed-by: Brian Atkinson Signed-off-by: Brian Behlendorf Closes #15796 --- tests/zfs-tests/tests/functional/bclone/cleanup.ksh | 9 ++++++++- tests/zfs-tests/tests/functional/bclone/setup.ksh | 5 +++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/tests/zfs-tests/tests/functional/bclone/cleanup.ksh b/tests/zfs-tests/tests/functional/bclone/cleanup.ksh index df6d9c08fec..0021ccb57ae 100755 --- a/tests/zfs-tests/tests/functional/bclone/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/bclone/cleanup.ksh @@ -34,4 +34,11 @@ log_must zfs destroy $TESTSRCFS log_must zfs destroy $TESTDSTFS -default_cleanup + +default_cleanup_noexit + +if tunable_exists BCLONE_ENABLED ; then + log_must restore_tunable BCLONE_ENABLED +fi + +log_pass diff --git a/tests/zfs-tests/tests/functional/bclone/setup.ksh b/tests/zfs-tests/tests/functional/bclone/setup.ksh index c68719ee72a..9d26088c5a8 100755 --- a/tests/zfs-tests/tests/functional/bclone/setup.ksh +++ b/tests/zfs-tests/tests/functional/bclone/setup.ksh @@ -36,6 +36,11 @@ if ! 
command -v clonefile > /dev/null ; then log_unsupported "clonefile program required to test block cloning" fi +if tunable_exists BCLONE_ENABLED ; then + log_must save_tunable BCLONE_ENABLED + log_must set_tunable32 BCLONE_ENABLED 1 +fi + DISK=${DISKS%% *} default_setup_noexit $DISK "true" From 3425484eb907d489c315cced2a1fdea08ef03fc4 Mon Sep 17 00:00:00 2001 From: Pawel Jakub Dawidek Date: Tue, 23 Jan 2024 15:03:48 -0800 Subject: [PATCH 43/91] Fix file descriptor leak on pool import. Descriptor leak can be easily reproduced by doing: # zpool import tank # sysctl kern.openfiles # zpool export tank; zpool import tank # sysctl kern.openfiles We were leaking four file descriptors on every import. Similar leak most likely existed when using file-based VDEVs. External-issue: https://reviews.freebsd.org/D43529 Reviewed-by: Brian Behlendorf Signed-off-by: Pawel Jakub Dawidek Closes #15630 --- module/os/freebsd/zfs/zfs_file_os.c | 63 +++++++++++++++++++++++------ 1 file changed, 51 insertions(+), 12 deletions(-) diff --git a/module/os/freebsd/zfs/zfs_file_os.c b/module/os/freebsd/zfs/zfs_file_os.c index 60c9ff0581e..f7f2be2cf95 100644 --- a/module/os/freebsd/zfs/zfs_file_os.c +++ b/module/os/freebsd/zfs/zfs_file_os.c @@ -53,26 +53,65 @@ int zfs_file_open(const char *path, int flags, int mode, zfs_file_t **fpp) { struct thread *td; - int rc, fd; + struct vnode *vp; + struct file *fp; + struct nameidata nd; + int error; td = curthread; pwd_ensure_dirs(); - /* 12.x doesn't take a const char * */ - rc = kern_openat(td, AT_FDCWD, __DECONST(char *, path), - UIO_SYSSPACE, flags, mode); - if (rc) - return (SET_ERROR(rc)); - fd = td->td_retval[0]; - td->td_retval[0] = 0; - if (fget(curthread, fd, &cap_no_rights, fpp)) - kern_close(td, fd); + + KASSERT((flags & (O_EXEC | O_PATH)) == 0, + ("invalid flags: 0x%x", flags)); + KASSERT((flags & O_ACCMODE) != O_ACCMODE, + ("invalid flags: 0x%x", flags)); + flags = FFLAGS(flags); + + error = falloc_noinstall(td, &fp); + if (error != 0) { + 
return (error); + } + fp->f_flag = flags & FMASK; + +#if __FreeBSD_version >= 1400043 + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, path); +#else + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, path, td); +#endif + error = vn_open(&nd, &flags, mode, fp); + if (error != 0) { + falloc_abort(td, fp); + return (SET_ERROR(error)); + } + NDFREE_PNBUF(&nd); + vp = nd.ni_vp; + fp->f_vnode = vp; + if (fp->f_ops == &badfileops) { + finit_vnode(fp, flags, NULL, &vnops); + } + VOP_UNLOCK(vp); + if (vp->v_type != VREG) { + zfs_file_close(fp); + return (SET_ERROR(EACCES)); + } + + if (flags & O_TRUNC) { + error = fo_truncate(fp, 0, td->td_ucred, td); + if (error != 0) { + zfs_file_close(fp); + return (SET_ERROR(error)); + } + } + + *fpp = fp; + return (0); } void zfs_file_close(zfs_file_t *fp) { - fo_close(fp, curthread); + fdrop(fp, curthread); } static int @@ -263,7 +302,7 @@ zfs_file_get(int fd) void zfs_file_put(zfs_file_t *fp) { - fdrop(fp, curthread); + zfs_file_close(fp); } loff_t From 4d4972ed98a83a4b3a404f53782d5b351b8ef8cf Mon Sep 17 00:00:00 2001 From: Rich Ercolani <214141+rincebrain@users.noreply.github.com> Date: Fri, 12 Jan 2024 15:17:26 -0500 Subject: [PATCH 44/91] Stop wasting time on malloc in snprintf_zstd_header Profiling zdb -vvvvv on datasets with a lot of zstd blocks, we find ourselves spending quite a lot of time on malloc/free, because we allocate a 16M abd each call, and never free it, so we're leaking 16M per call as well. This seems sub-optimal. So let's just keep the buffer around and reuse it. 
Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Signed-off-by: Rich Ercolani Closes #15721 --- cmd/zdb/zdb.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 3fc9fd2a9d8..70c85a87ad7 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -2360,7 +2360,7 @@ static void snprintf_zstd_header(spa_t *spa, char *blkbuf, size_t buflen, const blkptr_t *bp) { - abd_t *pabd; + static abd_t *pabd = NULL; void *buf; zio_t *zio; zfs_zstdhdr_t zstd_hdr; @@ -2391,7 +2391,8 @@ snprintf_zstd_header(spa_t *spa, char *blkbuf, size_t buflen, return; } - pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE); + if (!pabd) + pabd = abd_alloc_for_io(SPA_MAXBLOCKSIZE, B_FALSE); zio = zio_root(spa, NULL, NULL, 0); /* Decrypt but don't decompress so we can read the compression header */ From 7bccf98a731d717515ba83f728be337e2b21b9bc Mon Sep 17 00:00:00 2001 From: Rich Ercolani <214141+rincebrain@users.noreply.github.com> Date: Fri, 12 Jan 2024 14:55:17 -0500 Subject: [PATCH 45/91] Make zdb -R scale less poorly zdb -R with :d tries to use gzip decompression 9 times per size. There's absolutely no reason for that, they're all the same decompressor. Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Rich Ercolani Closes #15726 --- cmd/zdb/zdb.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 70c85a87ad7..19b0d61f09c 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -8491,6 +8491,14 @@ zdb_decompress_block(abd_t *pabd, void *buf, void *lbuf, uint64_t lsize, *cfuncp++ = ZIO_COMPRESS_LZ4; *cfuncp++ = ZIO_COMPRESS_LZJB; mask |= ZIO_COMPRESS_MASK(LZ4) | ZIO_COMPRESS_MASK(LZJB); + /* + * Every gzip level has the same decompressor, no need to + * run it 9 times per bruteforce attempt. 
+ */ + mask |= ZIO_COMPRESS_MASK(GZIP_2) | ZIO_COMPRESS_MASK(GZIP_3); + mask |= ZIO_COMPRESS_MASK(GZIP_4) | ZIO_COMPRESS_MASK(GZIP_5); + mask |= ZIO_COMPRESS_MASK(GZIP_6) | ZIO_COMPRESS_MASK(GZIP_7); + mask |= ZIO_COMPRESS_MASK(GZIP_8) | ZIO_COMPRESS_MASK(GZIP_9); for (int c = 0; c < ZIO_COMPRESS_FUNCTIONS; c++) if (((1ULL << c) & mask) == 0) *cfuncp++ = c; From 22e4f08c30f97d208a6d1ae8b8943071de340431 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Tue, 9 Jan 2024 10:57:29 -0500 Subject: [PATCH 46/91] Linux: Defer loading the object set in zfs_setattr() We need to wait until after having done a zfs_enter() to load some fields from the zfsvfs structure. Otherwise a use-after-free is possible in the face of a concurrent rollback. Other functions in this file are careful to avoid this bug, I believe this is the only instance. Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Mark Johnston Closes #15752 --- module/os/linux/zfs/zfs_vnops_os.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 9ea8ad5f4a6..ecfa4b54e29 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -1853,7 +1853,7 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns) { struct inode *ip; zfsvfs_t *zfsvfs = ZTOZSB(zp); - objset_t *os = zfsvfs->z_os; + objset_t *os; zilog_t *zilog; dmu_tx_t *tx; vattr_t oldva; @@ -1885,6 +1885,7 @@ zfs_setattr(znode_t *zp, vattr_t *vap, int flags, cred_t *cr, zidmap_t *mnt_ns) if ((err = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (err); ip = ZTOI(zp); + os = zfsvfs->z_os; /* * If this is a xvattr_t, then get a pointer to the structure of From 8b1c6db3d2f35cf13c5f35374b0ec37216881ec7 Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Tue, 9 Jan 2024 18:57:09 -0500 Subject: [PATCH 47/91] Fix a potential use-after-free in zfs_setsecattr() In general, VOPs must not load the 
"z_log" field until having called zfs_enter_verify_zp(). Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Mark Johnston Closes #15752 --- module/zfs/zfs_vnops.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index aa61575a6a1..e6ae574ad06 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -801,11 +801,11 @@ zfs_setsecattr(znode_t *zp, vsecattr_t *vsecp, int flag, cred_t *cr) zfsvfs_t *zfsvfs = ZTOZSB(zp); int error; boolean_t skipaclchk = (flag & ATTR_NOACLCHECK) ? B_TRUE : B_FALSE; - zilog_t *zilog = zfsvfs->z_log; + zilog_t *zilog; if ((error = zfs_enter_verify_zp(zfsvfs, zp, FTAG)) != 0) return (error); - + zilog = zfsvfs->z_log; error = zfs_setacl(zp, vsecp, skipaclchk, cr); if (zfsvfs->z_os->os_sync == ZFS_SYNC_ALWAYS) From 4db88c37cc4ebcf0bb00ea9574cf66c9ccf2409c Mon Sep 17 00:00:00 2001 From: Stefan Lendl <1321542+stfl@users.noreply.github.com> Date: Fri, 12 Jan 2024 21:05:11 +0100 Subject: [PATCH 48/91] fix(mount): do not truncate shares not zfs mount When running zfs share -a, resetting the exports.d/zfs.exports makes sense to get a clean state. Truncating was also called with zfs mount which would not populate the file again. Add test to verify shares persist after mount -a.
Reviewed-by: Brian Behlendorf Signed-off-by: Stefan Lendl Closes #15607 Closes #15660 --- cmd/zfs/zfs_main.c | 3 +- tests/runfiles/common.run | 3 +- tests/zfs-tests/tests/Makefile.am | 1 + .../zfs_share/zfs_share_after_mount.ksh | 62 +++++++++++++++++++ 4 files changed, 67 insertions(+), 2 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_after_mount.ksh diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 5644869cf33..67b191d72e6 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -7230,7 +7230,8 @@ share_mount(int op, int argc, char **argv) pthread_mutex_init(&share_mount_state.sm_lock, NULL); /* For a 'zfs share -a' operation start with a clean slate. */ - zfs_truncate_shares(NULL); + if (op == OP_SHARE) + zfs_truncate_shares(NULL); /* * libshare isn't mt-safe, so only do the operation in parallel diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 33f30b00550..f93bfb43336 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -318,7 +318,8 @@ tags = ['functional', 'cli_root', 'zfs_set'] [tests/functional/cli_root/zfs_share] tests = ['zfs_share_001_pos', 'zfs_share_002_pos', 'zfs_share_003_pos', 'zfs_share_004_pos', 'zfs_share_006_pos', 'zfs_share_008_neg', - 'zfs_share_010_neg', 'zfs_share_011_pos', 'zfs_share_concurrent_shares'] + 'zfs_share_010_neg', 'zfs_share_011_pos', 'zfs_share_concurrent_shares', + 'zfs_share_after_mount'] tags = ['functional', 'cli_root', 'zfs_share'] [tests/functional/cli_root/zfs_snapshot] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 1c3dfc77eac..19174c71fbe 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -914,6 +914,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zfs_share/zfs_share_012_pos.ksh \ functional/cli_root/zfs_share/zfs_share_013_pos.ksh \ functional/cli_root/zfs_share/zfs_share_concurrent_shares.ksh \ + 
functional/cli_root/zfs_share/zfs_share_after_mount.ksh \ functional/cli_root/zfs_snapshot/cleanup.ksh \ functional/cli_root/zfs_snapshot/setup.ksh \ functional/cli_root/zfs_snapshot/zfs_snapshot_001_neg.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_after_mount.ksh b/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_after_mount.ksh new file mode 100755 index 00000000000..0d4b66ea854 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zfs_share/zfs_share_after_mount.ksh @@ -0,0 +1,62 @@ +#!/bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2023 by Proxmox. All rights reserved. +# + +. $STF_SUITE/include/libtest.shlib + +# DESCRIPTION: +# Verify that nfs shares persist after zfs mount -a +# +# STRATEGY: +# 1. Verify that the filesystem is not shared. +# 2. Enable the 'sharenfs' property +# 3. Verify filesystem is shared +# 4. Invoke 'zfs mount -a' +# 5. 
Verify filesystem is still shared + +verify_runnable "global" + +function cleanup +{ + log_must zfs set sharenfs=off $TESTPOOL/$TESTFS + is_shared $TESTPOOL/$TESTFS && \ + log_must unshare_fs $TESTPOOL/$TESTFS + log_must zfs share -a +} + + +log_onexit cleanup + +cleanup + +log_must zfs set sharenfs="on" $TESTPOOL/$TESTFS +log_must is_shared $TESTPOOL/$TESTFS +log_must is_exported $TESTPOOL/$TESTFS + +log_must zfs mount -a +log_must is_shared $TESTPOOL/$TESTFS +log_must is_exported $TESTPOOL/$TESTFS + +log_pass "Verify that nfs shares persist after zfs mount -a" From 509526ad2103adddc18c1b6d7b514d0c36b682ef Mon Sep 17 00:00:00 2001 From: Benjamin Sherman Date: Fri, 12 Jan 2024 14:33:41 -0600 Subject: [PATCH 49/91] fix: preserve linux kmod signature in zfs-kmod rpm spec This change provides rpm spec macros to sign the zfs and spl kmods as the final step after the %install scriptlet. This is needed since the find-debuginfo.sh script strips out debug symbols plus signatures. Kernel module signing only occurs when the required files are present as typically required in the Linux source tree: - certs/signing_key.pem - certs/signing_key.x509 The method for overriding the default __spec_install_post macro is inspired by (and largely copied from) the Fedora kernel.spec. Reviewed-by: Tony Hutter Reviewed-by: Tino Reichardt Signed-off-by: Benjamin Sherman Closes #15744 --- rpm/generic/zfs-kmod.spec.in | 24 ++++++++++++++++++++++++ rpm/redhat/zfs-kmod.spec.in | 24 ++++++++++++++++++++++++ 2 files changed, 48 insertions(+) diff --git a/rpm/generic/zfs-kmod.spec.in b/rpm/generic/zfs-kmod.spec.in index 3c73e2ff2d6..4cc075585d4 100644 --- a/rpm/generic/zfs-kmod.spec.in +++ b/rpm/generic/zfs-kmod.spec.in @@ -150,6 +150,30 @@ for kernel_version in %{?kernel_versions}; do done +# Module signing (modsign) +# +# This must be run _after_ find-debuginfo.sh runs, otherwise that will strip +# the signature off of the modules. 
+# (Based on Fedora's kernel.spec workaround) +%define __modsign_install_post \ + sign_pem="%{ksrc}/certs/signing_key.pem"; \ + sign_x509="%{ksrc}/certs/signing_key.x509"; \ + if [ -f "${sign_x509}" ]\ + then \ + echo "Signing kernel modules ..."; \ + for kmod in $(find ${RPM_BUILD_ROOT}%{kmodinstdir_prefix}/*/extra/ -name \*.ko); do \ + %{ksrc}/scripts/sign-file sha256 ${sign_pem} ${sign_x509} ${kmod}; \ + done \ + fi \ +%{nil} + +# hack to ensure signing happens after find-debuginfo.sh runs +%define __spec_install_post \ + %{?__debug_package:%{__debug_install_post}}\ + %{__arch_install_post}\ + %{__os_install_post}\ + %{__modsign_install_post} + %install rm -rf ${RPM_BUILD_ROOT} diff --git a/rpm/redhat/zfs-kmod.spec.in b/rpm/redhat/zfs-kmod.spec.in index f59551c0b43..9c836786bae 100644 --- a/rpm/redhat/zfs-kmod.spec.in +++ b/rpm/redhat/zfs-kmod.spec.in @@ -72,6 +72,30 @@ fi %{?kernel_llvm} make %{?_smp_mflags} +# Module signing (modsign) +# +# This must be run _after_ find-debuginfo.sh runs, otherwise that will strip +# the signature off of the modules. 
+# (Based on Fedora's kernel.spec workaround) +%define __modsign_install_post \ + sign_pem="%{ksrc}/certs/signing_key.pem"; \ + sign_x509="%{ksrc}/certs/signing_key.x509"; \ + if [ -f "${sign_x509}" ]\ + then \ + echo "Signing kernel modules ..."; \ + for kmod in $(find %{buildroot}/lib/modules/%{kverrel}/extra/ -name \*.ko); do \ + %{ksrc}/scripts/sign-file sha256 ${sign_pem} ${sign_x509} ${kmod}; \ + done \ + fi \ +%{nil} + +# hack to ensure signing happens after find-debuginfo.sh runs +%define __spec_install_post \ + %{?__debug_package:%{__debug_install_post}}\ + %{__arch_install_post}\ + %{__os_install_post}\ + %{__modsign_install_post} + %install make install \ DESTDIR=${RPM_BUILD_ROOT} \ From 2006ac1f4a52419d08641324ba56ecc5d0bbaf6f Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 12 Jan 2024 12:35:29 -0800 Subject: [PATCH 50/91] Fix "out of memory" error Drop the no_memory() call from zpool_in_use() when reading the label fails and instead return the error to the caller. This prevents a misleading "internal error: out of memory" error when the label can't be read. This will result in is_spare() returning B_FALSE instead of aborting, which is already safely handled. Furthermore, on Linux it's possible for EREMOTEIO to be returned by an NVMe device if the device has been low-level formatted and not rescanned. In this case we want to fall back to the legacy scanning method and read any of the labels we can.
Reviewed-by: Brian Atkinson Signed-off-by: Brian Behlendorf Issue #13538 Closes #15747 --- lib/libzfs/libzfs_import.c | 4 +--- lib/libzutil/zutil_import.c | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 5 deletions(-) diff --git a/lib/libzfs/libzfs_import.c b/lib/libzfs/libzfs_import.c index 2a7c5a76a0a..e2d40a7b3bf 100644 --- a/lib/libzfs/libzfs_import.c +++ b/lib/libzfs/libzfs_import.c @@ -291,10 +291,8 @@ zpool_in_use(libzfs_handle_t *hdl, int fd, pool_state_t *state, char **namestr, *inuse = B_FALSE; - if (zpool_read_label(fd, &config, NULL) != 0) { - (void) no_memory(hdl); + if (zpool_read_label(fd, &config, NULL) != 0) return (-1); - } if (config == NULL) return (0); diff --git a/lib/libzutil/zutil_import.c b/lib/libzutil/zutil_import.c index 19d8a474281..bafe50e5f90 100644 --- a/lib/libzutil/zutil_import.c +++ b/lib/libzutil/zutil_import.c @@ -1056,10 +1056,21 @@ zpool_read_label(int fd, nvlist_t **config, int *num_labels) case EINVAL: break; case EINPROGRESS: - // This shouldn't be possible to - // encounter, die if we do. + /* + * This shouldn't be possible to + * encounter, die if we do. + */ ASSERT(B_FALSE); zfs_fallthrough; + case EREMOTEIO: + /* + * May be returned by an NVMe device + * which is visible in /dev/ but due + * to a low-level format change, or + * other error, needs to be rescanned. + * Try the slow method. + */ + zfs_fallthrough; case EOPNOTSUPP: case ENOSYS: do_slow = B_TRUE; From 52cee9a3eb0a691ce915a6f46d23f575351d8b4d Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Thu, 4 Jan 2024 19:02:50 +0500 Subject: [PATCH 51/91] fix: Uber block label not always found for aux vdevs When spare or l2cache (aux) vdev is added during pool creation, spa->spa_uberblock is not dumped until that point. Subsequently, the aux label is never synchronized after its initial creation, resulting in the uberblock label remaining undumped. The uberblock is crucial for lib_blkid in identifying the ZFS partition type. 
To address this issue, we now ensure sync of the uberblock label once if it's not dumped initially. Reviewed-by: Umer Saleem Reviewed-by: Alexander Motin Signed-off-by: Ameer Hamza Closes #15737 --- include/sys/spa_impl.h | 1 + module/zfs/vdev_label.c | 31 +++++++++++++++++++++++++++++++ 2 files changed, 32 insertions(+) diff --git a/include/sys/spa_impl.h b/include/sys/spa_impl.h index cdf65c37133..c7ecd3d0ccd 100644 --- a/include/sys/spa_impl.h +++ b/include/sys/spa_impl.h @@ -266,6 +266,7 @@ struct spa { spa_aux_vdev_t spa_spares; /* hot spares */ spa_aux_vdev_t spa_l2cache; /* L2ARC cache devices */ + boolean_t spa_aux_sync_uber; /* need to sync aux uber */ nvlist_t *spa_label_features; /* Features for reading MOS */ uint64_t spa_config_object; /* MOS object for pool config */ uint64_t spa_config_generation; /* config generation number */ diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index a2e5524a839..21348f95a4e 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -1148,6 +1148,14 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) */ VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_ASHIFT, vd->vdev_ashift) == 0); + + /* + * When spare or l2cache (aux) vdev is added during pool + * creation, spa->spa_uberblock is not written until this + * point. Write it on next config sync. 
+ */ + if (uberblock_verify(&spa->spa_uberblock)) + spa->spa_aux_sync_uber = B_TRUE; } else { uint64_t txg = 0ULL; @@ -1749,6 +1757,16 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) for (int v = 0; v < svdcount; v++) vdev_uberblock_sync(zio, &good_writes, ub, svd[v], flags); + if (spa->spa_aux_sync_uber) { + for (int v = 0; v < spa->spa_spares.sav_count; v++) { + vdev_uberblock_sync(zio, &good_writes, ub, + spa->spa_spares.sav_vdevs[v], flags); + } + for (int v = 0; v < spa->spa_l2cache.sav_count; v++) { + vdev_uberblock_sync(zio, &good_writes, ub, + spa->spa_l2cache.sav_vdevs[v], flags); + } + } (void) zio_wait(zio); /* @@ -1763,6 +1781,19 @@ vdev_uberblock_sync_list(vdev_t **svd, int svdcount, uberblock_t *ub, int flags) zio_flush(zio, svd[v]); } } + if (spa->spa_aux_sync_uber) { + spa->spa_aux_sync_uber = B_FALSE; + for (int v = 0; v < spa->spa_spares.sav_count; v++) { + if (vdev_writeable(spa->spa_spares.sav_vdevs[v])) { + zio_flush(zio, spa->spa_spares.sav_vdevs[v]); + } + } + for (int v = 0; v < spa->spa_l2cache.sav_count; v++) { + if (vdev_writeable(spa->spa_l2cache.sav_vdevs[v])) { + zio_flush(zio, spa->spa_l2cache.sav_vdevs[v]); + } + } + } (void) zio_wait(zio); From eb4a36bcef41f2f73a74bbfcd7fb46152df7b0e6 Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Thu, 4 Jan 2024 19:32:53 +0500 Subject: [PATCH 52/91] Extend aux label to add path information Pool import logic uses vdev paths, so it makes sense to add path information on AUX vdev as well. 
Reviewed-by: Umer Saleem Reviewed-by: Alexander Motin Signed-off-by: Ameer Hamza Closes #15737 --- module/zfs/vdev_label.c | 54 +++++++++++++++++++++++------------------ 1 file changed, 31 insertions(+), 23 deletions(-) diff --git a/module/zfs/vdev_label.c b/module/zfs/vdev_label.c index 21348f95a4e..737d8b33e18 100644 --- a/module/zfs/vdev_label.c +++ b/module/zfs/vdev_label.c @@ -1023,6 +1023,10 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) int error; uint64_t spare_guid = 0, l2cache_guid = 0; int flags = ZIO_FLAG_CONFIG_WRITER | ZIO_FLAG_CANFAIL; + boolean_t reason_spare = (reason == VDEV_LABEL_SPARE || (reason == + VDEV_LABEL_REMOVE && vd->vdev_isspare)); + boolean_t reason_l2cache = (reason == VDEV_LABEL_L2CACHE || (reason == + VDEV_LABEL_REMOVE && vd->vdev_isl2cache)); ASSERT(spa_config_held(spa, SCL_ALL, RW_WRITER) == SCL_ALL); @@ -1108,34 +1112,20 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) * really part of an active pool just yet. The labels will * be written again with a meaningful txg by spa_sync(). */ - if (reason == VDEV_LABEL_SPARE || - (reason == VDEV_LABEL_REMOVE && vd->vdev_isspare)) { + if (reason_spare || reason_l2cache) { /* - * For inactive hot spares, we generate a special label that - * identifies as a mutually shared hot spare. We write the - * label if we are adding a hot spare, or if we are removing an - * active hot spare (in which case we want to revert the - * labels). + * For inactive hot spares and level 2 ARC devices, we generate + * a special label that identifies as a mutually shared hot + * spare or l2cache device. We write the label in case of + * addition or removal of hot spare or l2cache vdev (in which + * case we want to revert the labels). 
*/ VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, spa_version(spa)) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE, - POOL_STATE_SPARE) == 0); - VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, - vd->vdev_guid) == 0); - } else if (reason == VDEV_LABEL_L2CACHE || - (reason == VDEV_LABEL_REMOVE && vd->vdev_isl2cache)) { - /* - * For level 2 ARC devices, add a special label. - */ - VERIFY(nvlist_alloc(&label, NV_UNIQUE_NAME, KM_SLEEP) == 0); - - VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_VERSION, - spa_version(spa)) == 0); - VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_POOL_STATE, - POOL_STATE_L2CACHE) == 0); + reason_spare ? POOL_STATE_SPARE : POOL_STATE_L2CACHE) == 0); VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_GUID, vd->vdev_guid) == 0); @@ -1146,8 +1136,26 @@ vdev_label_init(vdev_t *vd, uint64_t crtxg, vdev_labeltype_t reason) * spa->spa_l2cache->sav_config (populated in * spa_ld_open_aux_vdevs()). 
*/ - VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_ASHIFT, - vd->vdev_ashift) == 0); + if (reason_l2cache) { + VERIFY(nvlist_add_uint64(label, ZPOOL_CONFIG_ASHIFT, + vd->vdev_ashift) == 0); + } + + /* + * Add path information to help find it during pool import + */ + if (vd->vdev_path != NULL) { + VERIFY(nvlist_add_string(label, ZPOOL_CONFIG_PATH, + vd->vdev_path) == 0); + } + if (vd->vdev_devid != NULL) { + VERIFY(nvlist_add_string(label, ZPOOL_CONFIG_DEVID, + vd->vdev_devid) == 0); + } + if (vd->vdev_physpath != NULL) { + VERIFY(nvlist_add_string(label, ZPOOL_CONFIG_PHYS_PATH, + vd->vdev_physpath) == 0); + } /* * When spare or l2cache (aux) vdev is added during pool From a2e71db66434ea27a57e3add5fbda35ecd0722d6 Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Thu, 4 Jan 2024 19:35:04 +0500 Subject: [PATCH 53/91] Add path handling for aux vdevs in `label_paths` If the AUX vdev is added using UUID, importing the pool falls back to opening the AUX vdev by disk name instead of UUID due to the absence of path information for AUX vdevs. Since AUX labels now have path information, this PR adds path handling for them in `label_paths`. Reviewed-by: Umer Saleem Reviewed-by: Alexander Motin Signed-off-by: Ameer Hamza Closes #15737 --- lib/libzutil/zutil_import.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/lib/libzutil/zutil_import.c b/lib/libzutil/zutil_import.c index bafe50e5f90..f7ef69a1d93 100644 --- a/lib/libzutil/zutil_import.c +++ b/lib/libzutil/zutil_import.c @@ -1221,13 +1221,26 @@ label_paths(libpc_handle_t *hdl, nvlist_t *label, const char **path, nvlist_t *nvroot; uint64_t pool_guid; uint64_t vdev_guid; + uint64_t state; *path = NULL; *devid = NULL; + if (nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &vdev_guid) != 0) + return (ENOENT); + + /* + * In case of spare or l2cache, we directly return path/devid from the + * label.
+ */ + if (!(nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_STATE, &state)) && + (state == POOL_STATE_SPARE || state == POOL_STATE_L2CACHE)) { + (void) nvlist_lookup_string(label, ZPOOL_CONFIG_PATH, path); + (void) nvlist_lookup_string(label, ZPOOL_CONFIG_DEVID, devid); + return (0); + } if (nvlist_lookup_nvlist(label, ZPOOL_CONFIG_VDEV_TREE, &nvroot) || - nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &pool_guid) || - nvlist_lookup_uint64(label, ZPOOL_CONFIG_GUID, &vdev_guid)) + nvlist_lookup_uint64(label, ZPOOL_CONFIG_POOL_GUID, &pool_guid)) return (ENOENT); return (label_paths_impl(hdl, nvroot, pool_guid, vdev_guid, path, From 6b64acc157ec713f1e3d0b1980a528e874341e52 Mon Sep 17 00:00:00 2001 From: youzhongyang Date: Tue, 16 Jan 2024 16:30:58 -0500 Subject: [PATCH 54/91] Make spl_kmem_cache size check consistent On Linux x86_64, kmem cache can have size up to 4M, however increasing spl_kmem_cache_slab_limit can lead to crash due to the size check inconsistency. Reviewed-by: Brian Behlendorf Signed-off-by: Youzhong Yang Closes #15757 --- module/os/linux/spl/spl-kmem-cache.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/module/os/linux/spl/spl-kmem-cache.c b/module/os/linux/spl/spl-kmem-cache.c index a2920c74667..4b15081715a 100644 --- a/module/os/linux/spl/spl-kmem-cache.c +++ b/module/os/linux/spl/spl-kmem-cache.c @@ -91,7 +91,8 @@ MODULE_PARM_DESC(spl_kmem_cache_max_size, "Maximum size of slab in MB"); * of 16K was determined to be optimal for architectures using 4K pages and * to also work well on architecutres using larger 64K page sizes. 
*/ -static unsigned int spl_kmem_cache_slab_limit = 16384; +static unsigned int spl_kmem_cache_slab_limit = + SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE; module_param(spl_kmem_cache_slab_limit, uint, 0644); MODULE_PARM_DESC(spl_kmem_cache_slab_limit, "Objects less than N bytes use the Linux slab"); @@ -783,7 +784,7 @@ spl_kmem_cache_create(const char *name, size_t size, size_t align, } else { unsigned long slabflags = 0; - if (size > (SPL_MAX_KMEM_ORDER_NR_PAGES * PAGE_SIZE)) + if (size > spl_kmem_cache_slab_limit) goto out; #if defined(SLAB_USERCOPY) From 424d06a29886603de2e33ec7aaae6607b17819ff Mon Sep 17 00:00:00 2001 From: Lalufu Date: Tue, 16 Jan 2024 22:32:59 +0100 Subject: [PATCH 55/91] Make sure all necessary RPM path macros are defined When building (s)rpm files through the Makefile, a directory structure is created in /tmp to hold the various files. In case the user running the command has overridden some of the RPM path settings through their user profile (for example in `~/.rpmmacros`), these paths do not line up with the configuration, and the build fails. Make sure all paths used are properly defined. Reviewed-by: Brian Behlendorf Signed-off-by: Ralf Ertzinger Closes #15756 --- config/rpm.am | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/config/rpm.am b/config/rpm.am index 13bd54a625b..85c56c0b2e3 100644 --- a/config/rpm.am +++ b/config/rpm.am @@ -83,6 +83,11 @@ srpm-common: rpm-local || exit 1; \ LANG=C $(RPMBUILD) \ --define "_tmppath $$rpmbuild/TMP" \ + --define "_builddir $$rpmbuild/BUILD" \ + --define "_rpmdir $$rpmbuild/RPMS" \ + --define "_srcrpmdir $$rpmbuild/SRPMS" \ + --define "_specdir $$rpmbuild/SPECS" \ + --define "_sourcedir $$rpmbuild/SOURCES" \ --define "_topdir $$rpmbuild" \ $(def) -bs $$rpmbuild/SPECS/$$rpmspec || exit 1; \ cp $$rpmbuild/SRPMS/$$rpmpkg . 
|| exit 1; \ @@ -99,6 +104,11 @@ rpm-common: rpm-local || exit 1; \ LANG=C ${RPMBUILD} \ --define "_tmppath $$rpmbuild/TMP" \ + --define "_builddir $$rpmbuild/BUILD" \ + --define "_rpmdir $$rpmbuild/RPMS" \ + --define "_srcrpmdir $$rpmbuild/SRPMS" \ + --define "_specdir $$rpmbuild/SPECS" \ + --define "_sourcedir $$rpmbuild/SOURCES" \ --define "_topdir $$rpmbuild" \ $(def) --rebuild $$rpmpkg || exit 1; \ cp $$rpmbuild/RPMS/*/* . || exit 1; \ From 276be5357cf33a266a676fca1f22924655da1ba3 Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Wed, 17 Jan 2024 18:05:12 +0100 Subject: [PATCH 56/91] linux spl: fix typo in top comment of spl-condvar.c Credential Implementation -> Condition Variables Implementation Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Tino Reichardt Closes #15782 --- module/os/linux/spl/spl-condvar.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/os/linux/spl/spl-condvar.c b/module/os/linux/spl/spl-condvar.c index e87954714e3..5898789ad53 100644 --- a/module/os/linux/spl/spl-condvar.c +++ b/module/os/linux/spl/spl-condvar.c @@ -20,7 +20,7 @@ * You should have received a copy of the GNU General Public License along * with the SPL. If not, see . * - * Solaris Porting Layer (SPL) Credential Implementation. + * Solaris Porting Layer (SPL) Condition Variables Implementation. */ #include From 09a79613640bd96f2ac39967557a8ed602cd04a5 Mon Sep 17 00:00:00 2001 From: Val Packett Date: Fri, 19 Jan 2024 18:01:26 -0300 Subject: [PATCH 57/91] FreeBSD: Fix bootstrapping tools under Linux/musl musl libc has deprecated LFS64 aliases, so bootstrapping FreeBSD tools under musl distros has been failing with stat64 errors. Apply the aliases under non-glibc Linux to fix this problem. 
Reviewed-by: Richard Yao Reviewed-by: Brian Behlendorf Signed-off-by: Val Packett Closes #15780 --- lib/libspl/include/os/freebsd/sys/stat.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/lib/libspl/include/os/freebsd/sys/stat.h b/lib/libspl/include/os/freebsd/sys/stat.h index 88773cceb95..af488244bd0 100644 --- a/lib/libspl/include/os/freebsd/sys/stat.h +++ b/lib/libspl/include/os/freebsd/sys/stat.h @@ -76,8 +76,12 @@ fstat64_blk(int fd, struct stat64 *st) /* * Only Intel-based Macs have a separate stat64; Arm-based Macs are like * FreeBSD and have a full 64-bit stat from the start. + * + * On Linux, musl libc is full 64-bit too and has deprecated its own version + * of these defines since version 1.2.4. */ -#if defined(__APPLE__) && !(defined(__i386__) || defined(__x86_64__)) +#if (defined(__APPLE__) && !(defined(__i386__) || defined(__x86_64__))) || \ + (defined(__linux__) && !defined(__GLIBC__)) #define stat64 stat #define fstat64 fstat #endif From cfa29b994594dd4261117aa9c685adc6274485a8 Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Fri, 26 Jan 2024 22:36:59 +0100 Subject: [PATCH 58/91] ZTS: Apply small changes for speeding up the tests The Github Action Runner got some new hardware metrics. We should use the provided and empty disk which is pre-mounted at /mnt now. Disk1: 89GiB -> rootfs + bootfs with ~80MB/s -> don't care Disk2: 64GiB -> /mnt with 420MB/s -> new testing ssd This commit mounts the new disk at /var/tmp, which should hopefully speed up our test runs.
Reviewed-by: Brian Behlendorf Reviewed-by: George Melikov Reviewed-by: Andrew Innes Signed-off-by: Tino Reichardt Closes #15811 --- .../workflows/scripts/setup-dependencies.sh | 35 ++++++++----------- 1 file changed, 15 insertions(+), 20 deletions(-) diff --git a/.github/workflows/scripts/setup-dependencies.sh b/.github/workflows/scripts/setup-dependencies.sh index 440d5e8e5ac..b40f9290f91 100755 --- a/.github/workflows/scripts/setup-dependencies.sh +++ b/.github/workflows/scripts/setup-dependencies.sh @@ -55,29 +55,24 @@ function mod_install() { cat /proc/spl/kstat/zfs/chksum_bench echo "::endgroup::" - echo "::group::Reclaim and report disk space" - # remove 4GiB of images - sudo systemd-run docker system prune --force --all --volumes + echo "::group::Optimize storage for ZFS testings" + # remove swap and umount fast storage + # 89GiB -> rootfs + bootfs with ~80MB/s -> don't care + # 64GiB -> /mnt with 420MB/s -> new testing ssd + sudo swapoff -a - # remove unused software - sudo systemd-run --wait rm -rf \ - "$AGENT_TOOLSDIRECTORY" \ - /opt/* \ - /usr/local/* \ - /usr/share/az* \ - /usr/share/dotnet \ - /usr/share/gradle* \ - /usr/share/miniconda \ - /usr/share/swift \ - /var/lib/gems \ - /var/lib/mysql \ - /var/lib/snapd - - # trim the cleaned space - sudo fstrim / + # this one is fast and mounted @ /mnt + # -> we reformat with ext4 + move it to /var/tmp + DEV="/dev/disk/azure/resource-part1" + sudo umount /mnt + sudo mkfs.ext4 -O ^has_journal -F $DEV + sudo mount -o noatime,barrier=0 $DEV /var/tmp + sudo chmod 1777 /var/tmp # disk usage afterwards - df -h / + sudo df -h / + sudo df -h /var/tmp + sudo fstrim -a echo "::endgroup::" } From 9da745f5de73487e14e6dfd65130b1677f84518a Mon Sep 17 00:00:00 2001 From: Richard Yao Date: Fri, 26 Jan 2024 17:11:33 -0500 Subject: [PATCH 59/91] Switch to CodeQL to detect prohibited function use The LLVM/Clang developers pointed out that using the CPP to detect use of functions that our QA policies prohibit risks invoking 
undefined behavior. To resolve this, we configure CodeQL to detect forbidden function usage. Note that cpp in the context of CodeQL refers to C/C++, rather than the C PreProcessor, which C++ also uses. It really should have been written cxx, but that ship sailed a long time ago. This misuse of the term cpp is retained in the CodeQL configuration for consistency with upstream CodeQL. As a side benefit, verbose make no longer is a wall of text showing a bunch of CPP macros, which can make debugging slightly easier. Reviewed-by: Brian Behlendorf Signed-off-by: Richard Yao Closes #15819 Closes #14134 --- .github/codeql-cpp.yml | 4 ++ .github/codeql-python.yml | 4 ++ .../cpp/deprecatedFunctionUsage.ql | 59 +++++++++++++++++++ .github/codeql/custom-queries/cpp/qlpack.yml | 4 ++ .github/workflows/codeql.yml | 1 + config/Rules.am | 15 ----- 6 files changed, 72 insertions(+), 15 deletions(-) create mode 100644 .github/codeql-cpp.yml create mode 100644 .github/codeql-python.yml create mode 100644 .github/codeql/custom-queries/cpp/deprecatedFunctionUsage.ql create mode 100644 .github/codeql/custom-queries/cpp/qlpack.yml diff --git a/.github/codeql-cpp.yml b/.github/codeql-cpp.yml new file mode 100644 index 00000000000..88b8c608602 --- /dev/null +++ b/.github/codeql-cpp.yml @@ -0,0 +1,4 @@ +name: "Custom CodeQL Analysis" + +queries: + - uses: ./.github/codeql/custom-queries/cpp/deprecatedFunctionUsage.ql diff --git a/.github/codeql-python.yml b/.github/codeql-python.yml new file mode 100644 index 00000000000..93cb4a435ed --- /dev/null +++ b/.github/codeql-python.yml @@ -0,0 +1,4 @@ +name: "Custom CodeQL Analysis" + +paths-ignore: + - tests diff --git a/.github/codeql/custom-queries/cpp/deprecatedFunctionUsage.ql b/.github/codeql/custom-queries/cpp/deprecatedFunctionUsage.ql new file mode 100644 index 00000000000..eb4b7bd6299 --- /dev/null +++ b/.github/codeql/custom-queries/cpp/deprecatedFunctionUsage.ql @@ -0,0 +1,59 @@ +/** + * @name Deprecated function usage detection + * 
@description Detects functions whose usage is banned from the OpenZFS + * codebase due to QA concerns. + * @kind problem + * @severity error + * @id cpp/deprecated-function-usage +*/ + +import cpp + +predicate isDeprecatedFunction(Function f) { + f.getName() = "strtok" or + f.getName() = "__xpg_basename" or + f.getName() = "basename" or + f.getName() = "dirname" or + f.getName() = "bcopy" or + f.getName() = "bcmp" or + f.getName() = "bzero" or + f.getName() = "asctime" or + f.getName() = "asctime_r" or + f.getName() = "gmtime" or + f.getName() = "localtime" or + f.getName() = "strncpy" + +} + +string getReplacementMessage(Function f) { + if f.getName() = "strtok" then + result = "Use strtok_r(3) instead!" + else if f.getName() = "__xpg_basename" then + result = "basename(3) is underspecified. Use zfs_basename() instead!" + else if f.getName() = "basename" then + result = "basename(3) is underspecified. Use zfs_basename() instead!" + else if f.getName() = "dirname" then + result = "dirname(3) is underspecified. Use zfs_dirnamelen() instead!" + else if f.getName() = "bcopy" then + result = "bcopy(3) is deprecated. Use memcpy(3)/memmove(3) instead!" + else if f.getName() = "bcmp" then + result = "bcmp(3) is deprecated. Use memcmp(3) instead!" + else if f.getName() = "bzero" then + result = "bzero(3) is deprecated. Use memset(3) instead!" + else if f.getName() = "asctime" then + result = "Use strftime(3) instead!" + else if f.getName() = "asctime_r" then + result = "Use strftime(3) instead!" + else if f.getName() = "gmtime" then + result = "gmtime(3) isn't thread-safe. Use gmtime_r(3) instead!" + else if f.getName() = "localtime" then + result = "localtime(3) isn't thread-safe. Use localtime_r(3) instead!" + else + result = "strncpy(3) is deprecated. Use strlcpy(3) instead!" 
+} + +from FunctionCall fc, Function f +where + fc.getTarget() = f and + isDeprecatedFunction(f) +select fc, getReplacementMessage(f) diff --git a/.github/codeql/custom-queries/cpp/qlpack.yml b/.github/codeql/custom-queries/cpp/qlpack.yml new file mode 100644 index 00000000000..cbe0f1cbe3c --- /dev/null +++ b/.github/codeql/custom-queries/cpp/qlpack.yml @@ -0,0 +1,4 @@ +name: openzfs-cpp-queries +version: 0.0.0 +libraryPathDependencies: codeql-cpp +suites: openzfs-cpp-suite diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 037f8aca0ea..7ccfc149256 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -29,6 +29,7 @@ jobs: - name: Initialize CodeQL uses: github/codeql-action/init@v2 with: + config-file: .github/codeql-${{ matrix.language }}.yml languages: ${{ matrix.language }} - name: Autobuild diff --git a/config/Rules.am b/config/Rules.am index 7c266964f3f..2e463ae6083 100644 --- a/config/Rules.am +++ b/config/Rules.am @@ -42,21 +42,6 @@ AM_CPPFLAGS += $(DEBUG_CPPFLAGS) AM_CPPFLAGS += $(CODE_COVERAGE_CPPFLAGS) AM_CPPFLAGS += -DTEXT_DOMAIN=\"zfs-@ac_system_l@-user\" -AM_CPPFLAGS_NOCHECK = -D"strtok(...)=strtok(__VA_ARGS__) __attribute__((deprecated(\"Use strtok_r(3) instead!\")))" -AM_CPPFLAGS_NOCHECK += -D"__xpg_basename(...)=__xpg_basename(__VA_ARGS__) __attribute__((deprecated(\"basename(3) is underspecified. Use zfs_basename() instead!\")))" -AM_CPPFLAGS_NOCHECK += -D"basename(...)=basename(__VA_ARGS__) __attribute__((deprecated(\"basename(3) is underspecified. Use zfs_basename() instead!\")))" -AM_CPPFLAGS_NOCHECK += -D"dirname(...)=dirname(__VA_ARGS__) __attribute__((deprecated(\"dirname(3) is underspecified. Use zfs_dirnamelen() instead!\")))" -AM_CPPFLAGS_NOCHECK += -D"bcopy(...)=__attribute__((deprecated(\"bcopy(3) is deprecated. Use memcpy(3)/memmove(3) instead!\"))) bcopy(__VA_ARGS__)" -AM_CPPFLAGS_NOCHECK += -D"bcmp(...)=__attribute__((deprecated(\"bcmp(3) is deprecated. 
Use memcmp(3) instead!\"))) bcmp(__VA_ARGS__)" -AM_CPPFLAGS_NOCHECK += -D"bzero(...)=__attribute__((deprecated(\"bzero(3) is deprecated. Use memset(3) instead!\"))) bzero(__VA_ARGS__)" -AM_CPPFLAGS_NOCHECK += -D"asctime(...)=__attribute__((deprecated(\"Use strftime(3) instead!\"))) asctime(__VA_ARGS__)" -AM_CPPFLAGS_NOCHECK += -D"asctime_r(...)=__attribute__((deprecated(\"Use strftime(3) instead!\"))) asctime_r(__VA_ARGS__)" -AM_CPPFLAGS_NOCHECK += -D"gmtime(...)=__attribute__((deprecated(\"gmtime(3) isn't thread-safe. Use gmtime_r(3) instead!\"))) gmtime(__VA_ARGS__)" -AM_CPPFLAGS_NOCHECK += -D"localtime(...)=__attribute__((deprecated(\"localtime(3) isn't thread-safe. Use localtime_r(3) instead!\"))) localtime(__VA_ARGS__)" -AM_CPPFLAGS_NOCHECK += -D"strncpy(...)=__attribute__((deprecated(\"strncpy(3) is deprecated. Use strlcpy(3) instead!\"))) strncpy(__VA_ARGS__)" - -AM_CPPFLAGS += $(AM_CPPFLAGS_NOCHECK) - if ASAN_ENABLED AM_CPPFLAGS += -DZFS_ASAN_ENABLED endif From 9ad150446fad14b1de6baf2b8bdef4a8965e6030 Mon Sep 17 00:00:00 2001 From: Tino Reichardt Date: Fri, 26 Jan 2024 23:22:26 +0100 Subject: [PATCH 60/91] ZTS: Update deprecated Github Action version numbers GitHub Actions is transitioning from Node 16 to Node 20. So we need to update these: - actions/checkout@v3 -> v4 - actions/download-artifact@v3 -> v4 - actions/upload-artifact@v3 -> v4 and some minor changes Update also the documentation of the testings workflow. 
Reviewed-by: Brian Behlendorf Reviewed-by: George Melikov Reviewed-by: Andrew Innes Signed-off-by: Tino Reichardt Closes #15820 --- .github/workflows/README.md | 41 +++++++++++-------- .github/workflows/checkstyle.yaml | 4 +- .github/workflows/codeql.yml | 2 +- .github/workflows/scripts/generate-summary.sh | 2 +- .github/workflows/zfs-linux-tests.yml | 22 +++++----- .github/workflows/zfs-linux.yml | 8 ++-- 6 files changed, 42 insertions(+), 37 deletions(-) diff --git a/.github/workflows/README.md b/.github/workflows/README.md index 8255dd21082..ab0555dcddf 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -4,44 +4,49 @@ ```mermaid flowchart TB subgraph CleanUp and Summary - Part1-20.04-->CleanUp+nice+Summary - Part2-20.04-->CleanUp+nice+Summary - PartN-20.04-->CleanUp+nice+Summary - Part1-22.04-->CleanUp+nice+Summary - Part2-22.04-->CleanUp+nice+Summary - PartN-22.04-->CleanUp+nice+Summary + sanity-checks-20.04-->CleanUp+Summary + Part1-20.04-->CleanUp+Summary + Part2-20.04-->CleanUp+Summary + Part3-20.04-->CleanUp+Summary + Part4-20.04-->CleanUp+Summary + Part1-22.04-->CleanUp+Summary + Part2-22.04-->CleanUp+Summary + Part3-22.04-->CleanUp+Summary + Part4-22.04-->CleanUp+Summary + sanity-checks-22.04-->CleanUp+Summary end subgraph Functional Testings + sanity-checks-20.04 + zloop-checks-20.04 functional-testing-20.04-->Part1-20.04 functional-testing-20.04-->Part2-20.04 - functional-testing-20.04-->PartN-20.04 + functional-testing-20.04-->Part3-20.04 + functional-testing-20.04-->Part4-20.04 functional-testing-22.04-->Part1-22.04 functional-testing-22.04-->Part2-22.04 - functional-testing-22.04-->PartN-22.04 -end - -subgraph Sanity and zloop Testings - sanity-checks-20.04-->functional-testing-20.04 - sanity-checks-22.04-->functional-testing-22.04 - zloop-checks-20.04-->functional - zloop-checks-22.04-->functional + functional-testing-22.04-->Part3-22.04 + functional-testing-22.04-->Part4-22.04 + sanity-checks-22.04 + zloop-checks-22.04 
end subgraph Code Checking + Building + Build-Ubuntu-20.04-->sanity-checks-20.04 + Build-Ubuntu-20.04-->zloop-checks-20.04 + Build-Ubuntu-20.04-->functional-testing-20.04 codeql.yml checkstyle.yml - Build-Ubuntu-20.04-->sanity-checks-20.04 Build-Ubuntu-22.04-->sanity-checks-22.04 - Build-Ubuntu-20.04-->zloop-checks-20.04 Build-Ubuntu-22.04-->zloop-checks-22.04 + Build-Ubuntu-22.04-->functional-testing-22.04 end ``` 1) build zfs modules for Ubuntu 20.04 and 22.04 (~15m) 2) 2x zloop test (~10m) + 2x sanity test (~25m) -3) functional testings in parts 1..5 (each ~1h) +3) 4x functional testings in parts 1..4 (each ~1h) 4) cleanup and create summary - content of summary depends on the results of the steps diff --git a/.github/workflows/checkstyle.yaml b/.github/workflows/checkstyle.yaml index b0fdc570d47..abcb358fc04 100644 --- a/.github/workflows/checkstyle.yaml +++ b/.github/workflows/checkstyle.yaml @@ -8,7 +8,7 @@ jobs: checkstyle: runs-on: ubuntu-22.04 steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: ref: ${{ github.event.pull_request.head.sha }} - name: Install dependencies @@ -52,7 +52,7 @@ jobs: if: failure() && steps.CheckABI.outcome == 'failure' run: | find -name *.abi | tar -cf abi_files.tar -T - - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 if: failure() && steps.CheckABI.outcome == 'failure' with: name: New ABI files (use only if you're sure about interface changes) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 7ccfc149256..e015b2cb71d 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -24,7 +24,7 @@ jobs: echo "MAKEFLAGS=-j$(nproc)" >> $GITHUB_ENV - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Initialize CodeQL uses: github/codeql-action/init@v2 diff --git a/.github/workflows/scripts/generate-summary.sh b/.github/workflows/scripts/generate-summary.sh index cd5ea3421c9..b5d89208a5d 100755 --- 
a/.github/workflows/scripts/generate-summary.sh +++ b/.github/workflows/scripts/generate-summary.sh @@ -87,7 +87,7 @@ function summarize_f() { output "\n## $headline\n" rm -rf testfiles for i in $(seq 1 $FUNCTIONAL_PARTS); do - tarfile="$2/part$i.tar" + tarfile="$2-part$i/part$i.tar" check_tarfile "$tarfile" check_logfile "testfiles/log" done diff --git a/.github/workflows/zfs-linux-tests.yml b/.github/workflows/zfs-linux-tests.yml index c4fe930d092..753f3cd0214 100644 --- a/.github/workflows/zfs-linux-tests.yml +++ b/.github/workflows/zfs-linux-tests.yml @@ -13,10 +13,10 @@ jobs: zloop: runs-on: ubuntu-${{ inputs.os }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: ref: ${{ github.event.pull_request.head.sha }} - - uses: actions/download-artifact@v3 + - uses: actions/download-artifact@v4 with: name: modules-${{ inputs.os }} - name: Install modules @@ -34,7 +34,7 @@ jobs: if: failure() run: | sudo chmod +r -R /var/tmp/zloop/ - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 if: failure() with: name: Zpool-logs-${{ inputs.os }} @@ -43,7 +43,7 @@ jobs: !/var/tmp/zloop/*/vdev/ retention-days: 14 if-no-files-found: ignore - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 if: failure() with: name: Zpool-files-${{ inputs.os }} @@ -55,10 +55,10 @@ jobs: sanity: runs-on: ubuntu-${{ inputs.os }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: ref: ${{ github.event.pull_request.head.sha }} - - uses: actions/download-artifact@v3 + - uses: actions/download-artifact@v4 with: name: modules-${{ inputs.os }} - name: Install modules @@ -77,7 +77,7 @@ jobs: RESPATH="/var/tmp/test_results" mv -f $RESPATH/current $RESPATH/testfiles tar cf $RESPATH/sanity.tar -h -C $RESPATH testfiles - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 if: success() || failure() with: name: Logs-${{ inputs.os }}-sanity @@ -91,10 +91,10 @@ jobs: matrix: tests: [ part1, part2, part3, 
part4 ] steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: ref: ${{ github.event.pull_request.head.sha }} - - uses: actions/download-artifact@v3 + - uses: actions/download-artifact@v4 with: name: modules-${{ inputs.os }} - name: Install modules @@ -116,9 +116,9 @@ jobs: RESPATH="/var/tmp/test_results" mv -f $RESPATH/current $RESPATH/testfiles tar cf $RESPATH/${{ matrix.tests }}.tar -h -C $RESPATH testfiles - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 if: success() || failure() with: - name: Logs-${{ inputs.os }}-functional + name: Logs-${{ inputs.os }}-functional-${{ matrix.tests }} path: /var/tmp/test_results/${{ matrix.tests }}.tar if-no-files-found: ignore diff --git a/.github/workflows/zfs-linux.yml b/.github/workflows/zfs-linux.yml index be3908deb94..e6b705c8605 100644 --- a/.github/workflows/zfs-linux.yml +++ b/.github/workflows/zfs-linux.yml @@ -14,14 +14,14 @@ jobs: os: [20.04, 22.04] runs-on: ubuntu-${{ matrix.os }} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 with: ref: ${{ github.event.pull_request.head.sha }} - name: Build modules run: .github/workflows/scripts/setup-dependencies.sh build - name: Prepare modules upload run: tar czf modules-${{ matrix.os }}.tgz *.deb .github tests/test-runner tests/ImageOS.txt - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: modules-${{ matrix.os }} path: modules-${{ matrix.os }}.tgz @@ -44,7 +44,7 @@ jobs: runs-on: ubuntu-22.04 needs: testings steps: - - uses: actions/download-artifact@v3 + - uses: actions/download-artifact@v4 - name: Generating summary run: | tar xzf modules-22.04/modules-22.04.tgz .github tests @@ -58,7 +58,7 @@ jobs: run: .github/workflows/scripts/generate-summary.sh 3 - name: Summary for errors #4 run: .github/workflows/scripts/generate-summary.sh 4 - - uses: actions/upload-artifact@v3 + - uses: actions/upload-artifact@v4 with: name: Summary Files path: Summary/ From 
dd3a0a27157bb918e6e216b698fbdc22c3c3cc0d Mon Sep 17 00:00:00 2001 From: Ameer Hamza Date: Sat, 27 Jan 2024 03:24:35 +0500 Subject: [PATCH 61/91] Update vdev devid and physpath if changed between imports If devid or physpath for a vdev changes between imports, ensure it is updated to the new value. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Ameer Hamza Closes #15816 --- module/zfs/vdev.c | 40 +++++++++++++++++++++++++++------------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/module/zfs/vdev.c b/module/zfs/vdev.c index afb01c0ef7f..e1ca1aecc90 100644 --- a/module/zfs/vdev.c +++ b/module/zfs/vdev.c @@ -2484,23 +2484,37 @@ vdev_validate(vdev_t *vd) return (0); } +static void +vdev_update_path(const char *prefix, char *svd, char **dvd, uint64_t guid) +{ + if (svd != NULL && *dvd != NULL) { + if (strcmp(svd, *dvd) != 0) { + zfs_dbgmsg("vdev_copy_path: vdev %llu: %s changed " + "from '%s' to '%s'", (u_longlong_t)guid, prefix, + *dvd, svd); + spa_strfree(*dvd); + *dvd = spa_strdup(svd); + } + } else if (svd != NULL) { + *dvd = spa_strdup(svd); + zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'", + (u_longlong_t)guid, *dvd); + } +} + static void vdev_copy_path_impl(vdev_t *svd, vdev_t *dvd) { char *old, *new; - if (svd->vdev_path != NULL && dvd->vdev_path != NULL) { - if (strcmp(svd->vdev_path, dvd->vdev_path) != 0) { - zfs_dbgmsg("vdev_copy_path: vdev %llu: path changed " - "from '%s' to '%s'", (u_longlong_t)dvd->vdev_guid, - dvd->vdev_path, svd->vdev_path); - spa_strfree(dvd->vdev_path); - dvd->vdev_path = spa_strdup(svd->vdev_path); - } - } else if (svd->vdev_path != NULL) { - dvd->vdev_path = spa_strdup(svd->vdev_path); - zfs_dbgmsg("vdev_copy_path: vdev %llu: path set to '%s'", - (u_longlong_t)dvd->vdev_guid, dvd->vdev_path); - } + + vdev_update_path("vdev_path", svd->vdev_path, &dvd->vdev_path, + dvd->vdev_guid); + + vdev_update_path("vdev_devid", svd->vdev_devid, &dvd->vdev_devid, + dvd->vdev_guid); + + 
vdev_update_path("vdev_physpath", svd->vdev_physpath, + &dvd->vdev_physpath, dvd->vdev_guid); /* * Our enclosure sysfs path may have changed between imports From 0606ce20555a2392d9172e37d5e2ff3cdab5c1bd Mon Sep 17 00:00:00 2001 From: Rob N Date: Sat, 27 Jan 2024 09:41:31 +1100 Subject: [PATCH 62/91] zpool wait: print timestamp before the header list, status and iostat all display the -T timestamp before the header, but wait showed it after. Make it be like the others. Reported-by: Kyle Evans Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #15825 --- cmd/zpool/zpool_main.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 5507f9d3fd6..5f96dc8d004 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -10752,6 +10752,9 @@ print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row) col_widths[i] = MAX(strlen(headers[i]), 6) + 2; } + if (timestamp_fmt != NODATE) + print_timestamp(timestamp_fmt); + /* Print header if appropriate */ int term_height = terminal_height(); boolean_t reprint_header = (!wd->wd_headers_once && term_height > 0 && @@ -10819,9 +10822,6 @@ print_wait_status_row(wait_data_t *wd, zpool_handle_t *zhp, int row) if (vdev_any_spare_replacing(nvroot)) bytes_rem[ZPOOL_WAIT_REPLACE] = bytes_rem[ZPOOL_WAIT_RESILVER]; - if (timestamp_fmt != NODATE) - print_timestamp(timestamp_fmt); - for (i = 0; i < ZPOOL_WAIT_NUM_ACTIVITIES; i++) { char buf[64]; if (!wd->wd_enabled[i]) From 7cd666d54b122e2e1ca2fb4519ff75fc8d488a43 Mon Sep 17 00:00:00 2001 From: Andrew Innes Date: Tue, 30 Jan 2024 01:16:02 +0800 Subject: [PATCH 63/91] Move nodes into correct subgraphs Reviewed-by: Brian Behlendorf Reviewed-by: George Melikov Reviewed-by: Tino Reichardt Signed-off-by: Andrew Innes Closes #15828 --- .github/workflows/README.md | 31 ++++++++++++++++++------------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/.github/workflows/README.md 
b/.github/workflows/README.md index ab0555dcddf..eef47dae3dc 100644 --- a/.github/workflows/README.md +++ b/.github/workflows/README.md @@ -4,16 +4,7 @@ ```mermaid flowchart TB subgraph CleanUp and Summary - sanity-checks-20.04-->CleanUp+Summary - Part1-20.04-->CleanUp+Summary - Part2-20.04-->CleanUp+Summary - Part3-20.04-->CleanUp+Summary - Part4-20.04-->CleanUp+Summary - Part1-22.04-->CleanUp+Summary - Part2-22.04-->CleanUp+Summary - Part3-22.04-->CleanUp+Summary - Part4-22.04-->CleanUp+Summary - sanity-checks-22.04-->CleanUp+Summary + CleanUp+Summary end subgraph Functional Testings @@ -32,15 +23,29 @@ subgraph Functional Testings end subgraph Code Checking + Building + Build-Ubuntu-20.04 + codeql.yml + checkstyle.yml + Build-Ubuntu-22.04 +end + Build-Ubuntu-20.04-->sanity-checks-20.04 Build-Ubuntu-20.04-->zloop-checks-20.04 Build-Ubuntu-20.04-->functional-testing-20.04 - codeql.yml - checkstyle.yml Build-Ubuntu-22.04-->sanity-checks-22.04 Build-Ubuntu-22.04-->zloop-checks-22.04 Build-Ubuntu-22.04-->functional-testing-22.04 -end + + sanity-checks-20.04-->CleanUp+Summary + Part1-20.04-->CleanUp+Summary + Part2-20.04-->CleanUp+Summary + Part3-20.04-->CleanUp+Summary + Part4-20.04-->CleanUp+Summary + Part1-22.04-->CleanUp+Summary + Part2-22.04-->CleanUp+Summary + Part3-22.04-->CleanUp+Summary + Part4-22.04-->CleanUp+Summary + sanity-checks-22.04-->CleanUp+Summary ``` From dd0874cf7ea3e67130662180fea0e40f54108abb Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 29 Jan 2024 09:41:26 -0800 Subject: [PATCH 64/91] ZTS: Allow longer run time for zdb_args_pos The zdb_args_pos test may take slightly longer than 600 seconds to run on some of the CI builders. To prevent this from causing failures allow up to 1200 seconds for tests in this group. 
Reviewed-by: Tony Hutter Signed-off-by: Brian Behlendorf Closes #15826 --- tests/runfiles/common.run | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index f93bfb43336..85f29c82203 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -163,6 +163,7 @@ tests = ['zdb_002_pos', 'zdb_003_pos', 'zdb_004_pos', 'zdb_005_pos', pre = post = tags = ['functional', 'cli_root', 'zdb'] +timeout = 1200 [tests/functional/cli_root/zfs] tests = ['zfs_001_neg', 'zfs_002_pos'] From acc7cd8e99da50d775296694c42b2127e42a75b7 Mon Sep 17 00:00:00 2001 From: Chris Davidson Date: Mon, 29 Jan 2024 12:44:08 -0500 Subject: [PATCH 65/91] Update man pages to time(1) from time(2) zpool-iostat.8: Updated time(2) -> time(1) to align to manual page zpool-list.8: Updated time(2) -> time(1) to align to manual page zpool-status.8: Updated time(2) -> time(1) to align to manual page zpool-wait.8: Update time(2) -> time(1) to align to manual page Reviewed-by: Brian Behlendorf Signed-off-by: Christopher Davidson Closes #15823 --- man/man8/zpool-iostat.8 | 2 +- man/man8/zpool-list.8 | 2 +- man/man8/zpool-status.8 | 2 +- man/man8/zpool-wait.8 | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/man/man8/zpool-iostat.8 b/man/man8/zpool-iostat.8 index 34f7243d5aa..e1d2a4b4ff1 100644 --- a/man/man8/zpool-iostat.8 +++ b/man/man8/zpool-iostat.8 @@ -146,7 +146,7 @@ Specify .Sy u for a printed representation of the internal representation of time. See -.Xr time 2 . +.Xr time 1 . Specify .Sy d for standard date format. diff --git a/man/man8/zpool-list.8 b/man/man8/zpool-list.8 index 9e905d52ddd..c60c47f5eb3 100644 --- a/man/man8/zpool-list.8 +++ b/man/man8/zpool-list.8 @@ -95,7 +95,7 @@ Specify .Sy u for a printed representation of the internal representation of time. See -.Xr time 2 . +.Xr time 1 . Specify .Sy d for standard date format. 
diff --git a/man/man8/zpool-status.8 b/man/man8/zpool-status.8 index 8f9580cf086..10424b9f5b5 100644 --- a/man/man8/zpool-status.8 +++ b/man/man8/zpool-status.8 @@ -110,7 +110,7 @@ Specify .Sy u for a printed representation of the internal representation of time. See -.Xr time 2 . +.Xr time 1 . Specify .Sy d for standard date format. diff --git a/man/man8/zpool-wait.8 b/man/man8/zpool-wait.8 index 683b0141425..4fa4cb23564 100644 --- a/man/man8/zpool-wait.8 +++ b/man/man8/zpool-wait.8 @@ -97,7 +97,7 @@ Specify .Sy u for a printed representation of the internal representation of time. See -.Xr time 2 . +.Xr time 1 . Specify .Sy d for standard date format. From ab653603f8e113208539fcc1426321cdbb17451d Mon Sep 17 00:00:00 2001 From: Paul Dagnelie Date: Mon, 29 Jan 2024 10:36:42 -0800 Subject: [PATCH 66/91] Don't assert mg_initialized due to device addition race During device removal stress tests, we noticed that we were tripping the assertion that mg_initialized was true. After investigation, it was determined that the mg in question was the embedded log metaslab group for a newly added vdev; the normal mg had been initialized (by metaslab_sync_reassess, via vdev_sync_done). However, because the spa config alloc lock is not held as writer across both calls to metaslab_sync_reassess, it is possible for an allocation to happen between the two metaslab_groups being initialized. Because the metaslab code doesn't check the group in question, just the vdev's main mg, it is possible to get past the initial check in vdev_allocatable and later fail due to the assertion. We simply remove the assertions. We could also consider locking the ALLOC lock around the reassess calls in vdev_sync_done, but that risks deadlocks. We could check the actual target mg in vdev_allocatable, but that risks racing with a passivation that comes in after that check but before the assertion. 
We still won't be able to actually allocate from the metaslab group if no metaslabs are ready, so this change shouldn't break anything. Reviewed-by: Brian Behlendorf Reviewed-by: George Wilson Signed-off-by: Paul Dagnelie Closes #15818 --- module/zfs/metaslab.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/module/zfs/metaslab.c b/module/zfs/metaslab.c index 599d7ffa0cf..5809a832bcb 100644 --- a/module/zfs/metaslab.c +++ b/module/zfs/metaslab.c @@ -5061,7 +5061,6 @@ metaslab_group_alloc(metaslab_group_t *mg, zio_alloc_list_t *zal, int allocator, boolean_t try_hard) { uint64_t offset; - ASSERT(mg->mg_initialized); offset = metaslab_group_alloc_normal(mg, zal, asize, txg, want_unique, dva, d, allocator, try_hard); @@ -5212,8 +5211,6 @@ top: goto next; } - ASSERT(mg->mg_initialized); - /* * Avoid writing single-copy data to an unhealthy, * non-redundant vdev, unless we've already tried all From 621dfaff5ce1673ca1edce82e44cb70b2e00316e Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 29 Jan 2024 11:35:43 -0800 Subject: [PATCH 67/91] Linux 6.7 compat: META Update the META file to reflect compatibility with the 6.7 kernel. Reviewed-by: Tony Hutter Signed-off-by: Brian Behlendorf Closes #15833 --- META | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/META b/META index 93045ec3abe..05337a9c508 100644 --- a/META +++ b/META @@ -6,5 +6,5 @@ Release: 1 Release-Tags: relext License: CDDL Author: OpenZFS -Linux-Maximum: 6.6 +Linux-Maximum: 6.7 Linux-Minimum: 3.10 From 64afc4e66edf6a740f1c7ab808a452e42d964eb7 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 23 Jan 2024 10:50:53 +1100 Subject: [PATCH 68/91] Linux 6.8 compat: make test functions static The kernel is now being compiled with -Wmissing-prototypes. Most of our test stub functions had no prototype, and failed to compile. Since they don't need to be visible anywhere else, just make them all static. 
Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ Closes #15805 --- config/kernel-acl.m4 | 14 +++++++------- config/kernel-automount.m4 | 2 +- config/kernel-bio.m4 | 2 +- config/kernel-block-device-operations.m4 | 8 ++++---- config/kernel-commit-metadata.m4 | 2 +- config/kernel-dentry-operations.m4 | 2 +- config/kernel-dirty-inode.m4 | 2 +- config/kernel-encode-fh-inode.m4 | 2 +- config/kernel-evict-inode.m4 | 2 +- config/kernel-fallocate.m4 | 2 +- config/kernel-fsync.m4 | 4 ++-- config/kernel-get-link.m4 | 8 ++++---- config/kernel-inode-create.m4 | 6 +++--- config/kernel-inode-getattr.m4 | 8 ++++---- config/kernel-inode-lookup.m4 | 2 +- config/kernel-inode-permission.m4 | 4 ++-- config/kernel-inode-setattr.m4 | 6 +++--- config/kernel-make-request-fn.m4 | 8 ++++---- config/kernel-mkdir.m4 | 6 +++--- config/kernel-mknod.m4 | 4 ++-- config/kernel-proc-operations.m4 | 10 +++++----- config/kernel-put-link.m4 | 4 ++-- config/kernel-rename.m4 | 10 +++++----- config/kernel-show-options.m4 | 2 +- config/kernel-shrink.m4 | 13 +++++-------- config/kernel-symlink.m4 | 4 ++-- config/kernel-timer.m4 | 4 ++-- config/kernel-tmpfile.m4 | 8 ++++---- config/kernel-vfs-direct_IO.m4 | 8 ++++---- config/kernel-vfs-iterate.m4 | 6 +++--- config/kernel-vfs-rw-iterate.m4 | 4 ++-- config/kernel-writepage_t.m4 | 2 +- config/kernel-xattr-handler.m4 | 24 ++++++++++++------------ 33 files changed, 95 insertions(+), 98 deletions(-) diff --git a/config/kernel-acl.m4 b/config/kernel-acl.m4 index be08c3c6072..3ae5dc6b6db 100644 --- a/config/kernel-acl.m4 +++ b/config/kernel-acl.m4 @@ -172,7 +172,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_OPERATIONS_GET_ACL], [ ZFS_LINUX_TEST_SRC([inode_operations_get_acl], [ #include - struct posix_acl *get_acl_fn(struct inode *inode, int type) + static struct posix_acl *get_acl_fn(struct inode *inode, int type) { return NULL; } static const struct inode_operations @@ -184,7 +184,7 @@ 
AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_OPERATIONS_GET_ACL], [ ZFS_LINUX_TEST_SRC([inode_operations_get_acl_rcu], [ #include - struct posix_acl *get_acl_fn(struct inode *inode, int type, + static struct posix_acl *get_acl_fn(struct inode *inode, int type, bool rcu) { return NULL; } static const struct inode_operations @@ -196,7 +196,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_OPERATIONS_GET_ACL], [ ZFS_LINUX_TEST_SRC([inode_operations_get_inode_acl], [ #include - struct posix_acl *get_inode_acl_fn(struct inode *inode, int type, + static struct posix_acl *get_inode_acl_fn(struct inode *inode, int type, bool rcu) { return NULL; } static const struct inode_operations @@ -243,7 +243,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_OPERATIONS_SET_ACL], [ ZFS_LINUX_TEST_SRC([inode_operations_set_acl_mnt_idmap_dentry], [ #include - int set_acl_fn(struct mnt_idmap *idmap, + static int set_acl_fn(struct mnt_idmap *idmap, struct dentry *dent, struct posix_acl *acl, int type) { return 0; } @@ -255,7 +255,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_OPERATIONS_SET_ACL], [ ZFS_LINUX_TEST_SRC([inode_operations_set_acl_userns_dentry], [ #include - int set_acl_fn(struct user_namespace *userns, + static int set_acl_fn(struct user_namespace *userns, struct dentry *dent, struct posix_acl *acl, int type) { return 0; } @@ -267,7 +267,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_OPERATIONS_SET_ACL], [ ZFS_LINUX_TEST_SRC([inode_operations_set_acl_userns], [ #include - int set_acl_fn(struct user_namespace *userns, + static int set_acl_fn(struct user_namespace *userns, struct inode *inode, struct posix_acl *acl, int type) { return 0; } @@ -279,7 +279,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_OPERATIONS_SET_ACL], [ ZFS_LINUX_TEST_SRC([inode_operations_set_acl], [ #include - int set_acl_fn(struct inode *inode, struct posix_acl *acl, + static int set_acl_fn(struct inode *inode, struct posix_acl *acl, int type) { return 0; } static const struct inode_operations diff --git a/config/kernel-automount.m4 b/config/kernel-automount.m4 
index f7bb63c6815..52f1931b748 100644 --- a/config/kernel-automount.m4 +++ b/config/kernel-automount.m4 @@ -8,7 +8,7 @@ dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_AUTOMOUNT], [ ZFS_LINUX_TEST_SRC([dentry_operations_d_automount], [ #include - struct vfsmount *d_automount(struct path *p) { return NULL; } + static struct vfsmount *d_automount(struct path *p) { return NULL; } struct dentry_operations dops __attribute__ ((unused)) = { .d_automount = d_automount, }; diff --git a/config/kernel-bio.m4 b/config/kernel-bio.m4 index 18620ca5b7e..b22c1a3de7e 100644 --- a/config/kernel-bio.m4 +++ b/config/kernel-bio.m4 @@ -247,7 +247,7 @@ dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BIO_END_IO_T_ARGS], [ ZFS_LINUX_TEST_SRC([bio_end_io_t_args], [ #include - void wanted_end_io(struct bio *bio) { return; } + static void wanted_end_io(struct bio *bio) { return; } bio_end_io_t *end_io __attribute__ ((unused)) = wanted_end_io; ], []) ]) diff --git a/config/kernel-block-device-operations.m4 b/config/kernel-block-device-operations.m4 index d13c1337b1f..4ff20b9c413 100644 --- a/config/kernel-block-device-operations.m4 +++ b/config/kernel-block-device-operations.m4 @@ -5,7 +5,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS], [ ZFS_LINUX_TEST_SRC([block_device_operations_check_events], [ #include - unsigned int blk_check_events(struct gendisk *disk, + static unsigned int blk_check_events(struct gendisk *disk, unsigned int clearing) { (void) disk, (void) clearing; return (0); @@ -34,7 +34,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID], [ ZFS_LINUX_TEST_SRC([block_device_operations_release_void], [ #include - void blk_release(struct gendisk *g, fmode_t mode) { + static void blk_release(struct gendisk *g, fmode_t mode) { (void) g, (void) mode; return; } @@ -56,7 +56,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG], [ ZFS_LINUX_TEST_SRC([block_device_operations_release_void_1arg], [ #include - void blk_release(struct gendisk *g) { + static void 
blk_release(struct gendisk *g) { (void) g; return; } @@ -96,7 +96,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK], [ ZFS_LINUX_TEST_SRC([block_device_operations_revalidate_disk], [ #include - int blk_revalidate_disk(struct gendisk *disk) { + static int blk_revalidate_disk(struct gendisk *disk) { (void) disk; return(0); } diff --git a/config/kernel-commit-metadata.m4 b/config/kernel-commit-metadata.m4 index 7df9b980290..49bffbf609d 100644 --- a/config/kernel-commit-metadata.m4 +++ b/config/kernel-commit-metadata.m4 @@ -7,7 +7,7 @@ dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_COMMIT_METADATA], [ ZFS_LINUX_TEST_SRC([export_operations_commit_metadata], [ #include - int commit_metadata(struct inode *inode) { return 0; } + static int commit_metadata(struct inode *inode) { return 0; } static struct export_operations eops __attribute__ ((unused))={ .commit_metadata = commit_metadata, }; diff --git a/config/kernel-dentry-operations.m4 b/config/kernel-dentry-operations.m4 index dd470d7607b..500f61e26ae 100644 --- a/config/kernel-dentry-operations.m4 +++ b/config/kernel-dentry-operations.m4 @@ -98,7 +98,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_D_REVALIDATE_NAMEIDATA], [ #include #include - int revalidate (struct dentry *dentry, + static int revalidate (struct dentry *dentry, struct nameidata *nidata) { return 0; } static const struct dentry_operations diff --git a/config/kernel-dirty-inode.m4 b/config/kernel-dirty-inode.m4 index dc7667fa488..2ef8658748c 100644 --- a/config/kernel-dirty-inode.m4 +++ b/config/kernel-dirty-inode.m4 @@ -8,7 +8,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_DIRTY_INODE], [ ZFS_LINUX_TEST_SRC([dirty_inode_with_flags], [ #include - void dirty_inode(struct inode *a, int b) { return; } + static void dirty_inode(struct inode *a, int b) { return; } static const struct super_operations sops __attribute__ ((unused)) = { diff --git a/config/kernel-encode-fh-inode.m4 b/config/kernel-encode-fh-inode.m4 index 9d4ba5f0f61..b3ec040b5e9 100644 --- 
a/config/kernel-encode-fh-inode.m4 +++ b/config/kernel-encode-fh-inode.m4 @@ -7,7 +7,7 @@ dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_ENCODE_FH_WITH_INODE], [ ZFS_LINUX_TEST_SRC([export_operations_encode_fh], [ #include - int encode_fh(struct inode *inode, __u32 *fh, int *max_len, + static int encode_fh(struct inode *inode, __u32 *fh, int *max_len, struct inode *parent) { return 0; } static struct export_operations eops __attribute__ ((unused))={ .encode_fh = encode_fh, diff --git a/config/kernel-evict-inode.m4 b/config/kernel-evict-inode.m4 index 66f10492de5..87082c9a283 100644 --- a/config/kernel-evict-inode.m4 +++ b/config/kernel-evict-inode.m4 @@ -6,7 +6,7 @@ dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_EVICT_INODE], [ ZFS_LINUX_TEST_SRC([evict_inode], [ #include - void evict_inode (struct inode * t) { return; } + static void evict_inode (struct inode * t) { return; } static struct super_operations sops __attribute__ ((unused)) = { .evict_inode = evict_inode, }; diff --git a/config/kernel-fallocate.m4 b/config/kernel-fallocate.m4 index 815602d3e2c..95186dada45 100644 --- a/config/kernel-fallocate.m4 +++ b/config/kernel-fallocate.m4 @@ -11,7 +11,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_FALLOCATE], [ ZFS_LINUX_TEST_SRC([file_fallocate], [ #include - long test_fallocate(struct file *file, int mode, + static long test_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { return 0; } static const struct file_operations diff --git a/config/kernel-fsync.m4 b/config/kernel-fsync.m4 index d198191d3ab..c155f8af81a 100644 --- a/config/kernel-fsync.m4 +++ b/config/kernel-fsync.m4 @@ -5,7 +5,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_FSYNC], [ ZFS_LINUX_TEST_SRC([fsync_without_dentry], [ #include - int test_fsync(struct file *f, int x) { return 0; } + static int test_fsync(struct file *f, int x) { return 0; } static const struct file_operations fops __attribute__ ((unused)) = { @@ -16,7 +16,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_FSYNC], [ ZFS_LINUX_TEST_SRC([fsync_range], [ #include - int 
test_fsync(struct file *f, loff_t a, loff_t b, int c) + static int test_fsync(struct file *f, loff_t a, loff_t b, int c) { return 0; } static const struct file_operations diff --git a/config/kernel-get-link.m4 b/config/kernel-get-link.m4 index e4f478e37c1..1f8f5b0c8b7 100644 --- a/config/kernel-get-link.m4 +++ b/config/kernel-get-link.m4 @@ -5,7 +5,7 @@ dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_GET_LINK], [ ZFS_LINUX_TEST_SRC([inode_operations_get_link], [ #include - const char *get_link(struct dentry *de, struct inode *ip, + static const char *get_link(struct dentry *de, struct inode *ip, struct delayed_call *done) { return "symlink"; } static struct inode_operations iops __attribute__ ((unused)) = { @@ -15,7 +15,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_GET_LINK], [ ZFS_LINUX_TEST_SRC([inode_operations_get_link_cookie], [ #include - const char *get_link(struct dentry *de, struct + static const char *get_link(struct dentry *de, struct inode *ip, void **cookie) { return "symlink"; } static struct inode_operations iops __attribute__ ((unused)) = { @@ -25,7 +25,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_GET_LINK], [ ZFS_LINUX_TEST_SRC([inode_operations_follow_link], [ #include - const char *follow_link(struct dentry *de, + static const char *follow_link(struct dentry *de, void **cookie) { return "symlink"; } static struct inode_operations iops __attribute__ ((unused)) = { @@ -35,7 +35,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_GET_LINK], [ ZFS_LINUX_TEST_SRC([inode_operations_follow_link_nameidata], [ #include - void *follow_link(struct dentry *de, struct + static void *follow_link(struct dentry *de, struct nameidata *nd) { return (void *)NULL; } static struct inode_operations iops __attribute__ ((unused)) = { diff --git a/config/kernel-inode-create.m4 b/config/kernel-inode-create.m4 index 9e9e4318097..95f8aa2d522 100644 --- a/config/kernel-inode-create.m4 +++ b/config/kernel-inode-create.m4 @@ -7,7 +7,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_CREATE], [ #include #include - int inode_create(struct mnt_idmap 
*idmap, + static int inode_create(struct mnt_idmap *idmap, struct inode *inode ,struct dentry *dentry, umode_t umode, bool flag) { return 0; } @@ -25,7 +25,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_CREATE], [ #include #include - int inode_create(struct user_namespace *userns, + static int inode_create(struct user_namespace *userns, struct inode *inode ,struct dentry *dentry, umode_t umode, bool flag) { return 0; } @@ -42,7 +42,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_CREATE], [ #include #include - int inode_create(struct inode *inode ,struct dentry *dentry, + static int inode_create(struct inode *inode ,struct dentry *dentry, umode_t umode, bool flag) { return 0; } static const struct inode_operations diff --git a/config/kernel-inode-getattr.m4 b/config/kernel-inode-getattr.m4 index c8bfb07862a..5f7ce1ad9a5 100644 --- a/config/kernel-inode-getattr.m4 +++ b/config/kernel-inode-getattr.m4 @@ -7,7 +7,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_GETATTR], [ ZFS_LINUX_TEST_SRC([inode_operations_getattr_mnt_idmap], [ #include - int test_getattr( + static int test_getattr( struct mnt_idmap *idmap, const struct path *p, struct kstat *k, u32 request_mask, unsigned int query_flags) @@ -28,7 +28,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_GETATTR], [ ZFS_LINUX_TEST_SRC([inode_operations_getattr_userns], [ #include - int test_getattr( + static int test_getattr( struct user_namespace *userns, const struct path *p, struct kstat *k, u32 request_mask, unsigned int query_flags) @@ -47,7 +47,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_GETATTR], [ ZFS_LINUX_TEST_SRC([inode_operations_getattr_path], [ #include - int test_getattr( + static int test_getattr( const struct path *p, struct kstat *k, u32 request_mask, unsigned int query_flags) { return 0; } @@ -61,7 +61,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_GETATTR], [ ZFS_LINUX_TEST_SRC([inode_operations_getattr_vfsmount], [ #include - int test_getattr( + static int test_getattr( struct vfsmount *mnt, struct dentry *d, struct kstat *k) { return 0; } diff --git 
a/config/kernel-inode-lookup.m4 b/config/kernel-inode-lookup.m4 index 1a56e69b04a..c7373056422 100644 --- a/config/kernel-inode-lookup.m4 +++ b/config/kernel-inode-lookup.m4 @@ -6,7 +6,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_LOOKUP_FLAGS], [ #include #include - struct dentry *inode_lookup(struct inode *inode, + static struct dentry *inode_lookup(struct inode *inode, struct dentry *dentry, unsigned int flags) { return NULL; } static const struct inode_operations iops diff --git a/config/kernel-inode-permission.m4 b/config/kernel-inode-permission.m4 index 01d23635b0c..aef4005c406 100644 --- a/config/kernel-inode-permission.m4 +++ b/config/kernel-inode-permission.m4 @@ -8,7 +8,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_PERMISSION], [ #include #include - int inode_permission(struct mnt_idmap *idmap, + static int inode_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { return 0; } static const struct inode_operations @@ -25,7 +25,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_PERMISSION], [ #include #include - int inode_permission(struct user_namespace *userns, + static int inode_permission(struct user_namespace *userns, struct inode *inode, int mask) { return 0; } static const struct inode_operations diff --git a/config/kernel-inode-setattr.m4 b/config/kernel-inode-setattr.m4 index 45755b4eb27..69289e897be 100644 --- a/config/kernel-inode-setattr.m4 +++ b/config/kernel-inode-setattr.m4 @@ -7,7 +7,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_SETATTR], [ ZFS_LINUX_TEST_SRC([inode_operations_setattr_mnt_idmap], [ #include - int test_setattr( + static int test_setattr( struct mnt_idmap *idmap, struct dentry *de, struct iattr *ia) { return 0; } @@ -27,7 +27,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_SETATTR], [ ZFS_LINUX_TEST_SRC([inode_operations_setattr_userns], [ #include - int test_setattr( + static int test_setattr( struct user_namespace *userns, struct dentry *de, struct iattr *ia) { return 0; } @@ -41,7 +41,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_INODE_SETATTR], [ 
ZFS_LINUX_TEST_SRC([inode_operations_setattr], [ #include - int test_setattr( + static int test_setattr( struct dentry *de, struct iattr *ia) { return 0; } diff --git a/config/kernel-make-request-fn.m4 b/config/kernel-make-request-fn.m4 index f17416acca6..4d20dd45c4a 100644 --- a/config/kernel-make-request-fn.m4 +++ b/config/kernel-make-request-fn.m4 @@ -4,7 +4,7 @@ dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN], [ ZFS_LINUX_TEST_SRC([make_request_fn_void], [ #include - void make_request(struct request_queue *q, + static void make_request(struct request_queue *q, struct bio *bio) { return; } ],[ blk_queue_make_request(NULL, &make_request); @@ -12,7 +12,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN], [ ZFS_LINUX_TEST_SRC([make_request_fn_blk_qc_t], [ #include - blk_qc_t make_request(struct request_queue *q, + static blk_qc_t make_request(struct request_queue *q, struct bio *bio) { return (BLK_QC_T_NONE); } ],[ blk_queue_make_request(NULL, &make_request); @@ -20,7 +20,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN], [ ZFS_LINUX_TEST_SRC([blk_alloc_queue_request_fn], [ #include - blk_qc_t make_request(struct request_queue *q, + static blk_qc_t make_request(struct request_queue *q, struct bio *bio) { return (BLK_QC_T_NONE); } ],[ struct request_queue *q __attribute__ ((unused)); @@ -29,7 +29,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_MAKE_REQUEST_FN], [ ZFS_LINUX_TEST_SRC([blk_alloc_queue_request_fn_rh], [ #include - blk_qc_t make_request(struct request_queue *q, + static blk_qc_t make_request(struct request_queue *q, struct bio *bio) { return (BLK_QC_T_NONE); } ],[ struct request_queue *q __attribute__ ((unused)); diff --git a/config/kernel-mkdir.m4 b/config/kernel-mkdir.m4 index 7407a791b84..367f100094d 100644 --- a/config/kernel-mkdir.m4 +++ b/config/kernel-mkdir.m4 @@ -9,7 +9,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_MKDIR], [ ZFS_LINUX_TEST_SRC([mkdir_mnt_idmap], [ #include - int mkdir(struct mnt_idmap *idmap, + static int mkdir(struct mnt_idmap *idmap, struct inode *inode, 
struct dentry *dentry, umode_t umode) { return 0; } static const struct inode_operations @@ -26,7 +26,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_MKDIR], [ ZFS_LINUX_TEST_SRC([mkdir_user_namespace], [ #include - int mkdir(struct user_namespace *userns, + static int mkdir(struct user_namespace *userns, struct inode *inode, struct dentry *dentry, umode_t umode) { return 0; } @@ -47,7 +47,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_MKDIR], [ ZFS_LINUX_TEST_SRC([inode_operations_mkdir], [ #include - int mkdir(struct inode *inode, struct dentry *dentry, + static int mkdir(struct inode *inode, struct dentry *dentry, umode_t umode) { return 0; } static const struct inode_operations diff --git a/config/kernel-mknod.m4 b/config/kernel-mknod.m4 index 1494ec1ae4d..6ad3453aaf0 100644 --- a/config/kernel-mknod.m4 +++ b/config/kernel-mknod.m4 @@ -7,7 +7,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_MKNOD], [ #include #include - int tmp_mknod(struct mnt_idmap *idmap, + static int tmp_mknod(struct mnt_idmap *idmap, struct inode *inode ,struct dentry *dentry, umode_t u, dev_t d) { return 0; } @@ -25,7 +25,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_MKNOD], [ #include #include - int tmp_mknod(struct user_namespace *userns, + static int tmp_mknod(struct user_namespace *userns, struct inode *inode ,struct dentry *dentry, umode_t u, dev_t d) { return 0; } diff --git a/config/kernel-proc-operations.m4 b/config/kernel-proc-operations.m4 index df216222ecc..3ae8ce2b6d0 100644 --- a/config/kernel-proc-operations.m4 +++ b/config/kernel-proc-operations.m4 @@ -7,14 +7,14 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_PROC_OPERATIONS], [ ZFS_LINUX_TEST_SRC([proc_ops_struct], [ #include - int test_open(struct inode *ip, struct file *fp) { return 0; } - ssize_t test_read(struct file *fp, char __user *ptr, + static int test_open(struct inode *ip, struct file *fp) { return 0; } + static ssize_t test_read(struct file *fp, char __user *ptr, size_t size, loff_t *offp) { return 0; } - ssize_t test_write(struct file *fp, const char __user *ptr, + static ssize_t 
test_write(struct file *fp, const char __user *ptr, size_t size, loff_t *offp) { return 0; } - loff_t test_lseek(struct file *fp, loff_t off, int flag) + static loff_t test_lseek(struct file *fp, loff_t off, int flag) { return 0; } - int test_release(struct inode *ip, struct file *fp) + static int test_release(struct inode *ip, struct file *fp) { return 0; } const struct proc_ops test_ops __attribute__ ((unused)) = { diff --git a/config/kernel-put-link.m4 b/config/kernel-put-link.m4 index 4234861f334..8ab318cbff8 100644 --- a/config/kernel-put-link.m4 +++ b/config/kernel-put-link.m4 @@ -4,7 +4,7 @@ dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_PUT_LINK], [ ZFS_LINUX_TEST_SRC([put_link_cookie], [ #include - void put_link(struct inode *ip, void *cookie) + static void put_link(struct inode *ip, void *cookie) { return; } static struct inode_operations iops __attribute__ ((unused)) = { @@ -14,7 +14,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_PUT_LINK], [ ZFS_LINUX_TEST_SRC([put_link_nameidata], [ #include - void put_link(struct dentry *de, struct + static void put_link(struct dentry *de, struct nameidata *nd, void *ptr) { return; } static struct inode_operations iops __attribute__ ((unused)) = { diff --git a/config/kernel-rename.m4 b/config/kernel-rename.m4 index 57c3eed7897..ce881502d1b 100644 --- a/config/kernel-rename.m4 +++ b/config/kernel-rename.m4 @@ -8,7 +8,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_RENAME], [ dnl # ZFS_LINUX_TEST_SRC([inode_operations_rename2], [ #include - int rename2_fn(struct inode *sip, struct dentry *sdp, + static int rename2_fn(struct inode *sip, struct dentry *sdp, struct inode *tip, struct dentry *tdp, unsigned int flags) { return 0; } @@ -26,7 +26,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_RENAME], [ dnl # ZFS_LINUX_TEST_SRC([inode_operations_rename_flags], [ #include - int rename_fn(struct inode *sip, struct dentry *sdp, + static int rename_fn(struct inode *sip, struct dentry *sdp, struct inode *tip, struct dentry *tdp, unsigned int flags) { return 0; } @@ -44,7 +44,7 @@ 
AC_DEFUN([ZFS_AC_KERNEL_SRC_RENAME], [ dnl # ZFS_LINUX_TEST_SRC([dir_inode_operations_wrapper_rename2], [ #include - int rename2_fn(struct inode *sip, struct dentry *sdp, + static int rename2_fn(struct inode *sip, struct dentry *sdp, struct inode *tip, struct dentry *tdp, unsigned int flags) { return 0; } @@ -62,7 +62,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_RENAME], [ dnl # ZFS_LINUX_TEST_SRC([inode_operations_rename_userns], [ #include - int rename_fn(struct user_namespace *user_ns, struct inode *sip, + static int rename_fn(struct user_namespace *user_ns, struct inode *sip, struct dentry *sdp, struct inode *tip, struct dentry *tdp, unsigned int flags) { return 0; } @@ -77,7 +77,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_RENAME], [ dnl # ZFS_LINUX_TEST_SRC([inode_operations_rename_mnt_idmap], [ #include - int rename_fn(struct mnt_idmap *idmap, struct inode *sip, + static int rename_fn(struct mnt_idmap *idmap, struct inode *sip, struct dentry *sdp, struct inode *tip, struct dentry *tdp, unsigned int flags) { return 0; } diff --git a/config/kernel-show-options.m4 b/config/kernel-show-options.m4 index 93bd5fbfbb2..fd62f30086d 100644 --- a/config/kernel-show-options.m4 +++ b/config/kernel-show-options.m4 @@ -5,7 +5,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_SHOW_OPTIONS], [ ZFS_LINUX_TEST_SRC([super_operations_show_options], [ #include - int show_options(struct seq_file * x, struct dentry * y) { + static int show_options(struct seq_file * x, struct dentry * y) { return 0; }; diff --git a/config/kernel-shrink.m4 b/config/kernel-shrink.m4 index 4a529c43b5b..6580b08d5ff 100644 --- a/config/kernel-shrink.m4 +++ b/config/kernel-shrink.m4 @@ -8,9 +8,6 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_SUPER_BLOCK_S_SHRINK], [ ZFS_LINUX_TEST_SRC([super_block_s_shrink], [ #include - int shrink(struct shrinker *s, struct shrink_control *sc) - { return 0; } - static const struct super_block sb __attribute__ ((unused)) = { .s_shrink.seeks = DEFAULT_SEEKS, @@ -26,7 +23,7 @@ dnl # 
AC_DEFUN([ZFS_AC_KERNEL_SRC_SUPER_BLOCK_S_SHRINK_PTR], [ ZFS_LINUX_TEST_SRC([super_block_s_shrink_ptr], [ #include - unsigned long shrinker_cb(struct shrinker *shrink, + static unsigned long shrinker_cb(struct shrinker *shrink, struct shrink_control *sc) { return 0; } static struct shrinker shrinker = { .count_objects = shrinker_cb, @@ -89,7 +86,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SHRINK_CONTROL_HAS_NID], [ AC_DEFUN([ZFS_AC_KERNEL_SRC_REGISTER_SHRINKER_VARARG], [ ZFS_LINUX_TEST_SRC([register_shrinker_vararg], [ #include - unsigned long shrinker_cb(struct shrinker *shrink, + static unsigned long shrinker_cb(struct shrinker *shrink, struct shrink_control *sc) { return 0; } ],[ struct shrinker cache_shrinker = { @@ -104,7 +101,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_REGISTER_SHRINKER_VARARG], [ AC_DEFUN([ZFS_AC_KERNEL_SRC_SHRINKER_CALLBACK], [ ZFS_LINUX_TEST_SRC([shrinker_cb_shrink_control], [ #include - int shrinker_cb(struct shrinker *shrink, + static int shrinker_cb(struct shrinker *shrink, struct shrink_control *sc) { return 0; } ],[ struct shrinker cache_shrinker = { @@ -116,7 +113,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_SHRINKER_CALLBACK], [ ZFS_LINUX_TEST_SRC([shrinker_cb_shrink_control_split], [ #include - unsigned long shrinker_cb(struct shrinker *shrink, + static unsigned long shrinker_cb(struct shrinker *shrink, struct shrink_control *sc) { return 0; } ],[ struct shrinker cache_shrinker = { @@ -135,7 +132,7 @@ dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_SHRINKER_REGISTER], [ ZFS_LINUX_TEST_SRC([shrinker_register], [ #include - unsigned long shrinker_cb(struct shrinker *shrink, + static unsigned long shrinker_cb(struct shrinker *shrink, struct shrink_control *sc) { return 0; } ],[ struct shrinker cache_shrinker = { diff --git a/config/kernel-symlink.m4 b/config/kernel-symlink.m4 index a0333ed66a7..804fceab28f 100644 --- a/config/kernel-symlink.m4 +++ b/config/kernel-symlink.m4 @@ -6,7 +6,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_SYMLINK], [ ZFS_LINUX_TEST_SRC([symlink_mnt_idmap], [ #include 
#include - int tmp_symlink(struct mnt_idmap *idmap, + static int tmp_symlink(struct mnt_idmap *idmap, struct inode *inode ,struct dentry *dentry, const char *path) { return 0; } @@ -23,7 +23,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_SYMLINK], [ #include #include - int tmp_symlink(struct user_namespace *userns, + static int tmp_symlink(struct user_namespace *userns, struct inode *inode ,struct dentry *dentry, const char *path) { return 0; } diff --git a/config/kernel-timer.m4 b/config/kernel-timer.m4 index 403cff3f418..c710e804be0 100644 --- a/config/kernel-timer.m4 +++ b/config/kernel-timer.m4 @@ -18,7 +18,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_TIMER_SETUP], [ int data; }; - void task_expire(struct timer_list *tl) + static void task_expire(struct timer_list *tl) { struct my_task_timer *task_timer = from_timer(task_timer, tl, timer); @@ -31,7 +31,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_TIMER_SETUP], [ ZFS_LINUX_TEST_SRC([timer_list_function], [ #include - void task_expire(struct timer_list *tl) {} + static void task_expire(struct timer_list *tl) {} ],[ struct timer_list tl; tl.function = task_expire; diff --git a/config/kernel-tmpfile.m4 b/config/kernel-tmpfile.m4 index cc18b8f65a8..7439514186e 100644 --- a/config/kernel-tmpfile.m4 +++ b/config/kernel-tmpfile.m4 @@ -9,7 +9,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_TMPFILE], [ dnl # ZFS_LINUX_TEST_SRC([inode_operations_tmpfile_mnt_idmap], [ #include - int tmpfile(struct mnt_idmap *idmap, + static int tmpfile(struct mnt_idmap *idmap, struct inode *inode, struct file *file, umode_t mode) { return 0; } static struct inode_operations @@ -22,7 +22,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_TMPFILE], [ dnl # ZFS_LINUX_TEST_SRC([inode_operations_tmpfile], [ #include - int tmpfile(struct user_namespace *userns, + static int tmpfile(struct user_namespace *userns, struct inode *inode, struct file *file, umode_t mode) { return 0; } static struct inode_operations @@ -36,7 +36,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_TMPFILE], [ dnl # 
ZFS_LINUX_TEST_SRC([inode_operations_tmpfile_dentry_userns], [ #include - int tmpfile(struct user_namespace *userns, + static int tmpfile(struct user_namespace *userns, struct inode *inode, struct dentry *dentry, umode_t mode) { return 0; } static struct inode_operations @@ -46,7 +46,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_TMPFILE], [ ],[]) ZFS_LINUX_TEST_SRC([inode_operations_tmpfile_dentry], [ #include - int tmpfile(struct inode *inode, struct dentry *dentry, + static int tmpfile(struct inode *inode, struct dentry *dentry, umode_t mode) { return 0; } static struct inode_operations iops __attribute__ ((unused)) = { diff --git a/config/kernel-vfs-direct_IO.m4 b/config/kernel-vfs-direct_IO.m4 index 82583d52fcb..7b7b91f979f 100644 --- a/config/kernel-vfs-direct_IO.m4 +++ b/config/kernel-vfs-direct_IO.m4 @@ -5,7 +5,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO], [ ZFS_LINUX_TEST_SRC([direct_io_iter], [ #include - ssize_t test_direct_IO(struct kiocb *kiocb, + static ssize_t test_direct_IO(struct kiocb *kiocb, struct iov_iter *iter) { return 0; } static const struct address_space_operations @@ -17,7 +17,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO], [ ZFS_LINUX_TEST_SRC([direct_io_iter_offset], [ #include - ssize_t test_direct_IO(struct kiocb *kiocb, + static ssize_t test_direct_IO(struct kiocb *kiocb, struct iov_iter *iter, loff_t offset) { return 0; } static const struct address_space_operations @@ -29,7 +29,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO], [ ZFS_LINUX_TEST_SRC([direct_io_iter_rw_offset], [ #include - ssize_t test_direct_IO(int rw, struct kiocb *kiocb, + static ssize_t test_direct_IO(int rw, struct kiocb *kiocb, struct iov_iter *iter, loff_t offset) { return 0; } static const struct address_space_operations @@ -41,7 +41,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_DIRECT_IO], [ ZFS_LINUX_TEST_SRC([direct_io_iovec], [ #include - ssize_t test_direct_IO(int rw, struct kiocb *kiocb, + static ssize_t test_direct_IO(int rw, struct kiocb *kiocb, const struct iovec *iov, 
loff_t offset, unsigned long nr_segs) { return 0; } diff --git a/config/kernel-vfs-iterate.m4 b/config/kernel-vfs-iterate.m4 index 172118eac87..2e396daa1c0 100644 --- a/config/kernel-vfs-iterate.m4 +++ b/config/kernel-vfs-iterate.m4 @@ -1,7 +1,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_ITERATE], [ ZFS_LINUX_TEST_SRC([file_operations_iterate_shared], [ #include - int iterate(struct file *filp, struct dir_context * context) + static int iterate(struct file *filp, struct dir_context * context) { return 0; } static const struct file_operations fops @@ -12,7 +12,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_ITERATE], [ ZFS_LINUX_TEST_SRC([file_operations_iterate], [ #include - int iterate(struct file *filp, + static int iterate(struct file *filp, struct dir_context *context) { return 0; } static const struct file_operations fops @@ -27,7 +27,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_ITERATE], [ ZFS_LINUX_TEST_SRC([file_operations_readdir], [ #include - int readdir(struct file *filp, void *entry, + static int readdir(struct file *filp, void *entry, filldir_t func) { return 0; } static const struct file_operations fops diff --git a/config/kernel-vfs-rw-iterate.m4 b/config/kernel-vfs-rw-iterate.m4 index 000353ec15b..cb20ed03099 100644 --- a/config/kernel-vfs-rw-iterate.m4 +++ b/config/kernel-vfs-rw-iterate.m4 @@ -5,9 +5,9 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_RW_ITERATE], [ ZFS_LINUX_TEST_SRC([file_operations_rw], [ #include - ssize_t test_read(struct kiocb *kiocb, struct iov_iter *to) + static ssize_t test_read(struct kiocb *kiocb, struct iov_iter *to) { return 0; } - ssize_t test_write(struct kiocb *kiocb, struct iov_iter *from) + static ssize_t test_write(struct kiocb *kiocb, struct iov_iter *from) { return 0; } static const struct file_operations diff --git a/config/kernel-writepage_t.m4 b/config/kernel-writepage_t.m4 index 3a0cffd9857..a82cf370c9d 100644 --- a/config/kernel-writepage_t.m4 +++ b/config/kernel-writepage_t.m4 @@ -6,7 +6,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_WRITEPAGE_T], [ dnl # 
ZFS_LINUX_TEST_SRC([writepage_t_folio], [ #include - int putpage(struct folio *folio, + static int putpage(struct folio *folio, struct writeback_control *wbc, void *data) { return 0; } writepage_t func = putpage; diff --git a/config/kernel-xattr-handler.m4 b/config/kernel-xattr-handler.m4 index 6b8a08dbcc8..32f58c70a50 100644 --- a/config/kernel-xattr-handler.m4 +++ b/config/kernel-xattr-handler.m4 @@ -68,7 +68,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_XATTR_HANDLER_GET], [ ZFS_LINUX_TEST_SRC([xattr_handler_get_dentry_inode], [ #include - int get(const struct xattr_handler *handler, + static int get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size) { return 0; } static const struct xattr_handler @@ -80,7 +80,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_XATTR_HANDLER_GET], [ ZFS_LINUX_TEST_SRC([xattr_handler_get_xattr_handler], [ #include - int get(const struct xattr_handler *handler, + static int get(const struct xattr_handler *handler, struct dentry *dentry, const char *name, void *buffer, size_t size) { return 0; } static const struct xattr_handler @@ -92,7 +92,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_XATTR_HANDLER_GET], [ ZFS_LINUX_TEST_SRC([xattr_handler_get_dentry], [ #include - int get(struct dentry *dentry, const char *name, + static int get(struct dentry *dentry, const char *name, void *buffer, size_t size, int handler_flags) { return 0; } static const struct xattr_handler @@ -104,7 +104,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_XATTR_HANDLER_GET], [ ZFS_LINUX_TEST_SRC([xattr_handler_get_dentry_inode_flags], [ #include - int get(const struct xattr_handler *handler, + static int get(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, void *buffer, size_t size, int flags) { return 0; } @@ -182,7 +182,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_XATTR_HANDLER_SET], [ ZFS_LINUX_TEST_SRC([xattr_handler_set_mnt_idmap], [ #include - int set(const struct xattr_handler *handler, + static int 
set(const struct xattr_handler *handler, struct mnt_idmap *idmap, struct dentry *dentry, struct inode *inode, const char *name, const void *buffer, @@ -197,7 +197,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_XATTR_HANDLER_SET], [ ZFS_LINUX_TEST_SRC([xattr_handler_set_userns], [ #include - int set(const struct xattr_handler *handler, + static int set(const struct xattr_handler *handler, struct user_namespace *mnt_userns, struct dentry *dentry, struct inode *inode, const char *name, const void *buffer, @@ -212,7 +212,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_XATTR_HANDLER_SET], [ ZFS_LINUX_TEST_SRC([xattr_handler_set_dentry_inode], [ #include - int set(const struct xattr_handler *handler, + static int set(const struct xattr_handler *handler, struct dentry *dentry, struct inode *inode, const char *name, const void *buffer, size_t size, int flags) @@ -226,7 +226,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_XATTR_HANDLER_SET], [ ZFS_LINUX_TEST_SRC([xattr_handler_set_xattr_handler], [ #include - int set(const struct xattr_handler *handler, + static int set(const struct xattr_handler *handler, struct dentry *dentry, const char *name, const void *buffer, size_t size, int flags) { return 0; } @@ -239,7 +239,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_XATTR_HANDLER_SET], [ ZFS_LINUX_TEST_SRC([xattr_handler_set_dentry], [ #include - int set(struct dentry *dentry, const char *name, + static int set(struct dentry *dentry, const char *name, const void *buffer, size_t size, int flags, int handler_flags) { return 0; } static const struct xattr_handler @@ -325,7 +325,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_XATTR_HANDLER_LIST], [ ZFS_LINUX_TEST_SRC([xattr_handler_list_simple], [ #include - bool list(struct dentry *dentry) { return 0; } + static bool list(struct dentry *dentry) { return 0; } static const struct xattr_handler xops __attribute__ ((unused)) = { .list = list, @@ -335,7 +335,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_XATTR_HANDLER_LIST], [ ZFS_LINUX_TEST_SRC([xattr_handler_list_xattr_handler], [ #include - size_t list(const struct 
xattr_handler *handler, + static size_t list(const struct xattr_handler *handler, struct dentry *dentry, char *list, size_t list_size, const char *name, size_t name_len) { return 0; } static const struct xattr_handler @@ -347,7 +347,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_XATTR_HANDLER_LIST], [ ZFS_LINUX_TEST_SRC([xattr_handler_list_dentry], [ #include - size_t list(struct dentry *dentry, + static size_t list(struct dentry *dentry, char *list, size_t list_size, const char *name, size_t name_len, int handler_flags) { return 0; } From ce782d080432506a41b49df32af6f0013b5775db Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 23 Jan 2024 15:42:57 +1100 Subject: [PATCH 69/91] Linux 6.8 compat: update for new bdev access functions blkdev_get_by_path() and blkdev_put() have been replaced by bdev_open_by_path() and bdev_release(), which return a "handle" object with the bdev object itself inside. This adds detection for the new functions, and macros to handle the old and new forms consistently. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ Closes #15805 --- config/kernel-blkdev.m4 | 56 ++++++++++++- module/os/linux/zfs/vdev_disk.c | 137 ++++++++++++++++++-------------- 2 files changed, 133 insertions(+), 60 deletions(-) diff --git a/config/kernel-blkdev.m4 b/config/kernel-blkdev.m4 index e04a2bd2c3b..8e9e638b125 100644 --- a/config/kernel-blkdev.m4 +++ b/config/kernel-blkdev.m4 @@ -35,6 +35,25 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH_4ARG], [ ]) ]) +dnl # +dnl # 6.8.x API change +dnl # bdev_open_by_path() replaces blkdev_get_by_path() +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_OPEN_BY_PATH], [ + ZFS_LINUX_TEST_SRC([bdev_open_by_path], [ + #include + #include + ], [ + struct bdev_handle *bdh __attribute__ ((unused)) = NULL; + const char *path = "path"; + fmode_t mode = 0; + void *holder = NULL; + struct blk_holder_ops h; + + bdh = bdev_open_by_path(path, mode, holder, &h); + ]) +]) + 
AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH], [ AC_MSG_CHECKING([whether blkdev_get_by_path() exists and takes 3 args]) ZFS_LINUX_TEST_RESULT([blkdev_get_by_path], [ @@ -47,7 +66,15 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH], [ [blkdev_get_by_path() exists and takes 4 args]) AC_MSG_RESULT(yes) ], [ - ZFS_LINUX_TEST_ERROR([blkdev_get_by_path()]) + AC_MSG_RESULT(no) + AC_MSG_CHECKING([whether bdev_open_by_path() exists]) + ZFS_LINUX_TEST_RESULT([bdev_open_by_path], [ + AC_DEFINE(HAVE_BDEV_OPEN_BY_PATH, 1, + [bdev_open_by_path() exists]) + AC_MSG_RESULT(yes) + ], [ + ZFS_LINUX_TEST_ERROR([blkdev_get_by_path()]) + ]) ]) ]) ]) @@ -108,18 +135,41 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_PUT_HOLDER], [ ]) ]) +dnl # +dnl # 6.8.x API change +dnl # bdev_release() replaces blkdev_put() +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_RELEASE], [ + ZFS_LINUX_TEST_SRC([bdev_release], [ + #include + #include + ], [ + struct bdev_handle *bdh = NULL; + bdev_release(bdh); + ]) +]) + AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_PUT], [ AC_MSG_CHECKING([whether blkdev_put() exists]) ZFS_LINUX_TEST_RESULT([blkdev_put], [ AC_MSG_RESULT(yes) ], [ + AC_MSG_RESULT(no) AC_MSG_CHECKING([whether blkdev_put() accepts void* as arg 2]) ZFS_LINUX_TEST_RESULT([blkdev_put_holder], [ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_BLKDEV_PUT_HOLDER, 1, [blkdev_put() accepts void* as arg 2]) ], [ - ZFS_LINUX_TEST_ERROR([blkdev_put()]) + AC_MSG_RESULT(no) + AC_MSG_CHECKING([whether bdev_release() exists]) + ZFS_LINUX_TEST_RESULT([bdev_release], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BDEV_RELEASE, 1, + [bdev_release() exists]) + ], [ + ZFS_LINUX_TEST_ERROR([blkdev_put()]) + ]) ]) ]) ]) @@ -570,8 +620,10 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BLK_STS_RESV_CONFLICT], [ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [ ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH_4ARG + ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_OPEN_BY_PATH ZFS_AC_KERNEL_SRC_BLKDEV_PUT ZFS_AC_KERNEL_SRC_BLKDEV_PUT_HOLDER + ZFS_AC_KERNEL_SRC_BLKDEV_BDEV_RELEASE 
ZFS_AC_KERNEL_SRC_BLKDEV_REREAD_PART ZFS_AC_KERNEL_SRC_BLKDEV_INVALIDATE_BDEV ZFS_AC_KERNEL_SRC_BLKDEV_LOOKUP_BDEV diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 8b5aa94fe4f..e7f0aa57384 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -41,8 +41,28 @@ #include #endif +/* + * Linux 6.8.x uses a bdev_handle as an instance/refcount for an underlying + * block_device. Since it carries the block_device inside, it's convenient to + * just use the handle as a proxy. For pre-6.8, we just emulate this with + * a cast, since we don't need any of the other fields inside the handle. + */ +#ifdef HAVE_BDEV_OPEN_BY_PATH +typedef struct bdev_handle zfs_bdev_handle_t; +#define BDH_BDEV(bdh) ((bdh)->bdev) +#define BDH_IS_ERR(bdh) (IS_ERR(bdh)) +#define BDH_PTR_ERR(bdh) (PTR_ERR(bdh)) +#define BDH_ERR_PTR(err) (ERR_PTR(err)) +#else +typedef void zfs_bdev_handle_t; +#define BDH_BDEV(bdh) ((struct block_device *)bdh) +#define BDH_IS_ERR(bdh) (IS_ERR(BDH_BDEV(bdh))) +#define BDH_PTR_ERR(bdh) (PTR_ERR(BDH_BDEV(bdh))) +#define BDH_ERR_PTR(err) (ERR_PTR(err)) +#endif + typedef struct vdev_disk { - struct block_device *vd_bdev; + zfs_bdev_handle_t *vd_bdh; krwlock_t vd_lock; } vdev_disk_t; @@ -209,29 +229,23 @@ static void vdev_disk_kobj_evt_post(vdev_t *v) { vdev_disk_t *vd = v->vdev_tsd; - if (vd && vd->vd_bdev) { - spl_signal_kobj_evt(vd->vd_bdev); + if (vd && vd->vd_bdh) { + spl_signal_kobj_evt(BDH_BDEV(vd->vd_bdh)); } else { vdev_dbgmsg(v, "vdev_disk_t is NULL for VDEV:%s\n", v->vdev_path); } } -#if !defined(HAVE_BLKDEV_GET_BY_PATH_4ARG) -/* - * Define a dummy struct blk_holder_ops for kernel versions - * prior to 6.5. 
- */ -struct blk_holder_ops {}; -#endif - -static struct block_device * -vdev_blkdev_get_by_path(const char *path, spa_mode_t mode, void *holder, - const struct blk_holder_ops *hops) +static zfs_bdev_handle_t * +vdev_blkdev_get_by_path(const char *path, spa_mode_t mode, void *holder) { -#ifdef HAVE_BLKDEV_GET_BY_PATH_4ARG +#if defined(HAVE_BDEV_OPEN_BY_PATH) + return (bdev_open_by_path(path, + vdev_bdev_mode(mode, B_TRUE), holder, NULL)); +#elif defined(HAVE_BLKDEV_GET_BY_PATH_4ARG) return (blkdev_get_by_path(path, - vdev_bdev_mode(mode, B_TRUE), holder, hops)); + vdev_bdev_mode(mode, B_TRUE), holder, NULL)); #else return (blkdev_get_by_path(path, vdev_bdev_mode(mode, B_TRUE), holder)); @@ -239,12 +253,15 @@ vdev_blkdev_get_by_path(const char *path, spa_mode_t mode, void *holder, } static void -vdev_blkdev_put(struct block_device *bdev, spa_mode_t mode, void *holder) +vdev_blkdev_put(zfs_bdev_handle_t *bdh, spa_mode_t mode, void *holder) { -#ifdef HAVE_BLKDEV_PUT_HOLDER - return (blkdev_put(bdev, holder)); +#if defined(HAVE_BDEV_RELEASE) + return (bdev_release(bdh)); +#elif defined(HAVE_BLKDEV_PUT_HOLDER) + return (blkdev_put(BDH_BDEV(bdh), holder)); #else - return (blkdev_put(bdev, vdev_bdev_mode(mode, B_TRUE))); + return (blkdev_put(BDH_BDEV(bdh), + vdev_bdev_mode(mode, B_TRUE))); #endif } @@ -252,7 +269,7 @@ static int vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift, uint64_t *physical_ashift) { - struct block_device *bdev; + zfs_bdev_handle_t *bdh; #ifdef HAVE_BLK_MODE_T blk_mode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa), B_FALSE); #else @@ -282,10 +299,11 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, boolean_t reread_part = B_FALSE; rw_enter(&vd->vd_lock, RW_WRITER); - bdev = vd->vd_bdev; - vd->vd_bdev = NULL; + bdh = vd->vd_bdh; + vd->vd_bdh = NULL; - if (bdev) { + if (bdh) { + struct block_device *bdev = BDH_BDEV(bdh); if (v->vdev_expanding && bdev != bdev_whole(bdev)) { 
vdev_bdevname(bdev_whole(bdev), disk_name + 5); /* @@ -307,15 +325,16 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, reread_part = B_TRUE; } - vdev_blkdev_put(bdev, mode, zfs_vdev_holder); + vdev_blkdev_put(bdh, mode, zfs_vdev_holder); } if (reread_part) { - bdev = vdev_blkdev_get_by_path(disk_name, mode, - zfs_vdev_holder, NULL); - if (!IS_ERR(bdev)) { - int error = vdev_bdev_reread_part(bdev); - vdev_blkdev_put(bdev, mode, zfs_vdev_holder); + bdh = vdev_blkdev_get_by_path(disk_name, mode, + zfs_vdev_holder); + if (!BDH_IS_ERR(bdh)) { + int error = + vdev_bdev_reread_part(BDH_BDEV(bdh)); + vdev_blkdev_put(bdh, mode, zfs_vdev_holder); if (error == 0) { timeout = MSEC2NSEC( zfs_vdev_open_timeout_ms * 2); @@ -358,11 +377,11 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, * subsequent attempts are expected to eventually succeed. */ hrtime_t start = gethrtime(); - bdev = ERR_PTR(-ENXIO); - while (IS_ERR(bdev) && ((gethrtime() - start) < timeout)) { - bdev = vdev_blkdev_get_by_path(v->vdev_path, mode, - zfs_vdev_holder, NULL); - if (unlikely(PTR_ERR(bdev) == -ENOENT)) { + bdh = BDH_ERR_PTR(-ENXIO); + while (BDH_IS_ERR(bdh) && ((gethrtime() - start) < timeout)) { + bdh = vdev_blkdev_get_by_path(v->vdev_path, mode, + zfs_vdev_holder); + if (unlikely(BDH_PTR_ERR(bdh) == -ENOENT)) { /* * There is no point of waiting since device is removed * explicitly @@ -371,52 +390,54 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, break; schedule_timeout(MSEC_TO_TICK(10)); - } else if (unlikely(PTR_ERR(bdev) == -ERESTARTSYS)) { + } else if (unlikely(BDH_PTR_ERR(bdh) == -ERESTARTSYS)) { timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms * 10); continue; - } else if (IS_ERR(bdev)) { + } else if (BDH_IS_ERR(bdh)) { break; } } - if (IS_ERR(bdev)) { - int error = -PTR_ERR(bdev); + if (BDH_IS_ERR(bdh)) { + int error = -BDH_PTR_ERR(bdh); vdev_dbgmsg(v, "open error=%d timeout=%llu/%llu", error, (u_longlong_t)(gethrtime() - start), 
(u_longlong_t)timeout); - vd->vd_bdev = NULL; + vd->vd_bdh = NULL; v->vdev_tsd = vd; rw_exit(&vd->vd_lock); return (SET_ERROR(error)); } else { - vd->vd_bdev = bdev; + vd->vd_bdh = bdh; v->vdev_tsd = vd; rw_exit(&vd->vd_lock); } + struct block_device *bdev = BDH_BDEV(vd->vd_bdh); + /* Determine the physical block size */ - int physical_block_size = bdev_physical_block_size(vd->vd_bdev); + int physical_block_size = bdev_physical_block_size(bdev); /* Determine the logical block size */ - int logical_block_size = bdev_logical_block_size(vd->vd_bdev); + int logical_block_size = bdev_logical_block_size(bdev); /* Clear the nowritecache bit, causes vdev_reopen() to try again. */ v->vdev_nowritecache = B_FALSE; /* Set when device reports it supports TRIM. */ - v->vdev_has_trim = bdev_discard_supported(vd->vd_bdev); + v->vdev_has_trim = bdev_discard_supported(bdev); /* Set when device reports it supports secure TRIM. */ - v->vdev_has_securetrim = bdev_secure_discard_supported(vd->vd_bdev); + v->vdev_has_securetrim = bdev_secure_discard_supported(bdev); /* Inform the ZIO pipeline that we are non-rotational */ - v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(vd->vd_bdev)); + v->vdev_nonrot = blk_queue_nonrot(bdev_get_queue(bdev)); /* Physical volume size in bytes for the partition */ - *psize = bdev_capacity(vd->vd_bdev); + *psize = bdev_capacity(bdev); /* Physical volume size in bytes including possible expansion space */ - *max_psize = bdev_max_capacity(vd->vd_bdev, v->vdev_wholedisk); + *max_psize = bdev_max_capacity(bdev, v->vdev_wholedisk); /* Based on the minimum sector size set the block size */ *physical_ashift = highbit64(MAX(physical_block_size, @@ -436,8 +457,8 @@ vdev_disk_close(vdev_t *v) if (v->vdev_reopening || vd == NULL) return; - if (vd->vd_bdev != NULL) { - vdev_blkdev_put(vd->vd_bdev, spa_mode(v->vdev_spa), + if (vd->vd_bdh != NULL) { + vdev_blkdev_put(vd->vd_bdh, spa_mode(v->vdev_spa), zfs_vdev_holder); } @@ -849,10 +870,10 @@ vdev_disk_io_trim(zio_t 
*zio) #if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) if (zio->io_trim_flags & ZIO_TRIM_SECURE) { - return (-blkdev_issue_secure_erase(vd->vd_bdev, + return (-blkdev_issue_secure_erase(BDH_BDEV(vd->vd_bdh), zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS)); } else { - return (-blkdev_issue_discard(vd->vd_bdev, + return (-blkdev_issue_discard(BDH_BDEV(vd->vd_bdh), zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS)); } #elif defined(HAVE_BLKDEV_ISSUE_DISCARD) @@ -861,7 +882,7 @@ vdev_disk_io_trim(zio_t *zio) if (zio->io_trim_flags & ZIO_TRIM_SECURE) trim_flags |= BLKDEV_DISCARD_SECURE; #endif - return (-blkdev_issue_discard(vd->vd_bdev, + return (-blkdev_issue_discard(BDH_BDEV(vd->vd_bdh), zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, trim_flags)); #else #error "Unsupported kernel" @@ -891,7 +912,7 @@ vdev_disk_io_start(zio_t *zio) * If the vdev is closed, it's likely due to a failed reopen and is * in the UNAVAIL state. Nothing to be done here but return failure. */ - if (vd->vd_bdev == NULL) { + if (vd->vd_bdh == NULL) { rw_exit(&vd->vd_lock); zio->io_error = ENXIO; zio_interrupt(zio); @@ -919,7 +940,7 @@ vdev_disk_io_start(zio_t *zio) break; } - error = vdev_disk_io_flush(vd->vd_bdev, zio); + error = vdev_disk_io_flush(BDH_BDEV(vd->vd_bdh), zio); if (error == 0) { rw_exit(&vd->vd_lock); return; @@ -958,7 +979,7 @@ vdev_disk_io_start(zio_t *zio) } zio->io_target_timestamp = zio_handle_io_delay(zio); - error = __vdev_disk_physio(vd->vd_bdev, zio, + error = __vdev_disk_physio(BDH_BDEV(vd->vd_bdh), zio, zio->io_size, zio->io_offset, rw, 0); rw_exit(&vd->vd_lock); @@ -981,8 +1002,8 @@ vdev_disk_io_done(zio_t *zio) vdev_t *v = zio->io_vd; vdev_disk_t *vd = v->vdev_tsd; - if (!zfs_check_disk_status(vd->vd_bdev)) { - invalidate_bdev(vd->vd_bdev); + if (!zfs_check_disk_status(BDH_BDEV(vd->vd_bdh))) { + invalidate_bdev(BDH_BDEV(vd->vd_bdh)); v->vdev_remove_wanted = B_TRUE; spa_async_request(zio->io_spa, SPA_ASYNC_REMOVE); } From 7466e09a492b644d39d85dd173e0f8051858a2a5 Mon 
Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 23 Jan 2024 16:34:49 +1100 Subject: [PATCH 70/91] Linux 6.8 compat: implement strlcpy fallback Linux has removed strlcpy in favour of strscpy. This implements a fallback implementation of strlcpy for this case. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ Closes #15805 --- config/kernel-strlcpy.m4 | 47 ++++++++++++++++++++++++++ config/kernel.m4 | 4 +++ include/os/linux/spl/sys/string.h | 49 ++++++++++++++++++++++++++++ module/os/linux/spl/spl-kmem-cache.c | 1 + module/os/linux/spl/spl-kstat.c | 1 + module/os/linux/spl/spl-thread.c | 1 + module/os/linux/spl/spl-zone.c | 1 + 7 files changed, 104 insertions(+) create mode 100644 config/kernel-strlcpy.m4 diff --git a/config/kernel-strlcpy.m4 b/config/kernel-strlcpy.m4 new file mode 100644 index 00000000000..c31cf52d78b --- /dev/null +++ b/config/kernel-strlcpy.m4 @@ -0,0 +1,47 @@ +dnl # +dnl # 6.8.x replaced strlcpy with strscpy. Check for both so we can provide +dnl # appropriate fallbacks. 
+dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_STRLCPY], [ + ZFS_LINUX_TEST_SRC([kernel_has_strlcpy], [ + #include + ], [ + const char *src = "goodbye"; + char dst[32]; + size_t len; + len = strlcpy(dst, src, sizeof (dst)); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_SRC_STRSCPY], [ + ZFS_LINUX_TEST_SRC([kernel_has_strscpy], [ + #include + ], [ + const char *src = "goodbye"; + char dst[32]; + ssize_t len; + len = strscpy(dst, src, sizeof (dst)); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_STRLCPY], [ + AC_MSG_CHECKING([whether strlcpy() exists]) + ZFS_LINUX_TEST_RESULT([kernel_has_strlcpy], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_KERNEL_STRLCPY, 1, + [strlcpy() exists]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_STRSCPY], [ + AC_MSG_CHECKING([whether strscpy() exists]) + ZFS_LINUX_TEST_RESULT([kernel_has_strscpy], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_KERNEL_STRSCPY, 1, + [strscpy() exists]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index d25b65994f6..30bdd657952 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -149,6 +149,8 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_SYSFS ZFS_AC_KERNEL_SRC_SET_SPECIAL_STATE ZFS_AC_KERNEL_SRC_STANDALONE_LINUX_STDARG + ZFS_AC_KERNEL_SRC_STRLCPY + ZFS_AC_KERNEL_SRC_STRSCPY ZFS_AC_KERNEL_SRC_PAGEMAP_FOLIO_WAIT_BIT ZFS_AC_KERNEL_SRC_ADD_DISK ZFS_AC_KERNEL_SRC_KTHREAD @@ -294,6 +296,8 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_SYSFS ZFS_AC_KERNEL_SET_SPECIAL_STATE ZFS_AC_KERNEL_STANDALONE_LINUX_STDARG + ZFS_AC_KERNEL_STRLCPY + ZFS_AC_KERNEL_STRSCPY ZFS_AC_KERNEL_PAGEMAP_FOLIO_WAIT_BIT ZFS_AC_KERNEL_ADD_DISK ZFS_AC_KERNEL_KTHREAD diff --git a/include/os/linux/spl/sys/string.h b/include/os/linux/spl/sys/string.h index 38134dcf4c7..f44bf23eb32 100644 --- a/include/os/linux/spl/sys/string.h +++ b/include/os/linux/spl/sys/string.h @@ -1 +1,50 @@ +/* + * Copyright (C) 2007-2010 Lawrence Livermore National Security, LLC. 
+ * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER). + * Written by Brian Behlendorf . + * UCRL-CODE-235197 + * + * This file is part of the SPL, Solaris Porting Layer. + * + * The SPL is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; either version 2 of the License, or (at your + * option) any later version. + * + * The SPL is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License + * for more details. + * + * You should have received a copy of the GNU General Public License along + * with the SPL. If not, see . + */ + +#ifndef _SPL_STRING_H +#define _SPL_STRING_H + #include + +/* Fallbacks for kernel missing strlcpy */ +#ifndef HAVE_KERNEL_STRLCPY + +#if defined(HAVE_KERNEL_STRSCPY) +/* + * strscpy is strlcpy, but returns an error on truncation. strlcpy is defined + * to return strlen(src), so detect error and override it. 
+ */ +static inline size_t +strlcpy(char *dest, const char *src, size_t size) +{ + ssize_t ret = strscpy(dest, src, size); + if (likely(ret > 0)) + return ((size_t)ret); + return (strlen(src)); +} +#else +#error "no strlcpy fallback available" +#endif + +#endif /* HAVE_KERNEL_STRLCPY */ + +#endif /* _SPL_STRING_H */ diff --git a/module/os/linux/spl/spl-kmem-cache.c b/module/os/linux/spl/spl-kmem-cache.c index 4b15081715a..42821ad6025 100644 --- a/module/os/linux/spl/spl-kmem-cache.c +++ b/module/os/linux/spl/spl-kmem-cache.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include diff --git a/module/os/linux/spl/spl-kstat.c b/module/os/linux/spl/spl-kstat.c index 4308581147a..ad553a73a69 100644 --- a/module/os/linux/spl/spl-kstat.c +++ b/module/os/linux/spl/spl-kstat.c @@ -32,6 +32,7 @@ #include #include #include +#include static kmutex_t kstat_module_lock; static struct list_head kstat_module_list; diff --git a/module/os/linux/spl/spl-thread.c b/module/os/linux/spl/spl-thread.c index b4ef86a5e4a..ee3eb4690c3 100644 --- a/module/os/linux/spl/spl-thread.c +++ b/module/os/linux/spl/spl-thread.c @@ -26,6 +26,7 @@ #include #include #include +#include /* * Thread interfaces diff --git a/module/os/linux/spl/spl-zone.c b/module/os/linux/spl/spl-zone.c index e821fbb4f3a..d0d0cca154a 100644 --- a/module/os/linux/spl/spl-zone.c +++ b/module/os/linux/spl/spl-zone.c @@ -30,6 +30,7 @@ #include #include #include +#include #if defined(CONFIG_USER_NS) #include From 09e6724e1ee545a6afefc258820870dfedb2a16f Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 23 Jan 2024 16:41:05 +1100 Subject: [PATCH 71/91] Linux 6.8 compat: replace MAX_ORDER define MAX_ORDER has been renamed to MAX_PAGE_ORDER. Rather than just redefining it, instead define our own name and set it consistently from the start. 
Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ Closes #15805 --- module/os/linux/zfs/abd_os.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) diff --git a/module/os/linux/zfs/abd_os.c b/module/os/linux/zfs/abd_os.c index 13150adbe0c..24390fbbf12 100644 --- a/module/os/linux/zfs/abd_os.c +++ b/module/os/linux/zfs/abd_os.c @@ -60,8 +60,16 @@ #ifdef _KERNEL #include #include +#endif + +#ifdef _KERNEL +#if defined(MAX_ORDER) +#define ABD_MAX_ORDER (MAX_ORDER) +#elif defined(MAX_PAGE_ORDER) +#define ABD_MAX_ORDER (MAX_PAGE_ORDER) +#endif #else -#define MAX_ORDER 1 +#define ABD_MAX_ORDER (1) #endif typedef struct abd_stats { @@ -71,7 +79,7 @@ typedef struct abd_stats { kstat_named_t abdstat_scatter_cnt; kstat_named_t abdstat_scatter_data_size; kstat_named_t abdstat_scatter_chunk_waste; - kstat_named_t abdstat_scatter_orders[MAX_ORDER]; + kstat_named_t abdstat_scatter_orders[ABD_MAX_ORDER]; kstat_named_t abdstat_scatter_page_multi_chunk; kstat_named_t abdstat_scatter_page_multi_zone; kstat_named_t abdstat_scatter_page_alloc_retry; @@ -139,7 +147,7 @@ static struct { wmsum_t abdstat_scatter_cnt; wmsum_t abdstat_scatter_data_size; wmsum_t abdstat_scatter_chunk_waste; - wmsum_t abdstat_scatter_orders[MAX_ORDER]; + wmsum_t abdstat_scatter_orders[ABD_MAX_ORDER]; wmsum_t abdstat_scatter_page_multi_chunk; wmsum_t abdstat_scatter_page_multi_zone; wmsum_t abdstat_scatter_page_alloc_retry; @@ -222,7 +230,7 @@ abd_free_struct_impl(abd_t *abd) } #ifdef _KERNEL -static unsigned zfs_abd_scatter_max_order = MAX_ORDER - 1; +static unsigned zfs_abd_scatter_max_order = ABD_MAX_ORDER - 1; /* * Mark zfs data pages so they can be excluded from kernel crash dumps @@ -272,7 +280,8 @@ abd_alloc_chunks(abd_t *abd, size_t size) struct page *page, *tmp_page = NULL; gfp_t gfp = __GFP_NOWARN | GFP_NOIO; gfp_t gfp_comp = (gfp | __GFP_NORETRY | __GFP_COMP) & ~__GFP_RECLAIM; - unsigned int max_order = 
MIN(zfs_abd_scatter_max_order, MAX_ORDER - 1); + unsigned int max_order = MIN(zfs_abd_scatter_max_order, + ABD_MAX_ORDER - 1); unsigned int nr_pages = abd_chunkcnt_for_bytes(size); unsigned int chunks = 0, zones = 0; size_t remaining_size; @@ -729,7 +738,7 @@ abd_kstats_update(kstat_t *ksp, int rw) wmsum_value(&abd_sums.abdstat_scatter_data_size); as->abdstat_scatter_chunk_waste.value.ui64 = wmsum_value(&abd_sums.abdstat_scatter_chunk_waste); - for (int i = 0; i < MAX_ORDER; i++) { + for (int i = 0; i < ABD_MAX_ORDER; i++) { as->abdstat_scatter_orders[i].value.ui64 = wmsum_value(&abd_sums.abdstat_scatter_orders[i]); } @@ -758,7 +767,7 @@ abd_init(void) wmsum_init(&abd_sums.abdstat_scatter_cnt, 0); wmsum_init(&abd_sums.abdstat_scatter_data_size, 0); wmsum_init(&abd_sums.abdstat_scatter_chunk_waste, 0); - for (i = 0; i < MAX_ORDER; i++) + for (i = 0; i < ABD_MAX_ORDER; i++) wmsum_init(&abd_sums.abdstat_scatter_orders[i], 0); wmsum_init(&abd_sums.abdstat_scatter_page_multi_chunk, 0); wmsum_init(&abd_sums.abdstat_scatter_page_multi_zone, 0); @@ -768,7 +777,7 @@ abd_init(void) abd_ksp = kstat_create("zfs", 0, "abdstats", "misc", KSTAT_TYPE_NAMED, sizeof (abd_stats) / sizeof (kstat_named_t), KSTAT_FLAG_VIRTUAL); if (abd_ksp != NULL) { - for (i = 0; i < MAX_ORDER; i++) { + for (i = 0; i < ABD_MAX_ORDER; i++) { snprintf(abd_stats.abdstat_scatter_orders[i].name, KSTAT_STRLEN, "scatter_order_%d", i); abd_stats.abdstat_scatter_orders[i].data_type = @@ -798,7 +807,7 @@ abd_fini(void) wmsum_fini(&abd_sums.abdstat_scatter_cnt); wmsum_fini(&abd_sums.abdstat_scatter_data_size); wmsum_fini(&abd_sums.abdstat_scatter_chunk_waste); - for (int i = 0; i < MAX_ORDER; i++) + for (int i = 0; i < ABD_MAX_ORDER; i++) wmsum_fini(&abd_sums.abdstat_scatter_orders[i]); wmsum_fini(&abd_sums.abdstat_scatter_page_multi_chunk); wmsum_fini(&abd_sums.abdstat_scatter_page_multi_zone); From cbd51c5f2416fecd1e0c1b79c7dad385ad29f5ce Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 23 Jan 2024 17:43:20 
+1100 Subject: [PATCH 72/91] Linux 6.8 compat: fix inode permission tests The name inode_permission is now defined in the kernel. Rename ours to test_permission, in line with most of our other tests. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ Closes #15805 --- config/kernel-inode-permission.m4 | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/config/kernel-inode-permission.m4 b/config/kernel-inode-permission.m4 index aef4005c406..f7fc1643909 100644 --- a/config/kernel-inode-permission.m4 +++ b/config/kernel-inode-permission.m4 @@ -8,12 +8,12 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_PERMISSION], [ #include #include - static int inode_permission(struct mnt_idmap *idmap, + static int test_permission(struct mnt_idmap *idmap, struct inode *inode, int mask) { return 0; } static const struct inode_operations iops __attribute__ ((unused)) = { - .permission = inode_permission, + .permission = test_permission, }; ],[]) @@ -25,12 +25,12 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_PERMISSION], [ #include #include - static int inode_permission(struct user_namespace *userns, + static int test_permission(struct user_namespace *userns, struct inode *inode, int mask) { return 0; } static const struct inode_operations iops __attribute__ ((unused)) = { - .permission = inode_permission, + .permission = test_permission, }; ],[]) ]) From e6ca28c970842c387852acca89eaabfb54267b90 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 23 Jan 2024 21:14:06 +1100 Subject: [PATCH 73/91] Linux 6.8 compat: handle mnt_idmap user_namespace change struct mnt_idmap no longer has a struct user_namespace within it. Work around this by creating a temporary with the copy of the map we need taken from the idmap. 
Reviewed-by: Brian Behlendorf Co-authored-by: Youzhong Yang Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ Closes #15805 --- config/kernel-idmap_mnt_api.m4 | 25 +++++++++++++++ config/kernel.m4 | 2 ++ include/os/linux/spl/sys/cred.h | 55 +++++++++++++++++++++++++++++--- include/os/linux/spl/sys/types.h | 11 +++++++ 4 files changed, 88 insertions(+), 5 deletions(-) diff --git a/config/kernel-idmap_mnt_api.m4 b/config/kernel-idmap_mnt_api.m4 index 47ddc5702fb..d1bdd053203 100644 --- a/config/kernel-idmap_mnt_api.m4 +++ b/config/kernel-idmap_mnt_api.m4 @@ -23,3 +23,28 @@ AC_DEFUN([ZFS_AC_KERNEL_IDMAP_MNT_API], [ ]) ]) +dnl # +dnl # 6.8 decouples mnt_idmap from user_namespace. This is all internal +dnl # to mnt_idmap so we can't detect it directly, but we detect a related +dnl # change and use that as a signal. +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_IDMAP_NO_USERNS], [ + ZFS_LINUX_TEST_SRC([idmap_no_userns], [ + #include + ], [ + struct uid_gid_map *map = NULL; + map_id_down(map, 0); + ]) +]) + + +AC_DEFUN([ZFS_AC_KERNEL_IDMAP_NO_USERNS], [ + AC_MSG_CHECKING([whether idmapped mounts have a user namespace]) + ZFS_LINUX_TEST_RESULT([idmap_no_userns], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_IDMAP_NO_USERNS, 1, + [mnt_idmap does not have user_namespace]) + ], [ + AC_MSG_RESULT([no]) + ]) +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 30bdd657952..e3f8645774c 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -158,6 +158,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC___COPY_FROM_USER_INATOMIC ZFS_AC_KERNEL_SRC_USER_NS_COMMON_INUM ZFS_AC_KERNEL_SRC_IDMAP_MNT_API + ZFS_AC_KERNEL_SRC_IDMAP_NO_USERNS ZFS_AC_KERNEL_SRC_IATTR_VFSID ZFS_AC_KERNEL_SRC_FILEMAP ZFS_AC_KERNEL_SRC_WRITEPAGE_T @@ -305,6 +306,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL___COPY_FROM_USER_INATOMIC ZFS_AC_KERNEL_USER_NS_COMMON_INUM ZFS_AC_KERNEL_IDMAP_MNT_API + ZFS_AC_KERNEL_IDMAP_NO_USERNS ZFS_AC_KERNEL_IATTR_VFSID ZFS_AC_KERNEL_FILEMAP
ZFS_AC_KERNEL_WRITEPAGE_T diff --git a/include/os/linux/spl/sys/cred.h b/include/os/linux/spl/sys/cred.h index 7fd5f644863..c19c3c0719f 100644 --- a/include/os/linux/spl/sys/cred.h +++ b/include/os/linux/spl/sys/cred.h @@ -73,13 +73,25 @@ static inline struct user_namespace *zfs_i_user_ns(struct inode *inode) static inline boolean_t zfs_no_idmapping(struct user_namespace *mnt_userns, struct user_namespace *fs_userns) { - return (zfs_is_init_userns(mnt_userns) || mnt_userns == fs_userns); + return (zfs_is_init_userns(mnt_userns) || + mnt_userns == fs_userns); } static inline uid_t zfs_uid_to_vfsuid(zidmap_t *mnt_userns, struct user_namespace *fs_userns, uid_t uid) { - struct user_namespace *owner = idmap_owner(mnt_userns); + struct user_namespace *owner; +#ifdef HAVE_IOPS_CREATE_IDMAP + if (mnt_userns == zfs_init_idmap) + return (uid); +#endif +#ifdef HAVE_IDMAP_NO_USERNS + struct user_namespace ns; + ns.uid_map = mnt_userns->uid_map; + owner = &ns; +#else + owner = idmap_owner(mnt_userns); +#endif if (zfs_no_idmapping(owner, fs_userns)) return (uid); if (!zfs_is_init_userns(fs_userns)) @@ -92,7 +104,18 @@ static inline uid_t zfs_uid_to_vfsuid(zidmap_t *mnt_userns, static inline gid_t zfs_gid_to_vfsgid(zidmap_t *mnt_userns, struct user_namespace *fs_userns, gid_t gid) { - struct user_namespace *owner = idmap_owner(mnt_userns); + struct user_namespace *owner; +#ifdef HAVE_IOPS_CREATE_IDMAP + if (mnt_userns == zfs_init_idmap) + return (gid); +#endif +#ifdef HAVE_IDMAP_NO_USERNS + struct user_namespace ns; + ns.gid_map = mnt_userns->gid_map; + owner = &ns; +#else + owner = idmap_owner(mnt_userns); +#endif if (zfs_no_idmapping(owner, fs_userns)) return (gid); if (!zfs_is_init_userns(fs_userns)) @@ -105,7 +128,18 @@ static inline gid_t zfs_gid_to_vfsgid(zidmap_t *mnt_userns, static inline uid_t zfs_vfsuid_to_uid(zidmap_t *mnt_userns, struct user_namespace *fs_userns, uid_t uid) { - struct user_namespace *owner = idmap_owner(mnt_userns); + struct user_namespace *owner; 
+#ifdef HAVE_IOPS_CREATE_IDMAP + if (mnt_userns == zfs_init_idmap) + return (uid); +#endif +#ifdef HAVE_IDMAP_NO_USERNS + struct user_namespace ns; + ns.uid_map = mnt_userns->uid_map; + owner = &ns; +#else + owner = idmap_owner(mnt_userns); +#endif if (zfs_no_idmapping(owner, fs_userns)) return (uid); uid = from_kuid(owner, KUIDT_INIT(uid)); @@ -119,7 +153,18 @@ static inline uid_t zfs_vfsuid_to_uid(zidmap_t *mnt_userns, static inline gid_t zfs_vfsgid_to_gid(zidmap_t *mnt_userns, struct user_namespace *fs_userns, gid_t gid) { - struct user_namespace *owner = idmap_owner(mnt_userns); + struct user_namespace *owner; +#ifdef HAVE_IOPS_CREATE_IDMAP + if (mnt_userns == zfs_init_idmap) + return (gid); +#endif +#ifdef HAVE_IDMAP_NO_USERNS + struct user_namespace ns; + ns.gid_map = mnt_userns->gid_map; + owner = &ns; +#else + owner = idmap_owner(mnt_userns); +#endif if (zfs_no_idmapping(owner, fs_userns)) return (gid); gid = from_kgid(owner, KGIDT_INIT(gid)); diff --git a/include/os/linux/spl/sys/types.h b/include/os/linux/spl/sys/types.h index d89a91c36f9..20ba457f7ef 100644 --- a/include/os/linux/spl/sys/types.h +++ b/include/os/linux/spl/sys/types.h @@ -57,12 +57,23 @@ typedef int minor_t; struct user_namespace; #ifdef HAVE_IOPS_CREATE_IDMAP #include +#ifdef HAVE_IDMAP_NO_USERNS +#include +struct mnt_idmap { + struct uid_gid_map uid_map; + struct uid_gid_map gid_map; + refcount_t count; +}; +typedef struct mnt_idmap zidmap_t; +#define idmap_owner(p) (NULL) +#else struct mnt_idmap { struct user_namespace *owner; refcount_t count; }; typedef struct mnt_idmap zidmap_t; #define idmap_owner(p) (((struct mnt_idmap *)p)->owner) +#endif #else typedef struct user_namespace zidmap_t; #define idmap_owner(p) ((struct user_namespace *)p) From 992d8871ebe172ab8da6e08ac7c31344267f6cdd Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Mon, 11 Dec 2023 09:59:59 -0800 Subject: [PATCH 74/91] ZTS: Add dirty dnode stress test Add a test for the dirty dnode SEEK_HOLE/SEEK_DATA bug described 
in https://github.com/openzfs/zfs/issues/15526 The bug was fixed in https://github.com/openzfs/zfs/pull/15571 and was backported to 2.2.2 and 2.1.14. This test case is just to make sure it does not come back. seekflood.c originally written by Rob Norris. Reviewed-by: Graham Perrin Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Signed-off-by: Tony Hutter Closes #15608 --- tests/runfiles/common.run | 2 +- tests/zfs-tests/Makefile.am | 3 + tests/zfs-tests/tests/Makefile.am | 1 + .../tests/functional/cp_files/.gitignore | 1 + .../tests/functional/cp_files/cp_stress.ksh | 73 +++++++ .../tests/functional/cp_files/seekflood.c | 180 ++++++++++++++++++ 6 files changed, 259 insertions(+), 1 deletion(-) create mode 100644 tests/zfs-tests/tests/functional/cp_files/.gitignore create mode 100755 tests/zfs-tests/tests/functional/cp_files/cp_stress.ksh create mode 100644 tests/zfs-tests/tests/functional/cp_files/seekflood.c diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 85f29c82203..a3550d26ab3 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -630,7 +630,7 @@ tests = ['compress_001_pos', 'compress_002_pos', 'compress_003_pos', tags = ['functional', 'compression'] [tests/functional/cp_files] -tests = ['cp_files_001_pos'] +tests = ['cp_files_001_pos', 'cp_stress'] tags = ['functional', 'cp_files'] [tests/functional/crtime] diff --git a/tests/zfs-tests/Makefile.am b/tests/zfs-tests/Makefile.am index f8166352489..3dd1a645272 100644 --- a/tests/zfs-tests/Makefile.am +++ b/tests/zfs-tests/Makefile.am @@ -13,6 +13,9 @@ scripts_zfs_tests_functional_hkdf_PROGRAMS = %D%/tests/functional/hkdf/hkdf_test %C%_tests_functional_hkdf_hkdf_test_LDADD = \ libzpool.la +scripts_zfs_tests_functional_cp_filesdir = $(datadir)/$(PACKAGE)/zfs-tests/tests/functional/cp_files +scripts_zfs_tests_functional_cp_files_PROGRAMS = %D%/tests/functional/cp_files/seekflood + if BUILD_LINUX scripts_zfs_tests_functional_tmpfiledir = 
$(datadir)/$(PACKAGE)/zfs-tests/tests/functional/tmpfile scripts_zfs_tests_functional_tmpfile_PROGRAMS = \ diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 19174c71fbe..8bee07f480c 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1393,6 +1393,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/compression/setup.ksh \ functional/cp_files/cleanup.ksh \ functional/cp_files/cp_files_001_pos.ksh \ + functional/cp_files/cp_stress.ksh \ functional/cp_files/setup.ksh \ functional/crtime/cleanup.ksh \ functional/crtime/crtime_001_pos.ksh \ diff --git a/tests/zfs-tests/tests/functional/cp_files/.gitignore b/tests/zfs-tests/tests/functional/cp_files/.gitignore new file mode 100644 index 00000000000..d15225ac842 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cp_files/.gitignore @@ -0,0 +1 @@ +seekflood diff --git a/tests/zfs-tests/tests/functional/cp_files/cp_stress.ksh b/tests/zfs-tests/tests/functional/cp_files/cp_stress.ksh new file mode 100755 index 00000000000..43bb8ab572d --- /dev/null +++ b/tests/zfs-tests/tests/functional/cp_files/cp_stress.ksh @@ -0,0 +1,73 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. 
+# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END + +# +# Copyright (c) 2023 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# +# https://github.com/openzfs/zfs/issues/15526 identified a dirty dnode +# SEEK_HOLE/SEEK_DATA bug. https://github.com/openzfs/zfs/pull/15571 +# fixed the bug, and was backported to 2.1.14 and 2.2.2. +# +# This test is to ensure that the bug, as understood, will not recur. +# +# STRATEGY: +# +# 1. Run the 'seekflood' binary, for creation of files with timing +# characteristics that can trigger #15526. +# 2. A single run is not always a trigger, so run repeatedly. + +verify_runnable "global" + +function cleanup +{ + rm -rf /$TESTPOOL/cp_stress +} + +log_assert "Run the 'seekflood' binary repeatedly to try to trigger #15526" + +log_onexit cleanup + +log_must mkdir /$TESTPOOL/cp_stress + +MYPWD="$PWD" +cd /$TESTPOOL/cp_stress +CPUS=$(get_num_cpus) + +if is_freebsd ; then + # 'seekflood' takes longer on FreeBSD and can timeout the test + RUNS=3 +else + RUNS=10 +fi + +for i in $(seq 1 $RUNS) ; do + # Each run takes around 12 seconds.
+ log_must $STF_SUITE/tests/functional/cp_files/seekflood 2000 $CPUS +done +cd "$MYPWD" + +log_pass "No corruption detected" diff --git a/tests/zfs-tests/tests/functional/cp_files/seekflood.c b/tests/zfs-tests/tests/functional/cp_files/seekflood.c new file mode 100644 index 00000000000..02c2c8e6eca --- /dev/null +++ b/tests/zfs-tests/tests/functional/cp_files/seekflood.c @@ -0,0 +1,180 @@ +/* + * SPDX-License-Identifier: MIT + * + * Copyright (c) 2023, Rob Norris + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + * + */ + +#ifndef _GNU_SOURCE +#define _GNU_SOURCE +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#define DATASIZE (4096) +char data[DATASIZE]; + +static int +_open_file(int n, int wr) +{ + char buf[256]; + int fd; + + snprintf(buf, sizeof (buf), "testdata_%d_%d", getpid(), n); + + if ((fd = open(buf, wr ? (O_WRONLY | O_CREAT) : O_RDONLY, + wr ? 
(S_IRUSR | S_IWUSR) : 0)) < 0) { + fprintf(stderr, "Error: open '%s' (%s): %s\n", + buf, wr ? "write" : "read", strerror(errno)); + exit(1); + } + + return (fd); +} + +static void +_write_file(int n, int fd) +{ + /* write a big ball of stuff */ + ssize_t nwr = write(fd, data, DATASIZE); + if (nwr < 0) { + fprintf(stderr, "Error: write '%d_%d': %s\n", + getpid(), n, strerror(errno)); + exit(1); + } else if (nwr < DATASIZE) { + fprintf(stderr, "Error: write '%d_%d': short write\n", getpid(), + n); + exit(1); + } +} + +static int +_seek_file(int n, int fd) +{ + struct stat st; + if (fstat(fd, &st) < 0) { + fprintf(stderr, "Error: fstat '%d_%d': %s\n", getpid(), n, + strerror(errno)); + exit(1); + } + + /* + * A zero-sized file correctly has no data, so seeking the file is + * pointless. + */ + if (st.st_size == 0) + return (0); + + /* size is real, and we only write, so SEEK_DATA must find something */ + if (lseek(fd, 0, SEEK_DATA) < 0) { + if (errno == ENXIO) + return (1); + fprintf(stderr, "Error: lseek '%d_%d': %s\n", + getpid(), n, strerror(errno)); + exit(2); + } + + return (0); +} + +int +main(int argc, char **argv) +{ + int nfiles = 0; + int nthreads = 0; + + if (argc < 3 || (nfiles = atoi(argv[1])) == 0 || + (nthreads = atoi(argv[2])) == 0) { + printf("usage: seekflood \n"); + exit(1); + } + + memset(data, 0x5a, DATASIZE); + + /* fork off some flood threads */ + for (int i = 0; i < nthreads; i++) { + if (!fork()) { + /* thread main */ + + /* create zero file */ + int fd = _open_file(0, 1); + _write_file(0, fd); + close(fd); + + int count = 0; + + int h = 0, i, j, rfd, wfd; + for (i = 0; i < nfiles; i += 2, h++) { + j = i+1; + + /* seek h, write i */ + rfd = _open_file(h, 0); + wfd = _open_file(i, 1); + count += _seek_file(h, rfd); + _write_file(i, wfd); + close(rfd); + close(wfd); + + /* seek i, write j */ + rfd = _open_file(i, 0); + wfd = _open_file(j, 1); + count += _seek_file(i, rfd); + _write_file(j, wfd); + close(rfd); + close(wfd); + } + + /* return 
count of failed seeks to parent */ + exit(count < 256 ? count : 255); + } + } + + /* wait for threads, take their seek fail counts from exit code */ + int count = 0, crashed = 0; + for (int i = 0; i < nthreads; i++) { + int wstatus; + wait(&wstatus); + if (WIFEXITED(wstatus)) + count += WEXITSTATUS(wstatus); + else + crashed++; + } + + if (crashed) { + fprintf(stderr, "Error: child crashed; test failed\n"); + exit(1); + } + + if (count) { + fprintf(stderr, "Error: %d seek failures; test failed\n", + count); + exit(1); + } + + exit(0); +} From 59112ca27d94edd793dbfda6ed5d2fc7a97dddaa Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Tue, 7 Nov 2023 09:09:24 -0800 Subject: [PATCH 75/91] zed: misc vdev_enc_sysfs_path fixes There have been rare cases where the VDEV_ENC_SYSFS_PATH value that zed gets passed is stale. To mitigate this, dynamically check the sysfs path at the time of zed event processing, and use the dynamic value if possible. Note that there will be other times when we can not dynamically detect the sysfs path (like if a disk disappears) and have to rely on the old value for things like turning on the fault LED. That is to say, we can't just blindly use the dynamic path in every case. 
Also: - Add enclosure sysfs entry when running 'zpool add' - Fix 'slot' and 'enc' zpool.d scripts for nvme Reviewed-by: Don Brady Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #15462 --- cmd/zed/agents/zfs_mod.c | 4 +++ cmd/zed/zed_event.c | 31 +++++++++++++++++++++++ cmd/zpool/zpool.d/ses | 12 +++++++-- cmd/zpool/zpool_vdev.c | 4 +++ include/libzutil.h | 2 ++ lib/libzfs/libzfs.abi | 7 +++++ lib/libzutil/os/freebsd/zutil_import_os.c | 9 +++++++ lib/libzutil/os/linux/zutil_import_os.c | 17 ++++++++----- 8 files changed, 78 insertions(+), 8 deletions(-) diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c index 9636c99fc85..69163b80bd5 100644 --- a/cmd/zed/agents/zfs_mod.c +++ b/cmd/zed/agents/zfs_mod.c @@ -233,8 +233,12 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) } (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_PHYS_PATH, &physpath); + + update_vdev_config_dev_sysfs_path(vdev, path, + ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); (void) nvlist_lookup_string(vdev, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, &enc_sysfs_path); + (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, &wholedisk); (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_OFFLINE, &offline); (void) nvlist_lookup_uint64(vdev, ZPOOL_CONFIG_FAULTED, &faulted); diff --git a/cmd/zed/zed_event.c b/cmd/zed/zed_event.c index c60d5a4bc22..7e586769223 100644 --- a/cmd/zed/zed_event.c +++ b/cmd/zed/zed_event.c @@ -35,6 +35,7 @@ #include "zed_strings.h" #include "agents/zfs_agents.h" +#include #define MAXBUF 4096 @@ -922,6 +923,25 @@ _zed_event_add_time_strings(uint64_t eid, zed_strings_t *zsp, int64_t etime[]) } } + +static void +_zed_event_update_enc_sysfs_path(nvlist_t *nvl) +{ + const char *vdev_path; + + if (nvlist_lookup_string(nvl, FM_EREPORT_PAYLOAD_ZFS_VDEV_PATH, + &vdev_path) != 0) { + return; /* some other kind of event, ignore it */ + } + + if (vdev_path == NULL) { + return; + } + + update_vdev_config_dev_sysfs_path(nvl, vdev_path, + 
FM_EREPORT_PAYLOAD_ZFS_VDEV_ENC_SYSFS_PATH); +} + /* * Service the next zevent, blocking until one is available. */ @@ -969,6 +989,17 @@ zed_event_service(struct zed_conf *zcp) zed_log_msg(LOG_WARNING, "Failed to lookup zevent class (eid=%llu)", eid); } else { + /* + * Special case: If we can dynamically detect an enclosure sysfs + * path, then use that value rather than the one stored in the + * vd->vdev_enc_sysfs_path. There have been rare cases where + * vd->vdev_enc_sysfs_path becomes outdated. However, there + * will be other times when we can not dynamically detect the + * sysfs path (like if a disk disappears) and have to rely on + * the old value for things like turning on the fault LED. + */ + _zed_event_update_enc_sysfs_path(nvl); + /* let internal modules see this event first */ zfs_agent_post_event(class, NULL, nvl); diff --git a/cmd/zpool/zpool.d/ses b/cmd/zpool/zpool.d/ses index 638145c95d4..19ef92ad67b 100755 --- a/cmd/zpool/zpool.d/ses +++ b/cmd/zpool/zpool.d/ses @@ -33,10 +33,18 @@ for i in $scripts ; do val="" case $i in enc) - val=$(ls "$VDEV_ENC_SYSFS_PATH/../../" 2>/dev/null) + if echo "$VDEV_ENC_SYSFS_PATH" | grep -q '/sys/bus/pci/slots' ; then + val="$VDEV_ENC_SYSFS_PATH" + else + val="$(ls """$VDEV_ENC_SYSFS_PATH/../../""" 2>/dev/null)" + fi ;; slot) - val=$(cat "$VDEV_ENC_SYSFS_PATH/slot" 2>/dev/null) + if echo "$VDEV_ENC_SYSFS_PATH" | grep -q '/sys/bus/pci/slots' ; then + val="$(basename """$VDEV_ENC_SYSFS_PATH""")" + else + val="$(cat """$VDEV_ENC_SYSFS_PATH/slot""" 2>/dev/null)" + fi ;; encdev) val=$(ls "$VDEV_ENC_SYSFS_PATH/../device/scsi_generic" 2>/dev/null) diff --git a/cmd/zpool/zpool_vdev.c b/cmd/zpool/zpool_vdev.c index 3d0fc089c32..fbd4b81dfac 100644 --- a/cmd/zpool/zpool_vdev.c +++ b/cmd/zpool/zpool_vdev.c @@ -372,6 +372,10 @@ make_leaf_vdev(nvlist_t *props, const char *arg, boolean_t is_primary) verify(nvlist_add_string(vdev, ZPOOL_CONFIG_PATH, path) == 0); verify(nvlist_add_string(vdev, ZPOOL_CONFIG_TYPE, type) == 0); + /* 
Lookup and add the enclosure sysfs path (if exists) */ + update_vdev_config_dev_sysfs_path(vdev, path, + ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); + if (strcmp(type, VDEV_TYPE_DISK) == 0) verify(nvlist_add_uint64(vdev, ZPOOL_CONFIG_WHOLE_DISK, (uint64_t)wholedisk) == 0); diff --git a/include/libzutil.h b/include/libzutil.h index 053b1ed4b52..9842c225b6f 100644 --- a/include/libzutil.h +++ b/include/libzutil.h @@ -208,6 +208,8 @@ int for_each_vdev_cb(void *zhp, nvlist_t *nv, pool_vdev_iter_f func, int for_each_vdev_in_nvlist(nvlist_t *nvroot, pool_vdev_iter_f func, void *data); void update_vdevs_config_dev_sysfs_path(nvlist_t *config); +_LIBZUTIL_H void update_vdev_config_dev_sysfs_path(nvlist_t *nv, + const char *path, const char *key); #ifdef __cplusplus } #endif diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 2d612a16b22..3c975397ed3 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -260,6 +260,7 @@ + @@ -8329,6 +8330,12 @@ + + + + + + diff --git a/lib/libzutil/os/freebsd/zutil_import_os.c b/lib/libzutil/os/freebsd/zutil_import_os.c index 19ba58e79a0..a134c173bc8 100644 --- a/lib/libzutil/os/freebsd/zutil_import_os.c +++ b/lib/libzutil/os/freebsd/zutil_import_os.c @@ -249,6 +249,15 @@ zfs_dev_flush(int fd) return (0); } +void +update_vdev_config_dev_sysfs_path(nvlist_t *nv, const char *path, + const char *key) +{ + (void) nv; + (void) path; + (void) key; +} + void update_vdevs_config_dev_sysfs_path(nvlist_t *config) { diff --git a/lib/libzutil/os/linux/zutil_import_os.c b/lib/libzutil/os/linux/zutil_import_os.c index 44ed697dd49..fbfae4f7e68 100644 --- a/lib/libzutil/os/linux/zutil_import_os.c +++ b/lib/libzutil/os/linux/zutil_import_os.c @@ -766,9 +766,12 @@ no_dev: * Rescan the enclosure sysfs path for turning on enclosure LEDs and store it * in the nvlist * (if applicable). 
Like: * vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4' + * + * key: The nvlist_t name (like ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH) */ -static void -update_vdev_config_dev_sysfs_path(nvlist_t *nv, const char *path) +void +update_vdev_config_dev_sysfs_path(nvlist_t *nv, const char *path, + const char *key) { char *upath, *spath; @@ -777,9 +780,9 @@ update_vdev_config_dev_sysfs_path(nvlist_t *nv, const char *path) spath = zfs_get_enclosure_sysfs_path(upath); if (spath) { - nvlist_add_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, spath); + (void) nvlist_add_string(nv, key, spath); } else { - nvlist_remove_all(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); + (void) nvlist_remove_all(nv, key); } free(upath); @@ -799,7 +802,8 @@ sysfs_path_pool_vdev_iter_f(void *hdl_data, nvlist_t *nv, void *data) return (1); /* Rescan our enclosure sysfs path for this vdev */ - update_vdev_config_dev_sysfs_path(nv, path); + update_vdev_config_dev_sysfs_path(nv, path, + ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); return (0); } @@ -888,7 +892,8 @@ update_vdev_config_dev_strs(nvlist_t *nv) (void) nvlist_add_string(nv, ZPOOL_CONFIG_PHYS_PATH, vds.vds_devphys); } - update_vdev_config_dev_sysfs_path(nv, path); + update_vdev_config_dev_sysfs_path(nv, path, + ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); } else { /* Clear out any stale entries. */ (void) nvlist_remove_all(nv, ZPOOL_CONFIG_DEVID); From 69142125d75b7405e0f1cf141dbe7913448daedf Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Thu, 21 Dec 2023 10:53:16 -0800 Subject: [PATCH 76/91] zpool: Add slot power control, print power status Add `zpool` flags to control the slot power to drives. This assumes your SAS or NVMe enclosure supports slot power control via sysfs. 
The new `--power` flag is added to `zpool offline|online|clear`: zpool offline --power Turn off device slot power zpool online --power Turn on device slot power zpool clear --power [device] Turn on device slot power If the ZPOOL_AUTO_POWER_ON_SLOT env var is set, then the '--power' option is automatically implied for `zpool online` and `zpool clear` and does not need to be passed. zpool status also gets a --power option to print the slot power status. Reviewed-by: Brian Behlendorf Reviewed-by: Mart Frauenlob Signed-off-by: Tony Hutter Closes #15662 --- cmd/zpool/os/freebsd/zpool_vdev_os.c | 14 ++ cmd/zpool/os/linux/zpool_vdev_os.c | 255 ++++++++++++++++++++++++ cmd/zpool/zpool_iter.c | 4 + cmd/zpool/zpool_main.c | 239 +++++++++++++++++++--- cmd/zpool/zpool_util.h | 3 + include/libzfs.h | 3 + include/libzutil.h | 57 ++++++ lib/libzfs/libzfs.abi | 97 +++++++-- lib/libzfs/libzfs_pool.c | 49 ++++- lib/libzutil/os/linux/zutil_import_os.c | 40 +++- lib/libzutil/zutil_import.c | 98 +++++++++ lib/libzutil/zutil_pool.c | 31 +++ man/man8/zpool-clear.8 | 11 + man/man8/zpool-offline.8 | 18 +- man/man8/zpool-status.8 | 2 + man/man8/zpool.8 | 19 +- 16 files changed, 875 insertions(+), 65 deletions(-) diff --git a/cmd/zpool/os/freebsd/zpool_vdev_os.c b/cmd/zpool/os/freebsd/zpool_vdev_os.c index 231ca97f1f6..9dd733989e2 100644 --- a/cmd/zpool/os/freebsd/zpool_vdev_os.c +++ b/cmd/zpool/os/freebsd/zpool_vdev_os.c @@ -124,3 +124,17 @@ check_file(const char *file, boolean_t force, boolean_t isspare) { return (check_file_generic(file, force, isspare)); } + +int +zpool_power_current_state(zpool_handle_t *zhp, char *vdev) +{ + /* Enclosure slot power not supported on FreeBSD yet */ + return (-1); +} + +int +zpool_power(zpool_handle_t *zhp, char *vdev, boolean_t turn_on) +{ + /* Enclosure slot power not supported on FreeBSD yet */ + return (ENOTSUP); +} diff --git a/cmd/zpool/os/linux/zpool_vdev_os.c b/cmd/zpool/os/linux/zpool_vdev_os.c index 7f4486e062f..006a3a7d8e0 100644 --- 
a/cmd/zpool/os/linux/zpool_vdev_os.c +++ b/cmd/zpool/os/linux/zpool_vdev_os.c @@ -416,3 +416,258 @@ check_file(const char *file, boolean_t force, boolean_t isspare) { return (check_file_generic(file, force, isspare)); } + +/* + * Read from a sysfs file and return an allocated string. Removes + * the newline from the end of the string if there is one. + * + * Returns a string on success (which must be freed), or NULL on error. + */ +static char *zpool_sysfs_gets(char *path) +{ + int fd; + struct stat statbuf; + char *buf = NULL; + ssize_t count = 0; + fd = open(path, O_RDONLY); + if (fd < 0) + return (NULL); + + if (fstat(fd, &statbuf) != 0) { + close(fd); + return (NULL); + } + + buf = calloc(statbuf.st_size + 1, sizeof (*buf)); + if (buf == NULL) { + close(fd); + return (NULL); + } + + /* + * Note, we can read fewer bytes than st_size, and that's ok. Sysfs + * files will report their size is 4k even if they only return a small + * string. + */ + count = read(fd, buf, statbuf.st_size); + if (count < 0) { + /* Error doing read() or we overran the buffer */ + close(fd); + free(buf); + return (NULL); + } + + /* Remove trailing newline; count is 0 on an empty read, so guard it */ + if (count > 0 && buf[count - 1] == '\n') + buf[count - 1] = 0; + + close(fd); + + return (buf); +} + +/* + * Write a string to a sysfs file. + * + * Returns 0 on success, non-zero otherwise. + */ +static int zpool_sysfs_puts(char *path, char *str) +{ + FILE *file; + + file = fopen(path, "w"); + if (!file) { + return (-1); + } + + if (fputs(str, file) < 0) { + fclose(file); + return (-2); + } + /* Buffered data is written out by fclose(); report a failed flush */ + return (fclose(file) == 0 ? 0 : -2); +} + +/* Given a vdev nvlist_t, rescan its enclosure sysfs path */ +static void +rescan_vdev_config_dev_sysfs_path(nvlist_t *vdev_nv) +{ + update_vdev_config_dev_sysfs_path(vdev_nv, + fnvlist_lookup_string(vdev_nv, ZPOOL_CONFIG_PATH), + ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); +} + +/* + * Given a power string: "on", "off", "1", or "0", return 0 if it's an + * off value, 1 if it's an on value, and -1 if the value is unrecognized.
+ */ +static int zpool_power_parse_value(char *str) +{ + if ((strcmp(str, "off") == 0) || (strcmp(str, "0") == 0)) + return (0); + + if ((strcmp(str, "on") == 0) || (strcmp(str, "1") == 0)) + return (1); + + return (-1); +} + +/* + * Given a vdev string return an allocated string containing the sysfs path to + * its power control file. Also do a check if the power control file really + * exists and has correct permissions. + * + * Example returned strings: + * + * /sys/class/enclosure/0:0:122:0/10/power_status + * /sys/bus/pci/slots/10/power + * + * Returns allocated string on success (which must be freed), NULL on failure. + */ +static char * +zpool_power_sysfs_path(zpool_handle_t *zhp, char *vdev) +{ + const char *enc_sysfs_dir = NULL; + char *path = NULL; + nvlist_t *vdev_nv = zpool_find_vdev(zhp, vdev, NULL, NULL, NULL); + + if (vdev_nv == NULL) { + return (NULL); + } + + /* Make sure we're getting the updated enclosure sysfs path */ + rescan_vdev_config_dev_sysfs_path(vdev_nv); + + if (nvlist_lookup_string(vdev_nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, + &enc_sysfs_dir) != 0) { + return (NULL); + } + + if (asprintf(&path, "%s/power_status", enc_sysfs_dir) == -1) + return (NULL); + + if (access(path, W_OK) != 0) { + free(path); + path = NULL; + /* No writable HDD 'power_status' file, maybe it's NVMe? */ + if (asprintf(&path, "%s/power", enc_sysfs_dir) == -1) { + return (NULL); + } + + if (access(path, R_OK | W_OK) != 0) { + /* Not NVMe either */ + free(path); + return (NULL); + } + } + + return (path); +} + +/* + * Given a path to a sysfs power control file, return B_TRUE if you should use + * "on/off" words to control it, or B_FALSE otherwise ("0/1" to control). + */ +static boolean_t +zpool_power_use_word(char *sysfs_path) +{ + if (strcmp(&sysfs_path[strlen(sysfs_path) - strlen("power_status")], + "power_status") == 0) { + return (B_TRUE); + } + return (B_FALSE); +} + +/* + * Check the sysfs power control value for a vdev.
+ * + * Returns: + * 0 - Power is off + * 1 - Power is on + * -1 - Error or unsupported + */ +int +zpool_power_current_state(zpool_handle_t *zhp, char *vdev) +{ + char *val; + int rc; + + char *path = zpool_power_sysfs_path(zhp, vdev); + if (path == NULL) + return (-1); + + val = zpool_sysfs_gets(path); + if (val == NULL) { + free(path); + return (-1); + } + + rc = zpool_power_parse_value(val); + free(val); + free(path); + return (rc); +} + +/* + * Turn on or off the slot to a device + * + * Device path is the full path to the device (like /dev/sda or /dev/sda1). + * + * Return code: + * 0: Success + * ENOTSUP: Power control not supported for OS + * EBADSLT: Couldn't read current power state + * ENOENT: No sysfs path to power control + * EIO: Couldn't write sysfs power value + * EBADE: Sysfs power value didn't change + */ +int +zpool_power(zpool_handle_t *zhp, char *vdev, boolean_t turn_on) +{ + char *sysfs_path; + const char *val; + int rc; + int timeout_ms; + + rc = zpool_power_current_state(zhp, vdev); + if (rc == -1) { + return (EBADSLT); + } + + /* Already correct value? */ + if (rc == (int)turn_on) + return (0); + + sysfs_path = zpool_power_sysfs_path(zhp, vdev); + if (sysfs_path == NULL) + return (ENOENT); + + if (zpool_power_use_word(sysfs_path)) { + val = turn_on ? "on" : "off"; + } else { + val = turn_on ? "1" : "0"; + } + + rc = zpool_sysfs_puts(sysfs_path, (char *)val); + + free(sysfs_path); + if (rc != 0) { + return (EIO); + } + + /* + * Wait up to 30 seconds for sysfs power value to change after + * writing it. 
+ */ + timeout_ms = zpool_getenv_int("ZPOOL_POWER_ON_SLOT_TIMEOUT_MS", 30000); + for (int i = 0; i < MAX(1, timeout_ms / 200); i++) { + rc = zpool_power_current_state(zhp, vdev); + if (rc == (int)turn_on) + return (0); /* success */ + + fsleep(0.200); /* 200ms */ + } + + /* sysfs value never changed */ + return (EBADE); +} diff --git a/cmd/zpool/zpool_iter.c b/cmd/zpool/zpool_iter.c index 506b529dce4..ae2e9da9108 100644 --- a/cmd/zpool/zpool_iter.c +++ b/cmd/zpool/zpool_iter.c @@ -554,6 +554,10 @@ for_each_vdev_run_cb(void *zhp_data, nvlist_t *nv, void *cb_vcdl) if (nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path) != 0) return (1); + /* Make sure we're getting the updated enclosure sysfs path */ + update_vdev_config_dev_sysfs_path(nv, path, + ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH); + nvlist_lookup_string(nv, ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH, &vdev_enc_sysfs_path); diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 5f96dc8d004..6687a446445 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -353,7 +353,7 @@ get_usage(zpool_help_t idx) return (gettext("\tattach [-fsw] [-o property=value] " " \n")); case HELP_CLEAR: - return (gettext("\tclear [-nF] [device]\n")); + return (gettext("\tclear [[--power]|[-nF]] [device]\n")); case HELP_CREATE: return (gettext("\tcreate [-fnd] [-o property=value] ... \n" "\t [-O file-system-property=value] ... \n" @@ -389,9 +389,11 @@ get_usage(zpool_help_t idx) "[-T d|u] [pool] ... 
\n" "\t [interval [count]]\n")); case HELP_OFFLINE: - return (gettext("\toffline [-f] [-t] ...\n")); + return (gettext("\toffline [--power]|[[-f][-t]] " + " ...\n")); case HELP_ONLINE: - return (gettext("\tonline [-e] ...\n")); + return (gettext("\tonline [--power][-e] " + "...\n")); case HELP_REPLACE: return (gettext("\treplace [-fsw] [-o property=value] " " [new-device]\n")); @@ -410,7 +412,7 @@ get_usage(zpool_help_t idx) return (gettext("\ttrim [-dw] [-r ] [-c | -s] " "[ ...]\n")); case HELP_STATUS: - return (gettext("\tstatus [-c [script1,script2,...]] " + return (gettext("\tstatus [--power] [-c [script1,script2,...]] " "[-igLpPstvxD] [-T d|u] [pool] ... \n" "\t [interval [count]]\n")); case HELP_UPGRADE: @@ -516,6 +518,77 @@ print_vdev_prop_cb(int prop, void *cb) return (ZPROP_CONT); } +/* + * Given a leaf vdev name like 'L5' return its VDEV_CONFIG_PATH like + * '/dev/disk/by-vdev/L5'. + */ +static const char * +vdev_name_to_path(zpool_handle_t *zhp, char *vdev) +{ + nvlist_t *vdev_nv = zpool_find_vdev(zhp, vdev, NULL, NULL, NULL); + if (vdev_nv == NULL) { + return (NULL); + } + return (fnvlist_lookup_string(vdev_nv, ZPOOL_CONFIG_PATH)); +} + +static int +zpool_power_on(zpool_handle_t *zhp, char *vdev) +{ + return (zpool_power(zhp, vdev, B_TRUE)); +} + +static int +zpool_power_on_and_disk_wait(zpool_handle_t *zhp, char *vdev) +{ + int rc; + + rc = zpool_power_on(zhp, vdev); + if (rc != 0) + return (rc); + + zpool_disk_wait(vdev_name_to_path(zhp, vdev)); + + return (0); +} + +static int +zpool_power_on_pool_and_wait_for_devices(zpool_handle_t *zhp) +{ + nvlist_t *nv; + const char *path = NULL; + int rc; + + /* Power up all the devices first */ + FOR_EACH_REAL_LEAF_VDEV(zhp, nv) { + path = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH); + if (path != NULL) { + rc = zpool_power_on(zhp, (char *)path); + if (rc != 0) { + return (rc); + } + } + } + + /* + * Wait for their devices to show up. 
Since we powered them on + * at roughly the same time, they should all come online around + * the same time. + */ + FOR_EACH_REAL_LEAF_VDEV(zhp, nv) { + path = fnvlist_lookup_string(nv, ZPOOL_CONFIG_PATH); + zpool_disk_wait(path); + } + + return (0); +} + +static int +zpool_power_off(zpool_handle_t *zhp, char *vdev) +{ + return (zpool_power(zhp, vdev, B_FALSE)); +} + /* * Display usage message. If we're inside a command, display only the usage for * that command. Otherwise, iterate over the entire command table and display @@ -2093,6 +2166,7 @@ typedef struct status_cbdata { boolean_t cb_print_vdev_init; boolean_t cb_print_vdev_trim; vdev_cmd_data_list_t *vcdl; + boolean_t cb_print_power; } status_cbdata_t; /* Return 1 if string is NULL, empty, or whitespace; return 0 otherwise. */ @@ -2378,6 +2452,26 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, else printf(" %5s", rbuf); } + if (cb->cb_print_power) { + if (children == 0) { + /* Only leaf vdevs have physical slots */ + switch (zpool_power_current_state(zhp, (char *) + fnvlist_lookup_string(nv, + ZPOOL_CONFIG_PATH))) { + case 0: + printf_color(ANSI_RED, " %5s", + gettext("off")); + break; + case 1: + printf(" %5s", gettext("on")); + break; + default: + printf(" %5s", "-"); + } + } else { + printf(" %5s", "-"); + } + } } if (nvlist_lookup_uint64(nv, ZPOOL_CONFIG_NOT_PRESENT, @@ -5428,19 +5522,6 @@ get_interval_count_filter_guids(int *argc, char **argv, float *interval, interval, count); } -/* - * Floating point sleep(). Allows you to pass in a floating point value for - * seconds. - */ -static void -fsleep(float sec) -{ - struct timespec req; - req.tv_sec = floor(sec); - req.tv_nsec = (sec - (float)req.tv_sec) * NANOSEC; - nanosleep(&req, NULL); -} - /* * Terminal height, in rows. Returns -1 if stdout is not connected to a TTY or * if we were unable to determine its size. 
@@ -6939,10 +7020,12 @@ zpool_do_split(int argc, char **argv) return (ret); } - +#define POWER_OPT 1024 /* - * zpool online ... + * zpool online [--power] ... + * + * --power: Power on the enclosure slot to the drive (if possible) */ int zpool_do_online(int argc, char **argv) @@ -6953,13 +7036,21 @@ zpool_do_online(int argc, char **argv) int ret = 0; vdev_state_t newstate; int flags = 0; + boolean_t is_power_on = B_FALSE; + struct option long_options[] = { + {"power", no_argument, NULL, POWER_OPT}, + {0, 0, 0, 0} + }; /* check options */ - while ((c = getopt(argc, argv, "e")) != -1) { + while ((c = getopt_long(argc, argv, "e", long_options, NULL)) != -1) { switch (c) { case 'e': flags |= ZFS_ONLINE_EXPAND; break; + case POWER_OPT: + is_power_on = B_TRUE; + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -6967,6 +7058,9 @@ zpool_do_online(int argc, char **argv) } } + if (libzfs_envvar_is_set("ZPOOL_AUTO_POWER_ON_SLOT")) + is_power_on = B_TRUE; + argc -= optind; argv += optind; @@ -6988,6 +7082,18 @@ zpool_do_online(int argc, char **argv) for (i = 1; i < argc; i++) { vdev_state_t oldstate; boolean_t avail_spare, l2cache; + int rc; + + if (is_power_on) { + rc = zpool_power_on_and_disk_wait(zhp, argv[i]); + if (rc == ENOTSUP) { + (void) fprintf(stderr, + gettext("Power control not supported\n")); + } + if (rc != 0) + return (rc); + } + nvlist_t *tgt = zpool_find_vdev(zhp, argv[i], &avail_spare, &l2cache, NULL); if (tgt == NULL) { @@ -7033,12 +7139,15 @@ zpool_do_online(int argc, char **argv) } /* - * zpool offline [-ft] ... + * zpool offline [-ft]|[--power] ... + * * * -f Force the device into a faulted state. * * -t Only take the device off-line temporarily. The offline/faulted * state will not be persistent across reboots. 
+ * + * --power Power off the enclosure slot to the drive (if possible) */ int zpool_do_offline(int argc, char **argv) @@ -7049,9 +7158,15 @@ zpool_do_offline(int argc, char **argv) int ret = 0; boolean_t istmp = B_FALSE; boolean_t fault = B_FALSE; + boolean_t is_power_off = B_FALSE; + + struct option long_options[] = { + {"power", no_argument, NULL, POWER_OPT}, + {0, 0, 0, 0} + }; /* check options */ - while ((c = getopt(argc, argv, "ft")) != -1) { + while ((c = getopt_long(argc, argv, "ft", long_options, NULL)) != -1) { switch (c) { case 'f': fault = B_TRUE; @@ -7059,6 +7174,9 @@ zpool_do_offline(int argc, char **argv) case 't': istmp = B_TRUE; break; + case POWER_OPT: + is_power_off = B_TRUE; + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -7066,6 +7184,20 @@ zpool_do_offline(int argc, char **argv) } } + if (is_power_off && fault) { + (void) fprintf(stderr, + gettext("-0 and -f cannot be used together\n")); + usage(B_FALSE); + return (1); + } + + if (is_power_off && istmp) { + (void) fprintf(stderr, + gettext("-0 and -t cannot be used together\n")); + usage(B_FALSE); + return (1); + } + argc -= optind; argv += optind; @@ -7085,8 +7217,22 @@ zpool_do_offline(int argc, char **argv) return (1); for (i = 1; i < argc; i++) { - if (fault) { - uint64_t guid = zpool_vdev_path_to_guid(zhp, argv[i]); + uint64_t guid = zpool_vdev_path_to_guid(zhp, argv[i]); + if (is_power_off) { + /* + * Note: we have to power off first, then set REMOVED, + * or else zpool_vdev_set_removed_state() returns + * EAGAIN. 
+ */ + ret = zpool_power_off(zhp, argv[i]); + if (ret != 0) { + (void) fprintf(stderr, "%s %s %d\n", + gettext("unable to power off slot for"), + argv[i], ret); + } + zpool_vdev_set_removed_state(zhp, guid, VDEV_AUX_NONE); + + } else if (fault) { vdev_aux_t aux; if (istmp == B_FALSE) { /* Force the fault to persist across imports */ @@ -7109,7 +7255,7 @@ zpool_do_offline(int argc, char **argv) } /* - * zpool clear [device] + * zpool clear [-nF]|[--power] [device] * * Clear all errors associated with a pool or a particular device. */ @@ -7121,13 +7267,20 @@ zpool_do_clear(int argc, char **argv) boolean_t dryrun = B_FALSE; boolean_t do_rewind = B_FALSE; boolean_t xtreme_rewind = B_FALSE; + boolean_t is_power_on = B_FALSE; uint32_t rewind_policy = ZPOOL_NO_REWIND; nvlist_t *policy = NULL; zpool_handle_t *zhp; char *pool, *device; + struct option long_options[] = { + {"power", no_argument, NULL, POWER_OPT}, + {0, 0, 0, 0} + }; + /* check options */ - while ((c = getopt(argc, argv, "FnX")) != -1) { + while ((c = getopt_long(argc, argv, "FnX", long_options, + NULL)) != -1) { switch (c) { case 'F': do_rewind = B_TRUE; @@ -7138,6 +7291,9 @@ zpool_do_clear(int argc, char **argv) case 'X': xtreme_rewind = B_TRUE; break; + case POWER_OPT: + is_power_on = B_TRUE; + break; case '?': (void) fprintf(stderr, gettext("invalid option '%c'\n"), optopt); @@ -7145,6 +7301,9 @@ zpool_do_clear(int argc, char **argv) } } + if (libzfs_envvar_is_set("ZPOOL_AUTO_POWER_ON_SLOT")) + is_power_on = B_TRUE; + argc -= optind; argv += optind; @@ -7185,6 +7344,14 @@ zpool_do_clear(int argc, char **argv) return (1); } + if (is_power_on) { + if (device == NULL) { + zpool_power_on_pool_and_wait_for_devices(zhp); + } else { + zpool_power_on_and_disk_wait(zhp, device); + } + } + if (zpool_clear(zhp, device, policy) != 0) ret = 1; @@ -8801,6 +8968,10 @@ status_callback(zpool_handle_t *zhp, void *data) printf_color(ANSI_BOLD, " %5s", gettext("SLOW")); } + if (cbp->cb_print_power) { + 
printf_color(ANSI_BOLD, " %5s", gettext("POWER")); + } + if (cbp->vcdl != NULL) print_cmd_columns(cbp->vcdl, 0); @@ -8847,8 +9018,8 @@ status_callback(zpool_handle_t *zhp, void *data) } /* - * zpool status [-c [script1,script2,...]] [-igLpPstvx] [-T d|u] [pool] ... - * [interval [count]] + * zpool status [-c [script1,script2,...]] [-igLpPstvx] [--power] [-T d|u] ... + * [pool] [interval [count]] * * -c CMD For each vdev, run command CMD * -i Display vdev initialization status. @@ -8862,6 +9033,7 @@ status_callback(zpool_handle_t *zhp, void *data) * -D Display dedup status (undocumented) * -t Display vdev TRIM status. * -T Display a timestamp in date(1) or Unix format + * --power Display vdev enclosure slot power status * * Describes the health status of all pools or some subset. */ @@ -8875,8 +9047,14 @@ zpool_do_status(int argc, char **argv) status_cbdata_t cb = { 0 }; char *cmd = NULL; + struct option long_options[] = { + {"power", no_argument, NULL, POWER_OPT}, + {0, 0, 0, 0} + }; + /* check options */ - while ((c = getopt(argc, argv, "c:igLpPsvxDtT:")) != -1) { + while ((c = getopt_long(argc, argv, "c:igLpPsvxDtT:", long_options, + NULL)) != -1) { switch (c) { case 'c': if (cmd != NULL) { @@ -8935,6 +9113,9 @@ zpool_do_status(int argc, char **argv) case 'T': get_timestamp_arg(*optarg); break; + case POWER_OPT: + cb.cb_print_power = B_TRUE; + break; case '?': if (optopt == 'c') { print_zpool_script_list("status"); diff --git a/cmd/zpool/zpool_util.h b/cmd/zpool/zpool_util.h index db8e631dc6b..7f5406f063e 100644 --- a/cmd/zpool/zpool_util.h +++ b/cmd/zpool/zpool_util.h @@ -138,6 +138,9 @@ int check_file(const char *file, boolean_t force, boolean_t isspare); void after_zpool_upgrade(zpool_handle_t *zhp); int check_file_generic(const char *file, boolean_t force, boolean_t isspare); +int zpool_power(zpool_handle_t *zhp, char *vdev, boolean_t turn_on); +int zpool_power_current_state(zpool_handle_t *zhp, char *vdev); + #ifdef __cplusplus } #endif diff --git 
a/include/libzfs.h b/include/libzfs.h index 4adfa38e87b..770c5e1f201 100644 --- a/include/libzfs.h +++ b/include/libzfs.h @@ -318,6 +318,9 @@ _LIBZFS_H int zpool_vdev_remove_wanted(zpool_handle_t *, const char *); _LIBZFS_H int zpool_vdev_fault(zpool_handle_t *, uint64_t, vdev_aux_t); _LIBZFS_H int zpool_vdev_degrade(zpool_handle_t *, uint64_t, vdev_aux_t); +_LIBZFS_H int zpool_vdev_set_removed_state(zpool_handle_t *, uint64_t, + vdev_aux_t); + _LIBZFS_H int zpool_vdev_clear(zpool_handle_t *, uint64_t); _LIBZFS_H nvlist_t *zpool_find_vdev(zpool_handle_t *, const char *, boolean_t *, diff --git a/include/libzutil.h b/include/libzutil.h index 9842c225b6f..839486fb62b 100644 --- a/include/libzutil.h +++ b/include/libzutil.h @@ -97,6 +97,7 @@ _LIBZUTIL_H int zpool_find_config(libpc_handle_t *, const char *, nvlist_t **, _LIBZUTIL_H const char * const * zpool_default_search_paths(size_t *count); _LIBZUTIL_H int zpool_read_label(int, nvlist_t **, int *); _LIBZUTIL_H int zpool_label_disk_wait(const char *, int); +_LIBZUTIL_H int zpool_disk_wait(const char *); struct udev_device; @@ -163,6 +164,8 @@ _LIBZUTIL_H void zfs_niceraw(uint64_t, char *, size_t); _LIBZUTIL_H void zpool_dump_ddt(const ddt_stat_t *, const ddt_histogram_t *); _LIBZUTIL_H int zpool_history_unpack(char *, uint64_t, uint64_t *, nvlist_t ***, uint_t *); +_LIBZUTIL_H void fsleep(float sec); +_LIBZUTIL_H int zpool_getenv_int(const char *env, int default_val); struct zfs_cmd; @@ -205,6 +208,60 @@ _LIBZUTIL_H void zfs_setproctitle(const char *fmt, ...); typedef int (*pool_vdev_iter_f)(void *, nvlist_t *, void *); int for_each_vdev_cb(void *zhp, nvlist_t *nv, pool_vdev_iter_f func, void *data); +int for_each_vdev_macro_helper_func(void *zhp_data, nvlist_t *nv, void *data); +int for_each_real_leaf_vdev_macro_helper_func(void *zhp_data, nvlist_t *nv, + void *data); +/* + * Often you'll want to iterate over all the vdevs in the pool, but don't want + * to use for_each_vdev() since it requires a callback function. 
+ * + * Instead you can use FOR_EACH_VDEV(): + * + * zpool_handle_t *zhp // Assume this is initialized + * nvlist_t *nv + * ... + * FOR_EACH_VDEV(zhp, nv) { + * const char *path = NULL; + * nvlist_lookup_string(nv, ZPOOL_CONFIG_PATH, &path); + * printf("Looking at vdev %s\n", path); + * } + * + * Note: FOR_EACH_VDEV runs in O(n^2) time where n = number of vdevs. However, + * there's an upper limit of 256 vdevs per dRAID top-level vdevs (TLDs), 255 for + * raidz2 TLDs, a real world limit of ~500 vdevs for mirrors, so this shouldn't + * really be an issue. + * + * Here are some micro-benchmarks of a complete FOR_EACH_VDEV loop on a RAID0 + * pool: + * + * 100 vdevs = 0.7ms + * 500 vdevs = 17ms + * 750 vdevs = 40ms + * 1000 vdevs = 82ms + * + * The '__nv += 0' at the end of the for() loop gets around a "comma or + * semicolon followed by non-blank" checkstyle error. Note on most compilers + * the '__nv += 0' can just be replaced with 'NULL', but gcc on Centos 7 + * will give a 'warning: statement with no effect' error if you do that. + */ +#define __FOR_EACH_VDEV(__zhp, __nv, __func) { \ + __nv = zpool_get_config(__zhp, NULL); \ + VERIFY0(nvlist_lookup_nvlist(__nv, ZPOOL_CONFIG_VDEV_TREE, &__nv)); \ + } \ + for (nvlist_t *__root_nv = __nv, *__state = (nvlist_t *)0; \ + for_each_vdev_cb(&__state, __root_nv, __func, &__nv) == 1; \ + __nv += 0) + +#define FOR_EACH_VDEV(__zhp, __nv) \ + __FOR_EACH_VDEV(__zhp, __nv, for_each_vdev_macro_helper_func) + +/* + * "real leaf" vdevs are leaf vdevs that are real devices (disks or files). + * This excludes leaf vdevs like draid spares.
+ */ +#define FOR_EACH_REAL_LEAF_VDEV(__zhp, __nv) \ + __FOR_EACH_VDEV(__zhp, __nv, for_each_real_leaf_vdev_macro_helper_func) + int for_each_vdev_in_nvlist(nvlist_t *nvroot, pool_vdev_iter_f func, void *data); void update_vdevs_config_dev_sysfs_path(nvlist_t *config); diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 3c975397ed3..9bb8f6a47de 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -179,6 +179,7 @@ + @@ -466,6 +467,7 @@ + @@ -497,6 +499,7 @@ + @@ -567,6 +570,7 @@ + @@ -1402,8 +1406,6 @@ - - @@ -6355,6 +6357,12 @@ + + + + + + @@ -7588,6 +7596,12 @@ + + + + + + @@ -7597,6 +7611,10 @@ + + + + @@ -7714,6 +7732,11 @@ + + + + + @@ -7881,6 +7904,37 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -8070,12 +8124,6 @@ - - - - - - @@ -8102,11 +8150,6 @@ - - - - - @@ -8191,10 +8234,6 @@ - - - - @@ -8330,6 +8369,10 @@ + + + + @@ -8355,6 +8398,9 @@ + + + @@ -8628,6 +8674,7 @@ + @@ -8665,11 +8712,27 @@ + + + + + + + + + + + + + + + + diff --git a/lib/libzfs/libzfs_pool.c b/lib/libzfs/libzfs_pool.c index 4ebd112f452..2f9ccbc2ab5 100644 --- a/lib/libzfs/libzfs_pool.c +++ b/lib/libzfs/libzfs_pool.c @@ -3036,6 +3036,9 @@ zpool_vdev_is_interior(const char *name) return (B_FALSE); } +/* + * Lookup the nvlist for a given vdev. 
+ */ nvlist_t * zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare, boolean_t *l2cache, boolean_t *log) @@ -3043,6 +3046,7 @@ zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare, char *end; nvlist_t *nvroot, *search, *ret; uint64_t guid; + boolean_t __avail_spare, __l2cache, __log; search = fnvlist_alloc(); @@ -3058,6 +3062,18 @@ zpool_find_vdev(zpool_handle_t *zhp, const char *path, boolean_t *avail_spare, nvroot = fnvlist_lookup_nvlist(zhp->zpool_config, ZPOOL_CONFIG_VDEV_TREE); + /* + * User can pass NULL for avail_spare, l2cache, and log, but + * we still need to provide variables to vdev_to_nvlist_iter(), so + * just point them to junk variables here. + */ + if (!avail_spare) + avail_spare = &__avail_spare; + if (!l2cache) + l2cache = &__l2cache; + if (!log) + log = &__log; + *avail_spare = B_FALSE; *l2cache = B_FALSE; if (log != NULL) @@ -3313,21 +3329,23 @@ zpool_vdev_fault(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) } /* - * Mark the given vdev degraded. + * Generic set vdev state function */ -int -zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) +static int +zpool_vdev_set_state(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux, + vdev_state_t state) { zfs_cmd_t zc = {"\0"}; char errbuf[ERRBUFLEN]; libzfs_handle_t *hdl = zhp->zpool_hdl; (void) snprintf(errbuf, sizeof (errbuf), - dgettext(TEXT_DOMAIN, "cannot degrade %llu"), (u_longlong_t)guid); + dgettext(TEXT_DOMAIN, "cannot set %s %llu"), + zpool_state_to_name(state, aux), (u_longlong_t)guid); (void) strlcpy(zc.zc_name, zhp->zpool_name, sizeof (zc.zc_name)); zc.zc_guid = guid; - zc.zc_cookie = VDEV_STATE_DEGRADED; + zc.zc_cookie = state; zc.zc_obj = aux; if (zfs_ioctl(hdl, ZFS_IOC_VDEV_SET_STATE, &zc) == 0) @@ -3336,6 +3354,27 @@ zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) return (zpool_standard_error(hdl, errno, errbuf)); } +/* + * Mark the given vdev degraded. 
+ */ +int +zpool_vdev_degrade(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) +{ + return (zpool_vdev_set_state(zhp, guid, aux, VDEV_STATE_DEGRADED)); +} + +/* + * Mark the given vdev as in a removed state (as if the device does not exist). + * + * This is different than zpool_vdev_remove() which does a removal of a device + * from the pool (but the device does exist). + */ +int +zpool_vdev_set_removed_state(zpool_handle_t *zhp, uint64_t guid, vdev_aux_t aux) +{ + return (zpool_vdev_set_state(zhp, guid, aux, VDEV_STATE_REMOVED)); +} + /* * Returns TRUE if the given nvlist is a vdev that was originally swapped in as * a hot spare. diff --git a/lib/libzutil/os/linux/zutil_import_os.c b/lib/libzutil/os/linux/zutil_import_os.c index fbfae4f7e68..bb91dec5acf 100644 --- a/lib/libzutil/os/linux/zutil_import_os.c +++ b/lib/libzutil/os/linux/zutil_import_os.c @@ -170,25 +170,17 @@ zpool_open_func(void *arg) if (rn->rn_labelpaths) { const char *path = NULL; const char *devid = NULL; - const char *env = NULL; rdsk_node_t *slice; avl_index_t where; - int timeout; int error; if (label_paths(rn->rn_hdl, rn->rn_config, &path, &devid)) return; - env = getenv("ZPOOL_IMPORT_UDEV_TIMEOUT_MS"); - if ((env == NULL) || sscanf(env, "%d", &timeout) != 1 || - timeout < 0) { - timeout = DISK_LABEL_WAIT; - } - /* * Allow devlinks to stabilize so all paths are available. */ - zpool_label_disk_wait(rn->rn_name, timeout); + zpool_disk_wait(rn->rn_name); if (path != NULL) { slice = zutil_alloc(hdl, sizeof (rdsk_node_t)); @@ -682,6 +674,20 @@ zpool_label_disk_wait(const char *path, int timeout_ms) #endif /* HAVE_LIBUDEV */ } +/* + * Simplified version of zpool_label_disk_wait() where we wait for a device + * to appear using the default timeouts. 
+ */ +int +zpool_disk_wait(const char *path) +{ + int timeout; + timeout = zpool_getenv_int("ZPOOL_IMPORT_UDEV_TIMEOUT_MS", + DISK_LABEL_WAIT); + + return (zpool_label_disk_wait(path, timeout)); +} + /* * Encode the persistent devices strings * used for the vdev disk label @@ -767,6 +773,10 @@ no_dev: * in the nvlist * (if applicable). Like: * vdev_enc_sysfs_path: '/sys/class/enclosure/11:0:1:0/SLOT 4' * + * If an old path was in the nvlist, and the rescan can not find a new path, + * then keep the old path, since the disk may have been removed. + * + * path: The vdev path (value from ZPOOL_CONFIG_PATH) * key: The nvlist_t name (like ZPOOL_CONFIG_VDEV_ENC_SYSFS_PATH) */ void @@ -774,6 +784,9 @@ update_vdev_config_dev_sysfs_path(nvlist_t *nv, const char *path, const char *key) { char *upath, *spath; + const char *oldpath = NULL; + + (void) nvlist_lookup_string(nv, key, &oldpath); /* Add enclosure sysfs path (if disk is in an enclosure). */ upath = zfs_get_underlying_path(path); @@ -782,7 +795,14 @@ update_vdev_config_dev_sysfs_path(nvlist_t *nv, const char *path, if (spath) { (void) nvlist_add_string(nv, key, spath); } else { - (void) nvlist_remove_all(nv, key); + /* + * We couldn't dynamically scan the disk's enclosure sysfs path. + * This could be because the disk went away. If there's an old + * enclosure sysfs path in the nvlist, then keep using it. + */ + if (!oldpath) { + (void) nvlist_remove_all(nv, key); + } } free(upath); diff --git a/lib/libzutil/zutil_import.c b/lib/libzutil/zutil_import.c index f7ef69a1d93..eb913119045 100644 --- a/lib/libzutil/zutil_import.c +++ b/lib/libzutil/zutil_import.c @@ -1922,6 +1922,104 @@ zpool_find_config(libpc_handle_t *hdl, const char *target, nvlist_t **configp, return (0); } +/* Return if a vdev is a leaf vdev. Note: draid spares are leaf vdevs. 
*/ +static boolean_t +vdev_is_leaf(nvlist_t *nv) +{ + uint_t children = 0; + nvlist_t **child; + + (void) nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, + &child, &children); + + return (children == 0); +} + +/* Return if a vdev is a leaf vdev and a real device (disk or file) */ +static boolean_t +vdev_is_real_leaf(nvlist_t *nv) +{ + const char *type = NULL; + if (!vdev_is_leaf(nv)) + return (B_FALSE); + + (void) nvlist_lookup_string(nv, ZPOOL_CONFIG_TYPE, &type); + if ((strcmp(type, VDEV_TYPE_DISK) == 0) || + (strcmp(type, VDEV_TYPE_FILE) == 0)) { + return (B_TRUE); + } + + return (B_FALSE); +} + +/* + * This function is called by our FOR_EACH_VDEV() macros. + * + * state: State machine status (stored inside of a (nvlist_t *)) + * nv: The current vdev nvlist_t we are iterating over. + * last_nv: The previous vdev nvlist_t we returned to the user in + * the last iteration of FOR_EACH_VDEV(). We use it + * to find the next vdev nvlist_t we should return. + * real_leaves_only: Only return leaf vdevs. + * + * Returns 1 if we found the next vdev nvlist_t for this iteration. 0 if + * we're still searching for it. + */ +static int +__for_each_vdev_macro_helper_func(void *state, nvlist_t *nv, void *last_nv, + boolean_t real_leaves_only) +{ + enum {FIRST_NV = 0, NEXT_IS_MATCH = 1, STOP_LOOKING = 2}; + + /* The very first entry in the NV list is a special case */ + if (*((nvlist_t **)state) == (nvlist_t *)FIRST_NV) { + if (real_leaves_only && !vdev_is_real_leaf(nv)) + return (0); + + *((nvlist_t **)last_nv) = nv; + *((nvlist_t **)state) = (nvlist_t *)STOP_LOOKING; + return (1); + } + + /* + * We came across our last_nv, meaning the next one is the one we + * want + */ + if (nv == *((nvlist_t **)last_nv)) { + /* Next iteration of this function will return the nvlist_t */ + *((nvlist_t **)state) = (nvlist_t *)NEXT_IS_MATCH; + return (0); + } + + /* + * We marked NEXT_IS_MATCH on the previous iteration, so this is the one + * we want. 
+ */ + if (*(nvlist_t **)state == (nvlist_t *)NEXT_IS_MATCH) { + if (real_leaves_only && !vdev_is_real_leaf(nv)) + return (0); + + *((nvlist_t **)last_nv) = nv; + *((nvlist_t **)state) = (nvlist_t *)STOP_LOOKING; + return (1); + } + + return (0); +} + +int +for_each_vdev_macro_helper_func(void *state, nvlist_t *nv, void *last_nv) +{ + return (__for_each_vdev_macro_helper_func(state, nv, last_nv, B_FALSE)); +} + +int +for_each_real_leaf_vdev_macro_helper_func(void *state, nvlist_t *nv, + void *last_nv) +{ + return (__for_each_vdev_macro_helper_func(state, nv, last_nv, B_TRUE)); +} + /* * Internal function for iterating over the vdevs. * diff --git a/lib/libzutil/zutil_pool.c b/lib/libzutil/zutil_pool.c index 288a0033cd1..86460de3fc6 100644 --- a/lib/libzutil/zutil_pool.c +++ b/lib/libzutil/zutil_pool.c @@ -28,6 +28,7 @@ #include #include #include +#include #include @@ -144,3 +145,33 @@ zpool_history_unpack(char *buf, uint64_t bytes_read, uint64_t *leftover, *leftover = bytes_read; return (0); } + +/* + * Floating point sleep(). Allows you to pass in a floating point value for + * seconds. + */ +void +fsleep(float sec) +{ + struct timespec req; + req.tv_sec = floor(sec); + req.tv_nsec = (sec - (float)req.tv_sec) * NANOSEC; + nanosleep(&req, NULL); +} + +/* + * Get environment variable 'env' and return it as an integer. + * If 'env' is not set, then return 'default_val' instead. + */ +int +zpool_getenv_int(const char *env, int default_val) +{ + char *str; + int val; + str = getenv(env); + if ((str == NULL) || sscanf(str, "%d", &val) != 1 || + val < 0) { + val = default_val; + } + return (val); +} diff --git a/man/man8/zpool-clear.8 b/man/man8/zpool-clear.8 index 7b9d40c74eb..c61ecae483a 100644 --- a/man/man8/zpool-clear.8 +++ b/man/man8/zpool-clear.8 @@ -36,6 +36,7 @@ .Sh SYNOPSIS .Nm zpool .Cm clear +.Op Fl -power .Ar pool .Oo Ar device Oc Ns … . @@ -52,6 +53,16 @@ Pools with enabled which have been suspended cannot be resumed. 
While the pool was suspended, it may have been imported on another host, and resuming I/O could result in pool damage. +.Bl -tag -width Ds +.It Fl -power +Power on the device's slot in the storage enclosure and wait for the device +to show up before attempting to clear errors. +This is done on all the devices specified. +Alternatively, you can set the +.Sy ZPOOL_AUTO_POWER_ON_SLOT +environment variable to always enable this behavior. +Note: This flag currently works on Linux only. +.El . .Sh SEE ALSO .Xr zdb 8 , diff --git a/man/man8/zpool-offline.8 b/man/man8/zpool-offline.8 index edcf1d06ab6..1b6095d6370 100644 --- a/man/man8/zpool-offline.8 +++ b/man/man8/zpool-offline.8 @@ -36,12 +36,13 @@ .Sh SYNOPSIS .Nm zpool .Cm offline -.Op Fl ft +.Op Fl Sy -power Ns | Ns Op Fl Sy ft .Ar pool .Ar device Ns … .Nm zpool .Cm online -.Op Fl e +.Op Fl Sy -power +.Op Fl Sy e .Ar pool .Ar device Ns … . @@ -50,7 +51,7 @@ .It Xo .Nm zpool .Cm offline -.Op Fl ft +.Op Fl Sy -power Ns | Ns Op Fl Sy ft .Ar pool .Ar device Ns … .Xc @@ -60,6 +61,9 @@ While the is offline, no attempt is made to read or write to the device. This command is not applicable to spares. .Bl -tag -width Ds +.It Fl -power +Power off the device's slot in the storage enclosure. +This flag currently works on Linux only. .It Fl f Force fault. Instead of offlining the disk, put it into a faulted state. @@ -73,6 +77,7 @@ Upon reboot, the specified physical device reverts to its previous state. .It Xo .Nm zpool .Cm online +.Op Fl -power .Op Fl e .Ar pool .Ar device Ns … @@ -80,6 +85,13 @@ Upon reboot, the specified physical device reverts to its previous state. Brings the specified physical device online. This command is not applicable to spares. .Bl -tag -width Ds +.It Fl -power +Power on the device's slot in the storage enclosure and wait for the device +to show up before attempting to online it. +Alternatively, you can set the +.Sy ZPOOL_AUTO_POWER_ON_SLOT +environment variable to always enable this behavior.
+This flag currently works on Linux only. .It Fl e Expand the device to use all available space. If the device is part of a mirror or raidz then all devices must be expanded diff --git a/man/man8/zpool-status.8 b/man/man8/zpool-status.8 index 10424b9f5b5..56fa4aed057 100644 --- a/man/man8/zpool-status.8 +++ b/man/man8/zpool-status.8 @@ -57,6 +57,8 @@ and the estimated time to completion. Both of these are only approximate, because the amount of data in the pool and the other workloads on the system can change. .Bl -tag -width Ds +.It Fl -power +Display vdev enclosure slot power status (on or off). .It Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns … Run a script (or scripts) on each vdev and include the output as a new column in the diff --git a/man/man8/zpool.8 b/man/man8/zpool.8 index 4c4020bdd81..fe44e15cabe 100644 --- a/man/man8/zpool.8 +++ b/man/man8/zpool.8 @@ -444,7 +444,7 @@ rpool 14.6G 54.9G 4 55 250K 2.69M .Ed . .Sh ENVIRONMENT VARIABLES -.Bl -tag -compact -width "ZPOOL_IMPORT_UDEV_TIMEOUT_MS" +.Bl -tag -compact -width "ZPOOL_STATUS_NON_NATIVE_ASHIFT_IGNORE" .It Sy ZFS_ABORT Cause .Nm @@ -456,6 +456,23 @@ Use ANSI color in and .Nm zpool Cm iostat output. +.It Sy ZPOOL_AUTO_POWER_ON_SLOT +Automatically attempt to turn on the enclosure slot power to a drive when +running the +.Nm zpool Cm online +or +.Nm zpool Cm clear +commands. +This has the same effect as passing the +.Fl -power +option to those commands. +.It Sy ZPOOL_POWER_ON_SLOT_TIMEOUT_MS +The maximum time in milliseconds to wait for a slot power sysfs value +to return the correct value after writing it. +For example, after writing "on" to the sysfs enclosure slot power_status file, +it can take some time for the enclosure to power up the slot and return +"on" if you read back the 'power_status' value. +Defaults to 30 seconds (30000ms) if not set. .It Sy ZPOOL_IMPORT_PATH The search path for devices or files to use with the pool.
This is a colon-separated list of directories in which From 9ef15845f5fd1fe83267712c4753d804703854ea Mon Sep 17 00:00:00 2001 From: Mark Johnston Date: Wed, 27 Dec 2023 15:17:53 -0500 Subject: [PATCH 77/91] Fix the FreeBSD userspace build (#15716) - Mark some parameters to zpool_power*() as unused. - Add a stub zpool_disk_wait(). Fixes: a9520e6e5 ("zpool: Add slot power control, print power status") Signed-off-by: Mark Johnston Reviewed-by: Alexander Motin Reviewed-by: Tony Hutter --- cmd/zpool/os/freebsd/zpool_vdev_os.c | 7 +++++++ lib/libzutil/os/freebsd/zutil_import_os.c | 8 ++++++++ 2 files changed, 15 insertions(+) diff --git a/cmd/zpool/os/freebsd/zpool_vdev_os.c b/cmd/zpool/os/freebsd/zpool_vdev_os.c index 9dd733989e2..c57c689afa9 100644 --- a/cmd/zpool/os/freebsd/zpool_vdev_os.c +++ b/cmd/zpool/os/freebsd/zpool_vdev_os.c @@ -128,6 +128,9 @@ check_file(const char *file, boolean_t force, boolean_t isspare) int zpool_power_current_state(zpool_handle_t *zhp, char *vdev) { + + (void) zhp; + (void) vdev; /* Enclosure slot power not supported on FreeBSD yet */ return (-1); } @@ -135,6 +138,10 @@ zpool_power_current_state(zpool_handle_t *zhp, char *vdev) int zpool_power(zpool_handle_t *zhp, char *vdev, boolean_t turn_on) { + + (void) zhp; + (void) vdev; + (void) turn_on; /* Enclosure slot power not supported on FreeBSD yet */ return (ENOTSUP); } diff --git a/lib/libzutil/os/freebsd/zutil_import_os.c b/lib/libzutil/os/freebsd/zutil_import_os.c index a134c173bc8..049710d3985 100644 --- a/lib/libzutil/os/freebsd/zutil_import_os.c +++ b/lib/libzutil/os/freebsd/zutil_import_os.c @@ -263,3 +263,11 @@ update_vdevs_config_dev_sysfs_path(nvlist_t *config) { (void) config; } + +int +zpool_disk_wait(const char *path) +{ + + (void) path; + return (ENOTSUP); +} From 00d85a98ea10340cb017a4afc3c1c2ef1cf1914d Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Tue, 6 Feb 2024 09:55:43 -0800 Subject: [PATCH 78/91] BRT: Fix FICLONE/FICLONERANGE shortened copy On Linux the 
ioctl_ficlonerange() and ioctl_ficlone() system calls are expected to either fully clone the specified range or return an error. The range may be for an entire file. While internally ZFS supports cloning partial ranges there's no way to return the length cloned to the caller so we need to make this all or nothing. As part of this change support for the REMAP_FILE_CAN_SHORTEN flag has been added. When REMAP_FILE_CAN_SHORTEN is set zfs_clone_range() will return a shortened range when encountering pending dirty records. When the flag is clear, zfs_clone_range() will block and wait for the records to be written out allowing the blocks to be cloned. Furthermore, the file range lock is held over the region being cloned to prevent it from being modified while cloning. This doesn't quite provide atomic semantics since if an error is encountered only a portion of the range may be cloned. This will be converted to an error if REMAP_FILE_CAN_SHORTEN was not provided and returned to the caller. However, the destination file range is left in an undefined state. A test case has been added which exercises this functionality by verifying that `cp --reflink=never|auto|always` works correctly.
Reviewed-by: Alexander Motin Signed-off-by: Brian Behlendorf Closes #15728 Closes #15842 --- include/os/freebsd/zfs/sys/zfs_vfsops_os.h | 1 - include/os/linux/zfs/sys/zfs_vfsops_os.h | 2 - include/sys/zfs_vnops.h | 3 + man/man4/zfs.4 | 9 + module/os/freebsd/zfs/zfs_vfsops.c | 4 - module/os/linux/zfs/zfs_vnops_os.c | 5 - module/os/linux/zfs/zpl_file_range.c | 48 +++--- module/zfs/zfs_vnops.c | 43 ++++- tests/runfiles/common.run | 2 +- tests/test-runner/bin/zts-report.py.in | 2 + tests/zfs-tests/include/tunables.cfg | 1 + tests/zfs-tests/tests/Makefile.am | 1 + .../functional/cp_files/cp_files_002_pos.ksh | 161 ++++++++++++++++++ 13 files changed, 243 insertions(+), 39 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh diff --git a/include/os/freebsd/zfs/sys/zfs_vfsops_os.h b/include/os/freebsd/zfs/sys/zfs_vfsops_os.h index 56a0ac96ac1..24bb03575f3 100644 --- a/include/os/freebsd/zfs/sys/zfs_vfsops_os.h +++ b/include/os/freebsd/zfs/sys/zfs_vfsops_os.h @@ -286,7 +286,6 @@ typedef struct zfid_long { extern uint_t zfs_fsyncer_key; extern int zfs_super_owner; -extern int zfs_bclone_enabled; extern void zfs_init(void); extern void zfs_fini(void); diff --git a/include/os/linux/zfs/sys/zfs_vfsops_os.h b/include/os/linux/zfs/sys/zfs_vfsops_os.h index 22046655025..b4d5db21f5e 100644 --- a/include/os/linux/zfs/sys/zfs_vfsops_os.h +++ b/include/os/linux/zfs/sys/zfs_vfsops_os.h @@ -45,8 +45,6 @@ extern "C" { typedef struct zfsvfs zfsvfs_t; struct znode; -extern int zfs_bclone_enabled; - /* * This structure emulates the vfs_t from other platforms. 
It's purpose * is to facilitate the handling of mount options and minimize structural diff --git a/include/sys/zfs_vnops.h b/include/sys/zfs_vnops.h index 5da103f1778..e60b99bed19 100644 --- a/include/sys/zfs_vnops.h +++ b/include/sys/zfs_vnops.h @@ -24,8 +24,11 @@ #ifndef _SYS_FS_ZFS_VNOPS_H #define _SYS_FS_ZFS_VNOPS_H + #include +extern int zfs_bclone_enabled; + extern int zfs_fsync(znode_t *, int, cred_t *); extern int zfs_read(znode_t *, zfs_uio_t *, int, cred_t *); extern int zfs_write(znode_t *, zfs_uio_t *, int, cred_t *); diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index c12ef1387cc..352990e02da 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1142,6 +1142,15 @@ Enable the experimental block cloning feature. If this setting is 0, then even if feature@block_cloning is enabled, attempts to clone blocks will act as though the feature is disabled. . +.It Sy zfs_bclone_wait_dirty Ns = Ns Sy 0 Ns | Ns 1 Pq int +When set to 1 the FICLONE and FICLONERANGE ioctls wait for dirty data to be +written to disk. +This allows the clone operation to reliably succeed when a file is +modified and then immediately cloned. +For small files this may be slower than making a copy of the file. +Therefore, this setting defaults to 0 which causes a clone operation to +immediately fail when encountering a dirty block. +. .It Sy zfs_blake3_impl Ns = Ns Sy fastest Pq string Select a BLAKE3 implementation. 
.Pp diff --git a/module/os/freebsd/zfs/zfs_vfsops.c b/module/os/freebsd/zfs/zfs_vfsops.c index 23b8da18453..a972c720dfd 100644 --- a/module/os/freebsd/zfs/zfs_vfsops.c +++ b/module/os/freebsd/zfs/zfs_vfsops.c @@ -89,10 +89,6 @@ int zfs_debug_level; SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0, "Debug level"); -int zfs_bclone_enabled = 0; -SYSCTL_INT(_vfs_zfs, OID_AUTO, bclone_enabled, CTLFLAG_RWTUN, - &zfs_bclone_enabled, 0, "Enable block cloning"); - struct zfs_jailparam { int mount_snapshot; }; diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index ecfa4b54e29..c06a75662bf 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -4248,9 +4248,4 @@ EXPORT_SYMBOL(zfs_map); /* CSTYLED */ module_param(zfs_delete_blocks, ulong, 0644); MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); - -/* CSTYLED */ -module_param(zfs_bclone_enabled, uint, 0644); -MODULE_PARM_DESC(zfs_bclone_enabled, "Enable block cloning"); - #endif diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c index 139c51cf46d..3065d54fa9d 100644 --- a/module/os/linux/zfs/zpl_file_range.c +++ b/module/os/linux/zfs/zpl_file_range.c @@ -31,8 +31,6 @@ #include #include -int zfs_bclone_enabled = 0; - /* * Clone part of a file via block cloning. * @@ -40,7 +38,7 @@ int zfs_bclone_enabled = 0; * care of that depending on how it was called. */ static ssize_t -__zpl_clone_file_range(struct file *src_file, loff_t src_off, +zpl_clone_file_range_impl(struct file *src_file, loff_t src_off, struct file *dst_file, loff_t dst_off, size_t len) { struct inode *src_i = file_inode(src_file); @@ -96,11 +94,12 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, { ssize_t ret; + /* Flags is reserved for future extensions and must be zero. 
*/ if (flags != 0) return (-EINVAL); - /* Try to do it via zfs_clone_range() */ - ret = __zpl_clone_file_range(src_file, src_off, + /* Try to do it via zfs_clone_range() and allow shortening. */ + ret = zpl_clone_file_range_impl(src_file, src_off, dst_file, dst_off, len); #ifdef HAVE_VFS_GENERIC_COPY_FILE_RANGE @@ -137,6 +136,11 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, * FIDEDUPERANGE is for turning a non-clone into a clone, that is, compare the * range in both files and if they're the same, arrange for them to be backed * by the same storage. + * + * REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given range + * if we want. It's designed for filesystems that may need to shorten the + * length for alignment, EOF, or any other requirement. ZFS may shorten the + * request when there is outstanding dirty data which hasn't been written. */ loff_t zpl_remap_file_range(struct file *src_file, loff_t src_off, @@ -145,24 +149,21 @@ zpl_remap_file_range(struct file *src_file, loff_t src_off, if (flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_CAN_SHORTEN)) return (-EINVAL); - /* - * REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given - * range if we want. Its designed for filesystems that make data past - * EOF available, and don't want it to be visible in both files. ZFS - * doesn't do that, so we just turn the flag off. 
- */ - flags &= ~REMAP_FILE_CAN_SHORTEN; - + /* No support for dedup yet */ if (flags & REMAP_FILE_DEDUP) - /* No support for dedup yet */ return (-EOPNOTSUPP); /* Zero length means to clone everything to the end of the file */ if (len == 0) len = i_size_read(file_inode(src_file)) - src_off; - return (__zpl_clone_file_range(src_file, src_off, - dst_file, dst_off, len)); + ssize_t ret = zpl_clone_file_range_impl(src_file, src_off, + dst_file, dst_off, len); + + if (!(flags & REMAP_FILE_CAN_SHORTEN) && ret >= 0 && ret != len) + ret = -EINVAL; + + return (ret); } #endif /* HAVE_VFS_REMAP_FILE_RANGE */ @@ -179,8 +180,14 @@ zpl_clone_file_range(struct file *src_file, loff_t src_off, if (len == 0) len = i_size_read(file_inode(src_file)) - src_off; - return (__zpl_clone_file_range(src_file, src_off, - dst_file, dst_off, len)); + /* The entire length must be cloned or this is an error. */ + ssize_t ret = zpl_clone_file_range_impl(src_file, src_off, + dst_file, dst_off, len); + + if (ret >= 0 && ret != len) + ret = -EINVAL; + + return (ret); } #endif /* HAVE_VFS_CLONE_FILE_RANGE || HAVE_VFS_FILE_OPERATIONS_EXTEND */ @@ -214,8 +221,7 @@ zpl_ioctl_ficlone(struct file *dst_file, void *arg) size_t len = i_size_read(file_inode(src_file)); - ssize_t ret = - __zpl_clone_file_range(src_file, 0, dst_file, 0, len); + ssize_t ret = zpl_clone_file_range_impl(src_file, 0, dst_file, 0, len); fput(src_file); @@ -253,7 +259,7 @@ zpl_ioctl_ficlonerange(struct file *dst_file, void __user *arg) if (len == 0) len = i_size_read(file_inode(src_file)) - fcr.fcr_src_offset; - ssize_t ret = __zpl_clone_file_range(src_file, fcr.fcr_src_offset, + ssize_t ret = zpl_clone_file_range_impl(src_file, fcr.fcr_src_offset, dst_file, fcr.fcr_dest_offset, len); fput(src_file); diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index e6ae574ad06..2b37834d5c5 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -58,6 +58,26 @@ #include #include +/* + * Enable the experimental block 
cloning feature. If this setting is 0, then + * even if feature@block_cloning is enabled, attempts to clone blocks will act + * as though the feature is disabled. + */ +int zfs_bclone_enabled = 0; + +/* + * When set zfs_clone_range() waits for dirty data to be written to disk. + * This allows the clone operation to reliably succeed when a file is modified + * and then immediately cloned. For small files this may be slower than making + * a copy of the file and is therefore not the default. However, in certain + * scenarios this behavior may be desirable so a tunable is provided. + */ +static int zfs_bclone_wait_dirty = 0; + +/* + * Maximum bytes to read per chunk in zfs_read(). + */ +static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024; static ulong_t zfs_fsync_sync_cnt = 4; @@ -189,8 +209,6 @@ zfs_access(znode_t *zp, int mode, int flag, cred_t *cr) return (error); } -static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */ - /* * Read bytes from specified file into supplied buffer. * @@ -1055,6 +1073,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, size_t maxblocks, nbps; uint_t inblksz; uint64_t clear_setid_bits_txg = 0; + uint64_t last_synced_txg = 0; inoff = *inoffp; outoff = *outoffp; @@ -1293,15 +1312,23 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, } nbps = maxblocks; + last_synced_txg = spa_last_synced_txg(dmu_objset_spa(inos)); error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, bps, &nbps); if (error != 0) { /* * If we are trying to clone a block that was created - * in the current transaction group, error will be - * EAGAIN here, which we can just return to the caller - * so it can fallback if it likes. + * in the current transaction group, the error will be + * EAGAIN here. Based on zfs_bclone_wait_dirty either + * return a shortened range to the caller so it can + * fallback, or wait for the next TXG and check again. 
*/ + if (error == EAGAIN && zfs_bclone_wait_dirty) { + txg_wait_synced(dmu_objset_pool(inos), + last_synced_txg + 1); + continue; + } + break; } @@ -1523,3 +1550,9 @@ EXPORT_SYMBOL(zfs_clone_range_replay); ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW, "Bytes to read per chunk"); + +ZFS_MODULE_PARAM(zfs, zfs_, bclone_enabled, INT, ZMOD_RW, + "Enable block cloning"); + +ZFS_MODULE_PARAM(zfs, zfs_, bclone_wait_dirty, INT, ZMOD_RW, + "Wait for dirty blocks when cloning"); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index a3550d26ab3..dd936ce5983 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -630,7 +630,7 @@ tests = ['compress_001_pos', 'compress_002_pos', 'compress_003_pos', tags = ['functional', 'compression'] [tests/functional/cp_files] -tests = ['cp_files_001_pos', 'cp_stress'] +tests = ['cp_files_001_pos', 'cp_files_002_pos', 'cp_stress'] tags = ['functional', 'cp_files'] [tests/functional/crtime] diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index ae4aa627546..edfdd47ee6d 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -176,6 +176,7 @@ if sys.platform.startswith('freebsd'): 'cli_root/zpool_wait/zpool_wait_trim_cancel': ['SKIP', trim_reason], 'cli_root/zpool_wait/zpool_wait_trim_flag': ['SKIP', trim_reason], 'cli_root/zfs_unshare/zfs_unshare_008_pos': ['SKIP', na_reason], + 'cp_files/cp_files_002_pos': ['SKIP', na_reason], 'link_count/link_count_001': ['SKIP', na_reason], 'casenorm/mixed_create_failure': ['FAIL', 13215], 'mmap/mmap_sync_001_pos': ['SKIP', na_reason], @@ -312,6 +313,7 @@ elif sys.platform.startswith('linux'): ['SKIP', cfr_reason], 'cli_root/zfs_rename/zfs_rename_002_pos': ['FAIL', known_reason], 'cli_root/zpool_reopen/zpool_reopen_003_pos': ['FAIL', known_reason], + 'cp_files/cp_files_002_pos': ['SKIP', cfr_reason], 'fault/auto_online_002_pos': ['FAIL', 11889], 
'fault/auto_replace_001_pos': ['FAIL', 14851], 'fault/auto_spare_002_pos': ['FAIL', 11889], diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index a0edad14d02..46cd42c4b8f 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -91,6 +91,7 @@ VOL_MODE vol.mode zvol_volmode VOL_RECURSIVE vol.recursive UNSUPPORTED VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq BCLONE_ENABLED zfs_bclone_enabled zfs_bclone_enabled +BCLONE_WAIT_DIRTY zfs_bclone_wait_dirty zfs_bclone_wait_dirty XATTR_COMPAT xattr_compat zfs_xattr_compat ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 8bee07f480c..7442c798574 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1393,6 +1393,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/compression/setup.ksh \ functional/cp_files/cleanup.ksh \ functional/cp_files/cp_files_001_pos.ksh \ + functional/cp_files/cp_files_002_pos.ksh \ functional/cp_files/cp_stress.ksh \ functional/cp_files/setup.ksh \ functional/crtime/cleanup.ksh \ diff --git a/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh b/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh new file mode 100755 index 00000000000..60817449ab0 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh @@ -0,0 +1,161 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. 
+# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/bclone/bclone_common.kshlib + +# +# DESCRIPTION: +# Verify all cp --reflink modes work with modified file. +# +# STRATEGY: +# 1. Verify "cp --reflink=never|auto|always" behaves as expected. +# Two different modes of operation are tested. +# +# a. zfs_bclone_wait_dirty=0: FICLONE and FICLONERANGE fail with EINVAL +# when there are dirty blocks which cannot be immediately cloned. +# This is the default behavior. +# +# b. zfs_bclone_wait_dirty=1: FICLONE and FICLONERANGE wait for +# dirty blocks to be written to disk allowing the clone to succeed. +# The downside to this is it may be slow which depending on the +# situation may defeat the point of making a clone. +# + +verify_runnable "global" +verify_block_cloning + +if ! is_linux; then + log_unsupported "cp --reflink is a GNU coreutils option" +fi + +function cleanup +{ + datasetexists $TESTPOOL/cp-reflink && \ + destroy_dataset $TESTPOOL/cp-reflink -f + log_must set_tunable32 BCLONE_WAIT_DIRTY 0 +} + +function verify_copy +{ + src_cksum=$(sha256digest $1) + dst_cksum=$(sha256digest $2) + + if [[ "$src_cksum" != "$dst_cksum" ]]; then + log_must ls -l $CP_TESTDIR + log_fail "checksum mismatch ($src_cksum != $dst_cksum)" + fi +} + +log_assert "Verify all cp --reflink modes work with modified file" + +log_onexit cleanup + +SRC_FILE=src.data +DST_FILE=dst.data +SRC_SIZE=$((1 + $RANDOM % 2047)) + +# A smaller recordsize is used merely to speed up the test.
+RECORDSIZE=4096 + +log_must zfs create -o recordsize=$RECORDSIZE $TESTPOOL/cp-reflink +CP_TESTDIR=$(get_prop mountpoint $TESTPOOL/cp-reflink) + +log_must cd $CP_TESTDIR + +# Never wait on dirty blocks (zfs_bclone_wait_dirty=0) +log_must set_tunable32 BCLONE_WAIT_DIRTY 0 + +for mode in "never" "auto" "always"; do + log_note "Checking 'cp --reflink=$mode'" + + # Create a new file and immediately copy it. + log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE count=$SRC_SIZE + + if [[ "$mode" == "always" ]]; then + log_mustnot cp --reflink=$mode $SRC_FILE $DST_FILE + log_must ls -l $CP_TESTDIR + else + log_must cp --reflink=$mode $SRC_FILE $DST_FILE + verify_copy $SRC_FILE $DST_FILE + fi + log_must rm -f $DST_FILE + + # Append to an existing file and immediately copy it. + sync_pool $TESTPOOL + log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE seek=$SRC_SIZE \ + count=1 conv=notrunc + if [[ "$mode" == "always" ]]; then + log_mustnot cp --reflink=$mode $SRC_FILE $DST_FILE + log_must ls -l $CP_TESTDIR + else + log_must cp --reflink=$mode $SRC_FILE $DST_FILE + verify_copy $SRC_FILE $DST_FILE + fi + log_must rm -f $DST_FILE + + # Overwrite a random range of an existing file and immediately copy it. + sync_pool $TESTPOOL + log_must dd if=/dev/urandom of=$SRC_FILE bs=$((RECORDSIZE / 2)) \ + seek=$(($RANDOM % $SRC_SIZE)) count=$((1 + $RANDOM % 16)) conv=notrunc + if [[ "$mode" == "always" ]]; then + log_mustnot cp --reflink=$mode $SRC_FILE $DST_FILE + log_must ls -l $CP_TESTDIR + else + log_must cp --reflink=$mode $SRC_FILE $DST_FILE + verify_copy $SRC_FILE $DST_FILE + fi + log_must rm -f $SRC_FILE $DST_FILE +done + +# Wait on dirty blocks (zfs_bclone_wait_dirty=1) +log_must set_tunable32 BCLONE_WAIT_DIRTY 1 + +for mode in "never" "auto" "always"; do + log_note "Checking 'cp --reflink=$mode'" + + # Create a new file and immediately copy it.
+ log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE count=$SRC_SIZE + log_must cp --reflink=$mode $SRC_FILE $DST_FILE + verify_copy $SRC_FILE $DST_FILE + log_must rm -f $DST_FILE + + # Append to an existing file and immediately copy it. + log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE seek=$SRC_SIZE \ + count=1 conv=notrunc + log_must cp --reflink=$mode $SRC_FILE $DST_FILE + verify_copy $SRC_FILE $DST_FILE + log_must rm -f $DST_FILE + + # Overwrite a random range of an existing file and immediately copy it. + log_must dd if=/dev/urandom of=$SRC_FILE bs=$((RECORDSIZE / 2)) \ + seek=$(($RANDOM % $SRC_SIZE)) count=$(($RANDOM % 16)) conv=notrunc + log_must cp --reflink=$mode $SRC_FILE $DST_FILE + verify_copy $SRC_FILE $DST_FILE + log_must rm -f $SRC_FILE $DST_FILE +done + +log_pass From 08fd5ccc38c3b4575da91fc8b6ac350f444b5735 Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Sat, 3 Feb 2024 00:51:51 +0500 Subject: [PATCH 79/91] Improve performance for zpool trim on linux On Linux, ZFS uses blkdev_issue_discard in vdev_disk_io_trim to issue trim command which is synchronous. This commit updates vdev_disk_io_trim to use __blkdev_issue_discard, which is asynchronous. Unfortunately there isn't any asynchronous version for blkdev_issue_secure_erase, so performance of secure trim will still suffer. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Umer Saleem Closes #15843 --- config/kernel-blkdev.m4 | 34 ++++++++++++--- module/os/linux/zfs/vdev_disk.c | 74 ++++++++++++++++++++++++++------- 2 files changed, 88 insertions(+), 20 deletions(-) diff --git a/config/kernel-blkdev.m4 b/config/kernel-blkdev.m4 index 8e9e638b125..c5a353ca920 100644 --- a/config/kernel-blkdev.m4 +++ b/config/kernel-blkdev.m4 @@ -524,6 +524,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEVNAME], [ dnl # dnl # 5.19 API: blkdev_issue_secure_erase() +dnl # 4.7 API: __blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE) dnl # 3.10 API: blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE) dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE], [ @@ -539,6 +540,20 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE], [ sector, nr_sects, GFP_KERNEL); ]) + ZFS_LINUX_TEST_SRC([blkdev_issue_discard_async_flags], [ + #include + ],[ + struct block_device *bdev = NULL; + sector_t sector = 0; + sector_t nr_sects = 0; + unsigned long flags = 0; + struct bio *biop = NULL; + int error __attribute__ ((unused)); + + error = __blkdev_issue_discard(bdev, + sector, nr_sects, GFP_KERNEL, flags, &biop); + ]) + ZFS_LINUX_TEST_SRC([blkdev_issue_discard_flags], [ #include ],[ @@ -562,13 +577,22 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_ISSUE_SECURE_ERASE], [ ],[ AC_MSG_RESULT(no) - AC_MSG_CHECKING([whether blkdev_issue_discard() is available]) - ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_flags], [ + AC_MSG_CHECKING([whether __blkdev_issue_discard() is available]) + ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_async_flags], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD, 1, - [blkdev_issue_discard() is available]) + AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC, 1, + [__blkdev_issue_discard() is available]) ],[ - ZFS_LINUX_TEST_ERROR([blkdev_issue_discard()]) + AC_MSG_RESULT(no) + + AC_MSG_CHECKING([whether blkdev_issue_discard() is available]) + 
ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_flags], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD, 1, + [blkdev_issue_discard() is available]) + ],[ + ZFS_LINUX_TEST_ERROR([blkdev_issue_discard()]) + ]) ]) ]) ]) diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index e7f0aa57384..b0bda5fa201 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -862,27 +862,66 @@ vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) return (0); } +#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) || \ + defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC) +BIO_END_IO_PROTO(vdev_disk_discard_end_io, bio, error) +{ + zio_t *zio = bio->bi_private; +#ifdef HAVE_1ARG_BIO_END_IO_T + zio->io_error = BIO_END_IO_ERROR(bio); +#else + zio->io_error = -error; +#endif + bio_put(bio); + if (zio->io_error) + vdev_disk_error(zio); + zio_interrupt(zio); +} + +static int +vdev_issue_discard_trim(zio_t *zio, unsigned long flags) +{ + int ret; + struct bio *bio = NULL; + +#if defined(BLKDEV_DISCARD_SECURE) + ret = - __blkdev_issue_discard( + BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh), + zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, flags, &bio); +#else + (void) flags; + ret = - __blkdev_issue_discard( + BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh), + zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, &bio); +#endif + if (!ret && bio) { + bio->bi_private = zio; + bio->bi_end_io = vdev_disk_discard_end_io; + vdev_submit_bio(bio); + } + return (ret); +} +#endif + static int vdev_disk_io_trim(zio_t *zio) { - vdev_t *v = zio->io_vd; - vdev_disk_t *vd = v->vdev_tsd; - -#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) - if (zio->io_trim_flags & ZIO_TRIM_SECURE) { - return (-blkdev_issue_secure_erase(BDH_BDEV(vd->vd_bdh), - zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS)); - } else { - return (-blkdev_issue_discard(BDH_BDEV(vd->vd_bdh), - zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS)); - } -#elif 
defined(HAVE_BLKDEV_ISSUE_DISCARD) unsigned long trim_flags = 0; -#if defined(BLKDEV_DISCARD_SECURE) - if (zio->io_trim_flags & ZIO_TRIM_SECURE) + if (zio->io_trim_flags & ZIO_TRIM_SECURE) { +#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) + return (-blkdev_issue_secure_erase( + BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh), + zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS)); +#elif defined(BLKDEV_DISCARD_SECURE) trim_flags |= BLKDEV_DISCARD_SECURE; #endif - return (-blkdev_issue_discard(BDH_BDEV(vd->vd_bdh), + } +#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) || \ + defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC) + return (vdev_issue_discard_trim(zio, trim_flags)); +#elif defined(HAVE_BLKDEV_ISSUE_DISCARD) + return (-blkdev_issue_discard( + BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh), zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, trim_flags)); #else #error "Unsupported kernel" @@ -968,7 +1007,12 @@ vdev_disk_io_start(zio_t *zio) case ZIO_TYPE_TRIM: zio->io_error = vdev_disk_io_trim(zio); rw_exit(&vd->vd_lock); +#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) + if (zio->io_trim_flags & ZIO_TRIM_SECURE) + zio_interrupt(zio); +#elif defined(HAVE_BLKDEV_ISSUE_DISCARD) zio_interrupt(zio); +#endif return; default: From 9bb8d26bd5485b579dff60166cf7a51f6e57820a Mon Sep 17 00:00:00 2001 From: Mauricio Faria de Oliveira Date: Fri, 8 Dec 2023 21:32:35 -0300 Subject: [PATCH 80/91] zed: fix typo in variable ZED_POWER_OFF_ENCLO*US*RE_SLOT_ON_FAULT Replace ENCLO_US_RE with ENCLO_SU_RE in the name of the variable. Note this changes the user-visible string in zed.rc, thus might break current users with the wrong string, but it's ~2 months since zfs-2.2.0 tag is out, thus should not be widespread yet. 
Mechanical change: $ grep -rl ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT cmd/zed/zed.d/zed.rc cmd/zed/zed.d/statechange-slot_off.sh $ sed -i 's/ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT/ ZED_POWER_OFF_ENCLOSURE_SLOT_ON_FAULT/g' \ cmd/zed/zed.d/zed.rc \ cmd/zed/zed.d/statechange-slot_off.sh $ grep -rl ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT $ Fixes 11fbcacf37d1a66c7a40bb8920c70ce9a87270ea ("zed: Add zedlet to power off slot when drive is faulted") Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Mauricio Faria de Oliveira Closes #15651 --- cmd/zed/zed.d/statechange-slot_off.sh | 6 +++--- cmd/zed/zed.d/zed.rc | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cmd/zed/zed.d/statechange-slot_off.sh b/cmd/zed/zed.d/statechange-slot_off.sh index 150012abe71..06acce93b8a 100755 --- a/cmd/zed/zed.d/statechange-slot_off.sh +++ b/cmd/zed/zed.d/statechange-slot_off.sh @@ -5,7 +5,7 @@ # # Bad SCSI disks can often "disappear and reappear" causing all sorts of chaos # as they flip between FAULTED and ONLINE. If -# ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT is set in zed.rc, and the disk gets +# ZED_POWER_OFF_ENCLOSURE_SLOT_ON_FAULT is set in zed.rc, and the disk gets # FAULTED, then power down the slot via sysfs: # # /sys/class/enclosure///power_status @@ -19,7 +19,7 @@ # Exit codes: # 0: slot successfully powered off # 1: enclosure not available -# 2: ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT disabled +# 2: ZED_POWER_OFF_ENCLOSURE_SLOT_ON_FAULT disabled # 3: vdev was not FAULTED # 4: The enclosure sysfs path passed from ZFS does not exist # 5: Enclosure slot didn't actually turn off after we told it to @@ -32,7 +32,7 @@ if [ ! 
-d /sys/class/enclosure ] ; then exit 1 fi -if [ "${ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT}" != "1" ] ; then +if [ "${ZED_POWER_OFF_ENCLOSURE_SLOT_ON_FAULT}" != "1" ] ; then exit 2 fi diff --git a/cmd/zed/zed.d/zed.rc b/cmd/zed/zed.d/zed.rc index 78dc1afc7b1..48051544d1a 100644 --- a/cmd/zed/zed.d/zed.rc +++ b/cmd/zed/zed.d/zed.rc @@ -146,4 +146,4 @@ ZED_SYSLOG_SUBCLASS_EXCLUDE="history_event" # Power off the drive's slot in the enclosure if it becomes FAULTED. This can # help silence misbehaving drives. This assumes your drive enclosure fully # supports slot power control via sysfs. -#ZED_POWER_OFF_ENCLOUSRE_SLOT_ON_FAULT=1 +#ZED_POWER_OFF_ENCLOSURE_SLOT_ON_FAULT=1 From 40e20d808ce263ac6f62c96a5c9cb10dc4add151 Mon Sep 17 00:00:00 2001 From: Cameron Harr Date: Wed, 7 Feb 2024 09:12:12 -0800 Subject: [PATCH 81/91] Add 'zpool status -e' flag to see unhealthy vdevs When very large pools are present, it can be laborious to find reasons for why a pool is degraded and/or where an unhealthy vdev is. This option filters out vdevs that are ONLINE and with no errors to make it easier to see where the issues are. Root and parents of unhealthy vdevs will always be printed. Testing: ZFS errors and drive failures for multiple vdevs were simulated with zinject. 
Sample vdev listings with '-e' option - All vdevs healthy NAME STATE READ WRITE CKSUM iron5 ONLINE 0 0 0 - ZFS errors NAME STATE READ WRITE CKSUM iron5 ONLINE 0 0 0 raidz2-5 ONLINE 1 0 0 L23 ONLINE 1 0 0 L24 ONLINE 1 0 0 L37 ONLINE 1 0 0 - Vdev faulted NAME STATE READ WRITE CKSUM iron5 DEGRADED 0 0 0 raidz2-6 DEGRADED 0 0 0 L67 FAULTED 0 0 0 too many errors - Vdev faults and data errors NAME STATE READ WRITE CKSUM iron5 DEGRADED 0 0 0 raidz2-1 DEGRADED 0 0 0 L2 FAULTED 0 0 0 too many errors raidz2-5 ONLINE 1 0 0 L23 ONLINE 1 0 0 L24 ONLINE 1 0 0 L37 ONLINE 1 0 0 raidz2-6 DEGRADED 0 0 0 L67 FAULTED 0 0 0 too many errors - Vdev missing NAME STATE READ WRITE CKSUM iron5 DEGRADED 0 0 0 raidz2-6 DEGRADED 0 0 0 L67 UNAVAIL 3 1 0 - Slow devices when -s provided with -e NAME STATE READ WRITE CKSUM SLOW iron5 DEGRADED 0 0 0 - raidz2-5 DEGRADED 0 0 0 - L10 FAULTED 0 0 0 0 external device fault L51 ONLINE 0 0 0 14 Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Cameron Harr Closes #15769 --- cmd/zpool/zpool_main.c | 58 +++++++++- man/man8/zpool-status.8 | 4 +- tests/runfiles/common.run | 3 +- tests/zfs-tests/tests/Makefile.am | 1 + .../zpool_status/zpool_status_002_pos.ksh | 4 +- .../zpool_status/zpool_status_003_pos.ksh | 2 + .../zpool_status/zpool_status_008_pos.ksh | 104 ++++++++++++++++++ 7 files changed, 169 insertions(+), 7 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 6687a446445..69bf9649acf 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -2161,6 +2161,7 @@ typedef struct status_cbdata { boolean_t cb_explain; boolean_t cb_first; boolean_t cb_dedup_stats; + boolean_t cb_print_unhealthy; boolean_t cb_print_status; boolean_t cb_print_slow_ios; boolean_t cb_print_vdev_init; @@ -2357,6 +2358,35 @@ health_str_to_color(const char *health) return (NULL); } +/* + * Called for each leaf vdev. 
Returns 0 if the vdev is healthy. + * A vdev is unhealthy if any of the following are true: + * 1) there are read, write, or checksum errors, + * 2) its state is not ONLINE, or + * 3) slow IO reporting was requested (-s) and there are slow IOs. + */ +static int +vdev_health_check_cb(void *hdl_data, nvlist_t *nv, void *data) +{ + status_cbdata_t *cb = data; + vdev_stat_t *vs; + uint_t vsc; + (void) hdl_data; + + if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &vsc) != 0) + return (1); + + if (vs->vs_checksum_errors || vs->vs_read_errors || + vs->vs_write_errors || vs->vs_state != VDEV_STATE_HEALTHY) + return (1); + + if (cb->cb_print_slow_ios && vs->vs_slow_ios) + return (1); + + return (0); +} + /* * Print out configuration state as requested by status_callback. */ @@ -2375,7 +2405,8 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, const char *state; const char *type; const char *path = NULL; - const char *rcolor = NULL, *wcolor = NULL, *ccolor = NULL; + const char *rcolor = NULL, *wcolor = NULL, *ccolor = NULL, + *scolor = NULL; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) @@ -2402,6 +2433,15 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, state = gettext("AVAIL"); } + /* + * If '-e' is specified then top-level vdevs and their children + * can be pruned if all of their leaves are healthy. 
+ */ + if (cb->cb_print_unhealthy && depth > 0 && + for_each_vdev_in_nvlist(nv, vdev_health_check_cb, cb) == 0) { + return; + } + printf_color(health_str_to_color(state), "\t%*s%-*s %-8s", depth, "", cb->cb_namewidth - depth, name, state); @@ -2416,6 +2456,9 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, if (vs->vs_checksum_errors) ccolor = ANSI_RED; + if (vs->vs_slow_ios) + scolor = ANSI_BLUE; + if (cb->cb_literal) { fputc(' ', stdout); printf_color(rcolor, "%5llu", @@ -2448,9 +2491,10 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, } if (cb->cb_literal) - printf(" %5llu", (u_longlong_t)vs->vs_slow_ios); + printf_color(scolor, " %5llu", + (u_longlong_t)vs->vs_slow_ios); else - printf(" %5s", rbuf); + printf_color(scolor, " %5s", rbuf); } if (cb->cb_print_power) { if (children == 0) { @@ -8999,9 +9043,11 @@ status_callback(zpool_handle_t *zhp, void *data) (void) printf(gettext( "errors: No known data errors\n")); } else if (!cbp->cb_verbose) { + color_start(ANSI_RED); (void) printf(gettext("errors: %llu data " "errors, use '-v' for a list\n"), (u_longlong_t)nerr); + color_end(); } else { print_error_log(zhp); } @@ -9022,6 +9068,7 @@ status_callback(zpool_handle_t *zhp, void *data) * [pool] [interval [count]] * * -c CMD For each vdev, run command CMD + * -e Display only unhealthy vdevs * -i Display vdev initialization status. * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. 
@@ -9053,7 +9100,7 @@ zpool_do_status(int argc, char **argv) }; /* check options */ - while ((c = getopt_long(argc, argv, "c:igLpPsvxDtT:", long_options, + while ((c = getopt_long(argc, argv, "c:eigLpPsvxDtT:", long_options, NULL)) != -1) { switch (c) { case 'c': @@ -9080,6 +9127,9 @@ zpool_do_status(int argc, char **argv) } cmd = optarg; break; + case 'e': + cb.cb_print_unhealthy = B_TRUE; + break; case 'i': cb.cb_print_vdev_init = B_TRUE; break; diff --git a/man/man8/zpool-status.8 b/man/man8/zpool-status.8 index 56fa4aed057..24ad6e643ca 100644 --- a/man/man8/zpool-status.8 +++ b/man/man8/zpool-status.8 @@ -36,7 +36,7 @@ .Sh SYNOPSIS .Nm zpool .Cm status -.Op Fl DigLpPstvx +.Op Fl DeigLpPstvx .Op Fl T Sy u Ns | Ns Sy d .Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns … .Oo Ar pool Oc Ns … @@ -69,6 +69,8 @@ See the option of .Nm zpool Cm iostat for complete details. +.It Fl e +Only show unhealthy vdevs (not-ONLINE or with errors). .It Fl i Display vdev initialization status. .It Fl g diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index dd936ce5983..7331244515f 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -535,7 +535,8 @@ tags = ['functional', 'cli_root', 'zpool_split'] tests = ['zpool_status_001_pos', 'zpool_status_002_pos', 'zpool_status_003_pos', 'zpool_status_004_pos', 'zpool_status_005_pos', 'zpool_status_006_pos', - 'zpool_status_007_pos', 'zpool_status_features_001_pos'] + 'zpool_status_007_pos', 'zpool_status_008_pos', + 'zpool_status_features_001_pos'] tags = ['functional', 'cli_root', 'zpool_status'] [tests/functional/cli_root/zpool_sync] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 7442c798574..e2824ee065e 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1238,6 +1238,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_status/zpool_status_005_pos.ksh \ 
functional/cli_root/zpool_status/zpool_status_006_pos.ksh \ functional/cli_root/zpool_status/zpool_status_007_pos.ksh \ + functional/cli_root/zpool_status/zpool_status_008_pos.ksh \ functional/cli_root/zpool_status/zpool_status_features_001_pos.ksh \ functional/cli_root/zpool_sync/cleanup.ksh \ functional/cli_root/zpool_sync/setup.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_002_pos.ksh index 3bdd7db649f..d6f32cdc7ac 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_002_pos.ksh @@ -51,7 +51,7 @@ else fi set -A args "" "-x" "-v" "-x $testpool" "-v $testpool" "-xv $testpool" \ - "-vx $testpool" + "-vx $testpool" "-e $testpool" "-es $testpool" log_assert "Executing 'zpool status' with correct options succeeds" @@ -64,4 +64,6 @@ while [[ $i -lt ${#args[*]} ]]; do (( i = i + 1 )) done +cleanup + log_pass "'zpool status' with correct options succeeded" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh index b501aac5ad6..52b22dd833f 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh @@ -37,6 +37,7 @@ # 3. Read the file # 4. Take a snapshot and make a clone # 5. 
Verify we see "snapshot, clone and filesystem" output in 'zpool status -v' +# and 'zpool status -ev' function cleanup { @@ -68,6 +69,7 @@ log_must zpool status -v $TESTPOOL2 log_must eval "zpool status -v | grep '$TESTPOOL2@snap:/10m_file'" log_must eval "zpool status -v | grep '$TESTPOOL2/clone/10m_file'" log_must eval "zpool status -v | grep '$TESTPOOL2/10m_file'" +log_must eval "zpool status -ev | grep '$TESTPOOL2/10m_file'" log_mustnot eval "zpool status -v | grep '$TESTFS1'" log_pass "'zpool status -v' outputs affected filesystem, snapshot & clone" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh new file mode 100755 index 00000000000..6be2ad5a741 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh @@ -0,0 +1,104 @@ +#!/bin/ksh -p + +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify 'zpool status -e' only shows unhealthy devices. +# +# STRATEGY: +# 1. Create zpool +# 2. Force DEGRADE, FAULT, or inject slow IOs for vdevs +# 3. Verify vdevs are reported correctly with -e and -s +# 4. Verify parents are reported as DEGRADED +# 5. 
Verify healthy children are not reported +# + +function cleanup +{ + log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO + zinject -c all + poolexists $TESTPOOL2 && destroy_pool $TESTPOOL2 + log_must rm -f $all_vdevs +} + +log_assert "Verify 'zpool status -e'" + +log_onexit cleanup + +all_vdevs=$(echo $TESTDIR/vdev{1..6}) +log_must mkdir -p $TESTDIR +log_must truncate -s $MINVDEVSIZE $all_vdevs + +OLD_SLOW_IO=$(get_tunable ZIO_SLOW_IO_MS) + +for raid_type in "draid2:3d:6c:1s" "raidz2"; do + + log_must zpool create -f $TESTPOOL2 $raid_type $all_vdevs + + # Check DEGRADED vdevs are shown. + log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev4 "ONLINE" + log_must zinject -d $TESTDIR/vdev4 -A degrade $TESTPOOL2 + log_must eval "zpool status -e $TESTPOOL2 | grep $TESTDIR/vdev4 | grep DEGRADED" + + # Check FAULTED vdevs are shown. + log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev5 "ONLINE" + log_must zinject -d $TESTDIR/vdev5 -A fault $TESTPOOL2 + log_must eval "zpool status -e $TESTPOOL2 | grep $TESTDIR/vdev5 | grep FAULTED" + + # Check no ONLINE vdevs are shown + log_mustnot eval "zpool status -e $TESTPOOL2 | grep ONLINE" + + # Check no ONLINE slow vdevs are show. Then mark IOs greater than + # 10ms slow, delay IOs 20ms to vdev6, check slow IOs. + log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev6 "ONLINE" + log_mustnot eval "zpool status -es $TESTPOOL2 | grep ONLINE" + + log_must set_tunable64 ZIO_SLOW_IO_MS 10 + log_must zinject -d $TESTDIR/vdev6 -D20:100 $TESTPOOL2 + log_must mkfile 1048576 /$TESTPOOL2/testfile + sync_pool $TESTPOOL2 + log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO + + # Check vdev6 slow IOs are only shown when requested with -s. + log_mustnot eval "zpool status -e $TESTPOOL2 | grep $TESTDIR/vdev6 | grep ONLINE" + log_must eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev6 | grep ONLINE" + + # Pool level and top-vdev level status must be DEGRADED. 
+ log_must eval "zpool status -e $TESTPOOL2 | grep $TESTPOOL2 | grep DEGRADED" + log_must eval "zpool status -e $TESTPOOL2 | grep $raid_type | grep DEGRADED" + + # Check that healthy vdevs[1-3] aren't shown with -e. + log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev1 "ONLINE" + log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev2 "ONLINE" + log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev3 "ONLINE" + log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev1 | grep ONLINE" + log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev2 | grep ONLINE" + log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev3 | grep ONLINE" + + log_must zinject -c all + log_must zpool status -es $TESTPOOL2 + + zpool destroy $TESTPOOL2 +done + +log_pass "Verify zpool status -e shows only unhealthy vdevs" From d22bf6a9bd216523e3f58195282be12d9da7fd33 Mon Sep 17 00:00:00 2001 From: the-Chain-Warden-thresh <18302010006@fudan.edu.cn> Date: Thu, 8 Feb 2024 03:53:05 +0800 Subject: [PATCH 82/91] LUA: Backport CVE-2020-24370's patch CVE-2020-24370 is a security vulnerability in lua. Although the CVE description in CVE-2020-24370 said that this CVE only affected lua 5.4.0, according to lua this CVE actually existed since lua 5.2. The root cause of this CVE is the negation overflow that occurs when you try to take the negative of 0x80000000. Thus, this CVE also exists in openzfs. Try to backport the fix to the lua in openzfs since the original fix is for 5.4 and several functions have been changed. 
https://github.com/advisories/GHSA-gfr4-c37g-mm3v https://nvd.nist.gov/vuln/detail/CVE-2020-24370 https://www.lua.org/bugs.html#5.4.0-11 https://github.com/lua/lua/commit/a585eae6e7ada1ca9271607a4f48dfb1786 Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: ChenHao Lu <18302010006@fudan.edu.cn> Closes #15847 --- module/lua/ldebug.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/module/lua/ldebug.c b/module/lua/ldebug.c index 0092474c762..23e321bb124 100644 --- a/module/lua/ldebug.c +++ b/module/lua/ldebug.c @@ -111,10 +111,11 @@ static const char *upvalname (Proto *p, int uv) { static const char *findvararg (CallInfo *ci, int n, StkId *pos) { int nparams = clLvalue(ci->func)->p->numparams; - if (n >= ci->u.l.base - ci->func - nparams) + int nvararg = cast_int(ci->u.l.base - ci->func) - nparams; + if (n <= -nvararg) return NULL; /* no such vararg */ else { - *pos = ci->func + nparams + n; + *pos = ci->func + nparams - n; return "(*vararg)"; /* generic name for any vararg */ } } @@ -126,7 +127,7 @@ static const char *findlocal (lua_State *L, CallInfo *ci, int n, StkId base; if (isLua(ci)) { if (n < 0) /* access to vararg values? */ - return findvararg(ci, -n, pos); + return findvararg(ci, n, pos); else { base = ci->u.l.base; name = luaF_getlocalname(ci_func(ci)->p, n, currentpc(ci)); From b699dacb4ac8bb7622943ae8587474dbe1fc81b1 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Mon, 12 Feb 2024 13:06:09 -0800 Subject: [PATCH 83/91] [zfs-2.2.3] Enable zfs_bclone_enabled on cp_files tests cp_files_002_pos uses BRT, so enable block cloning in setup/cleanup. This is only something we need to do in zfs-2.2.3, since 2.2.x ships with block cloning disabled by default. 
Signed-off-by: Tony Hutter --- tests/zfs-tests/tests/functional/cp_files/cleanup.ksh | 4 ++++ tests/zfs-tests/tests/functional/cp_files/setup.ksh | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/tests/zfs-tests/tests/functional/cp_files/cleanup.ksh b/tests/zfs-tests/tests/functional/cp_files/cleanup.ksh index 42fe70042d6..c0bccab1221 100755 --- a/tests/zfs-tests/tests/functional/cp_files/cleanup.ksh +++ b/tests/zfs-tests/tests/functional/cp_files/cleanup.ksh @@ -32,3 +32,7 @@ . $STF_SUITE/include/libtest.shlib default_cleanup + +if tunable_exists BCLONE_ENABLED ; then + log_must restore_tunable BCLONE_ENABLED +fi diff --git a/tests/zfs-tests/tests/functional/cp_files/setup.ksh b/tests/zfs-tests/tests/functional/cp_files/setup.ksh index b756d4e76c8..4223386b361 100755 --- a/tests/zfs-tests/tests/functional/cp_files/setup.ksh +++ b/tests/zfs-tests/tests/functional/cp_files/setup.ksh @@ -32,4 +32,10 @@ . $STF_SUITE/include/libtest.shlib DISK=${DISKS%% *} + +if tunable_exists BCLONE_ENABLED ; then + log_must save_tunable BCLONE_ENABLED + log_must set_tunable32 BCLONE_ENABLED 1 +fi + default_setup $DISK From 36116b4612f85b1373fbcefad3fd34cd7efd65d4 Mon Sep 17 00:00:00 2001 From: Rob N Date: Tue, 13 Feb 2024 08:58:04 +1100 Subject: [PATCH 84/91] zfs list: add '-t fs' and '-t vol' options (#15883) Because "filesystem" and "volume" are just too long! 
Sponsored-by: https://despairlabs.com/sponsor/ Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Closes #15864 (cherry picked from commit a5a725440bcb2f4c4554be3e489f911e3dd60412) --- cmd/zfs/zfs_main.c | 22 ++++++++++++++++------ man/man8/zfs-list.8 | 11 ++++++++++- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/cmd/zfs/zfs_main.c b/cmd/zfs/zfs_main.c index 67b191d72e6..3017de9ee73 100644 --- a/cmd/zfs/zfs_main.c +++ b/cmd/zfs/zfs_main.c @@ -3672,15 +3672,25 @@ zfs_do_list(int argc, char **argv) for (char *tok; (tok = strsep(&optarg, ",")); ) { static const char *const type_subopts[] = { - "filesystem", "volume", - "snapshot", "snap", + "filesystem", + "fs", + "volume", + "vol", + "snapshot", + "snap", "bookmark", - "all" }; + "all" + }; static const int type_types[] = { - ZFS_TYPE_FILESYSTEM, ZFS_TYPE_VOLUME, - ZFS_TYPE_SNAPSHOT, ZFS_TYPE_SNAPSHOT, + ZFS_TYPE_FILESYSTEM, + ZFS_TYPE_FILESYSTEM, + ZFS_TYPE_VOLUME, + ZFS_TYPE_VOLUME, + ZFS_TYPE_SNAPSHOT, + ZFS_TYPE_SNAPSHOT, ZFS_TYPE_BOOKMARK, - ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK }; + ZFS_TYPE_DATASET | ZFS_TYPE_BOOKMARK + }; for (c = 0; c < ARRAY_SIZE(type_subopts); ++c) if (strcmp(tok, type_subopts[c]) == 0) { diff --git a/man/man8/zfs-list.8 b/man/man8/zfs-list.8 index 9f6a73ab956..85bd3fbafce 100644 --- a/man/man8/zfs-list.8 +++ b/man/man8/zfs-list.8 @@ -29,7 +29,7 @@ .\" Copyright 2018 Nexenta Systems, Inc. .\" Copyright 2019 Joyent, Inc. .\" -.Dd March 16, 2022 +.Dd February 8, 2024 .Dt ZFS-LIST 8 .Os . @@ -155,6 +155,15 @@ or For example, specifying .Fl t Sy snapshot displays only snapshots. +.Sy fs , +.Sy snap , +or +.Sy vol +can be used as aliases for +.Sy filesystem , +.Sy snapshot , +or +.Sy volume . .El . 
.Sh EXAMPLES From fc3d34bd08d81f9189fd06ac641da4e2d82a56c7 Mon Sep 17 00:00:00 2001 From: Bi11 Date: Tue, 13 Feb 2024 05:53:33 +0800 Subject: [PATCH 85/91] BRT: Fix slop space calculation with block cloning Similar to deduplication, the size of data duplicated by block cloning should not be included in the slop space calculation. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Yuxin Wang Closes #15874 --- module/zfs/spa_misc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/module/zfs/spa_misc.c b/module/zfs/spa_misc.c index 72b690162d6..24f038ad7f4 100644 --- a/module/zfs/spa_misc.c +++ b/module/zfs/spa_misc.c @@ -1822,7 +1822,8 @@ spa_get_slop_space(spa_t *spa) * deduplicated data, so since it's not useful to reserve more * space with more deduplicated data, we subtract that out here. */ - space = spa_get_dspace(spa) - spa->spa_dedup_dspace; + space = + spa_get_dspace(spa) - spa->spa_dedup_dspace - brt_get_dspace(spa); slop = MIN(space >> spa_slop_shift, spa_max_slop); /* From a6f6c881ffde598898165f36e4e0a2ff15836cf6 Mon Sep 17 00:00:00 2001 From: Dex Wood Date: Fri, 1 Dec 2023 17:25:17 -0600 Subject: [PATCH 86/91] Add Ntfy notification support to ZED This commit adds the zed_notify_ntfy() function and hooks it into zed_notify(). This will allow ZED to send notifications to ntfy.sh or a self-hosted Ntfy service, which can be received on a desktop or mobile device. It is configured with ZED_NTFY_TOPIC, ZED_NTFY_URL, and ZED_NTFY_ACCESS_TOKEN variables in zed.rc. 
Reviewed-by: @classabbyamp Reviewed-by: Brian Behlendorf Signed-off-by: Dex Wood Closes #15584 --- cmd/zed/zed.d/zed-functions.sh | 98 ++++++++++++++++++++++++++++++++++ cmd/zed/zed.d/zed.rc | 22 ++++++++ 2 files changed, 120 insertions(+) diff --git a/cmd/zed/zed.d/zed-functions.sh b/cmd/zed/zed.d/zed-functions.sh index 49b6b54029a..3a2519633d0 100644 --- a/cmd/zed/zed.d/zed-functions.sh +++ b/cmd/zed/zed.d/zed-functions.sh @@ -205,6 +205,10 @@ zed_notify() [ "${rv}" -eq 0 ] && num_success=$((num_success + 1)) [ "${rv}" -eq 1 ] && num_failure=$((num_failure + 1)) + zed_notify_ntfy "${subject}" "${pathname}"; rv=$? + [ "${rv}" -eq 0 ] && num_success=$((num_success + 1)) + [ "${rv}" -eq 1 ] && num_failure=$((num_failure + 1)) + [ "${num_success}" -gt 0 ] && return 0 [ "${num_failure}" -gt 0 ] && return 1 return 2 @@ -527,6 +531,100 @@ zed_notify_pushover() } +# zed_notify_ntfy (subject, pathname) +# +# Send a notification via Ntfy.sh . +# The ntfy topic (ZED_NTFY_TOPIC) identifies the topic that the notification +# will be sent to Ntfy.sh server. The ntfy url (ZED_NTFY_URL) defines the +# self-hosted or provided hosted ntfy service location. The ntfy access token +# (ZED_NTFY_ACCESS_TOKEN) represents an +# access token that could be used if a topic is read/write protected. If a +# topic can be written to publicly, a ZED_NTFY_ACCESS_TOKEN is not required. +# +# Requires curl and sed executables to be installed in the standard PATH.
+# +# References +# https://docs.ntfy.sh +# +# Arguments +# subject: notification subject +# pathname: pathname containing the notification message (OPTIONAL) +# +# Globals +# ZED_NTFY_TOPIC +# ZED_NTFY_ACCESS_TOKEN (OPTIONAL) +# ZED_NTFY_URL +# +# Return +# 0: notification sent +# 1: notification failed +# 2: not configured +# +zed_notify_ntfy() +{ + local subject="$1" + local pathname="${2:-"/dev/null"}" + local msg_body + local msg_out + local msg_err + + [ -n "${ZED_NTFY_TOPIC}" ] || return 2 + local url="${ZED_NTFY_URL:-"https://ntfy.sh"}/${ZED_NTFY_TOPIC}" + + if [ ! -r "${pathname}" ]; then + zed_log_err "ntfy cannot read \"${pathname}\"" + return 1 + fi + + zed_check_cmd "curl" "sed" || return 1 + + # Read the message body in. + # + msg_body="$(cat "${pathname}")" + + if [ -z "${msg_body}" ] + then + msg_body=$subject + subject="" + fi + + # Send the POST request and check for errors. + # + if [ -n "${ZED_NTFY_ACCESS_TOKEN}" ]; then + msg_out="$( \ + curl \ + -u ":${ZED_NTFY_ACCESS_TOKEN}" \ + -H "Title: ${subject}" \ + -d "${msg_body}" \ + -H "Priority: high" \ + "${url}" \ + 2>/dev/null \ + )"; rv=$? + else + msg_out="$( \ + curl \ + -H "Title: ${subject}" \ + -d "${msg_body}" \ + -H "Priority: high" \ + "${url}" \ + 2>/dev/null \ + )"; rv=$? + fi + if [ "${rv}" -ne 0 ]; then + zed_log_err "curl exit=${rv}" + return 1 + fi + msg_err="$(echo "${msg_out}" \ + | sed -n -e 's/.*"errors" *:.*\[\(.*\)\].*/\1/p')" + if [ -n "${msg_err}" ]; then + zed_log_err "ntfy \"${msg_err}"\" + return 1 + fi + return 0 +} + + + # zed_rate_limit (tag, [interval]) # # Check whether an event of a given type [tag] has already occurred within the diff --git a/cmd/zed/zed.d/zed.rc b/cmd/zed/zed.d/zed.rc index 48051544d1a..bc269b155d7 100644 --- a/cmd/zed/zed.d/zed.rc +++ b/cmd/zed/zed.d/zed.rc @@ -147,3 +147,25 @@ ZED_SYSLOG_SUBCLASS_EXCLUDE="history_event" # help silence misbehaving drives. This assumes your drive enclosure fully # supports slot power control via sysfs. 
#ZED_POWER_OFF_ENCLOSURE_SLOT_ON_FAULT=1 + +## +# Ntfy topic +# This defines which topic will receive the ntfy notification. +# +# Disabled by default; uncomment to enable. +#ZED_NTFY_TOPIC="" + +## +# Ntfy access token (optional for public topics) +# This defines an access token which can be used +# to allow you to authenticate when sending to topics +# +# Disabled by default; uncomment to enable. +#ZED_NTFY_ACCESS_TOKEN="" + +## +# Ntfy Service URL +# This defines which service the ntfy call will be directed toward +# +# https://ntfy.sh by default; uncomment to enable an alternative service url. +#ZED_NTFY_URL="https://ntfy.sh" From a4978d260580d85491eee2ecb2435f56a7d4e50c Mon Sep 17 00:00:00 2001 From: Bi11 Date: Tue, 13 Feb 2024 08:58:47 +0800 Subject: [PATCH 87/91] zdb: Fix false leak report for BRT objects Fix a misreport in 'zdb -d' where it falsely marked BRT objects as leaked. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Yuxin Wang Closes #15882 --- cmd/zdb/zdb.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 19b0d61f09c..d81199765c6 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -8041,6 +8041,17 @@ dump_mos_leaks(spa_t *spa) } } + if (spa->spa_brt != NULL) { + brt_t *brt = spa->spa_brt; + for (uint64_t vdevid = 0; vdevid < brt->brt_nvdevs; vdevid++) { + brt_vdev_t *brtvd = &brt->brt_vdevs[vdevid]; + if (brtvd != NULL && brtvd->bv_initiated) { + mos_obj_refd(brtvd->bv_mos_brtvdev); + mos_obj_refd(brtvd->bv_mos_entries); + } + } + } + /* * Visit all allocated objects and make sure they are referenced. */ From d92fbe2150d72e70ff1fdf1ada89ed3245a47cd4 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Wed, 14 Feb 2024 14:29:19 -0800 Subject: [PATCH 88/91] [zfs-2.2.3] ZTS: Use correct bclone module param name on FreeBSD The bclone module names are not prefixed with 'zfs' on FreeBSD. This was causing test failures.
Signed-off-by: Tony Hutter --- tests/zfs-tests/include/tunables.cfg | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index 46cd42c4b8f..718c4cf2d8a 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -90,8 +90,8 @@ VOL_INHIBIT_DEV UNSUPPORTED zvol_inhibit_dev VOL_MODE vol.mode zvol_volmode VOL_RECURSIVE vol.recursive UNSUPPORTED VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq -BCLONE_ENABLED zfs_bclone_enabled zfs_bclone_enabled -BCLONE_WAIT_DIRTY zfs_bclone_wait_dirty zfs_bclone_wait_dirty +BCLONE_ENABLED bclone_enabled zfs_bclone_enabled +BCLONE_WAIT_DIRTY bclone_wait_dirty zfs_bclone_wait_dirty XATTR_COMPAT xattr_compat zfs_xattr_compat ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max From b62fd2cef9baede3fb9ee7dca980a0eb10d694f8 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Fri, 16 Feb 2024 08:59:56 -0800 Subject: [PATCH 89/91] ZTS: Skip cross-fs bclone tests if FreeBSD < 14.0 Skip cross filesystem block cloning tests on FreeBSD if running less than version 14.0. Cross filesystem copy_file_range() was added in FreeBSD 14. 
Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #15901 --- tests/test-runner/bin/zts-report.py.in | 22 ++++++++++++++- tests/zfs-tests/include/libtest.shlib | 27 ++++++++++++++----- .../functional/bclone/bclone_common.kshlib | 6 +++++ ...ck_cloning_copyfilerange_cross_dataset.ksh | 5 ++-- .../block_cloning_cross_enc_dataset.ksh | 5 ++-- 5 files changed, 51 insertions(+), 14 deletions(-) diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index edfdd47ee6d..ecc50f48715 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -138,7 +138,11 @@ idmap_reason = 'Idmapped mount needs kernel 5.12+' # copy_file_range() is not supported by all kernels # cfr_reason = 'Kernel copy_file_range support required' -cfr_cross_reason = 'copy_file_range(2) cross-filesystem needs kernel 5.3+' + +if sys.platform.startswith('freebsd'): + cfr_cross_reason = 'copy_file_range(2) cross-filesystem needs FreeBSD 14+' +else: + cfr_cross_reason = 'copy_file_range(2) cross-filesystem needs kernel 5.3+' # # These tests are known to fail, thus we use this list to prevent these @@ -268,6 +272,22 @@ if sys.platform.startswith('freebsd'): 'pool_checkpoint/checkpoint_indirect': ['FAIL', 12623], 'resilver/resilver_restart_001': ['FAIL', known_reason], 'snapshot/snapshot_002_pos': ['FAIL', '14831'], + 'bclone/bclone_crossfs_corner_cases': ['SKIP', cfr_cross_reason], + 'bclone/bclone_crossfs_corner_cases_limited': + ['SKIP', cfr_cross_reason], + 'bclone/bclone_crossfs_data': ['SKIP', cfr_cross_reason], + 'bclone/bclone_crossfs_embedded': ['SKIP', cfr_cross_reason], + 'bclone/bclone_crossfs_hole': ['SKIP', cfr_cross_reason], + 'bclone/bclone_diffprops_all': ['SKIP', cfr_cross_reason], + 'bclone/bclone_diffprops_checksum': ['SKIP', cfr_cross_reason], + 'bclone/bclone_diffprops_compress': ['SKIP', cfr_cross_reason], + 'bclone/bclone_diffprops_copies': ['SKIP', cfr_cross_reason], + 
'bclone/bclone_diffprops_recordsize': ['SKIP', cfr_cross_reason], + 'bclone/bclone_prop_sync': ['SKIP', cfr_cross_reason], + 'block_cloning/block_cloning_cross_enc_dataset': + ['SKIP', cfr_cross_reason], + 'block_cloning/block_cloning_copyfilerange_cross_dataset': + ['SKIP', cfr_cross_reason] }) elif sys.platform.startswith('linux'): maybe.update({ diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index b4d2b91dd47..dfab48d2cda 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -61,13 +61,8 @@ function compare_version_gte [ "$(printf "$1\n$2" | sort -V | tail -n1)" = "$1" ] } -# Linux kernel version comparison function -# -# $1 Linux version ("4.10", "2.6.32") or blank for installed Linux version -# -# Used for comparison: if [ $(linux_version) -ge $(linux_version "2.6.32") ] -# -function linux_version +# Helper function used by linux_version() and freebsd_version() +function kernel_version { typeset ver="$1" @@ -83,6 +78,24 @@ function linux_version echo $((version * 100000 + major * 1000 + minor)) } +# Linux kernel version comparison function +# +# $1 Linux version ("4.10", "2.6.32") or blank for installed Linux version +# +# Used for comparison: if [ $(linux_version) -ge $(linux_version "2.6.32") ] +function linux_version { + kernel_version "$1" +} + +# FreeBSD version comparison function +# +# $1 FreeBSD version ("13.2", "14.0") or blank for installed FreeBSD version +# +# Used for comparison: if [ $(freebsd_version) -ge $(freebsd_version "13.2") ] +function freebsd_version { + kernel_version "$1" +} + # Determine if this is a Linux test system # # Return 0 if platform Linux, 1 if otherwise diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib b/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib index beba01c0ed2..3b8eaea5bb5 100644 --- a/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib +++ 
b/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib @@ -42,6 +42,12 @@ function verify_crossfs_block_cloning if is_linux && [[ $(linux_version) -lt $(linux_version "5.3") ]]; then log_unsupported "copy_file_range can't copy cross-filesystem before Linux 5.3" fi + + # Cross dataset block cloning only supported on FreeBSD 14+ + # https://github.com/freebsd/freebsd-src/commit/969071be938c + if is_freebsd && [ $(freebsd_version) -lt $(freebsd_version 14.0) ] ; then + log_unsupported "Cloning across datasets not supported in $(uname -r)" + fi } # Unused. diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh index 43323c207a6..ad83d30291a 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh @@ -26,12 +26,11 @@ . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib +. $STF_SUITE/tests/functional/bclone/bclone_common.kshlib verify_runnable "global" -if is_linux && [[ $(linux_version) -lt $(linux_version "5.3") ]]; then - log_unsupported "copy_file_range can't copy cross-filesystem before Linux 5.3" -fi +verify_crossfs_block_cloning claim="The copy_file_range syscall can clone across datasets." diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_cross_enc_dataset.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_cross_enc_dataset.ksh index 34d3d269255..702e23267f7 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_cross_enc_dataset.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_cross_enc_dataset.ksh @@ -26,12 +26,11 @@ . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib +. 
$STF_SUITE/tests/functional/bclone/bclone_common.kshlib verify_runnable "global" -if is_linux && [[ $(linux_version) -lt $(linux_version "5.3") ]]; then - log_unsupported "copy_file_range can't copy cross-filesystem before Linux 5.3" -fi +verify_crossfs_block_cloning claim="Block cloning across encrypted datasets." From c0c4866f8a29a38b2bb683c267d7278e0020d90c Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Fri, 15 Dec 2023 12:51:41 -0500 Subject: [PATCH 90/91] dmu: Allow buffer fills to fail When ZFS overwrites a whole block, it does not bother to read the old content from disk. It is a good optimization, but if the buffer fill fails due to page fault or something else, the buffer ends up corrupted, neither keeping old content, nor getting the new one. On FreeBSD this is additionally complicated by page faults being blocked by VFS layer, always returning EFAULT on attempt to write from mmap()'ed but not yet cached address range. Normally it is not a big problem, since after original failure VFS will retry the write after reading the required data. The problem becomes worse in specific case when somebody tries to write into a file its own mmap()'ed content from the same location. In that situation the only copy of the data is getting corrupted on the page fault and the following retries only fixate the status quo. Block cloning makes this issue easier to reproduce, since it does not read the old data, unlike traditional file copy, that may work by chance. This patch provides the fill status to dmu_buf_fill_done(), that in case of error can destroy the corrupted buffer as if no write happened. One more complication in case of block cloning is that if error is possible during fill, dmu_buf_will_fill() must read the data via fall-back to dmu_buf_will_dirty(). It is required to allow in case of error restoring the buffer to a state after the cloning, not before it, that would happen if we just call dbuf_undirty().
Reviewed-by: Brian Behlendorf Reviewed-by: Rob Norris Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15665 --- include/os/freebsd/spl/sys/uio.h | 2 +- include/os/linux/spl/sys/uio.h | 2 +- include/sys/dbuf.h | 4 ++-- lib/libspl/include/sys/uio.h | 2 +- module/os/freebsd/zfs/dmu_os.c | 4 ++-- module/zfs/dbuf.c | 33 +++++++++++++++++++++++--------- module/zfs/dmu.c | 21 +++++++++----------- module/zfs/dmu_recv.c | 2 +- 8 files changed, 41 insertions(+), 29 deletions(-) diff --git a/include/os/freebsd/spl/sys/uio.h b/include/os/freebsd/spl/sys/uio.h index b71f2f2e562..b9d41903ea6 100644 --- a/include/os/freebsd/spl/sys/uio.h +++ b/include/os/freebsd/spl/sys/uio.h @@ -62,7 +62,7 @@ zfs_uio_setoffset(zfs_uio_t *uio, offset_t off) } static inline void -zfs_uio_advance(zfs_uio_t *uio, size_t size) +zfs_uio_advance(zfs_uio_t *uio, ssize_t size) { zfs_uio_resid(uio) -= size; zfs_uio_offset(uio) += size; diff --git a/include/os/linux/spl/sys/uio.h b/include/os/linux/spl/sys/uio.h index a4b600004c9..5e6ea8d3c22 100644 --- a/include/os/linux/spl/sys/uio.h +++ b/include/os/linux/spl/sys/uio.h @@ -95,7 +95,7 @@ zfs_uio_setoffset(zfs_uio_t *uio, offset_t off) } static inline void -zfs_uio_advance(zfs_uio_t *uio, size_t size) +zfs_uio_advance(zfs_uio_t *uio, ssize_t size) { uio->uio_resid -= size; uio->uio_loffset += size; diff --git a/include/sys/dbuf.h b/include/sys/dbuf.h index 1800a7e31da..f2a1535c916 100644 --- a/include/sys/dbuf.h +++ b/include/sys/dbuf.h @@ -379,8 +379,8 @@ dmu_buf_impl_t *dbuf_find(struct objset *os, uint64_t object, uint8_t level, int dbuf_read(dmu_buf_impl_t *db, zio_t *zio, uint32_t flags); void dmu_buf_will_clone(dmu_buf_t *db, dmu_tx_t *tx); void dmu_buf_will_not_fill(dmu_buf_t *db, dmu_tx_t *tx); -void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx); -void dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t *tx); +void dmu_buf_will_fill(dmu_buf_t *db, dmu_tx_t *tx, boolean_t canfail); +boolean_t dmu_buf_fill_done(dmu_buf_t *db, dmu_tx_t 
*tx, boolean_t failed); void dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx); dbuf_dirty_record_t *dbuf_dirty(dmu_buf_impl_t *db, dmu_tx_t *tx); dbuf_dirty_record_t *dbuf_dirty_lightweight(dnode_t *dn, uint64_t blkid, diff --git a/lib/libspl/include/sys/uio.h b/lib/libspl/include/sys/uio.h index e9e21819d4f..665bfc42301 100644 --- a/lib/libspl/include/sys/uio.h +++ b/lib/libspl/include/sys/uio.h @@ -90,7 +90,7 @@ zfs_uio_iov_at_index(zfs_uio_t *uio, uint_t idx, void **base, uint64_t *len) } static inline void -zfs_uio_advance(zfs_uio_t *uio, size_t size) +zfs_uio_advance(zfs_uio_t *uio, ssize_t size) { uio->uio_resid -= size; uio->uio_loffset += size; diff --git a/module/os/freebsd/zfs/dmu_os.c b/module/os/freebsd/zfs/dmu_os.c index a5f486b95db..c33ce01ab39 100644 --- a/module/os/freebsd/zfs/dmu_os.c +++ b/module/os/freebsd/zfs/dmu_os.c @@ -110,7 +110,7 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) - dmu_buf_will_fill(db, tx); + dmu_buf_will_fill(db, tx, B_FALSE); else dmu_buf_will_dirty(db, tx); @@ -126,7 +126,7 @@ dmu_write_pages(objset_t *os, uint64_t object, uint64_t offset, uint64_t size, } if (tocpy == db->db_size) - dmu_buf_fill_done(db, tx); + dmu_buf_fill_done(db, tx, B_FALSE); offset += tocpy; size -= tocpy; diff --git a/module/zfs/dbuf.c b/module/zfs/dbuf.c index 255add6cd24..280001bc34b 100644 --- a/module/zfs/dbuf.c +++ b/module/zfs/dbuf.c @@ -2734,7 +2734,7 @@ dmu_buf_will_not_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) } void -dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) +dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx, boolean_t canfail) { dmu_buf_impl_t *db = (dmu_buf_impl_t *)db_fake; @@ -2752,8 +2752,14 @@ dmu_buf_will_fill(dmu_buf_t *db_fake, dmu_tx_t *tx) * Block cloning: We will be completely overwriting a block * cloned in this transaction group, so let's undirty the * pending clone and mark the 
block as uncached. This will be - * as if the clone was never done. + * as if the clone was never done. But if the fill can fail + * we should have a way to return back to the cloned data. */ + if (canfail && dbuf_find_dirty_eq(db, tx->tx_txg) != NULL) { + mutex_exit(&db->db_mtx); + dmu_buf_will_dirty(db_fake, tx); + return; + } VERIFY(!dbuf_undirty(db, tx)); db->db_state = DB_UNCACHED; } @@ -2814,32 +2820,41 @@ dbuf_override_impl(dmu_buf_impl_t *db, const blkptr_t *bp, dmu_tx_t *tx) dl->dr_overridden_by.blk_birth = dr->dr_txg; } -void -dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx) +boolean_t +dmu_buf_fill_done(dmu_buf_t *dbuf, dmu_tx_t *tx, boolean_t failed) { (void) tx; dmu_buf_impl_t *db = (dmu_buf_impl_t *)dbuf; - dbuf_states_t old_state; mutex_enter(&db->db_mtx); DBUF_VERIFY(db); - old_state = db->db_state; - db->db_state = DB_CACHED; - if (old_state == DB_FILL) { + if (db->db_state == DB_FILL) { if (db->db_level == 0 && db->db_freed_in_flight) { ASSERT(db->db_blkid != DMU_BONUS_BLKID); /* we were freed while filling */ /* XXX dbuf_undirty? 
*/ memset(db->db.db_data, 0, db->db.db_size); db->db_freed_in_flight = FALSE; + db->db_state = DB_CACHED; DTRACE_SET_STATE(db, "fill done handling freed in flight"); + failed = B_FALSE; + } else if (failed) { + VERIFY(!dbuf_undirty(db, tx)); + db->db_buf = NULL; + dbuf_clear_data(db); + DTRACE_SET_STATE(db, "fill failed"); } else { + db->db_state = DB_CACHED; DTRACE_SET_STATE(db, "fill done"); } cv_broadcast(&db->db_changed); + } else { + db->db_state = DB_CACHED; + failed = B_FALSE; } mutex_exit(&db->db_mtx); + return (failed); } void @@ -2984,7 +2999,7 @@ dbuf_assign_arcbuf(dmu_buf_impl_t *db, arc_buf_t *buf, dmu_tx_t *tx) DTRACE_SET_STATE(db, "filling assigned arcbuf"); mutex_exit(&db->db_mtx); (void) dbuf_dirty(db, tx); - dmu_buf_fill_done(&db->db, tx); + dmu_buf_fill_done(&db->db, tx, B_FALSE); } void diff --git a/module/zfs/dmu.c b/module/zfs/dmu.c index 909605aa264..3215ab1c2a1 100644 --- a/module/zfs/dmu.c +++ b/module/zfs/dmu.c @@ -1115,14 +1115,14 @@ dmu_write_impl(dmu_buf_t **dbp, int numbufs, uint64_t offset, uint64_t size, ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) - dmu_buf_will_fill(db, tx); + dmu_buf_will_fill(db, tx, B_FALSE); else dmu_buf_will_dirty(db, tx); (void) memcpy((char *)db->db_data + bufoff, buf, tocpy); if (tocpy == db->db_size) - dmu_buf_fill_done(db, tx); + dmu_buf_fill_done(db, tx, B_FALSE); offset += tocpy; size -= tocpy; @@ -1330,27 +1330,24 @@ dmu_write_uio_dnode(dnode_t *dn, zfs_uio_t *uio, uint64_t size, dmu_tx_t *tx) ASSERT(size > 0); - bufoff = zfs_uio_offset(uio) - db->db_offset; + offset_t off = zfs_uio_offset(uio); + bufoff = off - db->db_offset; tocpy = MIN(db->db_size - bufoff, size); ASSERT(i == 0 || i == numbufs-1 || tocpy == db->db_size); if (tocpy == db->db_size) - dmu_buf_will_fill(db, tx); + dmu_buf_will_fill(db, tx, B_TRUE); else dmu_buf_will_dirty(db, tx); - /* - * XXX zfs_uiomove could block forever (eg.nfs-backed - * pages). 
There needs to be a uiolockdown() function - * to lock the pages in memory, so that zfs_uiomove won't - * block. - */ err = zfs_uio_fault_move((char *)db->db_data + bufoff, tocpy, UIO_WRITE, uio); - if (tocpy == db->db_size) - dmu_buf_fill_done(db, tx); + if (tocpy == db->db_size && dmu_buf_fill_done(db, tx, err)) { + /* The fill was reverted. Undo any uio progress. */ + zfs_uio_advance(uio, off - zfs_uio_offset(uio)); + } if (err) break; diff --git a/module/zfs/dmu_recv.c b/module/zfs/dmu_recv.c index 05ca91717c2..54aa60259ea 100644 --- a/module/zfs/dmu_recv.c +++ b/module/zfs/dmu_recv.c @@ -2532,7 +2532,7 @@ receive_spill(struct receive_writer_arg *rwa, struct drr_spill *drrs, * size of the provided arc_buf_t. */ if (db_spill->db_size != drrs->drr_length) { - dmu_buf_will_fill(db_spill, tx); + dmu_buf_will_fill(db_spill, tx, B_FALSE); VERIFY0(dbuf_spill_set_blksz(db_spill, drrs->drr_length, tx)); } From c883088df83ced3a2b8b38e6d89a5e63fb153ee4 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Tue, 30 Jan 2024 13:34:05 -0800 Subject: [PATCH 91/91] Tag zfs-2.2.3 META file and changelog updated. Signed-off-by: Tony Hutter --- META | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/META b/META index 05337a9c508..d64414e3222 100644 --- a/META +++ b/META @@ -1,7 +1,7 @@ Meta: 1 Name: zfs Branch: 1.0 -Version: 2.2.2 +Version: 2.2.3 Release: 1 Release-Tags: relext License: CDDL