From 6c2fc5691615e478d1360101b3ec22906e8ce7a8 Mon Sep 17 00:00:00 2001 From: Coleman Kane Date: Tue, 1 Aug 2023 11:27:58 -0400 Subject: [PATCH 01/19] Linux 6.5 compat: register_sysctl_table removed Additionally, the .child element of ctl_table has been removed in 6.5. This change adds a new test for the pre-6.5 register_sysctl_table() function, and uses the old code in that case. If it isn't found, then the parentage entries in the tables are removed, and the register_sysctl call is provided the paths of "kernel/spl", "kernel/spl/kmem", and "kernel/spl/kstat" directly, to populate each subdirectory over three calls, as is the new API. Reviewed-by: Brian Behlendorf Signed-off-by: Coleman Kane Closes #15098 --- config/kernel-register_sysctl_table.m4 | 27 ++++++++++++++++++++++++++ config/kernel.m4 | 2 ++ module/os/linux/spl/spl-proc.c | 26 ++++++++++++++++++++++--- 3 files changed, 52 insertions(+), 3 deletions(-) create mode 100644 config/kernel-register_sysctl_table.m4 diff --git a/config/kernel-register_sysctl_table.m4 b/config/kernel-register_sysctl_table.m4 new file mode 100644 index 00000000000..f18316b32b6 --- /dev/null +++ b/config/kernel-register_sysctl_table.m4 @@ -0,0 +1,27 @@ +dnl # +dnl # Linux 6.5 removes register_sysctl_table +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE], [ + ZFS_LINUX_TEST_SRC([has_register_sysctl_table], [ + #include + + static struct ctl_table dummy_table[] = { + {} + }; + + ],[ + struct ctl_table_header *h + __attribute((unused)) = register_sysctl_table(dummy_table); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE], [ + AC_MSG_CHECKING([whether register_sysctl_table exists]) + ZFS_LINUX_TEST_RESULT([has_register_sysctl_table], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_SYSCTL_REGISTER_TABLE, 1, + [sysctl_register_table exists]) + ],[ + AC_MSG_RESULT([no]) + ]) +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 1487fa2e779..28bd361d33f 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -160,6 
+160,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_FILEMAP ZFS_AC_KERNEL_SRC_WRITEPAGE_T ZFS_AC_KERNEL_SRC_RECLAIMED + ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE @@ -299,6 +300,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_FILEMAP ZFS_AC_KERNEL_WRITEPAGE_T ZFS_AC_KERNEL_RECLAIMED + ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_CPU_HAS_FEATURE diff --git a/module/os/linux/spl/spl-proc.c b/module/os/linux/spl/spl-proc.c index 01f5619e189..bcc356ae55b 100644 --- a/module/os/linux/spl/spl-proc.c +++ b/module/os/linux/spl/spl-proc.c @@ -624,6 +624,7 @@ static struct ctl_table spl_table[] = { .mode = 0644, .proc_handler = &proc_dohostid, }, +#ifdef HAVE_REGISTER_SYSCTL_TABLE { .procname = "kmem", .mode = 0555, @@ -634,9 +635,11 @@ static struct ctl_table spl_table[] = { .mode = 0555, .child = spl_kstat_table, }, +#endif {}, }; +#ifdef HAVE_REGISTER_SYSCTL_TABLE static struct ctl_table spl_dir[] = { { .procname = "spl", @@ -648,21 +651,38 @@ static struct ctl_table spl_dir[] = { static struct ctl_table spl_root[] = { { - .procname = "kernel", - .mode = 0555, - .child = spl_dir, + .procname = "kernel", + .mode = 0555, + .child = spl_dir, }, {} }; +#endif int spl_proc_init(void) { int rc = 0; +#ifdef HAVE_REGISTER_SYSCTL_TABLE spl_header = register_sysctl_table(spl_root); if (spl_header == NULL) return (-EUNATCH); +#else + spl_header = register_sysctl("kernel/spl", spl_table); + if (spl_header == NULL) + return (-EUNATCH); + + if (register_sysctl("kernel/spl/kmem", spl_kmem_table) == NULL) { + rc = -EUNATCH; + goto out; + } + + if (register_sysctl("kernel/spl/kstat", spl_kstat_table) == NULL) { + rc = -EUNATCH; + goto out; + } +#endif proc_spl = proc_mkdir("spl", NULL); if (proc_spl == NULL) { From c0f075c06b914b02e175f8de670b7b440630c7bc Mon Sep 17 00:00:00 2001 From: Coleman Kane Date: Tue, 1 Aug 2023 11:32:38 -0400 Subject: [PATCH 02/19] Linux 6.5 
compat: use disk_check_media_change when it exists When disk_check_media_change() exists, then define zfs_check_media_change() to simply call disk_check_media_change() on the bd_disk member of its argument. Since disk_check_media_change() is newer than when revalidate_disk was present in bops, we should be able to safely do this via a macro, instead of recreating a new implementation of the inline function that forces revalidation. Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson Signed-off-by: Coleman Kane Closes #15101 --- include/os/linux/kernel/linux/blkdev_compat.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h index e0f20ba3200..1641dd92a91 100644 --- a/include/os/linux/kernel/linux/blkdev_compat.h +++ b/include/os/linux/kernel/linux/blkdev_compat.h @@ -347,6 +347,7 @@ zfs_check_media_change(struct block_device *bdev) #define vdev_bdev_reread_part(bdev) zfs_check_media_change(bdev) #elif defined(HAVE_DISK_CHECK_MEDIA_CHANGE) #define vdev_bdev_reread_part(bdev) disk_check_media_change(bdev->bd_disk) +#define zfs_check_media_change(bdev) disk_check_media_change(bdev->bd_disk) #else /* * This is encountered if check_disk_change() and bdev_check_media_change() From d76de9fb170d58f81edac5729c365fc3fd60a22f Mon Sep 17 00:00:00 2001 From: Coleman Kane Date: Tue, 1 Aug 2023 11:37:20 -0400 Subject: [PATCH 03/19] Linux 6.5 compat: blkdev changes Multiple changes to the blkdev API were introduced in Linux 6.5. This includes passing (void* holder) to blkdev_put, adding a new blk_holder_ops* arg to blkdev_get_by_path, adding a new blk_mode_t type that replaces uses of fmode_t, and removing an argument from the release handler on block_device_operations that we weren't using. The open function definition has also changed to take gendisk* and blk_mode_t, so update it accordingly, too. 
Implement local wrappers for blkdev_get_by_path() and vdev_blkdev_put() so that the in-line calls are cleaner, and place the conditionally-compiled implementation details inside of both of these local wrappers. Both calls are exclusively used within vdev_disk.c, at this time. Add blk_mode_is_open_write() to test FMODE_WRITE / BLK_OPEN_WRITE The wrapper function is now used for testing using the appropriate method for the kernel, whether the open mode is writable or not. Emphasize fmode_t arg in zvol_release is not used Reviewed-by: Brian Behlendorf Signed-off-by: Coleman Kane Closes #15099 --- config/kernel-blkdev.m4 | 84 ++++++++++++++++++- config/kernel-block-device-operations.m4 | 35 +++++++- include/os/linux/kernel/linux/blkdev_compat.h | 6 ++ module/os/linux/zfs/vdev_disk.c | 65 ++++++++++++-- module/os/linux/zfs/zfs_vnops_os.c | 2 +- module/os/linux/zfs/zpl_ctldir.c | 2 +- module/os/linux/zfs/zvol_os.c | 28 ++++++- 7 files changed, 203 insertions(+), 19 deletions(-) diff --git a/config/kernel-blkdev.m4 b/config/kernel-blkdev.m4 index 887acee670b..e04a2bd2c3b 100644 --- a/config/kernel-blkdev.m4 +++ b/config/kernel-blkdev.m4 @@ -16,12 +16,63 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH], [ ]) ]) +dnl # +dnl # 6.5.x API change, +dnl # blkdev_get_by_path() takes 4 args +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH_4ARG], [ + ZFS_LINUX_TEST_SRC([blkdev_get_by_path_4arg], [ + #include + #include + ], [ + struct block_device *bdev __attribute__ ((unused)) = NULL; + const char *path = "path"; + fmode_t mode = 0; + void *holder = NULL; + struct blk_holder_ops h; + + bdev = blkdev_get_by_path(path, mode, holder, &h); + ]) +]) + AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_GET_BY_PATH], [ - AC_MSG_CHECKING([whether blkdev_get_by_path() exists]) + AC_MSG_CHECKING([whether blkdev_get_by_path() exists and takes 3 args]) ZFS_LINUX_TEST_RESULT([blkdev_get_by_path], [ AC_MSG_RESULT(yes) ], [ - ZFS_LINUX_TEST_ERROR([blkdev_get_by_path()]) + AC_MSG_RESULT(no) + 
AC_MSG_CHECKING([whether blkdev_get_by_path() exists and takes 4 args]) + ZFS_LINUX_TEST_RESULT([blkdev_get_by_path_4arg], [ + AC_DEFINE(HAVE_BLKDEV_GET_BY_PATH_4ARG, 1, + [blkdev_get_by_path() exists and takes 4 args]) + AC_MSG_RESULT(yes) + ], [ + ZFS_LINUX_TEST_ERROR([blkdev_get_by_path()]) + ]) + ]) +]) + +dnl # +dnl # 6.5.x API change +dnl # blk_mode_t was added as a type to supersede some places where fmode_t +dnl # is used +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_BLK_MODE_T], [ + ZFS_LINUX_TEST_SRC([blk_mode_t], [ + #include + #include + ], [ + blk_mode_t m __attribute((unused)) = (blk_mode_t)0; + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BLK_MODE_T], [ + AC_MSG_CHECKING([whether blk_mode_t is defined]) + ZFS_LINUX_TEST_RESULT([blk_mode_t], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLK_MODE_T, 1, [blk_mode_t is defined]) + ], [ + AC_MSG_RESULT(no) ]) ]) @@ -41,12 +92,35 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_PUT], [ ]) ]) +dnl # +dnl # 6.5.x API change. +dnl # blkdev_put() takes (void* holder) as arg 2 +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_PUT_HOLDER], [ + ZFS_LINUX_TEST_SRC([blkdev_put_holder], [ + #include + #include + ], [ + struct block_device *bdev = NULL; + void *holder = NULL; + + blkdev_put(bdev, holder); + ]) +]) + AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_PUT], [ AC_MSG_CHECKING([whether blkdev_put() exists]) ZFS_LINUX_TEST_RESULT([blkdev_put], [ AC_MSG_RESULT(yes) ], [ - ZFS_LINUX_TEST_ERROR([blkdev_put()]) + AC_MSG_CHECKING([whether blkdev_put() accepts void* as arg 2]) + ZFS_LINUX_TEST_RESULT([blkdev_put_holder], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLKDEV_PUT_HOLDER, 1, + [blkdev_put() accepts void* as arg 2]) + ], [ + ZFS_LINUX_TEST_ERROR([blkdev_put()]) + ]) ]) ]) @@ -495,7 +569,9 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BLK_STS_RESV_CONFLICT], [ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [ ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH + ZFS_AC_KERNEL_SRC_BLKDEV_GET_BY_PATH_4ARG ZFS_AC_KERNEL_SRC_BLKDEV_PUT + ZFS_AC_KERNEL_SRC_BLKDEV_PUT_HOLDER 
ZFS_AC_KERNEL_SRC_BLKDEV_REREAD_PART ZFS_AC_KERNEL_SRC_BLKDEV_INVALIDATE_BDEV ZFS_AC_KERNEL_SRC_BLKDEV_LOOKUP_BDEV @@ -510,6 +586,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV], [ ZFS_AC_KERNEL_SRC_BLKDEV_PART_TO_DEV ZFS_AC_KERNEL_SRC_BLKDEV_DISK_CHECK_MEDIA_CHANGE ZFS_AC_KERNEL_SRC_BLKDEV_BLK_STS_RESV_CONFLICT + ZFS_AC_KERNEL_SRC_BLKDEV_BLK_MODE_T ]) AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [ @@ -530,4 +607,5 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV], [ ZFS_AC_KERNEL_BLKDEV_PART_TO_DEV ZFS_AC_KERNEL_BLKDEV_DISK_CHECK_MEDIA_CHANGE ZFS_AC_KERNEL_BLKDEV_BLK_STS_RESV_CONFLICT + ZFS_AC_KERNEL_BLKDEV_BLK_MODE_T ]) diff --git a/config/kernel-block-device-operations.m4 b/config/kernel-block-device-operations.m4 index 84e39dc8a2f..d13c1337b1f 100644 --- a/config/kernel-block-device-operations.m4 +++ b/config/kernel-block-device-operations.m4 @@ -49,12 +49,42 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID], [ ], [], []) ]) +dnl # +dnl # 6.5.x API change +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG], [ + ZFS_LINUX_TEST_SRC([block_device_operations_release_void_1arg], [ + #include + + void blk_release(struct gendisk *g) { + (void) g; + return; + } + + static const struct block_device_operations + bops __attribute__ ((unused)) = { + .open = NULL, + .release = blk_release, + .ioctl = NULL, + .compat_ioctl = NULL, + }; + ], [], []) +]) + AC_DEFUN([ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID], [ - AC_MSG_CHECKING([whether bops->release() is void]) + AC_MSG_CHECKING([whether bops->release() is void and takes 2 args]) ZFS_LINUX_TEST_RESULT([block_device_operations_release_void], [ AC_MSG_RESULT(yes) ],[ - ZFS_LINUX_TEST_ERROR([bops->release()]) + AC_MSG_RESULT(no) + AC_MSG_CHECKING([whether bops->release() is void and takes 1 arg]) + ZFS_LINUX_TEST_RESULT([block_device_operations_release_void_1arg], [ + AC_MSG_RESULT(yes) + AC_DEFINE([HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG], [1], + [Define if release() in block_device_operations takes 1 arg]) + 
],[ + ZFS_LINUX_TEST_ERROR([bops->release()]) + ]) ]) ]) @@ -92,6 +122,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK], [ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS], [ ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_CHECK_EVENTS ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_RELEASE_VOID + ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG ZFS_AC_KERNEL_SRC_BLOCK_DEVICE_OPERATIONS_REVALIDATE_DISK ]) diff --git a/include/os/linux/kernel/linux/blkdev_compat.h b/include/os/linux/kernel/linux/blkdev_compat.h index 1641dd92a91..f111e648ccf 100644 --- a/include/os/linux/kernel/linux/blkdev_compat.h +++ b/include/os/linux/kernel/linux/blkdev_compat.h @@ -398,6 +398,12 @@ vdev_lookup_bdev(const char *path, dev_t *dev) #endif } +#if defined(HAVE_BLK_MODE_T) +#define blk_mode_is_open_write(flag) ((flag) & BLK_OPEN_WRITE) +#else +#define blk_mode_is_open_write(flag) ((flag) & FMODE_WRITE) +#endif + /* * Kernels without bio_set_op_attrs use bi_rw for the bio flags. */ diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index 925ee9d9fe9..48ac55f0703 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -80,9 +80,22 @@ typedef struct dio_request { static unsigned int zfs_vdev_failfast_mask = 1; +#ifdef HAVE_BLK_MODE_T +static blk_mode_t +#else static fmode_t +#endif vdev_bdev_mode(spa_mode_t spa_mode) { +#ifdef HAVE_BLK_MODE_T + blk_mode_t mode = 0; + + if (spa_mode & SPA_MODE_READ) + mode |= BLK_OPEN_READ; + + if (spa_mode & SPA_MODE_WRITE) + mode |= BLK_OPEN_WRITE; +#else fmode_t mode = 0; if (spa_mode & SPA_MODE_READ) @@ -90,6 +103,7 @@ vdev_bdev_mode(spa_mode_t spa_mode) if (spa_mode & SPA_MODE_WRITE) mode |= FMODE_WRITE; +#endif return (mode); } @@ -197,12 +211,47 @@ vdev_disk_kobj_evt_post(vdev_t *v) } } +#if !defined(HAVE_BLKDEV_GET_BY_PATH_4ARG) +/* + * Define a dummy struct blk_holder_ops for kernel versions + * prior to 6.5. 
+ */ +struct blk_holder_ops {}; +#endif + +static struct block_device * +vdev_blkdev_get_by_path(const char *path, spa_mode_t mode, void *holder, + const struct blk_holder_ops *hops) +{ +#ifdef HAVE_BLKDEV_GET_BY_PATH_4ARG + return (blkdev_get_by_path(path, + vdev_bdev_mode(mode) | BLK_OPEN_EXCL, holder, hops)); +#else + return (blkdev_get_by_path(path, + vdev_bdev_mode(mode) | FMODE_EXCL, holder)); +#endif +} + +static void +vdev_blkdev_put(struct block_device *bdev, spa_mode_t mode, void *holder) +{ +#ifdef HAVE_BLKDEV_PUT_HOLDER + return (blkdev_put(bdev, holder)); +#else + return (blkdev_put(bdev, vdev_bdev_mode(mode) | FMODE_EXCL)); +#endif +} + static int vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, uint64_t *logical_ashift, uint64_t *physical_ashift) { struct block_device *bdev; +#ifdef HAVE_BLK_MODE_T + blk_mode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa)); +#else fmode_t mode = vdev_bdev_mode(spa_mode(v->vdev_spa)); +#endif hrtime_t timeout = MSEC2NSEC(zfs_vdev_open_timeout_ms); vdev_disk_t *vd; @@ -252,15 +301,15 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, reread_part = B_TRUE; } - blkdev_put(bdev, mode | FMODE_EXCL); + vdev_blkdev_put(bdev, mode, zfs_vdev_holder); } if (reread_part) { - bdev = blkdev_get_by_path(disk_name, mode | FMODE_EXCL, - zfs_vdev_holder); + bdev = vdev_blkdev_get_by_path(disk_name, mode, + zfs_vdev_holder, NULL); if (!IS_ERR(bdev)) { int error = vdev_bdev_reread_part(bdev); - blkdev_put(bdev, mode | FMODE_EXCL); + vdev_blkdev_put(bdev, mode, zfs_vdev_holder); if (error == 0) { timeout = MSEC2NSEC( zfs_vdev_open_timeout_ms * 2); @@ -305,8 +354,8 @@ vdev_disk_open(vdev_t *v, uint64_t *psize, uint64_t *max_psize, hrtime_t start = gethrtime(); bdev = ERR_PTR(-ENXIO); while (IS_ERR(bdev) && ((gethrtime() - start) < timeout)) { - bdev = blkdev_get_by_path(v->vdev_path, mode | FMODE_EXCL, - zfs_vdev_holder); + bdev = vdev_blkdev_get_by_path(v->vdev_path, mode, + zfs_vdev_holder, NULL); if 
(unlikely(PTR_ERR(bdev) == -ENOENT)) { /* * There is no point of waiting since device is removed @@ -382,8 +431,8 @@ vdev_disk_close(vdev_t *v) return; if (vd->vd_bdev != NULL) { - blkdev_put(vd->vd_bdev, - vdev_bdev_mode(spa_mode(v->vdev_spa)) | FMODE_EXCL); + vdev_blkdev_put(vd->vd_bdev, spa_mode(v->vdev_spa), + zfs_vdev_holder); } rw_destroy(&vd->vd_lock); diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index 234c4d5ef0e..33baac9db06 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -186,7 +186,7 @@ zfs_open(struct inode *ip, int mode, int flag, cred_t *cr) return (error); /* Honor ZFS_APPENDONLY file attribute */ - if ((mode & FMODE_WRITE) && (zp->z_pflags & ZFS_APPENDONLY) && + if (blk_mode_is_open_write(mode) && (zp->z_pflags & ZFS_APPENDONLY) && ((flag & O_APPEND) == 0)) { zfs_exit(zfsvfs, FTAG); return (SET_ERROR(EPERM)); diff --git a/module/os/linux/zfs/zpl_ctldir.c b/module/os/linux/zfs/zpl_ctldir.c index 68a7de78f47..7786444fea3 100644 --- a/module/os/linux/zfs/zpl_ctldir.c +++ b/module/os/linux/zfs/zpl_ctldir.c @@ -42,7 +42,7 @@ static int zpl_common_open(struct inode *ip, struct file *filp) { - if (filp->f_mode & FMODE_WRITE) + if (blk_mode_is_open_write(filp->f_mode)) return (-EACCES); return (generic_file_open(ip, filp)); diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 38bc8e2c4ee..7a95b54bdf0 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -671,7 +671,11 @@ zvol_request(struct request_queue *q, struct bio *bio) } static int +#ifdef HAVE_BLK_MODE_T +zvol_open(struct gendisk *disk, blk_mode_t flag) +#else zvol_open(struct block_device *bdev, fmode_t flag) +#endif { zvol_state_t *zv; int error = 0; @@ -686,10 +690,14 @@ retry: /* * Obtain a copy of private_data under the zvol_state_lock to make * sure that either the result of zvol free code path setting - * bdev->bd_disk->private_data to NULL is observed, or 
zvol_os_free() + * disk->private_data to NULL is observed, or zvol_os_free() * is not called on this zv because of the positive zv_open_count. */ +#ifdef HAVE_BLK_MODE_T + zv = disk->private_data; +#else zv = bdev->bd_disk->private_data; +#endif if (zv == NULL) { rw_exit(&zvol_state_lock); return (SET_ERROR(-ENXIO)); @@ -769,14 +777,15 @@ retry: } } - error = -zvol_first_open(zv, !(flag & FMODE_WRITE)); + error = -zvol_first_open(zv, !(blk_mode_is_open_write(flag))); if (drop_namespace) mutex_exit(&spa_namespace_lock); } if (error == 0) { - if ((flag & FMODE_WRITE) && (zv->zv_flags & ZVOL_RDONLY)) { + if ((blk_mode_is_open_write(flag)) && + (zv->zv_flags & ZVOL_RDONLY)) { if (zv->zv_open_count == 0) zvol_last_close(zv); @@ -791,14 +800,25 @@ retry: rw_exit(&zv->zv_suspend_lock); if (error == 0) +#ifdef HAVE_BLK_MODE_T + disk_check_media_change(disk); +#else zfs_check_media_change(bdev); +#endif return (error); } static void -zvol_release(struct gendisk *disk, fmode_t mode) +#ifdef HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG +zvol_release(struct gendisk *disk) +#else +zvol_release(struct gendisk *disk, fmode_t unused) +#endif { +#if !defined(HAVE_BLOCK_DEVICE_OPERATIONS_RELEASE_1ARG) + (void) unused; +#endif zvol_state_t *zv; boolean_t drop_suspend = B_TRUE; From 0bf2c5365ed3afa65545393d8da2317699f18b30 Mon Sep 17 00:00:00 2001 From: Coleman Kane Date: Sun, 23 Jul 2023 01:34:29 -0400 Subject: [PATCH 04/19] Linux 6.4 compat: iter_iov() function now used to get old iov member The iov_iter->iov member is now iov_iter->__iov and must be accessed via the accessor function iter_iov(). Create a wrapper that is conditionally compiled to use the access method appropriate for the target kernel version. 
Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson Signed-off-by: Coleman Kane Closes #15100 --- config/kernel-vfs-iov_iter.m4 | 23 +++++++++++++++++++++++ include/os/linux/spl/sys/uio.h | 6 ++++++ module/os/linux/zfs/zpl_file.c | 8 +++----- 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/config/kernel-vfs-iov_iter.m4 b/config/kernel-vfs-iov_iter.m4 index e0617faab02..cc5a7ab0c23 100644 --- a/config/kernel-vfs-iov_iter.m4 +++ b/config/kernel-vfs-iov_iter.m4 @@ -93,6 +93,14 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_IOV_ITER], [ struct iov_iter iter = { 0 }; __attribute__((unused)) enum iter_type i = iov_iter_type(&iter); ]) + + ZFS_LINUX_TEST_SRC([iter_iov], [ + #include + #include + ],[ + struct iov_iter iter = { 0 }; + __attribute__((unused)) const struct iovec *iov = iter_iov(&iter); + ]) ]) AC_DEFUN([ZFS_AC_KERNEL_VFS_IOV_ITER], [ @@ -201,4 +209,19 @@ AC_DEFUN([ZFS_AC_KERNEL_VFS_IOV_ITER], [ AC_DEFINE(HAVE_VFS_IOV_ITER, 1, [All required iov_iter interfaces are available]) ]) + + dnl # + dnl # Kernel 6.4 introduces the iter_iov() function that returns the + dnl # __iov member of an iov_iter*. The iov member was renamed to this + dnl # __iov member, and is intended to be accessed via the helper + dnl # function now. 
+ dnl # + AC_MSG_CHECKING([whether iter_iov() is available]) + ZFS_LINUX_TEST_RESULT([iter_iov], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_ITER_IOV, 1, + [iter_iov() is available]) + ],[ + AC_MSG_RESULT(no) + ]) ]) diff --git a/include/os/linux/spl/sys/uio.h b/include/os/linux/spl/sys/uio.h index fe2b5c07a01..082e930e46b 100644 --- a/include/os/linux/spl/sys/uio.h +++ b/include/os/linux/spl/sys/uio.h @@ -173,4 +173,10 @@ zfs_uio_iov_iter_init(zfs_uio_t *uio, struct iov_iter *iter, offset_t offset, } #endif +#if defined(HAVE_ITER_IOV) +#define zfs_uio_iter_iov(iter) iter_iov((iter)) +#else +#define zfs_uio_iter_iov(iter) (iter)->iov +#endif + #endif /* SPL_UIO_H */ diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index 73526db731c..aedafd6002b 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -300,17 +300,15 @@ zpl_uio_init(zfs_uio_t *uio, struct kiocb *kiocb, struct iov_iter *to, { #if defined(HAVE_VFS_IOV_ITER) zfs_uio_iov_iter_init(uio, to, pos, count, skip); -#else -#ifdef HAVE_IOV_ITER_TYPE - zfs_uio_iovec_init(uio, to->iov, to->nr_segs, pos, +#elif defined(HAVE_IOV_ITER_TYPE) + zfs_uio_iovec_init(uio, zfs_uio_iter_iov(to), to->nr_segs, pos, iov_iter_type(to) & ITER_KVEC ? UIO_SYSSPACE : UIO_USERSPACE, count, skip); #else - zfs_uio_iovec_init(uio, to->iov, to->nr_segs, pos, + zfs_uio_iovec_init(uio, zfs_uio_iter_iov(to), to->nr_segs, pos, to->type & ITER_KVEC ? UIO_SYSSPACE : UIO_USERSPACE, count, skip); #endif -#endif } static ssize_t From 8be6308e85cffb2247753033fbfc3641f731af51 Mon Sep 17 00:00:00 2001 From: Coleman Kane Date: Sun, 30 Jul 2023 15:23:47 -0400 Subject: [PATCH 05/19] Linux 4.20 compat: wrapper function for iov_iter type access An iov_iter_type() function to access the "type" member of the struct iov_iter was added at one point. Move the conditional logic to decide which method to use for accessing it into a macro and simplify the zpl_uio_init code. 
Reviewed-by: Brian Behlendorf Reviewed-by: Brian Atkinson Signed-off-by: Coleman Kane Closes #15100 --- include/os/linux/spl/sys/uio.h | 6 ++++++ module/os/linux/zfs/zpl_file.c | 7 ++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/include/os/linux/spl/sys/uio.h b/include/os/linux/spl/sys/uio.h index 082e930e46b..cce097e16fb 100644 --- a/include/os/linux/spl/sys/uio.h +++ b/include/os/linux/spl/sys/uio.h @@ -179,4 +179,10 @@ zfs_uio_iov_iter_init(zfs_uio_t *uio, struct iov_iter *iter, offset_t offset, #define zfs_uio_iter_iov(iter) (iter)->iov #endif +#if defined(HAVE_IOV_ITER_TYPE) +#define zfs_uio_iov_iter_type(iter) iov_iter_type((iter)) +#else +#define zfs_uio_iov_iter_type(iter) (iter)->type +#endif + #endif /* SPL_UIO_H */ diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index aedafd6002b..f6af2ebd116 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -300,13 +300,10 @@ zpl_uio_init(zfs_uio_t *uio, struct kiocb *kiocb, struct iov_iter *to, { #if defined(HAVE_VFS_IOV_ITER) zfs_uio_iov_iter_init(uio, to, pos, count, skip); -#elif defined(HAVE_IOV_ITER_TYPE) - zfs_uio_iovec_init(uio, zfs_uio_iter_iov(to), to->nr_segs, pos, - iov_iter_type(to) & ITER_KVEC ? UIO_SYSSPACE : UIO_USERSPACE, - count, skip); #else zfs_uio_iovec_init(uio, zfs_uio_iter_iov(to), to->nr_segs, pos, - to->type & ITER_KVEC ? UIO_SYSSPACE : UIO_USERSPACE, + zfs_uio_iov_iter_type(to) & ITER_KVEC ? + UIO_SYSSPACE : UIO_USERSPACE, count, skip); #endif } From 3a68f3c50f82ec87d74b255e3a0db504ff366211 Mon Sep 17 00:00:00 2001 From: Brian Atkinson Date: Tue, 1 Aug 2023 17:48:19 -0400 Subject: [PATCH 06/19] Revert "Linux 6.5 compat: register_sysctl_table removed" This reverts commit b35374fd6474603170fd9a3c7503da6eb13ac712 as there are error messages when loading the SPL module. Errors seemed to be tied to a duplicate entry. 
Reviewed-by: Brian Behlendorf Signed-off-by: Brian Atkinson Closes #15134 --- config/kernel-register_sysctl_table.m4 | 27 -------------------------- config/kernel.m4 | 2 -- module/os/linux/spl/spl-proc.c | 26 +++---------------------- 3 files changed, 3 insertions(+), 52 deletions(-) delete mode 100644 config/kernel-register_sysctl_table.m4 diff --git a/config/kernel-register_sysctl_table.m4 b/config/kernel-register_sysctl_table.m4 deleted file mode 100644 index f18316b32b6..00000000000 --- a/config/kernel-register_sysctl_table.m4 +++ /dev/null @@ -1,27 +0,0 @@ -dnl # -dnl # Linux 6.5 removes register_sysctl_table -dnl # -AC_DEFUN([ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE], [ - ZFS_LINUX_TEST_SRC([has_register_sysctl_table], [ - #include - - static struct ctl_table dummy_table[] = { - {} - }; - - ],[ - struct ctl_table_header *h - __attribute((unused)) = register_sysctl_table(dummy_table); - ]) -]) - -AC_DEFUN([ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE], [ - AC_MSG_CHECKING([whether register_sysctl_table exists]) - ZFS_LINUX_TEST_RESULT([has_register_sysctl_table], [ - AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_SYSCTL_REGISTER_TABLE, 1, - [sysctl_register_table exists]) - ],[ - AC_MSG_RESULT([no]) - ]) -]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 28bd361d33f..1487fa2e779 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -160,7 +160,6 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_FILEMAP ZFS_AC_KERNEL_SRC_WRITEPAGE_T ZFS_AC_KERNEL_SRC_RECLAIMED - ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE @@ -300,7 +299,6 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_FILEMAP ZFS_AC_KERNEL_WRITEPAGE_T ZFS_AC_KERNEL_RECLAIMED - ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_CPU_HAS_FEATURE diff --git a/module/os/linux/spl/spl-proc.c b/module/os/linux/spl/spl-proc.c index bcc356ae55b..01f5619e189 100644 --- a/module/os/linux/spl/spl-proc.c +++ 
b/module/os/linux/spl/spl-proc.c @@ -624,7 +624,6 @@ static struct ctl_table spl_table[] = { .mode = 0644, .proc_handler = &proc_dohostid, }, -#ifdef HAVE_REGISTER_SYSCTL_TABLE { .procname = "kmem", .mode = 0555, @@ -635,11 +634,9 @@ static struct ctl_table spl_table[] = { .mode = 0555, .child = spl_kstat_table, }, -#endif {}, }; -#ifdef HAVE_REGISTER_SYSCTL_TABLE static struct ctl_table spl_dir[] = { { .procname = "spl", @@ -651,38 +648,21 @@ static struct ctl_table spl_dir[] = { static struct ctl_table spl_root[] = { { - .procname = "kernel", - .mode = 0555, - .child = spl_dir, + .procname = "kernel", + .mode = 0555, + .child = spl_dir, }, {} }; -#endif int spl_proc_init(void) { int rc = 0; -#ifdef HAVE_REGISTER_SYSCTL_TABLE spl_header = register_sysctl_table(spl_root); if (spl_header == NULL) return (-EUNATCH); -#else - spl_header = register_sysctl("kernel/spl", spl_table); - if (spl_header == NULL) - return (-EUNATCH); - - if (register_sysctl("kernel/spl/kmem", spl_kmem_table) == NULL) { - rc = -EUNATCH; - goto out; - } - - if (register_sysctl("kernel/spl/kstat", spl_kstat_table) == NULL) { - rc = -EUNATCH; - goto out; - } -#endif proc_spl = proc_mkdir("spl", NULL); if (proc_spl == NULL) { From 31a4673c05ea942498a278d9dd519f251b501db1 Mon Sep 17 00:00:00 2001 From: Coleman Kane Date: Wed, 2 Aug 2023 17:05:46 -0400 Subject: [PATCH 07/19] Linux 6.5 compat: register_sysctl_table removed Additionally, the .child element of ctl_table has been removed in 6.5. This change adds a new test for the pre-6.5 register_sysctl_table() function, and uses the old code in that case. If it isn't found, then the parentage entries in the tables are removed, and the register_sysctl call is provided the paths of "kernel/spl", "kernel/spl/kmem", and "kernel/spl/kstat" directly, to populate each subdirectory over three calls, as is the new API. 
Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Coleman Kane Closes #15138 --- config/kernel-register_sysctl_table.m4 | 27 ++++++++++++++++++++++++++ config/kernel.m4 | 2 ++ module/os/linux/spl/spl-proc.c | 26 ++++++++++++++++++++++--- 3 files changed, 52 insertions(+), 3 deletions(-) create mode 100644 config/kernel-register_sysctl_table.m4 diff --git a/config/kernel-register_sysctl_table.m4 b/config/kernel-register_sysctl_table.m4 new file mode 100644 index 00000000000..a5e934f56d2 --- /dev/null +++ b/config/kernel-register_sysctl_table.m4 @@ -0,0 +1,27 @@ +dnl # +dnl # Linux 6.5 removes register_sysctl_table +dnl # +AC_DEFUN([ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE], [ + ZFS_LINUX_TEST_SRC([has_register_sysctl_table], [ + #include + + static struct ctl_table dummy_table[] = { + {} + }; + + ],[ + struct ctl_table_header *h + __attribute((unused)) = register_sysctl_table(dummy_table); + ]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE], [ + AC_MSG_CHECKING([whether register_sysctl_table exists]) + ZFS_LINUX_TEST_RESULT([has_register_sysctl_table], [ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_REGISTER_SYSCTL_TABLE, 1, + [register_sysctl_table exists]) + ],[ + AC_MSG_RESULT([no]) + ]) +]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 1487fa2e779..28bd361d33f 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -160,6 +160,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_FILEMAP ZFS_AC_KERNEL_SRC_WRITEPAGE_T ZFS_AC_KERNEL_SRC_RECLAIMED + ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE @@ -299,6 +300,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_FILEMAP ZFS_AC_KERNEL_WRITEPAGE_T ZFS_AC_KERNEL_RECLAIMED + ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_CPU_HAS_FEATURE diff --git a/module/os/linux/spl/spl-proc.c b/module/os/linux/spl/spl-proc.c index 01f5619e189..bcc356ae55b 100644 --- a/module/os/linux/spl/spl-proc.c 
+++ b/module/os/linux/spl/spl-proc.c @@ -624,6 +624,7 @@ static struct ctl_table spl_table[] = { .mode = 0644, .proc_handler = &proc_dohostid, }, +#ifdef HAVE_REGISTER_SYSCTL_TABLE { .procname = "kmem", .mode = 0555, @@ -634,9 +635,11 @@ static struct ctl_table spl_table[] = { .mode = 0555, .child = spl_kstat_table, }, +#endif {}, }; +#ifdef HAVE_REGISTER_SYSCTL_TABLE static struct ctl_table spl_dir[] = { { .procname = "spl", @@ -648,21 +651,38 @@ static struct ctl_table spl_dir[] = { static struct ctl_table spl_root[] = { { - .procname = "kernel", - .mode = 0555, - .child = spl_dir, + .procname = "kernel", + .mode = 0555, + .child = spl_dir, }, {} }; +#endif int spl_proc_init(void) { int rc = 0; +#ifdef HAVE_REGISTER_SYSCTL_TABLE spl_header = register_sysctl_table(spl_root); if (spl_header == NULL) return (-EUNATCH); +#else + spl_header = register_sysctl("kernel/spl", spl_table); + if (spl_header == NULL) + return (-EUNATCH); + + if (register_sysctl("kernel/spl/kmem", spl_kmem_table) == NULL) { + rc = -EUNATCH; + goto out; + } + + if (register_sysctl("kernel/spl/kstat", spl_kstat_table) == NULL) { + rc = -EUNATCH; + goto out; + } +#endif proc_spl = proc_mkdir("spl", NULL); if (proc_spl == NULL) { From 5a22de144abf2829bc8112e17a7a7e542da53dc5 Mon Sep 17 00:00:00 2001 From: Coleman Kane Date: Mon, 7 Aug 2023 18:47:46 -0400 Subject: [PATCH 08/19] Linux 6.5 compat: replace generic_file_splice_read with filemap_splice_read The generic_file_splice_read function was removed in Linux 6.5 in favor of filemap_splice_read. Add an autoconf test for filemap_splice_read and use it if it is found as the handler for .splice_read in the file_operations struct. Additionally, ITER_PIPE was removed in 6.5. This change removes the ITER_* macros that OpenZFS doesn't use from being tested in config/kernel-vfs-iov_iter.m4. The removal of ITER_PIPE was causing the test to fail, which also affected the code responsible for setting the .splice_read handler, above. 
That behavior caused run-time panics on Linux 6.5. Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Coleman Kane Closes #15155 --- config/kernel-filemap-splice-read.m4 | 25 +++++++++++++++++++++++++ config/kernel-vfs-iov_iter.m4 | 3 +-- config/kernel.m4 | 2 ++ module/os/linux/zfs/zpl_file.c | 4 ++++ 4 files changed, 32 insertions(+), 2 deletions(-) create mode 100644 config/kernel-filemap-splice-read.m4 diff --git a/config/kernel-filemap-splice-read.m4 b/config/kernel-filemap-splice-read.m4 new file mode 100644 index 00000000000..5199b7373e4 --- /dev/null +++ b/config/kernel-filemap-splice-read.m4 @@ -0,0 +1,25 @@ +AC_DEFUN([ZFS_AC_KERNEL_SRC_FILEMAP_SPLICE_READ], [ + dnl # + dnl # Kernel 6.5 - generic_file_splice_read was removed in favor + dnl # of filemap_splice_read for the .splice_read member of the + dnl # file_operations struct. + dnl # + ZFS_LINUX_TEST_SRC([has_filemap_splice_read], [ + #include + + struct file_operations fops __attribute__((unused)) = { + .splice_read = filemap_splice_read, + }; + ],[]) +]) + +AC_DEFUN([ZFS_AC_KERNEL_FILEMAP_SPLICE_READ], [ + AC_MSG_CHECKING([whether filemap_splice_read() exists]) + ZFS_LINUX_TEST_RESULT([has_filemap_splice_read], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_FILEMAP_SPLICE_READ, 1, + [filemap_splice_read exists]) + ],[ + AC_MSG_RESULT(no) + ]) +]) diff --git a/config/kernel-vfs-iov_iter.m4 b/config/kernel-vfs-iov_iter.m4 index cc5a7ab0c23..ff560ff3eef 100644 --- a/config/kernel-vfs-iov_iter.m4 +++ b/config/kernel-vfs-iov_iter.m4 @@ -6,8 +6,7 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_VFS_IOV_ITER], [ #include #include ],[ - int type __attribute__ ((unused)) = - ITER_IOVEC | ITER_KVEC | ITER_BVEC | ITER_PIPE; + int type __attribute__ ((unused)) = ITER_KVEC; ]) ZFS_LINUX_TEST_SRC([iov_iter_advance], [ diff --git a/config/kernel.m4 b/config/kernel.m4 index 28bd361d33f..309f1819be4 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -161,6 +161,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ 
ZFS_AC_KERNEL_SRC_WRITEPAGE_T ZFS_AC_KERNEL_SRC_RECLAIMED ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE + ZFS_AC_KERNEL_SRC_FILEMAP_SPLICE_READ case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE @@ -301,6 +302,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_WRITEPAGE_T ZFS_AC_KERNEL_RECLAIMED ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE + ZFS_AC_KERNEL_FILEMAP_SPLICE_READ case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_CPU_HAS_FEATURE diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index f6af2ebd116..24cc1064a8f 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -1323,7 +1323,11 @@ const struct file_operations zpl_file_operations = { .read_iter = zpl_iter_read, .write_iter = zpl_iter_write, #ifdef HAVE_VFS_IOV_ITER +#ifdef HAVE_FILEMAP_SPLICE_READ + .splice_read = filemap_splice_read, +#else .splice_read = generic_file_splice_read, +#endif .splice_write = iter_file_splice_write, #endif #else From 58a707375fef23697cb029fbad0e7e92bc78025b Mon Sep 17 00:00:00 2001 From: Coleman Kane Date: Tue, 8 Aug 2023 18:42:32 -0400 Subject: [PATCH 09/19] Linux 6.5 compat: Use copy_splice_read instead of filemap_splice_read Using the filemap_splice_read function for the splice_read handler was leading to occasional data corruption under certain circumstances. Favor using copy_splice_read instead, which does not demonstrate the same erroneous behavior under the tested failure cases. 
Reviewed-by: Brian Atkinson Reviewed-by: Brian Behlendorf Signed-off-by: Coleman Kane Closes #15164 --- config/kernel-filemap-splice-read.m4 | 18 +++++++++--------- config/kernel.m4 | 4 ++-- module/os/linux/zfs/zpl_file.c | 4 ++-- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/config/kernel-filemap-splice-read.m4 b/config/kernel-filemap-splice-read.m4 index 5199b7373e4..4c83b31d738 100644 --- a/config/kernel-filemap-splice-read.m4 +++ b/config/kernel-filemap-splice-read.m4 @@ -1,24 +1,24 @@ -AC_DEFUN([ZFS_AC_KERNEL_SRC_FILEMAP_SPLICE_READ], [ +AC_DEFUN([ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ], [ dnl # dnl # Kernel 6.5 - generic_file_splice_read was removed in favor - dnl # of filemap_splice_read for the .splice_read member of the + dnl # of copy_splice_read for the .splice_read member of the dnl # file_operations struct. dnl # - ZFS_LINUX_TEST_SRC([has_filemap_splice_read], [ + ZFS_LINUX_TEST_SRC([has_copy_splice_read], [ #include struct file_operations fops __attribute__((unused)) = { - .splice_read = filemap_splice_read, + .splice_read = copy_splice_read, }; ],[]) ]) -AC_DEFUN([ZFS_AC_KERNEL_FILEMAP_SPLICE_READ], [ - AC_MSG_CHECKING([whether filemap_splice_read() exists]) - ZFS_LINUX_TEST_RESULT([has_filemap_splice_read], [ +AC_DEFUN([ZFS_AC_KERNEL_COPY_SPLICE_READ], [ + AC_MSG_CHECKING([whether copy_splice_read() exists]) + ZFS_LINUX_TEST_RESULT([has_copy_splice_read], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_FILEMAP_SPLICE_READ, 1, - [filemap_splice_read exists]) + AC_DEFINE(HAVE_COPY_SPLICE_READ, 1, + [copy_splice_read exists]) ],[ AC_MSG_RESULT(no) ]) diff --git a/config/kernel.m4 b/config/kernel.m4 index 309f1819be4..df194ec7220 100644 --- a/config/kernel.m4 +++ b/config/kernel.m4 @@ -161,7 +161,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_SRC], [ ZFS_AC_KERNEL_SRC_WRITEPAGE_T ZFS_AC_KERNEL_SRC_RECLAIMED ZFS_AC_KERNEL_SRC_REGISTER_SYSCTL_TABLE - ZFS_AC_KERNEL_SRC_FILEMAP_SPLICE_READ + ZFS_AC_KERNEL_SRC_COPY_SPLICE_READ case "$host_cpu" in powerpc*) 
ZFS_AC_KERNEL_SRC_CPU_HAS_FEATURE @@ -302,7 +302,7 @@ AC_DEFUN([ZFS_AC_KERNEL_TEST_RESULT], [ ZFS_AC_KERNEL_WRITEPAGE_T ZFS_AC_KERNEL_RECLAIMED ZFS_AC_KERNEL_REGISTER_SYSCTL_TABLE - ZFS_AC_KERNEL_FILEMAP_SPLICE_READ + ZFS_AC_KERNEL_COPY_SPLICE_READ case "$host_cpu" in powerpc*) ZFS_AC_KERNEL_CPU_HAS_FEATURE diff --git a/module/os/linux/zfs/zpl_file.c b/module/os/linux/zfs/zpl_file.c index 24cc1064a8f..3caa0fc6c21 100644 --- a/module/os/linux/zfs/zpl_file.c +++ b/module/os/linux/zfs/zpl_file.c @@ -1323,8 +1323,8 @@ const struct file_operations zpl_file_operations = { .read_iter = zpl_iter_read, .write_iter = zpl_iter_write, #ifdef HAVE_VFS_IOV_ITER -#ifdef HAVE_FILEMAP_SPLICE_READ - .splice_read = filemap_splice_read, +#ifdef HAVE_COPY_SPLICE_READ + .splice_read = copy_splice_read, #else .splice_read = generic_file_splice_read, #endif From c7ee59a160f74049de1e459b3c3c63671784703f Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Sat, 2 Sep 2023 02:21:40 +0200 Subject: [PATCH 10/19] Linux 6.5 compat: safe cleanup in spl_proc_fini() If we fail to create a proc entry in spl_proc_init() we may end up calling unregister_sysctl_table() twice: one in the failure path of spl_proc_init() and another time during spl_proc_fini(). Avoid the double call to unregister_sysctl_table() and while at it refactor the code a bit to reduce code duplication. This was accidentally introduced when the spl code was updated for Linux 6.5 compatibility. 
Reviewed-by: Brian Behlendorf Reviewed-by: Ameer Hamza Signed-off-by: Andrea Righi Closes #15234 Closes #15235 --- module/os/linux/spl/spl-proc.c | 36 +++++++++++++++++----------------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/module/os/linux/spl/spl-proc.c b/module/os/linux/spl/spl-proc.c index bcc356ae55b..5cb5a6dadb0 100644 --- a/module/os/linux/spl/spl-proc.c +++ b/module/os/linux/spl/spl-proc.c @@ -659,6 +659,21 @@ static struct ctl_table spl_root[] = { }; #endif +static void spl_proc_cleanup(void) +{ + remove_proc_entry("kstat", proc_spl); + remove_proc_entry("slab", proc_spl_kmem); + remove_proc_entry("kmem", proc_spl); + remove_proc_entry("taskq-all", proc_spl); + remove_proc_entry("taskq", proc_spl); + remove_proc_entry("spl", NULL); + + if (spl_header) { + unregister_sysctl_table(spl_header); + spl_header = NULL; + } +} + int spl_proc_init(void) { @@ -723,15 +738,8 @@ spl_proc_init(void) goto out; } out: - if (rc) { - remove_proc_entry("kstat", proc_spl); - remove_proc_entry("slab", proc_spl_kmem); - remove_proc_entry("kmem", proc_spl); - remove_proc_entry("taskq-all", proc_spl); - remove_proc_entry("taskq", proc_spl); - remove_proc_entry("spl", NULL); - unregister_sysctl_table(spl_header); - } + if (rc) + spl_proc_cleanup(); return (rc); } @@ -739,13 +747,5 @@ out: void spl_proc_fini(void) { - remove_proc_entry("kstat", proc_spl); - remove_proc_entry("slab", proc_spl_kmem); - remove_proc_entry("kmem", proc_spl); - remove_proc_entry("taskq-all", proc_spl); - remove_proc_entry("taskq", proc_spl); - remove_proc_entry("spl", NULL); - - ASSERT(spl_header != NULL); - unregister_sysctl_table(spl_header); + spl_proc_cleanup(); } From cacc599aa20b6aba0ac8173386cea2f8b435068e Mon Sep 17 00:00:00 2001 From: Andrea Righi Date: Thu, 7 Sep 2023 23:36:32 +0200 Subject: [PATCH 11/19] Linux 6.5 compat: spl: properly unregister sysctl entries When register_sysctl_table() is unavailable we fail to properly unregister sysctl entries under "kernel/spl". 
This leads to errors like the following when spl is unloaded/reloaded, making it impossible to properly reload the spl module: [ 746.995704] sysctl duplicate entry: /kernel/spl/kmem/slab_kvmem_total Fix by cleaning up all the sub-entries inside "kernel/spl" when the spl module is unloaded. Reviewed-by: Alexander Motin Reviewed-by: Brian Atkinson Signed-off-by: Andrea Righi Closes #15239 --- module/os/linux/spl/spl-proc.c | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/module/os/linux/spl/spl-proc.c b/module/os/linux/spl/spl-proc.c index 5cb5a6dadb0..f0f929d3ce9 100644 --- a/module/os/linux/spl/spl-proc.c +++ b/module/os/linux/spl/spl-proc.c @@ -47,6 +47,10 @@ static unsigned long table_min = 0; static unsigned long table_max = ~0; static struct ctl_table_header *spl_header = NULL; +#ifndef HAVE_REGISTER_SYSCTL_TABLE +static struct ctl_table_header *spl_kmem = NULL; +static struct ctl_table_header *spl_kstat = NULL; +#endif static struct proc_dir_entry *proc_spl = NULL; static struct proc_dir_entry *proc_spl_kmem = NULL; static struct proc_dir_entry *proc_spl_kmem_slab = NULL; @@ -668,6 +672,16 @@ static void spl_proc_cleanup(void) remove_proc_entry("taskq", proc_spl); remove_proc_entry("spl", NULL); +#ifndef HAVE_REGISTER_SYSCTL_TABLE + if (spl_kstat) { + unregister_sysctl_table(spl_kstat); + spl_kstat = NULL; + } + if (spl_kmem) { + unregister_sysctl_table(spl_kmem); + spl_kmem = NULL; + } +#endif if (spl_header) { unregister_sysctl_table(spl_header); spl_header = NULL; @@ -688,12 +702,13 @@ spl_proc_init(void) if (spl_header == NULL) return (-EUNATCH); - if (register_sysctl("kernel/spl/kmem", spl_kmem_table) == NULL) { + spl_kmem = register_sysctl("kernel/spl/kmem", spl_kmem_table); + if (spl_kmem == NULL) { rc = -EUNATCH; goto out; } - - if (register_sysctl("kernel/spl/kstat", spl_kstat_table) == NULL) { + spl_kstat = register_sysctl("kernel/spl/kstat", spl_kstat_table); + if (spl_kstat == NULL) { rc = -EUNATCH; goto out; } 
From c011ef8c917473405a769d2b0bab1ead2e49dcc1 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Tue, 12 Sep 2023 12:51:11 -0700 Subject: [PATCH 12/19] Linux 6.5 compat: META (#15265) Update the META file to reflect compatibility with the 6.5 kernel. Signed-off-by: Tony Hutter Reviewed-by: Brian Behlendorf --- META | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/META b/META index 0953cc51922..9ffe90458db 100644 --- a/META +++ b/META @@ -6,5 +6,5 @@ Release: rc4 Release-Tags: relext License: CDDL Author: OpenZFS -Linux-Maximum: 6.4 +Linux-Maximum: 6.5 Linux-Minimum: 3.10 From 11943656f9233086260236e9eef5d752d9abe84c Mon Sep 17 00:00:00 2001 From: George Amanakis Date: Tue, 19 Sep 2023 02:06:35 +0200 Subject: [PATCH 13/19] Update the MOS directory on spa_upgrade_errlog() spa_upgrade_errlog() does not update the MOS directory when the head_errlog feature is enabled. In this case if spa_errlog_sync() is not called, the MOS dir references the old errlog_last and errlog_sync objects. Thus when doing a scrub a panic will occur: Call Trace: dump_stack+0x6d/0x8b panic+0x101/0x2e3 spl_panic+0xcf/0x102 [spl] delete_errlog+0x124/0x130 [zfs] spa_errlog_sync+0x256/0x260 [zfs] spa_sync_iterate_to_convergence+0xe5/0x250 [zfs] spa_sync+0x2f7/0x670 [zfs] txg_sync_thread+0x22d/0x2d0 [zfs] thread_generic_wrapper+0x83/0xa0 [spl] kthread+0x104/0x140 ret_from_fork+0x1f/0x40 Fix this by updating the related MOS directory objects in spa_upgrade_errlog(). 
Reviewed-by: Mark Maybee Reviewed-by: Brian Behlendorf Signed-off-by: George Amanakis Closes #15279 Closes #15277 --- module/zfs/spa_errlog.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/module/zfs/spa_errlog.c b/module/zfs/spa_errlog.c index 2e5c22c1149..5dd08f597f3 100644 --- a/module/zfs/spa_errlog.c +++ b/module/zfs/spa_errlog.c @@ -930,12 +930,21 @@ spa_upgrade_errlog(spa_t *spa, dmu_tx_t *tx) if (spa->spa_errlog_last != 0) { sync_upgrade_errlog(spa, spa->spa_errlog_last, &newobj, tx); spa->spa_errlog_last = newobj; + + (void) zap_update(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_LAST, + sizeof (uint64_t), 1, &spa->spa_errlog_last, tx); } if (spa->spa_errlog_scrub != 0) { sync_upgrade_errlog(spa, spa->spa_errlog_scrub, &newobj, tx); spa->spa_errlog_scrub = newobj; + + (void) zap_update(spa->spa_meta_objset, + DMU_POOL_DIRECTORY_OBJECT, DMU_POOL_ERRLOG_SCRUB, + sizeof (uint64_t), 1, &spa->spa_errlog_scrub, tx); } + mutex_exit(&spa->spa_errlog_lock); } From b9b9cdcdb148b50b8b647eb4bc300ec6c4ffd916 Mon Sep 17 00:00:00 2001 From: ednadolski-ix <137826107+ednadolski-ix@users.noreply.github.com> Date: Sat, 9 Sep 2023 11:23:29 -0600 Subject: [PATCH 14/19] update max_variance limit in zdb_block_size_histogram test for CI Commit 2d7843401a628ef8c483229742dd58bca70bc27e had previously updated this hardcoded limit to allow for CI testing. As there is no deterministic pass/fail value, the need has arisen for one more small increase. 
Reviewed-by: Alexander Motin Reviewed-by: Brian Behlendorf Signed-off-by: Edmund Nadolski Closes #15252 --- .../functional/cli_root/zdb/zdb_block_size_histogram.ksh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_block_size_histogram.ksh b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_block_size_histogram.ksh index 0a4d24fa695..cfa26f54b11 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_block_size_histogram.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zdb/zdb_block_size_histogram.ksh @@ -204,11 +204,11 @@ function histo_check_test_pool # 4096 blocksize count for asize. For verification we stick # to just lsize counts. # - # The max_variance is hard-coded here at 12% to leave us some - # margin. Testing has shown this normally to be in the range - # of 2%-8%, but it may be as large as 11%. + # Variances are expected since this test does not account for + # metadata. The hardcoded limit here is empirical and should + # not be construed as deterministic. ################### - let max_variance=12 + let max_variance=15 let fail_value=0 let error_count=0 log_note "Comparisons for ${pool}" From 228b064d1b0acb991b7e27502f59f2ce22de8c4b Mon Sep 17 00:00:00 2001 From: Laura Hild Date: Mon, 11 Sep 2023 17:58:19 -0400 Subject: [PATCH 15/19] Remove implication that child `disk`s aren't vdevs in zpoolconcepts(7) Reviewed-by: Brian Behlendorf Signed-off-by: Laura Hild Closes #15247 --- man/man7/zpoolconcepts.7 | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/man/man7/zpoolconcepts.7 b/man/man7/zpoolconcepts.7 index db3fd492623..98f3ee7cd66 100644 --- a/man/man7/zpoolconcepts.7 +++ b/man/man7/zpoolconcepts.7 @@ -203,11 +203,9 @@ For more information, see the section. .El .Pp -Virtual devices cannot be nested, so a mirror or raidz virtual device can only -contain files or disks. -Mirrors of mirrors -.Pq or other combinations -are not allowed. 
+Virtual devices cannot be nested arbitrarily. +A mirror, raidz or draid virtual device can only be created with files or disks. +Mirrors of mirrors or other such combinations are not allowed. .Pp A pool can have any number of virtual devices at the top of the configuration .Po known as From 0ce7a068e9ec894cabef8f87c8603a06df055dd5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=D0=BD=D0=B0=D0=B1?= Date: Mon, 18 Sep 2023 18:08:41 +0200 Subject: [PATCH 16/19] check-zstd-symbols: also ignore __pfx_ symbols MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Link: https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=b341b20d648bb7e9a3307c33163e7399f0913e66 Reviewed-by: Matthew Ahrens Reviewed-by: Brian Behlendorf Signed-off-by: Ahelenia Ziemiańska Closes #15282 Closes #15284 --- module/Makefile.in | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/module/Makefile.in b/module/Makefile.in index 5b71e1abf79..9b34b3dfaec 100644 --- a/module/Makefile.in +++ b/module/Makefile.in @@ -168,4 +168,4 @@ gen-zstd-symbols: for obj in $(addprefix zstd/,$(ZSTD_UPSTREAM_OBJS)); do echo; echo "/* $${obj#zstd/}: */"; @OBJDUMP@ -t $$obj | awk '$$2 == "g" && !/ zfs_/ {print "#define\t" $$6 " zfs_" $$6}' | sort; done >> zstd/include/zstd_compat_wrapper.h check-zstd-symbols: - @OBJDUMP@ -t $(addprefix zstd/,$(ZSTD_UPSTREAM_OBJS)) | awk '/file format/ {print} $$2 == "g" && !/ zfs_/ {++ret; print} END {exit ret}' + @OBJDUMP@ -t $(addprefix zstd/,$(ZSTD_UPSTREAM_OBJS)) | awk '/file format/ {print} $$2 == "g" && (!/ zfs_/ && !/ __pfx_zfs_/) {++ret; print} END {exit ret}' From 54c6fbd378eaa402eff34acf6a91c02d6cf9da11 Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Mon, 18 Sep 2023 16:25:58 -0700 Subject: [PATCH 17/19] zed: Allow autoreplace and fault LEDs for removed vdevs Allow zed to autoreplace vdevs marked as REMOVED. Also update statechange-led zedlet to toggle fault LEDs for REMOVED vdevs. 
Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #15281 --- cmd/zed/agents/zfs_mod.c | 1 + cmd/zed/zed.d/statechange-led.sh | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/cmd/zed/agents/zfs_mod.c b/cmd/zed/agents/zfs_mod.c index a8d084bb4bd..2f040ff7582 100644 --- a/cmd/zed/agents/zfs_mod.c +++ b/cmd/zed/agents/zfs_mod.c @@ -372,6 +372,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled) /* Only autoreplace bad disks */ if ((vs->vs_state != VDEV_STATE_DEGRADED) && (vs->vs_state != VDEV_STATE_FAULTED) && + (vs->vs_state != VDEV_STATE_REMOVED) && (vs->vs_state != VDEV_STATE_CANT_OPEN)) { zed_log_msg(LOG_INFO, " not autoreplacing since disk isn't in " "a bad state (currently %llu)", vs->vs_state); diff --git a/cmd/zed/zed.d/statechange-led.sh b/cmd/zed/zed.d/statechange-led.sh index 46bfc1b866f..40cb61f1730 100755 --- a/cmd/zed/zed.d/statechange-led.sh +++ b/cmd/zed/zed.d/statechange-led.sh @@ -121,7 +121,7 @@ state_to_val() { state="$1" case "$state" in - FAULTED|DEGRADED|UNAVAIL) + FAULTED|DEGRADED|UNAVAIL|REMOVED) echo 1 ;; ONLINE) From f7a07d76ee5a6b698540c6873f194350539a7065 Mon Sep 17 00:00:00 2001 From: Mateusz Guzik Date: Tue, 19 Sep 2023 01:53:33 +0200 Subject: [PATCH 18/19] Retire z_nr_znodes Added in ab26409db753 ("Linux 3.1 compat, super_block->s_shrink"), with the only consumer which needed the count getting retired in 066e82522101 ("Linux compat: Minimum kernel version 3.10"). The counter gets in the way of not maintaining the list to begin with. 
Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Mateusz Guzik Closes #15274 --- include/os/freebsd/zfs/sys/zfs_vfsops_os.h | 1 - include/os/linux/zfs/sys/zfs_vfsops_os.h | 1 - module/os/freebsd/zfs/zfs_vfsops.c | 8 +++----- module/os/freebsd/zfs/zfs_znode.c | 2 -- module/os/linux/zfs/zfs_ctldir.c | 1 - module/os/linux/zfs/zfs_vfsops.c | 7 +++---- module/os/linux/zfs/zfs_znode.c | 2 -- 7 files changed, 6 insertions(+), 16 deletions(-) diff --git a/include/os/freebsd/zfs/sys/zfs_vfsops_os.h b/include/os/freebsd/zfs/sys/zfs_vfsops_os.h index f765d38dbac..24bb03575f3 100644 --- a/include/os/freebsd/zfs/sys/zfs_vfsops_os.h +++ b/include/os/freebsd/zfs/sys/zfs_vfsops_os.h @@ -93,7 +93,6 @@ struct zfsvfs { zfs_teardown_lock_t z_teardown_lock; zfs_teardown_inactive_lock_t z_teardown_inactive_lock; list_t z_all_znodes; /* all vnodes in the fs */ - uint64_t z_nr_znodes; /* number of znodes in the fs */ kmutex_t z_znodes_lock; /* lock for z_all_znodes */ struct zfsctl_root *z_ctldir; /* .zfs directory pointer */ boolean_t z_show_ctldir; /* expose .zfs in the root dir */ diff --git a/include/os/linux/zfs/sys/zfs_vfsops_os.h b/include/os/linux/zfs/sys/zfs_vfsops_os.h index e320b8de422..b4d5db21f5e 100644 --- a/include/os/linux/zfs/sys/zfs_vfsops_os.h +++ b/include/os/linux/zfs/sys/zfs_vfsops_os.h @@ -105,7 +105,6 @@ struct zfsvfs { rrmlock_t z_teardown_lock; krwlock_t z_teardown_inactive_lock; list_t z_all_znodes; /* all znodes in the fs */ - uint64_t z_nr_znodes; /* number of znodes in the fs */ unsigned long z_rollback_time; /* last online rollback time */ unsigned long z_snap_defer_time; /* last snapshot unmount deferral */ kmutex_t z_znodes_lock; /* lock for z_all_znodes */ diff --git a/module/os/freebsd/zfs/zfs_vfsops.c b/module/os/freebsd/zfs/zfs_vfsops.c index 33759fa2616..e8b9ada1316 100644 --- a/module/os/freebsd/zfs/zfs_vfsops.c +++ b/module/os/freebsd/zfs/zfs_vfsops.c @@ -1154,7 +1154,6 @@ zfsvfs_free(zfsvfs_t *zfsvfs) 
mutex_destroy(&zfsvfs->z_znodes_lock); mutex_destroy(&zfsvfs->z_lock); - ASSERT3U(zfsvfs->z_nr_znodes, ==, 0); list_destroy(&zfsvfs->z_all_znodes); ZFS_TEARDOWN_DESTROY(zfsvfs); ZFS_TEARDOWN_INACTIVE_DESTROY(zfsvfs); @@ -1558,12 +1557,11 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) * may add the parents of dir-based xattrs to the taskq * so we want to wait for these. * - * We can safely read z_nr_znodes without locking because the - * VFS has already blocked operations which add to the - * z_all_znodes list and thus increment z_nr_znodes. + * We can safely check z_all_znodes for being empty because the + * VFS has already blocked operations which add to it. */ int round = 0; - while (zfsvfs->z_nr_znodes > 0) { + while (!list_is_empty(&zfsvfs->z_all_znodes)) { taskq_wait_outstanding(dsl_pool_zrele_taskq( dmu_objset_pool(zfsvfs->z_os)), 0); if (++round > 1 && !unmounting) diff --git a/module/os/freebsd/zfs/zfs_znode.c b/module/os/freebsd/zfs/zfs_znode.c index c4f2b722ef4..0d4c94555c6 100644 --- a/module/os/freebsd/zfs/zfs_znode.c +++ b/module/os/freebsd/zfs/zfs_znode.c @@ -537,7 +537,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, mutex_enter(&zfsvfs->z_znodes_lock); list_insert_tail(&zfsvfs->z_all_znodes, zp); - zfsvfs->z_nr_znodes++; zp->z_zfsvfs = zfsvfs; mutex_exit(&zfsvfs->z_znodes_lock); @@ -1286,7 +1285,6 @@ zfs_znode_free(znode_t *zp) mutex_enter(&zfsvfs->z_znodes_lock); POINTER_INVALIDATE(&zp->z_zfsvfs); list_remove(&zfsvfs->z_all_znodes, zp); - zfsvfs->z_nr_znodes--; mutex_exit(&zfsvfs->z_znodes_lock); #if __FreeBSD_version >= 1300139 diff --git a/module/os/linux/zfs/zfs_ctldir.c b/module/os/linux/zfs/zfs_ctldir.c index c45a3eb5a4e..02cb379ea84 100644 --- a/module/os/linux/zfs/zfs_ctldir.c +++ b/module/os/linux/zfs/zfs_ctldir.c @@ -537,7 +537,6 @@ zfsctl_inode_alloc(zfsvfs_t *zfsvfs, uint64_t id, mutex_enter(&zfsvfs->z_znodes_lock); list_insert_tail(&zfsvfs->z_all_znodes, zp); - zfsvfs->z_nr_znodes++; membar_producer(); 
mutex_exit(&zfsvfs->z_znodes_lock); diff --git a/module/os/linux/zfs/zfs_vfsops.c b/module/os/linux/zfs/zfs_vfsops.c index 464c12e1108..a1db5c57c18 100644 --- a/module/os/linux/zfs/zfs_vfsops.c +++ b/module/os/linux/zfs/zfs_vfsops.c @@ -1330,12 +1330,11 @@ zfsvfs_teardown(zfsvfs_t *zfsvfs, boolean_t unmounting) * may add the parents of dir-based xattrs to the taskq * so we want to wait for these. * - * We can safely read z_nr_znodes without locking because the - * VFS has already blocked operations which add to the - * z_all_znodes list and thus increment z_nr_znodes. + * We can safely check z_all_znodes for being empty because the + * VFS has already blocked operations which add to it. */ int round = 0; - while (zfsvfs->z_nr_znodes > 0) { + while (!list_is_empty(&zfsvfs->z_all_znodes)) { taskq_wait_outstanding(dsl_pool_zrele_taskq( dmu_objset_pool(zfsvfs->z_os)), 0); if (++round > 1 && !unmounting) diff --git a/module/os/linux/zfs/zfs_znode.c b/module/os/linux/zfs/zfs_znode.c index 335ae3460c5..52c8e51df65 100644 --- a/module/os/linux/zfs/zfs_znode.c +++ b/module/os/linux/zfs/zfs_znode.c @@ -390,7 +390,6 @@ zfs_inode_destroy(struct inode *ip) mutex_enter(&zfsvfs->z_znodes_lock); if (list_link_active(&zp->z_link_node)) { list_remove(&zfsvfs->z_all_znodes, zp); - zfsvfs->z_nr_znodes--; } mutex_exit(&zfsvfs->z_znodes_lock); @@ -641,7 +640,6 @@ zfs_znode_alloc(zfsvfs_t *zfsvfs, dmu_buf_t *db, int blksz, mutex_enter(&zfsvfs->z_znodes_lock); list_insert_tail(&zfsvfs->z_all_znodes, zp); - zfsvfs->z_nr_znodes++; mutex_exit(&zfsvfs->z_znodes_lock); if (links > 0) From 62677576a75e94396e945c4ecd9372f5d34e50cb Mon Sep 17 00:00:00 2001 From: Alexander Motin Date: Wed, 20 Sep 2023 14:17:11 -0400 Subject: [PATCH 19/19] ZIL: Fix potential race on flush deferring. zil_lwb_set_zio_dependency() can not set write ZIO dependency on previous LWB's write ZIO if one is already in done handler and set state to LWB_STATE_WRITE_DONE. 
So theoretically the done handler of the next LWB's write ZIO may run before the done handler of the previous LWB's write ZIO completes. In such a case we can not defer flushes, since the flush issue process is not locked. This may fix some reported assertions of lwb_vdev_tree not being empty inside zil_free_lwb(). Reviewed-by: Brian Behlendorf Signed-off-by: Alexander Motin Sponsored by: iXsystems, Inc. Closes #15278 --- module/zfs/zil.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/module/zfs/zil.c b/module/zfs/zil.c index b30676b42d8..9e9c9c22549 100644 --- a/module/zfs/zil.c +++ b/module/zfs/zil.c @@ -1550,7 +1550,16 @@ zil_lwb_write_done(zio_t *zio) lwb->lwb_state = LWB_STATE_WRITE_DONE; lwb->lwb_child_zio = NULL; lwb->lwb_write_zio = NULL; + + /* + * If nlwb is not yet issued, zil_lwb_set_zio_dependency() is not + * called for it yet, and when it will be, it won't be able to make + * its write ZIO a parent of this ZIO. In such case we can not defer + * our flushes or there may be a race between the done callbacks. + */ nlwb = list_next(&zilog->zl_lwb_list, lwb); + if (nlwb && nlwb->lwb_state != LWB_STATE_ISSUED) + nlwb = NULL; mutex_exit(&zilog->zl_lock); if (avl_numnodes(t) == 0)