From 06e25f9c4b0841e450e411bf270c7aa92c04c573 Mon Sep 17 00:00:00 2001 From: Umer Saleem Date: Sat, 3 Feb 2024 00:51:51 +0500 Subject: [PATCH 1/5] Improve performance for zpool trim on linux On Linux, ZFS uses blkdev_issue_discard in vdev_disk_io_trim to issue trim command which is synchronous. This commit updates vdev_disk_io_trim to use __blkdev_issue_discard, which is asynchronous. Unfortunately there isn't any asynchronous version for blkdev_issue_secure_erase, so performance of secure trim will still suffer. Reviewed-by: Brian Behlendorf Reviewed-by: Alexander Motin Signed-off-by: Umer Saleem Closes #15843 --- config/kernel-blkdev.m4 | 34 ++++++++++++--- module/os/linux/zfs/vdev_disk.c | 74 ++++++++++++++++++++++++++------- 2 files changed, 88 insertions(+), 20 deletions(-) diff --git a/config/kernel-blkdev.m4 b/config/kernel-blkdev.m4 index 8e9e638b125..c5a353ca920 100644 --- a/config/kernel-blkdev.m4 +++ b/config/kernel-blkdev.m4 @@ -524,6 +524,7 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_BDEVNAME], [ dnl # dnl # 5.19 API: blkdev_issue_secure_erase() +dnl # 4.7 API: __blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE) dnl # 3.10 API: blkdev_issue_discard(..., BLKDEV_DISCARD_SECURE) dnl # AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE], [ @@ -539,6 +540,20 @@ AC_DEFUN([ZFS_AC_KERNEL_SRC_BLKDEV_ISSUE_SECURE_ERASE], [ sector, nr_sects, GFP_KERNEL); ]) + ZFS_LINUX_TEST_SRC([blkdev_issue_discard_async_flags], [ + #include + ],[ + struct block_device *bdev = NULL; + sector_t sector = 0; + sector_t nr_sects = 0; + unsigned long flags = 0; + struct bio *biop = NULL; + int error __attribute__ ((unused)); + + error = __blkdev_issue_discard(bdev, + sector, nr_sects, GFP_KERNEL, flags, &biop); + ]) + ZFS_LINUX_TEST_SRC([blkdev_issue_discard_flags], [ #include ],[ @@ -562,13 +577,22 @@ AC_DEFUN([ZFS_AC_KERNEL_BLKDEV_ISSUE_SECURE_ERASE], [ ],[ AC_MSG_RESULT(no) - AC_MSG_CHECKING([whether blkdev_issue_discard() is available]) - ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_flags], [ + AC_MSG_CHECKING([whether __blkdev_issue_discard() is available]) + ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_async_flags], [ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD, 1, - [blkdev_issue_discard() is available]) + AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC, 1, + [__blkdev_issue_discard() is available]) ],[ - ZFS_LINUX_TEST_ERROR([blkdev_issue_discard()]) + AC_MSG_RESULT(no) + + AC_MSG_CHECKING([whether blkdev_issue_discard() is available]) + ZFS_LINUX_TEST_RESULT([blkdev_issue_discard_flags], [ + AC_MSG_RESULT(yes) + AC_DEFINE(HAVE_BLKDEV_ISSUE_DISCARD, 1, + [blkdev_issue_discard() is available]) + ],[ + ZFS_LINUX_TEST_ERROR([blkdev_issue_discard()]) + ]) ]) ]) ]) diff --git a/module/os/linux/zfs/vdev_disk.c b/module/os/linux/zfs/vdev_disk.c index e7f0aa57384..b0bda5fa201 100644 --- a/module/os/linux/zfs/vdev_disk.c +++ b/module/os/linux/zfs/vdev_disk.c @@ -862,27 +862,66 @@ vdev_disk_io_flush(struct block_device *bdev, zio_t *zio) return (0); } +#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) || \ + defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC) +BIO_END_IO_PROTO(vdev_disk_discard_end_io, bio, error) +{ + zio_t *zio = bio->bi_private; +#ifdef HAVE_1ARG_BIO_END_IO_T + zio->io_error = BIO_END_IO_ERROR(bio); +#else + zio->io_error = -error; +#endif + bio_put(bio); + if (zio->io_error) + vdev_disk_error(zio); + zio_interrupt(zio); +} + +static int +vdev_issue_discard_trim(zio_t *zio, unsigned long flags) +{ + int ret; + struct bio *bio = NULL; + +#if defined(BLKDEV_DISCARD_SECURE) + ret = - __blkdev_issue_discard( + BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh), + zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, flags, &bio); +#else + (void) flags; + ret = - __blkdev_issue_discard( + BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh), + zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, &bio); +#endif + if (!ret && bio) { + bio->bi_private = zio; + bio->bi_end_io = vdev_disk_discard_end_io; + vdev_submit_bio(bio); + } + return (ret); +} +#endif + static int vdev_disk_io_trim(zio_t *zio) { - vdev_t *v = zio->io_vd; - vdev_disk_t *vd = v->vdev_tsd; - -#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) - if (zio->io_trim_flags & ZIO_TRIM_SECURE) { - return (-blkdev_issue_secure_erase(BDH_BDEV(vd->vd_bdh), - zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS)); - } else { - return (-blkdev_issue_discard(BDH_BDEV(vd->vd_bdh), - zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS)); - } -#elif defined(HAVE_BLKDEV_ISSUE_DISCARD) unsigned long trim_flags = 0; -#if defined(BLKDEV_DISCARD_SECURE) - if (zio->io_trim_flags & ZIO_TRIM_SECURE) + if (zio->io_trim_flags & ZIO_TRIM_SECURE) { +#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) + return (-blkdev_issue_secure_erase( + BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh), + zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS)); +#elif defined(BLKDEV_DISCARD_SECURE) trim_flags |= BLKDEV_DISCARD_SECURE; #endif - return (-blkdev_issue_discard(BDH_BDEV(vd->vd_bdh), + } +#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) || \ + defined(HAVE_BLKDEV_ISSUE_DISCARD_ASYNC) + return (vdev_issue_discard_trim(zio, trim_flags)); +#elif defined(HAVE_BLKDEV_ISSUE_DISCARD) + return (-blkdev_issue_discard( + BDH_BDEV(((vdev_disk_t *)zio->io_vd->vdev_tsd)->vd_bdh), zio->io_offset >> 9, zio->io_size >> 9, GFP_NOFS, trim_flags)); #else #error "Unsupported kernel" @@ -968,7 +1007,12 @@ vdev_disk_io_start(zio_t *zio) case ZIO_TYPE_TRIM: zio->io_error = vdev_disk_io_trim(zio); rw_exit(&vd->vd_lock); +#if defined(HAVE_BLKDEV_ISSUE_SECURE_ERASE) + if (zio->io_trim_flags & ZIO_TRIM_SECURE) + zio_interrupt(zio); +#elif defined(HAVE_BLKDEV_ISSUE_DISCARD) zio_interrupt(zio); +#endif return; default: From a0d3fe72bf78e9035d231b00583cc5408c5cc63d Mon Sep 17 00:00:00 2001 From: Rich Ercolani <214141+rincebrain@users.noreply.github.com> Date: Mon, 5 Feb 2024 13:00:41 -0500 Subject: [PATCH 2/5] libzdb: Initial breakout of libzdb Step 1 in trying to slowly rip the zdb functions out of zdb.c to allow people to play with more flexible things to leverage zdb's functionality. No promises on any functions or structs being stable, now or probably in general unless someone builds a more polished abstraction, the goal at the moment is to slowly untangle the global state usage in zdb... Reviewed-by: Brian Behlendorf Signed-off-by: Rich Ercolani Closes #15804 --- cmd/zdb/Makefile.am | 1 + cmd/zdb/zdb.c | 108 +---------------------------------------- include/Makefile.am | 1 + include/libzdb.h | 68 ++++++++++++++++++++++++++ lib/Makefile.am | 5 +- lib/libzdb/Makefile.am | 7 +++ lib/libzdb/libzdb.c | 102 ++++++++++++++++++++++++++++++++++++++ 7 files changed, 184 insertions(+), 108 deletions(-) create mode 100644 include/libzdb.h create mode 100644 lib/libzdb/Makefile.am create mode 100644 lib/libzdb/libzdb.c diff --git a/cmd/zdb/Makefile.am b/cmd/zdb/Makefile.am index c93c9c37cd8..ebdc19128e1 100644 --- a/cmd/zdb/Makefile.am +++ b/cmd/zdb/Makefile.am @@ -10,6 +10,7 @@ zdb_SOURCES = \ %D%/zdb_il.c zdb_LDADD = \ + libzdb.la \ libzpool.la \ libzfs_core.la \ libnvpair.la diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 2062f4fa102..afdc5a2c8b5 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -88,36 +88,10 @@ #include #include +#include + #include "zdb.h" -#define ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ? \ - zio_compress_table[(idx)].ci_name : "UNKNOWN") -#define ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ? \ - zio_checksum_table[(idx)].ci_name : "UNKNOWN") -#define ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) : \ - (idx) == DMU_OTN_ZAP_DATA || (idx) == DMU_OTN_ZAP_METADATA ? \ - DMU_OT_ZAP_OTHER : \ - (idx) == DMU_OTN_UINT64_DATA || (idx) == DMU_OTN_UINT64_METADATA ? \ - DMU_OT_UINT64_OTHER : DMU_OT_NUMTYPES) - -/* Some platforms require part of inode IDs to be remapped */ -#ifdef __APPLE__ -#define ZDB_MAP_OBJECT_ID(obj) INO_XNUTOZFS(obj, 2) -#else -#define ZDB_MAP_OBJECT_ID(obj) (obj) -#endif - -static const char * -zdb_ot_name(dmu_object_type_t type) -{ - if (type < DMU_OT_NUMTYPES) - return (dmu_ot[type].ot_name); - else if ((type & DMU_OT_NEWTYPE) && - ((type & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS)) - return (dmu_ot_byteswap[type & DMU_OT_BYTESWAP_MASK].ob_name); - else - return ("UNKNOWN"); -} extern int reference_tracking_enable; extern int zfs_recover; @@ -135,35 +109,12 @@ typedef void object_viewer_t(objset_t *, uint64_t, void *data, size_t size); static uint64_t *zopt_metaslab = NULL; static unsigned zopt_metaslab_args = 0; -typedef struct zopt_object_range { - uint64_t zor_obj_start; - uint64_t zor_obj_end; - uint64_t zor_flags; -} zopt_object_range_t; static zopt_object_range_t *zopt_object_ranges = NULL; static unsigned zopt_object_args = 0; static int flagbits[256]; -#define ZOR_FLAG_PLAIN_FILE 0x0001 -#define ZOR_FLAG_DIRECTORY 0x0002 -#define ZOR_FLAG_SPACE_MAP 0x0004 -#define ZOR_FLAG_ZAP 0x0008 -#define ZOR_FLAG_ALL_TYPES -1 -#define ZOR_SUPPORTED_FLAGS (ZOR_FLAG_PLAIN_FILE | \ - ZOR_FLAG_DIRECTORY | \ - ZOR_FLAG_SPACE_MAP | \ - ZOR_FLAG_ZAP) - -#define ZDB_FLAG_CHECKSUM 0x0001 -#define ZDB_FLAG_DECOMPRESS 0x0002 -#define ZDB_FLAG_BSWAP 0x0004 -#define ZDB_FLAG_GBH 0x0008 -#define ZDB_FLAG_INDIRECT 0x0010 -#define ZDB_FLAG_RAW 0x0020 -#define ZDB_FLAG_PRINT_BLKPTR 0x0040 -#define ZDB_FLAG_VERBOSE 0x0080 static uint64_t max_inflight_bytes = 256 * 1024 * 1024; /* 256MB */ static int leaked_objects = 0; @@ -176,62 +127,7 @@ static void mos_obj_refd_multiple(uint64_t); static int dump_bpobj_cb(void *arg, const blkptr_t *bp, boolean_t free, dmu_tx_t *tx); -typedef struct sublivelist_verify { - /* FREE's that haven't yet matched to an ALLOC, in one sub-livelist */ - zfs_btree_t sv_pair; - /* ALLOC's without a matching FREE, accumulates across sub-livelists */ - zfs_btree_t sv_leftover; -} sublivelist_verify_t; - -static int -livelist_compare(const void *larg, const void *rarg) -{ - const blkptr_t *l = larg; - const blkptr_t *r = rarg; - - /* Sort them according to dva[0] */ - uint64_t l_dva0_vdev, r_dva0_vdev; - l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]); - r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]); - if (l_dva0_vdev < r_dva0_vdev) - return (-1); - else if (l_dva0_vdev > r_dva0_vdev) - return (+1); - - /* if vdevs are equal, sort by offsets. */ - uint64_t l_dva0_offset; - uint64_t r_dva0_offset; - l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]); - r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]); - if (l_dva0_offset < r_dva0_offset) { - return (-1); - } else if (l_dva0_offset > r_dva0_offset) { - return (+1); - } - - /* - * Since we're storing blkptrs without cancelling FREE/ALLOC pairs, - * it's possible the offsets are equal. In that case, sort by txg - */ - if (l->blk_birth < r->blk_birth) { - return (-1); - } else if (l->blk_birth > r->blk_birth) { - return (+1); - } - return (0); -} - -typedef struct sublivelist_verify_block { - dva_t svb_dva; - - /* - * We need this to check if the block marked as allocated - * in the livelist was freed (and potentially reallocated) - * in the metaslab spacemaps at a later TXG. - */ - uint64_t svb_allocated_txg; -} sublivelist_verify_block_t; static void zdb_print_blkptr(const blkptr_t *bp, int flags); diff --git a/include/Makefile.am b/include/Makefile.am index 5f38f6ac6dd..cb28a2d6c96 100644 --- a/include/Makefile.am +++ b/include/Makefile.am @@ -186,6 +186,7 @@ USER_H = \ libuutil.h \ libuutil_common.h \ libuutil_impl.h \ + libzdb.h \ libzfs.h \ libzfs_core.h \ libzfsbootenv.h \ diff --git a/include/libzdb.h b/include/libzdb.h new file mode 100644 index 00000000000..ef910d0a2c5 --- /dev/null +++ b/include/libzdb.h @@ -0,0 +1,68 @@ +#define ZDB_COMPRESS_NAME(idx) ((idx) < ZIO_COMPRESS_FUNCTIONS ? \ + zio_compress_table[(idx)].ci_name : "UNKNOWN") +#define ZDB_CHECKSUM_NAME(idx) ((idx) < ZIO_CHECKSUM_FUNCTIONS ? \ + zio_checksum_table[(idx)].ci_name : "UNKNOWN") +#define ZDB_OT_TYPE(idx) ((idx) < DMU_OT_NUMTYPES ? (idx) : \ + (idx) == DMU_OTN_ZAP_DATA || (idx) == DMU_OTN_ZAP_METADATA ? \ + DMU_OT_ZAP_OTHER : \ + (idx) == DMU_OTN_UINT64_DATA || (idx) == DMU_OTN_UINT64_METADATA ? \ + DMU_OT_UINT64_OTHER : DMU_OT_NUMTYPES) + +/* Some platforms require part of inode IDs to be remapped */ +#ifdef __APPLE__ +#define ZDB_MAP_OBJECT_ID(obj) INO_XNUTOZFS(obj, 2) +#else +#define ZDB_MAP_OBJECT_ID(obj) (obj) +#endif + +#define ZOR_FLAG_PLAIN_FILE 0x0001 +#define ZOR_FLAG_DIRECTORY 0x0002 +#define ZOR_FLAG_SPACE_MAP 0x0004 +#define ZOR_FLAG_ZAP 0x0008 +#define ZOR_FLAG_ALL_TYPES -1 +#define ZOR_SUPPORTED_FLAGS (ZOR_FLAG_PLAIN_FILE | \ + ZOR_FLAG_DIRECTORY | \ + ZOR_FLAG_SPACE_MAP | \ + ZOR_FLAG_ZAP) + +#define ZDB_FLAG_CHECKSUM 0x0001 +#define ZDB_FLAG_DECOMPRESS 0x0002 +#define ZDB_FLAG_BSWAP 0x0004 +#define ZDB_FLAG_GBH 0x0008 +#define ZDB_FLAG_INDIRECT 0x0010 +#define ZDB_FLAG_RAW 0x0020 +#define ZDB_FLAG_PRINT_BLKPTR 0x0040 +#define ZDB_FLAG_VERBOSE 0x0080 + + +typedef struct zdb_ctx { +} zdb_ctx_t; + +typedef struct zopt_object_range { + uint64_t zor_obj_start; + uint64_t zor_obj_end; + uint64_t zor_flags; +} zopt_object_range_t; + + +typedef struct sublivelist_verify { + /* FREE's that haven't yet matched to an ALLOC, in one sub-livelist */ + zfs_btree_t sv_pair; + + /* ALLOC's without a matching FREE, accumulates across sub-livelists */ + zfs_btree_t sv_leftover; +} sublivelist_verify_t; + +typedef struct sublivelist_verify_block { + dva_t svb_dva; + + /* + * We need this to check if the block marked as allocated + * in the livelist was freed (and potentially reallocated) + * in the metaslab spacemaps at a later TXG. + */ + uint64_t svb_allocated_txg; +} sublivelist_verify_block_t; + +const char *zdb_ot_name(dmu_object_type_t type); +int livelist_compare(const void *larg, const void *rarg); diff --git a/lib/Makefile.am b/lib/Makefile.am index 499ebdaeba9..050a6cac0a3 100644 --- a/lib/Makefile.am +++ b/lib/Makefile.am @@ -9,11 +9,11 @@ # These library interfaces are subject to change at any time. # # -# CMDS: zhack/ztest/zdb/ zfs/zpool/zed/ +# CMDS: zhack/ztest/ zfs/zpool/zed/ # raidz_{test,bench} zinject/zstream # | | # LIBS: | | libzfsbootenv* -# | | | +# |--libzdb--zdb | | # | | | # libzpool libzfs* ----------------+ # | | | \ / | | | @@ -62,6 +62,7 @@ include $(srcdir)/%D%/libspl/Makefile.am include $(srcdir)/%D%/libtpool/Makefile.am include $(srcdir)/%D%/libunicode/Makefile.am include $(srcdir)/%D%/libuutil/Makefile.am +include $(srcdir)/%D%/libzdb/Makefile.am include $(srcdir)/%D%/libzfs_core/Makefile.am include $(srcdir)/%D%/libzfs/Makefile.am include $(srcdir)/%D%/libzfsbootenv/Makefile.am diff --git a/lib/libzdb/Makefile.am b/lib/libzdb/Makefile.am new file mode 100644 index 00000000000..ec4fd92b984 --- /dev/null +++ b/lib/libzdb/Makefile.am @@ -0,0 +1,7 @@ +libzdb_la_CFLAGS = $(AM_CFLAGS) $(LIBRARY_CFLAGS) +libzdb_la_CFLAGS += -fvisibility=hidden + +noinst_LTLIBRARIES += libzdb.la + +libzdb_la_SOURCES = \ + %D%/libzdb.c diff --git a/lib/libzdb/libzdb.c b/lib/libzdb/libzdb.c new file mode 100644 index 00000000000..9989fa1eb80 --- /dev/null +++ b/lib/libzdb/libzdb.c @@ -0,0 +1,102 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include + +const char * +zdb_ot_name(dmu_object_type_t type) +{ + if (type < DMU_OT_NUMTYPES) + return (dmu_ot[type].ot_name); + else if ((type & DMU_OT_NEWTYPE) && + ((type & DMU_OT_BYTESWAP_MASK) < DMU_BSWAP_NUMFUNCS)) + return (dmu_ot_byteswap[type & DMU_OT_BYTESWAP_MASK].ob_name); + else + return ("UNKNOWN"); +} + +int +livelist_compare(const void *larg, const void *rarg) +{ + const blkptr_t *l = larg; + const blkptr_t *r = rarg; + + /* Sort them according to dva[0] */ + uint64_t l_dva0_vdev, r_dva0_vdev; + l_dva0_vdev = DVA_GET_VDEV(&l->blk_dva[0]); + r_dva0_vdev = DVA_GET_VDEV(&r->blk_dva[0]); + if (l_dva0_vdev < r_dva0_vdev) + return (-1); + else if (l_dva0_vdev > r_dva0_vdev) + return (+1); + + /* if vdevs are equal, sort by offsets. */ + uint64_t l_dva0_offset; + uint64_t r_dva0_offset; + l_dva0_offset = DVA_GET_OFFSET(&l->blk_dva[0]); + r_dva0_offset = DVA_GET_OFFSET(&r->blk_dva[0]); + if (l_dva0_offset < r_dva0_offset) { + return (-1); + } else if (l_dva0_offset > r_dva0_offset) { + return (+1); + } + + /* + * Since we're storing blkptrs without cancelling FREE/ALLOC pairs, + * it's possible the offsets are equal. In that case, sort by txg + */ + if (l->blk_birth < r->blk_birth) { + return (-1); + } else if (l->blk_birth > r->blk_birth) { + return (+1); + } + return (0); +} From 6dccdf501ea47bb8a45f00e4904d26efcb917ad4 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Mon, 5 Feb 2024 16:44:45 -0800 Subject: [PATCH 3/5] BRT: Fix FICLONE/FICLONERANGE shortened copy On Linux the ioctl_ficlonerange() and ioctl_ficlone() system calls are expected to either fully clone the specified range or return an error. The range may be for an entire file. While internally ZFS supports cloning partial ranges there's no way to return the length cloned to the caller so we need to make this all or nothing. As part of this change support for the REMAP_FILE_CAN_SHORTEN flag has been added. When REMAP_FILE_CAN_SHORTEN is set zfs_clone_range() will return a shortened range when encountering pending dirty records. When it's clear zfs_clone_range() will block and wait for the records to be written out allowing the blocks to be cloned. Furthermore, the file range lock is held over the region being cloned to prevent it from being modified while cloning. This doesn't quite provide an atomic semantics since if an error is encountered only a portion of the range may be cloned. This will be converted to an error if REMAP_FILE_CAN_SHORTEN was not provided and returned to the caller. However, the destination file range is left in an undefined state. A test case has been added which exercises this functionality by verifying that `cp --reflink=never|auto|always` works correctly. Reviewed-by: Alexander Motin Signed-off-by: Brian Behlendorf Closes #15728 Closes #15842 --- include/os/freebsd/zfs/sys/zfs_vfsops_os.h | 1 - include/os/linux/zfs/sys/zfs_vfsops_os.h | 2 - include/sys/zfs_vnops.h | 3 + man/man4/zfs.4 | 9 + module/os/freebsd/zfs/zfs_vfsops.c | 4 - module/os/linux/zfs/zfs_vnops_os.c | 5 - module/os/linux/zfs/zpl_file_range.c | 48 +++--- module/zfs/zfs_vnops.c | 43 ++++- tests/runfiles/common.run | 2 +- tests/test-runner/bin/zts-report.py.in | 2 + tests/zfs-tests/include/tunables.cfg | 1 + tests/zfs-tests/tests/Makefile.am | 1 + .../functional/cp_files/cp_files_002_pos.ksh | 161 ++++++++++++++++++ 13 files changed, 243 insertions(+), 39 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh diff --git a/include/os/freebsd/zfs/sys/zfs_vfsops_os.h b/include/os/freebsd/zfs/sys/zfs_vfsops_os.h index e7ebcccbe0c..7f0f24325d5 100644 --- a/include/os/freebsd/zfs/sys/zfs_vfsops_os.h +++ b/include/os/freebsd/zfs/sys/zfs_vfsops_os.h @@ -285,7 +285,6 @@ typedef struct zfid_long { #define LONG_FID_LEN (sizeof (zfid_long_t) - sizeof (uint16_t)) extern int zfs_super_owner; -extern int zfs_bclone_enabled; extern void zfs_init(void); extern void zfs_fini(void); diff --git a/include/os/linux/zfs/sys/zfs_vfsops_os.h b/include/os/linux/zfs/sys/zfs_vfsops_os.h index 22046655025..b4d5db21f5e 100644 --- a/include/os/linux/zfs/sys/zfs_vfsops_os.h +++ b/include/os/linux/zfs/sys/zfs_vfsops_os.h @@ -45,8 +45,6 @@ extern "C" { typedef struct zfsvfs zfsvfs_t; struct znode; -extern int zfs_bclone_enabled; - /* * This structure emulates the vfs_t from other platforms. It's purpose * is to facilitate the handling of mount options and minimize structural diff --git a/include/sys/zfs_vnops.h b/include/sys/zfs_vnops.h index 5da103f1778..e60b99bed19 100644 --- a/include/sys/zfs_vnops.h +++ b/include/sys/zfs_vnops.h @@ -24,8 +24,11 @@ #ifndef _SYS_FS_ZFS_VNOPS_H #define _SYS_FS_ZFS_VNOPS_H + #include +extern int zfs_bclone_enabled; + extern int zfs_fsync(znode_t *, int, cred_t *); extern int zfs_read(znode_t *, zfs_uio_t *, int, cred_t *); extern int zfs_write(znode_t *, zfs_uio_t *, int, cred_t *); diff --git a/man/man4/zfs.4 b/man/man4/zfs.4 index 47471a80590..30c168253f9 100644 --- a/man/man4/zfs.4 +++ b/man/man4/zfs.4 @@ -1159,6 +1159,15 @@ Enable the experimental block cloning feature. If this setting is 0, then even if feature@block_cloning is enabled, attempts to clone blocks will act as though the feature is disabled. . +.It Sy zfs_bclone_wait_dirty Ns = Ns Sy 0 Ns | Ns 1 Pq int +When set to 1 the FICLONE and FICLONERANGE ioctls wait for dirty data to be +written to disk. +This allows the clone operation to reliably succeed when a file is +modified and then immediately cloned. +For small files this may be slower than making a copy of the file. +Therefore, this setting defaults to 0 which causes a clone operation to +immediately fail when encountering a dirty block. +. .It Sy zfs_blake3_impl Ns = Ns Sy fastest Pq string Select a BLAKE3 implementation. .Pp diff --git a/module/os/freebsd/zfs/zfs_vfsops.c b/module/os/freebsd/zfs/zfs_vfsops.c index f2d5391037c..a972c720dfd 100644 --- a/module/os/freebsd/zfs/zfs_vfsops.c +++ b/module/os/freebsd/zfs/zfs_vfsops.c @@ -89,10 +89,6 @@ int zfs_debug_level; SYSCTL_INT(_vfs_zfs, OID_AUTO, debug, CTLFLAG_RWTUN, &zfs_debug_level, 0, "Debug level"); -int zfs_bclone_enabled = 1; -SYSCTL_INT(_vfs_zfs, OID_AUTO, bclone_enabled, CTLFLAG_RWTUN, - &zfs_bclone_enabled, 0, "Enable block cloning"); - struct zfs_jailparam { int mount_snapshot; }; diff --git a/module/os/linux/zfs/zfs_vnops_os.c b/module/os/linux/zfs/zfs_vnops_os.c index b7b89b8afc5..a32307c3933 100644 --- a/module/os/linux/zfs/zfs_vnops_os.c +++ b/module/os/linux/zfs/zfs_vnops_os.c @@ -4255,9 +4255,4 @@ EXPORT_SYMBOL(zfs_map); /* CSTYLED */ module_param(zfs_delete_blocks, ulong, 0644); MODULE_PARM_DESC(zfs_delete_blocks, "Delete files larger than N blocks async"); - -/* CSTYLED */ -module_param(zfs_bclone_enabled, uint, 0644); -MODULE_PARM_DESC(zfs_bclone_enabled, "Enable block cloning"); - #endif diff --git a/module/os/linux/zfs/zpl_file_range.c b/module/os/linux/zfs/zpl_file_range.c index 73476ff40eb..3065d54fa9d 100644 --- a/module/os/linux/zfs/zpl_file_range.c +++ b/module/os/linux/zfs/zpl_file_range.c @@ -31,8 +31,6 @@ #include #include -int zfs_bclone_enabled = 1; - /* * Clone part of a file via block cloning. * @@ -40,7 +38,7 @@ int zfs_bclone_enabled = 1; * care of that depending on how it was called. */ static ssize_t -__zpl_clone_file_range(struct file *src_file, loff_t src_off, +zpl_clone_file_range_impl(struct file *src_file, loff_t src_off, struct file *dst_file, loff_t dst_off, size_t len) { struct inode *src_i = file_inode(src_file); @@ -96,11 +94,12 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, { ssize_t ret; + /* Flags is reserved for future extensions and must be zero. */ if (flags != 0) return (-EINVAL); - /* Try to do it via zfs_clone_range() */ - ret = __zpl_clone_file_range(src_file, src_off, + /* Try to do it via zfs_clone_range() and allow shortening. */ + ret = zpl_clone_file_range_impl(src_file, src_off, dst_file, dst_off, len); #ifdef HAVE_VFS_GENERIC_COPY_FILE_RANGE @@ -137,6 +136,11 @@ zpl_copy_file_range(struct file *src_file, loff_t src_off, * FIDEDUPERANGE is for turning a non-clone into a clone, that is, compare the * range in both files and if they're the same, arrange for them to be backed * by the same storage. + * + * REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given range + * if we want. It's designed for filesystems that may need to shorten the + * length for alignment, EOF, or any other requirement. ZFS may shorten the + * request when there is outstanding dirty data which hasn't been written. */ loff_t zpl_remap_file_range(struct file *src_file, loff_t src_off, @@ -145,24 +149,21 @@ zpl_remap_file_range(struct file *src_file, loff_t src_off, if (flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_CAN_SHORTEN)) return (-EINVAL); - /* - * REMAP_FILE_CAN_SHORTEN lets us know we can clone less than the given - * range if we want. Its designed for filesystems that make data past - * EOF available, and don't want it to be visible in both files. ZFS - * doesn't do that, so we just turn the flag off. - */ - flags &= ~REMAP_FILE_CAN_SHORTEN; - + /* No support for dedup yet */ if (flags & REMAP_FILE_DEDUP) - /* No support for dedup yet */ return (-EOPNOTSUPP); /* Zero length means to clone everything to the end of the file */ if (len == 0) len = i_size_read(file_inode(src_file)) - src_off; - return (__zpl_clone_file_range(src_file, src_off, - dst_file, dst_off, len)); + ssize_t ret = zpl_clone_file_range_impl(src_file, src_off, + dst_file, dst_off, len); + + if (!(flags & REMAP_FILE_CAN_SHORTEN) && ret >= 0 && ret != len) + ret = -EINVAL; + + return (ret); } #endif /* HAVE_VFS_REMAP_FILE_RANGE */ @@ -179,8 +180,14 @@ zpl_clone_file_range(struct file *src_file, loff_t src_off, if (len == 0) len = i_size_read(file_inode(src_file)) - src_off; - return (__zpl_clone_file_range(src_file, src_off, - dst_file, dst_off, len)); + /* The entire length must be cloned or this is an error. */ + ssize_t ret = zpl_clone_file_range_impl(src_file, src_off, + dst_file, dst_off, len); + + if (ret >= 0 && ret != len) + ret = -EINVAL; + + return (ret); } #endif /* HAVE_VFS_CLONE_FILE_RANGE || HAVE_VFS_FILE_OPERATIONS_EXTEND */ @@ -214,8 +221,7 @@ zpl_ioctl_ficlone(struct file *dst_file, void *arg) size_t len = i_size_read(file_inode(src_file)); - ssize_t ret = - __zpl_clone_file_range(src_file, 0, dst_file, 0, len); + ssize_t ret = zpl_clone_file_range_impl(src_file, 0, dst_file, 0, len); fput(src_file); @@ -253,7 +259,7 @@ zpl_ioctl_ficlonerange(struct file *dst_file, void __user *arg) if (len == 0) len = i_size_read(file_inode(src_file)) - fcr.fcr_src_offset; - ssize_t ret = __zpl_clone_file_range(src_file, fcr.fcr_src_offset, + ssize_t ret = zpl_clone_file_range_impl(src_file, fcr.fcr_src_offset, dst_file, fcr.fcr_dest_offset, len); fput(src_file); diff --git a/module/zfs/zfs_vnops.c b/module/zfs/zfs_vnops.c index c8ff7b6432f..7f39ad6fc77 100644 --- a/module/zfs/zfs_vnops.c +++ b/module/zfs/zfs_vnops.c @@ -58,6 +58,26 @@ #include #include +/* + * Enable the experimental block cloning feature. If this setting is 0, then + * even if feature@block_cloning is enabled, attempts to clone blocks will act + * as though the feature is disabled. + */ +int zfs_bclone_enabled = 1; + +/* + * When set zfs_clone_range() waits for dirty data to be written to disk. + * This allows the clone operation to reliably succeed when a file is modified + * and then immediately cloned. For small files this may be slower than making + * a copy of the file and is therefore not the default. However, in certain + * scenarios this behavior may be desirable so a tunable is provided. + */ +static int zfs_bclone_wait_dirty = 0; + +/* + * Maximum bytes to read per chunk in zfs_read(). + */ +static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024; int zfs_fsync(znode_t *zp, int syncflag, cred_t *cr) @@ -182,8 +202,6 @@ zfs_access(znode_t *zp, int mode, int flag, cred_t *cr) return (error); } -static uint64_t zfs_vnops_read_chunk_size = 1024 * 1024; /* Tunable */ - /* * Read bytes from specified file into supplied buffer. * @@ -1049,6 +1067,7 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, size_t maxblocks, nbps; uint_t inblksz; uint64_t clear_setid_bits_txg = 0; + uint64_t last_synced_txg = 0; inoff = *inoffp; outoff = *outoffp; @@ -1287,15 +1306,23 @@ zfs_clone_range(znode_t *inzp, uint64_t *inoffp, znode_t *outzp, } nbps = maxblocks; + last_synced_txg = spa_last_synced_txg(dmu_objset_spa(inos)); error = dmu_read_l0_bps(inos, inzp->z_id, inoff, size, bps, &nbps); if (error != 0) { /* * If we are trying to clone a block that was created - * in the current transaction group, error will be - * EAGAIN here, which we can just return to the caller - * so it can fallback if it likes. + * in the current transaction group, the error will be + * EAGAIN here. Based on zfs_bclone_wait_dirty either + * return a shortened range to the caller so it can + * fallback, or wait for the next TXG and check again. */ + if (error == EAGAIN && zfs_bclone_wait_dirty) { + txg_wait_synced(dmu_objset_pool(inos), + last_synced_txg + 1); + continue; + } + break; } @@ -1517,3 +1544,9 @@ EXPORT_SYMBOL(zfs_clone_range_replay); ZFS_MODULE_PARAM(zfs_vnops, zfs_vnops_, read_chunk_size, U64, ZMOD_RW, "Bytes to read per chunk"); + +ZFS_MODULE_PARAM(zfs, zfs_, bclone_enabled, INT, ZMOD_RW, + "Enable block cloning"); + +ZFS_MODULE_PARAM(zfs, zfs_, bclone_wait_dirty, INT, ZMOD_RW, + "Wait for dirty blocks when cloning"); diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 7e0990b5d9f..05e8cdc8f23 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -631,7 +631,7 @@ tests = ['compress_001_pos', 'compress_002_pos', 'compress_003_pos', tags = ['functional', 'compression'] [tests/functional/cp_files] -tests = ['cp_files_001_pos', 'cp_stress'] +tests = ['cp_files_001_pos', 'cp_files_002_pos', 'cp_stress'] tags = ['functional', 'cp_files'] [tests/functional/crtime] diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index ae4aa627546..edfdd47ee6d 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -176,6 +176,7 @@ if sys.platform.startswith('freebsd'): 'cli_root/zpool_wait/zpool_wait_trim_cancel': ['SKIP', trim_reason], 'cli_root/zpool_wait/zpool_wait_trim_flag': ['SKIP', trim_reason], 'cli_root/zfs_unshare/zfs_unshare_008_pos': ['SKIP', na_reason], + 'cp_files/cp_files_002_pos': ['SKIP', na_reason], 'link_count/link_count_001': ['SKIP', na_reason], 'casenorm/mixed_create_failure': ['FAIL', 13215], 'mmap/mmap_sync_001_pos': ['SKIP', na_reason], @@ -312,6 +313,7 @@ elif sys.platform.startswith('linux'): ['SKIP', cfr_reason], 'cli_root/zfs_rename/zfs_rename_002_pos': ['FAIL', known_reason], 'cli_root/zpool_reopen/zpool_reopen_003_pos': ['FAIL', known_reason], + 'cp_files/cp_files_002_pos': ['SKIP', cfr_reason], 'fault/auto_online_002_pos': ['FAIL', 11889], 'fault/auto_replace_001_pos': ['FAIL', 14851], 'fault/auto_spare_002_pos': ['FAIL', 11889], diff --git a/tests/zfs-tests/include/tunables.cfg b/tests/zfs-tests/include/tunables.cfg index e4e380aa7fd..a619b846dd1 100644 --- a/tests/zfs-tests/include/tunables.cfg +++ b/tests/zfs-tests/include/tunables.cfg @@ -94,6 +94,7 @@ VOL_MODE vol.mode zvol_volmode VOL_RECURSIVE vol.recursive UNSUPPORTED VOL_USE_BLK_MQ UNSUPPORTED zvol_use_blk_mq BCLONE_ENABLED zfs_bclone_enabled zfs_bclone_enabled +BCLONE_WAIT_DIRTY zfs_bclone_wait_dirty zfs_bclone_wait_dirty XATTR_COMPAT xattr_compat zfs_xattr_compat ZEVENT_LEN_MAX zevent.len_max zfs_zevent_len_max ZEVENT_RETAIN_MAX zevent.retain_max zfs_zevent_retain_max diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 4040e60434a..44fa0bf6869 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1394,6 +1394,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/compression/setup.ksh \ functional/cp_files/cleanup.ksh \ functional/cp_files/cp_files_001_pos.ksh \ + functional/cp_files/cp_files_002_pos.ksh \ functional/cp_files/cp_stress.ksh \ functional/cp_files/setup.ksh \ functional/crtime/cleanup.ksh \ diff --git a/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh b/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh new file mode 100755 index 00000000000..60817449ab0 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cp_files/cp_files_002_pos.ksh @@ -0,0 +1,161 @@ +#! /bin/ksh -p +# +# CDDL HEADER START +# +# The contents of this file are subject to the terms of the +# Common Development and Distribution License (the "License"). +# You may not use this file except in compliance with the License. +# +# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE +# or https://opensource.org/licenses/CDDL-1.0. +# See the License for the specific language governing permissions +# and limitations under the License. +# +# When distributing Covered Code, include this CDDL HEADER in each +# file and include the License file at usr/src/OPENSOLARIS.LICENSE. +# If applicable, add the following below this CDDL HEADER, with the +# fields enclosed by brackets "[]" replaced with your own identifying +# information: Portions Copyright [yyyy] [name of copyright owner] +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib +. $STF_SUITE/tests/functional/bclone/bclone_common.kshlib + +# +# DESCRIPTION: +# Verify all cp --reflink modes work with modified file. +# +# STRATEGY: +# 1. Verify "cp --reflink=never|auto|always" behaves as expected. +# Two different modes of operation are tested. +# +# a. zfs_bclone_wait_dirty=0: FICLONE and FICLONERANGE fail with EINVAL +# when there are dirty blocks which cannot be immediately cloned. +# This is the default behavior. +# +# b. zfs_bclone_wait_dirty=1: FICLONE and FICLONERANGE wait for +# dirty blocks to be written to disk allowing the clone to succeed. +# The downside to this is it may be slow which depending on the +# situtation may defeat the point of making a clone. +# + +verify_runnable "global" +verify_block_cloning + +if ! is_linux; then + log_unsupported "cp --reflink is a GNU coreutils option" +fi + +function cleanup +{ + datasetexists $TESTPOOL/cp-reflink && \ + destroy_dataset $$TESTPOOL/cp-reflink -f + log_must set_tunable32 BCLONE_WAIT_DIRTY 0 +} + +function verify_copy +{ + src_cksum=$(sha256digest $1) + dst_cksum=$(sha256digest $2) + + if [[ "$src_cksum" != "$dst_cksum" ]]; then + log_must ls -l $CP_TESTDIR + log_fail "checksum mismatch ($src_cksum != $dst_cksum)" + fi +} + +log_assert "Verify all cp --reflink modes work with modified file" + +log_onexit cleanup + +SRC_FILE=src.data +DST_FILE=dst.data +SRC_SIZE=$(($RANDOM % 2048)) + +# A smaller recordsize is used merely to speed up the test. +RECORDSIZE=4096 + +log_must zfs create -o recordsize=$RECORDSIZE $TESTPOOL/cp-reflink +CP_TESTDIR=$(get_prop mountpoint $TESTPOOL/cp-reflink) + +log_must cd $CP_TESTDIR + +# Never wait on dirty blocks (zfs_bclone_wait_dirty=0) +log_must set_tunable32 BCLONE_WAIT_DIRTY 0 + +for mode in "never" "auto" "always"; do + log_note "Checking 'cp --reflink=$mode'" + + # Create a new file and immediately copy it. + log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE count=$SRC_SIZE + + if [[ "$mode" == "always" ]]; then + log_mustnot cp --reflink=$mode $SRC_FILE $DST_FILE + log_must ls -l $CP_TESTDIR + else + log_must cp --reflink=$mode $SRC_FILE $DST_FILE + verify_copy $SRC_FILE $DST_FILE + fi + log_must rm -f $DST_FILE + + # Append to an existing file and immediately copy it. + sync_pool $TESTPOOL + log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE seek=$SRC_SIZE \ + count=1 conv=notrunc + if [[ "$mode" == "always" ]]; then + log_mustnot cp --reflink=$mode $SRC_FILE $DST_FILE + log_must ls -l $CP_TESTDIR + else + log_must cp --reflink=$mode $SRC_FILE $DST_FILE + verify_copy $SRC_FILE $DST_FILE + fi + log_must rm -f $DST_FILE + + # Overwrite a random range of an existing file and immediately copy it. + sync_pool $TESTPOOL + log_must dd if=/dev/urandom of=$SRC_FILE bs=$((RECORDSIZE / 2)) \ + seek=$(($RANDOM % $SRC_SIZE)) count=$(($RANDOM % 16)) conv=notrunc + if [[ "$mode" == "always" ]]; then + log_mustnot cp --reflink=$mode $SRC_FILE $DST_FILE + log_must ls -l $CP_TESTDIR + else + log_must cp --reflink=$mode $SRC_FILE $DST_FILE + verify_copy $SRC_FILE $DST_FILE + fi + log_must rm -f $SRC_FILE $DST_FILE +done + +# Wait on dirty blocks (zfs_bclone_wait_dirty=1) +log_must set_tunable32 BCLONE_WAIT_DIRTY 1 + +for mode in "never" "auto" "always"; do + log_note "Checking 'cp --reflink=$mode'" + + # Create a new file and immediately copy it. + log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE count=$SRC_SIZE + log_must cp --reflink=$mode $SRC_FILE $DST_FILE + verify_copy $SRC_FILE $DST_FILE + log_must rm -f $DST_FILE + + # Append to an existing file and immediately copy it. + log_must dd if=/dev/urandom of=$SRC_FILE bs=$RECORDSIZE seek=$SRC_SIZE \ + count=1 conv=notrunc + log_must cp --reflink=$mode $SRC_FILE $DST_FILE + verify_copy $SRC_FILE $DST_FILE + log_must rm -f $DST_FILE + + # Overwrite a random range of an existing file and immediately copy it. + log_must dd if=/dev/urandom of=$SRC_FILE bs=$((RECORDSIZE / 2)) \ + seek=$(($RANDOM % $SRC_SIZE)) count=$(($RANDOM % 16)) conv=notrunc + log_must cp --reflink=$mode $SRC_FILE $DST_FILE + verify_copy $SRC_FILE $DST_FILE + log_must rm -f $SRC_FILE $DST_FILE +done + +log_pass From 0823388752770b85be0890c8482b314b501e79ec Mon Sep 17 00:00:00 2001 From: Cameron Harr Date: Wed, 7 Feb 2024 09:12:12 -0800 Subject: [PATCH 4/5] Add 'zpool status -e' flag to see unhealthy vdevs When very large pools are present, it can be laborious to find reasons for why a pool is degraded and/or where an unhealthy vdev is. This option filters out vdevs that are ONLINE and with no errors to make it easier to see where the issues are. Root and parents of unhealthy vdevs will always be printed. Testing: ZFS errors and drive failures for multiple vdevs were simulated with zinject. Sample vdev listings with '-e' option - All vdevs healthy NAME STATE READ WRITE CKSUM iron5 ONLINE 0 0 0 - ZFS errors NAME STATE READ WRITE CKSUM iron5 ONLINE 0 0 0 raidz2-5 ONLINE 1 0 0 L23 ONLINE 1 0 0 L24 ONLINE 1 0 0 L37 ONLINE 1 0 0 - Vdev faulted NAME STATE READ WRITE CKSUM iron5 DEGRADED 0 0 0 raidz2-6 DEGRADED 0 0 0 L67 FAULTED 0 0 0 too many errors - Vdev faults and data errors NAME STATE READ WRITE CKSUM iron5 DEGRADED 0 0 0 raidz2-1 DEGRADED 0 0 0 L2 FAULTED 0 0 0 too many errors raidz2-5 ONLINE 1 0 0 L23 ONLINE 1 0 0 L24 ONLINE 1 0 0 L37 ONLINE 1 0 0 raidz2-6 DEGRADED 0 0 0 L67 FAULTED 0 0 0 too many errors - Vdev missing NAME STATE READ WRITE CKSUM iron5 DEGRADED 0 0 0 raidz2-6 DEGRADED 0 0 0 L67 UNAVAIL 3 1 0 - Slow devices when -s provided with -e NAME STATE READ WRITE CKSUM SLOW iron5 DEGRADED 0 0 0 - raidz2-5 DEGRADED 0 0 0 - L10 FAULTED 0 0 0 0 external device fault L51 ONLINE 0 0 0 14 Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: Cameron Harr Closes #15769 --- cmd/zpool/zpool_main.c | 58 +++++++++- man/man8/zpool-status.8 | 4 +- tests/runfiles/common.run | 3 +- tests/zfs-tests/tests/Makefile.am | 1 + .../zpool_status/zpool_status_002_pos.ksh | 4 +- .../zpool_status/zpool_status_003_pos.ksh | 2 + .../zpool_status/zpool_status_008_pos.ksh | 104 ++++++++++++++++++ 7 files changed, 169 insertions(+), 7 deletions(-) create mode 100755 tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh diff --git a/cmd/zpool/zpool_main.c b/cmd/zpool/zpool_main.c index 11486f3f185..8753d726391 100644 --- a/cmd/zpool/zpool_main.c +++ b/cmd/zpool/zpool_main.c @@ -2161,6 +2161,7 @@ typedef struct status_cbdata { boolean_t cb_explain; boolean_t cb_first; boolean_t cb_dedup_stats; + boolean_t cb_print_unhealthy; boolean_t cb_print_status; boolean_t cb_print_slow_ios; boolean_t cb_print_vdev_init; @@ -2357,6 +2358,35 @@ health_str_to_color(const char *health) return (NULL); } +/* + * Called for each leaf vdev. Returns 0 if the vdev is healthy. + * A vdev is unhealthy if any of the following are true: + * 1) there are read, write, or checksum errors, + * 2) its state is not ONLINE, or + * 3) slow IO reporting was requested (-s) and there are slow IOs. + */ +static int +vdev_health_check_cb(void *hdl_data, nvlist_t *nv, void *data) +{ + status_cbdata_t *cb = data; + vdev_stat_t *vs; + uint_t vsc; + (void) hdl_data; + + if (nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, + (uint64_t **)&vs, &vsc) != 0) + return (1); + + if (vs->vs_checksum_errors || vs->vs_read_errors || + vs->vs_write_errors || vs->vs_state != VDEV_STATE_HEALTHY) + return (1); + + if (cb->cb_print_slow_ios && vs->vs_slow_ios) + return (1); + + return (0); +} + /* * Print out configuration state as requested by status_callback. */ @@ -2375,7 +2405,8 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, const char *state; const char *type; const char *path = NULL; - const char *rcolor = NULL, *wcolor = NULL, *ccolor = NULL; + const char *rcolor = NULL, *wcolor = NULL, *ccolor = NULL, + *scolor = NULL; if (nvlist_lookup_nvlist_array(nv, ZPOOL_CONFIG_CHILDREN, &child, &children) != 0) @@ -2402,6 +2433,15 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, state = gettext("AVAIL"); } + /* + * If '-e' is specified then top-level vdevs and their children + * can be pruned if all of their leaves are healthy. + */ + if (cb->cb_print_unhealthy && depth > 0 && + for_each_vdev_in_nvlist(nv, vdev_health_check_cb, cb) == 0) { + return; + } + printf_color(health_str_to_color(state), "\t%*s%-*s %-8s", depth, "", cb->cb_namewidth - depth, name, state); @@ -2416,6 +2456,9 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, if (vs->vs_checksum_errors) ccolor = ANSI_RED; + if (vs->vs_slow_ios) + scolor = ANSI_BLUE; + if (cb->cb_literal) { fputc(' ', stdout); printf_color(rcolor, "%5llu", @@ -2448,9 +2491,10 @@ print_status_config(zpool_handle_t *zhp, status_cbdata_t *cb, const char *name, } if (cb->cb_literal) - printf(" %5llu", (u_longlong_t)vs->vs_slow_ios); + printf_color(scolor, " %5llu", + (u_longlong_t)vs->vs_slow_ios); else - printf(" %5s", rbuf); + printf_color(scolor, " %5s", rbuf); } if (cb->cb_print_power) { if (children == 0) { @@ -9106,9 +9150,11 @@ status_callback(zpool_handle_t *zhp, void *data) (void) printf(gettext( "errors: No known data errors\n")); } else if (!cbp->cb_verbose) { + color_start(ANSI_RED); (void) printf(gettext("errors: %llu data " "errors, use '-v' for a list\n"), (u_longlong_t)nerr); + color_end(); } else { print_error_log(zhp); } @@ -9129,6 +9175,7 @@ status_callback(zpool_handle_t *zhp, void *data) * [pool] [interval [count]] * * -c CMD For each vdev, run command CMD + * -e Display only unhealthy vdevs * -i Display vdev initialization status. * -g Display guid for individual vdev name. * -L Follow links when resolving vdev path name. @@ -9160,7 +9207,7 @@ zpool_do_status(int argc, char **argv) }; /* check options */ - while ((c = getopt_long(argc, argv, "c:igLpPsvxDtT:", long_options, + while ((c = getopt_long(argc, argv, "c:eigLpPsvxDtT:", long_options, NULL)) != -1) { switch (c) { case 'c': @@ -9187,6 +9234,9 @@ zpool_do_status(int argc, char **argv) } cmd = optarg; break; + case 'e': + cb.cb_print_unhealthy = B_TRUE; + break; case 'i': cb.cb_print_vdev_init = B_TRUE; break; diff --git a/man/man8/zpool-status.8 b/man/man8/zpool-status.8 index 56fa4aed057..24ad6e643ca 100644 --- a/man/man8/zpool-status.8 +++ b/man/man8/zpool-status.8 @@ -36,7 +36,7 @@ .Sh SYNOPSIS .Nm zpool .Cm status -.Op Fl DigLpPstvx +.Op Fl DeigLpPstvx .Op Fl T Sy u Ns | Ns Sy d .Op Fl c Op Ar SCRIPT1 Ns Oo , Ns Ar SCRIPT2 Oc Ns … .Oo Ar pool Oc Ns … @@ -69,6 +69,8 @@ See the option of .Nm zpool Cm iostat for complete details. +.It Fl e +Only show unhealthy vdevs (not-ONLINE or with errors). .It Fl i Display vdev initialization status. .It Fl g diff --git a/tests/runfiles/common.run b/tests/runfiles/common.run index 05e8cdc8f23..502b4de2bae 100644 --- a/tests/runfiles/common.run +++ b/tests/runfiles/common.run @@ -536,7 +536,8 @@ tags = ['functional', 'cli_root', 'zpool_split'] tests = ['zpool_status_001_pos', 'zpool_status_002_pos', 'zpool_status_003_pos', 'zpool_status_004_pos', 'zpool_status_005_pos', 'zpool_status_006_pos', - 'zpool_status_007_pos', 'zpool_status_features_001_pos'] + 'zpool_status_007_pos', 'zpool_status_008_pos', + 'zpool_status_features_001_pos'] tags = ['functional', 'cli_root', 'zpool_status'] [tests/functional/cli_root/zpool_sync] diff --git a/tests/zfs-tests/tests/Makefile.am b/tests/zfs-tests/tests/Makefile.am index 44fa0bf6869..01af258d59f 100644 --- a/tests/zfs-tests/tests/Makefile.am +++ b/tests/zfs-tests/tests/Makefile.am @@ -1239,6 +1239,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \ functional/cli_root/zpool_status/zpool_status_005_pos.ksh \ functional/cli_root/zpool_status/zpool_status_006_pos.ksh \ functional/cli_root/zpool_status/zpool_status_007_pos.ksh \ + functional/cli_root/zpool_status/zpool_status_008_pos.ksh \ functional/cli_root/zpool_status/zpool_status_features_001_pos.ksh \ functional/cli_root/zpool_sync/cleanup.ksh \ functional/cli_root/zpool_sync/setup.ksh \ diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_002_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_002_pos.ksh index 3bdd7db649f..d6f32cdc7ac 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_002_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_002_pos.ksh @@ -51,7 +51,7 @@ else fi set -A args "" "-x" "-v" "-x $testpool" "-v $testpool" "-xv $testpool" \ - "-vx $testpool" + "-vx $testpool" "-e $testpool" "-es $testpool" log_assert "Executing 'zpool status' with correct options succeeds" @@ -64,4 +64,6 @@ while [[ $i -lt ${#args[*]} ]]; do (( i = i + 1 )) done +cleanup + log_pass "'zpool status' with correct options succeeded" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh index b501aac5ad6..52b22dd833f 100755 --- a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_003_pos.ksh @@ -37,6 +37,7 @@ # 3. Read the file # 4. Take a snapshot and make a clone # 5. Verify we see "snapshot, clone and filesystem" output in 'zpool status -v' +# and 'zpool status -ev' function cleanup { @@ -68,6 +69,7 @@ log_must zpool status -v $TESTPOOL2 log_must eval "zpool status -v | grep '$TESTPOOL2@snap:/10m_file'" log_must eval "zpool status -v | grep '$TESTPOOL2/clone/10m_file'" log_must eval "zpool status -v | grep '$TESTPOOL2/10m_file'" +log_must eval "zpool status -ev | grep '$TESTPOOL2/10m_file'" log_mustnot eval "zpool status -v | grep '$TESTFS1'" log_pass "'zpool status -v' outputs affected filesystem, snapshot & clone" diff --git a/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh new file mode 100755 index 00000000000..6be2ad5a741 --- /dev/null +++ b/tests/zfs-tests/tests/functional/cli_root/zpool_status/zpool_status_008_pos.ksh @@ -0,0 +1,104 @@ +#!/bin/ksh -p + +# +# CDDL HEADER START +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# +# CDDL HEADER END +# + +# +# Copyright (c) 2024 by Lawrence Livermore National Security, LLC. +# + +. $STF_SUITE/include/libtest.shlib + +# +# DESCRIPTION: +# Verify 'zpool status -e' only shows unhealthy devices. +# +# STRATEGY: +# 1. Create zpool +# 2. Force DEGRADE, FAULT, or inject slow IOs for vdevs +# 3. Verify vdevs are reported correctly with -e and -s +# 4. Verify parents are reported as DEGRADED +# 5. Verify healthy children are not reported +# + +function cleanup +{ + log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO + zinject -c all + poolexists $TESTPOOL2 && destroy_pool $TESTPOOL2 + log_must rm -f $all_vdevs +} + +log_assert "Verify 'zpool status -e'" + +log_onexit cleanup + +all_vdevs=$(echo $TESTDIR/vdev{1..6}) +log_must mkdir -p $TESTDIR +log_must truncate -s $MINVDEVSIZE $all_vdevs + +OLD_SLOW_IO=$(get_tunable ZIO_SLOW_IO_MS) + +for raid_type in "draid2:3d:6c:1s" "raidz2"; do + + log_must zpool create -f $TESTPOOL2 $raid_type $all_vdevs + + # Check DEGRADED vdevs are shown. + log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev4 "ONLINE" + log_must zinject -d $TESTDIR/vdev4 -A degrade $TESTPOOL2 + log_must eval "zpool status -e $TESTPOOL2 | grep $TESTDIR/vdev4 | grep DEGRADED" + + # Check FAULTED vdevs are shown. + log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev5 "ONLINE" + log_must zinject -d $TESTDIR/vdev5 -A fault $TESTPOOL2 + log_must eval "zpool status -e $TESTPOOL2 | grep $TESTDIR/vdev5 | grep FAULTED" + + # Check no ONLINE vdevs are shown + log_mustnot eval "zpool status -e $TESTPOOL2 | grep ONLINE" + + # Check no ONLINE slow vdevs are show. Then mark IOs greater than + # 10ms slow, delay IOs 20ms to vdev6, check slow IOs. + log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev6 "ONLINE" + log_mustnot eval "zpool status -es $TESTPOOL2 | grep ONLINE" + + log_must set_tunable64 ZIO_SLOW_IO_MS 10 + log_must zinject -d $TESTDIR/vdev6 -D20:100 $TESTPOOL2 + log_must mkfile 1048576 /$TESTPOOL2/testfile + sync_pool $TESTPOOL2 + log_must set_tunable64 ZIO_SLOW_IO_MS $OLD_SLOW_IO + + # Check vdev6 slow IOs are only shown when requested with -s. + log_mustnot eval "zpool status -e $TESTPOOL2 | grep $TESTDIR/vdev6 | grep ONLINE" + log_must eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev6 | grep ONLINE" + + # Pool level and top-vdev level status must be DEGRADED. + log_must eval "zpool status -e $TESTPOOL2 | grep $TESTPOOL2 | grep DEGRADED" + log_must eval "zpool status -e $TESTPOOL2 | grep $raid_type | grep DEGRADED" + + # Check that healthy vdevs[1-3] aren't shown with -e. + log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev1 "ONLINE" + log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev2 "ONLINE" + log_must check_vdev_state $TESTPOOL2 $TESTDIR/vdev3 "ONLINE" + log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev1 | grep ONLINE" + log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev2 | grep ONLINE" + log_mustnot eval "zpool status -es $TESTPOOL2 | grep $TESTDIR/vdev3 | grep ONLINE" + + log_must zinject -c all + log_must zpool status -es $TESTPOOL2 + + zpool destroy $TESTPOOL2 +done + +log_pass "Verify zpool status -e shows only unhealthy vdevs" From 229b9f4ed05e6d14fb4d73fa04a71e99b01bb534 Mon Sep 17 00:00:00 2001 From: the-Chain-Warden-thresh <18302010006@fudan.edu.cn> Date: Thu, 8 Feb 2024 03:53:05 +0800 Subject: [PATCH 5/5] LUA: Backport CVE-2020-24370's patch CVE-2020-24370 is a security vulnerability in lua. Although the CVE description in CVE-2020-24370 said that this CVE only affected lua 5.4.0, according to lua this CVE actually existed since lua 5.2. The root cause of this CVE is the negation overflow that occurs when you try to take the negative of 0x80000000. Thus, this CVE also exists in openzfs. Try to backport the fix to the lua in openzfs since the original fix is for 5.4 and several functions have been changed. https://github.com/advisories/GHSA-gfr4-c37g-mm3v https://nvd.nist.gov/vuln/detail/CVE-2020-24370 https://www.lua.org/bugs.html#5.4.0-11 https://github.com/lua/lua/commit/a585eae6e7ada1ca9271607a4f48dfb1786 Reviewed-by: Tony Hutter Reviewed-by: Brian Behlendorf Signed-off-by: ChenHao Lu <18302010006@fudan.edu.cn> Closes #15847 --- module/lua/ldebug.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/module/lua/ldebug.c b/module/lua/ldebug.c index 0092474c762..23e321bb124 100644 --- a/module/lua/ldebug.c +++ b/module/lua/ldebug.c @@ -111,10 +111,11 @@ static const char *upvalname (Proto *p, int uv) { static const char *findvararg (CallInfo *ci, int n, StkId *pos) { int nparams = clLvalue(ci->func)->p->numparams; - if (n >= ci->u.l.base - ci->func - nparams) + int nvararg = cast_int(ci->u.l.base - ci->func) - nparams; + if (n <= -nvararg) return NULL; /* no such vararg */ else { - *pos = ci->func + nparams + n; + *pos = ci->func + nparams - n; return "(*vararg)"; /* generic name for any vararg */ } } @@ -126,7 +127,7 @@ static const char *findlocal (lua_State *L, CallInfo *ci, int n, StkId base; if (isLua(ci)) { if (n < 0) /* access to vararg values? */ - return findvararg(ci, -n, pos); + return findvararg(ci, n, pos); else { base = ci->u.l.base; name = luaF_getlocalname(ci_func(ci)->p, n, currentpc(ci));