From d3bafe455457aaf4134961e0def864a01be56661 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 15 Feb 2024 19:37:38 +1100 Subject: [PATCH 01/20] ddt: modernise assertions Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. Closes #15887 --- module/zfs/ddt.c | 74 +++++++++++++++++++++++--------------------- module/zfs/ddt_zap.c | 8 ++--- 2 files changed, 42 insertions(+), 40 deletions(-) diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 1fb19821990..05b28acbcd3 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -70,16 +70,16 @@ ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class, ddt_object_name(ddt, type, class, name); - ASSERT(*objectp == 0); - VERIFY(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash) == 0); - ASSERT(*objectp != 0); + ASSERT3U(*objectp, ==, 0); + VERIFY0(ddt_ops[type]->ddt_op_create(os, objectp, tx, prehash)); + ASSERT3U(*objectp, !=, 0); - VERIFY(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name, - sizeof (uint64_t), 1, objectp, tx) == 0); + VERIFY0(zap_add(os, DMU_POOL_DIRECTORY_OBJECT, name, + sizeof (uint64_t), 1, objectp, tx)); - VERIFY(zap_add(os, spa->spa_ddt_stat_object, name, + VERIFY0(zap_add(os, spa->spa_ddt_stat_object, name, sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), - &ddt->ddt_histogram[type][class], tx) == 0); + &ddt->ddt_histogram[type][class], tx)); } static void @@ -94,12 +94,13 @@ ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class, ddt_object_name(ddt, type, class, name); - ASSERT(*objectp != 0); + ASSERT3U(*objectp, !=, 0); ASSERT(ddt_histogram_empty(&ddt->ddt_histogram[type][class])); - VERIFY(ddt_object_count(ddt, type, class, &count) == 0 && count == 0); - VERIFY(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx) == 0); - VERIFY(zap_remove(os, spa->spa_ddt_stat_object, name, tx) == 0); - VERIFY(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx) == 0); + VERIFY0(ddt_object_count(ddt, type, class, &count)); + VERIFY0(count); + VERIFY0(zap_remove(os, DMU_POOL_DIRECTORY_OBJECT, name, tx)); + VERIFY0(zap_remove(os, spa->spa_ddt_stat_object, name, tx)); + VERIFY0(ddt_ops[type]->ddt_op_destroy(os, *objectp, tx)); memset(&ddt->ddt_object_stats[type][class], 0, sizeof (ddt_object_t)); *objectp = 0; @@ -156,15 +157,15 @@ ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class, ddt_object_name(ddt, type, class, name); - VERIFY(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, + VERIFY0(zap_update(ddt->ddt_os, ddt->ddt_spa->spa_ddt_stat_object, name, sizeof (uint64_t), sizeof (ddt_histogram_t) / sizeof (uint64_t), - &ddt->ddt_histogram[type][class], tx) == 0); + &ddt->ddt_histogram[type][class], tx)); /* * Cache DDT statistics; this is the only time they'll change. */ - VERIFY(ddt_object_info(ddt, type, class, &doi) == 0); - VERIFY(ddt_object_count(ddt, type, class, &count) == 0); + VERIFY0(ddt_object_info(ddt, type, class, &doi)); + VERIFY0(ddt_object_count(ddt, type, class, &count)); ddo->ddo_count = count; ddo->ddo_dspace = doi.doi_physical_blocks_512 << 9; @@ -262,7 +263,7 @@ ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class, void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg) { - ASSERT(txg != 0); + ASSERT3U(txg, !=, 0); for (int d = 0; d < SPA_DVAS_PER_BP; d++) bp->blk_dva[d] = ddp->ddp_dva[d]; @@ -313,7 +314,7 @@ ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp) void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp) { - ASSERT(ddp->ddp_phys_birth == 0); + ASSERT0(ddp->ddp_phys_birth); for (int d = 0; d < SPA_DVAS_PER_BP; d++) ddp->ddp_dva[d] = bp->blk_dva[d]; @@ -336,7 +337,7 @@ void ddt_phys_decref(ddt_phys_t *ddp) { if (ddp) { - ASSERT(ddp->ddp_refcnt > 0); + ASSERT3U(ddp->ddp_refcnt, >, 0); ddp->ddp_refcnt--; } } @@ -438,7 +439,7 @@ ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg) ddt_stat_generate(ddt, dde, &dds); bucket = highbit64(dds.dds_ref_blocks) - 1; - ASSERT(bucket >= 0); + ASSERT3U(bucket, >=, 0); ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; @@ -561,7 +562,7 @@ ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len) zio_compress_info_t *ci = &zio_compress_table[cpfunc]; size_t c_len; - ASSERT(d_len >= s_len + 1); /* no compression plus version byte */ + ASSERT3U(d_len, >=, s_len + 1); /* no compression plus version byte */ c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level); @@ -648,7 +649,7 @@ ddt_free(ddt_entry_t *dde) ASSERT(!dde->dde_loading); for (int p = 0; p < DDT_PHYS_TYPES; p++) - ASSERT(dde->dde_lead_zio[p] == NULL); + ASSERT3P(dde->dde_lead_zio[p], ==, NULL); if (dde->dde_repair_abd != NULL) abd_free(dde->dde_repair_abd); @@ -713,8 +714,8 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) ddt_enter(ddt); - ASSERT(dde->dde_loaded == B_FALSE); - ASSERT(dde->dde_loading == B_TRUE); + ASSERT(!dde->dde_loaded); + ASSERT(dde->dde_loading); dde->dde_type = type; /* will be DDT_TYPES if no entry found */ dde->dde_class = class; /* will be DDT_CLASSES if no entry found */ @@ -803,8 +804,8 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c) static void ddt_table_free(ddt_t *ddt) { - ASSERT(avl_numnodes(&ddt->ddt_tree) == 0); - ASSERT(avl_numnodes(&ddt->ddt_repair_tree) == 0); + ASSERT0(avl_numnodes(&ddt->ddt_tree)); + ASSERT0(avl_numnodes(&ddt->ddt_repair_tree)); avl_destroy(&ddt->ddt_tree); avl_destroy(&ddt->ddt_repair_tree); mutex_destroy(&ddt->ddt_lock); @@ -1017,9 +1018,9 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) ASSERT(!dde->dde_loading); for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - ASSERT(dde->dde_lead_zio[p] == NULL); + ASSERT3P(dde->dde_lead_zio[p], ==, NULL); if (ddp->ddp_phys_birth == 0) { - ASSERT(ddp->ddp_refcnt == 0); + ASSERT0(ddp->ddp_refcnt); continue; } if (p == DDT_PHYS_DITTO) { @@ -1044,8 +1045,9 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) if (otype != DDT_TYPES && (otype != ntype || oclass != nclass || total_refcnt == 0)) { - VERIFY(ddt_object_remove(ddt, otype, oclass, dde, tx) == 0); - ASSERT(ddt_object_lookup(ddt, otype, oclass, dde) == ENOENT); + VERIFY0(ddt_object_remove(ddt, otype, oclass, dde, tx)); + ASSERT3U( + ddt_object_lookup(ddt, otype, oclass, dde), ==, ENOENT); } if (total_refcnt != 0) { @@ -1054,7 +1056,7 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) ddt_stat_update(ddt, dde, 0); if (!ddt_object_exists(ddt, ntype, nclass)) ddt_object_create(ddt, ntype, nclass, tx); - VERIFY(ddt_object_update(ddt, ntype, nclass, dde, tx) == 0); + VERIFY0(ddt_object_update(ddt, ntype, nclass, dde, tx)); /* * If the class changes, the order that we scan this bp @@ -1080,7 +1082,7 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) if (avl_numnodes(&ddt->ddt_tree) == 0) return; - ASSERT(spa->spa_uberblock.ub_version >= SPA_VERSION_DEDUP); + ASSERT3U(spa->spa_uberblock.ub_version, >=, SPA_VERSION_DEDUP); if (spa->spa_ddt_stat_object == 0) { spa->spa_ddt_stat_object = zap_create_link(ddt->ddt_os, @@ -1098,8 +1100,8 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { if (ddt_object_exists(ddt, type, class)) { ddt_object_sync(ddt, type, class, tx); - VERIFY(ddt_object_count(ddt, type, class, - &add) == 0); + VERIFY0(ddt_object_count(ddt, type, class, + &add)); count += add; } } @@ -1121,7 +1123,7 @@ ddt_sync(spa_t *spa, uint64_t txg) dmu_tx_t *tx; zio_t *rio; - ASSERT(spa_syncing_txg(spa) == txg); + ASSERT3U(spa_syncing_txg(spa), ==, txg); tx = dmu_tx_create_assigned(spa->spa_dsl_pool, txg); @@ -1201,7 +1203,7 @@ ddt_addref(spa_t *spa, const blkptr_t *bp) ddt_enter(ddt); dde = ddt_lookup(ddt, bp, B_TRUE); - ASSERT(dde != NULL); + ASSERT3P(dde, !=, NULL); if (dde->dde_type < DDT_TYPES) { ddt_phys_t *ddp; diff --git a/module/zfs/ddt_zap.c b/module/zfs/ddt_zap.c index 8f6397a6d10..a1d296407ae 100644 --- a/module/zfs/ddt_zap.c +++ b/module/zfs/ddt_zap.c @@ -69,8 +69,8 @@ ddt_zap_lookup(objset_t *os, uint64_t object, ddt_entry_t *dde) if (error) goto out; - ASSERT(one == 1); - ASSERT(csize <= (sizeof (dde->dde_phys) + 1)); + ASSERT3U(one, ==, 1); + ASSERT3U(csize, <=, (sizeof (dde->dde_phys) + 1)); error = zap_lookup_uint64(os, object, (uint64_t *)&dde->dde_key, DDT_KEY_WORDS, 1, csize, cbuf); @@ -133,10 +133,10 @@ ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk) if ((error = zap_cursor_retrieve(&zc, &za)) == 0) { uchar_t cbuf[sizeof (dde->dde_phys) + 1]; uint64_t csize = za.za_num_integers; - ASSERT(za.za_integer_length == 1); + ASSERT3U(za.za_integer_length, ==, 1); error = zap_lookup_uint64(os, object, (uint64_t *)za.za_name, DDT_KEY_WORDS, 1, csize, cbuf); - ASSERT(error == 0); + ASSERT0(error); if (error == 0) { ddt_decompress(cbuf, dde->dde_phys, csize, sizeof (dde->dde_phys)); From 86e91c030c05f1514c99bb202d35f923133bd00b Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 30 Jun 2023 12:48:45 +1000 Subject: [PATCH 02/20] ddt: move entry compression into ddt_zap I think I can say with some confidence that anyone making a new storage type in 2023 is doing their own thing with compression, not this. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. Closes #15887 --- include/sys/ddt.h | 8 ------- module/zfs/ddt.c | 42 ----------------------------------- module/zfs/ddt_zap.c | 53 +++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 50 insertions(+), 53 deletions(-) diff --git a/include/sys/ddt.h b/include/sys/ddt.h index 6378c042c70..2f9ff84b28c 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -58,9 +58,6 @@ enum ddt_class { #define DDT_TYPE_CURRENT 0 -#define DDT_COMPRESS_BYTEORDER_MASK 0x80 -#define DDT_COMPRESS_FUNCTION_MASK 0x7f - /* * On-disk ddt entry: key (name) and physical storage (value). */ @@ -92,8 +89,6 @@ typedef struct ddt_key { #define DDK_GET_CRYPT(ddk) BF64_GET((ddk)->ddk_prop, 39, 1) #define DDK_SET_CRYPT(ddk, x) BF64_SET((ddk)->ddk_prop, 39, 1, x) -#define DDT_KEY_WORDS (sizeof (ddt_key_t) / sizeof (uint64_t)) - #define DDE_GET_NDVAS(dde) (DDK_GET_CRYPT(&dde->dde_key) \ ? SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP) @@ -220,9 +215,6 @@ extern void ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total); extern uint64_t ddt_get_dedup_dspace(spa_t *spa); extern uint64_t ddt_get_pool_dedup_ratio(spa_t *spa); -extern size_t ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len); -extern void ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len); - extern ddt_t *ddt_select(spa_t *spa, const blkptr_t *bp); extern void ddt_enter(ddt_t *ddt); extern void ddt_exit(ddt_t *ddt); diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 05b28acbcd3..221e9641388 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -35,7 +35,6 @@ #include #include #include -#include #include #include @@ -554,47 +553,6 @@ ddt_get_pool_dedup_ratio(spa_t *spa) return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize); } -size_t -ddt_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len) -{ - uchar_t *version = dst++; - int cpfunc = ZIO_COMPRESS_ZLE; - zio_compress_info_t *ci = &zio_compress_table[cpfunc]; - size_t c_len; - - ASSERT3U(d_len, >=, s_len + 1); /* no compression plus version byte */ - - c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level); - - if (c_len == s_len) { - cpfunc = ZIO_COMPRESS_OFF; - memcpy(dst, src, s_len); - } - - *version = cpfunc; - if (ZFS_HOST_BYTEORDER) - *version |= DDT_COMPRESS_BYTEORDER_MASK; - - return (c_len + 1); -} - -void -ddt_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len) -{ - uchar_t version = *src++; - int cpfunc = version & DDT_COMPRESS_FUNCTION_MASK; - zio_compress_info_t *ci = &zio_compress_table[cpfunc]; - - if (ci->ci_decompress != NULL) - (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level); - else - memcpy(dst, src, d_len); - - if (((version & DDT_COMPRESS_BYTEORDER_MASK) != 0) != - (ZFS_HOST_BYTEORDER != 0)) - byteswap_uint64_array(dst, d_len); -} - ddt_t * ddt_select(spa_t *spa, const blkptr_t *bp) { diff --git a/module/zfs/ddt_zap.c b/module/zfs/ddt_zap.c index a1d296407ae..6b9d60c5609 100644 --- a/module/zfs/ddt_zap.c +++ b/module/zfs/ddt_zap.c @@ -30,10 +30,57 @@ #include #include #include +#include static unsigned int ddt_zap_default_bs = 15; static unsigned int ddt_zap_default_ibs = 15; +#define DDT_ZAP_COMPRESS_BYTEORDER_MASK 0x80 +#define DDT_ZAP_COMPRESS_FUNCTION_MASK 0x7f + +#define DDT_KEY_WORDS (sizeof (ddt_key_t) / sizeof (uint64_t)) + +static size_t +ddt_zap_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len) +{ + uchar_t *version = dst++; + int cpfunc = ZIO_COMPRESS_ZLE; + zio_compress_info_t *ci = &zio_compress_table[cpfunc]; + size_t c_len; + + ASSERT3U(d_len, >=, s_len + 1); /* no compression plus version byte */ + + c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level); + + if (c_len == s_len) { + cpfunc = ZIO_COMPRESS_OFF; + memcpy(dst, src, s_len); + } + + *version = cpfunc; + if (ZFS_HOST_BYTEORDER) + *version |= DDT_ZAP_COMPRESS_BYTEORDER_MASK; + + return (c_len + 1); +} + +static void +ddt_zap_decompress(uchar_t *src, void *dst, size_t s_len, size_t d_len) +{ + uchar_t version = *src++; + int cpfunc = version & DDT_ZAP_COMPRESS_FUNCTION_MASK; + zio_compress_info_t *ci = &zio_compress_table[cpfunc]; + + if (ci->ci_decompress != NULL) + (void) ci->ci_decompress(src, dst, s_len, d_len, ci->ci_level); + else + memcpy(dst, src, d_len); + + if (((version & DDT_ZAP_COMPRESS_BYTEORDER_MASK) != 0) != + (ZFS_HOST_BYTEORDER != 0)) + byteswap_uint64_array(dst, d_len); +} + static int ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash) { @@ -77,7 +124,7 @@ ddt_zap_lookup(objset_t *os, uint64_t object, ddt_entry_t *dde) if (error) goto out; - ddt_decompress(cbuf, dde->dde_phys, csize, sizeof (dde->dde_phys)); + ddt_zap_decompress(cbuf, dde->dde_phys, csize, sizeof (dde->dde_phys)); out: kmem_free(cbuf, sizeof (dde->dde_phys) + 1); @@ -97,7 +144,7 @@ ddt_zap_update(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx) uchar_t cbuf[sizeof (dde->dde_phys) + 1]; uint64_t csize; - csize = ddt_compress(dde->dde_phys, cbuf, + csize = ddt_zap_compress(dde->dde_phys, cbuf, sizeof (dde->dde_phys), sizeof (cbuf)); return (zap_update_uint64(os, object, (uint64_t *)&dde->dde_key, @@ -138,7 +185,7 @@ ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk) DDT_KEY_WORDS, 1, csize, cbuf); ASSERT0(error); if (error == 0) { - ddt_decompress(cbuf, dde->dde_phys, csize, + ddt_zap_decompress(cbuf, dde->dde_phys, csize, sizeof (dde->dde_phys)); dde->dde_key = *(ddt_key_t *)za.za_name; } From 5c4cc21fd4e0d15afc3f94895f26efabe80908c6 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 18 Jan 2024 09:51:41 +1100 Subject: [PATCH 03/20] ddt_zap: standardise temp buffer allocations Always do them on the heap, and when we know how much we need, only that much. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. Closes #15887 --- module/zfs/ddt_zap.c | 41 ++++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/module/zfs/ddt_zap.c b/module/zfs/ddt_zap.c index 6b9d60c5609..0ec3eccb1c1 100644 --- a/module/zfs/ddt_zap.c +++ b/module/zfs/ddt_zap.c @@ -109,24 +109,23 @@ ddt_zap_lookup(objset_t *os, uint64_t object, ddt_entry_t *dde) uint64_t one, csize; int error; - cbuf = kmem_alloc(sizeof (dde->dde_phys) + 1, KM_SLEEP); - error = zap_length_uint64(os, object, (uint64_t *)&dde->dde_key, DDT_KEY_WORDS, &one, &csize); if (error) - goto out; + return (error); ASSERT3U(one, ==, 1); ASSERT3U(csize, <=, (sizeof (dde->dde_phys) + 1)); + cbuf = kmem_alloc(csize, KM_SLEEP); + error = zap_lookup_uint64(os, object, (uint64_t *)&dde->dde_key, DDT_KEY_WORDS, 1, csize, cbuf); - if (error) - goto out; + if (error == 0) + ddt_zap_decompress(cbuf, dde->dde_phys, csize, + sizeof (dde->dde_phys)); - ddt_zap_decompress(cbuf, dde->dde_phys, csize, sizeof (dde->dde_phys)); -out: - kmem_free(cbuf, sizeof (dde->dde_phys) + 1); + kmem_free(cbuf, csize); return (error); } @@ -141,14 +140,19 @@ ddt_zap_prefetch(objset_t *os, uint64_t object, ddt_entry_t *dde) static int ddt_zap_update(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx) { - uchar_t cbuf[sizeof (dde->dde_phys) + 1]; - uint64_t csize; + const size_t cbuf_size = sizeof (dde->dde_phys) + 1; - csize = ddt_zap_compress(dde->dde_phys, cbuf, - sizeof (dde->dde_phys), sizeof (cbuf)); + uchar_t *cbuf = kmem_alloc(cbuf_size, KM_SLEEP); - return (zap_update_uint64(os, object, (uint64_t *)&dde->dde_key, - DDT_KEY_WORDS, 1, csize, cbuf, tx)); + uint64_t csize = ddt_zap_compress(dde->dde_phys, cbuf, + sizeof (dde->dde_phys), cbuf_size); + + int error = zap_update_uint64(os, object, (uint64_t *)&dde->dde_key, + DDT_KEY_WORDS, 1, csize, cbuf, tx); + + kmem_free(cbuf, cbuf_size); + + return (error); } static int @@ -178,9 +182,13 @@ ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk) zap_cursor_init_serialized(&zc, os, object, *walk); } if ((error = zap_cursor_retrieve(&zc, &za)) == 0) { - uchar_t cbuf[sizeof (dde->dde_phys) + 1]; uint64_t csize = za.za_num_integers; + ASSERT3U(za.za_integer_length, ==, 1); + ASSERT3U(csize, <=, sizeof (dde->dde_phys) + 1); + + uchar_t *cbuf = kmem_alloc(csize, KM_SLEEP); + error = zap_lookup_uint64(os, object, (uint64_t *)za.za_name, DDT_KEY_WORDS, 1, csize, cbuf); ASSERT0(error); @@ -189,6 +197,9 @@ ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk) sizeof (dde->dde_phys)); dde->dde_key = *(ddt_key_t *)za.za_name; } + + kmem_free(cbuf, csize); + zap_cursor_advance(&zc); *walk = zap_cursor_serialize(&zc); } From 0cb1ef60ae26d855e8f4a959fff66a7693766ea3 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 9 Jun 2023 10:14:42 +1000 Subject: [PATCH 04/20] ddt: compare keys, not entries We're about to have different kinds of things that we'll compare on key, so generalise this function to support that. (It actually worked fine because of the way the casts work out, but it requires the key to be at the start of the object so the cast through ddt_entry_t works, and even then it reads strangely for anything that's not a ddt_entry_t). Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. Closes #15887 --- cmd/zdb/zdb.c | 3 ++- include/sys/ddt.h | 3 ++- module/zfs/ddt.c | 24 ++++++++++++------------ 3 files changed, 16 insertions(+), 14 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index b857e61bd04..35d71012f77 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -7121,6 +7121,7 @@ dump_block_stats(spa_t *spa) } typedef struct zdb_ddt_entry { + /* key must be first for ddt_key_compare */ ddt_key_t zdde_key; uint64_t zdde_ref_blocks; uint64_t zdde_ref_lsize; @@ -7181,7 +7182,7 @@ dump_simulated_ddt(spa_t *spa) ddt_histogram_t ddh_total = {{{0}}}; ddt_stat_t dds_total = {0}; - avl_create(&t, ddt_entry_compare, + avl_create(&t, ddt_key_compare, sizeof (zdb_ddt_entry_t), offsetof(zdb_ddt_entry_t, zdde_node)); spa_config_enter(spa, SCL_CONFIG, FTAG, RW_READER); diff --git a/include/sys/ddt.h b/include/sys/ddt.h index 2f9ff84b28c..e6bbaa10e9a 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -114,6 +114,7 @@ enum ddt_phys_type { * In-core ddt entry */ struct ddt_entry { + /* key must be first for ddt_key_compare */ ddt_key_t dde_key; ddt_phys_t dde_phys[DDT_PHYS_TYPES]; zio_t *dde_lead_zio[DDT_PHYS_TYPES]; @@ -230,7 +231,7 @@ extern boolean_t ddt_class_contains(spa_t *spa, enum ddt_class max_class, extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp); extern void ddt_repair_done(ddt_t *ddt, ddt_entry_t *dde); -extern int ddt_entry_compare(const void *x1, const void *x2); +extern int ddt_key_compare(const void *x1, const void *x2); extern void ddt_create(spa_t *spa); extern int ddt_load(spa_t *spa); diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 221e9641388..f54a51842eb 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -628,7 +628,8 @@ ddt_remove(ddt_t *ddt, ddt_entry_t *dde) ddt_entry_t * ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) { - ddt_entry_t *dde, dde_search; + ddt_key_t search; + ddt_entry_t *dde; enum ddt_type type; enum ddt_class class; avl_index_t where; @@ -636,13 +637,13 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) ASSERT(MUTEX_HELD(&ddt->ddt_lock)); - ddt_key_fill(&dde_search.dde_key, bp); + ddt_key_fill(&search, bp); - dde = avl_find(&ddt->ddt_tree, &dde_search, &where); + dde = avl_find(&ddt->ddt_tree, &search, &where); if (dde == NULL) { if (!add) return (NULL); - dde = ddt_alloc(&dde_search.dde_key); + dde = ddt_alloc(&search); avl_insert(&ddt->ddt_tree, dde, where); } @@ -713,7 +714,8 @@ ddt_prefetch(spa_t *spa, const blkptr_t *bp) } /* - * Opaque struct used for ddt_key comparison + * Key comparison. Any struct wanting to make use of this function must have + * the key as the first element. */ #define DDT_KEY_CMP_LEN (sizeof (ddt_key_t) / sizeof (uint16_t)) @@ -722,12 +724,10 @@ typedef struct ddt_key_cmp { } ddt_key_cmp_t; int -ddt_entry_compare(const void *x1, const void *x2) +ddt_key_compare(const void *x1, const void *x2) { - const ddt_entry_t *dde1 = x1; - const ddt_entry_t *dde2 = x2; - const ddt_key_cmp_t *k1 = (const ddt_key_cmp_t *)&dde1->dde_key; - const ddt_key_cmp_t *k2 = (const ddt_key_cmp_t *)&dde2->dde_key; + const ddt_key_cmp_t *k1 = (const ddt_key_cmp_t *)x1; + const ddt_key_cmp_t *k2 = (const ddt_key_cmp_t *)x2; int32_t cmp = 0; for (int i = 0; i < DDT_KEY_CMP_LEN; i++) { @@ -748,9 +748,9 @@ ddt_table_alloc(spa_t *spa, enum zio_checksum c) memset(ddt, 0, sizeof (ddt_t)); mutex_init(&ddt->ddt_lock, NULL, MUTEX_DEFAULT, NULL); - avl_create(&ddt->ddt_tree, ddt_entry_compare, + avl_create(&ddt->ddt_tree, ddt_key_compare, sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); - avl_create(&ddt->ddt_repair_tree, ddt_entry_compare, + avl_create(&ddt->ddt_repair_tree, ddt_key_compare, sizeof (ddt_entry_t), offsetof(ddt_entry_t, dde_node)); ddt->ddt_checksum = c; ddt->ddt_spa = spa; From 59738541537a859a4558bd34c0e1784fbe28bbcd Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 16 May 2023 13:30:26 +1000 Subject: [PATCH 05/20] ddt: lift dedup stats out to separate file We want to add other kinds of dedup-related objects and keep stats for them. This makes those functions easier to use from outside ddt.c. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. Closes #15887 --- include/sys/ddt.h | 1 + lib/libzpool/Makefile.am | 1 + module/Kbuild.in | 1 + module/Makefile.bsd | 1 + module/zfs/ddt.c | 171 --------------------------------- module/zfs/ddt_stats.c | 203 +++++++++++++++++++++++++++++++++++++++ 6 files changed, 207 insertions(+), 171 deletions(-) create mode 100644 module/zfs/ddt_stats.c diff --git a/include/sys/ddt.h b/include/sys/ddt.h index e6bbaa10e9a..a1535405768 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -204,6 +204,7 @@ extern void ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, extern ddt_phys_t *ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp); extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde); +extern void ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg); extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg); extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src); diff --git a/lib/libzpool/Makefile.am b/lib/libzpool/Makefile.am index 3c986a707d2..42f3404db5a 100644 --- a/lib/libzpool/Makefile.am +++ b/lib/libzpool/Makefile.am @@ -79,6 +79,7 @@ nodist_libzpool_la_SOURCES = \ module/zfs/dbuf.c \ module/zfs/dbuf_stats.c \ module/zfs/ddt.c \ + module/zfs/ddt_stats.c \ module/zfs/ddt_zap.c \ module/zfs/dmu.c \ module/zfs/dmu_diff.c \ diff --git a/module/Kbuild.in b/module/Kbuild.in index fb22bfe733c..7e08374fa2b 100644 --- a/module/Kbuild.in +++ b/module/Kbuild.in @@ -326,6 +326,7 @@ ZFS_OBJS := \ dbuf.o \ dbuf_stats.o \ ddt.o \ + ddt_stats.o \ ddt_zap.o \ dmu.o \ dmu_diff.o \ diff --git a/module/Makefile.bsd b/module/Makefile.bsd index 0c4d8bfe115..1b0110d3ae9 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -252,6 +252,7 @@ SRCS+= abd.c \ bqueue.c \ dataset_kstats.c \ ddt.c \ + ddt_stats.c \ ddt_zap.c \ dmu.c \ dmu_diff.c \ diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index f54a51842eb..43fc2aef520 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -382,177 +382,6 @@ ddt_phys_total_refcnt(const ddt_entry_t *dde) return (refcnt); } -static void -ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) -{ - spa_t *spa = ddt->ddt_spa; - ddt_phys_t *ddp = dde->dde_phys; - ddt_key_t *ddk = &dde->dde_key; - uint64_t lsize = DDK_GET_LSIZE(ddk); - uint64_t psize = DDK_GET_PSIZE(ddk); - - memset(dds, 0, sizeof (*dds)); - - for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { - uint64_t dsize = 0; - uint64_t refcnt = ddp->ddp_refcnt; - - if (ddp->ddp_phys_birth == 0) - continue; - - for (int d = 0; d < DDE_GET_NDVAS(dde); d++) - dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]); - - dds->dds_blocks += 1; - dds->dds_lsize += lsize; - dds->dds_psize += psize; - dds->dds_dsize += dsize; - - dds->dds_ref_blocks += refcnt; - dds->dds_ref_lsize += lsize * refcnt; - dds->dds_ref_psize += psize * refcnt; - dds->dds_ref_dsize += dsize * refcnt; - } -} - -void -ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg) -{ - const uint64_t *s = (const uint64_t *)src; - uint64_t *d = (uint64_t *)dst; - uint64_t *d_end = (uint64_t *)(dst + 1); - - ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */ - - for (int i = 0; i < d_end - d; i++) - d[i] += (s[i] ^ neg) - neg; -} - -static void -ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg) -{ - ddt_stat_t dds; - ddt_histogram_t *ddh; - int bucket; - - ddt_stat_generate(ddt, dde, &dds); - - bucket = highbit64(dds.dds_ref_blocks) - 1; - ASSERT3U(bucket, >=, 0); - - ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; - - ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg); -} - -void -ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src) -{ - for (int h = 0; h < 64; h++) - ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0); -} - -void -ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh) -{ - memset(dds, 0, sizeof (*dds)); - - for (int h = 0; h < 64; h++) - ddt_stat_add(dds, &ddh->ddh_stat[h], 0); -} - -boolean_t -ddt_histogram_empty(const ddt_histogram_t *ddh) -{ - const uint64_t *s = (const uint64_t *)ddh; - const uint64_t *s_end = (const uint64_t *)(ddh + 1); - - while (s < s_end) - if (*s++ != 0) - return (B_FALSE); - - return (B_TRUE); -} - -void -ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total) -{ - /* Sum the statistics we cached in ddt_object_sync(). */ - for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { - ddt_t *ddt = spa->spa_ddt[c]; - for (enum ddt_type type = 0; type < DDT_TYPES; type++) { - for (enum ddt_class class = 0; class < DDT_CLASSES; - class++) { - ddt_object_t *ddo = - &ddt->ddt_object_stats[type][class]; - ddo_total->ddo_count += ddo->ddo_count; - ddo_total->ddo_dspace += ddo->ddo_dspace; - ddo_total->ddo_mspace += ddo->ddo_mspace; - } - } - } - - /* ... and compute the averages. */ - if (ddo_total->ddo_count != 0) { - ddo_total->ddo_dspace /= ddo_total->ddo_count; - ddo_total->ddo_mspace /= ddo_total->ddo_count; - } -} - -void -ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh) -{ - for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { - ddt_t *ddt = spa->spa_ddt[c]; - for (enum ddt_type type = 0; type < DDT_TYPES && ddt; type++) { - for (enum ddt_class class = 0; class < DDT_CLASSES; - class++) { - ddt_histogram_add(ddh, - &ddt->ddt_histogram_cache[type][class]); - } - } - } -} - -void -ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total) -{ - ddt_histogram_t *ddh_total; - - ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); - ddt_get_dedup_histogram(spa, ddh_total); - ddt_histogram_stat(dds_total, ddh_total); - kmem_free(ddh_total, sizeof (ddt_histogram_t)); -} - -uint64_t -ddt_get_dedup_dspace(spa_t *spa) -{ - ddt_stat_t dds_total; - - if (spa->spa_dedup_dspace != ~0ULL) - return (spa->spa_dedup_dspace); - - memset(&dds_total, 0, sizeof (ddt_stat_t)); - - /* Calculate and cache the stats */ - ddt_get_dedup_stats(spa, &dds_total); - spa->spa_dedup_dspace = dds_total.dds_ref_dsize - dds_total.dds_dsize; - return (spa->spa_dedup_dspace); -} - -uint64_t -ddt_get_pool_dedup_ratio(spa_t *spa) -{ - ddt_stat_t dds_total = { 0 }; - - ddt_get_dedup_stats(spa, &dds_total); - if (dds_total.dds_dsize == 0) - return (100); - - return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize); -} - ddt_t * ddt_select(spa_t *spa, const blkptr_t *bp) { diff --git a/module/zfs/ddt_stats.c b/module/zfs/ddt_stats.c new file mode 100644 index 00000000000..05d0c22f92a --- /dev/null +++ b/module/zfs/ddt_stats.c @@ -0,0 +1,203 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ + +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2012, 2016 by Delphix. All rights reserved. + * Copyright (c) 2022 by Pawel Jakub Dawidek + * Copyright (c) 2023, Klara Inc. + */ + +#include +#include +#include +#include + +static void +ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) +{ + spa_t *spa = ddt->ddt_spa; + ddt_phys_t *ddp = dde->dde_phys; + ddt_key_t *ddk = &dde->dde_key; + uint64_t lsize = DDK_GET_LSIZE(ddk); + uint64_t psize = DDK_GET_PSIZE(ddk); + + memset(dds, 0, sizeof (*dds)); + + for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { + uint64_t dsize = 0; + uint64_t refcnt = ddp->ddp_refcnt; + + if (ddp->ddp_phys_birth == 0) + continue; + + for (int d = 0; d < DDE_GET_NDVAS(dde); d++) + dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]); + + dds->dds_blocks += 1; + dds->dds_lsize += lsize; + dds->dds_psize += psize; + dds->dds_dsize += dsize; + + dds->dds_ref_blocks += refcnt; + dds->dds_ref_lsize += lsize * refcnt; + dds->dds_ref_psize += psize * refcnt; + dds->dds_ref_dsize += dsize * refcnt; + } +} + +void +ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg) +{ + const uint64_t *s = (const uint64_t *)src; + uint64_t *d = (uint64_t *)dst; + uint64_t *d_end = (uint64_t *)(dst + 1); + + ASSERT(neg == 0 || neg == -1ULL); /* add or subtract */ + + for (int i = 0; i < d_end - d; i++) + d[i] += (s[i] ^ neg) - neg; +} + +void +ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg) +{ + ddt_stat_t dds; + ddt_histogram_t *ddh; + int bucket; + + ddt_stat_generate(ddt, dde, &dds); + + bucket = highbit64(dds.dds_ref_blocks) - 1; + ASSERT3U(bucket, >=, 0); + + ddh = &ddt->ddt_histogram[dde->dde_type][dde->dde_class]; + + ddt_stat_add(&ddh->ddh_stat[bucket], &dds, neg); +} + +void +ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src) +{ + for (int h = 0; h < 64; h++) + ddt_stat_add(&dst->ddh_stat[h], &src->ddh_stat[h], 0); +} + +void +ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh) +{ + memset(dds, 0, sizeof (*dds)); + + for (int h = 0; h < 64; h++) + ddt_stat_add(dds, &ddh->ddh_stat[h], 0); +} + +boolean_t +ddt_histogram_empty(const ddt_histogram_t *ddh) +{ + const uint64_t *s = (const uint64_t *)ddh; + const uint64_t *s_end = (const uint64_t *)(ddh + 1); + + while (s < s_end) + if (*s++ != 0) + return (B_FALSE); + + return (B_TRUE); +} + +void +ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total) +{ + /* Sum the statistics we cached in ddt_object_sync(). */ + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; + class++) { + ddt_object_t *ddo = + &ddt->ddt_object_stats[type][class]; + ddo_total->ddo_count += ddo->ddo_count; + ddo_total->ddo_dspace += ddo->ddo_dspace; + ddo_total->ddo_mspace += ddo->ddo_mspace; + } + } + } + + /* ... and compute the averages. */ + if (ddo_total->ddo_count != 0) { + ddo_total->ddo_dspace /= ddo_total->ddo_count; + ddo_total->ddo_mspace /= ddo_total->ddo_count; + } +} + +void +ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh) +{ + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + ddt_t *ddt = spa->spa_ddt[c]; + for (enum ddt_type type = 0; type < DDT_TYPES && ddt; type++) { + for (enum ddt_class class = 0; class < DDT_CLASSES; + class++) { + ddt_histogram_add(ddh, + &ddt->ddt_histogram_cache[type][class]); + } + } + } +} + +void +ddt_get_dedup_stats(spa_t *spa, ddt_stat_t *dds_total) +{ + ddt_histogram_t *ddh_total; + + ddh_total = kmem_zalloc(sizeof (ddt_histogram_t), KM_SLEEP); + ddt_get_dedup_histogram(spa, ddh_total); + ddt_histogram_stat(dds_total, ddh_total); + kmem_free(ddh_total, sizeof (ddt_histogram_t)); +} + +uint64_t +ddt_get_dedup_dspace(spa_t *spa) +{ + ddt_stat_t dds_total; + + if (spa->spa_dedup_dspace != ~0ULL) + return (spa->spa_dedup_dspace); + + memset(&dds_total, 0, sizeof (ddt_stat_t)); + + /* Calculate and cache the stats */ + ddt_get_dedup_stats(spa, &dds_total); + spa->spa_dedup_dspace = dds_total.dds_ref_dsize - dds_total.dds_dsize; + return (spa->spa_dedup_dspace); +} + +uint64_t +ddt_get_pool_dedup_ratio(spa_t *spa) +{ + ddt_stat_t dds_total = { 0 }; + + ddt_get_dedup_stats(spa, &dds_total); + if (dds_total.dds_dsize == 0) + return (100); + + return (dds_total.dds_ref_dsize * 100 / dds_total.dds_dsize); +} From 909006049f97723140425ea3257300cae45e2866 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 3 Jul 2023 15:25:06 +1000 Subject: [PATCH 06/20] ddt: remove DDE_GET_NDVAS macro It was a weird and confusing name, because it wasn't actually returning the number of DVAs in the entry (as in, in the value/phys part) but the maximum number of possible DVAs in a BP generated from the entry, based on the encrypt bit in the key. This is unlike the similarly named BP_GET_NDVAS, which really does return the number of DVAs. Since its only used in this one place, and for a specific purpose, it seemed more sensible to just write it in-place and remove the name. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. Closes #15887 --- include/sys/ddt.h | 3 --- module/zfs/ddt_stats.c | 4 +++- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/include/sys/ddt.h b/include/sys/ddt.h index a1535405768..39cffeb366a 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -89,9 +89,6 @@ typedef struct ddt_key { #define DDK_GET_CRYPT(ddk) BF64_GET((ddk)->ddk_prop, 39, 1) #define DDK_SET_CRYPT(ddk, x) BF64_SET((ddk)->ddk_prop, 39, 1, x) -#define DDE_GET_NDVAS(dde) (DDK_GET_CRYPT(&dde->dde_key) \ - ? SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP) - typedef struct ddt_phys { dva_t ddp_dva[SPA_DVAS_PER_BP]; uint64_t ddp_refcnt; diff --git a/module/zfs/ddt_stats.c b/module/zfs/ddt_stats.c index 05d0c22f92a..1f1a1188f97 100644 --- a/module/zfs/ddt_stats.c +++ b/module/zfs/ddt_stats.c @@ -49,7 +49,9 @@ ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) if (ddp->ddp_phys_birth == 0) continue; - for (int d = 0; d < DDE_GET_NDVAS(dde); d++) + int ndvas = DDK_GET_CRYPT(&dde->dde_key) ? + SPA_DVAS_PER_BP - 1 : SPA_DVAS_PER_BP; + for (int d = 0; d < ndvas; d++) dsize += dva_get_dsize_sync(spa, &ddp->ddp_dva[d]); dds->dds_blocks += 1; From 8e414fcdf40b52442d8fa1faf07c70d291aa8ac2 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Fri, 30 Jun 2023 13:35:18 +1000 Subject: [PATCH 07/20] ddt: split internal DDT API into separate header Just to make it easier to know which bits to pay attention to. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. Closes #15887 --- cmd/zdb/zdb.c | 1 + include/Makefile.am | 1 + include/sys/ddt.h | 45 ---------------------- include/sys/ddt_impl.h | 86 ++++++++++++++++++++++++++++++++++++++++++ module/zfs/ddt.c | 17 +++++---- module/zfs/ddt_stats.c | 1 + module/zfs/ddt_zap.c | 1 + 7 files changed, 99 insertions(+), 53 deletions(-) create mode 100644 include/sys/ddt_impl.h diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 35d71012f77..63fb41df82e 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -74,6 +74,7 @@ #include #include #include +#include #include #include #include diff --git a/include/Makefile.am b/include/Makefile.am index cb28a2d6c96..fa725c2e7a5 100644 --- a/include/Makefile.am +++ b/include/Makefile.am @@ -37,6 +37,7 @@ COMMON_H = \ sys/dataset_kstats.h \ sys/dbuf.h \ sys/ddt.h \ + sys/ddt_impl.h \ sys/dmu.h \ sys/dmu_impl.h \ sys/dmu_objset.h \ diff --git a/include/sys/ddt.h b/include/sys/ddt.h index 39cffeb366a..25b8e7c9d37 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -152,57 +152,16 @@ typedef struct ddt_bookmark { uint64_t ddb_cursor; } ddt_bookmark_t; -/* - * Ops vector to access a specific DDT object type. - */ -typedef struct ddt_ops { - char ddt_op_name[32]; - int (*ddt_op_create)(objset_t *os, uint64_t *object, dmu_tx_t *tx, - boolean_t prehash); - int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx); - int (*ddt_op_lookup)(objset_t *os, uint64_t object, ddt_entry_t *dde); - void (*ddt_op_prefetch)(objset_t *os, uint64_t object, - ddt_entry_t *dde); - int (*ddt_op_update)(objset_t *os, uint64_t object, ddt_entry_t *dde, - dmu_tx_t *tx); - int (*ddt_op_remove)(objset_t *os, uint64_t object, ddt_entry_t *dde, - dmu_tx_t *tx); - int (*ddt_op_walk)(objset_t *os, uint64_t object, ddt_entry_t *dde, - uint64_t *walk); - int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count); -} ddt_ops_t; - -#define DDT_NAMELEN 107 - -extern void ddt_object_name(ddt_t *ddt, enum ddt_type type, - enum ddt_class clazz, char *name); -extern int ddt_object_walk(ddt_t *ddt, enum ddt_type type, - enum ddt_class clazz, uint64_t *walk, ddt_entry_t *dde); -extern int ddt_object_count(ddt_t *ddt, enum ddt_type type, - enum ddt_class clazz, uint64_t *count); -extern int ddt_object_info(ddt_t *ddt, enum ddt_type type, - enum ddt_class clazz, dmu_object_info_t *); -extern boolean_t ddt_object_exists(ddt_t *ddt, enum ddt_type type, - enum ddt_class clazz); - extern void ddt_bp_fill(const ddt_phys_t *ddp, blkptr_t *bp, uint64_t txg); extern void ddt_bp_create(enum zio_checksum checksum, const ddt_key_t *ddk, const ddt_phys_t *ddp, blkptr_t *bp); -extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp); - extern void ddt_phys_fill(ddt_phys_t *ddp, const blkptr_t *bp); extern void ddt_phys_clear(ddt_phys_t *ddp); extern void ddt_phys_addref(ddt_phys_t *ddp); extern void ddt_phys_decref(ddt_phys_t *ddp); -extern void ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, - uint64_t txg); extern ddt_phys_t *ddt_phys_select(const ddt_entry_t *dde, const blkptr_t *bp); -extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde); - -extern void ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg); -extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg); extern void ddt_histogram_add(ddt_histogram_t *dst, const ddt_histogram_t *src); extern void ddt_histogram_stat(ddt_stat_t *dds, const ddt_histogram_t *ddh); @@ -236,13 +195,9 @@ extern int ddt_load(spa_t *spa); extern void ddt_unload(spa_t *spa); extern void ddt_sync(spa_t *spa, uint64_t txg); extern int ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde); -extern int ddt_object_update(ddt_t *ddt, enum ddt_type type, - enum ddt_class clazz, ddt_entry_t *dde, dmu_tx_t *tx); extern boolean_t ddt_addref(spa_t *spa, const blkptr_t *bp); -extern const ddt_ops_t ddt_zap_ops; - #ifdef __cplusplus } #endif diff --git a/include/sys/ddt_impl.h b/include/sys/ddt_impl.h new file mode 100644 index 00000000000..c43ced5a7f3 --- /dev/null +++ b/include/sys/ddt_impl.h @@ -0,0 +1,86 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or https://opensource.org/licenses/CDDL-1.0. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. + * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright (c) 2023, Klara Inc. + */ + +#ifndef _SYS_DDT_IMPL_H +#define _SYS_DDT_IMPL_H + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/* + * Ops vector to access a specific DDT object type. + */ +typedef struct ddt_ops { + char ddt_op_name[32]; + int (*ddt_op_create)(objset_t *os, uint64_t *object, dmu_tx_t *tx, + boolean_t prehash); + int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx); + int (*ddt_op_lookup)(objset_t *os, uint64_t object, ddt_entry_t *dde); + void (*ddt_op_prefetch)(objset_t *os, uint64_t object, + ddt_entry_t *dde); + int (*ddt_op_update)(objset_t *os, uint64_t object, ddt_entry_t *dde, + dmu_tx_t *tx); + int (*ddt_op_remove)(objset_t *os, uint64_t object, ddt_entry_t *dde, + dmu_tx_t *tx); + int (*ddt_op_walk)(objset_t *os, uint64_t object, ddt_entry_t *dde, + uint64_t *walk); + int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count); +} ddt_ops_t; + +extern const ddt_ops_t ddt_zap_ops; + +extern void ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg); + +/* + * These are only exposed so that zdb can access them. Try not to use them + * outside of the DDT implementation proper, and if you do, consider moving + * them up. + */ +#define DDT_NAMELEN 107 + +extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde); + +extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp); + +extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg); + +extern void ddt_object_name(ddt_t *ddt, enum ddt_type type, + enum ddt_class clazz, char *name); +extern int ddt_object_walk(ddt_t *ddt, enum ddt_type type, + enum ddt_class clazz, uint64_t *walk, ddt_entry_t *dde); +extern int ddt_object_count(ddt_t *ddt, enum ddt_type type, + enum ddt_class clazz, uint64_t *count); +extern int ddt_object_info(ddt_t *ddt, enum ddt_type type, + enum ddt_class clazz, dmu_object_info_t *); + +#ifdef __cplusplus +} +#endif + +#endif /* _SYS_DDT_H */ diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 43fc2aef520..321437c2234 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -171,6 +172,12 @@ ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class, ddo->ddo_mspace = doi.doi_fill_count * doi.doi_data_block_size; } +static boolean_t +ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class) +{ + return (!!ddt->ddt_object[type][class]); +} + static int ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class, ddt_entry_t *dde) @@ -193,7 +200,7 @@ ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class, ddt->ddt_object[type][class], dde); } -int +static int ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class, ddt_entry_t *dde, dmu_tx_t *tx) { @@ -244,12 +251,6 @@ ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class, doi)); } -boolean_t -ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class) -{ - return (!!ddt->ddt_object[type][class]); -} - void ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class, char *name) @@ -341,7 +342,7 @@ ddt_phys_decref(ddt_phys_t *ddp) } } -void +static void ddt_phys_free(ddt_t *ddt, ddt_key_t *ddk, ddt_phys_t *ddp, uint64_t txg) { blkptr_t blk; diff --git a/module/zfs/ddt_stats.c b/module/zfs/ddt_stats.c index 1f1a1188f97..e1d36773085 100644 --- a/module/zfs/ddt_stats.c +++ b/module/zfs/ddt_stats.c @@ -30,6 +30,7 @@ #include #include #include +#include static void ddt_stat_generate(ddt_t *ddt, ddt_entry_t *dde, ddt_stat_t *dds) diff --git a/module/zfs/ddt_zap.c b/module/zfs/ddt_zap.c index 0ec3eccb1c1..56312881fdd 100644 --- a/module/zfs/ddt_zap.c +++ b/module/zfs/ddt_zap.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include From c8f694fe39ea7c1a99ced14064b909b515c53843 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 3 Jul 2023 12:32:53 +1000 Subject: [PATCH 08/20] ddt: typedef ddt_type and ddt_class Mostly for consistency, so the reader is less likely to wonder why these things look different. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. Closes #15887 --- cmd/zdb/zdb.c | 6 ++-- include/sys/ddt.h | 23 ++++++++++------ include/sys/ddt_impl.h | 16 +++++------ module/zfs/ddt.c | 62 +++++++++++++++++++++--------------------- module/zfs/ddt_stats.c | 8 +++--- module/zfs/dsl_scan.c | 4 +-- 6 files changed, 63 insertions(+), 56 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 63fb41df82e..7f17de03d3c 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -1905,7 +1905,7 @@ dump_dedup_ratio(const ddt_stat_t *dds) } static void -dump_ddt(ddt_t *ddt, enum ddt_type type, enum ddt_class class) +dump_ddt(ddt_t *ddt, ddt_type_t type, ddt_class_t class) { char name[DDT_NAMELEN]; ddt_entry_t dde; @@ -1965,8 +1965,8 @@ dump_all_ddts(spa_t *spa) for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; - for (enum ddt_type type = 0; type < DDT_TYPES; type++) { - for (enum ddt_class class = 0; class < DDT_CLASSES; + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { dump_ddt(ddt, type, class); } diff --git a/include/sys/ddt.h b/include/sys/ddt.h index 25b8e7c9d37..a11324209de 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -41,22 +41,29 @@ struct abd; /* * On-disk DDT formats, in the desired search order (newest version first). */ -enum ddt_type { +typedef enum { DDT_TYPE_ZAP = 0, DDT_TYPES -}; +} ddt_type_t; + +_Static_assert(DDT_TYPES <= UINT8_MAX, + "ddt_type_t must fit in a uint8_t"); + +/* New and updated entries recieve this type, see ddt_sync_entry() */ +#define DDT_TYPE_DEFAULT (DDT_TYPE_ZAP) /* * DDT classes, in the desired search order (highest replication level first). */ -enum ddt_class { +typedef enum { DDT_CLASS_DITTO = 0, DDT_CLASS_DUPLICATE, DDT_CLASS_UNIQUE, DDT_CLASSES -}; +} ddt_class_t; -#define DDT_TYPE_CURRENT 0 +_Static_assert(DDT_CLASSES < UINT8_MAX, + "ddt_class_t must fit in a uint8_t"); /* * On-disk ddt entry: key (name) and physical storage (value). @@ -116,8 +123,8 @@ struct ddt_entry { ddt_phys_t dde_phys[DDT_PHYS_TYPES]; zio_t *dde_lead_zio[DDT_PHYS_TYPES]; struct abd *dde_repair_abd; - enum ddt_type dde_type; - enum ddt_class dde_class; + ddt_type_t dde_type; + ddt_class_t dde_class; uint8_t dde_loading; uint8_t dde_loaded; kcondvar_t dde_cv; @@ -182,7 +189,7 @@ extern ddt_entry_t *ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add); extern void ddt_prefetch(spa_t *spa, const blkptr_t *bp); extern void ddt_remove(ddt_t *ddt, ddt_entry_t *dde); -extern boolean_t ddt_class_contains(spa_t *spa, enum ddt_class max_class, +extern boolean_t ddt_class_contains(spa_t *spa, ddt_class_t max_class, const blkptr_t *bp); extern ddt_entry_t *ddt_repair_start(ddt_t *ddt, const blkptr_t *bp); diff --git a/include/sys/ddt_impl.h b/include/sys/ddt_impl.h index c43ced5a7f3..9960f966684 100644 --- a/include/sys/ddt_impl.h +++ b/include/sys/ddt_impl.h @@ -70,14 +70,14 @@ extern void ddt_key_fill(ddt_key_t *ddk, const blkptr_t *bp); extern void ddt_stat_add(ddt_stat_t *dst, const ddt_stat_t *src, uint64_t neg); -extern void ddt_object_name(ddt_t *ddt, enum ddt_type type, - enum ddt_class clazz, char *name); -extern int ddt_object_walk(ddt_t *ddt, enum ddt_type type, - enum ddt_class clazz, uint64_t *walk, ddt_entry_t *dde); -extern int ddt_object_count(ddt_t *ddt, enum ddt_type type, - enum ddt_class clazz, uint64_t *count); -extern int ddt_object_info(ddt_t *ddt, enum ddt_type type, - enum ddt_class clazz, dmu_object_info_t *); +extern void ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz, + char *name); +extern int ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz, + uint64_t *walk, ddt_entry_t *dde); +extern int ddt_object_count(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz, + uint64_t *count); +extern int ddt_object_info(ddt_t *ddt, ddt_type_t type, ddt_class_t clazz, + dmu_object_info_t *); #ifdef __cplusplus } diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 321437c2234..1b7063998c2 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -58,7 +58,7 @@ static const char *const ddt_class_name[DDT_CLASSES] = { }; static void -ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class, +ddt_object_create(ddt_t *ddt, ddt_type_t type, ddt_class_t class, dmu_tx_t *tx) { spa_t *spa = ddt->ddt_spa; @@ -83,7 +83,7 @@ ddt_object_create(ddt_t *ddt, enum ddt_type type, enum ddt_class class, } static void -ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class, +ddt_object_destroy(ddt_t *ddt, ddt_type_t type, ddt_class_t class, dmu_tx_t *tx) { spa_t *spa = ddt->ddt_spa; @@ -107,7 +107,7 @@ ddt_object_destroy(ddt_t *ddt, enum ddt_type type, enum ddt_class class, } static int -ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class) +ddt_object_load(ddt_t *ddt, ddt_type_t type, ddt_class_t class) { ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; dmu_object_info_t doi; @@ -147,7 +147,7 @@ ddt_object_load(ddt_t *ddt, enum ddt_type type, enum ddt_class class) } static void -ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class, +ddt_object_sync(ddt_t *ddt, ddt_type_t type, ddt_class_t class, dmu_tx_t *tx) { ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; @@ -173,13 +173,13 @@ ddt_object_sync(ddt_t *ddt, enum ddt_type type, enum ddt_class class, } static boolean_t -ddt_object_exists(ddt_t *ddt, enum ddt_type type, enum ddt_class class) +ddt_object_exists(ddt_t *ddt, ddt_type_t type, ddt_class_t class) { return (!!ddt->ddt_object[type][class]); } static int -ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class, +ddt_object_lookup(ddt_t *ddt, ddt_type_t type, ddt_class_t class, ddt_entry_t *dde) { if (!ddt_object_exists(ddt, type, class)) @@ -190,7 +190,7 @@ ddt_object_lookup(ddt_t *ddt, enum ddt_type type, enum ddt_class class, } static void -ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class, +ddt_object_prefetch(ddt_t *ddt, ddt_type_t type, ddt_class_t class, ddt_entry_t *dde) { if (!ddt_object_exists(ddt, type, class)) @@ -201,7 +201,7 @@ ddt_object_prefetch(ddt_t *ddt, enum ddt_type type, enum ddt_class class, } static int -ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class, +ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class, ddt_entry_t *dde, dmu_tx_t *tx) { ASSERT(ddt_object_exists(ddt, type, class)); @@ -211,7 +211,7 @@ ddt_object_update(ddt_t *ddt, enum ddt_type type, enum ddt_class class, } static int -ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class, +ddt_object_remove(ddt_t *ddt, ddt_type_t type, ddt_class_t class, ddt_entry_t *dde, dmu_tx_t *tx) { ASSERT(ddt_object_exists(ddt, type, class)); @@ -221,7 +221,7 @@ ddt_object_remove(ddt_t *ddt, enum ddt_type type, enum ddt_class class, } int -ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class, +ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t class, uint64_t *walk, ddt_entry_t *dde) { ASSERT(ddt_object_exists(ddt, type, class)); @@ -231,7 +231,7 @@ ddt_object_walk(ddt_t *ddt, enum ddt_type type, enum ddt_class class, } int -ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class, +ddt_object_count(ddt_t *ddt, ddt_type_t type, ddt_class_t class, uint64_t *count) { ASSERT(ddt_object_exists(ddt, type, class)); @@ -241,7 +241,7 @@ ddt_object_count(ddt_t *ddt, enum ddt_type type, enum ddt_class class, } int -ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class, +ddt_object_info(ddt_t *ddt, ddt_type_t type, ddt_class_t class, dmu_object_info_t *doi) { if (!ddt_object_exists(ddt, type, class)) @@ -252,7 +252,7 @@ ddt_object_info(ddt_t *ddt, enum ddt_type type, enum ddt_class class, } void -ddt_object_name(ddt_t *ddt, enum ddt_type type, enum ddt_class class, +ddt_object_name(ddt_t *ddt, ddt_type_t type, ddt_class_t class, char *name) { (void) snprintf(name, DDT_NAMELEN, DMU_POOL_DDT, @@ -460,8 +460,8 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) { ddt_key_t search; ddt_entry_t *dde; - enum ddt_type type; - enum ddt_class class; + ddt_type_t type; + ddt_class_t class; avl_index_t where; int error; @@ -536,8 +536,8 @@ ddt_prefetch(spa_t *spa, const blkptr_t *bp) ddt = ddt_select(spa, bp); ddt_key_fill(&dde.dde_key, bp); - for (enum ddt_type type = 0; type < DDT_TYPES; type++) { - for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { ddt_object_prefetch(ddt, type, class, &dde); } } @@ -625,8 +625,8 @@ ddt_load(spa_t *spa) for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; - for (enum ddt_type type = 0; type < DDT_TYPES; type++) { - for (enum ddt_class class = 0; class < DDT_CLASSES; + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { error = ddt_object_load(ddt, type, class); if (error != 0 && error != ENOENT) @@ -657,7 +657,7 @@ ddt_unload(spa_t *spa) } boolean_t -ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp) +ddt_class_contains(spa_t *spa, ddt_class_t max_class, const blkptr_t *bp) { ddt_t *ddt; ddt_entry_t *dde; @@ -673,8 +673,8 @@ ddt_class_contains(spa_t *spa, enum ddt_class max_class, const blkptr_t *bp) ddt_key_fill(&(dde->dde_key), bp); - for (enum ddt_type type = 0; type < DDT_TYPES; type++) { - for (enum ddt_class class = 0; class <= max_class; class++) { + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class <= max_class; class++) { if (ddt_object_lookup(ddt, type, class, dde) == 0) { kmem_cache_free(ddt_entry_cache, dde); return (B_TRUE); @@ -696,8 +696,8 @@ ddt_repair_start(ddt_t *ddt, const blkptr_t *bp) dde = ddt_alloc(&ddk); - for (enum ddt_type type = 0; type < DDT_TYPES; type++) { - for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { /* * We can only do repair if there are multiple copies * of the block. For anything in the UNIQUE class, @@ -796,10 +796,10 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) dsl_pool_t *dp = ddt->ddt_spa->spa_dsl_pool; ddt_phys_t *ddp = dde->dde_phys; ddt_key_t *ddk = &dde->dde_key; - enum ddt_type otype = dde->dde_type; - enum ddt_type ntype = DDT_TYPE_CURRENT; - enum ddt_class oclass = dde->dde_class; - enum ddt_class nclass; + ddt_type_t otype = dde->dde_type; + ddt_type_t ntype = DDT_TYPE_DEFAULT; + ddt_class_t oclass = dde->dde_class; + ddt_class_t nclass; uint64_t total_refcnt = 0; ASSERT(dde->dde_loaded); @@ -883,9 +883,9 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) ddt_free(dde); } - for (enum ddt_type type = 0; type < DDT_TYPES; type++) { + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { uint64_t add, count = 0; - for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { if (ddt_object_exists(ddt, type, class)) { ddt_object_sync(ddt, type, class, tx); VERIFY0(ddt_object_count(ddt, type, class, @@ -893,7 +893,7 @@ ddt_sync_table(ddt_t *ddt, dmu_tx_t *tx, uint64_t txg) count += add; } } - for (enum ddt_class class = 0; class < DDT_CLASSES; class++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { if (count == 0 && ddt_object_exists(ddt, type, class)) ddt_object_destroy(ddt, type, class, tx); } diff --git a/module/zfs/ddt_stats.c b/module/zfs/ddt_stats.c index e1d36773085..4ef6c3a07e3 100644 --- a/module/zfs/ddt_stats.c +++ b/module/zfs/ddt_stats.c @@ -132,8 +132,8 @@ ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total) /* Sum the statistics we cached in ddt_object_sync(). */ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; - for (enum ddt_type type = 0; type < DDT_TYPES; type++) { - for (enum ddt_class class = 0; class < DDT_CLASSES; + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { ddt_object_t *ddo = &ddt->ddt_object_stats[type][class]; @@ -156,8 +156,8 @@ ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh) { for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; - for (enum ddt_type type = 0; type < DDT_TYPES && ddt; type++) { - for (enum ddt_class class = 0; class < DDT_CLASSES; + for (ddt_type_t type = 0; type < DDT_TYPES && ddt; type++) { + for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { ddt_histogram_add(ddh, &ddt->ddt_histogram_cache[type][class]); diff --git a/module/zfs/dsl_scan.c b/module/zfs/dsl_scan.c index d04149f560a..060a5cc36d7 100644 --- a/module/zfs/dsl_scan.c +++ b/module/zfs/dsl_scan.c @@ -203,7 +203,7 @@ static uint_t zfs_scan_checkpoint_intval = 7200; /* in seconds */ int zfs_scan_suspend_progress = 0; /* set to prevent scans from progressing */ static int zfs_no_scrub_io = B_FALSE; /* set to disable scrub i/o */ static int zfs_no_scrub_prefetch = B_FALSE; /* set to disable scrub prefetch */ -static const enum ddt_class zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; +static const ddt_class_t zfs_scrub_ddt_class_max = DDT_CLASS_DUPLICATE; /* max number of blocks to free in a single TXG */ static uint64_t zfs_async_block_max_blocks = UINT64_MAX; /* max number of dedup blocks to free in a single TXG */ @@ -2962,7 +2962,7 @@ dsl_scan_ddt_entry(dsl_scan_t *scn, enum zio_checksum checksum, * If there are N references to a deduped block, we don't want to scrub it * N times -- ideally, we should scrub it exactly once. * - * We leverage the fact that the dde's replication class (enum ddt_class) + * We leverage the fact that the dde's replication class (ddt_class_t) * is ordered from highest replication class (DDT_CLASS_DITTO) to lowest * (DDT_CLASS_UNIQUE) so that we may walk the DDT in that order. * From 3bad70040a7b8437ac587d249895e723594c16cf Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 3 Jul 2023 12:43:37 +1000 Subject: [PATCH 09/20] ddt: remove struct names and forward declarations Things get confused when there's more than one name for a thing. Note that we don't do this for ddt_object_t, ddt_histogram_t and ddt_stat_t because they're part of the public ZFS interface. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. Closes #15887 --- include/sys/ddt.h | 14 +++++++------- include/sys/ddt_impl.h | 2 +- include/sys/spa.h | 2 -- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/include/sys/ddt.h b/include/sys/ddt.h index a11324209de..013692c96dc 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -68,7 +68,7 @@ _Static_assert(DDT_CLASSES < UINT8_MAX, /* * On-disk ddt entry: key (name) and physical storage (value). */ -typedef struct ddt_key { +typedef struct { zio_cksum_t ddk_cksum; /* 256-bit block checksum */ /* * Encoded with logical & physical size, encryption, and compression, @@ -96,7 +96,7 @@ typedef struct ddt_key { #define DDK_GET_CRYPT(ddk) BF64_GET((ddk)->ddk_prop, 39, 1) #define DDK_SET_CRYPT(ddk, x) BF64_SET((ddk)->ddk_prop, 39, 1, x) -typedef struct ddt_phys { +typedef struct { dva_t ddp_dva[SPA_DVAS_PER_BP]; uint64_t ddp_refcnt; uint64_t ddp_phys_birth; @@ -117,7 +117,7 @@ enum ddt_phys_type { /* * In-core ddt entry */ -struct ddt_entry { +typedef struct { /* key must be first for ddt_key_compare */ ddt_key_t dde_key; ddt_phys_t dde_phys[DDT_PHYS_TYPES]; @@ -129,12 +129,12 @@ struct ddt_entry { uint8_t dde_loaded; kcondvar_t dde_cv; avl_node_t dde_node; -}; +} ddt_entry_t; /* * In-core ddt */ -struct ddt { +typedef struct { kmutex_t ddt_lock; avl_tree_t ddt_tree; avl_tree_t ddt_repair_tree; @@ -147,12 +147,12 @@ struct ddt { ddt_histogram_t ddt_histogram_cache[DDT_TYPES][DDT_CLASSES]; ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES]; avl_node_t ddt_node; -}; +} ddt_t; /* * In-core and on-disk bookmark for DDT walks */ -typedef struct ddt_bookmark { +typedef struct { uint64_t ddb_class; uint64_t ddb_type; uint64_t ddb_checksum; diff --git a/include/sys/ddt_impl.h b/include/sys/ddt_impl.h index 9960f966684..09937c71461 100644 --- a/include/sys/ddt_impl.h +++ b/include/sys/ddt_impl.h @@ -36,7 +36,7 @@ extern "C" { /* * Ops vector to access a specific DDT object type. */ -typedef struct ddt_ops { +typedef struct { char ddt_op_name[32]; int (*ddt_op_create)(objset_t *os, uint64_t *object, dmu_tx_t *tx, boolean_t prehash); diff --git a/include/sys/spa.h b/include/sys/spa.h index 08099cd4fa2..cada3c84103 100644 --- a/include/sys/spa.h +++ b/include/sys/spa.h @@ -62,8 +62,6 @@ typedef struct metaslab_class metaslab_class_t; typedef struct zio zio_t; typedef struct zilog zilog_t; typedef struct spa_aux_vdev spa_aux_vdev_t; -typedef struct ddt ddt_t; -typedef struct ddt_entry ddt_entry_t; typedef struct zbookmark_phys zbookmark_phys_t; typedef struct zbookmark_err_phys zbookmark_err_phys_t; From 5ee0f9c64946fca90026c0d7092326755d75c2d7 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 15 Jun 2023 16:10:00 +1000 Subject: [PATCH 10/20] ddt: ensure ddt objects exist before trying to get stats from them ddt_get_dedup_histogram() was actually checking it, just in an extremely cursed way. ddt_get_dedup_object_stats() wasn't, but wasn't being called from a dangerous place so no one noticed. These checks are necessary, because spa_ddt[] is not populated until spa_load(), but the spa can exist before that, while being created, and as vdevs and metaslabs are initialised the space accounting functions will be called to update pool space counts. Probably the whole create path doesn't need to go asking for space accounting from metadata subsystems until after the pool is created. This will at least catch misuse. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. Closes #15887 --- module/zfs/ddt_stats.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/module/zfs/ddt_stats.c b/module/zfs/ddt_stats.c index 4ef6c3a07e3..af5365a1d11 100644 --- a/module/zfs/ddt_stats.c +++ b/module/zfs/ddt_stats.c @@ -132,6 +132,9 @@ ddt_get_dedup_object_stats(spa_t *spa, ddt_object_t *ddo_total) /* Sum the statistics we cached in ddt_object_sync(). */ for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; + if (!ddt) + continue; + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { @@ -156,7 +159,10 @@ ddt_get_dedup_histogram(spa_t *spa, ddt_histogram_t *ddh) { for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; - for (ddt_type_t type = 0; type < DDT_TYPES && ddt; type++) { + if (!ddt) + continue; + + for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { ddt_histogram_add(ddh, From 9029278dde70a85f720e7a75f4c6f69dcd25ee0f Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 3 Jul 2023 23:28:46 +1000 Subject: [PATCH 11/20] ddt: rework ops interface in terms of keys and values Store objects store keys and values, so have them take those types and nothing more. This way, they don't need to be concerned about the "kind" of entry being operated on; the dispatch layer can take care of the appropriate conversions. This adds a "contains" op to see if a particular entry exists without loading it, which makes a couple of things easier to do; in particular, it allows us to avoid an allocation in ddt_class_contains(). Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. Closes #15887 --- include/sys/ddt_impl.h | 20 ++++++++------ module/Makefile.bsd | 1 + module/zfs/ddt.c | 48 ++++++++++++++++++++------------- module/zfs/ddt_zap.c | 61 +++++++++++++++++++++++++----------------- 4 files changed, 78 insertions(+), 52 deletions(-) diff --git a/include/sys/ddt_impl.h b/include/sys/ddt_impl.h index 09937c71461..d6693658885 100644 --- a/include/sys/ddt_impl.h +++ b/include/sys/ddt_impl.h @@ -41,15 +41,19 @@ typedef struct { int (*ddt_op_create)(objset_t *os, uint64_t *object, dmu_tx_t *tx, boolean_t prehash); int (*ddt_op_destroy)(objset_t *os, uint64_t object, dmu_tx_t *tx); - int (*ddt_op_lookup)(objset_t *os, uint64_t object, ddt_entry_t *dde); + int (*ddt_op_lookup)(objset_t *os, uint64_t object, + const ddt_key_t *ddk, ddt_phys_t *phys, size_t psize); + int (*ddt_op_contains)(objset_t *os, uint64_t object, + const ddt_key_t *ddk); void (*ddt_op_prefetch)(objset_t *os, uint64_t object, - ddt_entry_t *dde); - int (*ddt_op_update)(objset_t *os, uint64_t object, ddt_entry_t *dde, + const ddt_key_t *ddk); + int (*ddt_op_update)(objset_t *os, uint64_t object, + const ddt_key_t *ddk, const ddt_phys_t *phys, size_t psize, dmu_tx_t *tx); - int (*ddt_op_remove)(objset_t *os, uint64_t object, ddt_entry_t *dde, - dmu_tx_t *tx); - int (*ddt_op_walk)(objset_t *os, uint64_t object, ddt_entry_t *dde, - uint64_t *walk); + int (*ddt_op_remove)(objset_t *os, uint64_t object, + const ddt_key_t *ddk, dmu_tx_t *tx); + int (*ddt_op_walk)(objset_t *os, uint64_t object, uint64_t *walk, + ddt_key_t *ddk, ddt_phys_t *phys, size_t psize); int (*ddt_op_count)(objset_t *os, uint64_t object, uint64_t *count); } ddt_ops_t; @@ -62,7 +66,7 @@ extern void ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg); * outside of the DDT implementation proper, and if you do, consider moving * them up. */ -#define DDT_NAMELEN 107 +#define DDT_NAMELEN 110 extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde); diff --git a/module/Makefile.bsd b/module/Makefile.bsd index 1b0110d3ae9..e9ad69fc50a 100644 --- a/module/Makefile.bsd +++ b/module/Makefile.bsd @@ -421,6 +421,7 @@ CFLAGS.gcc+= -Wno-pointer-to-int-cast CFLAGS.abd.c= -Wno-cast-qual CFLAGS.ddt.c= -Wno-cast-qual +CFLAGS.ddt_zap.c= -Wno-cast-qual CFLAGS.dmu.c= -Wno-cast-qual CFLAGS.dmu_traverse.c= -Wno-cast-qual CFLAGS.dnode.c= ${NO_WUNUSED_BUT_SET_VARIABLE} diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 1b7063998c2..df79de74e3c 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -186,18 +186,30 @@ ddt_object_lookup(ddt_t *ddt, ddt_type_t type, ddt_class_t class, return (SET_ERROR(ENOENT)); return (ddt_ops[type]->ddt_op_lookup(ddt->ddt_os, - ddt->ddt_object[type][class], dde)); + ddt->ddt_object[type][class], &dde->dde_key, + dde->dde_phys, sizeof (dde->dde_phys))); +} + +static int +ddt_object_contains(ddt_t *ddt, ddt_type_t type, ddt_class_t class, + const ddt_key_t *ddk) +{ + if (!ddt_object_exists(ddt, type, class)) + return (SET_ERROR(ENOENT)); + + return (ddt_ops[type]->ddt_op_contains(ddt->ddt_os, + ddt->ddt_object[type][class], ddk)); } static void ddt_object_prefetch(ddt_t *ddt, ddt_type_t type, ddt_class_t class, - ddt_entry_t *dde) + const ddt_key_t *ddk) { if (!ddt_object_exists(ddt, type, class)) return; ddt_ops[type]->ddt_op_prefetch(ddt->ddt_os, - ddt->ddt_object[type][class], dde); + ddt->ddt_object[type][class], ddk); } static int @@ -207,17 +219,18 @@ ddt_object_update(ddt_t *ddt, ddt_type_t type, ddt_class_t class, ASSERT(ddt_object_exists(ddt, type, class)); return (ddt_ops[type]->ddt_op_update(ddt->ddt_os, - ddt->ddt_object[type][class], dde, tx)); + ddt->ddt_object[type][class], &dde->dde_key, dde->dde_phys, + sizeof (dde->dde_phys), tx)); } static int ddt_object_remove(ddt_t *ddt, ddt_type_t type, ddt_class_t class, - ddt_entry_t *dde, dmu_tx_t *tx) + const ddt_key_t *ddk, dmu_tx_t *tx) { ASSERT(ddt_object_exists(ddt, type, class)); return (ddt_ops[type]->ddt_op_remove(ddt->ddt_os, - ddt->ddt_object[type][class], dde, tx)); + ddt->ddt_object[type][class], ddk, tx)); } int @@ -227,7 +240,8 @@ ddt_object_walk(ddt_t *ddt, ddt_type_t type, ddt_class_t class, ASSERT(ddt_object_exists(ddt, type, class)); return (ddt_ops[type]->ddt_op_walk(ddt->ddt_os, - ddt->ddt_object[type][class], dde, walk)); + ddt->ddt_object[type][class], walk, &dde->dde_key, + dde->dde_phys, sizeof (dde->dde_phys))); } int @@ -523,7 +537,7 @@ void ddt_prefetch(spa_t *spa, const blkptr_t *bp) { ddt_t *ddt; - ddt_entry_t dde; + ddt_key_t ddk; if (!zfs_dedup_prefetch || bp == NULL || !BP_GET_DEDUP(bp)) return; @@ -534,11 +548,11 @@ ddt_prefetch(spa_t *spa, const blkptr_t *bp) * Thus no locking is required as the DDT can't disappear on us. */ ddt = ddt_select(spa, bp); - ddt_key_fill(&dde.dde_key, bp); + ddt_key_fill(&ddk, bp); for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { - ddt_object_prefetch(ddt, type, class, &dde); + ddt_object_prefetch(ddt, type, class, &ddk); } } } @@ -660,7 +674,7 @@ boolean_t ddt_class_contains(spa_t *spa, ddt_class_t max_class, const blkptr_t *bp) { ddt_t *ddt; - ddt_entry_t *dde; + ddt_key_t ddk; if (!BP_GET_DEDUP(bp)) return (B_FALSE); @@ -669,20 +683,16 @@ ddt_class_contains(spa_t *spa, ddt_class_t max_class, const blkptr_t *bp) return (B_TRUE); ddt = spa->spa_ddt[BP_GET_CHECKSUM(bp)]; - dde = kmem_cache_alloc(ddt_entry_cache, KM_SLEEP); - ddt_key_fill(&(dde->dde_key), bp); + ddt_key_fill(&ddk, bp); for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class <= max_class; class++) { - if (ddt_object_lookup(ddt, type, class, dde) == 0) { - kmem_cache_free(ddt_entry_cache, dde); + if (ddt_object_contains(ddt, type, class, &ddk) == 0) return (B_TRUE); - } } } - kmem_cache_free(ddt_entry_cache, dde); return (B_FALSE); } @@ -833,9 +843,9 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) if (otype != DDT_TYPES && (otype != ntype || oclass != nclass || total_refcnt == 0)) { - VERIFY0(ddt_object_remove(ddt, otype, oclass, dde, tx)); + VERIFY0(ddt_object_remove(ddt, otype, oclass, ddk, tx)); ASSERT3U( - ddt_object_lookup(ddt, otype, oclass, dde), ==, ENOENT); + ddt_object_contains(ddt, otype, oclass, ddk), ==, ENOENT); } if (total_refcnt != 0) { diff --git a/module/zfs/ddt_zap.c b/module/zfs/ddt_zap.c index 56312881fdd..741554de3c6 100644 --- a/module/zfs/ddt_zap.c +++ b/module/zfs/ddt_zap.c @@ -42,7 +42,7 @@ static unsigned int ddt_zap_default_ibs = 15; #define DDT_KEY_WORDS (sizeof (ddt_key_t) / sizeof (uint64_t)) static size_t -ddt_zap_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len) +ddt_zap_compress(const void *src, uchar_t *dst, size_t s_len, size_t d_len) { uchar_t *version = dst++; int cpfunc = ZIO_COMPRESS_ZLE; @@ -51,7 +51,8 @@ ddt_zap_compress(void *src, uchar_t *dst, size_t s_len, size_t d_len) ASSERT3U(d_len, >=, s_len + 1); /* no compression plus version byte */ - c_len = ci->ci_compress(src, dst, s_len, d_len - 1, ci->ci_level); + c_len = ci->ci_compress((void *)src, dst, s_len, d_len - 1, + ci->ci_level); if (c_len == s_len) { cpfunc = ZIO_COMPRESS_OFF; @@ -93,8 +94,10 @@ ddt_zap_create(objset_t *os, uint64_t *objectp, dmu_tx_t *tx, boolean_t prehash) *objectp = zap_create_flags(os, 0, flags, DMU_OT_DDT_ZAP, ddt_zap_default_bs, ddt_zap_default_ibs, DMU_OT_NONE, 0, tx); + if (*objectp == 0) + return (SET_ERROR(ENOTSUP)); - return (*objectp == 0 ? SET_ERROR(ENOTSUP) : 0); + return (0); } static int @@ -104,51 +107,57 @@ ddt_zap_destroy(objset_t *os, uint64_t object, dmu_tx_t *tx) } static int -ddt_zap_lookup(objset_t *os, uint64_t object, ddt_entry_t *dde) +ddt_zap_lookup(objset_t *os, uint64_t object, + const ddt_key_t *ddk, ddt_phys_t *phys, size_t psize) { uchar_t *cbuf; uint64_t one, csize; int error; - error = zap_length_uint64(os, object, (uint64_t *)&dde->dde_key, + error = zap_length_uint64(os, object, (uint64_t *)ddk, DDT_KEY_WORDS, &one, &csize); if (error) return (error); ASSERT3U(one, ==, 1); - ASSERT3U(csize, <=, (sizeof (dde->dde_phys) + 1)); + ASSERT3U(csize, <=, psize + 1); cbuf = kmem_alloc(csize, KM_SLEEP); - error = zap_lookup_uint64(os, object, (uint64_t *)&dde->dde_key, + error = zap_lookup_uint64(os, object, (uint64_t *)ddk, DDT_KEY_WORDS, 1, csize, cbuf); if (error == 0) - ddt_zap_decompress(cbuf, dde->dde_phys, csize, - sizeof (dde->dde_phys)); + ddt_zap_decompress(cbuf, phys, csize, psize); kmem_free(cbuf, csize); return (error); } -static void -ddt_zap_prefetch(objset_t *os, uint64_t object, ddt_entry_t *dde) +static int +ddt_zap_contains(objset_t *os, uint64_t object, const ddt_key_t *ddk) { - (void) zap_prefetch_uint64(os, object, (uint64_t *)&dde->dde_key, - DDT_KEY_WORDS); + return (zap_length_uint64(os, object, (uint64_t *)ddk, DDT_KEY_WORDS, + NULL, NULL)); +} + +static void +ddt_zap_prefetch(objset_t *os, uint64_t object, const ddt_key_t *ddk) +{ + (void) zap_prefetch_uint64(os, object, (uint64_t *)ddk, DDT_KEY_WORDS); } static int -ddt_zap_update(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx) +ddt_zap_update(objset_t *os, uint64_t object, const ddt_key_t *ddk, + const ddt_phys_t *phys, size_t psize, dmu_tx_t *tx) { - const size_t cbuf_size = sizeof (dde->dde_phys) + 1; + const size_t cbuf_size = psize + 1; uchar_t *cbuf = kmem_alloc(cbuf_size, KM_SLEEP); - uint64_t csize = ddt_zap_compress(dde->dde_phys, cbuf, - sizeof (dde->dde_phys), cbuf_size); + uint64_t csize = ddt_zap_compress(phys, cbuf, psize, cbuf_size); - int error = zap_update_uint64(os, object, (uint64_t *)&dde->dde_key, + int error = zap_update_uint64(os, object, (uint64_t *)ddk, DDT_KEY_WORDS, 1, csize, cbuf, tx); kmem_free(cbuf, cbuf_size); @@ -157,14 +166,16 @@ ddt_zap_update(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx) } static int -ddt_zap_remove(objset_t *os, uint64_t object, ddt_entry_t *dde, dmu_tx_t *tx) +ddt_zap_remove(objset_t *os, uint64_t object, const ddt_key_t *ddk, + dmu_tx_t *tx) { - return (zap_remove_uint64(os, object, (uint64_t *)&dde->dde_key, + return (zap_remove_uint64(os, object, (uint64_t *)ddk, DDT_KEY_WORDS, tx)); } static int -ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk) +ddt_zap_walk(objset_t *os, uint64_t object, uint64_t *walk, ddt_key_t *ddk, + ddt_phys_t *phys, size_t psize) { zap_cursor_t zc; zap_attribute_t za; @@ -186,7 +197,7 @@ ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk) uint64_t csize = za.za_num_integers; ASSERT3U(za.za_integer_length, ==, 1); - ASSERT3U(csize, <=, sizeof (dde->dde_phys) + 1); + ASSERT3U(csize, <=, psize + 1); uchar_t *cbuf = kmem_alloc(csize, KM_SLEEP); @@ -194,9 +205,8 @@ ddt_zap_walk(objset_t *os, uint64_t object, ddt_entry_t *dde, uint64_t *walk) DDT_KEY_WORDS, 1, csize, cbuf); ASSERT0(error); if (error == 0) { - ddt_zap_decompress(cbuf, dde->dde_phys, csize, - sizeof (dde->dde_phys)); - dde->dde_key = *(ddt_key_t *)za.za_name; + ddt_zap_decompress(cbuf, phys, csize, psize); + *ddk = *(ddt_key_t *)za.za_name; } kmem_free(cbuf, csize); @@ -219,6 +229,7 @@ const ddt_ops_t ddt_zap_ops = { ddt_zap_create, ddt_zap_destroy, ddt_zap_lookup, + ddt_zap_contains, ddt_zap_prefetch, ddt_zap_update, ddt_zap_remove, From 2cffddd405e0082b781999513321959d03834cc9 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 31 Jul 2023 17:42:34 +1000 Subject: [PATCH 12/20] ddt: remove ddt_node Nothing uses it. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. Closes #15887 --- include/sys/ddt.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/sys/ddt.h b/include/sys/ddt.h index 013692c96dc..ce20da6b0f6 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -146,7 +146,6 @@ typedef struct { ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES]; ddt_histogram_t ddt_histogram_cache[DDT_TYPES][DDT_CLASSES]; ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES]; - avl_node_t ddt_node; } ddt_t; /* From 406562c56318a9d29a5e78a6e4bdee158902cd14 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 5 Dec 2023 14:28:39 +1100 Subject: [PATCH 13/20] ddt: simplify entry load and flags Only a single bit is needed to track entry state, and definitely not two whole bytes. Some light refactoring in ddt_lookup() is needed to support this, but it reads a lot better now. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. Closes #15887 --- include/sys/ddt.h | 7 +++++-- module/zfs/ddt.c | 47 ++++++++++++++++++++++++++++------------------- 2 files changed, 33 insertions(+), 21 deletions(-) diff --git a/include/sys/ddt.h b/include/sys/ddt.h index ce20da6b0f6..b29667e5abb 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -117,6 +117,10 @@ enum ddt_phys_type { /* * In-core ddt entry */ + +/* State flags for dde_flags */ +#define DDE_FLAG_LOADED (1 << 0) /* entry ready for use */ + typedef struct { /* key must be first for ddt_key_compare */ ddt_key_t dde_key; @@ -125,8 +129,7 @@ typedef struct { struct abd *dde_repair_abd; ddt_type_t dde_type; ddt_class_t dde_class; - uint8_t dde_loading; - uint8_t dde_loaded; + uint8_t dde_flags; kcondvar_t dde_cv; avl_node_t dde_node; } ddt_entry_t; diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index df79de74e3c..3e278d702da 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -448,7 +448,7 @@ ddt_alloc(const ddt_key_t *ddk) static void ddt_free(ddt_entry_t *dde) { - ASSERT(!dde->dde_loading); + ASSERT(dde->dde_flags & DDE_FLAG_LOADED); for (int p = 0; p < DDT_PHYS_TYPES; p++) ASSERT3P(dde->dde_lead_zio[p], ==, NULL); @@ -483,26 +483,37 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) ddt_key_fill(&search, bp); + /* Find an existing live entry */ dde = avl_find(&ddt->ddt_tree, &search, &where); - if (dde == NULL) { - if (!add) - return (NULL); - dde = ddt_alloc(&search); - avl_insert(&ddt->ddt_tree, dde, where); + if (dde != NULL) { + /* Found it. If it's already loaded, we can just return it. */ + if (dde->dde_flags & DDE_FLAG_LOADED) + return (dde); + + /* Someone else is loading it, wait for it. */ + while (!(dde->dde_flags & DDE_FLAG_LOADED)) + cv_wait(&dde->dde_cv, &ddt->ddt_lock); + + return (dde); } - while (dde->dde_loading) - cv_wait(&dde->dde_cv, &ddt->ddt_lock); + /* Not found. */ + if (!add) + return (NULL); - if (dde->dde_loaded) - return (dde); - - dde->dde_loading = B_TRUE; + /* Time to make a new entry. */ + dde = ddt_alloc(&search); + avl_insert(&ddt->ddt_tree, dde, where); + /* + * ddt_tree is now stable, so unlock and let everyone else keep moving. + * Anyone landing on this entry will find it without DDE_FLAG_LOADED, + * and go to sleep waiting for it above. + */ ddt_exit(ddt); + /* Search all store objects for the entry. */ error = ENOENT; - for (type = 0; type < DDT_TYPES; type++) { for (class = 0; class < DDT_CLASSES; class++) { error = ddt_object_lookup(ddt, type, class, dde); @@ -517,17 +528,16 @@ ddt_lookup(ddt_t *ddt, const blkptr_t *bp, boolean_t add) ddt_enter(ddt); - ASSERT(!dde->dde_loaded); - ASSERT(dde->dde_loading); + ASSERT(!(dde->dde_flags & DDE_FLAG_LOADED)); dde->dde_type = type; /* will be DDT_TYPES if no entry found */ dde->dde_class = class; /* will be DDT_CLASSES if no entry found */ - dde->dde_loaded = B_TRUE; - dde->dde_loading = B_FALSE; if (error == 0) ddt_stat_update(ddt, dde, -1ULL); + /* Entry loaded, everyone can proceed now */ + dde->dde_flags |= DDE_FLAG_LOADED; cv_broadcast(&dde->dde_cv); return (dde); @@ -812,8 +822,7 @@ ddt_sync_entry(ddt_t *ddt, ddt_entry_t *dde, dmu_tx_t *tx, uint64_t txg) ddt_class_t nclass; uint64_t total_refcnt = 0; - ASSERT(dde->dde_loaded); - ASSERT(!dde->dde_loading); + ASSERT(dde->dde_flags & DDE_FLAG_LOADED); for (int p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { ASSERT3P(dde->dde_lead_zio[p], ==, NULL); From d9619546889dfec7b0a8ee45f82671184b75ea0d Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Thu, 1 Feb 2024 11:05:18 +1100 Subject: [PATCH 14/20] ddt: only create tables for dedup-capable checksums Most values in zio_checksum can never be used for dedup, partly because the dedup= property only offers a limited list, but also some values (eg ZIO_CHECKSUM_OFF) aren't real and will never be seen. A true flag would be better than a hardcoded list, but thats more cleanup elsewhere than I want to do right now. Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. Closes #15887 --- cmd/zdb/zdb.c | 8 +++++++- module/zfs/ddt.c | 21 +++++++++++++++++++-- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/cmd/zdb/zdb.c b/cmd/zdb/zdb.c index 7f17de03d3c..4880c804872 100644 --- a/cmd/zdb/zdb.c +++ b/cmd/zdb/zdb.c @@ -1965,6 +1965,8 @@ dump_all_ddts(spa_t *spa) for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { ddt_t *ddt = spa->spa_ddt[c]; + if (!ddt) + continue; for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class < DDT_CLASSES; class++) { @@ -6062,6 +6064,8 @@ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb) return; ASSERT(ddt_phys_total_refcnt(&dde) > 1); + ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum]; + VERIFY(ddt); for (p = 0; p < DDT_PHYS_TYPES; p++, ddp++) { if (ddp->ddp_phys_birth == 0) @@ -6076,7 +6080,7 @@ zdb_ddt_leak_init(spa_t *spa, zdb_cb_t *zcb) zcb->zcb_dedup_blocks++; } } - ddt_t *ddt = spa->spa_ddt[ddb.ddb_checksum]; + ddt_enter(ddt); VERIFY(ddt_lookup(ddt, &blk, B_TRUE) != NULL); ddt_exit(ddt); @@ -7949,6 +7953,8 @@ dump_mos_leaks(spa_t *spa) for (uint64_t cksum = 0; cksum < ZIO_CHECKSUM_FUNCTIONS; cksum++) { ddt_t *ddt = spa->spa_ddt[cksum]; + if (!ddt) + continue; mos_obj_refd(ddt->ddt_object[type][class]); } } diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index 3e278d702da..f22c79c50aa 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -39,6 +39,15 @@ #include #include +/* + * These are the only checksums valid for dedup. They must match the list + * from dedup_table in zfs_prop.c + */ +#define DDT_CHECKSUM_VALID(c) \ + (c == ZIO_CHECKSUM_SHA256 || c == ZIO_CHECKSUM_SHA512 || \ + c == ZIO_CHECKSUM_SKEIN || c == ZIO_CHECKSUM_EDONR || \ + c == ZIO_CHECKSUM_BLAKE3) + static kmem_cache_t *ddt_cache; static kmem_cache_t *ddt_entry_cache; @@ -400,6 +409,7 @@ ddt_phys_total_refcnt(const ddt_entry_t *dde) ddt_t * ddt_select(spa_t *spa, const blkptr_t *bp) { + ASSERT(DDT_CHECKSUM_VALID(BP_GET_CHECKSUM(bp))); return (spa->spa_ddt[BP_GET_CHECKSUM(bp)]); } @@ -629,8 +639,10 @@ ddt_create(spa_t *spa) { spa->spa_dedup_checksum = ZIO_DEDUPCHECKSUM; - for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) - spa->spa_ddt[c] = ddt_table_alloc(spa, c); + for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + if (DDT_CHECKSUM_VALID(c)) + spa->spa_ddt[c] = ddt_table_alloc(spa, c); + } } int @@ -648,6 +660,9 @@ ddt_load(spa_t *spa) return (error == ENOENT ? 0 : error); for (enum zio_checksum c = 0; c < ZIO_CHECKSUM_FUNCTIONS; c++) { + if (!DDT_CHECKSUM_VALID(c)) + continue; + ddt_t *ddt = spa->spa_ddt[c]; for (ddt_type_t type = 0; type < DDT_TYPES; type++) { for (ddt_class_t class = 0; class < DDT_CLASSES; @@ -967,6 +982,8 @@ ddt_walk(spa_t *spa, ddt_bookmark_t *ddb, ddt_entry_t *dde) do { do { ddt_t *ddt = spa->spa_ddt[ddb->ddb_checksum]; + if (ddt == NULL) + continue; int error = ENOENT; if (ddt_object_exists(ddt, ddb->ddb_type, ddb->ddb_class)) { From 5720b00632c1086fe4a57df42aef2f0972c83c74 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Tue, 28 Nov 2023 10:43:36 +1100 Subject: [PATCH 15/20] ddt: document the theory and the key data structures Reviewed-by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: Klara, Inc. Sponsored-by: iXsystems, Inc. Closes #15887 --- include/sys/ddt.h | 98 ++++++++++++++++++++++++++++++++++++----------- module/zfs/ddt.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 171 insertions(+), 23 deletions(-) diff --git a/include/sys/ddt.h b/include/sys/ddt.h index b29667e5abb..726f1a3902e 100644 --- a/include/sys/ddt.h +++ b/include/sys/ddt.h @@ -21,6 +21,7 @@ /* * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2016 by Delphix. All rights reserved. + * Copyright (c) 2023, Klara Inc. */ #ifndef _SYS_DDT_H @@ -39,10 +40,16 @@ extern "C" { struct abd; /* - * On-disk DDT formats, in the desired search order (newest version first). + * DDT on-disk storage object types. Each one corresponds to specific + * implementation, see ddt_ops_t. The value itself is not stored on disk. + * + * When searching for an entry, objects types will be searched in this order. + * + * Note that DDT_TYPES is used as the "no type" for new entries that have not + * yet been written to a storage object. */ typedef enum { - DDT_TYPE_ZAP = 0, + DDT_TYPE_ZAP = 0, /* ZAP storage object, ddt_zap */ DDT_TYPES } ddt_type_t; @@ -53,12 +60,18 @@ _Static_assert(DDT_TYPES <= UINT8_MAX, #define DDT_TYPE_DEFAULT (DDT_TYPE_ZAP) /* - * DDT classes, in the desired search order (highest replication level first). + * DDT storage classes. Each class has a separate storage object for each type. + * The value itself is not stored on disk. + * + * When search for an entry, object classes will be searched in this order. + * + * Note that DDT_CLASSES is used as the "no class" for new entries that have not + * yet been written to a storage object. */ typedef enum { - DDT_CLASS_DITTO = 0, - DDT_CLASS_DUPLICATE, - DDT_CLASS_UNIQUE, + DDT_CLASS_DITTO = 0, /* entry has ditto blocks (obsolete) */ + DDT_CLASS_DUPLICATE, /* entry has multiple references */ + DDT_CLASS_UNIQUE, /* entry has a single reference */ DDT_CLASSES } ddt_class_t; @@ -66,7 +79,9 @@ _Static_assert(DDT_CLASSES < UINT8_MAX, "ddt_class_t must fit in a uint8_t"); /* - * On-disk ddt entry: key (name) and physical storage (value). + * The "key" part of an on-disk entry. This is the unique "name" for a block, + * that is, that parts of the block pointer that will always be the same for + * the same data. */ typedef struct { zio_cksum_t ddk_cksum; /* 256-bit block checksum */ @@ -80,6 +95,10 @@ typedef struct { uint64_t ddk_prop; } ddt_key_t; +/* + * Macros for accessing parts of a ddt_key_t. These are similar to their BP_* + * counterparts. + */ #define DDK_GET_LSIZE(ddk) \ BF64_GET_SB((ddk)->ddk_prop, 0, 16, SPA_MINBLOCKSHIFT, 1) #define DDK_SET_LSIZE(ddk, x) \ @@ -96,6 +115,16 @@ typedef struct { #define DDK_GET_CRYPT(ddk) BF64_GET((ddk)->ddk_prop, 39, 1) #define DDK_SET_CRYPT(ddk, x) BF64_SET((ddk)->ddk_prop, 39, 1, x) +/* + * The "value" part for an on-disk entry. These are the "physical" + * characteristics of the stored block, such as its location on disk (DVAs), + * birth txg and ref count. + * + * Note that an entry has an array of four ddt_phys_t, one for each number of + * DVAs (copies= property) and another for additional "ditto" copies. Most + * users of ddt_phys_t will handle indexing into or counting the phys they + * want. + */ typedef struct { dva_t ddp_dva[SPA_DVAS_PER_BP]; uint64_t ddp_refcnt; @@ -103,6 +132,8 @@ typedef struct { } ddt_phys_t; /* + * Named indexes into the ddt_phys_t array in each entry. + * * Note, we no longer generate new DDT_PHYS_DITTO-type blocks. However, * we maintain the ability to free existing dedup-ditto blocks. */ @@ -115,7 +146,8 @@ enum ddt_phys_type { }; /* - * In-core ddt entry + * A "live" entry, holding changes to an entry made this txg, and other data to + * support loading, updating and repairing the entry. */ /* State flags for dde_flags */ @@ -123,36 +155,56 @@ enum ddt_phys_type { typedef struct { /* key must be first for ddt_key_compare */ - ddt_key_t dde_key; - ddt_phys_t dde_phys[DDT_PHYS_TYPES]; + ddt_key_t dde_key; /* ddt_tree key */ + ddt_phys_t dde_phys[DDT_PHYS_TYPES]; /* on-disk data */ + + /* in-flight update IOs */ zio_t *dde_lead_zio[DDT_PHYS_TYPES]; + + /* copy of data after a repair read, to be rewritten */ struct abd *dde_repair_abd; + + /* storage type and class the entry was loaded from */ ddt_type_t dde_type; ddt_class_t dde_class; - uint8_t dde_flags; - kcondvar_t dde_cv; - avl_node_t dde_node; + + uint8_t dde_flags; /* load state flags */ + kcondvar_t dde_cv; /* signaled when load completes */ + + avl_node_t dde_node; /* ddt_tree node */ } ddt_entry_t; /* - * In-core ddt + * In-core DDT object. This covers all entries and stats for a the whole pool + * for a given checksum type. */ typedef struct { - kmutex_t ddt_lock; - avl_tree_t ddt_tree; - avl_tree_t ddt_repair_tree; - enum zio_checksum ddt_checksum; - spa_t *ddt_spa; - objset_t *ddt_os; - uint64_t ddt_stat_object; + kmutex_t ddt_lock; /* protects changes to all fields */ + + avl_tree_t ddt_tree; /* "live" (changed) entries this txg */ + + avl_tree_t ddt_repair_tree; /* entries being repaired */ + + enum zio_checksum ddt_checksum; /* checksum algorithm in use */ + spa_t *ddt_spa; /* pool this ddt is on */ + objset_t *ddt_os; /* ddt objset (always MOS) */ + + /* per-type/per-class entry store objects */ uint64_t ddt_object[DDT_TYPES][DDT_CLASSES]; + + /* object ids for whole-ddt and per-type/per-class stats */ + uint64_t ddt_stat_object; + ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES]; + + /* type/class stats by power-2-sized referenced blocks */ ddt_histogram_t ddt_histogram[DDT_TYPES][DDT_CLASSES]; ddt_histogram_t ddt_histogram_cache[DDT_TYPES][DDT_CLASSES]; - ddt_object_t ddt_object_stats[DDT_TYPES][DDT_CLASSES]; } ddt_t; /* - * In-core and on-disk bookmark for DDT walks + * In-core and on-disk bookmark for DDT walks. This is a cursor for ddt_walk(), + * and is stable across calls, even if the DDT is updated, the pool is + * restarted or loaded on another system, or OpenZFS is upgraded. */ typedef struct { uint64_t ddb_class; diff --git a/module/zfs/ddt.c b/module/zfs/ddt.c index f22c79c50aa..de8640e58a2 100644 --- a/module/zfs/ddt.c +++ b/module/zfs/ddt.c @@ -23,6 +23,7 @@ * Copyright (c) 2009, 2010, Oracle and/or its affiliates. All rights reserved. * Copyright (c) 2012, 2016 by Delphix. All rights reserved. * Copyright (c) 2022 by Pawel Jakub Dawidek + * Copyright (c) 2023, Klara Inc. */ #include @@ -39,6 +40,101 @@ #include #include +/* + * # DDT: Deduplication tables + * + * The dedup subsystem provides block-level deduplication. When enabled, blocks + * to be written will have the dedup (D) bit set, which causes them to be + * tracked in a "dedup table", or DDT. If a block has been seen before (exists + * in the DDT), instead of being written, it will instead be made to reference + * the existing on-disk data, and a refcount bumped in the DDT instead. + * + * ## Dedup tables and entries + * + * Conceptually, a DDT is a dictionary or map. Each entry has a "key" + * (ddt_key_t) made up a block's checksum and certian properties, and a "value" + * (one or more ddt_phys_t) containing valid DVAs for the block's data, birth + * time and refcount. Together these are enough to track references to a + * specific block, to build a valid block pointer to reference that block (for + * freeing, scrubbing, etc), and to fill a new block pointer with the missing + * pieces to make it seem like it was written. + * + * There's a single DDT (ddt_t) for each checksum type, held in spa_ddt[]. + * Within each DDT, there can be multiple storage "types" (ddt_type_t, on-disk + * object data formats, each with their own implementations) and "classes" + * (ddt_class_t, instance of a storage type object, for entries with a specific + * characteristic). An entry (key) will only ever exist on one of these objects + * at any given time, but may be moved from one to another if their type or + * class changes. + * + * The DDT is driven by the write IO pipeline (zio_ddt_write()). When a block + * is to be written, before DVAs have been allocated, ddt_lookup() is called to + * see if the block has been seen before. If its not found, the write proceeds + * as normal, and after it succeeds, a new entry is created. If it is found, we + * fill the BP with the DVAs from the entry, increment the refcount and cause + * the write IO to return immediately. + * + * Each ddt_phys_t slot in the entry represents a separate dedup block for the + * same content/checksum. The slot is selected based on the zp_copies parameter + * the block is written with, that is, the number of DVAs in the block. The + * "ditto" slot (DDT_PHYS_DITTO) used to be used for now-removed "dedupditto" + * feature. These are no longer written, and will be freed if encountered on + * old pools. + * + * ## Lifetime of an entry + * + * A DDT can be enormous, and typically is not held in memory all at once. + * Instead, the changes to an entry are tracked in memory, and written down to + * disk at the end of each txg. + * + * A "live" in-memory entry (ddt_entry_t) is a node on the live tree + * (ddt_tree). At the start of a txg, ddt_tree is empty. When an entry is + * required for IO, ddt_lookup() is called. If an entry already exists on + * ddt_tree, it is returned. Otherwise, a new one is created, and the + * type/class objects for the DDT are searched for that key. If its found, its + * value is copied into the live entry. If not, an empty entry is created. + * + * The live entry will be modified during the txg, usually by modifying the + * refcount, but sometimes by adding or updating DVAs. At the end of the txg + * (during spa_sync()), type and class are recalculated for entry (see + * ddt_sync_entry()), and the entry is written to the appropriate storage + * object and (if necessary), removed from an old one. ddt_tree is cleared and + * the next txg can start. + * + * ## Repair IO + * + * If a read on a dedup block fails, but there are other copies of the block in + * the other ddt_phys_t slots, reads will be issued for those instead + * (zio_ddt_read_start()). If one of those succeeds, the read is returned to + * the caller, and a copy is stashed on the entry's dde_repair_abd. + * + * During the end-of-txg sync, any entries with a dde_repair_abd get a + * "rewrite" write issued for the original block pointer, with the data read + * from the alternate block. If the block is actually damaged, this will invoke + * the pool's "self-healing" mechanism, and repair the block. + * + * ## Scanning (scrub/resilver) + * + * If dedup is active, the scrub machinery will walk the dedup table first, and + * scrub all blocks with refcnt > 1 first. After that it will move on to the + * regular top-down scrub, and exclude the refcnt > 1 blocks when it sees them. + * In this way, heavily deduplicated blocks are only scrubbed once. See the + * commentary on dsl_scan_ddt() for more details. + * + * Walking the DDT is done via ddt_walk(). The current position is stored in a + * ddt_bookmark_t, which represents a stable position in the storage object. + * This bookmark is stored by the scan machinery, and must reference the same + * position on the object even if the object changes, the pool is exported, or + * OpenZFS is upgraded. + * + * ## Interaction with block cloning + * + * If block cloning and dedup are both enabled on a pool, BRT will look for the + * dedup bit on an incoming block pointer. If set, it will call into the DDT + * (ddt_addref()) to add a reference to the block, instead of adding a + * reference to the BRT. See brt_pending_apply(). + */ + /* * These are the only checksums valid for dedup. They must match the list * from dedup_table in zfs_prop.c From d6a3d3f12a6455cf813fc4d207d66e78a5e8b4ca Mon Sep 17 00:00:00 2001 From: Tony Hutter Date: Fri, 16 Feb 2024 08:59:56 -0800 Subject: [PATCH 16/20] ZTS: Skip cross-fs bclone tests if FreeBSD < 14.0 Skip cross filesystem block cloning tests on FreeBSD if running less than version 14.0. Cross filesystem copy_file_range() was added in FreeBSD 14. Reviewed-by: Brian Behlendorf Signed-off-by: Tony Hutter Closes #15901 --- tests/test-runner/bin/zts-report.py.in | 22 ++++++++++++++- tests/zfs-tests/include/libtest.shlib | 27 ++++++++++++++----- .../functional/bclone/bclone_common.kshlib | 6 +++++ ...ck_cloning_copyfilerange_cross_dataset.ksh | 5 ++-- .../block_cloning_cross_enc_dataset.ksh | 5 ++-- 5 files changed, 51 insertions(+), 14 deletions(-) diff --git a/tests/test-runner/bin/zts-report.py.in b/tests/test-runner/bin/zts-report.py.in index edfdd47ee6d..ecc50f48715 100755 --- a/tests/test-runner/bin/zts-report.py.in +++ b/tests/test-runner/bin/zts-report.py.in @@ -138,7 +138,11 @@ idmap_reason = 'Idmapped mount needs kernel 5.12+' # copy_file_range() is not supported by all kernels # cfr_reason = 'Kernel copy_file_range support required' -cfr_cross_reason = 'copy_file_range(2) cross-filesystem needs kernel 5.3+' + +if sys.platform.startswith('freebsd'): + cfr_cross_reason = 'copy_file_range(2) cross-filesystem needs FreeBSD 14+' +else: + cfr_cross_reason = 'copy_file_range(2) cross-filesystem needs kernel 5.3+' # # These tests are known to fail, thus we use this list to prevent these @@ -268,6 +272,22 @@ if sys.platform.startswith('freebsd'): 'pool_checkpoint/checkpoint_indirect': ['FAIL', 12623], 'resilver/resilver_restart_001': ['FAIL', known_reason], 'snapshot/snapshot_002_pos': ['FAIL', '14831'], + 'bclone/bclone_crossfs_corner_cases': ['SKIP', cfr_cross_reason], + 'bclone/bclone_crossfs_corner_cases_limited': + ['SKIP', cfr_cross_reason], + 'bclone/bclone_crossfs_data': ['SKIP', cfr_cross_reason], + 'bclone/bclone_crossfs_embedded': ['SKIP', cfr_cross_reason], + 'bclone/bclone_crossfs_hole': ['SKIP', cfr_cross_reason], + 'bclone/bclone_diffprops_all': ['SKIP', cfr_cross_reason], + 'bclone/bclone_diffprops_checksum': ['SKIP', cfr_cross_reason], + 'bclone/bclone_diffprops_compress': ['SKIP', cfr_cross_reason], + 'bclone/bclone_diffprops_copies': ['SKIP', cfr_cross_reason], + 'bclone/bclone_diffprops_recordsize': ['SKIP', cfr_cross_reason], + 'bclone/bclone_prop_sync': ['SKIP', cfr_cross_reason], + 'block_cloning/block_cloning_cross_enc_dataset': + ['SKIP', cfr_cross_reason], + 'block_cloning/block_cloning_copyfilerange_cross_dataset': + ['SKIP', cfr_cross_reason] }) elif sys.platform.startswith('linux'): maybe.update({ diff --git a/tests/zfs-tests/include/libtest.shlib b/tests/zfs-tests/include/libtest.shlib index b4d2b91dd47..dfab48d2cda 100644 --- a/tests/zfs-tests/include/libtest.shlib +++ b/tests/zfs-tests/include/libtest.shlib @@ -61,13 +61,8 @@ function compare_version_gte [ "$(printf "$1\n$2" | sort -V | tail -n1)" = "$1" ] } -# Linux kernel version comparison function -# -# $1 Linux version ("4.10", "2.6.32") or blank for installed Linux version -# -# Used for comparison: if [ $(linux_version) -ge $(linux_version "2.6.32") ] -# -function linux_version +# Helper function used by linux_version() and freebsd_version() +function kernel_version { typeset ver="$1" @@ -83,6 +78,24 @@ function linux_version echo $((version * 100000 + major * 1000 + minor)) } +# Linux kernel version comparison function +# +# $1 Linux version ("4.10", "2.6.32") or blank for installed Linux version +# +# Used for comparison: if [ $(linux_version) -ge $(linux_version "2.6.32") ] +function linux_version { + kernel_version "$1" +} + +# FreeBSD version comparison function +# +# $1 FreeBSD version ("13.2", "14.0") or blank for installed FreeBSD version +# +# Used for comparison: if [ $(freebsd_version) -ge $(freebsd_version "13.2") ] +function freebsd_version { + kernel_version "$1" +} + # Determine if this is a Linux test system # # Return 0 if platform Linux, 1 if otherwise diff --git a/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib b/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib index beba01c0ed2..3b8eaea5bb5 100644 --- a/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib +++ b/tests/zfs-tests/tests/functional/bclone/bclone_common.kshlib @@ -42,6 +42,12 @@ function verify_crossfs_block_cloning if is_linux && [[ $(linux_version) -lt $(linux_version "5.3") ]]; then log_unsupported "copy_file_range can't copy cross-filesystem before Linux 5.3" fi + + # Cross dataset block cloning only supported on FreeBSD 14+ + # https://github.com/freebsd/freebsd-src/commit/969071be938c + if is_freebsd && [ $(freebsd_version) -lt $(freebsd_version 14.0) ] ; then + log_unsupported "Cloning across datasets not supported in $(uname -r)" + fi } # Unused. diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh index 43323c207a6..ad83d30291a 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_copyfilerange_cross_dataset.ksh @@ -26,12 +26,11 @@ . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib +. $STF_SUITE/tests/functional/bclone/bclone_common.kshlib verify_runnable "global" -if is_linux && [[ $(linux_version) -lt $(linux_version "5.3") ]]; then - log_unsupported "copy_file_range can't copy cross-filesystem before Linux 5.3" -fi +verify_crossfs_block_cloning claim="The copy_file_range syscall can clone across datasets." diff --git a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_cross_enc_dataset.ksh b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_cross_enc_dataset.ksh index 34d3d269255..702e23267f7 100755 --- a/tests/zfs-tests/tests/functional/block_cloning/block_cloning_cross_enc_dataset.ksh +++ b/tests/zfs-tests/tests/functional/block_cloning/block_cloning_cross_enc_dataset.ksh @@ -26,12 +26,11 @@ . $STF_SUITE/include/libtest.shlib . $STF_SUITE/tests/functional/block_cloning/block_cloning.kshlib +. $STF_SUITE/tests/functional/bclone/bclone_common.kshlib verify_runnable "global" -if is_linux && [[ $(linux_version) -lt $(linux_version "5.3") ]]; then - log_unsupported "copy_file_range can't copy cross-filesystem before Linux 5.3" -fi +verify_crossfs_block_cloning claim="Block cloning across encrypted datasets." From af4da5ccf28e1de6d4f6771a8d9d7cc578aba087 Mon Sep 17 00:00:00 2001 From: Brian Behlendorf Date: Fri, 16 Feb 2024 09:07:32 -0800 Subject: [PATCH 17/20] Check for minimum partition size On Linux block devices used for vdevs will by partitioned. The block device must be large enough for an 64M partition starting at offset of 2048 sectors (part1), and a second 64M reserved partition at the end of the device (part9). This commit adds a capacity check when creating the GPT label to immediately detect a device which is too small. With the existing code this would be caught slightly latter when attempting to use the partition. Catching it sooner let's us print a more useful error. Reviewed-by: Tony Hutter Signed-off-by: Brian Behlendorf Closes #15898 --- lib/libzfs/os/linux/libzfs_pool_os.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/lib/libzfs/os/linux/libzfs_pool_os.c b/lib/libzfs/os/linux/libzfs_pool_os.c index 401151b1afb..86eef3255bc 100644 --- a/lib/libzfs/os/linux/libzfs_pool_os.c +++ b/lib/libzfs/os/linux/libzfs_pool_os.c @@ -273,6 +273,16 @@ zpool_label_disk(libzfs_handle_t *hdl, zpool_handle_t *zhp, const char *name) vtoc->efi_parts[0].p_start = start_block; vtoc->efi_parts[0].p_size = slice_size; + if (vtoc->efi_parts[0].p_size * vtoc->efi_lbasize < SPA_MINDEVSIZE) { + (void) close(fd); + efi_free(vtoc); + + zfs_error_aux(hdl, dgettext(TEXT_DOMAIN, "cannot " + "label '%s': partition would be less than the minimum " + "device size (64M)"), path); + return (zfs_error(hdl, EZFS_LABELFAILED, errbuf)); + } + /* * Why we use V_USR: V_BACKUP confuses users, and is considered * disposable by some EFI utilities (since EFI doesn't have a backup From 5600dff0ef56f3bef23356a1ea2ee2364a4c1139 Mon Sep 17 00:00:00 2001 From: Quartz Date: Tue, 27 Feb 2024 03:41:44 +0800 Subject: [PATCH 18/20] Fixed parameter passing error when calling zfs_acl_chmod Follow up to 99495ba6abbf0bb726324d03212c6f5ffa00043e which accidentally introduce this regression. Reviewed by: Brian Behlendorf Signed-off-by: Quartz Closes #15907 --- module/os/linux/zfs/zfs_acl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/module/os/linux/zfs/zfs_acl.c b/module/os/linux/zfs/zfs_acl.c index a1fd3c9856c..48abbc01091 100644 --- a/module/os/linux/zfs/zfs_acl.c +++ b/module/os/linux/zfs/zfs_acl.c @@ -1921,8 +1921,8 @@ zfs_acl_ids_create(znode_t *dzp, int flag, vattr_t *vap, cred_t *cr, zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH && zfsvfs->z_acl_inherit != ZFS_ACL_PASSTHROUGH_X) trim = B_TRUE; - zfs_acl_chmod(vap->va_mode, acl_ids->z_mode, B_FALSE, - trim, acl_ids->z_aclp); + zfs_acl_chmod(S_ISDIR(vap->va_mode), acl_ids->z_mode, + B_FALSE, trim, acl_ids->z_aclp); } } From c00c085bfb187f0c14bd276658a9c897d3919d07 Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 19 Feb 2024 21:13:59 +1100 Subject: [PATCH 19/20] config: use -Wno-format-truncation globally -Wformat-truncation looks for places where the return code of snprintf() is unchecked and the provided buffer might be too short. This is based on a heuristic that can change between compiler versions. It has been seen to get this wrong in ddt_object_name(), leading to DDT_NAMELEN being increased somewhat arbitrarily. There's no good reason to have this warning enabled, so here we disable it everywhere. Truncation may be undesirable, but snprintf() is guaranteed to emit a trailing null, so at worst we get a short string, not a buffer overrun. Reviewed by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ Closes #15908 --- cmd/Makefile.am | 2 -- config/Rules.am | 4 +++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cmd/Makefile.am b/cmd/Makefile.am index 6d6de4adb42..2bd9d039f20 100644 --- a/cmd/Makefile.am +++ b/cmd/Makefile.am @@ -39,8 +39,6 @@ zhack_LDADD = \ ztest_CFLAGS = $(AM_CFLAGS) $(KERNEL_CFLAGS) -# Get rid of compiler warning for unchecked truncating snprintfs on gcc 7.1.1 -ztest_CFLAGS += $(NO_FORMAT_TRUNCATION) ztest_CPPFLAGS = $(AM_CPPFLAGS) $(FORCEDEBUG_CPPFLAGS) sbin_PROGRAMS += ztest diff --git a/config/Rules.am b/config/Rules.am index 2e463ae6083..30c5f353cd2 100644 --- a/config/Rules.am +++ b/config/Rules.am @@ -21,7 +21,9 @@ AM_CFLAGS += $(IMPLICIT_FALLTHROUGH) AM_CFLAGS += $(DEBUG_CFLAGS) AM_CFLAGS += $(ASAN_CFLAGS) AM_CFLAGS += $(UBSAN_CFLAGS) -AM_CFLAGS += $(CODE_COVERAGE_CFLAGS) $(NO_FORMAT_ZERO_LENGTH) +AM_CFLAGS += $(CODE_COVERAGE_CFLAGS) +AM_CFLAGS += $(NO_FORMAT_ZERO_LENGTH) +AM_CFLAGS += $(NO_FORMAT_TRUNCATION) if BUILD_FREEBSD AM_CFLAGS += -fPIC -Werror -Wno-unknown-pragmas -Wno-enum-conversion AM_CFLAGS += -include $(top_srcdir)/include/os/freebsd/spl/sys/ccompile.h From 8f2f6cd2ac688916adb2caf979daf95365ccb48f Mon Sep 17 00:00:00 2001 From: Rob Norris Date: Mon, 19 Feb 2024 21:19:32 +1100 Subject: [PATCH 20/20] ddt: reduce DDT_NAMELEN This is the buffer size passed to ddt_object_name(), to expand the DMU_POOL_DDT format. That format inserts the table checksum, class and type names, which as I write this are max 6, 9 and 3, respectively. Reviewed by: Brian Behlendorf Signed-off-by: Rob Norris Sponsored-by: https://despairlabs.com/sponsor/ Closes #15908 --- include/sys/ddt_impl.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/include/sys/ddt_impl.h b/include/sys/ddt_impl.h index d6693658885..52b927b7519 100644 --- a/include/sys/ddt_impl.h +++ b/include/sys/ddt_impl.h @@ -66,7 +66,12 @@ extern void ddt_stat_update(ddt_t *ddt, ddt_entry_t *dde, uint64_t neg); * outside of the DDT implementation proper, and if you do, consider moving * them up. */ -#define DDT_NAMELEN 110 + +/* + * Enough room to expand DMU_POOL_DDT format for all possible DDT + * checksum/class/type combinations. + */ +#define DDT_NAMELEN 32 extern uint64_t ddt_phys_total_refcnt(const ddt_entry_t *dde);