diff --git a/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c b/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c index bfb71145516..1686467aafe 100644 --- a/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c +++ b/cddl/contrib/opensolaris/cmd/zpool/zpool_main.c @@ -1295,12 +1295,13 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv, int namewidth, int depth, boolean_t isspare) { nvlist_t **child; - uint_t c, children; + uint_t c, vsc, children; pool_scan_stat_t *ps = NULL; vdev_stat_t *vs; char rbuf[6], wbuf[6], cbuf[6]; char *vname; uint64_t notpresent; + uint64_t ashift; spare_cbdata_t cb; const char *state; @@ -1309,7 +1310,7 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv, children = 0; verify(nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_VDEV_STATS, - (uint64_t **)&vs, &c) == 0); + (uint64_t **)&vs, &vsc) == 0); state = zpool_state_to_name(vs->vs_state, vs->vs_aux); if (isspare) { @@ -1363,6 +1364,10 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv, (void) printf(gettext("unsupported feature(s)")); break; + case VDEV_AUX_ASHIFT_TOO_BIG: + (void) printf(gettext("unsupported minimum blocksize")); + break; + case VDEV_AUX_SPARED: verify(nvlist_lookup_uint64(nv, ZPOOL_CONFIG_GUID, &cb.cb_guid) == 0); @@ -1405,6 +1410,12 @@ print_status_config(zpool_handle_t *zhp, const char *name, nvlist_t *nv, (void) printf(gettext("corrupted data")); break; } + } else if (children == 0 && !isspare && + VDEV_STAT_VALID(vs_physical_ashift, vsc) && + vs->vs_configured_ashift < vs->vs_physical_ashift) { + (void) printf( + gettext(" block size: %dB configured, %dB native"), + 1 << vs->vs_configured_ashift, 1 << vs->vs_physical_ashift); } (void) nvlist_lookup_uint64_array(nv, ZPOOL_CONFIG_SCAN_STATS, @@ -4268,6 +4279,15 @@ status_callback(zpool_handle_t *zhp, void *data) "'zpool clear'.\n")); break; + case ZPOOL_STATUS_NON_NATIVE_ASHIFT: + (void) printf(gettext("status: One or more devices are " + "configured to use a non-native block size.\n" + "\tExpect reduced performance.\n")); + (void) printf(gettext("action: Replace affected devices with " + "devices that support the\n\tconfigured block size, or " + "migrate data to a properly configured\n\tpool.\n")); + break; + default: /* * The remaining errors can't actually be generated, yet. diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h index 981e9bd9e07..0a5ca82e4c9 100644 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h +++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs.h @@ -326,6 +326,7 @@ typedef enum { ZPOOL_STATUS_RESILVERING, /* device being resilvered */ ZPOOL_STATUS_OFFLINE_DEV, /* device online */ ZPOOL_STATUS_REMOVED_DEV, /* removed device */ + ZPOOL_STATUS_NON_NATIVE_ASHIFT, /* (e.g. 512e dev with ashift of 9) */ /* * Finally, the following indicates a healthy pool. diff --git a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_status.c b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_status.c index 71b27a12140..906883ca60d 100644 --- a/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_status.c +++ b/cddl/contrib/opensolaris/lib/libzfs/common/libzfs_status.c @@ -73,57 +73,66 @@ static char *zfs_msgid_table[] = { /* ARGSUSED */ static int -vdev_missing(uint64_t state, uint64_t aux, uint64_t errs) +vdev_missing(vdev_stat_t *vs, uint_t vsc) { - return (state == VDEV_STATE_CANT_OPEN && - aux == VDEV_AUX_OPEN_FAILED); + return (vs->vs_state == VDEV_STATE_CANT_OPEN && + vs->vs_aux == VDEV_AUX_OPEN_FAILED); } /* ARGSUSED */ static int -vdev_faulted(uint64_t state, uint64_t aux, uint64_t errs) +vdev_faulted(vdev_stat_t *vs, uint_t vsc) { - return (state == VDEV_STATE_FAULTED); + return (vs->vs_state == VDEV_STATE_FAULTED); } /* ARGSUSED */ static int -vdev_errors(uint64_t state, uint64_t aux, uint64_t errs) +vdev_errors(vdev_stat_t *vs, uint_t vsc) { - return (state == VDEV_STATE_DEGRADED || errs != 0); + return (vs->vs_state == VDEV_STATE_DEGRADED || + vs->vs_read_errors != 0 || vs->vs_write_errors != 0 || + vs->vs_checksum_errors != 0); } /* ARGSUSED */ static int -vdev_broken(uint64_t state, uint64_t aux, uint64_t errs) +vdev_broken(vdev_stat_t *vs, uint_t vsc) { - return (state == VDEV_STATE_CANT_OPEN); + return (vs->vs_state == VDEV_STATE_CANT_OPEN); } /* ARGSUSED */ static int -vdev_offlined(uint64_t state, uint64_t aux, uint64_t errs) +vdev_offlined(vdev_stat_t *vs, uint_t vsc) { - return (state == VDEV_STATE_OFFLINE); + return (vs->vs_state == VDEV_STATE_OFFLINE); } /* ARGSUSED */ static int -vdev_removed(uint64_t state, uint64_t aux, uint64_t errs) +vdev_removed(vdev_stat_t *vs, uint_t vsc) { - return (state == VDEV_STATE_REMOVED); + return (vs->vs_state == VDEV_STATE_REMOVED); +} + +static int +vdev_non_native_ashift(vdev_stat_t *vs, uint_t vsc) +{ + return (VDEV_STAT_VALID(vs_physical_ashift, vsc) && + vs->vs_configured_ashift < vs->vs_physical_ashift); } /* * Detect if any leaf devices that have seen errors or could not be opened. */ static boolean_t -find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t)) +find_vdev_problem(nvlist_t *vdev, int (*func)(vdev_stat_t *, uint_t), + boolean_t ignore_replacing) { nvlist_t **child; vdev_stat_t *vs; - uint_t c, children; - char *type; + uint_t c, vsc, children; /* * Ignore problems within a 'replacing' vdev, since we're presumably in @@ -131,23 +140,25 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t)) * out again. We'll pick up the fact that a resilver is happening * later. */ - verify(nvlist_lookup_string(vdev, ZPOOL_CONFIG_TYPE, &type) == 0); - if (strcmp(type, VDEV_TYPE_REPLACING) == 0) - return (B_FALSE); + if (ignore_replacing == B_TRUE) { + char *type; + + verify(nvlist_lookup_string(vdev, ZPOOL_CONFIG_TYPE, + &type) == 0); + if (strcmp(type, VDEV_TYPE_REPLACING) == 0) + return (B_FALSE); + } if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_CHILDREN, &child, &children) == 0) { for (c = 0; c < children; c++) - if (find_vdev_problem(child[c], func)) + if (find_vdev_problem(child[c], func, ignore_replacing)) return (B_TRUE); } else { verify(nvlist_lookup_uint64_array(vdev, ZPOOL_CONFIG_VDEV_STATS, - (uint64_t **)&vs, &c) == 0); + (uint64_t **)&vs, &vsc) == 0); - if (func(vs->vs_state, vs->vs_aux, - vs->vs_read_errors + - vs->vs_write_errors + - vs->vs_checksum_errors)) + if (func(vs, vsc) != 0) return (B_TRUE); } @@ -157,7 +168,7 @@ find_vdev_problem(nvlist_t *vdev, int (*func)(uint64_t, uint64_t, uint64_t)) if (nvlist_lookup_nvlist_array(vdev, ZPOOL_CONFIG_L2CACHE, &child, &children) == 0) { for (c = 0; c < children; c++) - if (find_vdev_problem(child[c], func)) + if (find_vdev_problem(child[c], func, ignore_replacing)) return (B_TRUE); } @@ -270,15 +281,15 @@ check_status(nvlist_t *config, boolean_t isimport) * Bad devices in non-replicated config. */ if (vs->vs_state == VDEV_STATE_CANT_OPEN && - find_vdev_problem(nvroot, vdev_faulted)) + find_vdev_problem(nvroot, vdev_faulted, B_TRUE)) return (ZPOOL_STATUS_FAULTED_DEV_NR); if (vs->vs_state == VDEV_STATE_CANT_OPEN && - find_vdev_problem(nvroot, vdev_missing)) + find_vdev_problem(nvroot, vdev_missing, B_TRUE)) return (ZPOOL_STATUS_MISSING_DEV_NR); if (vs->vs_state == VDEV_STATE_CANT_OPEN && - find_vdev_problem(nvroot, vdev_broken)) + find_vdev_problem(nvroot, vdev_broken, B_TRUE)) return (ZPOOL_STATUS_CORRUPT_LABEL_NR); /* @@ -300,31 +311,37 @@ check_status(nvlist_t *config, boolean_t isimport) /* * Missing devices in a replicated config. */ - if (find_vdev_problem(nvroot, vdev_faulted)) + if (find_vdev_problem(nvroot, vdev_faulted, B_TRUE)) return (ZPOOL_STATUS_FAULTED_DEV_R); - if (find_vdev_problem(nvroot, vdev_missing)) + if (find_vdev_problem(nvroot, vdev_missing, B_TRUE)) return (ZPOOL_STATUS_MISSING_DEV_R); - if (find_vdev_problem(nvroot, vdev_broken)) + if (find_vdev_problem(nvroot, vdev_broken, B_TRUE)) return (ZPOOL_STATUS_CORRUPT_LABEL_R); /* * Devices with errors */ - if (!isimport && find_vdev_problem(nvroot, vdev_errors)) + if (!isimport && find_vdev_problem(nvroot, vdev_errors, B_TRUE)) return (ZPOOL_STATUS_FAILING_DEV); /* * Offlined devices */ - if (find_vdev_problem(nvroot, vdev_offlined)) + if (find_vdev_problem(nvroot, vdev_offlined, B_TRUE)) return (ZPOOL_STATUS_OFFLINE_DEV); /* * Removed device */ - if (find_vdev_problem(nvroot, vdev_removed)) + if (find_vdev_problem(nvroot, vdev_removed, B_TRUE)) return (ZPOOL_STATUS_REMOVED_DEV); + /* + * Suboptimal, but usable, ashift configuration. + */ + if (find_vdev_problem(nvroot, vdev_non_native_ashift, B_FALSE)) + return (ZPOOL_STATUS_NON_NATIVE_ASHIFT); + /* * Outdated, but usable, version */ diff --git a/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c b/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c index c5c7b66142a..71c3dceaa0f 100644 --- a/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c +++ b/cddl/contrib/opensolaris/lib/libzpool/common/kernel.c @@ -591,6 +591,12 @@ dprintf_setup(int *argc, char **argv) dprintf_print_all = 1; } +int +sysctl_handle_64(SYSCTL_HANDLER_ARGS) +{ + return (0); +} + /* * ========================================================================= * debug printfs diff --git a/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h b/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h index 21075055a1d..da153c7c166 100644 --- a/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h +++ b/cddl/contrib/opensolaris/lib/libzpool/common/sys/zfs_context.h @@ -659,11 +659,55 @@ typedef uint32_t idmap_rid_t; #define SX_SYSINIT(name, lock, desc) +#define SYSCTL_HANDLER_ARGS struct sysctl_oid *oidp, void *arg1, \ + intptr_t arg2, struct sysctl_req *req + +/* + * This describes the access space for a sysctl request. This is needed + * so that we can use the interface from the kernel or from user-space. + */ +struct sysctl_req { + struct thread *td; /* used for access checking */ + int lock; /* wiring state */ + void *oldptr; + size_t oldlen; + size_t oldidx; + int (*oldfunc)(struct sysctl_req *, const void *, size_t); + void *newptr; + size_t newlen; + size_t newidx; + int (*newfunc)(struct sysctl_req *, void *, size_t); + size_t validlen; + int flags; +}; + +SLIST_HEAD(sysctl_oid_list, sysctl_oid); + +/* + * This describes one "oid" in the MIB tree. Potentially more nodes can + * be hidden behind it, expanded by the handler. + */ +struct sysctl_oid { + struct sysctl_oid_list *oid_parent; + SLIST_ENTRY(sysctl_oid) oid_link; + int oid_number; + u_int oid_kind; + void *oid_arg1; + intptr_t oid_arg2; + const char *oid_name; + int (*oid_handler)(SYSCTL_HANDLER_ARGS); + const char *oid_fmt; + int oid_refcnt; + u_int oid_running; + const char *oid_descr; +}; + #define SYSCTL_DECL(...) #define SYSCTL_NODE(...) #define SYSCTL_INT(...) #define SYSCTL_UINT(...) #define SYSCTL_ULONG(...) +#define SYSCTL_PROC(...) #define SYSCTL_QUAD(...) #define SYSCTL_UQUAD(...) #ifdef TUNABLE_INT @@ -675,6 +719,8 @@ typedef uint32_t idmap_rid_t; #define TUNABLE_ULONG(...) #define TUNABLE_QUAD(...) +int sysctl_handle_64(SYSCTL_HANDLER_ARGS); + /* Errors */ #ifndef ERESTART diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c index f0255dec37e..cd3f8ac7591 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/arc.c @@ -5147,7 +5147,7 @@ l2arc_compress_buf(l2arc_buf_hdr_t *l2hdr) len = l2hdr->b_asize; cdata = zio_data_buf_alloc(len); csize = zio_compress_data(ZIO_COMPRESS_LZ4, l2hdr->b_tmp_cdata, - cdata, l2hdr->b_asize); + cdata, l2hdr->b_asize, (size_t)SPA_MINBLOCKSIZE); if (csize == 0) { /* zero block, indicate that there's nothing to write */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c index 040f255c63b..f2f71392300 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/metaslab.c @@ -180,6 +180,27 @@ metaslab_class_space_update(metaslab_class_t *mc, int64_t alloc_delta, atomic_add_64(&mc->mc_dspace, dspace_delta); } +void +metaslab_class_minblocksize_update(metaslab_class_t *mc) +{ + metaslab_group_t *mg; + vdev_t *vd; + uint64_t minashift = UINT64_MAX; + + if ((mg = mc->mc_rotor) == NULL) { + mc->mc_minblocksize = SPA_MINBLOCKSIZE; + return; + } + + do { + vd = mg->mg_vd; + if (vd->vdev_ashift < minashift) + minashift = vd->vdev_ashift; + } while ((mg = mg->mg_next) != mc->mc_rotor); + + mc->mc_minblocksize = 1ULL << minashift; +} + uint64_t metaslab_class_get_alloc(metaslab_class_t *mc) { @@ -204,6 +225,12 @@ metaslab_class_get_dspace(metaslab_class_t *mc) return (spa_deflate(mc->mc_spa) ? mc->mc_dspace : mc->mc_space); } +uint64_t +metaslab_class_get_minblocksize(metaslab_class_t *mc) +{ + return (mc->mc_minblocksize); +} + /* * ========================================================================== * Metaslab groups @@ -295,6 +322,7 @@ metaslab_group_activate(metaslab_group_t *mg) mgnext->mg_prev = mg; } mc->mc_rotor = mg; + metaslab_class_minblocksize_update(mc); } void @@ -326,6 +354,7 @@ metaslab_group_passivate(metaslab_group_t *mg) mg->mg_prev = NULL; mg->mg_next = NULL; + metaslab_class_minblocksize_update(mc); } static void diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c index c9374ad53b1..cfeb21c6616 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa.c @@ -3424,6 +3424,7 @@ spa_create(const char *pool, nvlist_t *nvroot, nvlist_t *props, (error = spa_validate_aux(spa, nvroot, txg, VDEV_ALLOC_ADD)) == 0) { for (int c = 0; c < rvd->vdev_children; c++) { + vdev_ashift_optimize(rvd->vdev_child[c]); vdev_metaslab_set_size(rvd->vdev_child[c]); vdev_expand(rvd->vdev_child[c], txg); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c index ad6b3a86afb..2d0cf230932 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/spa_config.c @@ -519,8 +519,10 @@ spa_config_update(spa_t *spa, int what) */ for (c = 0; c < rvd->vdev_children; c++) { vdev_t *tvd = rvd->vdev_child[c]; - if (tvd->vdev_ms_array == 0) + if (tvd->vdev_ms_array == 0) { + vdev_ashift_optimize(tvd); vdev_metaslab_set_size(tvd); + } vdev_expand(tvd, txg); } } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h index d6c0bf4c94c..1a4fe7607b3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab.h @@ -70,6 +70,7 @@ extern uint64_t metaslab_class_get_alloc(metaslab_class_t *mc); extern uint64_t metaslab_class_get_space(metaslab_class_t *mc); extern uint64_t metaslab_class_get_dspace(metaslab_class_t *mc); extern uint64_t metaslab_class_get_deferred(metaslab_class_t *mc); +extern uint64_t metaslab_class_get_minblocksize(metaslab_class_t *mc); extern metaslab_group_t *metaslab_group_create(metaslab_class_t *mc, vdev_t *vd); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h index 138e14ef59f..e9fa73ebaae 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/metaslab_impl.h @@ -49,6 +49,7 @@ struct metaslab_class { uint64_t mc_deferred; /* total deferred frees */ uint64_t mc_space; /* total space (alloc + free) */ uint64_t mc_dspace; /* total deflated space */ + uint64_t mc_minblocksize; }; struct metaslab_group { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h index a905cb4a8c3..0fb279472c9 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/spa.h @@ -92,6 +92,17 @@ struct dsl_dataset; #define SPA_BLOCKSIZES (SPA_MAXBLOCKSHIFT - SPA_MINBLOCKSHIFT + 1) +/* + * Maximum supported logical ashift. + * + * The current 8k allocation block size limit is due to the 8k + * aligned/sized operations performed by vdev_probe() on + * vdev_label->vl_pad2. Using another "safe region" for these tests + * would allow the limit to be raised to 16k, at the expense of + * only having 8 available uberblocks in the label area. + */ +#define SPA_MAXASHIFT 13 + /* * Size of block to hold the configuration data (a packed nvlist) */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h index 920b1b0b356..74bcf5fd3e2 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev.h @@ -78,6 +78,7 @@ extern void vdev_rele(vdev_t *); extern int vdev_metaslab_init(vdev_t *vd, uint64_t txg); extern void vdev_metaslab_fini(vdev_t *vd); extern void vdev_metaslab_set_size(vdev_t *); +extern void vdev_ashift_optimize(vdev_t *); extern void vdev_expand(vdev_t *vd, uint64_t txg); extern void vdev_split(vdev_t *vd); extern void vdev_deadman(vdev_t *vd); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h index aaded0a515a..99dffb5fc0c 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/vdev_impl.h @@ -57,7 +57,7 @@ typedef struct vdev_cache_entry vdev_cache_entry_t; * Virtual device operations */ typedef int vdev_open_func_t(vdev_t *vd, uint64_t *size, uint64_t *max_size, - uint64_t *ashift); + uint64_t *logical_ashift, uint64_t *physical_ashift); typedef void vdev_close_func_t(vdev_t *vd); typedef uint64_t vdev_asize_func_t(vdev_t *vd, uint64_t psize); typedef int vdev_io_start_func_t(zio_t *zio); @@ -123,6 +123,24 @@ struct vdev { uint64_t vdev_min_asize; /* min acceptable asize */ uint64_t vdev_max_asize; /* max acceptable asize */ uint64_t vdev_ashift; /* block alignment shift */ + /* + * Logical block alignment shift + * + * The smallest sized/aligned I/O supported by the device. + */ + uint64_t vdev_logical_ashift; + /* + * Physical block alignment shift + * + * The device supports logical I/Os with vdev_logical_ashift + * size/alignment, but optimum performance will be achieved by + * aligning/sizing requests to vdev_physical_ashift. Smaller + * requests may be inflated or incur device level read-modify-write + * operations. + * + * May be 0 to indicate no preference (i.e. use vdev_logical_ashift). + */ + uint64_t vdev_physical_ashift; uint64_t vdev_state; /* see VDEV_STATE_* #defines */ uint64_t vdev_prevstate; /* used when reopening a vdev */ vdev_ops_t *vdev_ops; /* vdev operations */ diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h index f4cb84511a2..14e5c6a2ca9 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/sys/zio_compress.h @@ -79,7 +79,7 @@ extern int lz4_decompress(void *src, void *dst, size_t s_len, size_t d_len, * Compress and decompress data if necessary. */ extern size_t zio_compress_data(enum zio_compress c, void *src, void *dst, - size_t s_len); + size_t s_len, size_t minblocksize); extern int zio_decompress_data(enum zio_compress c, void *src, void *dst, size_t s_len, size_t d_len); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c index 9e01023e535..8fa4d4416e1 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev.c @@ -52,6 +52,51 @@ SYSCTL_NODE(_vfs_zfs, OID_AUTO, vdev, CTLFLAG_RW, 0, "ZFS VDEV"); * Virtual device management. */ +/** + * The limit for ZFS to automatically increase a top-level vdev's ashift + * from logical ashift to physical ashift. + * + * Example: one or more 512B emulation child vdevs + * child->vdev_ashift = 9 (512 bytes) + * child->vdev_physical_ashift = 12 (4096 bytes) + * zfs_max_auto_ashift = 11 (2048 bytes) + * + * On pool creation or the addition of a new top-leve vdev, ZFS will + * bump the ashift of the top-level vdev to 2048. + * + * Example: one or more 512B emulation child vdevs + * child->vdev_ashift = 9 (512 bytes) + * child->vdev_physical_ashift = 12 (4096 bytes) + * zfs_max_auto_ashift = 13 (8192 bytes) + * + * On pool creation or the addition of a new top-leve vdev, ZFS will + * bump the ashift of the top-level vdev to 4096. + */ +static uint64_t zfs_max_auto_ashift = SPA_MAXASHIFT; + +static int +sysctl_vfs_zfs_max_auto_ashift(SYSCTL_HANDLER_ARGS) +{ + uint64_t val; + int err; + + val = zfs_max_auto_ashift; + err = sysctl_handle_64(oidp, &val, 0, req); + if (err != 0 || req->newptr == NULL) + return (err); + + if (val > SPA_MAXASHIFT) + val = SPA_MAXASHIFT; + + zfs_max_auto_ashift = val; + + return (0); +} +SYSCTL_PROC(_vfs_zfs, OID_AUTO, max_auto_ashift, + CTLTYPE_U64 | CTLFLAG_MPSAFE | CTLFLAG_RW, 0, sizeof(uint64_t), + sysctl_vfs_zfs_max_auto_ashift, "QU", + "Cap on logical -> physical ashift adjustment on new top-level vdevs."); + static vdev_ops_t *vdev_ops_table[] = { &vdev_root_ops, &vdev_raidz_ops, @@ -746,6 +791,8 @@ vdev_add_parent(vdev_t *cvd, vdev_ops_t *ops) mvd->vdev_min_asize = cvd->vdev_min_asize; mvd->vdev_max_asize = cvd->vdev_max_asize; mvd->vdev_ashift = cvd->vdev_ashift; + mvd->vdev_logical_ashift = cvd->vdev_logical_ashift; + mvd->vdev_physical_ashift = cvd->vdev_physical_ashift; mvd->vdev_state = cvd->vdev_state; mvd->vdev_crtxg = cvd->vdev_crtxg; @@ -777,6 +824,8 @@ vdev_remove_parent(vdev_t *cvd) mvd->vdev_ops == &vdev_replacing_ops || mvd->vdev_ops == &vdev_spare_ops); cvd->vdev_ashift = mvd->vdev_ashift; + cvd->vdev_logical_ashift = mvd->vdev_logical_ashift; + cvd->vdev_physical_ashift = mvd->vdev_physical_ashift; vdev_remove_child(mvd, cvd); vdev_remove_child(pvd, mvd); @@ -1120,7 +1169,8 @@ vdev_open(vdev_t *vd) uint64_t osize = 0; uint64_t max_osize = 0; uint64_t asize, max_asize, psize; - uint64_t ashift = 0; + uint64_t logical_ashift = 0; + uint64_t physical_ashift = 0; ASSERT(vd->vdev_open_thread == curthread || spa_config_held(spa, SCL_STATE_ALL, RW_WRITER) == SCL_STATE_ALL); @@ -1150,7 +1200,8 @@ vdev_open(vdev_t *vd) return (SET_ERROR(ENXIO)); } - error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, &ashift); + error = vd->vdev_ops->vdev_op_open(vd, &osize, &max_osize, + &logical_ashift, &physical_ashift); /* * Reset the vdev_reopening flag so that we actually close @@ -1248,6 +1299,17 @@ vdev_open(vdev_t *vd) return (SET_ERROR(EINVAL)); } + vd->vdev_physical_ashift = + MAX(physical_ashift, vd->vdev_physical_ashift); + vd->vdev_logical_ashift = MAX(logical_ashift, vd->vdev_logical_ashift); + vd->vdev_ashift = MAX(vd->vdev_logical_ashift, vd->vdev_ashift); + + if (vd->vdev_logical_ashift > SPA_MAXASHIFT) { + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_ASHIFT_TOO_BIG); + return (EINVAL); + } + if (vd->vdev_asize == 0) { /* * This is the first-ever open, so use the computed values. @@ -1255,19 +1317,15 @@ vdev_open(vdev_t *vd) */ vd->vdev_asize = asize; vd->vdev_max_asize = max_asize; - vd->vdev_ashift = MAX(ashift, vd->vdev_ashift); } else { /* - * Detect if the alignment requirement has increased. - * We don't want to make the pool unavailable, just - * issue a warning instead. + * Make sure the alignment requirement hasn't increased. */ - if (ashift > vd->vdev_top->vdev_ashift && + if (vd->vdev_ashift > vd->vdev_top->vdev_ashift && vd->vdev_ops->vdev_op_leaf) { - cmn_err(CE_WARN, - "Disk, '%s', has a block alignment that is " - "larger than the pool's alignment\n", - vd->vdev_path); + vdev_set_state(vd, B_TRUE, VDEV_STATE_CANT_OPEN, + VDEV_AUX_BAD_LABEL); + return (EINVAL); } vd->vdev_max_asize = max_asize; } @@ -1577,6 +1635,23 @@ vdev_metaslab_set_size(vdev_t *vd) vd->vdev_ms_shift = MAX(vd->vdev_ms_shift, SPA_MAXBLOCKSHIFT); } +/* + * Maximize performance by inflating the configured ashift for + * top level vdevs to be as close to the physical ashift as + * possible without exceeding the administrator specified + * limit. + */ +void +vdev_ashift_optimize(vdev_t *vd) +{ + if (vd == vd->vdev_top && + (vd->vdev_ashift < vd->vdev_physical_ashift) && + (vd->vdev_ashift < zfs_max_auto_ashift)) { + vd->vdev_ashift = MIN(zfs_max_auto_ashift, + vd->vdev_physical_ashift); + } +} + void vdev_dirty(vdev_t *vd, int flags, void *arg, uint64_t txg) { @@ -2595,6 +2670,10 @@ vdev_get_stats(vdev_t *vd, vdev_stat_t *vs) if (vd->vdev_ops->vdev_op_leaf) vs->vs_rsize += VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; vs->vs_esize = vd->vdev_max_asize - vd->vdev_asize; + vs->vs_configured_ashift = vd->vdev_top != NULL + ? vd->vdev_top->vdev_ashift : vd->vdev_ashift; + vs->vs_logical_ashift = vd->vdev_logical_ashift; + vs->vs_physical_ashift = vd->vdev_physical_ashift; mutex_exit(&vd->vdev_stat_lock); /* diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c index fad12770b9b..1d8fa5cbe40 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_file.c @@ -49,7 +49,7 @@ vdev_file_rele(vdev_t *vd) static int vdev_file_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, - uint64_t *ashift) + uint64_t *logical_ashift, uint64_t *physical_ashift) { vdev_file_t *vf; vnode_t *vp; @@ -130,7 +130,8 @@ skip_open: } *max_psize = *psize = vattr.va_size; - *ashift = SPA_MINBLOCKSHIFT; + *logical_ashift = SPA_MINBLOCKSHIFT; + *physical_ashift = SPA_MINBLOCKSHIFT; return (0); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c index f79f08d94bb..15685a5bcb3 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_geom.c @@ -576,7 +576,7 @@ vdev_geom_open_by_path(vdev_t *vd, int check_guid) static int vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, - uint64_t *ashift) + uint64_t *logical_ashift, uint64_t *physical_ashift) { struct g_provider *pp; struct g_consumer *cp; @@ -662,9 +662,13 @@ vdev_geom_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, *max_psize = *psize = pp->mediasize; /* - * Determine the device's minimum transfer size. + * Determine the device's minimum transfer size and preferred + * transfer size. */ - *ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1; + *logical_ashift = highbit(MAX(pp->sectorsize, SPA_MINBLOCKSIZE)) - 1; + *physical_ashift = 0; + if (pp->stripesize) + *physical_ashift = highbit(pp->stripesize) - 1; /* * Clear the nowritecache settings, so that on a vdev_reopen() diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c index d4eaa3b09aa..34c28fa6619 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_mirror.c @@ -132,7 +132,7 @@ vdev_mirror_map_alloc(zio_t *zio) static int vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, - uint64_t *ashift) + uint64_t *logical_ashift, uint64_t *physical_ashift) { int numerrors = 0; int lasterror = 0; @@ -155,7 +155,9 @@ vdev_mirror_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; - *ashift = MAX(*ashift, cvd->vdev_ashift); + *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); + *physical_ashift = MAX(*physical_ashift, + cvd->vdev_physical_ashift); } if (numerrors == vd->vdev_children) { diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c index b9eb99d1800..3088ab9e767 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_missing.c @@ -45,7 +45,7 @@ /* ARGSUSED */ static int vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, - uint64_t *ashift) + uint64_t *logical_ashift, uint64_t *physical_ashift) { /* * Really this should just fail. But then the root vdev will be in the @@ -55,7 +55,8 @@ vdev_missing_open(vdev_t *vd, uint64_t *psize, uint64_t *max_psize, */ *psize = 0; *max_psize = 0; - *ashift = 0; + *logical_ashift = 0; + *physical_ashift = 0; return (0); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c index 97b98935183..5e3c6c7a8a0 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_raidz.c @@ -1478,7 +1478,7 @@ vdev_raidz_reconstruct(raidz_map_t *rm, int *t, int nt) static int vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, - uint64_t *ashift) + uint64_t *logical_ashift, uint64_t *physical_ashift) { vdev_t *cvd; uint64_t nparity = vd->vdev_nparity; @@ -1507,7 +1507,9 @@ vdev_raidz_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, *asize = MIN(*asize - 1, cvd->vdev_asize - 1) + 1; *max_asize = MIN(*max_asize - 1, cvd->vdev_max_asize - 1) + 1; - *ashift = MAX(*ashift, cvd->vdev_ashift); + *logical_ashift = MAX(*logical_ashift, cvd->vdev_ashift); + *physical_ashift = MAX(*physical_ashift, + cvd->vdev_physical_ashift); } *asize *= vd->vdev_children; diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c index a5442a55eb5..03068880f76 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/vdev_root.c @@ -55,7 +55,7 @@ too_many_errors(vdev_t *vd, int numerrors) static int vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, - uint64_t *ashift) + uint64_t *logical_ashift, uint64_t *physical_ashift) { int lasterror = 0; int numerrors = 0; @@ -83,7 +83,8 @@ vdev_root_open(vdev_t *vd, uint64_t *asize, uint64_t *max_asize, *asize = 0; *max_asize = 0; - *ashift = 0; + *logical_ashift = 0; + *physical_ashift = 0; return (0); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c index f32acd877de..b92ceacd4ae 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio.c @@ -1137,8 +1137,10 @@ zio_write_bp_init(zio_t *zio) } if (compress != ZIO_COMPRESS_OFF) { + metaslab_class_t *mc = spa_normal_class(spa); void *cbuf = zio_buf_alloc(lsize); - psize = zio_compress_data(compress, zio->io_data, cbuf, lsize); + psize = zio_compress_data(compress, zio->io_data, cbuf, lsize, + (size_t)metaslab_class_get_minblocksize(mc)); if (psize == 0 || psize == lsize) { compress = ZIO_COMPRESS_OFF; zio_buf_free(cbuf, lsize); diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c index d3c6ae17cee..04eac7b15cf 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zio_compress.c @@ -77,7 +77,8 @@ zio_compress_select(enum zio_compress child, enum zio_compress parent) } size_t -zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len) +zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len, + size_t minblocksize) { uint64_t *word, *word_end; size_t c_len, d_len, r_len; @@ -102,7 +103,7 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len) return (s_len); /* Compress at least 12.5% */ - d_len = P2ALIGN(s_len - (s_len >> 3), (size_t)SPA_MINBLOCKSIZE); + d_len = P2ALIGN(s_len - (s_len >> 3), minblocksize); if (d_len == 0) return (s_len); @@ -115,14 +116,14 @@ zio_compress_data(enum zio_compress c, void *src, void *dst, size_t s_len) * Cool. We compressed at least as much as we were hoping to. * For both security and repeatability, pad out the last sector. */ - r_len = P2ROUNDUP(c_len, (size_t)SPA_MINBLOCKSIZE); + r_len = P2ROUNDUP(c_len, minblocksize); if (r_len > c_len) { bzero((char *)dst + c_len, r_len - c_len); c_len = r_len; } ASSERT3U(c_len, <=, d_len); - ASSERT(P2PHASE(c_len, (size_t)SPA_MINBLOCKSIZE) == 0); + ASSERT(P2PHASE(c_len, minblocksize) == 0); return (c_len); } diff --git a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h index 849b975113e..454c28a7d75 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h +++ b/sys/cddl/contrib/opensolaris/uts/common/sys/fs/zfs.h @@ -621,7 +621,8 @@ typedef enum vdev_aux { VDEV_AUX_IO_FAILURE, /* experienced I/O failure */ VDEV_AUX_BAD_LOG, /* cannot read log chain(s) */ VDEV_AUX_EXTERNAL, /* external diagnosis */ - VDEV_AUX_SPLIT_POOL /* vdev was split off into another pool */ + VDEV_AUX_SPLIT_POOL, /* vdev was split off into another pool */ + VDEV_AUX_ASHIFT_TOO_BIG /* vdev's min block size is too large */ } vdev_aux_t; /* @@ -715,7 +716,13 @@ typedef struct vdev_stat { uint64_t vs_self_healed; /* self-healed bytes */ uint64_t vs_scan_removing; /* removing? */ uint64_t vs_scan_processed; /* scan processed bytes */ + uint64_t vs_configured_ashift; /* TLV vdev_ashift */ + uint64_t vs_logical_ashift; /* vdev_logical_ashift */ + uint64_t vs_physical_ashift; /* vdev_physical_ashift */ } vdev_stat_t; +#define VDEV_STAT_VALID(field, uint64_t_field_count) \ + ((uint64_t_field_count * sizeof(uint64_t)) >= \ + (offsetof(vdev_stat_t, field) + sizeof(((vdev_stat_t *)NULL)->field))) /* * DDT statistics. Note: all fields should be 64-bit because this