From 3c2db0ef4345c4f065a14ca2b1fa86d59f35f9d3 Mon Sep 17 00:00:00 2001 From: Toomas Soome Date: Sun, 15 Dec 2019 21:52:40 +0000 Subject: [PATCH] loader: rewrite zfs vdev initialization In some cases the pool discovery will get stuck in infinite loop while setting up the vdev children. To fix, we split the vdev setup into two parts, first we create vdevs based on configuration we do get from pool label, then, we process pool config from MOS and update the pool config if needed. Testing done: confirm previously hung loader is not hung any more. MFC after: 1 week --- stand/libsa/zfs/zfsimpl.c | 699 ++++++++++++++++++++++-------------- sys/cddl/boot/zfs/zfsimpl.h | 10 +- sys/cddl/boot/zfs/zfssubr.c | 10 +- 3 files changed, 451 insertions(+), 268 deletions(-) diff --git a/stand/libsa/zfs/zfsimpl.c b/stand/libsa/zfs/zfsimpl.c index 5f95489b610..2cd1311429e 100644 --- a/stand/libsa/zfs/zfsimpl.c +++ b/stand/libsa/zfs/zfsimpl.c @@ -35,6 +35,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include "zfsimpl.h" #include "zfssubr.c" @@ -247,7 +248,7 @@ nvlist_find(const unsigned char *nvlist, const char *name, int type, const char *pairname; xdr_int(&p, &namelen); - pairname = (const char*)p; + pairname = (const char *)p; p += roundup(namelen, 4); xdr_int(&p, &pairtype); @@ -261,12 +262,12 @@ nvlist_find(const unsigned char *nvlist, const char *name, int type, } else if (type == DATA_TYPE_STRING) { int len; xdr_int(&p, &len); - (*(const char**)valuep) = (const char*)p; + (*(const char **)valuep) = (const char *)p; return (0); } else if (type == DATA_TYPE_NVLIST || type == DATA_TYPE_NVLIST_ARRAY) { - (*(const unsigned char**)valuep) = - (const unsigned char*)p; + (*(const unsigned char **)valuep) = + (const unsigned char *)p; return (0); } else { return (EIO); @@ -312,7 +313,7 @@ nvlist_check_features_for_read(const unsigned char *nvlist) found = 0; xdr_int(&p, &namelen); - pairname = (const char*)p; + pairname = (const char *)p; p += roundup(namelen, 4); xdr_int(&p, &pairtype); @@ -419,7 +420,7 @@ nvlist_print(const unsigned char *nvlist, unsigned int indent) const char *pairname; xdr_int(&p, &namelen); - pairname = (const char*)p; + pairname = (const char *)p; p += roundup(namelen, 4); xdr_int(&p, &pairtype); @@ -494,12 +495,12 @@ vdev_read_phys(vdev_t *vdev, const blkptr_t *bp, void *buf, } rc = vdev->v_phys_read(vdev, vdev->v_read_priv, offset, buf, psize); - if (rc) - return (rc); - if (bp != NULL) - return (zio_checksum_verify(vdev->spa, bp, buf)); + if (rc == 0) { + if (bp != NULL) + rc = zio_checksum_verify(vdev->v_spa, bp, buf); + } - return (0); + return (rc); } typedef struct remap_segment { @@ -567,6 +568,7 @@ vdev_indirect_mapping_open(spa_t *spa, objset_phys_t *os, vim->vim_havecounts = (vim->vim_dn->dn_bonuslen > VDEV_INDIRECT_MAPPING_SIZE_V0); + return (vim); } @@ -777,8 +779,10 @@ static vdev_t * vdev_lookup_top(spa_t *spa, uint64_t vdev) { vdev_t *rvd; + vdev_list_t *vlist; - STAILQ_FOREACH(rvd, &spa->spa_vdevs, v_childlink) + vlist = &spa->spa_root_vdev->v_children; + STAILQ_FOREACH(rvd, vlist, v_childlink) if (rvd->v_id == vdev) break; @@ -841,7 +845,7 @@ static void vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize, void *arg) { list_t stack; - spa_t *spa = vd->spa; + spa_t *spa = vd->v_spa; zio_t *zio = arg; remap_segment_t *rs; @@ -899,7 +903,6 @@ vdev_indirect_remap(vdev_t *vd, uint64_t offset, uint64_t asize, void *arg) */ if (zio->io_error != 0) break; - rs->rs_offset += inner_size; rs->rs_asize -= inner_size; rs->rs_split_offset += inner_size; @@ -935,19 +938,20 @@ static int vdev_indirect_read(vdev_t *vdev, const blkptr_t *bp, void *buf, off_t offset, size_t bytes) { - zio_t zio = { 0 }; - spa_t *spa = vdev->spa; - indirect_vsd_t *iv = malloc(sizeof (*iv)); + zio_t zio; + spa_t *spa = vdev->v_spa; + indirect_vsd_t *iv; indirect_split_t *first; int rc = EIO; + iv = calloc(1, sizeof(*iv)); if (iv == NULL) return (ENOMEM); - bzero(iv, sizeof (*iv)); list_create(&iv->iv_splits, sizeof (indirect_split_t), offsetof(indirect_split_t, is_node)); + bzero(&zio, sizeof(zio)); zio.io_spa = spa; zio.io_bp = (blkptr_t *)bp; zio.io_data = buf; @@ -1086,40 +1090,72 @@ vdev_create(uint64_t guid, vdev_read_t *_read) vdev_t *vdev; vdev_indirect_config_t *vic; - vdev = malloc(sizeof(vdev_t)); - memset(vdev, 0, sizeof(vdev_t)); - STAILQ_INIT(&vdev->v_children); - vdev->v_guid = guid; - vdev->v_state = VDEV_STATE_OFFLINE; - vdev->v_read = _read; + vdev = calloc(1, sizeof(vdev_t)); + if (vdev != NULL) { + STAILQ_INIT(&vdev->v_children); + vdev->v_guid = guid; + vdev->v_read = _read; - vic = &vdev->vdev_indirect_config; - vic->vic_prev_indirect_vdev = UINT64_MAX; - STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink); + /* + * root vdev has no read function. + * We only point root vdev from spa. + */ + if (_read != NULL) { + vic = &vdev->vdev_indirect_config; + vic->vic_prev_indirect_vdev = UINT64_MAX; + STAILQ_INSERT_TAIL(&zfs_vdevs, vdev, v_alllink); + } + } return (vdev); } -static int -vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev, - vdev_t **vdevp, int is_newer) +static void +vdev_set_initial_state(vdev_t *vdev, const unsigned char *nvlist) { - int rc; - uint64_t guid, id, ashift, asize, nparity; - const char *type; - const char *path; - vdev_t *vdev, *kid; - const unsigned char *kids; - int nkids, i, is_new; uint64_t is_offline, is_faulted, is_degraded, is_removed, isnt_present; uint64_t is_log; - if (nvlist_find(nvlist, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64, - NULL, &guid) - || nvlist_find(nvlist, ZPOOL_CONFIG_ID, DATA_TYPE_UINT64, NULL, &id) - || nvlist_find(nvlist, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING, + is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0; + is_log = 0; + (void) nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, NULL, + &is_offline); + (void) nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, NULL, + &is_removed); + (void) nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, NULL, + &is_faulted); + (void) nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, + NULL, &is_degraded); + (void) nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64, + NULL, &isnt_present); + (void) nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, NULL, + &is_log); + + if (is_offline != 0) + vdev->v_state = VDEV_STATE_OFFLINE; + else if (is_removed != 0) + vdev->v_state = VDEV_STATE_REMOVED; + else if (is_faulted != 0) + vdev->v_state = VDEV_STATE_FAULTED; + else if (is_degraded != 0) + vdev->v_state = VDEV_STATE_DEGRADED; + else if (isnt_present != 0) + vdev->v_state = VDEV_STATE_CANT_OPEN; + + vdev->v_islog = is_log == 1; +} + +static int +vdev_init(uint64_t guid, const unsigned char *nvlist, vdev_t **vdevp) +{ + uint64_t id, ashift, asize, nparity; + const char *path; + const char *type; + vdev_t *vdev; + + if (nvlist_find(nvlist, ZPOOL_CONFIG_ID, DATA_TYPE_UINT64, NULL, &id) || + nvlist_find(nvlist, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING, NULL, &type)) { - printf("ZFS: can't find vdev details\n"); return (ENOENT); } @@ -1136,141 +1172,222 @@ vdev_init_from_nvlist(const unsigned char *nvlist, vdev_t *pvdev, return (EIO); } - is_offline = is_removed = is_faulted = is_degraded = isnt_present = 0; - is_log = 0; + if (strcmp(type, VDEV_TYPE_MIRROR) == 0) + vdev = vdev_create(guid, vdev_mirror_read); + else if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) + vdev = vdev_create(guid, vdev_raidz_read); + else if (strcmp(type, VDEV_TYPE_REPLACING) == 0) + vdev = vdev_create(guid, vdev_replacing_read); + else if (strcmp(type, VDEV_TYPE_INDIRECT) == 0) { + vdev_indirect_config_t *vic; - nvlist_find(nvlist, ZPOOL_CONFIG_OFFLINE, DATA_TYPE_UINT64, NULL, - &is_offline); - nvlist_find(nvlist, ZPOOL_CONFIG_REMOVED, DATA_TYPE_UINT64, NULL, - &is_removed); - nvlist_find(nvlist, ZPOOL_CONFIG_FAULTED, DATA_TYPE_UINT64, NULL, - &is_faulted); - nvlist_find(nvlist, ZPOOL_CONFIG_DEGRADED, DATA_TYPE_UINT64, NULL, - &is_degraded); - nvlist_find(nvlist, ZPOOL_CONFIG_NOT_PRESENT, DATA_TYPE_UINT64, NULL, - &isnt_present); - nvlist_find(nvlist, ZPOOL_CONFIG_IS_LOG, DATA_TYPE_UINT64, NULL, - &is_log); - - vdev = vdev_find(guid); - if (!vdev) { - is_new = 1; - - if (strcmp(type, VDEV_TYPE_MIRROR) == 0) - vdev = vdev_create(guid, vdev_mirror_read); - else if (strcmp(type, VDEV_TYPE_RAIDZ) == 0) - vdev = vdev_create(guid, vdev_raidz_read); - else if (strcmp(type, VDEV_TYPE_REPLACING) == 0) - vdev = vdev_create(guid, vdev_replacing_read); - else if (strcmp(type, VDEV_TYPE_INDIRECT) == 0) { - vdev_indirect_config_t *vic; - - vdev = vdev_create(guid, vdev_indirect_read); + vdev = vdev_create(guid, vdev_indirect_read); + if (vdev != NULL) { vdev->v_state = VDEV_STATE_HEALTHY; vic = &vdev->vdev_indirect_config; nvlist_find(nvlist, - ZPOOL_CONFIG_INDIRECT_OBJECT, DATA_TYPE_UINT64, + ZPOOL_CONFIG_INDIRECT_OBJECT, + DATA_TYPE_UINT64, NULL, &vic->vic_mapping_object); nvlist_find(nvlist, - ZPOOL_CONFIG_INDIRECT_BIRTHS, DATA_TYPE_UINT64, + ZPOOL_CONFIG_INDIRECT_BIRTHS, + DATA_TYPE_UINT64, NULL, &vic->vic_births_object); nvlist_find(nvlist, - ZPOOL_CONFIG_PREV_INDIRECT_VDEV, DATA_TYPE_UINT64, + ZPOOL_CONFIG_PREV_INDIRECT_VDEV, + DATA_TYPE_UINT64, NULL, &vic->vic_prev_indirect_vdev); - } else - vdev = vdev_create(guid, vdev_disk_read); - - vdev->v_id = id; - vdev->v_top = pvdev != NULL ? pvdev : vdev; - if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT, - DATA_TYPE_UINT64, NULL, &ashift) == 0) { - vdev->v_ashift = ashift; - } else { - vdev->v_ashift = 0; } - if (nvlist_find(nvlist, ZPOOL_CONFIG_ASIZE, - DATA_TYPE_UINT64, NULL, &asize) == 0) { - vdev->v_psize = asize + - VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; - } - if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY, - DATA_TYPE_UINT64, NULL, &nparity) == 0) { - vdev->v_nparity = nparity; - } else { - vdev->v_nparity = 0; - } - if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH, - DATA_TYPE_STRING, NULL, &path) == 0) { - if (strncmp(path, "/dev/", 5) == 0) - path += 5; - vdev->v_name = strdup(path); - } else { - char *name; - - if (strcmp(type, "raidz") == 0) { - if (vdev->v_nparity < 1 || - vdev->v_nparity > 3) { - printf("ZFS: can only boot from disk, " - "mirror, raidz1, raidz2 and raidz3 " - "vdevs\n"); - return (EIO); - } - rc = asprintf(&name, "%s%d-%jd", type, - vdev->v_nparity, id); - } else { - rc = asprintf(&name, "%s-%jd", type, id); - } - if (rc < 0) - return (ENOMEM); - vdev->v_name = name; - } - vdev->v_islog = is_log == 1; } else { - is_new = 0; + vdev = vdev_create(guid, vdev_disk_read); } - if (is_new || is_newer) { - /* - * This is either new vdev or we've already seen this vdev, - * but from an older vdev label, so let's refresh its state - * from the newer label. - */ - if (is_offline) - vdev->v_state = VDEV_STATE_OFFLINE; - else if (is_removed) - vdev->v_state = VDEV_STATE_REMOVED; - else if (is_faulted) - vdev->v_state = VDEV_STATE_FAULTED; - else if (is_degraded) - vdev->v_state = VDEV_STATE_DEGRADED; - else if (isnt_present) - vdev->v_state = VDEV_STATE_CANT_OPEN; + if (vdev == NULL) + return (ENOMEM); + + vdev_set_initial_state(vdev, nvlist); + vdev->v_id = id; + if (nvlist_find(nvlist, ZPOOL_CONFIG_ASHIFT, + DATA_TYPE_UINT64, NULL, &ashift) == 0) + vdev->v_ashift = ashift; + + if (nvlist_find(nvlist, ZPOOL_CONFIG_ASIZE, + DATA_TYPE_UINT64, NULL, &asize) == 0) { + vdev->v_psize = asize + + VDEV_LABEL_START_SIZE + VDEV_LABEL_END_SIZE; } + if (nvlist_find(nvlist, ZPOOL_CONFIG_NPARITY, + DATA_TYPE_UINT64, NULL, &nparity) == 0) + vdev->v_nparity = nparity; + + if (nvlist_find(nvlist, ZPOOL_CONFIG_PATH, + DATA_TYPE_STRING, NULL, &path) == 0) { + if (strncmp(path, "/dev/", 5) == 0) + path += 5; + vdev->v_name = strdup(path); + } else { + char *name; + + name = NULL; + if (strcmp(type, "raidz") == 0) { + if (vdev->v_nparity < 1 || + vdev->v_nparity > 3) { + printf("ZFS: can only boot from disk, " + "mirror, raidz1, raidz2 and raidz3 " + "vdevs\n"); + return (EIO); + } + (void) asprintf(&name, "%s%d-%" PRIu64, type, + vdev->v_nparity, id); + } else { + (void) asprintf(&name, "%s-%" PRIu64, type, id); + } + vdev->v_name = name; + } + *vdevp = vdev; + return (0); +} + +/* + * Find slot for vdev. We return either NULL to signal to use + * STAILQ_INSERT_HEAD, or we return link element to be used with + * STAILQ_INSERT_AFTER. + */ +static vdev_t * +vdev_find_previous(vdev_t *top_vdev, vdev_t *vdev) +{ + vdev_t *v, *previous; + + if (STAILQ_EMPTY(&top_vdev->v_children)) + return (NULL); + + previous = NULL; + STAILQ_FOREACH(v, &top_vdev->v_children, v_childlink) { + if (v->v_id > vdev->v_id) + return (previous); + + if (v->v_id == vdev->v_id) + return (v); + + if (v->v_id < vdev->v_id) + previous = v; + } + return (previous); +} + +static size_t +vdev_child_count(vdev_t *vdev) +{ + vdev_t *v; + size_t count; + + count = 0; + STAILQ_FOREACH(v, &vdev->v_children, v_childlink) { + count++; + } + return (count); +} + +/* + * Insert vdev into top_vdev children list. List is ordered by v_id. + */ +static void +vdev_insert(vdev_t *top_vdev, vdev_t *vdev) +{ + vdev_t *previous; + size_t count; + + /* + * The top level vdev can appear in random order, depending how + * the firmware is presenting the disk devices. + * However, we will insert vdev to create list ordered by v_id, + * so we can use either STAILQ_INSERT_HEAD or STAILQ_INSERT_AFTER + * as STAILQ does not have insert before. + */ + previous = vdev_find_previous(top_vdev, vdev); + + if (previous == NULL) { + STAILQ_INSERT_HEAD(&top_vdev->v_children, vdev, v_childlink); + count = vdev_child_count(top_vdev); + if (top_vdev->v_nchildren < count) + top_vdev->v_nchildren = count; + return; + } + + if (previous->v_id == vdev->v_id) + return; + + STAILQ_INSERT_AFTER(&top_vdev->v_children, previous, vdev, v_childlink); + count = vdev_child_count(top_vdev); + if (top_vdev->v_nchildren < count) + top_vdev->v_nchildren = count; +} + +static int +vdev_from_nvlist(spa_t *spa, uint64_t top_guid, const unsigned char *nvlist) +{ + vdev_t *top_vdev, *vdev; + const unsigned char *kids; + int rc, nkids; + + /* Get top vdev. */ + top_vdev = vdev_find(top_guid); + if (top_vdev == NULL) { + rc = vdev_init(top_guid, nvlist, &top_vdev); + if (rc != 0) + return (rc); + top_vdev->v_spa = spa; + top_vdev->v_top = top_vdev; + vdev_insert(spa->spa_root_vdev, top_vdev); + } + + /* Add children if there are any. */ rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY, &nkids, &kids); - /* - * Its ok if we don't have any kids. - */ if (rc == 0) { - vdev->v_nchildren = nkids; - for (i = 0; i < nkids; i++) { - rc = vdev_init_from_nvlist(kids, vdev, &kid, is_newer); - if (rc) + for (int i = 0; i < nkids; i++) { + uint64_t guid; + + rc = nvlist_find(kids, ZPOOL_CONFIG_GUID, + DATA_TYPE_UINT64, NULL, &guid); + if (rc != 0) return (rc); - if (is_new) - STAILQ_INSERT_TAIL(&vdev->v_children, kid, - v_childlink); + rc = vdev_init(guid, kids, &vdev); + if (rc != 0) + return (rc); + + vdev->v_spa = spa; + vdev->v_top = top_vdev; + vdev_insert(top_vdev, vdev); + kids = nvlist_next(kids); } } else { - vdev->v_nchildren = 0; + rc = 0; } - if (vdevp) - *vdevp = vdev; - return (0); + return (rc); +} + +static int +vdev_init_from_label(spa_t *spa, const unsigned char *nvlist) +{ + uint64_t pool_guid, top_guid; + const unsigned char *vdevs; + + if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64, + NULL, &pool_guid) || + nvlist_find(nvlist, ZPOOL_CONFIG_TOP_GUID, DATA_TYPE_UINT64, + NULL, &top_guid) || + nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST, + NULL, &vdevs)) { + printf("ZFS: can't find vdev details\n"); + return (ENOENT); + } + + return (vdev_from_nvlist(spa, top_guid, vdevs)); } static void @@ -1280,6 +1397,10 @@ vdev_set_state(vdev_t *vdev) int good_kids; int bad_kids; + STAILQ_FOREACH(kid, &vdev->v_children, v_childlink) { + vdev_set_state(kid); + } + /* * A mirror or raidz is healthy if all its kids are healthy. A * mirror is degraded if any of its kids is healthy; a raidz @@ -1314,6 +1435,104 @@ vdev_set_state(vdev_t *vdev) } } +static int +vdev_update_from_nvlist(uint64_t top_guid, const unsigned char *nvlist) +{ + vdev_t *vdev; + const unsigned char *kids; + int rc, nkids; + + /* Update top vdev. */ + vdev = vdev_find(top_guid); + if (vdev != NULL) + vdev_set_initial_state(vdev, nvlist); + + /* Update children if there are any. */ + rc = nvlist_find(nvlist, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY, + &nkids, &kids); + if (rc == 0) { + for (int i = 0; i < nkids; i++) { + uint64_t guid; + + rc = nvlist_find(kids, ZPOOL_CONFIG_GUID, + DATA_TYPE_UINT64, NULL, &guid); + if (rc != 0) + break; + + vdev = vdev_find(guid); + if (vdev != NULL) + vdev_set_initial_state(vdev, kids); + + kids = nvlist_next(kids); + } + } else { + rc = 0; + } + + return (rc); +} + +static int +vdev_init_from_nvlist(spa_t *spa, const unsigned char *nvlist) +{ + uint64_t pool_guid, vdev_children; + const unsigned char *vdevs, *kids; + int rc, nkids; + + if (nvlist_find(nvlist, ZPOOL_CONFIG_POOL_GUID, DATA_TYPE_UINT64, + NULL, &pool_guid) || + nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_CHILDREN, DATA_TYPE_UINT64, + NULL, &vdev_children) || + nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST, + NULL, &vdevs)) { + printf("ZFS: can't find vdev details\n"); + return (ENOENT); + } + + /* Wrong guid?! */ + if (spa->spa_guid != pool_guid) + return (EIO); + + spa->spa_root_vdev->v_nchildren = vdev_children; + + rc = nvlist_find(vdevs, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY, + &nkids, &kids); + + /* + * MOS config has at least one child for root vdev. + */ + if (rc != 0) + return (EIO); + + for (int i = 0; i < nkids; i++) { + uint64_t guid; + vdev_t *vdev; + + rc = nvlist_find(kids, ZPOOL_CONFIG_GUID, DATA_TYPE_UINT64, + NULL, &guid); + if (rc != 0) + break; + vdev = vdev_find(guid); + /* + * Top level vdev is missing, create it. + */ + if (vdev == NULL) + rc = vdev_from_nvlist(spa, guid, kids); + else + rc = vdev_update_from_nvlist(guid, kids); + if (rc != 0) + break; + kids = nvlist_next(kids); + } + + /* + * Re-evaluate top-level vdev state. + */ + vdev_set_state(spa->spa_root_vdev); + + return (rc); +} + static spa_t * spa_find_by_guid(uint64_t guid) { @@ -1356,7 +1575,7 @@ spa_get_primary_vdev(const spa_t *spa) spa = spa_get_primary(); if (spa == NULL) return (NULL); - vdev = STAILQ_FIRST(&spa->spa_vdevs); + vdev = spa->spa_root_vdev; if (vdev == NULL) return (NULL); for (kid = STAILQ_FIRST(&vdev->v_children); kid != NULL; @@ -1377,8 +1596,14 @@ spa_create(uint64_t guid, const char *name) free(spa); return (NULL); } - STAILQ_INIT(&spa->spa_vdevs); spa->spa_guid = guid; + spa->spa_root_vdev = vdev_create(guid, NULL); + if (spa->spa_root_vdev == NULL) { + free(spa->spa_name); + free(spa); + return (NULL); + } + spa->spa_root_vdev->v_name = strdup("root"); STAILQ_INSERT_TAIL(&zfs_pools, spa, spa_link); return (spa); @@ -1413,9 +1638,8 @@ pager_printf(const char *fmt, ...) va_list args; va_start(args, fmt); - vsprintf(line, fmt, args); + vsnprintf(line, sizeof(line), fmt, args); va_end(args); - return (pager_output(line)); } @@ -1426,14 +1650,13 @@ pager_printf(const char *fmt, ...) static int print_state(int indent, const char *name, vdev_state_t state) { - char buf[512]; int i; + char buf[512]; buf[0] = 0; for (i = 0; i < indent; i++) strcat(buf, " "); strcat(buf, name); - return (pager_printf(STATUS_FORMAT, buf, state_name(state))); } @@ -1465,6 +1688,7 @@ spa_status(spa_t *spa) { static char bootfs[ZFS_MAXNAMELEN]; uint64_t rootid; + vdev_list_t *vlist; vdev_t *vdev; int good_kids, bad_kids, degraded_kids, ret; vdev_state_t state; @@ -1493,7 +1717,8 @@ spa_status(spa_t *spa) good_kids = 0; degraded_kids = 0; bad_kids = 0; - STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) { + vlist = &spa->spa_root_vdev->v_children; + STAILQ_FOREACH(vdev, vlist, v_childlink) { if (vdev->v_state == VDEV_STATE_HEALTHY) good_kids++; else if (vdev->v_state == VDEV_STATE_DEGRADED) @@ -1511,7 +1736,8 @@ spa_status(spa_t *spa) ret = print_state(0, spa->spa_name, state); if (ret != 0) return (ret); - STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) { + + STAILQ_FOREACH(vdev, vlist, v_childlink) { ret = vdev_status(vdev, 1); if (ret != 0) return (ret); @@ -1703,15 +1929,14 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap) { vdev_t vtmp; spa_t *spa; - vdev_t *vdev, *top_vdev, *pool_vdev; + vdev_t *vdev; unsigned char *nvlist; uint64_t val; - uint64_t guid; + uint64_t guid, vdev_children; uint64_t pool_txg, pool_guid; const char *pool_name; - const unsigned char *vdevs; const unsigned char *features; - int rc, is_newer; + int rc; /* * Load the vdev label and figure out which @@ -1783,18 +2008,17 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap) */ spa = spa_find_by_guid(pool_guid); if (spa == NULL) { + nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_CHILDREN, + DATA_TYPE_UINT64, NULL, &vdev_children); spa = spa_create(pool_guid, pool_name); if (spa == NULL) { free(nvlist); return (ENOMEM); } + spa->spa_root_vdev->v_nchildren = vdev_children; } - if (pool_txg > spa->spa_txg) { + if (pool_txg > spa->spa_txg) spa->spa_txg = pool_txg; - is_newer = 1; - } else { - is_newer = 0; - } /* * Get the vdev tree and create our in-core copy of it. @@ -1814,39 +2038,25 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap) return (EIO); } - if (nvlist_find(nvlist, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST, - NULL, &vdevs)) { - free(nvlist); - return (EIO); - } - - rc = vdev_init_from_nvlist(vdevs, NULL, &top_vdev, is_newer); + rc = vdev_init_from_label(spa, nvlist); free(nvlist); if (rc != 0) return (rc); - /* - * Add the toplevel vdev to the pool if its not already there. - */ - STAILQ_FOREACH(pool_vdev, &spa->spa_vdevs, v_childlink) - if (top_vdev == pool_vdev) - break; - - if (!pool_vdev && top_vdev) { - top_vdev->spa = spa; - STAILQ_INSERT_TAIL(&spa->spa_vdevs, top_vdev, v_childlink); - } - /* * We should already have created an incomplete vdev for this * vdev. Find it and initialise it with our read proc. */ vdev = vdev_find(guid); - if (vdev) { + if (vdev != NULL) { vdev->v_phys_read = _read; vdev->v_read_priv = read_priv; - vdev->v_state = VDEV_STATE_HEALTHY; vdev->v_psize = vtmp.v_psize; + /* + * If no other state is set, mark vdev healthy. + */ + if (vdev->v_state == VDEV_STATE_UNKNOWN) + vdev->v_state = VDEV_STATE_HEALTHY; } else { printf("ZFS: inconsistent nvlist contents\n"); return (EIO); @@ -1858,7 +2068,7 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap) /* * Re-evaluate top-level vdev state. */ - vdev_set_state(top_vdev); + vdev_set_state(vdev->v_top); /* * Ok, we are happy with the pool so far. Lets find @@ -1867,7 +2077,6 @@ vdev_probe(vdev_phys_read_t *_read, void *read_priv, spa_t **spap) */ vdev_uberblock_load(vdev, &spa->spa_uberblock); - vdev->spa = spa; if (spap != NULL) *spap = spa; return (0); @@ -1962,7 +2171,8 @@ zio_read(const spa_t *spa, const blkptr_t *bp, void *buf) for (i = 0; i < SPA_DVAS_PER_BP; i++) { const dva_t *dva = &bp->blk_dva[i]; vdev_t *vdev; - int vdevid; + vdev_list_t *vlist; + uint64_t vdevid; off_t offset; if (!dva->dva_word[0] && !dva->dva_word[1]) @@ -1970,7 +2180,8 @@ zio_read(const spa_t *spa, const blkptr_t *bp, void *buf) vdevid = DVA_GET_VDEV(dva); offset = DVA_GET_OFFSET(dva); - STAILQ_FOREACH(vdev, &spa->spa_vdevs, v_childlink) { + vlist = &spa->spa_root_vdev->v_children; + STAILQ_FOREACH(vdev, vlist, v_childlink) { if (vdev->v_id == vdevid) break; } @@ -1979,7 +2190,7 @@ zio_read(const spa_t *spa, const blkptr_t *bp, void *buf) size = BP_GET_PSIZE(bp); if (vdev->v_read == vdev_raidz_read) { - align = 1ULL << vdev->v_top->v_ashift; + align = 1ULL << vdev->v_ashift; if (P2PHASE(size, align) != 0) size = P2ROUNDUP(size, align); } @@ -2067,11 +2278,11 @@ dnode_read(const spa_t *spa, const dnode_phys_t *dnode, off_t offset, /* * The buffer contains our data block. Copy what we * need from it and loop. - */ + */ i = bsize - boff; if (i > buflen) i = buflen; memcpy(buf, &dnode_cache_buf[boff], i); - buf = ((char*)buf) + i; + buf = ((char *)buf) + i; offset += i; buflen -= i; } @@ -2124,7 +2335,7 @@ fzap_name_equal(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, const char *p; namelen = zc->l_entry.le_name_numints; - + nc = &ZAP_LEAF_CHUNK(zl, zc->l_entry.le_name_chunk); p = name; while (namelen > 0) { @@ -2222,6 +2433,26 @@ fzap_leaf_array(const zap_leaf_t *zl, const zap_leaf_chunk_t *zc, } } +static int +fzap_check_size(uint64_t integer_size, uint64_t num_integers) +{ + + switch (integer_size) { + case 1: + case 2: + case 4: + case 8: + break; + default: + return (EINVAL); + } + + if (integer_size * num_integers > ZAP_MAXVALUELEN) + return (E2BIG); + + return (0); +} + /* * Lookup a value in a fatzap directory. Assumes that the zap scratch * buffer contains the directory header. @@ -2240,6 +2471,9 @@ fzap_lookup(const spa_t *spa, const dnode_phys_t *dnode, const char *name, if (zh.zap_magic != ZAP_MAGIC) return (EIO); + if ((rc = fzap_check_size(integer_size, num_integers)) != 0) + return (rc); + z.zap_block_shift = ilog2(bsize); z.zap_phys = (zap_phys_t *)zap_scratch; @@ -2388,7 +2622,7 @@ fzap_list(const spa_t *spa, const dnode_phys_t *dnode, zap_leaf_t zl; zl.l_bs = z.zap_block_shift; for (i = 0; i < zh.zap_num_leafs; i++) { - off_t off = (i + 1) << zl.l_bs; + off_t off = ((off_t)(i + 1)) << zl.l_bs; char name[256], *p; uint64_t value; @@ -2537,7 +2771,7 @@ fzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, uint64_t value) { int bsize = dnode->dn_datablkszsec << SPA_MINBLOCKSHIFT; - zap_phys_t zh = *(zap_phys_t *) zap_scratch; + zap_phys_t zh = *(zap_phys_t *)zap_scratch; fat_zap_t z; int i, j; @@ -2554,7 +2788,7 @@ fzap_rlookup(const spa_t *spa, const dnode_phys_t *dnode, char *name, zap_leaf_t zl; zl.l_bs = z.zap_block_shift; for (i = 0; i < zh.zap_num_leafs; i++) { - off_t off = (i + 1) << zl.l_bs; + off_t off = ((off_t)(i + 1)) << zl.l_bs; if (dnode_read(spa, dnode, off, zap_scratch, bsize)) return (EIO); @@ -2844,8 +3078,7 @@ zfs_get_root(const spa_t *spa, uint64_t *objid) sizeof(props), 1, &props) == 0 && objset_get_dnode(spa, &spa->spa_mos, props, &propdir) == 0 && zap_lookup(spa, &propdir, "bootfs", - sizeof(bootfs), 1, &bootfs) == 0 && - bootfs != 0) { + sizeof(bootfs), 1, &bootfs) == 0 && bootfs != 0) { *objid = bootfs; return (0); } @@ -2995,9 +3228,7 @@ zfs_spa_init(spa_t *spa) dnode_phys_t dir; uint64_t config_object; unsigned char *nvlist; - char *type; - const unsigned char *nv; - int nkids, rc; + int rc; if (zio_read(spa, &spa->spa_uberblock.ub_rootbp, &spa->spa_mos)) { printf("ZFS: can't read MOS of pool %s\n", spa->spa_name); @@ -3036,61 +3267,7 @@ zfs_spa_init(spa_t *spa) return (rc); /* Update vdevs from MOS config. */ - if (nvlist_find(nvlist + 4, ZPOOL_CONFIG_VDEV_TREE, DATA_TYPE_NVLIST, - NULL, &nv)) { - rc = EIO; - goto done; - } - - if (nvlist_find(nv, ZPOOL_CONFIG_TYPE, DATA_TYPE_STRING, NULL, &type)) { - printf("ZFS: can't find vdev details\n"); - rc = ENOENT; - goto done; - } - if (strcmp(type, VDEV_TYPE_ROOT) != 0) { - rc = ENOENT; - goto done; - } - - rc = nvlist_find(nv, ZPOOL_CONFIG_CHILDREN, DATA_TYPE_NVLIST_ARRAY, - &nkids, &nv); - if (rc != 0) - goto done; - - for (int i = 0; i < nkids; i++) { - vdev_t *vd, *prev, *kid = NULL; - rc = vdev_init_from_nvlist(nv, NULL, &kid, 0); - if (rc != 0) { - printf("vdev_init_from_nvlist: %d\n", rc); - break; - } - kid->spa = spa; - prev = NULL; - STAILQ_FOREACH(vd, &spa->spa_vdevs, v_childlink) { - /* Already present? */ - if (kid->v_id == vd->v_id) { - kid = NULL; - break; - } - if (vd->v_id > kid->v_id) { - if (prev == NULL) { - STAILQ_INSERT_HEAD(&spa->spa_vdevs, - kid, v_childlink); - } else { - STAILQ_INSERT_AFTER(&spa->spa_vdevs, - prev, kid, v_childlink); - } - kid = NULL; - break; - } - prev = vd; - } - if (kid != NULL) - STAILQ_INSERT_TAIL(&spa->spa_vdevs, kid, v_childlink); - nv = nvlist_next(nv); - } - rc = 0; -done: + rc = vdev_init_from_nvlist(spa, nvlist + 4); free(nvlist); return (rc); } diff --git a/sys/cddl/boot/zfs/zfsimpl.h b/sys/cddl/boot/zfs/zfsimpl.h index db61b7dc647..4f3d8e69b2a 100644 --- a/sys/cddl/boot/zfs/zfsimpl.h +++ b/sys/cddl/boot/zfs/zfsimpl.h @@ -760,6 +760,7 @@ typedef enum { #define ZPOOL_CONFIG_IS_LOG "is_log" #define ZPOOL_CONFIG_TIMESTAMP "timestamp" /* not stored on disk */ #define ZPOOL_CONFIG_FEATURES_FOR_READ "features_for_read" +#define ZPOOL_CONFIG_VDEV_CHILDREN "vdev_children" /* * The persistent vdev state is stored as separate values rather than a single @@ -1173,6 +1174,8 @@ typedef enum dmu_objset_type { DMU_OST_NUMTYPES } dmu_objset_type_t; +#define ZAP_MAXVALUELEN (1024 * 8) + /* * header for all bonus and spill buffers. * The header has a fixed portion with a variable number @@ -1755,13 +1758,13 @@ typedef struct vdev { int v_ashift; /* offset to block shift */ int v_nparity; /* # parity for raidz */ struct vdev *v_top; /* parent vdev */ - int v_nchildren; /* # children */ + size_t v_nchildren; /* # children */ vdev_state_t v_state; /* current state */ vdev_phys_read_t *v_phys_read; /* read from raw leaf vdev */ vdev_read_t *v_read; /* read from vdev */ void *v_read_priv; /* private data for read function */ boolean_t v_islog; - struct spa *spa; /* link to spa */ + struct spa *v_spa; /* link to spa */ /* * Values stored in the config for an indirect or removing vdev. */ @@ -1780,11 +1783,10 @@ typedef struct spa { uint64_t spa_guid; /* pool guid */ uint64_t spa_txg; /* most recent transaction */ struct uberblock spa_uberblock; /* best uberblock so far */ - vdev_list_t spa_vdevs; /* list of all toplevel vdevs */ + vdev_t *spa_root_vdev; /* toplevel vdev container */ objset_phys_t spa_mos; /* MOS for this pool */ zio_cksum_salt_t spa_cksum_salt; /* secret salt for cksum */ void *spa_cksum_tmpls[ZIO_CHECKSUM_FUNCTIONS]; - int spa_inited; /* initialized */ boolean_t spa_with_log; /* this pool has log */ } spa_t; diff --git a/sys/cddl/boot/zfs/zfssubr.c b/sys/cddl/boot/zfs/zfssubr.c index dad76b72598..e15cae438d7 100644 --- a/sys/cddl/boot/zfs/zfssubr.c +++ b/sys/cddl/boot/zfs/zfssubr.c @@ -1637,8 +1637,11 @@ reconstruct: * any errors. */ if (total_errors <= rm->rm_firstdatacol - parity_untried) { + int rv; + if (data_errors == 0) { - if (raidz_checksum_verify(vd->spa, bp, data, bytes) == 0) { + rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes); + if (rv == 0) { /* * If we read parity information (unnecessarily * as it happens since no reconstruction was @@ -1683,7 +1686,8 @@ reconstruct: code = vdev_raidz_reconstruct(rm, tgts, n); - if (raidz_checksum_verify(vd->spa, bp, data, bytes) == 0) { + rv = raidz_checksum_verify(vd->v_spa, bp, data, bytes); + if (rv == 0) { /* * If we read more parity disks than were used * for reconstruction, confirm that the other @@ -1757,7 +1761,7 @@ reconstruct: if (total_errors > rm->rm_firstdatacol) { error = EIO; } else if (total_errors < rm->rm_firstdatacol && - (code = vdev_raidz_combrec(vd->spa, rm, bp, data, offset, bytes, + (code = vdev_raidz_combrec(vd->v_spa, rm, bp, data, offset, bytes, total_errors, data_errors)) != 0) { /* * If we didn't use all the available parity for the