Notable upstream pull request merges:
 #14378 c0e58995e Large sync writes perform worse with slog
 #14721 797f55ef1 Do not persist user/group/project quota zap objects
                  when unneeded
 #15356 380c25f64 FreeBSD: Improve taskq wrapper
 #15390 4fbc52495 Remove lock from dsl_pool_need_dirty_delay()
 #15397 57b409856 Trust ARC_BUF_SHARED() more
 #15402 b29e98fa8 Properly pad struct tx_cpu to cache line
 #15405 ea30b5a9e Set spa_ccw_fail_time=0 when expanding a vdev
 #15416 b9384b949 FreeBSD: taskq: Remove unused declaration

Obtained from:	OpenZFS
OpenZFS commit:	797f55ef12
This commit is contained in:
Martin Matuska 2023-10-22 18:25:23 +02:00
commit 22b267e823
22 changed files with 386 additions and 119 deletions

View file

@ -24,6 +24,7 @@
* Copyright 2014 Nexenta Systems, Inc. All rights reserved.
* Copyright (c) 2016, 2017, Intel Corporation.
* Copyright (c) 2017 Open-E, Inc. All Rights Reserved.
* Copyright (c) 2023, Klara Inc.
*/
/*
@ -204,7 +205,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
uint64_t is_spare = 0;
const char *physpath = NULL, *new_devid = NULL, *enc_sysfs_path = NULL;
char rawpath[PATH_MAX], fullpath[PATH_MAX];
char devpath[PATH_MAX];
char pathbuf[PATH_MAX];
int ret;
int online_flag = ZFS_ONLINE_CHECKREMOVE | ZFS_ONLINE_UNSPARE;
boolean_t is_sd = B_FALSE;
@ -214,6 +215,11 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
char **lines = NULL;
int lines_cnt = 0;
/*
* Get the persistent path, typically under the '/dev/disk/by-id' or
* '/dev/disk/by-vdev' directories. Note that this path can change
* when a vdev is replaced with a new disk.
*/
if (nvlist_lookup_string(vdev, ZPOOL_CONFIG_PATH, &path) != 0)
return;
@ -370,15 +376,17 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
(void) snprintf(rawpath, sizeof (rawpath), "%s%s",
is_sd ? DEV_BYVDEV_PATH : DEV_BYPATH_PATH, physpath);
if (realpath(rawpath, devpath) == NULL && !is_mpath_wholedisk) {
if (realpath(rawpath, pathbuf) == NULL && !is_mpath_wholedisk) {
zed_log_msg(LOG_INFO, " realpath: %s failed (%s)",
rawpath, strerror(errno));
(void) zpool_vdev_online(zhp, fullpath, ZFS_ONLINE_FORCEFAULT,
&newstate);
int err = zpool_vdev_online(zhp, fullpath,
ZFS_ONLINE_FORCEFAULT, &newstate);
zed_log_msg(LOG_INFO, " zpool_vdev_online: %s FORCEFAULT (%s)",
fullpath, libzfs_error_description(g_zfshdl));
zed_log_msg(LOG_INFO, " zpool_vdev_online: %s FORCEFAULT (%s) "
"err %d, new state %d",
fullpath, libzfs_error_description(g_zfshdl), err,
err ? (int)newstate : 0);
return;
}
@ -428,7 +436,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
* to trigger a ZFS fault for the device (and any hot spare
* replacement).
*/
leafname = strrchr(devpath, '/') + 1;
leafname = strrchr(pathbuf, '/') + 1;
/*
* If this is a request to label a whole disk, then attempt to
@ -436,7 +444,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
*/
if (zpool_prepare_and_label_disk(g_zfshdl, zhp, leafname,
vdev, "autoreplace", &lines, &lines_cnt) != 0) {
zed_log_msg(LOG_INFO,
zed_log_msg(LOG_WARNING,
" zpool_prepare_and_label_disk: could not "
"label '%s' (%s)", leafname,
libzfs_error_description(g_zfshdl));
@ -468,7 +476,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
sizeof (device->pd_physpath));
list_insert_tail(&g_device_list, device);
zed_log_msg(LOG_INFO, " zpool_label_disk: async '%s' (%llu)",
zed_log_msg(LOG_NOTICE, " zpool_label_disk: async '%s' (%llu)",
leafname, (u_longlong_t)guid);
return; /* resumes at EC_DEV_ADD.ESC_DISK for partition */
@ -491,8 +499,8 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
}
if (!found) {
/* unexpected partition slice encountered */
zed_log_msg(LOG_INFO, "labeled disk %s unexpected here",
fullpath);
zed_log_msg(LOG_WARNING, "labeled disk %s was "
"unexpected here", fullpath);
(void) zpool_vdev_online(zhp, fullpath,
ZFS_ONLINE_FORCEFAULT, &newstate);
return;
@ -501,8 +509,17 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
zed_log_msg(LOG_INFO, " zpool_label_disk: resume '%s' (%llu)",
physpath, (u_longlong_t)guid);
(void) snprintf(devpath, sizeof (devpath), "%s%s",
DEV_BYID_PATH, new_devid);
/*
* Paths that begin with '/dev/disk/by-id/' will change and so
* they must be updated before calling zpool_vdev_attach().
*/
if (strncmp(path, DEV_BYID_PATH, strlen(DEV_BYID_PATH)) == 0) {
(void) snprintf(pathbuf, sizeof (pathbuf), "%s%s",
DEV_BYID_PATH, new_devid);
zed_log_msg(LOG_INFO, " zpool_label_disk: path '%s' "
"replaced by '%s'", path, pathbuf);
path = pathbuf;
}
}
libzfs_free_str_array(lines, lines_cnt);
@ -545,9 +562,11 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
* Wait for udev to verify the links exist, then auto-replace
* the leaf disk at same physical location.
*/
if (zpool_label_disk_wait(path, 3000) != 0) {
zed_log_msg(LOG_WARNING, "zfs_mod: expected replacement "
"disk %s is missing", path);
if (zpool_label_disk_wait(path, DISK_LABEL_WAIT) != 0) {
zed_log_msg(LOG_WARNING, "zfs_mod: pool '%s', after labeling "
"replacement disk, the expected disk partition link '%s' "
"is missing after waiting %u ms",
zpool_get_name(zhp), path, DISK_LABEL_WAIT);
nvlist_free(nvroot);
return;
}
@ -562,7 +581,7 @@ zfs_process_add(zpool_handle_t *zhp, nvlist_t *vdev, boolean_t labeled)
B_TRUE, B_FALSE);
}
zed_log_msg(LOG_INFO, " zpool_vdev_replace: %s with %s (%s)",
zed_log_msg(LOG_WARNING, " zpool_vdev_replace: %s with %s (%s)",
fullpath, path, (ret == 0) ? "no errors" :
libzfs_error_description(g_zfshdl));
@ -660,7 +679,7 @@ zfs_iter_vdev(zpool_handle_t *zhp, nvlist_t *nvl, void *data)
dp->dd_prop, path);
dp->dd_found = B_TRUE;
/* pass the new devid for use by replacing code */
/* pass the new devid for use by auto-replacing code */
if (dp->dd_new_devid != NULL) {
(void) nvlist_add_string(nvl, "new_devid",
dp->dd_new_devid);

View file

@ -34,7 +34,7 @@ extern "C" {
#endif
/*
* Default wait time for a device name to be created.
* Default wait time in milliseconds for a device name to be created.
*/
#define DISK_LABEL_WAIT (30 * 1000) /* 30 seconds */

View file

@ -30,9 +30,9 @@
#include <sys/types.h>
#include <sys/proc.h>
#include <sys/queue.h>
#include <sys/taskqueue.h>
#include <sys/thread.h>
#include <sys/ck.h>
#ifdef __cplusplus
extern "C" {
@ -48,16 +48,16 @@ typedef uintptr_t taskqid_t;
typedef void (task_func_t)(void *);
typedef struct taskq_ent {
struct task tqent_task;
struct timeout_task tqent_timeout_task;
union {
struct task tqent_task;
struct timeout_task tqent_timeout_task;
};
task_func_t *tqent_func;
void *tqent_arg;
taskqid_t tqent_id;
CK_LIST_ENTRY(taskq_ent) tqent_hash;
uint8_t tqent_type;
uint8_t tqent_registered;
uint8_t tqent_cancelled;
volatile uint32_t tqent_rc;
taskqid_t tqent_id;
LIST_ENTRY(taskq_ent) tqent_hash;
uint_t tqent_type;
volatile uint_t tqent_rc;
} taskq_ent_t;
/*

View file

@ -73,8 +73,7 @@ struct tx_cpu {
kcondvar_t tc_cv[TXG_SIZE];
uint64_t tc_count[TXG_SIZE]; /* tx hold count on each txg */
list_t tc_callbacks[TXG_SIZE]; /* commit cb list */
char tc_pad[8]; /* pad to fill 3 cache lines */
};
} ____cacheline_aligned;
/*
* The tx_state structure maintains the state information about the different

View file

@ -582,9 +582,8 @@ zfs_device_get_physical(struct udev_device *dev, char *bufptr, size_t buflen)
* Wait up to timeout_ms for udev to set up the device node. The device is
* considered ready when libudev determines it has been initialized, all of
* the device links have been verified to exist, and it has been allowed to
* settle. At this point the device the device can be accessed reliably.
* Depending on the complexity of the udev rules this process could take
* several seconds.
* settle. At this point the device can be accessed reliably. Depending on
* the complexity of the udev rules this process could take several seconds.
*/
int
zpool_label_disk_wait(const char *path, int timeout_ms)

View file

@ -2172,7 +2172,7 @@ if a volatile out-of-order write cache is enabled.
Disable intent logging replay.
Can be disabled for recovery from corrupted ZIL.
.
.It Sy zil_slog_bulk Ns = Ns Sy 786432 Ns B Po 768 KiB Pc Pq u64
.It Sy zil_slog_bulk Ns = Ns Sy 67108864 Ns B Po 64 MiB Pc Pq u64
Limit SLOG write size per commit executed with synchronous priority.
Any writes above that will be executed with lower (asynchronous) priority
to limit potential SLOG device abuse by single active ZIL writer.

View file

@ -30,8 +30,6 @@
__FBSDID("$FreeBSD$");
#include <sys/param.h>
#include <sys/ck.h>
#include <sys/epoch.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/lock.h>
@ -66,11 +64,9 @@ taskq_t *dynamic_taskq = NULL;
proc_t *system_proc;
extern int uma_align_cache;
static MALLOC_DEFINE(M_TASKQ, "taskq", "taskq structures");
static CK_LIST_HEAD(tqenthashhead, taskq_ent) *tqenthashtbl;
static LIST_HEAD(tqenthashhead, taskq_ent) *tqenthashtbl;
static unsigned long tqenthash;
static unsigned long tqenthashlock;
static struct sx *tqenthashtbl_lock;
@ -80,8 +76,8 @@ static taskqid_t tqidnext;
#define TQIDHASH(tqid) (&tqenthashtbl[(tqid) & tqenthash])
#define TQIDHASHLOCK(tqid) (&tqenthashtbl_lock[((tqid) & tqenthashlock)])
#define NORMAL_TASK 0
#define TIMEOUT_TASK 1
#define NORMAL_TASK 2
static void
system_taskq_init(void *arg)
@ -121,7 +117,7 @@ system_taskq_fini(void *arg)
for (i = 0; i < tqenthashlock + 1; i++)
sx_destroy(&tqenthashtbl_lock[i]);
for (i = 0; i < tqenthash + 1; i++)
VERIFY(CK_LIST_EMPTY(&tqenthashtbl[i]));
VERIFY(LIST_EMPTY(&tqenthashtbl[i]));
free(tqenthashtbl_lock, M_TASKQ);
free(tqenthashtbl, M_TASKQ);
}
@ -162,27 +158,27 @@ taskq_lookup(taskqid_t tqid)
{
taskq_ent_t *ent = NULL;
sx_xlock(TQIDHASHLOCK(tqid));
CK_LIST_FOREACH(ent, TQIDHASH(tqid), tqent_hash) {
if (tqid == 0)
return (NULL);
sx_slock(TQIDHASHLOCK(tqid));
LIST_FOREACH(ent, TQIDHASH(tqid), tqent_hash) {
if (ent->tqent_id == tqid)
break;
}
if (ent != NULL)
refcount_acquire(&ent->tqent_rc);
sx_xunlock(TQIDHASHLOCK(tqid));
sx_sunlock(TQIDHASHLOCK(tqid));
return (ent);
}
static taskqid_t
taskq_insert(taskq_ent_t *ent)
{
taskqid_t tqid;
taskqid_t tqid = __taskq_genid();
tqid = __taskq_genid();
ent->tqent_id = tqid;
ent->tqent_registered = B_TRUE;
sx_xlock(TQIDHASHLOCK(tqid));
CK_LIST_INSERT_HEAD(TQIDHASH(tqid), ent, tqent_hash);
LIST_INSERT_HEAD(TQIDHASH(tqid), ent, tqent_hash);
sx_xunlock(TQIDHASHLOCK(tqid));
return (tqid);
}
@ -192,13 +188,14 @@ taskq_remove(taskq_ent_t *ent)
{
taskqid_t tqid = ent->tqent_id;
if (!ent->tqent_registered)
if (tqid == 0)
return;
sx_xlock(TQIDHASHLOCK(tqid));
CK_LIST_REMOVE(ent, tqent_hash);
if (ent->tqent_id != 0) {
LIST_REMOVE(ent, tqent_hash);
ent->tqent_id = 0;
}
sx_xunlock(TQIDHASHLOCK(tqid));
ent->tqent_registered = B_FALSE;
}
static void
@ -285,21 +282,22 @@ taskq_cancel_id(taskq_t *tq, taskqid_t tid)
int rc;
taskq_ent_t *ent;
if (tid == 0)
return (0);
if ((ent = taskq_lookup(tid)) == NULL)
return (0);
ent->tqent_cancelled = B_TRUE;
if (ent->tqent_type == TIMEOUT_TASK) {
if (ent->tqent_type == NORMAL_TASK) {
rc = taskqueue_cancel(tq->tq_queue, &ent->tqent_task, &pend);
if (rc == EBUSY)
taskqueue_drain(tq->tq_queue, &ent->tqent_task);
} else {
rc = taskqueue_cancel_timeout(tq->tq_queue,
&ent->tqent_timeout_task, &pend);
} else
rc = taskqueue_cancel(tq->tq_queue, &ent->tqent_task, &pend);
if (rc == EBUSY) {
taskqueue_drain(tq->tq_queue, &ent->tqent_task);
} else if (pend) {
if (rc == EBUSY) {
taskqueue_drain_timeout(tq->tq_queue,
&ent->tqent_timeout_task);
}
}
if (pend) {
/*
* Tasks normally free themselves when run, but here the task
* was cancelled so it did not free itself.
@ -312,12 +310,13 @@ taskq_cancel_id(taskq_t *tq, taskqid_t tid)
}
static void
taskq_run(void *arg, int pending __unused)
taskq_run(void *arg, int pending)
{
taskq_ent_t *task = arg;
if (!task->tqent_cancelled)
task->tqent_func(task->tqent_arg);
if (pending == 0)
return;
task->tqent_func(task->tqent_arg);
taskq_free(task);
}
@ -345,7 +344,6 @@ taskq_dispatch_delay(taskq_t *tq, task_func_t func, void *arg,
task->tqent_func = func;
task->tqent_arg = arg;
task->tqent_type = TIMEOUT_TASK;
task->tqent_cancelled = B_FALSE;
refcount_init(&task->tqent_rc, 1);
tqid = taskq_insert(task);
TIMEOUT_TASK_INIT(tq->tq_queue, &task->tqent_timeout_task, 0,
@ -379,7 +377,6 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
refcount_init(&task->tqent_rc, 1);
task->tqent_func = func;
task->tqent_arg = arg;
task->tqent_cancelled = B_FALSE;
task->tqent_type = NORMAL_TASK;
tqid = taskq_insert(task);
TASK_INIT(&task->tqent_task, prio, taskq_run, task);
@ -388,10 +385,12 @@ taskq_dispatch(taskq_t *tq, task_func_t func, void *arg, uint_t flags)
}
static void
taskq_run_ent(void *arg, int pending __unused)
taskq_run_ent(void *arg, int pending)
{
taskq_ent_t *task = arg;
if (pending == 0)
return;
task->tqent_func(task->tqent_arg);
}
@ -406,8 +405,6 @@ taskq_dispatch_ent(taskq_t *tq, task_func_t func, void *arg, uint32_t flags,
* can go at the front of the queue.
*/
prio = !!(flags & TQ_FRONT);
task->tqent_cancelled = B_FALSE;
task->tqent_registered = B_FALSE;
task->tqent_id = 0;
task->tqent_func = func;
task->tqent_arg = arg;
@ -427,12 +424,13 @@ taskq_wait_id(taskq_t *tq, taskqid_t tid)
{
taskq_ent_t *ent;
if (tid == 0)
return;
if ((ent = taskq_lookup(tid)) == NULL)
return;
taskqueue_drain(tq->tq_queue, &ent->tqent_task);
if (ent->tqent_type == NORMAL_TASK)
taskqueue_drain(tq->tq_queue, &ent->tqent_task);
else
taskqueue_drain_timeout(tq->tq_queue, &ent->tqent_timeout_task);
taskq_free(ent);
}

View file

@ -1364,7 +1364,7 @@ arc_buf_is_shared(arc_buf_t *buf)
abd_is_linear(buf->b_hdr->b_l1hdr.b_pabd) &&
buf->b_data == abd_to_buf(buf->b_hdr->b_l1hdr.b_pabd));
IMPLY(shared, HDR_SHARED_DATA(buf->b_hdr));
IMPLY(shared, ARC_BUF_SHARED(buf));
EQUIV(shared, ARC_BUF_SHARED(buf));
IMPLY(shared, ARC_BUF_COMPRESSED(buf) || ARC_BUF_LAST(buf));
/*
@ -1998,7 +1998,7 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
IMPLY(encrypted, HDR_ENCRYPTED(hdr));
IMPLY(encrypted, ARC_BUF_ENCRYPTED(buf));
IMPLY(encrypted, ARC_BUF_COMPRESSED(buf));
IMPLY(encrypted, !ARC_BUF_SHARED(buf));
IMPLY(encrypted, !arc_buf_is_shared(buf));
/*
* If the caller wanted encrypted data we just need to copy it from
@ -2066,7 +2066,9 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
}
if (hdr_compressed == compressed) {
if (!arc_buf_is_shared(buf)) {
if (ARC_BUF_SHARED(buf)) {
ASSERT(arc_buf_is_shared(buf));
} else {
abd_copy_to_buf(buf->b_data, hdr->b_l1hdr.b_pabd,
arc_buf_size(buf));
}
@ -2078,7 +2080,7 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
* If the buf is sharing its data with the hdr, unlink it and
* allocate a new data buffer for the buf.
*/
if (arc_buf_is_shared(buf)) {
if (ARC_BUF_SHARED(buf)) {
ASSERT(ARC_BUF_COMPRESSED(buf));
/* We need to give the buf its own b_data */
@ -2090,6 +2092,8 @@ arc_buf_fill(arc_buf_t *buf, spa_t *spa, const zbookmark_phys_t *zb,
/* Previously overhead was 0; just add new overhead */
ARCSTAT_INCR(arcstat_overhead_size, HDR_GET_LSIZE(hdr));
} else if (ARC_BUF_COMPRESSED(buf)) {
ASSERT(!arc_buf_is_shared(buf));
/* We need to reallocate the buf's b_data */
arc_free_data_buf(hdr, buf->b_data, HDR_GET_PSIZE(hdr),
buf);
@ -2217,7 +2221,7 @@ arc_evictable_space_increment(arc_buf_hdr_t *hdr, arc_state_t *state)
for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
buf = buf->b_next) {
if (arc_buf_is_shared(buf))
if (ARC_BUF_SHARED(buf))
continue;
(void) zfs_refcount_add_many(&state->arcs_esize[type],
arc_buf_size(buf), buf);
@ -2256,7 +2260,7 @@ arc_evictable_space_decrement(arc_buf_hdr_t *hdr, arc_state_t *state)
for (arc_buf_t *buf = hdr->b_l1hdr.b_buf; buf != NULL;
buf = buf->b_next) {
if (arc_buf_is_shared(buf))
if (ARC_BUF_SHARED(buf))
continue;
(void) zfs_refcount_remove_many(&state->arcs_esize[type],
arc_buf_size(buf), buf);
@ -2481,7 +2485,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr)
* add to the refcount if the arc_buf_t is
* not shared.
*/
if (arc_buf_is_shared(buf))
if (ARC_BUF_SHARED(buf))
continue;
(void) zfs_refcount_add_many(
@ -2537,7 +2541,7 @@ arc_change_state(arc_state_t *new_state, arc_buf_hdr_t *hdr)
* add to the refcount if the arc_buf_t is
* not shared.
*/
if (arc_buf_is_shared(buf))
if (ARC_BUF_SHARED(buf))
continue;
(void) zfs_refcount_remove_many(
@ -3061,9 +3065,10 @@ arc_buf_destroy_impl(arc_buf_t *buf)
arc_cksum_verify(buf);
arc_buf_unwatch(buf);
if (arc_buf_is_shared(buf)) {
if (ARC_BUF_SHARED(buf)) {
arc_hdr_clear_flags(hdr, ARC_FLAG_SHARED_DATA);
} else {
ASSERT(!arc_buf_is_shared(buf));
uint64_t size = arc_buf_size(buf);
arc_free_data_buf(hdr, buf->b_data, size, buf);
ARCSTAT_INCR(arcstat_overhead_size, -size);
@ -3104,9 +3109,9 @@ arc_buf_destroy_impl(arc_buf_t *buf)
*/
if (lastbuf != NULL && !ARC_BUF_ENCRYPTED(lastbuf)) {
/* Only one buf can be shared at once */
VERIFY(!arc_buf_is_shared(lastbuf));
ASSERT(!arc_buf_is_shared(lastbuf));
/* hdr is uncompressed so can't have compressed buf */
VERIFY(!ARC_BUF_COMPRESSED(lastbuf));
ASSERT(!ARC_BUF_COMPRESSED(lastbuf));
ASSERT3P(hdr->b_l1hdr.b_pabd, !=, NULL);
arc_hdr_free_abd(hdr, B_FALSE);
@ -6189,7 +6194,7 @@ arc_release(arc_buf_t *buf, const void *tag)
ASSERT(hdr->b_l1hdr.b_buf != buf || buf->b_next != NULL);
VERIFY3S(remove_reference(hdr, tag), >, 0);
if (arc_buf_is_shared(buf) && !ARC_BUF_COMPRESSED(buf)) {
if (ARC_BUF_SHARED(buf) && !ARC_BUF_COMPRESSED(buf)) {
ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
ASSERT(ARC_BUF_LAST(buf));
}
@ -6206,9 +6211,9 @@ arc_release(arc_buf_t *buf, const void *tag)
* If the current arc_buf_t and the hdr are sharing their data
* buffer, then we must stop sharing that block.
*/
if (arc_buf_is_shared(buf)) {
if (ARC_BUF_SHARED(buf)) {
ASSERT3P(hdr->b_l1hdr.b_buf, !=, buf);
VERIFY(!arc_buf_is_shared(lastbuf));
ASSERT(!arc_buf_is_shared(lastbuf));
/*
* First, sever the block sharing relationship between
@ -6241,7 +6246,7 @@ arc_release(arc_buf_t *buf, const void *tag)
*/
ASSERT(arc_buf_is_shared(lastbuf) ||
arc_hdr_get_compress(hdr) != ZIO_COMPRESS_OFF);
ASSERT(!ARC_BUF_SHARED(buf));
ASSERT(!arc_buf_is_shared(buf));
}
ASSERT(hdr->b_l1hdr.b_pabd != NULL || HDR_HAS_RABD(hdr));
@ -6335,9 +6340,10 @@ arc_write_ready(zio_t *zio)
arc_cksum_free(hdr);
arc_buf_unwatch(buf);
if (hdr->b_l1hdr.b_pabd != NULL) {
if (arc_buf_is_shared(buf)) {
if (ARC_BUF_SHARED(buf)) {
arc_unshare_buf(hdr, buf);
} else {
ASSERT(!arc_buf_is_shared(buf));
arc_hdr_free_abd(hdr, B_FALSE);
}
}
@ -6636,9 +6642,10 @@ arc_write(zio_t *pio, spa_t *spa, uint64_t txg,
* The hdr will remain with a NULL data pointer and the
* buf will take sole ownership of the block.
*/
if (arc_buf_is_shared(buf)) {
if (ARC_BUF_SHARED(buf)) {
arc_unshare_buf(hdr, buf);
} else {
ASSERT(!arc_buf_is_shared(buf));
arc_hdr_free_abd(hdr, B_FALSE);
}
VERIFY3P(buf->b_data, !=, NULL);

View file

@ -965,18 +965,18 @@ dsl_pool_need_dirty_delay(dsl_pool_t *dp)
uint64_t delay_min_bytes =
zfs_dirty_data_max * zfs_delay_min_dirty_percent / 100;
mutex_enter(&dp->dp_lock);
uint64_t dirty = dp->dp_dirty_total;
mutex_exit(&dp->dp_lock);
return (dirty > delay_min_bytes);
/*
* We are not taking the dp_lock here and few other places, since torn
* reads are unlikely: on 64-bit systems due to register size and on
* 32-bit due to memory constraints. Pool-wide locks in hot path may
* be too expensive, while we do not need a precise result here.
*/
return (dp->dp_dirty_total > delay_min_bytes);
}
static boolean_t
dsl_pool_need_dirty_sync(dsl_pool_t *dp, uint64_t txg)
{
ASSERT(MUTEX_HELD(&dp->dp_lock));
uint64_t dirty_min_bytes =
zfs_dirty_data_max * zfs_dirty_data_sync_percent / 100;
uint64_t dirty = dp->dp_dirty_pertxg[txg & TXG_MASK];

View file

@ -4215,6 +4215,7 @@ vdev_online(spa_t *spa, uint64_t guid, uint64_t flags, vdev_state_t *newstate)
/* XXX - L2ARC 1.0 does not support expansion */
if (vd->vdev_aux)
return (spa_vdev_state_exit(spa, vd, ENOTSUP));
spa->spa_ccw_fail_time = 0;
spa_async_request(spa, SPA_ASYNC_CONFIG_UPDATE);
}

View file

@ -347,18 +347,32 @@ zfs_set_userquota(zfsvfs_t *zfsvfs, zfs_userquota_prop_t type,
if (*objp == 0) {
*objp = zap_create(zfsvfs->z_os, DMU_OT_USERGROUP_QUOTA,
DMU_OT_NONE, 0, tx);
VERIFY(0 == zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
VERIFY0(zap_add(zfsvfs->z_os, MASTER_NODE_OBJ,
zfs_userquota_prop_prefixes[type], 8, 1, objp, tx));
}
mutex_exit(&zfsvfs->z_lock);
if (quota == 0) {
err = zap_remove(zfsvfs->z_os, *objp, buf, tx);
if (err == ENOENT)
err = 0;
/*
* If the quota contains no more entries after the entry
* was removed, destroy the quota zap and remove the
* reference from zfsvfs. This will save us unnecessary
* zap_lookups for the quota during writes.
*/
uint64_t zap_nentries;
VERIFY0(zap_count(zfsvfs->z_os, *objp, &zap_nentries));
if (zap_nentries == 0) {
VERIFY0(zap_remove(zfsvfs->z_os, MASTER_NODE_OBJ,
zfs_userquota_prop_prefixes[type], tx));
VERIFY0(zap_destroy(zfsvfs->z_os, *objp, tx));
*objp = 0;
}
} else {
err = zap_update(zfsvfs->z_os, *objp, buf, 8, 1, &quota, tx);
}
mutex_exit(&zfsvfs->z_lock);
ASSERT(err == 0);
if (fuid_dirtied)
zfs_fuid_sync(zfsvfs, tx);

View file

@ -145,7 +145,7 @@ static int zil_nocacheflush = 0;
* Any writes above that will be executed with lower (asynchronous) priority
* to limit potential SLOG device abuse by single active ZIL writer.
*/
static uint64_t zil_slog_bulk = 768 * 1024;
static uint64_t zil_slog_bulk = 64 * 1024 * 1024;
static kmem_cache_t *zil_lwb_cache;
static kmem_cache_t *zil_zcw_cache;

View file

@ -122,10 +122,10 @@ tags = ['functional', 'fallocate']
[tests/functional/fault:Linux]
tests = ['auto_offline_001_pos', 'auto_online_001_pos', 'auto_online_002_pos',
'auto_replace_001_pos', 'auto_spare_001_pos', 'auto_spare_002_pos',
'auto_spare_multiple', 'auto_spare_ashift', 'auto_spare_shared',
'decrypt_fault', 'decompress_fault', 'scrub_after_resilver',
'zpool_status_-s']
'auto_replace_001_pos', 'auto_replace_002_pos', 'auto_spare_001_pos',
'auto_spare_002_pos', 'auto_spare_multiple', 'auto_spare_ashift',
'auto_spare_shared', 'decrypt_fault', 'decompress_fault',
'scrub_after_resilver', 'zpool_status_-s']
tags = ['functional', 'fault']
[tests/functional/features/large_dnode:Linux]

View file

@ -328,6 +328,7 @@ if os.environ.get('CI') == 'true':
'fault/auto_online_001_pos': ['SKIP', ci_reason],
'fault/auto_online_002_pos': ['SKIP', ci_reason],
'fault/auto_replace_001_pos': ['SKIP', ci_reason],
'fault/auto_replace_002_pos': ['SKIP', ci_reason],
'fault/auto_spare_ashift': ['SKIP', ci_reason],
'fault/auto_spare_shared': ['SKIP', ci_reason],
'procfs/pool_state': ['SKIP', ci_reason],

View file

@ -130,12 +130,14 @@ export SYSTEM_FILES_LINUX='attr
chattr
exportfs
fallocate
flock
free
getfattr
groupadd
groupdel
groupmod
hostid
logger
losetup
lsattr
lsblk
@ -145,21 +147,20 @@ export SYSTEM_FILES_LINUX='attr
md5sum
mkswap
modprobe
mountpoint
mpstat
nsenter
parted
perf
setfattr
setpriv
sha256sum
udevadm
unshare
useradd
userdel
usermod
setpriv
mountpoint
flock
logger'
wipefs'
export ZFS_FILES='zdb
zfs

View file

@ -1431,6 +1431,7 @@ nobase_dist_datadir_zfs_tests_tests_SCRIPTS += \
functional/fault/auto_online_001_pos.ksh \
functional/fault/auto_online_002_pos.ksh \
functional/fault/auto_replace_001_pos.ksh \
functional/fault/auto_replace_002_pos.ksh \
functional/fault/auto_spare_001_pos.ksh \
functional/fault/auto_spare_002_pos.ksh \
functional/fault/auto_spare_ashift.ksh \

View file

@ -29,7 +29,7 @@
#
if is_linux; then
# these are the set of setable ZFS properties
# these are the set of settable ZFS properties
PROP_NAMES="\
acltype atime \
checksum compression devices \
@ -81,7 +81,7 @@ elif is_freebsd; then
hidden"
else
# these are the set of setable ZFS properties
# these are the set of settable ZFS properties
PROP_NAMES="\
aclinherit aclmode atime \
checksum compression devices \

View file

@ -34,13 +34,14 @@
# 1. Update /etc/zfs/vdev_id.conf with scsidebug alias for a persistent path.
# This creates keys ID_VDEV and ID_VDEV_PATH and set phys_path="scsidebug".
# 2. Create a pool and set autoreplace=on (auto-replace is opt-in)
# 3. Export a pool
# 3. Export the pool
# 4. Wipe and offline the scsi_debug disk
# 5. Import pool with missing disk
# 5. Import the pool with missing disk
# 6. Re-online the wiped scsi_debug disk
# 7. Verify the ZED detects the new unused disk and adds it back to the pool
# 7. Verify ZED detects the new blank disk and replaces the missing vdev
# 8. Verify that the scsi_debug disk was re-partitioned
#
# Creates a raidz1 zpool using persistent disk path names
# Creates a raidz1 zpool using persistent /dev/disk/by-vdev path names
# (ie not /dev/sdc)
#
# Auto-replace is opt in, and matches by phys_path.
@ -83,11 +84,27 @@ log_must zpool create -f $TESTPOOL raidz1 $SD_DEVICE $DISK1 $DISK2 $DISK3
log_must zpool set autoreplace=on $TESTPOOL
# Add some data to the pool
log_must mkfile $FSIZE /$TESTPOOL/data
log_must zfs create $TESTPOOL/fs
log_must fill_fs /$TESTPOOL/fs 4 100 4096 512 Z
log_must zpool export $TESTPOOL
# Record the partition UUID for later comparison
part_uuid=$(udevadm info --query=property --property=ID_PART_TABLE_UUID \
--value /dev/disk/by-id/$SD_DEVICE_ID)
[[ -z "$part_uuid" ]] || log_note original disk GPT uuid ${part_uuid}
#
# Wipe and offline the disk
#
# Note that it is not enough to zero the disk to expunge the partitions.
# You also need to inform the kernel (e.g., 'hdparm -z' or 'partprobe').
#
# Using partprobe is overkill and hdparm is not as common as wipefs. So
# we use wipefs which lets the kernel know the partition was removed
# from the device (i.e., calls BLKRRPART ioctl).
#
log_must dd if=/dev/zero of=/dev/disk/by-id/$SD_DEVICE_ID bs=1M count=$SDSIZE
log_must /usr/sbin/wipefs -a /dev/disk/by-id/$SD_DEVICE_ID
remove_disk $SD
block_device_wait
@ -106,4 +123,18 @@ log_must wait_replacing $TESTPOOL 60
# Validate auto-replace was successful
log_must check_state $TESTPOOL "" "ONLINE"
#
# Confirm the partition UUID changed so we know the new disk was relabeled
#
# Note: some older versions of udevadm don't support "--property" option so
# we'll # skip this test when it is not supported
#
if [ ! -z "$part_uuid" ]; then
new_uuid=$(udevadm info --query=property --property=ID_PART_TABLE_UUID \
--value /dev/disk/by-id/$SD_DEVICE_ID)
log_note new disk GPT uuid ${new_uuid}
[[ "$part_uuid" = "$new_uuid" ]] && \
log_fail "The new disk was not relabeled as expected"
fi
log_pass "Auto-replace test successful"

View file

@ -0,0 +1,192 @@
#!/bin/ksh -p
#
# CDDL HEADER START
#
# The contents of this file are subject to the terms of the
# Common Development and Distribution License (the "License").
# You may not use this file except in compliance with the License.
#
# You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
# or https://opensource.org/licenses/CDDL-1.0.
# See the License for the specific language governing permissions
# and limitations under the License.
#
# When distributing Covered Code, include this CDDL HEADER in each
# file and include the License file at usr/src/OPENSOLARIS.LICENSE.
# If applicable, add the following below this CDDL HEADER, with the
# fields enclosed by brackets "[]" replaced with your own identifying
# information: Portions Copyright [yyyy] [name of copyright owner]
#
# CDDL HEADER END
#
#
# Copyright (c) 2017 by Intel Corporation. All rights reserved.
# Copyright (c) 2023 by Klara, Inc. All rights reserved.
#
. $STF_SUITE/include/libtest.shlib
. $STF_SUITE/tests/functional/fault/fault.cfg
#
# DESCRIPTION:
# Testing Fault Management Agent ZED Logic - Automated Auto-Replace Test.
# Verifys that auto-replace works with by-id paths.
#
# STRATEGY:
# 1. Update /etc/zfs/vdev_id.conf with scsidebug alias for a persistent path.
# This creates keys ID_VDEV and ID_VDEV_PATH and set phys_path="scsidebug".
# 2. Create a pool and set autoreplace=on (auto-replace is opt-in)
# 3. Export the pool
# 4. Wipe and offline the scsi_debug disk
# 5. Import the pool with missing disk
# 6. Re-online the wiped scsi_debug disk with a new serial number
# 7. Verify ZED detects the new blank disk and replaces the missing vdev
# 8. Verify that the scsi_debug disk was re-partitioned
#
# Creates a raidz1 zpool using persistent /dev/disk/by-id path names
#
# Auto-replace is opt in, and matches by phys_path.
#
verify_runnable "both"
if ! is_physical_device $DISKS; then
log_unsupported "Unsupported disks for this test."
fi
function cleanup
{
zpool status $TESTPOOL
destroy_pool $TESTPOOL
sed -i '/alias scsidebug/d' $VDEVID_CONF
unload_scsi_debug
}
#
# Wait until a vdev transitions to its replacement vdev
#
# Return 0 when vdev reaches expected state, 1 on timeout.
#
# Note: index +2 is to skip over root and raidz-0 vdevs
#
function wait_vdev_online # pool index oldguid timeout
{
typeset pool=$1
typeset -i index=$2+2
typeset guid=$3
typeset timeout=${4:-60}
typeset -i i=0
while [[ $i -lt $timeout ]]; do
vdev_guids=( $(zpool get -H -o value guid $pool all-vdevs) )
if [ "${vdev_guids[$index]}" != "${guid}" ]; then
log_note "new vdev[$((index-2))]: ${vdev_guids[$index]}, replacing ${guid}"
return 0
fi
i=$((i+1))
sleep 1
done
return 1
}
log_assert "automated auto-replace with by-id paths"
log_onexit cleanup
load_scsi_debug $SDSIZE $SDHOSTS $SDTGTS $SDLUNS '512b'
SD=$(get_debug_device)
SD_DEVICE_ID=$(get_persistent_disk_name $SD)
SD_HOST=$(get_scsi_host $SD)
# Register vdev_id alias for scsi_debug device to create a persistent path
echo "alias scsidebug /dev/disk/by-id/$SD_DEVICE_ID" >>$VDEVID_CONF
block_device_wait
SD_DEVICE=$(udevadm info -q all -n $DEV_DSKDIR/$SD | \
awk -F'=' '/ID_VDEV=/ {print $2; exit}')
[ -z $SD_DEVICE ] && log_fail "vdev rule was not registered properly"
log_must zpool events -c
log_must zpool create -f $TESTPOOL raidz1 $SD_DEVICE_ID $DISK1 $DISK2 $DISK3
vdev_guid=$(zpool get guid -H -o value $TESTPOOL $SD_DEVICE_ID)
log_note original vdev guid ${vdev_guid}
# Auto-replace is opt-in so need to set property
log_must zpool set autoreplace=on $TESTPOOL
# Add some data to the pool
log_must zfs create $TESTPOOL/fs
log_must fill_fs /$TESTPOOL/fs 4 100 4096 512 Z
log_must zpool export $TESTPOOL
# Record the partition UUID for later comparison
part_uuid=$(udevadm info --query=property --property=ID_PART_TABLE_UUID \
--value /dev/disk/by-id/$SD_DEVICE_ID)
[[ -z "$part_uuid" ]] || log_note original disk GPT uuid ${part_uuid}
#
# Wipe and offline the disk
#
# Note that it is not enough to zero the disk to expunge the partitions.
# You also need to inform the kernel (e.g., 'hdparm -z' or 'partprobe').
#
# Using partprobe is overkill and hdparm is not as common as wipefs. So
# we use wipefs which lets the kernel know the partition was removed
# from the device (i.e., calls BLKRRPART ioctl).
#
log_must dd if=/dev/zero of=/dev/disk/by-id/$SD_DEVICE_ID bs=1M count=$SDSIZE
log_must /usr/sbin/wipefs -a /dev/disk/by-id/$SD_DEVICE_ID
remove_disk $SD
block_device_wait
# Re-import pool with drive missing
log_must zpool import $TESTPOOL
log_must check_state $TESTPOOL "" "DEGRADED"
block_device_wait
#
# Online an empty disk in the same physical location, with a different by-id
# symlink. We use vpd_use_hostno to make sure the underlying serial number
# changes for the new disk which in turn gives us a different by-id path.
#
# The original names were something like:
# /dev/disk/by-id/scsi-SLinux_scsi_debug_16000-part1
# /dev/disk/by-id/wwn-0x33333330000007d0-part1
#
# This new inserted disk, will have different links like:
# /dev/disk/by-id/scsi-SLinux_scsi_debug_2000-part1
# /dev/disk/by-id/wwn-0x0x3333333000003e80 -part1
#
echo '0' > /sys/bus/pseudo/drivers/scsi_debug/vpd_use_hostno
insert_disk $SD $SD_HOST
# make sure the physical path points to the same scsi-debug device
SD_DEVICE_ID=$(get_persistent_disk_name $SD)
echo "alias scsidebug /dev/disk/by-id/$SD_DEVICE_ID" >>$VDEVID_CONF
block_device_wait
# Wait for the new disk to be online and replaced
log_must wait_vdev_online $TESTPOOL 0 $vdev_guid 45
log_must wait_replacing $TESTPOOL 45
# Validate auto-replace was successful
log_must check_state $TESTPOOL "" "ONLINE"
#
# Confirm the partition UUID changed so we know the new disk was relabeled
#
# Note: some older versions of udevadm don't support "--property" option so
# we'll # skip this test when it is not supported
#
if [ ! -z "$part_uuid" ]; then
new_uuid=$(udevadm info --query=property --property=ID_PART_TABLE_UUID \
--value /dev/disk/by-id/$SD_DEVICE_ID)
log_note new disk GPT uuid ${new_uuid}
[[ "$part_uuid" = "$new_uuid" ]] && \
log_fail "The new disk was not relabeled as expected"
fi
log_pass "automated auto-replace with by-id paths"

View file

@ -141,7 +141,11 @@ remove_disk $SDISK
# background since the command will hang when the pool gets suspended. The
# command will resume and exit after we restore the missing disk later on.
zpool scrub $TESTPOOL2 &
sleep 3 # Give the scrub some time to run before we check if it fails
# Once we trigger the zpool scrub, all zpool/zfs command gets stuck for 180 seconds.
# Post 180 seconds zpool/zfs commands gets start executing however few more seconds(10s)
# it take to update the status.
# hence sleeping for 200 seconds so that we get the correct status.
sleep 200 # Give the scrub some time to run before we check if it fails
log_must check_all $TESTPOOL2 "SUSPENDED"

View file

@ -1110,7 +1110,7 @@
/* #undef ZFS_IS_GPL_COMPATIBLE */
/* Define the project alias string. */
#define ZFS_META_ALIAS "zfs-2.2.99-FreeBSD_g8a7407012"
#define ZFS_META_ALIAS "zfs-2.2.99-FreeBSD_g797f55ef1"
/* Define the project author. */
#define ZFS_META_AUTHOR "OpenZFS"
@ -1140,7 +1140,7 @@
#define ZFS_META_NAME "zfs"
/* Define the project release. */
#define ZFS_META_RELEASE "FreeBSD_g8a7407012"
#define ZFS_META_RELEASE "FreeBSD_g797f55ef1"
/* Define the project version. */
#define ZFS_META_VERSION "2.2.99"

View file

@ -1 +1 @@
#define ZFS_META_GITREV "zfs-2.2.99-151-g8a7407012"
#define ZFS_META_GITREV "zfs-2.2.99-162-g797f55ef1"