6171 dsl_prop_unregister() slows down dataset eviction.

03bad06fbb

https://www.illumos.org/issues/6171
  A change to a property on a dataset must be propagated to its descendants
  in case that property is inherited. For datasets whose information is
  not currently loaded into memory (e.g. a snapshot that isn't currently
  mounted), there is nothing to do; the property change will take effect
  the next time that dataset is loaded. To handle updates to datasets that
  are in-core, ZFS registers a callback entry for each property of each
  loaded dataset with the dsl directory that holds that dataset. There
  is a dsl directory associated with each live dataset that references
  both the live dataset and any snapshots of the live dataset. A property
  change is effected by doing a traversal of the tree of dsl directories
  for a pool, starting at the directory sourcing the change, and invoking
  these callbacks.
  The current implementation both registers and de-registers properties
  individually for each loaded dataset. While registration for a property is
  O(1) (insert into a list), de-registration is O(n) (search list and then
  remove). The 'n' for de-registration, however, is not limited to the size
  (number of snapshots + 1) of the dsl directory. The eviction portion
  of the life cycle for the in core state of datasets is asynchronous,
  which allows multiple copies of the dataset information to be in-core
  at once. Only one of these copies is active at any time with the rest
  going through tear down processing, but all copies contribute to the
  cost of performing a dsl_prop_unregister().
  ...
  The fix employed here is to make property de-registration O(1). With
  this change in place, it is hoped that a single thread is more than
  sufficient to handle eviction processing. If it isn't, the problem can
  be solved by increasing the number of threads devoted to the eviction
  taskq.

Author: Justin Gibbs <gibbs@scsiguy.com>
Reviewed by: Matthew Ahrens <mahrens@delphix.com>
Reviewed by: Prakash Surya <prakash.surya@delphix.com>
Approved by: Dan McDonald <danmcd@omniti.com>
This commit is contained in:
Andriy Gapon 2015-09-21 12:00:12 +00:00
parent f38dd2a625
commit ddd84277c9
8 changed files with 171 additions and 204 deletions

View file

@ -680,40 +680,8 @@ dmu_objset_evict(objset_t *os)
for (int t = 0; t < TXG_SIZE; t++)
ASSERT(!dmu_objset_is_dirty(os, t));
if (ds) {
if (!ds->ds_is_snapshot) {
VERIFY0(dsl_prop_unregister(ds,
zfs_prop_to_name(ZFS_PROP_CHECKSUM),
checksum_changed_cb, os));
VERIFY0(dsl_prop_unregister(ds,
zfs_prop_to_name(ZFS_PROP_COMPRESSION),
compression_changed_cb, os));
VERIFY0(dsl_prop_unregister(ds,
zfs_prop_to_name(ZFS_PROP_COPIES),
copies_changed_cb, os));
VERIFY0(dsl_prop_unregister(ds,
zfs_prop_to_name(ZFS_PROP_DEDUP),
dedup_changed_cb, os));
VERIFY0(dsl_prop_unregister(ds,
zfs_prop_to_name(ZFS_PROP_LOGBIAS),
logbias_changed_cb, os));
VERIFY0(dsl_prop_unregister(ds,
zfs_prop_to_name(ZFS_PROP_SYNC),
sync_changed_cb, os));
VERIFY0(dsl_prop_unregister(ds,
zfs_prop_to_name(ZFS_PROP_REDUNDANT_METADATA),
redundant_metadata_changed_cb, os));
VERIFY0(dsl_prop_unregister(ds,
zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
recordsize_changed_cb, os));
}
VERIFY0(dsl_prop_unregister(ds,
zfs_prop_to_name(ZFS_PROP_PRIMARYCACHE),
primary_cache_changed_cb, os));
VERIFY0(dsl_prop_unregister(ds,
zfs_prop_to_name(ZFS_PROP_SECONDARYCACHE),
secondary_cache_changed_cb, os));
}
if (ds)
dsl_prop_unregister_all(ds, os);
if (os->os_sa)
sa_tear_down(os);

View file

@ -282,6 +282,7 @@ dsl_dataset_evict(void *dbu)
ASSERT(!list_link_active(&ds->ds_synced_link));
list_destroy(&ds->ds_prop_cbs);
mutex_destroy(&ds->ds_lock);
mutex_destroy(&ds->ds_opening_lock);
mutex_destroy(&ds->ds_sendstream_lock);
@ -424,6 +425,9 @@ dsl_dataset_hold_obj(dsl_pool_t *dp, uint64_t dsobj, void *tag,
list_create(&ds->ds_sendstreams, sizeof (dmu_sendarg_t),
offsetof(dmu_sendarg_t, dsa_link));
list_create(&ds->ds_prop_cbs, sizeof (dsl_prop_cb_record_t),
offsetof(dsl_prop_cb_record_t, cbr_ds_node));
if (doi.doi_type == DMU_OTN_ZAP_METADATA) {
for (spa_feature_t f = 0; f < SPA_FEATURES; f++) {
if (!(spa_feature_table[f].fi_flags &

View file

@ -147,11 +147,7 @@ dsl_dir_evict(void *dbu)
spa_async_close(dd->dd_pool->dp_spa, dd);
/*
* The props callback list should have been cleaned up by
* objset_evict().
*/
list_destroy(&dd->dd_prop_cbs);
dsl_prop_fini(dd);
mutex_destroy(&dd->dd_lock);
kmem_free(dd, sizeof (dsl_dir_t));
}
@ -186,9 +182,7 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
dd->dd_dbuf = dbuf;
dd->dd_pool = dp;
mutex_init(&dd->dd_lock, NULL, MUTEX_DEFAULT, NULL);
list_create(&dd->dd_prop_cbs, sizeof (dsl_prop_cb_record_t),
offsetof(dsl_prop_cb_record_t, cbr_node));
dsl_prop_init(dd);
dsl_dir_snap_cmtime_update(dd);
@ -246,6 +240,7 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
if (winner != NULL) {
if (dd->dd_parent)
dsl_dir_rele(dd->dd_parent, dd);
dsl_prop_fini(dd);
mutex_destroy(&dd->dd_lock);
kmem_free(dd, sizeof (dsl_dir_t));
dd = winner;
@ -273,6 +268,7 @@ dsl_dir_hold_obj(dsl_pool_t *dp, uint64_t ddobj,
errout:
if (dd->dd_parent)
dsl_dir_rele(dd->dd_parent, dd);
dsl_prop_fini(dd);
mutex_destroy(&dd->dd_lock);
kmem_free(dd, sizeof (dsl_dir_t));
dmu_buf_rele(dbuf, tag);

View file

@ -216,6 +216,58 @@ dsl_prop_get_ds(dsl_dataset_t *ds, const char *propname,
intsz, numints, buf, setpoint, ds->ds_is_snapshot));
}
static dsl_prop_record_t *
dsl_prop_record_find(dsl_dir_t *dd, const char *propname)
{
dsl_prop_record_t *pr = NULL;
ASSERT(MUTEX_HELD(&dd->dd_lock));
for (pr = list_head(&dd->dd_props);
pr != NULL; pr = list_next(&dd->dd_props, pr)) {
if (strcmp(pr->pr_propname, propname) == 0)
break;
}
return (pr);
}
static dsl_prop_record_t *
dsl_prop_record_create(dsl_dir_t *dd, const char *propname)
{
dsl_prop_record_t *pr;
ASSERT(MUTEX_HELD(&dd->dd_lock));
pr = kmem_alloc(sizeof (dsl_prop_record_t), KM_SLEEP);
pr->pr_propname = spa_strdup(propname);
list_create(&pr->pr_cbs, sizeof (dsl_prop_cb_record_t),
offsetof(dsl_prop_cb_record_t, cbr_pr_node));
list_insert_head(&dd->dd_props, pr);
return (pr);
}
void
dsl_prop_init(dsl_dir_t *dd)
{
list_create(&dd->dd_props, sizeof (dsl_prop_record_t),
offsetof(dsl_prop_record_t, pr_node));
}
void
dsl_prop_fini(dsl_dir_t *dd)
{
dsl_prop_record_t *pr;
while ((pr = list_remove_head(&dd->dd_props)) != NULL) {
list_destroy(&pr->pr_cbs);
strfree((char *)pr->pr_propname);
kmem_free(pr, sizeof (dsl_prop_record_t));
}
list_destroy(&dd->dd_props);
}
/*
* Register interest in the named property. We'll call the callback
* once to notify it of the current property value, and again each time
@ -230,6 +282,7 @@ dsl_prop_register(dsl_dataset_t *ds, const char *propname,
dsl_dir_t *dd = ds->ds_dir;
dsl_pool_t *dp = dd->dd_pool;
uint64_t value;
dsl_prop_record_t *pr;
dsl_prop_cb_record_t *cbr;
int err;
@ -241,12 +294,16 @@ dsl_prop_register(dsl_dataset_t *ds, const char *propname,
cbr = kmem_alloc(sizeof (dsl_prop_cb_record_t), KM_SLEEP);
cbr->cbr_ds = ds;
cbr->cbr_propname = kmem_alloc(strlen(propname)+1, KM_SLEEP);
(void) strcpy((char *)cbr->cbr_propname, propname);
cbr->cbr_func = callback;
cbr->cbr_arg = cbarg;
mutex_enter(&dd->dd_lock);
list_insert_head(&dd->dd_prop_cbs, cbr);
pr = dsl_prop_record_find(dd, propname);
if (pr == NULL)
pr = dsl_prop_record_create(dd, propname);
cbr->cbr_pr = pr;
list_insert_head(&pr->pr_cbs, cbr);
list_insert_head(&ds->ds_prop_cbs, cbr);
mutex_exit(&dd->dd_lock);
cbr->cbr_func(cbr->cbr_arg, value);
@ -377,56 +434,34 @@ dsl_prop_predict(dsl_dir_t *dd, const char *propname,
}
/*
* Unregister this callback. Return 0 on success, ENOENT if ddname is
* invalid, or ENOMSG if no matching callback registered.
* Unregister all callbacks that are registered with the
* given callback argument.
*/
int
dsl_prop_unregister(dsl_dataset_t *ds, const char *propname,
dsl_prop_changed_cb_t *callback, void *cbarg)
void
dsl_prop_unregister_all(dsl_dataset_t *ds, void *cbarg)
{
dsl_prop_cb_record_t *cbr, *next_cbr;
dsl_dir_t *dd = ds->ds_dir;
dsl_prop_cb_record_t *cbr;
mutex_enter(&dd->dd_lock);
for (cbr = list_head(&dd->dd_prop_cbs);
cbr; cbr = list_next(&dd->dd_prop_cbs, cbr)) {
if (cbr->cbr_ds == ds &&
cbr->cbr_func == callback &&
cbr->cbr_arg == cbarg &&
strcmp(cbr->cbr_propname, propname) == 0)
break;
next_cbr = list_head(&ds->ds_prop_cbs);
while (next_cbr != NULL) {
cbr = next_cbr;
next_cbr = list_next(&ds->ds_prop_cbs, cbr);
if (cbr->cbr_arg == cbarg) {
list_remove(&ds->ds_prop_cbs, cbr);
list_remove(&cbr->cbr_pr->pr_cbs, cbr);
kmem_free(cbr, sizeof (dsl_prop_cb_record_t));
}
}
if (cbr == NULL) {
mutex_exit(&dd->dd_lock);
return (SET_ERROR(ENOMSG));
}
list_remove(&dd->dd_prop_cbs, cbr);
mutex_exit(&dd->dd_lock);
kmem_free((void*)cbr->cbr_propname, strlen(cbr->cbr_propname)+1);
kmem_free(cbr, sizeof (dsl_prop_cb_record_t));
return (0);
}
boolean_t
dsl_prop_hascb(dsl_dataset_t *ds)
{
dsl_dir_t *dd = ds->ds_dir;
boolean_t rv = B_FALSE;
dsl_prop_cb_record_t *cbr;
mutex_enter(&dd->dd_lock);
for (cbr = list_head(&dd->dd_prop_cbs); cbr;
cbr = list_next(&dd->dd_prop_cbs, cbr)) {
if (cbr->cbr_ds == ds) {
rv = B_TRUE;
break;
}
}
mutex_exit(&dd->dd_lock);
return (rv);
return (!list_is_empty(&ds->ds_prop_cbs));
}
/* ARGSUSED */
@ -434,38 +469,50 @@ static int
dsl_prop_notify_all_cb(dsl_pool_t *dp, dsl_dataset_t *ds, void *arg)
{
dsl_dir_t *dd = ds->ds_dir;
dsl_prop_record_t *pr;
dsl_prop_cb_record_t *cbr;
mutex_enter(&dd->dd_lock);
for (cbr = list_head(&dd->dd_prop_cbs); cbr;
cbr = list_next(&dd->dd_prop_cbs, cbr)) {
uint64_t value;
for (pr = list_head(&dd->dd_props);
pr; pr = list_next(&dd->dd_props, pr)) {
for (cbr = list_head(&pr->pr_cbs); cbr;
cbr = list_next(&pr->pr_cbs, cbr)) {
uint64_t value;
/*
* Callback entries do not have holds on their datasets
* so that datasets with registered callbacks are still
* eligible for eviction. Unlike operations on callbacks
* for a single dataset, we are performing a recursive
* descent of related datasets and the calling context
* for this iteration only has a dataset hold on the root.
* Without a hold, the callback's pointer to the dataset
* could be invalidated by eviction at any time.
*
* Use dsl_dataset_try_add_ref() to verify that the
* dataset has not begun eviction processing and to
* prevent eviction from occurring for the duration
* of the callback. If the hold attempt fails, this
* object is already being evicted and the callback can
* be safely ignored.
*/
if (!dsl_dataset_try_add_ref(dp, cbr->cbr_ds, FTAG))
continue;
/*
* Callback entries do not have holds on their
* datasets so that datasets with registered
* callbacks are still eligible for eviction.
* Unlike operations to update properties on a
* single dataset, we are performing a recursive
* descent of related head datasets. The caller
* of this function only has a dataset hold on
* the passed in head dataset, not the snapshots
* associated with this dataset. Without a hold,
* the dataset pointer within callback records
* for snapshots can be invalidated by eviction
* at any time.
*
* Use dsl_dataset_try_add_ref() to verify
* that the dataset for a snapshot has not
* begun eviction processing and to prevent
* eviction from occurring for the duration of
* the callback. If the hold attempt fails,
* this object is already being evicted and the
* callback can be safely ignored.
*/
if (ds != cbr->cbr_ds &&
!dsl_dataset_try_add_ref(dp, cbr->cbr_ds, FTAG))
continue;
if (dsl_prop_get_ds(cbr->cbr_ds, cbr->cbr_propname,
sizeof (value), 1, &value, NULL) == 0)
cbr->cbr_func(cbr->cbr_arg, value);
if (dsl_prop_get_ds(cbr->cbr_ds,
cbr->cbr_pr->pr_propname, sizeof (value), 1,
&value, NULL) == 0)
cbr->cbr_func(cbr->cbr_arg, value);
dsl_dataset_rele(cbr->cbr_ds, FTAG);
if (ds != cbr->cbr_ds)
dsl_dataset_rele(cbr->cbr_ds, FTAG);
}
}
mutex_exit(&dd->dd_lock);
@ -490,6 +537,7 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
const char *propname, uint64_t value, int first)
{
dsl_dir_t *dd;
dsl_prop_record_t *pr;
dsl_prop_cb_record_t *cbr;
objset_t *mos = dp->dp_meta_objset;
zap_cursor_t zc;
@ -516,30 +564,33 @@ dsl_prop_changed_notify(dsl_pool_t *dp, uint64_t ddobj,
}
mutex_enter(&dd->dd_lock);
for (cbr = list_head(&dd->dd_prop_cbs); cbr;
cbr = list_next(&dd->dd_prop_cbs, cbr)) {
uint64_t propobj;
pr = dsl_prop_record_find(dd, propname);
if (pr != NULL) {
for (cbr = list_head(&pr->pr_cbs); cbr;
cbr = list_next(&pr->pr_cbs, cbr)) {
uint64_t propobj;
/*
* cbr->cbf_ds may be invalidated due to eviction,
* requiring the use of dsl_dataset_try_add_ref().
* See comment block in dsl_prop_notify_all_cb()
* for details.
*/
if (strcmp(cbr->cbr_propname, propname) != 0 ||
!dsl_dataset_try_add_ref(dp, cbr->cbr_ds, FTAG))
continue;
/*
* cbr->cbr_ds may be invalidated due to eviction,
* requiring the use of dsl_dataset_try_add_ref().
* See comment block in dsl_prop_notify_all_cb()
* for details.
*/
if (!dsl_dataset_try_add_ref(dp, cbr->cbr_ds, FTAG))
continue;
propobj = dsl_dataset_phys(cbr->cbr_ds)->ds_props_obj;
propobj = dsl_dataset_phys(cbr->cbr_ds)->ds_props_obj;
/*
* If the property is not set on this ds, then it is
* inherited here; call the callback.
*/
if (propobj == 0 || zap_contains(mos, propobj, propname) != 0)
cbr->cbr_func(cbr->cbr_arg, value);
/*
* If the property is not set on this ds, then it is
* inherited here; call the callback.
*/
if (propobj == 0 ||
zap_contains(mos, propobj, propname) != 0)
cbr->cbr_func(cbr->cbr_arg, value);
dsl_dataset_rele(cbr->cbr_ds, FTAG);
dsl_dataset_rele(cbr->cbr_ds, FTAG);
}
}
mutex_exit(&dd->dd_lock);
@ -679,10 +730,10 @@ dsl_prop_set_sync_impl(dsl_dataset_t *ds, const char *propname,
* ds here.
*/
mutex_enter(&ds->ds_dir->dd_lock);
for (cbr = list_head(&ds->ds_dir->dd_prop_cbs); cbr;
cbr = list_next(&ds->ds_dir->dd_prop_cbs, cbr)) {
if (cbr->cbr_ds == ds &&
strcmp(cbr->cbr_propname, propname) == 0)
for (cbr = list_head(&ds->ds_prop_cbs); cbr;
cbr = list_next(&ds->ds_prop_cbs, cbr)) {
if (strcmp(cbr->cbr_pr->pr_propname,
propname) == 0)
cbr->cbr_func(cbr->cbr_arg, intval);
}
mutex_exit(&ds->ds_dir->dd_lock);

View file

@ -184,6 +184,9 @@ typedef struct dsl_dataset {
kmutex_t ds_sendstream_lock;
list_t ds_sendstreams;
/* Protected by our dsl_dir's dd_lock */
list_t ds_prop_cbs;
/*
* For ZFEATURE_FLAG_PER_DATASET features, set if this dataset
* uses this feature.

View file

@ -102,7 +102,7 @@ struct dsl_dir {
/* Protected by dd_lock */
kmutex_t dd_lock;
list_t dd_prop_cbs; /* list of dsl_prop_cb_record_t's */
list_t dd_props; /* list of dsl_prop_record_t's */
timestruc_t dd_snap_cmtime; /* last time snapshot namespace changed */
uint64_t dd_origin_txg;

View file

@ -41,10 +41,17 @@ struct dsl_dir;
/* The callback func may not call into the DMU or DSL! */
typedef void (dsl_prop_changed_cb_t)(void *arg, uint64_t newval);
typedef struct dsl_prop_record {
list_node_t pr_node; /* link on dd_props */
const char *pr_propname;
list_t pr_cbs;
} dsl_prop_record_t;
typedef struct dsl_prop_cb_record {
list_node_t cbr_node; /* link on dd_prop_cbs */
list_node_t cbr_pr_node; /* link on pr_cbs */
list_node_t cbr_ds_node; /* link on ds_prop_cbs */
dsl_prop_record_t *cbr_pr;
struct dsl_dataset *cbr_ds;
const char *cbr_propname;
dsl_prop_changed_cb_t *cbr_func;
void *cbr_arg;
} dsl_prop_cb_record_t;
@ -54,10 +61,11 @@ typedef struct dsl_props_arg {
zprop_source_t pa_source;
} dsl_props_arg_t;
void dsl_prop_init(dsl_dir_t *dd);
void dsl_prop_fini(dsl_dir_t *dd);
int dsl_prop_register(struct dsl_dataset *ds, const char *propname,
dsl_prop_changed_cb_t *callback, void *cbarg);
int dsl_prop_unregister(struct dsl_dataset *ds, const char *propname,
dsl_prop_changed_cb_t *callback, void *cbarg);
void dsl_prop_unregister_all(struct dsl_dataset *ds, void *cbarg);
void dsl_prop_notify_all(struct dsl_dir *dd);
boolean_t dsl_prop_hascb(struct dsl_dataset *ds);

View file

@ -561,33 +561,7 @@ zfs_register_callbacks(vfs_t *vfsp)
return (0);
unregister:
/*
* We may attempt to unregister some callbacks that are not
* registered, but this is OK; it will simply return ENOMSG,
* which we will ignore.
*/
(void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_ATIME),
atime_changed_cb, zfsvfs);
(void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_XATTR),
xattr_changed_cb, zfsvfs);
(void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_RECORDSIZE),
blksz_changed_cb, zfsvfs);
(void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_READONLY),
readonly_changed_cb, zfsvfs);
(void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_DEVICES),
devices_changed_cb, zfsvfs);
(void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_SETUID),
setuid_changed_cb, zfsvfs);
(void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_EXEC),
exec_changed_cb, zfsvfs);
(void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_SNAPDIR),
snapdir_changed_cb, zfsvfs);
(void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_ACLMODE),
acl_mode_changed_cb, zfsvfs);
(void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_ACLINHERIT),
acl_inherit_changed_cb, zfsvfs);
(void) dsl_prop_unregister(ds, zfs_prop_to_name(ZFS_PROP_VSCAN),
vscan_changed_cb, zfsvfs);
dsl_prop_unregister_all(ds, zfsvfs);
return (error);
}
@ -1249,46 +1223,9 @@ void
zfs_unregister_callbacks(zfsvfs_t *zfsvfs)
{
objset_t *os = zfsvfs->z_os;
struct dsl_dataset *ds;
/*
* Unregister properties.
*/
if (!dmu_objset_is_snapshot(os)) {
ds = dmu_objset_ds(os);
VERIFY(dsl_prop_unregister(ds, "atime", atime_changed_cb,
zfsvfs) == 0);
VERIFY(dsl_prop_unregister(ds, "xattr", xattr_changed_cb,
zfsvfs) == 0);
VERIFY(dsl_prop_unregister(ds, "recordsize", blksz_changed_cb,
zfsvfs) == 0);
VERIFY(dsl_prop_unregister(ds, "readonly", readonly_changed_cb,
zfsvfs) == 0);
VERIFY(dsl_prop_unregister(ds, "devices", devices_changed_cb,
zfsvfs) == 0);
VERIFY(dsl_prop_unregister(ds, "setuid", setuid_changed_cb,
zfsvfs) == 0);
VERIFY(dsl_prop_unregister(ds, "exec", exec_changed_cb,
zfsvfs) == 0);
VERIFY(dsl_prop_unregister(ds, "snapdir", snapdir_changed_cb,
zfsvfs) == 0);
VERIFY(dsl_prop_unregister(ds, "aclmode", acl_mode_changed_cb,
zfsvfs) == 0);
VERIFY(dsl_prop_unregister(ds, "aclinherit",
acl_inherit_changed_cb, zfsvfs) == 0);
VERIFY(dsl_prop_unregister(ds, "vscan",
vscan_changed_cb, zfsvfs) == 0);
}
if (!dmu_objset_is_snapshot(os))
dsl_prop_unregister_all(dmu_objset_ds(os), zfsvfs);
}
/*