From 4cdea4a853f628c88cd1a652f34b8d64b2318ffa Mon Sep 17 00:00:00 2001 From: Jeff Roberson Date: Tue, 10 Sep 2019 18:27:45 +0000 Subject: [PATCH] Use the sleepq lock rather than the page lock to protect against wakeup races with page busy state. The object lock is still used as an interlock to ensure that the identity stays valid. Most callers should use vm_page_sleep_if_busy() to handle the locking particulars. Reviewed by: alc, kib, markj Sponsored by: Netflix Differential Revision: https://reviews.freebsd.org/D21255 --- .../opensolaris/uts/common/fs/zfs/zfs_vnops.c | 10 +- sys/dev/drm2/ttm/ttm_bo_vm.c | 5 +- sys/kern/vfs_bio.c | 13 +- sys/vm/phys_pager.c | 5 +- sys/vm/vm_fault.c | 2 +- sys/vm/vm_object.c | 40 ++-- sys/vm/vm_page.c | 184 +++++++----------- sys/vm/vm_page.h | 3 +- 8 files changed, 89 insertions(+), 173 deletions(-) diff --git a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c index d49d7c5a4ae..aa4992ba2d8 100644 --- a/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c +++ b/sys/cddl/contrib/opensolaris/uts/common/fs/zfs/zfs_vnops.c @@ -422,10 +422,7 @@ page_busy(vnode_t *vp, int64_t start, int64_t off, int64_t nbytes) * likely to reclaim it. */ vm_page_reference(pp); - vm_page_lock(pp); - zfs_vmobject_wunlock(obj); - vm_page_busy_sleep(pp, "zfsmwb", true); - zfs_vmobject_wlock(obj); + vm_page_sleep_if_xbusy(pp, "zfsmwb"); continue; } vm_page_sbusy(pp); @@ -473,10 +470,7 @@ page_wire(vnode_t *vp, int64_t start) * likely to reclaim it. */ vm_page_reference(pp); - vm_page_lock(pp); - zfs_vmobject_wunlock(obj); - vm_page_busy_sleep(pp, "zfsmwb", true); - zfs_vmobject_wlock(obj); + vm_page_sleep_if_xbusy(pp, "zfsmwb"); continue; } diff --git a/sys/dev/drm2/ttm/ttm_bo_vm.c b/sys/dev/drm2/ttm/ttm_bo_vm.c index 6dc1fabab28..6bda57e7c97 100644 --- a/sys/dev/drm2/ttm/ttm_bo_vm.c +++ b/sys/dev/drm2/ttm/ttm_bo_vm.c @@ -232,10 +232,7 @@ reserve: VM_OBJECT_WLOCK(vm_obj); if (vm_page_busied(m)) { - vm_page_lock(m); - VM_OBJECT_WUNLOCK(vm_obj); - vm_page_busy_sleep(m, "ttmpbs", false); - VM_OBJECT_WLOCK(vm_obj); + vm_page_sleep_if_busy(m, "ttmpbs"); ttm_mem_io_unlock(man); ttm_bo_unreserve(bo); goto retry; diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index 0bd3590bf78..1e5b44cf60b 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -2931,12 +2931,8 @@ vfs_vmio_invalidate(struct buf *bp) presid = resid > (PAGE_SIZE - poffset) ? (PAGE_SIZE - poffset) : resid; KASSERT(presid >= 0, ("brelse: extra page")); - while (vm_page_xbusied(m)) { - vm_page_lock(m); - VM_OBJECT_WUNLOCK(obj); - vm_page_busy_sleep(m, "mbncsh", true); - VM_OBJECT_WLOCK(obj); - } + while (vm_page_xbusied(m)) + vm_page_sleep_if_xbusy(m, "mbncsh"); if (pmap_page_wired_mappings(m) == 0) vm_page_set_invalid(m, poffset, presid); vm_page_release_locked(m, flags); @@ -4565,10 +4561,7 @@ vfs_drain_busy_pages(struct buf *bp) for (; last_busied < i; last_busied++) vm_page_sbusy(bp->b_pages[last_busied]); while (vm_page_xbusied(m)) { - vm_page_lock(m); - VM_OBJECT_WUNLOCK(bp->b_bufobj->bo_object); - vm_page_busy_sleep(m, "vbpage", true); - VM_OBJECT_WLOCK(bp->b_bufobj->bo_object); + vm_page_sleep_if_xbusy(m, "vbpage"); } } } diff --git a/sys/vm/phys_pager.c b/sys/vm/phys_pager.c index 9a14070b8fb..4b076f43a89 100644 --- a/sys/vm/phys_pager.c +++ b/sys/vm/phys_pager.c @@ -219,10 +219,7 @@ retry: pmap_zero_page(m); m->valid = VM_PAGE_BITS_ALL; } else if (vm_page_xbusied(m)) { - vm_page_lock(m); - VM_OBJECT_WUNLOCK(object); - vm_page_busy_sleep(m, "physb", true); - VM_OBJECT_WLOCK(object); + vm_page_sleep_if_xbusy(m, "physb"); goto retry; } else { vm_page_xbusy(m); diff --git a/sys/vm/vm_fault.c b/sys/vm/vm_fault.c index ac66b79b5f0..35742af74bd 100644 --- a/sys/vm/vm_fault.c +++ b/sys/vm/vm_fault.c @@ -510,7 +510,7 @@ vm_fault_populate(struct faultstate *fs, vm_prot_t prot, int fault_type, *m_hold = &m[i]; vm_page_wire(&m[i]); } - vm_page_xunbusy_maybelocked(&m[i]); + vm_page_xunbusy(&m[i]); } if (m_mtx != NULL) mtx_unlock(m_mtx); diff --git a/sys/vm/vm_object.c b/sys/vm/vm_object.c index ff63f6fdce5..c384c029a4b 100644 --- a/sys/vm/vm_object.c +++ b/sys/vm/vm_object.c @@ -1160,9 +1160,7 @@ next_page: ("vm_object_madvise: page %p is not managed", tm)); if (vm_page_busied(tm)) { if (object != tobject) - VM_OBJECT_WUNLOCK(tobject); - vm_page_lock(tm); - VM_OBJECT_WUNLOCK(object); + VM_OBJECT_WUNLOCK(object); if (advice == MADV_WILLNEED) { /* * Reference the page before unlocking and @@ -1345,10 +1343,7 @@ retry: */ if (vm_page_busied(m)) { VM_OBJECT_WUNLOCK(new_object); - vm_page_lock(m); - VM_OBJECT_WUNLOCK(orig_object); - vm_page_busy_sleep(m, "spltwt", false); - VM_OBJECT_WLOCK(orig_object); + vm_page_sleep_if_busy(m, "spltwt"); VM_OBJECT_WLOCK(new_object); goto retry; } @@ -1415,15 +1410,16 @@ vm_object_collapse_scan_wait(vm_object_t object, vm_page_t p, vm_page_t next, ("invalid ownership %p %p %p", p, object, backing_object)); if ((op & OBSC_COLLAPSE_NOWAIT) != 0) return (next); - if (p != NULL) - vm_page_lock(p); - VM_OBJECT_WUNLOCK(object); - VM_OBJECT_WUNLOCK(backing_object); /* The page is only NULL when rename fails. */ - if (p == NULL) + if (p == NULL) { vm_radix_wait(); - else + } else { + if (p->object == object) + VM_OBJECT_WUNLOCK(backing_object); + else + VM_OBJECT_WUNLOCK(object); vm_page_busy_sleep(p, "vmocol", false); + } VM_OBJECT_WLOCK(object); VM_OBJECT_WLOCK(backing_object); return (TAILQ_FIRST(&backing_object->memq)); @@ -1837,7 +1833,6 @@ vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end, int options) { vm_page_t p, next; - struct mtx *mtx; VM_OBJECT_ASSERT_WLOCKED(object); KASSERT((object->flags & OBJ_UNMANAGED) == 0 || @@ -1848,7 +1843,6 @@ vm_object_page_remove(vm_object_t object, vm_pindex_t start, vm_pindex_t end, vm_object_pip_add(object, 1); again: p = vm_page_find_least(object, start); - mtx = NULL; /* * Here, the variable "p" is either (1) the page with the least pindex @@ -1865,17 +1859,8 @@ again: * however, be invalidated if the option OBJPR_CLEANONLY is * not specified. */ - vm_page_change_lock(p, &mtx); - if (vm_page_xbusied(p)) { - VM_OBJECT_WUNLOCK(object); - vm_page_busy_sleep(p, "vmopax", true); - VM_OBJECT_WLOCK(object); - goto again; - } if (vm_page_busied(p)) { - VM_OBJECT_WUNLOCK(object); - vm_page_busy_sleep(p, "vmopar", false); - VM_OBJECT_WLOCK(object); + vm_page_sleep_if_busy(p, "vmopar"); goto again; } if (vm_page_wired(p)) { @@ -1904,8 +1889,6 @@ wired: goto wired; vm_page_free(p); } - if (mtx != NULL) - mtx_unlock(mtx); vm_object_pip_wakeup(object); } @@ -2190,8 +2173,7 @@ again: m = TAILQ_NEXT(m, listq); } if (vm_page_xbusied(tm)) { - vm_page_lock(tm); - for (tobject = object; locked_depth >= 1; + for (tobject = object; locked_depth > 1; locked_depth--) { t1object = tobject->backing_object; VM_OBJECT_RUNLOCK(tobject); diff --git a/sys/vm/vm_page.c b/sys/vm/vm_page.c index 45e8ccc8904..e26727d2192 100644 --- a/sys/vm/vm_page.c +++ b/sys/vm/vm_page.c @@ -84,6 +84,7 @@ __FBSDID("$FreeBSD$"); #include #include #include +#include #include #include #include @@ -873,27 +874,17 @@ void vm_page_busy_downgrade(vm_page_t m) { u_int x; - bool locked; vm_page_assert_xbusied(m); - locked = mtx_owned(vm_page_lockptr(m)); + x = m->busy_lock; for (;;) { - x = m->busy_lock; - x &= VPB_BIT_WAITERS; - if (x != 0 && !locked) - vm_page_lock(m); - if (atomic_cmpset_rel_int(&m->busy_lock, - VPB_SINGLE_EXCLUSIVER | x, VPB_SHARERS_WORD(1))) + if (atomic_fcmpset_rel_int(&m->busy_lock, + &x, VPB_SHARERS_WORD(1))) break; - if (x != 0 && !locked) - vm_page_unlock(m); } - if (x != 0) { + if ((x & VPB_BIT_WAITERS) != 0) wakeup(m); - if (!locked) - vm_page_unlock(m); - } } /* @@ -920,35 +911,23 @@ vm_page_sunbusy(vm_page_t m) { u_int x; - vm_page_lock_assert(m, MA_NOTOWNED); vm_page_assert_sbusied(m); + x = m->busy_lock; for (;;) { - x = m->busy_lock; if (VPB_SHARERS(x) > 1) { - if (atomic_cmpset_int(&m->busy_lock, x, + if (atomic_fcmpset_int(&m->busy_lock, &x, x - VPB_ONE_SHARER)) break; continue; } - if ((x & VPB_BIT_WAITERS) == 0) { - KASSERT(x == VPB_SHARERS_WORD(1), - ("vm_page_sunbusy: invalid lock state")); - if (atomic_cmpset_int(&m->busy_lock, - VPB_SHARERS_WORD(1), VPB_UNBUSIED)) - break; + KASSERT((x & ~VPB_BIT_WAITERS) == VPB_SHARERS_WORD(1), + ("vm_page_sunbusy: invalid lock state")); + if (!atomic_fcmpset_rel_int(&m->busy_lock, &x, VPB_UNBUSIED)) continue; - } - KASSERT(x == (VPB_SHARERS_WORD(1) | VPB_BIT_WAITERS), - ("vm_page_sunbusy: invalid lock state for waiters")); - - vm_page_lock(m); - if (!atomic_cmpset_int(&m->busy_lock, x, VPB_UNBUSIED)) { - vm_page_unlock(m); - continue; - } + if ((x & VPB_BIT_WAITERS) == 0) + break; wakeup(m); - vm_page_unlock(m); break; } } @@ -956,28 +935,35 @@ vm_page_sunbusy(vm_page_t m) /* * vm_page_busy_sleep: * - * Sleep and release the page lock, using the page pointer as wchan. + * Sleep if the page is busy, using the page pointer as wchan. * This is used to implement the hard-path of busying mechanism. * - * The given page must be locked. - * * If nonshared is true, sleep only if the page is xbusy. + * + * The object lock must be held on entry and will be released on exit. */ void vm_page_busy_sleep(vm_page_t m, const char *wmesg, bool nonshared) { + vm_object_t obj; u_int x; - vm_page_assert_locked(m); + obj = m->object; + vm_page_lock_assert(m, MA_NOTOWNED); + VM_OBJECT_ASSERT_LOCKED(obj); + sleepq_lock(m); x = m->busy_lock; if (x == VPB_UNBUSIED || (nonshared && (x & VPB_BIT_SHARED) != 0) || ((x & VPB_BIT_WAITERS) == 0 && !atomic_cmpset_int(&m->busy_lock, x, x | VPB_BIT_WAITERS))) { - vm_page_unlock(m); + VM_OBJECT_DROP(obj); + sleepq_release(m); return; } - msleep(m, vm_page_lockptr(m), PVM | PDROP, wmesg, 0); + VM_OBJECT_DROP(obj); + sleepq_add(m, NULL, wmesg, 0, 0); + sleepq_wait(m, PVM); } /* @@ -992,55 +978,20 @@ vm_page_trysbusy(vm_page_t m) { u_int x; + x = m->busy_lock; for (;;) { - x = m->busy_lock; if ((x & VPB_BIT_SHARED) == 0) return (0); - if (atomic_cmpset_acq_int(&m->busy_lock, x, x + VPB_ONE_SHARER)) + if (atomic_fcmpset_acq_int(&m->busy_lock, &x, + x + VPB_ONE_SHARER)) return (1); } } -static void -vm_page_xunbusy_locked(vm_page_t m) -{ - - vm_page_assert_xbusied(m); - vm_page_assert_locked(m); - - atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED); - /* There is a waiter, do wakeup() instead of vm_page_flash(). */ - wakeup(m); -} - -void -vm_page_xunbusy_maybelocked(vm_page_t m) -{ - bool lockacq; - - vm_page_assert_xbusied(m); - - /* - * Fast path for unbusy. If it succeeds, we know that there - * are no waiters, so we do not need a wakeup. - */ - if (atomic_cmpset_rel_int(&m->busy_lock, VPB_SINGLE_EXCLUSIVER, - VPB_UNBUSIED)) - return; - - lockacq = !mtx_owned(vm_page_lockptr(m)); - if (lockacq) - vm_page_lock(m); - vm_page_xunbusy_locked(m); - if (lockacq) - vm_page_unlock(m); -} - /* * vm_page_xunbusy_hard: * - * Called after the first try the exclusive unbusy of a page failed. - * It is assumed that the waiters bit is on. + * Called when unbusy has failed because there is a waiter. */ void vm_page_xunbusy_hard(vm_page_t m) @@ -1048,34 +999,10 @@ vm_page_xunbusy_hard(vm_page_t m) vm_page_assert_xbusied(m); - vm_page_lock(m); - vm_page_xunbusy_locked(m); - vm_page_unlock(m); -} - -/* - * vm_page_flash: - * - * Wakeup anyone waiting for the page. - * The ownership bits do not change. - * - * The given page must be locked. - */ -void -vm_page_flash(vm_page_t m) -{ - u_int x; - - vm_page_lock_assert(m, MA_OWNED); - - for (;;) { - x = m->busy_lock; - if ((x & VPB_BIT_WAITERS) == 0) - return; - if (atomic_cmpset_int(&m->busy_lock, x, - x & (~VPB_BIT_WAITERS))) - break; - } + /* + * Wake the waiter. + */ + atomic_store_rel_int(&m->busy_lock, VPB_UNBUSIED); wakeup(m); } @@ -1264,7 +1191,7 @@ vm_page_readahead_finish(vm_page_t m) /* * vm_page_sleep_if_busy: * - * Sleep and release the page queues lock if the page is busied. + * Sleep and release the object lock if the page is busied. * Returns TRUE if the thread slept. * * The given page must be unlocked and object containing it must @@ -1287,8 +1214,6 @@ vm_page_sleep_if_busy(vm_page_t m, const char *msg) * held by the callers. */ obj = m->object; - vm_page_lock(m); - VM_OBJECT_WUNLOCK(obj); vm_page_busy_sleep(m, msg, false); VM_OBJECT_WLOCK(obj); return (TRUE); @@ -1296,6 +1221,39 @@ vm_page_sleep_if_busy(vm_page_t m, const char *msg) return (FALSE); } +/* + * vm_page_sleep_if_xbusy: + * + * Sleep and release the object lock if the page is xbusied. + * Returns TRUE if the thread slept. + * + * The given page must be unlocked and object containing it must + * be locked. + */ +int +vm_page_sleep_if_xbusy(vm_page_t m, const char *msg) +{ + vm_object_t obj; + + vm_page_lock_assert(m, MA_NOTOWNED); + VM_OBJECT_ASSERT_WLOCKED(m->object); + + if (vm_page_xbusied(m)) { + /* + * The page-specific object must be cached because page + * identity can change during the sleep, causing the + * re-lock of a different object. + * It is assumed that a reference to the object is already + * held by the callers. + */ + obj = m->object; + vm_page_busy_sleep(m, msg, true); + VM_OBJECT_WLOCK(obj); + return (TRUE); + } + return (FALSE); +} + /* * vm_page_dirty_KBI: [ internal use only ] * @@ -1452,7 +1410,7 @@ vm_page_object_remove(vm_page_t m) KASSERT((m->ref_count & VPRC_OBJREF) != 0, ("page %p is missing its object ref", m)); if (vm_page_xbusied(m)) - vm_page_xunbusy_maybelocked(m); + vm_page_xunbusy(m); mrem = vm_radix_remove(&object->rtree, m->pindex); KASSERT(mrem == m, ("removed page %p, expected page %p", mrem, m)); @@ -1598,7 +1556,7 @@ vm_page_replace(vm_page_t mnew, vm_object_t object, vm_pindex_t pindex) mold->object = NULL; atomic_clear_int(&mold->ref_count, VPRC_OBJREF); - vm_page_xunbusy_maybelocked(mold); + vm_page_xunbusy(mold); /* * The object's resident_page_count does not change because we have @@ -4089,8 +4047,6 @@ retrylookup: * likely to reclaim it. */ vm_page_aflag_set(m, PGA_REFERENCED); - vm_page_lock(m); - VM_OBJECT_WUNLOCK(object); vm_page_busy_sleep(m, "pgrbwt", (allocflags & VM_ALLOC_IGN_SBUSY) != 0); VM_OBJECT_WLOCK(object); @@ -4188,8 +4144,6 @@ retrylookup: * likely to reclaim it. */ vm_page_aflag_set(m, PGA_REFERENCED); - vm_page_lock(m); - VM_OBJECT_WUNLOCK(object); vm_page_busy_sleep(m, "grbmaw", (allocflags & VM_ALLOC_IGN_SBUSY) != 0); VM_OBJECT_WLOCK(object); diff --git a/sys/vm/vm_page.h b/sys/vm/vm_page.h index 03833d5189b..84654e20b3e 100644 --- a/sys/vm/vm_page.h +++ b/sys/vm/vm_page.h @@ -541,7 +541,6 @@ malloc2vm_flags(int malloc_flags) void vm_page_busy_downgrade(vm_page_t m); void vm_page_busy_sleep(vm_page_t m, const char *msg, bool nonshared); -void vm_page_flash(vm_page_t m); void vm_page_free(vm_page_t m); void vm_page_free_zero(vm_page_t m); @@ -604,6 +603,7 @@ vm_page_t vm_page_scan_contig(u_long npages, vm_page_t m_start, vm_page_t m_end, u_long alignment, vm_paddr_t boundary, int options); void vm_page_set_valid_range(vm_page_t m, int base, int size); int vm_page_sleep_if_busy(vm_page_t m, const char *msg); +int vm_page_sleep_if_xbusy(vm_page_t m, const char *msg); vm_offset_t vm_page_startup(vm_offset_t vaddr); void vm_page_sunbusy(vm_page_t m); void vm_page_swapqueue(vm_page_t m, uint8_t oldq, uint8_t newq); @@ -618,7 +618,6 @@ void vm_page_updatefake(vm_page_t m, vm_paddr_t paddr, vm_memattr_t memattr); void vm_page_wire(vm_page_t); bool vm_page_wire_mapped(vm_page_t m); void vm_page_xunbusy_hard(vm_page_t m); -void vm_page_xunbusy_maybelocked(vm_page_t m); void vm_page_set_validclean (vm_page_t, int, int); void vm_page_clear_dirty (vm_page_t, int, int); void vm_page_set_invalid (vm_page_t, int, int);