VOP_RENAME: add mp-global lock

In the lock order, it comes before all vnode locks, but after vn_start_write().

The lock prevents parallel rename operations on the same mount point,
which should (in the near future) simplify a lot of code in VFS/fs that
otherwise needs to cope with either the changing hierarchy, or with the
lock order for vnodes due to the changed hierarchy.

On renames, the lock is taken on the lowest stacked filesystem.
Otherwise renames could still occur in parallel, by performing one of
the ops on the lower fs.

Proposed by:	mjg (long time ago)
Reviewed by:	markj, olce
Tested by:	pho
Sponsored by:	The FreeBSD Foundation
MFC after:	1 week
Differential revision:	https://reviews.freebsd.org/D50648
This commit is contained in:
Konstantin Belousov 2025-06-02 10:05:06 +03:00
parent f9cf745a50
commit ef6ea91593
4 changed files with 29 additions and 1 deletions

View file

@ -156,6 +156,7 @@ mount_init(void *mem, int size, int flags)
mtx_init(&mp->mnt_mtx, "struct mount mtx", NULL, MTX_DEF);
mtx_init(&mp->mnt_listmtx, "struct mount vlist mtx", NULL, MTX_DEF);
lockinit(&mp->mnt_explock, PVFS, "explock", 0, 0);
lockinit(&mp->mnt_renamelock, PVFS, "rename", 0, 0);
mp->mnt_pcpu = uma_zalloc_pcpu(pcpu_zone_16, M_WAITOK | M_ZERO);
mp->mnt_ref = 0;
mp->mnt_vfs_ops = 1;
@ -170,6 +171,7 @@ mount_fini(void *mem, int size)
mp = (struct mount *)mem;
uma_zfree_pcpu(pcpu_zone_16, mp->mnt_pcpu);
lockdestroy(&mp->mnt_renamelock);
lockdestroy(&mp->mnt_explock);
mtx_destroy(&mp->mnt_listmtx);
mtx_destroy(&mp->mnt_mtx);

View file

@ -5853,6 +5853,8 @@ vop_rename_pre(void *ap)
struct vop_rename_args *a = ap;
#ifdef DEBUG_VFS_LOCKS
struct mount *tmp;
if (a->a_tvp)
ASSERT_VI_UNLOCKED(a->a_tvp, "VOP_RENAME");
ASSERT_VI_UNLOCKED(a->a_tdvp, "VOP_RENAME");
@ -5870,6 +5872,11 @@ vop_rename_pre(void *ap)
if (a->a_tvp)
ASSERT_VOP_LOCKED(a->a_tvp, "vop_rename: tvp not locked");
ASSERT_VOP_LOCKED(a->a_tdvp, "vop_rename: tdvp not locked");
tmp = NULL;
VOP_GETWRITEMOUNT(a->a_tdvp, &tmp);
lockmgr_assert(&tmp->mnt_renamelock, KA_XLOCKED);
vfs_rel(tmp);
#endif
/*
* It may be tempting to add vn_seqc_write_begin/end calls here and

View file

@ -3766,7 +3766,7 @@ int
kern_renameat(struct thread *td, int oldfd, const char *old, int newfd,
const char *new, enum uio_seg pathseg)
{
struct mount *mp = NULL;
struct mount *mp, *tmp;
struct vnode *tvp, *fvp, *tdvp;
struct nameidata fromnd, tond;
uint64_t tondflags;
@ -3774,6 +3774,7 @@ kern_renameat(struct thread *td, int oldfd, const char *old, int newfd,
short irflag;
again:
tmp = mp = NULL;
bwillwrite();
#ifdef MAC
if (mac_vnode_check_rename_from_enabled()) {
@ -3809,6 +3810,7 @@ again:
tvp = tond.ni_vp;
error = vn_start_write(fvp, &mp, V_NOWAIT);
if (error != 0) {
again1:
NDFREE_PNBUF(&fromnd);
NDFREE_PNBUF(&tond);
if (tvp != NULL)
@ -3819,11 +3821,25 @@ again:
vput(tdvp);
vrele(fromnd.ni_dvp);
vrele(fvp);
if (tmp != NULL) {
lockmgr(&tmp->mnt_renamelock, LK_EXCLUSIVE, NULL);
lockmgr(&tmp->mnt_renamelock, LK_RELEASE, NULL);
vfs_rel(tmp);
tmp = NULL;
}
error = vn_start_write(NULL, &mp, V_XSLEEP | V_PCATCH);
if (error != 0)
return (error);
goto again;
}
error = VOP_GETWRITEMOUNT(tdvp, &tmp);
if (error != 0 || tmp == NULL)
goto again1;
error = lockmgr(&tmp->mnt_renamelock, LK_EXCLUSIVE | LK_NOWAIT, NULL);
if (error != 0) {
vn_finished_write(mp);
goto again1;
}
irflag = vn_irflag_read(fvp);
if (((irflag & VIRF_NAMEDATTR) != 0 && tdvp != fromnd.ni_dvp) ||
(irflag & VIRF_NAMEDDIR) != 0) {
@ -3884,6 +3900,8 @@ out:
vrele(fromnd.ni_dvp);
vrele(fvp);
}
lockmgr(&tmp->mnt_renamelock, LK_RELEASE, 0);
vfs_rel(tmp);
vn_finished_write(mp);
out1:
if (error == ERESTART)

View file

@ -267,6 +267,7 @@ struct mount {
int mnt_lazyvnodelistsize; /* (l) # of lazy vnodes */
int mnt_upper_pending; /* (i) # of pending ops on mnt_uppers */
struct lock mnt_explock; /* vfs_export walkers lock */
struct lock mnt_renamelock; /* renames and O_RESOLVE_BENEATH */
TAILQ_HEAD(, mount_upper_node) mnt_uppers; /* (i) upper mounts over us */
TAILQ_HEAD(, mount_upper_node) mnt_notify; /* (i) upper mounts for notification */
STAILQ_ENTRY(mount) mnt_taskqueue_link; /* (d) our place in deferred unmount list */