From 441eb16a953ac93d356e0cda28b45cb66ad70bcc Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Fri, 13 Nov 2020 09:42:32 +0000 Subject: [PATCH] Allow some VOPs to return ERELOOKUP to indicate VFS operation restart at top level. Restart syscalls and some sync operations when filesystem indicated ERELOOKUP condition, mostly for VOPs operating on metdata. In particular, lookup results cached in the inode/v_data is no longer valid and needs recalculating. Right now this should be nop. Assert that ERELOOKUP is catched everywhere and not returned to userspace, by asserting that td_errno != ERELOOKUP on syscall return path. In collaboration with: pho Reviewed by: mckusick (previous version), markj Tested by: markj (syzkaller), pho Sponsored by: The FreeBSD Foundation Differential revision: https://reviews.freebsd.org/D26136 --- sys/kern/subr_syscall.c | 2 ++ sys/kern/uipc_usrreq.c | 2 ++ sys/kern/vfs_subr.c | 9 +++++++-- sys/kern/vfs_syscalls.c | 33 ++++++++++++++++++++++++++++----- 4 files changed, 39 insertions(+), 7 deletions(-) diff --git a/sys/kern/subr_syscall.c b/sys/kern/subr_syscall.c index a37f3dba3d3..85a0814a212 100644 --- a/sys/kern/subr_syscall.c +++ b/sys/kern/subr_syscall.c @@ -217,6 +217,8 @@ syscallret(struct thread *td) KASSERT((td->td_pflags & TDP_FORKING) == 0, ("fork() did not clear TDP_FORKING upon completion")); + KASSERT(td->td_errno != ERELOOKUP, + ("ERELOOKUP not consumed syscall %d", td->td_sa.code)); p = td->td_proc; sa = &td->td_sa; diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c index b8a49480c18..8716161599e 100644 --- a/sys/kern/uipc_usrreq.c +++ b/sys/kern/uipc_usrreq.c @@ -671,6 +671,8 @@ restart: vput(nd.ni_dvp); if (error) { vn_finished_write(mp); + if (error == ERELOOKUP) + goto restart; goto error; } vp = nd.ni_vp; diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index 93f5c497992..09256ce3674 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -1937,7 +1937,10 @@ bufobj_invalbuf(struct bufobj *bo, int flags, int slpflag, int slptimeo) } if (bo->bo_dirty.bv_cnt > 0) { BO_UNLOCK(bo); - if ((error = BO_SYNC(bo, MNT_WAIT)) != 0) + do { + error = BO_SYNC(bo, MNT_WAIT); + } while (error == ERELOOKUP); + if (error != 0) return (error); /* * XXX We could save a lock/unlock if this was only @@ -3678,7 +3681,9 @@ loop: vm_object_page_clean(vp->v_object, 0, 0, 0); VM_OBJECT_WUNLOCK(vp->v_object); } - error = VOP_FSYNC(vp, MNT_WAIT, td); + do { + error = VOP_FSYNC(vp, MNT_WAIT, td); + } while (error == ERELOOKUP); if (error != 0) { VOP_UNLOCK(vp); vdrop(vp); diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index fb15a2a69a0..b3c252647cf 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -1384,6 +1384,8 @@ restart: NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); + if (error == ERELOOKUP) + goto restart; return (error); } @@ -1470,6 +1472,8 @@ out: vput(nd.ni_dvp); vn_finished_write(mp); NDFREE(&nd, NDF_ONLY_PNBUF); + if (error == ERELOOKUP) + goto restart; return (error); } @@ -1568,7 +1572,7 @@ kern_linkat(struct thread *td, int fd1, int fd2, const char *path1, return (error); NDFREE(&nd, NDF_ONLY_PNBUF); error = kern_linkat_vp(td, nd.ni_vp, fd2, path2, segflag); - } while (error == EAGAIN); + } while (error == EAGAIN || error == ERELOOKUP); return (error); } @@ -1741,6 +1745,8 @@ out2: NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); + if (error == ERELOOKUP) + goto restart; out: if (segflg != UIO_SYSSPACE) uma_zfree(namei_zone, tmppath); @@ -1791,6 +1797,8 @@ restart: NDFREE(&nd, NDF_ONLY_PNBUF); vput(nd.ni_dvp); vn_finished_write(mp); + if (error == ERELOOKUP) + goto restart; return (error); } @@ -1937,6 +1945,8 @@ out: vrele(vp); else vput(vp); + if (error == ERELOOKUP) + goto restart; fdout: if (fp != NULL) fdrop(fp, td); @@ -3395,7 +3405,8 @@ kern_truncate(struct thread *td, const char *path, enum uio_seg pathseg, int error; if (length < 0) - return(EINVAL); + return (EINVAL); +retry: NDINIT(&nd, LOOKUP, FOLLOW | AUDITVNODE1, pathseg, path, td); if ((error = namei(&nd)) != 0) return (error); @@ -3424,6 +3435,8 @@ kern_truncate(struct thread *td, const char *path, enum uio_seg pathseg, vn_finished_write(mp); vn_rangelock_unlock(vp, rl_cookie); vrele(vp); + if (error == ERELOOKUP) + goto retry; return (error); } @@ -3479,6 +3492,7 @@ kern_fsync(struct thread *td, int fd, bool fullsync) if (!fullsync) /* XXXKIB: compete outstanding aio writes */; #endif +retry: error = vn_start_write(vp, &mp, V_WAIT | PCATCH); if (error != 0) goto drop; @@ -3498,6 +3512,8 @@ kern_fsync(struct thread *td, int fd, bool fullsync) error = fullsync ? VOP_FSYNC(vp, MNT_WAIT, td) : VOP_FDATASYNC(vp, td); VOP_UNLOCK(vp); vn_finished_write(mp); + if (error == ERELOOKUP) + goto retry; drop: fdrop(fp, td); return (error); @@ -3679,7 +3695,7 @@ again: * are links to the same vnode), then there is nothing to do. */ if (fvp == tvp) - error = -1; + error = ERESTART; #ifdef MAC else error = mac_vnode_check_rename_to(td->td_ucred, tdvp, @@ -3708,8 +3724,10 @@ out: out1: if (fromnd.ni_startdir) vrele(fromnd.ni_startdir); - if (error == -1) + if (error == ERESTART) return (0); + if (error == ERELOOKUP) + goto again; return (error); } @@ -3803,6 +3821,8 @@ out: if (error == 0) vput(nd.ni_vp); vn_finished_write(mp); + if (error == ERELOOKUP) + goto restart; return (error); } @@ -3903,6 +3923,8 @@ out: vrele(nd.ni_dvp); else vput(nd.ni_dvp); + if (error == ERELOOKUP) + goto restart; fdout: if (fp != NULL) fdrop(fp, td); @@ -4416,7 +4438,8 @@ kern_fhlinkat(struct thread *td, int fd, const char *path, if (error != 0) return (error); VOP_UNLOCK(vp); - } while ((error = kern_linkat_vp(td, vp, fd, path, pathseg)) == EAGAIN); + error = kern_linkat_vp(td, vp, fd, path, pathseg); + } while (error == EAGAIN || error == ERELOOKUP); return (error); }