From 1bc93bb7b9bd530143ef573114c3d115ea6d81e4 Mon Sep 17 00:00:00 2001 From: Konstantin Belousov Date: Wed, 27 May 2015 09:20:42 +0000 Subject: [PATCH] Currently, softupdate code detects overstepping on the workitems limits in the code which is deep in the call stack, and owns several critical system resources, like vnode locks. Attempt to wait while the per-mount softupdate thread cleans up the backlog may deadlock, because the thread might need to lock the same vnode which is owned by the waiting thread. Instead of synchronously waiting for the worker, perform the worker' tickle and pause until the backlog is cleaned, at the safe point during return from kernel to usermode. A new ast request to call softdep_ast_cleanup() is created, the SU code now only checks the size of queue and schedules ast. There is no ast delivery for the kernel threads, so they are exempted from the mechanism, except NFS daemon threads. NFS server loop explicitely checks for the request, and informs the schedule_cleanup() that it is capable of handling the requests by the process P2_AST_SU flag. This is needed because nfsd may be the sole cause of the SU workqueue overflow. But, to not cause nsfd to spawn additional threads just because we slow down existing workers, only tickle su threads, without waiting for the backlog cleanup. Reviewed by: jhb, mckusick Tested by: pho Sponsored by: The FreeBSD Foundation MFC after: 2 weeks --- sys/fs/nfsserver/nfs_nfsdkrpc.c | 10 +++ sys/kern/kern_exit.c | 6 ++ sys/kern/subr_trap.c | 7 ++ sys/sys/proc.h | 2 + sys/sys/systm.h | 2 + sys/ufs/ffs/ffs_softdep.c | 118 ++++++++++++++++++++++++++++---- 6 files changed, 131 insertions(+), 14 deletions(-) diff --git a/sys/fs/nfsserver/nfs_nfsdkrpc.c b/sys/fs/nfsserver/nfs_nfsdkrpc.c index f9b8eb8e92e..0f60686703f 100644 --- a/sys/fs/nfsserver/nfs_nfsdkrpc.c +++ b/sys/fs/nfsserver/nfs_nfsdkrpc.c @@ -294,6 +294,8 @@ nfssvc_program(struct svc_req *rqst, SVCXPRT *xprt) svc_freereq(rqst); out: + if (softdep_ast_cleanup != NULL) + softdep_ast_cleanup(); NFSEXITCODE(0); } @@ -464,6 +466,7 @@ int nfsrvd_nfsd(struct thread *td, struct nfsd_nfsd_args *args) { char principal[MAXHOSTNAMELEN + 5]; + struct proc *p; int error = 0; bool_t ret2, ret3, ret4; @@ -481,6 +484,10 @@ nfsrvd_nfsd(struct thread *td, struct nfsd_nfsd_args *args) */ NFSD_LOCK(); if (newnfs_numnfsd == 0) { + p = td->td_proc; + PROC_LOCK(p); + p->p_flag2 |= P2_AST_SU; + PROC_UNLOCK(p); newnfs_numnfsd++; NFSD_UNLOCK(); @@ -512,6 +519,9 @@ nfsrvd_nfsd(struct thread *td, struct nfsd_nfsd_args *args) NFSD_LOCK(); newnfs_numnfsd--; nfsrvd_init(1); + PROC_LOCK(p); + p->p_flag2 &= ~P2_AST_SU; + PROC_UNLOCK(p); } NFSD_UNLOCK(); diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c index caf31d7018a..a2742c2cb8e 100644 --- a/sys/kern/kern_exit.c +++ b/sys/kern/kern_exit.c @@ -204,6 +204,12 @@ exit1(struct thread *td, int rv) panic("Going nowhere without my init!"); } + /* + * Deref SU mp, since the thread does not return to userspace. + */ + if (softdep_ast_cleanup != NULL) + softdep_ast_cleanup(); + /* * MUST abort all other threads before proceeding past here. */ diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c index 93f7557aef6..82387c2d2d1 100644 --- a/sys/kern/subr_trap.c +++ b/sys/kern/subr_trap.c @@ -86,6 +86,8 @@ __FBSDID("$FreeBSD$"); #include +void (*softdep_ast_cleanup)(void); + /* * Define the code needed before returning to user mode, for trap and * syscall. @@ -114,6 +116,9 @@ userret(struct thread *td, struct trapframe *frame) #ifdef KTRACE KTRUSERRET(td); #endif + if (softdep_ast_cleanup != NULL) + softdep_ast_cleanup(); + /* * If this thread tickled GEOM, we need to wait for the giggling to * stop before we return to userland @@ -157,6 +162,8 @@ userret(struct thread *td, struct trapframe *frame) ("userret: Returning while holding vnode reservation")); KASSERT((td->td_flags & TDF_SBDRY) == 0, ("userret: Returning with stop signals deferred")); + KASSERT(td->td_su == NULL, + ("userret: Returning with SU cleanup request not handled")); #ifdef VIMAGE /* Unfortunately td_vnet_lpush needs VNET_DEBUG. */ VNET_ASSERT(curvnet == NULL, diff --git a/sys/sys/proc.h b/sys/sys/proc.h index b44d8f1ca7c..018b595fa9c 100644 --- a/sys/sys/proc.h +++ b/sys/sys/proc.h @@ -277,6 +277,7 @@ struct thread { u_int td_vp_reserv; /* (k) Count of reserved vnodes. */ int td_no_sleeping; /* (k) Sleeping disabled count. */ int td_dom_rr_idx; /* (k) RR Numa domain selection. */ + void *td_su; /* (k) FFS SU private */ #define td_endzero td_sigmask /* Copied during fork1() or create_thread(). */ @@ -677,6 +678,7 @@ struct proc { #define P2_INHERIT_PROTECTED 0x00000001 /* New children get P_PROTECTED. */ #define P2_NOTRACE 0x00000002 /* No ptrace(2) attach or coredumps. */ #define P2_NOTRACE_EXEC 0x00000004 /* Keep P2_NOPTRACE on exec(2). */ +#define P2_AST_SU 0x00000008 /* Handles SU ast for kthreads. */ /* Flags protected by proctree_lock, kept in p_treeflags. */ #define P_TREE_ORPHANED 0x00000001 /* Reparented, on orphan list */ diff --git a/sys/sys/systm.h b/sys/sys/systm.h index acea565c0ab..cb05a00008e 100644 --- a/sys/sys/systm.h +++ b/sys/sys/systm.h @@ -431,4 +431,6 @@ void free_unr(struct unrhdr *uh, u_int item); void intr_prof_stack_use(struct thread *td, struct trapframe *frame); +extern void (*softdep_ast_cleanup)(void); + #endif /* !_SYS_SYSTM_H_ */ diff --git a/sys/ufs/ffs/ffs_softdep.c b/sys/ufs/ffs/ffs_softdep.c index ab2bd413820..dc5d98ef9dc 100644 --- a/sys/ufs/ffs/ffs_softdep.c +++ b/sys/ufs/ffs/ffs_softdep.c @@ -900,6 +900,8 @@ static int pagedep_find(struct pagedep_hashhead *, ino_t, ufs_lbn_t, struct pagedep **); static void pause_timer(void *); static int request_cleanup(struct mount *, int); +static void schedule_cleanup(struct mount *); +static void softdep_ast_cleanup_proc(void); static int process_worklist_item(struct mount *, int, int); static void process_removes(struct vnode *); static void process_truncates(struct vnode *); @@ -921,6 +923,8 @@ static int journal_unsuspend(struct ufsmount *ump); static void softdep_prelink(struct vnode *, struct vnode *); static void add_to_journal(struct worklist *); static void remove_from_journal(struct worklist *); +static bool softdep_excess_inodes(struct ufsmount *); +static bool softdep_excess_dirrem(struct ufsmount *); static void softdep_process_journal(struct mount *, struct worklist *, int); static struct jremref *newjremref(struct dirrem *, struct inode *, struct inode *ip, off_t, nlink_t); @@ -2209,12 +2213,10 @@ inodedep_lookup(mp, inum, flags, inodedeppp) * responsible for more than our share of that usage and * we are not in a rush, request some inodedep cleanup. */ - while (dep_current[D_INODEDEP] > max_softdeps && - (flags & NODELAY) == 0 && - ump->softdep_curdeps[D_INODEDEP] > - max_softdeps / stat_flush_threads) - request_cleanup(mp, FLUSH_INODES); - FREE_LOCK(ump); + if (softdep_excess_inodes(ump)) + schedule_cleanup(mp); + else + FREE_LOCK(ump); inodedep = malloc(sizeof(struct inodedep), M_INODEDEP, M_SOFTDEP_FLAGS); workitem_alloc(&inodedep->id_list, D_INODEDEP, mp); @@ -2412,6 +2414,7 @@ softdep_initialize() bioops.io_complete = softdep_disk_write_complete; bioops.io_deallocate = softdep_deallocate_dependencies; bioops.io_countdeps = softdep_count_dependencies; + softdep_ast_cleanup = softdep_ast_cleanup_proc; /* Initialize the callout with an mtx. */ callout_init_mtx(&softdep_callout, &lk, 0); @@ -2430,6 +2433,7 @@ softdep_uninitialize() bioops.io_complete = NULL; bioops.io_deallocate = NULL; bioops.io_countdeps = NULL; + softdep_ast_cleanup = NULL; callout_drain(&softdep_callout); } @@ -9126,13 +9130,12 @@ newdirrem(bp, dp, ip, isrmdir, prevdirremp) * the number of freefile and freeblks structures. */ ACQUIRE_LOCK(ip->i_ump); - while (!IS_SNAPSHOT(ip) && dep_current[D_DIRREM] > max_softdeps / 2 && - ip->i_ump->softdep_curdeps[D_DIRREM] > - (max_softdeps / 2) / stat_flush_threads) - (void) request_cleanup(ITOV(dp)->v_mount, FLUSH_BLOCKS); - FREE_LOCK(ip->i_ump); - dirrem = malloc(sizeof(struct dirrem), - M_DIRREM, M_SOFTDEP_FLAGS|M_ZERO); + if (!IS_SNAPSHOT(ip) && softdep_excess_dirrem(ip->i_ump)) + schedule_cleanup(ITOV(dp)->v_mount); + else + FREE_LOCK(ip->i_ump); + dirrem = malloc(sizeof(struct dirrem), M_DIRREM, M_SOFTDEP_FLAGS | + M_ZERO); workitem_alloc(&dirrem->dm_list, D_DIRREM, dvp->v_mount); LIST_INIT(&dirrem->dm_jremrefhd); LIST_INIT(&dirrem->dm_jwork); @@ -13269,6 +13272,92 @@ retry: return (1); } +static bool +softdep_excess_inodes(struct ufsmount *ump) +{ + + return (dep_current[D_INODEDEP] > max_softdeps && + ump->softdep_curdeps[D_INODEDEP] > max_softdeps / + stat_flush_threads); +} + +static bool +softdep_excess_dirrem(struct ufsmount *ump) +{ + + return (dep_current[D_DIRREM] > max_softdeps / 2 && + ump->softdep_curdeps[D_DIRREM] > (max_softdeps / 2) / + stat_flush_threads); +} + +static void +schedule_cleanup(struct mount *mp) +{ + struct ufsmount *ump; + struct thread *td; + + ump = VFSTOUFS(mp); + LOCK_OWNED(ump); + FREE_LOCK(ump); + td = curthread; + if ((td->td_pflags & TDP_KTHREAD) != 0 && + (td->td_proc->p_flag2 & P2_AST_SU) == 0) { + /* + * No ast is delivered to kernel threads, so nobody + * would deref the mp. Some kernel threads + * explicitely check for AST, e.g. NFS daemon does + * this in the serving loop. + */ + return; + } + if (td->td_su != NULL) + vfs_rel(td->td_su); + vfs_ref(mp); + td->td_su = mp; + thread_lock(td); + td->td_flags |= TDF_ASTPENDING; + thread_unlock(td); +} + +static void +softdep_ast_cleanup_proc(void) +{ + struct thread *td; + struct mount *mp; + struct ufsmount *ump; + int error; + bool req; + + td = curthread; + mp = td->td_su; + if (mp == NULL) + return; + td->td_su = NULL; + error = vfs_busy(mp, MBF_NOWAIT); + vfs_rel(mp); + if (error != 0) + return; + if (ffs_own_mount(mp) && MOUNTEDSOFTDEP(mp)) { + ump = VFSTOUFS(mp); + for (;;) { + req = false; + ACQUIRE_LOCK(ump); + if (softdep_excess_inodes(ump)) { + req = true; + request_cleanup(mp, FLUSH_INODES); + } + if (softdep_excess_dirrem(ump)) { + req = true; + request_cleanup(mp, FLUSH_BLOCKS); + } + FREE_LOCK(ump); + if ((td->td_pflags & TDP_KTHREAD) != 0 || !req) + break; + } + } + vfs_unbusy(mp); +} + /* * If memory utilization has gotten too high, deliberately slow things * down and speed up the I/O processing. @@ -13355,7 +13444,8 @@ request_cleanup(mp, resource) callout_reset(&softdep_callout, tickdelay > 2 ? tickdelay : 2, pause_timer, 0); - msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0); + if ((td->td_pflags & TDP_KTHREAD) == 0) + msleep((caddr_t)&proc_waiting, &lk, PPAUSE, "softupdate", 0); proc_waiting -= 1; FREE_GBLLOCK(&lk); ACQUIRE_LOCK(ump);