Un-staticize runningbufwakeup() and staticize updateproc.

Add a new private thread flag to indicate that the thread should
not sleep if runningbufspace is too large.

Set this flag on the bufdaemon and syncer threads so that they skip
the waitrunningbufspace() call in bufwrite() rather than
checking the proc pointer vs. the known proc pointers for these two
threads.  A way of preventing these threads from being starved for
I/O but still placing limits on their outstanding I/O would be
desirable.

Set this flag in ffs_copyonwrite() to prevent bufwrite() calls from
blocking on the runningbufspace check while holding snaplk.  This
prevents snaplk from being held for an arbitrarily long period of
time if runningbufspace is high and greatly reduces the contention
for snaplk.  The disadvantage is that ffs_copyonwrite() can start
a large amount of I/O if there are a large number of snapshots,
which could cause a deadlock in other parts of the code.

Call runningbufwakeup() in ffs_copyonwrite() to decrement runningbufspace
before attempting to grab snaplk so that I/O requests waiting on
snaplk are not counted in runningbufspace as being in-progress.
Increment runningbufspace again before actually launching the
original I/O request.

Prior to the above two changes, the system could deadlock if enough
I/O requests were blocked by snaplk to prevent runningbufspace from
falling below lorunningspace and one of the bawrite() calls in
ffs_copyonwrite() blocked in waitrunningbufspace() while holding
snaplk.

See <http://www.holm.cc/stress/log/cons143.html>
This commit is contained in:
Don Lewis 2005-09-30 01:30:01 +00:00
parent 9e241c5ef2
commit 6c8b634f1d
5 changed files with 23 additions and 9 deletions

View file

@ -318,7 +318,7 @@ bufspacewakeup(void)
* runningbufwakeup() - in-progress I/O accounting.
*
*/
static __inline void
void
runningbufwakeup(struct buf *bp)
{
@ -847,8 +847,7 @@ bufwrite(struct buf *bp)
* or syncer daemon trying to clean up as that can lead
* to deadlock.
*/
if (curthread->td_proc != bufdaemonproc &&
curthread->td_proc != updateproc)
if ((curthread->td_pflags & TDP_NORUNNINGBUF) == 0)
waitrunningbufspace();
}
@ -1964,6 +1963,7 @@ buf_daemon()
/*
* This process is allowed to take the buffer cache to the limit
*/
curthread->td_pflags |= TDP_NORUNNINGBUF;
mtx_lock(&bdlock);
for (;;) {
bd_request = 0;

View file

@ -1524,7 +1524,7 @@ sysctl_vfs_worklist_len(SYSCTL_HANDLER_ARGS)
SYSCTL_PROC(_vfs, OID_AUTO, worklist_len, CTLTYPE_INT | CTLFLAG_RD, NULL, 0,
sysctl_vfs_worklist_len, "I", "Syncer thread worklist length");
struct proc *updateproc;
static struct proc *updateproc;
static void sched_sync(void);
static struct kproc_desc up_kp = {
"syncer",
@ -1601,6 +1601,7 @@ sched_sync(void)
first_printf = 1;
syncer_state = SYNCER_RUNNING;
starttime = time_uptime;
td->td_pflags |= TDP_NORUNNINGBUF;
EVENTHANDLER_REGISTER(shutdown_pre_sync, syncer_shutdown, td->td_proc,
SHUTDOWN_PRI_LAST);

View file

@ -477,6 +477,7 @@ extern int nswbuf; /* Number of swap I/O buffer headers. */
extern int cluster_pbuf_freecnt; /* Number of pbufs for clusters */
extern int vnode_pbuf_freecnt; /* Number of pbufs for vnode pager */
void runningbufwakeup(struct buf *);
caddr_t kern_vfs_bio_buffer_alloc(caddr_t v, long physmem_est);
void bufinit(void);
void bwillwrite(void);

View file

@ -378,6 +378,7 @@ struct thread {
#define TDP_SCHED4 0x00008000 /* Reserved for scheduler private use */
#define TDP_GEOM 0x00010000 /* Settle GEOM before finishing syscall */
#define TDP_SOFTDEP 0x00020000 /* Stuck processing softdep worklist */
#define TDP_NORUNNINGBUF 0x00040000 /* Ignore runningbufspace check */
/*
* Reasons that the current thread can not be run yet.
@ -833,7 +834,6 @@ TAILQ_HEAD(threadqueue, thread);
extern struct proclist allproc; /* List of all processes. */
extern struct proclist zombproc; /* List of zombie processes. */
extern struct proc *initproc, *pageproc; /* Process slots for init, pager. */
extern struct proc *updateproc; /* Process slot for syncer (sic). */
extern struct uma_zone *proc_zone;

View file

@ -1997,6 +1997,12 @@ ffs_copyonwrite(devvp, bp)
VI_UNLOCK(devvp);
return (0);
}
/*
* Since I/O on bp isn't yet in progress and it may be blocked
* for a long time waiting on snaplk, back it out of
* runningbufspace, possibly waking other threads waiting for space.
*/
runningbufwakeup(bp);
/*
* Not in the precomputed list, so check the snapshots.
*/
@ -2028,7 +2034,7 @@ retry:
goto retry;
}
snapshot_locked = 1;
td->td_pflags |= TDP_COWINPROGRESS;
td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF;
error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
fs->fs_bsize, KERNCRED, BA_METAONLY, &ibp);
td->td_pflags &= ~TDP_COWINPROGRESS;
@ -2065,7 +2071,7 @@ retry:
goto retry;
}
snapshot_locked = 1;
td->td_pflags |= TDP_COWINPROGRESS;
td->td_pflags |= TDP_COWINPROGRESS | TDP_NORUNNINGBUF;
error = UFS_BALLOC(vp, lblktosize(fs, (off_t)lbn),
fs->fs_bsize, KERNCRED, 0, &cbp);
td->td_pflags &= ~TDP_COWINPROGRESS;
@ -2120,10 +2126,16 @@ retry:
if (dopersistence && VTOI(vp)->i_effnlink > 0)
(void) ffs_syncvnode(vp, MNT_WAIT);
}
if (snapshot_locked)
if (snapshot_locked) {
lockmgr(vp->v_vnlock, LK_RELEASE, NULL, td);
else
td->td_pflags &= ~TDP_NORUNNINGBUF;
} else
VI_UNLOCK(devvp);
/*
* I/O on bp will now be started, so count it in runningbufspace.
*/
if (bp->b_runningbufspace)
atomic_add_int(&runningbufspace, bp->b_runningbufspace);
return (error);
}