From b0b9dee5c9e8c74516fa4c8ca246aa89782ec4fd Mon Sep 17 00:00:00 2001
From: Attilio Rao <attilio@FreeBSD.org>
Date: Sat, 23 Jan 2010 15:54:21 +0000
Subject: [PATCH] - Fix a race in sched_switch() of sched_4bsd.   In the case
 of the thread being on a sleepqueue or a turnstile, the   sched_lock was
 acquired (without the aid of the td_lock interface) and   the td_lock was
 dropped. This was going to break locking rules on other   threads willing to
 access to the thread (via the td_lock interface) and   modify his flags
 (allowed as long as the container lock was different   by the one used in
 sched_switch).   In order to prevent this situation, while sched_lock is
 acquired there   the td_lock gets blocked. [0] - Merge the ULE's internal
 function thread_block_switch() into the global   thread_lock_block() and make
 the former semantic as the default for   thread_lock_block(). This means that
 thread_lock_block() will not   disable interrupts when called (and
 consequently thread_unlock_block()   will not re-enabled them when called).
 This should be done manually   when necessary.   Note, however, that ULE's
 thread_unblock_switch() is not reaped   because it does reflect a difference
 in semantic due in ULE (the   td_lock may not be necessarilly still
 blocked_lock when calling this).   While asymmetric, it does describe a
 remarkable difference in semantic   that is good to keep in mind.

[0] Reported by:	Kohji Okuno
			<okuno dot kohji at jp dot panasonic dot com>
Tested by:		Giovanni Trematerra
			<giovanni dot trematerra at gmail dot com>
MFC:			2 weeks
---
 sys/kern/kern_mutex.c |  2 --
 sys/kern/sched_4bsd.c |  7 +++++--
 sys/kern/sched_ule.c  | 27 ++++++---------------------
 3 files changed, 11 insertions(+), 25 deletions(-)
diff --git a/sys/kern/kern_mutex.c b/sys/kern/kern_mutex.c
index 85ca646370d..f9c3377ce88 100644
--- a/sys/kern/kern_mutex.c
+++ b/sys/kern/kern_mutex.c
@@ -616,7 +616,6 @@ thread_lock_block(struct thread *td)
 {
 	struct mtx *lock;
 
-	spinlock_enter();
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 	lock = td->td_lock;
 	td->td_lock = &blocked_lock;
@@ -631,7 +630,6 @@ thread_lock_unblock(struct thread *td, struct mtx *new)
 	mtx_assert(new, MA_OWNED);
 	MPASS(td->td_lock == &blocked_lock);
 	atomic_store_rel_ptr((volatile void *)&td->td_lock, (uintptr_t)new);
-	spinlock_exit();
 }
 
 void
diff --git a/sys/kern/sched_4bsd.c b/sys/kern/sched_4bsd.c
index 99ea7b80238..bcec5b9205d 100644
--- a/sys/kern/sched_4bsd.c
+++ b/sys/kern/sched_4bsd.c
@@ -920,9 +920,11 @@ sched_sleep(struct thread *td, int pri)
 void
 sched_switch(struct thread *td, struct thread *newtd, int flags)
 {
+	struct mtx *tmtx;
 	struct td_sched *ts;
 	struct proc *p;
 
+	tmtx = NULL;
 	ts = td->td_sched;
 	p = td->td_proc;
 
@@ -931,10 +933,11 @@ sched_switch(struct thread *td, struct thread *newtd, int flags)
 	/* 
 	 * Switch to the sched lock to fix things up and pick
 	 * a new thread.
+	 * Block the td_lock in order to avoid breaking the critical path.
 	 */
 	if (td->td_lock != &sched_lock) {
 		mtx_lock_spin(&sched_lock);
-		thread_unlock(td);
+		tmtx = thread_lock_block(td);
 	}
 
 	if ((td->td_flags & TDF_NOLOAD) == 0)
@@ -1004,7 +1007,7 @@ sched_switch(struct thread *td, struct thread *newtd, int flags)
 			(*dtrace_vtime_switch_func)(newtd);
 #endif
 
-		cpu_switch(td, newtd, td->td_lock);
+		cpu_switch(td, newtd, tmtx != NULL ? tmtx : td->td_lock);
 		lock_profile_obtain_lock_success(&sched_lock.lock_object,
 		    0, 0, __FILE__, __LINE__);
 		/*
diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c
index 43c9a867ab0..f29dd95855a 100644
--- a/sys/kern/sched_ule.c
+++ b/sys/kern/sched_ule.c
@@ -301,7 +301,6 @@ static int sched_pickcpu(struct thread *, int);
 static void sched_balance(void);
 static int sched_balance_pair(struct tdq *, struct tdq *);
 static inline struct tdq *sched_setcpu(struct thread *, int, int);
-static inline struct mtx *thread_block_switch(struct thread *);
 static inline void thread_unblock_switch(struct thread *, struct mtx *);
 static struct mtx *sched_switch_migrate(struct tdq *, struct thread *, int);
 static int sysctl_kern_sched_topology_spec(SYSCTL_HANDLER_ARGS);
@@ -1106,9 +1105,11 @@ sched_setcpu(struct thread *td, int cpu, int flags)
 	 * The hard case, migration, we need to block the thread first to
 	 * prevent order reversals with other cpus locks.
 	 */
+	spinlock_enter();
 	thread_lock_block(td);
 	TDQ_LOCK(tdq);
 	thread_lock_unblock(td, TDQ_LOCKPTR(tdq));
+	spinlock_exit();
 	return (tdq);
 }
 
@@ -1714,23 +1715,6 @@ sched_unlend_user_prio(struct thread *td, u_char prio)
 	}
 }
 
-/*
- * Block a thread for switching.  Similar to thread_block() but does not
- * bump the spin count.
- */
-static inline struct mtx *
-thread_block_switch(struct thread *td)
-{
-	struct mtx *lock;
-
-	THREAD_LOCK_ASSERT(td, MA_OWNED);
-	lock = td->td_lock;
-	td->td_lock = &blocked_lock;
-	mtx_unlock_spin(lock);
-
-	return (lock);
-}
-
 /*
  * Handle migration from sched_switch().  This happens only for
  * cpu binding.
@@ -1749,7 +1733,7 @@ sched_switch_migrate(struct tdq *tdq, struct thread *td, int flags)
 	 * not holding either run-queue lock.
 	 */
 	spinlock_enter();
-	thread_block_switch(td);	/* This releases the lock on tdq. */
+	thread_lock_block(td);	/* This releases the lock on tdq. */
 
 	/*
 	 * Acquire both run-queue locks before placing the thread on the new
@@ -1769,7 +1753,8 @@ sched_switch_migrate(struct tdq *tdq, struct thread *td, int flags)
 }
 
 /*
- * Release a thread that was blocked with thread_block_switch().
+ * Variadic version of thread_lock_unblock() that does not assume td_lock
+ * is blocked.
  */
 static inline void
 thread_unblock_switch(struct thread *td, struct mtx *mtx)
@@ -1825,7 +1810,7 @@ sched_switch(struct thread *td, struct thread *newtd, int flags)
 	} else {
 		/* This thread must be going to sleep. */
 		TDQ_LOCK(tdq);
-		mtx = thread_block_switch(td);
+		mtx = thread_lock_block(td);
 		tdq_load_rem(tdq, td);
 	}
 	/*