diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c
index b96338aeaf7..363ba411bfa 100644
--- a/sys/kern/sched_ule.c
+++ b/sys/kern/sched_ule.c
@@ -80,17 +80,17 @@ struct td_sched {
 	int		ts_ltick;	/* Last tick that we were running on */
 	int		ts_ftick;	/* First tick that we were running on */
 	int		ts_ticks;	/* Tick count */
+#ifdef SMP
+	int		ts_rltick;	/* Real last tick, for affinity. */
+#endif
 
 	/* originally from kg_sched */
 	int	skg_slptime;		/* Number of ticks we vol. slept */
 	int	skg_runtime;		/* Number of ticks we were running */
 };
-#define	ts_assign		ts_procq.tqe_next
 /* flags kept in ts_flags */
-#define	TSF_ASSIGNED	0x0001		/* Thread is being migrated. */
-#define	TSF_BOUND	0x0002		/* Thread can not migrate. */
-#define	TSF_XFERABLE	0x0004		/* Thread was added as transferable. */
-#define	TSF_REMOVED	0x0008		/* Thread was removed while ASSIGNED */
+#define	TSF_BOUND	0x0001		/* Thread can not migrate. */
+#define	TSF_XFERABLE	0x0002		/* Thread was added as transferable. */
 #define	TSF_DIDRUN	0x2000		/* Thread actually ran. */
 
 static struct td_sched td_sched0;
@@ -163,7 +163,6 @@ static int sched_interact = SCHED_INTERACT_THRESH;
 static int realstathz;
 static int tickincr;
 static int sched_slice;
-static int sched_rebalance = 1;
 
 /*
  * tdq - per processor runqs and statistics.
@@ -175,16 +174,18 @@ struct tdq {
 	int		tdq_idx;		/* Current insert index. */
 	int		tdq_ridx;		/* Current removal index. */
 	int		tdq_load;		/* Aggregate load. */
+	int		tdq_flags;		/* Thread queue flags */
 #ifdef SMP
 	int		tdq_transferable;
 	LIST_ENTRY(tdq)	tdq_siblings;		/* Next in tdq group. */
 	struct tdq_group *tdq_group;		/* Our processor group. */
-	volatile struct td_sched *tdq_assigned;	/* assigned by another CPU. */
 #else
 	int		tdq_sysload;		/* For loadavg, !ITHD load. */
 #endif
 };
 
+#define	TDQF_BUSY	0x0001			/* Queue is marked as busy */
+
 #ifdef SMP
 /*
  * tdq groups are groups of processors which can cheaply share threads.  When
@@ -203,13 +204,30 @@ struct tdq_group {
 	int	tdg_transferable;	/* Transferable load of this group. */
 	LIST_HEAD(, tdq) tdg_members;	/* Linked list of all members. */
 };
-#endif
+
+#define	SCHED_AFFINITY_DEFAULT	(hz / 100)
+#define	SCHED_AFFINITY(ts)	((ts)->ts_rltick > ticks - affinity)
+
+/*
+ * Run-time tunables.
+ */
+static int rebalance = 1;
+static int pick_pri = 1;
+static int affinity;
+static int tryself = 1;
+static int tryselfidle = 1;
+static int ipi_ast = 0;
+static int ipi_preempt = 1;
+static int ipi_thresh = PRI_MIN_KERN;
+static int steal_htt = 1;
+static int steal_busy = 1;
+static int busy_thresh = 4;
 
 /*
  * One thread queue per processor.
  */
-#ifdef SMP
-static cpumask_t tdq_idle;
+static volatile cpumask_t tdq_idle;
+static volatile cpumask_t tdq_busy;
 static int tdg_maxid;
 static struct tdq	tdq_cpu[MAXCPU];
 static struct tdq_group tdq_groups[MAXCPU];
@@ -248,21 +266,20 @@ static __inline void tdq_runq_rem(struct tdq *, struct td_sched *);
 void tdq_print(int cpu);
 static void runq_print(struct runq *rq);
 #ifdef SMP
-static int tdq_transfer(struct tdq *, struct td_sched *, int);
+static int tdq_pickidle(struct tdq *, struct td_sched *);
+static int tdq_pickpri(struct tdq *, struct td_sched *, int);
 static struct td_sched *runq_steal(struct runq *);
 static void sched_balance(void);
 static void sched_balance_groups(void);
 static void sched_balance_group(struct tdq_group *);
 static void sched_balance_pair(struct tdq *, struct tdq *);
-static void sched_smp_tick(void);
+static void sched_smp_tick(struct thread *);
 static void tdq_move(struct tdq *, int);
 static int tdq_idled(struct tdq *);
-static void tdq_notify(struct td_sched *, int);
-static void tdq_assign(struct tdq *);
+static void tdq_notify(struct td_sched *);
 static struct td_sched *tdq_steal(struct tdq *, int);
 
-#define	THREAD_CAN_MIGRATE(td)						\
-    ((td)->td_pinned == 0 && (td)->td_pri_class != PRI_ITHD)
+#define	THREAD_CAN_MIGRATE(td)	 ((td)->td_pinned == 0)
 #endif
 
 static void sched_setup(void *dummy);
@@ -337,6 +354,11 @@ tdq_runq_add(struct tdq *tdq, struct td_sched *ts, int flags)
 		tdq->tdq_transferable++;
 		tdq->tdq_group->tdg_transferable++;
 		ts->ts_flags |= TSF_XFERABLE;
+		if (tdq->tdq_transferable >= busy_thresh &&
+		    (tdq->tdq_flags & TDQF_BUSY) == 0) {
+			tdq->tdq_flags |= TDQF_BUSY;
+			atomic_set_int(&tdq_busy, 1 << TDQ_ID(tdq));
+		}
 	}
 #endif
 	if (ts->ts_runq == &tdq->tdq_timeshare) {
@@ -376,6 +398,11 @@ tdq_runq_rem(struct tdq *tdq, struct td_sched *ts)
 		tdq->tdq_transferable--;
 		tdq->tdq_group->tdg_transferable--;
 		ts->ts_flags &= ~TSF_XFERABLE;
+		if (tdq->tdq_transferable < busy_thresh && 
+		    (tdq->tdq_flags & TDQF_BUSY)) {
+			atomic_clear_int(&tdq_busy, 1 << TDQ_ID(tdq));
+			tdq->tdq_flags &= ~TDQF_BUSY;
+		}
 	}
 #endif
 	if (ts->ts_runq == &tdq->tdq_timeshare) {
@@ -402,7 +429,8 @@ tdq_load_add(struct tdq *tdq, struct td_sched *ts)
 	class = PRI_BASE(ts->ts_thread->td_pri_class);
 	tdq->tdq_load++;
 	CTR1(KTR_SCHED, "load: %d", tdq->tdq_load);
-	if (class != PRI_ITHD && (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
+	if (class != PRI_ITHD &&
+	    (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
 #ifdef SMP
 		tdq->tdq_group->tdg_load++;
 #else
@@ -416,7 +444,8 @@ tdq_load_rem(struct tdq *tdq, struct td_sched *ts)
 	int class;
 	mtx_assert(&sched_lock, MA_OWNED);
 	class = PRI_BASE(ts->ts_thread->td_pri_class);
-	if (class != PRI_ITHD  && (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
+	if (class != PRI_ITHD &&
+	    (ts->ts_thread->td_proc->p_flag & P_NOLOAD) == 0)
 #ifdef SMP
 		tdq->tdq_group->tdg_load--;
 #else
@@ -429,23 +458,20 @@ tdq_load_rem(struct tdq *tdq, struct td_sched *ts)
 
 #ifdef SMP
+/*
+ * Per-tick SMP work: run the periodic load balancers when they are due and
+ * record the tick at which td last ran for SCHED_AFFINITY().
+ */
 static void
-sched_smp_tick(void)
+sched_smp_tick(struct thread *td)
 {
-	struct tdq *tdq;
 
-	tdq = TDQ_SELF();
-	if (sched_rebalance) {
+	if (rebalance) {
 		if (ticks >= bal_tick)
 			sched_balance();
 		if (ticks >= gbal_tick && balance_groups)
 			sched_balance_groups();
 	}
-	/*
-	 * We could have been assigned a non real-time thread without an
-	 * IPI.
-	 */
-	if (tdq->tdq_assigned)
-		tdq_assign(tdq);	/* Potentially sets NEEDRESCHED */
+	td->td_sched->ts_rltick = ticks;
 }
 
 /*
@@ -599,10 +623,11 @@ tdq_move(struct tdq *from, int cpu)
 	}
 	if (tdq == to)
 		return;
-	ts->ts_state = TSS_THREAD;
-	tdq_runq_rem(tdq, ts);
-	tdq_load_rem(tdq, ts);
-	tdq_notify(ts, cpu);
+	sched_rem(ts->ts_thread);
+	ts->ts_cpu = cpu;
+	sched_pin_td(ts->ts_thread);
+	sched_add(ts->ts_thread, SRQ_YIELDING);
+	sched_unpin_td(ts->ts_thread);
 }
 
 static int
@@ -617,21 +642,37 @@ tdq_idled(struct tdq *tdq)
 	 * If we're in a cpu group, try and steal threads from another cpu in
 	 * the group before idling.
 	 */
-	if (tdg->tdg_cpus > 1 && tdg->tdg_transferable) {
+	if (steal_htt && tdg->tdg_cpus > 1 && tdg->tdg_transferable) {
 		LIST_FOREACH(steal, &tdg->tdg_members, tdq_siblings) {
 			if (steal == tdq || steal->tdq_transferable == 0)
 				continue;
 			ts = tdq_steal(steal, 0);
+			if (ts)
+				goto steal;
+		}
+	}
+	if (steal_busy) {
+		cpumask_t busy;
+		int cpu;
+
+		/*
+		 * Walk a snapshot of the busy mask and drop each cpu once it
+		 * has been examined.  Re-reading tdq_busy on every pass can
+		 * spin on a busy cpu that has nothing to give up.
+		 */
+		for (busy = tdq_busy; busy != 0; busy &= ~(1 << cpu)) {
+			cpu = ffs(busy) - 1;
+			steal = TDQ_CPU(cpu);
+			if (steal->tdq_transferable == 0)
+				continue;
+			ts = tdq_steal(steal, 1);
 			if (ts == NULL)
 				continue;
-			ts->ts_state = TSS_THREAD;
-			tdq_runq_rem(steal, ts);
-			tdq_load_rem(steal, ts);
-			ts->ts_cpu = PCPU_GET(cpuid);
-			sched_pin_td(ts->ts_thread);
-			sched_add(ts->ts_thread, SRQ_YIELDING);
-			sched_unpin_td(ts->ts_thread);
-			return (0);
+			CTR5(KTR_SCHED,
+			    "tdq_idled: stealing td %p(%s) pri %d from %d busy 0x%X",
+			    ts->ts_thread, ts->ts_thread->td_proc->p_comm,
+			    ts->ts_thread->td_priority, cpu, tdq_busy);
+			goto steal;
 		}
 	}
 	/*
@@ -640,79 +678,51 @@ tdq_idled(struct tdq *tdq)
 	 * back and forth between two idle cores on seperate physical CPUs.
 	 */
 	tdg->tdg_idlemask |= PCPU_GET(cpumask);
-	if (tdg->tdg_idlemask != tdg->tdg_cpumask)
-		return (1);
-	atomic_set_int(&tdq_idle, tdg->tdg_mask);
+	if (tdg->tdg_idlemask == tdg->tdg_cpumask)
+		atomic_set_int(&tdq_idle, tdg->tdg_mask);
 	return (1);
+steal:
+	sched_rem(ts->ts_thread);
+	ts->ts_cpu = PCPU_GET(cpuid);
+	sched_pin_td(ts->ts_thread);
+	sched_add(ts->ts_thread, SRQ_YIELDING);
+	sched_unpin_td(ts->ts_thread);
+
+	return (0);
 }
 
 static void
-tdq_assign(struct tdq *tdq)
+tdq_notify(struct td_sched *ts)
 {
-	struct td_sched *nts;
-	struct td_sched *ts;
-
-	do {
-		*(volatile struct td_sched **)&ts = tdq->tdq_assigned;
-	} while(!atomic_cmpset_ptr((volatile uintptr_t *)&tdq->tdq_assigned,
-		(uintptr_t)ts, (uintptr_t)NULL));
-	for (; ts != NULL; ts = nts) {
-		nts = ts->ts_assign;
-		tdq->tdq_group->tdg_load--;
-		tdq->tdq_load--;
-		ts->ts_flags &= ~TSF_ASSIGNED;
-		if (ts->ts_flags & TSF_REMOVED) {
-			ts->ts_flags &= ~TSF_REMOVED;
-			continue;
-		}
-		sched_pin_td(ts->ts_thread);
-		sched_add(ts->ts_thread, SRQ_YIELDING);
-		sched_unpin_td(ts->ts_thread);
-	}
-}
-
-static void
-tdq_notify(struct td_sched *ts, int cpu)
-{
-	struct tdq *tdq;
 	struct thread *td;
 	struct pcpu *pcpu;
-	int class;
 	int prio;
+	int cpu;
 
-	tdq = TDQ_CPU(cpu);
-	class = PRI_BASE(ts->ts_thread->td_pri_class);
-	if ((class != PRI_IDLE && class != PRI_ITHD)
-	    && (tdq_idle & tdq->tdq_group->tdg_mask)) 
-		atomic_clear_int(&tdq_idle, tdq->tdq_group->tdg_mask);
-	tdq->tdq_group->tdg_load++;
-	tdq->tdq_load++;
-	ts->ts_cpu = cpu;
-	ts->ts_flags |= TSF_ASSIGNED;
 	prio = ts->ts_thread->td_priority;
-
-	/*
-	 * Place a thread on another cpu's queue and force a resched.
-	 */
-	do {
-		*(volatile struct td_sched **)&ts->ts_assign = tdq->tdq_assigned;
-	} while(!atomic_cmpset_ptr((volatile uintptr_t *)&tdq->tdq_assigned,
-		(uintptr_t)ts->ts_assign, (uintptr_t)ts));
-	/* Only ipi for realtime/ithd priorities */
-	if (ts->ts_thread->td_priority > PRI_MIN_KERN)
-		return;
-	/*
-	 * Without sched_lock we could lose a race where we set NEEDRESCHED
-	 * on a thread that is switched out before the IPI is delivered.  This
-	 * would lead us to miss the resched.  This will be a problem once
-	 * sched_lock is pushed down.
-	 */
+	cpu = ts->ts_cpu;
 	pcpu = pcpu_find(cpu);
 	td = pcpu->pc_curthread;
-	if (ts->ts_thread->td_priority < td->td_priority) {
+	/*
+	 * Skip the IPI when our priority is numerically above ipi_thresh,
+	 * unless the target cpu is running an idle priority thread.
+	 */
+	if (prio > ipi_thresh && td->td_priority < PRI_MIN_IDLE)
+		return;
+	/*
+	 * IPI only if our priority is at least as good as the running
+	 * thread's and the running thread is not the per cpu idle thread.
+	 * The idlethread finds new work via sched_runnable().
+	 */
+	if (td == pcpu->pc_idlethread)
+		return;
+	if (prio > td->td_priority)
+		return;
+	if (ipi_ast) {
 		td->td_flags |= TDF_NEEDRESCHED;
 		ipi_selected(1 << cpu, IPI_AST);
-	}
+	} else if (ipi_preempt)
+		ipi_selected(1 << cpu, IPI_PREEMPT);
 }
 
 static struct td_sched *
@@ -762,95 +772,134 @@ tdq_steal(struct tdq *tdq, int stealidle)
 }
 
 int
-tdq_transfer(struct tdq *tdq, struct td_sched *ts, int class)
+tdq_pickidle(struct tdq *tdq, struct td_sched *ts)
 {
-	struct tdq_group *ntdg;
 	struct tdq_group *tdg;
-	struct tdq *old;
+	int self;
 	int cpu;
-	int idx;
 
+	self = PCPU_GET(cpuid);
 	if (smp_started == 0)
-		return (0);
-	cpu = 0;
+		return (self);
 	/*
-	 * If our load exceeds a certain threshold we should attempt to
-	 * reassign this thread.  The first candidate is the cpu that
-	 * originally ran the thread.  If it is idle, assign it there, 
-	 * otherwise, pick an idle cpu.
-	 *
-	 * The threshold at which we start to reassign has a large impact
-	 * on the overall performance of the system.  Tuned too high and
-	 * some CPUs may idle.  Too low and there will be excess migration
-	 * and context switches.
+	 * If the current CPU has idled, just run it here.
 	 */
-	old = TDQ_CPU(ts->ts_cpu);
-	ntdg = old->tdq_group;
-	tdg = tdq->tdq_group;
-	if (tdq_idle) {
-		if (tdq_idle & ntdg->tdg_mask) {
-			cpu = ffs(ntdg->tdg_idlemask);
-			if (cpu) {
-				CTR2(KTR_SCHED,
-				    "tdq_transfer: %p found old cpu %X " 
-				    "in idlemask.", ts, cpu);
-				goto migrate;
-			}
-		}
-		/*
-		 * Multiple cpus could find this bit simultaneously
-		 * but the race shouldn't be terrible.
-		 */
-		cpu = ffs(tdq_idle);
-		if (cpu) {
-			CTR2(KTR_SCHED, "tdq_transfer: %p found %X " 
-			    "in idlemask.", ts, cpu);
-			goto migrate;
-		}
-	}
-	idx = 0;
-#if 0
-	if (old->tdq_load < tdq->tdq_load) {
-		cpu = ts->ts_cpu + 1;
-		CTR2(KTR_SCHED, "tdq_transfer: %p old cpu %X " 
-		    "load less than ours.", ts, cpu);
-		goto migrate;
-	}
+	if ((tdq->tdq_group->tdg_idlemask & PCPU_GET(cpumask)) != 0)
+		return (self);
 	/*
-	 * No new CPU was found, look for one with less load.
+	 * Try the last group we ran on.
 	 */
-	for (idx = 0; idx <= tdg_maxid; idx++) {
-		ntdg = TDQ_GROUP(idx);
-		if (ntdg->tdg_load /*+ (ntdg->tdg_cpus  * 2)*/ < tdg->tdg_load) {
-			cpu = ffs(ntdg->tdg_cpumask);
-			CTR2(KTR_SCHED, "tdq_transfer: %p cpu %X load less " 
-			    "than ours.", ts, cpu);
-			goto migrate;
-		}
-	}
-#endif
+	tdg = TDQ_CPU(ts->ts_cpu)->tdq_group;
+	cpu = ffs(tdg->tdg_idlemask);
+	if (cpu)
+		return (cpu - 1);
 	/*
-	 * If another cpu in this group has idled, assign a thread over
-	 * to them after checking to see if there are idled groups.
+	 * Search for an idle group.
 	 */
-	if (tdg->tdg_idlemask) {
-		cpu = ffs(tdg->tdg_idlemask);
-		if (cpu) {
-			CTR2(KTR_SCHED, "tdq_transfer: %p cpu %X idle in " 
-			    "group.", ts, cpu);
-			goto migrate;
-		}
-	}
-	return (0);
-migrate:
+	cpu = ffs(tdq_idle);
+	if (cpu) 
+		return (cpu - 1);
 	/*
-	 * Now that we've found an idle CPU, migrate the thread.
+	 * XXX If there are no idle groups, check for an idle core.
 	 */
-	cpu--;
-	ts->ts_runq = NULL;
-	tdq_notify(ts, cpu);
+	/*
+	 * No idle CPUs?
+	 */
+	return (self);
+}
 
-	return (1);
+static int
+tdq_pickpri(struct tdq *tdq, struct td_sched *ts, int flags)
+{
+	struct pcpu *pcpu;
+	int lowpri;
+	int lowcpu;
+	int lowload;
+	int load;
+	int self;
+	int pri;
+	int cpu;
+
+	self = PCPU_GET(cpuid);
+	if (smp_started == 0)
+		return (self);
+
+	pri = ts->ts_thread->td_priority;
+	/*
+	 * Regardless of affinity, if the last cpu is idle send it there.
+	 */
+	pcpu = pcpu_find(ts->ts_cpu);
+	if (pcpu->pc_curthread->td_priority > PRI_MIN_IDLE) {
+		CTR5(KTR_SCHED,
+		    "ts_cpu %d idle, ltick %d ticks %d pri %d curthread %d",
+		    ts->ts_cpu, ts->ts_rltick, ticks, pri,
+		    pcpu->pc_curthread->td_priority);
+		return (ts->ts_cpu);
+	}
+	/*
+	 * If we have affinity, try to place it on the cpu we last ran on.
+	 */
+	if (SCHED_AFFINITY(ts) && pcpu->pc_curthread->td_priority > pri) {
+		CTR5(KTR_SCHED,
+		    "affinity for %d, ltick %d ticks %d pri %d curthread %d",
+		    ts->ts_cpu, ts->ts_rltick, ticks, pri,
+		    pcpu->pc_curthread->td_priority);
+		return (ts->ts_cpu);
+	}
+	/*
+	 * Try ourself first; If we're running something lower priority this
+	 * may have some locality with the waking thread and execute faster
+	 * here.
+	 */
+	if (tryself) {
+		/*
+		 * If we're being awoken by an interrupt thread or the waker
+		 * is going right to sleep run here as well.
+		 */
+		if ((TDQ_SELF()->tdq_load == 1) && (flags & SRQ_YIELDING ||
+		    curthread->td_pri_class == PRI_ITHD)) {
+			CTR2(KTR_SCHED, "tryself load %d flags %d",
+			    TDQ_SELF()->tdq_load, flags);
+			return (self);
+		}
+	}
+	/*
+	 * Look for an idle group.
+	 */
+	CTR1(KTR_SCHED, "tdq_idle %X", tdq_idle);
+	cpu = ffs(tdq_idle);
+	if (cpu)
+		return (cpu - 1);
+	if (tryselfidle && pri < curthread->td_priority) {
+		CTR1(KTR_SCHED, "tryself %d",
+		    curthread->td_priority);
+		return (self);
+	}
+	/*
+	 * Now search for the cpu running the lowest priority thread with
+	 * the least load.
+	 */
+	lowload = 0;
+	lowpri = lowcpu = 0;
+	for (cpu = 0; cpu <= mp_maxid; cpu++) {
+		if (CPU_ABSENT(cpu))
+			continue;
+		pcpu = pcpu_find(cpu);
+		pri = pcpu->pc_curthread->td_priority;
+		CTR4(KTR_SCHED,
+		    "cpu %d pri %d lowcpu %d lowpri %d",
+		    cpu, pri, lowcpu, lowpri);
+		if (pri < lowpri)
+			continue;
+		load = TDQ_CPU(cpu)->tdq_load;
+		if (lowpri && lowpri == pri && load > lowload)
+			continue;
+		lowpri = pri;
+		lowcpu = cpu;
+		lowload = load;
+	}
+
+	return (lowcpu);
 }
 
 #endif	/* SMP */
@@ -926,7 +975,6 @@ sched_setup(void *dummy)
 		struct tdq *tdq;
 
 		tdq = &tdq_cpu[i];
-		tdq->tdq_assigned = NULL;
 		tdq_setup(&tdq_cpu[i]);
 	}
 	if (smp_topology == NULL) {
@@ -1023,6 +1071,9 @@ sched_initticks(void *dummy)
 	 */
 	if (tickincr == 0)
 		tickincr = 1;
+#ifdef SMP
+	affinity = SCHED_AFFINITY_DEFAULT;
+#endif
 	mtx_unlock_spin(&sched_lock);
 }
 
@@ -1231,16 +1282,10 @@ sched_thread_priority(struct thread *td, u_char prio)
 		 * propagation, we may have to move ourselves to a new
 		 * queue.  This could be optimized to not re-add in some
 		 * cases.
-		 *
-		 * Hold this td_sched on this cpu so that sched_prio() doesn't
-		 * cause excessive migration.  We only want migration to
-		 * happen as the result of a wakeup.
 		 */
-		sched_pin_td(td);
 		sched_rem(td);
 		td->td_priority = prio;
 		sched_add(td, SRQ_BORROWING);
-		sched_unpin_td(td);
 	} else
 		td->td_priority = prio;
 }
@@ -1356,9 +1401,11 @@ sched_switch(struct thread *td, struct thread *newtd, int flags)
 {
 	struct tdq *tdq;
 	struct td_sched *ts;
+	int preempt;
 
 	mtx_assert(&sched_lock, MA_OWNED);
 
+	preempt = flags & SW_PREEMPT;
 	tdq = TDQ_SELF();
 	ts = td->td_sched;
 	td->td_lastcpu = td->td_oncpu;
@@ -1371,19 +1418,20 @@ sched_switch(struct thread *td, struct thread *newtd, int flags)
 	 */
 	if (td == PCPU_GET(idlethread)) {
 		TD_SET_CAN_RUN(td);
-	} else if ((ts->ts_flags & TSF_ASSIGNED) == 0) {
-		/* We are ending our run so make our slot available again */
+	} else {
 		tdq_load_rem(tdq, ts);
 		if (TD_IS_RUNNING(td)) {
 			/*
 			 * Don't allow the thread to migrate
 			 * from a preemption.
 			 */
-			sched_pin_td(td);
-			setrunqueue(td, (flags & SW_PREEMPT) ?
+			if (preempt)
+				sched_pin_td(td);
+			setrunqueue(td, preempt ?
 			    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
 			    SRQ_OURSELF|SRQ_YIELDING);
-			sched_unpin_td(td);
+			if (preempt)
+				sched_unpin_td(td);
 		}
 	}
 	if (newtd != NULL) {
@@ -1614,7 +1662,7 @@ sched_clock(struct thread *td)
 
 	mtx_assert(&sched_lock, MA_OWNED);
 #ifdef SMP
-	sched_smp_tick();
+	sched_smp_tick(td);
 #endif
 	tdq = TDQ_SELF();
 	/*
@@ -1656,9 +1704,6 @@ sched_clock(struct thread *td)
 	 * We're out of time, recompute priorities and requeue.
 	 */
 	sched_priority(td);
-	tdq_load_rem(tdq, ts);
-	ts->ts_slice = sched_slice;
-	tdq_load_add(tdq, ts);
 	td->td_flags |= TDF_NEEDRESCHED;
 }
 
@@ -1672,11 +1717,8 @@ sched_runnable(void)
 
 	tdq = TDQ_SELF();
 #ifdef SMP
-	if (tdq->tdq_assigned) {
-		mtx_lock_spin(&sched_lock);
-		tdq_assign(tdq);
-		mtx_unlock_spin(&sched_lock);
-	}
+	if (tdq_busy)
+		goto out;
 #endif
 	if ((curthread->td_flags & TDF_IDLETD) != 0) {
 		if (tdq->tdq_load > 0)
@@ -1699,8 +1741,6 @@ sched_choose(void)
 	tdq = TDQ_SELF();
 #ifdef SMP
 restart:
-	if (tdq->tdq_assigned)
-		tdq_assign(tdq);
 #endif
 	ts = tdq_choose(tdq);
 	if (ts) {
@@ -1726,8 +1766,11 @@ sched_add(struct thread *td, int flags)
 	struct tdq *tdq;
 	struct td_sched *ts;
 	int preemptive;
-	int canmigrate;
 	int class;
+#ifdef SMP
+	int cpuid;
+	int cpumask;
+#endif
 
 	CTR5(KTR_SCHED, "sched_add: %p(%s) prio %d by %p(%s)",
 	    td, td->td_proc->p_comm, td->td_priority, curthread,
@@ -1737,15 +1780,6 @@ sched_add(struct thread *td, int flags)
 	ts = td->td_sched;
 	class = PRI_BASE(td->td_pri_class);
 	preemptive = !(flags & SRQ_YIELDING);
-	canmigrate = 1;
-#ifdef SMP
-	if (ts->ts_flags & TSF_ASSIGNED) {
-		if (ts->ts_flags & TSF_REMOVED)
-			ts->ts_flags &= ~TSF_REMOVED;
-		return;
-	}
-	canmigrate = THREAD_CAN_MIGRATE(td);
-#endif
 	KASSERT(ts->ts_state != TSS_ONRUNQ,
 	    ("sched_add: thread %p (%s) already in run queue", td,
 	    td->td_proc->p_comm));
@@ -1754,42 +1788,38 @@ sched_add(struct thread *td, int flags)
 	KASSERT(ts->ts_runq == NULL,
 	    ("sched_add: thread %p is still assigned to a run queue", td));
 	/*
-	 * Set the slice and pick the run queue.
+	 * Recalculate the priority before we select the target cpu or
+	 * run-queue.
 	 */
-	if (ts->ts_slice == 0)
-		ts->ts_slice = sched_slice;
 	if (class == PRI_TIMESHARE)
 		sched_priority(td);
-	if (td->td_priority <= PRI_MAX_REALTIME) {
-		ts->ts_runq = &tdq->tdq_realtime;
-		/*
-		 * If the thread is not artificially pinned and it's in
-		 * the realtime queue we directly dispatch it on this cpu
-		 * for minimum latency.  Interrupt handlers may also have
-		 * to complete on the cpu that dispatched them.
-		 */
-		if (td->td_pinned == 0 && class == PRI_ITHD)
-			ts->ts_cpu = PCPU_GET(cpuid);
-	} else if (td->td_priority <= PRI_MAX_TIMESHARE)
-		ts->ts_runq = &tdq->tdq_timeshare;
-	else
-		ts->ts_runq = &tdq->tdq_idle;
-
 #ifdef SMP
+	cpuid = PCPU_GET(cpuid);
 	/*
-	 * If this thread is pinned or bound, notify the target cpu.
+	 * Pick the destination cpu and if it isn't ours transfer to the
+	 * target cpu.
 	 */
-	if (!canmigrate && ts->ts_cpu != PCPU_GET(cpuid) ) {
-		ts->ts_runq = NULL;
-		tdq_notify(ts, ts->ts_cpu);
-		return;
-	}
+	if (THREAD_CAN_MIGRATE(td)) {
+		if (td->td_priority <= PRI_MAX_ITHD) {
+			CTR2(KTR_SCHED, "ithd %d < %d", td->td_priority, PRI_MAX_ITHD);
+			ts->ts_cpu = cpuid;
+		}
+		if (pick_pri)
+			ts->ts_cpu = tdq_pickpri(tdq, ts, flags);
+		else
+			ts->ts_cpu = tdq_pickidle(tdq, ts);
+	} else
+		CTR1(KTR_SCHED, "pinned %d", td->td_pinned);
+	if (ts->ts_cpu != cpuid)
+		preemptive = 0;
+	tdq = TDQ_CPU(ts->ts_cpu);
+	cpumask = 1 << ts->ts_cpu;
 	/*
 	 * If we had been idle, clear our bit in the group and potentially
-	 * the global bitmap.  If not, see if we should transfer this thread.
+	 * the global bitmap.
 	 */
 	if ((class != PRI_IDLE && class != PRI_ITHD) &&
-	    (tdq->tdq_group->tdg_idlemask & PCPU_GET(cpumask)) != 0) {
+	    (tdq->tdq_group->tdg_idlemask & cpumask) != 0) {
 		/*
 		 * Check to see if our group is unidling, and if so, remove it
 		 * from the global idle mask.
@@ -1800,20 +1830,34 @@ sched_add(struct thread *td, int flags)
 		/*
 		 * Now remove ourselves from the group specific idle mask.
 		 */
-		tdq->tdq_group->tdg_idlemask &= ~PCPU_GET(cpumask);
-	} else if (canmigrate && tdq->tdq_load > 1)
-		if (tdq_transfer(tdq, ts, class))
-			return;
-	ts->ts_cpu = PCPU_GET(cpuid);
+		tdq->tdq_group->tdg_idlemask &= ~cpumask;
+	}
 #endif
-	if (td->td_priority < curthread->td_priority)
-		curthread->td_flags |= TDF_NEEDRESCHED;
+	/*
+	 * Set the slice and pick the run queue.
+	 */
+	if (ts->ts_slice == 0)
+		ts->ts_slice = sched_slice;
+	if (td->td_priority <= PRI_MAX_REALTIME)
+		ts->ts_runq = &tdq->tdq_realtime;
+	else if (td->td_priority <= PRI_MAX_TIMESHARE)
+		ts->ts_runq = &tdq->tdq_timeshare;
+	else
+		ts->ts_runq = &tdq->tdq_idle;
 	if (preemptive && maybe_preempt(td))
 		return;
 	ts->ts_state = TSS_ONRUNQ;
 
 	tdq_runq_add(tdq, ts, flags);
 	tdq_load_add(tdq, ts);
+#ifdef SMP
+	if (ts->ts_cpu != cpuid) {
+		tdq_notify(ts);
+		return;
+	}
+#endif
+	if (td->td_priority < curthread->td_priority)
+		curthread->td_flags |= TDF_NEEDRESCHED;
 }
 
 void
@@ -1827,10 +1871,6 @@ sched_rem(struct thread *td)
 	    curthread->td_proc->p_comm);
 	mtx_assert(&sched_lock, MA_OWNED);
 	ts = td->td_sched;
-	if (ts->ts_flags & TSF_ASSIGNED) {
-		ts->ts_flags |= TSF_REMOVED;
-		return;
-	}
 	KASSERT((ts->ts_state == TSS_ONRUNQ),
 	    ("sched_rem: thread not on run queue"));
 
@@ -1881,8 +1921,6 @@ sched_bind(struct thread *td, int cpu)
 		return;
 	/* sched_rem without the runq_remove */
 	ts->ts_state = TSS_THREAD;
-	tdq_load_rem(TDQ_CPU(ts->ts_cpu), ts);
-	tdq_notify(ts, cpu);
 	/* When we return from mi_switch we'll be on the correct cpu. */
 	mi_switch(SW_VOL, NULL);
 	sched_pin();
@@ -1962,7 +2000,31 @@ SYSCTL_INT(_kern_sched, OID_AUTO, slice, CTLFLAG_RW, &sched_slice, 0, "");
 SYSCTL_INT(_kern_sched, OID_AUTO, interact, CTLFLAG_RW, &sched_interact, 0, "");
 SYSCTL_INT(_kern_sched, OID_AUTO, tickincr, CTLFLAG_RD, &tickincr, 0, "");
 SYSCTL_INT(_kern_sched, OID_AUTO, realstathz, CTLFLAG_RD, &realstathz, 0, "");
-SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &sched_rebalance, 0, "");
+#ifdef SMP
+SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri, CTLFLAG_RW, &pick_pri, 0,
+    "Pick a thread's cpu with tdq_pickpri() instead of tdq_pickidle()");
+SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri_affinity, CTLFLAG_RW,
+    &affinity, 0, "Ticks a thread keeps affinity for its last cpu");
+SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri_tryself, CTLFLAG_RW,
+    &tryself, 0, "Run wakeups from yielding or interrupt threads locally");
+SYSCTL_INT(_kern_sched, OID_AUTO, pick_pri_tryselfidle, CTLFLAG_RW,
+    &tryselfidle, 0,
+    "Run here when no group is idle and the thread outranks curthread");
+SYSCTL_INT(_kern_sched, OID_AUTO, balance, CTLFLAG_RW, &rebalance, 0,
+    "Enable the periodic load balancer");
+SYSCTL_INT(_kern_sched, OID_AUTO, ipi_preempt, CTLFLAG_RW, &ipi_preempt, 0,
+    "Notify remote cpus with IPI_PREEMPT when ipi_ast is off");
+SYSCTL_INT(_kern_sched, OID_AUTO, ipi_ast, CTLFLAG_RW, &ipi_ast, 0,
+    "Notify remote cpus with IPI_AST and TDF_NEEDRESCHED");
+SYSCTL_INT(_kern_sched, OID_AUTO, ipi_thresh, CTLFLAG_RW, &ipi_thresh, 0,
+    "Numerically highest priority that still IPIs a non-idle remote cpu");
+SYSCTL_INT(_kern_sched, OID_AUTO, steal_htt, CTLFLAG_RW, &steal_htt, 0,
+    "Steal threads from other cpus in our group before idling");
+SYSCTL_INT(_kern_sched, OID_AUTO, steal_busy, CTLFLAG_RW, &steal_busy, 0,
+    "Steal threads from cpus marked busy before idling");
+SYSCTL_INT(_kern_sched, OID_AUTO, busy_thresh, CTLFLAG_RW, &busy_thresh, 0,
+    "Transferable thread count at which a cpu is marked busy");
+#endif
 
 /* ps compat */
 static fixpt_t  ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */