diff --git a/sys/kern/sched_ule.c b/sys/kern/sched_ule.c
index 90f294bb0d4..711119c8438 100644
--- a/sys/kern/sched_ule.c
+++ b/sys/kern/sched_ule.c
@@ -196,6 +196,7 @@ _Static_assert(sizeof(struct thread) + sizeof(struct td_sched) <=
 #define	SCHED_SLICE_MIN_DIVISOR		6	/* DEFAULT/MIN = ~16 ms. */
 
 /* Flags kept in td_flags. */
+#define	TDF_PICKCPU	TDF_SCHED0	/* Thread should pick new CPU. */
 #define	TDF_SLICEEND	TDF_SCHED2	/* Thread time slice is over. */
 
 /*
@@ -633,15 +634,16 @@ sched_random(void)
 }
 
 struct cpu_search {
-	cpuset_t *cs_mask;
-	u_int	cs_prefer;
+	cpuset_t *cs_mask;	/* The mask of allowed CPUs to choose from. */
+	int	cs_prefer;	/* Prefer this CPU and groups including it. */
+	int	cs_running;	/* The thread is now running at cs_prefer. */
 	int	cs_pri;		/* Min priority for low. */
 	int	cs_limit;	/* Max load for low, min load for high. */
 };
 
 struct cpu_search_res {
-	int	cs_cpu;
-	int	cs_load;
+	int	cs_cpu;		/* The best CPU found. */
+	int	cs_load;	/* The load of cs_cpu. */
 };
 
 /*
@@ -657,7 +659,7 @@ cpu_search_lowest(const struct cpu_group *cg, const struct cpu_search *s,
 {
 	struct cpu_search_res lr;
 	struct tdq *tdq;
-	int c, bload, l, load, total;
+	int c, bload, l, load, p, total;
 
 	total = 0;
 	bload = INT_MAX;
@@ -668,6 +670,17 @@ cpu_search_lowest(const struct cpu_group *cg, const struct cpu_search *s,
 		for (c = cg->cg_children - 1; c >= 0; c--) {
 			load = cpu_search_lowest(&cg->cg_child[c], s, &lr);
 			total += load;
+
+			/*
+			 * When balancing do not prefer SMT groups with load >1.
+			 * It allows round-robin between SMT groups with equal
+			 * load within parent group for more fair scheduling.
+			 */
+			if (__predict_false(s->cs_running) &&
+			    (cg->cg_child[c].cg_flags & CG_FLAG_THREAD) &&
+			    load >= 128 && (load & 128) != 0)
+				load += 128;
+
 			if (lr.cs_cpu >= 0 && (load < bload ||
 			    (load == bload && lr.cs_load < r->cs_load))) {
 				bload = load;
@@ -684,20 +697,40 @@ cpu_search_lowest(const struct cpu_group *cg, const struct cpu_search *s,
 			continue;
 		tdq = TDQ_CPU(c);
 		l = tdq->tdq_load;
+		if (c == s->cs_prefer) {
+			if (__predict_false(s->cs_running))
+				l--;
+			p = 128;
+		} else
+			p = 0;
 		load = l * 256;
-		if (c == s->cs_prefer)
-			load -= 128;
-		total += load;
-		if (l > s->cs_limit || tdq->tdq_lowpri <= s->cs_pri ||
+		total += load - p;
+
+		/*
+		 * Check this CPU is acceptable.
+		 * If the thread is already on the CPU, don't look at the TDQ
+		 * priority, since it can be the priority of the thread itself.
+		 */
+		if (l > s->cs_limit || (tdq->tdq_lowpri <= s->cs_pri &&
+		     (!s->cs_running || c != s->cs_prefer)) ||
 		    !CPU_ISSET(c, s->cs_mask))
 			continue;
+
+		/*
+		 * When balancing do not prefer CPUs with load > 1.
+		 * It allows round-robin between CPUs with equal load
+		 * within the CPU group for more fair scheduling.
+		 */
+		if (__predict_false(s->cs_running) && l > 0)
+			p = 0;
+
 		load -= sched_random() % 128;
-		if (load < bload) {
-			bload = load;
+		if (bload > load - p) {
+			bload = load - p;
 			r->cs_cpu = c;
+			r->cs_load = load;
 		}
 	}
-	r->cs_load = bload;
 	return (total);
 }
 
@@ -736,9 +769,17 @@ cpu_search_highest(const struct cpu_group *cg, const struct cpu_search *s,
 		l = tdq->tdq_load;
 		load = l * 256;
 		total += load;
-		if (l < s->cs_limit || !tdq->tdq_transferable ||
+
+		/*
+		 * Check this CPU is acceptable.
+		 * If the requested minimum load is 1, the caller must know how
+		 * to handle running threads, not counted in tdq_transferable.
+		 */
+		if (l < s->cs_limit || (tdq->tdq_transferable == 0 &&
+		    (s->cs_limit > 1 || l > 1)) ||
 		    !CPU_ISSET(c, s->cs_mask))
 			continue;
+
 		load -= sched_random() % 256;
 		if (load > bload) {
 			bload = load;
@@ -756,12 +797,13 @@ cpu_search_highest(const struct cpu_group *cg, const struct cpu_search *s,
  */
 static inline int
 sched_lowest(const struct cpu_group *cg, cpuset_t *mask, int pri, int maxload,
-    int prefer)
+    int prefer, int running)
 {
 	struct cpu_search s;
 	struct cpu_search_res r;
 
 	s.cs_prefer = prefer;
+	s.cs_running = running;
 	s.cs_mask = mask;
 	s.cs_pri = pri;
 	s.cs_limit = maxload;
@@ -788,12 +830,13 @@ static void
 sched_balance_group(struct cpu_group *cg)
 {
 	struct tdq *tdq;
+	struct thread *td;
 	cpuset_t hmask, lmask;
 	int high, low, anylow;
 
 	CPU_FILL(&hmask);
 	for (;;) {
-		high = sched_highest(cg, &hmask, 2);
+		high = sched_highest(cg, &hmask, 1);
 		/* Stop if there is no more CPU with transferrable threads. */
 		if (high == -1)
 			break;
@@ -802,10 +845,28 @@ sched_balance_group(struct cpu_group *cg)
 		/* Stop if there is no more CPU left for low. */
 		if (CPU_EMPTY(&lmask))
 			break;
-		anylow = 1;
 		tdq = TDQ_CPU(high);
+		if (tdq->tdq_load == 1) {
+			/*
+			 * There is only one running thread.  We can't move
+			 * it from here, so tell it to pick a new CPU by itself.
+			 */
+			TDQ_LOCK(tdq);
+			td = pcpu_find(high)->pc_curthread;
+			if ((td->td_flags & TDF_IDLETD) == 0 &&
+			    THREAD_CAN_MIGRATE(td)) {
+				td->td_flags |= TDF_NEEDRESCHED | TDF_PICKCPU;
+				if (high != curcpu)
+					ipi_cpu(high, IPI_AST);
+			}
+			TDQ_UNLOCK(tdq);
+			break;
+		}
+		anylow = 1;
 nextlow:
-		low = sched_lowest(cg, &lmask, -1, tdq->tdq_load - 1, high);
+		if (tdq->tdq_transferable == 0)
+			continue;
+		low = sched_lowest(cg, &lmask, -1, tdq->tdq_load - 1, high, 1);
 		/* Stop if we looked well and found no less loaded CPU. */
 		if (anylow && low == -1)
 			break;
@@ -1227,7 +1288,7 @@ sched_pickcpu(struct thread *td, int flags)
 	struct td_sched *ts;
 	struct tdq *tdq;
 	cpuset_t *mask;
-	int cpu, pri, self, intr;
+	int cpu, pri, r, self, intr;
 
 	self = PCPU_GET(cpuid);
 	ts = td_get_sched(td);
@@ -1305,32 +1366,33 @@ llc:
 	cpu = -1;
 	mask = &td->td_cpuset->cs_mask;
 	pri = td->td_priority;
+	r = TD_IS_RUNNING(td);
 	/*
 	 * Try hard to keep interrupts within found LLC.  Search the LLC for
 	 * the least loaded CPU we can run now.  For NUMA systems it should
 	 * be within target domain, and it also reduces scheduling overhead.
 	 */
 	if (ccg != NULL && intr) {
-		cpu = sched_lowest(ccg, mask, pri, INT_MAX, ts->ts_cpu);
+		cpu = sched_lowest(ccg, mask, pri, INT_MAX, ts->ts_cpu, r);
 		if (cpu >= 0)
 			SCHED_STAT_INC(pickcpu_intrbind);
 	} else
 	/* Search the LLC for the least loaded idle CPU we can run now. */
 	if (ccg != NULL) {
 		cpu = sched_lowest(ccg, mask, max(pri, PRI_MAX_TIMESHARE),
-		    INT_MAX, ts->ts_cpu);
+		    INT_MAX, ts->ts_cpu, r);
 		if (cpu >= 0)
 			SCHED_STAT_INC(pickcpu_affinity);
 	}
 	/* Search globally for the least loaded CPU we can run now. */
 	if (cpu < 0) {
-		cpu = sched_lowest(cpu_top, mask, pri, INT_MAX, ts->ts_cpu);
+		cpu = sched_lowest(cpu_top, mask, pri, INT_MAX, ts->ts_cpu, r);
 		if (cpu >= 0)
 			SCHED_STAT_INC(pickcpu_lowest);
 	}
 	/* Search globally for the least loaded CPU. */
 	if (cpu < 0) {
-		cpu = sched_lowest(cpu_top, mask, -1, INT_MAX, ts->ts_cpu);
+		cpu = sched_lowest(cpu_top, mask, -1, INT_MAX, ts->ts_cpu, r);
 		if (cpu >= 0)
 			SCHED_STAT_INC(pickcpu_lowest);
 	}
@@ -2056,7 +2118,7 @@ sched_switch(struct thread *td, int flags)
 	struct td_sched *ts;
 	struct mtx *mtx;
 	int srqflag;
-	int cpuid, preempted;
+	int cpuid, pickcpu, preempted;
 
 	THREAD_LOCK_ASSERT(td, MA_OWNED);
 
@@ -2064,11 +2126,15 @@ sched_switch(struct thread *td, int flags)
 	tdq = TDQ_SELF();
 	ts = td_get_sched(td);
 	sched_pctcpu_update(ts, 1);
-	ts->ts_rltick = ticks;
+	pickcpu = (td->td_flags & TDF_PICKCPU) != 0;
+	if (pickcpu)
+		ts->ts_rltick = ticks - affinity * MAX_CACHE_LEVELS;
+	else
+		ts->ts_rltick = ticks;
 	td->td_lastcpu = td->td_oncpu;
 	preempted = (td->td_flags & TDF_SLICEEND) == 0 &&
 	    (flags & SW_PREEMPT) != 0;
-	td->td_flags &= ~(TDF_NEEDRESCHED | TDF_SLICEEND);
+	td->td_flags &= ~(TDF_NEEDRESCHED | TDF_PICKCPU | TDF_SLICEEND);
 	td->td_owepreempt = 0;
 	tdq->tdq_owepreempt = 0;
 	if (!TD_IS_IDLETHREAD(td))
@@ -2088,7 +2154,8 @@ sched_switch(struct thread *td, int flags)
 		    SRQ_OURSELF|SRQ_YIELDING|SRQ_PREEMPTED :
 		    SRQ_OURSELF|SRQ_YIELDING;
 #ifdef SMP
-		if (THREAD_CAN_MIGRATE(td) && !THREAD_CAN_SCHED(td, ts->ts_cpu))
+		if (THREAD_CAN_MIGRATE(td) && (!THREAD_CAN_SCHED(td, ts->ts_cpu)
+		    || pickcpu))
 			ts->ts_cpu = sched_pickcpu(td, 0);
 #endif
 		if (ts->ts_cpu == cpuid)