diff --git a/src/listener.c b/src/listener.c
index 9e8c87818..2b8c2dfab 100644
--- a/src/listener.c
+++ b/src/listener.c
@@ -816,6 +816,52 @@ void listener_accept(int fd)
 		 */
 		next_conn = 0;
 
+#if defined(USE_THREAD)
+		count = l->bind_conf->thr_count;
+		if (count > 1) {
+			struct accept_queue_ring *ring;
+			int r, t1, t2, q1, q2;
+
+			/* pick two distinct random indexes from one random() draw, dropping its lower bits */
+			r = (random() >> 8) % ((count - 1) * count);
+			t2 = r / count; // 0..thr_count-2
+			t1 = r % count; // 0..thr_count-1
+			t2 += t1 + 1;   // necessarily different from t1
+
+			if (t2 >= count)
+				t2 -= count;
+
+			t1 = bind_map_thread_id(l->bind_conf, t1);
+			t2 = bind_map_thread_id(l->bind_conf, t2);
+
+			q1 = accept_queue_rings[t1].tail - accept_queue_rings[t1].head + ACCEPT_QUEUE_SIZE;
+			if (q1 >= ACCEPT_QUEUE_SIZE)
+				q1 -= ACCEPT_QUEUE_SIZE;
+
+			q2 = accept_queue_rings[t2].tail - accept_queue_rings[t2].head + ACCEPT_QUEUE_SIZE;
+			if (q2 >= ACCEPT_QUEUE_SIZE)
+				q2 -= ACCEPT_QUEUE_SIZE;
+
+			/* make t1 the least loaded of the two candidate threads */
+			if (q1 >= ACCEPT_QUEUE_SIZE || l->thr_conn[t1] + q1 > l->thr_conn[t2] + q2)
+				t1 = t2;
+
+			/* We use deferred accepts even if it's the local thread because
+			 * tests show that it's the best performing model, likely due to
+			 * better cache locality when processing this loop.
+			 */
+			ring = &accept_queue_rings[t1];
+			if (accept_queue_push_mp(ring, cfd, l, &addr, laddr)) {
+				task_wakeup(ring->task, TASK_WOKEN_IO);
+				continue;
+			}
+			/* If the ring is full we do a synchronous accept on
+			 * the local thread here.
+			 * FIXME: we should update some stats here.
+			 */
+		}
+#endif // USE_THREAD
+
 		HA_ATOMIC_ADD(&l->thr_conn[tid], 1);
 		ret = l->accept(l, cfd, &addr);
 		if (unlikely(ret <= 0)) {