diff --git a/doc/configuration.txt b/doc/configuration.txt
index 270e35d31..fd8af6bfa 100644
--- a/doc/configuration.txt
+++ b/doc/configuration.txt
@@ -1981,6 +1981,16 @@ cpu-policy
                        the limit of 32 or 64 depending on the system. This is
                        the default policy.
 
+  - group-by-cluster   if neither "nbthread" nor "nbtgroups" were set, then
+                       one thread group is created for each CPU cluster with
+                       available CPUs, each with as many threads as CPUs. All
+                       threads of a group are bound to all CPUs of the cluster
+                       so that intra-group communications remain local to the
+                       cluster without enforcing too strong a binding. The
+                       per-group thread limits and thread-group limits are
+                       respected. This is recommended on multi-socket and NUMA
+                       systems, as well as on CPUs with bad inter-CCX latencies.
+
   See also: "cpu-map", "cpu-set", "nbthread"
 
 cpu-set ...
diff --git a/src/cpu_topo.c b/src/cpu_topo.c
index edb1708e3..cd823272f 100644
--- a/src/cpu_topo.c
+++ b/src/cpu_topo.c
@@ -52,10 +52,12 @@ static int cpu_policy = 1; // "first-usable-node"
 
 /* list of CPU policies for "cpu-policy". The default one is the first one. */
 static int cpu_policy_first_usable_node(int policy, int tmin, int tmax, int gmin, int gmax, char **err);
+static int cpu_policy_group_by_cluster(int policy, int tmin, int tmax, int gmin, int gmax, char **err);
 
 static struct ha_cpu_policy ha_cpu_policy[] = {
 	{ .name = "none",              .desc = "use all available CPUs",                          .fct = NULL },
 	{ .name = "first-usable-node", .desc = "use only first usable node if nbthreads not set", .fct = cpu_policy_first_usable_node },
+	{ .name = "group-by-cluster",  .desc = "make one thread group per core cluster",          .fct = cpu_policy_group_by_cluster },
 	{ 0 } /* end */
 };
 
@@ -959,6 +961,94 @@ static int cpu_policy_first_usable_node(int policy, int tmin, int tmax, int gmin
 	return 0;
 }
 
+/* the "group-by-cluster" cpu-policy:
+ *  - does nothing if nbthread or thread-groups are set
+ *  - otherwise tries to create one thread-group per cluster, with as many
+ *    threads as CPUs in the cluster, and bind all the threads of this group
+ *    to all the CPUs of the cluster.
+ */
+static int cpu_policy_group_by_cluster(int policy, int tmin, int tmax, int gmin, int gmax, char **err)
+{
+	struct hap_cpuset node_cpu_set;
+	int cpu, cpu_start;
+	int cpu_count;
+	int cid, lcid;
+	int thr;
+
+	if (global.nbthread)
+		return 0;
+
+	if (global.nbtgroups)
+		return 0;
+
+	/* iterate over each new cluster */
+	lcid = -1;
+	cpu_start = 0;
+	while (global.nbtgroups < MAX_TGROUPS) {
+		ha_cpuset_zero(&node_cpu_set);
+		cid = -1; cpu_count = 0;
+
+		for (cpu = cpu_start; cpu <= cpu_topo_lastcpu; cpu++) {
+			/* skip disabled and already visited CPUs */
+			if (ha_cpu_topo[cpu].st & HA_CPU_F_EXCL_MASK)
+				continue;
+			if (ha_cpu_topo[cpu].cl_gid <= lcid)
+				continue;
+
+			if (cid < 0) {
+				cid = ha_cpu_topo[cpu].cl_gid;
+				cpu_start = cpu + 1;
+			}
+			else if (cid != ha_cpu_topo[cpu].cl_gid)
+				continue;
+
+			/* make a mask of all of this cluster's CPUs */
+			ha_cpuset_set(&node_cpu_set, ha_cpu_topo[cpu].idx);
+			cpu_count++;
+		}
+		/* now cid = next cluster_id or -1 if none; cpu_count is the
+		 * number of CPUs in this cluster, and cpu_start is the next
+		 * cpu to restart from to scan for new clusters.
+		 */
+		if (cid < 0)
+			break;
+
+		/* check that we're still within limits */
+		if (cpu_count > MAX_THREADS_PER_GROUP)
+			cpu_count = MAX_THREADS_PER_GROUP;
+
+		if (cpu_count + global.nbthread > MAX_THREADS)
+			cpu_count = MAX_THREADS - global.nbthread;
+
+		if (cpu_count <= 0)
+			break;
+
+		/* let's create the new thread group */
+		ha_tgroup_info[global.nbtgroups].base  = global.nbthread;
+		ha_tgroup_info[global.nbtgroups].count = cpu_count;
+
+		/* assign to this group the required number of threads */
+		for (thr = 0; thr < cpu_count; thr++) {
+			ha_thread_info[thr + global.nbthread].tgid   = global.nbtgroups + 1;
+			ha_thread_info[thr + global.nbthread].tg     = &ha_tgroup_info[global.nbtgroups];
+			ha_thread_info[thr + global.nbthread].tg_ctx = &ha_tgroup_ctx[global.nbtgroups];
+			/* map these threads to all the CPUs */
+			ha_cpuset_assign(&cpu_map[global.nbtgroups].thread[thr], &node_cpu_set);
+		}
+
+		lcid = cid; // last cluster_id
+		global.nbthread += cpu_count;
+		global.nbtgroups++;
+	}
+
+	if (global.nbthread)
+		ha_diag_warning("Created %d threads split into %d groups\n", global.nbthread, global.nbtgroups);
+	else
+		ha_diag_warning("Could not determine any CPU cluster\n");
+
+	return 0;
+}
+
 /* apply the chosen CPU policy if no cpu-map was forced. Returns < 0 on failure
  * with a message in *err that must be freed by the caller if non-null.
  */
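
Usage note (not part of the patch): a minimal sketch of how this policy would be enabled, assuming the "cpu-policy" global keyword documented in the hunk above and a hypothetical 16-core machine exposed by the kernel as two 8-core clusters (e.g. two CCX):

    global
        # ask haproxy to build one thread group per detected CPU cluster;
        # on the assumed 2x8 topology this should yield 2 groups of 8
        # threads, each group bound to the CPUs of its own cluster
        cpu-policy group-by-cluster

With this policy neither "nbthread" nor "nbtgroups" needs to be set; if either is specified, cpu_policy_group_by_cluster() returns early and leaves the thread layout untouched.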