From 3c923d075c8c410b03201008481702a1be8041be Mon Sep 17 00:00:00 2001 From: Olivier Houchard Date: Thu, 28 May 2026 17:24:08 +0200 Subject: [PATCH] MEDIUM: servers: Move to a per-thread idle connection cleanup task Having a single task to take care of idle connection cleanup across all servers leads to high contention. It uses a lock to maintain its tree of servers to track, and then can acquire the idle_conns lock for each thread. Instead, have one task per thread. Each thread will maintain its own tree, so there will be no need for any lock, and it will just acquire its own idle_conns lock, so it will lead to less contention. This is a performance improvement, so backporting is optional, but may be considered if it is worth it. That would require backporting commit 6f8dab258379dd53e327433ffd890c6d3d6f89ed too. --- include/haproxy/server-t.h | 2 +- include/haproxy/server.h | 4 +- src/cfgparse.c | 19 +++++---- src/haproxy.c | 7 ++- src/server.c | 87 ++++++++++++++++++-------------------- 5 files changed, 59 insertions(+), 60 deletions(-) diff --git a/include/haproxy/server-t.h b/include/haproxy/server-t.h index 56d2edef7..74f206a92 100644 --- a/include/haproxy/server-t.h +++ b/include/haproxy/server-t.h @@ -280,6 +280,7 @@ struct srv_per_thread { struct ceb_root *safe_conns; /* Safe idle connections */ struct ceb_root *avail_conns; /* Connections in use, but with still new streams available */ struct server *srv; /* Back-pointer to the server */ + struct eb32_node idle_node; /* When to next do cleanup in the idle connections */ #ifdef USE_QUIC struct ist quic_retry_token; #endif @@ -402,7 +403,6 @@ struct server { * thread, and generally at the same time. */ THREAD_ALIGN(); - struct eb32_node idle_node; /* When to next do cleanup in the idle connections */ unsigned int curr_idle_conns; /* Current number of orphan idling connections, both the idle and the safe lists */ unsigned int curr_idle_nb; /* Current number of connections in the idle list */ unsigned int curr_safe_nb; /* Current number of connections in the safe list */ diff --git a/include/haproxy/server.h b/include/haproxy/server.h index 0d5892968..3f1ebe5ee 100644 --- a/include/haproxy/server.h +++ b/include/haproxy/server.h @@ -41,9 +41,9 @@ #include -__decl_thread(extern HA_SPINLOCK_T idle_conn_srv_lock); extern struct idle_conns idle_conns[MAX_THREADS]; -extern struct task *idle_conn_task; +extern struct task *idle_conn_task[MAX_THREADS]; +extern struct eb_root idle_conn_srv[MAX_THREADS]; extern struct mt_list servers_list; extern struct dict server_key_dict; diff --git a/src/cfgparse.c b/src/cfgparse.c index 2e684e15d..5e54500e6 100644 --- a/src/cfgparse.c +++ b/src/cfgparse.c @@ -2486,16 +2486,17 @@ init_proxies_list_stage1: /* At this point, target names have already been resolved. */ /***********************************************************/ - idle_conn_task = task_new_anywhere(); - if (!idle_conn_task) { - ha_alert("parsing : failed to allocate global idle connection task.\n"); - cfgerr++; - } - else { - idle_conn_task->process = srv_cleanup_idle_conns; - idle_conn_task->context = NULL; + for (int i = 0; i < global.nbthread; i++) { + idle_conn_srv[i] = EB_ROOT; + idle_conn_task[i] = task_new_on(i); + if (!idle_conn_task[i]) { + ha_alert("parsing : failed to allocate global idle connection task.\n"); + cfgerr++; + } + else { + idle_conn_task[i]->process = srv_cleanup_idle_conns; + idle_conn_task[i]->context = NULL; - for (i = 0; i < global.nbthread; i++) { idle_conns[i].cleanup_task = task_new_on(i); if (!idle_conns[i].cleanup_task) { ha_alert("parsing : failed to allocate idle connection tasks for thread '%d'.\n", i); diff --git a/src/haproxy.c b/src/haproxy.c index 6af7b031e..804237203 100644 --- a/src/haproxy.c +++ b/src/haproxy.c @@ -2810,6 +2810,7 @@ void deinit(void) struct cfg_postparser *pprs, *pprsb; char **tmp = init_env; int cur_fd; + int i; /* the user may want to skip this phase */ if (global.tune.options & GTUNE_QUICK_EXIT) @@ -2886,8 +2887,10 @@ void deinit(void) ha_free(&global.server_state_base); ha_free(&global.server_state_file); ha_free(&global.stats_file); - task_destroy(idle_conn_task); - idle_conn_task = NULL; + for (i = 0; i < global.nbthread; i++) { + task_destroy(idle_conn_task[i]); + idle_conn_task[i] = NULL; + } list_for_each_entry_safe(log, logb, &global.loggers, list) { LIST_DEL_INIT(&log->list); diff --git a/src/server.c b/src/server.c index 330b08427..957fc4c34 100644 --- a/src/server.c +++ b/src/server.c @@ -76,9 +76,8 @@ struct srv_kw_list srv_keywords = { .list = LIST_HEAD_INIT(srv_keywords.list) }; -__decl_thread(HA_SPINLOCK_T idle_conn_srv_lock); -struct eb_root idle_conn_srv = EB_ROOT; -struct task *idle_conn_task __read_mostly = NULL; +struct eb_root idle_conn_srv[MAX_THREADS]; +struct task *idle_conn_task[MAX_THREADS] __read_mostly = {}; struct mt_list servers_list = MT_LIST_HEAD_INIT(servers_list); static struct task *server_atomic_sync_task = NULL; static event_hdl_async_equeue server_atomic_sync_queue; @@ -3281,6 +3280,8 @@ struct server *srv_drop(struct server *srv) /* This BUG_ON() is invalid for now as server released on deinit will * trigger it as they are not properly removed from their tree. + * This is even more relevant now, as we would need to check the + * idle_node for each thread */ //BUG_ON(ceb_intree(&srv->addr_node) || // srv->idle_node.node.leaf_p || @@ -6652,7 +6653,8 @@ static int cli_parse_delete_server(char **args, char *payload, struct appctx *ap cebuis_item_delete(&be->used_server_addr, addr_node, addr_key, srv); /* remove srv from idle_node tree for idle conn cleanup */ - eb32_delete(&srv->idle_node); + for (ret = 0; ret < global.nbthread; ret++) + eb32_delete(&srv->per_thr[ret].idle_node); /* set LSB bit (odd bit) for reuse_cnt */ srv_id_reuse_cnt |= 1; @@ -7545,20 +7547,16 @@ int srv_add_to_idle_list(struct server *srv, struct connection *conn, int is_saf HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[tid].idle_conns_lock); _HA_ATOMIC_INC(&srv->curr_idle_thr[tid]); - if (HA_ATOMIC_LOAD(&srv->idle_node.node.leaf_p) == NULL) { - HA_SPIN_LOCK(OTHER_LOCK, &idle_conn_srv_lock); - if (_HA_ATOMIC_LOAD(&srv->idle_node.node.leaf_p) == NULL) { - srv->idle_node.key = tick_add(srv->pool_purge_delay, + if (srv->per_thr[tid].idle_node.node.leaf_p == NULL) { + srv->per_thr[tid].idle_node.key = tick_add(srv->pool_purge_delay, now_ms); - eb32_insert(&idle_conn_srv, &srv->idle_node); - if (!task_in_wq(idle_conn_task) && ! - task_in_rq(idle_conn_task)) { - task_schedule(idle_conn_task, - srv->idle_node.key); - } - BUG_ON_STRESS(!mt_list_isempty(&conn->toremove_list)); + eb32_insert(&idle_conn_srv[tid], &srv->per_thr[tid].idle_node); + if (!task_in_wq(idle_conn_task[tid]) && + !task_in_rq(idle_conn_task[tid])) { + task_schedule(idle_conn_task[tid], + srv->per_thr[tid].idle_node.key); } - HA_SPIN_UNLOCK(OTHER_LOCK, &idle_conn_srv_lock); + BUG_ON_STRESS(!mt_list_isempty(&conn->toremove_list)); } return 1; } @@ -7580,24 +7578,26 @@ struct task *srv_cleanup_idle_conns(struct task *task, void *context, unsigned i { struct server *srv; struct eb32_node *eb; - int i; unsigned int next_wakeup; + int mytid = tid; next_wakeup = TICK_ETERNITY; - HA_SPIN_LOCK(OTHER_LOCK, &idle_conn_srv_lock); while (1) { + struct srv_per_thread *per_thr; int exceed_conns; int to_kill; int curr_idle; + int max_conn; + int removed; - eb = eb32_lookup_ge(&idle_conn_srv, now_ms - TIMER_LOOK_BACK); + eb = eb32_lookup_ge(&idle_conn_srv[mytid], now_ms - TIMER_LOOK_BACK); if (!eb) { /* we might have reached the end of the tree, typically because * is in the first half and we're first scanning the last * half. Let's loop back to the beginning of the tree now. */ - eb = eb32_first(&idle_conn_srv); + eb = eb32_first(&idle_conn_srv[mytid]); if (likely(!eb)) break; } @@ -7606,7 +7606,8 @@ struct task *srv_cleanup_idle_conns(struct task *task, void *context, unsigned i next_wakeup = eb->key; break; } - srv = eb32_entry(eb, struct server, idle_node); + per_thr = eb32_entry(eb, struct srv_per_thread, idle_node); + srv = per_thr->srv; /* Calculate how many idle connections we want to kill : * we want to remove half the difference between the total @@ -7619,47 +7620,41 @@ struct task *srv_cleanup_idle_conns(struct task *task, void *context, unsigned i exceed_conns = srv->curr_used_conns + curr_idle - MAX(srv->max_used_conns, srv->est_need_conns); exceed_conns = to_kill = exceed_conns / 2 + (exceed_conns & 1); - srv->est_need_conns = (srv->est_need_conns + srv->max_used_conns) / 2; + /* + * It is acceptable not to lock anything before modifying + * est_need_conns and max_used_conns, even if multiple threads + * are running that task at the same time, we don't need a + * very high precision here, it will converge over time. + */ + HA_ATOMIC_STORE(&srv->est_need_conns, (srv->est_need_conns + srv->max_used_conns) / 2); if (srv->est_need_conns < srv->max_used_conns) - srv->est_need_conns = srv->max_used_conns; + HA_ATOMIC_STORE(&srv->est_need_conns, srv->max_used_conns); HA_ATOMIC_STORE(&srv->max_used_conns, srv->curr_used_conns); if (exceed_conns <= 0) goto remove; - /* check all threads starting with ours */ - for (i = tid;;) { - int max_conn; - int removed; + max_conn = (exceed_conns * srv->curr_idle_thr[mytid]) / curr_idle + 1; - max_conn = (exceed_conns * srv->curr_idle_thr[i]) / - curr_idle + 1; + HA_SPIN_LOCK(IDLE_CONNS_LOCK, &idle_conns[mytid].idle_conns_lock); + removed = srv_migrate_conns_to_remove(srv, mytid, max_conn); + HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[mytid].idle_conns_lock); - HA_SPIN_LOCK(IDLE_CONNS_LOCK, &idle_conns[i].idle_conns_lock); - removed = srv_migrate_conns_to_remove(srv, i, max_conn); - HA_SPIN_UNLOCK(IDLE_CONNS_LOCK, &idle_conns[i].idle_conns_lock); - - if (removed) - task_wakeup(idle_conns[i].cleanup_task, TASK_WOKEN_OTHER); - - if ((i = ((i + 1 == global.nbthread) ? 0 : i + 1)) == tid) - break; - } + if (removed) + task_wakeup(idle_conns[mytid].cleanup_task, TASK_WOKEN_OTHER); remove: - eb32_delete(&srv->idle_node); + eb32_delete(&srv->per_thr[mytid].idle_node); - if (srv->curr_idle_conns) { + if (!LIST_ISEMPTY(&srv->per_thr[mytid].idle_conn_list)) { /* There are still more idle connections, add the * server back in the tree. */ - srv->idle_node.key = tick_add(srv->pool_purge_delay, now_ms); - eb32_insert(&idle_conn_srv, &srv->idle_node); - next_wakeup = tick_first(next_wakeup, srv->idle_node.key); + srv->per_thr[mytid].idle_node.key = tick_add(srv->pool_purge_delay, now_ms); + eb32_insert(&idle_conn_srv[mytid], &srv->per_thr[mytid].idle_node); + next_wakeup = tick_first(next_wakeup, srv->per_thr[mytid].idle_node.key); } } - HA_SPIN_UNLOCK(OTHER_LOCK, &idle_conn_srv_lock); - task->expire = next_wakeup; return task; }