2006-06-25 20:48:02 -04:00
|
|
|
/*
|
2020-06-04 11:25:40 -04:00
|
|
|
* include/haproxy/task.h
|
2010-08-27 11:56:48 -04:00
|
|
|
* Functions for task management.
|
|
|
|
|
*
|
2020-06-04 11:25:40 -04:00
|
|
|
* Copyright (C) 2000-2020 Willy Tarreau - w@1wt.eu
|
2010-08-27 11:56:48 -04:00
|
|
|
*
|
|
|
|
|
* This library is free software; you can redistribute it and/or
|
|
|
|
|
* modify it under the terms of the GNU Lesser General Public
|
|
|
|
|
* License as published by the Free Software Foundation, version 2.1
|
|
|
|
|
* exclusively.
|
|
|
|
|
*
|
|
|
|
|
* This library is distributed in the hope that it will be useful,
|
|
|
|
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
|
|
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
|
|
|
* Lesser General Public License for more details.
|
|
|
|
|
*
|
|
|
|
|
* You should have received a copy of the GNU Lesser General Public
|
|
|
|
|
* License along with this library; if not, write to the Free Software
|
|
|
|
|
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
|
|
|
|
|
*/
|
2006-06-25 20:48:02 -04:00
|
|
|
|
2020-06-04 11:25:40 -04:00
|
|
|
#ifndef _HAPROXY_TASK_H
|
|
|
|
|
#define _HAPROXY_TASK_H
|
2006-06-25 20:48:02 -04:00
|
|
|
|
|
|
|
|
|
|
|
|
|
#include <sys/time.h>
|
2006-06-29 12:54:54 -04:00
|
|
|
|
2020-06-04 11:25:40 -04:00
|
|
|
#include <import/eb32tree.h>
|
|
|
|
|
|
2021-10-07 10:37:42 -04:00
|
|
|
#include <haproxy/activity.h>
|
2020-05-27 06:58:42 -04:00
|
|
|
#include <haproxy/api.h>
|
2021-10-08 03:33:24 -04:00
|
|
|
#include <haproxy/clock.h>
|
2020-06-04 11:25:40 -04:00
|
|
|
#include <haproxy/fd.h>
|
2020-06-04 11:05:57 -04:00
|
|
|
#include <haproxy/global.h>
|
2020-06-01 06:09:26 -04:00
|
|
|
#include <haproxy/intops.h>
|
2020-05-27 12:01:47 -04:00
|
|
|
#include <haproxy/list.h>
|
2020-06-04 11:25:40 -04:00
|
|
|
#include <haproxy/pool.h>
|
|
|
|
|
#include <haproxy/task-t.h>
|
2020-05-28 09:29:19 -04:00
|
|
|
#include <haproxy/thread.h>
|
2020-06-04 11:25:40 -04:00
|
|
|
#include <haproxy/ticks.h>
|
2017-09-27 08:59:38 -04:00
|
|
|
|
2019-09-24 08:55:28 -04:00
|
|
|
|
2009-03-08 10:53:06 -04:00
|
|
|
/* Principle of the wait queue.
|
|
|
|
|
*
|
|
|
|
|
* We want to be able to tell whether an expiration date is before of after the
|
|
|
|
|
* current time <now>. We KNOW that expiration dates are never too far apart,
|
|
|
|
|
* because they are measured in ticks (milliseconds). We also know that almost
|
|
|
|
|
* all dates will be in the future, and that a very small part of them will be
|
|
|
|
|
* in the past, they are the ones which have expired since last time we checked
|
|
|
|
|
* them. Using ticks, we know if a date is in the future or in the past, but we
|
|
|
|
|
* cannot use that to store sorted information because that reference changes
|
|
|
|
|
* all the time.
|
|
|
|
|
*
|
2009-03-21 05:01:42 -04:00
|
|
|
* We'll use the fact that the time wraps to sort timers. Timers above <now>
|
|
|
|
|
* are in the future, timers below <now> are in the past. Here, "above" and
|
|
|
|
|
* "below" are to be considered modulo 2^31.
|
2009-03-08 10:53:06 -04:00
|
|
|
*
|
2009-03-21 05:01:42 -04:00
|
|
|
* Timers are stored sorted in an ebtree. We use the new ability for ebtrees to
|
|
|
|
|
* lookup values starting from X to only expire tasks between <now> - 2^31 and
|
|
|
|
|
* <now>. If the end of the tree is reached while walking over it, we simply
|
|
|
|
|
* loop back to the beginning. That way, we have no problem keeping sorted
|
|
|
|
|
* wrapping timers in a tree, between (now - 24 days) and (now + 24 days). The
|
|
|
|
|
* keys in the tree always reflect their real position, none can be infinite.
|
|
|
|
|
* This reduces the number of checks to be performed.
|
2009-03-08 10:53:06 -04:00
|
|
|
*
|
|
|
|
|
* Another nice optimisation is to allow a timer to stay at an old place in the
|
|
|
|
|
* queue as long as it's not further than the real expiration date. That way,
|
|
|
|
|
* we use the tree as a place holder for a minorant of the real expiration
|
|
|
|
|
* date. Since we have a very low chance of hitting a timeout anyway, we can
|
|
|
|
|
* bounce the nodes to their right place when we scan the tree if we encounter
|
|
|
|
|
* a misplaced node once in a while. This even allows us not to remove the
|
|
|
|
|
* infinite timers from the wait queue.
|
|
|
|
|
*
|
|
|
|
|
* So, to summarize, we have :
|
|
|
|
|
* - node->key always defines current position in the wait queue
|
|
|
|
|
* - timer is the real expiration date (possibly infinite)
|
2009-03-21 05:01:42 -04:00
|
|
|
* - node->key is always before or equal to timer
|
2009-03-08 10:53:06 -04:00
|
|
|
*
|
|
|
|
|
* The run queue works similarly to the wait queue except that the current date
|
|
|
|
|
* is replaced by an insertion counter which can also wrap without any problem.
|
|
|
|
|
*/
|
|
|
|
|
|
2009-03-21 05:01:42 -04:00
|
|
|
/* The farthest we can look back in a timer tree */
|
|
|
|
|
#define TIMER_LOOK_BACK (1U << 31)
|
2009-03-08 10:53:06 -04:00
|
|
|
|
2020-06-04 11:25:40 -04:00
|
|
|
/* tasklets are recognized with nice==-32768 */
|
2021-03-02 09:54:11 -05:00
|
|
|
#define TASK_IS_TASKLET(t) ((t)->state & TASK_F_TASKLET)
|
2020-06-04 11:25:40 -04:00
|
|
|
|
2009-03-08 10:53:06 -04:00
|
|
|
/* a few exported variables */
|
2017-11-24 11:34:44 -05:00
|
|
|
extern struct pool_head *pool_head_task;
|
2018-05-18 12:45:28 -04:00
|
|
|
extern struct pool_head *pool_head_tasklet;
|
2017-11-24 11:34:44 -05:00
|
|
|
extern struct pool_head *pool_head_notification;
|
2020-06-04 11:25:40 -04:00
|
|
|
|
BUG/MAJOR: sched: protect task during removal from wait queue
The issue addressed by commit fbb934da9 ("BUG/MEDIUM: stick-table: fix
a race condition when updating the expiration task") is still present
when thread groups are enabled, but this time it lies in the scheduler.
What happens is that a task configured to run anywhere might already
have been queued into one group's wait queue. When updating a stick
table entry, sometimes the task will have to be dequeued and requeued.
For this a lock is taken on the current thread group's wait queue lock,
but while this is necessary for the queuing, it's not sufficient for
dequeuing since another thread might be in the process of expiring this
task under its own group's lock which is different. This is easy to test
using 3 stick tables with 1ms expiration, 3 track-sc rules and 4 thread
groups. The process crashes almost instantly under heavy traffic.
One approach could consist in storing the group number the task was
queued under in its descriptor (we don't need 32 bits to store the
thread id, it's possible to use one short for the tid and another
one for the tgrp). Sadly, no safe way to do this was figured, because
the race remains at the moment the thread group number is checked, as
it might be in the process of being changed by another thread. It seems
that a working approach could consist in always having it associated
with one group, and only allowing to change it under this group's lock,
so that any code trying to change it would have to iterately read it
and lock its group until the value matches, confirming it really holds
the correct lock. But this seems a bit complicated, particularly with
wait_expired_tasks() which already uses upgradable locks to switch from
read state to a write state.
Given that the shared tasks are not that common (stick-table expirations,
rate-limited listeners, maybe resolvers), it doesn't seem worth the extra
complexity for now. This patch takes a simpler and safer approach
consisting in switching back to a single wq_lock, but still keeping
separate wait queues. Given that shared wait queues are almost always
empty and that otherwise they're scanned under a read lock, the
contention remains manageable and most of the time the lock doesn't
even need to be taken since such tasks are not present in a group's
queue. In essence, this patch reverts half of the aforementionned
patch. This was tested and confirmed to work fine, without observing
any performance degradation under any workload. The performance with
8 groups on an EPYC 74F3 and 3 tables remains twice the one of a
single group, with the contention remaining on the table's lock first.
No backport is needed.
2022-11-22 01:05:44 -05:00
|
|
|
__decl_thread(extern HA_RWLOCK_T wq_lock THREAD_ALIGNED(64));
|
|
|
|
|
|
2021-02-24 11:51:38 -05:00
|
|
|
void __tasklet_wakeup_on(struct tasklet *tl, int thr);
|
2022-06-29 04:53:03 -04:00
|
|
|
struct list *__tasklet_wakeup_after(struct list *head, struct tasklet *tl);
|
2020-06-30 05:48:48 -04:00
|
|
|
void task_kill(struct task *t);
|
2021-07-28 10:12:57 -04:00
|
|
|
void tasklet_kill(struct tasklet *t);
|
2021-02-24 10:41:11 -05:00
|
|
|
void __task_wakeup(struct task *t);
|
2020-06-04 11:25:40 -04:00
|
|
|
void __task_queue(struct task *task, struct eb_root *wq);
|
2021-09-30 02:52:11 -04:00
|
|
|
|
2020-06-24 04:17:29 -04:00
|
|
|
unsigned int run_tasks_from_lists(unsigned int budgets[]);
|
2020-06-04 11:25:40 -04:00
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* This does 3 things :
|
|
|
|
|
* - wake up all expired tasks
|
|
|
|
|
* - call all runnable tasks
|
|
|
|
|
* - return the date of next event in <next> or eternity.
|
|
|
|
|
*/
|
|
|
|
|
|
CLEANUP: tree-wide: fix prototypes for functions taking no arguments.
"f(void)" is the correct and preferred form for a function taking no
argument, while some places use the older "f()". These were reported
by clang's -Wmissing-prototypes, for example:
src/cpuset.c:111:5: warning: no previous prototype for function 'ha_cpuset_size' [-Wmissing-prototypes]
int ha_cpuset_size()
include/haproxy/cpuset.h:42:5: note: this declaration is not a prototype; add 'void' to make it a prototype for a zero-parameter function
int ha_cpuset_size();
^
void
This aggregate patch fixes this for the following functions:
ha_backtrace_to_stderr(), ha_cpuset_size(), ha_panic(), ha_random64(),
ha_thread_dump_all_to_trash(), get_exec_path(), check_config_validity(),
mworker_child_nb(), mworker_cli_proxy_(create|stop)(),
mworker_cleantasks(), mworker_cleanlisteners(), mworker_ext_launch_all(),
mworker_reload(), mworker_(env|proc_list)_to_(proc_list|env)(),
mworker_(un|)block_signals(), proxy_adjust_all_maxconn(),
proxy_destroy_all_defaults(), get_tainted(),
pool_total_(allocated|used)(), thread_isolate(_full|)(),
thread(_sync|)_release(), thread_harmless_till_end(),
thread_cpu_mask_forced(), dequeue_all_listeners(), next_timer_expiry(),
wake_expired_tasks(), process_runnable_tasks(), init_acl(),
init_buffer(), (de|)init_log_buffers(), (de|)init_pollers(),
fork_poller(), pool_destroy_all(), pool_evict_from_local_caches(),
pool_total_failures(), dump_pools_to_trash(), cfg_run_diagnostics(),
tv_init_(process|thread)_date(), __signal_process_queue(),
deinit_signals(), haproxy_unblock_signals()
2021-09-12 06:49:33 -04:00
|
|
|
void process_runnable_tasks(void);
|
2020-06-04 11:25:40 -04:00
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Extract all expired timers from the timer queue, and wakes up all
|
|
|
|
|
* associated tasks.
|
|
|
|
|
*/
|
CLEANUP: tree-wide: fix prototypes for functions taking no arguments.
"f(void)" is the correct and preferred form for a function taking no
argument, while some places use the older "f()". These were reported
by clang's -Wmissing-prototypes, for example:
src/cpuset.c:111:5: warning: no previous prototype for function 'ha_cpuset_size' [-Wmissing-prototypes]
int ha_cpuset_size()
include/haproxy/cpuset.h:42:5: note: this declaration is not a prototype; add 'void' to make it a prototype for a zero-parameter function
int ha_cpuset_size();
^
void
This aggregate patch fixes this for the following functions:
ha_backtrace_to_stderr(), ha_cpuset_size(), ha_panic(), ha_random64(),
ha_thread_dump_all_to_trash(), get_exec_path(), check_config_validity(),
mworker_child_nb(), mworker_cli_proxy_(create|stop)(),
mworker_cleantasks(), mworker_cleanlisteners(), mworker_ext_launch_all(),
mworker_reload(), mworker_(env|proc_list)_to_(proc_list|env)(),
mworker_(un|)block_signals(), proxy_adjust_all_maxconn(),
proxy_destroy_all_defaults(), get_tainted(),
pool_total_(allocated|used)(), thread_isolate(_full|)(),
thread(_sync|)_release(), thread_harmless_till_end(),
thread_cpu_mask_forced(), dequeue_all_listeners(), next_timer_expiry(),
wake_expired_tasks(), process_runnable_tasks(), init_acl(),
init_buffer(), (de|)init_log_buffers(), (de|)init_pollers(),
fork_poller(), pool_destroy_all(), pool_evict_from_local_caches(),
pool_total_failures(), dump_pools_to_trash(), cfg_run_diagnostics(),
tv_init_(process|thread)_date(), __signal_process_queue(),
deinit_signals(), haproxy_unblock_signals()
2021-09-12 06:49:33 -04:00
|
|
|
void wake_expired_tasks(void);
|
2020-06-04 11:25:40 -04:00
|
|
|
|
|
|
|
|
/* Checks the next timer for the current thread by looking into its own timer
|
|
|
|
|
* list and the global one. It may return TICK_ETERNITY if no timer is present.
|
|
|
|
|
* Note that the next timer might very well be slightly in the past.
|
|
|
|
|
*/
|
CLEANUP: tree-wide: fix prototypes for functions taking no arguments.
"f(void)" is the correct and preferred form for a function taking no
argument, while some places use the older "f()". These were reported
by clang's -Wmissing-prototypes, for example:
src/cpuset.c:111:5: warning: no previous prototype for function 'ha_cpuset_size' [-Wmissing-prototypes]
int ha_cpuset_size()
include/haproxy/cpuset.h:42:5: note: this declaration is not a prototype; add 'void' to make it a prototype for a zero-parameter function
int ha_cpuset_size();
^
void
This aggregate patch fixes this for the following functions:
ha_backtrace_to_stderr(), ha_cpuset_size(), ha_panic(), ha_random64(),
ha_thread_dump_all_to_trash(), get_exec_path(), check_config_validity(),
mworker_child_nb(), mworker_cli_proxy_(create|stop)(),
mworker_cleantasks(), mworker_cleanlisteners(), mworker_ext_launch_all(),
mworker_reload(), mworker_(env|proc_list)_to_(proc_list|env)(),
mworker_(un|)block_signals(), proxy_adjust_all_maxconn(),
proxy_destroy_all_defaults(), get_tainted(),
pool_total_(allocated|used)(), thread_isolate(_full|)(),
thread(_sync|)_release(), thread_harmless_till_end(),
thread_cpu_mask_forced(), dequeue_all_listeners(), next_timer_expiry(),
wake_expired_tasks(), process_runnable_tasks(), init_acl(),
init_buffer(), (de|)init_log_buffers(), (de|)init_pollers(),
fork_poller(), pool_destroy_all(), pool_evict_from_local_caches(),
pool_total_failures(), dump_pools_to_trash(), cfg_run_diagnostics(),
tv_init_(process|thread)_date(), __signal_process_queue(),
deinit_signals(), haproxy_unblock_signals()
2021-09-12 06:49:33 -04:00
|
|
|
int next_timer_expiry(void);
|
2020-06-04 11:25:40 -04:00
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Delete every tasks before running the master polling loop
|
|
|
|
|
*/
|
CLEANUP: tree-wide: fix prototypes for functions taking no arguments.
"f(void)" is the correct and preferred form for a function taking no
argument, while some places use the older "f()". These were reported
by clang's -Wmissing-prototypes, for example:
src/cpuset.c:111:5: warning: no previous prototype for function 'ha_cpuset_size' [-Wmissing-prototypes]
int ha_cpuset_size()
include/haproxy/cpuset.h:42:5: note: this declaration is not a prototype; add 'void' to make it a prototype for a zero-parameter function
int ha_cpuset_size();
^
void
This aggregate patch fixes this for the following functions:
ha_backtrace_to_stderr(), ha_cpuset_size(), ha_panic(), ha_random64(),
ha_thread_dump_all_to_trash(), get_exec_path(), check_config_validity(),
mworker_child_nb(), mworker_cli_proxy_(create|stop)(),
mworker_cleantasks(), mworker_cleanlisteners(), mworker_ext_launch_all(),
mworker_reload(), mworker_(env|proc_list)_to_(proc_list|env)(),
mworker_(un|)block_signals(), proxy_adjust_all_maxconn(),
proxy_destroy_all_defaults(), get_tainted(),
pool_total_(allocated|used)(), thread_isolate(_full|)(),
thread(_sync|)_release(), thread_harmless_till_end(),
thread_cpu_mask_forced(), dequeue_all_listeners(), next_timer_expiry(),
wake_expired_tasks(), process_runnable_tasks(), init_acl(),
init_buffer(), (de|)init_log_buffers(), (de|)init_pollers(),
fork_poller(), pool_destroy_all(), pool_evict_from_local_caches(),
pool_total_failures(), dump_pools_to_trash(), cfg_run_diagnostics(),
tv_init_(process|thread)_date(), __signal_process_queue(),
deinit_signals(), haproxy_unblock_signals()
2021-09-12 06:49:33 -04:00
|
|
|
void mworker_cleantasks(void);
|
2020-06-04 11:25:40 -04:00
|
|
|
|
2021-02-24 09:10:07 -05:00
|
|
|
/* returns the number of running tasks+tasklets on the whole process. Note
|
|
|
|
|
* that this *is* racy since a task may move from the global to a local
|
|
|
|
|
* queue for example and be counted twice. This is only for statistics
|
|
|
|
|
* reporting.
|
|
|
|
|
*/
|
|
|
|
|
static inline int total_run_queues()
|
|
|
|
|
{
|
|
|
|
|
int thr, ret = 0;
|
2020-06-04 11:25:40 -04:00
|
|
|
|
2021-02-24 09:10:07 -05:00
|
|
|
for (thr = 0; thr < global.nbthread; thr++)
|
2021-10-01 05:30:33 -04:00
|
|
|
ret += _HA_ATOMIC_LOAD(&ha_thread_ctx[thr].rq_total);
|
2021-02-24 09:10:07 -05:00
|
|
|
return ret;
|
|
|
|
|
}
|
2018-08-01 09:58:44 -04:00
|
|
|
|
2021-02-24 11:03:30 -05:00
|
|
|
/* returns the number of allocated tasks across all threads. Note that this
|
|
|
|
|
* *is* racy since some threads might be updating their counts while we're
|
|
|
|
|
* looking, but this is only for statistics reporting.
|
|
|
|
|
*/
|
|
|
|
|
static inline int total_allocated_tasks()
|
|
|
|
|
{
|
|
|
|
|
int thr, ret;
|
|
|
|
|
|
|
|
|
|
for (thr = ret = 0; thr < global.nbthread; thr++)
|
2021-10-01 05:30:33 -04:00
|
|
|
ret += _HA_ATOMIC_LOAD(&ha_thread_ctx[thr].nb_tasks);
|
2021-02-24 11:03:30 -05:00
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2023-09-06 05:33:53 -04:00
|
|
|
/* returns the number of running niced tasks+tasklets on the whole process.
|
|
|
|
|
* Note that this *is* racy since a task may move from the global to a local
|
|
|
|
|
* queue for example and be counted twice. This is only for statistics
|
|
|
|
|
* reporting.
|
|
|
|
|
*/
|
|
|
|
|
static inline int total_niced_running_tasks()
|
|
|
|
|
{
|
|
|
|
|
int tgrp, ret = 0;
|
|
|
|
|
|
|
|
|
|
for (tgrp = 0; tgrp < global.nbtgroups; tgrp++)
|
|
|
|
|
ret += _HA_ATOMIC_LOAD(&ha_tgroup_ctx[tgrp].niced_tasks);
|
|
|
|
|
return ret;
|
|
|
|
|
}
|
|
|
|
|
|
2009-03-07 11:25:21 -05:00
|
|
|
/* return 0 if task is in run queue, otherwise non-zero */
|
|
|
|
|
static inline int task_in_rq(struct task *t)
|
|
|
|
|
{
|
2018-05-18 12:45:28 -04:00
|
|
|
/* Check if leaf_p is NULL, in case he's not in the runqueue, and if
|
|
|
|
|
* it's not 0x1, which would mean it's in the tasklet list.
|
|
|
|
|
*/
|
2019-04-17 13:13:07 -04:00
|
|
|
return t->rq.node.leaf_p != NULL;
|
2009-03-07 11:25:21 -05:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* return 0 if task is in wait queue, otherwise non-zero */
|
|
|
|
|
static inline int task_in_wq(struct task *t)
|
|
|
|
|
{
|
|
|
|
|
return t->wq.node.leaf_p != NULL;
|
|
|
|
|
}
|
|
|
|
|
|
2020-06-04 11:25:40 -04:00
|
|
|
/* returns true if the current thread has some work to do */
static inline int thread_has_tasks(void)
{
	/* Work exists if any of these is non-empty: the thread's private run
	 * queue, its shared run queue, any of its tasklet class lists
	 * (tl_class_mask has a bit per non-empty list), or the shared tasklet
	 * list. Bitwise '|' (not '||') is deliberate: all four checks are
	 * cheap and evaluating them unconditionally avoids branches.
	 */
	return ((int)!eb_is_empty(&th_ctx->rqueue) |
	        (int)!eb_is_empty(&th_ctx->rqueue_shared) |
	        (int)!!th_ctx->tl_class_mask |
	        (int)!MT_LIST_ISEMPTY(&th_ctx->shared_tasklet_list));
}
|
|
|
|
|
|
2008-08-29 12:19:04 -04:00
|
|
|
/* puts the task <t> in run queue with reason flags <f>, and returns <t> */
/* This will put the task in the local runqueue if the task is only runnable
 * by the current thread, in the global runqueue otherwise. With DEBUG_TASK,
 * the <file>:<line> from the call place are stored into the task for tracing
 * purposes.
 */
#define task_wakeup(t, f) \
	_task_wakeup(t, f, MK_CALLER(WAKEUP_TYPE_TASK_WAKEUP, 0, 0))

static inline void _task_wakeup(struct task *t, unsigned int f, const struct ha_caller *caller)
{
	unsigned int state;

	/* atomically merge the wakeup reason flags and get the resulting state */
	state = _HA_ATOMIC_OR_FETCH(&t->state, f);
	while (!(state & (TASK_RUNNING | TASK_QUEUED))) {
		/* Only the thread that successfully flips TASK_QUEUED from 0
		 * to 1 is responsible for inserting the task into the run
		 * queue; a failed CAS reloads <state> and retries (or stops
		 * if another thread queued/ran it meanwhile).
		 */
		if (_HA_ATOMIC_CAS(&t->state, &state, state | TASK_QUEUED)) {
			if (likely(caller)) {
				/* record the wakeup call place for tracing */
				caller = HA_ATOMIC_XCHG(&t->caller, caller);
				BUG_ON((ulong)caller & 1);
#ifdef DEBUG_TASK
				HA_ATOMIC_STORE(&t->debug.prev_caller, caller);
#endif
			}
			__task_wakeup(t);
			break;
		}
	}
}
|
2006-06-25 20:48:02 -04:00
|
|
|
|
BUG/MAJOR: sched: prevent rare concurrent wakeup of multi-threaded tasks
Since the relaxation of the run-queue locks in 2.0 there has been a
very small but existing race between expired tasks and running tasks:
a task might be expiring and being woken up at the same time, on
different threads. This is protected against via the TASK_QUEUED and
TASK_RUNNING flags, but just after the task finishes executing, it
releases it TASK_RUNNING bit an only then it may go to task_queue().
This one will do nothing if the task's ->expire field is zero, but
if the field turns to zero between this test and the call to
__task_queue() then three things may happen:
- the task may remain in the WQ until the 24 next days if it's in
the future;
- the task may prevent any other task after it from expiring during
the 24 next days once it's queued
- if DEBUG_STRICT is set on 2.4 and above, an abort may happen
- since 2.2, if the task got killed in between, then we may
even requeue a freed task, causing random behaviour next time
it's found there, or possibly corrupting the tree if it gets
reinserted later.
The peers code is one call path that easily reproduces the case with
the ->expire field being reset, because it starts by setting it to
TICK_ETERNITY as the first thing when entering the task handler. But
other code parts also use multi-threaded tasks and rightfully expect
to be able to touch their expire field without causing trouble. No
trivial code path was found that would destroy such a shared task at
runtime, which already limits the risks.
This must be backported to 2.0.
2022-02-14 04:18:51 -05:00
|
|
|
/* Atomically drop the TASK_RUNNING bit while ensuring that any wakeup that
 * happened since the flag was set will result in the task being queued (if
 * it wasn't already). This is used to safely drop the flag from within the
 * scheduler. The flag <f> is combined with existing flags before the test so
 * that it's possible to unconditionally wakeup the task and drop the RUNNING
 * flag if needed.
 */
static inline void task_drop_running(struct task *t, unsigned int f)
{
	unsigned int state, new_state;

	state = _HA_ATOMIC_LOAD(&t->state);

	/* CAS loop: build the desired state from the freshly observed one and
	 * retry until it is installed. On CAS failure <state> is refreshed
	 * with the current value.
	 */
	while (1) {
		new_state = state | f;
		/* any pending wakeup reason means the task must be requeued
		 * once RUNNING is dropped, so mark it QUEUED in the same CAS.
		 */
		if (new_state & TASK_WOKEN_ANY)
			new_state |= TASK_QUEUED;

		if (_HA_ATOMIC_CAS(&t->state, &state, new_state & ~TASK_RUNNING))
			break;
		__ha_cpu_relax();
	}

	/* if *we* turned TASK_QUEUED on (it wasn't set before our CAS), we own
	 * the insertion into the run queue.
	 */
	if ((new_state & ~state) & TASK_QUEUED)
		__task_wakeup(t);
}
|
|
|
|
|
|
2009-03-07 11:25:21 -05:00
|
|
|
/*
 * Unlink the task from the wait queue. A pointer to the task itself is
 * returned. The task *must* already be in the wait queue before calling
 * this function. If unsure, use the safer task_unlink_wq() function.
 */
static inline struct task *__task_unlink_wq(struct task *t)
{
	/* detach the task's timer node from the wait queue tree; no locking
	 * here — the caller is responsible for holding the proper wq lock.
	 */
	eb32_delete(&t->wq);
	return t;
}
|
|
|
|
|
|
2018-10-15 08:52:21 -04:00
|
|
|
/* remove a task from its wait queue. It may either be the local wait queue if
|
BUG/MAJOR: task: add a new TASK_SHARED_WQ flag to fix foreing requeuing
Since 1.9 with commit b20aa9eef3 ("MAJOR: tasks: create per-thread wait
queues") a task bound to a single thread will not use locks when being
queued or dequeued because the wait queue is assumed to be the owner
thread's.
But there exists a rare situation where this is not true: the health
check tasks may be running on one thread waiting for a response, and
may in parallel be requeued by another thread calling health_adjust()
after a detecting a response error in traffic when "observe l7" is set,
and "fastinter" is lower than "inter", requiring to shorten the running
check's timeout. In this case, the task being requeued was present in
another thread's wait queue, thus opening a race during task_unlink_wq(),
and gets requeued into the calling thread's wait queue instead of the
running one's, opening a second race here.
This patch aims at protecting against the risk of calling task_unlink_wq()
from one thread while the task is queued on another thread, hence unlocked,
by introducing a new TASK_SHARED_WQ flag.
This new flag indicates that a task's position in the wait queue may be
adjusted by other threads than then one currently executing it. This means
that such WQ manipulations must be performed under a lock. There are two
types of such tasks:
- the global ones, using the global wait queue (technically speaking,
those whose thread_mask has at least 2 bits set).
- some local ones, which for now will be placed into the global wait
queue as well in order to benefit from its lock.
The flag is automatically set on initialization if the task's thread mask
indicates more than one thread. The caller must also set it if it intends
to let other threads update the task's expiration delay (e.g. delegated
I/Os), or if it intends to change the task's affinity over time as this
could lead to the same situation.
Right now only the situation described above seems to be affected by this
issue, and it is very difficult to trigger, and even then, will often have
no visible effect beyond stopping the checks for example once the race is
met. On my laptop it is feasible with the following config, chained to
httpterm:
global
maxconn 400 # provoke FD errors, calling health_adjust()
defaults
mode http
timeout client 10s
timeout server 10s
timeout connect 10s
listen px
bind :8001
option httpchk /?t=50
server sback 127.0.0.1:8000 backup
server-template s 0-999 127.0.0.1:8000 check port 8001 inter 100 fastinter 10 observe layer7
This patch will automatically address the case for the checks because
check tasks are created with multiple threads bound and will get the
TASK_SHARED_WQ flag set.
If in the future more tasks need to rely on this (multi-threaded muxes
for example) and the use of the global wait queue becomes a bottleneck
again, then it should not be too difficult to place locks on the local
wait queues and queue the task on its bound thread.
This patch needs to be backported to 2.1, 2.0 and 1.9. It depends on
previous patch "MINOR: task: only check TASK_WOKEN_ANY to decide to
requeue a task".
Many thanks to William Dauchy for providing detailed traces allowing to
spot the problem.
2019-12-19 01:39:06 -05:00
|
|
|
* the task is bound to a single thread or the global queue. If the task uses a
|
|
|
|
|
* shared wait queue, the global wait queue lock is used.
|
2018-10-15 08:52:21 -04:00
|
|
|
*/
|
2009-03-07 11:25:21 -05:00
|
|
|
static inline struct task *task_unlink_wq(struct task *t)
|
2006-06-25 20:48:02 -04:00
|
|
|
{
|
BUG/MAJOR: fd/threads, task/threads: ensure all spin locks are unlocked
Calculate if the fd or task should be locked once, before locking, and
reuse the calculation when determing when to unlock.
Fixes a race condition added in 87d54a9a for fds, and b20aa9ee for tasks,
released in 1.9-dev4. When one thread modifies thread_mask to be a single
thread for a task or fd while a second thread has locked or is waiting on a
lock for that task or fd, the second thread will not unlock it. For FDs,
this is observable when a listener is polled by multiple threads, and is
closed while those threads have events pending. For tasks, this seems
possible, where task_set_affinity is called, but I did not observe it.
This must be backported to 1.9.
2019-02-20 15:43:45 -05:00
|
|
|
unsigned long locked;
|
|
|
|
|
|
2018-10-15 08:52:21 -04:00
|
|
|
if (likely(task_in_wq(t))) {
|
2022-06-15 10:48:45 -04:00
|
|
|
locked = t->tid < 0;
|
BUG/MEDIUM: task: relax one thread consistency check in task_unlink_wq()
While testing the fix for the previous issue related to reloads with
hard_stop_after, I've met another one which could spuriously produce:
FATAL: bug condition "t->tid >= 0 && t->tid != tid" matched at include/haproxy/task.h:266
In 2.3-dev2, we've added more consistency checks for a number of bug-
inducing programming errors related to the tasks, via commit e5d79bccc
("MINOR: tasks/debug: add a few BUG_ON() to detect use of wrong timer
queue"), and this check comes from there.
The problem that happens here is that when hard-stop-after is set, we
can abort the current thread even if there are still ongoing checks
(or connections in fact). In this case some tasks are present in a
thread's wait queue and are thus bound exclusively to this thread.
During deinit(), the collect and cleanup of all memory areas also
stops servers and kills their check tasks. And calling task_destroy()
does in turn call task_unlink_wq()... except that it's called from
thread 0 which doesn't match the initially planned thread number.
Several approaches are possible. One of them would consist in letting
threads perform their own cleanup (tasks, pools, FDs, etc). This would
possibly be even faster since done in parallel, but some corner cases
might be way more complicated (e.g. who will kill a check's task, or
what to do with a task found in a local wait queue or run queue, and
what about other consistency checks this could violate?).
Thus for now this patches takes an easier and more conservative
approach consisting in admitting that when the process is stopping,
this rule is not necessarily valid, and to let thread 0 collect all
other threads' garbage.
As such this patch can be backpoted to 2.4.
2022-08-10 12:03:11 -04:00
|
|
|
BUG_ON(t->tid >= 0 && t->tid != tid && !(global.mode & MODE_STOPPING));
|
BUG/MAJOR: fd/threads, task/threads: ensure all spin locks are unlocked
Calculate if the fd or task should be locked once, before locking, and
reuse the calculation when determing when to unlock.
Fixes a race condition added in 87d54a9a for fds, and b20aa9ee for tasks,
released in 1.9-dev4. When one thread modifies thread_mask to be a single
thread for a task or fd while a second thread has locked or is waiting on a
lock for that task or fd, the second thread will not unlock it. For FDs,
this is observable when a listener is polled by multiple threads, and is
closed while those threads have events pending. For tasks, this seems
possible, where task_set_affinity is called, but I did not observe it.
This must be backported to 1.9.
2019-02-20 15:43:45 -05:00
|
|
|
if (locked)
|
BUG/MAJOR: sched: protect task during removal from wait queue
The issue addressed by commit fbb934da9 ("BUG/MEDIUM: stick-table: fix
a race condition when updating the expiration task") is still present
when thread groups are enabled, but this time it lies in the scheduler.
What happens is that a task configured to run anywhere might already
have been queued into one group's wait queue. When updating a stick
table entry, sometimes the task will have to be dequeued and requeued.
For this a lock is taken on the current thread group's wait queue lock,
but while this is necessary for the queuing, it's not sufficient for
dequeuing since another thread might be in the process of expiring this
task under its own group's lock which is different. This is easy to test
using 3 stick tables with 1ms expiration, 3 track-sc rules and 4 thread
groups. The process crashes almost instantly under heavy traffic.
One approach could consist in storing the group number the task was
queued under in its descriptor (we don't need 32 bits to store the
thread id, it's possible to use one short for the tid and another
one for the tgrp). Sadly, no safe way to do this was figured, because
the race remains at the moment the thread group number is checked, as
it might be in the process of being changed by another thread. It seems
that a working approach could consist in always having it associated
with one group, and only allowing to change it under this group's lock,
so that any code trying to change it would have to iterately read it
and lock its group until the value matches, confirming it really holds
the correct lock. But this seems a bit complicated, particularly with
wait_expired_tasks() which already uses upgradable locks to switch from
read state to a write state.
Given that the shared tasks are not that common (stick-table expirations,
rate-limited listeners, maybe resolvers), it doesn't seem worth the extra
complexity for now. This patch takes a simpler and safer approach
consisting in switching back to a single wq_lock, but still keeping
separate wait queues. Given that shared wait queues are almost always
empty and that otherwise they're scanned under a read lock, the
contention remains manageable and most of the time the lock doesn't
even need to be taken since such tasks are not present in a group's
queue. In essence, this patch reverts half of the aforementionned
patch. This was tested and confirmed to work fine, without observing
any performance degradation under any workload. The performance with
8 groups on an EPYC 74F3 and 3 tables remains twice the one of a
single group, with the contention remaining on the table's lock first.
No backport is needed.
2022-11-22 01:05:44 -05:00
|
|
|
HA_RWLOCK_WRLOCK(TASK_WQ_LOCK, &wq_lock);
|
2009-03-07 11:25:21 -05:00
|
|
|
__task_unlink_wq(t);
|
BUG/MAJOR: fd/threads, task/threads: ensure all spin locks are unlocked
Calculate if the fd or task should be locked once, before locking, and
reuse the calculation when determing when to unlock.
Fixes a race condition added in 87d54a9a for fds, and b20aa9ee for tasks,
released in 1.9-dev4. When one thread modifies thread_mask to be a single
thread for a task or fd while a second thread has locked or is waiting on a
lock for that task or fd, the second thread will not unlock it. For FDs,
this is observable when a listener is polled by multiple threads, and is
closed while those threads have events pending. For tasks, this seems
possible, where task_set_affinity is called, but I did not observe it.
This must be backported to 1.9.
2019-02-20 15:43:45 -05:00
|
|
|
if (locked)
|
BUG/MAJOR: sched: protect task during removal from wait queue
The issue addressed by commit fbb934da9 ("BUG/MEDIUM: stick-table: fix
a race condition when updating the expiration task") is still present
when thread groups are enabled, but this time it lies in the scheduler.
What happens is that a task configured to run anywhere might already
have been queued into one group's wait queue. When updating a stick
table entry, sometimes the task will have to be dequeued and requeued.
For this a lock is taken on the current thread group's wait queue lock,
but while this is necessary for the queuing, it's not sufficient for
dequeuing since another thread might be in the process of expiring this
task under its own group's lock which is different. This is easy to test
using 3 stick tables with 1ms expiration, 3 track-sc rules and 4 thread
groups. The process crashes almost instantly under heavy traffic.
One approach could consist in storing the group number the task was
queued under in its descriptor (we don't need 32 bits to store the
thread id, it's possible to use one short for the tid and another
one for the tgrp). Sadly, no safe way to do this was figured, because
the race remains at the moment the thread group number is checked, as
it might be in the process of being changed by another thread. It seems
that a working approach could consist in always having it associated
with one group, and only allowing to change it under this group's lock,
so that any code trying to change it would have to iterately read it
and lock its group until the value matches, confirming it really holds
the correct lock. But this seems a bit complicated, particularly with
wait_expired_tasks() which already uses upgradable locks to switch from
read state to a write state.
Given that the shared tasks are not that common (stick-table expirations,
rate-limited listeners, maybe resolvers), it doesn't seem worth the extra
complexity for now. This patch takes a simpler and safer approach
consisting in switching back to a single wq_lock, but still keeping
separate wait queues. Given that shared wait queues are almost always
empty and that otherwise they're scanned under a read lock, the
contention remains manageable and most of the time the lock doesn't
even need to be taken since such tasks are not present in a group's
queue. In essence, this patch reverts half of the aforementionned
patch. This was tested and confirmed to work fine, without observing
any performance degradation under any workload. The performance with
8 groups on an EPYC 74F3 and 3 tables remains twice the one of a
single group, with the contention remaining on the table's lock first.
No backport is needed.
2022-11-22 01:05:44 -05:00
|
|
|
HA_RWLOCK_WRUNLOCK(TASK_WQ_LOCK, &wq_lock);
|
2018-10-15 08:52:21 -04:00
|
|
|
}
|
2007-04-29 04:41:56 -04:00
|
|
|
return t;
|
2006-06-25 20:48:02 -04:00
|
|
|
}
|
|
|
|
|
|
2020-06-04 11:25:40 -04:00
|
|
|
/* Place <task> into the wait queue, where it may already be. If the expiration
 * timer is infinite, do nothing and rely on wake_expired_task to clean up.
 * If the task uses a shared wait queue (tid < 0), it's queued into the global
 * wait queue, protected by the global wq_lock; otherwise it necessarily
 * belongs to the current thread's wait queue and is queued without locking.
 */
#define task_queue(t) \
	_task_queue(t, MK_CALLER(WAKEUP_TYPE_TASK_QUEUE, 0, 0))

static inline void _task_queue(struct task *task, const struct ha_caller *caller)
{
	/* If we already have a place in the wait queue no later than the
	 * timeout we're trying to set, we'll stay there, because it is very
	 * unlikely that we will reach the timeout anyway. If the timeout
	 * has been disabled, it's useless to leave the queue as well. We'll
	 * rely on wake_expired_tasks() to catch the node and move it to the
	 * proper place should it ever happen. Finally we only add the task
	 * to the queue if it was not there or if it was further than what
	 * we want.
	 */
	if (!tick_isset(task->expire))
		return;

#ifdef USE_THREAD
	if (task->tid < 0) {
		/* shared task: any thread may (re)queue it, so the group's
		 * timer tree must only be touched under the wait queue lock.
		 */
		HA_RWLOCK_WRLOCK(TASK_WQ_LOCK, &wq_lock);
		if (!task_in_wq(task) || tick_is_lt(task->expire, task->wq.key)) {
			if (likely(caller)) {
				/* record the latest queuing point for debugging */
				caller = HA_ATOMIC_XCHG(&task->caller, caller);
				BUG_ON((ulong)caller & 1);
#ifdef DEBUG_TASK
				HA_ATOMIC_STORE(&task->debug.prev_caller, caller);
#endif
			}
			__task_queue(task, &tg_ctx->timers);
		}
		HA_RWLOCK_WRUNLOCK(TASK_WQ_LOCK, &wq_lock);
	} else
#endif
	{
		/* thread-bound task: only its owner thread may queue it */
		BUG_ON(task->tid != tid);
		if (!task_in_wq(task) || tick_is_lt(task->expire, task->wq.key)) {
			if (likely(caller)) {
				/* record the latest queuing point for debugging */
				caller = HA_ATOMIC_XCHG(&task->caller, caller);
				BUG_ON((ulong)caller & 1);
#ifdef DEBUG_TASK
				HA_ATOMIC_STORE(&task->debug.prev_caller, caller);
#endif
			}
			__task_queue(task, &th_ctx->timers);
		}
	}
}
|
|
|
|
|
|
2022-06-15 11:20:16 -04:00
|
|
|
/* Change the thread affinity of a task to <thr>, which may either be a valid
|
|
|
|
|
* thread number from 0 to nbthread-1, or a negative value to allow the task
|
|
|
|
|
* to run on any thread.
|
|
|
|
|
*
|
2020-06-04 11:25:40 -04:00
|
|
|
* This may only be done from within the running task itself or during its
|
|
|
|
|
* initialization. It will unqueue and requeue the task from the wait queue
|
|
|
|
|
* if it was in it. This is safe against a concurrent task_queue() call because
|
|
|
|
|
* task_queue() itself will unlink again if needed after taking into account
|
|
|
|
|
* the new thread_mask.
|
|
|
|
|
*/
|
2022-06-15 11:20:16 -04:00
|
|
|
static inline void task_set_thread(struct task *t, int thr)
|
2020-06-04 11:25:40 -04:00
|
|
|
{
|
2022-06-15 11:20:16 -04:00
|
|
|
#ifndef USE_THREAD
|
|
|
|
|
/* no shared queue without threads */
|
|
|
|
|
thr = 0;
|
|
|
|
|
#endif
|
2020-06-04 11:25:40 -04:00
|
|
|
if (unlikely(task_in_wq(t))) {
|
|
|
|
|
task_unlink_wq(t);
|
2022-06-15 08:19:48 -04:00
|
|
|
t->tid = thr;
|
2020-06-04 11:25:40 -04:00
|
|
|
task_queue(t);
|
|
|
|
|
}
|
2022-06-15 08:19:48 -04:00
|
|
|
else {
|
|
|
|
|
t->tid = thr;
|
|
|
|
|
}
|
2020-06-04 11:25:40 -04:00
|
|
|
}
|
|
|
|
|
|
2020-07-03 11:13:05 -04:00
|
|
|
/* schedules tasklet <tl> to run onto thread <thr> or the current thread if
 * <thr> is negative. Note that it is illegal to wakeup a foreign tasklet if
 * its tid is negative and it is illegal to self-assign a tasklet that was
 * at least once scheduled on a specific thread. With DEBUG_TASK, the
 * <file>:<line> from the call place are stored into the tasklet for tracing
 * purposes.
 */
#define tasklet_wakeup_on(tl, thr) \
	_tasklet_wakeup_on(tl, thr, MK_CALLER(WAKEUP_TYPE_TASKLET_WAKEUP, 0, 0))

static inline void _tasklet_wakeup_on(struct tasklet *tl, int thr, const struct ha_caller *caller)
{
	unsigned int state = tl->state;

	/* atomically set TASK_IN_LIST: exactly one waker wins the CAS, any
	 * other one sees the flag already set and quits since the tasklet
	 * is already queued (or about to be).
	 */
	do {
		/* do nothing if someone else already added it */
		if (state & TASK_IN_LIST)
			return;
	} while (!_HA_ATOMIC_CAS(&tl->state, &state, state | TASK_IN_LIST));

	/* at this point we're the first ones to add this task to the list */
	if (likely(caller)) {
		/* keep track of the latest wakeup point for debugging; the
		 * previous caller must not have its lowest bit set (would
		 * indicate a corrupted/marked pointer).
		 */
		caller = HA_ATOMIC_XCHG(&tl->caller, caller);
		BUG_ON((ulong)caller & 1);
#ifdef DEBUG_TASK
		HA_ATOMIC_STORE(&tl->debug.prev_caller, caller);
#endif
	}

	/* only pay for the wakeup timestamp when profiling is enabled */
	if (_HA_ATOMIC_LOAD(&th_ctx->flags) & TH_FL_TASK_PROFILING)
		tl->wake_date = now_mono_time();
	__tasklet_wakeup_on(tl, thr);
}
|
2018-05-18 12:45:28 -04:00
|
|
|
|
/* schedules tasklet <tl> to run onto the thread designated by tl->tid, which
 * is either its owner thread if >= 0 or the current thread if < 0. When
 * DEBUG_TASK is set, the <file>:<line> from the call place are stored into the
 * tasklet for tracing purposes.
 */
#define tasklet_wakeup(tl) \
	_tasklet_wakeup_on(tl, (tl)->tid, MK_CALLER(WAKEUP_TYPE_TASKLET_WAKEUP, 0, 0))
|
2021-02-16 12:44:00 -05:00
|
|
|
|
2022-04-22 12:22:03 -04:00
|
|
|
/* instantly wakes up task <t> on its owner thread even if it's not the current
 * one, bypassing the run queue. The purpose is to be able to avoid contention
 * in the global run queue for massively remote tasks (e.g. queue) when there's
 * no value in passing the task again through the priority ordering since it has
 * already been subject to it once (e.g. before entering process_stream). The
 * task goes directly into the shared mt_list as a tasklet and will run as
 * TL_URGENT. Great care is taken to be certain it's not queued nor running
 * already.
 */
#define task_instant_wakeup(t, f) \
	_task_instant_wakeup(t, f, MK_CALLER(WAKEUP_TYPE_TASK_INSTANT_WAKEUP, 0, 0))

static inline void _task_instant_wakeup(struct task *t, unsigned int f, const struct ha_caller *caller)
{
	int thr = t->tid;
	unsigned int state;

	/* a negative tid means the task may run anywhere, so it runs here */
	if (thr < 0)
		thr = tid;

	/* first, let's update the task's state with the wakeup condition */
	state = _HA_ATOMIC_OR_FETCH(&t->state, f);

	/* next we need to make sure the task was not/will not be added to the
	 * run queue because the tasklet list's mt_list uses the same storage
	 * as the task's run_queue.
	 */
	do {
		/* do nothing if someone else already added it */
		if (state & (TASK_QUEUED|TASK_RUNNING))
			return;
	} while (!_HA_ATOMIC_CAS(&t->state, &state, state | TASK_QUEUED));

	BUG_ON_HOT(task_in_rq(t));

	/* at this point we're the first ones to add this task to the list */
	if (likely(caller)) {
		/* swap in the new caller, keeping the previous one for tracing;
		 * an odd pointer would indicate a freed task (see __task_free).
		 */
		caller = HA_ATOMIC_XCHG(&t->caller, caller);
		BUG_ON((ulong)caller & 1);
#ifdef DEBUG_TASK
		HA_ATOMIC_STORE(&t->debug.prev_caller, caller);
#endif
	}

	/* record the wakeup date only when profiling is enabled, to measure
	 * scheduling latency later.
	 */
	if (_HA_ATOMIC_LOAD(&th_ctx->flags) & TH_FL_TASK_PROFILING)
		t->wake_date = now_mono_time();
	__tasklet_wakeup_on((struct tasklet *)t, thr);
}
|
|
|
|
|
|
2022-06-29 04:53:03 -04:00
|
|
|
/* schedules tasklet <tl> to run immediately after the current one is done
|
|
|
|
|
* <tl> will be queued after entry <head>, or at the head of the task list. Return
|
|
|
|
|
* the new head to be used to queue future tasks. This is used to insert multiple entries
|
|
|
|
|
* at the head of the tasklet list, typically to transfer processing from a tasklet
|
|
|
|
|
* to another one or a set of other ones. If <head> is NULL, the tasklet list of <thr>
|
|
|
|
|
* thread will be used.
|
|
|
|
|
* With DEBUG_TASK, the <file>:<line> from the call place are stored into the tasklet
|
|
|
|
|
* for tracing purposes.
|
|
|
|
|
*/
|
2022-09-06 04:25:49 -04:00
|
|
|
#define tasklet_wakeup_after(head, tl) \
|
|
|
|
|
_tasklet_wakeup_after(head, tl, MK_CALLER(WAKEUP_TYPE_TASKLET_WAKEUP_AFTER, 0, 0))
|
|
|
|
|
|
2022-06-29 04:53:03 -04:00
|
|
|
static inline struct list *_tasklet_wakeup_after(struct list *head, struct tasklet *tl,
|
2022-09-06 04:25:49 -04:00
|
|
|
const struct ha_caller *caller)
|
2022-06-29 04:53:03 -04:00
|
|
|
{
|
|
|
|
|
unsigned int state = tl->state;
|
|
|
|
|
|
|
|
|
|
do {
|
|
|
|
|
/* do nothing if someone else already added it */
|
|
|
|
|
if (state & TASK_IN_LIST)
|
|
|
|
|
return head;
|
|
|
|
|
} while (!_HA_ATOMIC_CAS(&tl->state, &state, state | TASK_IN_LIST));
|
|
|
|
|
|
|
|
|
|
/* at this point we're the first one to add this task to the list */
|
2023-11-09 06:03:24 -05:00
|
|
|
if (likely(caller)) {
|
|
|
|
|
caller = HA_ATOMIC_XCHG(&tl->caller, caller);
|
|
|
|
|
BUG_ON((ulong)caller & 1);
|
2022-06-29 04:53:03 -04:00
|
|
|
#ifdef DEBUG_TASK
|
2023-11-09 06:03:24 -05:00
|
|
|
HA_ATOMIC_STORE(&tl->debug.prev_caller, caller);
|
MINOR: task: permanently enable latency measurement on tasklets
When tasklet latency measurement was enabled in 2.4 with commit b2285de04
("MINOR: tasks: also compute the tasklet latency when DEBUG_TASK is set"),
the feature was conditionned on DEBUG_TASK because the field would add 8
bytes to the struct tasklet.
This approach was not a very good idea because the struct ends on an int
anyway thus it does finish with a 32-bit hole regardless of the presence
of this field. What is true however is that adding it turned a 64-byte
struct to 72-byte when caller debugging is enabled.
This patch revisits this with a minor change. Now only the lowest 32
bits of the call date are stored, so they always fit in the remaining
hole, and this allows to remove the dependency on DEBUG_TASK. With
debugging off, we're now seeing a 48-byte struct, and with debugging
on it's exactly 64 bytes, thus still exactly one cache line. 32 bits
allow a latency of 4 seconds on a tasklet, which already indicates a
completely dead process, so there's no point storing the upper bits at
all. And even in the event it would happen once in a while, the lost
upper bits do not really add any value to the debug reports. Also, now
one tasklet wakeup every 4 billion will not be sampled due to the test
on the value itself. Similarly we just don't care, it's statistics and
the measurements are not 9-digit accurate anyway.
2022-09-06 13:17:45 -04:00
|
|
|
#endif
|
2023-11-09 06:03:24 -05:00
|
|
|
}
|
|
|
|
|
|
2022-06-29 04:53:03 -04:00
|
|
|
if (th_ctx->flags & TH_FL_TASK_PROFILING)
|
2022-09-07 08:49:50 -04:00
|
|
|
tl->wake_date = now_mono_time();
|
2022-06-29 04:53:03 -04:00
|
|
|
return __tasklet_wakeup_after(head, tl);
|
|
|
|
|
}
|
|
|
|
|
|
2021-02-16 12:44:00 -05:00
|
|
|
/* This macro shows the current function name and the last known caller of the
 * task (or tasklet) wakeup. With DEBUG_TASK unset it expands to a no-op so
 * call sites need no conditional compilation.
 */
#ifdef DEBUG_TASK
#define DEBUG_TASK_PRINT_CALLER(t) do {				\
	const struct ha_caller *__caller = (t)->caller;		\
	printf("%s woken up from %s(%s:%d)\n", __FUNCTION__,	\
	       __caller ? __caller->func : NULL,		\
	       __caller ? __caller->file : NULL,		\
	       __caller ? __caller->line : 0);			\
} while (0)
#else
#define DEBUG_TASK_PRINT_CALLER(t) do { } while (0)
#endif
|
|
|
|
|
|
2018-05-18 12:45:28 -04:00
|
|
|
|
2020-11-30 08:52:11 -05:00
|
|
|
/* Try to remove a tasklet from the list. This call is inherently racy and may
 * only be performed on the thread that was supposed to dequeue this tasklet.
 * This way it is safe to call MT_LIST_DELETE without first removing the
 * TASK_IN_LIST bit, which must absolutely be removed afterwards in case
 * another thread would want to wake this tasklet up in parallel.
 */
static inline void tasklet_remove_from_tasklet_list(struct tasklet *t)
{
	/* only clear TASK_IN_LIST and drop the rq count if the deletion
	 * actually happened, so the flag always reflects the last state
	 * even against a concurrent tasklet_wakeup_on().
	 */
	if (MT_LIST_DELETE(list_to_mt_list(&t->list))) {
		_HA_ATOMIC_AND(&t->state, ~TASK_IN_LIST);
		_HA_ATOMIC_DEC(&ha_thread_ctx[t->tid >= 0 ? t->tid : tid].rq_total);
	}
}
|
|
|
|
|
|
2008-06-24 02:17:16 -04:00
|
|
|
/*
|
2009-03-21 13:13:21 -04:00
|
|
|
* Initialize a new task. The bare minimum is performed (queue pointers and
|
|
|
|
|
* state). The task is returned. This function should not be used outside of
|
2022-06-15 10:37:33 -04:00
|
|
|
* task_new(). If the thread ID is < 0, the task may run on any thread.
|
2008-06-24 02:17:16 -04:00
|
|
|
*/
|
2022-06-15 10:37:33 -04:00
|
|
|
static inline struct task *task_init(struct task *t, int tid)
|
2008-06-24 02:17:16 -04:00
|
|
|
{
|
2009-03-07 11:25:21 -05:00
|
|
|
t->wq.node.leaf_p = NULL;
|
|
|
|
|
t->rq.node.leaf_p = NULL;
|
2018-05-18 12:38:23 -04:00
|
|
|
t->state = TASK_SLEEPING;
|
2022-06-15 10:37:33 -04:00
|
|
|
#ifndef USE_THREAD
|
|
|
|
|
/* no shared wq without threads */
|
|
|
|
|
tid = 0;
|
|
|
|
|
#endif
|
|
|
|
|
t->tid = tid;
|
2008-06-30 01:51:00 -04:00
|
|
|
t->nice = 0;
|
2009-03-28 12:54:35 -04:00
|
|
|
t->calls = 0;
|
2022-09-07 08:49:50 -04:00
|
|
|
t->wake_date = 0;
|
2017-07-24 11:52:58 -04:00
|
|
|
t->expire = TICK_ETERNITY;
|
2022-09-06 05:11:47 -04:00
|
|
|
t->caller = NULL;
|
2006-06-25 20:48:02 -04:00
|
|
|
return t;
|
|
|
|
|
}
|
|
|
|
|
|
2021-03-02 09:54:11 -05:00
|
|
|
/* Initialize a new tasklet. It's identified as a tasklet by its flags
|
|
|
|
|
* TASK_F_TASKLET. It is expected to run on the calling thread by default,
|
|
|
|
|
* it's up to the caller to change ->tid if it wants to own it.
|
2019-10-18 00:43:53 -04:00
|
|
|
*/
|
2018-05-18 12:45:28 -04:00
|
|
|
static inline void tasklet_init(struct tasklet *t)
|
|
|
|
|
{
|
|
|
|
|
t->calls = 0;
|
2021-03-02 09:54:11 -05:00
|
|
|
t->state = TASK_F_TASKLET;
|
2018-07-19 10:02:16 -04:00
|
|
|
t->process = NULL;
|
2019-10-18 00:43:53 -04:00
|
|
|
t->tid = -1;
|
2022-09-07 08:49:50 -04:00
|
|
|
t->wake_date = 0;
|
2022-09-06 05:11:47 -04:00
|
|
|
t->caller = NULL;
|
2019-10-11 10:35:01 -04:00
|
|
|
LIST_INIT(&t->list);
|
2018-05-18 12:45:28 -04:00
|
|
|
}
|
|
|
|
|
|
2019-10-18 00:43:53 -04:00
|
|
|
/* Allocate and initialize a new tasklet, local to the thread by default. The
|
2020-03-10 03:06:11 -04:00
|
|
|
* caller may assign its tid if it wants to own the tasklet.
|
2019-10-18 00:43:53 -04:00
|
|
|
*/
|
2018-05-18 12:45:28 -04:00
|
|
|
static inline struct tasklet *tasklet_new(void)
|
|
|
|
|
{
|
|
|
|
|
struct tasklet *t = pool_alloc(pool_head_tasklet);
|
|
|
|
|
|
|
|
|
|
if (t) {
|
|
|
|
|
tasklet_init(t);
|
|
|
|
|
}
|
|
|
|
|
return t;
|
|
|
|
|
}
|
|
|
|
|
|
2006-06-25 20:48:02 -04:00
|
|
|
/*
|
2022-06-15 10:37:33 -04:00
|
|
|
* Allocate and initialize a new task, to run on global thread <thr>, or any
|
|
|
|
|
* thread if negative. The task count is incremented. The new task is returned,
|
|
|
|
|
* or NULL in case of lack of memory. It's up to the caller to pass a valid
|
|
|
|
|
* thread number (in tid space, 0 to nbthread-1, or <0 for any). Tasks created
|
|
|
|
|
* this way must be freed using task_destroy().
|
2009-03-21 13:13:21 -04:00
|
|
|
*/
|
2022-06-15 10:37:33 -04:00
|
|
|
static inline struct task *task_new_on(int thr)
|
2009-03-21 13:13:21 -04:00
|
|
|
{
|
2017-11-24 11:34:44 -05:00
|
|
|
struct task *t = pool_alloc(pool_head_task);
|
2009-03-21 13:13:21 -04:00
|
|
|
if (t) {
|
2021-10-01 05:30:33 -04:00
|
|
|
th_ctx->nb_tasks++;
|
2022-06-15 10:37:33 -04:00
|
|
|
task_init(t, thr);
|
2009-03-21 13:13:21 -04:00
|
|
|
}
|
|
|
|
|
return t;
|
|
|
|
|
}
|
|
|
|
|
|
2021-10-01 12:23:30 -04:00
|
|
|
/* Allocate and initialize a new task, to run on the calling thread. The new
|
|
|
|
|
* task is returned, or NULL in case of lack of memory. The task count is
|
|
|
|
|
* incremented.
|
|
|
|
|
*/
|
|
|
|
|
static inline struct task *task_new_here()
|
|
|
|
|
{
|
2022-06-15 10:37:33 -04:00
|
|
|
return task_new_on(tid);
|
2021-10-01 12:23:30 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Allocate and initialize a new task, to run on any thread. The new task is
 * returned, or NULL in case of lack of memory. The task count is incremented.
 */
static inline struct task *task_new_anywhere()
{
	/* a negative thread id means "no owner thread yet" */
	return task_new_on(-1);
}
|
|
|
|
|
|
2009-03-21 13:13:21 -04:00
|
|
|
/*
 * Free a task. Its context must have been freed since it will be lost. The
 * task count is decremented. If it is the current task, this one is reset.
 */
static inline void __task_free(struct task *t)
{
	if (t == th_ctx->current) {
		th_ctx->current = NULL;
		/* make sure the NULL is visible before the task is released */
		__ha_barrier_store();
	}
	/* freeing a task still queued or running is a bug */
	BUG_ON(task_in_wq(t) || task_in_rq(t));

	BUG_ON((ulong)t->caller & 1);
#ifdef DEBUG_TASK
	HA_ATOMIC_STORE(&t->debug.prev_caller, HA_ATOMIC_LOAD(&t->caller));
#endif
	HA_ATOMIC_STORE(&t->caller, (void*)1); // make sure to crash if used after free

	pool_free(pool_head_task, t);
	th_ctx->nb_tasks--;
	/* when shutting down, return memory to the system eagerly */
	if (unlikely(stopping))
		pool_flush(pool_head_task);
}
|
|
|
|
|
|
2019-05-07 13:05:35 -04:00
|
|
|
/* Destroys a task : it's unlinked from the wait queues and is freed if it's
 * the current task or not queued otherwise it's marked to be freed by the
 * scheduler. It does nothing if <t> is NULL.
 */
static inline void task_destroy(struct task *t)
{
	if (!t)
		return;

	task_unlink_wq(t);
	/* We don't have to explicitly remove from the run queue.
	 * If we are in the runqueue, the test below will set t->process
	 * to NULL, and the task will be free'd when it'll be its turn
	 * to run.
	 */

	/* There's no need to protect t->state with a lock, as the task
	 * has to run on the current thread.
	 */
	if (t == th_ctx->current || !(t->state & (TASK_QUEUED | TASK_RUNNING)))
		__task_free(t);
	else
		/* a NULL process function tells the scheduler to free it */
		t->process = NULL;
}
|
|
|
|
|
|
2019-10-11 10:35:01 -04:00
|
|
|
/* Should only be called by the thread responsible for the tasklet */
|
2018-05-18 12:45:28 -04:00
|
|
|
static inline void tasklet_free(struct tasklet *tl)
|
|
|
|
|
{
|
2023-04-22 11:47:31 -04:00
|
|
|
if (!tl)
|
|
|
|
|
return;
|
|
|
|
|
|
2022-01-28 03:48:12 -05:00
|
|
|
if (MT_LIST_DELETE(list_to_mt_list(&tl->list)))
|
2021-10-01 05:30:33 -04:00
|
|
|
_HA_ATOMIC_DEC(&ha_thread_ctx[tl->tid >= 0 ? tl->tid : tid].rq_total);
|
2018-06-08 11:08:19 -04:00
|
|
|
|
2022-09-06 05:11:47 -04:00
|
|
|
BUG_ON((ulong)tl->caller & 1);
|
2021-02-18 08:38:49 -05:00
|
|
|
#ifdef DEBUG_TASK
|
2022-09-06 05:11:47 -04:00
|
|
|
HA_ATOMIC_STORE(&tl->debug.prev_caller, HA_ATOMIC_LOAD(&tl->caller));
|
2021-02-18 08:38:49 -05:00
|
|
|
#endif
|
2022-09-06 05:11:47 -04:00
|
|
|
HA_ATOMIC_STORE(&tl->caller, (void*)1); // make sure to crash if used after free
|
2018-05-18 12:45:28 -04:00
|
|
|
pool_free(pool_head_tasklet, tl);
|
|
|
|
|
if (unlikely(stopping))
|
|
|
|
|
pool_flush(pool_head_tasklet);
|
|
|
|
|
}
|
|
|
|
|
|
2019-09-20 11:18:35 -04:00
|
|
|
static inline void tasklet_set_tid(struct tasklet *tl, int tid)
|
|
|
|
|
{
|
|
|
|
|
tl->tid = tid;
|
|
|
|
|
}
|
|
|
|
|
|
2011-07-25 08:30:42 -04:00
|
|
|
/* Ensure <task> will be woken up at most at <when>. If the task is already in
 * the run queue (but not running), nothing is done. It may be used that way
 * with a delay : task_schedule(task, tick_add(now_ms, delay));
 * It MUST NOT be used with a timer in the past, and even less with
 * TICK_ETERNITY (which would block all timers). Note that passing it directly
 * now_ms without using tick_add() will definitely make this happen once every
 * 49.7 days.
 */
#define task_schedule(t, w) \
	_task_schedule(t, w, MK_CALLER(WAKEUP_TYPE_TASK_SCHEDULE, 0, 0))

static inline void _task_schedule(struct task *task, int when, const struct ha_caller *caller)
{
	/* TODO: mthread, check if there is no risk with this test */
	if (task_in_rq(task))
		return;

#ifdef USE_THREAD
	if (task->tid < 0) {
		/* shared task: it may sit in any group's wait queue, so the
		 * single wq_lock must be held for both the check and the move.
		 */
		/* FIXME: is it really needed to lock the WQ during the check ? */
		HA_RWLOCK_WRLOCK(TASK_WQ_LOCK, &wq_lock);
		if (task_in_wq(task))
			when = tick_first(when, task->expire);

		task->expire = when;
		/* requeue only if not queued yet or if the new date is earlier */
		if (!task_in_wq(task) || tick_is_lt(task->expire, task->wq.key)) {
			if (likely(caller)) {
				caller = HA_ATOMIC_XCHG(&task->caller, caller);
				BUG_ON((ulong)caller & 1);
#ifdef DEBUG_TASK
				HA_ATOMIC_STORE(&task->debug.prev_caller, caller);
#endif
			}
			__task_queue(task, &tg_ctx->timers);
		}
		HA_RWLOCK_WRUNLOCK(TASK_WQ_LOCK, &wq_lock);
	} else
#endif
	{
		/* thread-local task: no lock needed, but only its owner thread
		 * may reschedule it.
		 */
		BUG_ON(task->tid != tid);
		if (task_in_wq(task))
			when = tick_first(when, task->expire);

		task->expire = when;
		/* requeue only if not queued yet or if the new date is earlier */
		if (!task_in_wq(task) || tick_is_lt(task->expire, task->wq.key)) {
			if (likely(caller)) {
				caller = HA_ATOMIC_XCHG(&task->caller, caller);
				BUG_ON((ulong)caller & 1);
#ifdef DEBUG_TASK
				HA_ATOMIC_STORE(&task->debug.prev_caller, caller);
#endif
			}
			__task_queue(task, &th_ctx->timers);
		}
	}
}
|
|
|
|
|
|
2022-09-06 09:01:55 -04:00
|
|
|
/* returns the string corresponding to a task type as found in the task caller
|
|
|
|
|
* locations.
|
|
|
|
|
*/
|
|
|
|
|
static inline const char *task_wakeup_type_str(uint t)
|
|
|
|
|
{
|
|
|
|
|
switch (t) {
|
|
|
|
|
case WAKEUP_TYPE_TASK_WAKEUP : return "task_wakeup";
|
|
|
|
|
case WAKEUP_TYPE_TASK_INSTANT_WAKEUP : return "task_instant_wakeup";
|
|
|
|
|
case WAKEUP_TYPE_TASKLET_WAKEUP : return "tasklet_wakeup";
|
|
|
|
|
case WAKEUP_TYPE_TASKLET_WAKEUP_AFTER : return "tasklet_wakeup_after";
|
2023-11-09 06:05:08 -05:00
|
|
|
case WAKEUP_TYPE_TASK_QUEUE : return "task_queue";
|
|
|
|
|
case WAKEUP_TYPE_TASK_SCHEDULE : return "task_schedule";
|
2022-09-06 09:01:55 -04:00
|
|
|
case WAKEUP_TYPE_APPCTX_WAKEUP : return "appctx_wakeup";
|
|
|
|
|
default : return "?";
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2017-07-12 08:31:10 -04:00
|
|
|
/* This function register a new signal. "lua" is the current lua
|
|
|
|
|
* execution context. It contains a pointer to the associated task.
|
|
|
|
|
* "link" is a list head attached to an other task that must be wake
|
|
|
|
|
* the lua task if an event occurs. This is useful with external
|
2020-03-10 03:06:11 -04:00
|
|
|
* events like TCP I/O or sleep functions. This function allocate
|
2017-07-12 08:31:10 -04:00
|
|
|
* memory for the signal.
|
|
|
|
|
*/
|
|
|
|
|
static inline struct notification *notification_new(struct list *purge, struct list *event, struct task *wakeup)
|
|
|
|
|
{
|
2017-11-24 11:34:44 -05:00
|
|
|
struct notification *com = pool_alloc(pool_head_notification);
|
2017-07-12 08:31:10 -04:00
|
|
|
if (!com)
|
|
|
|
|
return NULL;
|
2021-04-21 01:32:39 -04:00
|
|
|
LIST_APPEND(purge, &com->purge_me);
|
|
|
|
|
LIST_APPEND(event, &com->wake_me);
|
2017-11-07 04:42:54 -05:00
|
|
|
HA_SPIN_INIT(&com->lock);
|
2017-07-12 08:31:10 -04:00
|
|
|
com->task = wakeup;
|
|
|
|
|
return com;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Purges all the pending signals when the LUA execution is finished. This
 * prevents a coprocess from trying to wake a deleted task, and removes the
 * memory associated with each signal. The purge list is not locked because
 * it is owned by only one process; before browsing this list, the caller
 * must ensure to be the only one browser.
 */
static inline void notification_purge(struct list *purge)
{
	struct notification *com, *back;

	/* Delete all pending communication signals. */
	list_for_each_entry_safe(com, back, purge, purge_me) {
		HA_SPIN_LOCK(NOTIF_LOCK, &com->lock);
		LIST_DELETE(&com->purge_me);
		if (!com->task) {
			/* the wake side already detached (task cleared), so we
			 * hold the last reference and must free the node here.
			 */
			HA_SPIN_UNLOCK(NOTIF_LOCK, &com->lock);
			pool_free(pool_head_notification, com);
			continue;
		}
		/* detach our side; the wake side will free the node when it
		 * sees task == NULL.
		 */
		com->task = NULL;
		HA_SPIN_UNLOCK(NOTIF_LOCK, &com->lock);
	}
}
|
|
|
|
|
|
2017-12-10 11:10:57 -05:00
|
|
|
/* In some cases, the disconnected notifications must be cleared.
|
2020-03-10 03:06:11 -04:00
|
|
|
* This function just release memory blocks. The purge list is not
|
2017-12-10 11:10:57 -05:00
|
|
|
* locked because it is owned by only one process. Before browsing
|
|
|
|
|
* this list, the caller must ensure to be the only one browser.
|
|
|
|
|
* The "com" is not locked because when com->task is NULL, the
|
|
|
|
|
* notification is no longer used.
|
|
|
|
|
*/
|
|
|
|
|
static inline void notification_gc(struct list *purge)
|
|
|
|
|
{
|
|
|
|
|
struct notification *com, *back;
|
|
|
|
|
|
|
|
|
|
/* Delete all pending communication signals. */
|
|
|
|
|
list_for_each_entry_safe (com, back, purge, purge_me) {
|
|
|
|
|
if (com->task)
|
|
|
|
|
continue;
|
2021-04-21 01:32:39 -04:00
|
|
|
LIST_DELETE(&com->purge_me);
|
2017-12-10 11:10:57 -05:00
|
|
|
pool_free(pool_head_notification, com);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2017-07-12 08:31:10 -04:00
|
|
|
/* Sends the signals: wakes all the tasks attached to the <wake> list head,
 * removes each signal and frees the memory it used. The wake list is not
 * locked because it is owned by only one process; before browsing this list,
 * the caller must ensure to be the only one browser.
 */
static inline void notification_wake(struct list *wake)
{
	struct notification *com, *back;

	/* Wake task and delete all pending communication signals. */
	list_for_each_entry_safe(com, back, wake, wake_me) {
		HA_SPIN_LOCK(NOTIF_LOCK, &com->lock);
		LIST_DELETE(&com->wake_me);
		if (!com->task) {
			/* the purge side already detached (task cleared), so
			 * we hold the last reference and must free the node.
			 */
			HA_SPIN_UNLOCK(NOTIF_LOCK, &com->lock);
			pool_free(pool_head_notification, com);
			continue;
		}
		task_wakeup(com->task, TASK_WOKEN_MSG);
		/* detach our side; the purge side will free the node when it
		 * sees task == NULL.
		 */
		com->task = NULL;
		HA_SPIN_UNLOCK(NOTIF_LOCK, &com->lock);
	}
}
|
|
|
|
|
|
2018-05-30 05:40:08 -04:00
|
|
|
/* Reports whether at least one notification is still registered on the
 * <wake> list, i.e. some signals are pending. Returns non-zero if so.
 */
static inline int notification_registered(struct list *wake)
{
	return LIST_ISEMPTY(wake) ? 0 : 1;
}
|
|
|
|
|
|
2020-06-04 11:25:40 -04:00
|
|
|
#endif /* _HAPROXY_TASK_H */
|
2006-06-25 20:48:02 -04:00
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Local variables:
|
|
|
|
|
* c-indent-level: 8
|
|
|
|
|
* c-basic-offset: 8
|
|
|
|
|
* End:
|
|
|
|
|
*/
|