2006-06-25 20:48:02 -04:00
|
|
|
/*
|
|
|
|
|
* File descriptors management functions.
|
|
|
|
|
*
|
MAJOR: polling: rework the whole polling system
This commit heavily changes the polling system in order to definitely
fix the frequent breakage of SSL which needs to remember the last
EAGAIN before deciding whether to poll or not. Now we have a state per
direction for each FD, as opposed to a previous and current state
previously. An FD can have up to 8 different states for each direction,
each of which being the result of a 3-bit combination. These 3 bits
indicate a wish to access the FD, the readiness of the FD and the
subscription of the FD to the polling system.
This means that it will now be possible to remember the state of a
file descriptor across disable/enable sequences that generally happen
during forwarding, where enabling reading on a previously disabled FD
would result in forgetting the EAGAIN flag it met last time.
Several new state manipulation functions have been introduced or
adapted :
- fd_want_{recv,send} : enable receiving/sending on the FD regardless
of its state (sets the ACTIVE flag) ;
- fd_stop_{recv,send} : stop receiving/sending on the FD regardless
of its state (clears the ACTIVE flag) ;
- fd_cant_{recv,send} : report a failure to receive/send on the FD
corresponding to EAGAIN (clears the READY flag) ;
- fd_may_{recv,send} : report the ability to receive/send on the FD
as reported by poll() (sets the READY flag) ;
Some functions are used to report the current FD status :
- fd_{recv,send}_active
- fd_{recv,send}_ready
- fd_{recv,send}_polled
Some functions were removed :
- fd_ev_clr(), fd_ev_set(), fd_ev_rem(), fd_ev_wai()
The POLLHUP/POLLERR flags are now reported as ready so that the I/O layers
know they can try to access the file descriptor to get this information.
In order to simplify the conditions to add/remove cache entries, a new
function fd_alloc_or_release_cache_entry() was created to be used from
pollers while scanning for updates.
The following pollers have been updated :
ev_select() : done, built, tested on Linux 3.10
ev_poll() : done, built, tested on Linux 3.10
ev_epoll() : done, built, tested on Linux 3.10 & 3.13
ev_kqueue() : done, built, tested on OpenBSD 5.2
2014-01-10 10:58:45 -05:00
|
|
|
* Copyright 2000-2014 Willy Tarreau <w@1wt.eu>
|
2006-06-25 20:48:02 -04:00
|
|
|
*
|
|
|
|
|
* This program is free software; you can redistribute it and/or
|
|
|
|
|
* modify it under the terms of the GNU General Public License
|
|
|
|
|
* as published by the Free Software Foundation; either version
|
|
|
|
|
* 2 of the License, or (at your option) any later version.
|
|
|
|
|
*
|
2012-11-11 09:02:54 -05:00
|
|
|
* There is no direct link between the FD and the updates list. There is only a
|
|
|
|
|
* bit in the fdtab[] to indicate that a file descriptor is already present in
|
|
|
|
|
* the updates list. Once an fd is present in the updates list, it will have to
|
|
|
|
|
* be considered even if its changes are reverted in the middle or if the fd is
|
|
|
|
|
* replaced.
|
|
|
|
|
*
|
MAJOR: polling: rework the whole polling system
This commit heavily changes the polling system in order to definitely
fix the frequent breakage of SSL which needs to remember the last
EAGAIN before deciding whether to poll or not. Now we have a state per
direction for each FD, as opposed to a previous and current state
previously. An FD can have up to 8 different states for each direction,
each of which being the result of a 3-bit combination. These 3 bits
indicate a wish to access the FD, the readiness of the FD and the
subscription of the FD to the polling system.
This means that it will now be possible to remember the state of a
file descriptor across disable/enable sequences that generally happen
during forwarding, where enabling reading on a previously disabled FD
would result in forgetting the EAGAIN flag it met last time.
Several new state manipulation functions have been introduced or
adapted :
- fd_want_{recv,send} : enable receiving/sending on the FD regardless
of its state (sets the ACTIVE flag) ;
- fd_stop_{recv,send} : stop receiving/sending on the FD regardless
of its state (clears the ACTIVE flag) ;
- fd_cant_{recv,send} : report a failure to receive/send on the FD
corresponding to EAGAIN (clears the READY flag) ;
- fd_may_{recv,send} : report the ability to receive/send on the FD
as reported by poll() (sets the READY flag) ;
Some functions are used to report the current FD status :
- fd_{recv,send}_active
- fd_{recv,send}_ready
- fd_{recv,send}_polled
Some functions were removed :
- fd_ev_clr(), fd_ev_set(), fd_ev_rem(), fd_ev_wai()
The POLLHUP/POLLERR flags are now reported as ready so that the I/O layers
know they can try to access the file descriptor to get this information.
In order to simplify the conditions to add/remove cache entries, a new
function fd_alloc_or_release_cache_entry() was created to be used from
pollers while scanning for updates.
The following pollers have been updated :
ev_select() : done, built, tested on Linux 3.10
ev_poll() : done, built, tested on Linux 3.10
ev_epoll() : done, built, tested on Linux 3.10 & 3.13
ev_kqueue() : done, built, tested on OpenBSD 5.2
2014-01-10 10:58:45 -05:00
|
|
|
* The event state for an FD, as found in fdtab[].state, is maintained for each
|
|
|
|
|
* direction. The state field is built this way, with R bits in the low nibble
|
|
|
|
|
* and W bits in the high nibble for ease of access and debugging :
|
2012-11-11 09:02:54 -05:00
|
|
|
*
|
MAJOR: polling: rework the whole polling system
This commit heavily changes the polling system in order to definitely
fix the frequent breakage of SSL which needs to remember the last
EAGAIN before deciding whether to poll or not. Now we have a state per
direction for each FD, as opposed to a previous and current state
previously. An FD can have up to 8 different states for each direction,
each of which being the result of a 3-bit combination. These 3 bits
indicate a wish to access the FD, the readiness of the FD and the
subscription of the FD to the polling system.
This means that it will now be possible to remember the state of a
file descriptor across disable/enable sequences that generally happen
during forwarding, where enabling reading on a previously disabled FD
would result in forgetting the EAGAIN flag it met last time.
Several new state manipulation functions have been introduced or
adapted :
- fd_want_{recv,send} : enable receiving/sending on the FD regardless
of its state (sets the ACTIVE flag) ;
- fd_stop_{recv,send} : stop receiving/sending on the FD regardless
of its state (clears the ACTIVE flag) ;
- fd_cant_{recv,send} : report a failure to receive/send on the FD
corresponding to EAGAIN (clears the READY flag) ;
- fd_may_{recv,send} : report the ability to receive/send on the FD
as reported by poll() (sets the READY flag) ;
Some functions are used to report the current FD status :
- fd_{recv,send}_active
- fd_{recv,send}_ready
- fd_{recv,send}_polled
Some functions were removed :
- fd_ev_clr(), fd_ev_set(), fd_ev_rem(), fd_ev_wai()
The POLLHUP/POLLERR flags are now reported as ready so that the I/O layers
know they can try to access the file descriptor to get this information.
In order to simplify the conditions to add/remove cache entries, a new
function fd_alloc_or_release_cache_entry() was created to be used from
pollers while scanning for updates.
The following pollers have been updated :
ev_select() : done, built, tested on Linux 3.10
ev_poll() : done, built, tested on Linux 3.10
ev_epoll() : done, built, tested on Linux 3.10 & 3.13
ev_kqueue() : done, built, tested on OpenBSD 5.2
2014-01-10 10:58:45 -05:00
|
|
|
* 7 6 5 4 3 2 1 0
|
2019-09-04 03:52:57 -04:00
|
|
|
* [ 0 | 0 | RW | AW | 0 | 0 | RR | AR ]
|
2012-11-11 09:02:54 -05:00
|
|
|
*
|
MAJOR: polling: rework the whole polling system
This commit heavily changes the polling system in order to definitely
fix the frequent breakage of SSL which needs to remember the last
EAGAIN before deciding whether to poll or not. Now we have a state per
direction for each FD, as opposed to a previous and current state
previously. An FD can have up to 8 different states for each direction,
each of which being the result of a 3-bit combination. These 3 bits
indicate a wish to access the FD, the readiness of the FD and the
subscription of the FD to the polling system.
This means that it will now be possible to remember the state of a
file descriptor across disable/enable sequences that generally happen
during forwarding, where enabling reading on a previously disabled FD
would result in forgetting the EAGAIN flag it met last time.
Several new state manipulation functions have been introduced or
adapted :
- fd_want_{recv,send} : enable receiving/sending on the FD regardless
of its state (sets the ACTIVE flag) ;
- fd_stop_{recv,send} : stop receiving/sending on the FD regardless
of its state (clears the ACTIVE flag) ;
- fd_cant_{recv,send} : report a failure to receive/send on the FD
corresponding to EAGAIN (clears the READY flag) ;
- fd_may_{recv,send} : report the ability to receive/send on the FD
as reported by poll() (sets the READY flag) ;
Some functions are used to report the current FD status :
- fd_{recv,send}_active
- fd_{recv,send}_ready
- fd_{recv,send}_polled
Some functions were removed :
- fd_ev_clr(), fd_ev_set(), fd_ev_rem(), fd_ev_wai()
The POLLHUP/POLLERR flags are now reported as ready so that the I/O layers
know they can try to access the file descriptor to get this information.
In order to simplify the conditions to add/remove cache entries, a new
function fd_alloc_or_release_cache_entry() was created to be used from
pollers while scanning for updates.
The following pollers have been updated :
ev_select() : done, built, tested on Linux 3.10
ev_poll() : done, built, tested on Linux 3.10
ev_epoll() : done, built, tested on Linux 3.10 & 3.13
ev_kqueue() : done, built, tested on OpenBSD 5.2
2014-01-10 10:58:45 -05:00
|
|
|
* A* = active *R = read
|
2019-09-04 03:52:57 -04:00
|
|
|
* R* = ready *W = write
|
MAJOR: polling: rework the whole polling system
This commit heavily changes the polling system in order to definitely
fix the frequent breakage of SSL which needs to remember the last
EAGAIN before deciding whether to poll or not. Now we have a state per
direction for each FD, as opposed to a previous and current state
previously. An FD can have up to 8 different states for each direction,
each of which being the result of a 3-bit combination. These 3 bits
indicate a wish to access the FD, the readiness of the FD and the
subscription of the FD to the polling system.
This means that it will now be possible to remember the state of a
file descriptor across disable/enable sequences that generally happen
during forwarding, where enabling reading on a previously disabled FD
would result in forgetting the EAGAIN flag it met last time.
Several new state manipulation functions have been introduced or
adapted :
- fd_want_{recv,send} : enable receiving/sending on the FD regardless
of its state (sets the ACTIVE flag) ;
- fd_stop_{recv,send} : stop receiving/sending on the FD regardless
of its state (clears the ACTIVE flag) ;
- fd_cant_{recv,send} : report a failure to receive/send on the FD
corresponding to EAGAIN (clears the READY flag) ;
- fd_may_{recv,send} : report the ability to receive/send on the FD
as reported by poll() (sets the READY flag) ;
Some functions are used to report the current FD status :
- fd_{recv,send}_active
- fd_{recv,send}_ready
- fd_{recv,send}_polled
Some functions were removed :
- fd_ev_clr(), fd_ev_set(), fd_ev_rem(), fd_ev_wai()
The POLLHUP/POLLERR flags are now reported as ready so that the I/O layers
know they can try to access the file descriptor to get this information.
In order to simplify the conditions to add/remove cache entries, a new
function fd_alloc_or_release_cache_entry() was created to be used from
pollers while scanning for updates.
The following pollers have been updated :
ev_select() : done, built, tested on Linux 3.10
ev_poll() : done, built, tested on Linux 3.10
ev_epoll() : done, built, tested on Linux 3.10 & 3.13
ev_kqueue() : done, built, tested on OpenBSD 5.2
2014-01-10 10:58:45 -05:00
|
|
|
*
|
|
|
|
|
* An FD is marked "active" when there is a desire to use it.
|
|
|
|
|
* An FD is marked "ready" when it has not faced a new EAGAIN since last wake-up
|
2019-09-04 03:52:57 -04:00
|
|
|
* (it is a cache of the last EAGAIN regardless of polling changes). Each poller
|
|
|
|
|
* has its own "polled" state for the same fd, as stored in the polled_mask.
|
MAJOR: polling: rework the whole polling system
This commit heavily changes the polling system in order to definitely
fix the frequent breakage of SSL which needs to remember the last
EAGAIN before deciding whether to poll or not. Now we have a state per
direction for each FD, as opposed to a previous and current state
previously. An FD can have up to 8 different states for each direction,
each of which being the result of a 3-bit combination. These 3 bits
indicate a wish to access the FD, the readiness of the FD and the
subscription of the FD to the polling system.
This means that it will now be possible to remember the state of a
file descriptor across disable/enable sequences that generally happen
during forwarding, where enabling reading on a previously disabled FD
would result in forgetting the EAGAIN flag it met last time.
Several new state manipulation functions have been introduced or
adapted :
- fd_want_{recv,send} : enable receiving/sending on the FD regardless
of its state (sets the ACTIVE flag) ;
- fd_stop_{recv,send} : stop receiving/sending on the FD regardless
of its state (clears the ACTIVE flag) ;
- fd_cant_{recv,send} : report a failure to receive/send on the FD
corresponding to EAGAIN (clears the READY flag) ;
- fd_may_{recv,send} : report the ability to receive/send on the FD
as reported by poll() (sets the READY flag) ;
Some functions are used to report the current FD status :
- fd_{recv,send}_active
- fd_{recv,send}_ready
- fd_{recv,send}_polled
Some functions were removed :
- fd_ev_clr(), fd_ev_set(), fd_ev_rem(), fd_ev_wai()
The POLLHUP/POLLERR flags are now reported as ready so that the I/O layers
know they can try to access the file descriptor to get this information.
In order to simplify the conditions to add/remove cache entries, a new
function fd_alloc_or_release_cache_entry() was created to be used from
pollers while scanning for updates.
The following pollers have been updated :
ev_select() : done, built, tested on Linux 3.10
ev_poll() : done, built, tested on Linux 3.10
ev_epoll() : done, built, tested on Linux 3.10 & 3.13
ev_kqueue() : done, built, tested on OpenBSD 5.2
2014-01-10 10:58:45 -05:00
|
|
|
*
|
2019-09-04 03:52:57 -04:00
|
|
|
* We have 4 possible states for each direction based on these 2 flags :
|
MAJOR: polling: rework the whole polling system
This commit heavily changes the polling system in order to definitely
fix the frequent breakage of SSL which needs to remember the last
EAGAIN before deciding whether to poll or not. Now we have a state per
direction for each FD, as opposed to a previous and current state
previously. An FD can have up to 8 different states for each direction,
each of which being the result of a 3-bit combination. These 3 bits
indicate a wish to access the FD, the readiness of the FD and the
subscription of the FD to the polling system.
This means that it will now be possible to remember the state of a
file descriptor across disable/enable sequences that generally happen
during forwarding, where enabling reading on a previously disabled FD
would result in forgetting the EAGAIN flag it met last time.
Several new state manipulation functions have been introduced or
adapted :
- fd_want_{recv,send} : enable receiving/sending on the FD regardless
of its state (sets the ACTIVE flag) ;
- fd_stop_{recv,send} : stop receiving/sending on the FD regardless
of its state (clears the ACTIVE flag) ;
- fd_cant_{recv,send} : report a failure to receive/send on the FD
corresponding to EAGAIN (clears the READY flag) ;
- fd_may_{recv,send} : report the ability to receive/send on the FD
as reported by poll() (sets the READY flag) ;
Some functions are used to report the current FD status :
- fd_{recv,send}_active
- fd_{recv,send}_ready
- fd_{recv,send}_polled
Some functions were removed :
- fd_ev_clr(), fd_ev_set(), fd_ev_rem(), fd_ev_wai()
The POLLHUP/POLLERR flags are now reported as ready so that the I/O layers
know they can try to access the file descriptor to get this information.
In order to simplify the conditions to add/remove cache entries, a new
function fd_alloc_or_release_cache_entry() was created to be used from
pollers while scanning for updates.
The following pollers have been updated :
ev_select() : done, built, tested on Linux 3.10
ev_poll() : done, built, tested on Linux 3.10
ev_epoll() : done, built, tested on Linux 3.10 & 3.13
ev_kqueue() : done, built, tested on OpenBSD 5.2
2014-01-10 10:58:45 -05:00
|
|
|
*
|
2019-09-04 03:52:57 -04:00
|
|
|
* +---+---+----------+---------------------------------------------+
|
|
|
|
|
* | R | A | State | Description |
|
|
|
|
|
* +---+---+----------+---------------------------------------------+
|
|
|
|
|
* | 0 | 0 | DISABLED | No activity desired, not ready. |
|
|
|
|
|
* | 0 | 1 | ACTIVE | Activity desired. |
|
|
|
|
|
* | 1 | 0 | STOPPED | End of activity. |
|
|
|
|
|
* | 1 | 1 | READY | Activity desired and reported. |
|
|
|
|
|
* +---+---+----------+---------------------------------------------+
|
MAJOR: polling: rework the whole polling system
This commit heavily changes the polling system in order to definitely
fix the frequent breakage of SSL which needs to remember the last
EAGAIN before deciding whether to poll or not. Now we have a state per
direction for each FD, as opposed to a previous and current state
previously. An FD can have up to 8 different states for each direction,
each of which being the result of a 3-bit combination. These 3 bits
indicate a wish to access the FD, the readiness of the FD and the
subscription of the FD to the polling system.
This means that it will now be possible to remember the state of a
file descriptor across disable/enable sequences that generally happen
during forwarding, where enabling reading on a previously disabled FD
would result in forgetting the EAGAIN flag it met last time.
Several new state manipulation functions have been introduced or
adapted :
- fd_want_{recv,send} : enable receiving/sending on the FD regardless
of its state (sets the ACTIVE flag) ;
- fd_stop_{recv,send} : stop receiving/sending on the FD regardless
of its state (clears the ACTIVE flag) ;
- fd_cant_{recv,send} : report a failure to receive/send on the FD
corresponding to EAGAIN (clears the READY flag) ;
- fd_may_{recv,send} : report the ability to receive/send on the FD
as reported by poll() (sets the READY flag) ;
Some functions are used to report the current FD status :
- fd_{recv,send}_active
- fd_{recv,send}_ready
- fd_{recv,send}_polled
Some functions were removed :
- fd_ev_clr(), fd_ev_set(), fd_ev_rem(), fd_ev_wai()
The POLLHUP/POLLERR flags are now reported as ready so that the I/O layers
know they can try to access the file descriptor to get this information.
In order to simplify the conditions to add/remove cache entries, a new
function fd_alloc_or_release_cache_entry() was created to be used from
pollers while scanning for updates.
The following pollers have been updated :
ev_select() : done, built, tested on Linux 3.10
ev_poll() : done, built, tested on Linux 3.10
ev_epoll() : done, built, tested on Linux 3.10 & 3.13
ev_kqueue() : done, built, tested on OpenBSD 5.2
2014-01-10 10:58:45 -05:00
|
|
|
*
|
|
|
|
|
* The transitions are pretty simple :
|
|
|
|
|
* - fd_want_*() : set flag A
|
|
|
|
|
* - fd_stop_*() : clear flag A
|
|
|
|
|
* - fd_cant_*() : clear flag R (when facing EAGAIN)
|
|
|
|
|
* - fd_may_*() : set flag R (upon return from poll())
|
|
|
|
|
*
|
2019-09-04 03:52:57 -04:00
|
|
|
* Each poller then computes its own polled state :
|
|
|
|
|
* if (A) { if (!R) P := 1 } else { P := 0 }
|
|
|
|
|
*
|
|
|
|
|
* The state transitions look like the diagram below.
|
|
|
|
|
*
|
|
|
|
|
* may +----------+
|
|
|
|
|
* ,----| DISABLED | (READY=0, ACTIVE=0)
|
|
|
|
|
* | +----------+
|
|
|
|
|
* | want | ^
|
|
|
|
|
* | | |
|
|
|
|
|
* | v | stop
|
|
|
|
|
* | +----------+
|
|
|
|
|
* | | ACTIVE | (READY=0, ACTIVE=1)
|
|
|
|
|
* | +----------+
|
|
|
|
|
* | | ^
|
|
|
|
|
* | may | |
|
2021-01-07 23:35:52 -05:00
|
|
|
* | v | EAGAIN (can't)
|
2019-09-04 03:52:57 -04:00
|
|
|
* | +--------+
|
|
|
|
|
* | | READY | (READY=1, ACTIVE=1)
|
|
|
|
|
* | +--------+
|
|
|
|
|
* | stop | ^
|
|
|
|
|
* | | |
|
|
|
|
|
* | v | want
|
|
|
|
|
* | +---------+
|
|
|
|
|
* `--->| STOPPED | (READY=1, ACTIVE=0)
|
|
|
|
|
* +---------+
|
2006-06-25 20:48:02 -04:00
|
|
|
*/
|
|
|
|
|
|
2007-04-09 13:29:56 -04:00
|
|
|
#include <stdio.h>
|
2007-04-08 10:39:58 -04:00
|
|
|
#include <string.h>
|
2006-06-25 20:48:02 -04:00
|
|
|
#include <unistd.h>
|
2018-07-26 11:55:11 -04:00
|
|
|
#include <fcntl.h>
|
2006-06-25 20:48:02 -04:00
|
|
|
#include <sys/types.h>
|
2019-02-21 16:19:17 -05:00
|
|
|
#include <sys/resource.h>
|
2019-08-27 05:08:17 -04:00
|
|
|
#include <sys/uio.h>
|
2006-06-25 20:48:02 -04:00
|
|
|
|
2019-05-22 13:24:06 -04:00
|
|
|
#if defined(USE_POLL)
|
2019-02-21 16:12:47 -05:00
|
|
|
#include <poll.h>
|
|
|
|
|
#endif
|
2024-05-27 12:56:12 -04:00
|
|
|
#include <errno.h>
|
2019-02-21 16:12:47 -05:00
|
|
|
|
2020-05-27 06:58:42 -04:00
|
|
|
#include <haproxy/api.h>
|
2021-10-06 13:54:09 -04:00
|
|
|
#include <haproxy/activity.h>
|
2020-06-18 02:58:47 -04:00
|
|
|
#include <haproxy/cfgparse.h>
|
2020-06-03 13:33:00 -04:00
|
|
|
#include <haproxy/fd.h>
|
2020-06-09 03:07:15 -04:00
|
|
|
#include <haproxy/global.h>
|
2021-05-08 14:35:03 -04:00
|
|
|
#include <haproxy/log.h>
|
2020-06-03 13:20:59 -04:00
|
|
|
#include <haproxy/port_range.h>
|
2021-10-06 13:55:29 -04:00
|
|
|
#include <haproxy/ticks.h>
|
2020-06-18 02:58:47 -04:00
|
|
|
#include <haproxy/tools.h>
|
2006-06-25 20:48:02 -04:00
|
|
|
|
2020-06-09 03:07:15 -04:00
|
|
|
|
2021-04-10 10:58:13 -04:00
|
|
|
struct fdtab *fdtab __read_mostly = NULL; /* array of all the file descriptors */
|
|
|
|
|
struct polled_mask *polled_mask __read_mostly = NULL; /* Array for the polled_mask of each fd */
|
|
|
|
|
struct fdinfo *fdinfo __read_mostly = NULL; /* less-often used infos for file descriptors */
|
2006-06-25 20:48:02 -04:00
|
|
|
int totalconn; /* total # of terminated sessions */
|
|
|
|
|
int actconn; /* # of active sessions */
|
|
|
|
|
|
2021-04-10 10:58:13 -04:00
|
|
|
struct poller pollers[MAX_POLLERS] __read_mostly; /* table of poller descriptors, MAX_POLLERS slots */
|
|
|
|
|
struct poller cur_poller __read_mostly; /* NOTE(review): presumably a copy of the poller currently in use - confirm */
|
2007-04-08 10:39:58 -04:00
|
|
|
int nbpollers = 0; /* NOTE(review): presumably the number of entries used in pollers[] - confirm */
|
2006-06-25 20:48:02 -04:00
|
|
|
|
2022-07-08 05:33:43 -04:00
|
|
|
volatile struct fdlist update_list[MAX_TGROUPS]; // Global update list
|
2018-01-24 12:17:56 -05:00
|
|
|
|
MAJOR: threads/fd: Make fd stuffs thread-safe
Many changes have been made to do so. First, the fd_updt array, where all
pending FDs for polling are stored, is now a thread-local array. Then 3 locks
have been added to protect, respectively, the fdtab array, the fd_cache array
and poll information. In addition, a lock for each entry in the fdtab array has
been added to protect all accesses to a specific FD or its information.
For pollers, according to the poller, the way to manage the concurrency is
different. There is a poller loop on each thread. So the set of monitored FDs
may need to be protected. epoll and kqueue are thread-safe per se, so there are few
things to do to protect these pollers. This is not possible with select and
poll, so there is no sharing between the threads. The poller on each thread is
independent of the others.
Finally, per-thread init/deinit functions are used for each poller and for the FD
part to manage thread-local resources.
Now, you must be careful when an FD is created during HAProxy startup. All
updates to the FD state must be made in the threads' context and never before
their creation. This is mandatory because the fd_updt array is thread-local and
initialized only for threads. Because there is no poller for the main one, this
array remains uninitialized in this context. For this reason, listeners are now
enabled in the run_thread_poll_loop function, just like the worker pipe.
2017-05-29 04:40:41 -04:00
|
|
|
THREAD_LOCAL int *fd_updt = NULL; // FD updates list
|
|
|
|
|
THREAD_LOCAL int fd_nbupdt = 0; // number of updates in the list
|
MINOR: fd: don't scan the full fdtab on all threads
During tests, it's pretty visible that with many threads and a large
number of FDs, the process may take time to be ready. The reason for
this is that the full fdtab array is scanned by each and every thread
at boot in fd_reregister_all() in order to make each thread-local
poller adopt the FDs that are relevant to it. The problem is that
when dealing with 1-2M FDs and 64+ threads, it starts to represent
quite a number of loops, and usually the fdtab array doesn't entirely
fit in the CPU's L3 cache, causing extra memory accesses.
It's particularly visible when issuing debugging commands to the CLI
because usually the first one fails while the CPU is at 100% for half
a second (which also is socat's timeout). A quick test with this:
global
stats socket /tmp/sock1 level admin mode 666
stats timeout 1h
maxconn 2000000
And the following script started in another window:
while ! time socat -t5 - /tmp/sock1 <<< "show version";do date -Ins;done
shows that it takes 1.58s for the socat instance that succeeds on an
Ampere Altra with 80 cores; this requires changing the timeout (which defaults
to half a second), otherwise it returns nothing. In addition it also means
that during reloads, some CPU spikes will be noticed.
Adding a prefetch of the current FD + 16 improves the startup time by 30%
but that's far from being sufficient.
In practice all of this is performed at boot time, a moment at which we
know that extremely few FDs are registered (basically just the listeners),
so FD numbers are usually very low and the rest of the table is scanned
for no benefit. Ideally, knowing upfront how many FDs we have should be
sufficient.
A first approach would consist in counting the entries on a single thread
before registering pollers. It's not necessarily efficient and would take
time anyway.
This patch takes a different approach. It consists in keeping a thread-local
max ("fd_highest") that is updated whenever fd_insert() is called with a
larger number. Of course this is not correct once all threads have started,
but it will remain valid during boot since the same value is used during
startup and is cloned for each thread, and no scheduling happens anywhere
during this period, so that all threads are aware of the highest FD they've
seen registered, even if it had been done in some init code, and this without
having to deal with a shared variable.
Here on the test platform, the script gets its response in 10ms vs 1580
before.
2024-07-15 09:09:10 -04:00
|
|
|
THREAD_LOCAL int fd_highest = -1; // highest FD known by the current thread
|
2018-07-26 11:55:11 -04:00
|
|
|
THREAD_LOCAL int poller_rd_pipe = -1; // Pipe to wake the thread
|
2021-04-10 10:58:13 -04:00
|
|
|
int poller_wr_pipe[MAX_THREADS] __read_mostly; // Pipe to wake the threads
|
MAJOR: threads/fd: Make fd stuffs thread-safe
Many changes have been made to do so. First, the fd_updt array, where all
pending FDs for polling are stored, is now a thread-local array. Then 3 locks
have been added to protect, respectively, the fdtab array, the fd_cache array
and poll information. In addition, a lock for each entry in the fdtab array has
been added to protect all accesses to a specific FD or its information.
For pollers, according to the poller, the way to manage the concurrency is
different. There is a poller loop on each thread. So the set of monitored FDs
may need to be protected. epoll and kqueue are thread-safe per se, so there are few
things to do to protect these pollers. This is not possible with select and
poll, so there is no sharing between the threads. The poller on each thread is
independent of the others.
Finally, per-thread init/deinit functions are used for each poller and for the FD
part to manage thread-local resources.
Now, you must be careful when an FD is created during HAProxy startup. All
updates to the FD state must be made in the threads' context and never before
their creation. This is mandatory because the fd_updt array is thread-local and
initialized only for threads. Because there is no poller for the main one, this
array remains uninitialized in this context. For this reason, listeners are now
enabled in the run_thread_poll_loop function, just like the worker pipe.
2017-05-29 04:40:41 -04:00
|
|
|
|
2019-04-16 12:37:05 -04:00
|
|
|
volatile int ha_used_fds = 0; // Number of FD we're currently using
|
|
|
|
|
|
2018-02-05 11:14:55 -05:00
|
|
|
/* adds fd <fd> to fd list <list> if it was not yet in it */
|
2022-07-06 08:43:51 -04:00
|
|
|
void fd_add_to_fd_list(volatile struct fdlist *list, int fd)
|
2018-02-05 11:14:55 -05:00
|
|
|
{
|
|
|
|
|
int next;
|
|
|
|
|
int new;
|
|
|
|
|
int old;
|
|
|
|
|
int last;
|
|
|
|
|
|
|
|
|
|
redo_next:
|
2023-02-27 08:48:46 -05:00
|
|
|
next = HA_ATOMIC_LOAD(&fdtab[fd].update.next);
|
2018-02-05 11:14:55 -05:00
|
|
|
/* Check that we're not already in the cache, and if not, lock us. */
|
BUG/MEDIUM: fd/threads: fix a concurrency issue between add and rm on the same fd
There's a very hard-to-trigger bug in the FD list code where the
fd_add_to_fd_list() function assumes that if the FD it's trying to add
is already locked, it's in the process of being added. Unfortunately, it
can also be in the process of being removed. It is very hard to trigger
because it requires that one thread is removing the FD while another one
is adding it. First very few FDs run on multiple threads (listeners and
DNS), and second, it does not make sense to add and remove the FD at the
same time.
In practice the DNS code built on the older callback-only model does
perform bursts of fd_want_send() for all resolvers at once when it wants
to send a new query (dns_send_query()). And this is more likely to happen
when here are lots of resolutions in parallel and many resolvers, because
the dns_response_recv() callback can also trigger a series of queries on
all resolvers for each invalid response it receives. This means that it
really is perfectly possible to both stop and start in parallel during
short periods of time there.
This issue was not reported before 2.1, but 2.1 had the FD cache, built
on the exact same code base. It's very possible that the issue caused
exactly the opposite situation, where an event was occasionally lost,
causing a DNS retry that worked, and nobody noticing the problem in the
end. In 2.1 the lost entries are the updates asking for not polling for
writes anymore, and the effect is that the poller contiuously reports
writability on the socket when the issue happens.
This patch fixes bug #416 and must be backported as far as 1.8, and
absolutely requires that previous commit "MINOR: fd/threads: make
_GET_NEXT()/_GET_PREV() use the volatile attribute" is backported as
well otherwise it will make the issue worse.
Special thanks to Julien Pivotto for setting up a reliable reproducer
for this difficult issue.
2019-12-19 12:33:08 -05:00
|
|
|
if (next > -2)
|
2018-02-05 11:14:55 -05:00
|
|
|
goto done;
|
BUG/MEDIUM: fd/threads: fix a concurrency issue between add and rm on the same fd
There's a very hard-to-trigger bug in the FD list code where the
fd_add_to_fd_list() function assumes that if the FD it's trying to add
is already locked, it's in the process of being added. Unfortunately, it
can also be in the process of being removed. It is very hard to trigger
because it requires that one thread is removing the FD while another one
is adding it. First very few FDs run on multiple threads (listeners and
DNS), and second, it does not make sense to add and remove the FD at the
same time.
In practice the DNS code built on the older callback-only model does
perform bursts of fd_want_send() for all resolvers at once when it wants
to send a new query (dns_send_query()). And this is more likely to happen
when here are lots of resolutions in parallel and many resolvers, because
the dns_response_recv() callback can also trigger a series of queries on
all resolvers for each invalid response it receives. This means that it
really is perfectly possible to both stop and start in parallel during
short periods of time there.
This issue was not reported before 2.1, but 2.1 had the FD cache, built
on the exact same code base. It's very possible that the issue caused
exactly the opposite situation, where an event was occasionally lost,
causing a DNS retry that worked, and nobody noticing the problem in the
end. In 2.1 the lost entries are the updates asking for not polling for
writes anymore, and the effect is that the poller contiuously reports
writability on the socket when the issue happens.
This patch fixes bug #416 and must be backported as far as 1.8, and
absolutely requires that previous commit "MINOR: fd/threads: make
_GET_NEXT()/_GET_PREV() use the volatile attribute" is backported as
well otherwise it will make the issue worse.
Special thanks to Julien Pivotto for setting up a reliable reproducer
for this difficult issue.
2019-12-19 12:33:08 -05:00
|
|
|
if (next == -2)
|
|
|
|
|
goto redo_next;
|
2022-07-06 08:43:51 -04:00
|
|
|
if (!_HA_ATOMIC_CAS(&fdtab[fd].update.next, &next, -2))
|
2018-02-05 11:14:55 -05:00
|
|
|
goto redo_next;
|
2019-03-08 07:47:21 -05:00
|
|
|
__ha_barrier_atomic_store();
|
2018-02-05 11:52:24 -05:00
|
|
|
|
|
|
|
|
new = fd;
|
2018-02-05 11:14:55 -05:00
|
|
|
redo_last:
|
|
|
|
|
/* First, insert in the linked list */
|
|
|
|
|
last = list->last;
|
|
|
|
|
old = -1;
|
|
|
|
|
|
2022-07-06 08:43:51 -04:00
|
|
|
fdtab[fd].update.prev = -2;
|
2018-02-05 11:52:24 -05:00
|
|
|
/* Make sure the "prev" store is visible before we update the last entry */
|
|
|
|
|
__ha_barrier_store();
|
2018-02-05 11:14:55 -05:00
|
|
|
|
2018-02-05 11:52:24 -05:00
|
|
|
if (unlikely(last == -1)) {
|
|
|
|
|
/* list is empty, try to add ourselves alone so that list->last=fd */
|
2019-03-08 12:47:42 -05:00
|
|
|
if (unlikely(!_HA_ATOMIC_CAS(&list->last, &old, new)))
|
2018-02-05 11:14:55 -05:00
|
|
|
goto redo_last;
|
|
|
|
|
|
|
|
|
|
/* list->first was necessary -1, we're guaranteed to be alone here */
|
|
|
|
|
list->first = fd;
|
|
|
|
|
} else {
|
2018-02-05 11:52:24 -05:00
|
|
|
/* adding ourselves past the last element
|
|
|
|
|
* The CAS will only succeed if its next is -1,
|
|
|
|
|
* which means it's in the cache, and the last element.
|
|
|
|
|
*/
|
2022-07-06 08:43:51 -04:00
|
|
|
if (unlikely(!_HA_ATOMIC_CAS(&fdtab[last].update.next, &old, new)))
|
2018-02-05 11:14:55 -05:00
|
|
|
goto redo_last;
|
2018-02-05 11:52:24 -05:00
|
|
|
|
|
|
|
|
/* Then, update the last entry */
|
|
|
|
|
list->last = fd;
|
2018-02-05 11:14:55 -05:00
|
|
|
}
|
|
|
|
|
__ha_barrier_store();
|
2018-02-05 11:52:24 -05:00
|
|
|
/* since we're alone at the end of the list and still locked(-2),
|
2021-01-06 11:20:16 -05:00
|
|
|
* we know no one tried to add past us. Mark the end of list.
|
2018-02-05 11:52:24 -05:00
|
|
|
*/
|
2022-07-06 08:43:51 -04:00
|
|
|
fdtab[fd].update.prev = last;
|
|
|
|
|
fdtab[fd].update.next = -1;
|
2018-02-05 11:14:55 -05:00
|
|
|
__ha_barrier_store();
|
|
|
|
|
done:
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* removes fd <fd> from fd list <list> */
|
2022-07-06 08:43:51 -04:00
|
|
|
void fd_rm_from_fd_list(volatile struct fdlist *list, int fd)
|
2018-02-05 11:14:55 -05:00
|
|
|
{
|
|
|
|
|
#if defined(HA_HAVE_CAS_DW) || defined(HA_CAS_IS_8B)
|
2020-02-25 03:25:53 -05:00
|
|
|
volatile union {
|
|
|
|
|
struct fdlist_entry ent;
|
|
|
|
|
uint64_t u64;
|
|
|
|
|
uint32_t u32[2];
|
|
|
|
|
} cur_list, next_list;
|
2018-02-05 11:14:55 -05:00
|
|
|
#endif
|
|
|
|
|
int old;
|
|
|
|
|
int new = -2;
|
|
|
|
|
int prev;
|
|
|
|
|
int next;
|
|
|
|
|
int last;
|
|
|
|
|
lock_self:
|
|
|
|
|
#if (defined(HA_CAS_IS_8B) || defined(HA_HAVE_CAS_DW))
|
2020-02-25 03:25:53 -05:00
|
|
|
next_list.ent.next = next_list.ent.prev = -2;
|
2023-02-27 08:48:46 -05:00
|
|
|
cur_list.ent = *(volatile typeof(fdtab->update)*)&fdtab[fd].update;
|
2018-02-05 11:14:55 -05:00
|
|
|
/* First, attempt to lock our own entries */
|
|
|
|
|
do {
|
|
|
|
|
/* The FD is not in the FD cache, give up */
|
2020-02-25 03:25:53 -05:00
|
|
|
if (unlikely(cur_list.ent.next <= -3))
|
2018-02-05 11:14:55 -05:00
|
|
|
return;
|
2020-02-25 03:25:53 -05:00
|
|
|
if (unlikely(cur_list.ent.prev == -2 || cur_list.ent.next == -2))
|
2018-02-05 11:14:55 -05:00
|
|
|
goto lock_self;
|
|
|
|
|
} while (
|
|
|
|
|
#ifdef HA_CAS_IS_8B
|
2022-07-06 08:43:51 -04:00
|
|
|
unlikely(!_HA_ATOMIC_CAS(((uint64_t *)&fdtab[fd].update), (uint64_t *)&cur_list.u64, next_list.u64))
|
2018-02-05 11:14:55 -05:00
|
|
|
#else
|
2022-09-17 05:15:29 -04:00
|
|
|
unlikely(!_HA_ATOMIC_DWCAS(((long *)&fdtab[fd].update), (uint32_t *)&cur_list.u32, (const uint32_t *)&next_list.u32))
|
2018-02-05 11:14:55 -05:00
|
|
|
#endif
|
2020-02-25 03:25:53 -05:00
|
|
|
);
|
|
|
|
|
next = cur_list.ent.next;
|
|
|
|
|
prev = cur_list.ent.prev;
|
2018-02-05 11:14:55 -05:00
|
|
|
|
|
|
|
|
#else
|
|
|
|
|
lock_self_next:
|
2023-02-27 08:48:46 -05:00
|
|
|
next = HA_ATOMIC_LOAD(&fdtab[fd].update.next);
|
2018-02-05 11:14:55 -05:00
|
|
|
if (next == -2)
|
|
|
|
|
goto lock_self_next;
|
|
|
|
|
if (next <= -3)
|
|
|
|
|
goto done;
|
2022-07-06 08:43:51 -04:00
|
|
|
if (unlikely(!_HA_ATOMIC_CAS(&fdtab[fd].update.next, &next, -2)))
|
2018-02-05 11:14:55 -05:00
|
|
|
goto lock_self_next;
|
|
|
|
|
lock_self_prev:
|
2023-02-27 08:48:46 -05:00
|
|
|
prev = HA_ATOMIC_LOAD(&fdtab[fd].update.prev);
|
2018-02-05 11:14:55 -05:00
|
|
|
if (prev == -2)
|
|
|
|
|
goto lock_self_prev;
|
2022-07-06 08:43:51 -04:00
|
|
|
if (unlikely(!_HA_ATOMIC_CAS(&fdtab[fd].update.prev, &prev, -2)))
|
2018-02-05 11:14:55 -05:00
|
|
|
goto lock_self_prev;
|
|
|
|
|
#endif
|
2019-03-08 07:47:21 -05:00
|
|
|
__ha_barrier_atomic_store();
|
2018-02-05 11:14:55 -05:00
|
|
|
|
|
|
|
|
/* Now, lock the entries of our neighbours */
|
|
|
|
|
if (likely(prev != -1)) {
|
|
|
|
|
redo_prev:
|
|
|
|
|
old = fd;
|
|
|
|
|
|
2022-07-06 08:43:51 -04:00
|
|
|
if (unlikely(!_HA_ATOMIC_CAS(&fdtab[prev].update.next, &old, new))) {
|
2018-02-05 11:14:55 -05:00
|
|
|
if (unlikely(old == -2)) {
|
|
|
|
|
/* Neighbour already locked, give up and
|
|
|
|
|
* retry again once he's done
|
|
|
|
|
*/
|
2022-07-06 08:43:51 -04:00
|
|
|
fdtab[fd].update.prev = prev;
|
2018-02-05 11:14:55 -05:00
|
|
|
__ha_barrier_store();
|
2022-07-06 08:43:51 -04:00
|
|
|
fdtab[fd].update.next = next;
|
2018-02-05 11:14:55 -05:00
|
|
|
__ha_barrier_store();
|
|
|
|
|
goto lock_self;
|
|
|
|
|
}
|
|
|
|
|
goto redo_prev;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (likely(next != -1)) {
|
|
|
|
|
redo_next:
|
|
|
|
|
old = fd;
|
2022-07-06 08:43:51 -04:00
|
|
|
if (unlikely(!_HA_ATOMIC_CAS(&fdtab[next].update.prev, &old, new))) {
|
2018-02-05 11:14:55 -05:00
|
|
|
if (unlikely(old == -2)) {
|
|
|
|
|
/* Neighbour already locked, give up and
|
|
|
|
|
* retry again once he's done
|
|
|
|
|
*/
|
|
|
|
|
if (prev != -1) {
|
2022-07-06 08:43:51 -04:00
|
|
|
fdtab[prev].update.next = fd;
|
2018-02-05 11:14:55 -05:00
|
|
|
__ha_barrier_store();
|
|
|
|
|
}
|
2022-07-06 08:43:51 -04:00
|
|
|
fdtab[fd].update.prev = prev;
|
2018-02-05 11:14:55 -05:00
|
|
|
__ha_barrier_store();
|
2022-07-06 08:43:51 -04:00
|
|
|
fdtab[fd].update.next = next;
|
2018-02-05 11:14:55 -05:00
|
|
|
__ha_barrier_store();
|
|
|
|
|
goto lock_self;
|
|
|
|
|
}
|
|
|
|
|
goto redo_next;
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (list->first == fd)
|
|
|
|
|
list->first = next;
|
|
|
|
|
__ha_barrier_store();
|
|
|
|
|
last = list->last;
|
2019-03-08 12:47:42 -05:00
|
|
|
while (unlikely(last == fd && (!_HA_ATOMIC_CAS(&list->last, &last, prev))))
|
2018-02-05 11:14:55 -05:00
|
|
|
__ha_compiler_barrier();
|
|
|
|
|
/* Make sure we let other threads know we're no longer in cache,
|
|
|
|
|
* before releasing our neighbours.
|
|
|
|
|
*/
|
|
|
|
|
__ha_barrier_store();
|
|
|
|
|
if (likely(prev != -1))
|
2022-07-06 08:43:51 -04:00
|
|
|
fdtab[prev].update.next = next;
|
2018-02-05 11:14:55 -05:00
|
|
|
__ha_barrier_store();
|
|
|
|
|
if (likely(next != -1))
|
2022-07-06 08:43:51 -04:00
|
|
|
fdtab[next].update.prev = prev;
|
2018-02-05 11:14:55 -05:00
|
|
|
__ha_barrier_store();
|
|
|
|
|
/* Ok, now we're out of the fd cache */
|
2022-07-06 08:43:51 -04:00
|
|
|
fdtab[fd].update.next = -(next + 4);
|
2018-02-05 11:14:55 -05:00
|
|
|
__ha_barrier_store();
|
|
|
|
|
done:
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
BUG/MEDIUM: fd: do not wait on FD removal in fd_delete()
Christopher discovered an issue mostly affecting 2.2 and to a lesser extent
2.3 and above, which is that it's possible to deadlock a soft-stop when
several threads are using a same listener:
thread1 thread2
unbind_listener() fd_set_running()
lock(listener) listener_accept()
fd_delete() lock(listener)
while (running_mask); -----> deadlock
unlock(listener)
This simple case disappeared from 2.3 due to the removal of some locked
operations at the end of listener_accept() on the regular path, but the
architectural problem is still here and caused by a lock inversion built
around the loop on running_mask in fd_clr_running_excl(), because there
are situations where the caller of fd_delete() may hold a lock that is
preventing other threads from dropping their bit in running_mask.
The real need here is to make sure the last user deletes the FD. We have
all we need to know the last one, it's the one calling fd_clr_running()
last, or entering fd_delete() last, both of which can be summed up as
the last one calling fd_clr_running() if fd_delete() calls fd_clr_running()
at the end. And we can prevent new threads from appearing in running_mask
by removing their bits in thread_mask.
So what this patch does is that it sets the running_mask for the thread
in fd_delete(), clears the thread_mask, thus marking the FD as orphaned,
then clears the running mask again, and completes the deletion if it was
the last one. If it was not, another thread will pass through fd_clr_running
and will complete the deletion of the FD.
The bug is easily reproducible in 2.2 under high connection rates during
soft close. When the old process stops its listener, occasionally two
threads will deadlock and the old process will then be killed by the
watchdog. It's strongly believed that similar situations do exist in 2.3
and 2.4 (e.g. if the removal attempt happens during resume_listener()
called from listener_accept()) but if so, they should be much harder to
trigger.
This should be backported to 2.2 as the issue appeared with the FD
migration. It requires previous patches "fd: make fd_clr_running() return
the remaining running mask" and "MINOR: fd: remove the unneeded running
bit from fd_insert()".
Notes for backport: in 2.2, the fd_dodelete() function requires an extra
argument "do_close" indicating whether we want to remove and close the FD
(fd_delete) or just delete it (fd_remove). While this information is not
conveyed along the chain, we know that late calls always imply do_close=1
because do_close=0 exclusively results from fd_remove() which is only used
by the config parser and the master, both of which are single-threaded,
hence are always the last ones in the running_mask. Thus it is safe to
assume that a postponed FD deletion always implies do_close=1.
Thanks to Olivier for his help in designing this optimal solution.
2021-03-24 05:51:32 -04:00
|
|
|
/* deletes the FD once nobody uses it anymore, as detected by the caller by its
|
|
|
|
|
* thread_mask being zero and its running mask turning to zero. There is no
|
|
|
|
|
* protection against concurrent accesses, it's up to the caller to make sure
|
2023-02-27 12:35:39 -05:00
|
|
|
* only the last thread will call it. If called under isolation, it is safe to
|
|
|
|
|
* call this from another group than the FD's. This is only for internal use,
|
|
|
|
|
* please use fd_delete() instead.
|
2007-04-08 10:39:58 -04:00
|
|
|
*/
|
BUG/MEDIUM: fd: do not wait on FD removal in fd_delete()
Christopher discovered an issue mostly affecting 2.2 and to a less extent
2.3 and above, which is that it's possible to deadlock a soft-stop when
several threads are using a same listener:
thread1 thread2
unbind_listener() fd_set_running()
lock(listener) listener_accept()
fd_delete() lock(listener)
while (running_mask); -----> deadlock
unlock(listener)
This simple case disappeared from 2.3 due to the removal of some locked
operations at the end of listener_accept() on the regular path, but the
architectural problem is still here and caused by a lock inversion built
around the loop on running_mask in fd_clr_running_excl(), because there
are situations where the caller of fd_delete() may hold a lock that is
preventing other threads from dropping their bit in running_mask.
The real need here is to make sure the last user deletes the FD. We have
all we need to know the last one, it's the one calling fd_clr_running()
last, or entering fd_delete() last, both of which can be summed up as
the last one calling fd_clr_running() if fd_delete() calls fd_clr_running()
at the end. And we can prevent new threads from appearing in running_mask
by removing their bits in thread_mask.
So what this patch does is that it sets the running_mask for the thread
in fd_delete(), clears the thread_mask, thus marking the FD as orphaned,
then clears the running mask again, and completes the deletion if it was
the last one. If it was not, another thread will pass through fd_clr_running
and will complete the deletion of the FD.
The bug is easily reproducible in 2.2 under high connection rates during
soft close. When the old process stops its listener, occasionally two
threads will deadlock and the old process will then be killed by the
watchdog. It's strongly believed that similar situations do exist in 2.3
and 2.4 (e.g. if the removal attempt happens during resume_listener()
called from listener_accept()) but if so, they should be much harder to
trigger.
This should be backported to 2.2 as the issue appeared with the FD
migration. It requires previous patches "fd: make fd_clr_running() return
the remaining running mask" and "MINOR: fd: remove the unneeded running
bit from fd_insert()".
Notes for backport: in 2.2, the fd_dodelete() function requires an extra
argument "do_close" indicating whether we want to remove and close the FD
(fd_delete) or just delete it (fd_remove). While this information is not
conveyed along the chain, we know that late calls always imply do_close=1
become do_close=0 exclusively results from fd_remove() which is only used
by the config parser and the master, both of which are single-threaded,
hence are always the last ones in the running_mask. Thus it is safe to
assume that a postponed FD deletion always implies do_close=1.
Thanks to Olivier for his help in designing this optimal solution.
2021-03-24 05:51:32 -04:00
|
|
|
void _fd_delete_orphan(int fd)
|
2006-06-25 20:48:02 -04:00
|
|
|
{
|
2023-02-27 12:35:39 -05:00
|
|
|
int tgrp = fd_tgid(fd);
|
2022-07-01 11:31:25 -04:00
|
|
|
uint fd_disown;
|
|
|
|
|
|
|
|
|
|
fd_disown = fdtab[fd].state & FD_DISOWN;
|
2021-04-06 11:49:19 -04:00
|
|
|
if (fdtab[fd].state & FD_LINGER_RISK) {
|
2013-12-15 08:19:38 -05:00
|
|
|
/* this is generally set when connecting to servers */
|
2020-04-02 06:02:08 -04:00
|
|
|
DISGUISE(setsockopt(fd, SOL_SOCKET, SO_LINGER,
|
|
|
|
|
(struct linger *) &nolinger, sizeof(struct linger)));
|
2013-12-15 08:19:38 -05:00
|
|
|
}
|
2022-07-06 10:23:41 -04:00
|
|
|
|
|
|
|
|
/* It's expected that a close() will result in the FD disappearing from
|
|
|
|
|
* pollers, but some pollers may have some internal bookkeeping to be
|
|
|
|
|
* done prior to the call (e.g. remove references from internal tables).
|
|
|
|
|
*/
|
2012-11-11 10:05:19 -05:00
|
|
|
if (cur_poller.clo)
|
|
|
|
|
cur_poller.clo(fd);
|
2021-03-24 10:34:25 -04:00
|
|
|
|
BUG/MAJOR: fd/thread: fix race between updates and closing FD
While running some L7 retries tests, Christopher and I stumbled upon a
very strange behavior showing some occasional server timeouts when the
server closes keep-alive connections quickly. The issue can be
reproduced with the following config:
global
expose-experimental-directives
#tune.fd.edge-triggered on # can speed up the issue
defaults
mode http
timeout client 5s
timeout server 10s
timeout connect 2s
listen f
bind :8001
http-reuse always
retry-on all-retryable-errors
server next 127.0.0.1:8002
frontend b
bind :8002
timeout http-keep-alive 1 # one ms
redirect location /
Sending fast requests without reusing the client connection on port 8001
with a single connection and at least 3 threads on haproxy occasionally
shows some glitches pauses (below with timeout server 2s):
$ taskset -c 2,3 h1load -e -t 1 -r 1 -c 1 http://127.0.0.1:8001/
# time conns tot_conn tot_req tot_bytes err cps rps bps ttfb
1 1 9794 9793 959714 0 9k79 9k79 7M67 42.94u
2 1 9794 9793 959714 0 0.00 0.00 0.00 -
3 1 9794 9793 959714 0 0.00 0.00 0.00 -
4 0 16015 16015 1569470 0 6k22 6k22 4M87 522.9u
5 0 18657 18656 1828190 2 2k63 2k63 2M06 39.22u
If this doesn't happen, limiting to a request rate close to 1/timeout
may help.
What is happening is that after several migrations, a late report
via fd_update_events() may detect that the thread is not welcome, and
will want to program an update so that the current thread's poller
disables its polling on it. It is allowed to do so because it used
fd_grab_tgid(). But what if _fd_delete_orphan() was just starting to
be called and already reset the update_mask ? We'll end up with a bit
present in the update mask, then _fd_delete_orphan() resets the tgid,
which will prevent the poller from consuming that update. The update
is not needed anymore since the FD was closed, but in this case nobody
will clear this bit until the same FD is reused again and cleared. And
as long as the thread's bit remains in the update_mask, no new updates
will be programmed for the next use of this FD on the same thread since
due to the bit being present, fd_nbupdt will not be changed. This is
what is causing this timeout.
The fix consists in making sure _fd_delete_orphan() waits for the
occasional watchers to leave, and to do this before clearing the
update_mask. This will be either fd_update_events() trying to check
its thread_mask, or the poller checking its updates, so that's pretty
short. But it definitely closes this race.
This fix is needed since the introduction of fd_grab_tgid(), hence 2.7.
Note that while testing the fix, another related issue concerning the
atomicity of running_mask vs thread_mask popped up and will have to be
fixed till 2.5 as part of another patch. It may make the tests for this
fix occasionally tigger a few BUG_ON() or face a null conn->subs in
sock_conn_iocb(), though these ones are much more difficult to trigger.
This is not caused by this fix.
2023-03-04 09:33:24 -05:00
|
|
|
/* now we're about to reset some of this FD's fields. We don't want
|
|
|
|
|
* anyone to grab it anymore and we need to make sure those which could
|
|
|
|
|
* possibly have stumbled upon it right now are leaving before we
|
|
|
|
|
* proceed. This is done in two steps. First we reset the tgid so that
|
|
|
|
|
* fd_take_tgid() and fd_grab_tgid() fail, then we wait for existing
|
|
|
|
|
* ref counts to drop. Past this point we're alone dealing with the
|
|
|
|
|
* FD's thead/running/update/polled masks.
|
|
|
|
|
*/
|
|
|
|
|
fd_reset_tgid(fd);
|
|
|
|
|
|
|
|
|
|
while (_HA_ATOMIC_LOAD(&fdtab[fd].refc_tgid) != 0) // refc==0 ?
|
|
|
|
|
__ha_cpu_relax();
|
|
|
|
|
|
2022-07-06 10:23:41 -04:00
|
|
|
/* we don't want this FD anymore in the global list */
|
2023-02-27 12:35:39 -05:00
|
|
|
fd_rm_from_fd_list(&update_list[tgrp - 1], fd);
|
2022-07-06 10:23:41 -04:00
|
|
|
|
|
|
|
|
/* no more updates on this FD are relevant anymore */
|
|
|
|
|
HA_ATOMIC_STORE(&fdtab[fd].update_mask, 0);
|
2022-07-06 10:20:11 -04:00
|
|
|
if (fd_nbupdt > 0 && fd_updt[fd_nbupdt - 1] == fd)
|
|
|
|
|
fd_nbupdt--;
|
2022-07-06 10:23:41 -04:00
|
|
|
|
2021-03-24 10:34:25 -04:00
|
|
|
port_range_release_port(fdinfo[fd].port_range, fdinfo[fd].local_port);
|
2019-08-05 12:51:52 -04:00
|
|
|
polled_mask[fd].poll_recv = polled_mask[fd].poll_send = 0;
|
2012-11-11 10:05:19 -05:00
|
|
|
|
MAJOR: polling: rework the whole polling system
This commit heavily changes the polling system in order to definitely
fix the frequent breakage of SSL which needs to remember the last
EAGAIN before deciding whether to poll or not. Now we have a state per
direction for each FD, as opposed to a previous and current state
previously. An FD can have up to 8 different states for each direction,
each of which being the result of a 3-bit combination. These 3 bits
indicate a wish to access the FD, the readiness of the FD and the
subscription of the FD to the polling system.
This means that it will now be possible to remember the state of a
file descriptor across disable/enable sequences that generally happen
during forwarding, where enabling reading on a previously disabled FD
would result in forgetting the EAGAIN flag it met last time.
Several new state manipulation functions have been introduced or
adapted :
- fd_want_{recv,send} : enable receiving/sending on the FD regardless
of its state (sets the ACTIVE flag) ;
- fd_stop_{recv,send} : stop receiving/sending on the FD regardless
of its state (clears the ACTIVE flag) ;
- fd_cant_{recv,send} : report a failure to receive/send on the FD
corresponding to EAGAIN (clears the READY flag) ;
- fd_may_{recv,send} : report the ability to receive/send on the FD
as reported by poll() (sets the READY flag) ;
Some functions are used to report the current FD status :
- fd_{recv,send}_active
- fd_{recv,send}_ready
- fd_{recv,send}_polled
Some functions were removed :
- fd_ev_clr(), fd_ev_set(), fd_ev_rem(), fd_ev_wai()
The POLLHUP/POLLERR flags are now reported as ready so that the I/O layers
knows it can try to access the file descriptor to get this information.
In order to simplify the conditions to add/remove cache entries, a new
function fd_alloc_or_release_cache_entry() was created to be used from
pollers while scanning for updates.
The following pollers have been updated :
ev_select() : done, built, tested on Linux 3.10
ev_poll() : done, built, tested on Linux 3.10
ev_epoll() : done, built, tested on Linux 3.10 & 3.13
ev_kqueue() : done, built, tested on OpenBSD 5.2
2014-01-10 10:58:45 -05:00
|
|
|
fdtab[fd].state = 0;
|
2012-11-11 10:05:19 -05:00
|
|
|
|
2020-06-23 04:04:54 -04:00
|
|
|
#ifdef DEBUG_FD
|
|
|
|
|
fdtab[fd].event_count = 0;
|
|
|
|
|
#endif
|
2009-10-18 01:25:52 -04:00
|
|
|
fdinfo[fd].port_range = NULL;
|
2012-07-05 17:19:22 -04:00
|
|
|
fdtab[fd].owner = NULL;
|
2022-07-06 10:23:41 -04:00
|
|
|
|
BUG/MEDIUM: fd: do not wait on FD removal in fd_delete()
Christopher discovered an issue mostly affecting 2.2 and to a less extent
2.3 and above, which is that it's possible to deadlock a soft-stop when
several threads are using a same listener:
thread1 thread2
unbind_listener() fd_set_running()
lock(listener) listener_accept()
fd_delete() lock(listener)
while (running_mask); -----> deadlock
unlock(listener)
This simple case disappeared from 2.3 due to the removal of some locked
operations at the end of listener_accept() on the regular path, but the
architectural problem is still here and caused by a lock inversion built
around the loop on running_mask in fd_clr_running_excl(), because there
are situations where the caller of fd_delete() may hold a lock that is
preventing other threads from dropping their bit in running_mask.
The real need here is to make sure the last user deletes the FD. We have
all we need to know the last one, it's the one calling fd_clr_running()
last, or entering fd_delete() last, both of which can be summed up as
the last one calling fd_clr_running() if fd_delete() calls fd_clr_running()
at the end. And we can prevent new threads from appearing in running_mask
by removing their bits in thread_mask.
So what this patch does is that it sets the running_mask for the thread
in fd_delete(), clears the thread_mask, thus marking the FD as orphaned,
then clears the running mask again, and completes the deletion if it was
the last one. If it was not, another thread will pass through fd_clr_running
and will complete the deletion of the FD.
The bug is easily reproducible in 2.2 under high connection rates during
soft close. When the old process stops its listener, occasionally two
threads will deadlock and the old process will then be killed by the
watchdog. It's strongly believed that similar situations do exist in 2.3
and 2.4 (e.g. if the removal attempt happens during resume_listener()
called from listener_accept()) but if so, they should be much harder to
trigger.
This should be backported to 2.2 as the issue appeared with the FD
migration. It requires previous patches "fd: make fd_clr_running() return
the remaining running mask" and "MINOR: fd: remove the unneeded running
bit from fd_insert()".
Notes for backport: in 2.2, the fd_dodelete() function requires an extra
argument "do_close" indicating whether we want to remove and close the FD
(fd_delete) or just delete it (fd_remove). While this information is not
conveyed along the chain, we know that late calls always imply do_close=1
become do_close=0 exclusively results from fd_remove() which is only used
by the config parser and the master, both of which are single-threaded,
hence are always the last ones in the running_mask. Thus it is safe to
assume that a postponed FD deletion always implies do_close=1.
Thanks to Olivier for his help in designing this optimal solution.
2021-03-24 05:51:32 -04:00
|
|
|
/* perform the close() call last as it's what unlocks the instant reuse
|
|
|
|
|
* of this FD by any other thread.
|
|
|
|
|
*/
|
2025-01-30 10:25:40 -05:00
|
|
|
if (!fd_disown) {
|
|
|
|
|
fdtab[fd].generation++;
|
2022-07-01 11:31:25 -04:00
|
|
|
close(fd);
|
2025-01-30 10:25:40 -05:00
|
|
|
}
|
2021-04-06 07:53:36 -04:00
|
|
|
_HA_ATOMIC_DEC(&ha_used_fds);
|
BUG/MEDIUM: fd: do not wait on FD removal in fd_delete()
Christopher discovered an issue mostly affecting 2.2 and to a less extent
2.3 and above, which is that it's possible to deadlock a soft-stop when
several threads are using a same listener:
thread1 thread2
unbind_listener() fd_set_running()
lock(listener) listener_accept()
fd_delete() lock(listener)
while (running_mask); -----> deadlock
unlock(listener)
This simple case disappeared from 2.3 due to the removal of some locked
operations at the end of listener_accept() on the regular path, but the
architectural problem is still here and caused by a lock inversion built
around the loop on running_mask in fd_clr_running_excl(), because there
are situations where the caller of fd_delete() may hold a lock that is
preventing other threads from dropping their bit in running_mask.
The real need here is to make sure the last user deletes the FD. We have
all we need to know the last one, it's the one calling fd_clr_running()
last, or entering fd_delete() last, both of which can be summed up as
the last one calling fd_clr_running() if fd_delete() calls fd_clr_running()
at the end. And we can prevent new threads from appearing in running_mask
by removing their bits in thread_mask.
So what this patch does is that it sets the running_mask for the thread
in fd_delete(), clears the thread_mask, thus marking the FD as orphaned,
then clears the running mask again, and completes the deletion if it was
the last one. If it was not, another thread will pass through fd_clr_running
and will complete the deletion of the FD.
The bug is easily reproducible in 2.2 under high connection rates during
soft close. When the old process stops its listener, occasionally two
threads will deadlock and the old process will then be killed by the
watchdog. It's strongly believed that similar situations do exist in 2.3
and 2.4 (e.g. if the removal attempt happens during resume_listener()
called from listener_accept()) but if so, they should be much harder to
trigger.
This should be backported to 2.2 as the issue appeared with the FD
migration. It requires previous patches "fd: make fd_clr_running() return
the remaining running mask" and "MINOR: fd: remove the unneeded running
bit from fd_insert()".
Notes for backport: in 2.2, the fd_dodelete() function requires an extra
argument "do_close" indicating whether we want to remove and close the FD
(fd_delete) or just delete it (fd_remove). While this information is not
conveyed along the chain, we know that late calls always imply do_close=1
become do_close=0 exclusively results from fd_remove() which is only used
by the config parser and the master, both of which are single-threaded,
hence are always the last ones in the running_mask. Thus it is safe to
assume that a postponed FD deletion always implies do_close=1.
Thanks to Olivier for his help in designing this optimal solution.
2021-03-24 05:51:32 -04:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Deletes an FD from the fdsets. The file descriptor is also closed, possibly
|
2023-02-27 12:43:38 -05:00
|
|
|
* asynchronously. It is safe to call it from another thread from the same
|
|
|
|
|
* group as the FD's or from a thread from a different group. However if called
|
|
|
|
|
* from a thread from another group, there is an extra cost involved because
|
|
|
|
|
* the operation is performed under thread isolation, so doing so must be
|
|
|
|
|
* reserved for ultra-rare cases (e.g. stopping a listener).
|
BUG/MEDIUM: fd: do not wait on FD removal in fd_delete()
Christopher discovered an issue mostly affecting 2.2 and to a lesser extent
2.3 and above, which is that it's possible to deadlock a soft-stop when
several threads are using a same listener:
thread1 thread2
unbind_listener() fd_set_running()
lock(listener) listener_accept()
fd_delete() lock(listener)
while (running_mask); -----> deadlock
unlock(listener)
This simple case disappeared from 2.3 due to the removal of some locked
operations at the end of listener_accept() on the regular path, but the
architectural problem is still here and caused by a lock inversion built
around the loop on running_mask in fd_clr_running_excl(), because there
are situations where the caller of fd_delete() may hold a lock that is
preventing other threads from dropping their bit in running_mask.
The real need here is to make sure the last user deletes the FD. We have
all we need to know the last one, it's the one calling fd_clr_running()
last, or entering fd_delete() last, both of which can be summed up as
the last one calling fd_clr_running() if fd_delete() calls fd_clr_running()
at the end. And we can prevent new threads from appearing in running_mask
by removing their bits in thread_mask.
So what this patch does is that it sets the running_mask for the thread
in fd_delete(), clears the thread_mask, thus marking the FD as orphaned,
then clears the running mask again, and completes the deletion if it was
the last one. If it was not, another thread will pass through fd_clr_running
and will complete the deletion of the FD.
The bug is easily reproducible in 2.2 under high connection rates during
soft close. When the old process stops its listener, occasionally two
threads will deadlock and the old process will then be killed by the
watchdog. It's strongly believed that similar situations do exist in 2.3
and 2.4 (e.g. if the removal attempt happens during resume_listener()
called from listener_accept()) but if so, they should be much harder to
trigger.
This should be backported to 2.2 as the issue appeared with the FD
migration. It requires previous patches "fd: make fd_clr_running() return
the remaining running mask" and "MINOR: fd: remove the unneeded running
bit from fd_insert()".
Notes for backport: in 2.2, the fd_dodelete() function requires an extra
argument "do_close" indicating whether we want to remove and close the FD
(fd_delete) or just delete it (fd_remove). While this information is not
conveyed along the chain, we know that late calls always imply do_close=1
because do_close=0 exclusively results from fd_remove() which is only used
by the config parser and the master, both of which are single-threaded,
hence are always the last ones in the running_mask. Thus it is safe to
assume that a postponed FD deletion always implies do_close=1.
Thanks to Olivier for his help in designing this optimal solution.
2021-03-24 05:51:32 -04:00
|
|
|
*/
|
|
|
|
|
void fd_delete(int fd)
|
|
|
|
|
{
|
2022-01-31 14:05:02 -05:00
|
|
|
/* This must never happen and would definitely indicate a bug, in
|
|
|
|
|
* addition to overwriting some unexpected memory areas.
|
|
|
|
|
*/
|
|
|
|
|
BUG_ON(fd < 0 || fd >= global.maxsock);
|
|
|
|
|
|
2022-07-15 12:56:48 -04:00
|
|
|
/* NOTE: The master when going into reexec mode re-closes all FDs after
|
|
|
|
|
* they were already dispatched. But we know we didn't start the polling
|
|
|
|
|
* threads so we can still close them. The masks will probably not match
|
|
|
|
|
* however so we force the value and erase the refcount if any.
|
|
|
|
|
*/
|
|
|
|
|
if (unlikely(global.mode & MODE_STARTING))
|
|
|
|
|
fdtab[fd].refc_tgid = ti->tgid;
|
|
|
|
|
|
2022-07-06 12:47:38 -04:00
|
|
|
/* the tgid cannot change before a complete close so we should never
|
|
|
|
|
* face the situation where we try to close an fd that was reassigned.
|
2023-02-27 12:43:38 -05:00
|
|
|
* However there is one corner case where this happens, it's when an
|
|
|
|
|
* attempt to pause a listener fails (e.g. abns), leaving the listener
|
|
|
|
|
* in fault state and it is forcefully stopped. This needs to be done
|
|
|
|
|
* under isolation, and it's quite rare (i.e. once per such FD per
|
|
|
|
|
* process). Since we'll be isolated we can clear the thread mask and
|
|
|
|
|
* close the FD ourselves.
|
2022-07-06 12:47:38 -04:00
|
|
|
*/
|
2023-02-27 12:43:38 -05:00
|
|
|
if (unlikely(fd_tgid(fd) != ti->tgid)) {
|
|
|
|
|
int must_isolate = !thread_isolated() && !(global.mode & MODE_STOPPING);
|
|
|
|
|
|
|
|
|
|
if (must_isolate)
|
|
|
|
|
thread_isolate();
|
|
|
|
|
|
|
|
|
|
HA_ATOMIC_STORE(&fdtab[fd].thread_mask, 0);
|
|
|
|
|
HA_ATOMIC_STORE(&fdtab[fd].running_mask, 0);
|
|
|
|
|
_fd_delete_orphan(fd);
|
|
|
|
|
|
|
|
|
|
if (must_isolate)
|
|
|
|
|
thread_release();
|
|
|
|
|
return;
|
|
|
|
|
}
|
2022-07-06 12:47:38 -04:00
|
|
|
|
BUG/MEDIUM: fd: do not wait on FD removal in fd_delete()
Christopher discovered an issue mostly affecting 2.2 and to a less extent
2.3 and above, which is that it's possible to deadlock a soft-stop when
several threads are using a same listener:
thread1 thread2
unbind_listener() fd_set_running()
lock(listener) listener_accept()
fd_delete() lock(listener)
while (running_mask); -----> deadlock
unlock(listener)
This simple case disappeared from 2.3 due to the removal of some locked
operations at the end of listener_accept() on the regular path, but the
architectural problem is still here and caused by a lock inversion built
around the loop on running_mask in fd_clr_running_excl(), because there
are situations where the caller of fd_delete() may hold a lock that is
preventing other threads from dropping their bit in running_mask.
The real need here is to make sure the last user deletes the FD. We have
all we need to know the last one, it's the one calling fd_clr_running()
last, or entering fd_delete() last, both of which can be summed up as
the last one calling fd_clr_running() if fd_delete() calls fd_clr_running()
at the end. And we can prevent new threads from appearing in running_mask
by removing their bits in thread_mask.
So what this patch does is that it sets the running_mask for the thread
in fd_delete(), clears the thread_mask, thus marking the FD as orphaned,
then clears the running mask again, and completes the deletion if it was
the last one. If it was not, another thread will pass through fd_clr_running
and will complete the deletion of the FD.
The bug is easily reproducible in 2.2 under high connection rates during
soft close. When the old process stops its listener, occasionally two
threads will deadlock and the old process will then be killed by the
watchdog. It's strongly believed that similar situations do exist in 2.3
and 2.4 (e.g. if the removal attempt happens during resume_listener()
called from listener_accept()) but if so, they should be much harder to
trigger.
This should be backported to 2.2 as the issue appeared with the FD
migration. It requires previous patches "fd: make fd_clr_running() return
the remaining running mask" and "MINOR: fd: remove the unneeded running
bit from fd_insert()".
Notes for backport: in 2.2, the fd_dodelete() function requires an extra
argument "do_close" indicating whether we want to remove and close the FD
(fd_delete) or just delete it (fd_remove). While this information is not
conveyed along the chain, we know that late calls always imply do_close=1
become do_close=0 exclusively results from fd_remove() which is only used
by the config parser and the master, both of which are single-threaded,
hence are always the last ones in the running_mask. Thus it is safe to
assume that a postponed FD deletion always implies do_close=1.
Thanks to Olivier for his help in designing this optimal solution.
2021-03-24 05:51:32 -04:00
|
|
|
/* we must postpone removal of an FD that may currently be in use
|
2021-04-24 04:25:42 -04:00
|
|
|
* by another thread. This can happen in the following two situations:
|
BUG/MEDIUM: fd: do not wait on FD removal in fd_delete()
Christopher discovered an issue mostly affecting 2.2 and to a less extent
2.3 and above, which is that it's possible to deadlock a soft-stop when
several threads are using a same listener:
thread1 thread2
unbind_listener() fd_set_running()
lock(listener) listener_accept()
fd_delete() lock(listener)
while (running_mask); -----> deadlock
unlock(listener)
This simple case disappeared from 2.3 due to the removal of some locked
operations at the end of listener_accept() on the regular path, but the
architectural problem is still here and caused by a lock inversion built
around the loop on running_mask in fd_clr_running_excl(), because there
are situations where the caller of fd_delete() may hold a lock that is
preventing other threads from dropping their bit in running_mask.
The real need here is to make sure the last user deletes the FD. We have
all we need to know the last one, it's the one calling fd_clr_running()
last, or entering fd_delete() last, both of which can be summed up as
the last one calling fd_clr_running() if fd_delete() calls fd_clr_running()
at the end. And we can prevent new threads from appearing in running_mask
by removing their bits in thread_mask.
So what this patch does is that it sets the running_mask for the thread
in fd_delete(), clears the thread_mask, thus marking the FD as orphaned,
then clears the running mask again, and completes the deletion if it was
the last one. If it was not, another thread will pass through fd_clr_running
and will complete the deletion of the FD.
The bug is easily reproducible in 2.2 under high connection rates during
soft close. When the old process stops its listener, occasionally two
threads will deadlock and the old process will then be killed by the
watchdog. It's strongly believed that similar situations do exist in 2.3
and 2.4 (e.g. if the removal attempt happens during resume_listener()
called from listener_accept()) but if so, they should be much harder to
trigger.
This should be backported to 2.2 as the issue appeared with the FD
migration. It requires previous patches "fd: make fd_clr_running() return
the remaining running mask" and "MINOR: fd: remove the unneeded running
bit from fd_insert()".
Notes for backport: in 2.2, the fd_dodelete() function requires an extra
argument "do_close" indicating whether we want to remove and close the FD
(fd_delete) or just delete it (fd_remove). While this information is not
conveyed along the chain, we know that late calls always imply do_close=1
become do_close=0 exclusively results from fd_remove() which is only used
by the config parser and the master, both of which are single-threaded,
hence are always the last ones in the running_mask. Thus it is safe to
assume that a postponed FD deletion always implies do_close=1.
Thanks to Olivier for his help in designing this optimal solution.
2021-03-24 05:51:32 -04:00
|
|
|
* - after a takeover, the owning thread closes the connection but
|
|
|
|
|
* the previous one just woke up from the poller and entered
|
|
|
|
|
* the FD handler iocb. That thread holds an entry in running_mask
|
|
|
|
|
* and requires removal protection.
|
|
|
|
|
* - multiple threads are accepting connections on a listener, and
|
|
|
|
|
* one of them (or even an separate one) decides to unbind the
|
|
|
|
|
* listener under the listener's lock while other ones still hold
|
|
|
|
|
* the running bit.
|
|
|
|
|
* In both situations the FD is marked as unused (thread_mask = 0) and
|
|
|
|
|
* will not take new bits in its running_mask so we have the guarantee
|
|
|
|
|
* that the last thread eliminating running_mask is the one allowed to
|
|
|
|
|
* safely delete the FD. Most of the time it will be the current thread.
|
BUG/MAJOR: fd/threads: close a race on closing connections after takeover
As mentioned in commit 237e6a0d6 ("BUG/MAJOR: fd/thread: fix race between
updates and closing FD"), a race was found during stress tests involving
heavy backend connection reuse with many competing closes.
Here the problem is complex. The analysis in commit f69fea64e ("MAJOR:
fd: get rid of the DWCAS when setting the running_mask") that removed
the DWCAS in 2.5 overlooked a few races.
First, a takeover from thread1 could happen just after fd_update_events()
in thread2 validates it holds the tmask bit in the CAS loop. Since thread1
releases running_mask after the operation, thread2 will succeed the CAS
and both will believe the FD is theirs. This does explain the occasional
crashes seen with h1_io_cb() being called on a bad context, or
sock_conn_iocb() seeing conn->subs vanish after checking it. This issue
can be addressed using a DWCAS in both fd_takeover() and fd_update_events()
as it was before the patch above but this is not portable to all archs and
is not easy to adapt for those lacking it, due to some operations still
happening only on individual masks after the thread groups were added.
Second, the checks after fd_clr_running() for the current thread being
the last one is not sufficient: at the exact moment the operation
completes, another thread may also set and drop the running bit and see
itself as alone, and both can call _fd_close_orphan() in parallel. In
order to prevent this from happening, we cannot rely on the absence of
others, we need an explicit flag indicating that the FD must be closed.
One approach that was attempted consisted in playing with the thread_mask
but that was not reliable since it could still match between the late
deletion and the early insertion that follows. Instead, a new FD flag
was added, FD_MUST_CLOSE, that exactly indicates that the call to
_fd_delete_orphan() must be done. It is set by fd_delete(), and
atomically cleared by the first one which checks it, and which is the
only one to call _fd_delete_orphan().
With both points addressed, there's no more visible race left:
- takeover() only happens under the connection list's lock and cannot
compete with fd_delete() since fd_delete() must first remove the
connection from the list before deleting the FD. That's also why it
doesn't need to call _fd_delete_orphan() when dropping its running
bit.
- takeover() sets its running bit then atomically replaces the thread
mask, so that until that's done, it doesn't validate the condition
to end the synchonization loop in fd_update_events(). Once it's OK,
the previous thread's bit is lost, and this is checked for in
fd_update_events()
- fd_update_events() can compete with fd_delete() at various places
which are explained above. Since fd_delete() clears the thread mask
as after setting its running bit and after setting the FD_MUST_CLOSE
bit, the synchronization loop guarantees that the thread mask is seen
before going further, and that once it's seen, the FD_MUST_CLOSE flag
is already present.
- fd_delete() may start while fd_update_events() has already started,
but fd_delete() must hold a bit in thread_mask before starting, and
that is checked by the first test in fd_update_events() before setting
the running_mask.
- the poller's _update_fd() will not compete against _fd_delete_orphan()
nor fd_insert() thanks to the fd_grab_tgid() that's always done before
updating the polled_mask, and guarantees that we never pretend that a
polled_mask has a bit before the FD is added.
The issue is very hard to reproduce and is extremely time-sensitive.
Some tests were required with a 1-ms timeout with request rates
closely matching 1 kHz per server, though certain tests sometimes
benefitted from saturation. It was found that adding the following
slowdown at a few key places helped a lot and managed to trigger the
bug in 0.5 to 5 seconds instead of tens of minutes on a 20-thread
setup:
{ volatile int i = 10000; while (i--); }
Particularly, placing it at key places where only one of running_mask
or thread_mask is set and not the other one yet (e.g. after the
synchronization loop in fd_update_events or after dropping the
running bit) did yield great results.
Many thanks to Olivier Houchard for this expert help analysing these
races and reviewing candidate fixes.
The patch must be backported to 2.5. Note that 2.6 does not have tgid
in FDs, and that it requires a change of output on fd_clr_running() as
we need the previous bit. This is provided by carefully backporting
commit d6e1987612 ("MINOR: fd: make fd_clr_running() return the previous
value instead"). Tests have shown that the lack of tgid is a showstopper
for 2.6 and that unless a better workaround is found, it could still be
preferable to backport the minimum pieces required for fd_grab_tgid()
to 2.6 so that it stays stable long.
2023-03-07 13:11:02 -05:00
|
|
|
* We still need to set and check the one-shot flag FD_MUST_CLOSE
|
|
|
|
|
* to take care of the rare cases where a thread wakes up on late I/O
|
|
|
|
|
* before the thread_mask is zero, and sets its bit in the running_mask
|
|
|
|
|
* just after the current thread finishes clearing its own bit, hence
|
|
|
|
|
* the two threads see themselves as last ones (which they really are).
|
BUG/MEDIUM: fd: do not wait on FD removal in fd_delete()
Christopher discovered an issue mostly affecting 2.2 and to a less extent
2.3 and above, which is that it's possible to deadlock a soft-stop when
several threads are using a same listener:
thread1 thread2
unbind_listener() fd_set_running()
lock(listener) listener_accept()
fd_delete() lock(listener)
while (running_mask); -----> deadlock
unlock(listener)
This simple case disappeared from 2.3 due to the removal of some locked
operations at the end of listener_accept() on the regular path, but the
architectural problem is still here and caused by a lock inversion built
around the loop on running_mask in fd_clr_running_excl(), because there
are situations where the caller of fd_delete() may hold a lock that is
preventing other threads from dropping their bit in running_mask.
The real need here is to make sure the last user deletes the FD. We have
all we need to know the last one, it's the one calling fd_clr_running()
last, or entering fd_delete() last, both of which can be summed up as
the last one calling fd_clr_running() if fd_delete() calls fd_clr_running()
at the end. And we can prevent new threads from appearing in running_mask
by removing their bits in thread_mask.
So what this patch does is that it sets the running_mask for the thread
in fd_delete(), clears the thread_mask, thus marking the FD as orphaned,
then clears the running mask again, and completes the deletion if it was
the last one. If it was not, another thread will pass through fd_clr_running
and will complete the deletion of the FD.
The bug is easily reproducible in 2.2 under high connection rates during
soft close. When the old process stops its listener, occasionally two
threads will deadlock and the old process will then be killed by the
watchdog. It's strongly believed that similar situations do exist in 2.3
and 2.4 (e.g. if the removal attempt happens during resume_listener()
called from listener_accept()) but if so, they should be much harder to
trigger.
This should be backported to 2.2 as the issue appeared with the FD
migration. It requires previous patches "fd: make fd_clr_running() return
the remaining running mask" and "MINOR: fd: remove the unneeded running
bit from fd_insert()".
Notes for backport: in 2.2, the fd_dodelete() function requires an extra
argument "do_close" indicating whether we want to remove and close the FD
(fd_delete) or just delete it (fd_remove). While this information is not
conveyed along the chain, we know that late calls always imply do_close=1
become do_close=0 exclusively results from fd_remove() which is only used
by the config parser and the master, both of which are single-threaded,
hence are always the last ones in the running_mask. Thus it is safe to
assume that a postponed FD deletion always implies do_close=1.
Thanks to Olivier for his help in designing this optimal solution.
2021-03-24 05:51:32 -04:00
|
|
|
*/
|
|
|
|
|
|
2022-07-07 02:16:08 -04:00
|
|
|
HA_ATOMIC_OR(&fdtab[fd].running_mask, ti->ltid_bit);
|
BUG/MAJOR: fd/threads: close a race on closing connections after takeover
As mentioned in commit 237e6a0d6 ("BUG/MAJOR: fd/thread: fix race between
updates and closing FD"), a race was found during stress tests involving
heavy backend connection reuse with many competing closes.
Here the problem is complex. The analysis in commit f69fea64e ("MAJOR:
fd: get rid of the DWCAS when setting the running_mask") that removed
the DWCAS in 2.5 overlooked a few races.
First, a takeover from thread1 could happen just after fd_update_events()
in thread2 validates it holds the tmask bit in the CAS loop. Since thread1
releases running_mask after the operation, thread2 will succeed the CAS
and both will believe the FD is theirs. This does explain the occasional
crashes seen with h1_io_cb() being called on a bad context, or
sock_conn_iocb() seeing conn->subs vanish after checking it. This issue
can be addressed using a DWCAS in both fd_takeover() and fd_update_events()
as it was before the patch above but this is not portable to all archs and
is not easy to adapt for those lacking it, due to some operations still
happening only on individual masks after the thread groups were added.
Second, the checks after fd_clr_running() for the current thread being
the last one is not sufficient: at the exact moment the operation
completes, another thread may also set and drop the running bit and see
itself as alone, and both can call _fd_close_orphan() in parallel. In
order to prevent this from happening, we cannot rely on the absence of
others, we need an explicit flag indicating that the FD must be closed.
One approach that was attempted consisted in playing with the thread_mask
but that was not reliable since it could still match between the late
deletion and the early insertion that follows. Instead, a new FD flag
was added, FD_MUST_CLOSE, that exactly indicates that the call to
_fd_delete_orphan() must be done. It is set by fd_delete(), and
atomically cleared by the first one which checks it, and which is the
only one to call _fd_delete_orphan().
With both points addressed, there's no more visible race left:
- takeover() only happens under the connection list's lock and cannot
compete with fd_delete() since fd_delete() must first remove the
connection from the list before deleting the FD. That's also why it
doesn't need to call _fd_delete_orphan() when dropping its running
bit.
- takeover() sets its running bit then atomically replaces the thread
mask, so that until that's done, it doesn't validate the condition
to end the synchonization loop in fd_update_events(). Once it's OK,
the previous thread's bit is lost, and this is checked for in
fd_update_events()
- fd_update_events() can compete with fd_delete() at various places
which are explained above. Since fd_delete() clears the thread mask
as after setting its running bit and after setting the FD_MUST_CLOSE
bit, the synchronization loop guarantees that the thread mask is seen
before going further, and that once it's seen, the FD_MUST_CLOSE flag
is already present.
- fd_delete() may start while fd_update_events() has already started,
but fd_delete() must hold a bit in thread_mask before starting, and
that is checked by the first test in fd_update_events() before setting
the running_mask.
- the poller's _update_fd() will not compete against _fd_delete_orphan()
nor fd_insert() thanks to the fd_grab_tgid() that's always done before
updating the polled_mask, and guarantees that we never pretend that a
polled_mask has a bit before the FD is added.
The issue is very hard to reproduce and is extremely time-sensitive.
Some tests were required with a 1-ms timeout with request rates
closely matching 1 kHz per server, though certain tests sometimes
benefitted from saturation. It was found that adding the following
slowdown at a few key places helped a lot and managed to trigger the
bug in 0.5 to 5 seconds instead of tens of minutes on a 20-thread
setup:
{ volatile int i = 10000; while (i--); }
Particularly, placing it at key places where only one of running_mask
or thread_mask is set and not the other one yet (e.g. after the
synchronization loop in fd_update_events or after dropping the
running bit) did yield great results.
Many thanks to Olivier Houchard for this expert help analysing these
races and reviewing candidate fixes.
The patch must be backported to 2.5. Note that 2.6 does not have tgid
in FDs, and that it requires a change of output on fd_clr_running() as
we need the previous bit. This is provided by carefully backporting
commit d6e1987612 ("MINOR: fd: make fd_clr_running() return the previous
value instead"). Tests have shown that the lack of tgid is a showstopper
for 2.6 and that unless a better workaround is found, it could still be
preferable to backport the minimum pieces required for fd_grab_tgid()
to 2.6 so that it stays stable long.
2023-03-07 13:11:02 -05:00
|
|
|
HA_ATOMIC_OR(&fdtab[fd].state, FD_MUST_CLOSE);
|
BUG/MEDIUM: fd: do not wait on FD removal in fd_delete()
Christopher discovered an issue mostly affecting 2.2 and to a less extent
2.3 and above, which is that it's possible to deadlock a soft-stop when
several threads are using a same listener:
thread1 thread2
unbind_listener() fd_set_running()
lock(listener) listener_accept()
fd_delete() lock(listener)
while (running_mask); -----> deadlock
unlock(listener)
This simple case disappeared from 2.3 due to the removal of some locked
operations at the end of listener_accept() on the regular path, but the
architectural problem is still here and caused by a lock inversion built
around the loop on running_mask in fd_clr_running_excl(), because there
are situations where the caller of fd_delete() may hold a lock that is
preventing other threads from dropping their bit in running_mask.
The real need here is to make sure the last user deletes the FD. We have
all we need to know the last one, it's the one calling fd_clr_running()
last, or entering fd_delete() last, both of which can be summed up as
the last one calling fd_clr_running() if fd_delete() calls fd_clr_running()
at the end. And we can prevent new threads from appearing in running_mask
by removing their bits in thread_mask.
So what this patch does is that it sets the running_mask for the thread
in fd_delete(), clears the thread_mask, thus marking the FD as orphaned,
then clears the running mask again, and completes the deletion if it was
the last one. If it was not, another thread will pass through fd_clr_running
and will complete the deletion of the FD.
The bug is easily reproducible in 2.2 under high connection rates during
soft close. When the old process stops its listener, occasionally two
threads will deadlock and the old process will then be killed by the
watchdog. It's strongly believed that similar situations do exist in 2.3
and 2.4 (e.g. if the removal attempt happens during resume_listener()
called from listener_accept()) but if so, they should be much harder to
trigger.
This should be backported to 2.2 as the issue appeared with the FD
migration. It requires previous patches "fd: make fd_clr_running() return
the remaining running mask" and "MINOR: fd: remove the unneeded running
bit from fd_insert()".
Notes for backport: in 2.2, the fd_dodelete() function requires an extra
argument "do_close" indicating whether we want to remove and close the FD
(fd_delete) or just delete it (fd_remove). While this information is not
conveyed along the chain, we know that late calls always imply do_close=1
become do_close=0 exclusively results from fd_remove() which is only used
by the config parser and the master, both of which are single-threaded,
hence are always the last ones in the running_mask. Thus it is safe to
assume that a postponed FD deletion always implies do_close=1.
Thanks to Olivier for his help in designing this optimal solution.
2021-03-24 05:51:32 -04:00
|
|
|
HA_ATOMIC_STORE(&fdtab[fd].thread_mask, 0);
|
BUG/MAJOR: fd/threads: close a race on closing connections after takeover
As mentioned in commit 237e6a0d6 ("BUG/MAJOR: fd/thread: fix race between
updates and closing FD"), a race was found during stress tests involving
heavy backend connection reuse with many competing closes.
Here the problem is complex. The analysis in commit f69fea64e ("MAJOR:
fd: get rid of the DWCAS when setting the running_mask") that removed
the DWCAS in 2.5 overlooked a few races.
First, a takeover from thread1 could happen just after fd_update_events()
in thread2 validates it holds the tmask bit in the CAS loop. Since thread1
releases running_mask after the operation, thread2 will succeed the CAS
and both will believe the FD is theirs. This does explain the occasional
crashes seen with h1_io_cb() being called on a bad context, or
sock_conn_iocb() seeing conn->subs vanish after checking it. This issue
can be addressed using a DWCAS in both fd_takeover() and fd_update_events()
as it was before the patch above but this is not portable to all archs and
is not easy to adapt for those lacking it, due to some operations still
happening only on individual masks after the thread groups were added.
Second, the checks after fd_clr_running() for the current thread being
the last one is not sufficient: at the exact moment the operation
completes, another thread may also set and drop the running bit and see
itself as alone, and both can call _fd_close_orphan() in parallel. In
order to prevent this from happening, we cannot rely on the absence of
others, we need an explicit flag indicating that the FD must be closed.
One approach that was attempted consisted in playing with the thread_mask
but that was not reliable since it could still match between the late
deletion and the early insertion that follows. Instead, a new FD flag
was added, FD_MUST_CLOSE, that exactly indicates that the call to
_fd_delete_orphan() must be done. It is set by fd_delete(), and
atomically cleared by the first one which checks it, and which is the
only one to call _fd_delete_orphan().
With both points addressed, there's no more visible race left:
- takeover() only happens under the connection list's lock and cannot
compete with fd_delete() since fd_delete() must first remove the
connection from the list before deleting the FD. That's also why it
doesn't need to call _fd_delete_orphan() when dropping its running
bit.
- takeover() sets its running bit then atomically replaces the thread
mask, so that until that's done, it doesn't validate the condition
to end the synchonization loop in fd_update_events(). Once it's OK,
the previous thread's bit is lost, and this is checked for in
fd_update_events()
- fd_update_events() can compete with fd_delete() at various places
which are explained above. Since fd_delete() clears the thread mask
as after setting its running bit and after setting the FD_MUST_CLOSE
bit, the synchronization loop guarantees that the thread mask is seen
before going further, and that once it's seen, the FD_MUST_CLOSE flag
is already present.
- fd_delete() may start while fd_update_events() has already started,
but fd_delete() must hold a bit in thread_mask before starting, and
that is checked by the first test in fd_update_events() before setting
the running_mask.
- the poller's _update_fd() will not compete against _fd_delete_orphan()
nor fd_insert() thanks to the fd_grab_tgid() that's always done before
updating the polled_mask, and guarantees that we never pretend that a
polled_mask has a bit before the FD is added.
The issue is very hard to reproduce and is extremely time-sensitive.
Some tests were required with a 1-ms timeout with request rates
closely matching 1 kHz per server, though certain tests sometimes
benefitted from saturation. It was found that adding the following
slowdown at a few key places helped a lot and managed to trigger the
bug in 0.5 to 5 seconds instead of tens of minutes on a 20-thread
setup:
{ volatile int i = 10000; while (i--); }
Particularly, placing it at key places where only one of running_mask
or thread_mask is set and not the other one yet (e.g. after the
synchronization loop in fd_update_events or after dropping the
running bit) did yield great results.
Many thanks to Olivier Houchard for this expert help analysing these
races and reviewing candidate fixes.
The patch must be backported to 2.5. Note that 2.6 does not have tgid
in FDs, and that it requires a change of output on fd_clr_running() as
we need the previous bit. This is provided by carefully backporting
commit d6e1987612 ("MINOR: fd: make fd_clr_running() return the previous
value instead"). Tests have shown that the lack of tgid is a showstopper
for 2.6 and that unless a better workaround is found, it could still be
preferable to backport the minimum pieces required for fd_grab_tgid()
to 2.6 so that it stays stable long.
2023-03-07 13:11:02 -05:00
|
|
|
if (fd_clr_running(fd) == ti->ltid_bit) {
|
|
|
|
|
if (HA_ATOMIC_BTR(&fdtab[fd].state, FD_MUST_CLOSE_BIT)) {
|
|
|
|
|
_fd_delete_orphan(fd);
|
|
|
|
|
}
|
|
|
|
|
}
|
2006-06-25 20:48:02 -04:00
|
|
|
}
|
|
|
|
|
|
2022-04-27 04:50:00 -04:00
|
|
|
/* Switches the new file descriptor <fd> to non-blocking mode, clearing all
 * other O_* flags at the same time, so this is only meant to be used on new
 * FDs. Returns -1 on failure. The result is passed through DISGUISE() before
 * being returned because some callers need to be able to ignore it regardless
 * of the libc attributes.
 */
int fd_set_nonblock(int fd)
{
	int rc;

	rc = fcntl(fd, F_SETFL, O_NONBLOCK);
	return DISGUISE(rc);
}
|
|
|
|
|
|
2022-04-27 04:50:00 -04:00
|
|
|
/* Sets the close-on-exec flag on <fd> while preserving the other descriptor
 * flags. Returns -1 on failure, either when the current flags cannot be read
 * or when the updated flags cannot be applied. The result of the final
 * fcntl() is disguised at the end because some callers need to be able to
 * ignore it regardless of the libc attributes.
 */
int fd_set_cloexec(int fd)
{
	int flags, ret;

	flags = fcntl(fd, F_GETFD);
	if (flags == -1) {
		/* The flags could not be retrieved: do not feed the error
		 * code (all bits set) back into F_SETFD as if it were a
		 * valid set of flags.
		 */
		return -1;
	}

	flags |= FD_CLOEXEC;
	ret = fcntl(fd, F_SETFD, flags);
	return DISGUISE(ret);
}
|
|
|
|
|
|
2023-04-03 09:27:13 -04:00
|
|
|
/* Migrate a FD to a new thread <new_tid>. It is explicitly permitted to
|
|
|
|
|
* migrate to another thread group, the function takes the necessary locking
|
|
|
|
|
* for this. It is even permitted to migrate from a foreign group to another,
|
|
|
|
|
* but the calling thread must be certain that the FD is not about to close
|
|
|
|
|
* when doing so, reason why it is highly recommended that only one of the
|
|
|
|
|
* FD's owners performs this operation. The polling is completely disabled.
|
|
|
|
|
* The operation never fails.
|
|
|
|
|
*/
|
|
|
|
|
void fd_migrate_on(int fd, uint new_tid)
|
|
|
|
|
{
|
|
|
|
|
struct thread_info *new_ti = &ha_thread_info[new_tid];
|
|
|
|
|
|
|
|
|
|
/* we must be alone to work on this idle FD. If not, it means that its
|
|
|
|
|
* poller is currently waking up and is about to use it, likely to
|
|
|
|
|
* close it on shut/error, but maybe also to process any unexpectedly
|
|
|
|
|
* pending data. It's also possible that the FD was closed and
|
|
|
|
|
* reassigned to another thread group, so let's be careful.
|
|
|
|
|
*/
|
|
|
|
|
fd_lock_tgid(fd, new_ti->tgid);
|
|
|
|
|
|
|
|
|
|
/* now we have exclusive access to it. From now FD belongs to tid_bit
|
|
|
|
|
* for this tgid.
|
|
|
|
|
*/
|
|
|
|
|
HA_ATOMIC_STORE(&fdtab[fd].thread_mask, new_ti->ltid_bit);
|
|
|
|
|
|
|
|
|
|
/* Make sure the FD doesn't have the active bit. It is possible that
|
|
|
|
|
* the fd is polled by the thread that used to own it, the new thread
|
|
|
|
|
* is supposed to call subscribe() later, to activate polling.
|
|
|
|
|
*/
|
|
|
|
|
fd_stop_both(fd);
|
|
|
|
|
|
|
|
|
|
/* we're done with it. As soon as we unlock it, other threads from the
|
|
|
|
|
* target group can manipulate it. However it may only disappear once
|
|
|
|
|
* we drop the reference.
|
|
|
|
|
*/
|
|
|
|
|
fd_unlock_tgid(fd);
|
|
|
|
|
fd_drop_tgid(fd);
|
|
|
|
|
}
|
|
|
|
|
|
2020-03-05 12:10:51 -05:00
|
|
|
/*
|
|
|
|
|
* Take over a FD belonging to another thread.
|
|
|
|
|
* unexpected_conn is the expected owner of the fd.
|
|
|
|
|
* Returns 0 on success, and -1 on failure.
|
|
|
|
|
*/
|
|
|
|
|
int fd_takeover(int fd, void *expected_owner)
|
|
|
|
|
{
|
2021-08-03 03:04:32 -04:00
|
|
|
unsigned long old;
|
2025-01-30 05:16:40 -05:00
|
|
|
int changing_tgid = 0;
|
|
|
|
|
int old_ltid, old_tgid;
|
2020-06-18 02:05:15 -04:00
|
|
|
|
2020-03-05 12:10:51 -05:00
|
|
|
/* protect ourself against a delete then an insert for the same fd,
|
|
|
|
|
* if it happens, then the owner will no longer be the expected
|
|
|
|
|
* connection.
|
|
|
|
|
*/
|
2021-08-03 03:04:32 -04:00
|
|
|
if (fdtab[fd].owner != expected_owner)
|
|
|
|
|
return -1;
|
|
|
|
|
|
2025-01-30 05:16:40 -05:00
|
|
|
/* We're taking a connection from a different thread group */
|
|
|
|
|
if ((fdtab[fd].refc_tgid & 0x7fff) != tgid) {
|
|
|
|
|
changing_tgid = 1;
|
|
|
|
|
|
|
|
|
|
old_tgid = fd_tgid(fd);
|
|
|
|
|
BUG_ON(atleast2(fdtab[fd].thread_mask));
|
|
|
|
|
old_ltid = my_ffsl(fdtab[fd].thread_mask) - 1;
|
|
|
|
|
|
|
|
|
|
if (unlikely(!fd_lock_tgid_cur(fd)))
|
|
|
|
|
return -1;
|
|
|
|
|
} else {
|
|
|
|
|
/* we must be alone to work on this idle FD. If not, it means that its
|
|
|
|
|
* poller is currently waking up and is about to use it, likely to
|
|
|
|
|
* close it on shut/error, but maybe also to process any unexpectedly
|
|
|
|
|
* pending data. It's also possible that the FD was closed and
|
|
|
|
|
* reassigned to another thread group, so let's be careful.
|
|
|
|
|
*/
|
|
|
|
|
if (unlikely(!fd_grab_tgid(fd, ti->tgid)))
|
|
|
|
|
return -1;
|
|
|
|
|
}
|
2022-07-06 12:47:38 -04:00
|
|
|
|
2021-08-03 03:04:32 -04:00
|
|
|
old = 0;
|
2022-07-07 02:16:08 -04:00
|
|
|
if (!HA_ATOMIC_CAS(&fdtab[fd].running_mask, &old, ti->ltid_bit)) {
|
2025-01-30 05:16:40 -05:00
|
|
|
if (changing_tgid)
|
|
|
|
|
fd_unlock_tgid(fd);
|
2022-07-06 12:47:38 -04:00
|
|
|
fd_drop_tgid(fd);
|
2021-08-03 03:04:32 -04:00
|
|
|
return -1;
|
2022-07-06 12:47:38 -04:00
|
|
|
}
|
2020-06-18 02:14:59 -04:00
|
|
|
|
2021-08-03 03:04:32 -04:00
|
|
|
/* success, from now on it's ours */
|
2022-07-07 02:23:03 -04:00
|
|
|
HA_ATOMIC_STORE(&fdtab[fd].thread_mask, ti->ltid_bit);
|
2020-06-18 02:14:59 -04:00
|
|
|
|
2025-01-30 05:16:40 -05:00
|
|
|
/*
|
|
|
|
|
* Change the tgid to our own tgid.
|
|
|
|
|
* This removes the lock, we don't need it anymore, but we keep
|
|
|
|
|
* the refcount.
|
|
|
|
|
*/
|
|
|
|
|
if (changing_tgid) {
|
|
|
|
|
fd_update_tgid(fd, tgid);
|
|
|
|
|
if (cur_poller.fixup_tgid_takeover)
|
|
|
|
|
cur_poller.fixup_tgid_takeover(&cur_poller, fd, old_ltid, old_tgid);
|
|
|
|
|
}
|
|
|
|
|
|
2020-06-17 14:34:05 -04:00
|
|
|
/* Make sure the FD doesn't have the active bit. It is possible that
|
|
|
|
|
* the fd is polled by the thread that used to own it, the new thread
|
|
|
|
|
* is supposed to call subscribe() later, to activate polling.
|
|
|
|
|
*/
|
2021-08-03 03:04:32 -04:00
|
|
|
fd_stop_recv(fd);
|
|
|
|
|
|
2025-01-30 09:59:11 -05:00
|
|
|
/* essentially for debugging */
|
|
|
|
|
fdtab[fd].nb_takeover++;
|
|
|
|
|
|
2021-08-03 03:04:32 -04:00
|
|
|
/* we're done with it */
|
2022-07-07 02:16:08 -04:00
|
|
|
HA_ATOMIC_AND(&fdtab[fd].running_mask, ~ti->ltid_bit);
|
2022-07-06 12:47:38 -04:00
|
|
|
|
|
|
|
|
/* no more changes planned */
|
|
|
|
|
fd_drop_tgid(fd);
|
2021-08-03 03:04:32 -04:00
|
|
|
return 0;
|
2020-03-05 12:10:51 -05:00
|
|
|
}
|
|
|
|
|
|
2019-09-04 07:25:41 -04:00
|
|
|
void updt_fd_polling(const int fd)
|
|
|
|
|
{
|
2022-07-15 14:12:31 -04:00
|
|
|
uint tgrp = fd_take_tgid(fd);
|
|
|
|
|
|
|
|
|
|
/* closed ? may happen */
|
|
|
|
|
if (!tgrp)
|
|
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
if (unlikely(tgrp != tgid && tgrp <= MAX_TGROUPS)) {
|
|
|
|
|
/* Hmmm delivered an update for another group... That may
|
|
|
|
|
* happen on suspend/resume of a listener for example when
|
|
|
|
|
* the FD was not even marked for running. Let's broadcast
|
|
|
|
|
* the update.
|
|
|
|
|
*/
|
|
|
|
|
unsigned long update_mask = fdtab[fd].update_mask;
|
|
|
|
|
int thr;
|
|
|
|
|
|
2023-01-19 13:14:18 -05:00
|
|
|
while (!_HA_ATOMIC_CAS(&fdtab[fd].update_mask, &update_mask,
|
|
|
|
|
_HA_ATOMIC_LOAD(&ha_tgroup_info[tgrp - 1].threads_enabled)))
|
2022-07-15 14:12:31 -04:00
|
|
|
__ha_cpu_relax();
|
|
|
|
|
|
|
|
|
|
fd_add_to_fd_list(&update_list[tgrp - 1], fd);
|
|
|
|
|
|
2023-01-19 11:10:10 -05:00
|
|
|
thr = one_among_mask(fdtab[fd].thread_mask & ha_tgroup_info[tgrp - 1].threads_enabled,
|
|
|
|
|
statistical_prng_range(ha_tgroup_info[tgrp - 1].count));
|
2022-07-15 14:12:31 -04:00
|
|
|
thr += ha_tgroup_info[tgrp - 1].base;
|
|
|
|
|
wake_thread(thr);
|
|
|
|
|
|
|
|
|
|
fd_drop_tgid(fd);
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fd_drop_tgid(fd);
|
|
|
|
|
|
2022-07-07 02:23:03 -04:00
|
|
|
if (tg->threads_enabled == 1UL || (fdtab[fd].thread_mask & tg->threads_enabled) == ti->ltid_bit) {
|
2022-07-05 13:21:06 -04:00
|
|
|
if (HA_ATOMIC_BTS(&fdtab[fd].update_mask, ti->ltid))
|
2019-09-04 07:25:41 -04:00
|
|
|
return;
|
|
|
|
|
|
|
|
|
|
fd_updt[fd_nbupdt++] = fd;
|
|
|
|
|
} else {
|
|
|
|
|
unsigned long update_mask = fdtab[fd].update_mask;
|
|
|
|
|
do {
|
2022-07-05 13:21:06 -04:00
|
|
|
if (update_mask == fdtab[fd].thread_mask) // FIXME: this works only on thread-groups 1
|
2019-09-04 07:25:41 -04:00
|
|
|
return;
|
2020-09-25 06:18:53 -04:00
|
|
|
} while (!_HA_ATOMIC_CAS(&fdtab[fd].update_mask, &update_mask, fdtab[fd].thread_mask));
|
|
|
|
|
|
2022-07-08 05:33:43 -04:00
|
|
|
fd_add_to_fd_list(&update_list[tgid - 1], fd);
|
2020-09-25 06:18:53 -04:00
|
|
|
|
2022-07-07 02:23:03 -04:00
|
|
|
if (fd_active(fd) && !(fdtab[fd].thread_mask & ti->ltid_bit)) {
|
2022-06-23 12:31:08 -04:00
|
|
|
/* we need to wake up another thread to handle it immediately, any will fit,
|
|
|
|
|
* so let's pick a random one so that it doesn't always end up on the same.
|
|
|
|
|
*/
|
2022-07-07 02:23:03 -04:00
|
|
|
int thr = one_among_mask(fdtab[fd].thread_mask & tg->threads_enabled,
|
2023-01-19 11:10:10 -05:00
|
|
|
statistical_prng_range(tg->count));
|
|
|
|
|
thr += tg->base;
|
2020-09-25 06:18:53 -04:00
|
|
|
wake_thread(thr);
|
|
|
|
|
}
|
2019-09-04 07:25:41 -04:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2021-07-29 10:53:46 -04:00
|
|
|
/* Update events seen for FD <fd> and its state if needed. This should be
|
|
|
|
|
* called by the poller, passing FD_EV_*_{R,W,RW} in <evts>. FD_EV_ERR_*
|
|
|
|
|
* doesn't need to also pass FD_EV_SHUT_*, it's implied. ERR and SHUT are
|
MEDIUM: fd: rely more on fd_update_events() to detect changes
This function already performs a number of checks prior to calling the
IOCB, and detects the change of thread (FD migration). Half of the
controls are still in each poller, and these pollers also maintain
activity counters for various cases.
Note that the unreliable test on thread_mask was removed so that only
the one performed by fd_set_running() is now used, since this one is
reliable.
Let's centralize all that fd-specific logic into the function and make
it return a status among:
FD_UPDT_DONE, // update done, nothing else to be done
FD_UPDT_DEAD, // FD was already dead, ignore it
FD_UPDT_CLOSED, // FD was closed
FD_UPDT_MIGRATED, // FD was migrated, ignore it now
Some pollers already used to call it last and have nothing to do after
it, regardless of the result. epoll has to delete the FD in case a
migration is detected. Overall this removes more code than it adds.
2021-07-29 10:57:19 -04:00
|
|
|
* allowed to be reported regardless of R/W readiness. Returns one of
|
|
|
|
|
* FD_UPDT_*.
|
2021-07-29 10:53:46 -04:00
|
|
|
*/
|
MEDIUM: fd: rely more on fd_update_events() to detect changes
This function already performs a number of checks prior to calling the
IOCB, and detects the change of thread (FD migration). Half of the
controls are still in each poller, and these pollers also maintain
activity counters for various cases.
Note that the unreliable test on thread_mask was removed so that only
the one performed by fd_set_running() is now used, since this one is
reliable.
Let's centralize all that fd-specific logic into the function and make
it return a status among:
FD_UPDT_DONE, // update done, nothing else to be done
FD_UPDT_DEAD, // FD was already dead, ignore it
FD_UPDT_CLOSED, // FD was closed
FD_UPDT_MIGRATED, // FD was migrated, ignore it now
Some pollers already used to call it last and have nothing to do after
it, regardless of the result. epoll has to delete the FD in case a
migration is detected. Overall this removes more code than it adds.
2021-07-29 10:57:19 -04:00
|
|
|
int fd_update_events(int fd, uint evts)
|
2021-07-29 10:53:46 -04:00
|
|
|
{
|
|
|
|
|
unsigned long locked;
|
|
|
|
|
uint old, new;
|
|
|
|
|
uint new_flags, must_stop;
|
2021-08-03 03:04:32 -04:00
|
|
|
ulong rmask, tmask;
|
2021-07-29 10:53:46 -04:00
|
|
|
|
2022-06-22 03:19:46 -04:00
|
|
|
_HA_ATOMIC_AND(&th_ctx->flags, ~TH_FL_STUCK); // this thread is still running
|
2021-07-29 10:53:46 -04:00
|
|
|
|
2022-07-06 12:47:38 -04:00
|
|
|
if (unlikely(!fd_grab_tgid(fd, ti->tgid))) {
|
|
|
|
|
/* the FD changed to another tgid, we can't safely
|
|
|
|
|
* check it anymore. The bits in the masks are not
|
|
|
|
|
* ours anymore and we're not allowed to touch them.
|
|
|
|
|
* Ours have already been cleared and the FD was
|
|
|
|
|
* closed in between so we can safely leave now.
|
|
|
|
|
*/
|
|
|
|
|
activity[tid].poll_drop_fd++;
|
|
|
|
|
return FD_UPDT_CLOSED;
|
|
|
|
|
}
|
|
|
|
|
|
BUG/MAJOR: fd/threads: close a race on closing connections after takeover
As mentioned in commit 237e6a0d6 ("BUG/MAJOR: fd/thread: fix race between
updates and closing FD"), a race was found during stress tests involving
heavy backend connection reuse with many competing closes.
Here the problem is complex. The analysis in commit f69fea64e ("MAJOR:
fd: get rid of the DWCAS when setting the running_mask") that removed
the DWCAS in 2.5 overlooked a few races.
First, a takeover from thread1 could happen just after fd_update_events()
in thread2 validates it holds the tmask bit in the CAS loop. Since thread1
releases running_mask after the operation, thread2 will succeed the CAS
and both will believe the FD is theirs. This does explain the occasional
crashes seen with h1_io_cb() being called on a bad context, or
sock_conn_iocb() seeing conn->subs vanish after checking it. This issue
can be addressed using a DWCAS in both fd_takeover() and fd_update_events()
as it was before the patch above but this is not portable to all archs and
is not easy to adapt for those lacking it, due to some operations still
happening only on individual masks after the thread groups were added.
Second, the checks after fd_clr_running() for the current thread being
the last one is not sufficient: at the exact moment the operation
completes, another thread may also set and drop the running bit and see
itself as alone, and both can call _fd_close_orphan() in parallel. In
order to prevent this from happening, we cannot rely on the absence of
others, we need an explicit flag indicating that the FD must be closed.
One approach that was attempted consisted in playing with the thread_mask
but that was not reliable since it could still match between the late
deletion and the early insertion that follows. Instead, a new FD flag
was added, FD_MUST_CLOSE, that exactly indicates that the call to
_fd_delete_orphan() must be done. It is set by fd_delete(), and
atomically cleared by the first one which checks it, and which is the
only one to call _fd_delete_orphan().
With both points addressed, there's no more visible race left:
- takeover() only happens under the connection list's lock and cannot
compete with fd_delete() since fd_delete() must first remove the
connection from the list before deleting the FD. That's also why it
doesn't need to call _fd_delete_orphan() when dropping its running
bit.
- takeover() sets its running bit then atomically replaces the thread
mask, so that until that's done, it doesn't validate the condition
to end the synchronization loop in fd_update_events(). Once it's OK,
the previous thread's bit is lost, and this is checked for in
fd_update_events()
- fd_update_events() can compete with fd_delete() at various places
which are explained above. Since fd_delete() clears the thread mask
as after setting its running bit and after setting the FD_MUST_CLOSE
bit, the synchronization loop guarantees that the thread mask is seen
before going further, and that once it's seen, the FD_MUST_CLOSE flag
is already present.
- fd_delete() may start while fd_update_events() has already started,
but fd_delete() must hold a bit in thread_mask before starting, and
that is checked by the first test in fd_update_events() before setting
the running_mask.
- the poller's _update_fd() will not compete against _fd_delete_orphan()
nor fd_insert() thanks to the fd_grab_tgid() that's always done before
updating the polled_mask, and guarantees that we never pretend that a
polled_mask has a bit before the FD is added.
The issue is very hard to reproduce and is extremely time-sensitive.
Some tests were required with a 1-ms timeout with request rates
closely matching 1 kHz per server, though certain tests sometimes
benefitted from saturation. It was found that adding the following
slowdown at a few key places helped a lot and managed to trigger the
bug in 0.5 to 5 seconds instead of tens of minutes on a 20-thread
setup:
{ volatile int i = 10000; while (i--); }
Particularly, placing it at key places where only one of running_mask
or thread_mask is set and not the other one yet (e.g. after the
synchronization loop in fd_update_events or after dropping the
running bit) did yield great results.
Many thanks to Olivier Houchard for this expert help analysing these
races and reviewing candidate fixes.
The patch must be backported to 2.5. Note that 2.6 does not have tgid
in FDs, and that it requires a change of output on fd_clr_running() as
we need the previous bit. This is provided by carefully backporting
commit d6e1987612 ("MINOR: fd: make fd_clr_running() return the previous
value instead"). Tests have shown that the lack of tgid is a showstopper
for 2.6 and that unless a better workaround is found, it could still be
preferable to backport the minimum pieces required for fd_grab_tgid()
to 2.6 so that it stays stable long.
2023-03-07 13:11:02 -05:00
|
|
|
/* Do not take running_mask if not strictly needed (will trigger a
|
|
|
|
|
* cosmetic BUG_ON() in fd_insert() anyway if done).
|
|
|
|
|
*/
|
|
|
|
|
tmask = _HA_ATOMIC_LOAD(&fdtab[fd].thread_mask);
|
|
|
|
|
if (!(tmask & ti->ltid_bit))
|
|
|
|
|
goto do_update;
|
|
|
|
|
|
|
|
|
|
HA_ATOMIC_OR(&fdtab[fd].running_mask, ti->ltid_bit);
|
|
|
|
|
|
|
|
|
|
/* From this point, our bit may possibly be in thread_mask, but it may
|
|
|
|
|
* still vanish, either because a takeover completed just before taking
|
|
|
|
|
* the bit above with the new owner deleting the FD, or because a
|
|
|
|
|
* takeover started just before taking the bit. In order to make sure a
|
|
|
|
|
* started takeover is complete, we need to verify that all bits of
|
|
|
|
|
* running_mask are present in thread_mask, since takeover first takes
|
|
|
|
|
* running then atomically replaces thread_mask. Once it's stable, if
|
|
|
|
|
* our bit remains there, no further takeover may happen because we
|
|
|
|
|
* hold running, but if our bit is not there it means we've lost the
|
|
|
|
|
* takeover race and have to decline touching the FD. Regarding the
|
|
|
|
|
* risk of deletion, our bit in running_mask prevents fd_delete() from
|
|
|
|
|
* finalizing the close, and the caller will leave the FD with a zero
|
|
|
|
|
* thread_mask and the FD_MUST_CLOSE flag set. It will then be our
|
|
|
|
|
* responsibility to close it.
|
|
|
|
|
*/
|
2021-08-03 03:04:32 -04:00
|
|
|
do {
|
BUG/MAJOR: fd/threads: close a race on closing connections after takeover
As mentioned in commit 237e6a0d6 ("BUG/MAJOR: fd/thread: fix race between
updates and closing FD"), a race was found during stress tests involving
heavy backend connection reuse with many competing closes.
Here the problem is complex. The analysis in commit f69fea64e ("MAJOR:
fd: get rid of the DWCAS when setting the running_mask") that removed
the DWCAS in 2.5 overlooked a few races.
First, a takeover from thread1 could happen just after fd_update_events()
in thread2 validates it holds the tmask bit in the CAS loop. Since thread1
releases running_mask after the operation, thread2 will succeed the CAS
and both will believe the FD is theirs. This does explain the occasional
crashes seen with h1_io_cb() being called on a bad context, or
sock_conn_iocb() seeing conn->subs vanish after checking it. This issue
can be addressed using a DWCAS in both fd_takeover() and fd_update_events()
as it was before the patch above but this is not portable to all archs and
is not easy to adapt for those lacking it, due to some operations still
happening only on individual masks after the thread groups were added.
Second, the checks after fd_clr_running() for the current thread being
the last one is not sufficient: at the exact moment the operation
completes, another thread may also set and drop the running bit and see
itself as alone, and both can call _fd_close_orphan() in parallel. In
order to prevent this from happening, we cannot rely on the absence of
others, we need an explicit flag indicating that the FD must be closed.
One approach that was attempted consisted in playing with the thread_mask
but that was not reliable since it could still match between the late
deletion and the early insertion that follows. Instead, a new FD flag
was added, FD_MUST_CLOSE, that exactly indicates that the call to
_fd_delete_orphan() must be done. It is set by fd_delete(), and
atomically cleared by the first one which checks it, and which is the
only one to call _fd_delete_orphan().
With both points addressed, there's no more visible race left:
- takeover() only happens under the connection list's lock and cannot
compete with fd_delete() since fd_delete() must first remove the
connection from the list before deleting the FD. That's also why it
doesn't need to call _fd_delete_orphan() when dropping its running
bit.
- takeover() sets its running bit then atomically replaces the thread
mask, so that until that's done, it doesn't validate the condition
to end the synchronization loop in fd_update_events(). Once it's OK,
the previous thread's bit is lost, and this is checked for in
fd_update_events()
- fd_update_events() can compete with fd_delete() at various places
which are explained above. Since fd_delete() clears the thread mask
as after setting its running bit and after setting the FD_MUST_CLOSE
bit, the synchronization loop guarantees that the thread mask is seen
before going further, and that once it's seen, the FD_MUST_CLOSE flag
is already present.
- fd_delete() may start while fd_update_events() has already started,
but fd_delete() must hold a bit in thread_mask before starting, and
that is checked by the first test in fd_update_events() before setting
the running_mask.
- the poller's _update_fd() will not compete against _fd_delete_orphan()
nor fd_insert() thanks to the fd_grab_tgid() that's always done before
updating the polled_mask, and guarantees that we never pretend that a
polled_mask has a bit before the FD is added.
The issue is very hard to reproduce and is extremely time-sensitive.
Some tests were required with a 1-ms timeout with request rates
closely matching 1 kHz per server, though certain tests sometimes
benefitted from saturation. It was found that adding the following
slowdown at a few key places helped a lot and managed to trigger the
bug in 0.5 to 5 seconds instead of tens of minutes on a 20-thread
setup:
{ volatile int i = 10000; while (i--); }
Particularly, placing it at key places where only one of running_mask
or thread_mask is set and not the other one yet (e.g. after the
synchronization loop in fd_update_events or after dropping the
running bit) did yield great results.
Many thanks to Olivier Houchard for this expert help analysing these
races and reviewing candidate fixes.
The patch must be backported to 2.5. Note that 2.6 does not have tgid
in FDs, and that it requires a change of output on fd_clr_running() as
we need the previous bit. This is provided by carefully backporting
commit d6e1987612 ("MINOR: fd: make fd_clr_running() return the previous
value instead"). Tests have shown that the lack of tgid is a showstopper
for 2.6 and that unless a better workaround is found, it could still be
preferable to backport the minimum pieces required for fd_grab_tgid()
to 2.6 so that it stays stable long.
2023-03-07 13:11:02 -05:00
|
|
|
rmask = _HA_ATOMIC_LOAD(&fdtab[fd].running_mask);
|
|
|
|
|
tmask = _HA_ATOMIC_LOAD(&fdtab[fd].thread_mask);
|
|
|
|
|
rmask &= ~ti->ltid_bit;
|
BUG/MEDIUM: fd: don't wait for tmask to stabilize if we're not in it.
In fd_update_events(), we loop until there's no bit in the running_mask
that is not in the thread_mask. Problem is, the thread sets its
running_mask bit before that loop, and so if 2 threads do the same, and
a 3rd one just closes the FD and sets the thread_mask to 0, then
running_mask will always be non-zero, and we will loop forever. This is
trivial to reproduce when using a DNS resolver that will just answer
"port unreachable", but could theoretically happen with other types of
file descriptors too.
To fix that, just don't bother looping if we're no longer in the
thread_mask, if that happens we know we won't have to take care of the
FD, anyway.
This should be backported to 2.7, 2.6 and 2.5.
2023-04-13 10:12:38 -04:00
|
|
|
} while ((rmask & ~tmask) && (tmask & ti->ltid_bit));
|
2021-08-03 03:04:32 -04:00
|
|
|
|
BUG/MAJOR: fd/threads: close a race on closing connections after takeover
As mentioned in commit 237e6a0d6 ("BUG/MAJOR: fd/thread: fix race between
updates and closing FD"), a race was found during stress tests involving
heavy backend connection reuse with many competing closes.
Here the problem is complex. The analysis in commit f69fea64e ("MAJOR:
fd: get rid of the DWCAS when setting the running_mask") that removed
the DWCAS in 2.5 overlooked a few races.
First, a takeover from thread1 could happen just after fd_update_events()
in thread2 validates it holds the tmask bit in the CAS loop. Since thread1
releases running_mask after the operation, thread2 will succeed the CAS
and both will believe the FD is theirs. This does explain the occasional
crashes seen with h1_io_cb() being called on a bad context, or
sock_conn_iocb() seeing conn->subs vanish after checking it. This issue
can be addressed using a DWCAS in both fd_takeover() and fd_update_events()
as it was before the patch above but this is not portable to all archs and
is not easy to adapt for those lacking it, due to some operations still
happening only on individual masks after the thread groups were added.
Second, the checks after fd_clr_running() for the current thread being
the last one is not sufficient: at the exact moment the operation
completes, another thread may also set and drop the running bit and see
itself as alone, and both can call _fd_close_orphan() in parallel. In
order to prevent this from happening, we cannot rely on the absence of
others, we need an explicit flag indicating that the FD must be closed.
One approach that was attempted consisted in playing with the thread_mask
but that was not reliable since it could still match between the late
deletion and the early insertion that follows. Instead, a new FD flag
was added, FD_MUST_CLOSE, that exactly indicates that the call to
_fd_delete_orphan() must be done. It is set by fd_delete(), and
atomically cleared by the first one which checks it, and which is the
only one to call _fd_delete_orphan().
With both points addressed, there's no more visible race left:
- takeover() only happens under the connection list's lock and cannot
compete with fd_delete() since fd_delete() must first remove the
connection from the list before deleting the FD. That's also why it
doesn't need to call _fd_delete_orphan() when dropping its running
bit.
- takeover() sets its running bit then atomically replaces the thread
mask, so that until that's done, it doesn't validate the condition
to end the synchronization loop in fd_update_events(). Once it's OK,
the previous thread's bit is lost, and this is checked for in
fd_update_events()
- fd_update_events() can compete with fd_delete() at various places
which are explained above. Since fd_delete() clears the thread mask
as after setting its running bit and after setting the FD_MUST_CLOSE
bit, the synchronization loop guarantees that the thread mask is seen
before going further, and that once it's seen, the FD_MUST_CLOSE flag
is already present.
- fd_delete() may start while fd_update_events() has already started,
but fd_delete() must hold a bit in thread_mask before starting, and
that is checked by the first test in fd_update_events() before setting
the running_mask.
- the poller's _update_fd() will not compete against _fd_delete_orphan()
nor fd_insert() thanks to the fd_grab_tgid() that's always done before
updating the polled_mask, and guarantees that we never pretend that a
polled_mask has a bit before the FD is added.
The issue is very hard to reproduce and is extremely time-sensitive.
Some tests were required with a 1-ms timeout with request rates
closely matching 1 kHz per server, though certain tests sometimes
benefitted from saturation. It was found that adding the following
slowdown at a few key places helped a lot and managed to trigger the
bug in 0.5 to 5 seconds instead of tens of minutes on a 20-thread
setup:
{ volatile int i = 10000; while (i--); }
Particularly, placing it at key places where only one of running_mask
or thread_mask is set and not the other one yet (e.g. after the
synchronization loop in fd_update_events or after dropping the
running bit) did yield great results.
Many thanks to Olivier Houchard for this expert help analysing these
races and reviewing candidate fixes.
The patch must be backported to 2.5. Note that 2.6 does not have tgid
in FDs, and that it requires a change of output on fd_clr_running() as
we need the previous bit. This is provided by carefully backporting
commit d6e1987612 ("MINOR: fd: make fd_clr_running() return the previous
value instead"). Tests have shown that the lack of tgid is a showstopper
for 2.6 and that unless a better workaround is found, it could still be
preferable to backport the minimum pieces required for fd_grab_tgid()
to 2.6 so that it stays stable long.
2023-03-07 13:11:02 -05:00
|
|
|
/* Now tmask is stable. Do nothing if the FD was taken over under us */
|
2022-07-09 12:55:37 -04:00
|
|
|
|
BUG/MAJOR: fd/threads: close a race on closing connections after takeover
As mentioned in commit 237e6a0d6 ("BUG/MAJOR: fd/thread: fix race between
updates and closing FD"), a race was found during stress tests involving
heavy backend connection reuse with many competing closes.
Here the problem is complex. The analysis in commit f69fea64e ("MAJOR:
fd: get rid of the DWCAS when setting the running_mask") that removed
the DWCAS in 2.5 overlooked a few races.
First, a takeover from thread1 could happen just after fd_update_events()
in thread2 validates it holds the tmask bit in the CAS loop. Since thread1
releases running_mask after the operation, thread2 will succeed the CAS
and both will believe the FD is theirs. This does explain the occasional
crashes seen with h1_io_cb() being called on a bad context, or
sock_conn_iocb() seeing conn->subs vanish after checking it. This issue
can be addressed using a DWCAS in both fd_takeover() and fd_update_events()
as it was before the patch above but this is not portable to all archs and
is not easy to adapt for those lacking it, due to some operations still
happening only on individual masks after the thread groups were added.
Second, the checks after fd_clr_running() for the current thread being
the last one is not sufficient: at the exact moment the operation
completes, another thread may also set and drop the running bit and see
itself as alone, and both can call _fd_close_orphan() in parallel. In
order to prevent this from happening, we cannot rely on the absence of
others, we need an explicit flag indicating that the FD must be closed.
One approach that was attempted consisted in playing with the thread_mask
but that was not reliable since it could still match between the late
deletion and the early insertion that follows. Instead, a new FD flag
was added, FD_MUST_CLOSE, that exactly indicates that the call to
_fd_delete_orphan() must be done. It is set by fd_delete(), and
atomically cleared by the first one which checks it, and which is the
only one to call _fd_delete_orphan().
With both points addressed, there's no more visible race left:
- takeover() only happens under the connection list's lock and cannot
compete with fd_delete() since fd_delete() must first remove the
connection from the list before deleting the FD. That's also why it
doesn't need to call _fd_delete_orphan() when dropping its running
bit.
- takeover() sets its running bit then atomically replaces the thread
mask, so that until that's done, it doesn't validate the condition
to end the synchronization loop in fd_update_events(). Once it's OK,
the previous thread's bit is lost, and this is checked for in
fd_update_events()
- fd_update_events() can compete with fd_delete() at various places
which are explained above. Since fd_delete() clears the thread mask
as after setting its running bit and after setting the FD_MUST_CLOSE
bit, the synchronization loop guarantees that the thread mask is seen
before going further, and that once it's seen, the FD_MUST_CLOSE flag
is already present.
- fd_delete() may start while fd_update_events() has already started,
but fd_delete() must hold a bit in thread_mask before starting, and
that is checked by the first test in fd_update_events() before setting
the running_mask.
- the poller's _update_fd() will not compete against _fd_delete_orphan()
nor fd_insert() thanks to the fd_grab_tgid() that's always done before
updating the polled_mask, and guarantees that we never pretend that a
polled_mask has a bit before the FD is added.
The issue is very hard to reproduce and is extremely time-sensitive.
Some tests were required with a 1-ms timeout with request rates
closely matching 1 kHz per server, though certain tests sometimes
benefitted from saturation. It was found that adding the following
slowdown at a few key places helped a lot and managed to trigger the
bug in 0.5 to 5 seconds instead of tens of minutes on a 20-thread
setup:
{ volatile int i = 10000; while (i--); }
Particularly, placing it at key places where only one of running_mask
or thread_mask is set and not the other one yet (e.g. after the
synchronization loop in fd_update_events or after dropping the
running bit) did yield great results.
Many thanks to Olivier Houchard for this expert help analysing these
races and reviewing candidate fixes.
The patch must be backported to 2.5. Note that 2.6 does not have tgid
in FDs, and that it requires a change of output on fd_clr_running() as
we need the previous bit. This is provided by carefully backporting
commit d6e1987612 ("MINOR: fd: make fd_clr_running() return the previous
value instead"). Tests have shown that the lack of tgid is a showstopper
for 2.6 and that unless a better workaround is found, it could still be
preferable to backport the minimum pieces required for fd_grab_tgid()
to 2.6 so that it stays stable long.
2023-03-07 13:11:02 -05:00
|
|
|
if (!(tmask & ti->ltid_bit)) {
|
|
|
|
|
/* a takeover has started */
|
|
|
|
|
activity[tid].poll_skip_fd++;
|
2022-07-06 12:47:38 -04:00
|
|
|
|
BUG/MAJOR: fd/threads: close a race on closing connections after takeover
As mentioned in commit 237e6a0d6 ("BUG/MAJOR: fd/thread: fix race between
updates and closing FD"), a race was found during stress tests involving
heavy backend connection reuse with many competing closes.
Here the problem is complex. The analysis in commit f69fea64e ("MAJOR:
fd: get rid of the DWCAS when setting the running_mask") that removed
the DWCAS in 2.5 overlooked a few races.
First, a takeover from thread1 could happen just after fd_update_events()
in thread2 validates it holds the tmask bit in the CAS loop. Since thread1
releases running_mask after the operation, thread2 will succeed the CAS
and both will believe the FD is theirs. This does explain the occasional
crashes seen with h1_io_cb() being called on a bad context, or
sock_conn_iocb() seeing conn->subs vanish after checking it. This issue
can be addressed using a DWCAS in both fd_takeover() and fd_update_events()
as it was before the patch above but this is not portable to all archs and
is not easy to adapt for those lacking it, due to some operations still
happening only on individual masks after the thread groups were added.
Second, the check after fd_clr_running() for the current thread being
the last one is not sufficient: at the exact moment the operation
completes, another thread may also set and drop the running bit and see
itself as alone, and both can call _fd_delete_orphan() in parallel. In
order to prevent this from happening, we cannot rely on the absence of
others, we need an explicit flag indicating that the FD must be closed.
One approach that was attempted consisted in playing with the thread_mask
but that was not reliable since it could still match between the late
deletion and the early insertion that follows. Instead, a new FD flag
was added, FD_MUST_CLOSE, that exactly indicates that the call to
_fd_delete_orphan() must be done. It is set by fd_delete(), and
atomically cleared by the first one which checks it, and which is the
only one to call _fd_delete_orphan().
With both points addressed, there's no more visible race left:
- takeover() only happens under the connection list's lock and cannot
compete with fd_delete() since fd_delete() must first remove the
connection from the list before deleting the FD. That's also why it
doesn't need to call _fd_delete_orphan() when dropping its running
bit.
- takeover() sets its running bit then atomically replaces the thread
mask, so that until that's done, it doesn't validate the condition
to end the synchronization loop in fd_update_events(). Once it's OK,
the previous thread's bit is lost, and this is checked for in
fd_update_events()
- fd_update_events() can compete with fd_delete() at various places
which are explained above. Since fd_delete() clears the thread mask
after setting its running bit and after setting the FD_MUST_CLOSE
bit, the synchronization loop guarantees that the thread mask is seen
before going further, and that once it's seen, the FD_MUST_CLOSE flag
is already present.
- fd_delete() may start while fd_update_events() has already started,
but fd_delete() must hold a bit in thread_mask before starting, and
that is checked by the first test in fd_update_events() before setting
the running_mask.
- the poller's _update_fd() will not compete against _fd_delete_orphan()
nor fd_insert() thanks to the fd_grab_tgid() that's always done before
updating the polled_mask, and guarantees that we never pretend that a
polled_mask has a bit before the FD is added.
The issue is very hard to reproduce and is extremely time-sensitive.
Some tests were required with a 1-ms timeout with request rates
closely matching 1 kHz per server, though certain tests sometimes
benefitted from saturation. It was found that adding the following
slowdown at a few key places helped a lot and managed to trigger the
bug in 0.5 to 5 seconds instead of tens of minutes on a 20-thread
setup:
{ volatile int i = 10000; while (i--); }
Particularly, placing it at key places where only one of running_mask
or thread_mask is set and not the other one yet (e.g. after the
synchronization loop in fd_update_events or after dropping the
running bit) did yield great results.
Many thanks to Olivier Houchard for this expert help analysing these
races and reviewing candidate fixes.
The patch must be backported to 2.5. Note that 2.6 does not have tgid
in FDs, and that it requires a change of output on fd_clr_running() as
we need the previous bit. This is provided by carefully backporting
commit d6e1987612 ("MINOR: fd: make fd_clr_running() return the previous
value instead"). Tests have shown that the lack of tgid is a showstopper
for 2.6 and that unless a better workaround is found, it could still be
preferable to backport the minimum pieces required for fd_grab_tgid()
to 2.6 so that it stays stable long.
2023-03-07 13:11:02 -05:00
|
|
|
if (fd_clr_running(fd) == ti->ltid_bit)
|
|
|
|
|
goto closed_or_migrated;
|
|
|
|
|
|
|
|
|
|
goto do_update;
|
|
|
|
|
}
|
2021-07-29 10:53:46 -04:00
|
|
|
|
2022-07-06 12:47:38 -04:00
|
|
|
/* with running we're safe now, we can drop the reference */
|
|
|
|
|
fd_drop_tgid(fd);
|
|
|
|
|
|
2022-07-07 02:23:03 -04:00
|
|
|
locked = (tmask != ti->ltid_bit);
|
2021-07-29 10:53:46 -04:00
|
|
|
|
|
|
|
|
/* OK now we are guaranteed that our thread_mask was present and
|
|
|
|
|
* that we're allowed to update the FD.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
new_flags =
|
|
|
|
|
((evts & FD_EV_READY_R) ? FD_POLL_IN : 0) |
|
|
|
|
|
((evts & FD_EV_READY_W) ? FD_POLL_OUT : 0) |
|
|
|
|
|
((evts & FD_EV_SHUT_R) ? FD_POLL_HUP : 0) |
|
|
|
|
|
((evts & FD_EV_ERR_RW) ? FD_POLL_ERR : 0);
|
|
|
|
|
|
|
|
|
|
/* SHUTW reported while FD was active for writes is an error */
|
|
|
|
|
if ((fdtab[fd].state & FD_EV_ACTIVE_W) && (evts & FD_EV_SHUT_W))
|
|
|
|
|
new_flags |= FD_POLL_ERR;
|
|
|
|
|
|
|
|
|
|
/* compute the inactive events reported late that must be stopped */
|
|
|
|
|
must_stop = 0;
|
|
|
|
|
if (unlikely(!fd_active(fd))) {
|
|
|
|
|
/* both sides stopped */
|
|
|
|
|
must_stop = FD_POLL_IN | FD_POLL_OUT;
|
|
|
|
|
}
|
|
|
|
|
else if (unlikely(!fd_recv_active(fd) && (evts & (FD_EV_READY_R | FD_EV_SHUT_R | FD_EV_ERR_RW)))) {
|
|
|
|
|
/* only send remains */
|
|
|
|
|
must_stop = FD_POLL_IN;
|
|
|
|
|
}
|
|
|
|
|
else if (unlikely(!fd_send_active(fd) && (evts & (FD_EV_READY_W | FD_EV_SHUT_W | FD_EV_ERR_RW)))) {
|
|
|
|
|
/* only recv remains */
|
|
|
|
|
must_stop = FD_POLL_OUT;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (new_flags & (FD_POLL_IN | FD_POLL_HUP | FD_POLL_ERR))
|
|
|
|
|
new_flags |= FD_EV_READY_R;
|
|
|
|
|
|
|
|
|
|
if (new_flags & (FD_POLL_OUT | FD_POLL_ERR))
|
|
|
|
|
new_flags |= FD_EV_READY_W;
|
|
|
|
|
|
|
|
|
|
old = fdtab[fd].state;
|
|
|
|
|
new = (old & ~FD_POLL_UPDT_MASK) | new_flags;
|
|
|
|
|
|
|
|
|
|
if (unlikely(locked)) {
|
|
|
|
|
/* Locked FDs (those whose thread mask is not limited to the current thread) are atomically updated */
|
|
|
|
|
while (unlikely(new != old && !_HA_ATOMIC_CAS(&fdtab[fd].state, &old, new)))
|
|
|
|
|
new = (old & ~FD_POLL_UPDT_MASK) | new_flags;
|
|
|
|
|
} else {
|
|
|
|
|
if (new != old)
|
|
|
|
|
fdtab[fd].state = new;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (fdtab[fd].iocb && fd_active(fd)) {
|
|
|
|
|
fdtab[fd].iocb(fd);
|
|
|
|
|
}
|
|
|
|
|
|
MEDIUM: fd: quit fd_update_events() when FD is closed
The IOCB might have closed the FD itself, so it's not an error to
have fd.tgid==0 or anything else, nor to have a null running_mask.
In fact there are different conditions under which we can leave the
IOCB, all of them have been enumerated in the code's comments (namely
FD still valid and used, hence has running bit, FD closed but not yet
reassigned thus running==0, FD closed and reassigned, hence different
tgid and running becomes irrelevant, just like all other masks). For
this reason we have no other solution but to try to grab the tgid on
return before checking the other bits. In practice it doesn't represent
a big cost, because if the FD was closed and reassigned, it's instantly
detected and the bit is immediately released without blocking other
threads, and if the FD wasn't closed this doesn't prevent it from
being migrated to another thread. In the worst case a close by another
thread after a migration will be postponed till the moment the running
bit is cleared, which is the same as before.
2022-07-08 09:36:14 -04:00
|
|
|
/*
|
|
|
|
|
* We entered iocb with running set and with the valid tgid.
|
|
|
|
|
* Since then, this is what could have happened:
|
|
|
|
|
* - another thread tried to close the FD (e.g. timeout task from
|
|
|
|
|
* another one that owns it). We still have running set, but not
|
|
|
|
|
* tmask. We must call fd_clr_running() then _fd_delete_orphan()
|
|
|
|
|
* if we were the last one.
|
|
|
|
|
*
|
|
|
|
|
* - the iocb tried to close the FD => bit no more present in running,
|
|
|
|
|
* nothing to do. If it managed to close it, the poller's ->clo()
|
|
|
|
|
* has already been called.
|
|
|
|
|
*
|
|
|
|
|
* - after we closed, the FD was reassigned to another thread in
|
|
|
|
|
* another group => running not present, tgid differs, nothing to
|
|
|
|
|
* do because if it got reassigned it indicates it was already
|
|
|
|
|
* closed.
|
|
|
|
|
*
|
|
|
|
|
* There's no risk of takeover of the valid FD here during this period.
|
|
|
|
|
* Also if we still have running, immediately after we release it, the
|
|
|
|
|
* events above might instantly happen due to another thread taking
|
|
|
|
|
* over.
|
|
|
|
|
*
|
|
|
|
|
* As such, the only cases where the FD is still relevant are:
|
|
|
|
|
* - tgid still set and running still set (most common)
|
|
|
|
|
* - tgid still valid but running cleared due to fd_delete(): we may
|
|
|
|
|
* still need to stop polling otherwise we may keep it enabled
|
|
|
|
|
* while waiting for other threads to close it.
|
|
|
|
|
* And given that we may need to program a tentative update in case we
|
|
|
|
|
* don't immediately close, it's easier to grab the tgid during the
|
|
|
|
|
* whole check.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
if (!fd_grab_tgid(fd, tgid))
|
|
|
|
|
return FD_UPDT_CLOSED;
|
|
|
|
|
|
|
|
|
|
tmask = _HA_ATOMIC_LOAD(&fdtab[fd].thread_mask);
|
|
|
|
|
|
2021-07-29 10:53:46 -04:00
|
|
|
/* another thread might have attempted to close this FD in the mean
|
|
|
|
|
* time (e.g. timeout task) striking on a previous thread and closing.
|
BUG/MAJOR: fd/threads: close a race on closing connections after takeover
As mentioned in commit 237e6a0d6 ("BUG/MAJOR: fd/thread: fix race between
updates and closing FD"), a race was found during stress tests involving
heavy backend connection reuse with many competing closes.
Here the problem is complex. The analysis in commit f69fea64e ("MAJOR:
fd: get rid of the DWCAS when setting the running_mask") that removed
the DWCAS in 2.5 overlooked a few races.
First, a takeover from thread1 could happen just after fd_update_events()
in thread2 validates it holds the tmask bit in the CAS loop. Since thread1
releases running_mask after the operation, thread2 will succeed the CAS
and both will believe the FD is theirs. This does explain the occasional
crashes seen with h1_io_cb() being called on a bad context, or
sock_conn_iocb() seeing conn->subs vanish after checking it. This issue
can be addressed using a DWCAS in both fd_takeover() and fd_update_events()
as it was before the patch above but this is not portable to all archs and
is not easy to adapt for those lacking it, due to some operations still
happening only on individual masks after the thread groups were added.
Second, the check after fd_clr_running() for the current thread being
the last one is not sufficient: at the exact moment the operation
completes, another thread may also set and drop the running bit and see
itself as alone, and both can call _fd_delete_orphan() in parallel. In
order to prevent this from happening, we cannot rely on the absence of
others, we need an explicit flag indicating that the FD must be closed.
One approach that was attempted consisted in playing with the thread_mask
but that was not reliable since it could still match between the late
deletion and the early insertion that follows. Instead, a new FD flag
was added, FD_MUST_CLOSE, that exactly indicates that the call to
_fd_delete_orphan() must be done. It is set by fd_delete(), and
atomically cleared by the first one which checks it, and which is the
only one to call _fd_delete_orphan().
With both points addressed, there's no more visible race left:
- takeover() only happens under the connection list's lock and cannot
compete with fd_delete() since fd_delete() must first remove the
connection from the list before deleting the FD. That's also why it
doesn't need to call _fd_delete_orphan() when dropping its running
bit.
- takeover() sets its running bit then atomically replaces the thread
mask, so that until that's done, it doesn't validate the condition
to end the synchronization loop in fd_update_events(). Once it's OK,
the previous thread's bit is lost, and this is checked for in
fd_update_events()
- fd_update_events() can compete with fd_delete() at various places
which are explained above. Since fd_delete() clears the thread mask
after setting its running bit and after setting the FD_MUST_CLOSE
bit, the synchronization loop guarantees that the thread mask is seen
before going further, and that once it's seen, the FD_MUST_CLOSE flag
is already present.
- fd_delete() may start while fd_update_events() has already started,
but fd_delete() must hold a bit in thread_mask before starting, and
that is checked by the first test in fd_update_events() before setting
the running_mask.
- the poller's _update_fd() will not compete against _fd_delete_orphan()
nor fd_insert() thanks to the fd_grab_tgid() that's always done before
updating the polled_mask, and guarantees that we never pretend that a
polled_mask has a bit before the FD is added.
The issue is very hard to reproduce and is extremely time-sensitive.
Some tests were required with a 1-ms timeout with request rates
closely matching 1 kHz per server, though certain tests sometimes
benefitted from saturation. It was found that adding the following
slowdown at a few key places helped a lot and managed to trigger the
bug in 0.5 to 5 seconds instead of tens of minutes on a 20-thread
setup:
{ volatile int i = 10000; while (i--); }
Particularly, placing it at key places where only one of running_mask
or thread_mask is set and not the other one yet (e.g. after the
synchronization loop in fd_update_events or after dropping the
running bit) did yield great results.
Many thanks to Olivier Houchard for this expert help analysing these
races and reviewing candidate fixes.
The patch must be backported to 2.5. Note that 2.6 does not have tgid
in FDs, and that it requires a change of output on fd_clr_running() as
we need the previous bit. This is provided by carefully backporting
commit d6e1987612 ("MINOR: fd: make fd_clr_running() return the previous
value instead"). Tests have shown that the lack of tgid is a showstopper
for 2.6 and that unless a better workaround is found, it could still be
preferable to backport the minimum pieces required for fd_grab_tgid()
to 2.6 so that it stays stable long.
2023-03-07 13:11:02 -05:00
|
|
|
* This is detected by us being the last owners of a running_mask bit,
|
|
|
|
|
* and the thread_mask being zero. At the moment we release the running
|
|
|
|
|
* bit, a takeover may also happen, so in practice we check for our loss
|
|
|
|
|
* of the thread_mask bit (rather than both thread_mask and running_mask being 0) after
|
MEDIUM: fd: quit fd_update_events() when FD is closed
The IOCB might have closed the FD itself, so it's not an error to
have fd.tgid==0 or anything else, nor to have a null running_mask.
In fact there are different conditions under which we can leave the
IOCB, all of them have been enumerated in the code's comments (namely
FD still valid and used, hence has running bit, FD closed but not yet
reassigned thus running==0, FD closed and reassigned, hence different
tgid and running becomes irrelevant, just like all other masks). For
this reason we have no other solution but to try to grab the tgid on
return before checking the other bits. In practice it doesn't represent
a big cost, because if the FD was closed and reassigned, it's instantly
detected and the bit is immediately released without blocking other
threads, and if the FD wasn't closed this doesn't prevent it from
being migrated to another thread. In the worst case a close by another
thread after a migration will be postponed till the moment the running
bit is cleared, which is the same as before.
2022-07-08 09:36:14 -04:00
|
|
|
* we remove ourselves last. There is no risk the FD gets reassigned
|
|
|
|
|
* to a different group since it's not released until the real close()
|
|
|
|
|
* in _fd_delete_orphan().
|
2021-07-29 10:53:46 -04:00
|
|
|
*/
|
BUG/MAJOR: fd/threads: close a race on closing connections after takeover
As mentioned in commit 237e6a0d6 ("BUG/MAJOR: fd/thread: fix race between
updates and closing FD"), a race was found during stress tests involving
heavy backend connection reuse with many competing closes.
Here the problem is complex. The analysis in commit f69fea64e ("MAJOR:
fd: get rid of the DWCAS when setting the running_mask") that removed
the DWCAS in 2.5 overlooked a few races.
First, a takeover from thread1 could happen just after fd_update_events()
in thread2 validates it holds the tmask bit in the CAS loop. Since thread1
releases running_mask after the operation, thread2 will succeed the CAS
and both will believe the FD is theirs. This does explain the occasional
crashes seen with h1_io_cb() being called on a bad context, or
sock_conn_iocb() seeing conn->subs vanish after checking it. This issue
can be addressed using a DWCAS in both fd_takeover() and fd_update_events()
as it was before the patch above but this is not portable to all archs and
is not easy to adapt for those lacking it, due to some operations still
happening only on individual masks after the thread groups were added.
Second, the check after fd_clr_running() for the current thread being
the last one is not sufficient: at the exact moment the operation
completes, another thread may also set and drop the running bit and see
itself as alone, and both can call _fd_delete_orphan() in parallel. In
order to prevent this from happening, we cannot rely on the absence of
others, we need an explicit flag indicating that the FD must be closed.
One approach that was attempted consisted in playing with the thread_mask
but that was not reliable since it could still match between the late
deletion and the early insertion that follows. Instead, a new FD flag
was added, FD_MUST_CLOSE, that exactly indicates that the call to
_fd_delete_orphan() must be done. It is set by fd_delete(), and
atomically cleared by the first one which checks it, and which is the
only one to call _fd_delete_orphan().
With both points addressed, there's no more visible race left:
- takeover() only happens under the connection list's lock and cannot
compete with fd_delete() since fd_delete() must first remove the
connection from the list before deleting the FD. That's also why it
doesn't need to call _fd_delete_orphan() when dropping its running
bit.
- takeover() sets its running bit then atomically replaces the thread
mask, so that until that's done, it doesn't validate the condition
to end the synchronization loop in fd_update_events(). Once it's OK,
the previous thread's bit is lost, and this is checked for in
fd_update_events()
- fd_update_events() can compete with fd_delete() at various places
which are explained above. Since fd_delete() clears the thread mask
after setting its running bit and after setting the FD_MUST_CLOSE
bit, the synchronization loop guarantees that the thread mask is seen
before going further, and that once it's seen, the FD_MUST_CLOSE flag
is already present.
- fd_delete() may start while fd_update_events() has already started,
but fd_delete() must hold a bit in thread_mask before starting, and
that is checked by the first test in fd_update_events() before setting
the running_mask.
- the poller's _update_fd() will not compete against _fd_delete_orphan()
nor fd_insert() thanks to the fd_grab_tgid() that's always done before
updating the polled_mask, and guarantees that we never pretend that a
polled_mask has a bit before the FD is added.
The issue is very hard to reproduce and is extremely time-sensitive.
Some tests were required with a 1-ms timeout with request rates
closely matching 1 kHz per server, though certain tests sometimes
benefitted from saturation. It was found that adding the following
slowdown at a few key places helped a lot and managed to trigger the
bug in 0.5 to 5 seconds instead of tens of minutes on a 20-thread
setup:
{ volatile int i = 10000; while (i--); }
Particularly, placing it at key places where only one of running_mask
or thread_mask is set and not the other one yet (e.g. after the
synchronization loop in fd_update_events or after dropping the
running bit) did yield great results.
Many thanks to Olivier Houchard for this expert help analysing these
races and reviewing candidate fixes.
The patch must be backported to 2.5. Note that 2.6 does not have tgid
in FDs, and that it requires a change of output on fd_clr_running() as
we need the previous bit. This is provided by carefully backporting
commit d6e1987612 ("MINOR: fd: make fd_clr_running() return the previous
value instead"). Tests have shown that the lack of tgid is a showstopper
for 2.6 and that unless a better workaround is found, it could still be
preferable to backport the minimum pieces required for fd_grab_tgid()
to 2.6 so that it stays stable long.
2023-03-07 13:11:02 -05:00
|
|
|
if (fd_clr_running(fd) == ti->ltid_bit && !(tmask & ti->ltid_bit))
|
|
|
|
|
goto closed_or_migrated;
|
2021-07-29 10:53:46 -04:00
|
|
|
|
|
|
|
|
/* we had to stop this FD and it still must be stopped after the I/O
|
|
|
|
|
* cb's changes, so let's program an update for this.
|
|
|
|
|
*/
|
2022-07-05 13:21:06 -04:00
|
|
|
if (must_stop && !(fdtab[fd].update_mask & ti->ltid_bit)) {
|
2021-07-29 10:53:46 -04:00
|
|
|
if (((must_stop & FD_POLL_IN) && !fd_recv_active(fd)) ||
|
|
|
|
|
((must_stop & FD_POLL_OUT) && !fd_send_active(fd)))
|
2022-07-05 13:21:06 -04:00
|
|
|
if (!HA_ATOMIC_BTS(&fdtab[fd].update_mask, ti->ltid))
|
2021-07-29 10:53:46 -04:00
|
|
|
fd_updt[fd_nbupdt++] = fd;
|
|
|
|
|
}
|
MEDIUM: fd: rely more on fd_update_events() to detect changes
This function already performs a number of checks prior to calling the
IOCB, and detects the change of thread (FD migration). Half of the
controls are still in each poller, and these pollers also maintain
activity counters for various cases.
Note that the unreliable test on thread_mask was removed so that only
the one performed by fd_set_running() is now used, since this one is
reliable.
Let's centralize all that fd-specific logic into the function and make
it return a status among:
FD_UPDT_DONE, // update done, nothing else to be done
FD_UPDT_DEAD, // FD was already dead, ignore it
FD_UPDT_CLOSED, // FD was closed
FD_UPDT_MIGRATED, // FD was migrated, ignore it now
Some pollers already used to call it last and have nothing to do after
it, regardless of the result. epoll has to delete the FD in case a
migration is detected. Overall this removes more code than it adds.
2021-07-29 10:57:19 -04:00
|
|
|
|
MEDIUM: fd: quit fd_update_events() when FD is closed
The IOCB might have closed the FD itself, so it's not an error to
have fd.tgid==0 or anything else, nor to have a null running_mask.
In fact there are different conditions under which we can leave the
IOCB, all of them have been enumerated in the code's comments (namely
FD still valid and used, hence has running bit, FD closed but not yet
reassigned thus running==0, FD closed and reassigned, hence different
tgid and running becomes irrelevant, just like all other masks). For
this reason we have no other solution but to try to grab the tgid on
return before checking the other bits. In practice it doesn't represent
a big cost, because if the FD was closed and reassigned, it's instantly
detected and the bit is immediately released without blocking other
threads, and if the FD wasn't closed this doesn't prevent it from
being migrated to another thread. In the worst case a close by another
thread after a migration will be postponed till the moment the running
bit is cleared, which is the same as before.
2022-07-08 09:36:14 -04:00
|
|
|
fd_drop_tgid(fd);
|
MEDIUM: fd: rely more on fd_update_events() to detect changes
This function already performs a number of checks prior to calling the
IOCB, and detects the change of thread (FD migration). Half of the
controls are still in each poller, and these pollers also maintain
activity counters for various cases.
Note that the unreliable test on thread_mask was removed so that only
the one performed by fd_set_running() is now used, since this one is
reliable.
Let's centralize all that fd-specific logic into the function and make
it return a status among:
FD_UPDT_DONE, // update done, nothing else to be done
FD_UPDT_DEAD, // FD was already dead, ignore it
FD_UPDT_CLOSED, // FD was closed
FD_UPDT_MIGRATED, // FD was migrated, ignore it now
Some pollers already used to call it last and have nothing to do after
it, regardless of the result. epoll has to delete the FD in case a
migration is detected. Overall this removes more code than it adds.
2021-07-29 10:57:19 -04:00
|
|
|
return FD_UPDT_DONE;
|
BUG/MAJOR: fd/threads: close a race on closing connections after takeover
As mentioned in commit 237e6a0d6 ("BUG/MAJOR: fd/thread: fix race between
updates and closing FD"), a race was found during stress tests involving
heavy backend connection reuse with many competing closes.
Here the problem is complex. The analysis in commit f69fea64e ("MAJOR:
fd: get rid of the DWCAS when setting the running_mask") that removed
the DWCAS in 2.5 overlooked a few races.
First, a takeover from thread1 could happen just after fd_update_events()
in thread2 validates it holds the tmask bit in the CAS loop. Since thread1
releases running_mask after the operation, thread2 will succeed the CAS
and both will believe the FD is theirs. This does explain the occasional
crashes seen with h1_io_cb() being called on a bad context, or
sock_conn_iocb() seeing conn->subs vanish after checking it. This issue
can be addressed using a DWCAS in both fd_takeover() and fd_update_events()
as it was before the patch above but this is not portable to all archs and
is not easy to adapt for those lacking it, due to some operations still
happening only on individual masks after the thread groups were added.
Second, the check after fd_clr_running() for the current thread being
the last one is not sufficient: at the exact moment the operation
completes, another thread may also set and drop the running bit and see
itself as alone, and both can call _fd_delete_orphan() in parallel. In
order to prevent this from happening, we cannot rely on the absence of
others, we need an explicit flag indicating that the FD must be closed.
One approach that was attempted consisted in playing with the thread_mask
but that was not reliable since it could still match between the late
deletion and the early insertion that follows. Instead, a new FD flag
was added, FD_MUST_CLOSE, that exactly indicates that the call to
_fd_delete_orphan() must be done. It is set by fd_delete(), and
atomically cleared by the first one which checks it, and which is the
only one to call _fd_delete_orphan().
With both points addressed, there's no more visible race left:
- takeover() only happens under the connection list's lock and cannot
compete with fd_delete() since fd_delete() must first remove the
connection from the list before deleting the FD. That's also why it
doesn't need to call _fd_delete_orphan() when dropping its running
bit.
- takeover() sets its running bit then atomically replaces the thread
mask, so that until that's done, it doesn't validate the condition
to end the synchronization loop in fd_update_events(). Once it's OK,
the previous thread's bit is lost, and this is checked for in
fd_update_events()
- fd_update_events() can compete with fd_delete() at various places
which are explained above. Since fd_delete() clears the thread mask
after setting its running bit and after setting the FD_MUST_CLOSE
bit, the synchronization loop guarantees that the thread mask is seen
before going further, and that once it's seen, the FD_MUST_CLOSE flag
is already present.
- fd_delete() may start while fd_update_events() has already started,
but fd_delete() must hold a bit in thread_mask before starting, and
that is checked by the first test in fd_update_events() before setting
the running_mask.
- the poller's _update_fd() will not compete against _fd_delete_orphan()
nor fd_insert() thanks to the fd_grab_tgid() that's always done before
updating the polled_mask, and guarantees that we never pretend that a
polled_mask has a bit before the FD is added.
The issue is very hard to reproduce and is extremely time-sensitive.
Some tests were required with a 1-ms timeout with request rates
closely matching 1 kHz per server, though certain tests sometimes
benefitted from saturation. It was found that adding the following
slowdown at a few key places helped a lot and managed to trigger the
bug in 0.5 to 5 seconds instead of tens of minutes on a 20-thread
setup:
{ volatile int i = 10000; while (i--); }
Particularly, placing it at key places where only one of running_mask
or thread_mask is set and not the other one yet (e.g. after the
synchronization loop in fd_update_events or after dropping the
running bit) did yield great results.
Many thanks to Olivier Houchard for this expert help analysing these
races and reviewing candidate fixes.
The patch must be backported to 2.5. Note that 2.6 does not have tgid
in FDs, and that it requires a change of output on fd_clr_running() as
we need the previous bit. This is provided by carefully backporting
commit d6e1987612 ("MINOR: fd: make fd_clr_running() return the previous
value instead"). Tests have shown that the lack of tgid is a showstopper
for 2.6 and that unless a better workaround is found, it could still be
preferable to backport the minimum pieces required for fd_grab_tgid()
to 2.6 so that it stays stable long.
2023-03-07 13:11:02 -05:00
|
|
|
|
|
|
|
|
closed_or_migrated:
|
|
|
|
|
/* We only come here once we've last dropped running and the FD is
|
|
|
|
|
* not for us as per !(tmask & ti->ltid_bit). It may imply we're
|
|
|
|
|
* responsible for closing it. Otherwise it's just a migration.
|
|
|
|
|
*/
|
|
|
|
|
if (HA_ATOMIC_BTR(&fdtab[fd].state, FD_MUST_CLOSE_BIT)) {
|
|
|
|
|
fd_drop_tgid(fd);
|
|
|
|
|
_fd_delete_orphan(fd);
|
|
|
|
|
return FD_UPDT_CLOSED;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* So we were alone, no close bit, at best the FD was migrated, at
|
|
|
|
|
* worst it's in the process of being closed by another thread. We must
|
|
|
|
|
* be ultra-careful as it can be re-inserted by yet another thread as
|
|
|
|
|
* the result of socket() or accept(). Let's just tell the poller the
|
|
|
|
|
* FD was lost. If it was closed it was already removed and this will
|
|
|
|
|
* only cost an update for nothing.
|
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
do_update:
|
|
|
|
|
/* The FD is not closed but we don't want the poller to wake up for
|
|
|
|
|
* it anymore.
|
|
|
|
|
*/
|
|
|
|
|
if (!HA_ATOMIC_BTS(&fdtab[fd].update_mask, ti->ltid))
|
|
|
|
|
fd_updt[fd_nbupdt++] = fd;
|
|
|
|
|
|
|
|
|
|
fd_drop_tgid(fd);
|
|
|
|
|
return FD_UPDT_MIGRATED;
|
2021-07-29 10:53:46 -04:00
|
|
|
}
|
|
|
|
|
|
2022-07-09 17:19:19 -04:00
|
|
|
/* This is used by pollers at boot time to re-register desired events for
|
|
|
|
|
* all FDs after new pollers have been created. It doesn't do much, it checks
|
|
|
|
|
* that their thread group matches the one in argument, and that the thread
|
|
|
|
|
* mask matches at least one of the bits in the mask, and if so, marks the FD
|
|
|
|
|
* as updated.
|
|
|
|
|
*/
|
|
|
|
|
void fd_reregister_all(int tgrp, ulong mask)
|
|
|
|
|
{
|
|
|
|
|
int fd;
|
|
|
|
|
|
MINOR: fd: don't scan the full fdtab on all threads
During tests, it's pretty visible that with many threads and a large
number of FDs, the process may take time to be ready. The reason for
this is that the full fdtab array is scanned by each and every thread
at boot in fd_reregister_all() in order to make each thread-local
poller adopt the FDs that are relevant to it. The problem is that
when dealing with 1-2M FDs and 64+ threads, it starts to represent
quite a number of loops, and usually the fdtab array doesn't entirely
fit in the CPU's L3 cache, causing extra memory accesses.
It's particularly visible when issuing debugging commands to the CLI
because usually the first one fails while the CPU is at 100% for half
a second (which also is socat's timeout). A quick test with this:
global
stats socket /tmp/sock1 level admin mode 666
stats timeout 1h
maxconn 2000000
And the following script started in another window:
while ! time socat -t5 - /tmp/sock1 <<< "show version";do date -Ins;done
shows that it takes 1.58s for the socat instance that succeeds on an
Ampere Altra with 80 cores, this requires to change the timeout (defaults
to half a second) otherwise it returns nothing. In addition it also means
that during reloads, some CPU spikes will be noticed.
Adding a prefetch of the current FD + 16 improves the startup time by 30%
but that's far from being sufficient.
In practice all of this is performed at boot time, a moment at which we
know that extremely few FDs are registered (basically just the listeners),
so FD numbers are usually very low and the rest of the table is scanned
for no benefit. Ideally, knowing upfront how many FDs we have should be
sufficient.
A first approach would consist in counting the entries on a single thread
before registering pollers. It's not necessarily efficient and would take
time anyway.
This patch takes a different approach. It consists in keeping a thread-local
max ("fd_highest") that is updated whenever fd_insert() is called with a
larger number. Of course this is not correct once all threads have started,
but it will remain valid during boot since the same value is used during
startup and is cloned for each thread, and no scheduling happens anywhere
during this period, so that all threads are aware of the highest FD they've
seen registered, even if it had been done in some init code, and this without
having to deal with a shared variable.
Here on the test platform, the script gets its response in 10ms vs 1580
before.
2024-07-15 09:09:10 -04:00
|
|
|
for (fd = 0; fd < fd_highest; fd++) {
|
2022-07-09 17:19:19 -04:00
|
|
|
if (!fdtab[fd].owner)
|
|
|
|
|
continue;
|
|
|
|
|
|
|
|
|
|
/* make sure we don't register other tgroups' FDs. We just
|
|
|
|
|
* avoid needlessly taking the lock if not needed.
|
|
|
|
|
*/
|
|
|
|
|
if (!(_HA_ATOMIC_LOAD(&fdtab[fd].thread_mask) & mask) ||
|
|
|
|
|
!fd_grab_tgid(fd, tgrp))
|
|
|
|
|
continue; // was not for us anyway
|
|
|
|
|
|
|
|
|
|
if (_HA_ATOMIC_LOAD(&fdtab[fd].thread_mask) & mask)
|
|
|
|
|
updt_fd_polling(fd);
|
|
|
|
|
fd_drop_tgid(fd);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2019-08-27 05:08:17 -04:00
|
|
|
/* Tries to send <npfx> parts from <prefix> followed by <nmsg> parts from <msg>
|
|
|
|
|
* optionally followed by a newline if <nl> is non-null, to file descriptor
|
|
|
|
|
* <fd>. The message is sent atomically using writev(). It may be truncated to
|
|
|
|
|
* <maxlen> bytes if <maxlen> is non-null. There is no distinction between the
|
|
|
|
|
* two lists, it's just a convenience to help the caller prepend some prefixes
|
|
|
|
|
* when necessary. It takes the fd's lock to make sure no other thread will
|
|
|
|
|
* write to the same fd in parallel. Returns the number of bytes sent, or <=0
|
|
|
|
|
* on failure. A limit to 31 total non-empty segments is enforced. The caller
|
|
|
|
|
* is responsible for taking care of making the fd non-blocking.
|
|
|
|
|
*/
|
|
|
|
|
ssize_t fd_write_frag_line(int fd, size_t maxlen, const struct ist pfx[], size_t npfx, const struct ist msg[], size_t nmsg, int nl)
|
|
|
|
|
{
|
|
|
|
|
struct iovec iovec[32];
|
|
|
|
|
size_t sent = 0;
|
|
|
|
|
int vec = 0;
|
2020-06-11 08:25:47 -04:00
|
|
|
int attempts = 0;
|
2019-08-27 05:08:17 -04:00
|
|
|
|
|
|
|
|
if (!maxlen)
|
|
|
|
|
maxlen = ~0;
|
|
|
|
|
|
|
|
|
|
/* keep one char for a possible trailing '\n' in any case */
|
|
|
|
|
maxlen--;
|
|
|
|
|
|
|
|
|
|
/* make an iovec from the concatenation of all parts of the original
|
|
|
|
|
* message. Skip empty fields and truncate the whole message to maxlen,
|
|
|
|
|
* leaving one spare iovec for the '\n'.
|
|
|
|
|
*/
|
|
|
|
|
while (vec < (sizeof(iovec) / sizeof(iovec[0]) - 1)) {
|
|
|
|
|
if (!npfx) {
|
|
|
|
|
pfx = msg;
|
|
|
|
|
npfx = nmsg;
|
|
|
|
|
nmsg = 0;
|
|
|
|
|
if (!npfx)
|
|
|
|
|
break;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
iovec[vec].iov_base = pfx->ptr;
|
|
|
|
|
iovec[vec].iov_len = MIN(maxlen, pfx->len);
|
|
|
|
|
maxlen -= iovec[vec].iov_len;
|
|
|
|
|
if (iovec[vec].iov_len)
|
|
|
|
|
vec++;
|
|
|
|
|
pfx++; npfx--;
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
if (nl) {
|
|
|
|
|
iovec[vec].iov_base = "\n";
|
|
|
|
|
iovec[vec].iov_len = 1;
|
|
|
|
|
vec++;
|
|
|
|
|
}
|
|
|
|
|
|
2020-06-11 08:25:47 -04:00
|
|
|
/* make sure we never interleave writes and we never block. This means
|
|
|
|
|
* we prefer to fail on collision than to block. But we don't want to
|
|
|
|
|
* lose too many logs so we just perform a few lock attempts then give
|
|
|
|
|
* up.
|
|
|
|
|
*/
|
|
|
|
|
|
2021-04-07 11:36:57 -04:00
|
|
|
while (HA_ATOMIC_BTS(&fdtab[fd].state, FD_EXCL_SYSCALL_BIT)) {
|
2020-06-11 08:25:47 -04:00
|
|
|
if (++attempts >= 200) {
|
|
|
|
|
/* so that the caller knows the message couldn't be delivered */
|
|
|
|
|
sent = -1;
|
|
|
|
|
errno = EAGAIN;
|
|
|
|
|
goto leave;
|
|
|
|
|
}
|
|
|
|
|
ha_thread_relax();
|
|
|
|
|
}
|
|
|
|
|
|
2021-04-06 11:57:12 -04:00
|
|
|
if (unlikely(!(fdtab[fd].state & FD_INITIALIZED))) {
|
|
|
|
|
HA_ATOMIC_OR(&fdtab[fd].state, FD_INITIALIZED);
|
2019-08-30 08:41:47 -04:00
|
|
|
if (!isatty(fd))
|
2022-04-26 04:24:14 -04:00
|
|
|
fd_set_nonblock(fd);
|
2019-08-30 08:41:47 -04:00
|
|
|
}
|
2019-08-27 05:08:17 -04:00
|
|
|
sent = writev(fd, iovec, vec);
|
2021-04-07 11:36:57 -04:00
|
|
|
HA_ATOMIC_BTR(&fdtab[fd].state, FD_EXCL_SYSCALL_BIT);
|
2019-08-27 05:08:17 -04:00
|
|
|
|
2020-06-11 08:25:47 -04:00
|
|
|
leave:
|
2019-08-27 05:08:17 -04:00
|
|
|
/* sent > 0 if the message was delivered */
|
|
|
|
|
return sent;
|
|
|
|
|
}
|
|
|
|
|
|
2019-02-25 08:26:54 -05:00
|
|
|
#if defined(USE_CLOSEFROM)
|
|
|
|
|
/* Closes all file descriptors starting at <start> and above by directly
 * relying on the closefrom() call provided by the system.
 */
void my_closefrom(int start)
{
	closefrom(start);
}
|
|
|
|
|
|
2019-05-22 13:24:06 -04:00
|
|
|
#elif defined(USE_POLL)
|
2019-02-21 16:12:47 -05:00
|
|
|
/* This is a portable implementation of closefrom(). It closes all open file
 * descriptors starting at <start> and above. It relies on the fact that poll()
 * will return POLLNVAL for each invalid (hence closed) file descriptor passed
 * in argument in order to skip them. It acts with batches of FDs and will
 * typically perform one poll() call per 1024 FDs so the overhead is low in
 * case all FDs have to be closed.
 *
 * The upper bound is the RLIMIT_NOFILE soft limit, or 1024 when it cannot be
 * retrieved or does not fit a positive int.
 */
void my_closefrom(int start)
{
	struct pollfd poll_events[1024];
	const int max_batch = sizeof(poll_events) / sizeof(poll_events[0]);
	struct rlimit limit;
	int nbfds, fd, ret, idx;
	int step, next, count;

	if (getrlimit(RLIMIT_NOFILE, &limit) == 0)
		step = nbfds = limit.rlim_cur;
	else
		step = nbfds = 0;

	if (nbfds <= 0) {
		/* set safe limit */
		nbfds = 1024;
		step = 256;
	}

	if (step > max_batch)
		step = max_batch;

	/* there is no negative FD, and a negative start would make the first
	 * batch larger than <step> and overflow poll_events[].
	 */
	if (start < 0)
		start = 0;

	while (start < nbfds) {
		/* batches end on multiples of <step> */
		next = (start / step + 1) * step;

		for (fd = start; fd < next && fd < nbfds; fd++) {
			poll_events[fd - start].fd = fd;
			poll_events[fd - start].events = 0;
			/* results are scanned even if poll() fails, so they
			 * must not be left uninitialized: an FD whose status
			 * is unknown is then simply closed.
			 */
			poll_events[fd - start].revents = 0;
		}
		count = fd - start;

		do {
			ret = poll(poll_events, count, 0);
			if (ret >= 0)
				break;
		} while (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR || errno == ENOMEM);

		/* always check the whole range */
		for (idx = 0; idx < count; idx++) {
			if (poll_events[idx].revents & POLLNVAL)
				continue; /* already closed */

			close(poll_events[idx].fd);
		}
		start = next;
	}
}
|
|
|
|
|
|
2019-05-22 13:24:06 -04:00
|
|
|
#else // defined(USE_POLL)
|
2019-02-21 16:12:47 -05:00
|
|
|
|
2019-02-21 16:19:17 -05:00
|
|
|
/* Naive portable replacement for closefrom(), used when the operating system
 * offers nothing better: close() is called on every descriptor from <start>
 * up to the RLIMIT_NOFILE soft limit (1024 if the limit is unknown or not
 * positive).
 */
void my_closefrom(int start)
{
	struct rlimit rl;
	int max_fd = 0;
	int cur;

	if (getrlimit(RLIMIT_NOFILE, &rl) == 0)
		max_fd = rl.rlim_cur;

	/* safe limit */
	if (max_fd <= 0)
		max_fd = 1024;

	for (cur = start; cur < max_fd; cur++)
		close(cur);
}
|
2019-05-22 13:24:06 -04:00
|
|
|
#endif // defined(USE_POLL)
|
2019-02-21 16:19:17 -05:00
|
|
|
|
2022-09-22 10:08:47 -04:00
|
|
|
|
2021-10-06 13:55:29 -04:00
|
|
|
/* Computes the bounded poll() timeout based on the next expiration timer <next>
|
|
|
|
|
* by bounding it to MAX_DELAY_MS. <next> may equal TICK_ETERNITY. The pollers
|
|
|
|
|
* just needs to call this function right before polling to get their timeout
|
|
|
|
|
* value. Timeouts that are already expired (possibly due to a pending event)
|
|
|
|
|
* are accounted for in activity.poll_exp.
|
|
|
|
|
*/
|
|
|
|
|
int compute_poll_timeout(int next)
|
|
|
|
|
{
|
|
|
|
|
int wait_time;
|
|
|
|
|
|
|
|
|
|
if (!tick_isset(next))
|
|
|
|
|
wait_time = MAX_DELAY_MS;
|
|
|
|
|
else if (tick_is_expired(next, now_ms)) {
|
|
|
|
|
activity[tid].poll_exp++;
|
|
|
|
|
wait_time = 0;
|
|
|
|
|
}
|
|
|
|
|
else {
|
|
|
|
|
wait_time = TICKS_TO_MS(tick_remain(now_ms, next)) + 1;
|
|
|
|
|
if (wait_time > MAX_DELAY_MS)
|
|
|
|
|
wait_time = MAX_DELAY_MS;
|
|
|
|
|
}
|
|
|
|
|
return wait_time;
|
|
|
|
|
}
|
|
|
|
|
|
2022-06-22 09:21:34 -04:00
|
|
|
/* Handle the return of the poller, which consists in calculating the idle
 * time, saving a few clocks, marking the thread harmful again etc. All that
 * is some boring stuff that all pollers have to do anyway. <wait_time> and
 * <status> are passed as-is to clock_leaving_poll().
 */
void fd_leaving_poll(int wait_time, int status)
{
	clock_leaving_poll(wait_time, status);

	/* leave the harmless and idle states */
	thread_harmless_end();
	thread_idle_end();

	/* the thread is not sleeping anymore */
	_HA_ATOMIC_AND(&th_ctx->flags, ~TH_FL_SLEEPING);
}
|
|
|
|
|
|
2007-04-08 10:39:58 -04:00
|
|
|
/* disable the specified poller */
|
|
|
|
|
void disable_poller(const char *poller_name)
|
|
|
|
|
{
|
|
|
|
|
int p;
|
2006-06-25 20:48:02 -04:00
|
|
|
|
2007-04-08 10:39:58 -04:00
|
|
|
for (p = 0; p < nbpollers; p++)
|
|
|
|
|
if (strcmp(pollers[p].name, poller_name) == 0)
|
|
|
|
|
pollers[p].pref = 0;
|
|
|
|
|
}
|
2006-06-25 20:48:02 -04:00
|
|
|
|
2018-07-26 11:55:11 -04:00
|
|
|
/* I/O handler of the poller's pipe: drains everything readable from <fd>
 * then reports that nothing more can be received from it.
 */
void poller_pipe_io_handler(int fd)
{
	char trash[1024];

	/* Flush the pipe: read until nothing is returned or an error occurs */
	for (;;) {
		if (read(fd, trash, sizeof(trash)) <= 0)
			break;
	}
	fd_cant_recv(fd);
}
|
|
|
|
|
|
2019-05-22 08:42:12 -04:00
|
|
|
/* allocate the per-thread fd_updt thus needs to be called early after
|
|
|
|
|
* thread creation.
|
|
|
|
|
*/
|
|
|
|
|
static int alloc_pollers_per_thread()
|
|
|
|
|
{
|
|
|
|
|
fd_updt = calloc(global.maxsock, sizeof(*fd_updt));
|
2024-05-24 04:41:28 -04:00
|
|
|
vma_set_name_id(fd_updt, global.maxsock * sizeof(*fd_updt), "fd", "fd_updt", tid + 1);
|
2019-05-22 08:42:12 -04:00
|
|
|
return fd_updt != NULL;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* Initialize the pollers per thread.*/
|
MAJOR: threads/fd: Make fd stuffs thread-safe
Many changes have been made to do so. First, the fd_updt array, where all
pending FDs for polling are stored, is now a thread-local array. Then 3 locks
have been added to protect, respectively, the fdtab array, the fd_cache array
and poll information. In addition, a lock for each entry in the fdtab array has
been added to protect all accesses to a specific FD or its information.
For pollers, the way to manage the concurrency depends on the poller.
There is a poller loop on each thread. So the set of monitored FDs
may need to be protected. epoll and kqueue are thread-safe per se, so there are
few things to do to protect these pollers. This is not possible with select and
poll, so there is no sharing between the threads. The poller on each thread is
independent of the others.
Finally, per-thread init/deinit functions are used for each poller and for the
FD part to manage thread-local resources.
Now, you must be careful when an FD is created during the HAProxy startup. All
updates to the FD state must be made in the threads' context and never before
their creation. This is mandatory because the fd_updt array is thread-local and
initialized only for threads. Because there are no pollers for the main one, this
array remains uninitialized in this context. For this reason, listeners are now
enabled in the run_thread_poll_loop function, just like the worker pipe.
2017-05-29 04:40:41 -04:00
|
|
|
static int init_pollers_per_thread()
{
	int mypipe[2];

	/* Creates the pipe of the calling thread and registers both of its
	 * ends. Returns 1 on success, or 0 if the pipe could not be created.
	 */
	if (pipe(mypipe) < 0)
		return 0;

	/* the read side is kept in poller_rd_pipe and the write side in the
	 * calling thread's slot of poller_wr_pipe[].
	 */
	poller_rd_pipe = mypipe[0];
	poller_wr_pipe[tid] = mypipe[1];
	fd_set_nonblock(poller_rd_pipe);

	/* both ends are registered for the current thread only */
	fd_insert(poller_rd_pipe, poller_pipe_io_handler, poller_pipe_io_handler, tgid, ti->ltid_bit);
	fd_insert(poller_wr_pipe[tid], poller_pipe_io_handler, poller_pipe_io_handler, tgid, ti->ltid_bit);

	/* only the read side is of interest to the poller */
	fd_want_recv(poller_rd_pipe);
	fd_stop_both(poller_wr_pipe[tid]);
	return 1;
}
|
|
|
|
|
|
|
|
|
|
/* Deinitialize the pollers per thread */
|
|
|
|
|
static void deinit_pollers_per_thread()
|
|
|
|
|
{
|
2018-12-15 16:34:31 -05:00
|
|
|
/* rd and wr are init at the same place, but only rd is init to -1, so
|
|
|
|
|
we rely to rd to close. */
|
|
|
|
|
if (poller_rd_pipe > -1) {
|
2022-08-10 11:08:17 -04:00
|
|
|
fd_delete(poller_rd_pipe);
|
2018-12-15 16:34:31 -05:00
|
|
|
poller_rd_pipe = -1;
|
2022-08-10 11:08:17 -04:00
|
|
|
fd_delete(poller_wr_pipe[tid]);
|
2018-12-15 16:34:31 -05:00
|
|
|
poller_wr_pipe[tid] = -1;
|
|
|
|
|
}
|
MAJOR: threads/fd: Make fd stuffs thread-safe
Many changes have been made to do so. First, the fd_updt array, where all
pending FDs for polling are stored, is now a thread-local array. Then 3 locks
have been added to protect, respectively, the fdtab array, the fd_cache array
and poll information. In addition, a lock for each entry in the fdtab array has
been added to protect all accesses to a specific FD or its information.
For pollers, according to the poller, the way to manage the concurrency is
different. There is a poller loop on each thread. So the set of monitored FDs
may need to be protected. epoll and kqueue are thread-safe per-se, so there few
things to do to protect these pollers. This is not possible with select and
poll, so there is no sharing between the threads. The poller on each thread is
independant from others.
Finally, per-thread init/deinit functions are used for each pollers and for FD
part for manage thread-local ressources.
Now, you must be carefull when a FD is created during the HAProxy startup. All
update on the FD state must be made in the threads context and never before
their creation. This is mandatory because fd_updt array is thread-local and
initialized only for threads. Because there is no pollers for the main one, this
array remains uninitialized in this context. For this reason, listeners are now
enabled in run_thread_poll_loop function, just like the worker pipe.
2017-05-29 04:40:41 -04:00
|
|
|
}
|
|
|
|
|
|
2019-05-22 08:42:12 -04:00
|
|
|
/* Release the pollers per thread, to be called late. It resets the number of
 * pending updates and releases the fd_updt array.
 */
static void free_pollers_per_thread()
{
	/* forget pending updates before the array disappears */
	fd_nbupdt = 0;
	ha_free(&fd_updt);
}
|
|
|
|
|
|
2006-06-25 20:48:02 -04:00
|
|
|
/*
|
2007-04-08 10:39:58 -04:00
|
|
|
* Initialize the pollers till the best one is found.
|
|
|
|
|
* If none works, returns 0, otherwise 1.
|
2006-06-25 20:48:02 -04:00
|
|
|
*/
|
2007-04-08 10:39:58 -04:00
|
|
|
int init_pollers()
|
2006-06-25 20:48:02 -04:00
|
|
|
{
|
2007-04-08 10:39:58 -04:00
|
|
|
int p;
|
|
|
|
|
struct poller *bp;
|
2006-06-25 20:48:02 -04:00
|
|
|
|
2025-08-11 12:55:09 -04:00
|
|
|
/* always provide an aligned fdtab */
|
|
|
|
|
if ((fdtab = ha_aligned_zalloc(64, global.maxsock * sizeof(*fdtab))) == NULL) {
|
2020-10-13 09:45:07 -04:00
|
|
|
ha_alert("Not enough memory to allocate %d entries for fdtab!\n", global.maxsock);
|
2017-08-31 11:52:09 -04:00
|
|
|
goto fail_tab;
|
2020-10-13 09:45:07 -04:00
|
|
|
}
|
2025-08-11 12:55:09 -04:00
|
|
|
vma_set_name(fdtab, global.maxsock * sizeof(*fdtab), "fd", "fdtab");
|
2022-01-27 10:10:48 -05:00
|
|
|
|
2020-10-13 09:45:07 -04:00
|
|
|
if ((polled_mask = calloc(global.maxsock, sizeof(*polled_mask))) == NULL) {
|
|
|
|
|
ha_alert("Not enough memory to allocate %d entries for polled_mask!\n", global.maxsock);
|
2018-04-26 08:23:07 -04:00
|
|
|
goto fail_polledmask;
|
2020-10-13 09:45:07 -04:00
|
|
|
}
|
2024-05-21 08:30:32 -04:00
|
|
|
vma_set_name(polled_mask, global.maxsock * sizeof(*polled_mask), "fd", "polled_mask");
|
2019-01-17 03:21:39 -05:00
|
|
|
|
2020-10-13 09:45:07 -04:00
|
|
|
if ((fdinfo = calloc(global.maxsock, sizeof(*fdinfo))) == NULL) {
|
|
|
|
|
ha_alert("Not enough memory to allocate %d entries for fdinfo!\n", global.maxsock);
|
2017-08-31 11:52:09 -04:00
|
|
|
goto fail_info;
|
2020-10-13 09:45:07 -04:00
|
|
|
}
|
2024-05-21 08:30:32 -04:00
|
|
|
vma_set_name(fdinfo, global.maxsock * sizeof(*fdinfo), "fd", "fdinfo");
|
2017-08-31 11:52:09 -04:00
|
|
|
|
2022-07-08 05:33:43 -04:00
|
|
|
for (p = 0; p < MAX_TGROUPS; p++)
|
|
|
|
|
update_list[p].first = update_list[p].last = -1;
|
2006-06-25 20:48:02 -04:00
|
|
|
|
2018-01-24 12:17:56 -05:00
|
|
|
for (p = 0; p < global.maxsock; p++) {
|
|
|
|
|
/* Mark the fd as out of the fd cache */
|
2018-04-25 10:58:25 -04:00
|
|
|
fdtab[p].update.next = -3;
|
2018-01-24 12:17:56 -05:00
|
|
|
}
|
MAJOR: threads/fd: Make fd stuffs thread-safe
Many changes have been made to do so. First, the fd_updt array, where all
pending FDs for polling are stored, is now a thread-local array. Then 3 locks
have been added to protect, respectively, the fdtab array, the fd_cache array
and poll information. In addition, a lock for each entry in the fdtab array has
been added to protect all accesses to a specific FD or its information.
For pollers, according to the poller, the way to manage the concurrency is
different. There is a poller loop on each thread. So the set of monitored FDs
may need to be protected. epoll and kqueue are thread-safe per-se, so there few
things to do to protect these pollers. This is not possible with select and
poll, so there is no sharing between the threads. The poller on each thread is
independant from others.
Finally, per-thread init/deinit functions are used for each pollers and for FD
part for manage thread-local ressources.
Now, you must be carefull when a FD is created during the HAProxy startup. All
update on the FD state must be made in the threads context and never before
their creation. This is mandatory because fd_updt array is thread-local and
initialized only for threads. Because there is no pollers for the main one, this
array remains uninitialized in this context. For this reason, listeners are now
enabled in run_thread_poll_loop function, just like the worker pipe.
2017-05-29 04:40:41 -04:00
|
|
|
|
2007-04-08 10:39:58 -04:00
|
|
|
do {
|
|
|
|
|
bp = NULL;
|
|
|
|
|
for (p = 0; p < nbpollers; p++)
|
|
|
|
|
if (!bp || (pollers[p].pref > bp->pref))
|
|
|
|
|
bp = &pollers[p];
|
2006-06-25 20:48:02 -04:00
|
|
|
|
2007-04-08 10:39:58 -04:00
|
|
|
if (!bp || bp->pref == 0)
|
2006-06-25 20:48:02 -04:00
|
|
|
break;
|
|
|
|
|
|
2007-04-08 10:39:58 -04:00
|
|
|
if (bp->init(bp)) {
|
|
|
|
|
memcpy(&cur_poller, bp, sizeof(*bp));
|
|
|
|
|
return 1;
|
2006-06-25 20:48:02 -04:00
|
|
|
}
|
2007-04-08 10:39:58 -04:00
|
|
|
} while (!bp || bp->pref == 0);
|
2012-11-11 09:02:54 -05:00
|
|
|
|
2017-08-31 11:52:09 -04:00
|
|
|
free(fdinfo);
|
|
|
|
|
fail_info:
|
2018-04-26 08:23:07 -04:00
|
|
|
free(polled_mask);
|
|
|
|
|
fail_polledmask:
|
2025-08-11 12:55:09 -04:00
|
|
|
ha_aligned_free(fdtab);
|
2019-01-17 03:21:39 -05:00
|
|
|
fail_tab:
|
2012-11-11 09:02:54 -05:00
|
|
|
return 0;
|
2006-06-25 20:48:02 -04:00
|
|
|
}
|
|
|
|
|
|
[MEDIUM] Fix memory freeing at exit
New functions implemented:
- deinit_pollers: called at the end of deinit()
- prune_acl: called via list_for_each_entry_safe
Add missing pool_destroy2 calls:
- p->hdr_idx_pool
- pool2_tree64
Implement all task stopping:
- health-check: needs new "struct task" in the struct server
- queue processing: queue_mgt
- appsess_refresh: appsession_refresh
before (idle system):
==6079== LEAK SUMMARY:
==6079== definitely lost: 1,112 bytes in 75 blocks.
==6079== indirectly lost: 53,356 bytes in 2,090 blocks.
==6079== possibly lost: 52 bytes in 1 blocks.
==6079== still reachable: 150,996 bytes in 504 blocks.
==6079== suppressed: 0 bytes in 0 blocks.
after (idle system):
==6945== LEAK SUMMARY:
==6945== definitely lost: 7,644 bytes in 137 blocks.
==6945== indirectly lost: 9,913 bytes in 587 blocks.
==6945== possibly lost: 0 bytes in 0 blocks.
==6945== still reachable: 0 bytes in 0 blocks.
==6945== suppressed: 0 bytes in 0 blocks.
before (running system for ~2m):
==9343== LEAK SUMMARY:
==9343== definitely lost: 1,112 bytes in 75 blocks.
==9343== indirectly lost: 54,199 bytes in 2,122 blocks.
==9343== possibly lost: 52 bytes in 1 blocks.
==9343== still reachable: 151,128 bytes in 509 blocks.
==9343== suppressed: 0 bytes in 0 blocks.
after (running system for ~2m):
==11616== LEAK SUMMARY:
==11616== definitely lost: 7,644 bytes in 137 blocks.
==11616== indirectly lost: 9,981 bytes in 591 blocks.
==11616== possibly lost: 0 bytes in 0 blocks.
==11616== still reachable: 4 bytes in 1 blocks.
==11616== suppressed: 0 bytes in 0 blocks.
Still not perfect but significant improvement.
2008-05-29 17:53:44 -04:00
|
|
|
/*
|
|
|
|
|
* Deinitialize the pollers.
|
|
|
|
|
*/
|
|
|
|
|
void deinit_pollers() {
|
|
|
|
|
|
|
|
|
|
struct poller *bp;
|
|
|
|
|
int p;
|
|
|
|
|
|
|
|
|
|
for (p = 0; p < nbpollers; p++) {
|
|
|
|
|
bp = &pollers[p];
|
|
|
|
|
|
|
|
|
|
if (bp && bp->pref)
|
|
|
|
|
bp->term(bp);
|
|
|
|
|
}
|
MAJOR: threads/fd: Make fd stuffs thread-safe
Many changes have been made to do so. First, the fd_updt array, where all
pending FDs for polling are stored, is now a thread-local array. Then 3 locks
have been added to protect, respectively, the fdtab array, the fd_cache array
and poll information. In addition, a lock for each entry in the fdtab array has
been added to protect all accesses to a specific FD or its information.
For pollers, according to the poller, the way to manage the concurrency is
different. There is a poller loop on each thread. So the set of monitored FDs
may need to be protected. epoll and kqueue are thread-safe per-se, so there few
things to do to protect these pollers. This is not possible with select and
poll, so there is no sharing between the threads. The poller on each thread is
independant from others.
Finally, per-thread init/deinit functions are used for each pollers and for FD
part for manage thread-local ressources.
Now, you must be carefull when a FD is created during the HAProxy startup. All
update on the FD state must be made in the threads context and never before
their creation. This is mandatory because fd_updt array is thread-local and
initialized only for threads. Because there is no pollers for the main one, this
array remains uninitialized in this context. For this reason, listeners are now
enabled in run_thread_poll_loop function, just like the worker pipe.
2017-05-29 04:40:41 -04:00
|
|
|
|
2021-02-20 04:46:51 -05:00
|
|
|
ha_free(&fdinfo);
|
2025-08-11 12:55:09 -04:00
|
|
|
ha_aligned_free(fdtab);
|
2021-02-20 04:46:51 -05:00
|
|
|
ha_free(&polled_mask);
|
[MEDIUM] Fix memory freeing at exit
New functions implemented:
- deinit_pollers: called at the end of deinit())
- prune_acl: called via list_for_each_entry_safe
Add missing pool_destroy2 calls:
- p->hdr_idx_pool
- pool2_tree64
Implement all task stopping:
- health-check: needs new "struct task" in the struct server
- queue processing: queue_mgt
- appsess_refresh: appsession_refresh
before (idle system):
==6079== LEAK SUMMARY:
==6079== definitely lost: 1,112 bytes in 75 blocks.
==6079== indirectly lost: 53,356 bytes in 2,090 blocks.
==6079== possibly lost: 52 bytes in 1 blocks.
==6079== still reachable: 150,996 bytes in 504 blocks.
==6079== suppressed: 0 bytes in 0 blocks.
after (idle system):
==6945== LEAK SUMMARY:
==6945== definitely lost: 7,644 bytes in 137 blocks.
==6945== indirectly lost: 9,913 bytes in 587 blocks.
==6945== possibly lost: 0 bytes in 0 blocks.
==6945== still reachable: 0 bytes in 0 blocks.
==6945== suppressed: 0 bytes in 0 blocks.
before (running system for ~2m):
==9343== LEAK SUMMARY:
==9343== definitely lost: 1,112 bytes in 75 blocks.
==9343== indirectly lost: 54,199 bytes in 2,122 blocks.
==9343== possibly lost: 52 bytes in 1 blocks.
==9343== still reachable: 151,128 bytes in 509 blocks.
==9343== suppressed: 0 bytes in 0 blocks.
after (running system for ~2m):
==11616== LEAK SUMMARY:
==11616== definitely lost: 7,644 bytes in 137 blocks.
==11616== indirectly lost: 9,981 bytes in 591 blocks.
==11616== possibly lost: 0 bytes in 0 blocks.
==11616== still reachable: 4 bytes in 1 blocks.
==11616== suppressed: 0 bytes in 0 blocks.
Still not perfect but significant improvement.
2008-05-29 17:53:44 -04:00
|
|
|
}
|
|
|
|
|
|
2007-04-09 13:29:56 -04:00
|
|
|
/*
 * Lists the known pollers on <out>.
 * Should be performed only before initialization.
 * Pollers are listed by decreasing preference, each with the result of its
 * test() function, followed by a summary line. Always returns 0.
 */
int list_pollers(FILE *out)
{
	int p;
	int last, next;
	int usable;
	struct poller *bp;

	fprintf(out, "Available polling systems :\n");

	usable = 0;
	bp = NULL;
	/* <last> is the preference listed by the previous pass and <next> the
	 * one being selected for the current pass; -1 means "none".
	 */
	last = next = -1;
	while (1) {
		/* look for the highest preference strictly below <last>, and
		 * remember in <bp> the best candidate seen so far.
		 */
		for (p = 0; p < nbpollers; p++) {
			if ((next < 0 || pollers[p].pref > next)
			    && (last < 0 || pollers[p].pref < last)) {
				next = pollers[p].pref;
				if (!bp || (pollers[p].pref > bp->pref))
					bp = &pollers[p];
			}
		}

		/* no preference left to list */
		if (next == -1)
			break;

		/* list all pollers having this preference */
		for (p = 0; p < nbpollers; p++) {
			if (pollers[p].pref == next) {
				fprintf(out, " %10s : ", pollers[p].name);
				if (pollers[p].pref == 0)
					fprintf(out, "disabled, ");
				else
					fprintf(out, "pref=%3d, ", pollers[p].pref);
				if (pollers[p].test(&pollers[p])) {
					fprintf(out, " test result OK");
					/* disabled pollers are not counted as usable */
					if (next > 0)
						usable++;
				} else {
					fprintf(out, " test result FAILED");
					/* a failing poller cannot be the one reported
					 * as used, let a later pass pick another one.
					 */
					if (bp == &pollers[p])
						bp = NULL;
				}
				fprintf(out, "\n");
			}
		}
		last = next;
		next = -1;
	};
	fprintf(out, "Total: %d (%d usable), will use %s.\n", nbpollers, usable, bp ? bp->name : "none");
	return 0;
}
|
|
|
|
|
|
|
|
|
|
/*
|
|
|
|
|
* Some pollers may lose their connection after a fork(). It may be necessary
|
|
|
|
|
 * to create or initialize part of them again. Returns 0 in case of failure,
|
|
|
|
|
* otherwise 1. The fork() function may be NULL if unused. In case of error,
|
|
|
|
|
 * the current poller is destroyed and the caller is responsible for trying
|
|
|
|
|
* another one by calling init_pollers() again.
|
|
|
|
|
*/
|
|
|
|
|
int fork_poller()
|
|
|
|
|
{
|
2014-05-20 08:28:24 -04:00
|
|
|
int fd;
|
MINOR: fd: don't scan the full fdtab on all threads
During tests, it's pretty visible that with many threads and a large
number of FDs, the process may take time to be ready. The reason for
this is that the full fdtab array is scanned by each and every thread
at boot in fd_reregister_all() in order to make each thread-local
poller adopt the FDs that are relevant to it. The problem is that
when dealing with 1-2M FDs and 64+ threads, it starts to represent
quite a number of loops, and usually the fdtab array doesn't entirely
fit in the CPU's L3 cache, causing extra memory accesses.
It's particularly visible when issuing debugging commands to the CLI
because usually the first one fails while the CPU is at 100% for half
a second (which also is socat's timeout). A quick test with this:
global
stats socket /tmp/sock1 level admin mode 666
stats timeout 1h
maxconn 2000000
And the following script started in another window:
while ! time socat -t5 - /tmp/sock1 <<< "show version";do date -Ins;done
shows that it takes 1.58s for the socat instance that succeeds on an
Ampere Altra with 80 cores, this requires to change the timeout (defaults
to half a second) otherwise it returns nothing. In addition it also means
that during reloads, some CPU spikes will be noticed.
Adding a prefetch of the current FD + 16 improves the startup time by 30%
but that's far from being sufficient.
In practice all of this is performed at boot time, a moment at which we
know that extremely few FDs are registered (basically just the listeners),
so FD numbers are usually very low and the rest of the table is scanned
for no benefit. Ideally, knowing upfront how many FDs we have should be
sufficient.
A first approach would consist in counting the entries on a single thread
before registering pollers. It's not necessarily efficient and would take
time anyway.
This patch takes a different approach. It consists in keeping a thread-local
max ("fd_highest") that is updated whenever fd_insert() is called with a
larger number. Of course this is not correct once all threads have started,
but it will remain valid during boot since the same value is used during
startup and is cloned for each thread, and no scheduling happens anywhere
during this period, so that all threads are aware of the highest FD they've
seen registered, even if it had been done in some init code, and this without
having to deal with a shared variable.
Here on the test platform, the script gets its response in 10ms vs 1580
before.
2024-07-15 09:09:10 -04:00
|
|
|
for (fd = 0; fd < fd_highest; fd++) {
|
2014-05-20 08:28:24 -04:00
|
|
|
if (fdtab[fd].owner) {
|
2021-04-06 11:53:33 -04:00
|
|
|
HA_ATOMIC_OR(&fdtab[fd].state, FD_CLONED);
|
2014-05-20 08:28:24 -04:00
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2007-04-09 13:29:56 -04:00
|
|
|
if (cur_poller.fork) {
|
|
|
|
|
if (cur_poller.fork(&cur_poller))
|
|
|
|
|
return 1;
|
|
|
|
|
cur_poller.term(&cur_poller);
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
|
2020-06-18 02:58:47 -04:00
|
|
|
/* config parser for global "tune.fd.edge-triggered", accepts "on" or "off" */
|
|
|
|
|
static int cfg_parse_tune_fd_edge_triggered(char **args, int section_type, struct proxy *curpx,
|
2021-03-09 03:53:46 -05:00
|
|
|
const struct proxy *defpx, const char *file, int line,
|
2020-06-18 02:58:47 -04:00
|
|
|
char **err)
|
|
|
|
|
{
|
|
|
|
|
if (too_many_args(1, args, err, NULL))
|
|
|
|
|
return -1;
|
|
|
|
|
|
|
|
|
|
if (strcmp(args[1], "on") == 0)
|
|
|
|
|
global.tune.options |= GTUNE_FD_ET;
|
|
|
|
|
else if (strcmp(args[1], "off") == 0)
|
|
|
|
|
global.tune.options &= ~GTUNE_FD_ET;
|
|
|
|
|
else {
|
|
|
|
|
memprintf(err, "'%s' expects either 'on' or 'off' but got '%s'.", args[0], args[1]);
|
|
|
|
|
return -1;
|
|
|
|
|
}
|
|
|
|
|
return 0;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/* config keyword parsers */
|
|
|
|
|
static struct cfg_kw_list cfg_kws = {ILH, {
|
2021-05-08 05:06:32 -04:00
|
|
|
{ CFG_GLOBAL, "tune.fd.edge-triggered", cfg_parse_tune_fd_edge_triggered, KWF_EXPERIMENTAL },
|
2020-06-18 02:58:47 -04:00
|
|
|
{ 0, NULL, NULL }
|
|
|
|
|
}};
|
|
|
|
|
|
|
|
|
|
/* hand the keyword list above to cfg_register_keywords() at the STG_REGISTER
 * init stage
 */
INITCALL1(STG_REGISTER, cfg_register_keywords, &cfg_kws);

/* per-thread poller callbacks: alloc, init, deinit, then free */
REGISTER_PER_THREAD_ALLOC(alloc_pollers_per_thread);
REGISTER_PER_THREAD_INIT(init_pollers_per_thread);
REGISTER_PER_THREAD_DEINIT(deinit_pollers_per_thread);
REGISTER_PER_THREAD_FREE(free_pollers_per_thread);
|
2018-11-26 05:21:50 -05:00
|
|
|
|
2006-06-25 20:48:02 -04:00
|
|
|
/*
|
|
|
|
|
* Local variables:
|
|
|
|
|
* c-indent-level: 8
|
|
|
|
|
* c-basic-offset: 8
|
|
|
|
|
* End:
|
|
|
|
|
*/
|