haproxy/src/pool.c

565 lines
15 KiB
C
Raw Normal View History

/*
* Memory management functions.
*
* Copyright 2000-2007 Willy Tarreau <w@1wt.eu>
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
* 2 of the License, or (at your option) any later version.
*
*/
#include <errno.h>
#include <haproxy/activity-t.h>
#include <haproxy/api.h>
#include <haproxy/applet-t.h>
#include <haproxy/cfgparse.h>
#include <haproxy/channel.h>
#include <haproxy/cli.h>
#include <haproxy/errors.h>
#include <haproxy/global.h>
#include <haproxy/list.h>
#include <haproxy/pool.h>
#include <haproxy/stats-t.h>
#include <haproxy/stream_interface.h>
#include <haproxy/thread.h>
#include <haproxy/tools.h>
#ifdef CONFIG_HAP_POOLS
/* Per-thread accounting of the local object cache. These ones are initialized
 * per-thread on startup by init_pools(). Both are adjusted together each time
 * an object enters the local cache (pool_put_to_cache()) or is evicted from it.
 */
THREAD_LOCAL size_t pool_cache_bytes = 0; /* total cache size: sum of pool->size of the cached objects */
THREAD_LOCAL size_t pool_cache_count = 0; /* #cache objects, all pools together */
#endif
/* List of all known pools; create_pool() walks it and links new pools into it. */
static struct list pools = LIST_HEAD_INIT(pools);
/* NOTE(review): presumably the byte used to poison allocated areas, with -1
 * meaning "poisoning disabled" -- not used in this part of the file, confirm.
 */
int mem_poison_byte = -1;
#ifdef DEBUG_FAIL_ALLOC
/* NOTE(review): presumably the rate of simulated allocation failures when
 * DEBUG_FAIL_ALLOC is set (0 = never fail) -- confirm against its users.
 */
static int mem_fail_rate = 0;
#endif
/* Tries to find an existing shared pool with the same characteristics and
 * returns it, otherwise creates a new one and links it into the global <pools>
 * list. In both cases the returned pool's <users> counter is incremented.
 *
 *   <name>  : pool name, copied (possibly truncated) into the pool; may be
 *             NULL, in which case the new pool keeps an empty name.
 *   <size>  : size in bytes of the objects served by the pool.
 *   <flags> : combination of :
 *     - MEM_F_SHARED to indicate that the pool may be shared with other users
 *     - MEM_F_EXACT to indicate that the size must not be rounded up
 *
 * Returns the pool, or NULL if no memory is available for a new creation.
 */
struct pool_head *create_pool(char *name, unsigned int size, unsigned int flags)
{
	/* <name> is optional: never hand a NULL pointer to strcmp() or "%s" */
	const char *label __maybe_unused = name ? name : "";
	struct pool_head *pool;
	struct pool_head *entry;
	struct list *start;
	unsigned int align;
	int thr __maybe_unused;

	/* We need to store a (void *) at the end of the chunks. Since we know
	 * that the malloc() function will never return such a small size,
	 * let's round the size up to something slightly bigger, in order to
	 * ease merging of entries. Note that the rounding is a power of two.
	 * This extra (void *) is not accounted for in the size computation
	 * so that the visible parts outside are not affected.
	 *
	 * Note: for the LRU cache, we need to store 2 doubly-linked lists.
	 */
	if (!(flags & MEM_F_EXACT)) {
		align = 4 * sizeof(void *); // 2 lists = 4 pointers min
		size  = ((size + POOL_EXTRA + align - 1) & -align) - POOL_EXTRA;
	}

	/* TODO: thread: we do not lock pool list for now because all pools are
	 * created during HAProxy startup (so before threads creation) */
	start = &pools;
	pool = NULL;

	list_for_each_entry(entry, &pools, list) {
		if (entry->size == size) {
			/* either we can share this place and we take it, or
			 * we look for a shareable one or for the next position
			 * before which we will insert a new one.
			 */
			if ((flags & entry->flags & MEM_F_SHARED)
#ifdef DEBUG_DONT_SHARE_POOLS
			    /* debug mode: only merge pools of the same name */
			    && strcmp(label, entry->name) == 0
#endif
			    ) {
				/* we can share this one */
				pool = entry;
				DPRINTF(stderr, "Sharing %s with %s\n", label, pool->name);
				break;
			}
		}
		else if (entry->size > size) {
			/* insert before this one */
			start = &entry->list;
			break;
		}
	}

	if (!pool) {
		/* nothing to share with: allocate a zeroed pool head */
		pool = calloc(1, sizeof(*pool));
		if (!pool)
			return NULL;

		if (name)
			strlcpy2(pool->name, name, sizeof(pool->name));
		pool->size = size;
		pool->flags = flags;

		/* <start> is either the first larger entry or the list's head */
		LIST_APPEND(start, &pool->list);

#ifdef CONFIG_HAP_POOLS
		/* update per-thread pool cache if necessary */
		for (thr = 0; thr < MAX_THREADS; thr++) {
			LIST_INIT(&pool->cache[thr].list);
		}
#endif
	}
	pool->users++;
	return pool;
}
/* Asks the system's allocator for one object of pool <pool> and returns it,
 * or NULL on failure. Only the pool's <allocated> counter is checked against
 * <limit> (a zero limit means no limit) and updated; no other check is made.
 * Every failure is accounted in the calling thread's activity counters, and
 * an allocator failure is additionally accounted in the pool's <failed>.
 */
void *pool_get_from_os(struct pool_head *pool)
{
	void *area;

	/* the pool already holds as many objects as it is allowed to */
	if (pool->limit && pool->allocated >= pool->limit) {
		activity[tid].pool_fail++;
		return NULL;
	}

	area = pool_alloc_area(pool->size + POOL_EXTRA);
	if (!area) {
		/* the allocator itself refused to serve us */
		_HA_ATOMIC_INC(&pool->failed);
		activity[tid].pool_fail++;
		return NULL;
	}

	_HA_ATOMIC_INC(&pool->allocated);
	return area;
}
/* Hands the object <ptr> belonging to pool <pool> back to the operating
 * system, then atomically decrements the pool's <allocated> counter.
 */
void pool_put_to_os(struct pool_head *pool, void *ptr)
{
#ifdef DEBUG_UAF
	/* The object is going to be released for real so that any use after
	 * free gets caught. Writing to it first makes sure we crash right here
	 * on a double free or on an attempt to free a const area.
	 */
	uint32_t *mark = (uint32_t *)ptr;

	*mark = 0xDEADADD4;
#endif /* DEBUG_UAF */

	pool_free_area(ptr, pool->size + POOL_EXTRA);
	_HA_ATOMIC_DEC(&pool->allocated);
}
/* Tries to allocate an object for the pool <pool> using the system's allocator
* and directly returns it. The pool's counters are updated but the object is
* never cached, so this is usable with and without local or shared caches.
*/
void *pool_alloc_nocache(struct pool_head *pool)
MEDIUM: pools: add CONFIG_HAP_NO_GLOBAL_POOLS and CONFIG_HAP_GLOBAL_POOLS We've reached a point where the global pools represent a significant bottleneck with threads. On a 64-core machine, the performance was divided by 8 between 32 and 64 H2 connections only because there were not enough entries in the local caches to avoid picking from the global pools, and the contention on the list there was very high. It becomes obvious that we need to have an array of lists, but that will require more changes. In parallel, standard memory allocators have improved, with tcmalloc and jemalloc finding their ways through mainstream systems, and glibc having upgraded to a thread-aware ptmalloc variant, keeping this level of contention here isn't justified anymore when we have both the local per-thread pool caches and a fast process-wide allocator. For these reasons, this patch introduces a new compile time setting CONFIG_HAP_NO_GLOBAL_POOLS which is set by default when threads are enabled with thread local pool caches, and we know we have a fast thread-aware memory allocator (currently set for glibc>=2.26). In this case we entirely bypass the global pool and directly use the standard memory allocator when missing objects from the local pools. It is also possible to force it at compile time when a good allocator is used with another setup. It is still possible to re-enable the global pools using CONFIG_HAP_GLOBAL_POOLS, if a corner case is discovered regarding the operating system's default allocator, or when building with a recent libc but a different allocator which provides other benefits but does not scale well with threads.
2021-03-02 14:05:09 -05:00
{
void *ptr = NULL;
ptr = pool_get_from_os(pool);
if (!ptr)
MEDIUM: pools: add CONFIG_HAP_NO_GLOBAL_POOLS and CONFIG_HAP_GLOBAL_POOLS We've reached a point where the global pools represent a significant bottleneck with threads. On a 64-core machine, the performance was divided by 8 between 32 and 64 H2 connections only because there were not enough entries in the local caches to avoid picking from the global pools, and the contention on the list there was very high. It becomes obvious that we need to have an array of lists, but that will require more changes. In parallel, standard memory allocators have improved, with tcmalloc and jemalloc finding their ways through mainstream systems, and glibc having upgraded to a thread-aware ptmalloc variant, keeping this level of contention here isn't justified anymore when we have both the local per-thread pool caches and a fast process-wide allocator. For these reasons, this patch introduces a new compile time setting CONFIG_HAP_NO_GLOBAL_POOLS which is set by default when threads are enabled with thread local pool caches, and we know we have a fast thread-aware memory allocator (currently set for glibc>=2.26). In this case we entirely bypass the global pool and directly use the standard memory allocator when missing objects from the local pools. It is also possible to force it at compile time when a good allocator is used with another setup. It is still possible to re-enable the global pools using CONFIG_HAP_GLOBAL_POOLS, if a corner case is discovered regarding the operating system's default allocator, or when building with a recent libc but a different allocator which provides other benefits but does not scale well with threads.
2021-03-02 14:05:09 -05:00
return NULL;
swrate_add_scaled(&pool->needed_avg, POOL_AVG_SAMPLES, pool->used, POOL_AVG_SAMPLES/4);
_HA_ATOMIC_INC(&pool->used);
MEDIUM: pools: add CONFIG_HAP_NO_GLOBAL_POOLS and CONFIG_HAP_GLOBAL_POOLS We've reached a point where the global pools represent a significant bottleneck with threads. On a 64-core machine, the performance was divided by 8 between 32 and 64 H2 connections only because there were not enough entries in the local caches to avoid picking from the global pools, and the contention on the list there was very high. It becomes obvious that we need to have an array of lists, but that will require more changes. In parallel, standard memory allocators have improved, with tcmalloc and jemalloc finding their ways through mainstream systems, and glibc having upgraded to a thread-aware ptmalloc variant, keeping this level of contention here isn't justified anymore when we have both the local per-thread pool caches and a fast process-wide allocator. For these reasons, this patch introduces a new compile time setting CONFIG_HAP_NO_GLOBAL_POOLS which is set by default when threads are enabled with thread local pool caches, and we know we have a fast thread-aware memory allocator (currently set for glibc>=2.26). In this case we entirely bypass the global pool and directly use the standard memory allocator when missing objects from the local pools. It is also possible to force it at compile time when a good allocator is used with another setup. It is still possible to re-enable the global pools using CONFIG_HAP_GLOBAL_POOLS, if a corner case is discovered regarding the operating system's default allocator, or when building with a recent libc but a different allocator which provides other benefits but does not scale well with threads.
2021-03-02 14:05:09 -05:00
#ifdef DEBUG_MEMORY_POOLS
/* keep track of where the element was allocated from */
*POOL_LINK(pool, ptr) = (void *)pool;
#endif
return ptr;
}
/* Releases the object <ptr> of pool <pool> back to the OS and keeps the pool's
 * counters up to date: <used> is decremented, <needed_avg> is refreshed from
 * the new <used> value, then the area goes through pool_put_to_os(), which
 * takes care of <allocated>. This is always defined even when pools are not
 * enabled (their usage stats are maintained).
 */
void pool_free_nocache(struct pool_head *pool, void *ptr)
{
/* account for the release first so that the sample below sees the new count */
_HA_ATOMIC_DEC(&pool->used);
swrate_add(&pool->needed_avg, POOL_AVG_SAMPLES, pool->used);
/* only now give the memory back; <ptr> must not be touched afterwards */
pool_put_to_os(pool, ptr);
}
MEDIUM: pools: add CONFIG_HAP_NO_GLOBAL_POOLS and CONFIG_HAP_GLOBAL_POOLS We've reached a point where the global pools represent a significant bottleneck with threads. On a 64-core machine, the performance was divided by 8 between 32 and 64 H2 connections only because there were not enough entries in the local caches to avoid picking from the global pools, and the contention on the list there was very high. It becomes obvious that we need to have an array of lists, but that will require more changes. In parallel, standard memory allocators have improved, with tcmalloc and jemalloc finding their way into mainstream systems, and glibc having upgraded to a thread-aware ptmalloc variant. Keeping this level of contention here isn't justified anymore when we have both the local per-thread pool caches and a fast process-wide allocator. For these reasons, this patch introduces a new compile-time setting CONFIG_HAP_NO_GLOBAL_POOLS which is set by default when threads are enabled with thread local pool caches, and we know we have a fast thread-aware memory allocator (currently set for glibc>=2.26). In this case we entirely bypass the global pool and directly use the standard memory allocator when missing objects from the local pools. It is also possible to force it at compile time when a good allocator is used with another setup. It is still possible to re-enable the global pools using CONFIG_HAP_GLOBAL_POOLS, if a corner case is discovered regarding the operating system's default allocator, or when building with a recent libc but a different allocator which provides other benefits but does not scale well with threads.
2021-03-02 14:05:09 -05:00
#ifdef CONFIG_HAP_POOLS
/* Evicts some of the oldest objects of pool <pool> from the calling thread's
 * local cache and passes them to pool_put_to_shared_cache(). Eviction goes on
 * until this cache holds fewer than 16 + 1/8 of the total number of locally
 * cached objects, or until the total size of the local cache is no more than
 * 75% of its maximum (i.e. we don't want a single cache to use all the cache
 * for itself). For this, the list is scanned in reverse.
 */
void pool_evict_from_local_cache(struct pool_head *pool)
{
	struct pool_cache_head *ph = &pool->cache[tid];
	struct pool_cache_item *item;

	while (ph->count >= 16 + pool_cache_count / 8 &&
	       pool_cache_bytes > CONFIG_HAP_POOL_CACHE_SIZE * 3 / 4) {
		/* pool_put_to_cache() adds new objects with LIST_INSERT(), so
		 * the oldest one is the element preceding the list's head, not
		 * the one following it.
		 */
		item = LIST_PREV(&ph->list, typeof(item), by_pool);

		ph->count--;
		pool_cache_bytes -= pool->size;
		pool_cache_count--;

		/* unlink it from both the per-pool list and the thread's LRU */
		LIST_DELETE(&item->by_pool);
		LIST_DELETE(&item->by_lru);
		pool_put_to_shared_cache(pool, item);
	}
}
/* Evicts the least recently used objects of the calling thread's local cache,
 * whatever pool they belong to, and passes each of them to
 * pool_put_to_shared_cache(). At least one object is always evicted, and the
 * operation stops once the local cache is no larger than 7/8 of
 * CONFIG_HAP_POOL_CACHE_SIZE.
 */
void pool_evict_from_local_caches()
{
	struct pool_cache_item *victim;
	struct pool_cache_head *cache;
	struct pool_head *owner;

	for (;;) {
		/* the LRU's tail is the oldest object of this thread */
		victim = LIST_PREV(&ti->pool_lru_head, struct pool_cache_item *, by_lru);

		/* note: by definition we remove oldest objects so they also are
		 * the oldest in their own pools, thus their next is the pool's
		 * head. From this per-thread head we step back to entry 0 of
		 * the cache[] array, then to the pool embedding that array.
		 */
		cache = LIST_NEXT(&victim->by_pool, struct pool_cache_head *, list);
		owner = container_of(cache - tid, struct pool_head, cache);

		LIST_DELETE(&victim->by_pool);
		LIST_DELETE(&victim->by_lru);

		cache->count--;
		pool_cache_bytes -= owner->size;
		pool_cache_count--;

		pool_put_to_shared_cache(owner, victim);

		if (pool_cache_bytes <= CONFIG_HAP_POOL_CACHE_SIZE * 7 / 8)
			break;
	}
}
/* Frees an object to the local cache, possibly pushing oldest objects to the
* shared cache, which itself may decide to release some of them to the OS.
* While it is unspecified what the object becomes past this point, it is
* guaranteed to be released from the users' perpective.
*/
void pool_put_to_cache(struct pool_head *pool, void *ptr)
{
struct pool_cache_item *item = (struct pool_cache_item *)ptr;
struct pool_cache_head *ph = &pool->cache[tid];
LIST_INSERT(&ph->list, &item->by_pool);
LIST_INSERT(&ti->pool_lru_head, &item->by_lru);
ph->count++;
pool_cache_count++;
pool_cache_bytes += pool->size;
if (unlikely(pool_cache_bytes > CONFIG_HAP_POOL_CACHE_SIZE * 3 / 4)) {
if (ph->count >= 16 + pool_cache_count / 8)
pool_evict_from_local_cache(pool);
if (pool_cache_bytes > CONFIG_HAP_POOL_CACHE_SIZE)
pool_evict_from_local_caches();
}
}
#if defined(CONFIG_HAP_NO_GLOBAL_POOLS)
MEDIUM: pools: add CONFIG_HAP_NO_GLOBAL_POOLS and CONFIG_HAP_GLOBAL_POOLS We've reached a point where the global pools represent a significant bottleneck with threads. On a 64-core machine, the performance was divided by 8 between 32 and 64 H2 connections only because there were not enough entries in the local caches to avoid picking from the global pools, and the contention on the list there was very high. It becomes obvious that we need to have an array of lists, but that will require more changes. In parallel, standard memory allocators have improved, with tcmalloc and jemalloc finding their way into mainstream systems, and glibc having upgraded to a thread-aware ptmalloc variant. Keeping this level of contention here isn't justified anymore when we have both the local per-thread pool caches and a fast process-wide allocator. For these reasons, this patch introduces a new compile-time setting CONFIG_HAP_NO_GLOBAL_POOLS which is set by default when threads are enabled with thread local pool caches, and we know we have a fast thread-aware memory allocator (currently set for glibc>=2.26). In this case we entirely bypass the global pool and directly use the standard memory allocator when missing objects from the local pools. It is also possible to force it at compile time when a good allocator is used with another setup. It is still possible to re-enable the global pools using CONFIG_HAP_GLOBAL_POOLS, if a corner case is discovered regarding the operating system's default allocator, or when building with a recent libc but a different allocator which provides other benefits but does not scale well with threads.
2021-03-02 14:05:09 -05:00
/* legacy stuff: variant of pool_flush() built when CONFIG_HAP_NO_GLOBAL_POOLS
 * is defined. It deliberately does nothing and ignores <pool>; it only exists
 * so that callers of pool_flush() keep building in this configuration.
 */
void pool_flush(struct pool_head *pool)
{
}
/* Garbage collection entry point of the CONFIG_HAP_NO_GLOBAL_POOLS build. The
 * only thing it may do is ask the malloc library to trim its buffers, and only
 * when HA_HAVE_MALLOC_TRIM is defined; otherwise it is a no-op. <pool_ctx> is
 * not used.
 */
void pool_gc(struct pool_head *pool_ctx)
{
#ifdef HA_HAVE_MALLOC_TRIM
	malloc_trim(0);
#endif
}
#else /* CONFIG_HAP_NO_GLOBAL_POOLS */
/*
* This function frees whatever can be freed in pool <pool>.
*/
void pool_flush(struct pool_head *pool)
{
BUG/MAJOR: pools: fix possible race with free() in the lockless variant In GH issue #1275, Fabiano Nunes Parente provided a nicely detailed report showing reproducible crashes under musl. Musl is one of the libs coming with a simple allocator for which we prefer to keep the shared cache. On x86 we have a DWCAS so the lockless implementation is enabled for such libraries. And this implementation has had a small race since day one: the allocator will need to read the first object's <next> pointer to place it into the free list's head. If another thread picks the same element and immediately releases it, while both the local and the shared pools are too crowded, it will be freed to the OS. If the libc's allocator immediately releases it, the memory area is unmapped and we can have a crash while trying to read that pointer. However there is no problem as long as the item remains mapped in memory because whatever value found there will not be placed into the head since the counter will have changed. The probability for this to happen is extremely low, but as analyzed by Fabiano, it increases with the buffer size. On 16 threads it's relatively easy to reproduce with 2MB buffers above 200k req/s, where it should happen within the first 20 seconds of traffic usually. This is a structural issue for which there are two non-trivial solutions: - place a read lock in the alloc call and a barrier made of lock/unlock in the free() call to force to serialize operations; this will have a big performance impact since free() is already one of the contention points; - change the allocator to use a self-locked head, similar to what is done in the MT_LISTS. This requires two memory writes to the head instead of a single one, thus the overhead is exactly one memory write during alloc and one during free; This patch implements the second option. A new POOL_DUMMY pointer was defined for the locked pointer value, allowing to both read and lock it with a single xchg call. 
The code was carefully optimized so that the locked period remains the shortest possible and that bus writes are avoided as much as possible whenever the lock is held. Tests show that while a bit slower than the original lockless implementation on large buffers (2MB), it's 2.6 times faster than both the no-cache and the locked implementation on such large buffers, and remains as fast or faster than all the implementations when buffers are 48k or higher. Tests were also run on arm64 with similar results. Note that this code is not used on modern libcs featuring a fast allocator. A nice benefit of this change is that since it removes a dependency on the DWCAS, it will be possible to remove the locked implementation and replace it with this one, which is then usable on all systems, thus significantly increasing their performance with large buffers. Given that lockless pools were introduced in 1.9 (not supported anymore), this patch will have to be backported as far as 2.0. The code changed several times in this area and is subject to many ifdefs which will complicate the backport. What is important is to remove all the DWCAS code from the shared cache alloc/free lockless code and replace it with this one. The pool_flush() code is basically the same code as the allocator, retrieving the whole list at once. If in doubt regarding what barriers to use in older versions, it's safe to use the generic ones. This patch depends on the following previous commits: - MINOR: pools: do not maintain the lock during pool_flush() - MINOR: pools: call malloc_trim() under thread isolation - MEDIUM: pools: use a single pool_gc() function for locked and lockless The last one also removes one occurrence of an unneeded DWCAS in the code that was incompatible with this fix. The removal of the now unused seq field will happen in a future patch. Many thanks to Fabiano for his detailed report, and to Olivier for his help on this issue.
2021-06-09 12:59:58 -04:00
void *next, *temp;
if (!pool)
return;
BUG/MAJOR: pools: fix possible race with free() in the lockless variant In GH issue #1275, Fabiano Nunes Parente provided a nicely detailed report showing reproducible crashes under musl. Musl is one of the libs coming with a simple allocator for which we prefer to keep the shared cache. On x86 we have a DWCAS so the lockless implementation is enabled for such libraries. And this implementation has had a small race since day one: the allocator will need to read the first object's <next> pointer to place it into the free list's head. If another thread picks the same element and immediately releases it, while both the local and the shared pools are too crowded, it will be freed to the OS. If the libc's allocator immediately releases it, the memory area is unmapped and we can have a crash while trying to read that pointer. However there is no problem as long as the item remains mapped in memory because whatever value found there will not be placed into the head since the counter will have changed. The probability for this to happen is extremely low, but as analyzed by Fabiano, it increases with the buffer size. On 16 threads it's relatively easy to reproduce with 2MB buffers above 200k req/s, where it should happen within the first 20 seconds of traffic usually. This is a structural issue for which there are two non-trivial solutions: - place a read lock in the alloc call and a barrier made of lock/unlock in the free() call to force to serialize operations; this will have a big performance impact since free() is already one of the contention points; - change the allocator to use a self-locked head, similar to what is done in the MT_LISTS. This requires two memory writes to the head instead of a single one, thus the overhead is exactly one memory write during alloc and one during free; This patch implements the second option. A new POOL_DUMMY pointer was defined for the locked pointer value, allowing to both read and lock it with a single xchg call. 
The code was carefully optimized so that the locked period remains the shortest possible and that bus writes are avoided as much as possible whenever the lock is held. Tests show that while a bit slower than the original lockless implementation on large buffers (2MB), it's 2.6 times faster than both the no-cache and the locked implementation on such large buffers, and remains as fast or faster than the all implementations when buffers are 48k or higher. Tests were also run on arm64 with similar results. Note that this code is not used on modern libcs featuring a fast allocator. A nice benefit of this change is that since it removes a dependency on the DWCAS, it will be possible to remove the locked implementation and replace it with this one, that is then usable on all systems, thus significantly increasing their performance with large buffers. Given that lockless pools were introduced in 1.9 (not supported anymore), this patch will have to be backported as far as 2.0. The code changed several times in this area and is subject to many ifdefs which will complicate the backport. What is important is to remove all the DWCAS code from the shared cache alloc/free lockless code and replace it with this one. The pool_flush() code is basically the same code as the allocator, retrieving the whole list at once. If in doubt regarding what barriers to use in older versions, it's safe to use the generic ones. This patch depends on the following previous commits: - MINOR: pools: do not maintain the lock during pool_flush() - MINOR: pools: call malloc_trim() under thread isolation - MEDIUM: pools: use a single pool_gc() function for locked and lockless The last one also removes one occurrence of an unneeded DWCAS in the code that was incompatible with this fix. The removal of the now unused seq field will happen in a future patch. Many thanks to Fabiano for his detailed report, and to Olivier for his help on this issue.
2021-06-09 12:59:58 -04:00
/* The loop below atomically detaches the head of the free list and
* replaces it with a NULL. Then the list can be released.
*/
next = pool->free_list;
do {
BUG/MAJOR: pools: fix possible race with free() in the lockless variant In GH issue #1275, Fabiano Nunes Parente provided a nicely detailed report showing reproducible crashes under musl. Musl is one of the libs coming with a simple allocator for which we prefer to keep the shared cache. On x86 we have a DWCAS so the lockless implementation is enabled for such libraries. And this implementation has had a small race since day one: the allocator will need to read the first object's <next> pointer to place it into the free list's head. If another thread picks the same element and immediately releases it, while both the local and the shared pools are too crowded, it will be freed to the OS. If the libc's allocator immediately releases it, the memory area is unmapped and we can have a crash while trying to read that pointer. However there is no problem as long as the item remains mapped in memory because whatever value found there will not be placed into the head since the counter will have changed. The probability for this to happen is extremely low, but as analyzed by Fabiano, it increases with the buffer size. On 16 threads it's relatively easy to reproduce with 2MB buffers above 200k req/s, where it should happen within the first 20 seconds of traffic usually. This is a structural issue for which there are two non-trivial solutions: - place a read lock in the alloc call and a barrier made of lock/unlock in the free() call to force to serialize operations; this will have a big performance impact since free() is already one of the contention points; - change the allocator to use a self-locked head, similar to what is done in the MT_LISTS. This requires two memory writes to the head instead of a single one, thus the overhead is exactly one memory write during alloc and one during free; This patch implements the second option. A new POOL_DUMMY pointer was defined for the locked pointer value, allowing to both read and lock it with a single xchg call. 
The code was carefully optimized so that the locked period remains the shortest possible and that bus writes are avoided as much as possible whenever the lock is held. Tests show that while a bit slower than the original lockless implementation on large buffers (2MB), it's 2.6 times faster than both the no-cache and the locked implementation on such large buffers, and remains as fast or faster than the all implementations when buffers are 48k or higher. Tests were also run on arm64 with similar results. Note that this code is not used on modern libcs featuring a fast allocator. A nice benefit of this change is that since it removes a dependency on the DWCAS, it will be possible to remove the locked implementation and replace it with this one, that is then usable on all systems, thus significantly increasing their performance with large buffers. Given that lockless pools were introduced in 1.9 (not supported anymore), this patch will have to be backported as far as 2.0. The code changed several times in this area and is subject to many ifdefs which will complicate the backport. What is important is to remove all the DWCAS code from the shared cache alloc/free lockless code and replace it with this one. The pool_flush() code is basically the same code as the allocator, retrieving the whole list at once. If in doubt regarding what barriers to use in older versions, it's safe to use the generic ones. This patch depends on the following previous commits: - MINOR: pools: do not maintain the lock during pool_flush() - MINOR: pools: call malloc_trim() under thread isolation - MEDIUM: pools: use a single pool_gc() function for locked and lockless The last one also removes one occurrence of an unneeded DWCAS in the code that was incompatible with this fix. The removal of the now unused seq field will happen in a future patch. Many thanks to Fabiano for his detailed report, and to Olivier for his help on this issue.
2021-06-09 12:59:58 -04:00
while (unlikely(next == POOL_BUSY)) {
__ha_cpu_relax();
next = _HA_ATOMIC_LOAD(&pool->free_list);
}
if (next == NULL)
return;
} while (unlikely((next = _HA_ATOMIC_XCHG(&pool->free_list, POOL_BUSY)) == POOL_BUSY));
_HA_ATOMIC_STORE(&pool->free_list, NULL);
__ha_barrier_atomic_store();
BUG/MAJOR: pools: fix possible race with free() in the lockless variant In GH issue #1275, Fabiano Nunes Parente provided a nicely detailed report showing reproducible crashes under musl. Musl is one of the libs coming with a simple allocator for which we prefer to keep the shared cache. On x86 we have a DWCAS so the lockless implementation is enabled for such libraries. And this implementation has had a small race since day one: the allocator will need to read the first object's <next> pointer to place it into the free list's head. If another thread picks the same element and immediately releases it, while both the local and the shared pools are too crowded, it will be freed to the OS. If the libc's allocator immediately releases it, the memory area is unmapped and we can have a crash while trying to read that pointer. However there is no problem as long as the item remains mapped in memory because whatever value found there will not be placed into the head since the counter will have changed. The probability for this to happen is extremely low, but as analyzed by Fabiano, it increases with the buffer size. On 16 threads it's relatively easy to reproduce with 2MB buffers above 200k req/s, where it should happen within the first 20 seconds of traffic usually. This is a structural issue for which there are two non-trivial solutions: - place a read lock in the alloc call and a barrier made of lock/unlock in the free() call to force to serialize operations; this will have a big performance impact since free() is already one of the contention points; - change the allocator to use a self-locked head, similar to what is done in the MT_LISTS. This requires two memory writes to the head instead of a single one, thus the overhead is exactly one memory write during alloc and one during free; This patch implements the second option. A new POOL_DUMMY pointer was defined for the locked pointer value, allowing to both read and lock it with a single xchg call. 
The code was carefully optimized so that the locked period remains the shortest possible and that bus writes are avoided as much as possible whenever the lock is held. Tests show that while a bit slower than the original lockless implementation on large buffers (2MB), it's 2.6 times faster than both the no-cache and the locked implementation on such large buffers, and remains as fast or faster than the all implementations when buffers are 48k or higher. Tests were also run on arm64 with similar results. Note that this code is not used on modern libcs featuring a fast allocator. A nice benefit of this change is that since it removes a dependency on the DWCAS, it will be possible to remove the locked implementation and replace it with this one, that is then usable on all systems, thus significantly increasing their performance with large buffers. Given that lockless pools were introduced in 1.9 (not supported anymore), this patch will have to be backported as far as 2.0. The code changed several times in this area and is subject to many ifdefs which will complicate the backport. What is important is to remove all the DWCAS code from the shared cache alloc/free lockless code and replace it with this one. The pool_flush() code is basically the same code as the allocator, retrieving the whole list at once. If in doubt regarding what barriers to use in older versions, it's safe to use the generic ones. This patch depends on the following previous commits: - MINOR: pools: do not maintain the lock during pool_flush() - MINOR: pools: call malloc_trim() under thread isolation - MEDIUM: pools: use a single pool_gc() function for locked and lockless The last one also removes one occurrence of an unneeded DWCAS in the code that was incompatible with this fix. The removal of the now unused seq field will happen in a future patch. Many thanks to Fabiano for his detailed report, and to Olivier for his help on this issue.
2021-06-09 12:59:58 -04:00
while (next) {
temp = next;
next = *POOL_LINK(pool, temp);
pool_put_to_os(pool, temp);
}
/* here, we should have pool->allocated == pool->used */
}
/*
 * Garbage-collects every registered pool: objects sitting in a pool's free
 * list are handed back to the OS for as long as the number of idle objects
 * (allocated - used) stays above the pool's <minavail> threshold. The whole
 * operation runs under thread isolation; isolation is requested here only
 * when the caller is not already isolated, and released only in that case.
 * When HA_HAVE_MALLOC_TRIM is defined, malloc_trim(0) is called afterwards.
 * <pool_ctx> is unused.
 */
void pool_gc(struct pool_head *pool_ctx)
{
	struct pool_head *cur;
	int already_isolated = thread_isolated();

	if (!already_isolated)
		thread_isolate();

	list_for_each_entry(cur, &pools, list) {
		for (;;) {
			void *item = cur->free_list;

			/* nothing left to release in this pool */
			if (!item)
				break;

			/* keep at least <minavail> idle objects around */
			if ((int)(cur->allocated - cur->used) <= (int)cur->minavail)
				break;

			/* unlink the head object, then return it to the OS */
			cur->free_list = *POOL_LINK(cur, item);
			pool_put_to_os(cur, item);
		}
	}

#if defined(HA_HAVE_MALLOC_TRIM)
	malloc_trim(0);
#endif

	if (!already_isolated)
		thread_release();
}
#endif /* CONFIG_HAP_NO_GLOBAL_POOLS */
#else /* CONFIG_HAP_POOLS */
/* legacy stuff */
/* Variant built when CONFIG_HAP_POOLS is not defined: this function does
 * nothing and only exists so that callers can invoke it unconditionally.
 * <pool> is unused.
 */
void pool_flush(struct pool_head *pool)
{
	(void)pool;
}
/* Variant built when CONFIG_HAP_POOLS is not defined: the only action is to
 * ask the malloc library to trim its buffers via malloc_trim(0), and only
 * when HA_HAVE_MALLOC_TRIM is defined. Otherwise this is a no-op.
 * <pool_ctx> is unused.
 */
void pool_gc(struct pool_head *pool_ctx)
{
	(void)pool_ctx;
#ifdef HA_HAVE_MALLOC_TRIM
	malloc_trim(0);
#endif
}
#endif /* CONFIG_HAP_POOLS */
/*
 * Tries to destroy <pool>. The pool is first flushed; if it still has objects
 * in use afterwards, nothing else is done and the pool pointer is returned so
 * that the caller knows it is still alive. Otherwise the user count is
 * decremented and, once it reaches zero, the pool is unlinked from the pools
 * list and freed. NULL is returned in every case where the pool is not
 * reported as still in use (including when <pool> is NULL), which eases
 * clearing the caller's pointer.
 */
void *pool_destroy(struct pool_head *pool)
{
	if (!pool)
		return NULL;

	pool_flush(pool);

	/* still referenced by live objects: leave it untouched */
	if (pool->used)
		return pool;

	pool->users--;
	if (pool->users)
		return NULL;

	/* last user gone; with used == 0 the cache is empty as well */
	LIST_DELETE(&pool->list);
	free(pool);
	return NULL;
}
/* This destroys all pools on exit. It is *not* thread safe. */
void pool_destroy_all()
{
struct pool_head *entry, *back;
list_for_each_entry_safe(entry, back, &pools, list)
pool_destroy(entry);
}
/* Dumps memory usage information into the trash buffer: one line per pool
 * (name, object size, allocated/used counts, average need, failures, users,
 * address and a " [SHARED]" marker when MEM_F_SHARED is set), followed by a
 * line of totals.
 *
 * Byte counts are computed as unsigned long: the per-pool counters are
 * printed with %u and would otherwise be multiplied using unsigned int
 * arithmetic, silently wrapping as soon as a single pool reaches 4 GiB.
 */
void dump_pools_to_trash()
{
	struct pool_head *entry;
	unsigned long allocated = 0;
	unsigned long used = 0;
	int nbpools = 0;

	chunk_printf(&trash, "Dumping pools usage. Use SIGQUIT to flush them.\n");
	list_for_each_entry(entry, &pools, list) {
		/* widen one operand first so that the product is not truncated */
		unsigned long alloc_bytes = (unsigned long)entry->size * entry->allocated;
		unsigned long used_bytes  = (unsigned long)entry->size * entry->used;

		chunk_appendf(&trash, " - Pool %s (%u bytes) : %u allocated (%lu bytes), %u used, needed_avg %u, %u failures, %u users, @%p%s\n",
		              entry->name, entry->size, entry->allocated,
		              alloc_bytes, entry->used,
		              swrate_avg(entry->needed_avg, POOL_AVG_SAMPLES), entry->failed,
		              entry->users, entry,
		              (entry->flags & MEM_F_SHARED) ? " [SHARED]" : "");

		allocated += alloc_bytes;
		used += used_bytes;
		nbpools++;
	}
	chunk_appendf(&trash, "Total: %d pools, %lu bytes allocated, %lu used.\n",
	              nbpools, allocated, used);
}
/* Builds the pools usage report into the trash buffer using
 * dump_pools_to_trash(), then writes the resulting text to stderr.
 */
void dump_pools(void)
{
	const char *report;

	dump_pools_to_trash();

	/* only look at the buffer once the report has been produced */
	report = trash.area;
	qfprintf(stderr, "%s", report);
}
/* This function returns the total number of failed pool allocations */
int pool_total_failures()
{
struct pool_head *entry;
int failed = 0;
list_for_each_entry(entry, &pools, list)
failed += entry->failed;
return failed;
}
/* Returns the total amount of memory allocated in pools (in bytes), i.e. the
 * sum over all pools of <allocated> objects times the pool's object <size>.
 * One operand is widened to unsigned long before multiplying so that a pool
 * holding 4 GiB or more does not wrap in narrower integer arithmetic before
 * being accumulated.
 */
unsigned long pool_total_allocated()
{
	struct pool_head *entry;
	unsigned long allocated = 0;

	list_for_each_entry(entry, &pools, list)
		allocated += (unsigned long)entry->allocated * entry->size;
	return allocated;
}
/* Returns the total amount of memory used in pools (in bytes), i.e. the sum
 * over all pools of <used> objects times the pool's object <size>. One
 * operand is widened to unsigned long before multiplying so that a pool
 * using 4 GiB or more does not wrap in narrower integer arithmetic before
 * being accumulated.
 */
unsigned long pool_total_used()
{
	struct pool_head *entry;
	unsigned long used = 0;

	list_for_each_entry(entry, &pools, list)
		used += (unsigned long)entry->used * entry->size;
	return used;
}
/* CLI I/O handler for the pools dump. The report is rebuilt into the trash
 * buffer on each call and pushed to the input channel of the applet's stream
 * interface. If ci_putchk() returns -1, the stream interface is flagged as
 * waiting for room and 0 is returned so that the dump is not considered
 * complete; otherwise 1 is returned. No state is kept between calls.
 */
static int cli_io_handler_dump_pools(struct appctx *appctx)
{
	struct stream_interface *si = appctx->owner;

	dump_pools_to_trash();

	if (ci_putchk(si_ic(si), &trash) != -1)
		return 1;

	/* the report could not be pushed to the channel */
	si_rx_room_blk(si);
	return 0;
}
/* Callback used to create the early pool <name> of size <size> as a shared
 * pool (MEM_F_SHARED) and to store the resulting pointer into <ptr>. If the
 * creation fails, an alert is emitted and the process exits with status 1.
 */
void create_pool_callback(struct pool_head **ptr, char *name, unsigned int size)
{
	*ptr = create_pool(name, size, MEM_F_SHARED);
	if (*ptr)
		return;

	ha_alert("Failed to allocate pool '%s' of size %u : %s. Aborting.\n",
	         name, size, strerror(errno));
	exit(1);
}
/* Startup initialization of the per-thread arrays: when CONFIG_HAP_POOLS is
 * defined, the <pool_lru_head> list of each of the MAX_THREADS entries of
 * ha_thread_info[] is initialized. Without CONFIG_HAP_POOLS nothing is done.
 * Registered below to run at the STG_PREPARE init stage.
 */
static void init_pools()
{
#ifdef CONFIG_HAP_POOLS
	int thr = 0;

	while (thr < MAX_THREADS) {
		LIST_INIT(&ha_thread_info[thr].pool_lru_head);
		thr++;
	}
#endif
}
INITCALL0(STG_PREPARE, init_pools);
/* register cli keywords */
/* Keyword table handed to cli_register_kw() at the STG_REGISTER init stage.
 * It holds a single "show pools" entry whose help text is the string below
 * and which references cli_io_handler_dump_pools.
 * NOTE(review): the NULL between the help text and the handler is presumably
 * an optional parser callback, and the trailing {{},} entry presumably marks
 * the end of the list -- confirm against the struct cli_kw definition.
 */
static struct cli_kw_list cli_kws = {{ },{
{ { "show", "pools", NULL }, "show pools : report information about the memory pools usage", NULL, cli_io_handler_dump_pools },
{{},}
}};
INITCALL1(STG_REGISTER, cli_register_kw, &cli_kws);
#ifdef DEBUG_FAIL_ALLOC
/* Tells whether an allocation should be made to fail on purpose. Returns 0
 * when <mem_fail_rate> is not strictly positive or while the process is in
 * MODE_STARTING; otherwise returns 1 when <mem_fail_rate> is greater than the
 * value drawn from statistical_prng_range(100), else 0. <pool> is unused.
 */
int mem_should_fail(const struct pool_head *pool)
{
	/* failure injection is disabled, or we are still starting up */
	if (mem_fail_rate <= 0 || (global.mode & MODE_STARTING))
		return 0;

	return (mem_fail_rate > statistical_prng_range(100)) ? 1 : 0;
}
/* Config parser for the global "tune.fail-alloc" keyword. It expects exactly
 * one argument, an integer between 0 and 100, which is stored into
 * <mem_fail_rate>. Returns 0 on success, or -1 with <err> filled on error.
 *
 * The value is parsed with strtol() rather than atoi() so that an empty,
 * non-numeric or partially numeric argument (e.g. "abc", "50x") is rejected
 * instead of being silently turned into a number. <mem_fail_rate> is only
 * updated once the value is known to be valid, so a rejected value never
 * stays in place.
 */
static int mem_parse_global_fail_alloc(char **args, int section_type, struct proxy *curpx,
                                       const struct proxy *defpx, const char *file, int line,
                                       char **err)
{
	char *end;
	long rate;

	if (too_many_args(1, args, err, NULL))
		return -1;

	errno = 0;
	rate = strtol(args[1], &end, 10);

	/* reject: no digits at all, trailing garbage, overflow, out of range */
	if (end == args[1] || *end != '\0' || errno == ERANGE ||
	    rate < 0 || rate > 100) {
		memprintf(err, "'%s' expects a numeric value between 0 and 100.", args[0]);
		return -1;
	}

	mem_fail_rate = (int)rate;
	return 0;
}
#endif
/* register global config keywords */
/* Keyword table handed to cfg_register_keywords() at the STG_REGISTER init
 * stage. The "tune.fail-alloc" entry, tied to the CFG_GLOBAL section and
 * parsed by mem_parse_global_fail_alloc, is only present when
 * DEBUG_FAIL_ALLOC is defined; otherwise the table only contains the
 * all-zero/NULL entry.
 * NOTE(review): that { 0, NULL, NULL } entry is presumably the end-of-list
 * marker -- confirm against the struct cfg_kw_list definition.
 */
static struct cfg_kw_list mem_cfg_kws = {ILH, {
#ifdef DEBUG_FAIL_ALLOC
{ CFG_GLOBAL, "tune.fail-alloc", mem_parse_global_fail_alloc },
#endif
{ 0, NULL, NULL }
}};
INITCALL1(STG_REGISTER, cfg_register_keywords, &mem_cfg_kws);
/*
* Local variables:
* c-indent-level: 8
* c-basic-offset: 8
* End:
*/