Reduce the overhead from malloc usable (#14074)

Currently, in the zmalloc and zfree family of functions, we rely on
`je_malloc_usable_size()` to obtain the usable size of a pointer, either
for memory statistics or to return it to the caller. However, this
function is relatively expensive, as it involves locking and radix-tree
(rtree) lookups inside jemalloc. Reducing the frequency of these calls
yields significant performance improvements.

---------

Co-authored-by: oranagra <oran@redislabs.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Authored by debing.sun on 2025-06-12 22:31:26 +08:00, committed by GitHub
parent b1d202fb23
commit abaed0d54c
7 changed files with 160 additions and 55 deletions
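
The saving the commit message describes can be made concrete with a
minimal sketch in plain C (not part of the diff; `je_malloc_with_usize`
is the new entry point, using the je_-prefixed name as declared on the
Redis side in zmalloc.c below):

    #include <stddef.h>

    /* jemalloc entry points (abbreviated declarations). */
    void *je_malloc(size_t size);
    size_t je_malloc_usable_size(const void *ptr);
    void *je_malloc_with_usize(size_t size, size_t *usize);

    /* Before: two trips into the allocator. The second call must re-derive
     * the extent metadata for ptr (rtree lookup, possible locking) even
     * though jemalloc already knew the size class while allocating. */
    static void *alloc_before(size_t size, size_t *usable) {
        void *ptr = je_malloc(size);
        if (ptr) *usable = je_malloc_usable_size(ptr); /* extra lookup */
        return ptr;
    }

    /* After: one trip. The usable size is written through an out-parameter
     * on the allocation path itself. */
    static void *alloc_after(size_t size, size_t *usable) {
        return je_malloc_with_usize(size, usable);
    }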

deps/jemalloc/include/jemalloc/internal/jemalloc_internal_externs.h

@@ -70,6 +70,6 @@ void jemalloc_prefork(void);
 void jemalloc_postfork_parent(void);
 void jemalloc_postfork_child(void);
 void je_sdallocx_noflags(void *ptr, size_t size);
-void *malloc_default(size_t size);
+void *malloc_default(size_t size, size_t *usize);

 #endif /* JEMALLOC_INTERNAL_EXTERNS_H */

deps/jemalloc/include/jemalloc/internal/jemalloc_internal_inlines_c.h

@@ -255,15 +255,15 @@ malloc_initialized(void) {
  * tail-call to the slowpath if they fire.
  */
 JEMALLOC_ALWAYS_INLINE void *
-imalloc_fastpath(size_t size, void *(fallback_alloc)(size_t)) {
+imalloc_fastpath(size_t size, void *(fallback_alloc)(size_t, size_t *), size_t *usable_size) {
 	LOG("core.malloc.entry", "size: %zu", size);
 	if (tsd_get_allocates() && unlikely(!malloc_initialized())) {
-		return fallback_alloc(size);
+		return fallback_alloc(size, usable_size);
 	}

 	tsd_t *tsd = tsd_get(false);
 	if (unlikely((size > SC_LOOKUP_MAXCLASS) || tsd == NULL)) {
-		return fallback_alloc(size);
+		return fallback_alloc(size, usable_size);
 	}
 	/*
 	 * The code below till the branch checking the next_event threshold may
@@ -307,7 +307,7 @@ imalloc_fastpath(size_t size, void *(fallback_alloc)(size_t)) {
 	 * 0) in a single branch.
 	 */
 	if (unlikely(allocated_after >= threshold)) {
-		return fallback_alloc(size);
+		return fallback_alloc(size, usable_size);
 	}

 	assert(tsd_fast(tsd));
@@ -326,15 +326,17 @@ imalloc_fastpath(size_t size, void *(fallback_alloc)(size_t)) {
 	ret = cache_bin_alloc_easy(bin, &tcache_success);
 	if (tcache_success) {
 		fastpath_success_finish(tsd, allocated_after, bin, ret);
+		if (usable_size) *usable_size = usize;
 		return ret;
 	}
 	ret = cache_bin_alloc(bin, &tcache_success);
 	if (tcache_success) {
 		fastpath_success_finish(tsd, allocated_after, bin, ret);
+		if (usable_size) *usable_size = usize;
 		return ret;
 	}

-	return fallback_alloc(size);
+	return fallback_alloc(size, usable_size);
 }

 JEMALLOC_ALWAYS_INLINE int

deps/jemalloc/include/jemalloc/jemalloc_macros.h.in

@@ -151,3 +151,6 @@
 /* This version of Jemalloc, modified for Redis, has the je_get_defrag_hint()
  * function. */
 #define JEMALLOC_FRAG_HINT
+
+/* This version of Jemalloc, modified for Redis, has the je_*_usable() family functions. */
+#define JEMALLOC_ALLOC_WITH_USIZE

deps/jemalloc/src/jemalloc.c

@@ -2697,7 +2697,7 @@ imalloc(static_opts_t *sopts, dynamic_opts_t *dopts) {

 JEMALLOC_NOINLINE
 void *
-malloc_default(size_t size) {
+malloc_default(size_t size, size_t *usize) {
 	void *ret;
 	static_opts_t sopts;
 	dynamic_opts_t dopts;
@@ -2731,6 +2731,7 @@ malloc_default(size_t size) {
 	LOG("core.malloc.exit", "result: %p", ret);
+	if (usize) *usize = dopts.usize;

 	return ret;
 }
@@ -2739,11 +2740,15 @@ malloc_default(size_t size) {
  * Begin malloc(3)-compatible functions.
  */

+static inline void *je_malloc_internal(size_t size, size_t *usize) {
+	return imalloc_fastpath(size, &malloc_default, usize);
+}
+
 JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN
 void JEMALLOC_NOTHROW *
 JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1)
 je_malloc(size_t size) {
-	return imalloc_fastpath(size, &malloc_default);
+	return je_malloc_internal(size, NULL);
 }

 JEMALLOC_EXPORT int JEMALLOC_NOTHROW
@@ -2826,10 +2831,7 @@ je_aligned_alloc(size_t alignment, size_t size) {
 	return ret;
 }

-JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN
-void JEMALLOC_NOTHROW *
-JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE2(1, 2)
-je_calloc(size_t num, size_t size) {
+static void *je_calloc_internal(size_t num, size_t size, size_t *usize) {
 	void *ret;
 	static_opts_t sopts;
 	dynamic_opts_t dopts;
@@ -2857,11 +2859,19 @@ je_calloc(size_t num, size_t size) {
 	LOG("core.calloc.exit", "result: %p", ret);
+	if (usize) *usize = dopts.usize;

 	return ret;
 }

+JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN
+void JEMALLOC_NOTHROW *
+JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE2(1, 2)
+je_calloc(size_t num, size_t size) {
+	return je_calloc_internal(num, size, NULL);
+}
+
 JEMALLOC_ALWAYS_INLINE void
-ifree(tsd_t *tsd, void *ptr, tcache_t *tcache, bool slow_path) {
+ifree(tsd_t *tsd, void *ptr, tcache_t *tcache, bool slow_path, size_t *usable) {
 	if (!slow_path) {
 		tsd_assert_fast(tsd);
 	}
@@ -2894,6 +2904,7 @@ ifree(tsd_t *tsd, void *ptr, tcache_t *tcache, bool slow_path) {
 		    true);
 	}
 	thread_dalloc_event(tsd, usize);
+	if (usable) *usable = usize;
 }

 JEMALLOC_ALWAYS_INLINE bool
@@ -2993,7 +3004,7 @@ isfree(tsd_t *tsd, void *ptr, size_t usize, tcache_t *tcache, bool slow_path) {

 JEMALLOC_NOINLINE
 void
-free_default(void *ptr) {
+free_default(void *ptr, size_t *usize) {
 	UTRACE(ptr, 0, 0);
 	if (likely(ptr != NULL)) {
 		/*
@@ -3011,14 +3022,14 @@ free_default(void *ptr) {
 			tcache_t *tcache = tcache_get_from_ind(tsd,
 			    TCACHE_IND_AUTOMATIC, /* slow */ false,
 			    /* is_alloc */ false);
-			ifree(tsd, ptr, tcache, /* slow */ false);
+			ifree(tsd, ptr, tcache, /* slow */ false, usize);
 		} else {
 			tcache_t *tcache = tcache_get_from_ind(tsd,
 			    TCACHE_IND_AUTOMATIC, /* slow */ true,
 			    /* is_alloc */ false);
 			uintptr_t args_raw[3] = {(uintptr_t)ptr};
 			hook_invoke_dalloc(hook_dalloc_free, ptr, args_raw);
-			ifree(tsd, ptr, tcache, /* slow */ true);
+			ifree(tsd, ptr, tcache, /* slow */ true, usize);
 		}
 		check_entry_exit_locking(tsd_tsdn(tsd));
@@ -3062,7 +3073,7 @@ free_fastpath_nonfast_aligned(void *ptr, bool check_prof) {

 /* Returns whether or not the free attempt was successful. */
 JEMALLOC_ALWAYS_INLINE
-bool free_fastpath(void *ptr, size_t size, bool size_hint) {
+bool free_fastpath(void *ptr, size_t size, bool size_hint, size_t *usable_size) {
 	tsd_t *tsd = tsd_get(false);
 	/* The branch gets optimized away unless tsd_get_allocates(). */
 	if (unlikely(tsd == NULL)) {
@@ -3131,6 +3142,7 @@ bool free_fastpath(void *ptr, size_t size, bool size_hint) {
 		bool fail = maybe_check_alloc_ctx(tsd, ptr, &alloc_ctx);
 		if (fail) {
 			/* See the comment in isfree. */
+			if (usable_size) *usable_size = usize;
 			return true;
 		}
@@ -3151,18 +3163,23 @@ bool free_fastpath(void *ptr, size_t size, bool size_hint) {
 	*tsd_thread_deallocatedp_get(tsd) = deallocated_after;

+	if (usable_size) *usable_size = usize;
 	return true;
 }

+static inline void je_free_internal(void *ptr, size_t *usize) {
+	LOG("core.free.entry", "ptr: %p", ptr);
+
+	if (!free_fastpath(ptr, 0, false, usize)) {
+		free_default(ptr, usize);
+	}
+
+	LOG("core.free.exit", "");
+}
+
 JEMALLOC_EXPORT void JEMALLOC_NOTHROW
 je_free(void *ptr) {
-	LOG("core.free.entry", "ptr: %p", ptr);
-
-	if (!free_fastpath(ptr, 0, false)) {
-		free_default(ptr);
-	}
-
-	LOG("core.free.exit", "");
+	je_free_internal(ptr, NULL);
 }

 /*
@@ -3490,7 +3507,7 @@ irallocx_prof(tsd_t *tsd, void *old_ptr, size_t old_usize, size_t size,
 }

 static void *
-do_rallocx(void *ptr, size_t size, int flags, bool is_realloc) {
+do_rallocx(void *ptr, size_t size, int flags, bool is_realloc, size_t *old_usable_size, size_t *new_usable_size) {
 	void *p;
 	tsd_t *tsd;
 	size_t usize;
@@ -3555,6 +3572,8 @@ do_rallocx(void *ptr, size_t size, int flags, bool is_realloc) {
 		junk_alloc_callback(excess_start, excess_len);
 	}

+	if (old_usable_size) *old_usable_size = old_usize;
+	if (new_usable_size) *new_usable_size = usize;
 	return p;
 label_oom:
 	if (config_xmalloc && unlikely(opt_xmalloc)) {
@@ -3573,13 +3592,13 @@ JEMALLOC_ALLOC_SIZE(2)
 je_rallocx(void *ptr, size_t size, int flags) {
 	LOG("core.rallocx.entry", "ptr: %p, size: %zu, flags: %d", ptr,
 	    size, flags);
-	void *ret = do_rallocx(ptr, size, flags, false);
+	void *ret = do_rallocx(ptr, size, flags, false, NULL, NULL);
 	LOG("core.rallocx.exit", "result: %p", ret);
 	return ret;
 }

 static void *
-do_realloc_nonnull_zero(void *ptr) {
+do_realloc_nonnull_zero(void *ptr, size_t *old_usize, size_t *new_usize) {
 	if (config_stats) {
 		atomic_fetch_add_zu(&zero_realloc_count, 1, ATOMIC_RELAXED);
 	}
@@ -3590,7 +3609,7 @@ do_realloc_nonnull_zero(void *ptr) {
 		 * reduce the harm, and turn off the tcache while allocating, so
 		 * that we'll get a true first fit.
 		 */
-		return do_rallocx(ptr, 1, MALLOCX_TCACHE_NONE, true);
+		return do_rallocx(ptr, 1, MALLOCX_TCACHE_NONE, true, old_usize, new_usize);
 	} else if (opt_zero_realloc_action == zero_realloc_action_free) {
 		UTRACE(ptr, 0, 0);
 		tsd_t *tsd = tsd_fetch();
@@ -3601,7 +3620,10 @@ do_realloc_nonnull_zero(void *ptr) {
 		    /* is_alloc */ false);
 		uintptr_t args[3] = {(uintptr_t)ptr, 0};
 		hook_invoke_dalloc(hook_dalloc_realloc, ptr, args);
-		ifree(tsd, ptr, tcache, true);
+		size_t usize;
+		ifree(tsd, ptr, tcache, true, &usize);
+		if (old_usize) *old_usize = usize;
+		if (new_usize) *new_usize = 0;

 		check_entry_exit_locking(tsd_tsdn(tsd));
 		return NULL;
@@ -3617,18 +3639,15 @@ do_realloc_nonnull_zero(void *ptr) {
 	}
 }

-JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN
-void JEMALLOC_NOTHROW *
-JEMALLOC_ALLOC_SIZE(2)
-je_realloc(void *ptr, size_t size) {
+static inline void *je_realloc_internal(void *ptr, size_t size, size_t *old_usize, size_t *new_usize) {
 	LOG("core.realloc.entry", "ptr: %p, size: %zu\n", ptr, size);

 	if (likely(ptr != NULL && size != 0)) {
-		void *ret = do_rallocx(ptr, size, 0, true);
+		void *ret = do_rallocx(ptr, size, 0, true, old_usize, new_usize);
 		LOG("core.realloc.exit", "result: %p", ret);
 		return ret;
 	} else if (ptr != NULL && size == 0) {
-		void *ret = do_realloc_nonnull_zero(ptr);
+		void *ret = do_realloc_nonnull_zero(ptr, old_usize, new_usize);
 		LOG("core.realloc.exit", "result: %p", ret);
 		return ret;
 	} else {
@@ -3657,10 +3676,19 @@ je_realloc(void *ptr, size_t size) {
 			    (uintptr_t)ret, args);
 		}
 		LOG("core.realloc.exit", "result: %p", ret);
+		if (old_usize) *old_usize = 0;
+		if (new_usize) *new_usize = dopts.usize;
 		return ret;
 	}
 }

+JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN
+void JEMALLOC_NOTHROW *
+JEMALLOC_ALLOC_SIZE(2)
+je_realloc(void *ptr, size_t size) {
+	return je_realloc_internal(ptr, size, NULL, NULL);
+}
+
 JEMALLOC_ALWAYS_INLINE size_t
 ixallocx_helper(tsdn_t *tsdn, void *ptr, size_t old_usize, size_t size,
     size_t extra, size_t alignment, bool zero) {
@@ -3883,11 +3911,11 @@ je_dallocx(void *ptr, int flags) {
 	UTRACE(ptr, 0, 0);
 	if (likely(fast)) {
 		tsd_assert_fast(tsd);
-		ifree(tsd, ptr, tcache, false);
+		ifree(tsd, ptr, tcache, false, NULL);
 	} else {
 		uintptr_t args_raw[3] = {(uintptr_t)ptr, flags};
 		hook_invoke_dalloc(hook_dalloc_dallocx, ptr, args_raw);
-		ifree(tsd, ptr, tcache, true);
+		ifree(tsd, ptr, tcache, true, NULL);
 	}

 	check_entry_exit_locking(tsd_tsdn(tsd));
@@ -3935,7 +3963,7 @@ je_sdallocx(void *ptr, size_t size, int flags) {
 	LOG("core.sdallocx.entry", "ptr: %p, size: %zu, flags: %d", ptr,
 	    size, flags);

-	if (flags != 0 || !free_fastpath(ptr, size, true)) {
+	if (flags != 0 || !free_fastpath(ptr, size, true, NULL)) {
 		sdallocx_default(ptr, size, flags);
 	}
@@ -3947,7 +3975,7 @@ je_sdallocx_noflags(void *ptr, size_t size) {
 	LOG("core.sdallocx.entry", "ptr: %p, size: %zu, flags: 0", ptr,
 	    size);

-	if (!free_fastpath(ptr, size, true)) {
+	if (!free_fastpath(ptr, size, true, NULL)) {
 		sdallocx_default(ptr, size, 0);
 	}
@@ -4483,3 +4511,29 @@ get_defrag_hint(void* ptr) {
 	assert(ptr != NULL);
 	return iget_defrag_hint(TSDN_NULL, ptr);
 }
+
+JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN
+void JEMALLOC_NOTHROW *
+JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE(1)
+malloc_with_usize(size_t size, size_t *usize) {
+	return je_malloc_internal(size, usize);
+}
+
+JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN
+void JEMALLOC_NOTHROW *
+JEMALLOC_ATTR(malloc) JEMALLOC_ALLOC_SIZE2(1, 2)
+calloc_with_usize(size_t num, size_t size, size_t *usize) {
+	return je_calloc_internal(num, size, usize);
+}
+
+JEMALLOC_EXPORT JEMALLOC_ALLOCATOR JEMALLOC_RESTRICT_RETURN
+void JEMALLOC_NOTHROW *
+JEMALLOC_ALLOC_SIZE(2)
+realloc_with_usize(void *ptr, size_t size, size_t *old_usize, size_t *new_usize) {
+	return je_realloc_internal(ptr, size, old_usize, new_usize);
+}
+
+JEMALLOC_EXPORT void JEMALLOC_NOTHROW
+free_with_usize(void *ptr, size_t *usize) {
+	je_free_internal(ptr, usize);
+}
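
For reference, a sketch of how a caller can drive these wrappers for
memory accounting. The `tracked_*` helpers and the `used_memory` counter
are illustrative only (not part of the commit), and the je_-prefixed
names follow the declarations zmalloc.c adds below:

    #include <stddef.h>
    #include <stdatomic.h>

    /* Declarations as consumed on the Redis side (see zmalloc.c below). */
    void *je_malloc_with_usize(size_t size, size_t *usize);
    void je_free_with_usize(void *ptr, size_t *usize);

    static _Atomic size_t used_memory; /* illustrative stats counter */

    static void *tracked_alloc(size_t size) {
        size_t usize;
        void *ptr = je_malloc_with_usize(size, &usize);
        /* usize is valid only on success; no je_malloc_usable_size() needed. */
        if (ptr) atomic_fetch_add_explicit(&used_memory, usize, memory_order_relaxed);
        return ptr;
    }

    static void tracked_free(void *ptr) {
        if (ptr == NULL) return;
        size_t usize;
        je_free_with_usize(ptr, &usize); /* the free itself reports the usable size */
        atomic_fetch_sub_explicit(&used_memory, usize, memory_order_relaxed);
    }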

deps/jemalloc/src/jemalloc_cpp.cpp

@@ -94,8 +94,8 @@ handleOOM(std::size_t size, bool nothrow) {
 template <bool IsNoExcept>
 JEMALLOC_NOINLINE
 static void *
-fallback_impl(std::size_t size) noexcept(IsNoExcept) {
-	void *ptr = malloc_default(size);
+fallback_impl(std::size_t size, std::size_t *usize) noexcept(IsNoExcept) {
+	void *ptr = malloc_default(size, NULL);
 	if (likely(ptr != nullptr)) {
 		return ptr;
 	}
@@ -106,7 +106,7 @@ template <bool IsNoExcept>
 JEMALLOC_ALWAYS_INLINE
 void *
 newImpl(std::size_t size) noexcept(IsNoExcept) {
-	return imalloc_fastpath(size, &fallback_impl<IsNoExcept>);
+	return imalloc_fastpath(size, &fallback_impl<IsNoExcept>, NULL);
 }

 void *

src/zmalloc.c

@@ -67,6 +67,16 @@ void zlibc_free(void *ptr) {
 #define mallocx(size,flags) je_mallocx(size,flags)
 #define rallocx(ptr,size,flags) je_rallocx(ptr,size,flags)
 #define dallocx(ptr,flags) je_dallocx(ptr,flags)
+#if defined(HAVE_ALLOC_WITH_USIZE)
+void *je_malloc_with_usize(size_t size, size_t *usize);
+void *je_calloc_with_usize(size_t num, size_t size, size_t *usize);
+void *je_realloc_with_usize(void *ptr, size_t size, size_t *old_usize, size_t *new_usize);
+void je_free_with_usize(void *ptr, size_t *usize);
+#define malloc_with_usize(size,usize) je_malloc_with_usize(size,usize)
+#define calloc_with_usize(num,size,usize) je_calloc_with_usize(num,size,usize)
+#define realloc_with_usize(ptr,size,old_usize,new_usize) je_realloc_with_usize(ptr,size,old_usize,new_usize)
+#define free_with_usize(ptr,usize) je_free_with_usize(ptr,usize)
+#endif
 #endif

 #define MAX_THREADS 16 /* Keep it a power of 2 so we can use '&' instead of '%'. */
@@ -119,10 +129,17 @@ void *extend_to_usable(void *ptr, size_t size) {
 static inline void *ztrymalloc_usable_internal(size_t size, size_t *usable) {
     /* Possible overflow, return NULL, so that the caller can panic or handle a failed allocation. */
     if (size >= SIZE_MAX/2) return NULL;
+#ifdef HAVE_ALLOC_WITH_USIZE
+    void *ptr = malloc_with_usize(MALLOC_MIN_SIZE(size)+PREFIX_SIZE, &size);
+#else
     void *ptr = malloc(MALLOC_MIN_SIZE(size)+PREFIX_SIZE);
+#endif
     if (!ptr) return NULL;

-#ifdef HAVE_MALLOC_SIZE
+#ifdef HAVE_ALLOC_WITH_USIZE
+    update_zmalloc_stat_alloc(size);
+    if (usable) *usable = size;
+    return ptr;
+#elif HAVE_MALLOC_SIZE
     size = zmalloc_size(ptr);
     update_zmalloc_stat_alloc(size);
     if (usable) *usable = size;
@@ -243,10 +260,18 @@ void zfree_no_tcache(void *ptr) {
 static inline void *ztrycalloc_usable_internal(size_t size, size_t *usable) {
     /* Possible overflow, return NULL, so that the caller can panic or handle a failed allocation. */
     if (size >= SIZE_MAX/2) return NULL;
+#ifdef HAVE_ALLOC_WITH_USIZE
+    void *ptr = calloc_with_usize(1, MALLOC_MIN_SIZE(size)+PREFIX_SIZE, &size);
+#else
     void *ptr = calloc(1, MALLOC_MIN_SIZE(size)+PREFIX_SIZE);
+#endif
     if (ptr == NULL) return NULL;

-#ifdef HAVE_MALLOC_SIZE
+#ifdef HAVE_ALLOC_WITH_USIZE
+    update_zmalloc_stat_alloc(size);
+    if (usable) *usable = size;
+    return ptr;
+#elif HAVE_MALLOC_SIZE
     size = zmalloc_size(ptr);
     update_zmalloc_stat_alloc(size);
     if (usable) *usable = size;
@@ -335,8 +360,17 @@ static inline void *ztryrealloc_usable_internal(void *ptr, size_t size, size_t *
         if (usable) *usable = 0;
         return NULL;
     }

-#ifdef HAVE_MALLOC_SIZE
+#ifdef HAVE_ALLOC_WITH_USIZE
+    newptr = realloc_with_usize(ptr, size, &oldsize, &size);
+    if (newptr == NULL) {
+        if (usable) *usable = 0;
+        return NULL;
+    }
+    update_zmalloc_stat_free(oldsize);
+    update_zmalloc_stat_alloc(size);
+    if (usable) *usable = size;
+    return newptr;
+#elif HAVE_MALLOC_SIZE
     oldsize = zmalloc_size(ptr);
     newptr = realloc(ptr,size);
     if (newptr == NULL) {
@@ -417,17 +451,18 @@ size_t zmalloc_usable_size(void *ptr) {
 #endif

 void zfree(void *ptr) {
-#ifndef HAVE_MALLOC_SIZE
-    void *realptr;
-    size_t oldsize;
-#endif
     if (ptr == NULL) return;

-#ifdef HAVE_MALLOC_SIZE
+#ifdef HAVE_ALLOC_WITH_USIZE
+    size_t oldsize;
+    free_with_usize(ptr, &oldsize);
+    update_zmalloc_stat_free(oldsize);
+#elif HAVE_MALLOC_SIZE
     update_zmalloc_stat_free(zmalloc_size(ptr));
     free(ptr);
 #else
-    realptr = (char*)ptr-PREFIX_SIZE;
+    size_t oldsize;
+    void *realptr = (char*)ptr-PREFIX_SIZE;
     oldsize = *((size_t*)realptr);
     update_zmalloc_stat_free(oldsize+PREFIX_SIZE);
     free(realptr);
@@ -442,7 +477,11 @@ void zfree_usable(void *ptr, size_t *usable) {
 #endif
     if (ptr == NULL) return;

-#ifdef HAVE_MALLOC_SIZE
+#ifdef HAVE_ALLOC_WITH_USIZE
+    free_with_usize(ptr, usable);
+    update_zmalloc_stat_free(*usable);
+#elif HAVE_MALLOC_SIZE
     update_zmalloc_stat_free(*usable = zmalloc_size(ptr));
     free(ptr);
 #else

src/config.h

@@ -80,6 +80,13 @@
 #define HAVE_DEFRAG
 #endif

+/* We can enable allocation with usable size capabilities only if we are using Jemalloc
+ * and the version used is our special version modified for Redis having
+ * the ability to return usable size during allocation or deallocation. */
+#if defined(USE_JEMALLOC) && defined(JEMALLOC_ALLOC_WITH_USIZE)
+#define HAVE_ALLOC_WITH_USIZE
+#endif
+
 /* 'noinline' attribute is intended to prevent the `-Wstringop-overread` warning
  * when using gcc-12 later with LTO enabled. It may be removed once the
  * bug[https://gcc.gnu.org/bugzilla/show_bug.cgi?id=96503] is fixed. */