chg: usr: Fix CPU spikes and slow queries when cache approaches memory limit

Spread cache cleanup probabilistically to avoid CPU usage
spikes and a drop in query throughput.

Closes isc-projects/bind9#5891

Merge branch '5891-improve-overmem-cleaning' into 'security-main'

See merge request isc-private/bind9!1002
This commit is contained in:
Ondřej Surý 2026-05-05 13:10:30 +02:00 committed by Michał Kępień
commit 2e7e9f51db
No known key found for this signature in database
3 changed files with 64 additions and 74 deletions

View file

@ -30,6 +30,7 @@
#include <isc/mutex.h>
#include <isc/os.h>
#include <isc/overflow.h>
#include <isc/random.h>
#include <isc/refcount.h>
#include <isc/stdtime.h>
#include <isc/strerr.h>
@ -126,7 +127,6 @@ static isc_mutex_t contextslock;
typedef union {
struct {
atomic_int_fast64_t inuse;
atomic_bool is_overmem;
};
char padding[ISC_OS_CACHELINE_SIZE];
} isc__mem_stat_t;
@ -620,7 +620,6 @@ mem_create(const char *name, isc_mem_t **ctxp, unsigned int debugging,
for (size_t i = 0; i < ARRAY_SIZE(ctx->stat_s); i++) {
atomic_init(&ctx->stat_s[i].inuse, 0);
atomic_init(&ctx->stat_s[i].is_overmem, false);
}
/* Reserve the [-1] index for ISC_TID_UNKNOWN */
@ -1020,50 +1019,30 @@ bool
isc_mem_isovermem(isc_mem_t *ctx) {
REQUIRE(VALID_CONTEXT(ctx));
int32_t tid = isc_tid();
bool is_overmem = atomic_load_relaxed(&ctx->stat[tid].is_overmem);
if (!is_overmem) {
/* We are not overmem, check whether we should be? */
size_t hiwater = atomic_load_relaxed(&ctx->hi_water);
if (hiwater == 0) {
return false;
}
size_t inuse = isc_mem_inuse(ctx);
if (inuse <= hiwater) {
return false;
}
if ((ctx->debugging & ISC_MEM_DEBUGUSAGE) != 0) {
fprintf(stderr,
"overmem %s mctx %p inuse %zu hi_water %zu\n",
ctx->name, ctx, inuse, hiwater);
}
atomic_store_relaxed(&ctx->stat[tid].is_overmem, true);
return true;
} else {
/* We are overmem, check whether we should not be? */
size_t lowater = atomic_load_relaxed(&ctx->lo_water);
if (lowater == 0) {
return false;
}
size_t inuse = isc_mem_inuse(ctx);
if (inuse >= lowater) {
return true;
}
if ((ctx->debugging & ISC_MEM_DEBUGUSAGE) != 0) {
fprintf(stderr,
"overmem %s mctx %p inuse %zu lo_water %zu\n",
ctx->name, ctx, inuse, lowater);
}
atomic_store_relaxed(&ctx->stat[tid].is_overmem, false);
size_t hiwater = atomic_load_relaxed(&ctx->hi_water);
if (hiwater == 0) {
return false;
}
size_t inuse = isc_mem_inuse(ctx);
if (inuse >= hiwater) {
return true;
}
size_t lowater = atomic_load_relaxed(&ctx->lo_water);
if (inuse <= lowater) {
return false;
}
/*
* Between lo_water and hi_water, return true with a probability
* that ramps linearly from 0 at lo_water to 1 at hi_water. This
* spreads cache cleaning across many inserts instead of triggering
* a thundering herd once the hi_water mark is crossed.
*/
uint32_t prob = (uint32_t)(((uint64_t)(inuse - lowater) * 256) /
(hiwater - lowater));
return isc_random8() < prob;
}
const char *

View file

@ -128,7 +128,7 @@ ISC_LOOP_TEST_IMPL(overmempurge_bigrdata) {
dns_db_t *db = NULL;
isc_mem_t *mctx = NULL;
isc_stdtime_t now = isc_stdtime_now();
size_t i;
size_t i = 0;
isc_mem_create("test", &mctx);
@ -140,21 +140,21 @@ ISC_LOOP_TEST_IMPL(overmempurge_bigrdata) {
isc_mem_setwater(mctx, hiwater, lowater);
/*
* Add cache entries with minimum size of data until 'overmem'
* condition is triggered.
* This should eventually happen, but we also limit the number of
* iteration to avoid an infinite loop in case something gets wrong.
* Add a lot of data entries sufficient to push the context
* above the hi_water mark.
*/
for (i = 0; !isc_mem_isovermem(mctx) && i < (maxcache / 10); i++) {
overmempurge_addrdataset(db, now, i, 50053, 0, false);
while (isc_mem_inuse(mctx) < hiwater) {
overmempurge_addrdataset(db, now, i, 50053, 0, true);
i++;
}
assert_true(isc_mem_isovermem(mctx));
assert_true(isc_mem_inuse(mctx) >= hiwater);
assert_true(isc_mem_inuse(mctx) < maxcache);
/*
* Then try to add the same number of entries, each has very large data.
* 'overmem purge' should keep the total cache size from exceeding
* the 'hiwater' mark too much. So we should be able to assume the
* cache size doesn't reach the "max".
* Probabilistic LRU cleaning should keep the total cache size from
* exceeding the 'hiwater' mark too much. So we should be able to
* assume the cache size doesn't reach the "max".
*/
while (i-- > 0) {
overmempurge_addrdataset(db, now, i, 50054,
@ -180,7 +180,7 @@ ISC_LOOP_TEST_IMPL(overmempurge_longname) {
dns_db_t *db = NULL;
isc_mem_t *mctx = NULL;
isc_stdtime_t now = isc_stdtime_now();
size_t i;
size_t i = 0;
isc_mem_create("test", &mctx);
@ -192,21 +192,21 @@ ISC_LOOP_TEST_IMPL(overmempurge_longname) {
isc_mem_setwater(mctx, hiwater, lowater);
/*
* Add cache entries with minimum size of data until 'overmem'
* condition is triggered.
* This should eventually happen, but we also limit the number of
* iteration to avoid an infinite loop in case something gets wrong.
* Add a lot of data entries sufficient to push the context
* above the hi_water mark.
*/
for (i = 0; !isc_mem_isovermem(mctx) && i < (maxcache / 10); i++) {
overmempurge_addrdataset(db, now, i, 50053, 0, false);
while (isc_mem_inuse(mctx) < hiwater) {
overmempurge_addrdataset(db, now, i, 50053, 0, true);
i++;
}
assert_true(isc_mem_isovermem(mctx));
assert_true(isc_mem_inuse(mctx) >= hiwater);
assert_true(isc_mem_inuse(mctx) < maxcache);
/*
* Then try to add the same number of entries, each has very long name.
* 'overmem purge' should keep the total cache size from not exceeding
* the 'hiwater' mark too much. So we should be able to assume the cache
* size doesn't reach the "max".
* Probabilistic LRU cleaning should keep the total cache size from
* exceeding the 'hiwater' mark too much. So we should be able to
* assume the cache size doesn't reach the "max".
*/
while (i-- > 0) {
overmempurge_addrdataset(db, now, i, 50054, 0, true);

View file

@ -291,6 +291,17 @@ ISC_RUN_TEST_IMPL(isc_mem_reallocate) {
isc_mem_free(isc_g_mctx, data);
}
static bool
at_least_one_overmem(isc_mem_t *mctx) {
for (size_t i = 0; i < UINT16_MAX; i++) {
/* The overmem is probability based in this range */
if (isc_mem_isovermem(mctx)) {
return true;
}
}
return false;
}
ISC_RUN_TEST_IMPL(isc_mem_overmem) {
isc_mem_t *mctx = NULL;
isc_mem_create("test", &mctx);
@ -298,27 +309,27 @@ ISC_RUN_TEST_IMPL(isc_mem_overmem) {
isc_mem_setwater(mctx, 1024, 512);
/* inuse < lo_water */
/* inuse <= lo_water is always false */
void *data1 = isc_mem_allocate(mctx, 256);
assert_false(isc_mem_isovermem(mctx));
/* lo_water < inuse < hi_water */
/* lo_water < inuse < hi_water might be true or false */
void *data2 = isc_mem_allocate(mctx, 512);
assert_false(isc_mem_isovermem(mctx));
assert_true(at_least_one_overmem(mctx));
/* hi_water < inuse */
/* hi_water <= inuse is always true */
void *data3 = isc_mem_allocate(mctx, 512);
assert_true(isc_mem_isovermem(mctx));
/* lo_water < inuse < hi_water */
/* lo_water < inuse < hi_water might be true or false */
isc_mem_free(mctx, data2);
assert_true(isc_mem_isovermem(mctx));
assert_true(at_least_one_overmem(mctx));
/* inuse < lo_water */
/* inuse <= lo_water is always false */
isc_mem_free(mctx, data3);
assert_false(isc_mem_isovermem(mctx));
/* inuse == 0 */
/* inuse == 0 is always false */
isc_mem_free(mctx, data1);
assert_false(isc_mem_isovermem(mctx));