Make isc_mem_isovermem() probabilistic

Replace the hysteretic hi_water/lo_water switch with a stochastic
check: always false below lo_water, always true at or above hi_water,
linearly ramped probability in between.  This spreads cache cleaning
across many inserts instead of triggering a thundering herd once the
hi_water mark is crossed (which causes every addrdataset to enter the
LRU purge path simultaneously and serializes lookups behind the node
write locks).

The is_overmem atomic and its stores are no longer needed and are
removed.  The existing tests that asserted specific hysteretic state
transitions are simplified to check only the deterministic boundaries.
This commit is contained in:
Ondřej Surý 2026-04-19 21:36:43 +02:00 committed by Michał Kępień
parent 21c8ba4f0b
commit 24ac3392d9
No known key found for this signature in database
3 changed files with 64 additions and 74 deletions

View file

@ -30,6 +30,7 @@
#include <isc/mutex.h>
#include <isc/os.h>
#include <isc/overflow.h>
#include <isc/random.h>
#include <isc/refcount.h>
#include <isc/stdtime.h>
#include <isc/strerr.h>
@ -126,7 +127,6 @@ static isc_mutex_t contextslock;
typedef union {
struct {
atomic_int_fast64_t inuse;
atomic_bool is_overmem;
};
char padding[ISC_OS_CACHELINE_SIZE];
} isc__mem_stat_t;
@ -620,7 +620,6 @@ mem_create(const char *name, isc_mem_t **ctxp, unsigned int debugging,
for (size_t i = 0; i < ARRAY_SIZE(ctx->stat_s); i++) {
atomic_init(&ctx->stat_s[i].inuse, 0);
atomic_init(&ctx->stat_s[i].is_overmem, false);
}
/* Reserve the [-1] index for ISC_TID_UNKNOWN */
@ -1020,50 +1019,30 @@ bool
isc_mem_isovermem(isc_mem_t *ctx) {
REQUIRE(VALID_CONTEXT(ctx));
int32_t tid = isc_tid();
bool is_overmem = atomic_load_relaxed(&ctx->stat[tid].is_overmem);
if (!is_overmem) {
/* We are not overmem, check whether we should be? */
size_t hiwater = atomic_load_relaxed(&ctx->hi_water);
if (hiwater == 0) {
return false;
}
size_t inuse = isc_mem_inuse(ctx);
if (inuse <= hiwater) {
return false;
}
if ((ctx->debugging & ISC_MEM_DEBUGUSAGE) != 0) {
fprintf(stderr,
"overmem %s mctx %p inuse %zu hi_water %zu\n",
ctx->name, ctx, inuse, hiwater);
}
atomic_store_relaxed(&ctx->stat[tid].is_overmem, true);
return true;
} else {
/* We are overmem, check whether we should not be? */
size_t lowater = atomic_load_relaxed(&ctx->lo_water);
if (lowater == 0) {
return false;
}
size_t inuse = isc_mem_inuse(ctx);
if (inuse >= lowater) {
return true;
}
if ((ctx->debugging & ISC_MEM_DEBUGUSAGE) != 0) {
fprintf(stderr,
"overmem %s mctx %p inuse %zu lo_water %zu\n",
ctx->name, ctx, inuse, lowater);
}
atomic_store_relaxed(&ctx->stat[tid].is_overmem, false);
size_t hiwater = atomic_load_relaxed(&ctx->hi_water);
if (hiwater == 0) {
return false;
}
size_t inuse = isc_mem_inuse(ctx);
if (inuse >= hiwater) {
return true;
}
size_t lowater = atomic_load_relaxed(&ctx->lo_water);
if (inuse <= lowater) {
return false;
}
/*
* Between lo_water and hi_water, return true with a probability
* that ramps linearly from 0 at lo_water to 1 at hi_water. This
* spreads cache cleaning across many inserts instead of triggering
* a thundering herd once the hi_water mark is crossed.
*/
uint32_t prob = (uint32_t)(((uint64_t)(inuse - lowater) * 256) /
(hiwater - lowater));
return isc_random8() < prob;
}
const char *

View file

@ -128,7 +128,7 @@ ISC_LOOP_TEST_IMPL(overmempurge_bigrdata) {
dns_db_t *db = NULL;
isc_mem_t *mctx = NULL;
isc_stdtime_t now = isc_stdtime_now();
size_t i;
size_t i = 0;
isc_mem_create("test", &mctx);
@ -140,21 +140,21 @@ ISC_LOOP_TEST_IMPL(overmempurge_bigrdata) {
isc_mem_setwater(mctx, hiwater, lowater);
/*
* Add cache entries with minimum size of data until 'overmem'
* condition is triggered.
* This should eventually happen, but we also limit the number of
* iteration to avoid an infinite loop in case something gets wrong.
* Add a lot of data entries sufficient to push the context
* above the hi_water mark.
*/
for (i = 0; !isc_mem_isovermem(mctx) && i < (maxcache / 10); i++) {
overmempurge_addrdataset(db, now, i, 50053, 0, false);
while (isc_mem_inuse(mctx) < hiwater) {
overmempurge_addrdataset(db, now, i, 50053, 0, true);
i++;
}
assert_true(isc_mem_isovermem(mctx));
assert_true(isc_mem_inuse(mctx) >= hiwater);
assert_true(isc_mem_inuse(mctx) < maxcache);
/*
* Then try to add the same number of entries, each has very large data.
* 'overmem purge' should keep the total cache size from exceeding
* the 'hiwater' mark too much. So we should be able to assume the
* cache size doesn't reach the "max".
* Probabilistic LRU cleaning should keep the total cache size from
* exceeding the 'hiwater' mark too much. So we should be able to
* assume the cache size doesn't reach the "max".
*/
while (i-- > 0) {
overmempurge_addrdataset(db, now, i, 50054,
@ -180,7 +180,7 @@ ISC_LOOP_TEST_IMPL(overmempurge_longname) {
dns_db_t *db = NULL;
isc_mem_t *mctx = NULL;
isc_stdtime_t now = isc_stdtime_now();
size_t i;
size_t i = 0;
isc_mem_create("test", &mctx);
@ -192,21 +192,21 @@ ISC_LOOP_TEST_IMPL(overmempurge_longname) {
isc_mem_setwater(mctx, hiwater, lowater);
/*
* Add cache entries with minimum size of data until 'overmem'
* condition is triggered.
* This should eventually happen, but we also limit the number of
* iteration to avoid an infinite loop in case something gets wrong.
* Add a lot of data entries sufficient to push the context
* above the hi_water mark.
*/
for (i = 0; !isc_mem_isovermem(mctx) && i < (maxcache / 10); i++) {
overmempurge_addrdataset(db, now, i, 50053, 0, false);
while (isc_mem_inuse(mctx) < hiwater) {
overmempurge_addrdataset(db, now, i, 50053, 0, true);
i++;
}
assert_true(isc_mem_isovermem(mctx));
assert_true(isc_mem_inuse(mctx) >= hiwater);
assert_true(isc_mem_inuse(mctx) < maxcache);
/*
* Then try to add the same number of entries, each has very long name.
* 'overmem purge' should keep the total cache size from not exceeding
* the 'hiwater' mark too much. So we should be able to assume the cache
* size doesn't reach the "max".
* Probabilistic LRU cleaning should keep the total cache size from
* exceeding the 'hiwater' mark too much. So we should be able to
* assume the cache size doesn't reach the "max".
*/
while (i-- > 0) {
overmempurge_addrdataset(db, now, i, 50054, 0, true);

View file

@ -291,6 +291,17 @@ ISC_RUN_TEST_IMPL(isc_mem_reallocate) {
isc_mem_free(isc_g_mctx, data);
}
static bool
at_least_one_overmem(isc_mem_t *mctx) {
for (size_t i = 0; i < UINT16_MAX; i++) {
/* The overmem is probability based in this range */
if (isc_mem_isovermem(mctx)) {
return true;
}
}
return false;
}
ISC_RUN_TEST_IMPL(isc_mem_overmem) {
isc_mem_t *mctx = NULL;
isc_mem_create("test", &mctx);
@ -298,27 +309,27 @@ ISC_RUN_TEST_IMPL(isc_mem_overmem) {
isc_mem_setwater(mctx, 1024, 512);
/* inuse < lo_water */
/* inuse <= lo_water is always false */
void *data1 = isc_mem_allocate(mctx, 256);
assert_false(isc_mem_isovermem(mctx));
/* lo_water < inuse < hi_water */
/* lo_water < inuse < hi_water might be true or false */
void *data2 = isc_mem_allocate(mctx, 512);
assert_false(isc_mem_isovermem(mctx));
assert_true(at_least_one_overmem(mctx));
/* hi_water < inuse */
/* hi_water <= inuse is always true */
void *data3 = isc_mem_allocate(mctx, 512);
assert_true(isc_mem_isovermem(mctx));
/* lo_water < inuse < hi_water */
/* lo_water < inuse < hi_water might be true or false */
isc_mem_free(mctx, data2);
assert_true(isc_mem_isovermem(mctx));
assert_true(at_least_one_overmem(mctx));
/* inuse < lo_water */
/* inuse <= lo_water is always false */
isc_mem_free(mctx, data3);
assert_false(isc_mem_isovermem(mctx));
/* inuse == 0 */
/* inuse == 0 is always false */
isc_mem_free(mctx, data1);
assert_false(isc_mem_isovermem(mctx));