Dismiss dict bucket arrays in fork child to reduce CoW (#14979)

During RDB saving and AOF rewriting, the fork child already dismisses
(madvise(MADV_DONTNEED)) individual key-value objects after serializing them.
However, the hash table bucket arrays of each dict were never dismissed,
leaving large contiguous allocations subject to CoW when the parent
modifies them.

This PR extends the dismiss mechanism to cover dict bucket arrays,
reducing CoW memory overhead.

- **Expires kvstore** — dismissed upfront before saving starts, since the
child never accesses expires directly, after embeding expire time in the key object.
- **Slot dicts** (cluster mode) — dismissed per-slot as the iterator moves
   to the next slot during RDB saving or AOF rewriting.
- **DB keys kvstore** (standalone mode) — dismissed per-DB after each DB is
   fully serialized during RDB saving or AOF rewriting.
This commit is contained in:
Yuan Wang 2026-04-15 20:34:36 +08:00 committed by GitHub
parent 670993a89d
commit 2f1a8b2bad
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 110 additions and 22 deletions

View file

@ -2570,10 +2570,20 @@ int rewriteAppendOnlyFileRio(rio *aof) {
if (rioWriteBulkLongLong(aof,j) == 0) goto werr;
kvstoreIteratorInit(&kvs_it, db->keys);
int last_slot = -1;
/* Iterate this DB writing every entry */
while((de = kvstoreIteratorNext(&kvs_it)) != NULL) {
long long expiretime;
size_t aof_bytes_before_key = aof->processed_bytes;
int curr_slot = kvstoreIteratorGetCurrentDictIndex(&kvs_it);
/* In cluster mode, dismiss bucket arrays of the previous slot
* which won't be accessed again, to avoid CoW. */
if (server.cluster_enabled && curr_slot != last_slot) {
if (server.in_fork_child && last_slot != -1)
dismissDictBucketsMemory(kvstoreGetDict(db->keys, last_slot));
last_slot = curr_slot;
}
/* Get the value object (of type kvobj) */
kvobj *o = dictGetKV(de);
@ -2582,12 +2592,9 @@ int rewriteAppendOnlyFileRio(rio *aof) {
expiretime = kvobjGetExpire(o);
/* Skip keys that are being trimmed */
if (server.cluster_enabled) {
int curr_slot = kvstoreIteratorGetCurrentDictIndex(&kvs_it);
if (isSlotInTrimJob(curr_slot)) {
skipped++;
continue;
}
if (server.cluster_enabled && isSlotInTrimJob(curr_slot)) {
skipped++;
continue;
}
/* Set on stack string object for key */
@ -2600,7 +2607,8 @@ int rewriteAppendOnlyFileRio(rio *aof) {
* OS and possibly avoid or decrease COW. We give the dismiss
* mechanism a hint about an estimated size of the object we stored. */
size_t dump_size = aof->processed_bytes - aof_bytes_before_key;
if (server.in_fork_child) dismissObject(o, dump_size);
if (server.in_fork_child && dump_size > server.page_size/2)
dismissObject(o, dump_size);
/* Update info every 1 second (approximately).
* in order to avoid calling mstime() on each iteration, we will
@ -2618,6 +2626,10 @@ int rewriteAppendOnlyFileRio(rio *aof) {
debugDelay(server.rdb_key_save_delay);
}
kvstoreIteratorReset(&kvs_it);
/* Dismiss bucket arrays of kvstore in standalone mode. */
if (server.in_fork_child && !server.cluster_enabled)
dismissKvstoreBucketsMemory(db->keys);
}
serverLog(LL_NOTICE, "AOF rewrite done, %ld keys saved, %llu keys skipped.", key_count, skipped);
return C_OK;

View file

@ -691,8 +691,7 @@ void dismissSetObject(robj *o, size_t size_hint) {
}
/* Dismiss hash table memory. */
dismissMemory(set->ht_table[0], DICTHT_SIZE(set->ht_size_exp[0])*sizeof(dictEntry*));
dismissMemory(set->ht_table[1], DICTHT_SIZE(set->ht_size_exp[1])*sizeof(dictEntry*));
dismissDictBucketsMemory(set);
} else if (o->encoding == OBJ_ENCODING_INTSET) {
dismissMemory(o->ptr, intsetBlobLen((intset*)o->ptr));
} else if (o->encoding == OBJ_ENCODING_LISTPACK) {
@ -720,9 +719,7 @@ void dismissZsetObject(robj *o, size_t size_hint) {
}
/* Dismiss hash table memory. */
dict *d = zs->dict;
dismissMemory(d->ht_table[0], DICTHT_SIZE(d->ht_size_exp[0])*sizeof(dictEntry*));
dismissMemory(d->ht_table[1], DICTHT_SIZE(d->ht_size_exp[1])*sizeof(dictEntry*));
dismissDictBucketsMemory(zs->dict);
} else if (o->encoding == OBJ_ENCODING_LISTPACK) {
dismissMemory(o->ptr, lpBytes((unsigned char*)o->ptr));
} else {
@ -748,8 +745,7 @@ void dismissHashObject(robj *o, size_t size_hint) {
}
/* Dismiss hash table memory. */
dismissMemory(d->ht_table[0], DICTHT_SIZE(d->ht_size_exp[0])*sizeof(dictEntry*));
dismissMemory(d->ht_table[1], DICTHT_SIZE(d->ht_size_exp[1])*sizeof(dictEntry*));
dismissDictBucketsMemory(d);
} else if (o->encoding == OBJ_ENCODING_LISTPACK) {
dismissMemory(o->ptr, lpBytes((unsigned char*)o->ptr));
} else if (o->encoding == OBJ_ENCODING_LISTPACK_EX) {

View file

@ -1680,6 +1680,10 @@ ssize_t rdbSaveDb(rio *rdb, int dbid, int rdbflags, long *key_counter, unsigned
written += res;
if ((res = rdbSaveLen(rdb, kvstoreDictSize(db->expires, curr_slot))) < 0) goto werr2;
written += res;
/* Dismiss bucket arrays of the previous slot to reduce CoW.
* The final slot is not dismissed since the child exits shortly after. */
if (server.in_fork_child && last_slot != -1)
dismissDictBucketsMemory(kvstoreGetDict(db->keys, last_slot));
last_slot = curr_slot;
}
kvobj *kv = dictGetKV(de);
@ -1707,7 +1711,8 @@ ssize_t rdbSaveDb(rio *rdb, int dbid, int rdbflags, long *key_counter, unsigned
* OS and possibly avoid or decrease COW. We give the dismiss
* mechanism a hint about an estimated size of the object we stored. */
size_t dump_size = rdb->processed_bytes - rdb_bytes_before_key;
if (server.in_fork_child) dismissObject(kv, dump_size);
if (server.in_fork_child && dump_size > server.page_size/2)
dismissObject(kv, dump_size);
/* Update child info every 1 second (approximately).
* in order to avoid calling mstime() on each iteration, we will
@ -1758,6 +1763,10 @@ int rdbSaveRio(int req, rio *rdb, int *error, int rdbflags, rdbSaveInfo *rsi) {
if (!(req & SLAVE_REQ_RDB_EXCLUDE_DATA)) {
for (j = 0; j < server.dbnum; j++) {
if (rdbSaveDb(rdb, j, rdbflags, &key_counter, &skipped) == -1) goto werr;
/* In standalone mode, dismiss bucket arrays of the saved DB's
* kvstore to reduce CoW. In cluster mode this is done per-slot. */
if (server.in_fork_child && !server.cluster_enabled)
dismissKvstoreBucketsMemory(server.db[j].keys);
}
}

View file

@ -7497,6 +7497,20 @@ void dismissClientMemory(client *c) {
}
}
/* Dismiss the hash table bucket arrays of a dict. */
void dismissDictBucketsMemory(dict *d) {
if (!d) return;
dismissMemory(d->ht_table[0], DICTHT_SIZE(d->ht_size_exp[0]) * sizeof(dictEntry*));
dismissMemory(d->ht_table[1], DICTHT_SIZE(d->ht_size_exp[1]) * sizeof(dictEntry*));
}
/* Dismiss the hash table bucket arrays for all dicts in the given kvstore. */
void dismissKvstoreBucketsMemory(kvstore *kvs) {
for (int didx = 0; didx < kvstoreNumDicts(kvs); didx++) {
dismissDictBucketsMemory(kvstoreGetDict(kvs, didx));
}
}
/* In the child process, we don't need some buffers anymore, and these are
* likely to change in the parent when there's heavy write traffic.
* We dismiss them right away, to avoid CoW.
@ -7535,6 +7549,14 @@ void dismissMemoryInChild(void) {
client *c = listNodeValue(ln);
dismissClientMemory(c);
}
/* Dismiss expires kvstore bucket arrays since the child process never
* accesses them, expire times are embedded in key objects. */
if (server.in_fork_child == CHILD_TYPE_RDB || server.in_fork_child == CHILD_TYPE_AOF) {
for (int dbid = 0; dbid < server.dbnum; dbid++) {
dismissKvstoreBucketsMemory(server.db[dbid].expires);
}
}
#endif
}

View file

@ -3682,6 +3682,8 @@ void activeDefragFreeRaw(void *ptr);
robj *activeDefragStringOb(robj* ob);
void dismissSds(sds s);
void dismissMemory(void* ptr, size_t size_hint);
void dismissDictBucketsMemory(dict *d);
void dismissKvstoreBucketsMemory(kvstore *kvs);
void dismissMemoryInChild(void);
int clientsCronRunClient(client *c);

View file

@ -4,7 +4,7 @@
# Actually, we may not have many asserts in the test, since we just check for
# crashes and the dump file inconsistencies.
start_server {tags {"dismiss external:skip"}} {
start_server {tags {"dismiss external:skip needs:debug"}} {
# In other tests, although we test child process dumping RDB file, but
# memory allocations of key/values are usually small, they couldn't cover
# the "dismiss" object methods, in this test, we create big size key/values
@ -47,12 +47,15 @@ start_server {tags {"dismiss external:skip"}} {
r xadd bigstream * entry1 $bigstr entry2 $bigstr
set digest [debug_digest]
r config set aof-use-rdb-preamble no
r bgrewriteaof
waitForBgrewriteaof r
r debug loadaof
set newdigest [debug_digest]
assert {$digest eq $newdigest}
# Test both RDB (yes) and AOF (no) rewrite paths.
foreach preamble {yes no} {
r config set aof-use-rdb-preamble $preamble
r bgrewriteaof
waitForBgrewriteaof r
r debug loadaof
set newdigest [debug_digest]
assert {$digest eq $newdigest}
}
}
test {dismiss client output buffer} {
@ -99,4 +102,48 @@ start_server {tags {"dismiss external:skip"}} {
waitForBgsave $master
}
}
test {dismiss multi-db kvstore bucket memory in standalone mode} {
r flushall
regexp {db=(\d+)} [r client info] -> curdb
# Populate multiple DBs to verify each DB's bucket arrays can be dismissed.
foreach db {0 1 2 3} {
r select $db
populate 2000 "db${db}key:" 3 0 false 3600
}
set digest [debug_digest]
# Test both RDB (yes) and AOF (no) rewrite paths.
foreach preamble {yes no} {
r config set aof-use-rdb-preamble $preamble
r bgrewriteaof
waitForBgrewriteaof r
r debug loadaof
set newdigest [debug_digest]
assert {$digest eq $newdigest}
}
r select $curdb
}
}
start_cluster 1 0 {tags {dismiss external:skip cluster needs:debug}} {
test {dismiss slot dict bucket memory in cluster mode} {
# Concentrate keys into a few slots using hash tags so each slot's
# bucket array is large enough to be dismissed.
# {06S} -> slot 0, {Qi} -> slot 1, {5L5} -> slot 2
foreach tag {{06S} {Qi} {5L5}} {
populate 2000 "${tag}key:" 3 0 false 3600
}
set digest [r debug digest]
# Test both RDB (yes) and AOF (no) rewrite paths.
foreach preamble {yes no} {
r config set aof-use-rdb-preamble $preamble
r bgrewriteaof
waitForBgrewriteaof r
r debug loadaof
set newdigest [r debug digest]
assert {$digest eq $newdigest}
}
}
}