From 2f1a8b2bad47007a4df73e8f643405ab62a9620b Mon Sep 17 00:00:00 2001 From: Yuan Wang Date: Wed, 15 Apr 2026 20:34:36 +0800 Subject: [PATCH] Dismiss dict bucket arrays in fork child to reduce CoW (#14979) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit During RDB saving and AOF rewriting, the fork child already dismisses (madvise(MADV_DONTNEED)) individual key-value objects after serializing them. However, the hash table bucket arrays of each dict were never dismissed, leaving large contiguous allocations subject to CoW when the parent modifies them. This PR extends the dismiss mechanism to cover dict bucket arrays, reducing CoW memory overhead. - **Expires kvstore** — dismissed upfront before saving starts, since the child never accesses expires directly, after embeding expire time in the key object. - **Slot dicts** (cluster mode) — dismissed per-slot as the iterator moves to the next slot during RDB saving or AOF rewriting. - **DB keys kvstore** (standalone mode) — dismissed per-DB after each DB is fully serialized during RDB saving or AOF rewriting. --- src/aof.c | 26 +++++++++---- src/object.c | 10 ++--- src/rdb.c | 11 +++++- src/server.c | 22 +++++++++++ src/server.h | 2 + tests/integration/dismiss-mem.tcl | 61 +++++++++++++++++++++++++++---- 6 files changed, 110 insertions(+), 22 deletions(-) diff --git a/src/aof.c b/src/aof.c index a094d11ca9..fe8336061c 100644 --- a/src/aof.c +++ b/src/aof.c @@ -2570,10 +2570,20 @@ int rewriteAppendOnlyFileRio(rio *aof) { if (rioWriteBulkLongLong(aof,j) == 0) goto werr; kvstoreIteratorInit(&kvs_it, db->keys); + int last_slot = -1; /* Iterate this DB writing every entry */ while((de = kvstoreIteratorNext(&kvs_it)) != NULL) { long long expiretime; size_t aof_bytes_before_key = aof->processed_bytes; + int curr_slot = kvstoreIteratorGetCurrentDictIndex(&kvs_it); + + /* In cluster mode, dismiss bucket arrays of the previous slot + * which won't be accessed again, to avoid CoW. */ + if (server.cluster_enabled && curr_slot != last_slot) { + if (server.in_fork_child && last_slot != -1) + dismissDictBucketsMemory(kvstoreGetDict(db->keys, last_slot)); + last_slot = curr_slot; + } /* Get the value object (of type kvobj) */ kvobj *o = dictGetKV(de); @@ -2582,12 +2592,9 @@ int rewriteAppendOnlyFileRio(rio *aof) { expiretime = kvobjGetExpire(o); /* Skip keys that are being trimmed */ - if (server.cluster_enabled) { - int curr_slot = kvstoreIteratorGetCurrentDictIndex(&kvs_it); - if (isSlotInTrimJob(curr_slot)) { - skipped++; - continue; - } + if (server.cluster_enabled && isSlotInTrimJob(curr_slot)) { + skipped++; + continue; } /* Set on stack string object for key */ @@ -2600,7 +2607,8 @@ int rewriteAppendOnlyFileRio(rio *aof) { * OS and possibly avoid or decrease COW. We give the dismiss * mechanism a hint about an estimated size of the object we stored. */ size_t dump_size = aof->processed_bytes - aof_bytes_before_key; - if (server.in_fork_child) dismissObject(o, dump_size); + if (server.in_fork_child && dump_size > server.page_size/2) + dismissObject(o, dump_size); /* Update info every 1 second (approximately). * in order to avoid calling mstime() on each iteration, we will @@ -2618,6 +2626,10 @@ int rewriteAppendOnlyFileRio(rio *aof) { debugDelay(server.rdb_key_save_delay); } kvstoreIteratorReset(&kvs_it); + + /* Dismiss bucket arrays of kvstore in standalone mode. */ + if (server.in_fork_child && !server.cluster_enabled) + dismissKvstoreBucketsMemory(db->keys); } serverLog(LL_NOTICE, "AOF rewrite done, %ld keys saved, %llu keys skipped.", key_count, skipped); return C_OK; diff --git a/src/object.c b/src/object.c index 9a9e6c2573..1fa9226792 100644 --- a/src/object.c +++ b/src/object.c @@ -691,8 +691,7 @@ void dismissSetObject(robj *o, size_t size_hint) { } /* Dismiss hash table memory. */ - dismissMemory(set->ht_table[0], DICTHT_SIZE(set->ht_size_exp[0])*sizeof(dictEntry*)); - dismissMemory(set->ht_table[1], DICTHT_SIZE(set->ht_size_exp[1])*sizeof(dictEntry*)); + dismissDictBucketsMemory(set); } else if (o->encoding == OBJ_ENCODING_INTSET) { dismissMemory(o->ptr, intsetBlobLen((intset*)o->ptr)); } else if (o->encoding == OBJ_ENCODING_LISTPACK) { @@ -720,9 +719,7 @@ void dismissZsetObject(robj *o, size_t size_hint) { } /* Dismiss hash table memory. */ - dict *d = zs->dict; - dismissMemory(d->ht_table[0], DICTHT_SIZE(d->ht_size_exp[0])*sizeof(dictEntry*)); - dismissMemory(d->ht_table[1], DICTHT_SIZE(d->ht_size_exp[1])*sizeof(dictEntry*)); + dismissDictBucketsMemory(zs->dict); } else if (o->encoding == OBJ_ENCODING_LISTPACK) { dismissMemory(o->ptr, lpBytes((unsigned char*)o->ptr)); } else { @@ -748,8 +745,7 @@ void dismissHashObject(robj *o, size_t size_hint) { } /* Dismiss hash table memory. */ - dismissMemory(d->ht_table[0], DICTHT_SIZE(d->ht_size_exp[0])*sizeof(dictEntry*)); - dismissMemory(d->ht_table[1], DICTHT_SIZE(d->ht_size_exp[1])*sizeof(dictEntry*)); + dismissDictBucketsMemory(d); } else if (o->encoding == OBJ_ENCODING_LISTPACK) { dismissMemory(o->ptr, lpBytes((unsigned char*)o->ptr)); } else if (o->encoding == OBJ_ENCODING_LISTPACK_EX) { diff --git a/src/rdb.c b/src/rdb.c index 61ca7f7cf9..52dd686d2d 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -1680,6 +1680,10 @@ ssize_t rdbSaveDb(rio *rdb, int dbid, int rdbflags, long *key_counter, unsigned written += res; if ((res = rdbSaveLen(rdb, kvstoreDictSize(db->expires, curr_slot))) < 0) goto werr2; written += res; + /* Dismiss bucket arrays of the previous slot to reduce CoW. + * The final slot is not dismissed since the child exits shortly after. */ + if (server.in_fork_child && last_slot != -1) + dismissDictBucketsMemory(kvstoreGetDict(db->keys, last_slot)); last_slot = curr_slot; } kvobj *kv = dictGetKV(de); @@ -1707,7 +1711,8 @@ ssize_t rdbSaveDb(rio *rdb, int dbid, int rdbflags, long *key_counter, unsigned * OS and possibly avoid or decrease COW. We give the dismiss * mechanism a hint about an estimated size of the object we stored. */ size_t dump_size = rdb->processed_bytes - rdb_bytes_before_key; - if (server.in_fork_child) dismissObject(kv, dump_size); + if (server.in_fork_child && dump_size > server.page_size/2) + dismissObject(kv, dump_size); /* Update child info every 1 second (approximately). * in order to avoid calling mstime() on each iteration, we will @@ -1758,6 +1763,10 @@ int rdbSaveRio(int req, rio *rdb, int *error, int rdbflags, rdbSaveInfo *rsi) { if (!(req & SLAVE_REQ_RDB_EXCLUDE_DATA)) { for (j = 0; j < server.dbnum; j++) { if (rdbSaveDb(rdb, j, rdbflags, &key_counter, &skipped) == -1) goto werr; + /* In standalone mode, dismiss bucket arrays of the saved DB's + * kvstore to reduce CoW. In cluster mode this is done per-slot. */ + if (server.in_fork_child && !server.cluster_enabled) + dismissKvstoreBucketsMemory(server.db[j].keys); } } diff --git a/src/server.c b/src/server.c index ef7f79f6e1..a284f57840 100644 --- a/src/server.c +++ b/src/server.c @@ -7497,6 +7497,20 @@ void dismissClientMemory(client *c) { } } +/* Dismiss the hash table bucket arrays of a dict. */ +void dismissDictBucketsMemory(dict *d) { + if (!d) return; + dismissMemory(d->ht_table[0], DICTHT_SIZE(d->ht_size_exp[0]) * sizeof(dictEntry*)); + dismissMemory(d->ht_table[1], DICTHT_SIZE(d->ht_size_exp[1]) * sizeof(dictEntry*)); +} + +/* Dismiss the hash table bucket arrays for all dicts in the given kvstore. */ +void dismissKvstoreBucketsMemory(kvstore *kvs) { + for (int didx = 0; didx < kvstoreNumDicts(kvs); didx++) { + dismissDictBucketsMemory(kvstoreGetDict(kvs, didx)); + } +} + /* In the child process, we don't need some buffers anymore, and these are * likely to change in the parent when there's heavy write traffic. * We dismiss them right away, to avoid CoW. @@ -7535,6 +7549,14 @@ void dismissMemoryInChild(void) { client *c = listNodeValue(ln); dismissClientMemory(c); } + + /* Dismiss expires kvstore bucket arrays since the child process never + * accesses them, expire times are embedded in key objects. */ + if (server.in_fork_child == CHILD_TYPE_RDB || server.in_fork_child == CHILD_TYPE_AOF) { + for (int dbid = 0; dbid < server.dbnum; dbid++) { + dismissKvstoreBucketsMemory(server.db[dbid].expires); + } + } #endif } diff --git a/src/server.h b/src/server.h index 506191327e..8a0ba3cbdd 100644 --- a/src/server.h +++ b/src/server.h @@ -3682,6 +3682,8 @@ void activeDefragFreeRaw(void *ptr); robj *activeDefragStringOb(robj* ob); void dismissSds(sds s); void dismissMemory(void* ptr, size_t size_hint); +void dismissDictBucketsMemory(dict *d); +void dismissKvstoreBucketsMemory(kvstore *kvs); void dismissMemoryInChild(void); int clientsCronRunClient(client *c); diff --git a/tests/integration/dismiss-mem.tcl b/tests/integration/dismiss-mem.tcl index 6e790665ae..2b0fbb3e4b 100644 --- a/tests/integration/dismiss-mem.tcl +++ b/tests/integration/dismiss-mem.tcl @@ -4,7 +4,7 @@ # Actually, we may not have many asserts in the test, since we just check for # crashes and the dump file inconsistencies. -start_server {tags {"dismiss external:skip"}} { +start_server {tags {"dismiss external:skip needs:debug"}} { # In other tests, although we test child process dumping RDB file, but # memory allocations of key/values are usually small, they couldn't cover # the "dismiss" object methods, in this test, we create big size key/values @@ -47,12 +47,15 @@ start_server {tags {"dismiss external:skip"}} { r xadd bigstream * entry1 $bigstr entry2 $bigstr set digest [debug_digest] - r config set aof-use-rdb-preamble no - r bgrewriteaof - waitForBgrewriteaof r - r debug loadaof - set newdigest [debug_digest] - assert {$digest eq $newdigest} + # Test both RDB (yes) and AOF (no) rewrite paths. + foreach preamble {yes no} { + r config set aof-use-rdb-preamble $preamble + r bgrewriteaof + waitForBgrewriteaof r + r debug loadaof + set newdigest [debug_digest] + assert {$digest eq $newdigest} + } } test {dismiss client output buffer} { @@ -99,4 +102,48 @@ start_server {tags {"dismiss external:skip"}} { waitForBgsave $master } } + + test {dismiss multi-db kvstore bucket memory in standalone mode} { + r flushall + regexp {db=(\d+)} [r client info] -> curdb + # Populate multiple DBs to verify each DB's bucket arrays can be dismissed. + foreach db {0 1 2 3} { + r select $db + populate 2000 "db${db}key:" 3 0 false 3600 + } + set digest [debug_digest] + + # Test both RDB (yes) and AOF (no) rewrite paths. + foreach preamble {yes no} { + r config set aof-use-rdb-preamble $preamble + r bgrewriteaof + waitForBgrewriteaof r + r debug loadaof + set newdigest [debug_digest] + assert {$digest eq $newdigest} + } + r select $curdb + } +} + +start_cluster 1 0 {tags {dismiss external:skip cluster needs:debug}} { + test {dismiss slot dict bucket memory in cluster mode} { + # Concentrate keys into a few slots using hash tags so each slot's + # bucket array is large enough to be dismissed. + # {06S} -> slot 0, {Qi} -> slot 1, {5L5} -> slot 2 + foreach tag {{06S} {Qi} {5L5}} { + populate 2000 "${tag}key:" 3 0 false 3600 + } + set digest [r debug digest] + + # Test both RDB (yes) and AOF (no) rewrite paths. + foreach preamble {yes no} { + r config set aof-use-rdb-preamble $preamble + r bgrewriteaof + waitForBgrewriteaof r + r debug loadaof + set newdigest [r debug digest] + assert {$digest eq $newdigest} + } + } }