Prefetch client fields before prefetching command-related data (#14700)

This PR refines the prefetch strategy by removing ineffective dictionary-level prefetching (issued too close to its use in the pipeline) and improving prefetch usage in IO threads. The goal is to better align prefetches with predictable access patterns.

## Changes

- Removed speculative prefetching from `dictFindLinkInternal()`, simplifying the dictionary lookup hot path.
- Introduced a two-phase prefetch approach in `prefetchIOThreadCommands()`:
  - Phase 1: Prefetch client structures and `pending_cmds`
  - Phase 2: Add commands to the batch and prefetch follow-up fields (`reply`, `mem_usage_bucket`)

## Performance

Measured with `memtier_benchmark-1Mkeys-string-setget2000c-1KiB-pipeline-16`.

| Environment                  | % change |
|------------------------------|----------|
| oss-standalone               | -0.1%    |
| oss-standalone-02-io-threads | +0.4%    |
| oss-standalone-04-io-threads | +1.6%    |
| oss-standalone-08-io-threads | +2.3%    |
| oss-standalone-12-io-threads | +0.7%    |
| oss-standalone-16-io-threads | +1.9%    |

Overall, this shows a ~2% throughput improvement on IO-threaded configurations, with no meaningful impact on non-IO-threaded setups.

---------

Co-authored-by: Yuan Wang <wangyuancode@163.com>
2026-05-28 04:02:46 -04:00 · 2026-01-18 12:14:39 +00:00 · 2026-01-18 12:14:39 +00:00 · 7f541b9607
commit 7f541b9607
parent c93e4a62c6
5 changed files with 60 additions and 14 deletions
--- a/src/config.c
+++ b/src/config.c
@ -3186,7 +3186,7 @@ standardConfig static_configs[] = {
    createIntConfig("databases", NULL, IMMUTABLE_CONFIG, 1, INT_MAX, server.dbnum, 16, INTEGER_CONFIG, NULL, NULL),
    createIntConfig("port", NULL, MODIFIABLE_CONFIG, 0, 65535, server.port, 6379, INTEGER_CONFIG, NULL, updatePort), /* TCP port. */
    createIntConfig("io-threads", NULL, DEBUG_CONFIG | IMMUTABLE_CONFIG, 1, 128, server.io_threads_num, 1, INTEGER_CONFIG, NULL, NULL), /* Single threaded by default */
-    createIntConfig("prefetch-batch-max-size", NULL, MODIFIABLE_CONFIG | HIDDEN_CONFIG, 0, 128, server.prefetch_batch_max_size, 16, INTEGER_CONFIG, NULL, NULL),
+    createIntConfig("prefetch-batch-max-size", NULL, MODIFIABLE_CONFIG | HIDDEN_CONFIG, 0, PREFETCH_BATCH_MAX_SIZE, server.prefetch_batch_max_size, 16, INTEGER_CONFIG, NULL, NULL),
    createIntConfig("auto-aof-rewrite-percentage", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.aof_rewrite_perc, 100, INTEGER_CONFIG, NULL, NULL),
    createIntConfig("cluster-replica-validity-factor", "cluster-slave-validity-factor", MODIFIABLE_CONFIG, 0, INT_MAX, server.cluster_slave_validity_factor, 10, INTEGER_CONFIG, NULL, NULL), /* Slave max data age factor. */
    createIntConfig("list-max-listpack-size", "list-max-ziplist-size", MODIFIABLE_CONFIG, INT_MIN, INT_MAX, server.list_max_listpack_size, -2, INTEGER_CONFIG, NULL, NULL),
--- a/src/dict.c
+++ b/src/dict.c
@ -783,17 +783,11 @@ static dictEntryLink dictFindLinkInternal(dict *d, const void *key, dictEntryLin
        if (table == 0 && (long)idx < d->rehashidx) continue;
        idx = hash & DICTHT_SIZE_MASK(d->ht_size_exp[table]);

-        /* Prefetch the bucket at the calculated index */
-        redis_prefetch_read(&d->ht_table[table][idx]);
-
        link = &(d->ht_table[table][idx]);
        if (bucket) *bucket = link;
        while(link && *link) {
            const void *visitedKey = dictStoredKey2Key(d, dictGetKey(*link));

-            /* Prefetch the next entry to improve cache efficiency */
-            redis_prefetch_read(dictGetNext(*link));
-
            if (key == visitedKey || cmpFunc( &cmpCache, key, visitedKey))                
                return link;

--- a/src/iothread.c
+++ b/src/iothread.c
@ -351,18 +351,33 @@ int prefetchIOThreadCommands(IOThread *t) {
    int to_prefetch = determinePrefetchCount(len);
    if (to_prefetch == 0) return 0;

+    /* Two-phase approach to optimize cache utilization:
+     * Phase 1: Issue prefetch hints for client structures
+     * Phase 2: Access the now-cached client data and add commands to batch */
+    /* Sized at 2x the maximum because the configured batch size is doubled
+     * for better performance; see also `determinePrefetchCount`. */
+    static client *c[PREFETCH_BATCH_MAX_SIZE*2];
+    serverAssert(PREFETCH_BATCH_MAX_SIZE*2 >= to_prefetch );
    int clients = 0;
    listIter li;
    listNode *ln;
    listRewind(mainThreadProcessingClients[t->id], &li);
-    while((ln = listNext(&li)) && clients < to_prefetch) {
-        client *c = listNodeValue(ln);
-        /* A single command may contain multiple keys. If the batch is full,
-         * we stop adding clients to it. */
-        if (addCommandToBatch(c) == C_ERR) break;
-        clients++;
+    /* Phase 1: Issue prefetch instructions for client struct and pending_cmds.
+     * These prefetches will bring data into cache asynchronously. */
+    for (int i = 0; i < to_prefetch && (ln = listNext(&li)); i++) {
+        c[i] = listNodeValue(ln);
+        redis_prefetch_read(c[i]);
+        redis_prefetch_read(&c[i]->pending_cmds);
    }
-
+    /* Phase 2: Access client data (now likely in cache) and add to batch.
+     * Also prefetch additional fields (reply, mem_usage_bucket) that will be
+     * needed later during command execution. */
+    for (int i = 0; i < to_prefetch; i++) {
+        if (addCommandToBatch(c[i]) == C_ERR) break;
+        if (c[i]->reply) redis_prefetch_read(c[i]->reply);
+        redis_prefetch_read(&c[i]->mem_usage_bucket);
+        clients++;
+     }
    /* Prefetch the commands in the batch. */
    prefetchCommands();
    return clients;
--- a/src/server.h
+++ b/src/server.h
@ -814,6 +814,9 @@ typedef enum {
 #define BUSY_MODULE_YIELD_EVENTS (1<<0)
 #define BUSY_MODULE_YIELD_CLIENTS (1<<1)

+/* Key prefetch configs */
+#define PREFETCH_BATCH_MAX_SIZE 128
+
 /*-----------------------------------------------------------------------------
 * Data types
 *----------------------------------------------------------------------------*/
--- a/tests/unit/networking.tcl
+++ b/tests/unit/networking.tcl
@ -332,6 +332,40 @@ start_server {config "minimal.conf" tags {"external:skip"} overrides {enable-deb
            # With slower machines, the number of prefetch entries can be lower
            assert_range $new_prefetch_entries [expr {$prefetch_entries + 2}] [expr {$prefetch_entries + 16}]
        }
+
+        test {Prefetch works with batch size greater than 16 (buffer overflow regression test)} {
+            # save the current value of prefetch entries
+            set info [r info stats]
+            set prefetch_entries [getInfoProperty $info io_threaded_total_prefetch_entries]
+            # set the batch size to a value greater than the old hardcoded limit of 16
+            r config set prefetch-batch-max-size 64
+
+            # Create a batch with more than 16 clients to trigger the old buffer overflow
+            do_prefetch_batch $server_pid 64
+
+            # verify the prefetch entries increased
+            set info [r info stats]
+            set new_prefetch_entries [getInfoProperty $info io_threaded_total_prefetch_entries]
+            # With slower machines, the number of prefetch entries can be lower
+            assert_range $new_prefetch_entries [expr {$prefetch_entries + 2}] [expr {$prefetch_entries + 64}]
+        }
+
+        test {Prefetch works with maximum batch size of 128 and client number larger than batch size} {
+            # save the current value of prefetch entries
+            set info [r info stats]
+            set prefetch_entries [getInfoProperty $info io_threaded_total_prefetch_entries]
+            # set the batch size to the maximum allowed value
+            r config set prefetch-batch-max-size 128
+
+            # Create a batch with 300 clients to test the maximum limit
+            do_prefetch_batch $server_pid 300
+
+            # verify the prefetch entries increased
+            set info [r info stats]
+            set new_prefetch_entries [getInfoProperty $info io_threaded_total_prefetch_entries]
+            # With slower machines, the number of prefetch entries can be lower
+            assert_range $new_prefetch_entries [expr {$prefetch_entries + 2}] [expr {$prefetch_entries + 300}]
+        }
    }
 }