Prefetch client fields before prefetching command-related data (#14700)

This PR refines the prefetch strategy by removing ineffective
dictionary-level prefetching (issued too close in the pipeline to the
access it targets) and improving prefetch usage in IO threads. The goal
is to better align prefetches with predictable access patterns.

## Changes

- Removed speculative prefetching from `dictFindLinkInternal()`,
simplifying the dictionary lookup hot path.
- Introduced a two-phase prefetch approach in
`prefetchIOThreadCommands()`:
  - Phase 1: Prefetch client structures and `pending_cmds`
  - Phase 2: Add commands to the batch and prefetch follow-up fields
    (`reply`, `mem_usage_bucket`)

## Performance

Measured with
`memtier_benchmark-1Mkeys-string-setget2000c-1KiB-pipeline-16`.

| Environment                  | % change |
|-----------------------------|----------|
| oss-standalone               | -0.1%    |
| oss-standalone-02-io-threads | +0.4%    |
| oss-standalone-04-io-threads | +1.6%    |
| oss-standalone-08-io-threads | +2.3%    |
| oss-standalone-12-io-threads | +0.7%    |
| oss-standalone-16-io-threads | +1.9%    |

Overall, this shows a throughput improvement of up to ~2% on IO-threaded
configurations (+0.4% to +2.3% across the runs above), with no meaningful
impact on non-IO-threaded setups.

---------

Co-authored-by: Yuan Wang <wangyuancode@163.com>
This commit is contained in:
Filipe Oliveira (Redis) 2026-01-18 12:14:39 +00:00 committed by GitHub
parent c93e4a62c6
commit 7f541b9607
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
5 changed files with 60 additions and 14 deletions

View file

@ -3186,7 +3186,7 @@ standardConfig static_configs[] = {
createIntConfig("databases", NULL, IMMUTABLE_CONFIG, 1, INT_MAX, server.dbnum, 16, INTEGER_CONFIG, NULL, NULL),
createIntConfig("port", NULL, MODIFIABLE_CONFIG, 0, 65535, server.port, 6379, INTEGER_CONFIG, NULL, updatePort), /* TCP port. */
createIntConfig("io-threads", NULL, DEBUG_CONFIG | IMMUTABLE_CONFIG, 1, 128, server.io_threads_num, 1, INTEGER_CONFIG, NULL, NULL), /* Single threaded by default */
createIntConfig("prefetch-batch-max-size", NULL, MODIFIABLE_CONFIG | HIDDEN_CONFIG, 0, 128, server.prefetch_batch_max_size, 16, INTEGER_CONFIG, NULL, NULL),
createIntConfig("prefetch-batch-max-size", NULL, MODIFIABLE_CONFIG | HIDDEN_CONFIG, 0, PREFETCH_BATCH_MAX_SIZE, server.prefetch_batch_max_size, 16, INTEGER_CONFIG, NULL, NULL),
createIntConfig("auto-aof-rewrite-percentage", NULL, MODIFIABLE_CONFIG, 0, INT_MAX, server.aof_rewrite_perc, 100, INTEGER_CONFIG, NULL, NULL),
createIntConfig("cluster-replica-validity-factor", "cluster-slave-validity-factor", MODIFIABLE_CONFIG, 0, INT_MAX, server.cluster_slave_validity_factor, 10, INTEGER_CONFIG, NULL, NULL), /* Slave max data age factor. */
createIntConfig("list-max-listpack-size", "list-max-ziplist-size", MODIFIABLE_CONFIG, INT_MIN, INT_MAX, server.list_max_listpack_size, -2, INTEGER_CONFIG, NULL, NULL),

View file

@ -783,17 +783,11 @@ static dictEntryLink dictFindLinkInternal(dict *d, const void *key, dictEntryLin
if (table == 0 && (long)idx < d->rehashidx) continue;
idx = hash & DICTHT_SIZE_MASK(d->ht_size_exp[table]);
/* Prefetch the bucket at the calculated index */
redis_prefetch_read(&d->ht_table[table][idx]);
link = &(d->ht_table[table][idx]);
if (bucket) *bucket = link;
while(link && *link) {
const void *visitedKey = dictStoredKey2Key(d, dictGetKey(*link));
/* Prefetch the next entry to improve cache efficiency */
redis_prefetch_read(dictGetNext(*link));
if (key == visitedKey || cmpFunc( &cmpCache, key, visitedKey))
return link;

View file

@ -351,18 +351,33 @@ int prefetchIOThreadCommands(IOThread *t) {
int to_prefetch = determinePrefetchCount(len);
if (to_prefetch == 0) return 0;
/* Two-phase approach to optimize cache utilization:
* Phase 1: Issue prefetch hints for client structures
* Phase 2: Access the now-cached client data and add commands to batch */
/* Since we double the configured size for better performance,
* see also `determinePrefetchCount` */
static client *c[PREFETCH_BATCH_MAX_SIZE*2];
serverAssert(PREFETCH_BATCH_MAX_SIZE*2 >= to_prefetch );
int clients = 0;
listIter li;
listNode *ln;
listRewind(mainThreadProcessingClients[t->id], &li);
while((ln = listNext(&li)) && clients < to_prefetch) {
client *c = listNodeValue(ln);
/* A single command may contain multiple keys. If the batch is full,
* we stop adding clients to it. */
if (addCommandToBatch(c) == C_ERR) break;
clients++;
/* Phase 1: Issue prefetch instructions for client struct and pending_cmds.
* These prefetches will bring data into cache asynchronously. */
for (int i = 0; i < to_prefetch && (ln = listNext(&li)); i++) {
c[i] = listNodeValue(ln);
redis_prefetch_read(c[i]);
redis_prefetch_read(&c[i]->pending_cmds);
}
/* Phase 2: Access client data (now likely in cache) and add to batch.
* Also prefetch additional fields (reply, mem_usage_bucket) that will be
* needed later during command execution. */
for (int i = 0; i < to_prefetch; i++) {
if (addCommandToBatch(c[i]) == C_ERR) break;
if (c[i]->reply) redis_prefetch_read(c[i]->reply);
redis_prefetch_read(&c[i]->mem_usage_bucket);
clients++;
}
/* Prefetch the commands in the batch. */
prefetchCommands();
return clients;

View file

@ -814,6 +814,9 @@ typedef enum {
#define BUSY_MODULE_YIELD_EVENTS (1<<0)
#define BUSY_MODULE_YIELD_CLIENTS (1<<1)
/* Key prefetch configs */
/* Upper bound accepted for the `prefetch-batch-max-size` config. The static
 * client array in prefetchIOThreadCommands() is sized to twice this value. */
#define PREFETCH_BATCH_MAX_SIZE 128
/*-----------------------------------------------------------------------------
* Data types
*----------------------------------------------------------------------------*/

View file

@ -332,6 +332,40 @@ start_server {config "minimal.conf" tags {"external:skip"} overrides {enable-deb
# With slower machines, the number of prefetch entries can be lower
assert_range $new_prefetch_entries [expr {$prefetch_entries + 2}] [expr {$prefetch_entries + 16}]
}
# Regression test: raise prefetch-batch-max-size above the old hardcoded
# limit of 16, run a prefetch batch of 64, and check that the
# io_threaded_total_prefetch_entries stat still grows.
test {Prefetch works with batch size greater than 16 (buffer overflow regression test)} {
# Save the current value of prefetch entries so the delta can be checked below
set info [r info stats]
set prefetch_entries [getInfoProperty $info io_threaded_total_prefetch_entries]
# Set the batch size to a value greater than the old hardcoded limit of 16
r config set prefetch-batch-max-size 64
# Create a batch with more than 16 clients to trigger the old buffer overflow
do_prefetch_batch $server_pid 64
# Verify the prefetch entries increased
set info [r info stats]
set new_prefetch_entries [getInfoProperty $info io_threaded_total_prefetch_entries]
# With slower machines, the number of prefetch entries can be lower, so only
# a loose range is asserted: between 2 and 64 new entries.
assert_range $new_prefetch_entries [expr {$prefetch_entries + 2}] [expr {$prefetch_entries + 64}]
}
# Set prefetch-batch-max-size to 128 (the maximum used by this test), run a
# prefetch batch of 300 (more than the batch size), and check that the
# io_threaded_total_prefetch_entries stat still grows.
test {Prefetch works with maximum batch size of 128 and client number larger than batch size} {
# Save the current value of prefetch entries so the delta can be checked below
set info [r info stats]
set prefetch_entries [getInfoProperty $info io_threaded_total_prefetch_entries]
# Set the batch size to the maximum allowed value
r config set prefetch-batch-max-size 128
# Create a batch with 300 clients to test the maximum limit
do_prefetch_batch $server_pid 300
# Verify the prefetch entries increased
set info [r info stats]
set new_prefetch_entries [getInfoProperty $info io_threaded_total_prefetch_entries]
# With slower machines, the number of prefetch entries can be lower, so only
# a loose range is asserted: between 2 and 300 new entries.
assert_range $new_prefetch_entries [expr {$prefetch_entries + 2}] [expr {$prefetch_entries + 300}]
}
}
}