From 24c85cc36807a354361bdf273dad4f48eaa5ffca Mon Sep 17 00:00:00 2001 From: "Filipe Oliveira (Redis)" <52153106+fcostaoliveira@users.noreply.github.com> Date: Tue, 18 Jun 2024 11:00:47 +0100 Subject: [PATCH] reduce getNodeByQuery CPU time by using less cache lines (from 2064 Bytes struct to 64 Bytes): reduces LLC misses and Memory Loads (#13296) This PR shrinks the getKeysResult struct from 33 cachelines (by default it has a static buffer of 256 keys) ``` root@hpe10:~/redis# pahole -p ./src/server.o -C getKeysResult typedef struct { keyReference keysbuf[256]; /* 0 2048 */ /* --- cacheline 32 boundary (2048 bytes) --- */ /* typedef keyReference */ struct { int pos; int flags; } *keys; /* 2048 8 */ int numkeys; /* 2056 4 */ int size; /* 2060 4 */ /* size: 2064, cachelines: 33, members: 4 */ /* last cacheline: 16 bytes */ } getKeysResult; ``` to 1 cacheline (with a static buffer of 6 keys per command): ``` root@hpe10:~/redis# pahole -p ./src/server.o -C getKeysResult typedef struct { int numkeys; /* 0 4 */ int size; /* 4 4 */ keyReference keysbuf[6]; /* 8 48 */ /* typedef keyReference */ struct { int pos; int flags; } *keys; /* 56 8 */ /* size: 64, cachelines: 1, members: 4 */ } getKeysResult; ``` With this change we get around 1.5% higher ops/sec, and we confirm around 15% fewer LLC loads on getNodeByQuery and 37% fewer stores. 
Function / Call Stack | CPU Time: Difference | CPU Time: 9462436fa444e746716845b1d807c74d8945831b | CPU Time: this PR | Loads: Difference | Loads: 9462436fa444e746716845b1d807c74d8945831b | Loads: this PR | Stores: Difference | Stores: 9462436fa444e746716845b1d807c74d8945831b | Stores: This PR -- | -- | -- | -- | -- | -- | -- | -- | -- | -- getNodeByQuery | 0.753767 | 1.57118 | 0.817416 | 144297829 (15% less loads) | 920575969 | 776278140 | 367607824 (37% less stores) | 991642384 | 624034560 ## results on client side ### baseline ``` taskset -c 2,3 memtier_benchmark -s 192.168.1.200 --port 6379 --authenticate perf --cluster-mode --pipeline 10 --data-size 100 --ratio 1:0 --key-pattern P:P --key-minimum=1 --key-maximum 1000000 --test-time 180 -c 25 -t 2 --hide-histogram Writing results to stdout [RUN #1] Preparing benchmark client... [RUN #1] Launching threads now... [RUN #1 100%, 180 secs] 0 threads: 110333450 ops, 604992 (avg: 612942) ops/sec, 84.75MB/sec (avg: 85.86MB/sec), 0.82 (avg: 0.81) msec latency 2 Threads 25 Connections per thread 180 Seconds ALL STATS ====================================================================================================================================================== Type Ops/sec Hits/sec Misses/sec MOVED/sec ASK/sec Avg. 
Latency p50 Latency p99 Latency p99.9 Latency KB/sec ------------------------------------------------------------------------------------------------------------------------------------------------------ Sets 612942.14 --- --- 0.00 0.00 0.81332 0.80700 1.26300 2.92700 87924.12 Gets 0.00 0.00 0.00 0.00 0.00 --- --- --- --- 0.00 Waits 0.00 --- --- --- --- --- --- --- --- --- Totals 612942.14 0.00 0.00 0.00 0.00 0.81332 0.80700 1.26300 2.92700 87924.12 ``` ### comparison ``` taskset -c 2,3 memtier_benchmark -s 192.168.1.200 --port 6379 --authenticate perf --cluster-mode --pipeline 10 --data-size 100 --ratio 1:0 --key-pattern P:P --key-minimum=1 --key-maximum 1000000 --test-time 180 -c 25 -t 2 --hide-histogram Writing results to stdout [RUN #1] Preparing benchmark client... [RUN #1] Launching threads now... [RUN #1 100%, 180 secs] 0 threads: 111731310 ops, 610195 (avg: 620707) ops/sec, 85.48MB/sec (avg: 86.95MB/sec), 0.82 (avg: 0.80) msec latency 2 Threads 25 Connections per thread 180 Seconds ALL STATS ====================================================================================================================================================== Type Ops/sec Hits/sec Misses/sec MOVED/sec ASK/sec Avg. 
Latency p50 Latency p99 Latency p99.9 Latency KB/sec ------------------------------------------------------------------------------------------------------------------------------------------------------ Sets 620707.72 --- --- 0.00 0.00 0.80312 0.79900 1.23900 2.87900 89037.78 Gets 0.00 0.00 0.00 0.00 0.00 --- --- --- --- 0.00 Waits 0.00 --- --- --- --- --- --- --- --- --- Totals 620707.72 0.00 0.00 0.00 0.00 0.80312 0.79900 1.23900 2.87900 89037.78 ``` Co-authored-by: filipecosta90 --- src/server.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/src/server.h b/src/server.h index cff87e10f..dbb1b9621 100644 --- a/src/server.h +++ b/src/server.h @@ -2071,7 +2071,8 @@ struct redisServer { char *locale_collate; }; -#define MAX_KEYS_BUFFER 256 +/* we use 6 so that all getKeyResult fits a cacheline */ +#define MAX_KEYS_BUFFER 6 typedef struct { int pos; /* The position of the key within the client array */ @@ -2084,12 +2085,12 @@ typedef struct { * for returning channel information. */ typedef struct { + int numkeys; /* Number of key indices return */ + int size; /* Available array size */ keyReference keysbuf[MAX_KEYS_BUFFER]; /* Pre-allocated buffer, to save heap allocations */ keyReference *keys; /* Key indices array, points to keysbuf or heap */ - int numkeys; /* Number of key indices return */ - int size; /* Available array size */ } getKeysResult; -#define GETKEYS_RESULT_INIT { {{0}}, NULL, 0, MAX_KEYS_BUFFER } +#define GETKEYS_RESULT_INIT { 0, MAX_KEYS_BUFFER, {{0}}, NULL } /* Key specs definitions. *