From 86b8072216e40702bca65cc24cdddbd50b544f9f Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Thu, 12 Mar 2026 08:45:59 +0200 Subject: [PATCH 01/48] added: rax insert optimizations - append fast-path and over-allocation in raxAddChild --- src/rax.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/src/rax.c b/src/rax.c index 0b134d7af..7b22e1fdd 100644 --- a/src/rax.c +++ b/src/rax.c @@ -273,12 +273,22 @@ raxNode *raxAddChild(rax *rax, raxNode *n, unsigned char c, raxNode **childptr, if (child == NULL) return NULL; /* Make space in the original node. */ - raxNode *newn = raxNodeRealloc(rax,n,newlen); - if (newn == NULL) { - raxFreeNode(rax,child); - return NULL; + size_t usable = rax_malloc_usable_size(n); + if (usable >= newlen) { + /* Existing allocation has room -- skip realloc entirely. */ + } else { + /* Grow with 2x factor to amortize future inserts. */ + size_t growlen = newlen < 1024 ? newlen * 2 : newlen + 1024; + raxNode *newn = raxNodeRealloc(rax, n, growlen); + if (newn == NULL) { + newn = raxNodeRealloc(rax, n, newlen); + if (newn == NULL) { + raxFreeNode(rax, child); + return NULL; + } + } + n = newn; } - n = newn; /* After the reallocation, we have up to 8/16 (depending on the system * pointer size, and the required node padding) bytes at the end, that is, @@ -309,8 +319,12 @@ raxNode *raxAddChild(rax *rax, raxNode *n, unsigned char c, raxNode **childptr, * a child "c" in our case pos will be = 2 after the end of the following * loop. 
*/ int pos; - for (pos = 0; pos < n->size; pos++) { - if (n->data[pos] > c) break; + if (n->size > 0 && c > n->data[n->size - 1]) { + pos = n->size; + } else { + for (pos = 0; pos < n->size; pos++) { + if (n->data[pos] > c) break; + } } /* Now, if present, move auxiliary data pointer at the end From 88cf30648de4433de8fa2fcd45fbb7fbf2e69914 Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Thu, 12 Mar 2026 10:51:59 +0200 Subject: [PATCH 02/48] added: rax fast-paths for sequential inserts in raxLowWalk --- src/rax.c | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/rax.c b/src/rax.c index 7b22e1fdd..5b8cad3d4 100644 --- a/src/rax.c +++ b/src/rax.c @@ -492,13 +492,21 @@ static inline size_t raxLowWalk(rax *rax, unsigned char *s, size_t len, raxNode } if (j != h->size) break; } else { - /* Even when h->size is large, linear scan provides good - * performances compared to other approaches that are in theory - * more sounding, like performing a binary search. */ - for (j = 0; j < h->size; j++) { - if (v[j] == s[i]) break; + /* Children are sorted. Check the last child first: for + * sequential inserts the match is almost always at the end, + * and for random keys the extra compare is negligible vs + * the O(n) scan that follows on miss. 
*/ + if (v[h->size - 1] == s[i]) { + j = h->size - 1; + } else if (s[i] > v[h->size - 1]) { + j = h->size; + break; + } else { + for (j = 0; j < h->size; j++) { + if (v[j] == s[i]) break; + } + if (j == h->size) break; } - if (j == h->size) break; i++; } From b1cabf7f20bf5e79b596af8f75a823ec16df9491 Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Thu, 12 Mar 2026 11:23:25 +0200 Subject: [PATCH 03/48] changed: revert over allocation --- src/rax.c | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/src/rax.c b/src/rax.c index 5b8cad3d4..30ff68785 100644 --- a/src/rax.c +++ b/src/rax.c @@ -273,22 +273,12 @@ raxNode *raxAddChild(rax *rax, raxNode *n, unsigned char c, raxNode **childptr, if (child == NULL) return NULL; /* Make space in the original node. */ - size_t usable = rax_malloc_usable_size(n); - if (usable >= newlen) { - /* Existing allocation has room -- skip realloc entirely. */ - } else { - /* Grow with 2x factor to amortize future inserts. */ - size_t growlen = newlen < 1024 ? 
newlen * 2 : newlen + 1024; - raxNode *newn = raxNodeRealloc(rax, n, growlen); - if (newn == NULL) { - newn = raxNodeRealloc(rax, n, newlen); - if (newn == NULL) { - raxFreeNode(rax, child); - return NULL; - } - } - n = newn; + raxNode *newn = raxNodeRealloc(rax,n,newlen); + if (newn == NULL) { + raxFreeNode(rax,child); + return NULL; } + n = newn; /* After the reallocation, we have up to 8/16 (depending on the system * pointer size, and the required node padding) bytes at the end, that is, From 87d2d1730efb948cab8bdea5e30842a1875f1df8 Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Thu, 12 Mar 2026 15:31:31 +0200 Subject: [PATCH 04/48] added: skip realloc in raxAddChild when usable block already fits --- src/rax.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/rax.c b/src/rax.c index 30ff68785..195a4565b 100644 --- a/src/rax.c +++ b/src/rax.c @@ -272,13 +272,17 @@ raxNode *raxAddChild(rax *rax, raxNode *n, unsigned char c, raxNode **childptr, raxNode *child = raxNewNode(rax,0,0); if (child == NULL) return NULL; - /* Make space in the original node. */ - raxNode *newn = raxNodeRealloc(rax,n,newlen); - if (newn == NULL) { - raxFreeNode(rax,child); - return NULL; + /* Make space in the original node. If the current allocation already + * has enough usable bytes (common with jemalloc size-class rounding), + * skip the realloc entirely. 
*/ + if (rax_malloc_usable_size(n) < newlen) { + raxNode *newn = raxNodeRealloc(rax,n,newlen); + if (newn == NULL) { + raxFreeNode(rax,child); + return NULL; + } + n = newn; } - n = newn; /* After the reallocation, we have up to 8/16 (depending on the system * pointer size, and the required node padding) bytes at the end, that is, From 96f4dae1d8190bfe6f8768bdb27c22a6e7efc622 Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Fri, 20 Mar 2026 11:09:09 +0200 Subject: [PATCH 05/48] changed: replace rax PEL with two-level rax(ms)->flax(seq) structure --- src/Makefile | 2 +- src/aof.c | 33 +- src/defrag.c | 94 ++++-- src/flax.c | 783 ++++++++++++++++++++++++++++++++++++++++++++++ src/flax.h | 45 +++ src/flax_malloc.h | 6 + src/lazyfree.c | 2 +- src/rdb.c | 95 +++--- src/server.c | 1 + src/server.h | 1 + src/stream.h | 63 +++- src/t_stream.c | 673 +++++++++++++++++++++++++-------------- 12 files changed, 1457 insertions(+), 341 deletions(-) create mode 100644 src/flax.c create mode 100644 src/flax.h create mode 100644 src/flax_malloc.h diff --git a/src/Makefile b/src/Makefile index b3ebd13b8..36e4b5b4f 100644 --- a/src/Makefile +++ b/src/Makefile @@ -382,7 +382,7 @@ endif REDIS_SERVER_NAME=redis-server$(PROG_SUFFIX) REDIS_SENTINEL_NAME=redis-sentinel$(PROG_SUFFIX) -REDIS_SERVER_OBJ=threads_mngr.o memory_prefetch.o adlist.o quicklist.o ae.o anet.o dict.o ebuckets.o eventnotifier.o iothread.o mstr.o entry.o kvstore.o fwtree.o estore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_asm.o cluster_legacy.o cluster_slot_stats.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crccombine.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o 
redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o lolwut8.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o keymeta.o chk.o hotkeys.o gcra.o +REDIS_SERVER_OBJ=threads_mngr.o memory_prefetch.o adlist.o quicklist.o ae.o anet.o dict.o ebuckets.o eventnotifier.o iothread.o mstr.o entry.o kvstore.o fwtree.o estore.o server.o sds.o zmalloc.o lzf_c.o lzf_d.o pqsort.o zipmap.o sha1.o ziplist.o release.o networking.o util.o object.o db.o replication.o rdb.o t_string.o t_list.o t_set.o t_zset.o t_hash.o config.o aof.o pubsub.o multi.o debug.o sort.o intset.o syncio.o cluster.o cluster_asm.o cluster_legacy.o cluster_slot_stats.o crc16.o endianconv.o slowlog.o eval.o bio.o rio.o rand.o memtest.o syscheck.o crcspeed.o crccombine.o crc64.o bitops.o sentinel.o notify.o setproctitle.o blocked.o hyperloglog.o latency.o sparkline.o redis-check-rdb.o redis-check-aof.o geo.o lazyfree.o module.o evict.o expire.o geohash.o geohash_helper.o childinfo.o defrag.o siphash.o rax.o flax.o t_stream.o listpack.o localtime.o lolwut.o lolwut5.o lolwut6.o lolwut8.o acl.o tracking.o socket.o tls.o sha256.o timeout.o setcpuaffinity.o monotonic.o mt19937-64.o resp_parser.o call_reply.o script_lua.o script.o functions.o function_lua.o commands.o strl.o connection.o unix.o logreqres.o keymeta.o chk.o hotkeys.o gcra.o REDIS_CLI_NAME=redis-cli$(PROG_SUFFIX) REDIS_CLI_OBJ=anet.o adlist.o dict.o redis-cli.o zmalloc.o release.o ae.o redisassert.o crcspeed.o crccombine.o crc64.o siphash.o crc16.o monotonic.o cli_common.o mt19937-64.o strl.o cli_commands.o REDIS_BENCHMARK_NAME=redis-benchmark$(PROG_SUFFIX) diff --git a/src/aof.c b/src/aof.c index b489608d4..969c01f00 100644 --- a/src/aof.c +++ b/src/aof.c 
@@ -2324,7 +2324,7 @@ int rewriteStreamObject(rio *r, robj *key, robj *o) { while(raxNext(&ri_cons)) { streamConsumer *consumer = ri_cons.data; /* If there are no pending entries, just emit XGROUP CREATECONSUMER */ - if (raxSize(consumer->pel) == 0) { + if (consumer->pel_count == 0) { if (rioWriteStreamEmptyConsumer(r,key,(char*)ri.key, ri.key_len,consumer) == 0) { @@ -2336,22 +2336,23 @@ int rewriteStreamObject(rio *r, robj *key, robj *o) { } /* For the current consumer, iterate all the PEL entries * to emit the XCLAIM protocol. */ - raxIterator ri_pel; - raxStart(&ri_pel,consumer->pel); - raxSeek(&ri_pel,"^",NULL,0); - while(raxNext(&ri_pel)) { - streamNACK *nack = ri_pel.data; - if (rioWriteStreamPendingEntry(r,key,(char*)ri.key, - ri.key_len,consumer, - ri_pel.key,nack) == 0) - { - raxStop(&ri_pel); - raxStop(&ri_cons); - raxStop(&ri); - return 0; - } + pelIterator pi_pel; + pelIterStart(&pi_pel,consumer->pel); + if (pelIterSeek(&pi_pel,"^",NULL)) { + do { + streamNACK *nack = pi_pel.nack; + if (rioWriteStreamPendingEntry(r,key,(char*)ri.key, + ri.key_len,consumer, + pi_pel.rawkey,nack) == 0) + { + pelIterStop(&pi_pel); + raxStop(&ri_cons); + raxStop(&ri); + return 0; + } + } while (pelIterNext(&pi_pel)); } - raxStop(&ri_pel); + pelIterStop(&pi_pel); } raxStop(&ri_cons); } diff --git a/src/defrag.c b/src/defrag.c index b80d1ef28..0e97064a7 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -19,6 +19,7 @@ */ #include "server.h" +#include "stream.h" #include #include @@ -860,35 +861,53 @@ typedef struct { streamConsumer *c; } PendingEntryContext; -void* defragStreamConsumerPendingEntry(raxIterator *ri, void *privdata) { +/* Defrag a flax bucket in the consumer PEL. Each flax value is a NACK shared + * with the group PEL, so we update pointers in both places. 
*/ +void* defragStreamConsumerPelFlax(raxIterator *ri, void *privdata) { PendingEntryContext *ctx = privdata; - streamNACK *nack = ri->data, *newnack; - nack->consumer = ctx->c; /* update nack pointer to consumer */ - nack->cgroup_ref_node->value = ctx->cg; /* Update the value of cgroups_ref node to the consumer group. */ - newnack = activeDefragAlloc(nack); - if (newnack) { - /* Update consumer group pointer to the nack. */ - void *prev; - raxInsert(ctx->cg->pel, ri->key, ri->key_len, newnack, &prev); - serverAssert(prev==nack); - - /* Update the doubly-linked list pointers in adjacent nacks. - * When we move a nack to a new address, we need to update the - * pel_prev->pel_next and pel_next->pel_prev pointers. */ - if (newnack->pel_prev) { - newnack->pel_prev->pel_next = newnack; - } else { - /* This is the head of the list */ - ctx->cg->pel_time_head = newnack; - } - if (newnack->pel_next) { - newnack->pel_next->pel_prev = newnack; - } else { - /* This is the tail of the list */ - ctx->cg->pel_time_tail = newnack; - } + flax *f = ri->data; + flax *newf = activeDefragAlloc(f); + if (newf) f = newf; + + /* Iterate entries in the flax and defrag each NACK. */ + flaxIterator fi; + flaxStart(&fi, f); + if (flaxSeek(&fi, "^", 0)) { + do { + streamNACK *nack = fi.data; + nack->consumer = ctx->c; + nack->cgroup_ref_node->value = ctx->cg; + streamNACK *newnack = activeDefragAlloc(nack); + if (newnack) { + /* Update in the consumer PEL flax. */ + flaxInsert(f, fi.key, newnack, NULL); + + /* Update in the group PEL flax. */ + unsigned char msbuf[8]; + uint64_t ms_be = htonu64(newnack->id.ms); + memcpy(msbuf, &ms_be, 8); + void *grp_flax_ptr = NULL; + raxFind(ctx->cg->pel, msbuf, 8, &grp_flax_ptr); + if (grp_flax_ptr) { + flaxInsert((flax *)grp_flax_ptr, (int64_t)newnack->id.seq, newnack, NULL); + } + + /* Update doubly-linked list pointers. 
*/ + if (newnack->pel_prev) { + newnack->pel_prev->pel_next = newnack; + } else { + ctx->cg->pel_time_head = newnack; + } + if (newnack->pel_next) { + newnack->pel_next->pel_prev = newnack; + } else { + ctx->cg->pel_time_tail = newnack; + } + } + } while (flaxNext(&fi)); } - return newnack; + + return newf; } typedef struct { @@ -909,12 +928,21 @@ void* defragStreamConsumer(raxIterator *ri, void *privdata) { if (newsds) c->name = newsds; if (c->pel) { - /* Update pel back-pointer to new stream */ c->pel->alloc_size = &s->alloc_size; PendingEntryContext pel_ctx = {cg, c}; - defragRadixTree(&c->pel, 0, defragStreamConsumerPendingEntry, &pel_ctx); + defragRadixTree(&c->pel, 0, defragStreamConsumerPelFlax, &pel_ctx); + pelCacheInvalidate(c->pel); } - return newc; /* returns NULL if c was not defragged */ + return newc; +} + +/* Defrag a flax bucket in the group PEL. Only defrags the flax struct itself, + * not the NACKs (those are defragged via consumer PEL traversal). */ +void* defragStreamGroupPelFlax(raxIterator *ri, void *privdata) { + (void)privdata; + flax *f = ri->data; + flax *newf = activeDefragAlloc(f); + return newf; } void* defragStreamConsumerGroup(raxIterator *ri, void *privdata) { @@ -923,13 +951,11 @@ void* defragStreamConsumerGroup(raxIterator *ri, void *privdata) { if ((newcg = activeDefragAlloc(cg))) cg = newcg; if (cg->pel) { - /* Update pel back-pointer to new stream */ cg->pel->alloc_size = &s->alloc_size; - defragRadixTree(&cg->pel, 0, NULL, NULL); + defragRadixTree(&cg->pel, 0, defragStreamGroupPelFlax, NULL); + pelCacheInvalidate(cg->pel); } - /* pel_time_head/tail are just pointers to NACKs in pel, no separate defrag needed */ if (cg->consumers) { - /* Update consumers back-pointer to new stream */ cg->consumers->alloc_size = &s->alloc_size; StreamConsumerContext consumer_ctx = {s, cg}; defragRadixTree(&cg->consumers, 0, defragStreamConsumer, &consumer_ctx); diff --git a/src/flax.c b/src/flax.c new file mode 100644 index 000000000..e30407ea1 
--- /dev/null +++ b/src/flax.c @@ -0,0 +1,783 @@ +#include "flax.h" +#include +#include +#include + +#ifndef flax_malloc +#ifndef FLAX_MALLOC_INCLUDE +#define FLAX_MALLOC_INCLUDE "flax_malloc.h" +#endif +#include FLAX_MALLOC_INCLUDE +#endif + +static size_t flax_values_offset(int64_t capacity) { + size_t raw = (size_t)capacity * sizeof(int64_t); + size_t align = alignof(void *); + return (raw + align - 1) & ~(align - 1); +} + +static int64_t *flax_keys(flax *f) { + return (int64_t *)f->data; +} + +static void **flax_values(flax *f) { + return (void **)((char *)f->data + flax_values_offset(f->capacity)); +} + +/* Linear scan with fast paths for first/last. + * Returns 1 if key found (out_idx = its index), 0 if not (out_idx = insertion point). + * Sequential access through the contiguous keys array is cache-friendly and + * avoids branch-misprediction overhead of binary search at typical flax sizes. */ +static int flax_search(const int64_t *keys, int64_t numele, int64_t key, int64_t *out_idx) { + if (numele == 0) { + *out_idx = 0; + return 0; + } + + /* Fast path: append (most common — seq numbers grow monotonically). */ + if (key > keys[numele - 1]) { + *out_idx = numele; + return 0; + } + if (key == keys[numele - 1]) { + *out_idx = numele - 1; + return 1; + } + + /* Fast path: match or prepend at head. */ + if (key <= keys[0]) { + *out_idx = 0; + return key == keys[0]; + } + + /* Linear scan through the middle. 
*/ + for (int64_t i = 1; i < numele - 1; i++) { + if (keys[i] < key) continue; + *out_idx = i; + return keys[i] == key; + } + + *out_idx = numele - 1; + return 0; +} + +static void flax_resize(flax *f, int64_t new_capacity) { + size_t new_voff = flax_values_offset(new_capacity); + size_t new_alloc = new_voff + (size_t)new_capacity * sizeof(void *); + void *new_data = flax_malloc(new_alloc); + + if (f->data && f->numele > 0) { + memcpy(new_data, f->data, (size_t)f->numele * sizeof(int64_t)); + memcpy((char *)new_data + new_voff, + (char *)f->data + flax_values_offset(f->capacity), + (size_t)f->numele * sizeof(void *)); + } + + flax_free(f->data); + f->data = new_data; + f->capacity = new_capacity; +} + +static void flaxIterRefresh(flaxIterator *it) { + it->key = flax_keys(it->f)[it->idx]; + it->data = flax_values(it->f)[it->idx]; +} + +flax *flaxNew(void) { + flax *f = flax_malloc(sizeof(flax)); + f->numele = 0; + f->capacity = FLAX_MIN_CAPACITY; + size_t voff = flax_values_offset(FLAX_MIN_CAPACITY); + f->data = flax_malloc(voff + (size_t)FLAX_MIN_CAPACITY * sizeof(void *)); + return f; +} + +int flaxInsert(flax *f, int64_t key, void *data, void **old) { + if (f->numele == f->capacity) { + int64_t new_cap = f->capacity == 0 ? FLAX_MIN_CAPACITY : f->capacity * 2; + flax_resize(f, new_cap); + } + + int64_t idx; + if (flax_search(flax_keys(f), f->numele, key, &idx)) { + void **vals = flax_values(f); + if (old) *old = vals[idx]; + vals[idx] = data; + return 1; + } + + int64_t *keys = flax_keys(f); + void **vals = flax_values(f); + int64_t tail = f->numele - idx; + + if (tail > 0) { + memmove(&keys[idx + 1], &keys[idx], (size_t)tail * sizeof(int64_t)); + memmove(&vals[idx + 1], &vals[idx], (size_t)tail * sizeof(void *)); + } + + keys[idx] = key; + vals[idx] = data; + f->numele++; + if (old) *old = NULL; + return 1; +} + +int flaxTryInsert(flax *f, int64_t key, void *data, void **old) { + if (f->numele == f->capacity) { + int64_t new_cap = f->capacity == 0 ? 
FLAX_MIN_CAPACITY : f->capacity * 2; + flax_resize(f, new_cap); + } + + int64_t idx; + if (flax_search(flax_keys(f), f->numele, key, &idx)) { + if (old) *old = flax_values(f)[idx]; + return 0; + } + + int64_t *keys = flax_keys(f); + void **vals = flax_values(f); + int64_t tail = f->numele - idx; + + if (tail > 0) { + memmove(&keys[idx + 1], &keys[idx], (size_t)tail * sizeof(int64_t)); + memmove(&vals[idx + 1], &vals[idx], (size_t)tail * sizeof(void *)); + } + + keys[idx] = key; + vals[idx] = data; + f->numele++; + if (old) *old = NULL; + return 1; +} + +int flaxRemove(flax *f, int64_t key, void **old) { + if (!f || f->numele == 0) { + if (old) *old = NULL; + return 0; + } + + int64_t idx; + if (!flax_search(flax_keys(f), f->numele, key, &idx)) { + if (old) *old = NULL; + return 0; + } + + int64_t *keys = flax_keys(f); + void **vals = flax_values(f); + if (old) *old = vals[idx]; + int64_t tail = f->numele - idx - 1; + + if (tail > 0) { + memmove(&keys[idx], &keys[idx + 1], (size_t)tail * sizeof(int64_t)); + memmove(&vals[idx], &vals[idx + 1], (size_t)tail * sizeof(void *)); + } + + f->numele--; + + if (f->capacity > FLAX_MIN_CAPACITY && + f->numele < f->capacity / 4 && + f->capacity / 2 >= FLAX_MIN_CAPACITY) { + flax_resize(f, f->capacity / 2); + } + + return 1; +} + +int flaxFind(flax *f, int64_t key, void **value) { + if (!f || f->numele == 0) { + if (value) *value = NULL; + return 0; + } + int64_t idx; + if (flax_search(flax_keys(f), f->numele, key, &idx)) { + if (value) *value = flax_values(f)[idx]; + return 1; + } + if (value) *value = NULL; + return 0; +} + +void flaxFree(flax *f) { + flaxFreeWithCallback(f, NULL); +} + +void flaxFreeWithCallback(flax *f, void (*free_callback)(void *)) { + if (!f) return; + if (free_callback && f->data && f->numele > 0) { + void **vals = flax_values(f); + for (int64_t i = 0; i < f->numele; i++) + free_callback(vals[i]); + } + flax_free(f->data); + flax_free(f); +} + +void flaxFreeWithCbAndContext(flax *f, + void 
(*free_callback)(void *item, void *ctx), + void *ctx) { + if (!f) return; + if (free_callback && f->data && f->numele > 0) { + void **vals = flax_values(f); + for (int64_t i = 0; i < f->numele; i++) + free_callback(vals[i], ctx); + } + flax_free(f->data); + flax_free(f); +} + +uint64_t flaxSize(flax *f) { + return (uint64_t)f->numele; +} + +/* --- Iterator implementation --- */ + +void flaxStart(flaxIterator *it, flax *f) { + it->f = f; + it->idx = -1; + it->key = 0; + it->data = NULL; +} + +int flaxSeek(flaxIterator *it, const char *op, int64_t key) { + if (!it->f || it->f->numele == 0) { + it->idx = -1; + it->key = 0; + it->data = NULL; + return 0; + } + + if (op[0] == '^') { + it->idx = 0; + flaxIterRefresh(it); + return 1; + } + + if (op[0] == '$') { + it->idx = it->f->numele - 1; + flaxIterRefresh(it); + return 1; + } + + if (op[0] == '>' && op[1] == '=') { + int64_t idx; + flax_search(flax_keys(it->f), it->f->numele, key, &idx); + if (idx >= it->f->numele) { + it->idx = -1; + it->key = 0; + it->data = NULL; + return 0; + } + it->idx = idx; + flaxIterRefresh(it); + return 1; + } + + if (op[0] == '>' && op[1] == '\0') { + int64_t idx; + int found = flax_search(flax_keys(it->f), it->f->numele, key, &idx); + if (found) idx++; + if (idx >= it->f->numele) { + it->idx = -1; + it->key = 0; + it->data = NULL; + return 0; + } + it->idx = idx; + flaxIterRefresh(it); + return 1; + } + + if (op[0] == '<' && op[1] == '=') { + int64_t idx; + int found = flax_search(flax_keys(it->f), it->f->numele, key, &idx); + if (found) { + it->idx = idx; + } else { + if (idx == 0) { + it->idx = -1; + it->key = 0; + it->data = NULL; + return 0; + } + it->idx = idx - 1; + } + flaxIterRefresh(it); + return 1; + } + + if (op[0] == '<' && op[1] == '\0') { + int64_t idx; + flax_search(flax_keys(it->f), it->f->numele, key, &idx); + if (idx == 0) { + it->idx = -1; + it->key = 0; + it->data = NULL; + return 0; + } + it->idx = idx - 1; + flaxIterRefresh(it); + return 1; + } + + if (op[0] == '=' && 
op[1] == '\0') { + int64_t idx; + if (!flax_search(flax_keys(it->f), it->f->numele, key, &idx)) { + it->idx = -1; + it->key = 0; + it->data = NULL; + return 0; + } + it->idx = idx; + flaxIterRefresh(it); + return 1; + } + + it->idx = -1; + it->key = 0; + it->data = NULL; + return 0; +} + +int flaxNext(flaxIterator *it) { + if (it->idx < 0) return 0; + it->idx++; + if (it->idx >= it->f->numele) { + it->idx = -1; + it->key = 0; + it->data = NULL; + return 0; + } + flaxIterRefresh(it); + return 1; +} + +int flaxPrev(flaxIterator *it) { + if (it->idx < 0) return 0; + it->idx--; + if (it->idx < 0) { + it->key = 0; + it->data = NULL; + return 0; + } + flaxIterRefresh(it); + return 1; +} + +void flaxStop(flaxIterator *it) { + (void)it; +} + +int flaxEOF(flaxIterator *it) { + return it->idx < 0 || it->idx >= it->f->numele; +} + +#ifdef REDIS_TEST +#include "testhelp.h" +#include +#include + +#define UNUSED(x) (void)(x) + +#define ERR(x, ...) \ + do { \ + printf("%s:%s:%d:\t", __FILE__, __func__, __LINE__); \ + printf("ERROR! 
" x "\n", __VA_ARGS__); \ + err++; \ + } while (0) + +#define TEST(name) printf("test — %s\n", name); + +static int flax_test_free_count; + +static void flax_test_counting_free(void *p) { + flax_test_free_count++; + flax_free(p); +} + +static void flax_test_ctx_free(void *p, void *ctx) { + (void)p; + int *cnt = ctx; + (*cnt)++; +} + +int flaxTest(int argc, char **argv, int flags) { + UNUSED(argc); + UNUSED(argv); + UNUSED(flags); + + int err = 0; + + TEST("new and free empty") { + flax *a = flaxNew(); + assert(a != NULL); + assert(a->numele == 0); + assert(a->capacity == FLAX_MIN_CAPACITY); + assert(a->data != NULL); + flaxFree(a); + } + + TEST("find on empty flax") { + flax *a = flaxNew(); + void *val; + assert(flaxFind(a, 42, &val) == 0); + assert(val == NULL); + assert(flaxFind(a, 1, &val) == 0); + flaxFree(a); + } + + TEST("insert and find") { + flax *a = flaxNew(); + void *old, *val; + + flaxInsert(a, 30, "thirty", &old); + assert(old == NULL); + flaxInsert(a, 10, "ten", &old); + assert(old == NULL); + flaxInsert(a, 50, "fifty", &old); + assert(old == NULL); + flaxInsert(a, 20, "twenty", &old); + assert(old == NULL); + flaxInsert(a, 40, "forty", &old); + assert(old == NULL); + assert(flaxSize(a) == 5); + + assert(flaxFind(a, 10, &val) == 1); + assert(strcmp(val, "ten") == 0); + assert(flaxFind(a, 20, &val) == 1); + assert(strcmp(val, "twenty") == 0); + assert(flaxFind(a, 30, &val) == 1); + assert(strcmp(val, "thirty") == 0); + assert(flaxFind(a, 40, &val) == 1); + assert(strcmp(val, "forty") == 0); + assert(flaxFind(a, 50, &val) == 1); + assert(strcmp(val, "fifty") == 0); + assert(flaxFind(a, 99, &val) == 0); + assert(flaxFind(a, 0, &val) == 0); + + flaxFree(a); + } + + TEST("insert duplicate replaces value") { + flax *a = flaxNew(); + flaxInsert(a, 5, "old_five", NULL); + flaxInsert(a, 10, "old_ten", NULL); + + void *old, *val; + flaxInsert(a, 5, "new_five", &old); + assert(old != NULL); + assert(strcmp(old, "old_five") == 0); + assert(flaxSize(a) == 2); + 
assert(flaxFind(a, 5, &val) == 1); + assert(strcmp(val, "new_five") == 0); + + flaxInsert(a, 10, "new_ten", &old); + assert(strcmp(old, "old_ten") == 0); + assert(flaxSize(a) == 2); + assert(flaxFind(a, 10, &val) == 1); + assert(strcmp(val, "new_ten") == 0); + + flaxFree(a); + } + + TEST("remove basic") { + flax *a = flaxNew(); + flaxInsert(a, 1, "one", NULL); + flaxInsert(a, 2, "two", NULL); + flaxInsert(a, 3, "three", NULL); + + void *old, *val; + assert(flaxRemove(a, 2, &old) == 1); + assert(strcmp(old, "two") == 0); + assert(flaxSize(a) == 2); + assert(flaxFind(a, 2, &val) == 0); + assert(flaxFind(a, 1, &val) == 1); + assert(strcmp(val, "one") == 0); + assert(flaxFind(a, 3, &val) == 1); + assert(strcmp(val, "three") == 0); + + flaxFree(a); + } + + TEST("remove not found") { + flax *a = flaxNew(); + flaxInsert(a, 1, "one", NULL); + void *old; + assert(flaxRemove(a, 99, &old) == 0); + assert(old == NULL); + assert(flaxSize(a) == 1); + + flax *b = flaxNew(); + assert(flaxRemove(b, 1, &old) == 0); + flaxFree(b); + + flaxFree(a); + } + + TEST("remove only element") { + flax *a = flaxNew(); + flaxInsert(a, 42, "answer", NULL); + void *old, *val; + assert(flaxRemove(a, 42, &old) == 1); + assert(strcmp(old, "answer") == 0); + assert(flaxSize(a) == 0); + assert(flaxFind(a, 42, &val) == 0); + + flaxFree(a); + } + + TEST("insert at beginning and end") { + flax *a = flaxNew(); + flaxInsert(a, 50, "middle", NULL); + flaxInsert(a, 100, "end", NULL); + flaxInsert(a, 1, "begin", NULL); + + void *val; + assert(flaxSize(a) == 3); + assert(flaxFind(a, 1, &val) == 1); + assert(strcmp(val, "begin") == 0); + assert(flaxFind(a, 50, &val) == 1); + assert(strcmp(val, "middle") == 0); + assert(flaxFind(a, 100, &val) == 1); + assert(strcmp(val, "end") == 0); + + flaxFree(a); + } + + TEST("grow beyond initial capacity") { + flax *a = flaxNew(); + for (int64_t i = 0; i < 100; i++) { + char *buf = flax_malloc(16); + snprintf(buf, 16, "v%lld", (long long)i); + flaxInsert(a, i * 3, buf, 
NULL); + } + assert(flaxSize(a) == 100); + assert(a->capacity >= 100); + + for (int64_t i = 0; i < 100; i++) { + char expected[16]; + snprintf(expected, sizeof(expected), "v%lld", (long long)i); + void *val; + assert(flaxFind(a, i * 3, &val) == 1); + if (strcmp(val, expected) != 0) { + ERR("grow: key %lld expected '%s' got '%s'", + (long long)(i * 3), expected, (char *)val); + } + } + + flaxFreeWithCallback(a, flax_free); + } + + TEST("shrink after many removals") { + flax *a = flaxNew(); + for (int64_t i = 0; i < 64; i++) + flaxInsert(a, i, "x", NULL); + + assert(flaxSize(a) == 64); + int64_t cap_before = a->capacity; + + for (int64_t i = 0; i < 56; i++) + flaxRemove(a, i, NULL); + + assert(flaxSize(a) == 8); + if (a->capacity >= cap_before) { + ERR("shrink: capacity %lld should be less than %lld", + (long long)a->capacity, (long long)cap_before); + } + + for (int64_t i = 56; i < 64; i++) { + void *val; + assert(flaxFind(a, i, &val) == 1); + assert(strcmp(val, "x") == 0); + } + + flaxFree(a); + } + + TEST("flaxFreeWithCallback invokes callback") { + flax_test_free_count = 0; + flax *a = flaxNew(); + for (int i = 0; i < 5; i++) { + char *s = flax_malloc(8); + snprintf(s, 8, "str%d", i); + flaxInsert(a, i, s, NULL); + } + flaxFreeWithCallback(a, flax_test_counting_free); + if (flax_test_free_count != 5) { + ERR("freeWithCallback: expected 5 frees, got %d", + flax_test_free_count); + } + } + + TEST("flaxFreeWithCallback on empty flax") { + flax_test_free_count = 0; + flax *a = flaxNew(); + flaxFreeWithCallback(a, flax_test_counting_free); + if (flax_test_free_count != 0) { + ERR("freeWithCallback empty: expected 0 frees, got %d", + flax_test_free_count); + } + } + + TEST("negative keys") { + flax *a = flaxNew(); + flaxInsert(a, -100, "neg100", NULL); + flaxInsert(a, 0, "zero", NULL); + flaxInsert(a, 100, "pos100", NULL); + flaxInsert(a, -50, "neg50", NULL); + + void *val; + assert(flaxSize(a) == 4); + assert(flaxFind(a, -100, &val) == 1); + assert(strcmp(val, 
"neg100") == 0); + assert(flaxFind(a, -50, &val) == 1); + assert(strcmp(val, "neg50") == 0); + assert(flaxFind(a, 0, &val) == 1); + assert(strcmp(val, "zero") == 0); + assert(flaxFind(a, 100, &val) == 1); + assert(strcmp(val, "pos100") == 0); + + flaxFree(a); + } + + TEST("flaxTryInsert does not overwrite") { + flax *a = flaxNew(); + assert(flaxTryInsert(a, 10, "ten", NULL) == 1); + assert(flaxTryInsert(a, 20, "twenty", NULL) == 1); + assert(flaxSize(a) == 2); + + void *old, *val; + assert(flaxTryInsert(a, 10, "new_ten", &old) == 0); + assert(strcmp(old, "ten") == 0); + assert(flaxSize(a) == 2); + assert(flaxFind(a, 10, &val) == 1); + assert(strcmp(val, "ten") == 0); + + flaxFree(a); + } + + TEST("iterator on empty flax") { + flax *a = flaxNew(); + flaxIterator it; + flaxStart(&it, a); + assert(flaxSeek(&it, "^", 0) == 0); + assert(flaxEOF(&it) == 1); + assert(flaxSeek(&it, "$", 0) == 0); + assert(flaxSeek(&it, ">=", 42) == 0); + flaxStop(&it); + + flaxFree(a); + } + + TEST("iterator forward") { + flax *a = flaxNew(); + flaxInsert(a, 10, "ten", NULL); + flaxInsert(a, 30, "thirty", NULL); + flaxInsert(a, 20, "twenty", NULL); + flaxInsert(a, 40, "forty", NULL); + + flaxIterator it; + flaxStart(&it, a); + assert(flaxSeek(&it, "^", 0)); + assert(it.key == 10); + assert(strcmp(it.data, "ten") == 0); + assert(flaxNext(&it)); + assert(it.key == 20); + assert(flaxNext(&it)); + assert(it.key == 30); + assert(flaxNext(&it)); + assert(it.key == 40); + assert(flaxNext(&it) == 0); + assert(flaxEOF(&it) == 1); + flaxStop(&it); + + flaxFree(a); + } + + TEST("iterator backward") { + flax *a = flaxNew(); + flaxInsert(a, 10, "ten", NULL); + flaxInsert(a, 20, "twenty", NULL); + flaxInsert(a, 30, "thirty", NULL); + + flaxIterator it; + flaxStart(&it, a); + assert(flaxSeek(&it, "$", 0)); + assert(it.key == 30); + assert(flaxPrev(&it)); + assert(it.key == 20); + assert(flaxPrev(&it)); + assert(it.key == 10); + assert(flaxPrev(&it) == 0); + flaxStop(&it); + + flaxFree(a); + } + + 
TEST("iterator seek >=") { + flax *a = flaxNew(); + flaxInsert(a, 10, "ten", NULL); + flaxInsert(a, 20, "twenty", NULL); + flaxInsert(a, 30, "thirty", NULL); + flaxInsert(a, 40, "forty", NULL); + + flaxIterator it; + flaxStart(&it, a); + + assert(flaxSeek(&it, ">=", 20)); + assert(it.key == 20); + + assert(flaxSeek(&it, ">=", 25)); + assert(it.key == 30); + + assert(flaxSeek(&it, ">=", 5)); + assert(it.key == 10); + + assert(flaxSeek(&it, ">=", 41) == 0); + assert(flaxEOF(&it) == 1); + flaxStop(&it); + + flaxFree(a); + } + + TEST("iterator on single element") { + flax *a = flaxNew(); + flaxInsert(a, 42, "answer", NULL); + + flaxIterator it; + flaxStart(&it, a); + assert(flaxSeek(&it, "^", 0)); + assert(it.key == 42); + assert(strcmp(it.data, "answer") == 0); + assert(flaxNext(&it) == 0); + + flaxStart(&it, a); + assert(flaxSeek(&it, "$", 0)); + assert(it.key == 42); + assert(flaxPrev(&it) == 0); + flaxStop(&it); + + flaxFree(a); + } + + TEST("flaxFreeWithCbAndContext") { + int ctx_free_count = 0; + flax *a = flaxNew(); + flaxInsert(a, 1, "one", NULL); + flaxInsert(a, 2, "two", NULL); + flaxInsert(a, 3, "three", NULL); + flaxFreeWithCbAndContext(a, flax_test_ctx_free, &ctx_free_count); + if (ctx_free_count != 3) { + ERR("freeWithCbAndContext: expected 3 frees, got %d", + ctx_free_count); + } + } + + if (!err) + printf("ALL TESTS PASSED!\n"); + else + ERR("Sorry, not all tests passed! In fact, %d tests failed.", err); + + return err; +} + +#endif diff --git a/src/flax.h b/src/flax.h new file mode 100644 index 000000000..509b88ba7 --- /dev/null +++ b/src/flax.h @@ -0,0 +1,45 @@ +#ifndef FLAX_H +#define FLAX_H + +#include +#include + +#define FLAX_MIN_CAPACITY 12 + +typedef struct flax { + void *data; + int64_t numele; + int64_t capacity; +} flax; + +typedef struct flaxIterator { + flax *f; + int64_t key; + void *data; + int64_t idx; +} flaxIterator; + +/* Exported API. 
*/ +flax *flaxNew(void); +int flaxInsert(flax *f, int64_t key, void *data, void **old); +int flaxTryInsert(flax *f, int64_t key, void *data, void **old); +int flaxRemove(flax *f, int64_t key, void **old); +int flaxFind(flax *f, int64_t key, void **value); +void flaxFree(flax *f); +void flaxFreeWithCallback(flax *f, void (*free_callback)(void *)); +void flaxFreeWithCbAndContext(flax *f, + void (*free_callback)(void *item, void *ctx), + void *ctx); +void flaxStart(flaxIterator *it, flax *f); +int flaxSeek(flaxIterator *it, const char *op, int64_t key); +int flaxNext(flaxIterator *it); +int flaxPrev(flaxIterator *it); +void flaxStop(flaxIterator *it); +int flaxEOF(flaxIterator *it); +uint64_t flaxSize(flax *f); + +#ifdef REDIS_TEST +int flaxTest(int argc, char *argv[], int flags); +#endif + +#endif diff --git a/src/flax_malloc.h b/src/flax_malloc.h new file mode 100644 index 000000000..c0723c4d2 --- /dev/null +++ b/src/flax_malloc.h @@ -0,0 +1,6 @@ +#ifndef FLAX_ALLOC_H +#define FLAX_ALLOC_H +#include "zmalloc.h" +#define flax_malloc zmalloc +#define flax_free zfree +#endif diff --git a/src/lazyfree.c b/src/lazyfree.c index e3f125ff9..72c822115 100644 --- a/src/lazyfree.c +++ b/src/lazyfree.c @@ -161,7 +161,7 @@ size_t lazyfreeGetFreeEffort(robj *key, robj *obj, int dbid) { * work. */ serverAssert(raxNext(&ri)); cg = ri.data; - effort += raxSize(s->cgroups)*(1+raxSize(cg->pel)); + effort += raxSize(s->cgroups)*(1+cg->pel_count); raxStop(&ri); } return effort; diff --git a/src/rdb.c b/src/rdb.c index bcae57502..836014b3d 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -737,44 +737,42 @@ int rdbLoadObjectType(rio *rdb) { * we serialized the NACKs as well, but when serializing the local consumer * PELs we just add the ID, that will be resolved inside the global PEL to * put a reference to the same structure. 
*/ -ssize_t rdbSaveStreamPEL(rio *rdb, rax *pel, int nacks) { +ssize_t rdbSaveStreamPEL(rio *rdb, rax *pel, uint64_t pel_count, int nacks) { ssize_t n, nwritten = 0; /* Number of entries in the PEL. */ - if ((n = rdbSaveLen(rdb,raxSize(pel))) == -1) return -1; + if ((n = rdbSaveLen(rdb,pel_count)) == -1) return -1; nwritten += n; /* Save each entry. */ - raxIterator ri; - raxStart(&ri,pel); - raxSeek(&ri,"^",NULL,0); - while(raxNext(&ri)) { - /* We store IDs in raw form as 128 big big endian numbers, like - * they are inside the radix tree key. */ - if ((n = rdbWriteRaw(rdb,ri.key,sizeof(streamID))) == -1) { - raxStop(&ri); - return -1; - } - nwritten += n; + pelIterator pi; + pelIterStart(&pi,pel); + if (pelIterSeek(&pi,"^",NULL)) { + do { + /* We store IDs in raw form as 128 big big endian numbers, + * reconstructed from the two-level structure. */ + if ((n = rdbWriteRaw(rdb,pi.rawkey,sizeof(streamID))) == -1) { + pelIterStop(&pi); + return -1; + } + nwritten += n; - if (nacks) { - streamNACK *nack = ri.data; - if ((n = rdbSaveMillisecondTime(rdb,nack->delivery_time)) == -1) { - raxStop(&ri); - return -1; + if (nacks) { + streamNACK *nack = pi.nack; + if ((n = rdbSaveMillisecondTime(rdb,nack->delivery_time)) == -1) { + pelIterStop(&pi); + return -1; + } + nwritten += n; + if ((n = rdbSaveLen(rdb,nack->delivery_count)) == -1) { + pelIterStop(&pi); + return -1; + } + nwritten += n; } - nwritten += n; - if ((n = rdbSaveLen(rdb,nack->delivery_count)) == -1) { - raxStop(&ri); - return -1; - } - nwritten += n; - /* We don't save the consumer name: we'll save the pending IDs - * for each consumer in the consumer PEL, and resolve the consumer - * at loading time. 
*/ - } + } while (pelIterNext(&pi)); } - raxStop(&ri); + pelIterStop(&pi); return nwritten; } @@ -1025,7 +1023,7 @@ size_t rdbSaveStreamConsumers(rio *rdb, streamCG *cg) { * passed with value of 0), at loading time we'll lookup the ID * in the consumer group global PEL and will put a reference in the * consumer local PEL. */ - if ((n = rdbSaveStreamPEL(rdb,consumer->pel,0)) == -1) { + if ((n = rdbSaveStreamPEL(rdb,consumer->pel,consumer->pel_count,0)) == -1) { raxStop(&ri); return -1; } @@ -1339,7 +1337,7 @@ ssize_t rdbSaveObject(rio *rdb, robj *o, robj *key, int dbid) { nwritten += n; /* Save the global PEL. */ - if ((n = rdbSaveStreamPEL(rdb,cg->pel,1)) == -1) { + if ((n = rdbSaveStreamPEL(rdb,cg->pel,cg->pel_count,1)) == -1) { raxStop(&ri); return -1; } @@ -3290,7 +3288,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) decrRefCount(o); return NULL; } - if (!raxTryInsert(cgroup->pel,rawid,sizeof(rawid),nack,NULL)) { + if (!pelTryInsert(cgroup->pel,&nack_id,nack,&cgroup->pel_count)) { rdbReportCorruptRDB("Duplicated global PEL entry " "loading stream consumer group"); streamFreeNACK(s, nack); @@ -3363,20 +3361,21 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) decrRefCount(o); return NULL; } - void *result; - if (!raxFind(cgroup->pel,rawid,sizeof(rawid),&result)) { + streamID cpel_id; + streamDecodeID(rawid, &cpel_id); + streamNACK *nack = pelFind(cgroup->pel, &cpel_id); + if (!nack) { rdbReportCorruptRDB("Consumer entry not found in " "group global PEL"); decrRefCount(o); return NULL; } - streamNACK *nack = result; /* Set the NACK consumer, that was left to NULL when * loading the global PEL. Then set the same shared * NACK structure also in the consumer-specific PEL. 
*/ nack->consumer = consumer; - if (!raxTryInsert(consumer->pel,rawid,sizeof(rawid),nack,NULL)) { + if (!pelTryInsert(consumer->pel,&cpel_id,nack,&consumer->pel_count)) { rdbReportCorruptRDB("Duplicated consumer PEL entry " " loading a stream consumer " "group"); @@ -3389,19 +3388,19 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) /* Verify that each PEL eventually got a consumer assigned to it. */ if (deep_integrity_validation) { - raxIterator ri_cg_pel; - raxStart(&ri_cg_pel,cgroup->pel); - raxSeek(&ri_cg_pel,"^",NULL,0); - while(raxNext(&ri_cg_pel)) { - streamNACK *nack = ri_cg_pel.data; - if (!nack->consumer) { - raxStop(&ri_cg_pel); - rdbReportCorruptRDB("Stream CG PEL entry without consumer"); - decrRefCount(o); - return NULL; - } + pelIterator pi_cg_pel; + pelIterStart(&pi_cg_pel,cgroup->pel); + if (pelIterSeek(&pi_cg_pel,"^",NULL)) { + do { + if (!pi_cg_pel.nack->consumer) { + pelIterStop(&pi_cg_pel); + rdbReportCorruptRDB("Stream CG PEL entry without consumer"); + decrRefCount(o); + return NULL; + } + } while (pelIterNext(&pi_cg_pel)); } - raxStop(&ri_cg_pel); + pelIterStop(&pi_cg_pel); } } diff --git a/src/server.c b/src/server.c index a0245d710..d00853be6 100644 --- a/src/server.c +++ b/src/server.c @@ -7775,6 +7775,7 @@ struct redisTest { {"ebuckets", ebucketsTest}, {"bitmap", bitopsTest}, {"rax", raxTest}, + {"flax", flaxTest}, {"zset", zsetTest}, {"topk", chkTopKTest}, }; diff --git a/src/server.h b/src/server.h index bd1545072..f77abb2ab 100644 --- a/src/server.h +++ b/src/server.h @@ -63,6 +63,7 @@ typedef long long ustime_t; /* microsecond time type. 
*/ #include "quicklist.h" /* Lists are encoded as linked lists of N-elements flat arrays */ #include "rax.h" /* Radix tree */ +#include "flax.h" /* Flat sorted array */ #include "connection.h" /* Connection abstraction */ #include "eventnotifier.h" /* Event notification */ #include "memory_prefetch.h" diff --git a/src/stream.h b/src/stream.h index 028e37435..0e652f451 100644 --- a/src/stream.h +++ b/src/stream.h @@ -5,6 +5,7 @@ #include "listpack.h" #include "dict.h" #include "xxhash.h" +#include "flax.h" /* Stream item ID: a 128 bit number composed of a milliseconds time and * a sequence counter. IDs generated in the same millisecond (or in a past @@ -99,12 +100,11 @@ typedef struct streamCG { group reads. In the real world, the reasoning behind this value is detailed at the top comment of streamEstimateDistanceFromFirstEverEntry(). */ - rax *pel; /* Pending entries list. This is a radix tree that - has every message delivered to consumers (without - the NOACK option) that was yet not acknowledged - as processed. The key of the radix tree is the - ID as a 64 bit big endian number, while the - associated value is a streamNACK structure.*/ + rax *pel; /* Two-level pending entries list. The outer rax is + keyed by the ms part (8-byte big-endian), and each + value is a flax* keyed by the seq part (int64_t) + whose values are streamNACK pointers. */ + uint64_t pel_count; /* Total number of NACK entries across all flax buckets. */ streamNACK *pel_time_head; /* Head of time-ordered doubly-linked list of pending entries (oldest delivery_time). Used for efficient CLAIM operations. O(1) access to oldest entries. */ @@ -123,13 +123,10 @@ typedef struct streamConsumer { sds name; /* Consumer name. This is how the consumer will be identified in the consumer group protocol. Case sensitive. */ - rax *pel; /* Consumer specific pending entries list: all - the pending messages delivered to this - consumer not yet acknowledged. 
Keys are - big endian message IDs, while values are - the same streamNACK structure referenced - in the "pel" of the consumer group structure - itself, so the value is shared. */ + rax *pel; /* Two-level consumer PEL: same structure as + streamCG.pel (ms -> flax(seq -> NACK*)). + NACK pointers are shared with the group PEL. */ + uint64_t pel_count; /* Total NACK count for this consumer. */ } streamConsumer; /* Pending (yet not acknowledged) message in a consumer group. */ @@ -196,6 +193,46 @@ int streamEntryExists(stream *s, streamID *id); listNode *streamLinkCGroupToEntry(stream *s, streamCG *cg, unsigned char *key); +/* Two-level PEL iterator: walks outer rax (ms buckets) and inner flax (seq). */ +typedef struct pelIterator { + raxIterator ri; + flaxIterator fi; + int valid; + streamID id; + streamNACK *nack; + unsigned char rawkey[sizeof(streamID)]; +} pelIterator; + +/* Inline cache embedded in rax metadata to avoid raxFind on every PEL op + * when consecutive messages share the same millisecond. */ +typedef struct pelCache { + uint64_t ms; + flax *f; +} pelCache; + +static inline pelCache *pelGetCache(rax *pel) { + return (pelCache *)pel->metadata; +} + +static inline void pelCacheInvalidate(rax *pel) { + pelGetCache(pel)->f = NULL; +} + +/* Two-level PEL operations. 
*/ +rax *pelNew(size_t *alloc_size); +void pelFree(rax *pel, void (*nack_free)(void *, void *), void *ctx); +void pelFreeShallow(rax *pel); +int pelInsert(rax *pel, streamID *id, streamNACK *nack, uint64_t *count); +int pelTryInsert(rax *pel, streamID *id, streamNACK *nack, uint64_t *count); +streamNACK *pelFind(rax *pel, streamID *id); +streamNACK *pelRemove(rax *pel, streamID *id, uint64_t *count); + +void pelIterStart(pelIterator *pi, rax *pel); +int pelIterSeek(pelIterator *pi, const char *op, streamID *id); +int pelIterNext(pelIterator *pi); +int pelIterReseek(pelIterator *pi, streamID *id); +void pelIterStop(pelIterator *pi); + /* PEL time list management (used by RDB loading) */ void pelListInsertSorted(streamCG *cg, streamNACK *nack); diff --git a/src/t_stream.c b/src/t_stream.c index d263f2672..aabbef454 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -56,11 +56,250 @@ static idmpProducer *idmpGetOrCreateProducer(stream *s, const char *pid, size_t static int createIdempotencyHash(robj **argv, int64_t numfields, XXH128_hash_t *out_hash); static void idmpEvictOldestEntry(stream *s, idmpProducer *producer); +/* Forward declarations for stream ID encoding */ +void streamEncodeID(void *buf, streamID *id); + /* Forward declarations for PEL time list functions */ static void pelListInsertAtTail(streamCG *cg, streamNACK *nack); static void pelListUnlink(streamCG *cg, streamNACK *nack); static void pelListUpdate(streamCG *cg, streamNACK *nack, mstime_t new_delivery_time); +/* ----------------------------------------------------------------------- + * Two-level PEL: rax(ms -> flax(seq -> streamNACK*)) + * ----------------------------------------------------------------------- */ + +static inline void pelEncodeMs(unsigned char *buf, uint64_t ms) { + uint64_t be = htonu64(ms); + memcpy(buf, &be, sizeof(be)); +} + +static inline uint64_t pelDecodeMs(unsigned char *buf) { + uint64_t be; + memcpy(&be, buf, sizeof(be)); + return ntohu64(be); +} + +rax *pelNew(size_t 
*alloc_size) { + rax *pel = raxNewWithMetadata(sizeof(pelCache), alloc_size); + if (pel) pelCacheInvalidate(pel); + return pel; +} + +static void pelFreeFlaxCb(void *val, void *privdata) { + (void)privdata; + flaxFree((flax *)val); +} + +/* Free all flax structures and call nack_free for each NACK. */ +void pelFree(rax *pel, void (*nack_free)(void *, void *), void *ctx) { + if (!pel) return; + if (nack_free) { + raxIterator ri; + raxStart(&ri, pel); + raxSeek(&ri, "^", NULL, 0); + while (raxNext(&ri)) { + flax *f = ri.data; + flaxFreeWithCbAndContext(f, nack_free, ctx); + } + raxStop(&ri); + raxFreeWithCbAndContext(pel, NULL, NULL); + } else { + raxFreeWithCbAndContext(pel, pelFreeFlaxCb, NULL); + } +} + +/* Free flax structures without freeing NACKs (for consumer PEL where NACKs are shared). */ +void pelFreeShallow(rax *pel) { + if (!pel) return; + raxIterator ri; + raxStart(&ri, pel); + raxSeek(&ri, "^", NULL, 0); + while (raxNext(&ri)) { + flaxFree((flax *)ri.data); + } + raxStop(&ri); + raxFree(pel); +} + +/* Resolve the flax bucket for a given ms, using the inline cache to skip + * raxFind when consecutive operations target the same millisecond. + * Returns the flax, or NULL if the ms bucket doesn't exist. + * When create==1, a new bucket is created on miss and *created is set to 1. */ +static flax *pelResolveFlax(rax *pel, uint64_t ms, int create, int *created) { + pelCache *cache = pelGetCache(pel); + if (created) *created = 0; + + if (cache->f && cache->ms == ms) + return cache->f; + + unsigned char msbuf[8]; + pelEncodeMs(msbuf, ms); + void *existing = NULL; + if (raxFind(pel, msbuf, 8, &existing)) { + cache->ms = ms; + cache->f = existing; + return existing; + } + if (!create) return NULL; + + flax *f = flaxNew(); + raxInsert(pel, msbuf, 8, f, NULL); + cache->ms = ms; + cache->f = f; + if (created) *created = 1; + return f; +} + +/* Insert nack into two-level PEL. Returns 1 if new entry, 0 if key existed (old value replaced). 
*/ +int pelInsert(rax *pel, streamID *id, streamNACK *nack, uint64_t *count) { + int created; + flax *f = pelResolveFlax(pel, id->ms, 1, &created); + if (created) { + flaxInsert(f, (int64_t)id->seq, nack, NULL); + if (count) (*count)++; + return 1; + } + void *old = NULL; /* don't rely on flaxInsert() setting *old for a new key */ + flaxInsert(f, (int64_t)id->seq, nack, &old); + if (old == NULL) { + if (count) (*count)++; + return 1; + } + return 0; +} + +/* Insert only if not present. Returns 1 if inserted, 0 if key already exists. */ +int pelTryInsert(rax *pel, streamID *id, streamNACK *nack, uint64_t *count) { + int created; + flax *f = pelResolveFlax(pel, id->ms, 1, &created); + if (created) { + flaxInsert(f, (int64_t)id->seq, nack, NULL); + if (count) (*count)++; + return 1; + } + if (!flaxTryInsert(f, (int64_t)id->seq, nack, NULL)) + return 0; + if (count) (*count)++; + return 1; +} + +/* Find a NACK by streamID. Returns NULL if not found. */ +streamNACK *pelFind(rax *pel, streamID *id) { + flax *f = pelResolveFlax(pel, id->ms, 0, NULL); + if (!f) return NULL; + void *val; + if (!flaxFind(f, (int64_t)id->seq, &val)) return NULL; + return (streamNACK *)val; +} + +/* Remove a NACK by streamID. Returns the removed NACK or NULL. */ +streamNACK *pelRemove(rax *pel, streamID *id, uint64_t *count) { + flax *f = pelResolveFlax(pel, id->ms, 0, NULL); + if (!f) return NULL; + void *old; + if (!flaxRemove(f, (int64_t)id->seq, &old)) return NULL; + streamNACK *nack = (streamNACK *)old; + if (count) (*count)--; + if (f->numele == 0) { + unsigned char msbuf[8]; + pelEncodeMs(msbuf, id->ms); + flaxFree(f); + raxRemove(pel, msbuf, 8, NULL); + pelCacheInvalidate(pel); + } + return nack; +} + +/* --- PEL Iterator --- */ + +/* Refresh iterator fields from current rax+flax positions. 
*/ +static void pelIterRefresh(pelIterator *pi) { + pi->id.ms = pelDecodeMs(pi->ri.key); + pi->id.seq = (uint64_t)pi->fi.key; + pi->nack = (streamNACK *)pi->fi.data; + streamEncodeID(pi->rawkey, &pi->id); + pi->valid = 1; +} + +void pelIterStart(pelIterator *pi, rax *pel) { + raxStart(&pi->ri, pel); + pi->valid = 0; + memset(&pi->fi, 0, sizeof(pi->fi)); + memset(&pi->id, 0, sizeof(pi->id)); + pi->nack = NULL; +} + +int pelIterSeek(pelIterator *pi, const char *op, streamID *id) { + pi->valid = 0; + if (op[0] == '^') { + /* Seek to first entry. */ + raxSeek(&pi->ri, "^", NULL, 0); + if (!raxNext(&pi->ri)) return 0; + flaxStart(&pi->fi, (flax *)pi->ri.data); + if (!flaxSeek(&pi->fi, "^", 0)) return 0; + pelIterRefresh(pi); + return 1; + } else if (op[0] == '$') { + /* Seek to last entry. */ + raxSeek(&pi->ri, "$", NULL, 0); + if (!raxNext(&pi->ri)) return 0; + flaxStart(&pi->fi, (flax *)pi->ri.data); + if (!flaxSeek(&pi->fi, "$", 0)) return 0; + pelIterRefresh(pi); + return 1; + } else if (op[0] == '>' && op[1] == '=') { + unsigned char msbuf[8]; + pelEncodeMs(msbuf, id->ms); + raxSeek(&pi->ri, ">=", msbuf, 8); + if (!raxNext(&pi->ri)) return 0; + uint64_t cur_ms = pelDecodeMs(pi->ri.key); + flaxStart(&pi->fi, (flax *)pi->ri.data); + if (cur_ms == id->ms) { + if (!flaxSeek(&pi->fi, ">=", (int64_t)id->seq)) { + /* No seq >= target in this ms bucket, advance to next ms. */ + if (!raxNext(&pi->ri)) return 0; + flaxStart(&pi->fi, (flax *)pi->ri.data); + if (!flaxSeek(&pi->fi, "^", 0)) return 0; + } + } else { + if (!flaxSeek(&pi->fi, "^", 0)) return 0; + } + pelIterRefresh(pi); + return 1; + } + return 0; +} + +int pelIterNext(pelIterator *pi) { + if (!pi->valid) return 0; + if (flaxNext(&pi->fi)) { + pelIterRefresh(pi); + return 1; + } + /* Current flax exhausted, advance to next ms bucket. 
*/ + if (!raxNext(&pi->ri)) { + pi->valid = 0; + return 0; + } + flaxStart(&pi->fi, (flax *)pi->ri.data); + if (!flaxSeek(&pi->fi, "^", 0)) { + pi->valid = 0; + return 0; + } + pelIterRefresh(pi); + return 1; +} + +/* Re-seek to >= id after a mid-iteration remove (XAUTOCLAIM pattern). */ +int pelIterReseek(pelIterator *pi, streamID *id) { + return pelIterSeek(pi, ">=", id); +} + +void pelIterStop(pelIterator *pi) { + raxStop(&pi->ri); + pi->valid = 0; +} + /* ----------------------------------------------------------------------- * Low level stream encoding: a radix tree of listpacks. * ----------------------------------------------------------------------- */ @@ -282,23 +521,22 @@ robj *streamDup(robj *o) { serverAssert(new_cg != NULL); /* Consumer Group PEL */ - raxIterator ri_cg_pel; - raxStart(&ri_cg_pel,cg->pel); - raxSeek(&ri_cg_pel,"^",NULL,0); - while(raxNext(&ri_cg_pel)){ - streamNACK *nack = ri_cg_pel.data; - streamID nack_id; - streamDecodeID(ri_cg_pel.key, &nack_id); - streamNACK *new_nack = streamCreateNACK(new_s, NULL, &nack_id); - new_nack->delivery_time = nack->delivery_time; - new_nack->delivery_count = nack->delivery_count; - new_nack->cgroup_ref_node = streamLinkCGroupToEntry(new_s, new_cg, ri_cg_pel.key); - raxInsert(new_cg->pel, ri_cg_pel.key, sizeof(streamID), new_nack, NULL); + pelIterator pi_cg; + pelIterStart(&pi_cg, cg->pel); + if (pelIterSeek(&pi_cg, "^", NULL)) { + do { + streamNACK *nack = pi_cg.nack; + streamID nack_id = pi_cg.id; + streamNACK *new_nack = streamCreateNACK(new_s, NULL, &nack_id); + new_nack->delivery_time = nack->delivery_time; + new_nack->delivery_count = nack->delivery_count; + new_nack->cgroup_ref_node = streamLinkCGroupToEntry(new_s, new_cg, pi_cg.rawkey); + pelInsert(new_cg->pel, &nack_id, new_nack, &new_cg->pel_count); - /* Insert in sorted order to preserve ordering */ - pelListInsertSorted(new_cg, new_nack); + pelListInsertSorted(new_cg, new_nack); + } while (pelIterNext(&pi_cg)); } - raxStop(&ri_cg_pel); + 
pelIterStop(&pi_cg); /* Consumers */ raxIterator ri_consumers; @@ -312,27 +550,26 @@ robj *streamDup(robj *o) { new_s->alloc_size += usable; new_consumer->name = sdsdup(consumer->name); new_s->alloc_size += sdsAllocSize(new_consumer->name); - new_consumer->pel = raxNewWithMetadata(0, &new_s->alloc_size); + new_consumer->pel = pelNew(&new_s->alloc_size); + new_consumer->pel_count = 0; raxInsert(new_cg->consumers,(unsigned char *)new_consumer->name, sdslen(new_consumer->name), new_consumer, NULL); new_consumer->seen_time = consumer->seen_time; new_consumer->active_time = consumer->active_time; /* Consumer PEL */ - raxIterator ri_cpel; - raxStart(&ri_cpel, consumer->pel); - raxSeek(&ri_cpel, "^", NULL, 0); - while (raxNext(&ri_cpel)) { - void *result; - int found = raxFind(new_cg->pel,ri_cpel.key,sizeof(streamID),&result); - - serverAssert(found); - - streamNACK *new_nack = result; - new_nack->consumer = new_consumer; - raxInsert(new_consumer->pel,ri_cpel.key,sizeof(streamID),new_nack,NULL); + pelIterator pi_cpel; + pelIterStart(&pi_cpel, consumer->pel); + if (pelIterSeek(&pi_cpel, "^", NULL)) { + do { + streamID cpel_id = pi_cpel.id; + streamNACK *new_nack = pelFind(new_cg->pel, &cpel_id); + serverAssert(new_nack); + new_nack->consumer = new_consumer; + pelInsert(new_consumer->pel, &cpel_id, new_nack, &new_consumer->pel_count); + } while (pelIterNext(&pi_cpel)); } - raxStop(&ri_cpel); + pelIterStop(&pi_cpel); } raxStop(&ri_consumers); } @@ -2060,14 +2297,12 @@ size_t streamReplyWithRange(client *c, stream *s, streamID *start, streamID *end /* Transfer ownership if needed */ if (nack->consumer != consumer) { - unsigned char buf[sizeof(streamID)]; - streamEncodeID(buf, &nack->id); - raxRemove(nack->consumer->pel,buf,sizeof(buf),NULL); + pelRemove(nack->consumer->pel, &nack->id, &nack->consumer->pel_count); nack->consumer = consumer; - raxInsert(consumer->pel,buf,sizeof(buf),nack,NULL); + pelInsert(consumer->pel, &nack->id, nack, &consumer->pel_count); } 
nack->delivery_count++; - pelListUpdate(group, nack, cmd_time_snapshot); /* Moves element from beginning to end of list */ + pelListUpdate(group, nack, cmd_time_snapshot); consumer->active_time = cmd_time_snapshot; @@ -2191,29 +2426,27 @@ size_t streamReplyWithRange(client *c, stream *s, streamID *start, streamID *end * if we find that there is already an entry for this ID. */ streamNACK *nack = streamCreateNACK(s, consumer, &id); int group_inserted = - raxTryInsert(group->pel,buf,sizeof(buf),nack,NULL); + pelTryInsert(group->pel,&id,nack,&group->pel_count); /* Now we can check if the entry was already busy, and * in that case reassign the entry to the new consumer, * or update it if the consumer is the same as before. */ if (group_inserted == 0) { streamFreeNACK(s,nack); - void *result; - int found = raxFind(group->pel,buf,sizeof(buf),&result); - serverAssert(found); - nack = result; + nack = pelFind(group->pel, &id); + serverAssert(nack); /* Only transfer between consumers if they're different */ if (nack->consumer != consumer) { - raxRemove(nack->consumer->pel,buf,sizeof(buf),NULL); + pelRemove(nack->consumer->pel, &id, &nack->consumer->pel_count); nack->consumer = consumer; - raxInsert(consumer->pel,buf,sizeof(buf),nack,NULL); + pelInsert(consumer->pel, &id, nack, &consumer->pel_count); } nack->delivery_count = 1; /* Update delivery time and reposition in time list */ pelListUpdate(group, nack, cmd_time_snapshot); } else { /* New NACK - insert into consumer's PEL and time list */ - raxInsert(consumer->pel,buf,sizeof(buf),nack,NULL); + pelInsert(consumer->pel, &id, nack, &consumer->pel_count); nack->cgroup_ref_node = streamLinkCGroupToEntry(s, group, buf); pelListInsertAtTail(group, nack); } @@ -2263,38 +2496,33 @@ size_t streamReplyWithRange(client *c, stream *s, streamID *start, streamID *end * to the client. However clients only reach this code path when they are * fetching the history of already retrieved messages, which is rare. 
*/ size_t streamReplyWithRangeFromConsumerPEL(client *c, stream *s, streamID *start, streamID *end, size_t count, streamCG *group, streamConsumer *consumer) { - raxIterator ri; - unsigned char startkey[sizeof(streamID)]; - unsigned char endkey[sizeof(streamID)]; - streamEncodeID(startkey,start); - if (end) streamEncodeID(endkey,end); - size_t arraylen = 0; void *arraylen_ptr = addReplyDeferredLen(c); - raxStart(&ri,consumer->pel); - raxSeek(&ri,">=",startkey,sizeof(startkey)); - while(raxNext(&ri) && (!count || arraylen < count)) { - if (end && memcmp(ri.key,endkey,ri.key_len) > 0) break; - streamID thisid; - streamDecodeID(ri.key,&thisid); - if (streamReplyWithRange(c,s,&thisid,&thisid,1,0,-1,NULL,NULL, - STREAM_RWR_RAWENTRIES,NULL,NULL) == 0) - { - /* Note that we may have a not acknowledged entry in the PEL - * about a message that's no longer here because was removed - * by the user by other means. In that case we signal it emitting - * the ID but then a NULL entry for the fields. */ - addReplyArrayLen(c,2); - addReplyStreamID(c,&thisid); - addReplyNullArray(c); - } else { - streamNACK *nack = ri.data; - nack->delivery_count++; - pelListUpdate(group, nack, commandTimeSnapshot()); - } - arraylen++; + pelIterator pi; + pelIterStart(&pi, consumer->pel); + if (pelIterSeek(&pi, ">=", start)) { + do { + if (end && streamCompareID(&pi.id, end) > 0) break; + if (!count || arraylen < count) { + streamID thisid = pi.id; + if (streamReplyWithRange(c,s,&thisid,&thisid,1,0,-1,NULL,NULL, + STREAM_RWR_RAWENTRIES,NULL,NULL) == 0) + { + addReplyArrayLen(c,2); + addReplyStreamID(c,&thisid); + addReplyNullArray(c); + } else { + streamNACK *nack = pi.nack; + nack->delivery_count++; + pelListUpdate(group, nack, commandTimeSnapshot()); + } + arraylen++; + } else { + break; + } + } while (pelIterNext(&pi)); } - raxStop(&ri); + pelIterStop(&pi); setDeferredArrayLen(c,arraylen_ptr,arraylen); return arraylen; } @@ -3128,12 +3356,13 @@ void streamCleanupEntryCGroupRefs(stream *s, 
streamID *id) { streamCG *group = listNodeValue(ln); /* Find the message in this consumer group's PEL */ - serverAssert(raxFind(group->pel, buf, sizeof(buf), (void **)&nack)); + nack = pelFind(group->pel, id); + serverAssert(nack); /* Remove from group and consumer PELs */ pelListUnlink(group, nack); - raxRemove(group->pel, buf, sizeof(buf), NULL); - raxRemove(nack->consumer->pel, buf, sizeof(buf), NULL); + pelRemove(group->pel, id, &group->pel_count); + pelRemove(nack->consumer->pel, id, &nack->consumer->pel_count); /* Since we're removing all references from the cgroups_ref, we can directly * free the NACK without unlinking it from the cgroups_ref. */ streamFreeNACK(s, nack); @@ -3237,8 +3466,7 @@ void streamFreeNACKGeneric(void *na, void *ctx) { * should do some work before. */ void streamFreeConsumer(stream *s, streamConsumer *sc) { size_t usable; - raxFree(sc->pel); /* No value free callback: the PEL entries are shared - between the consumer and the main stream PEL. */ + pelFreeShallow(sc->pel); s->alloc_size -= sdsAllocSize(sc->name); sdsfree(sc->name); zfree_usable(sc, &usable); @@ -3263,7 +3491,8 @@ streamCG *streamCreateCG(stream *s, char *name, size_t namelen, streamID *id, lo size_t usable; streamCG *cg = zmalloc_usable(sizeof(*cg), &usable); s->alloc_size += usable; - cg->pel = raxNewWithMetadata(0, &s->alloc_size); + cg->pel = pelNew(&s->alloc_size); + cg->pel_count = 0; cg->pel_time_head = NULL; cg->pel_time_tail = NULL; cg->consumers = raxNewWithMetadata(0, &s->alloc_size); @@ -3279,7 +3508,7 @@ streamCG *streamCreateCG(stream *s, char *name, size_t namelen, streamID *id, lo static void streamFreeCG(stream *s, streamCG *cg) { /* Free the pel, unlinking each NACK from the time list in the callback */ streamFreeNACKCtx ctx = {s, cg}; - raxFreeWithCbAndContext(cg->pel, streamFreeNACKGeneric, &ctx); + pelFree(cg->pel, streamFreeNACKGeneric, &ctx); /* pel_time_head/tail should now be NULL after unlinking all NACKs */ serverAssert(cg->pel_time_head == NULL 
&& cg->pel_time_tail == NULL); @@ -3293,14 +3522,14 @@ static void streamFreeCG(stream *s, streamCG *cg) { /* Destroy a consumer group and clean up all associated references. */ void streamDestroyCG(stream *s, streamCG *cg) { /* Remove all references from the cgroups_ref. */ - raxIterator it; - raxStart(&it, cg->pel); - raxSeek(&it, "^", NULL, 0); - while (raxNext(&it)) { - streamNACK *nack = it.data; - streamUnlinkEntryFromCGroupRef(s, nack, it.key); + pelIterator pi; + pelIterStart(&pi, cg->pel); + if (pelIterSeek(&pi, "^", NULL)) { + do { + streamUnlinkEntryFromCGroupRef(s, pi.nack, pi.rawkey); + } while (pelIterNext(&pi)); } - raxStop(&it); + pelIterStop(&pi); /* If we're destroying the group with the minimum last_id, the cached * minimum is no longer valid and needs to be recalculated from the @@ -3344,7 +3573,8 @@ streamConsumer *streamCreateConsumer(stream *s, streamCG *cg, sds name, robj *ke s->alloc_size += usable; consumer->name = sdsdup(name); s->alloc_size += sdsAllocSize(consumer->name); - consumer->pel = raxNewWithMetadata(0, &s->alloc_size); + consumer->pel = pelNew(&s->alloc_size); + consumer->pel_count = 0; consumer->active_time = -1; consumer->seen_time = commandTimeSnapshot(); if (dirty) server.dirty++; @@ -3364,22 +3594,20 @@ streamConsumer *streamLookupConsumer(streamCG *cg, sds name) { void streamDelConsumer(stream *s, streamCG *cg, streamConsumer *consumer) { /* Iterate all the consumer pending messages, deleting every corresponding * entry from the global entry. 
*/ - raxIterator ri; - raxStart(&ri,consumer->pel); - raxSeek(&ri,"^",NULL,0); - while(raxNext(&ri)) { - streamNACK *nack = ri.data; - streamUnlinkEntryFromCGroupRef(s, nack, ri.key); + pelIterator pi; + pelIterStart(&pi, consumer->pel); + if (pelIterSeek(&pi, "^", NULL)) { + do { + streamNACK *nack = pi.nack; + streamUnlinkEntryFromCGroupRef(s, nack, pi.rawkey); - streamID id; - streamDecodeID(ri.key, &id); + pelListUnlink(cg, nack); + pelRemove(cg->pel, &pi.id, &cg->pel_count); - pelListUnlink(cg, nack); - raxRemove(cg->pel,ri.key,ri.key_len,NULL); - - streamFreeNACK(s, nack); + streamFreeNACK(s, nack); + } while (pelIterNext(&pi)); } - raxStop(&ri); + pelIterStop(&pi); /* Deallocate the consumer. */ raxRemove(cg->consumers,(unsigned char*)consumer->name, @@ -3575,7 +3803,7 @@ NULL * that were yet associated with such a consumer. */ if (server.memory_tracking_enabled) old_alloc = kvobjAllocSize(o); - pending = raxSize(consumer->pel); + pending = consumer->pel_count; streamDelConsumer(s,cg,consumer); if (server.memory_tracking_enabled) updateSlotAllocSize(c->db,getKeySlot(c->argv[2]->ptr),o,old_alloc,kvobjAllocSize(o)); @@ -3768,12 +3996,11 @@ void xackCommand(client *c) { /* Lookup the ID in the group PEL: it will have a reference to the * NACK structure that will have a reference to the consumer, so that * we are able to remove the entry from both PELs. 
*/ - void *result; - if (raxFind(group->pel,buf,sizeof(buf),&result)) { - streamNACK *nack = result; + streamNACK *nack = pelFind(group->pel, &ids[j-3]); + if (nack) { pelListUnlink(group, nack); - raxRemove(group->pel,buf,sizeof(buf),NULL); - raxRemove(nack->consumer->pel,buf,sizeof(buf),NULL); + pelRemove(group->pel, &ids[j-3], &group->pel_count); + pelRemove(nack->consumer->pel, &ids[j-3], &nack->consumer->pel_count); streamDestroyNACK(kv->ptr, nack, buf); acknowledged++; server.dirty++; @@ -3844,12 +4071,11 @@ void xackdelCommand(client *c) { /* Lookup the ID in the group PEL: it will have a reference to the * NACK structure that will have a reference to the consumer, so that * we are able to remove the entry from both PELs. */ - void *result; - if (raxFind(group->pel,buf,sizeof(buf),&result)) { - streamNACK *nack = result; + streamNACK *nack = pelFind(group->pel, id); + if (nack) { pelListUnlink(group, nack); - raxRemove(group->pel,buf,sizeof(buf),NULL); - raxRemove(nack->consumer->pel,buf,sizeof(buf),NULL); + pelRemove(group->pel, id, &group->pel_count); + pelRemove(nack->consumer->pel, id, &nack->consumer->pel_count); streamDestroyNACK(s, nack, buf); server.dirty++; @@ -3992,39 +4218,38 @@ void xpendingCommand(client *c) { if (justinfo) { addReplyArrayLen(c,4); /* Total number of messages in the PEL. */ - addReplyLongLong(c,raxSize(group->pel)); + addReplyLongLong(c,group->pel_count); /* First and last IDs. */ - if (raxSize(group->pel) == 0) { + if (group->pel_count == 0) { addReplyNull(c); /* Start. */ addReplyNull(c); /* End. */ addReplyNullArray(c); /* Clients. */ } else { /* Start. */ - raxIterator ri; - raxStart(&ri,group->pel); - raxSeek(&ri,"^",NULL,0); - raxNext(&ri); - streamDecodeID(ri.key,&startid); - addReplyStreamID(c,&startid); + pelIterator pi; + pelIterStart(&pi,group->pel); + pelIterSeek(&pi,"^",NULL); + addReplyStreamID(c,&pi.id); /* End. 
*/ - raxSeek(&ri,"$",NULL,0); - raxNext(&ri); - streamDecodeID(ri.key,&endid); - addReplyStreamID(c,&endid); - raxStop(&ri); + pelIterStop(&pi); + pelIterStart(&pi,group->pel); + pelIterSeek(&pi,"$",NULL); + addReplyStreamID(c,&pi.id); + pelIterStop(&pi); /* Consumers with pending messages. */ + raxIterator ri; raxStart(&ri,group->consumers); raxSeek(&ri,"^",NULL,0); void *arraylen_ptr = addReplyDeferredLen(c); size_t arraylen = 0; while(raxNext(&ri)) { streamConsumer *consumer = ri.data; - if (raxSize(consumer->pel) == 0) continue; + if (consumer->pel_count == 0) continue; addReplyArrayLen(c,2); addReplyBulkCBuffer(c,ri.key,ri.key_len); - addReplyBulkLongLong(c,raxSize(consumer->pel)); + addReplyBulkLongLong(c,consumer->pel_count); arraylen++; } setDeferredArrayLen(c,arraylen_ptr,arraylen); @@ -4044,48 +4269,44 @@ void xpendingCommand(client *c) { } rax *pel = consumer ? consumer->pel : group->pel; - unsigned char startkey[sizeof(streamID)]; - unsigned char endkey[sizeof(streamID)]; - raxIterator ri; mstime_t now = commandTimeSnapshot(); - streamEncodeID(startkey,&startid); - streamEncodeID(endkey,&endid); - raxStart(&ri,pel); - raxSeek(&ri,">=",startkey,sizeof(startkey)); + pelIterator pi; + pelIterStart(&pi, pel); void *arraylen_ptr = addReplyDeferredLen(c); size_t arraylen = 0; - while(count && raxNext(&ri) && memcmp(ri.key,endkey,ri.key_len) <= 0) { - streamNACK *nack = ri.data; + if (pelIterSeek(&pi, ">=", &startid)) { + do { + if (streamCompareID(&pi.id, &endid) > 0) break; + streamNACK *nack = pi.nack; - if (minidle) { - mstime_t this_idle = now - nack->delivery_time; - if (this_idle < minidle) continue; - } + if (minidle) { + mstime_t this_idle = now - nack->delivery_time; + if (this_idle < minidle) continue; + } - arraylen++; - count--; - addReplyArrayLen(c,4); + arraylen++; + count--; + addReplyArrayLen(c,4); - /* Entry ID. */ - streamID id; - streamDecodeID(ri.key,&id); - addReplyStreamID(c,&id); + /* Entry ID. 
*/ + addReplyStreamID(c,&pi.id); - /* Consumer name. */ - addReplyBulkCBuffer(c,nack->consumer->name, - sdslen(nack->consumer->name)); + /* Consumer name. */ + addReplyBulkCBuffer(c,nack->consumer->name, + sdslen(nack->consumer->name)); - /* Milliseconds elapsed since last delivery. */ - mstime_t elapsed = now - nack->delivery_time; - if (elapsed < 0) elapsed = 0; - addReplyLongLong(c,elapsed); + /* Milliseconds elapsed since last delivery. */ + mstime_t elapsed = now - nack->delivery_time; + if (elapsed < 0) elapsed = 0; + addReplyLongLong(c,elapsed); - /* Number of deliveries. */ - addReplyLongLong(c,nack->delivery_count); + /* Number of deliveries. */ + addReplyLongLong(c,nack->delivery_count); + } while (count && pelIterNext(&pi)); } - raxStop(&ri); + pelIterStop(&pi); setDeferredArrayLen(c,arraylen_ptr,arraylen); } } @@ -4274,9 +4495,7 @@ void xclaimCommand(client *c) { streamEncodeID(buf,&id); /* Lookup the ID in the group PEL. */ - void *result = NULL; - raxFind(group->pel,buf,sizeof(buf),&result); - streamNACK *nack = result; + streamNACK *nack = pelFind(group->pel, &id); /* Item must exist for us to transfer it to another consumer. */ if (!streamEntryExists(s,&id)) { @@ -4288,8 +4507,8 @@ void xclaimCommand(client *c) { server.dirty++; /* Release the NACK */ pelListUnlink(group, nack); - raxRemove(group->pel,buf,sizeof(buf),NULL); - raxRemove(nack->consumer->pel,buf,sizeof(buf),NULL); + pelRemove(group->pel, &id, &group->pel_count); + pelRemove(nack->consumer->pel, &id, &nack->consumer->pel_count); streamDestroyNACK(s, nack, buf); } continue; @@ -4303,7 +4522,7 @@ void xclaimCommand(client *c) { if (force && nack == NULL) { /* Create the NACK. 
*/ nack = streamCreateNACK(s, NULL, &id); - raxInsert(group->pel,buf,sizeof(buf),nack,NULL); + pelInsert(group->pel, &id, nack, &group->pel_count); pelListInsertAtTail(group, nack); nack->cgroup_ref_node = streamLinkCGroupToEntry(s, group, buf); } @@ -4325,7 +4544,7 @@ void xclaimCommand(client *c) { * Note that nack->consumer is NULL if we created the * NACK above because of the FORCE option. */ if (nack->consumer) { - raxRemove(nack->consumer->pel,buf,sizeof(buf),NULL); + pelRemove(nack->consumer->pel, &id, &nack->consumer->pel_count); } } @@ -4340,7 +4559,7 @@ void xclaimCommand(client *c) { } if (nack->consumer != consumer) { /* Add the entry in the new consumer local PEL. */ - raxInsert(consumer->pel,buf,sizeof(buf),nack,NULL); + pelInsert(consumer->pel, &id, nack, &consumer->pel_count); nack->consumer = consumer; } /* Send the reply for this entry. */ @@ -4465,19 +4684,15 @@ void xautoclaimCommand(client *c) { void *endidptr = addReplyDeferredLen(c); /* reply[0] */ void *arraylenptr = addReplyDeferredLen(c); /* reply[1] */ - unsigned char startkey[sizeof(streamID)]; - streamEncodeID(startkey,&startid); - raxIterator ri; - raxStart(&ri,group->pel); - raxSeek(&ri,">=",startkey,sizeof(startkey)); + pelIterator pi; + pelIterStart(&pi,group->pel); size_t arraylen = 0; mstime_t now = commandTimeSnapshot(); int deleted_id_num = 0; - while (attempts-- && count && raxNext(&ri)) { - streamNACK *nack = ri.data; - - streamID id; - streamDecodeID(ri.key, &id); + int has_entry = pelIterSeek(&pi, ">=", &startid); + while (attempts-- && count && has_entry) { + streamNACK *nack = pi.nack; + streamID id = pi.id; /* Item must exist for us to transfer it to another consumer. 
*/ if (!streamEntryExists(s,&id)) { @@ -4488,28 +4703,29 @@ void xautoclaimCommand(client *c) { server.dirty++; /* Clear this entry from the PEL, it no longer exists */ pelListUnlink(group, nack); - raxRemove(group->pel,ri.key,ri.key_len,NULL); - raxRemove(nack->consumer->pel,ri.key,ri.key_len,NULL); - streamDestroyNACK(s, nack, ri.key); + pelRemove(group->pel, &id, &group->pel_count); + pelRemove(nack->consumer->pel, &id, &nack->consumer->pel_count); + unsigned char rawkey[sizeof(streamID)]; + streamEncodeID(rawkey, &id); + streamDestroyNACK(s, nack, rawkey); /* Remember the ID for later */ deleted_ids[deleted_id_num++] = id; - raxSeek(&ri,">=",ri.key,ri.key_len); + has_entry = pelIterReseek(&pi, &id); count--; /* Count is a limit of the command response size. */ continue; } if (minidle) { mstime_t this_idle = now - nack->delivery_time; - if (this_idle < minidle) + if (this_idle < minidle) { + has_entry = pelIterNext(&pi); continue; + } } if (nack->consumer != consumer) { - /* Remove the entry from the old consumer. - * Note that nack->consumer is NULL if we created the - * NACK above because of the FORCE option. */ if (nack->consumer) { - raxRemove(nack->consumer->pel,ri.key,ri.key_len,NULL); + pelRemove(nack->consumer->pel, &id, &nack->consumer->pel_count); } } @@ -4522,7 +4738,7 @@ void xautoclaimCommand(client *c) { if (nack->consumer != consumer) { /* Add the entry in the new consumer local PEL. */ - raxInsert(consumer->pel,ri.key,ri.key_len,nack,NULL); + pelInsert(consumer->pel, &id, nack, &consumer->pel_count); nack->consumer = consumer; } @@ -4542,21 +4758,22 @@ void xautoclaimCommand(client *c) { streamPropagateXCLAIM(c,c->argv[1],group,c->argv[2],idstr,nack); decrRefCount(idstr); server.dirty++; + has_entry = pelIterNext(&pi); } - /* We need to return the next entry as a cursor for the next XAUTOCLAIM call */ - raxNext(&ri); + /* The cursor for the next XAUTOCLAIM call is whatever pi currently points to. 
+ * After the loop, pi is already on the next unprocessed entry (or invalid). */ if (server.memory_tracking_enabled) updateSlotAllocSize(c->db,getKeySlot(c->argv[1]->ptr),o,old_alloc,kvobjAllocSize(o)); streamID endid; - if (raxEOF(&ri)) { + if (!pi.valid) { endid.ms = endid.seq = 0; } else { - streamDecodeID(ri.key, &endid); + endid = pi.id; } - raxStop(&ri); + pelIterStop(&pi); setDeferredArrayLen(c,arraylenptr,arraylen); setDeferredReplyStreamID(c,endidptr,&endid); @@ -4946,39 +5163,39 @@ void xinfoReplyWithStreamInfo(client *c, kvobj *kv) { /* Group PEL count */ addReplyBulkCString(c,"pel-count"); - addReplyLongLong(c,raxSize(cg->pel)); + addReplyLongLong(c,cg->pel_count); /* Group PEL */ addReplyBulkCString(c,"pending"); long long arraylen_cg_pel = 0; void *arrayptr_cg_pel = addReplyDeferredLen(c); - raxIterator ri_cg_pel; - raxStart(&ri_cg_pel,cg->pel); - raxSeek(&ri_cg_pel,"^",NULL,0); - while(raxNext(&ri_cg_pel) && (!count || arraylen_cg_pel < count)) { - streamNACK *nack = ri_cg_pel.data; - addReplyArrayLen(c,4); + pelIterator pi_cg_pel; + pelIterStart(&pi_cg_pel,cg->pel); + if (pelIterSeek(&pi_cg_pel,"^",NULL)) { + do { + if (count && arraylen_cg_pel >= count) break; + streamNACK *nack = pi_cg_pel.nack; + addReplyArrayLen(c,4); - /* Entry ID. */ - streamID id; - streamDecodeID(ri_cg_pel.key,&id); - addReplyStreamID(c,&id); + /* Entry ID. */ + addReplyStreamID(c,&pi_cg_pel.id); - /* Consumer name. */ - serverAssert(nack->consumer); /* assertion for valgrind (avoid NPD) */ - addReplyBulkCBuffer(c,nack->consumer->name, - sdslen(nack->consumer->name)); + /* Consumer name. */ + serverAssert(nack->consumer); + addReplyBulkCBuffer(c,nack->consumer->name, + sdslen(nack->consumer->name)); - /* Last delivery. */ - addReplyLongLong(c,nack->delivery_time); + /* Last delivery. */ + addReplyLongLong(c,nack->delivery_time); - /* Number of deliveries. */ - addReplyLongLong(c,nack->delivery_count); + /* Number of deliveries. 
*/ + addReplyLongLong(c,nack->delivery_count); - arraylen_cg_pel++; + arraylen_cg_pel++; + } while (pelIterNext(&pi_cg_pel)); } setDeferredArrayLen(c,arrayptr_cg_pel,arraylen_cg_pel); - raxStop(&ri_cg_pel); + pelIterStop(&pi_cg_pel); /* Consumers */ addReplyBulkCString(c,"consumers"); @@ -5004,34 +5221,34 @@ void xinfoReplyWithStreamInfo(client *c, kvobj *kv) { /* Consumer PEL count */ addReplyBulkCString(c,"pel-count"); - addReplyLongLong(c,raxSize(consumer->pel)); + addReplyLongLong(c,consumer->pel_count); /* Consumer PEL */ addReplyBulkCString(c,"pending"); long long arraylen_cpel = 0; void *arrayptr_cpel = addReplyDeferredLen(c); - raxIterator ri_cpel; - raxStart(&ri_cpel,consumer->pel); - raxSeek(&ri_cpel,"^",NULL,0); - while(raxNext(&ri_cpel) && (!count || arraylen_cpel < count)) { - streamNACK *nack = ri_cpel.data; - addReplyArrayLen(c,3); + pelIterator pi_cpel; + pelIterStart(&pi_cpel,consumer->pel); + if (pelIterSeek(&pi_cpel,"^",NULL)) { + do { + if (count && arraylen_cpel >= count) break; + streamNACK *nack = pi_cpel.nack; + addReplyArrayLen(c,3); - /* Entry ID. */ - streamID id; - streamDecodeID(ri_cpel.key,&id); - addReplyStreamID(c,&id); + /* Entry ID. */ + addReplyStreamID(c,&pi_cpel.id); - /* Last delivery. */ - addReplyLongLong(c,nack->delivery_time); + /* Last delivery. */ + addReplyLongLong(c,nack->delivery_time); - /* Number of deliveries. */ - addReplyLongLong(c,nack->delivery_count); + /* Number of deliveries. 
*/ + addReplyLongLong(c,nack->delivery_count); - arraylen_cpel++; + arraylen_cpel++; + } while (pelIterNext(&pi_cpel)); } setDeferredArrayLen(c,arrayptr_cpel,arraylen_cpel); - raxStop(&ri_cpel); + pelIterStop(&pi_cpel); } raxStop(&ri_consumers); } @@ -5102,7 +5319,7 @@ NULL addReplyBulkCString(c,"name"); addReplyBulkCBuffer(c,consumer->name,sdslen(consumer->name)); addReplyBulkCString(c,"pending"); - addReplyLongLong(c,raxSize(consumer->pel)); + addReplyLongLong(c,consumer->pel_count); addReplyBulkCString(c,"idle"); addReplyLongLong(c,idle); addReplyBulkCString(c,"inactive"); @@ -5128,7 +5345,7 @@ NULL addReplyBulkCString(c,"consumers"); addReplyLongLong(c,raxSize(cg->consumers)); addReplyBulkCString(c,"pending"); - addReplyLongLong(c,raxSize(cg->pel)); + addReplyLongLong(c,cg->pel_count); addReplyBulkCString(c,"last-delivered-id"); addReplyStreamID(c,&cg->last_id); addReplyBulkCString(c,"entries-read"); From 11e77c3b37e273b9f956d22196794f6dc064da81 Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Fri, 20 Mar 2026 14:13:15 +0200 Subject: [PATCH 06/48] changed: replace rax PEL with two-level rax(ms)->flax(seq) structure --- src/flax.c | 32 +++++++++++++------------------- src/flax.h | 3 ++- src/t_stream.c | 1 + 3 files changed, 16 insertions(+), 20 deletions(-) diff --git a/src/flax.c b/src/flax.c index e30407ea1..23e313938 100644 --- a/src/flax.c +++ b/src/flax.c @@ -86,17 +86,15 @@ static void flaxIterRefresh(flaxIterator *it) { flax *flaxNew(void) { flax *f = flax_malloc(sizeof(flax)); f->numele = 0; - f->capacity = FLAX_MIN_CAPACITY; - size_t voff = flax_values_offset(FLAX_MIN_CAPACITY); - f->data = flax_malloc(voff + (size_t)FLAX_MIN_CAPACITY * sizeof(void *)); + f->capacity = FLAX_INIT_CAPACITY; + size_t voff = flax_values_offset(FLAX_INIT_CAPACITY); + f->data = flax_malloc(voff + (size_t)FLAX_INIT_CAPACITY * sizeof(void *)); return f; } int flaxInsert(flax *f, int64_t key, void *data, void **old) { - if (f->numele == f->capacity) { - int64_t new_cap = 
f->capacity == 0 ? FLAX_MIN_CAPACITY : f->capacity * 2; - flax_resize(f, new_cap); - } + if (f->numele == f->capacity) + flax_resize(f, f->capacity * 2); int64_t idx; if (flax_search(flax_keys(f), f->numele, key, &idx)) { @@ -123,10 +121,8 @@ int flaxInsert(flax *f, int64_t key, void *data, void **old) { } int flaxTryInsert(flax *f, int64_t key, void *data, void **old) { - if (f->numele == f->capacity) { - int64_t new_cap = f->capacity == 0 ? FLAX_MIN_CAPACITY : f->capacity * 2; - flax_resize(f, new_cap); - } + if (f->numele == f->capacity) + flax_resize(f, f->capacity * 2); int64_t idx; if (flax_search(flax_keys(f), f->numele, key, &idx)) { @@ -173,13 +169,6 @@ int flaxRemove(flax *f, int64_t key, void **old) { } f->numele--; - - if (f->capacity > FLAX_MIN_CAPACITY && - f->numele < f->capacity / 4 && - f->capacity / 2 >= FLAX_MIN_CAPACITY) { - flax_resize(f, f->capacity / 2); - } - return 1; } @@ -229,6 +218,11 @@ uint64_t flaxSize(flax *f) { return (uint64_t)f->numele; } +void flaxShrink(flax *f) { + if (f->numele > 0 && f->numele < f->capacity) + flax_resize(f, f->numele); +} + /* --- Iterator implementation --- */ void flaxStart(flaxIterator *it, flax *f) { @@ -411,7 +405,7 @@ int flaxTest(int argc, char **argv, int flags) { flax *a = flaxNew(); assert(a != NULL); assert(a->numele == 0); - assert(a->capacity == FLAX_MIN_CAPACITY); + assert(a->capacity == FLAX_INIT_CAPACITY); assert(a->data != NULL); flaxFree(a); } diff --git a/src/flax.h b/src/flax.h index 509b88ba7..c93840cb8 100644 --- a/src/flax.h +++ b/src/flax.h @@ -4,7 +4,7 @@ #include #include -#define FLAX_MIN_CAPACITY 12 +#define FLAX_INIT_CAPACITY 16 typedef struct flax { void *data; @@ -37,6 +37,7 @@ int flaxPrev(flaxIterator *it); void flaxStop(flaxIterator *it); int flaxEOF(flaxIterator *it); uint64_t flaxSize(flax *f); +void flaxShrink(flax *f); #ifdef REDIS_TEST int flaxTest(int argc, char *argv[], int flags); diff --git a/src/t_stream.c b/src/t_stream.c index aabbef454..876dfdfac 100644 --- 
a/src/t_stream.c +++ b/src/t_stream.c @@ -142,6 +142,7 @@ static flax *pelResolveFlax(rax *pel, uint64_t ms, int create, int *created) { } if (!create) return NULL; + if (cache->f) flaxShrink(cache->f); flax *f = flaxNew(); raxInsert(pel, msbuf, 8, f, NULL); cache->ms = ms; From d11f944501658d86dd15b1c4368d58873d99fc3e Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Fri, 20 Mar 2026 14:47:34 +0200 Subject: [PATCH 07/48] changed: replace rax PEL with two-level rax(ms)->flax(seq) structure --- src/flax.c | 12 ++++++------ src/flax.h | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/flax.c b/src/flax.c index 23e313938..e806dccd9 100644 --- a/src/flax.c +++ b/src/flax.c @@ -10,7 +10,7 @@ #include FLAX_MALLOC_INCLUDE #endif -static size_t flax_values_offset(int64_t capacity) { +static size_t flax_values_offset(uint32_t capacity) { size_t raw = (size_t)capacity * sizeof(int64_t); size_t align = alignof(void *); return (raw + align - 1) & ~(align - 1); @@ -28,7 +28,7 @@ static void **flax_values(flax *f) { * Returns 1 if key found (out_idx = its index), 0 if not (out_idx = insertion point). * Sequential access through the contiguous keys array is cache-friendly and * avoids branch-misprediction overhead of binary search at typical flax sizes. */ -static int flax_search(const int64_t *keys, int64_t numele, int64_t key, int64_t *out_idx) { +static int flax_search(const int64_t *keys, uint32_t numele, int64_t key, int64_t *out_idx) { if (numele == 0) { *out_idx = 0; return 0; @@ -51,7 +51,7 @@ static int flax_search(const int64_t *keys, int64_t numele, int64_t key, int64_t } /* Linear scan through the middle. 
*/ - for (int64_t i = 1; i < numele - 1; i++) { + for (uint32_t i = 1; i < numele - 1; i++) { if (keys[i] < key) continue; *out_idx = i; return keys[i] == key; @@ -61,7 +61,7 @@ static int flax_search(const int64_t *keys, int64_t numele, int64_t key, int64_t return 0; } -static void flax_resize(flax *f, int64_t new_capacity) { +static void flax_resize(flax *f, uint32_t new_capacity) { size_t new_voff = flax_values_offset(new_capacity); size_t new_alloc = new_voff + (size_t)new_capacity * sizeof(void *); void *new_data = flax_malloc(new_alloc); @@ -194,7 +194,7 @@ void flaxFreeWithCallback(flax *f, void (*free_callback)(void *)) { if (!f) return; if (free_callback && f->data && f->numele > 0) { void **vals = flax_values(f); - for (int64_t i = 0; i < f->numele; i++) + for (uint32_t i = 0; i < f->numele; i++) free_callback(vals[i]); } flax_free(f->data); @@ -207,7 +207,7 @@ void flaxFreeWithCbAndContext(flax *f, if (!f) return; if (free_callback && f->data && f->numele > 0) { void **vals = flax_values(f); - for (int64_t i = 0; i < f->numele; i++) + for (uint32_t i = 0; i < f->numele; i++) free_callback(vals[i], ctx); } flax_free(f->data); diff --git a/src/flax.h b/src/flax.h index c93840cb8..e8f0fe204 100644 --- a/src/flax.h +++ b/src/flax.h @@ -8,8 +8,8 @@ typedef struct flax { void *data; - int64_t numele; - int64_t capacity; + uint32_t numele; + uint32_t capacity; } flax; typedef struct flaxIterator { From 24201a00ce90a31ebac550a9a2814294a178baec Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Tue, 24 Mar 2026 14:26:17 +0200 Subject: [PATCH 08/48] fixed: issues in flax --- src/defrag.c | 2 +- src/flax.c | 172 ++++++++++++++++++++++++++++++++++++++++--------- src/flax.h | 79 +++++++++++++++++++---- src/t_stream.c | 16 ++--- 4 files changed, 217 insertions(+), 52 deletions(-) diff --git a/src/defrag.c b/src/defrag.c index 0e97064a7..d7e7ab676 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -889,7 +889,7 @@ void* defragStreamConsumerPelFlax(raxIterator *ri, void 
*privdata) { void *grp_flax_ptr = NULL; raxFind(ctx->cg->pel, msbuf, 8, &grp_flax_ptr); if (grp_flax_ptr) { - flaxInsert((flax *)grp_flax_ptr, (int64_t)newnack->id.seq, newnack, NULL); + flaxInsert((flax *)grp_flax_ptr, newnack->id.seq, newnack, NULL); } /* Update doubly-linked list pointers. */ diff --git a/src/flax.c b/src/flax.c index e806dccd9..81f6503b8 100644 --- a/src/flax.c +++ b/src/flax.c @@ -1,3 +1,13 @@ +/* Flax -- A flat sorted-array map for uint64 keys. + * + * Copyright (c) 2025-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of (a) the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ + #include "flax.h" #include #include @@ -10,25 +20,71 @@ #include FLAX_MALLOC_INCLUDE #endif +/* ---------------------------------------------------------------------------- + * Flax internals + * + * A flax stores a sorted array of (uint64_t key, void *value) pairs inside a + * single contiguous heap block. The block is split into two sub-arrays: + * + * [ keys: uint64_t * capacity ][ padding ][ values: void* * capacity ] + * + * The padding between keys and values ensures that the values array starts + * at a pointer-aligned offset (see flax_values_offset()). + * + * Only the first `numele` slots in each sub-array hold live data; the + * remainder up to `capacity` is unused reserved space. + * + * Lookup uses linear scan rather than binary search. The expected element + * count is small (e.g. per-consumer stream PEL), so sequential cache-friendly + * access outperforms binary search whose branch-misprediction cost dominates + * at these sizes. Fast-path checks for the head and tail positions further + * accelerate the common case of monotonically increasing keys. + * + * Growth: capacity doubles on insert when full. + * Shrink: flaxShrink() reallocates to fit exactly. 
+ * -------------------------------------------------------------------------- */ + +/* ----------------------------- Internal helpers ---------------------------- */ + +/* Return the byte offset where the values array starts within the data + * block for a given capacity. The offset is aligned to pointer size. */ static size_t flax_values_offset(uint32_t capacity) { - size_t raw = (size_t)capacity * sizeof(int64_t); + size_t raw = (size_t)capacity * sizeof(uint64_t); size_t align = alignof(void *); return (raw + align - 1) & ~(align - 1); } -static int64_t *flax_keys(flax *f) { - return (int64_t *)f->data; +/* Return a pointer to the keys array inside the flax data block. */ +static uint64_t *flax_keys(flax *f) { + return (uint64_t *)f->data; } +/* Return a pointer to the values array inside the flax data block. */ static void **flax_values(flax *f) { return (void **)((char *)f->data + flax_values_offset(f->capacity)); } -/* Linear scan with fast paths for first/last. - * Returns 1 if key found (out_idx = its index), 0 if not (out_idx = insertion point). - * Sequential access through the contiguous keys array is cache-friendly and - * avoids branch-misprediction overhead of binary search at typical flax sizes. */ -static int flax_search(const int64_t *keys, uint32_t numele, int64_t key, int64_t *out_idx) { +/* Search for 'key' in the sorted 'keys' array of length 'numele'. + * + * Returns 1 if the key is found, storing its position in *out_idx. + * Returns 0 if the key is absent, storing the insertion point in *out_idx + * (i.e. the index where the key would be placed to keep the array sorted). + * + * The search is a linear scan rather than binary search. This is deliberate: + * flax instances are expected to be small (tens of elements -- e.g. a stream + * consumer's PEL). At these sizes, a sequential walk through a contiguous + * uint64_t array is faster than binary search because: + * 1. The entire keys array fits in one or two cache lines. + * 2. 
Linear access has no branch-misprediction overhead -- the branch + * predictor can reliably learn the "not found yet, keep going" pattern. + * 3. Binary search touches O(log N) *random* cache lines and suffers a + * misprediction at every comparison. + * + * Two fast paths are checked first: + * - Tail: key > keys[numele-1] is the append case, overwhelmingly common + * when keys are monotonically increasing sequence numbers. + * - Head: key <= keys[0] catches prepend and exact-match-at-zero. */ +static int flax_search(const uint64_t *keys, uint32_t numele, uint64_t key, int64_t *out_idx) { if (numele == 0) { *out_idx = 0; return 0; @@ -61,13 +117,21 @@ static int flax_search(const int64_t *keys, uint32_t numele, int64_t key, int64_ return 0; } +/* Resize the internal storage to 'new_capacity'. + * + * A fresh data block is allocated and the live keys and values are copied + * into it. Because the keys and values sub-arrays sit at different offsets + * that depend on the capacity (the values offset is re-aligned for the new + * capacity), we must perform two independent memcpy operations -- one for + * the keys at the start of the block and one for the values at the new + * aligned offset. The old data block is freed afterwards. */ static void flax_resize(flax *f, uint32_t new_capacity) { size_t new_voff = flax_values_offset(new_capacity); size_t new_alloc = new_voff + (size_t)new_capacity * sizeof(void *); void *new_data = flax_malloc(new_alloc); if (f->data && f->numele > 0) { - memcpy(new_data, f->data, (size_t)f->numele * sizeof(int64_t)); + memcpy(new_data, f->data, (size_t)f->numele * sizeof(uint64_t)); memcpy((char *)new_data + new_voff, (char *)f->data + flax_values_offset(f->capacity), (size_t)f->numele * sizeof(void *)); @@ -78,11 +142,19 @@ static void flax_resize(flax *f, uint32_t new_capacity) { f->capacity = new_capacity; } +/* Update the iterator key and data fields from the underlying flax + * at the current index position. 
*/ static void flaxIterRefresh(flaxIterator *it) { it->key = flax_keys(it->f)[it->idx]; it->data = flax_values(it->f)[it->idx]; } +/* ---------------------------------------------------------------------------- + * Core API + * -------------------------------------------------------------------------- */ + +/* Allocate a new flax and return its pointer. The allocation results are not + * checked, so this relies on flax_malloc() never returning NULL. */ flax *flaxNew(void) { flax *f = flax_malloc(sizeof(flax)); f->numele = 0; @@ -92,7 +164,11 @@ flax *flaxNew(void) { return f; } -int flaxInsert(flax *f, int64_t key, void *data, void **old) { +/* Overwriting insert. Insert the element with the specified 'key', setting + * as associated data the pointer 'data'. If the element already exists, the + * associated data is updated and 1 is returned. If 'old' is not NULL the + * previous value is stored at that address. Returns 1 on success. */ +int flaxInsert(flax *f, uint64_t key, void *data, void **old) { if (f->numele == f->capacity) flax_resize(f, f->capacity * 2); @@ -104,12 +180,12 @@ int flaxInsert(flax *f, int64_t key, void *data, void **old) { return 1; } - int64_t *keys = flax_keys(f); + uint64_t *keys = flax_keys(f); void **vals = flax_values(f); int64_t tail = f->numele - idx; if (tail > 0) { - memmove(&keys[idx + 1], &keys[idx], (size_t)tail * sizeof(int64_t)); + memmove(&keys[idx + 1], &keys[idx], (size_t)tail * sizeof(uint64_t)); memmove(&vals[idx + 1], &vals[idx], (size_t)tail * sizeof(void *)); } @@ -120,7 +196,10 @@ int flaxInsert(flax *f, int64_t key, void *data, void **old) { return 1; } -int flaxTryInsert(flax *f, int64_t key, void *data, void **old) { +/* Non overwriting insert function: if an element with the same key exists, + * the value is not updated and the function returns 0. If 'old' is not NULL + * the existing value is stored at that address. Returns 1 on success. 
*/ +int flaxTryInsert(flax *f, uint64_t key, void *data, void **old) { if (f->numele == f->capacity) flax_resize(f, f->capacity * 2); @@ -130,12 +209,12 @@ int flaxTryInsert(flax *f, int64_t key, void *data, void **old) { return 0; } - int64_t *keys = flax_keys(f); + uint64_t *keys = flax_keys(f); void **vals = flax_values(f); int64_t tail = f->numele - idx; if (tail > 0) { - memmove(&keys[idx + 1], &keys[idx], (size_t)tail * sizeof(int64_t)); + memmove(&keys[idx + 1], &keys[idx], (size_t)tail * sizeof(uint64_t)); memmove(&vals[idx + 1], &vals[idx], (size_t)tail * sizeof(void *)); } @@ -146,7 +225,10 @@ int flaxTryInsert(flax *f, int64_t key, void *data, void **old) { return 1; } -int flaxRemove(flax *f, int64_t key, void **old) { +/* Remove the specified item. Returns 1 if the item was found and + * deleted, 0 otherwise. If 'old' is not NULL the removed value is + * stored at that address. */ +int flaxRemove(flax *f, uint64_t key, void **old) { if (!f || f->numele == 0) { if (old) *old = NULL; return 0; @@ -158,13 +240,13 @@ int flaxRemove(flax *f, int64_t key, void **old) { return 0; } - int64_t *keys = flax_keys(f); + uint64_t *keys = flax_keys(f); void **vals = flax_values(f); if (old) *old = vals[idx]; int64_t tail = f->numele - idx - 1; if (tail > 0) { - memmove(&keys[idx], &keys[idx + 1], (size_t)tail * sizeof(int64_t)); + memmove(&keys[idx], &keys[idx + 1], (size_t)tail * sizeof(uint64_t)); memmove(&vals[idx], &vals[idx + 1], (size_t)tail * sizeof(void *)); } @@ -172,7 +254,10 @@ int flaxRemove(flax *f, int64_t key, void **old) { return 1; } -int flaxFind(flax *f, int64_t key, void **value) { +/* Find a key in the flax, returning 1 if found, 0 otherwise. If the key + * is found and 'value' is not NULL, the associated data pointer is stored + * at that address. 
*/ +int flaxFind(flax *f, uint64_t key, void **value) { if (!f || f->numele == 0) { if (value) *value = NULL; return 0; @@ -186,10 +271,13 @@ int flaxFind(flax *f, int64_t key, void **value) { return 0; } +/* Free a whole flax. */ void flaxFree(flax *f) { flaxFreeWithCallback(f, NULL); } +/* Free a whole flax, calling the specified callback in order to + * free the auxiliary data. */ void flaxFreeWithCallback(flax *f, void (*free_callback)(void *)) { if (!f) return; if (free_callback && f->data && f->numele > 0) { @@ -201,6 +289,8 @@ void flaxFreeWithCallback(flax *f, void (*free_callback)(void *)) { flax_free(f); } +/* Free a whole flax, calling the specified callback with a context + * argument in order to free the auxiliary data. */ void flaxFreeWithCbAndContext(flax *f, void (*free_callback)(void *item, void *ctx), void *ctx) { @@ -214,17 +304,23 @@ void flaxFreeWithCbAndContext(flax *f, flax_free(f); } +/* Return the number of elements inside the flax. */ uint64_t flaxSize(flax *f) { return (uint64_t)f->numele; } +/* Shrink the internal storage to fit the current number of elements, + * releasing unused memory. */ void flaxShrink(flax *f) { if (f->numele > 0 && f->numele < f->capacity) flax_resize(f, f->numele); } -/* --- Iterator implementation --- */ +/* ------------------------------- Iterator --------------------------------- */ +/* Initialize a flax iterator. This call should be performed a single time + * to initialize the iterator, and must be followed by a flaxSeek() call, + * otherwise the flaxPrev()/flaxNext() functions will just return EOF. */ void flaxStart(flaxIterator *it, flax *f) { it->f = f; it->idx = -1; @@ -232,7 +328,12 @@ void flaxStart(flaxIterator *it, flax *f) { it->data = NULL; } -int flaxSeek(flaxIterator *it, const char *op, int64_t key) { +/* Seek an iterator at the specified element. 
The 'op' argument selects the + * seek mode: "^" for the first element, "$" for the last, ">=" for greater + * or equal, ">" for strictly greater, "<=" for less or equal, "<" for + * strictly less, and "=" for exact match. Return 0 if no matching element + * was found, otherwise 1 is returned. */ +int flaxSeek(flaxIterator *it, const char *op, uint64_t key) { if (!it->f || it->f->numele == 0) { it->idx = -1; it->key = 0; @@ -332,6 +433,8 @@ int flaxSeek(flaxIterator *it, const char *op, int64_t key) { return 0; } +/* Go to the next element in the scope of the iterator 'it'. + * If EOF is reached, 0 is returned, otherwise 1 is returned. */ int flaxNext(flaxIterator *it) { if (it->idx < 0) return 0; it->idx++; @@ -345,6 +448,8 @@ int flaxNext(flaxIterator *it) { return 1; } +/* Go to the previous element in the scope of the iterator 'it'. + * If EOF is reached, 0 is returned, otherwise 1 is returned. */ int flaxPrev(flaxIterator *it) { if (it->idx < 0) return 0; it->idx--; @@ -357,14 +462,21 @@ int flaxPrev(flaxIterator *it) { return 1; } +/* Free the iterator. */ void flaxStop(flaxIterator *it) { (void)it; } +/* Return if the iterator is in an EOF state. This happens when flaxSeek() + * failed to seek an appropriate element, so that flaxNext() or flaxPrev() + * will return zero, or when an EOF condition was reached while iterating + * with flaxNext() and flaxPrev(). 
*/ int flaxEOF(flaxIterator *it) { return it->idx < 0 || it->idx >= it->f->numele; } +/* ----------------------------- Unit tests --------------------------------- */ + #ifdef REDIS_TEST #include "testhelp.h" #include @@ -612,23 +724,23 @@ int flaxTest(int argc, char **argv, int flags) { } } - TEST("negative keys") { + TEST("large keys near uint64 boundaries") { flax *a = flaxNew(); - flaxInsert(a, -100, "neg100", NULL); flaxInsert(a, 0, "zero", NULL); - flaxInsert(a, 100, "pos100", NULL); - flaxInsert(a, -50, "neg50", NULL); + flaxInsert(a, UINT64_MAX, "max", NULL); + flaxInsert(a, UINT64_MAX - 1, "max-1", NULL); + flaxInsert(a, 100, "hundred", NULL); void *val; assert(flaxSize(a) == 4); - assert(flaxFind(a, -100, &val) == 1); - assert(strcmp(val, "neg100") == 0); - assert(flaxFind(a, -50, &val) == 1); - assert(strcmp(val, "neg50") == 0); assert(flaxFind(a, 0, &val) == 1); assert(strcmp(val, "zero") == 0); assert(flaxFind(a, 100, &val) == 1); - assert(strcmp(val, "pos100") == 0); + assert(strcmp(val, "hundred") == 0); + assert(flaxFind(a, UINT64_MAX - 1, &val) == 1); + assert(strcmp(val, "max-1") == 0); + assert(flaxFind(a, UINT64_MAX, &val) == 1); + assert(strcmp(val, "max") == 0); flaxFree(a); } diff --git a/src/flax.h b/src/flax.h index e8f0fe204..3d430939b 100644 --- a/src/flax.h +++ b/src/flax.h @@ -1,3 +1,13 @@ +/* Flax -- A flat sorted-array map for uint64 keys. + * + * Copyright (c) 2025-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of (a) the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ + #ifndef FLAX_H #define FLAX_H @@ -6,36 +16,79 @@ #define FLAX_INIT_CAPACITY 16 +/* A flax is a sorted associative container that maps uint64_t keys to void* + * values. 
Both arrays live in a single heap allocation ("data block") laid + * out as follows: + * + * flax struct data block (single allocation) + * +------------+ +------------------------------------+ + * | *data -----------> | keys[0..cap-1] (uint64_t) | + * | numele | +-- aligned to sizeof(void*) --------+ + * | capacity | | values[0..cap-1] (void*) | + * +------------+ +------------------------------------+ + * + * Keys are maintained in ascending sorted order. Only the first 'numele' + * slots in each array contain live data; the remainder up to 'capacity' + * is reserved space for future inserts. + * + * Lookup, insert and delete use a linear scan over the keys array rather + * than binary search. This is intentional: the expected element count is + * small (e.g. per-consumer stream PEL), so the sequential, cache-friendly + * access pattern outperforms binary search whose branch-misprediction cost + * dominates at these sizes. The scan includes fast-path checks for the + * head and tail positions to accelerate the common case of monotonically + * increasing keys (e.g. stream entry IDs). + * + * Growth strategy: the data block doubles in capacity when full (on insert) + * and can be shrunk to fit with flaxShrink(). + */ typedef struct flax { - void *data; - uint32_t numele; - uint32_t capacity; + void *data; /* Packed storage: keys array followed by values array. */ + uint32_t numele; /* Number of elements currently stored. */ + uint32_t capacity; /* Current allocated capacity. */ } flax; +/* Flax iterator state. The typical lifecycle is: + * + * flaxIterator it; + * flaxStart(&it, myflax); -- initialize + * flaxSeek(&it, ">=", somekey); -- position + * while (flaxNext(&it)) { ... } -- iterate (or flaxPrev) + * flaxStop(&it); -- cleanup + * + * After flaxStart() the iterator is in EOF state until a successful + * flaxSeek(). The iterator does not allocate heap memory, so flaxStop() + * is a no-op included for API symmetry with rax. 
*/ typedef struct flaxIterator { - flax *f; - int64_t key; - void *data; - int64_t idx; + flax *f; /* Flax we are iterating. */ + uint64_t key; /* The current key. */ + void *data; /* Data associated to this key. */ + int64_t idx; /* Current index into the flax arrays, -1 if EOF. */ } flaxIterator; -/* Exported API. */ +/* --- Creation and destruction --- */ flax *flaxNew(void); -int flaxInsert(flax *f, int64_t key, void *data, void **old); -int flaxTryInsert(flax *f, int64_t key, void *data, void **old); -int flaxRemove(flax *f, int64_t key, void **old); -int flaxFind(flax *f, int64_t key, void **value); void flaxFree(flax *f); void flaxFreeWithCallback(flax *f, void (*free_callback)(void *)); void flaxFreeWithCbAndContext(flax *f, void (*free_callback)(void *item, void *ctx), void *ctx); + +/* --- Lookup and mutation --- */ +int flaxInsert(flax *f, uint64_t key, void *data, void **old); +int flaxTryInsert(flax *f, uint64_t key, void *data, void **old); +int flaxRemove(flax *f, uint64_t key, void **old); +int flaxFind(flax *f, uint64_t key, void **value); + +/* --- Iterator --- */ void flaxStart(flaxIterator *it, flax *f); -int flaxSeek(flaxIterator *it, const char *op, int64_t key); +int flaxSeek(flaxIterator *it, const char *op, uint64_t key); int flaxNext(flaxIterator *it); int flaxPrev(flaxIterator *it); void flaxStop(flaxIterator *it); int flaxEOF(flaxIterator *it); + +/* --- Introspection --- */ uint64_t flaxSize(flax *f); void flaxShrink(flax *f); diff --git a/src/t_stream.c b/src/t_stream.c index 876dfdfac..a5c19394d 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -156,12 +156,12 @@ int pelInsert(rax *pel, streamID *id, streamNACK *nack, uint64_t *count) { int created; flax *f = pelResolveFlax(pel, id->ms, 1, &created); if (created) { - flaxInsert(f, (int64_t)id->seq, nack, NULL); + flaxInsert(f, id->seq, nack, NULL); if (count) (*count)++; return 1; } void *old; - flaxInsert(f, (int64_t)id->seq, nack, &old); + flaxInsert(f, id->seq, nack, &old); if 
(old == NULL) { if (count) (*count)++; return 1; @@ -174,11 +174,11 @@ int pelTryInsert(rax *pel, streamID *id, streamNACK *nack, uint64_t *count) { int created; flax *f = pelResolveFlax(pel, id->ms, 1, &created); if (created) { - flaxInsert(f, (int64_t)id->seq, nack, NULL); + flaxInsert(f, id->seq, nack, NULL); if (count) (*count)++; return 1; } - if (!flaxTryInsert(f, (int64_t)id->seq, nack, NULL)) + if (!flaxTryInsert(f, id->seq, nack, NULL)) return 0; if (count) (*count)++; return 1; @@ -189,7 +189,7 @@ streamNACK *pelFind(rax *pel, streamID *id) { flax *f = pelResolveFlax(pel, id->ms, 0, NULL); if (!f) return NULL; void *val; - if (!flaxFind(f, (int64_t)id->seq, &val)) return NULL; + if (!flaxFind(f, id->seq, &val)) return NULL; return (streamNACK *)val; } @@ -198,7 +198,7 @@ streamNACK *pelRemove(rax *pel, streamID *id, uint64_t *count) { flax *f = pelResolveFlax(pel, id->ms, 0, NULL); if (!f) return NULL; void *old; - if (!flaxRemove(f, (int64_t)id->seq, &old)) return NULL; + if (!flaxRemove(f, id->seq, &old)) return NULL; streamNACK *nack = (streamNACK *)old; if (count) (*count)--; if (f->numele == 0) { @@ -216,7 +216,7 @@ streamNACK *pelRemove(rax *pel, streamID *id, uint64_t *count) { /* Refresh iterator fields from current rax+flax positions. */ static void pelIterRefresh(pelIterator *pi) { pi->id.ms = pelDecodeMs(pi->ri.key); - pi->id.seq = (uint64_t)pi->fi.key; + pi->id.seq = pi->fi.key; pi->nack = (streamNACK *)pi->fi.data; streamEncodeID(pi->rawkey, &pi->id); pi->valid = 1; @@ -256,7 +256,7 @@ int pelIterSeek(pelIterator *pi, const char *op, streamID *id) { uint64_t cur_ms = pelDecodeMs(pi->ri.key); flaxStart(&pi->fi, (flax *)pi->ri.data); if (cur_ms == id->ms) { - if (!flaxSeek(&pi->fi, ">=", (int64_t)id->seq)) { + if (!flaxSeek(&pi->fi, ">=", id->seq)) { /* No seq >= target in this ms bucket, advance to next ms. 
*/ if (!raxNext(&pi->ri)) return 0; flaxStart(&pi->fi, (flax *)pi->ri.data); From 843f3db43721240bd2379e5eb3a1e981983cfc5a Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Thu, 26 Mar 2026 10:44:52 +0200 Subject: [PATCH 09/48] fixed: issue with two level pel --- src/defrag.c | 13 +- src/flax.c | 40 ++++ src/flax.h | 2 + src/rdb.c | 1 - src/stream.h | 31 +-- src/t_stream.c | 335 +++++++++++++++++++++++------ tests/unit/type/stream-cgroups.tcl | 185 ++++++++++++++++ 7 files changed, 524 insertions(+), 83 deletions(-) diff --git a/src/defrag.c b/src/defrag.c index d7e7ab676..0bb097f7b 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -882,15 +882,10 @@ void* defragStreamConsumerPelFlax(raxIterator *ri, void *privdata) { /* Update in the consumer PEL flax. */ flaxInsert(f, fi.key, newnack, NULL); - /* Update in the group PEL flax. */ - unsigned char msbuf[8]; - uint64_t ms_be = htonu64(newnack->id.ms); - memcpy(msbuf, &ms_be, 8); - void *grp_flax_ptr = NULL; - raxFind(ctx->cg->pel, msbuf, 8, &grp_flax_ptr); - if (grp_flax_ptr) { - flaxInsert((flax *)grp_flax_ptr, newnack->id.seq, newnack, NULL); - } + /* Update in the group PEL flax. pelInsert is an + * overwriting insert; the key already exists so count + * is unaffected and no overflow can trigger. */ + pelInsert(ctx->cg->pel, &newnack->id, newnack, NULL); /* Update doubly-linked list pointers. */ if (newnack->pel_prev) { diff --git a/src/flax.c b/src/flax.c index 81f6503b8..04714781a 100644 --- a/src/flax.c +++ b/src/flax.c @@ -9,6 +9,7 @@ */ #include "flax.h" +#include #include #include #include @@ -309,6 +310,13 @@ uint64_t flaxSize(flax *f) { return (uint64_t)f->numele; } +/* Return the last (largest) key in the flax. + * Precondition: f->numele > 0. Calling on an empty flax is a bug. */ +uint64_t flaxLastKey(flax *f) { + assert(f->numele > 0); + return flax_keys(f)[f->numele - 1]; +} + /* Shrink the internal storage to fit the current number of elements, * releasing unused memory. 
*/ void flaxShrink(flax *f) { @@ -316,6 +324,38 @@ void flaxShrink(flax *f) { flax_resize(f, f->numele); } +/* Split 'f' at the midpoint: entries [0, mid) stay in 'f', entries [mid, numele) + * move to a newly allocated flax which is returned. *split_key is set to the + * first key of the upper half. The caller should flaxShrink(f) afterwards if + * reclaiming the excess capacity of the lower half is desired. */ +flax *flaxSplit(flax *f, uint64_t *split_key) { + uint32_t mid = f->numele / 2; + uint32_t upper_count = f->numele - mid; + + uint64_t *src_keys = flax_keys(f); + void **src_vals = flax_values(f); + + *split_key = src_keys[mid]; + + /* Right-size the new flax to hold the upper half. */ + uint32_t cap = FLAX_INIT_CAPACITY; + while (cap < upper_count) cap *= 2; + + flax *upper = flax_malloc(sizeof(flax)); + upper->numele = upper_count; + upper->capacity = cap; + size_t voff = flax_values_offset(cap); + upper->data = flax_malloc(voff + (size_t)cap * sizeof(void *)); + + memcpy(flax_keys(upper), &src_keys[mid], + (size_t)upper_count * sizeof(uint64_t)); + memcpy(flax_values(upper), &src_vals[mid], + (size_t)upper_count * sizeof(void *)); + + f->numele = mid; + return upper; +} + /* ------------------------------- Iterator --------------------------------- */ /* Initialize a flax iterator. This call should be performed a single time diff --git a/src/flax.h b/src/flax.h index 3d430939b..0e85ad195 100644 --- a/src/flax.h +++ b/src/flax.h @@ -90,7 +90,9 @@ int flaxEOF(flaxIterator *it); /* --- Introspection --- */ uint64_t flaxSize(flax *f); +uint64_t flaxLastKey(flax *f); /* Precondition: f->numele > 0. 
*/ void flaxShrink(flax *f); +flax *flaxSplit(flax *f, uint64_t *split_key); #ifdef REDIS_TEST int flaxTest(int argc, char *argv[], int flags); diff --git a/src/rdb.c b/src/rdb.c index 836014b3d..3f0f46682 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -3379,7 +3379,6 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) rdbReportCorruptRDB("Duplicated consumer PEL entry " " loading a stream consumer " "group"); - streamFreeNACK(s, nack); decrRefCount(o); return NULL; } diff --git a/src/stream.h b/src/stream.h index 0e652f451..3905419e8 100644 --- a/src/stream.h +++ b/src/stream.h @@ -101,9 +101,11 @@ typedef struct streamCG { this value is detailed at the top comment of streamEstimateDistanceFromFirstEverEntry(). */ rax *pel; /* Two-level pending entries list. The outer rax is - keyed by the ms part (8-byte big-endian), and each - value is a flax* keyed by the seq part (int64_t) - whose values are streamNACK pointers. */ + keyed by a 16-byte compound key (ms, seq_base) in + big-endian. Each value is a flax* covering the + half-open seq range [seq_base, next_seq_base). + Buckets split when they exceed FLAX_BUCKET_MAX + entries. Flax values are streamNACK pointers. */ uint64_t pel_count; /* Total number of NACK entries across all flax buckets. */ streamNACK *pel_time_head; /* Head of time-ordered doubly-linked list of pending entries (oldest delivery_time). Used for efficient @@ -124,8 +126,9 @@ typedef struct streamConsumer { will be identified in the consumer group protocol. Case sensitive. */ rax *pel; /* Two-level consumer PEL: same structure as - streamCG.pel (ms -> flax(seq -> NACK*)). - NACK pointers are shared with the group PEL. */ + streamCG.pel — 16-byte (ms, seq_base) rax + key, flax(seq -> NACK*) values with bucket + splitting. NACKs are shared with group PEL. */ uint64_t pel_count; /* Total NACK count for this consumer. 
*/ } streamConsumer; @@ -203,19 +206,23 @@ typedef struct pelIterator { unsigned char rawkey[sizeof(streamID)]; } pelIterator; -/* Inline cache embedded in rax metadata to avoid raxFind on every PEL op - * when consecutive messages share the same millisecond. */ +/* Inline cache embedded in rax metadata to speed up sequential PEL ops + * when consecutive operations target the same bucket. + * seq_upper is the seq_base of the next bucket for the same ms, or + * UINT64_MAX when the cached bucket is the last (or only) one. */ typedef struct pelCache { uint64_t ms; + uint64_t seq_base; + uint64_t seq_upper; flax *f; } pelCache; -static inline pelCache *pelGetCache(rax *pel) { - return (pelCache *)pel->metadata; -} - static inline void pelCacheInvalidate(rax *pel) { - pelGetCache(pel)->f = NULL; + pelCache *cache = (pelCache *)pel->metadata; + cache->f = NULL; + cache->ms = 0; + cache->seq_base = 0; + cache->seq_upper = 0; } /* Two-level PEL operations. */ diff --git a/src/t_stream.c b/src/t_stream.c index a5c19394d..7fef212e0 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -68,14 +68,28 @@ static void pelListUpdate(streamCG *cg, streamNACK *nack, mstime_t new_delivery_ * Two-level PEL: rax(ms -> flax(seq -> streamNACK*)) * ----------------------------------------------------------------------- */ -static inline void pelEncodeMs(unsigned char *buf, uint64_t ms) { - uint64_t be = htonu64(ms); - memcpy(buf, &be, sizeof(be)); +#define FLAX_BUCKET_MAX 256 + +/* Encode a 16-byte compound rax key: 8 bytes big-endian ms + 8 bytes big-endian seq_base. */ +static inline void pelEncodeKey(unsigned char *buf, uint64_t ms, uint64_t seq_base) { + uint64_t be; + be = htonu64(ms); + memcpy(buf, &be, 8); + be = htonu64(seq_base); + memcpy(buf + 8, &be, 8); } +/* Decode just the ms portion (first 8 bytes) of a compound key. 
*/ static inline uint64_t pelDecodeMs(unsigned char *buf) { uint64_t be; - memcpy(&be, buf, sizeof(be)); + memcpy(&be, buf, 8); + return ntohu64(be); +} + +/* Decode the seq_base portion (last 8 bytes) of a compound key. */ +static inline uint64_t pelDecodeSeqBase(unsigned char *buf) { + uint64_t be; + memcpy(&be, buf + 8, 8); return ntohu64(be); } @@ -121,63 +135,247 @@ void pelFreeShallow(rax *pel) { raxFree(pel); } -/* Resolve the flax bucket for a given ms, using the inline cache to skip - * raxFind when consecutive operations target the same millisecond. - * Returns the flax, or NULL if the ms bucket doesn't exist. - * When create==1, a new bucket is created on miss and *created is set to 1. */ -static flax *pelResolveFlax(rax *pel, uint64_t ms, int create, int *created) { - pelCache *cache = pelGetCache(pel); - if (created) *created = 0; - - if (cache->f && cache->ms == ms) - return cache->f; - - unsigned char msbuf[8]; - pelEncodeMs(msbuf, ms); - void *existing = NULL; - if (raxFind(pel, msbuf, 8, &existing)) { - cache->ms = ms; - cache->f = existing; - return existing; - } - if (!create) return NULL; - +/* pelResolveFlax -- Resolve the flax bucket for a given (ms, seq) pair. + * + * The two-level PEL maps stream IDs (ms, seq) to streamNACK pointers using a + * rax of flax buckets. Each rax key is a 16-byte compound of (ms, seq_base), + * and each rax value is a flax that stores NACKs whose sequence numbers fall + * within that bucket's range. 
+ * + * Two-level PEL layout + * ==================== + * + * rax (keyed by 16 bytes: [ms | seq_base]) + * +--------------------+ +--------------------+ +--------------------+ + * | key: (42, 0) | | key: (42, 300) | | key: (99, 0) | + * | val: flax_A -------+--+ | val: flax_B -------+--+ | val: flax_C -------+--+ + * +--------------------+ | +--------------------+ | +--------------------+ | + * v v v + * +----------+ +----------+ +----------+ + * | seq: 5 | | seq: 300 | | seq: 0 | + * | seq: 11 | | seq: 301 | | seq: 42 | + * | seq: 12 | | seq: 500 | +----------+ + * | ... | +----------+ flax_C + * +----------+ flax_B + * flax_A + * + * Each flax bucket owns entries for a half-open seq range within one ms: + * [seq_base, seq_base_of_next_bucket) or [seq_base, UINT64_MAX) if last + * + * An inline cache (pelCache) stored in rax->metadata remembers the last + * resolved bucket so consecutive lookups to the same ms can skip raxSeek: + * + * pelCache { ms, seq_base, seq_upper, *f } + * ^ ^ + * | +-- seq_base of the NEXT bucket (exclusive upper bound) + * +-- the millisecond timestamp of the cached bucket + * + * + * Lookup flow (create == 0) + * ========================= + * + * pelResolveFlax(pel, ms, seq, create=0) + * | + * +---------+---------+ + * | cache valid for | + * | this (ms, seq)? | + * +----+---------+----+ + * YES | | NO + * +-------------+ +----------------+ + * | | + * v raxSeek("<=",(ms,seq)) + * return cache->f | + * +---------+---------+ + * | found key with | + * | same ms? | + * +----+---------+----+ + * YES | | NO + * +-------------+ +-----+ + * | | + * update cache, return NULL + * return cache->f + * + * + * Insert flow (create == 1), after cache is resolved + * =================================================== + * + * Once the cache points to the right bucket for this ms, the bucket + * may need to grow. 
Three cases: + * + * CASE 1 -- No bucket exists for this ms (rax miss) + * -------------------------------------------------- + * Create a brand-new flax, insert into rax with key (ms, 0). + * + * rax rax + * (empty for ms=42) ==> +------------------+ + * | key: (42, 0) | + * | val: new flax ---+---> (empty flax) + * +------------------+ + * + * CASE 2 -- Overflow: bucket is full, seq extends past the tail + * -------------------------------------------------------------- + * The new seq is larger than every key in the full bucket. + * Create a new bucket starting at seq, appended after the current one. + * + * Before: After: + * flax_A [0..255] FULL flax_A [0..255] (shrunk) + * seq = 300 flax_NEW [300..) = new flax + * + * rax key (42,0) -> flax_A rax key (42,0) -> flax_A (shrunk) + * rax key (42,300) -> flax_NEW + * + * CASE 3 -- Mid-bucket split: bucket is full, seq falls inside its range + * ----------------------------------------------------------------------- + * Split the full flax at its midpoint. The lower half stays in the + * original bucket; the upper half goes into a new rax entry. + * The cache is updated to whichever half now owns seq. + * + * Before: After: + * flax_A [0..255] FULL flax_A [0..split_key) (shrunk) + * seq = 130 flax_UPPER [split_key..) + * + * rax key (42,0) -> flax_A rax key (42,0) -> flax_A + * rax key (42,split_key) -> flax_UPPER + * + * if seq >= split_key, cache points to flax_UPPER; + * otherwise cache retains flax_A with seq_upper = split_key. + * + * Returns the flax bucket, or NULL if no matching bucket exists (create==0). + * When create==1, a new bucket is created on miss (or on overflow) and + * *created is set to 1. */ +/* Create a fresh flax bucket at rax key (ms, 0), update the cache, and + * resolve seq_upper by peeking at the next rax key for the same ms. 
*/ +static flax *pelCreateBucket(rax *pel, pelCache *cache, uint64_t ms) { if (cache->f) flaxShrink(cache->f); flax *f = flaxNew(); - raxInsert(pel, msbuf, 8, f, NULL); + unsigned char keybuf[16]; + pelEncodeKey(keybuf, ms, 0); + raxInsert(pel, keybuf, 16, f, NULL); cache->ms = ms; + cache->seq_base = 0; cache->f = f; - if (created) *created = 1; + + raxIterator ri; + raxStart(&ri, pel); + raxSeek(&ri, ">", keybuf, 16); + if (raxNext(&ri) && pelDecodeMs(ri.key) == ms) + cache->seq_upper = pelDecodeSeqBase(ri.key); + else + cache->seq_upper = UINT64_MAX; + raxStop(&ri); return f; } +static flax *pelResolveFlax(rax *pel, uint64_t ms, uint64_t seq, int create, int *created) { + pelCache *cache = (pelCache *)pel->metadata; + if (created) *created = 0; + + /* Cache hit: same ms and seq falls within [seq_base, seq_upper). */ + if (!(cache->f && cache->ms == ms && + seq >= cache->seq_base && seq < cache->seq_upper)) { + /* Cache miss — fall back to rax lookup. */ + unsigned char keybuf[16]; + pelEncodeKey(keybuf, ms, seq); + + raxIterator ri; + raxStart(&ri, pel); + /* Seek to the largest key <= (ms, seq). */ + raxSeek(&ri, "<=", keybuf, 16); + if (!raxNext(&ri)) { + raxStop(&ri); + if (!create) return NULL; + if (created) *created = 1; + return pelCreateBucket(pel, cache, ms); + } + + uint64_t found_ms = pelDecodeMs(ri.key); + if (found_ms == ms) { + /* Bucket belongs to the same ms. Peek at the next rax entry + * to determine the upper bound for this bucket. */ + cache->ms = ms; + cache->seq_base = pelDecodeSeqBase(ri.key); + cache->f = (flax *)ri.data; + if (raxNext(&ri) && pelDecodeMs(ri.key) == ms) + cache->seq_upper = pelDecodeSeqBase(ri.key); + else + cache->seq_upper = UINT64_MAX; + raxStop(&ri); + } else { + raxStop(&ri); + if (!create) return NULL; + if (created) *created = 1; + return pelCreateBucket(pel, cache, ms); + } + } + + /* cache->f holds an existing bucket for this ms. + * Overflow: bucket is full and new seq extends past the tail. 
*/ + if (create && cache->f->numele >= FLAX_BUCKET_MAX && seq > flaxLastKey(cache->f)) { + flaxShrink(cache->f); + flax *f = flaxNew(); + unsigned char keybuf[16]; + pelEncodeKey(keybuf, ms, seq); + raxInsert(pel, keybuf, 16, f, NULL); + /* seq_upper is intentionally NOT updated: the new bucket sits + * between the old bucket and whatever followed it, inheriting + * the old upper bound. */ + cache->seq_base = seq; + cache->f = f; + if (created) *created = 1; + return f; + } + + /* Mid-bucket split: bucket is full and seq falls inside its range. */ + if (create && cache->f->numele >= FLAX_BUCKET_MAX) { + uint64_t split_key; + flax *upper = flaxSplit(cache->f, &split_key); + flaxShrink(cache->f); + + unsigned char keybuf[16]; + pelEncodeKey(keybuf, ms, split_key); + raxInsert(pel, keybuf, 16, upper, NULL); + + if (seq >= split_key) { + cache->seq_base = split_key; + cache->f = upper; + } else { + cache->seq_upper = split_key; + } + return cache->f; + } + return cache->f; +} + /* Insert nack into two-level PEL. Returns 1 if new entry, 0 if key existed (old value replaced). */ int pelInsert(rax *pel, streamID *id, streamNACK *nack, uint64_t *count) { - int created; - flax *f = pelResolveFlax(pel, id->ms, 1, &created); - if (created) { - flaxInsert(f, id->seq, nack, NULL); - if (count) (*count)++; - return 1; + flax *f = pelResolveFlax(pel, id->ms, id->seq, 0, NULL); + if (f) { + void *old; + flaxInsert(f, id->seq, nack, &old); + if (old == NULL) { + if (count) (*count)++; + return 1; + } + return 0; } - void *old; - flaxInsert(f, id->seq, nack, &old); - if (old == NULL) { - if (count) (*count)++; - return 1; - } - return 0; + /* No bucket yet — create and insert. The second pelResolveFlax call is + * a cache hit since the first call just primed the rax iterator path. */ + f = pelResolveFlax(pel, id->ms, id->seq, 1, NULL); + flaxInsert(f, id->seq, nack, NULL); + if (count) (*count)++; + return 1; } /* Insert only if not present. 
Returns 1 if inserted, 0 if key already exists. */ int pelTryInsert(rax *pel, streamID *id, streamNACK *nack, uint64_t *count) { int created; - flax *f = pelResolveFlax(pel, id->ms, 1, &created); + flax *f = pelResolveFlax(pel, id->ms, id->seq, 1, &created); if (created) { flaxInsert(f, id->seq, nack, NULL); if (count) (*count)++; return 1; } + if (!flaxTryInsert(f, id->seq, nack, NULL)) return 0; if (count) (*count)++; @@ -186,7 +384,7 @@ int pelTryInsert(rax *pel, streamID *id, streamNACK *nack, uint64_t *count) { /* Find a NACK by streamID. Returns NULL if not found. */ streamNACK *pelFind(rax *pel, streamID *id) { - flax *f = pelResolveFlax(pel, id->ms, 0, NULL); + flax *f = pelResolveFlax(pel, id->ms, id->seq, 0, NULL); if (!f) return NULL; void *val; if (!flaxFind(f, id->seq, &val)) return NULL; @@ -195,17 +393,18 @@ streamNACK *pelFind(rax *pel, streamID *id) { /* Remove a NACK by streamID. Returns the removed NACK or NULL. */ streamNACK *pelRemove(rax *pel, streamID *id, uint64_t *count) { - flax *f = pelResolveFlax(pel, id->ms, 0, NULL); + flax *f = pelResolveFlax(pel, id->ms, id->seq, 0, NULL); if (!f) return NULL; void *old; if (!flaxRemove(f, id->seq, &old)) return NULL; streamNACK *nack = (streamNACK *)old; if (count) (*count)--; if (f->numele == 0) { - unsigned char msbuf[8]; - pelEncodeMs(msbuf, id->ms); + pelCache *cache = (pelCache *)pel->metadata; + unsigned char keybuf[16]; + pelEncodeKey(keybuf, id->ms, cache->seq_base); flaxFree(f); - raxRemove(pel, msbuf, 8, NULL); + raxRemove(pel, keybuf, 16, NULL); pelCacheInvalidate(pel); } return nack; @@ -249,20 +448,36 @@ int pelIterSeek(pelIterator *pi, const char *op, streamID *id) { pelIterRefresh(pi); return 1; } else if (op[0] == '>' && op[1] == '=') { - unsigned char msbuf[8]; - pelEncodeMs(msbuf, id->ms); - raxSeek(&pi->ri, ">=", msbuf, 8); - if (!raxNext(&pi->ri)) return 0; + unsigned char keybuf[16]; + pelEncodeKey(keybuf, id->ms, id->seq); + /* Seek to the largest rax key <= (ms, seq), which is 
the bucket + * that could contain the target seq. */ + raxSeek(&pi->ri, "<=", keybuf, 16); + if (!raxNext(&pi->ri)) { + /* All buckets are > target, start from the very first bucket. */ + raxSeek(&pi->ri, "^", NULL, 0); + if (!raxNext(&pi->ri)) return 0; + flaxStart(&pi->fi, (flax *)pi->ri.data); + if (!flaxSeek(&pi->fi, "^", 0)) return 0; + pelIterRefresh(pi); + return 1; + } uint64_t cur_ms = pelDecodeMs(pi->ri.key); flaxStart(&pi->fi, (flax *)pi->ri.data); if (cur_ms == id->ms) { if (!flaxSeek(&pi->fi, ">=", id->seq)) { - /* No seq >= target in this ms bucket, advance to next ms. */ + /* No seq >= target in this bucket, advance to next rax entry. */ if (!raxNext(&pi->ri)) return 0; flaxStart(&pi->fi, (flax *)pi->ri.data); if (!flaxSeek(&pi->fi, "^", 0)) return 0; } + } else if (cur_ms < id->ms) { + /* Landed in an earlier-ms bucket, advance to next. */ + if (!raxNext(&pi->ri)) return 0; + flaxStart(&pi->fi, (flax *)pi->ri.data); + if (!flaxSeek(&pi->fi, "^", 0)) return 0; } else { + /* cur_ms > id->ms: start from the head of this bucket. */ if (!flaxSeek(&pi->fi, "^", 0)) return 0; } pelIterRefresh(pi); @@ -277,16 +492,14 @@ int pelIterNext(pelIterator *pi) { pelIterRefresh(pi); return 1; } - /* Current flax exhausted, advance to next ms bucket. */ - if (!raxNext(&pi->ri)) { - pi->valid = 0; - return 0; - } - flaxStart(&pi->fi, (flax *)pi->ri.data); - if (!flaxSeek(&pi->fi, "^", 0)) { - pi->valid = 0; - return 0; - } + /* Current flax exhausted, advance to next non-empty bucket. 
*/ + do { + if (!raxNext(&pi->ri)) { + pi->valid = 0; + return 0; + } + flaxStart(&pi->fi, (flax *)pi->ri.data); + } while (!flaxSeek(&pi->fi, "^", 0)); pelIterRefresh(pi); return 1; } diff --git a/tests/unit/type/stream-cgroups.tcl b/tests/unit/type/stream-cgroups.tcl index 4990275e2..26021b91d 100644 --- a/tests/unit/type/stream-cgroups.tcl +++ b/tests/unit/type/stream-cgroups.tcl @@ -3290,4 +3290,189 @@ start_server { assert_error "*ERR The CLAIM option is only supported*" {r XREAD COUNT 2 CLAIM 10 STREAMS mystream 0-0} } } + + test "Two-level PEL bucket overflow with fixed-ms entries" { + r DEL mystream + + # Add 600 entries sharing the same ms, forcing multiple flax buckets. + for {set i 0} {$i < 600} {incr i} { + r XADD mystream 1000-$i field value$i + } + + r XGROUP CREATE mystream grp 0 + r XREADGROUP GROUP grp consumer1 COUNT 600 STREAMS mystream > + + # Verify all 600 entries are pending. + set pending [r XPENDING mystream grp - + 10] + set summary [r XPENDING mystream grp] + assert_equal [lindex $summary 0] 600 + + # XACK entries from different bucket ranges: early, middle, late. + r XACK mystream grp 1000-0 1000-1 1000-2 + r XACK mystream grp 1000-300 1000-301 + r XACK mystream grp 1000-598 1000-599 + + set summary [r XPENDING mystream grp] + assert_equal [lindex $summary 0] 593 + + # XCLAIM an entry from a different bucket range into a new consumer. + after 10 + r XCLAIM mystream grp consumer2 0 1000-400 + + set pending_c2 [r XPENDING mystream grp - + 10 consumer2] + assert_equal [llength $pending_c2] 1 + assert_equal [lindex $pending_c2 0 0] "1000-400" + + # Verify consumer1's count decreased. + set pending_c1_summary [r XPENDING mystream grp - + 700 consumer1] + assert_equal [llength $pending_c1_summary] 592 + } + + test "Two-level PEL non-sequential insertion via XCLAIM into full buckets" { + r DEL mystream + + # Add 600 entries: consumer1 reads 0-299, consumer2 reads 300-599. + # consumer2's PEL will have a full bucket (256 entries) at seq_base=0. 
+ for {set i 0} {$i < 600} {incr i} { + r XADD mystream 2000-$i field value$i + } + + r XGROUP CREATE mystream grp 0 + r XREADGROUP GROUP grp consumer1 COUNT 300 STREAMS mystream > + r XREADGROUP GROUP grp consumer2 COUNT 300 STREAMS mystream > + + set summary [r XPENDING mystream grp] + assert_equal [lindex $summary 0] 600 + + # XCLAIM entries 0-99 from consumer1 to consumer2. This inserts + # seqs 0-99 into consumer2's PEL which already holds seqs 300-599 + # in a full bucket, exercising non-sequential insertion into a + # bucket that has grown beyond FLAX_BUCKET_MAX. + for {set i 0} {$i < 100} {incr i} { + r XCLAIM mystream grp consumer2 0 2000-$i + } + + # consumer2 should now have 300 + 100 = 400 entries. + set pending_c2 [r XPENDING mystream grp - + 600 consumer2] + assert_equal [llength $pending_c2] 400 + + # Verify entries are returned in correct sorted order (numeric comparison). + set prev_ms -1 + set prev_seq -1 + foreach entry $pending_c2 { + set id [lindex $entry 0] + set parts [split $id -] + set ms [lindex $parts 0] + set seq [lindex $parts 1] + if {$ms == $prev_ms} { + assert {$seq > $prev_seq} + } elseif {$prev_ms >= 0} { + assert {$ms > $prev_ms} + } + set prev_ms $ms + set prev_seq $seq + } + + # Verify first and last entries. + assert_equal [lindex $pending_c2 0 0] "2000-0" + assert_equal [lindex $pending_c2 end 0] "2000-599" + } + + test "Two-level PEL empty-bucket removal after ACK" { + r DEL mystream + + # Create 3 buckets worth of entries (768 entries, 256 each) under + # one ms, then ACK all entries in the middle bucket. + for {set i 0} {$i < 768} {incr i} { + r XADD mystream 3000-$i field value$i + } + + r XGROUP CREATE mystream grp 0 + r XREADGROUP GROUP grp consumer1 COUNT 768 STREAMS mystream > + + set summary [r XPENDING mystream grp] + assert_equal [lindex $summary 0] 768 + + # ACK the middle 256 entries (seq 256-511). 
+ for {set i 256} {$i < 512} {incr i} { + r XACK mystream grp 3000-$i + } + + set summary [r XPENDING mystream grp] + assert_equal [lindex $summary 0] 512 + + # Verify XPENDING returns only entries from first and last buckets + # in correct order, with no entries from the removed middle bucket. + set all_pending [r XPENDING mystream grp - + 600] + assert_equal [llength $all_pending] 512 + + foreach entry $all_pending { + set id [lindex $entry 0] + set seq [lindex [split $id -] 1] + assert {$seq < 256 || $seq >= 512} + } + + # Verify boundary entries still exist. + set first [lindex $all_pending 0 0] + set last [lindex $all_pending end 0] + assert_equal $first "3000-0" + assert_equal $last "3000-767" + } + + test "Two-level PEL cross-bucket iteration with XPENDING range" { + r DEL mystream + + # Use two different ms values, each with enough entries to span + # multiple buckets, to test iteration across ms+bucket boundaries. + for {set i 0} {$i < 400} {incr i} { + r XADD mystream 4000-$i field value$i + } + for {set i 0} {$i < 400} {incr i} { + r XADD mystream 5000-$i field value$i + } + + r XGROUP CREATE mystream grp 0 + r XREADGROUP GROUP grp consumer1 COUNT 800 STREAMS mystream > + + set summary [r XPENDING mystream grp] + assert_equal [lindex $summary 0] 800 + + # Fetch all 800 entries and verify strict ordering. + set all_pending [r XPENDING mystream grp - + 900] + assert_equal [llength $all_pending] 800 + + set prev_ms 0 + set prev_seq -1 + foreach entry $all_pending { + set id [lindex $entry 0] + set parts [split $id -] + set ms [lindex $parts 0] + set seq [lindex $parts 1] + if {$ms == $prev_ms} { + assert {$seq > $prev_seq} + } else { + assert {$ms > $prev_ms} + } + set prev_ms $ms + set prev_seq $seq + } + + # Paginated fetch: use COUNT to walk 100 entries at a time and + # verify continuity across pages. 
+ set start "-" + set collected {} + while {1} { + set page [r XPENDING mystream grp $start + 100] + if {[llength $page] == 0} break + foreach entry $page { + lappend collected [lindex $entry 0] + } + # Advance start past the last entry returned. + set last_id [lindex $page end 0] + set parts [split $last_id -] + set next_seq [expr {[lindex $parts 1] + 1}] + set start "[lindex $parts 0]-$next_seq" + } + assert_equal [llength $collected] 800 + } } From 6d704376266ed4b5a013603cca8ae93fbdc50504 Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Thu, 26 Mar 2026 15:48:21 +0200 Subject: [PATCH 10/48] fixed: issue with two level pel --- src/flax.c | 19 +++++++++++++------ src/flax.h | 1 + src/t_stream.c | 24 +++++++++++++----------- 3 files changed, 27 insertions(+), 17 deletions(-) diff --git a/src/flax.c b/src/flax.c index 04714781a..b0cc4f470 100644 --- a/src/flax.c +++ b/src/flax.c @@ -154,15 +154,22 @@ static void flaxIterRefresh(flaxIterator *it) { * Core API * -------------------------------------------------------------------------- */ +/* Allocate a new flax with the given initial capacity and return its pointer. + * On out of memory the function returns NULL. */ +flax *flaxNewWithCapacity(uint32_t capacity) { + if (capacity < FLAX_INIT_CAPACITY) capacity = FLAX_INIT_CAPACITY; + flax *f = flax_malloc(sizeof(flax)); + f->numele = 0; + f->capacity = capacity; + size_t voff = flax_values_offset(capacity); + f->data = flax_malloc(voff + (size_t)capacity * sizeof(void *)); + return f; +} + /* Allocate a new flax and return its pointer. On out of memory the function * returns NULL. */ flax *flaxNew(void) { - flax *f = flax_malloc(sizeof(flax)); - f->numele = 0; - f->capacity = FLAX_INIT_CAPACITY; - size_t voff = flax_values_offset(FLAX_INIT_CAPACITY); - f->data = flax_malloc(voff + (size_t)FLAX_INIT_CAPACITY * sizeof(void *)); - return f; + return flaxNewWithCapacity(FLAX_INIT_CAPACITY); } /* Overwriting insert. 
Insert the element with the specified 'key', setting diff --git a/src/flax.h b/src/flax.h index 0e85ad195..98c7bea68 100644 --- a/src/flax.h +++ b/src/flax.h @@ -67,6 +67,7 @@ typedef struct flaxIterator { } flaxIterator; /* --- Creation and destruction --- */ +flax *flaxNewWithCapacity(uint32_t capacity); flax *flaxNew(void); void flaxFree(flax *f); void flaxFreeWithCallback(flax *f, void (*free_callback)(void *)); diff --git a/src/t_stream.c b/src/t_stream.c index 7fef212e0..b36707b6b 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -247,7 +247,7 @@ void pelFreeShallow(rax *pel) { * resolve seq_upper by peeking at the next rax key for the same ms. */ static flax *pelCreateBucket(rax *pel, pelCache *cache, uint64_t ms) { if (cache->f) flaxShrink(cache->f); - flax *f = flaxNew(); + flax *f = flaxNewWithCapacity(FLAX_BUCKET_MAX); unsigned char keybuf[16]; pelEncodeKey(keybuf, ms, 0); raxInsert(pel, keybuf, 16, f, NULL); @@ -312,7 +312,7 @@ static flax *pelResolveFlax(rax *pel, uint64_t ms, uint64_t seq, int create, int * Overflow: bucket is full and new seq extends past the tail. */ if (create && cache->f->numele >= FLAX_BUCKET_MAX && seq > flaxLastKey(cache->f)) { flaxShrink(cache->f); - flax *f = flaxNew(); + flax *f = flaxNewWithCapacity(FLAX_BUCKET_MAX); unsigned char keybuf[16]; pelEncodeKey(keybuf, ms, seq); raxInsert(pel, keybuf, 16, f, NULL); @@ -3519,18 +3519,20 @@ void streamUpdateCGroupLastId(stream *s, streamCG *cg, streamID *id) { /* Link a consumer group to a stream entry in the cgroups_ref index. * Returns a pointer to the list node, so that it can be used for future deletion. 
*/ listNode *streamLinkCGroupToEntry(stream *s, streamCG *cg, unsigned char *key) { - list *cglist; - if (!s->cgroups_ref) s->cgroups_ref = raxNewWithMetadata(0, &s->alloc_size); - - /* Try to find the list for this stream ID, create it if it doesn't exist */ - if (!raxFind(s->cgroups_ref, key, sizeof(streamID), (void**)&cglist)) { - cglist = listCreate(); - serverAssert(raxInsert(s->cgroups_ref, key, sizeof(streamID), cglist, NULL)); + + /* Speculatively create a list and try to insert it. If the key already + * exists, raxTryInsert returns 0 and sets 'existing' to the current value, + * so we discard the unused list. This avoids a double rax traversal + * (find + insert) on the common miss path. */ + list *cglist = listCreate(); + list *existing; + if (!raxTryInsert(s->cgroups_ref, key, sizeof(streamID), cglist, (void**)&existing)) { + listRelease(cglist); + cglist = existing; } - - /* Add the consumer group to the list and return the list node */ + listAddNodeTail(cglist, cg); return listLast(cglist); } From 7b99673eea471950849d7b203c8675323def7261 Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Fri, 27 Mar 2026 19:20:23 +0200 Subject: [PATCH 11/48] changed: simplify pel two level --- src/flax.c | 128 +++++++------------ src/flax.h | 20 ++- src/stream.h | 27 ++-- src/t_stream.c | 331 ++++++++++--------------------------------------- 4 files changed, 131 insertions(+), 375 deletions(-) diff --git a/src/flax.c b/src/flax.c index b0cc4f470..54e2f8135 100644 --- a/src/flax.c +++ b/src/flax.c @@ -1,4 +1,4 @@ -/* Flax -- A flat sorted-array map for uint64 keys. +/* Flax -- A flat sorted-array map for uint8_t keys. * * Copyright (c) 2025-Present, Redis Ltd. * All rights reserved. 
@@ -24,10 +24,10 @@ /* ---------------------------------------------------------------------------- * Flax internals * - * A flax stores a sorted array of (uint64_t key, void *value) pairs inside a + * A flax stores a sorted array of (uint8_t key, void *value) pairs inside a * single contiguous heap block. The block is split into two sub-arrays: * - * [ keys: uint64_t * capacity ][ padding ][ values: void* * capacity ] + * [ keys: uint8_t * capacity ][ padding ][ values: void* * capacity ] * * The padding between keys and values ensures that the values array starts * at a pointer-aligned offset (see flax_values_offset()). @@ -50,14 +50,14 @@ /* Return the byte offset where the values array starts within the data * block for a given capacity. The offset is aligned to pointer size. */ static size_t flax_values_offset(uint32_t capacity) { - size_t raw = (size_t)capacity * sizeof(uint64_t); + size_t raw = (size_t)capacity * sizeof(uint8_t); size_t align = alignof(void *); return (raw + align - 1) & ~(align - 1); } /* Return a pointer to the keys array inside the flax data block. */ -static uint64_t *flax_keys(flax *f) { - return (uint64_t *)f->data; +static uint8_t *flax_keys(flax *f) { + return (uint8_t *)f->data; } /* Return a pointer to the values array inside the flax data block. */ @@ -74,7 +74,7 @@ static void **flax_values(flax *f) { * The search is a linear scan rather than binary search. This is deliberate: * flax instances are expected to be small (tens of elements -- e.g. a stream * consumer's PEL). At these sizes, a sequential walk through a contiguous - * uint64_t array is faster than binary search because: + * uint8_t array is faster than binary search because: * 1. The entire keys array fits in one or two cache lines. * 2. Linear access has no branch-misprediction overhead -- the branch * predictor can reliably learn the "not found yet, keep going" pattern. 
@@ -85,7 +85,7 @@ static void **flax_values(flax *f) { * - Tail: key > keys[numele-1] is the append case, overwhelmingly common * when keys are monotonically increasing sequence numbers. * - Head: key <= keys[0] catches prepend and exact-match-at-zero. */ -static int flax_search(const uint64_t *keys, uint32_t numele, uint64_t key, int64_t *out_idx) { +static int flax_search(const uint8_t *keys, uint32_t numele, uint8_t key, int64_t *out_idx) { if (numele == 0) { *out_idx = 0; return 0; @@ -132,7 +132,7 @@ static void flax_resize(flax *f, uint32_t new_capacity) { void *new_data = flax_malloc(new_alloc); if (f->data && f->numele > 0) { - memcpy(new_data, f->data, (size_t)f->numele * sizeof(uint64_t)); + memcpy(new_data, f->data, (size_t)f->numele * sizeof(uint8_t)); memcpy((char *)new_data + new_voff, (char *)f->data + flax_values_offset(f->capacity), (size_t)f->numele * sizeof(void *)); @@ -176,7 +176,7 @@ flax *flaxNew(void) { * as associated data the pointer 'data'. If the element already exists, the * associated data is updated and 1 is returned. If 'old' is not NULL the * previous value is stored at that address. Returns 1 on success. 
*/ -int flaxInsert(flax *f, uint64_t key, void *data, void **old) { +int flaxInsert(flax *f, uint8_t key, void *data, void **old) { if (f->numele == f->capacity) flax_resize(f, f->capacity * 2); @@ -188,12 +188,12 @@ int flaxInsert(flax *f, uint64_t key, void *data, void **old) { return 1; } - uint64_t *keys = flax_keys(f); + uint8_t *keys = flax_keys(f); void **vals = flax_values(f); int64_t tail = f->numele - idx; if (tail > 0) { - memmove(&keys[idx + 1], &keys[idx], (size_t)tail * sizeof(uint64_t)); + memmove(&keys[idx + 1], &keys[idx], (size_t)tail * sizeof(uint8_t)); memmove(&vals[idx + 1], &vals[idx], (size_t)tail * sizeof(void *)); } @@ -207,7 +207,7 @@ int flaxInsert(flax *f, uint64_t key, void *data, void **old) { /* Non overwriting insert function: if an element with the same key exists, * the value is not updated and the function returns 0. If 'old' is not NULL * the existing value is stored at that address. Returns 1 on success. */ -int flaxTryInsert(flax *f, uint64_t key, void *data, void **old) { +int flaxTryInsert(flax *f, uint8_t key, void *data, void **old) { if (f->numele == f->capacity) flax_resize(f, f->capacity * 2); @@ -217,12 +217,12 @@ int flaxTryInsert(flax *f, uint64_t key, void *data, void **old) { return 0; } - uint64_t *keys = flax_keys(f); + uint8_t *keys = flax_keys(f); void **vals = flax_values(f); int64_t tail = f->numele - idx; if (tail > 0) { - memmove(&keys[idx + 1], &keys[idx], (size_t)tail * sizeof(uint64_t)); + memmove(&keys[idx + 1], &keys[idx], (size_t)tail * sizeof(uint8_t)); memmove(&vals[idx + 1], &vals[idx], (size_t)tail * sizeof(void *)); } @@ -236,7 +236,7 @@ int flaxTryInsert(flax *f, uint64_t key, void *data, void **old) { /* Remove the specified item. Returns 1 if the item was found and * deleted, 0 otherwise. If 'old' is not NULL the removed value is * stored at that address. 
*/ -int flaxRemove(flax *f, uint64_t key, void **old) { +int flaxRemove(flax *f, uint8_t key, void **old) { if (!f || f->numele == 0) { if (old) *old = NULL; return 0; @@ -248,13 +248,13 @@ int flaxRemove(flax *f, uint64_t key, void **old) { return 0; } - uint64_t *keys = flax_keys(f); + uint8_t *keys = flax_keys(f); void **vals = flax_values(f); if (old) *old = vals[idx]; int64_t tail = f->numele - idx - 1; if (tail > 0) { - memmove(&keys[idx], &keys[idx + 1], (size_t)tail * sizeof(uint64_t)); + memmove(&keys[idx], &keys[idx + 1], (size_t)tail * sizeof(uint8_t)); memmove(&vals[idx], &vals[idx + 1], (size_t)tail * sizeof(void *)); } @@ -265,7 +265,7 @@ int flaxRemove(flax *f, uint64_t key, void **old) { /* Find a key in the flax, returning 1 if found, 0 otherwise. If the key * is found and 'value' is not NULL, the associated data pointer is stored * at that address. */ -int flaxFind(flax *f, uint64_t key, void **value) { +int flaxFind(flax *f, uint8_t key, void **value) { if (!f || f->numele == 0) { if (value) *value = NULL; return 0; @@ -317,13 +317,6 @@ uint64_t flaxSize(flax *f) { return (uint64_t)f->numele; } -/* Return the last (largest) key in the flax. - * Precondition: f->numele > 0. Calling on an empty flax is a bug. */ -uint64_t flaxLastKey(flax *f) { - assert(f->numele > 0); - return flax_keys(f)[f->numele - 1]; -} - /* Shrink the internal storage to fit the current number of elements, * releasing unused memory. */ void flaxShrink(flax *f) { @@ -331,38 +324,6 @@ void flaxShrink(flax *f) { flax_resize(f, f->numele); } -/* Split 'f' at the midpoint: entries [0, mid) stay in 'f', entries [mid, numele) - * move to a newly allocated flax which is returned. *split_key is set to the - * first key of the upper half. The caller should flaxShrink(f) afterwards if - * reclaiming the excess capacity of the lower half is desired. 
*/ -flax *flaxSplit(flax *f, uint64_t *split_key) { - uint32_t mid = f->numele / 2; - uint32_t upper_count = f->numele - mid; - - uint64_t *src_keys = flax_keys(f); - void **src_vals = flax_values(f); - - *split_key = src_keys[mid]; - - /* Right-size the new flax to hold the upper half. */ - uint32_t cap = FLAX_INIT_CAPACITY; - while (cap < upper_count) cap *= 2; - - flax *upper = flax_malloc(sizeof(flax)); - upper->numele = upper_count; - upper->capacity = cap; - size_t voff = flax_values_offset(cap); - upper->data = flax_malloc(voff + (size_t)cap * sizeof(void *)); - - memcpy(flax_keys(upper), &src_keys[mid], - (size_t)upper_count * sizeof(uint64_t)); - memcpy(flax_values(upper), &src_vals[mid], - (size_t)upper_count * sizeof(void *)); - - f->numele = mid; - return upper; -} - /* ------------------------------- Iterator --------------------------------- */ /* Initialize a flax iterator. This call should be performed a single time @@ -380,7 +341,7 @@ void flaxStart(flaxIterator *it, flax *f) { * or equal, ">" for strictly greater, "<=" for less or equal, "<" for * strictly less, and "=" for exact match. Return 0 if no matching element * was found, otherwise 1 is returned. 
*/ -int flaxSeek(flaxIterator *it, const char *op, uint64_t key) { +int flaxSeek(flaxIterator *it, const char *op, uint8_t key) { if (!it->f || it->f->numele == 0) { it->idx = -1; it->key = 0; @@ -698,22 +659,22 @@ int flaxTest(int argc, char **argv, int flags) { TEST("grow beyond initial capacity") { flax *a = flaxNew(); - for (int64_t i = 0; i < 100; i++) { + for (int i = 0; i < 128; i++) { char *buf = flax_malloc(16); - snprintf(buf, 16, "v%lld", (long long)i); - flaxInsert(a, i * 3, buf, NULL); + snprintf(buf, 16, "v%d", i); + flaxInsert(a, (uint8_t)(i * 2), buf, NULL); } - assert(flaxSize(a) == 100); - assert(a->capacity >= 100); + assert(flaxSize(a) == 128); + assert(a->capacity >= 128); - for (int64_t i = 0; i < 100; i++) { + for (int i = 0; i < 128; i++) { char expected[16]; - snprintf(expected, sizeof(expected), "v%lld", (long long)i); + snprintf(expected, sizeof(expected), "v%d", i); void *val; - assert(flaxFind(a, i * 3, &val) == 1); + assert(flaxFind(a, (uint8_t)(i * 2), &val) == 1); if (strcmp(val, expected) != 0) { - ERR("grow: key %lld expected '%s' got '%s'", - (long long)(i * 3), expected, (char *)val); + ERR("grow: key %d expected '%s' got '%s'", + i * 2, expected, (char *)val); } } @@ -722,24 +683,25 @@ int flaxTest(int argc, char **argv, int flags) { TEST("shrink after many removals") { flax *a = flaxNew(); - for (int64_t i = 0; i < 64; i++) - flaxInsert(a, i, "x", NULL); + for (int i = 0; i < 64; i++) + flaxInsert(a, (uint8_t)i, "x", NULL); assert(flaxSize(a) == 64); - int64_t cap_before = a->capacity; + uint32_t cap_before = a->capacity; - for (int64_t i = 0; i < 56; i++) - flaxRemove(a, i, NULL); + for (int i = 0; i < 56; i++) + flaxRemove(a, (uint8_t)i, NULL); assert(flaxSize(a) == 8); + flaxShrink(a); if (a->capacity >= cap_before) { - ERR("shrink: capacity %lld should be less than %lld", - (long long)a->capacity, (long long)cap_before); + ERR("shrink: capacity %u should be less than %u", + a->capacity, cap_before); } - for (int64_t i = 56; 
i < 64; i++) { + for (int i = 56; i < 64; i++) { void *val; - assert(flaxFind(a, i, &val) == 1); + assert(flaxFind(a, (uint8_t)i, &val) == 1); assert(strcmp(val, "x") == 0); } @@ -771,11 +733,11 @@ int flaxTest(int argc, char **argv, int flags) { } } - TEST("large keys near uint64 boundaries") { + TEST("keys near uint8 boundaries") { flax *a = flaxNew(); flaxInsert(a, 0, "zero", NULL); - flaxInsert(a, UINT64_MAX, "max", NULL); - flaxInsert(a, UINT64_MAX - 1, "max-1", NULL); + flaxInsert(a, 255, "max", NULL); + flaxInsert(a, 254, "max-1", NULL); flaxInsert(a, 100, "hundred", NULL); void *val; @@ -784,9 +746,9 @@ int flaxTest(int argc, char **argv, int flags) { assert(strcmp(val, "zero") == 0); assert(flaxFind(a, 100, &val) == 1); assert(strcmp(val, "hundred") == 0); - assert(flaxFind(a, UINT64_MAX - 1, &val) == 1); + assert(flaxFind(a, 254, &val) == 1); assert(strcmp(val, "max-1") == 0); - assert(flaxFind(a, UINT64_MAX, &val) == 1); + assert(flaxFind(a, 255, &val) == 1); assert(strcmp(val, "max") == 0); flaxFree(a); diff --git a/src/flax.h b/src/flax.h index 98c7bea68..e9cd33496 100644 --- a/src/flax.h +++ b/src/flax.h @@ -1,4 +1,4 @@ -/* Flax -- A flat sorted-array map for uint64 keys. +/* Flax -- A flat sorted-array map for uint8_t keys. * * Copyright (c) 2025-Present, Redis Ltd. * All rights reserved. @@ -16,13 +16,13 @@ #define FLAX_INIT_CAPACITY 16 -/* A flax is a sorted associative container that maps uint64_t keys to void* +/* A flax is a sorted associative container that maps uint8_t keys to void* * values. 
Both arrays live in a single heap allocation ("data block") laid * out as follows: * * flax struct data block (single allocation) * +------------+ +------------------------------------+ - * | *data -----------> | keys[0..cap-1] (uint64_t) | + * | *data -----------> | keys[0..cap-1] (uint8_t) | * | numele | +-- aligned to sizeof(void*) --------+ * | capacity | | values[0..cap-1] (void*) | * +------------+ +------------------------------------+ @@ -61,7 +61,7 @@ typedef struct flax { * is a no-op included for API symmetry with rax. */ typedef struct flaxIterator { flax *f; /* Flax we are iterating. */ - uint64_t key; /* The current key. */ + uint8_t key; /* The current key. */ void *data; /* Data associated to this key. */ int64_t idx; /* Current index into the flax arrays, -1 if EOF. */ } flaxIterator; @@ -76,14 +76,14 @@ void flaxFreeWithCbAndContext(flax *f, void *ctx); /* --- Lookup and mutation --- */ -int flaxInsert(flax *f, uint64_t key, void *data, void **old); -int flaxTryInsert(flax *f, uint64_t key, void *data, void **old); -int flaxRemove(flax *f, uint64_t key, void **old); -int flaxFind(flax *f, uint64_t key, void **value); +int flaxInsert(flax *f, uint8_t key, void *data, void **old); +int flaxTryInsert(flax *f, uint8_t key, void *data, void **old); +int flaxRemove(flax *f, uint8_t key, void **old); +int flaxFind(flax *f, uint8_t key, void **value); /* --- Iterator --- */ void flaxStart(flaxIterator *it, flax *f); -int flaxSeek(flaxIterator *it, const char *op, uint64_t key); +int flaxSeek(flaxIterator *it, const char *op, uint8_t key); int flaxNext(flaxIterator *it); int flaxPrev(flaxIterator *it); void flaxStop(flaxIterator *it); @@ -91,9 +91,7 @@ int flaxEOF(flaxIterator *it); /* --- Introspection --- */ uint64_t flaxSize(flax *f); -uint64_t flaxLastKey(flax *f); /* Precondition: f->numele > 0. 
*/ void flaxShrink(flax *f); -flax *flaxSplit(flax *f, uint64_t *split_key); #ifdef REDIS_TEST int flaxTest(int argc, char *argv[], int flags); diff --git a/src/stream.h b/src/stream.h index bd93db7b7..5f04c360f 100644 --- a/src/stream.h +++ b/src/stream.h @@ -101,11 +101,11 @@ typedef struct streamCG { this value is detailed at the top comment of streamEstimateDistanceFromFirstEverEntry(). */ rax *pel; /* Two-level pending entries list. The outer rax is - keyed by a 16-byte compound key (ms, seq_base) in - big-endian. Each value is a flax* covering the - half-open seq range [seq_base, next_seq_base). - Buckets split when they exceed FLAX_BUCKET_MAX - entries. Flax values are streamNACK pointers. */ + keyed by a 15-byte prefix of the big-endian + encoded stream ID (full ms + upper 56 bits of + seq). Each value is a flax* mapping the low + byte of seq (uint8_t) to streamNACK pointers. + Max 256 entries per bucket, no splitting. */ uint64_t pel_count; /* Total number of NACK entries across all flax buckets. */ streamNACK *pel_time_head; /* Head of time-ordered doubly-linked list of pending entries (oldest delivery_time). Used for efficient @@ -126,9 +126,9 @@ typedef struct streamConsumer { will be identified in the consumer group protocol. Case sensitive. */ rax *pel; /* Two-level consumer PEL: same structure as - streamCG.pel — 16-byte (ms, seq_base) rax - key, flax(seq -> NACK*) values with bucket - splitting. NACKs are shared with group PEL. */ + streamCG.pel — 15-byte rax key (first 15 + bytes of encoded ID), flax(low byte of seq + -> NACK*). NACKs are shared with group PEL. */ uint64_t pel_count; /* Total NACK count for this consumer. */ } streamConsumer; @@ -209,22 +209,15 @@ typedef struct pelIterator { } pelIterator; /* Inline cache embedded in rax metadata to speed up sequential PEL ops - * when consecutive operations target the same bucket. 
- * seq_upper is the seq_base of the next bucket for the same ms, or - * UINT64_MAX when the cached bucket is the last (or only) one. */ + * when consecutive operations target the same 15-byte rax bucket. */ typedef struct pelCache { - uint64_t ms; - uint64_t seq_base; - uint64_t seq_upper; + unsigned char key[15]; flax *f; } pelCache; static inline void pelCacheInvalidate(rax *pel) { pelCache *cache = (pelCache *)pel->metadata; cache->f = NULL; - cache->ms = 0; - cache->seq_base = 0; - cache->seq_upper = 0; } /* Two-level PEL operations. */ diff --git a/src/t_stream.c b/src/t_stream.c index 2670f341c..07c5fca51 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -68,29 +68,21 @@ static void pelListUpdate(streamCG *cg, streamNACK *nack, mstime_t new_delivery_ * Two-level PEL: rax(ms -> flax(seq -> streamNACK*)) * ----------------------------------------------------------------------- */ -#define FLAX_BUCKET_MAX 256 +#define PEL_RAX_KEY_LEN 15 -/* Encode a 16-byte compound rax key: 8 bytes big-endian ms + 8 bytes big-endian seq_base. */ -static inline void pelEncodeKey(unsigned char *buf, uint64_t ms, uint64_t seq_base) { +/* Encode a 15-byte rax key: full 8B big-endian ms + upper 7 bytes of + * big-endian seq (i.e. first 15 bytes of the 16-byte encoded streamID). */ +static inline void pelEncodeRaxKey(unsigned char *buf, uint64_t ms, uint64_t seq) { uint64_t be; be = htonu64(ms); memcpy(buf, &be, 8); - be = htonu64(seq_base); - memcpy(buf + 8, &be, 8); + be = htonu64(seq); + memcpy(buf + 8, &be, 7); } -/* Decode just the ms portion (first 8 bytes) of a compound key. */ -static inline uint64_t pelDecodeMs(unsigned char *buf) { - uint64_t be; - memcpy(&be, buf, 8); - return ntohu64(be); -} - -/* Decode the seq_base portion (last 8 bytes) of a compound key. */ -static inline uint64_t pelDecodeSeqBase(unsigned char *buf) { - uint64_t be; - memcpy(&be, buf + 8, 8); - return ntohu64(be); +/* Extract the low byte of seq as the flax key. 
*/ +static inline uint8_t pelFlaxKey(uint64_t seq) { + return (uint8_t)(seq & 0xFF); } rax *pelNew(size_t *alloc_size) { @@ -137,246 +129,58 @@ void pelFreeShallow(rax *pel) { /* pelResolveFlax -- Resolve the flax bucket for a given (ms, seq) pair. * - * The two-level PEL maps stream IDs (ms, seq) to streamNACK pointers using a - * rax of flax buckets. Each rax key is a 16-byte compound of (ms, seq_base), - * and each rax value is a flax that stores NACKs whose sequence numbers fall - * within that bucket's range. - * - * Two-level PEL layout - * ==================== - * - * rax (keyed by 16 bytes: [ms | seq_base]) - * +--------------------+ +--------------------+ +--------------------+ - * | key: (42, 0) | | key: (42, 300) | | key: (99, 0) | - * | val: flax_A -------+--+ | val: flax_B -------+--+ | val: flax_C -------+--+ - * +--------------------+ | +--------------------+ | +--------------------+ | - * v v v - * +----------+ +----------+ +----------+ - * | seq: 5 | | seq: 300 | | seq: 0 | - * | seq: 11 | | seq: 301 | | seq: 42 | - * | seq: 12 | | seq: 500 | +----------+ - * | ... | +----------+ flax_C - * +----------+ flax_B - * flax_A - * - * Each flax bucket owns entries for a half-open seq range within one ms: - * [seq_base, seq_base_of_next_bucket) or [seq_base, UINT64_MAX) if last - * - * An inline cache (pelCache) stored in rax->metadata remembers the last - * resolved bucket so consecutive lookups to the same ms can skip raxSeek: - * - * pelCache { ms, seq_base, seq_upper, *f } - * ^ ^ - * | +-- seq_base of the NEXT bucket (exclusive upper bound) - * +-- the millisecond timestamp of the cached bucket - * - * - * Lookup flow (create == 0) - * ========================= - * - * pelResolveFlax(pel, ms, seq, create=0) - * | - * +---------+---------+ - * | cache valid for | - * | this (ms, seq)? 
| - * +----+---------+----+ - * YES | | NO - * +-------------+ +----------------+ - * | | - * v raxSeek("<=",(ms,seq)) - * return cache->f | - * +---------+---------+ - * | found key with | - * | same ms? | - * +----+---------+----+ - * YES | | NO - * +-------------+ +-----+ - * | | - * update cache, return NULL - * return cache->f - * - * - * Insert flow (create == 1), after cache is resolved - * =================================================== - * - * Once the cache points to the right bucket for this ms, the bucket - * may need to grow. Three cases: - * - * CASE 1 -- No bucket exists for this ms (rax miss) - * -------------------------------------------------- - * Create a brand-new flax, insert into rax with key (ms, 0). - * - * rax rax - * (empty for ms=42) ==> +------------------+ - * | key: (42, 0) | - * | val: new flax ---+---> (empty flax) - * +------------------+ - * - * CASE 2 -- Overflow: bucket is full, seq extends past the tail - * -------------------------------------------------------------- - * The new seq is larger than every key in the full bucket. - * Create a new bucket starting at seq, appended after the current one. - * - * Before: After: - * flax_A [0..255] FULL flax_A [0..255] (shrunk) - * seq = 300 flax_NEW [300..) = new flax - * - * rax key (42,0) -> flax_A rax key (42,0) -> flax_A (shrunk) - * rax key (42,300) -> flax_NEW - * - * CASE 3 -- Mid-bucket split: bucket is full, seq falls inside its range - * ----------------------------------------------------------------------- - * Split the full flax at its midpoint. The lower half stays in the - * original bucket; the upper half goes into a new rax entry. - * The cache is updated to whichever half now owns seq. - * - * Before: After: - * flax_A [0..255] FULL flax_A [0..split_key) (shrunk) - * seq = 130 flax_UPPER [split_key..) 
- * - * rax key (42,0) -> flax_A rax key (42,0) -> flax_A - * rax key (42,split_key) -> flax_UPPER - * - * if seq >= split_key, cache points to flax_UPPER; - * otherwise cache retains flax_A with seq_upper = split_key. + * With the 15+1 byte key scheme, each rax key is the first 15 bytes of the + * big-endian encoded stream ID (full ms + upper 56 bits of seq). The flax + * stores the low byte of seq as its uint8_t key, so each bucket holds at + * most 256 entries and never needs splitting. * * Returns the flax bucket, or NULL if no matching bucket exists (create==0). - * When create==1, a new bucket is created on miss (or on overflow) and - * *created is set to 1. */ -/* Create a fresh flax bucket at rax key (ms, 0), update the cache, and - * resolve seq_upper by peeking at the next rax key for the same ms. */ -static flax *pelCreateBucket(rax *pel, pelCache *cache, uint64_t ms) { - if (cache->f) flaxShrink(cache->f); - flax *f = flaxNewWithCapacity(FLAX_BUCKET_MAX); - unsigned char keybuf[16]; - pelEncodeKey(keybuf, ms, 0); - raxInsert(pel, keybuf, 16, f, NULL); - cache->ms = ms; - cache->seq_base = 0; - cache->f = f; - - raxIterator ri; - raxStart(&ri, pel); - raxSeek(&ri, ">", keybuf, 16); - if (raxNext(&ri) && pelDecodeMs(ri.key) == ms) - cache->seq_upper = pelDecodeSeqBase(ri.key); - else - cache->seq_upper = UINT64_MAX; - raxStop(&ri); - return f; -} - -static flax *pelResolveFlax(rax *pel, uint64_t ms, uint64_t seq, int create, int *created) { + * When create==1, a new bucket is created on miss and *created is set to 1. */ +static flax *pelResolveFlax(rax *pel, uint64_t ms, uint64_t seq, + int create, int *created) { pelCache *cache = (pelCache *)pel->metadata; if (created) *created = 0; + unsigned char keybuf[PEL_RAX_KEY_LEN]; + pelEncodeRaxKey(keybuf, ms, seq); - /* Cache hit: same ms and seq falls within [seq_base, seq_upper). 
*/ - if (!(cache->f && cache->ms == ms && - seq >= cache->seq_base && seq < cache->seq_upper)) { - /* Cache miss — fall back to rax lookup. */ - unsigned char keybuf[16]; - pelEncodeKey(keybuf, ms, seq); + /* Cache hit */ + if (cache->f && memcmp(cache->key, keybuf, PEL_RAX_KEY_LEN) == 0) + return cache->f; - raxIterator ri; - raxStart(&ri, pel); - /* Seek to the largest key <= (ms, seq). */ - raxSeek(&ri, "<=", keybuf, 16); - if (!raxNext(&ri)) { - raxStop(&ri); - if (!create) return NULL; - if (created) *created = 1; - return pelCreateBucket(pel, cache, ms); - } - - uint64_t found_ms = pelDecodeMs(ri.key); - if (found_ms == ms) { - /* Bucket belongs to the same ms. Peek at the next rax entry - * to determine the upper bound for this bucket. */ - cache->ms = ms; - cache->seq_base = pelDecodeSeqBase(ri.key); - cache->f = (flax *)ri.data; - if (raxNext(&ri) && pelDecodeMs(ri.key) == ms) - cache->seq_upper = pelDecodeSeqBase(ri.key); - else - cache->seq_upper = UINT64_MAX; - raxStop(&ri); - } else { - raxStop(&ri); - if (!create) return NULL; - if (created) *created = 1; - return pelCreateBucket(pel, cache, ms); - } - } - - /* cache->f holds an existing bucket for this ms. - * Overflow: bucket is full and new seq extends past the tail. */ - if (create && cache->f->numele >= FLAX_BUCKET_MAX && seq > flaxLastKey(cache->f)) { - flaxShrink(cache->f); - flax *f = flaxNewWithCapacity(FLAX_BUCKET_MAX); - unsigned char keybuf[16]; - pelEncodeKey(keybuf, ms, seq); - raxInsert(pel, keybuf, 16, f, NULL); - /* seq_upper is intentionally NOT updated: the new bucket sits - * between the old bucket and whatever followed it, inheriting - * the old upper bound. */ - cache->seq_base = seq; - cache->f = f; - if (created) *created = 1; - return f; - } - - /* Mid-bucket split: bucket is full and seq falls inside its range. 
*/ - if (create && cache->f->numele >= FLAX_BUCKET_MAX) { - uint64_t split_key; - flax *upper = flaxSplit(cache->f, &split_key); - flaxShrink(cache->f); - - unsigned char keybuf[16]; - pelEncodeKey(keybuf, ms, split_key); - raxInsert(pel, keybuf, 16, upper, NULL); - - if (seq >= split_key) { - cache->seq_base = split_key; - cache->f = upper; - } else { - cache->seq_upper = split_key; - } + /* Rax lookup (exact match) */ + void *data; + if (raxFind(pel, keybuf, PEL_RAX_KEY_LEN, &data)) { + cache->f = (flax *)data; + memcpy(cache->key, keybuf, PEL_RAX_KEY_LEN); return cache->f; } - return cache->f; + if (!create) return NULL; + + /* Create new bucket */ + flax *f = flaxNew(); + raxInsert(pel, keybuf, PEL_RAX_KEY_LEN, f, NULL); + cache->f = f; + memcpy(cache->key, keybuf, PEL_RAX_KEY_LEN); + if (created) *created = 1; + return f; } /* Insert nack into two-level PEL. Returns 1 if new entry, 0 if key existed (old value replaced). */ int pelInsert(rax *pel, streamID *id, streamNACK *nack, uint64_t *count) { - flax *f = pelResolveFlax(pel, id->ms, id->seq, 0, NULL); - if (f) { - void *old; - flaxInsert(f, id->seq, nack, &old); - if (old == NULL) { - if (count) (*count)++; - return 1; - } - return 0; + flax *f = pelResolveFlax(pel, id->ms, id->seq, 1, NULL); + void *old; + flaxInsert(f, pelFlaxKey(id->seq), nack, &old); + if (old == NULL) { + if (count) (*count)++; + return 1; } - /* No bucket yet — create and insert. The second pelResolveFlax call is - * a cache hit since the first call just primed the rax iterator path. */ - f = pelResolveFlax(pel, id->ms, id->seq, 1, NULL); - flaxInsert(f, id->seq, nack, NULL); - if (count) (*count)++; - return 1; + return 0; } /* Insert only if not present. Returns 1 if inserted, 0 if key already exists. 
*/ int pelTryInsert(rax *pel, streamID *id, streamNACK *nack, uint64_t *count) { - int created; - flax *f = pelResolveFlax(pel, id->ms, id->seq, 1, &created); - if (created) { - flaxInsert(f, id->seq, nack, NULL); - if (count) (*count)++; - return 1; - } - - if (!flaxTryInsert(f, id->seq, nack, NULL)) + flax *f = pelResolveFlax(pel, id->ms, id->seq, 1, NULL); + if (!flaxTryInsert(f, pelFlaxKey(id->seq), nack, NULL)) return 0; if (count) (*count)++; return 1; @@ -387,7 +191,7 @@ streamNACK *pelFind(rax *pel, streamID *id) { flax *f = pelResolveFlax(pel, id->ms, id->seq, 0, NULL); if (!f) return NULL; void *val; - if (!flaxFind(f, id->seq, &val)) return NULL; + if (!flaxFind(f, pelFlaxKey(id->seq), &val)) return NULL; return (streamNACK *)val; } @@ -396,15 +200,14 @@ streamNACK *pelRemove(rax *pel, streamID *id, uint64_t *count) { flax *f = pelResolveFlax(pel, id->ms, id->seq, 0, NULL); if (!f) return NULL; void *old; - if (!flaxRemove(f, id->seq, &old)) return NULL; + if (!flaxRemove(f, pelFlaxKey(id->seq), &old)) return NULL; streamNACK *nack = (streamNACK *)old; if (count) (*count)--; if (f->numele == 0) { - pelCache *cache = (pelCache *)pel->metadata; - unsigned char keybuf[16]; - pelEncodeKey(keybuf, id->ms, cache->seq_base); + unsigned char keybuf[PEL_RAX_KEY_LEN]; + pelEncodeRaxKey(keybuf, id->ms, id->seq); flaxFree(f); - raxRemove(pel, keybuf, 16, NULL); + raxRemove(pel, keybuf, PEL_RAX_KEY_LEN, NULL); pelCacheInvalidate(pel); } return nack; @@ -414,10 +217,10 @@ streamNACK *pelRemove(rax *pel, streamID *id, uint64_t *count) { /* Refresh iterator fields from current rax+flax positions. 
*/ static void pelIterRefresh(pelIterator *pi) { - pi->id.ms = pelDecodeMs(pi->ri.key); - pi->id.seq = pi->fi.key; + memcpy(pi->rawkey, pi->ri.key, PEL_RAX_KEY_LEN); + pi->rawkey[PEL_RAX_KEY_LEN] = (unsigned char)pi->fi.key; + streamDecodeID(pi->rawkey, &pi->id); pi->nack = (streamNACK *)pi->fi.data; - streamEncodeID(pi->rawkey, &pi->id); pi->valid = 1; } @@ -432,7 +235,6 @@ void pelIterStart(pelIterator *pi, rax *pel) { int pelIterSeek(pelIterator *pi, const char *op, streamID *id) { pi->valid = 0; if (op[0] == '^') { - /* Seek to first entry. */ raxSeek(&pi->ri, "^", NULL, 0); if (!raxNext(&pi->ri)) return 0; flaxStart(&pi->fi, (flax *)pi->ri.data); @@ -440,7 +242,6 @@ int pelIterSeek(pelIterator *pi, const char *op, streamID *id) { pelIterRefresh(pi); return 1; } else if (op[0] == '$') { - /* Seek to last entry. */ raxSeek(&pi->ri, "$", NULL, 0); if (!raxNext(&pi->ri)) return 0; flaxStart(&pi->fi, (flax *)pi->ri.data); @@ -448,13 +249,14 @@ int pelIterSeek(pelIterator *pi, const char *op, streamID *id) { pelIterRefresh(pi); return 1; } else if (op[0] == '>' && op[1] == '=') { - unsigned char keybuf[16]; - pelEncodeKey(keybuf, id->ms, id->seq); - /* Seek to the largest rax key <= (ms, seq), which is the bucket - * that could contain the target seq. */ - raxSeek(&pi->ri, "<=", keybuf, 16); + unsigned char keybuf[PEL_RAX_KEY_LEN]; + pelEncodeRaxKey(keybuf, id->ms, id->seq); + uint8_t fkey = pelFlaxKey(id->seq); + + /* Seek to the largest rax key <= target 15-byte key. */ + raxSeek(&pi->ri, "<=", keybuf, PEL_RAX_KEY_LEN); if (!raxNext(&pi->ri)) { - /* All buckets are > target, start from the very first bucket. */ + /* All rax keys are > target, start from the very first bucket. 
*/ raxSeek(&pi->ri, "^", NULL, 0); if (!raxNext(&pi->ri)) return 0; flaxStart(&pi->fi, (flax *)pi->ri.data); @@ -462,22 +264,23 @@ int pelIterSeek(pelIterator *pi, const char *op, streamID *id) { pelIterRefresh(pi); return 1; } - uint64_t cur_ms = pelDecodeMs(pi->ri.key); + + int cmp = memcmp(pi->ri.key, keybuf, PEL_RAX_KEY_LEN); flaxStart(&pi->fi, (flax *)pi->ri.data); - if (cur_ms == id->ms) { - if (!flaxSeek(&pi->fi, ">=", id->seq)) { - /* No seq >= target in this bucket, advance to next rax entry. */ + if (cmp == 0) { + /* Exact rax key match — seek flax for >= fkey. */ + if (!flaxSeek(&pi->fi, ">=", fkey)) { if (!raxNext(&pi->ri)) return 0; flaxStart(&pi->fi, (flax *)pi->ri.data); if (!flaxSeek(&pi->fi, "^", 0)) return 0; } - } else if (cur_ms < id->ms) { - /* Landed in an earlier-ms bucket, advance to next. */ + } else if (cmp < 0) { + /* Landed in an earlier bucket, advance to next. */ if (!raxNext(&pi->ri)) return 0; flaxStart(&pi->fi, (flax *)pi->ri.data); if (!flaxSeek(&pi->fi, "^", 0)) return 0; } else { - /* cur_ms > id->ms: start from the head of this bucket. */ + /* rax key > target: start from head of this bucket. */ if (!flaxSeek(&pi->fi, "^", 0)) return 0; } pelIterRefresh(pi); From f80d24b264a331c766c5b3579ae1280bac6e2fcc Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Sat, 28 Mar 2026 13:37:35 +0200 Subject: [PATCH 12/48] changed: cgref to use two levels --- src/stream.h | 5 +- src/t_stream.c | 134 +++++++++++++++++++++++++++++++++++++------------ 2 files changed, 107 insertions(+), 32 deletions(-) diff --git a/src/stream.h b/src/stream.h index 5f04c360f..76c50a5bb 100644 --- a/src/stream.h +++ b/src/stream.h @@ -43,7 +43,10 @@ typedef struct stream { uint64_t entries_added; /* All time count of elements added. */ size_t alloc_size; /* Total allocated memory (in bytes) by this stream. */ rax *cgroups; /* Consumer groups dictionary: name -> streamCG */ - rax *cgroups_ref; /* Index mapping message IDs to their consumer groups. 
*/ + rax *cgroups_ref; /* Two-level index mapping message IDs to their + consumer groups. Same key scheme as PEL: + outer rax(15-byte prefix) -> flax(low byte + of seq -> list* of streamCG pointers). */ streamID min_cgroup_last_id; /* The minimum ID of consume group. */ unsigned int min_cgroup_last_id_valid: 1; uint64_t idmp_duration; /* IDMP duration in seconds. */ diff --git a/src/t_stream.c b/src/t_stream.c index 07c5fca51..d993080c3 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -317,6 +317,56 @@ void pelIterStop(pelIterator *pi) { pi->valid = 0; } +/* ----------------------------------------------------------------------- + * Two-level cgroups_ref: rax(15-byte prefix -> flax(low-byte-of-seq -> list*)) + * + * Mirrors the PEL two-level layout. The outer rax key is the first + * 15 bytes of the big-endian encoded streamID; the flax key is byte 15 + * (low byte of seq). Each flax slot holds a list* of consumer-group + * pointers that reference the entry. + * ----------------------------------------------------------------------- */ + +typedef struct cgrefCache { + unsigned char key[PEL_RAX_KEY_LEN]; + flax *f; +} cgrefCache; + +static inline void cgrefCacheInvalidate(rax *ref) { + cgrefCache *cache = (cgrefCache *)ref->metadata; + cache->f = NULL; +} + +static rax *cgrefNew(size_t *alloc_size) { + rax *ref = raxNewWithMetadata(sizeof(cgrefCache), alloc_size); + if (ref) cgrefCacheInvalidate(ref); + return ref; +} + +static flax *cgrefResolveFlax(rax *ref, unsigned char *key, int create) { + cgrefCache *cache = (cgrefCache *)ref->metadata; + + if (cache->f && memcmp(cache->key, key, PEL_RAX_KEY_LEN) == 0) + return cache->f; + + void *data; + if (raxFind(ref, key, PEL_RAX_KEY_LEN, &data)) { + cache->f = (flax *)data; + memcpy(cache->key, key, PEL_RAX_KEY_LEN); + return cache->f; + } + if (!create) return NULL; + + flax *f = flaxNew(); + raxInsert(ref, key, PEL_RAX_KEY_LEN, f, NULL); + cache->f = f; + memcpy(cache->key, key, PEL_RAX_KEY_LEN); + return f; +} + 
+static void cgrefFreeFlaxCb(void *val) { + flaxFreeWithCallback((flax *)val, listReleaseGeneric); +} + /* ----------------------------------------------------------------------- * Low level stream encoding: a radix tree of listpacks. * ----------------------------------------------------------------------- */ @@ -365,7 +415,7 @@ void freeStream(stream *s) { if (s->cgroups) raxFreeWithCbAndContext(s->cgroups, streamFreeCGGeneric, s); if (s->cgroups_ref) - raxFreeWithCallback(s->cgroups_ref, listReleaseGeneric); + raxFreeWithCallback(s->cgroups_ref, cgrefFreeFlaxCb); /* Free IDMP producers rax tree */ if (s->idmp_producers) raxFreeWithCbAndContext(s->idmp_producers, streamFreeIdmpProducerGeneric, s); @@ -3323,17 +3373,18 @@ void streamUpdateCGroupLastId(stream *s, streamCG *cg, streamID *id) { * Returns a pointer to the list node, so that it can be used for future deletion. */ listNode *streamLinkCGroupToEntry(stream *s, streamCG *cg, unsigned char *key) { if (!s->cgroups_ref) - s->cgroups_ref = raxNewWithMetadata(0, &s->alloc_size); + s->cgroups_ref = cgrefNew(&s->alloc_size); - /* Speculatively create a list and try to insert it. If the key already - * exists, raxTryInsert returns 0 and sets 'existing' to the current value, - * so we discard the unused list. This avoids a double rax traversal - * (find + insert) on the common miss path. 
*/ - list *cglist = listCreate(); - list *existing; - if (!raxTryInsert(s->cgroups_ref, key, sizeof(streamID), cglist, (void**)&existing)) { - listRelease(cglist); - cglist = existing; + flax *f = cgrefResolveFlax(s->cgroups_ref, key, 1); + uint8_t fkey = key[PEL_RAX_KEY_LEN]; + + list *cglist; + void *existing; + if (flaxFind(f, fkey, &existing)) { + cglist = (list *)existing; + } else { + cglist = listCreate(); + flaxInsert(f, fkey, cglist, NULL); } listAddNodeTail(cglist, cg); @@ -3343,15 +3394,26 @@ listNode *streamLinkCGroupToEntry(stream *s, streamCG *cg, unsigned char *key) { /* Unlink a consumer group reference from the entry index for a specific stream ID. * This is called when a message is acknowledged or when a consumer group is deleted. */ void streamUnlinkEntryFromCGroupRef(stream *s, streamNACK *na, unsigned char *key) { - list *cglist; if (!s->cgroups_ref) return; - if (raxFind(s->cgroups_ref, key, sizeof(streamID), (void**)&cglist)) { - listDelNode(cglist, na->cgroup_ref_node); - - /* If the list is now empty, remove it from the index. */ - if (listLength(cglist) == 0) { - raxRemove(s->cgroups_ref, key, sizeof(streamID), NULL); - listRelease(cglist); + + flax *f = cgrefResolveFlax(s->cgroups_ref, key, 0); + if (!f) return; + + uint8_t fkey = key[PEL_RAX_KEY_LEN]; + void *val; + if (!flaxFind(f, fkey, &val)) return; + + list *cglist = (list *)val; + listDelNode(cglist, na->cgroup_ref_node); + + if (listLength(cglist) == 0) { + flaxRemove(f, fkey, NULL); + listRelease(cglist); + + if (f->numele == 0) { + flaxFree(f); + raxRemove(s->cgroups_ref, key, PEL_RAX_KEY_LEN, NULL); + cgrefCacheInvalidate(s->cgroups_ref); } } } @@ -3359,26 +3421,28 @@ void streamUnlinkEntryFromCGroupRef(stream *s, streamNACK *na, unsigned char *ke /* Remove all consumer group references to a specific stream message. 
*/ void streamCleanupEntryCGroupRefs(stream *s, streamID *id) { if (!s->cgroups_ref) return; - list *cglist; - listIter li; - listNode *ln; unsigned char buf[sizeof(streamID)]; streamEncodeID(buf, id); - /* If message is not in any consumer group, nothing to do */ - if (!raxFind(s->cgroups_ref, buf, sizeof(streamID), (void **)&cglist)) - return; + flax *f = cgrefResolveFlax(s->cgroups_ref, buf, 0); + if (!f) return; + + uint8_t fkey = buf[PEL_RAX_KEY_LEN]; + void *val; + if (!flaxFind(f, fkey, &val)) return; + + list *cglist = (list *)val; + listIter li; + listNode *ln; listRewind(cglist, &li); while ((ln = listNext(&li))) { streamNACK *nack; streamCG *group = listNodeValue(ln); - - /* Find the message in this consumer group's PEL */ + nack = pelFind(group->pel, id); serverAssert(nack); - - /* Remove from group and consumer PELs */ + pelListUnlink(group, nack); pelRemove(group->pel, id, &group->pel_count); pelRemove(nack->consumer->pel, id, &nack->consumer->pel_count); @@ -3387,8 +3451,14 @@ void streamCleanupEntryCGroupRefs(stream *s, streamID *id) { streamFreeNACK(s, nack); } - raxRemove(s->cgroups_ref, buf, sizeof(streamID), NULL); + flaxRemove(f, fkey, NULL); listRelease(cglist); + + if (f->numele == 0) { + flaxFree(f); + raxRemove(s->cgroups_ref, buf, PEL_RAX_KEY_LEN, NULL); + cgrefCacheInvalidate(s->cgroups_ref); + } } /* Check if a stream entry is still referenced by any consumer group. 
@@ -3426,7 +3496,9 @@ int streamEntryIsReferenced(stream *s, streamID *id) { if (!s->cgroups_ref) return 0; unsigned char buf[sizeof(streamID)]; streamEncodeID(buf, id); - return raxFind(s->cgroups_ref, buf, sizeof(streamID), NULL); + flax *f = cgrefResolveFlax(s->cgroups_ref, buf, 0); + if (!f) return 0; + return flaxFind(f, buf[PEL_RAX_KEY_LEN], NULL); } /* Create a NACK entry setting the delivery count to 1 and the delivery From 0f35c61a89a1bd38c7b4b1695371787a512ecd30 Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Mon, 30 Mar 2026 12:29:40 +0300 Subject: [PATCH 13/48] fixed: shrinking --- src/t_stream.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/t_stream.c b/src/t_stream.c index d993080c3..2167a733c 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -135,11 +135,12 @@ void pelFreeShallow(rax *pel) { * most 256 entries and never needs splitting. * * Returns the flax bucket, or NULL if no matching bucket exists (create==0). - * When create==1, a new bucket is created on miss and *created is set to 1. */ + * When prev is non-NULL, *prev is set to the previously cached flax bucket + * before the cache is updated. */ static flax *pelResolveFlax(rax *pel, uint64_t ms, uint64_t seq, - int create, int *created) { + int create, flax **prev) { pelCache *cache = (pelCache *)pel->metadata; - if (created) *created = 0; + if (prev) *prev = cache->f; unsigned char keybuf[PEL_RAX_KEY_LEN]; pelEncodeRaxKey(keybuf, ms, seq); @@ -161,13 +162,14 @@ static flax *pelResolveFlax(rax *pel, uint64_t ms, uint64_t seq, raxInsert(pel, keybuf, PEL_RAX_KEY_LEN, f, NULL); cache->f = f; memcpy(cache->key, keybuf, PEL_RAX_KEY_LEN); - if (created) *created = 1; return f; } /* Insert nack into two-level PEL. Returns 1 if new entry, 0 if key existed (old value replaced). 
*/ int pelInsert(rax *pel, streamID *id, streamNACK *nack, uint64_t *count) { - flax *f = pelResolveFlax(pel, id->ms, id->seq, 1, NULL); + flax *prev; + flax *f = pelResolveFlax(pel, id->ms, id->seq, 1, &prev); + if (prev && prev != f) flaxShrink(prev); void *old; flaxInsert(f, pelFlaxKey(id->seq), nack, &old); if (old == NULL) { @@ -179,7 +181,9 @@ int pelInsert(rax *pel, streamID *id, streamNACK *nack, uint64_t *count) { /* Insert only if not present. Returns 1 if inserted, 0 if key already exists. */ int pelTryInsert(rax *pel, streamID *id, streamNACK *nack, uint64_t *count) { - flax *f = pelResolveFlax(pel, id->ms, id->seq, 1, NULL); + flax *prev; + flax *f = pelResolveFlax(pel, id->ms, id->seq, 1, &prev); + if (prev && prev != f) flaxShrink(prev); if (!flaxTryInsert(f, pelFlaxKey(id->seq), nack, NULL)) return 0; if (count) (*count)++; From c9010cc23ed61d9961f482c8a05b09c0ac20804a Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Mon, 30 Mar 2026 13:50:38 +0300 Subject: [PATCH 14/48] added: alloc size counting --- src/flax.c | 22 +++++-- src/flax.h | 6 +- src/flax_malloc.h | 2 + src/t_stream.c | 155 +++++++++++++++++++--------------------------- 4 files changed, 88 insertions(+), 97 deletions(-) diff --git a/src/flax.c b/src/flax.c index 54e2f8135..2ff212126 100644 --- a/src/flax.c +++ b/src/flax.c @@ -129,7 +129,8 @@ static int flax_search(const uint8_t *keys, uint32_t numele, uint8_t key, int64_ static void flax_resize(flax *f, uint32_t new_capacity) { size_t new_voff = flax_values_offset(new_capacity); size_t new_alloc = new_voff + (size_t)new_capacity * sizeof(void *); - void *new_data = flax_malloc(new_alloc); + size_t new_usable; + void *new_data = flax_malloc_usable(new_alloc, &new_usable); if (f->data && f->numele > 0) { memcpy(new_data, f->data, (size_t)f->numele * sizeof(uint8_t)); @@ -138,9 +139,11 @@ static void flax_resize(flax *f, uint32_t new_capacity) { (size_t)f->numele * sizeof(void *)); } - flax_free(f->data); + size_t old_usable; + 
flax_free_usable(f->data, &old_usable); f->data = new_data; f->capacity = new_capacity; + f->alloc_size += new_usable - old_usable; } /* Update the iterator key and data fields from the underlying flax @@ -158,11 +161,14 @@ static void flaxIterRefresh(flaxIterator *it) { * On out of memory the function returns NULL. */ flax *flaxNewWithCapacity(uint32_t capacity) { if (capacity < FLAX_INIT_CAPACITY) capacity = FLAX_INIT_CAPACITY; - flax *f = flax_malloc(sizeof(flax)); + size_t usable; + flax *f = flax_malloc_usable(sizeof(flax), &usable); + f->alloc_size = usable; f->numele = 0; f->capacity = capacity; size_t voff = flax_values_offset(capacity); - f->data = flax_malloc(voff + (size_t)capacity * sizeof(void *)); + f->data = flax_malloc_usable(voff + (size_t)capacity * sizeof(void *), &usable); + f->alloc_size += usable; return f; } @@ -317,6 +323,13 @@ uint64_t flaxSize(flax *f) { return (uint64_t)f->numele; } +/* Return the total heap memory used by the flax struct and its data block. + * O(1): the value is maintained incrementally by alloc/resize operations. */ +size_t flaxAllocSize(flax *f) { + if (!f) return 0; + return f->alloc_size; +} + /* Shrink the internal storage to fit the current number of elements, * releasing unused memory. */ void flaxShrink(flax *f) { @@ -488,6 +501,7 @@ int flaxEOF(flaxIterator *it) { #ifdef REDIS_TEST #include "testhelp.h" #include +#include #include #define UNUSED(x) (void)(x) diff --git a/src/flax.h b/src/flax.h index e9cd33496..0f7da8bb4 100644 --- a/src/flax.h +++ b/src/flax.h @@ -44,8 +44,9 @@ */ typedef struct flax { void *data; /* Packed storage: keys array followed by values array. */ - uint32_t numele; /* Number of elements currently stored. */ - uint32_t capacity; /* Current allocated capacity. */ + uint16_t numele; /* Number of elements currently stored (max 256). */ + uint16_t capacity; /* Current allocated capacity. */ + uint32_t alloc_size; /* Total usable bytes: struct allocation + data block. 
*/ } flax; /* Flax iterator state. The typical lifecycle is: @@ -91,6 +92,7 @@ int flaxEOF(flaxIterator *it); /* --- Introspection --- */ uint64_t flaxSize(flax *f); +size_t flaxAllocSize(flax *f); void flaxShrink(flax *f); #ifdef REDIS_TEST diff --git a/src/flax_malloc.h b/src/flax_malloc.h index c0723c4d2..97b587ca3 100644 --- a/src/flax_malloc.h +++ b/src/flax_malloc.h @@ -2,5 +2,7 @@ #define FLAX_ALLOC_H #include "zmalloc.h" #define flax_malloc zmalloc +#define flax_malloc_usable zmalloc_usable #define flax_free zfree +#define flax_free_usable zfree_usable #endif diff --git a/src/t_stream.c b/src/t_stream.c index 2167a733c..94f256717 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -91,27 +91,22 @@ rax *pelNew(size_t *alloc_size) { return pel; } -static void pelFreeFlaxCb(void *val, void *privdata) { - (void)privdata; - flaxFree((flax *)val); -} - /* Free all flax structures and call nack_free for each NACK. */ void pelFree(rax *pel, void (*nack_free)(void *, void *), void *ctx) { if (!pel) return; - if (nack_free) { - raxIterator ri; - raxStart(&ri, pel); - raxSeek(&ri, "^", NULL, 0); - while (raxNext(&ri)) { - flax *f = ri.data; + raxIterator ri; + raxStart(&ri, pel); + raxSeek(&ri, "^", NULL, 0); + while (raxNext(&ri)) { + flax *f = ri.data; + if (pel->alloc_size) *pel->alloc_size -= flaxAllocSize(f); + if (nack_free) flaxFreeWithCbAndContext(f, nack_free, ctx); - } - raxStop(&ri); - raxFreeWithCbAndContext(pel, NULL, NULL); - } else { - raxFreeWithCbAndContext(pel, pelFreeFlaxCb, NULL); + else + flaxFree(f); } + raxStop(&ri); + raxFreeWithCbAndContext(pel, NULL, NULL); } /* Free flax structures without freeing NACKs (for consumer PEL where NACKs are shared). 
*/ @@ -121,13 +116,14 @@ void pelFreeShallow(rax *pel) { raxStart(&ri, pel); raxSeek(&ri, "^", NULL, 0); while (raxNext(&ri)) { + if (pel->alloc_size) *pel->alloc_size -= flaxAllocSize((flax *)ri.data); flaxFree((flax *)ri.data); } raxStop(&ri); raxFree(pel); } -/* pelResolveFlax -- Resolve the flax bucket for a given (ms, seq) pair. +/* pelResolveFlax -- Resolve the flax bucket for a given 15-byte rax key. * * With the 15+1 byte key scheme, each rax key is the first 15 bytes of the * big-endian encoded stream ID (full ms + upper 56 bits of seq). The flax @@ -137,12 +133,10 @@ void pelFreeShallow(rax *pel) { * Returns the flax bucket, or NULL if no matching bucket exists (create==0). * When prev is non-NULL, *prev is set to the previously cached flax bucket * before the cache is updated. */ -static flax *pelResolveFlax(rax *pel, uint64_t ms, uint64_t seq, +static flax *pelResolveFlax(rax *r, unsigned char *keybuf, int create, flax **prev) { - pelCache *cache = (pelCache *)pel->metadata; + pelCache *cache = (pelCache *)r->metadata; if (prev) *prev = cache->f; - unsigned char keybuf[PEL_RAX_KEY_LEN]; - pelEncodeRaxKey(keybuf, ms, seq); /* Cache hit */ if (cache->f && memcmp(cache->key, keybuf, PEL_RAX_KEY_LEN) == 0) @@ -150,7 +144,7 @@ static flax *pelResolveFlax(rax *pel, uint64_t ms, uint64_t seq, /* Rax lookup (exact match) */ void *data; - if (raxFind(pel, keybuf, PEL_RAX_KEY_LEN, &data)) { + if (raxFind(r, keybuf, PEL_RAX_KEY_LEN, &data)) { cache->f = (flax *)data; memcpy(cache->key, keybuf, PEL_RAX_KEY_LEN); return cache->f; @@ -159,7 +153,8 @@ static flax *pelResolveFlax(rax *pel, uint64_t ms, uint64_t seq, /* Create new bucket */ flax *f = flaxNew(); - raxInsert(pel, keybuf, PEL_RAX_KEY_LEN, f, NULL); + if (r->alloc_size) *r->alloc_size += flaxAllocSize(f); + raxInsert(r, keybuf, PEL_RAX_KEY_LEN, f, NULL); cache->f = f; memcpy(cache->key, keybuf, PEL_RAX_KEY_LEN); return f; @@ -167,11 +162,19 @@ static flax *pelResolveFlax(rax *pel, uint64_t ms, uint64_t seq, 
/* Insert nack into two-level PEL. Returns 1 if new entry, 0 if key existed (old value replaced). */ int pelInsert(rax *pel, streamID *id, streamNACK *nack, uint64_t *count) { + unsigned char keybuf[PEL_RAX_KEY_LEN]; + pelEncodeRaxKey(keybuf, id->ms, id->seq); flax *prev; - flax *f = pelResolveFlax(pel, id->ms, id->seq, 1, &prev); - if (prev && prev != f) flaxShrink(prev); + flax *f = pelResolveFlax(pel, keybuf, 1, &prev); + if (prev && prev != f) { + size_t before = flaxAllocSize(prev); + flaxShrink(prev); + if (pel->alloc_size) *pel->alloc_size -= before - flaxAllocSize(prev); + } + size_t before = flaxAllocSize(f); void *old; flaxInsert(f, pelFlaxKey(id->seq), nack, &old); + if (pel->alloc_size) *pel->alloc_size += flaxAllocSize(f) - before; if (old == NULL) { if (count) (*count)++; return 1; @@ -181,18 +184,28 @@ int pelInsert(rax *pel, streamID *id, streamNACK *nack, uint64_t *count) { /* Insert only if not present. Returns 1 if inserted, 0 if key already exists. */ int pelTryInsert(rax *pel, streamID *id, streamNACK *nack, uint64_t *count) { + unsigned char keybuf[PEL_RAX_KEY_LEN]; + pelEncodeRaxKey(keybuf, id->ms, id->seq); flax *prev; - flax *f = pelResolveFlax(pel, id->ms, id->seq, 1, &prev); - if (prev && prev != f) flaxShrink(prev); - if (!flaxTryInsert(f, pelFlaxKey(id->seq), nack, NULL)) - return 0; + flax *f = pelResolveFlax(pel, keybuf, 1, &prev); + if (prev && prev != f) { + size_t before = flaxAllocSize(prev); + flaxShrink(prev); + if (pel->alloc_size) *pel->alloc_size -= before - flaxAllocSize(prev); + } + size_t before = flaxAllocSize(f); + int inserted = flaxTryInsert(f, pelFlaxKey(id->seq), nack, NULL); + if (pel->alloc_size) *pel->alloc_size += flaxAllocSize(f) - before; + if (!inserted) return 0; if (count) (*count)++; return 1; } /* Find a NACK by streamID. Returns NULL if not found. 
*/ streamNACK *pelFind(rax *pel, streamID *id) { - flax *f = pelResolveFlax(pel, id->ms, id->seq, 0, NULL); + unsigned char keybuf[PEL_RAX_KEY_LEN]; + pelEncodeRaxKey(keybuf, id->ms, id->seq); + flax *f = pelResolveFlax(pel, keybuf, 0, NULL); if (!f) return NULL; void *val; if (!flaxFind(f, pelFlaxKey(id->seq), &val)) return NULL; @@ -201,15 +214,16 @@ streamNACK *pelFind(rax *pel, streamID *id) { /* Remove a NACK by streamID. Returns the removed NACK or NULL. */ streamNACK *pelRemove(rax *pel, streamID *id, uint64_t *count) { - flax *f = pelResolveFlax(pel, id->ms, id->seq, 0, NULL); + unsigned char keybuf[PEL_RAX_KEY_LEN]; + pelEncodeRaxKey(keybuf, id->ms, id->seq); + flax *f = pelResolveFlax(pel, keybuf, 0, NULL); if (!f) return NULL; void *old; if (!flaxRemove(f, pelFlaxKey(id->seq), &old)) return NULL; streamNACK *nack = (streamNACK *)old; if (count) (*count)--; if (f->numele == 0) { - unsigned char keybuf[PEL_RAX_KEY_LEN]; - pelEncodeRaxKey(keybuf, id->ms, id->seq); + if (pel->alloc_size) *pel->alloc_size -= flaxAllocSize(f); flaxFree(f); raxRemove(pel, keybuf, PEL_RAX_KEY_LEN, NULL); pelCacheInvalidate(pel); @@ -321,56 +335,6 @@ void pelIterStop(pelIterator *pi) { pi->valid = 0; } -/* ----------------------------------------------------------------------- - * Two-level cgroups_ref: rax(15-byte prefix -> flax(low-byte-of-seq -> list*)) - * - * Mirrors the PEL two-level layout. The outer rax key is the first - * 15 bytes of the big-endian encoded streamID; the flax key is byte 15 - * (low byte of seq). Each flax slot holds a list* of consumer-group - * pointers that reference the entry. 
- * ----------------------------------------------------------------------- */ - -typedef struct cgrefCache { - unsigned char key[PEL_RAX_KEY_LEN]; - flax *f; -} cgrefCache; - -static inline void cgrefCacheInvalidate(rax *ref) { - cgrefCache *cache = (cgrefCache *)ref->metadata; - cache->f = NULL; -} - -static rax *cgrefNew(size_t *alloc_size) { - rax *ref = raxNewWithMetadata(sizeof(cgrefCache), alloc_size); - if (ref) cgrefCacheInvalidate(ref); - return ref; -} - -static flax *cgrefResolveFlax(rax *ref, unsigned char *key, int create) { - cgrefCache *cache = (cgrefCache *)ref->metadata; - - if (cache->f && memcmp(cache->key, key, PEL_RAX_KEY_LEN) == 0) - return cache->f; - - void *data; - if (raxFind(ref, key, PEL_RAX_KEY_LEN, &data)) { - cache->f = (flax *)data; - memcpy(cache->key, key, PEL_RAX_KEY_LEN); - return cache->f; - } - if (!create) return NULL; - - flax *f = flaxNew(); - raxInsert(ref, key, PEL_RAX_KEY_LEN, f, NULL); - cache->f = f; - memcpy(cache->key, key, PEL_RAX_KEY_LEN); - return f; -} - -static void cgrefFreeFlaxCb(void *val) { - flaxFreeWithCallback((flax *)val, listReleaseGeneric); -} - /* ----------------------------------------------------------------------- * Low level stream encoding: a radix tree of listpacks. 
* ----------------------------------------------------------------------- */ @@ -408,6 +372,11 @@ static void streamLpFreeGeneric(void *lp, void *strm) { lpFree(lp); } +static void listReleaseGenericCb(void *val, void *ctx) { + (void)ctx; + listReleaseGeneric(val); +} + void streamFreeIdmpProducerGeneric(void *producer, void *strm) { stream *s = strm; idmpProducerFree((idmpProducer *)producer, &s->alloc_size); @@ -419,7 +388,7 @@ void freeStream(stream *s) { if (s->cgroups) raxFreeWithCbAndContext(s->cgroups, streamFreeCGGeneric, s); if (s->cgroups_ref) - raxFreeWithCallback(s->cgroups_ref, cgrefFreeFlaxCb); + pelFree(s->cgroups_ref, listReleaseGenericCb, NULL); /* Free IDMP producers rax tree */ if (s->idmp_producers) raxFreeWithCbAndContext(s->idmp_producers, streamFreeIdmpProducerGeneric, s); @@ -3377,9 +3346,9 @@ void streamUpdateCGroupLastId(stream *s, streamCG *cg, streamID *id) { * Returns a pointer to the list node, so that it can be used for future deletion. */ listNode *streamLinkCGroupToEntry(stream *s, streamCG *cg, unsigned char *key) { if (!s->cgroups_ref) - s->cgroups_ref = cgrefNew(&s->alloc_size); + s->cgroups_ref = pelNew(&s->alloc_size); - flax *f = cgrefResolveFlax(s->cgroups_ref, key, 1); + flax *f = pelResolveFlax(s->cgroups_ref, key, 1, NULL); uint8_t fkey = key[PEL_RAX_KEY_LEN]; list *cglist; @@ -3388,7 +3357,9 @@ listNode *streamLinkCGroupToEntry(stream *s, streamCG *cg, unsigned char *key) { cglist = (list *)existing; } else { cglist = listCreate(); + size_t before = flaxAllocSize(f); flaxInsert(f, fkey, cglist, NULL); + s->alloc_size += flaxAllocSize(f) - before; } listAddNodeTail(cglist, cg); @@ -3400,7 +3371,7 @@ listNode *streamLinkCGroupToEntry(stream *s, streamCG *cg, unsigned char *key) { void streamUnlinkEntryFromCGroupRef(stream *s, streamNACK *na, unsigned char *key) { if (!s->cgroups_ref) return; - flax *f = cgrefResolveFlax(s->cgroups_ref, key, 0); + flax *f = pelResolveFlax(s->cgroups_ref, key, 0, NULL); if (!f) return; 
uint8_t fkey = key[PEL_RAX_KEY_LEN]; @@ -3415,9 +3386,10 @@ void streamUnlinkEntryFromCGroupRef(stream *s, streamNACK *na, unsigned char *ke listRelease(cglist); if (f->numele == 0) { + s->alloc_size -= flaxAllocSize(f); flaxFree(f); raxRemove(s->cgroups_ref, key, PEL_RAX_KEY_LEN, NULL); - cgrefCacheInvalidate(s->cgroups_ref); + pelCacheInvalidate(s->cgroups_ref); } } } @@ -3428,7 +3400,7 @@ void streamCleanupEntryCGroupRefs(stream *s, streamID *id) { unsigned char buf[sizeof(streamID)]; streamEncodeID(buf, id); - flax *f = cgrefResolveFlax(s->cgroups_ref, buf, 0); + flax *f = pelResolveFlax(s->cgroups_ref, buf, 0, NULL); if (!f) return; uint8_t fkey = buf[PEL_RAX_KEY_LEN]; @@ -3459,9 +3431,10 @@ void streamCleanupEntryCGroupRefs(stream *s, streamID *id) { listRelease(cglist); if (f->numele == 0) { + s->alloc_size -= flaxAllocSize(f); flaxFree(f); raxRemove(s->cgroups_ref, buf, PEL_RAX_KEY_LEN, NULL); - cgrefCacheInvalidate(s->cgroups_ref); + pelCacheInvalidate(s->cgroups_ref); } } @@ -3500,7 +3473,7 @@ int streamEntryIsReferenced(stream *s, streamID *id) { if (!s->cgroups_ref) return 0; unsigned char buf[sizeof(streamID)]; streamEncodeID(buf, id); - flax *f = cgrefResolveFlax(s->cgroups_ref, buf, 0); + flax *f = pelResolveFlax(s->cgroups_ref, buf, 0, NULL); if (!f) return 0; return flaxFind(f, buf[PEL_RAX_KEY_LEN], NULL); } From ff1a8ad7acbc3e472341a38f5e19e61d63c8040e Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Mon, 30 Mar 2026 14:55:12 +0300 Subject: [PATCH 15/48] fixed: issue with defrag --- src/defrag.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/defrag.c b/src/defrag.c index 0bb097f7b..6c4aa4f5a 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -868,6 +868,8 @@ void* defragStreamConsumerPelFlax(raxIterator *ri, void *privdata) { flax *f = ri->data; flax *newf = activeDefragAlloc(f); if (newf) f = newf; + void *newdata = activeDefragAlloc(f->data); + if (newdata) f->data = newdata; /* Iterate entries in the flax and defrag each NACK. 
*/ flaxIterator fi; @@ -937,6 +939,9 @@ void* defragStreamGroupPelFlax(raxIterator *ri, void *privdata) { (void)privdata; flax *f = ri->data; flax *newf = activeDefragAlloc(f); + if (newf) f = newf; + void *newdata = activeDefragAlloc(f->data); + if (newdata) f->data = newdata; return newf; } From 0b0903325372b3b6aac4557b2791251af2dbf7c4 Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Wed, 1 Apr 2026 10:29:32 +0300 Subject: [PATCH 16/48] fixed: issue from review --- src/flax.c | 354 ++++++++++++++++++++++++++++++++++++++----------- src/flax.h | 16 ++- src/t_stream.c | 2 +- 3 files changed, 287 insertions(+), 85 deletions(-) diff --git a/src/flax.c b/src/flax.c index 2ff212126..ab628b31c 100644 --- a/src/flax.c +++ b/src/flax.c @@ -9,7 +9,7 @@ */ #include "flax.h" -#include +#include "redisassert.h" #include #include #include @@ -127,6 +127,7 @@ static int flax_search(const uint8_t *keys, uint32_t numele, uint8_t key, int64_ * the keys at the start of the block and one for the values at the new * aligned offset. The old data block is freed afterwards. */ static void flax_resize(flax *f, uint32_t new_capacity) { + if (new_capacity > UINT8_MAX) new_capacity = UINT8_MAX; size_t new_voff = flax_values_offset(new_capacity); size_t new_alloc = new_voff + (size_t)new_capacity * sizeof(void *); size_t new_usable; @@ -157,72 +158,37 @@ static void flaxIterRefresh(flaxIterator *it) { * Core API * -------------------------------------------------------------------------- */ -/* Allocate a new flax with the given initial capacity and return its pointer. - * On out of memory the function returns NULL. */ -flax *flaxNewWithCapacity(uint32_t capacity) { - if (capacity < FLAX_INIT_CAPACITY) capacity = FLAX_INIT_CAPACITY; +/* Allocate a new flax and return its pointer. On out of memory the function + * returns NULL. 
*/ +flax *flaxNew(void) { size_t usable; flax *f = flax_malloc_usable(sizeof(flax), &usable); f->alloc_size = usable; f->numele = 0; - f->capacity = capacity; - size_t voff = flax_values_offset(capacity); - f->data = flax_malloc_usable(voff + (size_t)capacity * sizeof(void *), &usable); + f->capacity = FLAX_INIT_CAPACITY; + size_t voff = flax_values_offset(FLAX_INIT_CAPACITY); + f->data = flax_malloc_usable(voff + (size_t)FLAX_INIT_CAPACITY * sizeof(void *), &usable); f->alloc_size += usable; return f; } -/* Allocate a new flax and return its pointer. On out of memory the function - * returns NULL. */ -flax *flaxNew(void) { - return flaxNewWithCapacity(FLAX_INIT_CAPACITY); -} - -/* Overwriting insert. Insert the element with the specified 'key', setting - * as associated data the pointer 'data'. If the element already exists, the - * associated data is updated and 1 is returned. If 'old' is not NULL the - * previous value is stored at that address. Returns 1 on success. */ -int flaxInsert(flax *f, uint8_t key, void *data, void **old) { - if (f->numele == f->capacity) - flax_resize(f, f->capacity * 2); - - int64_t idx; - if (flax_search(flax_keys(f), f->numele, key, &idx)) { - void **vals = flax_values(f); - if (old) *old = vals[idx]; - vals[idx] = data; - return 1; - } - - uint8_t *keys = flax_keys(f); - void **vals = flax_values(f); - int64_t tail = f->numele - idx; - - if (tail > 0) { - memmove(&keys[idx + 1], &keys[idx], (size_t)tail * sizeof(uint8_t)); - memmove(&vals[idx + 1], &vals[idx], (size_t)tail * sizeof(void *)); - } - - keys[idx] = key; - vals[idx] = data; - f->numele++; - if (old) *old = NULL; - return 1; -} - -/* Non overwriting insert function: if an element with the same key exists, - * the value is not updated and the function returns 0. If 'old' is not NULL - * the existing value is stored at that address. Returns 1 on success. 
*/ -int flaxTryInsert(flax *f, uint8_t key, void *data, void **old) { - if (f->numele == f->capacity) - flax_resize(f, f->capacity * 2); - +/* Generic insert. If 'overwrite' is true and the key already exists, the + * associated data is updated and 0 is returned. If 'overwrite' is false + * and the key exists, the data is left unchanged and 0 is returned. In + * both cases, if 'old' is not NULL the previous value is stored there. + * When the key is new, a new element is created and 1 is returned (and + * *old is set to NULL if provided). */ +static int flaxGenericInsert(flax *f, uint8_t key, void *data, void **old, int overwrite) { int64_t idx; if (flax_search(flax_keys(f), f->numele, key, &idx)) { if (old) *old = flax_values(f)[idx]; + if (overwrite) flax_values(f)[idx] = data; return 0; } + if (f->numele == f->capacity) + flax_resize(f, f->capacity * 2); + uint8_t *keys = flax_keys(f); void **vals = flax_values(f); int64_t tail = f->numele - idx; @@ -239,6 +205,17 @@ int flaxTryInsert(flax *f, uint8_t key, void *data, void **old) { return 1; } +/* Overwriting insert. This is just a wrapper for flaxGenericInsert(). */ +int flaxInsert(flax *f, uint8_t key, void *data, void **old) { + return flaxGenericInsert(f, key, data, old, 1); +} + +/* Non overwriting insert function: this is just a wrapper for + * flaxGenericInsert(). */ +int flaxTryInsert(flax *f, uint8_t key, void *data, void **old) { + return flaxGenericInsert(f, key, data, old, 0); +} + /* Remove the specified item. Returns 1 if the item was found and * deleted, 0 otherwise. If 'old' is not NULL the removed value is * stored at that address. */ @@ -287,27 +264,16 @@ int flaxFind(flax *f, uint8_t key, void **value) { /* Free a whole flax. */ void flaxFree(flax *f) { - flaxFreeWithCallback(f, NULL); -} - -/* Free a whole flax, calling the specified callback in order to - * free the auxiliary data. 
*/ -void flaxFreeWithCallback(flax *f, void (*free_callback)(void *)) { if (!f) return; - if (free_callback && f->data && f->numele > 0) { - void **vals = flax_values(f); - for (uint32_t i = 0; i < f->numele; i++) - free_callback(vals[i]); - } flax_free(f->data); flax_free(f); } /* Free a whole flax, calling the specified callback with a context * argument in order to free the auxiliary data. */ -void flaxFreeWithCbAndContext(flax *f, - void (*free_callback)(void *item, void *ctx), - void *ctx) { +void flaxFreeWithCallback(flax *f, + void (*free_callback)(void *item, void *ctx), + void *ctx) { if (!f) return; if (free_callback && f->data && f->numele > 0) { void **vals = flax_values(f); @@ -331,7 +297,8 @@ size_t flaxAllocSize(flax *f) { } /* Shrink the internal storage to fit the current number of elements, - * releasing unused memory. */ + * releasing unused memory. No-op when the flax is empty (the caller should + * flaxFree() the whole structure instead) or already at exact capacity. */ void flaxShrink(flax *f) { if (f->numele > 0 && f->numele < f->capacity) flax_resize(f, f->numele); @@ -448,6 +415,7 @@ int flaxSeek(flaxIterator *it, const char *op, uint8_t key) { return 1; } + assert(0 && "flaxSeek: unrecognized op"); it->idx = -1; it->key = 0; it->data = NULL; @@ -483,7 +451,7 @@ int flaxPrev(flaxIterator *it) { return 1; } -/* Free the iterator. */ +/* Stop the iterator (no-op, included for API symmetry with rax). 
*/ void flaxStop(flaxIterator *it) { (void)it; } @@ -500,7 +468,6 @@ int flaxEOF(flaxIterator *it) { #ifdef REDIS_TEST #include "testhelp.h" -#include #include #include @@ -517,7 +484,8 @@ int flaxEOF(flaxIterator *it) { static int flax_test_free_count; -static void flax_test_counting_free(void *p) { +static void flax_test_counting_free(void *p, void *ctx) { + (void)ctx; flax_test_free_count++; flax_free(p); } @@ -692,7 +660,7 @@ int flaxTest(int argc, char **argv, int flags) { } } - flaxFreeWithCallback(a, flax_free); + flaxFreeWithCallback(a, flax_test_counting_free, NULL); } TEST("shrink after many removals") { @@ -730,7 +698,7 @@ int flaxTest(int argc, char **argv, int flags) { snprintf(s, 8, "str%d", i); flaxInsert(a, i, s, NULL); } - flaxFreeWithCallback(a, flax_test_counting_free); + flaxFreeWithCallback(a, flax_test_counting_free, NULL); if (flax_test_free_count != 5) { ERR("freeWithCallback: expected 5 frees, got %d", flax_test_free_count); @@ -740,7 +708,7 @@ int flaxTest(int argc, char **argv, int flags) { TEST("flaxFreeWithCallback on empty flax") { flax_test_free_count = 0; flax *a = flaxNew(); - flaxFreeWithCallback(a, flax_test_counting_free); + flaxFreeWithCallback(a, flax_test_counting_free, NULL); if (flax_test_free_count != 0) { ERR("freeWithCallback empty: expected 0 frees, got %d", flax_test_free_count); @@ -888,19 +856,251 @@ int flaxTest(int argc, char **argv, int flags) { flaxFree(a); } - TEST("flaxFreeWithCbAndContext") { + TEST("flaxFreeWithCallback") { int ctx_free_count = 0; flax *a = flaxNew(); flaxInsert(a, 1, "one", NULL); flaxInsert(a, 2, "two", NULL); flaxInsert(a, 3, "three", NULL); - flaxFreeWithCbAndContext(a, flax_test_ctx_free, &ctx_free_count); + flaxFreeWithCallback(a, flax_test_ctx_free, &ctx_free_count); if (ctx_free_count != 3) { - ERR("freeWithCbAndContext: expected 3 frees, got %d", + ERR("freeWithCallback: expected 3 frees, got %d", ctx_free_count); } } + TEST("iterator seek >") { + flax *a = flaxNew(); + flaxInsert(a, 
10, "ten", NULL); + flaxInsert(a, 20, "twenty", NULL); + flaxInsert(a, 30, "thirty", NULL); + flaxInsert(a, 40, "forty", NULL); + + flaxIterator it; + flaxStart(&it, a); + + /* ">" on existing key skips to the next one. */ + assert(flaxSeek(&it, ">", 20)); + assert(it.key == 30); + + /* ">" on non-existing key lands on the first key greater. */ + assert(flaxSeek(&it, ">", 25)); + assert(it.key == 30); + + /* ">" on a key smaller than all elements returns the first. */ + assert(flaxSeek(&it, ">", 5)); + assert(it.key == 10); + + /* ">" on the largest key returns EOF. */ + assert(flaxSeek(&it, ">", 40) == 0); + assert(flaxEOF(&it) == 1); + + /* ">" on a key beyond all elements returns EOF. */ + assert(flaxSeek(&it, ">", 50) == 0); + assert(flaxEOF(&it) == 1); + + flaxStop(&it); + flaxFree(a); + } + + TEST("iterator seek <=") { + flax *a = flaxNew(); + flaxInsert(a, 10, "ten", NULL); + flaxInsert(a, 20, "twenty", NULL); + flaxInsert(a, 30, "thirty", NULL); + flaxInsert(a, 40, "forty", NULL); + + flaxIterator it; + flaxStart(&it, a); + + /* "<=" on existing key lands on that key. */ + assert(flaxSeek(&it, "<=", 20)); + assert(it.key == 20); + + /* "<=" on non-existing key lands on the greatest smaller key. */ + assert(flaxSeek(&it, "<=", 25)); + assert(it.key == 20); + + /* "<=" on the largest key lands on it. */ + assert(flaxSeek(&it, "<=", 40)); + assert(it.key == 40); + + /* "<=" on a key beyond all elements lands on the last. */ + assert(flaxSeek(&it, "<=", 100)); + assert(it.key == 40); + + /* "<=" on a key smaller than all returns EOF. */ + assert(flaxSeek(&it, "<=", 5) == 0); + assert(flaxEOF(&it) == 1); + + flaxStop(&it); + flaxFree(a); + } + + TEST("iterator seek <") { + flax *a = flaxNew(); + flaxInsert(a, 10, "ten", NULL); + flaxInsert(a, 20, "twenty", NULL); + flaxInsert(a, 30, "thirty", NULL); + flaxInsert(a, 40, "forty", NULL); + + flaxIterator it; + flaxStart(&it, a); + + /* "<" on existing key lands on the previous one. 
*/ + assert(flaxSeek(&it, "<", 20)); + assert(it.key == 10); + + /* "<" on non-existing key lands on the greatest smaller key. */ + assert(flaxSeek(&it, "<", 25)); + assert(it.key == 20); + + /* "<" on a key beyond all elements lands on the last. */ + assert(flaxSeek(&it, "<", 100)); + assert(it.key == 40); + + /* "<" on the smallest key returns EOF. */ + assert(flaxSeek(&it, "<", 10) == 0); + assert(flaxEOF(&it) == 1); + + /* "<" on a key smaller than all returns EOF. */ + assert(flaxSeek(&it, "<", 5) == 0); + assert(flaxEOF(&it) == 1); + + flaxStop(&it); + flaxFree(a); + } + + TEST("iterator seek =") { + flax *a = flaxNew(); + flaxInsert(a, 10, "ten", NULL); + flaxInsert(a, 20, "twenty", NULL); + flaxInsert(a, 30, "thirty", NULL); + + flaxIterator it; + flaxStart(&it, a); + + /* "=" on existing key succeeds. */ + assert(flaxSeek(&it, "=", 20)); + assert(it.key == 20); + assert(strcmp(it.data, "twenty") == 0); + + /* "=" on first key. */ + assert(flaxSeek(&it, "=", 10)); + assert(it.key == 10); + + /* "=" on last key. */ + assert(flaxSeek(&it, "=", 30)); + assert(it.key == 30); + + /* "=" on non-existing key returns EOF. 
*/ + assert(flaxSeek(&it, "=", 15) == 0); + assert(flaxEOF(&it) == 1); + + assert(flaxSeek(&it, "=", 0) == 0); + assert(flaxSeek(&it, "=", 255) == 0); + + flaxStop(&it); + flaxFree(a); + } + + TEST("flaxAllocSize tracks allocations") { + flax *a = flaxNew(); + size_t sz0 = flaxAllocSize(a); + assert(sz0 > 0); + + for (int i = 0; i < 64; i++) + flaxInsert(a, (uint8_t)i, "x", NULL); + + size_t sz1 = flaxAllocSize(a); + assert(sz1 >= sz0); + + flaxShrink(a); + size_t sz2 = flaxAllocSize(a); + assert(sz2 <= sz1); + assert(sz2 > 0); + + assert(flaxAllocSize(NULL) == 0); + + flaxFree(a); + } + + TEST("flaxFree and flaxFreeWithCallback on NULL") { + flaxFree(NULL); + flaxFreeWithCallback(NULL, flax_test_counting_free, NULL); + } + + TEST("flaxFind and flaxRemove on NULL flax") { + void *val; + assert(flaxFind(NULL, 42, &val) == 0); + assert(val == NULL); + assert(flaxRemove(NULL, 42, &val) == 0); + assert(val == NULL); + } + + TEST("flaxTryInsert with old=NULL on duplicate") { + flax *a = flaxNew(); + assert(flaxTryInsert(a, 10, "ten", NULL) == 1); + assert(flaxTryInsert(a, 10, "new_ten", NULL) == 0); + assert(flaxSize(a) == 1); + + void *val; + assert(flaxFind(a, 10, &val) == 1); + assert(strcmp(val, "ten") == 0); + + flaxFree(a); + } + + TEST("iterator seek with boundary keys 0 and 255") { + flax *a = flaxNew(); + flaxInsert(a, 0, "zero", NULL); + flaxInsert(a, 128, "mid", NULL); + flaxInsert(a, 255, "max", NULL); + + flaxIterator it; + flaxStart(&it, a); + + assert(flaxSeek(&it, ">=", 0)); + assert(it.key == 0); + assert(flaxSeek(&it, ">=", 255)); + assert(it.key == 255); + + assert(flaxSeek(&it, ">", 0)); + assert(it.key == 128); + assert(flaxSeek(&it, ">", 255) == 0); + + assert(flaxSeek(&it, "<=", 255)); + assert(it.key == 255); + assert(flaxSeek(&it, "<=", 0)); + assert(it.key == 0); + + assert(flaxSeek(&it, "<", 255)); + assert(it.key == 128); + assert(flaxSeek(&it, "<", 0) == 0); + + assert(flaxSeek(&it, "=", 0)); + assert(it.key == 0); + assert(flaxSeek(&it, 
"=", 255)); + assert(it.key == 255); + + flaxStop(&it); + flaxFree(a); + } + + TEST("iterator seek on empty flax all operators") { + flax *a = flaxNew(); + flaxIterator it; + flaxStart(&it, a); + + assert(flaxSeek(&it, ">", 42) == 0); + assert(flaxSeek(&it, "<=", 42) == 0); + assert(flaxSeek(&it, "<", 42) == 0); + assert(flaxSeek(&it, "=", 42) == 0); + + flaxStop(&it); + flaxFree(a); + } + if (!err) printf("ALL TESTS PASSED!\n"); else diff --git a/src/flax.h b/src/flax.h index 0f7da8bb4..402c0ec69 100644 --- a/src/flax.h +++ b/src/flax.h @@ -25,7 +25,8 @@ * | *data -----------> | keys[0..cap-1] (uint8_t) | * | numele | +-- aligned to sizeof(void*) --------+ * | capacity | | values[0..cap-1] (void*) | - * +------------+ +------------------------------------+ + * | alloc_size | +------------------------------------+ + * +------------+ * * Keys are maintained in ascending sorted order. Only the first 'numele' * slots in each array contain live data; the remainder up to 'capacity' @@ -59,7 +60,10 @@ typedef struct flax { * * After flaxStart() the iterator is in EOF state until a successful * flaxSeek(). The iterator does not allocate heap memory, so flaxStop() - * is a no-op included for API symmetry with rax. */ + * is a no-op included for API symmetry with rax. + * + * WARNING: the iterator is invalidated by any mutation (insert / remove / + * resize) on the underlying flax. Do not modify the flax while iterating. */ typedef struct flaxIterator { flax *f; /* Flax we are iterating. */ uint8_t key; /* The current key. 
*/ @@ -68,13 +72,11 @@ typedef struct flaxIterator { } flaxIterator; /* --- Creation and destruction --- */ -flax *flaxNewWithCapacity(uint32_t capacity); flax *flaxNew(void); void flaxFree(flax *f); -void flaxFreeWithCallback(flax *f, void (*free_callback)(void *)); -void flaxFreeWithCbAndContext(flax *f, - void (*free_callback)(void *item, void *ctx), - void *ctx); +void flaxFreeWithCallback(flax *f, + void (*free_callback)(void *item, void *ctx), + void *ctx); /* --- Lookup and mutation --- */ int flaxInsert(flax *f, uint8_t key, void *data, void **old); diff --git a/src/t_stream.c b/src/t_stream.c index 94f256717..8ab919e5d 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -101,7 +101,7 @@ void pelFree(rax *pel, void (*nack_free)(void *, void *), void *ctx) { flax *f = ri.data; if (pel->alloc_size) *pel->alloc_size -= flaxAllocSize(f); if (nack_free) - flaxFreeWithCbAndContext(f, nack_free, ctx); + flaxFreeWithCallback(f, nack_free, ctx); else flaxFree(f); } From 8245b6bb7b964cc2d2a24b1a383735bbd7a39705 Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Wed, 1 Apr 2026 13:26:48 +0300 Subject: [PATCH 17/48] fixed: issue from review --- src/aof.c | 16 ++++++++-------- src/flax_malloc.h | 17 +++++++++++++++++ 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/src/aof.c b/src/aof.c index 969c01f00..5c0496ad2 100644 --- a/src/aof.c +++ b/src/aof.c @@ -2336,23 +2336,23 @@ int rewriteStreamObject(rio *r, robj *key, robj *o) { } /* For the current consumer, iterate all the PEL entries * to emit the XCLAIM protocol. 
*/ - pelIterator pi_pel; - pelIterStart(&pi_pel,consumer->pel); - if (pelIterSeek(&pi_pel,"^",NULL)) { + pelIterator pi; + pelIterStart(&pi,consumer->pel); + if (pelIterSeek(&pi,"^",NULL)) { do { - streamNACK *nack = pi_pel.nack; + streamNACK *nack = pi.nack; if (rioWriteStreamPendingEntry(r,key,(char*)ri.key, ri.key_len,consumer, - pi_pel.rawkey,nack) == 0) + pi.rawkey,nack) == 0) { - pelIterStop(&pi_pel); + pelIterStop(&pi); raxStop(&ri_cons); raxStop(&ri); return 0; } - } while (pelIterNext(&pi_pel)); + } while (pelIterNext(&pi)); } - pelIterStop(&pi_pel); + pelIterStop(&pi); } raxStop(&ri_cons); } diff --git a/src/flax_malloc.h b/src/flax_malloc.h index 97b587ca3..410efabd4 100644 --- a/src/flax_malloc.h +++ b/src/flax_malloc.h @@ -1,3 +1,20 @@ +/* Flax -- A flat sorted-array map for uint8_t keys. + * + * Copyright (c) 2025-Present, Redis Ltd. + * All rights reserved. + * + * Licensed under your choice of (a) the Redis Source Available License 2.0 + * (RSALv2); or (b) the Server Side Public License v1 (SSPLv1); or (c) the + * GNU Affero General Public License v3 (AGPLv3). + */ + +/* Allocator selection. + * + * This file is used in order to change the Flax allocator at compile time. + * Just define the following defines to what you want to use. Also add + * the include of your alternate allocator if needed (not needed in order + * to use the default libc allocator). 
*/ + #ifndef FLAX_ALLOC_H #define FLAX_ALLOC_H #include "zmalloc.h" From 82794ef87dbf16906add42c730a6b6638fbf20cb Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Wed, 1 Apr 2026 13:52:42 +0300 Subject: [PATCH 18/48] fixed: issue from review --- src/defrag.c | 7 +++---- src/stream.h | 1 + src/t_stream.c | 13 +++++++++++++ 3 files changed, 17 insertions(+), 4 deletions(-) diff --git a/src/defrag.c b/src/defrag.c index 6c4aa4f5a..f8fc522ea 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -884,10 +884,9 @@ void* defragStreamConsumerPelFlax(raxIterator *ri, void *privdata) { /* Update in the consumer PEL flax. */ flaxInsert(f, fi.key, newnack, NULL); - /* Update in the group PEL flax. pelInsert is an - * overwriting insert; the key already exists so count - * is unaffected and no overflow can trigger. */ - pelInsert(ctx->cg->pel, &newnack->id, newnack, NULL); + /* Update in the group PEL flax. pelReplace bypasses the + * cache and flaxShrink to avoid new allocations during defrag. */ + pelReplace(ctx->cg->pel, &newnack->id, newnack); /* Update doubly-linked list pointers. 
*/ if (newnack->pel_prev) { diff --git a/src/stream.h b/src/stream.h index 76c50a5bb..39fe1bb32 100644 --- a/src/stream.h +++ b/src/stream.h @@ -229,6 +229,7 @@ void pelFree(rax *pel, void (*nack_free)(void *, void *), void *ctx); void pelFreeShallow(rax *pel); int pelInsert(rax *pel, streamID *id, streamNACK *nack, uint64_t *count); int pelTryInsert(rax *pel, streamID *id, streamNACK *nack, uint64_t *count); +void pelReplace(rax *pel, streamID *id, streamNACK *nack); streamNACK *pelFind(rax *pel, streamID *id); streamNACK *pelRemove(rax *pel, streamID *id, uint64_t *count); diff --git a/src/t_stream.c b/src/t_stream.c index 8ab919e5d..2fa299bad 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -201,6 +201,19 @@ int pelTryInsert(rax *pel, streamID *id, streamNACK *nack, uint64_t *count) { return 1; } +/* Replace the NACK pointer for an existing entry without cache interaction or + * flax shrink side-effects. Intended for defrag, where the key is guaranteed + * to exist and we must avoid allocations that would increase fragmentation. */ +void pelReplace(rax *pel, streamID *id, streamNACK *nack) { + unsigned char keybuf[PEL_RAX_KEY_LEN]; + pelEncodeRaxKey(keybuf, id->ms, id->seq); + void *data; + int found = raxFind(pel, keybuf, PEL_RAX_KEY_LEN, &data); + serverAssert(found); + flax *f = (flax *)data; + flaxInsert(f, pelFlaxKey(id->seq), nack, NULL); +} + /* Find a NACK by streamID. Returns NULL if not found. 
*/ streamNACK *pelFind(rax *pel, streamID *id) { unsigned char keybuf[PEL_RAX_KEY_LEN]; From 691c9aa153d51c8e356f215ee776b5e80c86d212 Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Wed, 1 Apr 2026 14:37:17 +0300 Subject: [PATCH 19/48] fixed: issue from review --- src/rdb.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/rdb.c b/src/rdb.c index 8916fab89..38ad5945b 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -3361,9 +3361,9 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) decrRefCount(o); return NULL; } - streamID cpel_id; - streamDecodeID(rawid, &cpel_id); - streamNACK *nack = pelFind(cgroup->pel, &cpel_id); + streamID nack_id; + streamDecodeID(rawid, &nack_id); + streamNACK *nack = pelFind(cgroup->pel, &nack_id); if (!nack) { rdbReportCorruptRDB("Consumer entry not found in " "group global PEL"); @@ -3375,7 +3375,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) * loading the global PEL. Then set the same shared * NACK structure also in the consumer-specific PEL. */ nack->consumer = consumer; - if (!pelTryInsert(consumer->pel,&cpel_id,nack,&consumer->pel_count)) { + if (!pelTryInsert(consumer->pel,&nack_id,nack,&consumer->pel_count)) { rdbReportCorruptRDB("Duplicated consumer PEL entry " " loading a stream consumer " "group"); @@ -3387,19 +3387,19 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) /* Verify that each PEL eventually got a consumer assigned to it. 
*/ if (deep_integrity_validation) { - pelIterator pi_cg_pel; - pelIterStart(&pi_cg_pel,cgroup->pel); - if (pelIterSeek(&pi_cg_pel,"^",NULL)) { + pelIterator pi_cg; + pelIterStart(&pi_cg,cgroup->pel); + if (pelIterSeek(&pi_cg,"^",NULL)) { do { - if (!pi_cg_pel.nack->consumer) { - pelIterStop(&pi_cg_pel); + if (!pi_cg.nack->consumer) { + pelIterStop(&pi_cg); rdbReportCorruptRDB("Stream CG PEL entry without consumer"); decrRefCount(o); return NULL; } - } while (pelIterNext(&pi_cg_pel)); + } while (pelIterNext(&pi_cg)); } - pelIterStop(&pi_cg_pel); + pelIterStop(&pi_cg); } } From d4b766be8e3f4aef960d4e88c7a3d1a313e74010 Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Wed, 1 Apr 2026 15:36:28 +0300 Subject: [PATCH 20/48] fixed: issue from review --- src/t_stream.c | 46 ++++++++++++++++++---------------------------- 1 file changed, 18 insertions(+), 28 deletions(-) diff --git a/src/t_stream.c b/src/t_stream.c index 2fa299bad..0750e3742 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -56,9 +56,6 @@ static idmpProducer *idmpGetOrCreateProducer(stream *s, const char *pid, size_t static int createIdempotencyHash(robj **argv, int64_t numfields, XXH128_hash_t *out_hash); static void idmpEvictOldestEntry(stream *s, idmpProducer *producer); -/* Forward declarations for stream ID encoding */ -void streamEncodeID(void *buf, streamID *id); - /* Forward declarations for PEL time list functions */ static void pelListInsertAtTail(streamCG *cg, streamNACK *nack); static void pelListUnlink(streamCG *cg, streamNACK *nack); @@ -160,8 +157,10 @@ static flax *pelResolveFlax(rax *r, unsigned char *keybuf, return f; } -/* Insert nack into two-level PEL. Returns 1 if new entry, 0 if key existed (old value replaced). */ -int pelInsert(rax *pel, streamID *id, streamNACK *nack, uint64_t *count) { +/* Generic insert into two-level PEL. If 'overwrite' is true, an existing + * entry's value is replaced; otherwise the insert is skipped when the key + * already exists. 
Returns 1 if a new entry was created, 0 otherwise. */ +static int pelGenericInsert(rax *pel, streamID *id, streamNACK *nack, uint64_t *count, int overwrite) { unsigned char keybuf[PEL_RAX_KEY_LEN]; pelEncodeRaxKey(keybuf, id->ms, id->seq); flax *prev; @@ -172,33 +171,24 @@ int pelInsert(rax *pel, streamID *id, streamNACK *nack, uint64_t *count) { if (pel->alloc_size) *pel->alloc_size -= before - flaxAllocSize(prev); } size_t before = flaxAllocSize(f); - void *old; - flaxInsert(f, pelFlaxKey(id->seq), nack, &old); + int inserted = overwrite ? flaxInsert(f, pelFlaxKey(id->seq), nack, NULL) + : flaxTryInsert(f, pelFlaxKey(id->seq), nack, NULL); if (pel->alloc_size) *pel->alloc_size += flaxAllocSize(f) - before; - if (old == NULL) { - if (count) (*count)++; - return 1; - } - return 0; + if (inserted && count) (*count)++; + return inserted; } -/* Insert only if not present. Returns 1 if inserted, 0 if key already exists. */ +/* Overwriting insert. Just a wrapper for pelGenericInsert() that will + * update the element if there is already one for the same key. */ +int pelInsert(rax *pel, streamID *id, streamNACK *nack, uint64_t *count) { + return pelGenericInsert(pel, id, nack, count, 1); +} + +/* Non overwriting insert function: if an element with the same key + * exists, the value is not updated and the function returns 0. + * This is just a wrapper for pelGenericInsert(). 
*/ int pelTryInsert(rax *pel, streamID *id, streamNACK *nack, uint64_t *count) { - unsigned char keybuf[PEL_RAX_KEY_LEN]; - pelEncodeRaxKey(keybuf, id->ms, id->seq); - flax *prev; - flax *f = pelResolveFlax(pel, keybuf, 1, &prev); - if (prev && prev != f) { - size_t before = flaxAllocSize(prev); - flaxShrink(prev); - if (pel->alloc_size) *pel->alloc_size -= before - flaxAllocSize(prev); - } - size_t before = flaxAllocSize(f); - int inserted = flaxTryInsert(f, pelFlaxKey(id->seq), nack, NULL); - if (pel->alloc_size) *pel->alloc_size += flaxAllocSize(f) - before; - if (!inserted) return 0; - if (count) (*count)++; - return 1; + return pelGenericInsert(pel, id, nack, count, 0); } /* Replace the NACK pointer for an existing entry without cache interaction or From 613e63a3ce5ce3bc360abff9e979ac8f4a430f45 Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Wed, 1 Apr 2026 18:01:46 +0300 Subject: [PATCH 21/48] fixed: issue from review --- src/aof.c | 25 +++--- src/rdb.c | 54 ++++++------ src/stream.h | 2 +- src/t_stream.c | 224 ++++++++++++++++++++++++------------------------- 4 files changed, 151 insertions(+), 154 deletions(-) diff --git a/src/aof.c b/src/aof.c index 5c0496ad2..51dc98db5 100644 --- a/src/aof.c +++ b/src/aof.c @@ -2338,19 +2338,18 @@ int rewriteStreamObject(rio *r, robj *key, robj *o) { * to emit the XCLAIM protocol. 
*/ pelIterator pi; pelIterStart(&pi,consumer->pel); - if (pelIterSeek(&pi,"^",NULL)) { - do { - streamNACK *nack = pi.nack; - if (rioWriteStreamPendingEntry(r,key,(char*)ri.key, - ri.key_len,consumer, - pi.rawkey,nack) == 0) - { - pelIterStop(&pi); - raxStop(&ri_cons); - raxStop(&ri); - return 0; - } - } while (pelIterNext(&pi)); + pelIterSeek(&pi,"^",NULL); + while (pelIterNext(&pi)) { + streamNACK *nack = pi.nack; + if (rioWriteStreamPendingEntry(r,key,(char*)ri.key, + ri.key_len,consumer, + pi.rawkey,nack) == 0) + { + pelIterStop(&pi); + raxStop(&ri_cons); + raxStop(&ri); + return 0; + } } pelIterStop(&pi); } diff --git a/src/rdb.c b/src/rdb.c index 38ad5945b..e5f1de35e 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -747,30 +747,29 @@ ssize_t rdbSaveStreamPEL(rio *rdb, rax *pel, uint64_t pel_count, int nacks) { /* Save each entry. */ pelIterator pi; pelIterStart(&pi,pel); - if (pelIterSeek(&pi,"^",NULL)) { - do { - /* We store IDs in raw form as 128 big big endian numbers, - * reconstructed from the two-level structure. */ - if ((n = rdbWriteRaw(rdb,pi.rawkey,sizeof(streamID))) == -1) { + pelIterSeek(&pi,"^",NULL); + while (pelIterNext(&pi)) { + /* We store IDs in raw form as 128 big big endian numbers, + * reconstructed from the two-level structure. 
*/ + if ((n = rdbWriteRaw(rdb,pi.rawkey,sizeof(streamID))) == -1) { + pelIterStop(&pi); + return -1; + } + nwritten += n; + + if (nacks) { + streamNACK *nack = pi.nack; + if ((n = rdbSaveMillisecondTime(rdb,nack->delivery_time)) == -1) { pelIterStop(&pi); return -1; } nwritten += n; - - if (nacks) { - streamNACK *nack = pi.nack; - if ((n = rdbSaveMillisecondTime(rdb,nack->delivery_time)) == -1) { - pelIterStop(&pi); - return -1; - } - nwritten += n; - if ((n = rdbSaveLen(rdb,nack->delivery_count)) == -1) { - pelIterStop(&pi); - return -1; - } - nwritten += n; + if ((n = rdbSaveLen(rdb,nack->delivery_count)) == -1) { + pelIterStop(&pi); + return -1; } - } while (pelIterNext(&pi)); + nwritten += n; + } } pelIterStop(&pi); return nwritten; @@ -3389,15 +3388,14 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) if (deep_integrity_validation) { pelIterator pi_cg; pelIterStart(&pi_cg,cgroup->pel); - if (pelIterSeek(&pi_cg,"^",NULL)) { - do { - if (!pi_cg.nack->consumer) { - pelIterStop(&pi_cg); - rdbReportCorruptRDB("Stream CG PEL entry without consumer"); - decrRefCount(o); - return NULL; - } - } while (pelIterNext(&pi_cg)); + pelIterSeek(&pi_cg,"^",NULL); + while (pelIterNext(&pi_cg)) { + if (!pi_cg.nack->consumer) { + pelIterStop(&pi_cg); + rdbReportCorruptRDB("Stream CG PEL entry without consumer"); + decrRefCount(o); + return NULL; + } } pelIterStop(&pi_cg); } diff --git a/src/stream.h b/src/stream.h index 39fe1bb32..c83270389 100644 --- a/src/stream.h +++ b/src/stream.h @@ -206,6 +206,7 @@ typedef struct pelIterator { raxIterator ri; flaxIterator fi; int valid; + int just_seeked; streamID id; streamNACK *nack; unsigned char rawkey[sizeof(streamID)]; @@ -236,7 +237,6 @@ streamNACK *pelRemove(rax *pel, streamID *id, uint64_t *count); void pelIterStart(pelIterator *pi, rax *pel); int pelIterSeek(pelIterator *pi, const char *op, streamID *id); int pelIterNext(pelIterator *pi); -int pelIterReseek(pelIterator *pi, streamID *id); void 
pelIterStop(pelIterator *pi); /* PEL time list management (used by RDB loading) */ diff --git a/src/t_stream.c b/src/t_stream.c index 0750e3742..8ed9eafde 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -248,6 +248,7 @@ static void pelIterRefresh(pelIterator *pi) { void pelIterStart(pelIterator *pi, rax *pel) { raxStart(&pi->ri, pel); pi->valid = 0; + pi->just_seeked = 0; memset(&pi->fi, 0, sizeof(pi->fi)); memset(&pi->id, 0, sizeof(pi->id)); pi->nack = NULL; @@ -255,12 +256,14 @@ void pelIterStart(pelIterator *pi, rax *pel) { int pelIterSeek(pelIterator *pi, const char *op, streamID *id) { pi->valid = 0; + pi->just_seeked = 0; if (op[0] == '^') { raxSeek(&pi->ri, "^", NULL, 0); if (!raxNext(&pi->ri)) return 0; flaxStart(&pi->fi, (flax *)pi->ri.data); if (!flaxSeek(&pi->fi, "^", 0)) return 0; pelIterRefresh(pi); + pi->just_seeked = 1; return 1; } else if (op[0] == '$') { raxSeek(&pi->ri, "$", NULL, 0); @@ -268,6 +271,7 @@ int pelIterSeek(pelIterator *pi, const char *op, streamID *id) { flaxStart(&pi->fi, (flax *)pi->ri.data); if (!flaxSeek(&pi->fi, "$", 0)) return 0; pelIterRefresh(pi); + pi->just_seeked = 1; return 1; } else if (op[0] == '>' && op[1] == '=') { unsigned char keybuf[PEL_RAX_KEY_LEN]; @@ -283,6 +287,7 @@ int pelIterSeek(pelIterator *pi, const char *op, streamID *id) { flaxStart(&pi->fi, (flax *)pi->ri.data); if (!flaxSeek(&pi->fi, "^", 0)) return 0; pelIterRefresh(pi); + pi->just_seeked = 1; return 1; } @@ -305,12 +310,17 @@ int pelIterSeek(pelIterator *pi, const char *op, streamID *id) { if (!flaxSeek(&pi->fi, "^", 0)) return 0; } pelIterRefresh(pi); + pi->just_seeked = 1; return 1; } return 0; } int pelIterNext(pelIterator *pi) { + if (pi->just_seeked) { + pi->just_seeked = 0; + return pi->valid; + } if (!pi->valid) return 0; if (flaxNext(&pi->fi)) { pelIterRefresh(pi); @@ -328,11 +338,6 @@ int pelIterNext(pelIterator *pi) { return 1; } -/* Re-seek to >= id after a mid-iteration remove (XAUTOCLAIM pattern). 
*/ -int pelIterReseek(pelIterator *pi, streamID *id) { - return pelIterSeek(pi, ">=", id); -} - void pelIterStop(pelIterator *pi) { raxStop(&pi->ri); pi->valid = 0; @@ -566,18 +571,17 @@ robj *streamDup(robj *o) { /* Consumer Group PEL */ pelIterator pi_cg; pelIterStart(&pi_cg, cg->pel); - if (pelIterSeek(&pi_cg, "^", NULL)) { - do { - streamNACK *nack = pi_cg.nack; - streamID nack_id = pi_cg.id; - streamNACK *new_nack = streamCreateNACK(new_s, NULL, &nack_id); - new_nack->delivery_time = nack->delivery_time; - new_nack->delivery_count = nack->delivery_count; - new_nack->cgroup_ref_node = streamLinkCGroupToEntry(new_s, new_cg, pi_cg.rawkey); - pelInsert(new_cg->pel, &nack_id, new_nack, &new_cg->pel_count); + pelIterSeek(&pi_cg, "^", NULL); + while (pelIterNext(&pi_cg)) { + streamNACK *nack = pi_cg.nack; + streamID nack_id = pi_cg.id; + streamNACK *new_nack = streamCreateNACK(new_s, NULL, &nack_id); + new_nack->delivery_time = nack->delivery_time; + new_nack->delivery_count = nack->delivery_count; + new_nack->cgroup_ref_node = streamLinkCGroupToEntry(new_s, new_cg, pi_cg.rawkey); + pelInsert(new_cg->pel, &nack_id, new_nack, &new_cg->pel_count); - pelListInsertSorted(new_cg, new_nack); - } while (pelIterNext(&pi_cg)); + pelListInsertSorted(new_cg, new_nack); } pelIterStop(&pi_cg); @@ -603,14 +607,13 @@ robj *streamDup(robj *o) { /* Consumer PEL */ pelIterator pi_cpel; pelIterStart(&pi_cpel, consumer->pel); - if (pelIterSeek(&pi_cpel, "^", NULL)) { - do { - streamID cpel_id = pi_cpel.id; - streamNACK *new_nack = pelFind(new_cg->pel, &cpel_id); - serverAssert(new_nack); - new_nack->consumer = new_consumer; - pelInsert(new_consumer->pel, &cpel_id, new_nack, &new_consumer->pel_count); - } while (pelIterNext(&pi_cpel)); + pelIterSeek(&pi_cpel, "^", NULL); + while (pelIterNext(&pi_cpel)) { + streamID cpel_id = pi_cpel.id; + streamNACK *new_nack = pelFind(new_cg->pel, &cpel_id); + serverAssert(new_nack); + new_nack->consumer = new_consumer; + pelInsert(new_consumer->pel, 
&cpel_id, new_nack, &new_consumer->pel_count); } pelIterStop(&pi_cpel); } @@ -2543,27 +2546,26 @@ size_t streamReplyWithRangeFromConsumerPEL(client *c, stream *s, streamID *start void *arraylen_ptr = addReplyDeferredLen(c); pelIterator pi; pelIterStart(&pi, consumer->pel); - if (pelIterSeek(&pi, ">=", start)) { - do { - if (end && streamCompareID(&pi.id, end) > 0) break; - if (!count || arraylen < count) { - streamID thisid = pi.id; - if (streamReplyWithRange(c,s,&thisid,&thisid,1,0,-1,NULL,NULL, - STREAM_RWR_RAWENTRIES,NULL,NULL) == 0) - { - addReplyArrayLen(c,2); - addReplyStreamID(c,&thisid); - addReplyNullArray(c); - } else { - streamNACK *nack = pi.nack; - nack->delivery_count++; - pelListUpdate(group, nack, commandTimeSnapshot()); - } - arraylen++; + pelIterSeek(&pi, ">=", start); + while (pelIterNext(&pi)) { + if (end && streamCompareID(&pi.id, end) > 0) break; + if (!count || arraylen < count) { + streamID thisid = pi.id; + if (streamReplyWithRange(c,s,&thisid,&thisid,1,0,-1,NULL,NULL, + STREAM_RWR_RAWENTRIES,NULL,NULL) == 0) + { + addReplyArrayLen(c,2); + addReplyStreamID(c,&thisid); + addReplyNullArray(c); } else { - break; + streamNACK *nack = pi.nack; + nack->delivery_count++; + pelListUpdate(group, nack, commandTimeSnapshot()); } - } while (pelIterNext(&pi)); + arraylen++; + } else { + break; + } } pelIterStop(&pi); setDeferredArrayLen(c,arraylen_ptr,arraylen); @@ -3595,10 +3597,9 @@ void streamDestroyCG(stream *s, streamCG *cg) { /* Remove all references from the cgroups_ref. */ pelIterator pi; pelIterStart(&pi, cg->pel); - if (pelIterSeek(&pi, "^", NULL)) { - do { - streamUnlinkEntryFromCGroupRef(s, pi.nack, pi.rawkey); - } while (pelIterNext(&pi)); + pelIterSeek(&pi, "^", NULL); + while (pelIterNext(&pi)) { + streamUnlinkEntryFromCGroupRef(s, pi.nack, pi.rawkey); } pelIterStop(&pi); @@ -3667,16 +3668,15 @@ void streamDelConsumer(stream *s, streamCG *cg, streamConsumer *consumer) { * entry from the global entry. 
*/ pelIterator pi; pelIterStart(&pi, consumer->pel); - if (pelIterSeek(&pi, "^", NULL)) { - do { - streamNACK *nack = pi.nack; - streamUnlinkEntryFromCGroupRef(s, nack, pi.rawkey); + pelIterSeek(&pi, "^", NULL); + while (pelIterNext(&pi)) { + streamNACK *nack = pi.nack; + streamUnlinkEntryFromCGroupRef(s, nack, pi.rawkey); - pelListUnlink(cg, nack); - pelRemove(cg->pel, &pi.id, &cg->pel_count); + pelListUnlink(cg, nack); + pelRemove(cg->pel, &pi.id, &cg->pel_count); - streamFreeNACK(s, nack); - } while (pelIterNext(&pi)); + streamFreeNACK(s, nack); } pelIterStop(&pi); @@ -4300,12 +4300,14 @@ void xpendingCommand(client *c) { pelIterator pi; pelIterStart(&pi,group->pel); pelIterSeek(&pi,"^",NULL); + pelIterNext(&pi); addReplyStreamID(c,&pi.id); /* End. */ pelIterStop(&pi); pelIterStart(&pi,group->pel); pelIterSeek(&pi,"$",NULL); + pelIterNext(&pi); addReplyStreamID(c,&pi.id); pelIterStop(&pi); @@ -4347,35 +4349,34 @@ void xpendingCommand(client *c) { void *arraylen_ptr = addReplyDeferredLen(c); size_t arraylen = 0; - if (pelIterSeek(&pi, ">=", &startid)) { - do { - if (streamCompareID(&pi.id, &endid) > 0) break; - streamNACK *nack = pi.nack; + pelIterSeek(&pi, ">=", &startid); + while (count && pelIterNext(&pi)) { + if (streamCompareID(&pi.id, &endid) > 0) break; + streamNACK *nack = pi.nack; - if (minidle) { - mstime_t this_idle = now - nack->delivery_time; - if (this_idle < minidle) continue; - } + if (minidle) { + mstime_t this_idle = now - nack->delivery_time; + if (this_idle < minidle) continue; + } - arraylen++; - count--; - addReplyArrayLen(c,4); + arraylen++; + count--; + addReplyArrayLen(c,4); - /* Entry ID. */ - addReplyStreamID(c,&pi.id); + /* Entry ID. */ + addReplyStreamID(c,&pi.id); - /* Consumer name. */ - addReplyBulkCBuffer(c,nack->consumer->name, - sdslen(nack->consumer->name)); + /* Consumer name. */ + addReplyBulkCBuffer(c,nack->consumer->name, + sdslen(nack->consumer->name)); - /* Milliseconds elapsed since last delivery. 
*/ - mstime_t elapsed = now - nack->delivery_time; - if (elapsed < 0) elapsed = 0; - addReplyLongLong(c,elapsed); + /* Milliseconds elapsed since last delivery. */ + mstime_t elapsed = now - nack->delivery_time; + if (elapsed < 0) elapsed = 0; + addReplyLongLong(c,elapsed); - /* Number of deliveries. */ - addReplyLongLong(c,nack->delivery_count); - } while (count && pelIterNext(&pi)); + /* Number of deliveries. */ + addReplyLongLong(c,nack->delivery_count); } pelIterStop(&pi); setDeferredArrayLen(c,arraylen_ptr,arraylen); @@ -4760,7 +4761,8 @@ void xautoclaimCommand(client *c) { size_t arraylen = 0; mstime_t now = commandTimeSnapshot(); int deleted_id_num = 0; - int has_entry = pelIterSeek(&pi, ">=", &startid); + pelIterSeek(&pi, ">=", &startid); + int has_entry = pelIterNext(&pi); while (attempts-- && count && has_entry) { streamNACK *nack = pi.nack; streamID id = pi.id; @@ -4781,7 +4783,8 @@ void xautoclaimCommand(client *c) { streamDestroyNACK(s, nack, rawkey); /* Remember the ID for later */ deleted_ids[deleted_id_num++] = id; - has_entry = pelIterReseek(&pi, &id); + pelIterSeek(&pi, ">=", &id); + has_entry = pelIterNext(&pi); count--; /* Count is a limit of the command response size. */ continue; } @@ -4832,8 +4835,7 @@ void xautoclaimCommand(client *c) { has_entry = pelIterNext(&pi); } - /* The cursor for the next XAUTOCLAIM call is whatever pi currently points to. - * After the loop, pi is already on the next unprocessed entry (or invalid). */ + /* After the loop, pi is already on the next unprocessed entry (or invalid). 
*/ if (server.memory_tracking_enabled) updateSlotAllocSize(c->db,getKeySlot(c->argv[1]->ptr),o,old_alloc,kvobjAllocSize(o)); @@ -5242,28 +5244,27 @@ void xinfoReplyWithStreamInfo(client *c, kvobj *kv) { void *arrayptr_cg_pel = addReplyDeferredLen(c); pelIterator pi_cg_pel; pelIterStart(&pi_cg_pel,cg->pel); - if (pelIterSeek(&pi_cg_pel,"^",NULL)) { - do { - if (count && arraylen_cg_pel >= count) break; - streamNACK *nack = pi_cg_pel.nack; - addReplyArrayLen(c,4); + pelIterSeek(&pi_cg_pel,"^",NULL); + while (pelIterNext(&pi_cg_pel)) { + if (count && arraylen_cg_pel >= count) break; + streamNACK *nack = pi_cg_pel.nack; + addReplyArrayLen(c,4); - /* Entry ID. */ - addReplyStreamID(c,&pi_cg_pel.id); + /* Entry ID. */ + addReplyStreamID(c,&pi_cg_pel.id); - /* Consumer name. */ - serverAssert(nack->consumer); - addReplyBulkCBuffer(c,nack->consumer->name, - sdslen(nack->consumer->name)); + /* Consumer name. */ + serverAssert(nack->consumer); + addReplyBulkCBuffer(c,nack->consumer->name, + sdslen(nack->consumer->name)); - /* Last delivery. */ - addReplyLongLong(c,nack->delivery_time); + /* Last delivery. */ + addReplyLongLong(c,nack->delivery_time); - /* Number of deliveries. */ - addReplyLongLong(c,nack->delivery_count); + /* Number of deliveries. */ + addReplyLongLong(c,nack->delivery_count); - arraylen_cg_pel++; - } while (pelIterNext(&pi_cg_pel)); + arraylen_cg_pel++; } setDeferredArrayLen(c,arrayptr_cg_pel,arraylen_cg_pel); pelIterStop(&pi_cg_pel); @@ -5300,23 +5301,22 @@ void xinfoReplyWithStreamInfo(client *c, kvobj *kv) { void *arrayptr_cpel = addReplyDeferredLen(c); pelIterator pi_cpel; pelIterStart(&pi_cpel,consumer->pel); - if (pelIterSeek(&pi_cpel,"^",NULL)) { - do { - if (count && arraylen_cpel >= count) break; - streamNACK *nack = pi_cpel.nack; - addReplyArrayLen(c,3); + pelIterSeek(&pi_cpel,"^",NULL); + while (pelIterNext(&pi_cpel)) { + if (count && arraylen_cpel >= count) break; + streamNACK *nack = pi_cpel.nack; + addReplyArrayLen(c,3); - /* Entry ID. 
*/ - addReplyStreamID(c,&pi_cpel.id); + /* Entry ID. */ + addReplyStreamID(c,&pi_cpel.id); - /* Last delivery. */ - addReplyLongLong(c,nack->delivery_time); + /* Last delivery. */ + addReplyLongLong(c,nack->delivery_time); - /* Number of deliveries. */ - addReplyLongLong(c,nack->delivery_count); + /* Number of deliveries. */ + addReplyLongLong(c,nack->delivery_count); - arraylen_cpel++; - } while (pelIterNext(&pi_cpel)); + arraylen_cpel++; } setDeferredArrayLen(c,arrayptr_cpel,arraylen_cpel); pelIterStop(&pi_cpel); From b57f3d2696a075641c6a6a452b81e554812d1511 Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Wed, 1 Apr 2026 18:20:10 +0300 Subject: [PATCH 22/48] fixed: issue from review --- src/rdb.c | 4 ++-- src/stream.h | 2 +- src/t_stream.c | 39 +++++++++++++++++++++------------------ 3 files changed, 24 insertions(+), 21 deletions(-) diff --git a/src/rdb.c b/src/rdb.c index e5f1de35e..25829075f 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -3362,8 +3362,8 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) } streamID nack_id; streamDecodeID(rawid, &nack_id); - streamNACK *nack = pelFind(cgroup->pel, &nack_id); - if (!nack) { + streamNACK *nack; + if (!pelFind(cgroup->pel, &nack_id, &nack)) { rdbReportCorruptRDB("Consumer entry not found in " "group global PEL"); decrRefCount(o); diff --git a/src/stream.h b/src/stream.h index c83270389..13aa1c0dd 100644 --- a/src/stream.h +++ b/src/stream.h @@ -231,7 +231,7 @@ void pelFreeShallow(rax *pel); int pelInsert(rax *pel, streamID *id, streamNACK *nack, uint64_t *count); int pelTryInsert(rax *pel, streamID *id, streamNACK *nack, uint64_t *count); void pelReplace(rax *pel, streamID *id, streamNACK *nack); -streamNACK *pelFind(rax *pel, streamID *id); +int pelFind(rax *pel, streamID *id, streamNACK **nack); streamNACK *pelRemove(rax *pel, streamID *id, uint64_t *count); void pelIterStart(pelIterator *pi, rax *pel); diff --git a/src/t_stream.c b/src/t_stream.c index 8ed9eafde..340adae8d 100644 --- 
a/src/t_stream.c +++ b/src/t_stream.c @@ -204,15 +204,16 @@ void pelReplace(rax *pel, streamID *id, streamNACK *nack) { flaxInsert(f, pelFlaxKey(id->seq), nack, NULL); } -/* Find a NACK by streamID. Returns NULL if not found. */ -streamNACK *pelFind(rax *pel, streamID *id) { +/* Find a NACK by streamID. Returns 1 if found (setting *nack), 0 if not. */ +int pelFind(rax *pel, streamID *id, streamNACK **nack) { unsigned char keybuf[PEL_RAX_KEY_LEN]; pelEncodeRaxKey(keybuf, id->ms, id->seq); flax *f = pelResolveFlax(pel, keybuf, 0, NULL); - if (!f) return NULL; + if (!f) return 0; void *val; - if (!flaxFind(f, pelFlaxKey(id->seq), &val)) return NULL; - return (streamNACK *)val; + if (!flaxFind(f, pelFlaxKey(id->seq), &val)) return 0; + if (nack) *nack = (streamNACK *)val; + return 1; } /* Remove a NACK by streamID. Returns the removed NACK or NULL. */ @@ -610,8 +611,9 @@ robj *streamDup(robj *o) { pelIterSeek(&pi_cpel, "^", NULL); while (pelIterNext(&pi_cpel)) { streamID cpel_id = pi_cpel.id; - streamNACK *new_nack = pelFind(new_cg->pel, &cpel_id); - serverAssert(new_nack); + streamNACK *new_nack; + int found = pelFind(new_cg->pel, &cpel_id, &new_nack); + serverAssert(found); new_nack->consumer = new_consumer; pelInsert(new_consumer->pel, &cpel_id, new_nack, &new_consumer->pel_count); } @@ -2479,8 +2481,8 @@ size_t streamReplyWithRange(client *c, stream *s, streamID *start, streamID *end * or update it if the consumer is the same as before. 
*/ if (group_inserted == 0) { streamFreeNACK(s,nack); - nack = pelFind(group->pel, &id); - serverAssert(nack); + int found = pelFind(group->pel, &id, &nack); + serverAssert(found); /* Only transfer between consumers if they're different */ if (nack->consumer != consumer) { pelRemove(nack->consumer->pel, &id, &nack->consumer->pel_count); @@ -3421,8 +3423,8 @@ void streamCleanupEntryCGroupRefs(stream *s, streamID *id) { streamNACK *nack; streamCG *group = listNodeValue(ln); - nack = pelFind(group->pel, id); - serverAssert(nack); + int found = pelFind(group->pel, id, &nack); + serverAssert(found); pelListUnlink(group, nack); pelRemove(group->pel, id, &group->pel_count); @@ -4067,8 +4069,8 @@ void xackCommand(client *c) { /* Lookup the ID in the group PEL: it will have a reference to the * NACK structure that will have a reference to the consumer, so that * we are able to remove the entry from both PELs. */ - streamNACK *nack = pelFind(group->pel, &ids[j-3]); - if (nack) { + streamNACK *nack; + if (pelFind(group->pel, &ids[j-3], &nack)) { pelListUnlink(group, nack); pelRemove(group->pel, &ids[j-3], &group->pel_count); pelRemove(nack->consumer->pel, &ids[j-3], &nack->consumer->pel_count); @@ -4142,8 +4144,8 @@ void xackdelCommand(client *c) { /* Lookup the ID in the group PEL: it will have a reference to the * NACK structure that will have a reference to the consumer, so that * we are able to remove the entry from both PELs. */ - streamNACK *nack = pelFind(group->pel, id); - if (nack) { + streamNACK *nack; + if (pelFind(group->pel, id, &nack)) { pelListUnlink(group, nack); pelRemove(group->pel, id, &group->pel_count); pelRemove(nack->consumer->pel, id, &nack->consumer->pel_count); @@ -4567,12 +4569,13 @@ void xclaimCommand(client *c) { streamEncodeID(buf,&id); /* Lookup the ID in the group PEL. 
*/ - streamNACK *nack = pelFind(group->pel, &id); + streamNACK *nack = NULL; + int nack_found = pelFind(group->pel, &id, &nack); /* Item must exist for us to transfer it to another consumer. */ if (!streamEntryExists(s,&id)) { /* Clear this entry from the PEL, it no longer exists */ - if (nack != NULL) { + if (nack_found) { /* Propagate this change (we are going to delete the NACK). */ streamPropagateXCLAIM(c,c->argv[1],group,c->argv[2],c->argv[j],nack); propagate_last_id = 0; /* Will be propagated by XCLAIM itself. */ @@ -4591,7 +4594,7 @@ void xclaimCommand(client *c) { * entry in the PEL from scratch, so that XCLAIM can also * be used to create entries in the PEL. Useful for AOF * and replication of consumer groups. */ - if (force && nack == NULL) { + if (force && !nack_found) { /* Create the NACK. */ nack = streamCreateNACK(s, NULL, &id); pelInsert(group->pel, &id, nack, &group->pel_count); From 257de5995739a388dab7d9b363d7ef15f433342c Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Wed, 1 Apr 2026 18:46:43 +0300 Subject: [PATCH 23/48] fixed: issue from review --- src/t_stream.c | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/src/t_stream.c b/src/t_stream.c index 340adae8d..36a3d2e37 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -575,12 +575,11 @@ robj *streamDup(robj *o) { pelIterSeek(&pi_cg, "^", NULL); while (pelIterNext(&pi_cg)) { streamNACK *nack = pi_cg.nack; - streamID nack_id = pi_cg.id; - streamNACK *new_nack = streamCreateNACK(new_s, NULL, &nack_id); + streamNACK *new_nack = streamCreateNACK(new_s, NULL, &pi_cg.id); new_nack->delivery_time = nack->delivery_time; new_nack->delivery_count = nack->delivery_count; new_nack->cgroup_ref_node = streamLinkCGroupToEntry(new_s, new_cg, pi_cg.rawkey); - pelInsert(new_cg->pel, &nack_id, new_nack, &new_cg->pel_count); + pelInsert(new_cg->pel, &pi_cg.id, new_nack, &new_cg->pel_count); pelListInsertSorted(new_cg, new_nack); } @@ -610,12 +609,11 @@ robj 
*streamDup(robj *o) { pelIterStart(&pi_cpel, consumer->pel); pelIterSeek(&pi_cpel, "^", NULL); while (pelIterNext(&pi_cpel)) { - streamID cpel_id = pi_cpel.id; streamNACK *new_nack; - int found = pelFind(new_cg->pel, &cpel_id, &new_nack); + int found = pelFind(new_cg->pel, &pi_cpel.id, &new_nack); serverAssert(found); new_nack->consumer = new_consumer; - pelInsert(new_consumer->pel, &cpel_id, new_nack, &new_consumer->pel_count); + pelInsert(new_consumer->pel, &pi_cpel.id, new_nack, &new_consumer->pel_count); } pelIterStop(&pi_cpel); } @@ -2552,12 +2550,11 @@ size_t streamReplyWithRangeFromConsumerPEL(client *c, stream *s, streamID *start while (pelIterNext(&pi)) { if (end && streamCompareID(&pi.id, end) > 0) break; if (!count || arraylen < count) { - streamID thisid = pi.id; - if (streamReplyWithRange(c,s,&thisid,&thisid,1,0,-1,NULL,NULL, + if (streamReplyWithRange(c,s,&pi.id,&pi.id,1,0,-1,NULL,NULL, STREAM_RWR_RAWENTRIES,NULL,NULL) == 0) { addReplyArrayLen(c,2); - addReplyStreamID(c,&thisid); + addReplyStreamID(c,&pi.id); addReplyNullArray(c); } else { streamNACK *nack = pi.nack; @@ -4306,8 +4303,6 @@ void xpendingCommand(client *c) { addReplyStreamID(c,&pi.id); /* End. */ - pelIterStop(&pi); - pelIterStart(&pi,group->pel); pelIterSeek(&pi,"$",NULL); pelIterNext(&pi); addReplyStreamID(c,&pi.id); @@ -4348,12 +4343,11 @@ void xpendingCommand(client *c) { pelIterator pi; pelIterStart(&pi, pel); + pelIterSeek(&pi, ">=", &startid); void *arraylen_ptr = addReplyDeferredLen(c); size_t arraylen = 0; - pelIterSeek(&pi, ">=", &startid); - while (count && pelIterNext(&pi)) { - if (streamCompareID(&pi.id, &endid) > 0) break; + while (count && pelIterNext(&pi) && streamCompareID(&pi.id, &endid) <= 0) { streamNACK *nack = pi.nack; if (minidle) { @@ -4570,12 +4564,12 @@ void xclaimCommand(client *c) { /* Lookup the ID in the group PEL. 
*/ streamNACK *nack = NULL; - int nack_found = pelFind(group->pel, &id, &nack); + pelFind(group->pel, &id, &nack); /* Item must exist for us to transfer it to another consumer. */ if (!streamEntryExists(s,&id)) { /* Clear this entry from the PEL, it no longer exists */ - if (nack_found) { + if (nack != NULL) { /* Propagate this change (we are going to delete the NACK). */ streamPropagateXCLAIM(c,c->argv[1],group,c->argv[2],c->argv[j],nack); propagate_last_id = 0; /* Will be propagated by XCLAIM itself. */ @@ -4594,7 +4588,7 @@ void xclaimCommand(client *c) { * entry in the PEL from scratch, so that XCLAIM can also * be used to create entries in the PEL. Useful for AOF * and replication of consumer groups. */ - if (force && !nack_found) { + if (force && nack == NULL) { /* Create the NACK. */ nack = streamCreateNACK(s, NULL, &id); pelInsert(group->pel, &id, nack, &group->pel_count); From 342a650ee0605a4a05009e1901dff3df47c010e6 Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Thu, 2 Apr 2026 09:17:31 +0300 Subject: [PATCH 24/48] fixed: issue from review --- src/t_stream.c | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/src/t_stream.c b/src/t_stream.c index 36a3d2e37..cbd94e9ab 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -573,7 +573,7 @@ robj *streamDup(robj *o) { pelIterator pi_cg; pelIterStart(&pi_cg, cg->pel); pelIterSeek(&pi_cg, "^", NULL); - while (pelIterNext(&pi_cg)) { + while(pelIterNext(&pi_cg)) { streamNACK *nack = pi_cg.nack; streamNACK *new_nack = streamCreateNACK(new_s, NULL, &pi_cg.id); new_nack->delivery_time = nack->delivery_time; @@ -608,7 +608,7 @@ robj *streamDup(robj *o) { pelIterator pi_cpel; pelIterStart(&pi_cpel, consumer->pel); pelIterSeek(&pi_cpel, "^", NULL); - while (pelIterNext(&pi_cpel)) { + while(pelIterNext(&pi_cpel)) { streamNACK *new_nack; int found = pelFind(new_cg->pel, &pi_cpel.id, &new_nack); serverAssert(found); @@ -2547,24 +2547,20 @@ size_t 
streamReplyWithRangeFromConsumerPEL(client *c, stream *s, streamID *start pelIterator pi; pelIterStart(&pi, consumer->pel); pelIterSeek(&pi, ">=", start); - while (pelIterNext(&pi)) { + while (pelIterNext(&pi) && (!count || arraylen < count)) { if (end && streamCompareID(&pi.id, end) > 0) break; - if (!count || arraylen < count) { - if (streamReplyWithRange(c,s,&pi.id,&pi.id,1,0,-1,NULL,NULL, - STREAM_RWR_RAWENTRIES,NULL,NULL) == 0) - { - addReplyArrayLen(c,2); - addReplyStreamID(c,&pi.id); - addReplyNullArray(c); - } else { - streamNACK *nack = pi.nack; - nack->delivery_count++; - pelListUpdate(group, nack, commandTimeSnapshot()); - } - arraylen++; + if (streamReplyWithRange(c,s,&pi.id,&pi.id,1,0,-1,NULL,NULL, + STREAM_RWR_RAWENTRIES,NULL,NULL) == 0) + { + addReplyArrayLen(c,2); + addReplyStreamID(c,&pi.id); + addReplyNullArray(c); } else { - break; + streamNACK *nack = pi.nack; + nack->delivery_count++; + pelListUpdate(group, nack, commandTimeSnapshot()); } + arraylen++; } pelIterStop(&pi); setDeferredArrayLen(c,arraylen_ptr,arraylen); From 756d3490e3fa28fc5878f6a08ef7ae2162bbb48c Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Thu, 2 Apr 2026 11:50:27 +0300 Subject: [PATCH 25/48] fixed: issue from review --- src/aof.c | 2 +- src/rdb.c | 11 +-- src/stream.h | 16 ++--- src/t_stream.c | 181 +++++++++++++++++++------------------------------ 4 files changed, 84 insertions(+), 126 deletions(-) diff --git a/src/aof.c b/src/aof.c index 51dc98db5..5ed0dfa7c 100644 --- a/src/aof.c +++ b/src/aof.c @@ -2340,7 +2340,7 @@ int rewriteStreamObject(rio *r, robj *key, robj *o) { pelIterStart(&pi,consumer->pel); pelIterSeek(&pi,"^",NULL); while (pelIterNext(&pi)) { - streamNACK *nack = pi.nack; + streamNACK *nack = pi.data; if (rioWriteStreamPendingEntry(r,key,(char*)ri.key, ri.key_len,consumer, pi.rawkey,nack) == 0) diff --git a/src/rdb.c b/src/rdb.c index 25829075f..341f7da84 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -758,7 +758,7 @@ ssize_t rdbSaveStreamPEL(rio *rdb, rax 
*pel, uint64_t pel_count, int nacks) { nwritten += n; if (nacks) { - streamNACK *nack = pi.nack; + streamNACK *nack = pi.data; if ((n = rdbSaveMillisecondTime(rdb,nack->delivery_time)) == -1) { pelIterStop(&pi); return -1; @@ -3280,7 +3280,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) streamNACK *nack = streamCreateNACK(s, NULL, &nack_id); nack->delivery_time = rdbLoadMillisecondTime(rdb,RDB_VERSION); nack->delivery_count = rdbLoadLen(rdb,NULL); - nack->cgroup_ref_node = streamLinkCGroupToEntry(s, cgroup, rawid); + nack->cgroup_ref_node = streamLinkCGroupToEntry(s, cgroup, &nack_id); if (rioGetReadError(rdb)) { rdbReportReadError("Stream PEL NACK loading failed."); streamFreeNACK(s, nack); @@ -3362,8 +3362,8 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) } streamID nack_id; streamDecodeID(rawid, &nack_id); - streamNACK *nack; - if (!pelFind(cgroup->pel, &nack_id, &nack)) { + void *val; + if (!pelFind(cgroup->pel, &nack_id, &val)) { rdbReportCorruptRDB("Consumer entry not found in " "group global PEL"); decrRefCount(o); @@ -3373,6 +3373,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) /* Set the NACK consumer, that was left to NULL when * loading the global PEL. Then set the same shared * NACK structure also in the consumer-specific PEL. 
*/ + streamNACK *nack = val; nack->consumer = consumer; if (!pelTryInsert(consumer->pel,&nack_id,nack,&consumer->pel_count)) { rdbReportCorruptRDB("Duplicated consumer PEL entry " @@ -3390,7 +3391,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) pelIterStart(&pi_cg,cgroup->pel); pelIterSeek(&pi_cg,"^",NULL); while (pelIterNext(&pi_cg)) { - if (!pi_cg.nack->consumer) { + if (!((streamNACK *)pi_cg.data)->consumer) { pelIterStop(&pi_cg); rdbReportCorruptRDB("Stream CG PEL entry without consumer"); decrRefCount(o); diff --git a/src/stream.h b/src/stream.h index 13aa1c0dd..e8a1fcc7f 100644 --- a/src/stream.h +++ b/src/stream.h @@ -181,7 +181,7 @@ streamNACK *streamCreateNACK(stream *s, streamConsumer *consumer, streamID *id); void streamDecodeID(void *buf, streamID *id); int streamCompareID(streamID *a, streamID *b); void streamFreeNACK(stream *s, streamNACK *na); -void streamDestroyNACK(stream *s, streamNACK *na, unsigned char *key); +void streamDestroyNACK(stream *s, streamNACK *na); int streamIncrID(streamID *id); int streamDecrID(streamID *id); void streamPropagateConsumerCreation(client *c, robj *key, robj *groupname, sds consumername); @@ -199,7 +199,7 @@ int streamEntryExists(stream *s, streamID *id); void streamKeyLoaded(redisDb *db, robj *key, robj *val); void streamKeyRemoved(redisDb *db, robj *key, robj *val); -listNode *streamLinkCGroupToEntry(stream *s, streamCG *cg, unsigned char *key); +listNode *streamLinkCGroupToEntry(stream *s, streamCG *cg, streamID *id); /* Two-level PEL iterator: walks outer rax (ms buckets) and inner flax (seq). 
*/ typedef struct pelIterator { @@ -208,7 +208,7 @@ typedef struct pelIterator { int valid; int just_seeked; streamID id; - streamNACK *nack; + void *data; unsigned char rawkey[sizeof(streamID)]; } pelIterator; @@ -228,11 +228,11 @@ static inline void pelCacheInvalidate(rax *pel) { rax *pelNew(size_t *alloc_size); void pelFree(rax *pel, void (*nack_free)(void *, void *), void *ctx); void pelFreeShallow(rax *pel); -int pelInsert(rax *pel, streamID *id, streamNACK *nack, uint64_t *count); -int pelTryInsert(rax *pel, streamID *id, streamNACK *nack, uint64_t *count); -void pelReplace(rax *pel, streamID *id, streamNACK *nack); -int pelFind(rax *pel, streamID *id, streamNACK **nack); -streamNACK *pelRemove(rax *pel, streamID *id, uint64_t *count); +int pelInsert(rax *pel, streamID *id, void *data, uint64_t *count); +int pelTryInsert(rax *pel, streamID *id, void *data, uint64_t *count); +void pelReplace(rax *pel, streamID *id, void *data); +int pelFind(rax *pel, streamID *id, void **data); +void *pelRemove(rax *pel, streamID *id, uint64_t *count); void pelIterStart(pelIterator *pi, rax *pel); int pelIterSeek(pelIterator *pi, const char *op, streamID *id); diff --git a/src/t_stream.c b/src/t_stream.c index cbd94e9ab..2d0323a5d 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -160,7 +160,7 @@ static flax *pelResolveFlax(rax *r, unsigned char *keybuf, /* Generic insert into two-level PEL. If 'overwrite' is true, an existing * entry's value is replaced; otherwise the insert is skipped when the key * already exists. Returns 1 if a new entry was created, 0 otherwise. 
*/ -static int pelGenericInsert(rax *pel, streamID *id, streamNACK *nack, uint64_t *count, int overwrite) { +static int pelGenericInsert(rax *pel, streamID *id, void *data, uint64_t *count, int overwrite) { unsigned char keybuf[PEL_RAX_KEY_LEN]; pelEncodeRaxKey(keybuf, id->ms, id->seq); flax *prev; @@ -171,8 +171,8 @@ static int pelGenericInsert(rax *pel, streamID *id, streamNACK *nack, uint64_t * if (pel->alloc_size) *pel->alloc_size -= before - flaxAllocSize(prev); } size_t before = flaxAllocSize(f); - int inserted = overwrite ? flaxInsert(f, pelFlaxKey(id->seq), nack, NULL) - : flaxTryInsert(f, pelFlaxKey(id->seq), nack, NULL); + int inserted = overwrite ? flaxInsert(f, pelFlaxKey(id->seq), data, NULL) + : flaxTryInsert(f, pelFlaxKey(id->seq), data, NULL); if (pel->alloc_size) *pel->alloc_size += flaxAllocSize(f) - before; if (inserted && count) (*count)++; return inserted; @@ -180,51 +180,50 @@ static int pelGenericInsert(rax *pel, streamID *id, streamNACK *nack, uint64_t * /* Overwriting insert. Just a wrapper for pelGenericInsert() that will * update the element if there is already one for the same key. */ -int pelInsert(rax *pel, streamID *id, streamNACK *nack, uint64_t *count) { - return pelGenericInsert(pel, id, nack, count, 1); +int pelInsert(rax *pel, streamID *id, void *data, uint64_t *count) { + return pelGenericInsert(pel, id, data, count, 1); } /* Non overwriting insert function: if an element with the same key * exists, the value is not updated and the function returns 0. * This is just a wrapper for pelGenericInsert(). */ -int pelTryInsert(rax *pel, streamID *id, streamNACK *nack, uint64_t *count) { - return pelGenericInsert(pel, id, nack, count, 0); +int pelTryInsert(rax *pel, streamID *id, void *data, uint64_t *count) { + return pelGenericInsert(pel, id, data, count, 0); } /* Replace the NACK pointer for an existing entry without cache interaction or * flax shrink side-effects. 
Intended for defrag, where the key is guaranteed * to exist and we must avoid allocations that would increase fragmentation. */ -void pelReplace(rax *pel, streamID *id, streamNACK *nack) { +void pelReplace(rax *pel, streamID *id, void *data) { unsigned char keybuf[PEL_RAX_KEY_LEN]; pelEncodeRaxKey(keybuf, id->ms, id->seq); - void *data; - int found = raxFind(pel, keybuf, PEL_RAX_KEY_LEN, &data); + void *raxval; + int found = raxFind(pel, keybuf, PEL_RAX_KEY_LEN, &raxval); serverAssert(found); - flax *f = (flax *)data; - flaxInsert(f, pelFlaxKey(id->seq), nack, NULL); + flax *f = (flax *)raxval; + flaxInsert(f, pelFlaxKey(id->seq), data, NULL); } -/* Find a NACK by streamID. Returns 1 if found (setting *nack), 0 if not. */ -int pelFind(rax *pel, streamID *id, streamNACK **nack) { +/* Find a value by streamID. Returns 1 if found (setting *data), 0 if not. */ +int pelFind(rax *pel, streamID *id, void **data) { unsigned char keybuf[PEL_RAX_KEY_LEN]; pelEncodeRaxKey(keybuf, id->ms, id->seq); flax *f = pelResolveFlax(pel, keybuf, 0, NULL); if (!f) return 0; void *val; if (!flaxFind(f, pelFlaxKey(id->seq), &val)) return 0; - if (nack) *nack = (streamNACK *)val; + if (data) *data = val; return 1; } -/* Remove a NACK by streamID. Returns the removed NACK or NULL. */ -streamNACK *pelRemove(rax *pel, streamID *id, uint64_t *count) { +/* Remove a value by streamID. Returns the removed value or NULL. 
*/ +void *pelRemove(rax *pel, streamID *id, uint64_t *count) { unsigned char keybuf[PEL_RAX_KEY_LEN]; pelEncodeRaxKey(keybuf, id->ms, id->seq); flax *f = pelResolveFlax(pel, keybuf, 0, NULL); if (!f) return NULL; void *old; if (!flaxRemove(f, pelFlaxKey(id->seq), &old)) return NULL; - streamNACK *nack = (streamNACK *)old; if (count) (*count)--; if (f->numele == 0) { if (pel->alloc_size) *pel->alloc_size -= flaxAllocSize(f); @@ -232,7 +231,7 @@ streamNACK *pelRemove(rax *pel, streamID *id, uint64_t *count) { raxRemove(pel, keybuf, PEL_RAX_KEY_LEN, NULL); pelCacheInvalidate(pel); } - return nack; + return old; } /* --- PEL Iterator --- */ @@ -242,7 +241,7 @@ static void pelIterRefresh(pelIterator *pi) { memcpy(pi->rawkey, pi->ri.key, PEL_RAX_KEY_LEN); pi->rawkey[PEL_RAX_KEY_LEN] = (unsigned char)pi->fi.key; streamDecodeID(pi->rawkey, &pi->id); - pi->nack = (streamNACK *)pi->fi.data; + pi->data = pi->fi.data; pi->valid = 1; } @@ -252,7 +251,7 @@ void pelIterStart(pelIterator *pi, rax *pel) { pi->just_seeked = 0; memset(&pi->fi, 0, sizeof(pi->fi)); memset(&pi->id, 0, sizeof(pi->id)); - pi->nack = NULL; + pi->data = NULL; } int pelIterSeek(pelIterator *pi, const char *op, streamID *id) { @@ -574,11 +573,11 @@ robj *streamDup(robj *o) { pelIterStart(&pi_cg, cg->pel); pelIterSeek(&pi_cg, "^", NULL); while(pelIterNext(&pi_cg)) { - streamNACK *nack = pi_cg.nack; + streamNACK *nack = pi_cg.data; streamNACK *new_nack = streamCreateNACK(new_s, NULL, &pi_cg.id); new_nack->delivery_time = nack->delivery_time; new_nack->delivery_count = nack->delivery_count; - new_nack->cgroup_ref_node = streamLinkCGroupToEntry(new_s, new_cg, pi_cg.rawkey); + new_nack->cgroup_ref_node = streamLinkCGroupToEntry(new_s, new_cg, &pi_cg.id); pelInsert(new_cg->pel, &pi_cg.id, new_nack, &new_cg->pel_count); pelListInsertSorted(new_cg, new_nack); @@ -609,9 +608,10 @@ robj *streamDup(robj *o) { pelIterStart(&pi_cpel, consumer->pel); pelIterSeek(&pi_cpel, "^", NULL); while(pelIterNext(&pi_cpel)) { - 
streamNACK *new_nack; - int found = pelFind(new_cg->pel, &pi_cpel.id, &new_nack); + void *val; + int found = pelFind(new_cg->pel, &pi_cpel.id, &val); serverAssert(found); + streamNACK *new_nack = val; new_nack->consumer = new_consumer; pelInsert(new_consumer->pel, &pi_cpel.id, new_nack, &new_consumer->pel_count); } @@ -2464,9 +2464,6 @@ size_t streamReplyWithRange(client *c, stream *s, streamID *start, streamID *end * a NACK for the entry, we need to associate it to the new * consumer. */ if (group && !noack) { - unsigned char buf[sizeof(streamID)]; - streamEncodeID(buf,&id); - /* Try to add a new NACK. Most of the time this will work and * will not require extra lookups. We'll fix the problem later * if we find that there is already an entry for this ID. */ @@ -2479,8 +2476,10 @@ size_t streamReplyWithRange(client *c, stream *s, streamID *start, streamID *end * or update it if the consumer is the same as before. */ if (group_inserted == 0) { streamFreeNACK(s,nack); - int found = pelFind(group->pel, &id, &nack); + void *val; + int found = pelFind(group->pel, &id, &val); serverAssert(found); + nack = val; /* Only transfer between consumers if they're different */ if (nack->consumer != consumer) { pelRemove(nack->consumer->pel, &id, &nack->consumer->pel_count); @@ -2493,7 +2492,7 @@ size_t streamReplyWithRange(client *c, stream *s, streamID *start, streamID *end } else { /* New NACK - insert into consumer's PEL and time list */ pelInsert(consumer->pel, &id, nack, &consumer->pel_count); - nack->cgroup_ref_node = streamLinkCGroupToEntry(s, group, buf); + nack->cgroup_ref_node = streamLinkCGroupToEntry(s, group, &id); pelListInsertAtTail(group, nack); } @@ -2556,7 +2555,7 @@ size_t streamReplyWithRangeFromConsumerPEL(client *c, stream *s, streamID *start addReplyStreamID(c,&pi.id); addReplyNullArray(c); } else { - streamNACK *nack = pi.nack; + streamNACK *nack = pi.data; nack->delivery_count++; pelListUpdate(group, nack, commandTimeSnapshot()); } @@ -3344,22 +3343,17 @@ 
void streamUpdateCGroupLastId(stream *s, streamCG *cg, streamID *id) { /* Link a consumer group to a stream entry in the cgroups_ref index. * Returns a pointer to the list node, so that it can be used for future deletion. */ -listNode *streamLinkCGroupToEntry(stream *s, streamCG *cg, unsigned char *key) { +listNode *streamLinkCGroupToEntry(stream *s, streamCG *cg, streamID *id) { if (!s->cgroups_ref) s->cgroups_ref = pelNew(&s->alloc_size); - flax *f = pelResolveFlax(s->cgroups_ref, key, 1, NULL); - uint8_t fkey = key[PEL_RAX_KEY_LEN]; - list *cglist; void *existing; - if (flaxFind(f, fkey, &existing)) { + if (pelFind(s->cgroups_ref, id, &existing)) { cglist = (list *)existing; } else { cglist = listCreate(); - size_t before = flaxAllocSize(f); - flaxInsert(f, fkey, cglist, NULL); - s->alloc_size += flaxAllocSize(f) - before; + pelInsert(s->cgroups_ref, id, cglist, NULL); } listAddNodeTail(cglist, cg); @@ -3368,74 +3362,47 @@ listNode *streamLinkCGroupToEntry(stream *s, streamCG *cg, unsigned char *key) { /* Unlink a consumer group reference from the entry index for a specific stream ID. * This is called when a message is acknowledged or when a consumer group is deleted. 
*/ -void streamUnlinkEntryFromCGroupRef(stream *s, streamNACK *na, unsigned char *key) { +void streamUnlinkEntryFromCGroupRef(stream *s, streamNACK *na) { if (!s->cgroups_ref) return; + void *data; + if (!pelFind(s->cgroups_ref, &na->id, &data)) return; - flax *f = pelResolveFlax(s->cgroups_ref, key, 0, NULL); - if (!f) return; - - uint8_t fkey = key[PEL_RAX_KEY_LEN]; - void *val; - if (!flaxFind(f, fkey, &val)) return; - - list *cglist = (list *)val; + list *cglist = (list *)data; listDelNode(cglist, na->cgroup_ref_node); if (listLength(cglist) == 0) { - flaxRemove(f, fkey, NULL); + pelRemove(s->cgroups_ref, &na->id, NULL); listRelease(cglist); - - if (f->numele == 0) { - s->alloc_size -= flaxAllocSize(f); - flaxFree(f); - raxRemove(s->cgroups_ref, key, PEL_RAX_KEY_LEN, NULL); - pelCacheInvalidate(s->cgroups_ref); - } } } /* Remove all consumer group references to a specific stream message. */ void streamCleanupEntryCGroupRefs(stream *s, streamID *id) { if (!s->cgroups_ref) return; - unsigned char buf[sizeof(streamID)]; - streamEncodeID(buf, id); + void *data; + if (!pelFind(s->cgroups_ref, id, &data)) return; - flax *f = pelResolveFlax(s->cgroups_ref, buf, 0, NULL); - if (!f) return; - - uint8_t fkey = buf[PEL_RAX_KEY_LEN]; - void *val; - if (!flaxFind(f, fkey, &val)) return; - - list *cglist = (list *)val; + list *cglist = (list *)data; listIter li; listNode *ln; listRewind(cglist, &li); while ((ln = listNext(&li))) { - streamNACK *nack; + void *val; streamCG *group = listNodeValue(ln); - int found = pelFind(group->pel, id, &nack); + int found = pelFind(group->pel, id, &val); serverAssert(found); + streamNACK *nack = val; pelListUnlink(group, nack); pelRemove(group->pel, id, &group->pel_count); pelRemove(nack->consumer->pel, id, &nack->consumer->pel_count); - /* Since we're removing all references from the cgroups_ref, we can directly - * free the NACK without unlinking it from the cgroups_ref. 
*/ streamFreeNACK(s, nack); } - flaxRemove(f, fkey, NULL); + pelRemove(s->cgroups_ref, id, NULL); listRelease(cglist); - - if (f->numele == 0) { - s->alloc_size -= flaxAllocSize(f); - flaxFree(f); - raxRemove(s->cgroups_ref, buf, PEL_RAX_KEY_LEN, NULL); - pelCacheInvalidate(s->cgroups_ref); - } } /* Check if a stream entry is still referenced by any consumer group. @@ -3471,11 +3438,7 @@ int streamEntryIsReferenced(stream *s, streamID *id) { /* Check if the message is in any consumer group's PEL */ if (!s->cgroups_ref) return 0; - unsigned char buf[sizeof(streamID)]; - streamEncodeID(buf, id); - flax *f = pelResolveFlax(s->cgroups_ref, buf, 0, NULL); - if (!f) return 0; - return flaxFind(f, buf[PEL_RAX_KEY_LEN], NULL); + return pelFind(s->cgroups_ref, id, NULL); } /* Create a NACK entry setting the delivery count to 1 and the delivery @@ -3505,10 +3468,10 @@ void streamFreeNACK(stream *s, streamNACK *na) { /* Free a NACK entry and remove its reference from the cgroups_ref. * This ensures proper cleanup of the consumer group list associated with the message ID. * Note: Caller must ensure NACK is unlinked from pel_time list before calling. 
*/ -void streamDestroyNACK(stream *s, streamNACK *na, unsigned char *key) { +void streamDestroyNACK(stream *s, streamNACK *na) { size_t usable; serverAssert(na->pel_prev == NULL && na->pel_next == NULL); - streamUnlinkEntryFromCGroupRef(s, na, key); + streamUnlinkEntryFromCGroupRef(s, na); zfree_usable(na, &usable); s->alloc_size -= usable; } @@ -3594,7 +3557,7 @@ void streamDestroyCG(stream *s, streamCG *cg) { pelIterStart(&pi, cg->pel); pelIterSeek(&pi, "^", NULL); while (pelIterNext(&pi)) { - streamUnlinkEntryFromCGroupRef(s, pi.nack, pi.rawkey); + streamUnlinkEntryFromCGroupRef(s, pi.data); } pelIterStop(&pi); @@ -3665,8 +3628,8 @@ void streamDelConsumer(stream *s, streamCG *cg, streamConsumer *consumer) { pelIterStart(&pi, consumer->pel); pelIterSeek(&pi, "^", NULL); while (pelIterNext(&pi)) { - streamNACK *nack = pi.nack; - streamUnlinkEntryFromCGroupRef(s, nack, pi.rawkey); + streamNACK *nack = pi.data; + streamUnlinkEntryFromCGroupRef(s, nack); pelListUnlink(cg, nack); pelRemove(cg->pel, &pi.id, &cg->pel_count); @@ -4056,18 +4019,16 @@ void xackCommand(client *c) { int acknowledged = 0; size_t old_alloc = server.memory_tracking_enabled ? kvobjAllocSize(kv) : 0; for (int j = 3; j < c->argc; j++) { - unsigned char buf[sizeof(streamID)]; - streamEncodeID(buf,&ids[j-3]); - /* Lookup the ID in the group PEL: it will have a reference to the * NACK structure that will have a reference to the consumer, so that * we are able to remove the entry from both PELs. 
*/ - streamNACK *nack; - if (pelFind(group->pel, &ids[j-3], &nack)) { + void *val; + if (pelFind(group->pel, &ids[j-3], &val)) { + streamNACK *nack = val; pelListUnlink(group, nack); pelRemove(group->pel, &ids[j-3], &group->pel_count); pelRemove(nack->consumer->pel, &ids[j-3], &nack->consumer->pel_count); - streamDestroyNACK(kv->ptr, nack, buf); + streamDestroyNACK(kv->ptr, nack); acknowledged++; server.dirty++; keyModified(c,c->db,c->argv[1],kv,0); @@ -4131,18 +4092,17 @@ void xackdelCommand(client *c) { for (int j = 0; j < args.numids; j++) { int res = XACKDEL_NO_ID; streamID *id = &ids[j]; - unsigned char buf[sizeof(streamID)]; - streamEncodeID(buf,id); /* Lookup the ID in the group PEL: it will have a reference to the * NACK structure that will have a reference to the consumer, so that * we are able to remove the entry from both PELs. */ - streamNACK *nack; - if (pelFind(group->pel, id, &nack)) { + void *val; + if (pelFind(group->pel, id, &val)) { + streamNACK *nack = val; pelListUnlink(group, nack); pelRemove(group->pel, id, &group->pel_count); pelRemove(nack->consumer->pel, id, &nack->consumer->pel_count); - streamDestroyNACK(s, nack, buf); + streamDestroyNACK(s, nack); server.dirty++; int can_delete = 1; @@ -4344,7 +4304,7 @@ void xpendingCommand(client *c) { size_t arraylen = 0; while (count && pelIterNext(&pi) && streamCompareID(&pi.id, &endid) <= 0) { - streamNACK *nack = pi.nack; + streamNACK *nack = pi.data; if (minidle) { mstime_t this_idle = now - nack->delivery_time; @@ -4555,12 +4515,11 @@ void xclaimCommand(client *c) { size_t arraylen = 0; for (int j = 5; j <= last_id_arg; j++) { streamID id = ids[j-5]; - unsigned char buf[sizeof(streamID)]; - streamEncodeID(buf,&id); /* Lookup the ID in the group PEL. */ - streamNACK *nack = NULL; - pelFind(group->pel, &id, &nack); + void *val = NULL; + pelFind(group->pel, &id, &val); + streamNACK *nack = val; /* Item must exist for us to transfer it to another consumer. 
*/ if (!streamEntryExists(s,&id)) { @@ -4574,7 +4533,7 @@ void xclaimCommand(client *c) { pelListUnlink(group, nack); pelRemove(group->pel, &id, &group->pel_count); pelRemove(nack->consumer->pel, &id, &nack->consumer->pel_count); - streamDestroyNACK(s, nack, buf); + streamDestroyNACK(s, nack); } continue; } @@ -4589,7 +4548,7 @@ void xclaimCommand(client *c) { nack = streamCreateNACK(s, NULL, &id); pelInsert(group->pel, &id, nack, &group->pel_count); pelListInsertAtTail(group, nack); - nack->cgroup_ref_node = streamLinkCGroupToEntry(s, group, buf); + nack->cgroup_ref_node = streamLinkCGroupToEntry(s, group, &id); } if (nack != NULL) { @@ -4757,7 +4716,7 @@ void xautoclaimCommand(client *c) { pelIterSeek(&pi, ">=", &startid); int has_entry = pelIterNext(&pi); while (attempts-- && count && has_entry) { - streamNACK *nack = pi.nack; + streamNACK *nack = pi.data; streamID id = pi.id; /* Item must exist for us to transfer it to another consumer. */ @@ -4771,9 +4730,7 @@ void xautoclaimCommand(client *c) { pelListUnlink(group, nack); pelRemove(group->pel, &id, &group->pel_count); pelRemove(nack->consumer->pel, &id, &nack->consumer->pel_count); - unsigned char rawkey[sizeof(streamID)]; - streamEncodeID(rawkey, &id); - streamDestroyNACK(s, nack, rawkey); + streamDestroyNACK(s, nack); /* Remember the ID for later */ deleted_ids[deleted_id_num++] = id; pelIterSeek(&pi, ">=", &id); @@ -5240,7 +5197,7 @@ void xinfoReplyWithStreamInfo(client *c, kvobj *kv) { pelIterSeek(&pi_cg_pel,"^",NULL); while (pelIterNext(&pi_cg_pel)) { if (count && arraylen_cg_pel >= count) break; - streamNACK *nack = pi_cg_pel.nack; + streamNACK *nack = pi_cg_pel.data; addReplyArrayLen(c,4); /* Entry ID. */ @@ -5297,7 +5254,7 @@ void xinfoReplyWithStreamInfo(client *c, kvobj *kv) { pelIterSeek(&pi_cpel,"^",NULL); while (pelIterNext(&pi_cpel)) { if (count && arraylen_cpel >= count) break; - streamNACK *nack = pi_cpel.nack; + streamNACK *nack = pi_cpel.data; addReplyArrayLen(c,3); /* Entry ID. 
*/ From 598e2e77cbf0113569335fdf4801c305f3e18c0a Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Thu, 2 Apr 2026 17:10:37 +0300 Subject: [PATCH 26/48] fixed: issue from review --- src/aof.c | 2 +- src/rdb.c | 5 +++-- src/stream.h | 15 ++----------- src/t_stream.c | 61 +++++++++++++++++++++++++++++--------------------- 4 files changed, 42 insertions(+), 41 deletions(-) diff --git a/src/aof.c b/src/aof.c index 5ed0dfa7c..25357ec4a 100644 --- a/src/aof.c +++ b/src/aof.c @@ -2343,7 +2343,7 @@ int rewriteStreamObject(rio *r, robj *key, robj *o) { streamNACK *nack = pi.data; if (rioWriteStreamPendingEntry(r,key,(char*)ri.key, ri.key_len,consumer, - pi.rawkey,nack) == 0) + pi.key,nack) == 0) { pelIterStop(&pi); raxStop(&ri_cons); diff --git a/src/rdb.c b/src/rdb.c index 341f7da84..13dfc903e 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -751,7 +751,7 @@ ssize_t rdbSaveStreamPEL(rio *rdb, rax *pel, uint64_t pel_count, int nacks) { while (pelIterNext(&pi)) { /* We store IDs in raw form as 128 big big endian numbers, * reconstructed from the two-level structure. 
*/ - if ((n = rdbWriteRaw(rdb,pi.rawkey,sizeof(streamID))) == -1) { + if ((n = rdbWriteRaw(rdb,pi.key,sizeof(streamID))) == -1) { pelIterStop(&pi); return -1; } @@ -3391,7 +3391,8 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) pelIterStart(&pi_cg,cgroup->pel); pelIterSeek(&pi_cg,"^",NULL); while (pelIterNext(&pi_cg)) { - if (!((streamNACK *)pi_cg.data)->consumer) { + streamNACK *nack = pi_cg.data; + if (!nack->consumer) { pelIterStop(&pi_cg); rdbReportCorruptRDB("Stream CG PEL entry without consumer"); decrRefCount(o); diff --git a/src/stream.h b/src/stream.h index e8a1fcc7f..72bf7d9e3 100644 --- a/src/stream.h +++ b/src/stream.h @@ -209,22 +209,11 @@ typedef struct pelIterator { int just_seeked; streamID id; void *data; - unsigned char rawkey[sizeof(streamID)]; + unsigned char key[sizeof(streamID)]; } pelIterator; -/* Inline cache embedded in rax metadata to speed up sequential PEL ops - * when consecutive operations target the same 15-byte rax bucket. */ -typedef struct pelCache { - unsigned char key[15]; - flax *f; -} pelCache; - -static inline void pelCacheInvalidate(rax *pel) { - pelCache *cache = (pelCache *)pel->metadata; - cache->f = NULL; -} - /* Two-level PEL operations. */ +void pelCacheInvalidate(rax *pel); rax *pelNew(size_t *alloc_size); void pelFree(rax *pel, void (*nack_free)(void *, void *), void *ctx); void pelFreeShallow(rax *pel); diff --git a/src/t_stream.c b/src/t_stream.c index 2d0323a5d..a82cdeb8e 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -67,6 +67,18 @@ static void pelListUpdate(streamCG *cg, streamNACK *nack, mstime_t new_delivery_ #define PEL_RAX_KEY_LEN 15 +/* Cache embedded in rax metadata to speed up sequential PEL ops + * when consecutive operations target the same 15-byte rax bucket. 
*/ +typedef struct pelCache { + unsigned char key[PEL_RAX_KEY_LEN]; + flax *f; +} pelCache; + +void pelCacheInvalidate(rax *pel) { + pelCache *cache = (pelCache *)pel->metadata; + cache->f = NULL; +} + /* Encode a 15-byte rax key: full 8B big-endian ms + upper 7 bytes of * big-endian seq (i.e. first 15 bytes of the 16-byte encoded streamID). */ static inline void pelEncodeRaxKey(unsigned char *buf, uint64_t ms, uint64_t seq) { @@ -238,9 +250,9 @@ void *pelRemove(rax *pel, streamID *id, uint64_t *count) { /* Refresh iterator fields from current rax+flax positions. */ static void pelIterRefresh(pelIterator *pi) { - memcpy(pi->rawkey, pi->ri.key, PEL_RAX_KEY_LEN); - pi->rawkey[PEL_RAX_KEY_LEN] = (unsigned char)pi->fi.key; - streamDecodeID(pi->rawkey, &pi->id); + memcpy(pi->key, pi->ri.key, PEL_RAX_KEY_LEN); + pi->key[PEL_RAX_KEY_LEN] = (unsigned char)pi->fi.key; + streamDecodeID(pi->key, &pi->id); pi->data = pi->fi.data; pi->valid = 1; } @@ -608,10 +620,10 @@ robj *streamDup(robj *o) { pelIterStart(&pi_cpel, consumer->pel); pelIterSeek(&pi_cpel, "^", NULL); while(pelIterNext(&pi_cpel)) { - void *val; - int found = pelFind(new_cg->pel, &pi_cpel.id, &val); + void *result; + int found = pelFind(new_cg->pel, &pi_cpel.id, &result); serverAssert(found); - streamNACK *new_nack = val; + streamNACK *new_nack = result; new_nack->consumer = new_consumer; pelInsert(new_consumer->pel, &pi_cpel.id, new_nack, &new_consumer->pel_count); } @@ -2476,10 +2488,10 @@ size_t streamReplyWithRange(client *c, stream *s, streamID *start, streamID *end * or update it if the consumer is the same as before. 
*/ if (group_inserted == 0) { streamFreeNACK(s,nack); - void *val; - int found = pelFind(group->pel, &id, &val); + void *result; + int found = pelFind(group->pel, &id, &result); serverAssert(found); - nack = val; + nack = result; /* Only transfer between consumers if they're different */ if (nack->consumer != consumer) { pelRemove(nack->consumer->pel, &id, &nack->consumer->pel_count); @@ -3349,6 +3361,7 @@ listNode *streamLinkCGroupToEntry(stream *s, streamCG *cg, streamID *id) { list *cglist; void *existing; + /* Try to find the list for this stream ID, create it if it doesn't exist */ if (pelFind(s->cgroups_ref, id, &existing)) { cglist = (list *)existing; } else { @@ -3364,37 +3377,35 @@ listNode *streamLinkCGroupToEntry(stream *s, streamCG *cg, streamID *id) { * This is called when a message is acknowledged or when a consumer group is deleted. */ void streamUnlinkEntryFromCGroupRef(stream *s, streamNACK *na) { if (!s->cgroups_ref) return; - void *data; - if (!pelFind(s->cgroups_ref, &na->id, &data)) return; + list *cglist; + if (pelFind(s->cgroups_ref, &na->id, (void**)&cglist)) { + listDelNode(cglist, na->cgroup_ref_node); - list *cglist = (list *)data; - listDelNode(cglist, na->cgroup_ref_node); - - if (listLength(cglist) == 0) { - pelRemove(s->cgroups_ref, &na->id, NULL); - listRelease(cglist); + if (listLength(cglist) == 0) { + pelRemove(s->cgroups_ref, &na->id, NULL); + listRelease(cglist); + } } } /* Remove all consumer group references to a specific stream message. 
*/ void streamCleanupEntryCGroupRefs(stream *s, streamID *id) { if (!s->cgroups_ref) return; - void *data; - if (!pelFind(s->cgroups_ref, id, &data)) return; + list *cglist; + /* If message is not in any consumer group, nothing to do */ + if (!pelFind(s->cgroups_ref, id, (void**)&cglist)) return; - list *cglist = (list *)data; listIter li; listNode *ln; - listRewind(cglist, &li); while ((ln = listNext(&li))) { - void *val; + streamNACK *nack; streamCG *group = listNodeValue(ln); - int found = pelFind(group->pel, id, &val); - serverAssert(found); - streamNACK *nack = val; + /* Find the message in this consumer group's PEL */ + serverAssert(pelFind(group->pel, id, (void**)&nack)); + /* Remove from group and consumer PELs */ pelListUnlink(group, nack); pelRemove(group->pel, id, &group->pel_count); pelRemove(nack->consumer->pel, id, &nack->consumer->pel_count); From 8d10bb9ccb6675716fd819ab9e38937a80aa53a4 Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Fri, 3 Apr 2026 08:05:10 +0300 Subject: [PATCH 27/48] fixed: issue from review --- src/t_stream.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/t_stream.c b/src/t_stream.c index a82cdeb8e..051170eee 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -4528,9 +4528,9 @@ void xclaimCommand(client *c) { streamID id = ids[j-5]; /* Lookup the ID in the group PEL. */ - void *val = NULL; - pelFind(group->pel, &id, &val); - streamNACK *nack = val; + void *result = NULL; + pelFind(group->pel, &id, &result); + streamNACK *nack = result; /* Item must exist for us to transfer it to another consumer. 
*/ if (!streamEntryExists(s,&id)) { From 9ad7f5d25410b9267cd7b595af10d0f1a1387d33 Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Fri, 3 Apr 2026 11:05:56 +0300 Subject: [PATCH 28/48] fixed: issue from review --- src/flax.c | 34 +++++++++++++++++----------------- src/flax.h | 4 ++-- 2 files changed, 19 insertions(+), 19 deletions(-) diff --git a/src/flax.c b/src/flax.c index ab628b31c..ae9170e6f 100644 --- a/src/flax.c +++ b/src/flax.c @@ -49,7 +49,7 @@ /* Return the byte offset where the values array starts within the data * block for a given capacity. The offset is aligned to pointer size. */ -static size_t flax_values_offset(uint32_t capacity) { +static size_t flax_values_offset(uint16_t capacity) { size_t raw = (size_t)capacity * sizeof(uint8_t); size_t align = alignof(void *); return (raw + align - 1) & ~(align - 1); @@ -85,7 +85,7 @@ static void **flax_values(flax *f) { * - Tail: key > keys[numele-1] is the append case, overwhelmingly common * when keys are monotonically increasing sequence numbers. * - Head: key <= keys[0] catches prepend and exact-match-at-zero. */ -static int flax_search(const uint8_t *keys, uint32_t numele, uint8_t key, int64_t *out_idx) { +static int flax_search(const uint8_t *keys, uint32_t numele, uint8_t key, int16_t *out_idx) { if (numele == 0) { *out_idx = 0; return 0; @@ -126,8 +126,8 @@ static int flax_search(const uint8_t *keys, uint32_t numele, uint8_t key, int64_ * capacity), we must perform two independent memcpy operations -- one for * the keys at the start of the block and one for the values at the new * aligned offset. The old data block is freed afterwards. 
*/ -static void flax_resize(flax *f, uint32_t new_capacity) { - if (new_capacity > UINT8_MAX) new_capacity = UINT8_MAX; +static void flax_resize(flax *f, uint16_t new_capacity) { + if (new_capacity > UINT8_MAX + 1) new_capacity = UINT8_MAX + 1; size_t new_voff = flax_values_offset(new_capacity); size_t new_alloc = new_voff + (size_t)new_capacity * sizeof(void *); size_t new_usable; @@ -179,7 +179,7 @@ flax *flaxNew(void) { * When the key is new, a new element is created and 1 is returned (and * *old is set to NULL if provided). */ static int flaxGenericInsert(flax *f, uint8_t key, void *data, void **old, int overwrite) { - int64_t idx; + int16_t idx; if (flax_search(flax_keys(f), f->numele, key, &idx)) { if (old) *old = flax_values(f)[idx]; if (overwrite) flax_values(f)[idx] = data; @@ -191,7 +191,7 @@ static int flaxGenericInsert(flax *f, uint8_t key, void *data, void **old, int o uint8_t *keys = flax_keys(f); void **vals = flax_values(f); - int64_t tail = f->numele - idx; + int16_t tail = f->numele - idx; if (tail > 0) { memmove(&keys[idx + 1], &keys[idx], (size_t)tail * sizeof(uint8_t)); @@ -225,7 +225,7 @@ int flaxRemove(flax *f, uint8_t key, void **old) { return 0; } - int64_t idx; + int16_t idx; if (!flax_search(flax_keys(f), f->numele, key, &idx)) { if (old) *old = NULL; return 0; @@ -234,7 +234,7 @@ int flaxRemove(flax *f, uint8_t key, void **old) { uint8_t *keys = flax_keys(f); void **vals = flax_values(f); if (old) *old = vals[idx]; - int64_t tail = f->numele - idx - 1; + int16_t tail = f->numele - idx - 1; if (tail > 0) { memmove(&keys[idx], &keys[idx + 1], (size_t)tail * sizeof(uint8_t)); @@ -253,7 +253,7 @@ int flaxFind(flax *f, uint8_t key, void **value) { if (value) *value = NULL; return 0; } - int64_t idx; + int16_t idx; if (flax_search(flax_keys(f), f->numele, key, &idx)) { if (value) *value = flax_values(f)[idx]; return 1; @@ -285,8 +285,8 @@ void flaxFreeWithCallback(flax *f, } /* Return the number of elements inside the flax. 
*/ -uint64_t flaxSize(flax *f) { - return (uint64_t)f->numele; +uint16_t flaxSize(flax *f) { + return f->numele; } /* Return the total heap memory used by the flax struct and its data block. @@ -342,7 +342,7 @@ int flaxSeek(flaxIterator *it, const char *op, uint8_t key) { } if (op[0] == '>' && op[1] == '=') { - int64_t idx; + int16_t idx; flax_search(flax_keys(it->f), it->f->numele, key, &idx); if (idx >= it->f->numele) { it->idx = -1; @@ -356,7 +356,7 @@ int flaxSeek(flaxIterator *it, const char *op, uint8_t key) { } if (op[0] == '>' && op[1] == '\0') { - int64_t idx; + int16_t idx; int found = flax_search(flax_keys(it->f), it->f->numele, key, &idx); if (found) idx++; if (idx >= it->f->numele) { @@ -371,7 +371,7 @@ int flaxSeek(flaxIterator *it, const char *op, uint8_t key) { } if (op[0] == '<' && op[1] == '=') { - int64_t idx; + int16_t idx; int found = flax_search(flax_keys(it->f), it->f->numele, key, &idx); if (found) { it->idx = idx; @@ -389,7 +389,7 @@ int flaxSeek(flaxIterator *it, const char *op, uint8_t key) { } if (op[0] == '<' && op[1] == '\0') { - int64_t idx; + int16_t idx; flax_search(flax_keys(it->f), it->f->numele, key, &idx); if (idx == 0) { it->idx = -1; @@ -403,7 +403,7 @@ int flaxSeek(flaxIterator *it, const char *op, uint8_t key) { } if (op[0] == '=' && op[1] == '\0') { - int64_t idx; + int16_t idx; if (!flax_search(flax_keys(it->f), it->f->numele, key, &idx)) { it->idx = -1; it->key = 0; @@ -669,7 +669,7 @@ int flaxTest(int argc, char **argv, int flags) { flaxInsert(a, (uint8_t)i, "x", NULL); assert(flaxSize(a) == 64); - uint32_t cap_before = a->capacity; + uint16_t cap_before = a->capacity; for (int i = 0; i < 56; i++) flaxRemove(a, (uint8_t)i, NULL); diff --git a/src/flax.h b/src/flax.h index 402c0ec69..5347d1c58 100644 --- a/src/flax.h +++ b/src/flax.h @@ -68,7 +68,7 @@ typedef struct flaxIterator { flax *f; /* Flax we are iterating. */ uint8_t key; /* The current key. */ void *data; /* Data associated to this key. 
*/ - int64_t idx; /* Current index into the flax arrays, -1 if EOF. */ + int16_t idx; /* Current index into the flax arrays, -1 if EOF. */ } flaxIterator; /* --- Creation and destruction --- */ @@ -93,7 +93,7 @@ void flaxStop(flaxIterator *it); int flaxEOF(flaxIterator *it); /* --- Introspection --- */ -uint64_t flaxSize(flax *f); +uint16_t flaxSize(flax *f); size_t flaxAllocSize(flax *f); void flaxShrink(flax *f); From bbd23ce5a75fdd9b0a7a4b9b3b85982b892cc977 Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Fri, 3 Apr 2026 11:24:30 +0300 Subject: [PATCH 29/48] fixed: issue from review --- src/rdb.c | 6 +++--- src/t_stream.c | 35 ++++++++++++++++++++--------------- 2 files changed, 23 insertions(+), 18 deletions(-) diff --git a/src/rdb.c b/src/rdb.c index 13dfc903e..178686ac1 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -3362,8 +3362,8 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) } streamID nack_id; streamDecodeID(rawid, &nack_id); - void *val; - if (!pelFind(cgroup->pel, &nack_id, &val)) { + void *result; + if (!pelFind(cgroup->pel, &nack_id, &result)) { rdbReportCorruptRDB("Consumer entry not found in " "group global PEL"); decrRefCount(o); @@ -3373,7 +3373,7 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) /* Set the NACK consumer, that was left to NULL when * loading the global PEL. Then set the same shared * NACK structure also in the consumer-specific PEL. 
*/ - streamNACK *nack = val; + streamNACK *nack = result; nack->consumer = consumer; if (!pelTryInsert(consumer->pel,&nack_id,nack,&consumer->pel_count)) { rdbReportCorruptRDB("Duplicated consumer PEL entry " diff --git a/src/t_stream.c b/src/t_stream.c index 051170eee..6fc2da026 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -2563,6 +2563,10 @@ size_t streamReplyWithRangeFromConsumerPEL(client *c, stream *s, streamID *start if (streamReplyWithRange(c,s,&pi.id,&pi.id,1,0,-1,NULL,NULL, STREAM_RWR_RAWENTRIES,NULL,NULL) == 0) { + /* Note that we may have a not acknowledged entry in the PEL + * about a message that's no longer here because was removed + * by the user by other means. In that case we signal it emitting + * the ID but then a NULL entry for the fields. */ addReplyArrayLen(c,2); addReplyStreamID(c,&pi.id); addReplyNullArray(c); @@ -3356,19 +3360,16 @@ void streamUpdateCGroupLastId(stream *s, streamCG *cg, streamID *id) { /* Link a consumer group to a stream entry in the cgroups_ref index. * Returns a pointer to the list node, so that it can be used for future deletion. */ listNode *streamLinkCGroupToEntry(stream *s, streamCG *cg, streamID *id) { + list *cglist; if (!s->cgroups_ref) s->cgroups_ref = pelNew(&s->alloc_size); - list *cglist; - void *existing; /* Try to find the list for this stream ID, create it if it doesn't exist */ - if (pelFind(s->cgroups_ref, id, &existing)) { - cglist = (list *)existing; - } else { + if (!pelFind(s->cgroups_ref, id, (void**)&cglist)) { cglist = listCreate(); - pelInsert(s->cgroups_ref, id, cglist, NULL); + serverAssert(pelInsert(s->cgroups_ref, id, cglist, NULL)); } - + /* Add the consumer group to the list and return the list node */ listAddNodeTail(cglist, cg); return listLast(cglist); } @@ -3376,11 +3377,12 @@ listNode *streamLinkCGroupToEntry(stream *s, streamCG *cg, streamID *id) { /* Unlink a consumer group reference from the entry index for a specific stream ID. 
* This is called when a message is acknowledged or when a consumer group is deleted. */ void streamUnlinkEntryFromCGroupRef(stream *s, streamNACK *na) { - if (!s->cgroups_ref) return; list *cglist; + if (!s->cgroups_ref) return; if (pelFind(s->cgroups_ref, &na->id, (void**)&cglist)) { listDelNode(cglist, na->cgroup_ref_node); + /* If the list is now empty, remove it from the index. */ if (listLength(cglist) == 0) { pelRemove(s->cgroups_ref, &na->id, NULL); listRelease(cglist); @@ -3393,7 +3395,8 @@ void streamCleanupEntryCGroupRefs(stream *s, streamID *id) { if (!s->cgroups_ref) return; list *cglist; /* If message is not in any consumer group, nothing to do */ - if (!pelFind(s->cgroups_ref, id, (void**)&cglist)) return; + if (!pelFind(s->cgroups_ref, id, (void**)&cglist)) + return; listIter li; listNode *ln; @@ -3407,6 +3410,8 @@ void streamCleanupEntryCGroupRefs(stream *s, streamID *id) { /* Remove from group and consumer PELs */ pelListUnlink(group, nack); + /* Since we're removing all references from the cgroups_ref, we can directly + * free the NACK without unlinking it from the cgroups_ref. */ pelRemove(group->pel, id, &group->pel_count); pelRemove(nack->consumer->pel, id, &nack->consumer->pel_count); streamFreeNACK(s, nack); @@ -4033,9 +4038,9 @@ void xackCommand(client *c) { /* Lookup the ID in the group PEL: it will have a reference to the * NACK structure that will have a reference to the consumer, so that * we are able to remove the entry from both PELs. 
*/ - void *val; - if (pelFind(group->pel, &ids[j-3], &val)) { - streamNACK *nack = val; + void *result; + if (pelFind(group->pel, &ids[j-3], &result)) { + streamNACK *nack = result; pelListUnlink(group, nack); pelRemove(group->pel, &ids[j-3], &group->pel_count); pelRemove(nack->consumer->pel, &ids[j-3], &nack->consumer->pel_count); @@ -4107,9 +4112,9 @@ void xackdelCommand(client *c) { /* Lookup the ID in the group PEL: it will have a reference to the * NACK structure that will have a reference to the consumer, so that * we are able to remove the entry from both PELs. */ - void *val; - if (pelFind(group->pel, id, &val)) { - streamNACK *nack = val; + void *result; + if (pelFind(group->pel, id, &result)) { + streamNACK *nack = result; pelListUnlink(group, nack); pelRemove(group->pel, id, &group->pel_count); pelRemove(nack->consumer->pel, id, &nack->consumer->pel_count); From ab0096be86b0a125db1028071e4a48556385f2b1 Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Fri, 3 Apr 2026 11:44:30 +0300 Subject: [PATCH 30/48] fixed: issue from review --- src/t_stream.c | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/t_stream.c b/src/t_stream.c index 6fc2da026..f4a6d2695 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -592,6 +592,7 @@ robj *streamDup(robj *o) { new_nack->cgroup_ref_node = streamLinkCGroupToEntry(new_s, new_cg, &pi_cg.id); pelInsert(new_cg->pel, &pi_cg.id, new_nack, &new_cg->pel_count); + /* Insert in sorted order to preserve ordering */ pelListInsertSorted(new_cg, new_nack); } pelIterStop(&pi_cg); @@ -2360,7 +2361,7 @@ size_t streamReplyWithRange(client *c, stream *s, streamID *start, streamID *end pelInsert(consumer->pel, &nack->id, nack, &consumer->pel_count); } nack->delivery_count++; - pelListUpdate(group, nack, cmd_time_snapshot); + pelListUpdate(group, nack, cmd_time_snapshot); /* Moves element from beginning to end of list */ consumer->active_time = cmd_time_snapshot; @@ -3394,12 +3395,13 @@ void 
streamUnlinkEntryFromCGroupRef(stream *s, streamNACK *na) { void streamCleanupEntryCGroupRefs(stream *s, streamID *id) { if (!s->cgroups_ref) return; list *cglist; + listIter li; + listNode *ln; + /* If message is not in any consumer group, nothing to do */ if (!pelFind(s->cgroups_ref, id, (void**)&cglist)) return; - listIter li; - listNode *ln; listRewind(cglist, &li); while ((ln = listNext(&li))) { streamNACK *nack; @@ -4730,8 +4732,7 @@ void xautoclaimCommand(client *c) { mstime_t now = commandTimeSnapshot(); int deleted_id_num = 0; pelIterSeek(&pi, ">=", &startid); - int has_entry = pelIterNext(&pi); - while (attempts-- && count && has_entry) { + while (attempts-- && count && pelIterNext(&pi)) { streamNACK *nack = pi.data; streamID id = pi.id; @@ -4750,7 +4751,6 @@ void xautoclaimCommand(client *c) { /* Remember the ID for later */ deleted_ids[deleted_id_num++] = id; pelIterSeek(&pi, ">=", &id); - has_entry = pelIterNext(&pi); count--; /* Count is a limit of the command response size. */ continue; } @@ -4758,12 +4758,14 @@ void xautoclaimCommand(client *c) { if (minidle) { mstime_t this_idle = now - nack->delivery_time; if (this_idle < minidle) { - has_entry = pelIterNext(&pi); continue; } } if (nack->consumer != consumer) { + /* Remove the entry from the old consumer. + * Note that nack->consumer is NULL if we created the + * NACK above because of the FORCE option. */ if (nack->consumer) { pelRemove(nack->consumer->pel, &id, &nack->consumer->pel_count); } @@ -4798,10 +4800,10 @@ void xautoclaimCommand(client *c) { streamPropagateXCLAIM(c,c->argv[1],group,c->argv[2],idstr,nack); decrRefCount(idstr); server.dirty++; - has_entry = pelIterNext(&pi); } - /* After the loop, pi is already on the next unprocessed entry (or invalid). 
*/ + /* We need to return the next entry as a cursor for the next XAUTOCLAIM call */ + pelIterNext(&pi); if (server.memory_tracking_enabled) updateSlotAllocSize(c->db,getKeySlot(c->argv[1]->ptr),o,old_alloc,kvobjAllocSize(o)); @@ -5220,7 +5222,7 @@ void xinfoReplyWithStreamInfo(client *c, kvobj *kv) { addReplyStreamID(c,&pi_cg_pel.id); /* Consumer name. */ - serverAssert(nack->consumer); + serverAssert(nack->consumer); /* assertion for valgrind (avoid NPD) */ addReplyBulkCBuffer(c,nack->consumer->name, sdslen(nack->consumer->name)); From c1711d42697c495864953d1b2b0502585590d8ed Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Fri, 3 Apr 2026 12:02:32 +0300 Subject: [PATCH 31/48] fixed: issue from review --- src/t_stream.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/src/t_stream.c b/src/t_stream.c index f4a6d2695..fb131a588 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -3362,6 +3362,7 @@ void streamUpdateCGroupLastId(stream *s, streamCG *cg, streamID *id) { * Returns a pointer to the list node, so that it can be used for future deletion. 
*/ listNode *streamLinkCGroupToEntry(stream *s, streamCG *cg, streamID *id) { list *cglist; + if (!s->cgroups_ref) s->cgroups_ref = pelNew(&s->alloc_size); @@ -4728,10 +4729,10 @@ void xautoclaimCommand(client *c) { pelIterator pi; pelIterStart(&pi,group->pel); + pelIterSeek(&pi, ">=", &startid); size_t arraylen = 0; mstime_t now = commandTimeSnapshot(); int deleted_id_num = 0; - pelIterSeek(&pi, ">=", &startid); while (attempts-- && count && pelIterNext(&pi)) { streamNACK *nack = pi.data; streamID id = pi.id; @@ -4757,9 +4758,8 @@ void xautoclaimCommand(client *c) { if (minidle) { mstime_t this_idle = now - nack->delivery_time; - if (this_idle < minidle) { + if (this_idle < minidle) continue; - } } if (nack->consumer != consumer) { @@ -5210,16 +5210,15 @@ void xinfoReplyWithStreamInfo(client *c, kvobj *kv) { addReplyBulkCString(c,"pending"); long long arraylen_cg_pel = 0; void *arrayptr_cg_pel = addReplyDeferredLen(c); - pelIterator pi_cg_pel; - pelIterStart(&pi_cg_pel,cg->pel); - pelIterSeek(&pi_cg_pel,"^",NULL); - while (pelIterNext(&pi_cg_pel)) { - if (count && arraylen_cg_pel >= count) break; - streamNACK *nack = pi_cg_pel.data; + pelIterator pi_cg; + pelIterStart(&pi_cg,cg->pel); + pelIterSeek(&pi_cg,"^",NULL); + while (pelIterNext(&pi_cg) && (!count || arraylen_cg_pel < count)) { + streamNACK *nack = pi_cg.data; addReplyArrayLen(c,4); /* Entry ID. */ - addReplyStreamID(c,&pi_cg_pel.id); + addReplyStreamID(c,&pi_cg.id); /* Consumer name. 
*/ serverAssert(nack->consumer); /* assertion for valgrind (avoid NPD) */ @@ -5235,7 +5234,7 @@ void xinfoReplyWithStreamInfo(client *c, kvobj *kv) { arraylen_cg_pel++; } setDeferredArrayLen(c,arrayptr_cg_pel,arraylen_cg_pel); - pelIterStop(&pi_cg_pel); + pelIterStop(&pi_cg); /* Consumers */ addReplyBulkCString(c,"consumers"); @@ -5270,8 +5269,7 @@ void xinfoReplyWithStreamInfo(client *c, kvobj *kv) { pelIterator pi_cpel; pelIterStart(&pi_cpel,consumer->pel); pelIterSeek(&pi_cpel,"^",NULL); - while (pelIterNext(&pi_cpel)) { - if (count && arraylen_cpel >= count) break; + while (pelIterNext(&pi_cpel) && (!count || arraylen_cpel < count)) { streamNACK *nack = pi_cpel.data; addReplyArrayLen(c,3); From 2c5a0e8da4a285891facf563682740576d9d14ae Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Fri, 3 Apr 2026 12:15:42 +0300 Subject: [PATCH 32/48] fixed: issue from review --- src/t_stream.c | 52 +++++++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/src/t_stream.c b/src/t_stream.c index fb131a588..5894940ea 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -3576,7 +3576,7 @@ void streamDestroyCG(stream *s, streamCG *cg) { pelIterStart(&pi, cg->pel); pelIterSeek(&pi, "^", NULL); while (pelIterNext(&pi)) { - streamUnlinkEntryFromCGroupRef(s, pi.data); + streamUnlinkEntryFromCGroupRef(s,pi.data); } pelIterStop(&pi); @@ -3648,12 +3648,12 @@ void streamDelConsumer(stream *s, streamCG *cg, streamConsumer *consumer) { pelIterSeek(&pi, "^", NULL); while (pelIterNext(&pi)) { streamNACK *nack = pi.data; - streamUnlinkEntryFromCGroupRef(s, nack); + streamUnlinkEntryFromCGroupRef(s,nack); - pelListUnlink(cg, nack); - pelRemove(cg->pel, &pi.id, &cg->pel_count); + pelListUnlink(cg,nack); + pelRemove(cg->pel,&pi.id,&cg->pel_count); - streamFreeNACK(s, nack); + streamFreeNACK(s,nack); } pelIterStop(&pi); @@ -4317,12 +4317,12 @@ void xpendingCommand(client *c) { mstime_t now = commandTimeSnapshot(); pelIterator pi; - 
pelIterStart(&pi, pel); - pelIterSeek(&pi, ">=", &startid); + pelIterStart(&pi,pel); + pelIterSeek(&pi,">=",&startid); void *arraylen_ptr = addReplyDeferredLen(c); size_t arraylen = 0; - while (count && pelIterNext(&pi) && streamCompareID(&pi.id, &endid) <= 0) { + while (count && pelIterNext(&pi) && streamCompareID(&pi.id,&endid) <= 0) { streamNACK *nack = pi.data; if (minidle) { @@ -4537,7 +4537,7 @@ void xclaimCommand(client *c) { /* Lookup the ID in the group PEL. */ void *result = NULL; - pelFind(group->pel, &id, &result); + pelFind(group->pel,&id,&result); streamNACK *nack = result; /* Item must exist for us to transfer it to another consumer. */ @@ -4549,10 +4549,10 @@ void xclaimCommand(client *c) { propagate_last_id = 0; /* Will be propagated by XCLAIM itself. */ server.dirty++; /* Release the NACK */ - pelListUnlink(group, nack); - pelRemove(group->pel, &id, &group->pel_count); - pelRemove(nack->consumer->pel, &id, &nack->consumer->pel_count); - streamDestroyNACK(s, nack); + pelListUnlink(group,nack); + pelRemove(group->pel,&id,&group->pel_count); + pelRemove(nack->consumer->pel,&id,&nack->consumer->pel_count); + streamDestroyNACK(s,nack); } continue; } @@ -4564,10 +4564,10 @@ void xclaimCommand(client *c) { * and replication of consumer groups. */ if (force && nack == NULL) { /* Create the NACK. */ - nack = streamCreateNACK(s, NULL, &id); - pelInsert(group->pel, &id, nack, &group->pel_count); + nack = streamCreateNACK(s,NULL,&id); + pelInsert(group->pel,&id,nack,&group->pel_count); pelListInsertAtTail(group, nack); - nack->cgroup_ref_node = streamLinkCGroupToEntry(s, group, &id); + nack->cgroup_ref_node = streamLinkCGroupToEntry(s,group,&id); } if (nack != NULL) { @@ -4587,7 +4587,7 @@ void xclaimCommand(client *c) { * Note that nack->consumer is NULL if we created the * NACK above because of the FORCE option. 
*/ if (nack->consumer) { - pelRemove(nack->consumer->pel, &id, &nack->consumer->pel_count); + pelRemove(nack->consumer->pel,&id,&nack->consumer->pel_count); } } @@ -4602,7 +4602,7 @@ void xclaimCommand(client *c) { } if (nack->consumer != consumer) { /* Add the entry in the new consumer local PEL. */ - pelInsert(consumer->pel, &id, nack, &consumer->pel_count); + pelInsert(consumer->pel,&id,nack,&consumer->pel_count); nack->consumer = consumer; } /* Send the reply for this entry. */ @@ -4729,7 +4729,7 @@ void xautoclaimCommand(client *c) { pelIterator pi; pelIterStart(&pi,group->pel); - pelIterSeek(&pi, ">=", &startid); + pelIterSeek(&pi,">=",&startid); size_t arraylen = 0; mstime_t now = commandTimeSnapshot(); int deleted_id_num = 0; @@ -4745,13 +4745,13 @@ void xautoclaimCommand(client *c) { decrRefCount(idstr); server.dirty++; /* Clear this entry from the PEL, it no longer exists */ - pelListUnlink(group, nack); - pelRemove(group->pel, &id, &group->pel_count); - pelRemove(nack->consumer->pel, &id, &nack->consumer->pel_count); - streamDestroyNACK(s, nack); + pelListUnlink(group,nack); + pelRemove(group->pel,&id,&group->pel_count); + pelRemove(nack->consumer->pel,&id,&nack->consumer->pel_count); + streamDestroyNACK(s,nack); /* Remember the ID for later */ deleted_ids[deleted_id_num++] = id; - pelIterSeek(&pi, ">=", &id); + pelIterSeek(&pi,">=",&id); count--; /* Count is a limit of the command response size. */ continue; } @@ -4767,7 +4767,7 @@ void xautoclaimCommand(client *c) { * Note that nack->consumer is NULL if we created the * NACK above because of the FORCE option. */ if (nack->consumer) { - pelRemove(nack->consumer->pel, &id, &nack->consumer->pel_count); + pelRemove(nack->consumer->pel,&id,&nack->consumer->pel_count); } } @@ -4780,7 +4780,7 @@ void xautoclaimCommand(client *c) { if (nack->consumer != consumer) { /* Add the entry in the new consumer local PEL. 
*/ - pelInsert(consumer->pel, &id, nack, &consumer->pel_count); + pelInsert(consumer->pel,&id,nack,&consumer->pel_count); nack->consumer = consumer; } From 6b6f8061e2931b9e381650697e86d5ebe71daf28 Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Fri, 3 Apr 2026 12:37:34 +0300 Subject: [PATCH 33/48] fixed: issue from review --- src/defrag.c | 23 +++++++++++++---------- src/rdb.c | 3 +++ 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/src/defrag.c b/src/defrag.c index f8fc522ea..4dd504bab 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -866,8 +866,8 @@ typedef struct { void* defragStreamConsumerPelFlax(raxIterator *ri, void *privdata) { PendingEntryContext *ctx = privdata; flax *f = ri->data; - flax *newf = activeDefragAlloc(f); - if (newf) f = newf; + flax *newflax = activeDefragAlloc(f); + if (newflax) f = newflax; void *newdata = activeDefragAlloc(f->data); if (newdata) f->data = newdata; @@ -876,10 +876,10 @@ void* defragStreamConsumerPelFlax(raxIterator *ri, void *privdata) { flaxStart(&fi, f); if (flaxSeek(&fi, "^", 0)) { do { - streamNACK *nack = fi.data; - nack->consumer = ctx->c; - nack->cgroup_ref_node->value = ctx->cg; - streamNACK *newnack = activeDefragAlloc(nack); + streamNACK *nack = fi.data, *newnack; + nack->consumer = ctx->c; /* update nack pointer to consumer */ + nack->cgroup_ref_node->value = ctx->cg; /* Update the value of cgroups_ref node to the consumer group. */ + newnack = activeDefragAlloc(nack); if (newnack) { /* Update in the consumer PEL flax. */ flaxInsert(f, fi.key, newnack, NULL); @@ -888,22 +888,25 @@ void* defragStreamConsumerPelFlax(raxIterator *ri, void *privdata) { * cache and flaxShrink to avoid new allocations during defrag. */ pelReplace(ctx->cg->pel, &newnack->id, newnack); - /* Update doubly-linked list pointers. */ + /* Update the doubly-linked list pointers in adjacent nacks. + * When we move a nack to a new address, we need to update the + * pel_prev->pel_next and pel_next->pel_prev pointers. 
*/ if (newnack->pel_prev) { newnack->pel_prev->pel_next = newnack; } else { + /* This is the head of the list */ ctx->cg->pel_time_head = newnack; } if (newnack->pel_next) { newnack->pel_next->pel_prev = newnack; } else { + /* This is the tail of the list */ ctx->cg->pel_time_tail = newnack; } } } while (flaxNext(&fi)); } - - return newf; + return newflax; } typedef struct { @@ -929,7 +932,7 @@ void* defragStreamConsumer(raxIterator *ri, void *privdata) { defragRadixTree(&c->pel, 0, defragStreamConsumerPelFlax, &pel_ctx); pelCacheInvalidate(c->pel); } - return newc; + return newc; /* returns NULL if c was not defragged */ } /* Defrag a flax bucket in the group PEL. Only defrags the flax struct itself, diff --git a/src/rdb.c b/src/rdb.c index 178686ac1..d45602996 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -769,6 +769,9 @@ ssize_t rdbSaveStreamPEL(rio *rdb, rax *pel, uint64_t pel_count, int nacks) { return -1; } nwritten += n; + /* We don't save the consumer name: we'll save the pending IDs + * for each consumer in the consumer PEL, and resolve the consumer + * at loading time. */ } } pelIterStop(&pi); From 99221dd5bc39b63c391424cc3b73deea63dd07ac Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Tue, 7 Apr 2026 13:39:54 +0300 Subject: [PATCH 34/48] fixed: issue from review --- src/defrag.c | 86 +++--- src/stream.h | 26 +- src/t_stream.c | 439 +++++++++++++++++++++-------- tests/unit/type/stream-cgroups.tcl | 183 ++++++++++++ 4 files changed, 585 insertions(+), 149 deletions(-) diff --git a/src/defrag.c b/src/defrag.c index 4dd504bab..1c34186fb 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -861,48 +861,59 @@ typedef struct { streamConsumer *c; } PendingEntryContext; -/* Defrag a flax bucket in the consumer PEL. Each flax value is a NACK shared - * with the group PEL, so we update pointers in both places. 
*/ -void* defragStreamConsumerPelFlax(raxIterator *ri, void *privdata) { +/* Defrag helper: after a NACK is moved to a new address, update the + * doubly-linked time list pointers and the group PEL entry. */ +static void defragNackFixup(PendingEntryContext *ctx, streamNACK *newnack) { + pelReplace(ctx->cg->pel, &newnack->id, newnack); + if (newnack->pel_prev) { + newnack->pel_prev->pel_next = newnack; + } else { + ctx->cg->pel_time_head = newnack; + } + if (newnack->pel_next) { + newnack->pel_next->pel_prev = newnack; + } else { + ctx->cg->pel_time_tail = newnack; + } +} + +/* Defrag a bucket in the consumer PEL. Each value is a NACK shared + * with the group PEL, so we update pointers in both places. + * Handles both direct (single-entry) and flax buckets. */ +void* defragStreamConsumerPelBucket(raxIterator *ri, void *privdata) { PendingEntryContext *ctx = privdata; - flax *f = ri->data; + void *bucket = ri->data; + + if (PEL_IS_DIRECT(bucket)) { + streamNACK *nack = PEL_DIRECT_PTR(bucket); + uint8_t fkey = PEL_DIRECT_FKEY(bucket); + nack->consumer = ctx->c; + nack->cgroup_ref_node->value = ctx->cg; + streamNACK *newnack = activeDefragAlloc(nack); + if (newnack) { + defragNackFixup(ctx, newnack); + return PEL_DIRECT_ENCODE(newnack, fkey); + } + return NULL; + } + + flax *f = (flax *)bucket; flax *newflax = activeDefragAlloc(f); if (newflax) f = newflax; void *newdata = activeDefragAlloc(f->data); if (newdata) f->data = newdata; - /* Iterate entries in the flax and defrag each NACK. */ flaxIterator fi; flaxStart(&fi, f); if (flaxSeek(&fi, "^", 0)) { do { - streamNACK *nack = fi.data, *newnack; - nack->consumer = ctx->c; /* update nack pointer to consumer */ - nack->cgroup_ref_node->value = ctx->cg; /* Update the value of cgroups_ref node to the consumer group. 
*/ - newnack = activeDefragAlloc(nack); + streamNACK *nack = fi.data; + nack->consumer = ctx->c; + nack->cgroup_ref_node->value = ctx->cg; + streamNACK *newnack = activeDefragAlloc(nack); if (newnack) { - /* Update in the consumer PEL flax. */ flaxInsert(f, fi.key, newnack, NULL); - - /* Update in the group PEL flax. pelReplace bypasses the - * cache and flaxShrink to avoid new allocations during defrag. */ - pelReplace(ctx->cg->pel, &newnack->id, newnack); - - /* Update the doubly-linked list pointers in adjacent nacks. - * When we move a nack to a new address, we need to update the - * pel_prev->pel_next and pel_next->pel_prev pointers. */ - if (newnack->pel_prev) { - newnack->pel_prev->pel_next = newnack; - } else { - /* This is the head of the list */ - ctx->cg->pel_time_head = newnack; - } - if (newnack->pel_next) { - newnack->pel_next->pel_prev = newnack; - } else { - /* This is the tail of the list */ - ctx->cg->pel_time_tail = newnack; - } + defragNackFixup(ctx, newnack); } } while (flaxNext(&fi)); } @@ -929,17 +940,20 @@ void* defragStreamConsumer(raxIterator *ri, void *privdata) { if (c->pel) { c->pel->alloc_size = &s->alloc_size; PendingEntryContext pel_ctx = {cg, c}; - defragRadixTree(&c->pel, 0, defragStreamConsumerPelFlax, &pel_ctx); + defragRadixTree(&c->pel, 0, defragStreamConsumerPelBucket, &pel_ctx); pelCacheInvalidate(c->pel); } return newc; /* returns NULL if c was not defragged */ } -/* Defrag a flax bucket in the group PEL. Only defrags the flax struct itself, - * not the NACKs (those are defragged via consumer PEL traversal). */ -void* defragStreamGroupPelFlax(raxIterator *ri, void *privdata) { +/* Defrag a bucket in the group PEL. Only defrags the flax struct itself, + * not the NACKs (those are defragged via consumer PEL traversal). + * Direct entries have no allocation to defrag. 
*/ +void* defragStreamGroupPelBucket(raxIterator *ri, void *privdata) { (void)privdata; - flax *f = ri->data; + void *bucket = ri->data; + if (PEL_IS_DIRECT(bucket)) return NULL; + flax *f = (flax *)bucket; flax *newf = activeDefragAlloc(f); if (newf) f = newf; void *newdata = activeDefragAlloc(f->data); @@ -954,7 +968,7 @@ void* defragStreamConsumerGroup(raxIterator *ri, void *privdata) { cg = newcg; if (cg->pel) { cg->pel->alloc_size = &s->alloc_size; - defragRadixTree(&cg->pel, 0, defragStreamGroupPelFlax, NULL); + defragRadixTree(&cg->pel, 0, defragStreamGroupPelBucket, NULL); pelCacheInvalidate(cg->pel); } if (cg->consumers) { diff --git a/src/stream.h b/src/stream.h index 72bf7d9e3..301dbe526 100644 --- a/src/stream.h +++ b/src/stream.h @@ -6,6 +6,27 @@ #include "dict.h" #include "xxhash.h" #include "flax.h" +#include <stdint.h> + +/* Tagged pointer helpers for PEL direct single-entry buckets. + * + * When a PEL rax bucket contains a single entry, the data pointer and the + * flax key (low byte of seq) are packed into a tagged pointer stored directly + * in the rax value, avoiding the flax struct + data block allocations. + * + * Layout (LP64, 64-bit pointers): + * Bit 0: 1 = direct entry, 0 = flax pointer + * Bits 56..63: flax key (uint8_t, low byte of seq) + * Bits 1..55: data pointer (heap-allocated, >=8-byte aligned) + */ +#define PEL_IS_FLAX(v) (((uintptr_t)(v) & 1) == 0) +#define PEL_IS_DIRECT(v) (((uintptr_t)(v) & 1) == 1) +#define PEL_DIRECT_ENCODE(ptr, fkey) \ + ((void *)(((uintptr_t)(uint8_t)(fkey) << 56) | (uintptr_t)(ptr) | 1)) +#define PEL_DIRECT_PTR(v) \ + ((void *)((uintptr_t)(v) & 0x00FFFFFFFFFFFFFEULL)) +#define PEL_DIRECT_FKEY(v) \ + ((uint8_t)((uintptr_t)(v) >> 56)) /* Stream item ID: a 128 bit number composed of a milliseconds time and * a sequence counter. 
IDs generated in the same millisecond (or in a past @@ -201,12 +222,15 @@ void streamKeyRemoved(redisDb *db, robj *key, robj *val); listNode *streamLinkCGroupToEntry(stream *s, streamCG *cg, streamID *id); -/* Two-level PEL iterator: walks outer rax (ms buckets) and inner flax (seq). */ +/* Two-level PEL iterator: walks outer rax (ms buckets) and inner flax (seq). + * Single-entry buckets are stored as tagged pointers (direct entries) without + * a flax allocation; is_direct tracks whether the current entry is direct. */ typedef struct pelIterator { raxIterator ri; flaxIterator fi; int valid; int just_seeked; + int is_direct; streamID id; void *data; unsigned char key[sizeof(streamID)]; diff --git a/src/t_stream.c b/src/t_stream.c index 5894940ea..172ba728e 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -62,21 +62,46 @@ static void pelListUnlink(streamCG *cg, streamNACK *nack); static void pelListUpdate(streamCG *cg, streamNACK *nack, mstime_t new_delivery_time); /* ----------------------------------------------------------------------- - * Two-level PEL: rax(ms -> flax(seq -> streamNACK*)) + * Two-level PEL: rax(ms -> direct | flax(seq -> data*)) + * + * When a bucket contains a single entry, the data pointer and the flax key + * (low byte of seq) are packed into a tagged pointer stored directly in the + * rax value, avoiding the flax struct + data block allocations (~32 bytes + * saved per single-entry bucket). + * + * Tagged pointer layout (64-bit, LP64): + * Bit 0: 1 = direct entry, 0 = flax pointer + * Bits 56..63: flax key (uint8_t, low byte of seq) + * Bits 1..55: data pointer (heap-allocated, >=8-byte aligned) * ----------------------------------------------------------------------- */ #define PEL_RAX_KEY_LEN 15 +_Static_assert(sizeof(void *) == 8, "PEL direct encoding requires 64-bit pointers"); + /* Cache embedded in rax metadata to speed up sequential PEL ops - * when consecutive operations target the same 15-byte rax bucket. 
*/ + * when consecutive operations target the same 15-byte rax bucket. + * When 'dirty' is set, the cached value has been created/updated but not yet + * inserted into the rax; it will be committed on the next cache eviction + * or explicit flush. The cached value may be a flax* or a direct entry. */ typedef struct pelCache { unsigned char key[PEL_RAX_KEY_LEN]; - flax *f; + void *val; + int dirty; } pelCache; +static void pelCacheFlush(rax *r) { + pelCache *cache = (pelCache *)r->metadata; + if (!cache->dirty) return; + raxInsert(r, cache->key, PEL_RAX_KEY_LEN, cache->val, NULL); + cache->dirty = 0; +} + void pelCacheInvalidate(rax *pel) { + pelCacheFlush(pel); pelCache *cache = (pelCache *)pel->metadata; - cache->f = NULL; + cache->val = NULL; + cache->dirty = 0; } /* Encode a 15-byte rax key: full 8B big-endian ms + upper 7 bytes of @@ -96,95 +121,141 @@ static inline uint8_t pelFlaxKey(uint64_t seq) { rax *pelNew(size_t *alloc_size) { rax *pel = raxNewWithMetadata(sizeof(pelCache), alloc_size); - if (pel) pelCacheInvalidate(pel); + if (pel) { + pelCache *cache = (pelCache *)pel->metadata; + cache->val = NULL; + cache->dirty = 0; + } return pel; } -/* Free all flax structures and call nack_free for each NACK. */ +/* Free all buckets and call nack_free for each data pointer. 
*/ void pelFree(rax *pel, void (*nack_free)(void *, void *), void *ctx) { if (!pel) return; + pelCacheFlush(pel); raxIterator ri; raxStart(&ri, pel); raxSeek(&ri, "^", NULL, 0); while (raxNext(&ri)) { - flax *f = ri.data; - if (pel->alloc_size) *pel->alloc_size -= flaxAllocSize(f); - if (nack_free) - flaxFreeWithCallback(f, nack_free, ctx); - else - flaxFree(f); + void *bucket = ri.data; + if (PEL_IS_DIRECT(bucket)) { + if (nack_free) nack_free(PEL_DIRECT_PTR(bucket), ctx); + } else { + flax *f = (flax *)bucket; + if (pel->alloc_size) *pel->alloc_size -= flaxAllocSize(f); + if (nack_free) + flaxFreeWithCallback(f, nack_free, ctx); + else + flaxFree(f); + } } raxStop(&ri); raxFreeWithCbAndContext(pel, NULL, NULL); } -/* Free flax structures without freeing NACKs (for consumer PEL where NACKs are shared). */ +/* Free buckets without freeing data (for consumer PEL where NACKs are shared). */ void pelFreeShallow(rax *pel) { if (!pel) return; + pelCacheFlush(pel); raxIterator ri; raxStart(&ri, pel); raxSeek(&ri, "^", NULL, 0); while (raxNext(&ri)) { - if (pel->alloc_size) *pel->alloc_size -= flaxAllocSize((flax *)ri.data); - flaxFree((flax *)ri.data); + void *bucket = ri.data; + if (PEL_IS_FLAX(bucket)) { + if (pel->alloc_size) *pel->alloc_size -= flaxAllocSize((flax *)bucket); + flaxFree((flax *)bucket); + } } raxStop(&ri); raxFree(pel); } -/* pelResolveFlax -- Resolve the flax bucket for a given 15-byte rax key. - * - * With the 15+1 byte key scheme, each rax key is the first 15 bytes of the - * big-endian encoded stream ID (full ms + upper 56 bits of seq). The flax - * stores the low byte of seq as its uint8_t key, so each bucket holds at - * most 256 entries and never needs splitting. - * - * Returns the flax bucket, or NULL if no matching bucket exists (create==0). - * When prev is non-NULL, *prev is set to the previously cached flax bucket - * before the cache is updated. 
*/ -static flax *pelResolveFlax(rax *r, unsigned char *keybuf, - int create, flax **prev) { - pelCache *cache = (pelCache *)r->metadata; - if (prev) *prev = cache->f; - - /* Cache hit */ - if (cache->f && memcmp(cache->key, keybuf, PEL_RAX_KEY_LEN) == 0) - return cache->f; - - /* Rax lookup (exact match) */ - void *data; - if (raxFind(r, keybuf, PEL_RAX_KEY_LEN, &data)) { - cache->f = (flax *)data; - memcpy(cache->key, keybuf, PEL_RAX_KEY_LEN); - return cache->f; - } - if (!create) return NULL; - - /* Create new bucket */ - flax *f = flaxNew(); - if (r->alloc_size) *r->alloc_size += flaxAllocSize(f); - raxInsert(r, keybuf, PEL_RAX_KEY_LEN, f, NULL); - cache->f = f; - memcpy(cache->key, keybuf, PEL_RAX_KEY_LEN); - return f; -} - /* Generic insert into two-level PEL. If 'overwrite' is true, an existing * entry's value is replaced; otherwise the insert is skipped when the key - * already exists. Returns 1 if a new entry was created, 0 otherwise. */ + * already exists. Returns 1 if a new entry was created, 0 otherwise. + * + * New buckets are created as direct entries (tagged pointers). When a second + * entry is inserted into a direct bucket with a different flax key, the + * bucket is promoted to a flax. */ static int pelGenericInsert(rax *pel, streamID *id, void *data, uint64_t *count, int overwrite) { unsigned char keybuf[PEL_RAX_KEY_LEN]; pelEncodeRaxKey(keybuf, id->ms, id->seq); - flax *prev; - flax *f = pelResolveFlax(pel, keybuf, 1, &prev); - if (prev && prev != f) { - size_t before = flaxAllocSize(prev); - flaxShrink(prev); - if (pel->alloc_size) *pel->alloc_size -= before - flaxAllocSize(prev); + uint8_t fkey = pelFlaxKey(id->seq); + pelCache *cache = (pelCache *)pel->metadata; + + /* Check cache for this key. */ + int cache_hit = (cache->val != NULL && + memcmp(cache->key, keybuf, PEL_RAX_KEY_LEN) == 0); + + void *bucket; + if (cache_hit) { + bucket = cache->val; + } else { + /* Switching away from previous bucket: shrink it if it was a flax. 
*/ + if (cache->val && PEL_IS_FLAX(cache->val)) { + flax *prev = (flax *)cache->val; + size_t before = flaxAllocSize(prev); + flaxShrink(prev); + if (pel->alloc_size) *pel->alloc_size -= before - flaxAllocSize(prev); + } + pelCacheFlush(pel); + + /* Check rax. */ + void *raxval; + if (raxFind(pel, keybuf, PEL_RAX_KEY_LEN, &raxval)) { + bucket = raxval; + cache->val = bucket; + memcpy(cache->key, keybuf, PEL_RAX_KEY_LEN); + } else { + bucket = NULL; + } } + + /* No existing bucket: store as direct entry. */ + if (!bucket) { + cache->val = PEL_DIRECT_ENCODE(data, fkey); + memcpy(cache->key, keybuf, PEL_RAX_KEY_LEN); + cache->dirty = 1; + if (count) (*count)++; + return 1; + } + + /* Existing direct entry. */ + if (PEL_IS_DIRECT(bucket)) { + uint8_t efkey = PEL_DIRECT_FKEY(bucket); + + if (efkey == fkey) { + if (overwrite) { + void *nv = PEL_DIRECT_ENCODE(data, fkey); + cache->val = nv; + if (!cache->dirty) + raxInsert(pel, keybuf, PEL_RAX_KEY_LEN, nv, NULL); + } + return 0; + } + + /* Different fkey: promote to flax. */ + void *eptr = PEL_DIRECT_PTR(bucket); + flax *f = flaxNew(); + if (pel->alloc_size) *pel->alloc_size += flaxAllocSize(f); + flaxInsert(f, efkey, eptr, NULL); + size_t before = flaxAllocSize(f); + int inserted = overwrite ? flaxInsert(f, fkey, data, NULL) + : flaxTryInsert(f, fkey, data, NULL); + if (pel->alloc_size) *pel->alloc_size += flaxAllocSize(f) - before; + cache->val = f; + if (!cache->dirty) + raxInsert(pel, keybuf, PEL_RAX_KEY_LEN, f, NULL); + if (inserted && count) (*count)++; + return inserted; + } + + /* Existing flax bucket. */ + flax *f = (flax *)bucket; size_t before = flaxAllocSize(f); - int inserted = overwrite ? flaxInsert(f, pelFlaxKey(id->seq), data, NULL) - : flaxTryInsert(f, pelFlaxKey(id->seq), data, NULL); + int inserted = overwrite ? 
flaxInsert(f, fkey, data, NULL) + : flaxTryInsert(f, fkey, data, NULL); if (pel->alloc_size) *pel->alloc_size += flaxAllocSize(f) - before; if (inserted && count) (*count)++; return inserted; @@ -203,64 +274,217 @@ int pelTryInsert(rax *pel, streamID *id, void *data, uint64_t *count) { return pelGenericInsert(pel, id, data, count, 0); } -/* Replace the NACK pointer for an existing entry without cache interaction or +/* Replace the data pointer for an existing entry without cache interaction or * flax shrink side-effects. Intended for defrag, where the key is guaranteed * to exist and we must avoid allocations that would increase fragmentation. */ void pelReplace(rax *pel, streamID *id, void *data) { unsigned char keybuf[PEL_RAX_KEY_LEN]; pelEncodeRaxKey(keybuf, id->ms, id->seq); - void *raxval; - int found = raxFind(pel, keybuf, PEL_RAX_KEY_LEN, &raxval); - serverAssert(found); - flax *f = (flax *)raxval; - flaxInsert(f, pelFlaxKey(id->seq), data, NULL); + uint8_t fkey = pelFlaxKey(id->seq); + pelCache *cache = (pelCache *)pel->metadata; + + int cache_hit = (cache->val != NULL && + memcmp(cache->key, keybuf, PEL_RAX_KEY_LEN) == 0); + void *bucket; + if (cache_hit) { + bucket = cache->val; + } else { + int found = raxFind(pel, keybuf, PEL_RAX_KEY_LEN, &bucket); + serverAssert(found); + } + + if (PEL_IS_DIRECT(bucket)) { + serverAssert(PEL_DIRECT_FKEY(bucket) == fkey); + void *nv = PEL_DIRECT_ENCODE(data, fkey); + if (cache_hit) { + cache->val = nv; + if (!cache->dirty) + raxInsert(pel, keybuf, PEL_RAX_KEY_LEN, nv, NULL); + } else { + raxInsert(pel, keybuf, PEL_RAX_KEY_LEN, nv, NULL); + } + } else { + flaxInsert((flax *)bucket, fkey, data, NULL); + } } /* Find a value by streamID. Returns 1 if found (setting *data), 0 if not. 
*/ int pelFind(rax *pel, streamID *id, void **data) { unsigned char keybuf[PEL_RAX_KEY_LEN]; pelEncodeRaxKey(keybuf, id->ms, id->seq); - flax *f = pelResolveFlax(pel, keybuf, 0, NULL); - if (!f) return 0; + uint8_t fkey = pelFlaxKey(id->seq); + pelCache *cache = (pelCache *)pel->metadata; + + void *bucket; + if (cache->val && memcmp(cache->key, keybuf, PEL_RAX_KEY_LEN) == 0) { + bucket = cache->val; + } else { + pelCacheFlush(pel); + if (!raxFind(pel, keybuf, PEL_RAX_KEY_LEN, &bucket)) return 0; + } + + if (PEL_IS_DIRECT(bucket)) { + if (PEL_DIRECT_FKEY(bucket) != fkey) return 0; + if (data) *data = PEL_DIRECT_PTR(bucket); + return 1; + } + void *val; - if (!flaxFind(f, pelFlaxKey(id->seq), &val)) return 0; + if (!flaxFind((flax *)bucket, fkey, &val)) return 0; if (data) *data = val; return 1; } -/* Remove a value by streamID. Returns the removed value or NULL. */ +/* Remove a value by streamID. Returns the removed value or NULL. + * When a flax bucket drops to 1 entry, it is demoted to a direct entry. */ void *pelRemove(rax *pel, streamID *id, uint64_t *count) { unsigned char keybuf[PEL_RAX_KEY_LEN]; pelEncodeRaxKey(keybuf, id->ms, id->seq); - flax *f = pelResolveFlax(pel, keybuf, 0, NULL); - if (!f) return NULL; + uint8_t fkey = pelFlaxKey(id->seq); + pelCache *cache = (pelCache *)pel->metadata; + + int cache_hit = (cache->val != NULL && + memcmp(cache->key, keybuf, PEL_RAX_KEY_LEN) == 0); + void *bucket; + if (cache_hit) { + bucket = cache->val; + } else { + pelCacheFlush(pel); + if (!raxFind(pel, keybuf, PEL_RAX_KEY_LEN, &bucket)) return NULL; + } + + /* Direct entry. */ + if (PEL_IS_DIRECT(bucket)) { + if (PEL_DIRECT_FKEY(bucket) != fkey) return NULL; + void *old = PEL_DIRECT_PTR(bucket); + if (count) (*count)--; + if (cache_hit && cache->dirty) { + cache->val = NULL; + cache->dirty = 0; + } else { + raxRemove(pel, keybuf, PEL_RAX_KEY_LEN, NULL); + pelCacheInvalidate(pel); + } + return old; + } + + /* Flax bucket. 
*/ + flax *f = (flax *)bucket; void *old; - if (!flaxRemove(f, pelFlaxKey(id->seq), &old)) return NULL; + if (!flaxRemove(f, fkey, &old)) return NULL; if (count) (*count)--; + if (f->numele == 0) { if (pel->alloc_size) *pel->alloc_size -= flaxAllocSize(f); flaxFree(f); - raxRemove(pel, keybuf, PEL_RAX_KEY_LEN, NULL); - pelCacheInvalidate(pel); + if (cache_hit && cache->dirty) { + cache->val = NULL; + cache->dirty = 0; + } else { + raxRemove(pel, keybuf, PEL_RAX_KEY_LEN, NULL); + pelCacheInvalidate(pel); + } + } else if (f->numele == 1) { + /* Demote to direct entry. */ + flaxIterator fi; + flaxStart(&fi, f); + flaxSeek(&fi, "^", 0); + void *direct = PEL_DIRECT_ENCODE(fi.data, fi.key); + flaxStop(&fi); + if (pel->alloc_size) *pel->alloc_size -= flaxAllocSize(f); + flaxFree(f); + if (cache_hit) { + cache->val = direct; + if (!cache->dirty) + raxInsert(pel, keybuf, PEL_RAX_KEY_LEN, direct, NULL); + } else { + raxInsert(pel, keybuf, PEL_RAX_KEY_LEN, direct, NULL); + } } + return old; } /* --- PEL Iterator --- */ -/* Refresh iterator fields from current rax+flax positions. */ +/* Refresh iterator fields from current rax+flax positions (flax bucket). */ static void pelIterRefresh(pelIterator *pi) { memcpy(pi->key, pi->ri.key, PEL_RAX_KEY_LEN); pi->key[PEL_RAX_KEY_LEN] = (unsigned char)pi->fi.key; streamDecodeID(pi->key, &pi->id); pi->data = pi->fi.data; pi->valid = 1; + pi->is_direct = 0; +} + +/* Refresh iterator fields from a direct entry in the current rax node. */ +static void pelIterRefreshDirect(pelIterator *pi) { + void *bucket = pi->ri.data; + memcpy(pi->key, pi->ri.key, PEL_RAX_KEY_LEN); + pi->key[PEL_RAX_KEY_LEN] = PEL_DIRECT_FKEY(bucket); + streamDecodeID(pi->key, &pi->id); + pi->data = PEL_DIRECT_PTR(bucket); + pi->valid = 1; + pi->is_direct = 1; +} + +/* Position the iterator on the first valid entry starting from the current + * rax position (calling raxNext first). Returns 1 on success, 0 if no more. 
*/ +static int pelIterAdvanceRax(pelIterator *pi) { + while (raxNext(&pi->ri)) { + void *bucket = pi->ri.data; + if (PEL_IS_DIRECT(bucket)) { + pelIterRefreshDirect(pi); + return 1; + } + flaxStart(&pi->fi, (flax *)bucket); + if (flaxSeek(&pi->fi, "^", 0)) { + pelIterRefresh(pi); + return 1; + } + } + pi->valid = 0; + return 0; +} + +/* Position the iterator on the first entry of the current rax node's bucket. + * Returns 1 on success, 0 if bucket is empty (should not happen). */ +static int pelIterEnterBucketHead(pelIterator *pi) { + void *bucket = pi->ri.data; + if (PEL_IS_DIRECT(bucket)) { + pelIterRefreshDirect(pi); + return 1; + } + flaxStart(&pi->fi, (flax *)bucket); + if (flaxSeek(&pi->fi, "^", 0)) { + pelIterRefresh(pi); + return 1; + } + return 0; +} + +/* Position the iterator on the last entry of the current rax node's bucket. + * Returns 1 on success, 0 if bucket is empty. */ +static int pelIterEnterBucketTail(pelIterator *pi) { + void *bucket = pi->ri.data; + if (PEL_IS_DIRECT(bucket)) { + pelIterRefreshDirect(pi); + return 1; + } + flaxStart(&pi->fi, (flax *)bucket); + if (flaxSeek(&pi->fi, "$", 0)) { + pelIterRefresh(pi); + return 1; + } + return 0; } void pelIterStart(pelIterator *pi, rax *pel) { + pelCacheFlush(pel); raxStart(&pi->ri, pel); pi->valid = 0; pi->just_seeked = 0; + pi->is_direct = 0; memset(&pi->fi, 0, sizeof(pi->fi)); memset(&pi->id, 0, sizeof(pi->id)); pi->data = NULL; @@ -272,17 +496,13 @@ int pelIterSeek(pelIterator *pi, const char *op, streamID *id) { if (op[0] == '^') { raxSeek(&pi->ri, "^", NULL, 0); if (!raxNext(&pi->ri)) return 0; - flaxStart(&pi->fi, (flax *)pi->ri.data); - if (!flaxSeek(&pi->fi, "^", 0)) return 0; - pelIterRefresh(pi); + if (!pelIterEnterBucketHead(pi)) return 0; pi->just_seeked = 1; return 1; } else if (op[0] == '$') { raxSeek(&pi->ri, "$", NULL, 0); if (!raxNext(&pi->ri)) return 0; - flaxStart(&pi->fi, (flax *)pi->ri.data); - if (!flaxSeek(&pi->fi, "$", 0)) return 0; - pelIterRefresh(pi); + if 
(!pelIterEnterBucketTail(pi)) return 0; pi->just_seeked = 1; return 1; } else if (op[0] == '>' && op[1] == '=') { @@ -290,38 +510,40 @@ int pelIterSeek(pelIterator *pi, const char *op, streamID *id) { pelEncodeRaxKey(keybuf, id->ms, id->seq); uint8_t fkey = pelFlaxKey(id->seq); - /* Seek to the largest rax key <= target 15-byte key. */ raxSeek(&pi->ri, "<=", keybuf, PEL_RAX_KEY_LEN); if (!raxNext(&pi->ri)) { - /* All rax keys are > target, start from the very first bucket. */ raxSeek(&pi->ri, "^", NULL, 0); if (!raxNext(&pi->ri)) return 0; - flaxStart(&pi->fi, (flax *)pi->ri.data); - if (!flaxSeek(&pi->fi, "^", 0)) return 0; - pelIterRefresh(pi); + if (!pelIterEnterBucketHead(pi)) return 0; pi->just_seeked = 1; return 1; } int cmp = memcmp(pi->ri.key, keybuf, PEL_RAX_KEY_LEN); - flaxStart(&pi->fi, (flax *)pi->ri.data); + void *bucket = pi->ri.data; + if (cmp == 0) { - /* Exact rax key match — seek flax for >= fkey. */ - if (!flaxSeek(&pi->fi, ">=", fkey)) { - if (!raxNext(&pi->ri)) return 0; - flaxStart(&pi->fi, (flax *)pi->ri.data); - if (!flaxSeek(&pi->fi, "^", 0)) return 0; + if (PEL_IS_DIRECT(bucket)) { + if (PEL_DIRECT_FKEY(bucket) >= fkey) { + pelIterRefreshDirect(pi); + } else { + if (!pelIterAdvanceRax(pi)) return 0; + } + } else { + flaxStart(&pi->fi, (flax *)bucket); + if (!flaxSeek(&pi->fi, ">=", fkey)) { + if (!pelIterAdvanceRax(pi)) return 0; + } else { + pelIterRefresh(pi); + } } } else if (cmp < 0) { - /* Landed in an earlier bucket, advance to next. */ - if (!raxNext(&pi->ri)) return 0; - flaxStart(&pi->fi, (flax *)pi->ri.data); - if (!flaxSeek(&pi->fi, "^", 0)) return 0; + if (!pelIterAdvanceRax(pi)) return 0; } else { - /* rax key > target: start from head of this bucket. 
*/ - if (!flaxSeek(&pi->fi, "^", 0)) return 0; + if (!pelIterEnterBucketHead(pi)) { + if (!pelIterAdvanceRax(pi)) return 0; + } } - pelIterRefresh(pi); pi->just_seeked = 1; return 1; } @@ -334,20 +556,13 @@ int pelIterNext(pelIterator *pi) { return pi->valid; } if (!pi->valid) return 0; - if (flaxNext(&pi->fi)) { + + if (!pi->is_direct && flaxNext(&pi->fi)) { pelIterRefresh(pi); return 1; } - /* Current flax exhausted, advance to next non-empty bucket. */ - do { - if (!raxNext(&pi->ri)) { - pi->valid = 0; - return 0; - } - flaxStart(&pi->fi, (flax *)pi->ri.data); - } while (!flaxSeek(&pi->fi, "^", 0)); - pelIterRefresh(pi); - return 1; + + return pelIterAdvanceRax(pi); } void pelIterStop(pelIterator *pi) { diff --git a/tests/unit/type/stream-cgroups.tcl b/tests/unit/type/stream-cgroups.tcl index 26021b91d..71d932d01 100644 --- a/tests/unit/type/stream-cgroups.tcl +++ b/tests/unit/type/stream-cgroups.tcl @@ -3475,4 +3475,187 @@ start_server { } assert_equal [llength $collected] 800 } + + test "Two-level PEL direct entries with 1 msg/ms pattern" { + r DEL mystream + + # Add entries with unique ms values (1 msg per ms), each with seq=0. + # This exercises the direct entry path: each rax bucket has 1 entry. + for {set i 0} {$i < 100} {incr i} { + set ms [expr {10000 + $i}] + r XADD mystream $ms-0 field value$i + } + + r XGROUP CREATE mystream grp 0 + r XREADGROUP GROUP grp consumer1 COUNT 100 STREAMS mystream > + + set summary [r XPENDING mystream grp] + assert_equal [lindex $summary 0] 100 + + # Verify iteration returns all entries in order. + set all [r XPENDING mystream grp - + 200] + assert_equal [llength $all] 100 + assert_equal [lindex $all 0 0] "10000-0" + assert_equal [lindex $all end 0] "10099-0" + + # ACK some entries to test direct entry removal. + r XACK mystream grp 10000-0 10050-0 10099-0 + set summary [r XPENDING mystream grp] + assert_equal [lindex $summary 0] 97 + + # Verify boundaries after ACK. 
+ set all [r XPENDING mystream grp - + 200] + assert_equal [llength $all] 97 + assert_equal [lindex $all 0 0] "10001-0" + assert_equal [lindex $all end 0] "10098-0" + } + + test "Two-level PEL direct-to-flax promotion and flax-to-direct demotion" { + r DEL mystream + + # Create two entries sharing the same ms but different seq. + # First entry creates a direct bucket; second promotes to flax. + r XADD mystream 20000-0 field val0 + r XADD mystream 20000-1 field val1 + + r XGROUP CREATE mystream grp 0 + r XREADGROUP GROUP grp consumer1 COUNT 2 STREAMS mystream > + + set summary [r XPENDING mystream grp] + assert_equal [lindex $summary 0] 2 + + set all [r XPENDING mystream grp - + 10] + assert_equal [llength $all] 2 + assert_equal [lindex $all 0 0] "20000-0" + assert_equal [lindex $all 1 0] "20000-1" + + # ACK one entry: flax drops to 1 element, should demote to direct. + r XACK mystream grp 20000-0 + set summary [r XPENDING mystream grp] + assert_equal [lindex $summary 0] 1 + set all [r XPENDING mystream grp - + 10] + assert_equal [lindex $all 0 0] "20000-1" + + # ACK the last entry: bucket removed entirely. + r XACK mystream grp 20000-1 + set summary [r XPENDING mystream grp] + assert_equal [lindex $summary 0] 0 + } + + test "Two-level PEL direct entries with XCLAIM across consumers" { + r DEL mystream + + # 1 msg/ms pattern + for {set i 0} {$i < 10} {incr i} { + set ms [expr {30000 + $i}] + r XADD mystream $ms-0 field value$i + } + + r XGROUP CREATE mystream grp 0 + r XREADGROUP GROUP grp consumer1 COUNT 10 STREAMS mystream > + + # XCLAIM first 5 entries to consumer2 + for {set i 0} {$i < 5} {incr i} { + set ms [expr {30000 + $i}] + r XCLAIM mystream grp consumer2 0 $ms-0 + } + + set pending_c1 [r XPENDING mystream grp - + 20 consumer1] + set pending_c2 [r XPENDING mystream grp - + 20 consumer2] + assert_equal [llength $pending_c1] 5 + assert_equal [llength $pending_c2] 5 + + # Verify ordering in each consumer's PEL. 
+ assert_equal [lindex $pending_c1 0 0] "30005-0" + assert_equal [lindex $pending_c1 end 0] "30009-0" + assert_equal [lindex $pending_c2 0 0] "30000-0" + assert_equal [lindex $pending_c2 end 0] "30004-0" + } + + test "Two-level PEL direct entries survive RDB save/load" { + r DEL mystream + + # 1 msg/ms pattern + for {set i 0} {$i < 50} {incr i} { + set ms [expr {40000 + $i}] + r XADD mystream $ms-0 field value$i + } + + r XGROUP CREATE mystream grp 0 + r XREADGROUP GROUP grp consumer1 COUNT 50 STREAMS mystream > + + r DEBUG RELOAD + + set summary [r XPENDING mystream grp] + assert_equal [lindex $summary 0] 50 + + set all [r XPENDING mystream grp - + 100] + assert_equal [llength $all] 50 + assert_equal [lindex $all 0 0] "40000-0" + assert_equal [lindex $all end 0] "40049-0" + } {} {needs:debug} + + test "Two-level PEL mixed direct and flax buckets" { + r DEL mystream + + # Create a mix: some ms values with 1 entry (direct) and some with + # multiple entries (flax). This exercises iteration across both types. + for {set ms 50000} {$ms < 50005} {incr ms} { + r XADD mystream $ms-0 field val-$ms-0 + } + # Add multiple entries under ms=50005 (will be flax after promotion) + for {set seq 0} {$seq < 5} {incr seq} { + r XADD mystream 50005-$seq field val-50005-$seq + } + # More single-entry ms values after the flax bucket + for {set ms 50006} {$ms < 50010} {incr ms} { + r XADD mystream $ms-0 field val-$ms-0 + } + + r XGROUP CREATE mystream grp 0 + r XREADGROUP GROUP grp consumer1 COUNT 100 STREAMS mystream > + + set summary [r XPENDING mystream grp] + assert_equal [lindex $summary 0] 14 + + set all [r XPENDING mystream grp - + 20] + assert_equal [llength $all] 14 + + # Verify full ordering across direct and flax buckets. 
+ set prev_ms 0 + set prev_seq -1 + foreach entry $all { + set id [lindex $entry 0] + set parts [split $id -] + set ms [lindex $parts 0] + set seq [lindex $parts 1] + if {$ms == $prev_ms} { + assert {$seq > $prev_seq} + } else { + assert {$ms > $prev_ms} + } + set prev_ms $ms + set prev_seq $seq + } + + assert_equal [lindex $all 0 0] "50000-0" + assert_equal [lindex $all end 0] "50009-0" + + # ACK all the flax bucket entries except one -> demote to direct + r XACK mystream grp 50005-0 50005-1 50005-2 50005-3 + set summary [r XPENDING mystream grp] + assert_equal [lindex $summary 0] 10 + + set all [r XPENDING mystream grp - + 20] + assert_equal [llength $all] 10 + + # Verify the remaining entry from the former flax bucket + set found 0 + foreach entry $all { + if {[lindex $entry 0] eq "50005-4"} { + set found 1 + } + } + assert_equal $found 1 + } } From e88721d9fd518422b867e4b9ed4f4e58c3d43f25 Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Sun, 12 Apr 2026 17:34:50 +0300 Subject: [PATCH 35/48] fixed: issue from review --- src/defrag.c | 19 ++-- src/stream.h | 55 +++++----- src/t_stream.c | 272 +++++++++++++++++++++++++++++-------------------- 3 files changed, 190 insertions(+), 156 deletions(-) diff --git a/src/defrag.c b/src/defrag.c index 1c34186fb..03ffee20b 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -879,25 +879,23 @@ static void defragNackFixup(PendingEntryContext *ctx, streamNACK *newnack) { /* Defrag a bucket in the consumer PEL. Each value is a NACK shared * with the group PEL, so we update pointers in both places. - * Handles both direct (single-entry) and flax buckets. */ + * Handles both direct (16-byte key) and flax (15-byte key) buckets. 
*/ void* defragStreamConsumerPelBucket(raxIterator *ri, void *privdata) { PendingEntryContext *ctx = privdata; - void *bucket = ri->data; - if (PEL_IS_DIRECT(bucket)) { - streamNACK *nack = PEL_DIRECT_PTR(bucket); - uint8_t fkey = PEL_DIRECT_FKEY(bucket); + if (ri->key_len == PEL_RAX_DIRECT_KEYLEN) { + streamNACK *nack = ri->data; nack->consumer = ctx->c; nack->cgroup_ref_node->value = ctx->cg; streamNACK *newnack = activeDefragAlloc(nack); if (newnack) { defragNackFixup(ctx, newnack); - return PEL_DIRECT_ENCODE(newnack, fkey); + return newnack; } return NULL; } - flax *f = (flax *)bucket; + flax *f = (flax *)ri->data; flax *newflax = activeDefragAlloc(f); if (newflax) f = newflax; void *newdata = activeDefragAlloc(f->data); @@ -948,12 +946,11 @@ void* defragStreamConsumer(raxIterator *ri, void *privdata) { /* Defrag a bucket in the group PEL. Only defrags the flax struct itself, * not the NACKs (those are defragged via consumer PEL traversal). - * Direct entries have no allocation to defrag. */ + * Direct entries (16-byte key) have no allocation to defrag. */ void* defragStreamGroupPelBucket(raxIterator *ri, void *privdata) { (void)privdata; - void *bucket = ri->data; - if (PEL_IS_DIRECT(bucket)) return NULL; - flax *f = (flax *)bucket; + if (ri->key_len == PEL_RAX_DIRECT_KEYLEN) return NULL; + flax *f = (flax *)ri->data; flax *newf = activeDefragAlloc(f); if (newf) f = newf; void *newdata = activeDefragAlloc(f->data); diff --git a/src/stream.h b/src/stream.h index 301dbe526..56949d6bc 100644 --- a/src/stream.h +++ b/src/stream.h @@ -8,25 +8,15 @@ #include "flax.h" #include -/* Tagged pointer helpers for PEL direct single-entry buckets. +/* Two-level PEL key-length convention: * - * When a PEL rax bucket contains a single entry, the data pointer and the - * flax key (low byte of seq) are packed into a tagged pointer stored directly - * in the rax value, avoiding the flax struct + data block allocations. 
- * - * Layout (LP64, 64-bit pointers): - * Bit 0: 1 = direct entry, 0 = flax pointer - * Bits 56..63: flax key (uint8_t, low byte of seq) - * Bits 1..55: data pointer (heap-allocated, >=8-byte aligned) - */ -#define PEL_IS_FLAX(v) (((uintptr_t)(v) & 1) == 0) -#define PEL_IS_DIRECT(v) (((uintptr_t)(v) & 1) == 1) -#define PEL_DIRECT_ENCODE(ptr, fkey) \ - ((void *)(((uintptr_t)(uint8_t)(fkey) << 56) | (uintptr_t)(ptr) | 1)) -#define PEL_DIRECT_PTR(v) \ - ((void *)((uintptr_t)(v) & 0x00FFFFFFFFFFFFFEULL)) -#define PEL_DIRECT_FKEY(v) \ - ((uint8_t)((uintptr_t)(v) >> 56)) + * Single-entry ("direct") buckets use a 16-byte rax key (the complete + * big-endian streamID) and store the data pointer directly as the rax + * value. Multi-entry ("flax") buckets use a 15-byte rax key (ms + upper + * 7 bytes of seq) and store a flax* mapping the low byte of seq to data + * pointers. The key length in rax (16 vs 15) distinguishes the two. */ +#define PEL_RAX_DIRECT_KEYLEN sizeof(streamID) /* 16 */ +#define PEL_RAX_FLAX_KEYLEN 15 /* Stream item ID: a 128 bit number composed of a milliseconds time and * a sequence counter. IDs generated in the same millisecond (or in a past @@ -66,8 +56,9 @@ typedef struct stream { rax *cgroups; /* Consumer groups dictionary: name -> streamCG */ rax *cgroups_ref; /* Two-level index mapping message IDs to their consumer groups. Same key scheme as PEL: - outer rax(15-byte prefix) -> flax(low byte - of seq -> list* of streamCG pointers). */ + outer rax -> flax(low byte of seq -> list* + of streamCG pointers). Direct / flax key + lengths follow the PEL convention (16/15). */ streamID min_cgroup_last_id; /* The minimum ID of consume group. */ unsigned int min_cgroup_last_id_valid: 1; uint64_t idmp_duration; /* IDMP duration in seconds. */ @@ -124,12 +115,12 @@ typedef struct streamCG { group reads. In the real world, the reasoning behind this value is detailed at the top comment of streamEstimateDistanceFromFirstEverEntry(). 
*/ - rax *pel; /* Two-level pending entries list. The outer rax is - keyed by a 15-byte prefix of the big-endian - encoded stream ID (full ms + upper 56 bits of - seq). Each value is a flax* mapping the low - byte of seq (uint8_t) to streamNACK pointers. - Max 256 entries per bucket, no splitting. */ + rax *pel; /* Two-level pending entries list. Single-entry + buckets (direct) use a 16-byte rax key (full + big-endian streamID). Multi-entry buckets use + a 15-byte key (ms + upper 7 bytes of seq) and + store a flax* mapping the low byte of seq to + streamNACK pointers. Max 256 per bucket. */ uint64_t pel_count; /* Total number of NACK entries across all flax buckets. */ streamNACK *pel_time_head; /* Head of time-ordered doubly-linked list of pending entries (oldest delivery_time). Used for efficient @@ -150,9 +141,9 @@ typedef struct streamConsumer { will be identified in the consumer group protocol. Case sensitive. */ rax *pel; /* Two-level consumer PEL: same structure as - streamCG.pel — 15-byte rax key (first 15 - bytes of encoded ID), flax(low byte of seq - -> NACK*). NACKs are shared with group PEL. */ + streamCG.pel — 16-byte key for direct, + 15-byte key for flax buckets. NACKs are + shared with group PEL. */ uint64_t pel_count; /* Total NACK count for this consumer. */ } streamConsumer; @@ -222,9 +213,9 @@ void streamKeyRemoved(redisDb *db, robj *key, robj *val); listNode *streamLinkCGroupToEntry(stream *s, streamCG *cg, streamID *id); -/* Two-level PEL iterator: walks outer rax (ms buckets) and inner flax (seq). - * Single-entry buckets are stored as tagged pointers (direct entries) without - * a flax allocation; is_direct tracks whether the current entry is direct. */ +/* Two-level PEL iterator: walks outer rax and inner flax. + * Direct entries (16-byte rax key) have no flax allocation; + * is_direct tracks whether the current entry is direct. 
*/ typedef struct pelIterator { raxIterator ri; flaxIterator fi; diff --git a/src/t_stream.c b/src/t_stream.c index 172ba728e..4751fc6f3 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -61,39 +61,40 @@ static void pelListInsertAtTail(streamCG *cg, streamNACK *nack); static void pelListUnlink(streamCG *cg, streamNACK *nack); static void pelListUpdate(streamCG *cg, streamNACK *nack, mstime_t new_delivery_time); +void streamEncodeID(void *buf, streamID *id); + /* ----------------------------------------------------------------------- - * Two-level PEL: rax(ms -> direct | flax(seq -> data*)) + * Two-level PEL: rax -> direct | flax(seq -> data*) * - * When a bucket contains a single entry, the data pointer and the flax key - * (low byte of seq) are packed into a tagged pointer stored directly in the - * rax value, avoiding the flax struct + data block allocations (~32 bytes - * saved per single-entry bucket). - * - * Tagged pointer layout (64-bit, LP64): - * Bit 0: 1 = direct entry, 0 = flax pointer - * Bits 56..63: flax key (uint8_t, low byte of seq) - * Bits 1..55: data pointer (heap-allocated, >=8-byte aligned) + * Multi-entry buckets use a 15-byte rax key (full ms + upper 7 bytes of + * big-endian seq) and store a flax* mapping the low byte of seq to data + * pointers. Single-entry buckets ("direct entries") use a 16-byte rax + * key (the complete big-endian streamID) and store the raw data pointer + * directly in the rax value, avoiding the flax allocation. The key + * length in rax (16 vs 15) distinguishes the two bucket types. * ----------------------------------------------------------------------- */ -#define PEL_RAX_KEY_LEN 15 +#define PEL_RAX_KEY_LEN PEL_RAX_FLAX_KEYLEN -_Static_assert(sizeof(void *) == 8, "PEL direct encoding requires 64-bit pointers"); /* Cache embedded in rax metadata to speed up sequential PEL ops - * when consecutive operations target the same 15-byte rax bucket. + * when consecutive operations target the same ID prefix. 
* When 'dirty' is set, the cached value has been created/updated but not yet * inserted into the rax; it will be committed on the next cache eviction - * or explicit flush. The cached value may be a flax* or a direct entry. */ + * or explicit flush. 'is_direct' mirrors the key-length convention: + * 1 = direct (16-byte key, raw data pointer), 0 = flax (15-byte key). */ typedef struct pelCache { - unsigned char key[PEL_RAX_KEY_LEN]; + unsigned char key[PEL_RAX_DIRECT_KEYLEN]; void *val; int dirty; + int is_direct; } pelCache; static void pelCacheFlush(rax *r) { pelCache *cache = (pelCache *)r->metadata; if (!cache->dirty) return; - raxInsert(r, cache->key, PEL_RAX_KEY_LEN, cache->val, NULL); + size_t keylen = cache->is_direct ? PEL_RAX_DIRECT_KEYLEN : PEL_RAX_KEY_LEN; + raxInsert(r, cache->key, keylen, cache->val, NULL); cache->dirty = 0; } @@ -102,6 +103,7 @@ void pelCacheInvalidate(rax *pel) { pelCache *cache = (pelCache *)pel->metadata; cache->val = NULL; cache->dirty = 0; + cache->is_direct = 0; } /* Encode a 15-byte rax key: full 8B big-endian ms + upper 7 bytes of @@ -125,6 +127,7 @@ rax *pelNew(size_t *alloc_size) { pelCache *cache = (pelCache *)pel->metadata; cache->val = NULL; cache->dirty = 0; + cache->is_direct = 0; } return pel; } @@ -137,11 +140,10 @@ void pelFree(rax *pel, void (*nack_free)(void *, void *), void *ctx) { raxStart(&ri, pel); raxSeek(&ri, "^", NULL, 0); while (raxNext(&ri)) { - void *bucket = ri.data; - if (PEL_IS_DIRECT(bucket)) { - if (nack_free) nack_free(PEL_DIRECT_PTR(bucket), ctx); + if (ri.key_len == PEL_RAX_DIRECT_KEYLEN) { + if (nack_free) nack_free(ri.data, ctx); } else { - flax *f = (flax *)bucket; + flax *f = (flax *)ri.data; if (pel->alloc_size) *pel->alloc_size -= flaxAllocSize(f); if (nack_free) flaxFreeWithCallback(f, nack_free, ctx); @@ -161,10 +163,10 @@ void pelFreeShallow(rax *pel) { raxStart(&ri, pel); raxSeek(&ri, "^", NULL, 0); while (raxNext(&ri)) { - void *bucket = ri.data; - if (PEL_IS_FLAX(bucket)) { - if 
(pel->alloc_size) *pel->alloc_size -= flaxAllocSize((flax *)bucket); - flaxFree((flax *)bucket); + if (ri.key_len == PEL_RAX_FLAX_KEYLEN) { + flax *f = (flax *)ri.data; + if (pel->alloc_size) *pel->alloc_size -= flaxAllocSize(f); + flaxFree(f); } } raxStop(&ri); @@ -175,25 +177,27 @@ void pelFreeShallow(rax *pel) { * entry's value is replaced; otherwise the insert is skipped when the key * already exists. Returns 1 if a new entry was created, 0 otherwise. * - * New buckets are created as direct entries (tagged pointers). When a second - * entry is inserted into a direct bucket with a different flax key, the - * bucket is promoted to a flax. */ + * New buckets are created as direct entries (16-byte rax key). When a second + * entry is inserted into a direct bucket with a different fkey, the bucket + * is promoted to a flax (15-byte rax key). */ static int pelGenericInsert(rax *pel, streamID *id, void *data, uint64_t *count, int overwrite) { - unsigned char keybuf[PEL_RAX_KEY_LEN]; - pelEncodeRaxKey(keybuf, id->ms, id->seq); - uint8_t fkey = pelFlaxKey(id->seq); + unsigned char fullkey[PEL_RAX_DIRECT_KEYLEN]; + streamEncodeID(fullkey, id); + uint8_t fkey = fullkey[PEL_RAX_KEY_LEN]; pelCache *cache = (pelCache *)pel->metadata; - /* Check cache for this key. */ + /* Cache lookup: compare the 15-byte prefix. */ int cache_hit = (cache->val != NULL && - memcmp(cache->key, keybuf, PEL_RAX_KEY_LEN) == 0); + memcmp(cache->key, fullkey, PEL_RAX_KEY_LEN) == 0); + int is_direct = 0; void *bucket; if (cache_hit) { bucket = cache->val; + is_direct = cache->is_direct; } else { /* Switching away from previous bucket: shrink it if it was a flax. */ - if (cache->val && PEL_IS_FLAX(cache->val)) { + if (cache->val && !cache->is_direct) { flax *prev = (flax *)cache->val; size_t before = flaxAllocSize(prev); flaxShrink(prev); @@ -201,52 +205,70 @@ static int pelGenericInsert(rax *pel, streamID *id, void *data, uint64_t *count, } pelCacheFlush(pel); - /* Check rax. 
*/ + /* Look for an existing bucket: try 15-byte flax first, then + * prefix-scan for a 16-byte direct entry with any fkey. */ void *raxval; - if (raxFind(pel, keybuf, PEL_RAX_KEY_LEN, &raxval)) { + if (raxFind(pel, fullkey, PEL_RAX_KEY_LEN, &raxval)) { bucket = raxval; cache->val = bucket; - memcpy(cache->key, keybuf, PEL_RAX_KEY_LEN); + cache->is_direct = 0; + memcpy(cache->key, fullkey, PEL_RAX_KEY_LEN); } else { - bucket = NULL; + raxIterator ri; + raxStart(&ri, pel); + raxSeek(&ri, ">=", fullkey, PEL_RAX_KEY_LEN); + if (raxNext(&ri) && ri.key_len == PEL_RAX_DIRECT_KEYLEN && + memcmp(ri.key, fullkey, PEL_RAX_KEY_LEN) == 0) { + bucket = ri.data; + is_direct = 1; + cache->val = bucket; + cache->is_direct = 1; + memcpy(cache->key, ri.key, PEL_RAX_DIRECT_KEYLEN); + } else { + bucket = NULL; + } + raxStop(&ri); } } - /* No existing bucket: store as direct entry. */ + /* No existing bucket: store as direct entry with 16-byte key. */ if (!bucket) { - cache->val = PEL_DIRECT_ENCODE(data, fkey); - memcpy(cache->key, keybuf, PEL_RAX_KEY_LEN); + cache->val = data; + cache->is_direct = 1; + memcpy(cache->key, fullkey, PEL_RAX_DIRECT_KEYLEN); cache->dirty = 1; if (count) (*count)++; return 1; } /* Existing direct entry. */ - if (PEL_IS_DIRECT(bucket)) { - uint8_t efkey = PEL_DIRECT_FKEY(bucket); + if (is_direct) { + uint8_t efkey = cache->key[PEL_RAX_KEY_LEN]; if (efkey == fkey) { if (overwrite) { - void *nv = PEL_DIRECT_ENCODE(data, fkey); - cache->val = nv; + cache->val = data; if (!cache->dirty) - raxInsert(pel, keybuf, PEL_RAX_KEY_LEN, nv, NULL); + raxInsert(pel, fullkey, PEL_RAX_DIRECT_KEYLEN, data, NULL); } return 0; } /* Different fkey: promote to flax. */ - void *eptr = PEL_DIRECT_PTR(bucket); flax *f = flaxNew(); if (pel->alloc_size) *pel->alloc_size += flaxAllocSize(f); - flaxInsert(f, efkey, eptr, NULL); + flaxInsert(f, efkey, bucket, NULL); size_t before = flaxAllocSize(f); int inserted = overwrite ? 
flaxInsert(f, fkey, data, NULL) : flaxTryInsert(f, fkey, data, NULL); if (pel->alloc_size) *pel->alloc_size += flaxAllocSize(f) - before; + if (!cache->dirty) { + raxRemove(pel, cache->key, PEL_RAX_DIRECT_KEYLEN, NULL); + raxInsert(pel, fullkey, PEL_RAX_KEY_LEN, f, NULL); + } cache->val = f; - if (!cache->dirty) - raxInsert(pel, keybuf, PEL_RAX_KEY_LEN, f, NULL); + cache->is_direct = 0; + memcpy(cache->key, fullkey, PEL_RAX_KEY_LEN); if (inserted && count) (*count)++; return inserted; } @@ -278,30 +300,35 @@ int pelTryInsert(rax *pel, streamID *id, void *data, uint64_t *count) { * flax shrink side-effects. Intended for defrag, where the key is guaranteed * to exist and we must avoid allocations that would increase fragmentation. */ void pelReplace(rax *pel, streamID *id, void *data) { - unsigned char keybuf[PEL_RAX_KEY_LEN]; - pelEncodeRaxKey(keybuf, id->ms, id->seq); - uint8_t fkey = pelFlaxKey(id->seq); + unsigned char fullkey[PEL_RAX_DIRECT_KEYLEN]; + streamEncodeID(fullkey, id); + uint8_t fkey = fullkey[PEL_RAX_KEY_LEN]; pelCache *cache = (pelCache *)pel->metadata; int cache_hit = (cache->val != NULL && - memcmp(cache->key, keybuf, PEL_RAX_KEY_LEN) == 0); + memcmp(cache->key, fullkey, PEL_RAX_KEY_LEN) == 0); + int is_direct; void *bucket; if (cache_hit) { bucket = cache->val; + is_direct = cache->is_direct; } else { - int found = raxFind(pel, keybuf, PEL_RAX_KEY_LEN, &bucket); - serverAssert(found); + if (raxFind(pel, fullkey, PEL_RAX_DIRECT_KEYLEN, &bucket)) { + is_direct = 1; + } else { + int found = raxFind(pel, fullkey, PEL_RAX_KEY_LEN, &bucket); + serverAssert(found); + is_direct = 0; + } } - if (PEL_IS_DIRECT(bucket)) { - serverAssert(PEL_DIRECT_FKEY(bucket) == fkey); - void *nv = PEL_DIRECT_ENCODE(data, fkey); + if (is_direct) { if (cache_hit) { - cache->val = nv; + cache->val = data; if (!cache->dirty) - raxInsert(pel, keybuf, PEL_RAX_KEY_LEN, nv, NULL); + raxInsert(pel, fullkey, PEL_RAX_DIRECT_KEYLEN, data, NULL); } else { - raxInsert(pel, keybuf, 
PEL_RAX_KEY_LEN, nv, NULL); + raxInsert(pel, fullkey, PEL_RAX_DIRECT_KEYLEN, data, NULL); } } else { flaxInsert((flax *)bucket, fkey, data, NULL); @@ -310,23 +337,29 @@ void pelReplace(rax *pel, streamID *id, void *data) { /* Find a value by streamID. Returns 1 if found (setting *data), 0 if not. */ int pelFind(rax *pel, streamID *id, void **data) { - unsigned char keybuf[PEL_RAX_KEY_LEN]; - pelEncodeRaxKey(keybuf, id->ms, id->seq); - uint8_t fkey = pelFlaxKey(id->seq); + unsigned char fullkey[PEL_RAX_DIRECT_KEYLEN]; + streamEncodeID(fullkey, id); + uint8_t fkey = fullkey[PEL_RAX_KEY_LEN]; pelCache *cache = (pelCache *)pel->metadata; void *bucket; - if (cache->val && memcmp(cache->key, keybuf, PEL_RAX_KEY_LEN) == 0) { + if (cache->val && memcmp(cache->key, fullkey, PEL_RAX_KEY_LEN) == 0) { + if (cache->is_direct) { + if (cache->key[PEL_RAX_KEY_LEN] != fkey) return 0; + if (data) *data = cache->val; + return 1; + } bucket = cache->val; } else { pelCacheFlush(pel); - if (!raxFind(pel, keybuf, PEL_RAX_KEY_LEN, &bucket)) return 0; - } - - if (PEL_IS_DIRECT(bucket)) { - if (PEL_DIRECT_FKEY(bucket) != fkey) return 0; - if (data) *data = PEL_DIRECT_PTR(bucket); - return 1; + /* Try 16-byte key (direct entry) — exact match guarantees fkey. */ + if (raxFind(pel, fullkey, PEL_RAX_DIRECT_KEYLEN, &bucket)) { + if (data) *data = bucket; + return 1; + } + /* Try 15-byte key (flax bucket). */ + if (!raxFind(pel, fullkey, PEL_RAX_KEY_LEN, &bucket)) + return 0; } void *val; @@ -338,31 +371,40 @@ int pelFind(rax *pel, streamID *id, void **data) { /* Remove a value by streamID. Returns the removed value or NULL. * When a flax bucket drops to 1 entry, it is demoted to a direct entry. 
*/ void *pelRemove(rax *pel, streamID *id, uint64_t *count) { - unsigned char keybuf[PEL_RAX_KEY_LEN]; - pelEncodeRaxKey(keybuf, id->ms, id->seq); - uint8_t fkey = pelFlaxKey(id->seq); + unsigned char fullkey[PEL_RAX_DIRECT_KEYLEN]; + streamEncodeID(fullkey, id); + uint8_t fkey = fullkey[PEL_RAX_KEY_LEN]; pelCache *cache = (pelCache *)pel->metadata; int cache_hit = (cache->val != NULL && - memcmp(cache->key, keybuf, PEL_RAX_KEY_LEN) == 0); + memcmp(cache->key, fullkey, PEL_RAX_KEY_LEN) == 0); + int is_direct; void *bucket; if (cache_hit) { bucket = cache->val; + is_direct = cache->is_direct; } else { pelCacheFlush(pel); - if (!raxFind(pel, keybuf, PEL_RAX_KEY_LEN, &bucket)) return NULL; + if (raxFind(pel, fullkey, PEL_RAX_DIRECT_KEYLEN, &bucket)) { + is_direct = 1; + } else if (raxFind(pel, fullkey, PEL_RAX_KEY_LEN, &bucket)) { + is_direct = 0; + } else { + return NULL; + } } /* Direct entry. */ - if (PEL_IS_DIRECT(bucket)) { - if (PEL_DIRECT_FKEY(bucket) != fkey) return NULL; - void *old = PEL_DIRECT_PTR(bucket); + if (is_direct) { + if (cache_hit && cache->key[PEL_RAX_KEY_LEN] != fkey) return NULL; + void *old = bucket; if (count) (*count)--; if (cache_hit && cache->dirty) { cache->val = NULL; cache->dirty = 0; + cache->is_direct = 0; } else { - raxRemove(pel, keybuf, PEL_RAX_KEY_LEN, NULL); + raxRemove(pel, fullkey, PEL_RAX_DIRECT_KEYLEN, NULL); pelCacheInvalidate(pel); } return old; @@ -380,25 +422,34 @@ void *pelRemove(rax *pel, streamID *id, uint64_t *count) { if (cache_hit && cache->dirty) { cache->val = NULL; cache->dirty = 0; + cache->is_direct = 0; } else { - raxRemove(pel, keybuf, PEL_RAX_KEY_LEN, NULL); + raxRemove(pel, fullkey, PEL_RAX_KEY_LEN, NULL); pelCacheInvalidate(pel); } } else if (f->numele == 1) { - /* Demote to direct entry. */ + /* Demote to direct entry with 16-byte key. 
*/ flaxIterator fi; flaxStart(&fi, f); flaxSeek(&fi, "^", 0); - void *direct = PEL_DIRECT_ENCODE(fi.data, fi.key); + void *directval = fi.data; + unsigned char directkey[PEL_RAX_DIRECT_KEYLEN]; + memcpy(directkey, fullkey, PEL_RAX_KEY_LEN); + directkey[PEL_RAX_KEY_LEN] = (unsigned char)fi.key; flaxStop(&fi); if (pel->alloc_size) *pel->alloc_size -= flaxAllocSize(f); flaxFree(f); if (cache_hit) { - cache->val = direct; - if (!cache->dirty) - raxInsert(pel, keybuf, PEL_RAX_KEY_LEN, direct, NULL); + cache->val = directval; + cache->is_direct = 1; + memcpy(cache->key, directkey, PEL_RAX_DIRECT_KEYLEN); + if (!cache->dirty) { + raxRemove(pel, fullkey, PEL_RAX_KEY_LEN, NULL); + raxInsert(pel, directkey, PEL_RAX_DIRECT_KEYLEN, directval, NULL); + } } else { - raxInsert(pel, keybuf, PEL_RAX_KEY_LEN, direct, NULL); + raxRemove(pel, fullkey, PEL_RAX_KEY_LEN, NULL); + raxInsert(pel, directkey, PEL_RAX_DIRECT_KEYLEN, directval, NULL); } } @@ -417,13 +468,12 @@ static void pelIterRefresh(pelIterator *pi) { pi->is_direct = 0; } -/* Refresh iterator fields from a direct entry in the current rax node. */ +/* Refresh iterator fields from a direct entry in the current rax node. + * Direct entries have a 16-byte rax key (full streamID). */ static void pelIterRefreshDirect(pelIterator *pi) { - void *bucket = pi->ri.data; - memcpy(pi->key, pi->ri.key, PEL_RAX_KEY_LEN); - pi->key[PEL_RAX_KEY_LEN] = PEL_DIRECT_FKEY(bucket); + memcpy(pi->key, pi->ri.key, PEL_RAX_DIRECT_KEYLEN); streamDecodeID(pi->key, &pi->id); - pi->data = PEL_DIRECT_PTR(bucket); + pi->data = pi->ri.data; pi->valid = 1; pi->is_direct = 1; } @@ -432,12 +482,11 @@ static void pelIterRefreshDirect(pelIterator *pi) { * rax position (calling raxNext first). Returns 1 on success, 0 if no more. 
*/ static int pelIterAdvanceRax(pelIterator *pi) { while (raxNext(&pi->ri)) { - void *bucket = pi->ri.data; - if (PEL_IS_DIRECT(bucket)) { + if (pi->ri.key_len == PEL_RAX_DIRECT_KEYLEN) { pelIterRefreshDirect(pi); return 1; } - flaxStart(&pi->fi, (flax *)bucket); + flaxStart(&pi->fi, (flax *)pi->ri.data); if (flaxSeek(&pi->fi, "^", 0)) { pelIterRefresh(pi); return 1; @@ -450,12 +499,11 @@ static int pelIterAdvanceRax(pelIterator *pi) { /* Position the iterator on the first entry of the current rax node's bucket. * Returns 1 on success, 0 if bucket is empty (should not happen). */ static int pelIterEnterBucketHead(pelIterator *pi) { - void *bucket = pi->ri.data; - if (PEL_IS_DIRECT(bucket)) { + if (pi->ri.key_len == PEL_RAX_DIRECT_KEYLEN) { pelIterRefreshDirect(pi); return 1; } - flaxStart(&pi->fi, (flax *)bucket); + flaxStart(&pi->fi, (flax *)pi->ri.data); if (flaxSeek(&pi->fi, "^", 0)) { pelIterRefresh(pi); return 1; @@ -466,12 +514,11 @@ static int pelIterEnterBucketHead(pelIterator *pi) { /* Position the iterator on the last entry of the current rax node's bucket. * Returns 1 on success, 0 if bucket is empty. 
*/ static int pelIterEnterBucketTail(pelIterator *pi) { - void *bucket = pi->ri.data; - if (PEL_IS_DIRECT(bucket)) { + if (pi->ri.key_len == PEL_RAX_DIRECT_KEYLEN) { pelIterRefreshDirect(pi); return 1; } - flaxStart(&pi->fi, (flax *)bucket); + flaxStart(&pi->fi, (flax *)pi->ri.data); if (flaxSeek(&pi->fi, "$", 0)) { pelIterRefresh(pi); return 1; @@ -506,11 +553,11 @@ int pelIterSeek(pelIterator *pi, const char *op, streamID *id) { pi->just_seeked = 1; return 1; } else if (op[0] == '>' && op[1] == '=') { - unsigned char keybuf[PEL_RAX_KEY_LEN]; - pelEncodeRaxKey(keybuf, id->ms, id->seq); - uint8_t fkey = pelFlaxKey(id->seq); + unsigned char fullkey[PEL_RAX_DIRECT_KEYLEN]; + streamEncodeID(fullkey, id); + uint8_t fkey = fullkey[PEL_RAX_KEY_LEN]; - raxSeek(&pi->ri, "<=", keybuf, PEL_RAX_KEY_LEN); + raxSeek(&pi->ri, "<=", fullkey, PEL_RAX_DIRECT_KEYLEN); if (!raxNext(&pi->ri)) { raxSeek(&pi->ri, "^", NULL, 0); if (!raxNext(&pi->ri)) return 0; @@ -519,25 +566,24 @@ int pelIterSeek(pelIterator *pi, const char *op, streamID *id) { return 1; } - int cmp = memcmp(pi->ri.key, keybuf, PEL_RAX_KEY_LEN); - void *bucket = pi->ri.data; + int prefix_cmp = memcmp(pi->ri.key, fullkey, PEL_RAX_KEY_LEN); - if (cmp == 0) { - if (PEL_IS_DIRECT(bucket)) { - if (PEL_DIRECT_FKEY(bucket) >= fkey) { + if (prefix_cmp == 0) { + if (pi->ri.key_len == PEL_RAX_DIRECT_KEYLEN) { + if (pi->ri.key[PEL_RAX_KEY_LEN] >= fkey) { pelIterRefreshDirect(pi); } else { if (!pelIterAdvanceRax(pi)) return 0; } } else { - flaxStart(&pi->fi, (flax *)bucket); + flaxStart(&pi->fi, (flax *)pi->ri.data); if (!flaxSeek(&pi->fi, ">=", fkey)) { if (!pelIterAdvanceRax(pi)) return 0; } else { pelIterRefresh(pi); } } - } else if (cmp < 0) { + } else if (prefix_cmp < 0) { if (!pelIterAdvanceRax(pi)) return 0; } else { if (!pelIterEnterBucketHead(pi)) { From 4a861d639e4c4bf5593c47a432062388246e31db Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Mon, 13 Apr 2026 17:39:22 +0300 Subject: [PATCH 36/48] fixed: issue from 
review --- src/stream.h | 4 ++-- src/t_stream.c | 65 +++++++++++++++++++++++++------------------------- 2 files changed, 34 insertions(+), 35 deletions(-) diff --git a/src/stream.h b/src/stream.h index b87f75ae3..90ec3309f 100644 --- a/src/stream.h +++ b/src/stream.h @@ -220,13 +220,13 @@ listNode *streamLinkCGroupToEntry(stream *s, streamCG *cg, streamID *id); /* Two-level PEL iterator: walks outer rax and inner flax. * Direct entries (16-byte rax key) have no flax allocation; - * is_direct tracks whether the current entry is direct. */ + * direct tracks whether the current entry is direct. */ typedef struct pelIterator { raxIterator ri; flaxIterator fi; int valid; int just_seeked; - int is_direct; + int direct; streamID id; void *data; unsigned char key[sizeof(streamID)]; diff --git a/src/t_stream.c b/src/t_stream.c index 788277b36..4d8507790 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -81,19 +81,19 @@ void streamEncodeID(void *buf, streamID *id); * when consecutive operations target the same ID prefix. * When 'dirty' is set, the cached value has been created/updated but not yet * inserted into the rax; it will be committed on the next cache eviction - * or explicit flush. 'is_direct' mirrors the key-length convention: + * or explicit flush. 'direct' mirrors the key-length convention: * 1 = direct (16-byte key, raw data pointer), 0 = flax (15-byte key). */ typedef struct pelCache { unsigned char key[PEL_RAX_DIRECT_KEYLEN]; void *val; int dirty; - int is_direct; + int direct; } pelCache; static void pelCacheFlush(rax *r) { pelCache *cache = (pelCache *)r->metadata; if (!cache->dirty) return; - size_t keylen = cache->is_direct ? PEL_RAX_DIRECT_KEYLEN : PEL_RAX_KEY_LEN; + size_t keylen = cache->direct ? 
PEL_RAX_DIRECT_KEYLEN : PEL_RAX_KEY_LEN; raxInsert(r, cache->key, keylen, cache->val, NULL); cache->dirty = 0; } @@ -103,7 +103,7 @@ void pelCacheInvalidate(rax *pel) { pelCache *cache = (pelCache *)pel->metadata; cache->val = NULL; cache->dirty = 0; - cache->is_direct = 0; + cache->direct = 0; } /* Encode a 15-byte rax key: full 8B big-endian ms + upper 7 bytes of @@ -127,7 +127,7 @@ rax *pelNew(size_t *alloc_size) { pelCache *cache = (pelCache *)pel->metadata; cache->val = NULL; cache->dirty = 0; - cache->is_direct = 0; + cache->direct = 0; } return pel; } @@ -190,14 +190,14 @@ static int pelGenericInsert(rax *pel, streamID *id, void *data, uint64_t *count, int cache_hit = (cache->val != NULL && memcmp(cache->key, fullkey, PEL_RAX_KEY_LEN) == 0); - int is_direct = 0; + int direct = 0; void *bucket; if (cache_hit) { bucket = cache->val; - is_direct = cache->is_direct; + direct = cache->direct; } else { /* Switching away from previous bucket: shrink it if it was a flax. */ - if (cache->val && !cache->is_direct) { + if (cache->val && !cache->direct) { flax *prev = (flax *)cache->val; size_t before = flaxAllocSize(prev); flaxShrink(prev); @@ -211,7 +211,7 @@ static int pelGenericInsert(rax *pel, streamID *id, void *data, uint64_t *count, if (raxFind(pel, fullkey, PEL_RAX_KEY_LEN, &raxval)) { bucket = raxval; cache->val = bucket; - cache->is_direct = 0; + cache->direct = 0; memcpy(cache->key, fullkey, PEL_RAX_KEY_LEN); } else { raxIterator ri; @@ -220,9 +220,9 @@ static int pelGenericInsert(rax *pel, streamID *id, void *data, uint64_t *count, if (raxNext(&ri) && ri.key_len == PEL_RAX_DIRECT_KEYLEN && memcmp(ri.key, fullkey, PEL_RAX_KEY_LEN) == 0) { bucket = ri.data; - is_direct = 1; + direct = 1; cache->val = bucket; - cache->is_direct = 1; + cache->direct = 1; memcpy(cache->key, ri.key, PEL_RAX_DIRECT_KEYLEN); } else { bucket = NULL; @@ -234,7 +234,7 @@ static int pelGenericInsert(rax *pel, streamID *id, void *data, uint64_t *count, /* No existing bucket: store as 
direct entry with 16-byte key. */ if (!bucket) { cache->val = data; - cache->is_direct = 1; + cache->direct = 1; memcpy(cache->key, fullkey, PEL_RAX_DIRECT_KEYLEN); cache->dirty = 1; if (count) (*count)++; @@ -242,9 +242,8 @@ static int pelGenericInsert(rax *pel, streamID *id, void *data, uint64_t *count, } /* Existing direct entry. */ - if (is_direct) { + if (direct) { uint8_t efkey = cache->key[PEL_RAX_KEY_LEN]; - if (efkey == fkey) { if (overwrite) { cache->val = data; @@ -267,7 +266,7 @@ static int pelGenericInsert(rax *pel, streamID *id, void *data, uint64_t *count, raxInsert(pel, fullkey, PEL_RAX_KEY_LEN, f, NULL); } cache->val = f; - cache->is_direct = 0; + cache->direct = 0; memcpy(cache->key, fullkey, PEL_RAX_KEY_LEN); if (inserted && count) (*count)++; return inserted; @@ -307,22 +306,22 @@ void pelReplace(rax *pel, streamID *id, void *data) { int cache_hit = (cache->val != NULL && memcmp(cache->key, fullkey, PEL_RAX_KEY_LEN) == 0); - int is_direct; + int direct; void *bucket; if (cache_hit) { bucket = cache->val; - is_direct = cache->is_direct; + direct = cache->direct; } else { if (raxFind(pel, fullkey, PEL_RAX_DIRECT_KEYLEN, &bucket)) { - is_direct = 1; + direct = 1; } else { int found = raxFind(pel, fullkey, PEL_RAX_KEY_LEN, &bucket); serverAssert(found); - is_direct = 0; + direct = 0; } } - if (is_direct) { + if (direct) { if (cache_hit) { cache->val = data; if (!cache->dirty) @@ -344,7 +343,7 @@ int pelFind(rax *pel, streamID *id, void **data) { void *bucket; if (cache->val && memcmp(cache->key, fullkey, PEL_RAX_KEY_LEN) == 0) { - if (cache->is_direct) { + if (cache->direct) { if (cache->key[PEL_RAX_KEY_LEN] != fkey) return 0; if (data) *data = cache->val; return 1; @@ -378,31 +377,31 @@ void *pelRemove(rax *pel, streamID *id, uint64_t *count) { int cache_hit = (cache->val != NULL && memcmp(cache->key, fullkey, PEL_RAX_KEY_LEN) == 0); - int is_direct; + int direct; void *bucket; if (cache_hit) { bucket = cache->val; - is_direct = cache->is_direct; + 
direct = cache->direct; } else { pelCacheFlush(pel); if (raxFind(pel, fullkey, PEL_RAX_DIRECT_KEYLEN, &bucket)) { - is_direct = 1; + direct = 1; } else if (raxFind(pel, fullkey, PEL_RAX_KEY_LEN, &bucket)) { - is_direct = 0; + direct = 0; } else { return NULL; } } /* Direct entry. */ - if (is_direct) { + if (direct) { if (cache_hit && cache->key[PEL_RAX_KEY_LEN] != fkey) return NULL; void *old = bucket; if (count) (*count)--; if (cache_hit && cache->dirty) { cache->val = NULL; cache->dirty = 0; - cache->is_direct = 0; + cache->direct = 0; } else { raxRemove(pel, fullkey, PEL_RAX_DIRECT_KEYLEN, NULL); pelCacheInvalidate(pel); @@ -422,7 +421,7 @@ void *pelRemove(rax *pel, streamID *id, uint64_t *count) { if (cache_hit && cache->dirty) { cache->val = NULL; cache->dirty = 0; - cache->is_direct = 0; + cache->direct = 0; } else { raxRemove(pel, fullkey, PEL_RAX_KEY_LEN, NULL); pelCacheInvalidate(pel); @@ -441,7 +440,7 @@ void *pelRemove(rax *pel, streamID *id, uint64_t *count) { flaxFree(f); if (cache_hit) { cache->val = directval; - cache->is_direct = 1; + cache->direct = 1; memcpy(cache->key, directkey, PEL_RAX_DIRECT_KEYLEN); if (!cache->dirty) { raxRemove(pel, fullkey, PEL_RAX_KEY_LEN, NULL); @@ -465,7 +464,7 @@ static void pelIterRefresh(pelIterator *pi) { streamDecodeID(pi->key, &pi->id); pi->data = pi->fi.data; pi->valid = 1; - pi->is_direct = 0; + pi->direct = 0; } /* Refresh iterator fields from a direct entry in the current rax node. 
@@ -475,7 +474,7 @@ static void pelIterRefreshDirect(pelIterator *pi) { streamDecodeID(pi->key, &pi->id); pi->data = pi->ri.data; pi->valid = 1; - pi->is_direct = 1; + pi->direct = 1; } /* Position the iterator on the first valid entry starting from the current @@ -531,7 +530,7 @@ void pelIterStart(pelIterator *pi, rax *pel) { raxStart(&pi->ri, pel); pi->valid = 0; pi->just_seeked = 0; - pi->is_direct = 0; + pi->direct = 0; memset(&pi->fi, 0, sizeof(pi->fi)); memset(&pi->id, 0, sizeof(pi->id)); pi->data = NULL; @@ -603,7 +602,7 @@ int pelIterNext(pelIterator *pi) { } if (!pi->valid) return 0; - if (!pi->is_direct && flaxNext(&pi->fi)) { + if (!pi->direct && flaxNext(&pi->fi)) { pelIterRefresh(pi); return 1; } From 0fc7a522427d6e286cee143019e837d1d6d0ebae Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Mon, 13 Apr 2026 19:02:37 +0300 Subject: [PATCH 37/48] fixed: issue from review --- src/defrag.c | 74 +++++++++++++++++++++++++++++++++----------------- src/t_stream.c | 15 ---------- 2 files changed, 49 insertions(+), 40 deletions(-) diff --git a/src/defrag.c b/src/defrag.c index d3a750a4b..615369b55 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -856,12 +856,19 @@ void defragRadixTree(rax **raxref, int defrag_data, raxDefragFunction *element_c raxStop(&ri); } -/* Walk a consumer-PEL flax bucket and fix up the consumer back-pointer for +/* Walk a consumer-PEL entry and fix up the consumer back-pointer for * every NACK. NACKs themselves are defragged during the group PEL walk - * (defragStreamGroupPelFlax) which also covers unowned NACK-zone entries; + * (defragStreamGroupPelEntry) which also covers unowned NACK-zone entries; * here we only repair the stale consumer pointer. 
*/ -void* defragStreamConsumerPelFlax(raxIterator *ri, void *privdata) { +void* defragStreamConsumerPelEntry(raxIterator *ri, void *privdata) { streamConsumer *c = privdata; + + if (ri->key_len == PEL_RAX_DIRECT_KEYLEN) { + streamNACK *nack = ri->data; + nack->consumer = c; + return NULL; + } + flax *f = ri->data; flax *newflax = activeDefragAlloc(f); if (newflax) f = newflax; @@ -891,18 +898,51 @@ void* defragStreamConsumer(raxIterator *ri, void *privdata) { c->name = newsds; if (c->pel) { c->pel->alloc_size = &s->alloc_size; - defragRadixTree(&c->pel, 0, defragStreamConsumerPelFlax, c); + defragRadixTree(&c->pel, 0, defragStreamConsumerPelEntry, c); pelCacheInvalidate(c->pel); } return newc; /* returns NULL if c was not defragged */ } -/* Walk a group-PEL flax bucket: defrag the flax struct itself, then defrag - * every NACK inside and update pointers in the consumer PEL, the doubly-linked +/* Defrag a single NACK and update all cross-references: the consumer PEL, + * the doubly-linked time list, the NACK-zone tail, and cgroup_ref_node. */ +static void defragStreamNack(streamCG *cg, streamNACK *nack, streamNACK *newnack) { + if (newnack->consumer) { + pelReplace(newnack->consumer->pel, &newnack->id, newnack); + } + if (newnack->pel_prev) { + newnack->pel_prev->pel_next = newnack; + } else { + cg->pel_time_head = newnack; + } + if (newnack->pel_next) { + newnack->pel_next->pel_prev = newnack; + } else { + cg->pel_time_tail = newnack; + } + if (cg->pel_nack_tail == nack) { + cg->pel_nack_tail = newnack; + } +} + +/* Walk a group-PEL entry: for direct entries (16-byte key) defrag the single + * NACK; for flax buckets (15-byte key) defrag the flax struct itself and then + * every NACK inside. Update pointers in the consumer PEL, the doubly-linked * time list, and the NACK-zone tail. cgroup_ref_node->value is also updated * here for every NACK (including unowned NACK-zone entries). 
*/ -void* defragStreamGroupPelFlax(raxIterator *ri, void *privdata) { +void* defragStreamGroupPelEntry(raxIterator *ri, void *privdata) { streamCG *cg = privdata; + + if (ri->key_len == PEL_RAX_DIRECT_KEYLEN) { + streamNACK *nack = ri->data, *newnack; + nack->cgroup_ref_node->value = cg; + newnack = activeDefragAlloc(nack); + if (newnack) { + defragStreamNack(cg, nack, newnack); + } + return newnack; + } + flax *f = ri->data; flax *newf = activeDefragAlloc(f); if (newf) f = newf; @@ -918,23 +958,7 @@ void* defragStreamGroupPelFlax(raxIterator *ri, void *privdata) { newnack = activeDefragAlloc(nack); if (newnack) { flaxInsert(f, fi.key, newnack, NULL); - - if (newnack->consumer) { - pelReplace(newnack->consumer->pel, &newnack->id, newnack); - } - if (newnack->pel_prev) { - newnack->pel_prev->pel_next = newnack; - } else { - cg->pel_time_head = newnack; - } - if (newnack->pel_next) { - newnack->pel_next->pel_prev = newnack; - } else { - cg->pel_time_tail = newnack; - } - if (cg->pel_nack_tail == nack) { - cg->pel_nack_tail = newnack; - } + defragStreamNack(cg, nack, newnack); } } while (flaxNext(&fi)); } @@ -948,7 +972,7 @@ void* defragStreamConsumerGroup(raxIterator *ri, void *privdata) { cg = newcg; if (cg->pel) { cg->pel->alloc_size = &s->alloc_size; - defragRadixTree(&cg->pel, 0, defragStreamGroupPelFlax, cg); + defragRadixTree(&cg->pel, 0, defragStreamGroupPelEntry, cg); pelCacheInvalidate(cg->pel); } if (cg->consumers) { diff --git a/src/t_stream.c b/src/t_stream.c index 4d8507790..33836a1f8 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -106,21 +106,6 @@ void pelCacheInvalidate(rax *pel) { cache->direct = 0; } -/* Encode a 15-byte rax key: full 8B big-endian ms + upper 7 bytes of - * big-endian seq (i.e. first 15 bytes of the 16-byte encoded streamID). 
*/ -static inline void pelEncodeRaxKey(unsigned char *buf, uint64_t ms, uint64_t seq) { - uint64_t be; - be = htonu64(ms); - memcpy(buf, &be, 8); - be = htonu64(seq); - memcpy(buf + 8, &be, 7); -} - -/* Extract the low byte of seq as the flax key. */ -static inline uint8_t pelFlaxKey(uint64_t seq) { - return (uint8_t)(seq & 0xFF); -} - rax *pelNew(size_t *alloc_size) { rax *pel = raxNewWithMetadata(sizeof(pelCache), alloc_size); if (pel) { From 40bca5e495934b502753884fb357a2e924b45abc Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Tue, 14 Apr 2026 09:16:41 +0300 Subject: [PATCH 38/48] fixed: issue from review --- src/defrag.c | 4 ++-- src/t_stream.c | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/defrag.c b/src/defrag.c index 615369b55..8929dd072 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -905,7 +905,7 @@ void* defragStreamConsumer(raxIterator *ri, void *privdata) { } /* Defrag a single NACK and update all cross-references: the consumer PEL, - * the doubly-linked time list, the NACK-zone tail, and cgroup_ref_node. */ + * the doubly-linked time list, and the NACK-zone tail. */ static void defragStreamNack(streamCG *cg, streamNACK *nack, streamNACK *newnack) { if (newnack->consumer) { pelReplace(newnack->consumer->pel, &newnack->id, newnack); @@ -979,7 +979,7 @@ void* defragStreamConsumerGroup(raxIterator *ri, void *privdata) { cg->consumers->alloc_size = &s->alloc_size; defragRadixTree(&cg->consumers, 0, defragStreamConsumer, s); } - return cg; + return newcg; } /* Defrag a single idmpProducer's dict and linked list entries. 
*/ diff --git a/src/t_stream.c b/src/t_stream.c index 33836a1f8..b8a4a9a75 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -522,6 +522,7 @@ void pelIterStart(pelIterator *pi, rax *pel) { } int pelIterSeek(pelIterator *pi, const char *op, streamID *id) { + pelCacheFlush(pi->ri.rt); pi->valid = 0; pi->just_seeked = 0; if (op[0] == '^') { From 219f9a4f3af2d9bb7375ce55d609e52f47c9696f Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Tue, 14 Apr 2026 10:16:45 +0300 Subject: [PATCH 39/48] fixed: issue from review --- src/defrag.c | 17 ++++++----- src/rdb.c | 2 +- src/t_stream.c | 79 +++++++++++++++++++++++--------------------------- 3 files changed, 47 insertions(+), 51 deletions(-) diff --git a/src/defrag.c b/src/defrag.c index 8929dd072..5290d8459 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -858,7 +858,7 @@ void defragRadixTree(rax **raxref, int defrag_data, raxDefragFunction *element_c /* Walk a consumer-PEL entry and fix up the consumer back-pointer for * every NACK. NACKs themselves are defragged during the group PEL walk - * (defragStreamGroupPelEntry) which also covers unowned NACK-zone entries; + * (defragStreamCGPendingEntry) which also covers unowned NACK-zone entries; * here we only repair the stale consumer pointer. 
*/ void* defragStreamConsumerPelEntry(raxIterator *ri, void *privdata) { streamConsumer *c = privdata; @@ -898,15 +898,16 @@ void* defragStreamConsumer(raxIterator *ri, void *privdata) { c->name = newsds; if (c->pel) { c->pel->alloc_size = &s->alloc_size; - defragRadixTree(&c->pel, 0, defragStreamConsumerPelEntry, c); pelCacheInvalidate(c->pel); + defragRadixTree(&c->pel, 0, defragStreamConsumerPelEntry, c); } return newc; /* returns NULL if c was not defragged */ } -/* Defrag a single NACK and update all cross-references: the consumer PEL, +/* After a NACK has been relocated by activeDefragAlloc(), fix up all + * cross-references that still point to the old address: the consumer PEL, * the doubly-linked time list, and the NACK-zone tail. */ -static void defragStreamNack(streamCG *cg, streamNACK *nack, streamNACK *newnack) { +static void relinkStreamNack(streamCG *cg, streamNACK *nack, streamNACK *newnack) { if (newnack->consumer) { pelReplace(newnack->consumer->pel, &newnack->id, newnack); } @@ -930,7 +931,7 @@ static void defragStreamNack(streamCG *cg, streamNACK *nack, streamNACK *newnack * every NACK inside. Update pointers in the consumer PEL, the doubly-linked * time list, and the NACK-zone tail. cgroup_ref_node->value is also updated * here for every NACK (including unowned NACK-zone entries). 
*/ -void* defragStreamGroupPelEntry(raxIterator *ri, void *privdata) { +void* defragStreamCGPendingEntry(raxIterator *ri, void *privdata) { streamCG *cg = privdata; if (ri->key_len == PEL_RAX_DIRECT_KEYLEN) { @@ -938,7 +939,7 @@ void* defragStreamGroupPelEntry(raxIterator *ri, void *privdata) { nack->cgroup_ref_node->value = cg; newnack = activeDefragAlloc(nack); if (newnack) { - defragStreamNack(cg, nack, newnack); + relinkStreamNack(cg, nack, newnack); } return newnack; } @@ -958,7 +959,7 @@ void* defragStreamGroupPelEntry(raxIterator *ri, void *privdata) { newnack = activeDefragAlloc(nack); if (newnack) { flaxInsert(f, fi.key, newnack, NULL); - defragStreamNack(cg, nack, newnack); + relinkStreamNack(cg, nack, newnack); } } while (flaxNext(&fi)); } @@ -972,8 +973,8 @@ void* defragStreamConsumerGroup(raxIterator *ri, void *privdata) { cg = newcg; if (cg->pel) { cg->pel->alloc_size = &s->alloc_size; - defragRadixTree(&cg->pel, 0, defragStreamGroupPelEntry, cg); pelCacheInvalidate(cg->pel); + defragRadixTree(&cg->pel, 0, defragStreamCGPendingEntry, cg); } if (cg->consumers) { cg->consumers->alloc_size = &s->alloc_size; diff --git a/src/rdb.c b/src/rdb.c index b6fe88f13..e880d20be 100644 --- a/src/rdb.c +++ b/src/rdb.c @@ -3398,11 +3398,11 @@ robj *rdbLoadObject(int rdbtype, rio *rdb, sds key, int dbid, int *error) decrRefCount(o); return NULL; } + streamNACK *nack = result; /* Set the NACK consumer, that was left to NULL when * loading the global PEL. Then set the same shared * NACK structure also in the consumer-specific PEL. 
*/ - streamNACK *nack = result; nack->consumer = consumer; if (!pelTryInsert(consumer->pel,&nack_id,nack,&consumer->pel_count)) { rdbReportCorruptRDB("Duplicated consumer PEL entry " diff --git a/src/t_stream.c b/src/t_stream.c index b8a4a9a75..777129279 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -70,18 +70,16 @@ void streamEncodeID(void *buf, streamID *id); * big-endian seq) and store a flax* mapping the low byte of seq to data * pointers. Single-entry buckets ("direct entries") use a 16-byte rax * key (the complete big-endian streamID) and store the raw data pointer - * directly in the rax value, avoiding the flax allocation. The key + * directly in the rax value, avoiding the flax allocation. The key * length in rax (16 vs 15) distinguishes the two bucket types. * ----------------------------------------------------------------------- */ -#define PEL_RAX_KEY_LEN PEL_RAX_FLAX_KEYLEN - /* Cache embedded in rax metadata to speed up sequential PEL ops * when consecutive operations target the same ID prefix. * When 'dirty' is set, the cached value has been created/updated but not yet * inserted into the rax; it will be committed on the next cache eviction - * or explicit flush. 'direct' mirrors the key-length convention: + * or explicit flush. 'direct' mirrors the key-length convention: * 1 = direct (16-byte key, raw data pointer), 0 = flax (15-byte key). */ typedef struct pelCache { unsigned char key[PEL_RAX_DIRECT_KEYLEN]; @@ -93,7 +91,7 @@ typedef struct pelCache { static void pelCacheFlush(rax *r) { pelCache *cache = (pelCache *)r->metadata; if (!cache->dirty) return; - size_t keylen = cache->direct ? PEL_RAX_DIRECT_KEYLEN : PEL_RAX_KEY_LEN; + size_t keylen = cache->direct ? 
PEL_RAX_DIRECT_KEYLEN : PEL_RAX_FLAX_KEYLEN; raxInsert(r, cache->key, keylen, cache->val, NULL); cache->dirty = 0; } @@ -168,15 +166,15 @@ void pelFreeShallow(rax *pel) { static int pelGenericInsert(rax *pel, streamID *id, void *data, uint64_t *count, int overwrite) { unsigned char fullkey[PEL_RAX_DIRECT_KEYLEN]; streamEncodeID(fullkey, id); - uint8_t fkey = fullkey[PEL_RAX_KEY_LEN]; + uint8_t fkey = fullkey[PEL_RAX_FLAX_KEYLEN]; pelCache *cache = (pelCache *)pel->metadata; /* Cache lookup: compare the 15-byte prefix. */ int cache_hit = (cache->val != NULL && - memcmp(cache->key, fullkey, PEL_RAX_KEY_LEN) == 0); + memcmp(cache->key, fullkey, PEL_RAX_FLAX_KEYLEN) == 0); int direct = 0; - void *bucket; + void *bucket = NULL; if (cache_hit) { bucket = cache->val; direct = cache->direct; @@ -193,24 +191,22 @@ static int pelGenericInsert(rax *pel, streamID *id, void *data, uint64_t *count, /* Look for an existing bucket: try 15-byte flax first, then * prefix-scan for a 16-byte direct entry with any fkey. */ void *raxval; - if (raxFind(pel, fullkey, PEL_RAX_KEY_LEN, &raxval)) { + if (raxFind(pel, fullkey, PEL_RAX_FLAX_KEYLEN, &raxval)) { bucket = raxval; cache->val = bucket; cache->direct = 0; - memcpy(cache->key, fullkey, PEL_RAX_KEY_LEN); + memcpy(cache->key, fullkey, PEL_RAX_FLAX_KEYLEN); } else { raxIterator ri; raxStart(&ri, pel); - raxSeek(&ri, ">=", fullkey, PEL_RAX_KEY_LEN); + raxSeek(&ri, ">=", fullkey, PEL_RAX_FLAX_KEYLEN); if (raxNext(&ri) && ri.key_len == PEL_RAX_DIRECT_KEYLEN && - memcmp(ri.key, fullkey, PEL_RAX_KEY_LEN) == 0) { + memcmp(ri.key, fullkey, PEL_RAX_FLAX_KEYLEN) == 0) { bucket = ri.data; direct = 1; cache->val = bucket; cache->direct = 1; memcpy(cache->key, ri.key, PEL_RAX_DIRECT_KEYLEN); - } else { - bucket = NULL; } raxStop(&ri); } @@ -228,7 +224,7 @@ static int pelGenericInsert(rax *pel, streamID *id, void *data, uint64_t *count, /* Existing direct entry. 
*/ if (direct) { - uint8_t efkey = cache->key[PEL_RAX_KEY_LEN]; + uint8_t efkey = cache->key[PEL_RAX_FLAX_KEYLEN]; if (efkey == fkey) { if (overwrite) { cache->val = data; @@ -243,18 +239,17 @@ static int pelGenericInsert(rax *pel, streamID *id, void *data, uint64_t *count, if (pel->alloc_size) *pel->alloc_size += flaxAllocSize(f); flaxInsert(f, efkey, bucket, NULL); size_t before = flaxAllocSize(f); - int inserted = overwrite ? flaxInsert(f, fkey, data, NULL) - : flaxTryInsert(f, fkey, data, NULL); + flaxInsert(f, fkey, data, NULL); if (pel->alloc_size) *pel->alloc_size += flaxAllocSize(f) - before; if (!cache->dirty) { raxRemove(pel, cache->key, PEL_RAX_DIRECT_KEYLEN, NULL); - raxInsert(pel, fullkey, PEL_RAX_KEY_LEN, f, NULL); + raxInsert(pel, fullkey, PEL_RAX_FLAX_KEYLEN, f, NULL); } cache->val = f; cache->direct = 0; - memcpy(cache->key, fullkey, PEL_RAX_KEY_LEN); - if (inserted && count) (*count)++; - return inserted; + memcpy(cache->key, fullkey, PEL_RAX_FLAX_KEYLEN); + if (count) (*count)++; + return 1; } /* Existing flax bucket. 
*/ @@ -286,11 +281,11 @@ int pelTryInsert(rax *pel, streamID *id, void *data, uint64_t *count) { void pelReplace(rax *pel, streamID *id, void *data) { unsigned char fullkey[PEL_RAX_DIRECT_KEYLEN]; streamEncodeID(fullkey, id); - uint8_t fkey = fullkey[PEL_RAX_KEY_LEN]; + uint8_t fkey = fullkey[PEL_RAX_FLAX_KEYLEN]; pelCache *cache = (pelCache *)pel->metadata; int cache_hit = (cache->val != NULL && - memcmp(cache->key, fullkey, PEL_RAX_KEY_LEN) == 0); + memcmp(cache->key, fullkey, PEL_RAX_FLAX_KEYLEN) == 0); int direct; void *bucket; if (cache_hit) { @@ -300,7 +295,7 @@ void pelReplace(rax *pel, streamID *id, void *data) { if (raxFind(pel, fullkey, PEL_RAX_DIRECT_KEYLEN, &bucket)) { direct = 1; } else { - int found = raxFind(pel, fullkey, PEL_RAX_KEY_LEN, &bucket); + int found = raxFind(pel, fullkey, PEL_RAX_FLAX_KEYLEN, &bucket); serverAssert(found); direct = 0; } @@ -323,13 +318,13 @@ void pelReplace(rax *pel, streamID *id, void *data) { int pelFind(rax *pel, streamID *id, void **data) { unsigned char fullkey[PEL_RAX_DIRECT_KEYLEN]; streamEncodeID(fullkey, id); - uint8_t fkey = fullkey[PEL_RAX_KEY_LEN]; + uint8_t fkey = fullkey[PEL_RAX_FLAX_KEYLEN]; pelCache *cache = (pelCache *)pel->metadata; void *bucket; - if (cache->val && memcmp(cache->key, fullkey, PEL_RAX_KEY_LEN) == 0) { + if (cache->val && memcmp(cache->key, fullkey, PEL_RAX_FLAX_KEYLEN) == 0) { if (cache->direct) { - if (cache->key[PEL_RAX_KEY_LEN] != fkey) return 0; + if (cache->key[PEL_RAX_FLAX_KEYLEN] != fkey) return 0; if (data) *data = cache->val; return 1; } @@ -342,7 +337,7 @@ int pelFind(rax *pel, streamID *id, void **data) { return 1; } /* Try 15-byte key (flax bucket). 
*/ - if (!raxFind(pel, fullkey, PEL_RAX_KEY_LEN, &bucket)) + if (!raxFind(pel, fullkey, PEL_RAX_FLAX_KEYLEN, &bucket)) return 0; } @@ -357,11 +352,11 @@ int pelFind(rax *pel, streamID *id, void **data) { void *pelRemove(rax *pel, streamID *id, uint64_t *count) { unsigned char fullkey[PEL_RAX_DIRECT_KEYLEN]; streamEncodeID(fullkey, id); - uint8_t fkey = fullkey[PEL_RAX_KEY_LEN]; + uint8_t fkey = fullkey[PEL_RAX_FLAX_KEYLEN]; pelCache *cache = (pelCache *)pel->metadata; int cache_hit = (cache->val != NULL && - memcmp(cache->key, fullkey, PEL_RAX_KEY_LEN) == 0); + memcmp(cache->key, fullkey, PEL_RAX_FLAX_KEYLEN) == 0); int direct; void *bucket; if (cache_hit) { @@ -371,7 +366,7 @@ void *pelRemove(rax *pel, streamID *id, uint64_t *count) { pelCacheFlush(pel); if (raxFind(pel, fullkey, PEL_RAX_DIRECT_KEYLEN, &bucket)) { direct = 1; - } else if (raxFind(pel, fullkey, PEL_RAX_KEY_LEN, &bucket)) { + } else if (raxFind(pel, fullkey, PEL_RAX_FLAX_KEYLEN, &bucket)) { direct = 0; } else { return NULL; @@ -380,7 +375,7 @@ void *pelRemove(rax *pel, streamID *id, uint64_t *count) { /* Direct entry. 
*/ if (direct) { - if (cache_hit && cache->key[PEL_RAX_KEY_LEN] != fkey) return NULL; + if (cache_hit && cache->key[PEL_RAX_FLAX_KEYLEN] != fkey) return NULL; void *old = bucket; if (count) (*count)--; if (cache_hit && cache->dirty) { @@ -408,7 +403,7 @@ void *pelRemove(rax *pel, streamID *id, uint64_t *count) { cache->dirty = 0; cache->direct = 0; } else { - raxRemove(pel, fullkey, PEL_RAX_KEY_LEN, NULL); + raxRemove(pel, fullkey, PEL_RAX_FLAX_KEYLEN, NULL); pelCacheInvalidate(pel); } } else if (f->numele == 1) { @@ -418,8 +413,8 @@ void *pelRemove(rax *pel, streamID *id, uint64_t *count) { flaxSeek(&fi, "^", 0); void *directval = fi.data; unsigned char directkey[PEL_RAX_DIRECT_KEYLEN]; - memcpy(directkey, fullkey, PEL_RAX_KEY_LEN); - directkey[PEL_RAX_KEY_LEN] = (unsigned char)fi.key; + memcpy(directkey, fullkey, PEL_RAX_FLAX_KEYLEN); + directkey[PEL_RAX_FLAX_KEYLEN] = (unsigned char)fi.key; flaxStop(&fi); if (pel->alloc_size) *pel->alloc_size -= flaxAllocSize(f); flaxFree(f); @@ -428,11 +423,11 @@ void *pelRemove(rax *pel, streamID *id, uint64_t *count) { cache->direct = 1; memcpy(cache->key, directkey, PEL_RAX_DIRECT_KEYLEN); if (!cache->dirty) { - raxRemove(pel, fullkey, PEL_RAX_KEY_LEN, NULL); + raxRemove(pel, fullkey, PEL_RAX_FLAX_KEYLEN, NULL); raxInsert(pel, directkey, PEL_RAX_DIRECT_KEYLEN, directval, NULL); } } else { - raxRemove(pel, fullkey, PEL_RAX_KEY_LEN, NULL); + raxRemove(pel, fullkey, PEL_RAX_FLAX_KEYLEN, NULL); raxInsert(pel, directkey, PEL_RAX_DIRECT_KEYLEN, directval, NULL); } } @@ -444,8 +439,8 @@ void *pelRemove(rax *pel, streamID *id, uint64_t *count) { /* Refresh iterator fields from current rax+flax positions (flax bucket). 
*/ static void pelIterRefresh(pelIterator *pi) { - memcpy(pi->key, pi->ri.key, PEL_RAX_KEY_LEN); - pi->key[PEL_RAX_KEY_LEN] = (unsigned char)pi->fi.key; + memcpy(pi->key, pi->ri.key, PEL_RAX_FLAX_KEYLEN); + pi->key[PEL_RAX_FLAX_KEYLEN] = (unsigned char)pi->fi.key; streamDecodeID(pi->key, &pi->id); pi->data = pi->fi.data; pi->valid = 1; @@ -540,7 +535,7 @@ int pelIterSeek(pelIterator *pi, const char *op, streamID *id) { } else if (op[0] == '>' && op[1] == '=') { unsigned char fullkey[PEL_RAX_DIRECT_KEYLEN]; streamEncodeID(fullkey, id); - uint8_t fkey = fullkey[PEL_RAX_KEY_LEN]; + uint8_t fkey = fullkey[PEL_RAX_FLAX_KEYLEN]; raxSeek(&pi->ri, "<=", fullkey, PEL_RAX_DIRECT_KEYLEN); if (!raxNext(&pi->ri)) { @@ -551,11 +546,11 @@ int pelIterSeek(pelIterator *pi, const char *op, streamID *id) { return 1; } - int prefix_cmp = memcmp(pi->ri.key, fullkey, PEL_RAX_KEY_LEN); + int prefix_cmp = memcmp(pi->ri.key, fullkey, PEL_RAX_FLAX_KEYLEN); if (prefix_cmp == 0) { if (pi->ri.key_len == PEL_RAX_DIRECT_KEYLEN) { - if (pi->ri.key[PEL_RAX_KEY_LEN] >= fkey) { + if (pi->ri.key[PEL_RAX_FLAX_KEYLEN] >= fkey) { pelIterRefreshDirect(pi); } else { if (!pelIterAdvanceRax(pi)) return 0; From 4ef4b86e5e8237b685a426fb192ef64b3bf6bd29 Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Tue, 14 Apr 2026 11:06:42 +0300 Subject: [PATCH 40/48] fixed: issue from review --- src/flax.c | 59 +++++++++++++++++++++++++++++++++++++++++++++----- src/t_stream.c | 2 ++ 2 files changed, 55 insertions(+), 6 deletions(-) diff --git a/src/flax.c b/src/flax.c index ae9170e6f..ee3defc17 100644 --- a/src/flax.c +++ b/src/flax.c @@ -242,6 +242,16 @@ int flaxRemove(flax *f, uint8_t key, void **old) { } f->numele--; + + if (f->numele > 0 && + f->capacity > FLAX_INIT_CAPACITY && + f->numele <= f->capacity / 2) + { + uint16_t new_cap = f->capacity / 2; + if (new_cap < FLAX_INIT_CAPACITY) new_cap = FLAX_INIT_CAPACITY; + flax_resize(f, new_cap); + } + return 1; } @@ -663,22 +673,21 @@ int flaxTest(int argc, char 
**argv, int flags) { flaxFreeWithCallback(a, flax_test_counting_free, NULL); } - TEST("shrink after many removals") { + TEST("auto-shrink on remove") { flax *a = flaxNew(); for (int i = 0; i < 64; i++) flaxInsert(a, (uint8_t)i, "x", NULL); assert(flaxSize(a) == 64); - uint16_t cap_before = a->capacity; + uint16_t cap_full = a->capacity; for (int i = 0; i < 56; i++) flaxRemove(a, (uint8_t)i, NULL); assert(flaxSize(a) == 8); - flaxShrink(a); - if (a->capacity >= cap_before) { - ERR("shrink: capacity %u should be less than %u", - a->capacity, cap_before); + if (a->capacity >= cap_full) { + ERR("auto-shrink: capacity %u should be less than %u after removals", + a->capacity, cap_full); } for (int i = 56; i < 64; i++) { @@ -690,6 +699,44 @@ int flaxTest(int argc, char **argv, int flags) { flaxFree(a); } + TEST("explicit shrink after removals") { + flax *a = flaxNew(); + for (int i = 0; i < 64; i++) + flaxInsert(a, (uint8_t)i, "x", NULL); + + for (int i = 0; i < 56; i++) + flaxRemove(a, (uint8_t)i, NULL); + + assert(flaxSize(a) == 8); + uint16_t cap_after_remove = a->capacity; + flaxShrink(a); + assert(a->capacity <= cap_after_remove); + assert(a->capacity >= a->numele); + + for (int i = 56; i < 64; i++) { + void *val; + assert(flaxFind(a, (uint8_t)i, &val) == 1); + assert(strcmp(val, "x") == 0); + } + + flaxFree(a); + } + + TEST("no shrink below FLAX_INIT_CAPACITY") { + flax *a = flaxNew(); + for (int i = 0; i < 4; i++) + flaxInsert(a, (uint8_t)i, "x", NULL); + + assert(a->capacity == FLAX_INIT_CAPACITY); + for (int i = 0; i < 3; i++) + flaxRemove(a, (uint8_t)i, NULL); + + assert(flaxSize(a) == 1); + assert(a->capacity == FLAX_INIT_CAPACITY); + + flaxFree(a); + } + TEST("flaxFreeWithCallback invokes callback") { flax_test_free_count = 0; flax *a = flaxNew(); diff --git a/src/t_stream.c b/src/t_stream.c index 777129279..0d7fe71d2 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -392,7 +392,9 @@ void *pelRemove(rax *pel, streamID *id, uint64_t *count) { /* Flax bucket. 
*/ flax *f = (flax *)bucket; void *old; + size_t before = flaxAllocSize(f); if (!flaxRemove(f, fkey, &old)) return NULL; + if (pel->alloc_size) *pel->alloc_size -= before - flaxAllocSize(f); if (count) (*count)--; if (f->numele == 0) { From de47df481bb62d340e9765f60c8b0b5e676138a7 Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Tue, 14 Apr 2026 12:27:24 +0300 Subject: [PATCH 41/48] fixed: issue from review --- src/defrag.c | 2 +- src/flax.c | 36 +++ src/flax.h | 1 + tests/unit/type/stream-cgroups.tcl | 368 ----------------------------- 4 files changed, 38 insertions(+), 369 deletions(-) diff --git a/src/defrag.c b/src/defrag.c index 5290d8459..b6433a2e7 100644 --- a/src/defrag.c +++ b/src/defrag.c @@ -958,7 +958,7 @@ void* defragStreamCGPendingEntry(raxIterator *ri, void *privdata) { nack->cgroup_ref_node->value = cg; newnack = activeDefragAlloc(nack); if (newnack) { - flaxInsert(f, fi.key, newnack, NULL); + flaxIterSetData(&fi, newnack); relinkStreamNack(cg, nack, newnack); } } while (flaxNext(&fi)); diff --git a/src/flax.c b/src/flax.c index ee3defc17..0be3468d8 100644 --- a/src/flax.c +++ b/src/flax.c @@ -474,6 +474,16 @@ int flaxEOF(flaxIterator *it) { return it->idx < 0 || it->idx >= it->f->numele; } +/* Replace the data pointer at the current iterator position. Unlike + * flaxInsert(), this is safe to call during iteration: it only writes to + * the value slot at the current index and never touches the key layout, + * element count, or capacity. The iterator's own 'data' field is updated + * to reflect the new value. 
*/ +void flaxIterSetData(flaxIterator *it, void *data) { + flax_values(it->f)[it->idx] = data; + it->data = data; +} + /* ----------------------------- Unit tests --------------------------------- */ #ifdef REDIS_TEST @@ -1148,6 +1158,32 @@ int flaxTest(int argc, char **argv, int flags) { flaxFree(a); } + TEST("flaxIterSetData replaces value during iteration") { + flax *a = flaxNew(); + flaxInsert(a, 10, "ten", NULL); + flaxInsert(a, 20, "twenty", NULL); + flaxInsert(a, 30, "thirty", NULL); + + flaxIterator it; + flaxStart(&it, a); + assert(flaxSeek(&it, "^", 0)); + do { + if (it.key == 20) flaxIterSetData(&it, "TWENTY"); + } while (flaxNext(&it)); + flaxStop(&it); + + void *val; + assert(flaxFind(a, 10, &val) == 1); + assert(strcmp(val, "ten") == 0); + assert(flaxFind(a, 20, &val) == 1); + assert(strcmp(val, "TWENTY") == 0); + assert(flaxFind(a, 30, &val) == 1); + assert(strcmp(val, "thirty") == 0); + assert(flaxSize(a) == 3); + + flaxFree(a); + } + if (!err) printf("ALL TESTS PASSED!\n"); else diff --git a/src/flax.h b/src/flax.h index 5347d1c58..e2c2cd20e 100644 --- a/src/flax.h +++ b/src/flax.h @@ -91,6 +91,7 @@ int flaxNext(flaxIterator *it); int flaxPrev(flaxIterator *it); void flaxStop(flaxIterator *it); int flaxEOF(flaxIterator *it); +void flaxIterSetData(flaxIterator *it, void *data); /* --- Introspection --- */ uint16_t flaxSize(flax *f); diff --git a/tests/unit/type/stream-cgroups.tcl b/tests/unit/type/stream-cgroups.tcl index 3ae272358..9adb7c705 100644 --- a/tests/unit/type/stream-cgroups.tcl +++ b/tests/unit/type/stream-cgroups.tcl @@ -3291,374 +3291,6 @@ start_server { } } - test "Two-level PEL bucket overflow with fixed-ms entries" { - r DEL mystream - - # Add 600 entries sharing the same ms, forcing multiple flax buckets. - for {set i 0} {$i < 600} {incr i} { - r XADD mystream 1000-$i field value$i - } - - r XGROUP CREATE mystream grp 0 - r XREADGROUP GROUP grp consumer1 COUNT 600 STREAMS mystream > - - # Verify all 600 entries are pending. 
- set pending [r XPENDING mystream grp - + 10] - set summary [r XPENDING mystream grp] - assert_equal [lindex $summary 0] 600 - - # XACK entries from different bucket ranges: early, middle, late. - r XACK mystream grp 1000-0 1000-1 1000-2 - r XACK mystream grp 1000-300 1000-301 - r XACK mystream grp 1000-598 1000-599 - - set summary [r XPENDING mystream grp] - assert_equal [lindex $summary 0] 593 - - # XCLAIM an entry from a different bucket range into a new consumer. - after 10 - r XCLAIM mystream grp consumer2 0 1000-400 - - set pending_c2 [r XPENDING mystream grp - + 10 consumer2] - assert_equal [llength $pending_c2] 1 - assert_equal [lindex $pending_c2 0 0] "1000-400" - - # Verify consumer1's count decreased. - set pending_c1_summary [r XPENDING mystream grp - + 700 consumer1] - assert_equal [llength $pending_c1_summary] 592 - } - - test "Two-level PEL non-sequential insertion via XCLAIM into full buckets" { - r DEL mystream - - # Add 600 entries: consumer1 reads 0-299, consumer2 reads 300-599. - # consumer2's PEL will have a full bucket (256 entries) at seq_base=0. - for {set i 0} {$i < 600} {incr i} { - r XADD mystream 2000-$i field value$i - } - - r XGROUP CREATE mystream grp 0 - r XREADGROUP GROUP grp consumer1 COUNT 300 STREAMS mystream > - r XREADGROUP GROUP grp consumer2 COUNT 300 STREAMS mystream > - - set summary [r XPENDING mystream grp] - assert_equal [lindex $summary 0] 600 - - # XCLAIM entries 0-99 from consumer1 to consumer2. This inserts - # seqs 0-99 into consumer2's PEL which already holds seqs 300-599 - # in a full bucket, exercising non-sequential insertion into a - # bucket that has grown beyond FLAX_BUCKET_MAX. - for {set i 0} {$i < 100} {incr i} { - r XCLAIM mystream grp consumer2 0 2000-$i - } - - # consumer2 should now have 300 + 100 = 400 entries. - set pending_c2 [r XPENDING mystream grp - + 600 consumer2] - assert_equal [llength $pending_c2] 400 - - # Verify entries are returned in correct sorted order (numeric comparison). 
- set prev_ms -1 - set prev_seq -1 - foreach entry $pending_c2 { - set id [lindex $entry 0] - set parts [split $id -] - set ms [lindex $parts 0] - set seq [lindex $parts 1] - if {$ms == $prev_ms} { - assert {$seq > $prev_seq} - } elseif {$prev_ms >= 0} { - assert {$ms > $prev_ms} - } - set prev_ms $ms - set prev_seq $seq - } - - # Verify first and last entries. - assert_equal [lindex $pending_c2 0 0] "2000-0" - assert_equal [lindex $pending_c2 end 0] "2000-599" - } - - test "Two-level PEL empty-bucket removal after ACK" { - r DEL mystream - - # Create 3 buckets worth of entries (768 entries, 256 each) under - # one ms, then ACK all entries in the middle bucket. - for {set i 0} {$i < 768} {incr i} { - r XADD mystream 3000-$i field value$i - } - - r XGROUP CREATE mystream grp 0 - r XREADGROUP GROUP grp consumer1 COUNT 768 STREAMS mystream > - - set summary [r XPENDING mystream grp] - assert_equal [lindex $summary 0] 768 - - # ACK the middle 256 entries (seq 256-511). - for {set i 256} {$i < 512} {incr i} { - r XACK mystream grp 3000-$i - } - - set summary [r XPENDING mystream grp] - assert_equal [lindex $summary 0] 512 - - # Verify XPENDING returns only entries from first and last buckets - # in correct order, with no entries from the removed middle bucket. - set all_pending [r XPENDING mystream grp - + 600] - assert_equal [llength $all_pending] 512 - - foreach entry $all_pending { - set id [lindex $entry 0] - set seq [lindex [split $id -] 1] - assert {$seq < 256 || $seq >= 512} - } - - # Verify boundary entries still exist. - set first [lindex $all_pending 0 0] - set last [lindex $all_pending end 0] - assert_equal $first "3000-0" - assert_equal $last "3000-767" - } - - test "Two-level PEL cross-bucket iteration with XPENDING range" { - r DEL mystream - - # Use two different ms values, each with enough entries to span - # multiple buckets, to test iteration across ms+bucket boundaries. 
- for {set i 0} {$i < 400} {incr i} { - r XADD mystream 4000-$i field value$i - } - for {set i 0} {$i < 400} {incr i} { - r XADD mystream 5000-$i field value$i - } - - r XGROUP CREATE mystream grp 0 - r XREADGROUP GROUP grp consumer1 COUNT 800 STREAMS mystream > - - set summary [r XPENDING mystream grp] - assert_equal [lindex $summary 0] 800 - - # Fetch all 800 entries and verify strict ordering. - set all_pending [r XPENDING mystream grp - + 900] - assert_equal [llength $all_pending] 800 - - set prev_ms 0 - set prev_seq -1 - foreach entry $all_pending { - set id [lindex $entry 0] - set parts [split $id -] - set ms [lindex $parts 0] - set seq [lindex $parts 1] - if {$ms == $prev_ms} { - assert {$seq > $prev_seq} - } else { - assert {$ms > $prev_ms} - } - set prev_ms $ms - set prev_seq $seq - } - - # Paginated fetch: use COUNT to walk 100 entries at a time and - # verify continuity across pages. - set start "-" - set collected {} - while {1} { - set page [r XPENDING mystream grp $start + 100] - if {[llength $page] == 0} break - foreach entry $page { - lappend collected [lindex $entry 0] - } - # Advance start past the last entry returned. - set last_id [lindex $page end 0] - set parts [split $last_id -] - set next_seq [expr {[lindex $parts 1] + 1}] - set start "[lindex $parts 0]-$next_seq" - } - assert_equal [llength $collected] 800 - } - - test "Two-level PEL direct entries with 1 msg/ms pattern" { - r DEL mystream - - # Add entries with unique ms values (1 msg per ms), each with seq=0. - # This exercises the direct entry path: each rax bucket has 1 entry. - for {set i 0} {$i < 100} {incr i} { - set ms [expr {10000 + $i}] - r XADD mystream $ms-0 field value$i - } - - r XGROUP CREATE mystream grp 0 - r XREADGROUP GROUP grp consumer1 COUNT 100 STREAMS mystream > - - set summary [r XPENDING mystream grp] - assert_equal [lindex $summary 0] 100 - - # Verify iteration returns all entries in order. 
- set all [r XPENDING mystream grp - + 200] - assert_equal [llength $all] 100 - assert_equal [lindex $all 0 0] "10000-0" - assert_equal [lindex $all end 0] "10099-0" - - # ACK some entries to test direct entry removal. - r XACK mystream grp 10000-0 10050-0 10099-0 - set summary [r XPENDING mystream grp] - assert_equal [lindex $summary 0] 97 - - # Verify boundaries after ACK. - set all [r XPENDING mystream grp - + 200] - assert_equal [llength $all] 97 - assert_equal [lindex $all 0 0] "10001-0" - assert_equal [lindex $all end 0] "10098-0" - } - - test "Two-level PEL direct-to-flax promotion and flax-to-direct demotion" { - r DEL mystream - - # Create two entries sharing the same ms but different seq. - # First entry creates a direct bucket; second promotes to flax. - r XADD mystream 20000-0 field val0 - r XADD mystream 20000-1 field val1 - - r XGROUP CREATE mystream grp 0 - r XREADGROUP GROUP grp consumer1 COUNT 2 STREAMS mystream > - - set summary [r XPENDING mystream grp] - assert_equal [lindex $summary 0] 2 - - set all [r XPENDING mystream grp - + 10] - assert_equal [llength $all] 2 - assert_equal [lindex $all 0 0] "20000-0" - assert_equal [lindex $all 1 0] "20000-1" - - # ACK one entry: flax drops to 1 element, should demote to direct. - r XACK mystream grp 20000-0 - set summary [r XPENDING mystream grp] - assert_equal [lindex $summary 0] 1 - set all [r XPENDING mystream grp - + 10] - assert_equal [lindex $all 0 0] "20000-1" - - # ACK the last entry: bucket removed entirely. 
- r XACK mystream grp 20000-1 - set summary [r XPENDING mystream grp] - assert_equal [lindex $summary 0] 0 - } - - test "Two-level PEL direct entries with XCLAIM across consumers" { - r DEL mystream - - # 1 msg/ms pattern - for {set i 0} {$i < 10} {incr i} { - set ms [expr {30000 + $i}] - r XADD mystream $ms-0 field value$i - } - - r XGROUP CREATE mystream grp 0 - r XREADGROUP GROUP grp consumer1 COUNT 10 STREAMS mystream > - - # XCLAIM first 5 entries to consumer2 - for {set i 0} {$i < 5} {incr i} { - set ms [expr {30000 + $i}] - r XCLAIM mystream grp consumer2 0 $ms-0 - } - - set pending_c1 [r XPENDING mystream grp - + 20 consumer1] - set pending_c2 [r XPENDING mystream grp - + 20 consumer2] - assert_equal [llength $pending_c1] 5 - assert_equal [llength $pending_c2] 5 - - # Verify ordering in each consumer's PEL. - assert_equal [lindex $pending_c1 0 0] "30005-0" - assert_equal [lindex $pending_c1 end 0] "30009-0" - assert_equal [lindex $pending_c2 0 0] "30000-0" - assert_equal [lindex $pending_c2 end 0] "30004-0" - } - - test "Two-level PEL direct entries survive RDB save/load" { - r DEL mystream - - # 1 msg/ms pattern - for {set i 0} {$i < 50} {incr i} { - set ms [expr {40000 + $i}] - r XADD mystream $ms-0 field value$i - } - - r XGROUP CREATE mystream grp 0 - r XREADGROUP GROUP grp consumer1 COUNT 50 STREAMS mystream > - - r DEBUG RELOAD - - set summary [r XPENDING mystream grp] - assert_equal [lindex $summary 0] 50 - - set all [r XPENDING mystream grp - + 100] - assert_equal [llength $all] 50 - assert_equal [lindex $all 0 0] "40000-0" - assert_equal [lindex $all end 0] "40049-0" - } {} {needs:debug} - - test "Two-level PEL mixed direct and flax buckets" { - r DEL mystream - - # Create a mix: some ms values with 1 entry (direct) and some with - # multiple entries (flax). This exercises iteration across both types. 
- for {set ms 50000} {$ms < 50005} {incr ms} { - r XADD mystream $ms-0 field val-$ms-0 - } - # Add multiple entries under ms=50005 (will be flax after promotion) - for {set seq 0} {$seq < 5} {incr seq} { - r XADD mystream 50005-$seq field val-50005-$seq - } - # More single-entry ms values after the flax bucket - for {set ms 50006} {$ms < 50010} {incr ms} { - r XADD mystream $ms-0 field val-$ms-0 - } - - r XGROUP CREATE mystream grp 0 - r XREADGROUP GROUP grp consumer1 COUNT 100 STREAMS mystream > - - set summary [r XPENDING mystream grp] - assert_equal [lindex $summary 0] 14 - - set all [r XPENDING mystream grp - + 20] - assert_equal [llength $all] 14 - - # Verify full ordering across direct and flax buckets. - set prev_ms 0 - set prev_seq -1 - foreach entry $all { - set id [lindex $entry 0] - set parts [split $id -] - set ms [lindex $parts 0] - set seq [lindex $parts 1] - if {$ms == $prev_ms} { - assert {$seq > $prev_seq} - } else { - assert {$ms > $prev_ms} - } - set prev_ms $ms - set prev_seq $seq - } - - assert_equal [lindex $all 0 0] "50000-0" - assert_equal [lindex $all end 0] "50009-0" - - # ACK all the flax bucket entries except one -> demote to direct - r XACK mystream grp 50005-0 50005-1 50005-2 50005-3 - set summary [r XPENDING mystream grp] - assert_equal [lindex $summary 0] 10 - - set all [r XPENDING mystream grp - + 20] - assert_equal [llength $all] 10 - - # Verify the remaining entry from the former flax bucket - set found 0 - foreach entry $all { - if {[lindex $entry 0] eq "50005-4"} { - set found 1 - } - } - assert_equal $found 1 - } - # Verify that XNACK rejects every invalid invocation with the correct error. 
# Covers: wrong argument count, nonexistent key/group (NOGROUP), wrong key # type (WRONGTYPE), unrecognized options at every position the parser From f1e9b186820ee2affdf30f5c3060ecb3c49b5cae Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Wed, 15 Apr 2026 10:12:59 +0300 Subject: [PATCH 42/48] fixed: issue from review --- src/flax.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/flax.c b/src/flax.c index 0be3468d8..e742cb118 100644 --- a/src/flax.c +++ b/src/flax.c @@ -85,7 +85,7 @@ static void **flax_values(flax *f) { * - Tail: key > keys[numele-1] is the append case, overwhelmingly common * when keys are monotonically increasing sequence numbers. * - Head: key <= keys[0] catches prepend and exact-match-at-zero. */ -static int flax_search(const uint8_t *keys, uint32_t numele, uint8_t key, int16_t *out_idx) { +static int flax_search(const uint8_t *keys, uint16_t numele, uint8_t key, int16_t *out_idx) { if (numele == 0) { *out_idx = 0; return 0; @@ -108,7 +108,7 @@ static int flax_search(const uint8_t *keys, uint32_t numele, uint8_t key, int16_ } /* Linear scan through the middle. */ - for (uint32_t i = 1; i < numele - 1; i++) { + for (uint16_t i = 1; i < numele - 1; i++) { if (keys[i] < key) continue; *out_idx = i; return keys[i] == key; @@ -189,10 +189,13 @@ static int flaxGenericInsert(flax *f, uint8_t key, void *data, void **old, int o if (f->numele == f->capacity) flax_resize(f, f->capacity * 2); + /* Re-fetch pointers after potential resize (flax_resize may + * reallocate the data block, invalidating earlier pointers). */ uint8_t *keys = flax_keys(f); void **vals = flax_values(f); - int16_t tail = f->numele - idx; + int16_t tail = f->numele - idx; /* elements from [idx] onward that must shift right */ + /* Shift elements after idx one slot to the right to open a gap. 
*/ if (tail > 0) { memmove(&keys[idx + 1], &keys[idx], (size_t)tail * sizeof(uint8_t)); memmove(&vals[idx + 1], &vals[idx], (size_t)tail * sizeof(void *)); @@ -202,7 +205,7 @@ static int flaxGenericInsert(flax *f, uint8_t key, void *data, void **old, int o vals[idx] = data; f->numele++; if (old) *old = NULL; - return 1; + return 1; /* new element created */ } /* Overwriting insert. This is just a wrapper for flaxGenericInsert(). */ @@ -234,8 +237,9 @@ int flaxRemove(flax *f, uint8_t key, void **old) { uint8_t *keys = flax_keys(f); void **vals = flax_values(f); if (old) *old = vals[idx]; - int16_t tail = f->numele - idx - 1; + int16_t tail = f->numele - idx - 1; /* elements after [idx] that must shift left */ + /* Collapse the gap left by the removed element. */ if (tail > 0) { memmove(&keys[idx], &keys[idx + 1], (size_t)tail * sizeof(uint8_t)); memmove(&vals[idx], &vals[idx + 1], (size_t)tail * sizeof(void *)); From 33664026397953638ec713bf6bb3acedc407b1f8 Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Wed, 15 Apr 2026 13:36:27 +0300 Subject: [PATCH 43/48] fixed: issue from review --- src/flax.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/flax.c b/src/flax.c index e742cb118..89f740eab 100644 --- a/src/flax.c +++ b/src/flax.c @@ -189,8 +189,6 @@ static int flaxGenericInsert(flax *f, uint8_t key, void *data, void **old, int o if (f->numele == f->capacity) flax_resize(f, f->capacity * 2); - /* Re-fetch pointers after potential resize (flax_resize may - * reallocate the data block, invalidating earlier pointers). 
*/ uint8_t *keys = flax_keys(f); void **vals = flax_values(f); int16_t tail = f->numele - idx; /* elements from [idx] onward that must shift right */ From d9ac2a28fa9784ab75621793f9d4f93ec445e37e Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Wed, 15 Apr 2026 15:28:50 +0300 Subject: [PATCH 44/48] fixed: issue from review --- src/flax.c | 29 ++++++++++++----------------- src/flax.h | 5 +---- src/t_stream.c | 1 - 3 files changed, 13 insertions(+), 22 deletions(-) diff --git a/src/flax.c b/src/flax.c index 89f740eab..c0e0394dd 100644 --- a/src/flax.c +++ b/src/flax.c @@ -463,11 +463,6 @@ int flaxPrev(flaxIterator *it) { return 1; } -/* Stop the iterator (no-op, included for API symmetry with rax). */ -void flaxStop(flaxIterator *it) { - (void)it; -} - /* Return if the iterator is in an EOF state. This happens when flaxSeek() * failed to seek an appropriate element, so that flaxNext() or flaxPrev() * will return zero, or when an EOF condition was reached while iterating @@ -819,7 +814,7 @@ int flaxTest(int argc, char **argv, int flags) { assert(flaxEOF(&it) == 1); assert(flaxSeek(&it, "$", 0) == 0); assert(flaxSeek(&it, ">=", 42) == 0); - flaxStop(&it); + flaxFree(a); } @@ -844,7 +839,7 @@ int flaxTest(int argc, char **argv, int flags) { assert(it.key == 40); assert(flaxNext(&it) == 0); assert(flaxEOF(&it) == 1); - flaxStop(&it); + flaxFree(a); } @@ -864,7 +859,7 @@ int flaxTest(int argc, char **argv, int flags) { assert(flaxPrev(&it)); assert(it.key == 10); assert(flaxPrev(&it) == 0); - flaxStop(&it); + flaxFree(a); } @@ -890,7 +885,7 @@ int flaxTest(int argc, char **argv, int flags) { assert(flaxSeek(&it, ">=", 41) == 0); assert(flaxEOF(&it) == 1); - flaxStop(&it); + flaxFree(a); } @@ -910,7 +905,7 @@ int flaxTest(int argc, char **argv, int flags) { assert(flaxSeek(&it, "$", 0)); assert(it.key == 42); assert(flaxPrev(&it) == 0); - flaxStop(&it); + flaxFree(a); } @@ -958,7 +953,7 @@ int flaxTest(int argc, char **argv, int flags) { assert(flaxSeek(&it, ">", 50) 
== 0); assert(flaxEOF(&it) == 1); - flaxStop(&it); + flaxFree(a); } @@ -992,7 +987,7 @@ int flaxTest(int argc, char **argv, int flags) { assert(flaxSeek(&it, "<=", 5) == 0); assert(flaxEOF(&it) == 1); - flaxStop(&it); + flaxFree(a); } @@ -1026,7 +1021,7 @@ int flaxTest(int argc, char **argv, int flags) { assert(flaxSeek(&it, "<", 5) == 0); assert(flaxEOF(&it) == 1); - flaxStop(&it); + flaxFree(a); } @@ -1059,7 +1054,7 @@ int flaxTest(int argc, char **argv, int flags) { assert(flaxSeek(&it, "=", 0) == 0); assert(flaxSeek(&it, "=", 255) == 0); - flaxStop(&it); + flaxFree(a); } @@ -1142,7 +1137,7 @@ int flaxTest(int argc, char **argv, int flags) { assert(flaxSeek(&it, "=", 255)); assert(it.key == 255); - flaxStop(&it); + flaxFree(a); } @@ -1156,7 +1151,7 @@ int flaxTest(int argc, char **argv, int flags) { assert(flaxSeek(&it, "<", 42) == 0); assert(flaxSeek(&it, "=", 42) == 0); - flaxStop(&it); + flaxFree(a); } @@ -1172,7 +1167,7 @@ int flaxTest(int argc, char **argv, int flags) { do { if (it.key == 20) flaxIterSetData(&it, "TWENTY"); } while (flaxNext(&it)); - flaxStop(&it); + void *val; assert(flaxFind(a, 10, &val) == 1); diff --git a/src/flax.h b/src/flax.h index e2c2cd20e..47e9db784 100644 --- a/src/flax.h +++ b/src/flax.h @@ -56,11 +56,9 @@ typedef struct flax { * flaxStart(&it, myflax); -- initialize * flaxSeek(&it, ">=", somekey); -- position * while (flaxNext(&it)) { ... } -- iterate (or flaxPrev) - * flaxStop(&it); -- cleanup * * After flaxStart() the iterator is in EOF state until a successful - * flaxSeek(). The iterator does not allocate heap memory, so flaxStop() - * is a no-op included for API symmetry with rax. + * flaxSeek(). The iterator does not allocate heap memory. * * WARNING: the iterator is invalidated by any mutation (insert / remove / * resize) on the underlying flax. Do not modify the flax while iterating. 
*/ @@ -89,7 +87,6 @@ void flaxStart(flaxIterator *it, flax *f); int flaxSeek(flaxIterator *it, const char *op, uint8_t key); int flaxNext(flaxIterator *it); int flaxPrev(flaxIterator *it); -void flaxStop(flaxIterator *it); int flaxEOF(flaxIterator *it); void flaxIterSetData(flaxIterator *it, void *data); diff --git a/src/t_stream.c b/src/t_stream.c index 0d7fe71d2..3df6a32b8 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -417,7 +417,6 @@ void *pelRemove(rax *pel, streamID *id, uint64_t *count) { unsigned char directkey[PEL_RAX_DIRECT_KEYLEN]; memcpy(directkey, fullkey, PEL_RAX_FLAX_KEYLEN); directkey[PEL_RAX_FLAX_KEYLEN] = (unsigned char)fi.key; - flaxStop(&fi); if (pel->alloc_size) *pel->alloc_size -= flaxAllocSize(f); flaxFree(f); if (cache_hit) { From d6c7c13065e1d9e3fad08df48cd68fb5168c1f36 Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Thu, 16 Apr 2026 09:44:47 +0300 Subject: [PATCH 45/48] fixed: issue from review --- src/t_stream.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/t_stream.c b/src/t_stream.c index 3df6a32b8..2c7c62f52 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -236,11 +236,9 @@ static int pelGenericInsert(rax *pel, streamID *id, void *data, uint64_t *count, /* Different fkey: promote to flax. 
*/ flax *f = flaxNew(); - if (pel->alloc_size) *pel->alloc_size += flaxAllocSize(f); flaxInsert(f, efkey, bucket, NULL); - size_t before = flaxAllocSize(f); flaxInsert(f, fkey, data, NULL); - if (pel->alloc_size) *pel->alloc_size += flaxAllocSize(f) - before; + if (pel->alloc_size) *pel->alloc_size += flaxAllocSize(f); if (!cache->dirty) { raxRemove(pel, cache->key, PEL_RAX_DIRECT_KEYLEN, NULL); raxInsert(pel, fullkey, PEL_RAX_FLAX_KEYLEN, f, NULL); From 963aaf2dd637e2eb86232f060a2404e30eeed5f1 Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Thu, 16 Apr 2026 10:19:28 +0300 Subject: [PATCH 46/48] fixed: issue from review --- src/flax.c | 303 +--------------------------------------------- src/flax.h | 4 +- src/flax_malloc.h | 4 +- src/stream.h | 2 +- src/t_stream.c | 18 ++- 5 files changed, 23 insertions(+), 308 deletions(-) diff --git a/src/flax.c b/src/flax.c index c0e0394dd..dfab73b86 100644 --- a/src/flax.c +++ b/src/flax.c @@ -320,7 +320,7 @@ void flaxShrink(flax *f) { /* Initialize a flax iterator. This call should be performed a single time * to initialize the iterator, and must be followed by a flaxSeek() call, - * otherwise the flaxPrev()/flaxNext() functions will just return EOF. */ + * otherwise the flaxNext() function will just return 0. */ void flaxStart(flaxIterator *it, flax *f) { it->f = f; it->idx = -1; @@ -330,9 +330,8 @@ void flaxStart(flaxIterator *it, flax *f) { /* Seek an iterator at the specified element. The 'op' argument selects the * seek mode: "^" for the first element, "$" for the last, ">=" for greater - * or equal, ">" for strictly greater, "<=" for less or equal, "<" for - * strictly less, and "=" for exact match. Return 0 if no matching element - * was found, otherwise 1 is returned. */ + * or equal. Return 0 if no matching element was found, otherwise 1 is + * returned. 
*/ int flaxSeek(flaxIterator *it, const char *op, uint8_t key) { if (!it->f || it->f->numele == 0) { it->idx = -1; @@ -367,66 +366,6 @@ int flaxSeek(flaxIterator *it, const char *op, uint8_t key) { return 1; } - if (op[0] == '>' && op[1] == '\0') { - int16_t idx; - int found = flax_search(flax_keys(it->f), it->f->numele, key, &idx); - if (found) idx++; - if (idx >= it->f->numele) { - it->idx = -1; - it->key = 0; - it->data = NULL; - return 0; - } - it->idx = idx; - flaxIterRefresh(it); - return 1; - } - - if (op[0] == '<' && op[1] == '=') { - int16_t idx; - int found = flax_search(flax_keys(it->f), it->f->numele, key, &idx); - if (found) { - it->idx = idx; - } else { - if (idx == 0) { - it->idx = -1; - it->key = 0; - it->data = NULL; - return 0; - } - it->idx = idx - 1; - } - flaxIterRefresh(it); - return 1; - } - - if (op[0] == '<' && op[1] == '\0') { - int16_t idx; - flax_search(flax_keys(it->f), it->f->numele, key, &idx); - if (idx == 0) { - it->idx = -1; - it->key = 0; - it->data = NULL; - return 0; - } - it->idx = idx - 1; - flaxIterRefresh(it); - return 1; - } - - if (op[0] == '=' && op[1] == '\0') { - int16_t idx; - if (!flax_search(flax_keys(it->f), it->f->numele, key, &idx)) { - it->idx = -1; - it->key = 0; - it->data = NULL; - return 0; - } - it->idx = idx; - flaxIterRefresh(it); - return 1; - } - assert(0 && "flaxSeek: unrecognized op"); it->idx = -1; it->key = 0; @@ -449,34 +388,13 @@ int flaxNext(flaxIterator *it) { return 1; } -/* Go to the previous element in the scope of the iterator 'it'. - * If EOF is reached, 0 is returned, otherwise 1 is returned. */ -int flaxPrev(flaxIterator *it) { - if (it->idx < 0) return 0; - it->idx--; - if (it->idx < 0) { - it->key = 0; - it->data = NULL; - return 0; - } - flaxIterRefresh(it); - return 1; -} - -/* Return if the iterator is in an EOF state. 
This happens when flaxSeek() - * failed to seek an appropriate element, so that flaxNext() or flaxPrev() - * will return zero, or when an EOF condition was reached while iterating - * with flaxNext() and flaxPrev(). */ -int flaxEOF(flaxIterator *it) { - return it->idx < 0 || it->idx >= it->f->numele; -} - /* Replace the data pointer at the current iterator position. Unlike * flaxInsert(), this is safe to call during iteration: it only writes to * the value slot at the current index and never touches the key layout, * element count, or capacity. The iterator's own 'data' field is updated * to reflect the new value. */ void flaxIterSetData(flaxIterator *it, void *data) { + assert(it->idx >= 0 && it->idx < it->f->numele); flax_values(it->f)[it->idx] = data; it->data = data; } @@ -811,7 +729,6 @@ int flaxTest(int argc, char **argv, int flags) { flaxIterator it; flaxStart(&it, a); assert(flaxSeek(&it, "^", 0) == 0); - assert(flaxEOF(&it) == 1); assert(flaxSeek(&it, "$", 0) == 0); assert(flaxSeek(&it, ">=", 42) == 0); @@ -838,27 +755,6 @@ int flaxTest(int argc, char **argv, int flags) { assert(flaxNext(&it)); assert(it.key == 40); assert(flaxNext(&it) == 0); - assert(flaxEOF(&it) == 1); - - - flaxFree(a); - } - - TEST("iterator backward") { - flax *a = flaxNew(); - flaxInsert(a, 10, "ten", NULL); - flaxInsert(a, 20, "twenty", NULL); - flaxInsert(a, 30, "thirty", NULL); - - flaxIterator it; - flaxStart(&it, a); - assert(flaxSeek(&it, "$", 0)); - assert(it.key == 30); - assert(flaxPrev(&it)); - assert(it.key == 20); - assert(flaxPrev(&it)); - assert(it.key == 10); - assert(flaxPrev(&it) == 0); flaxFree(a); @@ -884,7 +780,6 @@ int flaxTest(int argc, char **argv, int flags) { assert(it.key == 10); assert(flaxSeek(&it, ">=", 41) == 0); - assert(flaxEOF(&it) == 1); flaxFree(a); @@ -901,11 +796,6 @@ int flaxTest(int argc, char **argv, int flags) { assert(strcmp(it.data, "answer") == 0); assert(flaxNext(&it) == 0); - flaxStart(&it, a); - assert(flaxSeek(&it, "$", 0)); - 
assert(it.key == 42); - assert(flaxPrev(&it) == 0); - flaxFree(a); } @@ -923,141 +813,6 @@ int flaxTest(int argc, char **argv, int flags) { } } - TEST("iterator seek >") { - flax *a = flaxNew(); - flaxInsert(a, 10, "ten", NULL); - flaxInsert(a, 20, "twenty", NULL); - flaxInsert(a, 30, "thirty", NULL); - flaxInsert(a, 40, "forty", NULL); - - flaxIterator it; - flaxStart(&it, a); - - /* ">" on existing key skips to the next one. */ - assert(flaxSeek(&it, ">", 20)); - assert(it.key == 30); - - /* ">" on non-existing key lands on the first key greater. */ - assert(flaxSeek(&it, ">", 25)); - assert(it.key == 30); - - /* ">" on a key smaller than all elements returns the first. */ - assert(flaxSeek(&it, ">", 5)); - assert(it.key == 10); - - /* ">" on the largest key returns EOF. */ - assert(flaxSeek(&it, ">", 40) == 0); - assert(flaxEOF(&it) == 1); - - /* ">" on a key beyond all elements returns EOF. */ - assert(flaxSeek(&it, ">", 50) == 0); - assert(flaxEOF(&it) == 1); - - - flaxFree(a); - } - - TEST("iterator seek <=") { - flax *a = flaxNew(); - flaxInsert(a, 10, "ten", NULL); - flaxInsert(a, 20, "twenty", NULL); - flaxInsert(a, 30, "thirty", NULL); - flaxInsert(a, 40, "forty", NULL); - - flaxIterator it; - flaxStart(&it, a); - - /* "<=" on existing key lands on that key. */ - assert(flaxSeek(&it, "<=", 20)); - assert(it.key == 20); - - /* "<=" on non-existing key lands on the greatest smaller key. */ - assert(flaxSeek(&it, "<=", 25)); - assert(it.key == 20); - - /* "<=" on the largest key lands on it. */ - assert(flaxSeek(&it, "<=", 40)); - assert(it.key == 40); - - /* "<=" on a key beyond all elements lands on the last. */ - assert(flaxSeek(&it, "<=", 100)); - assert(it.key == 40); - - /* "<=" on a key smaller than all returns EOF. 
*/ - assert(flaxSeek(&it, "<=", 5) == 0); - assert(flaxEOF(&it) == 1); - - - flaxFree(a); - } - - TEST("iterator seek <") { - flax *a = flaxNew(); - flaxInsert(a, 10, "ten", NULL); - flaxInsert(a, 20, "twenty", NULL); - flaxInsert(a, 30, "thirty", NULL); - flaxInsert(a, 40, "forty", NULL); - - flaxIterator it; - flaxStart(&it, a); - - /* "<" on existing key lands on the previous one. */ - assert(flaxSeek(&it, "<", 20)); - assert(it.key == 10); - - /* "<" on non-existing key lands on the greatest smaller key. */ - assert(flaxSeek(&it, "<", 25)); - assert(it.key == 20); - - /* "<" on a key beyond all elements lands on the last. */ - assert(flaxSeek(&it, "<", 100)); - assert(it.key == 40); - - /* "<" on the smallest key returns EOF. */ - assert(flaxSeek(&it, "<", 10) == 0); - assert(flaxEOF(&it) == 1); - - /* "<" on a key smaller than all returns EOF. */ - assert(flaxSeek(&it, "<", 5) == 0); - assert(flaxEOF(&it) == 1); - - - flaxFree(a); - } - - TEST("iterator seek =") { - flax *a = flaxNew(); - flaxInsert(a, 10, "ten", NULL); - flaxInsert(a, 20, "twenty", NULL); - flaxInsert(a, 30, "thirty", NULL); - - flaxIterator it; - flaxStart(&it, a); - - /* "=" on existing key succeeds. */ - assert(flaxSeek(&it, "=", 20)); - assert(it.key == 20); - assert(strcmp(it.data, "twenty") == 0); - - /* "=" on first key. */ - assert(flaxSeek(&it, "=", 10)); - assert(it.key == 10); - - /* "=" on last key. */ - assert(flaxSeek(&it, "=", 30)); - assert(it.key == 30); - - /* "=" on non-existing key returns EOF. 
*/ - assert(flaxSeek(&it, "=", 15) == 0); - assert(flaxEOF(&it) == 1); - - assert(flaxSeek(&it, "=", 0) == 0); - assert(flaxSeek(&it, "=", 255) == 0); - - - flaxFree(a); - } - TEST("flaxAllocSize tracks allocations") { flax *a = flaxNew(); size_t sz0 = flaxAllocSize(a); @@ -1105,56 +860,6 @@ int flaxTest(int argc, char **argv, int flags) { flaxFree(a); } - TEST("iterator seek with boundary keys 0 and 255") { - flax *a = flaxNew(); - flaxInsert(a, 0, "zero", NULL); - flaxInsert(a, 128, "mid", NULL); - flaxInsert(a, 255, "max", NULL); - - flaxIterator it; - flaxStart(&it, a); - - assert(flaxSeek(&it, ">=", 0)); - assert(it.key == 0); - assert(flaxSeek(&it, ">=", 255)); - assert(it.key == 255); - - assert(flaxSeek(&it, ">", 0)); - assert(it.key == 128); - assert(flaxSeek(&it, ">", 255) == 0); - - assert(flaxSeek(&it, "<=", 255)); - assert(it.key == 255); - assert(flaxSeek(&it, "<=", 0)); - assert(it.key == 0); - - assert(flaxSeek(&it, "<", 255)); - assert(it.key == 128); - assert(flaxSeek(&it, "<", 0) == 0); - - assert(flaxSeek(&it, "=", 0)); - assert(it.key == 0); - assert(flaxSeek(&it, "=", 255)); - assert(it.key == 255); - - - flaxFree(a); - } - - TEST("iterator seek on empty flax all operators") { - flax *a = flaxNew(); - flaxIterator it; - flaxStart(&it, a); - - assert(flaxSeek(&it, ">", 42) == 0); - assert(flaxSeek(&it, "<=", 42) == 0); - assert(flaxSeek(&it, "<", 42) == 0); - assert(flaxSeek(&it, "=", 42) == 0); - - - flaxFree(a); - } - TEST("flaxIterSetData replaces value during iteration") { flax *a = flaxNew(); flaxInsert(a, 10, "ten", NULL); diff --git a/src/flax.h b/src/flax.h index 47e9db784..bd62eb756 100644 --- a/src/flax.h +++ b/src/flax.h @@ -55,7 +55,7 @@ typedef struct flax { * flaxIterator it; * flaxStart(&it, myflax); -- initialize * flaxSeek(&it, ">=", somekey); -- position - * while (flaxNext(&it)) { ... } -- iterate (or flaxPrev) + * while (flaxNext(&it)) { ... 
} -- iterate * * After flaxStart() the iterator is in EOF state until a successful * flaxSeek(). The iterator does not allocate heap memory. @@ -86,8 +86,6 @@ int flaxFind(flax *f, uint8_t key, void **value); void flaxStart(flaxIterator *it, flax *f); int flaxSeek(flaxIterator *it, const char *op, uint8_t key); int flaxNext(flaxIterator *it); -int flaxPrev(flaxIterator *it); -int flaxEOF(flaxIterator *it); void flaxIterSetData(flaxIterator *it, void *data); /* --- Introspection --- */ diff --git a/src/flax_malloc.h b/src/flax_malloc.h index 410efabd4..6b1ae2783 100644 --- a/src/flax_malloc.h +++ b/src/flax_malloc.h @@ -15,8 +15,8 @@ * the include of your alternate allocator if needed (not needed in order * to use the default libc allocator). */ -#ifndef FLAX_ALLOC_H -#define FLAX_ALLOC_H +#ifndef FLAX_MALLOC_H +#define FLAX_MALLOC_H #include "zmalloc.h" #define flax_malloc zmalloc #define flax_malloc_usable zmalloc_usable diff --git a/src/stream.h b/src/stream.h index 90ec3309f..9bd893302 100644 --- a/src/stream.h +++ b/src/stream.h @@ -121,7 +121,7 @@ typedef struct streamCG { a 15-byte key (ms + upper 7 bytes of seq) and store a flax* mapping the low byte of seq to streamNACK pointers. Max 256 per bucket. */ - uint64_t pel_count; /* Total number of NACK entries across all flax buckets. */ + uint64_t pel_count; /* Total number of NACK entries in this PEL (direct + flax). */ streamNACK *pel_time_head; /* Head of time-ordered doubly-linked list of pending entries (oldest delivery_time). Used for efficient CLAIM operations. O(1) access to oldest entries. */ diff --git a/src/t_stream.c b/src/t_stream.c index 2c7c62f52..4ea13fed2 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -80,7 +80,15 @@ void streamEncodeID(void *buf, streamID *id); * When 'dirty' is set, the cached value has been created/updated but not yet * inserted into the rax; it will be committed on the next cache eviction * or explicit flush. 
'direct' mirrors the key-length convention: - * 1 = direct (16-byte key, raw data pointer), 0 = flax (15-byte key). */ + * 1 = direct (16-byte key, raw data pointer), 0 = flax (15-byte key). + * + * KEY INVARIANT: dirty==1 means "this bucket exists ONLY in the cache, not in + * the rax". This permits type transitions (direct<->flax) while dirty without + * touching the rax: the old entry was never committed, so there is nothing + * stale to remove. For example, a dirty direct can be promoted to a dirty + * flax (pelGenericInsert), or a dirty flax can be demoted to a dirty direct + * (pelRemove), and in both cases pelCacheFlush will later insert the new + * representation using the correct key length derived from cache->direct. */ typedef struct pelCache { unsigned char key[PEL_RAX_DIRECT_KEYLEN]; void *val; @@ -234,7 +242,9 @@ static int pelGenericInsert(rax *pel, streamID *id, void *data, uint64_t *count, return 0; } - /* Different fkey: promote to flax. */ + /* Different fkey: promote direct -> flax. + * When dirty, the old direct was never committed to rax, so we + * skip rax ops and just overwrite the cache (see pelCache invariant). */ flax *f = flaxNew(); flaxInsert(f, efkey, bucket, NULL); flaxInsert(f, fkey, data, NULL); @@ -407,7 +417,9 @@ void *pelRemove(rax *pel, streamID *id, uint64_t *count) { pelCacheInvalidate(pel); } } else if (f->numele == 1) { - /* Demote to direct entry with 16-byte key. */ + /* Demote flax -> direct entry with 16-byte key. + * When dirty, the flax was never committed to rax, so we just + * replace the cache contents (see pelCache invariant). 
*/ flaxIterator fi; flaxStart(&fi, f); flaxSeek(&fi, "^", 0); From 3fe53d80c9e181e0f149466e2bd9abf6d67db4e2 Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Thu, 16 Apr 2026 13:53:09 +0300 Subject: [PATCH 47/48] fixed: issue from review --- src/t_stream.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/src/t_stream.c b/src/t_stream.c index 4ea13fed2..87c1a2fa2 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -559,6 +559,9 @@ int pelIterSeek(pelIterator *pi, const char *op, streamID *id) { int prefix_cmp = memcmp(pi->ri.key, fullkey, PEL_RAX_FLAX_KEYLEN); + /* raxSeek("<=", fullkey, 16) guarantees the returned key is <= fullkey, + * so its first 15 bytes cannot exceed fullkey's prefix. */ + serverAssert(prefix_cmp <= 0); if (prefix_cmp == 0) { if (pi->ri.key_len == PEL_RAX_DIRECT_KEYLEN) { if (pi->ri.key[PEL_RAX_FLAX_KEYLEN] >= fkey) { @@ -574,12 +577,8 @@ int pelIterSeek(pelIterator *pi, const char *op, streamID *id) { pelIterRefresh(pi); } } - } else if (prefix_cmp < 0) { - if (!pelIterAdvanceRax(pi)) return 0; } else { - if (!pelIterEnterBucketHead(pi)) { - if (!pelIterAdvanceRax(pi)) return 0; - } + if (!pelIterAdvanceRax(pi)) return 0; } pi->just_seeked = 1; return 1; From 66b0120ab5b9d99e0ecfa1f343ec95cb5b45195f Mon Sep 17 00:00:00 2001 From: Sergey Georgiev Date: Thu, 16 Apr 2026 16:07:52 +0300 Subject: [PATCH 48/48] fixed: issue from review --- src/flax.c | 6 +++++- src/t_stream.c | 11 ++++++++++- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/src/flax.c b/src/flax.c index dfab73b86..2aa9bffe9 100644 --- a/src/flax.c +++ b/src/flax.c @@ -125,7 +125,11 @@ static int flax_search(const uint8_t *keys, uint16_t numele, uint8_t key, int16_ * that depend on the capacity (the values offset is re-aligned for the new * capacity), we must perform two independent memcpy operations -- one for * the keys at the start of the block and one for the values at the new - * aligned offset. The old data block is freed afterwards. 
*/ + * aligned offset. The old data block is freed afterwards. + * + * IMPORTANT: this function replaces f->data but never moves the flax struct + * itself. External code (e.g. the PEL cache in t_stream.c) relies on the + * struct pointer remaining stable across resize operations. */ static void flax_resize(flax *f, uint16_t new_capacity) { if (new_capacity > UINT8_MAX + 1) new_capacity = UINT8_MAX + 1; size_t new_voff = flax_values_offset(new_capacity); diff --git a/src/t_stream.c b/src/t_stream.c index 87c1a2fa2..bf4422a7c 100644 --- a/src/t_stream.c +++ b/src/t_stream.c @@ -244,7 +244,16 @@ static int pelGenericInsert(rax *pel, streamID *id, void *data, uint64_t *count, /* Different fkey: promote direct -> flax. * When dirty, the old direct was never committed to rax, so we - * skip rax ops and just overwrite the cache (see pelCache invariant). */ + * skip rax ops and just overwrite the cache (see pelCache invariant). + * + * When non-dirty, we commit the new flax entry to rax immediately + * and leave dirty==0. Subsequent inserts into the same flax bucket + * modify the flax object in-place (including possible flax_resize of + * its internal data block), but the flax *struct pointer* stored in + * the rax never changes -- flax_resize only replaces f->data, not f + * itself. So the rax entry remains valid without needing a dirty + * flush. This invariant depends on flax using a separate heap + * allocation for the struct vs. its data block. */ flax *f = flaxNew(); flaxInsert(f, efkey, bucket, NULL); flaxInsert(f, fkey, data, NULL);