From 63d841c3ae8760668a33c41acd2255f61669fe85 Mon Sep 17 00:00:00 2001
From: Sergei Georgiev <s_ggeorgiev@yahoo.com>
Date: Mon, 23 Mar 2026 16:09:49 +0200
Subject: [PATCH] Optimize rax insert and lookup for sequential key patterns
 (#14885)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

**Summary**

Optimizes rax tree insert and lookup performance by skipping unnecessary
reallocs and short-circuiting sorted-array scans in `raxAddChild` and
`raxLowWalk`, targeting the common case of sequential or append-heavy key
insertion (e.g., stream IDs).

**Changes**

- **`raxAddChild` — skip realloc when allocation already fits**: Before
reallocating the node, checks `rax_malloc_usable_size(n) < newlen`.
Jemalloc size-class rounding often leaves enough usable bytes in the
existing allocation to accommodate the extra child byte and pointer,
making the realloc a no-op that can be avoided entirely.
- **`raxAddChild` — fast-path insertion position for append case**: When
the new child character `c` is greater than the last existing child
(`n->data[n->size - 1]`), sets `pos = n->size` directly instead of
scanning the full sorted edge array. This is the hot path for sequential
stream ID insertion where new keys are always lexicographically largest.
- **`raxLowWalk` — last-child-first lookup for non-compressed nodes**:
Checks the last child in the sorted edge array before falling back to
the linear scan. For sequential inserts the match is almost always at
the tail. If the search byte is greater than the last child, breaks
immediately (early miss), avoiding the full O(k) scan over the node's
k children.

**Benefits**

- **Fewer reallocs**: Avoids `raxNodeRealloc` calls when the allocator
already provided enough space, reducing allocator pressure and
pointer-update overhead.
- **O(1) insert position for sequential keys**: The append fast-path in
`raxAddChild` turns the child-position search from O(k) to O(1), where k
is the number of children of the node, for the dominant stream-ID
pattern.
- **Faster tree walks**: The last-child-first check in `raxLowWalk` cuts
the per-node scan cost for sequential workloads from O(k) to O(1), and
adds at most two extra byte comparisons per node (the equality test and
the greater-than test) for random workloads.
- **No memory overhead**: All optimizations operate on the existing node
layout and sorted-children invariant. No additional fields, flags, or
auxiliary data structures are introduced — memory usage is identical to
before.
- **Zero behavioral change**: All three optimizations are purely
performance shortcuts; no data structure or API semantics are altered.
---
 src/rax.c | 47 +++++++++++++++++++++++++++++++++--------------
 1 file changed, 33 insertions(+), 14 deletions(-)

diff --git a/src/rax.c b/src/rax.c
index 0b134d7af3..e34e7e2f13 100644
--- a/src/rax.c
+++ b/src/rax.c
@@ -272,13 +272,17 @@ raxNode *raxAddChild(rax *rax, raxNode *n, unsigned char c, raxNode **childptr,
     raxNode *child = raxNewNode(rax,0,0);
     if (child == NULL) return NULL;
 
-    /* Make space in the original node. */
-    raxNode *newn = raxNodeRealloc(rax,n,newlen);
-    if (newn == NULL) {
-        raxFreeNode(rax,child);
-        return NULL;
+    /* Make space in the original node. If the current allocation already
+     * has enough usable bytes (common with jemalloc size-class rounding),
+     * skip the realloc entirely. */
+    if (rax_malloc_usable_size(n) < newlen) {
+        raxNode *newn = raxNodeRealloc(rax,n,newlen);
+        if (newn == NULL) {
+            raxFreeNode(rax,child);
+            return NULL;
+        }
+        n = newn;
     }
-    n = newn;
 
     /* After the reallocation, we have up to 8/16 (depending on the system
      * pointer size, and the required node padding) bytes at the end, that is,
@@ -309,8 +313,12 @@ raxNode *raxAddChild(rax *rax, raxNode *n, unsigned char c, raxNode **childptr,
      * a child "c" in our case pos will be = 2 after the end of the following
      * loop. */
     int pos;
-    for (pos = 0; pos < n->size; pos++) {
-        if (n->data[pos] > c) break;
+    if (n->size > 0 && c > n->data[n->size - 1]) {
+        pos = n->size;
+    } else {
+        for (pos = 0; pos < n->size; pos++) {
+            if (n->data[pos] > c) break;
+        }
     }
 
     /* Now, if present, move auxiliary data pointer at the end
@@ -478,13 +486,24 @@ static inline size_t raxLowWalk(rax *rax, unsigned char *s, size_t len, raxNode
             }
             if (j != h->size) break;
         } else {
-            /* Even when h->size is large, linear scan provides good
-             * performances compared to other approaches that are in theory
-             * more sounding, like performing a binary search. */
-            for (j = 0; j < h->size; j++) {
-                if (v[j] == s[i]) break;
+            /* Children are sorted. Check the last child first: for
+             * sequential inserts the match is almost always at the end,
+             * and for random keys the extra compare is negligible vs
+             * the O(n) scan that follows on miss. */
+            if (v[h->size - 1] == s[i]) {
+                j = h->size - 1;
+            } else if (s[i] > v[h->size - 1]) {
+                j = h->size;
+                break;
+            } else {
+                /* Even when h->size is large, linear scan provides good
+                 * performances compared to other approaches that are in theory
+                 * more sounding, like performing a binary search. */
+                for (j = 0; j < h->size; j++) {
+                    if (v[j] == s[i]) break;
+                }
+                if (j == h->size) break;
             }
-            if (j == h->size) break;
             i++;
         }