From 861d8b9b76c5a2445b52f4089745f336d43ae3e6 Mon Sep 17 00:00:00 2001 From: Martin Dimitrov Date: Thu, 16 Apr 2026 13:26:03 -0700 Subject: [PATCH 1/5] replace bit shitf loop with a single __builtin_clzl --- src/bitops.c | 26 +++++++------------------- 1 file changed, 7 insertions(+), 19 deletions(-) diff --git a/src/bitops.c b/src/bitops.c index 26a08acfa..fbf03eed5 100644 --- a/src/bitops.c +++ b/src/bitops.c @@ -392,7 +392,7 @@ static inline long long redisPopcountAuto(const unsigned char *p, long count) { long long redisBitpos(void *s, unsigned long count, int bit) { unsigned long *l; unsigned char *c; - unsigned long skipval, word = 0, one; + unsigned long skipval, word = 0; long long pos = 0; /* Position of bit, to return to the caller. */ unsigned long j; int found; @@ -456,24 +456,12 @@ long long redisBitpos(void *s, unsigned long count, int bit) { * that the right of the string is zero padded. */ if (bit == 1 && word == 0) return -1; - /* Last word left, scan bit by bit. The first thing we need is to - * have a single "1" set in the most significant position in an - * unsigned long. We don't know the size of the long so we use a - * simple trick. */ - one = ULONG_MAX; /* All bits set to 1.*/ - one >>= 1; /* All bits set to 1 but the MSB. */ - one = ~one; /* All bits set to 0 but the MSB. */ - - while(one) { - if (((one & word) != 0) == bit) return pos; - pos++; - one >>= 1; - } - - /* If we reached this point, there is a bug in the algorithm, since - * the case of no match is handled as a special case before. */ - serverPanic("End of redisBitpos() reached."); - return 0; /* Just to avoid warnings. */ + /* Last word left, find the position of the first matching bit. + * __builtin_clzl gives the count of leading zeros in an unsigned long, + * which is exactly the bit offset from MSB to the first set bit. + * For bit=0 we invert the word first to find the first zero bit. */ + pos += bit ? __builtin_clzl(word) : __builtin_clzl(~word); + return pos; } /* The following set.*Bitfield and get.*Bitfield functions implement setting From 570e1adb15789c3bcdaff654ae079565e39721c1 Mon Sep 17 00:00:00 2001 From: Martin Dimitrov Date: Fri, 15 May 2026 09:54:12 -0700 Subject: [PATCH 2/5] bitops: vectorize redisBitpos() scan loop with AVX2/AVX512 The word-at-a-time scan in redisBitpos() checked 8 bytes per iteration. Add SIMD-accelerated scan helpers that process 32 bytes (AVX2) or 64 bytes (AVX512) per iteration, falling through to the scalar loop for the tail. AVX512 path: _mm512_cmpeq_epi64_mask compares 8 qwords at once (uses only AVX512F, no AVX512BW dependency). On mismatch, __builtin_ctz on the inverted mask locates the first non-matching qword. AVX2 path: _mm256_cmpeq_epi8 + _mm256_movemask_epi8 compares 32 bytes at once. On mismatch, __builtin_ctz locates the byte and rounds down to the containing 8-byte word boundary. Both helpers are separate functions with ATTRIBUTE_TARGET_{AVX512,AVX2} so they compile with the right ISA extensions while redisBitpos() itself remains ISA-neutral. Runtime dispatch via __builtin_cpu_supports guards (same pattern as bitopCommandAVX/bitopCommandAVX512). All 25 BITPOS unit tests pass. --- src/bitops.c | 75 +++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 74 insertions(+), 1 deletion(-) diff --git a/src/bitops.c b/src/bitops.c index fbf03eed5..b50bc00aa 100644 --- a/src/bitops.c +++ b/src/bitops.c @@ -382,6 +382,52 @@ static inline long long redisPopcountAuto(const unsigned char *p, long count) { #endif } +/* --------------------------------------------------------------------------- + * SIMD helpers for redisBitpos() — scan for the first word that does not + * match 'skipval' (0 when looking for bit=1, ULONG_MAX for bit=0). + * Each function returns the number of bytes scanned that all matched skipval. + * The caller advances past that many bytes and falls into the scalar tail. + * ----------------------------------------------------------------------- */ +#ifdef HAVE_AVX512 +ATTRIBUTE_TARGET_AVX512 +static unsigned long redisBitposScanAVX512(unsigned long *l, + unsigned long count, int bit) { + unsigned long scanned = 0; + __m512i skip = bit ? _mm512_setzero_si512() + : _mm512_set1_epi64(-1LL); + + while (count >= 64) { + __m512i data = _mm512_loadu_si512(l); + __mmask8 eq = _mm512_cmpeq_epi64_mask(data, skip); + if (eq != 0xFF) break; + l = (unsigned long *)((unsigned char *)l + 64); + count -= 64; + scanned += 64; + } + return scanned; +} +#endif + +#ifdef HAVE_AVX2 +ATTRIBUTE_TARGET_AVX2 +static unsigned long redisBitposScanAVX2(unsigned long *l, + unsigned long count, int bit) { + unsigned long scanned = 0; + __m256i skip = bit ? _mm256_setzero_si256() + : _mm256_set1_epi8((char)0xFF); + + while (count >= 32) { + __m256i data = _mm256_loadu_si256((__m256i *)l); + int eq = _mm256_movemask_epi8(_mm256_cmpeq_epi8(data, skip)); + if (eq != -1) break; + l = (unsigned long *)((unsigned char *)l + 32); + count -= 32; + scanned += 32; + } + return scanned; +} +#endif + /* Return the position of the first bit set to one (if 'bit' is 1) or * zero (if 'bit' is 0) in the bitmap starting at 's' and long 'count' bytes. * @@ -420,10 +466,37 @@ long long redisBitpos(void *s, unsigned long count, int bit) { pos += 8; } - /* Skip bits with full word step. */ + /* Skip bits with full word step. Use SIMD when available for the + * bulk of the scan, then fall through to scalar for the tail. */ l = (unsigned long*) c; if (!found) { skipval = bit ? 0 : ULONG_MAX; + +#if defined(HAVE_AVX512) || defined(HAVE_AVX2) + int useAVX = 0; +#endif + +#if defined(HAVE_AVX512) + if (BITOP_USE_AVX512 && count >= 64) { + unsigned long advanced = redisBitposScanAVX512(l, count, bit); + l = (unsigned long*)((unsigned char*)l + advanced); + count -= advanced; + pos += advanced * 8; + useAVX = 1; + } +#endif + +#if defined(HAVE_AVX2) + if (!useAVX && BITOP_USE_AVX2 && count >= 32) { + unsigned long advanced = redisBitposScanAVX2(l, count, bit); + l = (unsigned long*)((unsigned char*)l + advanced); + count -= advanced; + pos += advanced * 8; + } +#endif + + /* Scalar word-at-a-time scan handles the tail after SIMD and + * serves as the sole scan path when SIMD is unavailable. */ while (count >= sizeof(*l)) { if (*l != skipval) break; l++; From 7303abecb76de06b0546b51d18fe9258b0a308a6 Mon Sep 17 00:00:00 2001 From: Martin Dimitrov Date: Tue, 19 May 2026 20:27:57 -0700 Subject: [PATCH 3/5] bitops: document __builtin_clzl safety invariant in redisBitpos Explain why the argument to __builtin_clzl (which is undefined for zero) can never be zero: - bit==1: guarded by the explicit (word == 0) check above. - bit==0: the skip-word loop consumes all ULONG_MAX words, so the loaded word != ULONG_MAX and ~word != 0. --- src/bitops.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/bitops.c b/src/bitops.c index b50bc00aa..83a90e935 100644 --- a/src/bitops.c +++ b/src/bitops.c @@ -532,7 +532,14 @@ long long redisBitpos(void *s, unsigned long count, int bit) { /* Last word left, find the position of the first matching bit. * __builtin_clzl gives the count of leading zeros in an unsigned long, * which is exactly the bit offset from MSB to the first set bit. - * For bit=0 we invert the word first to find the first zero bit. */ + * For bit=0 we invert the word first to find the first zero bit. + * + * Safety: __builtin_clzl is undefined for a zero argument, but that + * cannot happen here: + * - bit==1: the 'if (bit == 1 && word == 0) return -1' above guards it. + * - bit==0: the skip-word loop consumes all words equal to ULONG_MAX + * (skipval), so the word loaded here satisfies word != ULONG_MAX, + * meaning ~word != 0. */ pos += bit ? __builtin_clzl(word) : __builtin_clzl(~word); return pos; } From afb8a2bbc4ade1ccd45f84f15fcce0fcff487345 Mon Sep 17 00:00:00 2001 From: Martin Dimitrov Date: Tue, 19 May 2026 20:36:48 -0700 Subject: [PATCH 4/5] bitops: use unsigned char pointer for SIMD scan arithmetic Refactor redisBitposScanAVX512/AVX2 to take unsigned char *p instead of unsigned long *l, replacing the repeated (unsigned long*)((unsigned char*)l + N) casts with simple p += N pointer arithmetic. At the call sites, work with the existing 'c' char pointer throughout the SIMD phase and cast to unsigned long * only once before the scalar tail loop. --- src/bitops.c | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/src/bitops.c b/src/bitops.c index 83a90e935..816af5c80 100644 --- a/src/bitops.c +++ b/src/bitops.c @@ -390,17 +390,17 @@ static inline long long redisPopcountAuto(const unsigned char *p, long count) { * ----------------------------------------------------------------------- */ #ifdef HAVE_AVX512 ATTRIBUTE_TARGET_AVX512 -static unsigned long redisBitposScanAVX512(unsigned long *l, +static unsigned long redisBitposScanAVX512(unsigned char *p, unsigned long count, int bit) { unsigned long scanned = 0; __m512i skip = bit ? _mm512_setzero_si512() : _mm512_set1_epi64(-1LL); while (count >= 64) { - __m512i data = _mm512_loadu_si512(l); + __m512i data = _mm512_loadu_si512(p); __mmask8 eq = _mm512_cmpeq_epi64_mask(data, skip); if (eq != 0xFF) break; - l = (unsigned long *)((unsigned char *)l + 64); + p += 64; count -= 64; scanned += 64; } @@ -410,17 +410,17 @@ static unsigned long redisBitposScanAVX512(unsigned long *l, #ifdef HAVE_AVX2 ATTRIBUTE_TARGET_AVX2 -static unsigned long redisBitposScanAVX2(unsigned long *l, +static unsigned long redisBitposScanAVX2(unsigned char *p, unsigned long count, int bit) { unsigned long scanned = 0; __m256i skip = bit ? _mm256_setzero_si256() : _mm256_set1_epi8((char)0xFF); while (count >= 32) { - __m256i data = _mm256_loadu_si256((__m256i *)l); + __m256i data = _mm256_loadu_si256((__m256i *)p); int eq = _mm256_movemask_epi8(_mm256_cmpeq_epi8(data, skip)); if (eq != -1) break; - l = (unsigned long *)((unsigned char *)l + 32); + p += 32; count -= 32; scanned += 32; } @@ -468,7 +468,6 @@ long long redisBitpos(void *s, unsigned long count, int bit) { /* Skip bits with full word step. Use SIMD when available for the * bulk of the scan, then fall through to scalar for the tail. */ - l = (unsigned long*) c; if (!found) { skipval = bit ? 0 : ULONG_MAX; @@ -478,8 +477,8 @@ long long redisBitpos(void *s, unsigned long count, int bit) { #if defined(HAVE_AVX512) if (BITOP_USE_AVX512 && count >= 64) { - unsigned long advanced = redisBitposScanAVX512(l, count, bit); - l = (unsigned long*)((unsigned char*)l + advanced); + unsigned long advanced = redisBitposScanAVX512(c, count, bit); + c += advanced; count -= advanced; pos += advanced * 8; useAVX = 1; @@ -488,8 +487,8 @@ long long redisBitpos(void *s, unsigned long count, int bit) { #if defined(HAVE_AVX2) if (!useAVX && BITOP_USE_AVX2 && count >= 32) { - unsigned long advanced = redisBitposScanAVX2(l, count, bit); - l = (unsigned long*)((unsigned char*)l + advanced); + unsigned long advanced = redisBitposScanAVX2(c, count, bit); + c += advanced; count -= advanced; pos += advanced * 8; } @@ -497,12 +496,14 @@ long long redisBitpos(void *s, unsigned long count, int bit) { /* Scalar word-at-a-time scan handles the tail after SIMD and * serves as the sole scan path when SIMD is unavailable. */ + l = (unsigned long *)c; while (count >= sizeof(*l)) { if (*l != skipval) break; l++; count -= sizeof(*l); pos += sizeof(*l)*8; } + c = (unsigned char *)l; } /* Load bytes into "word" considering the first byte as the most significant @@ -512,7 +513,6 @@ long long redisBitpos(void *s, unsigned long count, int bit) { * * Note that the loading is designed to work even when the bytes left * (count) are less than a full word. We pad it with zero on the right. */ - c = (unsigned char*)l; for (j = 0; j < sizeof(*l); j++) { word <<= 8; if (count) { From a4ad686ef3efdbdee988a278fff33e47cb1a857a Mon Sep 17 00:00:00 2001 From: Martin Dimitrov Date: Tue, 19 May 2026 20:42:37 -0700 Subject: [PATCH 5/5] bitops: switch AVX2 scan to epi64 comparison for consistency Use _mm256_cmpeq_epi64 + _mm256_movemask_pd in redisBitposScanAVX2, matching the qword-granularity comparison style of the AVX512 path which uses _mm512_cmpeq_epi64_mask. --- src/bitops.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/bitops.c b/src/bitops.c index 816af5c80..8fb201168 100644 --- a/src/bitops.c +++ b/src/bitops.c @@ -414,12 +414,13 @@ static unsigned long redisBitposScanAVX2(unsigned char *p, unsigned long count, int bit) { unsigned long scanned = 0; __m256i skip = bit ? _mm256_setzero_si256() - : _mm256_set1_epi8((char)0xFF); + : _mm256_set1_epi64x(-1LL); while (count >= 32) { __m256i data = _mm256_loadu_si256((__m256i *)p); - int eq = _mm256_movemask_epi8(_mm256_cmpeq_epi8(data, skip)); - if (eq != -1) break; + int eq = _mm256_movemask_pd(_mm256_castsi256_pd( + _mm256_cmpeq_epi64(data, skip))); + if (eq != 0xF) break; p += 32; count -= 32; scanned += 32;