diff --git a/modules/vector-sets/hnsw.c b/modules/vector-sets/hnsw.c index 8420ff314..1af44d289 100644 --- a/modules/vector-sets/hnsw.c +++ b/modules/vector-sets/hnsw.c @@ -46,6 +46,8 @@ #include #include "hnsw.h" #include "mixer.h" +#include +#include /* Check if we can compile SIMD code with function attributes. * This defines HAVE_AVX2, HAVE_AVX512, and HAVE_POPCNT when the compiler @@ -129,6 +131,60 @@ static void (*hfree)(void *p) = free; static void *(*hmalloc)(size_t s) = malloc; static void *(*hrealloc)(void *old, size_t s) = realloc; +/* --- Thread-Local PRNG (xoshiro128**) Implementation --- */ + +// Per-thread generator state, plus a flag recording whether this thread has seeded it +static __thread uint32_t thread_rng_state[4]; +static __thread int thread_rng_initialized = 0; + +// Rotate x left by k bits; k must be in 1..31 (k == 0 would shift right by 32, which is undefined) +static inline uint32_t rotl(const uint32_t x, int k) { + return (x << k) | (x >> (32 - k)); +} + +/** + * thread_local_rand: + * Returns the next 32-bit pseudo-random number, seeding the calling thread's state on its first call. + * Each thread maintains its own state to avoid lock contention. 
+ */ +static uint32_t thread_local_rand(void) { + if (!thread_rng_initialized) { + // Seed from time(NULL) XOR the address of a thread-local object (distinct per live thread); pthread_t is opaque, so it is not cast to an integer + uint32_t s = (uint32_t)time(NULL) ^ (uint32_t)(uintptr_t)&thread_rng_initialized; + + // Splitmix32-style initialization to fill the 4 state slots + for (int i = 0; i < 4; i++) { + s += 0x9e3779b9; + uint32_t z = s; + z = (z ^ (z >> 16)) * 0x85ebca6b; + z = (z ^ (z >> 13)) * 0xc2b2ae35; + thread_rng_state[i] = z ^ (z >> 16); + } + + // An all-zero state would stay zero forever under the xor/shift/rotate update below, so force one non-zero word + if (thread_rng_state[0] == 0 && thread_rng_state[1] == 0 && + thread_rng_state[2] == 0 && thread_rng_state[3] == 0) { + thread_rng_state[0] = 1; + } + + thread_rng_initialized = 1; + } + + // xoshiro128** step: derive the output from state word 1, then advance the four state words + const uint32_t result = rotl(thread_rng_state[1] * 5, 7) * 9; + const uint32_t t = thread_rng_state[1] << 9; + + thread_rng_state[2] ^= thread_rng_state[0]; + thread_rng_state[3] ^= thread_rng_state[1]; + thread_rng_state[1] ^= thread_rng_state[2]; + thread_rng_state[0] ^= thread_rng_state[3]; + + thread_rng_state[2] ^= t; + thread_rng_state[3] = rotl(thread_rng_state[3], 11); + + return result; +} + void hnsw_set_allocator(void (*free_ptr)(void*), void *(*malloc_ptr)(size_t), void *(*realloc_ptr)(void*, size_t)) { @@ -789,13 +845,18 @@ void hnsw_normalize_vector(float *x, float *l2ptr, uint32_t dim) { for (i = 0; i < dim; i++) x[i] /= l2; } -/* Helper function to generate random level. */ +/* Helper function to generate random level using thread-local PRNG. */ uint32_t random_level(void) { - static const int threshold = HNSW_P * RAND_MAX; + // Scale the probability HNSW_P to the 32-bit output range of thread_local_rand(). + // A draw below this threshold promotes the node by one more level in the loop below. + static const uint32_t threshold = (uint32_t)(HNSW_P * UINT32_MAX); uint32_t level = 0; - while (rand() < threshold && level < HNSW_MAX_LEVEL) + // Use the thread-local PRNG to avoid global lock contention. 
+ while (thread_local_rand() < threshold && level < HNSW_MAX_LEVEL) { level += 1; + } + return level; } @@ -2374,8 +2435,9 @@ hnswNode *hnsw_random_node(HNSW *index, int slot) { if (current->level < level || current->layers[level].num_links == 0) continue; - /* Choose random neighbor at this level. */ - uint32_t rand_neighbor = rand() % current->layers[level].num_links; + /* Choose a random neighbor at this level using the thread-local PRNG. + * This avoids the global lock contention associated with rand(). */ + uint32_t rand_neighbor = thread_local_rand() % current->layers[level].num_links; current = current->layers[level].links[rand_neighbor]; } @@ -2385,18 +2447,19 @@ hnswNode *hnsw_random_node(HNSW *index, int slot) { uint32_t num_walks = (uint32_t)(logN * c); /* Avoid the ping-pong effect: imagine there are just two nodes and - * the number of walks selected is even. We will select always the + * the number of walks selected is even. We will always select the * first element of the graph; conversely, if it is odd, we will always - * select the other element. One way to add more selection randomness is - * to randomly add '1' or '0' to the number of walks to perform. */ - num_walks += rand() & 1; + * select the other element. To improve selection randomness, we + * add '1' or '0' to the number of walks using our thread-local PRNG. + * This avoids the global lock contention found in standard rand(). */ + num_walks += thread_local_rand() & 1; // Perform random walk at level 0. for (uint32_t i = 0; i < num_walks; i++) { if (current->layers[0].num_links == 0) return current; - // Choose random neighbor. - uint32_t rand_neighbor = rand() % current->layers[0].num_links; + // Choose random neighbor using thread-local PRNG + uint32_t rand_neighbor = thread_local_rand() % current->layers[0].num_links; current = current->layers[0].links[rand_neighbor]; } return current;