MEDIUM: tools: switch the main PRNG to a thread-local xoshiro256**

The current PRNG is xoroshiro128**, it was introduced in 2.2 with
commit 52bf83939 ("BUG/MEDIUM: random: implement a thread-safe and
process-safe PRNG").  It features a 2^128 sequence and can perform
2^64 or 2^96 jumps, though only the 2^96 jump is implemented. It
was initially designed to support both processes and threads, and
implements a shared state between threads instead of allocating
distinct sequences based on PID and thread numbers.

Since then, the PRNG's usage grew and processes have disappeared,
but the lock or the DWCAS are still there due to its shared nature.

Also, UUID and QUIC retry tokens now consume 128 bits from the PRNG
in two 64-bit calls, and used to weaken the PRNG by rapidly disclosing
its internal state on reasonably idle systems. This indicates that
most of the time we now need 128 bits.

This patch modernizes the internal generator by switching to xoshiro256**,
which has comparable properties (it's even faster), and features even
longer 2^256 periods, still returning 64 bits per call. It can be
initialized with 2^128 and 2^192 jumps. More details here:

   https://prng.di.unimi.it/
   https://prng.di.unimi.it/xoshiro256starstar.c

Here we implement a thread-local state instead of the old shared one,
so there is no more need for synchronization. The state is seeded at
boot, and each thread then performs (TID + 1) jumps of 2^192 so that
threads use distinct sequences. The master process performs a 2^128 jump
where it used to perform a 2^96 jump so that it doesn't overlap with any
worker thread.
However a cleaner approach could be to perform a 2^128 jump for each
fork() (here the worker) and 2^192 for each thread. This might be for
a future improvement.

ha_random64_internal() is now the new PRNG, so that everything else
remains totally transparent. _ha_random64_pair_hashed() continues to
hash the first 128 bits of the state.

A simple config generating 100 UUID on 20 threads jumps from 135k to
1.25M req/s, which translates to a bump from 13.5M to 125M UUID/s,
or 9 times faster. And the DWCAS is no longer visible
in perf top:

Before:
Overhead  Shared Object            Symbol
  99.04%  haproxy       [.] ha_random64_internal
   0.66%  haproxy       [.] _ha_random64_pair_hashed
   0.03%  libc-2.42.so  [.] __printf_buffer
   0.02%  [kernel]      [k] _raw_spin_lock
   0.01%  libc-2.42.so  [.] __strchrnul_avx2
   0.01%  [kernel]      [k] ktime_get
   0.01%  [kernel]      [k] lapic_next_deadline
   0.01%  haproxy       [.] sample_process
   0.01%  haproxy       [.] chunk_printf
   0.01%  libc-2.42.so  [.] __printf_buffer_write
   0.01%  [kernel]      [k] hrtimer_active
   0.01%  libc-2.42.so  [.] __memmove_avx_unaligned_erms
   0.01%  libc-2.42.so  [.] _itoa_word

After:
  18.84%  libc-2.42.so      [.] __printf_buffer
   9.84%  haproxy           [.] sample_process
   8.33%  libc-2.42.so      [.] __strchrnul_avx2
   6.61%  libc-2.42.so      [.] __memmove_avx_unaligned_erms
   6.06%  libc-2.42.so      [.] __printf_buffer_write
   4.43%  haproxy           [.] strlcpy2
   4.09%  libc-2.42.so      [.] _itoa_word
   2.62%  haproxy           [.] sess_build_logline_orig
   2.12%  haproxy           [.] _ha_random64_pair_hashed
   1.28%  haproxy           [.] pool_put_to_cache
   1.06%  haproxy           [.] __pool_alloc
   1.00%  haproxy           [.] smp_fetch_uuid
   0.93%  haproxy           [.] lf_text_len
   0.82%  haproxy           [.] ha_generate_uuid_v4
This commit is contained in:
Willy Tarreau 2026-05-25 15:38:49 +02:00
parent 3f7862a0a9
commit 8da18bfa29
4 changed files with 126 additions and 97 deletions

View file

@ -1293,7 +1293,9 @@ struct uint64_pair _ha_random64_pair_hashed(void);
void ha_generate_uuid_v4(struct buffer *output);
void ha_generate_uuid_v7(struct buffer *output);
void ha_random_seed(const unsigned char *seed, size_t len);
void ha_random_jump96(uint32_t dist);
void ha_random_seed_thread(void);
void ha_random_jump128(uint32_t dist);
void ha_random_jump192(uint32_t dist);
uint64_t ha_random64(void);
uint64_t ha_random64_internal(void);

View file

@ -3098,6 +3098,7 @@ void *run_thread_poll_loop(void *data)
ha_set_thread(data);
set_thread_cpu_affinity();
clock_set_local_source();
ha_random_seed_thread();
#ifdef USE_THREAD
ha_thread_info[tid].pth_id = ha_get_pthread_id(tid);

View file

@ -1298,7 +1298,7 @@ void mworker_apply_master_worker_mode(void)
/* This one must not be exported, it's internal! */
unsetenv("HAPROXY_MWORKER_REEXEC");
ha_random_jump96(1);
ha_random_jump128(1);
list_for_each_entry(child, &proc_list, list) {
if ((child->options & PROC_O_TYPE_WORKER) && (child->options & PROC_O_INIT)) {

View file

@ -6238,80 +6238,103 @@ int varint_bytes(uint64_t v)
/* secret used for XXH hash involved in PRNG */
static char ha_random_xxh_secret[XXH3_SECRET_DEFAULT_SIZE] ALIGNED(64);
/* Random number generator state, see below */
static uint64_t ha_random_state[2] ALIGNED(2*sizeof(uint64_t));
/* This is a thread-safe implementation of xoroshiro128** described below:
* http://prng.di.unimi.it/
* It features a 2^128 long sequence, returns 64 high-quality bits on each call,
* supports fast jumps and passes all common quality tests. It is thread-safe,
* uses a double-cas on 64-bit architectures supporting it, and falls back to a
* local lock on other ones.
* It may only be used for internal random generation, because exposing its
* output will quickly reveal the internal state.
/* 2^256 sequence thread-local PRNG state known as "XOSHIRO256**".
* See details here:
* https://prng.di.unimi.it/
* https://prng.di.unimi.it/xoshiro256starstar.c
* It features a 2^256 long sequence, returns 64 high-quality bits on each call,
* supports fast jumps and passes all common quality tests. Supporting 128-bit
* jumps, it allows to run thread-local with non-overlapping sequences. It must
* be seeded otherwise the ratio of zeroes is a bit high initially.
*/
uint64_t ha_random64_internal()
static THREAD_LOCAL uint64_t ha_random_state[4];
/* Returns the next 64-bit PRNG number from the thread-local 256-bit state and
* makes the internal state progress by one step. This is meant to be used by
* other local functions. Since it discloses the PRNG's internal state, it
* must not be called to produce externally visible randoms.
*/
static inline uint64_t _ha_random64_internal(void)
{
uint64_t old[2] ALIGNED(2*sizeof(uint64_t));
uint64_t new[2] ALIGNED(2*sizeof(uint64_t));
const uint64_t result = rotl64(ha_random_state[1] * 5, 7) * 9;
const uint64_t t = ha_random_state[1] << 17;
#if defined(USE_THREAD) && (!defined(HA_CAS_IS_8B) || !defined(HA_HAVE_CAS_DW))
static HA_SPINLOCK_T rand_lock;
HA_SPIN_LOCK(OTHER_LOCK, &rand_lock);
#endif
old[0] = ha_random_state[0];
old[1] = ha_random_state[1];
#if defined(USE_THREAD) && defined(HA_CAS_IS_8B) && defined(HA_HAVE_CAS_DW)
do {
#endif
new[1] = old[0] ^ old[1];
new[0] = rotl64(old[0], 24) ^ new[1] ^ (new[1] << 16); // a, b
new[1] = rotl64(new[1], 37); // c
#if defined(USE_THREAD) && defined(HA_CAS_IS_8B) && defined(HA_HAVE_CAS_DW)
} while (unlikely(!_HA_ATOMIC_DWCAS(ha_random_state, old, new)));
#else
ha_random_state[0] = new[0];
ha_random_state[1] = new[1];
#if defined(USE_THREAD)
HA_SPIN_UNLOCK(OTHER_LOCK, &rand_lock);
#endif
#endif
return rotl64(old[0] * 5, 7) * 9;
ha_random_state[2] ^= ha_random_state[0];
ha_random_state[3] ^= ha_random_state[1];
ha_random_state[1] ^= ha_random_state[2];
ha_random_state[0] ^= ha_random_state[3];
ha_random_state[2] ^= t;
ha_random_state[3] = rotl64(ha_random_state[3], 45);
return result;
}
/* Returns a uint64_t random hashed so as not to disclose the internal PRNG
* state. The function uses a local XXH secret that is created at boot, and
* now_ns as the seed to limit remote analysis.
/* Returns the next 64-bit PRNG number from the thread-local 256-bit state and
* makes the internal state progress by one step. Since it discloses the PRNG's
* internal state, it must not be called to produce externally visible randoms.
*/
uint64_t ha_random64(void)
uint64_t ha_random64_internal(void)
{
uint64_t ret;
ret = ha_random64_internal();
return XXH3_64bits_withSecretandSeed(&ret, sizeof(ret),
ha_random_xxh_secret, sizeof(ha_random_xxh_secret),
now_ns);
return _ha_random64_internal();
}
/* Returns a pair of uint64_t randoms hashed so as not to disclose the internal
* PRNG state. This function shouldn't be used directly, better use the public
* ha_random64_pair_hashed() which calls it. The function uses a local XXH
* secret that is created at boot, and now_ns as the seed to limit remote
* analysis.
/* This function uses a pre-calculated jump table of 4 uint64_t to perform a
* jump equivalent to multiple calls to ha_random64_internal(). It shouldn't be
* used directly but only from the next functions.
*/
struct uint64_pair _ha_random64_pair_hashed(void)
static void _ha_random_jump(const uint64_t *table)
{
XXH128_hash_t ret;
ret = XXH3_128bits_withSecretandSeed(ha_random_state, 2*sizeof(uint64_t),
ha_random_xxh_secret, sizeof(ha_random_xxh_secret),
now_ns);
/* update the internal state */
ha_random64_internal();
return (struct uint64_pair){ .l = ret.low64, .h = ret.high64 };
uint64_t s0, s1, s2, s3;
uint i, j;
s0 = s1 = s2 = s3 = 0;
for (i = 0; i < 4; i++) {
for (j = 0; j < 64; j++) {
if (table[i] & (1ULL << j)) {
s0 ^= ha_random_state[0];
s1 ^= ha_random_state[1];
s2 ^= ha_random_state[2];
s3 ^= ha_random_state[3];
}
ha_random64_internal();
}
}
ha_random_state[0] = s0;
ha_random_state[1] = s1;
ha_random_state[2] = s2;
ha_random_state[3] = s3;
}
/* Moves the calling thread's PRNG state forward by <dist> jumps, each jump
 * being equivalent to 2^128 calls to ha_random64_internal(). This can be used
 * to generate 2^128 non-overlapping sequences. One full jump is executed per
 * unit of <dist>, so the value should stay reasonably small when processing
 * time is a concern. Only the current thread's state is affected. <dist> may
 * not be zero, which is enforced with BUG_ON().
 */
void ha_random_jump128(uint32_t dist)
{
	/* jump polynomial for a 2^128 step distance */
	static const uint64_t jump_poly[4] = {
		0x180ec6d33cfd0abaULL, 0xd5a61266f0c9392cULL,
		0xa9582618e03fc9aaULL, 0x39abdc4529b1661cULL,
	};
	uint32_t left;

	BUG_ON(!dist);

	for (left = dist; left > 0; left--)
		_ha_random_jump(jump_poly);
}
/* Moves the calling thread's PRNG state forward by <dist> jumps, each jump
 * being equivalent to 2^192 calls to ha_random64_internal(). This can be used
 * to generate 2^64 non-overlapping sequences. One full jump is executed per
 * unit of <dist>, so the value should stay reasonably small when processing
 * time is a concern. Only the current thread's state is affected. <dist> may
 * not be zero, which is enforced with BUG_ON().
 */
void ha_random_jump192(uint32_t dist)
{
	/* jump polynomial for a 2^192 step distance */
	static const uint64_t long_jump_poly[4] = {
		0x76e15d3efefdcbbfULL, 0xc5004e441c522fb3ULL,
		0x77710069854ee241ULL, 0x39109bb02acbe635ULL,
	};
	uint32_t left;

	BUG_ON(!dist);

	for (left = dist; left > 0; left--)
		_ha_random_jump(long_jump_poly);
}
/* seeds the random state using up to <len> bytes from <seed>, starting with
@ -6346,41 +6369,44 @@ void ha_random_seed(const unsigned char *seed, size_t len)
XXH3_generateSecret(ha_random_xxh_secret, sizeof(ha_random_xxh_secret), seed, len);
}
/* This causes a jump to (dist * 2^96) places in the pseudo-random sequence,
* and is equivalent to calling ha_random64() as many times. It is used to
* provide non-overlapping sequences of 2^96 numbers (~7*10^28) to up to 2^32
* different generators (i.e. different processes after a fork). The <dist>
* argument is the distance to jump to and is used in a loop so it rather not
* be too large if the processing time is a concern.
*
* BEWARE: this function is NOT thread-safe and must not be called during
* concurrent accesses to ha_random64().
*/
void ha_random_jump96(uint32_t dist)
/* Seed the PRNG for the current thread */
void ha_random_seed_thread(void)
{
while (dist--) {
uint64_t s0 = 0;
uint64_t s1 = 0;
int b;
/* seed already done for first thread, but jump still necessary */
if (tid > 0)
ha_random_seed(boot_seed, sizeof(boot_seed));
ha_random_jump192(tid + 1);
}
for (b = 0; b < 64; b++) {
if ((0xd2a98b26625eee7bULL >> b) & 1) {
s0 ^= ha_random_state[0];
s1 ^= ha_random_state[1];
}
ha_random64();
}
/* Returns a uint64_t random hashed so as not to disclose the internal PRNG
* state. The function uses a local XXH secret that is created at boot, and
* now_ns as the seed to limit remote analysis.
*/
uint64_t ha_random64(void)
{
uint64_t ret;
for (b = 0; b < 64; b++) {
if ((0xdddf9b1090aa7ac1ULL >> b) & 1) {
s0 ^= ha_random_state[0];
s1 ^= ha_random_state[1];
}
ha_random64();
}
ha_random_state[0] = s0;
ha_random_state[1] = s1;
}
ret = _ha_random64_internal();
return XXH3_64bits_withSecretandSeed(&ret, sizeof(ret),
ha_random_xxh_secret, sizeof(ha_random_xxh_secret),
now_ns);
}
/* Returns a pair of uint64_t randoms hashed so as not to disclose the internal
* PRNG state. This function shouldn't be used directly, better use the public
* ha_random64_pair_hashed() which calls it. The function uses a local XXH
* secret that is created at boot, and now_ns as the seed to limit remote
* analysis.
*/
struct uint64_pair _ha_random64_pair_hashed(void)
{
XXH128_hash_t ret;
ret = XXH3_128bits_withSecretandSeed(ha_random_state, 2*sizeof(uint64_t),
ha_random_xxh_secret, sizeof(ha_random_xxh_secret),
now_ns);
/* update the internal state */
_ha_random64_internal();
return (struct uint64_pair){ .l = ret.low64, .h = ret.high64 };
}
/* Generates an RFC 9562 version 4 UUID into chunk