diff --git a/CHANGES b/CHANGES index d9e07276de..0e3cce15a7 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,7 @@ +5868. [cleanup] Use Daniel Lemire's "nearly divisionless" algorithm + for unbiased bounded random numbers, and move + re-seeding out of the hot path. [GL !6161] + 5867. [bug] Fix assertion failure triggered by attaching to dns_adb in dns_adb_createfind() that has been triggered to shut down in different thread between the check for shutting diff --git a/lib/isc/Makefile.am b/lib/isc/Makefile.am index c0bb69fa4a..94f0a74d3f 100644 --- a/lib/isc/Makefile.am +++ b/lib/isc/Makefile.am @@ -170,6 +170,7 @@ libisc_la_SOURCES = \ quota.c \ radix.c \ random.c \ + random_p.h \ ratelimiter.c \ regex.c \ region.c \ diff --git a/lib/isc/include/isc/random.h b/lib/isc/include/isc/random.h index 1e30d0c87d..79f1f9de2c 100644 --- a/lib/isc/include/isc/random.h +++ b/lib/isc/include/isc/random.h @@ -54,12 +54,18 @@ isc_random_buf(void *buf, size_t buflen); uint32_t isc_random_uniform(uint32_t upper_bound); /*!< - * \brief Will return a single 32-bit value, uniformly distributed but - * less than upper_bound. This is recommended over - * constructions like ``isc_random() % upper_bound'' as it - * avoids "modulo bias" when the upper bound is not a power of - * two. In the worst case, this function may require multiple - * iterations to ensure uniformity. + * \brief Returns a single 32-bit uniformly distributed random value + * less than upper_bound. + * + * This is better than ``isc_random() % upper_bound'' as it avoids + * "modulo bias" when the upper bound is not a power of two. This + * function is also faster, because it usually avoids doing any + * divisions (which are typically very slow). + * + * It uses rejection sampling to ensure uniformity, so it may require + * multiple iterations to get a result; the probability of needing to + * resample is very small when the upper_bound is small, rising to 0.5 + * when upper_bound is UINT32_MAX/2. */ ISC_LANG_ENDDECLS diff --git a/lib/isc/lib.c b/lib/isc/lib.c index 7f6b12c4aa..4d52c5bc2f 100644 --- a/lib/isc/lib.c +++ b/lib/isc/lib.c @@ -22,6 +22,7 @@ #include "config.h" #include "mem_p.h" #include "os_p.h" +#include "random_p.h" #include "tls_p.h" #include "trampoline_p.h" @@ -42,6 +43,7 @@ void isc__initialize(void) { isc__os_initialize(); isc__mem_initialize(); + isc__random_initialize(); isc__tls_initialize(); isc__trampoline_initialize(); (void)isc_os_ncpus(); diff --git a/lib/isc/random.c b/lib/isc/random.c index 7eead6608b..8f804360db 100644 --- a/lib/isc/random.c +++ b/lib/isc/random.c @@ -35,7 +35,6 @@ #include #include -#include #include #include #include @@ -43,22 +42,7 @@ #include #include "entropy_private.h" - -/* - * The specific implementation for PRNG is included as a C file - * that has to provide a static variable named seed, and a function - * uint32_t next(void) that provides next random number. - * - * The implementation must be thread-safe. - */ - -/* - * Two contestants have been considered: the xoroshiro family of the - * functions by Villa&Blackman, and PCG by O'Neill. After - * consideration, the xoshiro128starstar function has been chosen as - * the uint32_t random number provider because it is very fast and has - * good enough properties for our usage pattern. - */ +#include "random_p.h" /* * Written in 2018 by David Blackman and Sebastiano Vigna (vigna@acm.org) @@ -76,11 +60,9 @@ * It has excellent (sub-ns) speed, a state size (128 bits) that is large * enough for mild parallelism, and it passes all tests we are aware of. * - * For generating just single-precision (i.e., 32-bit) floating-point - * numbers, xoshiro128+ is even faster. - * * The state must be seeded so that it is not everywhere zero. */ + static thread_local uint32_t seed[4] = { 0 }; static uint32_t @@ -106,11 +88,8 @@ next(void) { return (result_starstar); } - -static thread_local isc_once_t isc_random_once = ISC_ONCE_INIT; - -static void -isc_random_initialize(void) { +void +isc__random_initialize(void) { int useed[4] = { 0, 0, 0, 1 }; #if FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION /* @@ -126,22 +105,16 @@ isc_random_initialize(void) { uint8_t isc_random8(void) { - RUNTIME_CHECK(isc_once_do(&isc_random_once, isc_random_initialize) == - ISC_R_SUCCESS); - return (next() & 0xff); + return ((uint8_t)next()); } uint16_t isc_random16(void) { - RUNTIME_CHECK(isc_once_do(&isc_random_once, isc_random_initialize) == - ISC_R_SUCCESS); - return (next() & 0xffff); + return ((uint16_t)next()); } uint32_t isc_random32(void) { - RUNTIME_CHECK(isc_once_do(&isc_random_once, isc_random_initialize) == - ISC_R_SUCCESS); return (next()); } @@ -153,9 +126,6 @@ isc_random_buf(void *buf, size_t buflen) { REQUIRE(buf != NULL); REQUIRE(buflen > 0); - RUNTIME_CHECK(isc_once_do(&isc_random_once, isc_random_initialize) == - ISC_R_SUCCESS); - for (i = 0; i + sizeof(r) <= buflen; i += sizeof(r)) { r = next(); memmove((uint8_t *)buf + i, &r, sizeof(r)); @@ -166,41 +136,75 @@ isc_random_buf(void *buf, size_t buflen) { } uint32_t -isc_random_uniform(uint32_t upper_bound) { - /* Copy of arc4random_uniform from OpenBSD */ - uint32_t r, min; - - RUNTIME_CHECK(isc_once_do(&isc_random_once, isc_random_initialize) == - ISC_R_SUCCESS); - - if (upper_bound < 2) { - return (0); - } - -#if (ULONG_MAX > 0xffffffffUL) - min = 0x100000000UL % upper_bound; -#else /* if (ULONG_MAX > 0xffffffffUL) */ - /* Calculate (2**32 % upper_bound) avoiding 64-bit math */ - if (upper_bound > 0x80000000) { - min = 1 + ~upper_bound; /* 2**32 - upper_bound */ - } else { - /* (2**32 - (x * 2)) % x == 2**32 % x when x <= 2**31 */ - min = ((0xffffffff - (upper_bound * 2)) + 1) % upper_bound; - } -#endif /* if (ULONG_MAX > 0xffffffffUL) */ - +isc_random_uniform(uint32_t limit) { /* - * This could theoretically loop forever but each retry has - * p > 0.5 (worst case, usually far better) of selecting a - * number inside the range we need, so it should rarely need - * to re-roll. + * Daniel Lemire's nearly-divisionless unbiased bounded random numbers. + * + * https://lemire.me/blog/?p=17551 + * + * The raw random number generator `next()` returns a 32-bit value. + * We do a 64-bit multiply `next() * limit` and treat the product as a + * 32.32 fixed-point value less than the limit. Our result will be the + * integer part (upper 32 bits), and we will use the fraction part + * (lower 32 bits) to determine whether or not we need to resample. */ - for (;;) { - r = next(); - if (r >= min) { - break; + uint64_t num = (uint64_t)next() * (uint64_t)limit; + /* + * In the fast path, we avoid doing a division in most cases by + * comparing the fraction part of `num` with the limit, which is + * a slight over-estimate for the exact resample threshold. + */ + if ((uint32_t)(num) < limit) { + /* + * We are in the slow path where we re-do the approximate test + * more accurately. The exact threshold for the resample loop + * is the remainder after dividing the raw RNG limit `1 << 32` + * by the caller's limit. We use a trick to calculate it + * within 32 bits: + * + * (1 << 32) % limit + * == ((1 << 32) - limit) % limit + * == (uint32_t)(-limit) % limit + * + * This division is safe: we know that `limit` is strictly + * greater than zero because of the slow-path test above. + */ + uint32_t residue = (uint32_t)(-limit) % limit; + /* + * Unless we get one of `N = (1 << 32) - residue` valid + * values, we reject the sample. This `N` is a multiple of + * `limit`, so our results will be unbiased; and `N` is the + * largest multiple that fits in 32 bits, so rejections are as + * rare as possible. + * + * There are `limit` possible values for the integer part of + * our fixed-point number. Each one corresponds to `N/limit` + * or `N/limit + 1` possible fraction parts. For our result to + * be unbiased, every possible integer part must have the same + * number of possible valid fraction parts. So, when we get + * the superfluous value in the `N/limit + 1` cases, we need + * to reject and resample. + * + * Because of the multiplication, the possible values in the + * fraction part are equally spaced by `limit`, with varying + * gaps at each end of the fraction's 32-bit range. We will + * choose a range of size `N` (a multiple of `limit`) into + * which valid fraction values must fall, with the rest of the + * 32-bit range covered by the `residue`. Lemire's paper says + * that exactly `N/limit` possible values spaced apart by + * `limit` will fit into our size `N` valid range, regardless + * of the size of the end gaps, the phase alignment of the + * values, or the position of the range. + * + * So, when a fraction value falls in the `residue` outside + * our valid range, it is superfluous, and we resample. + */ + while ((uint32_t)(num) < residue) { + num = (uint64_t)next() * (uint64_t)limit; } } - - return (r % upper_bound); + /* + * Return the integer part (upper 32 bits). + */ + return ((uint32_t)(num >> 32)); } diff --git a/lib/isc/random_p.h b/lib/isc/random_p.h new file mode 100644 index 0000000000..3e4917f8b7 --- /dev/null +++ b/lib/isc/random_p.h @@ -0,0 +1,30 @@ +/* + * Copyright (C) Internet Systems Consortium, Inc. ("ISC") + * + * SPDX-License-Identifier: MPL-2.0 + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, you can obtain one at https://mozilla.org/MPL/2.0/. + * + * See the COPYRIGHT file distributed with this work for additional + * information regarding copyright ownership. + */ + +#pragma once + +#include + +/*! \file isc/random_p.h + * \brief For automatically seeding and re-seeding when required. + */ + +ISC_LANG_BEGINDECLS + +void +isc__random_initialize(void); +/*!< + * \brief Seed the thread-local random number state with fresh entropy. + */ + +ISC_LANG_ENDDECLS diff --git a/lib/isc/tests/random_test.c b/lib/isc/tests/random_test.c index 040e1a5f77..3d8cfdb260 100644 --- a/lib/isc/tests/random_test.c +++ b/lib/isc/tests/random_test.c @@ -87,6 +87,11 @@ _setup(void **state) { result = isc_test_begin(NULL, true, 0); assert_int_equal(result, ISC_R_SUCCESS); + /* + * Ensure the RNG has been seeded. + */ + assert_int_not_equal(isc_random32(), 0); + return (0); } diff --git a/lib/isc/trampoline.c b/lib/isc/trampoline.c index fe01394b9e..a718e097e2 100644 --- a/lib/isc/trampoline.c +++ b/lib/isc/trampoline.c @@ -22,6 +22,7 @@ #include #include +#include "random_p.h" #include "trampoline_p.h" #define ISC__TRAMPOLINE_UNUSED 0 @@ -185,6 +186,11 @@ isc__trampoline_attach(isc__trampoline_t *trampoline) { * malloc() + free() calls altogether, as it would foil the fix. */ trampoline->jemalloc_enforce_init = malloc(8); + + /* + * Re-seed the random number generator in each thread. + */ + isc__random_initialize(); } isc_threadresult_t