diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index bc8ca63f2d..5c332adab3 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -18,14 +18,20 @@ variables: CLANG_VERSION: 16 CLANG: "clang-${CLANG_VERSION}" SCAN_BUILD: "scan-build-${CLANG_VERSION}" - ASAN_SYMBOLIZER_PATH: "/usr/lib/llvm-${CLANG_VERSION}/bin/llvm-symbolizer" + LLVM_SYMBOLIZER: "/usr/lib/llvm-${CLANG_VERSION}/bin/llvm-symbolizer" CLANG_FORMAT: "clang-format-${CLANG_VERSION}" CFLAGS_COMMON: -fno-omit-frame-pointer -fno-optimize-sibling-calls -O1 -g -Wall -Wextra # Pass run-time flags to AddressSanitizer to get core dumps on error. ASAN_OPTIONS: abort_on_error=1:disable_coredump=0:unmap_shadow_on_exit=1 - TSAN_OPTIONS_COMMON: "disable_coredump=0 second_deadlock_stack=1 history_size=7 log_exe_name=true log_path=tsan" + ASAN_SYMBOLIZER_PATH: "${LLVM_SYMBOLIZER}" + + TSAN_OPTIONS_COMMON: "disable_coredump=0 second_deadlock_stack=1 atexit_sleep_ms=1000 history_size=7 log_exe_name=true log_path=tsan" + TSAN_SYMBOLIZER: "external_symbolizer_path=/usr/bin/llvm-symbolizer" + TSAN_SUPPRESSIONS: "suppressions=${CI_PROJECT_DIR}/.tsan-suppress" + TSAN_OPTIONS_DEFAULT: "${TSAN_OPTIONS_COMMON} ${TSAN_SUPPRESSIONS} ${TSAN_SYMBOLIZER}" + UBSAN_OPTIONS: "halt_on_error=1:abort_on_error=1:disable_coredump=0" TARBALL_EXTENSION: xz @@ -1044,7 +1050,7 @@ gcc:tsan: system:gcc:tsan: variables: - TSAN_OPTIONS: "${TSAN_OPTIONS_COMMON} external_symbolizer_path=/usr/bin/llvm-symbolizer" + TSAN_OPTIONS: "${TSAN_OPTIONS_DEFAULT}" <<: *fedora_37_amd64_image <<: *system_test_tsan_job needs: @@ -1053,7 +1059,7 @@ system:gcc:tsan: unit:gcc:tsan: variables: - TSAN_OPTIONS: "${TSAN_OPTIONS_COMMON} external_symbolizer_path=/usr/bin/llvm-symbolizer" + TSAN_OPTIONS: "${TSAN_OPTIONS_DEFAULT}" <<: *fedora_37_amd64_image <<: *unit_test_tsan_job needs: @@ -1071,7 +1077,7 @@ clang:tsan: system:clang:tsan: variables: - TSAN_OPTIONS: "${TSAN_OPTIONS_COMMON} external_symbolizer_path=/usr/lib/llvm-${CLANG_VERSION}/bin/llvm-symbolizer" + TSAN_OPTIONS: "${TSAN_OPTIONS_COMMON} ${TSAN_SUPPRESSIONS} external_symbolizer_path=${LLVM_SYMBOLIZER}" <<: *base_image <<: *system_test_tsan_job needs: @@ -1080,7 +1086,7 @@ system:clang:tsan: unit:clang:tsan: variables: - TSAN_OPTIONS: "${TSAN_OPTIONS_COMMON} external_symbolizer_path=/usr/lib/llvm-${CLANG_VERSION}/bin/llvm-symbolizer suppressions=$CI_PROJECT_DIR/tsan-suppressions.txt" + TSAN_OPTIONS: "${TSAN_OPTIONS_COMMON} suppressions=$CI_PROJECT_DIR/.tsan-suppress-extra external_symbolizer_path=${LLVM_SYMBOLIZER}" <<: *base_image <<: *unit_test_tsan_job needs: @@ -1323,7 +1329,7 @@ respdiff-short:tsan: LDFLAGS: "-fsanitize=thread" EXTRA_CONFIGURE: "--enable-pthread-rwlock --without-jemalloc" MAX_DISAGREEMENTS_PERCENTAGE: "0.5" - TSAN_OPTIONS: "${TSAN_OPTIONS_COMMON} external_symbolizer_path=/usr/bin/llvm-symbolizer" + TSAN_OPTIONS: "${TSAN_OPTIONS_DEFAULT}" script: - bash respdiff.sh -s named -q "${PWD}/10k_a.txt" -c 3 -w "${PWD}/rspworkdir" "${CI_PROJECT_DIR}" "/usr/local/respdiff-reference-bind/sbin/named" after_script: @@ -1361,7 +1367,7 @@ respdiff-long:tsan: LDFLAGS: "-fsanitize=thread" EXTRA_CONFIGURE: "--enable-pthread-rwlock --without-jemalloc" MAX_DISAGREEMENTS_PERCENTAGE: "0.5" - TSAN_OPTIONS: "${TSAN_OPTIONS_COMMON} external_symbolizer_path=/usr/bin/llvm-symbolizer" + TSAN_OPTIONS: "${TSAN_OPTIONS_DEFAULT}" script: - bash respdiff.sh -s named -q "${PWD}/100k_mixed.txt" -c 3 -w "${PWD}/rspworkdir" "${CI_PROJECT_DIR}" "/usr/local/respdiff-reference-bind/sbin/named" after_script: diff --git a/.reuse/dep5 b/.reuse/dep5 index d4072e91fd..eb708299f7 100644 --- a/.reuse/dep5 +++ b/.reuse/dep5 @@ -163,11 +163,12 @@ Files: **/.clang-format .gitlab-ci.yml .lgtm.yml .pylintrc + .tsan-suppress + .tsan-suppress-extra .uncrustify.cfg doc/misc/*.zoneopt doc/misc/options doc/misc/rndc.grammar - tsan-suppressions.txt sonar-project.properties Copyright: Internet Systems Consortium, Inc. ("ISC") License: CC0-1.0 diff --git a/.tsan-suppress b/.tsan-suppress new file mode 100644 index 0000000000..c73eb7815f --- /dev/null +++ b/.tsan-suppress @@ -0,0 +1,4 @@ +# be more selective with liburcu +race:rcu_barrier +race:rcu_memb_barrier +thread:* diff --git a/.tsan-suppress-extra b/.tsan-suppress-extra new file mode 100644 index 0000000000..0f598bf36b --- /dev/null +++ b/.tsan-suppress-extra @@ -0,0 +1,7 @@ +# Uninstrumented libraries +called_from_lib:libfstrm.so +called_from_lib:libdummyrpz.so +# be more selective with liburcu +race:rcu_barrier +race:rcu_memb_barrier +thread:* diff --git a/CHANGES b/CHANGES index 34fe56bce2..5598d6f373 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,10 @@ +6172. [cleanup] Refactor the loop manager and qp-trie code to remove + isc_qsbr and use liburcu instead. [GL #3936] + +6171. [cleanup] Remove the stack implementation added in change 6108: + we are using the liburcu concurrent data structures + instead. [GL !7920] + 6170. [func] The 'rndc -t' option allows a timeout to be set in seconds, so that commands that take a long time to complete (e.g., reloading a very large configuration) diff --git a/bin/delv/delv.c b/bin/delv/delv.c index fc10c7137c..6ae6af9183 100644 --- a/bin/delv/delv.c +++ b/bin/delv/delv.c @@ -2147,8 +2147,7 @@ run_server(void *arg) { CHECK(ns_interfacemgr_create(mctx, sctx, loopmgr, netmgr, dispatchmgr, NULL, false, &interfacemgr)); - CHECK(dns_view_create(mctx, loopmgr, dns_rdataclass_in, "_default", - &view)); + CHECK(dns_view_create(mctx, dns_rdataclass_in, "_default", &view)); CHECK(dns_cache_create(loopmgr, dns_rdataclass_in, "", &cache)); dns_view_setcache(view, cache, false); dns_cache_detach(&cache); diff --git a/bin/named/server.c b/bin/named/server.c index a13c9c7a4d..d38ecffd01 100644 --- a/bin/named/server.c +++ b/bin/named/server.c @@ -6437,8 +6437,7 @@ create_view(const cfg_obj_t *vconfig, dns_viewlist_t *viewlist, } INSIST(view == NULL); - result = dns_view_create(named_g_mctx, named_g_loopmgr, viewclass, - viewname, &view); + result = dns_view_create(named_g_mctx, viewclass, viewname, &view); if (result != ISC_R_SUCCESS) { return (result); } diff --git a/bin/tests/system/pipelined/pipequeries.c b/bin/tests/system/pipelined/pipequeries.c index 5569ad318d..805e3c5344 100644 --- a/bin/tests/system/pipelined/pipequeries.c +++ b/bin/tests/system/pipelined/pipequeries.c @@ -256,7 +256,7 @@ main(int argc, char *argv[]) { RUNCHECK(dns_requestmgr_create(mctx, dispatchmgr, dispatchv4, NULL, &requestmgr)); - RUNCHECK(dns_view_create(mctx, loopmgr, 0, "_test", &view)); + RUNCHECK(dns_view_create(mctx, 0, "_test", &view)); isc_loopmgr_setup(loopmgr, sendqueries, NULL); isc_loopmgr_run(loopmgr); diff --git a/bin/tests/system/run.sh.in b/bin/tests/system/run.sh.in index 084efc47d8..81038f1eb1 100644 --- a/bin/tests/system/run.sh.in +++ b/bin/tests/system/run.sh.in @@ -89,6 +89,7 @@ if ! $do_run; then LOG_FLAGS="$log_flags" \ TEST_LARGE_MAP="${TEST_LARGE_MAP}" \ CI_ENABLE_ALL_TESTS="${CI_ENABLE_ALL_TESTS}" \ + ${TSAN_OPTIONS:+"TSAN_OPTIONS=${TSAN_OPTIONS}"} \ ${VIRTUAL_ENV:+"VIRTUAL_ENV=${VIRTUAL_ENV}"} \ ${PERL5LIB:+"PERL5LIB=${PERL5LIB}"} \ make -e check diff --git a/bin/tools/mdig.c b/bin/tools/mdig.c index 4193f06912..7a366c0e85 100644 --- a/bin/tools/mdig.c +++ b/bin/tools/mdig.c @@ -2141,7 +2141,7 @@ main(int argc, char *argv[]) { mctx, dispatchmgr, have_ipv4 ? dispatchvx : NULL, have_ipv6 ? dispatchvx : NULL, &requestmgr)); - RUNCHECK(dns_view_create(mctx, loopmgr, 0, "_mdig", &view)); + RUNCHECK(dns_view_create(mctx, 0, "_mdig", &view)); query = ISC_LIST_HEAD(queries); isc_loopmgr_setup(loopmgr, sendqueries, query); diff --git a/doc/dev/dev.md b/doc/dev/dev.md index adbb40dcd0..1be096d6af 100644 --- a/doc/dev/dev.md +++ b/doc/dev/dev.md @@ -657,45 +657,6 @@ Items can be removed from the list using `ISC_LIST_UNLINK`: ISC_LIST_UNLINK(foolist, foo, link); -##### Atomic stacks - -In ``, there are also similar macros for singly-linked -stacks (`ISC_STACK`) and atomic stacks (`ISC_ASTACK`). - -Lock-free linked data structures have pitfalls that the `ISC_ASTACK` -macros avoid by having a restricted API. Firstly, it is difficult to -implement atomic doubly-linked lists without at least a double-width -compare-exchange, which is not a widely supported primitive, so this -stack is singly-linked. Secondly, when individual elements can be -removed from a list (whether doubly-linked or singly-linked), we run -into the ABA problem, where removes and (re-)inserts race and as a -result the links get in a tangle. Safe support for removing individual -elements requires some kind of garbage collection, or extending link -pointers with generation counters. Instead the ASTACK macros only -provide a function for emptying the entire stack in one go, -`ISC_ASTACK_TO_STACK()`. This only requires setting the `top` pointer to -`NULL`, which cannot cause `ISC_ASTACK_PUSH()` to go wrong. - -The `ISC_ASTACK` macros are used the same way as the non-atomic -doubly-linked lists described above, except that not all of the -`ISC_LIST` macros have `ISC_ASTACK` equivalents, and stacks have a -`TOP` instead of a `HEAD`. - -As well as read-only iteration similar to `ISC_LIST` described above, -there is an idiom for emptying an atomic stack into a regular stack, -and processing its contents: - - ISC_STACK(isc_foo_t) drain = ISC_ASTACK_TO_STACK(foo_work); - while (!ISC_STACK_EMPTY(drain)) { - isc_foo_t *foo = ISC_STACK_POP(drain, link); - /* do things */ - /* maybe again? ISC_ASTACK_PUSH(foo_work, foo, link) */ - } - -There is also an `ISC_ASTACK_ADD()` macro, which is a convenience -wrapper for pushing an element onto a stack if it has not already been -added. - #### Names The `dns_name` API has facilities for processing DNS names and labels, diff --git a/doc/dev/qsbr.md b/doc/dev/qsbr.md deleted file mode 100644 index 7880a56bd4..0000000000 --- a/doc/dev/qsbr.md +++ /dev/null @@ -1,397 +0,0 @@ - - -QSBR: quiescent state based reclamation -======================================= - -QSBR is a safe memory reclamation (SMR) algorithm for lock-free data -structures such as a qp-trie. (See `doc/dev/qp.md`.) - -When an object is unlinked from a lock-free data structure, it -cannot be `free()`ed immediately, because there can still be readers -accessing the object via an old version of the data structure. SMR -algorithms determine when it is safe to reclaim memory after it has -been unlinked. - - -Introductions and overviews ---------------------------- - -There is a terse overview in `include/isc/qsbr.h`. - -Jeff Preshing has a nice introduction to QSBR, -__ - -At the end of this note is a copy of a blog post about writing BIND's -`isc_qsbr`, __ - -[Paul McKenney's web page][paulmck] has links to his book on -concurrent programming, the [Userspace RCU library][urcu], and more. -McKenney invented RCU and QSBR. RCU is the Linux kernel's machinery -for lock-free data structures and safe memory reclamation, based on -QSBR. - -[paulmck]: http://www.rdrop.com/~paulmck/ -[urcu]: https://liburcu.org/ - - -Example code ------------- - -If you are implementing a lock-free data structure that needs safe -memory reclamation, here's a guide to using `isc_qsbr`, based on how -QSBR is used by `dns_qp`. - -### registration - -When the program starts up you need to register a global callback -function that will reclaim unused memory. You can do so using an -ISC_CONSTRUCTOR function that runs automatically at startup. - - static void - qp_qsbr_register(void) ISC_CONSTRUCTOR; - static void - qp_qsbr_register(void) { - isc_qsbr_register(qp_qsbr_reclaimer); - } - -### work list - -Your module will need somewhere that your callback can find the work -it needs to do. The qp-trie has an atomic list of `dns_qpmulti_t` -objects for this purpose. - - /* a global variable */ - static ISC_ASTACK(dns_qpmulti_t) qsbr_work; - -The reason for using global variables is so that we don't need to -allocate a thunk every time we have memory reclamation work to do. - -### read-only access - -You should design your data structure so that it has a single atomic -root pointer referring to its current version. A lock-free reader -_must_ run in an `isc_loop` callback. It gains access to the data -structure by taking a copy of this pointer: - - qp_node_t *reader = atomic_load_acquire(&multi->reader); - -During an `isc_loop` callback, a reader should keep using the same -pointer go get a consistent view of the data structure. If it reloads -the pointer it can get a different version changed by concurrent -writers. - -A reader _must_ stop using the root pointer and any interior pointers -obtained via the root pointer before it returns to the `isc_loop`. - -### modifications and writes - -All changes to the data structure must be copy-on-write (aka -read-copy-update) so that concurrent readers are not disturbed. - -When a new version of the data structure has been prepared, it is -committed by overwriting the atomic root pointer, - - atomic_store_release(&multi->reader, reader); /* COMMIT */ - -### scheduling cleanup - -After committing a change, your data structure may have memory that -will become free, after concurrent readers have stopped accessing it. -To reclaim the memory when it is safe, use code like: - - isc_qsbr_phase_t phase = isc_qsbr_phase(multi->loopmgr); - if (defer_chunk_reclamation(qp, phase)) { - ISC_ASTACK_ADD(qsbr_work, multi, cleanup); - isc_qsbr_activate(multi->loopmgr, phase); - } - - * First, get the current QSBR phase - - * Second, mark free memory with the phase number. The qp-trie scans - its chunks and marks those that will become free, and returns - `true` if there is cleanup work to do. - - * If so, the qp-trie is added to the work list. (`ISC_ALIST_ADD()` - is idempotent). - - * Finally, QSBR is informed that there is work to do. - -In other cases it might not make sense to scan the data structure -after committing, and instead you might make note of which memory to -clean up while making changes before you know what the phase will be. -You can then have per-phase work lists, like: - - static ISC_ASTACK(my_work_t) qsbr_work[ISC_QSBR_PHASES]; - - isc_qsbr_phase_t phase = isc_qsbr_phase(loopmgr); - ISC_ASTACK_ADD(qsbr_work[phase], cleanup_work, link); - isc_qsbr_activate(loopmgr, phase); - -In general, there will be several (maybe many) write operations during -a grace period. Your lock-free data structure should collect its -reclamation work from all these writes into a batch per phase, i.e. -per grace period. - -### reclaiming - -Inside the reclaimer callback, we iterate over the work list and clean -up each item on it. If there is more cleanup work to do in another -phase, we put the qp-trie back on the work list for another go. - - static void - qsbreclaimer(void *arg, isc_qsbr_phase_t phase) { - UNUSED(arg); - - ISC_STACK(dns_qpmulti_t) drain = ISC_ASTACK_TO_STACK(qsbr_work); - while (!ISC_STACK_EMPTY(drain)) { - dns_qpmulti_t *multi = ISC_STACK_POP(drain, cleanup); - INSIST(QPMULTI_VALID(multi)); - LOCK(&multi->mutex); - if (reclaim_chunks(&multi->writer, phase)) { - /* more to do next time */ - ISC_ALIST_PUSH(qsbr_work, multi, cleanup); - } - UNLOCK(&multi->mutex); - } - } - -### reclaim marks - -In the qp-trie data structure, each chunk has some metadata which -includes a bitfield for the reclaim phase: - - isc_qsbr_phase_t phase : ISC_QSBR_PHASE_BITS; - -We use a bitfield so that all the metadata fits in a single word. - - ------------------------------------------------------------------------- - -Safe memory reclamation for BIND -================================ - -At the end of October 2022, I _finally_ got [my multithreaded -qp-trie][qp-gc] working! It could be built with two different -concurrency control mechanisms: - - * A reader/writer lock - - This has poor read-side scalability, because every thread is - hammering on the same shared location. But its write performance - is reasonably good: concurrent readers don't slow it down too much. - - * [`liburcu`, userland read-copy-update][urcu] - - RCU has a fast and scalable read side, nice! But on the write side - I used `synchronize_rcu()`, which is blocking and rather slow, so - my write performance was terrible. - -OK, but I want the best of both worlds! To fix it, I needed to change -the qp-trie code to use safe memory reclamation more effectively: -instead of blocking inside `synchronize_rcu()` before cleaning up, use -`call_rcu()` to clean up asynchronously. I expect I'll write about the -qp-trie changes another time. - -Another issue is that I want the best of both worlds _by default_, -but `liburcu` is [LGPL][] and we don't want BIND to depend on -code whose licence demands more from our users than the [MPL][]. - -[qp-gc]: https://dotat.at/@/2021-06-23-page-based-gc-for-qp-trie-rcu.html -[LGPL]: https://opensource.org/licenses/LGPL-2.1 -[MPL]: https://opensource.org/licenses/MPL-2.0 - -So I set out to write my own safe memory reclamation support code. - - -lock freedom ------------- - -In a [multithreaded qp-trie][qp-gc], there can be many concurrent -readers, but there can be only one writer at a time and modifications -are strictly serialized. When I have got it working properly, readers -are completely wait-free, unaffected by other readers, and almost -unaffected by writers. Writers need to get a mutex to ensure there is -only one at a time, but once the mutex is acquired, a writer is not -obstructed by readers. - -The way this works is that readers use an atomic load to get a pointer -to the root of the current version of the trie. Readers can make -multiple queries using this root pointer and the results will be -consistent wrt that particular version, regardless of what changes -writers might be making concurrently. Writers do not affect readers -because all changes are made by copy-on-write. When a writer is ready -to commit a new version of the trie, it uses an atomic store to flip -the root pointer. - - -safe memory reclamation ------------------------ - -We can't copy-on-write indefinitely: we need to reclaim the memory -used by old versions of the trie. And we must do so "safely", i.e. -without `free()`ing memory that readers are still using. - -So, before `free()`ing memory, a writer must wait for a _"grace -period"_, which is a jargon term meaning "until readers are not using -the old version". There are a bunch of algorithms for determining when -a grace period is over, with varying amounts of over-approximation, -CPU overhead, and memory backlog. - -The [RCU][urcu] function `synchronize_rcu()` is slow because it blocks -waiting for a grace period; the `call_rcu()` function runs a callback -asynchronously after a grace period has passed. I wanted to avoid -blocking my writers, so I needed to implement something like -`call_rcu()`. - - -aversions ---------- - -When I started trying to work out how to do safe memory reclamation, -it all seemed quite intimidating. But as I learned more, I found that -my circumstances make it easier than it appeared at first. - -The [`liburcu`][urcu] homepage has a long list of supported CPU -architectures and operating systems. Do I have to care about those -details too? No! The RCU code dates back to before the age of -standardized concurrent memory models, so the RCU developers had to -invent their own atomic primitives and correctness rules. Twenty-ish -years later the state of the art has advanced, so I can use -`` without having to re-do it like `liburcu`. - -You can also choose between several algorithms implemented by -[`liburcu`][urcu], involving questions about kernel support, specially -reserved signals, and intrusiveness in application code. But while I -was working out how to schedule asynchronous memory reclamation work, -I realised that BIND is already well-suited to the fastest flavour of -RCU, called "QSBR". - - -QSBR ----- - -QSBR stands for "quiescent state based reclamation". A _"quiescent -state"_ is a fancy name for a point when a thread is not accessing a -lock-free data structure, and does not retain any root pointers or -interior pointers. - -When a thread has passed through a quiescent state, it no longer has -access to older versions of the data structures. When _all_ threads -have passed through quiescent states, then nothing in the program has -access to old versions. This is how QSBR detects grace periods: after -a writer commits a new version, it waits for all threads to pass -through quiescent states, and therefore a grace period has definitely -elapsed, and so it is then safe to reclaim the old version's memory. - -QSBR is fast because readers do not need to explicitly mark the -critical section surrounding the atomic load that I mentioned earlier. -Threads just need to pass through a quiescent state frequently enough -that there isn't a huge build-up of unreclaimed memory. - -Inside an operating system kernel (RCU's native environment), a -context switch provides a natural quiescent state. In a userland -application, you need to find a good place to call -`rcu_quiescent_state()`. You could call it every time you have -finished using a root pointer, but marking a quiescent state is not -completely free, so there are probably more efficient ways. - - -`libuv` -------- - -BIND is multithreaded, and (basically) each thread runs an event loop. -Recent versions of BIND use [`libuv`][uv] for the event loops. - -A lot of things started falling into place when I realised that the -`libuv` event loop gives BIND a [natural quiescent state][uv-loop]: -when the event callbacks have finished running, and `libuv` is about -to call `select()` or `poll()` or whatever, we can mark a quiescent -state. We can require that event-handling functions do not stash root -pointers in the heap, but only use them via local variables, so we -know that old versions are inaccessible after the callback returns. - -My design marks a quiescent state once per loop, so on a busy server -where each loop has lots to do, the cost of marking a quiescent state -is amortized across several I/O events. - -[uv]: http://libuv.org/ -[uv-loop]: http://docs.libuv.org/en/v1.x/design.html#the-i-o-loop - - -fuzzy barrier -------------- - -So, how do we mark a quiescent state? Using a _"fuzzy barrier"_. - -When a thread reaches a normal barrier, it blocks until all the other -threads have reached the barrier, after which exactly one of the -threads can enter a protected section of code, and the others are -unblocked and can proceed as normal. - -When a thread encounters a fuzzy barrier, it never blocks. It either -proceeds immediately as normal, or if it is the last thread to reach -the barrier, it enters the protected code. - -RCU does not actually use a fuzzy barrier as I have described it. Like -a fuzzy barrier, each thread keeps track of whether it has passed -through a quiescent state in the current grace period, without -blocking; but unlike a fuzzy barrier, no thread is diverted to the -protected code. Instead, code that wants to enter a protected section -uses the blocking `synchronize_rcu()` function. - - -EBR-ish -------- - -As in the paper ["performance of memory reclamation for lockless -synchronization"][HMBW], my implementation of QSBR uses a fuzzy -barrier designed for another safe memory reclamation algorithm, EBR, -epoch based reclamation. (EBR was invented here in Cambridge by [Keir -Fraser][tr579].) - -[HMBW]: http://csng.cs.toronto.edu/publication_files/0000/0159/jpdc07.pdf -[tr579]: https://www.cl.cam.ac.uk/techreports/UCAM-CL-TR-579.html - -Actually, my fuzzy barrier is slightly different to EBR's. In EBR, the -fuzzy barrier is used every time the program enters a critical -section. (In qp-trie terms, that would be every time a reader fetches -a root pointer.) So it is vital that EBR's barrier avoids mutating -shared state, because that would wreck multithreaded performance. - -Because BIND will only pass through the fuzzy barrier when it is about -to use a blocking system call, my version mutates shared state more -frequently (typically, once per CPU per grace period, instead of once -per grace period). If this turns out to be a problem, it won't be too -hard to make it work more like EBR. - -More trivially, I'm using the term "phase" instead of "epoch", because -it's nothing to do with the unix epoch, because there are three -phases, and because I can talk about phase transitions and threads -being out of phase with each other. - - -coda ----- - -While reading various RCU-related papers, I was amused by ["user-level -implementations of read-copy update"][DMSDW], which says: - -> BIND, a major domain-name server used for Internet domain-name -> resolution, is facing scalability issues. Since domain names -> are read often but rarely updated, using user-level RCU might be -> beneficial. - -Yes, I think it might :-) - -[DMSDW]: https://www.efficios.com/publications/ diff --git a/fuzz/Makefile.am b/fuzz/Makefile.am index d2e3931c00..75407d488b 100644 --- a/fuzz/Makefile.am +++ b/fuzz/Makefile.am @@ -6,6 +6,7 @@ AM_CFLAGS += \ AM_CPPFLAGS += \ $(LIBISC_CFLAGS) \ $(LIBDNS_CFLAGS) \ + $(LIBURCU_CFLAGS) \ $(LIBUV_CFLAGS) \ -DFUZZDIR=\"$(abs_srcdir)\" \ -I$(top_srcdir)/lib/dns \ diff --git a/fuzz/dns_message_checksig.c b/fuzz/dns_message_checksig.c index c27bfebe8b..a4b9dd15ab 100644 --- a/fuzz/dns_message_checksig.c +++ b/fuzz/dns_message_checksig.c @@ -236,8 +236,7 @@ LLVMFuzzerInitialize(int *argc ISC_ATTR_UNUSED, char ***argv ISC_ATTR_UNUSED) { isc_loopmgr_create(mctx, 1, &loopmgr); - result = dns_view_create(mctx, loopmgr, dns_rdataclass_in, "view", - &view); + result = dns_view_create(mctx, dns_rdataclass_in, "view", &view); if (result != ISC_R_SUCCESS) { fprintf(stderr, "dns_view_create failed: %s\n", isc_result_totext(result)); diff --git a/fuzz/dns_qp.c b/fuzz/dns_qp.c index 1bc19fa38f..045c8acbf2 100644 --- a/fuzz/dns_qp.c +++ b/fuzz/dns_qp.c @@ -16,10 +16,10 @@ #include #include -#include #include #include #include +#include #include #include diff --git a/lib/dns/Makefile.am b/lib/dns/Makefile.am index 1db6d6585a..a70d7e7381 100644 --- a/lib/dns/Makefile.am +++ b/lib/dns/Makefile.am @@ -263,6 +263,7 @@ libdns_la_CPPFLAGS = \ $(AM_CPPFLAGS) \ $(LIBDNS_CFLAGS) \ $(LIBISC_CFLAGS) \ + $(LIBURCU_CFLAGS) \ $(LIBUV_CFLAGS) \ $(OPENSSL_CFLAGS) @@ -272,6 +273,7 @@ libdns_la_LDFLAGS = \ libdns_la_LIBADD = \ $(LIBISC_LIBS) \ + $(LIBURCU_LIBS) \ $(LIBUV_LIBS) \ $(OPENSSL_LIBS) diff --git a/lib/dns/client.c b/lib/dns/client.c index 5a3c0fadda..579b9c3aab 100644 --- a/lib/dns/client.c +++ b/lib/dns/client.c @@ -206,8 +206,7 @@ createview(isc_mem_t *mctx, dns_rdataclass_t rdclass, isc_loopmgr_t *loopmgr, isc_result_t result; dns_view_t *view = NULL; - result = dns_view_create(mctx, loopmgr, rdclass, DNS_CLIENTVIEW_NAME, - &view); + result = dns_view_create(mctx, rdclass, DNS_CLIENTVIEW_NAME, &view); if (result != ISC_R_SUCCESS) { return (result); } diff --git a/lib/dns/include/dns/qp.h b/lib/dns/include/dns/qp.h index a425a4c257..d4bf9b3650 100644 --- a/lib/dns/include/dns/qp.h +++ b/lib/dns/include/dns/qp.h @@ -47,17 +47,11 @@ * version of the trie. * * For fast concurrent reads, call `dns_qpmulti_query()` to fill in a - * `dns_qpread_t`, which must be allocated on the stack. Readers can - * access a single version of the trie within the scope of an isc_loop - * thread (NOT an isc_work thread). We rely on the loop to bound the - * lifetime of a `dns_qpread_t`, instead of using locks. Readers are - * not blocked by any write activity, and vice versa. - * - * For read-only access outside the scope of a loop, such as from an - * isc_work callback, `dns_qpmulti_lockedread()`. This looks like a - * query transaction; the difference is that a locked read transaction - * takes the `dns_qpmulti_t` mutex. When you have finished with a - * `dns_qpread_t`, call `dns_qpread_destroy()` to release the mutex. + * `dns_qpread_t`, which must be allocated on the stack. This gives + * the reader access to a single version of the trie. The reader's + * thread must be registered with `liburcu`, which is normally taken + * care of by `libisc`. Readers are not blocked by any write activity, + * and vice versa. * * For reads that need a stable view of the trie for multiple cycles * of an isc_loop, or which can be used from any thread, call @@ -314,17 +308,15 @@ dns_qp_destroy(dns_qp_t **qptp); */ void -dns_qpmulti_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr, - const dns_qpmethods_t *methods, void *uctx, +dns_qpmulti_create(isc_mem_t *mctx, const dns_qpmethods_t *methods, void *uctx, dns_qpmulti_t **qpmp); /*%< * Create a multi-threaded qp-trie. * * Requires: * \li `mctx` is a pointer to a valid memory context. - * \li 'loopmgr' is a valid loop manager. - * \li `qpmp != NULL && *qpmp == NULL` * \li all the methods are non-NULL + * \li `qpmp != NULL && *qpmp == NULL` * * Ensures: * \li `*qpmp` is a pointer to a valid multi-threaded qp-trie diff --git a/lib/dns/include/dns/view.h b/lib/dns/include/dns/view.h index b72a3dd44f..e801b3a30f 100644 --- a/lib/dns/include/dns/view.h +++ b/lib/dns/include/dns/view.h @@ -259,8 +259,8 @@ struct dns_view { #endif /* HAVE_LMDB */ isc_result_t -dns_view_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr, - dns_rdataclass_t rdclass, const char *name, dns_view_t **viewp); +dns_view_create(isc_mem_t *mctx, dns_rdataclass_t rdclass, const char *name, + dns_view_t **viewp); /*%< * Create a view. * diff --git a/lib/dns/include/dns/zt.h b/lib/dns/include/dns/zt.h index 1118d06428..3ed74911c2 100644 --- a/lib/dns/include/dns/zt.h +++ b/lib/dns/include/dns/zt.h @@ -34,8 +34,7 @@ typedef isc_result_t dns_zt_callback_t(void *arg); void -dns_zt_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr, dns_view_t *view, - dns_zt_t **ztp); +dns_zt_create(isc_mem_t *mctx, dns_view_t *view, dns_zt_t **ztp); /*%< * Creates a new zone table for a view. * diff --git a/lib/dns/qp.c b/lib/dns/qp.c index 60acf13ed5..7633c4d5aa 100644 --- a/lib/dns/qp.c +++ b/lib/dns/qp.c @@ -28,17 +28,16 @@ #include #include -#include #include #include #include -#include #include #include #include #include #include #include +#include #include #include @@ -426,17 +425,18 @@ static void realloc_chunk_arrays(dns_qp_t *qp, qp_chunk_t newmax) { size_t oldptrs = sizeof(qp->base->ptr[0]) * qp->chunk_max; size_t newptrs = sizeof(qp->base->ptr[0]) * newmax; - size_t allbytes = sizeof(dns_qpbase_t) + newptrs; + size_t size = STRUCT_FLEX_SIZE(qp->base, ptr, newmax); if (qp->base == NULL || qpbase_unref(qp)) { - qp->base = isc_mem_reallocate(qp->mctx, qp->base, allbytes); + qp->base = isc_mem_reallocate(qp->mctx, qp->base, size); } else { dns_qpbase_t *oldbase = qp->base; - qp->base = isc_mem_allocate(qp->mctx, allbytes); + qp->base = isc_mem_allocate(qp->mctx, size); memmove(&qp->base->ptr[0], &oldbase->ptr[0], oldptrs); } memset(&qp->base->ptr[qp->chunk_max], 0, newptrs - oldptrs); isc_refcount_init(&qp->base->refcount, 1); + qp->base->magic = QPBASE_MAGIC; /* usage array is exclusive to the writer */ size_t oldusage = sizeof(qp->usage[0]) * qp->chunk_max; @@ -555,12 +555,14 @@ chunk_usage(dns_qp_t *qp, qp_chunk_t chunk) { */ static void chunk_discount(dns_qp_t *qp, qp_chunk_t chunk) { - if (qp->usage[chunk].phase == 0) { - INSIST(qp->used_count >= qp->usage[chunk].used); - INSIST(qp->free_count >= qp->usage[chunk].free); - qp->used_count -= qp->usage[chunk].used; - qp->free_count -= qp->usage[chunk].free; + if (qp->usage[chunk].discounted) { + return; } + INSIST(qp->used_count >= qp->usage[chunk].used); + INSIST(qp->free_count >= qp->usage[chunk].free); + qp->used_count -= qp->usage[chunk].used; + qp->free_count -= qp->usage[chunk].free; + qp->usage[chunk].discounted = true; } /* @@ -622,109 +624,95 @@ recycle(dns_qp_t *qp) { } /* - * At the end of a transaction, mark empty but immutable chunks for - * reclamation later. Returns true when chunks need reclaiming later. + * asynchronous cleanup */ -static bool -defer_chunk_reclamation(dns_qp_t *qp, isc_qsbr_phase_t phase) { - unsigned int reclaim = 0; +static void +reclaim_chunks_cb(struct rcu_head *arg) { + qp_rcuctx_t *rcuctx = isc_urcu_container(arg, qp_rcuctx_t, rcu_head); + REQUIRE(QPRCU_VALID(rcuctx)); + dns_qpmulti_t *multi = rcuctx->multi; + REQUIRE(QPMULTI_VALID(multi)); - for (qp_chunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { - if (chunk != qp->bump && chunk_usage(qp, chunk) == 0 && - qp->usage[chunk].exists && qp->usage[chunk].immutable && - qp->usage[chunk].phase == 0) - { - chunk_discount(qp, chunk); - qp->usage[chunk].phase = phase; - reclaim++; - } - } + LOCK(&multi->mutex); - if (reclaim > 0) { - LOG_STATS("qp will reclaim %u chunks in phase %u", reclaim, - phase); - } + dns_qp_t *qp = &multi->writer; + REQUIRE(QP_VALID(qp)); - return (reclaim > 0); -} - -static bool -reclaim_chunks(dns_qp_t *qp, isc_qsbr_phase_t phase) { unsigned int free = 0; - bool more = false; - isc_nanosecs_t start = isc_time_monotonic(); - for (qp_chunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { - if (qp->usage[chunk].phase == phase) { - if (qp->usage[chunk].snapshot) { - /* cleanup when snapshot is destroyed */ - qp->usage[chunk].snapfree = true; - } else { - chunk_free(qp, chunk); - free++; - } - } else if (qp->usage[chunk].phase != 0) { - /* - * We need to reclaim more of this trie's memory - * on a later qsbr callback. - */ - more = true; + for (unsigned int i = 0; i < rcuctx->count; i++) { + qp_chunk_t chunk = rcuctx->chunk[i]; + if (qp->usage[chunk].snapshot) { + /* cleanup when snapshot is destroyed */ + qp->usage[chunk].snapfree = true; + } else { + chunk_free(qp, chunk); + free++; } } + isc_mem_putanddetach(&rcuctx->mctx, rcuctx, + STRUCT_FLEX_SIZE(rcuctx, chunk, rcuctx->count)); + isc_nanosecs_t time = isc_time_monotonic() - start; recycle_time += time; if (free > 0) { - LOG_STATS("qp reclaim" PRItime "phase %u free %u chunks", time, - phase, free); + LOG_STATS("qp reclaim" PRItime "free %u chunks", time, free); LOG_STATS("qp reclaim leaf %u live %u used %u free %u hold %u", qp->leaf_count, qp->used_count - qp->free_count, qp->used_count, qp->free_count, qp->hold_count); } - return (more); + UNLOCK(&multi->mutex); } /* - * List of `dns_qpmulti_t`s that have chunks to be reclaimed. - */ -static ISC_ASTACK(dns_qpmulti_t) qsbr_work; - -/* - * When a grace period has passed, this function reclaims any unused memory. + * At the end of a transaction, schedule empty but immutable chunks + * for reclamation later. */ static void -qp_qsbr_reclaimer(isc_qsbr_phase_t phase) { - ISC_STACK(dns_qpmulti_t) drain = ISC_ASTACK_TO_STACK(qsbr_work); - while (!ISC_STACK_EMPTY(drain)) { - /* lock before pop */ - dns_qpmulti_t *multi = ISC_STACK_TOP(drain); - INSIST(QPMULTI_VALID(multi)); - LOCK(&multi->mutex); - ISC_STACK_POP(drain, cleanup); - if (multi->writer.destroy) { - UNLOCK(&multi->mutex); - dns_qpmulti_destroy(&multi); - } else { - if (reclaim_chunks(&multi->writer, phase)) { - /* more to do next time */ - ISC_ASTACK_PUSH(qsbr_work, multi, cleanup); - } - UNLOCK(&multi->mutex); +reclaim_chunks(dns_qpmulti_t *multi) { + dns_qp_t *qp = &multi->writer; + + unsigned int count = 0; + for (qp_chunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { + if (chunk != qp->bump && chunk_usage(qp, chunk) == 0 && + qp->usage[chunk].exists && qp->usage[chunk].immutable && + !qp->usage[chunk].discounted) + { + count++; } } -} -/* - * Register `qp_qsbr_reclaimer()` with QSBR at startup. - */ -static void -qp_qsbr_register(void) ISC_CONSTRUCTOR; -static void -qp_qsbr_register(void) { - isc_qsbr_register(qp_qsbr_reclaimer); + if (count == 0) { + return; + } + + qp_rcuctx_t *rcuctx = + isc_mem_get(qp->mctx, STRUCT_FLEX_SIZE(rcuctx, chunk, count)); + *rcuctx = (qp_rcuctx_t){ + .magic = QPRCU_MAGIC, + .multi = multi, + .count = count, + }; + isc_mem_attach(qp->mctx, &rcuctx->mctx); + + unsigned int i = 0; + for (qp_chunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { + if (chunk != qp->bump && chunk_usage(qp, chunk) == 0 && + qp->usage[chunk].exists && qp->usage[chunk].immutable && + !qp->usage[chunk].discounted) + { + rcuctx->chunk[i++] = chunk; + chunk_discount(qp, chunk); + } + } + + isc_urcu_cleanup(rcuctx, rcu_head, reclaim_chunks_cb); + + LOG_STATS("qp will reclaim %u chunks", count); } /* @@ -1121,6 +1109,8 @@ dns_qpmulti_update(dns_qpmulti_t *multi, dns_qp_t **qptp) { memmove(rollback, qp, sizeof(*rollback)); /* can be uninitialized on the first transaction */ if (rollback->base != NULL) { + INSIST(QPBASE_VALID(rollback->base)); + INSIST(qp->usage != NULL && qp->chunk_max > 0); /* paired with either _commit() or _rollback() */ isc_refcount_increment(&rollback->base->refcount); size_t usage_bytes = sizeof(qp->usage[0]) * qp->chunk_max; @@ -1187,12 +1177,8 @@ dns_qpmulti_commit(dns_qpmulti_t *multi, dns_qp_t **qptp) { recycle(qp); } - /* the reclamation phase must be sampled after the commit */ - isc_qsbr_phase_t phase = isc_qsbr_phase(multi->loopmgr); - if (defer_chunk_reclamation(qp, phase)) { - ISC_ASTACK_ADD(qsbr_work, multi, cleanup); - isc_qsbr_activate(multi->loopmgr, phase); - } + /* schedule the rest for later */ + reclaim_chunks(multi); *qptp = NULL; UNLOCK(&multi->mutex); @@ -1282,31 +1268,11 @@ dns_qpmulti_query(dns_qpmulti_t *multi, dns_qpread_t *qp) { REQUIRE(QPMULTI_VALID(multi)); REQUIRE(qp != NULL); - /* we MUST be in an isc_loop thread */ - qp->tid = isc_tid(); - REQUIRE(qp->tid != ISC_TID_UNKNOWN); - dns_qpmulti_t *whence = reader_open(multi, qp); INSIST(whence == multi); -} -/* - * a locked read takes the mutex - */ - -void -dns_qpmulti_lockedread(dns_qpmulti_t *multi, dns_qpread_t *qp) { - REQUIRE(QPMULTI_VALID(multi)); - REQUIRE(qp != NULL); - - /* we MUST NOT be in an isc_loop thread */ qp->tid = isc_tid(); - REQUIRE(qp->tid == ISC_TID_UNKNOWN); - - LOCK(&multi->mutex); - - dns_qpmulti_t *whence = reader_open(multi, qp); - INSIST(whence == multi); + rcu_read_lock(); } void @@ -1314,10 +1280,8 @@ dns_qpread_destroy(dns_qpmulti_t *multi, dns_qpread_t *qp) { REQUIRE(QPMULTI_VALID(multi)); REQUIRE(QP_VALID(qp)); REQUIRE(qp->tid == isc_tid()); - if (qp->tid == ISC_TID_UNKNOWN) { - UNLOCK(&multi->mutex); - } *qp = (dns_qpread_t){}; + rcu_read_unlock(); } /* @@ -1406,8 +1370,7 @@ dns_qp_create(isc_mem_t *mctx, const dns_qpmethods_t *methods, void *uctx, } void -dns_qpmulti_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr, - const dns_qpmethods_t *methods, void *uctx, +dns_qpmulti_create(isc_mem_t *mctx, const dns_qpmethods_t *methods, void *uctx, dns_qpmulti_t **qpmp) { REQUIRE(qpmp != NULL && *qpmp == NULL); @@ -1415,8 +1378,6 @@ dns_qpmulti_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr, *multi = (dns_qpmulti_t){ .magic = QPMULTI_MAGIC, .reader_ref = INVALID_REF, - .loopmgr = loopmgr, - .cleanup = ISC_SLINK_INITIALIZER, }; isc_mutex_init(&multi->mutex); ISC_LIST_INIT(multi->snapshots); @@ -1468,29 +1429,48 @@ dns_qp_destroy(dns_qp_t **qptp) { isc_mem_putanddetach(&qp->mctx, qp, sizeof(*qp)); } +static void +qpmulti_destroy_cb(struct rcu_head *arg) { + qp_rcuctx_t *rcuctx = isc_urcu_container(arg, qp_rcuctx_t, rcu_head); + REQUIRE(QPRCU_VALID(rcuctx)); + dns_qpmulti_t *multi = rcuctx->multi; + REQUIRE(QPMULTI_VALID(multi)); + dns_qp_t *qp = &multi->writer; + REQUIRE(QP_VALID(qp)); + + REQUIRE(rcuctx->count == 0); + + destroy_guts(qp); + isc_mutex_destroy(&multi->mutex); + isc_mem_putanddetach(&rcuctx->mctx, rcuctx, + STRUCT_FLEX_SIZE(rcuctx, chunk, rcuctx->count)); + isc_mem_putanddetach(&qp->mctx, multi, sizeof(*multi)); +} + void dns_qpmulti_destroy(dns_qpmulti_t **qpmp) { + dns_qp_t *qp = NULL; + dns_qpmulti_t *multi = NULL; + qp_rcuctx_t *rcuctx = NULL; + REQUIRE(qpmp != NULL); REQUIRE(QPMULTI_VALID(*qpmp)); - dns_qpmulti_t *multi = *qpmp; - dns_qp_t *qp = &multi->writer; + multi = *qpmp; + qp = &multi->writer; *qpmp = NULL; REQUIRE(QP_VALID(qp)); REQUIRE(multi->rollback == NULL); REQUIRE(ISC_LIST_EMPTY(multi->snapshots)); - LOCK(&multi->mutex); - if (ISC_SLINK_LINKED(multi, cleanup)) { - qp->destroy = true; - UNLOCK(&multi->mutex); - } else { - destroy_guts(qp); - UNLOCK(&multi->mutex); - isc_mutex_destroy(&multi->mutex); - isc_mem_putanddetach(&qp->mctx, multi, sizeof(*multi)); - } + rcuctx = isc_mem_get(qp->mctx, STRUCT_FLEX_SIZE(rcuctx, chunk, 0)); + *rcuctx = (qp_rcuctx_t){ + .magic = QPRCU_MAGIC, + .multi = multi, + }; + isc_mem_attach(qp->mctx, &rcuctx->mctx); + isc_urcu_cleanup(rcuctx, rcu_head, qpmulti_destroy_cb); } /*********************************************************************** diff --git a/lib/dns/qp_p.h b/lib/dns/qp_p.h index 90d05a20d8..e9054da953 100644 --- a/lib/dns/qp_p.h +++ b/lib/dns/qp_p.h @@ -301,11 +301,9 @@ ref_cell(qp_ref_t ref) { * transactions, but it must become immutable when an update * transaction is opened.) * - * When a chunk becomes empty (wrt the latest version of the trie), we - * note the QSBR phase after which no old versions of the trie will - * need the chunk and it will be safe to free(). There are a few flags - * used to mark which chunks are still needed by snapshots after the - * chunks have passed their normal reclamation phase. + * There are a few flags used to mark which chunks are still needed by + * snapshots after the chunks have passed their normal reclamation + * phase. */ typedef struct qp_usage { /*% the allocation point, increases monotonically */ @@ -316,14 +314,14 @@ typedef struct qp_usage { bool exists : 1; /*% is this chunk shared? [MT] */ bool immutable : 1; + /*% already subtracted from multi->*_count [MT] */ + bool discounted : 1; /*% is a snapshot using this chunk? [MT] */ bool snapshot : 1; /*% tried to free it but a snapshot needs it [MT] */ bool snapfree : 1; /*% for mark/sweep snapshot flag updates [MT] */ bool snapmark : 1; - /*% in which phase did this chunk become unused? [MT] */ - isc_qsbr_phase_t phase : ISC_QSBR_PHASE_BITS; } qp_usage_t; /* @@ -335,7 +333,7 @@ typedef struct qp_usage { * A `dns_qpbase_t` counts references from `dns_qp_t` objects and * from packed readers, but not from `dns_qpread_t` nor from * `dns_qpsnap_t` objects. Refcount adjustments for `dns_qpread_t` - * would wreck multicore scalability; instead we rely on QSBR. + * would wreck multicore scalability; instead we rely on RCU. * * The `usage` array determines when a chunk is no longer needed: old * chunk pointers in old `base` arrays are ignored. (They can become @@ -349,10 +347,25 @@ typedef struct qp_usage { * busy slots that are in use by readers. */ struct dns_qpbase { + unsigned int magic; isc_refcount_t refcount; qp_node_t *ptr[]; }; +/* + * Chunks that may be in use by readers are reclaimed asynchronously. + * When a transaction commits, immutable chunks that are now empty are + * listed in a `qp_rcuctx_t` structure and passed to `call_rcu()`. + */ +typedef struct qp_rcuctx { + unsigned int magic; + struct rcu_head rcu_head; + isc_mem_t *mctx; + dns_qpmulti_t *multi; + qp_chunk_t count; + qp_chunk_t chunk[]; +} qp_rcuctx_t; + /* * Returns true when the base array can be free()d. */ @@ -382,10 +395,14 @@ ref_ptr(dns_qpreadable_t qpr, qp_ref_t ref) { #define QPITER_MAGIC ISC_MAGIC('q', 'p', 'i', 't') #define QPMULTI_MAGIC ISC_MAGIC('q', 'p', 'm', 'v') #define QPREADER_MAGIC ISC_MAGIC('q', 'p', 'r', 'x') +#define QPBASE_MAGIC ISC_MAGIC('q', 'p', 'b', 'p') +#define QPRCU_MAGIC ISC_MAGIC('q', 'p', 'c', 'b') -#define QP_VALID(p) ISC_MAGIC_VALID(p, QP_MAGIC) -#define QPITER_VALID(p) ISC_MAGIC_VALID(p, QPITER_MAGIC) -#define QPMULTI_VALID(p) ISC_MAGIC_VALID(p, QPMULTI_MAGIC) +#define QP_VALID(qp) ISC_MAGIC_VALID(qp, QP_MAGIC) +#define QPITER_VALID(qp) ISC_MAGIC_VALID(qp, QPITER_MAGIC) +#define QPMULTI_VALID(qp) ISC_MAGIC_VALID(qp, QPMULTI_MAGIC) +#define QPBASE_VALID(qp) ISC_MAGIC_VALID(qp, QPBASE_MAGIC) +#define QPRCU_VALID(qp) ISC_MAGIC_VALID(qp, QPRCU_MAGIC) /* * Polymorphic initialization of the `dns_qpreader_t` prefix. @@ -504,8 +521,6 @@ struct dns_qp { enum { QP_NONE, QP_WRITE, QP_UPDATE } transaction_mode : 2; /*% compact the entire trie [MT] */ bool compact_all : 1; - /*% destroy the trie asynchronously [MT] */ - bool destroy : 1; /*% optionally when compiled with fuzzing support [MT] */ bool write_protect : 1; }; @@ -517,9 +532,6 @@ struct dns_qp { * of the trie. See the "packed reader nodes" section below for a * description of what it points to. * - * We need access to the loopmgr to hook into QSBR safe memory reclamation. - * It is constant after initialization. - * * The main object under the protection of the mutex is the `writer` * containing all the allocator state. There can be a backup copy when * we want to be able to rollback an update transaction. @@ -537,8 +549,6 @@ struct dns_qp { */ struct dns_qpmulti { uint32_t magic; - /*% safe memory reclamation context (const) */ - isc_loopmgr_t *loopmgr; /*% pointer to current packed reader */ atomic_ptr(qp_node_t) reader; /*% the mutex protects the rest of this structure */ @@ -551,8 +561,6 @@ struct dns_qpmulti { dns_qp_t *rollback; /*% all snapshots of this trie */ ISC_LIST(dns_qpsnap_t) snapshots; - /*% safe memory reclamation work list */ - ISC_SLINK(dns_qpmulti_t) cleanup; }; /*********************************************************************** @@ -878,13 +886,15 @@ static inline dns_qpmulti_t * unpack_reader(dns_qpreader_t *qp, qp_node_t *reader) { INSIST(reader_valid(reader)); dns_qpmulti_t *multi = node_pointer(&reader[0]); + dns_qpbase_t *base = node_pointer(&reader[1]); INSIST(QPMULTI_VALID(multi)); + INSIST(QPBASE_VALID(base)); *qp = (dns_qpreader_t){ .magic = QP_MAGIC, .uctx = multi->writer.uctx, .methods = multi->writer.methods, .root_ref = node32(&reader[1]), - .base = node_pointer(&reader[1]), + .base = base, }; return (multi); } diff --git a/lib/dns/rbtdb.c b/lib/dns/rbtdb.c index c94f935d55..2bc92f3835 100644 --- a/lib/dns/rbtdb.c +++ b/lib/dns/rbtdb.c @@ -9583,8 +9583,8 @@ new_gluelist(isc_mem_t *mctx, dns_name_t *name) { static void free_gluelist_rcu(struct rcu_head *rcu_head) { - rbtdb_glue_t *glue = caa_container_of(rcu_head, rbtdb_glue_t, rcu_head); - __tsan_acquire(glue); + rbtdb_glue_t *glue = isc_urcu_container(rcu_head, rbtdb_glue_t, + rcu_head); free_gluelist(glue); } @@ -9600,8 +9600,7 @@ free_gluetable(rbtdb_version_t *rbtversion) { caa_container_of(node, rdatasetheader_t, wfs_node); rbtdb_glue_t *glue = rcu_xchg_pointer(&header->glue_list, NULL); - __tsan_release(glue); - call_rcu(&glue->rcu_head, free_gluelist_rcu); + isc_urcu_cleanup(glue, rcu_head, free_gluelist_rcu); } rcu_read_unlock(); } diff --git a/lib/dns/view.c b/lib/dns/view.c index 5f58142627..ee2a5c9a06 100644 --- a/lib/dns/view.c +++ b/lib/dns/view.c @@ -79,8 +79,7 @@ #define DEFAULT_EDNS_BUFSIZE 1232 isc_result_t -dns_view_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr, - dns_rdataclass_t rdclass, const char *name, +dns_view_create(isc_mem_t *mctx, dns_rdataclass_t rdclass, const char *name, dns_view_t **viewp) { dns_view_t *view = NULL; isc_result_t result; @@ -135,7 +134,7 @@ dns_view_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr, isc_rwlock_init(&view->sfd_lock); view->zonetable = NULL; - dns_zt_create(mctx, loopmgr, view, &view->zonetable); + dns_zt_create(mctx, view, &view->zonetable); result = dns_fwdtable_create(mctx, &view->fwdtable); if (result != ISC_R_SUCCESS) { diff --git a/lib/dns/zone.c b/lib/dns/zone.c index 7481d3b12f..be7eb70c90 100644 --- a/lib/dns/zone.c +++ b/lib/dns/zone.c @@ -10619,8 +10619,6 @@ failure: cleanup: dns_db_detach(&kfetch->db); - /* The zone must be managed */ - INSIST(kfetch->zone->loop != NULL); isc_refcount_decrement(&zone->irefs); if (dns_rdataset_isassociated(keydataset)) { @@ -14658,12 +14656,15 @@ zone_timer_set(dns_zone_t *zone, isc_time_t *next, isc_time_t *now) { isc_time_subtract(next, now, &interval); } - if (zone->timer == NULL) { + if (zone->loop == NULL) { + zone_debuglog(zone, __func__, 10, "zone is not managed"); + } else if (zone->timer == NULL) { isc_refcount_increment0(&zone->irefs); isc_timer_create(zone->loop, zone_timer, zone, &zone->timer); } - - isc_timer_start(zone->timer, isc_timertype_once, &interval); + if (zone->timer != NULL) { + isc_timer_start(zone->timer, isc_timertype_once, &interval); + } } static void @@ -18169,7 +18170,8 @@ dns_zonemgr_managezone(dns_zonemgr_t *zmgr, dns_zone_t *zone) { REQUIRE(zone->timer == NULL); REQUIRE(zone->zmgr == NULL); - zone->loop = isc_loop_get(zmgr->loopmgr, zone->tid); + isc_loop_t *loop = isc_loop_get(zmgr->loopmgr, zone->tid); + isc_loop_attach(loop, &zone->loop); zonemgr_keymgmt_add(zmgr, zone, &zone->kfio); INSIST(zone->kfio != NULL); @@ -18200,6 +18202,13 @@ dns_zonemgr_releasezone(dns_zonemgr_t *zmgr, dns_zone_t *zone) { ENSURE(zone->kfio == NULL); } + if (zone->timer != NULL) { + isc_refcount_decrement(&zone->irefs); + isc_timer_destroy(&zone->timer); + } + + isc_loop_detach(&zone->loop); + /* Detach below, outside of the write lock. */ zone->zmgr = NULL; @@ -20658,8 +20667,6 @@ done: } cleanup: - /* The zone must be managed */ - INSIST(nsfetch->zone->loop != NULL); isc_refcount_decrement(&zone->irefs); if (dns_rdataset_isassociated(nsrrset)) { @@ -21922,7 +21929,7 @@ dns_zone_link(dns_zone_t *zone, dns_zone_t *raw) { LOCK_ZONE(zone); LOCK_ZONE(raw); - raw->loop = zone->loop; + isc_loop_attach(zone->loop, &raw->loop); /* dns_zone_attach(raw, &zone->raw); */ isc_refcount_increment(&raw->references); diff --git a/lib/dns/zt.c b/lib/dns/zt.c index cf91f7285b..01bed3c290 100644 --- a/lib/dns/zt.c +++ b/lib/dns/zt.c @@ -94,15 +94,14 @@ static dns_qpmethods_t ztqpmethods = { }; void -dns_zt_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr, dns_view_t *view, - dns_zt_t **ztp) { +dns_zt_create(isc_mem_t *mctx, dns_view_t *view, dns_zt_t **ztp) { dns_qpmulti_t *multi = NULL; dns_zt_t *zt = NULL; REQUIRE(ztp != NULL && *ztp == NULL); REQUIRE(view != NULL); - dns_qpmulti_create(mctx, loopmgr, &ztqpmethods, view, &multi); + dns_qpmulti_create(mctx, &ztqpmethods, view, &multi); zt = isc_mem_get(mctx, sizeof(*zt)); *zt = (dns_zt_t){ @@ -177,11 +176,8 @@ dns_zt_find(dns_zt_t *zt, const dns_name_t *name, dns_ztfind_t options, REQUIRE(VALID_ZT(zt)); REQUIRE(exactopts != exactmask); - if (isc_tid() == ISC_TID_UNKNOWN) { - dns_qpmulti_lockedread(zt->multi, &qpr); - } else { - dns_qpmulti_query(zt->multi, &qpr); - } + dns_qpmulti_query(zt->multi, &qpr); + if (exactopts == DNS_ZTFIND_EXACT) { result = dns_qp_getname(&qpr, name, &pval, &ival); } else if (exactopts == DNS_ZTFIND_NOEXACT) { diff --git a/lib/isc/Makefile.am b/lib/isc/Makefile.am index 7d74517464..3d9f7366a1 100644 --- a/lib/isc/Makefile.am +++ b/lib/isc/Makefile.am @@ -66,7 +66,6 @@ libisc_la_HEADERS = \ include/isc/pause.h \ include/isc/portset.h \ include/isc/quota.h \ - include/isc/qsbr.h \ include/isc/radix.h \ include/isc/random.h \ include/isc/ratelimiter.h \ @@ -81,7 +80,6 @@ libisc_la_HEADERS = \ include/isc/siphash.h \ include/isc/sockaddr.h \ include/isc/spinlock.h \ - include/isc/stack.h \ include/isc/stats.h \ include/isc/stdio.h \ include/isc/stdtime.h \ @@ -174,7 +172,6 @@ libisc_la_SOURCES = \ picohttpparser.h \ portset.c \ quota.c \ - qsbr.c \ radix.c \ random.c \ ratelimiter.c \ diff --git a/lib/isc/async.c b/lib/isc/async.c index db4388964e..1afa2a3027 100644 --- a/lib/isc/async.c +++ b/lib/isc/async.c @@ -107,8 +107,7 @@ isc__async_cb(uv_async_t *handle) { */ struct cds_wfcq_node *node, *next; __cds_wfcq_for_each_blocking_safe(&jobs.head, &jobs.tail, node, next) { - isc_job_t *job = caa_container_of(node, isc_job_t, wfcq_node); - __tsan_acquire(job); + isc_job_t *job = isc_urcu_container(node, isc_job_t, wfcq_node); job->cb(job->cbarg); diff --git a/lib/isc/histo.c b/lib/isc/histo.c index 395b27b6b6..870c3c0152 100644 --- a/lib/isc/histo.c +++ b/lib/isc/histo.c @@ -27,13 +27,6 @@ #include #include -/* - * XXXFANF to be added to by a commmit in a qp-trie - * feature branch - */ -#define STRUCT_FLEX_SIZE(pointer, member, count) \ - (sizeof(*(pointer)) + sizeof(*(pointer)->member) * (count)) - /* * XXXFANF this should probably be in too */ diff --git a/lib/isc/ht.c b/lib/isc/ht.c index d9de112e54..4d771882dd 100644 --- a/lib/isc/ht.c +++ b/lib/isc/ht.c @@ -287,7 +287,7 @@ isc__ht_add(isc_ht_t *ht, const unsigned char *key, const uint32_t keysize, hash = hash_32(hashval, ht->hashbits[idx]); - node = isc_mem_get(ht->mctx, sizeof(*node) + keysize); + node = isc_mem_get(ht->mctx, STRUCT_FLEX_SIZE(node, key, keysize)); *node = (isc_ht_node_t){ .keysize = keysize, .hashval = hashval, @@ -396,7 +396,7 @@ isc__ht_delete(isc_ht_t *ht, const unsigned char *key, const uint32_t keysize, prev->next = node->next; } isc_mem_put(ht->mctx, node, - sizeof(*node) + node->keysize); + STRUCT_FLEX_SIZE(node, key, node->keysize)); ht->count--; return (ISC_R_SUCCESS); diff --git a/lib/isc/include/isc/loop.h b/lib/isc/include/isc/loop.h index 93330e60d5..de94e25a22 100644 --- a/lib/isc/include/isc/loop.h +++ b/lib/isc/include/isc/loop.h @@ -69,17 +69,6 @@ isc_loopmgr_run(isc_loopmgr_t *loopmgr); *\li 'loopmgr' is a valid loop manager. */ -void -isc_loopmgr_wakeup(isc_loopmgr_t *loopmgr); -/*%< - * Send no-op events to wake up all running loops in 'loopmgr' except - * the current one. (See .) - * - * Requires: - *\li 'loopmgr' is a valid loop manager. - *\li We are in a running loop. - */ - void isc_loopmgr_pause(isc_loopmgr_t *loopmgr); /*%< diff --git a/lib/isc/include/isc/qsbr.h b/lib/isc/include/isc/qsbr.h deleted file mode 100644 index 242c6ac45b..0000000000 --- a/lib/isc/include/isc/qsbr.h +++ /dev/null @@ -1,282 +0,0 @@ -/* - * Copyright (C) Internet Systems Consortium, Inc. ("ISC") - * - * SPDX-License-Identifier: MPL-2.0 - * - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, you can obtain one at https://mozilla.org/MPL/2.0/. - * - * See the COPYRIGHT file distributed with this work for additional - * information regarding copyright ownership. - */ - -#pragma once - -#include -#include -#include -#include - -/* - * Quiescent state based reclamation - * ================================= - * - * QSBR is a safe memory reclamation algorithm for lock-free data - * structures such as a qp-trie. - * - * When an object is unlinked from a lock-free data structure, it - * cannot be free()d immediately, because there can still be readers - * accessing the object via an old version of the data structure. SMR - * algorithms determine when it is safe to reclaim memory after it has - * been unlinked. - * - * With QSBR, reading a data structure is wait-free. All that is - * required is an atomic load to get the data structure's current - * root; there is no need to explicitly mark any read-side critical - * section. - * - * QSBR is used by RCU (read-copy-update) in the Linux kernel. BIND's - * implementation also uses some ideas from EBR (epoch-based reclamation). - * The following summary is based on the overview in the paper - * "performance of memory reclamation for lockless synchronization", - * (http://csng.cs.toronto.edu/publication_files/0000/0159/jpdc07.pdf). - * - * Aside: This QSBR implementation is somewhat different from the one - * in liburcu, described in the paper "user-level implementations of - * read-copy update", (https://www.efficios.com/publications/), which - * contains the amusing comment: - * - * BIND, a major domain-name server used for Internet domain-name - * resolution, is facing scalability issues. Since domain names - * are read often but rarely updated, using user-level RCU might - * be beneficial. - * - * A "quiescent state" is a point when a thread is not accessing any - * lock-free data structure. After passing through a quiescent state, - * a thread can no longer access versions of a data structure that - * were replaced before that point. In BIND, we use a point in the - * event loop (a uv_prepare_t callback) to identify a quiescent state. - * - * Aside: a prepare handle runs its callbacks before the loop sleeps, - * which reduces reclaim latency (unlike a check handle) and it does - * not affect timeout calculations (unlike an idle handle). - * - * A "grace period" is any time interval such that after the end of - * the grace period, all objects removed before the start of the grace - * period can safely be reclaimed. Different SMR algorithms detect - * grace periods with varying degrees of tightness or looseness. - * - * QSBR uses quiescent states to detect grace periods: a grace period - * is a time interval in which every thread passes through a quiescent - * state. (This is a safe over-estimate.) A "fuzzy barrier" is used to - * find out when all threads have passed through a quiescent state. - * - * NOTE: In BIND this means that code which is not running in an event - * loop thread (such as an isc_work / uv_work_t callback) must use - * locking (not lock-free) data structure accessors. - * - * Because a quiescent state happens once per event loop, a grace - * period takes roughly the same amount of time as the slowest event - * loop in each cycle. - * - * Similar to the paper linked above, this QSBR implementation uses a - * variant of the EBR fuzzy barrier. Like EBR, each grace period is - * numbered with a "phase", which cycles round 1,2,3,1,2,3,... (Phases - * are called epochs in EBR, but I think "phase" is a better metaphor.) - * When entering the fuzzy barrier, each thread updates its local phase - * to match the global phase, keeping a global count of the number of - * threads still to pass. When this count reaches zero, it is the end of - * the grace period; the global phase is updated and reclamation is - * triggered. - * - * Note that threads are usually slightly out-of-phase wrt the global - * grace period. At any particular point in time, there will be some - * threads in the current global phase, and some in the previous - * global phase. EBR has three phases because that is the minimum - * number that leaves one phase unoccupied by readers. Any objects that - * were detached from the data structure in the third phase can be - * reclaimed after the start of the current phase, because a grace - * period (the previous phase) has elapsed since the objects were - * detached. - * - * A phase number can be used by a lock-free data structure (such as a - * qp-trie) to record when an object was detached. QSBR calls the data - * structure's reclaimer function, passing a phase number indicating - * that objects detached in that phase can now be reclaimed - * - * In general, there will be several (maybe many) write operations - * during a grace period. The lock-free data structures that use QSBR - * will collect their reclamation work from all these writes into a - * batch per phase, i.e. per grace period. - * - * There is some example code in `doc/dev/qsbr.md`, with pointers to - * less terse introductions to QSBR and other overview material. - */ - -#define ISC_QSBR_PHASE_BITS 2 - -typedef unsigned int isc_qsbr_phase_t; -/*%< - * A grace period phase number. It can be stored in a bitfield of size - * ISC_QSBR_PHASE_BITS. You can use zero to indicate "no phase". - * (Don't assume the maximum is three: We might want to increase the - * number of phases so that there is more than one unoccupied phase. - * This would allow concurrent reclamation of objects released in - * multiple unoccupied phases.) - */ - -typedef void -isc_qsbreclaimer_t(isc_qsbr_phase_t phase); -/*%< - * The type of memory reclaimer callback functions. - * - * The `phase` identifies which objects are to be reclaimed. - * - * An isc_qsbreclaimer_t can call isc_qsbr_activate() if it could not - * reclaim everything and needs to be called again. - */ - -typedef struct isc_qsbr_registered { - ISC_SLINK(struct isc_qsbr_registered) link; - isc_qsbreclaimer_t *func; -} isc_qsbr_registered_t; -/*%< - * Each reclaimer callback has a static `isc_qsbr_registered_t` object - * so that QSBR can find it. - */ - -void -isc__qsbr_register(isc_qsbr_registered_t *reg); -/*%< - * Requires: - * \li reclaimer->link is not linked - * \li reclaimer->func is not NULL - */ - -#define isc_qsbr_register(cb) \ - do { \ - static isc_qsbr_registered_t registration = { \ - .link = ISC_SLINK_INITIALIZER, \ - .func = cb, \ - }; \ - isc__qsbr_register(®istration); \ - } while (0) -/*%< - * Register a callback function with QSBR. This macro should be used - * inside an `ISC_CONSTRUCTOR` function. There should be one callback - * for eack lock-free data structure implementation, which is able to - * reclaim all the unused memory across all instances of its data - * structure. - */ - -isc_qsbr_phase_t -isc_qsbr_phase(isc_loopmgr_t *loopmgr); -/*%< - * Get the current phase, to use for marking detached objects. - * - * To commit a write that requires cleanup, the ordering must be: - * - * - Use atomic_store_release() to commit the data structure's new - * root pointer; release ordering ensures that the interior changes - * are written before the root pointer. - * - * - Call isc_qsbr_phase() to get the phase to be used for marking - * objects to reclaim. This must happen after the commit, to ensure - * there is at least one grace period between commit and cleanup. - * - * - Pass the same phase to isc_qsbr_activate() so that the reclaimer - * will be called after a grace period has passed. - */ - -void -isc_qsbr_activate(isc_loopmgr_t *loopmgr, isc_qsbr_phase_t phase); -/*%< - * Tell QSBR that objects have been detached and will need reclaiming - * after a grace period. - */ - -/*********************************************************************** - * - * private parts - */ - -/* - * Accessors and constructors for the `grace` variable. - * It contains two bit fields: - * - * - the global phase in the lower ISC_QSBR_PHASE_BITS - * - * - a thread counter in the upper bits - */ - -#define ISC_QSBR_ONE_THREAD (1 << ISC_QSBR_PHASE_BITS) -#define ISC_QSBR_PHASE_MAX (ISC_QSBR_ONE_THREAD - 1) - -#define ISC_QSBR_GRACE_PHASE(grace) (grace & ISC_QSBR_PHASE_MAX) -#define ISC_QSBR_GRACE_THREADS(grace) (grace >> ISC_QSBR_PHASE_BITS) -#define ISC_QSBR_GRACE(threads, phase) \ - ((threads << ISC_QSBR_PHASE_BITS) | phase) - -typedef struct isc_qsbr { - /* - * The `grace` variable keeps track of the current grace period. - * When the phase changes, the thread counter is set to the number of - * threads that need to observe the new phase before the grace period - * can end. - * - * The thread counter is an add-on to the usual EBR fuzzy barrier. - * Counting threads through the barrier adds multi-thread update - * contention, and in EBR the fuzzy barrier runs frequently enough - * (on every access) that it's important to minimize its cost. With - * QSBR, the fuzzy barrier runs less frequently (roughly, per loop, - * instead of per-callback) so contention is less of a concern. The - * thread counter helps to reduce reclaim latency, because unlike EBR - * we don't probabilistically check, we know deterministically when - * all threads have changed phase. - */ - atomic_uint_fast32_t grace; - - /* - * A flag for each phase indicating that there will be work to - * do, so we don't invoke the reclaim machinery unnecessarily. - * Set by `isc_qsbr_activate()` and cleared before the reclaimer - * functions are invoked (so they can re-set their flag if - * necessary). - */ - atomic_uint_fast32_t activated; - - /* - * The time of the last phase transition (isc_nanosecs_t). Used - * to ensure that grace periods do not last forever. We use - * `isc_time_monotonic()` because we need the same time in all - * threads. (`uv_now()` is different in different threads.) - */ - atomic_uint_fast64_t transition_time; - -} isc_qsbr_t; - -/* - * When we start there is no worker thread yet, so the thread - * count is equal to the number of loops. The global phase starts - * off at one (it must always be nonzero). - */ -#define ISC_QSBR_INITIALIZER(nloops) \ - (isc_qsbr_t) { \ - .grace = ISC_QSBR_GRACE(nloops, 1), \ - .transition_time = isc_time_monotonic(), \ - } - -/* - * For use by tests that need to explicitly drive QSBR phase transitions. - */ -void -isc__qsbr_quiescent_state(isc_loop_t *loop); - -/* - * Used by the loopmgr - */ -void -isc__qsbr_quiescent_cb(uv_prepare_t *handle); -void -isc__qsbr_destroy(isc_loopmgr_t *loopmgr); diff --git a/lib/isc/include/isc/stack.h b/lib/isc/include/isc/stack.h deleted file mode 100644 index 92475b9d43..0000000000 --- a/lib/isc/include/isc/stack.h +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Copyright (C) Internet Systems Consortium, Inc. ("ISC") - * - * SPDX-License-Identifier: MPL-2.0 - * - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, you can obtain one at https://mozilla.org/MPL/2.0/. - * - * See the COPYRIGHT file distributed with this work for additional - * information regarding copyright ownership. - */ - -#pragma once - -#include -#include -#include -#include - -/* - * Single links, aka stack links - */ - -#define ISC_SLINK(type) \ - struct { \ - type *next; \ - } -#define ISC_SLINK_INITIALIZER_TYPE(type) \ - { \ - .next = ISC_LINK_TOMBSTONE(type), \ - } -#define ISC_SLINK_INIT_TYPE(elt, link, type) \ - { \ - (elt)->link.next = ISC_LINK_TOMBSTONE(type); \ - } - -#define ISC_SLINK_INITIALIZER ISC_SLINK_INITIALIZER_TYPE(void) -#define ISC_SLINK_INIT(elt, link) ISC_SLINK_INIT_TYPE(elt, link, void) - -#define ISC_SLINK_LINKED_TYPE(elt, link, type) \ - ((type *)((elt)->link.next) != ISC_LINK_TOMBSTONE(type)) -#define ISC_SLINK_LINKED(elt, link) ISC_SLINK_LINKED_TYPE(elt, link, void) - -#define ISC_SLINK_NEXT(elt, link) ((elt)->link.next) - -/* - * Simple singly-linked stack implementation - */ - -#define ISC_STACK(type) \ - struct { \ - type *top; \ - } - -#define ISC_STACK_INITIALIZER \ - { \ - .top = NULL, \ - } -#define ISC_STACK_INIT(stack) \ - { \ - (stack).top = NULL; \ - } - -#define ISC_STACK_TOP(stack) ((stack).top) -#define ISC_STACK_EMPTY(stack) ((stack).top == NULL) - -#define ISC_STACK_PUSHUNSAFE(stack, elt, link) \ - { \ - (elt)->link.next = (stack).top; \ - (stack).top = (elt); \ - } - -#define ISC_STACK_PUSH(stack, elt, link) \ - { \ - INSIST(!ISC_SLINK_LINKED(elt, link)); \ - ISC_STACK_PUSHUNSAFE(stack, elt, link); \ - } - -/* - * This is slightly round about because when `type` is `void` - * we can't directly access `__top->link.next`. - */ -#define ISC_STACK_POP_TYPE(stack, link, type) \ - ({ \ - type *__top = (stack).top; \ - if (__top != NULL) { \ - type *__next = ISC_SLINK_NEXT((stack).top, link); \ - ISC_SLINK_INIT_TYPE((stack).top, link, type); \ - (stack).top = __next; \ - } \ - __top; \ - }) - -#define ISC_STACK_POP(stack, link) ISC_STACK_POP_TYPE(stack, link, void) - -/* - * Helper to add element if not already on stack. - */ -#define ISC_STACK_ADD(stack, elt, link) \ - { \ - if (!ISC_STACK_LINKED(elt, link)) { \ - ISC_STACK_PUSHUNSAFE(stack, elt, link); \ - } \ - } - -/* - * This is a simplified implementation of the Treiber stack algorithm with - * back-off. - * - * The original paper which describes the algorithm can be found here: - * https://dominoweb.draco.res.ibm.com/58319a2ed2b1078985257003004617ef.html - * - * We sidestep the ABA problem when removing the elements by adding additional - * constraint: once the element has been removed from the stack, it cannot be - * re-added. - * - * This is actually not a problem for planned usage of the ISC_ASTACK where we - * only use ISC_ASTACK_PUSH() to add individual elements and ISC_ASTACK_DRAIN() - * to empty the stack in a single atomic operation. - * - * To make things simple, there's no implementation for ISC_ASTACK_POP() because - * that's complex to implement and requires double-word CAS (for ABA counter). - * - * The pointers are wrapped in structs so that their types are - * distinct, and to match the ISC_LIST macros. - * - * See doc/dev/dev.md for examples. - */ - -#define ISC_ASTACK(type) \ - struct { \ - atomic_ptr(type) atop; \ - } -#define ISC_ASTACK_INITIALIZER \ - { \ - .atop = NULL, \ - } -#define ISC_ASTACK_INIT(stack) \ - { \ - (stack).atop = NULL; \ - } - -/* - * ATOMIC: for performance, this kind of retry loop should use a weak - * CAS with relaxed ordering in the failure case; on success, release - * ordering ensures that writing the element contents happens before - * reading them, following the acquire in ISC_ASTACK_TOP() and _DRAIN(). - */ -#define ISC_ASTACK_PUSHUNSAFE(stack, elt, link) \ - { \ - (elt)->link.next = atomic_load_relaxed(&(stack).atop); \ - while (!atomic_compare_exchange_weak_explicit( \ - &(stack).atop, &ISC_SLINK_NEXT(elt, link), elt, \ - memory_order_release, memory_order_relaxed)) \ - { \ - isc_pause(); \ - } \ - } - -#define ISC_ASTACK_PUSH(stack, elt, link) \ - { \ - INSIST(!ISC_SLINK_LINKED(elt, link)); \ - ISC_ASTACK_PUSHUNSAFE(stack, elt, link); \ - } - -/* - * Helper to add element if not already on stack. - */ -#define ISC_ASTACK_ADD(stack, elt, link) \ - { \ - if (!ISC_SLINK_LINKED(elt, link)) { \ - ISC_ASTACK_PUSHUNSAFE(stack, elt, link); \ - } \ - } - -/* - * ATOMIC: acquire ordering pairs with ISC_ASTACK_PUSHUNSAFEe() - */ -#define ISC_ASTACK_TOP(stack) atomic_load_acquire(&(stack).atop) -#define ISC_ASTACK_EMPTY(stack) (ISC_ASTACK_TOP(stack) == NULL) - -/* - * ATOMIC: acquire ordering pairs with ISC_ASTACK_PUSHUNSAFE() - */ -#define ISC_ASTACK_TO_STACK(stack) \ - { \ - .top = atomic_exchange_acquire(&(stack).atop, NULL), \ - } diff --git a/lib/isc/include/isc/urcu.h b/lib/isc/include/isc/urcu.h index b43746ce63..0b828d92e6 100644 --- a/lib/isc/include/isc/urcu.h +++ b/lib/isc/include/isc/urcu.h @@ -39,6 +39,35 @@ #pragma GCC diagnostic pop +/* + * Help thread sanitizer to understand `call_rcu()`: + * + * The `rcu_head` argument to `call_rcu()` is expected to be embedded + * in a structure defined by the caller, which is named `_rcuctx` in + * these macros. The callback function uses `caa_container_of()` to + * recover the `rcuctx` pointer from the `_rcu_head` pointer that is + * passed to the callback. + * + * We explain the ordering dependency to tsan by releasing `_rcuctx` + * pointer before `call_rcu()` and acquiring it in the callback + * funtion. We pass the outer `_rcuctx` pointer to the `__tsan_` + * barriers, because it should match a pointer that is known by tsan + * to have been returned by `malloc()`. + */ + +#define isc_urcu_cleanup(ptr, member, func) \ + { \ + __tsan_release(ptr); \ + call_rcu(&(ptr)->member, func); \ + } + +#define isc_urcu_container(ptr, type, member) \ + ({ \ + type *_ptr = caa_container_of(ptr, type, member); \ + __tsan_acquire(_ptr); \ + _ptr; \ + }) + #if defined(RCU_QSBR) /* diff --git a/lib/isc/include/isc/util.h b/lib/isc/include/isc/util.h index 7fe6715a4e..c8b6e1c99f 100644 --- a/lib/isc/include/isc/util.h +++ b/lib/isc/include/isc/util.h @@ -96,6 +96,14 @@ #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0])) +/*% + * Get the allocation size for a struct with a flexible array member + * containing `count` elements. The struct is identified by a pointer, + * typically the one that points to (or will point to) the allocation. + */ +#define STRUCT_FLEX_SIZE(pointer, member, count) \ + (sizeof(*(pointer)) + sizeof(*(pointer)->member) * (count)) + /*% * Use this in translation units that would otherwise be empty, to * suppress compiler warnings. diff --git a/lib/isc/loop.c b/lib/isc/loop.c index 1cef7d35ad..0b3918a00f 100644 --- a/lib/isc/loop.c +++ b/lib/isc/loop.c @@ -26,14 +26,12 @@ #include #include #include -#include #include #include #include #include #include #include -#include #include #include #include @@ -151,7 +149,6 @@ destroy_cb(uv_async_t *handle) { uv_close(&loop->run_trigger, isc__job_close); uv_close(&loop->destroy_trigger, NULL); uv_close(&loop->pause_trigger, NULL); - uv_close(&loop->wakeup_trigger, NULL); uv_close(&loop->quiescent, NULL); uv_walk(&loop->loop, loop_walk_cb, (char *)"destroy_cb"); @@ -162,8 +159,6 @@ shutdown_cb(uv_async_t *handle) { isc_loop_t *loop = uv_handle_get_data(handle); isc_loopmgr_t *loopmgr = loop->loopmgr; - loop->shuttingdown = true; - /* Make sure, we can't be called again */ uv_close(&loop->shutdown_trigger, shutdown_trigger_close_cb); @@ -185,12 +180,6 @@ shutdown_cb(uv_async_t *handle) { UV_RUNTIME_CHECK(uv_async_send, r); } -static void -wakeup_cb(uv_async_t *handle) { - /* we only woke up to make the loop take a spin */ - UNUSED(handle); -} - static void loop_init(isc_loop_t *loop, isc_loopmgr_t *loopmgr, uint32_t tid) { *loop = (isc_loop_t){ @@ -226,9 +215,6 @@ loop_init(isc_loop_t *loop, isc_loopmgr_t *loopmgr, uint32_t tid) { UV_RUNTIME_CHECK(uv_async_init, r); uv_handle_set_data(&loop->destroy_trigger, loop); - r = uv_async_init(&loop->loop, &loop->wakeup_trigger, wakeup_cb); - UV_RUNTIME_CHECK(uv_async_init, r); - r = uv_prepare_init(&loop->loop, &loop->quiescent); UV_RUNTIME_CHECK(uv_prepare_init, r); uv_handle_set_data(&loop->quiescent, loop); @@ -245,7 +231,7 @@ loop_init(isc_loop_t *loop, isc_loopmgr_t *loopmgr, uint32_t tid) { static void quiescent_cb(uv_prepare_t *handle) { - isc__qsbr_quiescent_cb(handle); + UNUSED(handle); #if defined(RCU_QSBR) /* safe memory reclamation */ @@ -340,7 +326,6 @@ isc_loopmgr_create(isc_mem_t *mctx, uint32_t nloops, isc_loopmgr_t **loopmgrp) { loopmgr = isc_mem_get(mctx, sizeof(*loopmgr)); *loopmgr = (isc_loopmgr_t){ .nloops = nloops, - .qsbr = ISC_QSBR_INITIALIZER(nloops), }; isc_mem_attach(mctx, &loopmgr->mctx); @@ -449,6 +434,10 @@ isc_loopmgr_run(isc_loopmgr_t *loopmgr) { */ ignore_signal(SIGPIPE, SIG_IGN); + bool free_call_rcu_data = !create_all_cpu_call_rcu_data(0); + + rcu_register_thread(); + /* * The thread 0 is this one. */ @@ -463,21 +452,13 @@ isc_loopmgr_run(isc_loopmgr_t *loopmgr) { } isc_thread_main(loop_thread, &loopmgr->loops[0]); -} -void -isc_loopmgr_wakeup(isc_loopmgr_t *loopmgr) { - REQUIRE(VALID_LOOPMGR(loopmgr)); + rcu_unregister_thread(); - for (size_t i = 0; i < loopmgr->nloops; i++) { - isc_loop_t *loop = &loopmgr->loops[i]; + rcu_barrier(); - /* Skip current loop */ - if (i == isc_tid()) { - continue; - } - - uv_async_send(&loop->wakeup_trigger); + if (free_call_rcu_data) { + free_all_cpu_call_rcu_data(); } } diff --git a/lib/isc/loop_p.h b/lib/isc/loop_p.h index b9bef13b19..185406d0d7 100644 --- a/lib/isc/loop_p.h +++ b/lib/isc/loop_p.h @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include @@ -76,9 +75,7 @@ struct isc_loop { uv_async_t destroy_trigger; /* safe memory reclamation */ - uv_async_t wakeup_trigger; uv_prepare_t quiescent; - isc_qsbr_phase_t qsbr_phase; }; /* @@ -113,9 +110,6 @@ struct isc_loopmgr { /* per-thread objects */ isc_loop_t *loops; - - /* safe memory reclamation */ - isc_qsbr_t qsbr; }; /* diff --git a/lib/isc/mem.c b/lib/isc/mem.c index 0b2a317244..3105236478 100644 --- a/lib/isc/mem.c +++ b/lib/isc/mem.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #ifdef HAVE_LIBXML2 @@ -578,6 +579,12 @@ isc__mem_destroy(isc_mem_t **ctxp FLARG) { ctx = *ctxp; *ctxp = NULL; + /* + * wait for asynchronous memory reclamation to complete + * before checking for memory leaks + */ + rcu_barrier(); + #if ISC_MEM_TRACKLINES if ((ctx->debugging & ISC_MEM_DEBUGTRACE) != 0) { fprintf(stderr, "destroy mctx %p file %s line %u\n", ctx, file, diff --git a/lib/isc/qsbr.c b/lib/isc/qsbr.c deleted file mode 100644 index c122770c14..0000000000 --- a/lib/isc/qsbr.c +++ /dev/null @@ -1,393 +0,0 @@ -/* - * Copyright (C) Internet Systems Consortium, Inc. ("ISC") - * - * SPDX-License-Identifier: MPL-2.0 - * - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, you can obtain one at https://mozilla.org/MPL/2.0/. - * - * See the COPYRIGHT file distributed with this work for additional - * information regarding copyright ownership. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "loop_p.h" - -#define MAX_GRACE_PERIOD_NS 53 * NS_PER_MS - -#if 0 -#define TRACE(fmt, ...) \ - isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_OTHER, \ - ISC_LOG_DEBUG(7), "%s:%u:%s():t%u: " fmt, __FILE__, \ - __LINE__, __func__, isc_tid(), ##__VA_ARGS__) -#else -#define TRACE(...) -#endif - -static ISC_STACK(isc_qsbr_registered_t) qsbreclaimers = ISC_STACK_INITIALIZER; - -static void -reclaim_cb(void *arg); -static void -reclaimed_cb(void *arg); - -/**********************************************************************/ - -/* - * 3,2,1,3,2,1,... - */ -static isc_qsbr_phase_t -change_phase(isc_qsbr_phase_t phase) { - return (--phase > 0 ? phase : ISC_QSBR_PHASE_MAX); -} - -/* - * For marking or checking that a phase has cleanup work to do. - */ -static unsigned int -active_bit(isc_qsbr_phase_t phase) { - return (1 << phase); -} - -/* - * Extract the global phase from the grace period state. - */ -static isc_qsbr_phase_t -global_phase(isc_qsbr_t *qsbr, memory_order m_o) { - uint32_t grace = atomic_load_explicit(&qsbr->grace, m_o); - return (ISC_QSBR_GRACE_PHASE(grace)); -} - -/* - * Record that the current thread has passed the barrier. - * Returns true if more threads still need to pass. - * - * ATOMIC: acquire-release, to ensure that this is not reordered wrt - * read-only accesses to lock-free data structures. This implements the - * ordering requirements of a quiescent state. - */ -static bool -fuzzy_barrier_not_yet(isc_qsbr_t *qsbr) { - uint32_t grace = atomic_fetch_sub_acq_rel(&qsbr->grace, - ISC_QSBR_ONE_THREAD); - uint32_t threads = ISC_QSBR_GRACE_THREADS(grace); - return (threads > 1); -} - -/* - * Ungracefully drive all cleanup work to completion. - * - * ATOMIC: everything is relaxed, because we assume that concurrent - * readers have already finished. `reclaim_cb()` uses the `activated` - * flags to ensure it is OK that threads will race to complete the - * cleanup. - */ -static void -qsbr_shutdown(isc_loopmgr_t *loopmgr) { - isc_qsbr_t *qsbr = &loopmgr->qsbr; - isc_qsbr_phase_t phase = global_phase(qsbr, memory_order_relaxed); - uint32_t threads = isc_loopmgr_nloops(loopmgr); - uint32_t grace; - - while (atomic_load_relaxed(&qsbr->activated) != 0) { - reclaim_cb(loopmgr); - phase = change_phase(phase); - grace = ISC_QSBR_GRACE(threads, phase); - atomic_store_relaxed(&qsbr->grace, grace); - } -} - -/* - * On a quiet server that does not have enough network traffic to keep - * all its threads spinning, grace periods might extend indefinitely. - * So check if we have been waiting an unreasonably long time since - * the last phase change. If so, send a no-op async request to every - * thread to make them all cycle through a quiescent state. - */ -static void -maybe_wakeup(isc_loop_t *loop) { - isc_loopmgr_t *loopmgr = loop->loopmgr; - isc_qsbr_t *qsbr = &loopmgr->qsbr; - - /* - * ATOMIC: relaxed is OK here because we don't use any values guarded - * by the `activated` flags. - */ - if (atomic_load_relaxed(&qsbr->activated) == 0) { - return; - } - if (loop->shuttingdown) { - qsbr_shutdown(loopmgr); - return; - } - - /* - * ATOMIC: relaxed, because the `transition_time` doesn't guard any - * other values, just the isc_loopmgr_wakeup() call below. - */ - atomic_uint_fast64_t *qsbr_ttp = &qsbr->transition_time; - isc_nanosecs_t now = isc_time_monotonic(); - isc_nanosecs_t start = atomic_load_relaxed(qsbr_ttp); - if (now < start + MAX_GRACE_PERIOD_NS) { - return; - } - - /* - * To stop other threads from also invoking `isc_loopmgr_wakeup()`, - * we try to push the timer into the future (expecting that it will - * not trigger again), and quit if someone else got there first. - * ATOMIC: relaxed, as before; strong, because there is no retry loop. - */ - if (!atomic_compare_exchange_strong_relaxed(qsbr_ttp, &start, now)) { - return; - } - - TRACE("long grace period of %llu ns, waking up other threads", - (unsigned long long)(now - start)); - - isc_loopmgr_wakeup(loopmgr); -} - -/* - * Callers use the fuzzy barrier to ensure only one thread can enter - * this function at a time. - * - * Phase transitions happen at roughly the same frequency that IO - * event loops cycle, limited by the slowest loop in each cycle. - */ -static void -phase_transition(isc_loop_t *loop, isc_qsbr_phase_t current_phase) { - isc_loopmgr_t *loopmgr = loop->loopmgr; - isc_qsbr_t *qsbr = &loopmgr->qsbr; - - if (loop->shuttingdown) { - qsbr_shutdown(loopmgr); - return; - } - - /* - * After we change phase, threads will be in either the `current_phase` - * or the `next_phase`. We will reclaim memory from the `third_phase`. - * - * ATOMIC: relaxed is OK here because the necessary synchronization - * happens in `reclaim_cb()`. - */ - isc_qsbr_phase_t next_phase = change_phase(current_phase); - isc_qsbr_phase_t third_phase = change_phase(next_phase); - bool activated = atomic_load_relaxed(&qsbr->activated) & - active_bit(third_phase); - - /* - * Reset the wakeup timer, and log the length of the grace period. - * ATOMIC: relaxed, per the commentary in `maybe_wakeup()`. - */ - atomic_uint_fast64_t *qsbr_tt = &qsbr->transition_time; - isc_nanosecs_t now = isc_time_monotonic(); - isc_nanosecs_t start = atomic_exchange_relaxed(qsbr_tt, now); - TRACE("phase %u -> %u after grace period of %f ms", current_phase, - next_phase, (double)(now - start) / NS_PER_MS); - UNUSED(start); /* ifndef TRACE() */ - - /* - * Work out the threads counter for this grace period. - * - * We need to add one for any reclamation worker thread, to - * prevent us from changing phase before the work is done. If - * we change too early, any newly detached objects will be - * marked with the same phase as the running reclaimer, which - * might lead to them being free()d too soon. - */ - uint32_t threads = isc_loopmgr_nloops(loopmgr) + (activated ? 1 : 0); - - /* - * Start the new grace period. - * - * ATOMIC: release, to pair with the load-acquire in `reclaim_cb()` - * which is spawned in a separate worker thread. - */ - uint32_t grace = ISC_QSBR_GRACE(threads, next_phase); - atomic_store_release(&qsbr->grace, grace); - - if (activated) { - isc_work_enqueue(loop, reclaim_cb, reclaimed_cb, loopmgr); - } -} - -/* - * This function is called once per cycle of each IO event loop by the - * `uv_prepare` callback below. - */ -void -isc__qsbr_quiescent_state(isc_loop_t *loop) { - isc_loopmgr_t *loopmgr = loop->loopmgr; - isc_qsbr_t *qsbr = &loopmgr->qsbr; - - /* - * ATOMIC: relaxed. If we are in phase then we don't need to - * synchronize; if we are not then this thread's presence in - * the thread counter will prevent the phase from changing - * before we get to the fuzzy barrier. - */ - isc_qsbr_phase_t phase = global_phase(qsbr, memory_order_relaxed); - if (loop->qsbr_phase == phase) { - maybe_wakeup(loop); - return; - } - - /* - * Enter the current phase and count us out of the previous phase. - */ - loop->qsbr_phase = phase; - if (fuzzy_barrier_not_yet(qsbr)) { - maybe_wakeup(loop); - return; - } - - /* - * We were the last thread to enter the current phase so the - * grace period is up. No other thread can reach this point. - */ - phase_transition(loop, phase); -} - -void -isc__qsbr_quiescent_cb(uv_prepare_t *handle) { - isc_loop_t *loop = uv_handle_get_data((uv_handle_t *)handle); - isc__qsbr_quiescent_state(loop); -} - -static void -reclaimed_cb(void *arg) { - /* we are back on a loop thread */ - isc_loopmgr_t *loopmgr = arg; - isc_qsbr_t *qsbr = &loopmgr->qsbr; - isc_loop_t *loop = CURRENT_LOOP(loopmgr); - - /* - * Remove the reclaimers from the thread count, so that the - * next grace period can start. - */ - if (fuzzy_barrier_not_yet(qsbr)) { - return; - } - - /* - * The reclaimers were the last thread to be counted out: every - * other thread already passed through a quiescent state. - * - * We expect loop->qsbr_phase == global_phase() at this point, - * except during shutdown when the phase shifts rapidly. Also, - * the current loop might not have received the shutdown - * message yet, so it seems easiest to omit the assertion. - * - * ATOMIC: relaxed, the fuzzy barrier already synchronized. - */ - TRACE("reclaimers overran"); - phase_transition(loop, global_phase(qsbr, memory_order_relaxed)); -} - -static void -reclaim_cb(void *arg) { - /* we are on a work thread not a loop thread */ - isc_loopmgr_t *loopmgr = arg; - isc_qsbr_t *qsbr = &loopmgr->qsbr; - - /* - * The global phase has just been bumped by a `phase_transition()` - * and it cannot change again until the grace period is up, which - * cannot happen until we have finished working. - * - * ATOMIC: acquire, to pair with the release in `phase_transition()`. - * - * The phase we are to clean up is 2 before the current phase, - * which is the same as the one after the current phase (mod 3). - */ - isc_qsbr_phase_t cur_phase = global_phase(qsbr, memory_order_acquire); - isc_qsbr_phase_t third_phase = change_phase(cur_phase); - unsigned int third_bit = active_bit(third_phase); - - /* - * If any reclaimers need to be called again later, they can use - * `isc_qsbr_activate()`, so we need to clear the bit first. - * - * ATOMIC: acquire, so that `isc_qsbr_activate()` happens before - * the callbacks are invoked. - */ - uint32_t activated = atomic_fetch_and_explicit( - &qsbr->activated, ~third_bit, memory_order_acquire); - - /* this can happen when we are racing to clean up on shutdown */ - if ((activated & third_bit) == 0) { - return; - } - - isc_qsbr_registered_t *reclaimer = ISC_STACK_TOP(qsbreclaimers); - while (reclaimer != NULL) { - reclaimer->func(third_phase); - reclaimer = ISC_SLINK_NEXT(reclaimer, link); - } -} - -void -isc__qsbr_register(isc_qsbr_registered_t *reclaimer) { - REQUIRE(reclaimer->func != NULL); - ISC_STACK_PUSH(qsbreclaimers, reclaimer, link); -} - -/* - * ATOMIC: This function needs to ensure that the global phase is read - * after a write has committed. Acquire/release ordering is not sufficient - * for ordering between separate atomics (the data structure's root pointer - * and the global phase), so it must be sequentially consistent. - * - * In general, the phases up to and including the next phase transition - * look like: - * - * 1. local phase - * 2. global phase - * 3. next phase - * 1. third phase - * - * i.e. some threads are still one behind the global phase, on the same - * phase that will be cleaned up immediately after the phase transition. - * - * This function is called just after a write commits. It's likely that - * some threads on the global phase (2) are using a version of the data - * structure from before the write, and they can continue using it while - * the straggler threads (1) catch up and cause a phase transition. - * - * The writer can be one of the straggler threads. If it incorrectly marks - * cleanup work with its local phase (1), memory will be reclaimed - * immediately after the next phase transition (when the third phase is - * also 1), which could be almost immediately when the writer returns to - * the event loop. This will cause a use-after-free for existing readers - * (in phase 2). - * - * More straightforwardly, we need to be able to queue up reclaim work from - * a thread that isn't running a loop, which also means this function has - * to return the global phase. - */ -isc_qsbr_phase_t -isc_qsbr_phase(isc_loopmgr_t *loopmgr) { - isc_qsbr_t *qsbr = &loopmgr->qsbr; - return (global_phase(qsbr, memory_order_seq_cst)); -} - -void -isc_qsbr_activate(isc_loopmgr_t *loopmgr, isc_qsbr_phase_t phase) { - /* - * ATOMIC: release ordering ensures that writing the cleanup lists - * happens before the callback is invoked from a worker thread. - */ - atomic_fetch_or_release(&loopmgr->qsbr.activated, active_bit(phase)); -} diff --git a/lib/isc/quota.c b/lib/isc/quota.c index 4816333e1c..5a05aa439e 100644 --- a/lib/isc/quota.c +++ b/lib/isc/quota.c @@ -79,8 +79,7 @@ isc_quota_release(isc_quota_t *quota) { return; } - isc_job_t *job = caa_container_of(node, isc_job_t, wfcq_node); - __tsan_acquire(job); + isc_job_t *job = isc_urcu_container(node, isc_job_t, wfcq_node); job->cb(job->cbarg); } diff --git a/lib/isc/thread.c b/lib/isc/thread.c index aca7c38332..36ade47f1f 100644 --- a/lib/isc/thread.c +++ b/lib/isc/thread.c @@ -61,7 +61,7 @@ thread_wrap(isc_threadfunc_t func, void *arg) { .func = func, .arg = arg, }; - + __tsan_release(wrap); return (wrap); } @@ -84,12 +84,8 @@ thread_body(struct thread_wrap *wrap) { __tsan_acquire(wrap); free(wrap); - rcu_register_thread(); - ret = func(arg); - rcu_unregister_thread(); - return (ret); } @@ -101,10 +97,14 @@ thread_run(void *wrap) { */ isc__iterated_hash_initialize(); + rcu_register_thread(); + void *ret = thread_body(wrap); isc__iterated_hash_shutdown(); + rcu_unregister_thread(); + return (ret); } diff --git a/tests/bench/load-names.c b/tests/bench/load-names.c index babdd37ef1..94e105ad16 100644 --- a/tests/bench/load-names.c +++ b/tests/bench/load-names.c @@ -18,8 +18,8 @@ #include #include #include -#include #include +#include #include #include diff --git a/tests/bench/qpmulti.c b/tests/bench/qpmulti.c index 7034ac983f..03dd866264 100644 --- a/tests/bench/qpmulti.c +++ b/tests/bench/qpmulti.c @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -25,7 +26,6 @@ #include #include #include -#include #include #include #include @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -380,8 +381,7 @@ load_multi(struct bench_state *bctx) { size_t count = 0; uint64_t start; - dns_qpmulti_create(bctx->mctx, bctx->loopmgr, &item_methods, NULL, - &bctx->multi); + dns_qpmulti_create(bctx->mctx, &item_methods, NULL, &bctx->multi); /* initial contents of the trie */ start = isc_time_monotonic(); @@ -884,6 +884,9 @@ int main(void) { isc_loopmgr_t *loopmgr = NULL; isc_mem_t *mctx = NULL; + + setlinebuf(stdout); + uint32_t nloops; const char *env_workers = getenv("ISC_TASK_WORKERS"); diff --git a/tests/dns/qp_test.c b/tests/dns/qp_test.c index ee9b30da00..8a1156b537 100644 --- a/tests/dns/qp_test.c +++ b/tests/dns/qp_test.c @@ -22,11 +22,11 @@ #define UNIT_TESTING #include -#include #include #include #include #include +#include #include #include diff --git a/tests/dns/qpmulti_test.c b/tests/dns/qpmulti_test.c index 16e5350dec..96a2693033 100644 --- a/tests/dns/qpmulti_test.c +++ b/tests/dns/qpmulti_test.c @@ -27,10 +27,10 @@ #include #include #include -#include #include #include #include +#include #include #include @@ -367,14 +367,13 @@ many_transactions(void *arg) { UNUSED(arg); dns_qpmulti_t *qpm = NULL; - dns_qpmulti_create(mctx, loopmgr, &test_methods, NULL, &qpm); + dns_qpmulti_create(mctx, &test_methods, NULL, &qpm); qpm->writer.write_protect = true; for (size_t n = 0; n < TRANSACTION_COUNT; n++) { TRACE("transaction %zu", n); one_transaction(qpm); - isc__qsbr_quiescent_state(isc_loop_current(loopmgr)); - isc_loopmgr_wakeup(loopmgr); + rcu_quiescent_state(); } dns_qpmulti_destroy(&qpm); @@ -387,6 +386,7 @@ ISC_RUN_TEST_IMPL(qpmulti) { setup_items(); isc_loop_setup(isc_loop_main(loopmgr), many_transactions, NULL); isc_loopmgr_run(loopmgr); + rcu_barrier(); isc_loopmgr_destroy(&loopmgr); isc_log_destroy(&dns_lctx); } diff --git a/tests/libtest/Makefile.am b/tests/libtest/Makefile.am index 68708d463d..1fe4c57b36 100644 --- a/tests/libtest/Makefile.am +++ b/tests/libtest/Makefile.am @@ -4,6 +4,7 @@ AM_CPPFLAGS += \ $(LIBISC_CFLAGS) \ $(LIBDNS_CFLAGS) \ $(LIBNS_CFLAGS) \ + $(LIBURCU_CFLAGS) \ $(LIBUV_CFLAGS) \ -I$(top_srcdir)/lib/isc \ -I$(top_srcdir)/lib/dns diff --git a/tests/libtest/dns.c b/tests/libtest/dns.c index b093b88c43..d398c5c954 100644 --- a/tests/libtest/dns.c +++ b/tests/libtest/dns.c @@ -64,7 +64,7 @@ dns_test_makeview(const char *name, bool with_cache, dns_view_t **viewp) { dns_view_t *view = NULL; dns_cache_t *cache = NULL; - result = dns_view_create(mctx, loopmgr, dns_rdataclass_in, name, &view); + result = dns_view_create(mctx, dns_rdataclass_in, name, &view); if (result != ISC_R_SUCCESS) { return (result); } diff --git a/tests/libtest/qp.c b/tests/libtest/qp.c index db45fbcc16..aff5eb3dc6 100644 --- a/tests/libtest/qp.c +++ b/tests/libtest/qp.c @@ -18,9 +18,9 @@ #include #include #include -#include #include #include +#include #include #include @@ -255,10 +255,10 @@ qp_test_dumpchunks(dns_qp_t *qp) { dumpqp(qp, "qp"); for (qp_chunk_t c = 0; c < qp->chunk_max; c++) { printf("qp %p chunk %u base %p " - "used %u free %u immutable %u phase %u\n", + "used %u free %u immutable %u discounted %u\n", qp, c, qp->base->ptr[c], qp->usage[c].used, qp->usage[c].free, qp->usage[c].immutable, - qp->usage[c].phase); + qp->usage[c].discounted); used += qp->usage[c].used; free += qp->usage[c].free; } diff --git a/tsan-suppressions.txt b/tsan-suppressions.txt deleted file mode 100644 index 217d77c388..0000000000 --- a/tsan-suppressions.txt +++ /dev/null @@ -1,3 +0,0 @@ -# Uninstrumented library. -called_from_lib:libfstrm.so -called_from_lib:libdummyrpz.so