From 4f97a679f0d51d4e92614ea41a0fd300aaac3c7c Mon Sep 17 00:00:00 2001 From: Tony Finch Date: Thu, 9 Mar 2023 10:43:53 +0000 Subject: [PATCH 01/12] A macro for the size of a struct with a flexible array member It can be fairly long-winded to allocate space for a struct with a flexible array member: in general we need the size of the struct, the size of the member, and the number of elements. Wrap them all up in a STRUCT_FLEX_SIZE() macro, and use the new macro for the flexible arrays in isc_ht and dns_qp. --- lib/dns/qp.c | 6 +++--- lib/isc/histo.c | 7 ------- lib/isc/ht.c | 4 ++-- lib/isc/include/isc/util.h | 8 ++++++++ 4 files changed, 13 insertions(+), 12 deletions(-) diff --git a/lib/dns/qp.c b/lib/dns/qp.c index 60acf13ed5..fec71033b1 100644 --- a/lib/dns/qp.c +++ b/lib/dns/qp.c @@ -426,13 +426,13 @@ static void realloc_chunk_arrays(dns_qp_t *qp, qp_chunk_t newmax) { size_t oldptrs = sizeof(qp->base->ptr[0]) * qp->chunk_max; size_t newptrs = sizeof(qp->base->ptr[0]) * newmax; - size_t allbytes = sizeof(dns_qpbase_t) + newptrs; + size_t size = STRUCT_FLEX_SIZE(qp->base, ptr, newmax); if (qp->base == NULL || qpbase_unref(qp)) { - qp->base = isc_mem_reallocate(qp->mctx, qp->base, allbytes); + qp->base = isc_mem_reallocate(qp->mctx, qp->base, size); } else { dns_qpbase_t *oldbase = qp->base; - qp->base = isc_mem_allocate(qp->mctx, allbytes); + qp->base = isc_mem_allocate(qp->mctx, size); memmove(&qp->base->ptr[0], &oldbase->ptr[0], oldptrs); } memset(&qp->base->ptr[qp->chunk_max], 0, newptrs - oldptrs); diff --git a/lib/isc/histo.c b/lib/isc/histo.c index 395b27b6b6..870c3c0152 100644 --- a/lib/isc/histo.c +++ b/lib/isc/histo.c @@ -27,13 +27,6 @@ #include #include -/* - * XXXFANF to be added to by a commmit in a qp-trie - * feature branch - */ -#define STRUCT_FLEX_SIZE(pointer, member, count) \ - (sizeof(*(pointer)) + sizeof(*(pointer)->member) * (count)) - /* * XXXFANF this should probably be in too */ diff --git a/lib/isc/ht.c b/lib/isc/ht.c index d9de112e54..4d771882dd 100644 --- a/lib/isc/ht.c +++ b/lib/isc/ht.c @@ -287,7 +287,7 @@ isc__ht_add(isc_ht_t *ht, const unsigned char *key, const uint32_t keysize, hash = hash_32(hashval, ht->hashbits[idx]); - node = isc_mem_get(ht->mctx, sizeof(*node) + keysize); + node = isc_mem_get(ht->mctx, STRUCT_FLEX_SIZE(node, key, keysize)); *node = (isc_ht_node_t){ .keysize = keysize, .hashval = hashval, @@ -396,7 +396,7 @@ isc__ht_delete(isc_ht_t *ht, const unsigned char *key, const uint32_t keysize, prev->next = node->next; } isc_mem_put(ht->mctx, node, - sizeof(*node) + node->keysize); + STRUCT_FLEX_SIZE(node, key, node->keysize)); ht->count--; return (ISC_R_SUCCESS); diff --git a/lib/isc/include/isc/util.h b/lib/isc/include/isc/util.h index 7fe6715a4e..c8b6e1c99f 100644 --- a/lib/isc/include/isc/util.h +++ b/lib/isc/include/isc/util.h @@ -96,6 +96,14 @@ #define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0])) +/*% + * Get the allocation size for a struct with a flexible array member + * containing `count` elements. The struct is identified by a pointer, + * typically the one that points to (or will point to) the allocation. + */ +#define STRUCT_FLEX_SIZE(pointer, member, count) \ + (sizeof(*(pointer)) + sizeof(*(pointer)->member) * (count)) + /*% * Use this in translation units that would otherwise be empty, to * suppress compiler warnings. From 2e0c954806ea478554f34c0c222230344016d955 Mon Sep 17 00:00:00 2001 From: Tony Finch Date: Fri, 28 Apr 2023 14:10:03 +0100 Subject: [PATCH 02/12] Wait for RCU to finish before destroying a memory context Memory reclamation by `call_rcu()` is asynchronous, so during shutdown it can lose a race with the destruction of its memory context. When we defer memory reclamation, we need to attach to the memory context to indicate that it is still in use, but that is not enough to delay its destruction. So, call `rcu_barrier()` in `isc_mem_destroy()` to wait for pending RCU work to finish before proceeding to destroy the memory context. --- lib/isc/mem.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/lib/isc/mem.c b/lib/isc/mem.c index 0b2a317244..3105236478 100644 --- a/lib/isc/mem.c +++ b/lib/isc/mem.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #ifdef HAVE_LIBXML2 @@ -578,6 +579,12 @@ isc__mem_destroy(isc_mem_t **ctxp FLARG) { ctx = *ctxp; *ctxp = NULL; + /* + * wait for asynchronous memory reclamation to complete + * before checking for memory leaks + */ + rcu_barrier(); + #if ISC_MEM_TRACKLINES if ((ctx->debugging & ISC_MEM_DEBUGTRACE) != 0) { fprintf(stderr, "destroy mctx %p file %s line %u\n", ctx, file, From cd0795beea1d4875c0ffa0f9e626560c11e68f1d Mon Sep 17 00:00:00 2001 From: Tony Finch Date: Wed, 3 May 2023 15:44:36 +0100 Subject: [PATCH 03/12] Slightly more sanitary thread dispatch Tell thread sanitizer that the thread wrapper is released before passing it to a new thread. --- lib/isc/thread.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/isc/thread.c b/lib/isc/thread.c index aca7c38332..b0e0875aca 100644 --- a/lib/isc/thread.c +++ b/lib/isc/thread.c @@ -61,7 +61,7 @@ thread_wrap(isc_threadfunc_t func, void *arg) { .func = func, .arg = arg, }; - + __tsan_release(wrap); return (wrap); } From 05ca11e12217f8cd565bc8da2adf73b8b20991ce Mon Sep 17 00:00:00 2001 From: Tony Finch Date: Wed, 8 Mar 2023 10:20:16 +0000 Subject: [PATCH 04/12] Remove isc_qsbr (we are using liburcu instead) This commit breaks the qp-trie code. --- doc/dev/qsbr.md | 397 ------------------------------------- lib/isc/Makefile.am | 2 - lib/isc/include/isc/loop.h | 11 - lib/isc/include/isc/qsbr.h | 282 -------------------------- lib/isc/loop.c | 33 +-- lib/isc/loop_p.h | 6 - lib/isc/qsbr.c | 393 ------------------------------------ 7 files changed, 1 insertion(+), 1123 deletions(-) delete mode 100644 doc/dev/qsbr.md delete mode 100644 lib/isc/include/isc/qsbr.h delete mode 100644 lib/isc/qsbr.c diff --git a/doc/dev/qsbr.md b/doc/dev/qsbr.md deleted file mode 100644 index 7880a56bd4..0000000000 --- a/doc/dev/qsbr.md +++ /dev/null @@ -1,397 +0,0 @@ - - -QSBR: quiescent state based reclamation -======================================= - -QSBR is a safe memory reclamation (SMR) algorithm for lock-free data -structures such as a qp-trie. (See `doc/dev/qp.md`.) - -When an object is unlinked from a lock-free data structure, it -cannot be `free()`ed immediately, because there can still be readers -accessing the object via an old version of the data structure. SMR -algorithms determine when it is safe to reclaim memory after it has -been unlinked. - - -Introductions and overviews ---------------------------- - -There is a terse overview in `include/isc/qsbr.h`. - -Jeff Preshing has a nice introduction to QSBR, -__ - -At the end of this note is a copy of a blog post about writing BIND's -`isc_qsbr`, __ - -[Paul McKenney's web page][paulmck] has links to his book on -concurrent programming, the [Userspace RCU library][urcu], and more. -McKenney invented RCU and QSBR. RCU is the Linux kernel's machinery -for lock-free data structures and safe memory reclamation, based on -QSBR. - -[paulmck]: http://www.rdrop.com/~paulmck/ -[urcu]: https://liburcu.org/ - - -Example code ------------- - -If you are implementing a lock-free data structure that needs safe -memory reclamation, here's a guide to using `isc_qsbr`, based on how -QSBR is used by `dns_qp`. - -### registration - -When the program starts up you need to register a global callback -function that will reclaim unused memory. You can do so using an -ISC_CONSTRUCTOR function that runs automatically at startup. - - static void - qp_qsbr_register(void) ISC_CONSTRUCTOR; - static void - qp_qsbr_register(void) { - isc_qsbr_register(qp_qsbr_reclaimer); - } - -### work list - -Your module will need somewhere that your callback can find the work -it needs to do. The qp-trie has an atomic list of `dns_qpmulti_t` -objects for this purpose. - - /* a global variable */ - static ISC_ASTACK(dns_qpmulti_t) qsbr_work; - -The reason for using global variables is so that we don't need to -allocate a thunk every time we have memory reclamation work to do. - -### read-only access - -You should design your data structure so that it has a single atomic -root pointer referring to its current version. A lock-free reader -_must_ run in an `isc_loop` callback. It gains access to the data -structure by taking a copy of this pointer: - - qp_node_t *reader = atomic_load_acquire(&multi->reader); - -During an `isc_loop` callback, a reader should keep using the same -pointer go get a consistent view of the data structure. If it reloads -the pointer it can get a different version changed by concurrent -writers. - -A reader _must_ stop using the root pointer and any interior pointers -obtained via the root pointer before it returns to the `isc_loop`. - -### modifications and writes - -All changes to the data structure must be copy-on-write (aka -read-copy-update) so that concurrent readers are not disturbed. - -When a new version of the data structure has been prepared, it is -committed by overwriting the atomic root pointer, - - atomic_store_release(&multi->reader, reader); /* COMMIT */ - -### scheduling cleanup - -After committing a change, your data structure may have memory that -will become free, after concurrent readers have stopped accessing it. -To reclaim the memory when it is safe, use code like: - - isc_qsbr_phase_t phase = isc_qsbr_phase(multi->loopmgr); - if (defer_chunk_reclamation(qp, phase)) { - ISC_ASTACK_ADD(qsbr_work, multi, cleanup); - isc_qsbr_activate(multi->loopmgr, phase); - } - - * First, get the current QSBR phase - - * Second, mark free memory with the phase number. The qp-trie scans - its chunks and marks those that will become free, and returns - `true` if there is cleanup work to do. - - * If so, the qp-trie is added to the work list. (`ISC_ALIST_ADD()` - is idempotent). - - * Finally, QSBR is informed that there is work to do. - -In other cases it might not make sense to scan the data structure -after committing, and instead you might make note of which memory to -clean up while making changes before you know what the phase will be. -You can then have per-phase work lists, like: - - static ISC_ASTACK(my_work_t) qsbr_work[ISC_QSBR_PHASES]; - - isc_qsbr_phase_t phase = isc_qsbr_phase(loopmgr); - ISC_ASTACK_ADD(qsbr_work[phase], cleanup_work, link); - isc_qsbr_activate(loopmgr, phase); - -In general, there will be several (maybe many) write operations during -a grace period. Your lock-free data structure should collect its -reclamation work from all these writes into a batch per phase, i.e. -per grace period. - -### reclaiming - -Inside the reclaimer callback, we iterate over the work list and clean -up each item on it. If there is more cleanup work to do in another -phase, we put the qp-trie back on the work list for another go. - - static void - qsbreclaimer(void *arg, isc_qsbr_phase_t phase) { - UNUSED(arg); - - ISC_STACK(dns_qpmulti_t) drain = ISC_ASTACK_TO_STACK(qsbr_work); - while (!ISC_STACK_EMPTY(drain)) { - dns_qpmulti_t *multi = ISC_STACK_POP(drain, cleanup); - INSIST(QPMULTI_VALID(multi)); - LOCK(&multi->mutex); - if (reclaim_chunks(&multi->writer, phase)) { - /* more to do next time */ - ISC_ALIST_PUSH(qsbr_work, multi, cleanup); - } - UNLOCK(&multi->mutex); - } - } - -### reclaim marks - -In the qp-trie data structure, each chunk has some metadata which -includes a bitfield for the reclaim phase: - - isc_qsbr_phase_t phase : ISC_QSBR_PHASE_BITS; - -We use a bitfield so that all the metadata fits in a single word. - - ------------------------------------------------------------------------- - -Safe memory reclamation for BIND -================================ - -At the end of October 2022, I _finally_ got [my multithreaded -qp-trie][qp-gc] working! It could be built with two different -concurrency control mechanisms: - - * A reader/writer lock - - This has poor read-side scalability, because every thread is - hammering on the same shared location. But its write performance - is reasonably good: concurrent readers don't slow it down too much. - - * [`liburcu`, userland read-copy-update][urcu] - - RCU has a fast and scalable read side, nice! But on the write side - I used `synchronize_rcu()`, which is blocking and rather slow, so - my write performance was terrible. - -OK, but I want the best of both worlds! To fix it, I needed to change -the qp-trie code to use safe memory reclamation more effectively: -instead of blocking inside `synchronize_rcu()` before cleaning up, use -`call_rcu()` to clean up asynchronously. I expect I'll write about the -qp-trie changes another time. - -Another issue is that I want the best of both worlds _by default_, -but `liburcu` is [LGPL][] and we don't want BIND to depend on -code whose licence demands more from our users than the [MPL][]. - -[qp-gc]: https://dotat.at/@/2021-06-23-page-based-gc-for-qp-trie-rcu.html -[LGPL]: https://opensource.org/licenses/LGPL-2.1 -[MPL]: https://opensource.org/licenses/MPL-2.0 - -So I set out to write my own safe memory reclamation support code. - - -lock freedom ------------- - -In a [multithreaded qp-trie][qp-gc], there can be many concurrent -readers, but there can be only one writer at a time and modifications -are strictly serialized. When I have got it working properly, readers -are completely wait-free, unaffected by other readers, and almost -unaffected by writers. Writers need to get a mutex to ensure there is -only one at a time, but once the mutex is acquired, a writer is not -obstructed by readers. - -The way this works is that readers use an atomic load to get a pointer -to the root of the current version of the trie. Readers can make -multiple queries using this root pointer and the results will be -consistent wrt that particular version, regardless of what changes -writers might be making concurrently. Writers do not affect readers -because all changes are made by copy-on-write. When a writer is ready -to commit a new version of the trie, it uses an atomic store to flip -the root pointer. - - -safe memory reclamation ------------------------ - -We can't copy-on-write indefinitely: we need to reclaim the memory -used by old versions of the trie. And we must do so "safely", i.e. -without `free()`ing memory that readers are still using. - -So, before `free()`ing memory, a writer must wait for a _"grace -period"_, which is a jargon term meaning "until readers are not using -the old version". There are a bunch of algorithms for determining when -a grace period is over, with varying amounts of over-approximation, -CPU overhead, and memory backlog. - -The [RCU][urcu] function `synchronize_rcu()` is slow because it blocks -waiting for a grace period; the `call_rcu()` function runs a callback -asynchronously after a grace period has passed. I wanted to avoid -blocking my writers, so I needed to implement something like -`call_rcu()`. - - -aversions ---------- - -When I started trying to work out how to do safe memory reclamation, -it all seemed quite intimidating. But as I learned more, I found that -my circumstances make it easier than it appeared at first. - -The [`liburcu`][urcu] homepage has a long list of supported CPU -architectures and operating systems. Do I have to care about those -details too? No! The RCU code dates back to before the age of -standardized concurrent memory models, so the RCU developers had to -invent their own atomic primitives and correctness rules. Twenty-ish -years later the state of the art has advanced, so I can use -`` without having to re-do it like `liburcu`. - -You can also choose between several algorithms implemented by -[`liburcu`][urcu], involving questions about kernel support, specially -reserved signals, and intrusiveness in application code. But while I -was working out how to schedule asynchronous memory reclamation work, -I realised that BIND is already well-suited to the fastest flavour of -RCU, called "QSBR". - - -QSBR ----- - -QSBR stands for "quiescent state based reclamation". A _"quiescent -state"_ is a fancy name for a point when a thread is not accessing a -lock-free data structure, and does not retain any root pointers or -interior pointers. - -When a thread has passed through a quiescent state, it no longer has -access to older versions of the data structures. When _all_ threads -have passed through quiescent states, then nothing in the program has -access to old versions. This is how QSBR detects grace periods: after -a writer commits a new version, it waits for all threads to pass -through quiescent states, and therefore a grace period has definitely -elapsed, and so it is then safe to reclaim the old version's memory. - -QSBR is fast because readers do not need to explicitly mark the -critical section surrounding the atomic load that I mentioned earlier. -Threads just need to pass through a quiescent state frequently enough -that there isn't a huge build-up of unreclaimed memory. - -Inside an operating system kernel (RCU's native environment), a -context switch provides a natural quiescent state. In a userland -application, you need to find a good place to call -`rcu_quiescent_state()`. You could call it every time you have -finished using a root pointer, but marking a quiescent state is not -completely free, so there are probably more efficient ways. - - -`libuv` -------- - -BIND is multithreaded, and (basically) each thread runs an event loop. -Recent versions of BIND use [`libuv`][uv] for the event loops. - -A lot of things started falling into place when I realised that the -`libuv` event loop gives BIND a [natural quiescent state][uv-loop]: -when the event callbacks have finished running, and `libuv` is about -to call `select()` or `poll()` or whatever, we can mark a quiescent -state. We can require that event-handling functions do not stash root -pointers in the heap, but only use them via local variables, so we -know that old versions are inaccessible after the callback returns. - -My design marks a quiescent state once per loop, so on a busy server -where each loop has lots to do, the cost of marking a quiescent state -is amortized across several I/O events. - -[uv]: http://libuv.org/ -[uv-loop]: http://docs.libuv.org/en/v1.x/design.html#the-i-o-loop - - -fuzzy barrier -------------- - -So, how do we mark a quiescent state? Using a _"fuzzy barrier"_. - -When a thread reaches a normal barrier, it blocks until all the other -threads have reached the barrier, after which exactly one of the -threads can enter a protected section of code, and the others are -unblocked and can proceed as normal. - -When a thread encounters a fuzzy barrier, it never blocks. It either -proceeds immediately as normal, or if it is the last thread to reach -the barrier, it enters the protected code. - -RCU does not actually use a fuzzy barrier as I have described it. Like -a fuzzy barrier, each thread keeps track of whether it has passed -through a quiescent state in the current grace period, without -blocking; but unlike a fuzzy barrier, no thread is diverted to the -protected code. Instead, code that wants to enter a protected section -uses the blocking `synchronize_rcu()` function. - - -EBR-ish -------- - -As in the paper ["performance of memory reclamation for lockless -synchronization"][HMBW], my implementation of QSBR uses a fuzzy -barrier designed for another safe memory reclamation algorithm, EBR, -epoch based reclamation. (EBR was invented here in Cambridge by [Keir -Fraser][tr579].) - -[HMBW]: http://csng.cs.toronto.edu/publication_files/0000/0159/jpdc07.pdf -[tr579]: https://www.cl.cam.ac.uk/techreports/UCAM-CL-TR-579.html - -Actually, my fuzzy barrier is slightly different to EBR's. In EBR, the -fuzzy barrier is used every time the program enters a critical -section. (In qp-trie terms, that would be every time a reader fetches -a root pointer.) So it is vital that EBR's barrier avoids mutating -shared state, because that would wreck multithreaded performance. - -Because BIND will only pass through the fuzzy barrier when it is about -to use a blocking system call, my version mutates shared state more -frequently (typically, once per CPU per grace period, instead of once -per grace period). If this turns out to be a problem, it won't be too -hard to make it work more like EBR. - -More trivially, I'm using the term "phase" instead of "epoch", because -it's nothing to do with the unix epoch, because there are three -phases, and because I can talk about phase transitions and threads -being out of phase with each other. - - -coda ----- - -While reading various RCU-related papers, I was amused by ["user-level -implementations of read-copy update"][DMSDW], which says: - -> BIND, a major domain-name server used for Internet domain-name -> resolution, is facing scalability issues. Since domain names -> are read often but rarely updated, using user-level RCU might be -> beneficial. - -Yes, I think it might :-) - -[DMSDW]: https://www.efficios.com/publications/ diff --git a/lib/isc/Makefile.am b/lib/isc/Makefile.am index 7d74517464..061fa24e7c 100644 --- a/lib/isc/Makefile.am +++ b/lib/isc/Makefile.am @@ -66,7 +66,6 @@ libisc_la_HEADERS = \ include/isc/pause.h \ include/isc/portset.h \ include/isc/quota.h \ - include/isc/qsbr.h \ include/isc/radix.h \ include/isc/random.h \ include/isc/ratelimiter.h \ @@ -174,7 +173,6 @@ libisc_la_SOURCES = \ picohttpparser.h \ portset.c \ quota.c \ - qsbr.c \ radix.c \ random.c \ ratelimiter.c \ diff --git a/lib/isc/include/isc/loop.h b/lib/isc/include/isc/loop.h index 93330e60d5..de94e25a22 100644 --- a/lib/isc/include/isc/loop.h +++ b/lib/isc/include/isc/loop.h @@ -69,17 +69,6 @@ isc_loopmgr_run(isc_loopmgr_t *loopmgr); *\li 'loopmgr' is a valid loop manager. */ -void -isc_loopmgr_wakeup(isc_loopmgr_t *loopmgr); -/*%< - * Send no-op events to wake up all running loops in 'loopmgr' except - * the current one. (See .) - * - * Requires: - *\li 'loopmgr' is a valid loop manager. - *\li We are in a running loop. - */ - void isc_loopmgr_pause(isc_loopmgr_t *loopmgr); /*%< diff --git a/lib/isc/include/isc/qsbr.h b/lib/isc/include/isc/qsbr.h deleted file mode 100644 index 242c6ac45b..0000000000 --- a/lib/isc/include/isc/qsbr.h +++ /dev/null @@ -1,282 +0,0 @@ -/* - * Copyright (C) Internet Systems Consortium, Inc. ("ISC") - * - * SPDX-License-Identifier: MPL-2.0 - * - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, you can obtain one at https://mozilla.org/MPL/2.0/. - * - * See the COPYRIGHT file distributed with this work for additional - * information regarding copyright ownership. - */ - -#pragma once - -#include -#include -#include -#include - -/* - * Quiescent state based reclamation - * ================================= - * - * QSBR is a safe memory reclamation algorithm for lock-free data - * structures such as a qp-trie. - * - * When an object is unlinked from a lock-free data structure, it - * cannot be free()d immediately, because there can still be readers - * accessing the object via an old version of the data structure. SMR - * algorithms determine when it is safe to reclaim memory after it has - * been unlinked. - * - * With QSBR, reading a data structure is wait-free. All that is - * required is an atomic load to get the data structure's current - * root; there is no need to explicitly mark any read-side critical - * section. - * - * QSBR is used by RCU (read-copy-update) in the Linux kernel. BIND's - * implementation also uses some ideas from EBR (epoch-based reclamation). - * The following summary is based on the overview in the paper - * "performance of memory reclamation for lockless synchronization", - * (http://csng.cs.toronto.edu/publication_files/0000/0159/jpdc07.pdf). - * - * Aside: This QSBR implementation is somewhat different from the one - * in liburcu, described in the paper "user-level implementations of - * read-copy update", (https://www.efficios.com/publications/), which - * contains the amusing comment: - * - * BIND, a major domain-name server used for Internet domain-name - * resolution, is facing scalability issues. Since domain names - * are read often but rarely updated, using user-level RCU might - * be beneficial. - * - * A "quiescent state" is a point when a thread is not accessing any - * lock-free data structure. After passing through a quiescent state, - * a thread can no longer access versions of a data structure that - * were replaced before that point. In BIND, we use a point in the - * event loop (a uv_prepare_t callback) to identify a quiescent state. - * - * Aside: a prepare handle runs its callbacks before the loop sleeps, - * which reduces reclaim latency (unlike a check handle) and it does - * not affect timeout calculations (unlike an idle handle). - * - * A "grace period" is any time interval such that after the end of - * the grace period, all objects removed before the start of the grace - * period can safely be reclaimed. Different SMR algorithms detect - * grace periods with varying degrees of tightness or looseness. - * - * QSBR uses quiescent states to detect grace periods: a grace period - * is a time interval in which every thread passes through a quiescent - * state. (This is a safe over-estimate.) A "fuzzy barrier" is used to - * find out when all threads have passed through a quiescent state. - * - * NOTE: In BIND this means that code which is not running in an event - * loop thread (such as an isc_work / uv_work_t callback) must use - * locking (not lock-free) data structure accessors. - * - * Because a quiescent state happens once per event loop, a grace - * period takes roughly the same amount of time as the slowest event - * loop in each cycle. - * - * Similar to the paper linked above, this QSBR implementation uses a - * variant of the EBR fuzzy barrier. Like EBR, each grace period is - * numbered with a "phase", which cycles round 1,2,3,1,2,3,... (Phases - * are called epochs in EBR, but I think "phase" is a better metaphor.) - * When entering the fuzzy barrier, each thread updates its local phase - * to match the global phase, keeping a global count of the number of - * threads still to pass. When this count reaches zero, it is the end of - * the grace period; the global phase is updated and reclamation is - * triggered. - * - * Note that threads are usually slightly out-of-phase wrt the global - * grace period. At any particular point in time, there will be some - * threads in the current global phase, and some in the previous - * global phase. EBR has three phases because that is the minimum - * number that leaves one phase unoccupied by readers. Any objects that - * were detached from the data structure in the third phase can be - * reclaimed after the start of the current phase, because a grace - * period (the previous phase) has elapsed since the objects were - * detached. - * - * A phase number can be used by a lock-free data structure (such as a - * qp-trie) to record when an object was detached. QSBR calls the data - * structure's reclaimer function, passing a phase number indicating - * that objects detached in that phase can now be reclaimed - * - * In general, there will be several (maybe many) write operations - * during a grace period. The lock-free data structures that use QSBR - * will collect their reclamation work from all these writes into a - * batch per phase, i.e. per grace period. - * - * There is some example code in `doc/dev/qsbr.md`, with pointers to - * less terse introductions to QSBR and other overview material. - */ - -#define ISC_QSBR_PHASE_BITS 2 - -typedef unsigned int isc_qsbr_phase_t; -/*%< - * A grace period phase number. It can be stored in a bitfield of size - * ISC_QSBR_PHASE_BITS. You can use zero to indicate "no phase". - * (Don't assume the maximum is three: We might want to increase the - * number of phases so that there is more than one unoccupied phase. - * This would allow concurrent reclamation of objects released in - * multiple unoccupied phases.) - */ - -typedef void -isc_qsbreclaimer_t(isc_qsbr_phase_t phase); -/*%< - * The type of memory reclaimer callback functions. - * - * The `phase` identifies which objects are to be reclaimed. - * - * An isc_qsbreclaimer_t can call isc_qsbr_activate() if it could not - * reclaim everything and needs to be called again. - */ - -typedef struct isc_qsbr_registered { - ISC_SLINK(struct isc_qsbr_registered) link; - isc_qsbreclaimer_t *func; -} isc_qsbr_registered_t; -/*%< - * Each reclaimer callback has a static `isc_qsbr_registered_t` object - * so that QSBR can find it. - */ - -void -isc__qsbr_register(isc_qsbr_registered_t *reg); -/*%< - * Requires: - * \li reclaimer->link is not linked - * \li reclaimer->func is not NULL - */ - -#define isc_qsbr_register(cb) \ - do { \ - static isc_qsbr_registered_t registration = { \ - .link = ISC_SLINK_INITIALIZER, \ - .func = cb, \ - }; \ - isc__qsbr_register(®istration); \ - } while (0) -/*%< - * Register a callback function with QSBR. This macro should be used - * inside an `ISC_CONSTRUCTOR` function. There should be one callback - * for eack lock-free data structure implementation, which is able to - * reclaim all the unused memory across all instances of its data - * structure. - */ - -isc_qsbr_phase_t -isc_qsbr_phase(isc_loopmgr_t *loopmgr); -/*%< - * Get the current phase, to use for marking detached objects. - * - * To commit a write that requires cleanup, the ordering must be: - * - * - Use atomic_store_release() to commit the data structure's new - * root pointer; release ordering ensures that the interior changes - * are written before the root pointer. - * - * - Call isc_qsbr_phase() to get the phase to be used for marking - * objects to reclaim. This must happen after the commit, to ensure - * there is at least one grace period between commit and cleanup. - * - * - Pass the same phase to isc_qsbr_activate() so that the reclaimer - * will be called after a grace period has passed. - */ - -void -isc_qsbr_activate(isc_loopmgr_t *loopmgr, isc_qsbr_phase_t phase); -/*%< - * Tell QSBR that objects have been detached and will need reclaiming - * after a grace period. - */ - -/*********************************************************************** - * - * private parts - */ - -/* - * Accessors and constructors for the `grace` variable. - * It contains two bit fields: - * - * - the global phase in the lower ISC_QSBR_PHASE_BITS - * - * - a thread counter in the upper bits - */ - -#define ISC_QSBR_ONE_THREAD (1 << ISC_QSBR_PHASE_BITS) -#define ISC_QSBR_PHASE_MAX (ISC_QSBR_ONE_THREAD - 1) - -#define ISC_QSBR_GRACE_PHASE(grace) (grace & ISC_QSBR_PHASE_MAX) -#define ISC_QSBR_GRACE_THREADS(grace) (grace >> ISC_QSBR_PHASE_BITS) -#define ISC_QSBR_GRACE(threads, phase) \ - ((threads << ISC_QSBR_PHASE_BITS) | phase) - -typedef struct isc_qsbr { - /* - * The `grace` variable keeps track of the current grace period. - * When the phase changes, the thread counter is set to the number of - * threads that need to observe the new phase before the grace period - * can end. - * - * The thread counter is an add-on to the usual EBR fuzzy barrier. - * Counting threads through the barrier adds multi-thread update - * contention, and in EBR the fuzzy barrier runs frequently enough - * (on every access) that it's important to minimize its cost. With - * QSBR, the fuzzy barrier runs less frequently (roughly, per loop, - * instead of per-callback) so contention is less of a concern. The - * thread counter helps to reduce reclaim latency, because unlike EBR - * we don't probabilistically check, we know deterministically when - * all threads have changed phase. - */ - atomic_uint_fast32_t grace; - - /* - * A flag for each phase indicating that there will be work to - * do, so we don't invoke the reclaim machinery unnecessarily. - * Set by `isc_qsbr_activate()` and cleared before the reclaimer - * functions are invoked (so they can re-set their flag if - * necessary). - */ - atomic_uint_fast32_t activated; - - /* - * The time of the last phase transition (isc_nanosecs_t). Used - * to ensure that grace periods do not last forever. We use - * `isc_time_monotonic()` because we need the same time in all - * threads. (`uv_now()` is different in different threads.) - */ - atomic_uint_fast64_t transition_time; - -} isc_qsbr_t; - -/* - * When we start there is no worker thread yet, so the thread - * count is equal to the number of loops. The global phase starts - * off at one (it must always be nonzero). - */ -#define ISC_QSBR_INITIALIZER(nloops) \ - (isc_qsbr_t) { \ - .grace = ISC_QSBR_GRACE(nloops, 1), \ - .transition_time = isc_time_monotonic(), \ - } - -/* - * For use by tests that need to explicitly drive QSBR phase transitions. - */ -void -isc__qsbr_quiescent_state(isc_loop_t *loop); - -/* - * Used by the loopmgr - */ -void -isc__qsbr_quiescent_cb(uv_prepare_t *handle); -void -isc__qsbr_destroy(isc_loopmgr_t *loopmgr); diff --git a/lib/isc/loop.c b/lib/isc/loop.c index 1cef7d35ad..41c7f656f0 100644 --- a/lib/isc/loop.c +++ b/lib/isc/loop.c @@ -26,14 +26,12 @@ #include #include #include -#include #include #include #include #include #include #include -#include #include #include #include @@ -151,7 +149,6 @@ destroy_cb(uv_async_t *handle) { uv_close(&loop->run_trigger, isc__job_close); uv_close(&loop->destroy_trigger, NULL); uv_close(&loop->pause_trigger, NULL); - uv_close(&loop->wakeup_trigger, NULL); uv_close(&loop->quiescent, NULL); uv_walk(&loop->loop, loop_walk_cb, (char *)"destroy_cb"); @@ -162,8 +159,6 @@ shutdown_cb(uv_async_t *handle) { isc_loop_t *loop = uv_handle_get_data(handle); isc_loopmgr_t *loopmgr = loop->loopmgr; - loop->shuttingdown = true; - /* Make sure, we can't be called again */ uv_close(&loop->shutdown_trigger, shutdown_trigger_close_cb); @@ -185,12 +180,6 @@ shutdown_cb(uv_async_t *handle) { UV_RUNTIME_CHECK(uv_async_send, r); } -static void -wakeup_cb(uv_async_t *handle) { - /* we only woke up to make the loop take a spin */ - UNUSED(handle); -} - static void loop_init(isc_loop_t *loop, isc_loopmgr_t *loopmgr, uint32_t tid) { *loop = (isc_loop_t){ @@ -226,9 +215,6 @@ loop_init(isc_loop_t *loop, isc_loopmgr_t *loopmgr, uint32_t tid) { UV_RUNTIME_CHECK(uv_async_init, r); uv_handle_set_data(&loop->destroy_trigger, loop); - r = uv_async_init(&loop->loop, &loop->wakeup_trigger, wakeup_cb); - UV_RUNTIME_CHECK(uv_async_init, r); - r = uv_prepare_init(&loop->loop, &loop->quiescent); UV_RUNTIME_CHECK(uv_prepare_init, r); uv_handle_set_data(&loop->quiescent, loop); @@ -245,7 +231,7 @@ loop_init(isc_loop_t *loop, isc_loopmgr_t *loopmgr, uint32_t tid) { static void quiescent_cb(uv_prepare_t *handle) { - isc__qsbr_quiescent_cb(handle); + UNUSED(handle); #if defined(RCU_QSBR) /* safe memory reclamation */ @@ -340,7 +326,6 @@ isc_loopmgr_create(isc_mem_t *mctx, uint32_t nloops, isc_loopmgr_t **loopmgrp) { loopmgr = isc_mem_get(mctx, sizeof(*loopmgr)); *loopmgr = (isc_loopmgr_t){ .nloops = nloops, - .qsbr = ISC_QSBR_INITIALIZER(nloops), }; isc_mem_attach(mctx, &loopmgr->mctx); @@ -465,22 +450,6 @@ isc_loopmgr_run(isc_loopmgr_t *loopmgr) { isc_thread_main(loop_thread, &loopmgr->loops[0]); } -void -isc_loopmgr_wakeup(isc_loopmgr_t *loopmgr) { - REQUIRE(VALID_LOOPMGR(loopmgr)); - - for (size_t i = 0; i < loopmgr->nloops; i++) { - isc_loop_t *loop = &loopmgr->loops[i]; - - /* Skip current loop */ - if (i == isc_tid()) { - continue; - } - - uv_async_send(&loop->wakeup_trigger); - } -} - void isc_loopmgr_pause(isc_loopmgr_t *loopmgr) { REQUIRE(VALID_LOOPMGR(loopmgr)); diff --git a/lib/isc/loop_p.h b/lib/isc/loop_p.h index b9bef13b19..185406d0d7 100644 --- a/lib/isc/loop_p.h +++ b/lib/isc/loop_p.h @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include @@ -76,9 +75,7 @@ struct isc_loop { uv_async_t destroy_trigger; /* safe memory reclamation */ - uv_async_t wakeup_trigger; uv_prepare_t quiescent; - isc_qsbr_phase_t qsbr_phase; }; /* @@ -113,9 +110,6 @@ struct isc_loopmgr { /* per-thread objects */ isc_loop_t *loops; - - /* safe memory reclamation */ - isc_qsbr_t qsbr; }; /* diff --git a/lib/isc/qsbr.c b/lib/isc/qsbr.c deleted file mode 100644 index c122770c14..0000000000 --- a/lib/isc/qsbr.c +++ /dev/null @@ -1,393 +0,0 @@ -/* - * Copyright (C) Internet Systems Consortium, Inc. ("ISC") - * - * SPDX-License-Identifier: MPL-2.0 - * - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, you can obtain one at https://mozilla.org/MPL/2.0/. - * - * See the COPYRIGHT file distributed with this work for additional - * information regarding copyright ownership. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "loop_p.h" - -#define MAX_GRACE_PERIOD_NS 53 * NS_PER_MS - -#if 0 -#define TRACE(fmt, ...) \ - isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_OTHER, \ - ISC_LOG_DEBUG(7), "%s:%u:%s():t%u: " fmt, __FILE__, \ - __LINE__, __func__, isc_tid(), ##__VA_ARGS__) -#else -#define TRACE(...) -#endif - -static ISC_STACK(isc_qsbr_registered_t) qsbreclaimers = ISC_STACK_INITIALIZER; - -static void -reclaim_cb(void *arg); -static void -reclaimed_cb(void *arg); - -/**********************************************************************/ - -/* - * 3,2,1,3,2,1,... - */ -static isc_qsbr_phase_t -change_phase(isc_qsbr_phase_t phase) { - return (--phase > 0 ? phase : ISC_QSBR_PHASE_MAX); -} - -/* - * For marking or checking that a phase has cleanup work to do. - */ -static unsigned int -active_bit(isc_qsbr_phase_t phase) { - return (1 << phase); -} - -/* - * Extract the global phase from the grace period state. - */ -static isc_qsbr_phase_t -global_phase(isc_qsbr_t *qsbr, memory_order m_o) { - uint32_t grace = atomic_load_explicit(&qsbr->grace, m_o); - return (ISC_QSBR_GRACE_PHASE(grace)); -} - -/* - * Record that the current thread has passed the barrier. - * Returns true if more threads still need to pass. - * - * ATOMIC: acquire-release, to ensure that this is not reordered wrt - * read-only accesses to lock-free data structures. This implements the - * ordering requirements of a quiescent state. - */ -static bool -fuzzy_barrier_not_yet(isc_qsbr_t *qsbr) { - uint32_t grace = atomic_fetch_sub_acq_rel(&qsbr->grace, - ISC_QSBR_ONE_THREAD); - uint32_t threads = ISC_QSBR_GRACE_THREADS(grace); - return (threads > 1); -} - -/* - * Ungracefully drive all cleanup work to completion. - * - * ATOMIC: everything is relaxed, because we assume that concurrent - * readers have already finished. `reclaim_cb()` uses the `activated` - * flags to ensure it is OK that threads will race to complete the - * cleanup. - */ -static void -qsbr_shutdown(isc_loopmgr_t *loopmgr) { - isc_qsbr_t *qsbr = &loopmgr->qsbr; - isc_qsbr_phase_t phase = global_phase(qsbr, memory_order_relaxed); - uint32_t threads = isc_loopmgr_nloops(loopmgr); - uint32_t grace; - - while (atomic_load_relaxed(&qsbr->activated) != 0) { - reclaim_cb(loopmgr); - phase = change_phase(phase); - grace = ISC_QSBR_GRACE(threads, phase); - atomic_store_relaxed(&qsbr->grace, grace); - } -} - -/* - * On a quiet server that does not have enough network traffic to keep - * all its threads spinning, grace periods might extend indefinitely. - * So check if we have been waiting an unreasonably long time since - * the last phase change. If so, send a no-op async request to every - * thread to make them all cycle through a quiescent state. - */ -static void -maybe_wakeup(isc_loop_t *loop) { - isc_loopmgr_t *loopmgr = loop->loopmgr; - isc_qsbr_t *qsbr = &loopmgr->qsbr; - - /* - * ATOMIC: relaxed is OK here because we don't use any values guarded - * by the `activated` flags. - */ - if (atomic_load_relaxed(&qsbr->activated) == 0) { - return; - } - if (loop->shuttingdown) { - qsbr_shutdown(loopmgr); - return; - } - - /* - * ATOMIC: relaxed, because the `transition_time` doesn't guard any - * other values, just the isc_loopmgr_wakeup() call below. - */ - atomic_uint_fast64_t *qsbr_ttp = &qsbr->transition_time; - isc_nanosecs_t now = isc_time_monotonic(); - isc_nanosecs_t start = atomic_load_relaxed(qsbr_ttp); - if (now < start + MAX_GRACE_PERIOD_NS) { - return; - } - - /* - * To stop other threads from also invoking `isc_loopmgr_wakeup()`, - * we try to push the timer into the future (expecting that it will - * not trigger again), and quit if someone else got there first. - * ATOMIC: relaxed, as before; strong, because there is no retry loop. - */ - if (!atomic_compare_exchange_strong_relaxed(qsbr_ttp, &start, now)) { - return; - } - - TRACE("long grace period of %llu ns, waking up other threads", - (unsigned long long)(now - start)); - - isc_loopmgr_wakeup(loopmgr); -} - -/* - * Callers use the fuzzy barrier to ensure only one thread can enter - * this function at a time. - * - * Phase transitions happen at roughly the same frequency that IO - * event loops cycle, limited by the slowest loop in each cycle. - */ -static void -phase_transition(isc_loop_t *loop, isc_qsbr_phase_t current_phase) { - isc_loopmgr_t *loopmgr = loop->loopmgr; - isc_qsbr_t *qsbr = &loopmgr->qsbr; - - if (loop->shuttingdown) { - qsbr_shutdown(loopmgr); - return; - } - - /* - * After we change phase, threads will be in either the `current_phase` - * or the `next_phase`. We will reclaim memory from the `third_phase`. - * - * ATOMIC: relaxed is OK here because the necessary synchronization - * happens in `reclaim_cb()`. - */ - isc_qsbr_phase_t next_phase = change_phase(current_phase); - isc_qsbr_phase_t third_phase = change_phase(next_phase); - bool activated = atomic_load_relaxed(&qsbr->activated) & - active_bit(third_phase); - - /* - * Reset the wakeup timer, and log the length of the grace period. - * ATOMIC: relaxed, per the commentary in `maybe_wakeup()`. - */ - atomic_uint_fast64_t *qsbr_tt = &qsbr->transition_time; - isc_nanosecs_t now = isc_time_monotonic(); - isc_nanosecs_t start = atomic_exchange_relaxed(qsbr_tt, now); - TRACE("phase %u -> %u after grace period of %f ms", current_phase, - next_phase, (double)(now - start) / NS_PER_MS); - UNUSED(start); /* ifndef TRACE() */ - - /* - * Work out the threads counter for this grace period. - * - * We need to add one for any reclamation worker thread, to - * prevent us from changing phase before the work is done. If - * we change too early, any newly detached objects will be - * marked with the same phase as the running reclaimer, which - * might lead to them being free()d too soon. - */ - uint32_t threads = isc_loopmgr_nloops(loopmgr) + (activated ? 1 : 0); - - /* - * Start the new grace period. - * - * ATOMIC: release, to pair with the load-acquire in `reclaim_cb()` - * which is spawned in a separate worker thread. - */ - uint32_t grace = ISC_QSBR_GRACE(threads, next_phase); - atomic_store_release(&qsbr->grace, grace); - - if (activated) { - isc_work_enqueue(loop, reclaim_cb, reclaimed_cb, loopmgr); - } -} - -/* - * This function is called once per cycle of each IO event loop by the - * `uv_prepare` callback below. - */ -void -isc__qsbr_quiescent_state(isc_loop_t *loop) { - isc_loopmgr_t *loopmgr = loop->loopmgr; - isc_qsbr_t *qsbr = &loopmgr->qsbr; - - /* - * ATOMIC: relaxed. If we are in phase then we don't need to - * synchronize; if we are not then this thread's presence in - * the thread counter will prevent the phase from changing - * before we get to the fuzzy barrier. - */ - isc_qsbr_phase_t phase = global_phase(qsbr, memory_order_relaxed); - if (loop->qsbr_phase == phase) { - maybe_wakeup(loop); - return; - } - - /* - * Enter the current phase and count us out of the previous phase. - */ - loop->qsbr_phase = phase; - if (fuzzy_barrier_not_yet(qsbr)) { - maybe_wakeup(loop); - return; - } - - /* - * We were the last thread to enter the current phase so the - * grace period is up. No other thread can reach this point. - */ - phase_transition(loop, phase); -} - -void -isc__qsbr_quiescent_cb(uv_prepare_t *handle) { - isc_loop_t *loop = uv_handle_get_data((uv_handle_t *)handle); - isc__qsbr_quiescent_state(loop); -} - -static void -reclaimed_cb(void *arg) { - /* we are back on a loop thread */ - isc_loopmgr_t *loopmgr = arg; - isc_qsbr_t *qsbr = &loopmgr->qsbr; - isc_loop_t *loop = CURRENT_LOOP(loopmgr); - - /* - * Remove the reclaimers from the thread count, so that the - * next grace period can start. - */ - if (fuzzy_barrier_not_yet(qsbr)) { - return; - } - - /* - * The reclaimers were the last thread to be counted out: every - * other thread already passed through a quiescent state. - * - * We expect loop->qsbr_phase == global_phase() at this point, - * except during shutdown when the phase shifts rapidly. Also, - * the current loop might not have received the shutdown - * message yet, so it seems easiest to omit the assertion. - * - * ATOMIC: relaxed, the fuzzy barrier already synchronized. - */ - TRACE("reclaimers overran"); - phase_transition(loop, global_phase(qsbr, memory_order_relaxed)); -} - -static void -reclaim_cb(void *arg) { - /* we are on a work thread not a loop thread */ - isc_loopmgr_t *loopmgr = arg; - isc_qsbr_t *qsbr = &loopmgr->qsbr; - - /* - * The global phase has just been bumped by a `phase_transition()` - * and it cannot change again until the grace period is up, which - * cannot happen until we have finished working. - * - * ATOMIC: acquire, to pair with the release in `phase_transition()`. - * - * The phase we are to clean up is 2 before the current phase, - * which is the same as the one after the current phase (mod 3). - */ - isc_qsbr_phase_t cur_phase = global_phase(qsbr, memory_order_acquire); - isc_qsbr_phase_t third_phase = change_phase(cur_phase); - unsigned int third_bit = active_bit(third_phase); - - /* - * If any reclaimers need to be called again later, they can use - * `isc_qsbr_activate()`, so we need to clear the bit first. - * - * ATOMIC: acquire, so that `isc_qsbr_activate()` happens before - * the callbacks are invoked. - */ - uint32_t activated = atomic_fetch_and_explicit( - &qsbr->activated, ~third_bit, memory_order_acquire); - - /* this can happen when we are racing to clean up on shutdown */ - if ((activated & third_bit) == 0) { - return; - } - - isc_qsbr_registered_t *reclaimer = ISC_STACK_TOP(qsbreclaimers); - while (reclaimer != NULL) { - reclaimer->func(third_phase); - reclaimer = ISC_SLINK_NEXT(reclaimer, link); - } -} - -void -isc__qsbr_register(isc_qsbr_registered_t *reclaimer) { - REQUIRE(reclaimer->func != NULL); - ISC_STACK_PUSH(qsbreclaimers, reclaimer, link); -} - -/* - * ATOMIC: This function needs to ensure that the global phase is read - * after a write has committed. Acquire/release ordering is not sufficient - * for ordering between separate atomics (the data structure's root pointer - * and the global phase), so it must be sequentially consistent. - * - * In general, the phases up to and including the next phase transition - * look like: - * - * 1. local phase - * 2. global phase - * 3. next phase - * 1. third phase - * - * i.e. some threads are still one behind the global phase, on the same - * phase that will be cleaned up immediately after the phase transition. - * - * This function is called just after a write commits. It's likely that - * some threads on the global phase (2) are using a version of the data - * structure from before the write, and they can continue using it while - * the straggler threads (1) catch up and cause a phase transition. - * - * The writer can be one of the straggler threads. If it incorrectly marks - * cleanup work with its local phase (1), memory will be reclaimed - * immediately after the next phase transition (when the third phase is - * also 1), which could be almost immediately when the writer returns to - * the event loop. This will cause a use-after-free for existing readers - * (in phase 2). - * - * More straightforwardly, we need to be able to queue up reclaim work from - * a thread that isn't running a loop, which also means this function has - * to return the global phase. - */ -isc_qsbr_phase_t -isc_qsbr_phase(isc_loopmgr_t *loopmgr) { - isc_qsbr_t *qsbr = &loopmgr->qsbr; - return (global_phase(qsbr, memory_order_seq_cst)); -} - -void -isc_qsbr_activate(isc_loopmgr_t *loopmgr, isc_qsbr_phase_t phase) { - /* - * ATOMIC: release ordering ensures that writing the cleanup lists - * happens before the callback is invoked from a worker thread. - */ - atomic_fetch_or_release(&loopmgr->qsbr.activated, active_bit(phase)); -} From 6217e434b57bd5d60ed69f792ae9a1a65a008f57 Mon Sep 17 00:00:00 2001 From: Tony Finch Date: Wed, 8 Mar 2023 14:28:06 +0000 Subject: [PATCH 05/12] Refactor the core qp-trie code to use liburcu A `dns_qmpulti_t` no longer needs to know about its loopmgr. We no longer keep a linked list of `dns_qpmulti_t` that have reclamation work, and we no longer mark chunks with the phase in which they are to be reclaimed. Instead, empty chunks are listed in an array in a `qp_rcu_t`, which is passed to call_rcu(). --- lib/dns/Makefile.am | 2 + lib/dns/include/dns/qp.h | 22 ++-- lib/dns/qp.c | 242 ++++++++++++++++++--------------------- lib/dns/qp_p.h | 52 +++++---- 4 files changed, 153 insertions(+), 165 deletions(-) diff --git a/lib/dns/Makefile.am b/lib/dns/Makefile.am index 1db6d6585a..a70d7e7381 100644 --- a/lib/dns/Makefile.am +++ b/lib/dns/Makefile.am @@ -263,6 +263,7 @@ libdns_la_CPPFLAGS = \ $(AM_CPPFLAGS) \ $(LIBDNS_CFLAGS) \ $(LIBISC_CFLAGS) \ + $(LIBURCU_CFLAGS) \ $(LIBUV_CFLAGS) \ $(OPENSSL_CFLAGS) @@ -272,6 +273,7 @@ libdns_la_LDFLAGS = \ libdns_la_LIBADD = \ $(LIBISC_LIBS) \ + $(LIBURCU_LIBS) \ $(LIBUV_LIBS) \ $(OPENSSL_LIBS) diff --git a/lib/dns/include/dns/qp.h b/lib/dns/include/dns/qp.h index a425a4c257..d4bf9b3650 100644 --- a/lib/dns/include/dns/qp.h +++ b/lib/dns/include/dns/qp.h @@ -47,17 +47,11 @@ * version of the trie. * * For fast concurrent reads, call `dns_qpmulti_query()` to fill in a - * `dns_qpread_t`, which must be allocated on the stack. Readers can - * access a single version of the trie within the scope of an isc_loop - * thread (NOT an isc_work thread). We rely on the loop to bound the - * lifetime of a `dns_qpread_t`, instead of using locks. Readers are - * not blocked by any write activity, and vice versa. - * - * For read-only access outside the scope of a loop, such as from an - * isc_work callback, `dns_qpmulti_lockedread()`. This looks like a - * query transaction; the difference is that a locked read transaction - * takes the `dns_qpmulti_t` mutex. When you have finished with a - * `dns_qpread_t`, call `dns_qpread_destroy()` to release the mutex. + * `dns_qpread_t`, which must be allocated on the stack. This gives + * the reader access to a single version of the trie. The reader's + * thread must be registered with `liburcu`, which is normally taken + * care of by `libisc`. Readers are not blocked by any write activity, + * and vice versa. * * For reads that need a stable view of the trie for multiple cycles * of an isc_loop, or which can be used from any thread, call @@ -314,17 +308,15 @@ dns_qp_destroy(dns_qp_t **qptp); */ void -dns_qpmulti_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr, - const dns_qpmethods_t *methods, void *uctx, +dns_qpmulti_create(isc_mem_t *mctx, const dns_qpmethods_t *methods, void *uctx, dns_qpmulti_t **qpmp); /*%< * Create a multi-threaded qp-trie. * * Requires: * \li `mctx` is a pointer to a valid memory context. - * \li 'loopmgr' is a valid loop manager. - * \li `qpmp != NULL && *qpmp == NULL` * \li all the methods are non-NULL + * \li `qpmp != NULL && *qpmp == NULL` * * Ensures: * \li `*qpmp` is a pointer to a valid multi-threaded qp-trie diff --git a/lib/dns/qp.c b/lib/dns/qp.c index fec71033b1..05d9b76a1b 100644 --- a/lib/dns/qp.c +++ b/lib/dns/qp.c @@ -28,17 +28,16 @@ #include #include -#include #include #include #include -#include #include #include #include #include #include #include +#include #include #include @@ -437,6 +436,7 @@ realloc_chunk_arrays(dns_qp_t *qp, qp_chunk_t newmax) { } memset(&qp->base->ptr[qp->chunk_max], 0, newptrs - oldptrs); isc_refcount_init(&qp->base->refcount, 1); + qp->base->magic = QPBASE_MAGIC; /* usage array is exclusive to the writer */ size_t oldusage = sizeof(qp->usage[0]) * qp->chunk_max; @@ -555,12 +555,14 @@ chunk_usage(dns_qp_t *qp, qp_chunk_t chunk) { */ static void chunk_discount(dns_qp_t *qp, qp_chunk_t chunk) { - if (qp->usage[chunk].phase == 0) { - INSIST(qp->used_count >= qp->usage[chunk].used); - INSIST(qp->free_count >= qp->usage[chunk].free); - qp->used_count -= qp->usage[chunk].used; - qp->free_count -= qp->usage[chunk].free; + if (qp->usage[chunk].discounted) { + return; } + INSIST(qp->used_count >= qp->usage[chunk].used); + INSIST(qp->free_count >= qp->usage[chunk].free); + qp->used_count -= qp->usage[chunk].used; + qp->free_count -= qp->usage[chunk].free; + qp->usage[chunk].discounted = true; } /* @@ -622,109 +624,97 @@ recycle(dns_qp_t *qp) { } /* - * At the end of a transaction, mark empty but immutable chunks for - * reclamation later. Returns true when chunks need reclaiming later. + * asynchronous cleanup */ -static bool -defer_chunk_reclamation(dns_qp_t *qp, isc_qsbr_phase_t phase) { - unsigned int reclaim = 0; +static void +reclaim_chunks_cb(struct rcu_head *rcu_head) { + qp_rcuctx_t *rcuctx = caa_container_of(rcu_head, qp_rcuctx_t, rcu_head); + __tsan_acquire(rcuctx); + REQUIRE(QPRCU_VALID(rcuctx)); + dns_qpmulti_t *multi = rcuctx->multi; + REQUIRE(QPMULTI_VALID(multi)); - for (qp_chunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { - if (chunk != qp->bump && chunk_usage(qp, chunk) == 0 && - qp->usage[chunk].exists && qp->usage[chunk].immutable && - qp->usage[chunk].phase == 0) - { - chunk_discount(qp, chunk); - qp->usage[chunk].phase = phase; - reclaim++; - } - } + LOCK(&multi->mutex); - if (reclaim > 0) { - LOG_STATS("qp will reclaim %u chunks in phase %u", reclaim, - phase); - } + dns_qp_t *qp = &multi->writer; + REQUIRE(QP_VALID(qp)); - return (reclaim > 0); -} - -static bool -reclaim_chunks(dns_qp_t *qp, isc_qsbr_phase_t phase) { unsigned int free = 0; - bool more = false; - isc_nanosecs_t start = isc_time_monotonic(); - for (qp_chunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { - if (qp->usage[chunk].phase == phase) { - if (qp->usage[chunk].snapshot) { - /* cleanup when snapshot is destroyed */ - qp->usage[chunk].snapfree = true; - } else { - chunk_free(qp, chunk); - free++; - } - } else if (qp->usage[chunk].phase != 0) { - /* - * We need to reclaim more of this trie's memory - * on a later qsbr callback. - */ - more = true; + for (unsigned int i = 0; i < rcuctx->count; i++) { + qp_chunk_t chunk = rcuctx->chunk[i]; + if (qp->usage[chunk].snapshot) { + /* cleanup when snapshot is destroyed */ + qp->usage[chunk].snapfree = true; + } else { + chunk_free(qp, chunk); + free++; } } + isc_mem_putanddetach(&rcuctx->mctx, rcuctx, + STRUCT_FLEX_SIZE(rcuctx, chunk, rcuctx->count)); + isc_nanosecs_t time = isc_time_monotonic() - start; recycle_time += time; if (free > 0) { - LOG_STATS("qp reclaim" PRItime "phase %u free %u chunks", time, - phase, free); + LOG_STATS("qp reclaim" PRItime "free %u chunks", time, free); LOG_STATS("qp reclaim leaf %u live %u used %u free %u hold %u", qp->leaf_count, qp->used_count - qp->free_count, qp->used_count, qp->free_count, qp->hold_count); } - return (more); + UNLOCK(&multi->mutex); } /* - * List of `dns_qpmulti_t`s that have chunks to be reclaimed. - */ -static ISC_ASTACK(dns_qpmulti_t) qsbr_work; - -/* - * When a grace period has passed, this function reclaims any unused memory. + * At the end of a transaction, schedule empty but immutable chunks + * for reclamation later. */ static void -qp_qsbr_reclaimer(isc_qsbr_phase_t phase) { - ISC_STACK(dns_qpmulti_t) drain = ISC_ASTACK_TO_STACK(qsbr_work); - while (!ISC_STACK_EMPTY(drain)) { - /* lock before pop */ - dns_qpmulti_t *multi = ISC_STACK_TOP(drain); - INSIST(QPMULTI_VALID(multi)); - LOCK(&multi->mutex); - ISC_STACK_POP(drain, cleanup); - if (multi->writer.destroy) { - UNLOCK(&multi->mutex); - dns_qpmulti_destroy(&multi); - } else { - if (reclaim_chunks(&multi->writer, phase)) { - /* more to do next time */ - ISC_ASTACK_PUSH(qsbr_work, multi, cleanup); - } - UNLOCK(&multi->mutex); +reclaim_chunks(dns_qpmulti_t *multi) { + dns_qp_t *qp = &multi->writer; + + unsigned int count = 0; + for (qp_chunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { + if (chunk != qp->bump && chunk_usage(qp, chunk) == 0 && + qp->usage[chunk].exists && qp->usage[chunk].immutable && + !qp->usage[chunk].discounted) + { + count++; } } -} -/* - * Register `qp_qsbr_reclaimer()` with QSBR at startup. - */ -static void -qp_qsbr_register(void) ISC_CONSTRUCTOR; -static void -qp_qsbr_register(void) { - isc_qsbr_register(qp_qsbr_reclaimer); + if (count == 0) { + return; + } + + qp_rcuctx_t *rcuctx = + isc_mem_get(qp->mctx, STRUCT_FLEX_SIZE(rcuctx, chunk, count)); + *rcuctx = (qp_rcuctx_t){ + .magic = QPRCU_MAGIC, + .multi = multi, + .count = count, + }; + isc_mem_attach(qp->mctx, &rcuctx->mctx); + + unsigned int i = 0; + for (qp_chunk_t chunk = 0; chunk < qp->chunk_max; chunk++) { + if (chunk != qp->bump && chunk_usage(qp, chunk) == 0 && + qp->usage[chunk].exists && qp->usage[chunk].immutable && + !qp->usage[chunk].discounted) + { + rcuctx->chunk[i++] = chunk; + chunk_discount(qp, chunk); + } + } + + __tsan_release(rcuctx); + call_rcu(&rcuctx->rcu_head, reclaim_chunks_cb); + + LOG_STATS("qp will reclaim %u chunks", count); } /* @@ -1121,6 +1111,8 @@ dns_qpmulti_update(dns_qpmulti_t *multi, dns_qp_t **qptp) { memmove(rollback, qp, sizeof(*rollback)); /* can be uninitialized on the first transaction */ if (rollback->base != NULL) { + INSIST(QPBASE_VALID(rollback->base)); + INSIST(qp->usage != NULL && qp->chunk_max > 0); /* paired with either _commit() or _rollback() */ isc_refcount_increment(&rollback->base->refcount); size_t usage_bytes = sizeof(qp->usage[0]) * qp->chunk_max; @@ -1187,12 +1179,8 @@ dns_qpmulti_commit(dns_qpmulti_t *multi, dns_qp_t **qptp) { recycle(qp); } - /* the reclamation phase must be sampled after the commit */ - isc_qsbr_phase_t phase = isc_qsbr_phase(multi->loopmgr); - if (defer_chunk_reclamation(qp, phase)) { - ISC_ASTACK_ADD(qsbr_work, multi, cleanup); - isc_qsbr_activate(multi->loopmgr, phase); - } + /* schedule the rest for later */ + reclaim_chunks(multi); *qptp = NULL; UNLOCK(&multi->mutex); @@ -1282,31 +1270,11 @@ dns_qpmulti_query(dns_qpmulti_t *multi, dns_qpread_t *qp) { REQUIRE(QPMULTI_VALID(multi)); REQUIRE(qp != NULL); - /* we MUST be in an isc_loop thread */ - qp->tid = isc_tid(); - REQUIRE(qp->tid != ISC_TID_UNKNOWN); - dns_qpmulti_t *whence = reader_open(multi, qp); INSIST(whence == multi); -} -/* - * a locked read takes the mutex - */ - -void -dns_qpmulti_lockedread(dns_qpmulti_t *multi, dns_qpread_t *qp) { - REQUIRE(QPMULTI_VALID(multi)); - REQUIRE(qp != NULL); - - /* we MUST NOT be in an isc_loop thread */ qp->tid = isc_tid(); - REQUIRE(qp->tid == ISC_TID_UNKNOWN); - - LOCK(&multi->mutex); - - dns_qpmulti_t *whence = reader_open(multi, qp); - INSIST(whence == multi); + rcu_read_lock(); } void @@ -1314,10 +1282,8 @@ dns_qpread_destroy(dns_qpmulti_t *multi, dns_qpread_t *qp) { REQUIRE(QPMULTI_VALID(multi)); REQUIRE(QP_VALID(qp)); REQUIRE(qp->tid == isc_tid()); - if (qp->tid == ISC_TID_UNKNOWN) { - UNLOCK(&multi->mutex); - } *qp = (dns_qpread_t){}; + rcu_read_unlock(); } /* @@ -1406,8 +1372,7 @@ dns_qp_create(isc_mem_t *mctx, const dns_qpmethods_t *methods, void *uctx, } void -dns_qpmulti_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr, - const dns_qpmethods_t *methods, void *uctx, +dns_qpmulti_create(isc_mem_t *mctx, const dns_qpmethods_t *methods, void *uctx, dns_qpmulti_t **qpmp) { REQUIRE(qpmp != NULL && *qpmp == NULL); @@ -1415,8 +1380,6 @@ dns_qpmulti_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr, *multi = (dns_qpmulti_t){ .magic = QPMULTI_MAGIC, .reader_ref = INVALID_REF, - .loopmgr = loopmgr, - .cleanup = ISC_SLINK_INITIALIZER, }; isc_mutex_init(&multi->mutex); ISC_LIST_INIT(multi->snapshots); @@ -1468,29 +1431,50 @@ dns_qp_destroy(dns_qp_t **qptp) { isc_mem_putanddetach(&qp->mctx, qp, sizeof(*qp)); } +static void +qpmulti_destroy_cb(struct rcu_head *rcu_head) { + qp_rcuctx_t *rcuctx = caa_container_of(rcu_head, qp_rcuctx_t, rcu_head); + __tsan_acquire(rcuctx); + REQUIRE(QPRCU_VALID(rcuctx)); + dns_qpmulti_t *multi = rcuctx->multi; + REQUIRE(QPMULTI_VALID(multi)); + dns_qp_t *qp = &multi->writer; + REQUIRE(QP_VALID(qp)); + + REQUIRE(rcuctx->count == 0); + + destroy_guts(qp); + isc_mutex_destroy(&multi->mutex); + isc_mem_putanddetach(&rcuctx->mctx, rcuctx, + STRUCT_FLEX_SIZE(rcuctx, chunk, rcuctx->count)); + isc_mem_putanddetach(&qp->mctx, multi, sizeof(*multi)); +} + void dns_qpmulti_destroy(dns_qpmulti_t **qpmp) { + dns_qp_t *qp = NULL; + dns_qpmulti_t *multi = NULL; + qp_rcuctx_t *rcuctx = NULL; + REQUIRE(qpmp != NULL); REQUIRE(QPMULTI_VALID(*qpmp)); - dns_qpmulti_t *multi = *qpmp; - dns_qp_t *qp = &multi->writer; + multi = *qpmp; + qp = &multi->writer; *qpmp = NULL; REQUIRE(QP_VALID(qp)); REQUIRE(multi->rollback == NULL); REQUIRE(ISC_LIST_EMPTY(multi->snapshots)); - LOCK(&multi->mutex); - if (ISC_SLINK_LINKED(multi, cleanup)) { - qp->destroy = true; - UNLOCK(&multi->mutex); - } else { - destroy_guts(qp); - UNLOCK(&multi->mutex); - isc_mutex_destroy(&multi->mutex); - isc_mem_putanddetach(&qp->mctx, multi, sizeof(*multi)); - } + rcuctx = isc_mem_get(qp->mctx, STRUCT_FLEX_SIZE(rcuctx, chunk, 0)); + *rcuctx = (qp_rcuctx_t){ + .magic = QPRCU_MAGIC, + .multi = multi, + }; + isc_mem_attach(qp->mctx, &rcuctx->mctx); + __tsan_release(rcuctx); + call_rcu(&rcuctx->rcu_head, qpmulti_destroy_cb); } /*********************************************************************** diff --git a/lib/dns/qp_p.h b/lib/dns/qp_p.h index 90d05a20d8..e9054da953 100644 --- a/lib/dns/qp_p.h +++ b/lib/dns/qp_p.h @@ -301,11 +301,9 @@ ref_cell(qp_ref_t ref) { * transactions, but it must become immutable when an update * transaction is opened.) * - * When a chunk becomes empty (wrt the latest version of the trie), we - * note the QSBR phase after which no old versions of the trie will - * need the chunk and it will be safe to free(). There are a few flags - * used to mark which chunks are still needed by snapshots after the - * chunks have passed their normal reclamation phase. + * There are a few flags used to mark which chunks are still needed by + * snapshots after the chunks have passed their normal reclamation + * phase. */ typedef struct qp_usage { /*% the allocation point, increases monotonically */ @@ -316,14 +314,14 @@ typedef struct qp_usage { bool exists : 1; /*% is this chunk shared? [MT] */ bool immutable : 1; + /*% already subtracted from multi->*_count [MT] */ + bool discounted : 1; /*% is a snapshot using this chunk? [MT] */ bool snapshot : 1; /*% tried to free it but a snapshot needs it [MT] */ bool snapfree : 1; /*% for mark/sweep snapshot flag updates [MT] */ bool snapmark : 1; - /*% in which phase did this chunk become unused? [MT] */ - isc_qsbr_phase_t phase : ISC_QSBR_PHASE_BITS; } qp_usage_t; /* @@ -335,7 +333,7 @@ typedef struct qp_usage { * A `dns_qpbase_t` counts references from `dns_qp_t` objects and * from packed readers, but not from `dns_qpread_t` nor from * `dns_qpsnap_t` objects. Refcount adjustments for `dns_qpread_t` - * would wreck multicore scalability; instead we rely on QSBR. + * would wreck multicore scalability; instead we rely on RCU. * * The `usage` array determines when a chunk is no longer needed: old * chunk pointers in old `base` arrays are ignored. (They can become @@ -349,10 +347,25 @@ typedef struct qp_usage { * busy slots that are in use by readers. */ struct dns_qpbase { + unsigned int magic; isc_refcount_t refcount; qp_node_t *ptr[]; }; +/* + * Chunks that may be in use by readers are reclaimed asynchronously. + * When a transaction commits, immutable chunks that are now empty are + * listed in a `qp_rcuctx_t` structure and passed to `call_rcu()`. + */ +typedef struct qp_rcuctx { + unsigned int magic; + struct rcu_head rcu_head; + isc_mem_t *mctx; + dns_qpmulti_t *multi; + qp_chunk_t count; + qp_chunk_t chunk[]; +} qp_rcuctx_t; + /* * Returns true when the base array can be free()d. */ @@ -382,10 +395,14 @@ ref_ptr(dns_qpreadable_t qpr, qp_ref_t ref) { #define QPITER_MAGIC ISC_MAGIC('q', 'p', 'i', 't') #define QPMULTI_MAGIC ISC_MAGIC('q', 'p', 'm', 'v') #define QPREADER_MAGIC ISC_MAGIC('q', 'p', 'r', 'x') +#define QPBASE_MAGIC ISC_MAGIC('q', 'p', 'b', 'p') +#define QPRCU_MAGIC ISC_MAGIC('q', 'p', 'c', 'b') -#define QP_VALID(p) ISC_MAGIC_VALID(p, QP_MAGIC) -#define QPITER_VALID(p) ISC_MAGIC_VALID(p, QPITER_MAGIC) -#define QPMULTI_VALID(p) ISC_MAGIC_VALID(p, QPMULTI_MAGIC) +#define QP_VALID(qp) ISC_MAGIC_VALID(qp, QP_MAGIC) +#define QPITER_VALID(qp) ISC_MAGIC_VALID(qp, QPITER_MAGIC) +#define QPMULTI_VALID(qp) ISC_MAGIC_VALID(qp, QPMULTI_MAGIC) +#define QPBASE_VALID(qp) ISC_MAGIC_VALID(qp, QPBASE_MAGIC) +#define QPRCU_VALID(qp) ISC_MAGIC_VALID(qp, QPRCU_MAGIC) /* * Polymorphic initialization of the `dns_qpreader_t` prefix. @@ -504,8 +521,6 @@ struct dns_qp { enum { QP_NONE, QP_WRITE, QP_UPDATE } transaction_mode : 2; /*% compact the entire trie [MT] */ bool compact_all : 1; - /*% destroy the trie asynchronously [MT] */ - bool destroy : 1; /*% optionally when compiled with fuzzing support [MT] */ bool write_protect : 1; }; @@ -517,9 +532,6 @@ struct dns_qp { * of the trie. See the "packed reader nodes" section below for a * description of what it points to. * - * We need access to the loopmgr to hook into QSBR safe memory reclamation. - * It is constant after initialization. - * * The main object under the protection of the mutex is the `writer` * containing all the allocator state. There can be a backup copy when * we want to be able to rollback an update transaction. @@ -537,8 +549,6 @@ struct dns_qp { */ struct dns_qpmulti { uint32_t magic; - /*% safe memory reclamation context (const) */ - isc_loopmgr_t *loopmgr; /*% pointer to current packed reader */ atomic_ptr(qp_node_t) reader; /*% the mutex protects the rest of this structure */ @@ -551,8 +561,6 @@ struct dns_qpmulti { dns_qp_t *rollback; /*% all snapshots of this trie */ ISC_LIST(dns_qpsnap_t) snapshots; - /*% safe memory reclamation work list */ - ISC_SLINK(dns_qpmulti_t) cleanup; }; /*********************************************************************** @@ -878,13 +886,15 @@ static inline dns_qpmulti_t * unpack_reader(dns_qpreader_t *qp, qp_node_t *reader) { INSIST(reader_valid(reader)); dns_qpmulti_t *multi = node_pointer(&reader[0]); + dns_qpbase_t *base = node_pointer(&reader[1]); INSIST(QPMULTI_VALID(multi)); + INSIST(QPBASE_VALID(base)); *qp = (dns_qpreader_t){ .magic = QP_MAGIC, .uctx = multi->writer.uctx, .methods = multi->writer.methods, .root_ref = node32(&reader[1]), - .base = node_pointer(&reader[1]), + .base = base, }; return (multi); } From 9882a6ef904b0405d0e048407db79e91c078ce22 Mon Sep 17 00:00:00 2001 From: Tony Finch Date: Thu, 27 Apr 2023 12:47:03 +0100 Subject: [PATCH 06/12] The zone table no longer depends on the loop manager This reverts some of the changes in commit b171cacf4f0123ba because now it isn't necessary to pass the loopmgr around. --- bin/delv/delv.c | 3 +-- bin/named/server.c | 3 +-- bin/tests/system/pipelined/pipequeries.c | 2 +- bin/tools/mdig.c | 2 +- fuzz/dns_message_checksig.c | 3 +-- lib/dns/client.c | 3 +-- lib/dns/include/dns/view.h | 4 ++-- lib/dns/include/dns/zt.h | 3 +-- lib/dns/view.c | 5 ++--- lib/dns/zt.c | 12 ++++-------- tests/bench/qpmulti.c | 3 +-- tests/dns/qpmulti_test.c | 2 +- tests/libtest/dns.c | 2 +- 13 files changed, 18 insertions(+), 29 deletions(-) diff --git a/bin/delv/delv.c b/bin/delv/delv.c index fc10c7137c..6ae6af9183 100644 --- a/bin/delv/delv.c +++ b/bin/delv/delv.c @@ -2147,8 +2147,7 @@ run_server(void *arg) { CHECK(ns_interfacemgr_create(mctx, sctx, loopmgr, netmgr, dispatchmgr, NULL, false, &interfacemgr)); - CHECK(dns_view_create(mctx, loopmgr, dns_rdataclass_in, "_default", - &view)); + CHECK(dns_view_create(mctx, dns_rdataclass_in, "_default", &view)); CHECK(dns_cache_create(loopmgr, dns_rdataclass_in, "", &cache)); dns_view_setcache(view, cache, false); dns_cache_detach(&cache); diff --git a/bin/named/server.c b/bin/named/server.c index a13c9c7a4d..d38ecffd01 100644 --- a/bin/named/server.c +++ b/bin/named/server.c @@ -6437,8 +6437,7 @@ create_view(const cfg_obj_t *vconfig, dns_viewlist_t *viewlist, } INSIST(view == NULL); - result = dns_view_create(named_g_mctx, named_g_loopmgr, viewclass, - viewname, &view); + result = dns_view_create(named_g_mctx, viewclass, viewname, &view); if (result != ISC_R_SUCCESS) { return (result); } diff --git a/bin/tests/system/pipelined/pipequeries.c b/bin/tests/system/pipelined/pipequeries.c index 5569ad318d..805e3c5344 100644 --- a/bin/tests/system/pipelined/pipequeries.c +++ b/bin/tests/system/pipelined/pipequeries.c @@ -256,7 +256,7 @@ main(int argc, char *argv[]) { RUNCHECK(dns_requestmgr_create(mctx, dispatchmgr, dispatchv4, NULL, &requestmgr)); - RUNCHECK(dns_view_create(mctx, loopmgr, 0, "_test", &view)); + RUNCHECK(dns_view_create(mctx, 0, "_test", &view)); isc_loopmgr_setup(loopmgr, sendqueries, NULL); isc_loopmgr_run(loopmgr); diff --git a/bin/tools/mdig.c b/bin/tools/mdig.c index 4193f06912..7a366c0e85 100644 --- a/bin/tools/mdig.c +++ b/bin/tools/mdig.c @@ -2141,7 +2141,7 @@ main(int argc, char *argv[]) { mctx, dispatchmgr, have_ipv4 ? dispatchvx : NULL, have_ipv6 ? dispatchvx : NULL, &requestmgr)); - RUNCHECK(dns_view_create(mctx, loopmgr, 0, "_mdig", &view)); + RUNCHECK(dns_view_create(mctx, 0, "_mdig", &view)); query = ISC_LIST_HEAD(queries); isc_loopmgr_setup(loopmgr, sendqueries, query); diff --git a/fuzz/dns_message_checksig.c b/fuzz/dns_message_checksig.c index c27bfebe8b..a4b9dd15ab 100644 --- a/fuzz/dns_message_checksig.c +++ b/fuzz/dns_message_checksig.c @@ -236,8 +236,7 @@ LLVMFuzzerInitialize(int *argc ISC_ATTR_UNUSED, char ***argv ISC_ATTR_UNUSED) { isc_loopmgr_create(mctx, 1, &loopmgr); - result = dns_view_create(mctx, loopmgr, dns_rdataclass_in, "view", - &view); + result = dns_view_create(mctx, dns_rdataclass_in, "view", &view); if (result != ISC_R_SUCCESS) { fprintf(stderr, "dns_view_create failed: %s\n", isc_result_totext(result)); diff --git a/lib/dns/client.c b/lib/dns/client.c index 5a3c0fadda..579b9c3aab 100644 --- a/lib/dns/client.c +++ b/lib/dns/client.c @@ -206,8 +206,7 @@ createview(isc_mem_t *mctx, dns_rdataclass_t rdclass, isc_loopmgr_t *loopmgr, isc_result_t result; dns_view_t *view = NULL; - result = dns_view_create(mctx, loopmgr, rdclass, DNS_CLIENTVIEW_NAME, - &view); + result = dns_view_create(mctx, rdclass, DNS_CLIENTVIEW_NAME, &view); if (result != ISC_R_SUCCESS) { return (result); } diff --git a/lib/dns/include/dns/view.h b/lib/dns/include/dns/view.h index b72a3dd44f..e801b3a30f 100644 --- a/lib/dns/include/dns/view.h +++ b/lib/dns/include/dns/view.h @@ -259,8 +259,8 @@ struct dns_view { #endif /* HAVE_LMDB */ isc_result_t -dns_view_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr, - dns_rdataclass_t rdclass, const char *name, dns_view_t **viewp); +dns_view_create(isc_mem_t *mctx, dns_rdataclass_t rdclass, const char *name, + dns_view_t **viewp); /*%< * Create a view. * diff --git a/lib/dns/include/dns/zt.h b/lib/dns/include/dns/zt.h index 1118d06428..3ed74911c2 100644 --- a/lib/dns/include/dns/zt.h +++ b/lib/dns/include/dns/zt.h @@ -34,8 +34,7 @@ typedef isc_result_t dns_zt_callback_t(void *arg); void -dns_zt_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr, dns_view_t *view, - dns_zt_t **ztp); +dns_zt_create(isc_mem_t *mctx, dns_view_t *view, dns_zt_t **ztp); /*%< * Creates a new zone table for a view. * diff --git a/lib/dns/view.c b/lib/dns/view.c index 5f58142627..ee2a5c9a06 100644 --- a/lib/dns/view.c +++ b/lib/dns/view.c @@ -79,8 +79,7 @@ #define DEFAULT_EDNS_BUFSIZE 1232 isc_result_t -dns_view_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr, - dns_rdataclass_t rdclass, const char *name, +dns_view_create(isc_mem_t *mctx, dns_rdataclass_t rdclass, const char *name, dns_view_t **viewp) { dns_view_t *view = NULL; isc_result_t result; @@ -135,7 +134,7 @@ dns_view_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr, isc_rwlock_init(&view->sfd_lock); view->zonetable = NULL; - dns_zt_create(mctx, loopmgr, view, &view->zonetable); + dns_zt_create(mctx, view, &view->zonetable); result = dns_fwdtable_create(mctx, &view->fwdtable); if (result != ISC_R_SUCCESS) { diff --git a/lib/dns/zt.c b/lib/dns/zt.c index cf91f7285b..01bed3c290 100644 --- a/lib/dns/zt.c +++ b/lib/dns/zt.c @@ -94,15 +94,14 @@ static dns_qpmethods_t ztqpmethods = { }; void -dns_zt_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr, dns_view_t *view, - dns_zt_t **ztp) { +dns_zt_create(isc_mem_t *mctx, dns_view_t *view, dns_zt_t **ztp) { dns_qpmulti_t *multi = NULL; dns_zt_t *zt = NULL; REQUIRE(ztp != NULL && *ztp == NULL); REQUIRE(view != NULL); - dns_qpmulti_create(mctx, loopmgr, &ztqpmethods, view, &multi); + dns_qpmulti_create(mctx, &ztqpmethods, view, &multi); zt = isc_mem_get(mctx, sizeof(*zt)); *zt = (dns_zt_t){ @@ -177,11 +176,8 @@ dns_zt_find(dns_zt_t *zt, const dns_name_t *name, dns_ztfind_t options, REQUIRE(VALID_ZT(zt)); REQUIRE(exactopts != exactmask); - if (isc_tid() == ISC_TID_UNKNOWN) { - dns_qpmulti_lockedread(zt->multi, &qpr); - } else { - dns_qpmulti_query(zt->multi, &qpr); - } + dns_qpmulti_query(zt->multi, &qpr); + if (exactopts == DNS_ZTFIND_EXACT) { result = dns_qp_getname(&qpr, name, &pval, &ival); } else if (exactopts == DNS_ZTFIND_NOEXACT) { diff --git a/tests/bench/qpmulti.c b/tests/bench/qpmulti.c index 7034ac983f..93b98a0b6a 100644 --- a/tests/bench/qpmulti.c +++ b/tests/bench/qpmulti.c @@ -380,8 +380,7 @@ load_multi(struct bench_state *bctx) { size_t count = 0; uint64_t start; - dns_qpmulti_create(bctx->mctx, bctx->loopmgr, &item_methods, NULL, - &bctx->multi); + dns_qpmulti_create(bctx->mctx, &item_methods, NULL, &bctx->multi); /* initial contents of the trie */ start = isc_time_monotonic(); diff --git a/tests/dns/qpmulti_test.c b/tests/dns/qpmulti_test.c index 16e5350dec..d5a5b91d0b 100644 --- a/tests/dns/qpmulti_test.c +++ b/tests/dns/qpmulti_test.c @@ -367,7 +367,7 @@ many_transactions(void *arg) { UNUSED(arg); dns_qpmulti_t *qpm = NULL; - dns_qpmulti_create(mctx, loopmgr, &test_methods, NULL, &qpm); + dns_qpmulti_create(mctx, &test_methods, NULL, &qpm); qpm->writer.write_protect = true; for (size_t n = 0; n < TRANSACTION_COUNT; n++) { diff --git a/tests/libtest/dns.c b/tests/libtest/dns.c index b093b88c43..d398c5c954 100644 --- a/tests/libtest/dns.c +++ b/tests/libtest/dns.c @@ -64,7 +64,7 @@ dns_test_makeview(const char *name, bool with_cache, dns_view_t **viewp) { dns_view_t *view = NULL; dns_cache_t *cache = NULL; - result = dns_view_create(mctx, loopmgr, dns_rdataclass_in, name, &view); + result = dns_view_create(mctx, dns_rdataclass_in, name, &view); if (result != ISC_R_SUCCESS) { return (result); } From c890b9b124add8ed9b1afb79737dbf9bc67f1e73 Mon Sep 17 00:00:00 2001 From: Tony Finch Date: Wed, 8 Mar 2023 14:52:30 +0000 Subject: [PATCH 07/12] Get the tests working with liburcu Mostly a few qp-trie details to adjust. --- fuzz/Makefile.am | 1 + fuzz/dns_qp.c | 2 +- tests/bench/load-names.c | 2 +- tests/bench/qpmulti.c | 6 +++++- tests/dns/qp_test.c | 2 +- tests/dns/qpmulti_test.c | 6 +++--- tests/libtest/Makefile.am | 1 + tests/libtest/qp.c | 6 +++--- 8 files changed, 16 insertions(+), 10 deletions(-) diff --git a/fuzz/Makefile.am b/fuzz/Makefile.am index d2e3931c00..75407d488b 100644 --- a/fuzz/Makefile.am +++ b/fuzz/Makefile.am @@ -6,6 +6,7 @@ AM_CFLAGS += \ AM_CPPFLAGS += \ $(LIBISC_CFLAGS) \ $(LIBDNS_CFLAGS) \ + $(LIBURCU_CFLAGS) \ $(LIBUV_CFLAGS) \ -DFUZZDIR=\"$(abs_srcdir)\" \ -I$(top_srcdir)/lib/dns \ diff --git a/fuzz/dns_qp.c b/fuzz/dns_qp.c index 1bc19fa38f..045c8acbf2 100644 --- a/fuzz/dns_qp.c +++ b/fuzz/dns_qp.c @@ -16,10 +16,10 @@ #include #include -#include #include #include #include +#include #include #include diff --git a/tests/bench/load-names.c b/tests/bench/load-names.c index babdd37ef1..94e105ad16 100644 --- a/tests/bench/load-names.c +++ b/tests/bench/load-names.c @@ -18,8 +18,8 @@ #include #include #include -#include #include +#include #include #include diff --git a/tests/bench/qpmulti.c b/tests/bench/qpmulti.c index 93b98a0b6a..03dd866264 100644 --- a/tests/bench/qpmulti.c +++ b/tests/bench/qpmulti.c @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -25,7 +26,6 @@ #include #include #include -#include #include #include #include @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -883,6 +884,9 @@ int main(void) { isc_loopmgr_t *loopmgr = NULL; isc_mem_t *mctx = NULL; + + setlinebuf(stdout); + uint32_t nloops; const char *env_workers = getenv("ISC_TASK_WORKERS"); diff --git a/tests/dns/qp_test.c b/tests/dns/qp_test.c index ee9b30da00..8a1156b537 100644 --- a/tests/dns/qp_test.c +++ b/tests/dns/qp_test.c @@ -22,11 +22,11 @@ #define UNIT_TESTING #include -#include #include #include #include #include +#include #include #include diff --git a/tests/dns/qpmulti_test.c b/tests/dns/qpmulti_test.c index d5a5b91d0b..96a2693033 100644 --- a/tests/dns/qpmulti_test.c +++ b/tests/dns/qpmulti_test.c @@ -27,10 +27,10 @@ #include #include #include -#include #include #include #include +#include #include #include @@ -373,8 +373,7 @@ many_transactions(void *arg) { for (size_t n = 0; n < TRANSACTION_COUNT; n++) { TRACE("transaction %zu", n); one_transaction(qpm); - isc__qsbr_quiescent_state(isc_loop_current(loopmgr)); - isc_loopmgr_wakeup(loopmgr); + rcu_quiescent_state(); } dns_qpmulti_destroy(&qpm); @@ -387,6 +386,7 @@ ISC_RUN_TEST_IMPL(qpmulti) { setup_items(); isc_loop_setup(isc_loop_main(loopmgr), many_transactions, NULL); isc_loopmgr_run(loopmgr); + rcu_barrier(); isc_loopmgr_destroy(&loopmgr); isc_log_destroy(&dns_lctx); } diff --git a/tests/libtest/Makefile.am b/tests/libtest/Makefile.am index 68708d463d..1fe4c57b36 100644 --- a/tests/libtest/Makefile.am +++ b/tests/libtest/Makefile.am @@ -4,6 +4,7 @@ AM_CPPFLAGS += \ $(LIBISC_CFLAGS) \ $(LIBDNS_CFLAGS) \ $(LIBNS_CFLAGS) \ + $(LIBURCU_CFLAGS) \ $(LIBUV_CFLAGS) \ -I$(top_srcdir)/lib/isc \ -I$(top_srcdir)/lib/dns diff --git a/tests/libtest/qp.c b/tests/libtest/qp.c index db45fbcc16..aff5eb3dc6 100644 --- a/tests/libtest/qp.c +++ b/tests/libtest/qp.c @@ -18,9 +18,9 @@ #include #include #include -#include #include #include +#include #include #include @@ -255,10 +255,10 @@ qp_test_dumpchunks(dns_qp_t *qp) { dumpqp(qp, "qp"); for (qp_chunk_t c = 0; c < qp->chunk_max; c++) { printf("qp %p chunk %u base %p " - "used %u free %u immutable %u phase %u\n", + "used %u free %u immutable %u discounted %u\n", qp, c, qp->base->ptr[c], qp->usage[c].used, qp->usage[c].free, qp->usage[c].immutable, - qp->usage[c].phase); + qp->usage[c].discounted); used += qp->usage[c].used; free += qp->usage[c].free; } From 2bce998b2b85eb42312610171ab013a1490a9b3e Mon Sep 17 00:00:00 2001 From: Tony Finch Date: Wed, 3 May 2023 15:29:43 +0100 Subject: [PATCH 08/12] Avoid using the zone timer after its loop has gone Shutdown and cleanup of zones is more asynchronous with the qp-trie zone table. As a result it's possible that some activity is delayed until after a zone has been released from its zonemanager. Previously, the dns_zone code was not very strict in the way it refers to the loop it is running on: The loop pointer was stashed when dns_zonemgr_managezone() was called and never cleared. Now, zones properly attach to and detach from their loops. The zone timer depends on its loop. The shutdown crashes occurred when asynchronous calls tried to modify the zone timer after dns_zonemgr_releasezone() has been called and the loop was invalidated. In these cases the attempt to set the timer is now ignored, with a debug log message. --- lib/dns/zone.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/lib/dns/zone.c b/lib/dns/zone.c index 7481d3b12f..be7eb70c90 100644 --- a/lib/dns/zone.c +++ b/lib/dns/zone.c @@ -10619,8 +10619,6 @@ failure: cleanup: dns_db_detach(&kfetch->db); - /* The zone must be managed */ - INSIST(kfetch->zone->loop != NULL); isc_refcount_decrement(&zone->irefs); if (dns_rdataset_isassociated(keydataset)) { @@ -14658,12 +14656,15 @@ zone_timer_set(dns_zone_t *zone, isc_time_t *next, isc_time_t *now) { isc_time_subtract(next, now, &interval); } - if (zone->timer == NULL) { + if (zone->loop == NULL) { + zone_debuglog(zone, __func__, 10, "zone is not managed"); + } else if (zone->timer == NULL) { isc_refcount_increment0(&zone->irefs); isc_timer_create(zone->loop, zone_timer, zone, &zone->timer); } - - isc_timer_start(zone->timer, isc_timertype_once, &interval); + if (zone->timer != NULL) { + isc_timer_start(zone->timer, isc_timertype_once, &interval); + } } static void @@ -18169,7 +18170,8 @@ dns_zonemgr_managezone(dns_zonemgr_t *zmgr, dns_zone_t *zone) { REQUIRE(zone->timer == NULL); REQUIRE(zone->zmgr == NULL); - zone->loop = isc_loop_get(zmgr->loopmgr, zone->tid); + isc_loop_t *loop = isc_loop_get(zmgr->loopmgr, zone->tid); + isc_loop_attach(loop, &zone->loop); zonemgr_keymgmt_add(zmgr, zone, &zone->kfio); INSIST(zone->kfio != NULL); @@ -18200,6 +18202,13 @@ dns_zonemgr_releasezone(dns_zonemgr_t *zmgr, dns_zone_t *zone) { ENSURE(zone->kfio == NULL); } + if (zone->timer != NULL) { + isc_refcount_decrement(&zone->irefs); + isc_timer_destroy(&zone->timer); + } + + isc_loop_detach(&zone->loop); + /* Detach below, outside of the write lock. */ zone->zmgr = NULL; @@ -20658,8 +20667,6 @@ done: } cleanup: - /* The zone must be managed */ - INSIST(nsfetch->zone->loop != NULL); isc_refcount_decrement(&zone->irefs); if (dns_rdataset_isassociated(nsrrset)) { @@ -21922,7 +21929,7 @@ dns_zone_link(dns_zone_t *zone, dns_zone_t *raw) { LOCK_ZONE(zone); LOCK_ZONE(raw); - raw->loop = zone->loop; + isc_loop_attach(zone->loop, &raw->loop); /* dns_zone_attach(raw, &zone->raw); */ isc_refcount_increment(&raw->references); From c377e0a9e3dd5dd087a2fb42368aaac5c82268e6 Mon Sep 17 00:00:00 2001 From: Tony Finch Date: Thu, 4 May 2023 15:26:13 +0100 Subject: [PATCH 09/12] Help thread sanitizer to cope with liburcu All the places the qp-trie code was using `call_rcu()` needed `__tsan_release()` and `__tsan_acquire()` annotations, so add a couple of wrappers to encapsulate this pattern. With these wrappers, the tests run almost clean under thread sanitizer. The remaining problems are due to `rcu_barrier()` which can be suppressed using `.tsan-suppress`. It does not suppress the whole of `liburcu`, because we would like thread sanitizer to detect problems in `call_rcu()` callbacks, which are called from `liburcu`. The CI jobs have been updated to use `.tsan-suppress` by default, except for a special-case job that needs the additional suppressions in `.tsan-suppress-extra`. We might be able to get rid of some of this after liburcu gains support for thread sanitizer. Note: the `rcu_barrier()` suppression is not entirely effective: tsan sometimes reports races that originate inside `rcu_barrier()` but tsan has discarded the stack so it does not have the information required to suppress the report. These "races" can be made much easier to reproduce by adding `atexit_sleep_ms=1000` to `TSAN_OPTIONS`. The problem with tsan's short memory can be addressed by increasing `history_size`: when it is large enough (6 or 7) the `rcu_barrier()` stack usually survives long enough for suppression to work. --- .gitlab-ci.yml | 22 ++++++++++++++-------- .reuse/dep5 | 3 ++- .tsan-suppress | 3 +++ .tsan-suppress-extra | 6 ++++++ bin/tests/system/run.sh.in | 1 + lib/dns/qp.c | 16 ++++++---------- lib/dns/rbtdb.c | 7 +++---- lib/isc/async.c | 3 +-- lib/isc/include/isc/urcu.h | 29 +++++++++++++++++++++++++++++ lib/isc/quota.c | 3 +-- tsan-suppressions.txt | 3 --- 11 files changed, 66 insertions(+), 30 deletions(-) create mode 100644 .tsan-suppress create mode 100644 .tsan-suppress-extra delete mode 100644 tsan-suppressions.txt diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index bc8ca63f2d..5c332adab3 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -18,14 +18,20 @@ variables: CLANG_VERSION: 16 CLANG: "clang-${CLANG_VERSION}" SCAN_BUILD: "scan-build-${CLANG_VERSION}" - ASAN_SYMBOLIZER_PATH: "/usr/lib/llvm-${CLANG_VERSION}/bin/llvm-symbolizer" + LLVM_SYMBOLIZER: "/usr/lib/llvm-${CLANG_VERSION}/bin/llvm-symbolizer" CLANG_FORMAT: "clang-format-${CLANG_VERSION}" CFLAGS_COMMON: -fno-omit-frame-pointer -fno-optimize-sibling-calls -O1 -g -Wall -Wextra # Pass run-time flags to AddressSanitizer to get core dumps on error. ASAN_OPTIONS: abort_on_error=1:disable_coredump=0:unmap_shadow_on_exit=1 - TSAN_OPTIONS_COMMON: "disable_coredump=0 second_deadlock_stack=1 history_size=7 log_exe_name=true log_path=tsan" + ASAN_SYMBOLIZER_PATH: "${LLVM_SYMBOLIZER}" + + TSAN_OPTIONS_COMMON: "disable_coredump=0 second_deadlock_stack=1 atexit_sleep_ms=1000 history_size=7 log_exe_name=true log_path=tsan" + TSAN_SYMBOLIZER: "external_symbolizer_path=/usr/bin/llvm-symbolizer" + TSAN_SUPPRESSIONS: "suppressions=${CI_PROJECT_DIR}/.tsan-suppress" + TSAN_OPTIONS_DEFAULT: "${TSAN_OPTIONS_COMMON} ${TSAN_SUPPRESSIONS} ${TSAN_SYMBOLIZER}" + UBSAN_OPTIONS: "halt_on_error=1:abort_on_error=1:disable_coredump=0" TARBALL_EXTENSION: xz @@ -1044,7 +1050,7 @@ gcc:tsan: system:gcc:tsan: variables: - TSAN_OPTIONS: "${TSAN_OPTIONS_COMMON} external_symbolizer_path=/usr/bin/llvm-symbolizer" + TSAN_OPTIONS: "${TSAN_OPTIONS_DEFAULT}" <<: *fedora_37_amd64_image <<: *system_test_tsan_job needs: @@ -1053,7 +1059,7 @@ system:gcc:tsan: unit:gcc:tsan: variables: - TSAN_OPTIONS: "${TSAN_OPTIONS_COMMON} external_symbolizer_path=/usr/bin/llvm-symbolizer" + TSAN_OPTIONS: "${TSAN_OPTIONS_DEFAULT}" <<: *fedora_37_amd64_image <<: *unit_test_tsan_job needs: @@ -1071,7 +1077,7 @@ clang:tsan: system:clang:tsan: variables: - TSAN_OPTIONS: "${TSAN_OPTIONS_COMMON} external_symbolizer_path=/usr/lib/llvm-${CLANG_VERSION}/bin/llvm-symbolizer" + TSAN_OPTIONS: "${TSAN_OPTIONS_COMMON} ${TSAN_SUPPRESSIONS} external_symbolizer_path=${LLVM_SYMBOLIZER}" <<: *base_image <<: *system_test_tsan_job needs: @@ -1080,7 +1086,7 @@ system:clang:tsan: unit:clang:tsan: variables: - TSAN_OPTIONS: "${TSAN_OPTIONS_COMMON} external_symbolizer_path=/usr/lib/llvm-${CLANG_VERSION}/bin/llvm-symbolizer suppressions=$CI_PROJECT_DIR/tsan-suppressions.txt" + TSAN_OPTIONS: "${TSAN_OPTIONS_COMMON} suppressions=$CI_PROJECT_DIR/.tsan-suppress-extra external_symbolizer_path=${LLVM_SYMBOLIZER}" <<: *base_image <<: *unit_test_tsan_job needs: @@ -1323,7 +1329,7 @@ respdiff-short:tsan: LDFLAGS: "-fsanitize=thread" EXTRA_CONFIGURE: "--enable-pthread-rwlock --without-jemalloc" MAX_DISAGREEMENTS_PERCENTAGE: "0.5" - TSAN_OPTIONS: "${TSAN_OPTIONS_COMMON} external_symbolizer_path=/usr/bin/llvm-symbolizer" + TSAN_OPTIONS: "${TSAN_OPTIONS_DEFAULT}" script: - bash respdiff.sh -s named -q "${PWD}/10k_a.txt" -c 3 -w "${PWD}/rspworkdir" "${CI_PROJECT_DIR}" "/usr/local/respdiff-reference-bind/sbin/named" after_script: @@ -1361,7 +1367,7 @@ respdiff-long:tsan: LDFLAGS: "-fsanitize=thread" EXTRA_CONFIGURE: "--enable-pthread-rwlock --without-jemalloc" MAX_DISAGREEMENTS_PERCENTAGE: "0.5" - TSAN_OPTIONS: "${TSAN_OPTIONS_COMMON} external_symbolizer_path=/usr/bin/llvm-symbolizer" + TSAN_OPTIONS: "${TSAN_OPTIONS_DEFAULT}" script: - bash respdiff.sh -s named -q "${PWD}/100k_mixed.txt" -c 3 -w "${PWD}/rspworkdir" "${CI_PROJECT_DIR}" "/usr/local/respdiff-reference-bind/sbin/named" after_script: diff --git a/.reuse/dep5 b/.reuse/dep5 index d4072e91fd..eb708299f7 100644 --- a/.reuse/dep5 +++ b/.reuse/dep5 @@ -163,11 +163,12 @@ Files: **/.clang-format .gitlab-ci.yml .lgtm.yml .pylintrc + .tsan-suppress + .tsan-suppress-extra .uncrustify.cfg doc/misc/*.zoneopt doc/misc/options doc/misc/rndc.grammar - tsan-suppressions.txt sonar-project.properties Copyright: Internet Systems Consortium, Inc. ("ISC") License: CC0-1.0 diff --git a/.tsan-suppress b/.tsan-suppress new file mode 100644 index 0000000000..132a18e5c6 --- /dev/null +++ b/.tsan-suppress @@ -0,0 +1,3 @@ +# be more selective with liburcu +race:rcu_barrier +race:rcu_memb_barrier diff --git a/.tsan-suppress-extra b/.tsan-suppress-extra new file mode 100644 index 0000000000..767a2cc8e1 --- /dev/null +++ b/.tsan-suppress-extra @@ -0,0 +1,6 @@ +# Uninstrumented libraries +called_from_lib:libfstrm.so +called_from_lib:libdummyrpz.so +# be more selective with liburcu +race:rcu_barrier +race:rcu_memb_barrier diff --git a/bin/tests/system/run.sh.in b/bin/tests/system/run.sh.in index 084efc47d8..81038f1eb1 100644 --- a/bin/tests/system/run.sh.in +++ b/bin/tests/system/run.sh.in @@ -89,6 +89,7 @@ if ! $do_run; then LOG_FLAGS="$log_flags" \ TEST_LARGE_MAP="${TEST_LARGE_MAP}" \ CI_ENABLE_ALL_TESTS="${CI_ENABLE_ALL_TESTS}" \ + ${TSAN_OPTIONS:+"TSAN_OPTIONS=${TSAN_OPTIONS}"} \ ${VIRTUAL_ENV:+"VIRTUAL_ENV=${VIRTUAL_ENV}"} \ ${PERL5LIB:+"PERL5LIB=${PERL5LIB}"} \ make -e check diff --git a/lib/dns/qp.c b/lib/dns/qp.c index 05d9b76a1b..7633c4d5aa 100644 --- a/lib/dns/qp.c +++ b/lib/dns/qp.c @@ -627,9 +627,8 @@ recycle(dns_qp_t *qp) { * asynchronous cleanup */ static void -reclaim_chunks_cb(struct rcu_head *rcu_head) { - qp_rcuctx_t *rcuctx = caa_container_of(rcu_head, qp_rcuctx_t, rcu_head); - __tsan_acquire(rcuctx); +reclaim_chunks_cb(struct rcu_head *arg) { + qp_rcuctx_t *rcuctx = isc_urcu_container(arg, qp_rcuctx_t, rcu_head); REQUIRE(QPRCU_VALID(rcuctx)); dns_qpmulti_t *multi = rcuctx->multi; REQUIRE(QPMULTI_VALID(multi)); @@ -711,8 +710,7 @@ reclaim_chunks(dns_qpmulti_t *multi) { } } - __tsan_release(rcuctx); - call_rcu(&rcuctx->rcu_head, reclaim_chunks_cb); + isc_urcu_cleanup(rcuctx, rcu_head, reclaim_chunks_cb); LOG_STATS("qp will reclaim %u chunks", count); } @@ -1432,9 +1430,8 @@ dns_qp_destroy(dns_qp_t **qptp) { } static void -qpmulti_destroy_cb(struct rcu_head *rcu_head) { - qp_rcuctx_t *rcuctx = caa_container_of(rcu_head, qp_rcuctx_t, rcu_head); - __tsan_acquire(rcuctx); +qpmulti_destroy_cb(struct rcu_head *arg) { + qp_rcuctx_t *rcuctx = isc_urcu_container(arg, qp_rcuctx_t, rcu_head); REQUIRE(QPRCU_VALID(rcuctx)); dns_qpmulti_t *multi = rcuctx->multi; REQUIRE(QPMULTI_VALID(multi)); @@ -1473,8 +1470,7 @@ dns_qpmulti_destroy(dns_qpmulti_t **qpmp) { .multi = multi, }; isc_mem_attach(qp->mctx, &rcuctx->mctx); - __tsan_release(rcuctx); - call_rcu(&rcuctx->rcu_head, qpmulti_destroy_cb); + isc_urcu_cleanup(rcuctx, rcu_head, qpmulti_destroy_cb); } /*********************************************************************** diff --git a/lib/dns/rbtdb.c b/lib/dns/rbtdb.c index c94f935d55..2bc92f3835 100644 --- a/lib/dns/rbtdb.c +++ b/lib/dns/rbtdb.c @@ -9583,8 +9583,8 @@ new_gluelist(isc_mem_t *mctx, dns_name_t *name) { static void free_gluelist_rcu(struct rcu_head *rcu_head) { - rbtdb_glue_t *glue = caa_container_of(rcu_head, rbtdb_glue_t, rcu_head); - __tsan_acquire(glue); + rbtdb_glue_t *glue = isc_urcu_container(rcu_head, rbtdb_glue_t, + rcu_head); free_gluelist(glue); } @@ -9600,8 +9600,7 @@ free_gluetable(rbtdb_version_t *rbtversion) { caa_container_of(node, rdatasetheader_t, wfs_node); rbtdb_glue_t *glue = rcu_xchg_pointer(&header->glue_list, NULL); - __tsan_release(glue); - call_rcu(&glue->rcu_head, free_gluelist_rcu); + isc_urcu_cleanup(glue, rcu_head, free_gluelist_rcu); } rcu_read_unlock(); } diff --git a/lib/isc/async.c b/lib/isc/async.c index db4388964e..1afa2a3027 100644 --- a/lib/isc/async.c +++ b/lib/isc/async.c @@ -107,8 +107,7 @@ isc__async_cb(uv_async_t *handle) { */ struct cds_wfcq_node *node, *next; __cds_wfcq_for_each_blocking_safe(&jobs.head, &jobs.tail, node, next) { - isc_job_t *job = caa_container_of(node, isc_job_t, wfcq_node); - __tsan_acquire(job); + isc_job_t *job = isc_urcu_container(node, isc_job_t, wfcq_node); job->cb(job->cbarg); diff --git a/lib/isc/include/isc/urcu.h b/lib/isc/include/isc/urcu.h index b43746ce63..0b828d92e6 100644 --- a/lib/isc/include/isc/urcu.h +++ b/lib/isc/include/isc/urcu.h @@ -39,6 +39,35 @@ #pragma GCC diagnostic pop +/* + * Help thread sanitizer to understand `call_rcu()`: + * + * The `rcu_head` argument to `call_rcu()` is expected to be embedded + * in a structure defined by the caller, which is named `_rcuctx` in + * these macros. The callback function uses `caa_container_of()` to + * recover the `rcuctx` pointer from the `_rcu_head` pointer that is + * passed to the callback. + * + * We explain the ordering dependency to tsan by releasing `_rcuctx` + * pointer before `call_rcu()` and acquiring it in the callback + * funtion. We pass the outer `_rcuctx` pointer to the `__tsan_` + * barriers, because it should match a pointer that is known by tsan + * to have been returned by `malloc()`. + */ + +#define isc_urcu_cleanup(ptr, member, func) \ + { \ + __tsan_release(ptr); \ + call_rcu(&(ptr)->member, func); \ + } + +#define isc_urcu_container(ptr, type, member) \ + ({ \ + type *_ptr = caa_container_of(ptr, type, member); \ + __tsan_acquire(_ptr); \ + _ptr; \ + }) + #if defined(RCU_QSBR) /* diff --git a/lib/isc/quota.c b/lib/isc/quota.c index 4816333e1c..5a05aa439e 100644 --- a/lib/isc/quota.c +++ b/lib/isc/quota.c @@ -79,8 +79,7 @@ isc_quota_release(isc_quota_t *quota) { return; } - isc_job_t *job = caa_container_of(node, isc_job_t, wfcq_node); - __tsan_acquire(job); + isc_job_t *job = isc_urcu_container(node, isc_job_t, wfcq_node); job->cb(job->cbarg); } diff --git a/tsan-suppressions.txt b/tsan-suppressions.txt deleted file mode 100644 index 217d77c388..0000000000 --- a/tsan-suppressions.txt +++ /dev/null @@ -1,3 +0,0 @@ -# Uninstrumented library. -called_from_lib:libfstrm.so -called_from_lib:libdummyrpz.so From f11cc8314275b9a791e3646bb11f5a81c9cacf88 Mon Sep 17 00:00:00 2001 From: Tony Finch Date: Fri, 5 May 2023 08:12:35 +0100 Subject: [PATCH 10/12] Use per-CPU RCU helper threads Create and free per-CPU helper threads from the main thread and tell thread sanitizer to suppress leaking threads. (We are not leaking threads ourselves and we can safely ignore the Userspace-RCU thread leaks.) --- .tsan-suppress | 1 + .tsan-suppress-extra | 1 + lib/isc/loop.c | 12 ++++++++++++ lib/isc/thread.c | 8 ++++---- 4 files changed, 18 insertions(+), 4 deletions(-) diff --git a/.tsan-suppress b/.tsan-suppress index 132a18e5c6..c73eb7815f 100644 --- a/.tsan-suppress +++ b/.tsan-suppress @@ -1,3 +1,4 @@ # be more selective with liburcu race:rcu_barrier race:rcu_memb_barrier +thread:* diff --git a/.tsan-suppress-extra b/.tsan-suppress-extra index 767a2cc8e1..0f598bf36b 100644 --- a/.tsan-suppress-extra +++ b/.tsan-suppress-extra @@ -4,3 +4,4 @@ called_from_lib:libdummyrpz.so # be more selective with liburcu race:rcu_barrier race:rcu_memb_barrier +thread:* diff --git a/lib/isc/loop.c b/lib/isc/loop.c index 41c7f656f0..0b3918a00f 100644 --- a/lib/isc/loop.c +++ b/lib/isc/loop.c @@ -434,6 +434,10 @@ isc_loopmgr_run(isc_loopmgr_t *loopmgr) { */ ignore_signal(SIGPIPE, SIG_IGN); + bool free_call_rcu_data = !create_all_cpu_call_rcu_data(0); + + rcu_register_thread(); + /* * The thread 0 is this one. */ @@ -448,6 +452,14 @@ isc_loopmgr_run(isc_loopmgr_t *loopmgr) { } isc_thread_main(loop_thread, &loopmgr->loops[0]); + + rcu_unregister_thread(); + + rcu_barrier(); + + if (free_call_rcu_data) { + free_all_cpu_call_rcu_data(); + } } void diff --git a/lib/isc/thread.c b/lib/isc/thread.c index b0e0875aca..36ade47f1f 100644 --- a/lib/isc/thread.c +++ b/lib/isc/thread.c @@ -84,12 +84,8 @@ thread_body(struct thread_wrap *wrap) { __tsan_acquire(wrap); free(wrap); - rcu_register_thread(); - ret = func(arg); - rcu_unregister_thread(); - return (ret); } @@ -101,10 +97,14 @@ thread_run(void *wrap) { */ isc__iterated_hash_initialize(); + rcu_register_thread(); + void *ret = thread_body(wrap); isc__iterated_hash_shutdown(); + rcu_unregister_thread(); + return (ret); } From fc770a8bd0c7b413d160fc20b703974de4560dfd Mon Sep 17 00:00:00 2001 From: Tony Finch Date: Fri, 12 May 2023 12:20:42 +0100 Subject: [PATCH 11/12] Remove the now-unused ISC_STACK We are using the liburcu concurrent data structures instead. --- CHANGES | 4 + doc/dev/dev.md | 39 -------- lib/isc/Makefile.am | 1 - lib/isc/include/isc/stack.h | 189 ------------------------------------ 4 files changed, 4 insertions(+), 229 deletions(-) delete mode 100644 lib/isc/include/isc/stack.h diff --git a/CHANGES b/CHANGES index 34fe56bce2..aaec78b58a 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,7 @@ +6171. [cleanup] Remove the stack implementation added in change 6108: + we are using the liburcu concurrent data structures + instead. [GL !7920] + 6170. [func] The 'rndc -t' option allows a timeout to be set in seconds, so that commands that take a long time to complete (e.g., reloading a very large configuration) diff --git a/doc/dev/dev.md b/doc/dev/dev.md index adbb40dcd0..1be096d6af 100644 --- a/doc/dev/dev.md +++ b/doc/dev/dev.md @@ -657,45 +657,6 @@ Items can be removed from the list using `ISC_LIST_UNLINK`: ISC_LIST_UNLINK(foolist, foo, link); -##### Atomic stacks - -In ``, there are also similar macros for singly-linked -stacks (`ISC_STACK`) and atomic stacks (`ISC_ASTACK`). - -Lock-free linked data structures have pitfalls that the `ISC_ASTACK` -macros avoid by having a restricted API. Firstly, it is difficult to -implement atomic doubly-linked lists without at least a double-width -compare-exchange, which is not a widely supported primitive, so this -stack is singly-linked. Secondly, when individual elements can be -removed from a list (whether doubly-linked or singly-linked), we run -into the ABA problem, where removes and (re-)inserts race and as a -result the links get in a tangle. Safe support for removing individual -elements requires some kind of garbage collection, or extending link -pointers with generation counters. Instead the ASTACK macros only -provide a function for emptying the entire stack in one go, -`ISC_ASTACK_TO_STACK()`. This only requires setting the `top` pointer to -`NULL`, which cannot cause `ISC_ASTACK_PUSH()` to go wrong. - -The `ISC_ASTACK` macros are used the same way as the non-atomic -doubly-linked lists described above, except that not all of the -`ISC_LIST` macros have `ISC_ASTACK` equivalents, and stacks have a -`TOP` instead of a `HEAD`. - -As well as read-only iteration similar to `ISC_LIST` described above, -there is an idiom for emptying an atomic stack into a regular stack, -and processing its contents: - - ISC_STACK(isc_foo_t) drain = ISC_ASTACK_TO_STACK(foo_work); - while (!ISC_STACK_EMPTY(drain)) { - isc_foo_t *foo = ISC_STACK_POP(drain, link); - /* do things */ - /* maybe again? ISC_ASTACK_PUSH(foo_work, foo, link) */ - } - -There is also an `ISC_ASTACK_ADD()` macro, which is a convenience -wrapper for pushing an element onto a stack if it has not already been -added. - #### Names The `dns_name` API has facilities for processing DNS names and labels, diff --git a/lib/isc/Makefile.am b/lib/isc/Makefile.am index 061fa24e7c..3d9f7366a1 100644 --- a/lib/isc/Makefile.am +++ b/lib/isc/Makefile.am @@ -80,7 +80,6 @@ libisc_la_HEADERS = \ include/isc/siphash.h \ include/isc/sockaddr.h \ include/isc/spinlock.h \ - include/isc/stack.h \ include/isc/stats.h \ include/isc/stdio.h \ include/isc/stdtime.h \ diff --git a/lib/isc/include/isc/stack.h b/lib/isc/include/isc/stack.h deleted file mode 100644 index 92475b9d43..0000000000 --- a/lib/isc/include/isc/stack.h +++ /dev/null @@ -1,189 +0,0 @@ -/* - * Copyright (C) Internet Systems Consortium, Inc. ("ISC") - * - * SPDX-License-Identifier: MPL-2.0 - * - * This Source Code Form is subject to the terms of the Mozilla Public - * License, v. 2.0. If a copy of the MPL was not distributed with this - * file, you can obtain one at https://mozilla.org/MPL/2.0/. - * - * See the COPYRIGHT file distributed with this work for additional - * information regarding copyright ownership. - */ - -#pragma once - -#include -#include -#include -#include - -/* - * Single links, aka stack links - */ - -#define ISC_SLINK(type) \ - struct { \ - type *next; \ - } -#define ISC_SLINK_INITIALIZER_TYPE(type) \ - { \ - .next = ISC_LINK_TOMBSTONE(type), \ - } -#define ISC_SLINK_INIT_TYPE(elt, link, type) \ - { \ - (elt)->link.next = ISC_LINK_TOMBSTONE(type); \ - } - -#define ISC_SLINK_INITIALIZER ISC_SLINK_INITIALIZER_TYPE(void) -#define ISC_SLINK_INIT(elt, link) ISC_SLINK_INIT_TYPE(elt, link, void) - -#define ISC_SLINK_LINKED_TYPE(elt, link, type) \ - ((type *)((elt)->link.next) != ISC_LINK_TOMBSTONE(type)) -#define ISC_SLINK_LINKED(elt, link) ISC_SLINK_LINKED_TYPE(elt, link, void) - -#define ISC_SLINK_NEXT(elt, link) ((elt)->link.next) - -/* - * Simple singly-linked stack implementation - */ - -#define ISC_STACK(type) \ - struct { \ - type *top; \ - } - -#define ISC_STACK_INITIALIZER \ - { \ - .top = NULL, \ - } -#define ISC_STACK_INIT(stack) \ - { \ - (stack).top = NULL; \ - } - -#define ISC_STACK_TOP(stack) ((stack).top) -#define ISC_STACK_EMPTY(stack) ((stack).top == NULL) - -#define ISC_STACK_PUSHUNSAFE(stack, elt, link) \ - { \ - (elt)->link.next = (stack).top; \ - (stack).top = (elt); \ - } - -#define ISC_STACK_PUSH(stack, elt, link) \ - { \ - INSIST(!ISC_SLINK_LINKED(elt, link)); \ - ISC_STACK_PUSHUNSAFE(stack, elt, link); \ - } - -/* - * This is slightly round about because when `type` is `void` - * we can't directly access `__top->link.next`. - */ -#define ISC_STACK_POP_TYPE(stack, link, type) \ - ({ \ - type *__top = (stack).top; \ - if (__top != NULL) { \ - type *__next = ISC_SLINK_NEXT((stack).top, link); \ - ISC_SLINK_INIT_TYPE((stack).top, link, type); \ - (stack).top = __next; \ - } \ - __top; \ - }) - -#define ISC_STACK_POP(stack, link) ISC_STACK_POP_TYPE(stack, link, void) - -/* - * Helper to add element if not already on stack. - */ -#define ISC_STACK_ADD(stack, elt, link) \ - { \ - if (!ISC_STACK_LINKED(elt, link)) { \ - ISC_STACK_PUSHUNSAFE(stack, elt, link); \ - } \ - } - -/* - * This is a simplified implementation of the Treiber stack algorithm with - * back-off. - * - * The original paper which describes the algorithm can be found here: - * https://dominoweb.draco.res.ibm.com/58319a2ed2b1078985257003004617ef.html - * - * We sidestep the ABA problem when removing the elements by adding additional - * constraint: once the element has been removed from the stack, it cannot be - * re-added. - * - * This is actually not a problem for planned usage of the ISC_ASTACK where we - * only use ISC_ASTACK_PUSH() to add individual elements and ISC_ASTACK_DRAIN() - * to empty the stack in a single atomic operation. - * - * To make things simple, there's no implementation for ISC_ASTACK_POP() because - * that's complex to implement and requires double-word CAS (for ABA counter). - * - * The pointers are wrapped in structs so that their types are - * distinct, and to match the ISC_LIST macros. - * - * See doc/dev/dev.md for examples. - */ - -#define ISC_ASTACK(type) \ - struct { \ - atomic_ptr(type) atop; \ - } -#define ISC_ASTACK_INITIALIZER \ - { \ - .atop = NULL, \ - } -#define ISC_ASTACK_INIT(stack) \ - { \ - (stack).atop = NULL; \ - } - -/* - * ATOMIC: for performance, this kind of retry loop should use a weak - * CAS with relaxed ordering in the failure case; on success, release - * ordering ensures that writing the element contents happens before - * reading them, following the acquire in ISC_ASTACK_TOP() and _DRAIN(). - */ -#define ISC_ASTACK_PUSHUNSAFE(stack, elt, link) \ - { \ - (elt)->link.next = atomic_load_relaxed(&(stack).atop); \ - while (!atomic_compare_exchange_weak_explicit( \ - &(stack).atop, &ISC_SLINK_NEXT(elt, link), elt, \ - memory_order_release, memory_order_relaxed)) \ - { \ - isc_pause(); \ - } \ - } - -#define ISC_ASTACK_PUSH(stack, elt, link) \ - { \ - INSIST(!ISC_SLINK_LINKED(elt, link)); \ - ISC_ASTACK_PUSHUNSAFE(stack, elt, link); \ - } - -/* - * Helper to add element if not already on stack. - */ -#define ISC_ASTACK_ADD(stack, elt, link) \ - { \ - if (!ISC_SLINK_LINKED(elt, link)) { \ - ISC_ASTACK_PUSHUNSAFE(stack, elt, link); \ - } \ - } - -/* - * ATOMIC: acquire ordering pairs with ISC_ASTACK_PUSHUNSAFEe() - */ -#define ISC_ASTACK_TOP(stack) atomic_load_acquire(&(stack).atop) -#define ISC_ASTACK_EMPTY(stack) (ISC_ASTACK_TOP(stack) == NULL) - -/* - * ATOMIC: acquire ordering pairs with ISC_ASTACK_PUSHUNSAFE() - */ -#define ISC_ASTACK_TO_STACK(stack) \ - { \ - .top = atomic_exchange_acquire(&(stack).atop, NULL), \ - } From 5e97ec5eadf4fb1dd23e2d86d6795b278341d6c8 Mon Sep 17 00:00:00 2001 From: Tony Finch Date: Thu, 9 Mar 2023 11:11:41 +0000 Subject: [PATCH 12/12] CHANGES note for [GL #3936] [cleanup] Refactor the loop manager and qp-trie code to remove isc_qsbr and use liburcu instead. [GL #3936] --- CHANGES | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGES b/CHANGES index aaec78b58a..5598d6f373 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,6 @@ +6172. [cleanup] Refactor the loop manager and qp-trie code to remove + isc_qsbr and use liburcu instead. [GL #3936] + 6171. [cleanup] Remove the stack implementation added in change 6108: we are using the liburcu concurrent data structures instead. [GL !7920]