mirror of
https://github.com/isc-projects/bind9.git
synced 2026-06-05 04:02:04 -04:00
Merge branch 'fanf-urcu-qp' into 'main'
Replace isc_qsbr with liburcu-qsbr Closes #3936 and #4019 See merge request isc-projects/bind9!7668
This commit is contained in:
commit
562697e703
49 changed files with 302 additions and 1598 deletions
|
|
@ -18,14 +18,20 @@ variables:
|
|||
CLANG_VERSION: 16
|
||||
CLANG: "clang-${CLANG_VERSION}"
|
||||
SCAN_BUILD: "scan-build-${CLANG_VERSION}"
|
||||
ASAN_SYMBOLIZER_PATH: "/usr/lib/llvm-${CLANG_VERSION}/bin/llvm-symbolizer"
|
||||
LLVM_SYMBOLIZER: "/usr/lib/llvm-${CLANG_VERSION}/bin/llvm-symbolizer"
|
||||
CLANG_FORMAT: "clang-format-${CLANG_VERSION}"
|
||||
|
||||
CFLAGS_COMMON: -fno-omit-frame-pointer -fno-optimize-sibling-calls -O1 -g -Wall -Wextra
|
||||
|
||||
# Pass run-time flags to AddressSanitizer to get core dumps on error.
|
||||
ASAN_OPTIONS: abort_on_error=1:disable_coredump=0:unmap_shadow_on_exit=1
|
||||
TSAN_OPTIONS_COMMON: "disable_coredump=0 second_deadlock_stack=1 history_size=7 log_exe_name=true log_path=tsan"
|
||||
ASAN_SYMBOLIZER_PATH: "${LLVM_SYMBOLIZER}"
|
||||
|
||||
TSAN_OPTIONS_COMMON: "disable_coredump=0 second_deadlock_stack=1 atexit_sleep_ms=1000 history_size=7 log_exe_name=true log_path=tsan"
|
||||
TSAN_SYMBOLIZER: "external_symbolizer_path=/usr/bin/llvm-symbolizer"
|
||||
TSAN_SUPPRESSIONS: "suppressions=${CI_PROJECT_DIR}/.tsan-suppress"
|
||||
TSAN_OPTIONS_DEFAULT: "${TSAN_OPTIONS_COMMON} ${TSAN_SUPPRESSIONS} ${TSAN_SYMBOLIZER}"
|
||||
|
||||
UBSAN_OPTIONS: "halt_on_error=1:abort_on_error=1:disable_coredump=0"
|
||||
|
||||
TARBALL_EXTENSION: xz
|
||||
|
|
@ -1044,7 +1050,7 @@ gcc:tsan:
|
|||
|
||||
system:gcc:tsan:
|
||||
variables:
|
||||
TSAN_OPTIONS: "${TSAN_OPTIONS_COMMON} external_symbolizer_path=/usr/bin/llvm-symbolizer"
|
||||
TSAN_OPTIONS: "${TSAN_OPTIONS_DEFAULT}"
|
||||
<<: *fedora_37_amd64_image
|
||||
<<: *system_test_tsan_job
|
||||
needs:
|
||||
|
|
@ -1053,7 +1059,7 @@ system:gcc:tsan:
|
|||
|
||||
unit:gcc:tsan:
|
||||
variables:
|
||||
TSAN_OPTIONS: "${TSAN_OPTIONS_COMMON} external_symbolizer_path=/usr/bin/llvm-symbolizer"
|
||||
TSAN_OPTIONS: "${TSAN_OPTIONS_DEFAULT}"
|
||||
<<: *fedora_37_amd64_image
|
||||
<<: *unit_test_tsan_job
|
||||
needs:
|
||||
|
|
@ -1071,7 +1077,7 @@ clang:tsan:
|
|||
|
||||
system:clang:tsan:
|
||||
variables:
|
||||
TSAN_OPTIONS: "${TSAN_OPTIONS_COMMON} external_symbolizer_path=/usr/lib/llvm-${CLANG_VERSION}/bin/llvm-symbolizer"
|
||||
TSAN_OPTIONS: "${TSAN_OPTIONS_COMMON} ${TSAN_SUPPRESSIONS} external_symbolizer_path=${LLVM_SYMBOLIZER}"
|
||||
<<: *base_image
|
||||
<<: *system_test_tsan_job
|
||||
needs:
|
||||
|
|
@ -1080,7 +1086,7 @@ system:clang:tsan:
|
|||
|
||||
unit:clang:tsan:
|
||||
variables:
|
||||
TSAN_OPTIONS: "${TSAN_OPTIONS_COMMON} external_symbolizer_path=/usr/lib/llvm-${CLANG_VERSION}/bin/llvm-symbolizer suppressions=$CI_PROJECT_DIR/tsan-suppressions.txt"
|
||||
TSAN_OPTIONS: "${TSAN_OPTIONS_COMMON} suppressions=$CI_PROJECT_DIR/.tsan-suppress-extra external_symbolizer_path=${LLVM_SYMBOLIZER}"
|
||||
<<: *base_image
|
||||
<<: *unit_test_tsan_job
|
||||
needs:
|
||||
|
|
@ -1323,7 +1329,7 @@ respdiff-short:tsan:
|
|||
LDFLAGS: "-fsanitize=thread"
|
||||
EXTRA_CONFIGURE: "--enable-pthread-rwlock --without-jemalloc"
|
||||
MAX_DISAGREEMENTS_PERCENTAGE: "0.5"
|
||||
TSAN_OPTIONS: "${TSAN_OPTIONS_COMMON} external_symbolizer_path=/usr/bin/llvm-symbolizer"
|
||||
TSAN_OPTIONS: "${TSAN_OPTIONS_DEFAULT}"
|
||||
script:
|
||||
- bash respdiff.sh -s named -q "${PWD}/10k_a.txt" -c 3 -w "${PWD}/rspworkdir" "${CI_PROJECT_DIR}" "/usr/local/respdiff-reference-bind/sbin/named"
|
||||
after_script:
|
||||
|
|
@ -1361,7 +1367,7 @@ respdiff-long:tsan:
|
|||
LDFLAGS: "-fsanitize=thread"
|
||||
EXTRA_CONFIGURE: "--enable-pthread-rwlock --without-jemalloc"
|
||||
MAX_DISAGREEMENTS_PERCENTAGE: "0.5"
|
||||
TSAN_OPTIONS: "${TSAN_OPTIONS_COMMON} external_symbolizer_path=/usr/bin/llvm-symbolizer"
|
||||
TSAN_OPTIONS: "${TSAN_OPTIONS_DEFAULT}"
|
||||
script:
|
||||
- bash respdiff.sh -s named -q "${PWD}/100k_mixed.txt" -c 3 -w "${PWD}/rspworkdir" "${CI_PROJECT_DIR}" "/usr/local/respdiff-reference-bind/sbin/named"
|
||||
after_script:
|
||||
|
|
|
|||
|
|
@ -163,11 +163,12 @@ Files: **/.clang-format
|
|||
.gitlab-ci.yml
|
||||
.lgtm.yml
|
||||
.pylintrc
|
||||
.tsan-suppress
|
||||
.tsan-suppress-extra
|
||||
.uncrustify.cfg
|
||||
doc/misc/*.zoneopt
|
||||
doc/misc/options
|
||||
doc/misc/rndc.grammar
|
||||
tsan-suppressions.txt
|
||||
sonar-project.properties
|
||||
Copyright: Internet Systems Consortium, Inc. ("ISC")
|
||||
License: CC0-1.0
|
||||
|
|
|
|||
4
.tsan-suppress
Normal file
4
.tsan-suppress
Normal file
|
|
@ -0,0 +1,4 @@
|
|||
# be more selective with liburcu
|
||||
race:rcu_barrier
|
||||
race:rcu_memb_barrier
|
||||
thread:*
|
||||
7
.tsan-suppress-extra
Normal file
7
.tsan-suppress-extra
Normal file
|
|
@ -0,0 +1,7 @@
|
|||
# Uninstrumented libraries
|
||||
called_from_lib:libfstrm.so
|
||||
called_from_lib:libdummyrpz.so
|
||||
# be more selective with liburcu
|
||||
race:rcu_barrier
|
||||
race:rcu_memb_barrier
|
||||
thread:*
|
||||
7
CHANGES
7
CHANGES
|
|
@ -1,3 +1,10 @@
|
|||
6172. [cleanup] Refactor the loop manager and qp-trie code to remove
|
||||
isc_qsbr and use liburcu instead. [GL #3936]
|
||||
|
||||
6171. [cleanup] Remove the stack implementation added in change 6108:
|
||||
we are using the liburcu concurrent data structures
|
||||
instead. [GL !7920]
|
||||
|
||||
6170. [func] The 'rndc -t' option allows a timeout to be set in
|
||||
seconds, so that commands that take a long time to
|
||||
complete (e.g., reloading a very large configuration)
|
||||
|
|
|
|||
|
|
@ -2147,8 +2147,7 @@ run_server(void *arg) {
|
|||
CHECK(ns_interfacemgr_create(mctx, sctx, loopmgr, netmgr, dispatchmgr,
|
||||
NULL, false, &interfacemgr));
|
||||
|
||||
CHECK(dns_view_create(mctx, loopmgr, dns_rdataclass_in, "_default",
|
||||
&view));
|
||||
CHECK(dns_view_create(mctx, dns_rdataclass_in, "_default", &view));
|
||||
CHECK(dns_cache_create(loopmgr, dns_rdataclass_in, "", &cache));
|
||||
dns_view_setcache(view, cache, false);
|
||||
dns_cache_detach(&cache);
|
||||
|
|
|
|||
|
|
@ -6437,8 +6437,7 @@ create_view(const cfg_obj_t *vconfig, dns_viewlist_t *viewlist,
|
|||
}
|
||||
INSIST(view == NULL);
|
||||
|
||||
result = dns_view_create(named_g_mctx, named_g_loopmgr, viewclass,
|
||||
viewname, &view);
|
||||
result = dns_view_create(named_g_mctx, viewclass, viewname, &view);
|
||||
if (result != ISC_R_SUCCESS) {
|
||||
return (result);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -256,7 +256,7 @@ main(int argc, char *argv[]) {
|
|||
RUNCHECK(dns_requestmgr_create(mctx, dispatchmgr, dispatchv4, NULL,
|
||||
&requestmgr));
|
||||
|
||||
RUNCHECK(dns_view_create(mctx, loopmgr, 0, "_test", &view));
|
||||
RUNCHECK(dns_view_create(mctx, 0, "_test", &view));
|
||||
|
||||
isc_loopmgr_setup(loopmgr, sendqueries, NULL);
|
||||
isc_loopmgr_run(loopmgr);
|
||||
|
|
|
|||
|
|
@ -89,6 +89,7 @@ if ! $do_run; then
|
|||
LOG_FLAGS="$log_flags" \
|
||||
TEST_LARGE_MAP="${TEST_LARGE_MAP}" \
|
||||
CI_ENABLE_ALL_TESTS="${CI_ENABLE_ALL_TESTS}" \
|
||||
${TSAN_OPTIONS:+"TSAN_OPTIONS=${TSAN_OPTIONS}"} \
|
||||
${VIRTUAL_ENV:+"VIRTUAL_ENV=${VIRTUAL_ENV}"} \
|
||||
${PERL5LIB:+"PERL5LIB=${PERL5LIB}"} \
|
||||
make -e check
|
||||
|
|
|
|||
|
|
@ -2141,7 +2141,7 @@ main(int argc, char *argv[]) {
|
|||
mctx, dispatchmgr, have_ipv4 ? dispatchvx : NULL,
|
||||
have_ipv6 ? dispatchvx : NULL, &requestmgr));
|
||||
|
||||
RUNCHECK(dns_view_create(mctx, loopmgr, 0, "_mdig", &view));
|
||||
RUNCHECK(dns_view_create(mctx, 0, "_mdig", &view));
|
||||
|
||||
query = ISC_LIST_HEAD(queries);
|
||||
isc_loopmgr_setup(loopmgr, sendqueries, query);
|
||||
|
|
|
|||
|
|
@ -657,45 +657,6 @@ Items can be removed from the list using `ISC_LIST_UNLINK`:
|
|||
|
||||
ISC_LIST_UNLINK(foolist, foo, link);
|
||||
|
||||
##### Atomic stacks
|
||||
|
||||
In `<isc/stack.h>`, there are also similar macros for singly-linked
|
||||
stacks (`ISC_STACK`) and atomic stacks (`ISC_ASTACK`).
|
||||
|
||||
Lock-free linked data structures have pitfalls that the `ISC_ASTACK`
|
||||
macros avoid by having a restricted API. Firstly, it is difficult to
|
||||
implement atomic doubly-linked lists without at least a double-width
|
||||
compare-exchange, which is not a widely supported primitive, so this
|
||||
stack is singly-linked. Secondly, when individual elements can be
|
||||
removed from a list (whether doubly-linked or singly-linked), we run
|
||||
into the ABA problem, where removes and (re-)inserts race and as a
|
||||
result the links get in a tangle. Safe support for removing individual
|
||||
elements requires some kind of garbage collection, or extending link
|
||||
pointers with generation counters. Instead the ASTACK macros only
|
||||
provide a function for emptying the entire stack in one go,
|
||||
`ISC_ASTACK_TO_STACK()`. This only requires setting the `top` pointer to
|
||||
`NULL`, which cannot cause `ISC_ASTACK_PUSH()` to go wrong.
|
||||
|
||||
The `ISC_ASTACK` macros are used the same way as the non-atomic
|
||||
doubly-linked lists described above, except that not all of the
|
||||
`ISC_LIST` macros have `ISC_ASTACK` equivalents, and stacks have a
|
||||
`TOP` instead of a `HEAD`.
|
||||
|
||||
As well as read-only iteration similar to `ISC_LIST` described above,
|
||||
there is an idiom for emptying an atomic stack into a regular stack,
|
||||
and processing its contents:
|
||||
|
||||
ISC_STACK(isc_foo_t) drain = ISC_ASTACK_TO_STACK(foo_work);
|
||||
while (!ISC_STACK_EMPTY(drain)) {
|
||||
isc_foo_t *foo = ISC_STACK_POP(drain, link);
|
||||
/* do things */
|
||||
/* maybe again? ISC_ASTACK_PUSH(foo_work, foo, link) */
|
||||
}
|
||||
|
||||
There is also an `ISC_ASTACK_ADD()` macro, which is a convenience
|
||||
wrapper for pushing an element onto a stack if it has not already been
|
||||
added.
|
||||
|
||||
#### <a name="names"></a>Names
|
||||
|
||||
The `dns_name` API has facilities for processing DNS names and labels,
|
||||
|
|
|
|||
397
doc/dev/qsbr.md
397
doc/dev/qsbr.md
|
|
@ -1,397 +0,0 @@
|
|||
<!--
|
||||
Copyright (C) Internet Systems Consortium, Inc. ("ISC")
|
||||
|
||||
SPDX-License-Identifier: MPL-2.0
|
||||
|
||||
This Source Code Form is subject to the terms of the Mozilla Public
|
||||
License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
file, you can obtain one at https://mozilla.org/MPL/2.0/.
|
||||
|
||||
See the COPYRIGHT file distributed with this work for additional
|
||||
information regarding copyright ownership.
|
||||
-->
|
||||
|
||||
QSBR: quiescent state based reclamation
|
||||
=======================================
|
||||
|
||||
QSBR is a safe memory reclamation (SMR) algorithm for lock-free data
|
||||
structures such as a qp-trie. (See `doc/dev/qp.md`.)
|
||||
|
||||
When an object is unlinked from a lock-free data structure, it
|
||||
cannot be `free()`ed immediately, because there can still be readers
|
||||
accessing the object via an old version of the data structure. SMR
|
||||
algorithms determine when it is safe to reclaim memory after it has
|
||||
been unlinked.
|
||||
|
||||
|
||||
Introductions and overviews
|
||||
---------------------------
|
||||
|
||||
There is a terse overview in `include/isc/qsbr.h`.
|
||||
|
||||
Jeff Preshing has a nice introduction to QSBR,
|
||||
_<https://preshing.com/20160726/using-quiescent-states-to-reclaim-memory/>_
|
||||
|
||||
At the end of this note is a copy of a blog post about writing BIND's
|
||||
`isc_qsbr`, _<https://dotat.at/@/2023-01-10-qsbr.html>_
|
||||
|
||||
[Paul McKenney's web page][paulmck] has links to his book on
|
||||
concurrent programming, the [Userspace RCU library][urcu], and more.
|
||||
McKenney invented RCU and QSBR. RCU is the Linux kernel's machinery
|
||||
for lock-free data structures and safe memory reclamation, based on
|
||||
QSBR.
|
||||
|
||||
[paulmck]: http://www.rdrop.com/~paulmck/
|
||||
[urcu]: https://liburcu.org/
|
||||
|
||||
|
||||
Example code
|
||||
------------
|
||||
|
||||
If you are implementing a lock-free data structure that needs safe
|
||||
memory reclamation, here's a guide to using `isc_qsbr`, based on how
|
||||
QSBR is used by `dns_qp`.
|
||||
|
||||
### registration
|
||||
|
||||
When the program starts up you need to register a global callback
|
||||
function that will reclaim unused memory. You can do so using an
|
||||
ISC_CONSTRUCTOR function that runs automatically at startup.
|
||||
|
||||
static void
|
||||
qp_qsbr_register(void) ISC_CONSTRUCTOR;
|
||||
static void
|
||||
qp_qsbr_register(void) {
|
||||
isc_qsbr_register(qp_qsbr_reclaimer);
|
||||
}
|
||||
|
||||
### work list
|
||||
|
||||
Your module will need somewhere that your callback can find the work
|
||||
it needs to do. The qp-trie has an atomic list of `dns_qpmulti_t`
|
||||
objects for this purpose.
|
||||
|
||||
/* a global variable */
|
||||
static ISC_ASTACK(dns_qpmulti_t) qsbr_work;
|
||||
|
||||
The reason for using global variables is so that we don't need to
|
||||
allocate a thunk every time we have memory reclamation work to do.
|
||||
|
||||
### read-only access
|
||||
|
||||
You should design your data structure so that it has a single atomic
|
||||
root pointer referring to its current version. A lock-free reader
|
||||
_must_ run in an `isc_loop` callback. It gains access to the data
|
||||
structure by taking a copy of this pointer:
|
||||
|
||||
qp_node_t *reader = atomic_load_acquire(&multi->reader);
|
||||
|
||||
During an `isc_loop` callback, a reader should keep using the same
|
||||
pointer go get a consistent view of the data structure. If it reloads
|
||||
the pointer it can get a different version changed by concurrent
|
||||
writers.
|
||||
|
||||
A reader _must_ stop using the root pointer and any interior pointers
|
||||
obtained via the root pointer before it returns to the `isc_loop`.
|
||||
|
||||
### modifications and writes
|
||||
|
||||
All changes to the data structure must be copy-on-write (aka
|
||||
read-copy-update) so that concurrent readers are not disturbed.
|
||||
|
||||
When a new version of the data structure has been prepared, it is
|
||||
committed by overwriting the atomic root pointer,
|
||||
|
||||
atomic_store_release(&multi->reader, reader); /* COMMIT */
|
||||
|
||||
### scheduling cleanup
|
||||
|
||||
After committing a change, your data structure may have memory that
|
||||
will become free, after concurrent readers have stopped accessing it.
|
||||
To reclaim the memory when it is safe, use code like:
|
||||
|
||||
isc_qsbr_phase_t phase = isc_qsbr_phase(multi->loopmgr);
|
||||
if (defer_chunk_reclamation(qp, phase)) {
|
||||
ISC_ASTACK_ADD(qsbr_work, multi, cleanup);
|
||||
isc_qsbr_activate(multi->loopmgr, phase);
|
||||
}
|
||||
|
||||
* First, get the current QSBR phase
|
||||
|
||||
* Second, mark free memory with the phase number. The qp-trie scans
|
||||
its chunks and marks those that will become free, and returns
|
||||
`true` if there is cleanup work to do.
|
||||
|
||||
* If so, the qp-trie is added to the work list. (`ISC_ALIST_ADD()`
|
||||
is idempotent).
|
||||
|
||||
* Finally, QSBR is informed that there is work to do.
|
||||
|
||||
In other cases it might not make sense to scan the data structure
|
||||
after committing, and instead you might make note of which memory to
|
||||
clean up while making changes before you know what the phase will be.
|
||||
You can then have per-phase work lists, like:
|
||||
|
||||
static ISC_ASTACK(my_work_t) qsbr_work[ISC_QSBR_PHASES];
|
||||
|
||||
isc_qsbr_phase_t phase = isc_qsbr_phase(loopmgr);
|
||||
ISC_ASTACK_ADD(qsbr_work[phase], cleanup_work, link);
|
||||
isc_qsbr_activate(loopmgr, phase);
|
||||
|
||||
In general, there will be several (maybe many) write operations during
|
||||
a grace period. Your lock-free data structure should collect its
|
||||
reclamation work from all these writes into a batch per phase, i.e.
|
||||
per grace period.
|
||||
|
||||
### reclaiming
|
||||
|
||||
Inside the reclaimer callback, we iterate over the work list and clean
|
||||
up each item on it. If there is more cleanup work to do in another
|
||||
phase, we put the qp-trie back on the work list for another go.
|
||||
|
||||
static void
|
||||
qsbreclaimer(void *arg, isc_qsbr_phase_t phase) {
|
||||
UNUSED(arg);
|
||||
|
||||
ISC_STACK(dns_qpmulti_t) drain = ISC_ASTACK_TO_STACK(qsbr_work);
|
||||
while (!ISC_STACK_EMPTY(drain)) {
|
||||
dns_qpmulti_t *multi = ISC_STACK_POP(drain, cleanup);
|
||||
INSIST(QPMULTI_VALID(multi));
|
||||
LOCK(&multi->mutex);
|
||||
if (reclaim_chunks(&multi->writer, phase)) {
|
||||
/* more to do next time */
|
||||
ISC_ALIST_PUSH(qsbr_work, multi, cleanup);
|
||||
}
|
||||
UNLOCK(&multi->mutex);
|
||||
}
|
||||
}
|
||||
|
||||
### reclaim marks
|
||||
|
||||
In the qp-trie data structure, each chunk has some metadata which
|
||||
includes a bitfield for the reclaim phase:
|
||||
|
||||
isc_qsbr_phase_t phase : ISC_QSBR_PHASE_BITS;
|
||||
|
||||
We use a bitfield so that all the metadata fits in a single word.
|
||||
|
||||
|
||||
------------------------------------------------------------------------
|
||||
|
||||
Safe memory reclamation for BIND
|
||||
================================
|
||||
|
||||
At the end of October 2022, I _finally_ got [my multithreaded
|
||||
qp-trie][qp-gc] working! It could be built with two different
|
||||
concurrency control mechanisms:
|
||||
|
||||
* A reader/writer lock
|
||||
|
||||
This has poor read-side scalability, because every thread is
|
||||
hammering on the same shared location. But its write performance
|
||||
is reasonably good: concurrent readers don't slow it down too much.
|
||||
|
||||
* [`liburcu`, userland read-copy-update][urcu]
|
||||
|
||||
RCU has a fast and scalable read side, nice! But on the write side
|
||||
I used `synchronize_rcu()`, which is blocking and rather slow, so
|
||||
my write performance was terrible.
|
||||
|
||||
OK, but I want the best of both worlds! To fix it, I needed to change
|
||||
the qp-trie code to use safe memory reclamation more effectively:
|
||||
instead of blocking inside `synchronize_rcu()` before cleaning up, use
|
||||
`call_rcu()` to clean up asynchronously. I expect I'll write about the
|
||||
qp-trie changes another time.
|
||||
|
||||
Another issue is that I want the best of both worlds _by default_,
|
||||
but `liburcu` is [LGPL][] and we don't want BIND to depend on
|
||||
code whose licence demands more from our users than the [MPL][].
|
||||
|
||||
[qp-gc]: https://dotat.at/@/2021-06-23-page-based-gc-for-qp-trie-rcu.html
|
||||
[LGPL]: https://opensource.org/licenses/LGPL-2.1
|
||||
[MPL]: https://opensource.org/licenses/MPL-2.0
|
||||
|
||||
So I set out to write my own safe memory reclamation support code.
|
||||
|
||||
|
||||
lock freedom
|
||||
------------
|
||||
|
||||
In a [multithreaded qp-trie][qp-gc], there can be many concurrent
|
||||
readers, but there can be only one writer at a time and modifications
|
||||
are strictly serialized. When I have got it working properly, readers
|
||||
are completely wait-free, unaffected by other readers, and almost
|
||||
unaffected by writers. Writers need to get a mutex to ensure there is
|
||||
only one at a time, but once the mutex is acquired, a writer is not
|
||||
obstructed by readers.
|
||||
|
||||
The way this works is that readers use an atomic load to get a pointer
|
||||
to the root of the current version of the trie. Readers can make
|
||||
multiple queries using this root pointer and the results will be
|
||||
consistent wrt that particular version, regardless of what changes
|
||||
writers might be making concurrently. Writers do not affect readers
|
||||
because all changes are made by copy-on-write. When a writer is ready
|
||||
to commit a new version of the trie, it uses an atomic store to flip
|
||||
the root pointer.
|
||||
|
||||
|
||||
safe memory reclamation
|
||||
-----------------------
|
||||
|
||||
We can't copy-on-write indefinitely: we need to reclaim the memory
|
||||
used by old versions of the trie. And we must do so "safely", i.e.
|
||||
without `free()`ing memory that readers are still using.
|
||||
|
||||
So, before `free()`ing memory, a writer must wait for a _"grace
|
||||
period"_, which is a jargon term meaning "until readers are not using
|
||||
the old version". There are a bunch of algorithms for determining when
|
||||
a grace period is over, with varying amounts of over-approximation,
|
||||
CPU overhead, and memory backlog.
|
||||
|
||||
The [RCU][urcu] function `synchronize_rcu()` is slow because it blocks
|
||||
waiting for a grace period; the `call_rcu()` function runs a callback
|
||||
asynchronously after a grace period has passed. I wanted to avoid
|
||||
blocking my writers, so I needed to implement something like
|
||||
`call_rcu()`.
|
||||
|
||||
|
||||
aversions
|
||||
---------
|
||||
|
||||
When I started trying to work out how to do safe memory reclamation,
|
||||
it all seemed quite intimidating. But as I learned more, I found that
|
||||
my circumstances make it easier than it appeared at first.
|
||||
|
||||
The [`liburcu`][urcu] homepage has a long list of supported CPU
|
||||
architectures and operating systems. Do I have to care about those
|
||||
details too? No! The RCU code dates back to before the age of
|
||||
standardized concurrent memory models, so the RCU developers had to
|
||||
invent their own atomic primitives and correctness rules. Twenty-ish
|
||||
years later the state of the art has advanced, so I can use
|
||||
`<stdatomic.h>` without having to re-do it like `liburcu`.
|
||||
|
||||
You can also choose between several algorithms implemented by
|
||||
[`liburcu`][urcu], involving questions about kernel support, specially
|
||||
reserved signals, and intrusiveness in application code. But while I
|
||||
was working out how to schedule asynchronous memory reclamation work,
|
||||
I realised that BIND is already well-suited to the fastest flavour of
|
||||
RCU, called "QSBR".
|
||||
|
||||
|
||||
QSBR
|
||||
----
|
||||
|
||||
QSBR stands for "quiescent state based reclamation". A _"quiescent
|
||||
state"_ is a fancy name for a point when a thread is not accessing a
|
||||
lock-free data structure, and does not retain any root pointers or
|
||||
interior pointers.
|
||||
|
||||
When a thread has passed through a quiescent state, it no longer has
|
||||
access to older versions of the data structures. When _all_ threads
|
||||
have passed through quiescent states, then nothing in the program has
|
||||
access to old versions. This is how QSBR detects grace periods: after
|
||||
a writer commits a new version, it waits for all threads to pass
|
||||
through quiescent states, and therefore a grace period has definitely
|
||||
elapsed, and so it is then safe to reclaim the old version's memory.
|
||||
|
||||
QSBR is fast because readers do not need to explicitly mark the
|
||||
critical section surrounding the atomic load that I mentioned earlier.
|
||||
Threads just need to pass through a quiescent state frequently enough
|
||||
that there isn't a huge build-up of unreclaimed memory.
|
||||
|
||||
Inside an operating system kernel (RCU's native environment), a
|
||||
context switch provides a natural quiescent state. In a userland
|
||||
application, you need to find a good place to call
|
||||
`rcu_quiescent_state()`. You could call it every time you have
|
||||
finished using a root pointer, but marking a quiescent state is not
|
||||
completely free, so there are probably more efficient ways.
|
||||
|
||||
|
||||
`libuv`
|
||||
-------
|
||||
|
||||
BIND is multithreaded, and (basically) each thread runs an event loop.
|
||||
Recent versions of BIND use [`libuv`][uv] for the event loops.
|
||||
|
||||
A lot of things started falling into place when I realised that the
|
||||
`libuv` event loop gives BIND a [natural quiescent state][uv-loop]:
|
||||
when the event callbacks have finished running, and `libuv` is about
|
||||
to call `select()` or `poll()` or whatever, we can mark a quiescent
|
||||
state. We can require that event-handling functions do not stash root
|
||||
pointers in the heap, but only use them via local variables, so we
|
||||
know that old versions are inaccessible after the callback returns.
|
||||
|
||||
My design marks a quiescent state once per loop, so on a busy server
|
||||
where each loop has lots to do, the cost of marking a quiescent state
|
||||
is amortized across several I/O events.
|
||||
|
||||
[uv]: http://libuv.org/
|
||||
[uv-loop]: http://docs.libuv.org/en/v1.x/design.html#the-i-o-loop
|
||||
|
||||
|
||||
fuzzy barrier
|
||||
-------------
|
||||
|
||||
So, how do we mark a quiescent state? Using a _"fuzzy barrier"_.
|
||||
|
||||
When a thread reaches a normal barrier, it blocks until all the other
|
||||
threads have reached the barrier, after which exactly one of the
|
||||
threads can enter a protected section of code, and the others are
|
||||
unblocked and can proceed as normal.
|
||||
|
||||
When a thread encounters a fuzzy barrier, it never blocks. It either
|
||||
proceeds immediately as normal, or if it is the last thread to reach
|
||||
the barrier, it enters the protected code.
|
||||
|
||||
RCU does not actually use a fuzzy barrier as I have described it. Like
|
||||
a fuzzy barrier, each thread keeps track of whether it has passed
|
||||
through a quiescent state in the current grace period, without
|
||||
blocking; but unlike a fuzzy barrier, no thread is diverted to the
|
||||
protected code. Instead, code that wants to enter a protected section
|
||||
uses the blocking `synchronize_rcu()` function.
|
||||
|
||||
|
||||
EBR-ish
|
||||
-------
|
||||
|
||||
As in the paper ["performance of memory reclamation for lockless
|
||||
synchronization"][HMBW], my implementation of QSBR uses a fuzzy
|
||||
barrier designed for another safe memory reclamation algorithm, EBR,
|
||||
epoch based reclamation. (EBR was invented here in Cambridge by [Keir
|
||||
Fraser][tr579].)
|
||||
|
||||
[HMBW]: http://csng.cs.toronto.edu/publication_files/0000/0159/jpdc07.pdf
|
||||
[tr579]: https://www.cl.cam.ac.uk/techreports/UCAM-CL-TR-579.html
|
||||
|
||||
Actually, my fuzzy barrier is slightly different to EBR's. In EBR, the
|
||||
fuzzy barrier is used every time the program enters a critical
|
||||
section. (In qp-trie terms, that would be every time a reader fetches
|
||||
a root pointer.) So it is vital that EBR's barrier avoids mutating
|
||||
shared state, because that would wreck multithreaded performance.
|
||||
|
||||
Because BIND will only pass through the fuzzy barrier when it is about
|
||||
to use a blocking system call, my version mutates shared state more
|
||||
frequently (typically, once per CPU per grace period, instead of once
|
||||
per grace period). If this turns out to be a problem, it won't be too
|
||||
hard to make it work more like EBR.
|
||||
|
||||
More trivially, I'm using the term "phase" instead of "epoch", because
|
||||
it's nothing to do with the unix epoch, because there are three
|
||||
phases, and because I can talk about phase transitions and threads
|
||||
being out of phase with each other.
|
||||
|
||||
|
||||
coda
|
||||
----
|
||||
|
||||
While reading various RCU-related papers, I was amused by ["user-level
|
||||
implementations of read-copy update"][DMSDW], which says:
|
||||
|
||||
> BIND, a major domain-name server used for Internet domain-name
|
||||
> resolution, is facing scalability issues. Since domain names
|
||||
> are read often but rarely updated, using user-level RCU might be
|
||||
> beneficial.
|
||||
|
||||
Yes, I think it might :-)
|
||||
|
||||
[DMSDW]: https://www.efficios.com/publications/
|
||||
|
|
@ -6,6 +6,7 @@ AM_CFLAGS += \
|
|||
AM_CPPFLAGS += \
|
||||
$(LIBISC_CFLAGS) \
|
||||
$(LIBDNS_CFLAGS) \
|
||||
$(LIBURCU_CFLAGS) \
|
||||
$(LIBUV_CFLAGS) \
|
||||
-DFUZZDIR=\"$(abs_srcdir)\" \
|
||||
-I$(top_srcdir)/lib/dns \
|
||||
|
|
|
|||
|
|
@ -236,8 +236,7 @@ LLVMFuzzerInitialize(int *argc ISC_ATTR_UNUSED, char ***argv ISC_ATTR_UNUSED) {
|
|||
|
||||
isc_loopmgr_create(mctx, 1, &loopmgr);
|
||||
|
||||
result = dns_view_create(mctx, loopmgr, dns_rdataclass_in, "view",
|
||||
&view);
|
||||
result = dns_view_create(mctx, dns_rdataclass_in, "view", &view);
|
||||
if (result != ISC_R_SUCCESS) {
|
||||
fprintf(stderr, "dns_view_create failed: %s\n",
|
||||
isc_result_totext(result));
|
||||
|
|
|
|||
|
|
@ -16,10 +16,10 @@
|
|||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
|
||||
#include <isc/qsbr.h>
|
||||
#include <isc/random.h>
|
||||
#include <isc/refcount.h>
|
||||
#include <isc/rwlock.h>
|
||||
#include <isc/urcu.h>
|
||||
#include <isc/util.h>
|
||||
|
||||
#include <dns/qp.h>
|
||||
|
|
|
|||
|
|
@ -263,6 +263,7 @@ libdns_la_CPPFLAGS = \
|
|||
$(AM_CPPFLAGS) \
|
||||
$(LIBDNS_CFLAGS) \
|
||||
$(LIBISC_CFLAGS) \
|
||||
$(LIBURCU_CFLAGS) \
|
||||
$(LIBUV_CFLAGS) \
|
||||
$(OPENSSL_CFLAGS)
|
||||
|
||||
|
|
@ -272,6 +273,7 @@ libdns_la_LDFLAGS = \
|
|||
|
||||
libdns_la_LIBADD = \
|
||||
$(LIBISC_LIBS) \
|
||||
$(LIBURCU_LIBS) \
|
||||
$(LIBUV_LIBS) \
|
||||
$(OPENSSL_LIBS)
|
||||
|
||||
|
|
|
|||
|
|
@ -206,8 +206,7 @@ createview(isc_mem_t *mctx, dns_rdataclass_t rdclass, isc_loopmgr_t *loopmgr,
|
|||
isc_result_t result;
|
||||
dns_view_t *view = NULL;
|
||||
|
||||
result = dns_view_create(mctx, loopmgr, rdclass, DNS_CLIENTVIEW_NAME,
|
||||
&view);
|
||||
result = dns_view_create(mctx, rdclass, DNS_CLIENTVIEW_NAME, &view);
|
||||
if (result != ISC_R_SUCCESS) {
|
||||
return (result);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -47,17 +47,11 @@
|
|||
* version of the trie.
|
||||
*
|
||||
* For fast concurrent reads, call `dns_qpmulti_query()` to fill in a
|
||||
* `dns_qpread_t`, which must be allocated on the stack. Readers can
|
||||
* access a single version of the trie within the scope of an isc_loop
|
||||
* thread (NOT an isc_work thread). We rely on the loop to bound the
|
||||
* lifetime of a `dns_qpread_t`, instead of using locks. Readers are
|
||||
* not blocked by any write activity, and vice versa.
|
||||
*
|
||||
* For read-only access outside the scope of a loop, such as from an
|
||||
* isc_work callback, `dns_qpmulti_lockedread()`. This looks like a
|
||||
* query transaction; the difference is that a locked read transaction
|
||||
* takes the `dns_qpmulti_t` mutex. When you have finished with a
|
||||
* `dns_qpread_t`, call `dns_qpread_destroy()` to release the mutex.
|
||||
* `dns_qpread_t`, which must be allocated on the stack. This gives
|
||||
* the reader access to a single version of the trie. The reader's
|
||||
* thread must be registered with `liburcu`, which is normally taken
|
||||
* care of by `libisc`. Readers are not blocked by any write activity,
|
||||
* and vice versa.
|
||||
*
|
||||
* For reads that need a stable view of the trie for multiple cycles
|
||||
* of an isc_loop, or which can be used from any thread, call
|
||||
|
|
@ -314,17 +308,15 @@ dns_qp_destroy(dns_qp_t **qptp);
|
|||
*/
|
||||
|
||||
void
|
||||
dns_qpmulti_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr,
|
||||
const dns_qpmethods_t *methods, void *uctx,
|
||||
dns_qpmulti_create(isc_mem_t *mctx, const dns_qpmethods_t *methods, void *uctx,
|
||||
dns_qpmulti_t **qpmp);
|
||||
/*%<
|
||||
* Create a multi-threaded qp-trie.
|
||||
*
|
||||
* Requires:
|
||||
* \li `mctx` is a pointer to a valid memory context.
|
||||
* \li 'loopmgr' is a valid loop manager.
|
||||
* \li `qpmp != NULL && *qpmp == NULL`
|
||||
* \li all the methods are non-NULL
|
||||
* \li `qpmp != NULL && *qpmp == NULL`
|
||||
*
|
||||
* Ensures:
|
||||
* \li `*qpmp` is a pointer to a valid multi-threaded qp-trie
|
||||
|
|
|
|||
|
|
@ -259,8 +259,8 @@ struct dns_view {
|
|||
#endif /* HAVE_LMDB */
|
||||
|
||||
isc_result_t
|
||||
dns_view_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr,
|
||||
dns_rdataclass_t rdclass, const char *name, dns_view_t **viewp);
|
||||
dns_view_create(isc_mem_t *mctx, dns_rdataclass_t rdclass, const char *name,
|
||||
dns_view_t **viewp);
|
||||
/*%<
|
||||
* Create a view.
|
||||
*
|
||||
|
|
|
|||
|
|
@ -34,8 +34,7 @@ typedef isc_result_t
|
|||
dns_zt_callback_t(void *arg);
|
||||
|
||||
void
|
||||
dns_zt_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr, dns_view_t *view,
|
||||
dns_zt_t **ztp);
|
||||
dns_zt_create(isc_mem_t *mctx, dns_view_t *view, dns_zt_t **ztp);
|
||||
/*%<
|
||||
* Creates a new zone table for a view.
|
||||
*
|
||||
|
|
|
|||
244
lib/dns/qp.c
244
lib/dns/qp.c
|
|
@ -28,17 +28,16 @@
|
|||
|
||||
#include <isc/atomic.h>
|
||||
#include <isc/buffer.h>
|
||||
#include <isc/loop.h>
|
||||
#include <isc/magic.h>
|
||||
#include <isc/mem.h>
|
||||
#include <isc/mutex.h>
|
||||
#include <isc/qsbr.h>
|
||||
#include <isc/refcount.h>
|
||||
#include <isc/result.h>
|
||||
#include <isc/rwlock.h>
|
||||
#include <isc/tid.h>
|
||||
#include <isc/time.h>
|
||||
#include <isc/types.h>
|
||||
#include <isc/urcu.h>
|
||||
#include <isc/util.h>
|
||||
|
||||
#include <dns/fixedname.h>
|
||||
|
|
@ -426,17 +425,18 @@ static void
|
|||
realloc_chunk_arrays(dns_qp_t *qp, qp_chunk_t newmax) {
|
||||
size_t oldptrs = sizeof(qp->base->ptr[0]) * qp->chunk_max;
|
||||
size_t newptrs = sizeof(qp->base->ptr[0]) * newmax;
|
||||
size_t allbytes = sizeof(dns_qpbase_t) + newptrs;
|
||||
size_t size = STRUCT_FLEX_SIZE(qp->base, ptr, newmax);
|
||||
|
||||
if (qp->base == NULL || qpbase_unref(qp)) {
|
||||
qp->base = isc_mem_reallocate(qp->mctx, qp->base, allbytes);
|
||||
qp->base = isc_mem_reallocate(qp->mctx, qp->base, size);
|
||||
} else {
|
||||
dns_qpbase_t *oldbase = qp->base;
|
||||
qp->base = isc_mem_allocate(qp->mctx, allbytes);
|
||||
qp->base = isc_mem_allocate(qp->mctx, size);
|
||||
memmove(&qp->base->ptr[0], &oldbase->ptr[0], oldptrs);
|
||||
}
|
||||
memset(&qp->base->ptr[qp->chunk_max], 0, newptrs - oldptrs);
|
||||
isc_refcount_init(&qp->base->refcount, 1);
|
||||
qp->base->magic = QPBASE_MAGIC;
|
||||
|
||||
/* usage array is exclusive to the writer */
|
||||
size_t oldusage = sizeof(qp->usage[0]) * qp->chunk_max;
|
||||
|
|
@ -555,12 +555,14 @@ chunk_usage(dns_qp_t *qp, qp_chunk_t chunk) {
|
|||
*/
|
||||
static void
|
||||
chunk_discount(dns_qp_t *qp, qp_chunk_t chunk) {
|
||||
if (qp->usage[chunk].phase == 0) {
|
||||
INSIST(qp->used_count >= qp->usage[chunk].used);
|
||||
INSIST(qp->free_count >= qp->usage[chunk].free);
|
||||
qp->used_count -= qp->usage[chunk].used;
|
||||
qp->free_count -= qp->usage[chunk].free;
|
||||
if (qp->usage[chunk].discounted) {
|
||||
return;
|
||||
}
|
||||
INSIST(qp->used_count >= qp->usage[chunk].used);
|
||||
INSIST(qp->free_count >= qp->usage[chunk].free);
|
||||
qp->used_count -= qp->usage[chunk].used;
|
||||
qp->free_count -= qp->usage[chunk].free;
|
||||
qp->usage[chunk].discounted = true;
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
@ -622,109 +624,95 @@ recycle(dns_qp_t *qp) {
|
|||
}
|
||||
|
||||
/*
|
||||
* At the end of a transaction, mark empty but immutable chunks for
|
||||
* reclamation later. Returns true when chunks need reclaiming later.
|
||||
* asynchronous cleanup
|
||||
*/
|
||||
static bool
|
||||
defer_chunk_reclamation(dns_qp_t *qp, isc_qsbr_phase_t phase) {
|
||||
unsigned int reclaim = 0;
|
||||
static void
|
||||
reclaim_chunks_cb(struct rcu_head *arg) {
|
||||
qp_rcuctx_t *rcuctx = isc_urcu_container(arg, qp_rcuctx_t, rcu_head);
|
||||
REQUIRE(QPRCU_VALID(rcuctx));
|
||||
dns_qpmulti_t *multi = rcuctx->multi;
|
||||
REQUIRE(QPMULTI_VALID(multi));
|
||||
|
||||
for (qp_chunk_t chunk = 0; chunk < qp->chunk_max; chunk++) {
|
||||
if (chunk != qp->bump && chunk_usage(qp, chunk) == 0 &&
|
||||
qp->usage[chunk].exists && qp->usage[chunk].immutable &&
|
||||
qp->usage[chunk].phase == 0)
|
||||
{
|
||||
chunk_discount(qp, chunk);
|
||||
qp->usage[chunk].phase = phase;
|
||||
reclaim++;
|
||||
}
|
||||
}
|
||||
LOCK(&multi->mutex);
|
||||
|
||||
if (reclaim > 0) {
|
||||
LOG_STATS("qp will reclaim %u chunks in phase %u", reclaim,
|
||||
phase);
|
||||
}
|
||||
dns_qp_t *qp = &multi->writer;
|
||||
REQUIRE(QP_VALID(qp));
|
||||
|
||||
return (reclaim > 0);
|
||||
}
|
||||
|
||||
static bool
|
||||
reclaim_chunks(dns_qp_t *qp, isc_qsbr_phase_t phase) {
|
||||
unsigned int free = 0;
|
||||
bool more = false;
|
||||
|
||||
isc_nanosecs_t start = isc_time_monotonic();
|
||||
|
||||
for (qp_chunk_t chunk = 0; chunk < qp->chunk_max; chunk++) {
|
||||
if (qp->usage[chunk].phase == phase) {
|
||||
if (qp->usage[chunk].snapshot) {
|
||||
/* cleanup when snapshot is destroyed */
|
||||
qp->usage[chunk].snapfree = true;
|
||||
} else {
|
||||
chunk_free(qp, chunk);
|
||||
free++;
|
||||
}
|
||||
} else if (qp->usage[chunk].phase != 0) {
|
||||
/*
|
||||
* We need to reclaim more of this trie's memory
|
||||
* on a later qsbr callback.
|
||||
*/
|
||||
more = true;
|
||||
for (unsigned int i = 0; i < rcuctx->count; i++) {
|
||||
qp_chunk_t chunk = rcuctx->chunk[i];
|
||||
if (qp->usage[chunk].snapshot) {
|
||||
/* cleanup when snapshot is destroyed */
|
||||
qp->usage[chunk].snapfree = true;
|
||||
} else {
|
||||
chunk_free(qp, chunk);
|
||||
free++;
|
||||
}
|
||||
}
|
||||
|
||||
isc_mem_putanddetach(&rcuctx->mctx, rcuctx,
|
||||
STRUCT_FLEX_SIZE(rcuctx, chunk, rcuctx->count));
|
||||
|
||||
isc_nanosecs_t time = isc_time_monotonic() - start;
|
||||
recycle_time += time;
|
||||
|
||||
if (free > 0) {
|
||||
LOG_STATS("qp reclaim" PRItime "phase %u free %u chunks", time,
|
||||
phase, free);
|
||||
LOG_STATS("qp reclaim" PRItime "free %u chunks", time, free);
|
||||
LOG_STATS("qp reclaim leaf %u live %u used %u free %u hold %u",
|
||||
qp->leaf_count, qp->used_count - qp->free_count,
|
||||
qp->used_count, qp->free_count, qp->hold_count);
|
||||
}
|
||||
|
||||
return (more);
|
||||
UNLOCK(&multi->mutex);
|
||||
}
|
||||
|
||||
/*
|
||||
* List of `dns_qpmulti_t`s that have chunks to be reclaimed.
|
||||
*/
|
||||
static ISC_ASTACK(dns_qpmulti_t) qsbr_work;
|
||||
|
||||
/*
|
||||
* When a grace period has passed, this function reclaims any unused memory.
|
||||
* At the end of a transaction, schedule empty but immutable chunks
|
||||
* for reclamation later.
|
||||
*/
|
||||
static void
|
||||
qp_qsbr_reclaimer(isc_qsbr_phase_t phase) {
|
||||
ISC_STACK(dns_qpmulti_t) drain = ISC_ASTACK_TO_STACK(qsbr_work);
|
||||
while (!ISC_STACK_EMPTY(drain)) {
|
||||
/* lock before pop */
|
||||
dns_qpmulti_t *multi = ISC_STACK_TOP(drain);
|
||||
INSIST(QPMULTI_VALID(multi));
|
||||
LOCK(&multi->mutex);
|
||||
ISC_STACK_POP(drain, cleanup);
|
||||
if (multi->writer.destroy) {
|
||||
UNLOCK(&multi->mutex);
|
||||
dns_qpmulti_destroy(&multi);
|
||||
} else {
|
||||
if (reclaim_chunks(&multi->writer, phase)) {
|
||||
/* more to do next time */
|
||||
ISC_ASTACK_PUSH(qsbr_work, multi, cleanup);
|
||||
}
|
||||
UNLOCK(&multi->mutex);
|
||||
reclaim_chunks(dns_qpmulti_t *multi) {
|
||||
dns_qp_t *qp = &multi->writer;
|
||||
|
||||
unsigned int count = 0;
|
||||
for (qp_chunk_t chunk = 0; chunk < qp->chunk_max; chunk++) {
|
||||
if (chunk != qp->bump && chunk_usage(qp, chunk) == 0 &&
|
||||
qp->usage[chunk].exists && qp->usage[chunk].immutable &&
|
||||
!qp->usage[chunk].discounted)
|
||||
{
|
||||
count++;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* Register `qp_qsbr_reclaimer()` with QSBR at startup.
|
||||
*/
|
||||
static void
|
||||
qp_qsbr_register(void) ISC_CONSTRUCTOR;
|
||||
static void
|
||||
qp_qsbr_register(void) {
|
||||
isc_qsbr_register(qp_qsbr_reclaimer);
|
||||
if (count == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
qp_rcuctx_t *rcuctx =
|
||||
isc_mem_get(qp->mctx, STRUCT_FLEX_SIZE(rcuctx, chunk, count));
|
||||
*rcuctx = (qp_rcuctx_t){
|
||||
.magic = QPRCU_MAGIC,
|
||||
.multi = multi,
|
||||
.count = count,
|
||||
};
|
||||
isc_mem_attach(qp->mctx, &rcuctx->mctx);
|
||||
|
||||
unsigned int i = 0;
|
||||
for (qp_chunk_t chunk = 0; chunk < qp->chunk_max; chunk++) {
|
||||
if (chunk != qp->bump && chunk_usage(qp, chunk) == 0 &&
|
||||
qp->usage[chunk].exists && qp->usage[chunk].immutable &&
|
||||
!qp->usage[chunk].discounted)
|
||||
{
|
||||
rcuctx->chunk[i++] = chunk;
|
||||
chunk_discount(qp, chunk);
|
||||
}
|
||||
}
|
||||
|
||||
isc_urcu_cleanup(rcuctx, rcu_head, reclaim_chunks_cb);
|
||||
|
||||
LOG_STATS("qp will reclaim %u chunks", count);
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
@ -1121,6 +1109,8 @@ dns_qpmulti_update(dns_qpmulti_t *multi, dns_qp_t **qptp) {
|
|||
memmove(rollback, qp, sizeof(*rollback));
|
||||
/* can be uninitialized on the first transaction */
|
||||
if (rollback->base != NULL) {
|
||||
INSIST(QPBASE_VALID(rollback->base));
|
||||
INSIST(qp->usage != NULL && qp->chunk_max > 0);
|
||||
/* paired with either _commit() or _rollback() */
|
||||
isc_refcount_increment(&rollback->base->refcount);
|
||||
size_t usage_bytes = sizeof(qp->usage[0]) * qp->chunk_max;
|
||||
|
|
@ -1187,12 +1177,8 @@ dns_qpmulti_commit(dns_qpmulti_t *multi, dns_qp_t **qptp) {
|
|||
recycle(qp);
|
||||
}
|
||||
|
||||
/* the reclamation phase must be sampled after the commit */
|
||||
isc_qsbr_phase_t phase = isc_qsbr_phase(multi->loopmgr);
|
||||
if (defer_chunk_reclamation(qp, phase)) {
|
||||
ISC_ASTACK_ADD(qsbr_work, multi, cleanup);
|
||||
isc_qsbr_activate(multi->loopmgr, phase);
|
||||
}
|
||||
/* schedule the rest for later */
|
||||
reclaim_chunks(multi);
|
||||
|
||||
*qptp = NULL;
|
||||
UNLOCK(&multi->mutex);
|
||||
|
|
@ -1282,31 +1268,11 @@ dns_qpmulti_query(dns_qpmulti_t *multi, dns_qpread_t *qp) {
|
|||
REQUIRE(QPMULTI_VALID(multi));
|
||||
REQUIRE(qp != NULL);
|
||||
|
||||
/* we MUST be in an isc_loop thread */
|
||||
qp->tid = isc_tid();
|
||||
REQUIRE(qp->tid != ISC_TID_UNKNOWN);
|
||||
|
||||
dns_qpmulti_t *whence = reader_open(multi, qp);
|
||||
INSIST(whence == multi);
|
||||
}
|
||||
|
||||
/*
|
||||
* a locked read takes the mutex
|
||||
*/
|
||||
|
||||
void
|
||||
dns_qpmulti_lockedread(dns_qpmulti_t *multi, dns_qpread_t *qp) {
|
||||
REQUIRE(QPMULTI_VALID(multi));
|
||||
REQUIRE(qp != NULL);
|
||||
|
||||
/* we MUST NOT be in an isc_loop thread */
|
||||
qp->tid = isc_tid();
|
||||
REQUIRE(qp->tid == ISC_TID_UNKNOWN);
|
||||
|
||||
LOCK(&multi->mutex);
|
||||
|
||||
dns_qpmulti_t *whence = reader_open(multi, qp);
|
||||
INSIST(whence == multi);
|
||||
rcu_read_lock();
|
||||
}
|
||||
|
||||
void
|
||||
|
|
@ -1314,10 +1280,8 @@ dns_qpread_destroy(dns_qpmulti_t *multi, dns_qpread_t *qp) {
|
|||
REQUIRE(QPMULTI_VALID(multi));
|
||||
REQUIRE(QP_VALID(qp));
|
||||
REQUIRE(qp->tid == isc_tid());
|
||||
if (qp->tid == ISC_TID_UNKNOWN) {
|
||||
UNLOCK(&multi->mutex);
|
||||
}
|
||||
*qp = (dns_qpread_t){};
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
@ -1406,8 +1370,7 @@ dns_qp_create(isc_mem_t *mctx, const dns_qpmethods_t *methods, void *uctx,
|
|||
}
|
||||
|
||||
void
|
||||
dns_qpmulti_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr,
|
||||
const dns_qpmethods_t *methods, void *uctx,
|
||||
dns_qpmulti_create(isc_mem_t *mctx, const dns_qpmethods_t *methods, void *uctx,
|
||||
dns_qpmulti_t **qpmp) {
|
||||
REQUIRE(qpmp != NULL && *qpmp == NULL);
|
||||
|
||||
|
|
@ -1415,8 +1378,6 @@ dns_qpmulti_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr,
|
|||
*multi = (dns_qpmulti_t){
|
||||
.magic = QPMULTI_MAGIC,
|
||||
.reader_ref = INVALID_REF,
|
||||
.loopmgr = loopmgr,
|
||||
.cleanup = ISC_SLINK_INITIALIZER,
|
||||
};
|
||||
isc_mutex_init(&multi->mutex);
|
||||
ISC_LIST_INIT(multi->snapshots);
|
||||
|
|
@ -1468,29 +1429,48 @@ dns_qp_destroy(dns_qp_t **qptp) {
|
|||
isc_mem_putanddetach(&qp->mctx, qp, sizeof(*qp));
|
||||
}
|
||||
|
||||
static void
|
||||
qpmulti_destroy_cb(struct rcu_head *arg) {
|
||||
qp_rcuctx_t *rcuctx = isc_urcu_container(arg, qp_rcuctx_t, rcu_head);
|
||||
REQUIRE(QPRCU_VALID(rcuctx));
|
||||
dns_qpmulti_t *multi = rcuctx->multi;
|
||||
REQUIRE(QPMULTI_VALID(multi));
|
||||
dns_qp_t *qp = &multi->writer;
|
||||
REQUIRE(QP_VALID(qp));
|
||||
|
||||
REQUIRE(rcuctx->count == 0);
|
||||
|
||||
destroy_guts(qp);
|
||||
isc_mutex_destroy(&multi->mutex);
|
||||
isc_mem_putanddetach(&rcuctx->mctx, rcuctx,
|
||||
STRUCT_FLEX_SIZE(rcuctx, chunk, rcuctx->count));
|
||||
isc_mem_putanddetach(&qp->mctx, multi, sizeof(*multi));
|
||||
}
|
||||
|
||||
void
|
||||
dns_qpmulti_destroy(dns_qpmulti_t **qpmp) {
|
||||
dns_qp_t *qp = NULL;
|
||||
dns_qpmulti_t *multi = NULL;
|
||||
qp_rcuctx_t *rcuctx = NULL;
|
||||
|
||||
REQUIRE(qpmp != NULL);
|
||||
REQUIRE(QPMULTI_VALID(*qpmp));
|
||||
|
||||
dns_qpmulti_t *multi = *qpmp;
|
||||
dns_qp_t *qp = &multi->writer;
|
||||
multi = *qpmp;
|
||||
qp = &multi->writer;
|
||||
*qpmp = NULL;
|
||||
|
||||
REQUIRE(QP_VALID(qp));
|
||||
REQUIRE(multi->rollback == NULL);
|
||||
REQUIRE(ISC_LIST_EMPTY(multi->snapshots));
|
||||
|
||||
LOCK(&multi->mutex);
|
||||
if (ISC_SLINK_LINKED(multi, cleanup)) {
|
||||
qp->destroy = true;
|
||||
UNLOCK(&multi->mutex);
|
||||
} else {
|
||||
destroy_guts(qp);
|
||||
UNLOCK(&multi->mutex);
|
||||
isc_mutex_destroy(&multi->mutex);
|
||||
isc_mem_putanddetach(&qp->mctx, multi, sizeof(*multi));
|
||||
}
|
||||
rcuctx = isc_mem_get(qp->mctx, STRUCT_FLEX_SIZE(rcuctx, chunk, 0));
|
||||
*rcuctx = (qp_rcuctx_t){
|
||||
.magic = QPRCU_MAGIC,
|
||||
.multi = multi,
|
||||
};
|
||||
isc_mem_attach(qp->mctx, &rcuctx->mctx);
|
||||
isc_urcu_cleanup(rcuctx, rcu_head, qpmulti_destroy_cb);
|
||||
}
|
||||
|
||||
/***********************************************************************
|
||||
|
|
|
|||
|
|
@ -301,11 +301,9 @@ ref_cell(qp_ref_t ref) {
|
|||
* transactions, but it must become immutable when an update
|
||||
* transaction is opened.)
|
||||
*
|
||||
* When a chunk becomes empty (wrt the latest version of the trie), we
|
||||
* note the QSBR phase after which no old versions of the trie will
|
||||
* need the chunk and it will be safe to free(). There are a few flags
|
||||
* used to mark which chunks are still needed by snapshots after the
|
||||
* chunks have passed their normal reclamation phase.
|
||||
* There are a few flags used to mark which chunks are still needed by
|
||||
* snapshots after the chunks have passed their normal reclamation
|
||||
* phase.
|
||||
*/
|
||||
typedef struct qp_usage {
|
||||
/*% the allocation point, increases monotonically */
|
||||
|
|
@ -316,14 +314,14 @@ typedef struct qp_usage {
|
|||
bool exists : 1;
|
||||
/*% is this chunk shared? [MT] */
|
||||
bool immutable : 1;
|
||||
/*% already subtracted from multi->*_count [MT] */
|
||||
bool discounted : 1;
|
||||
/*% is a snapshot using this chunk? [MT] */
|
||||
bool snapshot : 1;
|
||||
/*% tried to free it but a snapshot needs it [MT] */
|
||||
bool snapfree : 1;
|
||||
/*% for mark/sweep snapshot flag updates [MT] */
|
||||
bool snapmark : 1;
|
||||
/*% in which phase did this chunk become unused? [MT] */
|
||||
isc_qsbr_phase_t phase : ISC_QSBR_PHASE_BITS;
|
||||
} qp_usage_t;
|
||||
|
||||
/*
|
||||
|
|
@ -335,7 +333,7 @@ typedef struct qp_usage {
|
|||
* A `dns_qpbase_t` counts references from `dns_qp_t` objects and
|
||||
* from packed readers, but not from `dns_qpread_t` nor from
|
||||
* `dns_qpsnap_t` objects. Refcount adjustments for `dns_qpread_t`
|
||||
* would wreck multicore scalability; instead we rely on QSBR.
|
||||
* would wreck multicore scalability; instead we rely on RCU.
|
||||
*
|
||||
* The `usage` array determines when a chunk is no longer needed: old
|
||||
* chunk pointers in old `base` arrays are ignored. (They can become
|
||||
|
|
@ -349,10 +347,25 @@ typedef struct qp_usage {
|
|||
* busy slots that are in use by readers.
|
||||
*/
|
||||
struct dns_qpbase {
|
||||
unsigned int magic;
|
||||
isc_refcount_t refcount;
|
||||
qp_node_t *ptr[];
|
||||
};
|
||||
|
||||
/*
|
||||
* Chunks that may be in use by readers are reclaimed asynchronously.
|
||||
* When a transaction commits, immutable chunks that are now empty are
|
||||
* listed in a `qp_rcuctx_t` structure and passed to `call_rcu()`.
|
||||
*/
|
||||
typedef struct qp_rcuctx {
|
||||
unsigned int magic;
|
||||
struct rcu_head rcu_head;
|
||||
isc_mem_t *mctx;
|
||||
dns_qpmulti_t *multi;
|
||||
qp_chunk_t count;
|
||||
qp_chunk_t chunk[];
|
||||
} qp_rcuctx_t;
|
||||
|
||||
/*
|
||||
* Returns true when the base array can be free()d.
|
||||
*/
|
||||
|
|
@ -382,10 +395,14 @@ ref_ptr(dns_qpreadable_t qpr, qp_ref_t ref) {
|
|||
#define QPITER_MAGIC ISC_MAGIC('q', 'p', 'i', 't')
|
||||
#define QPMULTI_MAGIC ISC_MAGIC('q', 'p', 'm', 'v')
|
||||
#define QPREADER_MAGIC ISC_MAGIC('q', 'p', 'r', 'x')
|
||||
#define QPBASE_MAGIC ISC_MAGIC('q', 'p', 'b', 'p')
|
||||
#define QPRCU_MAGIC ISC_MAGIC('q', 'p', 'c', 'b')
|
||||
|
||||
#define QP_VALID(p) ISC_MAGIC_VALID(p, QP_MAGIC)
|
||||
#define QPITER_VALID(p) ISC_MAGIC_VALID(p, QPITER_MAGIC)
|
||||
#define QPMULTI_VALID(p) ISC_MAGIC_VALID(p, QPMULTI_MAGIC)
|
||||
#define QP_VALID(qp) ISC_MAGIC_VALID(qp, QP_MAGIC)
|
||||
#define QPITER_VALID(qp) ISC_MAGIC_VALID(qp, QPITER_MAGIC)
|
||||
#define QPMULTI_VALID(qp) ISC_MAGIC_VALID(qp, QPMULTI_MAGIC)
|
||||
#define QPBASE_VALID(qp) ISC_MAGIC_VALID(qp, QPBASE_MAGIC)
|
||||
#define QPRCU_VALID(qp) ISC_MAGIC_VALID(qp, QPRCU_MAGIC)
|
||||
|
||||
/*
|
||||
* Polymorphic initialization of the `dns_qpreader_t` prefix.
|
||||
|
|
@ -504,8 +521,6 @@ struct dns_qp {
|
|||
enum { QP_NONE, QP_WRITE, QP_UPDATE } transaction_mode : 2;
|
||||
/*% compact the entire trie [MT] */
|
||||
bool compact_all : 1;
|
||||
/*% destroy the trie asynchronously [MT] */
|
||||
bool destroy : 1;
|
||||
/*% optionally when compiled with fuzzing support [MT] */
|
||||
bool write_protect : 1;
|
||||
};
|
||||
|
|
@ -517,9 +532,6 @@ struct dns_qp {
|
|||
* of the trie. See the "packed reader nodes" section below for a
|
||||
* description of what it points to.
|
||||
*
|
||||
* We need access to the loopmgr to hook into QSBR safe memory reclamation.
|
||||
* It is constant after initialization.
|
||||
*
|
||||
* The main object under the protection of the mutex is the `writer`
|
||||
* containing all the allocator state. There can be a backup copy when
|
||||
* we want to be able to rollback an update transaction.
|
||||
|
|
@ -537,8 +549,6 @@ struct dns_qp {
|
|||
*/
|
||||
struct dns_qpmulti {
|
||||
uint32_t magic;
|
||||
/*% safe memory reclamation context (const) */
|
||||
isc_loopmgr_t *loopmgr;
|
||||
/*% pointer to current packed reader */
|
||||
atomic_ptr(qp_node_t) reader;
|
||||
/*% the mutex protects the rest of this structure */
|
||||
|
|
@ -551,8 +561,6 @@ struct dns_qpmulti {
|
|||
dns_qp_t *rollback;
|
||||
/*% all snapshots of this trie */
|
||||
ISC_LIST(dns_qpsnap_t) snapshots;
|
||||
/*% safe memory reclamation work list */
|
||||
ISC_SLINK(dns_qpmulti_t) cleanup;
|
||||
};
|
||||
|
||||
/***********************************************************************
|
||||
|
|
@ -878,13 +886,15 @@ static inline dns_qpmulti_t *
|
|||
unpack_reader(dns_qpreader_t *qp, qp_node_t *reader) {
|
||||
INSIST(reader_valid(reader));
|
||||
dns_qpmulti_t *multi = node_pointer(&reader[0]);
|
||||
dns_qpbase_t *base = node_pointer(&reader[1]);
|
||||
INSIST(QPMULTI_VALID(multi));
|
||||
INSIST(QPBASE_VALID(base));
|
||||
*qp = (dns_qpreader_t){
|
||||
.magic = QP_MAGIC,
|
||||
.uctx = multi->writer.uctx,
|
||||
.methods = multi->writer.methods,
|
||||
.root_ref = node32(&reader[1]),
|
||||
.base = node_pointer(&reader[1]),
|
||||
.base = base,
|
||||
};
|
||||
return (multi);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -9583,8 +9583,8 @@ new_gluelist(isc_mem_t *mctx, dns_name_t *name) {
|
|||
|
||||
static void
|
||||
free_gluelist_rcu(struct rcu_head *rcu_head) {
|
||||
rbtdb_glue_t *glue = caa_container_of(rcu_head, rbtdb_glue_t, rcu_head);
|
||||
__tsan_acquire(glue);
|
||||
rbtdb_glue_t *glue = isc_urcu_container(rcu_head, rbtdb_glue_t,
|
||||
rcu_head);
|
||||
|
||||
free_gluelist(glue);
|
||||
}
|
||||
|
|
@ -9600,8 +9600,7 @@ free_gluetable(rbtdb_version_t *rbtversion) {
|
|||
caa_container_of(node, rdatasetheader_t, wfs_node);
|
||||
rbtdb_glue_t *glue = rcu_xchg_pointer(&header->glue_list, NULL);
|
||||
|
||||
__tsan_release(glue);
|
||||
call_rcu(&glue->rcu_head, free_gluelist_rcu);
|
||||
isc_urcu_cleanup(glue, rcu_head, free_gluelist_rcu);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
}
|
||||
|
|
|
|||
|
|
@ -79,8 +79,7 @@
|
|||
#define DEFAULT_EDNS_BUFSIZE 1232
|
||||
|
||||
isc_result_t
|
||||
dns_view_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr,
|
||||
dns_rdataclass_t rdclass, const char *name,
|
||||
dns_view_create(isc_mem_t *mctx, dns_rdataclass_t rdclass, const char *name,
|
||||
dns_view_t **viewp) {
|
||||
dns_view_t *view = NULL;
|
||||
isc_result_t result;
|
||||
|
|
@ -135,7 +134,7 @@ dns_view_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr,
|
|||
isc_rwlock_init(&view->sfd_lock);
|
||||
|
||||
view->zonetable = NULL;
|
||||
dns_zt_create(mctx, loopmgr, view, &view->zonetable);
|
||||
dns_zt_create(mctx, view, &view->zonetable);
|
||||
|
||||
result = dns_fwdtable_create(mctx, &view->fwdtable);
|
||||
if (result != ISC_R_SUCCESS) {
|
||||
|
|
|
|||
|
|
@ -10619,8 +10619,6 @@ failure:
|
|||
cleanup:
|
||||
dns_db_detach(&kfetch->db);
|
||||
|
||||
/* The zone must be managed */
|
||||
INSIST(kfetch->zone->loop != NULL);
|
||||
isc_refcount_decrement(&zone->irefs);
|
||||
|
||||
if (dns_rdataset_isassociated(keydataset)) {
|
||||
|
|
@ -14658,12 +14656,15 @@ zone_timer_set(dns_zone_t *zone, isc_time_t *next, isc_time_t *now) {
|
|||
isc_time_subtract(next, now, &interval);
|
||||
}
|
||||
|
||||
if (zone->timer == NULL) {
|
||||
if (zone->loop == NULL) {
|
||||
zone_debuglog(zone, __func__, 10, "zone is not managed");
|
||||
} else if (zone->timer == NULL) {
|
||||
isc_refcount_increment0(&zone->irefs);
|
||||
isc_timer_create(zone->loop, zone_timer, zone, &zone->timer);
|
||||
}
|
||||
|
||||
isc_timer_start(zone->timer, isc_timertype_once, &interval);
|
||||
if (zone->timer != NULL) {
|
||||
isc_timer_start(zone->timer, isc_timertype_once, &interval);
|
||||
}
|
||||
}
|
||||
|
||||
static void
|
||||
|
|
@ -18169,7 +18170,8 @@ dns_zonemgr_managezone(dns_zonemgr_t *zmgr, dns_zone_t *zone) {
|
|||
REQUIRE(zone->timer == NULL);
|
||||
REQUIRE(zone->zmgr == NULL);
|
||||
|
||||
zone->loop = isc_loop_get(zmgr->loopmgr, zone->tid);
|
||||
isc_loop_t *loop = isc_loop_get(zmgr->loopmgr, zone->tid);
|
||||
isc_loop_attach(loop, &zone->loop);
|
||||
|
||||
zonemgr_keymgmt_add(zmgr, zone, &zone->kfio);
|
||||
INSIST(zone->kfio != NULL);
|
||||
|
|
@ -18200,6 +18202,13 @@ dns_zonemgr_releasezone(dns_zonemgr_t *zmgr, dns_zone_t *zone) {
|
|||
ENSURE(zone->kfio == NULL);
|
||||
}
|
||||
|
||||
if (zone->timer != NULL) {
|
||||
isc_refcount_decrement(&zone->irefs);
|
||||
isc_timer_destroy(&zone->timer);
|
||||
}
|
||||
|
||||
isc_loop_detach(&zone->loop);
|
||||
|
||||
/* Detach below, outside of the write lock. */
|
||||
zone->zmgr = NULL;
|
||||
|
||||
|
|
@ -20658,8 +20667,6 @@ done:
|
|||
}
|
||||
|
||||
cleanup:
|
||||
/* The zone must be managed */
|
||||
INSIST(nsfetch->zone->loop != NULL);
|
||||
isc_refcount_decrement(&zone->irefs);
|
||||
|
||||
if (dns_rdataset_isassociated(nsrrset)) {
|
||||
|
|
@ -21922,7 +21929,7 @@ dns_zone_link(dns_zone_t *zone, dns_zone_t *raw) {
|
|||
LOCK_ZONE(zone);
|
||||
LOCK_ZONE(raw);
|
||||
|
||||
raw->loop = zone->loop;
|
||||
isc_loop_attach(zone->loop, &raw->loop);
|
||||
|
||||
/* dns_zone_attach(raw, &zone->raw); */
|
||||
isc_refcount_increment(&raw->references);
|
||||
|
|
|
|||
12
lib/dns/zt.c
12
lib/dns/zt.c
|
|
@ -94,15 +94,14 @@ static dns_qpmethods_t ztqpmethods = {
|
|||
};
|
||||
|
||||
void
|
||||
dns_zt_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr, dns_view_t *view,
|
||||
dns_zt_t **ztp) {
|
||||
dns_zt_create(isc_mem_t *mctx, dns_view_t *view, dns_zt_t **ztp) {
|
||||
dns_qpmulti_t *multi = NULL;
|
||||
dns_zt_t *zt = NULL;
|
||||
|
||||
REQUIRE(ztp != NULL && *ztp == NULL);
|
||||
REQUIRE(view != NULL);
|
||||
|
||||
dns_qpmulti_create(mctx, loopmgr, &ztqpmethods, view, &multi);
|
||||
dns_qpmulti_create(mctx, &ztqpmethods, view, &multi);
|
||||
|
||||
zt = isc_mem_get(mctx, sizeof(*zt));
|
||||
*zt = (dns_zt_t){
|
||||
|
|
@ -177,11 +176,8 @@ dns_zt_find(dns_zt_t *zt, const dns_name_t *name, dns_ztfind_t options,
|
|||
REQUIRE(VALID_ZT(zt));
|
||||
REQUIRE(exactopts != exactmask);
|
||||
|
||||
if (isc_tid() == ISC_TID_UNKNOWN) {
|
||||
dns_qpmulti_lockedread(zt->multi, &qpr);
|
||||
} else {
|
||||
dns_qpmulti_query(zt->multi, &qpr);
|
||||
}
|
||||
dns_qpmulti_query(zt->multi, &qpr);
|
||||
|
||||
if (exactopts == DNS_ZTFIND_EXACT) {
|
||||
result = dns_qp_getname(&qpr, name, &pval, &ival);
|
||||
} else if (exactopts == DNS_ZTFIND_NOEXACT) {
|
||||
|
|
|
|||
|
|
@ -66,7 +66,6 @@ libisc_la_HEADERS = \
|
|||
include/isc/pause.h \
|
||||
include/isc/portset.h \
|
||||
include/isc/quota.h \
|
||||
include/isc/qsbr.h \
|
||||
include/isc/radix.h \
|
||||
include/isc/random.h \
|
||||
include/isc/ratelimiter.h \
|
||||
|
|
@ -81,7 +80,6 @@ libisc_la_HEADERS = \
|
|||
include/isc/siphash.h \
|
||||
include/isc/sockaddr.h \
|
||||
include/isc/spinlock.h \
|
||||
include/isc/stack.h \
|
||||
include/isc/stats.h \
|
||||
include/isc/stdio.h \
|
||||
include/isc/stdtime.h \
|
||||
|
|
@ -174,7 +172,6 @@ libisc_la_SOURCES = \
|
|||
picohttpparser.h \
|
||||
portset.c \
|
||||
quota.c \
|
||||
qsbr.c \
|
||||
radix.c \
|
||||
random.c \
|
||||
ratelimiter.c \
|
||||
|
|
|
|||
|
|
@ -107,8 +107,7 @@ isc__async_cb(uv_async_t *handle) {
|
|||
*/
|
||||
struct cds_wfcq_node *node, *next;
|
||||
__cds_wfcq_for_each_blocking_safe(&jobs.head, &jobs.tail, node, next) {
|
||||
isc_job_t *job = caa_container_of(node, isc_job_t, wfcq_node);
|
||||
__tsan_acquire(job);
|
||||
isc_job_t *job = isc_urcu_container(node, isc_job_t, wfcq_node);
|
||||
|
||||
job->cb(job->cbarg);
|
||||
|
||||
|
|
|
|||
|
|
@ -27,13 +27,6 @@
|
|||
#include <isc/mem.h>
|
||||
#include <isc/tid.h>
|
||||
|
||||
/*
|
||||
* XXXFANF to be added to <isc/util.h> by a commmit in a qp-trie
|
||||
* feature branch
|
||||
*/
|
||||
#define STRUCT_FLEX_SIZE(pointer, member, count) \
|
||||
(sizeof(*(pointer)) + sizeof(*(pointer)->member) * (count))
|
||||
|
||||
/*
|
||||
* XXXFANF this should probably be in <isc/util.h> too
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -287,7 +287,7 @@ isc__ht_add(isc_ht_t *ht, const unsigned char *key, const uint32_t keysize,
|
|||
|
||||
hash = hash_32(hashval, ht->hashbits[idx]);
|
||||
|
||||
node = isc_mem_get(ht->mctx, sizeof(*node) + keysize);
|
||||
node = isc_mem_get(ht->mctx, STRUCT_FLEX_SIZE(node, key, keysize));
|
||||
*node = (isc_ht_node_t){
|
||||
.keysize = keysize,
|
||||
.hashval = hashval,
|
||||
|
|
@ -396,7 +396,7 @@ isc__ht_delete(isc_ht_t *ht, const unsigned char *key, const uint32_t keysize,
|
|||
prev->next = node->next;
|
||||
}
|
||||
isc_mem_put(ht->mctx, node,
|
||||
sizeof(*node) + node->keysize);
|
||||
STRUCT_FLEX_SIZE(node, key, node->keysize));
|
||||
ht->count--;
|
||||
|
||||
return (ISC_R_SUCCESS);
|
||||
|
|
|
|||
|
|
@ -69,17 +69,6 @@ isc_loopmgr_run(isc_loopmgr_t *loopmgr);
|
|||
*\li 'loopmgr' is a valid loop manager.
|
||||
*/
|
||||
|
||||
void
|
||||
isc_loopmgr_wakeup(isc_loopmgr_t *loopmgr);
|
||||
/*%<
|
||||
* Send no-op events to wake up all running loops in 'loopmgr' except
|
||||
* the current one. (See <isc/qsbr.h>.)
|
||||
*
|
||||
* Requires:
|
||||
*\li 'loopmgr' is a valid loop manager.
|
||||
*\li We are in a running loop.
|
||||
*/
|
||||
|
||||
void
|
||||
isc_loopmgr_pause(isc_loopmgr_t *loopmgr);
|
||||
/*%<
|
||||
|
|
|
|||
|
|
@ -1,282 +0,0 @@
|
|||
/*
|
||||
* Copyright (C) Internet Systems Consortium, Inc. ("ISC")
|
||||
*
|
||||
* SPDX-License-Identifier: MPL-2.0
|
||||
*
|
||||
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, you can obtain one at https://mozilla.org/MPL/2.0/.
|
||||
*
|
||||
* See the COPYRIGHT file distributed with this work for additional
|
||||
* information regarding copyright ownership.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <isc/atomic.h>
|
||||
#include <isc/stack.h>
|
||||
#include <isc/types.h>
|
||||
#include <isc/uv.h>
|
||||
|
||||
/*
|
||||
* Quiescent state based reclamation
|
||||
* =================================
|
||||
*
|
||||
* QSBR is a safe memory reclamation algorithm for lock-free data
|
||||
* structures such as a qp-trie.
|
||||
*
|
||||
* When an object is unlinked from a lock-free data structure, it
|
||||
* cannot be free()d immediately, because there can still be readers
|
||||
* accessing the object via an old version of the data structure. SMR
|
||||
* algorithms determine when it is safe to reclaim memory after it has
|
||||
* been unlinked.
|
||||
*
|
||||
* With QSBR, reading a data structure is wait-free. All that is
|
||||
* required is an atomic load to get the data structure's current
|
||||
* root; there is no need to explicitly mark any read-side critical
|
||||
* section.
|
||||
*
|
||||
* QSBR is used by RCU (read-copy-update) in the Linux kernel. BIND's
|
||||
* implementation also uses some ideas from EBR (epoch-based reclamation).
|
||||
* The following summary is based on the overview in the paper
|
||||
* "performance of memory reclamation for lockless synchronization",
|
||||
* (http://csng.cs.toronto.edu/publication_files/0000/0159/jpdc07.pdf).
|
||||
*
|
||||
* Aside: This QSBR implementation is somewhat different from the one
|
||||
* in liburcu, described in the paper "user-level implementations of
|
||||
* read-copy update", (https://www.efficios.com/publications/), which
|
||||
* contains the amusing comment:
|
||||
*
|
||||
* BIND, a major domain-name server used for Internet domain-name
|
||||
* resolution, is facing scalability issues. Since domain names
|
||||
* are read often but rarely updated, using user-level RCU might
|
||||
* be beneficial.
|
||||
*
|
||||
* A "quiescent state" is a point when a thread is not accessing any
|
||||
* lock-free data structure. After passing through a quiescent state,
|
||||
* a thread can no longer access versions of a data structure that
|
||||
* were replaced before that point. In BIND, we use a point in the
|
||||
* event loop (a uv_prepare_t callback) to identify a quiescent state.
|
||||
*
|
||||
* Aside: a prepare handle runs its callbacks before the loop sleeps,
|
||||
* which reduces reclaim latency (unlike a check handle) and it does
|
||||
* not affect timeout calculations (unlike an idle handle).
|
||||
*
|
||||
* A "grace period" is any time interval such that after the end of
|
||||
* the grace period, all objects removed before the start of the grace
|
||||
* period can safely be reclaimed. Different SMR algorithms detect
|
||||
* grace periods with varying degrees of tightness or looseness.
|
||||
*
|
||||
* QSBR uses quiescent states to detect grace periods: a grace period
|
||||
* is a time interval in which every thread passes through a quiescent
|
||||
* state. (This is a safe over-estimate.) A "fuzzy barrier" is used to
|
||||
* find out when all threads have passed through a quiescent state.
|
||||
*
|
||||
* NOTE: In BIND this means that code which is not running in an event
|
||||
* loop thread (such as an isc_work / uv_work_t callback) must use
|
||||
* locking (not lock-free) data structure accessors.
|
||||
*
|
||||
* Because a quiescent state happens once per event loop, a grace
|
||||
* period takes roughly the same amount of time as the slowest event
|
||||
* loop in each cycle.
|
||||
*
|
||||
* Similar to the paper linked above, this QSBR implementation uses a
|
||||
* variant of the EBR fuzzy barrier. Like EBR, each grace period is
|
||||
* numbered with a "phase", which cycles round 1,2,3,1,2,3,... (Phases
|
||||
* are called epochs in EBR, but I think "phase" is a better metaphor.)
|
||||
* When entering the fuzzy barrier, each thread updates its local phase
|
||||
* to match the global phase, keeping a global count of the number of
|
||||
* threads still to pass. When this count reaches zero, it is the end of
|
||||
* the grace period; the global phase is updated and reclamation is
|
||||
* triggered.
|
||||
*
|
||||
* Note that threads are usually slightly out-of-phase wrt the global
|
||||
* grace period. At any particular point in time, there will be some
|
||||
* threads in the current global phase, and some in the previous
|
||||
* global phase. EBR has three phases because that is the minimum
|
||||
* number that leaves one phase unoccupied by readers. Any objects that
|
||||
* were detached from the data structure in the third phase can be
|
||||
* reclaimed after the start of the current phase, because a grace
|
||||
* period (the previous phase) has elapsed since the objects were
|
||||
* detached.
|
||||
*
|
||||
* A phase number can be used by a lock-free data structure (such as a
|
||||
* qp-trie) to record when an object was detached. QSBR calls the data
|
||||
* structure's reclaimer function, passing a phase number indicating
|
||||
* that objects detached in that phase can now be reclaimed
|
||||
*
|
||||
* In general, there will be several (maybe many) write operations
|
||||
* during a grace period. The lock-free data structures that use QSBR
|
||||
* will collect their reclamation work from all these writes into a
|
||||
* batch per phase, i.e. per grace period.
|
||||
*
|
||||
* There is some example code in `doc/dev/qsbr.md`, with pointers to
|
||||
* less terse introductions to QSBR and other overview material.
|
||||
*/
|
||||
|
||||
#define ISC_QSBR_PHASE_BITS 2
|
||||
|
||||
typedef unsigned int isc_qsbr_phase_t;
|
||||
/*%<
|
||||
* A grace period phase number. It can be stored in a bitfield of size
|
||||
* ISC_QSBR_PHASE_BITS. You can use zero to indicate "no phase".
|
||||
* (Don't assume the maximum is three: We might want to increase the
|
||||
* number of phases so that there is more than one unoccupied phase.
|
||||
* This would allow concurrent reclamation of objects released in
|
||||
* multiple unoccupied phases.)
|
||||
*/
|
||||
|
||||
typedef void
|
||||
isc_qsbreclaimer_t(isc_qsbr_phase_t phase);
|
||||
/*%<
|
||||
* The type of memory reclaimer callback functions.
|
||||
*
|
||||
* The `phase` identifies which objects are to be reclaimed.
|
||||
*
|
||||
* An isc_qsbreclaimer_t can call isc_qsbr_activate() if it could not
|
||||
* reclaim everything and needs to be called again.
|
||||
*/
|
||||
|
||||
typedef struct isc_qsbr_registered {
|
||||
ISC_SLINK(struct isc_qsbr_registered) link;
|
||||
isc_qsbreclaimer_t *func;
|
||||
} isc_qsbr_registered_t;
|
||||
/*%<
|
||||
* Each reclaimer callback has a static `isc_qsbr_registered_t` object
|
||||
* so that QSBR can find it.
|
||||
*/
|
||||
|
||||
void
|
||||
isc__qsbr_register(isc_qsbr_registered_t *reg);
|
||||
/*%<
|
||||
* Requires:
|
||||
* \li reclaimer->link is not linked
|
||||
* \li reclaimer->func is not NULL
|
||||
*/
|
||||
|
||||
#define isc_qsbr_register(cb) \
|
||||
do { \
|
||||
static isc_qsbr_registered_t registration = { \
|
||||
.link = ISC_SLINK_INITIALIZER, \
|
||||
.func = cb, \
|
||||
}; \
|
||||
isc__qsbr_register(®istration); \
|
||||
} while (0)
|
||||
/*%<
|
||||
* Register a callback function with QSBR. This macro should be used
|
||||
* inside an `ISC_CONSTRUCTOR` function. There should be one callback
|
||||
* for eack lock-free data structure implementation, which is able to
|
||||
* reclaim all the unused memory across all instances of its data
|
||||
* structure.
|
||||
*/
|
||||
|
||||
isc_qsbr_phase_t
|
||||
isc_qsbr_phase(isc_loopmgr_t *loopmgr);
|
||||
/*%<
|
||||
* Get the current phase, to use for marking detached objects.
|
||||
*
|
||||
* To commit a write that requires cleanup, the ordering must be:
|
||||
*
|
||||
* - Use atomic_store_release() to commit the data structure's new
|
||||
* root pointer; release ordering ensures that the interior changes
|
||||
* are written before the root pointer.
|
||||
*
|
||||
* - Call isc_qsbr_phase() to get the phase to be used for marking
|
||||
* objects to reclaim. This must happen after the commit, to ensure
|
||||
* there is at least one grace period between commit and cleanup.
|
||||
*
|
||||
* - Pass the same phase to isc_qsbr_activate() so that the reclaimer
|
||||
* will be called after a grace period has passed.
|
||||
*/
|
||||
|
||||
void
|
||||
isc_qsbr_activate(isc_loopmgr_t *loopmgr, isc_qsbr_phase_t phase);
|
||||
/*%<
|
||||
* Tell QSBR that objects have been detached and will need reclaiming
|
||||
* after a grace period.
|
||||
*/
|
||||
|
||||
/***********************************************************************
|
||||
*
|
||||
* private parts
|
||||
*/
|
||||
|
||||
/*
|
||||
* Accessors and constructors for the `grace` variable.
|
||||
* It contains two bit fields:
|
||||
*
|
||||
* - the global phase in the lower ISC_QSBR_PHASE_BITS
|
||||
*
|
||||
* - a thread counter in the upper bits
|
||||
*/
|
||||
|
||||
#define ISC_QSBR_ONE_THREAD (1 << ISC_QSBR_PHASE_BITS)
|
||||
#define ISC_QSBR_PHASE_MAX (ISC_QSBR_ONE_THREAD - 1)
|
||||
|
||||
#define ISC_QSBR_GRACE_PHASE(grace) (grace & ISC_QSBR_PHASE_MAX)
|
||||
#define ISC_QSBR_GRACE_THREADS(grace) (grace >> ISC_QSBR_PHASE_BITS)
|
||||
#define ISC_QSBR_GRACE(threads, phase) \
|
||||
((threads << ISC_QSBR_PHASE_BITS) | phase)
|
||||
|
||||
typedef struct isc_qsbr {
|
||||
/*
|
||||
* The `grace` variable keeps track of the current grace period.
|
||||
* When the phase changes, the thread counter is set to the number of
|
||||
* threads that need to observe the new phase before the grace period
|
||||
* can end.
|
||||
*
|
||||
* The thread counter is an add-on to the usual EBR fuzzy barrier.
|
||||
* Counting threads through the barrier adds multi-thread update
|
||||
* contention, and in EBR the fuzzy barrier runs frequently enough
|
||||
* (on every access) that it's important to minimize its cost. With
|
||||
* QSBR, the fuzzy barrier runs less frequently (roughly, per loop,
|
||||
* instead of per-callback) so contention is less of a concern. The
|
||||
* thread counter helps to reduce reclaim latency, because unlike EBR
|
||||
* we don't probabilistically check, we know deterministically when
|
||||
* all threads have changed phase.
|
||||
*/
|
||||
atomic_uint_fast32_t grace;
|
||||
|
||||
/*
|
||||
* A flag for each phase indicating that there will be work to
|
||||
* do, so we don't invoke the reclaim machinery unnecessarily.
|
||||
* Set by `isc_qsbr_activate()` and cleared before the reclaimer
|
||||
* functions are invoked (so they can re-set their flag if
|
||||
* necessary).
|
||||
*/
|
||||
atomic_uint_fast32_t activated;
|
||||
|
||||
/*
|
||||
* The time of the last phase transition (isc_nanosecs_t). Used
|
||||
* to ensure that grace periods do not last forever. We use
|
||||
* `isc_time_monotonic()` because we need the same time in all
|
||||
* threads. (`uv_now()` is different in different threads.)
|
||||
*/
|
||||
atomic_uint_fast64_t transition_time;
|
||||
|
||||
} isc_qsbr_t;
|
||||
|
||||
/*
|
||||
* When we start there is no worker thread yet, so the thread
|
||||
* count is equal to the number of loops. The global phase starts
|
||||
* off at one (it must always be nonzero).
|
||||
*/
|
||||
#define ISC_QSBR_INITIALIZER(nloops) \
|
||||
(isc_qsbr_t) { \
|
||||
.grace = ISC_QSBR_GRACE(nloops, 1), \
|
||||
.transition_time = isc_time_monotonic(), \
|
||||
}
|
||||
|
||||
/*
|
||||
* For use by tests that need to explicitly drive QSBR phase transitions.
|
||||
*/
|
||||
void
|
||||
isc__qsbr_quiescent_state(isc_loop_t *loop);
|
||||
|
||||
/*
|
||||
* Used by the loopmgr
|
||||
*/
|
||||
void
|
||||
isc__qsbr_quiescent_cb(uv_prepare_t *handle);
|
||||
void
|
||||
isc__qsbr_destroy(isc_loopmgr_t *loopmgr);
|
||||
|
|
@ -1,189 +0,0 @@
|
|||
/*
|
||||
* Copyright (C) Internet Systems Consortium, Inc. ("ISC")
|
||||
*
|
||||
* SPDX-License-Identifier: MPL-2.0
|
||||
*
|
||||
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, you can obtain one at https://mozilla.org/MPL/2.0/.
|
||||
*
|
||||
* See the COPYRIGHT file distributed with this work for additional
|
||||
* information regarding copyright ownership.
|
||||
*/
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <isc/assertions.h>
|
||||
#include <isc/atomic.h>
|
||||
#include <isc/list.h>
|
||||
#include <isc/pause.h>
|
||||
|
||||
/*
|
||||
* Single links, aka stack links
|
||||
*/
|
||||
|
||||
#define ISC_SLINK(type) \
|
||||
struct { \
|
||||
type *next; \
|
||||
}
|
||||
#define ISC_SLINK_INITIALIZER_TYPE(type) \
|
||||
{ \
|
||||
.next = ISC_LINK_TOMBSTONE(type), \
|
||||
}
|
||||
#define ISC_SLINK_INIT_TYPE(elt, link, type) \
|
||||
{ \
|
||||
(elt)->link.next = ISC_LINK_TOMBSTONE(type); \
|
||||
}
|
||||
|
||||
#define ISC_SLINK_INITIALIZER ISC_SLINK_INITIALIZER_TYPE(void)
|
||||
#define ISC_SLINK_INIT(elt, link) ISC_SLINK_INIT_TYPE(elt, link, void)
|
||||
|
||||
#define ISC_SLINK_LINKED_TYPE(elt, link, type) \
|
||||
((type *)((elt)->link.next) != ISC_LINK_TOMBSTONE(type))
|
||||
#define ISC_SLINK_LINKED(elt, link) ISC_SLINK_LINKED_TYPE(elt, link, void)
|
||||
|
||||
#define ISC_SLINK_NEXT(elt, link) ((elt)->link.next)
|
||||
|
||||
/*
|
||||
* Simple singly-linked stack implementation
|
||||
*/
|
||||
|
||||
#define ISC_STACK(type) \
|
||||
struct { \
|
||||
type *top; \
|
||||
}
|
||||
|
||||
#define ISC_STACK_INITIALIZER \
|
||||
{ \
|
||||
.top = NULL, \
|
||||
}
|
||||
#define ISC_STACK_INIT(stack) \
|
||||
{ \
|
||||
(stack).top = NULL; \
|
||||
}
|
||||
|
||||
#define ISC_STACK_TOP(stack) ((stack).top)
|
||||
#define ISC_STACK_EMPTY(stack) ((stack).top == NULL)
|
||||
|
||||
#define ISC_STACK_PUSHUNSAFE(stack, elt, link) \
|
||||
{ \
|
||||
(elt)->link.next = (stack).top; \
|
||||
(stack).top = (elt); \
|
||||
}
|
||||
|
||||
#define ISC_STACK_PUSH(stack, elt, link) \
|
||||
{ \
|
||||
INSIST(!ISC_SLINK_LINKED(elt, link)); \
|
||||
ISC_STACK_PUSHUNSAFE(stack, elt, link); \
|
||||
}
|
||||
|
||||
/*
|
||||
* This is slightly round about because when `type` is `void`
|
||||
* we can't directly access `__top->link.next`.
|
||||
*/
|
||||
#define ISC_STACK_POP_TYPE(stack, link, type) \
|
||||
({ \
|
||||
type *__top = (stack).top; \
|
||||
if (__top != NULL) { \
|
||||
type *__next = ISC_SLINK_NEXT((stack).top, link); \
|
||||
ISC_SLINK_INIT_TYPE((stack).top, link, type); \
|
||||
(stack).top = __next; \
|
||||
} \
|
||||
__top; \
|
||||
})
|
||||
|
||||
#define ISC_STACK_POP(stack, link) ISC_STACK_POP_TYPE(stack, link, void)
|
||||
|
||||
/*
|
||||
* Helper to add element if not already on stack.
|
||||
*/
|
||||
#define ISC_STACK_ADD(stack, elt, link) \
|
||||
{ \
|
||||
if (!ISC_STACK_LINKED(elt, link)) { \
|
||||
ISC_STACK_PUSHUNSAFE(stack, elt, link); \
|
||||
} \
|
||||
}
|
||||
|
||||
/*
|
||||
* This is a simplified implementation of the Treiber stack algorithm with
|
||||
* back-off.
|
||||
*
|
||||
* The original paper which describes the algorithm can be found here:
|
||||
* https://dominoweb.draco.res.ibm.com/58319a2ed2b1078985257003004617ef.html
|
||||
*
|
||||
* We sidestep the ABA problem when removing the elements by adding additional
|
||||
* constraint: once the element has been removed from the stack, it cannot be
|
||||
* re-added.
|
||||
*
|
||||
* This is actually not a problem for planned usage of the ISC_ASTACK where we
|
||||
* only use ISC_ASTACK_PUSH() to add individual elements and ISC_ASTACK_DRAIN()
|
||||
* to empty the stack in a single atomic operation.
|
||||
*
|
||||
* To make things simple, there's no implementation for ISC_ASTACK_POP() because
|
||||
* that's complex to implement and requires double-word CAS (for ABA counter).
|
||||
*
|
||||
* The pointers are wrapped in structs so that their types are
|
||||
* distinct, and to match the ISC_LIST macros.
|
||||
*
|
||||
* See doc/dev/dev.md for examples.
|
||||
*/
|
||||
|
||||
#define ISC_ASTACK(type) \
|
||||
struct { \
|
||||
atomic_ptr(type) atop; \
|
||||
}
|
||||
#define ISC_ASTACK_INITIALIZER \
|
||||
{ \
|
||||
.atop = NULL, \
|
||||
}
|
||||
#define ISC_ASTACK_INIT(stack) \
|
||||
{ \
|
||||
(stack).atop = NULL; \
|
||||
}
|
||||
|
||||
/*
|
||||
* ATOMIC: for performance, this kind of retry loop should use a weak
|
||||
* CAS with relaxed ordering in the failure case; on success, release
|
||||
* ordering ensures that writing the element contents happens before
|
||||
* reading them, following the acquire in ISC_ASTACK_TOP() and _DRAIN().
|
||||
*/
|
||||
#define ISC_ASTACK_PUSHUNSAFE(stack, elt, link) \
|
||||
{ \
|
||||
(elt)->link.next = atomic_load_relaxed(&(stack).atop); \
|
||||
while (!atomic_compare_exchange_weak_explicit( \
|
||||
&(stack).atop, &ISC_SLINK_NEXT(elt, link), elt, \
|
||||
memory_order_release, memory_order_relaxed)) \
|
||||
{ \
|
||||
isc_pause(); \
|
||||
} \
|
||||
}
|
||||
|
||||
#define ISC_ASTACK_PUSH(stack, elt, link) \
|
||||
{ \
|
||||
INSIST(!ISC_SLINK_LINKED(elt, link)); \
|
||||
ISC_ASTACK_PUSHUNSAFE(stack, elt, link); \
|
||||
}
|
||||
|
||||
/*
|
||||
* Helper to add element if not already on stack.
|
||||
*/
|
||||
#define ISC_ASTACK_ADD(stack, elt, link) \
|
||||
{ \
|
||||
if (!ISC_SLINK_LINKED(elt, link)) { \
|
||||
ISC_ASTACK_PUSHUNSAFE(stack, elt, link); \
|
||||
} \
|
||||
}
|
||||
|
||||
/*
|
||||
* ATOMIC: acquire ordering pairs with ISC_ASTACK_PUSHUNSAFEe()
|
||||
*/
|
||||
#define ISC_ASTACK_TOP(stack) atomic_load_acquire(&(stack).atop)
|
||||
#define ISC_ASTACK_EMPTY(stack) (ISC_ASTACK_TOP(stack) == NULL)
|
||||
|
||||
/*
|
||||
* ATOMIC: acquire ordering pairs with ISC_ASTACK_PUSHUNSAFE()
|
||||
*/
|
||||
#define ISC_ASTACK_TO_STACK(stack) \
|
||||
{ \
|
||||
.top = atomic_exchange_acquire(&(stack).atop, NULL), \
|
||||
}
|
||||
|
|
@ -39,6 +39,35 @@
|
|||
|
||||
#pragma GCC diagnostic pop
|
||||
|
||||
/*
|
||||
* Help thread sanitizer to understand `call_rcu()`:
|
||||
*
|
||||
* The `rcu_head` argument to `call_rcu()` is expected to be embedded
|
||||
* in a structure defined by the caller, which is named `_rcuctx` in
|
||||
* these macros. The callback function uses `caa_container_of()` to
|
||||
* recover the `rcuctx` pointer from the `_rcu_head` pointer that is
|
||||
* passed to the callback.
|
||||
*
|
||||
* We explain the ordering dependency to tsan by releasing `_rcuctx`
|
||||
* pointer before `call_rcu()` and acquiring it in the callback
|
||||
* funtion. We pass the outer `_rcuctx` pointer to the `__tsan_`
|
||||
* barriers, because it should match a pointer that is known by tsan
|
||||
* to have been returned by `malloc()`.
|
||||
*/
|
||||
|
||||
#define isc_urcu_cleanup(ptr, member, func) \
|
||||
{ \
|
||||
__tsan_release(ptr); \
|
||||
call_rcu(&(ptr)->member, func); \
|
||||
}
|
||||
|
||||
#define isc_urcu_container(ptr, type, member) \
|
||||
({ \
|
||||
type *_ptr = caa_container_of(ptr, type, member); \
|
||||
__tsan_acquire(_ptr); \
|
||||
_ptr; \
|
||||
})
|
||||
|
||||
#if defined(RCU_QSBR)
|
||||
|
||||
/*
|
||||
|
|
|
|||
|
|
@ -96,6 +96,14 @@
|
|||
|
||||
#define ARRAY_SIZE(x) (sizeof(x) / sizeof(x[0]))
|
||||
|
||||
/*%
|
||||
* Get the allocation size for a struct with a flexible array member
|
||||
* containing `count` elements. The struct is identified by a pointer,
|
||||
* typically the one that points to (or will point to) the allocation.
|
||||
*/
|
||||
#define STRUCT_FLEX_SIZE(pointer, member, count) \
|
||||
(sizeof(*(pointer)) + sizeof(*(pointer)->member) * (count))
|
||||
|
||||
/*%
|
||||
* Use this in translation units that would otherwise be empty, to
|
||||
* suppress compiler warnings.
|
||||
|
|
|
|||
|
|
@ -26,14 +26,12 @@
|
|||
#include <isc/magic.h>
|
||||
#include <isc/mem.h>
|
||||
#include <isc/mutex.h>
|
||||
#include <isc/qsbr.h>
|
||||
#include <isc/refcount.h>
|
||||
#include <isc/result.h>
|
||||
#include <isc/signal.h>
|
||||
#include <isc/strerr.h>
|
||||
#include <isc/thread.h>
|
||||
#include <isc/tid.h>
|
||||
#include <isc/time.h>
|
||||
#include <isc/urcu.h>
|
||||
#include <isc/util.h>
|
||||
#include <isc/uv.h>
|
||||
|
|
@ -151,7 +149,6 @@ destroy_cb(uv_async_t *handle) {
|
|||
uv_close(&loop->run_trigger, isc__job_close);
|
||||
uv_close(&loop->destroy_trigger, NULL);
|
||||
uv_close(&loop->pause_trigger, NULL);
|
||||
uv_close(&loop->wakeup_trigger, NULL);
|
||||
uv_close(&loop->quiescent, NULL);
|
||||
|
||||
uv_walk(&loop->loop, loop_walk_cb, (char *)"destroy_cb");
|
||||
|
|
@ -162,8 +159,6 @@ shutdown_cb(uv_async_t *handle) {
|
|||
isc_loop_t *loop = uv_handle_get_data(handle);
|
||||
isc_loopmgr_t *loopmgr = loop->loopmgr;
|
||||
|
||||
loop->shuttingdown = true;
|
||||
|
||||
/* Make sure, we can't be called again */
|
||||
uv_close(&loop->shutdown_trigger, shutdown_trigger_close_cb);
|
||||
|
||||
|
|
@ -185,12 +180,6 @@ shutdown_cb(uv_async_t *handle) {
|
|||
UV_RUNTIME_CHECK(uv_async_send, r);
|
||||
}
|
||||
|
||||
static void
|
||||
wakeup_cb(uv_async_t *handle) {
|
||||
/* we only woke up to make the loop take a spin */
|
||||
UNUSED(handle);
|
||||
}
|
||||
|
||||
static void
|
||||
loop_init(isc_loop_t *loop, isc_loopmgr_t *loopmgr, uint32_t tid) {
|
||||
*loop = (isc_loop_t){
|
||||
|
|
@ -226,9 +215,6 @@ loop_init(isc_loop_t *loop, isc_loopmgr_t *loopmgr, uint32_t tid) {
|
|||
UV_RUNTIME_CHECK(uv_async_init, r);
|
||||
uv_handle_set_data(&loop->destroy_trigger, loop);
|
||||
|
||||
r = uv_async_init(&loop->loop, &loop->wakeup_trigger, wakeup_cb);
|
||||
UV_RUNTIME_CHECK(uv_async_init, r);
|
||||
|
||||
r = uv_prepare_init(&loop->loop, &loop->quiescent);
|
||||
UV_RUNTIME_CHECK(uv_prepare_init, r);
|
||||
uv_handle_set_data(&loop->quiescent, loop);
|
||||
|
|
@ -245,7 +231,7 @@ loop_init(isc_loop_t *loop, isc_loopmgr_t *loopmgr, uint32_t tid) {
|
|||
|
||||
static void
|
||||
quiescent_cb(uv_prepare_t *handle) {
|
||||
isc__qsbr_quiescent_cb(handle);
|
||||
UNUSED(handle);
|
||||
|
||||
#if defined(RCU_QSBR)
|
||||
/* safe memory reclamation */
|
||||
|
|
@ -340,7 +326,6 @@ isc_loopmgr_create(isc_mem_t *mctx, uint32_t nloops, isc_loopmgr_t **loopmgrp) {
|
|||
loopmgr = isc_mem_get(mctx, sizeof(*loopmgr));
|
||||
*loopmgr = (isc_loopmgr_t){
|
||||
.nloops = nloops,
|
||||
.qsbr = ISC_QSBR_INITIALIZER(nloops),
|
||||
};
|
||||
|
||||
isc_mem_attach(mctx, &loopmgr->mctx);
|
||||
|
|
@ -449,6 +434,10 @@ isc_loopmgr_run(isc_loopmgr_t *loopmgr) {
|
|||
*/
|
||||
ignore_signal(SIGPIPE, SIG_IGN);
|
||||
|
||||
bool free_call_rcu_data = !create_all_cpu_call_rcu_data(0);
|
||||
|
||||
rcu_register_thread();
|
||||
|
||||
/*
|
||||
* The thread 0 is this one.
|
||||
*/
|
||||
|
|
@ -463,21 +452,13 @@ isc_loopmgr_run(isc_loopmgr_t *loopmgr) {
|
|||
}
|
||||
|
||||
isc_thread_main(loop_thread, &loopmgr->loops[0]);
|
||||
}
|
||||
|
||||
void
|
||||
isc_loopmgr_wakeup(isc_loopmgr_t *loopmgr) {
|
||||
REQUIRE(VALID_LOOPMGR(loopmgr));
|
||||
rcu_unregister_thread();
|
||||
|
||||
for (size_t i = 0; i < loopmgr->nloops; i++) {
|
||||
isc_loop_t *loop = &loopmgr->loops[i];
|
||||
rcu_barrier();
|
||||
|
||||
/* Skip current loop */
|
||||
if (i == isc_tid()) {
|
||||
continue;
|
||||
}
|
||||
|
||||
uv_async_send(&loop->wakeup_trigger);
|
||||
if (free_call_rcu_data) {
|
||||
free_all_cpu_call_rcu_data();
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -21,7 +21,6 @@
|
|||
#include <isc/loop.h>
|
||||
#include <isc/magic.h>
|
||||
#include <isc/mem.h>
|
||||
#include <isc/qsbr.h>
|
||||
#include <isc/refcount.h>
|
||||
#include <isc/result.h>
|
||||
#include <isc/signal.h>
|
||||
|
|
@ -76,9 +75,7 @@ struct isc_loop {
|
|||
uv_async_t destroy_trigger;
|
||||
|
||||
/* safe memory reclamation */
|
||||
uv_async_t wakeup_trigger;
|
||||
uv_prepare_t quiescent;
|
||||
isc_qsbr_phase_t qsbr_phase;
|
||||
};
|
||||
|
||||
/*
|
||||
|
|
@ -113,9 +110,6 @@ struct isc_loopmgr {
|
|||
|
||||
/* per-thread objects */
|
||||
isc_loop_t *loops;
|
||||
|
||||
/* safe memory reclamation */
|
||||
isc_qsbr_t qsbr;
|
||||
};
|
||||
|
||||
/*
|
||||
|
|
|
|||
|
|
@ -32,6 +32,7 @@
|
|||
#include <isc/strerr.h>
|
||||
#include <isc/string.h>
|
||||
#include <isc/types.h>
|
||||
#include <isc/urcu.h>
|
||||
#include <isc/util.h>
|
||||
|
||||
#ifdef HAVE_LIBXML2
|
||||
|
|
@ -578,6 +579,12 @@ isc__mem_destroy(isc_mem_t **ctxp FLARG) {
|
|||
ctx = *ctxp;
|
||||
*ctxp = NULL;
|
||||
|
||||
/*
|
||||
* wait for asynchronous memory reclamation to complete
|
||||
* before checking for memory leaks
|
||||
*/
|
||||
rcu_barrier();
|
||||
|
||||
#if ISC_MEM_TRACKLINES
|
||||
if ((ctx->debugging & ISC_MEM_DEBUGTRACE) != 0) {
|
||||
fprintf(stderr, "destroy mctx %p file %s line %u\n", ctx, file,
|
||||
|
|
|
|||
393
lib/isc/qsbr.c
393
lib/isc/qsbr.c
|
|
@ -1,393 +0,0 @@
|
|||
/*
|
||||
* Copyright (C) Internet Systems Consortium, Inc. ("ISC")
|
||||
*
|
||||
* SPDX-License-Identifier: MPL-2.0
|
||||
*
|
||||
* This Source Code Form is subject to the terms of the Mozilla Public
|
||||
* License, v. 2.0. If a copy of the MPL was not distributed with this
|
||||
* file, you can obtain one at https://mozilla.org/MPL/2.0/.
|
||||
*
|
||||
* See the COPYRIGHT file distributed with this work for additional
|
||||
* information regarding copyright ownership.
|
||||
*/
|
||||
|
||||
#include <isc/atomic.h>
|
||||
#include <isc/log.h>
|
||||
#include <isc/loop.h>
|
||||
#include <isc/qsbr.h>
|
||||
#include <isc/stack.h>
|
||||
#include <isc/tid.h>
|
||||
#include <isc/time.h>
|
||||
#include <isc/types.h>
|
||||
#include <isc/uv.h>
|
||||
|
||||
#include "loop_p.h"
|
||||
|
||||
#define MAX_GRACE_PERIOD_NS 53 * NS_PER_MS
|
||||
|
||||
#if 0
|
||||
#define TRACE(fmt, ...) \
|
||||
isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL, ISC_LOGMODULE_OTHER, \
|
||||
ISC_LOG_DEBUG(7), "%s:%u:%s():t%u: " fmt, __FILE__, \
|
||||
__LINE__, __func__, isc_tid(), ##__VA_ARGS__)
|
||||
#else
|
||||
#define TRACE(...)
|
||||
#endif
|
||||
|
||||
static ISC_STACK(isc_qsbr_registered_t) qsbreclaimers = ISC_STACK_INITIALIZER;
|
||||
|
||||
static void
|
||||
reclaim_cb(void *arg);
|
||||
static void
|
||||
reclaimed_cb(void *arg);
|
||||
|
||||
/**********************************************************************/
|
||||
|
||||
/*
|
||||
* 3,2,1,3,2,1,...
|
||||
*/
|
||||
static isc_qsbr_phase_t
|
||||
change_phase(isc_qsbr_phase_t phase) {
|
||||
return (--phase > 0 ? phase : ISC_QSBR_PHASE_MAX);
|
||||
}
|
||||
|
||||
/*
|
||||
* For marking or checking that a phase has cleanup work to do.
|
||||
*/
|
||||
static unsigned int
|
||||
active_bit(isc_qsbr_phase_t phase) {
|
||||
return (1 << phase);
|
||||
}
|
||||
|
||||
/*
|
||||
* Extract the global phase from the grace period state.
|
||||
*/
|
||||
static isc_qsbr_phase_t
|
||||
global_phase(isc_qsbr_t *qsbr, memory_order m_o) {
|
||||
uint32_t grace = atomic_load_explicit(&qsbr->grace, m_o);
|
||||
return (ISC_QSBR_GRACE_PHASE(grace));
|
||||
}
|
||||
|
||||
/*
|
||||
* Record that the current thread has passed the barrier.
|
||||
* Returns true if more threads still need to pass.
|
||||
*
|
||||
* ATOMIC: acquire-release, to ensure that this is not reordered wrt
|
||||
* read-only accesses to lock-free data structures. This implements the
|
||||
* ordering requirements of a quiescent state.
|
||||
*/
|
||||
static bool
|
||||
fuzzy_barrier_not_yet(isc_qsbr_t *qsbr) {
|
||||
uint32_t grace = atomic_fetch_sub_acq_rel(&qsbr->grace,
|
||||
ISC_QSBR_ONE_THREAD);
|
||||
uint32_t threads = ISC_QSBR_GRACE_THREADS(grace);
|
||||
return (threads > 1);
|
||||
}
|
||||
|
||||
/*
|
||||
* Ungracefully drive all cleanup work to completion.
|
||||
*
|
||||
* ATOMIC: everything is relaxed, because we assume that concurrent
|
||||
* readers have already finished. `reclaim_cb()` uses the `activated`
|
||||
* flags to ensure it is OK that threads will race to complete the
|
||||
* cleanup.
|
||||
*/
|
||||
static void
|
||||
qsbr_shutdown(isc_loopmgr_t *loopmgr) {
|
||||
isc_qsbr_t *qsbr = &loopmgr->qsbr;
|
||||
isc_qsbr_phase_t phase = global_phase(qsbr, memory_order_relaxed);
|
||||
uint32_t threads = isc_loopmgr_nloops(loopmgr);
|
||||
uint32_t grace;
|
||||
|
||||
while (atomic_load_relaxed(&qsbr->activated) != 0) {
|
||||
reclaim_cb(loopmgr);
|
||||
phase = change_phase(phase);
|
||||
grace = ISC_QSBR_GRACE(threads, phase);
|
||||
atomic_store_relaxed(&qsbr->grace, grace);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* On a quiet server that does not have enough network traffic to keep
|
||||
* all its threads spinning, grace periods might extend indefinitely.
|
||||
* So check if we have been waiting an unreasonably long time since
|
||||
* the last phase change. If so, send a no-op async request to every
|
||||
* thread to make them all cycle through a quiescent state.
|
||||
*/
|
||||
static void
|
||||
maybe_wakeup(isc_loop_t *loop) {
|
||||
isc_loopmgr_t *loopmgr = loop->loopmgr;
|
||||
isc_qsbr_t *qsbr = &loopmgr->qsbr;
|
||||
|
||||
/*
|
||||
* ATOMIC: relaxed is OK here because we don't use any values guarded
|
||||
* by the `activated` flags.
|
||||
*/
|
||||
if (atomic_load_relaxed(&qsbr->activated) == 0) {
|
||||
return;
|
||||
}
|
||||
if (loop->shuttingdown) {
|
||||
qsbr_shutdown(loopmgr);
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* ATOMIC: relaxed, because the `transition_time` doesn't guard any
|
||||
* other values, just the isc_loopmgr_wakeup() call below.
|
||||
*/
|
||||
atomic_uint_fast64_t *qsbr_ttp = &qsbr->transition_time;
|
||||
isc_nanosecs_t now = isc_time_monotonic();
|
||||
isc_nanosecs_t start = atomic_load_relaxed(qsbr_ttp);
|
||||
if (now < start + MAX_GRACE_PERIOD_NS) {
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* To stop other threads from also invoking `isc_loopmgr_wakeup()`,
|
||||
* we try to push the timer into the future (expecting that it will
|
||||
* not trigger again), and quit if someone else got there first.
|
||||
* ATOMIC: relaxed, as before; strong, because there is no retry loop.
|
||||
*/
|
||||
if (!atomic_compare_exchange_strong_relaxed(qsbr_ttp, &start, now)) {
|
||||
return;
|
||||
}
|
||||
|
||||
TRACE("long grace period of %llu ns, waking up other threads",
|
||||
(unsigned long long)(now - start));
|
||||
|
||||
isc_loopmgr_wakeup(loopmgr);
|
||||
}
|
||||
|
||||
/*
|
||||
* Callers use the fuzzy barrier to ensure only one thread can enter
|
||||
* this function at a time.
|
||||
*
|
||||
* Phase transitions happen at roughly the same frequency that IO
|
||||
* event loops cycle, limited by the slowest loop in each cycle.
|
||||
*/
|
||||
static void
|
||||
phase_transition(isc_loop_t *loop, isc_qsbr_phase_t current_phase) {
|
||||
isc_loopmgr_t *loopmgr = loop->loopmgr;
|
||||
isc_qsbr_t *qsbr = &loopmgr->qsbr;
|
||||
|
||||
if (loop->shuttingdown) {
|
||||
qsbr_shutdown(loopmgr);
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* After we change phase, threads will be in either the `current_phase`
|
||||
* or the `next_phase`. We will reclaim memory from the `third_phase`.
|
||||
*
|
||||
* ATOMIC: relaxed is OK here because the necessary synchronization
|
||||
* happens in `reclaim_cb()`.
|
||||
*/
|
||||
isc_qsbr_phase_t next_phase = change_phase(current_phase);
|
||||
isc_qsbr_phase_t third_phase = change_phase(next_phase);
|
||||
bool activated = atomic_load_relaxed(&qsbr->activated) &
|
||||
active_bit(third_phase);
|
||||
|
||||
/*
|
||||
* Reset the wakeup timer, and log the length of the grace period.
|
||||
* ATOMIC: relaxed, per the commentary in `maybe_wakeup()`.
|
||||
*/
|
||||
atomic_uint_fast64_t *qsbr_tt = &qsbr->transition_time;
|
||||
isc_nanosecs_t now = isc_time_monotonic();
|
||||
isc_nanosecs_t start = atomic_exchange_relaxed(qsbr_tt, now);
|
||||
TRACE("phase %u -> %u after grace period of %f ms", current_phase,
|
||||
next_phase, (double)(now - start) / NS_PER_MS);
|
||||
UNUSED(start); /* ifndef TRACE() */
|
||||
|
||||
/*
|
||||
* Work out the threads counter for this grace period.
|
||||
*
|
||||
* We need to add one for any reclamation worker thread, to
|
||||
* prevent us from changing phase before the work is done. If
|
||||
* we change too early, any newly detached objects will be
|
||||
* marked with the same phase as the running reclaimer, which
|
||||
* might lead to them being free()d too soon.
|
||||
*/
|
||||
uint32_t threads = isc_loopmgr_nloops(loopmgr) + (activated ? 1 : 0);
|
||||
|
||||
/*
|
||||
* Start the new grace period.
|
||||
*
|
||||
* ATOMIC: release, to pair with the load-acquire in `reclaim_cb()`
|
||||
* which is spawned in a separate worker thread.
|
||||
*/
|
||||
uint32_t grace = ISC_QSBR_GRACE(threads, next_phase);
|
||||
atomic_store_release(&qsbr->grace, grace);
|
||||
|
||||
if (activated) {
|
||||
isc_work_enqueue(loop, reclaim_cb, reclaimed_cb, loopmgr);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
* This function is called once per cycle of each IO event loop by the
|
||||
* `uv_prepare` callback below.
|
||||
*/
|
||||
void
|
||||
isc__qsbr_quiescent_state(isc_loop_t *loop) {
|
||||
isc_loopmgr_t *loopmgr = loop->loopmgr;
|
||||
isc_qsbr_t *qsbr = &loopmgr->qsbr;
|
||||
|
||||
/*
|
||||
* ATOMIC: relaxed. If we are in phase then we don't need to
|
||||
* synchronize; if we are not then this thread's presence in
|
||||
* the thread counter will prevent the phase from changing
|
||||
* before we get to the fuzzy barrier.
|
||||
*/
|
||||
isc_qsbr_phase_t phase = global_phase(qsbr, memory_order_relaxed);
|
||||
if (loop->qsbr_phase == phase) {
|
||||
maybe_wakeup(loop);
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* Enter the current phase and count us out of the previous phase.
|
||||
*/
|
||||
loop->qsbr_phase = phase;
|
||||
if (fuzzy_barrier_not_yet(qsbr)) {
|
||||
maybe_wakeup(loop);
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* We were the last thread to enter the current phase so the
|
||||
* grace period is up. No other thread can reach this point.
|
||||
*/
|
||||
phase_transition(loop, phase);
|
||||
}
|
||||
|
||||
void
|
||||
isc__qsbr_quiescent_cb(uv_prepare_t *handle) {
|
||||
isc_loop_t *loop = uv_handle_get_data((uv_handle_t *)handle);
|
||||
isc__qsbr_quiescent_state(loop);
|
||||
}
|
||||
|
||||
static void
|
||||
reclaimed_cb(void *arg) {
|
||||
/* we are back on a loop thread */
|
||||
isc_loopmgr_t *loopmgr = arg;
|
||||
isc_qsbr_t *qsbr = &loopmgr->qsbr;
|
||||
isc_loop_t *loop = CURRENT_LOOP(loopmgr);
|
||||
|
||||
/*
|
||||
* Remove the reclaimers from the thread count, so that the
|
||||
* next grace period can start.
|
||||
*/
|
||||
if (fuzzy_barrier_not_yet(qsbr)) {
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* The reclaimers were the last thread to be counted out: every
|
||||
* other thread already passed through a quiescent state.
|
||||
*
|
||||
* We expect loop->qsbr_phase == global_phase() at this point,
|
||||
* except during shutdown when the phase shifts rapidly. Also,
|
||||
* the current loop might not have received the shutdown
|
||||
* message yet, so it seems easiest to omit the assertion.
|
||||
*
|
||||
* ATOMIC: relaxed, the fuzzy barrier already synchronized.
|
||||
*/
|
||||
TRACE("reclaimers overran");
|
||||
phase_transition(loop, global_phase(qsbr, memory_order_relaxed));
|
||||
}
|
||||
|
||||
static void
|
||||
reclaim_cb(void *arg) {
|
||||
/* we are on a work thread not a loop thread */
|
||||
isc_loopmgr_t *loopmgr = arg;
|
||||
isc_qsbr_t *qsbr = &loopmgr->qsbr;
|
||||
|
||||
/*
|
||||
* The global phase has just been bumped by a `phase_transition()`
|
||||
* and it cannot change again until the grace period is up, which
|
||||
* cannot happen until we have finished working.
|
||||
*
|
||||
* ATOMIC: acquire, to pair with the release in `phase_transition()`.
|
||||
*
|
||||
* The phase we are to clean up is 2 before the current phase,
|
||||
* which is the same as the one after the current phase (mod 3).
|
||||
*/
|
||||
isc_qsbr_phase_t cur_phase = global_phase(qsbr, memory_order_acquire);
|
||||
isc_qsbr_phase_t third_phase = change_phase(cur_phase);
|
||||
unsigned int third_bit = active_bit(third_phase);
|
||||
|
||||
/*
|
||||
* If any reclaimers need to be called again later, they can use
|
||||
* `isc_qsbr_activate()`, so we need to clear the bit first.
|
||||
*
|
||||
* ATOMIC: acquire, so that `isc_qsbr_activate()` happens before
|
||||
* the callbacks are invoked.
|
||||
*/
|
||||
uint32_t activated = atomic_fetch_and_explicit(
|
||||
&qsbr->activated, ~third_bit, memory_order_acquire);
|
||||
|
||||
/* this can happen when we are racing to clean up on shutdown */
|
||||
if ((activated & third_bit) == 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
isc_qsbr_registered_t *reclaimer = ISC_STACK_TOP(qsbreclaimers);
|
||||
while (reclaimer != NULL) {
|
||||
reclaimer->func(third_phase);
|
||||
reclaimer = ISC_SLINK_NEXT(reclaimer, link);
|
||||
}
|
||||
}
|
||||
|
||||
void
|
||||
isc__qsbr_register(isc_qsbr_registered_t *reclaimer) {
|
||||
REQUIRE(reclaimer->func != NULL);
|
||||
ISC_STACK_PUSH(qsbreclaimers, reclaimer, link);
|
||||
}
|
||||
|
||||
/*
|
||||
* ATOMIC: This function needs to ensure that the global phase is read
|
||||
* after a write has committed. Acquire/release ordering is not sufficient
|
||||
* for ordering between separate atomics (the data structure's root pointer
|
||||
* and the global phase), so it must be sequentially consistent.
|
||||
*
|
||||
* In general, the phases up to and including the next phase transition
|
||||
* look like:
|
||||
*
|
||||
* 1. local phase
|
||||
* 2. global phase
|
||||
* 3. next phase
|
||||
* 1. third phase
|
||||
*
|
||||
* i.e. some threads are still one behind the global phase, on the same
|
||||
* phase that will be cleaned up immediately after the phase transition.
|
||||
*
|
||||
* This function is called just after a write commits. It's likely that
|
||||
* some threads on the global phase (2) are using a version of the data
|
||||
* structure from before the write, and they can continue using it while
|
||||
* the straggler threads (1) catch up and cause a phase transition.
|
||||
*
|
||||
* The writer can be one of the straggler threads. If it incorrectly marks
|
||||
* cleanup work with its local phase (1), memory will be reclaimed
|
||||
* immediately after the next phase transition (when the third phase is
|
||||
* also 1), which could be almost immediately when the writer returns to
|
||||
* the event loop. This will cause a use-after-free for existing readers
|
||||
* (in phase 2).
|
||||
*
|
||||
* More straightforwardly, we need to be able to queue up reclaim work from
|
||||
* a thread that isn't running a loop, which also means this function has
|
||||
* to return the global phase.
|
||||
*/
|
||||
isc_qsbr_phase_t
|
||||
isc_qsbr_phase(isc_loopmgr_t *loopmgr) {
|
||||
isc_qsbr_t *qsbr = &loopmgr->qsbr;
|
||||
return (global_phase(qsbr, memory_order_seq_cst));
|
||||
}
|
||||
|
||||
void
|
||||
isc_qsbr_activate(isc_loopmgr_t *loopmgr, isc_qsbr_phase_t phase) {
|
||||
/*
|
||||
* ATOMIC: release ordering ensures that writing the cleanup lists
|
||||
* happens before the callback is invoked from a worker thread.
|
||||
*/
|
||||
atomic_fetch_or_release(&loopmgr->qsbr.activated, active_bit(phase));
|
||||
}
|
||||
|
|
@ -79,8 +79,7 @@ isc_quota_release(isc_quota_t *quota) {
|
|||
return;
|
||||
}
|
||||
|
||||
isc_job_t *job = caa_container_of(node, isc_job_t, wfcq_node);
|
||||
__tsan_acquire(job);
|
||||
isc_job_t *job = isc_urcu_container(node, isc_job_t, wfcq_node);
|
||||
job->cb(job->cbarg);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -61,7 +61,7 @@ thread_wrap(isc_threadfunc_t func, void *arg) {
|
|||
.func = func,
|
||||
.arg = arg,
|
||||
};
|
||||
|
||||
__tsan_release(wrap);
|
||||
return (wrap);
|
||||
}
|
||||
|
||||
|
|
@ -84,12 +84,8 @@ thread_body(struct thread_wrap *wrap) {
|
|||
__tsan_acquire(wrap);
|
||||
free(wrap);
|
||||
|
||||
rcu_register_thread();
|
||||
|
||||
ret = func(arg);
|
||||
|
||||
rcu_unregister_thread();
|
||||
|
||||
return (ret);
|
||||
}
|
||||
|
||||
|
|
@ -101,10 +97,14 @@ thread_run(void *wrap) {
|
|||
*/
|
||||
isc__iterated_hash_initialize();
|
||||
|
||||
rcu_register_thread();
|
||||
|
||||
void *ret = thread_body(wrap);
|
||||
|
||||
isc__iterated_hash_shutdown();
|
||||
|
||||
rcu_unregister_thread();
|
||||
|
||||
return (ret);
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -18,8 +18,8 @@
|
|||
#include <isc/hashmap.h>
|
||||
#include <isc/ht.h>
|
||||
#include <isc/list.h>
|
||||
#include <isc/qsbr.h>
|
||||
#include <isc/rwlock.h>
|
||||
#include <isc/urcu.h>
|
||||
#include <isc/util.h>
|
||||
|
||||
#include <dns/fixedname.h>
|
||||
|
|
|
|||
|
|
@ -18,6 +18,7 @@
|
|||
#include <unistd.h>
|
||||
|
||||
#include <isc/async.h>
|
||||
#include <isc/barrier.h>
|
||||
#include <isc/list.h>
|
||||
#include <isc/log.h>
|
||||
#include <isc/loop.h>
|
||||
|
|
@ -25,7 +26,6 @@
|
|||
#include <isc/mem.h>
|
||||
#include <isc/mutex.h>
|
||||
#include <isc/os.h>
|
||||
#include <isc/qsbr.h>
|
||||
#include <isc/random.h>
|
||||
#include <isc/refcount.h>
|
||||
#include <isc/rwlock.h>
|
||||
|
|
@ -33,6 +33,7 @@
|
|||
#include <isc/tid.h>
|
||||
#include <isc/time.h>
|
||||
#include <isc/timer.h>
|
||||
#include <isc/urcu.h>
|
||||
#include <isc/util.h>
|
||||
#include <isc/uv.h>
|
||||
|
||||
|
|
@ -380,8 +381,7 @@ load_multi(struct bench_state *bctx) {
|
|||
size_t count = 0;
|
||||
uint64_t start;
|
||||
|
||||
dns_qpmulti_create(bctx->mctx, bctx->loopmgr, &item_methods, NULL,
|
||||
&bctx->multi);
|
||||
dns_qpmulti_create(bctx->mctx, &item_methods, NULL, &bctx->multi);
|
||||
|
||||
/* initial contents of the trie */
|
||||
start = isc_time_monotonic();
|
||||
|
|
@ -884,6 +884,9 @@ int
|
|||
main(void) {
|
||||
isc_loopmgr_t *loopmgr = NULL;
|
||||
isc_mem_t *mctx = NULL;
|
||||
|
||||
setlinebuf(stdout);
|
||||
|
||||
uint32_t nloops;
|
||||
const char *env_workers = getenv("ISC_TASK_WORKERS");
|
||||
|
||||
|
|
|
|||
|
|
@ -22,11 +22,11 @@
|
|||
#define UNIT_TESTING
|
||||
#include <cmocka.h>
|
||||
|
||||
#include <isc/qsbr.h>
|
||||
#include <isc/random.h>
|
||||
#include <isc/refcount.h>
|
||||
#include <isc/result.h>
|
||||
#include <isc/string.h>
|
||||
#include <isc/urcu.h>
|
||||
#include <isc/util.h>
|
||||
|
||||
#include <dns/name.h>
|
||||
|
|
|
|||
|
|
@ -27,10 +27,10 @@
|
|||
#include <isc/loop.h>
|
||||
#include <isc/magic.h>
|
||||
#include <isc/mem.h>
|
||||
#include <isc/qsbr.h>
|
||||
#include <isc/random.h>
|
||||
#include <isc/refcount.h>
|
||||
#include <isc/rwlock.h>
|
||||
#include <isc/urcu.h>
|
||||
#include <isc/util.h>
|
||||
|
||||
#include <dns/log.h>
|
||||
|
|
@ -367,14 +367,13 @@ many_transactions(void *arg) {
|
|||
UNUSED(arg);
|
||||
|
||||
dns_qpmulti_t *qpm = NULL;
|
||||
dns_qpmulti_create(mctx, loopmgr, &test_methods, NULL, &qpm);
|
||||
dns_qpmulti_create(mctx, &test_methods, NULL, &qpm);
|
||||
qpm->writer.write_protect = true;
|
||||
|
||||
for (size_t n = 0; n < TRANSACTION_COUNT; n++) {
|
||||
TRACE("transaction %zu", n);
|
||||
one_transaction(qpm);
|
||||
isc__qsbr_quiescent_state(isc_loop_current(loopmgr));
|
||||
isc_loopmgr_wakeup(loopmgr);
|
||||
rcu_quiescent_state();
|
||||
}
|
||||
|
||||
dns_qpmulti_destroy(&qpm);
|
||||
|
|
@ -387,6 +386,7 @@ ISC_RUN_TEST_IMPL(qpmulti) {
|
|||
setup_items();
|
||||
isc_loop_setup(isc_loop_main(loopmgr), many_transactions, NULL);
|
||||
isc_loopmgr_run(loopmgr);
|
||||
rcu_barrier();
|
||||
isc_loopmgr_destroy(&loopmgr);
|
||||
isc_log_destroy(&dns_lctx);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -4,6 +4,7 @@ AM_CPPFLAGS += \
|
|||
$(LIBISC_CFLAGS) \
|
||||
$(LIBDNS_CFLAGS) \
|
||||
$(LIBNS_CFLAGS) \
|
||||
$(LIBURCU_CFLAGS) \
|
||||
$(LIBUV_CFLAGS) \
|
||||
-I$(top_srcdir)/lib/isc \
|
||||
-I$(top_srcdir)/lib/dns
|
||||
|
|
|
|||
|
|
@ -64,7 +64,7 @@ dns_test_makeview(const char *name, bool with_cache, dns_view_t **viewp) {
|
|||
dns_view_t *view = NULL;
|
||||
dns_cache_t *cache = NULL;
|
||||
|
||||
result = dns_view_create(mctx, loopmgr, dns_rdataclass_in, name, &view);
|
||||
result = dns_view_create(mctx, dns_rdataclass_in, name, &view);
|
||||
if (result != ISC_R_SUCCESS) {
|
||||
return (result);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -18,9 +18,9 @@
|
|||
#include <isc/buffer.h>
|
||||
#include <isc/loop.h>
|
||||
#include <isc/magic.h>
|
||||
#include <isc/qsbr.h>
|
||||
#include <isc/refcount.h>
|
||||
#include <isc/rwlock.h>
|
||||
#include <isc/urcu.h>
|
||||
#include <isc/util.h>
|
||||
|
||||
#include <dns/name.h>
|
||||
|
|
@ -255,10 +255,10 @@ qp_test_dumpchunks(dns_qp_t *qp) {
|
|||
dumpqp(qp, "qp");
|
||||
for (qp_chunk_t c = 0; c < qp->chunk_max; c++) {
|
||||
printf("qp %p chunk %u base %p "
|
||||
"used %u free %u immutable %u phase %u\n",
|
||||
"used %u free %u immutable %u discounted %u\n",
|
||||
qp, c, qp->base->ptr[c], qp->usage[c].used,
|
||||
qp->usage[c].free, qp->usage[c].immutable,
|
||||
qp->usage[c].phase);
|
||||
qp->usage[c].discounted);
|
||||
used += qp->usage[c].used;
|
||||
free += qp->usage[c].free;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,3 +0,0 @@
|
|||
# Uninstrumented library.
|
||||
called_from_lib:libfstrm.so
|
||||
called_from_lib:libdummyrpz.so
|
||||
Loading…
Reference in a new issue