chg: dev: Refactor NTA to use RCU instead of rwlock

Replace the ntatable rwlock with RCU read-side critical sections.

Merge branch 'ondrej/refactor-nta-using-RCU' into 'main'

See merge request isc-projects/bind9!11689
This commit is contained in:
Ondřej Surý 2026-03-19 01:45:12 +01:00
commit fdb5eca9a9

View file

@ -17,15 +17,17 @@
#include <stdbool.h>
#include <isc/async.h>
#include <isc/atomic.h>
#include <isc/buffer.h>
#include <isc/log.h>
#include <isc/loop.h>
#include <isc/mem.h>
#include <isc/refcount.h>
#include <isc/result.h>
#include <isc/rwlock.h>
#include <isc/string.h>
#include <isc/time.h>
#include <isc/timer.h>
#include <isc/urcu.h>
#include <isc/util.h>
#include <dns/db.h>
@ -41,10 +43,8 @@ struct dns_ntatable {
unsigned int magic;
isc_mem_t *mctx;
dns_view_t *view;
isc_rwlock_t rwlock;
isc_refcount_t references;
dns_qpmulti_t *table;
atomic_bool shuttingdown;
};
struct dns__nta {
@ -60,7 +60,6 @@ struct dns__nta {
dns_rdataset_t sigrdataset;
dns_name_t name;
isc_stdtime_t expiry;
bool shuttingdown;
};
#define NTA_MAGIC ISC_MAGIC('N', 'T', 'A', 'n')
@ -115,25 +114,22 @@ dns_ntatable_create(dns_view_t *view, dns_ntatable_t **ntatablep) {
ntatable = isc_mem_get(view->mctx, sizeof(*ntatable));
*ntatable = (dns_ntatable_t){
.magic = NTATABLE_MAGIC,
.mctx = isc_mem_ref(view->mctx),
.references = ISC_REFCOUNT_INITIALIZER(1),
};
dns_view_weakattach(view, &ntatable->view);
isc_rwlock_init(&ntatable->rwlock);
dns_qpmulti_create(view->mctx, &qpmethods, view, &ntatable->table);
isc_refcount_init(&ntatable->references, 1);
ntatable->magic = NTATABLE_MAGIC;
*ntatablep = ntatable;
}
static void
dns__ntatable_destroy(dns_ntatable_t *ntatable) {
ntatable->magic = 0;
isc_rwlock_destroy(&ntatable->rwlock);
dns_qpmulti_destroy(&ntatable->table);
INSIST(ntatable->table == NULL);
INSIST(ntatable->view == NULL);
isc_mem_putanddetach(&ntatable->mctx, ntatable, sizeof(*ntatable));
}
@ -150,8 +146,11 @@ fetch_done(void *arg) {
dns__nta_t *nta = resp->arg;
isc_result_t eresult = resp->result;
dns_ntatable_t *ntatable = nta->ntatable;
dns_view_t *view = ntatable->view;
isc_stdtime_t now = isc_stdtime_now();
isc_stdtime_t expiry;
rcu_read_lock();
dns_view_t *view = rcu_dereference(ntatable->view);
dns_rdataset_cleanup(&nta->rdataset);
dns_rdataset_cleanup(&nta->sigrdataset);
@ -169,17 +168,18 @@ fetch_done(void *arg) {
dns_resolver_freefresp(&resp);
expiry = CMM_LOAD_SHARED(nta->expiry);
switch (eresult) {
case ISC_R_SUCCESS:
case DNS_R_NCACHENXDOMAIN:
case DNS_R_NXDOMAIN:
case DNS_R_NCACHENXRRSET:
case DNS_R_NXRRSET:
RWLOCK(&ntatable->rwlock, isc_rwlocktype_write);
if (nta->expiry > now) {
nta->expiry = now;
if (expiry > now) {
CMM_STORE_SHARED(nta->expiry, now);
expiry = now;
}
RWUNLOCK(&ntatable->rwlock, isc_rwlocktype_write);
break;
default:
break;
@ -189,12 +189,13 @@ fetch_done(void *arg) {
* If we're expiring before the next recheck, we might
* as well stop the timer now.
*/
RWLOCK(&ntatable->rwlock, isc_rwlocktype_read);
if (nta->timer != NULL && nta->expiry - now < view->nta_recheck) {
if (nta->timer != NULL &&
(view == NULL || expiry - now < view->nta_recheck))
{
isc_timer_stop(nta->timer);
}
RWUNLOCK(&ntatable->rwlock, isc_rwlocktype_read);
rcu_read_unlock();
dns__nta_detach(&nta); /* for dns_resolver_createfetch() */
}
@ -212,13 +213,17 @@ checkbogus(void *arg) {
dns_rdataset_cleanup(&nta->rdataset);
dns_rdataset_cleanup(&nta->sigrdataset);
if (atomic_load(&ntatable->shuttingdown)) {
rcu_read_lock();
dns_view_t *view = rcu_dereference(ntatable->view);
if (view == NULL) {
isc_timer_stop(nta->timer);
rcu_read_unlock();
return;
}
result = dns_view_getresolver(ntatable->view, &resolver);
result = dns_view_getresolver(view, &resolver);
if (result != ISC_R_SUCCESS) {
rcu_read_unlock();
return;
}
@ -232,23 +237,21 @@ checkbogus(void *arg) {
dns__nta_detach(&nta); /* for dns_resolver_createfetch() */
}
dns_resolver_detach(&resolver);
rcu_read_unlock();
}
static void
settimer(dns_ntatable_t *ntatable, dns__nta_t *nta, uint32_t lifetime) {
dns_view_t *view = NULL;
isc_interval_t interval;
REQUIRE(VALID_NTATABLE(ntatable));
settimer(dns_view_t *view, dns__nta_t *nta, uint32_t lifetime) {
REQUIRE(VALID_NTA(nta));
view = ntatable->view;
if (view->nta_recheck == 0 || lifetime <= view->nta_recheck) {
return;
}
isc_timer_create(nta->loop, checkbogus, nta, &nta->timer);
isc_interval_t interval;
isc_interval_set(&interval, view->nta_recheck, 0);
isc_timer_create(nta->loop, checkbogus, nta, &nta->timer);
isc_timer_start(nta->timer, isc_timertype_ticker, &interval);
}
@ -283,50 +286,50 @@ isc_result_t
dns_ntatable_add(dns_ntatable_t *ntatable, const dns_name_t *name, bool force,
isc_stdtime_t now, uint32_t lifetime) {
isc_result_t result = ISC_R_SUCCESS;
dns__nta_t *nta = NULL;
dns__nta_t *nta = NULL, *old_nta = NULL;
dns_qp_t *qp = NULL;
void *pval = NULL;
REQUIRE(VALID_NTATABLE(ntatable));
if (atomic_load(&ntatable->shuttingdown)) {
rcu_read_lock();
dns_view_t *view = rcu_dereference(ntatable->view);
dns_qpmulti_t *table = rcu_dereference(ntatable->table);
if (view == NULL || table == NULL) {
rcu_read_unlock();
return ISC_R_SUCCESS;
}
RWLOCK(&ntatable->rwlock, isc_rwlocktype_write);
dns_qpmulti_write(ntatable->table, &qp);
dns_qpmulti_write(table, &qp);
nta_create(ntatable, name, &nta);
nta->forced = force;
result = dns_qp_insert(qp, nta, 0);
switch (result) {
case ISC_R_EXISTS:
result = dns_qp_getname(qp, &nta->name, DNS_DBNAMESPACE_NORMAL,
&pval, NULL);
if (result == ISC_R_SUCCESS) {
/*
* an NTA already existed: throw away the
* new one and update the old one.
*/
dns__nta_detach(&nta); /* for nta_create */
nta = pval;
break;
}
/* update the NTA's timer as if it were new */
result = dns_qp_deletename(qp, name, DNS_DBNAMESPACE_NORMAL,
(void *)&old_nta, NULL);
RUNTIME_CHECK(result == ISC_R_SUCCESS);
dns__nta_shutdown(old_nta);
dns__nta_detach(&old_nta);
result = dns_qp_insert(qp, nta, 0);
RUNTIME_CHECK(result == ISC_R_SUCCESS);
FALLTHROUGH;
case ISC_R_SUCCESS:
nta->expiry = now + lifetime;
CMM_STORE_SHARED(nta->expiry, now + lifetime);
if (!force) {
settimer(ntatable, nta, lifetime);
settimer(view, nta, lifetime);
}
break;
default:
break;
UNREACHABLE();
}
dns_qp_compact(qp, DNS_QPGC_MAYBE);
dns_qpmulti_commit(ntatable->table, &qp);
RWUNLOCK(&ntatable->rwlock, isc_rwlocktype_write);
dns_qpmulti_commit(table, &qp);
rcu_read_unlock();
return result;
}
@ -340,7 +343,14 @@ dns_ntatable_delete(dns_ntatable_t *ntatable, const dns_name_t *name) {
REQUIRE(VALID_NTATABLE(ntatable));
REQUIRE(name != NULL);
dns_qpmulti_write(ntatable->table, &qp);
rcu_read_lock();
dns_qpmulti_t *table = rcu_dereference(ntatable->table);
if (table == NULL) {
rcu_read_unlock();
return ISC_R_SUCCESS;
}
dns_qpmulti_write(table, &qp);
result = dns_qp_deletename(qp, name, DNS_DBNAMESPACE_NORMAL, &pval,
NULL);
if (result == ISC_R_SUCCESS) {
@ -349,7 +359,9 @@ dns_ntatable_delete(dns_ntatable_t *ntatable, const dns_name_t *name) {
dns__nta_detach(&n);
}
dns_qp_compact(qp, DNS_QPGC_MAYBE);
dns_qpmulti_commit(ntatable->table, &qp);
dns_qpmulti_commit(table, &qp);
rcu_read_unlock();
return result;
}
@ -361,44 +373,44 @@ delete_expired(void *arg) {
isc_result_t result;
dns_qp_t *qp = NULL;
void *pval = NULL;
dns_view_t *view = NULL;
bool flushnode = false;
REQUIRE(VALID_NTATABLE(ntatable));
RWLOCK(&ntatable->rwlock, isc_rwlocktype_write);
dns_qpmulti_write(ntatable->table, &qp);
rcu_read_lock();
dns_view_t *view = rcu_dereference(ntatable->view);
dns_qpmulti_t *table = rcu_dereference(ntatable->table);
if (view == NULL || table == NULL) {
goto unlock;
}
dns_qpmulti_write(table, &qp);
result = dns_qp_getname(qp, &nta->name, DNS_DBNAMESPACE_NORMAL, &pval,
NULL);
if (result == ISC_R_SUCCESS &&
((dns__nta_t *)pval)->expiry == nta->expiry && !nta->shuttingdown)
CMM_LOAD_SHARED(((dns__nta_t *)pval)->expiry) ==
CMM_LOAD_SHARED(nta->expiry))
{
char nb[DNS_NAME_FORMATSIZE];
dns_name_format(&nta->name, nb, sizeof(nb));
isc_log_write(DNS_LOGCATEGORY_DNSSEC, DNS_LOGMODULE_NTA,
ISC_LOG_INFO, "deleting expired NTA at %s", nb);
/*
* Delay the flushing to avoid lock-order-inversion, as
* dns_view_flushnode()->dns_adb_flushnames() locks 'adbname',
* and it can cause a problem e.g. in dns_ntatable_covered() in
* another thread called by the resolver (also involving 'fctx'
* lock), or in dns_ntatable_shutdown() (also involving 'view'
* lock).
*/
dns_view_weakattach(ntatable->view, &view);
flushnode = true;
dns_qp_deletename(qp, &nta->name, DNS_DBNAMESPACE_NORMAL, NULL,
NULL);
result = dns_qp_deletename(qp, &nta->name,
DNS_DBNAMESPACE_NORMAL, NULL, NULL);
RUNTIME_CHECK(result == ISC_R_SUCCESS);
dns__nta_shutdown(nta);
dns__nta_unref(nta);
}
dns_qp_compact(qp, DNS_QPGC_MAYBE);
dns_qpmulti_commit(ntatable->table, &qp);
RWUNLOCK(&ntatable->rwlock, isc_rwlocktype_write);
if (view != NULL) {
dns_qpmulti_commit(table, &qp);
if (flushnode) {
dns_view_flushnode(view, &nta->name, true);
dns_view_weakdetach(&view);
}
unlock:
rcu_read_unlock();
dns__nta_detach(&nta);
dns_ntatable_detach(&ntatable);
}
@ -415,8 +427,14 @@ dns_ntatable_covered(dns_ntatable_t *ntatable, isc_stdtime_t now,
REQUIRE(VALID_NTATABLE(ntatable));
REQUIRE(dns_name_isabsolute(name));
RWLOCK(&ntatable->rwlock, isc_rwlocktype_read);
dns_qpmulti_query(ntatable->table, &qpr);
rcu_read_lock();
dns_qpmulti_t *table = rcu_dereference(ntatable->table);
if (table == NULL) {
rcu_read_unlock();
return false;
}
dns_qpmulti_query(table, &qpr);
result = dns_qp_lookup(&qpr, name, DNS_DBNAMESPACE_NORMAL, NULL, NULL,
&pval, NULL);
nta = pval;
@ -440,18 +458,19 @@ dns_ntatable_covered(dns_ntatable_t *ntatable, isc_stdtime_t now,
goto done;
}
if (nta->expiry <= now) {
if (CMM_LOAD_SHARED(nta->expiry) <= now) {
/* NTA is expired */
dns__nta_ref(nta);
dns_ntatable_ref(nta->ntatable);
isc_async_current(delete_expired, nta);
isc_async_run(nta->loop, delete_expired, nta);
goto done;
}
answer = true;
done:
RWUNLOCK(&ntatable->rwlock, isc_rwlocktype_read);
dns_qpread_destroy(ntatable->table, &qpr);
dns_qpread_destroy(table, &qpr);
rcu_read_unlock();
return answer;
}
@ -475,8 +494,14 @@ dns_ntatable_totext(dns_ntatable_t *ntatable, const char *view,
REQUIRE(VALID_NTATABLE(ntatable));
RWLOCK(&ntatable->rwlock, isc_rwlocktype_read);
dns_qpmulti_query(ntatable->table, &qpr);
rcu_read_lock();
dns_qpmulti_t *table = rcu_dereference(ntatable->table);
if (table == NULL) {
rcu_read_unlock();
return ISC_R_SUCCESS;
}
dns_qpmulti_query(table, &qpr);
dns_qpiter_init(&qpr, &iter);
while (dns_qpiter_next(&iter, &pval, NULL) == ISC_R_SUCCESS) {
@ -486,19 +511,20 @@ dns_ntatable_totext(dns_ntatable_t *ntatable, const char *view,
char obuf[DNS_NAME_FORMATSIZE + ISC_FORMATHTTPTIMESTAMP_SIZE +
sizeof("expired: \n")];
isc_time_t t;
isc_stdtime_t expiry = CMM_LOAD_SHARED(n->expiry);
dns_name_format(&n->name, nbuf, sizeof(nbuf));
if (n->expiry != 0xffffffffU) {
if (expiry != 0xffffffffU) {
/* Normal NTA entries */
isc_time_set(&t, n->expiry, 0);
isc_time_set(&t, expiry, 0);
isc_time_formattimestamp(&t, tbuf, sizeof(tbuf));
snprintf(obuf, sizeof(obuf), "%s%s%s%s: %s %s",
first ? "" : "\n", nbuf,
view != NULL ? "/" : "",
view != NULL ? view : "",
n->expiry <= now ? "expired" : "expiry", tbuf);
expiry <= now ? "expired" : "expiry", tbuf);
} else {
/* "validate-except" entries */
snprintf(obuf, sizeof(obuf), "%s%s%s%s: %s",
@ -512,8 +538,8 @@ dns_ntatable_totext(dns_ntatable_t *ntatable, const char *view,
}
cleanup:
dns_qpread_destroy(ntatable->table, &qpr);
RWUNLOCK(&ntatable->rwlock, isc_rwlocktype_read);
dns_qpread_destroy(table, &qpr);
rcu_read_unlock();
return result;
}
@ -528,20 +554,27 @@ dns_ntatable_save(dns_ntatable_t *ntatable, FILE *fp) {
REQUIRE(VALID_NTATABLE(ntatable));
RWLOCK(&ntatable->rwlock, isc_rwlocktype_read);
dns_qpmulti_query(ntatable->table, &qpr);
rcu_read_lock();
dns_qpmulti_t *table = rcu_dereference(ntatable->table);
if (table == NULL) {
rcu_read_unlock();
return ISC_R_SUCCESS;
}
dns_qpmulti_query(table, &qpr);
dns_qpiter_init(&qpr, &iter);
while (dns_qpiter_next(&iter, &pval, NULL) == ISC_R_SUCCESS) {
dns__nta_t *n = pval;
isc_buffer_t b;
char nbuf[DNS_NAME_FORMATSIZE + 1], tbuf[80];
isc_stdtime_t expiry = CMM_LOAD_SHARED(n->expiry);
/*
* Skip this node if the expiry is already in the
* past, or if this is a "validate-except" entry.
*/
if (n->expiry <= now || n->expiry == 0xffffffffU) {
if (expiry <= now || expiry == 0xffffffffU) {
continue;
}
@ -555,7 +588,7 @@ dns_ntatable_save(dns_ntatable_t *ntatable, FILE *fp) {
isc_buffer_putuint8(&b, 0);
isc_buffer_init(&b, tbuf, sizeof(tbuf));
dns_time32_totext(n->expiry, &b);
dns_time32_totext(expiry, &b);
/* Zero terminate */
isc_buffer_putuint8(&b, 0);
@ -565,8 +598,8 @@ dns_ntatable_save(dns_ntatable_t *ntatable, FILE *fp) {
written = true;
}
dns_qpread_destroy(ntatable->table, &qpr);
RWUNLOCK(&ntatable->rwlock, isc_rwlocktype_read);
dns_qpread_destroy(table, &qpr);
rcu_read_unlock();
if (result == ISC_R_SUCCESS && !written) {
result = ISC_R_NOTFOUND;
@ -602,31 +635,34 @@ dns__nta_shutdown(dns__nta_t *nta) {
dns__nta_ref(nta);
isc_async_run(nta->loop, dns__nta_shutdown_cb, nta);
nta->shuttingdown = true;
}
void
dns_ntatable_shutdown(dns_ntatable_t *ntatable) {
REQUIRE(VALID_NTATABLE(ntatable));
dns_view_t *view = rcu_xchg_pointer(&ntatable->view, NULL);
dns_qpmulti_t *table = rcu_xchg_pointer(&ntatable->table, NULL);
synchronize_rcu();
dns_qpread_t qpr;
dns_qpiter_t iter;
void *pval = NULL;
REQUIRE(VALID_NTATABLE(ntatable));
RWLOCK(&ntatable->rwlock, isc_rwlocktype_write);
dns_qpmulti_query(ntatable->table, &qpr);
ntatable->shuttingdown = true;
dns_qpmulti_query(table, &qpr);
dns_qpiter_init(&qpr, &iter);
while (dns_qpiter_next(&iter, &pval, NULL) == ISC_R_SUCCESS) {
dns__nta_t *n = pval;
dns__nta_shutdown(n);
dns__nta_detach(&n);
dns__nta_t *nta = pval;
dns__nta_shutdown(nta);
dns__nta_detach(&nta);
}
dns_qpread_destroy(ntatable->table, &qpr);
dns_view_weakdetach(&ntatable->view);
RWUNLOCK(&ntatable->rwlock, isc_rwlocktype_write);
dns_qpread_destroy(table, &qpr);
dns_view_weakdetach(&view);
dns_qpmulti_destroy(&table);
}
static void