From 28fe8104eed82d2053b9e2a2d06bd29cb3085ae5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Sur=C3=BD?= Date: Mon, 13 Feb 2023 16:16:26 +0100 Subject: [PATCH 1/6] Add isc_hashmap_find() DbC check for valuep This adds a DbC check, so we don't pass non-NULL memory for valuep to the isc_hashmap_find() function. --- lib/isc/hashmap.c | 3 ++- tests/isc/hashmap_test.c | 5 ++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lib/isc/hashmap.c b/lib/isc/hashmap.c index 2ea7a25182..1a8f15e0a2 100644 --- a/lib/isc/hashmap.c +++ b/lib/isc/hashmap.c @@ -332,8 +332,9 @@ isc_hashmap_find(const isc_hashmap_t *hashmap, const uint32_t *hashvalp, const void *key, uint32_t keysize, void **valuep) { REQUIRE(ISC_HASHMAP_VALID(hashmap)); REQUIRE(key != NULL && keysize <= UINT16_MAX); + REQUIRE(valuep == NULL || *valuep == NULL); - hashmap_node_t *node; + hashmap_node_t *node = NULL; uint8_t idx = hashmap->hindex; uint32_t hashval = (hashvalp != NULL) ? *hashvalp diff --git a/tests/isc/hashmap_test.c b/tests/isc/hashmap_test.c index 99b7951f57..b7e755a1fe 100644 --- a/tests/isc/hashmap_test.c +++ b/tests/isc/hashmap_test.c @@ -381,7 +381,6 @@ ISC_RUN_TEST_IMPL(isc_hashmap_case) { test_node_t lower = { .key = "isc_hashmap_case" }; test_node_t upper = { .key = "ISC_HASHMAP_CASE" }; test_node_t mixed = { .key = "IsC_hAsHmAp_CaSe" }; - test_node_t *value; isc_hashmap_create(mctx, 1, ISC_HASHMAP_CASE_SENSITIVE, &hashmap); @@ -398,7 +397,7 @@ ISC_RUN_TEST_IMPL(isc_hashmap_case) { assert_int_equal(result, ISC_R_SUCCESS); result = isc_hashmap_find(hashmap, NULL, mixed.key, strlen(mixed.key), - (void *)&value); + &(void *){ NULL }); assert_int_equal(result, ISC_R_NOTFOUND); isc_hashmap_destroy(&hashmap); @@ -418,7 +417,7 @@ ISC_RUN_TEST_IMPL(isc_hashmap_case) { assert_int_equal(result, ISC_R_EXISTS); result = isc_hashmap_find(hashmap, NULL, mixed.key, strlen(mixed.key), - (void *)&value); + &(void *){ NULL }); assert_int_equal(result, ISC_R_SUCCESS); 
isc_hashmap_destroy(&hashmap); From af12241f67e33ffb8d24db2f769fd0a79afde54b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Sur=C3=BD?= Date: Tue, 14 Feb 2023 13:40:45 +0100 Subject: [PATCH 2/6] Add missing include to dns/badcache.c The dns_badcache was pulling the <isc/mutex.h> header only indirectly via <isc/rwlock.h>, add the direct include as the <isc/rwlock.h> no longer pulls the <isc/mutex.h> header when pthread_rwlock is used. --- lib/dns/badcache.c | 1 + 1 file changed, 1 insertion(+) diff --git a/lib/dns/badcache.c b/lib/dns/badcache.c index 3ef1dfaeb6..64363ea793 100644 --- a/lib/dns/badcache.c +++ b/lib/dns/badcache.c @@ -16,6 +16,7 @@ #include #include +#include <isc/mutex.h> #include #include #include From 6ffda5920e5092d4586a2680802143be755feb8b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Sur=C3=BD?= Date: Wed, 24 Mar 2021 17:52:56 +0100 Subject: [PATCH 3/6] Add the reader-writer synchronization with modified C-RW-WP MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This changes the internal isc_rwlock implementation to: Irina Calciu, Dave Dice, Yossi Lev, Victor Luchangco, Virendra J. Marathe, and Nir Shavit. 2013. NUMA-aware reader-writer locks. SIGPLAN Not. 48, 8 (August 2013), 157–166. DOI:https://doi.org/10.1145/2517327.2442532 (The full article available from: http://mcg.cs.tau.ac.il/papers/ppopp2013-rwlocks.pdf) The implementation is based on the Writer-Preference Lock (C-RW-WP) variant (see the 3.4 section of the paper for the rationale). The implemented algorithm has been modified for simplicity and for usage patterns in rbtdb.c. The changes compared to the original algorithm: * We haven't implemented the cohort locks because that would require a knowledge of NUMA nodes, instead a simple atomic_bool is used as synchronization point for writer lock. 
* The per-thread reader counters are not being used - this would require the internal thread id (isc_tid_v) to be always initialized, even in the utilities; the change has a slight performance penalty, so we might revisit this change in the future. However, this change also saves a lot of memory, because cache-line aligned counters were used, so on 32-core machine, the rwlock would be 4096+ bytes big. * The readers use a writer_barrier that will raise after a while when readers lock can't be acquired to prevent readers starvation. * Separate ingress and egress readers counters queues to reduce both inter and intra-thread contention. --- bin/dnssec/dnssec-signzone.c | 2 +- lib/dns/acl.c | 2 +- lib/dns/badcache.c | 2 +- lib/dns/db.c | 2 +- lib/dns/dlz.c | 2 +- lib/dns/forward.c | 2 +- lib/dns/keytable.c | 4 +- lib/dns/nta.c | 2 +- lib/dns/rbtdb.c | 14 +- lib/dns/rpz.c | 2 +- lib/dns/transport.c | 2 +- lib/dns/tsig.c | 2 +- lib/dns/view.c | 2 +- lib/dns/zone.c | 8 +- lib/dns/zt.c | 2 +- lib/isc/include/isc/mutex.h | 2 + lib/isc/include/isc/rwlock.h | 141 ++++---- lib/isc/include/isc/util.h | 14 + lib/isc/log.c | 2 +- lib/isc/managers.c | 4 + lib/isc/rwlock.c | 649 ++++++++++++----------------------- lib/isc/tls.c | 2 +- 22 files changed, 358 insertions(+), 506 deletions(-) diff --git a/bin/dnssec/dnssec-signzone.c b/bin/dnssec/dnssec-signzone.c index 6f2328c72e..39ae3a35b9 100644 --- a/bin/dnssec/dnssec-signzone.c +++ b/bin/dnssec/dnssec-signzone.c @@ -3778,7 +3778,7 @@ main(int argc, char *argv[]) { * of keys rather early. 
*/ ISC_LIST_INIT(keylist); - isc_rwlock_init(&keylist_lock, 0, 0); + isc_rwlock_init(&keylist_lock); /* * Fill keylist with: diff --git a/lib/dns/acl.c b/lib/dns/acl.c index 09f39bf28f..8678490906 100644 --- a/lib/dns/acl.c +++ b/lib/dns/acl.c @@ -696,7 +696,7 @@ dns_aclenv_create(isc_mem_t *mctx, dns_aclenv_t **envp) { isc_mem_attach(mctx, &env->mctx); isc_refcount_init(&env->references, 1); - isc_rwlock_init(&env->rwlock, 0, 0); + isc_rwlock_init(&env->rwlock); result = dns_acl_create(mctx, 0, &env->localhost); if (result != ISC_R_SUCCESS) { diff --git a/lib/dns/badcache.c b/lib/dns/badcache.c index 64363ea793..cff5dcadf0 100644 --- a/lib/dns/badcache.c +++ b/lib/dns/badcache.c @@ -83,7 +83,7 @@ dns_badcache_init(isc_mem_t *mctx, unsigned int size, dns_badcache_t **bcp) { }; isc_mem_attach(mctx, &bc->mctx); - isc_rwlock_init(&bc->lock, 0, 0); + isc_rwlock_init(&bc->lock); bc->table = isc_mem_getx(bc->mctx, sizeof(bc->table[0]) * size, ISC_MEM_ZERO); diff --git a/lib/dns/db.c b/lib/dns/db.c index e0a46c9856..87549bbcb4 100644 --- a/lib/dns/db.c +++ b/lib/dns/db.c @@ -70,7 +70,7 @@ static dns_dbimplementation_t rbtimp; static void initialize(void) { - isc_rwlock_init(&implock, 0, 0); + isc_rwlock_init(&implock); rbtimp.name = "rbt"; rbtimp.create = dns_rbtdb_create; diff --git a/lib/dns/dlz.c b/lib/dns/dlz.c index 7515943912..3cd390600c 100644 --- a/lib/dns/dlz.c +++ b/lib/dns/dlz.c @@ -83,7 +83,7 @@ static isc_once_t once = ISC_ONCE_INIT; static void dlz_initialize(void) { - isc_rwlock_init(&dlz_implock, 0, 0); + isc_rwlock_init(&dlz_implock); ISC_LIST_INIT(dlz_implementations); } diff --git a/lib/dns/forward.c b/lib/dns/forward.c index ef05529d83..20cb709472 100644 --- a/lib/dns/forward.c +++ b/lib/dns/forward.c @@ -54,7 +54,7 @@ dns_fwdtable_create(isc_mem_t *mctx, dns_fwdtable_t **fwdtablep) { goto cleanup_fwdtable; } - isc_rwlock_init(&fwdtable->rwlock, 0, 0); + isc_rwlock_init(&fwdtable->rwlock); fwdtable->mctx = NULL; isc_mem_attach(mctx, &fwdtable->mctx); 
fwdtable->magic = FWDTABLEMAGIC; diff --git a/lib/dns/keytable.c b/lib/dns/keytable.c index 2e9922509b..deb6ca01de 100644 --- a/lib/dns/keytable.c +++ b/lib/dns/keytable.c @@ -158,7 +158,7 @@ dns_keytable_create(isc_mem_t *mctx, dns_keytable_t **keytablep) { goto cleanup_keytable; } - isc_rwlock_init(&keytable->rwlock, 0, 0); + isc_rwlock_init(&keytable->rwlock); isc_refcount_init(&keytable->references, 1); keytable->mctx = NULL; @@ -342,7 +342,7 @@ new_keynode(dns_rdata_ds_t *ds, dns_keytable_t *keytable, bool managed, dns_rdataset_init(&knode->dsset); isc_refcount_init(&knode->refcount, 1); - isc_rwlock_init(&knode->rwlock, 0, 0); + isc_rwlock_init(&knode->rwlock); /* * If a DS was supplied, initialize an rdatalist. diff --git a/lib/dns/nta.c b/lib/dns/nta.c index 92fb14c535..c913c7b281 100644 --- a/lib/dns/nta.c +++ b/lib/dns/nta.c @@ -136,7 +136,7 @@ dns_ntatable_create(dns_view_t *view, isc_taskmgr_t *taskmgr, goto cleanup_task; } - isc_rwlock_init(&ntatable->rwlock, 0, 0); + isc_rwlock_init(&ntatable->rwlock); isc_refcount_init(&ntatable->references, 1); diff --git a/lib/dns/rbtdb.c b/lib/dns/rbtdb.c index b41f05d73d..a416ab66f7 100644 --- a/lib/dns/rbtdb.c +++ b/lib/dns/rbtdb.c @@ -104,7 +104,7 @@ typedef uint32_t rbtdb_rdatatype_t; RBTDB_RDATATYPE_VALUE(dns_rdatatype_rrsig, dns_rdatatype_soa) #define RBTDB_RDATATYPE_NCACHEANY RBTDB_RDATATYPE_VALUE(0, dns_rdatatype_any) -#define RBTDB_INITLOCK(l) isc_rwlock_init((l), 0, 0) +#define RBTDB_INITLOCK(l) isc_rwlock_init((l)) #define RBTDB_DESTROYLOCK(l) isc_rwlock_destroy(l) #define RBTDB_LOCK(l, t) RWLOCK((l), (t)) #define RBTDB_UNLOCK(l, t) RWUNLOCK((l), (t)) @@ -117,7 +117,7 @@ typedef uint32_t rbtdb_rdatatype_t; typedef isc_rwlock_t nodelock_t; -#define NODE_INITLOCK(l) isc_rwlock_init((l), 0, 0) +#define NODE_INITLOCK(l) isc_rwlock_init((l)) #define NODE_DESTROYLOCK(l) isc_rwlock_destroy(l) #define NODE_LOCK(l, t, tp) \ { \ @@ -161,7 +161,7 @@ typedef isc_rwlock_t nodelock_t; typedef isc_rwlock_t treelock_t; 
-#define TREE_INITLOCK(l) isc_rwlock_init(l, 0, 0) +#define TREE_INITLOCK(l) isc_rwlock_init(l) #define TREE_DESTROYLOCK(l) isc_rwlock_destroy(l) #define TREE_LOCK(l, t, tp) \ { \ @@ -1294,7 +1294,7 @@ allocate_version(isc_mem_t *mctx, rbtdb_serial_t serial, version->serial = serial; isc_refcount_init(&version->references, references); - isc_rwlock_init(&version->glue_rwlock, 0, 0); + isc_rwlock_init(&version->glue_rwlock); version->glue_table_bits = ISC_HASH_MIN_BITS; version->glue_table_nodecount = 0U; @@ -1343,7 +1343,7 @@ newversion(dns_db_t *db, dns_dbversion_t **versionp) { version->salt_length = 0; memset(version->salt, 0, sizeof(version->salt)); } - isc_rwlock_init(&version->rwlock, 0, 0); + isc_rwlock_init(&version->rwlock); RWLOCK(&rbtdb->current_version->rwlock, isc_rwlocktype_read); version->records = rbtdb->current_version->records; version->xfrsize = rbtdb->current_version->xfrsize; @@ -2207,7 +2207,7 @@ prune_tree(void *arg) { node = parent; } while (node != NULL); NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, &nlocktype); - RWUNLOCK(&rbtdb->tree_lock, tlocktype); + TREE_UNLOCK(&rbtdb->tree_lock, &tlocktype); detach((dns_db_t **)&rbtdb); } @@ -8431,7 +8431,7 @@ dns_rbtdb_create(isc_mem_t *mctx, const dns_name_t *origin, dns_dbtype_t type, rbtdb->current_version->salt_length = 0; memset(rbtdb->current_version->salt, 0, sizeof(rbtdb->current_version->salt)); - isc_rwlock_init(&rbtdb->current_version->rwlock, 0, 0); + isc_rwlock_init(&rbtdb->current_version->rwlock); rbtdb->current_version->records = 0; rbtdb->current_version->xfrsize = 0; rbtdb->future_version = NULL; diff --git a/lib/dns/rpz.c b/lib/dns/rpz.c index 8dc62d29e2..75ee525f7d 100644 --- a/lib/dns/rpz.c +++ b/lib/dns/rpz.c @@ -1481,7 +1481,7 @@ dns_rpz_new_zones(isc_mem_t *mctx, isc_loopmgr_t *loopmgr, char *rps_cstr, .magic = DNS_RPZ_ZONES_MAGIC, }; - isc_rwlock_init(&rpzs->search_lock, 0, 0); + isc_rwlock_init(&rpzs->search_lock); isc_mutex_init(&rpzs->maint_lock); 
isc_refcount_init(&rpzs->references, 1); diff --git a/lib/dns/transport.c b/lib/dns/transport.c index 9c6a123081..d76f62f3d5 100644 --- a/lib/dns/transport.c +++ b/lib/dns/transport.c @@ -652,7 +652,7 @@ dns_transport_list_new(isc_mem_t *mctx) { *list = (dns_transport_list_t){ 0 }; - isc_rwlock_init(&list->lock, 0, 0); + isc_rwlock_init(&list->lock); isc_mem_attach(mctx, &list->mctx); isc_refcount_init(&list->references, 1); diff --git a/lib/dns/tsig.c b/lib/dns/tsig.c index 3dca68930b..1bf652a159 100644 --- a/lib/dns/tsig.c +++ b/lib/dns/tsig.c @@ -1812,7 +1812,7 @@ dns_tsigkeyring_create(isc_mem_t *mctx, dns_tsig_keyring_t **ringp) { ring = isc_mem_get(mctx, sizeof(dns_tsig_keyring_t)); - isc_rwlock_init(&ring->lock, 0, 0); + isc_rwlock_init(&ring->lock); ring->keys = NULL; result = dns_rbt_create(mctx, free_tsignode, NULL, &ring->keys); if (result != ISC_R_SUCCESS) { diff --git a/lib/dns/view.c b/lib/dns/view.c index e8a53aa108..0c7de74463 100644 --- a/lib/dns/view.c +++ b/lib/dns/view.c @@ -133,7 +133,7 @@ dns_view_create(isc_mem_t *mctx, dns_rdataclass_t rdclass, const char *name, isc_mutex_init(&view->lock); - isc_rwlock_init(&view->sfd_lock, 0, 0); + isc_rwlock_init(&view->sfd_lock); view->zonetable = NULL; result = dns_zt_create(mctx, rdclass, &view->zonetable); diff --git a/lib/dns/zone.c b/lib/dns/zone.c index 951bbfd34c..3a261d0b49 100644 --- a/lib/dns/zone.c +++ b/lib/dns/zone.c @@ -214,7 +214,7 @@ typedef struct dns_include dns_include_t; } while (0) #endif /* ifdef DNS_ZONE_CHECKLOCK */ -#define ZONEDB_INITLOCK(l) isc_rwlock_init((l), 0, 0) +#define ZONEDB_INITLOCK(l) isc_rwlock_init((l)) #define ZONEDB_DESTROYLOCK(l) isc_rwlock_destroy(l) #define ZONEDB_LOCK(l, t) RWLOCK((l), (t)) #define ZONEDB_UNLOCK(l, t) RWUNLOCK((l), (t)) @@ -17942,7 +17942,7 @@ zonemgr_keymgmt_init(dns_zonemgr_t *zmgr) { }; isc_mem_attach(zmgr->mctx, &mgmt->mctx); - isc_rwlock_init(&mgmt->lock, 0, 0); + isc_rwlock_init(&mgmt->lock); isc_hashmap_create(mgmt->mctx, 
DNS_KEYMGMT_HASH_BITS, ISC_HASHMAP_CASE_SENSITIVE, &mgmt->table); @@ -18074,10 +18074,10 @@ dns_zonemgr_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr, for (size_t i = 0; i < UNREACH_CACHE_SIZE; i++) { atomic_init(&zmgr->unreachable[i].expire, 0); } - isc_rwlock_init(&zmgr->rwlock, 0, 0); + isc_rwlock_init(&zmgr->rwlock); /* Unreachable lock. */ - isc_rwlock_init(&zmgr->urlock, 0, 0); + isc_rwlock_init(&zmgr->urlock); isc_ratelimiter_create(loop, &zmgr->checkdsrl); isc_ratelimiter_create(loop, &zmgr->notifyrl); diff --git a/lib/dns/zt.c b/lib/dns/zt.c index a0c300aaf0..da4eac74ce 100644 --- a/lib/dns/zt.c +++ b/lib/dns/zt.c @@ -95,7 +95,7 @@ dns_zt_create(isc_mem_t *mctx, dns_rdataclass_t rdclass, dns_zt_t **ztp) { goto cleanup_zt; } - isc_rwlock_init(&zt->rwlock, 0, 0); + isc_rwlock_init(&zt->rwlock); zt->mctx = NULL; isc_mem_attach(mctx, &zt->mctx); isc_refcount_init(&zt->references, 1); diff --git a/lib/isc/include/isc/mutex.h b/lib/isc/include/isc/mutex.h index df24922336..45256a6fac 100644 --- a/lib/isc/include/isc/mutex.h +++ b/lib/isc/include/isc/mutex.h @@ -23,6 +23,8 @@ #include /* for ISC_R_ codes */ #include +#define ISC_MUTEX_INITIALIZER PTHREAD_MUTEX_INITIALIZER + ISC_LANG_BEGINDECLS /* diff --git a/lib/isc/include/isc/rwlock.h b/lib/isc/include/isc/rwlock.h index 44a44a86cc..9d5c54053e 100644 --- a/lib/isc/include/isc/rwlock.h +++ b/lib/isc/include/isc/rwlock.h @@ -18,8 +18,6 @@ /*! 
\file isc/rwlock.h */ -#include -#include #include #include #include @@ -46,10 +44,10 @@ typedef enum { typedef pthread_rwlock_t *isc_rwlock_t; typedef pthread_rwlock_t isc__rwlock_t; -#define isc_rwlock_init(rwl, rq, wq) \ - { \ - *rwl = malloc(sizeof(**rwl)); \ - isc__rwlock_init(*rwl, rq, wq); \ +#define isc_rwlock_init(rwl) \ + { \ + *rwl = malloc(sizeof(**rwl)); \ + isc__rwlock_init(*rwl); \ } #define isc_rwlock_lock(rwl, type) isc__rwlock_lock(*rwl, type) #define isc_rwlock_trylock(rwl, type) isc__rwlock_trylock(*rwl, type) @@ -66,7 +64,7 @@ typedef pthread_rwlock_t isc__rwlock_t; typedef pthread_rwlock_t isc_rwlock_t; typedef pthread_rwlock_t isc__rwlock_t; -#define isc_rwlock_init(rwl, rq, wq) isc__rwlock_init(rwl, rq, wq) +#define isc_rwlock_init(rwl) isc__rwlock_init(rwl) #define isc_rwlock_lock(rwl, type) isc__rwlock_lock(rwl, type) #define isc_rwlock_trylock(rwl, type) isc__rwlock_trylock(rwl, type) #define isc_rwlock_unlock(rwl, type) isc__rwlock_unlock(rwl, type) @@ -75,7 +73,7 @@ typedef pthread_rwlock_t isc__rwlock_t; #endif /* ISC_TRACK_PTHREADS_OBJECTS */ -#define isc__rwlock_init(rwl, read_quota, write_quote) \ +#define isc__rwlock_init(rwl) \ { \ int _ret = pthread_rwlock_init(rwl, NULL); \ PTHREADS_RUNTIME_CHECK(pthread_rwlock_init, _ret); \ @@ -158,72 +156,99 @@ typedef pthread_rwlock_t isc__rwlock_t; PTHREADS_RUNTIME_CHECK(pthread_rwlock_destroy, _ret); \ } +#define isc_rwlock_setworkers(workers) + #else /* USE_PTHREAD_RWLOCK */ +#include +#include +#include + struct isc_rwlock { - /* Unlocked. */ - unsigned int magic; - isc_mutex_t lock; - atomic_int_fast32_t spins; - - /* - * When some atomic instructions with hardware assistance are - * available, rwlock will use those so that concurrent readers do not - * interfere with each other through mutex as long as no writers - * appear, massively reducing the lock overhead in the typical case. 
- * - * The basic algorithm of this approach is the "simple - * writer-preference lock" shown in the following URL: - * http://www.cs.rochester.edu/u/scott/synchronization/pseudocode/rw.html - * but our implementation does not rely on the spin lock unlike the - * original algorithm to be more portable as a user space application. - */ - - /* Read or modified atomically. */ - atomic_int_fast32_t write_requests; - atomic_int_fast32_t write_completions; - atomic_int_fast32_t cnt_and_flag; - - /* Locked by lock. */ - isc_condition_t readable; - isc_condition_t writeable; - unsigned int readers_waiting; - - /* Locked by rwlock itself. */ - atomic_uint_fast32_t write_granted; - - /* Unlocked. */ - unsigned int write_quota; + alignas(ISC_OS_CACHELINE_SIZE) atomic_uint_fast32_t readers_ingress; + alignas(ISC_OS_CACHELINE_SIZE) atomic_uint_fast32_t readers_egress; + alignas(ISC_OS_CACHELINE_SIZE) atomic_int_fast32_t writers_barrier; + alignas(ISC_OS_CACHELINE_SIZE) atomic_bool writers_lock; }; typedef struct isc_rwlock isc_rwlock_t; -typedef struct isc_rwlock isc__rwlock_t; - -#define isc_rwlock_init(rwl, rq, wq) isc__rwlock_init(rwl, rq, wq) -#define isc_rwlock_lock(rwl, type) isc__rwlock_lock(rwl, type) -#define isc_rwlock_trylock(rwl, type) isc__rwlock_trylock(rwl, type) -#define isc_rwlock_unlock(rwl, type) isc__rwlock_unlock(rwl, type) -#define isc_rwlock_tryupgrade(rwl) isc__rwlock_tryupgrade(rwl) -#define isc_rwlock_destroy(rwl) isc__rwlock_destroy(rwl) void -isc__rwlock_init(isc__rwlock_t *rwl, unsigned int read_quota, - unsigned int write_quota); +isc_rwlock_init(isc_rwlock_t *rwl); void -isc__rwlock_lock(isc__rwlock_t *rwl, isc_rwlocktype_t type); +isc_rwlock_rdlock(isc_rwlock_t *rwl); + +void +isc_rwlock_wrlock(isc_rwlock_t *rwl); isc_result_t -isc__rwlock_trylock(isc__rwlock_t *rwl, isc_rwlocktype_t type); - -void -isc__rwlock_unlock(isc__rwlock_t *rwl, isc_rwlocktype_t type); +isc_rwlock_tryrdlock(isc_rwlock_t *rwl); isc_result_t 
-isc__rwlock_tryupgrade(isc__rwlock_t *rwl); +isc_rwlock_trywrlock(isc_rwlock_t *rwl); void -isc__rwlock_destroy(isc__rwlock_t *rwl); +isc_rwlock_rdunlock(isc_rwlock_t *rwl); + +void +isc_rwlock_wrunlock(isc_rwlock_t *rwl); + +isc_result_t +isc_rwlock_tryupgrade(isc_rwlock_t *rwl); + +void +isc_rwlock_downgrade(isc_rwlock_t *rwl); + +void +isc_rwlock_destroy(isc_rwlock_t *rwl); + +void +isc_rwlock_setworkers(uint16_t workers); + +#define isc_rwlock_lock(rwl, type) \ + { \ + switch (type) { \ + case isc_rwlocktype_read: \ + isc_rwlock_rdlock(rwl); \ + break; \ + case isc_rwlocktype_write: \ + isc_rwlock_wrlock(rwl); \ + break; \ + default: \ + UNREACHABLE(); \ + } \ + } + +#define isc_rwlock_trylock(rwl, type) \ + ({ \ + int __result; \ + switch (type) { \ + case isc_rwlocktype_read: \ + __result = isc_rwlock_tryrdlock(rwl); \ + break; \ + case isc_rwlocktype_write: \ + __result = isc_rwlock_trywrlock(rwl); \ + break; \ + default: \ + UNREACHABLE(); \ + } \ + __result; \ + }) + +#define isc_rwlock_unlock(rwl, type) \ + { \ + switch (type) { \ + case isc_rwlocktype_read: \ + isc_rwlock_rdunlock(rwl); \ + break; \ + case isc_rwlocktype_write: \ + isc_rwlock_wrunlock(rwl); \ + break; \ + default: \ + UNREACHABLE(); \ + } \ + } #endif /* USE_PTHREAD_RWLOCK */ diff --git a/lib/isc/include/isc/util.h b/lib/isc/include/isc/util.h index f24a8da2d5..35d9050158 100644 --- a/lib/isc/include/isc/util.h +++ b/lib/isc/include/isc/util.h @@ -198,6 +198,20 @@ #define WRLOCK(lp) RWLOCK(lp, isc_rwlocktype_write) #define WRUNLOCK(lp) RWUNLOCK(lp, isc_rwlocktype_write) +#define UPGRADELOCK(lock, locktype) \ + { \ + if (locktype == isc_rwlocktype_read) { \ + if (isc_rwlock_tryupgrade(lock) == ISC_R_SUCCESS) { \ + locktype = isc_rwlocktype_write; \ + } else { \ + RWUNLOCK(lock, locktype); \ + locktype = isc_rwlocktype_write; \ + RWLOCK(lock, locktype); \ + } \ + } \ + INSIST(locktype == isc_rwlocktype_write); \ + } + /* * List Macros. 
*/ diff --git a/lib/isc/log.c b/lib/isc/log.c index 5547495f7f..43e371a813 100644 --- a/lib/isc/log.c +++ b/lib/isc/log.c @@ -264,7 +264,7 @@ isc_log_create(isc_mem_t *mctx, isc_log_t **lctxp, isc_logconfig_t **lcfgp) { ISC_LIST_INIT(lctx->messages); isc_mutex_init(&lctx->lock); - isc_rwlock_init(&lctx->lcfg_rwl, 0, 0); + isc_rwlock_init(&lctx->lcfg_rwl); /* * Normally setting the magic number is the last step done diff --git a/lib/isc/managers.c b/lib/isc/managers.c index 2fa8177720..9fcade3e61 100644 --- a/lib/isc/managers.c +++ b/lib/isc/managers.c @@ -12,7 +12,9 @@ */ #include +#include #include +#include void isc_managers_create(isc_mem_t **mctxp, uint32_t workers, @@ -33,6 +35,8 @@ isc_managers_create(isc_mem_t **mctxp, uint32_t workers, REQUIRE(taskmgrp != NULL && *taskmgrp == NULL); isc_taskmgr_create(*mctxp, *loopmgrp, taskmgrp); INSIST(*taskmgrp != NULL); + + isc_rwlock_setworkers(workers); } void diff --git a/lib/isc/rwlock.c b/lib/isc/rwlock.c index 62bdbf1a62..ae4fc388a0 100644 --- a/lib/isc/rwlock.c +++ b/lib/isc/rwlock.c @@ -11,396 +11,247 @@ * information regarding copyright ownership. */ +/* + * Modified C-RW-WP Implementation from NUMA-Aware Reader-Writer Locks paper: + * http://dl.acm.org/citation.cfm?id=2442532 + * + * This work is based on C++ code available from + * https://github.com/pramalhe/ConcurrencyFreaks/ + * + * Copyright (c) 2014-2016, Pedro Ramalhete, Andreia Correia + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. 
+ * * Neither the name of Concurrency Freaks nor the + * names of its contributors may be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS + * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED + * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A + * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL + * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR + * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF + * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF + * THE POSSIBILITY OF SUCH DAMAGE. + */ + /*! \file */ #include #include #include +#include +#include #include -#include +#include #include -#include #include +#include +#include #include -#define RWLOCK_MAGIC ISC_MAGIC('R', 'W', 'L', 'k') -#define VALID_RWLOCK(rwl) ISC_MAGIC_VALID(rwl, RWLOCK_MAGIC) +static atomic_uint_fast16_t isc__crwlock_workers = 128; -#ifndef RWLOCK_DEFAULT_READ_QUOTA -#define RWLOCK_DEFAULT_READ_QUOTA 4 -#endif /* ifndef RWLOCK_DEFAULT_READ_QUOTA */ - -#ifndef RWLOCK_DEFAULT_WRITE_QUOTA -#define RWLOCK_DEFAULT_WRITE_QUOTA 4 -#endif /* ifndef RWLOCK_DEFAULT_WRITE_QUOTA */ - -#ifndef RWLOCK_MAX_ADAPTIVE_COUNT -#define RWLOCK_MAX_ADAPTIVE_COUNT 100 -#endif /* ifndef RWLOCK_MAX_ADAPTIVE_COUNT */ - -#ifdef ISC_RWLOCK_TRACE -#include /* Required for fprintf/stderr. */ - -#include /* Required for isc_thread_self(). 
*/ - -static void -print_lock(const char *operation, isc__rwlock_t *rwl, isc_rwlocktype_t type) { - fprintf(stderr, - "rwlock %p thread %" PRIuPTR " %s(%s): " - "write_requests=%u, write_completions=%u, " - "cnt_and_flag=0x%x, readers_waiting=%u, " - "write_granted=%u, write_quota=%u\n", - rwl, isc_thread_self(), operation, - (type == isc_rwlocktype_read ? "read" : "write"), - atomic_load_acquire(&rwl->write_requests), - atomic_load_acquire(&rwl->write_completions), - atomic_load_acquire(&rwl->cnt_and_flag), rwl->readers_waiting, - atomic_load_acquire(&rwl->write_granted), rwl->write_quota); -} -#endif /* ISC_RWLOCK_TRACE */ - -void -isc__rwlock_init(isc__rwlock_t *rwl, unsigned int read_quota, - unsigned int write_quota) { - REQUIRE(rwl != NULL); - - /* - * In case there's trouble initializing, we zero magic now. If all - * goes well, we'll set it to RWLOCK_MAGIC. - */ - rwl->magic = 0; - - atomic_init(&rwl->spins, 0); - atomic_init(&rwl->write_requests, 0); - atomic_init(&rwl->write_completions, 0); - atomic_init(&rwl->cnt_and_flag, 0); - rwl->readers_waiting = 0; - atomic_init(&rwl->write_granted, 0); - if (read_quota != 0) { - UNEXPECTED_ERROR("read quota is not supported"); - } - if (write_quota == 0) { - write_quota = RWLOCK_DEFAULT_WRITE_QUOTA; - } - rwl->write_quota = write_quota; - - isc_mutex_init(&rwl->lock); - - isc_condition_init(&rwl->readable); - isc_condition_init(&rwl->writeable); - - rwl->magic = RWLOCK_MAGIC; -} - -void -isc__rwlock_destroy(isc__rwlock_t *rwl) { - REQUIRE(VALID_RWLOCK(rwl)); - - REQUIRE(atomic_load_acquire(&rwl->write_requests) == - atomic_load_acquire(&rwl->write_completions) && - atomic_load_acquire(&rwl->cnt_and_flag) == 0 && - rwl->readers_waiting == 0); - - rwl->magic = 0; - isc_condition_destroy(&rwl->readable); - isc_condition_destroy(&rwl->writeable); - isc_mutex_destroy(&rwl->lock); -} +#define ISC_RWLOCK_UNLOCKED false +#define ISC_RWLOCK_LOCKED true /* - * When some architecture-dependent atomic operations are 
available, - * rwlock can be more efficient than the generic algorithm defined below. - * The basic algorithm is described in the following URL: - * http://www.cs.rochester.edu/u/scott/synchronization/pseudocode/rw.html - * - * The key is to use the following integer variables modified atomically: - * write_requests, write_completions, and cnt_and_flag. - * - * write_requests and write_completions act as a waiting queue for writers - * in order to ensure the FIFO order. Both variables begin with the initial - * value of 0. When a new writer tries to get a write lock, it increments - * write_requests and gets the previous value of the variable as a "ticket". - * When write_completions reaches the ticket number, the new writer can start - * writing. When the writer completes its work, it increments - * write_completions so that another new writer can start working. If the - * write_requests is not equal to write_completions, it means a writer is now - * working or waiting. In this case, a new readers cannot start reading, or - * in other words, this algorithm basically prefers writers. - * - * cnt_and_flag is a "lock" shared by all readers and writers. This integer - * variable is a kind of structure with two members: writer_flag (1 bit) and - * reader_count (31 bits). The writer_flag shows whether a writer is working, - * and the reader_count shows the number of readers currently working or almost - * ready for working. A writer who has the current "ticket" tries to get the - * lock by exclusively setting the writer_flag to 1, provided that the whole - * 32-bit is 0 (meaning no readers or writers working). On the other hand, - * a new reader tries to increment the "reader_count" field provided that - * the writer_flag is 0 (meaning there is no writer working). - * - * If some of the above operations fail, the reader or the writer sleeps - * until the related condition changes. 
When a working reader or writer - * completes its work, some readers or writers are sleeping, and the condition - * that suspended the reader or writer has changed, it wakes up the sleeping - * readers or writers. - * - * As already noted, this algorithm basically prefers writers. In order to - * prevent readers from starving, however, the algorithm also introduces the - * "writer quota" (Q). When Q consecutive writers have completed their work, - * suspending readers, the last writer will wake up the readers, even if a new - * writer is waiting. - * - * Implementation specific note: due to the combination of atomic operations - * and a mutex lock, ordering between the atomic operation and locks can be - * very sensitive in some cases. In particular, it is generally very important - * to check the atomic variable that requires a reader or writer to sleep after - * locking the mutex and before actually sleeping; otherwise, it could be very - * likely to cause a deadlock. For example, assume "var" is a variable - * atomically modified, then the corresponding code would be: - * if (var == need_sleep) { - * LOCK(lock); - * if (var == need_sleep) - * WAIT(cond, lock); - * UNLOCK(lock); - * } - * The second check is important, since "var" is protected by the atomic - * operation, not by the mutex, and can be changed just before sleeping. - * (The first "if" could be omitted, but this is also important in order to - * make the code efficient by avoiding the use of the mutex unless it is - * really necessary.) 
+ * See https://csce.ucmss.com/cr/books/2017/LFS/CSREA2017/FCS3701.pdf for + * guidance on patience level */ - -#define WRITER_ACTIVE 0x1 -#define READER_INCR 0x2 +#ifndef RWLOCK_MAX_READER_PATIENCE +#define RWLOCK_MAX_READER_PATIENCE 500 +#endif /* ifndef RWLOCK_MAX_READER_PATIENCE */ static void -rwlock_lock(isc__rwlock_t *rwl, isc_rwlocktype_t type) { - int32_t cntflag; +read_indicator_wait_until_empty(isc_rwlock_t *rwl); - REQUIRE(VALID_RWLOCK(rwl)); +#include -#ifdef ISC_RWLOCK_TRACE - print_lock("prelock", rwl, type); -#endif /* ifdef ISC_RWLOCK_TRACE */ - - if (type == isc_rwlocktype_read) { - if (atomic_load_acquire(&rwl->write_requests) != - atomic_load_acquire(&rwl->write_completions)) - { - /* there is a waiting or active writer */ - LOCK(&rwl->lock); - if (atomic_load_acquire(&rwl->write_requests) != - atomic_load_acquire(&rwl->write_completions)) - { - rwl->readers_waiting++; - WAIT(&rwl->readable, &rwl->lock); - rwl->readers_waiting--; - } - UNLOCK(&rwl->lock); - } - - cntflag = atomic_fetch_add_release(&rwl->cnt_and_flag, - READER_INCR); - POST(cntflag); - while (1) { - if ((atomic_load_acquire(&rwl->cnt_and_flag) & - WRITER_ACTIVE) == 0) - { - break; - } - - /* A writer is still working */ - LOCK(&rwl->lock); - rwl->readers_waiting++; - if ((atomic_load_acquire(&rwl->cnt_and_flag) & - WRITER_ACTIVE) != 0) - { - WAIT(&rwl->readable, &rwl->lock); - } - rwl->readers_waiting--; - UNLOCK(&rwl->lock); - - /* - * Typically, the reader should be able to get a lock - * at this stage: - * (1) there should have been no pending writer when - * the reader was trying to increment the - * counter; otherwise, the writer should be in - * the waiting queue, preventing the reader from - * proceeding to this point. - * (2) once the reader increments the counter, no - * more writer can get a lock. - * Still, it is possible another writer can work at - * this point, e.g. in the following scenario: - * A previous writer unlocks the writer lock. 
- * This reader proceeds to point (1). - * A new writer appears, and gets a new lock before - * the reader increments the counter. - * The reader then increments the counter. - * The previous writer notices there is a waiting - * reader who is almost ready, and wakes it up. - * So, the reader needs to confirm whether it can now - * read explicitly (thus we loop). Note that this is - * not an infinite process, since the reader has - * incremented the counter at this point. - */ - } - - /* - * If we are temporarily preferred to writers due to the writer - * quota, reset the condition (race among readers doesn't - * matter). - */ - atomic_store_release(&rwl->write_granted, 0); - } else { - int32_t prev_writer; - - /* enter the waiting queue, and wait for our turn */ - prev_writer = atomic_fetch_add_release(&rwl->write_requests, 1); - while (atomic_load_acquire(&rwl->write_completions) != - prev_writer) - { - LOCK(&rwl->lock); - if (atomic_load_acquire(&rwl->write_completions) != - prev_writer) - { - WAIT(&rwl->writeable, &rwl->lock); - UNLOCK(&rwl->lock); - continue; - } - UNLOCK(&rwl->lock); - break; - } - - while (!atomic_compare_exchange_weak_acq_rel( - &rwl->cnt_and_flag, &(int_fast32_t){ 0 }, - WRITER_ACTIVE)) - { - /* Another active reader or writer is working. 
*/ - LOCK(&rwl->lock); - if (atomic_load_acquire(&rwl->cnt_and_flag) != 0) { - WAIT(&rwl->writeable, &rwl->lock); - } - UNLOCK(&rwl->lock); - } - - INSIST((atomic_load_acquire(&rwl->cnt_and_flag) & - WRITER_ACTIVE)); - atomic_fetch_add_release(&rwl->write_granted, 1); - } - -#ifdef ISC_RWLOCK_TRACE - print_lock("postlock", rwl, type); -#endif /* ifdef ISC_RWLOCK_TRACE */ +static void +read_indicator_arrive(isc_rwlock_t *rwl) { + (void)atomic_fetch_add_release(&rwl->readers_ingress, 1); } -void -isc__rwlock_lock(isc__rwlock_t *rwl, isc_rwlocktype_t type) { - int32_t cnt = 0; - int32_t spins = atomic_load_acquire(&rwl->spins) * 2 + 10; - int32_t max_cnt = ISC_MAX(spins, RWLOCK_MAX_ADAPTIVE_COUNT); +static void +read_indicator_depart(isc_rwlock_t *rwl) { + (void)atomic_fetch_add_release(&rwl->readers_egress, 1); +} - do { - if (cnt++ >= max_cnt) { - rwlock_lock(rwl, type); +static bool +read_indicator_isempty(isc_rwlock_t *rwl) { + return (atomic_load_acquire(&rwl->readers_egress) == + atomic_load_acquire(&rwl->readers_ingress)); +} + +static void +writers_barrier_raise(isc_rwlock_t *rwl) { + (void)atomic_fetch_add_release(&rwl->writers_barrier, 1); +} + +static void +writers_barrier_lower(isc_rwlock_t *rwl) { + (void)atomic_fetch_sub_release(&rwl->writers_barrier, 1); +} + +static bool +writers_barrier_israised(isc_rwlock_t *rwl) { + return (atomic_load_acquire(&rwl->writers_barrier) > 0); +} + +static bool +writers_lock_islocked(isc_rwlock_t *rwl) { + return (atomic_load_acquire(&rwl->writers_lock) == ISC_RWLOCK_LOCKED); +} + +static bool +writers_lock_acquire(isc_rwlock_t *rwl) { + return (atomic_compare_exchange_weak_acq_rel( + &rwl->writers_lock, &(bool){ ISC_RWLOCK_UNLOCKED }, + ISC_RWLOCK_LOCKED)); +} + +static void +writers_lock_release(isc_rwlock_t *rwl) { + REQUIRE(atomic_compare_exchange_strong_acq_rel( + &rwl->writers_lock, &(bool){ ISC_RWLOCK_LOCKED }, + ISC_RWLOCK_UNLOCKED)); +} + +#define ran_out_of_patience(cnt) (cnt >= RWLOCK_MAX_READER_PATIENCE) + 
+void +isc_rwlock_rdlock(isc_rwlock_t *rwl) { + uint32_t cnt = 0; + bool barrier_raised = false; + + while (true) { + read_indicator_arrive(rwl); + if (!writers_lock_islocked(rwl)) { + /* Acquired lock in read-only mode */ break; } - isc_pause(); - } while (isc_rwlock_trylock(rwl, type) != ISC_R_SUCCESS); - atomic_fetch_add_release(&rwl->spins, (cnt - spins) / 8); + /* Writer has acquired the lock, must reset to 0 and wait */ + read_indicator_depart(rwl); + + while (writers_lock_islocked(rwl)) { + isc_pause(); + if (ran_out_of_patience(cnt++) && !barrier_raised) { + writers_barrier_raise(rwl); + barrier_raised = true; + } + } + } + if (barrier_raised) { + writers_barrier_lower(rwl); + } } isc_result_t -isc__rwlock_trylock(isc__rwlock_t *rwl, isc_rwlocktype_t type) { - int32_t cntflag; +isc_rwlock_tryrdlock(isc_rwlock_t *rwl) { + read_indicator_arrive(rwl); + if (writers_lock_islocked(rwl)) { + /* Writer has acquired the lock, release the read lock */ + read_indicator_depart(rwl); - REQUIRE(VALID_RWLOCK(rwl)); - -#ifdef ISC_RWLOCK_TRACE - print_lock("prelock", rwl, type); -#endif /* ifdef ISC_RWLOCK_TRACE */ - - if (type == isc_rwlocktype_read) { - /* If a writer is waiting or working, we fail. */ - if (atomic_load_acquire(&rwl->write_requests) != - atomic_load_acquire(&rwl->write_completions)) - { - return (ISC_R_LOCKBUSY); - } - - /* Otherwise, be ready for reading. */ - cntflag = atomic_fetch_add_release(&rwl->cnt_and_flag, - READER_INCR); - if ((cntflag & WRITER_ACTIVE) != 0) { - /* - * A writer is working. We lose, and cancel the read - * request. - */ - cntflag = atomic_fetch_sub_release(&rwl->cnt_and_flag, - READER_INCR); - /* - * If no other readers are waiting and we've suspended - * new writers in this short period, wake them up. 
- */ - if (cntflag == READER_INCR && - atomic_load_acquire(&rwl->write_completions) != - atomic_load_acquire(&rwl->write_requests)) - { - LOCK(&rwl->lock); - BROADCAST(&rwl->writeable); - UNLOCK(&rwl->lock); - } - - return (ISC_R_LOCKBUSY); - } - } else { - /* Try locking without entering the waiting queue. */ - int_fast32_t zero = 0; - if (!atomic_compare_exchange_strong_acq_rel( - &rwl->cnt_and_flag, &zero, WRITER_ACTIVE)) - { - return (ISC_R_LOCKBUSY); - } - - /* - * XXXJT: jump into the queue, possibly breaking the writer - * order. - */ - atomic_fetch_sub_release(&rwl->write_completions, 1); - atomic_fetch_add_release(&rwl->write_granted, 1); + return (ISC_R_LOCKBUSY); } -#ifdef ISC_RWLOCK_TRACE - print_lock("postlock", rwl, type); -#endif /* ifdef ISC_RWLOCK_TRACE */ - + /* Acquired lock in read-only mode */ return (ISC_R_SUCCESS); } +void +isc_rwlock_rdunlock(isc_rwlock_t *rwl) { + read_indicator_depart(rwl); +} + isc_result_t -isc__rwlock_tryupgrade(isc__rwlock_t *rwl) { - REQUIRE(VALID_RWLOCK(rwl)); +isc_rwlock_tryupgrade(isc_rwlock_t *rwl) { + /* Write Barriers has been raised */ + if (writers_barrier_israised(rwl)) { + return (ISC_R_LOCKBUSY); + } - int_fast32_t reader_incr = READER_INCR; + /* Try to acquire the write-lock */ + if (!writers_lock_acquire(rwl)) { + return (ISC_R_LOCKBUSY); + } - /* Try to acquire write access. */ - atomic_compare_exchange_strong_acq_rel(&rwl->cnt_and_flag, &reader_incr, - WRITER_ACTIVE); - /* - * There must have been no writer, and there must have - * been at least one reader. 
- */ - INSIST((reader_incr & WRITER_ACTIVE) == 0 && - (reader_incr & ~WRITER_ACTIVE) != 0); + /* Unlock the read-lock */ + read_indicator_depart(rwl); + + if (!read_indicator_isempty(rwl)) { + /* Re-acquire the read-lock back */ + read_indicator_arrive(rwl); + + /* Unlock the write-lock */ + writers_lock_release(rwl); + return (ISC_R_LOCKBUSY); + } + return (ISC_R_SUCCESS); +} + +static void +read_indicator_wait_until_empty(isc_rwlock_t *rwl) { + /* Write-lock was acquired, now wait for running Readers to finish */ + while (true) { + if (read_indicator_isempty(rwl)) { + break; + } + isc_pause(); + } +} + +void +isc_rwlock_wrlock(isc_rwlock_t *rwl) { + /* Write Barriers have been raised, wait */ + while (writers_barrier_israised(rwl)) { + isc_pause(); + } + + /* Try to acquire the write-lock */ + while (!writers_lock_acquire(rwl)) { + isc_pause(); + } + + read_indicator_wait_until_empty(rwl); +} + +void +isc_rwlock_wrunlock(isc_rwlock_t *rwl) { + writers_lock_release(rwl); +} + +isc_result_t +isc_rwlock_trywrlock(isc_rwlock_t *rwl) { + /* Write Barriers have been raised */ + if (writers_barrier_israised(rwl)) { + return (ISC_R_LOCKBUSY); + } + + /* Try to acquire the write-lock */ + if (!writers_lock_acquire(rwl)) { + return (ISC_R_LOCKBUSY); + } + + if (!read_indicator_isempty(rwl)) { + /* Unlock the write-lock */ + writers_lock_release(rwl); - if (reader_incr == READER_INCR) { - /* - * We are the only reader and have been upgraded. - * Now jump into the head of the writer waiting queue. 
- */ - atomic_fetch_sub_release(&rwl->write_completions, 1); - } else { return (ISC_R_LOCKBUSY); } @@ -408,74 +259,30 @@ isc__rwlock_tryupgrade(isc__rwlock_t *rwl) { } void -isc__rwlock_unlock(isc__rwlock_t *rwl, isc_rwlocktype_t type) { - int32_t prev_cnt; +isc_rwlock_downgrade(isc_rwlock_t *rwl) { + read_indicator_arrive(rwl); - REQUIRE(VALID_RWLOCK(rwl)); - -#ifdef ISC_RWLOCK_TRACE - print_lock("preunlock", rwl, type); -#endif /* ifdef ISC_RWLOCK_TRACE */ - - if (type == isc_rwlocktype_read) { - prev_cnt = atomic_fetch_sub_release(&rwl->cnt_and_flag, - READER_INCR); - /* - * If we're the last reader and any writers are waiting, wake - * them up. We need to wake up all of them to ensure the - * FIFO order. - */ - if (prev_cnt == READER_INCR && - atomic_load_acquire(&rwl->write_completions) != - atomic_load_acquire(&rwl->write_requests)) - { - LOCK(&rwl->lock); - BROADCAST(&rwl->writeable); - UNLOCK(&rwl->lock); - } - } else { - bool wakeup_writers = true; - - /* - * Reset the flag, and (implicitly) tell other writers - * we are done. - */ - atomic_fetch_sub_release(&rwl->cnt_and_flag, WRITER_ACTIVE); - atomic_fetch_add_release(&rwl->write_completions, 1); - - if ((atomic_load_acquire(&rwl->write_granted) >= - rwl->write_quota) || - (atomic_load_acquire(&rwl->write_requests) == - atomic_load_acquire(&rwl->write_completions)) || - (atomic_load_acquire(&rwl->cnt_and_flag) & ~WRITER_ACTIVE)) - { - /* - * We have passed the write quota, no writer is - * waiting, or some readers are almost ready, pending - * possible writers. Note that the last case can - * happen even if write_requests != write_completions - * (which means a new writer in the queue), so we need - * to catch the case explicitly. 
- */ - LOCK(&rwl->lock); - if (rwl->readers_waiting > 0) { - wakeup_writers = false; - BROADCAST(&rwl->readable); - } - UNLOCK(&rwl->lock); - } - - if ((atomic_load_acquire(&rwl->write_requests) != - atomic_load_acquire(&rwl->write_completions)) && - wakeup_writers) - { - LOCK(&rwl->lock); - BROADCAST(&rwl->writeable); - UNLOCK(&rwl->lock); - } - } - -#ifdef ISC_RWLOCK_TRACE - print_lock("postunlock", rwl, type); -#endif /* ifdef ISC_RWLOCK_TRACE */ + writers_lock_release(rwl); +} + +void +isc_rwlock_init(isc_rwlock_t *rwl) { + REQUIRE(rwl != NULL); + + atomic_init(&rwl->writers_lock, ISC_RWLOCK_UNLOCKED); + atomic_init(&rwl->writers_barrier, 0); + atomic_init(&rwl->readers_ingress, 0); + atomic_init(&rwl->readers_egress, 0); +} + +void +isc_rwlock_destroy(isc_rwlock_t *rwl) { + /* Check whether write lock has been unlocked */ + REQUIRE(atomic_load(&rwl->writers_lock) == ISC_RWLOCK_UNLOCKED); + REQUIRE(read_indicator_isempty(rwl)); +} + +void +isc_rwlock_setworkers(uint16_t workers) { + atomic_store(&isc__crwlock_workers, workers); } diff --git a/lib/isc/tls.c b/lib/isc/tls.c index 8c4768c46b..569a041a8d 100644 --- a/lib/isc/tls.c +++ b/lib/isc/tls.c @@ -1186,7 +1186,7 @@ isc_tlsctx_cache_create(isc_mem_t *mctx, isc_tlsctx_cache_t **cachep) { isc_mem_attach(mctx, &nc->mctx); isc_ht_init(&nc->data, mctx, 5, ISC_HT_CASE_SENSITIVE); - isc_rwlock_init(&nc->rwlock, 0, 0); + isc_rwlock_init(&nc->rwlock); *cachep = nc; } From 3d3d3b8c58d806c81c2d1f19d936802b17531630 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Sur=C3=BD?= Date: Mon, 13 Feb 2023 15:52:47 +0100 Subject: [PATCH 4/6] Use C-RW-WP lock in the dns_resolver unit Replace the isc_mutex with isc_rwlock in the dns_resolver unit, specifically, both fetch context and fetch counters now use the C-RW-WP locks. 
--- lib/dns/resolver.c | 105 ++++++++++++++++++++++++++++++--------------- 1 file changed, 71 insertions(+), 34 deletions(-) diff --git a/lib/dns/resolver.c b/lib/dns/resolver.c index cf1ac6e069..060a0a41e7 100644 --- a/lib/dns/resolver.c +++ b/lib/dns/resolver.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -325,6 +326,7 @@ struct fctxkey { typedef struct fctxcount fctxcount_t; struct fctxcount { unsigned int magic; + isc_mutex_t lock; dns_fixedname_t dfname; dns_name_t *domain; uint_fast32_t count; @@ -558,10 +560,10 @@ struct dns_resolver { dns_dispatchset_t *dispatches6; isc_hashmap_t *fctxs; - isc_mutex_t fctxs_lock; + isc_rwlock_t fctxs_lock; isc_hashmap_t *counters; - isc_mutex_t counters_lock; + isc_rwlock_t counters_lock; unsigned int ntasks; isc_task_t **tasks; @@ -1633,6 +1635,7 @@ fcount_incr(fetchctx_t *fctx, bool force) { fctxcount_t *counter = NULL; uint32_t hashval; uint_fast32_t spill; + isc_rwlocktype_t locktype = isc_rwlocktype_read; REQUIRE(fctx != NULL); res = fctx->res; @@ -1647,7 +1650,7 @@ fcount_incr(fetchctx_t *fctx, bool force) { hashval = isc_hashmap_hash(res->counters, fctx->domain->ndata, fctx->domain->length); - LOCK(&res->counters_lock); + RWLOCK(&res->counters_lock, locktype); result = isc_hashmap_find(res->counters, &hashval, fctx->domain->ndata, fctx->domain->length, (void **)&counter); switch (result) { @@ -1660,12 +1663,24 @@ fcount_incr(fetchctx_t *fctx, bool force) { .count = 0, .allowed = 0, }; + isc_mutex_init(&counter->lock); counter->domain = dns_fixedname_initname(&counter->dfname); dns_name_copy(fctx->domain, counter->domain); + UPGRADELOCK(&res->counters_lock, locktype); + result = isc_hashmap_add(res->counters, &hashval, counter->domain->ndata, counter->domain->length, counter); + if (result == ISC_R_EXISTS) { + isc_mutex_destroy(&counter->lock); + isc_mem_put(fctx->mctx, counter, sizeof(*counter)); + counter = NULL; + result = isc_hashmap_find( + res->counters, &hashval, 
fctx->domain->ndata, + fctx->domain->length, (void **)&counter); + } + INSIST(result == ISC_R_SUCCESS); break; default: @@ -1674,6 +1689,7 @@ fcount_incr(fetchctx_t *fctx, bool force) { INSIST(VALID_FCTXCOUNT(counter)); INSIST(spill > 0); + LOCK(&counter->lock); if (++counter->count > spill) { counter->count--; INSIST(counter->count > 0); @@ -1684,7 +1700,8 @@ counter->allowed++; fctx->counter = counter; } - UNLOCK(&res->counters_lock); + UNLOCK(&counter->lock); + RWUNLOCK(&res->counters_lock, locktype); return (result); } @@ -1699,19 +1716,35 @@ fcount_decr(fetchctx_t *fctx) { } fctx->counter = NULL; - LOCK(&fctx->res->counters_lock); + /* + * FIXME: This should not require a write lock, but should be + * implemented using reference counting later, otherwise we could + * encounter the ABA problem here - the count could go up and down when we + * switch from read to write lock. + */ + RWLOCK(&fctx->res->counters_lock, isc_rwlocktype_write); + + LOCK(&counter->lock); INSIST(VALID_FCTXCOUNT(counter)); INSIST(counter->count > 0); - if (--counter->count == 0) { - isc_result_t result = isc_hashmap_delete( - fctx->res->counters, NULL, counter->domain->ndata, - counter->domain->length); - INSIST(result == ISC_R_SUCCESS); - - fcount_logspill(fctx, counter, true); - isc_mem_put(fctx->mctx, counter, sizeof(*counter)); + if (--counter->count > 0) { + UNLOCK(&counter->lock); + RWUNLOCK(&fctx->res->counters_lock, isc_rwlocktype_write); + return; } - UNLOCK(&fctx->res->counters_lock); + + isc_result_t result = isc_hashmap_delete(fctx->res->counters, NULL, + counter->domain->ndata, + counter->domain->length); + INSIST(result == ISC_R_SUCCESS); + + fcount_logspill(fctx, counter, true); + UNLOCK(&counter->lock); + + isc_mutex_destroy(&counter->lock); + isc_mem_put(fctx->mctx, counter, sizeof(*counter)); + + RWUNLOCK(&fctx->res->counters_lock, isc_rwlocktype_write); } static void @@ -7201,12 +7234,12 @@ release_fctx(fetchctx_t *fctx) { 
return; } - LOCK(&res->fctxs_lock); + RWLOCK(&res->fctxs_lock, isc_rwlocktype_write); result = isc_hashmap_delete(res->fctxs, &hashval, fctx->key.key, fctx->key.size); INSIST(result == ISC_R_SUCCESS); fctx->hashed = false; - UNLOCK(&res->fctxs_lock); + RWUNLOCK(&res->fctxs_lock, isc_rwlocktype_write); } static void @@ -10072,11 +10105,11 @@ dns_resolver__destroy(dns_resolver_t *res) { INSIST(isc_hashmap_count(res->fctxs) == 0); isc_hashmap_destroy(&res->fctxs); - isc_mutex_destroy(&res->fctxs_lock); + isc_rwlock_destroy(&res->fctxs_lock); INSIST(isc_hashmap_count(res->counters) == 0); isc_hashmap_destroy(&res->counters); - isc_mutex_destroy(&res->counters_lock); + isc_rwlock_destroy(&res->counters_lock); if (res->dispatches4 != NULL) { dns_dispatchset_destroy(&res->dispatches4); @@ -10206,11 +10239,11 @@ dns_resolver_create(dns_view_t *view, isc_loopmgr_t *loopmgr, /* This needs to be case sensitive to not lowercase options and type */ isc_hashmap_create(view->mctx, RES_DOMAIN_HASH_BITS, ISC_HASHMAP_CASE_SENSITIVE, &res->fctxs); - isc_mutex_init(&res->fctxs_lock); + isc_rwlock_init(&res->fctxs_lock); isc_hashmap_create(view->mctx, RES_DOMAIN_HASH_BITS, ISC_HASHMAP_CASE_INSENSITIVE, &res->counters); - isc_mutex_init(&res->counters_lock); + isc_rwlock_init(&res->counters_lock); if (dispatchv4 != NULL) { dns_dispatchset_create(res->mctx, dispatchv4, &res->dispatches4, @@ -10370,7 +10403,7 @@ dns_resolver_shutdown(dns_resolver_t *res) { RTRACE("exiting"); - LOCK(&res->fctxs_lock); + RWLOCK(&res->fctxs_lock, isc_rwlocktype_write); isc_hashmap_iter_create(res->fctxs, &it); for (result = isc_hashmap_iter_first(it); result == ISC_R_SUCCESS; @@ -10386,7 +10419,7 @@ dns_resolver_shutdown(dns_resolver_t *res) { fctx); } isc_hashmap_iter_destroy(&it); - UNLOCK(&res->fctxs_lock); + RWUNLOCK(&res->fctxs_lock, isc_rwlocktype_write); LOCK(&res->lock); if (res->spillattimer != NULL) { @@ -10525,6 +10558,7 @@ get_attached_fctx(dns_resolver_t *res, const dns_name_t *name, 
name->length, }; fetchctx_t *fctx = NULL; + isc_rwlocktype_t locktype = isc_rwlocktype_read; STATIC_ASSERT(sizeof(key.options) == sizeof(options), "key options size mismatch"); @@ -10538,7 +10572,7 @@ get_attached_fctx(dns_resolver_t *res, const dns_name_t *name, hashval = isc_hashmap_hash(res->fctxs, key.key, key.size); again: - LOCK(&res->fctxs_lock); + RWLOCK(&res->fctxs_lock, locktype); result = isc_hashmap_find(res->fctxs, &hashval, key.key, key.size, (void **)&fctx); switch (result) { @@ -10552,13 +10586,17 @@ again: goto unlock; } - *new_fctx = true; - + UPGRADELOCK(&res->fctxs_lock, locktype); result = isc_hashmap_add(res->fctxs, &hashval, fctx->key.key, fctx->key.size, fctx); - - fctx->hashed = true; - + if (result == ISC_R_SUCCESS) { + *new_fctx = true; + fctx->hashed = true; + } else { + fctx_done_detach(&fctx, result); + result = isc_hashmap_find(res->fctxs, &hashval, key.key, + key.size, (void **)&fctx); + } INSIST(result == ISC_R_SUCCESS); break; default: @@ -10566,8 +10604,7 @@ again: } fetchctx_ref(fctx); unlock: - UNLOCK(&res->fctxs_lock); - + RWUNLOCK(&res->fctxs_lock, locktype); if (result == ISC_R_SUCCESS) { LOCK(&fctx->lock); if (SHUTTINGDOWN(fctx) || fctx->cloned) { @@ -11388,7 +11425,7 @@ dns_resolver_dumpfetches(dns_resolver_t *res, isc_statsformat_t format, REQUIRE(fp != NULL); REQUIRE(format == isc_statsformat_file); - LOCK(&res->counters_lock); + RWLOCK(&res->counters_lock, isc_rwlocktype_read); isc_hashmap_iter_create(res->counters, &it); for (result = isc_hashmap_iter_first(it); result == ISC_R_SUCCESS; result = isc_hashmap_iter_next(it)) @@ -11402,7 +11439,7 @@ dns_resolver_dumpfetches(dns_resolver_t *res, isc_statsformat_t format, " spilled, %" PRIuFAST32 " allowed)\n", counter->count, counter->dropped, counter->allowed); } - UNLOCK(&res->counters_lock); + RWUNLOCK(&res->counters_lock, isc_rwlocktype_read); isc_hashmap_iter_destroy(&it); } @@ -11419,7 +11456,7 @@ dns_resolver_dumpquota(dns_resolver_t *res, isc_buffer_t **buf) { return 
(ISC_R_SUCCESS); } - LOCK(&res->counters_lock); + RWLOCK(&res->counters_lock, isc_rwlocktype_read); isc_hashmap_iter_create(res->counters, &it); for (result = isc_hashmap_iter_first(it); result == ISC_R_SUCCESS; result = isc_hashmap_iter_next(it)) @@ -11451,7 +11488,7 @@ dns_resolver_dumpquota(dns_resolver_t *res, isc_buffer_t **buf) { } cleanup: - UNLOCK(&res->counters_lock); + RWUNLOCK(&res->counters_lock, isc_rwlocktype_read); isc_hashmap_iter_destroy(&it); return (result); } From c194ff5d77810a9f8a85fa1bbb0acc2507c83171 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Sur=C3=BD?= Date: Mon, 13 Feb 2023 15:52:51 +0100 Subject: [PATCH 5/6] Use C-RW-WP lock in the dns_adb unit Replace the isc_mutex in the dns_adb unit with isc_rwlock for better performance. Both ADB names and ADB entries hashtables and LRU are now using isc_rwlock. --- lib/dns/adb.c | 202 +++++++++++++++++++++++++++++++------------------- 1 file changed, 127 insertions(+), 75 deletions(-) diff --git a/lib/dns/adb.c b/lib/dns/adb.c index 146c5cecca..6b32df2a0f 100644 --- a/lib/dns/adb.c +++ b/lib/dns/adb.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -118,12 +119,12 @@ struct dns_adb { dns_adbnamelist_t names_lru; isc_stdtime_t names_last_update; isc_hashmap_t *names; - isc_mutex_t names_lock; + isc_rwlock_t names_lock; dns_adbentrylist_t entries_lru; isc_stdtime_t entries_last_update; isc_hashmap_t *entries; - isc_mutex_t entries_lock; + isc_rwlock_t entries_lock; isc_stats_t *stats; @@ -372,6 +373,8 @@ maybe_expire_name(dns_adbname_t *adbname, isc_stdtime_t now); static void expire_name(dns_adbname_t *adbname, isc_eventtype_t evtype); static bool +entry_expired(dns_adbentry_t *adbentry, isc_stdtime_t now); +static bool maybe_expire_entry(dns_adbentry_t *adbentry, isc_stdtime_t now); static void expire_entry(dns_adbentry_t *adbentry); @@ -762,7 +765,7 @@ static void shutdown_names(dns_adb_t *adb) { dns_adbname_t *next = NULL; - LOCK(&adb->names_lock); 
+ RWLOCK(&adb->names_lock, isc_rwlocktype_write); for (dns_adbname_t *name = ISC_LIST_HEAD(adb->names_lru); name != NULL; name = next) { @@ -779,20 +782,20 @@ shutdown_names(dns_adb_t *adb) { UNLOCK(&name->lock); dns_adbname_detach(&name); } - UNLOCK(&adb->names_lock); + RWUNLOCK(&adb->names_lock, isc_rwlocktype_write); } static void shutdown_entries(dns_adb_t *adb) { dns_adbentry_t *next = NULL; - LOCK(&adb->entries_lock); + RWLOCK(&adb->entries_lock, isc_rwlocktype_write); for (dns_adbentry_t *adbentry = ISC_LIST_HEAD(adb->entries_lru); adbentry != NULL; adbentry = next) { next = ISC_LIST_NEXT(adbentry, link); expire_entry(adbentry); } - UNLOCK(&adb->entries_lock); + RWUNLOCK(&adb->entries_lock, isc_rwlocktype_write); } /* @@ -1342,6 +1345,7 @@ get_attached_and_locked_name(dns_adb_t *adb, const dns_name_t *name, isc_time_t timenow; isc_stdtime_t last_update; adbnamekey_t key; + isc_rwlocktype_t locktype = isc_rwlocktype_read; isc_time_set(&timenow, now, 0); @@ -1351,54 +1355,65 @@ get_attached_and_locked_name(dns_adb_t *adb, const dns_name_t *name, hashval = isc_hashmap_hash(adb->names, &key.key, key.size); - LOCK(&adb->names_lock); + RWLOCK(&adb->names_lock, locktype); last_update = adb->names_last_update; - if (last_update + ADB_STALE_MARGIN < now || + + if (last_update + ADB_STALE_MARGIN >= now || atomic_load_relaxed(&adb->is_overmem)) { - last_update = adb->names_last_update = now; - + last_update = now; + UPGRADELOCK(&adb->names_lock, locktype); purge_stale_names(adb, now); + adb->names_last_update = last_update; } result = isc_hashmap_find(adb->names, &hashval, key.key, key.size, (void **)&adbname); switch (result) { case ISC_R_NOTFOUND: + UPGRADELOCK(&adb->names_lock, locktype); + /* Allocate a new name and add it to the hash table. 
*/ adbname = new_adbname(adb, name, start_at_zone); result = isc_hashmap_add(adb->names, &hashval, &adbname->key.key, adbname->key.size, adbname); + if (result == ISC_R_EXISTS) { + destroy_adbname(adbname); + adbname = NULL; + result = isc_hashmap_find(adb->names, &hashval, key.key, + key.size, (void **)&adbname); + ISC_LIST_UNLINK(adb->names_lru, adbname, link); + } INSIST(result == ISC_R_SUCCESS); - dns_adbname_ref(adbname); - LOCK(&adbname->lock); /* Must be unlocked by the caller */ - adbname->last_used = now; - - ISC_LIST_PREPEND(adb->names_lru, adbname, link); break; case ISC_R_SUCCESS: - dns_adbname_ref(adbname); - LOCK(&adbname->lock); /* Must be unlocked by the caller */ - if (adbname->last_used + ADB_CACHE_MINIMUM <= last_update) { - adbname->last_used = now; - + if (locktype == isc_rwlocktype_write) { ISC_LIST_UNLINK(adb->names_lru, adbname, link); - ISC_LIST_PREPEND(adb->names_lru, adbname, link); } break; default: UNREACHABLE(); } + dns_adbname_ref(adbname); + + LOCK(&adbname->lock); /* Must be unlocked by the caller */ + if (adbname->last_used + ADB_CACHE_MINIMUM <= last_update) { + adbname->last_used = now; + } + if (locktype == isc_rwlocktype_write) { + ISC_LIST_PREPEND(adb->names_lru, adbname, link); + } + /* * The refcount is now 2 and the final detach will happen in * expire_name() - the unused adbname stored in the hashtable and lru * has always refcount == 1 */ - UNLOCK(&adb->names_lock); + RWUNLOCK(&adb->names_lock, locktype); return (adbname); } @@ -1415,64 +1430,93 @@ get_attached_and_locked_entry(dns_adb_t *adb, isc_stdtime_t now, isc_stdtime_t last_update; uint32_t hashval = isc_hashmap_hash( adb->entries, (const unsigned char *)addr, sizeof(*addr)); + isc_rwlocktype_t locktype = isc_rwlocktype_read; isc_time_set(&timenow, now, 0); - LOCK(&adb->entries_lock); + RWLOCK(&adb->entries_lock, locktype); last_update = adb->entries_last_update; + if (now - last_update > ADB_STALE_MARGIN || atomic_load_relaxed(&adb->is_overmem)) { - last_update = 
adb->entries_last_update = now; + last_update = now; + + UPGRADELOCK(&adb->entries_lock, locktype); purge_stale_entries(adb, now); + adb->entries_last_update = now; } +again: result = isc_hashmap_find(adb->entries, &hashval, (const unsigned char *)addr, sizeof(*addr), (void **)&adbentry); switch (result) { case ISC_R_NOTFOUND: { create: + UPGRADELOCK(&adb->entries_lock, locktype); + /* Allocate a new entry and add it to the hash table. */ adbentry = new_adbentry(adb, addr); result = isc_hashmap_add(adb->entries, &hashval, &adbentry->sockaddr, sizeof(adbentry->sockaddr), adbentry); + if (result == ISC_R_EXISTS) { + dns_adbentry_detach(&adbentry); + + result = isc_hashmap_find(adb->entries, &hashval, + (const unsigned char *)addr, + sizeof(*addr), + (void **)&adbentry); + ISC_LIST_UNLINK(adb->entries_lru, adbentry, link); + } INSIST(result == ISC_R_SUCCESS); - - dns_adbentry_ref(adbentry); - LOCK(&adbentry->lock); /* Must be unlocked by the caller */ - adbentry->last_used = now; - - ISC_LIST_PREPEND(adb->entries_lru, adbentry, link); break; } case ISC_R_SUCCESS: - /* - * The dns_adbentry_ref() must stay here before trying to expire - * the ADB entry, so it is not destroyed under the lock. - */ - dns_adbentry_ref(adbentry); - LOCK(&adbentry->lock); /* Must be unlocked by the caller */ - if (maybe_expire_entry(adbentry, now)) { - UNLOCK(&adbentry->lock); - dns_adbentry_detach(&adbentry); - goto create; - } - if (adbentry->last_used + ADB_CACHE_MINIMUM <= last_update) { - adbentry->last_used = now; - + if (locktype == isc_rwlocktype_write) { ISC_LIST_UNLINK(adb->entries_lru, adbentry, link); - ISC_LIST_PREPEND(adb->entries_lru, adbentry, link); } break; default: UNREACHABLE(); } - UNLOCK(&adb->entries_lock); + /* + * The dns_adbentry_ref() must stay here before trying to expire + * the ADB entry, so it is not destroyed under the lock. 
+ */ + dns_adbentry_ref(adbentry); + LOCK(&adbentry->lock); /* Must be unlocked by the caller */ + switch (locktype) { + case isc_rwlocktype_read: + if (entry_expired(adbentry, now)) { + UNLOCK(&adbentry->lock); + dns_adbentry_detach(&adbentry); + goto again; + } + break; + case isc_rwlocktype_write: + if (maybe_expire_entry(adbentry, now)) { + UNLOCK(&adbentry->lock); + dns_adbentry_detach(&adbentry); + goto create; + } + break; + default: + UNREACHABLE(); + } + + if (adbentry->last_used + ADB_CACHE_MINIMUM <= last_update) { + adbentry->last_used = now; + } + if (locktype == isc_rwlocktype_write) { + ISC_LIST_PREPEND(adb->entries_lru, adbentry, link); + } + + RWUNLOCK(&adb->entries_lock, locktype); return (adbentry); } @@ -1657,9 +1701,7 @@ expire_entry(dns_adbentry_t *adbentry) { } static bool -maybe_expire_entry(dns_adbentry_t *adbentry, isc_stdtime_t now) { - REQUIRE(DNS_ADBENTRY_VALID(adbentry)); - +entry_expired(dns_adbentry_t *adbentry, isc_stdtime_t now) { if (!ISC_LIST_EMPTY(adbentry->nhs)) { return (false); } @@ -1668,11 +1710,21 @@ maybe_expire_entry(dns_adbentry_t *adbentry, isc_stdtime_t now) { return (false); } - expire_entry(adbentry); - return (true); } +static bool +maybe_expire_entry(dns_adbentry_t *adbentry, isc_stdtime_t now) { + REQUIRE(DNS_ADBENTRY_VALID(adbentry)); + + if (entry_expired(adbentry, now)) { + expire_entry(adbentry); + return (true); + } + + return (false); +} + /*% * Examine the tail entry of the LRU list to see if it expires or is stale * (unused for some period); if so, the name entry will be freed. 
If the ADB @@ -1756,7 +1808,7 @@ static void cleanup_names(dns_adb_t *adb, isc_stdtime_t now) { dns_adbname_t *next = NULL; - LOCK(&adb->names_lock); + RWLOCK(&adb->names_lock, isc_rwlocktype_write); for (dns_adbname_t *adbname = ISC_LIST_HEAD(adb->names_lru); adbname != NULL; adbname = next) { @@ -1775,7 +1827,7 @@ cleanup_names(dns_adb_t *adb, isc_stdtime_t now) { UNLOCK(&adbname->lock); dns_adbname_detach(&adbname); } - UNLOCK(&adb->names_lock); + RWUNLOCK(&adb->names_lock, isc_rwlocktype_write); } /*% @@ -1859,7 +1911,7 @@ static void cleanup_entries(dns_adb_t *adb, isc_stdtime_t now) { dns_adbentry_t *next = NULL; - LOCK(&adb->entries_lock); + RWLOCK(&adb->entries_lock, isc_rwlocktype_write); for (dns_adbentry_t *adbentry = ISC_LIST_HEAD(adb->entries_lru); adbentry != NULL; adbentry = next) { @@ -1871,7 +1923,7 @@ cleanup_entries(dns_adb_t *adb, isc_stdtime_t now) { UNLOCK(&adbentry->lock); dns_adbentry_detach(&adbentry); } - UNLOCK(&adb->entries_lock); + RWUNLOCK(&adb->entries_lock, isc_rwlocktype_write); } static void @@ -1880,18 +1932,18 @@ destroy(dns_adb_t *adb) { adb->magic = 0; - LOCK(&adb->names_lock); + RWLOCK(&adb->names_lock, isc_rwlocktype_write); INSIST(isc_hashmap_count(adb->names) == 0); isc_hashmap_destroy(&adb->names); - UNLOCK(&adb->names_lock); - isc_mutex_destroy(&adb->names_lock); + RWUNLOCK(&adb->names_lock, isc_rwlocktype_write); + isc_rwlock_destroy(&adb->names_lock); - LOCK(&adb->entries_lock); + RWLOCK(&adb->entries_lock, isc_rwlocktype_write); /* There are no unassociated entries */ INSIST(isc_hashmap_count(adb->entries) == 0); isc_hashmap_destroy(&adb->entries); - UNLOCK(&adb->entries_lock); - isc_mutex_destroy(&adb->entries_lock); + RWUNLOCK(&adb->entries_lock, isc_rwlocktype_write); + isc_rwlock_destroy(&adb->entries_lock); isc_mem_destroy(&adb->hmctx); @@ -1956,11 +2008,11 @@ dns_adb_create(isc_mem_t *mem, dns_view_t *view, isc_loopmgr_t *loopmgr, isc_hashmap_create(adb->hmctx, ADB_HASH_BITS, ISC_HASHMAP_CASE_INSENSITIVE, 
&adb->names); - isc_mutex_init(&adb->names_lock); + isc_rwlock_init(&adb->names_lock); isc_hashmap_create(adb->hmctx, ADB_HASH_BITS, ISC_HASHMAP_CASE_SENSITIVE, &adb->entries); - isc_mutex_init(&adb->entries_lock); + isc_rwlock_init(&adb->entries_lock); isc_mutex_init(&adb->lock); @@ -2002,11 +2054,11 @@ free_tasks: isc_mutex_destroy(&adb->lock); - isc_mutex_destroy(&adb->entries_lock); + isc_rwlock_destroy(&adb->entries_lock); isc_hashmap_destroy(&adb->entries); INSIST(ISC_LIST_EMPTY(adb->entries_lru)); - isc_mutex_destroy(&adb->names_lock); + isc_rwlock_destroy(&adb->names_lock); isc_hashmap_destroy(&adb->names); INSIST(ISC_LIST_EMPTY(adb->names_lru)); @@ -2507,7 +2559,7 @@ dump_adb(dns_adb_t *adb, FILE *f, bool debug, isc_stdtime_t now) { /* * Ensure this operation is applied to both hash tables at once. */ - LOCK(&adb->names_lock); + RWLOCK(&adb->names_lock, isc_rwlocktype_write); for (dns_adbname_t *name = ISC_LIST_HEAD(adb->names_lru); name != NULL; name = ISC_LIST_NEXT(name, link)) @@ -2546,7 +2598,7 @@ dump_adb(dns_adb_t *adb, FILE *f, bool debug, isc_stdtime_t now) { UNLOCK(&name->lock); } - LOCK(&adb->entries_lock); + RWLOCK(&adb->entries_lock, isc_rwlocktype_write); fprintf(f, ";\n; Unassociated entries\n;\n"); for (dns_adbentry_t *adbentry = ISC_LIST_HEAD(adb->entries_lru); adbentry != NULL; adbentry = ISC_LIST_NEXT(adbentry, link)) @@ -2558,8 +2610,8 @@ dump_adb(dns_adb_t *adb, FILE *f, bool debug, isc_stdtime_t now) { UNLOCK(&adbentry->lock); } - UNLOCK(&adb->entries_lock); - UNLOCK(&adb->names_lock); + RWUNLOCK(&adb->entries_lock, isc_rwlocktype_write); + RWUNLOCK(&adb->names_lock, isc_rwlocktype_write); } static void @@ -2735,7 +2787,7 @@ dns_adb_dumpquota(dns_adb_t *adb, isc_buffer_t **buf) { isc_hashmap_iter_t *it = NULL; isc_result_t result; - LOCK(&adb->entries_lock); + RWLOCK(&adb->entries_lock, isc_rwlocktype_read); isc_hashmap_iter_create(adb->entries, &it); for (result = isc_hashmap_iter_first(it); result == ISC_R_SUCCESS; result = 
isc_hashmap_iter_next(it)) @@ -2764,7 +2816,7 @@ dns_adb_dumpquota(dns_adb_t *adb, isc_buffer_t **buf) { UNLOCK(&entry->lock); } isc_hashmap_iter_destroy(&it); - UNLOCK(&adb->entries_lock); + RWUNLOCK(&adb->entries_lock, isc_rwlocktype_read); return (ISC_R_SUCCESS); } @@ -3612,7 +3664,7 @@ dns_adb_flushname(dns_adb_t *adb, const dns_name_t *name) { return; } - LOCK(&adb->names_lock); + RWLOCK(&adb->names_lock, isc_rwlocktype_write); again: /* * Delete both entries - without and with NAME_STARTATZONE set. @@ -3636,7 +3688,7 @@ again: start_at_zone = true; goto again; } - UNLOCK(&adb->names_lock); + RWUNLOCK(&adb->names_lock, isc_rwlocktype_write); } void @@ -3650,7 +3702,7 @@ dns_adb_flushnames(dns_adb_t *adb, const dns_name_t *name) { return; } - LOCK(&adb->names_lock); + RWLOCK(&adb->names_lock, isc_rwlocktype_write); for (dns_adbname_t *adbname = ISC_LIST_HEAD(adb->names_lru); adbname != NULL; adbname = next) { @@ -3663,7 +3715,7 @@ dns_adb_flushnames(dns_adb_t *adb, const dns_name_t *name) { UNLOCK(&adbname->lock); dns_adbname_detach(&adbname); } - UNLOCK(&adb->names_lock); + RWUNLOCK(&adb->names_lock, isc_rwlocktype_write); } static void From 459db4462fe2ba48b0443d7bf4bd42dfd675dcc4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Sur=C3=BD?= Date: Tue, 14 Feb 2023 12:11:03 +0100 Subject: [PATCH 6/6] Add CHANGES note for [GL #1609] --- CHANGES | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGES b/CHANGES index 081f6572b6..b673261f55 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,7 @@ +6099. [performance] Change the internal read-write lock to modified C-RW-WP + algorithm that is more reader-writer fair and has better + performance for our workloads. [GL #1609] + 6098. [test] Don't test HMAC-MD5 when not supported by libcrypto. [GL #3871]