From 5597be9bb88de138dfec9fa9176708443813925e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tatuya=20JINMEI=20=E7=A5=9E=E6=98=8E=E9=81=94=E5=93=89?= Date: Sat, 4 Jun 2005 05:32:50 +0000 Subject: [PATCH] 1813. [func] Restructured the data locking framework using architecture dependent atomic operations (when available), improving response performance on multi-processor machines significantly. x86, x86_64, alpha, and sparc64 are currently supported. (RT #13505) --- CHANGES | 7 +- bin/named/client.c | 45 +- configure.in | 72 +- lib/dns/acache.c | 80 +- lib/dns/include/dns/rbt.h | 75 +- lib/dns/rbt.c | 32 +- lib/dns/rbtdb.c | 1356 +++++++++++++++++-------- lib/dns/resolver.c | 86 +- lib/dns/zone.c | 143 ++- lib/isc/Makefile.in | 3 +- lib/isc/alpha/include/isc/atomic.h | 166 +++ lib/isc/include/isc/mem.h | 23 +- lib/isc/include/isc/platform.h.in | 25 +- lib/isc/include/isc/refcount.h | 85 +- lib/isc/include/isc/rwlock.h | 40 +- lib/isc/mem.c | 153 +-- lib/isc/noatomic/include/isc/atomic.h | 24 + lib/isc/rwlock.c | 420 +++++++- lib/isc/sparc64/include/isc/atomic.h | 119 +++ lib/isc/unix/include/isc/stdtime.h | 13 +- lib/isc/win32/include/isc/stdtime.h | 13 +- lib/isc/x86_32/include/isc/atomic.h | 83 ++ make/includes.in | 7 +- 23 files changed, 2413 insertions(+), 657 deletions(-) create mode 100644 lib/isc/alpha/include/isc/atomic.h create mode 100644 lib/isc/noatomic/include/isc/atomic.h create mode 100644 lib/isc/sparc64/include/isc/atomic.h create mode 100644 lib/isc/x86_32/include/isc/atomic.h diff --git a/CHANGES b/CHANGES index 39887fbf6d..f84495e8df 100644 --- a/CHANGES +++ b/CHANGES @@ -146,7 +146,12 @@ 1814. [func] UNIX domain controls are now supported. -1813. [placeholder] rt13505 +1813. [func] Restructured the data locking framework using + architecture dependent atomic operations (when + available), improving response performance on + multi-processor machines significantly. + x86, x86_64, alpha, and sparc64 are currently + supported. 1812. 
[port] win32: IN6_IS_ADDR_UNSPECIFIED macro is incorrect. [RT #13453] diff --git a/bin/named/client.c b/bin/named/client.c index 95ab3f9ac4..067e327497 100644 --- a/bin/named/client.c +++ b/bin/named/client.c @@ -15,13 +15,14 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: client.c,v 1.226 2005/04/27 04:55:48 sra Exp $ */ +/* $Id: client.c,v 1.227 2005/06/04 05:32:46 jinmei Exp $ */ #include #include #include #include +#include #include #include #include @@ -471,7 +472,7 @@ exit_check(ns_client_t *client) { CTRACE("free"); client->magic = 0; - isc_mem_put(client->mctx, client, sizeof(*client)); + isc_mem_putanddetach(&client->mctx, client, sizeof(*client)); goto unlock; } @@ -1667,6 +1668,7 @@ static isc_result_t client_create(ns_clientmgr_t *manager, ns_client_t **clientp) { ns_client_t *client; isc_result_t result; + isc_mem_t *mctx = NULL; /* * Caller must be holding the manager lock. @@ -1678,9 +1680,31 @@ client_create(ns_clientmgr_t *manager, ns_client_t **clientp) { REQUIRE(clientp != NULL && *clientp == NULL); - client = isc_mem_get(manager->mctx, sizeof(*client)); - if (client == NULL) +#ifdef ISC_PLATFORM_USETHREADS + /* + * When enabling threads, we use a separate memory context for each + * client, since concurrent access to a shared context would cause + * heavy contentions. We also specify the NOLOCK flag on creation, + * since we are very sure that multiple threads will never get access + * to the context simultaneously. + */ + result = isc_mem_create2(0, 0, &mctx, ISC_MEMFLAG_NOLOCK); + if (result != ISC_R_SUCCESS) + return (result); +#else + /* + * Otherwise, simply share manager's context. Using a separate context + * in this case would simply waste memory. 
+ */ + isc_mem_attach(manager->mctx, &mctx); +#endif + + client = isc_mem_get(mctx, sizeof(*client)); + if (client == NULL) { + isc_mem_detach(&mctx); return (ISC_R_NOMEMORY); + } + client->mctx = mctx; client->task = NULL; result = isc_task_create(manager->taskmgr, 0, &client->task); @@ -1697,7 +1721,7 @@ client_create(ns_clientmgr_t *manager, ns_client_t **clientp) { client->timerset = ISC_FALSE; client->message = NULL; - result = dns_message_create(manager->mctx, DNS_MESSAGE_INTENTPARSE, + result = dns_message_create(client->mctx, DNS_MESSAGE_INTENTPARSE, &client->message); if (result != ISC_R_SUCCESS) goto cleanup_timer; @@ -1705,7 +1729,7 @@ client_create(ns_clientmgr_t *manager, ns_client_t **clientp) { /* XXXRTH Hardwired constants */ client->sendevent = (isc_socketevent_t *) - isc_event_allocate(manager->mctx, client, + isc_event_allocate(client->mctx, client, ISC_SOCKEVENT_SENDDONE, client_senddone, client, sizeof(isc_socketevent_t)); @@ -1714,14 +1738,14 @@ client_create(ns_clientmgr_t *manager, ns_client_t **clientp) { goto cleanup_message; } - client->recvbuf = isc_mem_get(manager->mctx, RECV_BUFFER_SIZE); + client->recvbuf = isc_mem_get(client->mctx, RECV_BUFFER_SIZE); if (client->recvbuf == NULL) { result = ISC_R_NOMEMORY; goto cleanup_sendevent; } client->recvevent = (isc_socketevent_t *) - isc_event_allocate(manager->mctx, client, + isc_event_allocate(client->mctx, client, ISC_SOCKEVENT_RECVDONE, client_request, client, sizeof(isc_socketevent_t)); @@ -1731,7 +1755,6 @@ client_create(ns_clientmgr_t *manager, ns_client_t **clientp) { } client->magic = NS_CLIENT_MAGIC; - client->mctx = manager->mctx; client->manager = NULL; client->state = NS_CLIENTSTATE_INACTIVE; client->newstate = NS_CLIENTSTATE_MAX; @@ -1801,7 +1824,7 @@ client_create(ns_clientmgr_t *manager, ns_client_t **clientp) { isc_event_free((isc_event_t **)&client->recvevent); cleanup_recvbuf: - isc_mem_put(manager->mctx, client->recvbuf, RECV_BUFFER_SIZE); + isc_mem_put(client->mctx, 
client->recvbuf, RECV_BUFFER_SIZE); cleanup_sendevent: isc_event_free((isc_event_t **)&client->sendevent); @@ -1818,7 +1841,7 @@ client_create(ns_clientmgr_t *manager, ns_client_t **clientp) { isc_task_detach(&client->task); cleanup_client: - isc_mem_put(manager->mctx, client, sizeof(*client)); + isc_mem_putanddetach(&client->mctx, client, sizeof(*client)); return (result); } diff --git a/configure.in b/configure.in index 16f88edba8..e4f639fc90 100644 --- a/configure.in +++ b/configure.in @@ -18,7 +18,7 @@ AC_DIVERT_PUSH(1)dnl esyscmd([sed "s/^/# /" COPYRIGHT])dnl AC_DIVERT_POP()dnl -AC_REVISION($Revision: 1.376 $) +AC_REVISION($Revision: 1.377 $) AC_INIT(lib/dns/name.c) AC_PREREQ(2.13) @@ -1830,6 +1830,76 @@ yes) esac AC_SUBST(ISC_PLATFORM_HAVEIFNAMETOINDEX) +# +# Machine architecture dependent features +# +AC_ARG_ENABLE(atomic, + [ --enable-atomic enable machine specific atomic operations + [[default=autodetect]]], + enable_atomic="$enableval", + enable_atomic="autodetect") +case "$enable_atomic" in + yes|''|autodetect) + use_atomic=yes + ;; + no) + use_atomic=no + ;; +esac + +ISC_PLATFORM_USEOSFASM="#undef ISC_PLATFORM_USEOSFASM" +if test "$use_atomic" = "yes"; then + AC_MSG_CHECKING([architecture type for atomic operations]) + case "$host" in + [i[3456]86-*]|x86_64-*) + # XXX: also need to check portability of the "asm" keyword? + # XXX: some old x86 architectures actually do not support + # (some of) these operations. Do we need stricter checks? + # Note: We currently use the same code for both the x86_32 and + # x86_64 architectures, but there may be a better + # implementation for the latter. + have_atomic=yes + arch=x86_32 + ;; + alpha*-*) + have_atomic=yes + arch=alpha + if test "X$GCC" != "Xyes"; then + case "$host" in + *-dec-osf*) + # Tru64 compiler has its own syntax for inline + # assembly. 
+ ISC_PLATFORM_USEOSFASM="#define ISC_PLATFORM_USEOSFASM 1" + ;; + esac + fi + ;; + *) + have_atomic=no + arch=noatomic + ;; + esac + AC_MSG_RESULT($arch) +fi + +if test "$have_atomic" = "yes"; then + ISC_PLATFORM_HAVEXADD="#define ISC_PLATFORM_HAVEXADD 1" + ISC_PLATFORM_HAVECMPXCHG="#define ISC_PLATFORM_HAVECMPXCHG 1" + ISC_PLATFORM_HAVEATOMICSTORE="#define ISC_PLATFORM_HAVEATOMICSTORE 1" +else + ISC_PLATFORM_HAVEXADD="#undef ISC_PLATFORM_HAVEXADD" + ISC_PLATFORM_HAVECMPXCHG="#undef ISC_PLATFORM_HAVECMPXCHG" + ISC_PLATFORM_HAVEATOMICSTORE="#undef ISC_PLATFORM_HAVEATOMICSTORE" +fi + +AC_SUBST(ISC_PLATFORM_HAVEXADD) +AC_SUBST(ISC_PLATFORM_HAVECMPXCHG) +AC_SUBST(ISC_PLATFORM_HAVEATOMICSTORE) +AC_SUBST(ISC_PLATFORM_USEOSFASM) + +ISC_ARCH_DIR=$arch +AC_SUBST(ISC_ARCH_DIR) + # # The following sections deal with tools used for formatting # the documentation. They are all optional, unless you are diff --git a/lib/dns/acache.c b/lib/dns/acache.c index 1d9491020a..31bd333b67 100644 --- a/lib/dns/acache.c +++ b/lib/dns/acache.c @@ -14,10 +14,11 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: acache.c,v 1.8 2005/02/07 00:04:05 marka Exp $ */ +/* $Id: acache.c,v 1.9 2005/06/04 05:32:46 jinmei Exp $ */ #include +#include #include #include #include @@ -25,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -73,6 +75,26 @@ #define DNS_ACACHE_MINSIZE 2097152 /* Bytes. 2097152 = 2 MB */ #define DNS_ACACHE_CLEANERINCREMENT 1000 /* Number of entries. 
*/ +#if defined(ISC_RWLOCK_USEATOMIC) && defined(ISC_PLATFORM_HAVEATOMICSTORE) +#define ACACHE_USE_RWLOCK 1 +#endif + +#ifdef ACACHE_USE_RWLOCK +#define ACACHE_INITLOCK(l) isc_rwlock_init((l), 0, 0) +#define ACACHE_DESTROYLOCK(l) isc_rwlock_destroy(l) +#define ACACHE_LOCK(l, t) RWLOCK((l), (t)) +#define ACACHE_UNLOCK(l, t) RWUNLOCK((l), (t)) + +#define acache_storetime(entry, t) (isc_atomic_store(&(entry)->lastused, (t))) +#else +#define ACACHE_INITLOCK(l) isc_mutex_init(l) +#define ACACHE_DESTROYLOCK(l) DESTROYLOCK(l) +#define ACACHE_LOCK(l, t) LOCK(l) +#define ACACHE_UNLOCK(l, t) UNLOCK(l) + +#define acache_storetime(entry, t) ((entry)->lastused = (t)) +#endif + /* Locked by acache lock */ typedef struct dbentry { ISC_LINK(struct dbentry) link; @@ -157,7 +179,11 @@ struct dns_acache { struct dns_acacheentry { unsigned int magic; +#ifdef ACACHE_USE_RWLOCK + isc_rwlock_t lock; +#else isc_mutex_t lock; +#endif isc_refcount_t references; dns_acache_t *acache; @@ -185,7 +211,7 @@ struct dns_acacheentry { void *cbarg; /* Timestamp of the last time this entry is referred to */ - isc_stdtime_t lastused; + isc_stdtime32_t lastused; }; /* @@ -242,7 +268,7 @@ shutdown_entries(dns_acache_t *acache) { entry = entry_next) { entry_next = ISC_LIST_NEXT(entry, link); - LOCK(&entry->lock); + ACACHE_LOCK(&entry->lock, isc_rwlocktype_write); /* * If the cleaner holds this entry, it will be unlinked and @@ -256,7 +282,7 @@ shutdown_entries(dns_acache_t *acache) { entry->callback = NULL; } - UNLOCK(&entry->lock); + ACACHE_UNLOCK(&entry->lock, isc_rwlocktype_write); if (acache->cleaner.current_entry != entry) dns_acache_detachentry(&entry); @@ -352,7 +378,7 @@ destroy_entry(dns_acacheentry_t *entry) { */ clear_entry(acache, entry); - DESTROYLOCK(&entry->lock); + ACACHE_DESTROYLOCK(&entry->lock); isc_mem_put(acache->mctx, entry, sizeof(*entry)); @@ -666,6 +692,7 @@ entry_stale(acache_cleaner_t *cleaner, dns_acacheentry_t *entry, isc_stdtime_t now) { unsigned int interval = 
cleaner->cleaning_interval; + isc_stdtime32_t now32; /* * If the callback has been canceled, we definitely do not need the @@ -674,7 +701,8 @@ entry_stale(acache_cleaner_t *cleaner, dns_acacheentry_t *entry, if (entry->callback == NULL) return (ISC_TRUE); - if (entry->lastused + interval < now) + isc_stdtime_convert32(now, &now32); + if (entry->lastused + interval < now32) return (ISC_TRUE); /* @@ -683,7 +711,8 @@ entry_stale(acache_cleaner_t *cleaner, dns_acacheentry_t *entry, * use and the cleaning interval. */ if (cleaner->overmem) { - unsigned int passed = now - entry->lastused; /* <= interval */ + unsigned int passed = + now32 - entry->lastused; /* <= interval */ isc_uint32_t val, r; isc_random_get(&val); @@ -734,7 +763,7 @@ acache_incremental_cleaning_action(isc_task_t *task, isc_event_t *event) { next = ISC_LIST_NEXT(entry, link); - LOCK(&entry->lock); + ACACHE_LOCK(&entry->lock, isc_rwlocktype_write); is_stale = entry_stale(cleaner, entry, now); if (is_stale) { @@ -747,7 +776,7 @@ acache_incremental_cleaning_action(isc_task_t *task, isc_event_t *event) { cleaner->ncleaned++; } - UNLOCK(&entry->lock); + ACACHE_UNLOCK(&entry->lock, isc_rwlocktype_write); if (is_stale) dns_acache_detachentry(&entry); @@ -1129,7 +1158,7 @@ dns_acache_putdb(dns_acache_t *acache, dns_db_t *db) { * original holder has canceled callback,) destroy it here. 
*/ while ((entry = ISC_LIST_HEAD(dbentry->originlist)) != NULL) { - LOCK(&entry->lock); + ACACHE_LOCK(&entry->lock, isc_rwlocktype_write); /* * Releasing olink first would avoid finddbent() in @@ -1144,13 +1173,13 @@ dns_acache_putdb(dns_acache_t *acache, dns_db_t *db) { (entry->callback)(entry, &entry->cbarg); entry->callback = NULL; - UNLOCK(&entry->lock); + ACACHE_UNLOCK(&entry->lock, isc_rwlocktype_write); if (acache->cleaner.current_entry != entry) dns_acache_detachentry(&entry); } while ((entry = ISC_LIST_HEAD(dbentry->referlist)) != NULL) { - LOCK(&entry->lock); + ACACHE_LOCK(&entry->lock, isc_rwlocktype_write); ISC_LIST_UNLINK(dbentry->referlist, entry, rlink); if (acache->cleaner.current_entry != entry) @@ -1161,7 +1190,7 @@ dns_acache_putdb(dns_acache_t *acache, dns_db_t *db) { (entry->callback)(entry, &entry->cbarg); entry->callback = NULL; - UNLOCK(&entry->lock); + ACACHE_UNLOCK(&entry->lock, isc_rwlocktype_write); if (acache->cleaner.current_entry != entry) dns_acache_detachentry(&entry); @@ -1200,7 +1229,7 @@ dns_acache_createentry(dns_acache_t *acache, dns_db_t *origdb, if (newentry == NULL) return (ISC_R_NOMEMORY); - result = isc_mutex_init(&newentry->lock); + result = ACACHE_INITLOCK(&newentry->lock); if (result != ISC_R_SUCCESS) { isc_mem_put(acache->mctx, newentry, sizeof(*newentry)); UNEXPECTED_ERROR(__FILE__, __LINE__, @@ -1246,6 +1275,7 @@ dns_acache_getentry(dns_acacheentry_t *entry, dns_zone_t **zonep, { isc_result_t result = ISC_R_SUCCESS; dns_rdataset_t *erdataset; + isc_stdtime32_t now32; REQUIRE(DNS_ACACHEENTRY_VALID(entry)); REQUIRE(zonep == NULL || *zonep == NULL); @@ -1254,10 +1284,11 @@ dns_acache_getentry(dns_acacheentry_t *entry, dns_zone_t **zonep, REQUIRE(nodep != NULL && *nodep == NULL); REQUIRE(fname != NULL); REQUIRE(msg != NULL); + + ACACHE_LOCK(&entry->lock, isc_rwlocktype_read); - LOCK(&entry->lock); - - entry->lastused = now; + isc_stdtime_convert32(now, &now32); + acache_storetime(entry, now32); if (entry->zone != NULL && 
zonep != NULL) dns_zone_attach(entry->zone, zonep); @@ -1284,7 +1315,8 @@ dns_acache_getentry(dns_acacheentry_t *entry, dns_zone_t **zonep, ardataset = NULL; result = dns_message_gettemprdataset(msg, &ardataset); if (result != ISC_R_SUCCESS) { - UNLOCK(&entry->lock); + ACACHE_UNLOCK(&entry->lock, + isc_rwlocktype_read); goto fail; } @@ -1300,7 +1332,7 @@ dns_acache_getentry(dns_acacheentry_t *entry, dns_zone_t **zonep, } } - UNLOCK(&entry->lock); + ACACHE_UNLOCK(&entry->lock, isc_rwlocktype_read); return (result); @@ -1337,7 +1369,7 @@ dns_acache_setentry(dns_acache_t *acache, dns_acacheentry_t *entry, REQUIRE(DNS_ACACHEENTRY_VALID(entry)); LOCK(&acache->lock); /* XXX: need to lock it here for ordering */ - LOCK(&entry->lock); + ACACHE_LOCK(&entry->lock, isc_rwlocktype_write); /* Set zone */ if (zone != NULL) @@ -1429,7 +1461,7 @@ dns_acache_setentry(dns_acache_t *acache, dns_acacheentry_t *entry, */ dns_acache_attachentry(entry, &dummy_entry); - UNLOCK(&entry->lock); + ACACHE_UNLOCK(&entry->lock, isc_rwlocktype_write); UNLOCK(&acache->lock); return (ISC_R_SUCCESS); @@ -1437,7 +1469,7 @@ dns_acache_setentry(dns_acache_t *acache, dns_acacheentry_t *entry, fail: clear_entry(acache, entry); - UNLOCK(&entry->lock); + ACACHE_UNLOCK(&entry->lock, isc_rwlocktype_write); UNLOCK(&acache->lock); return (result); @@ -1451,7 +1483,7 @@ dns_acache_cancelentry(dns_acacheentry_t *entry) { INSIST(DNS_ACACHE_VALID(acache)); LOCK(&acache->lock); - LOCK(&entry->lock); + ACACHE_LOCK(&entry->lock, isc_rwlocktype_write); /* * Release dependencies stored in this entry as much as possible. 
@@ -1465,7 +1497,7 @@ dns_acache_cancelentry(dns_acacheentry_t *entry) { entry->callback = NULL; entry->cbarg = NULL; - UNLOCK(&entry->lock); + ACACHE_UNLOCK(&entry->lock, isc_rwlocktype_write); UNLOCK(&acache->lock); } diff --git a/lib/dns/include/dns/rbt.h b/lib/dns/include/dns/rbt.h index 46ebeb3b8b..3b4bc22ef9 100644 --- a/lib/dns/include/dns/rbt.h +++ b/lib/dns/include/dns/rbt.h @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: rbt.h,v 1.62 2005/04/29 00:23:00 marka Exp $ */ +/* $Id: rbt.h,v 1.63 2005/06/04 05:32:47 jinmei Exp $ */ #ifndef DNS_RBT_H #define DNS_RBT_H 1 @@ -24,6 +24,7 @@ #include #include +#include #include @@ -42,6 +43,12 @@ ISC_LANG_BEGINDECLS #define DNS_RBTFIND_NOPREDECESSOR 0x04 /*@}*/ +#ifndef DNS_RBT_USEISCREFCOUNT +#ifdef ISC_REFCOUNT_HAVEATOMIC +#define DNS_RBT_USEISCREFCOUNT 1 +#endif +#endif + /* * These should add up to 30. */ @@ -108,7 +115,11 @@ typedef struct dns_rbtnode { unsigned int dirty:1; unsigned int wild:1; unsigned int locknum:DNS_RBT_LOCKLENGTH; +#ifndef DNS_RBT_USEISCREFCOUNT unsigned int references:DNS_RBT_REFLENGTH; +#else + isc_refcount_t references; /* note that this is not in the bitfield */ +#endif /*@}*/ } dns_rbtnode_t; @@ -211,7 +222,10 @@ typedef struct dns_rbtnodechain { /***** ***** Public interfaces. *****/ - +isc_result_t +dns_rbt_create2(isc_mem_t *mctx, void (*deleter)(void *, void *), + void (*deleter2)(void *, void *), + void *deleter_arg, dns_rbt_t **rbtp); isc_result_t dns_rbt_create(isc_mem_t *mctx, void (*deleter)(void *, void *), void *deleter_arg, dns_rbt_t **rbtp); @@ -844,6 +858,63 @@ dns_rbtnodechain_next(dns_rbtnodechain_t *chain, dns_name_t *name, *\li <something_else> Any error result from dns_name_concatenate. */ +/* + * Wrapper macros for manipulating the rbtnode reference counter: + * Since we selectively use isc_refcount_t for the reference counter of + * a rbtnode, operations on the counter depend on the actual type of it. 
+ * The following macros provide a common interface to these operations, + * hiding the back-end. The usage is the same as that of isc_refcount_xxx(). + */ +#ifdef DNS_RBT_USEISCREFCOUNT +#define dns_rbtnode_refinit(node, n) \ + do { \ + isc_refcount_init(&(node)->references, (n)); \ + } while (0) +#define dns_rbtnode_refdestroy(node) \ + do { \ + isc_refcount_destroy(&(node)->references); \ + } while (0) +#define dns_rbtnode_refcurrent(node) \ + isc_refcount_current(&(node)->references) +#define dns_rbtnode_refincrement0(node, refs) \ + do { \ + isc_refcount_increment0(&(node)->references, (refs)); \ + } while (0) +#define dns_rbtnode_refincrement(node, refs) \ + do { \ + isc_refcount_increment(&(node)->references, (refs)); \ + } while (0) +#define dns_rbtnode_refdecrement(node, refs) \ + do { \ + isc_refcount_decrement(&(node)->references, (refs)); \ + } while (0) +#else /* DNS_RBT_USEISCREFCOUNT */ +#define dns_rbtnode_refinit(node, n) ((node)->references = (n)) +#define dns_rbtnode_refdestroy(node) (REQUIRE((node)->references == 0)) +#define dns_rbtnode_refcurrent(node) ((node)->references) +#define dns_rbtnode_refincrement0(node, refs) \ + do { \ + unsigned int *_tmp = (unsigned int *)(refs); \ + (node)->references++; \ + if ((_tmp) != NULL) \ + (*_tmp) = (node)->references; \ + } while (0) +#define dns_rbtnode_refincrement(node, refs) \ + do { \ + REQUIRE((node)->references > 0); \ + (node)->references++; \ + if ((refs) != NULL) \ + (*refs) = (node)->references; \ + } while (0) +#define dns_rbtnode_refdecrement(node, refs) \ + do { \ + REQUIRE((node)->references > 0); \ + (node)->references--; \ + if ((refs) != NULL) \ + (*refs) = (node)->references; \ + } while (0) +#endif /* DNS_RBT_USEISCREFCOUNT */ + ISC_LANG_ENDDECLS #endif /* DNS_RBT_H */ diff --git a/lib/dns/rbt.c b/lib/dns/rbt.c index 18418316fb..1f3463f9ec 100644 --- a/lib/dns/rbt.c +++ b/lib/dns/rbt.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. 
*/ -/* $Id: rbt.c,v 1.132 2005/04/29 00:22:50 marka Exp $ */ +/* $Id: rbt.c,v 1.133 2005/06/04 05:32:46 jinmei Exp $ */ /*! \file */ @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -62,6 +63,7 @@ struct dns_rbt { isc_mem_t * mctx; dns_rbtnode_t * root; void (*data_deleter)(void *, void *); + void (*data_deleter2)(void *, void *); void * deleter_arg; unsigned int nodecount; unsigned int hashsize; @@ -96,7 +98,6 @@ struct dns_rbt { #define DIRTY(node) ((node)->dirty) #define WILD(node) ((node)->wild) #define LOCKNUM(node) ((node)->locknum) -#define REFS(node) ((node)->references) /*% * The variable length stuff stored after the node. @@ -220,8 +221,9 @@ dns_rbt_deletetreeflat(dns_rbt_t *rbt, unsigned int quantum, * Initialize a red/black tree of trees. */ isc_result_t -dns_rbt_create(isc_mem_t *mctx, void (*deleter)(void *, void *), - void *deleter_arg, dns_rbt_t **rbtp) +dns_rbt_create2(isc_mem_t *mctx, void (*deleter)(void *, void *), + void (*deleter2)(void *, void *), + void *deleter_arg, dns_rbt_t **rbtp) { #ifdef DNS_RBT_USEHASH isc_result_t result; @@ -231,7 +233,7 @@ dns_rbt_create(isc_mem_t *mctx, void (*deleter)(void *, void *), REQUIRE(mctx != NULL); REQUIRE(rbtp != NULL && *rbtp == NULL); - REQUIRE(deleter == NULL ? deleter_arg == NULL : 1); + REQUIRE((deleter == NULL && deleter2 == NULL) ? 
deleter_arg == NULL : 1); rbt = (dns_rbt_t *)isc_mem_get(mctx, sizeof(*rbt)); if (rbt == NULL) @@ -239,6 +241,7 @@ dns_rbt_create(isc_mem_t *mctx, void (*deleter)(void *, void *), rbt->mctx = mctx; rbt->data_deleter = deleter; + rbt->data_deleter2 = deleter2; rbt->deleter_arg = deleter_arg; rbt->root = NULL; rbt->nodecount = 0; @@ -258,6 +261,13 @@ dns_rbt_create(isc_mem_t *mctx, void (*deleter)(void *, void *), return (ISC_R_SUCCESS); } +isc_result_t +dns_rbt_create(isc_mem_t *mctx, void (*deleter)(void *, void *), + void *deleter_arg, dns_rbt_t **rbtp) +{ + return (dns_rbt_create2(mctx, deleter, NULL, deleter_arg, rbtp)); +} + /* * Deallocate a red/black tree of trees. */ @@ -1276,6 +1286,9 @@ dns_rbt_deletenode(dns_rbt_t *rbt, dns_rbtnode_t *node, isc_boolean_t recurse) if (DATA(node) != NULL && rbt->data_deleter != NULL) rbt->data_deleter(DATA(node), rbt->deleter_arg); + if (DATA(node) != NULL && rbt->data_deleter2 != NULL) + rbt->data_deleter2(node, + rbt->deleter_arg); DATA(node) = NULL; /* @@ -1307,11 +1320,14 @@ dns_rbt_deletenode(dns_rbt_t *rbt, dns_rbtnode_t *node, isc_boolean_t recurse) if (DATA(node) != NULL && rbt->data_deleter != NULL) rbt->data_deleter(DATA(node), rbt->deleter_arg); + if (DATA(node) != NULL && rbt->data_deleter2 != NULL) + rbt->data_deleter2(node, rbt->deleter_arg); unhash_node(rbt, node); #if DNS_RBT_USEMAGIC node->magic = 0; #endif + dns_rbtnode_refdestroy(node); isc_mem_put(rbt->mctx, node, NODE_SIZE(node)); rbt->nodecount--; @@ -1436,9 +1452,9 @@ create_node(isc_mem_t *mctx, dns_name_t *name, dns_rbtnode_t **nodep) { #endif LOCKNUM(node) = 0; - REFS(node) = 0; WILD(node) = 0; DIRTY(node) = 0; + dns_rbtnode_refinit(node, 0); node->find_callback = 0; MAKE_BLACK(node); @@ -2021,6 +2037,8 @@ dns_rbt_deletetree(dns_rbt_t *rbt, dns_rbtnode_t *node) { if (DATA(node) != NULL && rbt->data_deleter != NULL) rbt->data_deleter(DATA(node), rbt->deleter_arg); + if (DATA(node) != NULL && rbt->data_deleter2 != NULL) + rbt->data_deleter2(node, 
rbt->deleter_arg); unhash_node(rbt, node); #if DNS_RBT_USEMAGIC @@ -2061,6 +2079,8 @@ dns_rbt_deletetreeflat(dns_rbt_t *rbt, unsigned int quantum, if (DATA(node) != NULL && rbt->data_deleter != NULL) rbt->data_deleter(DATA(node), rbt->deleter_arg); + if (DATA(node) != NULL && rbt->data_deleter2 != NULL) + rbt->data_deleter2(node, rbt->deleter_arg); unhash_node(rbt, node); #if DNS_RBT_USEMAGIC diff --git a/lib/dns/rbtdb.c b/lib/dns/rbtdb.c index 3a96203c9e..ef477d715f 100644 --- a/lib/dns/rbtdb.c +++ b/lib/dns/rbtdb.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: rbtdb.c,v 1.206 2005/04/27 04:56:49 sra Exp $ */ +/* $Id: rbtdb.c,v 1.207 2005/06/04 05:32:46 jinmei Exp $ */ /*! \file */ @@ -102,6 +102,86 @@ typedef isc_uint32_t rbtdb_rdatatype_t; #define RBTDB_RDATATYPE_NCACHEANY \ RBTDB_RDATATYPE_VALUE(0, dns_rdatatype_any) +/* + * We use rwlock for DB lock only when DNS_RBTDB_USERWLOCK is non 0. + * Using rwlock is effective with regard to lookup performance only when + * it is implemented in an efficient way and the server receives a massive + * number of queries for non-existent names (which cause calls to getsoanode() + * below). + * Otherwise, it is generally wise to stick to the simple locking since rwlock + * would require more memory or can even make lookups slower due to its own + * overhead (when it internally calls mutex locks). + * By default, DNS_RBTDB_USERWLOCK is 0. It is only set to 1 when + * both DNS_RBTDB_ALLOWUSERWLOCK and ISC_RWLOCK_USEATOMIC are defined at + * compilation time (the latter is automatically defined when available). 
+ */ +#ifndef DNS_RBTDB_USERWLOCK +#if defined(ISC_RWLOCK_USEATOMIC) && defined(DNS_RBTDB_ALLOWUSERWLOCK) +#define DNS_RBTDB_USERWLOCK 1 +#else +#define DNS_RBTDB_USERWLOCK 0 +#endif +#endif /* DNS_RBTDB_USERWLOCK */ + +#if DNS_RBTDB_USERWLOCK +#define RBTDB_INITLOCK(l) isc_rwlock_init((l), 0, 0) +#define RBTDB_DESTROYLOCK(l) isc_rwlock_destroy(l) +#define RBTDB_LOCK(l, t) RWLOCK((l), (t)) +#define RBTDB_UNLOCK(l, t) RWUNLOCK((l), (t)) +#else +#define RBTDB_INITLOCK(l) isc_mutex_init(l) +#define RBTDB_DESTROYLOCK(l) DESTROYLOCK(l) +#define RBTDB_LOCK(l, t) LOCK(l) +#define RBTDB_UNLOCK(l, t) UNLOCK(l) +#endif + +/* + * Since node locking is sensitive to both performance and memory footprint, + * we need some trick here. If we have both high-performance rwlock and + * high performance and small-memory reference counters, we use rwlock for + * node lock and isc_refcount for node references. In this case, we don't have + * to protect the access to the counters by locks. + * Otherwise, we simply use ordinary mutex lock for node locking, and use + * simple integers as reference counters which is protected by the lock. + * In most cases, we can simply use wrapper macros such as NODE_LOCK and + * NODE_UNLOCK. In some other cases, however, we need to protect reference + * counters first and then protect other parts of a node as read-only data. + * Special additional macros, NODE_STRONGLOCK(), NODE_WEAKLOCK(), etc, are also + * provided for these special cases. When we can use the efficient backend + * routines, we should only protect the "other members" by NODE_WEAKLOCK(read). + * Otherwise, we should use NODE_STRONGLOCK() to protect the entire critical + * section including the access to the reference counter. + * Note that we cannot use NODE_LOCK()/NODE_UNLOCK() wherever the protected + * section is also protected by NODE_STRONGLOCK(). 
+ */ +#if defined(ISC_RWLOCK_USEATOMIC) && defined(DNS_RBT_USEISCREFCOUNT) +typedef isc_rwlock_t nodelock_t; + +#define NODE_INITLOCK(l) isc_rwlock_init((l), 0, 0) +#define NODE_DESTROYLOCK(l) isc_rwlock_destroy(l) +#define NODE_LOCK(l, t) RWLOCK((l), (t)) +#define NODE_UNLOCK(l, t) RWUNLOCK((l), (t)) +#define NODE_TRYUPGRADE(l) isc_rwlock_tryupgrade(l) + +#define NODE_STRONGLOCK(l) +#define NODE_STRONGUNLOCK(l) +#define NODE_WEAKLOCK(l, t) NODE_LOCK(l, t) +#define NODE_WEAKUNLOCK(l, t) NODE_UNLOCK(l, t) +#else +typedef isc_mutex_t nodelock_t; + +#define NODE_INITLOCK(l) isc_mutex_init(l) +#define NODE_DESTROYLOCK(l) DESTROYLOCK(l) +#define NODE_LOCK(l, t) LOCK(l) +#define NODE_UNLOCK(l, t) UNLOCK(l) +#define NODE_TRYUPGRADE(l) ISC_R_SUCCESS + +#define NODE_STRONGLOCK(l) LOCK(l) +#define NODE_STRONGUNLOCK(l) UNLOCK(l) +#define NODE_WEAKLOCK(l, t) +#define NODE_WEAKUNLOCK(l, t) +#endif + struct noqname { dns_name_t name; void * nsec; @@ -195,9 +275,10 @@ struct acachectl { #define DEFAULT_NODE_LOCK_COUNT 7 /*%< Should be prime. */ typedef struct { - isc_mutex_t lock; + nodelock_t lock; + /* Protected in the refcount routines. */ + isc_refcount_t references; /* Locked by lock. */ - unsigned int references; isc_boolean_t exiting; } rbtdb_nodelock_t; @@ -212,9 +293,14 @@ typedef ISC_LIST(rbtdb_changed_t) rbtdb_changedlist_t; typedef struct rbtdb_version { /* Not locked */ rbtdb_serial_t serial; + /* + * Protected in the refcount routines. + * XXXJT: should we change the lock policy based on the refcount + * performance? + */ + isc_refcount_t references; /* Locked by database lock. */ isc_boolean_t writer; - unsigned int references; isc_boolean_t commit_ok; rbtdb_changedlist_t changed_list; ISC_LINK(struct rbtdb_version) link; @@ -225,7 +311,11 @@ typedef ISC_LIST(rbtdb_version_t) rbtdb_versionlist_t; typedef struct { /* Unlocked. 
*/ dns_db_t common; +#if DNS_RBTDB_USERWLOCK + isc_rwlock_t lock; +#else isc_mutex_t lock; +#endif isc_rwlock_t tree_lock; unsigned int node_lock_count; rbtdb_nodelock_t * node_locks; @@ -247,6 +337,9 @@ typedef struct { /* Locked by tree_lock. */ dns_rbt_t * tree; isc_boolean_t secure; + + /* Unlocked */ + isc_mem_t ** nodemctxs; } dns_rbtdb_t; #define RBTDB_ATTR_LOADED 0x01 @@ -456,12 +549,20 @@ free_rbtdb(dns_rbtdb_t *rbtdb, isc_boolean_t log, isc_event_t *event) { isc_result_t result; char buf[DNS_NAME_FORMATSIZE]; - REQUIRE(EMPTY(rbtdb->open_versions)); + REQUIRE(rbtdb->current_version != NULL || EMPTY(rbtdb->open_versions)); REQUIRE(rbtdb->future_version == NULL); - if (rbtdb->current_version != NULL) + if (rbtdb->current_version != NULL) { + unsigned int refs; + + isc_refcount_decrement(&rbtdb->current_version->references, + &refs); + INSIST(refs == 0); + UNLINK(rbtdb->open_versions, rbtdb->current_version, link); + isc_refcount_destroy(&rbtdb->current_version->references); isc_mem_put(rbtdb->common.mctx, rbtdb->current_version, sizeof(rbtdb_version_t)); + } again: if (rbtdb->tree != NULL) { result = dns_rbt_destroy2(&rbtdb->tree, @@ -496,15 +597,23 @@ free_rbtdb(dns_rbtdb_t *rbtdb, isc_boolean_t log, isc_event_t *event) { } if (dns_name_dynamic(&rbtdb->common.origin)) dns_name_free(&rbtdb->common.origin, rbtdb->common.mctx); - for (i = 0; i < rbtdb->node_lock_count; i++) - DESTROYLOCK(&rbtdb->node_locks[i].lock); + for (i = 0; i < rbtdb->node_lock_count; i++) { + isc_refcount_destroy(&rbtdb->node_locks[i].references); + NODE_DESTROYLOCK(&rbtdb->node_locks[i].lock); + + if (rbtdb->nodemctxs != NULL) + isc_mem_detach(&rbtdb->nodemctxs[i]); + } + if (rbtdb->nodemctxs != NULL) + isc_mem_put(rbtdb->common.mctx, rbtdb->nodemctxs, + sizeof(isc_mem_t *) * rbtdb->node_lock_count); isc_mem_put(rbtdb->common.mctx, rbtdb->node_locks, rbtdb->node_lock_count * sizeof(rbtdb_nodelock_t)); isc_rwlock_destroy(&rbtdb->tree_lock); isc_refcount_destroy(&rbtdb->references); if 
(rbtdb->task != NULL) isc_task_detach(&rbtdb->task); - DESTROYLOCK(&rbtdb->lock); + RBTDB_DESTROYLOCK(&rbtdb->lock); rbtdb->common.magic = 0; rbtdb->common.impmagic = 0; ondest = rbtdb->common.ondest; @@ -530,19 +639,21 @@ maybe_free_rbtdb(dns_rbtdb_t *rbtdb) { * may be nodes in use. */ for (i = 0; i < rbtdb->node_lock_count; i++) { - LOCK(&rbtdb->node_locks[i].lock); + NODE_LOCK(&rbtdb->node_locks[i].lock, isc_rwlocktype_write); rbtdb->node_locks[i].exiting = ISC_TRUE; - if (rbtdb->node_locks[i].references == 0) + NODE_UNLOCK(&rbtdb->node_locks[i].lock, isc_rwlocktype_write); + if (isc_refcount_current(&rbtdb->node_locks[i].references) + == 0) { inactive++; - UNLOCK(&rbtdb->node_locks[i].lock); + } } if (inactive != 0) { - LOCK(&rbtdb->lock); + RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write); rbtdb->active -= inactive; if (rbtdb->active == 0) want_free = ISC_TRUE; - UNLOCK(&rbtdb->lock); + RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write); if (want_free) { char buf[DNS_NAME_FORMATSIZE]; if (dns_name_dynamic(&rbtdb->common.origin)) @@ -577,15 +688,17 @@ static void currentversion(dns_db_t *db, dns_dbversion_t **versionp) { dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db; rbtdb_version_t *version; + unsigned int refs; REQUIRE(VALID_RBTDB(rbtdb)); - LOCK(&rbtdb->lock); version = rbtdb->current_version; - if (version->references == 0) + isc_refcount_increment(&version->references, &refs); + if (refs == 1) { + RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write); PREPEND(rbtdb->open_versions, version, link); - version->references++; - UNLOCK(&rbtdb->lock); + RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write); + } *versionp = (dns_dbversion_t *)version; } @@ -600,7 +713,7 @@ allocate_version(isc_mem_t *mctx, rbtdb_serial_t serial, if (version == NULL) return (NULL); version->serial = serial; - version->references = references; + isc_refcount_init(&version->references, references); version->writer = writer; version->commit_ok = ISC_FALSE; ISC_LIST_INIT(version->changed_list); @@ -618,7 
+731,7 @@ newversion(dns_db_t *db, dns_dbversion_t **versionp) { REQUIRE(versionp != NULL && *versionp == NULL); REQUIRE(rbtdb->future_version == NULL); - LOCK(&rbtdb->lock); + RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write); RUNTIME_CHECK(rbtdb->next_serial != 0); /* XXX Error? */ version = allocate_version(rbtdb->common.mctx, rbtdb->next_serial, 1, ISC_TRUE); @@ -627,7 +740,7 @@ newversion(dns_db_t *db, dns_dbversion_t **versionp) { rbtdb->next_serial++; rbtdb->future_version = version; } - UNLOCK(&rbtdb->lock); + RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write); if (version == NULL) return (ISC_R_NOMEMORY); @@ -643,16 +756,12 @@ attachversion(dns_db_t *db, dns_dbversion_t *source, { dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db; rbtdb_version_t *rbtversion = source; + unsigned int refs; REQUIRE(VALID_RBTDB(rbtdb)); - LOCK(&rbtdb->lock); - - INSIST(rbtversion->references > 0); - rbtversion->references++; - INSIST(rbtversion->references != 0); - - UNLOCK(&rbtdb->lock); + isc_refcount_increment(&rbtversion->references, &refs); + INSIST(refs > 1); *targetp = rbtversion; } @@ -662,28 +771,29 @@ add_changed(dns_rbtdb_t *rbtdb, rbtdb_version_t *version, dns_rbtnode_t *node) { rbtdb_changed_t *changed; + unsigned int refs; /* - * Caller must be holding the node lock. + * Caller must be holding the node lock if its reference must be + * protected by the lock. 
*/ changed = isc_mem_get(rbtdb->common.mctx, sizeof(*changed)); - LOCK(&rbtdb->lock); + RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write); REQUIRE(version->writer); if (changed != NULL) { - INSIST(node->references > 0); - node->references++; - INSIST(node->references != 0); + dns_rbtnode_refincrement0(node, &refs); + INSIST(refs > 0); changed->node = node; changed->dirty = ISC_FALSE; ISC_LIST_INITANDAPPEND(version->changed_list, changed, link); } else version->commit_ok = ISC_FALSE; - UNLOCK(&rbtdb->lock); + RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write); return (changed); } @@ -785,11 +895,13 @@ rollback_node(dns_rbtnode_t *node, rbtdb_serial_t serial) { static inline void clean_cache_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) { rdatasetheader_t *current, *dcurrent, *top_prev, *top_next, *down_next; - isc_mem_t *mctx = rbtdb->common.mctx; + isc_mem_t *mctx; /* * Caller must be holding the node lock. */ + REQUIRE(rbtdb->nodemctxs != NULL); + mctx = rbtdb->nodemctxs[node->locknum]; top_prev = NULL; for (current = node->data; current != NULL; current = top_next) { @@ -945,16 +1057,29 @@ clean_zone_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, node->dirty = 0; } +/* + * Caller must be holding the node lock if its reference must be protected + * by the lock. + */ static inline void new_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) { - if (node->references == 0) { - rbtdb->node_locks[node->locknum].references++; - INSIST(rbtdb->node_locks[node->locknum].references != 0); + unsigned int lockrefs, noderefs; + isc_refcount_t *lockref; + + dns_rbtnode_refincrement0(node, &noderefs); + if (noderefs == 1) { /* this is the first reference to the node */ + lockref = &rbtdb->node_locks[node->locknum].references; + isc_refcount_increment0(lockref, &lockrefs); + INSIST(lockrefs != 0); } - node->references++; - INSIST(node->references != 0); + INSIST(noderefs != 0); } + +/* + * Caller must be holding the node lock if its reference must be protected + * by the lock. 
+ */ static void no_references(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, rbtdb_serial_t least_serial, isc_rwlocktype_t lock) @@ -962,13 +1087,30 @@ no_references(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, isc_result_t result; isc_boolean_t write_locked; unsigned int locknum; + unsigned int refs; /* - * Caller must be holding the node lock. + * We cannot request the node reference be 0 at the moment, since + * the reference counter can atomically be modified without a lock. + * It should still be safe unless we actually try to delete the node, + * at which point the operation is properly protected by locking. */ - REQUIRE(node->references == 0); + locknum = node->locknum; + NODE_WEAKLOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_read); + if (!node->dirty && (node->data != NULL || node->down != NULL)) { + /* easy and typical case first, in an efficient way. */ + isc_refcount_decrement(&rbtdb->node_locks[locknum].references, + &refs); + INSIST((int)refs >= 0); + NODE_WEAKUNLOCK(&rbtdb->node_locks[locknum].lock, + isc_rwlocktype_read); + return; + } + NODE_WEAKUNLOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_read); + + NODE_WEAKLOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write); if (node->dirty) { if (IS_CACHE(rbtdb)) clean_cache_node(rbtdb, node); @@ -978,24 +1120,26 @@ no_references(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, * Caller doesn't know the least serial. * Get it. */ - LOCK(&rbtdb->lock); + RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read); least_serial = rbtdb->least_serial; - UNLOCK(&rbtdb->lock); + RBTDB_UNLOCK(&rbtdb->lock, + isc_rwlocktype_read); } clean_zone_node(rbtdb, node, least_serial); } } - locknum = node->locknum; - - INSIST(rbtdb->node_locks[locknum].references > 0); - rbtdb->node_locks[locknum].references--; + isc_refcount_decrement(&rbtdb->node_locks[locknum].references, &refs); + INSIST((int)refs >= 0); /* * XXXDCL should this only be done for cache zones? 
*/ - if (node->data != NULL || node->down != NULL) + if (node->data != NULL || node->down != NULL) { + NODE_WEAKUNLOCK(&rbtdb->node_locks[locknum].lock, + isc_rwlocktype_write); return; + } /* * XXXDCL need to add a deferred delete method for ISC_R_LOCKBUSY. @@ -1019,6 +1163,15 @@ no_references(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, write_locked = ISC_TRUE; if (write_locked) { + /* + * We are now ready for deleting the node. The node and tree + * locks must ensure there be no other users. (Note that + * dns_rbt_findnode() could find the node to be deleted while + * we are in this function. However, the tree lock would + * prevent us from entering this section in that case.) + */ + INSIST(dns_rbtnode_refcurrent(node) == 0); + if (isc_log_wouldlog(dns_lctx, ISC_LOG_DEBUG(1))) { char printname[DNS_NAME_FORMATSIZE]; @@ -1038,6 +1191,9 @@ no_references(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, isc_result_totext(result)); } + NODE_WEAKUNLOCK(&rbtdb->node_locks[locknum].lock, + isc_rwlocktype_write); + /* * Relock a read lock, or unlock the write lock if no lock was held. 
*/ @@ -1103,7 +1259,7 @@ closeversion(dns_db_t *db, dns_dbversion_t **versionp, isc_boolean_t commit) { rbtdb_changed_t *changed, *next_changed; rbtdb_serial_t serial, least_serial; dns_rbtnode_t *rbtnode; - isc_mutex_t *lock; + unsigned int refs; REQUIRE(VALID_RBTDB(rbtdb)); version = (rbtdb_version_t *)*versionp; @@ -1111,113 +1267,146 @@ closeversion(dns_db_t *db, dns_dbversion_t **versionp, isc_boolean_t commit) { cleanup_version = NULL; ISC_LIST_INIT(cleanup_list); - LOCK(&rbtdb->lock); - INSIST(version->references > 0); - INSIST(!version->writer || !(commit && version->references > 1)); - version->references--; + isc_refcount_decrement(&version->references, &refs); + if (refs > 0) { /* typical and easy case first */ + if (commit) { + RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read); + INSIST(!version->writer); + RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read); + } + goto end; + } + + RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write); serial = version->serial; - if (version->references == 0) { - if (version->writer) { - if (commit) { - INSIST(version->commit_ok); - INSIST(version == rbtdb->future_version); - if (EMPTY(rbtdb->open_versions)) { - /* - * We're going to become the least open - * version. - */ - make_least_version(rbtdb, version, - &cleanup_list); - } else { - /* - * Some other open version is the - * least version. We can't cleanup - * records that were changed in this - * version because the older versions - * may still be in use by an open - * version. - * - * We can, however, discard the - * changed records for things that - * we've added that didn't exist in - * prior versions. - */ - cleanup_nondirty(version, - &cleanup_list); - } + if (version->writer) { + if (commit) { + unsigned cur_ref; + rbtdb_version_t *cur_version; + + INSIST(version->commit_ok); + INSIST(version == rbtdb->future_version); + /* + * The current version is going to be replaced. 
+ * Release the (likely last) reference to it from the + * DB itself and unlink it from the open list. + */ + cur_version = rbtdb->current_version; + isc_refcount_decrement(&cur_version->references, + &cur_ref); + if (cur_ref == 0) { + if (cur_version->serial == rbtdb->least_serial) + INSIST(EMPTY(cur_version->changed_list)); + UNLINK(rbtdb->open_versions, + cur_version, link); + } + if (EMPTY(rbtdb->open_versions)) { /* - * If the (soon to be former) current version - * isn't being used by anyone, we can clean - * it up. + * We're going to become the least open + * version. */ - if (rbtdb->current_version->references == 0) { - cleanup_version = - rbtdb->current_version; - APPENDLIST(version->changed_list, - cleanup_version->changed_list, - link); - } - /* - * Become the current version. - */ - version->writer = ISC_FALSE; - rbtdb->current_version = version; - rbtdb->current_serial = version->serial; - rbtdb->future_version = NULL; + make_least_version(rbtdb, version, + &cleanup_list); } else { /* - * We're rolling back this transaction. + * Some other open version is the + * least version. We can't cleanup + * records that were changed in this + * version because the older versions + * may still be in use by an open + * version. + * + * We can, however, discard the + * changed records for things that + * we've added that didn't exist in + * prior versions. */ - cleanup_list = version->changed_list; - ISC_LIST_INIT(version->changed_list); - rollback = ISC_TRUE; - cleanup_version = version; - rbtdb->future_version = NULL; + cleanup_nondirty(version, &cleanup_list); } + /* + * If the (soon to be former) current version + * isn't being used by anyone, we can clean + * it up. + */ + if (cur_ref == 0) { + cleanup_version = cur_version; + APPENDLIST(version->changed_list, + cleanup_version->changed_list, + link); + } + /* + * Become the current version. 
+ */ + version->writer = ISC_FALSE; + rbtdb->current_version = version; + rbtdb->current_serial = version->serial; + rbtdb->future_version = NULL; + + /* + * Keep the current version in the open list, and + * gain a reference for the DB itself (see the DB + * creation function below). This must be the only + * case where we need to increment the counter from + * zero and need to use isc_refcount_increment0(). + */ + isc_refcount_increment0(&version->references, + &cur_ref); + INSIST(cur_ref == 1); + PREPEND(rbtdb->open_versions, + rbtdb->current_version, link); } else { - if (version != rbtdb->current_version) { - /* - * There are no external or internal references - * to this version and it can be cleaned up. - */ - cleanup_version = version; - - /* - * Find the version with the least serial - * number greater than ours. - */ - least_greater = PREV(version, link); - if (least_greater == NULL) - least_greater = rbtdb->current_version; - - INSIST(version->serial < least_greater->serial); - /* - * Is this the least open version? - */ - if (version->serial == rbtdb->least_serial) { - /* - * Yes. Install the new least open - * version. - */ - make_least_version(rbtdb, - least_greater, - &cleanup_list); - } else { - /* - * Add any unexecuted cleanups to - * those of the least greater version. - */ - APPENDLIST(least_greater->changed_list, - version->changed_list, - link); - } - } else if (version->serial == rbtdb->least_serial) - INSIST(EMPTY(version->changed_list)); - UNLINK(rbtdb->open_versions, version, link); + /* + * We're rolling back this transaction. + */ + cleanup_list = version->changed_list; + ISC_LIST_INIT(version->changed_list); + rollback = ISC_TRUE; + cleanup_version = version; + rbtdb->future_version = NULL; } + } else { + if (version != rbtdb->current_version) { + /* + * There are no external or internal references + * to this version and it can be cleaned up. 
+ */ + cleanup_version = version; + + /* + * Find the version with the least serial + * number greater than ours. + */ + least_greater = PREV(version, link); + if (least_greater == NULL) + least_greater = rbtdb->current_version; + + INSIST(version->serial < least_greater->serial); + /* + * Is this the least open version? + */ + if (version->serial == rbtdb->least_serial) { + /* + * Yes. Install the new least open + * version. + */ + make_least_version(rbtdb, + least_greater, + &cleanup_list); + } else { + /* + * Add any unexecuted cleanups to + * those of the least greater version. + */ + APPENDLIST(least_greater->changed_list, + version->changed_list, + link); + } + } else if (version->serial == rbtdb->least_serial) + INSIST(EMPTY(version->changed_list)); + UNLINK(rbtdb->open_versions, version, link); } least_serial = rbtdb->least_serial; - UNLOCK(&rbtdb->lock); + RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write); if (cleanup_version != NULL) { INSIST(EMPTY(cleanup_version->changed_list)); @@ -1229,28 +1418,35 @@ closeversion(dns_db_t *db, dns_dbversion_t **versionp, isc_boolean_t commit) { for (changed = HEAD(cleanup_list); changed != NULL; changed = next_changed) { + nodelock_t *lock; + unsigned int refs; + next_changed = NEXT(changed, link); rbtnode = changed->node; lock = &rbtdb->node_locks[rbtnode->locknum].lock; - LOCK(lock); + NODE_STRONGLOCK(lock); - INSIST(rbtnode->references > 0); - rbtnode->references--; + dns_rbtnode_refdecrement(rbtnode, &refs); + INSIST((int)refs >= 0); + + NODE_WEAKLOCK(lock, isc_rwlocktype_write); if (rollback) rollback_node(rbtnode, serial); + NODE_WEAKUNLOCK(lock, isc_rwlocktype_write); - if (rbtnode->references == 0) + if (refs == 0) no_references(rbtdb, rbtnode, least_serial, isc_rwlocktype_none); - UNLOCK(lock); + NODE_STRONGUNLOCK(lock); isc_mem_put(rbtdb->common.mctx, changed, sizeof(*changed)); } } + end: *versionp = NULL; } @@ -1323,7 +1519,6 @@ findnode(dns_db_t *db, dns_name_t *name, isc_boolean_t create, dns_rbtdb_t 
*rbtdb = (dns_rbtdb_t *)db; dns_rbtnode_t *node = NULL; dns_name_t nodename; - unsigned int locknum; isc_result_t result; isc_rwlocktype_t locktype = isc_rwlocktype_read; @@ -1370,10 +1565,7 @@ findnode(dns_db_t *db, dns_name_t *name, isc_boolean_t create, return (result); } } - locknum = node->locknum; - LOCK(&rbtdb->node_locks[locknum].lock); new_reference(rbtdb, node); - UNLOCK(&rbtdb->node_locks[locknum].lock); RWUNLOCK(&rbtdb->tree_lock, locktype); *nodep = (dns_dbnode_t *)node; @@ -1402,7 +1594,8 @@ zone_zonecut_callback(dns_rbtnode_t *node, dns_name_t *name, void *arg) { result = DNS_R_CONTINUE; onode = search->rbtdb->origin_node; - LOCK(&(search->rbtdb->node_locks[node->locknum].lock)); + NODE_LOCK(&(search->rbtdb->node_locks[node->locknum].lock), + isc_rwlocktype_read); /* * Look for an NS or DNAME rdataset active in our version. @@ -1513,7 +1706,8 @@ zone_zonecut_callback(dns_rbtnode_t *node, dns_name_t *name, void *arg) { search->wild = ISC_TRUE; } - UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock)); + NODE_UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock), + isc_rwlocktype_read); return (result); } @@ -1526,7 +1720,11 @@ bind_rdataset(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, unsigned char *raw; /* - * Caller must be holding the node lock. + * Caller must be holding the node reader lock. + * XXXJT: technically, we need a writer lock, since we'll increment + * the header count below. However, since the actual counter value + * doesn't matter, we prioritize performance here. (We may want to + * use atomic increment when available). 
*/ if (rdataset == NULL) @@ -1606,14 +1804,16 @@ setup_delegation(rbtdb_search_t *search, dns_dbnode_t **nodep, search->need_cleanup = ISC_FALSE; } if (rdataset != NULL) { - LOCK(&(search->rbtdb->node_locks[node->locknum].lock)); + NODE_LOCK(&(search->rbtdb->node_locks[node->locknum].lock), + isc_rwlocktype_read); bind_rdataset(search->rbtdb, node, search->zonecut_rdataset, search->now, rdataset); if (sigrdataset != NULL && search->zonecut_sigrdataset != NULL) bind_rdataset(search->rbtdb, node, search->zonecut_sigrdataset, search->now, sigrdataset); - UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock)); + NODE_UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock), + isc_rwlocktype_read); } if (type == dns_rdatatype_dname) @@ -1708,7 +1908,8 @@ activeempty(rbtdb_search_t *search, dns_rbtnodechain_t *chain, origin, &node); if (result != ISC_R_SUCCESS) break; - LOCK(&(rbtdb->node_locks[node->locknum].lock)); + NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock), + isc_rwlocktype_read); for (header = node->data; header != NULL; header = header->next) { @@ -1716,7 +1917,8 @@ activeempty(rbtdb_search_t *search, dns_rbtnodechain_t *chain, !IGNORE(header) && EXISTS(header)) break; } - UNLOCK(&(rbtdb->node_locks[node->locknum].lock)); + NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock), + isc_rwlocktype_read); if (header != NULL) break; result = dns_rbtnodechain_next(chain, NULL, NULL); @@ -1773,7 +1975,8 @@ activeemtpynode(rbtdb_search_t *search, dns_name_t *qname, dns_name_t *wname) { origin, &node); if (result != ISC_R_SUCCESS) break; - LOCK(&(rbtdb->node_locks[node->locknum].lock)); + NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock), + isc_rwlocktype_read); for (header = node->data; header != NULL; header = header->next) { @@ -1781,7 +1984,8 @@ activeemtpynode(rbtdb_search_t *search, dns_name_t *qname, dns_name_t *wname) { !IGNORE(header) && EXISTS(header)) break; } - UNLOCK(&(rbtdb->node_locks[node->locknum].lock)); + 
NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock), + isc_rwlocktype_read); if (header != NULL) break; result = dns_rbtnodechain_prev(&chain, NULL, NULL); @@ -1798,7 +2002,8 @@ activeemtpynode(rbtdb_search_t *search, dns_name_t *qname, dns_name_t *wname) { origin, &node); if (result != ISC_R_SUCCESS) break; - LOCK(&(rbtdb->node_locks[node->locknum].lock)); + NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock), + isc_rwlocktype_read); for (header = node->data; header != NULL; header = header->next) { @@ -1806,7 +2011,8 @@ activeemtpynode(rbtdb_search_t *search, dns_name_t *qname, dns_name_t *wname) { !IGNORE(header) && EXISTS(header)) break; } - UNLOCK(&(rbtdb->node_locks[node->locknum].lock)); + NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock), + isc_rwlocktype_read); if (header != NULL) break; result = dns_rbtnodechain_next(&chain, NULL, NULL); @@ -1874,7 +2080,8 @@ find_wildcard(rbtdb_search_t *search, dns_rbtnode_t **nodep, done = ISC_FALSE; node = *nodep; do { - LOCK(&(rbtdb->node_locks[node->locknum].lock)); + NODE_LOCK(&(rbtdb->node_locks[node->locknum].lock), + isc_rwlocktype_read); /* * First we try to figure out if this node is active in @@ -1899,7 +2106,8 @@ find_wildcard(rbtdb_search_t *search, dns_rbtnode_t **nodep, else wild = ISC_FALSE; - UNLOCK(&(rbtdb->node_locks[node->locknum].lock)); + NODE_UNLOCK(&(rbtdb->node_locks[node->locknum].lock), + isc_rwlocktype_read); if (wild) { /* @@ -1932,33 +2140,38 @@ find_wildcard(rbtdb_search_t *search, dns_rbtnode_t **nodep, DNS_RBTFIND_EMPTYDATA, NULL, NULL); if (result == ISC_R_SUCCESS) { - /* - * We have found the wildcard node. If it - * is active in the search's version, we're - * done. 
- */ - LOCK(&(rbtdb->node_locks[wnode->locknum].lock)); - for (header = wnode->data; - header != NULL; - header = header->next) { - if (header->serial <= search->serial && - !IGNORE(header) && EXISTS(header)) - break; - } - UNLOCK(&(rbtdb->node_locks[wnode->locknum].lock)); - if (header != NULL || - activeempty(search, &wchain, wname)) { - if (activeemtpynode(search, qname, wname)) + nodelock_t *lock; + + /* + * We have found the wildcard node. If it + * is active in the search's version, we're + * done. + */ + lock = &rbtdb->node_locks[wnode->locknum].lock; + NODE_LOCK(lock, isc_rwlocktype_read); + for (header = wnode->data; + header != NULL; + header = header->next) { + if (header->serial <= search->serial && + !IGNORE(header) && EXISTS(header)) + break; + } + NODE_UNLOCK(lock, isc_rwlocktype_read); + if (header != NULL || + activeempty(search, &wchain, wname)) { + if (activeemtpynode(search, qname, + wname)) { return (ISC_R_NOTFOUND); - /* - * The wildcard node is active! - * - * Note: result is still ISC_R_SUCCESS - * so we don't have to set it. - */ - *nodep = wnode; - break; - } + } + /* + * The wildcard node is active! + * + * Note: result is still ISC_R_SUCCESS + * so we don't have to set it. 
+ */ + *nodep = wnode; + break; + } } else if (result != ISC_R_NOTFOUND && result != DNS_R_PARTIALMATCH) { /* @@ -2010,7 +2223,8 @@ find_closest_nsec(rbtdb_search_t *search, dns_dbnode_t **nodep, origin, &node); if (result != ISC_R_SUCCESS) return (result); - LOCK(&(search->rbtdb->node_locks[node->locknum].lock)); + NODE_LOCK(&(search->rbtdb->node_locks[node->locknum].lock), + isc_rwlocktype_read); found = NULL; foundsig = NULL; empty_node = ISC_TRUE; @@ -2110,7 +2324,8 @@ find_closest_nsec(rbtdb_search_t *search, dns_dbnode_t **nodep, result = dns_rbtnodechain_prev(&search->chain, NULL, NULL); } - UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock)); + NODE_UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock), + isc_rwlocktype_read); } while (empty_node && result == ISC_R_SUCCESS); /* @@ -2139,12 +2354,12 @@ zone_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, isc_boolean_t at_zonecut = ISC_FALSE; isc_boolean_t wild; isc_boolean_t empty_node; - isc_mutex_t *lock; rdatasetheader_t *header, *header_next, *found, *nsecheader; rdatasetheader_t *foundsig, *cnamesig, *nsecsig; rbtdb_rdatatype_t sigtype; isc_boolean_t active; dns_rbtnodechain_t chain; + nodelock_t *lock; search.rbtdb = (dns_rbtdb_t *)db; @@ -2279,7 +2494,8 @@ zone_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, * We now go looking for rdata... */ - LOCK(&(search.rbtdb->node_locks[node->locknum].lock)); + NODE_LOCK(&(search.rbtdb->node_locks[node->locknum].lock), + isc_rwlocktype_read); found = NULL; foundsig = NULL; @@ -2421,7 +2637,8 @@ zone_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, * we really have a partial match. 
*/ if (!wild) { - UNLOCK(&(search.rbtdb->node_locks[node->locknum].lock)); + lock = &search.rbtdb->node_locks[node->locknum].lock; + NODE_UNLOCK(lock, isc_rwlocktype_read); goto partial_match; } } @@ -2431,16 +2648,17 @@ zone_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, */ if (found == NULL) { if (search.zonecut != NULL) { - /* - * We were trying to find glue at a node beneath a - * zone cut, but didn't. - * - * Return the delegation. - */ - UNLOCK(&(search.rbtdb->node_locks[node->locknum].lock)); - result = setup_delegation(&search, nodep, foundname, - rdataset, sigrdataset); - goto tree_exit; + /* + * We were trying to find glue at a node beneath a + * zone cut, but didn't. + * + * Return the delegation. + */ + lock = &search.rbtdb->node_locks[node->locknum].lock; + NODE_UNLOCK(lock, isc_rwlocktype_read); + result = setup_delegation(&search, nodep, foundname, + rdataset, sigrdataset); + goto tree_exit; } /* * The desired type doesn't exist. @@ -2456,11 +2674,12 @@ zone_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, result = DNS_R_BADDB; goto node_exit; } - - UNLOCK(&(search.rbtdb->node_locks[node->locknum].lock)); + + lock = &search.rbtdb->node_locks[node->locknum].lock; + NODE_UNLOCK(lock, isc_rwlocktype_read); result = find_closest_nsec(&search, nodep, foundname, - rdataset, sigrdataset, - search.rbtdb->secure); + rdataset, sigrdataset, + search.rbtdb->secure); if (result == ISC_R_SUCCESS) result = DNS_R_EMPTYWILD; goto tree_exit; @@ -2532,9 +2751,10 @@ zone_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, if (result == DNS_R_GLUE && (search.options & DNS_DBFIND_VALIDATEGLUE) != 0 && !valid_glue(&search, foundname, type, node)) { - UNLOCK(&(search.rbtdb->node_locks[node->locknum].lock)); - result = setup_delegation(&search, nodep, foundname, - rdataset, sigrdataset); + lock = &search.rbtdb->node_locks[node->locknum].lock; + NODE_UNLOCK(lock, isc_rwlocktype_read); + result = setup_delegation(&search, nodep, 
foundname, + rdataset, sigrdataset); goto tree_exit; } } else { @@ -2563,7 +2783,8 @@ zone_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, foundname->attributes |= DNS_NAMEATTR_WILDCARD; node_exit: - UNLOCK(&(search.rbtdb->node_locks[node->locknum].lock)); + NODE_UNLOCK(&(search.rbtdb->node_locks[node->locknum].lock), + isc_rwlocktype_read); tree_exit: RWUNLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read); @@ -2573,17 +2794,20 @@ zone_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, * let go of it. */ if (search.need_cleanup) { + unsigned int refs; + node = search.zonecut; lock = &(search.rbtdb->node_locks[node->locknum].lock); - LOCK(lock); - INSIST(node->references > 0); - node->references--; - if (node->references == 0) + NODE_STRONGLOCK(lock); + + dns_rbtnode_refdecrement(node, &refs); + INSIST((int)refs >= 0); + if (refs == 0) no_references(search.rbtdb, node, 0, isc_rwlocktype_none); - UNLOCK(lock); + NODE_STRONGUNLOCK(lock); } if (close_version) @@ -2620,6 +2844,8 @@ cache_zonecut_callback(dns_rbtnode_t *node, dns_name_t *name, void *arg) { rdatasetheader_t *header, *header_prev, *header_next; rdatasetheader_t *dname_header, *sigdname_header; isc_result_t result; + nodelock_t *lock; + isc_rwlocktype_t locktype; /* XXX comment */ @@ -2630,7 +2856,9 @@ cache_zonecut_callback(dns_rbtnode_t *node, dns_name_t *name, void *arg) { */ UNUSED(name); - LOCK(&(search->rbtdb->node_locks[node->locknum].lock)); + lock = &(search->rbtdb->node_locks[node->locknum].lock); + locktype = isc_rwlocktype_read; + NODE_LOCK(lock, locktype); /* * Look for a DNAME or RRSIG DNAME rdataset. @@ -2648,21 +2876,41 @@ cache_zonecut_callback(dns_rbtnode_t *node, dns_name_t *name, void *arg) { * the node as dirty, so it will get cleaned * up later. 
*/ - if (node->references == 0) { - INSIST(header->down == NULL); - if (header_prev != NULL) - header_prev->next = - header->next; - else - node->data = header->next; - free_rdataset(search->rbtdb->common.mctx, - header); - } else { - header->attributes |= - RDATASET_ATTR_STALE; - node->dirty = 1; + if (locktype == isc_rwlocktype_write || + NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS) { + /* + * We update the node's status only when we + * can get write access; otherwise, we leave + * others to this work. Periodical cleaning + * will eventually take the job as the last + * resort. + * We won't downgrade the lock, since other + * rdatasets are probably stale, too. + */ + locktype = isc_rwlocktype_write; + + if (dns_rbtnode_refcurrent(node) == 0) { + isc_mem_t *mctx; + + INSIST(header->down == NULL); + if (header_prev != NULL) + header_prev->next = + header->next; + else + node->data = header->next; + INSIST(search->rbtdb->nodemctxs + != NULL); + mctx = search->rbtdb->nodemctxs[node->locknum]; + free_rdataset(mctx, + header); + } else { + header->attributes |= + RDATASET_ATTR_STALE; + node->dirty = 1; + header_prev = header; + } + } else header_prev = header; - } } else if (header->type == dns_rdatatype_dname && EXISTS(header)) { dname_header = header; @@ -2691,7 +2939,7 @@ cache_zonecut_callback(dns_rbtnode_t *node, dns_name_t *name, void *arg) { } else result = DNS_R_CONTINUE; - UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock)); + NODE_UNLOCK(lock, locktype); return (result); } @@ -2709,6 +2957,8 @@ find_deepest_zonecut(rbtdb_search_t *search, dns_rbtnode_t *node, dns_name_t name; dns_rbtdb_t *rbtdb; isc_boolean_t done; + nodelock_t *lock; + isc_rwlocktype_t locktype; /* * Caller must be holding the tree lock. 
@@ -2717,8 +2967,14 @@ find_deepest_zonecut(rbtdb_search_t *search, dns_rbtnode_t *node, rbtdb = search->rbtdb; i = search->chain.level_matches; done = ISC_FALSE; do { - LOCK(&(rbtdb->node_locks[node->locknum].lock)); + locktype = isc_rwlocktype_read; + /* + * Derive the lock from 'node' on every pass, since 'node' may + * belong to a different lock bucket than on the previous pass. + */ + lock = &rbtdb->node_locks[node->locknum].lock; + NODE_LOCK(lock, locktype); /* * Look for NS and RRSIG NS rdatasets. @@ -2738,21 +2994,41 @@ find_deepest_zonecut(rbtdb_search_t *search, dns_rbtnode_t *node, * the node as dirty, so it will get cleaned * up later. */ - if (node->references == 0) { - INSIST(header->down == NULL); - if (header_prev != NULL) - header_prev->next = - header->next; - else - node->data = header->next; - free_rdataset(rbtdb->common.mctx, - header); - } else { - header->attributes |= - RDATASET_ATTR_STALE; - node->dirty = 1; + if (locktype == isc_rwlocktype_write || + NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS) { + /* + * We update the node's status only + * when we can get write access. + */ + locktype = isc_rwlocktype_write; + + if (dns_rbtnode_refcurrent(node) + == 0) { + isc_mem_t *mctx; + + INSIST(header->down == NULL); + if (header_prev != NULL) + header_prev->next = + header->next; + else + node->data = + header->next; + + if (search->rbtdb->nodemctxs != + NULL) + mctx = search->rbtdb->nodemctxs[node->locknum]; + else + mctx = search->rbtdb->common.mctx; + free_rdataset(mctx, + header); + } else { + header->attributes |= + RDATASET_ATTR_STALE; + node->dirty = 1; + header_prev = header; + } + } else header_prev = header; - } } else if (EXISTS(header)) { /* * We've found an extant rdataset.
See if @@ -2816,7 +3088,7 @@ find_deepest_zonecut(rbtdb_search_t *search, dns_rbtnode_t *node, } node_exit: - UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock)); + NODE_UNLOCK(lock, locktype); if (found == NULL && i > 0) { i--; @@ -2842,6 +3114,8 @@ find_coveringnsec(rbtdb_search_t *search, dns_dbnode_t **nodep, dns_fixedname_t fname, forigin; dns_name_t *name, *origin; rbtdb_rdatatype_t matchtype, sigmatchtype, nsectype; + nodelock_t *lock; + isc_rwlocktype_t locktype; matchtype = RBTDB_RDATATYPE_VALUE(dns_rdatatype_nsec, 0); nsectype = RBTDB_RDATATYPE_VALUE(0, dns_rdatatype_nsec); @@ -2858,7 +3132,9 @@ find_coveringnsec(rbtdb_search_t *search, dns_dbnode_t **nodep, origin, &node); if (result != ISC_R_SUCCESS) return (result); - LOCK(&(search->rbtdb->node_locks[node->locknum].lock)); + locktype = isc_rwlocktype_read; + lock = &(search->rbtdb->node_locks[node->locknum].lock); + NODE_LOCK(lock, locktype); found = NULL; foundsig = NULL; empty_node = ISC_TRUE; @@ -2875,21 +3151,39 @@ find_coveringnsec(rbtdb_search_t *search, dns_dbnode_t **nodep, * node as dirty, so it will get cleaned up * later. */ - if (node->references == 0) { - INSIST(header->down == NULL); - if (header_prev != NULL) - header_prev->next = - header->next; - else - node->data = header->next; - free_rdataset(search->rbtdb->common.mctx, - header); - } else { - header->attributes |= - RDATASET_ATTR_STALE; - node->dirty = 1; + if (locktype == isc_rwlocktype_write || + NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS) { + /* + * We update the node's status only + * when we can get write access. 
+ */ + locktype = isc_rwlocktype_write; + + if (dns_rbtnode_refcurrent(node) + == 0) { + isc_mem_t *mctx; + + INSIST(header->down == NULL); + if (header_prev != NULL) + header_prev->next = + header->next; + else + node->data = header->next; + if (search->rbtdb->nodemctxs != + NULL) + mctx = search->rbtdb->nodemctxs[node->locknum]; + else + mctx = search->rbtdb->common.mctx; + free_rdataset(mctx, + header); + } else { + header->attributes |= + RDATASET_ATTR_STALE; + node->dirty = 1; + header_prev = header; + } + } else header_prev = header; - } continue; } if (NONEXISTENT(header) || NXDOMAIN(header)) { @@ -2922,7 +3216,7 @@ find_coveringnsec(rbtdb_search_t *search, dns_dbnode_t **nodep, result = dns_rbtnodechain_prev(&search->chain, NULL, NULL); unlock_node: - UNLOCK(&(search->rbtdb->node_locks[node->locknum].lock)); + NODE_UNLOCK(lock, locktype); } while (empty_node && result == ISC_R_SUCCESS); return (result); } @@ -2938,7 +3232,8 @@ cache_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, rbtdb_search_t search; isc_boolean_t cname_ok = ISC_TRUE; isc_boolean_t empty_node; - isc_mutex_t *lock; + nodelock_t *lock; + isc_rwlocktype_t locktype; rdatasetheader_t *header, *header_prev, *header_next; rdatasetheader_t *found, *nsheader; rdatasetheader_t *foundsig, *nssig, *cnamesig; @@ -3012,7 +3307,9 @@ cache_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, * We now go looking for rdata... */ - LOCK(&(search.rbtdb->node_locks[node->locknum].lock)); + lock = &(search.rbtdb->node_locks[node->locknum].lock); + locktype = isc_rwlocktype_read; + NODE_LOCK(lock, locktype); found = NULL; foundsig = NULL; @@ -3032,19 +3329,35 @@ cache_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, * mark it as stale, and the node as dirty, so it will * get cleaned up later. 
*/ - if (node->references == 0) { - INSIST(header->down == NULL); - if (header_prev != NULL) - header_prev->next = header->next; - else - node->data = header->next; - free_rdataset(search.rbtdb->common.mctx, - header); - } else { - header->attributes |= RDATASET_ATTR_STALE; - node->dirty = 1; + if (locktype == isc_rwlocktype_write || + NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS) { + /* + * We update the node's status only when we + * can get write access. + */ + locktype = isc_rwlocktype_write; + + if (dns_rbtnode_refcurrent(node) == 0) { + isc_mem_t *mctx; + + INSIST(header->down == NULL); + if (header_prev != NULL) + header_prev->next = + header->next; + else + node->data = header->next; + INSIST(search.rbtdb->nodemctxs != NULL); + mctx = search.rbtdb->nodemctxs[node->locknum]; + free_rdataset(mctx, + header); + } else { + header->attributes |= + RDATASET_ATTR_STALE; + node->dirty = 1; + header_prev = header; + } + } else header_prev = header; - } } else if (EXISTS(header)) { /* * We now know that there is at least one active @@ -3124,7 +3437,7 @@ cache_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, * extant rdatasets. That means that this node doesn't * meaningfully exist, and that we really have a partial match. */ - UNLOCK(&(search.rbtdb->node_locks[node->locknum].lock)); + NODE_UNLOCK(lock, locktype); goto find_ns; } @@ -3157,7 +3470,7 @@ cache_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, /* * Go find the deepest zone cut. */ - UNLOCK(&(search.rbtdb->node_locks[node->locknum].lock)); + NODE_UNLOCK(lock, locktype); goto find_ns; } @@ -3204,7 +3517,7 @@ cache_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, } node_exit: - UNLOCK(&(search.rbtdb->node_locks[node->locknum].lock)); + NODE_UNLOCK(lock, locktype); tree_exit: RWUNLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read); @@ -3214,16 +3527,21 @@ cache_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, * let go of it. 
*/ if (search.need_cleanup) { + unsigned int refs; + node = search.zonecut; lock = &(search.rbtdb->node_locks[node->locknum].lock); - LOCK(lock); - INSIST(node->references > 0); - node->references--; - if (node->references == 0) + NODE_STRONGLOCK(lock); + + dns_rbtnode_refdecrement(node, &refs); + INSIST((int)refs >= 0); + + if (refs == 0) no_references(search.rbtdb, node, 0, isc_rwlocktype_none); - UNLOCK(lock); + + NODE_STRONGUNLOCK(lock); } dns_rbtnodechain_reset(&search.chain); @@ -3238,11 +3556,13 @@ cache_findzonecut(dns_db_t *db, dns_name_t *name, unsigned int options, dns_rdataset_t *rdataset, dns_rdataset_t *sigrdataset) { dns_rbtnode_t *node = NULL; + nodelock_t *lock; isc_result_t result; rbtdb_search_t search; rdatasetheader_t *header, *header_prev, *header_next; rdatasetheader_t *found, *foundsig; unsigned int rbtoptions = DNS_RBTFIND_EMPTYDATA; + isc_rwlocktype_t locktype; search.rbtdb = (dns_rbtdb_t *)db; @@ -3285,7 +3605,9 @@ cache_findzonecut(dns_db_t *db, dns_name_t *name, unsigned int options, * We now go looking for an NS rdataset at the node. */ - LOCK(&(search.rbtdb->node_locks[node->locknum].lock)); + lock = &(search.rbtdb->node_locks[node->locknum].lock); + locktype = isc_rwlocktype_read; + NODE_LOCK(lock, locktype); found = NULL; foundsig = NULL; @@ -3299,19 +3621,35 @@ cache_findzonecut(dns_db_t *db, dns_name_t *name, unsigned int options, * mark it as stale, and the node as dirty, so it will * get cleaned up later. */ - if (node->references == 0) { - INSIST(header->down == NULL); - if (header_prev != NULL) - header_prev->next = header->next; - else - node->data = header->next; - free_rdataset(search.rbtdb->common.mctx, - header); - } else { - header->attributes |= RDATASET_ATTR_STALE; - node->dirty = 1; + if (locktype == isc_rwlocktype_write || + NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS) { + /* + * We update the node's status only when we + * can get write access. 
+ */ + locktype = isc_rwlocktype_write; + + if (dns_rbtnode_refcurrent(node) == 0) { + isc_mem_t *mctx; + + INSIST(header->down == NULL); + if (header_prev != NULL) + header_prev->next = + header->next; + else + node->data = header->next; + INSIST(search.rbtdb->nodemctxs != NULL); + mctx = search.rbtdb->nodemctxs[node->locknum]; + free_rdataset(mctx, + header); + } else { + header->attributes |= + RDATASET_ATTR_STALE; + node->dirty = 1; + header_prev = header; + } + } else header_prev = header; - } } else if (EXISTS(header)) { /* * If we found a type we were looking for, remember @@ -3340,7 +3678,7 @@ cache_findzonecut(dns_db_t *db, dns_name_t *name, unsigned int options, /* * No NS records here. */ - UNLOCK(&(search.rbtdb->node_locks[node->locknum].lock)); + NODE_UNLOCK(lock, locktype); goto find_ns; } @@ -3354,7 +3692,7 @@ cache_findzonecut(dns_db_t *db, dns_name_t *name, unsigned int options, bind_rdataset(search.rbtdb, node, foundsig, search.now, sigrdataset); - UNLOCK(&(search.rbtdb->node_locks[node->locknum].lock)); + NODE_UNLOCK(lock, locktype); tree_exit: RWUNLOCK(&search.rbtdb->tree_lock, isc_rwlocktype_read); @@ -3373,15 +3711,15 @@ static void attachnode(dns_db_t *db, dns_dbnode_t *source, dns_dbnode_t **targetp) { dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db; dns_rbtnode_t *node = (dns_rbtnode_t *)source; + unsigned int refs; REQUIRE(VALID_RBTDB(rbtdb)); REQUIRE(targetp != NULL && *targetp == NULL); - LOCK(&rbtdb->node_locks[node->locknum].lock); - INSIST(node->references > 0); - node->references++; - INSIST(node->references != 0); /* Catch overflow. 
*/ - UNLOCK(&rbtdb->node_locks[node->locknum].lock); + NODE_STRONGLOCK(&rbtdb->node_locks[node->locknum].lock); + dns_rbtnode_refincrement0(node, &refs); + INSIST(refs > 1); + NODE_STRONGUNLOCK(&rbtdb->node_locks[node->locknum].lock); *targetp = source; } @@ -3393,6 +3731,7 @@ detachnode(dns_db_t *db, dns_dbnode_t **targetp) { isc_boolean_t want_free = ISC_FALSE; isc_boolean_t inactive = ISC_FALSE; unsigned int locknum; + unsigned int refs; REQUIRE(VALID_RBTDB(rbtdb)); REQUIRE(targetp != NULL && *targetp != NULL); @@ -3400,27 +3739,31 @@ detachnode(dns_db_t *db, dns_dbnode_t **targetp) { node = (dns_rbtnode_t *)(*targetp); locknum = node->locknum; - LOCK(&rbtdb->node_locks[locknum].lock); + NODE_STRONGLOCK(&rbtdb->node_locks[locknum].lock); - INSIST(node->references > 0); - node->references--; - if (node->references == 0) { + dns_rbtnode_refdecrement(node, &refs); + INSIST((int)refs >= 0); + if (refs == 0) { no_references(rbtdb, node, 0, isc_rwlocktype_none); - if (rbtdb->node_locks[locknum].references == 0 && + NODE_WEAKLOCK(&rbtdb->node_locks[locknum].lock, + isc_rwlocktype_read); + if (isc_refcount_current(&rbtdb->node_locks[locknum].references) == 0 && rbtdb->node_locks[locknum].exiting) inactive = ISC_TRUE; + NODE_WEAKUNLOCK(&rbtdb->node_locks[locknum].lock, + isc_rwlocktype_read); } - UNLOCK(&rbtdb->node_locks[locknum].lock); + NODE_STRONGUNLOCK(&rbtdb->node_locks[locknum].lock); *targetp = NULL; if (inactive) { - LOCK(&rbtdb->lock); + RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write); rbtdb->active--; if (rbtdb->active == 0) want_free = ISC_TRUE; - UNLOCK(&rbtdb->lock); + RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write); if (want_free) { char buf[DNS_NAME_FORMATSIZE]; if (dns_name_dynamic(&rbtdb->common.origin)) @@ -3484,14 +3827,19 @@ expirenode(dns_db_t *db, dns_dbnode_t *node, isc_stdtime_t now) { sizeof(printname))); } - LOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + /* + * We may not need write access, but this code path is not performance + * 
sensitive, so it should be okay to always lock as a writer. + */ + NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_write); for (header = rbtnode->data; header != NULL; header = header->next) if (header->ttl <= now) { /* - * We don't check if rbtnode->references == 0 and try + * We don't check if refcurrent(rbtnode) == 0 and try * to free like we do in cache_find(), because - * rbtnode->references must be non-zero. This is so + * refcurrent(rbtnode) must be non-zero. This is so * because 'node' is an argument to the function. */ header->attributes |= RDATASET_ATTR_STALE; @@ -3515,7 +3863,8 @@ expirenode(dns_db_t *db, dns_dbnode_t *node, isc_stdtime_t now) { isc_log_write(dns_lctx, category, module, level, "overmem cache: saved %s", printname); - UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_write); return (ISC_R_SUCCESS); } @@ -3537,10 +3886,12 @@ printnode(dns_db_t *db, dns_dbnode_t *node, FILE *out) { REQUIRE(VALID_RBTDB(rbtdb)); - LOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_read); fprintf(out, "node %p, %u references, locknum = %u\n", - rbtnode, rbtnode->references, rbtnode->locknum); + rbtnode, dns_rbtnode_refcurrent(rbtnode), + rbtnode->locknum); if (rbtnode->data != NULL) { rdatasetheader_t *current, *top_next; @@ -3566,7 +3917,8 @@ printnode(dns_db_t *db, dns_dbnode_t *node, FILE *out) { } else fprintf(out, "(empty)\n"); - UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_read); } static isc_result_t @@ -3627,7 +3979,8 @@ zone_findrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, serial = rbtversion->serial; now = 0; - LOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_read); found = NULL; foundsig = NULL; @@ -3675,7 
+4028,8 @@ zone_findrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, sigrdataset); } - UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_read); if (close_version) closeversion(db, (dns_dbversion_t **) (void *)(&rbtversion), @@ -3698,6 +4052,8 @@ cache_findrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, rdatasetheader_t *header, *header_next, *found, *foundsig; rbtdb_rdatatype_t matchtype, sigmatchtype, nsectype; isc_result_t result; + nodelock_t *lock; + isc_rwlocktype_t locktype; REQUIRE(VALID_RBTDB(rbtdb)); REQUIRE(type != dns_rdatatype_any); @@ -3709,7 +4065,9 @@ cache_findrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, if (now == 0) isc_stdtime_get(&now); - LOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + lock = &rbtdb->node_locks[rbtnode->locknum].lock; + locktype = isc_rwlocktype_read; + NODE_LOCK(lock, locktype); found = NULL; foundsig = NULL; @@ -3723,14 +4081,24 @@ cache_findrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, for (header = rbtnode->data; header != NULL; header = header_next) { header_next = header->next; if (header->ttl <= now) { - /* - * We don't check if rbtnode->references == 0 and try - * to free like we do in cache_find(), because - * rbtnode->references must be non-zero. This is so - * because 'node' is an argument to the function. - */ - header->attributes |= RDATASET_ATTR_STALE; - rbtnode->dirty = 1; + if (locktype == isc_rwlocktype_write || + NODE_TRYUPGRADE(lock) == ISC_R_SUCCESS) { + /* + * We update the node's status only when we + * can get write access. + */ + locktype = isc_rwlocktype_write; + + /* + * We don't check if refcurrent(rbtnode) == 0 + * and try to free like we do in cache_find(), + * because refcurrent(rbtnode) must be + * non-zero. This is so because 'node' is an + * argument to the function. 
+ */ + header->attributes |= RDATASET_ATTR_STALE; + rbtnode->dirty = 1; + } } else if (EXISTS(header)) { if (header->type == matchtype) found = header; @@ -3748,7 +4116,7 @@ cache_findrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, sigrdataset); } - UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_UNLOCK(lock, locktype); if (found == NULL) return (ISC_R_NOTFOUND); @@ -3774,6 +4142,7 @@ allrdatasets(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, dns_rbtnode_t *rbtnode = (dns_rbtnode_t *)node; rbtdb_version_t *rbtversion = version; rbtdb_rdatasetiter_t *iterator; + unsigned int refs; REQUIRE(VALID_RBTDB(rbtdb)); @@ -3787,11 +4156,11 @@ allrdatasets(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, currentversion(db, (dns_dbversion_t **) (void *)(&rbtversion)); else { - LOCK(&rbtdb->lock); - INSIST(rbtversion->references > 0); - rbtversion->references++; - INSIST(rbtversion->references != 0); - UNLOCK(&rbtdb->lock); + unsigned int refs; + + isc_refcount_increment(&rbtversion->references, + &refs); + INSIST(refs > 1); } } else { if (now == 0) @@ -3806,14 +4175,14 @@ allrdatasets(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, iterator->common.version = (dns_dbversion_t *)rbtversion; iterator->common.now = now; - LOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_STRONGLOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + + dns_rbtnode_refincrement0(rbtnode, &refs); + INSIST(refs > 0); - INSIST(rbtnode->references > 0); - rbtnode->references++; - INSIST(rbtnode->references != 0); iterator->current = NULL; - UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_STRONGUNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock); *iteratorp = (dns_rdatasetiter_t *)iterator; @@ -3917,6 +4286,7 @@ add(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, rbtdb_version_t *rbtversion, isc_boolean_t merge; dns_rdatatype_t nsectype, rdtype, covers; dns_trust_t trust; + isc_mem_t *mctx; /* * Add an rdatasetheader_t to a node. 
@@ -3937,6 +4307,11 @@ add(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, rbtdb_version_t *rbtversion, else trust = newheader->trust; + if (rbtdb->nodemctxs != NULL) + mctx = rbtdb->nodemctxs[rbtnode->locknum]; + else + mctx = rbtdb->common.mctx; + if (rbtversion != NULL && !loading) { /* * We always add a changed record, even if no changes end up @@ -3945,7 +4320,7 @@ add(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, rbtdb_version_t *rbtversion, */ changed = add_changed(rbtdb, rbtversion, rbtnode); if (changed == NULL) { - free_rdataset(rbtdb->common.mctx, newheader); + free_rdataset(mctx, newheader); return (ISC_R_NOMEMORY); } } @@ -4002,7 +4377,8 @@ add(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, rbtdb_version_t *rbtversion, /* * The NXDOMAIN is more trusted. */ - free_rdataset(rbtdb->common.mctx, + + free_rdataset(mctx, newheader); if (addedrdataset != NULL) bind_rdataset(rbtdb, rbtnode, @@ -4049,7 +4425,7 @@ add(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, rbtdb_version_t *rbtversion, * Deleting an already non-existent rdataset has no effect. */ if (header_nx && newheader_nx) { - free_rdataset(rbtdb->common.mctx, newheader); + free_rdataset(mctx, newheader); return (DNS_R_UNCHANGED); } @@ -4059,7 +4435,7 @@ add(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, rbtdb_version_t *rbtversion, */ if (rbtversion == NULL && trust < header->trust && (header->ttl > now || header_nx)) { - free_rdataset(rbtdb->common.mctx, newheader); + free_rdataset(mctx, newheader); if (addedrdataset != NULL) bind_rdataset(rbtdb, rbtnode, header, now, addedrdataset); @@ -4094,7 +4470,7 @@ add(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, rbtdb_version_t *rbtversion, (unsigned char *)header, (unsigned char *)newheader, (unsigned int)(sizeof(*newheader)), - rbtdb->common.mctx, + mctx, rbtdb->common.rdclass, (dns_rdatatype_t)header->type, flags, &merged); @@ -4107,10 +4483,10 @@ add(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, rbtdb_version_t *rbtversion, * alone. 
It will get cleaned up when * clean_zone_node() runs. */ - free_rdataset(rbtdb->common.mctx, newheader); + free_rdataset(mctx, newheader); newheader = (rdatasetheader_t *)merged; } else { - free_rdataset(rbtdb->common.mctx, newheader); + free_rdataset(mctx, newheader); return (result); } } @@ -4141,7 +4517,7 @@ add(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, rbtdb_version_t *rbtversion, header->noqname = newheader->noqname; newheader->noqname = NULL; } - free_rdataset(rbtdb->common.mctx, newheader); + free_rdataset(mctx, newheader); if (addedrdataset != NULL) bind_rdataset(rbtdb, rbtnode, header, now, addedrdataset); @@ -4166,7 +4542,7 @@ add(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, rbtdb_version_t *rbtversion, header->noqname = newheader->noqname; newheader->noqname = NULL; } - free_rdataset(rbtdb->common.mctx, newheader); + free_rdataset(mctx, newheader); if (addedrdataset != NULL) bind_rdataset(rbtdb, rbtnode, header, now, addedrdataset); @@ -4187,7 +4563,7 @@ add(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, rbtdb_version_t *rbtversion, * loading, we MUST clean up 'header' now. */ newheader->down = NULL; - free_rdataset(rbtdb->common.mctx, header); + free_rdataset(mctx, header); } else { newheader->down = topheader; topheader->next = newheader; @@ -4205,7 +4581,7 @@ add(dns_rbtdb_t *rbtdb, dns_rbtnode_t *rbtnode, rbtdb_version_t *rbtversion, * If we're trying to delete the type, don't bother. 
*/ if (newheader_nx) { - free_rdataset(rbtdb->common.mctx, newheader); + free_rdataset(mctx, newheader); return (DNS_R_UNCHANGED); } @@ -4331,6 +4707,7 @@ addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, rdatasetheader_t *newheader; isc_result_t result; isc_boolean_t delegating; + isc_mem_t *mctx; REQUIRE(VALID_RBTDB(rbtdb)); @@ -4340,7 +4717,12 @@ addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, } else now = 0; - result = dns_rdataslab_fromrdataset(rdataset, rbtdb->common.mctx, + if (rbtdb->nodemctxs != NULL) + mctx = rbtdb->nodemctxs[rbtnode->locknum]; + else + mctx = rbtdb->common.mctx; + + result = dns_rdataslab_fromrdataset(rdataset, mctx, ®ion, sizeof(rdatasetheader_t)); if (result != ISC_R_SUCCESS) @@ -4366,7 +4748,7 @@ addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, if ((rdataset->attributes & DNS_RDATASETATTR_NOQNAME) != 0) { result = addnoqname(rbtdb, newheader, rdataset); if (result != ISC_R_SUCCESS) { - free_rdataset(rbtdb->common.mctx, newheader); + free_rdataset(mctx, newheader); return (result); } } @@ -4384,14 +4766,16 @@ addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, } else delegating = ISC_FALSE; - LOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_write); result = add(rbtdb, rbtnode, rbtversion, newheader, options, ISC_FALSE, addedrdataset, now); if (result == ISC_R_SUCCESS && delegating) rbtnode->find_callback = 1; - UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_write); if (delegating) RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write); @@ -4412,10 +4796,16 @@ subtractrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, isc_region_t region; isc_result_t result; rbtdb_changed_t *changed; + isc_mem_t *mctx; REQUIRE(VALID_RBTDB(rbtdb)); - result = dns_rdataslab_fromrdataset(rdataset, 
rbtdb->common.mctx, + if (rbtdb->nodemctxs != NULL) + mctx = rbtdb->nodemctxs[rbtnode->locknum]; + else + mctx = rbtdb->common.mctx; + + result = dns_rdataslab_fromrdataset(rdataset, mctx, &region, sizeof(rdatasetheader_t)); if (result != ISC_R_SUCCESS) @@ -4432,12 +4822,15 @@ subtractrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, newheader->additional_auth = NULL; newheader->additional_glue = NULL; - LOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_write); changed = add_changed(rbtdb, rbtversion, rbtnode); if (changed == NULL) { - free_rdataset(rbtdb->common.mctx, newheader); - UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + /* Free with 'mctx', like the other frees of newheader here. */ + free_rdataset(mctx, newheader); + NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_write); return (ISC_R_NOMEMORY); } @@ -4471,12 +4864,12 @@ subtractrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, (unsigned char *)header, (unsigned char *)newheader, (unsigned int)(sizeof(*newheader)), - rbtdb->common.mctx, + mctx, rbtdb->common.rdclass, (dns_rdatatype_t)header->type, flags, &subresult); if (result == ISC_R_SUCCESS) { - free_rdataset(rbtdb->common.mctx, newheader); + free_rdataset(mctx, newheader); newheader = (rdatasetheader_t *)subresult; /* * We have to set the serial since the rdataslab @@ -4496,8 +4889,8 @@ subtractrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, * This subtraction would remove all of the rdata; * add a nonexistent header instead.
*/ - free_rdataset(rbtdb->common.mctx, newheader); - newheader = isc_mem_get(rbtdb->common.mctx, + free_rdataset(mctx, newheader); + newheader = isc_mem_get(mctx, sizeof(*newheader)); if (newheader == NULL) { result = ISC_R_NOMEMORY; @@ -4513,7 +4905,7 @@ subtractrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, newheader->additional_auth = NULL; newheader->additional_glue = NULL; } else { - free_rdataset(rbtdb->common.mctx, newheader); + free_rdataset(mctx, newheader); goto unlock; } @@ -4536,7 +4928,7 @@ subtractrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, * The rdataset doesn't exist, so we don't need to do anything * to satisfy the deletion request. */ - free_rdataset(rbtdb->common.mctx, newheader); + free_rdataset(mctx, newheader); if ((options & DNS_DBSUB_EXACT) != 0) result = DNS_R_NOTEXACT; else @@ -4547,7 +4939,8 @@ subtractrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, bind_rdataset(rbtdb, rbtnode, newheader, 0, newrdataset); unlock: - UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_write); return (result); } @@ -4561,6 +4954,7 @@ deleterdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, rbtdb_version_t *rbtversion = version; isc_result_t result; rdatasetheader_t *newheader; + isc_mem_t *mctx; REQUIRE(VALID_RBTDB(rbtdb)); @@ -4569,7 +4963,12 @@ deleterdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, if (type == dns_rdatatype_rrsig && covers == 0) return (ISC_R_NOTIMPLEMENTED); - newheader = isc_mem_get(rbtdb->common.mctx, sizeof(*newheader)); + if (rbtdb->nodemctxs != NULL) + mctx = rbtdb->nodemctxs[rbtnode->locknum]; + else + mctx = rbtdb->common.mctx; + + newheader = isc_mem_get(mctx, sizeof(*newheader)); if (newheader == NULL) return (ISC_R_NOMEMORY); newheader->ttl = 0; @@ -4585,12 +4984,14 @@ deleterdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, 
newheader->serial = 0; newheader->count = 0; - LOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_write); result = add(rbtdb, rbtnode, rbtversion, newheader, DNS_DBADD_FORCE, ISC_FALSE, NULL, 0); - UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_write); return (result); } @@ -4603,6 +5004,7 @@ loading_addrdataset(void *arg, dns_name_t *name, dns_rdataset_t *rdataset) { isc_result_t result; isc_region_t region; rdatasetheader_t *newheader; + isc_mem_t *mctx; /* * This routine does no node locking. See comments in @@ -4647,7 +5049,11 @@ loading_addrdataset(void *arg, dns_name_t *name, dns_rdataset_t *rdataset) { #endif } - result = dns_rdataslab_fromrdataset(rdataset, rbtdb->common.mctx, + if (rbtdb->nodemctxs != NULL) + mctx = rbtdb->nodemctxs[node->locknum]; + else + mctx = rbtdb->common.mctx; + result = dns_rdataslab_fromrdataset(rdataset, mctx, ®ion, sizeof(rdatasetheader_t)); if (result != ISC_R_SUCCESS) @@ -4694,13 +5100,13 @@ beginload(dns_db_t *db, dns_addrdatasetfunc_t *addp, dns_dbload_t **dbloadp) { else loadctx->now = 0; - LOCK(&rbtdb->lock); + RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write); REQUIRE((rbtdb->attributes & (RBTDB_ATTR_LOADED|RBTDB_ATTR_LOADING)) == 0); rbtdb->attributes |= RBTDB_ATTR_LOADING; - UNLOCK(&rbtdb->lock); + RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write); *addp = loading_addrdataset; *dbloadp = loadctx; @@ -4760,7 +5166,7 @@ endload(dns_db_t *db, dns_dbload_t **dbloadp) { loadctx = *dbloadp; REQUIRE(loadctx->rbtdb == rbtdb); - LOCK(&rbtdb->lock); + RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write); REQUIRE((rbtdb->attributes & RBTDB_ATTR_LOADING) != 0); REQUIRE((rbtdb->attributes & RBTDB_ATTR_LOADED) == 0); @@ -4768,7 +5174,7 @@ endload(dns_db_t *db, dns_dbload_t **dbloadp) { rbtdb->attributes &= ~RBTDB_ATTR_LOADING; rbtdb->attributes |= RBTDB_ATTR_LOADED; - UNLOCK(&rbtdb->lock); + 
RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write); /* * If there's a KEY rdataset at the zone origin containing a @@ -4800,11 +5206,17 @@ dump(dns_db_t *db, dns_dbversion_t *version, const char *filename) { static void delete_callback(void *data, void *arg) { dns_rbtdb_t *rbtdb = arg; + dns_rbtnode_t *rbtnode = data; rdatasetheader_t *current, *next; + isc_mem_t *mctx; - for (current = data; current != NULL; current = next) { + if (rbtdb->nodemctxs != NULL) + mctx = rbtdb->nodemctxs[rbtnode->locknum]; + else + mctx = rbtdb->common.mctx; + for (current = rbtnode->data; current != NULL; current = next) { next = current->next; - free_rdataset(rbtdb->common.mctx, current); + free_rdataset(mctx, current); } } @@ -4848,12 +5260,12 @@ settask(dns_db_t *db, isc_task_t *task) { REQUIRE(VALID_RBTDB(rbtdb)); - LOCK(&rbtdb->lock); + RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write); if (rbtdb->task != NULL) isc_task_detach(&rbtdb->task); if (task != NULL) isc_task_attach(task, &rbtdb->task); - UNLOCK(&rbtdb->lock); + RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write); } static isc_boolean_t @@ -4870,12 +5282,12 @@ getsoanode(dns_db_t *db, dns_dbnode_t **nodep) { REQUIRE(VALID_RBTDB(rbtdb)); REQUIRE(nodep != NULL && *nodep == NULL); - LOCK(&rbtdb->lock); + RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read); if (rbtdb->soanode != NULL) { attachnode(db, rbtdb->soanode, nodep); } else result = ISC_R_NOTFOUND; - UNLOCK(&rbtdb->lock); + RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read); return (result); } @@ -4887,11 +5299,11 @@ setsoanode(dns_db_t *db, dns_dbnode_t *node) { REQUIRE(VALID_RBTDB(rbtdb)); REQUIRE(node != NULL); - LOCK(&rbtdb->lock); + RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write); if (rbtdb->soanode != NULL) detachnode(db, &rbtdb->soanode); attachnode(db, node, &rbtdb->soanode); - UNLOCK(&rbtdb->lock); + RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write); return (ISC_R_SUCCESS); } @@ -4904,12 +5316,12 @@ getnsnode(dns_db_t *db, dns_dbnode_t **nodep) { REQUIRE(VALID_RBTDB(rbtdb)); 
REQUIRE(nodep != NULL && *nodep == NULL); - LOCK(&rbtdb->lock); + RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_read); if (rbtdb->nsnode != NULL) { attachnode(db, rbtdb->nsnode, nodep); } else result = ISC_R_NOTFOUND; - UNLOCK(&rbtdb->lock); + RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_read); return (result); } @@ -4921,11 +5333,11 @@ setnsnode(dns_db_t *db, dns_dbnode_t *node) { REQUIRE(VALID_RBTDB(rbtdb)); REQUIRE(node != NULL); - LOCK(&rbtdb->lock); + RBTDB_LOCK(&rbtdb->lock, isc_rwlocktype_write); if (rbtdb->nsnode != NULL) detachnode(db, &rbtdb->nsnode); attachnode(db, node, &rbtdb->nsnode); - UNLOCK(&rbtdb->lock); + RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write); return (ISC_R_SUCCESS); } @@ -5034,19 +5446,20 @@ dns_rbtdb_create rbtdb->common.methods = &zone_methods; rbtdb->common.rdclass = rdclass; rbtdb->common.mctx = NULL; + rbtdb->nodemctxs = NULL; - result = isc_mutex_init(&rbtdb->lock); + result = RBTDB_INITLOCK(&rbtdb->lock); if (result != ISC_R_SUCCESS) { isc_mem_put(mctx, rbtdb, sizeof(*rbtdb)); UNEXPECTED_ERROR(__FILE__, __LINE__, - "isc_mutex_init() failed: %s", + "RBTDB_INITLOCK() failed: %s", isc_result_totext(result)); return (ISC_R_UNEXPECTED); } result = isc_rwlock_init(&rbtdb->tree_lock, 0, 0); if (result != ISC_R_SUCCESS) { - DESTROYLOCK(&rbtdb->lock); + RBTDB_DESTROYLOCK(&rbtdb->lock); isc_mem_put(mctx, rbtdb, sizeof(*rbtdb)); UNEXPECTED_ERROR(__FILE__, __LINE__, "isc_rwlock_init() failed: %s", @@ -5061,27 +5474,44 @@ dns_rbtdb_create rbtdb->node_locks = isc_mem_get(mctx, rbtdb->node_lock_count * sizeof(rbtdb_nodelock_t)); rbtdb->active = rbtdb->node_lock_count; + + if (IS_CACHE(rbtdb)) { + rbtdb->nodemctxs = isc_mem_get(mctx, + sizeof(isc_mem_t *) * + rbtdb->node_lock_count); + if (rbtdb->nodemctxs == NULL) + INSIST(0); /* XXXJT: cleanup */ + for (i = 0; i < (int)(rbtdb->node_lock_count); i++) + rbtdb->nodemctxs[i] = NULL; + } + for (i = 0; i < (int)(rbtdb->node_lock_count); i++) { - result = isc_mutex_init(&rbtdb->node_locks[i].lock); + result 
= NODE_INITLOCK(&rbtdb->node_locks[i].lock); if (result != ISC_R_SUCCESS) { i--; while (i >= 0) { - DESTROYLOCK(&rbtdb->node_locks[i].lock); + NODE_DESTROYLOCK(&rbtdb->node_locks[i].lock); i--; } isc_mem_put(mctx, rbtdb->node_locks, rbtdb->node_lock_count * sizeof(rbtdb_nodelock_t)); isc_rwlock_destroy(&rbtdb->tree_lock); - DESTROYLOCK(&rbtdb->lock); + RBTDB_DESTROYLOCK(&rbtdb->lock); isc_mem_put(mctx, rbtdb, sizeof(*rbtdb)); UNEXPECTED_ERROR(__FILE__, __LINE__, "isc_mutex_init() failed: %s", isc_result_totext(result)); return (ISC_R_UNEXPECTED); } - rbtdb->node_locks[i].references = 0; + isc_refcount_init(&rbtdb->node_locks[i].references, 0); rbtdb->node_locks[i].exiting = ISC_FALSE; + + if (IS_CACHE(rbtdb)) { + result = isc_mem_create(0, 0, &rbtdb->nodemctxs[i]); + if (result != ISC_R_SUCCESS) + INSIST(0); /* XXXJT: cleanup */ + } } /* @@ -5108,7 +5538,8 @@ dns_rbtdb_create /* * Make the Red-Black Tree. */ - result = dns_rbt_create(mctx, delete_callback, rbtdb, &rbtdb->tree); + result = dns_rbt_create2(mctx, NULL, delete_callback, rbtdb, + &rbtdb->tree); if (result != ISC_R_SUCCESS) { free_rbtdb(rbtdb, ISC_FALSE, NULL); return (result); @@ -5166,13 +5597,18 @@ dns_rbtdb_create rbtdb->current_serial = 1; rbtdb->least_serial = 1; rbtdb->next_serial = 2; - rbtdb->current_version = allocate_version(mctx, 1, 0, ISC_FALSE); + rbtdb->current_version = allocate_version(mctx, 1, 1, ISC_FALSE); if (rbtdb->current_version == NULL) { free_rbtdb(rbtdb, ISC_FALSE, NULL); return (ISC_R_NOMEMORY); } rbtdb->future_version = NULL; ISC_LIST_INIT(rbtdb->open_versions); + /* + * Keep the current version in the open list so that list operation + * won't happen in normal lookup operations. 
+ */ + PREPEND(rbtdb->open_versions, rbtdb->current_version, link); rbtdb->common.magic = DNS_DB_MAGIC; rbtdb->common.impmagic = RBTDB_MAGIC; @@ -5358,7 +5794,8 @@ rdatasetiter_first(dns_rdatasetiter_t *iterator) { now = 0; } - LOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_read); for (header = rbtnode->data; header != NULL; header = top_next) { top_next = header->next; @@ -5385,7 +5822,8 @@ rdatasetiter_first(dns_rdatasetiter_t *iterator) { break; } - UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_read); rbtiterator->current = header; @@ -5418,7 +5856,8 @@ rdatasetiter_next(dns_rdatasetiter_t *iterator) { now = 0; } - LOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_read); type = header->type; for (header = header->next; header != NULL; header = top_next) { @@ -5450,7 +5889,8 @@ rdatasetiter_next(dns_rdatasetiter_t *iterator) { } } - UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_read); rbtiterator->current = header; @@ -5470,12 +5910,14 @@ rdatasetiter_current(dns_rdatasetiter_t *iterator, dns_rdataset_t *rdataset) { header = rbtiterator->current; REQUIRE(header != NULL); - LOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_read); bind_rdataset(rbtdb, rbtnode, header, rbtiterator->common.now, rdataset); - UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock); + NODE_UNLOCK(&rbtdb->node_locks[rbtnode->locknum].lock, + isc_rwlocktype_read); } @@ -5492,26 +5934,26 @@ reference_iter_node(rbtdb_dbiterator_t *rbtdbiter) { return; INSIST(rbtdbiter->tree_locked != isc_rwlocktype_none); - LOCK(&rbtdb->node_locks[node->locknum].lock); new_reference(rbtdb, node); - UNLOCK(&rbtdb->node_locks[node->locknum].lock); 
} static inline void dereference_iter_node(rbtdb_dbiterator_t *rbtdbiter) { dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db; dns_rbtnode_t *node = rbtdbiter->node; - isc_mutex_t *lock; + unsigned int refs; + nodelock_t *lock; if (node == NULL) return; lock = &rbtdb->node_locks[node->locknum].lock; - LOCK(lock); - INSIST(rbtdbiter->node->references > 0); - if (--node->references == 0) + NODE_STRONGLOCK(lock); + dns_rbtnode_refdecrement(node, &refs); + INSIST((int)refs >= 0); + if (refs == 0) no_references(rbtdb, node, 0, rbtdbiter->tree_locked); - UNLOCK(lock); + NODE_STRONGUNLOCK(lock); rbtdbiter->node = NULL; } @@ -5521,7 +5963,7 @@ flush_deletions(rbtdb_dbiterator_t *rbtdbiter) { dns_rbtnode_t *node; dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)rbtdbiter->common.db; isc_boolean_t was_read_locked = ISC_FALSE; - isc_mutex_t *lock; + nodelock_t *lock; int i; if (rbtdbiter->delete != 0) { @@ -5545,16 +5987,18 @@ flush_deletions(rbtdb_dbiterator_t *rbtdbiter) { rbtdbiter->tree_locked = isc_rwlocktype_write; for (i = 0; i < rbtdbiter->delete; i++) { + unsigned int refs; + node = rbtdbiter->deletions[i]; lock = &rbtdb->node_locks[node->locknum].lock; - LOCK(lock); - INSIST(node->references > 0); - node->references--; - if (node->references == 0) + NODE_STRONGLOCK(lock); + dns_rbtnode_refdecrement(node, &refs); + INSIST((int)refs >= 0); + if (refs == 0) no_references(rbtdb, node, 0, rbtdbiter->tree_locked); - UNLOCK(lock); + NODE_STRONGUNLOCK(lock); } rbtdbiter->delete = 0; @@ -5822,9 +6266,7 @@ dbiterator_current(dns_dbiterator_t *iterator, dns_dbnode_t **nodep, } else result = ISC_R_SUCCESS; - LOCK(&rbtdb->node_locks[node->locknum].lock); new_reference(rbtdb, node); - UNLOCK(&rbtdb->node_locks[node->locknum].lock); *nodep = rbtdbiter->node; @@ -5846,9 +6288,9 @@ dbiterator_current(dns_dbiterator_t *iterator, dns_dbnode_t **nodep, */ if (expire_result == ISC_R_SUCCESS && node->down == NULL) { rbtdbiter->deletions[rbtdbiter->delete++] = node; - 
LOCK(&rbtdb->node_locks[node->locknum].lock); - node->references++; - UNLOCK(&rbtdb->node_locks[node->locknum].lock); + NODE_STRONGLOCK(&rbtdb->node_locks[node->locknum].lock); + dns_rbtnode_refincrement0(node, NULL); + NODE_STRONGUNLOCK(&rbtdb->node_locks[node->locknum].lock); } } @@ -5908,7 +6350,7 @@ rdataset_getadditional(dns_rdataset_t *rdataset, dns_rdatasetadditional_t type, unsigned int current_count = rdataset->privateuint4; unsigned int count; rdatasetheader_t *header; - isc_mutex_t *nodelock; + nodelock_t *nodelock; unsigned int total_count; acachectl_t *acarray; dns_acacheentry_t *entry; @@ -5926,7 +6368,7 @@ rdataset_getadditional(dns_rdataset_t *rdataset, dns_rdatasetadditional_t type, acarray = NULL; nodelock = &rbtdb->node_locks[rbtnode->locknum].lock; - LOCK(nodelock); + NODE_LOCK(nodelock, isc_rwlocktype_read); switch (type) { case dns_rdatasetadditional_fromauth: @@ -5943,19 +6385,19 @@ rdataset_getadditional(dns_rdataset_t *rdataset, dns_rdatasetadditional_t type, } if (acarray == NULL) { - UNLOCK(nodelock); + NODE_UNLOCK(nodelock, isc_rwlocktype_read); return (ISC_R_NOTFOUND); } if (acarray[count].entry == NULL) { - UNLOCK(nodelock); + NODE_UNLOCK(nodelock, isc_rwlocktype_read); return (ISC_R_NOTFOUND); } entry = NULL; dns_acache_attachentry(acarray[count].entry, &entry); - UNLOCK(nodelock); + NODE_UNLOCK(nodelock, isc_rwlocktype_read); result = dns_acache_getentry(entry, zonep, dbp, versionp, nodep, fname, msg, now); @@ -5969,10 +6411,11 @@ static void acache_callback(dns_acacheentry_t *entry, void **arg) { dns_rbtdb_t *rbtdb; dns_rbtnode_t *rbtnode; - isc_mutex_t *nodelock; + nodelock_t *nodelock; acachectl_t *acarray = NULL; acache_cbarg_t *cbarg; unsigned int count; + isc_mem_t *mctx; REQUIRE(arg != NULL); cbarg = *arg; @@ -5985,7 +6428,7 @@ acache_callback(dns_acacheentry_t *entry, void **arg) { rbtnode = (dns_rbtnode_t *)cbarg->node; nodelock = &rbtdb->node_locks[rbtnode->locknum].lock; - LOCK(nodelock); + NODE_LOCK(nodelock, 
isc_rwlocktype_write); switch (cbarg->type) { case dns_rdatasetadditional_fromauth: @@ -6002,13 +6445,17 @@ acache_callback(dns_acacheentry_t *entry, void **arg) { if (acarray[count].entry == entry) acarray[count].entry = NULL; INSIST(acarray[count].cbarg != NULL); - isc_mem_put(rbtdb->common.mctx, acarray[count].cbarg, + if (rbtdb->nodemctxs != NULL) + mctx = rbtdb->nodemctxs[rbtnode->locknum]; + else + mctx = rbtdb->common.mctx; + isc_mem_put(mctx, acarray[count].cbarg, sizeof(acache_cbarg_t)); acarray[count].cbarg = NULL; dns_acache_detachentry(&entry); - UNLOCK(nodelock); + NODE_UNLOCK(nodelock, isc_rwlocktype_write); dns_db_detachnode((dns_db_t *)rbtdb, (dns_dbnode_t **)(void*)&rbtnode); dns_db_detach((dns_db_t **)(void*)&rbtdb); @@ -6050,8 +6497,8 @@ rdataset_setadditional(dns_rdataset_t *rdataset, dns_rdatasetadditional_t type, unsigned int current_count = rdataset->privateuint4; rdatasetheader_t *header; unsigned int total_count, count; - isc_mutex_t *nodelock; - isc_mem_t *mctx = rbtdb->common.mctx; + nodelock_t *nodelock; + isc_mem_t *mctx; isc_result_t result; acachectl_t *acarray; dns_acacheentry_t *newentry, *oldentry = NULL; @@ -6068,6 +6515,11 @@ rdataset_setadditional(dns_rdataset_t *rdataset, dns_rdatasetadditional_t type, INSIST(total_count > current_count); count = total_count - current_count - 1; /* should be private data */ + if (rbtdb->nodemctxs != NULL) + mctx = rbtdb->nodemctxs[rbtnode->locknum]; + else + mctx = rbtdb->common.mctx; + newcbarg = isc_mem_get(mctx, sizeof(*newcbarg)); if (newcbarg == NULL) return (ISC_R_NOMEMORY); @@ -6091,7 +6543,7 @@ rdataset_setadditional(dns_rdataset_t *rdataset, dns_rdatasetadditional_t type, goto fail; nodelock = &rbtdb->node_locks[rbtnode->locknum].lock; - LOCK(nodelock); + NODE_LOCK(nodelock, isc_rwlocktype_write); acarray = NULL; switch (type) { @@ -6112,7 +6564,7 @@ rdataset_setadditional(dns_rdataset_t *rdataset, dns_rdatasetadditional_t type, sizeof(acachectl_t)); if (acarray == NULL) { - 
UNLOCK(nodelock); + NODE_UNLOCK(nodelock, isc_rwlocktype_write); goto fail; } @@ -6144,7 +6596,7 @@ rdataset_setadditional(dns_rdataset_t *rdataset, dns_rdatasetadditional_t type, acarray[count].entry = newentry; acarray[count].cbarg = newcbarg; - UNLOCK(nodelock); + NODE_UNLOCK(nodelock, isc_rwlocktype_write); if (oldentry != NULL) { if (oldcbarg != NULL) @@ -6173,7 +6625,7 @@ rdataset_putadditional(dns_acache_t *acache, dns_rdataset_t *rdataset, unsigned char *raw = rdataset->private3; unsigned int current_count = rdataset->privateuint4; rdatasetheader_t *header; - isc_mutex_t *nodelock; + nodelock_t *nodelock; unsigned int total_count, count; acachectl_t *acarray; dns_acacheentry_t *entry; @@ -6195,7 +6647,7 @@ rdataset_putadditional(dns_acache_t *acache, dns_rdataset_t *rdataset, entry = NULL; nodelock = &rbtdb->node_locks[rbtnode->locknum].lock; - LOCK(nodelock); + NODE_LOCK(nodelock, isc_rwlocktype_write); switch (type) { case dns_rdatasetadditional_fromauth: @@ -6209,13 +6661,13 @@ rdataset_putadditional(dns_acache_t *acache, dns_rdataset_t *rdataset, } if (acarray == NULL) { - UNLOCK(nodelock); + NODE_UNLOCK(nodelock, isc_rwlocktype_write); return (ISC_R_NOTFOUND); } entry = acarray[count].entry; if (entry == NULL) { - UNLOCK(nodelock); + NODE_UNLOCK(nodelock, isc_rwlocktype_write); return (ISC_R_NOTFOUND); } @@ -6223,11 +6675,19 @@ rdataset_putadditional(dns_acache_t *acache, dns_rdataset_t *rdataset, cbarg = acarray[count].cbarg; acarray[count].cbarg = NULL; - UNLOCK(nodelock); + NODE_UNLOCK(nodelock, isc_rwlocktype_write); if (entry != NULL) { - if(cbarg != NULL) - acache_cancelentry(rbtdb->common.mctx, entry, &cbarg); + if(cbarg != NULL) { + isc_mem_t *mctx; + + if (rbtdb->nodemctxs != NULL) + mctx = rbtdb->nodemctxs[rbtnode->locknum]; + else + mctx = rbtdb->common.mctx; + + acache_cancelentry(mctx, entry, &cbarg); + } dns_acache_detachentry(&entry); } diff --git a/lib/dns/resolver.c b/lib/dns/resolver.c index 38c713c00d..012b67e788 100644 --- 
a/lib/dns/resolver.c +++ b/lib/dns/resolver.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: resolver.c,v 1.307 2005/04/27 04:56:51 sra Exp $ */ +/* $Id: resolver.c,v 1.308 2005/06/04 05:32:47 jinmei Exp $ */ /*! \file */ @@ -262,6 +262,7 @@ typedef struct fctxbucket { isc_mutex_t lock; ISC_LIST(fetchctx_t) fctxs; isc_boolean_t exiting; + isc_mem_t * mctx; } fctxbucket_t; typedef struct alternate { @@ -943,12 +944,13 @@ fctx_query(fetchctx_t *fctx, dns_adbaddrinfo_t *addrinfo, dns_message_reset(fctx->rmessage, DNS_MESSAGE_INTENTPARSE); - query = isc_mem_get(res->mctx, sizeof(*query)); + query = isc_mem_get(res->buckets[fctx->bucketnum].mctx, + sizeof(*query)); if (query == NULL) { result = ISC_R_NOMEMORY; goto stop_idle_timer; } - query->mctx = res->mctx; + query->mctx = res->buckets[fctx->bucketnum].mctx; query->options = options; query->attributes = 0; query->sends = 0; @@ -1066,7 +1068,8 @@ fctx_query(fetchctx_t *fctx, dns_adbaddrinfo_t *addrinfo, cleanup_query: query->magic = 0; - isc_mem_put(res->mctx, query, sizeof(*query)); + isc_mem_put(res->buckets[fctx->bucketnum].mctx, + query, sizeof(*query)); stop_idle_timer: RUNTIME_CHECK(fctx_stopidletimer(fctx) == ISC_R_SUCCESS); @@ -1625,7 +1628,8 @@ add_bad(fetchctx_t *fctx, isc_sockaddr_t *address, isc_result_t reason) { FCTXTRACE("add_bad"); - sa = isc_mem_get(fctx->res->mctx, sizeof(*sa)); + sa = isc_mem_get(fctx->res->buckets[fctx->bucketnum].mctx, + sizeof(*sa)); if (sa == NULL) return; *sa = *address; @@ -2374,21 +2378,21 @@ fctx_destroy(fetchctx_t *fctx) { sa = next_sa) { next_sa = ISC_LIST_NEXT(sa, link); ISC_LIST_UNLINK(fctx->bad, sa, link); - isc_mem_put(res->mctx, sa, sizeof(*sa)); + isc_mem_put(res->buckets[bucketnum].mctx, sa, sizeof(*sa)); } isc_timer_detach(&fctx->timer); dns_message_destroy(&fctx->rmessage); dns_message_destroy(&fctx->qmessage); if (dns_name_countlabels(&fctx->domain) > 0) - dns_name_free(&fctx->domain, res->mctx); + dns_name_free(&fctx->domain, 
res->buckets[bucketnum].mctx); if (dns_rdataset_isassociated(&fctx->nameservers)) dns_rdataset_disassociate(&fctx->nameservers); - dns_name_free(&fctx->name, res->mctx); + dns_name_free(&fctx->name, res->buckets[bucketnum].mctx); dns_db_detach(&fctx->cache); dns_adb_detach(&fctx->adb); - isc_mem_free(res->mctx, fctx->info); - isc_mem_put(res->mctx, fctx, sizeof(*fctx)); + isc_mem_free(res->buckets[bucketnum].mctx, fctx->info); + isc_mem_put(res->buckets[bucketnum].mctx, fctx, sizeof(*fctx)); LOCK(&res->nlock); res->nfctx--; @@ -2633,8 +2637,8 @@ fctx_join(fetchctx_t *fctx, isc_task_t *task, isc_taskaction_t action, clone = NULL; isc_task_attach(task, &clone); event = (dns_fetchevent_t *) - isc_event_allocate(fctx->res->mctx, clone, - DNS_EVENT_FETCHDONE, + isc_event_allocate(fctx->res->buckets[fctx->bucketnum].mctx, + clone, DNS_EVENT_FETCHDONE, action, arg, sizeof(*event)); if (event == NULL) { isc_task_detach(&clone); @@ -2685,19 +2689,19 @@ fctx_create(dns_resolver_t *res, dns_name_t *name, dns_rdatatype_t type, */ REQUIRE(fctxp != NULL && *fctxp == NULL); - fctx = isc_mem_get(res->mctx, sizeof(*fctx)); + fctx = isc_mem_get(res->buckets[bucketnum].mctx, sizeof(*fctx)); if (fctx == NULL) return (ISC_R_NOMEMORY); dns_name_format(name, buf, sizeof(buf)); dns_rdatatype_format(type, typebuf, sizeof(typebuf)); strcat(buf, "/"); /* checked */ strcat(buf, typebuf); /* checked */ - fctx->info = isc_mem_strdup(res->mctx, buf); + fctx->info = isc_mem_strdup(res->buckets[bucketnum].mctx, buf); if (fctx->info == NULL) goto cleanup_fetch; FCTXTRACE("create"); dns_name_init(&fctx->name, NULL); - result = dns_name_dup(name, res->mctx, &fctx->name); + result = dns_name_dup(name, res->buckets[bucketnum].mctx, &fctx->name); if (result != ISC_R_SUCCESS) goto cleanup_info; dns_name_init(&fctx->domain, NULL); @@ -2772,7 +2776,9 @@ fctx_create(dns_resolver_t *res, dns_name_t *name, dns_rdatatype_t type, NULL); if (result != ISC_R_SUCCESS) goto cleanup_name; - result = 
dns_name_dup(domain, res->mctx, &fctx->domain); + result = dns_name_dup(domain, + res->buckets[bucketnum].mctx, + &fctx->domain); if (result != ISC_R_SUCCESS) { dns_rdataset_disassociate(&fctx->nameservers); goto cleanup_name; @@ -2781,12 +2787,16 @@ fctx_create(dns_resolver_t *res, dns_name_t *name, dns_rdatatype_t type, /* * We're in forward-only mode. Set the query domain. */ - result = dns_name_dup(domain, res->mctx, &fctx->domain); + result = dns_name_dup(domain, + res->buckets[bucketnum].mctx, + &fctx->domain); if (result != ISC_R_SUCCESS) goto cleanup_name; } } else { - result = dns_name_dup(domain, res->mctx, &fctx->domain); + result = dns_name_dup(domain, + res->buckets[bucketnum].mctx, + &fctx->domain); if (result != ISC_R_SUCCESS) goto cleanup_name; dns_rdataset_clone(nameservers, &fctx->nameservers); @@ -2795,14 +2805,16 @@ fctx_create(dns_resolver_t *res, dns_name_t *name, dns_rdatatype_t type, INSIST(dns_name_issubdomain(&fctx->name, &fctx->domain)); fctx->qmessage = NULL; - result = dns_message_create(res->mctx, DNS_MESSAGE_INTENTRENDER, + result = dns_message_create(res->buckets[bucketnum].mctx, + DNS_MESSAGE_INTENTRENDER, &fctx->qmessage); if (result != ISC_R_SUCCESS) goto cleanup_domain; fctx->rmessage = NULL; - result = dns_message_create(res->mctx, DNS_MESSAGE_INTENTPARSE, + result = dns_message_create(res->buckets[bucketnum].mctx, + DNS_MESSAGE_INTENTPARSE, &fctx->rmessage); if (result != ISC_R_SUCCESS) @@ -2875,18 +2887,19 @@ fctx_create(dns_resolver_t *res, dns_name_t *name, dns_rdatatype_t type, cleanup_domain: if (dns_name_countlabels(&fctx->domain) > 0) - dns_name_free(&fctx->domain, res->mctx); + dns_name_free(&fctx->domain, res->buckets[bucketnum].mctx); if (dns_rdataset_isassociated(&fctx->nameservers)) dns_rdataset_disassociate(&fctx->nameservers); cleanup_name: - dns_name_free(&fctx->name, res->mctx); + dns_name_free(&fctx->name, res->buckets[bucketnum].mctx); cleanup_info: - isc_mem_free(res->mctx, fctx->info); + 
isc_mem_free(res->buckets[bucketnum].mctx, fctx->info); cleanup_fetch: - isc_mem_put(res->mctx, fctx, sizeof(*fctx)); + isc_mem_putanddetach(&res->buckets[bucketnum].mctx, + fctx, sizeof(*fctx)); return (result); } @@ -4424,11 +4437,14 @@ noanswer_response(fetchctx_t *fctx, dns_name_t *oqname, * if so we should bail out. */ INSIST(dns_name_countlabels(&fctx->domain) > 0); - dns_name_free(&fctx->domain, fctx->res->mctx); + dns_name_free(&fctx->domain, + fctx->res->buckets[fctx->bucketnum].mctx); if (dns_rdataset_isassociated(&fctx->nameservers)) dns_rdataset_disassociate(&fctx->nameservers); dns_name_init(&fctx->domain, NULL); - result = dns_name_dup(ns_name, fctx->res->mctx, &fctx->domain); + result = dns_name_dup(ns_name, + fctx->res->buckets[fctx->bucketnum].mctx, + &fctx->domain); if (result != ISC_R_SUCCESS) return (result); fctx->attributes |= FCTX_ATTR_WANTCACHE; @@ -4883,9 +4899,11 @@ resume_dslookup(isc_task_t *task, isc_event_t *event) { if (dns_rdataset_isassociated(&fctx->nameservers)) dns_rdataset_disassociate(&fctx->nameservers); dns_rdataset_clone(fevent->rdataset, &fctx->nameservers); - dns_name_free(&fctx->domain, fctx->res->mctx); + dns_name_free(&fctx->domain, + fctx->res->buckets[bucketnum].mctx); dns_name_init(&fctx->domain, NULL); - result = dns_name_dup(&fctx->nsname, fctx->res->mctx, + result = dns_name_dup(&fctx->nsname, + fctx->res->buckets[bucketnum].mctx, &fctx->domain); if (result != ISC_R_SUCCESS) { fctx_done(fctx, DNS_R_SERVFAIL); @@ -5510,9 +5528,11 @@ resquery_response(isc_task_t *task, isc_event_t *event) { fctx_done(fctx, DNS_R_SERVFAIL); return; } - dns_name_free(&fctx->domain, fctx->res->mctx); + dns_name_free(&fctx->domain, + fctx->res->buckets[fctx->bucketnum].mctx); dns_name_init(&fctx->domain, NULL); - result = dns_name_dup(fname, fctx->res->mctx, + result = dns_name_dup(fname, + fctx->res->buckets[fctx->bucketnum].mctx, &fctx->domain); if (result != ISC_R_SUCCESS) { fctx_done(fctx, DNS_R_SERVFAIL); @@ -5610,6 +5630,7 @@ 
destroy(dns_resolver_t *res) { isc_task_shutdown(res->buckets[i].task); isc_task_detach(&res->buckets[i].task); DESTROYLOCK(&res->buckets[i].lock); + isc_mem_detach(&res->buckets[i].mctx); } isc_mem_put(res->mctx, res->buckets, res->nbuckets * sizeof(fctxbucket_t)); @@ -5731,6 +5752,9 @@ dns_resolver_create(dns_view_t *view, DESTROYLOCK(&res->buckets[i].lock); goto cleanup_buckets; } + res->buckets[i].mctx = NULL; + result = isc_mem_create(0, 0, &res->buckets[i].mctx); + INSIST(result == ISC_R_SUCCESS); /* XXXJT: need care */ snprintf(name, sizeof(name), "res%u", i); isc_task_setname(res->buckets[i].task, name, res); ISC_LIST_INIT(res->buckets[i].fctxs); @@ -6130,7 +6154,7 @@ dns_resolver_createfetch(dns_resolver_t *res, dns_name_t *name, if (fetch == NULL) return (ISC_R_NOMEMORY); - bucketnum = dns_name_hash(name, ISC_FALSE) % res->nbuckets; + bucketnum = dns_name_fullhash(name, ISC_FALSE) % res->nbuckets; LOCK(&res->buckets[bucketnum].lock); diff --git a/lib/dns/zone.c b/lib/dns/zone.c index 505eedc1e8..344be62362 100644 --- a/lib/dns/zone.c +++ b/lib/dns/zone.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: zone.c,v 1.435 2005/05/24 04:30:10 marka Exp $ */ +/* $Id: zone.c,v 1.436 2005/06/04 05:32:47 jinmei Exp $ */ /*! 
\file */ @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -127,6 +128,18 @@ typedef ISC_LIST(dns_io_t) dns_iolist_t; #define LOCKED_ZONE(z) ISC_TRUE #endif +#ifdef ISC_RWLOCK_USEATOMIC +#define ZONEDB_INITLOCK(l) isc_rwlock_init((l), 0, 0) +#define ZONEDB_DESTROYLOCK(l) isc_rwlock_destroy(l) +#define ZONEDB_LOCK(l, t) RWLOCK((l), (t)) +#define ZONEDB_UNLOCK(l, t) RWUNLOCK((l), (t)) +#else +#define ZONEDB_INITLOCK(l) isc_mutex_init(l) +#define ZONEDB_DESTROYLOCK(l) DESTROYLOCK(l) +#define ZONEDB_LOCK(l, t) LOCK(l) +#define ZONEDB_UNLOCK(l, t) UNLOCK(l) +#endif + struct dns_zone { /* Unlocked */ unsigned int magic; @@ -137,8 +150,14 @@ struct dns_zone { isc_mem_t *mctx; isc_refcount_t erefs; +#ifdef ISC_RWLOCK_USEATOMIC + isc_rwlock_t dblock; +#else + isc_mutex_t dblock; +#endif + dns_db_t *db; /* Locked by dblock */ + /* Locked */ - dns_db_t *db; dns_zonemgr_t *zmgr; ISC_LINK(dns_zone_t) link; /* Used by zmgr. */ isc_timer_t *timer; @@ -507,21 +526,31 @@ dns_zone_create(dns_zone_t **zonep, isc_mem_t *mctx) { if (zone == NULL) return (ISC_R_NOMEMORY); + zone->mctx = NULL; + isc_mem_attach(mctx, &zone->mctx); + result = isc_mutex_init(&zone->lock); if (result != ISC_R_SUCCESS) { - isc_mem_put(mctx, zone, sizeof(*zone)); UNEXPECTED_ERROR(__FILE__, __LINE__, "isc_mutex_init() failed: %s", isc_result_totext(result)); - return (ISC_R_UNEXPECTED); + result = ISC_R_UNEXPECTED; + goto free_zone; + } + + result = ZONEDB_INITLOCK(&zone->dblock); + if (result != ISC_R_SUCCESS) { + UNEXPECTED_ERROR(__FILE__, __LINE__, + "isc_mutex_init() failed: %s", + isc_result_totext(result)); + result = ISC_R_UNEXPECTED; + goto free_mutex; } /* XXX MPA check that all elements are initialised */ - zone->mctx = NULL; #ifdef DNS_ZONE_CHECKLOCK zone->locked = ISC_FALSE; #endif - isc_mem_attach(mctx, &zone->mctx); zone->db = NULL; zone->zmgr = NULL; ISC_LINK_INIT(zone, link); @@ -605,7 +634,7 @@ dns_zone_create(dns_zone_t **zonep, isc_mem_t *mctx) { /* Must be 
after magic is set. */ result = dns_zone_setdbtype(zone, dbargc_default, dbargv_default); if (result != ISC_R_SUCCESS) - goto free_mutex; + goto free_dblock; ISC_EVENT_INIT(&zone->ctlevent, sizeof(zone->ctlevent), 0, NULL, DNS_EVENT_ZONECONTROL, zone_shutdown, zone, zone, @@ -613,8 +642,13 @@ dns_zone_create(dns_zone_t **zonep, isc_mem_t *mctx) { *zonep = zone; return (ISC_R_SUCCESS); + free_dblock: + ZONEDB_DESTROYLOCK(&zone->dblock); + free_mutex: DESTROYLOCK(&zone->lock); + + free_zone: isc_mem_putanddetach(&zone->mctx, zone, sizeof(*zone)); return (result); } @@ -686,6 +720,7 @@ zone_free(dns_zone_t *zone) { dns_ssutable_detach(&zone->ssutable); /* last stuff */ + ZONEDB_DESTROYLOCK(&zone->dblock); DESTROYLOCK(&zone->lock); isc_refcount_destroy(&zone->erefs); zone->magic = 0; @@ -857,6 +892,7 @@ dns_zone_setacache(dns_zone_t *zone, dns_acache_t *acache) { if (zone->acache != NULL) dns_acache_detach(&zone->acache); dns_acache_attach(acache, &zone->acache); + ZONEDB_LOCK(&zone->dblock, isc_rwlocktype_read); if (zone->db != NULL) { isc_result_t result; @@ -873,6 +909,7 @@ dns_zone_setacache(dns_zone_t *zone, dns_acache_t *acache) { isc_result_totext(result)); } } + ZONEDB_UNLOCK(&zone->dblock, isc_rwlocktype_read); UNLOCK_ZONE(zone); } @@ -1212,12 +1249,14 @@ zone_gotwritehandle(isc_task_t *task, isc_event_t *event) { goto fail; LOCK_ZONE(zone); + ZONEDB_LOCK(&zone->dblock, isc_rwlocktype_read); dns_db_currentversion(zone->db, &version); result = dns_master_dumpinc(zone->mctx, zone->db, version, &dns_master_style_default, zone->masterfile, zone->task, dump_done, zone, &zone->dctx); dns_db_closeversion(zone->db, &version, ISC_FALSE); + ZONEDB_UNLOCK(&zone->dblock, isc_rwlocktype_read); UNLOCK_ZONE(zone); if (result != DNS_R_CONTINUE) goto fail; @@ -1863,12 +1902,15 @@ zone_postload(dns_zone_t *zone, dns_db_t *db, isc_time_t loadtime, } #endif + ZONEDB_LOCK(&zone->dblock, isc_rwlocktype_write); if (zone->db != NULL) { result = zone_replacedb(zone, db, ISC_FALSE); + 
ZONEDB_UNLOCK(&zone->dblock, isc_rwlocktype_write); if (result != ISC_R_SUCCESS) goto cleanup; } else { zone_attachdb(zone, db); + ZONEDB_UNLOCK(&zone->dblock, isc_rwlocktype_write); DNS_ZONE_SETFLAG(zone, DNS_ZONEFLG_LOADED|DNS_ZONEFLG_NEEDNOTIFY); } @@ -2565,12 +2607,12 @@ dns_zone_getdb(dns_zone_t *zone, dns_db_t **dpb) { REQUIRE(DNS_ZONE_VALID(zone)); - LOCK_ZONE(zone); + ZONEDB_LOCK(&zone->dblock, isc_rwlocktype_read); if (zone->db == NULL) result = DNS_R_NOTLOADED; else dns_db_attach(zone->db, dpb); - UNLOCK_ZONE(zone); + ZONEDB_UNLOCK(&zone->dblock, isc_rwlocktype_read); return (result); } @@ -2950,9 +2992,11 @@ zone_dump(dns_zone_t *zone, isc_boolean_t compact) { ENTER; redo: - LOCK_ZONE(zone); + ZONEDB_LOCK(&zone->dblock, isc_rwlocktype_read); if (zone->db != NULL) dns_db_attach(zone->db, &db); + ZONEDB_UNLOCK(&zone->dblock, isc_rwlocktype_read); + LOCK_ZONE(zone); if (zone->masterfile != NULL) masterfile = isc_mem_strdup(zone->mctx, zone->masterfile); UNLOCK_ZONE(zone); @@ -3026,10 +3070,10 @@ dumptostream(dns_zone_t *zone, FILE *fd, const dns_master_style_t *style) { REQUIRE(DNS_ZONE_VALID(zone)); - LOCK_ZONE(zone); + ZONEDB_LOCK(&zone->dblock, isc_rwlocktype_read); if (zone->db != NULL) dns_db_attach(zone->db, &db); - UNLOCK_ZONE(zone); + ZONEDB_UNLOCK(&zone->dblock, isc_rwlocktype_read); if (db == NULL) return (DNS_R_NOTLOADED); @@ -3088,7 +3132,9 @@ zone_unload(dns_zone_t *zone) { REQUIRE(LOCKED_ZONE(zone)); + ZONEDB_LOCK(&zone->dblock, isc_rwlocktype_write); zone_detachdb(zone); + ZONEDB_UNLOCK(&zone->dblock, isc_rwlocktype_write); DNS_ZONE_CLRFLAG(zone, DNS_ZONEFLG_LOADED); DNS_ZONE_CLRFLAG(zone, DNS_ZONEFLG_NEEDDUMP); } @@ -3469,6 +3515,7 @@ dns_zone_notify(dns_zone_t *zone) { static void zone_notify(dns_zone_t *zone, isc_time_t *now) { dns_dbnode_t *node = NULL; + dns_db_t *zonedb = NULL; dns_dbversion_t *version = NULL; dns_name_t *origin = NULL; dns_name_t master; @@ -3486,7 +3533,6 @@ zone_notify(dns_zone_t *zone, isc_time_t *now) { 
dns_notifytype_t notifytype; unsigned int flags = 0; isc_boolean_t loggednotify = ISC_FALSE; - dns_db_t *db = NULL; REQUIRE(DNS_ZONE_VALID(zone)); @@ -3506,13 +3552,6 @@ zone_notify(dns_zone_t *zone, isc_time_t *now) { zone->type != dns_zone_master) return; - LOCK_ZONE(zone); - if (zone->db != NULL) - dns_db_attach(zone->db, &db); - UNLOCK_ZONE(zone); - if (db == NULL) - return; - origin = &zone->origin; /* @@ -3525,13 +3564,19 @@ zone_notify(dns_zone_t *zone, isc_time_t *now) { /* * Get SOA RRset. */ - dns_db_currentversion(db, &version); - result = dns_db_findnode(db, origin, ISC_FALSE, &node); + ZONEDB_LOCK(&zone->dblock, isc_rwlocktype_read); + if (zone->db != NULL) + dns_db_attach(zone->db, &zonedb); + ZONEDB_UNLOCK(&zone->dblock, isc_rwlocktype_read); + if (zonedb == NULL) + return; + dns_db_currentversion(zonedb, &version); + result = dns_db_findnode(zonedb, origin, ISC_FALSE, &node); if (result != ISC_R_SUCCESS) goto cleanup1; dns_rdataset_init(&soardset); - result = dns_db_findrdataset(db, node, version, dns_rdatatype_soa, + result = dns_db_findrdataset(zonedb, node, version, dns_rdatatype_soa, dns_rdatatype_none, 0, &soardset, NULL); if (result != ISC_R_SUCCESS) goto cleanup2; @@ -3588,7 +3633,7 @@ zone_notify(dns_zone_t *zone, isc_time_t *now) { */ dns_rdataset_init(&nsrdset); - result = dns_db_findrdataset(db, node, version, dns_rdatatype_ns, + result = dns_db_findrdataset(zonedb, node, version, dns_rdatatype_ns, dns_rdatatype_none, 0, &nsrdset, NULL); if (result != ISC_R_SUCCESS) goto cleanup3; @@ -3645,10 +3690,10 @@ zone_notify(dns_zone_t *zone, isc_time_t *now) { if (dns_name_dynamic(&master)) dns_name_free(&master, zone->mctx); cleanup2: - dns_db_detachnode(db, &node); + dns_db_detachnode(zonedb, &node); cleanup1: - dns_db_closeversion(db, &version, ISC_FALSE); - dns_db_detach(&db); + dns_db_closeversion(zonedb, &version, ISC_FALSE); + dns_db_detach(&zonedb); } /*** @@ -3895,10 +3940,10 @@ stub_callback(isc_task_t *task, isc_event_t *event) { * 
Tidy up. */ dns_db_closeversion(stub->db, &stub->version, ISC_TRUE); - LOCK_ZONE(zone); + ZONEDB_LOCK(&zone->dblock, isc_rwlocktype_write); if (zone->db == NULL) zone_attachdb(zone, stub->db); - UNLOCK_ZONE(zone); + ZONEDB_UNLOCK(&zone->dblock, isc_rwlocktype_write); dns_db_detach(&stub->db); if (zone->masterfile != NULL) { @@ -4637,9 +4682,13 @@ ns_query(dns_zone_t *zone, dns_rdataset_t *soardataset, dns_stub_t *stub) { * new one and attach it to the zone once we have the NS * RRset and glue. */ - if (zone->db != NULL) + ZONEDB_LOCK(&zone->dblock, isc_rwlocktype_read); + if (zone->db != NULL) { dns_db_attach(zone->db, &stub->db); - else { + ZONEDB_UNLOCK(&zone->dblock, isc_rwlocktype_read); + } else { + ZONEDB_UNLOCK(&zone->dblock, isc_rwlocktype_read); + INSIST(zone->db_argc >= 1); result = dns_db_create(zone->mctx, zone->db_argv[0], &zone->origin, dns_dbtype_stub, @@ -5006,6 +5055,7 @@ static isc_result_t notify_createmessage(dns_zone_t *zone, unsigned int flags, dns_message_t **messagep) { + dns_db_t *zonedb = NULL; dns_dbnode_t *node = NULL; dns_dbversion_t *version = NULL; dns_message_t *message = NULL; @@ -5071,15 +5121,20 @@ notify_createmessage(dns_zone_t *zone, unsigned int flags, if (result != ISC_R_SUCCESS) goto soa_cleanup; + ZONEDB_LOCK(&zone->dblock, isc_rwlocktype_read); + INSIST(zone->db != NULL); /* XXXJT: is this assumption correct? 
*/ + dns_db_attach(zone->db, &zonedb); + ZONEDB_UNLOCK(&zone->dblock, isc_rwlocktype_read); + dns_name_init(tempname, NULL); dns_name_clone(&zone->origin, tempname); - dns_db_currentversion(zone->db, &version); - result = dns_db_findnode(zone->db, tempname, ISC_FALSE, &node); + dns_db_currentversion(zonedb, &version); + result = dns_db_findnode(zonedb, tempname, ISC_FALSE, &node); if (result != ISC_R_SUCCESS) goto soa_cleanup; dns_rdataset_init(&rdataset); - result = dns_db_findrdataset(zone->db, node, version, + result = dns_db_findrdataset(zonedb, node, version, dns_rdatatype_soa, dns_rdatatype_none, 0, &rdataset, NULL); @@ -5123,9 +5178,11 @@ notify_createmessage(dns_zone_t *zone, unsigned int flags, soa_cleanup: if (node != NULL) - dns_db_detachnode(zone->db, &node); + dns_db_detachnode(zonedb, &node); if (version != NULL) - dns_db_closeversion(zone->db, &version, ISC_FALSE); + dns_db_closeversion(zonedb, &version, ISC_FALSE); + if (zonedb != NULL) + dns_db_detach(&zonedb); if (tempname != NULL) dns_message_puttempname(message, &tempname); if (temprdata != NULL) @@ -5689,8 +5746,10 @@ dns_zone_settask(dns_zone_t *zone, isc_task_t *task) { if (zone->task != NULL) isc_task_detach(&zone->task); isc_task_attach(task, &zone->task); + ZONEDB_LOCK(&zone->dblock, isc_rwlocktype_read); if (zone->db != NULL) dns_db_settask(zone->db, zone->task); + ZONEDB_UNLOCK(&zone->dblock, isc_rwlocktype_read); UNLOCK_ZONE(zone); } @@ -5795,7 +5854,9 @@ dns_zone_replacedb(dns_zone_t *zone, dns_db_t *db, isc_boolean_t dump) { REQUIRE(DNS_ZONE_VALID(zone)); LOCK_ZONE(zone); + ZONEDB_LOCK(&zone->dblock, isc_rwlocktype_write); result = zone_replacedb(zone, db, dump); + ZONEDB_UNLOCK(&zone->dblock, isc_rwlocktype_write); UNLOCK_ZONE(zone); return (result); } @@ -5808,7 +5869,7 @@ zone_replacedb(dns_zone_t *zone, dns_db_t *db, isc_boolean_t dump) { unsigned int nscount = 0; /* - * 'zone' locked by caller. + * 'zone' and 'zonedb' locked by caller. 
*/ REQUIRE(DNS_ZONE_VALID(zone)); REQUIRE(LOCKED_ZONE(zone)); @@ -5935,6 +5996,7 @@ zone_replacedb(dns_zone_t *zone, dns_db_t *db, isc_boolean_t dump) { return (result); } +/* The caller must hold the dblock as a writer. */ static inline void zone_attachdb(dns_zone_t *zone, dns_db_t *db) { REQUIRE(zone->db == NULL && db != NULL); @@ -5951,6 +6013,7 @@ zone_attachdb(dns_zone_t *zone, dns_db_t *db) { } } +/* The caller must hold the dblock as a writer. */ static inline void zone_detachdb(dns_zone_t *zone) { REQUIRE(zone->db != NULL); @@ -5989,8 +6052,11 @@ zone_xfrdone(dns_zone_t *zone, isc_result_t result) { /* * Has the zone expired underneath us? */ - if (zone->db == NULL) + ZONEDB_LOCK(&zone->dblock, isc_rwlocktype_read); + if (zone->db == NULL) { + ZONEDB_UNLOCK(&zone->dblock, isc_rwlocktype_read); goto same_master; + } /* * Update the zone structure's data from the actual @@ -6002,6 +6068,7 @@ zone_xfrdone(dns_zone_t *zone, isc_result_t result) { result = zone_get_from_db(zone, zone->db, &nscount, &soacount, &serial, &refresh, &retry, &expire, &minimum, NULL); + ZONEDB_UNLOCK(&zone->dblock, isc_rwlocktype_read); if (result == ISC_R_SUCCESS) { if (soacount != 1) dns_zone_log(zone, ISC_LOG_ERROR, diff --git a/lib/isc/Makefile.in b/lib/isc/Makefile.in index 8c45de6ede..0ed2c9d99b 100644 --- a/lib/isc/Makefile.in +++ b/lib/isc/Makefile.in @@ -13,7 +13,7 @@ # OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR # PERFORMANCE OF THIS SOFTWARE. 
-# $Id: Makefile.in,v 1.82 2004/07/20 07:13:42 marka Exp $ +# $Id: Makefile.in,v 1.83 2005/06/04 05:32:48 jinmei Exp $ srcdir = @srcdir@ VPATH = @srcdir@ @@ -25,6 +25,7 @@ top_srcdir = @top_srcdir@ CINCLUDES = -I${srcdir}/unix/include \ -I${srcdir}/@ISC_THREAD_DIR@/include \ + -I${srcdir}/@ISC_ARCH_DIR@/include \ -I./include \ -I${srcdir}/include CDEFINES = diff --git a/lib/isc/alpha/include/isc/atomic.h b/lib/isc/alpha/include/isc/atomic.h new file mode 100644 index 0000000000..83fe91d08b --- /dev/null +++ b/lib/isc/alpha/include/isc/atomic.h @@ -0,0 +1,166 @@ +/* + * Copyright (C) 2005 Internet Systems Consortium, Inc. ("ISC") + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +/* $Id: atomic.h,v 1.2 2005/06/04 05:32:48 jinmei Exp $ */ + +/* + * This code was written based on FreeBSD's kernel source whose copyright + * follows: + */ + +/*- + * Copyright (c) 1998 Doug Rabson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD: src/sys/alpha/include/atomic.h,v 1.18.6.1 2004/09/13 21:52:04 wilko Exp $ + */ + +#ifndef ISC_ATOMIC_H +#define ISC_ATOMIC_H 1 + +#include +#include + +#ifdef ISC_PLATFORM_USEOSFASM +#include + +#pragma intrinsic(asm) + +/* + * This routine atomically increments the value stored in 'p' by 'val', and + * returns the previous value. + */ +static inline isc_int32_t +isc_atomic_xadd(isc_int32_t *p, isc_int32_t val) { + return (asm("1:" + "ldl_l %t0, 0(%a0);" /* load old value */ + "mov %t0, %v0;" /* copy the old value */ + "addl %t0, %a1, %t0;" /* calculate new value */ + "stl_c %t0, 0(%a0);" /* attempt to store */ + "beq %t0, 1b;", /* spin if failed */ + p, val)); +} + +/* + * This routine atomically stores the value 'val' in 'p'. 
+ */ +static inline void +isc_atomic_store(isc_int32_t *p, isc_int32_t val) { + (void)asm("1:" + "ldl_l %t0, 0(%a0);" /* load old value */ + "mov %a1, %t0;" /* value to store */ + "stl_c %t0, 0(%a0);" /* attempt to store */ + "beq %t0, 1b;", /* spin if failed */ + p, val); +} + +/* + * This routine atomically replaces the value in 'p' with 'val', if the + * original value is equal to 'cmpval'. The original value is returned in any + * case. + */ +static inline isc_int32_t +isc_atomic_cmpxchg(isc_int32_t *p, isc_int32_t cmpval, isc_int32_t val) { + + return(asm("1:" + "ldl_l %t0, 0(%a0);" /* load old value */ + "mov %t0, %v0;" /* copy the old value */ + "cmpeq %t0, %a1, %t0;" /* compare */ + "beq %t0, 2f;" /* exit if not equal */ + "mov %a2, %t0;" /* value to store */ + "stl_c %t0, 0(%a0);" /* attempt to store */ + "beq %t0, 1b;" /* if it failed, spin */ + "2:", + p, cmpval, val)); +} +#else /* ISC_PLATFORM_USEOSFASM */ +static inline isc_int32_t +isc_atomic_xadd(isc_int32_t *p, isc_int32_t val) { + isc_int32_t temp, prev; + + __asm__ volatile( + "1:" + "ldl_l %0, %1;" /* load old value */ + "mov %0, %2;" /* copy the old value */ + "addl %0, %3, %0;" /* calculate new value */ + "stl_c %0, %1;" /* attempt to store */ + "beq %0, 1b;" /* spin if failed */ + : "=&r"(temp), "+m"(*p), "=r"(prev) + : "r"(val) + : "memory"); + + return (prev); +} + +static inline void +isc_atomic_store(isc_int32_t *p, isc_int32_t val) { + isc_int32_t temp; + + __asm__ volatile( + "1:" + "ldl_l %0, %1;" /* load old value */ + "mov %2, %0;" /* value to store */ + "stl_c %0, %1;" /* attempt to store */ + "beq %0, 1b;" /* if it failed, spin */ + : "=&r"(temp), "+m"(*p) + : "r"(val) + : "memory"); +} + +static inline isc_int32_t +isc_atomic_cmpxchg(isc_int32_t *p, isc_int32_t cmpval, isc_int32_t val) { + isc_int32_t temp, prev; + + __asm__ volatile( + "1:" + "ldl_l %0, %1;" /* load old value */ + "mov %0, %2;" /* copy the old value */ + "cmpeq %0, %3, %0;" /* compare */ + "beq %0, 2f;" /* exit 
if not equal */ + "mov %4, %0;" /* value to store */ + "stl_c %0, %1;" /* attempt to store */ + "beq %0, 1b;" /* if it failed, spin */ + "2:" + : "=&r"(temp), "+m"(*p), "=r"(prev) + : "r"(cmpval), "r"(val) + : "memory"); + + return (prev); +} +#endif /* ISC_PLATFORM_USEOSFASM */ + +#endif /* ISC_ATOMIC_H */ diff --git a/lib/isc/include/isc/mem.h b/lib/isc/include/isc/mem.h index e37d5806f4..e026082ba8 100644 --- a/lib/isc/include/isc/mem.h +++ b/lib/isc/include/isc/mem.h @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: mem.h,v 1.62 2005/04/29 00:23:40 marka Exp $ */ +/* $Id: mem.h,v 1.63 2005/06/04 05:32:48 jinmei Exp $ */ #ifndef ISC_MEM_H #define ISC_MEM_H 1 @@ -115,6 +115,11 @@ LIBISC_EXTERNAL_DATA extern unsigned int isc_mem_debugging; #define _ISC_MEM_FLARG #endif +/* + * Flags for isc_mem_create2()calls. + */ +#define ISC_MEMFLAG_NOLOCK 0x00000001 /* no lock is necessary */ + #define isc_mem_get(c, s) isc__mem_get((c), (s) _ISC_MEM_FILELINE) #define isc_mem_allocate(c, s) isc__mem_allocate((c), (s) _ISC_MEM_FILELINE) #define isc_mem_strdup(c, p) isc__mem_strdup((c), (p) _ISC_MEM_FILELINE) @@ -183,10 +188,20 @@ isc_result_t isc_mem_create(size_t max_size, size_t target_size, isc_mem_t **mctxp); +isc_result_t +isc_mem_create2(size_t max_size, size_t target_size, + isc_mem_t **mctxp, unsigned int flags); + isc_result_t isc_mem_createx(size_t max_size, size_t target_size, isc_memalloc_t memalloc, isc_memfree_t memfree, void *arg, isc_mem_t **mctxp); + +isc_result_t +isc_mem_createx2(size_t max_size, size_t target_size, + isc_memalloc_t memalloc, isc_memfree_t memfree, + void *arg, isc_mem_t **mctxp, unsigned int flags); + /*!< * \brief Create a memory context. * @@ -209,6 +224,12 @@ isc_mem_createx(size_t max_size, size_t target_size, * using isc_mem_create() will use the standard library malloc() * and free(). * + * If ISC_MEMFLAG_NOLOCK is set in 'flags', the corresponding memory context + * will be accessed without locking. 
The user who creates the context must + * ensure there is no race. Since this can be a source of bugs, it is generally + * inadvisable to use this flag unless the user is very sure about the race + * condition and the access to the object is highly performance sensitive. + * * Requires: * mctxp != NULL && *mctxp == NULL */ /*@}*/ diff --git a/lib/isc/include/isc/platform.h.in b/lib/isc/include/isc/platform.h.in index 48277a1d29..c9333d9283 100644 --- a/lib/isc/include/isc/platform.h.in +++ b/lib/isc/include/isc/platform.h.in @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: platform.h.in,v 1.37 2005/04/27 04:57:19 sra Exp $ */ +/* $Id: platform.h.in,v 1.38 2005/06/04 05:32:48 jinmei Exp $ */ #ifndef ISC_PLATFORM_H #define ISC_PLATFORM_H 1 @@ -217,6 +217,29 @@ */ @ISC_PLATFORM_HAVESYSUNH@ +/* + * If the "xadd" operation is available on this architecture, + * ISC_PLATFORM_HAVEXADD will be defined. + */ +@ISC_PLATFORM_HAVEXADD@ + +/* + * If the "atomic swap" operation is available on this architecture, + * ISC_PLATFORM_HAVEATOMICSTORE will be defined. + */ +@ISC_PLATFORM_HAVEATOMICSTORE@ + +/* + * If the "compare-and-exchange" operation is available on this architecture, + * ISC_PLATFORM_HAVECMPXCHG will be defined. + */ +@ISC_PLATFORM_HAVECMPXCHG@ + +/* + * Define if Tru64 style ASM syntax must be used. + */ +@ISC_PLATFORM_USEOSFASM@ + #ifndef ISC_PLATFORM_USEDECLSPEC #define LIBISC_EXTERNAL_DATA #define LIBDNS_EXTERNAL_DATA diff --git a/lib/isc/include/isc/refcount.h b/lib/isc/include/isc/refcount.h index 7583d24862..87d078aebb 100644 --- a/lib/isc/include/isc/refcount.h +++ b/lib/isc/include/isc/refcount.h @@ -15,11 +15,12 @@ * PERFORMANCE OF THIS SOFTWARE.
*/ -/* $Id: refcount.h,v 1.9 2005/04/29 00:23:43 marka Exp $ */ +/* $Id: refcount.h,v 1.10 2005/06/04 05:32:48 jinmei Exp $ */ #ifndef ISC_REFCOUNT_H #define ISC_REFCOUNT_H 1 +#include #include #include #include @@ -63,9 +64,14 @@ ISC_LANG_BEGINDECLS /* * void * isc_refcount_increment(isc_refcount_t *ref, unsigned int *targetp); + * isc_refcount_increment0(isc_refcount_t *ref, unsigned int *targetp); * * Increments the reference count, returning the new value in targetp if it's - * not NULL. + * not NULL. The reference counter typically begins with the initial counter + * of 1, and will be destroyed once the counter reaches 0. Thus, + * isc_refcount_increment() additionally requires the previous counter be + * larger than 0 so that an error which violates the usage can be easily + * caught. isc_refcount_increment0() does not have this restriction. * * Requires: * ref != NULL. @@ -87,6 +93,48 @@ ISC_LANG_BEGINDECLS * Sample implementations */ #ifdef ISC_PLATFORM_USETHREADS +#ifdef ISC_PLATFORM_HAVEXADD + +#define ISC_REFCOUNT_HAVEATOMIC 1 + +typedef struct isc_refcount { + isc_int32_t refs; +} isc_refcount_t; + +#define isc_refcount_init(rp, n) ((rp)->refs = (n)) +#define isc_refcount_destroy(rp) (REQUIRE((rp)->refs == 0)) +#define isc_refcount_current(rp) ((unsigned int)((rp)->refs)) + +#define isc_refcount_increment0(rp, tp) \ + do { \ + unsigned int *_tmp = (unsigned int *)(tp); \ + isc_int32_t prev; \ + prev = isc_atomic_xadd(&(rp)->refs, 1); \ + if (_tmp != NULL) \ + *_tmp = prev + 1; \ + } while (0) + +#define isc_refcount_increment(rp, tp) \ + do { \ + unsigned int *_tmp = (unsigned int *)(tp); \ + isc_int32_t prev; \ + prev = isc_atomic_xadd(&(rp)->refs, 1); \ + REQUIRE(prev > 0); \ + if (_tmp != NULL) \ + *_tmp = prev + 1; \ + } while (0) + +#define isc_refcount_decrement(rp, tp) \ + do { \ + unsigned int *_tmp = (unsigned int *)(tp); \ + isc_int32_t prev; \ + prev = isc_atomic_xadd(&(rp)->refs, -1); \ + REQUIRE(prev > 0); \ + if (_tmp != NULL) \ + *_tmp = 
prev - 1; \ + } while (0) + +#else /* ISC_PLATFORM_HAVEXADD */ typedef struct isc_refcount { int refs; @@ -112,6 +160,16 @@ typedef struct isc_refcount { #define isc_refcount_current(rp) ((unsigned int)((rp)->refs)) /*% Increments the reference count, returning the new value in targetp if it's not NULL. */ +#define isc_refcount_increment0(rp, tp) \ + do { \ + unsigned int *_tmp = (unsigned int *)(tp); \ + LOCK(&(rp)->lock); \ + ++((rp)->refs); \ + if (_tmp != NULL) \ + *_tmp = ((rp)->refs); \ + UNLOCK(&(rp)->lock); \ + } while (0) + #define isc_refcount_increment(rp, tp) \ do { \ unsigned int *_tmp = (unsigned int *)(tp); \ @@ -135,7 +193,8 @@ typedef struct isc_refcount { UNLOCK(&(rp)->lock); \ } while (0) -#else +#endif /* ISC_PLATFORM_HAVEXADD */ +#else /* ISC_PLATFORM_USETHREADS */ typedef struct isc_refcount { int refs; @@ -145,7 +204,7 @@ typedef struct isc_refcount { #define isc_refcount_destroy(rp) (REQUIRE((rp)->refs == 0)) #define isc_refcount_current(rp) ((unsigned int)((rp)->refs)) -#define isc_refcount_increment(rp, tp) \ +#define isc_refcount_increment0(rp, tp) \ do { \ unsigned int *_tmp = (unsigned int *)(tp); \ int _n = ++(rp)->refs; \ @@ -153,15 +212,27 @@ typedef struct isc_refcount { *_tmp = _n; \ } while (0) -#define isc_refcount_decrement(rp, tp) \ +#define isc_refcount_increment(rp, tp) \ do { \ unsigned int *_tmp = (unsigned int *)(tp); \ - int _n = --(rp)->refs; \ + int _n; \ + REQUIRE((rp)->refs > 0); \ + _n = ++(rp)->refs; \ if (_tmp != NULL) \ *_tmp = _n; \ } while (0) -#endif +#define isc_refcount_decrement(rp, tp) \ + do { \ + unsigned int *_tmp = (unsigned int *)(tp); \ + int _n; \ + REQUIRE((rp)->refs > 0); \ + _n = --(rp)->refs; \ + if (_tmp != NULL) \ + *_tmp = _n; \ + } while (0) + +#endif /* ISC_PLATFORM_USETHREADS */ ISC_LANG_ENDDECLS diff --git a/lib/isc/include/isc/rwlock.h b/lib/isc/include/isc/rwlock.h index ea52879ef5..f388d6ff41 100644 --- a/lib/isc/include/isc/rwlock.h +++ b/lib/isc/include/isc/rwlock.h @@ -15,7 +15,7 @@ 
* PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: rwlock.h,v 1.23 2005/04/29 00:23:44 marka Exp $ */ +/* $Id: rwlock.h,v 1.24 2005/06/04 05:32:49 jinmei Exp $ */ #ifndef ISC_RWLOCK_H #define ISC_RWLOCK_H 1 @@ -36,10 +36,47 @@ typedef enum { } isc_rwlocktype_t; #ifdef ISC_PLATFORM_USETHREADS +#if defined(ISC_PLATFORM_HAVEXADD) && defined(ISC_PLATFORM_HAVECMPXCHG) +#define ISC_RWLOCK_USEATOMIC 1 +#endif + struct isc_rwlock { /* Unlocked. */ unsigned int magic; isc_mutex_t lock; + +#if defined(ISC_PLATFORM_HAVEXADD) && defined(ISC_PLATFORM_HAVECMPXCHG) + /* + * When some atomic instructions with hardware assistance are + * available, rwlock will use those so that concurrent readers do not + * interfere with each other through mutex as long as no writers + * appear, massively reducing the lock overhead in the typical case. + * + * The basic algorithm of this approach is the "simple + * writer-preference lock" shown in the following URL: + * http://www.cs.rochester.edu/u/scott/synchronization/pseudocode/rw.html + * but our implementation does not rely on the spin lock unlike the + * original algorithm to be more portable as a user space application. + */ + + /* Read or modified atomically. */ + isc_int32_t write_requests; + isc_int32_t write_completions; + isc_int32_t cnt_and_flag; + + /* Locked by lock. */ + isc_condition_t readable; + isc_condition_t writeable; + unsigned int readers_waiting; + + /* Locked by rwlock itself. */ + unsigned int write_granted; + + /* Unlocked. */ + unsigned int write_quota; + +#else /* ISC_PLATFORM_HAVEXADD && ISC_PLATFORM_HAVECMPXCHG */ + /*%< Locked by lock. 
*/ isc_condition_t readable; isc_condition_t writeable; @@ -60,6 +97,7 @@ struct isc_rwlock { unsigned int read_quota; unsigned int write_quota; isc_rwlocktype_t original; +#endif /* ISC_PLATFORM_HAVEXADD && ISC_PLATFORM_HAVECMPXCHG */ }; #else /* ISC_PLATFORM_USETHREADS */ struct isc_rwlock { diff --git a/lib/isc/mem.c b/lib/isc/mem.c index dac37f10ef..f20bd51c60 100644 --- a/lib/isc/mem.c +++ b/lib/isc/mem.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: mem.c,v 1.119 2005/04/27 04:57:13 sra Exp $ */ +/* $Id: mem.c,v 1.120 2005/06/04 05:32:48 jinmei Exp $ */ /*! \file */ @@ -36,6 +36,9 @@ #include #include +#define MCTXLOCK(m, l) if (((m)->flags & ISC_MEMFLAG_NOLOCK) == 0) LOCK(l) +#define MCTXUNLOCK(m, l) if (((m)->flags & ISC_MEMFLAG_NOLOCK) == 0) UNLOCK(l) + #ifndef ISC_MEM_DEBUGGING #define ISC_MEM_DEBUGGING 0 #endif @@ -46,9 +49,11 @@ LIBISC_EXTERNAL_DATA unsigned int isc_mem_debugging = ISC_MEM_DEBUGGING; * implementation in preference to the system one. The internal malloc() * is very space-efficient, and quite fast on uniprocessor systems. It * performs poorly on multiprocessor machines. + * JT: we can overcome the performance issue on multiprocessor machines + * by carefully separating memory contexts. 
*/ #ifndef ISC_MEM_USE_INTERNAL_MALLOC -#define ISC_MEM_USE_INTERNAL_MALLOC 0 +#define ISC_MEM_USE_INTERNAL_MALLOC 1 #endif /* @@ -117,6 +122,7 @@ typedef ISC_LIST(debuglink_t) debuglist_t; struct isc_mem { unsigned int magic; isc_ondestroy_t ondestroy; + unsigned int flags; isc_mutex_t lock; isc_memalloc_t memalloc; isc_memfree_t memfree; @@ -701,6 +707,16 @@ isc_result_t isc_mem_createx(size_t init_max_size, size_t target_size, isc_memalloc_t memalloc, isc_memfree_t memfree, void *arg, isc_mem_t **ctxp) +{ + return (isc_mem_createx2(init_max_size, target_size, memalloc, memfree, + arg, ctxp, 0)); + +} + +isc_result_t +isc_mem_createx2(size_t init_max_size, size_t target_size, + isc_memalloc_t memalloc, isc_memfree_t memfree, void *arg, + isc_mem_t **ctxp, unsigned int flags) { isc_mem_t *ctx; isc_result_t result; @@ -719,11 +735,14 @@ isc_mem_createx(size_t init_max_size, size_t target_size, if (ctx == NULL) return (ISC_R_NOMEMORY); - if (isc_mutex_init(&ctx->lock) != ISC_R_SUCCESS) { + if ((flags & ISC_MEMFLAG_NOLOCK) == 0 && + isc_mutex_init(&ctx->lock) != ISC_R_SUCCESS) { UNEXPECTED_ERROR(__FILE__, __LINE__, "isc_mutex_init() %s", - isc_msgcat_get(isc_msgcat, ISC_MSGSET_GENERAL, - ISC_MSG_FAILED, "failed")); + isc_msgcat_get(isc_msgcat, + ISC_MSGSET_GENERAL, + ISC_MSG_FAILED, + "failed")); (memfree)(arg, ctx); return (ISC_R_UNEXPECTED); } @@ -732,6 +751,7 @@ isc_mem_createx(size_t init_max_size, size_t target_size, ctx->max_size = DEF_MAX_SIZE; else ctx->max_size = init_max_size; + ctx->flags = flags; ctx->references = 1; ctx->quota = 0; ctx->total = 0; @@ -818,7 +838,8 @@ isc_mem_createx(size_t init_max_size, size_t target_size, if (ctx->debuglist != NULL) (ctx->memfree)(ctx->arg, ctx->debuglist); #endif /* ISC_MEM_TRACKLINES */ - DESTROYLOCK(&ctx->lock); + if ((ctx->flags & ISC_MEMFLAG_NOLOCK) == 0) + DESTROYLOCK(&ctx->lock); (memfree)(arg, ctx); } @@ -829,9 +850,18 @@ isc_result_t isc_mem_create(size_t init_max_size, size_t target_size, isc_mem_t **ctxp) { 
- return (isc_mem_createx(init_max_size, target_size, - default_memalloc, default_memfree, NULL, - ctxp)); + return (isc_mem_createx2(init_max_size, target_size, + default_memalloc, default_memfree, NULL, + ctxp, 0)); +} + +isc_result_t +isc_mem_create2(size_t init_max_size, size_t target_size, + isc_mem_t **ctxp, unsigned int flags) +{ + return (isc_mem_createx2(init_max_size, target_size, + default_memalloc, default_memfree, NULL, + ctxp, flags)); } static void @@ -891,7 +921,8 @@ destroy(isc_mem_t *ctx) { ondest = ctx->ondestroy; - DESTROYLOCK(&ctx->lock); + if ((ctx->flags & ISC_MEMFLAG_NOLOCK) == 0) + DESTROYLOCK(&ctx->lock); (ctx->memfree)(ctx->arg, ctx); isc_ondestroy_notify(&ondest, ctx); @@ -902,9 +933,9 @@ isc_mem_attach(isc_mem_t *source, isc_mem_t **targetp) { REQUIRE(VALID_CONTEXT(source)); REQUIRE(targetp != NULL && *targetp == NULL); - LOCK(&source->lock); + MCTXLOCK(source, &source->lock); source->references++; - UNLOCK(&source->lock); + MCTXUNLOCK(source, &source->lock); *targetp = source; } @@ -918,12 +949,12 @@ isc_mem_detach(isc_mem_t **ctxp) { ctx = *ctxp; REQUIRE(VALID_CONTEXT(ctx)); - LOCK(&ctx->lock); + MCTXLOCK(ctx, &ctx->lock); INSIST(ctx->references > 0); ctx->references--; if (ctx->references == 0) want_destroy = ISC_TRUE; - UNLOCK(&ctx->lock); + MCTXUNLOCK(ctx, &ctx->lock); if (want_destroy) destroy(ctx); @@ -958,11 +989,11 @@ isc__mem_putanddetach(isc_mem_t **ctxp, void *ptr, size_t size FLARG) { *ctxp = NULL; #if ISC_MEM_USE_INTERNAL_MALLOC - LOCK(&ctx->lock); + MCTXLOCK(ctx, &ctx->lock); mem_putunlocked(ctx, ptr, size); #else /* ISC_MEM_USE_INTERNAL_MALLOC */ mem_put(ctx, ptr, size); - LOCK(&ctx->lock); + MCTXLOCK(ctx, &ctx->lock); mem_putstats(ctx, ptr, size); #endif /* ISC_MEM_USE_INTERNAL_MALLOC */ @@ -972,7 +1003,7 @@ isc__mem_putanddetach(isc_mem_t **ctxp, void *ptr, size_t size FLARG) { if (ctx->references == 0) want_destroy = ISC_TRUE; - UNLOCK(&ctx->lock); + MCTXUNLOCK(ctx, &ctx->lock); if (want_destroy) destroy(ctx); @@ 
-991,14 +1022,14 @@ isc_mem_destroy(isc_mem_t **ctxp) { ctx = *ctxp; REQUIRE(VALID_CONTEXT(ctx)); - LOCK(&ctx->lock); + MCTXLOCK(ctx, &ctx->lock); #if ISC_MEM_TRACKLINES if (ctx->references != 1) print_active(ctx, stderr); #endif REQUIRE(ctx->references == 1); ctx->references--; - UNLOCK(&ctx->lock); + MCTXUNLOCK(ctx, &ctx->lock); destroy(ctx); @@ -1009,9 +1040,9 @@ isc_result_t isc_mem_ondestroy(isc_mem_t *ctx, isc_task_t *task, isc_event_t **event) { isc_result_t res; - LOCK(&ctx->lock); + MCTXLOCK(ctx, &ctx->lock); res = isc_ondestroy_register(&ctx->ondestroy, task, event); - UNLOCK(&ctx->lock); + MCTXUNLOCK(ctx, &ctx->lock); return (res); } @@ -1025,11 +1056,11 @@ isc__mem_get(isc_mem_t *ctx, size_t size FLARG) { REQUIRE(VALID_CONTEXT(ctx)); #if ISC_MEM_USE_INTERNAL_MALLOC - LOCK(&ctx->lock); + MCTXLOCK(ctx, &ctx->lock); ptr = mem_getunlocked(ctx, size); #else /* ISC_MEM_USE_INTERNAL_MALLOC */ ptr = mem_get(ctx, size); - LOCK(&ctx->lock); + MCTXLOCK(ctx, &ctx->lock); if (ptr != NULL) mem_getstats(ctx, size); #endif /* ISC_MEM_USE_INTERNAL_MALLOC */ @@ -1047,7 +1078,7 @@ isc__mem_get(isc_mem_t *ctx, size_t size FLARG) { fprintf(stderr, "maxinuse = %lu\n", (unsigned long)ctx->inuse); } - UNLOCK(&ctx->lock); + MCTXUNLOCK(ctx, &ctx->lock); if (call_water) (ctx->water)(ctx->water_arg, ISC_MEM_HIWATER); @@ -1064,11 +1095,11 @@ isc__mem_put(isc_mem_t *ctx, void *ptr, size_t size FLARG) REQUIRE(ptr != NULL); #if ISC_MEM_USE_INTERNAL_MALLOC - LOCK(&ctx->lock); + MCTXLOCK(ctx, &ctx->lock); mem_putunlocked(ctx, ptr, size); #else /* ISC_MEM_USE_INTERNAL_MALLOC */ mem_put(ctx, ptr, size); - LOCK(&ctx->lock); + MCTXLOCK(ctx, &ctx->lock); mem_putstats(ctx, ptr, size); #endif /* ISC_MEM_USE_INTERNAL_MALLOC */ @@ -1086,7 +1117,7 @@ isc__mem_put(isc_mem_t *ctx, void *ptr, size_t size FLARG) if (ctx->water != NULL) call_water = ISC_TRUE; } - UNLOCK(&ctx->lock); + MCTXUNLOCK(ctx, &ctx->lock); if (call_water) (ctx->water)(ctx->water_arg, ISC_MEM_LOWATER); @@ -1143,7 +1174,7 @@ 
isc_mem_stats(isc_mem_t *ctx, FILE *out) { const isc_mempool_t *pool; REQUIRE(VALID_CONTEXT(ctx)); - LOCK(&ctx->lock); + MCTXLOCK(ctx, &ctx->lock); for (i = 0; i <= ctx->max_size; i++) { s = &ctx->stats[i]; @@ -1205,7 +1236,7 @@ isc_mem_stats(isc_mem_t *ctx, FILE *out) { print_active(ctx, out); #endif - UNLOCK(&ctx->lock); + MCTXUNLOCK(ctx, &ctx->lock); } /* @@ -1236,11 +1267,11 @@ isc__mem_allocate(isc_mem_t *ctx, size_t size FLARG) { REQUIRE(VALID_CONTEXT(ctx)); #if ISC_MEM_USE_INTERNAL_MALLOC - LOCK(&ctx->lock); + MCTXLOCK(ctx, &ctx->lock); si = isc__mem_allocateunlocked(ctx, size); #else /* ISC_MEM_USE_INTERNAL_MALLOC */ si = isc__mem_allocateunlocked(ctx, size); - LOCK(&ctx->lock); + MCTXLOCK(ctx, &ctx->lock); if (si != NULL) mem_getstats(ctx, si[-1].u.size); #endif /* ISC_MEM_USE_INTERNAL_MALLOC */ @@ -1249,7 +1280,7 @@ isc__mem_allocate(isc_mem_t *ctx, size_t size FLARG) { ADD_TRACE(ctx, si, si[-1].u.size, file, line); #endif - UNLOCK(&ctx->lock); + MCTXUNLOCK(ctx, &ctx->lock); return (si); } @@ -1266,17 +1297,17 @@ isc__mem_free(isc_mem_t *ctx, void *ptr FLARG) { size = si->u.size; #if ISC_MEM_USE_INTERNAL_MALLOC - LOCK(&ctx->lock); + MCTXLOCK(ctx, &ctx->lock); mem_putunlocked(ctx, si, size); #else /* ISC_MEM_USE_INTERNAL_MALLOC */ mem_put(ctx, si, size); - LOCK(&ctx->lock); + MCTXLOCK(ctx, &ctx->lock); mem_putstats(ctx, si, size); #endif /* ISC_MEM_USE_INTERNAL_MALLOC */ DELETE_TRACE(ctx, ptr, size, file, line); - UNLOCK(&ctx->lock); + MCTXUNLOCK(ctx, &ctx->lock); } @@ -1305,11 +1336,11 @@ isc__mem_strdup(isc_mem_t *mctx, const char *s FLARG) { void isc_mem_setdestroycheck(isc_mem_t *ctx, isc_boolean_t flag) { REQUIRE(VALID_CONTEXT(ctx)); - LOCK(&ctx->lock); + MCTXLOCK(ctx, &ctx->lock); ctx->checkfree = flag; - UNLOCK(&ctx->lock); + MCTXUNLOCK(ctx, &ctx->lock); } /* @@ -1319,11 +1350,11 @@ isc_mem_setdestroycheck(isc_mem_t *ctx, isc_boolean_t flag) { void isc_mem_setquota(isc_mem_t *ctx, size_t quota) { REQUIRE(VALID_CONTEXT(ctx)); - LOCK(&ctx->lock); + 
MCTXLOCK(ctx, &ctx->lock); ctx->quota = quota; - UNLOCK(&ctx->lock); + MCTXUNLOCK(ctx, &ctx->lock); } size_t @@ -1331,11 +1362,11 @@ isc_mem_getquota(isc_mem_t *ctx) { size_t quota; REQUIRE(VALID_CONTEXT(ctx)); - LOCK(&ctx->lock); + MCTXLOCK(ctx, &ctx->lock); quota = ctx->quota; - UNLOCK(&ctx->lock); + MCTXUNLOCK(ctx, &ctx->lock); return (quota); } @@ -1345,11 +1376,11 @@ isc_mem_inuse(isc_mem_t *ctx) { size_t inuse; REQUIRE(VALID_CONTEXT(ctx)); - LOCK(&ctx->lock); + MCTXLOCK(ctx, &ctx->lock); inuse = ctx->inuse; - UNLOCK(&ctx->lock); + MCTXUNLOCK(ctx, &ctx->lock); return (inuse); } @@ -1361,7 +1392,7 @@ isc_mem_setwater(isc_mem_t *ctx, isc_mem_water_t water, void *water_arg, REQUIRE(VALID_CONTEXT(ctx)); REQUIRE(hiwater >= lowater); - LOCK(&ctx->lock); + MCTXLOCK(ctx, &ctx->lock); if (water == NULL) { ctx->water = NULL; ctx->water_arg = NULL; @@ -1375,7 +1406,7 @@ isc_mem_setwater(isc_mem_t *ctx, isc_mem_water_t water, void *water_arg, ctx->lo_water = lowater; ctx->hi_called = ISC_FALSE; } - UNLOCK(&ctx->lock); + MCTXUNLOCK(ctx, &ctx->lock); } /* @@ -1415,9 +1446,9 @@ isc_mempool_create(isc_mem_t *mctx, size_t size, isc_mempool_t **mpctxp) { *mpctxp = mpctx; - LOCK(&mctx->lock); + MCTXLOCK(mctx, &mctx->lock); ISC_LIST_INITANDAPPEND(mctx->pools, mpctx, link); - UNLOCK(&mctx->lock); + MCTXUNLOCK(mctx, &mctx->lock); return (ISC_R_SUCCESS); } @@ -1470,7 +1501,7 @@ isc_mempool_destroy(isc_mempool_t **mpctxp) { /* * Return any items on the free list */ - LOCK(&mctx->lock); + MCTXLOCK(mctx, &mctx->lock); while (mpctx->items != NULL) { INSIST(mpctx->freecount > 0); mpctx->freecount--; @@ -1484,14 +1515,14 @@ isc_mempool_destroy(isc_mempool_t **mpctxp) { mem_putstats(mctx, item, mpctx->size); #endif /* ISC_MEM_USE_INTERNAL_MALLOC */ } - UNLOCK(&mctx->lock); + MCTXUNLOCK(mctx, &mctx->lock); /* * Remove our linked list entry from the memory context. 
*/ - LOCK(&mctx->lock); + MCTXLOCK(mctx, &mctx->lock); ISC_LIST_UNLINK(mctx->pools, mpctx, link); - UNLOCK(&mctx->lock); + MCTXUNLOCK(mctx, &mctx->lock); mpctx->magic = 0; @@ -1550,7 +1581,7 @@ isc__mempool_get(isc_mempool_t *mpctx FLARG) { * We need to dip into the well. Lock the memory context here and * fill up our free list. */ - LOCK(&mctx->lock); + MCTXLOCK(mctx, &mctx->lock); for (i = 0; i < mpctx->fillcount; i++) { #if ISC_MEM_USE_INTERNAL_MALLOC item = mem_getunlocked(mctx, mpctx->size); @@ -1565,7 +1596,7 @@ isc__mempool_get(isc_mempool_t *mpctx FLARG) { mpctx->items = item; mpctx->freecount++; } - UNLOCK(&mctx->lock); + MCTXUNLOCK(mctx, &mctx->lock); /* * If we didn't get any items, return NULL. @@ -1585,9 +1616,9 @@ isc__mempool_get(isc_mempool_t *mpctx FLARG) { #if ISC_MEM_TRACKLINES if (item != NULL) { - LOCK(&mctx->lock); + MCTXLOCK(mctx, &mctx->lock); ADD_TRACE(mctx, item, mpctx->size, file, line); - UNLOCK(&mctx->lock); + MCTXUNLOCK(mctx, &mctx->lock); } #endif /* ISC_MEM_TRACKLINES */ @@ -1611,9 +1642,9 @@ isc__mempool_put(isc_mempool_t *mpctx, void *mem FLARG) { mpctx->allocated--; #if ISC_MEM_TRACKLINES - LOCK(&mctx->lock); + MCTXLOCK(mctx, &mctx->lock); DELETE_TRACE(mctx, mem, mpctx->size, file, line); - UNLOCK(&mctx->lock); + MCTXUNLOCK(mctx, &mctx->lock); #endif /* ISC_MEM_TRACKLINES */ /* @@ -1621,14 +1652,14 @@ isc__mempool_put(isc_mempool_t *mpctx, void *mem FLARG) { */ if (mpctx->freecount >= mpctx->freemax) { #if ISC_MEM_USE_INTERNAL_MALLOC - LOCK(&mctx->lock); + MCTXLOCK(mctx, &mctx->lock); mem_putunlocked(mctx, mem, mpctx->size); - UNLOCK(&mctx->lock); + MCTXUNLOCK(mctx, &mctx->lock); #else /* ISC_MEM_USE_INTERNAL_MALLOC */ mem_put(mctx, mem, mpctx->size); - LOCK(&mctx->lock); + MCTXLOCK(mctx, &mctx->lock); mem_putstats(mctx, mem, mpctx->size); - UNLOCK(&mctx->lock); + MCTXUNLOCK(mctx, &mctx->lock); #endif /* ISC_MEM_USE_INTERNAL_MALLOC */ if (mpctx->lock != NULL) UNLOCK(mpctx->lock); diff --git a/lib/isc/noatomic/include/isc/atomic.h 
b/lib/isc/noatomic/include/isc/atomic.h new file mode 100644 index 0000000000..08681ce527 --- /dev/null +++ b/lib/isc/noatomic/include/isc/atomic.h @@ -0,0 +1,24 @@ +/* + * Copyright (C) 2005 Internet Systems Consortium, Inc. ("ISC") + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +/* $Id: atomic.h,v 1.2 2005/06/04 05:32:49 jinmei Exp $ */ + +#ifndef ISC_ATOMIC_H +#define ISC_ATOMIC_H 1 + +/* This file is inherently empty. */ + +#endif /* ISC_ATOMIC_H */ diff --git a/lib/isc/rwlock.c b/lib/isc/rwlock.c index 3e6fd2f261..1498af43be 100644 --- a/lib/isc/rwlock.c +++ b/lib/isc/rwlock.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: rwlock.c,v 1.40 2005/04/27 04:57:14 sra Exp $ */ +/* $Id: rwlock.c,v 1.41 2005/06/04 05:32:48 jinmei Exp $ */ /*! 
\file */ @@ -23,6 +23,7 @@ #include +#include #include #include #include @@ -83,6 +84,20 @@ isc_rwlock_init(isc_rwlock_t *rwl, unsigned int read_quota, */ rwl->magic = 0; +#if defined(ISC_PLATFORM_HAVEXADD) && defined(ISC_PLATFORM_HAVECMPXCHG) + rwl->write_requests = 0; + rwl->write_completions = 0; + rwl->cnt_and_flag = 0; + rwl->readers_waiting = 0; + rwl->write_granted = 0; + if (read_quota != 0) { + UNEXPECTED_ERROR(__FILE__, __LINE__, + "read quota is not supported"); + } + if (write_quota == 0) + write_quota = RWLOCK_DEFAULT_WRITE_QUOTA; + rwl->write_quota = write_quota; +#else rwl->type = isc_rwlocktype_read; rwl->original = isc_rwlocktype_none; rwl->active = 0; @@ -95,6 +110,8 @@ isc_rwlock_init(isc_rwlock_t *rwl, unsigned int read_quota, if (write_quota == 0) write_quota = RWLOCK_DEFAULT_WRITE_QUOTA; rwl->write_quota = write_quota; +#endif + result = isc_mutex_init(&rwl->lock); if (result != ISC_R_SUCCESS) { UNEXPECTED_ERROR(__FILE__, __LINE__, @@ -113,7 +130,6 @@ isc_rwlock_init(isc_rwlock_t *rwl, unsigned int read_quota, isc_result_totext(result)); result = ISC_R_UNEXPECTED; goto destroy_lock; - } result = isc_condition_init(&rwl->writeable); if (result != ISC_R_SUCCESS) { @@ -138,6 +154,389 @@ isc_rwlock_init(isc_rwlock_t *rwl, unsigned int read_quota, return (result); } +void +isc_rwlock_destroy(isc_rwlock_t *rwl) { + REQUIRE(VALID_RWLOCK(rwl)); + +#if defined(ISC_PLATFORM_HAVEXADD) && defined(ISC_PLATFORM_HAVECMPXCHG) + REQUIRE(rwl->write_requests == rwl->write_completions && + rwl->cnt_and_flag == 0 && rwl->readers_waiting == 0); +#else + LOCK(&rwl->lock); + REQUIRE(rwl->active == 0 && + rwl->readers_waiting == 0 && + rwl->writers_waiting == 0); + UNLOCK(&rwl->lock); +#endif + + rwl->magic = 0; + (void)isc_condition_destroy(&rwl->readable); + (void)isc_condition_destroy(&rwl->writeable); + DESTROYLOCK(&rwl->lock); +} + +#if defined(ISC_PLATFORM_HAVEXADD) && defined(ISC_PLATFORM_HAVECMPXCHG) + +/* + * When some architecture-dependent atomic operations 
are available, + * rwlock can be more efficient than the generic algorithm defined below. + * The basic algorithm is described in the following URL: + * http://www.cs.rochester.edu/u/scott/synchronization/pseudocode/rw.html + * + * The key is to use the following integer variables modified atomically: + * write_requests, write_completions, and cnt_and_flag. + * + * write_requests and write_completions act as a waiting queue for writers + * in order to ensure the FIFO order. Both variables begin with the initial + * value of 0. When a new writer tries to get a write lock, it increments + * write_requests and gets the previous value of the variable as a "ticket". + * When write_completions reaches the ticket number, the new writer can start + * writing. When the writer completes its work, it increments + * write_completions so that another new writer can start working. If + * write_requests is not equal to write_completions, it means a writer is now + * working or waiting. In this case, new readers cannot start reading; + * in other words, this algorithm basically prefers writers. + * + * cnt_and_flag is a "lock" shared by all readers and writers. This integer + * variable is a kind of structure with two members: writer_flag (1 bit) and + * reader_count (31 bits). The writer_flag shows whether a writer is working, + * and the reader_count shows the number of readers currently working or almost + * ready to work. A writer who has the current "ticket" tries to get the + * lock by exclusively setting the writer_flag to 1, provided that the whole + * 32-bit value is 0 (meaning no readers or writers working). On the other hand, + * a new reader tries to increment the "reader_count" field provided that + * the writer_flag is 0 (meaning there is no writer working). + * + * If some of the above operations fail, the reader or the writer sleeps + * until the related condition changes.
When a working reader or writer + * completes its work, if some readers or writers are sleeping and the condition + * that suspended them has changed, it wakes up the sleeping + * readers or writers. + * + * As already noted, this algorithm basically prefers writers. In order to + * prevent readers from starving, however, the algorithm also introduces the + * "writer quota" (Q). When Q consecutive writers have completed their work, + * suspending readers, the last writer will wake up the readers, even if a new + * writer is waiting. + * + * Implementation-specific note: due to the combination of atomic operations + * and a mutex lock, ordering between the atomic operation and locks can be + * very sensitive in some cases. In particular, it is generally very important + * to check the atomic variable that requires a reader or writer to sleep after + * locking the mutex and before actually sleeping; otherwise, it is very + * likely to cause a deadlock. For example, assume "var" is an + * atomically modified variable; then the corresponding code would be: + * if (var == need_sleep) { + * LOCK(lock); + * if (var == need_sleep) + * WAIT(cond, lock); + * UNLOCK(lock); + * } + * The second check is important, since "var" is protected by the atomic + * operation, not by the mutex, and can be changed just before sleeping. + * (The first "if" could be omitted, but this is also important in order to + * make the code efficient by avoiding the use of the mutex unless it is + * really necessary.)
+ */ + +#define WRITER_ACTIVE 0x1 +#define READER_INCR 0x2 + +isc_result_t +isc_rwlock_lock(isc_rwlock_t *rwl, isc_rwlocktype_t type) { + isc_int32_t cntflag; + + REQUIRE(VALID_RWLOCK(rwl)); + +#ifdef ISC_RWLOCK_TRACE + print_lock(isc_msgcat_get(isc_msgcat, ISC_MSGSET_RWLOCK, + ISC_MSG_PRELOCK, "prelock"), rwl, type); +#endif + + if (type == isc_rwlocktype_read) { + if (rwl->write_requests != rwl->write_completions) { + /* there is a waiting or active writer */ + LOCK(&rwl->lock); + if (rwl->write_requests != rwl->write_completions) { + rwl->readers_waiting++; + WAIT(&rwl->readable, &rwl->lock); + rwl->readers_waiting--; + } + UNLOCK(&rwl->lock); + } + + cntflag = isc_atomic_xadd(&rwl->cnt_and_flag, READER_INCR); + while (1) { + if ((rwl->cnt_and_flag & WRITER_ACTIVE) == 0) + break; + + /* A writer is still working */ + LOCK(&rwl->lock); + rwl->readers_waiting++; + if ((rwl->cnt_and_flag & WRITER_ACTIVE) != 0) + WAIT(&rwl->readable, &rwl->lock); + rwl->readers_waiting--; + UNLOCK(&rwl->lock); + + /* + * Typically, the reader should be able to get a lock + * at this stage: + * (1) there should have been no pending writer when + * the reader was trying to increment the + * counter; otherwise, the writer should be in + * the waiting queue, preventing the reader from + * proceeding to this point. + * (2) once the reader increments the counter, no + * more writer can get a lock. + * Still, it is possible another writer can work at + * this point, e.g. in the following scenario: + * A previous writer unlocks the writer lock. + * This reader proceeds to point (1). + * A new writer appears, and gets a new lock before + * the reader increments the counter. + * The reader then increments the counter. + * The previous writer notices there is a waiting + * reader who is almost ready, and wakes it up. + * So, the reader needs to confirm whether it can now + * read explicitly (thus we loop). 
Note that this is + * not an infinite process, since the reader has + * incremented the counter at this point. + */ + } + + /* + * If we are temporarily preferred to writers due to the writer + * quota, reset the condition (race among readers doesn't + * matter). + */ + rwl->write_granted = 0; + } else { + isc_int32_t prev_writer; + + /* enter the waiting queue, and wait for our turn */ + prev_writer = isc_atomic_xadd(&rwl->write_requests, 1); + while (rwl->write_completions != prev_writer) { + LOCK(&rwl->lock); + if (rwl->write_completions != prev_writer) { + WAIT(&rwl->writeable, &rwl->lock); + UNLOCK(&rwl->lock); + continue; + } + UNLOCK(&rwl->lock); + break; + } + + while (1) { + cntflag = isc_atomic_cmpxchg(&rwl->cnt_and_flag, 0, + WRITER_ACTIVE); + if (cntflag == 0) + break; + + /* Another active reader or writer is working. */ + LOCK(&rwl->lock); + if (rwl->cnt_and_flag != 0) + WAIT(&rwl->writeable, &rwl->lock); + UNLOCK(&rwl->lock); + } + + INSIST((rwl->cnt_and_flag & WRITER_ACTIVE) != 0); + rwl->write_granted++; + } + +#ifdef ISC_RWLOCK_TRACE + print_lock(isc_msgcat_get(isc_msgcat, ISC_MSGSET_RWLOCK, + ISC_MSG_POSTLOCK, "postlock"), rwl, type); +#endif + + return (ISC_R_SUCCESS); +} + +isc_result_t +isc_rwlock_trylock(isc_rwlock_t *rwl, isc_rwlocktype_t type) { + isc_int32_t cntflag; + + REQUIRE(VALID_RWLOCK(rwl)); + +#ifdef ISC_RWLOCK_TRACE + print_lock(isc_msgcat_get(isc_msgcat, ISC_MSGSET_RWLOCK, + ISC_MSG_PRELOCK, "prelock"), rwl, type); +#endif + + if (type == isc_rwlocktype_read) { + /* If a writer is waiting or working, we fail. */ + if (rwl->write_requests != rwl->write_completions) + return (ISC_R_LOCKBUSY); + + /* Otherwise, be ready for reading. */ + cntflag = isc_atomic_xadd(&rwl->cnt_and_flag, READER_INCR); + if ((cntflag & WRITER_ACTIVE) != 0) { + /* + * A writer is working. We lose, and cancel the read + * request. 
+ */ + cntflag = isc_atomic_xadd(&rwl->cnt_and_flag, + -READER_INCR); + /* + * If no other readers are waiting and we've suspended + * new writers in this short period, wake them up. + */ + if (cntflag == READER_INCR && + rwl->write_completions != rwl->write_requests) { + LOCK(&rwl->lock); + BROADCAST(&rwl->writeable); + UNLOCK(&rwl->lock); + } + + return (ISC_R_LOCKBUSY); + } + } else { + /* Try locking without entering the waiting queue. */ + cntflag = isc_atomic_cmpxchg(&rwl->cnt_and_flag, 0, + WRITER_ACTIVE); + if (cntflag != 0) + return (ISC_R_LOCKBUSY); + + /* + * XXXJT: jump into the queue, possibly breaking the writer + * order. + */ + (void)isc_atomic_xadd(&rwl->write_completions, -1); + + rwl->write_granted++; + } + +#ifdef ISC_RWLOCK_TRACE + print_lock(isc_msgcat_get(isc_msgcat, ISC_MSGSET_RWLOCK, + ISC_MSG_POSTLOCK, "postlock"), rwl, type); +#endif + + return (ISC_R_SUCCESS); +} + +isc_result_t +isc_rwlock_tryupgrade(isc_rwlock_t *rwl) { + isc_int32_t prevcnt; + + REQUIRE(VALID_RWLOCK(rwl)); + + /* Try to acquire write access. */ + prevcnt = isc_atomic_cmpxchg(&rwl->cnt_and_flag, + READER_INCR, WRITER_ACTIVE); + /* + * There must have been no writer, and there must have been at least + * one reader. + */ + INSIST((prevcnt & WRITER_ACTIVE) == 0 && + (prevcnt & ~WRITER_ACTIVE) != 0); + + if (prevcnt == READER_INCR) { + /* + * We are the only reader and have been upgraded. + * Now jump into the head of the writer waiting queue. + */ + (void)isc_atomic_xadd(&rwl->write_completions, -1); + } else + return (ISC_R_LOCKBUSY); + + return (ISC_R_SUCCESS); + +} + +void +isc_rwlock_downgrade(isc_rwlock_t *rwl) { + isc_int32_t prev_readers; + + REQUIRE(VALID_RWLOCK(rwl)); + + /* Become an active reader. */ + prev_readers = isc_atomic_xadd(&rwl->cnt_and_flag, READER_INCR); + /* We must have been a writer. 
*/ + INSIST((prev_readers & WRITER_ACTIVE) != 0); + + /* Complete write */ + (void)isc_atomic_xadd(&rwl->cnt_and_flag, -WRITER_ACTIVE); + (void)isc_atomic_xadd(&rwl->write_completions, 1); + + /* Resume other readers */ + LOCK(&rwl->lock); + if (rwl->readers_waiting > 0) + BROADCAST(&rwl->readable); + UNLOCK(&rwl->lock); +} + +isc_result_t +isc_rwlock_unlock(isc_rwlock_t *rwl, isc_rwlocktype_t type) { + isc_int32_t prev_cnt; + + REQUIRE(VALID_RWLOCK(rwl)); + +#ifdef ISC_RWLOCK_TRACE + print_lock(isc_msgcat_get(isc_msgcat, ISC_MSGSET_RWLOCK, + ISC_MSG_PREUNLOCK, "preunlock"), rwl, type); +#endif + + if (type == isc_rwlocktype_read) { + prev_cnt = isc_atomic_xadd(&rwl->cnt_and_flag, -READER_INCR); + + /* + * If we're the last reader and any writers are waiting, wake + * them up. We need to wake up all of them to ensure the + * FIFO order. + */ + if (prev_cnt == READER_INCR && + rwl->write_completions != rwl->write_requests) { + LOCK(&rwl->lock); + BROADCAST(&rwl->writeable); + UNLOCK(&rwl->lock); + } + } else { + isc_boolean_t wakeup_writers = ISC_TRUE; + + /* + * Reset the flag, and (implicitly) tell other writers + * we are done. + */ + (void)isc_atomic_xadd(&rwl->cnt_and_flag, -WRITER_ACTIVE); + (void)isc_atomic_xadd(&rwl->write_completions, 1); + + if (rwl->write_granted >= rwl->write_quota || + rwl->write_requests == rwl->write_completions || + (rwl->cnt_and_flag & ~WRITER_ACTIVE) != 0) { + /* + * We have passed the write quota, no writer is + * waiting, or some readers are almost ready, pending + * possible writers. Note that the last case can + * happen even if write_requests != write_completions + * (which means a new writer in the queue), so we need + * to catch the case explicitly. 
+ */ + LOCK(&rwl->lock); + if (rwl->readers_waiting > 0) { + wakeup_writers = ISC_FALSE; + BROADCAST(&rwl->readable); + } + UNLOCK(&rwl->lock); + } + + if (rwl->write_requests != rwl->write_completions && + wakeup_writers) { + LOCK(&rwl->lock); + BROADCAST(&rwl->writeable); + UNLOCK(&rwl->lock); + } + } + +#ifdef ISC_RWLOCK_TRACE + print_lock(isc_msgcat_get(isc_msgcat, ISC_MSGSET_RWLOCK, + ISC_MSG_POSTUNLOCK, "postunlock"), + rwl, type); +#endif + + return (ISC_R_SUCCESS); +} + +#else /* ISC_PLATFORM_HAVEXADD && ISC_PLATFORM_HAVECMPXCHG */ + static isc_result_t doit(isc_rwlock_t *rwl, isc_rwlocktype_t type, isc_boolean_t nonblock) { isc_boolean_t skip = ISC_FALSE; @@ -323,22 +722,7 @@ isc_rwlock_unlock(isc_rwlock_t *rwl, isc_rwlocktype_t type) { return (ISC_R_SUCCESS); } -void -isc_rwlock_destroy(isc_rwlock_t *rwl) { - REQUIRE(VALID_RWLOCK(rwl)); - - LOCK(&rwl->lock); - REQUIRE(rwl->active == 0 && - rwl->readers_waiting == 0 && - rwl->writers_waiting == 0); - UNLOCK(&rwl->lock); - - rwl->magic = 0; - (void)isc_condition_destroy(&rwl->readable); - (void)isc_condition_destroy(&rwl->writeable); - DESTROYLOCK(&rwl->lock); -} - +#endif /* ISC_PLATFORM_HAVEXADD && ISC_PLATFORM_HAVECMPXCHG */ #else /* ISC_PLATFORM_USETHREADS */ isc_result_t diff --git a/lib/isc/sparc64/include/isc/atomic.h b/lib/isc/sparc64/include/isc/atomic.h new file mode 100644 index 0000000000..2d6cdd3e44 --- /dev/null +++ b/lib/isc/sparc64/include/isc/atomic.h @@ -0,0 +1,119 @@ +/* + * Copyright (C) 2005 Internet Systems Consortium, Inc. ("ISC") + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. 
IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +/* $Id: atomic.h,v 1.2 2005/06/04 05:32:49 jinmei Exp $ */ + +/* + * This code was written based on FreeBSD's kernel source whose copyright + * follows: + */ + +/*- + * Copyright (c) 1998 Doug Rabson. + * Copyright (c) 2001 Jake Burkholder. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * from: FreeBSD: src/sys/i386/include/atomic.h,v 1.20 2001/02/11 + * $FreeBSD: src/sys/sparc64/include/atomic.h,v 1.8 2004/05/22 00:52:16 marius Exp $ + */ + +#ifndef ISC_ATOMIC_H +#define ISC_ATOMIC_H 1 + +#include +#include + +#define ASI_P 0x80 /* Primary Address Space Identifier */ + +/* + * This routine atomically increments the value stored in 'p' by 'val', and + * returns the previous value. + */ +static inline isc_int32_t +isc_atomic_xadd(isc_int32_t *p, isc_int32_t val) { + isc_int32_t prev, swapped; + + for (prev = *(volatile isc_int32_t *)p; ; prev = swapped) { + swapped = prev + val; + __asm__ volatile( + "casa [%1] %2, %3, %0" + : "+r"(swapped) + : "r"(p), "n"(ASI_P), "r"(prev)); + if (swapped == prev) + break; + } + + return (prev); +} + +/* + * This routine atomically stores the value 'val' in 'p'. + */ +static inline void +isc_atomic_store(isc_int32_t *p, isc_int32_t val) { + isc_int32_t prev, swapped; + + for (prev = *(volatile isc_int32_t *)p; ; prev = swapped) { + swapped = val; + __asm__ volatile( + "casa [%1] %2, %3, %0" + : "+r"(swapped) + : "r"(p), "n"(ASI_P), "r"(prev) + : "memory"); + if (swapped == prev) + break; + } +} + +/* + * This routine atomically replaces the value in 'p' with 'val', if the + * original value is equal to 'cmpval'. The original value is returned in any + * case. + */ +static inline isc_int32_t +isc_atomic_cmpxchg(isc_int32_t *p, isc_int32_t cmpval, isc_int32_t val) { + isc_int32_t temp = val; + + __asm__ volatile( + "casa [%1] %2, %3, %0" + : "+r"(temp) + : "r"(p), "n"(ASI_P), "r"(cmpval)); + + return (temp); +} + +#endif /* ISC_ATOMIC_H */ diff --git a/lib/isc/unix/include/isc/stdtime.h b/lib/isc/unix/include/isc/stdtime.h index 45f1f75a78..be673e71e4 100644 --- a/lib/isc/unix/include/isc/stdtime.h +++ b/lib/isc/unix/include/isc/stdtime.h @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. 
*/ -/* $Id: stdtime.h,v 1.11 2005/04/29 00:23:53 marka Exp $ */ +/* $Id: stdtime.h,v 1.12 2005/06/04 05:32:49 jinmei Exp $ */ #ifndef ISC_STDTIME_H #define ISC_STDTIME_H 1 @@ -31,6 +31,12 @@ * about its size. */ typedef isc_uint32_t isc_stdtime_t; +/* + * isc_stdtime32_t is a 32-bit version of isc_stdtime_t. A variable of this + * type should only be used as an opaque integer (e.g.,) to compare two + * time values. + */ +typedef isc_uint32_t isc_stdtime32_t; ISC_LANG_BEGINDECLS /* */ @@ -44,6 +50,11 @@ isc_stdtime_get(isc_stdtime_t *t); *\li 't' is a valid pointer. */ +#define isc_stdtime_convert32(t, t32p) (*(t32p) = t) +/* + * Convert the standard time to its 32-bit version. + */ + ISC_LANG_ENDDECLS #endif /* ISC_STDTIME_H */ diff --git a/lib/isc/win32/include/isc/stdtime.h b/lib/isc/win32/include/isc/stdtime.h index 3b0460cb41..ee3ad1e67f 100644 --- a/lib/isc/win32/include/isc/stdtime.h +++ b/lib/isc/win32/include/isc/stdtime.h @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: stdtime.h,v 1.8 2004/03/05 05:12:06 marka Exp $ */ +/* $Id: stdtime.h,v 1.9 2005/06/04 05:32:49 jinmei Exp $ */ #ifndef ISC_STDTIME_H #define ISC_STDTIME_H 1 @@ -29,6 +29,12 @@ * about its size. */ typedef isc_uint32_t isc_stdtime_t; +/* + * isc_stdtime32_t is a 32-bit version of isc_stdtime_t. A variable of this + * type should only be used as an opaque integer (e.g.,) to compare two + * time values. + */ +typedef isc_uint32_t isc_stdtime32_t; ISC_LANG_BEGINDECLS @@ -42,6 +48,11 @@ isc_stdtime_get(isc_stdtime_t *t); * 't' is a valid pointer. */ +#define isc_stdtime_convert32(t, t32p) (*(t32p) = t) +/* + * Convert the standard time to its 32-bit version. + */ + ISC_LANG_ENDDECLS #endif /* ISC_STDTIME_H */ diff --git a/lib/isc/x86_32/include/isc/atomic.h b/lib/isc/x86_32/include/isc/atomic.h new file mode 100644 index 0000000000..807d6967ec --- /dev/null +++ b/lib/isc/x86_32/include/isc/atomic.h @@ -0,0 +1,83 @@ +/* + * Copyright (C) 2005 Internet Systems Consortium, Inc. 
("ISC") + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND ISC DISCLAIMS ALL WARRANTIES WITH + * REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY + * AND FITNESS. IN NO EVENT SHALL ISC BE LIABLE FOR ANY SPECIAL, DIRECT, + * INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM + * LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE + * OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR + * PERFORMANCE OF THIS SOFTWARE. + */ + +/* $Id: atomic.h,v 1.2 2005/06/04 05:32:50 jinmei Exp $ */ + +#ifndef ISC_ATOMIC_H +#define ISC_ATOMIC_H 1 + +#include +#include + +/* + * This routine atomically increments the value stored in 'p' by 'val', and + * returns the previous value. + */ +static inline isc_int32_t +isc_atomic_xadd(isc_int32_t *p, isc_int32_t val) { + isc_int32_t prev = val; + + __asm__ volatile( +#ifdef ISC_PLATFORM_USETHREADS + "lock;" +#endif + "xadd %0, %1" + :"=q"(prev) + :"m"(*p), "0"(prev) + :"memory", "cc"); + + return (prev); +} + +/* + * This routine atomically stores the value 'val' in 'p'. + */ +static inline void +isc_atomic_store(isc_int32_t *p, isc_int32_t val) { + __asm__ volatile( +#ifdef ISC_PLATFORM_USETHREADS + /* + * xchg should automatically lock memory, but we add it + * explicitly just in case (it at least doesn't harm) + */ + "lock;" +#endif + "xchgl %1, %0" + : + : "r"(val), "m"(*p) + : "memory"); +} + +/* + * This routine atomically replaces the value in 'p' with 'val', if the + * original value is equal to 'cmpval'. The original value is returned in any + * case. 
+ */ +static inline isc_int32_t +isc_atomic_cmpxchg(isc_int32_t *p, isc_int32_t cmpval, isc_int32_t val) { + __asm__ volatile( +#ifdef ISC_PLATFORM_USETHREADS + "lock;" +#endif + "cmpxchgl %1, %2" + : "=a"(cmpval) + : "r"(val), "m"(*p), "a"(cmpval) + : "memory"); + + return (cmpval); +} + +#endif /* ISC_ATOMIC_H */ diff --git a/make/includes.in b/make/includes.in index eb3b4f2a51..a5a3f003d8 100644 --- a/make/includes.in +++ b/make/includes.in @@ -1,4 +1,4 @@ -# Copyright (C) 2004 Internet Systems Consortium, Inc. ("ISC") +# Copyright (C) 2004, 2005 Internet Systems Consortium, Inc. ("ISC") # Copyright (C) 1999-2001 Internet Software Consortium. # # Permission to use, copy, modify, and distribute this software for any @@ -13,7 +13,7 @@ # OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR # PERFORMANCE OF THIS SOFTWARE. -# $Id: includes.in,v 1.18 2004/12/09 01:41:24 marka Exp $ +# $Id: includes.in,v 1.19 2005/06/04 05:32:50 jinmei Exp $ # Search for machine-generated header files in the build tree, # and for normal headers in the source tree (${top_srcdir}). @@ -25,7 +25,8 @@ ISC_INCLUDES = @BIND9_ISC_BUILDINCLUDE@ \ -I${top_srcdir}/lib/isc \ -I${top_srcdir}/lib/isc/include \ -I${top_srcdir}/lib/isc/unix/include \ - -I${top_srcdir}/lib/isc/@ISC_THREAD_DIR@/include + -I${top_srcdir}/lib/isc/@ISC_THREAD_DIR@/include \ + -I${top_srcdir}/lib/isc/@ISC_ARCH_DIR@/include ISCCC_INCLUDES = @BIND9_ISCCC_BUILDINCLUDE@ \ -I${top_srcdir}/lib/isccc/include