diff --git a/CHANGES b/CHANGES index 21e7282d43..5a18a3d62d 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,20 @@ +2937. [bug] Worked around an apparent race condition in over + memory conditions. Without this fix a DNS cache DB or + ADB could incorrectly stay in an over memory state, + effectively refusing further caching, which + subsequently made a BIND 9 caching server unworkable. + This fix prevents this problem from happening by + polling the state of the memory context, rather than + making a copy of the state, which appeared to cause + a race. This is a "workaround" in that it doesn't + solve the possible race per se, but several experiments + proved this change solves the symptom. Also, the + polling overhead hasn't been reported to be an issue. + This bug should only affect a caching server that + specifies a finite max-cache-size. It's also quite + likely that the bug happens only when enabling threads, + but it's not confirmed yet. [RT #21818] + 2936. [func] Improved configuration syntax and multiple-view support for addzone/delzone feature (see change #2930). Removed "new-zone-file" option, replaced diff --git a/lib/dns/adb.c b/lib/dns/adb.c index 2e1c607ae5..7ab8aca660 100644 --- a/lib/dns/adb.c +++ b/lib/dns/adb.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: adb.c,v 1.247 2009/02/03 22:33:13 jinmei Exp $ */ +/* $Id: adb.c,v 1.248 2010/08/11 22:54:58 jinmei Exp $ */ /*! \file * @@ -118,7 +118,6 @@ struct dns_adb { isc_taskmgr_t *taskmgr; isc_task_t *task; - isc_boolean_t overmem; isc_interval_t tick_interval; int next_cleanbucket; @@ -294,8 +293,8 @@ static inline void inc_adb_irefcnt(dns_adb_t *); static inline void inc_adb_erefcnt(dns_adb_t *); static inline void inc_entry_refcnt(dns_adb_t *, dns_adbentry_t *, isc_boolean_t); -static inline isc_boolean_t dec_entry_refcnt(dns_adb_t *, dns_adbentry_t *, - isc_boolean_t); +static inline isc_boolean_t dec_entry_refcnt(dns_adb_t *, isc_boolean_t, + dns_adbentry_t *, isc_boolean_t); static inline void violate_locking_hierarchy(isc_mutex_t *, isc_mutex_t *); static isc_boolean_t clean_namehooks(dns_adb_t *, dns_adbnamehooklist_t *); static void clean_target(dns_adb_t *, dns_name_t *); @@ -777,7 +776,7 @@ link_entry(dns_adb_t *adb, int bucket, dns_adbentry_t *entry) { int i; dns_adbentry_t *e; - if (adb->overmem) { + if (isc_mem_isovermem(adb->mctx)) { for (i = 0; i < 2; i++) { e = ISC_LIST_TAIL(adb->entries[bucket]); if (e == NULL) @@ -943,6 +942,7 @@ clean_namehooks(dns_adb_t *adb, dns_adbnamehooklist_t *namehooks) { dns_adbnamehook_t *namehook; int addr_bucket; isc_boolean_t result = ISC_FALSE; + isc_boolean_t overmem = isc_mem_isovermem(adb->mctx); addr_bucket = DNS_ADB_INVALIDBUCKET; namehook = ISC_LIST_HEAD(*namehooks); @@ -963,7 +963,8 @@ clean_namehooks(dns_adb_t *adb, dns_adbnamehooklist_t *namehooks) { LOCK(&adb->entrylocks[addr_bucket]); } - result = dec_entry_refcnt(adb, entry, ISC_FALSE); + result = dec_entry_refcnt(adb, overmem, entry, + ISC_FALSE); } /* @@ -1235,7 +1236,9 @@ inc_entry_refcnt(dns_adb_t *adb, dns_adbentry_t *entry, isc_boolean_t lock) { } static inline isc_boolean_t -dec_entry_refcnt(dns_adb_t *adb, dns_adbentry_t *entry, isc_boolean_t lock) { +dec_entry_refcnt(dns_adb_t *adb, isc_boolean_t overmem, dns_adbentry_t *entry, + isc_boolean_t lock) +{ int bucket; isc_boolean_t destroy_entry; isc_boolean_t result = ISC_FALSE; @@ -1250,7 +1253,7 @@ dec_entry_refcnt(dns_adb_t *adb, dns_adbentry_t *entry, isc_boolean_t lock) { destroy_entry = ISC_FALSE; if (entry->refcnt == 0 && - (adb->entry_sd[bucket] || entry->expires == 0 || adb->overmem || + (adb->entry_sd[bucket] || entry->expires == 0 || overmem || (entry->flags & ENTRY_IS_DEAD) != 0)) { destroy_entry = ISC_TRUE; result = unlink_entry(adb, entry); @@ -1852,7 +1855,7 @@ check_stale_name(dns_adb_t *adb, int bucket, isc_stdtime_t now) { int victims, max_victims; isc_boolean_t result; dns_adbname_t *victim, *next_victim; - isc_boolean_t overmem = adb->overmem; + isc_boolean_t overmem = isc_mem_isovermem(adb->mctx); int scans = 0; INSIST(bucket != DNS_ADB_INVALIDBUCKET); @@ -2049,7 +2052,6 @@ dns_adb_create(isc_mem_t *mem, dns_view_t *view, isc_timermgr_t *timermgr, adb, NULL, NULL); adb->cevent_sent = ISC_FALSE; adb->shutting_down = ISC_FALSE; - adb->overmem = ISC_FALSE; ISC_LIST_INIT(adb->whenshutdown); isc_mem_attach(mem, &adb->mctx); @@ -2616,6 +2618,7 @@ dns_adb_destroyfind(dns_adbfind_t **findp) { dns_adbaddrinfo_t *ai; int bucket; dns_adb_t *adb; + isc_boolean_t overmem; REQUIRE(findp != NULL && DNS_ADBFIND_VALID(*findp)); find = *findp; @@ -2640,13 +2643,14 @@ dns_adb_destroyfind(dns_adbfind_t **findp) { * Return the find to the memory pool, and decrement the adb's * reference count. */ + overmem = isc_mem_isovermem(adb->mctx); ai = ISC_LIST_HEAD(find->list); while (ai != NULL) { ISC_LIST_UNLINK(find->list, ai, publink); entry = ai->entry; ai->entry = NULL; INSIST(DNS_ADBENTRY_VALID(entry)); - RUNTIME_CHECK(dec_entry_refcnt(adb, entry, ISC_TRUE) == + RUNTIME_CHECK(dec_entry_refcnt(adb, overmem, entry, ISC_TRUE) == ISC_FALSE); free_adbaddrinfo(adb, &ai); ai = ISC_LIST_HEAD(find->list); @@ -3509,6 +3513,7 @@ dns_adb_freeaddrinfo(dns_adb_t *adb, dns_adbaddrinfo_t **addrp) { int bucket; isc_stdtime_t now; isc_boolean_t want_check_exit = ISC_FALSE; + isc_boolean_t overmem; REQUIRE(DNS_ADB_VALID(adb)); REQUIRE(addrp != NULL); @@ -3520,13 +3525,14 @@ dns_adb_freeaddrinfo(dns_adb_t *adb, dns_adbaddrinfo_t **addrp) { isc_stdtime_get(&now); *addrp = NULL; + overmem = isc_mem_isovermem(adb->mctx); bucket = addr->entry->lock_bucket; LOCK(&adb->entrylocks[bucket]); entry->expires = now + ADB_ENTRY_WINDOW; - want_check_exit = dec_entry_refcnt(adb, entry, ISC_FALSE); + want_check_exit = dec_entry_refcnt(adb, overmem, entry, ISC_FALSE); UNLOCK(&adb->entrylocks[bucket]); @@ -3591,6 +3597,14 @@ dns_adb_flushname(dns_adb_t *adb, dns_name_t *name) { static void water(void *arg, int mark) { + /* + * We're going to change the way to handle overmem condition: use + * isc_mem_isovermem() instead of storing the state via this callback, + * since the latter way tends to cause race conditions. + * To minimize the change, and in case we re-enable the callback + * approach, however, keep this function at the moment. + */ + dns_adb_t *adb = arg; isc_boolean_t overmem = ISC_TF(mark == ISC_MEM_HIWATER); @@ -3598,17 +3612,6 @@ water(void *arg, int mark) { DP(ISC_LOG_DEBUG(1), "adb reached %s water mark", overmem ? "high" : "low"); - - /* - * We can't use adb->lock as there is potential for water - * to be called when adb->lock is held. - */ - LOCK(&adb->overmemlock); - if (adb->overmem != overmem) { - adb->overmem = overmem; - isc_mem_waterack(adb->mctx, mark); - } - UNLOCK(&adb->overmemlock); } void diff --git a/lib/dns/rbtdb.c b/lib/dns/rbtdb.c index f453ad1fc2..8f9de66bca 100644 --- a/lib/dns/rbtdb.c +++ b/lib/dns/rbtdb.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: rbtdb.c,v 1.301 2010/05/10 01:39:03 marka Exp $ */ +/* $Id: rbtdb.c,v 1.302 2010/08/11 22:54:58 jinmei Exp $ */ /*! \file */ @@ -411,7 +411,6 @@ typedef struct { rbtdb_version_t * current_version; rbtdb_version_t * future_version; rbtdb_versionlist_t open_versions; - isc_boolean_t overmem; isc_task_t * task; dns_dbnode_t *soanode; dns_dbnode_t *nsnode; @@ -5117,7 +5116,7 @@ expirenode(dns_db_t *db, dns_dbnode_t *node, isc_stdtime_t now) { if (now == 0) isc_stdtime_get(&now); - if (rbtdb->overmem) { + if (isc_mem_isovermem(rbtdb->common.mctx)) { isc_uint32_t val; isc_random_get(&val); @@ -5127,8 +5126,8 @@ expirenode(dns_db_t *db, dns_dbnode_t *node, isc_stdtime_t now) { force_expire = ISC_TF(rbtnode->down == NULL && val % 4 == 0); /* - * Note that 'log' can be true IFF rbtdb->overmem is also true. - * rbtdb->overmem can currently only be true for cache + * Note that 'log' can be true IFF overmem is also true. + * overmem can currently only be true for cache * databases -- hence all of the "overmem cache" log strings. */ log = ISC_TF(isc_log_wouldlog(dns_lctx, level)); @@ -5173,7 +5172,7 @@ expirenode(dns_db_t *db, dns_dbnode_t *node, isc_stdtime_t now) { "reprieve by RETAIN() %s", printname); } - } else if (rbtdb->overmem && log) + } else if (isc_mem_isovermem(rbtdb->common.mctx) && log) isc_log_write(dns_lctx, category, module, level, "overmem cache: saved %s", printname); @@ -5185,10 +5184,12 @@ expirenode(dns_db_t *db, dns_dbnode_t *node, isc_stdtime_t now) { static void overmem(dns_db_t *db, isc_boolean_t overmem) { - dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db; + /* This is an empty callback. See adb.c:water() */ - if (IS_CACHE(rbtdb)) - rbtdb->overmem = overmem; + UNUSED(db); + UNUSED(overmem); + + return; } static void @@ -6134,6 +6135,7 @@ addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, isc_boolean_t delegating; isc_boolean_t newnsec; isc_boolean_t tree_locked = ISC_FALSE; + isc_boolean_t cache_is_overmem = ISC_FALSE; REQUIRE(VALID_RBTDB(rbtdb)); @@ -6230,12 +6232,14 @@ addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, * the tree. In the latter case the lock does not necessarily have to * be acquired but it will help purge stale entries more effectively. */ - if (delegating || newnsec || (IS_CACHE(rbtdb) && rbtdb->overmem)) { + if (IS_CACHE(rbtdb) && isc_mem_isovermem(rbtdb->common.mctx)) + cache_is_overmem = ISC_TRUE; + if (delegating || newnsec || cache_is_overmem) { tree_locked = ISC_TRUE; RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write); } - if (IS_CACHE(rbtdb) && rbtdb->overmem) + if (cache_is_overmem) overmem_purge(rbtdb, rbtnode->locknum, now, tree_locked); NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock, @@ -7399,7 +7403,6 @@ dns_rbtdb_create return (result); } rbtdb->attributes = 0; - rbtdb->overmem = ISC_FALSE; rbtdb->task = NULL; /* diff --git a/lib/isc/include/isc/mem.h b/lib/isc/include/isc/mem.h index 714e43fc78..c0a8510da8 100644 --- a/lib/isc/include/isc/mem.h +++ b/lib/isc/include/isc/mem.h @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: mem.h,v 1.88 2010/03/04 23:50:34 tbox Exp $ */ +/* $Id: mem.h,v 1.89 2010/08/11 22:54:58 jinmei Exp $ */ #ifndef ISC_MEM_H #define ISC_MEM_H 1 @@ -224,6 +224,7 @@ typedef struct isc_memmethods { void *water_arg, size_t hiwater, size_t lowater); void (*waterack)(isc_mem_t *ctx, int flag); size_t (*inuse)(isc_mem_t *mctx); + isc_boolean_t (*isovermem)(isc_mem_t *mctx); isc_result_t (*mpcreate)(isc_mem_t *mctx, size_t size, isc_mempool_t **mpctxp); } isc_memmethods_t; @@ -420,6 +421,14 @@ isc_mem_inuse(isc_mem_t *mctx); * allocated from the system but not yet used. */ +isc_boolean_t +isc_mem_isovermem(isc_mem_t *mctx); +/*%< + * Return true iff the memory context is in "over memory" state, i.e., + * a hiwater mark has been set and the used amount of memory has exceeds + * the mark. + */ + void isc_mem_setwater(isc_mem_t *mctx, isc_mem_water_t water, void *water_arg, size_t hiwater, size_t lowater); diff --git a/lib/isc/include/isc/namespace.h b/lib/isc/include/isc/namespace.h index 4fa05ac4ef..fc92af642e 100644 --- a/lib/isc/include/isc/namespace.h +++ b/lib/isc/include/isc/namespace.h @@ -14,7 +14,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: namespace.h,v 1.5 2009/10/01 01:30:01 sar Exp $ */ +/* $Id: namespace.h,v 1.6 2010/08/11 22:54:58 jinmei Exp $ */ #ifndef ISCAPI_NAMESPACE_H #define ISCAPI_NAMESPACE_H 1 @@ -67,6 +67,7 @@ #define isc_mem_getquota isc__mem_getquota #define isc_mem_gettag isc__mem_gettag #define isc_mem_inuse isc__mem_inuse +#define isc_mem_isovermem isc__mem_isovermem #define isc_mem_setname isc__mem_setname #define isc_mem_setwater isc__mem_setwater #define isc_mem_printallactive isc__mem_printallactive diff --git a/lib/isc/mem.c b/lib/isc/mem.c index cefb3aeefb..8fd0b1edbb 100644 --- a/lib/isc/mem.c +++ b/lib/isc/mem.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: mem.c,v 1.156 2010/05/12 00:46:55 marka Exp $ */ +/* $Id: mem.c,v 1.157 2010/08/11 22:54:58 jinmei Exp $ */ /*! \file */ @@ -144,6 +144,7 @@ struct isc__mem { size_t hi_water; size_t lo_water; isc_boolean_t hi_called; + isc_boolean_t is_overmem; isc_mem_water_t water; void * water_arg; ISC_LIST(isc__mempool_t) pools; @@ -269,6 +270,8 @@ ISC_MEMFUNC_SCOPE size_t isc__mem_getquota(isc_mem_t *ctx); ISC_MEMFUNC_SCOPE size_t isc__mem_inuse(isc_mem_t *ctx); +ISC_MEMFUNC_SCOPE isc_boolean_t +isc__mem_isovermem(isc_mem_t *ctx); ISC_MEMFUNC_SCOPE void isc__mem_setwater(isc_mem_t *ctx, isc_mem_water_t water, void *water_arg, size_t hiwater, size_t lowater); @@ -345,6 +348,7 @@ static struct isc__memmethods { isc__mem_setwater, isc__mem_waterack, isc__mem_inuse, + isc__mem_isovermem, isc__mempool_create } #ifndef BIND9 @@ -939,6 +943,7 @@ isc__mem_createx2(size_t init_max_size, size_t target_size, ctx->hi_water = 0; ctx->lo_water = 0; ctx->hi_called = ISC_FALSE; + ctx->is_overmem = ISC_FALSE; ctx->water = NULL; ctx->water_arg = NULL; ctx->common.impmagic = MEM_MAGIC; @@ -1281,6 +1286,10 @@ isc___mem_get(isc_mem_t *ctx0, size_t size FLARG) { } ADD_TRACE(ctx, ptr, size, file, line); + if (ctx->hi_water != 0U && ctx->inuse > ctx->hi_water && + !ctx->is_overmem) { + ctx->is_overmem = ISC_TRUE; + } if (ctx->hi_water != 0U && !ctx->hi_called && ctx->inuse > ctx->hi_water) { call_water = ISC_TRUE; @@ -1338,6 +1347,10 @@ isc___mem_put(isc_mem_t *ctx0, void *ptr, size_t size FLARG) { * when the context was pushed over hi_water but then had * isc_mem_setwater() called with 0 for hi_water and lo_water. */ + if (ctx->is_overmem && + (ctx->inuse < ctx->lo_water || ctx->lo_water == 0U)) { + ctx->is_overmem = ISC_FALSE; + } if (ctx->hi_called && (ctx->inuse < ctx->lo_water || ctx->lo_water == 0U)) { if (ctx->water != NULL) @@ -1529,6 +1542,11 @@ isc___mem_allocate(isc_mem_t *ctx0, size_t size FLARG) { #if ISC_MEM_TRACKLINES ADD_TRACE(ctx, si, si[-1].u.size, file, line); #endif + if (ctx->hi_water != 0U && ctx->inuse > ctx->hi_water && + !ctx->is_overmem) { + ctx->is_overmem = ISC_TRUE; + } + if (ctx->hi_water != 0U && !ctx->hi_called && ctx->inuse > ctx->hi_water) { ctx->hi_called = ISC_TRUE; @@ -1619,6 +1637,11 @@ isc___mem_free(isc_mem_t *ctx0, void *ptr FLARG) { * when the context was pushed over hi_water but then had * isc_mem_setwater() called with 0 for hi_water and lo_water. */ + if (ctx->is_overmem && + (ctx->inuse < ctx->lo_water || ctx->lo_water == 0U)) { + ctx->is_overmem = ISC_FALSE; + } + if (ctx->hi_called && (ctx->inuse < ctx->lo_water || ctx->lo_water == 0U)) { ctx->hi_called = ISC_FALSE; @@ -1753,6 +1776,18 @@ isc__mem_setwater(isc_mem_t *ctx0, isc_mem_water_t water, void *water_arg, (oldwater)(oldwater_arg, ISC_MEM_LOWATER); } +ISC_MEMFUNC_SCOPE isc_boolean_t +isc__mem_isovermem(isc_mem_t *ctx0) { + isc__mem_t *ctx = (isc__mem_t *)ctx0; + + /* + * We don't bother to lock the context because 100% accuracy isn't + * necessary (and even if we locked the context the returned value + * could be different from the actual state when it's used anyway) + */ + return (ctx->is_overmem); +} + ISC_MEMFUNC_SCOPE void isc__mem_setname(isc_mem_t *ctx0, const char *name, void *tag) { isc__mem_t *ctx = (isc__mem_t *)ctx0;