From 27fe1966c948ba0c1c9d0d831ea3d8bf32d052ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tatuya=20JINMEI=20=E7=A5=9E=E6=98=8E=E9=81=94=E5=93=89?= Date: Wed, 11 Aug 2010 22:54:58 +0000 Subject: [PATCH] 2937. [bug] Worked around an apparent race condition in over memory conditions. Without this fix a DNS cache DB or ADB could incorrectly stay in an over memory state, effectively refusing further caching, which subsequently made a BIND 9 caching server unworkable. This fix prevents this problem from happening by polling the state of the memory context, rather than making a copy of the state, which appeared to cause a race. This is a "workaround" in that it doesn't solve the possible race per se, but several experiments proved this change solves the symptom. Also, the polling overhead hasn't been reported to be an issue. This bug should only affect a caching server that specifies a finite max-cache-size. It's also quite likely that the bug happens only when enabling threads, but it's not confirmed yet. [RT #21818] --- CHANGES | 17 ++++++++++++ lib/dns/adb.c | 49 +++++++++++++++++---------------- lib/dns/rbtdb.c | 27 ++++++++++-------- lib/isc/include/isc/mem.h | 11 +++++++- lib/isc/include/isc/namespace.h | 3 +- lib/isc/mem.c | 37 ++++++++++++++++++++++++- 6 files changed, 106 insertions(+), 38 deletions(-) diff --git a/CHANGES b/CHANGES index 21e7282d43..5a18a3d62d 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,20 @@ +2937. [bug] Worked around an apparent race condition in over + memory conditions. Without this fix a DNS cache DB or + ADB could incorrectly stay in an over memory state, + effectively refusing further caching, which + subsequently made a BIND 9 caching server unworkable. + This fix prevents this problem from happening by + polling the state of the memory context, rather than + making a copy of the state, which appeared to cause + a race. This is a "workaround" in that it doesn't + solve the possible race per se, but several experiments + proved this change solves the symptom. Also, the + polling overhead hasn't been reported to be an issue. + This bug should only affect a caching server that + specifies a finite max-cache-size. It's also quite + likely that the bug happens only when enabling threads, + but it's not confirmed yet. [RT #21818] + 2936. [func] Improved configuration syntax and multiple-view support for addzone/delzone feature (see change #2930). Removed "new-zone-file" option, replaced diff --git a/lib/dns/adb.c b/lib/dns/adb.c index 2e1c607ae5..7ab8aca660 100644 --- a/lib/dns/adb.c +++ b/lib/dns/adb.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: adb.c,v 1.247 2009/02/03 22:33:13 jinmei Exp $ */ +/* $Id: adb.c,v 1.248 2010/08/11 22:54:58 jinmei Exp $ */ /*! \file * @@ -118,7 +118,6 @@ struct dns_adb { isc_taskmgr_t *taskmgr; isc_task_t *task; - isc_boolean_t overmem; isc_interval_t tick_interval; int next_cleanbucket; @@ -294,8 +293,8 @@ static inline void inc_adb_irefcnt(dns_adb_t *); static inline void inc_adb_erefcnt(dns_adb_t *); static inline void inc_entry_refcnt(dns_adb_t *, dns_adbentry_t *, isc_boolean_t); -static inline isc_boolean_t dec_entry_refcnt(dns_adb_t *, dns_adbentry_t *, - isc_boolean_t); +static inline isc_boolean_t dec_entry_refcnt(dns_adb_t *, isc_boolean_t, + dns_adbentry_t *, isc_boolean_t); static inline void violate_locking_hierarchy(isc_mutex_t *, isc_mutex_t *); static isc_boolean_t clean_namehooks(dns_adb_t *, dns_adbnamehooklist_t *); static void clean_target(dns_adb_t *, dns_name_t *); @@ -777,7 +776,7 @@ link_entry(dns_adb_t *adb, int bucket, dns_adbentry_t *entry) { int i; dns_adbentry_t *e; - if (adb->overmem) { + if (isc_mem_isovermem(adb->mctx)) { for (i = 0; i < 2; i++) { e = ISC_LIST_TAIL(adb->entries[bucket]); if (e == NULL) @@ -943,6 +942,7 @@ clean_namehooks(dns_adb_t *adb, dns_adbnamehooklist_t *namehooks) { dns_adbnamehook_t *namehook; int addr_bucket; isc_boolean_t result = ISC_FALSE; + isc_boolean_t overmem = isc_mem_isovermem(adb->mctx); addr_bucket = DNS_ADB_INVALIDBUCKET; namehook = ISC_LIST_HEAD(*namehooks); @@ -963,7 +963,8 @@ clean_namehooks(dns_adb_t *adb, dns_adbnamehooklist_t *namehooks) { LOCK(&adb->entrylocks[addr_bucket]); } - result = dec_entry_refcnt(adb, entry, ISC_FALSE); + result = dec_entry_refcnt(adb, overmem, entry, + ISC_FALSE); } /* @@ -1235,7 +1236,9 @@ inc_entry_refcnt(dns_adb_t *adb, dns_adbentry_t *entry, isc_boolean_t lock) { } static inline isc_boolean_t -dec_entry_refcnt(dns_adb_t *adb, dns_adbentry_t *entry, isc_boolean_t lock) { +dec_entry_refcnt(dns_adb_t *adb, isc_boolean_t overmem, dns_adbentry_t *entry, + isc_boolean_t lock) +{ int bucket; isc_boolean_t destroy_entry; isc_boolean_t result = ISC_FALSE; @@ -1250,7 +1253,7 @@ dec_entry_refcnt(dns_adb_t *adb, dns_adbentry_t *entry, isc_boolean_t lock) { destroy_entry = ISC_FALSE; if (entry->refcnt == 0 && - (adb->entry_sd[bucket] || entry->expires == 0 || adb->overmem || + (adb->entry_sd[bucket] || entry->expires == 0 || overmem || (entry->flags & ENTRY_IS_DEAD) != 0)) { destroy_entry = ISC_TRUE; result = unlink_entry(adb, entry); @@ -1852,7 +1855,7 @@ check_stale_name(dns_adb_t *adb, int bucket, isc_stdtime_t now) { int victims, max_victims; isc_boolean_t result; dns_adbname_t *victim, *next_victim; - isc_boolean_t overmem = adb->overmem; + isc_boolean_t overmem = isc_mem_isovermem(adb->mctx); int scans = 0; INSIST(bucket != DNS_ADB_INVALIDBUCKET); @@ -2049,7 +2052,6 @@ dns_adb_create(isc_mem_t *mem, dns_view_t *view, isc_timermgr_t *timermgr, adb, NULL, NULL); adb->cevent_sent = ISC_FALSE; adb->shutting_down = ISC_FALSE; - adb->overmem = ISC_FALSE; ISC_LIST_INIT(adb->whenshutdown); isc_mem_attach(mem, &adb->mctx); @@ -2616,6 +2618,7 @@ dns_adb_destroyfind(dns_adbfind_t **findp) { dns_adbaddrinfo_t *ai; int bucket; dns_adb_t *adb; + isc_boolean_t overmem; REQUIRE(findp != NULL && DNS_ADBFIND_VALID(*findp)); find = *findp; @@ -2640,13 +2643,14 @@ dns_adb_destroyfind(dns_adbfind_t **findp) { * Return the find to the memory pool, and decrement the adb's * reference count. */ + overmem = isc_mem_isovermem(adb->mctx); ai = ISC_LIST_HEAD(find->list); while (ai != NULL) { ISC_LIST_UNLINK(find->list, ai, publink); entry = ai->entry; ai->entry = NULL; INSIST(DNS_ADBENTRY_VALID(entry)); - RUNTIME_CHECK(dec_entry_refcnt(adb, entry, ISC_TRUE) == + RUNTIME_CHECK(dec_entry_refcnt(adb, overmem, entry, ISC_TRUE) == ISC_FALSE); free_adbaddrinfo(adb, &ai); ai = ISC_LIST_HEAD(find->list); @@ -3509,6 +3513,7 @@ dns_adb_freeaddrinfo(dns_adb_t *adb, dns_adbaddrinfo_t **addrp) { int bucket; isc_stdtime_t now; isc_boolean_t want_check_exit = ISC_FALSE; + isc_boolean_t overmem; REQUIRE(DNS_ADB_VALID(adb)); REQUIRE(addrp != NULL); @@ -3520,13 +3525,14 @@ dns_adb_freeaddrinfo(dns_adb_t *adb, dns_adbaddrinfo_t **addrp) { isc_stdtime_get(&now); *addrp = NULL; + overmem = isc_mem_isovermem(adb->mctx); bucket = addr->entry->lock_bucket; LOCK(&adb->entrylocks[bucket]); entry->expires = now + ADB_ENTRY_WINDOW; - want_check_exit = dec_entry_refcnt(adb, entry, ISC_FALSE); + want_check_exit = dec_entry_refcnt(adb, overmem, entry, ISC_FALSE); UNLOCK(&adb->entrylocks[bucket]); @@ -3591,6 +3597,14 @@ dns_adb_flushname(dns_adb_t *adb, dns_name_t *name) { static void water(void *arg, int mark) { + /* + * We're going to change the way to handle overmem condition: use + * isc_mem_isovermem() instead of storing the state via this callback, + * since the latter way tends to cause race conditions. + * To minimize the change, and in case we re-enable the callback + * approach, however, keep this function at the moment. + */ + dns_adb_t *adb = arg; isc_boolean_t overmem = ISC_TF(mark == ISC_MEM_HIWATER); @@ -3598,17 +3612,6 @@ water(void *arg, int mark) { DP(ISC_LOG_DEBUG(1), "adb reached %s water mark", overmem ? "high" : "low"); - - /* - * We can't use adb->lock as there is potential for water - * to be called when adb->lock is held. - */ - LOCK(&adb->overmemlock); - if (adb->overmem != overmem) { - adb->overmem = overmem; - isc_mem_waterack(adb->mctx, mark); - } - UNLOCK(&adb->overmemlock); } void diff --git a/lib/dns/rbtdb.c b/lib/dns/rbtdb.c index f453ad1fc2..8f9de66bca 100644 --- a/lib/dns/rbtdb.c +++ b/lib/dns/rbtdb.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: rbtdb.c,v 1.301 2010/05/10 01:39:03 marka Exp $ */ +/* $Id: rbtdb.c,v 1.302 2010/08/11 22:54:58 jinmei Exp $ */ /*! \file */ @@ -411,7 +411,6 @@ typedef struct { rbtdb_version_t * current_version; rbtdb_version_t * future_version; rbtdb_versionlist_t open_versions; - isc_boolean_t overmem; isc_task_t * task; dns_dbnode_t *soanode; dns_dbnode_t *nsnode; @@ -5117,7 +5116,7 @@ expirenode(dns_db_t *db, dns_dbnode_t *node, isc_stdtime_t now) { if (now == 0) isc_stdtime_get(&now); - if (rbtdb->overmem) { + if (isc_mem_isovermem(rbtdb->common.mctx)) { isc_uint32_t val; isc_random_get(&val); @@ -5127,8 +5126,8 @@ expirenode(dns_db_t *db, dns_dbnode_t *node, isc_stdtime_t now) { force_expire = ISC_TF(rbtnode->down == NULL && val % 4 == 0); /* - * Note that 'log' can be true IFF rbtdb->overmem is also true. - * rbtdb->overmem can currently only be true for cache + * Note that 'log' can be true IFF overmem is also true. + * overmem can currently only be true for cache * databases -- hence all of the "overmem cache" log strings. */ log = ISC_TF(isc_log_wouldlog(dns_lctx, level)); @@ -5173,7 +5172,7 @@ expirenode(dns_db_t *db, dns_dbnode_t *node, isc_stdtime_t now) { "reprieve by RETAIN() %s", printname); } - } else if (rbtdb->overmem && log) + } else if (isc_mem_isovermem(rbtdb->common.mctx) && log) isc_log_write(dns_lctx, category, module, level, "overmem cache: saved %s", printname); @@ -5185,10 +5184,12 @@ expirenode(dns_db_t *db, dns_dbnode_t *node, isc_stdtime_t now) { static void overmem(dns_db_t *db, isc_boolean_t overmem) { - dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db; + /* This is an empty callback. See adb.c:water() */ - if (IS_CACHE(rbtdb)) - rbtdb->overmem = overmem; + UNUSED(db); + UNUSED(overmem); + + return; } static void @@ -6134,6 +6135,7 @@ addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, isc_boolean_t delegating; isc_boolean_t newnsec; isc_boolean_t tree_locked = ISC_FALSE; + isc_boolean_t cache_is_overmem = ISC_FALSE; REQUIRE(VALID_RBTDB(rbtdb)); @@ -6230,12 +6232,14 @@ addrdataset(dns_db_t *db, dns_dbnode_t *node, dns_dbversion_t *version, * the tree. In the latter case the lock does not necessarily have to * be acquired but it will help purge stale entries more effectively. */ - if (delegating || newnsec || (IS_CACHE(rbtdb) && rbtdb->overmem)) { + if (IS_CACHE(rbtdb) && isc_mem_isovermem(rbtdb->common.mctx)) + cache_is_overmem = ISC_TRUE; + if (delegating || newnsec || cache_is_overmem) { tree_locked = ISC_TRUE; RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write); } - if (IS_CACHE(rbtdb) && rbtdb->overmem) + if (cache_is_overmem) overmem_purge(rbtdb, rbtnode->locknum, now, tree_locked); NODE_LOCK(&rbtdb->node_locks[rbtnode->locknum].lock, @@ -7399,7 +7403,6 @@ dns_rbtdb_create return (result); } rbtdb->attributes = 0; - rbtdb->overmem = ISC_FALSE; rbtdb->task = NULL; /* diff --git a/lib/isc/include/isc/mem.h b/lib/isc/include/isc/mem.h index 714e43fc78..c0a8510da8 100644 --- a/lib/isc/include/isc/mem.h +++ b/lib/isc/include/isc/mem.h @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: mem.h,v 1.88 2010/03/04 23:50:34 tbox Exp $ */ +/* $Id: mem.h,v 1.89 2010/08/11 22:54:58 jinmei Exp $ */ #ifndef ISC_MEM_H #define ISC_MEM_H 1 @@ -224,6 +224,7 @@ typedef struct isc_memmethods { void *water_arg, size_t hiwater, size_t lowater); void (*waterack)(isc_mem_t *ctx, int flag); size_t (*inuse)(isc_mem_t *mctx); + isc_boolean_t (*isovermem)(isc_mem_t *mctx); isc_result_t (*mpcreate)(isc_mem_t *mctx, size_t size, isc_mempool_t **mpctxp); } isc_memmethods_t; @@ -420,6 +421,14 @@ isc_mem_inuse(isc_mem_t *mctx); * allocated from the system but not yet used. */ +isc_boolean_t +isc_mem_isovermem(isc_mem_t *mctx); +/*%< + * Return true iff the memory context is in "over memory" state, i.e., + * a hiwater mark has been set and the used amount of memory has exceeds + * the mark. + */ + void isc_mem_setwater(isc_mem_t *mctx, isc_mem_water_t water, void *water_arg, size_t hiwater, size_t lowater); diff --git a/lib/isc/include/isc/namespace.h b/lib/isc/include/isc/namespace.h index 4fa05ac4ef..fc92af642e 100644 --- a/lib/isc/include/isc/namespace.h +++ b/lib/isc/include/isc/namespace.h @@ -14,7 +14,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: namespace.h,v 1.5 2009/10/01 01:30:01 sar Exp $ */ +/* $Id: namespace.h,v 1.6 2010/08/11 22:54:58 jinmei Exp $ */ #ifndef ISCAPI_NAMESPACE_H #define ISCAPI_NAMESPACE_H 1 @@ -67,6 +67,7 @@ #define isc_mem_getquota isc__mem_getquota #define isc_mem_gettag isc__mem_gettag #define isc_mem_inuse isc__mem_inuse +#define isc_mem_isovermem isc__mem_isovermem #define isc_mem_setname isc__mem_setname #define isc_mem_setwater isc__mem_setwater #define isc_mem_printallactive isc__mem_printallactive diff --git a/lib/isc/mem.c b/lib/isc/mem.c index cefb3aeefb..8fd0b1edbb 100644 --- a/lib/isc/mem.c +++ b/lib/isc/mem.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: mem.c,v 1.156 2010/05/12 00:46:55 marka Exp $ */ +/* $Id: mem.c,v 1.157 2010/08/11 22:54:58 jinmei Exp $ */ /*! \file */ @@ -144,6 +144,7 @@ struct isc__mem { size_t hi_water; size_t lo_water; isc_boolean_t hi_called; + isc_boolean_t is_overmem; isc_mem_water_t water; void * water_arg; ISC_LIST(isc__mempool_t) pools; @@ -269,6 +270,8 @@ ISC_MEMFUNC_SCOPE size_t isc__mem_getquota(isc_mem_t *ctx); ISC_MEMFUNC_SCOPE size_t isc__mem_inuse(isc_mem_t *ctx); +ISC_MEMFUNC_SCOPE isc_boolean_t +isc__mem_isovermem(isc_mem_t *ctx); ISC_MEMFUNC_SCOPE void isc__mem_setwater(isc_mem_t *ctx, isc_mem_water_t water, void *water_arg, size_t hiwater, size_t lowater); @@ -345,6 +348,7 @@ static struct isc__memmethods { isc__mem_setwater, isc__mem_waterack, isc__mem_inuse, + isc__mem_isovermem, isc__mempool_create } #ifndef BIND9 @@ -939,6 +943,7 @@ isc__mem_createx2(size_t init_max_size, size_t target_size, ctx->hi_water = 0; ctx->lo_water = 0; ctx->hi_called = ISC_FALSE; + ctx->is_overmem = ISC_FALSE; ctx->water = NULL; ctx->water_arg = NULL; ctx->common.impmagic = MEM_MAGIC; @@ -1281,6 +1286,10 @@ isc___mem_get(isc_mem_t *ctx0, size_t size FLARG) { } ADD_TRACE(ctx, ptr, size, file, line); + if (ctx->hi_water != 0U && ctx->inuse > ctx->hi_water && + !ctx->is_overmem) { + ctx->is_overmem = ISC_TRUE; + } if (ctx->hi_water != 0U && !ctx->hi_called && ctx->inuse > ctx->hi_water) { call_water = ISC_TRUE; @@ -1338,6 +1347,10 @@ isc___mem_put(isc_mem_t *ctx0, void *ptr, size_t size FLARG) { * when the context was pushed over hi_water but then had * isc_mem_setwater() called with 0 for hi_water and lo_water. */ + if (ctx->is_overmem && + (ctx->inuse < ctx->lo_water || ctx->lo_water == 0U)) { + ctx->is_overmem = ISC_FALSE; + } if (ctx->hi_called && (ctx->inuse < ctx->lo_water || ctx->lo_water == 0U)) { if (ctx->water != NULL) @@ -1529,6 +1542,11 @@ isc___mem_allocate(isc_mem_t *ctx0, size_t size FLARG) { #if ISC_MEM_TRACKLINES ADD_TRACE(ctx, si, si[-1].u.size, file, line); #endif + if (ctx->hi_water != 0U && ctx->inuse > ctx->hi_water && + !ctx->is_overmem) { + ctx->is_overmem = ISC_TRUE; + } + if (ctx->hi_water != 0U && !ctx->hi_called && ctx->inuse > ctx->hi_water) { ctx->hi_called = ISC_TRUE; @@ -1619,6 +1637,11 @@ isc___mem_free(isc_mem_t *ctx0, void *ptr FLARG) { * when the context was pushed over hi_water but then had * isc_mem_setwater() called with 0 for hi_water and lo_water. */ + if (ctx->is_overmem && + (ctx->inuse < ctx->lo_water || ctx->lo_water == 0U)) { + ctx->is_overmem = ISC_FALSE; + } + if (ctx->hi_called && (ctx->inuse < ctx->lo_water || ctx->lo_water == 0U)) { ctx->hi_called = ISC_FALSE; @@ -1753,6 +1776,18 @@ isc__mem_setwater(isc_mem_t *ctx0, isc_mem_water_t water, void *water_arg, (oldwater)(oldwater_arg, ISC_MEM_LOWATER); } +ISC_MEMFUNC_SCOPE isc_boolean_t +isc__mem_isovermem(isc_mem_t *ctx0) { + isc__mem_t *ctx = (isc__mem_t *)ctx0; + + /* + * We don't bother to lock the context because 100% accuracy isn't + * necessary (and even if we locked the context the returned value + * could be different from the actual state when it's used anyway) + */ + return (ctx->is_overmem); +} + ISC_MEMFUNC_SCOPE void isc__mem_setname(isc_mem_t *ctx0, const char *name, void *tag) { isc__mem_t *ctx = (isc__mem_t *)ctx0;