From 5d7849ad7ffc6d08870dbfbc8d6bfffd90007488 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Tatuya=20JINMEI=20=E7=A5=9E=E6=98=8E=E9=81=94=E5=93=89?= Date: Wed, 6 May 2009 22:53:54 +0000 Subject: [PATCH] 2596. [bug] Stale tree nodes of cache/dynamic rbtdb could stay long, leading to inefficient memory usage or rejecting newer cache entries in the worst case. [RT #19563] --- CHANGES | 4 + lib/dns/cache.c | 11 +- lib/dns/include/dns/events.h | 3 +- lib/dns/rbtdb.c | 206 ++++++++++++++++++++++++++++------- lib/isc/hash.c | 6 +- 5 files changed, 190 insertions(+), 40 deletions(-) diff --git a/CHANGES b/CHANGES index cf4771c859..3c277f06d6 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,7 @@ +2596. [bug] Stale tree nodes of cache/dynamic rbtdb could stay + long, leading to inefficient memory usage or rejecting + newer cache entries in the worst case. [RT #19563] + 2595. [bug] Fix unknown extended rcodes in dig. [RT #19625] 2594. [func] Have rndc warn if using its default configuration diff --git a/lib/dns/cache.c b/lib/dns/cache.c index 819133e7ec..5909879923 100644 --- a/lib/dns/cache.c +++ b/lib/dns/cache.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: cache.c,v 1.83 2009/01/17 14:13:42 fdupont Exp $ */ +/* $Id: cache.c,v 1.84 2009/05/06 22:53:54 jinmei Exp $ */ /*! \file */ @@ -186,6 +186,7 @@ dns_cache_create2(isc_mem_t *mctx, isc_taskmgr_t *taskmgr, isc_result_t result; dns_cache_t *cache; int i; + isc_task_t *dbtask; REQUIRE(cachep != NULL); REQUIRE(*cachep == NULL); @@ -251,6 +252,14 @@ dns_cache_create2(isc_mem_t *mctx, isc_taskmgr_t *taskmgr, result = cache_create_db(cache, &cache->db); if (result != ISC_R_SUCCESS) goto cleanup_dbargv; + if (taskmgr != NULL) { + dbtask = NULL; + result = isc_task_create(taskmgr, 1, &dbtask); + if (result != ISC_R_SUCCESS) + goto cleanup_db; + dns_db_settask(cache->db, dbtask); + isc_task_detach(&dbtask); + } cache->filename = NULL; diff --git a/lib/dns/include/dns/events.h b/lib/dns/include/dns/events.h index 7692c1b696..efa4862a75 100644 --- a/lib/dns/include/dns/events.h +++ b/lib/dns/include/dns/events.h @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: events.h,v 1.49 2007/06/19 23:47:16 tbox Exp $ */ +/* $Id: events.h,v 1.50 2009/05/06 22:53:54 jinmei Exp $ */ #ifndef DNS_EVENTS_H #define DNS_EVENTS_H 1 @@ -68,6 +68,7 @@ #define DNS_EVENT_ACACHECONTROL (ISC_EVENTCLASS_DNS + 38) #define DNS_EVENT_ACACHECLEAN (ISC_EVENTCLASS_DNS + 39) #define DNS_EVENT_ACACHEOVERMEM (ISC_EVENTCLASS_DNS + 40) +#define DNS_EVENT_RBTPRUNE (ISC_EVENTCLASS_DNS + 41) #define DNS_EVENT_FIRSTEVENT (ISC_EVENTCLASS_DNS + 0) #define DNS_EVENT_LASTEVENT (ISC_EVENTCLASS_DNS + 65535) diff --git a/lib/dns/rbtdb.c b/lib/dns/rbtdb.c index c6d7f66fed..cc17d81508 100644 --- a/lib/dns/rbtdb.c +++ b/lib/dns/rbtdb.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: rbtdb.c,v 1.275 2009/04/07 02:49:37 jinmei Exp $ */ +/* $Id: rbtdb.c,v 1.276 2009/05/06 22:53:54 jinmei Exp $ */ /*! \file */ @@ -533,6 +533,7 @@ static void overmem_purge(dns_rbtdb_t *rbtdb, unsigned int locknum_start, isc_stdtime_t now, isc_boolean_t tree_locked); static isc_result_t resign_insert(dns_rbtdb_t *rbtdb, int idx, rdatasetheader_t *newheader); +static void prune_tree(isc_task_t *task, isc_event_t *event); static dns_rdatasetmethods_t rdataset_methods = { rdataset_disassociate, @@ -1551,7 +1552,7 @@ new_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) { /* * This function is assumed to be called when a node is newly referenced * and can be in the deadnode list. In that case the node must be retrieved - * from the list because the it is going to be used. In addition, if the caller + * from the list because it is going to be used. In addition, if the caller * happens to hold a write lock on the tree, it's a good chance to purge dead * nodes. * Note: while a new reference is gained in multiple places, there are only very @@ -1604,13 +1605,15 @@ reactivate_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, static isc_boolean_t decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, rbtdb_serial_t least_serial, - isc_rwlocktype_t nlock, isc_rwlocktype_t tlock) + isc_rwlocktype_t nlock, isc_rwlocktype_t tlock, + isc_boolean_t pruning) { isc_result_t result; isc_boolean_t write_locked; rbtdb_nodelock_t *nodelock; unsigned int refs, nrefs; int bucket = node->locknum; + isc_boolean_t no_reference; nodelock = &rbtdb->node_locks[bucket]; @@ -1693,6 +1696,7 @@ decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, } else write_locked = ISC_TRUE; + no_reference = ISC_TRUE; if (write_locked && dns_rbtnode_refcurrent(node) == 0) { /* * We can now delete the node if the reference counter is @@ -1701,35 +1705,97 @@ decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, * current thread locks the tree (e.g., in findnode()). */ - if (isc_log_wouldlog(dns_lctx, ISC_LOG_DEBUG(1))) { - char printname[DNS_NAME_FORMATSIZE]; + /* + * If this node is the only one in the level it's in, deleting + * this node may recursively make its parent the only node in + * the parent level; if so, and if no one is currently using + * the parent node, this is almost the only opportunity to + * clean it up. But the recursive cleanup is not that trivial + * since the child and parent may be in different lock buckets, + * which would cause a lock order reversal problem. To avoid + * the trouble, we'll dispatch a separate event for batch + * cleaning. We need to check whether we're deleting the node + * as a result of pruning to avoid infinite dispatching. + * Note: pruning happens only when a task has been set for the + * rbtdb. If the user of the rbtdb chooses not to set a task, + * it's their responsibility to purge stale leaves (e.g. by + * periodic walk-through). + */ + if (!pruning && node->parent != NULL && + node->parent->down == node && node->left == NULL && + node->right == NULL && rbtdb->task != NULL) { + isc_event_t *ev; + dns_db_t *db; - isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, - DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1), - "decrement_reference: " - "delete from rbt: %p %s", - node, - dns_rbt_formatnodename(node, printname, - sizeof(printname))); + ev = isc_event_allocate(rbtdb->common.mctx, NULL, + DNS_EVENT_RBTPRUNE, + prune_tree, node, + sizeof(isc_event_t)); + if (ev != NULL) { + new_reference(rbtdb, node); + db = NULL; + attach((dns_db_t *)rbtdb, &db); + ev->ev_sender = db; + isc_task_send(rbtdb->task, &ev); + no_reference = ISC_FALSE; + } else { + /* + * XXX: this is a weird situation. We could + * ignore this error case, but then the stale + * node will unlikely be purged except via a + * rare condition such as manual cleanup. So + * we queue it in the deadnodes list, hoping + * the memory shortage is temporary and the node + * will be deleted later. + */ + isc_log_write(dns_lctx, + DNS_LOGCATEGORY_DATABASE, + DNS_LOGMODULE_CACHE, + ISC_LOG_INFO, + "decrement_reference: failed to " + "allocate pruning event"); + INSIST(!ISC_LINK_LINKED(node, deadlink)); + ISC_LIST_APPEND(rbtdb->deadnodes[bucket], node, + deadlink); + } + } else { + if (isc_log_wouldlog(dns_lctx, ISC_LOG_DEBUG(1))) { + char printname[DNS_NAME_FORMATSIZE]; + + isc_log_write(dns_lctx, + DNS_LOGCATEGORY_DATABASE, + DNS_LOGMODULE_CACHE, + ISC_LOG_DEBUG(1), + "decrement_reference: " + "delete from rbt: %p %s", + node, + dns_rbt_formatnodename(node, + printname, + sizeof(printname))); + } + + INSIST(!ISC_LINK_LINKED(node, deadlink)); + if (node->nsec3) + result = dns_rbt_deletenode(rbtdb->nsec3, node, + ISC_FALSE); + else + result = dns_rbt_deletenode(rbtdb->tree, node, + ISC_FALSE); + if (result != ISC_R_SUCCESS) { + isc_log_write(dns_lctx, + DNS_LOGCATEGORY_DATABASE, + DNS_LOGMODULE_CACHE, + ISC_LOG_WARNING, + "decrement_reference: " + "dns_rbt_deletenode: %s", + isc_result_totext(result)); + } } - - INSIST(!ISC_LINK_LINKED(node, deadlink)); - if (node->nsec3) - result = dns_rbt_deletenode(rbtdb->nsec3, node, - ISC_FALSE); - else - result = dns_rbt_deletenode(rbtdb->tree, node, - ISC_FALSE); - if (result != ISC_R_SUCCESS) - isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE, - DNS_LOGMODULE_CACHE, ISC_LOG_WARNING, - "decrement_reference: " - "dns_rbt_deletenode: %s", - isc_result_totext(result)); } else if (dns_rbtnode_refcurrent(node) == 0) { INSIST(!ISC_LINK_LINKED(node, deadlink)); ISC_LIST_APPEND(rbtdb->deadnodes[bucket], node, deadlink); - } + } else + no_reference = ISC_FALSE; /* Restore the lock? */ if (nlock == isc_rwlocktype_read) @@ -1746,7 +1812,71 @@ decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, if (write_locked) isc_rwlock_downgrade(&rbtdb->tree_lock); - return (ISC_TRUE); + return (no_reference); +} + +/* + * Prune the tree by recursively cleaning-up single leaves. In the worst + * case, the number of iteration is the number of tree levels, which is at + * most the maximum number of domain name labels, i.e, 127. In practice, this + * should be much smaller (only a few times), and even the worst case would be + * acceptable for a single event. + */ +static void +prune_tree(isc_task_t *task, isc_event_t *event) { + dns_rbtdb_t *rbtdb = event->ev_sender; + dns_rbtnode_t *node = event->ev_arg; + dns_rbtnode_t *parent; + unsigned int locknum; + + UNUSED(task); + + isc_event_free(&event); + + RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write); + locknum = node->locknum; + NODE_LOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write); + do { + parent = node->parent; + decrement_reference(rbtdb, node, 0, isc_rwlocktype_write, + isc_rwlocktype_write, ISC_TRUE); + + if (parent != NULL && parent->down == NULL) { + /* + * node was the only down child of the parent and has + * just been removed. We'll then need to examine the + * parent. Keep the lock if possible; otherwise, + * release the old lock and acquire one for the parent. + */ + if (parent->locknum != locknum) { + NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, + isc_rwlocktype_write); + locknum = parent->locknum; + NODE_LOCK(&rbtdb->node_locks[locknum].lock, + isc_rwlocktype_write); + } + + /* + * We need to gain a reference to the node before + * decrementing it in the next iteration. In addition, + * if the node is in the dead-nodes list, extract it + * from the list beforehand as we do in + * reactivate_node(). + */ + new_reference(rbtdb, parent); + if (ISC_LINK_LINKED(parent, deadlink)) { + ISC_LIST_UNLINK(rbtdb->deadnodes[locknum], + parent, deadlink); + } + } else + parent = NULL; + + node = parent; + } while (node != NULL); + NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write); + RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write); + + detach((dns_db_t **)&rbtdb); } static inline void @@ -2163,8 +2293,8 @@ closeversion(dns_db_t *db, dns_dbversion_t **versionp, isc_boolean_t commit) { NODE_UNLOCK(lock, isc_rwlocktype_write); } decrement_reference(rbtdb, header->node, least_serial, - isc_rwlocktype_write, - isc_rwlocktype_none); + isc_rwlocktype_write, isc_rwlocktype_none, + ISC_FALSE); } if (!EMPTY(cleanup_list)) { @@ -2197,7 +2327,7 @@ closeversion(dns_db_t *db, dns_dbversion_t **versionp, isc_boolean_t commit) { rollback_node(rbtnode, serial); decrement_reference(rbtdb, rbtnode, least_serial, isc_rwlocktype_write, - isc_rwlocktype_write); + isc_rwlocktype_write, ISC_FALSE); NODE_UNLOCK(lock, isc_rwlocktype_write); @@ -3747,7 +3877,8 @@ zone_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, NODE_LOCK(lock, isc_rwlocktype_read); decrement_reference(search.rbtdb, node, 0, - isc_rwlocktype_read, isc_rwlocktype_none); + isc_rwlocktype_read, isc_rwlocktype_none, + ISC_FALSE); NODE_UNLOCK(lock, isc_rwlocktype_read); } @@ -4525,7 +4656,8 @@ cache_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version, NODE_LOCK(lock, isc_rwlocktype_read); decrement_reference(search.rbtdb, node, 0, - isc_rwlocktype_read, isc_rwlocktype_none); + isc_rwlocktype_read, isc_rwlocktype_none, + ISC_FALSE); NODE_UNLOCK(lock, isc_rwlocktype_read); } @@ -4743,7 +4875,7 @@ detachnode(dns_db_t *db, dns_dbnode_t **targetp) { NODE_LOCK(&nodelock->lock, isc_rwlocktype_read); if (decrement_reference(rbtdb, node, 0, isc_rwlocktype_read, - isc_rwlocktype_none)) { + isc_rwlocktype_none, ISC_FALSE)) { if (isc_refcount_current(&nodelock->references) == 0 && nodelock->exiting) { inactive = ISC_TRUE; @@ -7469,7 +7601,7 @@ dereference_iter_node(rbtdb_dbiterator_t *rbtdbiter) { lock = &rbtdb->node_locks[node->locknum].lock; NODE_LOCK(lock, isc_rwlocktype_read); decrement_reference(rbtdb, node, 0, isc_rwlocktype_read, - rbtdbiter->tree_locked); + rbtdbiter->tree_locked, ISC_FALSE); NODE_UNLOCK(lock, isc_rwlocktype_read); rbtdbiter->node = NULL; @@ -7510,7 +7642,7 @@ flush_deletions(rbtdb_dbiterator_t *rbtdbiter) { NODE_LOCK(lock, isc_rwlocktype_read); decrement_reference(rbtdb, node, 0, isc_rwlocktype_read, - rbtdbiter->tree_locked); + rbtdbiter->tree_locked, ISC_FALSE); NODE_UNLOCK(lock, isc_rwlocktype_read); } @@ -8446,6 +8578,6 @@ expire_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header, decrement_reference(rbtdb, header->node, 0, isc_rwlocktype_write, tree_locked ? isc_rwlocktype_write : - isc_rwlocktype_none); + isc_rwlocktype_none, ISC_FALSE); } } diff --git a/lib/isc/hash.c b/lib/isc/hash.c index 41bb884284..4abf321eef 100644 --- a/lib/isc/hash.c +++ b/lib/isc/hash.c @@ -15,7 +15,7 @@ * PERFORMANCE OF THIS SOFTWARE. */ -/* $Id: hash.c,v 1.13 2007/06/19 23:47:17 tbox Exp $ */ +/* $Id: hash.c,v 1.14 2009/05/06 22:53:54 jinmei Exp $ */ /*! \file * Some portion of this code was derived from universal hash function @@ -293,6 +293,7 @@ static void destroy(isc_hash_t **hctxp) { isc_hash_t *hctx; isc_mem_t *mctx; + unsigned char canary0[4], canary1[4]; REQUIRE(hctxp != NULL && *hctxp != NULL); hctx = *hctxp; @@ -312,7 +313,10 @@ destroy(isc_hash_t **hctxp) { DESTROYLOCK(&hctx->lock); + memcpy(canary0, hctx + 1, sizeof(canary0)); memset(hctx, 0, sizeof(isc_hash_t)); + memcpy(canary1, hctx + 1, sizeof(canary1)); + INSIST(memcmp(canary0, canary1, sizeof(canary0)) == 0); isc_mem_put(mctx, hctx, sizeof(isc_hash_t)); isc_mem_detach(&mctx); }