2596. [bug] Stale tree nodes of cache/dynamic rbtdb could stay

long, leading to inefficient memory usage or rejecting
			newer cache entries in the worst case. [RT #19563]
This commit is contained in:
Tatuya JINMEI 神明達哉 2009-05-06 22:53:54 +00:00
parent 0e77d33686
commit 5d7849ad7f
5 changed files with 190 additions and 40 deletions

View file

@ -1,3 +1,7 @@
2596. [bug] Stale tree nodes of cache/dynamic rbtdb could stay
long, leading to inefficient memory usage or rejecting
newer cache entries in the worst case. [RT #19563]
2595. [bug] Fix unknown extended rcodes in dig. [RT #19625]
2594. [func] Have rndc warn if using its default configuration

View file

@ -15,7 +15,7 @@
* PERFORMANCE OF THIS SOFTWARE.
*/
/* $Id: cache.c,v 1.83 2009/01/17 14:13:42 fdupont Exp $ */
/* $Id: cache.c,v 1.84 2009/05/06 22:53:54 jinmei Exp $ */
/*! \file */
@ -186,6 +186,7 @@ dns_cache_create2(isc_mem_t *mctx, isc_taskmgr_t *taskmgr,
isc_result_t result;
dns_cache_t *cache;
int i;
isc_task_t *dbtask;
REQUIRE(cachep != NULL);
REQUIRE(*cachep == NULL);
@ -251,6 +252,14 @@ dns_cache_create2(isc_mem_t *mctx, isc_taskmgr_t *taskmgr,
result = cache_create_db(cache, &cache->db);
if (result != ISC_R_SUCCESS)
goto cleanup_dbargv;
if (taskmgr != NULL) {
dbtask = NULL;
result = isc_task_create(taskmgr, 1, &dbtask);
if (result != ISC_R_SUCCESS)
goto cleanup_db;
dns_db_settask(cache->db, dbtask);
isc_task_detach(&dbtask);
}
cache->filename = NULL;

View file

@ -15,7 +15,7 @@
* PERFORMANCE OF THIS SOFTWARE.
*/
/* $Id: events.h,v 1.49 2007/06/19 23:47:16 tbox Exp $ */
/* $Id: events.h,v 1.50 2009/05/06 22:53:54 jinmei Exp $ */
#ifndef DNS_EVENTS_H
#define DNS_EVENTS_H 1
@ -68,6 +68,7 @@
#define DNS_EVENT_ACACHECONTROL (ISC_EVENTCLASS_DNS + 38)
#define DNS_EVENT_ACACHECLEAN (ISC_EVENTCLASS_DNS + 39)
#define DNS_EVENT_ACACHEOVERMEM (ISC_EVENTCLASS_DNS + 40)
#define DNS_EVENT_RBTPRUNE (ISC_EVENTCLASS_DNS + 41)
#define DNS_EVENT_FIRSTEVENT (ISC_EVENTCLASS_DNS + 0)
#define DNS_EVENT_LASTEVENT (ISC_EVENTCLASS_DNS + 65535)

View file

@ -15,7 +15,7 @@
* PERFORMANCE OF THIS SOFTWARE.
*/
/* $Id: rbtdb.c,v 1.275 2009/04/07 02:49:37 jinmei Exp $ */
/* $Id: rbtdb.c,v 1.276 2009/05/06 22:53:54 jinmei Exp $ */
/*! \file */
@ -533,6 +533,7 @@ static void overmem_purge(dns_rbtdb_t *rbtdb, unsigned int locknum_start,
isc_stdtime_t now, isc_boolean_t tree_locked);
static isc_result_t resign_insert(dns_rbtdb_t *rbtdb, int idx,
rdatasetheader_t *newheader);
static void prune_tree(isc_task_t *task, isc_event_t *event);
static dns_rdatasetmethods_t rdataset_methods = {
rdataset_disassociate,
@ -1551,7 +1552,7 @@ new_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) {
/*
* This function is assumed to be called when a node is newly referenced
* and can be in the deadnode list. In that case the node must be retrieved
* from the list because the it is going to be used. In addition, if the caller
* from the list because it is going to be used. In addition, if the caller
* happens to hold a write lock on the tree, it's a good chance to purge dead
* nodes.
* Note: while a new reference is gained in multiple places, there are only very
@ -1604,13 +1605,15 @@ reactivate_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
static isc_boolean_t
decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
rbtdb_serial_t least_serial,
isc_rwlocktype_t nlock, isc_rwlocktype_t tlock)
isc_rwlocktype_t nlock, isc_rwlocktype_t tlock,
isc_boolean_t pruning)
{
isc_result_t result;
isc_boolean_t write_locked;
rbtdb_nodelock_t *nodelock;
unsigned int refs, nrefs;
int bucket = node->locknum;
isc_boolean_t no_reference;
nodelock = &rbtdb->node_locks[bucket];
@ -1693,6 +1696,7 @@ decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
} else
write_locked = ISC_TRUE;
no_reference = ISC_TRUE;
if (write_locked && dns_rbtnode_refcurrent(node) == 0) {
/*
* We can now delete the node if the reference counter is
@ -1701,35 +1705,97 @@ decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
* current thread locks the tree (e.g., in findnode()).
*/
if (isc_log_wouldlog(dns_lctx, ISC_LOG_DEBUG(1))) {
char printname[DNS_NAME_FORMATSIZE];
/*
* If this node is the only one in the level it's in, deleting
* this node may recursively make its parent the only node in
* the parent level; if so, and if no one is currently using
* the parent node, this is almost the only opportunity to
* clean it up. But the recursive cleanup is not that trivial
* since the child and parent may be in different lock buckets,
* which would cause a lock order reversal problem. To avoid
* the trouble, we'll dispatch a separate event for batch
* cleaning. We need to check whether we're deleting the node
* as a result of pruning to avoid infinite dispatching.
* Note: pruning happens only when a task has been set for the
* rbtdb. If the user of the rbtdb chooses not to set a task,
* it's their responsibility to purge stale leaves (e.g. by
* periodic walk-through).
*/
if (!pruning && node->parent != NULL &&
node->parent->down == node && node->left == NULL &&
node->right == NULL && rbtdb->task != NULL) {
isc_event_t *ev;
dns_db_t *db;
isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
DNS_LOGMODULE_CACHE, ISC_LOG_DEBUG(1),
"decrement_reference: "
"delete from rbt: %p %s",
node,
dns_rbt_formatnodename(node, printname,
sizeof(printname)));
ev = isc_event_allocate(rbtdb->common.mctx, NULL,
DNS_EVENT_RBTPRUNE,
prune_tree, node,
sizeof(isc_event_t));
if (ev != NULL) {
new_reference(rbtdb, node);
db = NULL;
attach((dns_db_t *)rbtdb, &db);
ev->ev_sender = db;
isc_task_send(rbtdb->task, &ev);
no_reference = ISC_FALSE;
} else {
/*
* XXX: this is a weird situation. We could
* ignore this error case, but then the stale
* node will unlikely be purged except via a
* rare condition such as manual cleanup. So
* we queue it in the deadnodes list, hoping
* the memory shortage is temporary and the node
* will be deleted later.
*/
isc_log_write(dns_lctx,
DNS_LOGCATEGORY_DATABASE,
DNS_LOGMODULE_CACHE,
ISC_LOG_INFO,
"decrement_reference: failed to "
"allocate pruning event");
INSIST(!ISC_LINK_LINKED(node, deadlink));
ISC_LIST_APPEND(rbtdb->deadnodes[bucket], node,
deadlink);
}
} else {
if (isc_log_wouldlog(dns_lctx, ISC_LOG_DEBUG(1))) {
char printname[DNS_NAME_FORMATSIZE];
isc_log_write(dns_lctx,
DNS_LOGCATEGORY_DATABASE,
DNS_LOGMODULE_CACHE,
ISC_LOG_DEBUG(1),
"decrement_reference: "
"delete from rbt: %p %s",
node,
dns_rbt_formatnodename(node,
printname,
sizeof(printname)));
}
INSIST(!ISC_LINK_LINKED(node, deadlink));
if (node->nsec3)
result = dns_rbt_deletenode(rbtdb->nsec3, node,
ISC_FALSE);
else
result = dns_rbt_deletenode(rbtdb->tree, node,
ISC_FALSE);
if (result != ISC_R_SUCCESS) {
isc_log_write(dns_lctx,
DNS_LOGCATEGORY_DATABASE,
DNS_LOGMODULE_CACHE,
ISC_LOG_WARNING,
"decrement_reference: "
"dns_rbt_deletenode: %s",
isc_result_totext(result));
}
}
INSIST(!ISC_LINK_LINKED(node, deadlink));
if (node->nsec3)
result = dns_rbt_deletenode(rbtdb->nsec3, node,
ISC_FALSE);
else
result = dns_rbt_deletenode(rbtdb->tree, node,
ISC_FALSE);
if (result != ISC_R_SUCCESS)
isc_log_write(dns_lctx, DNS_LOGCATEGORY_DATABASE,
DNS_LOGMODULE_CACHE, ISC_LOG_WARNING,
"decrement_reference: "
"dns_rbt_deletenode: %s",
isc_result_totext(result));
} else if (dns_rbtnode_refcurrent(node) == 0) {
INSIST(!ISC_LINK_LINKED(node, deadlink));
ISC_LIST_APPEND(rbtdb->deadnodes[bucket], node, deadlink);
}
} else
no_reference = ISC_FALSE;
/* Restore the lock? */
if (nlock == isc_rwlocktype_read)
@ -1746,7 +1812,71 @@ decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
if (write_locked)
isc_rwlock_downgrade(&rbtdb->tree_lock);
return (ISC_TRUE);
return (no_reference);
}
/*
* Prune the tree by recursively cleaning-up single leaves. In the worst
* case, the number of iteration is the number of tree levels, which is at
* most the maximum number of domain name labels, i.e, 127. In practice, this
* should be much smaller (only a few times), and even the worst case would be
* acceptable for a single event.
*/
static void
prune_tree(isc_task_t *task, isc_event_t *event) {
dns_rbtdb_t *rbtdb = event->ev_sender;
dns_rbtnode_t *node = event->ev_arg;
dns_rbtnode_t *parent;
unsigned int locknum;
UNUSED(task);
isc_event_free(&event);
RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
locknum = node->locknum;
NODE_LOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write);
do {
parent = node->parent;
decrement_reference(rbtdb, node, 0, isc_rwlocktype_write,
isc_rwlocktype_write, ISC_TRUE);
if (parent != NULL && parent->down == NULL) {
/*
* node was the only down child of the parent and has
* just been removed. We'll then need to examine the
* parent. Keep the lock if possible; otherwise,
* release the old lock and acquire one for the parent.
*/
if (parent->locknum != locknum) {
NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
isc_rwlocktype_write);
locknum = parent->locknum;
NODE_LOCK(&rbtdb->node_locks[locknum].lock,
isc_rwlocktype_write);
}
/*
* We need to gain a reference to the node before
* decrementing it in the next iteration. In addition,
* if the node is in the dead-nodes list, extract it
* from the list beforehand as we do in
* reactivate_node().
*/
new_reference(rbtdb, parent);
if (ISC_LINK_LINKED(parent, deadlink)) {
ISC_LIST_UNLINK(rbtdb->deadnodes[locknum],
parent, deadlink);
}
} else
parent = NULL;
node = parent;
} while (node != NULL);
NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write);
RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
detach((dns_db_t **)&rbtdb);
}
static inline void
@ -2163,8 +2293,8 @@ closeversion(dns_db_t *db, dns_dbversion_t **versionp, isc_boolean_t commit) {
NODE_UNLOCK(lock, isc_rwlocktype_write);
}
decrement_reference(rbtdb, header->node, least_serial,
isc_rwlocktype_write,
isc_rwlocktype_none);
isc_rwlocktype_write, isc_rwlocktype_none,
ISC_FALSE);
}
if (!EMPTY(cleanup_list)) {
@ -2197,7 +2327,7 @@ closeversion(dns_db_t *db, dns_dbversion_t **versionp, isc_boolean_t commit) {
rollback_node(rbtnode, serial);
decrement_reference(rbtdb, rbtnode, least_serial,
isc_rwlocktype_write,
isc_rwlocktype_write);
isc_rwlocktype_write, ISC_FALSE);
NODE_UNLOCK(lock, isc_rwlocktype_write);
@ -3747,7 +3877,8 @@ zone_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version,
NODE_LOCK(lock, isc_rwlocktype_read);
decrement_reference(search.rbtdb, node, 0,
isc_rwlocktype_read, isc_rwlocktype_none);
isc_rwlocktype_read, isc_rwlocktype_none,
ISC_FALSE);
NODE_UNLOCK(lock, isc_rwlocktype_read);
}
@ -4525,7 +4656,8 @@ cache_find(dns_db_t *db, dns_name_t *name, dns_dbversion_t *version,
NODE_LOCK(lock, isc_rwlocktype_read);
decrement_reference(search.rbtdb, node, 0,
isc_rwlocktype_read, isc_rwlocktype_none);
isc_rwlocktype_read, isc_rwlocktype_none,
ISC_FALSE);
NODE_UNLOCK(lock, isc_rwlocktype_read);
}
@ -4743,7 +4875,7 @@ detachnode(dns_db_t *db, dns_dbnode_t **targetp) {
NODE_LOCK(&nodelock->lock, isc_rwlocktype_read);
if (decrement_reference(rbtdb, node, 0, isc_rwlocktype_read,
isc_rwlocktype_none)) {
isc_rwlocktype_none, ISC_FALSE)) {
if (isc_refcount_current(&nodelock->references) == 0 &&
nodelock->exiting) {
inactive = ISC_TRUE;
@ -7469,7 +7601,7 @@ dereference_iter_node(rbtdb_dbiterator_t *rbtdbiter) {
lock = &rbtdb->node_locks[node->locknum].lock;
NODE_LOCK(lock, isc_rwlocktype_read);
decrement_reference(rbtdb, node, 0, isc_rwlocktype_read,
rbtdbiter->tree_locked);
rbtdbiter->tree_locked, ISC_FALSE);
NODE_UNLOCK(lock, isc_rwlocktype_read);
rbtdbiter->node = NULL;
@ -7510,7 +7642,7 @@ flush_deletions(rbtdb_dbiterator_t *rbtdbiter) {
NODE_LOCK(lock, isc_rwlocktype_read);
decrement_reference(rbtdb, node, 0,
isc_rwlocktype_read,
rbtdbiter->tree_locked);
rbtdbiter->tree_locked, ISC_FALSE);
NODE_UNLOCK(lock, isc_rwlocktype_read);
}
@ -8446,6 +8578,6 @@ expire_header(dns_rbtdb_t *rbtdb, rdatasetheader_t *header,
decrement_reference(rbtdb, header->node, 0,
isc_rwlocktype_write,
tree_locked ? isc_rwlocktype_write :
isc_rwlocktype_none);
isc_rwlocktype_none, ISC_FALSE);
}
}

View file

@ -15,7 +15,7 @@
* PERFORMANCE OF THIS SOFTWARE.
*/
/* $Id: hash.c,v 1.13 2007/06/19 23:47:17 tbox Exp $ */
/* $Id: hash.c,v 1.14 2009/05/06 22:53:54 jinmei Exp $ */
/*! \file
* Some portion of this code was derived from universal hash function
@ -293,6 +293,7 @@ static void
destroy(isc_hash_t **hctxp) {
isc_hash_t *hctx;
isc_mem_t *mctx;
unsigned char canary0[4], canary1[4];
REQUIRE(hctxp != NULL && *hctxp != NULL);
hctx = *hctxp;
@ -312,7 +313,10 @@ destroy(isc_hash_t **hctxp) {
DESTROYLOCK(&hctx->lock);
memcpy(canary0, hctx + 1, sizeof(canary0));
memset(hctx, 0, sizeof(isc_hash_t));
memcpy(canary1, hctx + 1, sizeof(canary1));
INSIST(memcmp(canary0, canary1, sizeof(canary0)) == 0);
isc_mem_put(mctx, hctx, sizeof(isc_hash_t));
isc_mem_detach(&mctx);
}