diff --git a/CHANGES b/CHANGES index 749b0f1b66..0922e10097 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,7 @@ +6356. [bug] Create the pruning task in the dns_cache_flush(), so + the cache pruning still works after the flush. + [GL #4621] + 6353. [bug] Improve the TTL-based cleaning by removing the expired headers from the heap, so they don't block the next cleaning round and clean more than a single item for diff --git a/bin/tests/system/dyndb/driver/db.c b/bin/tests/system/dyndb/driver/db.c index 334fd549de..96857224c2 100644 --- a/bin/tests/system/dyndb/driver/db.c +++ b/bin/tests/system/dyndb/driver/db.c @@ -420,12 +420,12 @@ overmem(dns_db_t *db, bool over) { } static void -settask(dns_db_t *db, isc_task_t *task) { +settask(dns_db_t *db, isc_task_t *task, isc_task_t *prunetask) { sampledb_t *sampledb = (sampledb_t *)db; REQUIRE(VALID_SAMPLEDB(sampledb)); - dns_db_settask(sampledb->rbtdb, task); + dns_db_settask(sampledb->rbtdb, task, prunetask); } static isc_result_t diff --git a/doc/notes/notes-current.rst b/doc/notes/notes-current.rst index cd16ecdbec..f9a7315aaf 100644 --- a/doc/notes/notes-current.rst +++ b/doc/notes/notes-current.rst @@ -47,6 +47,11 @@ Bug Fixes :gl:`#4596` +- Using :option:`rndc flush` inadvertently caused cache cleaning to + become less effective. This could ultimately lead to the configured + :any:`max-cache-size` limit being exceeded and has now been fixed. + :gl:`#4621` + Known Issues ~~~~~~~~~~~~ diff --git a/lib/dns/cache.c b/lib/dns/cache.c index 7ffb6f85b8..2a6344e51f 100644 --- a/lib/dns/cache.c +++ b/lib/dns/cache.c @@ -129,6 +129,7 @@ struct dns_cache { isc_mutex_t lock; isc_mem_t *mctx; /* Main cache memory */ isc_mem_t *hmctx; /* Heap memory */ + isc_taskmgr_t *taskmgr; char *name; isc_refcount_t references; isc_refcount_t live_tasks; @@ -169,13 +170,47 @@ water(void *arg, int mark); static isc_result_t cache_create_db(dns_cache_t *cache, dns_db_t **db) { isc_result_t result; + isc_task_t *dbtask = NULL; + isc_task_t *prunetask = NULL; + result = dns_db_create(cache->mctx, cache->db_type, dns_rootname, dns_dbtype_cache, cache->rdclass, cache->db_argc, cache->db_argv, db); - if (result == ISC_R_SUCCESS) { - dns_db_setservestalettl(*db, cache->serve_stale_ttl); - dns_db_setservestalerefresh(*db, cache->serve_stale_refresh); + if (result != ISC_R_SUCCESS) { + return (result); } + + dns_db_setservestalettl(*db, cache->serve_stale_ttl); + dns_db_setservestalerefresh(*db, cache->serve_stale_refresh); + + if (cache->taskmgr == NULL) { + return (ISC_R_SUCCESS); + } + + result = isc_task_create(cache->taskmgr, 1, &dbtask); + if (result != ISC_R_SUCCESS) { + goto cleanup_db; + } + isc_task_setname(dbtask, "cache_dbtask", NULL); + + result = isc_task_create(cache->taskmgr, UINT_MAX, &prunetask); + if (result != ISC_R_SUCCESS) { + goto cleanup_dbtask; + } + isc_task_setname(prunetask, "cache_prunetask", NULL); + + dns_db_settask(*db, dbtask, prunetask); + + isc_task_detach(&prunetask); + isc_task_detach(&dbtask); + + return (ISC_R_SUCCESS); + +cleanup_dbtask: + isc_task_detach(&dbtask); +cleanup_db: + dns_db_detach(db); + return (result); } @@ -187,12 +222,12 @@ dns_cache_create(isc_mem_t *cmctx, isc_mem_t *hmctx, isc_taskmgr_t *taskmgr, isc_result_t result; dns_cache_t *cache; int i, extra = 0; - isc_task_t *dbtask; REQUIRE(cachep != NULL); REQUIRE(*cachep == NULL); REQUIRE(cmctx != NULL); REQUIRE(hmctx != NULL); + REQUIRE(taskmgr != NULL || strcmp(db_type, "rbt") != 0); REQUIRE(cachename != NULL); cache = isc_mem_get(cmctx, sizeof(*cache)); @@ -201,6 +236,11 @@ dns_cache_create(isc_mem_t *cmctx, isc_mem_t *hmctx, isc_taskmgr_t *taskmgr, isc_mem_attach(cmctx, &cache->mctx); isc_mem_attach(hmctx, &cache->hmctx); + cache->taskmgr = NULL; + if (taskmgr != NULL) { + isc_taskmgr_attach(taskmgr, &cache->taskmgr); + } + cache->name = NULL; if (cachename != NULL) { cache->name = isc_mem_strdup(cmctx, cachename); @@ -257,18 +297,6 @@ dns_cache_create(isc_mem_t *cmctx, isc_mem_t *hmctx, isc_taskmgr_t *taskmgr, if (result != ISC_R_SUCCESS) { goto cleanup_dbargv; } - if (taskmgr != NULL) { - dbtask = NULL; - result = isc_task_create(taskmgr, 1, &dbtask); - if (result != ISC_R_SUCCESS) { - goto cleanup_db; - } - - isc_task_setname(dbtask, "cache_dbtask", NULL); - dns_db_settask(cache->db, dbtask); - isc_task_detach(&dbtask); - } - cache->magic = CACHE_MAGIC; /* @@ -312,6 +340,9 @@ cleanup_lock: if (cache->name != NULL) { isc_mem_free(cmctx, cache->name); } + if (cache->taskmgr != NULL) { + isc_taskmgr_detach(&cache->taskmgr); + } isc_mem_detach(&cache->hmctx); isc_mem_putanddetach(&cache->mctx, cache, sizeof(*cache)); return (result); @@ -378,6 +409,10 @@ cache_free(dns_cache_t *cache) { isc_stats_detach(&cache->stats); } + if (cache->taskmgr != NULL) { + isc_taskmgr_detach(&cache->taskmgr); + } + isc_mutex_destroy(&cache->lock); cache->magic = 0; diff --git a/lib/dns/db.c b/lib/dns/db.c index c95d19ad13..0fce3005fb 100644 --- a/lib/dns/db.c +++ b/lib/dns/db.c @@ -829,10 +829,10 @@ dns_db_hashsize(dns_db_t *db) { } void -dns_db_settask(dns_db_t *db, isc_task_t *task) { +dns_db_settask(dns_db_t *db, isc_task_t *task, isc_task_t *prunetask) { REQUIRE(DNS_DB_VALID(db)); - (db->methods->settask)(db, task); + (db->methods->settask)(db, task, prunetask); } isc_result_t diff --git a/lib/dns/include/dns/cache.h b/lib/dns/include/dns/cache.h index 8fc96574d8..880c1ffd10 100644 --- a/lib/dns/include/dns/cache.h +++ b/lib/dns/include/dns/cache.h @@ -61,25 +61,20 @@ dns_cache_create(isc_mem_t *cmctx, isc_mem_t *hmctx, isc_taskmgr_t *taskmgr, const char *cachename, const char *db_type, unsigned int db_argc, char **db_argv, dns_cache_t **cachep); /*%< - * Create a new DNS cache. - * - * dns_cache_create2() will create a named cache. - * - * dns_cache_create3() will create a named cache using two separate memory - * contexts, one for cache data which can be cleaned and a separate one for - * memory allocated for the heap (which can grow without an upper limit and - * has no mechanism for shrinking). - * - * dns_cache_create() is a backward compatible version that internally - * specifies an empty cache name and a single memory context. + * Create a new named DNS cache using two separate memory contexts, one for + * cache data which can be cleaned and a separate one for memory allocated for + * the heap (which can grow without an upper limit and has no mechanism for + * shrinking). * * Requires: * - *\li 'cmctx' (and 'hmctx' if applicable) is a valid memory context. + *\li 'cmctx' and 'hmctx' are valid memory contexts. + * + *\li 'taskmgr' is a valid task manager (if 'db_type' is "rbt"). * *\li 'taskmgr' is a valid task manager and 'timermgr' is a valid timer - * manager, or both are NULL. If NULL, no periodic cleaning of the - * cache will take place. + * manager, or both are NULL (if 'db_type' is not "rbt"). If NULL, no + * periodic cleaning of the cache will take place. * *\li 'cachename' is a valid string. This must not be NULL. * diff --git a/lib/dns/include/dns/db.h b/lib/dns/include/dns/db.h index 9b53f0435f..bbae0e2991 100644 --- a/lib/dns/include/dns/db.h +++ b/lib/dns/include/dns/db.h @@ -139,7 +139,7 @@ typedef struct dns_dbmethods { unsigned int (*nodecount)(dns_db_t *db, dns_dbtree_t); bool (*ispersistent)(dns_db_t *db); void (*overmem)(dns_db_t *db, bool overmem); - void (*settask)(dns_db_t *db, isc_task_t *); + void (*settask)(dns_db_t *db, isc_task_t *, isc_task_t *); isc_result_t (*getoriginnode)(dns_db_t *db, dns_dbnode_t **nodep); void (*transfernode)(dns_db_t *db, dns_dbnode_t **sourcep, dns_dbnode_t **targetp); @@ -1389,13 +1389,14 @@ dns_db_hashsize(dns_db_t *db); */ void -dns_db_settask(dns_db_t *db, isc_task_t *task); +dns_db_settask(dns_db_t *db, isc_task_t *task, isc_task_t *prunetask); /*%< * If task is set then the final detach maybe performed asynchronously. * * Requires: * \li 'db' is a valid database. - * \li 'task' to be valid or NULL. + * \li 'task' to be valid or NULL (default task to send events to). + * \li 'prunetask' to be valid or NULL (task to send tree-pruning events to). */ bool diff --git a/lib/dns/rbtdb.c b/lib/dns/rbtdb.c index 8ac0340bb0..80c128b343 100644 --- a/lib/dns/rbtdb.c +++ b/lib/dns/rbtdb.c @@ -462,6 +462,7 @@ struct dns_rbtdb { rbtdb_version_t *future_version; rbtdb_versionlist_t open_versions; isc_task_t *task; + isc_task_t *prunetask; dns_dbnode_t *soanode; dns_dbnode_t *nsnode; @@ -1163,6 +1164,9 @@ free_rbtdb(dns_rbtdb_t *rbtdb, bool log, isc_event_t *event) { if (rbtdb->task != NULL) { isc_task_detach(&rbtdb->task); } + if (rbtdb->prunetask != NULL) { + isc_task_detach(&rbtdb->prunetask); + } RBTDB_DESTROYLOCK(&rbtdb->lock); rbtdb->common.magic = 0; @@ -1850,7 +1854,7 @@ new_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, * The tree lock must be held for the result to be valid. */ static bool -is_leaf(dns_rbtnode_t *node) { +is_last_node_on_its_level(dns_rbtnode_t *node) { return (node->parent != NULL && node->parent->down == node && node->left == NULL && node->right == NULL); } @@ -1867,7 +1871,7 @@ send_to_prune_tree(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, db = NULL; attach((dns_db_t *)rbtdb, &db); ev->ev_sender = db; - isc_task_send(rbtdb->task, &ev); + isc_task_send(rbtdb->prunetask, &ev); } /*% @@ -1900,7 +1904,7 @@ cleanup_dead_nodes(dns_rbtdb_t *rbtdb, int bucketnum) { continue; } - if (is_leaf(node) && rbtdb->task != NULL) { + if (is_last_node_on_its_level(node) && rbtdb->task != NULL) { send_to_prune_tree(rbtdb, node, isc_rwlocktype_write); } else if (node->down == NULL && node->data == NULL) { /* @@ -2084,26 +2088,31 @@ decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, if (write_locked) { /* - * If this node is the only one in the level it's in, deleting - * this node may recursively make its parent the only node in - * the parent level; if so, and if no one is currently using - * the parent node, this is almost the only opportunity to - * clean it up. But the recursive cleanup is not that trivial - * since the child and parent may be in different lock buckets, - * which would cause a lock order reversal problem. To avoid - * the trouble, we'll dispatch a separate event for batch - * cleaning. We need to check whether we're deleting the node - * as a result of pruning to avoid infinite dispatching. - * Note: pruning happens only when a task has been set for the - * rbtdb. If the user of the rbtdb chooses not to set a task, - * it's their responsibility to purge stale leaves (e.g. by - * periodic walk-through). + * If this node is the only one left on its RBTDB level, + * attempt pruning the RBTDB (i.e. deleting empty nodes that + * are ancestors of 'node' and are not interior nodes) starting + * from this node (see prune_tree()). The main reason this is + * not done immediately, but asynchronously, is that the + * ancestors of 'node' are almost guaranteed to belong to + * different node buckets and we don't want to do juggle locks + * right now. + * + * Since prune_tree() also calls decrement_reference(), check + * the value of the 'pruning' parameter (which is only set to + * 'true' in the decrement_reference() call present in + * prune_tree()) to prevent an infinite loop and to allow a + * node sent to prune_tree() to be deleted by the delete_node() + * call in the code branch below. */ - if (!pruning && is_leaf(node) && rbtdb->task != NULL) { + if (!pruning && is_last_node_on_its_level(node) && + rbtdb->task != NULL) + { send_to_prune_tree(rbtdb, node, isc_rwlocktype_write); no_reference = false; } else { - /* We can now delete the node. */ + /* + * The node can now be deleted. + */ delete_node(rbtdb, node); } } else { @@ -2139,10 +2148,27 @@ restore_locks: } /* - * Prune the tree by cleaning up single leaves. A single execution of this - * function cleans up a single node; if the parent of the latter becomes a - * single leaf on its own level as a result, the parent is then also sent to - * this function. + * Prune the RBTDB tree of trees. Start by attempting to delete a node that is + * the only one left on its RBTDB level (see the send_to_prune_tree() call in + * decrement_reference()). Then, if the node has a parent (which can either + * exist on the same RBTDB level or on an upper RBTDB level), check whether the + * latter is an interior node (i.e. a node with a non-NULL 'down' pointer). If + * the parent node is not an interior node, attempt deleting the parent node as + * well and then move on to examining the parent node's parent, etc. Continue + * traversing the RBTDB tree until a node is encountered that is still an + * interior node after the previously-processed node gets deleted. + * + * It is acceptable for a node sent to this function to NOT be deleted in the + * process (e.g. if it gets reactivated in the meantime). Furthermore, node + * deletion is not a prerequisite for continuing RBTDB traversal. + * + * This function gets called once for every "starting node" and it continues + * traversing the RBTDB until the stop condition is met. In the worst case, + * the number of nodes processed by a single execution of this function is the + * number of tree levels, which is at most the maximum number of domain name + * labels (127); however, it should be much smaller in practice and deleting + * empty RBTDB nodes is critical to keeping the amount of memory used by the + * cache memory context within the configured limit anyway. */ static void prune_tree(isc_task_t *task, isc_event_t *event) { @@ -2156,22 +2182,44 @@ prune_tree(isc_task_t *task, isc_event_t *event) { isc_event_free(&event); RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write); - - parent = node->parent; - NODE_LOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write); - decrement_reference(rbtdb, node, 0, isc_rwlocktype_write, - isc_rwlocktype_write, true); + do { + parent = node->parent; + decrement_reference(rbtdb, node, 0, isc_rwlocktype_write, + isc_rwlocktype_write, true); + + /* + * Check whether the parent is an interior node. Note that it + * might have been one before the decrement_reference() call on + * the previous line, but decrementing the reference count for + * 'node' could have caused 'node->parent->down' to become + * NULL. + */ + if (parent != NULL && parent->down == NULL) { + /* + * Keep the node lock if possible; otherwise, release + * the old lock and acquire one for the parent. + */ + if (parent->locknum != locknum) { + NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, + isc_rwlocktype_write); + locknum = parent->locknum; + NODE_LOCK(&rbtdb->node_locks[locknum].lock, + isc_rwlocktype_write); + } + + /* + * We need to gain a reference to the parent node + * before decrementing it in the next iteration. + */ + new_reference(rbtdb, parent, isc_rwlocktype_write); + } else { + parent = NULL; + } + + node = parent; + } while (node != NULL); NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write); - - if (parent != NULL && is_leaf(parent)) { - NODE_LOCK(&rbtdb->node_locks[parent->locknum].lock, - isc_rwlocktype_write); - send_to_prune_tree(rbtdb, parent, isc_rwlocktype_write); - NODE_UNLOCK(&rbtdb->node_locks[parent->locknum].lock, - isc_rwlocktype_write); - } - RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write); detach((dns_db_t **)&rbtdb); @@ -7719,7 +7767,7 @@ hashsize(dns_db_t *db) { } static void -settask(dns_db_t *db, isc_task_t *task) { +settask(dns_db_t *db, isc_task_t *task, isc_task_t *prunetask) { dns_rbtdb_t *rbtdb; rbtdb = (dns_rbtdb_t *)db; @@ -7733,6 +7781,12 @@ settask(dns_db_t *db, isc_task_t *task) { if (task != NULL) { isc_task_attach(task, &rbtdb->task); } + if (rbtdb->prunetask != NULL) { + isc_task_detach(&rbtdb->prunetask); + } + if (prunetask != NULL) { + isc_task_attach(prunetask, &rbtdb->prunetask); + } RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write); } @@ -8410,6 +8464,7 @@ dns_rbtdb_create(isc_mem_t *mctx, const dns_name_t *origin, dns_dbtype_t type, isc_refcount_init(&rbtdb->references, 1); rbtdb->attributes = 0; rbtdb->task = NULL; + rbtdb->prunetask = NULL; rbtdb->serve_stale_ttl = 0; /* diff --git a/lib/dns/sdb.c b/lib/dns/sdb.c index 317eeb0b21..b58080a2fa 100644 --- a/lib/dns/sdb.c +++ b/lib/dns/sdb.c @@ -1263,9 +1263,10 @@ overmem(dns_db_t *db, bool over) { } static void -settask(dns_db_t *db, isc_task_t *task) { +settask(dns_db_t *db, isc_task_t *task, isc_task_t *prunetask) { UNUSED(db); UNUSED(task); + UNUSED(prunetask); } static dns_dbmethods_t sdb_methods = { diff --git a/lib/dns/sdlz.c b/lib/dns/sdlz.c index 7ab08f6bc9..ff67448865 100644 --- a/lib/dns/sdlz.c +++ b/lib/dns/sdlz.c @@ -1213,9 +1213,10 @@ overmem(dns_db_t *db, bool over) { } static void -settask(dns_db_t *db, isc_task_t *task) { +settask(dns_db_t *db, isc_task_t *task, isc_task_t *prunetask) { UNUSED(db); UNUSED(task); + UNUSED(prunetask); } /* diff --git a/lib/dns/zone.c b/lib/dns/zone.c index 729adcb6e4..19c2ce3651 100644 --- a/lib/dns/zone.c +++ b/lib/dns/zone.c @@ -2339,7 +2339,7 @@ zone_load(dns_zone_t *zone, unsigned int flags, bool locked) { isc_result_totext(result)); goto cleanup; } - dns_db_settask(db, zone->task); + dns_db_settask(db, zone->task, zone->task); if (zone->type == dns_zone_primary || zone->type == dns_zone_secondary || zone->type == dns_zone_mirror) @@ -14791,7 +14791,7 @@ ns_query(dns_zone_t *zone, dns_rdataset_t *soardataset, dns_stub_t *stub) { isc_result_totext(result)); goto cleanup; } - dns_db_settask(stub->db, zone->task); + dns_db_settask(stub->db, zone->task, zone->task); } result = dns_db_newversion(stub->db, &stub->version); @@ -16235,7 +16235,7 @@ dns_zone_settask(dns_zone_t *zone, isc_task_t *task) { isc_task_attach(task, &zone->task); ZONEDB_LOCK(&zone->dblock, isc_rwlocktype_read); if (zone->db != NULL) { - dns_db_settask(zone->db, zone->task); + dns_db_settask(zone->db, zone->task, zone->task); } ZONEDB_UNLOCK(&zone->dblock, isc_rwlocktype_read); UNLOCK_ZONE(zone); @@ -17508,7 +17508,7 @@ zone_replacedb(dns_zone_t *zone, dns_db_t *db, bool dump) { zone_detachdb(zone); } zone_attachdb(zone, db); - dns_db_settask(zone->db, zone->task); + dns_db_settask(zone->db, zone->task, zone->task); DNS_ZONE_SETFLAG(zone, DNS_ZONEFLG_LOADED | DNS_ZONEFLG_NEEDNOTIFY); return (ISC_R_SUCCESS);