Merge branch '4621-fix-cache-pruning-after-rndc-flush-9.18.25' into 'v9.18.25-release'

[9.18.25] Move the task creation into cache_create_db()

See merge request isc-projects/bind9!8835
This commit is contained in:
Michał Kępień 2024-03-06 18:15:41 +00:00
commit a7636df9bf
11 changed files with 178 additions and 81 deletions

View file

@ -1,3 +1,7 @@
6356. [bug] Create the pruning task in the dns_cache_flush(), so
the cache pruning still works after the flush.
[GL #4621]
6353. [bug] Improve the TTL-based cleaning by removing the expired
headers from the heap, so they don't block the next
cleaning round and clean more than a single item for

View file

@ -420,12 +420,12 @@ overmem(dns_db_t *db, bool over) {
}
static void
settask(dns_db_t *db, isc_task_t *task) {
settask(dns_db_t *db, isc_task_t *task, isc_task_t *prunetask) {
sampledb_t *sampledb = (sampledb_t *)db;
REQUIRE(VALID_SAMPLEDB(sampledb));
dns_db_settask(sampledb->rbtdb, task);
dns_db_settask(sampledb->rbtdb, task, prunetask);
}
static isc_result_t

View file

@ -47,6 +47,11 @@ Bug Fixes
:gl:`#4596`
- Using :option:`rndc flush` inadvertently caused cache cleaning to
become less effective. This could ultimately lead to the configured
:any:`max-cache-size` limit being exceeded and has now been fixed.
:gl:`#4621`
Known Issues
~~~~~~~~~~~~

View file

@ -129,6 +129,7 @@ struct dns_cache {
isc_mutex_t lock;
isc_mem_t *mctx; /* Main cache memory */
isc_mem_t *hmctx; /* Heap memory */
isc_taskmgr_t *taskmgr;
char *name;
isc_refcount_t references;
isc_refcount_t live_tasks;
@ -169,13 +170,47 @@ water(void *arg, int mark);
static isc_result_t
cache_create_db(dns_cache_t *cache, dns_db_t **db) {
isc_result_t result;
isc_task_t *dbtask = NULL;
isc_task_t *prunetask = NULL;
result = dns_db_create(cache->mctx, cache->db_type, dns_rootname,
dns_dbtype_cache, cache->rdclass, cache->db_argc,
cache->db_argv, db);
if (result == ISC_R_SUCCESS) {
dns_db_setservestalettl(*db, cache->serve_stale_ttl);
dns_db_setservestalerefresh(*db, cache->serve_stale_refresh);
if (result != ISC_R_SUCCESS) {
return (result);
}
dns_db_setservestalettl(*db, cache->serve_stale_ttl);
dns_db_setservestalerefresh(*db, cache->serve_stale_refresh);
if (cache->taskmgr == NULL) {
return (ISC_R_SUCCESS);
}
result = isc_task_create(cache->taskmgr, 1, &dbtask);
if (result != ISC_R_SUCCESS) {
goto cleanup_db;
}
isc_task_setname(dbtask, "cache_dbtask", NULL);
result = isc_task_create(cache->taskmgr, UINT_MAX, &prunetask);
if (result != ISC_R_SUCCESS) {
goto cleanup_dbtask;
}
isc_task_setname(prunetask, "cache_prunetask", NULL);
dns_db_settask(*db, dbtask, prunetask);
isc_task_detach(&prunetask);
isc_task_detach(&dbtask);
return (ISC_R_SUCCESS);
cleanup_dbtask:
isc_task_detach(&dbtask);
cleanup_db:
dns_db_detach(db);
return (result);
}
@ -187,12 +222,12 @@ dns_cache_create(isc_mem_t *cmctx, isc_mem_t *hmctx, isc_taskmgr_t *taskmgr,
isc_result_t result;
dns_cache_t *cache;
int i, extra = 0;
isc_task_t *dbtask;
REQUIRE(cachep != NULL);
REQUIRE(*cachep == NULL);
REQUIRE(cmctx != NULL);
REQUIRE(hmctx != NULL);
REQUIRE(taskmgr != NULL || strcmp(db_type, "rbt") != 0);
REQUIRE(cachename != NULL);
cache = isc_mem_get(cmctx, sizeof(*cache));
@ -201,6 +236,11 @@ dns_cache_create(isc_mem_t *cmctx, isc_mem_t *hmctx, isc_taskmgr_t *taskmgr,
isc_mem_attach(cmctx, &cache->mctx);
isc_mem_attach(hmctx, &cache->hmctx);
cache->taskmgr = NULL;
if (taskmgr != NULL) {
isc_taskmgr_attach(taskmgr, &cache->taskmgr);
}
cache->name = NULL;
if (cachename != NULL) {
cache->name = isc_mem_strdup(cmctx, cachename);
@ -257,18 +297,6 @@ dns_cache_create(isc_mem_t *cmctx, isc_mem_t *hmctx, isc_taskmgr_t *taskmgr,
if (result != ISC_R_SUCCESS) {
goto cleanup_dbargv;
}
if (taskmgr != NULL) {
dbtask = NULL;
result = isc_task_create(taskmgr, 1, &dbtask);
if (result != ISC_R_SUCCESS) {
goto cleanup_db;
}
isc_task_setname(dbtask, "cache_dbtask", NULL);
dns_db_settask(cache->db, dbtask);
isc_task_detach(&dbtask);
}
cache->magic = CACHE_MAGIC;
/*
@ -312,6 +340,9 @@ cleanup_lock:
if (cache->name != NULL) {
isc_mem_free(cmctx, cache->name);
}
if (cache->taskmgr != NULL) {
isc_taskmgr_detach(&cache->taskmgr);
}
isc_mem_detach(&cache->hmctx);
isc_mem_putanddetach(&cache->mctx, cache, sizeof(*cache));
return (result);
@ -378,6 +409,10 @@ cache_free(dns_cache_t *cache) {
isc_stats_detach(&cache->stats);
}
if (cache->taskmgr != NULL) {
isc_taskmgr_detach(&cache->taskmgr);
}
isc_mutex_destroy(&cache->lock);
cache->magic = 0;

View file

@ -829,10 +829,10 @@ dns_db_hashsize(dns_db_t *db) {
}
void
dns_db_settask(dns_db_t *db, isc_task_t *task) {
dns_db_settask(dns_db_t *db, isc_task_t *task, isc_task_t *prunetask) {
REQUIRE(DNS_DB_VALID(db));
(db->methods->settask)(db, task);
(db->methods->settask)(db, task, prunetask);
}
isc_result_t

View file

@ -61,25 +61,20 @@ dns_cache_create(isc_mem_t *cmctx, isc_mem_t *hmctx, isc_taskmgr_t *taskmgr,
const char *cachename, const char *db_type,
unsigned int db_argc, char **db_argv, dns_cache_t **cachep);
/*%<
* Create a new DNS cache.
*
* dns_cache_create2() will create a named cache.
*
* dns_cache_create3() will create a named cache using two separate memory
* contexts, one for cache data which can be cleaned and a separate one for
* memory allocated for the heap (which can grow without an upper limit and
* has no mechanism for shrinking).
*
* dns_cache_create() is a backward compatible version that internally
* specifies an empty cache name and a single memory context.
* Create a new named DNS cache using two separate memory contexts, one for
* cache data which can be cleaned and a separate one for memory allocated for
* the heap (which can grow without an upper limit and has no mechanism for
* shrinking).
*
* Requires:
*
*\li 'cmctx' (and 'hmctx' if applicable) is a valid memory context.
*\li 'cmctx' and 'hmctx' are valid memory contexts.
*
*\li 'taskmgr' is a valid task manager (if 'db_type' is "rbt").
*
*\li 'taskmgr' is a valid task manager and 'timermgr' is a valid timer
* manager, or both are NULL. If NULL, no periodic cleaning of the
* cache will take place.
* manager, or both are NULL (if 'db_type' is not "rbt"). If NULL, no
* periodic cleaning of the cache will take place.
*
*\li 'cachename' is a valid string. This must not be NULL.
*

View file

@ -139,7 +139,7 @@ typedef struct dns_dbmethods {
unsigned int (*nodecount)(dns_db_t *db, dns_dbtree_t);
bool (*ispersistent)(dns_db_t *db);
void (*overmem)(dns_db_t *db, bool overmem);
void (*settask)(dns_db_t *db, isc_task_t *);
void (*settask)(dns_db_t *db, isc_task_t *, isc_task_t *);
isc_result_t (*getoriginnode)(dns_db_t *db, dns_dbnode_t **nodep);
void (*transfernode)(dns_db_t *db, dns_dbnode_t **sourcep,
dns_dbnode_t **targetp);
@ -1389,13 +1389,14 @@ dns_db_hashsize(dns_db_t *db);
*/
void
dns_db_settask(dns_db_t *db, isc_task_t *task);
dns_db_settask(dns_db_t *db, isc_task_t *task, isc_task_t *prunetask);
/*%<
* If task is set then the final detach maybe performed asynchronously.
*
* Requires:
* \li 'db' is a valid database.
* \li 'task' to be valid or NULL.
* \li 'task' to be valid or NULL (default task to send events to).
* \li 'prunetask' to be valid or NULL (task to send tree-pruning events to).
*/
bool

View file

@ -462,6 +462,7 @@ struct dns_rbtdb {
rbtdb_version_t *future_version;
rbtdb_versionlist_t open_versions;
isc_task_t *task;
isc_task_t *prunetask;
dns_dbnode_t *soanode;
dns_dbnode_t *nsnode;
@ -1163,6 +1164,9 @@ free_rbtdb(dns_rbtdb_t *rbtdb, bool log, isc_event_t *event) {
if (rbtdb->task != NULL) {
isc_task_detach(&rbtdb->task);
}
if (rbtdb->prunetask != NULL) {
isc_task_detach(&rbtdb->prunetask);
}
RBTDB_DESTROYLOCK(&rbtdb->lock);
rbtdb->common.magic = 0;
@ -1850,7 +1854,7 @@ new_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
* The tree lock must be held for the result to be valid.
*/
static bool
is_leaf(dns_rbtnode_t *node) {
is_last_node_on_its_level(dns_rbtnode_t *node) {
return (node->parent != NULL && node->parent->down == node &&
node->left == NULL && node->right == NULL);
}
@ -1867,7 +1871,7 @@ send_to_prune_tree(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
db = NULL;
attach((dns_db_t *)rbtdb, &db);
ev->ev_sender = db;
isc_task_send(rbtdb->task, &ev);
isc_task_send(rbtdb->prunetask, &ev);
}
/*%
@ -1900,7 +1904,7 @@ cleanup_dead_nodes(dns_rbtdb_t *rbtdb, int bucketnum) {
continue;
}
if (is_leaf(node) && rbtdb->task != NULL) {
if (is_last_node_on_its_level(node) && rbtdb->task != NULL) {
send_to_prune_tree(rbtdb, node, isc_rwlocktype_write);
} else if (node->down == NULL && node->data == NULL) {
/*
@ -2084,26 +2088,31 @@ decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node,
if (write_locked) {
/*
* If this node is the only one in the level it's in, deleting
* this node may recursively make its parent the only node in
* the parent level; if so, and if no one is currently using
* the parent node, this is almost the only opportunity to
* clean it up. But the recursive cleanup is not that trivial
* since the child and parent may be in different lock buckets,
* which would cause a lock order reversal problem. To avoid
* the trouble, we'll dispatch a separate event for batch
* cleaning. We need to check whether we're deleting the node
* as a result of pruning to avoid infinite dispatching.
* Note: pruning happens only when a task has been set for the
* rbtdb. If the user of the rbtdb chooses not to set a task,
* it's their responsibility to purge stale leaves (e.g. by
* periodic walk-through).
* If this node is the only one left on its RBTDB level,
* attempt pruning the RBTDB (i.e. deleting empty nodes that
* are ancestors of 'node' and are not interior nodes) starting
* from this node (see prune_tree()). The main reason this is
* not done immediately, but asynchronously, is that the
* ancestors of 'node' are almost guaranteed to belong to
* different node buckets and we don't want to do juggle locks
* right now.
*
* Since prune_tree() also calls decrement_reference(), check
* the value of the 'pruning' parameter (which is only set to
* 'true' in the decrement_reference() call present in
* prune_tree()) to prevent an infinite loop and to allow a
* node sent to prune_tree() to be deleted by the delete_node()
* call in the code branch below.
*/
if (!pruning && is_leaf(node) && rbtdb->task != NULL) {
if (!pruning && is_last_node_on_its_level(node) &&
rbtdb->task != NULL)
{
send_to_prune_tree(rbtdb, node, isc_rwlocktype_write);
no_reference = false;
} else {
/* We can now delete the node. */
/*
* The node can now be deleted.
*/
delete_node(rbtdb, node);
}
} else {
@ -2139,10 +2148,27 @@ restore_locks:
}
/*
* Prune the tree by cleaning up single leaves. A single execution of this
* function cleans up a single node; if the parent of the latter becomes a
* single leaf on its own level as a result, the parent is then also sent to
* this function.
* Prune the RBTDB tree of trees. Start by attempting to delete a node that is
* the only one left on its RBTDB level (see the send_to_prune_tree() call in
* decrement_reference()). Then, if the node has a parent (which can either
* exist on the same RBTDB level or on an upper RBTDB level), check whether the
* latter is an interior node (i.e. a node with a non-NULL 'down' pointer). If
* the parent node is not an interior node, attempt deleting the parent node as
* well and then move on to examining the parent node's parent, etc. Continue
* traversing the RBTDB tree until a node is encountered that is still an
* interior node after the previously-processed node gets deleted.
*
* It is acceptable for a node sent to this function to NOT be deleted in the
* process (e.g. if it gets reactivated in the meantime). Furthermore, node
* deletion is not a prerequisite for continuing RBTDB traversal.
*
* This function gets called once for every "starting node" and it continues
* traversing the RBTDB until the stop condition is met. In the worst case,
* the number of nodes processed by a single execution of this function is the
* number of tree levels, which is at most the maximum number of domain name
* labels (127); however, it should be much smaller in practice and deleting
* empty RBTDB nodes is critical to keeping the amount of memory used by the
* cache memory context within the configured limit anyway.
*/
static void
prune_tree(isc_task_t *task, isc_event_t *event) {
@ -2156,22 +2182,44 @@ prune_tree(isc_task_t *task, isc_event_t *event) {
isc_event_free(&event);
RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
parent = node->parent;
NODE_LOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write);
decrement_reference(rbtdb, node, 0, isc_rwlocktype_write,
isc_rwlocktype_write, true);
do {
parent = node->parent;
decrement_reference(rbtdb, node, 0, isc_rwlocktype_write,
isc_rwlocktype_write, true);
/*
* Check whether the parent is an interior node. Note that it
* might have been one before the decrement_reference() call on
* the previous line, but decrementing the reference count for
* 'node' could have caused 'node->parent->down' to become
* NULL.
*/
if (parent != NULL && parent->down == NULL) {
/*
* Keep the node lock if possible; otherwise, release
* the old lock and acquire one for the parent.
*/
if (parent->locknum != locknum) {
NODE_UNLOCK(&rbtdb->node_locks[locknum].lock,
isc_rwlocktype_write);
locknum = parent->locknum;
NODE_LOCK(&rbtdb->node_locks[locknum].lock,
isc_rwlocktype_write);
}
/*
* We need to gain a reference to the parent node
* before decrementing it in the next iteration.
*/
new_reference(rbtdb, parent, isc_rwlocktype_write);
} else {
parent = NULL;
}
node = parent;
} while (node != NULL);
NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write);
if (parent != NULL && is_leaf(parent)) {
NODE_LOCK(&rbtdb->node_locks[parent->locknum].lock,
isc_rwlocktype_write);
send_to_prune_tree(rbtdb, parent, isc_rwlocktype_write);
NODE_UNLOCK(&rbtdb->node_locks[parent->locknum].lock,
isc_rwlocktype_write);
}
RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write);
detach((dns_db_t **)&rbtdb);
@ -7719,7 +7767,7 @@ hashsize(dns_db_t *db) {
}
static void
settask(dns_db_t *db, isc_task_t *task) {
settask(dns_db_t *db, isc_task_t *task, isc_task_t *prunetask) {
dns_rbtdb_t *rbtdb;
rbtdb = (dns_rbtdb_t *)db;
@ -7733,6 +7781,12 @@ settask(dns_db_t *db, isc_task_t *task) {
if (task != NULL) {
isc_task_attach(task, &rbtdb->task);
}
if (rbtdb->prunetask != NULL) {
isc_task_detach(&rbtdb->prunetask);
}
if (prunetask != NULL) {
isc_task_attach(prunetask, &rbtdb->prunetask);
}
RBTDB_UNLOCK(&rbtdb->lock, isc_rwlocktype_write);
}
@ -8410,6 +8464,7 @@ dns_rbtdb_create(isc_mem_t *mctx, const dns_name_t *origin, dns_dbtype_t type,
isc_refcount_init(&rbtdb->references, 1);
rbtdb->attributes = 0;
rbtdb->task = NULL;
rbtdb->prunetask = NULL;
rbtdb->serve_stale_ttl = 0;
/*

View file

@ -1263,9 +1263,10 @@ overmem(dns_db_t *db, bool over) {
}
static void
settask(dns_db_t *db, isc_task_t *task) {
settask(dns_db_t *db, isc_task_t *task, isc_task_t *prunetask) {
UNUSED(db);
UNUSED(task);
UNUSED(prunetask);
}
static dns_dbmethods_t sdb_methods = {

View file

@ -1213,9 +1213,10 @@ overmem(dns_db_t *db, bool over) {
}
static void
settask(dns_db_t *db, isc_task_t *task) {
settask(dns_db_t *db, isc_task_t *task, isc_task_t *prunetask) {
UNUSED(db);
UNUSED(task);
UNUSED(prunetask);
}
/*

View file

@ -2339,7 +2339,7 @@ zone_load(dns_zone_t *zone, unsigned int flags, bool locked) {
isc_result_totext(result));
goto cleanup;
}
dns_db_settask(db, zone->task);
dns_db_settask(db, zone->task, zone->task);
if (zone->type == dns_zone_primary ||
zone->type == dns_zone_secondary || zone->type == dns_zone_mirror)
@ -14791,7 +14791,7 @@ ns_query(dns_zone_t *zone, dns_rdataset_t *soardataset, dns_stub_t *stub) {
isc_result_totext(result));
goto cleanup;
}
dns_db_settask(stub->db, zone->task);
dns_db_settask(stub->db, zone->task, zone->task);
}
result = dns_db_newversion(stub->db, &stub->version);
@ -16235,7 +16235,7 @@ dns_zone_settask(dns_zone_t *zone, isc_task_t *task) {
isc_task_attach(task, &zone->task);
ZONEDB_LOCK(&zone->dblock, isc_rwlocktype_read);
if (zone->db != NULL) {
dns_db_settask(zone->db, zone->task);
dns_db_settask(zone->db, zone->task, zone->task);
}
ZONEDB_UNLOCK(&zone->dblock, isc_rwlocktype_read);
UNLOCK_ZONE(zone);
@ -17508,7 +17508,7 @@ zone_replacedb(dns_zone_t *zone, dns_db_t *db, bool dump) {
zone_detachdb(zone);
}
zone_attachdb(zone, db);
dns_db_settask(zone->db, zone->task);
dns_db_settask(zone->db, zone->task, zone->task);
DNS_ZONE_SETFLAG(zone, DNS_ZONEFLG_LOADED | DNS_ZONEFLG_NEEDNOTIFY);
return (ISC_R_SUCCESS);