From 156a08e327f88b629dba7cab815a3d00ed9452b8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Sur=C3=BD?= Date: Wed, 21 Feb 2024 16:58:46 +0100 Subject: [PATCH 1/3] Reduce lock contention during RBTDB tree pruning The log message for commit a9af1ac5ae67d266a913ba8a7118ac6f47723d95 explained: In some older BIND 9 branches, the extra queuing overhead eliminated by this change could be remotely exploited to cause excessive memory use. Due to architectural shift, this branch is not vulnerable to that issue, but applying the fix to the latter is nevertheless deemed prudent for consistency and to make the code future-proof. However, it turned out that having a single queue for the nodes to be pruned increased lock contention to a level where cleaning up nodes from the RBTDB took too long, causing the amount of memory used by the cache to grow indefinitely over time. This commit reverts the change to the pruning mechanism introduced by commit a9af1ac5ae67d266a913ba8a7118ac6f47723d95 as BIND branches newer than 9.16 were not affected by the excessive event queueing overhead issue mentioned in the log message for the above commit. (cherry picked from commit eed17611d8ceeb99ec3ff282aaea4d38bd334932) (cherry picked from commit 4b32456705a9812ee21b815da361fe1adf1f1b94) --- lib/dns/include/dns/rbt.h | 6 -- lib/dns/rbt.c | 1 - lib/dns/rbtdb.c | 149 +++++++++++++------------------------- 3 files changed, 50 insertions(+), 106 deletions(-) diff --git a/lib/dns/include/dns/rbt.h b/lib/dns/include/dns/rbt.h index 40bf09ee77..c144f83b49 100644 --- a/lib/dns/include/dns/rbt.h +++ b/lib/dns/include/dns/rbt.h @@ -124,12 +124,6 @@ struct dns_rbtnode { */ ISC_LINK(dns_rbtnode_t) deadlink; - /*% - * This linked list is used to store nodes from which tree pruning can - * be started. - */ - ISC_LINK(dns_rbtnode_t) prunelink; - /*@{*/ /*! * These values are used in the RBT DB implementation. The appropriate diff --git a/lib/dns/rbt.c b/lib/dns/rbt.c index 29f19c895b..4d8c142fc2 100644 --- a/lib/dns/rbt.c +++ b/lib/dns/rbt.c @@ -1576,7 +1576,6 @@ create_node(isc_mem_t *mctx, const dns_name_t *name, dns_rbtnode_t **nodep) { HASHVAL(node) = 0; ISC_LINK_INIT(node, deadlink); - ISC_LINK_INIT(node, prunelink); LOCKNUM(node) = 0; WILD(node) = 0; diff --git a/lib/dns/rbtdb.c b/lib/dns/rbtdb.c index b9e58a42b3..f5f99c0907 100644 --- a/lib/dns/rbtdb.c +++ b/lib/dns/rbtdb.c @@ -494,10 +494,6 @@ struct dns_rbtdb { */ rbtnodelist_t *deadnodes; - /* List of nodes from which recursive tree pruning can be started from. - * Locked by tree_lock. */ - rbtnodelist_t prunenodes; - /* * Heaps. These are used for TTL based expiry in a cache, * or for zone resigning in a zone DB. hmctx is the memory @@ -1007,7 +1003,6 @@ free_rbtdb(dns_rbtdb_t *rbtdb, bool log, isc_event_t *event) { unsigned int i; isc_result_t result; char buf[DNS_NAME_FORMATSIZE]; - dns_rbtnode_t *node = NULL; dns_rbt_t **treep; isc_time_t start; @@ -1033,6 +1028,8 @@ free_rbtdb(dns_rbtdb_t *rbtdb, bool log, isc_event_t *event) { * the overhead of unlinking all nodes here should be negligible. */ for (i = 0; i < rbtdb->node_lock_count; i++) { + dns_rbtnode_t *node; + node = ISC_LIST_HEAD(rbtdb->deadnodes[i]); while (node != NULL) { ISC_LIST_UNLINK(rbtdb->deadnodes[i], node, deadlink); @@ -1040,12 +1037,6 @@ free_rbtdb(dns_rbtdb_t *rbtdb, bool log, isc_event_t *event) { } } - node = ISC_LIST_HEAD(rbtdb->prunenodes); - while (node != NULL) { - ISC_LIST_UNLINK(rbtdb->prunenodes, node, prunelink); - node = ISC_LIST_HEAD(rbtdb->prunenodes); - } - if (event == NULL) { rbtdb->quantum = (rbtdb->task != NULL) ? 100 : 0; } @@ -1851,32 +1842,19 @@ is_leaf(dns_rbtnode_t *node) { node->left == NULL && node->right == NULL); } -/*% - * The tree lock must be held when this function is called as it reads and - * updates rbtdb->prunenodes. - */ static void send_to_prune_tree(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, isc_rwlocktype_t locktype) { - bool pruning_queued = (ISC_LIST_HEAD(rbtdb->prunenodes) != NULL); - - INSIST(locktype == isc_rwlocktype_write); + isc_event_t *ev; + dns_db_t *db; + ev = isc_event_allocate(rbtdb->common.mctx, NULL, DNS_EVENT_RBTPRUNE, + prune_tree, node, sizeof(isc_event_t)); new_reference(rbtdb, node, locktype); - INSIST(!ISC_LINK_LINKED(node, prunelink)); - ISC_LIST_APPEND(rbtdb->prunenodes, node, prunelink); - - if (!pruning_queued) { - isc_event_t *ev = NULL; - dns_db_t *db = NULL; - - attach((dns_db_t *)rbtdb, &db); - - ev = isc_event_allocate(rbtdb->common.mctx, NULL, - DNS_EVENT_RBTPRUNE, prune_tree, db, - sizeof(isc_event_t)); - isc_task_send(rbtdb->task, &ev); - } + db = NULL; + attach((dns_db_t *)rbtdb, &db); + ev->ev_sender = db; + isc_task_send(rbtdb->task, &ev); } /*% @@ -2151,26 +2129,17 @@ restore_locks: } /* - * Prune the tree by recursively cleaning up single leaves. Go through all - * nodes stored in the rbtdb->prunenodes list; for each of them, in the worst - * case, it will be necessary to traverse a number of tree levels equal to the - * maximum legal number of domain name labels (127); in practice, the number of - * tree levels to traverse will virtually always be much smaller (a few levels - * at most). While holding the tree lock throughout this entire operation is - * less than ideal, so is splitting the latter up by queueing a separate - * prune_tree() run for each node to start pruning from (as queueing requires - * allocating memory and can therefore potentially be exploited to exhaust - * available memory). Also note that actually freeing up the memory used by - * RBTDB nodes (which is what this function does) is essential to keeping cache - * memory use in check, so since the tree lock needs to be acquired anyway, - * freeing as many nodes as possible before the tree lock gets released is - * prudent. + * Prune the tree by recursively cleaning-up single leaves. In the worst + * case, the number of iteration is the number of tree levels, which is at + * most the maximum number of domain name labels, i.e, 127. In practice, this + * should be much smaller (only a few times), and even the worst case would be + * acceptable for a single event. */ static void prune_tree(isc_task_t *task, isc_event_t *event) { - dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)event->ev_arg; - dns_rbtnode_t *node = NULL; - dns_rbtnode_t *parent = NULL; + dns_rbtdb_t *rbtdb = event->ev_sender; + dns_rbtnode_t *node = event->ev_arg; + dns_rbtnode_t *parent; unsigned int locknum; UNUSED(task); @@ -2178,60 +2147,44 @@ prune_tree(isc_task_t *task, isc_event_t *event) { isc_event_free(&event); RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write); + locknum = node->locknum; + NODE_LOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write); + do { + parent = node->parent; + decrement_reference(rbtdb, node, 0, isc_rwlocktype_write, + isc_rwlocktype_write, true); - while ((node = ISC_LIST_HEAD(rbtdb->prunenodes)) != NULL) { - locknum = node->locknum; - NODE_LOCK(&rbtdb->node_locks[locknum].lock, - isc_rwlocktype_write); - do { - if (ISC_LINK_LINKED(node, prunelink)) { - ISC_LIST_UNLINK(rbtdb->prunenodes, node, - prunelink); + if (parent != NULL && parent->down == NULL) { + /* + * node was the only down child of the parent and has + * just been removed. We'll then need to examine the + * parent. Keep the lock if possible; otherwise, + * release the old lock and acquire one for the parent. + */ + if (parent->locknum != locknum) { + NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, + isc_rwlocktype_write); + locknum = parent->locknum; + NODE_LOCK(&rbtdb->node_locks[locknum].lock, + isc_rwlocktype_write); } - parent = node->parent; - decrement_reference(rbtdb, node, 0, - isc_rwlocktype_write, - isc_rwlocktype_write, true); - - if (parent != NULL && parent->down == NULL) { - /* - * node was the only down child of the parent - * and has just been removed. We'll then need - * to examine the parent. Keep the lock if - * possible; otherwise, release the old lock and - * acquire one for the parent. - */ - if (parent->locknum != locknum) { - NODE_UNLOCK( - &rbtdb->node_locks[locknum].lock, - isc_rwlocktype_write); - locknum = parent->locknum; - NODE_LOCK( - &rbtdb->node_locks[locknum].lock, - isc_rwlocktype_write); - } - - /* - * We need to gain a reference to the node - * before decrementing it in the next iteration. - */ - if (ISC_LINK_LINKED(parent, deadlink)) { - ISC_LIST_UNLINK( - rbtdb->deadnodes[locknum], + /* + * We need to gain a reference to the node before + * decrementing it in the next iteration. + */ + if (ISC_LINK_LINKED(parent, deadlink)) { + ISC_LIST_UNLINK(rbtdb->deadnodes[locknum], parent, deadlink); - } - new_reference(rbtdb, parent, - isc_rwlocktype_write); - } else { - parent = NULL; } + new_reference(rbtdb, parent, isc_rwlocktype_write); + } else { + parent = NULL; + } - node = parent; - } while (node != NULL); - NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, - isc_rwlocktype_write); - } + node = parent; + } while (node != NULL); + NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write); RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write); detach((dns_db_t **)&rbtdb); @@ -8373,8 +8326,6 @@ dns_rbtdb_create(isc_mem_t *mctx, const dns_name_t *origin, dns_dbtype_t type, ISC_LIST_INIT(rbtdb->deadnodes[i]); } - ISC_LIST_INIT(rbtdb->prunenodes); - rbtdb->active = rbtdb->node_lock_count; for (i = 0; i < (int)(rbtdb->node_lock_count); i++) { From 29e9d4193b16443094db085e0b842b856a42a44a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Sur=C3=BD?= Date: Wed, 21 Feb 2024 13:32:09 +0100 Subject: [PATCH 2/3] Simplify the parent cleaning in the prune_tree() mechanism Instead of juggling with node locks in a cycle, cleanup the node we are just pruning and send any the parent that's also subject to the pruning to the prune tree via normal way (e.g. enqueue pruning on the parent). This simplifies the code and also spreads the pruning load across more event loop ticks which is better for lock contention as less things run in a tight loop. (cherry picked from commit 0b32d323e07d2468111aa725b36e72b0538e0521) (cherry picked from commit a4c225cb6dd6bfa409302528b817ea2a6699598d) --- lib/dns/rbtdb.c | 77 +++++++++++++++++-------------------------------- 1 file changed, 26 insertions(+), 51 deletions(-) diff --git a/lib/dns/rbtdb.c b/lib/dns/rbtdb.c index f5f99c0907..4239f1ed02 100644 --- a/lib/dns/rbtdb.c +++ b/lib/dns/rbtdb.c @@ -1820,8 +1820,9 @@ delete_node(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node) { */ static void new_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, - isc_rwlocktype_t locktype) { - if (locktype == isc_rwlocktype_write && ISC_LINK_LINKED(node, deadlink)) + isc_rwlocktype_t nlocktype) { + if (nlocktype == isc_rwlocktype_write && + ISC_LINK_LINKED(node, deadlink)) { ISC_LIST_UNLINK(rbtdb->deadnodes[node->locknum], node, deadlink); @@ -1844,13 +1845,13 @@ is_leaf(dns_rbtnode_t *node) { static void send_to_prune_tree(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, - isc_rwlocktype_t locktype) { + isc_rwlocktype_t nlocktype) { isc_event_t *ev; dns_db_t *db; ev = isc_event_allocate(rbtdb->common.mctx, NULL, DNS_EVENT_RBTPRUNE, prune_tree, node, sizeof(isc_event_t)); - new_reference(rbtdb, node, locktype); + new_reference(rbtdb, node, nlocktype); db = NULL; attach((dns_db_t *)rbtdb, &db); ev->ev_sender = db; @@ -2070,10 +2071,6 @@ decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, #undef KEEP_NODE if (write_locked) { - /* - * We can now delete the node. - */ - /* * If this node is the only one in the level it's in, deleting * this node may recursively make its parent the only node in @@ -2094,6 +2091,7 @@ decrement_reference(dns_rbtdb_t *rbtdb, dns_rbtnode_t *node, send_to_prune_tree(rbtdb, node, isc_rwlocktype_write); no_reference = false; } else { + /* We can now delete the node. */ delete_node(rbtdb, node); } } else { @@ -2129,62 +2127,39 @@ restore_locks: } /* - * Prune the tree by recursively cleaning-up single leaves. In the worst - * case, the number of iteration is the number of tree levels, which is at - * most the maximum number of domain name labels, i.e, 127. In practice, this - * should be much smaller (only a few times), and even the worst case would be - * acceptable for a single event. + * Prune the tree by cleaning up single leaves. A single execution of this + * function cleans up a single node; if the parent of the latter becomes a + * single leaf on its own level as a result, the parent is then also sent to + * this function. */ static void prune_tree(isc_task_t *task, isc_event_t *event) { dns_rbtdb_t *rbtdb = event->ev_sender; dns_rbtnode_t *node = event->ev_arg; - dns_rbtnode_t *parent; - unsigned int locknum; + dns_rbtnode_t *parent = NULL; + unsigned int locknum = node->locknum; UNUSED(task); isc_event_free(&event); RWLOCK(&rbtdb->tree_lock, isc_rwlocktype_write); - locknum = node->locknum; + + parent = node->parent; + NODE_LOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write); - do { - parent = node->parent; - decrement_reference(rbtdb, node, 0, isc_rwlocktype_write, - isc_rwlocktype_write, true); - - if (parent != NULL && parent->down == NULL) { - /* - * node was the only down child of the parent and has - * just been removed. We'll then need to examine the - * parent. Keep the lock if possible; otherwise, - * release the old lock and acquire one for the parent. - */ - if (parent->locknum != locknum) { - NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, - isc_rwlocktype_write); - locknum = parent->locknum; - NODE_LOCK(&rbtdb->node_locks[locknum].lock, - isc_rwlocktype_write); - } - - /* - * We need to gain a reference to the node before - * decrementing it in the next iteration. - */ - if (ISC_LINK_LINKED(parent, deadlink)) { - ISC_LIST_UNLINK(rbtdb->deadnodes[locknum], - parent, deadlink); - } - new_reference(rbtdb, parent, isc_rwlocktype_write); - } else { - parent = NULL; - } - - node = parent; - } while (node != NULL); + decrement_reference(rbtdb, node, 0, isc_rwlocktype_write, + isc_rwlocktype_write, true); NODE_UNLOCK(&rbtdb->node_locks[locknum].lock, isc_rwlocktype_write); + + if (parent != NULL && is_leaf(parent)) { + NODE_LOCK(&rbtdb->node_locks[parent->locknum].lock, + isc_rwlocktype_write); + send_to_prune_tree(rbtdb, parent, isc_rwlocktype_write); + NODE_UNLOCK(&rbtdb->node_locks[parent->locknum].lock, + isc_rwlocktype_write); + } + RWUNLOCK(&rbtdb->tree_lock, isc_rwlocktype_write); detach((dns_db_t **)&rbtdb); From 9e23c6967f626e9a7e62f0e0666b55eddb25af2c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Sur=C3=BD?= Date: Thu, 22 Feb 2024 08:56:46 +0100 Subject: [PATCH 3/3] Add CHANGES and release note for [GL #4596] (cherry picked from commit f44755766775ca5a0b08e73986a91675d5c294b9) (cherry picked from commit f255ab3bf7b9cd350708e720c51bf67707f6fc74) --- CHANGES | 4 ++++ doc/notes/notes-current.rst | 5 +++++ 2 files changed, 9 insertions(+) diff --git a/CHANGES b/CHANGES index 662106db0f..1cee1334fb 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,7 @@ +6352. [bug] Revert change 6319 and decrease lock contention during + RBTDB tree pruning by not cleaning up nodes recursively + within a single prune_tree() call. [GL #4596] + 6350. [bug] Address use after free in expire_lru_headers. [GL #4495] --- 9.18.24 released --- diff --git a/doc/notes/notes-current.rst b/doc/notes/notes-current.rst index 16e6a84f35..cd16ecdbec 100644 --- a/doc/notes/notes-current.rst +++ b/doc/notes/notes-current.rst @@ -41,6 +41,11 @@ Bug Fixes ISC would like to thank to Jinmei Tatuya from Infoblox for bringing this issue to our attention. +- A regression in cache-cleaning code enabled memory use to grow + significantly more quickly than before, until the configured + :any:`max-cache-size` limit was reached. This has been fixed. + :gl:`#4596` + Known Issues ~~~~~~~~~~~~