From 9298dcebbdcf9f4915f1c8bbe85d8bd4bad1af40 Mon Sep 17 00:00:00 2001 From: Diego Fronza Date: Fri, 16 Apr 2021 12:21:31 -0300 Subject: [PATCH 1/3] Fix deadlock between rndc addzone/delzone/modzone It follows a description of the steps that were leading to the deadlock: 1. `do_addzone` calls `isc_task_beginexclusive`. 2. `isc_task_beginexclusive` waits for (N_WORKERS - 1) halted tasks, this blocks waiting for those (no. workers -1) workers to halt. ... isc_task_beginexclusive(isc_task_t *task0) { ... while (manager->halted + 1 < manager->workers) { wake_all_queues(manager); WAIT(&manager->halt_cond, &manager->halt_lock); } ``` 3. It is possible that in `task.c / dispatch()` a worker is running a task event, if that event blocks it will not allow this worker to halt. 4. `do_addzone` acquires `LOCK(&view->new_zone_lock);`, 5. `rmzone` event is called from some worker's `dispatch()`, `rmzone` blocks waiting for the same lock. 6. `do_addzone` calls `isc_task_beginexclusive`. 7. Deadlock triggered, since: - `rmzone` is wating for the lock. - `isc_task_beginexclusive` is waiting for (no. workers - 1) to be halted - since `rmzone` event is blocked it won't allow the worker to halt. To fix this, we updated do_addzone code to call isc_task_beginexclusive before the lock is acquired, we postpone locking to the nearest required place, same for isc_task_beginexclusive. The same could happen with rndc modzone, so that was addressed as well. --- bin/named/server.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/bin/named/server.c b/bin/named/server.c index 6f050af36d..f26f867f90 100644 --- a/bin/named/server.c +++ b/bin/named/server.c @@ -13683,13 +13683,13 @@ do_addzone(named_server_t *server, ns_cfgctx_t *cfg, dns_view_t *view, #ifndef HAVE_LMDB FILE *fp = NULL; bool cleanup_config = false; -#else /* HAVE_LMDB */ +#else /* HAVE_LMDB */ MDB_txn *txn = NULL; MDB_dbi dbi; + bool locked = false; UNUSED(zoneconf); - LOCK(&view->new_zone_lock); -#endif /* HAVE_LMDB */ +#endif /* Zone shouldn't already exist */ if (redirect) { @@ -13709,12 +13709,16 @@ do_addzone(named_server_t *server, ns_cfgctx_t *cfg, dns_view_t *view, goto cleanup; } + result = isc_task_beginexclusive(server->task); + RUNTIME_CHECK(result == ISC_R_SUCCESS); + #ifndef HAVE_LMDB /* * Make sure we can open the configuration save file */ result = isc_stdio_open(view->new_zone_file, "a", &fp); if (result != ISC_R_SUCCESS) { + isc_task_endexclusive(server->task); TCHECK(putstr(text, "unable to create '")); TCHECK(putstr(text, view->new_zone_file)); TCHECK(putstr(text, "': ")); @@ -13725,9 +13729,12 @@ do_addzone(named_server_t *server, ns_cfgctx_t *cfg, dns_view_t *view, (void)isc_stdio_close(fp); fp = NULL; #else /* HAVE_LMDB */ + LOCK(&view->new_zone_lock); + locked = true; /* Make sure we can open the NZD database */ result = nzd_writable(view); if (result != ISC_R_SUCCESS) { + isc_task_endexclusive(server->task); TCHECK(putstr(text, "unable to open NZD database for '")); TCHECK(putstr(text, view->new_zone_db)); TCHECK(putstr(text, "'")); @@ -13736,9 +13743,6 @@ do_addzone(named_server_t *server, ns_cfgctx_t *cfg, dns_view_t *view, } #endif /* HAVE_LMDB */ - result = isc_task_beginexclusive(server->task); - RUNTIME_CHECK(result == ISC_R_SUCCESS); - /* Mark view unfrozen and configure zone */ dns_view_thaw(view); result = configure_zone(cfg->config, zoneobj, cfg->vconfig, @@ -13842,7 +13846,9 @@ cleanup: if (txn != NULL) { (void)nzd_close(&txn, false); } - UNLOCK(&view->new_zone_lock); + if (locked) { + UNLOCK(&view->new_zone_lock); + } #endif /* HAVE_LMDB */ if (zone != NULL) { @@ -13866,7 +13872,7 @@ do_modzone(named_server_t *server, ns_cfgctx_t *cfg, dns_view_t *view, #else /* HAVE_LMDB */ MDB_txn *txn = NULL; MDB_dbi dbi; - LOCK(&view->new_zone_lock); + bool locked = false; #endif /* HAVE_LMDB */ /* Zone must already exist */ @@ -13912,6 +13918,8 @@ do_modzone(named_server_t *server, ns_cfgctx_t *cfg, dns_view_t *view, (void)isc_stdio_close(fp); fp = NULL; #else /* HAVE_LMDB */ + LOCK(&view->new_zone_lock); + locked = true; /* Make sure we can open the NZD database */ result = nzd_writable(view); if (result != ISC_R_SUCCESS) { @@ -14064,7 +14072,9 @@ cleanup: if (txn != NULL) { (void)nzd_close(&txn, false); } - UNLOCK(&view->new_zone_lock); + if (locked) { + UNLOCK(&view->new_zone_lock); + } #endif /* HAVE_LMDB */ if (zone != NULL) { From d6224035d88a85f5f16f6072aa0f54b9936e73d8 Mon Sep 17 00:00:00 2001 From: Diego Fronza Date: Mon, 19 Apr 2021 14:23:31 -0300 Subject: [PATCH 2/3] Add system test for the deadlock fix The test spawns 4 parallel workers that keep adding, modifying and deleting zones, the main thread repeatedly checks wheter rndc status responds within a reasonable period. While environment and timing issues may affect the test, in most test cases the deadlock that was taking place before the fix used to trigger in less than 7 seconds in a machine with at least 2 cores. --- bin/tests/system/addzone/ns3/example.db | 2 + .../system/addzone/tests_rndc_deadlock.py | 90 +++++++++++++++++++ util/copyrights | 1 + 3 files changed, 93 insertions(+) create mode 100644 bin/tests/system/addzone/ns3/example.db create mode 100755 bin/tests/system/addzone/tests_rndc_deadlock.py diff --git a/bin/tests/system/addzone/ns3/example.db b/bin/tests/system/addzone/ns3/example.db new file mode 100644 index 0000000000..4f150a030a --- /dev/null +++ b/bin/tests/system/addzone/ns3/example.db @@ -0,0 +1,2 @@ +@ IN SOA localhost. localhost.localhost. 1 10800 3600 605800 86400 +@ IN NS localhost. diff --git a/bin/tests/system/addzone/tests_rndc_deadlock.py b/bin/tests/system/addzone/tests_rndc_deadlock.py new file mode 100755 index 0000000000..852c551a0d --- /dev/null +++ b/bin/tests/system/addzone/tests_rndc_deadlock.py @@ -0,0 +1,90 @@ +############################################################################ +# Copyright (C) Internet Systems Consortium, Inc. ("ISC") +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, you can obtain one at https://mozilla.org/MPL/2.0/. +# +# See the COPYRIGHT file distributed with this work for additional +# information regarding copyright ownership. +############################################################################ + +import concurrent.futures +import os +import subprocess +import time + + +def run_rndc(server, rndc_command): + ''' + Send the specified 'rndc_command' to 'server' with a timeout of 2 seconds + ''' + rndc = os.getenv('RNDC') + port = os.getenv('CONTROLPORT') + + cmdline = [rndc, '-c', '../common/rndc.conf', '-p', port, '-s', server] + cmdline.extend(rndc_command) + + subprocess.check_output(cmdline, stderr=subprocess.STDOUT, timeout=2) + + +def rndc_loop(test_state, domain): + ''' + Run "rndc addzone", "rndc modzone", and "rndc delzone" in a tight loop + until the test is considered finished, ignoring errors + ''' + rndc_commands = [ + ['addzone', domain, + '{ type master; file "example.db"; };'], + ['modzone', domain, + '{ type master; file "example.db"; allow-transfer { any; }; };'], + ['delzone', domain], + ] + + while not test_state['finished']: + for command in rndc_commands: + try: + run_rndc('10.53.0.3', command) + except subprocess.SubprocessError: + pass + + +def check_if_server_is_responsive(): + ''' + Check if server status can be successfully retrieved using "rndc status" + ''' + try: + run_rndc('10.53.0.3', ['status']) + return True + except subprocess.SubprocessError: + return False + + +def test_rndc_deadlock(): + ''' + Test whether running "rndc addzone", "rndc modzone", and "rndc delzone" + commands concurrently does not trigger a deadlock + ''' + test_state = {'finished': False} + + # Create 4 worker threads running "rndc" commands in a loop. + executor = concurrent.futures.ThreadPoolExecutor() + for i in range(1, 5): + domain = 'example%d' % i + executor.submit(rndc_loop, test_state, domain) + + # Run "rndc status" in 1-second intervals for a maximum of 10 seconds. If + # any "rndc status" command fails, the loop will be interrupted. + server_is_responsive = True + attempts = 10 + while server_is_responsive and attempts > 0: + server_is_responsive = check_if_server_is_responsive() + attempts -= 1 + time.sleep(1) + + # Signal worker threads that the test is finished. + test_state['finished'] = True + executor.shutdown() + + # Check whether all "rndc status" commands succeeded. + assert server_is_responsive diff --git a/util/copyrights b/util/copyrights index 92de9fbad9..13f643718c 100644 --- a/util/copyrights +++ b/util/copyrights @@ -221,6 +221,7 @@ ./bin/tests/system/addzone/ns2/default.nzf.in X 2010,2018,2019,2020 ./bin/tests/system/addzone/setup.sh SH 2010,2012,2013,2014,2016,2017,2018,2019,2020,2021 ./bin/tests/system/addzone/tests.sh SH 2010,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020,2021 +./bin/tests/system/addzone/tests_rndc_deadlock.py PYTHON 2021 ./bin/tests/system/allow-query/clean.sh SH 2010,2012,2014,2015,2016,2018,2019,2020,2021 ./bin/tests/system/allow-query/ns3/named.args X 2018,2019,2020,2021 ./bin/tests/system/allow-query/setup.sh SH 2010,2012,2016,2018,2019,2020,2021 From 6646655067fcb8777979b7d981ae8ebd07939273 Mon Sep 17 00:00:00 2001 From: Diego Fronza Date: Fri, 16 Apr 2021 13:28:08 -0300 Subject: [PATCH 3/3] Add CHANGES note for GL #2626 --- CHANGES | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGES b/CHANGES index f22fab71c8..1ad66df3d3 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,6 @@ +5625. [bug] Address deadlock between rndc addzone/delzone. + [GL #2626] + 5624. [func] Remove the taskmgr dispatch threads and run the tasks on top of netmgr loops. [GL #2638]