From 8b6245e298376c7eb95709de333fe58c2f4d63bc Mon Sep 17 00:00:00 2001 From: Aram Sargsyan Date: Tue, 1 Mar 2022 12:47:24 +0000 Subject: [PATCH 1/3] Add a missing UNLOCK_LOOKUP There was a missing UNLOCK_LOOKUP in the recv_done() callback when the operation had been canceled. That omission could result in a deadlock situation. (cherry picked from commit 98820aef7ed413f12486dacc5a096bea0d0cd951) --- bin/dig/dighost.c | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/dig/dighost.c b/bin/dig/dighost.c index e09c547e3c..3d205583a4 100644 --- a/bin/dig/dighost.c +++ b/bin/dig/dighost.c @@ -3583,6 +3583,7 @@ recv_done(isc_nmhandle_t *handle, isc_result_t eresult, isc_region_t *region, debug("recv_done: cancel"); isc_nmhandle_detach(&query->readhandle); query_detach(&query); + UNLOCK_LOOKUP; return; } From 5f5e1c0ac25f440b032a8cb9fd3b2f93205d91b1 Mon Sep 17 00:00:00 2001 From: Aram Sargsyan Date: Wed, 2 Mar 2022 00:06:34 +0000 Subject: [PATCH 2/3] Fix query context management issues in dighost.c For the reference, the _cancel_lookup() function iterates through the lookup's queries list and detaches them. In the ideal scenario, that should be the last reference and the query will be destroyed after that, but it is also possible that we are still expecting a callback, which also holds a reference (for example, _cancel_lookup() could have been called from recv_done(), when send_done() was still not executed). The start_udp() and start_tcp() functions are currently designed in slightly different ways: start_udp() creates a new query attachment `connectquery`, to be called in the callback function, while start_tcp() does not, which is a bug, but is hidden by the fact that when the query is being erroneously destroyed prematurely (before _cancel_lookup() is called) in the result of that, it also gets de-listed from the lookup's queries' list, so _cancel_lookup() doesn't even try to detach it. For better understanding, here's an illustration of the query's references count changes, and from where it was changed: UDP --- 1. _new_query() -> refcount = 1 (initial) 2. start_udp() -> refcount = 2 (lookup->current_query) 3. start_udp() -> refcount = 3 (connectquery) 4. udp_ready() -> refcount = 4 (readquery) 5. udp_ready() -> refcount = 5 (sendquery) 6. udp_ready() -> refcount = 4 (lookup->current_query) 7. udp_ready() -> refcount = 3 (connectquery) 8. send_done() -> refcount = 2 (sendquery) 9. recv_done() -> refcount = 1 (readquery) 10. _cancel_lookup() -> refcount = 0 (initial) 11. the query gets destroyed and removed from `lookup->q` TCP, fortunate scenario ----------------------- 1. _new_query() -> refcount = 1 (initial) 2. start_tcp() -> refcount = 2 (lookup->current_query) 3. launch_next_query() -> refcount = 3 (readquery) 4. launch_next_query() -> refcount = 4 (sendquery) 5. tcp_connected() -> refcount = 3 (lookup->current_query) 6. tcp_connected() -> refcount = 2 (bug, there was no connectquery) 7. send_done() -> refcount = 1 (sendquery) 8. recv_done() -> refcount = 0 (readquery) 9. the query gets prematurely destroyed and removed from `lookup->q` 10. _cancel_lookup() -> the query is not in `lookup->q` TCP, unfortunate scenario, revealing the bug -------------------------------------------- 1. _new_query() -> refcount = 1 (initial) 2. start_tcp() -> refcount = 2 (lookup->current_query) 3. launch_next_query() -> refcount = 3 (readquery) 4. launch_next_query() -> refcount = 4 (sendquery) 5. tcp_connected() -> refcount = 3 (lookup->current_query) 6. tcp_connected() -> refcount = 2 (bug, there was no connectquery) 7. recv_done() -> refcount = 1 (readquery) 8. _cancel_lookup() -> refcount = 0 (the query was in `lookup->q`) 9. we hit an assertion here when trying to destroy the query, because sendhandle is not detached (which is done by send_done()). 10. send_done() -> this never happens This commit does the following: 1. Add a `connectquery` attachment in start_tcp(), like done in start_udp(). 2. Add missing _cancel_lookup() calls for error scenarios, which were possibly missing because before fixing the bug, calling _cancel_lookup() and then calling query_detach() would cause an assertion. 3. Log a debug message and call isc_nm_cancelread(query->readhandle) for every query in the lookup from inside the _cancel_lookup() function, like it is done in _cancel_all(). 4. Add a `canceled` property for the query which becomes `true` when the lookup (and subsequently, its queries) are canceled. 5. Use the `canceled` property in the network manager callbacks to know that the query was canceled, and act like `eresult` was equal to `ISC_R_CANCELED`. (cherry picked from commit 4043fe909055b99b9aeec644f787b964d9c75192) --- bin/dig/dighost.c | 85 +++++++++++++++++++++++++++++++---------------- bin/dig/dighost.h | 1 + 2 files changed, 57 insertions(+), 29 deletions(-) diff --git a/bin/dig/dighost.c b/bin/dig/dighost.c index 3d205583a4..faad62c13c 100644 --- a/bin/dig/dighost.c +++ b/bin/dig/dighost.c @@ -2616,17 +2616,26 @@ send_done(isc_nmhandle_t *handle, isc_result_t eresult, void *arg) { isc_nmhandle_detach(&query->sendhandle); - if (eresult != ISC_R_SUCCESS) { - if (eresult != ISC_R_CANCELED) { - debug("send failed: %s", isc_result_totext(eresult)); + lookup_attach(query->lookup, &l); + + if (eresult == ISC_R_CANCELED || query->canceled) { + debug("send_done: cancel"); + if (!query->canceled) { + cancel_lookup(l); } query_detach(&query); - + lookup_detach(&l); + UNLOCK_LOOKUP; + return; + } else if (eresult != ISC_R_SUCCESS) { + debug("send failed: %s", isc_result_totext(eresult)); + cancel_lookup(l); + query_detach(&query); + lookup_detach(&l); UNLOCK_LOOKUP; return; } - lookup_attach(query->lookup, &l); if (l->ns_search_only && !l->trace_root && !l->tcp_mode) { debug("sending next, since searching"); next = ISC_LIST_NEXT(query, link); @@ -2666,6 +2675,12 @@ _cancel_lookup(dig_lookup_t *lookup, const char *file, unsigned int line) { REQUIRE(DIG_VALID_QUERY(query)); next = ISC_LIST_NEXT(query, link); ISC_LIST_DEQUEUE(lookup->q, query, link); + debug("canceling pending query %p, belonging to %p", query, + query->lookup); + query->canceled = true; + if (query->readhandle != NULL) { + isc_nm_cancelread(query->readhandle); + } query_detach(&query); query = next; } @@ -2685,7 +2700,8 @@ tcp_connected(isc_nmhandle_t *handle, isc_result_t eresult, void *arg); static void start_tcp(dig_query_t *query) { isc_result_t result; - dig_query_t *next; + dig_query_t *next = NULL; + dig_query_t *connectquery = NULL; REQUIRE(DIG_VALID_QUERY(query)); debug("start_tcp(%p)", query); @@ -2775,13 +2791,15 @@ start_tcp(dig_query_t *query) { REQUIRE(query != NULL); + query_attach(query, &connectquery); + if (query->lookup->tls_mode) { result = isc_tlsctx_createclient(&query->tlsctx); RUNTIME_CHECK(result == ISC_R_SUCCESS); isc_tlsctx_enable_dot_client_alpn(query->tlsctx); isc_nm_tlsdnsconnect(netmgr, &localaddr, &query->sockaddr, tcp_connected, - query, local_timeout, 0, + connectquery, local_timeout, 0, query->tlsctx); #if HAVE_LIBNGHTTP2 } else if (query->lookup->https_mode) { @@ -2801,13 +2819,13 @@ start_tcp(dig_query_t *query) { isc_nm_httpconnect(netmgr, &localaddr, &query->sockaddr, uri, !query->lookup->https_get, - tcp_connected, query, query->tlsctx, - local_timeout, 0); + tcp_connected, connectquery, + query->tlsctx, local_timeout, 0); #endif } else { isc_nm_tcpdnsconnect(netmgr, &localaddr, &query->sockaddr, tcp_connected, - query, local_timeout, 0); + connectquery, local_timeout, 0); } /* XXX: set DSCP */ @@ -2878,20 +2896,23 @@ udp_ready(isc_nmhandle_t *handle, isc_result_t eresult, void *arg) { dig_query_t *readquery = NULL; int local_timeout = timeout * 1000; - if (eresult == ISC_R_CANCELED) { + if (eresult == ISC_R_CANCELED || query->canceled) { + dig_lookup_t *l = query->lookup; + + debug("in cancel handler"); query_detach(&query); + if (!query->canceled) { + cancel_lookup(l); + } + lookup_detach(&l); return; } else if (eresult != ISC_R_SUCCESS) { dig_lookup_t *l = query->lookup; - if (eresult != ISC_R_CANCELED) { - debug("udp setup failed: %s", - isc_result_totext(eresult)); - } - + debug("udp setup failed: %s", isc_result_totext(eresult)); + query_detach(&query); cancel_lookup(l); lookup_detach(&l); - query_detach(&query); return; } @@ -3143,8 +3164,6 @@ launch_next_query(dig_query_t *query) { debug("have local timeout of %d", local_timeout); isc_nmhandle_settimeout(query->handle, local_timeout); - query_attach(query, &readquery); - xfr = query->lookup->rdtype == dns_rdatatype_ixfr || query->lookup->rdtype == dns_rdatatype_axfr; if (xfr && isc_nm_socket_type(query->handle) == isc_nm_tlsdnssocket && @@ -3163,6 +3182,8 @@ launch_next_query(dig_query_t *query) { return; } + query_attach(query, &readquery); + isc_nm_read(query->handle, recv_done, readquery); if (!query->first_soa_rcvd) { @@ -3233,9 +3254,12 @@ tcp_connected(isc_nmhandle_t *handle, isc_result_t eresult, void *arg) { isc_tlsctx_free(&query->tlsctx); } - if (eresult == ISC_R_CANCELED) { + if (eresult == ISC_R_CANCELED || query->canceled) { debug("in cancel handler"); isc_sockaddr_format(&query->sockaddr, sockstr, sizeof(sockstr)); + if (!query->canceled) { + cancel_lookup(l); + } query_detach(&query); lookup_detach(&l); clear_current_lookup(); @@ -3245,12 +3269,9 @@ tcp_connected(isc_nmhandle_t *handle, isc_result_t eresult, void *arg) { debug("unsuccessful connection: %s", isc_result_totext(eresult)); isc_sockaddr_format(&query->sockaddr, sockstr, sizeof(sockstr)); - if (eresult != ISC_R_CANCELED) { - dighost_warning("Connection to %s(%s) for %s failed: " - "%s.", - sockstr, query->servname, l->textname, - isc_result_totext(eresult)); - } + dighost_warning("Connection to %s(%s) for %s failed: %s.", + sockstr, query->servname, l->textname, + isc_result_totext(eresult)); /* XXX Clean up exitcodes */ if (exitcode < 9) { @@ -3271,6 +3292,7 @@ tcp_connected(isc_nmhandle_t *handle, isc_result_t eresult, void *arg) { next = NULL; } + cancel_lookup(l); query_detach(&query); lookup_detach(&l); @@ -3579,16 +3601,20 @@ recv_done(isc_nmhandle_t *handle, isc_result_t eresult, isc_region_t *region, isc_refcount_decrement0(&recvcount); debug("recvcount=%" PRIuFAST32, isc_refcount_current(&recvcount)); - if (eresult == ISC_R_CANCELED) { + lookup_attach(query->lookup, &l); + + if (eresult == ISC_R_CANCELED || query->canceled) { debug("recv_done: cancel"); isc_nmhandle_detach(&query->readhandle); + if (!query->canceled) { + cancel_lookup(l); + } query_detach(&query); + lookup_detach(&l); UNLOCK_LOOKUP; return; } - lookup_attach(query->lookup, &l); - if (query->lookup->use_usec) { TIME_NOW_HIRES(&query->time_recv); } else { @@ -4209,6 +4235,7 @@ cancel_all(void) { nq = ISC_LIST_NEXT(q, link); debug("canceling pending query %p, belonging to %p", q, current_lookup); + q->canceled = true; if (q->readhandle != NULL) { isc_nm_cancelread(q->readhandle); } diff --git a/bin/dig/dighost.h b/bin/dig/dighost.h index 618e975096..e38e319f53 100644 --- a/bin/dig/dighost.h +++ b/bin/dig/dighost.h @@ -188,6 +188,7 @@ struct dig_query { bool second_rr_rcvd; bool first_repeat_rcvd; bool warn_id; + bool canceled; uint32_t first_rr_serial; uint32_t second_rr_serial; uint32_t msg_count; From 97608c25fdc475f65ca4781a0d1763327491195f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Sur=C3=BD?= Date: Wed, 2 Mar 2022 09:14:19 +0100 Subject: [PATCH 3/3] Add CHANGES note for [GL #3184] (cherry picked from commit f3228df622098c9735f8e0e28ad4ddf314e9a164) --- CHANGES | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGES b/CHANGES index 5cd237b445..cc47d1489a 100644 --- a/CHANGES +++ b/CHANGES @@ -4,6 +4,9 @@ and the dnssec-policy is rejected if both roles are not present for all algorithms in use. [GL #3142] +5821. [bug] Fix query context management issues in the TCP part + of dig. [GL #3184] + 5816. [bug] Make BIND compile with LibreSSL 3.5.0, as it was using not very accurate pre-processor checks for using shims. [GL #3172]