From 98820aef7ed413f12486dacc5a096bea0d0cd951 Mon Sep 17 00:00:00 2001 From: Aram Sargsyan Date: Tue, 1 Mar 2022 12:47:24 +0000 Subject: [PATCH 1/3] Add a missing UNLOCK_LOOKUP There was a missing UNLOCK_LOOKUP in the recv_done() callback when the operation had been canceled. That omission could result in a deadlock situation. --- bin/dig/dighost.c | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/dig/dighost.c b/bin/dig/dighost.c index e09c547e3c..3d205583a4 100644 --- a/bin/dig/dighost.c +++ b/bin/dig/dighost.c @@ -3583,6 +3583,7 @@ recv_done(isc_nmhandle_t *handle, isc_result_t eresult, isc_region_t *region, debug("recv_done: cancel"); isc_nmhandle_detach(&query->readhandle); query_detach(&query); + UNLOCK_LOOKUP; return; } From 4043fe909055b99b9aeec644f787b964d9c75192 Mon Sep 17 00:00:00 2001 From: Aram Sargsyan Date: Wed, 2 Mar 2022 00:06:34 +0000 Subject: [PATCH 2/3] Fix query context management issues in dighost.c For the reference, the _cancel_lookup() function iterates through the lookup's queries list and detaches them. In the ideal scenario, that should be the last reference and the query will be destroyed after that, but it is also possible that we are still expecting a callback, which also holds a reference (for example, _cancel_lookup() could have been called from recv_done(), when send_done() was still not executed). The start_udp() and start_tcp() functions are currently designed in slightly different ways: start_udp() creates a new query attachment `connectquery`, to be called in the callback function, while start_tcp() does not, which is a bug, but is hidden by the fact that when the query is being erroneously destroyed prematurely (before _cancel_lookup() is called) in the result of that, it also gets de-listed from the lookup's queries' list, so _cancel_lookup() doesn't even try to detach it. For better understanding, here's an illustration of the query's references count changes, and from where it was changed: UDP --- 1. _new_query() -> refcount = 1 (initial) 2. start_udp() -> refcount = 2 (lookup->current_query) 3. start_udp() -> refcount = 3 (connectquery) 4. udp_ready() -> refcount = 4 (readquery) 5. udp_ready() -> refcount = 5 (sendquery) 6. udp_ready() -> refcount = 4 (lookup->current_query) 7. udp_ready() -> refcount = 3 (connectquery) 8. send_done() -> refcount = 2 (sendquery) 9. recv_done() -> refcount = 1 (readquery) 10. _cancel_lookup() -> refcount = 0 (initial) 11. the query gets destroyed and removed from `lookup->q` TCP, fortunate scenario ----------------------- 1. _new_query() -> refcount = 1 (initial) 2. start_tcp() -> refcount = 2 (lookup->current_query) 3. launch_next_query() -> refcount = 3 (readquery) 4. launch_next_query() -> refcount = 4 (sendquery) 5. tcp_connected() -> refcount = 3 (lookup->current_query) 6. tcp_connected() -> refcount = 2 (bug, there was no connectquery) 7. send_done() -> refcount = 1 (sendquery) 8. recv_done() -> refcount = 0 (readquery) 9. the query gets prematurely destroyed and removed from `lookup->q` 10. _cancel_lookup() -> the query is not in `lookup->q` TCP, unfortunate scenario, revealing the bug -------------------------------------------- 1. _new_query() -> refcount = 1 (initial) 2. start_tcp() -> refcount = 2 (lookup->current_query) 3. launch_next_query() -> refcount = 3 (readquery) 4. launch_next_query() -> refcount = 4 (sendquery) 5. tcp_connected() -> refcount = 3 (lookup->current_query) 6. tcp_connected() -> refcount = 2 (bug, there was no connectquery) 7. recv_done() -> refcount = 1 (readquery) 8. _cancel_lookup() -> refcount = 0 (the query was in `lookup->q`) 9. we hit an assertion here when trying to destroy the query, because sendhandle is not detached (which is done by send_done()). 10. send_done() -> this never happens This commit does the following: 1. Add a `connectquery` attachment in start_tcp(), like done in start_udp(). 2. Add missing _cancel_lookup() calls for error scenarios, which were possibly missing because before fixing the bug, calling _cancel_lookup() and then calling query_detach() would cause an assertion. 3. Log a debug message and call isc_nm_cancelread(query->readhandle) for every query in the lookup from inside the _cancel_lookup() function, like it is done in _cancel_all(). 4. Add a `canceled` property for the query which becomes `true` when the lookup (and subsequently, its queries) are canceled. 5. Use the `canceled` property in the network manager callbacks to know that the query was canceled, and act like `eresult` was equal to `ISC_R_CANCELED`. --- bin/dig/dighost.c | 85 +++++++++++++++++++++++++++++++---------------- bin/dig/dighost.h | 1 + 2 files changed, 57 insertions(+), 29 deletions(-) diff --git a/bin/dig/dighost.c b/bin/dig/dighost.c index 3d205583a4..faad62c13c 100644 --- a/bin/dig/dighost.c +++ b/bin/dig/dighost.c @@ -2616,17 +2616,26 @@ send_done(isc_nmhandle_t *handle, isc_result_t eresult, void *arg) { isc_nmhandle_detach(&query->sendhandle); - if (eresult != ISC_R_SUCCESS) { - if (eresult != ISC_R_CANCELED) { - debug("send failed: %s", isc_result_totext(eresult)); + lookup_attach(query->lookup, &l); + + if (eresult == ISC_R_CANCELED || query->canceled) { + debug("send_done: cancel"); + if (!query->canceled) { + cancel_lookup(l); } query_detach(&query); - + lookup_detach(&l); + UNLOCK_LOOKUP; + return; + } else if (eresult != ISC_R_SUCCESS) { + debug("send failed: %s", isc_result_totext(eresult)); + cancel_lookup(l); + query_detach(&query); + lookup_detach(&l); UNLOCK_LOOKUP; return; } - lookup_attach(query->lookup, &l); if (l->ns_search_only && !l->trace_root && !l->tcp_mode) { debug("sending next, since searching"); next = ISC_LIST_NEXT(query, link); @@ -2666,6 +2675,12 @@ _cancel_lookup(dig_lookup_t *lookup, const char *file, unsigned int line) { REQUIRE(DIG_VALID_QUERY(query)); next = ISC_LIST_NEXT(query, link); ISC_LIST_DEQUEUE(lookup->q, query, link); + debug("canceling pending query %p, belonging to %p", query, + query->lookup); + query->canceled = true; + if (query->readhandle != NULL) { + isc_nm_cancelread(query->readhandle); + } query_detach(&query); query = next; } @@ -2685,7 +2700,8 @@ tcp_connected(isc_nmhandle_t *handle, isc_result_t eresult, void *arg); static void start_tcp(dig_query_t *query) { isc_result_t result; - dig_query_t *next; + dig_query_t *next = NULL; + dig_query_t *connectquery = NULL; REQUIRE(DIG_VALID_QUERY(query)); debug("start_tcp(%p)", query); @@ -2775,13 +2791,15 @@ start_tcp(dig_query_t *query) { REQUIRE(query != NULL); + query_attach(query, &connectquery); + if (query->lookup->tls_mode) { result = isc_tlsctx_createclient(&query->tlsctx); RUNTIME_CHECK(result == ISC_R_SUCCESS); isc_tlsctx_enable_dot_client_alpn(query->tlsctx); isc_nm_tlsdnsconnect(netmgr, &localaddr, &query->sockaddr, tcp_connected, - query, local_timeout, 0, + connectquery, local_timeout, 0, query->tlsctx); #if HAVE_LIBNGHTTP2 } else if (query->lookup->https_mode) { @@ -2801,13 +2819,13 @@ start_tcp(dig_query_t *query) { isc_nm_httpconnect(netmgr, &localaddr, &query->sockaddr, uri, !query->lookup->https_get, - tcp_connected, query, query->tlsctx, - local_timeout, 0); + tcp_connected, connectquery, + query->tlsctx, local_timeout, 0); #endif } else { isc_nm_tcpdnsconnect(netmgr, &localaddr, &query->sockaddr, tcp_connected, - query, local_timeout, 0); + connectquery, local_timeout, 0); } /* XXX: set DSCP */ @@ -2878,20 +2896,23 @@ udp_ready(isc_nmhandle_t *handle, isc_result_t eresult, void *arg) { dig_query_t *readquery = NULL; int local_timeout = timeout * 1000; - if (eresult == ISC_R_CANCELED) { + if (eresult == ISC_R_CANCELED || query->canceled) { + dig_lookup_t *l = query->lookup; + + debug("in cancel handler"); query_detach(&query); + if (!query->canceled) { + cancel_lookup(l); + } + lookup_detach(&l); return; } else if (eresult != ISC_R_SUCCESS) { dig_lookup_t *l = query->lookup; - if (eresult != ISC_R_CANCELED) { - debug("udp setup failed: %s", - isc_result_totext(eresult)); - } - + debug("udp setup failed: %s", isc_result_totext(eresult)); + query_detach(&query); cancel_lookup(l); lookup_detach(&l); - query_detach(&query); return; } @@ -3143,8 +3164,6 @@ launch_next_query(dig_query_t *query) { debug("have local timeout of %d", local_timeout); isc_nmhandle_settimeout(query->handle, local_timeout); - query_attach(query, &readquery); - xfr = query->lookup->rdtype == dns_rdatatype_ixfr || query->lookup->rdtype == dns_rdatatype_axfr; if (xfr && isc_nm_socket_type(query->handle) == isc_nm_tlsdnssocket && @@ -3163,6 +3182,8 @@ launch_next_query(dig_query_t *query) { return; } + query_attach(query, &readquery); + isc_nm_read(query->handle, recv_done, readquery); if (!query->first_soa_rcvd) { @@ -3233,9 +3254,12 @@ tcp_connected(isc_nmhandle_t *handle, isc_result_t eresult, void *arg) { isc_tlsctx_free(&query->tlsctx); } - if (eresult == ISC_R_CANCELED) { + if (eresult == ISC_R_CANCELED || query->canceled) { debug("in cancel handler"); isc_sockaddr_format(&query->sockaddr, sockstr, sizeof(sockstr)); + if (!query->canceled) { + cancel_lookup(l); + } query_detach(&query); lookup_detach(&l); clear_current_lookup(); @@ -3245,12 +3269,9 @@ tcp_connected(isc_nmhandle_t *handle, isc_result_t eresult, void *arg) { debug("unsuccessful connection: %s", isc_result_totext(eresult)); isc_sockaddr_format(&query->sockaddr, sockstr, sizeof(sockstr)); - if (eresult != ISC_R_CANCELED) { - dighost_warning("Connection to %s(%s) for %s failed: " - "%s.", - sockstr, query->servname, l->textname, - isc_result_totext(eresult)); - } + dighost_warning("Connection to %s(%s) for %s failed: %s.", + sockstr, query->servname, l->textname, + isc_result_totext(eresult)); /* XXX Clean up exitcodes */ if (exitcode < 9) { @@ -3271,6 +3292,7 @@ tcp_connected(isc_nmhandle_t *handle, isc_result_t eresult, void *arg) { next = NULL; } + cancel_lookup(l); query_detach(&query); lookup_detach(&l); @@ -3579,16 +3601,20 @@ recv_done(isc_nmhandle_t *handle, isc_result_t eresult, isc_region_t *region, isc_refcount_decrement0(&recvcount); debug("recvcount=%" PRIuFAST32, isc_refcount_current(&recvcount)); - if (eresult == ISC_R_CANCELED) { + lookup_attach(query->lookup, &l); + + if (eresult == ISC_R_CANCELED || query->canceled) { debug("recv_done: cancel"); isc_nmhandle_detach(&query->readhandle); + if (!query->canceled) { + cancel_lookup(l); + } query_detach(&query); + lookup_detach(&l); UNLOCK_LOOKUP; return; } - lookup_attach(query->lookup, &l); - if (query->lookup->use_usec) { TIME_NOW_HIRES(&query->time_recv); } else { @@ -4209,6 +4235,7 @@ cancel_all(void) { nq = ISC_LIST_NEXT(q, link); debug("canceling pending query %p, belonging to %p", q, current_lookup); + q->canceled = true; if (q->readhandle != NULL) { isc_nm_cancelread(q->readhandle); } diff --git a/bin/dig/dighost.h b/bin/dig/dighost.h index 618e975096..e38e319f53 100644 --- a/bin/dig/dighost.h +++ b/bin/dig/dighost.h @@ -188,6 +188,7 @@ struct dig_query { bool second_rr_rcvd; bool first_repeat_rcvd; bool warn_id; + bool canceled; uint32_t first_rr_serial; uint32_t second_rr_serial; uint32_t msg_count; From f3228df622098c9735f8e0e28ad4ddf314e9a164 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Sur=C3=BD?= Date: Wed, 2 Mar 2022 09:14:19 +0100 Subject: [PATCH 3/3] Add CHANGES note for [GL #3184] --- CHANGES | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGES b/CHANGES index 67cf5b7d8f..356c7ac046 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,6 @@ +5821. [bug] Fix query context management issues in the TCP part + of dig. [GL #3184] + 5820. [placeholder] 5819. [placeholder]