Fix query context management issues in dighost.c

For the reference, the _cancel_lookup() function iterates through
the lookup's queries list and detaches them. In the ideal scenario,
that should be the last reference and the query will be destroyed
after that, but it is also possible that we are still expecting a
callback, which also holds a reference (for example, _cancel_lookup()
could have been called from recv_done(), when send_done() was still
not executed).

The start_udp() and start_tcp() functions are currently designed in
slightly different ways: start_udp() creates a new query attachment
`connectquery`, to be called in the callback function, while
start_tcp() does not, which is a bug, but is hidden by the fact
that when the query is being erroneously destroyed prematurely (before
_cancel_lookup() is called) in the result of that, it also gets
de-listed from the lookup's queries' list, so _cancel_lookup() doesn't
even try to detach it.

For better understanding, here's an illustration of the query's
references count changes, and from where it was changed:

UDP
---
 1. _new_query()        -> refcount = 1 (initial)
 2. start_udp()         -> refcount = 2 (lookup->current_query)
 3. start_udp()         -> refcount = 3 (connectquery)
 4. udp_ready()         -> refcount = 4 (readquery)
 5. udp_ready()         -> refcount = 5 (sendquery)
 6. udp_ready()         -> refcount = 4 (lookup->current_query)
 7. udp_ready()         -> refcount = 3 (connectquery)
 8. send_done()         -> refcount = 2 (sendquery)
 9. recv_done()         -> refcount = 1 (readquery)
10. _cancel_lookup()    -> refcount = 0 (initial)
11. the query gets destroyed and removed from `lookup->q`

TCP, fortunate scenario
-----------------------

 1. _new_query()        -> refcount = 1 (initial)
 2. start_tcp()         -> refcount = 2 (lookup->current_query)
 3. launch_next_query() -> refcount = 3 (readquery)
 4. launch_next_query() -> refcount = 4 (sendquery)
 5. tcp_connected()     -> refcount = 3 (lookup->current_query)
 6. tcp_connected()     -> refcount = 2 (bug, there was no connectquery)
 7. send_done()         -> refcount = 1 (sendquery)
 8. recv_done()         -> refcount = 0 (readquery)
 9. the query gets prematurely destroyed and removed from `lookup->q`
10. _cancel_lookup()    -> the query is not in `lookup->q`

TCP, unfortunate scenario, revealing the bug
--------------------------------------------

 1. _new_query()        -> refcount = 1 (initial)
 2. start_tcp()         -> refcount = 2 (lookup->current_query)
 3. launch_next_query() -> refcount = 3 (readquery)
 4. launch_next_query() -> refcount = 4 (sendquery)
 5. tcp_connected()     -> refcount = 3 (lookup->current_query)
 6. tcp_connected()     -> refcount = 2 (bug, there was no connectquery)
 7. recv_done()         -> refcount = 1 (readquery)
 8. _cancel_lookup()    -> refcount = 0 (the query was in `lookup->q`)
 9. we hit an assertion here when trying to destroy the query, because
    sendhandle is not detached (which is done by send_done()).
10. send_done()         -> this never happens

This commit does the following:

1. Add a `connectquery` attachment in start_tcp(), like done in
   start_udp().
2. Add missing _cancel_lookup() calls for error scenarios, which
   were possibly missing because before fixing the bug, calling
   _cancel_lookup() and then calling query_detach() would cause
   an assertion.
3. Log a debug message and call isc_nm_cancelread(query->readhandle)
   for every query in the lookup from inside the _cancel_lookup()
   function, like it is done in _cancel_all().
4. Add a `canceled` property for the query which becomes `true` when
   the lookup (and subsequently, its queries) are canceled.
5. Use the `canceled` property in the network manager callbacks to
   know that the query was canceled, and act like `eresult` was equal
   to `ISC_R_CANCELED`.

(cherry picked from commit 4043fe9090)
This commit is contained in:
Aram Sargsyan 2022-03-02 00:06:34 +00:00 committed by Ondřej Surý
parent 8b6245e298
commit 5f5e1c0ac2
2 changed files with 57 additions and 29 deletions

View file

@ -2616,17 +2616,26 @@ send_done(isc_nmhandle_t *handle, isc_result_t eresult, void *arg) {
isc_nmhandle_detach(&query->sendhandle);
if (eresult != ISC_R_SUCCESS) {
if (eresult != ISC_R_CANCELED) {
debug("send failed: %s", isc_result_totext(eresult));
lookup_attach(query->lookup, &l);
if (eresult == ISC_R_CANCELED || query->canceled) {
debug("send_done: cancel");
if (!query->canceled) {
cancel_lookup(l);
}
query_detach(&query);
lookup_detach(&l);
UNLOCK_LOOKUP;
return;
} else if (eresult != ISC_R_SUCCESS) {
debug("send failed: %s", isc_result_totext(eresult));
cancel_lookup(l);
query_detach(&query);
lookup_detach(&l);
UNLOCK_LOOKUP;
return;
}
lookup_attach(query->lookup, &l);
if (l->ns_search_only && !l->trace_root && !l->tcp_mode) {
debug("sending next, since searching");
next = ISC_LIST_NEXT(query, link);
@ -2666,6 +2675,12 @@ _cancel_lookup(dig_lookup_t *lookup, const char *file, unsigned int line) {
REQUIRE(DIG_VALID_QUERY(query));
next = ISC_LIST_NEXT(query, link);
ISC_LIST_DEQUEUE(lookup->q, query, link);
debug("canceling pending query %p, belonging to %p", query,
query->lookup);
query->canceled = true;
if (query->readhandle != NULL) {
isc_nm_cancelread(query->readhandle);
}
query_detach(&query);
query = next;
}
@ -2685,7 +2700,8 @@ tcp_connected(isc_nmhandle_t *handle, isc_result_t eresult, void *arg);
static void
start_tcp(dig_query_t *query) {
isc_result_t result;
dig_query_t *next;
dig_query_t *next = NULL;
dig_query_t *connectquery = NULL;
REQUIRE(DIG_VALID_QUERY(query));
debug("start_tcp(%p)", query);
@ -2775,13 +2791,15 @@ start_tcp(dig_query_t *query) {
REQUIRE(query != NULL);
query_attach(query, &connectquery);
if (query->lookup->tls_mode) {
result = isc_tlsctx_createclient(&query->tlsctx);
RUNTIME_CHECK(result == ISC_R_SUCCESS);
isc_tlsctx_enable_dot_client_alpn(query->tlsctx);
isc_nm_tlsdnsconnect(netmgr, &localaddr,
&query->sockaddr, tcp_connected,
query, local_timeout, 0,
connectquery, local_timeout, 0,
query->tlsctx);
#if HAVE_LIBNGHTTP2
} else if (query->lookup->https_mode) {
@ -2801,13 +2819,13 @@ start_tcp(dig_query_t *query) {
isc_nm_httpconnect(netmgr, &localaddr, &query->sockaddr,
uri, !query->lookup->https_get,
tcp_connected, query, query->tlsctx,
local_timeout, 0);
tcp_connected, connectquery,
query->tlsctx, local_timeout, 0);
#endif
} else {
isc_nm_tcpdnsconnect(netmgr, &localaddr,
&query->sockaddr, tcp_connected,
query, local_timeout, 0);
connectquery, local_timeout, 0);
}
/* XXX: set DSCP */
@ -2878,20 +2896,23 @@ udp_ready(isc_nmhandle_t *handle, isc_result_t eresult, void *arg) {
dig_query_t *readquery = NULL;
int local_timeout = timeout * 1000;
if (eresult == ISC_R_CANCELED) {
if (eresult == ISC_R_CANCELED || query->canceled) {
dig_lookup_t *l = query->lookup;
debug("in cancel handler");
query_detach(&query);
if (!query->canceled) {
cancel_lookup(l);
}
lookup_detach(&l);
return;
} else if (eresult != ISC_R_SUCCESS) {
dig_lookup_t *l = query->lookup;
if (eresult != ISC_R_CANCELED) {
debug("udp setup failed: %s",
isc_result_totext(eresult));
}
debug("udp setup failed: %s", isc_result_totext(eresult));
query_detach(&query);
cancel_lookup(l);
lookup_detach(&l);
query_detach(&query);
return;
}
@ -3143,8 +3164,6 @@ launch_next_query(dig_query_t *query) {
debug("have local timeout of %d", local_timeout);
isc_nmhandle_settimeout(query->handle, local_timeout);
query_attach(query, &readquery);
xfr = query->lookup->rdtype == dns_rdatatype_ixfr ||
query->lookup->rdtype == dns_rdatatype_axfr;
if (xfr && isc_nm_socket_type(query->handle) == isc_nm_tlsdnssocket &&
@ -3163,6 +3182,8 @@ launch_next_query(dig_query_t *query) {
return;
}
query_attach(query, &readquery);
isc_nm_read(query->handle, recv_done, readquery);
if (!query->first_soa_rcvd) {
@ -3233,9 +3254,12 @@ tcp_connected(isc_nmhandle_t *handle, isc_result_t eresult, void *arg) {
isc_tlsctx_free(&query->tlsctx);
}
if (eresult == ISC_R_CANCELED) {
if (eresult == ISC_R_CANCELED || query->canceled) {
debug("in cancel handler");
isc_sockaddr_format(&query->sockaddr, sockstr, sizeof(sockstr));
if (!query->canceled) {
cancel_lookup(l);
}
query_detach(&query);
lookup_detach(&l);
clear_current_lookup();
@ -3245,12 +3269,9 @@ tcp_connected(isc_nmhandle_t *handle, isc_result_t eresult, void *arg) {
debug("unsuccessful connection: %s",
isc_result_totext(eresult));
isc_sockaddr_format(&query->sockaddr, sockstr, sizeof(sockstr));
if (eresult != ISC_R_CANCELED) {
dighost_warning("Connection to %s(%s) for %s failed: "
"%s.",
sockstr, query->servname, l->textname,
isc_result_totext(eresult));
}
dighost_warning("Connection to %s(%s) for %s failed: %s.",
sockstr, query->servname, l->textname,
isc_result_totext(eresult));
/* XXX Clean up exitcodes */
if (exitcode < 9) {
@ -3271,6 +3292,7 @@ tcp_connected(isc_nmhandle_t *handle, isc_result_t eresult, void *arg) {
next = NULL;
}
cancel_lookup(l);
query_detach(&query);
lookup_detach(&l);
@ -3579,16 +3601,20 @@ recv_done(isc_nmhandle_t *handle, isc_result_t eresult, isc_region_t *region,
isc_refcount_decrement0(&recvcount);
debug("recvcount=%" PRIuFAST32, isc_refcount_current(&recvcount));
if (eresult == ISC_R_CANCELED) {
lookup_attach(query->lookup, &l);
if (eresult == ISC_R_CANCELED || query->canceled) {
debug("recv_done: cancel");
isc_nmhandle_detach(&query->readhandle);
if (!query->canceled) {
cancel_lookup(l);
}
query_detach(&query);
lookup_detach(&l);
UNLOCK_LOOKUP;
return;
}
lookup_attach(query->lookup, &l);
if (query->lookup->use_usec) {
TIME_NOW_HIRES(&query->time_recv);
} else {
@ -4209,6 +4235,7 @@ cancel_all(void) {
nq = ISC_LIST_NEXT(q, link);
debug("canceling pending query %p, belonging to %p", q,
current_lookup);
q->canceled = true;
if (q->readhandle != NULL) {
isc_nm_cancelread(q->readhandle);
}

View file

@ -188,6 +188,7 @@ struct dig_query {
bool second_rr_rcvd;
bool first_repeat_rcvd;
bool warn_id;
bool canceled;
uint32_t first_rr_serial;
uint32_t second_rr_serial;
uint32_t msg_count;