From 12e7dfa397c92807bdc4e6f55918d46eb15e0600 Mon Sep 17 00:00:00 2001 From: Aram Sargsyan Date: Thu, 6 Mar 2025 14:26:23 +0000 Subject: [PATCH 1/3] Fix resolver responses statistics counter The resquery_response() function increases the response counter without checking if the response was successful. Increase the counter only when the result indicates success. --- lib/dns/resolver.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/lib/dns/resolver.c b/lib/dns/resolver.c index d3c860e911..a470f27a70 100644 --- a/lib/dns/resolver.c +++ b/lib/dns/resolver.c @@ -7462,10 +7462,12 @@ resquery_response(isc_result_t eresult, isc_region_t *region, void *arg) { QTRACE("response"); - if (isc_sockaddr_pf(&query->addrinfo->sockaddr) == PF_INET) { - inc_stats(fctx->res, dns_resstatscounter_responsev4); - } else { - inc_stats(fctx->res, dns_resstatscounter_responsev6); + if (eresult == ISC_R_SUCCESS) { + if (isc_sockaddr_pf(&query->addrinfo->sockaddr) == PF_INET) { + inc_stats(fctx->res, dns_resstatscounter_responsev4); + } else { + inc_stats(fctx->res, dns_resstatscounter_responsev6); + } } rctx = isc_mem_get(fctx->mctx, sizeof(*rctx)); From 830e54811168bc3e69db93baf6132c18f3452f92 Mon Sep 17 00:00:00 2001 From: Aram Sargsyan Date: Thu, 6 Mar 2025 14:28:48 +0000 Subject: [PATCH 2/3] Fix the resolvers RTT-ranged responses statistics counters When a response times out the fctx_cancelquery() function incorrectly calculates it in the 'dns_resstatscounter_queryrtt5' counter (i.e. >=1600 ms). To avoid this, the rctx_timedout() function should make sure that 'rctx->finish' is NULL. And in order to adjust the RTT values for the timed out server, 'rctx->no_response' should be true. Update the rctx_timedout() function to make those changes. --- lib/dns/resolver.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lib/dns/resolver.c b/lib/dns/resolver.c index a470f27a70..504fc0254f 100644 --- a/lib/dns/resolver.c +++ b/lib/dns/resolver.c @@ -8049,6 +8049,9 @@ rctx_timedout(respctx_t *rctx) { fctx->timeout = true; fctx->timeouts++; + rctx->no_response = true; + rctx->finish = NULL; + now = isc_time_now(); /* netmgr timeouts are accurate to the millisecond */ if (isc_time_microdiff(&fctx->expires, &now) < US_PER_MS) { @@ -8059,8 +8062,6 @@ rctx_timedout(respctx_t *rctx) { } else { FCTXTRACE("query timed out; trying next server"); /* try next server */ - rctx->no_response = true; - rctx->finish = NULL; rctx->next_server = true; } From 0c7fa8d572bf3e742a627ff660175683e131908b Mon Sep 17 00:00:00 2001 From: Aram Sargsyan Date: Thu, 6 Mar 2025 14:37:09 +0000 Subject: [PATCH 3/3] Test resolver statistics when responses time out Add a test to check that the timed out responses do not skew the normal responses statistics counters, and that they do update the timeouts counter. --- bin/tests/system/resolver/tests.sh | 20 +++++++++++++++++++ .../system/resolver/tests_sh_resolver.py | 1 + 2 files changed, 21 insertions(+) diff --git a/bin/tests/system/resolver/tests.sh b/bin/tests/system/resolver/tests.sh index 2669021c62..01de8e7d7f 100755 --- a/bin/tests/system/resolver/tests.sh +++ b/bin/tests/system/resolver/tests.sh @@ -43,6 +43,12 @@ grep "status: NOERROR" dig.out.ns1.test${n} >/dev/null || ret=1 if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status + ret)) +rndccmd 10.53.0.1 stats || ret=1 # Get the responses, RTT and timeout statistics before the following timeout tests +grep -F 'responses received' ns1/named.stats >ns1/named.stats.responses-before || true +grep -F 'queries with RTT' ns1/named.stats >ns1/named.stats.rtt-before || true +grep -F 'query timeouts' ns1/named.stats >ns1/named.stats.timeouts-before || true +mv ns1/named.stats ns1/named.stats-before + # 'resolver-query-timeout' is set to 5 seconds in ns1, so dig with a lower # timeout value should give up earlier than that. n=$((n + 1)) @@ -66,6 +72,20 @@ grep -F "EDE: 22 (No Reachable Authority)" dig.out.ns1.test${n} >/dev/null || re if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status + ret)) +n=$((n + 1)) +echo_i "checking that the timeout didn't skew the resolver responses counters and did update the timeout counter ($n)" +ret=0 +rndccmd 10.53.0.1 stats || ret=1 +grep -F 'responses received' ns1/named.stats >ns1/named.stats.responses-after || true +grep -F 'queries with RTT' ns1/named.stats >ns1/named.stats.rtt-after || true +grep -F 'query timeouts' ns1/named.stats >ns1/named.stats.timeouts-after || true +mv ns1/named.stats ns1/named.stats-after +diff ns1/named.stats.responses-before ns1/named.stats.responses-after >/dev/null || ret=1 +diff ns1/named.stats.rtt-before ns1/named.stats.rtt-after >/dev/null || ret=1 +diff ns1/named.stats.timeouts-before ns1/named.stats.timeouts-after >/dev/null && ret=1 +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status + ret)) + # 'resolver-query-timeout' is set to 5 seconds in ns1, so named should # interrupt the non-responsive query and send a SERVFAIL answer before dig's # own timeout fires, which is set to 7 seconds. This time, exampleudp.net is diff --git a/bin/tests/system/resolver/tests_sh_resolver.py b/bin/tests/system/resolver/tests_sh_resolver.py index 6e5a7967df..10a36ce4e4 100644 --- a/bin/tests/system/resolver/tests_sh_resolver.py +++ b/bin/tests/system/resolver/tests_sh_resolver.py @@ -21,6 +21,7 @@ pytestmark = pytest.mark.extra_artifacts( "nextpart.out.*", "ans*/ans.run", "ans*/query.log", + "ns1/named.stats*", "ns4/tld.db", "ns5/trusted.conf", "ns6/K*",