From 3aff00dc7b68076a4065090930a600aae24a97ea Mon Sep 17 00:00:00 2001 From: Colin Vidal Date: Thu, 23 Jan 2025 16:38:35 +0100 Subject: [PATCH 1/3] fix EDE 22 time out detection Extended DNS error 22 (No reachable authority) was previously detected when `fctx_expired` fired. It turns out this function is used as a "safety net" and the timeout should be detected earlier. It was working though, because of another issue fixed by !9927. Since that change, the recursive request timeout detection occurs before `fctx_expired`, so EDE 22 is not added to the response message anymore. The fix for the problem is to add the EDE 22 code in two situations: - When the dispatch code times out (rctx_timedout) the resolver code checks various properties to figure out if it needs to make another fetch attempt. One of the parameters is the fetch expiration time. If it has expired, the whole recursion is canceled, so it now adds the EDE 22 code. - If the fetch expiration time hasn't expired in the case above (and the other parameters allow it) a new fetch attempt is made (fctx_query). But before the new request is actually made, the fetch expiration time is re-checked. It might have elapsed by then, in which case the whole recursion is canceled. So the EDE 22 code is now added here as well. 
(cherry picked from commit 78274ec2b17bb16ee5414aaaf7f91d30ff43daba) --- lib/dns/resolver.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/lib/dns/resolver.c b/lib/dns/resolver.c index 12368d0e5b..0089b69aa3 100644 --- a/lib/dns/resolver.c +++ b/lib/dns/resolver.c @@ -1942,6 +1942,7 @@ fctx_query(fetchctx_t *fctx, dns_adbaddrinfo_t *addrinfo, fctx_setretryinterval(fctx, srtt); if (isc_interval_iszero(&fctx->interval)) { FCTXTRACE("fetch expired"); + dns_ede_add(&fctx->edectx, DNS_EDE_NOREACHABLEAUTH, NULL); return ISC_R_TIMEDOUT; } @@ -7955,6 +7956,8 @@ rctx_timedout(respctx_t *rctx) { if (isc_time_microdiff(&fctx->expires, &now) < US_PER_MS) { FCTXTRACE("query timed out; stopped trying to make " "fetch happen"); + dns_ede_add(&fctx->edectx, DNS_EDE_NOREACHABLEAUTH, + NULL); } else { FCTXTRACE("query timed out; trying next server"); /* try next server */ From edd6f0eb35c5ffbb422d0078fe64a0b058cacfaf Mon Sep 17 00:00:00 2001 From: Colin Vidal Date: Thu, 23 Jan 2025 16:43:53 +0100 Subject: [PATCH 2/3] add new EDE 22 system tests This re-does a previously existing EDE 22 system test and adds another one, making sure the timeout detection also works over UDP when the resolver is contacting the authoritative server. (The existing test was using TCP to contact the authoritative servers.) 
(cherry picked from commit 7cb8a028feb294f1c8bec04a03d582980a6a019b) --- bin/tests/system/resolver/ans2/ans.pl | 5 +++++ bin/tests/system/resolver/tests.sh | 14 ++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/bin/tests/system/resolver/ans2/ans.pl b/bin/tests/system/resolver/ans2/ans.pl index b17fd6ec2e..079a7d9879 100644 --- a/bin/tests/system/resolver/ans2/ans.pl +++ b/bin/tests/system/resolver/ans2/ans.pl @@ -110,9 +110,14 @@ for (;;) { } elsif ($qname eq "net" && $qtype eq "NS") { $packet->header->aa(1); $packet->push("answer", new Net::DNS::RR("net 300 NS a.root-servers.nil.")); + } elsif ($qname eq "noresponse.exampleudp.net") { + next; } elsif ($qname =~ /example\.net/) { $packet->push("authority", new Net::DNS::RR("example.net 300 NS ns.example.net")); $packet->push("additional", new Net::DNS::RR("ns.example.net 300 A 10.53.0.3")); + } elsif ($qname =~ /exampleudp\.net/) { + $packet->push("authority", new Net::DNS::RR("exampleudp.net 300 NS ns.exampleudp.net")); + $packet->push("additional", new Net::DNS::RR("ns.exampleudp.net 300 A 10.53.0.2")); } elsif ($qname =~ /lame\.example\.org/) { $packet->header->ad(0); $packet->header->aa(0); diff --git a/bin/tests/system/resolver/tests.sh b/bin/tests/system/resolver/tests.sh index 026fd60c9f..2cb143fd55 100755 --- a/bin/tests/system/resolver/tests.sh +++ b/bin/tests/system/resolver/tests.sh @@ -50,6 +50,7 @@ echo_i "checking no response handling with a shorter than resolver-query-timeout ret=0 dig_with_opts +tcp +tries=1 +timeout=3 noresponse.example.net @10.53.0.1 a >dig.out.ns1.test${n} && ret=1 grep -F "no servers could be reached" dig.out.ns1.test${n} >/dev/null || ret=1 +grep -F "EDE: 22 (No Reachable Authority)" dig.out.ns1.test${n} >/dev/null && ret=1 if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status + ret)) @@ -66,6 +67,19 @@ grep -F "EDE: 22 (No Reachable Authority)" dig.out.ns1.test${n} >/dev/null || re if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status + ret)) +# 
'resolver-query-timeout' is set to 5 seconds in ns1, so named should +# interrupt the non-responsive query and send a SERVFAIL answer before dig's +# own timeout fires, which is set to 7 seconds. This time, exampleudp.net is +# contacted using UDP transport by the resolver. +n=$((n + 1)) +echo_i "checking no response handling with a longer than resolver-query-timeout timeout (UDP recursion) ($n)" +ret=0 +dig_with_opts +tcp +tries=1 +timeout=7 noresponse.exampleudp.net @10.53.0.1 a >dig.out.ns1.test${n} || ret=1 +grep -F "status: SERVFAIL" dig.out.ns1.test${n} >/dev/null || ret=1 +grep -F "EDE: 22 (No Reachable Authority)" dig.out.ns1.test${n} >/dev/null || ret=1 +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status + ret)) + n=$((n + 1)) echo_i "checking handling of bogus referrals ($n)" # If the server has the "INSIST(!external)" bug, this query will kill it. From 588924bbb5e2983c795f45bb49e5a4d82bb71734 Mon Sep 17 00:00:00 2001 From: Colin Vidal Date: Fri, 24 Jan 2025 11:23:43 +0100 Subject: [PATCH 3/3] update serve-stale test to support EDE 22 When EDE 3 (stale answer) was added the serve-stale tests were checking for those exclusively, i.e. grepping for no "EDE" in the dig output when no stale answer was expected. However, some stale tests disable stale answers and make the authoritative server unresponsive, effectively triggering a timed-out request and thus an EDE 22. Update those tests so they still test for the absence of the EDE 3 error, but also check for the presence of EDE 22. 
(cherry picked from commit 27f3b8950a675afe875759027a241da7c5244511) --- bin/tests/system/serve-stale/tests.sh | 79 ++++++++++++++++++++------- 1 file changed, 58 insertions(+), 21 deletions(-) diff --git a/bin/tests/system/serve-stale/tests.sh b/bin/tests/system/serve-stale/tests.sh index 507081b02e..dc5e7d9d9f 100755 --- a/bin/tests/system/serve-stale/tests.sh +++ b/bin/tests/system/serve-stale/tests.sh @@ -340,11 +340,15 @@ $DIG -p ${PORT} @10.53.0.1 nxdomain.example TXT >dig.out.test$((n + 4)) & wait +# no stale answers are used and the authoritative queries timed out. So no EDE 3 +# is not sent but EDE 22 is sent. + n=$((n + 1)) echo_i "check stale data.example TXT (serve-stale off) ($n)" ret=0 grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1 -grep "EDE" dig.out.test$n >/dev/null && ret=1 +grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1 +grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1 if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status + ret)) @@ -352,7 +356,8 @@ n=$((n + 1)) echo_i "check stale othertype.example CAA (serve-stale off) ($n)" ret=0 grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1 -grep "EDE" dig.out.test$n >/dev/null && ret=1 +grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1 +grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1 if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status + ret)) @@ -360,7 +365,8 @@ n=$((n + 1)) echo_i "check stale nodata.example TXT (serve-stale off) ($n)" ret=0 grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1 -grep "EDE" dig.out.test$n >/dev/null && ret=1 +grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1 +grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1 if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status + ret)) @@ -368,7 +374,8 @@ n=$((n + 1)) echo_i "check stale nxdomain.example TXT (serve-stale off) ($n)" ret=0 grep "status: SERVFAIL" 
dig.out.test$n >/dev/null || ret=1 -grep "EDE" dig.out.test$n >/dev/null && ret=1 +grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1 +grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1 if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status + ret)) @@ -739,11 +746,15 @@ $DIG -p ${PORT} @10.53.0.1 nxdomain.example TXT >dig.out.test$((n + 4)) & wait +# stale-answer is enabled, but with a very low TTL so the following answer have +# been removed from the stale cache. Hence, no EDE 3 anymore, but EDE 22. + n=$((n + 1)) echo_i "check ancient data.example TXT (low max-stale-ttl) ($n)" ret=0 grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1 -grep "EDE" dig.out.test$n >/dev/null && ret=1 +grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1 +grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1 grep "ANSWER: 0," dig.out.test$n >/dev/null || ret=1 if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status + ret)) @@ -752,7 +763,8 @@ n=$((n + 1)) echo_i "check ancient othertype.example CAA (low max-stale-ttl) ($n)" ret=0 grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1 -grep "EDE" dig.out.test$n >/dev/null && ret=1 +grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1 +grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1 grep "ANSWER: 0," dig.out.test$n >/dev/null || ret=1 if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status + ret)) @@ -761,7 +773,8 @@ n=$((n + 1)) echo_i "check ancient nodata.example TXT (low max-stale-ttl) ($n)" ret=0 grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1 -grep "EDE" dig.out.test$n >/dev/null && ret=1 +grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1 +grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1 grep "ANSWER: 0," dig.out.test$n >/dev/null || ret=1 if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status + ret)) @@ -770,7 +783,8 @@ n=$((n + 1)) echo_i "check 
ancient nxdomain.example TXT (low max-stale-ttl) ($n)" ret=0 grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1 -grep "EDE" dig.out.test$n >/dev/null && ret=1 +grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1 +grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1 grep "ANSWER: 0," dig.out.test$n >/dev/null || ret=1 if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status + ret)) @@ -1093,11 +1107,15 @@ $DIG -p ${PORT} @10.53.0.3 nxdomain.example TXT >dig.out.test$((n + 4)) & wait +# no stale answers are used and the authoritative queries timed out. So no EDE 3 +# is not sent but EDE 22 is sent. + n=$((n + 1)) echo_i "check fail of data.example TXT (max-stale-ttl default) ($n)" ret=0 grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1 -grep "EDE" dig.out.test$n >/dev/null && ret=1 +grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1 +grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1 grep "ANSWER: 0," dig.out.test$n >/dev/null || ret=1 if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status + ret)) @@ -1106,7 +1124,8 @@ n=$((n + 1)) echo_i "check fail of othertype.example CAA (max-stale-ttl default) ($n)" ret=0 grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1 -grep "EDE" dig.out.test$n >/dev/null && ret=1 +grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1 +grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1 grep "ANSWER: 0," dig.out.test$n >/dev/null || ret=1 if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status + ret)) @@ -1115,7 +1134,8 @@ n=$((n + 1)) echo_i "check fail of nodata.example TXT (max-stale-ttl default) ($n)" ret=0 grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1 -grep "EDE" dig.out.test$n >/dev/null && ret=1 +grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1 +grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1 grep "ANSWER: 0," dig.out.test$n >/dev/null || 
ret=1 if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status + ret)) @@ -1124,7 +1144,8 @@ n=$((n + 1)) echo_i "check fail of nxdomain.example TXT (max-stale-ttl default) ($n)" ret=0 grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1 -grep "EDE" dig.out.test$n >/dev/null && ret=1 +grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1 +grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1 grep "ANSWER: 0," dig.out.test$n >/dev/null || ret=1 if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status + ret)) @@ -1225,11 +1246,13 @@ status=$((status + ret)) # The notfound.example check is different than nxdomain.example because # we didn't send a prime query to add notfound.example to the cache. +# Independently, EDE 22 is sent as the authoritative server doesn't respond. n=$((n + 1)) echo_i "check notfound.example TXT (max-stale-ttl default) ($n)" ret=0 grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1 -grep "EDE" dig.out.test$n >/dev/null && ret=1 +grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1 +grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1 grep "ANSWER: 0," dig.out.test$n >/dev/null || ret=1 if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status + ret)) @@ -1341,11 +1364,15 @@ $DIG -p ${PORT} @10.53.0.4 nxdomain.example TXT >dig.out.test$((n + 4)) & wait +# no stale answers are used and the authoritative queries timed out. So no EDE 3 +# is not sent but EDE 22 is sent. 
+ n=$((n + 1)) echo_i "check fail of data.example TXT (serve-stale answers disabled) ($n)" ret=0 grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1 -grep "EDE" dig.out.test$n >/dev/null && ret=1 +grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1 +grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1 grep "ANSWER: 0," dig.out.test$n >/dev/null || ret=1 if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status + ret)) @@ -1354,7 +1381,8 @@ n=$((n + 1)) echo_i "check fail of othertype.example TXT (serve-stale answers disabled) ($n)" ret=0 grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1 -grep "EDE" dig.out.test$n >/dev/null && ret=1 +grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1 +grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1 grep "ANSWER: 0," dig.out.test$n >/dev/null || ret=1 if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status + ret)) @@ -1363,7 +1391,8 @@ n=$((n + 1)) echo_i "check fail of nodata.example TXT (serve-stale answers disabled) ($n)" ret=0 grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1 -grep "EDE" dig.out.test$n >/dev/null && ret=1 +grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1 +grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1 grep "ANSWER: 0," dig.out.test$n >/dev/null || ret=1 if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status + ret)) @@ -1372,7 +1401,8 @@ n=$((n + 1)) echo_i "check fail of nxdomain.example TXT (serve-stale answers disabled) ($n)" ret=0 grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1 -grep "EDE" dig.out.test$n >/dev/null && ret=1 +grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1 +grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1 grep "ANSWER: 0," dig.out.test$n >/dev/null || ret=1 if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status + ret)) @@ -1549,11 +1579,15 @@ $DIG -p ${PORT} @10.53.0.5 nxdomain.example 
TXT >dig.out.test$((n + 4)) & wait +# no stale answers are used and the authoritative queries timed out. So no EDE 3 +# is not sent but EDE 22 is sent. + n=$((n + 1)) echo_i "check fail of data.example TXT (serve-stale cache disabled) ($n)" ret=0 grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1 -grep "EDE" dig.out.test$n >/dev/null && ret=1 +grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1 +grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1 grep "ANSWER: 0," dig.out.test$n >/dev/null || ret=1 if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status + ret)) @@ -1562,7 +1596,8 @@ n=$((n + 1)) echo_i "check fail of othertype.example CAA (serve-stale cache disabled) ($n)" ret=0 grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1 -grep "EDE" dig.out.test$n >/dev/null && ret=1 +grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1 +grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1 grep "ANSWER: 0," dig.out.test$n >/dev/null || ret=1 if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status + ret)) @@ -1571,7 +1606,8 @@ n=$((n + 1)) echo_i "check fail of nodata.example TXT (serve-stale cache disabled) ($n)" ret=0 grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1 -grep "EDE" dig.out.test$n >/dev/null && ret=1 +grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1 +grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1 grep "ANSWER: 0," dig.out.test$n >/dev/null || ret=1 if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status + ret)) @@ -1580,7 +1616,8 @@ n=$((n + 1)) echo_i "check fail of nxdomain.example TXT (serve-stale cache disabled) ($n)" ret=0 grep "status: SERVFAIL" dig.out.test$n >/dev/null || ret=1 -grep "EDE" dig.out.test$n >/dev/null && ret=1 +grep "EDE: 22 (No Reachable Authority)" dig.out.test$n >/dev/null || ret=1 +grep "EDE: 3 (Stale Answer)" dig.out.test$n >/dev/null && ret=1 grep "ANSWER: 0," dig.out.test$n >/dev/null 
|| ret=1 if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status + ret))