From 537187bf2f1a1c402d862bbb105da5947b18d217 Mon Sep 17 00:00:00 2001 From: Aram Sargsyan Date: Thu, 17 Nov 2022 13:48:36 +0000 Subject: [PATCH 1/3] Add serve-stale CNAME check with stale-answer-client-timeout off Prime the cache with the following records: shortttl.cname.example. 1 IN CNAME longttl.target.example. longttl.target.example. 600 IN A 10.53.0.2 Wait for the CNAME record to expire, disable the authoritative server, and query 'shortttl.cname.example' again, expecting a stale answer. --- bin/tests/system/serve-stale/ans2/ans.pl | 24 ++++++++ bin/tests/system/serve-stale/tests.sh | 74 ++++++++++++++++++++++++ 2 files changed, 98 insertions(+) diff --git a/bin/tests/system/serve-stale/ans2/ans.pl b/bin/tests/system/serve-stale/ans2/ans.pl index 0af055a4fb..f14ae67fe8 100644 --- a/bin/tests/system/serve-stale/ans2/ans.pl +++ b/bin/tests/system/serve-stale/ans2/ans.pl @@ -58,6 +58,8 @@ my $CAA = "othertype.example 2 IN CAA 0 issue \"ca1.example.net\""; my $negSOA = "example 2 IN SOA . . 0 0 0 0 300"; my $CNAME = "cname.example 7 IN CNAME target.example"; my $TARGET = "target.example 9 IN A $localaddr"; +my $SHORTCNAME = "shortttl.cname.example 1 IN CNAME longttl.target.example"; +my $LONGTARGET = "longttl.target.example 600 IN A $localaddr"; sub reply_handler { my ($qname, $qclass, $qtype) = @_; @@ -166,6 +168,28 @@ sub reply_handler { push @auth, $rr; } $rcode = "NOERROR"; + } elsif ($qname eq "shortttl.cname.example") { + if ($qtype eq "A") { + my $rr = new Net::DNS::RR($SHORTCNAME); + push @ans, $rr; + } else { + my $rr = new Net::DNS::RR($negSOA); + push @auth, $rr; + } + $rcode = "NOERROR"; + } elsif ($qname eq "longttl.target.example") { + if ($slow_response) { + print " Sleeping 3 seconds\n"; + sleep(3); + } + if ($qtype eq "A") { + my $rr = new Net::DNS::RR($LONGTARGET); + push @ans, $rr; + } else { + my $rr = new Net::DNS::RR($negSOA); + push @auth, $rr; + } + $rcode = "NOERROR"; } elsif ($qname eq "longttl.example") { if ($qtype eq "TXT") { my $rr = new Net::DNS::RR($LONGTXT); diff --git a/bin/tests/system/serve-stale/tests.sh b/bin/tests/system/serve-stale/tests.sh index b64fc5b48c..63a87a3d58 100755 --- a/bin/tests/system/serve-stale/tests.sh +++ b/bin/tests/system/serve-stale/tests.sh @@ -1829,6 +1829,80 @@ grep "data\.example\..*[12].*IN.*TXT.*A text record with a 2 second ttl" dig.out if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status+ret)) +############################################################## +# Test for stale-answer-client-timeout off and CNAME record. # +############################################################## +echo_i "test stale-answer-client-timeout (0) and CNAME record" + +n=$((n+1)) +echo_i "prime cache shortttl.cname.example (stale-answer-client-timeout off) ($n)" +ret=0 +$DIG -p ${PORT} @10.53.0.3 shortttl.cname.example A > dig.out.test$n +grep "status: NOERROR" dig.out.test$n > /dev/null || ret=1 +grep "ANSWER: 2," dig.out.test$n > /dev/null || ret=1 +grep "shortttl\.cname\.example\..*1.*IN.*CNAME.*longttl\.target\.example\." dig.out.test$n > /dev/null || ret=1 +grep "longttl\.target\.example\..*600.*IN.*A.*10\.53\.0\.2" dig.out.test$n > /dev/null || ret=1 +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status+ret)) + +# Allow RRset to become stale. +sleep 1 + +n=$((n+1)) +echo_i "disable responses from authoritative server ($n)" +ret=0 +$DIG -p ${PORT} @10.53.0.2 txt disable > dig.out.test$n +grep "ANSWER: 1," dig.out.test$n > /dev/null || ret=1 +grep "TXT.\"0\"" dig.out.test$n > /dev/null || ret=1 +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status+ret)) + +n=$((n+1)) +ret=0 +echo_i "check stale shortttl.cname.example comes from cache (stale-answer-client-timeout off) ($n)" +nextpart ns3/named.run > /dev/null +$DIG -p ${PORT} @10.53.0.3 shortttl.cname.example A > dig.out.test$n +wait_for_log 5 "shortttl.cname.example A resolver failure, stale answer used" ns3/named.run || ret=1 +grep "status: NOERROR" dig.out.test$n > /dev/null || ret=1 +grep "EDE: 3 (Stale Answer): (resolver failure)" dig.out.test$n > /dev/null || ret=1 +grep "ANSWER: 2," dig.out.test$n > /dev/null || ret=1 +grep "shortttl\.cname\.example\..*3.*IN.*CNAME.*longttl\.target\.example\." dig.out.test$n > /dev/null || ret=1 +# We can't reliably test the TTL of the longttl.target.example A record. +grep "longttl\.target\.example\..*IN.*A.*10\.53\.0\.2" dig.out.test$n > /dev/null || ret=1 +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status+ret)) + +n=$((n+1)) +echo_i "enable responses from authoritative server ($n)" +ret=0 +$DIG -p ${PORT} @10.53.0.2 txt enable > dig.out.test$n +grep "ANSWER: 1," dig.out.test$n > /dev/null || ret=1 +grep "TXT.\"1\"" dig.out.test$n > /dev/null || ret=1 +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status+ret)) + +n=$((n+1)) +echo_i "check server is alive or restart ($n)" +ret=0 +$RNDCCMD 10.53.0.3 status > rndc.out.test$n 2>&1 || ret=1 +if [ $ret != 0 ]; then + echo_i "failed" + echo_i "restart ns3" + start_server --noclean --restart --port ${PORT} serve-stale ns3 +fi +status=$((status+ret)) + +n=$((n+1)) +echo_i "check server is alive or restart ($n)" +ret=0 +$RNDCCMD 10.53.0.3 status > rndc.out.test$n 2>&1 || ret=1 +if [ $ret != 0 ]; then + echo_i "failed" + echo_i "restart ns3" + start_server --noclean --restart --port ${PORT} serve-stale ns3 +fi +status=$((status+ret)) + ############################################# # Test for stale-answer-client-timeout 0. # ############################################# From 91a1a8efc5bca44ff3aa6861c31759449ea65ecd Mon Sep 17 00:00:00 2001 From: Matthijs Mekking Date: Thu, 17 Nov 2022 13:52:26 +0000 Subject: [PATCH 2/3] Consider non-stale data when in serve-stale mode With 'stale-answer-enable yes;' and 'stale-answer-client-timeout off;', consider the following situation: A CNAME record and its target record are in the cache, then the CNAME record expires, but the target record is still valid. When a new query for the CNAME record arrives, and the query fails, the stale record is used, and then the query "restarts" to follow the CNAME target. The problem is that the query's multiple stale options (like DNS_DBFIND_STALEOK) are not reset, so 'query_lookup()' treats the restarted query as a lookup following a failed lookup, and returns a SERVFAIL answer when there is no stale data found in the cache, even if there is valid non-stale data there available. With this change, query_lookup() now considers non-stale data in the cache in the first place, and returns it if it is available. --- lib/dns/include/dns/rdataset.h | 5 +++++ lib/ns/query.c | 12 +++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/lib/dns/include/dns/rdataset.h b/lib/dns/include/dns/rdataset.h index 6ae21dcc4b..d240d2e019 100644 --- a/lib/dns/include/dns/rdataset.h +++ b/lib/dns/include/dns/rdataset.h @@ -156,6 +156,11 @@ struct dns_rdataset { * * \def DNS_RDATASETATTR_LOADORDER * Output the RRset in load order. + * + * \def DNS_RDATASETATTR_STALE_ADDED + * Set on rdatasets that were added during a stale-answer-client-timeout + * lookup. In other words, the RRset was added during a lookup of stale + * data and does not necessarily mean that the rdataset itself is stale. */ #define DNS_RDATASETATTR_NONE 0x00000000 /*%< No ordering. */ diff --git a/lib/ns/query.c b/lib/ns/query.c index 5bcbe7bb63..faab1340ed 100644 --- a/lib/ns/query.c +++ b/lib/ns/query.c @@ -5883,6 +5883,7 @@ query_lookup(query_ctx_t *qctx) { dns_ttl_t stale_refresh = 0; bool dbfind_stale = false; bool stale_timeout = false; + bool answer_found = false; bool stale_found = false; bool stale_refresh_window = false; uint16_t ede = 0; @@ -5990,6 +5991,14 @@ query_lookup(query_ctx_t *qctx) { */ stale_timeout = ((dboptions & DNS_DBFIND_STALETIMEOUT) != 0); + if (dns_rdataset_isassociated(qctx->rdataset) && + dns_rdataset_count(qctx->rdataset) > 0 && !STALE(qctx->rdataset)) + { + /* Found non-stale usable rdataset. */ + answer_found = true; + goto gotanswer; + } + if (dbfind_stale || stale_refresh_window || stale_timeout) { dns_name_format(qctx->client->query.qname, namebuf, sizeof(namebuf)); @@ -6121,7 +6130,8 @@ query_lookup(query_ctx_t *qctx) { } } - if (stale_timeout && stale_found) { +gotanswer: + if (stale_timeout && (answer_found || stale_found)) { /* * Mark RRsets that we are adding to the client message on a * lookup during 'stale-answer-client-timeout', so we can From 40dee61a1e6c3be351b70fe4a99fbcf0f8a741db Mon Sep 17 00:00:00 2001 From: Aram Sargsyan Date: Thu, 17 Nov 2022 14:21:31 +0000 Subject: [PATCH 3/3] Add a CHANGES note for [GL #3678] --- CHANGES | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CHANGES b/CHANGES index 556633512d..5689c877c9 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,8 @@ +6059. [bug] In some serve stale scenarios, like when following an + expired CNAME record, named could return SERVFAIL if the + previous request wasn't successful. Consider non-stale + data when in serve-stale mode. [GL #3678] + 6058. [bug] Prevent named from crashing when "rndc delzone" attempts to delete a zone added by a catalog zone. [GL #3745]