From c6fd02aed534e21989fafd7b45b132a21c24fdbc Mon Sep 17 00:00:00 2001 From: Matthijs Mekking Date: Tue, 19 Jan 2021 09:04:29 +0100 Subject: [PATCH 1/4] Use stale data also if we are not resuming Before this change, BIND will only fallback to using stale data if there was an actual attempt to resolve the query. Then on a timeout, the stale data from cache becomes eligible. This commit changes this so that on any unexpected error stale data becomes eligble (you would still have to have 'stale-answer-enable' enabled of course). If there is no stale data, this may return in an error again, so don't loop on stale data lookup attempts. If the DNS_DBFIND_STALEOK flag is set, this means we already tried to lookup stale data, so if that is the case, don't use stale again. --- lib/ns/query.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/lib/ns/query.c b/lib/ns/query.c index f413ddee19..2248e4b0b3 100644 --- a/lib/ns/query.c +++ b/lib/ns/query.c @@ -7441,6 +7441,14 @@ root_key_sentinel_return_servfail(query_ctx_t *qctx, isc_result_t result) { */ static bool query_usestale(query_ctx_t *qctx) { + if ((qctx->client->query.dboptions & DNS_DBFIND_STALEOK) != 0) { + /* + * Query was already using stale, if that didn't work the + * last time, it won't work this time either. + */ + return (false); + } + qctx_clean(qctx); qctx_freedata(qctx); @@ -7548,7 +7556,7 @@ query_gotanswer(query_ctx_t *qctx, isc_result_t res) { "query_gotanswer: unexpected error: %s", isc_result_totext(result)); CCTRACE(ISC_LOG_ERROR, errmsg); - if (qctx->resuming && query_usestale(qctx)) { + if (query_usestale(qctx)) { /* * If serve-stale is enabled, query_usestale() already * set up 'qctx' for looking up a stale response. From aabdedeae321b05023f93c6871ad19165c5a4382 Mon Sep 17 00:00:00 2001 From: Matthijs Mekking Date: Wed, 27 Jan 2021 16:59:27 +0100 Subject: [PATCH 2/4] Only start stale refresh window when resuming If we did not attempt a fetch due to fetch-limits, we should not start the stale-refresh-time window. Introduce a new flag DNS_DBFIND_STALESTART to differentiate between a resolver failure and unexpected error. If we are resuming, this indicates a resolver failure, then start the stale-refresh-time window, otherwise don't start the stale-refresh-time window, but still fall back to using stale data. (This commit also wraps some docstrings to 80 characters width) --- lib/dns/include/dns/db.h | 37 ++++++++++++++++++++----------------- lib/dns/rbtdb.c | 9 +++++---- lib/ns/query.c | 8 ++++++++ 3 files changed, 33 insertions(+), 21 deletions(-) diff --git a/lib/dns/include/dns/db.h b/lib/dns/include/dns/db.h index 1e01c43dfd..f77cc797d4 100644 --- a/lib/dns/include/dns/db.h +++ b/lib/dns/include/dns/db.h @@ -241,33 +241,36 @@ struct dns_dbonupdatelistener { #define DNS_DBFIND_NOZONECUT 0x0200 /* - * DNS_DBFIND_STALEOK: This flag is set when BIND fails to refresh a - * RRset due to timeout (resolver-query-timeout), its intent is to - * try to look for stale data in cache as a fallback, but only if - * stale answers are enabled in configuration. - * - * This flag is also used to activate stale-refresh-time window, since it - * is the only way the database knows that a resolution has failed. + * DNS_DBFIND_STALEOK: This flag is set when BIND fails to refresh a RRset due + * to timeout (resolver-query-timeout). Its intent is to try to look for stale + * data in cache as a fallback, but only if stale answers are enabled in + * configuration. */ #define DNS_DBFIND_STALEOK 0x0400 /* - * DNS_DBFIND_STALEENABLED: This flag is used as a hint to the database - * that it may use stale data. It is always set during query lookup if - * stale answers are enabled, but only effectively used during - * stale-refresh-time window. Also during this window, the resolver will - * not try to resolve the query, in other words no attempt to refresh the - * data in cache is made when the stale-refresh-time window is active. + * DNS_DBFIND_STALEENABLED: This flag is used as a hint to the database that + * it may use stale data. It is always set during query lookup if stale + * answers are enabled, but only effectively used during stale-refresh-time + * window. Also during this window, the resolver will not try to resolve the + * query, in other words no attempt to refresh the data in cache is made when + * the stale-refresh-time window is active. */ #define DNS_DBFIND_STALEENABLED 0x0800 /* - * DNS_DBFIND_STALEONLY: This new introduced flag is used when we want - * stale data from the database, but not due to a failure in resolution, - * it also doesn't require stale-refresh-time window timer to be active. - * As long as there is a stale RRset available, it should be returned. + * DNS_DBFIND_STALEONLY: This flag is used when we want stale data from the + * database, but not due to a failure in resolution, it also doesn't require + * stale-refresh-time window timer to be active. As long as there is a stale + * RRset available, it should be returned. */ #define DNS_DBFIND_STALEONLY 0x1000 + +/* + * DNS_DBFIND_STALESTART: This flag is used to activate stale-refresh-time + * window. + */ +#define DNS_DBFIND_STALESTART 0x2000 /*@}*/ /*@{*/ diff --git a/lib/dns/rbtdb.c b/lib/dns/rbtdb.c index 13be34268f..83f6e30c7c 100644 --- a/lib/dns/rbtdb.c +++ b/lib/dns/rbtdb.c @@ -4565,11 +4565,12 @@ check_stale_header(dns_rbtnode_t *node, rdatasetheader_t *header, mark_header_stale(search->rbtdb, header); *header_prev = header; /* - * If DNS_DBFIND_STALEOK is set then it means we failed - * to resolve the name during recursion, in this case we - * mark the time in which the refresh failed. + * If DNS_DBFIND_STALESTART is set then it means we + * failed to resolve the name during recursion, in + * this case we mark the time in which the refresh + * failed. */ - if ((search->options & DNS_DBFIND_STALEOK) != 0) { + if ((search->options & DNS_DBFIND_STALESTART) != 0) { header->last_refresh_fail_ts = search->now; } else if ((search->options & DNS_DBFIND_STALEENABLED) != 0 && diff --git a/lib/ns/query.c b/lib/ns/query.c index 2248e4b0b3..e7a666ef2a 100644 --- a/lib/ns/query.c +++ b/lib/ns/query.c @@ -7560,7 +7560,15 @@ query_gotanswer(query_ctx_t *qctx, isc_result_t res) { /* * If serve-stale is enabled, query_usestale() already * set up 'qctx' for looking up a stale response. + * + * We only need to check if the query timed out or + * something else has gone wrong. If the query timed + * out, we will start the stale-refresh-time window. */ + if (qctx->resuming && result == ISC_R_TIMEDOUT) { + qctx->client->query.dboptions |= + DNS_DBFIND_STALESTART; + } return (query_lookup(qctx)); } From 11b74fc176f29820cbfff22e06ae9d8920b0a21c Mon Sep 17 00:00:00 2001 From: Matthijs Mekking Date: Thu, 28 Jan 2021 12:30:08 +0100 Subject: [PATCH 3/4] Add test for serve-stale /w fetch-limits Add a test case when fetch-limits are reached and we have stale data in cache. This test starts with a positive answer for 'data.example/TXT' in cache. 1. Reload named.conf to set fetch limits. 2. Disable responses from the authoritative server. 3. Now send a batch of queries to the resolver, until hitting the fetch limits. We can detect this by looking at the response RCODE, at some point we will see SERVFAIL responses. 4. At that point we will turn on serve-stale. 5. Clients should see stale answers now. 6. An incoming query should not set the stale-refresh-time window, so a following query should still get a stale answer because of a resolver failure (and not because it was in the stale-refresh-time window). --- bin/tests/system/serve-stale/ans2/ans.pl | 2 +- .../system/serve-stale/ns3/named6.conf.in | 50 ++++++++++ bin/tests/system/serve-stale/tests.sh | 99 +++++++++++++++++++ 3 files changed, 150 insertions(+), 1 deletion(-) create mode 100644 bin/tests/system/serve-stale/ns3/named6.conf.in diff --git a/bin/tests/system/serve-stale/ans2/ans.pl b/bin/tests/system/serve-stale/ans2/ans.pl index 8be8392f28..4bb9f896ff 100644 --- a/bin/tests/system/serve-stale/ans2/ans.pl +++ b/bin/tests/system/serve-stale/ans2/ans.pl @@ -145,7 +145,7 @@ sub reply_handler { $rcode = "NXDOMAIN"; } - # mark the answer as authoritative (by setting the 'aa' flag + # mark the answer as authoritative (by setting the 'aa' flag) return ($rcode, \@ans, \@auth, \@add, { aa => 1 }); } diff --git a/bin/tests/system/serve-stale/ns3/named6.conf.in b/bin/tests/system/serve-stale/ns3/named6.conf.in new file mode 100644 index 0000000000..1aea7c85c2 --- /dev/null +++ b/bin/tests/system/serve-stale/ns3/named6.conf.in @@ -0,0 +1,50 @@ +/* + * Copyright (C) Internet Systems Consortium, Inc. ("ISC") + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + * See the COPYRIGHT file distributed with this work for additional + * information regarding copyright ownership. + */ + + +/* + * Test stale-answer-client-timeout 0. + */ + +key rndc_key { + secret "1234abcd8765"; + algorithm hmac-sha256; +}; + +controls { + inet 10.53.0.3 port @CONTROLPORT@ allow { any; } keys { rndc_key; }; +}; + +options { + query-source address 10.53.0.3; + notify-source 10.53.0.3; + transfer-source 10.53.0.3; + port @PORT@; + pid-file "named.pid"; + listen-on { 10.53.0.3; }; + listen-on-v6 { none; }; + dnssec-validation no; + recursion yes; + stale-answer-enable no; + stale-cache-enable yes; + stale-answer-ttl 3; + stale-answer-client-timeout disabled; + stale-refresh-time 4; + resolver-query-timeout 10; + fetches-per-zone 1 fail; + fetches-per-server 1 fail; + max-stale-ttl 3600; +}; + +zone "." { + type hint; + file "root.db"; +}; diff --git a/bin/tests/system/serve-stale/tests.sh b/bin/tests/system/serve-stale/tests.sh index 92263d86df..47a117308e 100755 --- a/bin/tests/system/serve-stale/tests.sh +++ b/bin/tests/system/serve-stale/tests.sh @@ -2010,5 +2010,104 @@ grep "data\.example\..*[12].*IN.*TXT.*A text record with a 2 second ttl" dig.out if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status+ret)) +#################################################################### +# Test if fetch-limits quota is reached, stale data is served. # +#################################################################### +echo_i "test stale data with fetch-limits" + +n=$((n+1)) +echo_i "updating ns3/named.conf ($n)" +ret=0 +copy_setports ns3/named6.conf.in ns3/named.conf +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status+ret)) + +n=$((n+1)) +echo_i "running 'rndc reload' ($n)" +ret=0 +rndc_reload ns3 10.53.0.3 +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status+ret)) + +# Disable responses from authoritative server. +n=$((n+1)) +echo_i "disable responses from authoritative server ($n)" +ret=0 +$DIG -p ${PORT} @10.53.0.2 txt disable > dig.out.test$n +grep "ANSWER: 1," dig.out.test$n > /dev/null || ret=1 +grep "TXT.\"0\"" dig.out.test$n > /dev/null || ret=1 +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status+ret)) + +# Hit the fetch-limits. +burst() { + num=${1} + rm -f burst.input.$$ + while [ $num -gt 0 ]; do + num=`expr $num - 1` + echo "${num}.data.example A" >> burst.input.$$ + done + $PERL ../ditch.pl -p ${PORT} -s 10.53.0.3 burst.input.$$ + rm -f burst.input.$$ +} + +wait_for_fetchlimits() { + burst 20 + $DIG -p ${PORT} @10.53.0.3 data.example A > dig.out.test$n + grep "status: SERVFAIL" dig.out.test$n > /dev/null || return 1 +} + +n=$((n+1)) +echo_i "hit fetch limits ($n)" +ret=0 +retry_quiet 10 wait_for_fetchlimits || ret=1 +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status+ret)) + +# Allow RRset to become stale. +sleep 2 + +# Turn on serve-stale. +n=$((n+1)) +echo_i "running 'rndc serve-stale on' ($n)" +ret=0 +$RNDCCMD 10.53.0.3 serve-stale on || ret=1 +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status+ret)) + +n=$((n+1)) +echo_i "check 'rndc serve-stale status' ($n)" +ret=0 +$RNDCCMD 10.53.0.3 serve-stale status > rndc.out.test$n 2>&1 || ret=1 +grep '_default: on (rndc) (stale-answer-ttl=3 max-stale-ttl=3600 stale-refresh-time=4)' rndc.out.test$n > /dev/null || ret=1 +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status+ret)) + +# Expect stale data now. +n=$((n+1)) +ret=0 +echo_i "check stale data.example comes from cache (fetch-limits) ($n)" +nextpart ns3/named.run > /dev/null +$DIG -p ${PORT} @10.53.0.3 data.example TXT > dig.out.test$n +wait_for_log 5 "data.example resolver failure, stale answer used" ns3/named.run || ret=1 +grep "status: NOERROR" dig.out.test$n > /dev/null || ret=1 +grep "ANSWER: 1," dig.out.test$n > /dev/null || ret=1 +grep "data\.example\..*3.*IN.*TXT.*A text record with a 2 second ttl" dig.out.test$n > /dev/null || ret=1 +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status+ret)) + +# The previous query should not have started the stale-refresh-time window. +n=$((n+1)) +ret=0 +echo_i "check stale data.example comes from cache again (fetch-limits) ($n)" +nextpart ns3/named.run > /dev/null +$DIG -p ${PORT} @10.53.0.3 data.example TXT > dig.out.test$n +wait_for_log 5 "data.example resolver failure, stale answer used" ns3/named.run || ret=1 +grep "status: NOERROR" dig.out.test$n > /dev/null || ret=1 +grep "ANSWER: 1," dig.out.test$n > /dev/null || ret=1 +grep "data\.example\..*3.*IN.*TXT.*A text record with a 2 second ttl" dig.out.test$n > /dev/null || ret=1 +if [ $ret != 0 ]; then echo_i "failed"; fi +status=$((status+ret)) + echo_i "exit status: $status" [ $status -eq 0 ] || exit 1 From ed8421693c9cd26586b6545f60c77bde006c12d9 Mon Sep 17 00:00:00 2001 From: Matthijs Mekking Date: Thu, 28 Jan 2021 17:02:56 +0100 Subject: [PATCH 4/4] Add notes and change entry for [#2434] This concludes the serve-stale improvements. --- CHANGES | 4 ++++ doc/notes/notes-current.rst | 12 ++++++++---- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/CHANGES b/CHANGES index 44712112f0..63a2c54db1 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,7 @@ +5573. [func] Also return stale data if an error occurred and we are + not resuming. Only start the stale-refresh-time window + if we timed out. [GL #2434] + 5572. [bug] Address potential double free in generatexml. [GL #2420] diff --git a/doc/notes/notes-current.rst b/doc/notes/notes-current.rst index b459256ff6..600911bd2a 100644 --- a/doc/notes/notes-current.rst +++ b/doc/notes/notes-current.rst @@ -24,9 +24,7 @@ Known Issues New Features ~~~~~~~~~~~~ -- None. - -- A new option, ```stale-answer-client-timeout``, has been added to +- A new option, ``stale-answer-client-timeout``, has been added to improve ``named``'s behavior with respect to serving stale data. The option defines the amount of time ``named`` waits before attempting to answer the query with a stale RRset from cache. If a stale answer @@ -41,7 +39,13 @@ New Features The option can be disabled by setting the value to ``off`` or ``disabled``. It also has no effect if ``stale-answer-enable`` is - disabled. + disabled. [GL #2247] + +- Also return stale data if an error occurred and we are not resuming a + query (and serve-stale is enabled). This may happen for example if + ``fetches-per-server`` or ``fetches-per-zone` limits are reached. In this + case, we will try to answer DNS requests with stale data, but not start + the ``stale-refresh-time`` window. [GL #2434] Removed Features ~~~~~~~~~~~~~~~~