Merge branch '2434-fetch-limit-serve-stale' into 'main'

Resolve "Serve stale when fetch limits are hit"

Closes #2434

See merge request isc-projects/bind9!4607
This commit is contained in:
Matthijs Mekking 2021-01-28 16:57:20 +00:00
commit 0fab6cf88e
8 changed files with 204 additions and 27 deletions

View file

@ -1,3 +1,7 @@
5573. [func] Also return stale data if an error occurred and we are
not resuming. Only start the stale-refresh-time window
if we timed out. [GL #2434]
5572. [bug] Address potential double free in generatexml.
[GL #2420]

View file

@ -145,7 +145,7 @@ sub reply_handler {
$rcode = "NXDOMAIN";
}
# mark the answer as authoritative (by setting the 'aa' flag
# mark the answer as authoritative (by setting the 'aa' flag)
return ($rcode, \@ans, \@auth, \@add, { aa => 1 });
}

View file

@ -0,0 +1,50 @@
/*
* Copyright (C) Internet Systems Consortium, Inc. ("ISC")
*
* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
*
* See the COPYRIGHT file distributed with this work for additional
* information regarding copyright ownership.
*/
/*
* Test stale-answer-client-timeout 0.
*/
key rndc_key {
secret "1234abcd8765";
algorithm hmac-sha256;
};
controls {
inet 10.53.0.3 port @CONTROLPORT@ allow { any; } keys { rndc_key; };
};
options {
query-source address 10.53.0.3;
notify-source 10.53.0.3;
transfer-source 10.53.0.3;
port @PORT@;
pid-file "named.pid";
listen-on { 10.53.0.3; };
listen-on-v6 { none; };
dnssec-validation no;
recursion yes;
stale-answer-enable no;
stale-cache-enable yes;
stale-answer-ttl 3;
stale-answer-client-timeout disabled;
stale-refresh-time 4;
resolver-query-timeout 10;
fetches-per-zone 1 fail;
fetches-per-server 1 fail;
max-stale-ttl 3600;
};
zone "." {
type hint;
file "root.db";
};

View file

@ -2010,5 +2010,104 @@ grep "data\.example\..*[12].*IN.*TXT.*A text record with a 2 second ttl" dig.out
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status+ret))
####################################################################
# Test if fetch-limits quota is reached, stale data is served. #
####################################################################
echo_i "test stale data with fetch-limits"
n=$((n+1))
echo_i "updating ns3/named.conf ($n)"
ret=0
copy_setports ns3/named6.conf.in ns3/named.conf
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status+ret))
n=$((n+1))
echo_i "running 'rndc reload' ($n)"
ret=0
rndc_reload ns3 10.53.0.3
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status+ret))
# Disable responses from authoritative server.
n=$((n+1))
echo_i "disable responses from authoritative server ($n)"
ret=0
$DIG -p ${PORT} @10.53.0.2 txt disable > dig.out.test$n
grep "ANSWER: 1," dig.out.test$n > /dev/null || ret=1
grep "TXT.\"0\"" dig.out.test$n > /dev/null || ret=1
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status+ret))
# Hit the fetch-limits.
burst() {
num=${1}
rm -f burst.input.$$
while [ $num -gt 0 ]; do
num=`expr $num - 1`
echo "${num}.data.example A" >> burst.input.$$
done
$PERL ../ditch.pl -p ${PORT} -s 10.53.0.3 burst.input.$$
rm -f burst.input.$$
}
wait_for_fetchlimits() {
burst 20
$DIG -p ${PORT} @10.53.0.3 data.example A > dig.out.test$n
grep "status: SERVFAIL" dig.out.test$n > /dev/null || return 1
}
n=$((n+1))
echo_i "hit fetch limits ($n)"
ret=0
retry_quiet 10 wait_for_fetchlimits || ret=1
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status+ret))
# Allow RRset to become stale.
sleep 2
# Turn on serve-stale.
n=$((n+1))
echo_i "running 'rndc serve-stale on' ($n)"
ret=0
$RNDCCMD 10.53.0.3 serve-stale on || ret=1
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status+ret))
n=$((n+1))
echo_i "check 'rndc serve-stale status' ($n)"
ret=0
$RNDCCMD 10.53.0.3 serve-stale status > rndc.out.test$n 2>&1 || ret=1
grep '_default: on (rndc) (stale-answer-ttl=3 max-stale-ttl=3600 stale-refresh-time=4)' rndc.out.test$n > /dev/null || ret=1
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status+ret))
# Expect stale data now.
n=$((n+1))
ret=0
echo_i "check stale data.example comes from cache (fetch-limits) ($n)"
nextpart ns3/named.run > /dev/null
$DIG -p ${PORT} @10.53.0.3 data.example TXT > dig.out.test$n
wait_for_log 5 "data.example resolver failure, stale answer used" ns3/named.run || ret=1
grep "status: NOERROR" dig.out.test$n > /dev/null || ret=1
grep "ANSWER: 1," dig.out.test$n > /dev/null || ret=1
grep "data\.example\..*3.*IN.*TXT.*A text record with a 2 second ttl" dig.out.test$n > /dev/null || ret=1
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status+ret))
# The previous query should not have started the stale-refresh-time window.
n=$((n+1))
ret=0
echo_i "check stale data.example comes from cache again (fetch-limits) ($n)"
nextpart ns3/named.run > /dev/null
$DIG -p ${PORT} @10.53.0.3 data.example TXT > dig.out.test$n
wait_for_log 5 "data.example resolver failure, stale answer used" ns3/named.run || ret=1
grep "status: NOERROR" dig.out.test$n > /dev/null || ret=1
grep "ANSWER: 1," dig.out.test$n > /dev/null || ret=1
grep "data\.example\..*3.*IN.*TXT.*A text record with a 2 second ttl" dig.out.test$n > /dev/null || ret=1
if [ $ret != 0 ]; then echo_i "failed"; fi
status=$((status+ret))
echo_i "exit status: $status"
[ $status -eq 0 ] || exit 1

View file

@ -24,9 +24,7 @@ Known Issues
New Features
~~~~~~~~~~~~
- None.
- A new option, ```stale-answer-client-timeout``, has been added to
- A new option, ``stale-answer-client-timeout``, has been added to
improve ``named``'s behavior with respect to serving stale data. The option
defines the amount of time ``named`` waits before attempting
to answer the query with a stale RRset from cache. If a stale answer
@ -41,7 +39,13 @@ New Features
The option can be disabled by setting the value to ``off`` or
``disabled``. It also has no effect if ``stale-answer-enable`` is
disabled.
disabled. [GL #2247]
- Also return stale data if an error occurred and we are not resuming a
query (and serve-stale is enabled). This may happen for example if
``fetches-per-server`` or ``fetches-per-zone` limits are reached. In this
case, we will try to answer DNS requests with stale data, but not start
the ``stale-refresh-time`` window. [GL #2434]
Removed Features
~~~~~~~~~~~~~~~~

View file

@ -241,33 +241,36 @@ struct dns_dbonupdatelistener {
#define DNS_DBFIND_NOZONECUT 0x0200
/*
* DNS_DBFIND_STALEOK: This flag is set when BIND fails to refresh a
* RRset due to timeout (resolver-query-timeout), its intent is to
* try to look for stale data in cache as a fallback, but only if
* stale answers are enabled in configuration.
*
* This flag is also used to activate stale-refresh-time window, since it
* is the only way the database knows that a resolution has failed.
* DNS_DBFIND_STALEOK: This flag is set when BIND fails to refresh a RRset due
* to timeout (resolver-query-timeout). Its intent is to try to look for stale
* data in cache as a fallback, but only if stale answers are enabled in
* configuration.
*/
#define DNS_DBFIND_STALEOK 0x0400
/*
* DNS_DBFIND_STALEENABLED: This flag is used as a hint to the database
* that it may use stale data. It is always set during query lookup if
* stale answers are enabled, but only effectively used during
* stale-refresh-time window. Also during this window, the resolver will
* not try to resolve the query, in other words no attempt to refresh the
* data in cache is made when the stale-refresh-time window is active.
* DNS_DBFIND_STALEENABLED: This flag is used as a hint to the database that
* it may use stale data. It is always set during query lookup if stale
* answers are enabled, but only effectively used during stale-refresh-time
* window. Also during this window, the resolver will not try to resolve the
* query, in other words no attempt to refresh the data in cache is made when
* the stale-refresh-time window is active.
*/
#define DNS_DBFIND_STALEENABLED 0x0800
/*
* DNS_DBFIND_STALEONLY: This new introduced flag is used when we want
* stale data from the database, but not due to a failure in resolution,
* it also doesn't require stale-refresh-time window timer to be active.
* As long as there is a stale RRset available, it should be returned.
* DNS_DBFIND_STALEONLY: This flag is used when we want stale data from the
* database, but not due to a failure in resolution, it also doesn't require
* stale-refresh-time window timer to be active. As long as there is a stale
* RRset available, it should be returned.
*/
#define DNS_DBFIND_STALEONLY 0x1000
/*
* DNS_DBFIND_STALESTART: This flag is used to activate stale-refresh-time
* window.
*/
#define DNS_DBFIND_STALESTART 0x2000
/*@}*/
/*@{*/

View file

@ -4565,11 +4565,12 @@ check_stale_header(dns_rbtnode_t *node, rdatasetheader_t *header,
mark_header_stale(search->rbtdb, header);
*header_prev = header;
/*
* If DNS_DBFIND_STALEOK is set then it means we failed
* to resolve the name during recursion, in this case we
* mark the time in which the refresh failed.
* If DNS_DBFIND_STALESTART is set then it means we
* failed to resolve the name during recursion, in
* this case we mark the time in which the refresh
* failed.
*/
if ((search->options & DNS_DBFIND_STALEOK) != 0) {
if ((search->options & DNS_DBFIND_STALESTART) != 0) {
header->last_refresh_fail_ts = search->now;
} else if ((search->options &
DNS_DBFIND_STALEENABLED) != 0 &&

View file

@ -7441,6 +7441,14 @@ root_key_sentinel_return_servfail(query_ctx_t *qctx, isc_result_t result) {
*/
static bool
query_usestale(query_ctx_t *qctx) {
if ((qctx->client->query.dboptions & DNS_DBFIND_STALEOK) != 0) {
/*
* Query was already using stale, if that didn't work the
* last time, it won't work this time either.
*/
return (false);
}
qctx_clean(qctx);
qctx_freedata(qctx);
@ -7548,11 +7556,19 @@ query_gotanswer(query_ctx_t *qctx, isc_result_t res) {
"query_gotanswer: unexpected error: %s",
isc_result_totext(result));
CCTRACE(ISC_LOG_ERROR, errmsg);
if (qctx->resuming && query_usestale(qctx)) {
if (query_usestale(qctx)) {
/*
* If serve-stale is enabled, query_usestale() already
* set up 'qctx' for looking up a stale response.
*
* We only need to check if the query timed out or
* something else has gone wrong. If the query timed
* out, we will start the stale-refresh-time window.
*/
if (qctx->resuming && result == ISC_R_TIMEDOUT) {
qctx->client->query.dboptions |=
DNS_DBFIND_STALESTART;
}
return (query_lookup(qctx));
}