Add stale-refresh-time option

Before this update, BIND would attempt to do a full recursive resolution
process for each query received if the requested rrset had its ttl
expired. If the resolution fails for any reason, only then BIND would
check for stale rrset in cache (if 'stale-cache-enable' and
'stale-answer-enable' is on).

The problem with this approach is that if an authoritative server is
unreachable or is failing to respond, it is very unlikely that the
problem will be fixed in the next seconds.

A better approach to improve performance in those cases, is to mark the
moment in which a resolution failed, and if new queries arrive for that
same rrset, try to respond directly from the stale cache, and do that
for a window of time configured via 'stale-refresh-time'.

Only when this interval expires we then try to do a normal refresh of
the rrset.

The logic behind this commit is as following:

- In query.c / query_gotanswer(), if the test of 'result' variable falls
  to the default case, an error is assumed to have happened, and a call
  to 'query_usestale()' is made to check if serving of stale rrset is
  enabled in configuration.

- If serving of stale answers is enabled, a flag will be turned on in
  the query context to look for stale records:
  query.c:6839
  qctx->client->query.dboptions |= DNS_DBFIND_STALEOK;

- A call to query_lookup() will be made again, inside it a call to
  'dns_db_findext()' is made, which in turn will invoke rbdb.c /
  cache_find().

- In rbtdb.c / cache_find() the important bits of this change is the
  call to 'check_stale_header()', which is a function that yields true
  if we should skip the stale entry, or false if we should consider it.

- In check_stale_header() we now check if the DNS_DBFIND_STALEOK option
  is set, if that is the case we know that this new search for stale
  records was made due to a failure in a normal resolution, so we keep
  track of the time in which the failured occured in rbtdb.c:4559:
  header->last_refresh_fail_ts = search->now;

- In check_stale_header(), if DNS_DBFIND_STALEOK is not set, then we
  know this is a normal lookup, if the record is stale and the query
  time is between last failure time + stale-refresh-time window, then
  we return false so cache_find() knows it can consider this stale
  rrset entry to return as a response.

The last additions are two new methods to the database interface:
- setservestale_refresh
- getservestale_refresh

Those were added so rbtdb can be aware of the value set in configuration
option, since in that level we have no access to the view object.
This commit is contained in:
Diego Fronza 2020-10-19 17:02:03 -03:00
parent 04d9ac6321
commit 4827ad0ec4
14 changed files with 206 additions and 6 deletions

View file

@ -196,6 +196,7 @@ options {\n\
servfail-ttl 1;\n\
# sortlist <none>\n\
stale-answer-enable false;\n\
stale-refresh-time 30; /* 30 seconds */\n\
stale-answer-ttl 1; /* 1 second */\n\
stale-cache-enable false;\n\
synth-from-dnssec no;\n\

View file

@ -3897,6 +3897,7 @@ configure_view(dns_view_t *view, dns_viewlist_t *viewlist, cfg_obj_t *config,
size_t max_adb_size;
uint32_t lame_ttl, fail_ttl;
uint32_t max_stale_ttl = 0;
uint32_t stale_refresh_time = 0;
dns_tsig_keyring_t *ring = NULL;
dns_view_t *pview = NULL; /* Production view */
isc_mem_t *cmctx = NULL, *hmctx = NULL;
@ -4395,6 +4396,11 @@ configure_view(dns_view_t *view, dns_viewlist_t *viewlist, cfg_obj_t *config,
view->staleanswersok = dns_stale_answer_conf;
}
obj = NULL;
result = named_config_get(maps, "stale-refresh-time", &obj);
INSIST(result == ISC_R_SUCCESS);
stale_refresh_time = cfg_obj_asduration(obj);
/*
* Configure the view's cache.
*
@ -4529,6 +4535,7 @@ configure_view(dns_view_t *view, dns_viewlist_t *viewlist, cfg_obj_t *config,
dns_cache_setcachesize(cache, max_cache_size);
dns_cache_setservestalettl(cache, max_stale_ttl);
dns_cache_setservestalerefresh(cache, stale_refresh_time);
dns_cache_detach(&cache);

View file

@ -589,6 +589,8 @@ static dns_dbmethods_t sampledb_methods = {
NULL, /* getsize */
NULL, /* setservestalettl */
NULL, /* getservestalettl */
NULL, /* setservestalerefresh */
NULL, /* getservestalerefresh */
NULL, /* setgluecachestats */
NULL /* adjusthashsize */
};

View file

@ -999,6 +999,13 @@ dns_cache_getservestalettl(dns_cache_t *cache) {
return (result == ISC_R_SUCCESS ? ttl : 0);
}
void
dns_cache_setservestalerefresh(dns_cache_t *cache, uint32_t interval) {
REQUIRE(VALID_CACHE(cache));
(void)dns_db_setservestalerefresh(cache->db, interval);
}
/*
* The cleaner task is shutting down; do the necessary cleanup.
*/

View file

@ -1089,6 +1089,28 @@ dns_db_getservestalettl(dns_db_t *db, dns_ttl_t *ttl) {
return (ISC_R_NOTIMPLEMENTED);
}
isc_result_t
dns_db_setservestalerefresh(dns_db_t *db, uint32_t interval) {
REQUIRE(DNS_DB_VALID(db));
REQUIRE((db->attributes & DNS_DBATTR_CACHE) != 0);
if (db->methods->setservestalerefresh != NULL) {
return ((db->methods->setservestalerefresh)(db, interval));
}
return (ISC_R_NOTIMPLEMENTED);
}
isc_result_t
dns_db_getservestalerefresh(dns_db_t *db, uint32_t *interval) {
REQUIRE(DNS_DB_VALID(db));
REQUIRE((db->attributes & DNS_DBATTR_CACHE) != 0);
if (db->methods->getservestalerefresh != NULL) {
return ((db->methods->getservestalerefresh)(db, interval));
}
return (ISC_R_NOTIMPLEMENTED);
}
isc_result_t
dns_db_setgluecachestats(dns_db_t *db, isc_stats_t *stats) {
REQUIRE(dns_db_iszone(db));

View file

@ -967,6 +967,8 @@ static dns_dbmethods_t rpsdb_db_methods = {
NULL, /* getsize */
NULL, /* setservestalettl */
NULL, /* getservestalettl */
NULL, /* setservestalerefresh */
NULL, /* getservestalerefresh */
NULL, /* setgluecachestats */
NULL /* adjusthashsize */
};

View file

@ -255,6 +255,18 @@ dns_cache_getservestalettl(dns_cache_t *cache);
*\li 'cache' to be valid.
*/
void
dns_cache_setservestalerefresh(dns_cache_t *cache, uint32_t interval);
/*%<
* Sets the length of time to wait before attempting to refresh a rrset
* if a previous attempt in doing so has failed.
* During this time window if stale rrset are available in cache they
* will be directly returned to client.
*
* Requires:
*\li 'cache' to be valid.
*/
isc_result_t
dns_cache_flush(dns_cache_t *cache);
/*%<

View file

@ -178,6 +178,8 @@ typedef struct dns_dbmethods {
uint64_t *records, uint64_t *bytes);
isc_result_t (*setservestalettl)(dns_db_t *db, dns_ttl_t ttl);
isc_result_t (*getservestalettl)(dns_db_t *db, dns_ttl_t *ttl);
isc_result_t (*setservestalerefresh)(dns_db_t *db, uint32_t interval);
isc_result_t (*getservestalerefresh)(dns_db_t *db, uint32_t *interval);
isc_result_t (*setgluecachestats)(dns_db_t *db, isc_stats_t *stats);
isc_result_t (*adjusthashsize)(dns_db_t *db, size_t size);
} dns_dbmethods_t;
@ -238,6 +240,7 @@ struct dns_dbonupdatelistener {
#define DNS_DBFIND_ADDITIONALOK 0x0100
#define DNS_DBFIND_NOZONECUT 0x0200
#define DNS_DBFIND_STALEOK 0x0400
#define DNS_DBFIND_STALEENABLED 0x0800
/*@}*/
/*@{*/
@ -1701,6 +1704,39 @@ dns_db_getservestalettl(dns_db_t *db, dns_ttl_t *ttl);
* \li #ISC_R_NOTIMPLEMENTED - Not supported by this DB implementation.
*/
isc_result_t
dns_db_setservestalerefresh(dns_db_t *db, uint32_t interval);
/*%<
* Sets the length of time to wait before attempting to refresh a rrset
* if a previous attempt in doing so has failed.
* During this time window if stale rrset are available in cache they
* will be directly returned to client.
*
* Requires:
* \li 'db' is a valid cache database.
* \li 'interval' is number of seconds before attempting to refresh data.
*
* Returns:
* \li #ISC_R_SUCCESS
* \li #ISC_R_NOTIMPLEMENTED - Not supported by this DB implementation.
*/
isc_result_t
dns_db_getservestalerefresh(dns_db_t *db, uint32_t *interval);
/*%<
* Gets the length of time in which stale answers are directly returned from
* cache before attempting to refresh them, in case a previous attempt in
* doing so has failed.
*
* Requires:
* \li 'db' is a valid cache database.
* \li 'interval' is number of seconds before attempting to refresh data.
*
* Returns:
* \li #ISC_R_SUCCESS
* \li #ISC_R_NOTIMPLEMENTED - Not supported by this DB implementation.
*/
isc_result_t
dns_db_setgluecachestats(dns_db_t *db, isc_stats_t *stats);
/*%<

View file

@ -205,6 +205,7 @@ typedef struct rdatasetheader {
rbtdb_rdatatype_t type;
atomic_uint_least16_t attributes;
dns_trust_t trust;
isc_stdtime_t last_refresh_fail_ts;
struct noqname *noqname;
struct noqname *closest;
unsigned int is_mmapped : 1;
@ -488,6 +489,13 @@ struct dns_rbtdb {
*/
dns_ttl_t serve_stale_ttl;
/*
* The time after a failed lookup, where stale answers from cache
* may be used directly in a DNS response without attempting a
* new iterative lookup.
*/
uint32_t serve_stale_refresh;
/*
* This is a linked list used to implement the LRU cache. There will
* be node_lock_count linked lists here. Nodes in bucket 1 will be
@ -4547,6 +4555,27 @@ check_stale_header(dns_rbtnode_t *node, rdatasetheader_t *header,
stale > search->now) {
mark_header_stale(search->rbtdb, header);
*header_prev = header;
/*
* If DNS_DBFIND_STALEOK is set then it means we failed
* to resolve the name during recursion, in this case we
* mark the time in which the refresh failed.
*/
if ((search->options & DNS_DBFIND_STALEOK) != 0) {
header->last_refresh_fail_ts = search->now;
} else if ((search->options &
DNS_DBFIND_STALEENABLED) != 0 &&
search->now <
(header->last_refresh_fail_ts +
search->rbtdb->serve_stale_refresh))
{
/*
* If we are within interval between last
* refresh failure time + 'stale-refresh-time',
* then don't skip this stale entry but use it
* instead.
*/
return (false);
}
return ((search->options & DNS_DBFIND_STALEOK) == 0);
}
@ -8379,6 +8408,29 @@ getservestalettl(dns_db_t *db, dns_ttl_t *ttl) {
return (ISC_R_SUCCESS);
}
static isc_result_t
setservestalerefresh(dns_db_t *db, uint32_t interval) {
dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
REQUIRE(VALID_RBTDB(rbtdb));
REQUIRE(IS_CACHE(rbtdb));
/* currently no bounds checking. 0 means disable. */
rbtdb->serve_stale_refresh = interval;
return (ISC_R_SUCCESS);
}
static isc_result_t
getservestalerefresh(dns_db_t *db, uint32_t *interval) {
dns_rbtdb_t *rbtdb = (dns_rbtdb_t *)db;
REQUIRE(VALID_RBTDB(rbtdb));
REQUIRE(IS_CACHE(rbtdb));
*interval = rbtdb->serve_stale_refresh;
return (ISC_R_SUCCESS);
}
static dns_dbmethods_t zone_methods = { attach,
detach,
beginload,
@ -8426,6 +8478,8 @@ static dns_dbmethods_t zone_methods = { attach,
getsize,
NULL, /* setservestalettl */
NULL, /* getservestalettl */
NULL, /* setservestalerefresh */
NULL, /* getservestalerefresh */
setgluecachestats,
adjusthashsize };
@ -8476,6 +8530,8 @@ static dns_dbmethods_t cache_methods = { attach,
NULL, /* getsize */
setservestalettl,
getservestalettl,
setservestalerefresh,
getservestalerefresh,
NULL,
adjusthashsize };

View file

@ -1309,6 +1309,8 @@ static dns_dbmethods_t sdb_methods = {
NULL, /* getsize */
NULL, /* setservestalettl */
NULL, /* getservestalettl */
NULL, /* setservestalerefresh */
NULL, /* getservestalerefresh */
NULL, /* setgluecachestats */
NULL /* adjusthashsize */
};

View file

@ -1281,6 +1281,8 @@ static dns_dbmethods_t sdlzdb_methods = {
NULL, /* getsize */
NULL, /* setservestalettl */
NULL, /* getservestalettl */
NULL, /* setservestalerefresh */
NULL, /* getservestalerefresh */
NULL, /* setgluecachestats */
NULL /* adjusthashsize */
};

View file

@ -82,6 +82,7 @@ dns_cache_flushname
dns_cache_flushnode
dns_cache_getcachesize
dns_cache_getname
dns_cache_getservestalerefresh
dns_cache_getservestalettl
dns_cache_getstats
dns_cache_load
@ -93,6 +94,7 @@ dns_cache_renderxml
@END LIBXML2
dns_cache_setcachesize
dns_cache_setfilename
dns_cache_setservestalerefresh
dns_cache_setservestalettl
dns_cache_updatestats
dns_catz_add_zone
@ -198,6 +200,7 @@ dns_db_getnsec3parameters
dns_db_getoriginnode
dns_db_getrrsetstats
dns_db_getservestalettl
dns_db_getservestalerefresh
dns_db_getsigningtime
dns_db_getsize
dns_db_getsoaserial
@ -223,6 +226,7 @@ dns_db_serialize
dns_db_setcachestats
dns_db_setgluecachestats
dns_db_setservestalettl
dns_db_setservestalerefresh
dns_db_setsigningtime
dns_db_settask
dns_db_subtractrdataset

View file

@ -2051,6 +2051,7 @@ static cfg_clausedef_t view_clauses[] = {
{ "stale-answer-enable", &cfg_type_boolean, 0 },
{ "stale-answer-ttl", &cfg_type_duration, 0 },
{ "stale-cache-enable", &cfg_type_boolean, 0 },
{ "stale-refresh-time", &cfg_type_duration, 0 },
{ "suppress-initial-notify", &cfg_type_boolean, CFG_CLAUSEFLAG_NYI },
{ "synth-from-dnssec", &cfg_type_boolean, 0 },
{ "topology", &cfg_type_bracketed_aml, CFG_CLAUSEFLAG_ANCIENT },

View file

@ -5523,6 +5523,9 @@ query_lookup(query_ctx_t *qctx) {
dns_clientinfo_t ci;
dns_name_t *rpzqname = NULL;
unsigned int dboptions;
dns_ttl_t stale_ttl = 0;
dns_ttl_t stale_refresh = 0;
bool dbfind_stale = false;
CCTRACE(ISC_LOG_DEBUG(3), "query_lookup");
@ -5581,6 +5584,22 @@ query_lookup(query_ctx_t *qctx) {
dboptions |= DNS_DBFIND_COVERINGNSEC;
}
dns_db_getservestalerefresh(qctx->client->view->cachedb,
&stale_refresh);
dns_db_getservestalettl(qctx->client->view->cachedb, &stale_ttl);
if (stale_refresh > 0) {
if (qctx->client->view->staleanswersok == dns_stale_answer_yes)
{
dboptions |= DNS_DBFIND_STALEENABLED;
} else if (qctx->client->view->staleanswersok ==
dns_stale_answer_conf) {
if (qctx->client->view->staleanswersenable &&
stale_ttl > 0) {
dboptions |= DNS_DBFIND_STALEENABLED;
}
}
}
result = dns_db_findext(qctx->db, rpzqname, qctx->version, qctx->type,
dboptions, qctx->client->now, &qctx->node,
qctx->fname, &cm, &ci, qctx->rdataset,
@ -5601,10 +5620,28 @@ query_lookup(query_ctx_t *qctx) {
dns_cache_updatestats(qctx->view->cache, result);
}
if ((qctx->client->query.dboptions & DNS_DBFIND_STALEOK) != 0) {
/*
* If DNS_DBFIND_STALEOK is set this means we are dealing with a
* lookup following a failed lookup and it is okay to serve a stale
* answer. This will start a time window in rbtdb, tracking the last
* time the RRset lookup failed.
*
* A stale answer may also be served if this is a normal lookup,
* the view has enabled serve-stale (DNS_DBFIND_STALE_ENABLED is set),
* and the request is within the stale-refresh-time window. If this
* is the case we have to make sure that the lookup found a stale
* answer, otherwise "fresh" answers are also treated as stale.
*/
dbfind_stale = ((dboptions & DNS_DBFIND_STALEOK) != 0);
if (dbfind_stale != 0 ||
(((dboptions & DNS_DBFIND_STALEENABLED) != 0) &&
STALE(qctx->rdataset)))
{
char namebuf[DNS_NAME_FORMATSIZE];
bool success;
inc_stats(qctx->client, ns_statscounter_trystale);
qctx->client->query.dboptions &= ~DNS_DBFIND_STALEOK;
if (dns_rdataset_isassociated(qctx->rdataset) &&
dns_rdataset_count(qctx->rdataset) > 0 &&
@ -5618,10 +5655,20 @@ query_lookup(query_ctx_t *qctx) {
dns_name_format(qctx->client->query.qname, namebuf,
sizeof(namebuf));
isc_log_write(ns_lctx, NS_LOGCATEGORY_SERVE_STALE,
NS_LOGMODULE_QUERY, ISC_LOG_INFO,
"%s resolver failure, stale answer %s", namebuf,
success ? "used" : "unavailable");
if (dbfind_stale) {
isc_log_write(ns_lctx, NS_LOGCATEGORY_SERVE_STALE,
NS_LOGMODULE_QUERY, ISC_LOG_INFO,
"%s resolver failure, stale answer %s",
namebuf,
success ? "used" : "unavailable");
} else {
isc_log_write(ns_lctx, NS_LOGCATEGORY_SERVE_STALE,
NS_LOGMODULE_QUERY, ISC_LOG_INFO,
"%s query within stale refresh time, "
"stale answer %s",
namebuf,
success ? "used" : "unavailable");
}
if (!success) {
QUERY_ERROR(qctx, DNS_R_SERVFAIL);
@ -6833,7 +6880,6 @@ query_usestale(query_ctx_t *qctx) {
if (staleanswersok) {
qctx->client->query.dboptions |= DNS_DBFIND_STALEOK;
inc_stats(qctx->client, ns_statscounter_trystale);
if (qctx->client->query.fetch != NULL) {
dns_resolver_destroyfetch(&qctx->client->query.fetch);
}