Serve expired cache update fixes (#1174)

- Fixes a regression bug with serve-expired that appeared in 1.22.0
  and would not allow the iterator to update the cache with
  not-yet-validated entries resulting in increased outgoing traffic.

- Treat serve_expired_norec_ttl as a backoff timer for failed updates of expired records.
- Try to use expired answers instead of SERVFAIL if serve-expired is
  enabled even without serve-expired-client-timeout.
- Add suggestion to refresh the cached norec_ttl and expired_ttl when a
  response cannot update the usable expired entry.
This commit is contained in:
Yorgos Thessalonikefs 2024-12-31 16:28:12 +01:00 committed by GitHub
parent e57e537c85
commit fff9f62a1e
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 170 additions and 46 deletions

View file

@ -1845,10 +1845,10 @@ lookup_cache:
* its qname must be that used for cache
* lookup. */
if((worker->env.cfg->prefetch &&
*worker->env.now >= rep->prefetch_ttl) ||
rep->prefetch_ttl <= *worker->env.now) ||
(worker->env.cfg->serve_expired &&
*worker->env.now > rep->ttl)) {
rep->ttl < *worker->env.now &&
!(*worker->env.now < rep->serve_expired_norec_ttl))) {
time_t leeway = rep->ttl - *worker->env.now;
if(rep->ttl < *worker->env.now)
leeway = 0;

41
services/cache/dns.c vendored
View file

@ -1056,7 +1056,7 @@ dns_cache_lookup(struct module_env* env,
int
dns_cache_store(struct module_env* env, struct query_info* msgqinf,
struct reply_info* msgrep, int is_referral, time_t leeway, int pside,
struct reply_info* msgrep, int is_referral, time_t leeway, int pside,
struct regional* region, uint32_t flags, time_t qstarttime,
int is_valrec)
{
@ -1066,7 +1066,7 @@ dns_cache_store(struct module_env* env, struct query_info* msgqinf,
* useful expired record exists. */
struct msgreply_entry* e = msg_cache_lookup(env,
msgqinf->qname, msgqinf->qname_len, msgqinf->qtype,
msgqinf->qclass, flags, 0, 0);
msgqinf->qclass, flags, 0, 1);
if(e) {
struct reply_info* cached = e->entry.data;
if(cached->ttl < *env->now
@ -1081,7 +1081,42 @@ dns_cache_store(struct module_env* env, struct query_info* msgqinf,
&& cached->security != sec_status_bogus
&& (env->need_to_validate &&
msgrep->security == sec_status_unchecked)
&& !is_valrec) {
/* Exceptions to that rule are:
* o recursions that don't need validation but
* need to update the cache for coherence
* (delegation information while iterating,
* DNSKEY and DS lookups from validator)
* o explicit RRSIG queries that are not
* validated. */
&& !is_valrec
&& msgqinf->qtype != LDNS_RR_TYPE_RRSIG) {
if((int)FLAGS_GET_RCODE(msgrep->flags) !=
LDNS_RCODE_NOERROR &&
(int)FLAGS_GET_RCODE(msgrep->flags) !=
LDNS_RCODE_NXDOMAIN) {
/* The current response has an
* erroneous rcode. Adjust norec time
* so that additional lookups are not
* performed for some time. */
verbose(VERB_ALGO, "set "
"serve-expired-norec-ttl for "
"response in cache");
cached->serve_expired_norec_ttl =
NORR_TTL + *env->now;
if(env->cfg->serve_expired_ttl_reset &&
cached->serve_expired_ttl
< *env->now +
env->cfg->serve_expired_ttl) {
/* Reset serve-expired-ttl for
* valid response in cache. */
verbose(VERB_ALGO, "reset "
"serve-expired-ttl "
"for response in cache");
cached->serve_expired_ttl =
*env->now +
env->cfg->serve_expired_ttl;
}
}
verbose(VERB_ALGO, "a validated expired entry "
"could be overwritten, skip caching "
"the new message at this stage");

View file

@ -1512,8 +1512,10 @@ void mesh_query_done(struct mesh_state* mstate)
}
if(mstate->s.return_rcode == LDNS_RCODE_SERVFAIL ||
(rep && FLAGS_GET_RCODE(rep->flags) == LDNS_RCODE_SERVFAIL)) {
/* we are SERVFAILing; check for expired answer here */
mesh_serve_expired_callback(mstate);
if(mstate->s.env->cfg->serve_expired) {
/* we are SERVFAILing; check for expired answer here */
mesh_respond_serve_expired(mstate);
}
if((mstate->reply_list || mstate->cb_list)
&& mstate->s.env->cfg->log_servfail
&& !mstate->s.env->cfg->val_log_squelch) {

View file

@ -9,7 +9,6 @@ server:
ede: yes
ede-serve-expired: yes
stub-zone:
name: "example.com"
stub-addr: 1.2.3.4
@ -25,8 +24,10 @@ SCENARIO_BEGIN Test serve-expired with client-timeout and a SERVFAIL upstream re
; - check that we get the expired cached answer
; - query again (the answer is available on the upstream server now)
; - check that we get the immediate expired answer back instead
; - (the upstream query does happen after the expired reply and updates the cache)
; - query again (the upstream has no answer)
; - query again (the answer is available on the upstream server now)
; - check that we *still* get the immediate expired answer back instead, recursion is blocked for NORR_TTL(5)
; - wait for NORR_TTL(5) to expire
; - query again
; - check that we get the freshly cached answer
; ns.example.com.
@ -73,7 +74,7 @@ RANGE_BEGIN 30 40
RANGE_END
; ns.example.com.
RANGE_BEGIN 50 60
RANGE_BEGIN 50 100
ADDRESS 1.2.3.4
; response to A query
ENTRY_BEGIN
@ -148,12 +149,8 @@ ENTRY_BEGIN
example.com. IN A
ENTRY_END
; Allow for upstream query to resolve.
STEP 51 TRAFFIC
; Check that we got an immediate stale answer because of the previous failure,
; regardless if upstream has the answer already in this range. The query will
; be resolved after the immediate cached answer and will cache the result.
; regardless if upstream has the answer already in this range.
STEP 60 CHECK_ANSWER
ENTRY_BEGIN
MATCH all ttl ede=3
@ -170,15 +167,42 @@ ENTRY_END
; Query again
STEP 70 QUERY
ENTRY_BEGIN
REPLY RD DO
SECTION QUESTION
example.com. IN A
ENTRY_END
; Check that we still get the immediate stale answer because of the previous failure,
; regardless if upstream has the answer already in this range. NORR_TTL(5) blocks us from
; recursion.
STEP 80 CHECK_ANSWER
ENTRY_BEGIN
MATCH all ttl ede=3
REPLY QR RD RA DO NOERROR
SECTION QUESTION
example.com. IN A
SECTION ANSWER
example.com. 123 IN A 5.6.7.8
SECTION AUTHORITY
example.com. 123 IN NS ns.example.com.
SECTION ADDITIONAL
ns.example.com. 123 IN A 1.2.3.4
ENTRY_END
; Let NORR_TTL(5) expire
STEP 81 TIME_PASSES ELAPSE 5
; Query again
STEP 90 QUERY
ENTRY_BEGIN
REPLY RD
SECTION QUESTION
example.com. IN A
ENTRY_END
; Check that we got the cached updated answer from the previous step since
; there is no upstream in this range.
STEP 80 CHECK_ANSWER
; Check fresh reply
STEP 100 CHECK_ANSWER
ENTRY_BEGIN
MATCH all ttl
REPLY QR RD RA NOERROR

View file

@ -30,13 +30,13 @@ SCENARIO_BEGIN Test serve-expired with client-timeout and bogus answer
; - wait for the record to expire
; - (upstream now has a bogus response)
; - query again for www.example.com. IN A
; - check that we get the expired valid response instead
; - query once more
; - check that we get the expired valid response instead; recursion is blocked for NORR_TTL(5) because of the failure
; - (upstream has the valid response again)
; - query once more
; - check that we get the immediate expired valid response
; - (the prefetch query updates the cache with the valid response)
; - let NORR_TTL(5) expire
; - query one last time
; - check that we get the immediate valid cache response; upstream does not have an answer at this moment
; - check that we get the immediate valid cache response
; The example.com NS and ns.example.com A record are commented out.
; This to make the test succeed. It then keeps the dnssec valid lookup.
@ -199,7 +199,7 @@ RANGE_END
;;
;; ns.example.com. with valid data again
;;
RANGE_BEGIN 40 60
RANGE_BEGIN 40 70
ADDRESS 1.2.3.4
; response to query of interest
ENTRY_BEGIN
@ -279,7 +279,7 @@ SECTION QUESTION
www.example.com. IN A
ENTRY_END
; immediate cached answer because upstream is valid again
; immediate cached answer; although upstream is valid again
STEP 50 CHECK_ANSWER
ENTRY_BEGIN
MATCH all ttl ede=3
@ -297,7 +297,9 @@ SECTION ADDITIONAL
;ns.example.com. 123 IN RRSIG A 3 3 3600 20070926134150 20070829134150 2854 example.com. MC0CFQCQMyTjn7WWwpwAR1LlVeLpRgZGuQIUCcJDEkwAuzytTDRlYK7nIMwH1CM= ;{id = 2854}
ENTRY_END
; upstream query is resolved before this query comes in
STEP 51 TIME_PASSES ELAPSE 5
; query one last time
STEP 60 QUERY
ENTRY_BEGIN
REPLY RD DO
@ -305,7 +307,7 @@ SECTION QUESTION
www.example.com. IN A
ENTRY_END
; prefetch query updated the cache, since there is no upstream response in this range
; this is the fresh valid response
STEP 70 CHECK_ANSWER
ENTRY_BEGIN
MATCH all ttl

View file

@ -16,11 +16,11 @@ SCENARIO_BEGIN Serve expired ttl with reset on forwarder with a timeout on upstr
; - Wait for it to expire (+ serve-expired-ttl)
; - Send query again
; - Upstream timeouts
; - Error response from iterator SERVFAIL, resets expired-ttl on cache
; - Check we are getting the SERVFAIL response
; - Error response from iterator SERVFAIL, resets expired-ttl on cache and sets norec_ttl blocking recursion
; - Check we are getting the cached response because it was expired-ttl-reset
; - Query again
; - Check we are getting the expired answer
; - Upstream still timeouts
; - Check we are getting the expired answer; prefetching is blocked by norec_ttl
; - If there was prefetching the test would fail with the pending upstream query
STEP 1 QUERY
ENTRY_BEGIN
@ -54,7 +54,7 @@ STEP 4 TIME_PASSES ELAPSE 12
STEP 5 QUERY
ENTRY_BEGIN
REPLY RD
REPLY RD DO
SECTION QUESTION
www.example.com. IN A
ENTRY_END
@ -67,14 +67,16 @@ STEP 8 TIMEOUT
STEP 9 TIMEOUT
STEP 10 TIMEOUT
; Returns servfail
; Returns
; but error response from iterator resets the expired ttl
STEP 11 CHECK_ANSWER
ENTRY_BEGIN
MATCH all ttl
REPLY QR RA RD SERVFAIL
MATCH all ttl ede=3
REPLY QR RA RD DO NOERROR
SECTION QUESTION
www.example.com. IN A
SECTION ANSWER
www.example.com. 123 IN A 0.0.0.0
ENTRY_END
; Query again
@ -96,8 +98,4 @@ SECTION ANSWER
www.example.com. 123 IN A 0.0.0.0
ENTRY_END
; But the pending query times out!
; Only one because RTT reached the limit.
STEP 16 TIMEOUT
SCENARIO_END

View file

@ -31,12 +31,18 @@ SCENARIO_BEGIN Test serve-expired with client-timeout and bogus answer
; - (upstream now has a bogus response)
; - query again for www.example.com. IN A
; - check that we get the immediate expired valid response
; - (prefetch response is bogus and is not cached)
; - (prefetch response is bogus and is not cached; recursion is blocked for NORR_TTL(5) because of the failure)
; - (upstream has a valid response again)
; - query once more
; - check that we still get the immediate expired valid response and not the fresh bogus one
; - (upstream has a valid response again; prefetch will update the cache)
; - check that we still get the immediate expired valid response (prefetch will not trigger because of NORR_TTL(5))
; - query and check that cache was not updated
; - let NORR_TTL(5) expire
; - query once more
; - check that we still get the immediate expired valid response
; - (prefetch should be allowed to refresh the record at this point)
; - (upstream does not have the answer anymore)
; - query one last time
; - check that we get an immediate valid cache response
; - check that we get the immediate valid cache response
; The example.com NS and ns.example.com A record are commented out.
; This to make the test succeed. It then keeps the dnssec valid lookup.
@ -273,6 +279,7 @@ SECTION ADDITIONAL
;ns.example.com. 123 IN RRSIG A 3 3 3600 20070926134150 20070829134150 2854 example.com. MC0CFQCQMyTjn7WWwpwAR1LlVeLpRgZGuQIUCcJDEkwAuzytTDRlYK7nIMwH1CM= ;{id = 2854}
ENTRY_END
; query with response available on the server
STEP 40 QUERY
ENTRY_BEGIN
REPLY RD DO
@ -280,7 +287,8 @@ SECTION QUESTION
www.example.com. IN A
ENTRY_END
; this is still the immediate cache response because the previous upstream response was bogus
; this is still the immediate expired cache response because the previous upstream response was bogus
; upstream query did not go out because of the previous failure NORR_TTL(5).
STEP 50 CHECK_ANSWER
ENTRY_BEGIN
MATCH all ttl ede=3
@ -298,6 +306,7 @@ SECTION ADDITIONAL
;ns.example.com. 123 IN RRSIG A 3 3 3600 20070926134150 20070829134150 2854 example.com. MC0CFQCQMyTjn7WWwpwAR1LlVeLpRgZGuQIUCcJDEkwAuzytTDRlYK7nIMwH1CM= ;{id = 2854}
ENTRY_END
; query with response available
STEP 60 QUERY
ENTRY_BEGIN
REPLY RD DO
@ -305,9 +314,63 @@ SECTION QUESTION
www.example.com. IN A
ENTRY_END
; this is the immediate cache response because the previous upstream response was valid
; this is still the immediate expired cache response because resolution is blocked for NORR_TTL(5)
STEP 70 CHECK_ANSWER
ENTRY_BEGIN
MATCH all ttl ede=3
REPLY QR RD RA AD DO NOERROR
SECTION QUESTION
www.example.com. IN A
SECTION ANSWER
www.example.com. 123 IN A 10.20.30.40
www.example.com. 123 IN RRSIG A 3 3 3600 20070926134150 20070829134150 2854 example.com. MC0CFC99iE9K5y2WNgI0gFvBWaTi9wm6AhUAoUqOpDtG5Zct+Qr9F3mSdnbc6V4= ;{id = 2854}
SECTION AUTHORITY
;example.com. 123 IN NS ns.example.com.
;example.com. 123 IN RRSIG NS 3 2 3600 20070926134150 20070829134150 2854 example.com. MC0CFQCN+qHdJxoI/2tNKwsb08pra/G7aAIUAWA5sDdJTbrXA1/3OaesGBAO3sI= ;{id = 2854}
SECTION ADDITIONAL
;ns.example.com. 123 IN A 1.2.3.4
;ns.example.com. 123 IN RRSIG A 3 3 3600 20070926134150 20070829134150 2854 example.com. MC0CFQCQMyTjn7WWwpwAR1LlVeLpRgZGuQIUCcJDEkwAuzytTDRlYK7nIMwH1CM= ;{id = 2854}
ENTRY_END
; expire NORR_TTL(5)
STEP 71 TIME_PASSES ELAPSE 5
; query again
STEP 80 QUERY
ENTRY_BEGIN
REPLY RD DO
SECTION QUESTION
www.example.com. IN A
ENTRY_END
; this is still the immediate expired cache response but prefetching will be allowed to update the cache
STEP 90 CHECK_ANSWER
ENTRY_BEGIN
MATCH all ttl ede=3
REPLY QR RD RA AD DO NOERROR
SECTION QUESTION
www.example.com. IN A
SECTION ANSWER
www.example.com. 123 IN A 10.20.30.40
www.example.com. 123 IN RRSIG A 3 3 3600 20070926134150 20070829134150 2854 example.com. MC0CFC99iE9K5y2WNgI0gFvBWaTi9wm6AhUAoUqOpDtG5Zct+Qr9F3mSdnbc6V4= ;{id = 2854}
SECTION AUTHORITY
;example.com. 123 IN NS ns.example.com.
;example.com. 123 IN RRSIG NS 3 2 3600 20070926134150 20070829134150 2854 example.com. MC0CFQCN+qHdJxoI/2tNKwsb08pra/G7aAIUAWA5sDdJTbrXA1/3OaesGBAO3sI= ;{id = 2854}
SECTION ADDITIONAL
;ns.example.com. 123 IN A 1.2.3.4
;ns.example.com. 123 IN RRSIG A 3 3 3600 20070926134150 20070829134150 2854 example.com. MC0CFQCQMyTjn7WWwpwAR1LlVeLpRgZGuQIUCcJDEkwAuzytTDRlYK7nIMwH1CM= ;{id = 2854}
ENTRY_END
STEP 100 QUERY
ENTRY_BEGIN
REPLY RD DO
SECTION QUESTION
www.example.com. IN A
ENTRY_END
; this is the immediate cache response because the previous upstream response was valid
STEP 110 CHECK_ANSWER
ENTRY_BEGIN
MATCH all ttl
REPLY QR RD RA AD DO NOERROR
SECTION QUESTION