[9.20] fix: usr: Fix a zone refresh bug

A secondary zone could fail to further refresh with new
versions of the zone from a primary server if named was
reconfigured during the SOA request step of an ongoing
zone transfer. This has been fixed.

Closes #5307

Backport of MR !10468

Merge branch 'backport-5307-zone-refresh-stuck-after-reconfiguration-fix-9.20' into 'bind-9.20'

See merge request isc-projects/bind9!10495
This commit is contained in:
Ondřej Surý 2025-05-28 17:53:22 +00:00
commit f09bb8b88c
6 changed files with 189 additions and 7 deletions

View file

@ -0,0 +1,111 @@
"""
Copyright (C) Internet Systems Consortium, Inc. ("ISC")
SPDX-License-Identifier: MPL-2.0
This Source Code Form is subject to the terms of the Mozilla Public
License, v. 2.0. If a copy of the MPL was not distributed with this
file, you can obtain one at https://mozilla.org/MPL/2.0/.
See the COPYRIGHT file distributed with this work for additional
information regarding copyright ownership.
"""
from typing import AsyncGenerator
import dns.message
import dns.rdataclass
import dns.rdatatype
import dns.rrset
from isctest.asyncserver import (
ControllableAsyncDnsServer,
DomainHandler,
QueryContext,
ResponseAction,
DnsResponseSend,
ToggleResponsesCommand,
)
class AXFRServer(DomainHandler):
"""
Yield SOA and AXFR responses. Every new AXFR response increments the SOA
version.
"""
domains = ["xfr-and-reconfig"]
def __init__(self) -> None:
super().__init__()
self.soa_version = 0
async def get_responses(
self, qctx: QueryContext
) -> AsyncGenerator[ResponseAction, None]:
# This is oversimplified because I am lazy - we are appending the SOA
# RRset to the ANSWER section for _every_ QTYPE. named is only
# expected to send a SOA query over UDP and then an AXFR query over
# TCP. Responses to both of those start with a SOA RRset in the ANSWER
# section :-)
soa_message = dns.message.make_response(qctx.query)
soa_rrset = dns.rrset.from_text(
qctx.qname,
300,
dns.rdataclass.IN,
dns.rdatatype.SOA,
f". . {self.soa_version} 0 0 0 0",
)
soa_message.answer.append(soa_rrset)
yield DnsResponseSend(soa_message, authoritative=True)
if qctx.qtype == dns.rdatatype.SOA:
# If QTYPE=SOA, the SOA record is the complete response.
return
if qctx.qtype != dns.rdatatype.AXFR:
# If QTYPE=AXFR, we will continue cramming RRsets into the ANSWER
# section of a subsequent DNS message below.
#
# If QTYPE was not SOA or AXFR, abort. Yeah, we just sent a broken
# response by yielding DnsResponseSend() with a SOA RRset in the
# ANSWER section above. We will have to carry that burden for the
# rest of our lives.
return
# Send just the obligatory NS RRset at zone apex in the next message.
# This is stupidly inefficient, but makes looping below simpler as we
# will already have been done with the mandatory stuff by then.
ns_message = dns.message.make_response(qctx.query)
ns_rrset = dns.rrset.from_text(
qctx.qname, 300, dns.rdataclass.IN, dns.rdatatype.NS, "."
)
ns_message.answer.append(ns_rrset)
yield DnsResponseSend(ns_message, authoritative=True)
# Generate the AXFR with a txt rrset.
txt_message = dns.message.make_response(qctx.query)
txt_rrset = dns.rrset.from_text(
qctx.qname,
300,
dns.rdataclass.IN,
dns.rdatatype.TXT,
"foo bar",
)
txt_message.answer.append(txt_rrset)
yield DnsResponseSend(txt_message, authoritative=True)
# Finish the AXFR transaction by sending the second SOA RRset.
yield DnsResponseSend(soa_message, authoritative=True)
# This makes sure that the next SOA request causes a new zone transfer
self.soa_version += 1
if __name__ == "__main__":
server = ControllableAsyncDnsServer([ToggleResponsesCommand])
server.install_response_handler(AXFRServer())
server.run()

View file

@ -104,3 +104,10 @@ zone "ixfr-too-big" {
primaries { 10.53.0.1; };
file "ixfr-too-big.bk";
};
zone "xfr-and-reconfig" {
type secondary;
primaries { 10.53.0.9; };
file "xfr-and-reconfig.bk";
request-ixfr no; # ans9 supports only axfr
};

View file

@ -19,6 +19,10 @@ DIGOPTS="+tcp +noadd +nosea +nostat +noquest +nocomm +nocmd -p ${PORT}"
RNDCCMD="$RNDC -c ../_common/rndc.conf -p ${CONTROLPORT} -s"
NS_PARAMS="-m record -c named.conf -d 99 -g -T maxcachesize=2097152"
dig_with_opts() (
"$DIG" -p "$PORT" "$@"
)
status=0
n=0
@ -738,5 +742,44 @@ if [ $tmp -eq 0 ]; then
fi
status=$((status + tmp))
nextpart ns6/named.run >/dev/null
sendcmd() (
dig_with_opts "@${1}" "${2}._control." TXT +time=5 +tries=1 +tcp >/dev/null 2>&1
)
# See #5307#note_558185
n=$((n + 1))
echo_i "test reconfiguration when zone transfer is in the middle of a SOA query (part 1) ($n)"
tmp=0
# Check that xfr-and-reconfig has been successfully transferred by the secondary.
grep -F 'zone xfr-and-reconfig/IN: zone transfer finished: success' ns6/named.run 2>&1 >/dev/null || tmp=0
# Make ans6 receive queries without responding to them.
sendcmd 10.53.0.9 "disable.send-responses"
sleep 1
# Try to reload the zone from an unresponsive primary.
$RNDCCMD 10.53.0.6 reload xfr-and-reconfig 2>&1 | sed 's/^/ns6 /' | cat_i
sleep 1
# Reconfigure named while zone transfer attempt is in progress.
$RNDCCMD 10.53.0.6 reconfig 2>&1 | sed 's/^/ns6 /' | cat_i
# Confirm that the ongoing SOA request was canceled, caused by the reconfiguratoin.
retry_quiet 60 wait_for_message "refresh: request result: operation canceled" || tmp=1
if test $tmp != 0; then echo_i "failed"; fi
status=$((status + tmp))
nextpart ns6/named.run >/dev/null
n=$((n + 1))
echo_i "test reconfiguration when zone transfer is in the middle of a SOA query (part 2) ($n)"
tmp=0
# Make ans6 receive queries and respond to them.
sendcmd 10.53.0.9 "enable.send-responses"
sleep 1
# Try to reload the zone from the primary.
$RNDCCMD 10.53.0.6 reload xfr-and-reconfig 2>&1 | sed 's/^/ns6 /' | cat_i
retry_quiet 60 wait_for_message "zone xfr-and-reconfig/IN: Transfer started." || tmp=1
if test $tmp != 0; then echo_i "failed"; fi
status=$((status + tmp))
echo_i "exit status: $status"
[ $status -eq 0 ] || exit 1

View file

@ -51,6 +51,7 @@ pytestmark = pytest.mark.extra_artifacts(
"ns6/primary.db.jnl",
"ns6/sec.bk",
"ns6/xot-primary-try-next.bk",
"ns6/xfr-and-reconfig.bk",
"ns7/edns-expire.bk",
"ns7/primary2.db",
"ns7/sec.bk",

View file

@ -143,7 +143,7 @@ dns_requestmgr_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr,
for (size_t i = 0; i < nloops; i++) {
ISC_LIST_INIT(requestmgr->requests[i]);
/* unreferenced in requests_shutdown() */
/* unreferenced in requests_cancel() */
isc_loop_ref(isc_loop_get(requestmgr->loopmgr, i));
}
@ -169,7 +169,7 @@ dns_requestmgr_create(isc_mem_t *mctx, isc_loopmgr_t *loopmgr,
}
static void
requests_shutdown(void *arg) {
requests_cancel(void *arg) {
dns_requestmgr_t *requestmgr = arg;
dns_request_t *request = NULL, *next = NULL;
uint32_t tid = isc_tid();
@ -181,7 +181,7 @@ requests_shutdown(void *arg) {
/* The callback has been already scheduled */
continue;
}
req_sendevent(request, ISC_R_SHUTTINGDOWN);
req_sendevent(request, ISC_R_CANCELED);
}
isc_loop_unref(isc_loop_get(requestmgr->loopmgr, tid));
@ -217,12 +217,12 @@ dns_requestmgr_shutdown(dns_requestmgr_t *requestmgr) {
if (i == tid) {
/* Run the current loop synchronously */
requests_shutdown(requestmgr);
requests_cancel(requestmgr);
continue;
}
isc_loop_t *loop = isc_loop_get(requestmgr->loopmgr, i);
isc_async_run(loop, requests_shutdown, requestmgr);
isc_async_run(loop, requests_cancel, requestmgr);
}
}

View file

@ -13609,6 +13609,7 @@ stub_callback(void *arg) {
case ISC_R_SUCCESS:
break;
case ISC_R_SHUTTINGDOWN:
case ISC_R_CANCELED:
goto exiting;
case ISC_R_TIMEDOUT:
if (!DNS_ZONE_FLAG(zone, DNS_ZONEFLG_NOEDNS)) {
@ -13919,6 +13920,7 @@ refresh_callback(void *arg) {
dns_rdata_t rdata = DNS_RDATA_INIT;
dns_rdata_soa_t soa;
isc_result_t result;
const isc_result_t eresult = dns_request_getresult(request);
isc_sockaddr_t curraddr;
uint32_t serial, oldserial = 0;
bool do_queue_xfrin = false;
@ -13927,6 +13929,12 @@ refresh_callback(void *arg) {
ENTER;
if (isc_log_wouldlog(dns_lctx, ISC_LOG_DEBUG(3))) {
dns_zone_logc(zone, DNS_LOGCATEGORY_XFER_IN, ISC_LOG_DEBUG(3),
"refresh: request result: %s",
isc_result_totext(eresult));
}
now = isc_time_now();
LOCK_ZONE(zone);
@ -13942,10 +13950,11 @@ refresh_callback(void *arg) {
isc_sockaddr_format(&curraddr, primary, sizeof(primary));
isc_sockaddr_format(&zone->sourceaddr, source, sizeof(source));
switch (dns_request_getresult(request)) {
switch (eresult) {
case ISC_R_SUCCESS:
break;
case ISC_R_SHUTTINGDOWN:
case ISC_R_CANCELED:
goto exiting;
case ISC_R_TIMEDOUT:
if (!DNS_ZONE_FLAG(zone, DNS_ZONEFLG_NOEDNS)) {
@ -13988,7 +13997,7 @@ refresh_callback(void *arg) {
}
FALLTHROUGH;
default:
result = dns_request_getresult(request);
result = eresult;
dns_zone_logc(zone, DNS_LOGCATEGORY_XFER_IN, ISC_LOG_INFO,
"refresh: failure trying primary "
"%s (source %s): %s",
@ -14308,6 +14317,17 @@ next_primary:
goto detach;
exiting:
/*
* We can get here not only during shutdown, but also when the refresh
* is canceled during reconfiguration. In that case, make sure to clear
* the DNS_ZONEFLG_REFRESH flag so that future zone refreshes don't get
* stuck, and make sure a new refresh attempt is made again soon after
* the reconfiguration is complete.
*/
DNS_ZONE_CLRFLAG(zone, DNS_ZONEFLG_REFRESH);
zone->refreshtime = now;
zone_settimer(zone, &now);
dns_request_destroy(&zone->request);
goto detach;