From b6bd2a53117d8e21f5ad5c7794da84846038ca47 Mon Sep 17 00:00:00 2001 From: Aram Sargsyan Date: Thu, 10 Mar 2022 17:30:34 +0000 Subject: [PATCH 1/4] Fix an issue in dig when retrying with the next server after SERVFAIL After a query results in a SERVFAIL result, and there is another registered query in the lookup's queries list, `dig` starts the next query to try another server, but for some reason, reports about that also when the current query is in the head of the list, even if there is no other query in the list to try. Use the same condition for both decisions, and after starting the next query, jump to the "detach_query" label instead of "next_lookup", because there is no need to start the next lookup after we just started a query in the current lookup. (cherry picked from commit e888c62fbd770e31f72b7d6bf08253d38ddc2e5d) --- bin/dig/dighost.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/bin/dig/dighost.c b/bin/dig/dighost.c index 10bf24fc13..bd0818d4f4 100644 --- a/bin/dig/dighost.c +++ b/bin/dig/dighost.c @@ -3923,15 +3923,6 @@ recv_done(isc_nmhandle_t *handle, isc_result_t eresult, isc_region_t *region, } else { start_udp(next); } - } - - /* - * If our query is at the head of the list and there - * is no next, we're the only one left, so fall - * through to print the message. - */ - if ((ISC_LIST_HEAD(l->q) != query) || - (ISC_LIST_NEXT(query, link) != NULL)) { dighost_comments(l, "Got %s from %s, trying next " "server", @@ -3939,7 +3930,7 @@ recv_done(isc_nmhandle_t *handle, isc_result_t eresult, isc_region_t *region, ? "SERVFAIL reply" : "recursion not available", query->servname); - goto next_lookup; + goto detach_query; } } From 58685cd573e8de1fd023232dc5b3118c78ec6f52 Mon Sep 17 00:00:00 2001 From: Aram Sargsyan Date: Wed, 9 Mar 2022 19:45:54 +0000 Subject: [PATCH 2/4] When resending a UDP request, insert the query to the lookup's list When a query times out, and `dig` (or `host`) creates a new query to resend the request, it is being prepended to the lookup's queries list, which can cause a confusion later, making `dig` (or `host`) believe that there is another new query in the list, but that is actually the old one, which was timed out. That mistake will result in an assertion failure. That can happen, in particular, when after a timed out request, the retried request returns a SERVFAIL result, and the recursion is enabled, and `+nofail` option was used with `dig` (that is the default behavior in `host`, unless the `-s` option is provided). Fix the problem by inserting the query just after the current, timed-out query, instead of prepending to the list. Before calling start_udp() detach `l->current_query`, like it is done in another place in the function. Slightly update a couple of debug messages to make them more consistent. (cherry picked from commit a96247594816e892bdf9fd2ca4025ad078e52995) --- bin/dig/dighost.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/bin/dig/dighost.c b/bin/dig/dighost.c index bd0818d4f4..360e8a3b97 100644 --- a/bin/dig/dighost.c +++ b/bin/dig/dighost.c @@ -3061,7 +3061,8 @@ force_next(dig_query_t *query) { if (l->retries > 1) { l->retries--; - debug("making new TCP request, %d tries left", l->retries); + debug("making new %s request, %d tries left", + l->tcp_mode ? "TCP" : "UDP", l->retries); requeue_lookup(l, true); lookup_detach(&l); isc_refcount_decrement0(&recvcount); @@ -3625,13 +3626,14 @@ recv_done(isc_nmhandle_t *handle, isc_result_t eresult, isc_region_t *region, dig_query_t *newq = NULL; l->retries--; - debug("resending UDP request to first server, %d tries left", - l->retries); + debug("making new UDP request, %d tries left", l->retries); newq = new_query(l, query->servname, query->userarg); - ISC_LIST_PREPEND(l->q, newq, link); - - start_udp(ISC_LIST_HEAD(l->q)); + ISC_LIST_INSERTAFTER(l->q, query, newq, link); + if (l->current_query == query) { + query_detach(&l->current_query); + } + start_udp(newq); goto detach_query; } From 1e17d5a13099b42c0f131b8be306b4d9501fd821 Mon Sep 17 00:00:00 2001 From: Aram Sargsyan Date: Wed, 9 Mar 2022 20:17:51 +0000 Subject: [PATCH 3/4] Add CHANGES note for [GL #3020] (cherry picked from commit e353700189871c9c7c1e01e44b6c3e7606ee370e) --- CHANGES | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGES b/CHANGES index 995535a98d..77094587cb 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,9 @@ +5831. [bug] When resending a UDP request in the result of a timeout, + the recv_done() function in dighost.c was prepending + the new query into the loookup's queries list instead + of inserting, which could cause an assertion failure + when the resent query's result was SERVFAIL. [GL #3020] + 5828. [bug] Replace single TCP write timer with per-TCP write timers. [GL #3200] From f64cd23e7bd0399505114a9b28893995eb4e162f Mon Sep 17 00:00:00 2001 From: Aram Sargsyan Date: Thu, 10 Mar 2022 00:12:37 +0000 Subject: [PATCH 4/4] Add digdelv system test to check timed-out result followed by a SERVFAIL This test ensures that `dig` retries with another attempt after a timed-out request, and that it does not crash when the retried request returns a SERVFAIL result. See [GL #3020] for the latter issue. (cherry picked from commit 3ec5d2d6ede18e502ef0ef4521fe917f6531f258) --- bin/tests/system/digdelv/ans8/ans.py | 138 +++++++++++++++++++++++++++ bin/tests/system/digdelv/clean.sh | 2 + bin/tests/system/digdelv/tests.sh | 16 ++++ 3 files changed, 156 insertions(+) create mode 100644 bin/tests/system/digdelv/ans8/ans.py diff --git a/bin/tests/system/digdelv/ans8/ans.py b/bin/tests/system/digdelv/ans8/ans.py new file mode 100644 index 0000000000..18fbdece3e --- /dev/null +++ b/bin/tests/system/digdelv/ans8/ans.py @@ -0,0 +1,138 @@ +# Copyright (C) Internet Systems Consortium, Inc. ("ISC") +# +# SPDX-License-Identifier: MPL-2.0 +# +# This Source Code Form is subject to the terms of the Mozilla Public +# License, v. 2.0. If a copy of the MPL was not distributed with this +# file, you can obtain one at https://mozilla.org/MPL/2.0/. +# +# See the COPYRIGHT file distributed with this work for additional +# information regarding copyright ownership. + +from __future__ import print_function +import os +import sys +import signal +import socket +import select +import struct + +import dns, dns.message +from dns.rcode import * + +def create_response(msg): + m = dns.message.from_wire(msg) + qname = m.question[0].name.to_text() + rrtype = m.question[0].rdtype + typename = dns.rdatatype.to_text(rrtype) + + with open("query.log", "a") as f: + f.write("%s %s\n" % (typename, qname)) + print("%s %s" % (typename, qname), end=" ") + + r = dns.message.make_response(m) + r.set_rcode(SERVFAIL) + return r + +def sigterm(signum, frame): + print("Shutting down now...") + os.remove("ans.pid") + running = False + sys.exit(0) + +ip4 = "10.53.0.8" + +try: port=int(os.environ["PORT"]) +except: port=5300 + +query4_udp = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) +query4_udp.bind((ip4, port)) + +query4_tcp = socket.socket(socket.AF_INET, socket.SOCK_STREAM) +query4_tcp.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) +query4_tcp.bind((ip4, port)) +query4_tcp.listen(100) + +signal.signal(signal.SIGTERM, sigterm) + +f = open("ans.pid", "w") +pid = os.getpid() +print (pid, file=f) +f.close() + +running = True + +print ("Listening on %s port %d" % (ip4, port)) +print ("Ctrl-c to quit") + +input = [query4_udp, query4_tcp] + +n_udp = 0 +n_tcp = 0 +hung_conns = [] + +while running: + try: + inputready, outputready, exceptready = select.select(input, [], []) + except select.error as e: + break + except socket.error as e: + break + except KeyboardInterrupt: + break + + for s in inputready: + if s == query4_udp: + print("UDP query received on %s" % ip4, end=" ") + n_udp = n_udp + 1 + msg = s.recvfrom(65535) + # Do not response to every other query. + if n_udp % 2 == 1: + print("NO RESPONSE") + continue + rsp = create_response(msg[0]) + if rsp: + print(dns.rcode.to_text(rsp.rcode())) + s.sendto(rsp.to_wire(), msg[1]) + else: + print("NO RESPONSE") + elif s == query4_tcp: + print("TCP query received on %s" % ip4, end=" ") + n_tcp = n_tcp + 1 + conn = None + try: + conn, addr = s.accept() + # Do not response to every other query, hang the connection. + if n_tcp % 2 == 1: + print("NO RESPONSE") + hung_conns.append(conn) + conn = None + continue + else: + # get TCP message length + msg = conn.recv(2) + if len(msg) != 2: + print("NO RESPONSE") + conn.close() + continue + length = struct.unpack('>H', msg[:2])[0] + msg = conn.recv(length) + if len(msg) != length: + print("NO RESPONSE") + conn.close() + continue + rsp = create_response(msg) + if rsp: + print(dns.rcode.to_text(rsp.rcode())) + wire = rsp.to_wire() + conn.send(struct.pack('>H', len(wire))) + conn.send(wire) + else: + print("NO RESPONSE") + except: + print("NO RESPONSE") + if conn: + conn.close() + + if not running: + break diff --git a/bin/tests/system/digdelv/clean.sh b/bin/tests/system/digdelv/clean.sh index ac84c3f80e..ed9ad87a5b 100644 --- a/bin/tests/system/digdelv/clean.sh +++ b/bin/tests/system/digdelv/clean.sh @@ -17,6 +17,8 @@ rm -f ./*/anchor.* rm -f ./*/named.conf rm -f ./*/named.memstats rm -f ./*/named.run +rm -f ./ans*/ans.run +rm -f ./ans*/query.log rm -f ./delv.out.test* rm -f ./dig.out.*test* rm -f ./dig.out.mm.* diff --git a/bin/tests/system/digdelv/tests.sh b/bin/tests/system/digdelv/tests.sh index 28a765d129..f7e852a1c9 100644 --- a/bin/tests/system/digdelv/tests.sh +++ b/bin/tests/system/digdelv/tests.sh @@ -998,6 +998,22 @@ if [ -x "$DIG" ] ; then if [ $ret -ne 0 ]; then echo_i "failed"; fi status=$((status+ret)) + # See [GL #3020] for more information + n=$((n+1)) + echo_i "check that dig handles UDP timeout followed by a SERVFAIL correctly ($n)" + ret=0 + dig_with_opts +timeout=1 +nofail @10.53.0.8 a.example > dig.out.test$n 2>&1 || ret=1 + grep "status: SERVFAIL" dig.out.test$n > /dev/null || ret=1 + if [ $ret -ne 0 ]; then echo_i "failed"; fi + status=$((status+ret)) + + n=$((n+1)) + echo_i "check that dig handles TCP timeout followed by a SERVFAIL correctly ($n)" + ret=0 + dig_with_opts +timeout=1 +nofail +tcp @10.53.0.8 a.example > dig.out.test$n 2>&1 || ret=1 + grep "status: SERVFAIL" dig.out.test$n > /dev/null || ret=1 + if [ $ret -ne 0 ]; then echo_i "failed"; fi + status=$((status+ret)) else echo_i "$DIG is needed, so skipping these dig tests" fi