From f0dd9358daaf02a360847d4569714d6252f977cf Mon Sep 17 00:00:00 2001 From: Michal Nowak Date: Wed, 10 Jun 2026 17:22:44 +0000 Subject: [PATCH] Make the fetchlimit per-domain limit check robust under load The per-domain limit step required the active fetch count for lamesub.example to read exactly 40 on a single sample. It parsed the cumulative "allowed" field (6) rather than the active count (field 3), and even the active count can briefly dip below 40 between bursts. Sample the active count with retry_quiet, flag only a count above 40, and require the limit to be reached at least once. Assisted-by: Claude:claude-opus-4-7 --- bin/tests/system/fetchlimit/tests.sh | 29 ++++++++++++++++++++++------ 1 file changed, 23 insertions(+), 6 deletions(-) diff --git a/bin/tests/system/fetchlimit/tests.sh b/bin/tests/system/fetchlimit/tests.sh index b36f792570..f994406249 100644 --- a/bin/tests/system/fetchlimit/tests.sh +++ b/bin/tests/system/fetchlimit/tests.sh @@ -60,6 +60,15 @@ stat() { return 0 } +# Succeed once the lame domain reaches the fetches-per-zone limit (40), +# checking both recursive clients (stat) and active fetches (rndc fetchlimit +# field 3). Used with retry_quiet to wait out transient low readings. +perdomain_at_limit() { + stat 10.53.0.3 40 40 || return 1 + active=$(rndccmd 10.53.0.3 fetchlimit | awk '/lamesub/ { print $3 }') + [ "${active:-0}" -eq 40 ] +} + _wait_for_message() ( nextpartpeek "$1" >wait_for_message.$n grep -F "$2" wait_for_message.$n >/dev/null @@ -177,6 +186,7 @@ echo_i "checking lame server clients are dropped at the per-domain limit ($n)" ret=0 fail=0 success=0 +reached=0 sendcmd 10.53.0.4 send-responses "disable" for try in 1 2 3 4 5; do burst 10.53.0.3 d $try 300 @@ -185,15 +195,22 @@ for try in 1 2 3 4 5; do && success=$((success + 1)) grep "status: SERVFAIL" dig.out.ns3.$n.$try >/dev/null 2>&1 \ && fail=$(($fail + 1)) - stat 10.53.0.3 40 40 || ret=1 - allowed=$(rndccmd 10.53.0.3 fetchlimit | awk '/lamesub/ { print $6 }') - [ "${allowed:-0}" -eq 40 ] || { - echo_i "allowed ${allowed}/40" + # The active count (field 3) is capped at 40 but can dip lower between + # bursts, so retry the sample, flag only a count above 40, and require the + # limit to be reached at least once. (Field 6 is the cumulative "allowed" + # counter, which grows past 40 as refilled slots are counted.) + retry_quiet 5 perdomain_at_limit && reached=1 + if [ "${active:-0}" -gt 40 ]; then + echo_i "active ${active}/40" ret=1 - } - [ $ret -eq 1 ] && break + break + fi sleep 1 done +[ "$reached" -eq 1 ] || { + echo_i "per-domain limit (40) not reached" + ret=1 +} echo_i "$success successful valid queries, $fail SERVFAIL" if [ $ret != 0 ]; then echo_i "failed"; fi status=$((status + ret))