Mesh reply counters (#1374)

* Statistics counter for number of queries dropped by limit on reply addresses

Request list entries can be associated with multiple pending "reply
addresses": each request list entry keeps its own list of clients that
should receive the response once the recursion is finished. This requires
keeping allocations around for each client, and there is a global limit
on the number of *additional* reply addresses that can be allocated.
(Each new request list entry seems to get its own initial reply address,
which is not counted against the limit.)

This commit adds a statistics counter "num_queries_replyaddr_limit"
(exported as "threadX.num.queries_replyaddr_limit" /
"total.num.queries_replyaddr_limit") that counts the number of incoming
client queries that have been dropped due to the restriction on
allocating additional reply addresses. This allows distinguishing these
drops from other kinds of drops.

* Statistics counter for number of mesh reply entries

Request list entries can be associated with multiple pending "reply
addresses". Because there is a limit on the number of additional reply
addresses that can be allocated, and incoming queries are dropped once
that limit is exceeded, it is useful to be able to track the current
number of reply addresses.

This commit exports the mesh_area's internal counter `num_reply_addrs`
as "threadX.requestlist.current.replies" /
"total.requestlist.current.replies".
This commit is contained in:
Robert Edmonds 2025-11-13 03:33:05 -05:00 committed by GitHub
parent 98f4257890
commit fceb4e8585
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 71 additions and 1 deletions

View file

@ -801,6 +801,8 @@ print_stats(RES* ssl, const char* nm, struct ub_stats_info* s)
(unsigned long)s->svr.num_queries_cookie_invalid)) return 0; (unsigned long)s->svr.num_queries_cookie_invalid)) return 0;
if(!ssl_printf(ssl, "%s.num.queries_discard_timeout"SQ"%lu\n", nm, if(!ssl_printf(ssl, "%s.num.queries_discard_timeout"SQ"%lu\n", nm,
(unsigned long)s->svr.num_queries_discard_timeout)) return 0; (unsigned long)s->svr.num_queries_discard_timeout)) return 0;
if(!ssl_printf(ssl, "%s.num.queries_replyaddr_limit"SQ"%lu\n", nm,
(unsigned long)s->svr.num_queries_replyaddr_limit)) return 0;
if(!ssl_printf(ssl, "%s.num.queries_wait_limit"SQ"%lu\n", nm, if(!ssl_printf(ssl, "%s.num.queries_wait_limit"SQ"%lu\n", nm,
(unsigned long)s->svr.num_queries_wait_limit)) return 0; (unsigned long)s->svr.num_queries_wait_limit)) return 0;
if(!ssl_printf(ssl, "%s.num.cachehits"SQ"%lu\n", nm, if(!ssl_printf(ssl, "%s.num.cachehits"SQ"%lu\n", nm,
@ -845,6 +847,8 @@ print_stats(RES* ssl, const char* nm, struct ub_stats_info* s)
(unsigned long)s->mesh_num_states)) return 0; (unsigned long)s->mesh_num_states)) return 0;
if(!ssl_printf(ssl, "%s.requestlist.current.user"SQ"%lu\n", nm, if(!ssl_printf(ssl, "%s.requestlist.current.user"SQ"%lu\n", nm,
(unsigned long)s->mesh_num_reply_states)) return 0; (unsigned long)s->mesh_num_reply_states)) return 0;
if(!ssl_printf(ssl, "%s.requestlist.current.replies"SQ"%lu\n", nm,
(unsigned long)s->mesh_num_reply_addrs)) return 0;
#ifndef S_SPLINT_S #ifndef S_SPLINT_S
sumwait.tv_sec = s->mesh_replies_sum_wait_sec; sumwait.tv_sec = s->mesh_replies_sum_wait_sec;
sumwait.tv_usec = s->mesh_replies_sum_wait_usec; sumwait.tv_usec = s->mesh_replies_sum_wait_usec;

View file

@ -262,6 +262,7 @@ server_stats_compile(struct worker* worker, struct ub_stats_info* s, int reset)
s->svr = worker->stats; s->svr = worker->stats;
s->mesh_num_states = (long long)worker->env.mesh->all.count; s->mesh_num_states = (long long)worker->env.mesh->all.count;
s->mesh_num_reply_states = (long long)worker->env.mesh->num_reply_states; s->mesh_num_reply_states = (long long)worker->env.mesh->num_reply_states;
s->mesh_num_reply_addrs = (long long)worker->env.mesh->num_reply_addrs;
s->mesh_jostled = (long long)worker->env.mesh->stats_jostled; s->mesh_jostled = (long long)worker->env.mesh->stats_jostled;
s->mesh_dropped = (long long)worker->env.mesh->stats_dropped; s->mesh_dropped = (long long)worker->env.mesh->stats_dropped;
s->mesh_replies_sent = (long long)worker->env.mesh->replies_sent; s->mesh_replies_sent = (long long)worker->env.mesh->replies_sent;
@ -284,6 +285,8 @@ server_stats_compile(struct worker* worker, struct ub_stats_info* s, int reset)
NUM_BUCKETS_HIST); NUM_BUCKETS_HIST);
s->svr.num_queries_discard_timeout += s->svr.num_queries_discard_timeout +=
(long long)worker->env.mesh->num_queries_discard_timeout; (long long)worker->env.mesh->num_queries_discard_timeout;
s->svr.num_queries_replyaddr_limit +=
(long long)worker->env.mesh->num_queries_replyaddr_limit;
s->svr.num_queries_wait_limit += s->svr.num_queries_wait_limit +=
(long long)worker->env.mesh->num_queries_wait_limit; (long long)worker->env.mesh->num_queries_wait_limit;
s->svr.num_dns_error_reports += s->svr.num_dns_error_reports +=
@ -448,6 +451,8 @@ void server_stats_add(struct ub_stats_info* total, struct ub_stats_info* a)
total->svr.num_queries_cookie_invalid += a->svr.num_queries_cookie_invalid; total->svr.num_queries_cookie_invalid += a->svr.num_queries_cookie_invalid;
total->svr.num_queries_discard_timeout += total->svr.num_queries_discard_timeout +=
a->svr.num_queries_discard_timeout; a->svr.num_queries_discard_timeout;
total->svr.num_queries_replyaddr_limit +=
a->svr.num_queries_replyaddr_limit;
total->svr.num_queries_wait_limit += a->svr.num_queries_wait_limit; total->svr.num_queries_wait_limit += a->svr.num_queries_wait_limit;
total->svr.num_dns_error_reports += a->svr.num_dns_error_reports; total->svr.num_dns_error_reports += a->svr.num_dns_error_reports;
total->svr.num_queries_missed_cache += a->svr.num_queries_missed_cache; total->svr.num_queries_missed_cache += a->svr.num_queries_missed_cache;
@ -519,6 +524,7 @@ void server_stats_add(struct ub_stats_info* total, struct ub_stats_info* a)
total->mesh_num_states += a->mesh_num_states; total->mesh_num_states += a->mesh_num_states;
total->mesh_num_reply_states += a->mesh_num_reply_states; total->mesh_num_reply_states += a->mesh_num_reply_states;
total->mesh_num_reply_addrs += a->mesh_num_reply_addrs;
total->mesh_jostled += a->mesh_jostled; total->mesh_jostled += a->mesh_jostled;
total->mesh_dropped += a->mesh_dropped; total->mesh_dropped += a->mesh_dropped;
total->mesh_replies_sent += a->mesh_replies_sent; total->mesh_replies_sent += a->mesh_replies_sent;

View file

@ -880,6 +880,11 @@ number of queries removed due to discard\-timeout by thread
.UNINDENT .UNINDENT
.INDENT 0.0 .INDENT 0.0
.TP .TP
.B threadX.num.queries_replyaddr_limit
number of queries removed due to replyaddr limits by thread
.UNINDENT
.INDENT 0.0
.TP
.B threadX.num.queries_wait_limit .B threadX.num.queries_wait_limit
number of queries removed due to wait\-limit by thread number of queries removed due to wait\-limit by thread
.UNINDENT .UNINDENT
@ -994,6 +999,13 @@ Current size of the request list, only the requests from client queries.
.UNINDENT .UNINDENT
.INDENT 0.0 .INDENT 0.0
.TP .TP
.B threadX.requestlist.current.replies
Current number of reply entries waiting on request list entries.
Because a request list entry can send results to multiple reply
addresses, this number may be larger than the size of the request list.
.UNINDENT
.INDENT 0.0
.TP
.B threadX.recursion.time.avg .B threadX.recursion.time.avg
Average time it took to answer queries that needed recursive processing. Average time it took to answer queries that needed recursive processing.
Note that queries that were answered from the cache are not in this average. Note that queries that were answered from the cache are not in this average.
@ -1048,6 +1060,11 @@ summed over threads.
.UNINDENT .UNINDENT
.INDENT 0.0 .INDENT 0.0
.TP .TP
.B total.num.queries_replyaddr_limit
summed over threads.
.UNINDENT
.INDENT 0.0
.TP
.B total.num.queries_wait_limit .B total.num.queries_wait_limit
summed over threads. summed over threads.
.UNINDENT .UNINDENT
@ -1138,6 +1155,16 @@ summed over threads.
.UNINDENT .UNINDENT
.INDENT 0.0 .INDENT 0.0
.TP .TP
.B total.requestlist.current.user
summed over threads.
.UNINDENT
.INDENT 0.0
.TP
.B total.requestlist.current.replies
summed over threads.
.UNINDENT
.INDENT 0.0
.TP
.B total.recursion.time.median .B total.recursion.time.median
averaged over threads. averaged over threads.
.UNINDENT .UNINDENT

View file

@ -815,6 +815,10 @@ number of statistic counters:
number of queries removed due to discard-timeout by thread number of queries removed due to discard-timeout by thread
@@UAHL@unbound-control.stats@threadX.num.queries_replyaddr_limit@@
number of queries removed due to replyaddr limits by thread
@@UAHL@unbound-control.stats@threadX.num.queries_wait_limit@@ @@UAHL@unbound-control.stats@threadX.num.queries_wait_limit@@
number of queries removed due to wait-limit by thread number of queries removed due to wait-limit by thread
@ -910,6 +914,12 @@ number of statistic counters:
Current size of the request list, only the requests from client queries. Current size of the request list, only the requests from client queries.
@@UAHL@unbound-control.stats@threadX.requestlist.current.replies@@
Current number of reply entries waiting on request list entries.
Because a request list entry can send results to multiple reply
addresses, this number may be larger than the size of the request list.
@@UAHL@unbound-control.stats@threadX.recursion.time.avg@@ @@UAHL@unbound-control.stats@threadX.recursion.time.avg@@
Average time it took to answer queries that needed recursive processing. Average time it took to answer queries that needed recursive processing.
Note that queries that were answered from the cache are not in this average. Note that queries that were answered from the cache are not in this average.
@ -955,6 +965,10 @@ number of statistic counters:
summed over threads. summed over threads.
@@UAHL@unbound-control.stats@total.num.queries_replyaddr_limit@@
summed over threads.
@@UAHL@unbound-control.stats@total.num.queries_wait_limit@@ @@UAHL@unbound-control.stats@total.num.queries_wait_limit@@
summed over threads. summed over threads.
@ -1027,6 +1041,14 @@ number of statistic counters:
summed over threads. summed over threads.
@@UAHL@unbound-control.stats@total.requestlist.current.user@@
summed over threads.
@@UAHL@unbound-control.stats@total.requestlist.current.replies@@
summed over threads.
@@UAHL@unbound-control.stats@total.recursion.time.median@@ @@UAHL@unbound-control.stats@total.recursion.time.median@@
averaged over threads. averaged over threads.

View file

@ -853,6 +853,8 @@ struct ub_server_stats {
long long qquic; long long qquic;
/** number of queries removed due to discard-timeout */ /** number of queries removed due to discard-timeout */
long long num_queries_discard_timeout; long long num_queries_discard_timeout;
/** number of queries removed due to replyaddr limit */
long long num_queries_replyaddr_limit;
/** number of queries removed due to wait-limit */ /** number of queries removed due to wait-limit */
long long num_queries_wait_limit; long long num_queries_wait_limit;
/** number of dns error reports generated */ /** number of dns error reports generated */
@ -872,6 +874,8 @@ struct ub_stats_info {
long long mesh_num_states; long long mesh_num_states;
/** mesh stats: current number of reply (user) states */ /** mesh stats: current number of reply (user) states */
long long mesh_num_reply_states; long long mesh_num_reply_states;
/** mesh stats: current number of reply entries */
long long mesh_num_reply_addrs;
/** mesh stats: number of reply states overwritten with a new one */ /** mesh stats: number of reply states overwritten with a new one */
long long mesh_jostled; long long mesh_jostled;
/** mesh stats: number of incoming queries dropped */ /** mesh stats: number of incoming queries dropped */

View file

@ -231,6 +231,7 @@ mesh_create(struct module_stack* stack, struct module_env* env)
mesh->ans_expired = 0; mesh->ans_expired = 0;
mesh->ans_cachedb = 0; mesh->ans_cachedb = 0;
mesh->num_queries_discard_timeout = 0; mesh->num_queries_discard_timeout = 0;
mesh->num_queries_replyaddr_limit = 0;
mesh->num_queries_wait_limit = 0; mesh->num_queries_wait_limit = 0;
mesh->num_dns_error_reports = 0; mesh->num_dns_error_reports = 0;
mesh->max_reply_states = env->cfg->num_queries_per_thread; mesh->max_reply_states = env->cfg->num_queries_per_thread;
@ -474,7 +475,7 @@ void mesh_new_client(struct mesh_area* mesh, struct query_info* qinfo,
verbose(VERB_ALGO, "Too many requests queued. " verbose(VERB_ALGO, "Too many requests queued. "
"dropping incoming query."); "dropping incoming query.");
comm_point_drop_reply(rep); comm_point_drop_reply(rep);
mesh->stats_dropped++; mesh->num_queries_replyaddr_limit++;
return; return;
} }
} }
@ -2295,6 +2296,7 @@ mesh_stats_clear(struct mesh_area* mesh)
memset(&mesh->rpz_action[0], 0, sizeof(size_t)*UB_STATS_RPZ_ACTION_NUM); memset(&mesh->rpz_action[0], 0, sizeof(size_t)*UB_STATS_RPZ_ACTION_NUM);
mesh->ans_nodata = 0; mesh->ans_nodata = 0;
mesh->num_queries_discard_timeout = 0; mesh->num_queries_discard_timeout = 0;
mesh->num_queries_replyaddr_limit = 0;
mesh->num_queries_wait_limit = 0; mesh->num_queries_wait_limit = 0;
mesh->num_dns_error_reports = 0; mesh->num_dns_error_reports = 0;
} }

View file

@ -141,6 +141,8 @@ struct mesh_area {
size_t rpz_action[UB_STATS_RPZ_ACTION_NUM]; size_t rpz_action[UB_STATS_RPZ_ACTION_NUM];
/** stats, number of queries removed due to discard-timeout */ /** stats, number of queries removed due to discard-timeout */
size_t num_queries_discard_timeout; size_t num_queries_discard_timeout;
/** stats, number of queries removed due to replyaddr limit */
size_t num_queries_replyaddr_limit;
/** stats, number of queries removed due to wait-limit */ /** stats, number of queries removed due to wait-limit */
size_t num_queries_wait_limit; size_t num_queries_wait_limit;
/** stats, number of dns error reports generated */ /** stats, number of dns error reports generated */

View file

@ -236,6 +236,8 @@ static void pr_stats(const char* nm, struct ub_stats_info* s)
s->svr.num_queries_cookie_invalid); s->svr.num_queries_cookie_invalid);
PR_UL_NM("num.queries_discard_timeout", PR_UL_NM("num.queries_discard_timeout",
s->svr.num_queries_discard_timeout); s->svr.num_queries_discard_timeout);
PR_UL_NM("num.queries_replyaddr_limit",
s->svr.num_queries_replyaddr_limit);
PR_UL_NM("num.queries_wait_limit", s->svr.num_queries_wait_limit); PR_UL_NM("num.queries_wait_limit", s->svr.num_queries_wait_limit);
PR_UL_NM("num.cachehits", PR_UL_NM("num.cachehits",
s->svr.num_queries - s->svr.num_queries_missed_cache); s->svr.num_queries - s->svr.num_queries_missed_cache);
@ -263,6 +265,7 @@ static void pr_stats(const char* nm, struct ub_stats_info* s)
PR_UL_NM("requestlist.exceeded", s->mesh_dropped); PR_UL_NM("requestlist.exceeded", s->mesh_dropped);
PR_UL_NM("requestlist.current.all", s->mesh_num_states); PR_UL_NM("requestlist.current.all", s->mesh_num_states);
PR_UL_NM("requestlist.current.user", s->mesh_num_reply_states); PR_UL_NM("requestlist.current.user", s->mesh_num_reply_states);
PR_UL_NM("requestlist.current.replies", s->mesh_num_reply_addrs);
#ifndef S_SPLINT_S #ifndef S_SPLINT_S
sumwait.tv_sec = s->mesh_replies_sum_wait_sec; sumwait.tv_sec = s->mesh_replies_sum_wait_sec;
sumwait.tv_usec = s->mesh_replies_sum_wait_usec; sumwait.tv_usec = s->mesh_replies_sum_wait_usec;