Mesh reply counters (#1374)

* Statistics counter for number of queries dropped by limit on reply addresses

  Request list entries can be associated with multiple pending "reply addresses". Basically each request list entry keeps its own list of clients that should receive the response once the recursion is finished. This requires keeping allocations around for each client, and there is a global limit on the number of *additional* reply addresses that can be allocated. (Each new request list entry seems to get its own initial reply address which is not counted against the limit.)

  This commit adds a statistics counter "num_queries_replyaddr_limit" that counts the number of incoming client queries that have been dropped due to the restriction on allocating additional reply addresses. This allows distinguishing these drops from other kinds of drops.

* Statistics counter for number of mesh reply entries

  Request list entries can be associated with multiple pending "reply addresses". Since there is a limit on the number of additional reply addresses that can be allocated, which can cause incoming queries to be dropped if exceeded, it would be nice to be able to track this number.

  This commit basically exports the mesh_area's internal counter `num_reply_addrs` as "threadX.requestlist.current.replies" / "total.requestlist.current.replies".
2025-12-18 23:06:06 -05:00 · 2025-11-13 03:33:05 -05:00 · 2025-11-13 03:33:05 -05:00 · fceb4e8585
commit fceb4e8585
parent 98f4257890
8 changed files with 71 additions and 1 deletions
--- a/daemon/remote.c
+++ b/daemon/remote.c
@ -801,6 +801,8 @@ print_stats(RES* ssl, const char* nm, struct ub_stats_info* s)
 		(unsigned long)s->svr.num_queries_cookie_invalid)) return 0;
 	if(!ssl_printf(ssl, "%s.num.queries_discard_timeout"SQ"%lu\n", nm,
 		(unsigned long)s->svr.num_queries_discard_timeout)) return 0;
 	if(!ssl_printf(ssl, "%s.num.queries_replyaddr_limit"SQ"%lu\n", nm,
 		(unsigned long)s->svr.num_queries_replyaddr_limit)) return 0;
 	if(!ssl_printf(ssl, "%s.num.queries_wait_limit"SQ"%lu\n", nm,
 		(unsigned long)s->svr.num_queries_wait_limit)) return 0;
 	if(!ssl_printf(ssl, "%s.num.cachehits"SQ"%lu\n", nm,
@ -845,6 +847,8 @@ print_stats(RES* ssl, const char* nm, struct ub_stats_info* s)
 		(unsigned long)s->mesh_num_states)) return 0;
 	if(!ssl_printf(ssl, "%s.requestlist.current.user"SQ"%lu\n", nm,
 		(unsigned long)s->mesh_num_reply_states)) return 0;
 	if(!ssl_printf(ssl, "%s.requestlist.current.replies"SQ"%lu\n", nm,
 		(unsigned long)s->mesh_num_reply_addrs)) return 0;
 #ifndef S_SPLINT_S
 	sumwait.tv_sec = s->mesh_replies_sum_wait_sec;
 	sumwait.tv_usec = s->mesh_replies_sum_wait_usec;
--- a/daemon/stats.c
+++ b/daemon/stats.c
@ -262,6 +262,7 @@ server_stats_compile(struct worker* worker, struct ub_stats_info* s, int reset)
 	s->svr = worker->stats;
 	s->mesh_num_states = (long long)worker->env.mesh->all.count;
 	s->mesh_num_reply_states = (long long)worker->env.mesh->num_reply_states;
 	s->mesh_num_reply_addrs = (long long)worker->env.mesh->num_reply_addrs;
 	s->mesh_jostled = (long long)worker->env.mesh->stats_jostled;
 	s->mesh_dropped = (long long)worker->env.mesh->stats_dropped;
 	s->mesh_replies_sent = (long long)worker->env.mesh->replies_sent;
@ -284,6 +285,8 @@ server_stats_compile(struct worker* worker, struct ub_stats_info* s, int reset)
 		NUM_BUCKETS_HIST);
 	s->svr.num_queries_discard_timeout +=
 		(long long)worker->env.mesh->num_queries_discard_timeout;
 	s->svr.num_queries_replyaddr_limit +=
 		(long long)worker->env.mesh->num_queries_replyaddr_limit;
 	s->svr.num_queries_wait_limit +=
 		(long long)worker->env.mesh->num_queries_wait_limit;
 	s->svr.num_dns_error_reports +=
@ -448,6 +451,8 @@ void server_stats_add(struct ub_stats_info* total, struct ub_stats_info* a)
 	total->svr.num_queries_cookie_invalid += a->svr.num_queries_cookie_invalid;
 	total->svr.num_queries_discard_timeout +=
 		a->svr.num_queries_discard_timeout;
 	total->svr.num_queries_replyaddr_limit +=
 		a->svr.num_queries_replyaddr_limit;
 	total->svr.num_queries_wait_limit += a->svr.num_queries_wait_limit;
 	total->svr.num_dns_error_reports += a->svr.num_dns_error_reports;
 	total->svr.num_queries_missed_cache += a->svr.num_queries_missed_cache;
@ -519,6 +524,7 @@ void server_stats_add(struct ub_stats_info* total, struct ub_stats_info* a)
 	total->mesh_num_states += a->mesh_num_states;
 	total->mesh_num_reply_states += a->mesh_num_reply_states;
 	total->mesh_num_reply_addrs += a->mesh_num_reply_addrs;
 	total->mesh_jostled += a->mesh_jostled;
 	total->mesh_dropped += a->mesh_dropped;
 	total->mesh_replies_sent += a->mesh_replies_sent;
--- a/doc/unbound-control.8.in
+++ b/doc/unbound-control.8.in
@ -880,6 +880,11 @@ number of queries removed due to discard\-timeout by thread
 .UNINDENT
 .INDENT 0.0
 .TP
 .B threadX.num.queries_replyaddr_limit 
 number of queries removed due to replyaddr limits by thread
 .UNINDENT
 .INDENT 0.0
 .TP
 .B threadX.num.queries_wait_limit 
 number of queries removed due to wait\-limit by thread
 .UNINDENT
@ -994,6 +999,13 @@ Current size of the request list, only the requests from client queries.
 .UNINDENT
 .INDENT 0.0
 .TP
 .B threadX.requestlist.current.replies 
 Current number of reply entries waiting on request list entries.
 Because a single request list entry can send results to multiple reply
 addresses, this number may be larger than the size of the request list.
 .UNINDENT
 .INDENT 0.0
 .TP
 .B threadX.recursion.time.avg 
 Average time it took to answer queries that needed recursive processing.
 Note that queries that were answered from the cache are not in this average.
@ -1048,6 +1060,11 @@ summed over threads.
 .UNINDENT
 .INDENT 0.0
 .TP
 .B total.num.queries_replyaddr_limit 
 summed over threads.
 .UNINDENT
 .INDENT 0.0
 .TP
 .B total.num.queries_wait_limit 
 summed over threads.
 .UNINDENT
@ -1138,6 +1155,16 @@ summed over threads.
 .UNINDENT
 .INDENT 0.0
 .TP
 .B total.requestlist.current.user 
 summed over threads.
 .UNINDENT
 .INDENT 0.0
 .TP
 .B total.requestlist.current.replies 
 summed over threads.
 .UNINDENT
 .INDENT 0.0
 .TP
 .B total.recursion.time.median 
 averaged over threads.
 .UNINDENT
--- a/doc/unbound-control.rst
+++ b/doc/unbound-control.rst
@ -815,6 +815,10 @@ number of statistic counters:
    number of queries removed due to discard-timeout by thread
@@UAHL@unbound-control.stats@threadX.num.queries_replyaddr_limit@@
    number of queries removed due to replyaddr limits by thread
@@UAHL@unbound-control.stats@threadX.num.queries_wait_limit@@
    number of queries removed due to wait-limit by thread
@ -910,6 +914,12 @@ number of statistic counters:
    Current size of the request list, only the requests from client queries.
@@UAHL@unbound-control.stats@threadX.requestlist.current.replies@@
    Current number of reply entries waiting on request list entries.
    Because a single request list entry can send results to multiple reply
    addresses, this number may be larger than the size of the request list.
@@UAHL@unbound-control.stats@threadX.recursion.time.avg@@
    Average time it took to answer queries that needed recursive processing.
    Note that queries that were answered from the cache are not in this average.
@ -955,6 +965,10 @@ number of statistic counters:
    summed over threads.
@@UAHL@unbound-control.stats@total.num.queries_replyaddr_limit@@
    summed over threads.
@@UAHL@unbound-control.stats@total.num.queries_wait_limit@@
    summed over threads.
@ -1027,6 +1041,14 @@ number of statistic counters:
    summed over threads.
@@UAHL@unbound-control.stats@total.requestlist.current.user@@
    summed over threads.
@@UAHL@unbound-control.stats@total.requestlist.current.replies@@
    summed over threads.
@@UAHL@unbound-control.stats@total.recursion.time.median@@
    averaged over threads.
--- a/libunbound/unbound.h
+++ b/libunbound/unbound.h
@ -853,6 +853,8 @@ struct ub_server_stats {
 	long long qquic;
 	/** number of queries removed due to discard-timeout */
 	long long num_queries_discard_timeout;
 	/** number of queries removed due to replyaddr limit */
 	long long num_queries_replyaddr_limit;
 	/** number of queries removed due to wait-limit */
 	long long num_queries_wait_limit;
 	/** number of dns error reports generated */
@ -872,6 +874,8 @@ struct ub_stats_info {
 	long long mesh_num_states;
 	/** mesh stats: current number of reply (user) states */
 	long long mesh_num_reply_states;
 	/** mesh stats: current number of reply entries */
 	long long mesh_num_reply_addrs;
 	/** mesh stats: number of reply states overwritten with a new one */
 	long long mesh_jostled;
 	/** mesh stats: number of incoming queries dropped */
--- a/services/mesh.c
+++ b/services/mesh.c
@ -231,6 +231,7 @@ mesh_create(struct module_stack* stack, struct module_env* env)
 	mesh->ans_expired = 0;
 	mesh->ans_cachedb = 0;
 	mesh->num_queries_discard_timeout = 0;
 	mesh->num_queries_replyaddr_limit = 0;
 	mesh->num_queries_wait_limit = 0;
 	mesh->num_dns_error_reports = 0;
 	mesh->max_reply_states = env->cfg->num_queries_per_thread;
@ -474,7 +475,7 @@ void mesh_new_client(struct mesh_area* mesh, struct query_info* qinfo,
 			verbose(VERB_ALGO, "Too many requests queued. "
 				"dropping incoming query.");
 			comm_point_drop_reply(rep);
-			mesh->stats_dropped++;
+			mesh->num_queries_replyaddr_limit++;
 			return;
 		}
 	}
@ -2295,6 +2296,7 @@ mesh_stats_clear(struct mesh_area* mesh)
 	memset(&mesh->rpz_action[0], 0, sizeof(size_t)*UB_STATS_RPZ_ACTION_NUM);
 	mesh->ans_nodata = 0;
 	mesh->num_queries_discard_timeout = 0;
 	mesh->num_queries_replyaddr_limit = 0;
 	mesh->num_queries_wait_limit = 0;
 	mesh->num_dns_error_reports = 0;
 }
--- a/services/mesh.h
+++ b/services/mesh.h
@ -141,6 +141,8 @@ struct mesh_area {
 	size_t rpz_action[UB_STATS_RPZ_ACTION_NUM];
 	/** stats, number of queries removed due to discard-timeout */
 	size_t num_queries_discard_timeout;
 	/** stats, number of queries removed due to replyaddr limit */
 	size_t num_queries_replyaddr_limit;
 	/** stats, number of queries removed due to wait-limit */
 	size_t num_queries_wait_limit;
 	/** stats, number of dns error reports generated */
--- a/smallapp/unbound-control.c
+++ b/smallapp/unbound-control.c
@ -236,6 +236,8 @@ static void pr_stats(const char* nm, struct ub_stats_info* s)
 		s->svr.num_queries_cookie_invalid);
 	PR_UL_NM("num.queries_discard_timeout",
 		s->svr.num_queries_discard_timeout);
 	PR_UL_NM("num.queries_replyaddr_limit",
 		s->svr.num_queries_replyaddr_limit);
 	PR_UL_NM("num.queries_wait_limit", s->svr.num_queries_wait_limit);
 	PR_UL_NM("num.cachehits",
 		s->svr.num_queries - s->svr.num_queries_missed_cache);
@ -263,6 +265,7 @@ static void pr_stats(const char* nm, struct ub_stats_info* s)
 	PR_UL_NM("requestlist.exceeded", s->mesh_dropped);
 	PR_UL_NM("requestlist.current.all", s->mesh_num_states);
 	PR_UL_NM("requestlist.current.user", s->mesh_num_reply_states);
 	PR_UL_NM("requestlist.current.replies", s->mesh_num_reply_addrs);
 #ifndef S_SPLINT_S
 	sumwait.tv_sec = s->mesh_replies_sum_wait_sec;
 	sumwait.tv_usec = s->mesh_replies_sum_wait_usec;