From c6f13f12cd862ffae071e56ee7e1fa9998fc23c3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ond=C5=99ej=20Sur=C3=BD?= <ondrej@isc.org>
Date: Thu, 18 Jan 2024 17:24:22 +0100
Subject: [PATCH 1/6] Throttle reading from TCP if the sends are not getting
 through

When TCP client would not read the DNS message sent to them, the TCP
sends inside named would accumulate and cause degradation of the
service.  Throttle the reading from the TCP socket when we accumulate
enough DNS data to be sent.  Currently this is limited in a way that a
single largest possible DNS message can fit into the buffer.

(cherry picked from commit 26006f7b44474819fac2a76dc6cd6f69f0d76828)
---
 lib/isc/netmgr/netmgr-int.h |  5 ++--
 lib/isc/netmgr/netmgr.c     |  1 +
 lib/isc/netmgr/tcp.c        | 58 +++++++++++++++++++++++++++++++++++++
 lib/isc/netmgr/tcpdns.c     | 54 ++++++++++++++++++++++++++++++++++
 4 files changed, 116 insertions(+), 2 deletions(-)

diff --git a/lib/isc/netmgr/netmgr-int.h b/lib/isc/netmgr/netmgr-int.h
index f95a51173e..6d2bd9ad9f 100644
--- a/lib/isc/netmgr/netmgr-int.h
+++ b/lib/isc/netmgr/netmgr-int.h
@@ -62,9 +62,10 @@
 #endif
 
 /*
- * The TCP receive buffer can fit one maximum sized DNS message plus its size,
- * the receive buffer here affects TCP, DoT and DoH.
+ * The TCP send and receive buffers can fit one maximum sized DNS message plus
+ * its size, the receive buffer here affects TCP, DoT and DoH.
  */
+#define ISC_NETMGR_TCP_SENDBUF_SIZE (sizeof(uint16_t) + UINT16_MAX)
 #define ISC_NETMGR_TCP_RECVBUF_SIZE (sizeof(uint16_t) + UINT16_MAX)
 
 /* Pick the larger buffer */
diff --git a/lib/isc/netmgr/netmgr.c b/lib/isc/netmgr/netmgr.c
index 820099553d..7b09639440 100644
--- a/lib/isc/netmgr/netmgr.c
+++ b/lib/isc/netmgr/netmgr.c
@@ -2086,6 +2086,7 @@ isc__nmsocket_writetimeout_cb(void *data, isc_result_t eresult) {
 
 	sock = req->sock;
 
+	isc__nm_start_reading(sock);
 	isc__nmsocket_reset(sock);
 }
 
diff --git a/lib/isc/netmgr/tcp.c b/lib/isc/netmgr/tcp.c
index 16b53cc579..016a9e9059 100644
--- a/lib/isc/netmgr/tcp.c
+++ b/lib/isc/netmgr/tcp.c
@@ -905,6 +905,31 @@ isc__nm_tcp_read_cb(uv_stream_t *stream, ssize_t nread, const uv_buf_t *buf) {
 
 	/* The readcb could have paused the reading */
 	if (atomic_load(&sock->reading)) {
+		if (!sock->client) {
+			/*
+			 * Stop reading if we have accumulated enough bytes in
+			 * the send queue; this means that the TCP client is not
+			 * reading back the data we sending to it, and there's
+			 * no reason to continue processing more incoming DNS
+			 * messages, if the client is not reading back the
+			 * responses.
+			 */
+			size_t write_queue_size =
+				uv_stream_get_write_queue_size(
+					&sock->uv_handle.stream);
+
+			if (write_queue_size >= ISC_NETMGR_TCP_SENDBUF_SIZE) {
+				isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
+					      ISC_LOGMODULE_NETMGR,
+					      ISC_LOG_DEBUG(3),
+					      "throttling TCP connection, "
+					      "the other side is "
+					      "not reading the data (%zu)",
+					      write_queue_size);
+				isc__nm_stop_reading(sock);
+			}
+		}
+
 		/* The timer will be updated */
 		isc__nmsocket_timer_restart(sock);
 	}
@@ -1095,6 +1120,33 @@ isc__nm_tcp_send(isc_nmhandle_t *handle, const isc_region_t *region,
 	return;
 }
 
+static void
+tcp_maybe_restart_reading(isc_nmsocket_t *sock) {
+	if (!sock->client && sock->reading &&
+	    !uv_is_active(&sock->uv_handle.handle))
+	{
+		/*
+		 * Restart reading if we have less data in the send queue than
+		 * the send buffer size, this means that the TCP client has
+		 * started reading some data again.  Starting reading when we go
+		 * under the limit instead of waiting for all data has been
+		 * flushed allows faster recovery (in case there was a
+		 * congestion and now there isn't).
+		 */
+		size_t write_queue_size =
+			uv_stream_get_write_queue_size(&sock->uv_handle.stream);
+		if (write_queue_size < ISC_NETMGR_TCP_SENDBUF_SIZE) {
+			isc_log_write(
+				isc_lctx, ISC_LOGCATEGORY_GENERAL,
+				ISC_LOGMODULE_NETMGR, ISC_LOG_DEBUG(3),
+				"resuming TCP connection, the other side  "
+				"is reading the data again (%zu)",
+				write_queue_size);
+			isc__nm_start_reading(sock);
+		}
+	}
+}
+
 static void
 tcp_send_cb(uv_write_t *req, int status) {
 	isc__nm_uvreq_t *uvreq = (isc__nm_uvreq_t *)req->data;
@@ -1112,10 +1164,16 @@ tcp_send_cb(uv_write_t *req, int status) {
 		isc__nm_incstats(sock, STATID_SENDFAIL);
 		isc__nm_failed_send_cb(sock, uvreq,
 				       isc__nm_uverr2result(status));
+
+		if (!sock->client && sock->reading) {
+			isc__nm_start_reading(sock);
+			isc__nmsocket_reset(sock);
+		}
 		return;
 	}
 
 	isc__nm_sendcb(sock, uvreq, ISC_R_SUCCESS, false);
+	tcp_maybe_restart_reading(sock);
 }
 
 /*
diff --git a/lib/isc/netmgr/tcpdns.c b/lib/isc/netmgr/tcpdns.c
index dd3e9ec3fe..00ecb0f3d2 100644
--- a/lib/isc/netmgr/tcpdns.c
+++ b/lib/isc/netmgr/tcpdns.c
@@ -913,6 +913,27 @@ isc__nm_tcpdns_read_cb(uv_stream_t *stream, ssize_t nread,
 	result = isc__nm_process_sock_buffer(sock);
 	if (result != ISC_R_SUCCESS) {
 		isc__nm_failed_read_cb(sock, result, true);
+	} else if (!sock->client) {
+		/*
+		 * Stop reading if we have accumulated enough bytes in
+		 * the send queue; this means that the TCP client is not
+		 * reading back the data we sending to it, and there's
+		 * no reason to continue processing more incoming DNS
+		 * messages, if the client is not reading back the
+		 * responses.
+		 */
+		size_t write_queue_size =
+			uv_stream_get_write_queue_size(&sock->uv_handle.stream);
+
+		if (write_queue_size >= ISC_NETMGR_TCP_SENDBUF_SIZE) {
+			isc_log_write(isc_lctx, ISC_LOGCATEGORY_GENERAL,
+				      ISC_LOGMODULE_NETMGR, ISC_LOG_DEBUG(3),
+				      "throttling TCP connection, "
+				      "the other side is "
+				      "not reading the data (%zu)",
+				      write_queue_size);
+			isc__nm_stop_reading(sock);
+		}
 	}
 free:
 	if (nread < 0) {
@@ -1135,6 +1156,33 @@ isc__nm_tcpdns_send(isc_nmhandle_t *handle, isc_region_t *region,
 	return;
 }
 
+static void
+tcpdns_maybe_restart_reading(isc_nmsocket_t *sock) {
+	if (!sock->client && sock->reading &&
+	    !uv_is_active(&sock->uv_handle.handle))
+	{
+		/*
+		 * Restart reading if we have less data in the send queue than
+		 * the send buffer size, this means that the TCP client has
+		 * started reading some data again.  Starting reading when we go
+		 * under the limit instead of waiting for all data has been
+		 * flushed allows faster recovery (in case there was a
+		 * congestion and now there isn't).
+		 */
+		size_t write_queue_size =
+			uv_stream_get_write_queue_size(&sock->uv_handle.stream);
+		if (write_queue_size < ISC_NETMGR_TCP_SENDBUF_SIZE) {
+			isc_log_write(
+				isc_lctx, ISC_LOGCATEGORY_GENERAL,
+				ISC_LOGMODULE_NETMGR, ISC_LOG_DEBUG(3),
+				"resuming TCP connection, the other side  "
+				"is reading the data again (%zu)",
+				write_queue_size);
+			isc__nm_start_reading(sock);
+		}
+	}
+}
+
 static void
 tcpdns_send_cb(uv_write_t *req, int status) {
 	isc__nm_uvreq_t *uvreq = (isc__nm_uvreq_t *)req->data;
@@ -1152,10 +1200,16 @@ tcpdns_send_cb(uv_write_t *req, int status) {
 		isc__nm_incstats(sock, STATID_SENDFAIL);
 		isc__nm_failed_send_cb(sock, uvreq,
 				       isc__nm_uverr2result(status));
+
+		if (!sock->client && sock->reading) {
+			isc__nm_start_reading(sock);
+			isc__nmsocket_reset(sock);
+		}
 		return;
 	}
 
 	isc__nm_sendcb(sock, uvreq, ISC_R_SUCCESS, false);
+	tcpdns_maybe_restart_reading(sock);
 }
 
 /*

From 4e70342142f42f6c46d1baf3232f2881ced7a3a2 Mon Sep 17 00:00:00 2001
From: Aram Sargsyan <aram@isc.org>
Date: Tue, 12 Mar 2024 15:29:51 +0000
Subject: [PATCH 2/6] ns_client: reuse TCP send buffers

Constantly allocating, reallocating and deallocating 64K TCP send
buffers by 'ns_client' instances takes too much CPU time.

There is an existing mechanism to reuse the ns_clent_t structure
associated with the handle using 'isc_nmhandle_getdata/_setdata'
(see ns_client_request()), but it doesn't work with TCP, because
every time ns_client_request() is called it gets a new handle even
for the same TCP connection, see the comments in
streamdns_on_complete_dnsmessage().

To solve the problem, we introduce an array of available (unused)
TCP buffers stored in ns_clientmgr_t structure so that a 'client'
working via TCP can have a chance to reuse one (if there is one)
instead of allocating a new one every time.
---
 lib/ns/client.c            | 108 +++++++++++++++++++++++++++++++------
 lib/ns/include/ns/client.h |   2 +
 2 files changed, 94 insertions(+), 16 deletions(-)

diff --git a/lib/ns/client.c b/lib/ns/client.c
index a62343bc7b..e3cd815922 100644
--- a/lib/ns/client.c
+++ b/lib/ns/client.c
@@ -101,6 +101,9 @@
 #define COOKIE_SIZE 24U /* 8 + 4 + 4 + 8 */
 #define ECS_SIZE    20U /* 2 + 1 + 1 + [0..16] */
 
+#define TCPBUFFERS_FILLCOUNT 1U
+#define TCPBUFFERS_FREEMAX   8U
+
 #define WANTNSID(x)	(((x)->attributes & NS_CLIENTATTR_WANTNSID) != 0)
 #define WANTEXPIRE(x)	(((x)->attributes & NS_CLIENTATTR_WANTEXPIRE) != 0)
 #define WANTPAD(x)	(((x)->attributes & NS_CLIENTATTR_WANTPAD) != 0)
@@ -336,6 +339,30 @@ client_senddone(isc_nmhandle_t *handle, isc_result_t result, void *cbarg) {
 	isc_nmhandle_detach(&handle);
 }
 
+static void
+client_setup_tcp_buffer(ns_client_t *client) {
+	REQUIRE(client->tcpbuf == NULL);
+
+	client->tcpbuf = isc_mempool_get(client->manager->tcp_buffers);
+	client->tcpbuf_size = NS_CLIENT_TCP_BUFFER_SIZE;
+}
+
+static void
+client_put_tcp_buffer(ns_client_t *client) {
+	if (client->tcpbuf == NULL) {
+		return;
+	}
+
+	if (client->tcpbuf_size == NS_CLIENT_TCP_BUFFER_SIZE) {
+		isc_mempool_put(client->manager->tcp_buffers, client->tcpbuf);
+	} else {
+		isc_mem_put(client->manager->send_mctx, client->tcpbuf,
+			    client->tcpbuf_size);
+	}
+
+	client->tcpbuf_size = 0;
+}
+
 static void
 client_allocsendbuf(ns_client_t *client, isc_buffer_t *buffer,
 		    unsigned char **datap) {
@@ -345,12 +372,9 @@ client_allocsendbuf(ns_client_t *client, isc_buffer_t *buffer,
 	REQUIRE(datap != NULL);
 
 	if (TCP_CLIENT(client)) {
-		INSIST(client->tcpbuf == NULL);
-		client->tcpbuf = isc_mem_get(client->manager->send_mctx,
-					     NS_CLIENT_TCP_BUFFER_SIZE);
-		client->tcpbuf_size = NS_CLIENT_TCP_BUFFER_SIZE;
+		client_setup_tcp_buffer(client);
 		data = client->tcpbuf;
-		isc_buffer_init(buffer, data, NS_CLIENT_TCP_BUFFER_SIZE);
+		isc_buffer_init(buffer, data, client->tcpbuf_size);
 	} else {
 		data = client->sendbuf;
 		if ((client->attributes & NS_CLIENTATTR_HAVECOOKIE) == 0) {
@@ -383,11 +407,56 @@ client_sendpkg(ns_client_t *client, isc_buffer_t *buffer) {
 
 	if (isc_buffer_base(buffer) == client->tcpbuf) {
 		size_t used = isc_buffer_usedlength(buffer);
-		client->tcpbuf = isc_mem_reget(client->manager->send_mctx,
-					       client->tcpbuf,
-					       client->tcpbuf_size, used);
-		client->tcpbuf_size = used;
-		r.base = client->tcpbuf;
+		const size_t threshold = (3 * NS_CLIENT_TCP_BUFFER_SIZE) / 4;
+
+		INSIST(client->tcpbuf_size == NS_CLIENT_TCP_BUFFER_SIZE);
+		/*
+		 * Copy the data into a smaller buffer before sending,
+		 * and keep the original big TCP send buffer for reuse
+		 * by other clients.
+		 */
+		if (used > threshold) {
+			/*
+			 * The data in the buffer is very large, so there is
+			 * no point in using a smaller buffer.
+			 */
+			r.base = buffer->base;
+		} else if (used > NS_CLIENT_SEND_BUFFER_SIZE) {
+			/*
+			 * We can save space by allocating a new buffer with a
+			 * correct size and freeing the big buffer.
+			 */
+			unsigned char *new_tcpbuf =
+				isc_mem_get(client->manager->send_mctx, used);
+			memmove(new_tcpbuf, buffer->base, used);
+
+			/*
+			 * Put the big buffer so we can replace the pointer
+			 * and the size with the new ones.
+			 */
+			client_put_tcp_buffer(client);
+
+			/*
+			 * Keep the new buffer's information so it can be freed.
+			 */
+			client->tcpbuf = new_tcpbuf;
+			client->tcpbuf_size = used;
+
+			r.base = new_tcpbuf;
+		} else {
+			/*
+			 * The data fits in the available space in
+			 * 'sendbuf', there is no need for a new buffer.
+			 */
+			memmove(client->sendbuf, buffer->base, used);
+
+			/*
+			 * Put the big buffer, we don't need a dynamic buffer.
+			 */
+			client_put_tcp_buffer(client);
+
+			r.base = client->sendbuf;
+		}
 		r.length = used;
 	} else {
 		isc_buffer_usedregion(buffer, &r);
@@ -461,8 +530,7 @@ ns_client_sendraw(ns_client_t *client, dns_message_t *message) {
 	return;
 done:
 	if (client->tcpbuf != NULL) {
-		isc_mem_put(client->manager->send_mctx, client->tcpbuf,
-			    client->tcpbuf_size);
+		client_put_tcp_buffer(client);
 	}
 
 	ns_client_drop(client, result);
@@ -746,8 +814,7 @@ renderend:
 
 cleanup:
 	if (client->tcpbuf != NULL) {
-		isc_mem_put(client->manager->send_mctx, client->tcpbuf,
-			    client->tcpbuf_size);
+		client_put_tcp_buffer(client);
 	}
 
 	if (cleanup_cctx) {
@@ -1629,8 +1696,7 @@ ns__client_reset_cb(void *client0) {
 
 	ns_client_endrequest(client);
 	if (client->tcpbuf != NULL) {
-		isc_mem_put(client->manager->send_mctx, client->tcpbuf,
-			    client->tcpbuf_size);
+		client_put_tcp_buffer(client);
 	}
 
 	if (client->keytag != NULL) {
@@ -2341,6 +2407,7 @@ ns__client_setup(ns_client_t *client, ns_clientmgr_t *mgr, bool new) {
 
 		client->sendbuf = isc_mem_get(client->manager->send_mctx,
 					      NS_CLIENT_SEND_BUFFER_SIZE);
+
 		/*
 		 * Set magic earlier than usual because ns_query_init()
 		 * and the functions it calls will require it.
@@ -2461,6 +2528,8 @@ clientmgr_destroy(ns_clientmgr_t *manager) {
 	isc_task_detach(&manager->task);
 	ns_server_detach(&manager->sctx);
 
+	isc_mempool_destroy(&manager->tcp_buffers);
+
 	isc_mem_detach(&manager->send_mctx);
 
 	isc_mem_putanddetach(&manager->mctx, manager, sizeof(*manager));
@@ -2554,6 +2623,13 @@ ns_clientmgr_create(ns_server_t *sctx, isc_taskmgr_t *taskmgr,
 	 */
 	(void)isc_mem_arena_set_muzzy_decay_ms(manager->send_mctx, 0);
 
+	isc_mempool_create(manager->send_mctx,
+			   (size_t)NS_CLIENT_TCP_BUFFER_SIZE,
+			   &manager->tcp_buffers);
+	isc_mempool_setfillcount(manager->tcp_buffers, TCPBUFFERS_FILLCOUNT);
+	isc_mempool_setfreemax(manager->tcp_buffers, TCPBUFFERS_FREEMAX);
+	isc_mempool_setname(manager->tcp_buffers, "ns_clientmgr_tcp");
+
 	manager->magic = MANAGER_MAGIC;
 
 	MTRACE("create");
diff --git a/lib/ns/include/ns/client.h b/lib/ns/include/ns/client.h
index 7a7196f763..3570689c64 100644
--- a/lib/ns/include/ns/client.h
+++ b/lib/ns/include/ns/client.h
@@ -159,6 +159,8 @@ struct ns_clientmgr {
 	/* Lock covers the recursing list */
 	isc_mutex_t   reclock;
 	client_list_t recursing; /*%< Recursing clients */
+
+	isc_mempool_t *tcp_buffers;
 };
 
 /*% nameserver client structure */

From 3f6b7f57a60b29af10118a979709b57dc2f951bb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ond=C5=99ej=20Sur=C3=BD?= <ondrej@isc.org>
Date: Tue, 4 Jun 2024 08:38:35 +0200
Subject: [PATCH 3/6] Replace the tcp_buffers memory pool with static per-loop
 buffer

As a single thread can process only one TCP send at the time, we don't
really need a memory pool for the TCP buffers, but it's enough to have
a single per-loop (client manager) static buffer that's being used to
assemble the DNS message and then it gets copied into own sending
buffer.

In the future, this should get optimized by exposing the uv_try API
from the network manager, and first try to send the message directly
and allocate the sending buffer only if we need to send the data
asynchronously.

(cherry picked from commit 297cc840fbaf34b9dfa1d02d88a023cd5bf5dc4a)
---
 lib/ns/client.c            | 36 +++++-------------------------------
 lib/ns/include/ns/client.h |  5 +++--
 2 files changed, 8 insertions(+), 33 deletions(-)

diff --git a/lib/ns/client.c b/lib/ns/client.c
index e3cd815922..7143e49249 100644
--- a/lib/ns/client.c
+++ b/lib/ns/client.c
@@ -343,7 +343,7 @@ static void
 client_setup_tcp_buffer(ns_client_t *client) {
 	REQUIRE(client->tcpbuf == NULL);
 
-	client->tcpbuf = isc_mempool_get(client->manager->tcp_buffers);
+	client->tcpbuf = client->manager->tcp_buffer;
 	client->tcpbuf_size = NS_CLIENT_TCP_BUFFER_SIZE;
 }
 
@@ -353,13 +353,12 @@ client_put_tcp_buffer(ns_client_t *client) {
 		return;
 	}
 
-	if (client->tcpbuf_size == NS_CLIENT_TCP_BUFFER_SIZE) {
-		isc_mempool_put(client->manager->tcp_buffers, client->tcpbuf);
-	} else {
+	if (client->tcpbuf != client->manager->tcp_buffer) {
 		isc_mem_put(client->manager->send_mctx, client->tcpbuf,
 			    client->tcpbuf_size);
 	}
 
+	client->tcpbuf = NULL;
 	client->tcpbuf_size = 0;
 }
 
@@ -407,21 +406,14 @@ client_sendpkg(ns_client_t *client, isc_buffer_t *buffer) {
 
 	if (isc_buffer_base(buffer) == client->tcpbuf) {
 		size_t used = isc_buffer_usedlength(buffer);
-		const size_t threshold = (3 * NS_CLIENT_TCP_BUFFER_SIZE) / 4;
-
 		INSIST(client->tcpbuf_size == NS_CLIENT_TCP_BUFFER_SIZE);
+
 		/*
 		 * Copy the data into a smaller buffer before sending,
 		 * and keep the original big TCP send buffer for reuse
 		 * by other clients.
 		 */
-		if (used > threshold) {
-			/*
-			 * The data in the buffer is very large, so there is
-			 * no point in using a smaller buffer.
-			 */
-			r.base = buffer->base;
-		} else if (used > NS_CLIENT_SEND_BUFFER_SIZE) {
+		if (used > NS_CLIENT_SEND_BUFFER_SIZE) {
 			/*
 			 * We can save space by allocating a new buffer with a
 			 * correct size and freeing the big buffer.
@@ -1727,8 +1719,6 @@ ns__client_put_cb(void *client0) {
 	client->magic = 0;
 	client->shuttingdown = true;
 
-	isc_mem_put(client->manager->send_mctx, client->sendbuf,
-		    NS_CLIENT_SEND_BUFFER_SIZE);
 	if (client->opt != NULL) {
 		INSIST(dns_rdataset_isassociated(client->opt));
 		dns_rdataset_disassociate(client->opt);
@@ -2405,9 +2395,6 @@ ns__client_setup(ns_client_t *client, ns_clientmgr_t *mgr, bool new) {
 		dns_message_create(client->mctx, DNS_MESSAGE_INTENTPARSE,
 				   &client->message);
 
-		client->sendbuf = isc_mem_get(client->manager->send_mctx,
-					      NS_CLIENT_SEND_BUFFER_SIZE);
-
 		/*
 		 * Set magic earlier than usual because ns_query_init()
 		 * and the functions it calls will require it.
@@ -2424,7 +2411,6 @@ ns__client_setup(ns_client_t *client, ns_clientmgr_t *mgr, bool new) {
 		ns_clientmgr_t *oldmgr = client->manager;
 		ns_server_t *sctx = client->sctx;
 		isc_task_t *task = client->task;
-		unsigned char *sendbuf = client->sendbuf;
 		dns_message_t *message = client->message;
 		isc_mem_t *oldmctx = client->mctx;
 		ns_query_t query = client->query;
@@ -2439,7 +2425,6 @@ ns__client_setup(ns_client_t *client, ns_clientmgr_t *mgr, bool new) {
 					 .manager = oldmgr,
 					 .sctx = sctx,
 					 .task = task,
-					 .sendbuf = sendbuf,
 					 .message = message,
 					 .query = query,
 					 .tid = tid };
@@ -2464,8 +2449,6 @@ ns__client_setup(ns_client_t *client, ns_clientmgr_t *mgr, bool new) {
 	return (ISC_R_SUCCESS);
 
 cleanup:
-	isc_mem_put(client->manager->send_mctx, client->sendbuf,
-		    NS_CLIENT_SEND_BUFFER_SIZE);
 	dns_message_detach(&client->message);
 	isc_task_detach(&client->task);
 	ns_clientmgr_detach(&client->manager);
@@ -2528,8 +2511,6 @@ clientmgr_destroy(ns_clientmgr_t *manager) {
 	isc_task_detach(&manager->task);
 	ns_server_detach(&manager->sctx);
 
-	isc_mempool_destroy(&manager->tcp_buffers);
-
 	isc_mem_detach(&manager->send_mctx);
 
 	isc_mem_putanddetach(&manager->mctx, manager, sizeof(*manager));
@@ -2623,13 +2604,6 @@ ns_clientmgr_create(ns_server_t *sctx, isc_taskmgr_t *taskmgr,
 	 */
 	(void)isc_mem_arena_set_muzzy_decay_ms(manager->send_mctx, 0);
 
-	isc_mempool_create(manager->send_mctx,
-			   (size_t)NS_CLIENT_TCP_BUFFER_SIZE,
-			   &manager->tcp_buffers);
-	isc_mempool_setfillcount(manager->tcp_buffers, TCPBUFFERS_FILLCOUNT);
-	isc_mempool_setfreemax(manager->tcp_buffers, TCPBUFFERS_FREEMAX);
-	isc_mempool_setname(manager->tcp_buffers, "ns_clientmgr_tcp");
-
 	manager->magic = MANAGER_MAGIC;
 
 	MTRACE("create");
diff --git a/lib/ns/include/ns/client.h b/lib/ns/include/ns/client.h
index 3570689c64..08ee1123a8 100644
--- a/lib/ns/include/ns/client.h
+++ b/lib/ns/include/ns/client.h
@@ -160,7 +160,7 @@ struct ns_clientmgr {
 	isc_mutex_t   reclock;
 	client_list_t recursing; /*%< Recursing clients */
 
-	isc_mempool_t *tcp_buffers;
+	uint8_t tcp_buffer[NS_CLIENT_TCP_BUFFER_SIZE];
 };
 
 /*% nameserver client structure */
@@ -189,7 +189,6 @@ struct ns_client {
 	unsigned char  *tcpbuf;
 	size_t		tcpbuf_size;
 	dns_message_t  *message;
-	unsigned char  *sendbuf;
 	dns_rdataset_t *opt;
 	dns_ednsopt_t  *ede;
 	uint16_t	udpsize;
@@ -242,6 +241,8 @@ struct ns_client {
 	 * bits will be used as the rcode in the response message.
 	 */
 	int32_t rcode_override;
+
+	uint8_t sendbuf[NS_CLIENT_SEND_BUFFER_SIZE];
 };
 
 #define NS_CLIENT_MAGIC	   ISC_MAGIC('N', 'S', 'C', 'c')

From 0b1d70ed2a729e39b2d19f5aa486ba89c98b0c90 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ond=C5=99ej=20Sur=C3=BD?= <ondrej@isc.org>
Date: Tue, 4 Jun 2024 11:21:24 +0200
Subject: [PATCH 4/6] Remove the extra memory context with own arena for
 sending

(cherry picked from commit 8d4cc41c291f8a77a723ae8e62533538b3632d50)
---
 lib/ns/client.c            | 61 ++------------------------------------
 lib/ns/include/ns/client.h |  1 -
 2 files changed, 2 insertions(+), 60 deletions(-)

diff --git a/lib/ns/client.c b/lib/ns/client.c
index 7143e49249..590859f020 100644
--- a/lib/ns/client.c
+++ b/lib/ns/client.c
@@ -354,7 +354,7 @@ client_put_tcp_buffer(ns_client_t *client) {
 	}
 
 	if (client->tcpbuf != client->manager->tcp_buffer) {
-		isc_mem_put(client->manager->send_mctx, client->tcpbuf,
+		isc_mem_put(client->manager->mctx, client->tcpbuf,
 			    client->tcpbuf_size);
 	}
 
@@ -419,7 +419,7 @@ client_sendpkg(ns_client_t *client, isc_buffer_t *buffer) {
 			 * correct size and freeing the big buffer.
 			 */
 			unsigned char *new_tcpbuf =
-				isc_mem_get(client->manager->send_mctx, used);
+				isc_mem_get(client->manager->mctx, used);
 			memmove(new_tcpbuf, buffer->base, used);
 
 			/*
@@ -2511,8 +2511,6 @@ clientmgr_destroy(ns_clientmgr_t *manager) {
 	isc_task_detach(&manager->task);
 	ns_server_detach(&manager->sctx);
 
-	isc_mem_detach(&manager->send_mctx);
-
 	isc_mem_putanddetach(&manager->mctx, manager, sizeof(*manager));
 }
 
@@ -2549,61 +2547,6 @@ ns_clientmgr_create(ns_server_t *sctx, isc_taskmgr_t *taskmgr,
 
 	ISC_LIST_INIT(manager->recursing);
 
-	/*
-	 * We create specialised per-worker memory context specifically
-	 * dedicated and tuned for allocating send buffers as it is a very
-	 * common operation. Not doing so may result in excessive memory
-	 * use in certain workloads.
-	 *
-	 * Please see this thread for more details:
-	 *
-	 * https://github.com/jemalloc/jemalloc/issues/2483
-	 *
-	 * In particular, this information from the jemalloc developers is
-	 * of the most interest:
-	 *
-	 * https://github.com/jemalloc/jemalloc/issues/2483#issuecomment-1639019699
-	 * https://github.com/jemalloc/jemalloc/issues/2483#issuecomment-1698173849
-	 *
-	 * In essence, we use the following memory management strategy:
-	 *
-	 * 1. We use a per-worker memory arena for send buffers memory
-	 * allocation to reduce lock contention (In reality, we create a
-	 * per-client manager arena, but we have one client manager per
-	 * worker).
-	 *
-	 * 2. The automatically created arenas settings remain unchanged
-	 * and may be controlled by users (e.g. by setting the
-	 * "MALLOC_CONF" variable).
-	 *
-	 * 3. We attune the arenas to not use dirty pages cache as the
-	 * cache would have a poor reuse rate, and that is known to
-	 * significantly contribute to excessive memory use.
-	 *
-	 * 4. There is no strict need for the dirty cache, as there is a
-	 * per arena bin for each allocation size, so because we initially
-	 * allocate strictly 64K per send buffer (enough for a DNS
-	 * message), allocations would get directed to one bin (an "object
-	 * pool" or a "slab") maintained within an arena. That is, there
-	 * is an object pool already, specifically to optimise for the
-	 * case of frequent allocations of objects of the given size. The
-	 * object pool should suffice our needs, as we will end up
-	 * recycling the objects from there without the need to back it by
-	 * an additional layer of dirty pages cache. The dirty pages cache
-	 * would have worked better in the case when there are more
-	 * allocation bins involved due to a higher reuse rate (the case
-	 * of a more "generic" memory management).
-	 */
-	isc_mem_create_arena(&manager->send_mctx);
-	isc_mem_setname(manager->send_mctx, "sendbufs");
-	(void)isc_mem_arena_set_dirty_decay_ms(manager->send_mctx, 0);
-	/*
-	 * Disable muzzy pages cache too, as versions < 5.2.0 have it
-	 * enabled by default. The muzzy pages cache goes right below the
-	 * dirty pages cache and backs it.
-	 */
-	(void)isc_mem_arena_set_muzzy_decay_ms(manager->send_mctx, 0);
-
 	manager->magic = MANAGER_MAGIC;
 
 	MTRACE("create");
diff --git a/lib/ns/include/ns/client.h b/lib/ns/include/ns/client.h
index 08ee1123a8..ea2d83e079 100644
--- a/lib/ns/include/ns/client.h
+++ b/lib/ns/include/ns/client.h
@@ -144,7 +144,6 @@ struct ns_clientmgr {
 	unsigned int magic;
 
 	isc_mem_t      *mctx;
-	isc_mem_t      *send_mctx;
 	ns_server_t    *sctx;
 	isc_taskmgr_t  *taskmgr;
 	isc_timermgr_t *timermgr;

From 964891a7949715805e00ecedc1cb11b2703984ea Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ond=C5=99ej=20Sur=C3=BD?= <ondrej@isc.org>
Date: Wed, 5 Jun 2024 09:15:39 +0200
Subject: [PATCH 5/6] Limit the number of DNS message processed from a single
 TCP read

The single TCP read can create as much as 64k divided by the minimum
size of the DNS message.  This can clog the processing thread and trash
the memory allocator because we need to do as much as ~20k allocations in
a single UV loop tick.

Limit the number of the DNS messages processed in a single UV loop tick
to just single DNS message and limit the number of the outstanding DNS
messages back to 23.  This effectively limits the number of pipelined
DNS messages to that number (this is the limit we already had before).

This reverts commit 780a89012d8627b9284983702dced8a3f65688aa.
---
 lib/isc/netmgr/netmgr-int.h | 8 ++++++++
 lib/isc/netmgr/netmgr.c     | 9 ++++++---
 2 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/lib/isc/netmgr/netmgr-int.h b/lib/isc/netmgr/netmgr-int.h
index 6d2bd9ad9f..4ceb182e7b 100644
--- a/lib/isc/netmgr/netmgr-int.h
+++ b/lib/isc/netmgr/netmgr-int.h
@@ -2262,6 +2262,14 @@ isc__nmsocket_readtimeout_cb(uv_timer_t *timer);
 void
 isc__nmsocket_writetimeout_cb(void *data, isc_result_t eresult);
 
+/*%<
+ *
+ * Maximum number of simultaneous handles in flight supported for a single
+ * connected TCPDNS socket. This value was chosen arbitrarily, and may be
+ * changed in the future.
+ */
+#define STREAM_CLIENTS_PER_CONN 23
+
 #define UV_RUNTIME_CHECK(func, ret)                                      \
 	if (ret != 0) {                                                  \
 		FATAL_ERROR("%s failed: %s\n", #func, uv_strerror(ret)); \
diff --git a/lib/isc/netmgr/netmgr.c b/lib/isc/netmgr/netmgr.c
index 7b09639440..336cad4e23 100644
--- a/lib/isc/netmgr/netmgr.c
+++ b/lib/isc/netmgr/netmgr.c
@@ -2343,8 +2343,10 @@ processbuffer(isc_nmsocket_t *sock) {
  * timers. If we do have a full message, reset the timer.
  *
  * Stop reading if this is a client socket, or if the server socket
- * has been set to sequential mode. In this case we'll be called again
- * later by isc__nm_resume_processing().
+ * has been set to sequential mode, or the number of queries we are
+ * processing simultaneously has reached the clients-per-connection
+ * limit. In this case we'll be called again later by
+ * isc__nm_resume_processing().
  */
 isc_result_t
 isc__nm_process_sock_buffer(isc_nmsocket_t *sock) {
@@ -2384,7 +2386,8 @@ isc__nm_process_sock_buffer(isc_nmsocket_t *sock) {
 			isc__nmsocket_timer_stop(sock);
 
 			if (atomic_load(&sock->client) ||
-			    atomic_load(&sock->sequential))
+			    atomic_load(&sock->sequential) ||
+			    atomic_load(&sock->ah) >= STREAM_CLIENTS_PER_CONN)
 			{
 				isc__nm_stop_reading(sock);
 				goto done;

From 41eb89503c76cb2f117a6f39e635d6139b146ddb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ond=C5=99ej=20Sur=C3=BD?= <ondrej@isc.org>
Date: Fri, 19 Jan 2024 21:11:32 +0100
Subject: [PATCH 6/6] Add CHANGES and release note for [GL #4481]

(cherry picked from commit 3e4babc58e1ed169a25ae9083f8f3c7d3e8389a3)
---
 CHANGES                     | 5 +++++
 doc/notes/notes-current.rst | 4 ++++
 2 files changed, 9 insertions(+)

diff --git a/CHANGES b/CHANGES
index 09abb7e88b..8587a64ab9 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,3 +1,8 @@
+6399.	[security]	Malicious DNS client that sends many queries over
+			TCP but never reads responses can cause server to
+			respond slowly or not respond at all for other
+			clients. (CVE-2024-0760) [GL #4481]
+
 6398.	[bug]		Fix potential data races in our DoH implementation
 			related to HTTP/2 session object management and
 			endpoints set object management after reconfiguration.
diff --git a/doc/notes/notes-current.rst b/doc/notes/notes-current.rst
index 38e66417b7..58fe4ad959 100644
--- a/doc/notes/notes-current.rst
+++ b/doc/notes/notes-current.rst
@@ -15,6 +15,10 @@ Notes for BIND 9.18.28
 Security Fixes
 ~~~~~~~~~~~~~~
 
+- Malicious DNS client that sends many queries over TCP but never reads
+  responses can cause server to respond slowly or not respond at all for other
+  clients. :cve:`2024-0760` :gl:`#4481`
+
 - Named could trigger an assertion failure when looking up the NS
   records of parent zones as part of looking up DS records.  This
   has been fixed. :gl:`#4661`