diff --git a/CHANGES b/CHANGES index 7f5b381442..3165203b82 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,8 @@ +6399. [security] Malicious DNS client that sends many queries over + TCP but never reads responses can cause server to + respond slowly or not respond at all for other + clients. (CVE-2024-0760) [GL #4481] + 6398. [bug] Fix potential data races in our DoH implementation related to HTTP/2 session object management and endpoints set object management after reconfiguration. diff --git a/doc/notes/notes-current.rst b/doc/notes/notes-current.rst index 4a3db14ed4..1cda5eeab3 100644 --- a/doc/notes/notes-current.rst +++ b/doc/notes/notes-current.rst @@ -15,7 +15,9 @@ Notes for BIND 9.19.25 Security Fixes ~~~~~~~~~~~~~~ -- None. +- Malicious DNS client that sends many queries over TCP but never reads + responses can cause server to respond slowly or not respond at all for other + clients. :cve:`2024-0760` :gl:`#4481` New Features ~~~~~~~~~~~~ diff --git a/lib/isc/netmgr/netmgr-int.h b/lib/isc/netmgr/netmgr-int.h index 56e5bef4a9..f5d8c11418 100644 --- a/lib/isc/netmgr/netmgr-int.h +++ b/lib/isc/netmgr/netmgr-int.h @@ -61,9 +61,10 @@ #endif /* - * The TCP receive buffer can fit one maximum sized DNS message plus its size, - * the receive buffer here affects TCP, DoT and DoH. + * The TCP send and receive buffers can fit one maximum sized DNS message plus + * its size, the receive buffer here affects TCP, DoT and DoH. */ +#define ISC_NETMGR_TCP_SENDBUF_SIZE (sizeof(uint16_t) + UINT16_MAX) #define ISC_NETMGR_TCP_RECVBUF_SIZE (sizeof(uint16_t) + UINT16_MAX) /* Pick the larger buffer */ @@ -84,6 +85,11 @@ STATIC_ASSERT(ISC_NETMGR_TCP_RECVBUF_SIZE <= ISC_NETMGR_RECVBUF_SIZE, "TCP receive buffer size must be smaller or equal than worker " "receive buffer size"); +/*% + * Maximum outstanding DNS message that we process in a single TCP read. + */ +#define ISC_NETMGR_MAX_STREAM_CLIENTS_PER_CONN 23 + /*% * Regular TCP buffer size. */ @@ -660,6 +666,9 @@ struct isc_nmsocket { ISC_LIST(isc_nmhandle_t) active_handles; ISC_LIST(isc__nm_uvreq_t) active_uvreqs; + size_t active_handles_cur; + size_t active_handles_max; + /*% * Used to pass a result back from listen or connect events. */ diff --git a/lib/isc/netmgr/netmgr.c b/lib/isc/netmgr/netmgr.c index 7ee0502ae1..ba56506007 100644 --- a/lib/isc/netmgr/netmgr.c +++ b/lib/isc/netmgr/netmgr.c @@ -688,6 +688,7 @@ isc___nmsocket_init(isc_nmsocket_t *sock, isc__networker_t *worker, .inactive_handles = ISC_LIST_INITIALIZER, .result = ISC_R_UNSET, .active_handles = ISC_LIST_INITIALIZER, + .active_handles_max = ISC_NETMGR_MAX_STREAM_CLIENTS_PER_CONN, .active_link = ISC_LINK_INITIALIZER, .active = true, }; @@ -853,6 +854,7 @@ isc___nmhandle_get(isc_nmsocket_t *sock, isc_sockaddr_t const *peer, } ISC_LIST_APPEND(sock->active_handles, handle, active_link); + sock->active_handles_cur++; switch (sock->type) { case isc_nm_udpsocket: @@ -967,6 +969,8 @@ nmhandle_destroy(isc_nmhandle_t *handle) { } ISC_LIST_UNLINK(sock->active_handles, handle, active_link); + INSIST(sock->active_handles_cur > 0); + sock->active_handles_cur--; if (sock->closehandle_cb == NULL) { nmhandle__destroy(handle); @@ -1128,6 +1132,7 @@ isc__nmsocket_writetimeout_cb(void *data, isc_result_t eresult) { sock = req->sock; + isc__nm_start_reading(sock); isc__nmsocket_reset(sock); } diff --git a/lib/isc/netmgr/streamdns.c b/lib/isc/netmgr/streamdns.c index cab9c22f62..35148fed6c 100644 --- a/lib/isc/netmgr/streamdns.c +++ b/lib/isc/netmgr/streamdns.c @@ -91,6 +91,9 @@ streamdns_try_close_unused(isc_nmsocket_t *sock); static bool streamdns_closing(isc_nmsocket_t *sock); +static void +streamdns_resume_processing(void *arg); + static void streamdns_resumeread(isc_nmsocket_t *sock, isc_nmhandle_t *transphandle) { if (!sock->streamdns.reading) { @@ -169,18 +172,32 @@ streamdns_on_complete_dnsmessage(isc_dnsstream_assembler_t *dnsasm, stop = true; } + if (sock->active_handles_max != 0 && + (sock->active_handles_cur >= sock->active_handles_max)) + { + stop = true; + } + INSIST(sock->active_handles_cur <= sock->active_handles_max); + isc__nmsocket_timer_stop(sock); - if (!stop && last_datum) { + if (stop) { + streamdns_pauseread(sock, transphandle); + } else if (last_datum) { /* * We have processed all data, need to read more. * The call also restarts the timer. */ streamdns_readmore(sock, transphandle); - } else if (stop) { + } else { + /* + * Process more DNS messages in the next loop tick. + */ streamdns_pauseread(sock, transphandle); + isc_async_run(sock->worker->loop, streamdns_resume_processing, + sock); } - return (!stop); + return (false); } /* @@ -670,6 +687,12 @@ streamdns_resume_processing(void *arg) { return; } + if (sock->active_handles_max != 0 && + (sock->active_handles_cur >= sock->active_handles_max)) + { + return; + } + streamdns_handle_incoming_data(sock, sock->outerhandle, NULL, 0); } diff --git a/lib/isc/netmgr/tcp.c b/lib/isc/netmgr/tcp.c index 63612bf130..45e1c79c73 100644 --- a/lib/isc/netmgr/tcp.c +++ b/lib/isc/netmgr/tcp.c @@ -655,6 +655,7 @@ isc__nm_tcp_failed_read_cb(isc_nmsocket_t *sock, isc_result_t result, isc__nmsocket_timer_stop(sock); isc__nm_stop_reading(sock); + sock->reading = false; if (sock->recv_cb != NULL) { isc__nm_uvreq_t *req = isc__nm_get_read_req(sock, NULL); @@ -701,13 +702,14 @@ isc__nm_tcp_read(isc_nmhandle_t *handle, isc_nm_recv_cb_t cb, void *cbarg) { goto failure; } + sock->reading = true; + if (!sock->manual_read_timer) { isc__nmsocket_timer_start(sock); } return; failure: - sock->reading = true; isc__nm_tcp_failed_read_cb(sock, result, true); } @@ -720,6 +722,7 @@ isc__nm_tcp_read_stop(isc_nmhandle_t *handle) { isc__nmsocket_timer_stop(sock); isc__nm_stop_reading(sock); + sock->reading = false; return; } @@ -771,8 +774,29 @@ isc__nm_tcp_read_cb(uv_stream_t *stream, ssize_t nread, const uv_buf_t *buf) { isc__nm_readcb(sock, req, ISC_R_SUCCESS, false); - /* The readcb could have paused the reading */ - if (sock->reading && !sock->manual_read_timer) { + if (!sock->client && sock->reading) { + /* + * Stop reading if we have accumulated enough bytes in the send + * queue; this means that the TCP client is not reading back the + * data we sending to it, and there's no reason to continue + * processing more incoming DNS messages, if the client is not + * reading back the responses. + */ + size_t write_queue_size = + uv_stream_get_write_queue_size(&sock->uv_handle.stream); + + if (write_queue_size >= ISC_NETMGR_TCP_SENDBUF_SIZE) { + isc__nmsocket_log( + sock, ISC_LOG_DEBUG(3), + "throttling TCP connection, the other side is " + "not reading the data (%zu)", + write_queue_size); + isc__nm_stop_reading(sock); + } + } else if (uv_is_active(&sock->uv_handle.handle) && + !sock->manual_read_timer) + { + /* The readcb could have paused the reading */ /* The timer will be updated */ isc__nmsocket_timer_restart(sock); } @@ -996,6 +1020,32 @@ isc__nm_tcp_senddns(isc_nmhandle_t *handle, const isc_region_t *region, tcp_send(handle, region, cb, cbarg, true); } +static void +tcp_maybe_restart_reading(isc_nmsocket_t *sock) { + if (!sock->client && sock->reading && + !uv_is_active(&sock->uv_handle.handle)) + { + /* + * Restart reading if we have less data in the send queue than + * the send buffer size, this means that the TCP client has + * started reading some data again. Starting reading when we go + * under the limit instead of waiting for all data has been + * flushed allows faster recovery (in case there was a + * congestion and now there isn't). + */ + size_t write_queue_size = + uv_stream_get_write_queue_size(&sock->uv_handle.stream); + if (write_queue_size < ISC_NETMGR_TCP_SENDBUF_SIZE) { + isc__nmsocket_log( + sock, ISC_LOG_DEBUG(3), + "resuming TCP connection, the other side " + "is reading the data again (%zu)", + write_queue_size); + isc__nm_start_reading(sock); + } + } +} + static void tcp_send_cb(uv_write_t *req, int status) { isc__nm_uvreq_t *uvreq = (isc__nm_uvreq_t *)req->data; @@ -1013,10 +1063,15 @@ tcp_send_cb(uv_write_t *req, int status) { isc__nm_incstats(sock, STATID_SENDFAIL); isc__nm_failed_send_cb(sock, uvreq, isc_uverr2result(status), false); + if (!sock->client && sock->reading) { + isc__nm_start_reading(sock); + isc__nmsocket_reset(sock); + } return; } isc__nm_sendcb(sock, uvreq, ISC_R_SUCCESS, false); + tcp_maybe_restart_reading(sock); } static isc_result_t @@ -1045,6 +1100,7 @@ tcp_send_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req) { if (r == (int)(bufs[0].len)) { /* Wrote everything */ isc__nm_sendcb(sock, req, ISC_R_SUCCESS, true); + tcp_maybe_restart_reading(sock); return (ISC_R_SUCCESS); } else if (r > 0) { bufs[0].base += (size_t)r; @@ -1064,6 +1120,7 @@ tcp_send_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req) { if (r == (int)(bufs[0].len + bufs[1].len)) { /* Wrote everything */ isc__nm_sendcb(sock, req, ISC_R_SUCCESS, true); + tcp_maybe_restart_reading(sock); return (ISC_R_SUCCESS); } else if (r == 1) { /* Partial write of DNSMSG length */ @@ -1148,6 +1205,7 @@ isc__nm_tcp_close(isc_nmsocket_t *sock) { /* 2. close the socket + destroy the socket in callback */ isc__nmsocket_clearcb(sock); isc__nm_stop_reading(sock); + sock->reading = false; uv_close(&sock->uv_handle.handle, tcp_close_cb); /* 1. close the timer */ @@ -1226,7 +1284,7 @@ isc__nmhandle_tcp_set_manual_timer(isc_nmhandle_t *handle, const bool manual) { REQUIRE(VALID_NMSOCK(sock)); REQUIRE(sock->type == isc_nm_tcpsocket); REQUIRE(sock->tid == isc_tid()); - REQUIRE(!sock->reading); + REQUIRE(!uv_is_active(&sock->uv_handle.handle)); sock->manual_read_timer = manual; } diff --git a/lib/ns/client.c b/lib/ns/client.c index f6ccdd0c5b..c1341be828 100644 --- a/lib/ns/client.c +++ b/lib/ns/client.c @@ -98,6 +98,9 @@ #define COOKIE_SIZE 24U /* 8 + 4 + 4 + 8 */ #define ECS_SIZE 20U /* 2 + 1 + 1 + [0..16] */ +#define TCPBUFFERS_FILLCOUNT 1U +#define TCPBUFFERS_FREEMAX 8U + #define WANTNSID(x) (((x)->attributes & NS_CLIENTATTR_WANTNSID) != 0) #define WANTEXPIRE(x) (((x)->attributes & NS_CLIENTATTR_WANTEXPIRE) != 0) #define WANTPAD(x) (((x)->attributes & NS_CLIENTATTR_WANTPAD) != 0) @@ -369,6 +372,29 @@ client_senddone(isc_nmhandle_t *handle, isc_result_t result, void *cbarg) { isc_nmhandle_detach(&handle); } +static void +client_setup_tcp_buffer(ns_client_t *client) { + REQUIRE(client->tcpbuf == NULL); + + client->tcpbuf = client->manager->tcp_buffer; + client->tcpbuf_size = NS_CLIENT_TCP_BUFFER_SIZE; +} + +static void +client_put_tcp_buffer(ns_client_t *client) { + if (client->tcpbuf == NULL) { + return; + } + + if (client->tcpbuf != client->manager->tcp_buffer) { + isc_mem_put(client->manager->mctx, client->tcpbuf, + client->tcpbuf_size); + } + + client->tcpbuf = NULL; + client->tcpbuf_size = 0; +} + static void client_allocsendbuf(ns_client_t *client, isc_buffer_t *buffer, unsigned char **datap) { @@ -378,12 +404,9 @@ client_allocsendbuf(ns_client_t *client, isc_buffer_t *buffer, REQUIRE(datap != NULL); if (TCP_CLIENT(client)) { - INSIST(client->tcpbuf == NULL); - client->tcpbuf = isc_mem_get(client->manager->send_mctx, - NS_CLIENT_TCP_BUFFER_SIZE); - client->tcpbuf_size = NS_CLIENT_TCP_BUFFER_SIZE; + client_setup_tcp_buffer(client); data = client->tcpbuf; - isc_buffer_init(buffer, data, NS_CLIENT_TCP_BUFFER_SIZE); + isc_buffer_init(buffer, data, client->tcpbuf_size); } else { data = client->sendbuf; if ((client->attributes & NS_CLIENTATTR_HAVECOOKIE) == 0) { @@ -416,11 +439,49 @@ client_sendpkg(ns_client_t *client, isc_buffer_t *buffer) { if (isc_buffer_base(buffer) == client->tcpbuf) { size_t used = isc_buffer_usedlength(buffer); - client->tcpbuf = isc_mem_creget( - client->manager->send_mctx, client->tcpbuf, - client->tcpbuf_size, used, sizeof(char)); - client->tcpbuf_size = used; - r.base = client->tcpbuf; + INSIST(client->tcpbuf_size == NS_CLIENT_TCP_BUFFER_SIZE); + + /* + * Copy the data into a smaller buffer before sending, + * and keep the original big TCP send buffer for reuse + * by other clients. + */ + if (used > NS_CLIENT_SEND_BUFFER_SIZE) { + /* + * We can save space by allocating a new buffer with a + * correct size and freeing the big buffer. + */ + unsigned char *new_tcpbuf = + isc_mem_get(client->manager->mctx, used); + memmove(new_tcpbuf, buffer->base, used); + + /* + * Put the big buffer so we can replace the pointer + * and the size with the new ones. + */ + client_put_tcp_buffer(client); + + /* + * Keep the new buffer's information so it can be freed. + */ + client->tcpbuf = new_tcpbuf; + client->tcpbuf_size = used; + + r.base = new_tcpbuf; + } else { + /* + * The data fits in the available space in + * 'sendbuf', there is no need for a new buffer. + */ + memmove(client->sendbuf, buffer->base, used); + + /* + * Put the big buffer, we don't need a dynamic buffer. + */ + client_put_tcp_buffer(client); + + r.base = client->sendbuf; + } r.length = used; } else { isc_buffer_usedregion(buffer, &r); @@ -498,8 +559,7 @@ ns_client_sendraw(ns_client_t *client, dns_message_t *message) { return; done: if (client->tcpbuf != NULL) { - isc_mem_put(client->manager->send_mctx, client->tcpbuf, - client->tcpbuf_size); + client_put_tcp_buffer(client); } ns_client_drop(client, result); @@ -785,8 +845,7 @@ renderend: cleanup: if (client->tcpbuf != NULL) { - isc_mem_put(client->manager->send_mctx, client->tcpbuf, - client->tcpbuf_size); + client_put_tcp_buffer(client); } if (cleanup_cctx) { @@ -1625,8 +1684,7 @@ ns__client_reset_cb(void *client0) { ns_client_endrequest(client); if (client->tcpbuf != NULL) { - isc_mem_put(client->manager->send_mctx, client->tcpbuf, - client->tcpbuf_size); + client_put_tcp_buffer(client); } if (client->keytag != NULL) { @@ -1662,8 +1720,6 @@ ns__client_put_cb(void *client0) { client->magic = 0; - isc_mem_put(manager->send_mctx, client->sendbuf, - NS_CLIENT_SEND_BUFFER_SIZE); if (client->opt != NULL) { INSIST(dns_rdataset_isassociated(client->opt)); dns_rdataset_disassociate(client->opt); @@ -2371,8 +2427,6 @@ ns__client_setup(ns_client_t *client, ns_clientmgr_t *mgr, bool new) { client->manager->rdspool, DNS_MESSAGE_INTENTPARSE, &client->message); - client->sendbuf = isc_mem_get(client->manager->send_mctx, - NS_CLIENT_SEND_BUFFER_SIZE); /* * Set magic earlier than usual because ns_query_init() * and the functions it calls will require it. @@ -2393,7 +2447,6 @@ ns__client_setup(ns_client_t *client, ns_clientmgr_t *mgr, bool new) { *client = (ns_client_t){ .magic = 0, .manager = client->manager, - .sendbuf = client->sendbuf, .message = client->message, .query = client->query, }; @@ -2418,8 +2471,6 @@ ns__client_setup(ns_client_t *client, ns_clientmgr_t *mgr, bool new) { return (ISC_R_SUCCESS); cleanup: - isc_mem_put(client->manager->send_mctx, client->sendbuf, - NS_CLIENT_SEND_BUFFER_SIZE); dns_message_detach(&client->message); ns_clientmgr_detach(&client->manager); @@ -2447,8 +2498,6 @@ clientmgr_destroy_cb(void *arg) { dns_message_destroypools(&manager->rdspool, &manager->namepool); - isc_mem_detach(&manager->send_mctx); - isc_mem_putanddetach(&manager->mctx, manager, sizeof(*manager)); } @@ -2483,61 +2532,6 @@ ns_clientmgr_create(ns_server_t *sctx, isc_loopmgr_t *loopmgr, dns_message_createpools(mctx, &manager->namepool, &manager->rdspool); - /* - * We create specialised per-worker memory context specifically - * dedicated and tuned for allocating send buffers as it is a very - * common operation. Not doing so may result in excessive memory - * use in certain workloads. - * - * Please see this thread for more details: - * - * https://github.com/jemalloc/jemalloc/issues/2483 - * - * In particular, this information from the jemalloc developers is - * of the most interest: - * - * https://github.com/jemalloc/jemalloc/issues/2483#issuecomment-1639019699 - * https://github.com/jemalloc/jemalloc/issues/2483#issuecomment-1698173849 - * - * In essence, we use the following memory management strategy: - * - * 1. We use a per-worker memory arena for send buffers memory - * allocation to reduce lock contention (In reality, we create a - * per-client manager arena, but we have one client manager per - * worker). - * - * 2. The automatically created arenas settings remain unchanged - * and may be controlled by users (e.g. by setting the - * "MALLOC_CONF" variable). - * - * 3. We attune the arenas to not use dirty pages cache as the - * cache would have a poor reuse rate, and that is known to - * significantly contribute to excessive memory use. - * - * 4. There is no strict need for the dirty cache, as there is a - * per arena bin for each allocation size, so because we initially - * allocate strictly 64K per send buffer (enough for a DNS - * message), allocations would get directed to one bin (an "object - * pool" or a "slab") maintained within an arena. That is, there - * is an object pool already, specifically to optimise for the - * case of frequent allocations of objects of the given size. The - * object pool should suffice our needs, as we will end up - * recycling the objects from there without the need to back it by - * an additional layer of dirty pages cache. The dirty pages cache - * would have worked better in the case when there are more - * allocation bins involved due to a higher reuse rate (the case - * of a more "generic" memory management). - */ - isc_mem_create_arena(&manager->send_mctx); - isc_mem_setname(manager->send_mctx, "sendbufs"); - (void)isc_mem_arena_set_dirty_decay_ms(manager->send_mctx, 0); - /* - * Disable muzzy pages cache too, as versions < 5.2.0 have it - * enabled by default. The muzzy pages cache goes right below the - * dirty pages cache and backs it. - */ - (void)isc_mem_arena_set_muzzy_decay_ms(manager->send_mctx, 0); - manager->magic = MANAGER_MAGIC; MTRACE("create"); diff --git a/lib/ns/include/ns/client.h b/lib/ns/include/ns/client.h index 345f5e7782..67126b01f7 100644 --- a/lib/ns/include/ns/client.h +++ b/lib/ns/include/ns/client.h @@ -144,7 +144,6 @@ struct ns_clientmgr { unsigned int magic; isc_mem_t *mctx; - isc_mem_t *send_mctx; isc_mempool_t *namepool; isc_mempool_t *rdspool; @@ -158,6 +157,8 @@ struct ns_clientmgr { /* Lock covers the recursing list */ isc_mutex_t reclock; client_list_t recursing; /*%< Recursing clients */ + + uint8_t tcp_buffer[NS_CLIENT_TCP_BUFFER_SIZE]; }; /*% nameserver client structure */ @@ -178,7 +179,6 @@ struct ns_client { unsigned char *tcpbuf; size_t tcpbuf_size; dns_message_t *message; - unsigned char *sendbuf; dns_rdataset_t *opt; dns_ednsopt_t *ede; uint16_t udpsize; @@ -228,6 +228,8 @@ struct ns_client { * bits will be used as the rcode in the response message. */ int32_t rcode_override; + + uint8_t sendbuf[NS_CLIENT_SEND_BUFFER_SIZE]; }; #define NS_CLIENT_MAGIC ISC_MAGIC('N', 'S', 'C', 'c')