From a280855f7ba743027440c274efdff4bb4c1e5668 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Sur=C3=BD?= Date: Wed, 13 Jul 2022 09:34:47 +0200 Subject: [PATCH 1/3] Handle the transient TCP connect() failures on FreeBSD On FreeBSD (and perhaps other *BSD) systems, the TCP connect() call (via uv_tcp_connect()) can fail with transient UV_EADDRINUSE error. The UDP code already handles this by trying three times (is a charm) before giving up. Add a code for the TCP, TCPDNS and TLSDNS layers to also try three times before giving up by calling uv_tcp_connect() from the callback two more time on UV_EADDRINUSE error. Additionally, stop the timer only if we succeed or on hard error via isc__nm_failed_connect_cb(). --- lib/isc/netmgr/netmgr-int.h | 1 + lib/isc/netmgr/netmgr.c | 5 ++++- lib/isc/netmgr/tcp.c | 30 +++++++++++++++++++++++------- lib/isc/netmgr/tcpdns.c | 30 +++++++++++++++++++++++------- lib/isc/netmgr/tlsdns.c | 23 ++++++++++++++++++++--- lib/isc/netmgr/udp.c | 3 +-- 6 files changed, 72 insertions(+), 20 deletions(-) diff --git a/lib/isc/netmgr/netmgr-int.h b/lib/isc/netmgr/netmgr-int.h index ebb57a3390..96f863fa4d 100644 --- a/lib/isc/netmgr/netmgr-int.h +++ b/lib/isc/netmgr/netmgr-int.h @@ -377,6 +377,7 @@ struct isc__nm_uvreq { isc__nm_cb_t cb; /* callback */ void *cbarg; /* callback argument */ isc_nm_timer_t *timer; /* TCP write timer */ + int connect_tries; /* connect retries */ union { uv_handle_t handle; diff --git a/lib/isc/netmgr/netmgr.c b/lib/isc/netmgr/netmgr.c index 4b416fae06..350117d6f7 100644 --- a/lib/isc/netmgr/netmgr.c +++ b/lib/isc/netmgr/netmgr.c @@ -2465,7 +2465,10 @@ isc___nm_uvreq_get(isc_nm_t *mgr, isc_nmsocket_t *sock FLARG) { req = isc_mem_get(mgr->mctx, sizeof(*req)); } - *req = (isc__nm_uvreq_t){ .magic = 0 }; + *req = (isc__nm_uvreq_t){ + .magic = 0, + .connect_tries = 3, + }; ISC_LINK_INIT(req, link); req->uv_req.req.data = req; isc___nmsocket_attach(sock, &req->sock FLARG_PASS); diff --git a/lib/isc/netmgr/tcp.c b/lib/isc/netmgr/tcp.c index f7af009081..3d139b0dc1 100644 --- a/lib/isc/netmgr/tcp.c +++ b/lib/isc/netmgr/tcp.c @@ -228,9 +228,6 @@ tcp_connect_cb(uv_connect_t *uvreq, int status) { REQUIRE(VALID_NMSOCK(sock)); REQUIRE(sock->tid == isc_nm_tid()); - isc__nmsocket_timer_stop(sock); - uv_handle_set_data((uv_handle_t *)&sock->read_timer, sock); - req = uv_handle_get_data((uv_handle_t *)uvreq); REQUIRE(VALID_UVREQ(req)); @@ -239,9 +236,7 @@ tcp_connect_cb(uv_connect_t *uvreq, int status) { if (atomic_load(&sock->timedout)) { result = ISC_R_TIMEDOUT; goto error; - } - - if (!atomic_load(&sock->connecting)) { + } else if (!atomic_load(&sock->connecting)) { /* * The connect was cancelled from timeout; just clean up * the req. @@ -260,11 +255,33 @@ tcp_connect_cb(uv_connect_t *uvreq, int status) { /* Timeout status code here indicates hard error */ result = ISC_R_TIMEDOUT; goto error; + } else if (status == UV_EADDRINUSE) { + /* + * On FreeBSD the TCP connect() call sometimes results in a + * spurious transient EADDRINUSE. Try a few more times before + * giving up. + */ + if (--req->connect_tries > 0) { + r = uv_tcp_connect(&req->uv_req.connect, + &sock->uv_handle.tcp, + &req->peer.type.sa, tcp_connect_cb); + if (r != 0) { + isc__nm_incstats(sock, STATID_CONNECTFAIL); + result = isc_uverr2result(r); + goto error; + } + return; + } + result = isc_uverr2result(status); + goto error; } else if (status != 0) { result = isc_uverr2result(status); goto error; } + isc__nmsocket_timer_stop(sock); + uv_handle_set_data((uv_handle_t *)&sock->read_timer, sock); + isc__nm_incstats(sock, STATID_CONNECT); r = uv_tcp_getpeername(&sock->uv_handle.tcp, (struct sockaddr *)&ss, &(int){ sizeof(ss) }); @@ -281,7 +298,6 @@ tcp_connect_cb(uv_connect_t *uvreq, int status) { isc__nm_connectcb(sock, req, ISC_R_SUCCESS, false); return; - error: isc__nm_failed_connect_cb(sock, req, result, false); } diff --git a/lib/isc/netmgr/tcpdns.c b/lib/isc/netmgr/tcpdns.c index e0da11b43f..685268eed9 100644 --- a/lib/isc/netmgr/tcpdns.c +++ b/lib/isc/netmgr/tcpdns.c @@ -198,9 +198,6 @@ tcpdns_connect_cb(uv_connect_t *uvreq, int status) { REQUIRE(VALID_NMSOCK(sock)); REQUIRE(sock->tid == isc_nm_tid()); - isc__nmsocket_timer_stop(sock); - uv_handle_set_data((uv_handle_t *)&sock->read_timer, sock); - req = uv_handle_get_data((uv_handle_t *)uvreq); REQUIRE(VALID_UVREQ(req)); @@ -209,9 +206,7 @@ tcpdns_connect_cb(uv_connect_t *uvreq, int status) { if (atomic_load(&sock->timedout)) { result = ISC_R_TIMEDOUT; goto error; - } - - if (isc__nm_closing(sock)) { + } else if (isc__nm_closing(sock)) { /* Network manager shutting down */ result = ISC_R_SHUTTINGDOWN; goto error; @@ -223,11 +218,33 @@ tcpdns_connect_cb(uv_connect_t *uvreq, int status) { /* Timeout status code here indicates hard error */ result = ISC_R_TIMEDOUT; goto error; + } else if (status == UV_EADDRINUSE) { + /* + * On FreeBSD the TCP connect() call sometimes results in a + * spurious transient EADDRINUSE. Try a few more times before + * giving up. + */ + if (--req->connect_tries > 0) { + r = uv_tcp_connect( + &req->uv_req.connect, &sock->uv_handle.tcp, + &req->peer.type.sa, tcpdns_connect_cb); + if (r != 0) { + isc__nm_incstats(sock, STATID_CONNECTFAIL); + result = isc_uverr2result(r); + goto error; + } + return; + } + result = isc_uverr2result(status); + goto error; } else if (status != 0) { result = isc_uverr2result(status); goto error; } + isc__nmsocket_timer_stop(sock); + uv_handle_set_data((uv_handle_t *)&sock->read_timer, sock); + isc__nm_incstats(sock, STATID_CONNECT); r = uv_tcp_getpeername(&sock->uv_handle.tcp, (struct sockaddr *)&ss, &(int){ sizeof(ss) }); @@ -244,7 +261,6 @@ tcpdns_connect_cb(uv_connect_t *uvreq, int status) { isc__nm_connectcb(sock, req, ISC_R_SUCCESS, false); return; - error: isc__nm_failed_connect_cb(sock, req, result, false); } diff --git a/lib/isc/netmgr/tlsdns.c b/lib/isc/netmgr/tlsdns.c index 42f47cb423..44c546836b 100644 --- a/lib/isc/netmgr/tlsdns.c +++ b/lib/isc/netmgr/tlsdns.c @@ -246,9 +246,7 @@ tlsdns_connect_cb(uv_connect_t *uvreq, int status) { if (atomic_load(&sock->timedout)) { result = ISC_R_TIMEDOUT; goto error; - } - - if (isc__nm_closing(sock)) { + } else if (isc__nm_closing(sock)) { /* Network manager shutting down */ result = ISC_R_SHUTTINGDOWN; goto error; @@ -260,6 +258,25 @@ tlsdns_connect_cb(uv_connect_t *uvreq, int status) { /* Timeout status code here indicates hard error */ result = ISC_R_TIMEDOUT; goto error; + } else if (status == UV_EADDRINUSE) { + /* + * On FreeBSD the TCP connect() call sometimes results in a + * spurious transient EADDRINUSE. Try a few more times before + * giving up. + */ + if (--req->connect_tries > 0) { + r = uv_tcp_connect( + &req->uv_req.connect, &sock->uv_handle.tcp, + &req->peer.type.sa, tlsdns_connect_cb); + if (r != 0) { + isc__nm_incstats(sock, STATID_CONNECTFAIL); + result = isc_uverr2result(r); + goto error; + } + return; + } + result = isc_uverr2result(status); + goto error; } else if (status != 0) { result = isc_uverr2result(status); goto error; diff --git a/lib/isc/netmgr/udp.c b/lib/isc/netmgr/udp.c index 37a3f0480a..b4aa2ea763 100644 --- a/lib/isc/netmgr/udp.c +++ b/lib/isc/netmgr/udp.c @@ -846,7 +846,6 @@ udp_connect_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req) { isc__networker_t *worker = NULL; int uv_bind_flags = UV_UDP_REUSEADDR; isc_result_t result = ISC_R_UNSET; - int tries = 3; int r; REQUIRE(isc__nm_in_netthread()); @@ -901,7 +900,7 @@ udp_connect_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req) { do { r = isc_uv_udp_connect(&sock->uv_handle.udp, &req->peer.type.sa); - } while (r == UV_EADDRINUSE && --tries > 0); + } while (r == UV_EADDRINUSE && --req->connect_tries > 0); if (r != 0) { isc__nm_incstats(sock, STATID_CONNECTFAIL); goto done; From 3e10d3b45f31538602dee12ff2a7c36385438c1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Sur=C3=BD?= Date: Thu, 14 Jul 2022 13:22:34 +0200 Subject: [PATCH 2/3] Cleanup the STATID_CONNECT and STATID_CONNECTFAIL stat counters The STATID_CONNECT and STATID_CONNECTFAIL statistics were used incorrectly. The STATID_CONNECT was incremented twice (once in the *_connect_direct() and once in the callback) and STATID_CONNECTFAIL would not be incremented at all if the failure happened in the callback. Closes: #3452 --- lib/isc/netmgr/netmgr.c | 2 ++ lib/isc/netmgr/tcp.c | 4 +--- lib/isc/netmgr/tcpdns.c | 4 +--- lib/isc/netmgr/tlsdns.c | 4 +--- 4 files changed, 5 insertions(+), 9 deletions(-) diff --git a/lib/isc/netmgr/netmgr.c b/lib/isc/netmgr/netmgr.c index 350117d6f7..f1a0b94186 100644 --- a/lib/isc/netmgr/netmgr.c +++ b/lib/isc/netmgr/netmgr.c @@ -1919,6 +1919,8 @@ isc__nm_failed_connect_cb(isc_nmsocket_t *sock, isc__nm_uvreq_t *req, REQUIRE(sock->tid == isc_nm_tid()); REQUIRE(req->cb.connect != NULL); + isc__nm_incstats(sock, STATID_CONNECTFAIL); + isc__nmsocket_timer_stop(sock); uv_handle_set_data((uv_handle_t *)&sock->read_timer, sock); diff --git a/lib/isc/netmgr/tcp.c b/lib/isc/netmgr/tcp.c index 3d139b0dc1..6b417963e1 100644 --- a/lib/isc/netmgr/tcp.c +++ b/lib/isc/netmgr/tcp.c @@ -165,7 +165,6 @@ tcp_connect_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req) { isc__nm_incstats(sock, STATID_CONNECTFAIL); goto done; } - isc__nm_incstats(sock, STATID_CONNECT); uv_handle_set_data((uv_handle_t *)&sock->read_timer, &req->uv_req.connect); @@ -219,7 +218,7 @@ isc__nm_async_tcpconnect(isc__networker_t *worker, isc__netievent_t *ev0) { static void tcp_connect_cb(uv_connect_t *uvreq, int status) { - isc_result_t result; + isc_result_t result = ISC_R_UNSET; isc__nm_uvreq_t *req = NULL; isc_nmsocket_t *sock = uv_handle_get_data((uv_handle_t *)uvreq->handle); struct sockaddr_storage ss; @@ -266,7 +265,6 @@ tcp_connect_cb(uv_connect_t *uvreq, int status) { &sock->uv_handle.tcp, &req->peer.type.sa, tcp_connect_cb); if (r != 0) { - isc__nm_incstats(sock, STATID_CONNECTFAIL); result = isc_uverr2result(r); goto error; } diff --git a/lib/isc/netmgr/tcpdns.c b/lib/isc/netmgr/tcpdns.c index 685268eed9..ce281a118f 100644 --- a/lib/isc/netmgr/tcpdns.c +++ b/lib/isc/netmgr/tcpdns.c @@ -135,7 +135,6 @@ tcpdns_connect_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req) { isc__nm_incstats(sock, STATID_CONNECTFAIL); goto done; } - isc__nm_incstats(sock, STATID_CONNECT); uv_handle_set_data((uv_handle_t *)&sock->read_timer, &req->uv_req.connect); @@ -189,7 +188,7 @@ isc__nm_async_tcpdnsconnect(isc__networker_t *worker, isc__netievent_t *ev0) { static void tcpdns_connect_cb(uv_connect_t *uvreq, int status) { - isc_result_t result; + isc_result_t result = ISC_R_UNSET; isc__nm_uvreq_t *req = NULL; isc_nmsocket_t *sock = uv_handle_get_data((uv_handle_t *)uvreq->handle); struct sockaddr_storage ss; @@ -229,7 +228,6 @@ tcpdns_connect_cb(uv_connect_t *uvreq, int status) { &req->uv_req.connect, &sock->uv_handle.tcp, &req->peer.type.sa, tcpdns_connect_cb); if (r != 0) { - isc__nm_incstats(sock, STATID_CONNECTFAIL); result = isc_uverr2result(r); goto error; } diff --git a/lib/isc/netmgr/tlsdns.c b/lib/isc/netmgr/tlsdns.c index 44c546836b..73f153bb9c 100644 --- a/lib/isc/netmgr/tlsdns.c +++ b/lib/isc/netmgr/tlsdns.c @@ -173,7 +173,6 @@ tlsdns_connect_direct(isc_nmsocket_t *sock, isc__nm_uvreq_t *req) { isc__nm_incstats(sock, STATID_CONNECTFAIL); goto done; } - isc__nm_incstats(sock, STATID_CONNECT); uv_handle_set_data((uv_handle_t *)&sock->read_timer, &req->uv_req.connect); @@ -229,7 +228,7 @@ isc__nm_async_tlsdnsconnect(isc__networker_t *worker, isc__netievent_t *ev0) { static void tlsdns_connect_cb(uv_connect_t *uvreq, int status) { - isc_result_t result; + isc_result_t result = ISC_R_UNSET; isc__nm_uvreq_t *req = NULL; isc_nmsocket_t *sock = uv_handle_get_data((uv_handle_t *)uvreq->handle); struct sockaddr_storage ss; @@ -269,7 +268,6 @@ tlsdns_connect_cb(uv_connect_t *uvreq, int status) { &req->uv_req.connect, &sock->uv_handle.tcp, &req->peer.type.sa, tlsdns_connect_cb); if (r != 0) { - isc__nm_incstats(sock, STATID_CONNECTFAIL); result = isc_uverr2result(r); goto error; } From 6e71fd2a88977eb17553d8dfcb542a2ebe4be834 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ond=C5=99ej=20Sur=C3=BD?= Date: Wed, 13 Jul 2022 09:43:00 +0200 Subject: [PATCH 3/3] Add CHANGES note for [GL #3451] --- CHANGES | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CHANGES b/CHANGES index 49e43f3fa4..c9ef106552 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,7 @@ +5926. [func] Handle transient TCP connect() EADDRINUSE failures + on FreeBSD (and possibly other BSDs) by trying three + times before giving up. [GL #3451] + 5925. [bug] With a forwarder configured for all queries, resolution failures encountered during DS chasing could trigger assertion failures due to a logic bug in