From 3af7cfe2ec2c5c9b4dda9caac62af5d83df798ea Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Thu, 31 Oct 2024 10:19:40 +0100 Subject: [PATCH 1/3] JsonRpcConnection: Don't drop client from cache prematurely PR #7445 incorrectly assumed that a peer that had already disconnected and never reconnected was due to the endpoint client being dropped after a successful socket shutdown. However, the issue at that time was that there was not a single timeout guards that could cancel the `async_shutdown` call, petentially blocking indefinetely. Although removing the client from cache early might have allowed the endpoint to reconnect, it did not resolve the underlying problem. Now that we have a proper cancellation timeout, we can wait until the currently used socket is fully closed before dropping the client from our cache. When our socket termination works reliably, the `ApiListener` reconnect timer should attempt to reconnect this endpoint after the next tick. Additionally, we now have logs both for before and after socket termination, which may help identify if it is hanging somewhere in between. --- lib/remote/jsonrpcconnection.cpp | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/lib/remote/jsonrpcconnection.cpp b/lib/remote/jsonrpcconnection.cpp index d49c0b359..d4455e393 100644 --- a/lib/remote/jsonrpcconnection.cpp +++ b/lib/remote/jsonrpcconnection.cpp @@ -254,16 +254,6 @@ void JsonRpcConnection::Disconnect() Log(LogWarning, "JsonRpcConnection") << "API client disconnected for identity '" << m_Identity << "'"; - // We need to unregister the endpoint client as soon as possible not to confuse Icinga 2, - // given that Endpoint::GetConnected() is just performing a check that the endpoint's client - // cache is not empty, which could result in an already disconnected endpoint never trying to - // reconnect again. See #7444. - if (m_Endpoint) { - m_Endpoint->RemoveClient(this); - } else { - ApiListener::GetInstance()->RemoveAnonymousClient(this); - } - m_OutgoingMessagesQueued.Set(); m_WriterDone.Wait(yc); @@ -272,6 +262,12 @@ void JsonRpcConnection::Disconnect() m_HeartbeatTimer.cancel(); m_Stream->GracefulDisconnect(m_IoStrand, yc); + + if (m_Endpoint) { + m_Endpoint->RemoveClient(this); + } else { + ApiListener::GetInstance()->RemoveAnonymousClient(this); + } }); } } From 41373ad0e55838ed257e340c953d4432b8aee69c Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Mon, 25 Nov 2024 11:30:10 +0100 Subject: [PATCH 2/3] Log before & after an RPC client is disconnected --- lib/remote/jsonrpcconnection.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/lib/remote/jsonrpcconnection.cpp b/lib/remote/jsonrpcconnection.cpp index d4455e393..47a0d2b79 100644 --- a/lib/remote/jsonrpcconnection.cpp +++ b/lib/remote/jsonrpcconnection.cpp @@ -250,10 +250,10 @@ void JsonRpcConnection::Disconnect() if (!m_ShuttingDown.exchange(true)) { JsonRpcConnection::Ptr keepAlive (this); - IoEngine::SpawnCoroutine(m_IoStrand, [this, keepAlive](asio::yield_context yc) { - Log(LogWarning, "JsonRpcConnection") - << "API client disconnected for identity '" << m_Identity << "'"; + Log(LogNotice, "JsonRpcConnection") + << "Disconnecting API client for identity '" << m_Identity << "'"; + IoEngine::SpawnCoroutine(m_IoStrand, [this, keepAlive](asio::yield_context yc) { m_OutgoingMessagesQueued.Set(); m_WriterDone.Wait(yc); @@ -268,6 +268,9 @@ void JsonRpcConnection::Disconnect() } else { ApiListener::GetInstance()->RemoveAnonymousClient(this); } + + Log(LogWarning, "JsonRpcConnection") + << "API client disconnected for identity '" << m_Identity << "'"; }); } } From 142564193144d2f8c64cf4b9e159eb5ddbcee378 Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Thu, 28 Nov 2024 11:06:44 +0100 Subject: [PATCH 3/3] Don't endlessly wait on writer coroutine on disconnect --- lib/remote/jsonrpcconnection.cpp | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/lib/remote/jsonrpcconnection.cpp b/lib/remote/jsonrpcconnection.cpp index 47a0d2b79..b8ae22aaa 100644 --- a/lib/remote/jsonrpcconnection.cpp +++ b/lib/remote/jsonrpcconnection.cpp @@ -256,7 +256,22 @@ void JsonRpcConnection::Disconnect() IoEngine::SpawnCoroutine(m_IoStrand, [this, keepAlive](asio::yield_context yc) { m_OutgoingMessagesQueued.Set(); - m_WriterDone.Wait(yc); + { + Timeout writerTimeout( + m_IoStrand, + boost::posix_time::seconds(5), + [this]() { + // The writer coroutine could not finish soon enough to unblock the waiter down blow, + // so we have to do this on our own, and the coroutine will be terminated forcibly when + // the ops on the underlying socket are cancelled. + boost::system::error_code ec; + m_Stream->lowest_layer().cancel(ec); + } + ); + + m_WriterDone.Wait(yc); + // We don't need to explicitly cancel the timer here; its destructor will handle it for us. + } m_CheckLivenessTimer.cancel(); m_HeartbeatTimer.cancel();