From ecf5632ef8adcb79e03746133efde18e3b53ad6a Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Mon, 13 Apr 2026 09:32:34 +0200 Subject: [PATCH 1/3] Timeout: lift `VERIFY` -> `ASSERT` to prevent crashes in release builds `strand.running_in_this_thread()` relies on thread-local storage internally, and may return false positives if the coroutine is resumed in a different thread than it was suspended in. In debug builds, this is not problem, since there's no TLS optimization done by the compiler, but in release builds, the compiler might cache the address of the thread-local variable read before the coroutine suspension, and thus potentially reuse the same address in a different thread after resumption, which would cause `running_in_this_thread()` to return false or even crash (but we didn't see any crashes related to this). So, perform the assertion only in debug builds to prevent potential wrong usages of the `Timeout` class. For more details, see [^1][^2][^3]. [^1]: https://github.com/chriskohlhoff/asio/issues/1366 [^2]: https://bugs.llvm.org/show_bug.cgi?id=19177 [^3]: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=26461 --- lib/base/io-engine.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/base/io-engine.hpp b/lib/base/io-engine.hpp index 5e9ef084c..6f074d253 100644 --- a/lib/base/io-engine.hpp +++ b/lib/base/io-engine.hpp @@ -237,7 +237,7 @@ public: Timeout(boost::asio::io_context::strand& strand, const Timer::duration_type& timeoutFromNow, OnTimeout onTimeout) : m_Timer(strand.context(), timeoutFromNow), m_Cancelled(Shared>::Make(false)) { - VERIFY(strand.running_in_this_thread()); + ASSERT(strand.running_in_this_thread()); m_Timer.async_wait(boost::asio::bind_executor( strand, [cancelled = m_Cancelled, onTimeout = std::move(onTimeout)](boost::system::error_code ec) { From 25a18a5a7e1ea5bebdf890b67e2441962dba55de Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Mon, 13 Apr 2026 09:43:12 +0200 Subject: [PATCH 2/3] OTel: downgrade `broken_pipe` errors to debug log --- lib/otel/otel.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/otel/otel.cpp b/lib/otel/otel.cpp index f2a0d6aca..42be64f3a 100644 --- a/lib/otel/otel.cpp +++ b/lib/otel/otel.cpp @@ -353,7 +353,7 @@ void OTel::ExportLoop(boost::asio::yield_context& yc) // indicate a broken connection and force a reconnect in those cases. For the `end_of_stream` case, // we downgrade the log severity to debug level since this is a normal occurrence when using an OTEL // collector compatible backend that don't honor keep-alive connections (e.g., OpenSearch Data Prepper). - if (m_Stopped || (ser && ser->code() == http::error::end_of_stream)) { + if (m_Stopped || (ser && (ser->code() == http::error::end_of_stream || ser->code() == boost::asio::error::broken_pipe))) { severity = LogDebug; } Log{severity, "OTelExporter", DiagnosticInformation(ex, false)}; From e4436cbcf02e4df750bc38b805c88ce3f90b0394 Mon Sep 17 00:00:00 2001 From: Yonas Habteab Date: Wed, 15 Apr 2026 17:13:56 +0200 Subject: [PATCH 3/3] IoEngine: introduce & use `IsStrandRunningOnThisThread` function --- lib/base/io-engine.cpp | 2 +- lib/base/io-engine.hpp | 23 ++++++++++++++++++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/lib/base/io-engine.cpp b/lib/base/io-engine.cpp index 2f93fc693..423f9f7c2 100644 --- a/lib/base/io-engine.cpp +++ b/lib/base/io-engine.cpp @@ -30,7 +30,7 @@ using namespace icinga; CpuBoundWork::CpuBoundWork(boost::asio::yield_context yc, boost::asio::io_context::strand& strand) : m_Done(false) { - VERIFY(strand.running_in_this_thread()); + VERIFY(IoEngine::IsStrandRunningOnThisThread(strand)); auto& ie (IoEngine::Get()); Shared::Ptr cv; diff --git a/lib/base/io-engine.hpp b/lib/base/io-engine.hpp index 6f074d253..cc2eb72de 100644 --- a/lib/base/io-engine.hpp +++ b/lib/base/io-engine.hpp @@ -96,6 +96,27 @@ public: static IoEngine& Get(); + /** + * Checks whether the given strand is currently running in the calling thread. + * + * This is a simple wrapper around @c running_in_this_thread() with a little but significant difference: + * It is marked as @c noinline to prevent the compiler from ever inlining the call to this function and + * thus potentially optimizing away the thread-local storage access that is required for this function + * to work correctly. This is especially important for the case where the caller is a coroutine that have + * some suspension points between the calls to this function, and cause the compiler to assume that the + * thread-local access performed by @c running_in_this_thread() is invariant across these suspensions and + * thus optimize it by caching the result in a register or on the stack, which would lead to incorrect + * results after resuming the coroutine on a different thread. For more details, see [^1][^2][^3]. + * + * [^1]: https://github.com/chriskohlhoff/asio/issues/1366 + * [^2]: https://bugs.llvm.org/show_bug.cgi?id=19177 + * [^3]: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=26461 + */ + BOOST_NOINLINE static bool IsStrandRunningOnThisThread(const boost::asio::io_context::strand& strand) + { + return strand.running_in_this_thread(); + } + boost::asio::io_context& GetIoContext(); static inline size_t GetCoroutineStackSize() { @@ -237,7 +258,7 @@ public: Timeout(boost::asio::io_context::strand& strand, const Timer::duration_type& timeoutFromNow, OnTimeout onTimeout) : m_Timer(strand.context(), timeoutFromNow), m_Cancelled(Shared>::Make(false)) { - ASSERT(strand.running_in_this_thread()); + ASSERT(IoEngine::IsStrandRunningOnThisThread(strand)); m_Timer.async_wait(boost::asio::bind_executor( strand, [cancelled = m_Cancelled, onTimeout = std::move(onTimeout)](boost::system::error_code ec) {