mirror of
https://github.com/Icinga/icinga2.git
synced 2026-04-05 17:17:47 -04:00
678 lines
25 KiB
C++
678 lines
25 KiB
C++
// SPDX-FileCopyrightText: 2026 Icinga GmbH <https://icinga.com>
|
|
// SPDX-License-Identifier: GPL-3.0-or-later
|
|
|
|
#include "otel/otel.hpp"
|
|
#include "base/application.hpp"
|
|
#include "base/defer.hpp"
|
|
#include "base/tcpsocket.hpp"
|
|
#include "base/tlsutility.hpp"
|
|
#include <boost/asio/read.hpp>
|
|
#include <boost/beast/http/message.hpp>
|
|
#include <boost/lexical_cast.hpp>
|
|
#include <future>
|
|
|
|
// Make all names of the icinga namespace usable unqualified in the rest of this file.
using namespace icinga;
|
|
|
|
// Short aliases for the Boost.Beast HTTP namespace and the OTel metrics (v1) Protobuf namespace.
namespace http = boost::beast::http;
|
|
namespace v1_metrics = opentelemetry::proto::metrics::v1;
|
|
|
|
// The max buffer size used to batch Protobuf writes to Asio streams.
|
|
static constexpr std::size_t l_BufferSize = 64UL * 1024;
|
|
// The OpenTelemetry schema convention URL used in the exported metrics.
|
|
// See https://opentelemetry.io/docs/specs/semconv/
|
|
static constexpr std::string_view l_OTelSchemaConv = "https://opentelemetry.io/schemas/1.39.0";
|
|
|
|
// Explicit instantiations of the OTel::Record and OTel::SetAttribute member templates for the listed
// argument types (int64_t and double gauge values; string_view/String and String/Value attributes).
// NOTE(review): presumably these exist so that other translation units can use the templates without
// seeing their definitions - confirm against the header.
template std::size_t OTel::Record(Gauge&, int64_t, double, double, AttrsMap);
|
|
template std::size_t OTel::Record(Gauge&, double, double, double, AttrsMap);
|
|
template void OTel::SetAttribute(Attribute&, std::string_view&&, String&&);
|
|
template void OTel::SetAttribute(Attribute&, String&&, Value&);
|
|
|
|
/**
 * Compute how long to wait before the next retry of a failed export or (re)connection attempt.
 *
 * The delay follows the exponential backoff strategy recommended by the OTel specifications: it is
 * 100ms multiplied by 2^attempt and never exceeds 30 seconds. The exponent is limited to 16 so that
 * the shift below cannot overflow a 64-bit integer; 2^16 * 100ms (6553.6s) is far beyond the cap anyway.
 *
 * @param attempt The number of the current retry attempt (the callers in this file start counting at 1).
 *
 * @return The delay to wait before retrying, in milliseconds (at most 30000ms).
 */
static constexpr std::chrono::milliseconds Backoff(uint64_t attempt)
{
	namespace ch = std::chrono;

	constexpr ch::milliseconds floorDelay{100};
	constexpr ch::milliseconds ceilingDelay = ch::seconds(30);
	constexpr uint64_t shiftLimit = 16;

	// Clamp the exponent first, then scale the minimum delay by the resulting power of two.
	const uint64_t shift = attempt < shiftLimit ? attempt : shiftLimit;
	const auto scaled = floorDelay * (1ULL << shift);

	return scaled < ceilingDelay ? ch::duration_cast<ch::milliseconds>(scaled) : ceilingDelay;
}
|
|
|
|
OTel::OTel(OTelConnInfo& connInfo): OTel{connInfo, IoEngine::Get().GetIoContext()}
|
|
{
|
|
}
|
|
|
|
OTel::OTel(OTelConnInfo& connInfo, boost::asio::io_context& io)
|
|
: m_ConnInfo{std::move(connInfo)},
|
|
m_Strand{io},
|
|
m_ExportAsioCV{io},
|
|
m_RetryExportAndConnTimer{io},
|
|
m_Exporting{false},
|
|
m_Stopped{false}
|
|
{
|
|
if (m_ConnInfo.EnableTls) {
|
|
m_TlsContext = MakeAsioSslContext(m_ConnInfo.TlsCrt, m_ConnInfo.TlsKey, m_ConnInfo.TlsCaCrt);
|
|
}
|
|
}
|
|
|
|
void OTel::Start()
|
|
{
|
|
if (m_Stopped.exchange(false)) {
|
|
ResetExporting(true);
|
|
}
|
|
|
|
IoEngine::SpawnCoroutine(m_Strand, [this, keepAlive = ConstPtr(this)](boost::asio::yield_context yc) {
|
|
ExportLoop(yc);
|
|
});
|
|
}
|
|
|
|
/**
|
|
* Stop the OTel exporter and disconnect from the OpenTelemetry backend.
|
|
*
|
|
* This method blocks until the exporter has fully stopped and disconnected from the backend.
|
|
* It cancels any ongoing export operations and clears all its internal state, so that it can be
|
|
* safely restarted later if needed.
|
|
*/
|
|
void OTel::Stop()
|
|
{
|
|
if (m_Stopped.exchange(true)) {
|
|
return;
|
|
}
|
|
|
|
std::promise<void> promise;
|
|
IoEngine::SpawnCoroutine(m_Strand, [this, &promise, keepAlive = ConstPtr(this)](boost::asio::yield_context& yc) {
|
|
m_ExportAsioCV.NotifyAll(); // Wake up the export loop if it's waiting for new export requests.
|
|
m_RetryExportAndConnTimer.cancel();
|
|
|
|
if (!m_Stream) {
|
|
promise.set_value();
|
|
return;
|
|
}
|
|
|
|
// We only wait for ongoing export operations to complete if we're currently exporting,
|
|
// otherwise there will be nothing that would wake us up from the `WaitForClear` sleep
|
|
// below, and we would end up blocking indefinitely, so we have to check the exporting
|
|
// state here first.
|
|
if (Exporting()) {
|
|
Timeout writerTimeout(m_Strand, boost::posix_time::seconds(5), [this] {
|
|
boost::system::error_code ec;
|
|
std::visit([&ec](auto& stream) { stream->lowest_layer().cancel(ec); }, *m_Stream);
|
|
});
|
|
while (m_Request) {
|
|
m_ExportAsioCV.Wait(yc);
|
|
}
|
|
}
|
|
|
|
// Check if the stream is still valid before attempting to disconnect, since the above lowest_layer.cancel()
|
|
// may have caused the export loop to detect a broken connection and reset the stream already.
|
|
if (m_Stream) {
|
|
if (auto* tlsStreamPtr = std::get_if<Shared<AsioTlsStream>::Ptr>(&*m_Stream); tlsStreamPtr) {
|
|
(*tlsStreamPtr)->GracefulDisconnect(m_Strand, yc);
|
|
} else if (auto* tcpStreamPtr = std::get_if<Shared<AsioTcpStream>::Ptr>(&*m_Stream); tcpStreamPtr) {
|
|
boost::system::error_code ec;
|
|
(*tcpStreamPtr)->lowest_layer().shutdown(AsioTcpStream::lowest_layer_type::shutdown_both, ec);
|
|
(*tcpStreamPtr)->lowest_layer().close(ec);
|
|
}
|
|
}
|
|
|
|
Log(LogInformation, "OTelExporter")
|
|
<< "Disconnected from OpenTelemetry backend.";
|
|
|
|
m_Stream.reset();
|
|
promise.set_value();
|
|
});
|
|
promise.get_future().wait();
|
|
}
|
|
|
|
/**
|
|
* Export the given OTel metrics request to the OpenTelemetry backend.
|
|
*
|
|
* This method initiates the export of the provided OTel metrics request to the configured
|
|
* OpenTelemetry backend. If an export is already in progress, it waits for the previous
|
|
* export to complete before proceeding with the new export request (blocking the caller).
|
|
*
|
|
* @param request The OTel metrics request to export.
|
|
*/
|
|
void OTel::Export(std::unique_ptr<MetricsRequest>&& request)
|
|
{
|
|
std::unique_lock lock(m_Mutex);
|
|
if (m_Exporting) {
|
|
Log(LogWarning, "OTelExporter")
|
|
<< "Received export request while previous export is still in progress. Waiting for it to complete.";
|
|
|
|
m_ExportCV.wait(lock, [this] { return m_Stopped || !m_Exporting; });
|
|
if (m_Stopped) {
|
|
return;
|
|
}
|
|
}
|
|
m_Exporting = true;
|
|
lock.unlock();
|
|
|
|
// Access to m_Request is serialized via m_Strand, so we must post the actual export operation to it.
|
|
boost::asio::post(m_Strand, [this, keepAlive = ConstPtr(this), request = std::move(request)]() mutable {
|
|
m_Request = std::move(request);
|
|
m_ExportAsioCV.NotifyAll();
|
|
});
|
|
}
|
|
|
|
/**
|
|
* Populate the standard OTel resource attributes in the given ResourceMetrics Protobuf object.
|
|
*
|
|
* This method populates the standard OTel resource attributes as per OTel specifications[^1][^2]
|
|
* into the provided ResourceMetrics Protobuf object. It sets attributes such as service name,
|
|
* instance ID, version, and telemetry SDK information.
|
|
*
|
|
* @param rm The ResourceMetrics Protobuf object to populate.
|
|
*
|
|
* [^1]: https://opentelemetry.io/docs/specs/semconv/resource/#telemetry-sdk
|
|
* [^2]: https://opentelemetry.io/docs/specs/semconv/resource/service/
|
|
*/
|
|
void OTel::PopulateResourceAttrs(const std::unique_ptr<v1_metrics::ResourceMetrics>& rm)
|
|
{
|
|
using namespace std::string_view_literals;
|
|
|
|
rm->set_schema_url(l_OTelSchemaConv.data());
|
|
auto* resource = rm->mutable_resource();
|
|
|
|
auto* attr = resource->add_attributes();
|
|
SetAttribute(*attr, "service.name"sv, "Icinga 2"sv);
|
|
|
|
auto instanceID = Application::GetEnvironmentId();
|
|
if (instanceID.IsEmpty()) {
|
|
instanceID = "unknown";
|
|
}
|
|
attr = resource->add_attributes();
|
|
SetAttribute(*attr, "service.instance.id"sv, std::move(instanceID));
|
|
|
|
attr = resource->add_attributes();
|
|
SetAttribute(*attr, "service.version"sv, Application::GetAppVersion());
|
|
|
|
attr = resource->add_attributes();
|
|
// We don't actually use OTel SDKs here, but to comply with OTel specs, we need to provide these attributes anyway.
|
|
SetAttribute(*attr, "telemetry.sdk.language"sv, "cpp"sv);
|
|
|
|
attr = resource->add_attributes();
|
|
SetAttribute(*attr, "telemetry.sdk.name"sv, "Icinga 2 OTel Integration"sv);
|
|
|
|
attr = resource->add_attributes();
|
|
SetAttribute(*attr, "telemetry.sdk.version"sv, Application::GetAppVersion());
|
|
|
|
auto* ism = rm->add_scope_metrics();
|
|
ism->set_schema_url(l_OTelSchemaConv.data());
|
|
ism->mutable_scope()->set_name("icinga2");
|
|
ism->mutable_scope()->set_version(Application::GetAppVersion());
|
|
}
|
|
|
|
/**
|
|
* Establish a connection to the OpenTelemetry backend endpoint.
|
|
*
|
|
* In case of connection failures, it retries as per OTel spec[^1] with exponential backoff until a successful
|
|
* connection is established or the exporter is stopped. Therefore, @c m_Stream is not guaranteed to be valid
|
|
* after this method returns, so the caller must check it before using it.
|
|
*
|
|
* @param yc The Boost.Asio yield context for asynchronous operations.
|
|
*
|
|
* [^1]: https://opentelemetry.io/docs/specs/otlp/#otlphttp-connection
|
|
*/
|
|
void OTel::Connect(boost::asio::yield_context& yc)
|
|
{
|
|
Log(LogInformation, "OTelExporter")
|
|
<< "Connecting to OpenTelemetry backend on host '" << m_ConnInfo.Host << ":" << m_ConnInfo.Port << "'.";
|
|
|
|
for (uint64_t attempt = 1; !m_Stopped; ++attempt) {
|
|
try {
|
|
decltype(m_Stream) stream;
|
|
if (m_ConnInfo.EnableTls) {
|
|
stream = Shared<AsioTlsStream>::Make(m_Strand.context(), *m_TlsContext, m_ConnInfo.Host);
|
|
} else {
|
|
stream = Shared<AsioTcpStream>::Make(m_Strand.context());
|
|
}
|
|
|
|
Timeout timeout{m_Strand, boost::posix_time::seconds(10), [this, stream] {
|
|
Log(LogCritical, "OTelExporter")
|
|
<< "Timeout while connecting to OpenTelemetry backend '" << m_ConnInfo.Host << ":" << m_ConnInfo.Port << "', cancelling attempt.";
|
|
|
|
boost::system::error_code ec;
|
|
std::visit([&ec](auto& s) { s->lowest_layer().cancel(ec); }, *stream);
|
|
}};
|
|
|
|
std::visit([this, &yc](auto& streamArg) {
|
|
icinga::Connect(streamArg->lowest_layer(), m_ConnInfo.Host, std::to_string(m_ConnInfo.Port), yc);
|
|
|
|
if constexpr (std::is_same_v<std::decay_t<decltype(streamArg)>, Shared<AsioTlsStream>::Ptr>) {
|
|
streamArg->next_layer().async_handshake(AsioTlsStream::next_layer_type::client, yc);
|
|
|
|
if (m_ConnInfo.VerifyPeerCertificate && !streamArg->next_layer().IsVerifyOK()) {
|
|
BOOST_THROW_EXCEPTION(std::runtime_error(
|
|
"TLS certificate validation failed: " + streamArg->next_layer().GetVerifyError()
|
|
));
|
|
}
|
|
}
|
|
}, *stream);
|
|
|
|
m_Stream = std::move(stream);
|
|
|
|
Log(LogInformation, "OTelExporter")
|
|
<< "Successfully connected to OpenTelemetry backend.";
|
|
return;
|
|
} catch (const std::exception& ex) {
|
|
Log(m_Stopped ? LogDebug : LogCritical, "OTelExporter")
|
|
<< "Cannot connect to OpenTelemetry backend '" << m_ConnInfo.Host << ":" << m_ConnInfo.Port
|
|
<< "' (attempt #" << attempt << "): " << ex.what();
|
|
|
|
if (!m_Stopped) {
|
|
boost::system::error_code ec;
|
|
m_RetryExportAndConnTimer.expires_after(Backoff(attempt));
|
|
m_RetryExportAndConnTimer.async_wait(yc[ec]);
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Main export loop for exporting OTel metrics to the configured backend.
|
|
*
|
|
* This method runs in a loop, waiting for new metrics to be available for export. In case of export failures,
|
|
* it retries the export as per OTel spec[^1] with exponential backoff until the export succeeds or the exporter
|
|
* is stopped. After a successful export, it clears the exported metrics from @c m_Request to make room for new metrics.
|
|
*
|
|
* @param yc The Asio yield context for asynchronous operations.
|
|
*
|
|
* [^1]: https://opentelemetry.io/docs/specs/otlp/#retryable-response-codes
|
|
*/
|
|
void OTel::ExportLoop(boost::asio::yield_context& yc)
|
|
{
|
|
Defer cleanup{[this] {
|
|
m_Request.reset();
|
|
m_ExportAsioCV.NotifyAll();
|
|
ResetExporting(true /* notify all */);
|
|
}};
|
|
|
|
namespace ch = std::chrono;
|
|
|
|
while (true) {
|
|
// Wait for a new export request to be available. If the exporter is stopped while waiting,
|
|
// we will be notified without a new request, so we also check the stopped state here to
|
|
// avoid waiting indefinitely in that case.
|
|
while (!m_Request && !m_Stopped) {
|
|
m_ExportAsioCV.Wait(yc);
|
|
}
|
|
|
|
if (m_Stopped) {
|
|
break;
|
|
}
|
|
|
|
if (!m_Stream) {
|
|
Connect(yc);
|
|
}
|
|
|
|
for (uint64_t attempt = 1; m_Stream && !m_Stopped; ++attempt) {
|
|
try {
|
|
ExportImpl(yc);
|
|
m_Request.reset();
|
|
m_ExportAsioCV.NotifyAll();
|
|
ResetExporting(false /* notify one */);
|
|
break;
|
|
} catch (const RetryableExportError& ex) {
|
|
ch::milliseconds retryAfter;
|
|
if (auto throttle = ex.Throttle(); throttle > 0ms) {
|
|
retryAfter = throttle;
|
|
} else {
|
|
retryAfter = Backoff(attempt);
|
|
}
|
|
|
|
Log(LogWarning, "OTelExporter")
|
|
<< "Failed to export metrics to OpenTelemetry backend (attempt #" << attempt << "). Retrying in "
|
|
<< retryAfter.count() << "ms.";
|
|
|
|
boost::system::error_code ec;
|
|
m_RetryExportAndConnTimer.expires_after(retryAfter);
|
|
m_RetryExportAndConnTimer.async_wait(yc[ec]);
|
|
} catch (const std::exception& ex) {
|
|
LogSeverity severity = LogCritical;
|
|
const auto* ser{dynamic_cast<const boost::system::system_error*>(&ex)};
|
|
// Since we don't have a proper connection health check mechanism, we assume that certain errors
|
|
// indicate a broken connection and force a reconnect in those cases. For the `end_of_stream` case,
|
|
// we downgrade the log severity to debug level since this is a normal occurrence when using an OTEL
|
|
// collector compatible backend that don't honor keep-alive connections (e.g., OpenSearch Data Prepper).
|
|
if (m_Stopped || (ser && ser->code() == http::error::end_of_stream)) {
|
|
severity = LogDebug;
|
|
}
|
|
Log{severity, "OTelExporter", DiagnosticInformation(ex, false)};
|
|
m_Stream.reset(); // Force reconnect on next export attempt.
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
 * Send the pending metrics request to the backend and evaluate the HTTP response.
 *
 * The request is serialized straight into the HTTP request body and the response is read afterwards.
 * The response is then handled as follows:
 *  - Content-Type other than "application/x-protobuf": a 200 OK is accepted silently, anything else
 *    is logged as a warning.
 *  - 2xx with a Protobuf body: the body is parsed and a reported partial success is logged as a warning.
 *    A body that cannot be parsed is logged as a warning as well; since it comes from a remote peer,
 *    it must not trip an assertion.
 *  - retryable status code: a RetryableExportError is thrown, carrying the value of a parsable
 *    'Retry-After' header (or zero).
 *  - any other status code: logged as a warning.
 *
 * NOTE(review): a retryable status (e.g. a 502 generated by a proxy) that comes with a non-Protobuf
 * body ends up in the first case and is therefore not retried - confirm that this is intended.
 *
 * @param yc The Asio yield context for asynchronous operations.
 *
 * @throws RetryableExportError if the backend responded with a retryable status code.
 * @throws std::runtime_error if the HTTP request body wasn't finalized by the serializer.
 */
void OTel::ExportImpl(boost::asio::yield_context& yc) const
{
	AsioProtobufOutStream outputS{*m_Stream, m_ConnInfo, yc};
	[[maybe_unused]] auto serialized = m_Request->SerializeToZeroCopyStream(&outputS);
	ASSERT(serialized);
	// Must have completed chunk writing successfully, otherwise reading the response will hang forever.
	if (!outputS.WriterDone()) {
		BOOST_THROW_EXCEPTION(std::runtime_error("BUG: Protobuf output stream writer did not complete successfully."));
	}

	IncomingHttpResponse responseMsg{*m_Stream};
	responseMsg.Parse(yc);

	if (auto ct = responseMsg[http::field::content_type]; ct != "application/x-protobuf") {
		if (responseMsg.result() == http::status::ok) {
			// Some OpenTelemetry Collector compatible backends (e.g., Prometheus OTLP Receiver) respond with 200 OK
			// but without the expected Protobuf content type. So, don't do anything here since the request succeeded.
			return;
		}
		Log(LogWarning, "OTelExporter")
			<< "Unexpected Content-Type from OpenTelemetry backend '" << ct << "' (" << responseMsg.reason() << "):\n"
			<< responseMsg.body();
	} else if (responseMsg.result_int() >= 200 && responseMsg.result_int() <= 299) {
		// We've got a valid Protobuf response, so we have to deserialize the body to check for partial success.
		// See https://opentelemetry.io/docs/specs/otlp/#partial-success-1.
		google::protobuf::Arena arena;
		auto* response = MetricsResponse::default_instance().New(&arena);

		// The body is remote input: a malformed one is reported instead of being asserted on. The
		// backend has accepted the request with a 2xx status, so the export itself still counts as done.
		if (!response->ParseFromString(responseMsg.body())) {
			Log(LogWarning, "OTelExporter")
				<< "Failed to parse the Protobuf response body received from OpenTelemetry backend.";
			return;
		}

		if (response->has_partial_success()) {
			const auto& ps = response->partial_success();
			const auto& msg = ps.error_message();
			if (ps.rejected_data_points() > 0 || !msg.empty()) {
				Log(LogWarning, "OTelExporter")
					<< "OpenTelemetry backend reported partial success: " << (msg.empty() ? "<none>" : msg)
					<< " (" << ps.rejected_data_points() << " metric data points rejected).";
			}
		}
	} else if (IsRetryableExportError(responseMsg.result())) {
		// Zero means "no usable Retry-After header", which makes the caller fall back to its own backoff.
		uint64_t throttleSeconds = 0;
		if (auto throttle = responseMsg[http::field::retry_after]; !throttle.empty()) {
			try {
				throttleSeconds = boost::lexical_cast<uint64_t>(throttle);
			} catch (const std::exception& ex) {
				Log(LogWarning, "OTelExporter")
					<< "Failed to parse 'Retry-After' header from OpenTelemetry backend response: " << ex.what();
			}
		}
		BOOST_THROW_EXCEPTION(RetryableExportError{throttleSeconds});
	} else {
		Log(LogWarning, "OTelExporter")
			<< "OpenTelemetry backend responded with non-success and non-retryable status code "
			<< responseMsg.result_int() << " (" << responseMsg.reason() << ").\n" << responseMsg.body();
	}
}
|
|
|
|
/**
|
|
* Reset the exporting state and notify waiters.
|
|
*
|
|
* This method resets the internal exporting state to indicate that no export is currently
|
|
* in progress. It then notifies either one or all waiters waiting for the export to complete,
|
|
* based on the @c notifyAll parameter.
|
|
*
|
|
* @param notifyAll If true, notifies all waiters; otherwise, notifies only one waiter.
|
|
*/
|
|
void OTel::ResetExporting(bool notifyAll)
|
|
{
|
|
{
|
|
std::lock_guard lock(m_Mutex);
|
|
m_Exporting = false;
|
|
}
|
|
if (notifyAll) {
|
|
m_ExportCV.notify_all();
|
|
} else {
|
|
m_ExportCV.notify_one();
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Validate the given OTel metric name according to OTel naming conventions[^1].
|
|
* Here's the ABNF definition for reference:
|
|
* @verbatim
|
|
* instrument-name = ALPHA 0*254 ("_" / "." / "-" / "/" / ALPHA / DIGIT)
|
|
* ALPHA = %x41-5A / %x61-7A; A-Z / a-z
|
|
* DIGIT = %x30-39 ; 0-9
|
|
* @endverbatim
|
|
*
|
|
* @param name The metric name to validate.
|
|
*
|
|
* @throws std::invalid_argument if the metric name is invalid.
|
|
*
|
|
* [^1]: https://opentelemetry.io/docs/specs/otel/metrics/api/#instrument-name-syntax
|
|
*/
|
|
void OTel::ValidateName(std::string_view name)
|
|
{
|
|
if (name.empty() || name.size() > 255) {
|
|
BOOST_THROW_EXCEPTION(std::invalid_argument("OTel instrument name must be between 1 and 255 characters long."));
|
|
}
|
|
|
|
auto isAlpha = [](char c) { return ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z'); };
|
|
auto isDigit = [](char c) { return '0' <= c && c <= '9'; };
|
|
for (std::size_t i = 0; i < name.size(); ++i) {
|
|
auto c = name[i];
|
|
if (i == 0 && !isAlpha(c)) {
|
|
BOOST_THROW_EXCEPTION(std::invalid_argument("OTel instrument name must start with an alphabetic character."));
|
|
}
|
|
if (!isAlpha(c) && !isDigit(c) && c != '_' && c != '.' && c != '-' && c != '/') {
|
|
BOOST_THROW_EXCEPTION(std::invalid_argument(
|
|
"OTel instrument name contains invalid character '" + std::string(1, c) + "'."
|
|
));
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Set the given OTel attribute key-value pair in the provided @c Attribute Protobuf object.
|
|
*
|
|
* This method sets the given key-value pair in the provided KeyValue Protobuf object according to
|
|
* OTel specifications[^1]. While the OTel specs[^2] allows a wider range of attr value types, we
|
|
* only support the most common/scalar types (Boolean, Number (double), and String) for simplicity.
|
|
*
|
|
* @param attr The OTel attribute Protobuf object to set the value for.
|
|
* @param key The attribute key to set. Must not be empty.
|
|
* @param value The Value object containing the value to set in the attribute.
|
|
*
|
|
* @throws std::invalid_argument if key is empty or if @c Value represents an unsupported attribute value type.
|
|
*
|
|
* [^1]: https://opentelemetry.io/docs/specs/otel/common/#attribute
|
|
* [^2]: https://opentelemetry.io/docs/specs/otel/common/#anyvalue
|
|
*/
|
|
template <typename Key, typename AttrVal, typename>
|
|
void OTel::SetAttribute(Attribute& attr, Key&& key, AttrVal&& value)
|
|
{
|
|
if (begin(key) == end(key)) {
|
|
BOOST_THROW_EXCEPTION(std::invalid_argument("OTel attribute key must not be empty."));
|
|
}
|
|
|
|
if constexpr (std::is_rvalue_reference_v<Key> && std::is_same_v<std::decay_t<Key>, String>) {
|
|
attr.set_key(std::move(key.GetData()));
|
|
} else {
|
|
attr.set_key(std::string{std::forward<Key>(key)});
|
|
}
|
|
|
|
constexpr bool isRvalReference = std::is_rvalue_reference_v<AttrVal>;
|
|
if constexpr (isRvalReference && std::is_same_v<std::decay_t<AttrVal>, String>) {
|
|
attr.mutable_value()->set_string_value(std::move(value.GetData()));
|
|
} else if constexpr (std::is_constructible_v<std::string, AttrVal>) {
|
|
attr.mutable_value()->set_string_value(std::string{std::forward<AttrVal>(value)});
|
|
} else {
|
|
switch (value.GetType()) {
|
|
case ValueBoolean:
|
|
attr.mutable_value()->set_bool_value(value.template Get<bool>());
|
|
break;
|
|
case ValueNumber:
|
|
attr.mutable_value()->set_double_value(value.template Get<double>());
|
|
break;
|
|
case ValueString:
|
|
if (isRvalReference) {
|
|
attr.mutable_value()->set_string_value(std::move(value.template Get<String>().GetData()));
|
|
} else {
|
|
attr.mutable_value()->set_string_value(value.template Get<String>().GetData());
|
|
}
|
|
break;
|
|
default:
|
|
BOOST_THROW_EXCEPTION(std::invalid_argument(
|
|
"OTel attribute value must be of type Boolean, Number, or String, got '" + value.GetTypeName() + "'."
|
|
));
|
|
}
|
|
}
|
|
}
|
|
|
|
/**
|
|
* Record a data point in the given OTel Gauge metric stream with the provided value, timestamps, and attributes.
|
|
*
|
|
* This method adds a new data point to the provided Gauge Protobuf object with the given value, start and end
|
|
* timestamps, and a set of attributes. The value can be either an int64_t or a double, depending on the type
|
|
* of the Gauge. The timestamps are expected to be in seconds and will be converted to nanoseconds as required
|
|
* by OTel specifications. The attributes are provided as a map of key-value pairs and will be set in the data
|
|
* point according to OTel attribute specs.
|
|
*
|
|
* @tparam T The type of the data point value, which must be either int64_t or double.
|
|
*
|
|
* @param gauge The Gauge Protobuf object to record the data point in.
|
|
* @param data The value of the data point to record.
|
|
* @param start The start timestamp of the data point in seconds.
|
|
* @param end The end timestamp of the data point in seconds.
|
|
* @param attrs A map of attribute key-value pairs to set in the data point.
|
|
*
|
|
* @return The size in bytes of the recorded data point after serialization.
|
|
*
|
|
* @throws std::invalid_argument if any attribute key is empty or has an unsupported value type.
|
|
*/
|
|
template<typename T, typename>
|
|
std::size_t OTel::Record(Gauge& gauge, T data, double start, double end, AttrsMap attrs)
|
|
{
|
|
namespace ch = std::chrono;
|
|
|
|
auto* dataPoint = gauge.add_data_points();
|
|
if constexpr (std::is_same_v<T, double>) {
|
|
dataPoint->set_as_double(data);
|
|
} else {
|
|
dataPoint->set_as_int(data);
|
|
}
|
|
|
|
dataPoint->set_start_time_unix_nano(
|
|
static_cast<uint64_t>(ch::duration_cast<ch::nanoseconds>(ch::duration<double>(start)).count())
|
|
);
|
|
dataPoint->set_time_unix_nano(
|
|
static_cast<uint64_t>(ch::duration_cast<ch::nanoseconds>(ch::duration<double>(end)).count())
|
|
);
|
|
|
|
while (!attrs.empty()) {
|
|
auto* attr = dataPoint->add_attributes();
|
|
auto node = attrs.extract(attrs.begin());
|
|
SetAttribute(*attr, std::move(node.key()), std::move(node.mapped()));
|
|
}
|
|
return dataPoint->ByteSizeLong();
|
|
}
|
|
|
|
/**
|
|
* Determine if the given HTTP status code represents a retryable export error as per OTel specs[^1].
|
|
*
|
|
* @param status The HTTP status code to check.
|
|
*
|
|
* @return true if the status code indicates a retryable error; false otherwise.
|
|
*
|
|
* [^1]: https://opentelemetry.io/docs/specs/otlp/#retryable-response-codes
|
|
*/
|
|
bool OTel::IsRetryableExportError(const http::status status)
|
|
{
|
|
return status == http::status::too_many_requests
|
|
|| status == http::status::bad_gateway
|
|
|| status == http::status::service_unavailable
|
|
|| status == http::status::gateway_timeout;
|
|
}
|
|
|
|
AsioProtobufOutStream::AsioProtobufOutStream(const AsioTlsOrTcpStream& stream, const OTelConnInfo& connInfo, boost::asio::yield_context yc)
|
|
: m_Writer{stream}, m_YieldContext{std::move(yc)}
|
|
{
|
|
m_Writer.method(http::verb::post);
|
|
m_Writer.target(connInfo.MetricsEndpoint);
|
|
m_Writer.set(http::field::host, connInfo.Host + ":" + std::to_string(connInfo.Port));
|
|
m_Writer.set(http::field::content_type, "application/x-protobuf");
|
|
if (!connInfo.BasicAuth.IsEmpty()) {
|
|
m_Writer.set(http::field::authorization, "Basic " + connInfo.BasicAuth);
|
|
}
|
|
m_Writer.StartStreaming();
|
|
}
|
|
|
|
bool AsioProtobufOutStream::Next(void** data, int* size)
|
|
{
|
|
if (m_Buffered == l_BufferSize) {
|
|
Flush();
|
|
}
|
|
// Prepare a new buffer segment that the Protobuf serializer can write into.
|
|
// The buffer size is fixed to l_BufferSize, and as seen above, we flush if the previous buffer
|
|
// segment was fully used (which is always the case on each Next call after the initial one), so
|
|
// we'll end up reusing the same memory region for each Next call because when we flush, we also
|
|
// consume the committed data, and that region becomes writable again.
|
|
auto buf = m_Writer.Prepare(l_BufferSize - m_Buffered);
|
|
*data = buf.data();
|
|
*size = static_cast<int>(l_BufferSize);
|
|
m_Buffered = l_BufferSize;
|
|
return true;
|
|
}
|
|
|
|
void AsioProtobufOutStream::BackUp(int count)
|
|
{
|
|
// Make sure we've not already finalized the HTTP body because BackUp
|
|
// is supposed to be called only after a preceding (final) Next call.
|
|
ASSERT(!m_Writer.Done());
|
|
ASSERT(static_cast<std::size_t>(count) <= m_Buffered);
|
|
ASSERT(m_Buffered == l_BufferSize);
|
|
// If the last prepared buffer segment was not fully used, we need to adjust the buffered size,
|
|
// so that we don't commit unused memory regions with the below Flush() call. If count is zero,
|
|
// this adjustment is a no-op, and indicates that the entire buffer was used and there won't be
|
|
// any subsequent Next calls anymore (i.e., the Protobuf serialization is complete).
|
|
m_Buffered -= count;
|
|
Flush(true);
|
|
}
|
|
|
|
int64_t AsioProtobufOutStream::ByteCount() const
|
|
{
|
|
return m_Pos + static_cast<int64_t>(m_Buffered);
|
|
}
|
|
|
|
/**
|
|
* Flush any buffered data to the underlying Asio stream.
|
|
*
|
|
* If the `finish` parameter is set to true, it indicates that no more data will
|
|
* be buffered/generated, and the HTTP body will be finalized accordingly.
|
|
*
|
|
* @param finish Whether this is the final flush operation.
|
|
*/
|
|
void AsioProtobufOutStream::Flush(bool finish)
|
|
{
|
|
ASSERT(m_Buffered > 0 || finish);
|
|
m_Writer.Commit(m_Buffered);
|
|
m_Writer.Flush(m_YieldContext, finish);
|
|
m_Pos += static_cast<int64_t>(m_Buffered);
|
|
m_Buffered = 0;
|
|
}
|
|
|
|
/**
|
|
* Check if the underlying HTTP request writer has completed writing.
|
|
*
|
|
* @return true if the writer has finished writing; false otherwise.
|
|
*/
|
|
bool AsioProtobufOutStream::WriterDone()
|
|
{
|
|
return m_Writer.Done();
|
|
}
|