From bd76df327ab21397dfdff5192e5b41df5ffcafad Mon Sep 17 00:00:00 2001 From: Andres Martinez Gotor Date: Tue, 10 Mar 2026 15:40:11 +0100 Subject: [PATCH] Advisor: Add metrics for check processing (#119923) --- apps/advisor/go.mod | 2 +- apps/advisor/pkg/app/app.go | 22 +++++ apps/advisor/pkg/app/metrics/metrics.go | 86 ++++++++++++++++++++ apps/advisor/pkg/app/metrics/metrics_test.go | 32 ++++++++ apps/advisor/pkg/app/utils.go | 2 + pkg/registry/apps/advisor/register.go | 5 +- pkg/server/wire_gen.go | 4 +- 7 files changed, 149 insertions(+), 4 deletions(-) create mode 100644 apps/advisor/pkg/app/metrics/metrics.go create mode 100644 apps/advisor/pkg/app/metrics/metrics_test.go diff --git a/apps/advisor/go.mod b/apps/advisor/go.mod index b2442981cdb..9e66f898230 100644 --- a/apps/advisor/go.mod +++ b/apps/advisor/go.mod @@ -13,6 +13,7 @@ require ( github.com/grafana/grafana-plugin-sdk-go v0.290.0 github.com/grafana/grafana/pkg/apimachinery v0.0.0 github.com/grafana/grafana/pkg/plugins v0.0.0 + github.com/prometheus/client_golang v1.23.2 github.com/stretchr/testify v1.11.1 k8s.io/apimachinery v0.35.1 k8s.io/apiserver v0.35.1 @@ -266,7 +267,6 @@ require ( github.com/pkg/errors v0.9.1 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/alertmanager v0.28.2 // indirect - github.com/prometheus/client_golang v1.23.2 // indirect github.com/prometheus/client_model v0.6.2 // indirect github.com/prometheus/common v0.67.5 // indirect github.com/prometheus/common/sigv4 v0.1.0 // indirect diff --git a/apps/advisor/pkg/app/app.go b/apps/advisor/pkg/app/app.go index 5ea140bc261..26b288eb211 100644 --- a/apps/advisor/pkg/app/app.go +++ b/apps/advisor/pkg/app/app.go @@ -5,12 +5,15 @@ import ( "encoding/json" "fmt" "net/http" + "time" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/runtime/schema" "k8s.io/apiserver/pkg/authorization/authorizer" "k8s.io/client-go/rest" + "github.com/prometheus/client_golang/prometheus" + "github.com/grafana/grafana-app-sdk/app" "github.com/grafana/grafana-app-sdk/k8s" appsdkapiserver "github.com/grafana/grafana-app-sdk/k8s/apiserver" @@ -24,6 +27,7 @@ import ( "github.com/grafana/grafana/apps/advisor/pkg/app/checks" "github.com/grafana/grafana/apps/advisor/pkg/app/checkscheduler" "github.com/grafana/grafana/apps/advisor/pkg/app/checktyperegisterer" + "github.com/grafana/grafana/apps/advisor/pkg/app/metrics" "github.com/grafana/grafana/pkg/apimachinery/identity" "github.com/grafana/grafana/pkg/services/org" "github.com/grafana/grafana/pkg/setting" @@ -93,32 +97,46 @@ func New(cfg app.Config) (app.App, error) { go func() { logger := log.WithContext(ctx).With("check", check.ID()) logger.Debug("Processing check", "namespace", req.Object.GetNamespace()) + start := time.Now() + checkType := check.ID() orgID, err := getOrgIDFromNamespace(req.Object.GetNamespace()) if err != nil { logger.Error("Error getting org ID from namespace", "error", err) + metrics.OrgIDErrorsTotal.Inc() return } ctx = identity.WithServiceIdentityContext(context.WithoutCancel(ctx), orgID) err = processCheck(ctx, logger, client, typesClient, req.Object, check) if err != nil { logger.Error("Error processing check", "error", err) + metrics.CheckProcessingTotal.WithLabelValues("process", "error", checkType).Inc() + } else { + metrics.CheckProcessingTotal.WithLabelValues("process", "success", checkType).Inc() } + metrics.CheckProcessingDurationSeconds.WithLabelValues("process", checkType).Observe(time.Since(start).Seconds()) }() } if req.Action == resource.AdmissionActionUpdate && retryAnnotationChanged(req.OldObject, req.Object) { go func() { logger := log.WithContext(ctx).With("check", check.ID()) logger.Debug("Updating check", "namespace", req.Object.GetNamespace(), "name", req.Object.GetName()) + start := time.Now() + checkType := check.ID() orgID, err := getOrgIDFromNamespace(req.Object.GetNamespace()) if err != nil { logger.Error("Error getting org ID from namespace", "error", err) + metrics.OrgIDErrorsTotal.Inc() return } ctx = identity.WithServiceIdentityContext(context.WithoutCancel(ctx), orgID) err = processCheckRetry(ctx, logger, client, typesClient, req.Object, check) if err != nil { logger.Error("Error processing check retry", "error", err) + metrics.CheckProcessingTotal.WithLabelValues("retry", "error", checkType).Inc() + } else { + metrics.CheckProcessingTotal.WithLabelValues("retry", "success", checkType).Inc() } + metrics.CheckProcessingDurationSeconds.WithLabelValues("retry", checkType).Observe(time.Since(start).Seconds()) }() } } @@ -144,11 +162,13 @@ func New(cfg app.Config) (app.App, error) { err := ctr.RegisterCheckTypesInNamespace(ctx, logger, namespace) if err != nil { logger.Error("Failed to register check types", "namespace", namespace, "error", err) + metrics.CheckRegistrationTotal.WithLabelValues("error").Inc() w.WriteHeader(http.StatusInternalServerError) _ = json.NewEncoder(w).Encode(map[string]string{"error": err.Error()}) return err } + metrics.CheckRegistrationTotal.WithLabelValues("success").Inc() // Return typed response matching the manifest return json.NewEncoder(w).Encode(advisorv0alpha1.CreateRegisterResponse{ TypeMeta: metav1.TypeMeta{ @@ -201,7 +221,9 @@ func ProvideAppInstaller( checkRegistry checkregistry.CheckService, cfg *setting.Cfg, orgService org.Service, + registerer prometheus.Registerer, ) (*AdvisorAppInstaller, error) { + metrics.MustRegister(registerer) provider := simple.NewAppProvider(advisorapi.LocalManifest(), nil, New) pluginConfig := cfg.PluginSettings["grafana-advisor-app"] specificConfig := checkregistry.AdvisorAppConfig{ diff --git a/apps/advisor/pkg/app/metrics/metrics.go b/apps/advisor/pkg/app/metrics/metrics.go new file mode 100644 index 00000000000..a0cf177b78a --- /dev/null +++ b/apps/advisor/pkg/app/metrics/metrics.go @@ -0,0 +1,86 @@ +package metrics + +import ( + "errors" + + "github.com/prometheus/client_golang/prometheus" +) + +const ( + namespace = "advisor_app" +) + +var ( + // CheckProcessingTotal counts check processing outcomes by operation, status, and check_type. + CheckProcessingTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Name: "check_processing_total", + Help: "Total number of check processing operations by operation, status, and check type", + }, + []string{"operation", "status", "check_type"}, + ) + + // CheckProcessingDurationSeconds is the duration of check processing operations. + CheckProcessingDurationSeconds = prometheus.NewHistogramVec( + prometheus.HistogramOpts{ + Namespace: namespace, + Name: "check_processing_duration_seconds", + Help: "Duration of check processing operations in seconds", + Buckets: prometheus.DefBuckets, + }, + []string{"operation", "check_type"}, + ) + + // CheckRegistrationTotal counts register check types endpoint outcomes. + CheckRegistrationTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Name: "check_registration_total", + Help: "Total number of check type registration operations", + }, + []string{"status"}, + ) + + // OrgIDErrorsTotal counts errors resolving org ID from namespace. + OrgIDErrorsTotal = prometheus.NewCounter( + prometheus.CounterOpts{ + Namespace: namespace, + Name: "org_id_errors_total", + Help: "Total number of errors resolving org ID from namespace", + }, + ) + + // StepPanicsTotal counts panics recovered in step execution. + StepPanicsTotal = prometheus.NewCounterVec( + prometheus.CounterOpts{ + Namespace: namespace, + Name: "step_panics_total", + Help: "Total number of panics recovered in check step execution", + }, + []string{"step_id"}, + ) +) + +// MustRegister registers all metrics with the given registerer. No-op if registerer is nil. +func MustRegister(registerer prometheus.Registerer) { + if registerer == nil { + return + } + metricsToRegister := []prometheus.Collector{ + CheckProcessingTotal, + CheckProcessingDurationSeconds, + CheckRegistrationTotal, + OrgIDErrorsTotal, + StepPanicsTotal, + } + for _, metric := range metricsToRegister { + if err := registerer.Register(metric); err != nil { + var alreadyRegistered prometheus.AlreadyRegisteredError + if errors.As(err, &alreadyRegistered) { + continue + } + panic(err) + } + } +} diff --git a/apps/advisor/pkg/app/metrics/metrics_test.go b/apps/advisor/pkg/app/metrics/metrics_test.go new file mode 100644 index 00000000000..5f1b21dc58d --- /dev/null +++ b/apps/advisor/pkg/app/metrics/metrics_test.go @@ -0,0 +1,32 @@ +package metrics + +import ( + "testing" + + "github.com/prometheus/client_golang/prometheus" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestRegister(t *testing.T) { + registry := prometheus.NewRegistry() + MustRegister(registry) + // Register with nil is a no-op + MustRegister(nil) +} + +func TestRegisterAndRecord(t *testing.T) { + registry := prometheus.NewRegistry() + MustRegister(registry) + + CheckProcessingTotal.WithLabelValues("process", "success", "datasource").Inc() + CheckProcessingTotal.WithLabelValues("process", "error", "plugin").Inc() + CheckProcessingDurationSeconds.WithLabelValues("process", "datasource").Observe(0.5) + CheckRegistrationTotal.WithLabelValues("success").Inc() + OrgIDErrorsTotal.Inc() + StepPanicsTotal.WithLabelValues("step-1").Inc() + + metrics, err := registry.Gather() + require.NoError(t, err) + assert.GreaterOrEqual(t, len(metrics), 5) +} diff --git a/apps/advisor/pkg/app/utils.go b/apps/advisor/pkg/app/utils.go index 9e88f9b5b11..e6da8647b09 100644 --- a/apps/advisor/pkg/app/utils.go +++ b/apps/advisor/pkg/app/utils.go @@ -14,6 +14,7 @@ import ( "github.com/grafana/grafana-app-sdk/resource" advisorv0alpha1 "github.com/grafana/grafana/apps/advisor/pkg/apis/advisor/v0alpha1" "github.com/grafana/grafana/apps/advisor/pkg/app/checks" + "github.com/grafana/grafana/apps/advisor/pkg/app/metrics" "github.com/grafana/grafana/pkg/services/contexthandler" k8serrors "k8s.io/apimachinery/pkg/api/errors" ) @@ -218,6 +219,7 @@ func runStepsInParallel(ctx context.Context, log logging.Logger, spec *advisorv0 defer func() { if r := recover(); r != nil { log.Error("panic recovered in step", "step", step.ID(), "error", r, "item", item) + metrics.StepPanicsTotal.WithLabelValues(step.ID()).Inc() } }() logger := log.With("step", step.ID()) diff --git a/pkg/registry/apps/advisor/register.go b/pkg/registry/apps/advisor/register.go index a0dd4993285..fbc5d8bf87b 100644 --- a/pkg/registry/apps/advisor/register.go +++ b/pkg/registry/apps/advisor/register.go @@ -5,6 +5,8 @@ import ( authlib "github.com/grafana/authlib/types" appsdkapiserver "github.com/grafana/grafana-app-sdk/k8s/apiserver" + "github.com/prometheus/client_golang/prometheus" + advisorapp "github.com/grafana/grafana/apps/advisor/pkg/app" "github.com/grafana/grafana/apps/advisor/pkg/app/checkregistry" "github.com/grafana/grafana/pkg/services/accesscontrol" @@ -29,13 +31,14 @@ func ProvideAppInstaller( checkRegistry checkregistry.CheckService, cfg *setting.Cfg, orgService org.Service, + registerer prometheus.Registerer, ) (*AppInstaller, error) { if err := registerAccessControlRoles(accessControlService); err != nil { return nil, fmt.Errorf("registering access control roles: %w", err) } authorizer := grafanaauthorizer.NewResourceAuthorizer(accessClient) - i, err := advisorapp.ProvideAppInstaller(authorizer, checkRegistry, cfg, orgService) + i, err := advisorapp.ProvideAppInstaller(authorizer, checkRegistry, cfg, orgService, registerer) if err != nil { return nil, err } diff --git a/pkg/server/wire_gen.go b/pkg/server/wire_gen.go index 9dfb69bba51..acb1e9a90a8 100644 --- a/pkg/server/wire_gen.go +++ b/pkg/server/wire_gen.go @@ -841,7 +841,7 @@ func Initialize(ctx context.Context, cfg *setting.Cfg, opts Options, apiOpts api return nil, err } checkregistryService := checkregistry.ProvideService(service15, pluginstoreService, plugincontextProvider, middlewareHandler, plugincheckerService, repoManager, preinstallImpl, managedpluginsNoop, noop, ssosettingsimplService, cfg, pluginerrsStore) - advisorAppInstaller, err := advisor2.ProvideAppInstaller(acimplService, accessClient, checkregistryService, cfg, orgService) + advisorAppInstaller, err := advisor2.ProvideAppInstaller(acimplService, accessClient, checkregistryService, cfg, orgService, registerer) if err != nil { return nil, err } @@ -1540,7 +1540,7 @@ func InitializeForTest(ctx context.Context, t sqlutil.ITestDB, testingT interfac return nil, err } checkregistryService := checkregistry.ProvideService(service15, pluginstoreService, plugincontextProvider, middlewareHandler, plugincheckerService, repoManager, preinstallImpl, managedpluginsNoop, noop, ssosettingsimplService, cfg, pluginerrsStore) - advisorAppInstaller, err := advisor2.ProvideAppInstaller(acimplService, accessClient, checkregistryService, cfg, orgService) + advisorAppInstaller, err := advisor2.ProvideAppInstaller(acimplService, accessClient, checkregistryService, cfg, orgService, registerer) if err != nil { return nil, err }