Advisor: Add metrics for check processing (#119923)

This commit is contained in:
Andres Martinez Gotor 2026-03-10 15:40:11 +01:00 committed by GitHub
parent beb32caa2c
commit bd76df327a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 149 additions and 4 deletions

View file

@ -13,6 +13,7 @@ require (
github.com/grafana/grafana-plugin-sdk-go v0.290.0
github.com/grafana/grafana/pkg/apimachinery v0.0.0
github.com/grafana/grafana/pkg/plugins v0.0.0
github.com/prometheus/client_golang v1.23.2
github.com/stretchr/testify v1.11.1
k8s.io/apimachinery v0.35.1
k8s.io/apiserver v0.35.1
@ -266,7 +267,6 @@ require (
github.com/pkg/errors v0.9.1 // indirect
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
github.com/prometheus/alertmanager v0.28.2 // indirect
github.com/prometheus/client_golang v1.23.2 // indirect
github.com/prometheus/client_model v0.6.2 // indirect
github.com/prometheus/common v0.67.5 // indirect
github.com/prometheus/common/sigv4 v0.1.0 // indirect

View file

@ -5,12 +5,15 @@ import (
"encoding/json"
"fmt"
"net/http"
"time"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime/schema"
"k8s.io/apiserver/pkg/authorization/authorizer"
"k8s.io/client-go/rest"
"github.com/prometheus/client_golang/prometheus"
"github.com/grafana/grafana-app-sdk/app"
"github.com/grafana/grafana-app-sdk/k8s"
appsdkapiserver "github.com/grafana/grafana-app-sdk/k8s/apiserver"
@ -24,6 +27,7 @@ import (
"github.com/grafana/grafana/apps/advisor/pkg/app/checks"
"github.com/grafana/grafana/apps/advisor/pkg/app/checkscheduler"
"github.com/grafana/grafana/apps/advisor/pkg/app/checktyperegisterer"
"github.com/grafana/grafana/apps/advisor/pkg/app/metrics"
"github.com/grafana/grafana/pkg/apimachinery/identity"
"github.com/grafana/grafana/pkg/services/org"
"github.com/grafana/grafana/pkg/setting"
@ -93,32 +97,46 @@ func New(cfg app.Config) (app.App, error) {
go func() {
logger := log.WithContext(ctx).With("check", check.ID())
logger.Debug("Processing check", "namespace", req.Object.GetNamespace())
start := time.Now()
checkType := check.ID()
orgID, err := getOrgIDFromNamespace(req.Object.GetNamespace())
if err != nil {
logger.Error("Error getting org ID from namespace", "error", err)
metrics.OrgIDErrorsTotal.Inc()
return
}
ctx = identity.WithServiceIdentityContext(context.WithoutCancel(ctx), orgID)
err = processCheck(ctx, logger, client, typesClient, req.Object, check)
if err != nil {
logger.Error("Error processing check", "error", err)
metrics.CheckProcessingTotal.WithLabelValues("process", "error", checkType).Inc()
} else {
metrics.CheckProcessingTotal.WithLabelValues("process", "success", checkType).Inc()
}
metrics.CheckProcessingDurationSeconds.WithLabelValues("process", checkType).Observe(time.Since(start).Seconds())
}()
}
if req.Action == resource.AdmissionActionUpdate && retryAnnotationChanged(req.OldObject, req.Object) {
go func() {
logger := log.WithContext(ctx).With("check", check.ID())
logger.Debug("Updating check", "namespace", req.Object.GetNamespace(), "name", req.Object.GetName())
start := time.Now()
checkType := check.ID()
orgID, err := getOrgIDFromNamespace(req.Object.GetNamespace())
if err != nil {
logger.Error("Error getting org ID from namespace", "error", err)
metrics.OrgIDErrorsTotal.Inc()
return
}
ctx = identity.WithServiceIdentityContext(context.WithoutCancel(ctx), orgID)
err = processCheckRetry(ctx, logger, client, typesClient, req.Object, check)
if err != nil {
logger.Error("Error processing check retry", "error", err)
metrics.CheckProcessingTotal.WithLabelValues("retry", "error", checkType).Inc()
} else {
metrics.CheckProcessingTotal.WithLabelValues("retry", "success", checkType).Inc()
}
metrics.CheckProcessingDurationSeconds.WithLabelValues("retry", checkType).Observe(time.Since(start).Seconds())
}()
}
}
@ -144,11 +162,13 @@ func New(cfg app.Config) (app.App, error) {
err := ctr.RegisterCheckTypesInNamespace(ctx, logger, namespace)
if err != nil {
logger.Error("Failed to register check types", "namespace", namespace, "error", err)
metrics.CheckRegistrationTotal.WithLabelValues("error").Inc()
w.WriteHeader(http.StatusInternalServerError)
_ = json.NewEncoder(w).Encode(map[string]string{"error": err.Error()})
return err
}
metrics.CheckRegistrationTotal.WithLabelValues("success").Inc()
// Return typed response matching the manifest
return json.NewEncoder(w).Encode(advisorv0alpha1.CreateRegisterResponse{
TypeMeta: metav1.TypeMeta{
@ -201,7 +221,9 @@ func ProvideAppInstaller(
checkRegistry checkregistry.CheckService,
cfg *setting.Cfg,
orgService org.Service,
registerer prometheus.Registerer,
) (*AdvisorAppInstaller, error) {
metrics.MustRegister(registerer)
provider := simple.NewAppProvider(advisorapi.LocalManifest(), nil, New)
pluginConfig := cfg.PluginSettings["grafana-advisor-app"]
specificConfig := checkregistry.AdvisorAppConfig{

View file

@ -0,0 +1,86 @@
package metrics
import (
"errors"
"github.com/prometheus/client_golang/prometheus"
)
// namespace is the value used as the Namespace option of every metric
// declared in this package.
const namespace = "advisor_app"
// CheckProcessingTotal is a counter vector for check processing operations,
// partitioned by the "operation", "status" and "check_type" labels.
var CheckProcessingTotal = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Namespace: namespace,
		Name:      "check_processing_total",
		Help:      "Total number of check processing operations by operation, status, and check type",
	},
	[]string{"operation", "status", "check_type"},
)

// CheckProcessingDurationSeconds is a histogram vector of check processing
// durations in seconds, partitioned by the "operation" and "check_type"
// labels. It uses the Prometheus client's default bucket boundaries.
var CheckProcessingDurationSeconds = prometheus.NewHistogramVec(
	prometheus.HistogramOpts{
		Namespace: namespace,
		Name:      "check_processing_duration_seconds",
		Help:      "Duration of check processing operations in seconds",
		Buckets:   prometheus.DefBuckets,
	},
	[]string{"operation", "check_type"},
)

// CheckRegistrationTotal is a counter vector for check type registration
// operations, partitioned by the "status" label.
var CheckRegistrationTotal = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Namespace: namespace,
		Name:      "check_registration_total",
		Help:      "Total number of check type registration operations",
	},
	[]string{"status"},
)

// OrgIDErrorsTotal is an unlabelled counter of failures to resolve an org ID
// from a namespace.
var OrgIDErrorsTotal = prometheus.NewCounter(
	prometheus.CounterOpts{
		Namespace: namespace,
		Name:      "org_id_errors_total",
		Help:      "Total number of errors resolving org ID from namespace",
	},
)

// StepPanicsTotal is a counter vector of panics recovered while executing
// check steps, partitioned by the "step_id" label.
var StepPanicsTotal = prometheus.NewCounterVec(
	prometheus.CounterOpts{
		Namespace: namespace,
		Name:      "step_panics_total",
		Help:      "Total number of panics recovered in check step execution",
	},
	[]string{"step_id"},
)
// MustRegister adds every metric collector declared in this package to
// registerer.
//
// A nil registerer is accepted and nothing is registered. A collector that
// the registerer reports as already registered is skipped. Any other
// registration error causes a panic.
func MustRegister(registerer prometheus.Registerer) {
	if registerer == nil {
		return
	}

	for _, collector := range []prometheus.Collector{
		CheckProcessingTotal,
		CheckProcessingDurationSeconds,
		CheckRegistrationTotal,
		OrgIDErrorsTotal,
		StepPanicsTotal,
	} {
		err := registerer.Register(collector)
		if err == nil {
			continue
		}

		// Only a duplicate registration is tolerated; everything else is fatal.
		var duplicate prometheus.AlreadyRegisteredError
		if !errors.As(err, &duplicate) {
			panic(err)
		}
	}
}

View file

@ -0,0 +1,32 @@
package metrics
import (
"testing"
"github.com/prometheus/client_golang/prometheus"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)
// TestRegister verifies that MustRegister does not panic when given a fresh
// registry, when called again with a registry that already holds the
// collectors, or when given a nil registerer.
func TestRegister(t *testing.T) {
	registry := prometheus.NewRegistry()

	require.NotPanics(t, func() { MustRegister(registry) })

	// A second call with the same registry takes the already-registered
	// path, which MustRegister skips instead of panicking.
	require.NotPanics(t, func() { MustRegister(registry) })

	// Register with nil is a no-op
	require.NotPanics(t, func() { MustRegister(nil) })
}
// TestRegisterAndRecord registers the package collectors on a fresh registry,
// records a sample on each of them, and checks that gathering succeeds and
// yields at least five metric families.
func TestRegisterAndRecord(t *testing.T) {
	reg := prometheus.NewRegistry()
	MustRegister(reg)

	// Two distinct label combinations on the processing counter.
	for _, labels := range [][]string{
		{"process", "success", "datasource"},
		{"process", "error", "plugin"},
	} {
		CheckProcessingTotal.WithLabelValues(labels...).Inc()
	}

	// One sample on each of the remaining collectors.
	CheckProcessingDurationSeconds.WithLabelValues("process", "datasource").Observe(0.5)
	CheckRegistrationTotal.WithLabelValues("success").Inc()
	OrgIDErrorsTotal.Inc()
	StepPanicsTotal.WithLabelValues("step-1").Inc()

	families, err := reg.Gather()
	require.NoError(t, err)
	assert.GreaterOrEqual(t, len(families), 5)
}

View file

@ -14,6 +14,7 @@ import (
"github.com/grafana/grafana-app-sdk/resource"
advisorv0alpha1 "github.com/grafana/grafana/apps/advisor/pkg/apis/advisor/v0alpha1"
"github.com/grafana/grafana/apps/advisor/pkg/app/checks"
"github.com/grafana/grafana/apps/advisor/pkg/app/metrics"
"github.com/grafana/grafana/pkg/services/contexthandler"
k8serrors "k8s.io/apimachinery/pkg/api/errors"
)
@ -218,6 +219,7 @@ func runStepsInParallel(ctx context.Context, log logging.Logger, spec *advisorv0
defer func() {
if r := recover(); r != nil {
log.Error("panic recovered in step", "step", step.ID(), "error", r, "item", item)
metrics.StepPanicsTotal.WithLabelValues(step.ID()).Inc()
}
}()
logger := log.With("step", step.ID())

View file

@ -5,6 +5,8 @@ import (
authlib "github.com/grafana/authlib/types"
appsdkapiserver "github.com/grafana/grafana-app-sdk/k8s/apiserver"
"github.com/prometheus/client_golang/prometheus"
advisorapp "github.com/grafana/grafana/apps/advisor/pkg/app"
"github.com/grafana/grafana/apps/advisor/pkg/app/checkregistry"
"github.com/grafana/grafana/pkg/services/accesscontrol"
@ -29,13 +31,14 @@ func ProvideAppInstaller(
checkRegistry checkregistry.CheckService,
cfg *setting.Cfg,
orgService org.Service,
registerer prometheus.Registerer,
) (*AppInstaller, error) {
if err := registerAccessControlRoles(accessControlService); err != nil {
return nil, fmt.Errorf("registering access control roles: %w", err)
}
authorizer := grafanaauthorizer.NewResourceAuthorizer(accessClient)
i, err := advisorapp.ProvideAppInstaller(authorizer, checkRegistry, cfg, orgService)
i, err := advisorapp.ProvideAppInstaller(authorizer, checkRegistry, cfg, orgService, registerer)
if err != nil {
return nil, err
}

View file

@ -841,7 +841,7 @@ func Initialize(ctx context.Context, cfg *setting.Cfg, opts Options, apiOpts api
return nil, err
}
checkregistryService := checkregistry.ProvideService(service15, pluginstoreService, plugincontextProvider, middlewareHandler, plugincheckerService, repoManager, preinstallImpl, managedpluginsNoop, noop, ssosettingsimplService, cfg, pluginerrsStore)
advisorAppInstaller, err := advisor2.ProvideAppInstaller(acimplService, accessClient, checkregistryService, cfg, orgService)
advisorAppInstaller, err := advisor2.ProvideAppInstaller(acimplService, accessClient, checkregistryService, cfg, orgService, registerer)
if err != nil {
return nil, err
}
@ -1540,7 +1540,7 @@ func InitializeForTest(ctx context.Context, t sqlutil.ITestDB, testingT interfac
return nil, err
}
checkregistryService := checkregistry.ProvideService(service15, pluginstoreService, plugincontextProvider, middlewareHandler, plugincheckerService, repoManager, preinstallImpl, managedpluginsNoop, noop, ssosettingsimplService, cfg, pluginerrsStore)
advisorAppInstaller, err := advisor2.ProvideAppInstaller(acimplService, accessClient, checkregistryService, cfg, orgService)
advisorAppInstaller, err := advisor2.ProvideAppInstaller(acimplService, accessClient, checkregistryService, cfg, orgService, registerer)
if err != nil {
return nil, err
}