mirror of
https://github.com/grafana/grafana.git
synced 2026-06-11 09:31:50 -04:00
Advisor: Add metrics for check processing (#119923)
This commit is contained in:
parent
beb32caa2c
commit
bd76df327a
7 changed files with 149 additions and 4 deletions
|
|
@ -13,6 +13,7 @@ require (
|
|||
github.com/grafana/grafana-plugin-sdk-go v0.290.0
|
||||
github.com/grafana/grafana/pkg/apimachinery v0.0.0
|
||||
github.com/grafana/grafana/pkg/plugins v0.0.0
|
||||
github.com/prometheus/client_golang v1.23.2
|
||||
github.com/stretchr/testify v1.11.1
|
||||
k8s.io/apimachinery v0.35.1
|
||||
k8s.io/apiserver v0.35.1
|
||||
|
|
@ -266,7 +267,6 @@ require (
|
|||
github.com/pkg/errors v0.9.1 // indirect
|
||||
github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect
|
||||
github.com/prometheus/alertmanager v0.28.2 // indirect
|
||||
github.com/prometheus/client_golang v1.23.2 // indirect
|
||||
github.com/prometheus/client_model v0.6.2 // indirect
|
||||
github.com/prometheus/common v0.67.5 // indirect
|
||||
github.com/prometheus/common/sigv4 v0.1.0 // indirect
|
||||
|
|
|
|||
|
|
@ -5,12 +5,15 @@ import (
|
|||
"encoding/json"
|
||||
"fmt"
|
||||
"net/http"
|
||||
"time"
|
||||
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/runtime/schema"
|
||||
"k8s.io/apiserver/pkg/authorization/authorizer"
|
||||
"k8s.io/client-go/rest"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
|
||||
"github.com/grafana/grafana-app-sdk/app"
|
||||
"github.com/grafana/grafana-app-sdk/k8s"
|
||||
appsdkapiserver "github.com/grafana/grafana-app-sdk/k8s/apiserver"
|
||||
|
|
@ -24,6 +27,7 @@ import (
|
|||
"github.com/grafana/grafana/apps/advisor/pkg/app/checks"
|
||||
"github.com/grafana/grafana/apps/advisor/pkg/app/checkscheduler"
|
||||
"github.com/grafana/grafana/apps/advisor/pkg/app/checktyperegisterer"
|
||||
"github.com/grafana/grafana/apps/advisor/pkg/app/metrics"
|
||||
"github.com/grafana/grafana/pkg/apimachinery/identity"
|
||||
"github.com/grafana/grafana/pkg/services/org"
|
||||
"github.com/grafana/grafana/pkg/setting"
|
||||
|
|
@ -93,32 +97,46 @@ func New(cfg app.Config) (app.App, error) {
|
|||
go func() {
|
||||
logger := log.WithContext(ctx).With("check", check.ID())
|
||||
logger.Debug("Processing check", "namespace", req.Object.GetNamespace())
|
||||
start := time.Now()
|
||||
checkType := check.ID()
|
||||
orgID, err := getOrgIDFromNamespace(req.Object.GetNamespace())
|
||||
if err != nil {
|
||||
logger.Error("Error getting org ID from namespace", "error", err)
|
||||
metrics.OrgIDErrorsTotal.Inc()
|
||||
return
|
||||
}
|
||||
ctx = identity.WithServiceIdentityContext(context.WithoutCancel(ctx), orgID)
|
||||
err = processCheck(ctx, logger, client, typesClient, req.Object, check)
|
||||
if err != nil {
|
||||
logger.Error("Error processing check", "error", err)
|
||||
metrics.CheckProcessingTotal.WithLabelValues("process", "error", checkType).Inc()
|
||||
} else {
|
||||
metrics.CheckProcessingTotal.WithLabelValues("process", "success", checkType).Inc()
|
||||
}
|
||||
metrics.CheckProcessingDurationSeconds.WithLabelValues("process", checkType).Observe(time.Since(start).Seconds())
|
||||
}()
|
||||
}
|
||||
if req.Action == resource.AdmissionActionUpdate && retryAnnotationChanged(req.OldObject, req.Object) {
|
||||
go func() {
|
||||
logger := log.WithContext(ctx).With("check", check.ID())
|
||||
logger.Debug("Updating check", "namespace", req.Object.GetNamespace(), "name", req.Object.GetName())
|
||||
start := time.Now()
|
||||
checkType := check.ID()
|
||||
orgID, err := getOrgIDFromNamespace(req.Object.GetNamespace())
|
||||
if err != nil {
|
||||
logger.Error("Error getting org ID from namespace", "error", err)
|
||||
metrics.OrgIDErrorsTotal.Inc()
|
||||
return
|
||||
}
|
||||
ctx = identity.WithServiceIdentityContext(context.WithoutCancel(ctx), orgID)
|
||||
err = processCheckRetry(ctx, logger, client, typesClient, req.Object, check)
|
||||
if err != nil {
|
||||
logger.Error("Error processing check retry", "error", err)
|
||||
metrics.CheckProcessingTotal.WithLabelValues("retry", "error", checkType).Inc()
|
||||
} else {
|
||||
metrics.CheckProcessingTotal.WithLabelValues("retry", "success", checkType).Inc()
|
||||
}
|
||||
metrics.CheckProcessingDurationSeconds.WithLabelValues("retry", checkType).Observe(time.Since(start).Seconds())
|
||||
}()
|
||||
}
|
||||
}
|
||||
|
|
@ -144,11 +162,13 @@ func New(cfg app.Config) (app.App, error) {
|
|||
err := ctr.RegisterCheckTypesInNamespace(ctx, logger, namespace)
|
||||
if err != nil {
|
||||
logger.Error("Failed to register check types", "namespace", namespace, "error", err)
|
||||
metrics.CheckRegistrationTotal.WithLabelValues("error").Inc()
|
||||
w.WriteHeader(http.StatusInternalServerError)
|
||||
_ = json.NewEncoder(w).Encode(map[string]string{"error": err.Error()})
|
||||
return err
|
||||
}
|
||||
|
||||
metrics.CheckRegistrationTotal.WithLabelValues("success").Inc()
|
||||
// Return typed response matching the manifest
|
||||
return json.NewEncoder(w).Encode(advisorv0alpha1.CreateRegisterResponse{
|
||||
TypeMeta: metav1.TypeMeta{
|
||||
|
|
@ -201,7 +221,9 @@ func ProvideAppInstaller(
|
|||
checkRegistry checkregistry.CheckService,
|
||||
cfg *setting.Cfg,
|
||||
orgService org.Service,
|
||||
registerer prometheus.Registerer,
|
||||
) (*AdvisorAppInstaller, error) {
|
||||
metrics.MustRegister(registerer)
|
||||
provider := simple.NewAppProvider(advisorapi.LocalManifest(), nil, New)
|
||||
pluginConfig := cfg.PluginSettings["grafana-advisor-app"]
|
||||
specificConfig := checkregistry.AdvisorAppConfig{
|
||||
|
|
|
|||
86
apps/advisor/pkg/app/metrics/metrics.go
Normal file
86
apps/advisor/pkg/app/metrics/metrics.go
Normal file
|
|
@ -0,0 +1,86 @@
|
|||
package metrics
|
||||
|
||||
import (
|
||||
"errors"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
)
|
||||
|
||||
const (
|
||||
namespace = "advisor_app"
|
||||
)
|
||||
|
||||
var (
|
||||
// CheckProcessingTotal counts check processing outcomes by operation, status, and check_type.
|
||||
CheckProcessingTotal = prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: namespace,
|
||||
Name: "check_processing_total",
|
||||
Help: "Total number of check processing operations by operation, status, and check type",
|
||||
},
|
||||
[]string{"operation", "status", "check_type"},
|
||||
)
|
||||
|
||||
// CheckProcessingDurationSeconds is the duration of check processing operations.
|
||||
CheckProcessingDurationSeconds = prometheus.NewHistogramVec(
|
||||
prometheus.HistogramOpts{
|
||||
Namespace: namespace,
|
||||
Name: "check_processing_duration_seconds",
|
||||
Help: "Duration of check processing operations in seconds",
|
||||
Buckets: prometheus.DefBuckets,
|
||||
},
|
||||
[]string{"operation", "check_type"},
|
||||
)
|
||||
|
||||
// CheckRegistrationTotal counts register check types endpoint outcomes.
|
||||
CheckRegistrationTotal = prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: namespace,
|
||||
Name: "check_registration_total",
|
||||
Help: "Total number of check type registration operations",
|
||||
},
|
||||
[]string{"status"},
|
||||
)
|
||||
|
||||
// OrgIDErrorsTotal counts errors resolving org ID from namespace.
|
||||
OrgIDErrorsTotal = prometheus.NewCounter(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: namespace,
|
||||
Name: "org_id_errors_total",
|
||||
Help: "Total number of errors resolving org ID from namespace",
|
||||
},
|
||||
)
|
||||
|
||||
// StepPanicsTotal counts panics recovered in step execution.
|
||||
StepPanicsTotal = prometheus.NewCounterVec(
|
||||
prometheus.CounterOpts{
|
||||
Namespace: namespace,
|
||||
Name: "step_panics_total",
|
||||
Help: "Total number of panics recovered in check step execution",
|
||||
},
|
||||
[]string{"step_id"},
|
||||
)
|
||||
)
|
||||
|
||||
// MustRegister registers all metrics with the given registerer. No-op if registerer is nil.
|
||||
func MustRegister(registerer prometheus.Registerer) {
|
||||
if registerer == nil {
|
||||
return
|
||||
}
|
||||
metricsToRegister := []prometheus.Collector{
|
||||
CheckProcessingTotal,
|
||||
CheckProcessingDurationSeconds,
|
||||
CheckRegistrationTotal,
|
||||
OrgIDErrorsTotal,
|
||||
StepPanicsTotal,
|
||||
}
|
||||
for _, metric := range metricsToRegister {
|
||||
if err := registerer.Register(metric); err != nil {
|
||||
var alreadyRegistered prometheus.AlreadyRegisteredError
|
||||
if errors.As(err, &alreadyRegistered) {
|
||||
continue
|
||||
}
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
}
|
||||
32
apps/advisor/pkg/app/metrics/metrics_test.go
Normal file
32
apps/advisor/pkg/app/metrics/metrics_test.go
Normal file
|
|
@ -0,0 +1,32 @@
|
|||
package metrics
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
)
|
||||
|
||||
func TestRegister(t *testing.T) {
|
||||
registry := prometheus.NewRegistry()
|
||||
MustRegister(registry)
|
||||
// Register with nil is a no-op
|
||||
MustRegister(nil)
|
||||
}
|
||||
|
||||
func TestRegisterAndRecord(t *testing.T) {
|
||||
registry := prometheus.NewRegistry()
|
||||
MustRegister(registry)
|
||||
|
||||
CheckProcessingTotal.WithLabelValues("process", "success", "datasource").Inc()
|
||||
CheckProcessingTotal.WithLabelValues("process", "error", "plugin").Inc()
|
||||
CheckProcessingDurationSeconds.WithLabelValues("process", "datasource").Observe(0.5)
|
||||
CheckRegistrationTotal.WithLabelValues("success").Inc()
|
||||
OrgIDErrorsTotal.Inc()
|
||||
StepPanicsTotal.WithLabelValues("step-1").Inc()
|
||||
|
||||
metrics, err := registry.Gather()
|
||||
require.NoError(t, err)
|
||||
assert.GreaterOrEqual(t, len(metrics), 5)
|
||||
}
|
||||
|
|
@ -14,6 +14,7 @@ import (
|
|||
"github.com/grafana/grafana-app-sdk/resource"
|
||||
advisorv0alpha1 "github.com/grafana/grafana/apps/advisor/pkg/apis/advisor/v0alpha1"
|
||||
"github.com/grafana/grafana/apps/advisor/pkg/app/checks"
|
||||
"github.com/grafana/grafana/apps/advisor/pkg/app/metrics"
|
||||
"github.com/grafana/grafana/pkg/services/contexthandler"
|
||||
k8serrors "k8s.io/apimachinery/pkg/api/errors"
|
||||
)
|
||||
|
|
@ -218,6 +219,7 @@ func runStepsInParallel(ctx context.Context, log logging.Logger, spec *advisorv0
|
|||
defer func() {
|
||||
if r := recover(); r != nil {
|
||||
log.Error("panic recovered in step", "step", step.ID(), "error", r, "item", item)
|
||||
metrics.StepPanicsTotal.WithLabelValues(step.ID()).Inc()
|
||||
}
|
||||
}()
|
||||
logger := log.With("step", step.ID())
|
||||
|
|
|
|||
|
|
@ -5,6 +5,8 @@ import (
|
|||
|
||||
authlib "github.com/grafana/authlib/types"
|
||||
appsdkapiserver "github.com/grafana/grafana-app-sdk/k8s/apiserver"
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
|
||||
advisorapp "github.com/grafana/grafana/apps/advisor/pkg/app"
|
||||
"github.com/grafana/grafana/apps/advisor/pkg/app/checkregistry"
|
||||
"github.com/grafana/grafana/pkg/services/accesscontrol"
|
||||
|
|
@ -29,13 +31,14 @@ func ProvideAppInstaller(
|
|||
checkRegistry checkregistry.CheckService,
|
||||
cfg *setting.Cfg,
|
||||
orgService org.Service,
|
||||
registerer prometheus.Registerer,
|
||||
) (*AppInstaller, error) {
|
||||
if err := registerAccessControlRoles(accessControlService); err != nil {
|
||||
return nil, fmt.Errorf("registering access control roles: %w", err)
|
||||
}
|
||||
|
||||
authorizer := grafanaauthorizer.NewResourceAuthorizer(accessClient)
|
||||
i, err := advisorapp.ProvideAppInstaller(authorizer, checkRegistry, cfg, orgService)
|
||||
i, err := advisorapp.ProvideAppInstaller(authorizer, checkRegistry, cfg, orgService, registerer)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
|
|
|||
4
pkg/server/wire_gen.go
generated
4
pkg/server/wire_gen.go
generated
|
|
@ -841,7 +841,7 @@ func Initialize(ctx context.Context, cfg *setting.Cfg, opts Options, apiOpts api
|
|||
return nil, err
|
||||
}
|
||||
checkregistryService := checkregistry.ProvideService(service15, pluginstoreService, plugincontextProvider, middlewareHandler, plugincheckerService, repoManager, preinstallImpl, managedpluginsNoop, noop, ssosettingsimplService, cfg, pluginerrsStore)
|
||||
advisorAppInstaller, err := advisor2.ProvideAppInstaller(acimplService, accessClient, checkregistryService, cfg, orgService)
|
||||
advisorAppInstaller, err := advisor2.ProvideAppInstaller(acimplService, accessClient, checkregistryService, cfg, orgService, registerer)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
|
@ -1540,7 +1540,7 @@ func InitializeForTest(ctx context.Context, t sqlutil.ITestDB, testingT interfac
|
|||
return nil, err
|
||||
}
|
||||
checkregistryService := checkregistry.ProvideService(service15, pluginstoreService, plugincontextProvider, middlewareHandler, plugincheckerService, repoManager, preinstallImpl, managedpluginsNoop, noop, ssosettingsimplService, cfg, pluginerrsStore)
|
||||
advisorAppInstaller, err := advisor2.ProvideAppInstaller(acimplService, accessClient, checkregistryService, cfg, orgService)
|
||||
advisorAppInstaller, err := advisor2.ProvideAppInstaller(acimplService, accessClient, checkregistryService, cfg, orgService, registerer)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue