diff --git a/hack/tools/instrumentation/endpoint-mappings.yaml b/hack/tools/instrumentation/endpoint-mappings.yaml index bbf1edac4e0..7567dc025b9 100644 --- a/hack/tools/instrumentation/endpoint-mappings.yaml +++ b/hack/tools/instrumentation/endpoint-mappings.yaml @@ -21,10 +21,12 @@ coreComponents: - "cmd/kube-controller-manager/" - "pkg/controller/" - "staging/src/k8s.io/controller-manager/" + - "staging/src/k8s.io/dynamic-resource-allocation/resourceclaim/" - "staging/src/k8s.io/endpointslice/" kube-scheduler: - "cmd/kube-scheduler/" - "pkg/scheduler/" + - "staging/src/k8s.io/dynamic-resource-allocation/resourceclaim/" - "staging/src/k8s.io/kube-scheduler/" kube-proxy: - "cmd/kube-proxy/" diff --git a/hack/tools/instrumentation/endpoint_mapping.go b/hack/tools/instrumentation/endpoint_mapping.go index b1eb9740203..816f5121a92 100644 --- a/hack/tools/instrumentation/endpoint_mapping.go +++ b/hack/tools/instrumentation/endpoint_mapping.go @@ -18,6 +18,7 @@ package main import ( "os" + "slices" "sort" "strings" @@ -59,26 +60,24 @@ func (c *endpointMappingConfig) inferComponentEndpoints(filePath string) []metri endpoint := c.inferEndpoint(filePath) if c.isSharedPath(filePath) { + // The assumption here is that none of the standalone components + // use the metrics under the path. return c.allCoreComponentEndpoints(endpoint) } - component := c.inferComponent(filePath, c.CoreComponents) - if component != "" { - return []metric.ComponentEndpoint{{ + // Core and standalone components may explicitly share the same metrics through their path patterns. + components := c.inferComponents(filePath, c.CoreComponents) + components = append(components, c.inferComponents(filePath, c.StandaloneComponents)...) 
+ + var endpoints []metric.ComponentEndpoint + for _, component := range components { + endpoints = append(endpoints, metric.ComponentEndpoint{ Component: component, Endpoint: endpoint, - }} + }) } - component = c.inferComponent(filePath, c.StandaloneComponents) - if component != "" { - return []metric.ComponentEndpoint{{ - Component: component, - Endpoint: endpoint, - }} - } - - return nil + return endpoints } func (c *endpointMappingConfig) isSharedPath(filePath string) bool { @@ -90,23 +89,22 @@ func (c *endpointMappingConfig) isSharedPath(filePath string) bool { return false } -func (c *endpointMappingConfig) inferComponent(filePath string, components map[string][]string) string { - // Sort component names for deterministic iteration order - componentNames := make([]string, 0, len(components)) - for name := range components { - componentNames = append(componentNames, name) - } - sort.Strings(componentNames) - - for _, component := range componentNames { - patterns := components[component] +// inferComponents returns the names of all components that have at least one +// path pattern contained in filePath. Each component is listed at most once. +func (c *endpointMappingConfig) inferComponents(filePath string, components map[string][]string) []string { + var matchingComponents []string + for component, patterns := range components { for _, pattern := range patterns { if strings.Contains(filePath, pattern) { - return component + matchingComponents = append(matchingComponents, component) + // One matching pattern is enough; stop so the component is not added twice. + break } } } - return "" + // Sort to ensure consistent result, regardless of map iteration order. 
+ slices.Sort(matchingComponents) + return matchingComponents } func (c *endpointMappingConfig) inferEndpoint(filePath string) string { diff --git a/pkg/controller/resourceclaim/controller.go b/pkg/controller/resourceclaim/controller.go index 2cefc2fe8ac..47a3d883872 100644 --- a/pkg/controller/resourceclaim/controller.go +++ b/pkg/controller/resourceclaim/controller.go @@ -52,9 +52,10 @@ import ( "k8s.io/client-go/util/workqueue" "k8s.io/component-base/metrics" "k8s.io/dynamic-resource-allocation/resourceclaim" + resourceclaimmetrics "k8s.io/dynamic-resource-allocation/resourceclaim/metrics" "k8s.io/klog/v2" podutil "k8s.io/kubernetes/pkg/api/v1/pod" - resourceclaimmetrics "k8s.io/kubernetes/pkg/controller/resourceclaim/metrics" + controllermetrics "k8s.io/kubernetes/pkg/controller/resourceclaim/metrics" "k8s.io/utils/ptr" ) @@ -187,7 +188,8 @@ func NewController( deletedObjects: newUIDCache(maxUIDCacheEntries), } - resourceclaimmetrics.RegisterMetrics(newCustomCollector(ec.claimLister, getAdminAccessMetricLabel, logger)) + resourceclaimmetrics.RegisterMetrics() + controllermetrics.RegisterMetrics(newCustomCollector(ec.claimLister, getAdminAccessMetricLabel, logger)) if _, err := podInformer.Informer().AddEventHandlerWithOptions(cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { @@ -1768,11 +1770,11 @@ type customCollector struct { var _ metrics.StableCollector = &customCollector{} func (collector *customCollector) DescribeWithStability(ch chan<- *metrics.Desc) { - ch <- resourceclaimmetrics.NumResourceClaimsDesc + ch <- controllermetrics.NumResourceClaimsDesc } func (collector *customCollector) CollectWithStability(ch chan<- metrics.Metric) { - rcMetrics := make(map[resourceclaimmetrics.NumResourceClaimLabels]int) + rcMetrics := make(map[controllermetrics.NumResourceClaimLabels]int) rcList, err := collector.rcLister.List(labels.Everything()) if err != nil { collector.logger.Error(err, "failed to list resource claims for metrics collection") @@ 
-1791,11 +1793,11 @@ func (collector *customCollector) CollectWithStability(ch chan<- metrics.Metric) } else if val, ok := rc.Annotations[resourceapi.PodResourceClaimAnnotation]; ok && val != "" { source = "resource_claim_template" } - rcMetrics[resourceclaimmetrics.NumResourceClaimLabels{Allocated: allocated, AdminAccess: adminAccess, Source: source}]++ + rcMetrics[controllermetrics.NumResourceClaimLabels{Allocated: allocated, AdminAccess: adminAccess, Source: source}]++ } for rcLabels, count := range rcMetrics { ch <- metrics.NewLazyConstMetric( - resourceclaimmetrics.NumResourceClaimsDesc, + controllermetrics.NumResourceClaimsDesc, metrics.GaugeValue, float64(count), rcLabels.Allocated, diff --git a/pkg/controller/resourceclaim/controller_test.go b/pkg/controller/resourceclaim/controller_test.go index 7a11ec1c667..9ef435e2342 100644 --- a/pkg/controller/resourceclaim/controller_test.go +++ b/pkg/controller/resourceclaim/controller_test.go @@ -45,9 +45,10 @@ import ( featuregatetesting "k8s.io/component-base/featuregate/testing" "k8s.io/component-base/metrics" "k8s.io/component-base/metrics/testutil" + resourceclaimmetrics "k8s.io/dynamic-resource-allocation/resourceclaim/metrics" "k8s.io/klog/v2" "k8s.io/kubernetes/pkg/controller" - resourceclaimmetrics "k8s.io/kubernetes/pkg/controller/resourceclaim/metrics" + controllermetrics "k8s.io/kubernetes/pkg/controller/resourceclaim/metrics" "k8s.io/kubernetes/pkg/features" "k8s.io/kubernetes/test/utils/ktesting" "k8s.io/utils/ptr" @@ -1060,7 +1061,7 @@ func TestResourceClaimEventHandler(t *testing.T) { expectQueue(tCtx, []string{}) _, err = claimClient.Create(tCtx, testClaim, metav1.CreateOptions{}) - em = em.withUpdates(resourceclaimmetrics.NumResourceClaimLabels{Allocated: "false", AdminAccess: "false", Source: ""}, 1) + em = em.withUpdates(controllermetrics.NumResourceClaimLabels{Allocated: "false", AdminAccess: "false", Source: ""}, 1) tCtx.Step("create claim", func(tCtx ktesting.TContext) { 
tCtx.ExpectNoError(err) em.Eventually(tCtx) @@ -1077,8 +1078,8 @@ func TestResourceClaimEventHandler(t *testing.T) { }) _, err = claimClient.Update(tCtx, testClaimAllocated, metav1.UpdateOptions{}) - em = em.withUpdates(resourceclaimmetrics.NumResourceClaimLabels{Allocated: "false", AdminAccess: "false", Source: ""}, -1) - em = em.withUpdates(resourceclaimmetrics.NumResourceClaimLabels{Allocated: "true", AdminAccess: "false", Source: ""}, 1) + em = em.withUpdates(controllermetrics.NumResourceClaimLabels{Allocated: "false", AdminAccess: "false", Source: ""}, -1) + em = em.withUpdates(controllermetrics.NumResourceClaimLabels{Allocated: "true", AdminAccess: "false", Source: ""}, 1) tCtx.Step("allocate claim", func(tCtx ktesting.TContext) { tCtx.ExpectNoError(err) em.Eventually(tCtx) @@ -1097,7 +1098,7 @@ func TestResourceClaimEventHandler(t *testing.T) { otherClaimAllocated := testClaimAllocated.DeepCopy() otherClaimAllocated.Name += "2" _, err = claimClient.Create(tCtx, otherClaimAllocated, metav1.CreateOptions{}) - em = em.withUpdates(resourceclaimmetrics.NumResourceClaimLabels{Allocated: "true", AdminAccess: "false", Source: ""}, 1) + em = em.withUpdates(controllermetrics.NumResourceClaimLabels{Allocated: "true", AdminAccess: "false", Source: ""}, 1) tCtx.Step("create allocated claim", func(tCtx ktesting.TContext) { tCtx.ExpectNoError(err) em.Eventually(tCtx) @@ -1105,8 +1106,8 @@ func TestResourceClaimEventHandler(t *testing.T) { }) _, err = claimClient.Update(tCtx, testClaim, metav1.UpdateOptions{}) - em = em.withUpdates(resourceclaimmetrics.NumResourceClaimLabels{Allocated: "false", AdminAccess: "false", Source: ""}, 1) - em = em.withUpdates(resourceclaimmetrics.NumResourceClaimLabels{Allocated: "true", AdminAccess: "false", Source: ""}, -1) + em = em.withUpdates(controllermetrics.NumResourceClaimLabels{Allocated: "false", AdminAccess: "false", Source: ""}, 1) + em = em.withUpdates(controllermetrics.NumResourceClaimLabels{Allocated: "true", AdminAccess: "false", 
Source: ""}, -1) tCtx.Step("deallocate claim", func(tCtx ktesting.TContext) { tCtx.ExpectNoError(err) em.Eventually(tCtx) @@ -1114,7 +1115,7 @@ func TestResourceClaimEventHandler(t *testing.T) { }) err = claimClient.Delete(tCtx, testClaim.Name, metav1.DeleteOptions{}) - em = em.withUpdates(resourceclaimmetrics.NumResourceClaimLabels{Allocated: "false", AdminAccess: "false", Source: ""}, -1) + em = em.withUpdates(controllermetrics.NumResourceClaimLabels{Allocated: "false", AdminAccess: "false", Source: ""}, -1) tCtx.Step("delete deallocated claim", func(tCtx ktesting.TContext) { tCtx.ExpectNoError(err) em.Eventually(tCtx) @@ -1122,7 +1123,7 @@ func TestResourceClaimEventHandler(t *testing.T) { }) err = claimClient.Delete(tCtx, otherClaimAllocated.Name, metav1.DeleteOptions{}) - em = em.withUpdates(resourceclaimmetrics.NumResourceClaimLabels{Allocated: "true", AdminAccess: "false", Source: ""}, -1) + em = em.withUpdates(controllermetrics.NumResourceClaimLabels{Allocated: "true", AdminAccess: "false", Source: ""}, -1) tCtx.Step("delete allocated claim", func(tCtx ktesting.TContext) { tCtx.ExpectNoError(err) em.Eventually(tCtx) @@ -1130,7 +1131,7 @@ func TestResourceClaimEventHandler(t *testing.T) { }) _, err = claimClient.Create(tCtx, templatedTestClaimWithAdmin, metav1.CreateOptions{}) - em = em.withUpdates(resourceclaimmetrics.NumResourceClaimLabels{Allocated: "false", AdminAccess: "true", Source: "resource_claim_template"}, 1) + em = em.withUpdates(controllermetrics.NumResourceClaimLabels{Allocated: "false", AdminAccess: "true", Source: "resource_claim_template"}, 1) tCtx.Step("create claim with admin access", func(tCtx ktesting.TContext) { tCtx.ExpectNoError(err) em.Eventually(tCtx) @@ -1145,8 +1146,8 @@ func TestResourceClaimEventHandler(t *testing.T) { }) _, err = claimClient.Update(tCtx, templatedTestClaimWithAdminAllocated, metav1.UpdateOptions{}) - em = em.withUpdates(resourceclaimmetrics.NumResourceClaimLabels{Allocated: "false", AdminAccess: "true", Source: 
"resource_claim_template"}, -1) - em = em.withUpdates(resourceclaimmetrics.NumResourceClaimLabels{Allocated: "true", AdminAccess: "true", Source: "resource_claim_template"}, 1) + em = em.withUpdates(controllermetrics.NumResourceClaimLabels{Allocated: "false", AdminAccess: "true", Source: "resource_claim_template"}, -1) + em = em.withUpdates(controllermetrics.NumResourceClaimLabels{Allocated: "true", AdminAccess: "true", Source: "resource_claim_template"}, 1) tCtx.Step("allocate claim with admin access", func(tCtx ktesting.TContext) { tCtx.ExpectNoError(err) em.Eventually(tCtx) @@ -1163,59 +1164,59 @@ func TestResourceClaimEventHandler(t *testing.T) { otherClaimAllocated = templatedTestClaimWithAdminAllocated.DeepCopy() otherClaimAllocated.Name += "2" _, err = claimClient.Create(tCtx, otherClaimAllocated, metav1.CreateOptions{}) - em = em.withUpdates(resourceclaimmetrics.NumResourceClaimLabels{Allocated: "true", AdminAccess: "true", Source: "resource_claim_template"}, 1) + em = em.withUpdates(controllermetrics.NumResourceClaimLabels{Allocated: "true", AdminAccess: "true", Source: "resource_claim_template"}, 1) tCtx.Step("create allocated claim with admin access", func(tCtx ktesting.TContext) { tCtx.ExpectNoError(err) em.Eventually(tCtx) }) _, err = claimClient.Update(tCtx, templatedTestClaimWithAdmin, metav1.UpdateOptions{}) - em = em.withUpdates(resourceclaimmetrics.NumResourceClaimLabels{Allocated: "false", AdminAccess: "true", Source: "resource_claim_template"}, 1) - em = em.withUpdates(resourceclaimmetrics.NumResourceClaimLabels{Allocated: "true", AdminAccess: "true", Source: "resource_claim_template"}, -1) + em = em.withUpdates(controllermetrics.NumResourceClaimLabels{Allocated: "false", AdminAccess: "true", Source: "resource_claim_template"}, 1) + em = em.withUpdates(controllermetrics.NumResourceClaimLabels{Allocated: "true", AdminAccess: "true", Source: "resource_claim_template"}, -1) tCtx.Step("deallocate claim with admin access", func(tCtx 
ktesting.TContext) { tCtx.ExpectNoError(err) em.Eventually(tCtx) }) err = claimClient.Delete(tCtx, templatedTestClaimWithAdmin.Name, metav1.DeleteOptions{}) - em = em.withUpdates(resourceclaimmetrics.NumResourceClaimLabels{Allocated: "false", AdminAccess: "true", Source: "resource_claim_template"}, -1) + em = em.withUpdates(controllermetrics.NumResourceClaimLabels{Allocated: "false", AdminAccess: "true", Source: "resource_claim_template"}, -1) tCtx.Step("delete deallocated claim with admin access", func(tCtx ktesting.TContext) { tCtx.ExpectNoError(err) em.Eventually(tCtx) }) err = claimClient.Delete(tCtx, otherClaimAllocated.Name, metav1.DeleteOptions{}) - em = em.withUpdates(resourceclaimmetrics.NumResourceClaimLabels{Allocated: "true", AdminAccess: "true", Source: "resource_claim_template"}, -1) + em = em.withUpdates(controllermetrics.NumResourceClaimLabels{Allocated: "true", AdminAccess: "true", Source: "resource_claim_template"}, -1) tCtx.Step("delete allocated claim with admin access", func(tCtx ktesting.TContext) { tCtx.ExpectNoError(err) em.Eventually(tCtx) }) _, err = claimClient.Create(tCtx, extendedTestClaim, metav1.CreateOptions{}) - em = em.withUpdates(resourceclaimmetrics.NumResourceClaimLabels{Allocated: "false", AdminAccess: "false", Source: "extended_resource"}, 1) + em = em.withUpdates(controllermetrics.NumResourceClaimLabels{Allocated: "false", AdminAccess: "false", Source: "extended_resource"}, 1) tCtx.Step("create extended resource claim", func(tCtx ktesting.TContext) { tCtx.ExpectNoError(err) em.Eventually(tCtx) }) _, err = claimClient.Update(tCtx, extendedTestClaimAllocated, metav1.UpdateOptions{}) - em = em.withUpdates(resourceclaimmetrics.NumResourceClaimLabels{Allocated: "false", AdminAccess: "false", Source: "extended_resource"}, -1) - em = em.withUpdates(resourceclaimmetrics.NumResourceClaimLabels{Allocated: "true", AdminAccess: "false", Source: "extended_resource"}, 1) + em = 
em.withUpdates(controllermetrics.NumResourceClaimLabels{Allocated: "false", AdminAccess: "false", Source: "extended_resource"}, -1) + em = em.withUpdates(controllermetrics.NumResourceClaimLabels{Allocated: "true", AdminAccess: "false", Source: "extended_resource"}, 1) tCtx.Step("allocate extended resource claim", func(tCtx ktesting.TContext) { tCtx.ExpectNoError(err) em.Eventually(tCtx) }) _, err = claimClient.Update(tCtx, extendedTestClaim, metav1.UpdateOptions{}) - em = em.withUpdates(resourceclaimmetrics.NumResourceClaimLabels{Allocated: "false", AdminAccess: "false", Source: "extended_resource"}, 1) - em = em.withUpdates(resourceclaimmetrics.NumResourceClaimLabels{Allocated: "true", AdminAccess: "false", Source: "extended_resource"}, -1) + em = em.withUpdates(controllermetrics.NumResourceClaimLabels{Allocated: "false", AdminAccess: "false", Source: "extended_resource"}, 1) + em = em.withUpdates(controllermetrics.NumResourceClaimLabels{Allocated: "true", AdminAccess: "false", Source: "extended_resource"}, -1) tCtx.Step("deallocate extended resource claim", func(tCtx ktesting.TContext) { tCtx.ExpectNoError(err) em.Eventually(tCtx) }) err = claimClient.Delete(tCtx, extendedTestClaim.Name, metav1.DeleteOptions{}) - em = em.withUpdates(resourceclaimmetrics.NumResourceClaimLabels{Allocated: "false", AdminAccess: "false", Source: "extended_resource"}, -1) + em = em.withUpdates(controllermetrics.NumResourceClaimLabels{Allocated: "false", AdminAccess: "false", Source: "extended_resource"}, -1) tCtx.Step("delete extended resource claim", func(tCtx ktesting.TContext) { tCtx.ExpectNoError(err) em.Eventually(tCtx) @@ -1238,7 +1239,7 @@ func testEventHandlers(tCtx ktesting.TContext) { updateObjects []object deleteObjects []object expectedKeys []string - expectedMetrics map[resourceclaimmetrics.NumResourceClaimLabels]float64 + expectedMetrics map[controllermetrics.NumResourceClaimLabels]float64 }{ "nothing": {}, "new-podgroup-feature-disabled": { @@ -1256,7 +1257,7 @@ func 
testEventHandlers(tCtx ktesting.TContext) { initialObjects: []runtime.Object{testPodGroupClaim}, createObjects: []object{testPodGroupWithResourceInStatus}, expectedKeys: []string{}, - expectedMetrics: map[resourceclaimmetrics.NumResourceClaimLabels]float64{ + expectedMetrics: map[controllermetrics.NumResourceClaimLabels]float64{ {Allocated: "false", AdminAccess: "false"}: 1, }, }, @@ -1265,7 +1266,7 @@ func testEventHandlers(tCtx ktesting.TContext) { initialObjects: []runtime.Object{testPodGroupWithResourceInStatus}, createObjects: []object{testPodGroupClaim}, expectedKeys: []string{testClaimKey}, - expectedMetrics: map[resourceclaimmetrics.NumResourceClaimLabels]float64{ + expectedMetrics: map[controllermetrics.NumResourceClaimLabels]float64{ {Allocated: "false", AdminAccess: "false"}: 1, }, }, @@ -1748,7 +1749,7 @@ func createResourceClaimReactor() func(action k8stesting.Action) (handled bool, } type numMetrics struct { - metrics map[resourceclaimmetrics.NumResourceClaimLabels]float64 + metrics map[controllermetrics.NumResourceClaimLabels]float64 lister resourcelisters.ResourceClaimLister } @@ -1767,7 +1768,7 @@ func getNumMetric(lister resourcelisters.ResourceClaimLister, logger klog.Logger return numMetrics{}, fmt.Errorf("failed to gather metrics: %w", err) } - metricName := "resourceclaim_controller_resource_claims" + metricName := "dynamic_resource_allocation_resource_claims" em = newNumMetrics(lister) @@ -1786,7 +1787,7 @@ func getNumMetric(lister resourcelisters.ResourceClaimLister, logger klog.Logger source := labels["source"] value := metric.GetGauge().GetValue() - em.metrics[resourceclaimmetrics.NumResourceClaimLabels{ + em.metrics[controllermetrics.NumResourceClaimLabels{ Allocated: allocated, AdminAccess: adminAccess, Source: source, @@ -1871,18 +1872,18 @@ func handleErr(t *testing.T, err error, metricName string) { } func setupMetrics() { // Enable test mode to prevent global custom collector registration - resourceclaimmetrics.SetTestMode(true) + 
controllermetrics.SetTestMode(true) // Reset counter metrics for each test (they are registered by the controller itself) resourceclaimmetrics.ResourceClaimCreate.Reset() } func newNumMetrics(lister resourcelisters.ResourceClaimLister) numMetrics { - metrics := make(map[resourceclaimmetrics.NumResourceClaimLabels]float64) + metrics := make(map[controllermetrics.NumResourceClaimLabels]float64) for _, allocated := range []string{"false", "true"} { for _, adminAccess := range []string{"false", "true"} { for _, source := range []string{"", "extended_resource", "resource_claim_template"} { - metrics[resourceclaimmetrics.NumResourceClaimLabels{ + metrics[controllermetrics.NumResourceClaimLabels{ Allocated: allocated, AdminAccess: adminAccess, Source: source, @@ -1896,7 +1897,7 @@ func newNumMetrics(lister resourcelisters.ResourceClaimLister) numMetrics { } } -func (em numMetrics) withUpdates(rcLabels resourceclaimmetrics.NumResourceClaimLabels, n float64) numMetrics { +func (em numMetrics) withUpdates(rcLabels controllermetrics.NumResourceClaimLabels, n float64) numMetrics { em.metrics[rcLabels] += n return numMetrics{ metrics: em.metrics, diff --git a/pkg/controller/resourceclaim/metrics/metrics.go b/pkg/controller/resourceclaim/metrics/metrics.go index 293b3f80b83..0cf321781d7 100644 --- a/pkg/controller/resourceclaim/metrics/metrics.go +++ b/pkg/controller/resourceclaim/metrics/metrics.go @@ -23,8 +23,8 @@ import ( "k8s.io/component-base/metrics/legacyregistry" ) -// ResourceClaimSubsystem - subsystem name used for ResourceClaim creation -const ResourceClaimSubsystem = "resourceclaim_controller" +// subsystem is intentionally generic because metrics with the same subsystem are also defined in k8s.io/dynamic-resource-allocation/resourceclaim/metrics. 
+const subsystem = "dynamic_resource_allocation" type NumResourceClaimLabels struct { Allocated string @@ -33,25 +33,12 @@ type NumResourceClaimLabels struct { } var ( - // ResourceClaimCreate tracks the total number of - // ResourceClaims creation requests - // categorized by their creation status and admin access. - ResourceClaimCreate = metrics.NewCounterVec( - &metrics.CounterOpts{ - Subsystem: ResourceClaimSubsystem, - Name: "creates_total", - Help: "Number of ResourceClaims creation requests, categorized by creation status and admin access", - StabilityLevel: metrics.ALPHA, - }, - []string{"status", "admin_access"}, - ) - // NumResourceClaimsDesc tracks the number of ResourceClaims, // categorized by their allocation status, admin access, and source. // Source can be 'resource_claim_template' (created from a template), // 'extended_resource' (extended resources), or empty (manually created by a user). NumResourceClaimsDesc = metrics.NewDesc( - metrics.BuildFQName("", ResourceClaimSubsystem, "resource_claims"), + metrics.BuildFQName("", subsystem, "resource_claims"), "Number of ResourceClaims, categorized by allocation status, admin access, and source. "+ "Source can be 'resource_claim_template' (created from a template), "+ "'extended_resource' (extended resources), or empty (manually created by a user).", @@ -73,7 +60,6 @@ func SetTestMode(enabled bool) { // RegisterMetrics registers ResourceClaim metrics. 
func RegisterMetrics(collector metrics.StableCollector) { registerMetrics.Do(func() { - legacyregistry.MustRegister(ResourceClaimCreate) if !testMode && collector != nil { // Only register custom collector in non-test mode legacyregistry.CustomMustRegister(collector) diff --git a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources_test.go b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources_test.go index 20ad1ae6436..b6a3e608d20 100644 --- a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources_test.go +++ b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources_test.go @@ -2232,7 +2232,7 @@ func testPlugin(tCtx ktesting.TContext) { classes: []*resourceapi.DeviceClass{deviceClassWithExtendResourceName}, want: want{}, metrics: func(tCtx ktesting.TContext, g compbasemetrics.Gatherer) { - _, err := testutil.GetCounterValuesFromGatherer(g, "scheduler_resourceclaim_creates_total", map[string]string{}, "status") + _, err := testutil.GetCounterValuesFromGatherer(g, "dynamic_resource_allocation_resourceclaim_creates_total", map[string]string{}, "status") require.ErrorContains(tCtx, err, "not found") }, }, @@ -2302,7 +2302,7 @@ func testPlugin(tCtx ktesting.TContext) { }, }, metrics: func(tCtx ktesting.TContext, g compbasemetrics.Gatherer) { - _, err := testutil.GetCounterValuesFromGatherer(g, "scheduler_resourceclaim_creates_total", map[string]string{}, "status") + _, err := testutil.GetCounterValuesFromGatherer(g, "dynamic_resource_allocation_resourceclaim_creates_total", map[string]string{}, "status") require.ErrorContains(tCtx, err, "not found") }, }, @@ -2324,7 +2324,7 @@ func testPlugin(tCtx ktesting.TContext) { }, }, metrics: func(tCtx ktesting.TContext, g compbasemetrics.Gatherer) { - metric, err := testutil.GetCounterValuesFromGatherer(g, "scheduler_resourceclaim_creates_total", map[string]string{}, "status") + metric, err := testutil.GetCounterValuesFromGatherer(g, 
"dynamic_resource_allocation_resourceclaim_creates_total", map[string]string{}, "status") require.NoError(tCtx, err) require.Equal(tCtx, 1, int(metric["success"])) }, @@ -2347,7 +2347,7 @@ func testPlugin(tCtx ktesting.TContext) { }, }, metrics: func(tCtx ktesting.TContext, g compbasemetrics.Gatherer) { - metric, err := testutil.GetCounterValuesFromGatherer(g, "scheduler_resourceclaim_creates_total", map[string]string{}, "status") + metric, err := testutil.GetCounterValuesFromGatherer(g, "dynamic_resource_allocation_resourceclaim_creates_total", map[string]string{}, "status") require.NoError(tCtx, err) require.Equal(tCtx, 1, int(metric["success"])) }, @@ -2370,7 +2370,7 @@ func testPlugin(tCtx ktesting.TContext) { }, }, metrics: func(tCtx ktesting.TContext, g compbasemetrics.Gatherer) { - metric, err := testutil.GetCounterValuesFromGatherer(g, "scheduler_resourceclaim_creates_total", map[string]string{}, "status") + metric, err := testutil.GetCounterValuesFromGatherer(g, "dynamic_resource_allocation_resourceclaim_creates_total", map[string]string{}, "status") require.NoError(tCtx, err) require.Equal(tCtx, 1, int(metric["success"])) }, @@ -2395,7 +2395,7 @@ func testPlugin(tCtx ktesting.TContext) { }, }, metrics: func(tCtx ktesting.TContext, g compbasemetrics.Gatherer) { - metric, err := testutil.GetCounterValuesFromGatherer(g, "scheduler_resourceclaim_creates_total", map[string]string{}, "status") + metric, err := testutil.GetCounterValuesFromGatherer(g, "dynamic_resource_allocation_resourceclaim_creates_total", map[string]string{}, "status") require.NoError(tCtx, err) require.Equal(tCtx, 1, int(metric["success"])) }, @@ -2418,7 +2418,7 @@ func testPlugin(tCtx ktesting.TContext) { }, }, metrics: func(tCtx ktesting.TContext, g compbasemetrics.Gatherer) { - _, err := testutil.GetCounterValuesFromGatherer(g, "scheduler_resourceclaim_creates_total", map[string]string{}, "status") + _, err := testutil.GetCounterValuesFromGatherer(g, 
"dynamic_resource_allocation_resourceclaim_creates_total", map[string]string{}, "status") require.ErrorContains(tCtx, err, "not found") }, }, @@ -2440,7 +2440,7 @@ func testPlugin(tCtx ktesting.TContext) { }, }, metrics: func(tCtx ktesting.TContext, g compbasemetrics.Gatherer) { - _, err := testutil.GetCounterValuesFromGatherer(g, "scheduler_resourceclaim_creates_total", map[string]string{}, "status") + _, err := testutil.GetCounterValuesFromGatherer(g, "dynamic_resource_allocation_resourceclaim_creates_total", map[string]string{}, "status") require.ErrorContains(tCtx, err, "not found") }, }, @@ -2462,7 +2462,7 @@ func testPlugin(tCtx ktesting.TContext) { }, }, metrics: func(tCtx ktesting.TContext, g compbasemetrics.Gatherer) { - metric, err := testutil.GetCounterValuesFromGatherer(g, "scheduler_resourceclaim_creates_total", map[string]string{}, "status") + metric, err := testutil.GetCounterValuesFromGatherer(g, "dynamic_resource_allocation_resourceclaim_creates_total", map[string]string{}, "status") require.NoError(tCtx, err) require.Equal(tCtx, 1, int(metric["success"])) }, @@ -2479,7 +2479,7 @@ func testPlugin(tCtx ktesting.TContext) { unreserveBeforePreBind: &result{}, }, metrics: func(tCtx ktesting.TContext, g compbasemetrics.Gatherer) { - metric, err := testutil.GetCounterValuesFromGatherer(g, "scheduler_resourceclaim_creates_total", map[string]string{}, "status") + metric, err := testutil.GetCounterValuesFromGatherer(g, "dynamic_resource_allocation_resourceclaim_creates_total", map[string]string{}, "status") require.NoError(tCtx, err) require.Equal(tCtx, 1, int(metric["success"])) }, @@ -2510,7 +2510,7 @@ func testPlugin(tCtx ktesting.TContext) { }, }, metrics: func(tCtx ktesting.TContext, g compbasemetrics.Gatherer) { - metric, err := testutil.GetCounterValuesFromGatherer(g, "scheduler_resourceclaim_creates_total", map[string]string{}, "status") + metric, err := testutil.GetCounterValuesFromGatherer(g, 
"dynamic_resource_allocation_resourceclaim_creates_total", map[string]string{}, "status") require.NoError(tCtx, err) require.Equal(tCtx, 1, int(metric["failure"])) }, diff --git a/pkg/scheduler/framework/plugins/dynamicresources/extendeddynamicresources.go b/pkg/scheduler/framework/plugins/dynamicresources/extendeddynamicresources.go index e6569023229..054cfdd55d4 100644 --- a/pkg/scheduler/framework/plugins/dynamicresources/extendeddynamicresources.go +++ b/pkg/scheduler/framework/plugins/dynamicresources/extendeddynamicresources.go @@ -548,10 +548,10 @@ func (pl *DynamicResources) createExtendedResourceClaimInAPI( createdClaim, err := pl.clientset.ResourceV1().ResourceClaims(claim.Namespace).Create(ctx, claim, metav1.CreateOptions{}) if err != nil { - metrics.ResourceClaimCreatesTotal.WithLabelValues("failure").Inc() + metrics.ResourceClaimCreatesTotal.WithLabelValues("failure", "false").Inc() return nil, fmt.Errorf("create claim for extended resources %v: %w", klog.KObj(claim), err) } - metrics.ResourceClaimCreatesTotal.WithLabelValues("success").Inc() + metrics.ResourceClaimCreatesTotal.WithLabelValues("success", "false").Inc() logger.V(5).Info("created claim for extended resources", "pod", klog.KObj(pod), "node", nodeName, "resourceclaim", klog.Format(createdClaim)) return createdClaim, nil diff --git a/pkg/scheduler/metrics/metrics.go b/pkg/scheduler/metrics/metrics.go index 2bf59c47ef3..57343c2f42e 100644 --- a/pkg/scheduler/metrics/metrics.go +++ b/pkg/scheduler/metrics/metrics.go @@ -23,6 +23,7 @@ import ( utilfeature "k8s.io/apiserver/pkg/util/feature" "k8s.io/component-base/metrics" "k8s.io/component-base/metrics/legacyregistry" + resourceclaimmetrics "k8s.io/dynamic-resource-allocation/resourceclaim/metrics" "k8s.io/kubernetes/pkg/features" volumebindingmetrics "k8s.io/kubernetes/pkg/scheduler/framework/plugins/volumebinding/metrics" ) @@ -166,7 +167,8 @@ var ( AsyncAPIPendingCalls *metrics.GaugeVec // The below is only available when the 
DRAExtendedResource feature gate is enabled. - ResourceClaimCreatesTotal *metrics.CounterVec + // This is the same metric that also gets recorded in the kube-controller-manager. + ResourceClaimCreatesTotal = resourceclaimmetrics.ResourceClaimCreate podGroupScheduleAttempts *metrics.CounterVec podGroupSchedulingLatency *metrics.HistogramVec @@ -200,7 +202,7 @@ func Register() { ) } if utilfeature.DefaultFeatureGate.Enabled(features.DRAExtendedResource) { - RegisterMetrics(ResourceClaimCreatesTotal) + resourceclaimmetrics.RegisterMetrics() } if utilfeature.DefaultFeatureGate.Enabled(features.GenericWorkload) { RegisterMetrics( @@ -459,15 +461,6 @@ func InitMetrics() { }, []string{"call_type"}) - ResourceClaimCreatesTotal = metrics.NewCounterVec( - &metrics.CounterOpts{ - Subsystem: SchedulerSubsystem, - Name: "resourceclaim_creates_total", - Help: "Number of ResourceClaims creation requests within scheduler", - StabilityLevel: metrics.ALPHA, - }, - []string{"status"}) - DRABindingConditionsAllocationsTotal = metrics.NewCounterVec( &metrics.CounterOpts{ Subsystem: SchedulerSubsystem, diff --git a/staging/publishing/import-restrictions.yaml b/staging/publishing/import-restrictions.yaml index a61a5da7e34..1322938257a 100644 --- a/staging/publishing/import-restrictions.yaml +++ b/staging/publishing/import-restrictions.yaml @@ -270,6 +270,8 @@ - k8s.io/apiserver/pkg/cel - k8s.io/apiserver/pkg/cel/environment - k8s.io/client-go + - k8s.io/component-base/metrics + - k8s.io/component-base/metrics/legacyregistry - k8s.io/component-helpers/scheduling/corev1/nodeaffinity - k8s.io/dynamic-resource-allocation - k8s.io/klog diff --git a/staging/src/k8s.io/dynamic-resource-allocation/go.mod b/staging/src/k8s.io/dynamic-resource-allocation/go.mod index f8d83276301..462e8351729 100644 --- a/staging/src/k8s.io/dynamic-resource-allocation/go.mod +++ b/staging/src/k8s.io/dynamic-resource-allocation/go.mod @@ -19,6 +19,7 @@ require ( k8s.io/apimachinery v0.0.0 k8s.io/apiserver v0.0.0 
k8s.io/client-go v0.0.0 + k8s.io/component-base v0.0.0 k8s.io/component-helpers v0.0.0 k8s.io/klog/v2 v2.140.0 k8s.io/kubelet v0.0.0 @@ -83,7 +84,6 @@ require ( gopkg.in/evanphx/json-patch.v4 v4.13.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect gopkg.in/yaml.v3 v3.0.1 // indirect - k8s.io/component-base v0.0.0 // indirect k8s.io/kube-openapi v0.0.0-20260509150519-312035bf509b // indirect sigs.k8s.io/json v0.0.0-20250730193827-2d320260d730 // indirect sigs.k8s.io/structured-merge-diff/v6 v6.3.2 // indirect diff --git a/staging/src/k8s.io/dynamic-resource-allocation/resourceclaim/metrics/OWNERS b/staging/src/k8s.io/dynamic-resource-allocation/resourceclaim/metrics/OWNERS new file mode 100644 index 00000000000..b26e7a4dc7e --- /dev/null +++ b/staging/src/k8s.io/dynamic-resource-allocation/resourceclaim/metrics/OWNERS @@ -0,0 +1,8 @@ +# See the OWNERS docs at https://go.k8s.io/owners + +approvers: + - sig-instrumentation-approvers +reviewers: + - sig-instrumentation-reviewers +labels: + - sig/instrumentation diff --git a/staging/src/k8s.io/dynamic-resource-allocation/resourceclaim/metrics/metrics.go b/staging/src/k8s.io/dynamic-resource-allocation/resourceclaim/metrics/metrics.go new file mode 100644 index 00000000000..be1e90b1fb6 --- /dev/null +++ b/staging/src/k8s.io/dynamic-resource-allocation/resourceclaim/metrics/metrics.go @@ -0,0 +1,53 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package metrics + +import ( + "sync" + + "k8s.io/component-base/metrics" + "k8s.io/component-base/metrics/legacyregistry" +) + +// subsystem is intentionally generic because these metrics are exposed in kube-controller-manager and kube-scheduler. +const subsystem = "dynamic_resource_allocation" + +var ( + // ResourceClaimCreate tracks the total number of + // ResourceClaims creation requests + // categorized by their creation status and admin access. + // Used by kube-controller-manager and kube-scheduler, so + // the component where this metric gets collected is another dimension. + ResourceClaimCreate = metrics.NewCounterVec( + &metrics.CounterOpts{ + Subsystem: subsystem, + Name: "resourceclaim_creates_total", + Help: "Number of ResourceClaims creation requests, categorized by creation status and admin access", + StabilityLevel: metrics.ALPHA, + }, + []string{"status", "admin_access"}, + ) +) + +var registerMetrics sync.Once + +// RegisterMetrics registers ResourceClaim metrics. +func RegisterMetrics() { + registerMetrics.Do(func() { + legacyregistry.MustRegister(ResourceClaimCreate) + }) +} diff --git a/test/integration/dra/core.go b/test/integration/dra/core.go index 564c5a8412b..5cfda23e074 100644 --- a/test/integration/dra/core.go +++ b/test/integration/dra/core.go @@ -42,10 +42,10 @@ import ( "k8s.io/component-base/featuregate" "k8s.io/component-base/metrics/testutil" draclient "k8s.io/dynamic-resource-allocation/client" + resourceclaimmetrics "k8s.io/dynamic-resource-allocation/resourceclaim/metrics" "k8s.io/dynamic-resource-allocation/resourceslice" "k8s.io/klog/v2" "k8s.io/kubernetes/pkg/controller/resourceclaim" - resourceclaimmetrics "k8s.io/kubernetes/pkg/controller/resourceclaim/metrics" "k8s.io/kubernetes/pkg/features" st "k8s.io/kubernetes/pkg/scheduler/testing" e2epod "k8s.io/kubernetes/test/e2e/framework/pod"