From f1bf45e18969e2f27d972abf6950e570a2adfb9e Mon Sep 17 00:00:00 2001 From: Prathamesh Bhope Date: Tue, 20 Jan 2026 18:11:22 -0800 Subject: [PATCH] endpointslice/metrics: promote metrics to beta and add test --- .../k8s.io/endpointslice/metrics/metrics.go | 12 +- .../k8s.io/endpointslice/reconciler_test.go | 202 ++++++++++++++++-- .../testdata/stable-metrics-list.yaml | 64 ++++++ 3 files changed, 249 insertions(+), 29 deletions(-) diff --git a/staging/src/k8s.io/endpointslice/metrics/metrics.go b/staging/src/k8s.io/endpointslice/metrics/metrics.go index 5323a5dbc7c..7368d2b7800 100644 --- a/staging/src/k8s.io/endpointslice/metrics/metrics.go +++ b/staging/src/k8s.io/endpointslice/metrics/metrics.go @@ -34,7 +34,7 @@ var ( Subsystem: EndpointSliceSubsystem, Name: "endpoints_added_per_sync", Help: "Number of endpoints added on each Service sync", - StabilityLevel: metrics.ALPHA, + StabilityLevel: metrics.BETA, Buckets: metrics.ExponentialBuckets(2, 2, 15), }, []string{}, @@ -46,7 +46,7 @@ var ( Subsystem: EndpointSliceSubsystem, Name: "endpoints_removed_per_sync", Help: "Number of endpoints removed on each Service sync", - StabilityLevel: metrics.ALPHA, + StabilityLevel: metrics.BETA, Buckets: metrics.ExponentialBuckets(2, 2, 15), }, []string{}, @@ -57,7 +57,7 @@ var ( Subsystem: EndpointSliceSubsystem, Name: "endpoints_desired", Help: "Number of endpoints desired", - StabilityLevel: metrics.ALPHA, + StabilityLevel: metrics.BETA, }, []string{}, ) @@ -67,7 +67,7 @@ var ( Subsystem: EndpointSliceSubsystem, Name: "num_endpoint_slices", Help: "Number of EndpointSlices", - StabilityLevel: metrics.ALPHA, + StabilityLevel: metrics.BETA, }, []string{}, ) @@ -78,7 +78,7 @@ var ( Subsystem: EndpointSliceSubsystem, Name: "desired_endpoint_slices", Help: "Number of EndpointSlices that would exist with perfect endpoint allocation", - StabilityLevel: metrics.ALPHA, + StabilityLevel: metrics.BETA, }, []string{}, ) @@ -127,7 +127,7 @@ var ( Subsystem: EndpointSliceSubsystem, Name: "services_count_by_traffic_distribution", Help: "Number of Services using some specific trafficDistribution", - StabilityLevel: metrics.ALPHA, + StabilityLevel: metrics.BETA, }, []string{"traffic_distribution"}, // A trafficDistribution value ) diff --git a/staging/src/k8s.io/endpointslice/reconciler_test.go b/staging/src/k8s.io/endpointslice/reconciler_test.go index ea4291c65a7..76083e7a867 100644 --- a/staging/src/k8s.io/endpointslice/reconciler_test.go +++ b/staging/src/k8s.io/endpointslice/reconciler_test.go @@ -39,6 +39,7 @@ import ( corelisters "k8s.io/client-go/listers/core/v1" k8stesting "k8s.io/client-go/testing" "k8s.io/client-go/tools/record" + "k8s.io/component-base/metrics/legacyregistry" "k8s.io/component-base/metrics/testutil" "k8s.io/endpointslice/metrics" "k8s.io/endpointslice/topologycache" @@ -51,6 +52,10 @@ const ( controllerName = "endpointslice-controller.k8s.io" ) +func init() { + metrics.RegisterMetrics() +} + func expectAction(t *testing.T, actions []k8stesting.Action, index int, verb, resource string) { t.Helper() if len(actions) <= index { @@ -114,7 +119,7 @@ var defaultMaxEndpointsPerSlice = int32(100) // Even when there are no pods, we want to have a placeholder slice for each service func TestReconcileEmpty(t *testing.T) { client := newClientset() - setupMetrics() + resetMetrics() namespace := "test" svc, _ := newServiceAndEndpointMeta("foo", namespace) @@ -468,7 +473,7 @@ func TestReconcile1Pod(t *testing.T) { for name, testCase := range testCases { t.Run(name, func(t *testing.T) { client := newClientset() - setupMetrics() + resetMetrics() triggerTime := time.Now().UTC() r := newReconciler(client, []*corev1.Node{node1}, defaultMaxEndpointsPerSlice) reconcileHelper(t, r, &testCase.service, []*corev1.Pod{pod1}, []*discovery.EndpointSlice{}, triggerTime) @@ -574,7 +579,7 @@ func TestReconcile1EndpointSlice(t *testing.T) { for _, tc := range testCases { t.Run(tc.desc, func(t *testing.T) { client := newClientset() - setupMetrics() + resetMetrics() existingSlices := []*discovery.EndpointSlice{} if tc.existing != nil { @@ -727,7 +732,7 @@ func TestReconcile1EndpointSlicePublishNotReadyAddresses(t *testing.T) { // reconcile should create 3 slices, completely filling 2 of them func TestReconcileManyPods(t *testing.T) { client := newClientset() - setupMetrics() + resetMetrics() namespace := "test" svc, _ := newServiceAndEndpointMeta("foo", namespace) @@ -760,7 +765,7 @@ func TestReconcileManyPods(t *testing.T) { // this approach requires 1 update + 1 create instead of 2 updates + 1 create func TestReconcileEndpointSlicesSomePreexisting(t *testing.T) { client := newClientset() - setupMetrics() + resetMetrics() namespace := "test" svc, endpointMeta := newServiceAndEndpointMeta("foo", namespace) @@ -816,7 +821,7 @@ func TestReconcileEndpointSlicesSomePreexisting(t *testing.T) { // this approach requires 2 creates instead of 2 updates + 1 create func TestReconcileEndpointSlicesSomePreexistingWorseAllocation(t *testing.T) { client := newClientset() - setupMetrics() + resetMetrics() namespace := "test" svc, endpointMeta := newServiceAndEndpointMeta("foo", namespace) @@ -973,7 +978,7 @@ func TestReconcileEndpointSlicesServicesReservedLabels(t *testing.T) { // reconcile repacks the endpoints into 3 slices, and deletes the extras func TestReconcileEndpointSlicesRecycling(t *testing.T) { client := newClientset() - setupMetrics() + resetMetrics() namespace := "test" svc, endpointMeta := newServiceAndEndpointMeta("foo", namespace) @@ -1023,7 +1028,7 @@ func TestReconcileEndpointSlicesRecycling(t *testing.T) { // for update. func TestReconcileEndpointSlicesUpdatePacking(t *testing.T) { client := newClientset() - setupMetrics() + resetMetrics() namespace := "test" svc, endpointMeta := newServiceAndEndpointMeta("foo", namespace) @@ -1082,7 +1087,7 @@ func TestReconcileEndpointSlicesUpdatePacking(t *testing.T) { // address type will be replaced with a newer IPv4 type. func TestReconcileEndpointSlicesReplaceDeprecated(t *testing.T) { client := newClientset() - setupMetrics() + resetMetrics() namespace := "test" svc, endpointMeta := newServiceAndEndpointMeta("foo", namespace) @@ -1155,7 +1160,7 @@ func TestReconcileEndpointSlicesRecreation(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { client := newClientset() - setupMetrics() + resetMetrics() namespace := "test" svc, endpointMeta := newServiceAndEndpointMeta("foo", namespace) @@ -1204,7 +1209,7 @@ func TestReconcileEndpointSlicesRecreation(t *testing.T) { // This test ensures that EndpointSlices are grouped correctly in that case. func TestReconcileEndpointSlicesNamedPorts(t *testing.T) { client := newClientset() - setupMetrics() + resetMetrics() namespace := "test" portNameIntStr := intstr.IntOrString{ @@ -1312,7 +1317,7 @@ func TestReconcileMaxEndpointsPerSlice(t *testing.T) { for _, testCase := range testCases { t.Run(fmt.Sprintf("maxEndpointsPerSlice: %d", testCase.maxEndpointsPerSlice), func(t *testing.T) { client := newClientset() - setupMetrics() + resetMetrics() r := newReconciler(client, []*corev1.Node{{ObjectMeta: metav1.ObjectMeta{Name: "node-1"}}}, testCase.maxEndpointsPerSlice) reconcileHelper(t, r, &svc, pods, []*discovery.EndpointSlice{}, time.Now()) expectUnorderedSlicesWithLengths(t, fetchEndpointSlices(t, client, namespace), testCase.expectedSliceLengths) @@ -1323,7 +1328,7 @@ func TestReconcileMaxEndpointsPerSlice(t *testing.T) { func TestReconcileEndpointSlicesMetrics(t *testing.T) { client := newClientset() - setupMetrics() + resetMetrics() namespace := "test" svc, _ := newServiceAndEndpointMeta("foo", namespace) @@ -1347,6 +1352,154 @@ func TestReconcileEndpointSlicesMetrics(t *testing.T) { expectMetrics(t, expectedMetrics{desiredSlices: 1, actualSlices: 1, desiredEndpoints: 10, addedPerSync: 20, removedPerSync: 10, numCreated: 1, numUpdated: 1, numDeleted: 0, slicesChangedPerSync: 2}) } +func TestReconcileEndpointSlicesMetrics_EndpointsAddedPerSync_CollectAndCompare(t *testing.T) { + resetMetrics() + client := newClientset() + namespace := "test" + svc, endpointMeta := newServiceAndEndpointMeta("foo", namespace) + + keepPod := newPod(1, namespace, true, 1, false) + newDesiredPod := newPod(2, namespace, true, 1, false) + + // Existing slice has one endpoint that should remain. Reconcile should add exactly 1 endpoint. + existingSlice := newEmptyEndpointSlice(1, namespace, endpointMeta, svc) + existingSlice.Endpoints = []discovery.Endpoint{ + podToEndpoint(keepPod, &corev1.Node{}, &svc, discovery.AddressTypeIPv4), + } + createdSlice, err := client.DiscoveryV1().EndpointSlices(namespace).Create(context.TODO(), existingSlice, metav1.CreateOptions{}) + if err != nil { + t.Fatalf("Expected no error creating EndpointSlice, got: %v", err) + } + + r := newReconciler(client, []*corev1.Node{{ObjectMeta: metav1.ObjectMeta{Name: "node-1"}}}, defaultMaxEndpointsPerSlice) + reconcileHelper(t, r, &svc, []*corev1.Pod{keepPod, newDesiredPod}, []*discovery.EndpointSlice{createdSlice}, time.Now()) + + want := ` +# HELP endpoint_slice_controller_endpoints_added_per_sync [BETA] Number of endpoints added on each Service sync +# TYPE endpoint_slice_controller_endpoints_added_per_sync histogram +endpoint_slice_controller_endpoints_added_per_sync_bucket{le="2"} 1 +endpoint_slice_controller_endpoints_added_per_sync_bucket{le="4"} 1 +endpoint_slice_controller_endpoints_added_per_sync_bucket{le="8"} 1 +endpoint_slice_controller_endpoints_added_per_sync_bucket{le="16"} 1 +endpoint_slice_controller_endpoints_added_per_sync_bucket{le="32"} 1 +endpoint_slice_controller_endpoints_added_per_sync_bucket{le="64"} 1 +endpoint_slice_controller_endpoints_added_per_sync_bucket{le="128"} 1 +endpoint_slice_controller_endpoints_added_per_sync_bucket{le="256"} 1 +endpoint_slice_controller_endpoints_added_per_sync_bucket{le="512"} 1 +endpoint_slice_controller_endpoints_added_per_sync_bucket{le="1024"} 1 +endpoint_slice_controller_endpoints_added_per_sync_bucket{le="2048"} 1 +endpoint_slice_controller_endpoints_added_per_sync_bucket{le="4096"} 1 +endpoint_slice_controller_endpoints_added_per_sync_bucket{le="8192"} 1 +endpoint_slice_controller_endpoints_added_per_sync_bucket{le="16384"} 1 +endpoint_slice_controller_endpoints_added_per_sync_bucket{le="32768"} 1 +endpoint_slice_controller_endpoints_added_per_sync_bucket{le="+Inf"} 1 +endpoint_slice_controller_endpoints_added_per_sync_sum 1 +endpoint_slice_controller_endpoints_added_per_sync_count 1 +` + + if err := testutil.CollectAndCompare(metrics.EndpointsAddedPerSync, strings.NewReader(want), "endpoint_slice_controller_endpoints_added_per_sync"); err != nil { + t.Fatal(err) + } +} + +func TestReconcileEndpointSlicesMetrics_EndpointsRemovedPerSync_CollectAndCompare(t *testing.T) { + resetMetrics() + + client := newClientset() + namespace := "test" + svc, endpointMeta := newServiceAndEndpointMeta("foo", namespace) + + stalePod := newPod(0, namespace, true, 1, false) + keepPod := newPod(1, namespace, true, 1, false) + + // Existing slice has one stale endpoint and one endpoint that should remain. + // Reconcile should remove exactly 1 endpoint. + existingSlice := newEmptyEndpointSlice(1, namespace, endpointMeta, svc) + existingSlice.Endpoints = []discovery.Endpoint{ + podToEndpoint(stalePod, &corev1.Node{}, &svc, discovery.AddressTypeIPv4), + podToEndpoint(keepPod, &corev1.Node{}, &svc, discovery.AddressTypeIPv4), + } + createdSlice, err := client.DiscoveryV1().EndpointSlices(namespace).Create(context.TODO(), existingSlice, metav1.CreateOptions{}) + if err != nil { + t.Fatalf("Expected no error creating EndpointSlice, got: %v", err) + } + + r := newReconciler(client, []*corev1.Node{{ObjectMeta: metav1.ObjectMeta{Name: "node-1"}}}, defaultMaxEndpointsPerSlice) + reconcileHelper(t, r, &svc, []*corev1.Pod{keepPod}, []*discovery.EndpointSlice{createdSlice}, time.Now()) + + want := ` +# HELP endpoint_slice_controller_endpoints_removed_per_sync [BETA] Number of endpoints removed on each Service sync +# TYPE endpoint_slice_controller_endpoints_removed_per_sync histogram +endpoint_slice_controller_endpoints_removed_per_sync_bucket{le="2"} 1 +endpoint_slice_controller_endpoints_removed_per_sync_bucket{le="4"} 1 +endpoint_slice_controller_endpoints_removed_per_sync_bucket{le="8"} 1 +endpoint_slice_controller_endpoints_removed_per_sync_bucket{le="16"} 1 +endpoint_slice_controller_endpoints_removed_per_sync_bucket{le="32"} 1 +endpoint_slice_controller_endpoints_removed_per_sync_bucket{le="64"} 1 +endpoint_slice_controller_endpoints_removed_per_sync_bucket{le="128"} 1 +endpoint_slice_controller_endpoints_removed_per_sync_bucket{le="256"} 1 +endpoint_slice_controller_endpoints_removed_per_sync_bucket{le="512"} 1 +endpoint_slice_controller_endpoints_removed_per_sync_bucket{le="1024"} 1 +endpoint_slice_controller_endpoints_removed_per_sync_bucket{le="2048"} 1 +endpoint_slice_controller_endpoints_removed_per_sync_bucket{le="4096"} 1 +endpoint_slice_controller_endpoints_removed_per_sync_bucket{le="8192"} 1 +endpoint_slice_controller_endpoints_removed_per_sync_bucket{le="16384"} 1 +endpoint_slice_controller_endpoints_removed_per_sync_bucket{le="32768"} 1 +endpoint_slice_controller_endpoints_removed_per_sync_bucket{le="+Inf"} 1 +endpoint_slice_controller_endpoints_removed_per_sync_sum 1 +endpoint_slice_controller_endpoints_removed_per_sync_count 1 +` + + if err := testutil.CollectAndCompare(metrics.EndpointsRemovedPerSync, strings.NewReader(want), "endpoint_slice_controller_endpoints_removed_per_sync"); err != nil { + t.Fatal(err) + } +} + +func TestReconcileEndpointSlicesMetrics_GaugeUpdates(t *testing.T) { + resetMetrics() + client := newClientset() + namespace := "test" + svc, _ := newServiceAndEndpointMeta("foo", namespace) + + r := newReconciler(client, []*corev1.Node{{ObjectMeta: metav1.ObjectMeta{Name: "node-1"}}}, defaultMaxEndpointsPerSlice) + + svc.Spec.TrafficDistribution = ptr.To(corev1.ServiceTrafficDistributionPreferClose) + if _, err := client.CoreV1().Services(namespace).Create(context.TODO(), &svc, metav1.CreateOptions{}); err != nil { + t.Fatalf("Expected no error creating Service, got: %v", err) + } + + // Reconcile with exactly 2 pods to deterministically set gauge metric values. + pod1 := newPod(1, namespace, true, 1, false) + pod2 := newPod(2, namespace, true, 1, false) + reconcileHelper(t, r, &svc, []*corev1.Pod{pod1, pod2}, []*discovery.EndpointSlice{}, time.Unix(0, 0)) + + want := ` +# HELP endpoint_slice_controller_desired_endpoint_slices [BETA] Number of EndpointSlices that would exist with perfect endpoint allocation +# TYPE endpoint_slice_controller_desired_endpoint_slices gauge +endpoint_slice_controller_desired_endpoint_slices 1 +# HELP endpoint_slice_controller_endpoints_desired [BETA] Number of endpoints desired +# TYPE endpoint_slice_controller_endpoints_desired gauge +endpoint_slice_controller_endpoints_desired 2 +# HELP endpoint_slice_controller_num_endpoint_slices [BETA] Number of EndpointSlices +# TYPE endpoint_slice_controller_num_endpoint_slices gauge +endpoint_slice_controller_num_endpoint_slices 1 +# HELP endpoint_slice_controller_services_count_by_traffic_distribution [BETA] Number of Services using some specific trafficDistribution +# TYPE endpoint_slice_controller_services_count_by_traffic_distribution gauge +endpoint_slice_controller_services_count_by_traffic_distribution{traffic_distribution="PreferClose"} 1 +` + + if err := testutil.GatherAndCompare( + legacyregistry.DefaultGatherer, + strings.NewReader(want), + "endpoint_slice_controller_endpoints_desired", + "endpoint_slice_controller_num_endpoint_slices", + "endpoint_slice_controller_desired_endpoint_slices", + "endpoint_slice_controller_services_count_by_traffic_distribution", + ); err != nil { + t.Fatal(err) + } +} + // When a Service has a non-nil deletionTimestamp we want to avoid creating any // new EndpointSlices but continue to allow updates and deletes through. This // test uses 3 EndpointSlices, 1 "to-create", 1 "to-update", and 1 "to-delete". @@ -1417,7 +1570,7 @@ func TestReconcilerFinalizeSvcDeletionTimestamp(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { client := newClientset() - setupMetrics() + resetMetrics() r := newReconciler(client, []*corev1.Node{{ObjectMeta: metav1.ObjectMeta{Name: "node-1"}}}, defaultMaxEndpointsPerSlice) namespace := "test" @@ -1670,7 +1823,7 @@ func TestReconcilerPodMissingNode(t *testing.T) { for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { client := newClientset() - setupMetrics() + resetMetrics() r := newReconciler(client, tc.existingNodes, defaultMaxEndpointsPerSlice) logger, _ := ktesting.NewTestContext(t) @@ -1900,7 +2053,7 @@ func TestReconcileTopology(t *testing.T) { createEndpointSlices(t, client, ns, tc.existingSlices) logger, _ := ktesting.NewTestContext(t) - setupMetrics() + resetMetrics() r := newReconciler(client, tc.nodes, defaultMaxEndpointsPerSlice) if tc.topologyCacheEnabled { r.topologyCache = topologycache.NewTopologyCache() @@ -2162,7 +2315,7 @@ func TestReconcile_TrafficDistribution(t *testing.T) { t.Run(tc.name, func(t *testing.T) { client := newClientset() logger, _ := ktesting.NewTestContext(t) - setupMetrics() + resetMetrics() r := newReconciler(client, nodes, defaultMaxEndpointsPerSlice) r.preferSameTrafficDistribution = tc.preferSameFeatureGateEnabled @@ -2214,8 +2367,7 @@ func TestReconcile_TrafficDistribution(t *testing.T) { func TestReconcileHeadlessServiceNoPorts(t *testing.T) { namespace := "test" client := newClientset() - setupMetrics() - + resetMetrics() svc := corev1.Service{ ObjectMeta: metav1.ObjectMeta{ Name: "headless-no-ports", @@ -2479,7 +2631,7 @@ func expectMetrics(t *testing.T, em expectedMetrics) { handleErr(t, err, "slicesChangedPreferSameNode") actualSlicesChangedPerSyncTrafficDist := actualSlicesChangedPreferClose + actualSlicesChangedPreferSameZone + actualSlicesChangedPreferSameNode if actualSlicesChangedPerSyncTrafficDist != float64(em.slicesChangedPerSyncTrafficDist) { - t.Errorf("Expected slicesChangedPerSyncTrafficDist to be %d, got %v", em.slicesChangedPerSyncTrafficDist, actualSlicesChangedPerSyncTopology) + t.Errorf("Expected slicesChangedPerSyncTrafficDist to be %d, got %v", em.slicesChangedPerSyncTrafficDist, actualSlicesChangedPerSyncTrafficDist) } actualSyncSuccesses, err := testutil.GetCounterMetricValue(metrics.EndpointSliceSyncs.WithLabelValues("success")) @@ -2494,7 +2646,12 @@ func expectMetrics(t *testing.T, em expectedMetrics) { t.Errorf("Expected endpointSliceSyncErrors to be %d, got %v", em.syncErrors, actualSyncErrors) } - for _, trafficDistribution := range []string{"PreferClose", "ImplementationSpecific"} { + for _, trafficDistribution := range []string{ + "PreferClose", + "PreferSameZone", + "PreferSameNode", + "ImplementationSpecific", + } { gotServicesCount, err := testutil.GetGaugeMetricValue(metrics.ServicesCountByTrafficDistribution.WithLabelValues(trafficDistribution)) var wantServicesCount int if em.servicesCountByTrafficDistribution != nil { @@ -2513,8 +2670,7 @@ func handleErr(t *testing.T, err error, metricName string) { } } -func setupMetrics() { - metrics.RegisterMetrics() +func resetMetrics() { metrics.NumEndpointSlices.Reset() metrics.DesiredEndpointSlices.Reset() metrics.EndpointsDesired.Reset() diff --git a/test/instrumentation/testdata/stable-metrics-list.yaml b/test/instrumentation/testdata/stable-metrics-list.yaml index eae9b4fe660..ab5f1087118 100644 --- a/test/instrumentation/testdata/stable-metrics-list.yaml +++ b/test/instrumentation/testdata/stable-metrics-list.yaml @@ -208,6 +208,70 @@ help: The count of disabled metrics. type: Counter stabilityLevel: BETA +- name: desired_endpoint_slices + subsystem: endpoint_slice_controller + help: Number of EndpointSlices that would exist with perfect endpoint allocation + type: Gauge + stabilityLevel: BETA +- name: endpoints_added_per_sync + subsystem: endpoint_slice_controller + help: Number of endpoints added on each Service sync + type: Histogram + stabilityLevel: BETA + buckets: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 1024 + - 2048 + - 4096 + - 8192 + - 16384 + - 32768 +- name: endpoints_desired + subsystem: endpoint_slice_controller + help: Number of endpoints desired + type: Gauge + stabilityLevel: BETA +- name: endpoints_removed_per_sync + subsystem: endpoint_slice_controller + help: Number of endpoints removed on each Service sync + type: Histogram + stabilityLevel: BETA + buckets: + - 2 + - 4 + - 8 + - 16 + - 32 + - 64 + - 128 + - 256 + - 512 + - 1024 + - 2048 + - 4096 + - 8192 + - 16384 + - 32768 +- name: num_endpoint_slices + subsystem: endpoint_slice_controller + help: Number of EndpointSlices + type: Gauge + stabilityLevel: BETA +- name: services_count_by_traffic_distribution + subsystem: endpoint_slice_controller + help: Number of Services using some specific trafficDistribution + type: Gauge + stabilityLevel: BETA + labels: + - traffic_distribution - name: hidden_metrics_total help: The count of hidden metrics. type: Counter