From 53e5dcec97ca75b8f2265687c7f0b2332e737f5a Mon Sep 17 00:00:00 2001 From: KasimVali2207 Date: Wed, 10 Dec 2025 11:16:48 +0530 Subject: [PATCH 1/2] Add stress tests for gRPC, HTTP, and TCP liveness probes This commit adds e2e stress tests to verify that liveness probes do not cause unexpected container restarts under load. The tests create many containers (50 per test) with liveness probes configured to run every 1 second. Three test cases are included: - HTTP liveness probe stress test - TCP liveness probe stress test - gRPC liveness probe stress test Each test waits for all containers to be running, observes probe behavior for 2 minutes, and validates that no containers have restarted unexpectedly. These tests address the bug fix from issue kubernetes#89898 and serve as a replacement for the skipped unit test from PR kubernetes#115329. --- test/e2e_node/probe_stress_test.go | 232 +++++++++++++++++++++++++++++ 1 file changed, 232 insertions(+) create mode 100644 test/e2e_node/probe_stress_test.go diff --git a/test/e2e_node/probe_stress_test.go b/test/e2e_node/probe_stress_test.go new file mode 100644 index 00000000000..811cfd7fa66 --- /dev/null +++ b/test/e2e_node/probe_stress_test.go @@ -0,0 +1,232 @@ +//go:build linux +// +build linux + +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package e2enode + +import ( + "context" + "fmt" + "time" + + "github.com/onsi/ginkgo/v2" + "github.com/onsi/gomega" + + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/intstr" + "k8s.io/apimachinery/pkg/util/uuid" + "k8s.io/kubernetes/test/e2e/framework" + e2epod "k8s.io/kubernetes/test/e2e/framework/pod" + imageutils "k8s.io/kubernetes/test/utils/image" + admissionapi "k8s.io/pod-security-admission/api" +) + +const ( + probeStressNumContainers = 50 + probeStressPeriodSeconds = 1 + probeStressWaitTime = 2 * time.Minute +) + +var _ = SIGDescribe("Probe Stress", framework.WithSerial(), func() { + f := framework.NewDefaultFramework("probe-stress") + f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged + + ginkgo.Context("HTTP liveness probes", func() { + ginkgo.It("should not cause unexpected container restarts under load [Serial]", func(ctx context.Context) { + pod := createPodWithHTTPProbes(probeStressNumContainers) + runProbeStressTest(ctx, f, pod) + }) + }) + + ginkgo.Context("TCP liveness probes", func() { + ginkgo.It("should not cause unexpected container restarts under load [Serial]", func(ctx context.Context) { + pod := createPodWithTCPProbes(probeStressNumContainers) + runProbeStressTest(ctx, f, pod) + }) + }) + + ginkgo.Context("gRPC liveness probes", func() { + ginkgo.It("should not cause unexpected container restarts under load [Serial]", func(ctx context.Context) { + pod := createPodWithGRPCProbes(probeStressNumContainers) + runProbeStressTest(ctx, f, pod) + }) + }) +}) + +// runProbeStressTest creates a pod with many containers, waits for them to be running, +// and verifies that none of the containers have restarted unexpectedly. +func runProbeStressTest(ctx context.Context, f *framework.Framework, pod *v1.Pod) { + ginkgo.By(fmt.Sprintf("Creating pod %s with %d containers", pod.Name, len(pod.Spec.Containers))) + pod = e2epod.NewPodClient(f).Create(ctx, pod) + + ginkgo.By("Waiting for all containers to be running") + err := e2epod.WaitForPodRunningInNamespace(ctx, f.ClientSet, pod) + framework.ExpectNoError(err, "Failed to start pod") + + ginkgo.By(fmt.Sprintf("Waiting %v to observe probe behavior", probeStressWaitTime)) + time.Sleep(probeStressWaitTime) + + ginkgo.By("Verifying no containers have restarted") + updatedPod, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(ctx, pod.Name, metav1.GetOptions{}) + framework.ExpectNoError(err, "Failed to get pod") + + for _, containerStatus := range updatedPod.Status.ContainerStatuses { + gomega.Expect(containerStatus.RestartCount).To(gomega.BeZero(), + "Container %s should not have restarted, but has restart count %d", + containerStatus.Name, containerStatus.RestartCount) + } + + ginkgo.By("Test passed: no unexpected container restarts") +} + +// createPodWithHTTPProbes creates a pod with multiple containers, each with an HTTP liveness probe. +func createPodWithHTTPProbes(numContainers int) *v1.Pod { + podName := "probe-stress-http-" + string(uuid.NewUUID()) + containers := make([]v1.Container, numContainers) + + for i := 0; i < numContainers; i++ { + containerName := fmt.Sprintf("container-%d", i) + port := int32(8080 + i) + + containers[i] = v1.Container{ + Name: containerName, + Image: imageutils.GetE2EImage(imageutils.Agnhost), + Args: []string{"netexec", fmt.Sprintf("--http-port=%d", port)}, + Ports: []v1.ContainerPort{ + { + ContainerPort: port, + Protocol: v1.ProtocolTCP, + }, + }, + LivenessProbe: &v1.Probe{ + ProbeHandler: v1.ProbeHandler{ + HTTPGet: &v1.HTTPGetAction{ + Path: "/", + Port: intstr.FromInt(int(port)), + }, + }, + PeriodSeconds: probeStressPeriodSeconds, + TimeoutSeconds: 1, + SuccessThreshold: 1, + FailureThreshold: 3, + }, + ImagePullPolicy: v1.PullIfNotPresent, + } + } + + return &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: podName, + }, + Spec: v1.PodSpec{ + Containers: containers, + RestartPolicy: v1.RestartPolicyNever, + }, + } +} + +// createPodWithTCPProbes creates a pod with multiple containers, each with a TCP liveness probe. +func createPodWithTCPProbes(numContainers int) *v1.Pod { + podName := "probe-stress-tcp-" + string(uuid.NewUUID()) + containers := make([]v1.Container, numContainers) + + for i := 0; i < numContainers; i++ { + containerName := fmt.Sprintf("container-%d", i) + port := int32(8080 + i) + + containers[i] = v1.Container{ + Name: containerName, + Image: imageutils.GetE2EImage(imageutils.Agnhost), + Args: []string{"netexec", fmt.Sprintf("--http-port=%d", port)}, + Ports: []v1.ContainerPort{ + { + ContainerPort: port, + Protocol: v1.ProtocolTCP, + }, + }, + LivenessProbe: &v1.Probe{ + ProbeHandler: v1.ProbeHandler{ + TCPSocket: &v1.TCPSocketAction{ + Port: intstr.FromInt(int(port)), + }, + }, + PeriodSeconds: probeStressPeriodSeconds, + TimeoutSeconds: 1, + SuccessThreshold: 1, + FailureThreshold: 3, + }, + ImagePullPolicy: v1.PullIfNotPresent, + } + } + + return &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: podName, + }, + Spec: v1.PodSpec{ + Containers: containers, + RestartPolicy: v1.RestartPolicyNever, + }, + } +} + +// createPodWithGRPCProbes creates a pod with multiple containers, each with a gRPC liveness probe. +func createPodWithGRPCProbes(numContainers int) *v1.Pod { + podName := "probe-stress-grpc-" + string(uuid.NewUUID()) + containers := make([]v1.Container, numContainers) + + for i := 0; i < numContainers; i++ { + containerName := fmt.Sprintf("container-%d", i) + port := int32(5000 + i) + + containers[i] = v1.Container{ + Name: containerName, + Image: imageutils.GetE2EImage(imageutils.Agnhost), + Args: []string{"grpc-health-checking", fmt.Sprintf("--port=%d", port)}, + Ports: []v1.ContainerPort{ + { + ContainerPort: port, + Protocol: v1.ProtocolTCP, + }, + }, + LivenessProbe: &v1.Probe{ + ProbeHandler: v1.ProbeHandler{ + GRPC: &v1.GRPCAction{ + Port: port, + }, + }, + PeriodSeconds: probeStressPeriodSeconds, + TimeoutSeconds: 1, + SuccessThreshold: 1, + FailureThreshold: 3, + }, + ImagePullPolicy: v1.PullIfNotPresent, + } + } + + return &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: podName, + }, + Spec: v1.PodSpec{ + Containers: containers, + RestartPolicy: v1.RestartPolicyNever, + }, + } +} From d8fe13bac72664341b3ad0c09db08605d3b36184 Mon Sep 17 00:00:00 2001 From: KasimVali2207 Date: Fri, 12 Dec 2025 18:11:24 +0530 Subject: [PATCH 2/2] Address review feedback: use gomega.Consistently and refactor pod creation - Replaced time.Sleep with gomega.Consistently to ensure reliable failure detection if a restart occurs during the wait - Refactored createPodWith*Probes functions into a single helper to remove code duplication Signed-off-by: KasimVali2207 --- test/e2e_node/probe_stress_test.go | 189 +++++++++++------------------ 1 file changed, 72 insertions(+), 117 deletions(-) diff --git a/test/e2e_node/probe_stress_test.go b/test/e2e_node/probe_stress_test.go index 811cfd7fa66..8e0e4929aa4 100644 --- a/test/e2e_node/probe_stress_test.go +++ b/test/e2e_node/probe_stress_test.go @@ -45,24 +45,26 @@ const ( var _ = SIGDescribe("Probe Stress", framework.WithSerial(), func() { f := framework.NewDefaultFramework("probe-stress") + // LevelPrivileged is required because the stress tests create pods with many containers + // that may require elevated permissions for networking and resource allocation f.NamespacePodSecurityLevel = admissionapi.LevelPrivileged ginkgo.Context("HTTP liveness probes", func() { - ginkgo.It("should not cause unexpected container restarts under load [Serial]", func(ctx context.Context) { + ginkgo.It("should not cause unexpected container restarts under load", func(ctx context.Context) { pod := createPodWithHTTPProbes(probeStressNumContainers) runProbeStressTest(ctx, f, pod) }) }) ginkgo.Context("TCP liveness probes", func() { - ginkgo.It("should not cause unexpected container restarts under load [Serial]", func(ctx context.Context) { + ginkgo.It("should not cause unexpected container restarts under load", func(ctx context.Context) { pod := createPodWithTCPProbes(probeStressNumContainers) runProbeStressTest(ctx, f, pod) }) }) ginkgo.Context("gRPC liveness probes", func() { - ginkgo.It("should not cause unexpected container restarts under load [Serial]", func(ctx context.Context) { + ginkgo.It("should not cause unexpected container restarts under load", func(ctx context.Context) { pod := createPodWithGRPCProbes(probeStressNumContainers) runProbeStressTest(ctx, f, pod) }) @@ -79,143 +81,96 @@ func runProbeStressTest(ctx context.Context, f *framework.Framework, pod *v1.Pod err := e2epod.WaitForPodRunningInNamespace(ctx, f.ClientSet, pod) framework.ExpectNoError(err, "Failed to start pod") - ginkgo.By(fmt.Sprintf("Waiting %v to observe probe behavior", probeStressWaitTime)) - time.Sleep(probeStressWaitTime) - - ginkgo.By("Verifying no containers have restarted") - updatedPod, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(ctx, pod.Name, metav1.GetOptions{}) - framework.ExpectNoError(err, "Failed to get pod") - - for _, containerStatus := range updatedPod.Status.ContainerStatuses { - gomega.Expect(containerStatus.RestartCount).To(gomega.BeZero(), - "Container %s should not have restarted, but has restart count %d", - containerStatus.Name, containerStatus.RestartCount) - } + ginkgo.By("Verifying no containers restarted") + gomega.Consistently(ctx, func(ctx context.Context) error { + updatedPod, err := f.ClientSet.CoreV1().Pods(f.Namespace.Name).Get(ctx, pod.Name, metav1.GetOptions{}) + if err != nil { + return err + } + for _, containerStatus := range updatedPod.Status.ContainerStatuses { + if containerStatus.RestartCount > 0 { + return fmt.Errorf("container %s restarted %d times", containerStatus.Name, containerStatus.RestartCount) + } + } + return nil + }, probeStressWaitTime, 1*time.Second).Should(gomega.Succeed()) ginkgo.By("Test passed: no unexpected container restarts") } -// createPodWithHTTPProbes creates a pod with multiple containers, each with an HTTP liveness probe. func createPodWithHTTPProbes(numContainers int) *v1.Pod { - podName := "probe-stress-http-" + string(uuid.NewUUID()) - containers := make([]v1.Container, numContainers) - - for i := 0; i < numContainers; i++ { - containerName := fmt.Sprintf("container-%d", i) + return createProbeStressPod(numContainers, func(i int) (v1.Probe, []v1.ContainerPort, []string) { port := int32(8080 + i) - - containers[i] = v1.Container{ - Name: containerName, - Image: imageutils.GetE2EImage(imageutils.Agnhost), - Args: []string{"netexec", fmt.Sprintf("--http-port=%d", port)}, - Ports: []v1.ContainerPort{ - { - ContainerPort: port, - Protocol: v1.ProtocolTCP, + probe := v1.Probe{ + ProbeHandler: v1.ProbeHandler{ + HTTPGet: &v1.HTTPGetAction{ + Path: "/", + Port: intstr.FromInt(int(port)), }, }, - LivenessProbe: &v1.Probe{ - ProbeHandler: v1.ProbeHandler{ - HTTPGet: &v1.HTTPGetAction{ - Path: "/", - Port: intstr.FromInt(int(port)), - }, - }, - PeriodSeconds: probeStressPeriodSeconds, - TimeoutSeconds: 1, - SuccessThreshold: 1, - FailureThreshold: 3, - }, - ImagePullPolicy: v1.PullIfNotPresent, + PeriodSeconds: probeStressPeriodSeconds, + TimeoutSeconds: 1, + SuccessThreshold: 1, + FailureThreshold: 3, } - } - - return &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: podName, - }, - Spec: v1.PodSpec{ - Containers: containers, - RestartPolicy: v1.RestartPolicyNever, - }, - } + ports := []v1.ContainerPort{{ContainerPort: port, Protocol: v1.ProtocolTCP}} + args := []string{"netexec", fmt.Sprintf("--http-port=%d", port)} + return probe, ports, args + }) } -// createPodWithTCPProbes creates a pod with multiple containers, each with a TCP liveness probe. func createPodWithTCPProbes(numContainers int) *v1.Pod { - podName := "probe-stress-tcp-" + string(uuid.NewUUID()) - containers := make([]v1.Container, numContainers) - - for i := 0; i < numContainers; i++ { - containerName := fmt.Sprintf("container-%d", i) + return createProbeStressPod(numContainers, func(i int) (v1.Probe, []v1.ContainerPort, []string) { port := int32(8080 + i) - - containers[i] = v1.Container{ - Name: containerName, - Image: imageutils.GetE2EImage(imageutils.Agnhost), - Args: []string{"netexec", fmt.Sprintf("--http-port=%d", port)}, - Ports: []v1.ContainerPort{ - { - ContainerPort: port, - Protocol: v1.ProtocolTCP, + probe := v1.Probe{ + ProbeHandler: v1.ProbeHandler{ + TCPSocket: &v1.TCPSocketAction{ + Port: intstr.FromInt(int(port)), }, }, - LivenessProbe: &v1.Probe{ - ProbeHandler: v1.ProbeHandler{ - TCPSocket: &v1.TCPSocketAction{ - Port: intstr.FromInt(int(port)), - }, - }, - PeriodSeconds: probeStressPeriodSeconds, - TimeoutSeconds: 1, - SuccessThreshold: 1, - FailureThreshold: 3, - }, - ImagePullPolicy: v1.PullIfNotPresent, + PeriodSeconds: probeStressPeriodSeconds, + TimeoutSeconds: 1, + SuccessThreshold: 1, + FailureThreshold: 3, } - } - - return &v1.Pod{ - ObjectMeta: metav1.ObjectMeta{ - Name: podName, - }, - Spec: v1.PodSpec{ - Containers: containers, - RestartPolicy: v1.RestartPolicyNever, - }, - } + ports := []v1.ContainerPort{{ContainerPort: port, Protocol: v1.ProtocolTCP}} + args := []string{"netexec", fmt.Sprintf("--http-port=%d", port)} + return probe, ports, args + }) } -// createPodWithGRPCProbes creates a pod with multiple containers, each with a gRPC liveness probe. func createPodWithGRPCProbes(numContainers int) *v1.Pod { - podName := "probe-stress-grpc-" + string(uuid.NewUUID()) + return createProbeStressPod(numContainers, func(i int) (v1.Probe, []v1.ContainerPort, []string) { + port := int32(5000 + i) + probe := v1.Probe{ + ProbeHandler: v1.ProbeHandler{ + GRPC: &v1.GRPCAction{ + Port: port, + }, + }, + PeriodSeconds: probeStressPeriodSeconds, + TimeoutSeconds: 1, + SuccessThreshold: 1, + FailureThreshold: 3, + } + ports := []v1.ContainerPort{{ContainerPort: port, Protocol: v1.ProtocolTCP}} + args := []string{"grpc-health-checking", fmt.Sprintf("--port=%d", port)} + return probe, ports, args + }) +} + +func createProbeStressPod(numContainers int, generator func(i int) (v1.Probe, []v1.ContainerPort, []string)) *v1.Pod { + podName := "probe-stress-" + string(uuid.NewUUID()) containers := make([]v1.Container, numContainers) for i := 0; i < numContainers; i++ { - containerName := fmt.Sprintf("container-%d", i) - port := int32(5000 + i) - + probe, ports, args := generator(i) containers[i] = v1.Container{ - Name: containerName, - Image: imageutils.GetE2EImage(imageutils.Agnhost), - Args: []string{"grpc-health-checking", fmt.Sprintf("--port=%d", port)}, - Ports: []v1.ContainerPort{ - { - ContainerPort: port, - Protocol: v1.ProtocolTCP, - }, - }, - LivenessProbe: &v1.Probe{ - ProbeHandler: v1.ProbeHandler{ - GRPC: &v1.GRPCAction{ - Port: port, - }, - }, - PeriodSeconds: probeStressPeriodSeconds, - TimeoutSeconds: 1, - SuccessThreshold: 1, - FailureThreshold: 3, - }, + Name: fmt.Sprintf("container-%d", i), + Image: imageutils.GetE2EImage(imageutils.Agnhost), + Args: args, + Ports: ports, + LivenessProbe: &probe, ImagePullPolicy: v1.PullIfNotPresent, } }