Merge pull request #138799 from rzlink/fix/windows-e2e-pod-overhead

e2e/windows: respect pod.Spec.Overhead in Memory Limits and Kubelet-Stats tests
2026-06-09 00:34:10 -04:00 · 2026-05-14 03:07:48 +05:30 · 2026-05-14 03:07:48 +05:30 · 6b0e464c7a
commit 6b0e464c7a
parent f7b19d4927 139746d315
3 changed files with 218 additions and 9 deletions
--- a/test/e2e/windows/kubelet_stats.go
+++ b/test/e2e/windows/kubelet_stats.go
@ -45,7 +45,7 @@ var _ = sigDescribe(feature.Windows, "Kubelet-Stats", framework.WithSerial(), sk

 	ginkgo.Describe("Kubelet stats collection for Windows nodes", func() {

-		ginkgo.Context("when running 10 pods", func() {
+		ginkgo.Context("when running up to 10 pods", func() {
 			// 10 seconds is the default scrape timeout for metrics-server and kube-prometheus
 			ginkgo.It("should return within 10 seconds", func(ctx context.Context) {

@ -54,14 +54,45 @@ var _ = sigDescribe(feature.Windows, "Kubelet-Stats", framework.WithSerial(), sk
 				framework.ExpectNoError(err, "Error finding Windows node")
 				framework.Logf("Using node: %v", targetNode.Name)

-				ginkgo.By("Scheduling 10 pods")
+				// Adjust pod count for any per-pod overhead the cluster's admission chain
+				// injects (Pod Overhead, KEP-688); a no-op when overhead is 0.
+				const baselineNumPods = 10
+				const minNumPods = 3
+				numPods := baselineNumPods
+				overhead, err := detectPodOverheadMemory(ctx, f.ClientSet, f.Namespace.Name)
+				framework.ExpectNoError(err, "detecting pod overhead memory")
+				if overhead > 0 {
+					// Use admission accounting (allocatable - sum of admitted requests incl. overhead),
+					// not /stats/summary's Memory.AvailableBytes: the latter is a working-set view
+					// and can disagree with admission when a prior [Serial] test still has Terminating pods.
+					allocatable := targetNode.Status.Allocatable.Memory().Value()
+					waitForNodeMemoryToSettle(ctx, f.ClientSet, targetNode.Name, overhead+windowsTestMemorySafetyBuffer)
+					existing := sumExistingPodMemoryReservation(ctx, f.ClientSet, targetNode.Name)
+					maxPods := (allocatable - existing - windowsTestMemorySafetyBuffer) / overhead
+					if maxPods < int64(numPods) {
+						numPods = int(maxPods)
+					}
+					// Fail when the node lacks the capacity to schedule the minimum
+					// required pod count. This indicates a misconfigured test cluster
+					// (allocatable too small, existing reservation unexpectedly large,
+					// or pod overhead higher than the cluster was sized for); surface
+					// it loudly rather than silently skipping the test.
+					if numPods < minNumPods {
+						framework.Failf("Node %s has insufficient memory capacity to schedule %d pods: allocatable=%d, existing-reservation=%d, per-pod-overhead=%d, safety-buffer=%d, max-pods=%d. Right-size the test cluster, lower per-pod overhead, or reduce existing reservation.",
+							targetNode.Name, minNumPods, allocatable, existing, overhead, windowsTestMemorySafetyBuffer, maxPods)
+					}
+					framework.Logf("Adjusted pod count to %d (baseline=%d, overhead=%d bytes, allocatable=%d bytes, existing-reservation=%d bytes, safety-buffer=%d bytes)",
+						numPods, baselineNumPods, overhead, allocatable, existing, windowsTestMemorySafetyBuffer)
+				}
+
+				ginkgo.By(fmt.Sprintf("Scheduling %d pods", numPods))
 				powershellImage := imageutils.GetConfig(imageutils.BusyBox)
-				pods := newKubeletStatsTestPods(10, powershellImage, targetNode.Name)
+				pods := newKubeletStatsTestPods(numPods, powershellImage, targetNode.Name)
 				e2epod.NewPodClient(f).CreateBatch(ctx, pods)

-				ginkgo.By("Waiting up to 3 minutes for pods to be running")
+				ginkgo.By(fmt.Sprintf("Waiting up to 3 minutes for %d pods to be running", numPods))
 				timeout := 3 * time.Minute
-				err = e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 10, timeout)
+				err = e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, numPods, timeout)
 				framework.ExpectNoError(err)

 				ginkgo.By("Getting kubelet stats 5 times and checking average duration")
@ -100,7 +131,7 @@ var _ = sigDescribe(feature.Windows, "Kubelet-Stats", framework.WithSerial(), sk
 							}
 						}
 					}
-					gomega.Expect(statsChecked).To(gomega.Equal(10), "Should find stats for 10 pods in kubelet stats")
+					gomega.Expect(statsChecked).To(gomega.Equal(numPods), fmt.Sprintf("Should find stats for %d pods in kubelet stats", numPods))

 					time.Sleep(5 * time.Second)
 				}
@ -146,6 +177,24 @@ var _ = sigDescribe(feature.Windows, "Kubelet-Stats", skipUnlessWindows(func() {
 				framework.ExpectNoError(err, "Error finding Windows node")
 				framework.Logf("Using node: %v", targetNode.Name)

+				// Fail if the node lacks capacity for the required pod count once overhead
+				// is accounted for. A misconfigured test cluster should be surfaced loudly
+				// rather than silently skipped. No-op when the cluster injects no overhead.
+				const numPods = 3
+				overhead, err := detectPodOverheadMemory(ctx, f.ClientSet, f.Namespace.Name)
+				framework.ExpectNoError(err, "detecting pod overhead memory")
+				if overhead > 0 {
+					allocatable := targetNode.Status.Allocatable.Memory().Value()
+					needed := int64(numPods)*overhead + windowsTestMemorySafetyBuffer
+					waitForNodeMemoryToSettle(ctx, f.ClientSet, targetNode.Name, needed)
+					existing := sumExistingPodMemoryReservation(ctx, f.ClientSet, targetNode.Name)
+					free := allocatable - existing - windowsTestMemorySafetyBuffer
+					if free < int64(numPods)*overhead {
+						framework.Failf("Node %s has insufficient memory capacity to schedule %d pods: allocatable=%d, existing-reservation=%d, per-pod-overhead=%d, safety-buffer=%d, free=%d (need %d). Right-size the test cluster, lower per-pod overhead, or reduce existing reservation.",
+							targetNode.Name, numPods, allocatable, existing, overhead, windowsTestMemorySafetyBuffer, free, int64(numPods)*overhead)
+					}
+				}
+
 				ginkgo.By("Scheduling 3 pods")
 				powershellImage := imageutils.GetConfig(imageutils.BusyBox)
 				pods := newKubeletStatsTestPods(3, powershellImage, targetNode.Name)
--- a/test/e2e/windows/memory_limits.go
+++ b/test/e2e/windows/memory_limits.go
@ -106,12 +106,24 @@ func overrideAllocatableMemoryTest(ctx context.Context, f *framework.Framework,
 	})
 	framework.ExpectNoError(err)

-	framework.Logf("Scheduling 1 pod per node to consume all allocatable memory")
+	// Subtract any per-pod overhead the cluster's admission chain injects
+	// (Pod Overhead, KEP-688) so limit+overhead fits Allocatable.Memory.
+	overhead, err := detectPodOverheadMemory(ctx, f.ClientSet, f.Namespace.Name)
+	framework.ExpectNoError(err, "detecting pod overhead memory")
+
+	framework.Logf("Scheduling 1 pod per node to consume all allocatable memory (detected overhead: %d bytes)", overhead)
 	for _, node := range nodeList.Items {
 		status := node.Status
-		podMemLimt := resource.NewQuantity(status.Allocatable.Memory().Value()-(1024*1024*100), resource.BinarySI)
+		// Subtract overhead (consume pod's own), existing (DaemonSets/system pods,
+		// incl. their overhead), and safety buffer (kubelet accounting noise).
+		existing := sumExistingPodMemoryReservation(ctx, f.ClientSet, node.Name)
+		podMemLimt := resource.NewQuantity(
+			status.Allocatable.Memory().Value()-existing-overhead-windowsTestMemorySafetyBuffer,
+			resource.BinarySI,
+		)
 		podName := "mem-test-" + string(uuid.NewUUID())
-		framework.Logf("Scheduling pod %s on node %s (allocatable memory=%v) with memory limit %v", podName, node.Name, status.Allocatable.Memory(), podMemLimt)
+		framework.Logf("Scheduling pod %s on node %s (allocatable=%d, existing-reservation=%d, overhead=%d, safety-buffer=%d) with memory limit %v",
+			podName, node.Name, status.Allocatable.Memory().Value(), existing, overhead, windowsTestMemorySafetyBuffer, podMemLimt)
 		pod := &v1.Pod{
 			ObjectMeta: metav1.ObjectMeta{
 				Name: podName,
--- a/test/e2e/windows/utils.go
+++ b/test/e2e/windows/utils.go
@ -21,16 +21,21 @@ import (
 	"fmt"
 	"math/rand"
 	"strings"
+	"sync"
 	"time"

 	"github.com/onsi/ginkgo/v2"
 	appsv1 "k8s.io/api/apps/v1"
 	v1 "k8s.io/api/core/v1"
 	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/util/uuid"
 	"k8s.io/apimachinery/pkg/util/wait"
+	clientset "k8s.io/client-go/kubernetes"
 	"k8s.io/kubernetes/pkg/controller/deployment/util"
 	"k8s.io/kubernetes/test/e2e/framework"
 	e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
+	imageutils "k8s.io/kubernetes/test/utils/image"

 	semver "github.com/blang/semver/v4"
 )
@ -99,3 +104,146 @@ func skipUnlessContainerdOneSevenOrGreater(ctx context.Context, f *framework.Fra
 		e2eskipper.Skipf("container runtime is < 1.7.0")
 	}
 }
+
+// windowsTestMemorySafetyBuffer is fixed headroom (in bytes) reserved on a
+// node above the sum of admitted pod requests + pod.Spec.Overhead. It absorbs
+// kubelet/cgroup memory-accounting drift that would otherwise produce flaky
+// OutOfmemory admission errors at the capacity boundary.
+//
+// 256 MiB is empirical, not derived from any kubelet constant: a comfortable
+// upper bound on the accounting jitter we observed in practice (single- to
+// low-tens of MiB) while remaining a small fraction (~5-10%) of typical
+// Windows test-node Allocatable. It can be tightened with measurements if it
+// ever becomes a constraint, or replaced with a fraction of Allocatable if
+// we ever need to run on nodes too small for a fixed buffer.
+const windowsTestMemorySafetyBuffer int64 = 256 * 1024 * 1024
+
+var (
+	podOverheadMemoryOnce  sync.Once
+	podOverheadMemoryCache int64
+	podOverheadMemoryErr   error
+)
+
+// detectPodOverheadMemory returns the per-pod memory overhead the cluster's
+// admission chain injects (Pod Overhead, KEP-688), in bytes, or 0 if no
+// overhead is applied. The error is non-nil only when the probe itself
+// could not be completed (e.g., the API server rejected the DryRun create);
+// a successful probe with no overhead returns (0, nil).
+//
+// Result and error are cached for the lifetime of the test process — the
+// admission chain is traversed at most once per run. See
+// computePodOverheadMemory for probe details.
+func detectPodOverheadMemory(ctx context.Context, c clientset.Interface, namespace string) (int64, error) {
+	podOverheadMemoryOnce.Do(func() {
+		podOverheadMemoryCache, podOverheadMemoryErr = computePodOverheadMemory(ctx, c, namespace)
+	})
+	return podOverheadMemoryCache, podOverheadMemoryErr
+}
+
+// computePodOverheadMemory performs a single DryRun pod create and inspects
+// the mutated result for pod.Spec.Overhead. DryRun (rather than reading
+// RuntimeClass directly) is needed because admission webhooks may inject
+// overhead conditionally on namespace, labels, or other request-scoped data
+// that is not visible from a static API read.
+//
+// Assumes overhead is RuntimeClass-driven (or otherwise static per pod) and
+// does not vary with container count, image, or per-container requests; a
+// future webhook that scaled overhead by pod shape would invalidate the
+// capacity calculations in this package. Callers must use a namespace that
+// admits a pause-image pod (current callers set PodSecurity LevelPrivileged).
+func computePodOverheadMemory(ctx context.Context, c clientset.Interface, namespace string) (int64, error) {
+	probePod := &v1.Pod{
+		ObjectMeta: metav1.ObjectMeta{
+			Name: "overhead-probe-" + string(uuid.NewUUID()),
+		},
+		Spec: v1.PodSpec{
+			Containers: []v1.Container{{
+				Name:  "probe",
+				Image: imageutils.GetPauseImageName(),
+			}},
+			NodeSelector: map[string]string{
+				"kubernetes.io/os": "windows",
+			},
+		},
+	}
+	result, err := c.CoreV1().Pods(namespace).Create(
+		ctx, probePod, metav1.CreateOptions{DryRun: []string{metav1.DryRunAll}},
+	)
+	if err != nil {
+		return 0, fmt.Errorf("DryRun pod create for overhead detection failed: %w", err)
+	}
+	if result.Spec.Overhead == nil {
+		return 0, nil
+	}
+	if mem, ok := result.Spec.Overhead[v1.ResourceMemory]; ok {
+		framework.Logf("Detected pod overhead memory: %d bytes (per Pod Overhead admission, KEP-688)", mem.Value())
+		return mem.Value(), nil
+	}
+	return 0, nil
+}
+
+// waitForNodeMemoryToSettle polls until the node has at least neededBytes
+// of free admittable memory (allocatable - sum of admitted-pod requests),
+// or the 90s timeout elapses. Useful in [Serial] tests that follow one
+// which leaves large pods Terminating — their requests stay counted until
+// fully removed.
+//
+// On timeout the function does not fail the test; it logs a tagged
+// "did NOT settle" message so a downstream OutOfmemory failure points
+// back at the cause (leftover pods) rather than appearing as an
+// unexplained scheduling error.
+func waitForNodeMemoryToSettle(ctx context.Context, c clientset.Interface, nodeName string, neededBytes int64) {
+	node, err := c.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
+	if err != nil {
+		framework.Logf("waitForNodeMemoryToSettle: cannot get node %s: %v (skipping wait)", nodeName, err)
+		return
+	}
+	allocatable := node.Status.Allocatable.Memory().Value()
+	var lastFree int64
+	pollErr := wait.PollUntilContextTimeout(ctx, 5*time.Second, 90*time.Second, true, func(ctx context.Context) (bool, error) {
+		existing := sumExistingPodMemoryReservation(ctx, c, nodeName)
+		lastFree = allocatable - existing
+		framework.Logf("Waiting for node %s memory to settle: existing-reservation=%d free=%d (need >= %d)",
+			nodeName, existing, lastFree, neededBytes)
+		return lastFree >= neededBytes, nil
+	})
+	if pollErr == nil {
+		framework.Logf("Node %s memory settled: free=%d (need >= %d)", nodeName, lastFree, neededBytes)
+		return
+	}
+	framework.Logf("Node %s memory did NOT settle within 90s: free=%d (need >= %d): %v "+
+		"(test will likely fail with OutOfmemory; check for leftover Terminating pods)",
+		nodeName, lastFree, neededBytes, pollErr)
+}
+
+// sumExistingPodMemoryReservation returns the total memory reservation
+// (sum of container memory requests + pod overhead) for all non-terminal pods
+// scheduled on the given node. Tests that compute "remaining schedulable
+// memory" (e.g., the Memory Limits test) must subtract this from
+// allocatable to leave room for DaemonSets and other system pods.
+func sumExistingPodMemoryReservation(ctx context.Context, c clientset.Interface, nodeName string) int64 {
+	podList, err := c.CoreV1().Pods("").List(ctx, metav1.ListOptions{
+		FieldSelector: "spec.nodeName=" + nodeName,
+	})
+	if err != nil {
+		framework.Logf("Could not list pods on node %s: %v (assuming 0 reservation)", nodeName, err)
+		return 0
+	}
+	var total int64
+	for _, p := range podList.Items {
+		if p.Status.Phase == v1.PodSucceeded || p.Status.Phase == v1.PodFailed {
+			continue
+		}
+		for _, c := range p.Spec.Containers {
+			if mem, ok := c.Resources.Requests[v1.ResourceMemory]; ok {
+				total += mem.Value()
+			}
+		}
+		if p.Spec.Overhead != nil {
+			if oh, ok := p.Spec.Overhead[v1.ResourceMemory]; ok {
+				total += oh.Value()
+			}
+		}
+	}
+	return total
+}