diff --git a/test/e2e/windows/kubelet_stats.go b/test/e2e/windows/kubelet_stats.go index 3ea3e29e2c9..a74c8b443b0 100644 --- a/test/e2e/windows/kubelet_stats.go +++ b/test/e2e/windows/kubelet_stats.go @@ -45,7 +45,7 @@ var _ = sigDescribe(feature.Windows, "Kubelet-Stats", framework.WithSerial(), sk ginkgo.Describe("Kubelet stats collection for Windows nodes", func() { - ginkgo.Context("when running 10 pods", func() { + ginkgo.Context("when running up to 10 pods", func() { // 10 seconds is the default scrape timeout for metrics-server and kube-prometheus ginkgo.It("should return within 10 seconds", func(ctx context.Context) { @@ -54,14 +54,45 @@ var _ = sigDescribe(feature.Windows, "Kubelet-Stats", framework.WithSerial(), sk framework.ExpectNoError(err, "Error finding Windows node") framework.Logf("Using node: %v", targetNode.Name) - ginkgo.By("Scheduling 10 pods") + // Adjust pod count for any per-pod overhead the cluster's admission chain + // injects (Pod Overhead, KEP-688); a no-op when overhead is 0. + const baselineNumPods = 10 + const minNumPods = 3 + numPods := baselineNumPods + overhead, err := detectPodOverheadMemory(ctx, f.ClientSet, f.Namespace.Name) + framework.ExpectNoError(err, "detecting pod overhead memory") + if overhead > 0 { + // Use admission accounting (allocatable - sum of admitted requests incl. overhead), + // not /stats/summary's Memory.AvailableBytes: the latter is a working-set view + // and can disagree with admission when a prior [Serial] test still has Terminating pods. + allocatable := targetNode.Status.Allocatable.Memory().Value() + waitForNodeMemoryToSettle(ctx, f.ClientSet, targetNode.Name, overhead+windowsTestMemorySafetyBuffer) + existing := sumExistingPodMemoryReservation(ctx, f.ClientSet, targetNode.Name) + maxPods := (allocatable - existing - windowsTestMemorySafetyBuffer) / overhead + if maxPods < int64(numPods) { + numPods = int(maxPods) + } + // Fail when the node lacks the capacity to schedule the minimum + // required pod count. This indicates a misconfigured test cluster + // (allocatable too small, existing reservation unexpectedly large, + // or pod overhead higher than the cluster was sized for); surface + // it loudly rather than silently skipping the test. + if numPods < minNumPods { + framework.Failf("Node %s has insufficient memory capacity to schedule %d pods: allocatable=%d, existing-reservation=%d, per-pod-overhead=%d, safety-buffer=%d, max-pods=%d. Right-size the test cluster, lower per-pod overhead, or reduce existing reservation.", + targetNode.Name, minNumPods, allocatable, existing, overhead, windowsTestMemorySafetyBuffer, maxPods) + } + framework.Logf("Adjusted pod count to %d (baseline=%d, overhead=%d bytes, allocatable=%d bytes, existing-reservation=%d bytes, safety-buffer=%d bytes)", + numPods, baselineNumPods, overhead, allocatable, existing, windowsTestMemorySafetyBuffer) + } + + ginkgo.By(fmt.Sprintf("Scheduling %d pods", numPods)) powershellImage := imageutils.GetConfig(imageutils.BusyBox) - pods := newKubeletStatsTestPods(10, powershellImage, targetNode.Name) + pods := newKubeletStatsTestPods(numPods, powershellImage, targetNode.Name) e2epod.NewPodClient(f).CreateBatch(ctx, pods) - ginkgo.By("Waiting up to 3 minutes for pods to be running") + ginkgo.By(fmt.Sprintf("Waiting up to 3 minutes for %d pods to be running", numPods)) timeout := 3 * time.Minute - err = e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 10, timeout) + err = e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, numPods, timeout) framework.ExpectNoError(err) ginkgo.By("Getting kubelet stats 5 times and checking average duration") @@ -100,7 +131,7 @@ var _ = sigDescribe(feature.Windows, "Kubelet-Stats", framework.WithSerial(), sk } } } - gomega.Expect(statsChecked).To(gomega.Equal(10), "Should find stats for 10 pods in kubelet stats") + gomega.Expect(statsChecked).To(gomega.Equal(numPods), fmt.Sprintf("Should find stats for %d pods in kubelet stats", numPods)) time.Sleep(5 * time.Second) } @@ -146,6 +177,24 @@ var _ = sigDescribe(feature.Windows, "Kubelet-Stats", skipUnlessWindows(func() { framework.ExpectNoError(err, "Error finding Windows node") framework.Logf("Using node: %v", targetNode.Name) + // Fail if the node lacks capacity for the required pod count once overhead + // is accounted for. A misconfigured test cluster should be surfaced loudly + // rather than silently skipped. No-op when the cluster injects no overhead. + const numPods = 3 + overhead, err := detectPodOverheadMemory(ctx, f.ClientSet, f.Namespace.Name) + framework.ExpectNoError(err, "detecting pod overhead memory") + if overhead > 0 { + allocatable := targetNode.Status.Allocatable.Memory().Value() + needed := int64(numPods)*overhead + windowsTestMemorySafetyBuffer + waitForNodeMemoryToSettle(ctx, f.ClientSet, targetNode.Name, needed) + existing := sumExistingPodMemoryReservation(ctx, f.ClientSet, targetNode.Name) + free := allocatable - existing - windowsTestMemorySafetyBuffer + if free < int64(numPods)*overhead { + framework.Failf("Node %s has insufficient memory capacity to schedule %d pods: allocatable=%d, existing-reservation=%d, per-pod-overhead=%d, safety-buffer=%d, free=%d (need %d). Right-size the test cluster, lower per-pod overhead, or reduce existing reservation.", + targetNode.Name, numPods, allocatable, existing, overhead, windowsTestMemorySafetyBuffer, free, int64(numPods)*overhead) + } + } + ginkgo.By("Scheduling 3 pods") powershellImage := imageutils.GetConfig(imageutils.BusyBox) pods := newKubeletStatsTestPods(3, powershellImage, targetNode.Name) diff --git a/test/e2e/windows/memory_limits.go b/test/e2e/windows/memory_limits.go index 31df1a4dd8f..4b4ab091150 100644 --- a/test/e2e/windows/memory_limits.go +++ b/test/e2e/windows/memory_limits.go @@ -106,12 +106,24 @@ func overrideAllocatableMemoryTest(ctx context.Context, f *framework.Framework, }) framework.ExpectNoError(err) - framework.Logf("Scheduling 1 pod per node to consume all allocatable memory") + // Subtract any per-pod overhead the cluster's admission chain injects + // (Pod Overhead, KEP-688) so limit+overhead fits Allocatable.Memory. + overhead, err := detectPodOverheadMemory(ctx, f.ClientSet, f.Namespace.Name) + framework.ExpectNoError(err, "detecting pod overhead memory") + + framework.Logf("Scheduling 1 pod per node to consume all allocatable memory (detected overhead: %d bytes)", overhead) for _, node := range nodeList.Items { status := node.Status - podMemLimt := resource.NewQuantity(status.Allocatable.Memory().Value()-(1024*1024*100), resource.BinarySI) + // Subtract overhead (consume pod's own), existing (DaemonSets/system pods, + // incl. their overhead), and safety buffer (kubelet accounting noise). + existing := sumExistingPodMemoryReservation(ctx, f.ClientSet, node.Name) + podMemLimt := resource.NewQuantity( + status.Allocatable.Memory().Value()-existing-overhead-windowsTestMemorySafetyBuffer, + resource.BinarySI, + ) podName := "mem-test-" + string(uuid.NewUUID()) - framework.Logf("Scheduling pod %s on node %s (allocatable memory=%v) with memory limit %v", podName, node.Name, status.Allocatable.Memory(), podMemLimt) + framework.Logf("Scheduling pod %s on node %s (allocatable=%d, existing-reservation=%d, overhead=%d, safety-buffer=%d) with memory limit %v", + podName, node.Name, status.Allocatable.Memory().Value(), existing, overhead, windowsTestMemorySafetyBuffer, podMemLimt) pod := &v1.Pod{ ObjectMeta: metav1.ObjectMeta{ Name: podName, diff --git a/test/e2e/windows/utils.go b/test/e2e/windows/utils.go index ab8ce886f5a..5466bc6daa9 100644 --- a/test/e2e/windows/utils.go +++ b/test/e2e/windows/utils.go @@ -21,16 +21,21 @@ import ( "fmt" "math/rand" "strings" + "sync" "time" "github.com/onsi/ginkgo/v2" appsv1 "k8s.io/api/apps/v1" v1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/uuid" "k8s.io/apimachinery/pkg/util/wait" + clientset "k8s.io/client-go/kubernetes" "k8s.io/kubernetes/pkg/controller/deployment/util" "k8s.io/kubernetes/test/e2e/framework" e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper" + imageutils "k8s.io/kubernetes/test/utils/image" semver "github.com/blang/semver/v4" ) @@ -99,3 +104,146 @@ func skipUnlessContainerdOneSevenOrGreater(ctx context.Context, f *framework.Fra e2eskipper.Skipf("container runtime is < 1.7.0") } } + +// windowsTestMemorySafetyBuffer is fixed headroom (in bytes) reserved on a +// node above the sum of admitted pod requests + pod.Spec.Overhead. It absorbs +// kubelet/cgroup memory-accounting drift that would otherwise produce flaky +// OutOfmemory admission errors at the capacity boundary. +// +// 256 MiB is empirical, not derived from any kubelet constant: a comfortable +// upper bound on the accounting jitter we observed in practice (single- to +// low-tens of MiB) while remaining a small fraction (~5-10%) of typical +// Windows test-node Allocatable. It can be tightened with measurements if it +// ever becomes a constraint, or replaced with a fraction of Allocatable if +// we ever need to run on nodes too small for a fixed buffer. +const windowsTestMemorySafetyBuffer int64 = 256 * 1024 * 1024 + +var ( + podOverheadMemoryOnce sync.Once + podOverheadMemoryCache int64 + podOverheadMemoryErr error +) + +// detectPodOverheadMemory returns the per-pod memory overhead the cluster's +// admission chain injects (Pod Overhead, KEP-688), in bytes, or 0 if no +// overhead is applied. The error is non-nil only when the probe itself +// could not be completed (e.g., the API server rejected the DryRun create); +// a successful probe with no overhead returns (0, nil). +// +// Result and error are cached for the lifetime of the test process — the +// admission chain is traversed at most once per run. See +// computePodOverheadMemory for probe details. +func detectPodOverheadMemory(ctx context.Context, c clientset.Interface, namespace string) (int64, error) { + podOverheadMemoryOnce.Do(func() { + podOverheadMemoryCache, podOverheadMemoryErr = computePodOverheadMemory(ctx, c, namespace) + }) + return podOverheadMemoryCache, podOverheadMemoryErr +} + +// computePodOverheadMemory performs a single DryRun pod create and inspects +// the mutated result for pod.Spec.Overhead. DryRun (rather than reading +// RuntimeClass directly) is needed because admission webhooks may inject +// overhead conditionally on namespace, labels, or other request-scoped data +// that is not visible from a static API read. +// +// Assumes overhead is RuntimeClass-driven (or otherwise static per pod) and +// does not vary with container count, image, or per-container requests; a +// future webhook that scaled overhead by pod shape would invalidate the +// capacity calculations in this package. Callers must use a namespace that +// admits a pause-image pod (current callers set PodSecurity LevelPrivileged). +func computePodOverheadMemory(ctx context.Context, c clientset.Interface, namespace string) (int64, error) { + probePod := &v1.Pod{ + ObjectMeta: metav1.ObjectMeta{ + Name: "overhead-probe-" + string(uuid.NewUUID()), + }, + Spec: v1.PodSpec{ + Containers: []v1.Container{{ + Name: "probe", + Image: imageutils.GetPauseImageName(), + }}, + NodeSelector: map[string]string{ + "kubernetes.io/os": "windows", + }, + }, + } + result, err := c.CoreV1().Pods(namespace).Create( + ctx, probePod, metav1.CreateOptions{DryRun: []string{metav1.DryRunAll}}, + ) + if err != nil { + return 0, fmt.Errorf("DryRun pod create for overhead detection failed: %w", err) + } + if result.Spec.Overhead == nil { + return 0, nil + } + if mem, ok := result.Spec.Overhead[v1.ResourceMemory]; ok { + framework.Logf("Detected pod overhead memory: %d bytes (per Pod Overhead admission, KEP-688)", mem.Value()) + return mem.Value(), nil + } + return 0, nil +} + +// waitForNodeMemoryToSettle polls until the node has at least neededBytes +// of free admittable memory (allocatable - sum of admitted-pod requests), +// or the 90s timeout elapses. Useful in [Serial] tests that follow one +// which leaves large pods Terminating — their requests stay counted until +// fully removed. +// +// On timeout the function does not fail the test; it logs a tagged +// "did NOT settle" message so a downstream OutOfmemory failure points +// back at the cause (leftover pods) rather than appearing as an +// unexplained scheduling error. +func waitForNodeMemoryToSettle(ctx context.Context, c clientset.Interface, nodeName string, neededBytes int64) { + node, err := c.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{}) + if err != nil { + framework.Logf("waitForNodeMemoryToSettle: cannot get node %s: %v (skipping wait)", nodeName, err) + return + } + allocatable := node.Status.Allocatable.Memory().Value() + var lastFree int64 + pollErr := wait.PollUntilContextTimeout(ctx, 5*time.Second, 90*time.Second, true, func(ctx context.Context) (bool, error) { + existing := sumExistingPodMemoryReservation(ctx, c, nodeName) + lastFree = allocatable - existing + framework.Logf("Waiting for node %s memory to settle: existing-reservation=%d free=%d (need >= %d)", + nodeName, existing, lastFree, neededBytes) + return lastFree >= neededBytes, nil + }) + if pollErr == nil { + framework.Logf("Node %s memory settled: free=%d (need >= %d)", nodeName, lastFree, neededBytes) + return + } + framework.Logf("Node %s memory did NOT settle within 90s: free=%d (need >= %d): %v "+ + "(test will likely fail with OutOfmemory; check for leftover Terminating pods)", + nodeName, lastFree, neededBytes, pollErr) +} + +// sumExistingPodMemoryReservation returns the total memory reservation +// (sum of container memory requests + pod overhead) for all non-terminal pods +// scheduled on the given node. Tests that compute "remaining schedulable +// memory" (e.g., the Memory Limits test) must subtract this from +// allocatable to leave room for DaemonSets and other system pods. +func sumExistingPodMemoryReservation(ctx context.Context, c clientset.Interface, nodeName string) int64 { + podList, err := c.CoreV1().Pods("").List(ctx, metav1.ListOptions{ + FieldSelector: "spec.nodeName=" + nodeName, + }) + if err != nil { + framework.Logf("Could not list pods on node %s: %v (assuming 0 reservation)", nodeName, err) + return 0 + } + var total int64 + for _, p := range podList.Items { + if p.Status.Phase == v1.PodSucceeded || p.Status.Phase == v1.PodFailed { + continue + } + for _, c := range p.Spec.Containers { + if mem, ok := c.Resources.Requests[v1.ResourceMemory]; ok { + total += mem.Value() + } + } + if p.Spec.Overhead != nil { + if oh, ok := p.Spec.Overhead[v1.ResourceMemory]; ok { + total += oh.Value() + } + } + } + return total +}