Merge pull request #138799 from rzlink/fix/windows-e2e-pod-overhead

e2e/windows: respect pod.Spec.Overhead in Memory Limits and Kubelet-Stats tests
This commit is contained in:
Kubernetes Prow Robot 2026-05-14 03:07:48 +05:30 committed by GitHub
commit 6b0e464c7a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 218 additions and 9 deletions

View file

@ -45,7 +45,7 @@ var _ = sigDescribe(feature.Windows, "Kubelet-Stats", framework.WithSerial(), sk
ginkgo.Describe("Kubelet stats collection for Windows nodes", func() {
ginkgo.Context("when running 10 pods", func() {
ginkgo.Context("when running up to 10 pods", func() {
// 10 seconds is the default scrape timeout for metrics-server and kube-prometheus
ginkgo.It("should return within 10 seconds", func(ctx context.Context) {
@ -54,14 +54,45 @@ var _ = sigDescribe(feature.Windows, "Kubelet-Stats", framework.WithSerial(), sk
framework.ExpectNoError(err, "Error finding Windows node")
framework.Logf("Using node: %v", targetNode.Name)
ginkgo.By("Scheduling 10 pods")
// Adjust pod count for any per-pod overhead the cluster's admission chain
// injects (Pod Overhead, KEP-688); a no-op when overhead is 0.
const baselineNumPods = 10
const minNumPods = 3
numPods := baselineNumPods
overhead, err := detectPodOverheadMemory(ctx, f.ClientSet, f.Namespace.Name)
framework.ExpectNoError(err, "detecting pod overhead memory")
if overhead > 0 {
// Use admission accounting (allocatable - sum of admitted requests incl. overhead),
// not /stats/summary's Memory.AvailableBytes: the latter is a working-set view
// and can disagree with admission when a prior [Serial] test still has Terminating pods.
allocatable := targetNode.Status.Allocatable.Memory().Value()
waitForNodeMemoryToSettle(ctx, f.ClientSet, targetNode.Name, overhead+windowsTestMemorySafetyBuffer)
existing := sumExistingPodMemoryReservation(ctx, f.ClientSet, targetNode.Name)
maxPods := (allocatable - existing - windowsTestMemorySafetyBuffer) / overhead
if maxPods < int64(numPods) {
numPods = int(maxPods)
}
// Fail when the node lacks the capacity to schedule the minimum
// required pod count. This indicates a misconfigured test cluster
// (allocatable too small, existing reservation unexpectedly large,
// or pod overhead higher than the cluster was sized for); surface
// it loudly rather than silently skipping the test.
if numPods < minNumPods {
framework.Failf("Node %s has insufficient memory capacity to schedule %d pods: allocatable=%d, existing-reservation=%d, per-pod-overhead=%d, safety-buffer=%d, max-pods=%d. Right-size the test cluster, lower per-pod overhead, or reduce existing reservation.",
targetNode.Name, minNumPods, allocatable, existing, overhead, windowsTestMemorySafetyBuffer, maxPods)
}
framework.Logf("Adjusted pod count to %d (baseline=%d, overhead=%d bytes, allocatable=%d bytes, existing-reservation=%d bytes, safety-buffer=%d bytes)",
numPods, baselineNumPods, overhead, allocatable, existing, windowsTestMemorySafetyBuffer)
}
ginkgo.By(fmt.Sprintf("Scheduling %d pods", numPods))
powershellImage := imageutils.GetConfig(imageutils.BusyBox)
pods := newKubeletStatsTestPods(10, powershellImage, targetNode.Name)
pods := newKubeletStatsTestPods(numPods, powershellImage, targetNode.Name)
e2epod.NewPodClient(f).CreateBatch(ctx, pods)
ginkgo.By("Waiting up to 3 minutes for pods to be running")
ginkgo.By(fmt.Sprintf("Waiting up to 3 minutes for %d pods to be running", numPods))
timeout := 3 * time.Minute
err = e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 10, timeout)
err = e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, numPods, timeout)
framework.ExpectNoError(err)
ginkgo.By("Getting kubelet stats 5 times and checking average duration")
@ -100,7 +131,7 @@ var _ = sigDescribe(feature.Windows, "Kubelet-Stats", framework.WithSerial(), sk
}
}
}
gomega.Expect(statsChecked).To(gomega.Equal(10), "Should find stats for 10 pods in kubelet stats")
gomega.Expect(statsChecked).To(gomega.Equal(numPods), fmt.Sprintf("Should find stats for %d pods in kubelet stats", numPods))
time.Sleep(5 * time.Second)
}
@ -146,6 +177,24 @@ var _ = sigDescribe(feature.Windows, "Kubelet-Stats", skipUnlessWindows(func() {
framework.ExpectNoError(err, "Error finding Windows node")
framework.Logf("Using node: %v", targetNode.Name)
// Fail if the node lacks capacity for the required pod count once overhead
// is accounted for. A misconfigured test cluster should be surfaced loudly
// rather than silently skipped. No-op when the cluster injects no overhead.
const numPods = 3
overhead, err := detectPodOverheadMemory(ctx, f.ClientSet, f.Namespace.Name)
framework.ExpectNoError(err, "detecting pod overhead memory")
if overhead > 0 {
allocatable := targetNode.Status.Allocatable.Memory().Value()
needed := int64(numPods)*overhead + windowsTestMemorySafetyBuffer
waitForNodeMemoryToSettle(ctx, f.ClientSet, targetNode.Name, needed)
existing := sumExistingPodMemoryReservation(ctx, f.ClientSet, targetNode.Name)
free := allocatable - existing - windowsTestMemorySafetyBuffer
if free < int64(numPods)*overhead {
framework.Failf("Node %s has insufficient memory capacity to schedule %d pods: allocatable=%d, existing-reservation=%d, per-pod-overhead=%d, safety-buffer=%d, free=%d (need %d). Right-size the test cluster, lower per-pod overhead, or reduce existing reservation.",
targetNode.Name, numPods, allocatable, existing, overhead, windowsTestMemorySafetyBuffer, free, int64(numPods)*overhead)
}
}
ginkgo.By("Scheduling 3 pods")
powershellImage := imageutils.GetConfig(imageutils.BusyBox)
pods := newKubeletStatsTestPods(3, powershellImage, targetNode.Name)

View file

@ -106,12 +106,24 @@ func overrideAllocatableMemoryTest(ctx context.Context, f *framework.Framework,
})
framework.ExpectNoError(err)
framework.Logf("Scheduling 1 pod per node to consume all allocatable memory")
// Subtract any per-pod overhead the cluster's admission chain injects
// (Pod Overhead, KEP-688) so limit+overhead fits Allocatable.Memory.
overhead, err := detectPodOverheadMemory(ctx, f.ClientSet, f.Namespace.Name)
framework.ExpectNoError(err, "detecting pod overhead memory")
framework.Logf("Scheduling 1 pod per node to consume all allocatable memory (detected overhead: %d bytes)", overhead)
for _, node := range nodeList.Items {
status := node.Status
podMemLimt := resource.NewQuantity(status.Allocatable.Memory().Value()-(1024*1024*100), resource.BinarySI)
// Subtract overhead (consume pod's own), existing (DaemonSets/system pods,
// incl. their overhead), and safety buffer (kubelet accounting noise).
existing := sumExistingPodMemoryReservation(ctx, f.ClientSet, node.Name)
podMemLimt := resource.NewQuantity(
status.Allocatable.Memory().Value()-existing-overhead-windowsTestMemorySafetyBuffer,
resource.BinarySI,
)
podName := "mem-test-" + string(uuid.NewUUID())
framework.Logf("Scheduling pod %s on node %s (allocatable memory=%v) with memory limit %v", podName, node.Name, status.Allocatable.Memory(), podMemLimt)
framework.Logf("Scheduling pod %s on node %s (allocatable=%d, existing-reservation=%d, overhead=%d, safety-buffer=%d) with memory limit %v",
podName, node.Name, status.Allocatable.Memory().Value(), existing, overhead, windowsTestMemorySafetyBuffer, podMemLimt)
pod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: podName,

View file

@ -21,16 +21,21 @@ import (
"fmt"
"math/rand"
"strings"
"sync"
"time"
"github.com/onsi/ginkgo/v2"
appsv1 "k8s.io/api/apps/v1"
v1 "k8s.io/api/core/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/uuid"
"k8s.io/apimachinery/pkg/util/wait"
clientset "k8s.io/client-go/kubernetes"
"k8s.io/kubernetes/pkg/controller/deployment/util"
"k8s.io/kubernetes/test/e2e/framework"
e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
imageutils "k8s.io/kubernetes/test/utils/image"
semver "github.com/blang/semver/v4"
)
@ -99,3 +104,146 @@ func skipUnlessContainerdOneSevenOrGreater(ctx context.Context, f *framework.Fra
e2eskipper.Skipf("container runtime is < 1.7.0")
}
}
// windowsTestMemorySafetyBuffer is fixed headroom (in bytes) reserved on a
// node above the sum of admitted pod requests + pod.Spec.Overhead. It absorbs
// kubelet/cgroup memory-accounting drift that would otherwise produce flaky
// OutOfmemory admission errors at the capacity boundary.
//
// 256 MiB is empirical, not derived from any kubelet constant: a comfortable
// upper bound on the accounting jitter we observed in practice (single- to
// low-tens of MiB) while remaining a small fraction (~5-10%) of typical
// Windows test-node Allocatable. It can be tightened with measurements if it
// ever becomes a constraint, or replaced with a fraction of Allocatable if
// we ever need to run on nodes too small for a fixed buffer.
const windowsTestMemorySafetyBuffer int64 = 256 * 1024 * 1024
var (
podOverheadMemoryOnce sync.Once
podOverheadMemoryCache int64
podOverheadMemoryErr error
)
// detectPodOverheadMemory returns the per-pod memory overhead the cluster's
// admission chain injects (Pod Overhead, KEP-688), in bytes, or 0 if no
// overhead is applied. The error is non-nil only when the probe itself
// could not be completed (e.g., the API server rejected the DryRun create);
// a successful probe with no overhead returns (0, nil).
//
// Result and error are cached for the lifetime of the test process — the
// admission chain is traversed at most once per run. See
// computePodOverheadMemory for probe details.
func detectPodOverheadMemory(ctx context.Context, c clientset.Interface, namespace string) (int64, error) {
podOverheadMemoryOnce.Do(func() {
podOverheadMemoryCache, podOverheadMemoryErr = computePodOverheadMemory(ctx, c, namespace)
})
return podOverheadMemoryCache, podOverheadMemoryErr
}
// computePodOverheadMemory performs a single DryRun pod create and inspects
// the mutated result for pod.Spec.Overhead. DryRun (rather than reading
// RuntimeClass directly) is needed because admission webhooks may inject
// overhead conditionally on namespace, labels, or other request-scoped data
// that is not visible from a static API read.
//
// Assumes overhead is RuntimeClass-driven (or otherwise static per pod) and
// does not vary with container count, image, or per-container requests; a
// future webhook that scaled overhead by pod shape would invalidate the
// capacity calculations in this package. Callers must use a namespace that
// admits a pause-image pod (current callers set PodSecurity LevelPrivileged).
func computePodOverheadMemory(ctx context.Context, c clientset.Interface, namespace string) (int64, error) {
probePod := &v1.Pod{
ObjectMeta: metav1.ObjectMeta{
Name: "overhead-probe-" + string(uuid.NewUUID()),
},
Spec: v1.PodSpec{
Containers: []v1.Container{{
Name: "probe",
Image: imageutils.GetPauseImageName(),
}},
NodeSelector: map[string]string{
"kubernetes.io/os": "windows",
},
},
}
result, err := c.CoreV1().Pods(namespace).Create(
ctx, probePod, metav1.CreateOptions{DryRun: []string{metav1.DryRunAll}},
)
if err != nil {
return 0, fmt.Errorf("DryRun pod create for overhead detection failed: %w", err)
}
if result.Spec.Overhead == nil {
return 0, nil
}
if mem, ok := result.Spec.Overhead[v1.ResourceMemory]; ok {
framework.Logf("Detected pod overhead memory: %d bytes (per Pod Overhead admission, KEP-688)", mem.Value())
return mem.Value(), nil
}
return 0, nil
}
// waitForNodeMemoryToSettle polls until the node has at least neededBytes
// of free admittable memory (allocatable - sum of admitted-pod requests),
// or the 90s timeout elapses. Useful in [Serial] tests that follow one
// which leaves large pods Terminating — their requests stay counted until
// fully removed.
//
// On timeout the function does not fail the test; it logs a tagged
// "did NOT settle" message so a downstream OutOfmemory failure points
// back at the cause (leftover pods) rather than appearing as an
// unexplained scheduling error.
func waitForNodeMemoryToSettle(ctx context.Context, c clientset.Interface, nodeName string, neededBytes int64) {
node, err := c.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
if err != nil {
framework.Logf("waitForNodeMemoryToSettle: cannot get node %s: %v (skipping wait)", nodeName, err)
return
}
allocatable := node.Status.Allocatable.Memory().Value()
var lastFree int64
pollErr := wait.PollUntilContextTimeout(ctx, 5*time.Second, 90*time.Second, true, func(ctx context.Context) (bool, error) {
existing := sumExistingPodMemoryReservation(ctx, c, nodeName)
lastFree = allocatable - existing
framework.Logf("Waiting for node %s memory to settle: existing-reservation=%d free=%d (need >= %d)",
nodeName, existing, lastFree, neededBytes)
return lastFree >= neededBytes, nil
})
if pollErr == nil {
framework.Logf("Node %s memory settled: free=%d (need >= %d)", nodeName, lastFree, neededBytes)
return
}
framework.Logf("Node %s memory did NOT settle within 90s: free=%d (need >= %d): %v "+
"(test will likely fail with OutOfmemory; check for leftover Terminating pods)",
nodeName, lastFree, neededBytes, pollErr)
}
// sumExistingPodMemoryReservation returns the total memory reservation
// (sum of container memory requests + pod overhead) for all non-terminal pods
// scheduled on the given node. Tests that compute "remaining schedulable
// memory" (e.g., the Memory Limits test) must subtract this from
// allocatable to leave room for DaemonSets and other system pods.
func sumExistingPodMemoryReservation(ctx context.Context, c clientset.Interface, nodeName string) int64 {
podList, err := c.CoreV1().Pods("").List(ctx, metav1.ListOptions{
FieldSelector: "spec.nodeName=" + nodeName,
})
if err != nil {
framework.Logf("Could not list pods on node %s: %v (assuming 0 reservation)", nodeName, err)
return 0
}
var total int64
for _, p := range podList.Items {
if p.Status.Phase == v1.PodSucceeded || p.Status.Phase == v1.PodFailed {
continue
}
for _, c := range p.Spec.Containers {
if mem, ok := c.Resources.Requests[v1.ResourceMemory]; ok {
total += mem.Value()
}
}
if p.Spec.Overhead != nil {
if oh, ok := p.Spec.Overhead[v1.ResourceMemory]; ok {
total += oh.Value()
}
}
}
return total
}