mirror of
https://github.com/kubernetes/kubernetes.git
synced 2026-06-09 00:34:10 -04:00
Merge pull request #138799 from rzlink/fix/windows-e2e-pod-overhead
e2e/windows: respect pod.Spec.Overhead in Memory Limits and Kubelet-Stats tests
This commit is contained in:
commit
6b0e464c7a
3 changed files with 218 additions and 9 deletions
|
|
@ -45,7 +45,7 @@ var _ = sigDescribe(feature.Windows, "Kubelet-Stats", framework.WithSerial(), sk
|
|||
|
||||
ginkgo.Describe("Kubelet stats collection for Windows nodes", func() {
|
||||
|
||||
ginkgo.Context("when running 10 pods", func() {
|
||||
ginkgo.Context("when running up to 10 pods", func() {
|
||||
// 10 seconds is the default scrape timeout for metrics-server and kube-prometheus
|
||||
ginkgo.It("should return within 10 seconds", func(ctx context.Context) {
|
||||
|
||||
|
|
@ -54,14 +54,45 @@ var _ = sigDescribe(feature.Windows, "Kubelet-Stats", framework.WithSerial(), sk
|
|||
framework.ExpectNoError(err, "Error finding Windows node")
|
||||
framework.Logf("Using node: %v", targetNode.Name)
|
||||
|
||||
ginkgo.By("Scheduling 10 pods")
|
||||
// Adjust pod count for any per-pod overhead the cluster's admission chain
|
||||
// injects (Pod Overhead, KEP-688); a no-op when overhead is 0.
|
||||
const baselineNumPods = 10
|
||||
const minNumPods = 3
|
||||
numPods := baselineNumPods
|
||||
overhead, err := detectPodOverheadMemory(ctx, f.ClientSet, f.Namespace.Name)
|
||||
framework.ExpectNoError(err, "detecting pod overhead memory")
|
||||
if overhead > 0 {
|
||||
// Use admission accounting (allocatable - sum of admitted requests incl. overhead),
|
||||
// not /stats/summary's Memory.AvailableBytes: the latter is a working-set view
|
||||
// and can disagree with admission when a prior [Serial] test still has Terminating pods.
|
||||
allocatable := targetNode.Status.Allocatable.Memory().Value()
|
||||
waitForNodeMemoryToSettle(ctx, f.ClientSet, targetNode.Name, overhead+windowsTestMemorySafetyBuffer)
|
||||
existing := sumExistingPodMemoryReservation(ctx, f.ClientSet, targetNode.Name)
|
||||
maxPods := (allocatable - existing - windowsTestMemorySafetyBuffer) / overhead
|
||||
if maxPods < int64(numPods) {
|
||||
numPods = int(maxPods)
|
||||
}
|
||||
// Fail when the node lacks the capacity to schedule the minimum
|
||||
// required pod count. This indicates a misconfigured test cluster
|
||||
// (allocatable too small, existing reservation unexpectedly large,
|
||||
// or pod overhead higher than the cluster was sized for); surface
|
||||
// it loudly rather than silently skipping the test.
|
||||
if numPods < minNumPods {
|
||||
framework.Failf("Node %s has insufficient memory capacity to schedule %d pods: allocatable=%d, existing-reservation=%d, per-pod-overhead=%d, safety-buffer=%d, max-pods=%d. Right-size the test cluster, lower per-pod overhead, or reduce existing reservation.",
|
||||
targetNode.Name, minNumPods, allocatable, existing, overhead, windowsTestMemorySafetyBuffer, maxPods)
|
||||
}
|
||||
framework.Logf("Adjusted pod count to %d (baseline=%d, overhead=%d bytes, allocatable=%d bytes, existing-reservation=%d bytes, safety-buffer=%d bytes)",
|
||||
numPods, baselineNumPods, overhead, allocatable, existing, windowsTestMemorySafetyBuffer)
|
||||
}
|
||||
|
||||
ginkgo.By(fmt.Sprintf("Scheduling %d pods", numPods))
|
||||
powershellImage := imageutils.GetConfig(imageutils.BusyBox)
|
||||
pods := newKubeletStatsTestPods(10, powershellImage, targetNode.Name)
|
||||
pods := newKubeletStatsTestPods(numPods, powershellImage, targetNode.Name)
|
||||
e2epod.NewPodClient(f).CreateBatch(ctx, pods)
|
||||
|
||||
ginkgo.By("Waiting up to 3 minutes for pods to be running")
|
||||
ginkgo.By(fmt.Sprintf("Waiting up to 3 minutes for %d pods to be running", numPods))
|
||||
timeout := 3 * time.Minute
|
||||
err = e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, 10, timeout)
|
||||
err = e2epod.WaitForPodsRunningReady(ctx, f.ClientSet, f.Namespace.Name, numPods, timeout)
|
||||
framework.ExpectNoError(err)
|
||||
|
||||
ginkgo.By("Getting kubelet stats 5 times and checking average duration")
|
||||
|
|
@ -100,7 +131,7 @@ var _ = sigDescribe(feature.Windows, "Kubelet-Stats", framework.WithSerial(), sk
|
|||
}
|
||||
}
|
||||
}
|
||||
gomega.Expect(statsChecked).To(gomega.Equal(10), "Should find stats for 10 pods in kubelet stats")
|
||||
gomega.Expect(statsChecked).To(gomega.Equal(numPods), fmt.Sprintf("Should find stats for %d pods in kubelet stats", numPods))
|
||||
|
||||
time.Sleep(5 * time.Second)
|
||||
}
|
||||
|
|
@ -146,6 +177,24 @@ var _ = sigDescribe(feature.Windows, "Kubelet-Stats", skipUnlessWindows(func() {
|
|||
framework.ExpectNoError(err, "Error finding Windows node")
|
||||
framework.Logf("Using node: %v", targetNode.Name)
|
||||
|
||||
// Fail if the node lacks capacity for the required pod count once overhead
|
||||
// is accounted for. A misconfigured test cluster should be surfaced loudly
|
||||
// rather than silently skipped. No-op when the cluster injects no overhead.
|
||||
const numPods = 3
|
||||
overhead, err := detectPodOverheadMemory(ctx, f.ClientSet, f.Namespace.Name)
|
||||
framework.ExpectNoError(err, "detecting pod overhead memory")
|
||||
if overhead > 0 {
|
||||
allocatable := targetNode.Status.Allocatable.Memory().Value()
|
||||
needed := int64(numPods)*overhead + windowsTestMemorySafetyBuffer
|
||||
waitForNodeMemoryToSettle(ctx, f.ClientSet, targetNode.Name, needed)
|
||||
existing := sumExistingPodMemoryReservation(ctx, f.ClientSet, targetNode.Name)
|
||||
free := allocatable - existing - windowsTestMemorySafetyBuffer
|
||||
if free < int64(numPods)*overhead {
|
||||
framework.Failf("Node %s has insufficient memory capacity to schedule %d pods: allocatable=%d, existing-reservation=%d, per-pod-overhead=%d, safety-buffer=%d, free=%d (need %d). Right-size the test cluster, lower per-pod overhead, or reduce existing reservation.",
|
||||
targetNode.Name, numPods, allocatable, existing, overhead, windowsTestMemorySafetyBuffer, free, int64(numPods)*overhead)
|
||||
}
|
||||
}
|
||||
|
||||
ginkgo.By("Scheduling 3 pods")
|
||||
powershellImage := imageutils.GetConfig(imageutils.BusyBox)
|
||||
pods := newKubeletStatsTestPods(3, powershellImage, targetNode.Name)
|
||||
|
|
|
|||
|
|
@ -106,12 +106,24 @@ func overrideAllocatableMemoryTest(ctx context.Context, f *framework.Framework,
|
|||
})
|
||||
framework.ExpectNoError(err)
|
||||
|
||||
framework.Logf("Scheduling 1 pod per node to consume all allocatable memory")
|
||||
// Subtract any per-pod overhead the cluster's admission chain injects
|
||||
// (Pod Overhead, KEP-688) so limit+overhead fits Allocatable.Memory.
|
||||
overhead, err := detectPodOverheadMemory(ctx, f.ClientSet, f.Namespace.Name)
|
||||
framework.ExpectNoError(err, "detecting pod overhead memory")
|
||||
|
||||
framework.Logf("Scheduling 1 pod per node to consume all allocatable memory (detected overhead: %d bytes)", overhead)
|
||||
for _, node := range nodeList.Items {
|
||||
status := node.Status
|
||||
podMemLimt := resource.NewQuantity(status.Allocatable.Memory().Value()-(1024*1024*100), resource.BinarySI)
|
||||
// Subtract overhead (consume pod's own), existing (DaemonSets/system pods,
|
||||
// incl. their overhead), and safety buffer (kubelet accounting noise).
|
||||
existing := sumExistingPodMemoryReservation(ctx, f.ClientSet, node.Name)
|
||||
podMemLimt := resource.NewQuantity(
|
||||
status.Allocatable.Memory().Value()-existing-overhead-windowsTestMemorySafetyBuffer,
|
||||
resource.BinarySI,
|
||||
)
|
||||
podName := "mem-test-" + string(uuid.NewUUID())
|
||||
framework.Logf("Scheduling pod %s on node %s (allocatable memory=%v) with memory limit %v", podName, node.Name, status.Allocatable.Memory(), podMemLimt)
|
||||
framework.Logf("Scheduling pod %s on node %s (allocatable=%d, existing-reservation=%d, overhead=%d, safety-buffer=%d) with memory limit %v",
|
||||
podName, node.Name, status.Allocatable.Memory().Value(), existing, overhead, windowsTestMemorySafetyBuffer, podMemLimt)
|
||||
pod := &v1.Pod{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: podName,
|
||||
|
|
|
|||
|
|
@ -21,16 +21,21 @@ import (
|
|||
"fmt"
|
||||
"math/rand"
|
||||
"strings"
|
||||
"sync"
|
||||
"time"
|
||||
|
||||
"github.com/onsi/ginkgo/v2"
|
||||
appsv1 "k8s.io/api/apps/v1"
|
||||
v1 "k8s.io/api/core/v1"
|
||||
apierrors "k8s.io/apimachinery/pkg/api/errors"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
"k8s.io/apimachinery/pkg/util/uuid"
|
||||
"k8s.io/apimachinery/pkg/util/wait"
|
||||
clientset "k8s.io/client-go/kubernetes"
|
||||
"k8s.io/kubernetes/pkg/controller/deployment/util"
|
||||
"k8s.io/kubernetes/test/e2e/framework"
|
||||
e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
|
||||
imageutils "k8s.io/kubernetes/test/utils/image"
|
||||
|
||||
semver "github.com/blang/semver/v4"
|
||||
)
|
||||
|
|
@ -99,3 +104,146 @@ func skipUnlessContainerdOneSevenOrGreater(ctx context.Context, f *framework.Fra
|
|||
e2eskipper.Skipf("container runtime is < 1.7.0")
|
||||
}
|
||||
}
|
||||
|
||||
// windowsTestMemorySafetyBuffer is fixed headroom (in bytes) reserved on a
|
||||
// node above the sum of admitted pod requests + pod.Spec.Overhead. It absorbs
|
||||
// kubelet/cgroup memory-accounting drift that would otherwise produce flaky
|
||||
// OutOfmemory admission errors at the capacity boundary.
|
||||
//
|
||||
// 256 MiB is empirical, not derived from any kubelet constant: a comfortable
|
||||
// upper bound on the accounting jitter we observed in practice (single- to
|
||||
// low-tens of MiB) while remaining a small fraction (~5-10%) of typical
|
||||
// Windows test-node Allocatable. It can be tightened with measurements if it
|
||||
// ever becomes a constraint, or replaced with a fraction of Allocatable if
|
||||
// we ever need to run on nodes too small for a fixed buffer.
|
||||
const windowsTestMemorySafetyBuffer int64 = 256 * 1024 * 1024
|
||||
|
||||
var (
|
||||
podOverheadMemoryOnce sync.Once
|
||||
podOverheadMemoryCache int64
|
||||
podOverheadMemoryErr error
|
||||
)
|
||||
|
||||
// detectPodOverheadMemory returns the per-pod memory overhead the cluster's
|
||||
// admission chain injects (Pod Overhead, KEP-688), in bytes, or 0 if no
|
||||
// overhead is applied. The error is non-nil only when the probe itself
|
||||
// could not be completed (e.g., the API server rejected the DryRun create);
|
||||
// a successful probe with no overhead returns (0, nil).
|
||||
//
|
||||
// Result and error are cached for the lifetime of the test process — the
|
||||
// admission chain is traversed at most once per run. See
|
||||
// computePodOverheadMemory for probe details.
|
||||
func detectPodOverheadMemory(ctx context.Context, c clientset.Interface, namespace string) (int64, error) {
|
||||
podOverheadMemoryOnce.Do(func() {
|
||||
podOverheadMemoryCache, podOverheadMemoryErr = computePodOverheadMemory(ctx, c, namespace)
|
||||
})
|
||||
return podOverheadMemoryCache, podOverheadMemoryErr
|
||||
}
|
||||
|
||||
// computePodOverheadMemory performs a single DryRun pod create and inspects
|
||||
// the mutated result for pod.Spec.Overhead. DryRun (rather than reading
|
||||
// RuntimeClass directly) is needed because admission webhooks may inject
|
||||
// overhead conditionally on namespace, labels, or other request-scoped data
|
||||
// that is not visible from a static API read.
|
||||
//
|
||||
// Assumes overhead is RuntimeClass-driven (or otherwise static per pod) and
|
||||
// does not vary with container count, image, or per-container requests; a
|
||||
// future webhook that scaled overhead by pod shape would invalidate the
|
||||
// capacity calculations in this package. Callers must use a namespace that
|
||||
// admits a pause-image pod (current callers set PodSecurity LevelPrivileged).
|
||||
func computePodOverheadMemory(ctx context.Context, c clientset.Interface, namespace string) (int64, error) {
|
||||
probePod := &v1.Pod{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "overhead-probe-" + string(uuid.NewUUID()),
|
||||
},
|
||||
Spec: v1.PodSpec{
|
||||
Containers: []v1.Container{{
|
||||
Name: "probe",
|
||||
Image: imageutils.GetPauseImageName(),
|
||||
}},
|
||||
NodeSelector: map[string]string{
|
||||
"kubernetes.io/os": "windows",
|
||||
},
|
||||
},
|
||||
}
|
||||
result, err := c.CoreV1().Pods(namespace).Create(
|
||||
ctx, probePod, metav1.CreateOptions{DryRun: []string{metav1.DryRunAll}},
|
||||
)
|
||||
if err != nil {
|
||||
return 0, fmt.Errorf("DryRun pod create for overhead detection failed: %w", err)
|
||||
}
|
||||
if result.Spec.Overhead == nil {
|
||||
return 0, nil
|
||||
}
|
||||
if mem, ok := result.Spec.Overhead[v1.ResourceMemory]; ok {
|
||||
framework.Logf("Detected pod overhead memory: %d bytes (per Pod Overhead admission, KEP-688)", mem.Value())
|
||||
return mem.Value(), nil
|
||||
}
|
||||
return 0, nil
|
||||
}
|
||||
|
||||
// waitForNodeMemoryToSettle polls until the node has at least neededBytes
|
||||
// of free admittable memory (allocatable - sum of admitted-pod requests),
|
||||
// or the 90s timeout elapses. Useful in [Serial] tests that follow one
|
||||
// which leaves large pods Terminating — their requests stay counted until
|
||||
// fully removed.
|
||||
//
|
||||
// On timeout the function does not fail the test; it logs a tagged
|
||||
// "did NOT settle" message so a downstream OutOfmemory failure points
|
||||
// back at the cause (leftover pods) rather than appearing as an
|
||||
// unexplained scheduling error.
|
||||
func waitForNodeMemoryToSettle(ctx context.Context, c clientset.Interface, nodeName string, neededBytes int64) {
|
||||
node, err := c.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
|
||||
if err != nil {
|
||||
framework.Logf("waitForNodeMemoryToSettle: cannot get node %s: %v (skipping wait)", nodeName, err)
|
||||
return
|
||||
}
|
||||
allocatable := node.Status.Allocatable.Memory().Value()
|
||||
var lastFree int64
|
||||
pollErr := wait.PollUntilContextTimeout(ctx, 5*time.Second, 90*time.Second, true, func(ctx context.Context) (bool, error) {
|
||||
existing := sumExistingPodMemoryReservation(ctx, c, nodeName)
|
||||
lastFree = allocatable - existing
|
||||
framework.Logf("Waiting for node %s memory to settle: existing-reservation=%d free=%d (need >= %d)",
|
||||
nodeName, existing, lastFree, neededBytes)
|
||||
return lastFree >= neededBytes, nil
|
||||
})
|
||||
if pollErr == nil {
|
||||
framework.Logf("Node %s memory settled: free=%d (need >= %d)", nodeName, lastFree, neededBytes)
|
||||
return
|
||||
}
|
||||
framework.Logf("Node %s memory did NOT settle within 90s: free=%d (need >= %d): %v "+
|
||||
"(test will likely fail with OutOfmemory; check for leftover Terminating pods)",
|
||||
nodeName, lastFree, neededBytes, pollErr)
|
||||
}
|
||||
|
||||
// sumExistingPodMemoryReservation returns the total memory reservation
|
||||
// (sum of container memory requests + pod overhead) for all non-terminal pods
|
||||
// scheduled on the given node. Tests that compute "remaining schedulable
|
||||
// memory" (e.g., the Memory Limits test) must subtract this from
|
||||
// allocatable to leave room for DaemonSets and other system pods.
|
||||
func sumExistingPodMemoryReservation(ctx context.Context, c clientset.Interface, nodeName string) int64 {
|
||||
podList, err := c.CoreV1().Pods("").List(ctx, metav1.ListOptions{
|
||||
FieldSelector: "spec.nodeName=" + nodeName,
|
||||
})
|
||||
if err != nil {
|
||||
framework.Logf("Could not list pods on node %s: %v (assuming 0 reservation)", nodeName, err)
|
||||
return 0
|
||||
}
|
||||
var total int64
|
||||
for _, p := range podList.Items {
|
||||
if p.Status.Phase == v1.PodSucceeded || p.Status.Phase == v1.PodFailed {
|
||||
continue
|
||||
}
|
||||
for _, c := range p.Spec.Containers {
|
||||
if mem, ok := c.Resources.Requests[v1.ResourceMemory]; ok {
|
||||
total += mem.Value()
|
||||
}
|
||||
}
|
||||
if p.Spec.Overhead != nil {
|
||||
if oh, ok := p.Spec.Overhead[v1.ResourceMemory]; ok {
|
||||
total += oh.Value()
|
||||
}
|
||||
}
|
||||
}
|
||||
return total
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue