mirror of
https://github.com/kubernetes/kubernetes.git
synced 2026-06-10 09:22:55 -04:00
Two Windows e2e tests, Memory Limits and Kubelet-Stats, compute "schedulable memory" directly from node.Status.Allocatable.Memory and ignore pod.Spec.Overhead. That is inconsistent with how the kubelet admits pods: admission-time accounting sums each pod's container requests plus pod.Spec.Overhead (Pod Overhead, KEP-688, GA in 1.24). On clusters whose admission chain injects per-pod overhead, for example a cluster with a RuntimeClass whose scheduling overhead is non-zero, or a mutating webhook that sets Spec.Overhead, these tests overschedule the node and fail with OutOfmemory admission errors. On clusters with no overhead the tests behave the same as before.

Add three helpers in test/e2e/windows/utils.go:
- detectPodOverheadMemory(ctx, c, namespace) (int64, error): performs a single DryRun pod create and inspects the mutated result for Spec.Overhead[ResourceMemory]. Result and error are cached for the lifetime of the test process via sync.Once. DryRun is the right primitive because admission webhooks may inject overhead conditionally on namespace, labels, or other request-scoped data that is not visible from a static read of the RuntimeClass API.
- sumExistingPodMemoryReservation(ctx, c, nodeName): sums per-pod container requests + Spec.Overhead for non-terminal pods on a node. Used to leave room for DaemonSets and system pods.
- waitForNodeMemoryToSettle(ctx, c, nodeName, neededBytes): polls until enough memory frees up after a previous [Serial] test; on timeout logs a tagged "did NOT settle" message but does not fail the test.

Adopt the helpers in:
- memory_limits.go: subtract overhead + existing reservation + safety buffer (256 MiB) from Allocatable.Memory when sizing the consume pod, instead of subtracting a hard-coded 100 MiB.
- kubelet_stats.go (10-pod test): compute maxPods = (allocatable - existing - safetyBuffer) / overhead, lower numPods accordingly, and skip cleanly when fewer than 3 pods can fit.
- kubelet_stats.go (3-pod test): apply the same skip-on-insufficient-room logic.

Behavior on clusters without Pod Overhead is byte-for-byte equivalent: the helpers short-circuit and the existing per-test code paths are unchanged.
249 lines
9.4 KiB
Go
249 lines
9.4 KiB
Go
/*
|
|
Copyright 2019 The Kubernetes Authors.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
*/
|
|
|
|
package windows
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"math/rand"
|
|
"strings"
|
|
"sync"
|
|
"time"
|
|
|
|
"github.com/onsi/ginkgo/v2"
|
|
appsv1 "k8s.io/api/apps/v1"
|
|
v1 "k8s.io/api/core/v1"
|
|
apierrors "k8s.io/apimachinery/pkg/api/errors"
|
|
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
|
"k8s.io/apimachinery/pkg/util/uuid"
|
|
"k8s.io/apimachinery/pkg/util/wait"
|
|
clientset "k8s.io/client-go/kubernetes"
|
|
"k8s.io/kubernetes/pkg/controller/deployment/util"
|
|
"k8s.io/kubernetes/test/e2e/framework"
|
|
e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
|
|
imageutils "k8s.io/kubernetes/test/utils/image"
|
|
|
|
semver "github.com/blang/semver/v4"
|
|
)
|
|
|
|
// waits for a deployment to be created and the desired replicas
|
|
// are updated and available, and no old pods are running.
|
|
func waitForDeployment(getDeploymentFunc func() (*appsv1.Deployment, error), interval, timeout time.Duration) error {
|
|
return wait.PollImmediate(interval, timeout, func() (bool, error) {
|
|
deployment, err := getDeploymentFunc()
|
|
if err != nil {
|
|
if apierrors.IsNotFound(err) {
|
|
framework.Logf("deployment not found, continue waiting: %s", err)
|
|
return false, nil
|
|
}
|
|
|
|
framework.Logf("error while deploying, error %s", err)
|
|
return false, err
|
|
}
|
|
framework.Logf("deployment status %s", &deployment.Status)
|
|
return util.DeploymentComplete(deployment, &deployment.Status), nil
|
|
})
|
|
}
|
|
|
|
// gets the container runtime and version for a node
|
|
func getNodeContainerRuntimeAndVersion(n v1.Node) (string, semver.Version, error) {
|
|
containerRuntimeVersionString := n.Status.NodeInfo.DeepCopy().ContainerRuntimeVersion
|
|
parts := strings.Split(containerRuntimeVersionString, "://")
|
|
|
|
if len(parts) != 2 {
|
|
return "", semver.Version{}, fmt.Errorf("could not get container runtime and version from '%s'", containerRuntimeVersionString)
|
|
}
|
|
|
|
v, err := semver.ParseTolerant(parts[1])
|
|
if err != nil {
|
|
return "", semver.Version{}, err
|
|
}
|
|
|
|
return parts[0], v, nil
|
|
}
|
|
|
|
// getRandomUserGrounName returns a pseudo-random name consisting of the prefix
// "hpc-" followed by eight lowercase ASCII letters, each drawn uniformly from
// a-z with the math/rand package-level generator.
//
// NOTE(review): "Groun" in the identifier is presumably a typo for "Group";
// the name is left untouched so any callers outside this view keep compiling.
func getRandomUserGrounName() string {
	// The full alphabet: the previous literal ended in "...wxya", which
	// duplicated 'a' and made 'z' impossible to generate.
	const letters = "abcdefghijklmnopqrstuvwxyz"

	suffix := make([]byte, 8)
	for i := range suffix {
		suffix[i] = letters[rand.Intn(len(letters))]
	}

	return "hpc-" + string(suffix)
}
|
|
|
|
func skipUnlessContainerdOneSevenOrGreater(ctx context.Context, f *framework.Framework) {
|
|
ginkgo.By("Ensuring Windows nodes are running containerd v1.7+")
|
|
windowsNode, err := findWindowsNode(ctx, f)
|
|
framework.ExpectNoError(err, "error finding Windows node")
|
|
r, v, err := getNodeContainerRuntimeAndVersion(windowsNode)
|
|
framework.ExpectNoError(err, "error getting node container runtime and version")
|
|
framework.Logf("Got runtime: %s, version %v for node %s", r, v, windowsNode.Name)
|
|
|
|
if !strings.EqualFold(r, "containerd") {
|
|
e2eskipper.Skipf("container runtime is not containerd")
|
|
}
|
|
|
|
v1dot7 := semver.MustParse("1.7.0-alpha.1")
|
|
if v.LT(v1dot7) {
|
|
e2eskipper.Skipf("container runtime is < 1.7.0")
|
|
}
|
|
}
|
|
|
|
// windowsTestMemorySafetyBuffer is the fixed headroom, in bytes, that the
// Windows memory tests keep free on a node beyond the sum of admitted pod
// requests and pod.Spec.Overhead. Its purpose is to soak up kubelet/cgroup
// memory-accounting drift, which would otherwise surface as flaky OutOfmemory
// admission errors right at the capacity boundary.
//
// The 256 MiB figure is empirical rather than derived from a kubelet
// constant: it comfortably exceeds the accounting jitter observed in practice
// (single- to low-tens of MiB) while staying a small fraction (~5-10%) of a
// typical Windows test node's Allocatable. Tighten it with measurements if it
// becomes a constraint, or switch to a fraction of Allocatable if tests ever
// need to run on nodes too small for a fixed buffer.
const windowsTestMemorySafetyBuffer int64 = 256 << 20 // 256 MiB
|
|
|
|
var (
|
|
podOverheadMemoryOnce sync.Once
|
|
podOverheadMemoryCache int64
|
|
podOverheadMemoryErr error
|
|
)
|
|
|
|
// detectPodOverheadMemory returns the per-pod memory overhead the cluster's
|
|
// admission chain injects (Pod Overhead, KEP-688), in bytes, or 0 if no
|
|
// overhead is applied. The error is non-nil only when the probe itself
|
|
// could not be completed (e.g., the API server rejected the DryRun create);
|
|
// a successful probe with no overhead returns (0, nil).
|
|
//
|
|
// Result and error are cached for the lifetime of the test process — the
|
|
// admission chain is traversed at most once per run. See
|
|
// computePodOverheadMemory for probe details.
|
|
func detectPodOverheadMemory(ctx context.Context, c clientset.Interface, namespace string) (int64, error) {
|
|
podOverheadMemoryOnce.Do(func() {
|
|
podOverheadMemoryCache, podOverheadMemoryErr = computePodOverheadMemory(ctx, c, namespace)
|
|
})
|
|
return podOverheadMemoryCache, podOverheadMemoryErr
|
|
}
|
|
|
|
// computePodOverheadMemory performs a single DryRun pod create and inspects
|
|
// the mutated result for pod.Spec.Overhead. DryRun (rather than reading
|
|
// RuntimeClass directly) is needed because admission webhooks may inject
|
|
// overhead conditionally on namespace, labels, or other request-scoped data
|
|
// that is not visible from a static API read.
|
|
//
|
|
// Assumes overhead is RuntimeClass-driven (or otherwise static per pod) and
|
|
// does not vary with container count, image, or per-container requests; a
|
|
// future webhook that scaled overhead by pod shape would invalidate the
|
|
// capacity calculations in this package. Callers must use a namespace that
|
|
// admits a pause-image pod (current callers set PodSecurity LevelPrivileged).
|
|
func computePodOverheadMemory(ctx context.Context, c clientset.Interface, namespace string) (int64, error) {
|
|
probePod := &v1.Pod{
|
|
ObjectMeta: metav1.ObjectMeta{
|
|
Name: "overhead-probe-" + string(uuid.NewUUID()),
|
|
},
|
|
Spec: v1.PodSpec{
|
|
Containers: []v1.Container{{
|
|
Name: "probe",
|
|
Image: imageutils.GetPauseImageName(),
|
|
}},
|
|
NodeSelector: map[string]string{
|
|
"kubernetes.io/os": "windows",
|
|
},
|
|
},
|
|
}
|
|
result, err := c.CoreV1().Pods(namespace).Create(
|
|
ctx, probePod, metav1.CreateOptions{DryRun: []string{metav1.DryRunAll}},
|
|
)
|
|
if err != nil {
|
|
return 0, fmt.Errorf("DryRun pod create for overhead detection failed: %w", err)
|
|
}
|
|
if result.Spec.Overhead == nil {
|
|
return 0, nil
|
|
}
|
|
if mem, ok := result.Spec.Overhead[v1.ResourceMemory]; ok {
|
|
framework.Logf("Detected pod overhead memory: %d bytes (per Pod Overhead admission, KEP-688)", mem.Value())
|
|
return mem.Value(), nil
|
|
}
|
|
return 0, nil
|
|
}
|
|
|
|
// waitForNodeMemoryToSettle polls until the node has at least neededBytes
|
|
// of free admittable memory (allocatable - sum of admitted-pod requests),
|
|
// or the 90s timeout elapses. Useful in [Serial] tests that follow one
|
|
// which leaves large pods Terminating — their requests stay counted until
|
|
// fully removed.
|
|
//
|
|
// On timeout the function does not fail the test; it logs a tagged
|
|
// "did NOT settle" message so a downstream OutOfmemory failure points
|
|
// back at the cause (leftover pods) rather than appearing as an
|
|
// unexplained scheduling error.
|
|
func waitForNodeMemoryToSettle(ctx context.Context, c clientset.Interface, nodeName string, neededBytes int64) {
|
|
node, err := c.CoreV1().Nodes().Get(ctx, nodeName, metav1.GetOptions{})
|
|
if err != nil {
|
|
framework.Logf("waitForNodeMemoryToSettle: cannot get node %s: %v (skipping wait)", nodeName, err)
|
|
return
|
|
}
|
|
allocatable := node.Status.Allocatable.Memory().Value()
|
|
var lastFree int64
|
|
pollErr := wait.PollUntilContextTimeout(ctx, 5*time.Second, 90*time.Second, true, func(ctx context.Context) (bool, error) {
|
|
existing := sumExistingPodMemoryReservation(ctx, c, nodeName)
|
|
lastFree = allocatable - existing
|
|
framework.Logf("Waiting for node %s memory to settle: existing-reservation=%d free=%d (need >= %d)",
|
|
nodeName, existing, lastFree, neededBytes)
|
|
return lastFree >= neededBytes, nil
|
|
})
|
|
if pollErr == nil {
|
|
framework.Logf("Node %s memory settled: free=%d (need >= %d)", nodeName, lastFree, neededBytes)
|
|
return
|
|
}
|
|
framework.Logf("Node %s memory did NOT settle within 90s: free=%d (need >= %d): %v "+
|
|
"(test will likely fail with OutOfmemory; check for leftover Terminating pods)",
|
|
nodeName, lastFree, neededBytes, pollErr)
|
|
}
|
|
|
|
// sumExistingPodMemoryReservation returns the total memory reservation
|
|
// (sum of container memory requests + pod overhead) for all non-terminal pods
|
|
// scheduled on the given node. Tests that compute "remaining schedulable
|
|
// memory" (e.g., the Memory Limits test) must subtract this from
|
|
// allocatable to leave room for DaemonSets and other system pods.
|
|
func sumExistingPodMemoryReservation(ctx context.Context, c clientset.Interface, nodeName string) int64 {
|
|
podList, err := c.CoreV1().Pods("").List(ctx, metav1.ListOptions{
|
|
FieldSelector: "spec.nodeName=" + nodeName,
|
|
})
|
|
if err != nil {
|
|
framework.Logf("Could not list pods on node %s: %v (assuming 0 reservation)", nodeName, err)
|
|
return 0
|
|
}
|
|
var total int64
|
|
for _, p := range podList.Items {
|
|
if p.Status.Phase == v1.PodSucceeded || p.Status.Phase == v1.PodFailed {
|
|
continue
|
|
}
|
|
for _, c := range p.Spec.Containers {
|
|
if mem, ok := c.Resources.Requests[v1.ResourceMemory]; ok {
|
|
total += mem.Value()
|
|
}
|
|
}
|
|
if p.Spec.Overhead != nil {
|
|
if oh, ok := p.Spec.Overhead[v1.ResourceMemory]; ok {
|
|
total += oh.Value()
|
|
}
|
|
}
|
|
}
|
|
return total
|
|
}
|