Merge pull request #135007 from ania-borowiec/nnn_code_update

KEP-5278 Bring back clearing NominatedNodeName after scheduling or binding failure
This commit is contained in:
Kubernetes Prow Robot 2025-11-04 07:14:24 -08:00 committed by GitHub
commit f09cd625bc
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 31 additions and 85 deletions

View file

@ -136,15 +136,7 @@ func (sched *Scheduler) ScheduleOne(ctx context.Context) {
}()
}
// newFailureNominatingInfo returns the appropriate NominatingInfo for scheduling failures.
// When NominatedNodeNameForExpectation feature is enabled, it returns nil (no clearing).
// Otherwise, it returns NominatingInfo to clear the pod's nominated node.
func (sched *Scheduler) newFailureNominatingInfo() *fwk.NominatingInfo {
if sched.nominatedNodeNameForExpectationEnabled {
return nil
}
return &fwk.NominatingInfo{NominatingMode: fwk.ModeOverride, NominatedNodeName: ""}
}
var clearNominatedNode = &fwk.NominatingInfo{NominatingMode: fwk.ModeOverride, NominatedNodeName: ""}
// schedulingCycle tries to schedule a single Pod.
func (sched *Scheduler) schedulingCycle(
@ -164,13 +156,13 @@ func (sched *Scheduler) schedulingCycle(
}()
if err == ErrNoNodesAvailable {
status := fwk.NewStatus(fwk.UnschedulableAndUnresolvable).WithError(err)
return ScheduleResult{nominatingInfo: sched.newFailureNominatingInfo()}, podInfo, status
return ScheduleResult{nominatingInfo: clearNominatedNode}, podInfo, status
}
fitError, ok := err.(*framework.FitError)
if !ok {
logger.Error(err, "Error selecting node for pod", "pod", klog.KObj(pod))
return ScheduleResult{nominatingInfo: sched.newFailureNominatingInfo()}, podInfo, fwk.AsStatus(err)
return ScheduleResult{nominatingInfo: clearNominatedNode}, podInfo, fwk.AsStatus(err)
}
// SchedulePod() may have failed because the pod would not fit on any host, so we try to
@ -180,7 +172,7 @@ func (sched *Scheduler) schedulingCycle(
if !schedFramework.HasPostFilterPlugins() {
logger.V(3).Info("No PostFilter plugins are registered, so no preemption will be performed")
return ScheduleResult{}, podInfo, fwk.NewStatus(fwk.Unschedulable).WithError(err)
return ScheduleResult{nominatingInfo: clearNominatedNode}, podInfo, fwk.NewStatus(fwk.Unschedulable).WithError(err)
}
// Run PostFilter plugins to attempt to make the pod schedulable in a future scheduling cycle.
@ -213,7 +205,7 @@ func (sched *Scheduler) schedulingCycle(
// This relies on the fact that Error will check if the pod has been bound
// to a node and if so will not add it back to the unscheduled pods queue
// (otherwise this would cause an infinite loop).
return ScheduleResult{nominatingInfo: sched.newFailureNominatingInfo()}, assumedPodInfo, fwk.AsStatus(err)
return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, fwk.AsStatus(err)
}
// Run the Reserve method of reserve plugins.
@ -234,9 +226,9 @@ func (sched *Scheduler) schedulingCycle(
}
fitErr.Diagnosis.NodeToStatus.Set(scheduleResult.SuggestedHost, sts)
fitErr.Diagnosis.AddPluginStatus(sts)
return ScheduleResult{nominatingInfo: sched.newFailureNominatingInfo()}, assumedPodInfo, fwk.NewStatus(sts.Code()).WithError(fitErr)
return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, fwk.NewStatus(sts.Code()).WithError(fitErr)
}
return ScheduleResult{nominatingInfo: sched.newFailureNominatingInfo()}, assumedPodInfo, sts
return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, sts
}
// Run "permit" plugins.
@ -258,10 +250,10 @@ func (sched *Scheduler) schedulingCycle(
}
fitErr.Diagnosis.NodeToStatus.Set(scheduleResult.SuggestedHost, runPermitStatus)
fitErr.Diagnosis.AddPluginStatus(runPermitStatus)
return ScheduleResult{nominatingInfo: sched.newFailureNominatingInfo()}, assumedPodInfo, fwk.NewStatus(runPermitStatus.Code()).WithError(fitErr)
return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, fwk.NewStatus(runPermitStatus.Code()).WithError(fitErr)
}
return ScheduleResult{nominatingInfo: sched.newFailureNominatingInfo()}, assumedPodInfo, runPermitStatus
return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, runPermitStatus
}
// At the end of a successful scheduling cycle, pop and move up Pods if needed.
@ -393,7 +385,7 @@ func (sched *Scheduler) handleBindingCycleError(
}
}
sched.FailureHandler(ctx, fwk, podInfo, status, sched.newFailureNominatingInfo(), start)
sched.FailureHandler(ctx, fwk, podInfo, status, clearNominatedNode, start)
}
func (sched *Scheduler) frameworkForPod(pod *v1.Pod) (framework.Framework, error) {

View file

@ -916,33 +916,6 @@ func TestSchedulerScheduleOne(t *testing.T) {
mockScheduleResult: emptyScheduleResult,
eventReason: "FailedScheduling",
},
{
name: "pod with existing nominated node name on scheduling error keeps nomination",
sendPod: func() *v1.Pod {
p := podWithID("foo", "")
p.Status.NominatedNodeName = "existing-node"
return p
}(),
injectSchedulingError: schedulingErr,
mockScheduleResult: scheduleResultOk,
expectError: schedulingErr,
expectErrorPod: func() *v1.Pod {
p := podWithID("foo", "")
p.Status.NominatedNodeName = "existing-node"
return p
}(),
expectPodInBackoffQ: func() *v1.Pod {
p := podWithID("foo", "")
p.Status.NominatedNodeName = "existing-node"
return p
}(),
// Depending on the timing, if asyncAPICallsEnabled, the NNN update might not be sent yet while checking the expectNominatedNodeName.
// So, asyncAPICallsEnabled is set to false.
asyncAPICallsEnabled: ptr.To(false),
nominatedNodeNameForExpectationEnabled: ptr.To(true),
expectNominatedNodeName: "existing-node",
eventReason: "FailedScheduling",
},
{
name: "pod with existing nominated node name on scheduling error clears nomination",
sendPod: func() *v1.Pod {
@ -965,9 +938,9 @@ func TestSchedulerScheduleOne(t *testing.T) {
}(),
// Depending on the timing, if asyncAPICallsEnabled, the NNN update might not be sent yet while checking the expectNominatedNodeName.
// So, asyncAPICallsEnabled is set to false.
asyncAPICallsEnabled: ptr.To(false),
nominatedNodeNameForExpectationEnabled: ptr.To(false),
eventReason: "FailedScheduling",
asyncAPICallsEnabled: ptr.To(false),
expectNominatedNodeName: "",
eventReason: "FailedScheduling",
},
}
@ -986,7 +959,7 @@ func TestSchedulerScheduleOne(t *testing.T) {
for _, nominatedNodeNameForExpectationEnabled := range nominatedNodeNameForExpectationEnabled {
if (asyncAPICallsEnabled || nominatedNodeNameForExpectationEnabled) && !qHintEnabled {
// If the QHint feature gate is disabled, NominatedNodeNameForExpectation and SchedulerAsyncAPICalls cannot be enabled
// because that means users set the emilation version to 1.33 or later.
// because that means users set the emulation version to 1.33 or later.
continue
}
t.Run(fmt.Sprintf("%s (Queueing hints enabled: %v, Async API calls enabled: %v, NominatedNodeNameForExpectation enabled: %v)", item.name, qHintEnabled, asyncAPICallsEnabled, nominatedNodeNameForExpectationEnabled), func(t *testing.T) {
@ -1157,11 +1130,7 @@ func TestSchedulerScheduleOne(t *testing.T) {
t.Errorf("Unexpected error. Wanted %v, got %v", item.expectError.Error(), gotError.Error())
}
if item.expectError != nil {
var expectedNominatingInfo *fwk.NominatingInfo
// Check nominatingInfo expectation based on feature gate
if !nominatedNodeNameForExpectationEnabled {
expectedNominatingInfo = &fwk.NominatingInfo{NominatingMode: fwk.ModeOverride, NominatedNodeName: ""}
}
expectedNominatingInfo := &fwk.NominatingInfo{NominatingMode: fwk.ModeOverride, NominatedNodeName: ""}
if diff := cmp.Diff(expectedNominatingInfo, gotNominatingInfo); diff != "" {
t.Errorf("Unexpected nominatingInfo (-want,+got):\n%s", diff)
}

View file

@ -94,15 +94,12 @@ func TestNominatedNode(t *testing.T) {
nodeCapacity map[v1.ResourceName]string
// A slice of pods to be created in batch.
podsToCreate [][]*v1.Pod
// Each postCheck function is run after each batch of pods' creation.
// Each postCheck function is run after creating the corresponding batch of pods. The check is run for each pod in the batch.
postChecks []func(ctx context.Context, cs clientset.Interface, pod *v1.Pod) error
// Delete the fake node or not. Optional.
deleteFakeNode bool
// Pods to be deleted. Optional.
podNamesToDelete []string
// Whether NominatedNodeName will be always nil at the end of the test,
// regardless of the NominatedNodeNameForExpectation feature gate state.
expectNilNominatedNodeName bool
// Register dummy plugin to simulate particular scheduling failures. Optional.
customPlugins *configv1.Plugins
@ -127,11 +124,12 @@ func TestNominatedNode(t *testing.T) {
},
postChecks: []func(ctx context.Context, cs clientset.Interface, pod *v1.Pod) error{
testutils.WaitForPodToSchedule,
// Expect NNN to be set on "medium" pod after starting preemption.
testutils.WaitForNominatedNodeName,
// Expect NNN to be set on "high" pod after starting preemption.
testutils.WaitForNominatedNodeName,
},
podNamesToDelete: []string{"low-1", "low-2", "low-3", "low-4"},
expectNilNominatedNodeName: true,
podNamesToDelete: []string{"low-1", "low-2", "low-3", "low-4"},
},
{
name: "mid-priority pod preempts low-priority pod, followed by a high-priority pod without additional preemption",
@ -149,7 +147,9 @@ func TestNominatedNode(t *testing.T) {
},
postChecks: []func(ctx context.Context, cs clientset.Interface, pod *v1.Pod) error{
testutils.WaitForPodToSchedule,
// Expect NNN to be set on "medium" pod after starting preemption.
testutils.WaitForNominatedNodeName,
// Expect "high" pod to get scheduled before "medium" re-enters the scheduling cycle.
testutils.WaitForPodToSchedule,
},
podNamesToDelete: []string{"low"},
@ -205,7 +205,7 @@ func TestNominatedNode(t *testing.T) {
podNamesToDelete: []string{"low"},
},
{
name: "mid-priority pod preempts low-priority pod, but failed the scheduling unexpectedly",
name: "mid-priority pod preempts low-priority pod, but failed on PreBind",
nodeCapacity: map[v1.ResourceName]string{v1.ResourceCPU: "1"},
podsToCreate: [][]*v1.Pod{
{
@ -233,7 +233,7 @@ func TestNominatedNode(t *testing.T) {
for _, asyncPreemptionEnabled := range []bool{true, false} {
for _, asyncAPICallsEnabled := range []bool{true, false} {
for _, nominatedNodeNameForExpectationEnabled := range []bool{false} {
for _, nominatedNodeNameForExpectationEnabled := range []bool{true, false} {
for _, tt := range tests {
t.Run(fmt.Sprintf("%s (Async preemption: %v, Async API calls: %v, NNN for expectation: %v)", tt.name, asyncPreemptionEnabled, asyncAPICallsEnabled, nominatedNodeNameForExpectationEnabled), func(t *testing.T) {
featuregatetesting.SetFeatureGatesDuringTest(t, utilfeature.DefaultFeatureGate, featuregatetesting.FeatureOverrides{
@ -301,33 +301,18 @@ func TestNominatedNode(t *testing.T) {
}
}
if nominatedNodeNameForExpectationEnabled && !tt.expectNilNominatedNodeName {
// Verify if .status.nominatedNodeName is not cleared when NominatedNodeNameForExpectation is enabled.
// Wait for 1 second to make sure the pod is re-processed, what would potentially clear the NominatedNodeName (when the feature won't work).
select {
case <-time.After(time.Second):
case <-testCtx.Ctx.Done():
}
pod, err := cs.CoreV1().Pods(ns).Get(testCtx.Ctx, "medium", metav1.GetOptions{})
// Verify if .status.nominatedNodeName is cleared.
if err := wait.PollUntilContextTimeout(testCtx.Ctx, 100*time.Millisecond, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) {
pod, err := cs.CoreV1().Pods(ns).Get(ctx, "medium", metav1.GetOptions{})
if err != nil {
t.Errorf("Error getting the medium pod: %v", err)
} else if len(pod.Status.NominatedNodeName) == 0 {
t.Errorf(".status.nominatedNodeName of the medium pod was cleared: %v", err)
}
} else {
// Verify if .status.nominatedNodeName is cleared when NominatedNodeNameForExpectation is disabled.
if err := wait.PollUntilContextTimeout(testCtx.Ctx, 100*time.Millisecond, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) {
pod, err := cs.CoreV1().Pods(ns).Get(ctx, "medium", metav1.GetOptions{})
if err != nil {
t.Errorf("Error getting the medium pod: %v", err)
}
if len(pod.Status.NominatedNodeName) == 0 {
return true, nil
}
return false, err
}); err != nil {
t.Errorf(".status.nominatedNodeName of the medium pod was not cleared: %v", err)
if len(pod.Status.NominatedNodeName) == 0 {
return true, nil
}
return false, err
}); err != nil {
t.Errorf(".status.nominatedNodeName of the medium pod was not cleared: %v", err)
}
})
}