From 81448febcf409d349a2f3143d90231b5f6fd5375 Mon Sep 17 00:00:00 2001 From: Ania Borowiec Date: Fri, 31 Oct 2025 11:41:38 +0000 Subject: [PATCH 1/2] KEP-5278 Clear NominatedNodeName upon scheduling failure --- pkg/scheduler/schedule_one.go | 28 ++++++++------------ pkg/scheduler/schedule_one_test.go | 41 ++++-------------------------- 2 files changed, 15 insertions(+), 54 deletions(-) diff --git a/pkg/scheduler/schedule_one.go b/pkg/scheduler/schedule_one.go index c39ce0b1fd3..feac13fc289 100644 --- a/pkg/scheduler/schedule_one.go +++ b/pkg/scheduler/schedule_one.go @@ -136,15 +136,7 @@ func (sched *Scheduler) ScheduleOne(ctx context.Context) { }() } -// newFailureNominatingInfo returns the appropriate NominatingInfo for scheduling failures. -// When NominatedNodeNameForExpectation feature is enabled, it returns nil (no clearing). -// Otherwise, it returns NominatingInfo to clear the pod's nominated node. -func (sched *Scheduler) newFailureNominatingInfo() *fwk.NominatingInfo { - if sched.nominatedNodeNameForExpectationEnabled { - return nil - } - return &fwk.NominatingInfo{NominatingMode: fwk.ModeOverride, NominatedNodeName: ""} -} +var clearNominatedNode = &fwk.NominatingInfo{NominatingMode: fwk.ModeOverride, NominatedNodeName: ""} // schedulingCycle tries to schedule a single Pod. func (sched *Scheduler) schedulingCycle( @@ -164,13 +156,13 @@ func (sched *Scheduler) schedulingCycle( }() if err == ErrNoNodesAvailable { status := fwk.NewStatus(fwk.UnschedulableAndUnresolvable).WithError(err) - return ScheduleResult{nominatingInfo: sched.newFailureNominatingInfo()}, podInfo, status + return ScheduleResult{nominatingInfo: clearNominatedNode}, podInfo, status } fitError, ok := err.(*framework.FitError) if !ok { logger.Error(err, "Error selecting node for pod", "pod", klog.KObj(pod)) - return ScheduleResult{nominatingInfo: sched.newFailureNominatingInfo()}, podInfo, fwk.AsStatus(err) + return ScheduleResult{nominatingInfo: clearNominatedNode}, podInfo, fwk.AsStatus(err) } // SchedulePod() may have failed because the pod would not fit on any host, so we try to @@ -180,7 +172,7 @@ func (sched *Scheduler) schedulingCycle( if !schedFramework.HasPostFilterPlugins() { logger.V(3).Info("No PostFilter plugins are registered, so no preemption will be performed") - return ScheduleResult{}, podInfo, fwk.NewStatus(fwk.Unschedulable).WithError(err) + return ScheduleResult{nominatingInfo: clearNominatedNode}, podInfo, fwk.NewStatus(fwk.Unschedulable).WithError(err) } // Run PostFilter plugins to attempt to make the pod schedulable in a future scheduling cycle. @@ -213,7 +205,7 @@ func (sched *Scheduler) schedulingCycle( // This relies on the fact that Error will check if the pod has been bound // to a node and if so will not add it back to the unscheduled pods queue // (otherwise this would cause an infinite loop). - return ScheduleResult{nominatingInfo: sched.newFailureNominatingInfo()}, assumedPodInfo, fwk.AsStatus(err) + return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, fwk.AsStatus(err) } // Run the Reserve method of reserve plugins. @@ -234,9 +226,9 @@ func (sched *Scheduler) schedulingCycle( } fitErr.Diagnosis.NodeToStatus.Set(scheduleResult.SuggestedHost, sts) fitErr.Diagnosis.AddPluginStatus(sts) - return ScheduleResult{nominatingInfo: sched.newFailureNominatingInfo()}, assumedPodInfo, fwk.NewStatus(sts.Code()).WithError(fitErr) + return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, fwk.NewStatus(sts.Code()).WithError(fitErr) } - return ScheduleResult{nominatingInfo: sched.newFailureNominatingInfo()}, assumedPodInfo, sts + return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, sts } // Run "permit" plugins. @@ -258,10 +250,10 @@ func (sched *Scheduler) schedulingCycle( } fitErr.Diagnosis.NodeToStatus.Set(scheduleResult.SuggestedHost, runPermitStatus) fitErr.Diagnosis.AddPluginStatus(runPermitStatus) - return ScheduleResult{nominatingInfo: sched.newFailureNominatingInfo()}, assumedPodInfo, fwk.NewStatus(runPermitStatus.Code()).WithError(fitErr) + return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, fwk.NewStatus(runPermitStatus.Code()).WithError(fitErr) } - return ScheduleResult{nominatingInfo: sched.newFailureNominatingInfo()}, assumedPodInfo, runPermitStatus + return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, runPermitStatus } // At the end of a successful scheduling cycle, pop and move up Pods if needed. @@ -393,7 +385,7 @@ func (sched *Scheduler) handleBindingCycleError( } } - sched.FailureHandler(ctx, fwk, podInfo, status, sched.newFailureNominatingInfo(), start) + sched.FailureHandler(ctx, fwk, podInfo, status, clearNominatedNode, start) } func (sched *Scheduler) frameworkForPod(pod *v1.Pod) (framework.Framework, error) { diff --git a/pkg/scheduler/schedule_one_test.go b/pkg/scheduler/schedule_one_test.go index ce4de2043a6..c4546707b98 100644 --- a/pkg/scheduler/schedule_one_test.go +++ b/pkg/scheduler/schedule_one_test.go @@ -916,33 +916,6 @@ func TestSchedulerScheduleOne(t *testing.T) { mockScheduleResult: emptyScheduleResult, eventReason: "FailedScheduling", }, - { - name: "pod with existing nominated node name on scheduling error keeps nomination", - sendPod: func() *v1.Pod { - p := podWithID("foo", "") - p.Status.NominatedNodeName = "existing-node" - return p - }(), - injectSchedulingError: schedulingErr, - mockScheduleResult: scheduleResultOk, - expectError: schedulingErr, - expectErrorPod: func() *v1.Pod { - p := podWithID("foo", "") - p.Status.NominatedNodeName = "existing-node" - return p - }(), - expectPodInBackoffQ: func() *v1.Pod { - p := podWithID("foo", "") - p.Status.NominatedNodeName = "existing-node" - return p - }(), - // Depending on the timing, if asyncAPICallsEnabled, the NNN update might not be sent yet while checking the expectNominatedNodeName. - // So, asyncAPICallsEnabled is set to false. - asyncAPICallsEnabled: ptr.To(false), - nominatedNodeNameForExpectationEnabled: ptr.To(true), - expectNominatedNodeName: "existing-node", - eventReason: "FailedScheduling", - }, { name: "pod with existing nominated node name on scheduling error clears nomination", sendPod: func() *v1.Pod { @@ -965,9 +938,9 @@ func TestSchedulerScheduleOne(t *testing.T) { }(), // Depending on the timing, if asyncAPICallsEnabled, the NNN update might not be sent yet while checking the expectNominatedNodeName. // So, asyncAPICallsEnabled is set to false. - asyncAPICallsEnabled: ptr.To(false), - nominatedNodeNameForExpectationEnabled: ptr.To(false), - eventReason: "FailedScheduling", + asyncAPICallsEnabled: ptr.To(false), + expectNominatedNodeName: "", + eventReason: "FailedScheduling", }, } @@ -986,7 +959,7 @@ func TestSchedulerScheduleOne(t *testing.T) { for _, nominatedNodeNameForExpectationEnabled := range nominatedNodeNameForExpectationEnabled { if (asyncAPICallsEnabled || nominatedNodeNameForExpectationEnabled) && !qHintEnabled { // If the QHint feature gate is disabled, NominatedNodeNameForExpectation and SchedulerAsyncAPICalls cannot be enabled - // because that means users set the emilation version to 1.33 or later. + // because that means users set the emulation version to 1.33 or later. continue } t.Run(fmt.Sprintf("%s (Queueing hints enabled: %v, Async API calls enabled: %v, NominatedNodeNameForExpectation enabled: %v)", item.name, qHintEnabled, asyncAPICallsEnabled, nominatedNodeNameForExpectationEnabled), func(t *testing.T) { @@ -1157,11 +1130,7 @@ func TestSchedulerScheduleOne(t *testing.T) { t.Errorf("Unexpected error. Wanted %v, got %v", item.expectError.Error(), gotError.Error()) } if item.expectError != nil { - var expectedNominatingInfo *fwk.NominatingInfo - // Check nominatingInfo expectation based on feature gate - if !nominatedNodeNameForExpectationEnabled { - expectedNominatingInfo = &fwk.NominatingInfo{NominatingMode: fwk.ModeOverride, NominatedNodeName: ""} - } + expectedNominatingInfo := &fwk.NominatingInfo{NominatingMode: fwk.ModeOverride, NominatedNodeName: ""} if diff := cmp.Diff(expectedNominatingInfo, gotNominatingInfo); diff != "" { t.Errorf("Unexpected nominatingInfo (-want,+got):\n%s", diff) } From c98804b77f3bf5ef2c9df7b956e8fbd7f6670027 Mon Sep 17 00:00:00 2001 From: Ania Borowiec Date: Fri, 31 Oct 2025 13:13:01 +0000 Subject: [PATCH 2/2] Fix existing integration tests for NominatedNodeName --- .../nominatednodename_test.go | 47 +++++++------------ 1 file changed, 16 insertions(+), 31 deletions(-) diff --git a/test/integration/scheduler/preemption/nominatednodename/nominatednodename_test.go b/test/integration/scheduler/preemption/nominatednodename/nominatednodename_test.go index 768e89e9507..917666d9a10 100644 --- a/test/integration/scheduler/preemption/nominatednodename/nominatednodename_test.go +++ b/test/integration/scheduler/preemption/nominatednodename/nominatednodename_test.go @@ -94,15 +94,12 @@ func TestNominatedNode(t *testing.T) { nodeCapacity map[v1.ResourceName]string // A slice of pods to be created in batch. podsToCreate [][]*v1.Pod - // Each postCheck function is run after each batch of pods' creation. + // Each postCheck function is run after creating the corresponding batch of pods. The check is run for each pod in the batch. postChecks []func(ctx context.Context, cs clientset.Interface, pod *v1.Pod) error // Delete the fake node or not. Optional. deleteFakeNode bool // Pods to be deleted. Optional. podNamesToDelete []string - // Whether NominatedNodeName will be always nil at the end of the test, - // regardless of the NominatedNodeNameForExpectation feature gate state. - expectNilNominatedNodeName bool // Register dummy plugin to simulate particular scheduling failures. Optional. customPlugins *configv1.Plugins @@ -127,11 +124,12 @@ func TestNominatedNode(t *testing.T) { }, postChecks: []func(ctx context.Context, cs clientset.Interface, pod *v1.Pod) error{ testutils.WaitForPodToSchedule, + // Expect NNN to be set on "medium" pod after starting preemption. testutils.WaitForNominatedNodeName, + // Expect NNN to be set on "high" pod after starting preemption. testutils.WaitForNominatedNodeName, }, - podNamesToDelete: []string{"low-1", "low-2", "low-3", "low-4"}, - expectNilNominatedNodeName: true, + podNamesToDelete: []string{"low-1", "low-2", "low-3", "low-4"}, }, { name: "mid-priority pod preempts low-priority pod, followed by a high-priority pod without additional preemption", @@ -149,7 +147,9 @@ func TestNominatedNode(t *testing.T) { }, postChecks: []func(ctx context.Context, cs clientset.Interface, pod *v1.Pod) error{ testutils.WaitForPodToSchedule, + // Expect NNN to be set on "medium" pod after starting preemption. testutils.WaitForNominatedNodeName, + // Expect "high" pod to get scheduled before "medium" re-enters the scheduling cycle. testutils.WaitForPodToSchedule, }, podNamesToDelete: []string{"low"}, @@ -205,7 +205,7 @@ func TestNominatedNode(t *testing.T) { podNamesToDelete: []string{"low"}, }, { - name: "mid-priority pod preempts low-priority pod, but failed the scheduling unexpectedly", + name: "mid-priority pod preempts low-priority pod, but failed on PreBind", nodeCapacity: map[v1.ResourceName]string{v1.ResourceCPU: "1"}, podsToCreate: [][]*v1.Pod{ { @@ -233,7 +233,7 @@ func TestNominatedNode(t *testing.T) { for _, asyncPreemptionEnabled := range []bool{true, false} { for _, asyncAPICallsEnabled := range []bool{true, false} { - for _, nominatedNodeNameForExpectationEnabled := range []bool{false} { + for _, nominatedNodeNameForExpectationEnabled := range []bool{true, false} { for _, tt := range tests { t.Run(fmt.Sprintf("%s (Async preemption: %v, Async API calls: %v, NNN for expectation: %v)", tt.name, asyncPreemptionEnabled, asyncAPICallsEnabled, nominatedNodeNameForExpectationEnabled), func(t *testing.T) { featuregatetesting.SetFeatureGatesDuringTest(t, utilfeature.DefaultFeatureGate, featuregatetesting.FeatureOverrides{ @@ -301,33 +301,18 @@ func TestNominatedNode(t *testing.T) { } } - if nominatedNodeNameForExpectationEnabled && !tt.expectNilNominatedNodeName { - // Verify if .status.nominatedNodeName is not cleared when NominatedNodeNameForExpectation is enabled. - // Wait for 1 second to make sure the pod is re-processed, what would potentially clear the NominatedNodeName (when the feature won't work). - select { - case <-time.After(time.Second): - case <-testCtx.Ctx.Done(): - } - pod, err := cs.CoreV1().Pods(ns).Get(testCtx.Ctx, "medium", metav1.GetOptions{}) + // Verify if .status.nominatedNodeName is cleared. + if err := wait.PollUntilContextTimeout(testCtx.Ctx, 100*time.Millisecond, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) { + pod, err := cs.CoreV1().Pods(ns).Get(ctx, "medium", metav1.GetOptions{}) if err != nil { t.Errorf("Error getting the medium pod: %v", err) - } else if len(pod.Status.NominatedNodeName) == 0 { - t.Errorf(".status.nominatedNodeName of the medium pod was cleared: %v", err) } - } else { - // Verify if .status.nominatedNodeName is cleared when NominatedNodeNameForExpectation is disabled. - if err := wait.PollUntilContextTimeout(testCtx.Ctx, 100*time.Millisecond, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) { - pod, err := cs.CoreV1().Pods(ns).Get(ctx, "medium", metav1.GetOptions{}) - if err != nil { - t.Errorf("Error getting the medium pod: %v", err) - } - if len(pod.Status.NominatedNodeName) == 0 { - return true, nil - } - return false, err - }); err != nil { - t.Errorf(".status.nominatedNodeName of the medium pod was not cleared: %v", err) + if len(pod.Status.NominatedNodeName) == 0 { + return true, nil } + return false, err + }); err != nil { + t.Errorf(".status.nominatedNodeName of the medium pod was not cleared: %v", err) } }) }