mirror of
https://github.com/kubernetes/kubernetes.git
synced 2026-05-28 04:04:39 -04:00
Merge pull request #135007 from ania-borowiec/nnn_code_update
KEP-5278 Bring back clearing NominatedNodeName after scheduling or binding failure
This commit is contained in:
commit
f09cd625bc
3 changed files with 31 additions and 85 deletions
|
|
@ -136,15 +136,7 @@ func (sched *Scheduler) ScheduleOne(ctx context.Context) {
|
|||
}()
|
||||
}
|
||||
|
||||
// newFailureNominatingInfo returns the appropriate NominatingInfo for scheduling failures.
|
||||
// When NominatedNodeNameForExpectation feature is enabled, it returns nil (no clearing).
|
||||
// Otherwise, it returns NominatingInfo to clear the pod's nominated node.
|
||||
func (sched *Scheduler) newFailureNominatingInfo() *fwk.NominatingInfo {
|
||||
if sched.nominatedNodeNameForExpectationEnabled {
|
||||
return nil
|
||||
}
|
||||
return &fwk.NominatingInfo{NominatingMode: fwk.ModeOverride, NominatedNodeName: ""}
|
||||
}
|
||||
var clearNominatedNode = &fwk.NominatingInfo{NominatingMode: fwk.ModeOverride, NominatedNodeName: ""}
|
||||
|
||||
// schedulingCycle tries to schedule a single Pod.
|
||||
func (sched *Scheduler) schedulingCycle(
|
||||
|
|
@ -164,13 +156,13 @@ func (sched *Scheduler) schedulingCycle(
|
|||
}()
|
||||
if err == ErrNoNodesAvailable {
|
||||
status := fwk.NewStatus(fwk.UnschedulableAndUnresolvable).WithError(err)
|
||||
return ScheduleResult{nominatingInfo: sched.newFailureNominatingInfo()}, podInfo, status
|
||||
return ScheduleResult{nominatingInfo: clearNominatedNode}, podInfo, status
|
||||
}
|
||||
|
||||
fitError, ok := err.(*framework.FitError)
|
||||
if !ok {
|
||||
logger.Error(err, "Error selecting node for pod", "pod", klog.KObj(pod))
|
||||
return ScheduleResult{nominatingInfo: sched.newFailureNominatingInfo()}, podInfo, fwk.AsStatus(err)
|
||||
return ScheduleResult{nominatingInfo: clearNominatedNode}, podInfo, fwk.AsStatus(err)
|
||||
}
|
||||
|
||||
// SchedulePod() may have failed because the pod would not fit on any host, so we try to
|
||||
|
|
@ -180,7 +172,7 @@ func (sched *Scheduler) schedulingCycle(
|
|||
|
||||
if !schedFramework.HasPostFilterPlugins() {
|
||||
logger.V(3).Info("No PostFilter plugins are registered, so no preemption will be performed")
|
||||
return ScheduleResult{}, podInfo, fwk.NewStatus(fwk.Unschedulable).WithError(err)
|
||||
return ScheduleResult{nominatingInfo: clearNominatedNode}, podInfo, fwk.NewStatus(fwk.Unschedulable).WithError(err)
|
||||
}
|
||||
|
||||
// Run PostFilter plugins to attempt to make the pod schedulable in a future scheduling cycle.
|
||||
|
|
@ -213,7 +205,7 @@ func (sched *Scheduler) schedulingCycle(
|
|||
// This relies on the fact that Error will check if the pod has been bound
|
||||
// to a node and if so will not add it back to the unscheduled pods queue
|
||||
// (otherwise this would cause an infinite loop).
|
||||
return ScheduleResult{nominatingInfo: sched.newFailureNominatingInfo()}, assumedPodInfo, fwk.AsStatus(err)
|
||||
return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, fwk.AsStatus(err)
|
||||
}
|
||||
|
||||
// Run the Reserve method of reserve plugins.
|
||||
|
|
@ -234,9 +226,9 @@ func (sched *Scheduler) schedulingCycle(
|
|||
}
|
||||
fitErr.Diagnosis.NodeToStatus.Set(scheduleResult.SuggestedHost, sts)
|
||||
fitErr.Diagnosis.AddPluginStatus(sts)
|
||||
return ScheduleResult{nominatingInfo: sched.newFailureNominatingInfo()}, assumedPodInfo, fwk.NewStatus(sts.Code()).WithError(fitErr)
|
||||
return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, fwk.NewStatus(sts.Code()).WithError(fitErr)
|
||||
}
|
||||
return ScheduleResult{nominatingInfo: sched.newFailureNominatingInfo()}, assumedPodInfo, sts
|
||||
return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, sts
|
||||
}
|
||||
|
||||
// Run "permit" plugins.
|
||||
|
|
@ -258,10 +250,10 @@ func (sched *Scheduler) schedulingCycle(
|
|||
}
|
||||
fitErr.Diagnosis.NodeToStatus.Set(scheduleResult.SuggestedHost, runPermitStatus)
|
||||
fitErr.Diagnosis.AddPluginStatus(runPermitStatus)
|
||||
return ScheduleResult{nominatingInfo: sched.newFailureNominatingInfo()}, assumedPodInfo, fwk.NewStatus(runPermitStatus.Code()).WithError(fitErr)
|
||||
return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, fwk.NewStatus(runPermitStatus.Code()).WithError(fitErr)
|
||||
}
|
||||
|
||||
return ScheduleResult{nominatingInfo: sched.newFailureNominatingInfo()}, assumedPodInfo, runPermitStatus
|
||||
return ScheduleResult{nominatingInfo: clearNominatedNode}, assumedPodInfo, runPermitStatus
|
||||
}
|
||||
|
||||
// At the end of a successful scheduling cycle, pop and move up Pods if needed.
|
||||
|
|
@ -393,7 +385,7 @@ func (sched *Scheduler) handleBindingCycleError(
|
|||
}
|
||||
}
|
||||
|
||||
sched.FailureHandler(ctx, fwk, podInfo, status, sched.newFailureNominatingInfo(), start)
|
||||
sched.FailureHandler(ctx, fwk, podInfo, status, clearNominatedNode, start)
|
||||
}
|
||||
|
||||
func (sched *Scheduler) frameworkForPod(pod *v1.Pod) (framework.Framework, error) {
|
||||
|
|
|
|||
|
|
@ -916,33 +916,6 @@ func TestSchedulerScheduleOne(t *testing.T) {
|
|||
mockScheduleResult: emptyScheduleResult,
|
||||
eventReason: "FailedScheduling",
|
||||
},
|
||||
{
|
||||
name: "pod with existing nominated node name on scheduling error keeps nomination",
|
||||
sendPod: func() *v1.Pod {
|
||||
p := podWithID("foo", "")
|
||||
p.Status.NominatedNodeName = "existing-node"
|
||||
return p
|
||||
}(),
|
||||
injectSchedulingError: schedulingErr,
|
||||
mockScheduleResult: scheduleResultOk,
|
||||
expectError: schedulingErr,
|
||||
expectErrorPod: func() *v1.Pod {
|
||||
p := podWithID("foo", "")
|
||||
p.Status.NominatedNodeName = "existing-node"
|
||||
return p
|
||||
}(),
|
||||
expectPodInBackoffQ: func() *v1.Pod {
|
||||
p := podWithID("foo", "")
|
||||
p.Status.NominatedNodeName = "existing-node"
|
||||
return p
|
||||
}(),
|
||||
// Depending on the timing, if asyncAPICallsEnabled, the NNN update might not be sent yet while checking the expectNominatedNodeName.
|
||||
// So, asyncAPICallsEnabled is set to false.
|
||||
asyncAPICallsEnabled: ptr.To(false),
|
||||
nominatedNodeNameForExpectationEnabled: ptr.To(true),
|
||||
expectNominatedNodeName: "existing-node",
|
||||
eventReason: "FailedScheduling",
|
||||
},
|
||||
{
|
||||
name: "pod with existing nominated node name on scheduling error clears nomination",
|
||||
sendPod: func() *v1.Pod {
|
||||
|
|
@ -965,9 +938,9 @@ func TestSchedulerScheduleOne(t *testing.T) {
|
|||
}(),
|
||||
// Depending on the timing, if asyncAPICallsEnabled, the NNN update might not be sent yet while checking the expectNominatedNodeName.
|
||||
// So, asyncAPICallsEnabled is set to false.
|
||||
asyncAPICallsEnabled: ptr.To(false),
|
||||
nominatedNodeNameForExpectationEnabled: ptr.To(false),
|
||||
eventReason: "FailedScheduling",
|
||||
asyncAPICallsEnabled: ptr.To(false),
|
||||
expectNominatedNodeName: "",
|
||||
eventReason: "FailedScheduling",
|
||||
},
|
||||
}
|
||||
|
||||
|
|
@ -986,7 +959,7 @@ func TestSchedulerScheduleOne(t *testing.T) {
|
|||
for _, nominatedNodeNameForExpectationEnabled := range nominatedNodeNameForExpectationEnabled {
|
||||
if (asyncAPICallsEnabled || nominatedNodeNameForExpectationEnabled) && !qHintEnabled {
|
||||
// If the QHint feature gate is disabled, NominatedNodeNameForExpectation and SchedulerAsyncAPICalls cannot be enabled
|
||||
// because that means users set the emilation version to 1.33 or later.
|
||||
// because that means users set the emulation version to 1.33 or later.
|
||||
continue
|
||||
}
|
||||
t.Run(fmt.Sprintf("%s (Queueing hints enabled: %v, Async API calls enabled: %v, NominatedNodeNameForExpectation enabled: %v)", item.name, qHintEnabled, asyncAPICallsEnabled, nominatedNodeNameForExpectationEnabled), func(t *testing.T) {
|
||||
|
|
@ -1157,11 +1130,7 @@ func TestSchedulerScheduleOne(t *testing.T) {
|
|||
t.Errorf("Unexpected error. Wanted %v, got %v", item.expectError.Error(), gotError.Error())
|
||||
}
|
||||
if item.expectError != nil {
|
||||
var expectedNominatingInfo *fwk.NominatingInfo
|
||||
// Check nominatingInfo expectation based on feature gate
|
||||
if !nominatedNodeNameForExpectationEnabled {
|
||||
expectedNominatingInfo = &fwk.NominatingInfo{NominatingMode: fwk.ModeOverride, NominatedNodeName: ""}
|
||||
}
|
||||
expectedNominatingInfo := &fwk.NominatingInfo{NominatingMode: fwk.ModeOverride, NominatedNodeName: ""}
|
||||
if diff := cmp.Diff(expectedNominatingInfo, gotNominatingInfo); diff != "" {
|
||||
t.Errorf("Unexpected nominatingInfo (-want,+got):\n%s", diff)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -94,15 +94,12 @@ func TestNominatedNode(t *testing.T) {
|
|||
nodeCapacity map[v1.ResourceName]string
|
||||
// A slice of pods to be created in batch.
|
||||
podsToCreate [][]*v1.Pod
|
||||
// Each postCheck function is run after each batch of pods' creation.
|
||||
// Each postCheck function is run after creating the corresponding batch of pods. The check is run for each pod in the batch.
|
||||
postChecks []func(ctx context.Context, cs clientset.Interface, pod *v1.Pod) error
|
||||
// Delete the fake node or not. Optional.
|
||||
deleteFakeNode bool
|
||||
// Pods to be deleted. Optional.
|
||||
podNamesToDelete []string
|
||||
// Whether NominatedNodeName will be always nil at the end of the test,
|
||||
// regardless of the NominatedNodeNameForExpectation feature gate state.
|
||||
expectNilNominatedNodeName bool
|
||||
|
||||
// Register dummy plugin to simulate particular scheduling failures. Optional.
|
||||
customPlugins *configv1.Plugins
|
||||
|
|
@ -127,11 +124,12 @@ func TestNominatedNode(t *testing.T) {
|
|||
},
|
||||
postChecks: []func(ctx context.Context, cs clientset.Interface, pod *v1.Pod) error{
|
||||
testutils.WaitForPodToSchedule,
|
||||
// Expect NNN to be set on "medium" pod after starting preemption.
|
||||
testutils.WaitForNominatedNodeName,
|
||||
// Expect NNN to be set on "high" pod after starting preemption.
|
||||
testutils.WaitForNominatedNodeName,
|
||||
},
|
||||
podNamesToDelete: []string{"low-1", "low-2", "low-3", "low-4"},
|
||||
expectNilNominatedNodeName: true,
|
||||
podNamesToDelete: []string{"low-1", "low-2", "low-3", "low-4"},
|
||||
},
|
||||
{
|
||||
name: "mid-priority pod preempts low-priority pod, followed by a high-priority pod without additional preemption",
|
||||
|
|
@ -149,7 +147,9 @@ func TestNominatedNode(t *testing.T) {
|
|||
},
|
||||
postChecks: []func(ctx context.Context, cs clientset.Interface, pod *v1.Pod) error{
|
||||
testutils.WaitForPodToSchedule,
|
||||
// Expect NNN to be set on "medium" pod after starting preemption.
|
||||
testutils.WaitForNominatedNodeName,
|
||||
// Expect "high" pod to get scheduled before "medium" re-enters the scheduling cycle.
|
||||
testutils.WaitForPodToSchedule,
|
||||
},
|
||||
podNamesToDelete: []string{"low"},
|
||||
|
|
@ -205,7 +205,7 @@ func TestNominatedNode(t *testing.T) {
|
|||
podNamesToDelete: []string{"low"},
|
||||
},
|
||||
{
|
||||
name: "mid-priority pod preempts low-priority pod, but failed the scheduling unexpectedly",
|
||||
name: "mid-priority pod preempts low-priority pod, but failed on PreBind",
|
||||
nodeCapacity: map[v1.ResourceName]string{v1.ResourceCPU: "1"},
|
||||
podsToCreate: [][]*v1.Pod{
|
||||
{
|
||||
|
|
@ -233,7 +233,7 @@ func TestNominatedNode(t *testing.T) {
|
|||
|
||||
for _, asyncPreemptionEnabled := range []bool{true, false} {
|
||||
for _, asyncAPICallsEnabled := range []bool{true, false} {
|
||||
for _, nominatedNodeNameForExpectationEnabled := range []bool{false} {
|
||||
for _, nominatedNodeNameForExpectationEnabled := range []bool{true, false} {
|
||||
for _, tt := range tests {
|
||||
t.Run(fmt.Sprintf("%s (Async preemption: %v, Async API calls: %v, NNN for expectation: %v)", tt.name, asyncPreemptionEnabled, asyncAPICallsEnabled, nominatedNodeNameForExpectationEnabled), func(t *testing.T) {
|
||||
featuregatetesting.SetFeatureGatesDuringTest(t, utilfeature.DefaultFeatureGate, featuregatetesting.FeatureOverrides{
|
||||
|
|
@ -301,33 +301,18 @@ func TestNominatedNode(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
if nominatedNodeNameForExpectationEnabled && !tt.expectNilNominatedNodeName {
|
||||
// Verify if .status.nominatedNodeName is not cleared when NominatedNodeNameForExpectation is enabled.
|
||||
// Wait for 1 second to make sure the pod is re-processed, what would potentially clear the NominatedNodeName (when the feature won't work).
|
||||
select {
|
||||
case <-time.After(time.Second):
|
||||
case <-testCtx.Ctx.Done():
|
||||
}
|
||||
pod, err := cs.CoreV1().Pods(ns).Get(testCtx.Ctx, "medium", metav1.GetOptions{})
|
||||
// Verify if .status.nominatedNodeName is cleared.
|
||||
if err := wait.PollUntilContextTimeout(testCtx.Ctx, 100*time.Millisecond, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) {
|
||||
pod, err := cs.CoreV1().Pods(ns).Get(ctx, "medium", metav1.GetOptions{})
|
||||
if err != nil {
|
||||
t.Errorf("Error getting the medium pod: %v", err)
|
||||
} else if len(pod.Status.NominatedNodeName) == 0 {
|
||||
t.Errorf(".status.nominatedNodeName of the medium pod was cleared: %v", err)
|
||||
}
|
||||
} else {
|
||||
// Verify if .status.nominatedNodeName is cleared when NominatedNodeNameForExpectation is disabled.
|
||||
if err := wait.PollUntilContextTimeout(testCtx.Ctx, 100*time.Millisecond, wait.ForeverTestTimeout, false, func(ctx context.Context) (bool, error) {
|
||||
pod, err := cs.CoreV1().Pods(ns).Get(ctx, "medium", metav1.GetOptions{})
|
||||
if err != nil {
|
||||
t.Errorf("Error getting the medium pod: %v", err)
|
||||
}
|
||||
if len(pod.Status.NominatedNodeName) == 0 {
|
||||
return true, nil
|
||||
}
|
||||
return false, err
|
||||
}); err != nil {
|
||||
t.Errorf(".status.nominatedNodeName of the medium pod was not cleared: %v", err)
|
||||
if len(pod.Status.NominatedNodeName) == 0 {
|
||||
return true, nil
|
||||
}
|
||||
return false, err
|
||||
}); err != nil {
|
||||
t.Errorf(".status.nominatedNodeName of the medium pod was not cleared: %v", err)
|
||||
}
|
||||
})
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in a new issue