From b2a399cf30492ce0a2a141b936be0fb1e80617b6 Mon Sep 17 00:00:00 2001 From: Mohammad Varmazyar Date: Wed, 5 Nov 2025 09:24:40 +0100 Subject: [PATCH 1/9] scheduler: add metric for pods scheduled after flush This metric tracks pods that successfully schedule after being flushed from unschedulablePods due to timeout. High values may indicate missing queue hint optimizations or event handling issues. --- pkg/scheduler/backend/queue/scheduling_queue.go | 2 ++ pkg/scheduler/framework/types.go | 4 ++++ pkg/scheduler/metrics/metrics.go | 10 ++++++++++ pkg/scheduler/schedule_one.go | 4 ++++ 4 files changed, 20 insertions(+) diff --git a/pkg/scheduler/backend/queue/scheduling_queue.go b/pkg/scheduler/backend/queue/scheduling_queue.go index 717b55656f1..826a5a0e09f 100644 --- a/pkg/scheduler/backend/queue/scheduling_queue.go +++ b/pkg/scheduler/backend/queue/scheduling_queue.go @@ -948,6 +948,8 @@ func (p *PriorityQueue) flushUnschedulablePodsLeftover(logger klog.Logger) { for _, pInfo := range p.unschedulablePods.podInfoMap { lastScheduleTime := pInfo.Timestamp if currentTime.Sub(lastScheduleTime) > p.podMaxInUnschedulablePodsDuration { + // Mark this pod as flushed so we can detect if it schedules soon after + pInfo.FlushedFromUnschedulableAt = ¤tTime podsToMove = append(podsToMove, pInfo) } } diff --git a/pkg/scheduler/framework/types.go b/pkg/scheduler/framework/types.go index 9e92527be17..de20feed86f 100644 --- a/pkg/scheduler/framework/types.go +++ b/pkg/scheduler/framework/types.go @@ -526,6 +526,10 @@ type QueuedPodInfo struct { // That's why we need to distinguish ConsecutiveErrorsCount for the error status and UnschedulableCount for the unschedulable status. // See https://github.com/kubernetes/kubernetes/issues/128744 for the discussion. ConsecutiveErrorsCount int + // FlushedFromUnschedulableAt tracks when this pod was last flushed from unschedulablePods + // due to timeout. This is used to detect if the pod becomes schedulable soon after flush, + // which may indicate missing queue hint optimizations or event handling bugs. + FlushedFromUnschedulableAt *time.Time // The time when the pod is added to the queue for the first time. The pod may be added // back to the queue multiple times before it's successfully scheduled. // It shouldn't be updated once initialized. It's used to record the e2e scheduling diff --git a/pkg/scheduler/metrics/metrics.go b/pkg/scheduler/metrics/metrics.go index 8fbdc0d98ab..e26f567a16e 100644 --- a/pkg/scheduler/metrics/metrics.go +++ b/pkg/scheduler/metrics/metrics.go @@ -131,6 +131,7 @@ var ( PodSchedulingSLIDuration *metrics.HistogramVec PodSchedulingAttempts *metrics.Histogram + PodScheduledAfterFlush *metrics.Counter FrameworkExtensionPointDuration *metrics.HistogramVec PluginExecutionDuration *metrics.HistogramVec @@ -296,6 +297,14 @@ func InitMetrics() { StabilityLevel: metrics.STABLE, }) + PodScheduledAfterFlush = metrics.NewCounter( + &metrics.CounterOpts{ + Subsystem: SchedulerSubsystem, + Name: "pod_scheduled_after_flush_total", + Help: "Number of pods that were successfully scheduled after being flushed from unschedulablePods due to timeout. This metric helps detect potential queue hint bugs or event handling issues.", + StabilityLevel: metrics.ALPHA, + }) + FrameworkExtensionPointDuration = metrics.NewHistogramVec( &metrics.HistogramOpts{ Subsystem: SchedulerSubsystem, @@ -463,6 +472,7 @@ func InitMetrics() { pendingPods, PodSchedulingSLIDuration, PodSchedulingAttempts, + PodScheduledAfterFlush, FrameworkExtensionPointDuration, PluginExecutionDuration, SchedulerQueueIncomingPods, diff --git a/pkg/scheduler/schedule_one.go b/pkg/scheduler/schedule_one.go index 683bfaed690..4cc5ad717c4 100644 --- a/pkg/scheduler/schedule_one.go +++ b/pkg/scheduler/schedule_one.go @@ -340,6 +340,10 @@ func (sched *Scheduler) bindingCycle( if assumedPodInfo.InitialAttemptTimestamp != nil { metrics.PodSchedulingSLIDuration.WithLabelValues(getAttemptsLabel(assumedPodInfo)).Observe(metrics.SinceInSeconds(*assumedPodInfo.InitialAttemptTimestamp)) } + // Count pods scheduled after being flushed from unschedulablePods + if assumedPodInfo.FlushedFromUnschedulableAt != nil { + metrics.PodScheduledAfterFlush.Inc() + } // Run "postbind" plugins. schedFramework.RunPostBindPlugins(ctx, state, assumedPod, scheduleResult.SuggestedHost) From bc632c72d08db137070fc635e44da1cd928af96b Mon Sep 17 00:00:00 2001 From: Mohammad Varmazyar Date: Wed, 5 Nov 2025 19:48:20 +0100 Subject: [PATCH 2/9] scheduler: add metric for pods scheduled after flush Add counter metric to track pods that schedule immediately after being flushed from unschedulablePods due to timeout. Uses a boolean flag that is cleared when pods return to queue or move via events. --- pkg/scheduler/backend/queue/scheduling_queue.go | 11 ++++++++++- pkg/scheduler/framework/types.go | 10 ++++++---- pkg/scheduler/schedule_one.go | 2 +- 3 files changed, 17 insertions(+), 6 deletions(-) diff --git a/pkg/scheduler/backend/queue/scheduling_queue.go b/pkg/scheduler/backend/queue/scheduling_queue.go index 826a5a0e09f..b1cf9d1e1bd 100644 --- a/pkg/scheduler/backend/queue/scheduling_queue.go +++ b/pkg/scheduler/backend/queue/scheduling_queue.go @@ -894,6 +894,8 @@ func (p *PriorityQueue) AddUnschedulableIfNotPresent(logger klog.Logger, pInfo * // We changed ConsecutiveErrorsCount or UnschedulableCount plus Timestamp, and now the calculated backoff time should be different, // removing the cached backoff time. pInfo.BackoffExpiration = time.Time{} + // Clear the flush flag since the pod is returning to the queue after a scheduling attempt. + pInfo.WasFlushedFromUnschedulable = false if !p.isSchedulingQueueHintEnabled { // fall back to the old behavior which doesn't depend on the queueing hint. @@ -949,7 +951,7 @@ func (p *PriorityQueue) flushUnschedulablePodsLeftover(logger klog.Logger) { lastScheduleTime := pInfo.Timestamp if currentTime.Sub(lastScheduleTime) > p.podMaxInUnschedulablePodsDuration { // Mark this pod as flushed so we can detect if it schedules soon after - pInfo.FlushedFromUnschedulableAt = ¤tTime + pInfo.WasFlushedFromUnschedulable = true podsToMove = append(podsToMove, pInfo) } } @@ -1237,6 +1239,13 @@ func (p *PriorityQueue) movePodsToActiveOrBackoffQueue(logger klog.Logger, podIn continue } + // Clear the flush flag if this pod is being moved by an event (not by timeout flush). + // EventUnschedulableTimeout is the event used by flushUnschedulablePodsLeftover, + // where the flag is set to true before calling this function. + if event != framework.EventUnschedulableTimeout { + pInfo.WasFlushedFromUnschedulable = false + } + p.unschedulablePods.delete(pInfo.Pod, pInfo.Gated()) queue := p.requeuePodWithQueueingStrategy(logger, pInfo, schedulingHint, event.Label()) if queue == activeQ || (p.isPopFromBackoffQEnabled && queue == backoffQ) { diff --git a/pkg/scheduler/framework/types.go b/pkg/scheduler/framework/types.go index de20feed86f..607f18ad683 100644 --- a/pkg/scheduler/framework/types.go +++ b/pkg/scheduler/framework/types.go @@ -526,10 +526,12 @@ type QueuedPodInfo struct { // That's why we need to distinguish ConsecutiveErrorsCount for the error status and UnschedulableCount for the unschedulable status. // See https://github.com/kubernetes/kubernetes/issues/128744 for the discussion. ConsecutiveErrorsCount int - // FlushedFromUnschedulableAt tracks when this pod was last flushed from unschedulablePods - // due to timeout. This is used to detect if the pod becomes schedulable soon after flush, - // which may indicate missing queue hint optimizations or event handling bugs. - FlushedFromUnschedulableAt *time.Time + // WasFlushedFromUnschedulable tracks whether this pod was most recently moved to activeQ + // by the periodic flush from unschedulablePods due to timeout (rather than by an event). + // This is used to detect if the pod becomes schedulable soon after flush, which may + // indicate missing queue hint optimizations or event handling bugs. + // This flag is cleared when the pod returns to the queue for any reason. + WasFlushedFromUnschedulable bool // The time when the pod is added to the queue for the first time. The pod may be added // back to the queue multiple times before it's successfully scheduled. // It shouldn't be updated once initialized. It's used to record the e2e scheduling diff --git a/pkg/scheduler/schedule_one.go b/pkg/scheduler/schedule_one.go index 4cc5ad717c4..4285ffe808d 100644 --- a/pkg/scheduler/schedule_one.go +++ b/pkg/scheduler/schedule_one.go @@ -341,7 +341,7 @@ func (sched *Scheduler) bindingCycle( metrics.PodSchedulingSLIDuration.WithLabelValues(getAttemptsLabel(assumedPodInfo)).Observe(metrics.SinceInSeconds(*assumedPodInfo.InitialAttemptTimestamp)) } // Count pods scheduled after being flushed from unschedulablePods - if assumedPodInfo.FlushedFromUnschedulableAt != nil { + if assumedPodInfo.WasFlushedFromUnschedulable { metrics.PodScheduledAfterFlush.Inc() } // Run "postbind" plugins. From 6a1a71ddc5b2bfcfe1d53607aa3782a48f1043fd Mon Sep 17 00:00:00 2001 From: Mohammad Varmazyar Date: Thu, 6 Nov 2025 09:35:35 +0100 Subject: [PATCH 3/9] Removing the reduntant WasFlushedFromUnschedulable --- pkg/scheduler/backend/queue/scheduling_queue.go | 7 ------- 1 file changed, 7 deletions(-) diff --git a/pkg/scheduler/backend/queue/scheduling_queue.go b/pkg/scheduler/backend/queue/scheduling_queue.go index b1cf9d1e1bd..ded796af4b3 100644 --- a/pkg/scheduler/backend/queue/scheduling_queue.go +++ b/pkg/scheduler/backend/queue/scheduling_queue.go @@ -1239,13 +1239,6 @@ func (p *PriorityQueue) movePodsToActiveOrBackoffQueue(logger klog.Logger, podIn continue } - // Clear the flush flag if this pod is being moved by an event (not by timeout flush). - // EventUnschedulableTimeout is the event used by flushUnschedulablePodsLeftover, - // where the flag is set to true before calling this function. - if event != framework.EventUnschedulableTimeout { - pInfo.WasFlushedFromUnschedulable = false - } - p.unschedulablePods.delete(pInfo.Pod, pInfo.Gated()) queue := p.requeuePodWithQueueingStrategy(logger, pInfo, schedulingHint, event.Label()) if queue == activeQ || (p.isPopFromBackoffQEnabled && queue == backoffQ) { From a12eb9a3c3faf256b31b393a79f40a085f85a793 Mon Sep 17 00:00:00 2001 From: Mohammad Varmazyar Date: Thu, 13 Nov 2025 19:41:17 +0100 Subject: [PATCH 4/9] Update pkg/scheduler/metrics/metrics.go MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-authored-by: Maciej SkoczeĹ„ <87243939+macsko@users.noreply.github.com> --- pkg/scheduler/metrics/metrics.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pkg/scheduler/metrics/metrics.go b/pkg/scheduler/metrics/metrics.go index e26f567a16e..6c1b9cca555 100644 --- a/pkg/scheduler/metrics/metrics.go +++ b/pkg/scheduler/metrics/metrics.go @@ -301,7 +301,7 @@ func InitMetrics() { &metrics.CounterOpts{ Subsystem: SchedulerSubsystem, Name: "pod_scheduled_after_flush_total", - Help: "Number of pods that were successfully scheduled after being flushed from unschedulablePods due to timeout. This metric helps detect potential queue hint bugs or event handling issues.", + Help: "Number of pods that were successfully scheduled after being flushed from unschedulablePods due to timeout. This metric helps detect potential queueing hint misconfigurations or event handling issues.", StabilityLevel: metrics.ALPHA, }) From e5e8ef993cb95274ca3ac56e6c9fd19b1ea062c9 Mon Sep 17 00:00:00 2001 From: Mohammad Varmazyar Date: Fri, 14 Nov 2025 09:28:45 +0100 Subject: [PATCH 5/9] Add unit test for WasFlushedFromUnschedulable flag --- .../backend/queue/scheduling_queue_test.go | 89 +++++++++++++++++++ 1 file changed, 89 insertions(+) diff --git a/pkg/scheduler/backend/queue/scheduling_queue_test.go b/pkg/scheduler/backend/queue/scheduling_queue_test.go index 505d2bbe8b9..28ebc5ea075 100644 --- a/pkg/scheduler/backend/queue/scheduling_queue_test.go +++ b/pkg/scheduler/backend/queue/scheduling_queue_test.go @@ -3023,6 +3023,95 @@ func TestHighPriorityFlushUnschedulablePodsLeftover(t *testing.T) { } } +// TestFlushUnschedulablePodsLeftoverSetsFlag verifies that the WasFlushedFromUnschedulable +// flag is correctly set when pods are flushed and cleared when they return to the queue. +func TestFlushUnschedulablePodsLeftoverSetsFlag(t *testing.T) { + c := testingclock.NewFakeClock(time.Now()) + m := makeEmptyQueueingHintMapPerProfile() + m[""][nodeAdd] = []*QueueingHintFunction{ + { + PluginName: "fakePlugin", + QueueingHintFn: queueHintReturnQueue, + }, + } + logger, ctx := ktesting.NewTestContext(t) + ctx, cancel := context.WithCancel(ctx) + defer cancel() + q := NewTestQueue(ctx, newDefaultQueueSort(), WithClock(c), WithQueueingHintMapPerProfile(m)) + + pod := st.MakePod().Name("test-pod").Namespace("ns1").UID("tp-1").Priority(midPriority).NominatedNodeName("node1").Obj() + podutil.UpdatePodCondition(&pod.Status, &v1.PodCondition{ + Type: v1.PodScheduled, + Status: v1.ConditionFalse, + Reason: v1.PodReasonUnschedulable, + Message: "fake scheduling failure", + }) + + // Add pod to activeQ and pop it to simulate a scheduling attempt + q.Add(logger, pod) + pInfo, err := q.Pop(logger) + if err != nil { + t.Fatalf("unexpected error from Pop: %v", err) + } + if pInfo.Pod != pod { + t.Errorf("Expected: %v after Pop, but got: %v", pod.Name, pInfo.Pod.Name) + } + + // Verify flag is initially false + if pInfo.WasFlushedFromUnschedulable { + t.Errorf("Expected WasFlushedFromUnschedulable to be false initially, but got true") + } + + // Add pod to unschedulablePods (simulating failed scheduling) + err = q.AddUnschedulableIfNotPresent(logger, q.newQueuedPodInfo(pod, "fakePlugin"), q.SchedulingCycle()) + if err != nil { + t.Fatalf("unexpected error from AddUnschedulableIfNotPresent: %v", err) + } + + // Advance time past the flush duration and flush + c.Step(DefaultPodMaxInUnschedulablePodsDuration + time.Second) + q.flushUnschedulablePodsLeftover(logger) + + // Pop the pod and verify flag is now true + pInfo, err = q.Pop(logger) + if err != nil { + t.Fatalf("unexpected error from Pop after flush: %v", err) + } + if pInfo.Pod != pod { + t.Errorf("Expected: %v after Pop, but got: %v", pod.Name, pInfo.Pod.Name) + } + if !pInfo.WasFlushedFromUnschedulable { + t.Errorf("Expected WasFlushedFromUnschedulable to be true after flush, but got false") + } + + // Simulate pod failing to schedule again and returning to queue + err = q.AddUnschedulableIfNotPresent(logger, pInfo, q.SchedulingCycle()) + if err != nil { + t.Fatalf("unexpected error from AddUnschedulableIfNotPresent: %v", err) + } + + // Verify flag is cleared when pod returns to queue + // Pod can be in either backoffQ or unschedulablePods depending on queueing hints + q.lock.Lock() + defer q.lock.Unlock() + + var internalPInfo *framework.QueuedPodInfo + // Check backoffQ first + if pInfoFromBackoff, exists := q.backoffQ.get(newQueuedPodInfoForLookup(pod)); exists { + internalPInfo = pInfoFromBackoff + } else { + // Check unschedulablePods + internalPInfo = q.unschedulablePods.get(pod) + } + + if internalPInfo == nil { + t.Fatalf("pod should be in either backoffQ or unschedulablePods") + } + if internalPInfo.WasFlushedFromUnschedulable { + t.Errorf("Expected WasFlushedFromUnschedulable to be cleared (false) after returning to queue, but got true") + } +} + func TestPriorityQueue_initPodMaxInUnschedulablePodsDuration(t *testing.T) { pod1 := st.MakePod().Name("test-pod-1").Namespace("ns1").UID("tp-1").NominatedNodeName("node1").Obj() pod2 := st.MakePod().Name("test-pod-2").Namespace("ns2").UID("tp-2").NominatedNodeName("node2").Obj() From ec05bcf1865c3e6d586b9dfa998b387ac7ba6059 Mon Sep 17 00:00:00 2001 From: Mohammad Varmazyar Date: Fri, 14 Nov 2025 19:57:24 +0100 Subject: [PATCH 6/9] test: simplify TestFlushUnschedulablePodsLeftoverSetsFlag scheduler: add logging for pods scheduled after flush and preserve UnschedulablePlugins --- pkg/scheduler/backend/queue/active_queue.go | 6 ++- .../backend/queue/scheduling_queue_test.go | 52 ++++++------------- pkg/scheduler/schedule_one.go | 1 + 3 files changed, 20 insertions(+), 39 deletions(-) diff --git a/pkg/scheduler/backend/queue/active_queue.go b/pkg/scheduler/backend/queue/active_queue.go index de722833d44..fa1bca28368 100644 --- a/pkg/scheduler/backend/queue/active_queue.go +++ b/pkg/scheduler/backend/queue/active_queue.go @@ -323,11 +323,13 @@ func (aq *activeQueue) unlockedPop(logger klog.Logger) (*framework.QueuedPodInfo } aq.schedCycle++ - // Update metrics and reset the set of unschedulable plugins for the next attempt. + // Update metrics and reset the set of pending plugins for the next attempt. + // Note: We don't clear UnschedulablePlugins here because: + // 1. If the pod schedules successfully, we need UnschedulablePlugins for logging/debugging + // 2. If the pod fails to schedule again, UnschedulablePlugins will be overwritten anyway for plugin := range pInfo.UnschedulablePlugins.Union(pInfo.PendingPlugins) { metrics.UnschedulableReason(plugin, pInfo.Pod.Spec.SchedulerName).Dec() } - pInfo.UnschedulablePlugins.Clear() pInfo.PendingPlugins.Clear() pInfo.GatingPlugin = "" pInfo.GatingPluginEvents = nil diff --git a/pkg/scheduler/backend/queue/scheduling_queue_test.go b/pkg/scheduler/backend/queue/scheduling_queue_test.go index 28ebc5ea075..585ab56ef8a 100644 --- a/pkg/scheduler/backend/queue/scheduling_queue_test.go +++ b/pkg/scheduler/backend/queue/scheduling_queue_test.go @@ -649,7 +649,7 @@ func Test_InFlightPods(t *testing.T) { }, }, { - name: "popped pod must have empty UnschedulablePlugins and PendingPlugins", + name: "popped pod preserves UnschedulablePlugins but clears PendingPlugins", isSchedulingQueueHintEnabled: true, initialPods: []*v1.Pod{pod1}, actions: []action{ @@ -666,8 +666,13 @@ func Test_InFlightPods(t *testing.T) { {eventHappens: &pvAdd}, // Active again. {callback: func(t *testing.T, q *PriorityQueue) { poppedPod = popPod(t, logger, q, pod1) - if len(poppedPod.UnschedulablePlugins) > 0 { - t.Errorf("QueuedPodInfo from Pop should have empty UnschedulablePlugins, got instead: %+v", poppedPod) + // UnschedulablePlugins should be preserved for logging/debugging + if !poppedPod.UnschedulablePlugins.Equal(sets.New("fooPlugin2")) { + t.Errorf("QueuedPodInfo from Pop should preserve UnschedulablePlugins, expected fooPlugin2, got: %+v", poppedPod.UnschedulablePlugins) + } + // PendingPlugins should still be cleared + if len(poppedPod.PendingPlugins) > 0 { + t.Errorf("QueuedPodInfo from Pop should have empty PendingPlugins, got instead: %+v", poppedPod.PendingPlugins) } }}, {callback: func(t *testing.T, q *PriorityQueue) { @@ -3028,33 +3033,18 @@ func TestHighPriorityFlushUnschedulablePodsLeftover(t *testing.T) { func TestFlushUnschedulablePodsLeftoverSetsFlag(t *testing.T) { c := testingclock.NewFakeClock(time.Now()) m := makeEmptyQueueingHintMapPerProfile() - m[""][nodeAdd] = []*QueueingHintFunction{ - { - PluginName: "fakePlugin", - QueueingHintFn: queueHintReturnQueue, - }, - } logger, ctx := ktesting.NewTestContext(t) ctx, cancel := context.WithCancel(ctx) defer cancel() q := NewTestQueue(ctx, newDefaultQueueSort(), WithClock(c), WithQueueingHintMapPerProfile(m)) pod := st.MakePod().Name("test-pod").Namespace("ns1").UID("tp-1").Priority(midPriority).NominatedNodeName("node1").Obj() - podutil.UpdatePodCondition(&pod.Status, &v1.PodCondition{ - Type: v1.PodScheduled, - Status: v1.ConditionFalse, - Reason: v1.PodReasonUnschedulable, - Message: "fake scheduling failure", - }) // Add pod to activeQ and pop it to simulate a scheduling attempt q.Add(logger, pod) pInfo, err := q.Pop(logger) if err != nil { - t.Fatalf("unexpected error from Pop: %v", err) - } - if pInfo.Pod != pod { - t.Errorf("Expected: %v after Pop, but got: %v", pod.Name, pInfo.Pod.Name) + t.Fatalf("Unexpected error from Pop: %v", err) } // Verify flag is initially false @@ -3065,7 +3055,7 @@ func TestFlushUnschedulablePodsLeftoverSetsFlag(t *testing.T) { // Add pod to unschedulablePods (simulating failed scheduling) err = q.AddUnschedulableIfNotPresent(logger, q.newQueuedPodInfo(pod, "fakePlugin"), q.SchedulingCycle()) if err != nil { - t.Fatalf("unexpected error from AddUnschedulableIfNotPresent: %v", err) + t.Fatalf("Unexpected error from AddUnschedulableIfNotPresent: %v", err) } // Advance time past the flush duration and flush @@ -3075,37 +3065,25 @@ func TestFlushUnschedulablePodsLeftoverSetsFlag(t *testing.T) { // Pop the pod and verify flag is now true pInfo, err = q.Pop(logger) if err != nil { - t.Fatalf("unexpected error from Pop after flush: %v", err) - } - if pInfo.Pod != pod { - t.Errorf("Expected: %v after Pop, but got: %v", pod.Name, pInfo.Pod.Name) + t.Fatalf("Unexpected error from Pop after flush: %v", err) } if !pInfo.WasFlushedFromUnschedulable { t.Errorf("Expected WasFlushedFromUnschedulable to be true after flush, but got false") } // Simulate pod failing to schedule again and returning to queue - err = q.AddUnschedulableIfNotPresent(logger, pInfo, q.SchedulingCycle()) + err = q.AddUnschedulableIfNotPresent(logger, q.newQueuedPodInfo(pInfo.Pod, "fakePlugin"), q.SchedulingCycle()) if err != nil { - t.Fatalf("unexpected error from AddUnschedulableIfNotPresent: %v", err) + t.Fatalf("Unexpected error from AddUnschedulableIfNotPresent: %v", err) } // Verify flag is cleared when pod returns to queue - // Pod can be in either backoffQ or unschedulablePods depending on queueing hints q.lock.Lock() defer q.lock.Unlock() - var internalPInfo *framework.QueuedPodInfo - // Check backoffQ first - if pInfoFromBackoff, exists := q.backoffQ.get(newQueuedPodInfoForLookup(pod)); exists { - internalPInfo = pInfoFromBackoff - } else { - // Check unschedulablePods - internalPInfo = q.unschedulablePods.get(pod) - } - + internalPInfo := q.unschedulablePods.get(pod) if internalPInfo == nil { - t.Fatalf("pod should be in either backoffQ or unschedulablePods") + t.Fatalf("pod should be in unschedulablePods") } if internalPInfo.WasFlushedFromUnschedulable { t.Errorf("Expected WasFlushedFromUnschedulable to be cleared (false) after returning to queue, but got true") diff --git a/pkg/scheduler/schedule_one.go b/pkg/scheduler/schedule_one.go index 4285ffe808d..4908b711da7 100644 --- a/pkg/scheduler/schedule_one.go +++ b/pkg/scheduler/schedule_one.go @@ -342,6 +342,7 @@ func (sched *Scheduler) bindingCycle( } // Count pods scheduled after being flushed from unschedulablePods if assumedPodInfo.WasFlushedFromUnschedulable { + logger.Info("Pod scheduled after flush from unschedulablePods", "pod", klog.KObj(assumedPodInfo.Pod), "unschedulablePlugins", assumedPodInfo.UnschedulablePlugins) metrics.PodScheduledAfterFlush.Inc() } // Run "postbind" plugins. From d64e09c69730af479ca171b47ea6e607e441b939 Mon Sep 17 00:00:00 2001 From: Mohammad Varmazyar Date: Mon, 24 Nov 2025 20:32:41 +0100 Subject: [PATCH 7/9] Clear plugins at handleSchedulingFailure and preserve both at Pop --- pkg/scheduler/backend/queue/active_queue.go | 9 ++++----- .../backend/queue/scheduling_queue_test.go | 14 ++++++++------ pkg/scheduler/schedule_one.go | 7 ++++++- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/pkg/scheduler/backend/queue/active_queue.go b/pkg/scheduler/backend/queue/active_queue.go index fa1bca28368..0fdd0f0d450 100644 --- a/pkg/scheduler/backend/queue/active_queue.go +++ b/pkg/scheduler/backend/queue/active_queue.go @@ -323,14 +323,13 @@ func (aq *activeQueue) unlockedPop(logger klog.Logger) (*framework.QueuedPodInfo } aq.schedCycle++ - // Update metrics and reset the set of pending plugins for the next attempt. - // Note: We don't clear UnschedulablePlugins here because: - // 1. If the pod schedules successfully, we need UnschedulablePlugins for logging/debugging - // 2. If the pod fails to schedule again, UnschedulablePlugins will be overwritten anyway + // Update metrics for unschedulable plugins. + // Note: We don't clear UnschedulablePlugins and PendingPlugins here because: + // 1. If the pod schedules successfully, we need them for logging/debugging + // 2. If the pod fails to schedule, they will be cleared and repopulated in handleSchedulingFailure for plugin := range pInfo.UnschedulablePlugins.Union(pInfo.PendingPlugins) { metrics.UnschedulableReason(plugin, pInfo.Pod.Spec.SchedulerName).Dec() } - pInfo.PendingPlugins.Clear() pInfo.GatingPlugin = "" pInfo.GatingPluginEvents = nil diff --git a/pkg/scheduler/backend/queue/scheduling_queue_test.go b/pkg/scheduler/backend/queue/scheduling_queue_test.go index 585ab56ef8a..5fec77b1f23 100644 --- a/pkg/scheduler/backend/queue/scheduling_queue_test.go +++ b/pkg/scheduler/backend/queue/scheduling_queue_test.go @@ -649,7 +649,7 @@ func Test_InFlightPods(t *testing.T) { }, }, { - name: "popped pod preserves UnschedulablePlugins but clears PendingPlugins", + name: "popped pod preserves UnschedulablePlugins and PendingPlugins", isSchedulingQueueHintEnabled: true, initialPods: []*v1.Pod{pod1}, actions: []action{ @@ -670,9 +670,9 @@ func Test_InFlightPods(t *testing.T) { if !poppedPod.UnschedulablePlugins.Equal(sets.New("fooPlugin2")) { t.Errorf("QueuedPodInfo from Pop should preserve UnschedulablePlugins, expected fooPlugin2, got: %+v", poppedPod.UnschedulablePlugins) } - // PendingPlugins should still be cleared - if len(poppedPod.PendingPlugins) > 0 { - t.Errorf("QueuedPodInfo from Pop should have empty PendingPlugins, got instead: %+v", poppedPod.PendingPlugins) + // PendingPlugins are preserved after Pop() for logging + if !poppedPod.PendingPlugins.Equal(sets.New("fooPlugin1")) { + t.Errorf("QueuedPodInfo from Pop should preserve PendingPlugins, expected fooPlugin1, got: %+v", poppedPod.PendingPlugins) } }}, {callback: func(t *testing.T, q *PriorityQueue) { @@ -940,8 +940,10 @@ func TestPop(t *testing.T) { // Now check result of Pop. poppedPod = popPod(t, logger, q, pod) - if len(poppedPod.PendingPlugins) > 0 { - t.Errorf("QueuedPodInfo from Pop should have empty PendingPlugins, got instead: %+v", poppedPod) + // PendingPlugins are preserved after Pop() so they can be logged if scheduling + // succeeds, or cleared in handleSchedulingFailure() if it fails. + if len(poppedPod.PendingPlugins) != 1 || !poppedPod.PendingPlugins.Has("fooPlugin1") { + t.Errorf("QueuedPodInfo from Pop should preserve PendingPlugins, expected fooPlugin1, got instead: %+v", poppedPod) } }) } diff --git a/pkg/scheduler/schedule_one.go b/pkg/scheduler/schedule_one.go index 4908b711da7..1160f456b8f 100644 --- a/pkg/scheduler/schedule_one.go +++ b/pkg/scheduler/schedule_one.go @@ -342,7 +342,7 @@ func (sched *Scheduler) bindingCycle( } // Count pods scheduled after being flushed from unschedulablePods if assumedPodInfo.WasFlushedFromUnschedulable { - logger.Info("Pod scheduled after flush from unschedulablePods", "pod", klog.KObj(assumedPodInfo.Pod), "unschedulablePlugins", assumedPodInfo.UnschedulablePlugins) + logger.Info("Pod scheduled after flush from unschedulablePods", "pod", klog.KObj(assumedPodInfo.Pod), "unschedulablePlugins", assumedPodInfo.UnschedulablePlugins, "pendingPlugins", assumedPodInfo.PendingPlugins) metrics.PodScheduledAfterFlush.Inc() } // Run "postbind" plugins. @@ -1063,6 +1063,11 @@ func (sched *Scheduler) handleSchedulingFailure(ctx context.Context, fwk framewo err := status.AsError() errMsg := status.Message() + // Clear plugin sets to avoid stale data from previous scheduling attempts. + // They will be repopulated below for FitError cases. + podInfo.UnschedulablePlugins.Clear() + podInfo.PendingPlugins.Clear() + if err == ErrNoNodesAvailable { logger.V(2).Info("Unable to schedule pod; no nodes are registered to the cluster; waiting", "pod", klog.KObj(pod)) } else if fitError, ok := err.(*framework.FitError); ok { // Inject UnschedulablePlugins to PodInfo, which will be used later for moving Pods between queues efficiently. From 4f455c9c0d7ea2f9c1cbc3d030401b7c76b08845 Mon Sep 17 00:00:00 2001 From: Mohammad Varmazyar Date: Wed, 26 Nov 2025 09:54:32 +0100 Subject: [PATCH 8/9] Refactor plugin clearing to use ClearRejectorPlugins method --- pkg/scheduler/backend/queue/active_queue.go | 5 ----- pkg/scheduler/framework/types.go | 10 ++++++++++ pkg/scheduler/schedule_one.go | 9 +++++---- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/pkg/scheduler/backend/queue/active_queue.go b/pkg/scheduler/backend/queue/active_queue.go index 0fdd0f0d450..9581c65d996 100644 --- a/pkg/scheduler/backend/queue/active_queue.go +++ b/pkg/scheduler/backend/queue/active_queue.go @@ -324,14 +324,9 @@ func (aq *activeQueue) unlockedPop(logger klog.Logger) (*framework.QueuedPodInfo aq.schedCycle++ // Update metrics for unschedulable plugins. - // Note: We don't clear UnschedulablePlugins and PendingPlugins here because: - // 1. If the pod schedules successfully, we need them for logging/debugging - // 2. If the pod fails to schedule, they will be cleared and repopulated in handleSchedulingFailure for plugin := range pInfo.UnschedulablePlugins.Union(pInfo.PendingPlugins) { metrics.UnschedulableReason(plugin, pInfo.Pod.Spec.SchedulerName).Dec() } - pInfo.GatingPlugin = "" - pInfo.GatingPluginEvents = nil return pInfo, nil } diff --git a/pkg/scheduler/framework/types.go b/pkg/scheduler/framework/types.go index 607f18ad683..6cb9e98afc3 100644 --- a/pkg/scheduler/framework/types.go +++ b/pkg/scheduler/framework/types.go @@ -619,6 +619,16 @@ func (pqi *QueuedPodInfo) DeepCopy() *QueuedPodInfo { } } +// ClearRejectorPlugins clears the plugin-related fields that track why a pod +// was rejected in a previous scheduling attempt. This should be called at the +// beginning of a new scheduling attempt to ensure stale data doesn't persist. +func (pqi *QueuedPodInfo) ClearRejectorPlugins() { + pqi.UnschedulablePlugins.Clear() + pqi.PendingPlugins.Clear() + pqi.GatingPlugin = "" + pqi.GatingPluginEvents = nil +} + // PodInfo is a wrapper to a Pod with additional pre-computed information to // accelerate processing. This information is typically immutable (e.g., pre-processed // inter-pod affinity selectors). diff --git a/pkg/scheduler/schedule_one.go b/pkg/scheduler/schedule_one.go index 1160f456b8f..3a2fbe69ba7 100644 --- a/pkg/scheduler/schedule_one.go +++ b/pkg/scheduler/schedule_one.go @@ -1063,10 +1063,11 @@ func (sched *Scheduler) handleSchedulingFailure(ctx context.Context, fwk framewo err := status.AsError() errMsg := status.Message() - // Clear plugin sets to avoid stale data from previous scheduling attempts. - // They will be repopulated below for FitError cases. - podInfo.UnschedulablePlugins.Clear() - podInfo.PendingPlugins.Clear() + // Clear plugin-related fields to avoid stale data from previous scheduling attempts. + // These fields will be repopulated below for FitError cases. + // We clear them here (rather than at Pop) because we sometimes want to use them + // for logging when a pod schedules successfully (e.g., after being flushed). + podInfo.ClearRejectorPlugins() if err == ErrNoNodesAvailable { logger.V(2).Info("Unable to schedule pod; no nodes are registered to the cluster; waiting", "pod", klog.KObj(pod)) From 4c2fff19340562b474c9f204b3656159a1247e6b Mon Sep 17 00:00:00 2001 From: Mohammad Varmazyar Date: Wed, 26 Nov 2025 14:08:05 +0100 Subject: [PATCH 9/9] Address comments, log level, test assersion consistency and remove unnecessary locks in TestFlushUnschedulablePodsLeftoverSetsFlag --- pkg/scheduler/backend/queue/scheduling_queue_test.go | 5 +---- pkg/scheduler/framework/types.go | 5 ++--- pkg/scheduler/schedule_one.go | 2 +- 3 files changed, 4 insertions(+), 8 deletions(-) diff --git a/pkg/scheduler/backend/queue/scheduling_queue_test.go b/pkg/scheduler/backend/queue/scheduling_queue_test.go index 5fec77b1f23..cad1e9373f3 100644 --- a/pkg/scheduler/backend/queue/scheduling_queue_test.go +++ b/pkg/scheduler/backend/queue/scheduling_queue_test.go @@ -942,7 +942,7 @@ func TestPop(t *testing.T) { poppedPod = popPod(t, logger, q, pod) // PendingPlugins are preserved after Pop() so they can be logged if scheduling // succeeds, or cleared in handleSchedulingFailure() if it fails. - if len(poppedPod.PendingPlugins) != 1 || !poppedPod.PendingPlugins.Has("fooPlugin1") { + if !poppedPod.PendingPlugins.Equal(sets.New("fooPlugin1")) { t.Errorf("QueuedPodInfo from Pop should preserve PendingPlugins, expected fooPlugin1, got instead: %+v", poppedPod) } }) @@ -3080,9 +3080,6 @@ func TestFlushUnschedulablePodsLeftoverSetsFlag(t *testing.T) { } // Verify flag is cleared when pod returns to queue - q.lock.Lock() - defer q.lock.Unlock() - internalPInfo := q.unschedulablePods.get(pod) if internalPInfo == nil { t.Fatalf("pod should be in unschedulablePods") diff --git a/pkg/scheduler/framework/types.go b/pkg/scheduler/framework/types.go index 6cb9e98afc3..8f264cc1eea 100644 --- a/pkg/scheduler/framework/types.go +++ b/pkg/scheduler/framework/types.go @@ -529,7 +529,7 @@ type QueuedPodInfo struct { // WasFlushedFromUnschedulable tracks whether this pod was most recently moved to activeQ // by the periodic flush from unschedulablePods due to timeout (rather than by an event). // This is used to detect if the pod becomes schedulable soon after flush, which may - // indicate missing queue hint optimizations or event handling bugs. + // indicate queueing hint misconfigurations or event handling bugs. // This flag is cleared when the pod returns to the queue for any reason. WasFlushedFromUnschedulable bool // The time when the pod is added to the queue for the first time. The pod may be added @@ -620,8 +620,7 @@ func (pqi *QueuedPodInfo) DeepCopy() *QueuedPodInfo { } // ClearRejectorPlugins clears the plugin-related fields that track why a pod -// was rejected in a previous scheduling attempt. This should be called at the -// beginning of a new scheduling attempt to ensure stale data doesn't persist. +// was rejected in a previous scheduling attempt. func (pqi *QueuedPodInfo) ClearRejectorPlugins() { pqi.UnschedulablePlugins.Clear() pqi.PendingPlugins.Clear() diff --git a/pkg/scheduler/schedule_one.go b/pkg/scheduler/schedule_one.go index 3a2fbe69ba7..ba10a112a5d 100644 --- a/pkg/scheduler/schedule_one.go +++ b/pkg/scheduler/schedule_one.go @@ -342,7 +342,7 @@ func (sched *Scheduler) bindingCycle( } // Count pods scheduled after being flushed from unschedulablePods if assumedPodInfo.WasFlushedFromUnschedulable { - logger.Info("Pod scheduled after flush from unschedulablePods", "pod", klog.KObj(assumedPodInfo.Pod), "unschedulablePlugins", assumedPodInfo.UnschedulablePlugins, "pendingPlugins", assumedPodInfo.PendingPlugins) + logger.V(4).Info("Pod scheduled after flush from unschedulablePods", "pod", klog.KObj(assumedPodInfo.Pod), "unschedulablePlugins", assumedPodInfo.UnschedulablePlugins, "pendingPlugins", assumedPodInfo.PendingPlugins) metrics.PodScheduledAfterFlush.Inc() } // Run "postbind" plugins.