diff --git a/pkg/scheduler/backend/queue/scheduling_queue.go b/pkg/scheduler/backend/queue/scheduling_queue.go index 826a5a0e09f..b1cf9d1e1bd 100644 --- a/pkg/scheduler/backend/queue/scheduling_queue.go +++ b/pkg/scheduler/backend/queue/scheduling_queue.go @@ -894,6 +894,8 @@ func (p *PriorityQueue) AddUnschedulableIfNotPresent(logger klog.Logger, pInfo * // We changed ConsecutiveErrorsCount or UnschedulableCount plus Timestamp, and now the calculated backoff time should be different, // removing the cached backoff time. pInfo.BackoffExpiration = time.Time{} + // Clear the flush flag since the pod is returning to the queue after a scheduling attempt. + pInfo.WasFlushedFromUnschedulable = false if !p.isSchedulingQueueHintEnabled { // fall back to the old behavior which doesn't depend on the queueing hint. @@ -949,7 +951,7 @@ func (p *PriorityQueue) flushUnschedulablePodsLeftover(logger klog.Logger) { lastScheduleTime := pInfo.Timestamp if currentTime.Sub(lastScheduleTime) > p.podMaxInUnschedulablePodsDuration { // Mark this pod as flushed so we can detect if it schedules soon after - pInfo.FlushedFromUnschedulableAt = &currentTime + pInfo.WasFlushedFromUnschedulable = true podsToMove = append(podsToMove, pInfo) } } @@ -1237,6 +1239,13 @@ func (p *PriorityQueue) movePodsToActiveOrBackoffQueue(logger klog.Logger, podIn continue } + // Clear the flush flag if this pod is being moved by an event (not by timeout flush). + // EventUnschedulableTimeout is the event used by flushUnschedulablePodsLeftover, + // where the flag is set to true before calling this function. 
+ if event != framework.EventUnschedulableTimeout { + pInfo.WasFlushedFromUnschedulable = false + } + p.unschedulablePods.delete(pInfo.Pod, pInfo.Gated()) queue := p.requeuePodWithQueueingStrategy(logger, pInfo, schedulingHint, event.Label()) if queue == activeQ || (p.isPopFromBackoffQEnabled && queue == backoffQ) { diff --git a/pkg/scheduler/framework/types.go b/pkg/scheduler/framework/types.go index de20feed86f..607f18ad683 100644 --- a/pkg/scheduler/framework/types.go +++ b/pkg/scheduler/framework/types.go @@ -526,10 +526,12 @@ type QueuedPodInfo struct { // That's why we need to distinguish ConsecutiveErrorsCount for the error status and UnschedulableCount for the unschedulable status. // See https://github.com/kubernetes/kubernetes/issues/128744 for the discussion. ConsecutiveErrorsCount int - // FlushedFromUnschedulableAt tracks when this pod was last flushed from unschedulablePods - // due to timeout. This is used to detect if the pod becomes schedulable soon after flush, - // which may indicate missing queue hint optimizations or event handling bugs. - FlushedFromUnschedulableAt *time.Time + // WasFlushedFromUnschedulable tracks whether this pod was most recently moved to activeQ + // by the periodic flush from unschedulablePods due to timeout (rather than by an event). + // This is used to detect if the pod becomes schedulable soon after flush, which may + // indicate missing queue hint optimizations or event handling bugs. + // This flag is cleared when the pod returns to the queue for any reason. + WasFlushedFromUnschedulable bool // The time when the pod is added to the queue for the first time. The pod may be added // back to the queue multiple times before it's successfully scheduled. // It shouldn't be updated once initialized. 
It's used to record the e2e scheduling diff --git a/pkg/scheduler/schedule_one.go b/pkg/scheduler/schedule_one.go index 4cc5ad717c4..4285ffe808d 100644 --- a/pkg/scheduler/schedule_one.go +++ b/pkg/scheduler/schedule_one.go @@ -341,7 +341,7 @@ func (sched *Scheduler) bindingCycle( metrics.PodSchedulingSLIDuration.WithLabelValues(getAttemptsLabel(assumedPodInfo)).Observe(metrics.SinceInSeconds(*assumedPodInfo.InitialAttemptTimestamp)) } // Count pods scheduled after being flushed from unschedulablePods - if assumedPodInfo.FlushedFromUnschedulableAt != nil { + if assumedPodInfo.WasFlushedFromUnschedulable { metrics.PodScheduledAfterFlush.Inc() } // Run "postbind" plugins.