diff --git a/pkg/controller/devicetainteviction/device_taint_eviction.go b/pkg/controller/devicetainteviction/device_taint_eviction.go index d6075e98b31..b80042080cf 100644 --- a/pkg/controller/devicetainteviction/device_taint_eviction.go +++ b/pkg/controller/devicetainteviction/device_taint_eviction.go @@ -434,6 +434,7 @@ func (tc *Controller) maybeDeletePod(ctx context.Context, podRef tainteviction.N // Doing this immediately is not useful because // it would just race with the informers update // (rule status reads from cache!). + tc.logger.V(5).Info("Adding delayed status update because of pod eviction", "deviceTaintRule", klog.KObj(reason.rule), "delay", ruleStatusPeriod) tc.workqueue.AddAfter(workItemForRule(reason.rule), ruleStatusPeriod) } } @@ -1016,7 +1017,14 @@ func (tc *Controller) Run(ctx context.Context, numWorkers int) error { func (tc *Controller) evictPod(podRef tainteviction.NamespacedObject, eviction evictionAndReason) { tc.deletePodAt[podRef] = eviction now := time.Now() - tc.workqueue.AddAfter(workItem{podRef: podRef}, eviction.when.Sub(now)) + delay := eviction.when.Sub(now) + if delay <= 0 { + tc.logger.V(3).Info("Adding immediate pod eviction", "pod", podRef, "eviction", eviction) + tc.workqueue.Add(workItem{podRef: podRef}) + } else { + tc.logger.V(3).Info("Adding delayed pod eviction", "pod", podRef, "eviction", eviction, "delay", delay) + tc.workqueue.AddAfter(workItem{podRef: podRef}, delay) + } if tc.evictPodHook != nil { tc.evictPodHook(podRef, eviction) @@ -1275,7 +1283,8 @@ func (tc *Controller) handleRuleChange(oldRule, newRule *resourcealpha.DeviceTai } if oldRule == nil { - // Update the status at least once. + // Update the status at least once, immediately and before evicting any pods. + tc.logger.V(5).Info("Adding immediate status update because of new rule", "deviceTaintRule", klog.KObj(newRule)) tc.workqueue.Add(workItemForRule(newRule)) } @@ -1289,9 +1298,13 @@ func (tc *Controller) handleRuleChange(oldRule, newRule *resourcealpha.DeviceTai if oldRule != nil && newRule != nil && - oldRule.UID == newRule.UID && - apiequality.Semantic.DeepEqual(&oldRule.Spec, &newRule.Spec) { - return + oldRule.UID == newRule.UID { + if apiequality.Semantic.DeepEqual(&oldRule.Spec, &newRule.Spec) { + return + } + // Update the status at least once, immediately and before evicting any pods. + tc.logger.V(5).Info("Adding immediate status update because of modified rule spec", "deviceTaintRule", klog.KObj(newRule)) + tc.workqueue.Add(workItemForRule(newRule)) } // Rule spec changes should be rare. Simply do a brute-force re-evaluation of all allocated claims. @@ -1475,13 +1488,14 @@ func (tc *Controller) handlePod(pod *v1.Pod) { return } - tc.logger.V(3).Info("Going to evict pod", "pod", podRef, "eviction", eviction) tc.evictPod(podRef, *eviction) // If any reason is because of a taint, then eviction is in progress and the status may need to be updated. + // But don't do it immediately because more pod changes may be coming in. for _, reason := range eviction.reason { if reason.rule != nil { - tc.workqueue.Add(workItemForRule(reason.rule)) + tc.logger.V(5).Info("Adding delayed status update because of pod change", "deviceTaintRule", klog.KObj(reason.rule), "delay", ruleStatusPeriod) + tc.workqueue.AddAfter(workItemForRule(reason.rule), ruleStatusPeriod) } } } diff --git a/pkg/controller/devicetainteviction/device_taint_eviction_test.go b/pkg/controller/devicetainteviction/device_taint_eviction_test.go index f4ecdb7ebcc..9cb4cb79533 100644 --- a/pkg/controller/devicetainteviction/device_taint_eviction_test.go +++ b/pkg/controller/devicetainteviction/device_taint_eviction_test.go @@ -855,7 +855,7 @@ func testController(tCtx ktesting.TContext) { finalState: state{ allocatedClaims: l(ac(inUseClaim, newEvictionTime(taintTime, ruleEvict))), deletePodAt: evictMap{newObject(podWithClaimName): *newEvictionTime(taintTime, ruleEvict)}, - queued: MockState[workItem]{Ready: newWorkItems(ruleEvict, podWithClaimName)}, + queued: MockState[workItem]{Ready: newWorkItems(ruleEvict, podWithClaimName), Later: newDelayedWorkItems(ruleEvict, ruleStatusPeriod)}, }, process: []step{ { @@ -883,23 +883,31 @@ func testController(tCtx ktesting.TContext) { finalState: state{ allocatedClaims: l(ac(inUseClaimWithToleration, newEvictionTime(metav1Time(taintTime.Add(tolerationDuration)), ruleEvict))), deletePodAt: evictMap{newObject(podWithClaimName): *newEvictionTime(metav1Time(taintTime.Add(tolerationDuration)), ruleEvict)}, - queued: MockState[workItem]{Ready: newWorkItems(ruleEvict), Later: newDelayedWorkItems(podWithClaimName, tolerationDuration)}, + queued: MockState[workItem]{Ready: newWorkItems(ruleEvict), Later: newDelayedWorkItems(podWithClaimName, tolerationDuration, ruleEvict, ruleStatusPeriod)}, }, process: []step{ { // Initial update. - deletePodAt: evictMap{newObject(podWithClaimName): *newEvictionTime(metav1Time(taintTime.Add(tolerationDuration)), ruleEvict)}, - pods: l(podWithClaimName), - rules: l(inProgress(ruleEvict, true, "PodsPendingEviction", "1 pod needs to be evicted in 1 namespace.", taintTime)), - - queuedProcessed: MockState[workItem]{Later: newDelayedWorkItems(podWithClaimName, tolerationDuration)}, - advance: tolerationDuration, + deletePodAt: evictMap{newObject(podWithClaimName): *newEvictionTime(metav1Time(taintTime.Add(tolerationDuration)), ruleEvict)}, + pods: l(podWithClaimName), + rules: l(inProgress(ruleEvict, true, "PodsPendingEviction", "1 pod needs to be evicted in 1 namespace.", taintTime)), + queuedProcessed: MockState[workItem]{Later: newDelayedWorkItems(podWithClaimName, tolerationDuration, ruleEvict, ruleStatusPeriod)}, + advance: ruleStatusPeriod, + queuedShifted: MockState[workItem]{Ready: newWorkItems(ruleEvict), Later: newDelayedWorkItems(podWithClaimName, tolerationDuration-ruleStatusPeriod)}, + }, + { + // Process the pod eviction. + deletePodAt: evictMap{newObject(podWithClaimName): *newEvictionTime(metav1Time(taintTime.Add(tolerationDuration)), ruleEvict)}, + pods: l(podWithClaimName), + rules: l(inProgress(ruleEvict, true, "PodsPendingEviction", "1 pod needs to be evicted in 1 namespace.", metav1Time(taintTime.Add(ruleStatusPeriod)))), + queuedProcessed: MockState[workItem]{Later: newDelayedWorkItems(podWithClaimName, tolerationDuration-ruleStatusPeriod)}, + advance: tolerationDuration - ruleStatusPeriod, queuedShifted: MockState[workItem]{Ready: newWorkItems(podWithClaimName)}, }, { // Deleted, but condition not updated yet. ruleStats: map[types.UID]taintRuleStats{ruleEvict.UID: {numEvictedPods: 1}}, - rules: l(inProgress(ruleEvict, true, "PodsPendingEviction", "1 pod needs to be evicted in 1 namespace.", taintTime)), + rules: l(inProgress(ruleEvict, true, "PodsPendingEviction", "1 pod needs to be evicted in 1 namespace.", metav1Time(taintTime.Add(ruleStatusPeriod)))), queuedProcessed: MockState[workItem]{Later: newDelayedWorkItems(ruleEvict, ruleStatusPeriod)}, advance: ruleStatusPeriod, queuedShifted: MockState[workItem]{Ready: newWorkItems(ruleEvict)}, @@ -923,7 +931,7 @@ func testController(tCtx ktesting.TContext) { finalState: state{ allocatedClaims: l(ac(inUseClaim, newEvictionTime(taintTime, ruleEvict)), ac(inUseClaimOtherNamespace, newEvictionTime(taintTime, ruleEvict))), deletePodAt: evictMap{newObject(podWithClaimName): *newEvictionTime(taintTime, ruleEvict), newObject(podWithClaimNameOtherNamespace): *newEvictionTime(taintTime, ruleEvict)}, - queued: MockState[workItem]{Ready: newWorkItems(ruleEvict, podWithClaimName, podWithClaimNameOtherNamespace)}, + queued: MockState[workItem]{Ready: newWorkItems(ruleEvict, podWithClaimName, podWithClaimNameOtherNamespace), Later: newDelayedWorkItems(ruleEvict, ruleStatusPeriod)}, }, process: []step{ { @@ -953,7 +961,7 @@ func testController(tCtx ktesting.TContext) { finalState: state{ allocatedClaims: l(ac(inUseClaim, newEvictionTime(taintTime, ruleEvict)), ac(inUseClaimOtherName, newEvictionTime(taintTime, ruleEvict))), deletePodAt: evictMap{newObject(podWithClaimName): *newEvictionTime(taintTime, ruleEvict), newObject(podWithClaimNameOtherName): *newEvictionTime(taintTime, ruleEvict)}, - queued: MockState[workItem]{Ready: newWorkItems(ruleEvict, podWithClaimName, podWithClaimNameOtherName)}, + queued: MockState[workItem]{Ready: newWorkItems(ruleEvict, podWithClaimName, podWithClaimNameOtherName), Later: newDelayedWorkItems(ruleEvict, ruleStatusPeriod)}, }, process: []step{ { @@ -1039,7 +1047,7 @@ func testController(tCtx ktesting.TContext) { finalState: state{ allocatedClaims: l(ac(inUseClaim, newEvictionTime(taintTime, ruleEvict)), ac(inUseClaimOtherNameShared, newEvictionTime(taintTime, ruleEvict))), deletePodAt: evictMap{newObject(podWithTwoClaimNames): *newEvictionTime(taintTime, ruleEvict)}, - queued: MockState[workItem]{Ready: newWorkItems(ruleEvict, podWithTwoClaimNames)}, + queued: MockState[workItem]{Ready: newWorkItems(ruleEvict, podWithTwoClaimNames), Later: newDelayedWorkItems(ruleEvict, ruleStatusPeriod)}, }, process: []step{ { @@ -1079,7 +1087,7 @@ func testController(tCtx ktesting.TContext) { finalState: state{ allocatedClaims: l(ac(inUseClaim, newEvictionTime(taintTime, ruleEvictInstance1)), ac(inUseClaimOtherNameShared, newEvictionTime(taintTimeLater, ruleEvictInstance2Later))), deletePodAt: evictMap{newObject(podWithTwoClaimNames): *newEvictionTime(taintTime, ruleEvictInstance1, ruleEvictInstance2Later)}, - queued: MockState[workItem]{Ready: newWorkItems(ruleEvictInstance1, ruleEvictInstance2Later, podWithTwoClaimNames)}, + queued: MockState[workItem]{Ready: newWorkItems(ruleEvictInstance1, ruleEvictInstance2Later, podWithTwoClaimNames), Later: newDelayedWorkItems(ruleEvictInstance1, ruleStatusPeriod, ruleEvictInstance2Later, ruleStatusPeriod)}, }, process: []step{ { @@ -1111,7 +1119,7 @@ func testController(tCtx ktesting.TContext) { finalState: state{ allocatedClaims: l(ac(inUseClaim, newEvictionTime(taintTime, ruleEvictInstance1)), ac(inUseClaimOtherNameShared, newEvictionTime(taintTimeLater, ruleEvictInstance2Later))), deletePodAt: evictMap{newObject(podWithTwoClaimNames): *newEvictionTime(taintTime, ruleEvictInstance1, ruleEvictInstance2Later)}, - queued: MockState[workItem]{Ready: newWorkItems(ruleEvictInstance1, ruleEvictInstance2Later, podWithTwoClaimNames), Later: newDelayedWorkItems(podWithTwoClaimNames, ruleEvictInstance2Later.Spec.Taint.TimeAdded.Sub(taintTime.Time))}, + queued: MockState[workItem]{Ready: newWorkItems(ruleEvictInstance1, ruleEvictInstance2Later, podWithTwoClaimNames), Later: newDelayedWorkItems(podWithTwoClaimNames, ruleEvictInstance2Later.Spec.Taint.TimeAdded.Sub(taintTime.Time), ruleEvictInstance2Later, ruleStatusPeriod, ruleEvictInstance1, ruleStatusPeriod)}, }, process: []step{ // The pod is scheduled for much later and time needs to advance a few times before it gets processed. @@ -1119,7 +1127,7 @@ func testController(tCtx ktesting.TContext) { ruleStats: map[types.UID]taintRuleStats{ruleEvictInstance1.UID: {numEvictedPods: 1}, ruleEvictInstance2Later.UID: {numEvictedPods: 1}}, // Initial update of both rules before eviction. rules: l(inProgress(ruleEvictInstance1, true, "PodsPendingEviction", "1 pod needs to be evicted in 1 namespace.", taintTime), inProgress(ruleEvictInstance2Later, true, "PodsPendingEviction", "1 pod needs to be evicted in 1 namespace.", taintTime)), - queuedProcessed: MockState[workItem]{Later: newDelayedWorkItems(podWithTwoClaimNames, ruleEvictInstance2Later.Spec.Taint.TimeAdded.Sub(taintTime.Time), ruleEvictInstance1, ruleStatusPeriod, ruleEvictInstance2Later, ruleStatusPeriod)}, + queuedProcessed: MockState[workItem]{Later: newDelayedWorkItems(podWithTwoClaimNames, ruleEvictInstance2Later.Spec.Taint.TimeAdded.Sub(taintTime.Time), ruleEvictInstance2Later, ruleStatusPeriod, ruleEvictInstance1, ruleStatusPeriod)}, advance: ruleStatusPeriod, queuedShifted: MockState[workItem]{Ready: newWorkItems(ruleEvictInstance1, ruleEvictInstance2Later), Later: newDelayedWorkItems(podWithTwoClaimNames, ruleEvictInstance2Later.Spec.Taint.TimeAdded.Sub(taintTime.Time)-ruleStatusPeriod)}, }, @@ -1496,22 +1504,30 @@ func testController(tCtx ktesting.TContext) { return claim }(), newEvictionTime(metav1Time(taintTime.Add(30*time.Second)), ruleEvict, ruleEvictOther, sliceTaintedTwice, sliceTaintedTwice.Spec.Devices[0].Name, 0, sliceTaintedTwice, sliceTaintedTwice.Spec.Devices[0].Name, 1))), deletePodAt: evictMap{newObject(podWithClaimName): *newEvictionTime(metav1Time(taintTime.Add(30*time.Second)), ruleEvict, ruleEvictOther, sliceTaintedTwice, sliceTaintedTwice.Spec.Devices[0].Name, 0, sliceTaintedTwice, sliceTaintedTwice.Spec.Devices[0].Name, 1)}, - queued: MockState[workItem]{Ready: newWorkItems(ruleEvict, ruleEvictOther), Later: newDelayedWorkItems(podWithClaimName, 30*time.Second)}, + queued: MockState[workItem]{Ready: newWorkItems(ruleEvict, ruleEvictOther), Later: newDelayedWorkItems(podWithClaimName, 30*time.Second, ruleEvict, ruleStatusPeriod, ruleEvictOther, ruleStatusPeriod)}, }, process: []step{ - // First advance time, then delete. { deletePodAt: evictMap{newObject(podWithClaimName): *newEvictionTime(metav1Time(taintTime.Add(30*time.Second)), ruleEvict, ruleEvictOther, sliceTaintedTwice, sliceTaintedTwice.Spec.Devices[0].Name, 0, sliceTaintedTwice, sliceTaintedTwice.Spec.Devices[0].Name, 1)}, pods: l(podWithClaimName), rules: l(inProgress(ruleEvict, true, "PodsPendingEviction", "1 pod needs to be evicted in 1 namespace.", taintTime), inProgress(ruleEvictOther, true, "PodsPendingEviction", "1 pod needs to be evicted in 1 namespace.", taintTime)), - queuedProcessed: MockState[workItem]{Later: newDelayedWorkItems(podWithClaimName, 30*time.Second)}, - advance: 30 * time.Second, + queuedProcessed: MockState[workItem]{Later: newDelayedWorkItems(podWithClaimName, 30*time.Second, ruleEvict, ruleStatusPeriod, ruleEvictOther, ruleStatusPeriod)}, + advance: ruleStatusPeriod, + queuedShifted: MockState[workItem]{Ready: newWorkItems(ruleEvict, ruleEvictOther), Later: newDelayedWorkItems(podWithClaimName, 30*time.Second-ruleStatusPeriod)}, + }, + // First advance time, then delete. + { + deletePodAt: evictMap{newObject(podWithClaimName): *newEvictionTime(metav1Time(taintTime.Add(30*time.Second)), ruleEvict, ruleEvictOther, sliceTaintedTwice, sliceTaintedTwice.Spec.Devices[0].Name, 0, sliceTaintedTwice, sliceTaintedTwice.Spec.Devices[0].Name, 1)}, + pods: l(podWithClaimName), + rules: l(inProgress(ruleEvict, true, "PodsPendingEviction", "1 pod needs to be evicted in 1 namespace.", metav1Time(taintTime.Add(ruleStatusPeriod))), inProgress(ruleEvictOther, true, "PodsPendingEviction", "1 pod needs to be evicted in 1 namespace.", metav1Time(taintTime.Add(ruleStatusPeriod)))), + queuedProcessed: MockState[workItem]{Later: newDelayedWorkItems(podWithClaimName, 20*time.Second)}, + advance: 20 * time.Second, queuedShifted: MockState[workItem]{Ready: newWorkItems(podWithClaimName)}, }, { ruleStats: map[types.UID]taintRuleStats{ruleEvict.UID: {numEvictedPods: 1}, ruleEvictOther.UID: {numEvictedPods: 1}}, // Not updated yet. - rules: l(inProgress(ruleEvict, true, "PodsPendingEviction", "1 pod needs to be evicted in 1 namespace.", taintTime), inProgress(ruleEvictOther, true, "PodsPendingEviction", "1 pod needs to be evicted in 1 namespace.", taintTime)), + rules: l(inProgress(ruleEvict, true, "PodsPendingEviction", "1 pod needs to be evicted in 1 namespace.", metav1Time(taintTime.Add(ruleStatusPeriod))), inProgress(ruleEvictOther, true, "PodsPendingEviction", "1 pod needs to be evicted in 1 namespace.", metav1Time(taintTime.Add(ruleStatusPeriod)))), queuedProcessed: MockState[workItem]{Later: newDelayedWorkItems(ruleEvict, ruleStatusPeriod, ruleEvictOther, ruleStatusPeriod)}, advance: ruleStatusPeriod, queuedShifted: MockState[workItem]{Ready: newWorkItems(ruleEvict, ruleEvictOther)}, @@ -1934,6 +1950,11 @@ func testHandlers(tContext *testContext, tc testCase) { } } + queueCmpOpts := []cmp.Option{ + cmpopts.SortSlices(compareWorkItems), + cmpopts.SortSlices(compareDelayedWorkItems), + } + assertEqual(tContext, tc.finalState.ruleStats, tContext.taintRuleStats, "taintRuleStats") assertEqual(tContext, tc.finalState.deletePodAt, tContext.deletePodAt, "deletePodAt") assertEqual(tContext, tc.finalState.allocatedClaimsAsMap(), tContext.allocatedClaims, "allocated claims") @@ -1942,7 +1963,7 @@ func testHandlers(tContext *testContext, tc testCase) { assert.Equal(tContext, tc.finalState.slicesAsMap()[key], tContext.pools[key], "pool") } } - assertEqual(tContext, tc.finalState.queued, tContext.mockQueue.State(), "work queue after event handlers", cmpopts.SortSlices(compareWorkItems)) + assertEqual(tContext, tc.finalState.queued, tContext.mockQueue.State(), "work queue after event handlers", queueCmpOpts...) assert.Empty(tContext, tc.finalState.pods, "pods not checked for final state") assert.Empty(tContext, tc.finalState.rules, "rules not checked for final state") @@ -1971,13 +1992,14 @@ func testHandlers(tContext *testContext, tc testCase) { assertEqual(tContext, state.rules, actualRules, prefix+"rules after flushing work queue") // Advance time and potentially make pending work items ready. - assertEqual(tContext, state.queuedProcessed, tContext.mockQueue.State(), prefix+"work queue after processing", cmpopts.SortSlices(compareWorkItems)) + assertEqual(tContext, state.queuedProcessed, tContext.mockQueue.State(), prefix+"work queue after processing", queueCmpOpts...) time.Sleep(state.advance) for _, item := range tContext.mockQueue.State().Later { + fmt.Println(item.Item, item.Duration) tContext.mockQueue.CancelAfter(item.Item) tContext.mockQueue.AddAfter(item.Item, item.Duration-state.advance) } - assertEqual(tContext, state.queuedShifted, tContext.mockQueue.State(), prefix+"work queue after moving time forward", cmpopts.SortSlices(compareWorkItems)) + assertEqual(tContext, state.queuedShifted, tContext.mockQueue.State(), prefix+"work queue after moving time forward", queueCmpOpts...) } assertEqual(tContext, tc.wantEvents, tContext.recorder.Events, "overall events", @@ -2001,6 +2023,17 @@ func compareWorkItems(a, b workItem) int { return strings.Compare(string(a.ruleRef.UID), string(b.ruleRef.UID)) } +func compareDelayedWorkItems(a, b MockDelayedItem[workItem]) int { + delta := a.Duration - b.Duration + if delta > 0 { + return 1 + } + if delta < 0 { + return -1 + } + return compareWorkItems(a.Item, b.Item) +} + func applyEventPair(tContext *testContext, event any) { switch pair := event.(type) { case [2]*resourceapi.ResourceSlice: @@ -2388,7 +2421,9 @@ func synctestDeviceTaintRule(tCtx ktesting.TContext) { wg.Wait() }() wg.Go(func() { - assert.NoError(tCtx, controller.Run(tCtx, 10 /* workers */), "eviction controller failed") + // Run with 1 worker to ensure sequential execution. Concurrent workers cause + // non-deterministic ordering of status updates, leading to flakes in Status assertions. + assert.NoError(tCtx, controller.Run(tCtx, 1 /* workers */), "eviction controller failed") }) // Eventually the controller should have synced it's informers. @@ -2420,10 +2455,11 @@ func synctestDeviceTaintRule(tCtx ktesting.TContext) { rule, err := tCtx.Client().ResourceV1alpha3().DeviceTaintRules().Update(tCtx, rule, metav1.UpdateOptions{}) tCtx.ExpectNoError(err, "update rule") - // Wait for eviction. The rule gets updated with another delay. + // Wait for eviction. tCtx.Wait() evicted := metav1.Now() tCtx.Logf("TIME: eviction done at %s", evicted) + // The rule status got updated once before evicting pods, but not yet after evicting it. check(tCtx, "evict: ", l(inProgress(rule, true, "PodsPendingEviction", "1 pod needs to be evicted in 1 namespace.", &evicted)), nil) // AddAfter does not move time forward. Do it ourselves... @@ -2433,7 +2469,7 @@ func synctestDeviceTaintRule(tCtx ktesting.TContext) { tCtx.Wait() done := metav1.Now() tCtx.Logf("TIME: done at %s", done) - check(tCtx, "done: ", l(inProgress(rule, false, "Completed", "1 pod evicted since starting the controller.", &slept)), nil) + check(tCtx, "done: ", l(inProgress(rule, false, "Completed", "1 pod evicted since starting the controller.", &done)), nil) assertEqual(tCtx, map[types.UID]taintRuleStats{rule.UID: {numEvictedPods: 1}}, controller.taintRuleStats, "taint rule statistics should have counted the pod") // Delete the rule and verify that we don't leak memory by still tracking it.