Merge pull request #135611 from Karthik-K-N/fix-device-taint

Fix flake TestDeviceTaintRule test
This commit is contained in:
Kubernetes Prow Robot 2026-01-06 14:45:08 +05:30 committed by GitHub
commit 76c562be4a
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 83 additions and 33 deletions

View file

@ -434,6 +434,7 @@ func (tc *Controller) maybeDeletePod(ctx context.Context, podRef tainteviction.N
// Doing this immediately is not useful because
// it would just race with the informers update
// (rule status reads from cache!).
tc.logger.V(5).Info("Adding delayed status update because of pod eviction", "deviceTaintRule", klog.KObj(reason.rule), "delay", ruleStatusPeriod)
tc.workqueue.AddAfter(workItemForRule(reason.rule), ruleStatusPeriod)
}
}
@ -1016,7 +1017,14 @@ func (tc *Controller) Run(ctx context.Context, numWorkers int) error {
func (tc *Controller) evictPod(podRef tainteviction.NamespacedObject, eviction evictionAndReason) {
tc.deletePodAt[podRef] = eviction
now := time.Now()
tc.workqueue.AddAfter(workItem{podRef: podRef}, eviction.when.Sub(now))
delay := eviction.when.Sub(now)
if delay <= 0 {
tc.logger.V(3).Info("Adding immediate pod eviction", "pod", podRef, "eviction", eviction)
tc.workqueue.Add(workItem{podRef: podRef})
} else {
tc.logger.V(3).Info("Adding delayed pod eviction", "pod", podRef, "eviction", eviction, "delay", delay)
tc.workqueue.AddAfter(workItem{podRef: podRef}, delay)
}
if tc.evictPodHook != nil {
tc.evictPodHook(podRef, eviction)
@ -1275,7 +1283,8 @@ func (tc *Controller) handleRuleChange(oldRule, newRule *resourcealpha.DeviceTai
}
if oldRule == nil {
// Update the status at least once.
// Update the status at least once, immediately and before evicting any pods.
tc.logger.V(5).Info("Adding immediate status update because of new rule", "deviceTaintRule", klog.KObj(newRule))
tc.workqueue.Add(workItemForRule(newRule))
}
@ -1289,9 +1298,13 @@ func (tc *Controller) handleRuleChange(oldRule, newRule *resourcealpha.DeviceTai
if oldRule != nil &&
newRule != nil &&
oldRule.UID == newRule.UID &&
apiequality.Semantic.DeepEqual(&oldRule.Spec, &newRule.Spec) {
return
oldRule.UID == newRule.UID {
if apiequality.Semantic.DeepEqual(&oldRule.Spec, &newRule.Spec) {
return
}
// Update the status at least once, immediately and before evicting any pods.
tc.logger.V(5).Info("Adding immediate status update because of modified rule spec", "deviceTaintRule", klog.KObj(newRule))
tc.workqueue.Add(workItemForRule(newRule))
}
// Rule spec changes should be rare. Simply do a brute-force re-evaluation of all allocated claims.
@ -1475,13 +1488,14 @@ func (tc *Controller) handlePod(pod *v1.Pod) {
return
}
tc.logger.V(3).Info("Going to evict pod", "pod", podRef, "eviction", eviction)
tc.evictPod(podRef, *eviction)
// If any reason is because of a taint, then eviction is in progress and the status may need to be updated.
// But don't do it immediately because more pod changes may be coming in.
for _, reason := range eviction.reason {
if reason.rule != nil {
tc.workqueue.Add(workItemForRule(reason.rule))
tc.logger.V(5).Info("Adding delayed status update because of pod change", "deviceTaintRule", klog.KObj(reason.rule), "delay", ruleStatusPeriod)
tc.workqueue.AddAfter(workItemForRule(reason.rule), ruleStatusPeriod)
}
}
}

View file

@ -855,7 +855,7 @@ func testController(tCtx ktesting.TContext) {
finalState: state{
allocatedClaims: l(ac(inUseClaim, newEvictionTime(taintTime, ruleEvict))),
deletePodAt: evictMap{newObject(podWithClaimName): *newEvictionTime(taintTime, ruleEvict)},
queued: MockState[workItem]{Ready: newWorkItems(ruleEvict, podWithClaimName)},
queued: MockState[workItem]{Ready: newWorkItems(ruleEvict, podWithClaimName), Later: newDelayedWorkItems(ruleEvict, ruleStatusPeriod)},
},
process: []step{
{
@ -883,23 +883,31 @@ func testController(tCtx ktesting.TContext) {
finalState: state{
allocatedClaims: l(ac(inUseClaimWithToleration, newEvictionTime(metav1Time(taintTime.Add(tolerationDuration)), ruleEvict))),
deletePodAt: evictMap{newObject(podWithClaimName): *newEvictionTime(metav1Time(taintTime.Add(tolerationDuration)), ruleEvict)},
queued: MockState[workItem]{Ready: newWorkItems(ruleEvict), Later: newDelayedWorkItems(podWithClaimName, tolerationDuration)},
queued: MockState[workItem]{Ready: newWorkItems(ruleEvict), Later: newDelayedWorkItems(podWithClaimName, tolerationDuration, ruleEvict, ruleStatusPeriod)},
},
process: []step{
{
// Initial update.
deletePodAt: evictMap{newObject(podWithClaimName): *newEvictionTime(metav1Time(taintTime.Add(tolerationDuration)), ruleEvict)},
pods: l(podWithClaimName),
rules: l(inProgress(ruleEvict, true, "PodsPendingEviction", "1 pod needs to be evicted in 1 namespace.", taintTime)),
queuedProcessed: MockState[workItem]{Later: newDelayedWorkItems(podWithClaimName, tolerationDuration)},
advance: tolerationDuration,
deletePodAt: evictMap{newObject(podWithClaimName): *newEvictionTime(metav1Time(taintTime.Add(tolerationDuration)), ruleEvict)},
pods: l(podWithClaimName),
rules: l(inProgress(ruleEvict, true, "PodsPendingEviction", "1 pod needs to be evicted in 1 namespace.", taintTime)),
queuedProcessed: MockState[workItem]{Later: newDelayedWorkItems(podWithClaimName, tolerationDuration, ruleEvict, ruleStatusPeriod)},
advance: ruleStatusPeriod,
queuedShifted: MockState[workItem]{Ready: newWorkItems(ruleEvict), Later: newDelayedWorkItems(podWithClaimName, tolerationDuration-ruleStatusPeriod)},
},
{
// Process the pod eviction.
deletePodAt: evictMap{newObject(podWithClaimName): *newEvictionTime(metav1Time(taintTime.Add(tolerationDuration)), ruleEvict)},
pods: l(podWithClaimName),
rules: l(inProgress(ruleEvict, true, "PodsPendingEviction", "1 pod needs to be evicted in 1 namespace.", metav1Time(taintTime.Add(ruleStatusPeriod)))),
queuedProcessed: MockState[workItem]{Later: newDelayedWorkItems(podWithClaimName, tolerationDuration-ruleStatusPeriod)},
advance: tolerationDuration - ruleStatusPeriod,
queuedShifted: MockState[workItem]{Ready: newWorkItems(podWithClaimName)},
},
{
// Deleted, but condition not updated yet.
ruleStats: map[types.UID]taintRuleStats{ruleEvict.UID: {numEvictedPods: 1}},
rules: l(inProgress(ruleEvict, true, "PodsPendingEviction", "1 pod needs to be evicted in 1 namespace.", taintTime)),
rules: l(inProgress(ruleEvict, true, "PodsPendingEviction", "1 pod needs to be evicted in 1 namespace.", metav1Time(taintTime.Add(ruleStatusPeriod)))),
queuedProcessed: MockState[workItem]{Later: newDelayedWorkItems(ruleEvict, ruleStatusPeriod)},
advance: ruleStatusPeriod,
queuedShifted: MockState[workItem]{Ready: newWorkItems(ruleEvict)},
@ -923,7 +931,7 @@ func testController(tCtx ktesting.TContext) {
finalState: state{
allocatedClaims: l(ac(inUseClaim, newEvictionTime(taintTime, ruleEvict)), ac(inUseClaimOtherNamespace, newEvictionTime(taintTime, ruleEvict))),
deletePodAt: evictMap{newObject(podWithClaimName): *newEvictionTime(taintTime, ruleEvict), newObject(podWithClaimNameOtherNamespace): *newEvictionTime(taintTime, ruleEvict)},
queued: MockState[workItem]{Ready: newWorkItems(ruleEvict, podWithClaimName, podWithClaimNameOtherNamespace)},
queued: MockState[workItem]{Ready: newWorkItems(ruleEvict, podWithClaimName, podWithClaimNameOtherNamespace), Later: newDelayedWorkItems(ruleEvict, ruleStatusPeriod)},
},
process: []step{
{
@ -953,7 +961,7 @@ func testController(tCtx ktesting.TContext) {
finalState: state{
allocatedClaims: l(ac(inUseClaim, newEvictionTime(taintTime, ruleEvict)), ac(inUseClaimOtherName, newEvictionTime(taintTime, ruleEvict))),
deletePodAt: evictMap{newObject(podWithClaimName): *newEvictionTime(taintTime, ruleEvict), newObject(podWithClaimNameOtherName): *newEvictionTime(taintTime, ruleEvict)},
queued: MockState[workItem]{Ready: newWorkItems(ruleEvict, podWithClaimName, podWithClaimNameOtherName)},
queued: MockState[workItem]{Ready: newWorkItems(ruleEvict, podWithClaimName, podWithClaimNameOtherName), Later: newDelayedWorkItems(ruleEvict, ruleStatusPeriod)},
},
process: []step{
{
@ -1039,7 +1047,7 @@ func testController(tCtx ktesting.TContext) {
finalState: state{
allocatedClaims: l(ac(inUseClaim, newEvictionTime(taintTime, ruleEvict)), ac(inUseClaimOtherNameShared, newEvictionTime(taintTime, ruleEvict))),
deletePodAt: evictMap{newObject(podWithTwoClaimNames): *newEvictionTime(taintTime, ruleEvict)},
queued: MockState[workItem]{Ready: newWorkItems(ruleEvict, podWithTwoClaimNames)},
queued: MockState[workItem]{Ready: newWorkItems(ruleEvict, podWithTwoClaimNames), Later: newDelayedWorkItems(ruleEvict, ruleStatusPeriod)},
},
process: []step{
{
@ -1079,7 +1087,7 @@ func testController(tCtx ktesting.TContext) {
finalState: state{
allocatedClaims: l(ac(inUseClaim, newEvictionTime(taintTime, ruleEvictInstance1)), ac(inUseClaimOtherNameShared, newEvictionTime(taintTimeLater, ruleEvictInstance2Later))),
deletePodAt: evictMap{newObject(podWithTwoClaimNames): *newEvictionTime(taintTime, ruleEvictInstance1, ruleEvictInstance2Later)},
queued: MockState[workItem]{Ready: newWorkItems(ruleEvictInstance1, ruleEvictInstance2Later, podWithTwoClaimNames)},
queued: MockState[workItem]{Ready: newWorkItems(ruleEvictInstance1, ruleEvictInstance2Later, podWithTwoClaimNames), Later: newDelayedWorkItems(ruleEvictInstance1, ruleStatusPeriod, ruleEvictInstance2Later, ruleStatusPeriod)},
},
process: []step{
{
@ -1111,7 +1119,7 @@ func testController(tCtx ktesting.TContext) {
finalState: state{
allocatedClaims: l(ac(inUseClaim, newEvictionTime(taintTime, ruleEvictInstance1)), ac(inUseClaimOtherNameShared, newEvictionTime(taintTimeLater, ruleEvictInstance2Later))),
deletePodAt: evictMap{newObject(podWithTwoClaimNames): *newEvictionTime(taintTime, ruleEvictInstance1, ruleEvictInstance2Later)},
queued: MockState[workItem]{Ready: newWorkItems(ruleEvictInstance1, ruleEvictInstance2Later, podWithTwoClaimNames), Later: newDelayedWorkItems(podWithTwoClaimNames, ruleEvictInstance2Later.Spec.Taint.TimeAdded.Sub(taintTime.Time))},
queued: MockState[workItem]{Ready: newWorkItems(ruleEvictInstance1, ruleEvictInstance2Later, podWithTwoClaimNames), Later: newDelayedWorkItems(podWithTwoClaimNames, ruleEvictInstance2Later.Spec.Taint.TimeAdded.Sub(taintTime.Time), ruleEvictInstance2Later, ruleStatusPeriod, ruleEvictInstance1, ruleStatusPeriod)},
},
process: []step{
// The pod is scheduled for much later and time needs to advance a few times before it gets processed.
@ -1119,7 +1127,7 @@ func testController(tCtx ktesting.TContext) {
ruleStats: map[types.UID]taintRuleStats{ruleEvictInstance1.UID: {numEvictedPods: 1}, ruleEvictInstance2Later.UID: {numEvictedPods: 1}},
// Initial update of both rules before eviction.
rules: l(inProgress(ruleEvictInstance1, true, "PodsPendingEviction", "1 pod needs to be evicted in 1 namespace.", taintTime), inProgress(ruleEvictInstance2Later, true, "PodsPendingEviction", "1 pod needs to be evicted in 1 namespace.", taintTime)),
queuedProcessed: MockState[workItem]{Later: newDelayedWorkItems(podWithTwoClaimNames, ruleEvictInstance2Later.Spec.Taint.TimeAdded.Sub(taintTime.Time), ruleEvictInstance1, ruleStatusPeriod, ruleEvictInstance2Later, ruleStatusPeriod)},
queuedProcessed: MockState[workItem]{Later: newDelayedWorkItems(podWithTwoClaimNames, ruleEvictInstance2Later.Spec.Taint.TimeAdded.Sub(taintTime.Time), ruleEvictInstance2Later, ruleStatusPeriod, ruleEvictInstance1, ruleStatusPeriod)},
advance: ruleStatusPeriod,
queuedShifted: MockState[workItem]{Ready: newWorkItems(ruleEvictInstance1, ruleEvictInstance2Later), Later: newDelayedWorkItems(podWithTwoClaimNames, ruleEvictInstance2Later.Spec.Taint.TimeAdded.Sub(taintTime.Time)-ruleStatusPeriod)},
},
@ -1496,22 +1504,30 @@ func testController(tCtx ktesting.TContext) {
return claim
}(), newEvictionTime(metav1Time(taintTime.Add(30*time.Second)), ruleEvict, ruleEvictOther, sliceTaintedTwice, sliceTaintedTwice.Spec.Devices[0].Name, 0, sliceTaintedTwice, sliceTaintedTwice.Spec.Devices[0].Name, 1))),
deletePodAt: evictMap{newObject(podWithClaimName): *newEvictionTime(metav1Time(taintTime.Add(30*time.Second)), ruleEvict, ruleEvictOther, sliceTaintedTwice, sliceTaintedTwice.Spec.Devices[0].Name, 0, sliceTaintedTwice, sliceTaintedTwice.Spec.Devices[0].Name, 1)},
queued: MockState[workItem]{Ready: newWorkItems(ruleEvict, ruleEvictOther), Later: newDelayedWorkItems(podWithClaimName, 30*time.Second)},
queued: MockState[workItem]{Ready: newWorkItems(ruleEvict, ruleEvictOther), Later: newDelayedWorkItems(podWithClaimName, 30*time.Second, ruleEvict, ruleStatusPeriod, ruleEvictOther, ruleStatusPeriod)},
},
process: []step{
// First advance time, then delete.
{
deletePodAt: evictMap{newObject(podWithClaimName): *newEvictionTime(metav1Time(taintTime.Add(30*time.Second)), ruleEvict, ruleEvictOther, sliceTaintedTwice, sliceTaintedTwice.Spec.Devices[0].Name, 0, sliceTaintedTwice, sliceTaintedTwice.Spec.Devices[0].Name, 1)},
pods: l(podWithClaimName),
rules: l(inProgress(ruleEvict, true, "PodsPendingEviction", "1 pod needs to be evicted in 1 namespace.", taintTime), inProgress(ruleEvictOther, true, "PodsPendingEviction", "1 pod needs to be evicted in 1 namespace.", taintTime)),
queuedProcessed: MockState[workItem]{Later: newDelayedWorkItems(podWithClaimName, 30*time.Second)},
advance: 30 * time.Second,
queuedProcessed: MockState[workItem]{Later: newDelayedWorkItems(podWithClaimName, 30*time.Second, ruleEvict, ruleStatusPeriod, ruleEvictOther, ruleStatusPeriod)},
advance: ruleStatusPeriod,
queuedShifted: MockState[workItem]{Ready: newWorkItems(ruleEvict, ruleEvictOther), Later: newDelayedWorkItems(podWithClaimName, 30*time.Second-ruleStatusPeriod)},
},
// First advance time, then delete.
{
deletePodAt: evictMap{newObject(podWithClaimName): *newEvictionTime(metav1Time(taintTime.Add(30*time.Second)), ruleEvict, ruleEvictOther, sliceTaintedTwice, sliceTaintedTwice.Spec.Devices[0].Name, 0, sliceTaintedTwice, sliceTaintedTwice.Spec.Devices[0].Name, 1)},
pods: l(podWithClaimName),
rules: l(inProgress(ruleEvict, true, "PodsPendingEviction", "1 pod needs to be evicted in 1 namespace.", metav1Time(taintTime.Add(ruleStatusPeriod))), inProgress(ruleEvictOther, true, "PodsPendingEviction", "1 pod needs to be evicted in 1 namespace.", metav1Time(taintTime.Add(ruleStatusPeriod)))),
queuedProcessed: MockState[workItem]{Later: newDelayedWorkItems(podWithClaimName, 20*time.Second)},
advance: 20 * time.Second,
queuedShifted: MockState[workItem]{Ready: newWorkItems(podWithClaimName)},
},
{
ruleStats: map[types.UID]taintRuleStats{ruleEvict.UID: {numEvictedPods: 1}, ruleEvictOther.UID: {numEvictedPods: 1}},
// Not updated yet.
rules: l(inProgress(ruleEvict, true, "PodsPendingEviction", "1 pod needs to be evicted in 1 namespace.", taintTime), inProgress(ruleEvictOther, true, "PodsPendingEviction", "1 pod needs to be evicted in 1 namespace.", taintTime)),
rules: l(inProgress(ruleEvict, true, "PodsPendingEviction", "1 pod needs to be evicted in 1 namespace.", metav1Time(taintTime.Add(ruleStatusPeriod))), inProgress(ruleEvictOther, true, "PodsPendingEviction", "1 pod needs to be evicted in 1 namespace.", metav1Time(taintTime.Add(ruleStatusPeriod)))),
queuedProcessed: MockState[workItem]{Later: newDelayedWorkItems(ruleEvict, ruleStatusPeriod, ruleEvictOther, ruleStatusPeriod)},
advance: ruleStatusPeriod,
queuedShifted: MockState[workItem]{Ready: newWorkItems(ruleEvict, ruleEvictOther)},
@ -1934,6 +1950,11 @@ func testHandlers(tContext *testContext, tc testCase) {
}
}
queueCmpOpts := []cmp.Option{
cmpopts.SortSlices(compareWorkItems),
cmpopts.SortSlices(compareDelayedWorkItems),
}
assertEqual(tContext, tc.finalState.ruleStats, tContext.taintRuleStats, "taintRuleStats")
assertEqual(tContext, tc.finalState.deletePodAt, tContext.deletePodAt, "deletePodAt")
assertEqual(tContext, tc.finalState.allocatedClaimsAsMap(), tContext.allocatedClaims, "allocated claims")
@ -1942,7 +1963,7 @@ func testHandlers(tContext *testContext, tc testCase) {
assert.Equal(tContext, tc.finalState.slicesAsMap()[key], tContext.pools[key], "pool")
}
}
assertEqual(tContext, tc.finalState.queued, tContext.mockQueue.State(), "work queue after event handlers", cmpopts.SortSlices(compareWorkItems))
assertEqual(tContext, tc.finalState.queued, tContext.mockQueue.State(), "work queue after event handlers", queueCmpOpts...)
assert.Empty(tContext, tc.finalState.pods, "pods not checked for final state")
assert.Empty(tContext, tc.finalState.rules, "rules not checked for final state")
@ -1971,13 +1992,14 @@ func testHandlers(tContext *testContext, tc testCase) {
assertEqual(tContext, state.rules, actualRules, prefix+"rules after flushing work queue")
// Advance time and potentially make pending work items ready.
assertEqual(tContext, state.queuedProcessed, tContext.mockQueue.State(), prefix+"work queue after processing", cmpopts.SortSlices(compareWorkItems))
assertEqual(tContext, state.queuedProcessed, tContext.mockQueue.State(), prefix+"work queue after processing", queueCmpOpts...)
time.Sleep(state.advance)
for _, item := range tContext.mockQueue.State().Later {
fmt.Println(item.Item, item.Duration)
tContext.mockQueue.CancelAfter(item.Item)
tContext.mockQueue.AddAfter(item.Item, item.Duration-state.advance)
}
assertEqual(tContext, state.queuedShifted, tContext.mockQueue.State(), prefix+"work queue after moving time forward", cmpopts.SortSlices(compareWorkItems))
assertEqual(tContext, state.queuedShifted, tContext.mockQueue.State(), prefix+"work queue after moving time forward", queueCmpOpts...)
}
assertEqual(tContext, tc.wantEvents, tContext.recorder.Events, "overall events",
@ -2001,6 +2023,17 @@ func compareWorkItems(a, b workItem) int {
return strings.Compare(string(a.ruleRef.UID), string(b.ruleRef.UID))
}
func compareDelayedWorkItems(a, b MockDelayedItem[workItem]) int {
delta := a.Duration - b.Duration
if delta > 0 {
return 1
}
if delta < 0 {
return -1
}
return compareWorkItems(a.Item, b.Item)
}
func applyEventPair(tContext *testContext, event any) {
switch pair := event.(type) {
case [2]*resourceapi.ResourceSlice:
@ -2388,7 +2421,9 @@ func synctestDeviceTaintRule(tCtx ktesting.TContext) {
wg.Wait()
}()
wg.Go(func() {
assert.NoError(tCtx, controller.Run(tCtx, 10 /* workers */), "eviction controller failed")
// Run with 1 worker to ensure sequential execution. Concurrent workers cause
// non-deterministic ordering of status updates, leading to flakes in Status assertions.
assert.NoError(tCtx, controller.Run(tCtx, 1 /* workers */), "eviction controller failed")
})
// Eventually the controller should have synced it's informers.
@ -2420,10 +2455,11 @@ func synctestDeviceTaintRule(tCtx ktesting.TContext) {
rule, err := tCtx.Client().ResourceV1alpha3().DeviceTaintRules().Update(tCtx, rule, metav1.UpdateOptions{})
tCtx.ExpectNoError(err, "update rule")
// Wait for eviction. The rule gets updated with another delay.
// Wait for eviction.
tCtx.Wait()
evicted := metav1.Now()
tCtx.Logf("TIME: eviction done at %s", evicted)
// The rule status got updated once before evicting pods, but not yet after evicting it.
check(tCtx, "evict: ", l(inProgress(rule, true, "PodsPendingEviction", "1 pod needs to be evicted in 1 namespace.", &evicted)), nil)
// AddAfter does not move time forward. Do it ourselves...
@ -2433,7 +2469,7 @@ func synctestDeviceTaintRule(tCtx ktesting.TContext) {
tCtx.Wait()
done := metav1.Now()
tCtx.Logf("TIME: done at %s", done)
check(tCtx, "done: ", l(inProgress(rule, false, "Completed", "1 pod evicted since starting the controller.", &slept)), nil)
check(tCtx, "done: ", l(inProgress(rule, false, "Completed", "1 pod evicted since starting the controller.", &done)), nil)
assertEqual(tCtx, map[types.UID]taintRuleStats{rule.UID: {numEvictedPods: 1}}, controller.taintRuleStats, "taint rule statistics should have counted the pod")
// Delete the rule and verify that we don't leak memory by still tracking it.