From 29e92367db0cedc10523fedd340e55efb0e67b4d Mon Sep 17 00:00:00 2001
From: Patrick Ohly
Date: Fri, 27 Feb 2026 14:34:56 +0100
Subject: [PATCH] DRA device taints: avoid unnecessary Pod lookup

When rapidly processing informer events it can happen that a pod gets queued
for eviction twice (seen only in the TestEviction/update unit test):
- Claim update observed, pod from informer cache with NodeName from update
  -> queue pod for eviction.
- Pod update observed, claim from informer cache -> queue pod again.

The effect is one additional Get call to the apiserver. We can avoid it by
maintaining an LRU cache with the UIDs of the pods which we have already
evicted and which therefore need no further processing.
---
 .../device_taint_eviction.go                  | 18 ++++++--
 .../devicetainteviction/uid_cache.go          | 46 +++++++++++++++++++
 2 files changed, 61 insertions(+), 3 deletions(-)
 create mode 100644 pkg/controller/devicetainteviction/uid_cache.go

diff --git a/pkg/controller/devicetainteviction/device_taint_eviction.go b/pkg/controller/devicetainteviction/device_taint_eviction.go
index 2569486fa58..2b385efad90 100644
--- a/pkg/controller/devicetainteviction/device_taint_eviction.go
+++ b/pkg/controller/devicetainteviction/device_taint_eviction.go
@@ -67,6 +67,8 @@ const (
 	// updates while eviction is in progress. Once it is done, it no longer gets
 	// updated until in progress again.
 	ruleStatusPeriod = 10 * time.Second
+
+	maxUIDCacheEntries = 500
 )
 
 // Controller listens to Taint changes of DRA devices and Toleration changes of ResourceClaims,
@@ -105,6 +107,10 @@ type Controller struct {
 	metrics metrics.Metrics
 	workqueue workqueue.TypedRateLimitingInterface[workItem]
 
+	// The evictedPods cache keeps track of Pods for which we know that
+	// they have been evicted.
+	evictedPods *uidCache
+
 	evictPodHook func(pod tainteviction.NamespacedObject, eviction evictionAndReason)
 	cancelEvictHook func(pod tainteviction.NamespacedObject) bool
 
@@ -383,10 +389,11 @@ func (tc *Controller) maybeDeletePod(ctx context.Context, podRef tainteviction.N
 	tc.mutex.Lock()
 	tc.maybeDeletePodCount++
 	eviction, ok := tc.deletePodAt[podRef]
+	evicted := tc.evictedPods.has(podRef.UID)
 	tc.mutex.Unlock()
-	logger.V(5).Info("Processing pod deletion work item", "active", ok, "eviction", eviction)
+	logger.V(5).Info("Processing pod deletion work item", "active", ok, "eviction", eviction, "evicted", evicted)
 
-	if !ok {
+	if !ok || evicted {
 		logger.V(5).Info("Work item for pod deletion obsolete, nothing to do")
 		return 0, nil
 	}
@@ -401,8 +408,12 @@ func (tc *Controller) maybeDeletePod(ctx context.Context, podRef tainteviction.N
 	defer func() {
 		if finalErr == nil {
 			// Forget the deletion time, we are done.
+			// Also remember that we don't even need to
+			// check the pod again, should it have been
+			// added to the queue again in the meantime.
 			tc.mutex.Lock()
 			delete(tc.deletePodAt, podRef)
+			tc.evictedPods.add(podRef.UID)
 			tc.mutex.Unlock()
 		}
 	}()
@@ -739,7 +750,8 @@ func New(c clientset.Interface, podInformer coreinformers.PodInformer, claimInfo
 			sliceInformer.Informer().HasSyncedChecker(),
 			classInformer.Informer().HasSyncedChecker(),
 		},
-		metrics: metrics.Global,
+		metrics:     metrics.Global,
+		evictedPods: newUIDCache(maxUIDCacheEntries),
 	}
 
 	// The informer for DeviceTaintRules only gets instantiated if the corresponding
diff --git a/pkg/controller/devicetainteviction/uid_cache.go b/pkg/controller/devicetainteviction/uid_cache.go
new file mode 100644
index 00000000000..9d33ef11680
--- /dev/null
+++ b/pkg/controller/devicetainteviction/uid_cache.go
@@ -0,0 +1,46 @@
+/*
+Copyright 2022 The Kubernetes Authors.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package devicetainteviction
+
+import (
+	"k8s.io/utils/lru"
+
+	"k8s.io/apimachinery/pkg/types"
+)
+
+// uidCache is an LRU cache of UIDs.
+type uidCache struct {
+	cache *lru.Cache
+}
+
+// newUIDCache returns a uidCache whose LRU is created with size maxCacheEntries.
+func newUIDCache(maxCacheEntries int) *uidCache {
+	return &uidCache{
+		cache: lru.New(maxCacheEntries),
+	}
+}
+
+// add adds a uid to the cache.
+func (c *uidCache) add(uid types.UID) {
+	c.cache.Add(uid, nil)
+}
+
+// has reports whether a uid is in the cache.
+func (c *uidCache) has(uid types.UID) bool {
+	_, found := c.cache.Get(uid)
+	return found
+}