2022-03-22 04:59:20 -04:00
/*
Copyright 2020 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
2022-03-22 11:56:49 -04:00
package resourceclaim
2022-03-22 04:59:20 -04:00
import (
"context"
2023-04-14 03:50:52 -04:00
"errors"
2022-03-22 04:59:20 -04:00
"fmt"
2024-02-21 04:06:54 -05:00
"slices"
2022-03-22 11:56:49 -04:00
"strings"
2025-10-27 10:07:56 -04:00
"sync"
2022-03-22 04:59:20 -04:00
"time"
v1 "k8s.io/api/core/v1"
2025-07-04 11:43:31 -04:00
resourceapi "k8s.io/api/resource/v1"
2023-04-14 03:50:52 -04:00
apierrors "k8s.io/apimachinery/pkg/api/errors"
2022-03-22 04:59:20 -04:00
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
2025-07-08 00:24:31 -04:00
"k8s.io/apimachinery/pkg/labels"
2023-06-22 08:15:17 -04:00
"k8s.io/apimachinery/pkg/types"
2022-03-22 04:59:20 -04:00
"k8s.io/apimachinery/pkg/util/runtime"
2025-12-30 00:44:48 -05:00
"k8s.io/apimachinery/pkg/util/sets"
2022-03-22 04:59:20 -04:00
"k8s.io/apimachinery/pkg/util/wait"
2023-04-14 03:50:52 -04:00
corev1apply "k8s.io/client-go/applyconfigurations/core/v1"
2022-03-22 11:56:49 -04:00
v1informers "k8s.io/client-go/informers/core/v1"
2025-07-04 11:43:31 -04:00
resourceinformers "k8s.io/client-go/informers/resource/v1"
2022-03-22 04:59:20 -04:00
clientset "k8s.io/client-go/kubernetes"
"k8s.io/client-go/kubernetes/scheme"
v1core "k8s.io/client-go/kubernetes/typed/core/v1"
2022-03-22 11:56:49 -04:00
v1listers "k8s.io/client-go/listers/core/v1"
2025-07-04 11:43:31 -04:00
resourcelisters "k8s.io/client-go/listers/resource/v1"
2022-03-22 04:59:20 -04:00
"k8s.io/client-go/tools/cache"
"k8s.io/client-go/tools/record"
"k8s.io/client-go/util/workqueue"
2025-07-08 00:24:31 -04:00
"k8s.io/component-base/metrics"
2022-03-22 11:56:49 -04:00
"k8s.io/dynamic-resource-allocation/resourceclaim"
"k8s.io/klog/v2"
podutil "k8s.io/kubernetes/pkg/api/v1/pod"
2025-07-08 00:24:31 -04:00
resourceclaimmetrics "k8s.io/kubernetes/pkg/controller/resourceclaim/metrics"
2023-09-15 09:07:03 -04:00
"k8s.io/utils/ptr"
2022-03-22 04:59:20 -04:00
)
2022-03-22 11:56:49 -04:00
const (
	// podResourceClaimIndex is the lookup name for the index function which indexes by pod ResourceClaim.
	podResourceClaimIndex = "pod-resource-claim-index"

	// podResourceClaimTemplateIndexKey is the lookup name for the index function which indexes only by pod ResourceClaim templates.
	podResourceClaimTemplateIndexKey = "pod-resource-claim-template-index"

	// podResourceClaimAnnotation is the special annotation that generated
	// ResourceClaims get. Its value is the pod.spec.resourceClaims[].name
	// for which it was generated. This is used only inside the controller
	// and not documented as part of the Kubernetes API.
	podResourceClaimAnnotation = "resource.kubernetes.io/pod-claim-name"

	// claimPodOwnerIndex is used to find ResourceClaims which have
	// a specific pod as owner. Values for this index are the pod UID.
	claimPodOwnerIndex = "claim-pod-owner-index"

	// fieldManager is the field manager used to update the pod status.
	fieldManager = "ResourceClaimController"

	// maxUIDCacheEntries bounds the size of the deletedObjects UID cache.
	maxUIDCacheEntries = 500
)
// Controller creates ResourceClaims for ResourceClaimTemplates in a pod spec.
type Controller struct {
	// features defines the feature gates that are enabled.
	features Features

	// kubeClient is the kube API client used to communicate with the API
	// server.
	kubeClient clientset.Interface

	// claimLister is the shared ResourceClaim lister used to fetch and store ResourceClaim
	// objects from the API server. It is shared with other controllers and
	// therefore the ResourceClaim objects in its store should be treated as immutable.
	claimLister  resourcelisters.ResourceClaimLister
	claimsSynced cache.InformerSynced
	// claimCache layers a mutation cache over the claim informer cache so
	// that claims created by this controller stay visible until the
	// informer catches up (see NewController for details).
	claimCache cache.MutationCache

	// podLister is the shared Pod lister used to fetch Pod
	// objects from the API server. It is shared with other controllers and
	// therefore the Pod objects in its store should be treated as immutable.
	podLister v1listers.PodLister
	podSynced cache.InformerSynced

	// templateLister is the shared ResourceClaimTemplate lister used to
	// fetch template objects from the API server. It is shared with other
	// controllers and therefore the objects in its store should be treated
	// as immutable.
	templateLister  resourcelisters.ResourceClaimTemplateLister
	templatesSynced cache.InformerSynced

	// podIndexer has the common PodResourceClaim indexer installed to
	// limit iteration over pods to those of interest.
	podIndexer cache.Indexer

	// recorder is used to record events in the API server.
	recorder record.EventRecorder

	// queue holds the work items; keys are prefixed with claimKeyPrefix or
	// podKeyPrefix to distinguish claim work from pod work.
	queue workqueue.TypedRateLimitingInterface[string]

	// The deletedObjects cache keeps track of Pods for which we know that
	// they have existed and have been removed. For those we can be sure
	// that a ReservedFor entry needs to be removed.
	deletedObjects *uidCache
}
2022-03-22 11:56:49 -04:00
// Work queue keys are prefixed so that syncHandler can tell claim items
// apart from pod items.
const (
	claimKeyPrefix = "claim:"
	podKeyPrefix   = "pod:"
)
2025-02-28 14:32:59 -05:00
// Features defines which features should be enabled in the controller.
type Features struct {
	// AdminAccess enables handling of templates which request admin access.
	AdminAccess bool
	// PrioritizedList enables handling of templates which include a
	// prioritized list of subrequests.
	PrioritizedList bool
}
2022-03-22 11:56:49 -04:00
// NewController creates a ResourceClaim controller.
//
// It wires up event handlers on the pod, claim, and template informers,
// installs the pod indexers used for reverse lookups, and wraps the claim
// informer cache in a mutation cache. Workers are started separately via Run.
func NewController(
	logger klog.Logger,
	features Features,
	kubeClient clientset.Interface,
	podInformer v1informers.PodInformer,
	claimInformer resourceinformers.ResourceClaimInformer,
	templateInformer resourceinformers.ResourceClaimTemplateInformer) (*Controller, error) {
	ec := &Controller{
		features:        features,
		kubeClient:      kubeClient,
		podLister:       podInformer.Lister(),
		podIndexer:      podInformer.Informer().GetIndexer(),
		podSynced:       podInformer.Informer().HasSynced,
		claimLister:     claimInformer.Lister(),
		claimsSynced:    claimInformer.Informer().HasSynced,
		templateLister:  templateInformer.Lister(),
		templatesSynced: templateInformer.Informer().HasSynced,
		queue: workqueue.NewTypedRateLimitingQueueWithConfig(
			workqueue.DefaultTypedControllerRateLimiter[string](),
			workqueue.TypedRateLimitingQueueConfig[string]{Name: "resource_claim"},
		),
		deletedObjects: newUIDCache(maxUIDCacheEntries),
	}

	resourceclaimmetrics.RegisterMetrics(newCustomCollector(ec.claimLister, getAdminAccessMetricLabel, logger))

	// Pod events: deletes are flagged so that enqueuePod can remember the
	// UID in deletedObjects.
	if _, err := podInformer.Informer().AddEventHandlerWithOptions(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			ec.enqueuePod(logger, obj, false)
		},
		UpdateFunc: func(old, updated interface{}) {
			ec.enqueuePod(logger, updated, false)
		},
		DeleteFunc: func(obj interface{}) {
			ec.enqueuePod(logger, obj, true)
		},
	}, cache.HandlerOptions{Logger: &logger}); err != nil {
		return nil, err
	}
	// Claim events: enqueueResourceClaim receives old/new pairs; nil marks
	// the missing side for adds and deletes.
	if _, err := claimInformer.Informer().AddEventHandlerWithOptions(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			logger.V(6).Info("New claim", "claimDump", obj)
			ec.enqueueResourceClaim(logger, nil, obj)
		},
		UpdateFunc: func(old, updated interface{}) {
			logger.V(6).Info("Updated claim", "claimDump", updated)
			ec.enqueueResourceClaim(logger, old, updated)
		},
		DeleteFunc: func(obj interface{}) {
			logger.V(6).Info("Deleted claim", "claimDump", obj)
			ec.enqueueResourceClaim(logger, obj, nil)
		},
	}, cache.HandlerOptions{Logger: &logger}); err != nil {
		return nil, err
	}
	// Template events: deletes need no handling; pods referencing a deleted
	// template fail claim creation and get retried anyway.
	if _, err := templateInformer.Informer().AddEventHandlerWithOptions(cache.ResourceEventHandlerFuncs{
		AddFunc: func(obj interface{}) {
			logger.V(6).Info("New claim template", "claimTemplateDump", obj)
			ec.enqueueResourceClaimTemplate(logger, obj)
		},
		UpdateFunc: func(old, updated interface{}) {
			logger.V(6).Info("Updated claim template", "claimTemplateDump", updated)
			ec.enqueueResourceClaimTemplate(logger, updated)
		},
		DeleteFunc: func(obj interface{}) {
			logger.V(6).Info("Deleted claim template", "claimTemplateDump", obj)
		},
	}, cache.HandlerOptions{Logger: &logger}); err != nil {
		return nil, err
	}
	if err := ec.podIndexer.AddIndexers(cache.Indexers{podResourceClaimIndex: podResourceClaimIndexFunc}); err != nil {
		return nil, fmt.Errorf("could not initialize ResourceClaim controller: %w", err)
	}
	if err := ec.podIndexer.AddIndexers(cache.Indexers{podResourceClaimTemplateIndexKey: podResourceClaimTemplateIndexFunc}); err != nil {
		return nil, fmt.Errorf("could not initialize ResourceClaim controller: %w", err)
	}

	// The mutation cache acts as an additional layer for the informer
	// cache and after a create made by the controller returns that
	// object until the informer catches up. That is necessary
	// when a ResourceClaim got created, updating the pod status fails,
	// and then a retry occurs before the informer cache is updated.
	// In that scenario, the controller would create another claim
	// instead of continuing with the existing one.
	claimInformerCache := claimInformer.Informer().GetIndexer()
	if err := claimInformerCache.AddIndexers(cache.Indexers{claimPodOwnerIndex: claimPodOwnerIndexFunc}); err != nil {
		return nil, fmt.Errorf("could not initialize ResourceClaim controller: %w", err)
	}
	ec.claimCache = cache.NewIntegerResourceVersionMutationCache(logger, claimInformerCache, claimInformerCache,
		// Very long time to live, unlikely to be needed because
		// the informer cache should get updated soon.
		time.Hour,
		// Allow storing objects not in the underlying cache - that's the point...
		// It's safe because in case of a race (claim is in mutation cache, claim
		// gets deleted, controller updates status based on mutation cache) the
		// "bad" pod status will get detected and fixed when the informer catches up.
		true,
	)

	return ec, nil
}
2025-12-30 00:44:48 -05:00
// enqueueResourceClaimTemplate handles add/update events for
// ResourceClaimTemplates: it enqueues all known pods which reference the
// template so that workers re-check whether a ResourceClaim needs to be
// created for them.
func (ec *Controller) enqueueResourceClaimTemplate(logger klog.Logger, obj interface{}) {
	if d, ok := obj.(cache.DeletedFinalStateUnknown); ok {
		obj = d.Obj
	}
	template, ok := obj.(*resourceapi.ResourceClaimTemplate)
	if !ok {
		// Not a template, return.
		runtime.HandleErrorWithLogger(logger, nil, "EnqueueResourceClaimTemplate called for unexpected object", "type", fmt.Sprintf("%T", obj))
		return
	}
	logger.V(6).Info("ResourceClaimTemplate added or updated", "resourceClaimTemplate", klog.KObj(template))
	// Enqueue all pods with this template name.
	objects, err := ec.podIndexer.ByIndex(podResourceClaimTemplateIndexKey, fmt.Sprintf("%s/%s", template.Namespace, template.Name))
	if err != nil {
		runtime.HandleErrorWithLogger(logger, err, "Unable to list pods for claim template", "resourceClaimTemplate", klog.KObj(template))
		return
	}
	if len(objects) == 0 {
		logger.V(6).Info("ResourceClaimTemplate change unrelated to any known pod", "resourceClaimTemplate", klog.KObj(template))
		return
	}
	for _, object := range objects {
		pod, ok1 := object.(*v1.Pod)
		if !ok1 {
			// Not a pod?! Skip this entry but keep going: one bad
			// index entry must not prevent the remaining pods from
			// being enqueued. Report the offending element's type,
			// not the template's.
			runtime.HandleErrorWithLogger(logger, nil, "EnqueueResourceClaimTemplate called for unexpected object", "type", fmt.Sprintf("%T", object))
			continue
		}
		logger.V(4).Info(
			"Enqueuing pod due to ResourceClaim change",
			"resourceClaimTemplate", klog.KObj(template),
			"pod", klog.KObj(pod),
		)
		ec.enqueuePod(logger, object, false)
	}
}
2023-06-22 08:10:15 -04:00
// enqueuePod is the shared pod event handler. Depending on the pod's state it
// enqueues claim keys (to release or clean up claims of pods which no longer
// need them) and/or the pod key itself (to create or reserve claims).
// deleted is true when called from the informer's delete handler.
func (ec *Controller) enqueuePod(logger klog.Logger, obj interface{}, deleted bool) {
	if d, ok := obj.(cache.DeletedFinalStateUnknown); ok {
		obj = d.Obj
	}
	pod, ok := obj.(*v1.Pod)
	if !ok {
		// Not a pod?!
		logger.Error(nil, "EnqueuePod called for unexpected object", "type", fmt.Sprintf("%T", obj))
		return
	}

	// Check if pod has any resource claims to process.
	// Extended resource claims are stored in pod.Status.ExtendedResourceClaimStatus,
	// not in pod.Spec.ResourceClaims, so we need to check both locations.
	hasResourceClaims := len(pod.Spec.ResourceClaims) > 0
	// For cleanup of extended resource claims, we must consider claims present
	// in pod status regardless of the current feature gate state. The claim may
	// have been created when the feature was enabled and still needs cleanup.
	hasExtendedResourceClaims := pod.Status.ExtendedResourceClaimStatus != nil
	if !hasResourceClaims && !hasExtendedResourceClaims {
		// Nothing to do for it at all.
		return
	}
	if deleted {
		logger.V(6).Info("Pod got deleted", "pod", klog.KObj(pod))
		// Remember the UID so that claim syncing can tell the pod is
		// really gone when it shows up in a claim's ReservedFor.
		ec.deletedObjects.Add(pod.UID)
	}
	logger.V(6).Info("Pod with resource claims changed", "pod", klog.KObj(pod), "deleted", deleted)

	// Release reservations of a deleted or completed pod?
	needsClaims, reason := podNeedsClaims(pod, deleted)
	if needsClaims {
		logger.V(6).Info("Not touching claims", "pod", klog.KObj(pod), "reason", reason)
	} else {
		for _, podClaim := range pod.Spec.ResourceClaims {
			claimName, _, err := resourceclaim.Name(pod, &podClaim)
			switch {
			case err != nil:
				// Either the claim was not created (nothing to do here) or
				// the API changed. The latter will also get reported elsewhere,
				// so here it's just a debug message.
				logger.V(6).Info("Nothing to do for claim during pod change", "pod", klog.KObj(pod), "podClaim", podClaim.Name, "err", err, "reason", reason)
			case claimName != nil:
				key := claimKeyPrefix + pod.Namespace + "/" + *claimName
				logger.V(6).Info("Process claim", "pod", klog.KObj(pod), "claim", klog.KRef(pod.Namespace, *claimName), "key", key, "reason", reason)
				ec.queue.Add(key)
			default:
				// Nothing to do, claim wasn't generated.
				logger.V(6).Info("Nothing to do for skipped claim during pod change", "pod", klog.KObj(pod), "podClaim", podClaim.Name, "reason", reason)
			}
		}
		// Process extended resource claims for completed/deleted pods.
		// Extended resource claims are created by the scheduler and stored in
		// pod.Status.ExtendedResourceClaimStatus, not in pod.Spec.ResourceClaims.
		// Without this, extended resource claims would never be cleaned up when
		// pods complete, causing device resources to remain allocated indefinitely.
		if hasExtendedResourceClaims {
			claimName := pod.Status.ExtendedResourceClaimStatus.ResourceClaimName
			key := claimKeyPrefix + pod.Namespace + "/" + claimName
			logger.V(6).Info("Process extended resource claim", "pod", klog.KObj(pod), "claim", klog.KRef(pod.Namespace, claimName), "key", key, "reason", reason)
			ec.queue.Add(key)
		}
	}

	// Independently of claim cleanup, the pod itself may need a worker
	// (e.g. to create claims from templates or reserve allocated claims).
	needsWork, reason := ec.podNeedsWork(pod)
	if needsWork {
		logger.V(6).Info("Enqueueing pod", "pod", klog.KObj(pod), "reason", reason)
		ec.queue.Add(podKeyPrefix + pod.Namespace + "/" + pod.Name)
		return
	}
	logger.V(6).Info("Not enqueueing pod", "pod", klog.KObj(pod), "reason", reason)
}
// podNeedsClaims reports whether the pod may still run and therefore still
// needs its ResourceClaims, together with a human-readable reason.
func podNeedsClaims(pod *v1.Pod, deleted bool) (bool, string) {
	switch {
	case deleted:
		return false, "pod got removed"
	case podutil.IsPodTerminal(pod):
		return false, "pod has terminated"
	case pod.DeletionTimestamp != nil && pod.Spec.NodeName == "":
		return false, "pod got deleted before scheduling"
	default:
		// Still needs claims.
		return true, "pod might run"
	}
}
// podNeedsWork checks whether a new or modified pod needs to be processed
// further by a worker. It returns a boolean with the result and an explanation
// for it.
func (ec *Controller) podNeedsWork(pod *v1.Pod) (bool, string) {
	if pod.DeletionTimestamp != nil {
		// Nothing else to do for the pod.
		return false, "pod is deleted"
	}
	for _, podClaim := range pod.Spec.ResourceClaims {
		claimName, checkOwner, err := resourceclaim.Name(pod, &podClaim)
		if err != nil {
			// Let a worker surface the error properly.
			return true, err.Error()
		}
		// If the claimName is nil, then it has been determined before
		// that the claim is not needed.
		// NOTE(review): this returns for the whole pod instead of
		// continuing with the next podClaim; confirm that a single
		// "not needed" claim is meant to short-circuit the remaining
		// claims here.
		if claimName == nil {
			return false, "claim is not needed"
		}
		claim, err := ec.claimLister.ResourceClaims(pod.Namespace).Get(*claimName)
		if apierrors.IsNotFound(err) {
			if podClaim.ResourceClaimTemplateName != nil {
				return true, "must create ResourceClaim from template"
			}
			// User needs to create claim.
			return false, "claim is missing and must be created by user"
		}
		if err != nil {
			// Shouldn't happen.
			return true, fmt.Sprintf("internal error while checking for claim: %v", err)
		}
		if checkOwner {
			if err := resourceclaim.IsForPod(pod, claim); err != nil {
				// Cannot proceed with the pod unless that other claim gets deleted.
				return false, err.Error()
			}
		}
		// This check skips over the reasons below that only apply
		// when a pod has been scheduled already. We need to keep checking
		// for more claims that might need to be created.
		if pod.Spec.NodeName == "" {
			continue
		}
		if claim.Status.Allocation != nil &&
			!resourceclaim.IsReservedForPod(pod, claim) &&
			resourceclaim.CanBeReserved(claim) {
			// Need to reserve it.
			return true, fmt.Sprintf("need to reserve claim %s for pod", klog.KObj(claim))
		}
	}
	return false, "nothing to do"
}
2024-09-26 08:43:12 -04:00
// enqueueResourceClaim is the event handler for ResourceClaims. For add events
// oldObj is nil, for delete events newObj is nil, for updates both are set.
// It enqueues the claim itself (unless deleted) plus every known pod which
// references it.
func (ec *Controller) enqueueResourceClaim(logger klog.Logger, oldObj, newObj interface{}) {
	deleted := newObj == nil
	if d, ok := oldObj.(cache.DeletedFinalStateUnknown); ok {
		oldObj = d.Obj
	}
	oldClaim, ok := oldObj.(*resourceapi.ResourceClaim)
	if oldObj != nil && !ok {
		return
	}
	newClaim, ok := newObj.(*resourceapi.ResourceClaim)
	if newObj != nil && !ok {
		return
	}
	// Check if both the old and new claim are nil in case DeletedFinalStateUnknown.Obj can be nil.
	if oldClaim == nil && newClaim == nil {
		return
	}
	// Prefer the new claim; fall back to the old one for deletions.
	claim := newClaim
	if claim == nil {
		claim = oldClaim
	}
	if !deleted {
		// When starting up, we have to check all claims to find those with
		// stale pods in ReservedFor. During an update, a pod might get added
		// that already no longer exists.
		key := claimKeyPrefix + claim.Namespace + "/" + claim.Name
		logger.V(6).Info("Enqueueing new or updated claim", "claim", klog.KObj(claim), "key", key)
		ec.queue.Add(key)
	} else {
		logger.V(6).Info("Not enqueueing deleted claim", "claim", klog.KObj(claim))
	}
	// Also check whether this causes work for any of the currently
	// known pods which use the ResourceClaim.
	objs, err := ec.podIndexer.ByIndex(podResourceClaimIndex, fmt.Sprintf("%s/%s", claim.Namespace, claim.Name))
	if err != nil {
		logger.Error(err, "Failed to list pods from cache")
		return
	}
	if len(objs) == 0 {
		logger.V(6).Info("ResourceClaim change unrelated to any known pod", "claim", klog.KObj(claim))
		return
	}
	for _, obj := range objs {
		ec.enqueuePod(logger, obj, false)
	}
}
2022-03-22 11:56:49 -04:00
// Run starts the controller: it sets up event recording, waits for the
// informer caches to sync, and then runs the given number of workers until
// ctx is canceled. It blocks; on return the queue is shut down and all
// workers have exited.
func (ec *Controller) Run(ctx context.Context, workers int) {
	defer runtime.HandleCrashWithContext(ctx)

	eventBroadcaster := record.NewBroadcaster(record.WithContext(ctx))
	eventBroadcaster.StartLogging(klog.Infof)
	eventBroadcaster.StartRecordingToSink(&v1core.EventSinkImpl{Interface: ec.kubeClient.CoreV1().Events("")})
	ec.recorder = eventBroadcaster.NewRecorder(scheme.Scheme, v1.EventSource{Component: "resource_claim"})
	defer eventBroadcaster.Shutdown()

	logger := klog.FromContext(ctx)
	logger.Info("Starting resource claim controller")
	var wg sync.WaitGroup
	defer func() {
		// Shutting down the queue unblocks workers stuck in Get so
		// that the Wait below can return.
		logger.Info("Shutting down resource claim controller")
		ec.queue.ShutDown()
		wg.Wait()
	}()

	if !cache.WaitForNamedCacheSyncWithContext(ctx, ec.podSynced, ec.claimsSynced, ec.templatesSynced) {
		return
	}
	for i := 0; i < workers; i++ {
		wg.Go(func() {
			wait.UntilWithContext(ctx, ec.runWorker, time.Second)
		})
	}
	<-ctx.Done()
}
2022-03-22 11:56:49 -04:00
func ( ec * Controller ) runWorker ( ctx context . Context ) {
2022-03-22 04:59:20 -04:00
for ec . processNextWorkItem ( ctx ) {
}
}
2022-03-22 11:56:49 -04:00
func ( ec * Controller ) processNextWorkItem ( ctx context . Context ) bool {
2022-03-22 04:59:20 -04:00
key , shutdown := ec . queue . Get ( )
if shutdown {
return false
}
defer ec . queue . Done ( key )
2024-04-28 12:26:18 -04:00
err := ec . syncHandler ( ctx , key )
2022-03-22 04:59:20 -04:00
if err == nil {
ec . queue . Forget ( key )
return true
}
2025-09-24 02:08:12 -04:00
runtime . HandleErrorWithContext ( ctx , err , "Work item failed" , "item" , key )
2022-03-22 04:59:20 -04:00
ec . queue . AddRateLimited ( key )
return true
}
2022-03-22 11:56:49 -04:00
// syncHandler is invoked for each work item which might need to be processed.
// If an error is returned from this function, the item will be requeued.
func ( ec * Controller ) syncHandler ( ctx context . Context , key string ) error {
sep := strings . Index ( key , ":" )
if sep < 0 {
return fmt . Errorf ( "unexpected key: %s" , key )
}
prefix , object := key [ 0 : sep + 1 ] , key [ sep + 1 : ]
namespace , name , err := cache . SplitMetaNamespaceKey ( object )
2022-03-22 04:59:20 -04:00
if err != nil {
return err
}
2022-03-22 11:56:49 -04:00
switch prefix {
case podKeyPrefix :
return ec . syncPod ( ctx , namespace , name )
case claimKeyPrefix :
return ec . syncClaim ( ctx , namespace , name )
default :
return fmt . Errorf ( "unexpected key prefix: %s" , prefix )
}
}
// syncPod handles one pod work item: it creates ResourceClaims from templates
// where needed, records generated claim names in the pod status, and — once
// the pod is scheduled — reserves allocated claims for it.
func (ec *Controller) syncPod(ctx context.Context, namespace, name string) error {
	logger := klog.LoggerWithValues(klog.FromContext(ctx), "pod", klog.KRef(namespace, name))
	ctx = klog.NewContext(ctx, logger)
	pod, err := ec.podLister.Pods(namespace).Get(name)
	if err != nil {
		if apierrors.IsNotFound(err) {
			logger.V(5).Info("Nothing to do for pod, it is gone")
			return nil
		}
		return err
	}

	// Ignore pods which are already getting deleted.
	if pod.DeletionTimestamp != nil {
		logger.V(5).Info("Nothing to do for pod, it is marked for deletion")
		return nil
	}

	// newPodClaims collects podClaim name -> generated ResourceClaim name
	// mappings produced by handleClaim; it stays nil if nothing was created.
	var newPodClaims map[string]string
	for _, podClaim := range pod.Spec.ResourceClaims {
		if err := ec.handleClaim(ctx, pod, podClaim, &newPodClaims); err != nil {
			if ec.recorder != nil {
				ec.recorder.Event(pod, v1.EventTypeWarning, "FailedResourceClaimCreation", fmt.Sprintf("PodResourceClaim %s: %v", podClaim.Name, err))
			}
			return fmt.Errorf("pod %s/%s, PodResourceClaim %s: %v", namespace, name, podClaim.Name, err)
		}
	}
	if newPodClaims != nil {
		// Patch the pod status with the new information about
		// generated ResourceClaims.
		statuses := make([]*corev1apply.PodResourceClaimStatusApplyConfiguration, 0, len(newPodClaims))
		for podClaimName, resourceClaimName := range newPodClaims {
			statuses = append(statuses, corev1apply.PodResourceClaimStatus().WithName(podClaimName).WithResourceClaimName(resourceClaimName))
		}
		podApply := corev1apply.Pod(name, namespace).WithStatus(corev1apply.PodStatus().WithResourceClaimStatuses(statuses...))
		if _, err := ec.kubeClient.CoreV1().Pods(namespace).ApplyStatus(ctx, podApply, metav1.ApplyOptions{FieldManager: fieldManager, Force: true}); err != nil {
			return fmt.Errorf("update pod %s/%s ResourceClaimStatuses: %v", namespace, name, err)
		}
	}
	if pod.Spec.NodeName == "" {
		// Scheduler will handle reservations.
		logger.V(5).Info("Nothing to do for pod, scheduler will deal with it")
		return nil
	}
	for _, podClaim := range pod.Spec.ResourceClaims {
		claimName, checkOwner, err := resourceclaim.Name(pod, &podClaim)
		if err != nil {
			return err
		}
		// If nil, then it has been determined that the claim is not needed
		// and can be skipped.
		if claimName == nil {
			continue
		}
		claim, err := ec.claimLister.ResourceClaims(pod.Namespace).Get(*claimName)
		if err != nil {
			if apierrors.IsNotFound(err) {
				// Claim is gone, nothing left to reserve.
				return nil
			}
			return fmt.Errorf("retrieve claim: %v", err)
		}
		if checkOwner {
			if err := resourceclaim.IsForPod(pod, claim); err != nil {
				return err
			}
		}
		if claim.Status.Allocation != nil &&
			!resourceclaim.IsReservedForPod(pod, claim) &&
			resourceclaim.CanBeReserved(claim) {
			logger.V(5).Info("Reserve claim for pod", "resourceClaim", klog.KObj(claim))
			if err := ec.reserveForPod(ctx, pod, claim); err != nil {
				return err
			}
		}
	}
	return nil
}
2024-06-22 10:45:33 -04:00
// handleClaim is invoked for each resource claim of a pod.
2023-04-14 03:50:52 -04:00
func ( ec * Controller ) handleClaim ( ctx context . Context , pod * v1 . Pod , podClaim v1 . PodResourceClaim , newPodClaims * map [ string ] string ) error {
2022-03-22 11:56:49 -04:00
logger := klog . LoggerWithValues ( klog . FromContext ( ctx ) , "podClaim" , podClaim . Name )
ctx = klog . NewContext ( ctx , logger )
2025-12-30 00:44:48 -05:00
logger . V ( 5 ) . Info ( "Checking" , "podClaim" , podClaim . Name )
2023-04-14 03:50:52 -04:00
// resourceclaim.Name checks for the situation that the client doesn't
// know some future addition to the API. Therefore it gets called here
// even if there is no template to work on, because if some new field
// gets added, the expectation might be that the controller does
// something for it.
claimName , mustCheckOwner , err := resourceclaim . Name ( pod , & podClaim )
switch {
case errors . Is ( err , resourceclaim . ErrClaimNotFound ) :
// Continue below.
case err != nil :
return fmt . Errorf ( "checking for claim before creating it: %v" , err )
case claimName == nil :
// Nothing to do, no claim needed.
return nil
case * claimName != "" :
claimName := * claimName
// The ResourceClaim should exist because it is recorded in the pod.status.resourceClaimStatuses,
// but perhaps it was deleted accidentally. In that case we re-create it.
claim , err := ec . claimLister . ResourceClaims ( pod . Namespace ) . Get ( claimName )
if err != nil && ! apierrors . IsNotFound ( err ) {
return err
}
if claim != nil {
var err error
if mustCheckOwner {
err = resourceclaim . IsForPod ( pod , claim )
}
if err == nil {
// Already created, nothing more to do.
2025-12-30 00:44:48 -05:00
logger . V ( 5 ) . Info ( "Claim already created" , "podClaim" , podClaim . Name , "resourceClaim" , claimName )
2023-04-14 03:50:52 -04:00
return nil
}
2025-12-30 00:44:48 -05:00
logger . Error ( err , "Claim that was created for the pod is no longer owned by the pod, creating a new one" , "podClaim" , podClaim . Name , "resourceClaim" , claimName )
2023-04-14 03:50:52 -04:00
}
}
2024-05-24 09:24:24 -04:00
templateName := podClaim . ResourceClaimTemplateName
2022-03-22 11:56:49 -04:00
if templateName == nil {
2023-04-14 03:50:52 -04:00
// Nothing to do.
2022-03-22 04:59:20 -04:00
return nil
}
2023-04-14 03:50:52 -04:00
// Before we create a new ResourceClaim, check if there is an orphaned one.
// This covers the case that the controller has created it, but then fails
// before it can update the pod status.
claim , err := ec . findPodResourceClaim ( pod , podClaim )
if err != nil {
return fmt . Errorf ( "finding ResourceClaim for claim %s in pod %s/%s failed: %v" , podClaim . Name , pod . Namespace , pod . Name , err )
2022-03-22 04:59:20 -04:00
}
2023-04-14 03:50:52 -04:00
if claim == nil {
template , err := ec . templateLister . ResourceClaimTemplates ( pod . Namespace ) . Get ( * templateName )
if err != nil {
return fmt . Errorf ( "resource claim template %q: %v" , * templateName , err )
2022-03-22 04:59:20 -04:00
}
2025-02-28 14:32:59 -05:00
if ! ec . features . AdminAccess && needsAdminAccess ( template ) {
2024-10-09 14:12:49 -04:00
return errors . New ( "admin access is requested, but the feature is disabled" )
}
2025-02-28 14:32:59 -05:00
if ! ec . features . PrioritizedList && hasPrioritizedList ( template ) {
return errors . New ( "template includes a prioritized list of subrequests, but the feature is disabled" )
}
2023-04-14 03:50:52 -04:00
// Create the ResourceClaim with pod as owner, with a generated name that uses
// <pod>-<claim name> as base.
isTrue := true
annotations := template . Spec . ObjectMeta . Annotations
if annotations == nil {
annotations = make ( map [ string ] string )
}
annotations [ podResourceClaimAnnotation ] = podClaim . Name
2023-09-01 01:51:41 -04:00
generateName := pod . Name + "-" + podClaim . Name + "-"
2023-04-14 03:50:52 -04:00
maxBaseLen := 57 // Leave space for hyphen and 5 random characters in a name with 63 characters.
if len ( generateName ) > maxBaseLen {
// We could leave truncation to the apiserver, but as
// it removes at the end, we would loose everything
// from the pod claim name when the pod name is long.
// We can do better and truncate both strings,
// proportional to their length.
generateName = pod . Name [ 0 : len ( pod . Name ) * maxBaseLen / len ( generateName ) ] +
"-" +
podClaim . Name [ 0 : len ( podClaim . Name ) * maxBaseLen / len ( generateName ) ]
}
2024-06-14 06:40:48 -04:00
claim = & resourceapi . ResourceClaim {
2023-04-14 03:50:52 -04:00
ObjectMeta : metav1 . ObjectMeta {
GenerateName : generateName ,
OwnerReferences : [ ] metav1 . OwnerReference {
{
2025-10-24 17:34:31 -04:00
APIVersion : "v1" ,
Kind : "Pod" ,
Name : pod . Name ,
UID : pod . UID ,
Controller : & isTrue ,
2023-04-14 03:50:52 -04:00
} ,
2022-03-22 04:59:20 -04:00
} ,
2023-04-14 03:50:52 -04:00
Annotations : annotations ,
Labels : template . Spec . ObjectMeta . Labels ,
2022-03-22 04:59:20 -04:00
} ,
2023-04-14 03:50:52 -04:00
Spec : template . Spec . Spec ,
}
2025-07-08 00:24:31 -04:00
metricLabel := getAdminAccessMetricLabel ( claim )
2023-04-14 03:50:52 -04:00
claimName := claim . Name
2025-07-04 11:43:31 -04:00
claim , err = ec . kubeClient . ResourceV1 ( ) . ResourceClaims ( pod . Namespace ) . Create ( ctx , claim , metav1 . CreateOptions { } )
2023-04-14 03:50:52 -04:00
if err != nil {
2025-07-08 00:24:31 -04:00
resourceclaimmetrics . ResourceClaimCreate . WithLabelValues ( "failure" , metricLabel ) . Inc ( )
2023-04-14 03:50:52 -04:00
return fmt . Errorf ( "create ResourceClaim %s: %v" , claimName , err )
}
2025-07-08 00:24:31 -04:00
resourceclaimmetrics . ResourceClaimCreate . WithLabelValues ( "success" , metricLabel ) . Inc ( )
2024-06-15 08:28:27 -04:00
logger . V ( 4 ) . Info ( "Created ResourceClaim" , "claim" , klog . KObj ( claim ) , "pod" , klog . KObj ( pod ) )
2023-07-06 12:39:47 -04:00
ec . claimCache . Mutation ( claim )
2022-03-22 04:59:20 -04:00
}
2023-04-14 03:50:52 -04:00
// Remember the new ResourceClaim for a batch PodStatus update in our caller.
if * newPodClaims == nil {
* newPodClaims = make ( map [ string ] string )
2022-03-22 11:56:49 -04:00
}
2023-04-14 03:50:52 -04:00
( * newPodClaims ) [ podClaim . Name ] = claim . Name
2022-03-22 11:56:49 -04:00
return nil
}
2024-10-09 14:12:49 -04:00
func needsAdminAccess ( claimTemplate * resourceapi . ResourceClaimTemplate ) bool {
for _ , request := range claimTemplate . Spec . Spec . Devices . Requests {
2025-07-04 11:43:31 -04:00
if request . Exactly != nil && ptr . Deref ( request . Exactly . AdminAccess , false ) {
2024-10-09 14:12:49 -04:00
return true
}
}
return false
}
2025-02-28 14:32:59 -05:00
// hasPrioritizedList reports whether the template's claim spec contains at
// least one request with a prioritized list of subrequests (FirstAvailable).
func hasPrioritizedList(claimTemplate *resourceapi.ResourceClaimTemplate) bool {
	requests := claimTemplate.Spec.Spec.Devices.Requests
	for i := range requests {
		if len(requests[i].FirstAvailable) != 0 {
			return true
		}
	}
	return false
}
2023-04-14 03:50:52 -04:00
// findPodResourceClaim looks for an existing ResourceClaim with the right
// annotation (ties it to the pod claim) and the right ownership (ties it to
// the pod).
2024-06-14 06:40:48 -04:00
func ( ec * Controller ) findPodResourceClaim ( pod * v1 . Pod , podClaim v1 . PodResourceClaim ) ( * resourceapi . ResourceClaim , error ) {
2023-07-06 12:39:47 -04:00
// Only claims owned by the pod will get returned here.
claims , err := ec . claimCache . ByIndex ( claimPodOwnerIndex , string ( pod . UID ) )
2023-04-14 03:50:52 -04:00
if err != nil {
return nil , err
}
2023-07-06 12:39:47 -04:00
for _ , claimObj := range claims {
2024-06-14 06:40:48 -04:00
claim , ok := claimObj . ( * resourceapi . ResourceClaim )
2023-07-06 12:39:47 -04:00
if ! ok {
return nil , fmt . Errorf ( "unexpected object of type %T returned by claim cache" , claimObj )
2023-04-14 03:50:52 -04:00
}
podClaimName , ok := claim . Annotations [ podResourceClaimAnnotation ]
2024-09-16 11:06:59 -04:00
// No annotation? Then it cannot be an automatically generated claim
// and we need to ignore it.
if ! ok {
2023-04-14 03:50:52 -04:00
continue
}
2024-09-16 11:06:59 -04:00
// Not the claim for this particular pod claim?
if podClaimName != podClaim . Name {
2023-04-14 03:50:52 -04:00
continue
}
// Pick the first one that matches. There shouldn't be more than one. If there is,
// then all others will be ignored until the pod gets deleted. Then they also get
// cleaned up.
return claim , nil
}
return nil , nil
}
2024-06-14 06:40:48 -04:00
func ( ec * Controller ) reserveForPod ( ctx context . Context , pod * v1 . Pod , claim * resourceapi . ResourceClaim ) error {
2023-05-22 13:44:58 -04:00
claim = claim . DeepCopy ( )
claim . Status . ReservedFor = append ( claim . Status . ReservedFor ,
2024-06-14 06:40:48 -04:00
resourceapi . ResourceClaimConsumerReference {
2023-05-22 13:44:58 -04:00
Resource : "pods" ,
Name : pod . Name ,
UID : pod . UID ,
} )
2025-07-04 11:43:31 -04:00
if _ , err := ec . kubeClient . ResourceV1 ( ) . ResourceClaims ( claim . Namespace ) . UpdateStatus ( ctx , claim , metav1 . UpdateOptions { } ) ; err != nil {
2024-06-22 10:45:33 -04:00
return fmt . Errorf ( "reserve claim %s for pod: %w" , klog . KObj ( claim ) , err )
2023-05-22 13:44:58 -04:00
}
return nil
}
2022-03-22 11:56:49 -04:00
// syncClaim reconciles a single ResourceClaim identified by namespace/name:
//   - removes pods from status.reservedFor that are gone, replaced, or done,
//   - deallocates the claim and drops the builtin-controller finalizer once
//     nobody uses it anymore,
//   - deletes a generated claim when its owning pod will never run (again).
func (ec *Controller) syncClaim(ctx context.Context, namespace, name string) error {
	logger := klog.LoggerWithValues(klog.FromContext(ctx), "claim", klog.KRef(namespace, name))
	ctx = klog.NewContext(ctx, logger)
	claim, err := ec.claimLister.ResourceClaims(namespace).Get(name)
	if err != nil {
		if apierrors.IsNotFound(err) {
			logger.V(5).Info("Nothing to do for claim, it is gone")
			return nil
		}
		return err
	}

	// Check if the ReservedFor entries are all still valid. Essentially we are
	// validating and potentially removing pod references, but just leaving any
	// non-pod references in the list.
	remaining := make([]resourceapi.ResourceClaimConsumerReference, 0, len(claim.Status.ReservedFor))
	for _, reservedFor := range claim.Status.ReservedFor {
		if reservedFor.APIGroup == "" &&
			reservedFor.Resource == "pods" {
			// A pod falls into one of three categories:
			// - we have it in our cache -> don't remove it until we are told that it got removed
			// - we don't have it in our cache anymore, but we have seen it before -> it was deleted, remove it
			// - not in our cache, not seen -> double-check with API server before removal
			keepEntry := true

			// Tracking deleted pods in the LRU cache is an
			// optimization. Without this cache, the code would
			// have to do the API call below for every deleted pod
			// to ensure that the pod really doesn't exist. With
			// the cache, most of the time the pod will be recorded
			// as deleted and the API call can be avoided.
			if ec.deletedObjects.Has(reservedFor.UID) {
				// We know that the pod was deleted. This is
				// easy to check and thus is done first.
				keepEntry = false
			} else {
				pod, err := ec.podLister.Pods(claim.Namespace).Get(reservedFor.Name)
				switch {
				case err != nil && !apierrors.IsNotFound(err):
					return err
				case err != nil:
					// We might not have it in our informer cache
					// yet. Removing the pod while the scheduler is
					// scheduling it would be bad. We have to be
					// absolutely sure and thus have to check with
					// the API server.
					pod, err := ec.kubeClient.CoreV1().Pods(claim.Namespace).Get(ctx, reservedFor.Name, metav1.GetOptions{})
					if err != nil && !apierrors.IsNotFound(err) {
						return err
					}
					if pod == nil || pod.UID != reservedFor.UID {
						logger.V(6).Info("Remove reservation because pod is gone or got replaced", "pod", klog.KObj(pod), "claim", klog.KRef(namespace, name))
						keepEntry = false
					}
				case pod.UID != reservedFor.UID:
					logger.V(6).Info("Remove reservation because pod got replaced with new instance", "pod", klog.KObj(pod), "claim", klog.KRef(namespace, name))
					keepEntry = false
				case isPodDone(pod):
					logger.V(6).Info("Remove reservation because pod will not run anymore", "pod", klog.KObj(pod), "claim", klog.KRef(namespace, name))
					keepEntry = false
				}
			}

			if keepEntry {
				remaining = append(remaining, reservedFor)
			}
			continue
		}

		// We don't know how to check this entry, so we just keep it to avoid
		// accidentally removing a reservation for a non-pod consumer that we
		// don't support yet.
		remaining = append(remaining, reservedFor)
	}

	builtinControllerFinalizer := slices.Index(claim.Finalizers, resourceapi.Finalizer)
	logger.V(5).Info("Claim reserved for counts", "currentCount", len(claim.Status.ReservedFor), "claim", klog.KRef(namespace, name), "updatedCount", len(remaining), "builtinController", builtinControllerFinalizer >= 0)
	if len(remaining) < len(claim.Status.ReservedFor) {
		// At least one pod reference was pruned; write the shortened list.
		// This is not using a patch because we want the update to fail if anything
		// changed in the meantime.
		claim := claim.DeepCopy()
		claim.Status.ReservedFor = remaining

		// DRA always performs delayed allocations. Relatedly, it also
		// deallocates a claim as soon as the last consumer stops using
		// it. This ensures that the claim can be allocated again as needed by
		// some future consumer instead of trying to schedule that consumer
		// onto the node that was chosen for the previous consumer. It also
		// releases the underlying resources for use by other claims.
		//
		// This has to be triggered by the transition from "was being used" to
		// "is not used anymore" because a DRA driver is not required to set
		// `status.reservedFor` together with `status.allocation`, i.e. a claim
		// that is "currently unused" should not get deallocated.
		//
		// This does not matter for claims that were created for a pod. For
		// those, the resource claim controller will trigger deletion when the
		// pod is done. However, it doesn't hurt to also trigger deallocation
		// for such claims and not checking for them keeps this code simpler.
		if len(remaining) == 0 {
			// This is a sanity check. There shouldn't be any claims without this
			// finalizer because there's no longer any other way of allocating claims.
			// Classic DRA was the alternative earlier.
			if builtinControllerFinalizer >= 0 {
				// Allocated by scheduler with structured parameters. We can "deallocate"
				// by clearing the allocation.
				claim.Status.Allocation = nil
			}
		}

		claim, err := ec.kubeClient.ResourceV1().ResourceClaims(claim.Namespace).UpdateStatus(ctx, claim, metav1.UpdateOptions{})
		if err != nil {
			return err
		}
		logger.V(5).Info("Removed consumers", "claim", klog.KRef(namespace, name), "currentCount", len(claim.Status.ReservedFor), "allocated", claim.Status.Allocation != nil)

		// Now also remove the finalizer if it is not needed anymore.
		// Note that the index may have changed as a result of the UpdateStatus call.
		builtinControllerFinalizer := slices.Index(claim.Finalizers, resourceapi.Finalizer)
		if builtinControllerFinalizer >= 0 && claim.Status.Allocation == nil {
			claim.Finalizers = slices.Delete(claim.Finalizers, builtinControllerFinalizer, builtinControllerFinalizer+1)
			if _, err := ec.kubeClient.ResourceV1().ResourceClaims(claim.Namespace).Update(ctx, claim, metav1.UpdateOptions{}); err != nil {
				return err
			}
			logger.V(5).Info("Removed finalizer after removing consumers", "claim", klog.KRef(namespace, name))
		}
	} else if builtinControllerFinalizer >= 0 && claim.DeletionTimestamp != nil && len(remaining) == 0 {
		// Nothing was pruned, but the claim is being deleted, is unused, and
		// still carries our finalizer: clear the allocation (if any) and
		// remove the finalizer so deletion can proceed.
		claim := claim.DeepCopy()
		if claim.Status.Allocation != nil {
			// This can happen when a claim with immediate allocation
			// stopped being used, remained allocated, and then got
			// deleted. As above we then need to clear the allocation.
			claim.Status.Allocation = nil
			var err error
			claim, err = ec.kubeClient.ResourceV1().ResourceClaims(claim.Namespace).UpdateStatus(ctx, claim, metav1.UpdateOptions{})
			if err != nil {
				return err
			}
			logger.V(5).Info("Removed allocation because not needed", "claim", klog.KRef(namespace, name))
		}
		// Whether it was allocated or not, remove the finalizer to unblock removal.
		claim.Finalizers = slices.Delete(claim.Finalizers, builtinControllerFinalizer, builtinControllerFinalizer+1)
		_, err := ec.kubeClient.ResourceV1().ResourceClaims(claim.Namespace).Update(ctx, claim, metav1.UpdateOptions{})
		if err != nil {
			return err
		}
		logger.V(5).Info("Removed finalizer because not needed", "claim", klog.KRef(namespace, name))
	}

	if len(remaining) == 0 {
		// Claim is not reserved. If it was generated for a pod and
		// that pod is not going to run, the claim can be
		// deleted. Normally the garbage collector does that, but the
		// pod itself might not get deleted for a while.
		podName, podUID := owningPod(claim)
		if podName != "" {
			pod, err := ec.podLister.Pods(claim.Namespace).Get(podName)
			switch {
			case err == nil:
				// Pod already replaced or not going to run?
				if pod.UID != podUID || isPodDone(pod) {
					// We are certain that the owning pod is not going to need
					// the claim and therefore remove the claim.
					err := ec.kubeClient.ResourceV1().ResourceClaims(claim.Namespace).Delete(ctx, claim.Name, metav1.DeleteOptions{})
					if err != nil {
						return fmt.Errorf("delete unused generated claim %s: %w", klog.KObj(claim), err)
					}
					logger.V(5).Info("Deleted unused generated claim", "claim", klog.KObj(claim), "pod", klog.KObj(pod))
				} else {
					logger.V(6).Info("Wrong pod content, not deleting claim", "claim", klog.KObj(claim), "podUID", podUID, "podContent", pod)
				}
			case apierrors.IsNotFound(err):
				// We might not know the pod *yet*. Instead of doing an expensive API call,
				// let the garbage collector handle the case that the pod is truly gone.
				logger.V(5).Info("Pod for claim not found", "claim", klog.KObj(claim), "pod", klog.KRef(claim.Namespace, podName))
			default:
				return fmt.Errorf("lookup pod: %v", err)
			}
		} else {
			logger.V(5).Info("Claim not generated for a pod", "claim", klog.KObj(claim))
		}
	}
	return nil
}
2022-03-22 11:56:49 -04:00
2024-06-14 06:40:48 -04:00
func owningPod ( claim * resourceapi . ResourceClaim ) ( string , types . UID ) {
2023-06-22 08:15:17 -04:00
for _ , owner := range claim . OwnerReferences {
2023-09-15 09:07:03 -04:00
if ptr . Deref ( owner . Controller , false ) &&
2023-06-22 08:15:17 -04:00
owner . APIVersion == "v1" &&
owner . Kind == "Pod" {
return owner . Name , owner . UID
}
}
return "" , ""
}
2025-12-30 00:44:48 -05:00
// podResourceClaimTemplateIndexFunc is an index function that returns
// namespace/name keys for all ResourceClaimTemplates referenced by a pod's
// resource claims. Non-pod objects yield no keys.
func podResourceClaimTemplateIndexFunc(obj interface{}) ([]string, error) {
	pod, ok := obj.(*v1.Pod)
	if !ok {
		return []string{}, nil
	}
	keySet := sets.NewString()
	for i := range pod.Spec.ResourceClaims {
		templateName := pod.Spec.ResourceClaims[i].ResourceClaimTemplateName
		if templateName == nil {
			continue
		}
		keySet.Insert(pod.Namespace + "/" + *templateName)
	}
	// List() returns the keys sorted and de-duplicated.
	return keySet.List(), nil
}
2022-03-22 11:56:49 -04:00
// podResourceClaimIndexFunc is an index function that returns ResourceClaim keys (=
2023-05-22 13:44:58 -04:00
// namespace/name) for ResourceClaim or ResourceClaimTemplates in a given pod.
2022-03-22 11:56:49 -04:00
func podResourceClaimIndexFunc ( obj interface { } ) ( [ ] string , error ) {
pod , ok := obj . ( * v1 . Pod )
if ! ok {
return [ ] string { } , nil
}
keys := [ ] string { }
for _ , podClaim := range pod . Spec . ResourceClaims {
2023-05-22 13:44:58 -04:00
claimName , _ , err := resourceclaim . Name ( pod , & podClaim )
if err != nil || claimName == nil {
// Index functions are not supposed to fail, the caller will panic.
// For both error reasons (claim not created yet, unknown API)
// we simply don't index.
continue
2022-03-22 11:56:49 -04:00
}
2023-05-22 13:44:58 -04:00
keys = append ( keys , fmt . Sprintf ( "%s/%s" , pod . Namespace , * claimName ) )
2022-03-22 11:56:49 -04:00
}
return keys , nil
}
2023-06-22 05:21:37 -04:00
// isPodDone returns true if it is certain that none of the containers are running and never will run.
func isPodDone(pod *v1.Pod) bool {
	if podutil.IsPodPhaseTerminal(pod.Status.Phase) {
		return true
	}
	// Deleted and never scheduled to a node: it will not start anymore.
	return pod.DeletionTimestamp != nil && pod.Spec.NodeName == ""
}
2023-07-06 12:39:47 -04:00
// claimPodOwnerIndexFunc is an index function that returns the pod UIDs of
// all pods which own the resource claim. Should only be one, though.
func claimPodOwnerIndexFunc ( obj interface { } ) ( [ ] string , error ) {
2024-06-14 06:40:48 -04:00
claim , ok := obj . ( * resourceapi . ResourceClaim )
2023-07-06 12:39:47 -04:00
if ! ok {
return nil , nil
}
var keys [ ] string
for _ , owner := range claim . OwnerReferences {
if owner . Controller != nil &&
* owner . Controller &&
owner . APIVersion == "v1" &&
owner . Kind == "Pod" {
keys = append ( keys , string ( owner . UID ) )
}
}
return keys , nil
}
2025-07-08 00:24:31 -04:00
func getAdminAccessMetricLabel ( claim * resourceapi . ResourceClaim ) string {
if claim == nil {
return "false"
}
for _ , request := range claim . Spec . Devices . Requests {
2025-07-04 11:43:31 -04:00
// Sub-requests in FirstAvailable don't have admin access.
if request . Exactly != nil && ptr . Deref ( request . Exactly . AdminAccess , false ) {
2025-07-08 00:24:31 -04:00
return "true"
}
}
return "false"
}
// newCustomCollector constructs a StableCollector that computes the number
// of ResourceClaims on demand from the given lister, classifying each claim
// with adminAccessFunc.
func newCustomCollector(rcLister resourcelisters.ResourceClaimLister, adminAccessFunc func(*resourceapi.ResourceClaim) string, logger klog.Logger) metrics.StableCollector {
	collector := &customCollector{
		rcLister:        rcLister,
		adminAccessFunc: adminAccessFunc,
		logger:          logger,
	}
	return collector
}
// customCollector computes the NumResourceClaims gauge on demand from the
// informer cache instead of maintaining counters incrementally.
type customCollector struct {
	metrics.BaseStableCollector
	// rcLister lists all ResourceClaims known to the informer cache.
	rcLister resourcelisters.ResourceClaimLister
	// adminAccessFunc maps a claim to its "true"/"false" admin access label.
	adminAccessFunc func(*resourceapi.ResourceClaim) string
	logger          klog.Logger
}

// Compile-time check that customCollector implements StableCollector.
var _ metrics.StableCollector = &customCollector{}
// DescribeWithStability announces the single metric descriptor that this
// collector produces.
func (collector *customCollector) DescribeWithStability(ch chan<- *metrics.Desc) {
	ch <- resourceclaimmetrics.NumResourceClaimsDesc
}
func ( collector * customCollector ) CollectWithStability ( ch chan <- metrics . Metric ) {
2025-10-08 09:16:42 -04:00
rcMetrics := make ( map [ resourceclaimmetrics . NumResourceClaimLabels ] int )
2025-07-08 00:24:31 -04:00
rcList , err := collector . rcLister . List ( labels . Everything ( ) )
if err != nil {
collector . logger . Error ( err , "failed to list resource claims for metrics collection" )
return
}
for _ , rc := range rcList {
// Determine if the ResourceClaim is allocated
allocated := "false"
if rc . Status . Allocation != nil {
allocated = "true"
}
adminAccess := collector . adminAccessFunc ( rc )
2025-10-08 09:16:42 -04:00
source := ""
if val , ok := rc . Annotations [ resourceapi . ExtendedResourceClaimAnnotation ] ; ok && val == "true" {
source = "extended_resource"
} else if val , ok := rc . Annotations [ podResourceClaimAnnotation ] ; ok && val != "" {
source = "resource_claim_template"
2025-07-08 00:24:31 -04:00
}
2025-10-08 09:16:42 -04:00
rcMetrics [ resourceclaimmetrics . NumResourceClaimLabels { Allocated : allocated , AdminAccess : adminAccess , Source : source } ] ++
}
2025-10-29 06:38:05 -04:00
for rcLabels , count := range rcMetrics {
2025-10-08 09:16:42 -04:00
ch <- metrics . NewLazyConstMetric (
resourceclaimmetrics . NumResourceClaimsDesc ,
metrics . GaugeValue ,
float64 ( count ) ,
2025-10-29 06:38:05 -04:00
rcLabels . Allocated ,
rcLabels . AdminAccess ,
rcLabels . Source ,
2025-10-08 09:16:42 -04:00
)
2025-07-08 00:24:31 -04:00
}
}