From c67937dd352602900df8b5606b276d0eff3f9d83 Mon Sep 17 00:00:00 2001 From: yliao Date: Tue, 4 Nov 2025 16:41:07 +0000 Subject: [PATCH] switched from storing name to storing a pointer to the device class. --- .../lister_contract_test.go | 4 +- .../plugins/dynamicresources/dra_manager.go | 5 +- .../dynamicresources/dynamicresources.go | 10 +-- .../dynamicresources/dynamicresources_test.go | 40 ++++++++--- .../framework/plugins/noderesources/fit.go | 2 +- .../extendedresourcecache.go | 34 ++++----- .../extendedresourcecache_test.go | 70 ++++++++++--------- .../kube-scheduler/framework/listers.go | 2 +- 8 files changed, 89 insertions(+), 78 deletions(-) diff --git a/pkg/scheduler/framework/autoscaler_contract/lister_contract_test.go b/pkg/scheduler/framework/autoscaler_contract/lister_contract_test.go index 7226e424896..ad1b79e2e37 100644 --- a/pkg/scheduler/framework/autoscaler_contract/lister_contract_test.go +++ b/pkg/scheduler/framework/autoscaler_contract/lister_contract_test.go @@ -145,6 +145,6 @@ func (s *sharedDRAManagerContract) DeviceClassResolver() fwk.DeviceClassResolver type deviceClassResolverContract struct{} -func (d *deviceClassResolverContract) GetDeviceClass(_ v1.ResourceName) string { - return "" +func (d *deviceClassResolverContract) GetDeviceClass(_ v1.ResourceName) *resourceapi.DeviceClass { + return nil } diff --git a/pkg/scheduler/framework/plugins/dynamicresources/dra_manager.go b/pkg/scheduler/framework/plugins/dynamicresources/dra_manager.go index 41ad036a4a3..b40f0202bb6 100644 --- a/pkg/scheduler/framework/plugins/dynamicresources/dra_manager.go +++ b/pkg/scheduler/framework/plugins/dynamicresources/dra_manager.go @@ -51,7 +51,6 @@ type DefaultDRAManager struct { func NewDRAManager(ctx context.Context, claimsCache *assumecache.AssumeCache, resourceSliceTracker *resourceslicetracker.Tracker, informerFactory informers.SharedInformerFactory) *DefaultDRAManager { logger := klog.FromContext(ctx) - dcLister := informerFactory.Resource().V1().DeviceClasses().Lister() manager := &DefaultDRAManager{ resourceClaimTracker: &claimTracker{ cache: claimsCache, @@ -60,11 +59,11 @@ func NewDRAManager(ctx context.Context, claimsCache *assumecache.AssumeCache, re logger: logger, }, resourceSliceLister: &resourceSliceLister{tracker: resourceSliceTracker}, - deviceClassLister: &deviceClassLister{classLister: dcLister}, + deviceClassLister: &deviceClassLister{classLister: informerFactory.Resource().V1().DeviceClasses().Lister()}, } if utilfeature.DefaultFeatureGate.Enabled(features.DRAExtendedResource) { - manager.extendedResourceCache = extendedresourcecache.NewExtendedResourceCache(dcLister, logger) + manager.extendedResourceCache = extendedresourcecache.NewExtendedResourceCache(logger) } // Reacting to events is more efficient than iterating over the list diff --git a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go index 935f978cd64..e48b9a7d4a8 100644 --- a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go +++ b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go @@ -414,7 +414,7 @@ func hasDeviceClassMappedExtendedResource(reqs v1.ResourceList, cache fwk.Device continue } if schedutil.IsDRAExtendedResourceName(rName) { - if cache.GetDeviceClass(rName) != "" { + if cache.GetDeviceClass(rName) != nil { return true } } @@ -750,7 +750,7 @@ func (pl *DynamicResources) filterExtendedResources(state *stateData, pod *v1.Po continue } allocatable, okScalar := nodeInfo.GetAllocatable().GetScalarResources()[rName] - isBackedByDRA := cache.GetDeviceClass(rName) != "" + isBackedByDRA := cache.GetDeviceClass(rName) != nil if isBackedByDRA { if allocatable > 0 { // node provides the resource via device plugin @@ -825,9 +825,9 @@ func createDeviceRequests(pod *v1.Pod, extendedResources map[v1.ResourceName]int if !ok || crq == 0 { continue } - className := cache.GetDeviceClass(r) + class := cache.GetDeviceClass(r) // skip if the request does not map to a device class - if className == "" { + if class == nil { continue } keys := make([]string, 0, len(creqs)) @@ -851,7 +851,7 @@ func createDeviceRequests(pod *v1.Pod, extendedResources map[v1.ResourceName]int resourceapi.DeviceRequest{ Name: fmt.Sprintf("container-%d-request-%d", i, ridx), // need to be container name index - extended resource name index Exactly: &resourceapi.ExactDeviceRequest{ - DeviceClassName: className, // map external resource name -> device class name + DeviceClassName: class.Name, // map external resource name -> device class name AllocationMode: resourceapi.DeviceAllocationModeExactCount, Count: crq, }, diff --git a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources_test.go b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources_test.go index 15d1ededbd7..63c34f507ad 100644 --- a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources_test.go +++ b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources_test.go @@ -2708,10 +2708,10 @@ func Test_isSchedulableAfterPodChange(t *testing.T) { // mockDeviceClassResolver is a simple mock implementation of fwk.DeviceClassResolver for testing type mockDeviceClassResolver struct { - mapping map[v1.ResourceName]string + mapping map[v1.ResourceName]*resourceapi.DeviceClass } -func (m *mockDeviceClassResolver) GetDeviceClass(resourceName v1.ResourceName) string { +func (m *mockDeviceClassResolver) GetDeviceClass(resourceName v1.ResourceName) *resourceapi.DeviceClass { return m.mapping[resourceName] } @@ -2754,16 +2754,36 @@ func Test_createDeviceRequests(t *testing.T) { v1.ResourceName(extendedResourceName): 1, v1.ResourceName(extendedResourceName + "init"): 2, } - devMap := map[v1.ResourceName]string{ - v1.ResourceName(extendedResourceName): "class", + devMap := map[v1.ResourceName]*resourceapi.DeviceClass{ + v1.ResourceName(extendedResourceName): { + ObjectMeta: metav1.ObjectMeta{ + Name: "class", + }, + }, } - devMap2 := map[v1.ResourceName]string{ - v1.ResourceName(extendedResourceName): "class", - v1.ResourceName(extendedResourceName + "1"): "class1", + devMap2 := map[v1.ResourceName]*resourceapi.DeviceClass{ + v1.ResourceName(extendedResourceName): { + ObjectMeta: metav1.ObjectMeta{ + Name: "class", + }, + }, + v1.ResourceName(extendedResourceName + "1"): { + ObjectMeta: metav1.ObjectMeta{ + Name: "class1", + }, + }, } - devMapInit := map[v1.ResourceName]string{ - v1.ResourceName(extendedResourceName): "class", - v1.ResourceName(extendedResourceName + "init"): "classInit", + devMapInit := map[v1.ResourceName]*resourceapi.DeviceClass{ + v1.ResourceName(extendedResourceName): { + ObjectMeta: metav1.ObjectMeta{ + Name: "class", + }, + }, + v1.ResourceName(extendedResourceName + "init"): { + ObjectMeta: metav1.ObjectMeta{ + Name: "classInit", + }, + }, } devReq := resourceapi.DeviceRequest{ Name: "container-0-request-0", diff --git a/pkg/scheduler/framework/plugins/noderesources/fit.go b/pkg/scheduler/framework/plugins/noderesources/fit.go index 839b8ca1704..2c1bac3d44e 100644 --- a/pkg/scheduler/framework/plugins/noderesources/fit.go +++ b/pkg/scheduler/framework/plugins/noderesources/fit.go @@ -208,7 +208,7 @@ func shouldDelegateResourceToDRA(rName v1.ResourceName, nodeInfo fwk.NodeInfo, d // If draManager is available, check the cache for a mapping if draManager != nil { cache := draManager.DeviceClassResolver() - return cache.GetDeviceClass(rName) != "" + return cache.GetDeviceClass(rName) != nil } // If draManager is nil (e.g., kubelet admission check), delegate resources that are not in diff --git a/staging/src/k8s.io/dynamic-resource-allocation/deviceclass/extendedresourcecache/extendedresourcecache.go b/staging/src/k8s.io/dynamic-resource-allocation/deviceclass/extendedresourcecache/extendedresourcecache.go index dfaf9bee80b..ceab71a7b42 100644 --- a/staging/src/k8s.io/dynamic-resource-allocation/deviceclass/extendedresourcecache/extendedresourcecache.go +++ b/staging/src/k8s.io/dynamic-resource-allocation/deviceclass/extendedresourcecache/extendedresourcecache.go @@ -23,7 +23,6 @@ import ( v1 "k8s.io/api/core/v1" resourceapi "k8s.io/api/resource/v1" utilruntime "k8s.io/apimachinery/pkg/util/runtime" - resourcelisters "k8s.io/client-go/listers/resource/v1" "k8s.io/client-go/tools/cache" klog "k8s.io/klog/v2" ) @@ -31,13 +30,12 @@ import ( // ExtendedResourceCache maintains a global cache of extended resource to device class mappings, // based on informer events. For that it implements the cache.ResourceEventHandler interface. type ExtendedResourceCache struct { - deviceClassLister resourcelisters.DeviceClassLister - logger klog.Logger - handlers []cache.ResourceEventHandler + logger klog.Logger + handlers []cache.ResourceEventHandler mutex sync.RWMutex // mapping maps extended resource name to device class name - mapping map[v1.ResourceName]string + mapping map[v1.ResourceName]*resourceapi.DeviceClass } var _ cache.ResourceEventHandler = &ExtendedResourceCache{} @@ -46,12 +44,11 @@ var _ cache.ResourceEventHandler = &ExtendedResourceCache{} // is responsible for registering the instance as a handler of DeviceClass events. // // Additional event handlers may be registered here or via AddEventHandler. -func NewExtendedResourceCache(deviceClassLister resourcelisters.DeviceClassLister, logger klog.Logger, handlers ...cache.ResourceEventHandler) *ExtendedResourceCache { +func NewExtendedResourceCache(logger klog.Logger, handlers ...cache.ResourceEventHandler) *ExtendedResourceCache { cache := &ExtendedResourceCache{ - deviceClassLister: deviceClassLister, - logger: logger, - handlers: handlers, - mapping: make(map[v1.ResourceName]string), + logger: logger, + handlers: handlers, + mapping: make(map[v1.ResourceName]*resourceapi.DeviceClass), } return cache @@ -73,9 +70,9 @@ func (c *ExtendedResourceCache) AddEventHandler(handler cache.ResourceEventHandl // // This (and only this) method may be called on a nil ExtendedResourceCache. The nil // instance always returns the empty string. -func (c *ExtendedResourceCache) GetDeviceClass(resourceName v1.ResourceName) string { +func (c *ExtendedResourceCache) GetDeviceClass(resourceName v1.ResourceName) *resourceapi.DeviceClass { if c == nil { - return "" + return nil } c.mutex.RLock() defer c.mutex.RUnlock() @@ -144,14 +141,7 @@ func (c *ExtendedResourceCache) updateMapping(newDeviceClass, oldDeviceClass *re var classWithSameExtendedResourceName *resourceapi.DeviceClass if newDeviceClass.Spec.ExtendedResourceName != nil { - name := c.mapping[v1.ResourceName(*newDeviceClass.Spec.ExtendedResourceName)] - var err error - if c.deviceClassLister != nil && name != "" { - classWithSameExtendedResourceName, err = c.deviceClassLister.Get(name) - } - if err != nil { - c.logger.V(5).Info("Failed to get device class", "extendedResource", *newDeviceClass.Spec.ExtendedResourceName, "deviceClass", name, "error", err) - } + classWithSameExtendedResourceName = c.mapping[v1.ResourceName(*newDeviceClass.Spec.ExtendedResourceName)] } if classWithSameExtendedResourceName != nil { if newDeviceClass.CreationTimestamp.Before(&classWithSameExtendedResourceName.CreationTimestamp) { @@ -174,14 +164,14 @@ func (c *ExtendedResourceCache) updateMapping(newDeviceClass, oldDeviceClass *re // Add new mappings if newDeviceClass.Spec.ExtendedResourceName != nil { - c.mapping[v1.ResourceName(*newDeviceClass.Spec.ExtendedResourceName)] = newDeviceClass.Name + c.mapping[v1.ResourceName(*newDeviceClass.Spec.ExtendedResourceName)] = newDeviceClass c.logger.V(5).Info("Updated extended resource cache for explicit mapping", "extendedResource", *newDeviceClass.Spec.ExtendedResourceName, "deviceClass", newDeviceClass.Name) } // Always add the default mapping defaultResourceName := v1.ResourceName(resourceapi.ResourceDeviceClassPrefix + newDeviceClass.Name) - c.mapping[defaultResourceName] = newDeviceClass.Name + c.mapping[defaultResourceName] = newDeviceClass c.logger.V(5).Info("Updated extended resource cache for default mapping", "extendedResource", defaultResourceName, "deviceClass", newDeviceClass.Name) diff --git a/staging/src/k8s.io/dynamic-resource-allocation/deviceclass/extendedresourcecache/extendedresourcecache_test.go b/staging/src/k8s.io/dynamic-resource-allocation/deviceclass/extendedresourcecache/extendedresourcecache_test.go index a87614d47f3..569978058c7 100644 --- a/staging/src/k8s.io/dynamic-resource-allocation/deviceclass/extendedresourcecache/extendedresourcecache_test.go +++ b/staging/src/k8s.io/dynamic-resource-allocation/deviceclass/extendedresourcecache/extendedresourcecache_test.go @@ -32,14 +32,14 @@ import ( ) type deviceClassResolver interface { - GetDeviceClass(resourceName v1.ResourceName) string + GetDeviceClass(resourceName v1.ResourceName) *resourceapi.DeviceClass } func TestNil(t *testing.T) { var cache *ExtendedResourceCache var resolver deviceClassResolver = cache - if className := resolver.GetDeviceClass("example.com/gpu"); className != "" { - t.Errorf("Expected the empty class name from a nil instance, got instead: %q", className) + if class := resolver.GetDeviceClass("example.com/gpu"); class != nil { + t.Errorf("Expected the nil class from a nil instance, got instead: %q", class.Name) } } @@ -91,7 +91,7 @@ func TestHandlers(t *testing.T) { } }, } - erCache := NewExtendedResourceCache(nil, logger, firstHandler) + erCache := NewExtendedResourceCache(logger, firstHandler) secondHandler := &cache.ResourceEventHandlerFuncs{ AddFunc: func(obj interface{}) { if obj != class { @@ -101,8 +101,8 @@ func TestHandlers(t *testing.T) { if numAdd != 2 { t.Errorf("second handler expected Add to be called last once, actual add #%d", numAdd) } - if className := erCache.GetDeviceClass(resourceName); className != class.Name { - t.Errorf("expected %q, got %q", class.Name, className) + if deviceClass := erCache.GetDeviceClass(resourceName); deviceClass == nil || deviceClass.Name != class.Name { + t.Errorf("expected %q, got %q", class.Name, deviceClass) } }, UpdateFunc: func(oldObj, newObj interface{}) { @@ -116,8 +116,8 @@ func TestHandlers(t *testing.T) { if numUpdate != 2 { t.Errorf("second handler expected Update to be called last once, actual update #%d", numUpdate) } - if className := erCache.GetDeviceClass(resourceName); className != "" { - t.Errorf("expected %q, got %q", "", className) + if class := erCache.GetDeviceClass(resourceName); class != nil { + t.Errorf("expected %q, got %v", "", class) } }, DeleteFunc: func(obj interface{}) { @@ -128,8 +128,8 @@ func TestHandlers(t *testing.T) { if numDelete != 2 { t.Errorf("second handler expected Delete to be called last once, actual delete #%d", numDelete) } - if className := erCache.GetDeviceClass(resourceName); className != "" { - t.Errorf("expected %q, got %q", "", className) + if class := erCache.GetDeviceClass(resourceName); class != nil { + t.Errorf("expected %q, got %q", "", class) } }, } @@ -147,7 +147,7 @@ func TestExtendedResourceCache(t *testing.T) { informerFactory := informers.NewSharedInformerFactory(client, 0) deviceClassInformer := informerFactory.Resource().V1().DeviceClasses() - cache := NewExtendedResourceCache(deviceClassInformer.Lister(), logger) + cache := NewExtendedResourceCache(logger) if _, err := deviceClassInformer.Informer().AddEventHandler(cache); err != nil { logger.Error(err, "Failed to add event handler for device classes") } @@ -229,20 +229,21 @@ func TestExtendedResourceCache(t *testing.T) { time.Sleep(1 * time.Second) // Verify explicit mapping - deviceClassName := cache.GetDeviceClass("example.com/gpu") - if deviceClassName != "gpu-class" { - t.Errorf("Expected to find device class 'gpu-class' for 'example.com/gpu', got %s", deviceClassName) + deviceClass := cache.GetDeviceClass("example.com/gpu") + if deviceClass == nil || deviceClass.Name != "gpu-class" { + t.Errorf("Expected to find device class 'gpu-class' for 'example.com/gpu', got %v", deviceClass) } // Verify default mapping defaultResourceName := v1.ResourceName("deviceclass.resource.kubernetes.io/fpga-class") - deviceClassName = cache.GetDeviceClass(defaultResourceName) - if deviceClassName != "fpga-class" { - t.Errorf("Expected to find device class 'fpga-class' for '%s', got %s", defaultResourceName, deviceClassName) + deviceClass = cache.GetDeviceClass(defaultResourceName) + if deviceClass == nil || deviceClass.Name != "fpga-class" { + t.Errorf("Expected to find device class 'fpga-class' for '%s', got %v", defaultResourceName, deviceClass) } // Verify both device classes have default mappings - if cache.GetDeviceClass("deviceclass.resource.kubernetes.io/gpu-class") != "gpu-class" { + deviceClass = cache.GetDeviceClass("deviceclass.resource.kubernetes.io/gpu-class") + if deviceClass == nil || deviceClass.Name != "gpu-class" { t.Error("Expected default mapping for gpu-class") } @@ -254,9 +255,9 @@ func TestExtendedResourceCache(t *testing.T) { time.Sleep(1 * time.Second) // should keep deviceClass1, since it is newer than deviceClass3 - deviceClassName = cache.GetDeviceClass("example.com/gpu") - if deviceClassName != "gpu-class" { - t.Errorf("Expected to find device class 'gpu-class' for 'example.com/gpu', got %s", deviceClassName) + deviceClass = cache.GetDeviceClass("example.com/gpu") + if deviceClass == nil || deviceClass.Name != "gpu-class" { + t.Errorf("Expected to find device class 'gpu-class' for 'example.com/gpu', got %v", deviceClass) } // deviceClass4 is newer than deviceClass1, hence it will replace deviceClass1 @@ -267,9 +268,9 @@ func TestExtendedResourceCache(t *testing.T) { time.Sleep(1 * time.Second) // deviceClass4 replaces deviceClass1, since it is newer with the same example.com/gpu extended resource name - deviceClassName = cache.GetDeviceClass("example.com/gpu") - if deviceClassName != "gpu-class-4" { - t.Errorf("Expected to find device class 'gpu-class' for 'example.com/gpu', got %s", deviceClassName) + deviceClass = cache.GetDeviceClass("example.com/gpu") + if deviceClass == nil || deviceClass.Name != "gpu-class-4" { + t.Errorf("Expected to find device class 'gpu-class' for 'example.com/gpu', got %v", deviceClass) } // deviceClass0 is created at the same time as deviceClass4, but its name is alphabetically ordered earlier, @@ -282,9 +283,9 @@ func TestExtendedResourceCache(t *testing.T) { // deviceClass0 replaces deviceClass4, it is created at the same time as deviceClass4, but its name is // alphabetically ordered earlier - deviceClassName = cache.GetDeviceClass("example.com/gpu") - if deviceClassName != "gpu-class-0" { - t.Errorf("Expected to find device class 'gpu-class' for 'example.com/gpu', got %s", deviceClassName) + deviceClass = cache.GetDeviceClass("example.com/gpu") + if deviceClass == nil || deviceClass.Name != "gpu-class-0" { + t.Errorf("Expected to find device class 'gpu-class' for 'example.com/gpu', got %v", deviceClass) } // Test modifying a device class @@ -297,11 +298,12 @@ func TestExtendedResourceCache(t *testing.T) { time.Sleep(1 * time.Second) // Should have the new mapping - if cache.GetDeviceClass("test.com/gpu") != "gpu-class-0" { - t.Errorf("Expected to find device class 'gpu-class-0' for 'test.com/gpu' after modification, got %s", deviceClassName) + deviceClass = cache.GetDeviceClass("test.com/gpu") + if deviceClass == nil || deviceClass.Name != "gpu-class-0" { + t.Errorf("Expected to find device class 'gpu-class-0' for 'test.com/gpu' after modification, got %v", deviceClass) } // Should not have the old mapping for example.com/gpu - if cache.GetDeviceClass("example.com/gpu") != "" { + if cache.GetDeviceClass("example.com/gpu") != nil { t.Errorf("Expected 'example.com/gpu' to be removed after modification, got %s", cache.GetDeviceClass("example.com/gpu")) } @@ -312,12 +314,12 @@ func TestExtendedResourceCache(t *testing.T) { } time.Sleep(1 * time.Second) - deviceClassName = cache.GetDeviceClass("test.com/gpu") - if deviceClassName != "" { - t.Errorf("Expected 'test.com/gpu' to be removed after deleting device class, got %s", deviceClassName) + deviceClass = cache.GetDeviceClass("test.com/gpu") + if deviceClass != nil { + t.Errorf("Expected 'test.com/gpu' to be removed after deleting device class, got %s", deviceClass) } // Verify the default mapping is removed - if cache.GetDeviceClass("deviceclass.resource.kubernetes.io/gpu-class-0") != "" { + if cache.GetDeviceClass("deviceclass.resource.kubernetes.io/gpu-class-0") != nil { t.Errorf("Expected 'deviceclass.resource.kubernetes.io/gpu-class-0' to be removed after deleting device class, got %s", cache.GetDeviceClass("deviceclass.resource.kubernetes.io/gpu-class")) } } diff --git a/staging/src/k8s.io/kube-scheduler/framework/listers.go b/staging/src/k8s.io/kube-scheduler/framework/listers.go index 4e6d2f8a716..ffe45688889 100644 --- a/staging/src/k8s.io/kube-scheduler/framework/listers.go +++ b/staging/src/k8s.io/kube-scheduler/framework/listers.go @@ -114,7 +114,7 @@ type DeviceClassResolver interface { // GetDeviceClass returns the device class name for the given extended resource name. // Returns empty string if no mapping exists for the resource name or // the DRAExtendedResource feature is disabled. - GetDeviceClass(resourceName v1.ResourceName) string + GetDeviceClass(resourceName v1.ResourceName) *resourceapi.DeviceClass } // SharedDRAManager can be used to obtain DRA objects, and track modifications to them in-memory - mainly by the DRA plugin.