switched from storing name to storing a pointer to the device class.

This commit is contained in:
yliao 2025-11-04 16:41:07 +00:00
parent b83a6a83f0
commit c67937dd35
8 changed files with 89 additions and 78 deletions

View file

@ -145,6 +145,6 @@ func (s *sharedDRAManagerContract) DeviceClassResolver() fwk.DeviceClassResolver
type deviceClassResolverContract struct{}
func (d *deviceClassResolverContract) GetDeviceClass(_ v1.ResourceName) string {
return ""
func (d *deviceClassResolverContract) GetDeviceClass(_ v1.ResourceName) *resourceapi.DeviceClass {
return nil
}

View file

@ -51,7 +51,6 @@ type DefaultDRAManager struct {
func NewDRAManager(ctx context.Context, claimsCache *assumecache.AssumeCache, resourceSliceTracker *resourceslicetracker.Tracker, informerFactory informers.SharedInformerFactory) *DefaultDRAManager {
logger := klog.FromContext(ctx)
dcLister := informerFactory.Resource().V1().DeviceClasses().Lister()
manager := &DefaultDRAManager{
resourceClaimTracker: &claimTracker{
cache: claimsCache,
@ -60,11 +59,11 @@ func NewDRAManager(ctx context.Context, claimsCache *assumecache.AssumeCache, re
logger: logger,
},
resourceSliceLister: &resourceSliceLister{tracker: resourceSliceTracker},
deviceClassLister: &deviceClassLister{classLister: dcLister},
deviceClassLister: &deviceClassLister{classLister: informerFactory.Resource().V1().DeviceClasses().Lister()},
}
if utilfeature.DefaultFeatureGate.Enabled(features.DRAExtendedResource) {
manager.extendedResourceCache = extendedresourcecache.NewExtendedResourceCache(dcLister, logger)
manager.extendedResourceCache = extendedresourcecache.NewExtendedResourceCache(logger)
}
// Reacting to events is more efficient than iterating over the list

View file

@ -414,7 +414,7 @@ func hasDeviceClassMappedExtendedResource(reqs v1.ResourceList, cache fwk.Device
continue
}
if schedutil.IsDRAExtendedResourceName(rName) {
if cache.GetDeviceClass(rName) != "" {
if cache.GetDeviceClass(rName) != nil {
return true
}
}
@ -750,7 +750,7 @@ func (pl *DynamicResources) filterExtendedResources(state *stateData, pod *v1.Po
continue
}
allocatable, okScalar := nodeInfo.GetAllocatable().GetScalarResources()[rName]
isBackedByDRA := cache.GetDeviceClass(rName) != ""
isBackedByDRA := cache.GetDeviceClass(rName) != nil
if isBackedByDRA {
if allocatable > 0 {
// node provides the resource via device plugin
@ -825,9 +825,9 @@ func createDeviceRequests(pod *v1.Pod, extendedResources map[v1.ResourceName]int
if !ok || crq == 0 {
continue
}
className := cache.GetDeviceClass(r)
class := cache.GetDeviceClass(r)
// skip if the request does not map to a device class
if className == "" {
if class == nil {
continue
}
keys := make([]string, 0, len(creqs))
@ -851,7 +851,7 @@ func createDeviceRequests(pod *v1.Pod, extendedResources map[v1.ResourceName]int
resourceapi.DeviceRequest{
Name: fmt.Sprintf("container-%d-request-%d", i, ridx), // need to be container name index - extended resource name index
Exactly: &resourceapi.ExactDeviceRequest{
DeviceClassName: className, // map external resource name -> device class name
DeviceClassName: class.Name, // map external resource name -> device class name
AllocationMode: resourceapi.DeviceAllocationModeExactCount,
Count: crq,
},

View file

@ -2708,10 +2708,10 @@ func Test_isSchedulableAfterPodChange(t *testing.T) {
// mockDeviceClassResolver is a simple mock implementation of fwk.DeviceClassResolver for testing
type mockDeviceClassResolver struct {
mapping map[v1.ResourceName]string
mapping map[v1.ResourceName]*resourceapi.DeviceClass
}
func (m *mockDeviceClassResolver) GetDeviceClass(resourceName v1.ResourceName) string {
func (m *mockDeviceClassResolver) GetDeviceClass(resourceName v1.ResourceName) *resourceapi.DeviceClass {
return m.mapping[resourceName]
}
@ -2754,16 +2754,36 @@ func Test_createDeviceRequests(t *testing.T) {
v1.ResourceName(extendedResourceName): 1,
v1.ResourceName(extendedResourceName + "init"): 2,
}
devMap := map[v1.ResourceName]string{
v1.ResourceName(extendedResourceName): "class",
devMap := map[v1.ResourceName]*resourceapi.DeviceClass{
v1.ResourceName(extendedResourceName): {
ObjectMeta: metav1.ObjectMeta{
Name: "class",
},
},
}
devMap2 := map[v1.ResourceName]string{
v1.ResourceName(extendedResourceName): "class",
v1.ResourceName(extendedResourceName + "1"): "class1",
devMap2 := map[v1.ResourceName]*resourceapi.DeviceClass{
v1.ResourceName(extendedResourceName): {
ObjectMeta: metav1.ObjectMeta{
Name: "class",
},
},
v1.ResourceName(extendedResourceName + "1"): {
ObjectMeta: metav1.ObjectMeta{
Name: "class1",
},
},
}
devMapInit := map[v1.ResourceName]string{
v1.ResourceName(extendedResourceName): "class",
v1.ResourceName(extendedResourceName + "init"): "classInit",
devMapInit := map[v1.ResourceName]*resourceapi.DeviceClass{
v1.ResourceName(extendedResourceName): {
ObjectMeta: metav1.ObjectMeta{
Name: "class",
},
},
v1.ResourceName(extendedResourceName + "init"): {
ObjectMeta: metav1.ObjectMeta{
Name: "classInit",
},
},
}
devReq := resourceapi.DeviceRequest{
Name: "container-0-request-0",

View file

@ -208,7 +208,7 @@ func shouldDelegateResourceToDRA(rName v1.ResourceName, nodeInfo fwk.NodeInfo, d
// If draManager is available, check the cache for a mapping
if draManager != nil {
cache := draManager.DeviceClassResolver()
return cache.GetDeviceClass(rName) != ""
return cache.GetDeviceClass(rName) != nil
}
// If draManager is nil (e.g., kubelet admission check), delegate resources that are not in

View file

@ -23,7 +23,6 @@ import (
v1 "k8s.io/api/core/v1"
resourceapi "k8s.io/api/resource/v1"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
resourcelisters "k8s.io/client-go/listers/resource/v1"
"k8s.io/client-go/tools/cache"
klog "k8s.io/klog/v2"
)
@ -31,13 +30,12 @@ import (
// ExtendedResourceCache maintains a global cache of extended resource to device class mappings,
// based on informer events. For that it implements the cache.ResourceEventHandler interface.
type ExtendedResourceCache struct {
deviceClassLister resourcelisters.DeviceClassLister
logger klog.Logger
handlers []cache.ResourceEventHandler
logger klog.Logger
handlers []cache.ResourceEventHandler
mutex sync.RWMutex
// mapping maps extended resource name to device class name
mapping map[v1.ResourceName]string
mapping map[v1.ResourceName]*resourceapi.DeviceClass
}
var _ cache.ResourceEventHandler = &ExtendedResourceCache{}
@ -46,12 +44,11 @@ var _ cache.ResourceEventHandler = &ExtendedResourceCache{}
// is responsible for registering the instance as a handler of DeviceClass events.
//
// Additional event handlers may be registered here or via AddEventHandler.
func NewExtendedResourceCache(deviceClassLister resourcelisters.DeviceClassLister, logger klog.Logger, handlers ...cache.ResourceEventHandler) *ExtendedResourceCache {
func NewExtendedResourceCache(logger klog.Logger, handlers ...cache.ResourceEventHandler) *ExtendedResourceCache {
cache := &ExtendedResourceCache{
deviceClassLister: deviceClassLister,
logger: logger,
handlers: handlers,
mapping: make(map[v1.ResourceName]string),
logger: logger,
handlers: handlers,
mapping: make(map[v1.ResourceName]*resourceapi.DeviceClass),
}
return cache
@ -73,9 +70,9 @@ func (c *ExtendedResourceCache) AddEventHandler(handler cache.ResourceEventHandl
//
// This (and only this) method may be called on a nil ExtendedResourceCache. The nil
// instance always returns the empty string.
func (c *ExtendedResourceCache) GetDeviceClass(resourceName v1.ResourceName) string {
func (c *ExtendedResourceCache) GetDeviceClass(resourceName v1.ResourceName) *resourceapi.DeviceClass {
if c == nil {
return ""
return nil
}
c.mutex.RLock()
defer c.mutex.RUnlock()
@ -144,14 +141,7 @@ func (c *ExtendedResourceCache) updateMapping(newDeviceClass, oldDeviceClass *re
var classWithSameExtendedResourceName *resourceapi.DeviceClass
if newDeviceClass.Spec.ExtendedResourceName != nil {
name := c.mapping[v1.ResourceName(*newDeviceClass.Spec.ExtendedResourceName)]
var err error
if c.deviceClassLister != nil && name != "" {
classWithSameExtendedResourceName, err = c.deviceClassLister.Get(name)
}
if err != nil {
c.logger.V(5).Info("Failed to get device class", "extendedResource", *newDeviceClass.Spec.ExtendedResourceName, "deviceClass", name, "error", err)
}
classWithSameExtendedResourceName = c.mapping[v1.ResourceName(*newDeviceClass.Spec.ExtendedResourceName)]
}
if classWithSameExtendedResourceName != nil {
if newDeviceClass.CreationTimestamp.Before(&classWithSameExtendedResourceName.CreationTimestamp) {
@ -174,14 +164,14 @@ func (c *ExtendedResourceCache) updateMapping(newDeviceClass, oldDeviceClass *re
// Add new mappings
if newDeviceClass.Spec.ExtendedResourceName != nil {
c.mapping[v1.ResourceName(*newDeviceClass.Spec.ExtendedResourceName)] = newDeviceClass.Name
c.mapping[v1.ResourceName(*newDeviceClass.Spec.ExtendedResourceName)] = newDeviceClass
c.logger.V(5).Info("Updated extended resource cache for explicit mapping",
"extendedResource", *newDeviceClass.Spec.ExtendedResourceName,
"deviceClass", newDeviceClass.Name)
}
// Always add the default mapping
defaultResourceName := v1.ResourceName(resourceapi.ResourceDeviceClassPrefix + newDeviceClass.Name)
c.mapping[defaultResourceName] = newDeviceClass.Name
c.mapping[defaultResourceName] = newDeviceClass
c.logger.V(5).Info("Updated extended resource cache for default mapping",
"extendedResource", defaultResourceName,
"deviceClass", newDeviceClass.Name)

View file

@ -32,14 +32,14 @@ import (
)
type deviceClassResolver interface {
GetDeviceClass(resourceName v1.ResourceName) string
GetDeviceClass(resourceName v1.ResourceName) *resourceapi.DeviceClass
}
func TestNil(t *testing.T) {
var cache *ExtendedResourceCache
var resolver deviceClassResolver = cache
if className := resolver.GetDeviceClass("example.com/gpu"); className != "" {
t.Errorf("Expected the empty class name from a nil instance, got instead: %q", className)
if class := resolver.GetDeviceClass("example.com/gpu"); class != nil {
t.Errorf("Expected the nil class from a nil instance, got instead: %q", class.Name)
}
}
@ -91,7 +91,7 @@ func TestHandlers(t *testing.T) {
}
},
}
erCache := NewExtendedResourceCache(nil, logger, firstHandler)
erCache := NewExtendedResourceCache(logger, firstHandler)
secondHandler := &cache.ResourceEventHandlerFuncs{
AddFunc: func(obj interface{}) {
if obj != class {
@ -101,8 +101,8 @@ func TestHandlers(t *testing.T) {
if numAdd != 2 {
t.Errorf("second handler expected Add to be called last once, actual add #%d", numAdd)
}
if className := erCache.GetDeviceClass(resourceName); className != class.Name {
t.Errorf("expected %q, got %q", class.Name, className)
if deviceClass := erCache.GetDeviceClass(resourceName); deviceClass == nil || deviceClass.Name != class.Name {
t.Errorf("expected %q, got %q", class.Name, deviceClass)
}
},
UpdateFunc: func(oldObj, newObj interface{}) {
@ -116,8 +116,8 @@ func TestHandlers(t *testing.T) {
if numUpdate != 2 {
t.Errorf("second handler expected Update to be called last once, actual update #%d", numUpdate)
}
if className := erCache.GetDeviceClass(resourceName); className != "" {
t.Errorf("expected %q, got %q", "", className)
if class := erCache.GetDeviceClass(resourceName); class != nil {
t.Errorf("expected %q, got %v", "", class)
}
},
DeleteFunc: func(obj interface{}) {
@ -128,8 +128,8 @@ func TestHandlers(t *testing.T) {
if numDelete != 2 {
t.Errorf("second handler expected Delete to be called last once, actual delete #%d", numDelete)
}
if className := erCache.GetDeviceClass(resourceName); className != "" {
t.Errorf("expected %q, got %q", "", className)
if class := erCache.GetDeviceClass(resourceName); class != nil {
t.Errorf("expected %q, got %q", "", class)
}
},
}
@ -147,7 +147,7 @@ func TestExtendedResourceCache(t *testing.T) {
informerFactory := informers.NewSharedInformerFactory(client, 0)
deviceClassInformer := informerFactory.Resource().V1().DeviceClasses()
cache := NewExtendedResourceCache(deviceClassInformer.Lister(), logger)
cache := NewExtendedResourceCache(logger)
if _, err := deviceClassInformer.Informer().AddEventHandler(cache); err != nil {
logger.Error(err, "Failed to add event handler for device classes")
}
@ -229,20 +229,21 @@ func TestExtendedResourceCache(t *testing.T) {
time.Sleep(1 * time.Second)
// Verify explicit mapping
deviceClassName := cache.GetDeviceClass("example.com/gpu")
if deviceClassName != "gpu-class" {
t.Errorf("Expected to find device class 'gpu-class' for 'example.com/gpu', got %s", deviceClassName)
deviceClass := cache.GetDeviceClass("example.com/gpu")
if deviceClass == nil || deviceClass.Name != "gpu-class" {
t.Errorf("Expected to find device class 'gpu-class' for 'example.com/gpu', got %v", deviceClass)
}
// Verify default mapping
defaultResourceName := v1.ResourceName("deviceclass.resource.kubernetes.io/fpga-class")
deviceClassName = cache.GetDeviceClass(defaultResourceName)
if deviceClassName != "fpga-class" {
t.Errorf("Expected to find device class 'fpga-class' for '%s', got %s", defaultResourceName, deviceClassName)
deviceClass = cache.GetDeviceClass(defaultResourceName)
if deviceClass == nil || deviceClass.Name != "fpga-class" {
t.Errorf("Expected to find device class 'fpga-class' for '%s', got %v", defaultResourceName, deviceClass)
}
// Verify both device classes have default mappings
if cache.GetDeviceClass("deviceclass.resource.kubernetes.io/gpu-class") != "gpu-class" {
deviceClass = cache.GetDeviceClass("deviceclass.resource.kubernetes.io/gpu-class")
if deviceClass == nil || deviceClass.Name != "gpu-class" {
t.Error("Expected default mapping for gpu-class")
}
@ -254,9 +255,9 @@ func TestExtendedResourceCache(t *testing.T) {
time.Sleep(1 * time.Second)
// should keep deviceClass1, since it is newer than deviceClass3
deviceClassName = cache.GetDeviceClass("example.com/gpu")
if deviceClassName != "gpu-class" {
t.Errorf("Expected to find device class 'gpu-class' for 'example.com/gpu', got %s", deviceClassName)
deviceClass = cache.GetDeviceClass("example.com/gpu")
if deviceClass == nil || deviceClass.Name != "gpu-class" {
t.Errorf("Expected to find device class 'gpu-class' for 'example.com/gpu', got %v", deviceClass)
}
// deviceClass4 is newer than deviceClass1, hence it will replace deviceClass1
@ -267,9 +268,9 @@ func TestExtendedResourceCache(t *testing.T) {
time.Sleep(1 * time.Second)
// deviceClass4 replaces deviceClass1, since it is newer with the same example.com/gpu extended resource name
deviceClassName = cache.GetDeviceClass("example.com/gpu")
if deviceClassName != "gpu-class-4" {
t.Errorf("Expected to find device class 'gpu-class' for 'example.com/gpu', got %s", deviceClassName)
deviceClass = cache.GetDeviceClass("example.com/gpu")
if deviceClass == nil || deviceClass.Name != "gpu-class-4" {
t.Errorf("Expected to find device class 'gpu-class' for 'example.com/gpu', got %v", deviceClass)
}
// deviceClass0 is created at the same time as deviceClass4, but its name is alphabetically ordered earlier,
@ -282,9 +283,9 @@ func TestExtendedResourceCache(t *testing.T) {
// deviceClass0 replaces deviceClass4, it is created at the same time as deviceClass4, but its name is
// alphabetically ordered earlier
deviceClassName = cache.GetDeviceClass("example.com/gpu")
if deviceClassName != "gpu-class-0" {
t.Errorf("Expected to find device class 'gpu-class' for 'example.com/gpu', got %s", deviceClassName)
deviceClass = cache.GetDeviceClass("example.com/gpu")
if deviceClass == nil || deviceClass.Name != "gpu-class-0" {
t.Errorf("Expected to find device class 'gpu-class' for 'example.com/gpu', got %v", deviceClass)
}
// Test modifying a device class
@ -297,11 +298,12 @@ func TestExtendedResourceCache(t *testing.T) {
time.Sleep(1 * time.Second)
// Should have the new mapping
if cache.GetDeviceClass("test.com/gpu") != "gpu-class-0" {
t.Errorf("Expected to find device class 'gpu-class-0' for 'test.com/gpu' after modification, got %s", deviceClassName)
deviceClass = cache.GetDeviceClass("test.com/gpu")
if deviceClass == nil || deviceClass.Name != "gpu-class-0" {
t.Errorf("Expected to find device class 'gpu-class-0' for 'test.com/gpu' after modification, got %v", deviceClass)
}
// Should not have the old mapping for example.com/gpu
if cache.GetDeviceClass("example.com/gpu") != "" {
if cache.GetDeviceClass("example.com/gpu") != nil {
t.Errorf("Expected 'example.com/gpu' to be removed after modification, got %s", cache.GetDeviceClass("example.com/gpu"))
}
@ -312,12 +314,12 @@ func TestExtendedResourceCache(t *testing.T) {
}
time.Sleep(1 * time.Second)
deviceClassName = cache.GetDeviceClass("test.com/gpu")
if deviceClassName != "" {
t.Errorf("Expected 'test.com/gpu' to be removed after deleting device class, got %s", deviceClassName)
deviceClass = cache.GetDeviceClass("test.com/gpu")
if deviceClass != nil {
t.Errorf("Expected 'test.com/gpu' to be removed after deleting device class, got %s", deviceClass)
}
// Verify the default mapping is removed
if cache.GetDeviceClass("deviceclass.resource.kubernetes.io/gpu-class-0") != "" {
if cache.GetDeviceClass("deviceclass.resource.kubernetes.io/gpu-class-0") != nil {
t.Errorf("Expected 'deviceclass.resource.kubernetes.io/gpu-class-0' to be removed after deleting device class, got %s", cache.GetDeviceClass("deviceclass.resource.kubernetes.io/gpu-class"))
}
}

View file

@ -114,7 +114,7 @@ type DeviceClassResolver interface {
// GetDeviceClass returns the device class name for the given extended resource name.
// Returns empty string if no mapping exists for the resource name or
// the DRAExtendedResource feature is disabled.
GetDeviceClass(resourceName v1.ResourceName) string
GetDeviceClass(resourceName v1.ResourceName) *resourceapi.DeviceClass
}
// SharedDRAManager can be used to obtain DRA objects, and track modifications to them in-memory - mainly by the DRA plugin.