mirror of
https://github.com/kubernetes/kubernetes.git
synced 2026-05-23 10:29:27 -04:00
GatherAllocatedState and ListAllAllocatedDevices need to collect information
from different sources (allocated devices, in-flight claims), potentially even
multiple times (GatherAllocatedState first gets allocated devices, then the
capacities).
The underlying assumption that nothing bad happens in parallel is not always
true. The following log snippet shows how an update of the assume
cache (feeding the allocated devices tracker) and in-flight claims lands such
that GatherAllocatedState doesn't see the device in that claim as allocated:
dra_manager.go:263: I0115 15:11:04.407714 18778] scheduler: Starting GatherAllocatedState
...
allocateddevices.go:189: I0115 15:11:04.407945 18066] scheduler: Observed device allocation device="testdra-all-usesallresources-hvs5d.driver/worker-5/worker-5-device-094" claim="testdra-all-usesallresources-hvs5d/claim-0553"
dynamicresources.go:1150: I0115 15:11:04.407981 89109] scheduler: Claim stored in assume cache pod="testdra-all-usesallresources-hvs5d/my-pod-0553" claim="testdra-all-usesallresources-hvs5d/claim-0553" uid=<types.UID>: a84d3c4d-f752-4cfd-8993-f4ce58643685 resourceVersion="5680"
dra_manager.go:201: I0115 15:11:04.408008 89109] scheduler: Removed in-flight claim claim="testdra-all-usesallresources-hvs5d/claim-0553" uid=<types.UID>: a84d3c4d-f752-4cfd-8993-f4ce58643685 version="1211"
dynamicresources.go:1157: I0115 15:11:04.408044 89109] scheduler: Removed claim from in-flight claims pod="testdra-all-usesallresources-hvs5d/my-pod-0553" claim="testdra-all-usesallresources-hvs5d/claim-0553" uid=<types.UID>: a84d3c4d-f752-4cfd-8993-f4ce58643685 resourceVersion="5680" allocation=<
{
"devices": {
"results": [
{
"request": "req-1",
"driver": "testdra-all-usesallresources-hvs5d.driver",
"pool": "worker-5",
"device": "worker-5-device-094"
}
]
},
"nodeSelector": {
"nodeSelectorTerms": [
{
"matchFields": [
{
"key": "metadata.name",
"operator": "In",
"values": [
"worker-5"
]
}
]
}
]
},
"allocationTimestamp": "2026-01-15T14:11:04Z"
}
>
dra_manager.go:280: I0115 15:11:04.408085 18778] scheduler: Device is in flight for allocation device="testdra-all-usesallresources-hvs5d.driver/worker-5/worker-5-device-095" claim="testdra-all-usesallresources-hvs5d/claim-0086"
dra_manager.go:280: I0115 15:11:04.408137 18778] scheduler: Device is in flight for allocation device="testdra-all-usesallresources-hvs5d.driver/worker-5/worker-5-device-096" claim="testdra-all-usesallresources-hvs5d/claim-0165"
default_binder.go:69: I0115 15:11:04.408175 89109] scheduler: Attempting to bind pod to node pod="testdra-all-usesallresources-hvs5d/my-pod-0553" node="worker-5"
dra_manager.go:265: I0115 15:11:04.408264 18778] scheduler: Finished GatherAllocatedState allocatedDevices=<map[string]interface {} | len:2>: {
Initial state: "worker-5-device-094" is in-flight, not in cache
- goroutine #1: starts GatherAllocatedState, copies cache
- goroutine #2: adds to assume cache, removes from in-flight
- goroutine #1: checks in-flight
=> device never seen as allocated
This is the second reason for double allocation of the same device in two
different claims. The other was timing in the assume cache. Both were
tracked down with an integration test (separate commit). It did not fail
all the time, but enough that regressions should show up as flakes.
280 lines
9.3 KiB
Go
280 lines
9.3 KiB
Go
/*
|
|
Copyright 2024 The Kubernetes Authors.
|
|
|
|
Licensed under the Apache License, Version 2.0 (the "License");
|
|
you may not use this file except in compliance with the License.
|
|
You may obtain a copy of the License at
|
|
|
|
http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
Unless required by applicable law or agreed to in writing, software
|
|
distributed under the License is distributed on an "AS IS" BASIS,
|
|
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
See the License for the specific language governing permissions and
|
|
limitations under the License.
|
|
*/
|
|
|
|
package dynamicresources
|
|
|
|
import (
|
|
"sync"
|
|
|
|
resourceapi "k8s.io/api/resource/v1"
|
|
"k8s.io/apimachinery/pkg/util/sets"
|
|
utilfeature "k8s.io/apiserver/pkg/util/feature"
|
|
"k8s.io/client-go/tools/cache"
|
|
"k8s.io/dynamic-resource-allocation/structured"
|
|
"k8s.io/klog/v2"
|
|
"k8s.io/kubernetes/pkg/features"
|
|
schedutil "k8s.io/kubernetes/pkg/scheduler/util"
|
|
"k8s.io/utils/ptr"
|
|
)
|
|
|
|
// foreachAllocatedDevice invokes the provided callback for each
|
|
// device in the claim's allocation result which was allocated
|
|
// exclusively for the claim.
|
|
//
|
|
// Devices allocated with admin access can be shared with other
|
|
// claims and are skipped without invoking the callback.
|
|
//
|
|
// foreachAllocatedDevice does nothing if the claim is not allocated.
|
|
func foreachAllocatedDevice(claim *resourceapi.ResourceClaim,
|
|
dedicatedDeviceCallback func(deviceID structured.DeviceID),
|
|
enabledConsumableCapacity bool,
|
|
sharedDeviceCallback func(structured.SharedDeviceID),
|
|
consumedCapacityCallback func(structured.DeviceConsumedCapacity)) {
|
|
if claim.Status.Allocation == nil {
|
|
return
|
|
}
|
|
for _, result := range claim.Status.Allocation.Devices.Results {
|
|
// Kubernetes 1.31 did not set this, 1.32 always does.
|
|
// Supporting 1.31 is not worth the additional code that
|
|
// would have to be written (= looking up in request) because
|
|
// it is extremely unlikely that there really is a result
|
|
// that still exists in a cluster from 1.31 where this matters.
|
|
if ptr.Deref(result.AdminAccess, false) {
|
|
// Is not considered as allocated.
|
|
continue
|
|
}
|
|
deviceID := structured.MakeDeviceID(result.Driver, result.Pool, result.Device)
|
|
|
|
// None of the users of this helper need to abort iterating,
|
|
// therefore it's not supported as it only would add overhead.
|
|
|
|
// Execute sharedDeviceCallback and consumedCapacityCallback correspondingly
|
|
// if DRAConsumableCapacity feature is enabled
|
|
if enabledConsumableCapacity {
|
|
shared := result.ShareID != nil
|
|
if shared {
|
|
sharedDeviceID := structured.MakeSharedDeviceID(deviceID, result.ShareID)
|
|
sharedDeviceCallback(sharedDeviceID)
|
|
if result.ConsumedCapacity != nil {
|
|
deviceConsumedCapacity := structured.NewDeviceConsumedCapacity(deviceID, result.ConsumedCapacity)
|
|
consumedCapacityCallback(deviceConsumedCapacity)
|
|
}
|
|
continue
|
|
}
|
|
}
|
|
|
|
// Otherwise, execute dedicatedDeviceCallback
|
|
dedicatedDeviceCallback(deviceID)
|
|
}
|
|
}
|
|
|
|
// allocatedDevices reacts to events in a cache and maintains a set of all allocated devices.
|
|
// This is cheaper than repeatedly calling List, making strings unique, and building the set
|
|
// each time PreFilter is called.
|
|
//
|
|
// To simplify detecting concurrent changes, each modification bumps a revision counter,
|
|
// similar to ResourceVersion in the apiserver. Get and Capacities include the
|
|
// current value in their result. A caller than can compare againt the current value
|
|
// to determine whether some prior results are still up-to-date, without having to get
|
|
// and compare them.
|
|
//
|
|
// All methods are thread-safe. Get returns a cloned set.
|
|
type allocatedDevices struct {
|
|
logger klog.Logger
|
|
|
|
mutex sync.RWMutex
|
|
revision int64
|
|
ids sets.Set[structured.DeviceID]
|
|
shareIDs sets.Set[structured.SharedDeviceID]
|
|
capacities structured.ConsumedCapacityCollection
|
|
enabledConsumableCapacity bool
|
|
}
|
|
|
|
func newAllocatedDevices(logger klog.Logger) *allocatedDevices {
|
|
return &allocatedDevices{
|
|
logger: logger,
|
|
ids: sets.New[structured.DeviceID](),
|
|
shareIDs: sets.New[structured.SharedDeviceID](),
|
|
capacities: structured.NewConsumedCapacityCollection(),
|
|
enabledConsumableCapacity: utilfeature.DefaultFeatureGate.Enabled(features.DRAConsumableCapacity),
|
|
}
|
|
}
|
|
|
|
func (a *allocatedDevices) Get() (sets.Set[structured.DeviceID], int64) {
|
|
a.mutex.RLock()
|
|
defer a.mutex.RUnlock()
|
|
|
|
return a.ids.Clone(), a.revision
|
|
}
|
|
|
|
func (a *allocatedDevices) Capacities() (structured.ConsumedCapacityCollection, int64) {
|
|
a.mutex.RLock()
|
|
defer a.mutex.RUnlock()
|
|
|
|
return a.capacities.Clone(), a.revision
|
|
}
|
|
|
|
func (a *allocatedDevices) Revision() int64 {
|
|
a.mutex.RLock()
|
|
defer a.mutex.RUnlock()
|
|
|
|
return a.revision
|
|
}
|
|
|
|
func (a *allocatedDevices) handlers() cache.ResourceEventHandler {
|
|
return cache.ResourceEventHandlerFuncs{
|
|
AddFunc: a.onAdd,
|
|
UpdateFunc: a.onUpdate,
|
|
DeleteFunc: a.onDelete,
|
|
}
|
|
}
|
|
|
|
func (a *allocatedDevices) onAdd(obj any) {
|
|
claim, _, err := schedutil.As[*resourceapi.ResourceClaim](obj, nil)
|
|
if err != nil {
|
|
// Shouldn't happen.
|
|
a.logger.Error(err, "unexpected object in allocatedDevices.onAdd")
|
|
return
|
|
}
|
|
|
|
if claim.Status.Allocation != nil {
|
|
a.addDevices(claim)
|
|
}
|
|
}
|
|
|
|
func (a *allocatedDevices) onUpdate(oldObj, newObj any) {
|
|
originalClaim, modifiedClaim, err := schedutil.As[*resourceapi.ResourceClaim](oldObj, newObj)
|
|
if err != nil {
|
|
// Shouldn't happen.
|
|
a.logger.Error(err, "unexpected object in allocatedDevices.onUpdate")
|
|
return
|
|
}
|
|
|
|
switch {
|
|
case originalClaim.Status.Allocation == nil && modifiedClaim.Status.Allocation != nil:
|
|
a.addDevices(modifiedClaim)
|
|
case originalClaim.Status.Allocation != nil && modifiedClaim.Status.Allocation == nil:
|
|
a.removeDevices(originalClaim)
|
|
default:
|
|
// Nothing to do. Either both nil or both non-nil, in which case the content
|
|
// also must be the same (immutable!).
|
|
}
|
|
}
|
|
|
|
func (a *allocatedDevices) onDelete(obj any) {
|
|
claim, _, err := schedutil.As[*resourceapi.ResourceClaim](obj, nil)
|
|
if err != nil {
|
|
// Shouldn't happen.
|
|
a.logger.Error(err, "unexpected object in allocatedDevices.onDelete")
|
|
return
|
|
}
|
|
|
|
a.removeDevices(claim)
|
|
}
|
|
|
|
func (a *allocatedDevices) addDevices(claim *resourceapi.ResourceClaim) {
|
|
if claim.Status.Allocation == nil {
|
|
return
|
|
}
|
|
// Locking of the mutex gets minimized by pre-computing what needs to be done
|
|
// without holding the lock.
|
|
deviceIDs := make([]structured.DeviceID, 0, 20)
|
|
var shareIDs []structured.SharedDeviceID
|
|
var deviceCapacities []structured.DeviceConsumedCapacity
|
|
if a.enabledConsumableCapacity {
|
|
shareIDs = make([]structured.SharedDeviceID, 0, 20)
|
|
deviceCapacities = make([]structured.DeviceConsumedCapacity, 0, 20)
|
|
}
|
|
foreachAllocatedDevice(claim,
|
|
func(deviceID structured.DeviceID) {
|
|
a.logger.V(6).Info("Observed device allocation", "device", deviceID, "claim", klog.KObj(claim))
|
|
deviceIDs = append(deviceIDs, deviceID)
|
|
},
|
|
a.enabledConsumableCapacity,
|
|
func(sharedDeviceID structured.SharedDeviceID) {
|
|
a.logger.V(6).Info("Observed shared device allocation", "shared device", sharedDeviceID, "claim", klog.KObj(claim))
|
|
shareIDs = append(shareIDs, sharedDeviceID)
|
|
},
|
|
func(capacity structured.DeviceConsumedCapacity) {
|
|
a.logger.V(6).Info("Observed consumed capacity", "device", capacity.DeviceID, "consumed capacity", capacity.ConsumedCapacity, "claim", klog.KObj(claim))
|
|
deviceCapacities = append(deviceCapacities, capacity)
|
|
},
|
|
)
|
|
|
|
if len(deviceIDs) == 0 && len(shareIDs) == 0 && len(deviceCapacities) == 0 {
|
|
return
|
|
}
|
|
|
|
a.mutex.Lock()
|
|
defer a.mutex.Unlock()
|
|
a.revision++
|
|
for _, deviceID := range deviceIDs {
|
|
a.ids.Insert(deviceID)
|
|
}
|
|
for _, shareID := range shareIDs {
|
|
a.shareIDs.Insert(shareID)
|
|
}
|
|
for _, capacity := range deviceCapacities {
|
|
a.capacities.Insert(capacity)
|
|
}
|
|
}
|
|
|
|
func (a *allocatedDevices) removeDevices(claim *resourceapi.ResourceClaim) {
|
|
if claim.Status.Allocation == nil {
|
|
return
|
|
}
|
|
|
|
// Locking of the mutex gets minimized by pre-computing what needs to be done
|
|
// without holding the lock.
|
|
deviceIDs := make([]structured.DeviceID, 0, 20)
|
|
var shareIDs []structured.SharedDeviceID
|
|
var deviceCapacities []structured.DeviceConsumedCapacity
|
|
if a.enabledConsumableCapacity {
|
|
shareIDs = make([]structured.SharedDeviceID, 0, 20)
|
|
deviceCapacities = make([]structured.DeviceConsumedCapacity, 0, 20)
|
|
}
|
|
foreachAllocatedDevice(claim,
|
|
func(deviceID structured.DeviceID) {
|
|
a.logger.V(6).Info("Observed device deallocation", "device", deviceID, "claim", klog.KObj(claim))
|
|
deviceIDs = append(deviceIDs, deviceID)
|
|
},
|
|
a.enabledConsumableCapacity,
|
|
func(sharedDeviceID structured.SharedDeviceID) {
|
|
a.logger.V(6).Info("Observed shared device deallocation", "shared device", sharedDeviceID, "claim", klog.KObj(claim))
|
|
shareIDs = append(shareIDs, sharedDeviceID)
|
|
},
|
|
func(capacity structured.DeviceConsumedCapacity) {
|
|
a.logger.V(6).Info("Observed consumed capacity release", "device id", capacity.DeviceID, "consumed capacity", capacity.ConsumedCapacity, "claim", klog.KObj(claim))
|
|
deviceCapacities = append(deviceCapacities, capacity)
|
|
})
|
|
|
|
if len(deviceIDs) == 0 && len(shareIDs) == 0 && len(deviceCapacities) == 0 {
|
|
return
|
|
}
|
|
|
|
a.mutex.Lock()
|
|
defer a.mutex.Unlock()
|
|
a.revision++
|
|
for _, deviceID := range deviceIDs {
|
|
a.ids.Delete(deviceID)
|
|
}
|
|
for _, shareID := range shareIDs {
|
|
a.shareIDs.Delete(shareID)
|
|
}
|
|
for _, capacity := range deviceCapacities {
|
|
a.capacities.Remove(capacity)
|
|
}
|
|
}
|