DRA: log more information

For debugging double allocation of the same
device (https://github.com/kubernetes/kubernetes/issues/133602) it is necessary
to have information about pools, devices and in-flight claims. Log calls get
extended and the config for DRA CI jobs updated to enable higher verbosity for
relevant source files.

Log output in such a cluster at verbosity 6 looks like this:

I1215 10:28:54.166872       1 allocator_incubating.go:130] "Gathered pool information" logger="FilterWithNominatedPods.Filter.DynamicResources" pod="dra-8841/tester-3" node="kind-worker2" pools={"count":1,"devices":["dra-8841.k8s.io/kind-worker2/device-00"],"meta":[{"InvalidReason":"","id":"dra-8841.k8s.io/kind-worker2","isIncomplete":false,"isInvalid":false}]}
I1215 10:28:54.166941       1 allocator_incubating.go:254] "Gathered information about devices" logger="FilterWithNominatedPods.Filter.DynamicResources" pod="dra-8841/tester-3" node="kind-worker2" allocatedDevices={"count":2,"devices":["dra-8841.k8s.io/kind-worker/device-00","dra-8841.k8s.io/kind-worker3/device-00"]} minDevicesToBeAllocated=1
This commit is contained in:
Patrick Ohly 2025-12-15 11:54:19 +01:00
parent c180d6762d
commit 5d536bfb8e
7 changed files with 271 additions and 17 deletions

View file

@ -969,7 +969,7 @@ func (pl *DynamicResources) Reserve(ctx context.Context, cs fwk.CycleState, pod
if err != nil {
return statusError(logger, fmt.Errorf("internal error, couldn't signal allocation for claim %s", claim.Name))
}
logger.V(5).Info("Reserved resource in allocation result", "claim", klog.KObj(claim), "allocation", klog.Format(allocation))
logger.V(5).Info("Reserved resource in allocation result", "claim", klog.KObj(claim), "uid", claim.UID, "resourceVersion", claim.ResourceVersion, "allocation", klog.Format(allocation))
allocIndex++
}
}
@ -998,6 +998,7 @@ func (pl *DynamicResources) Unreserve(ctx context.Context, cs fwk.CycleState, po
// If allocation was in-flight, then it's not anymore and we need to revert the
// claim object in the assume cache to what it was before.
if deleted := pl.draManager.ResourceClaims().RemoveClaimPendingAllocation(claim.UID); deleted {
logger.V(5).Info("Released resource in allocation result", "claim", klog.KObj(claim), "uid", claim.UID, "resourceVersion", claim.ResourceVersion, "allocation", klog.Format(claim.Status.Allocation))
pl.draManager.ResourceClaims().AssumedClaimRestore(claim.Namespace, claim.Name)
}
@ -1138,7 +1139,14 @@ func (pl *DynamicResources) bindClaim(ctx context.Context, state *stateData, ind
}
if allocation != nil {
for _, claimUID := range claimUIDs {
pl.draManager.ResourceClaims().RemoveClaimPendingAllocation(claimUID)
if deleted := pl.draManager.ResourceClaims().RemoveClaimPendingAllocation(claimUID); deleted {
// Creating the claim may have failed.
resourceVersion := ""
if claim != nil {
resourceVersion = claim.ResourceVersion
}
logger.V(5).Info("Released resource in allocation result", "claim", klog.KObj(claim), "uid", claimUID, "resourceVersion", resourceVersion, "allocation", klog.Format(allocation))
}
}
}
}()

View file

@ -8,6 +8,7 @@ godebug default=go1.25
require (
github.com/blang/semver/v4 v4.0.0
github.com/go-logr/logr v1.4.3
github.com/google/cel-go v0.26.0
github.com/google/go-cmp v0.7.0
github.com/onsi/gomega v1.38.2
@ -32,7 +33,6 @@ require (
github.com/davecgh/go-spew v1.1.1 // indirect
github.com/emicklei/go-restful/v3 v3.12.2 // indirect
github.com/fxamacker/cbor/v2 v2.9.0 // indirect
github.com/go-logr/logr v1.4.3 // indirect
github.com/go-openapi/jsonpointer v0.21.0 // indirect
github.com/go-openapi/jsonreference v0.20.2 // indirect
github.com/go-openapi/swag v0.23.0 // indirect

View file

@ -157,11 +157,7 @@ func (a *Allocator) Allocate(ctx context.Context, node *v1.Node, claims []*resou
return nil, fmt.Errorf("gather pool information: %w", err)
}
alloc.pools = pools
if loggerV := alloc.logger.V(7); loggerV.Enabled() {
loggerV.Info("Gathered pool information", "numPools", len(pools), "pools", pools)
} else {
alloc.logger.V(5).Info("Gathered pool information", "numPools", len(pools))
}
alloc.logger.V(5).Info("Gathered pool information", "pools", logPools(alloc.logger, pools))
// We allocate one claim after the other and for each claim, all of
// its requests. For each individual device we pick one possible
@ -314,7 +310,7 @@ func (a *Allocator) Allocate(ctx context.Context, node *v1.Node, claims []*resou
// We can estimate the size based on what we need to allocate.
alloc.allocatingDevices = make(map[DeviceID]sets.Set[int], minDevicesTotal)
alloc.logger.V(6).Info("Gathered information about devices", "numAllocated", len(alloc.allocatedState.AllocatedDevices), "minDevicesToBeAllocated", minDevicesTotal)
alloc.logger.V(6).Info("Gathered information about devices", "allocatedDevices", logAllocatedDevices(alloc.logger, alloc.allocatedState.AllocatedDevices), "minDevicesToBeAllocated", minDevicesTotal)
// In practice, there aren't going to be many different CEL
// expressions. Most likely, there is going to be handful of different

View file

@ -19,12 +19,16 @@ package experimental
import (
"context"
"fmt"
"slices"
"github.com/go-logr/logr"
v1 "k8s.io/api/core/v1"
resourceapi "k8s.io/api/resource/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/component-helpers/scheduling/corev1/nodeaffinity"
draapi "k8s.io/dynamic-resource-allocation/api"
"k8s.io/klog/v2"
"k8s.io/utils/ptr"
)
@ -444,3 +448,126 @@ type PoolID struct {
func (p PoolID) String() string {
return p.Driver.String() + "/" + p.Pool.String()
}
// At V(6), log only a limited number of devices to avoid blowing up logs. For
// many E2E tests, 10 devices is enough for all devices without having to
// truncate, at least when running the tests sequentially.
const maxDevicesLevel6 = 10
// logPools returns a handle for the value in a structured log call which
// includes varying amounts of information about the pools, depending on
// the verbosity of the logger.
func logPools(logger klog.Logger, pools []*Pool) any {
// We need to check verbosity here because our caller's source code
// location may be relevant (-vmodule !).
helper, logger := logger.WithCallStackHelper()
helper()
// We always produce the same output at V <= 5. 6 adds a summary and
// 7 is a complete dump.
verbosity := 5
for i := 7; i > verbosity; i-- {
if loggerV := logger.V(i); loggerV.Enabled() {
verbosity = i
break
}
}
return &poolsLogger{verbosity, pools}
}
type poolsLogger struct {
verbosity int
pools []*Pool
}
var _ logr.Marshaler = &poolsLogger{}
func (p *poolsLogger) MarshalLog() any {
info := map[string]any{"count": len(p.pools)}
if p.verbosity == 6 {
meta := make([]map[string]any, len(p.pools))
for i, pool := range p.pools {
meta[i] = map[string]any{
"id": pool.PoolID.String(),
"isIncomplete": pool.IsIncomplete,
"isInvalid": pool.IsInvalid,
"InvalidReason": pool.InvalidReason,
}
}
info["meta"] = meta
info["devices"] = p.listDevices(maxDevicesLevel6)
}
if p.verbosity >= 7 {
info["devices"] = p.listDevices(-1)
info["content"] = p.pools
}
return info
}
func (p *poolsLogger) listDevices(maxDevices int) []string {
var devices []string
for _, pool := range p.pools {
devices = p.addDevicesInSlices(devices, pool.PoolID, pool.DeviceSlicesTargetingNode, maxDevices)
devices = p.addDevicesInSlices(devices, pool.PoolID, pool.DeviceSlicesNotTargetingNode, maxDevices)
}
return devices
}
func (p *poolsLogger) addDevicesInSlices(devices []string, poolID PoolID, slices []*draapi.ResourceSlice, maxDevices int) []string {
for _, slice := range slices {
for _, device := range slice.Spec.Devices {
if maxDevices != -1 && len(devices) >= maxDevices {
devices = append(devices, "...")
return devices
}
devices = append(devices, DeviceID{Driver: poolID.Driver, Pool: poolID.Pool, Device: device.Name}.String())
}
}
return devices
}
// logPools returns a handle for the value in a structured log call which
// includes varying amounts of information about the allocated devices, depending on
// the verbosity of the logger.
func logAllocatedDevices(logger klog.Logger, allocatedDevices sets.Set[DeviceID]) any {
// We need to check verbosity here because our caller's source code
// location may be relevant (-vmodule !).
helper, logger := logger.WithCallStackHelper()
helper()
// We always produce the same output at V <= 5. 6 adds all IDs.
verbosity := 5
for i := 7; i > verbosity; i-- {
if loggerV := logger.V(i); loggerV.Enabled() {
verbosity = i
break
}
}
return &allocatedDevicesLogger{verbosity, allocatedDevices}
}
type allocatedDevicesLogger struct {
verbosity int
devices sets.Set[DeviceID]
}
var _ logr.Marshaler = &allocatedDevicesLogger{}
func (a *allocatedDevicesLogger) MarshalLog() any {
info := map[string]any{"count": len(a.devices)}
if a.verbosity >= 6 {
ids := make([]string, 0, len(a.devices))
for id := range a.devices {
if a.verbosity == 6 && len(ids) >= maxDevicesLevel6 {
ids = append(ids, "...")
break
}
ids = append(ids, id.String())
}
slices.Sort(ids)
info["devices"] = ids
}
return info
}

View file

@ -127,11 +127,7 @@ func (a *Allocator) Allocate(ctx context.Context, node *v1.Node, claims []*resou
return nil, fmt.Errorf("gather pool information: %w", err)
}
alloc.pools = pools
if loggerV := alloc.logger.V(7); loggerV.Enabled() {
loggerV.Info("Gathered pool information", "numPools", len(pools), "pools", pools)
} else {
alloc.logger.V(5).Info("Gathered pool information", "numPools", len(pools))
}
alloc.logger.V(5).Info("Gathered pool information", "pools", logPools(alloc.logger, pools))
// We allocate one claim after the other and for each claim, all of
// its requests. For each individual device we pick one possible
@ -255,7 +251,7 @@ func (a *Allocator) Allocate(ctx context.Context, node *v1.Node, claims []*resou
// We can estimate the size based on what we need to allocate.
alloc.allocatingDevices = make(map[DeviceID]sets.Set[int], minDevicesTotal)
alloc.logger.V(6).Info("Gathered information about devices", "numAllocated", len(alloc.allocatedDevices), "minDevicesToBeAllocated", minDevicesTotal)
alloc.logger.V(5).Info("Gathered information about devices", "allocatedDevices", logAllocatedDevices(alloc.logger, alloc.allocatedDevices), "minDevicesToBeAllocated", minDevicesTotal)
// In practice, there aren't going to be many different CEL
// expressions. Most likely, there is going to be handful of different

View file

@ -19,12 +19,16 @@ package incubating
import (
"context"
"fmt"
"slices"
"github.com/go-logr/logr"
v1 "k8s.io/api/core/v1"
resourceapi "k8s.io/api/resource/v1"
"k8s.io/apimachinery/pkg/util/sets"
"k8s.io/component-helpers/scheduling/corev1/nodeaffinity"
draapi "k8s.io/dynamic-resource-allocation/api"
"k8s.io/klog/v2"
"k8s.io/utils/ptr"
)
@ -390,3 +394,126 @@ type PoolID struct {
func (p PoolID) String() string {
return p.Driver.String() + "/" + p.Pool.String()
}
// At V(6), log only a limited number of devices to avoid blowing up logs. For
// many E2E tests, 10 devices is enough for all devices without having to
// truncate, at least when running the tests sequentially.
const maxDevicesLevel6 = 10
// logPools returns a handle for the value in a structured log call which
// includes varying amounts of information about the pools, depending on
// the verbosity of the logger.
func logPools(logger klog.Logger, pools []*Pool) any {
// We need to check verbosity here because our caller's source code
// location may be relevant (-vmodule !).
helper, logger := logger.WithCallStackHelper()
helper()
// We always produce the same output at V <= 5. 6 adds a summary and
// 7 is a complete dump.
verbosity := 5
for i := 7; i > verbosity; i-- {
if loggerV := logger.V(i); loggerV.Enabled() {
verbosity = i
break
}
}
return &poolsLogger{verbosity, pools}
}
type poolsLogger struct {
verbosity int
pools []*Pool
}
var _ logr.Marshaler = &poolsLogger{}
func (p *poolsLogger) MarshalLog() any {
info := map[string]any{"count": len(p.pools)}
if p.verbosity == 6 {
meta := make([]map[string]any, len(p.pools))
for i, pool := range p.pools {
meta[i] = map[string]any{
"id": pool.PoolID.String(),
"isIncomplete": pool.IsIncomplete,
"isInvalid": pool.IsInvalid,
"InvalidReason": pool.InvalidReason,
}
}
info["meta"] = meta
info["devices"] = p.listDevices(maxDevicesLevel6)
}
if p.verbosity >= 7 {
info["devices"] = p.listDevices(-1)
info["content"] = p.pools
}
return info
}
func (p *poolsLogger) listDevices(maxDevices int) []string {
var devices []string
for _, pool := range p.pools {
devices = p.addDevicesInSlices(devices, pool.PoolID, pool.DeviceSlicesTargetingNode, maxDevices)
devices = p.addDevicesInSlices(devices, pool.PoolID, pool.DeviceSlicesNotTargetingNode, maxDevices)
}
return devices
}
func (p *poolsLogger) addDevicesInSlices(devices []string, poolID PoolID, slices []*draapi.ResourceSlice, maxDevices int) []string {
for _, slice := range slices {
for _, device := range slice.Spec.Devices {
if maxDevices != -1 && len(devices) >= maxDevices {
devices = append(devices, "...")
return devices
}
devices = append(devices, DeviceID{Driver: poolID.Driver, Pool: poolID.Pool, Device: device.Name}.String())
}
}
return devices
}
// logPools returns a handle for the value in a structured log call which
// includes varying amounts of information about the allocated devices, depending on
// the verbosity of the logger.
func logAllocatedDevices(logger klog.Logger, allocatedDevices sets.Set[DeviceID]) any {
// We need to check verbosity here because our caller's source code
// location may be relevant (-vmodule !).
helper, logger := logger.WithCallStackHelper()
helper()
// We always produce the same output at V <= 5. 6 adds all IDs.
verbosity := 5
for i := 7; i > verbosity; i-- {
if loggerV := logger.V(i); loggerV.Enabled() {
verbosity = i
break
}
}
return &allocatedDevicesLogger{verbosity, allocatedDevices}
}
type allocatedDevicesLogger struct {
verbosity int
devices sets.Set[DeviceID]
}
var _ logr.Marshaler = &allocatedDevicesLogger{}
func (a *allocatedDevicesLogger) MarshalLog() any {
info := map[string]any{"count": len(a.devices)}
if a.verbosity >= 6 {
ids := make([]string, 0, len(a.devices))
for id := range a.devices {
if a.verbosity == 6 && len(ids) >= maxDevicesLevel6 {
ids = append(ids, "...")
break
}
ids = append(ids, id.String())
}
slices.Sort(ids)
info["devices"] = ids
}
return info
}

View file

@ -23,7 +23,7 @@ kubeadmConfigPatches:
- name: "v"
value: "5"
- name: "vmodule"
value: "allocator=6,dynamicresources=6" # structured/allocator.go, DRA scheduler plugin
value: "allocator*=6,pools*=6,dynamicresources=6,allocateddevices=6,dra_manager=6,extendeddynamicresources=6" # structured/internal/*/allocator*.go, DRA scheduler plugin
controllerManager:
extraArgs:
- name: "v"
@ -53,7 +53,7 @@ kubeadmConfigPatches:
scheduler:
extraArgs:
v: "5"
vmodule: "allocator=6,dynamicresources=6" # structured/allocator.go, DRA scheduler plugin
vmodule: "allocator*=6,pools*=6,dynamicresources=6,allocateddevices=6,dra_manager=6,extendeddynamicresources=6" # structured/internal/*/allocator*.go, DRA scheduler plugin
controllerManager:
extraArgs:
v: "5"