From 5d536bfb8e6950233ea658a53c77fff62b26d115 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Mon, 15 Dec 2025 11:54:19 +0100 Subject: [PATCH] DRA: log more information For debugging double allocation of the same device (https://github.com/kubernetes/kubernetes/issues/133602) it is necessary to have information about pools, devices and in-flight claims. Log calls get extended and the config for DRA CI jobs updated to enable higher verbosity for relevant source files. Log output in such a cluster at verbosity 6 looks like this: I1215 10:28:54.166872 1 allocator_incubating.go:130] "Gathered pool information" logger="FilterWithNominatedPods.Filter.DynamicResources" pod="dra-8841/tester-3" node="kind-worker2" pools={"count":1,"devices":["dra-8841.k8s.io/kind-worker2/device-00"],"meta":[{"InvalidReason":"","id":"dra-8841.k8s.io/kind-worker2","isIncomplete":false,"isInvalid":false}]} I1215 10:28:54.166941 1 allocator_incubating.go:254] "Gathered information about devices" logger="FilterWithNominatedPods.Filter.DynamicResources" pod="dra-8841/tester-3" node="kind-worker2" allocatedDevices={"count":2,"devices":["dra-8841.k8s.io/kind-worker/device-00","dra-8841.k8s.io/kind-worker3/device-00"]} minDevicesToBeAllocated=1 --- .../dynamicresources/dynamicresources.go | 12 +- .../k8s.io/dynamic-resource-allocation/go.mod | 2 +- .../experimental/allocator_experimental.go | 8 +- .../experimental/pools_experimental.go | 127 ++++++++++++++++++ .../incubating/allocator_incubating.go | 8 +- .../internal/incubating/pools_incubating.go | 127 ++++++++++++++++++ test/e2e/dra/kind.yaml | 4 +- 7 files changed, 271 insertions(+), 17 deletions(-) diff --git a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go index b92a3e7b057..08cbece1711 100644 --- a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go +++ b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go @@ -969,7 +969,7 @@ func (pl *DynamicResources) Reserve(ctx context.Context, cs fwk.CycleState, pod if err != nil { return statusError(logger, fmt.Errorf("internal error, couldn't signal allocation for claim %s", claim.Name)) } - logger.V(5).Info("Reserved resource in allocation result", "claim", klog.KObj(claim), "allocation", klog.Format(allocation)) + logger.V(5).Info("Reserved resource in allocation result", "claim", klog.KObj(claim), "uid", claim.UID, "resourceVersion", claim.ResourceVersion, "allocation", klog.Format(allocation)) allocIndex++ } } @@ -998,6 +998,7 @@ func (pl *DynamicResources) Unreserve(ctx context.Context, cs fwk.CycleState, po // If allocation was in-flight, then it's not anymore and we need to revert the // claim object in the assume cache to what it was before. if deleted := pl.draManager.ResourceClaims().RemoveClaimPendingAllocation(claim.UID); deleted { + logger.V(5).Info("Released resource in allocation result", "claim", klog.KObj(claim), "uid", claim.UID, "resourceVersion", claim.ResourceVersion, "allocation", klog.Format(claim.Status.Allocation)) pl.draManager.ResourceClaims().AssumedClaimRestore(claim.Namespace, claim.Name) } @@ -1138,7 +1139,14 @@ func (pl *DynamicResources) bindClaim(ctx context.Context, state *stateData, ind } if allocation != nil { for _, claimUID := range claimUIDs { - pl.draManager.ResourceClaims().RemoveClaimPendingAllocation(claimUID) + if deleted := pl.draManager.ResourceClaims().RemoveClaimPendingAllocation(claimUID); deleted { + // Creating the claim may have failed. + resourceVersion := "" + if claim != nil { + resourceVersion = claim.ResourceVersion + } + logger.V(5).Info("Released resource in allocation result", "claim", klog.KObj(claim), "uid", claimUID, "resourceVersion", resourceVersion, "allocation", klog.Format(allocation)) + } } } }() diff --git a/staging/src/k8s.io/dynamic-resource-allocation/go.mod b/staging/src/k8s.io/dynamic-resource-allocation/go.mod index d9037c93836..99d92838fd0 100644 --- a/staging/src/k8s.io/dynamic-resource-allocation/go.mod +++ b/staging/src/k8s.io/dynamic-resource-allocation/go.mod @@ -8,6 +8,7 @@ godebug default=go1.25 require ( github.com/blang/semver/v4 v4.0.0 + github.com/go-logr/logr v1.4.3 github.com/google/cel-go v0.26.0 github.com/google/go-cmp v0.7.0 github.com/onsi/gomega v1.38.2 @@ -32,7 +33,6 @@ require ( github.com/davecgh/go-spew v1.1.1 // indirect github.com/emicklei/go-restful/v3 v3.12.2 // indirect github.com/fxamacker/cbor/v2 v2.9.0 // indirect - github.com/go-logr/logr v1.4.3 // indirect github.com/go-openapi/jsonpointer v0.21.0 // indirect github.com/go-openapi/jsonreference v0.20.2 // indirect github.com/go-openapi/swag v0.23.0 // indirect diff --git a/staging/src/k8s.io/dynamic-resource-allocation/structured/internal/experimental/allocator_experimental.go b/staging/src/k8s.io/dynamic-resource-allocation/structured/internal/experimental/allocator_experimental.go index fd63fff1e8e..b01fae657b7 100644 --- a/staging/src/k8s.io/dynamic-resource-allocation/structured/internal/experimental/allocator_experimental.go +++ b/staging/src/k8s.io/dynamic-resource-allocation/structured/internal/experimental/allocator_experimental.go @@ -157,11 +157,7 @@ func (a *Allocator) Allocate(ctx context.Context, node *v1.Node, claims []*resou return nil, fmt.Errorf("gather pool information: %w", err) } alloc.pools = pools - if loggerV := alloc.logger.V(7); loggerV.Enabled() { - loggerV.Info("Gathered pool information", "numPools", len(pools), "pools", pools) - } else { - alloc.logger.V(5).Info("Gathered pool information", "numPools", len(pools)) - } + alloc.logger.V(5).Info("Gathered pool information", "pools", logPools(alloc.logger, pools)) // We allocate one claim after the other and for each claim, all of // its requests. For each individual device we pick one possible @@ -314,7 +310,7 @@ func (a *Allocator) Allocate(ctx context.Context, node *v1.Node, claims []*resou // We can estimate the size based on what we need to allocate. alloc.allocatingDevices = make(map[DeviceID]sets.Set[int], minDevicesTotal) - alloc.logger.V(6).Info("Gathered information about devices", "numAllocated", len(alloc.allocatedState.AllocatedDevices), "minDevicesToBeAllocated", minDevicesTotal) + alloc.logger.V(6).Info("Gathered information about devices", "allocatedDevices", logAllocatedDevices(alloc.logger, alloc.allocatedState.AllocatedDevices), "minDevicesToBeAllocated", minDevicesTotal) // In practice, there aren't going to be many different CEL // expressions. Most likely, there is going to be handful of different diff --git a/staging/src/k8s.io/dynamic-resource-allocation/structured/internal/experimental/pools_experimental.go b/staging/src/k8s.io/dynamic-resource-allocation/structured/internal/experimental/pools_experimental.go index bd265d69c6f..ec39aea3fd3 100644 --- a/staging/src/k8s.io/dynamic-resource-allocation/structured/internal/experimental/pools_experimental.go +++ b/staging/src/k8s.io/dynamic-resource-allocation/structured/internal/experimental/pools_experimental.go @@ -19,12 +19,16 @@ package experimental import ( "context" "fmt" + "slices" + + "github.com/go-logr/logr" v1 "k8s.io/api/core/v1" resourceapi "k8s.io/api/resource/v1" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/component-helpers/scheduling/corev1/nodeaffinity" draapi "k8s.io/dynamic-resource-allocation/api" + "k8s.io/klog/v2" "k8s.io/utils/ptr" ) @@ -444,3 +448,126 @@ type PoolID struct { func (p PoolID) String() string { return p.Driver.String() + "/" + p.Pool.String() } + +// At V(6), log only a limited number of devices to avoid blowing up logs. For +// many E2E tests, 10 devices is enough for all devices without having to +// truncate, at least when running the tests sequentially. +const maxDevicesLevel6 = 10 + +// logPools returns a handle for the value in a structured log call which +// includes varying amounts of information about the pools, depending on +// the verbosity of the logger. +func logPools(logger klog.Logger, pools []*Pool) any { + // We need to check verbosity here because our caller's source code + // location may be relevant (-vmodule !). + helper, logger := logger.WithCallStackHelper() + helper() + + // We always produce the same output at V <= 5. 6 adds a summary and + // 7 is a complete dump. + verbosity := 5 + for i := 7; i > verbosity; i-- { + if loggerV := logger.V(i); loggerV.Enabled() { + verbosity = i + break + } + } + return &poolsLogger{verbosity, pools} +} + +type poolsLogger struct { + verbosity int + pools []*Pool +} + +var _ logr.Marshaler = &poolsLogger{} + +func (p *poolsLogger) MarshalLog() any { + info := map[string]any{"count": len(p.pools)} + if p.verbosity == 6 { + meta := make([]map[string]any, len(p.pools)) + for i, pool := range p.pools { + meta[i] = map[string]any{ + "id": pool.PoolID.String(), + "isIncomplete": pool.IsIncomplete, + "isInvalid": pool.IsInvalid, + "InvalidReason": pool.InvalidReason, + } + } + info["meta"] = meta + info["devices"] = p.listDevices(maxDevicesLevel6) + } + if p.verbosity >= 7 { + info["devices"] = p.listDevices(-1) + info["content"] = p.pools + } + return info +} + +func (p *poolsLogger) listDevices(maxDevices int) []string { + var devices []string + for _, pool := range p.pools { + devices = p.addDevicesInSlices(devices, pool.PoolID, pool.DeviceSlicesTargetingNode, maxDevices) + devices = p.addDevicesInSlices(devices, pool.PoolID, pool.DeviceSlicesNotTargetingNode, maxDevices) + } + return devices +} + +func (p *poolsLogger) addDevicesInSlices(devices []string, poolID PoolID, slices []*draapi.ResourceSlice, maxDevices int) []string { + for _, slice := range slices { + for _, device := range slice.Spec.Devices { + if maxDevices != -1 && len(devices) >= maxDevices { + devices = append(devices, "...") + return devices + } + devices = append(devices, DeviceID{Driver: poolID.Driver, Pool: poolID.Pool, Device: device.Name}.String()) + } + } + return devices +} + +// logPools returns a handle for the value in a structured log call which +// includes varying amounts of information about the allocated devices, depending on +// the verbosity of the logger. +func logAllocatedDevices(logger klog.Logger, allocatedDevices sets.Set[DeviceID]) any { + // We need to check verbosity here because our caller's source code + // location may be relevant (-vmodule !). + helper, logger := logger.WithCallStackHelper() + helper() + + // We always produce the same output at V <= 5. 6 adds all IDs. + verbosity := 5 + for i := 7; i > verbosity; i-- { + if loggerV := logger.V(i); loggerV.Enabled() { + verbosity = i + break + } + } + + return &allocatedDevicesLogger{verbosity, allocatedDevices} +} + +type allocatedDevicesLogger struct { + verbosity int + devices sets.Set[DeviceID] +} + +var _ logr.Marshaler = &allocatedDevicesLogger{} + +func (a *allocatedDevicesLogger) MarshalLog() any { + info := map[string]any{"count": len(a.devices)} + if a.verbosity >= 6 { + ids := make([]string, 0, len(a.devices)) + for id := range a.devices { + if a.verbosity == 6 && len(ids) >= maxDevicesLevel6 { + ids = append(ids, "...") + break + } + ids = append(ids, id.String()) + } + slices.Sort(ids) + info["devices"] = ids + + } + return info +} diff --git a/staging/src/k8s.io/dynamic-resource-allocation/structured/internal/incubating/allocator_incubating.go b/staging/src/k8s.io/dynamic-resource-allocation/structured/internal/incubating/allocator_incubating.go index a051dd9640c..f5cd670e2e7 100644 --- a/staging/src/k8s.io/dynamic-resource-allocation/structured/internal/incubating/allocator_incubating.go +++ b/staging/src/k8s.io/dynamic-resource-allocation/structured/internal/incubating/allocator_incubating.go @@ -127,11 +127,7 @@ func (a *Allocator) Allocate(ctx context.Context, node *v1.Node, claims []*resou return nil, fmt.Errorf("gather pool information: %w", err) } alloc.pools = pools - if loggerV := alloc.logger.V(7); loggerV.Enabled() { - loggerV.Info("Gathered pool information", "numPools", len(pools), "pools", pools) - } else { - alloc.logger.V(5).Info("Gathered pool information", "numPools", len(pools)) - } + alloc.logger.V(5).Info("Gathered pool information", "pools", logPools(alloc.logger, pools)) // We allocate one claim after the other and for each claim, all of // its requests. For each individual device we pick one possible @@ -255,7 +251,7 @@ func (a *Allocator) Allocate(ctx context.Context, node *v1.Node, claims []*resou // We can estimate the size based on what we need to allocate. alloc.allocatingDevices = make(map[DeviceID]sets.Set[int], minDevicesTotal) - alloc.logger.V(6).Info("Gathered information about devices", "numAllocated", len(alloc.allocatedDevices), "minDevicesToBeAllocated", minDevicesTotal) + alloc.logger.V(5).Info("Gathered information about devices", "allocatedDevices", logAllocatedDevices(alloc.logger, alloc.allocatedDevices), "minDevicesToBeAllocated", minDevicesTotal) // In practice, there aren't going to be many different CEL // expressions. Most likely, there is going to be handful of different diff --git a/staging/src/k8s.io/dynamic-resource-allocation/structured/internal/incubating/pools_incubating.go b/staging/src/k8s.io/dynamic-resource-allocation/structured/internal/incubating/pools_incubating.go index 5522b1a8fde..ac9b05df03f 100644 --- a/staging/src/k8s.io/dynamic-resource-allocation/structured/internal/incubating/pools_incubating.go +++ b/staging/src/k8s.io/dynamic-resource-allocation/structured/internal/incubating/pools_incubating.go @@ -19,12 +19,16 @@ package incubating import ( "context" "fmt" + "slices" + + "github.com/go-logr/logr" v1 "k8s.io/api/core/v1" resourceapi "k8s.io/api/resource/v1" "k8s.io/apimachinery/pkg/util/sets" "k8s.io/component-helpers/scheduling/corev1/nodeaffinity" draapi "k8s.io/dynamic-resource-allocation/api" + "k8s.io/klog/v2" "k8s.io/utils/ptr" ) @@ -390,3 +394,126 @@ type PoolID struct { func (p PoolID) String() string { return p.Driver.String() + "/" + p.Pool.String() } + +// At V(6), log only a limited number of devices to avoid blowing up logs. For +// many E2E tests, 10 devices is enough for all devices without having to +// truncate, at least when running the tests sequentially. +const maxDevicesLevel6 = 10 + +// logPools returns a handle for the value in a structured log call which +// includes varying amounts of information about the pools, depending on +// the verbosity of the logger. +func logPools(logger klog.Logger, pools []*Pool) any { + // We need to check verbosity here because our caller's source code + // location may be relevant (-vmodule !). + helper, logger := logger.WithCallStackHelper() + helper() + + // We always produce the same output at V <= 5. 6 adds a summary and + // 7 is a complete dump. + verbosity := 5 + for i := 7; i > verbosity; i-- { + if loggerV := logger.V(i); loggerV.Enabled() { + verbosity = i + break + } + } + return &poolsLogger{verbosity, pools} +} + +type poolsLogger struct { + verbosity int + pools []*Pool +} + +var _ logr.Marshaler = &poolsLogger{} + +func (p *poolsLogger) MarshalLog() any { + info := map[string]any{"count": len(p.pools)} + if p.verbosity == 6 { + meta := make([]map[string]any, len(p.pools)) + for i, pool := range p.pools { + meta[i] = map[string]any{ + "id": pool.PoolID.String(), + "isIncomplete": pool.IsIncomplete, + "isInvalid": pool.IsInvalid, + "InvalidReason": pool.InvalidReason, + } + } + info["meta"] = meta + info["devices"] = p.listDevices(maxDevicesLevel6) + } + if p.verbosity >= 7 { + info["devices"] = p.listDevices(-1) + info["content"] = p.pools + } + return info +} + +func (p *poolsLogger) listDevices(maxDevices int) []string { + var devices []string + for _, pool := range p.pools { + devices = p.addDevicesInSlices(devices, pool.PoolID, pool.DeviceSlicesTargetingNode, maxDevices) + devices = p.addDevicesInSlices(devices, pool.PoolID, pool.DeviceSlicesNotTargetingNode, maxDevices) + } + return devices +} + +func (p *poolsLogger) addDevicesInSlices(devices []string, poolID PoolID, slices []*draapi.ResourceSlice, maxDevices int) []string { + for _, slice := range slices { + for _, device := range slice.Spec.Devices { + if maxDevices != -1 && len(devices) >= maxDevices { + devices = append(devices, "...") + return devices + } + devices = append(devices, DeviceID{Driver: poolID.Driver, Pool: poolID.Pool, Device: device.Name}.String()) + } + } + return devices +} + +// logPools returns a handle for the value in a structured log call which +// includes varying amounts of information about the allocated devices, depending on +// the verbosity of the logger. +func logAllocatedDevices(logger klog.Logger, allocatedDevices sets.Set[DeviceID]) any { + // We need to check verbosity here because our caller's source code + // location may be relevant (-vmodule !). + helper, logger := logger.WithCallStackHelper() + helper() + + // We always produce the same output at V <= 5. 6 adds all IDs. + verbosity := 5 + for i := 7; i > verbosity; i-- { + if loggerV := logger.V(i); loggerV.Enabled() { + verbosity = i + break + } + } + + return &allocatedDevicesLogger{verbosity, allocatedDevices} +} + +type allocatedDevicesLogger struct { + verbosity int + devices sets.Set[DeviceID] +} + +var _ logr.Marshaler = &allocatedDevicesLogger{} + +func (a *allocatedDevicesLogger) MarshalLog() any { + info := map[string]any{"count": len(a.devices)} + if a.verbosity >= 6 { + ids := make([]string, 0, len(a.devices)) + for id := range a.devices { + if a.verbosity == 6 && len(ids) >= maxDevicesLevel6 { + ids = append(ids, "...") + break + } + ids = append(ids, id.String()) + } + slices.Sort(ids) + info["devices"] = ids + + } + return info +} diff --git a/test/e2e/dra/kind.yaml b/test/e2e/dra/kind.yaml index e2536ad2d77..15ae095c200 100644 --- a/test/e2e/dra/kind.yaml +++ b/test/e2e/dra/kind.yaml @@ -23,7 +23,7 @@ kubeadmConfigPatches: - name: "v" value: "5" - name: "vmodule" - value: "allocator=6,dynamicresources=6" # structured/allocator.go, DRA scheduler plugin + value: "allocator*=6,pools*=6,dynamicresources=6,allocateddevices=6,dra_manager=6,extendeddynamicresources=6" # structured/internal/*/allocator*.go, DRA scheduler plugin controllerManager: extraArgs: - name: "v" @@ -53,7 +53,7 @@ kubeadmConfigPatches: scheduler: extraArgs: v: "5" - vmodule: "allocator=6,dynamicresources=6" # structured/allocator.go, DRA scheduler plugin + vmodule: "allocator*=6,pools*=6,dynamicresources=6,allocateddevices=6,dra_manager=6,extendeddynamicresources=6" # structured/internal/*/allocator*.go, DRA scheduler plugin controllerManager: extraArgs: v: "5"