mirror of
https://github.com/kubernetes/kubernetes.git
synced 2026-05-28 04:04:39 -04:00
DRA: log more information
For debugging double allocation of the same device (https://github.com/kubernetes/kubernetes/issues/133602) it is necessary to have information about pools, devices and in-flight claims. Log calls get extended and the config for DRA CI jobs updated to enable higher verbosity for relevant source files. Log output in such a cluster at verbosity 6 looks like this: I1215 10:28:54.166872 1 allocator_incubating.go:130] "Gathered pool information" logger="FilterWithNominatedPods.Filter.DynamicResources" pod="dra-8841/tester-3" node="kind-worker2" pools={"count":1,"devices":["dra-8841.k8s.io/kind-worker2/device-00"],"meta":[{"InvalidReason":"","id":"dra-8841.k8s.io/kind-worker2","isIncomplete":false,"isInvalid":false}]} I1215 10:28:54.166941 1 allocator_incubating.go:254] "Gathered information about devices" logger="FilterWithNominatedPods.Filter.DynamicResources" pod="dra-8841/tester-3" node="kind-worker2" allocatedDevices={"count":2,"devices":["dra-8841.k8s.io/kind-worker/device-00","dra-8841.k8s.io/kind-worker3/device-00"]} minDevicesToBeAllocated=1
This commit is contained in:
parent
c180d6762d
commit
5d536bfb8e
7 changed files with 271 additions and 17 deletions
|
|
@ -969,7 +969,7 @@ func (pl *DynamicResources) Reserve(ctx context.Context, cs fwk.CycleState, pod
|
|||
if err != nil {
|
||||
return statusError(logger, fmt.Errorf("internal error, couldn't signal allocation for claim %s", claim.Name))
|
||||
}
|
||||
logger.V(5).Info("Reserved resource in allocation result", "claim", klog.KObj(claim), "allocation", klog.Format(allocation))
|
||||
logger.V(5).Info("Reserved resource in allocation result", "claim", klog.KObj(claim), "uid", claim.UID, "resourceVersion", claim.ResourceVersion, "allocation", klog.Format(allocation))
|
||||
allocIndex++
|
||||
}
|
||||
}
|
||||
|
|
@ -998,6 +998,7 @@ func (pl *DynamicResources) Unreserve(ctx context.Context, cs fwk.CycleState, po
|
|||
// If allocation was in-flight, then it's not anymore and we need to revert the
|
||||
// claim object in the assume cache to what it was before.
|
||||
if deleted := pl.draManager.ResourceClaims().RemoveClaimPendingAllocation(claim.UID); deleted {
|
||||
logger.V(5).Info("Released resource in allocation result", "claim", klog.KObj(claim), "uid", claim.UID, "resourceVersion", claim.ResourceVersion, "allocation", klog.Format(claim.Status.Allocation))
|
||||
pl.draManager.ResourceClaims().AssumedClaimRestore(claim.Namespace, claim.Name)
|
||||
}
|
||||
|
||||
|
|
@ -1138,7 +1139,14 @@ func (pl *DynamicResources) bindClaim(ctx context.Context, state *stateData, ind
|
|||
}
|
||||
if allocation != nil {
|
||||
for _, claimUID := range claimUIDs {
|
||||
pl.draManager.ResourceClaims().RemoveClaimPendingAllocation(claimUID)
|
||||
if deleted := pl.draManager.ResourceClaims().RemoveClaimPendingAllocation(claimUID); deleted {
|
||||
// Creating the claim may have failed.
|
||||
resourceVersion := ""
|
||||
if claim != nil {
|
||||
resourceVersion = claim.ResourceVersion
|
||||
}
|
||||
logger.V(5).Info("Released resource in allocation result", "claim", klog.KObj(claim), "uid", claimUID, "resourceVersion", resourceVersion, "allocation", klog.Format(allocation))
|
||||
}
|
||||
}
|
||||
}
|
||||
}()
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ godebug default=go1.25
|
|||
|
||||
require (
|
||||
github.com/blang/semver/v4 v4.0.0
|
||||
github.com/go-logr/logr v1.4.3
|
||||
github.com/google/cel-go v0.26.0
|
||||
github.com/google/go-cmp v0.7.0
|
||||
github.com/onsi/gomega v1.38.2
|
||||
|
|
@ -32,7 +33,6 @@ require (
|
|||
github.com/davecgh/go-spew v1.1.1 // indirect
|
||||
github.com/emicklei/go-restful/v3 v3.12.2 // indirect
|
||||
github.com/fxamacker/cbor/v2 v2.9.0 // indirect
|
||||
github.com/go-logr/logr v1.4.3 // indirect
|
||||
github.com/go-openapi/jsonpointer v0.21.0 // indirect
|
||||
github.com/go-openapi/jsonreference v0.20.2 // indirect
|
||||
github.com/go-openapi/swag v0.23.0 // indirect
|
||||
|
|
|
|||
|
|
@ -157,11 +157,7 @@ func (a *Allocator) Allocate(ctx context.Context, node *v1.Node, claims []*resou
|
|||
return nil, fmt.Errorf("gather pool information: %w", err)
|
||||
}
|
||||
alloc.pools = pools
|
||||
if loggerV := alloc.logger.V(7); loggerV.Enabled() {
|
||||
loggerV.Info("Gathered pool information", "numPools", len(pools), "pools", pools)
|
||||
} else {
|
||||
alloc.logger.V(5).Info("Gathered pool information", "numPools", len(pools))
|
||||
}
|
||||
alloc.logger.V(5).Info("Gathered pool information", "pools", logPools(alloc.logger, pools))
|
||||
|
||||
// We allocate one claim after the other and for each claim, all of
|
||||
// its requests. For each individual device we pick one possible
|
||||
|
|
@ -314,7 +310,7 @@ func (a *Allocator) Allocate(ctx context.Context, node *v1.Node, claims []*resou
|
|||
// We can estimate the size based on what we need to allocate.
|
||||
alloc.allocatingDevices = make(map[DeviceID]sets.Set[int], minDevicesTotal)
|
||||
|
||||
alloc.logger.V(6).Info("Gathered information about devices", "numAllocated", len(alloc.allocatedState.AllocatedDevices), "minDevicesToBeAllocated", minDevicesTotal)
|
||||
alloc.logger.V(6).Info("Gathered information about devices", "allocatedDevices", logAllocatedDevices(alloc.logger, alloc.allocatedState.AllocatedDevices), "minDevicesToBeAllocated", minDevicesTotal)
|
||||
|
||||
// In practice, there aren't going to be many different CEL
|
||||
// expressions. Most likely, there is going to be handful of different
|
||||
|
|
|
|||
|
|
@ -19,12 +19,16 @@ package experimental
|
|||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"slices"
|
||||
|
||||
"github.com/go-logr/logr"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
resourceapi "k8s.io/api/resource/v1"
|
||||
"k8s.io/apimachinery/pkg/util/sets"
|
||||
"k8s.io/component-helpers/scheduling/corev1/nodeaffinity"
|
||||
draapi "k8s.io/dynamic-resource-allocation/api"
|
||||
"k8s.io/klog/v2"
|
||||
"k8s.io/utils/ptr"
|
||||
)
|
||||
|
||||
|
|
@ -444,3 +448,126 @@ type PoolID struct {
|
|||
func (p PoolID) String() string {
|
||||
return p.Driver.String() + "/" + p.Pool.String()
|
||||
}
|
||||
|
||||
// At V(6), log only a limited number of devices to avoid blowing up logs. For
|
||||
// many E2E tests, 10 devices is enough for all devices without having to
|
||||
// truncate, at least when running the tests sequentially.
|
||||
const maxDevicesLevel6 = 10
|
||||
|
||||
// logPools returns a handle for the value in a structured log call which
|
||||
// includes varying amounts of information about the pools, depending on
|
||||
// the verbosity of the logger.
|
||||
func logPools(logger klog.Logger, pools []*Pool) any {
|
||||
// We need to check verbosity here because our caller's source code
|
||||
// location may be relevant (-vmodule !).
|
||||
helper, logger := logger.WithCallStackHelper()
|
||||
helper()
|
||||
|
||||
// We always produce the same output at V <= 5. 6 adds a summary and
|
||||
// 7 is a complete dump.
|
||||
verbosity := 5
|
||||
for i := 7; i > verbosity; i-- {
|
||||
if loggerV := logger.V(i); loggerV.Enabled() {
|
||||
verbosity = i
|
||||
break
|
||||
}
|
||||
}
|
||||
return &poolsLogger{verbosity, pools}
|
||||
}
|
||||
|
||||
type poolsLogger struct {
|
||||
verbosity int
|
||||
pools []*Pool
|
||||
}
|
||||
|
||||
var _ logr.Marshaler = &poolsLogger{}
|
||||
|
||||
func (p *poolsLogger) MarshalLog() any {
|
||||
info := map[string]any{"count": len(p.pools)}
|
||||
if p.verbosity == 6 {
|
||||
meta := make([]map[string]any, len(p.pools))
|
||||
for i, pool := range p.pools {
|
||||
meta[i] = map[string]any{
|
||||
"id": pool.PoolID.String(),
|
||||
"isIncomplete": pool.IsIncomplete,
|
||||
"isInvalid": pool.IsInvalid,
|
||||
"InvalidReason": pool.InvalidReason,
|
||||
}
|
||||
}
|
||||
info["meta"] = meta
|
||||
info["devices"] = p.listDevices(maxDevicesLevel6)
|
||||
}
|
||||
if p.verbosity >= 7 {
|
||||
info["devices"] = p.listDevices(-1)
|
||||
info["content"] = p.pools
|
||||
}
|
||||
return info
|
||||
}
|
||||
|
||||
func (p *poolsLogger) listDevices(maxDevices int) []string {
|
||||
var devices []string
|
||||
for _, pool := range p.pools {
|
||||
devices = p.addDevicesInSlices(devices, pool.PoolID, pool.DeviceSlicesTargetingNode, maxDevices)
|
||||
devices = p.addDevicesInSlices(devices, pool.PoolID, pool.DeviceSlicesNotTargetingNode, maxDevices)
|
||||
}
|
||||
return devices
|
||||
}
|
||||
|
||||
func (p *poolsLogger) addDevicesInSlices(devices []string, poolID PoolID, slices []*draapi.ResourceSlice, maxDevices int) []string {
|
||||
for _, slice := range slices {
|
||||
for _, device := range slice.Spec.Devices {
|
||||
if maxDevices != -1 && len(devices) >= maxDevices {
|
||||
devices = append(devices, "...")
|
||||
return devices
|
||||
}
|
||||
devices = append(devices, DeviceID{Driver: poolID.Driver, Pool: poolID.Pool, Device: device.Name}.String())
|
||||
}
|
||||
}
|
||||
return devices
|
||||
}
|
||||
|
||||
// logPools returns a handle for the value in a structured log call which
|
||||
// includes varying amounts of information about the allocated devices, depending on
|
||||
// the verbosity of the logger.
|
||||
func logAllocatedDevices(logger klog.Logger, allocatedDevices sets.Set[DeviceID]) any {
|
||||
// We need to check verbosity here because our caller's source code
|
||||
// location may be relevant (-vmodule !).
|
||||
helper, logger := logger.WithCallStackHelper()
|
||||
helper()
|
||||
|
||||
// We always produce the same output at V <= 5. 6 adds all IDs.
|
||||
verbosity := 5
|
||||
for i := 7; i > verbosity; i-- {
|
||||
if loggerV := logger.V(i); loggerV.Enabled() {
|
||||
verbosity = i
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
return &allocatedDevicesLogger{verbosity, allocatedDevices}
|
||||
}
|
||||
|
||||
type allocatedDevicesLogger struct {
|
||||
verbosity int
|
||||
devices sets.Set[DeviceID]
|
||||
}
|
||||
|
||||
var _ logr.Marshaler = &allocatedDevicesLogger{}
|
||||
|
||||
func (a *allocatedDevicesLogger) MarshalLog() any {
|
||||
info := map[string]any{"count": len(a.devices)}
|
||||
if a.verbosity >= 6 {
|
||||
ids := make([]string, 0, len(a.devices))
|
||||
for id := range a.devices {
|
||||
if a.verbosity == 6 && len(ids) >= maxDevicesLevel6 {
|
||||
ids = append(ids, "...")
|
||||
break
|
||||
}
|
||||
ids = append(ids, id.String())
|
||||
}
|
||||
slices.Sort(ids)
|
||||
info["devices"] = ids
|
||||
|
||||
}
|
||||
return info
|
||||
}
|
||||
|
|
|
|||
|
|
@ -127,11 +127,7 @@ func (a *Allocator) Allocate(ctx context.Context, node *v1.Node, claims []*resou
|
|||
return nil, fmt.Errorf("gather pool information: %w", err)
|
||||
}
|
||||
alloc.pools = pools
|
||||
if loggerV := alloc.logger.V(7); loggerV.Enabled() {
|
||||
loggerV.Info("Gathered pool information", "numPools", len(pools), "pools", pools)
|
||||
} else {
|
||||
alloc.logger.V(5).Info("Gathered pool information", "numPools", len(pools))
|
||||
}
|
||||
alloc.logger.V(5).Info("Gathered pool information", "pools", logPools(alloc.logger, pools))
|
||||
|
||||
// We allocate one claim after the other and for each claim, all of
|
||||
// its requests. For each individual device we pick one possible
|
||||
|
|
@ -255,7 +251,7 @@ func (a *Allocator) Allocate(ctx context.Context, node *v1.Node, claims []*resou
|
|||
// We can estimate the size based on what we need to allocate.
|
||||
alloc.allocatingDevices = make(map[DeviceID]sets.Set[int], minDevicesTotal)
|
||||
|
||||
alloc.logger.V(6).Info("Gathered information about devices", "numAllocated", len(alloc.allocatedDevices), "minDevicesToBeAllocated", minDevicesTotal)
|
||||
alloc.logger.V(5).Info("Gathered information about devices", "allocatedDevices", logAllocatedDevices(alloc.logger, alloc.allocatedDevices), "minDevicesToBeAllocated", minDevicesTotal)
|
||||
|
||||
// In practice, there aren't going to be many different CEL
|
||||
// expressions. Most likely, there is going to be handful of different
|
||||
|
|
|
|||
|
|
@ -19,12 +19,16 @@ package incubating
|
|||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"slices"
|
||||
|
||||
"github.com/go-logr/logr"
|
||||
|
||||
v1 "k8s.io/api/core/v1"
|
||||
resourceapi "k8s.io/api/resource/v1"
|
||||
"k8s.io/apimachinery/pkg/util/sets"
|
||||
"k8s.io/component-helpers/scheduling/corev1/nodeaffinity"
|
||||
draapi "k8s.io/dynamic-resource-allocation/api"
|
||||
"k8s.io/klog/v2"
|
||||
"k8s.io/utils/ptr"
|
||||
)
|
||||
|
||||
|
|
@ -390,3 +394,126 @@ type PoolID struct {
|
|||
func (p PoolID) String() string {
|
||||
return p.Driver.String() + "/" + p.Pool.String()
|
||||
}
|
||||
|
||||
// At V(6), log only a limited number of devices to avoid blowing up logs. For
|
||||
// many E2E tests, 10 devices is enough for all devices without having to
|
||||
// truncate, at least when running the tests sequentially.
|
||||
const maxDevicesLevel6 = 10
|
||||
|
||||
// logPools returns a handle for the value in a structured log call which
|
||||
// includes varying amounts of information about the pools, depending on
|
||||
// the verbosity of the logger.
|
||||
func logPools(logger klog.Logger, pools []*Pool) any {
|
||||
// We need to check verbosity here because our caller's source code
|
||||
// location may be relevant (-vmodule !).
|
||||
helper, logger := logger.WithCallStackHelper()
|
||||
helper()
|
||||
|
||||
// We always produce the same output at V <= 5. 6 adds a summary and
|
||||
// 7 is a complete dump.
|
||||
verbosity := 5
|
||||
for i := 7; i > verbosity; i-- {
|
||||
if loggerV := logger.V(i); loggerV.Enabled() {
|
||||
verbosity = i
|
||||
break
|
||||
}
|
||||
}
|
||||
return &poolsLogger{verbosity, pools}
|
||||
}
|
||||
|
||||
type poolsLogger struct {
|
||||
verbosity int
|
||||
pools []*Pool
|
||||
}
|
||||
|
||||
var _ logr.Marshaler = &poolsLogger{}
|
||||
|
||||
func (p *poolsLogger) MarshalLog() any {
|
||||
info := map[string]any{"count": len(p.pools)}
|
||||
if p.verbosity == 6 {
|
||||
meta := make([]map[string]any, len(p.pools))
|
||||
for i, pool := range p.pools {
|
||||
meta[i] = map[string]any{
|
||||
"id": pool.PoolID.String(),
|
||||
"isIncomplete": pool.IsIncomplete,
|
||||
"isInvalid": pool.IsInvalid,
|
||||
"InvalidReason": pool.InvalidReason,
|
||||
}
|
||||
}
|
||||
info["meta"] = meta
|
||||
info["devices"] = p.listDevices(maxDevicesLevel6)
|
||||
}
|
||||
if p.verbosity >= 7 {
|
||||
info["devices"] = p.listDevices(-1)
|
||||
info["content"] = p.pools
|
||||
}
|
||||
return info
|
||||
}
|
||||
|
||||
func (p *poolsLogger) listDevices(maxDevices int) []string {
|
||||
var devices []string
|
||||
for _, pool := range p.pools {
|
||||
devices = p.addDevicesInSlices(devices, pool.PoolID, pool.DeviceSlicesTargetingNode, maxDevices)
|
||||
devices = p.addDevicesInSlices(devices, pool.PoolID, pool.DeviceSlicesNotTargetingNode, maxDevices)
|
||||
}
|
||||
return devices
|
||||
}
|
||||
|
||||
func (p *poolsLogger) addDevicesInSlices(devices []string, poolID PoolID, slices []*draapi.ResourceSlice, maxDevices int) []string {
|
||||
for _, slice := range slices {
|
||||
for _, device := range slice.Spec.Devices {
|
||||
if maxDevices != -1 && len(devices) >= maxDevices {
|
||||
devices = append(devices, "...")
|
||||
return devices
|
||||
}
|
||||
devices = append(devices, DeviceID{Driver: poolID.Driver, Pool: poolID.Pool, Device: device.Name}.String())
|
||||
}
|
||||
}
|
||||
return devices
|
||||
}
|
||||
|
||||
// logPools returns a handle for the value in a structured log call which
|
||||
// includes varying amounts of information about the allocated devices, depending on
|
||||
// the verbosity of the logger.
|
||||
func logAllocatedDevices(logger klog.Logger, allocatedDevices sets.Set[DeviceID]) any {
|
||||
// We need to check verbosity here because our caller's source code
|
||||
// location may be relevant (-vmodule !).
|
||||
helper, logger := logger.WithCallStackHelper()
|
||||
helper()
|
||||
|
||||
// We always produce the same output at V <= 5. 6 adds all IDs.
|
||||
verbosity := 5
|
||||
for i := 7; i > verbosity; i-- {
|
||||
if loggerV := logger.V(i); loggerV.Enabled() {
|
||||
verbosity = i
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
return &allocatedDevicesLogger{verbosity, allocatedDevices}
|
||||
}
|
||||
|
||||
type allocatedDevicesLogger struct {
|
||||
verbosity int
|
||||
devices sets.Set[DeviceID]
|
||||
}
|
||||
|
||||
var _ logr.Marshaler = &allocatedDevicesLogger{}
|
||||
|
||||
func (a *allocatedDevicesLogger) MarshalLog() any {
|
||||
info := map[string]any{"count": len(a.devices)}
|
||||
if a.verbosity >= 6 {
|
||||
ids := make([]string, 0, len(a.devices))
|
||||
for id := range a.devices {
|
||||
if a.verbosity == 6 && len(ids) >= maxDevicesLevel6 {
|
||||
ids = append(ids, "...")
|
||||
break
|
||||
}
|
||||
ids = append(ids, id.String())
|
||||
}
|
||||
slices.Sort(ids)
|
||||
info["devices"] = ids
|
||||
|
||||
}
|
||||
return info
|
||||
}
|
||||
|
|
|
|||
|
|
@ -23,7 +23,7 @@ kubeadmConfigPatches:
|
|||
- name: "v"
|
||||
value: "5"
|
||||
- name: "vmodule"
|
||||
value: "allocator=6,dynamicresources=6" # structured/allocator.go, DRA scheduler plugin
|
||||
value: "allocator*=6,pools*=6,dynamicresources=6,allocateddevices=6,dra_manager=6,extendeddynamicresources=6" # structured/internal/*/allocator*.go, DRA scheduler plugin
|
||||
controllerManager:
|
||||
extraArgs:
|
||||
- name: "v"
|
||||
|
|
@ -53,7 +53,7 @@ kubeadmConfigPatches:
|
|||
scheduler:
|
||||
extraArgs:
|
||||
v: "5"
|
||||
vmodule: "allocator=6,dynamicresources=6" # structured/allocator.go, DRA scheduler plugin
|
||||
vmodule: "allocator*=6,pools*=6,dynamicresources=6,allocateddevices=6,dra_manager=6,extendeddynamicresources=6" # structured/internal/*/allocator*.go, DRA scheduler plugin
|
||||
controllerManager:
|
||||
extraArgs:
|
||||
v: "5"
|
||||
|
|
|
|||
Loading…
Reference in a new issue