From 30b811a99bd9556cb789e01a69ba191d79f13cec Mon Sep 17 00:00:00 2001 From: Tsubasa Watanabe Date: Fri, 27 Feb 2026 13:36:54 +0900 Subject: [PATCH] DRA Device Binding Conditions: add metrics for prebind flow This commit introduces metrics and improves log outputs for DRA Device Binding Conditions (KEP-5007): - scheduler_dra_bindingconditions_allocations_total Counts the number of per-device scheduling attempts during PreBind where BindingConditions are in use - scheduler_dra_bindingconditions_wait_duration_seconds Observes the time spent waiting for BindingConditions to be satisfied during PreBind. Signed-off-by: Tsubasa Watanabe --- .../dynamicresources/dynamicresources.go | 189 +++++++++++++++++- .../dynamicresources/dynamicresources_test.go | 68 ++++++- pkg/scheduler/metrics/metrics.go | 38 ++++ 3 files changed, 288 insertions(+), 7 deletions(-) diff --git a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go index ab8b087bf38..46db7e45c09 100644 --- a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go +++ b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources.go @@ -50,6 +50,7 @@ import ( "k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/helper" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/names" + schedmetrics "k8s.io/kubernetes/pkg/scheduler/metrics" schedutil "k8s.io/kubernetes/pkg/scheduler/util" "k8s.io/utils/ptr" ) @@ -146,6 +147,11 @@ type DynamicResources struct { draManager fwk.SharedDRAManager } +var ( + ErrDeviceBindingTimeout = errors.New("device binding timeout") + ErrDeviceBindingFailed = errors.New("device binding failed") +) + // New initializes a new plugin and returns it. func New(ctx context.Context, plArgs runtime.Object, fh fwk.Handle, fts feature.Features) (fwk.Plugin, error) { if !fts.EnableDynamicResourceAllocation { @@ -1095,18 +1101,61 @@ func (pl *DynamicResources) PreBind(ctx context.Context, cs fwk.CycleState, pod // We need to wait for the device to be attached to the node. pl.fh.EventRecorder().Eventf(pod, nil, v1.EventTypeNormal, "BindingConditionsPending", "Scheduling", "waiting for binding conditions for device on node %s", nodeName) + // START: Record start time for metrics duration calculation + start := time.Now() err = wait.PollUntilContextTimeout(ctx, 5*time.Second, pl.bindingTimeout, true, func(ctx context.Context) (bool, error) { return pl.isPodReadyForBinding(state) }) - if err != nil { - if errors.Is(err, context.DeadlineExceeded) { - err = errors.New("device binding timeout") + // GOAL: Calculate duration since start for metrics recording + duration := time.Since(start) + if errors.Is(err, context.DeadlineExceeded) { + err = fmt.Errorf("%w: %w", ErrDeviceBindingTimeout, err) + } + + // Determine status label based on error + statusLabel := schedmetrics.BindingConditionsStatusSuccess + switch { + case err == nil: + logger.V(5).Info("BindingConditions is met", + "pod", klog.KObj(pod), + "node", nodeName, + "devices", bindingConditionsStatusFormatter{state, statusLabel}, + ) + // keep success + case errors.Is(err, ErrDeviceBindingTimeout): + statusLabel = schedmetrics.BindingConditionsStatusTimeout + case errors.Is(err, ErrDeviceBindingFailed): + statusLabel = schedmetrics.BindingConditionsStatusFailed + default: + statusLabel = schedmetrics.BindingConditionsStatusError + } + + // Record metrics for binding conditions outcome (success/failure/timeout/error) + profileLabel := pl.fh.ProfileName() + for _, claim := range state.claims.all() { + if claim.Status.Allocation == nil { + continue } + for _, res := range claim.Status.Allocation.Devices.Results { + schedmetrics.DRABindingConditionsPreBindDuration. + WithLabelValues(profileLabel, res.Driver, statusLabel). + Observe(duration.Seconds()) + } + + // Count scheduling attempts that used devices with BindingConditions. + for _, bcStatus := range listBindingConditionsStatus(claim) { + schedmetrics.DRABindingConditionsAllocationsTotal. + WithLabelValues(profileLabel, bcStatus.Driver, statusLabel). + Inc() + } + } + + if err != nil { // Returning an error here causes another scheduling attempt. // In that next attempt, PreFilter will detect the timeout or // error and try to recover. - return statusError(logger, err) + return statusError(logger, err, "devices", formatBCStatusOneLine(getBindingConditionsStatusForStatus(state, statusLabel))) } // If we get here, we know that reserving the claim for @@ -1307,7 +1356,8 @@ func (pl *DynamicResources) isClaimReadyForBinding(claim *resourceapi.ResourceCl for _, cond := range deviceRequest.BindingFailureConditions { failedCond := apimeta.FindStatusCondition(deviceStatus.Conditions, cond) if failedCond != nil && failedCond.Status == metav1.ConditionTrue { - return false, fmt.Errorf("claim %s binding failed: reason=%s, message=%q", + return false, fmt.Errorf("%w: claim=%s, reason=%s, message=%q", + ErrDeviceBindingFailed, claim.Name, failedCond.Reason, failedCond.Message) @@ -1363,7 +1413,7 @@ func (pl *DynamicResources) isPodReadyForBinding(state *stateData) (bool, error) } if !ready { if pl.isClaimTimeout(claim) { - return false, fmt.Errorf("claim %s binding timeout", claim.Name) + return false, fmt.Errorf("%w: claim=%s", ErrDeviceBindingTimeout, claim.Name) } return false, nil } @@ -1422,3 +1472,130 @@ func getAllocatedDeviceStatus(claim *resourceapi.ResourceClaim, deviceRequest *r } return nil } + +// bindingConditionsStatusFormatter implements fmt.Stringer for lazy evaluation of device status formatting. +// This avoids expensive processing when logging is disabled. +type bindingConditionsStatusFormatter struct { + state *stateData + statusLabel string +} + +func (d bindingConditionsStatusFormatter) String() string { + return formatBCStatusOneLine(getBindingConditionsStatusForStatus(d.state, d.statusLabel)) +} + +// BindingConditionsStatus is a compact per-device summary for BindingConditions logging. +type BindingConditionsStatus struct { + Driver string `json:"driver"` + Pool string `json:"pool"` + Device string `json:"device"` + Pending []string `json:"pending,omitempty"` + Failed []string `json:"failed,omitempty"` // only failure conditions that are true +} + +// listBindingConditionsStatus inspects ONE claim and returns a summary per allocated device +// that has BindingConditions. +// - Pending: BindingConditions that are not yet true (or all conditions if device status is missing) +// - Failed: BindingFailureConditions that are true +func listBindingConditionsStatus(claim *resourceapi.ResourceClaim) []BindingConditionsStatus { + if claim == nil || claim.Status.Allocation == nil { + return nil + } + + var out []BindingConditionsStatus + for _, res := range claim.Status.Allocation.Devices.Results { + if len(res.BindingConditions) == 0 { + continue + } + + sum := BindingConditionsStatus{ + Driver: res.Driver, + Pool: res.Pool, + Device: res.Device, + } + + ds := getAllocatedDeviceStatus(claim, &res) + if ds == nil { + // No status yet => all BindingConditions are pending. + sum.Pending = append(sum.Pending, res.BindingConditions...) + out = append(out, sum) + continue + } + + // Failed conditions that are true. + for _, c := range res.BindingFailureConditions { + if apimeta.IsStatusConditionTrue(ds.Conditions, c) { + sum.Failed = append(sum.Failed, c) + } + } + + // Pending conditions: those not yet true. + for _, c := range res.BindingConditions { + if !apimeta.IsStatusConditionTrue(ds.Conditions, c) { + sum.Pending = append(sum.Pending, c) + } + } + + out = append(out, sum) + } + + return out +} + +// getBindingConditionsStatusForStatus selects which devices to log across ALL claims in the state. +func getBindingConditionsStatusForStatus(state *stateData, statusLabel string) []BindingConditionsStatus { + var out []BindingConditionsStatus + for _, claim := range state.claims.all() { + for _, s := range listBindingConditionsStatus(claim) { + switch statusLabel { + case schedmetrics.BindingConditionsStatusSuccess: + // success: show all BC devices + if len(s.Failed) == 0 && len(s.Pending) == 0 { + out = append(out, s) + } + case schedmetrics.BindingConditionsStatusFailed: + // failure: only devices that have at least one failure condition true + if len(s.Failed) > 0 { + out = append(out, BindingConditionsStatus{ + Driver: s.Driver, + Pool: s.Pool, + Device: s.Device, + Failed: append([]string(nil), s.Failed...), + }) + } + + case schedmetrics.BindingConditionsStatusTimeout: + // timeout/error: only devices still pending, and not failed + if len(s.Pending) > 0 && len(s.Failed) == 0 { + out = append(out, s) + } + + default: + // Treat unknown like timeout/error + if len(s.Pending) > 0 && len(s.Failed) == 0 { + out = append(out, s) + } + } + } + } + return out +} + +// formatBCStatusOneLine formats the selected devices into a single log-friendly line. +func formatBCStatusOneLine(devs []BindingConditionsStatus) string { + if len(devs) == 0 { + return "" + } + parts := make([]string, 0, len(devs)) + for _, d := range devs { + base := fmt.Sprintf(" %s", d.Driver, d.Pool, d.Device) + if len(d.Failed) > 0 { + base += fmt.Sprintf(" failed=[%s]", strings.Join(d.Failed, ",")) + } + if len(d.Pending) > 0 { + base += fmt.Sprintf(" pending=[%s]", strings.Join(d.Pending, ",")) + } + parts = append(parts, base) + } + return strings.Join(parts, "; ") +} diff --git a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources_test.go b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources_test.go index a4ab58803e0..17a0a3a23e5 100644 --- a/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources_test.go +++ b/pkg/scheduler/framework/plugins/dynamicresources/dynamicresources_test.go @@ -2334,6 +2334,35 @@ func testPlugin(tCtx ktesting.TContext) { status: nil, }, }, + metrics: func(tCtx ktesting.TContext, g compbasemetrics.Gatherer) { + // Counter: allocations_total should have exactly one event + allocs, err := testutil.GetCounterValuesFromGatherer( + g, + "scheduler_dra_bindingconditions_allocations_total", + map[string]string{ + "status": "success", + }, + "driver", // group by driver label + ) + require.NoError(tCtx, err) + + var totalAllocs float64 + for _, v := range allocs { + totalAllocs += v + } + require.InEpsilon(tCtx, float64(1), totalAllocs, 0.1, "expected exactly one successful allocation with BindingConditions") + + // Histogram: one success sample with requires_bindingconditions=true + hist, err := testutil.GetHistogramVecFromGatherer( + g, + "scheduler_dra_bindingconditions_wait_duration_seconds", + map[string]string{ + "status": "success", + }, + ) + require.NoError(tCtx, err) + require.Equal(tCtx, uint64(1), hist.GetAggregatedSampleCount(), "expected one success sample in wait duration histogram") + }, }, "bound-claim-with-failed-binding": { enableDRADeviceBindingConditions: true, @@ -2452,9 +2481,38 @@ func testPlugin(tCtx ktesting.TContext) { }, }, // From isPodReadyForBinding. - status: fwk.AsStatus(errors.New("claim " + claim.Name + " binding timeout")), + status: fwk.AsStatus(fmt.Errorf("%w: claim=%s", ErrDeviceBindingTimeout, claim.Name)), }, }, + metrics: func(tCtx ktesting.TContext, g compbasemetrics.Gatherer) { + // Counter: timeouts_total should have exactly one event + timeouts, err := testutil.GetCounterValuesFromGatherer( + g, + "scheduler_dra_bindingconditions_allocations_total", + map[string]string{ + "status": "timeout", + }, + "driver", + ) + require.NoError(tCtx, err) + + var totalTimeouts float64 + for _, v := range timeouts { + totalTimeouts += v + } + require.InEpsilon(tCtx, float64(1), totalTimeouts, 0.1, "expected exactly one timeout with BindingConditions") + + // Histogram: one timeout sample with requires_bindingconditions=true + hist, err := testutil.GetHistogramVecFromGatherer( + g, + "scheduler_dra_bindingconditions_wait_duration_seconds", + map[string]string{ + "status": "timeout", + }, + ) + require.NoError(tCtx, err) + require.Equal(tCtx, uint64(1), hist.GetAggregatedSampleCount(), "expected one timeout sample in wait duration histogram") + }, }, "bound-claim-with-mixed-binding-conditions": { enableDRADeviceBindingConditions: true, @@ -2942,6 +3000,14 @@ func setupMetrics(features feature.Features) compbasemetrics.KubeRegistry { testRegistry.MustRegister(metrics.ResourceClaimCreatesTotal) metrics.ResourceClaimCreatesTotal.Reset() } + // DRA DeviceBindingConditions metrics. + if features.EnableDRADeviceBindingConditions { + testRegistry.MustRegister(metrics.DRABindingConditionsAllocationsTotal) + testRegistry.MustRegister(metrics.DRABindingConditionsPreBindDuration) + + metrics.DRABindingConditionsAllocationsTotal.Reset() + metrics.DRABindingConditionsPreBindDuration.Reset() + } return testRegistry } diff --git a/pkg/scheduler/metrics/metrics.go b/pkg/scheduler/metrics/metrics.go index 19f16013472..2dfa551f1bb 100644 --- a/pkg/scheduler/metrics/metrics.go +++ b/pkg/scheduler/metrics/metrics.go @@ -115,6 +115,14 @@ const ( BatchFlushPodNotBatchable = "pod_not_batchable" ) +// DRADeviceBindingConditions status labels +const ( + BindingConditionsStatusSuccess = "success" + BindingConditionsStatusFailed = "failure" + BindingConditionsStatusTimeout = "timeout" + BindingConditionsStatusError = "error" +) + // All the histogram based metrics have 1ms as size for the smallest bucket. var ( scheduleAttempts *metrics.CounterVec @@ -161,6 +169,9 @@ var ( podGroupScheduleAttempts *metrics.CounterVec podGroupSchedulingLatency *metrics.HistogramVec PodGroupSchedulingAlgorithmLatency *metrics.Histogram + // The below are only available when the DRADeviceBindingConditions feature gate is enabled. + DRABindingConditionsAllocationsTotal *metrics.CounterVec + DRABindingConditionsPreBindDuration *metrics.HistogramVec // metricsList is a list of all metrics that should be registered always, regardless of any feature gate's value. metricsList []metrics.Registerable @@ -199,6 +210,12 @@ func Register() { PodGroupSchedulingAlgorithmLatency, ) } + if utilfeature.DefaultFeatureGate.Enabled(features.DRADeviceBindingConditions) { + RegisterMetrics( + DRABindingConditionsAllocationsTotal, + DRABindingConditionsPreBindDuration, + ) + } }) } @@ -453,6 +470,27 @@ func InitMetrics() { }, []string{"status"}) + DRABindingConditionsAllocationsTotal = metrics.NewCounterVec( + &metrics.CounterOpts{ + Subsystem: SchedulerSubsystem, + Name: "dra_bindingconditions_allocations_total", + Help: "Number of allocations using devices with BindingConditions, counted per driver per scheduling attempt", + StabilityLevel: metrics.ALPHA, + }, + []string{"profile", "driver", "status"}, + ) + + DRABindingConditionsPreBindDuration = metrics.NewHistogramVec( + &metrics.HistogramOpts{ + Subsystem: SchedulerSubsystem, + Name: "dra_bindingconditions_wait_duration_seconds", + Help: "Time in seconds spent waiting for BindingConditions to be satisfied during PreBind.", + Buckets: metrics.ExponentialBuckets(0.1, 2, 14), + StabilityLevel: metrics.ALPHA, + }, + []string{"profile", "driver", "status"}, + ) + GetNodeHintDuration = metrics.NewHistogramVec( &metrics.HistogramOpts{ Subsystem: SchedulerSubsystem,