Merge pull request #137284 from ttsuuubasa/dra-dbc-metrics

DRA Device Binding Conditions: add metrics for prebind flow
This commit is contained in:
Kubernetes Prow Robot 2026-03-12 15:31:37 +05:30 committed by GitHub
commit 03d1c996c7
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
3 changed files with 288 additions and 7 deletions

View file

@ -50,6 +50,7 @@ import (
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/feature"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/helper"
"k8s.io/kubernetes/pkg/scheduler/framework/plugins/names"
schedmetrics "k8s.io/kubernetes/pkg/scheduler/metrics"
schedutil "k8s.io/kubernetes/pkg/scheduler/util"
"k8s.io/utils/ptr"
)
@ -146,6 +147,11 @@ type DynamicResources struct {
draManager fwk.SharedDRAManager
}
var (
ErrDeviceBindingTimeout = errors.New("device binding timeout")
ErrDeviceBindingFailed = errors.New("device binding failed")
)
// New initializes a new plugin and returns it.
func New(ctx context.Context, plArgs runtime.Object, fh fwk.Handle, fts feature.Features) (fwk.Plugin, error) {
if !fts.EnableDynamicResourceAllocation {
@ -1095,18 +1101,61 @@ func (pl *DynamicResources) PreBind(ctx context.Context, cs fwk.CycleState, pod
// We need to wait for the device to be attached to the node.
pl.fh.EventRecorder().Eventf(pod, nil, v1.EventTypeNormal, "BindingConditionsPending", "Scheduling", "waiting for binding conditions for device on node %s", nodeName)
// START: Record start time for metrics duration calculation
start := time.Now()
err = wait.PollUntilContextTimeout(ctx, 5*time.Second, pl.bindingTimeout, true,
func(ctx context.Context) (bool, error) {
return pl.isPodReadyForBinding(state)
})
if err != nil {
if errors.Is(err, context.DeadlineExceeded) {
err = errors.New("device binding timeout")
// GOAL: Calculate duration since start for metrics recording
duration := time.Since(start)
if errors.Is(err, context.DeadlineExceeded) {
err = fmt.Errorf("%w: %w", ErrDeviceBindingTimeout, err)
}
// Determine status label based on error
statusLabel := schedmetrics.BindingConditionsStatusSuccess
switch {
case err == nil:
logger.V(5).Info("BindingConditions is met",
"pod", klog.KObj(pod),
"node", nodeName,
"devices", bindingConditionsStatusFormatter{state, statusLabel},
)
// keep success
case errors.Is(err, ErrDeviceBindingTimeout):
statusLabel = schedmetrics.BindingConditionsStatusTimeout
case errors.Is(err, ErrDeviceBindingFailed):
statusLabel = schedmetrics.BindingConditionsStatusFailed
default:
statusLabel = schedmetrics.BindingConditionsStatusError
}
// Record metrics for binding conditions outcome (success/failure/timeout/error)
profileLabel := pl.fh.ProfileName()
for _, claim := range state.claims.all() {
if claim.Status.Allocation == nil {
continue
}
for _, res := range claim.Status.Allocation.Devices.Results {
schedmetrics.DRABindingConditionsPreBindDuration.
WithLabelValues(profileLabel, res.Driver, statusLabel).
Observe(duration.Seconds())
}
// Count scheduling attempts that used devices with BindingConditions.
for _, bcStatus := range listBindingConditionsStatus(claim) {
schedmetrics.DRABindingConditionsAllocationsTotal.
WithLabelValues(profileLabel, bcStatus.Driver, statusLabel).
Inc()
}
}
if err != nil {
// Returning an error here causes another scheduling attempt.
// In that next attempt, PreFilter will detect the timeout or
// error and try to recover.
return statusError(logger, err)
return statusError(logger, err, "devices", formatBCStatusOneLine(getBindingConditionsStatusForStatus(state, statusLabel)))
}
// If we get here, we know that reserving the claim for
@ -1307,7 +1356,8 @@ func (pl *DynamicResources) isClaimReadyForBinding(claim *resourceapi.ResourceCl
for _, cond := range deviceRequest.BindingFailureConditions {
failedCond := apimeta.FindStatusCondition(deviceStatus.Conditions, cond)
if failedCond != nil && failedCond.Status == metav1.ConditionTrue {
return false, fmt.Errorf("claim %s binding failed: reason=%s, message=%q",
return false, fmt.Errorf("%w: claim=%s, reason=%s, message=%q",
ErrDeviceBindingFailed,
claim.Name,
failedCond.Reason,
failedCond.Message)
@ -1363,7 +1413,7 @@ func (pl *DynamicResources) isPodReadyForBinding(state *stateData) (bool, error)
}
if !ready {
if pl.isClaimTimeout(claim) {
return false, fmt.Errorf("claim %s binding timeout", claim.Name)
return false, fmt.Errorf("%w: claim=%s", ErrDeviceBindingTimeout, claim.Name)
}
return false, nil
}
@ -1422,3 +1472,130 @@ func getAllocatedDeviceStatus(claim *resourceapi.ResourceClaim, deviceRequest *r
}
return nil
}
// bindingConditionsStatusFormatter implements fmt.Stringer for lazy evaluation of device status formatting.
// This avoids expensive processing when logging is disabled.
type bindingConditionsStatusFormatter struct {
state *stateData
statusLabel string
}
func (d bindingConditionsStatusFormatter) String() string {
return formatBCStatusOneLine(getBindingConditionsStatusForStatus(d.state, d.statusLabel))
}
// BindingConditionsStatus is a compact per-device summary for BindingConditions logging.
type BindingConditionsStatus struct {
Driver string `json:"driver"`
Pool string `json:"pool"`
Device string `json:"device"`
Pending []string `json:"pending,omitempty"`
Failed []string `json:"failed,omitempty"` // only failure conditions that are true
}
// listBindingConditionsStatus inspects ONE claim and returns a summary per allocated device
// that has BindingConditions.
// - Pending: BindingConditions that are not yet true (or all conditions if device status is missing)
// - Failed: BindingFailureConditions that are true
func listBindingConditionsStatus(claim *resourceapi.ResourceClaim) []BindingConditionsStatus {
if claim == nil || claim.Status.Allocation == nil {
return nil
}
var out []BindingConditionsStatus
for _, res := range claim.Status.Allocation.Devices.Results {
if len(res.BindingConditions) == 0 {
continue
}
sum := BindingConditionsStatus{
Driver: res.Driver,
Pool: res.Pool,
Device: res.Device,
}
ds := getAllocatedDeviceStatus(claim, &res)
if ds == nil {
// No status yet => all BindingConditions are pending.
sum.Pending = append(sum.Pending, res.BindingConditions...)
out = append(out, sum)
continue
}
// Failed conditions that are true.
for _, c := range res.BindingFailureConditions {
if apimeta.IsStatusConditionTrue(ds.Conditions, c) {
sum.Failed = append(sum.Failed, c)
}
}
// Pending conditions: those not yet true.
for _, c := range res.BindingConditions {
if !apimeta.IsStatusConditionTrue(ds.Conditions, c) {
sum.Pending = append(sum.Pending, c)
}
}
out = append(out, sum)
}
return out
}
// getBindingConditionsStatusForStatus selects which devices to log across ALL claims in the state.
func getBindingConditionsStatusForStatus(state *stateData, statusLabel string) []BindingConditionsStatus {
var out []BindingConditionsStatus
for _, claim := range state.claims.all() {
for _, s := range listBindingConditionsStatus(claim) {
switch statusLabel {
case schedmetrics.BindingConditionsStatusSuccess:
// success: show all BC devices
if len(s.Failed) == 0 && len(s.Pending) == 0 {
out = append(out, s)
}
case schedmetrics.BindingConditionsStatusFailed:
// failure: only devices that have at least one failure condition true
if len(s.Failed) > 0 {
out = append(out, BindingConditionsStatus{
Driver: s.Driver,
Pool: s.Pool,
Device: s.Device,
Failed: append([]string(nil), s.Failed...),
})
}
case schedmetrics.BindingConditionsStatusTimeout:
// timeout/error: only devices still pending, and not failed
if len(s.Pending) > 0 && len(s.Failed) == 0 {
out = append(out, s)
}
default:
// Treat unknown like timeout/error
if len(s.Pending) > 0 && len(s.Failed) == 0 {
out = append(out, s)
}
}
}
}
return out
}
// formatBCStatusOneLine formats the selected devices into a single log-friendly line.
func formatBCStatusOneLine(devs []BindingConditionsStatus) string {
if len(devs) == 0 {
return ""
}
parts := make([]string, 0, len(devs))
for _, d := range devs {
base := fmt.Sprintf("<driver=%s, pool=%s> %s", d.Driver, d.Pool, d.Device)
if len(d.Failed) > 0 {
base += fmt.Sprintf(" failed=[%s]", strings.Join(d.Failed, ","))
}
if len(d.Pending) > 0 {
base += fmt.Sprintf(" pending=[%s]", strings.Join(d.Pending, ","))
}
parts = append(parts, base)
}
return strings.Join(parts, "; ")
}

View file

@ -2334,6 +2334,35 @@ func testPlugin(tCtx ktesting.TContext) {
status: nil,
},
},
metrics: func(tCtx ktesting.TContext, g compbasemetrics.Gatherer) {
// Counter: allocations_total should have exactly one event
allocs, err := testutil.GetCounterValuesFromGatherer(
g,
"scheduler_dra_bindingconditions_allocations_total",
map[string]string{
"status": "success",
},
"driver", // group by driver label
)
require.NoError(tCtx, err)
var totalAllocs float64
for _, v := range allocs {
totalAllocs += v
}
require.InEpsilon(tCtx, float64(1), totalAllocs, 0.1, "expected exactly one successful allocation with BindingConditions")
// Histogram: one success sample with requires_bindingconditions=true
hist, err := testutil.GetHistogramVecFromGatherer(
g,
"scheduler_dra_bindingconditions_wait_duration_seconds",
map[string]string{
"status": "success",
},
)
require.NoError(tCtx, err)
require.Equal(tCtx, uint64(1), hist.GetAggregatedSampleCount(), "expected one success sample in wait duration histogram")
},
},
"bound-claim-with-failed-binding": {
enableDRADeviceBindingConditions: true,
@ -2452,9 +2481,38 @@ func testPlugin(tCtx ktesting.TContext) {
},
},
// From isPodReadyForBinding.
status: fwk.AsStatus(errors.New("claim " + claim.Name + " binding timeout")),
status: fwk.AsStatus(fmt.Errorf("%w: claim=%s", ErrDeviceBindingTimeout, claim.Name)),
},
},
metrics: func(tCtx ktesting.TContext, g compbasemetrics.Gatherer) {
// Counter: timeouts_total should have exactly one event
timeouts, err := testutil.GetCounterValuesFromGatherer(
g,
"scheduler_dra_bindingconditions_allocations_total",
map[string]string{
"status": "timeout",
},
"driver",
)
require.NoError(tCtx, err)
var totalTimeouts float64
for _, v := range timeouts {
totalTimeouts += v
}
require.InEpsilon(tCtx, float64(1), totalTimeouts, 0.1, "expected exactly one timeout with BindingConditions")
// Histogram: one timeout sample with requires_bindingconditions=true
hist, err := testutil.GetHistogramVecFromGatherer(
g,
"scheduler_dra_bindingconditions_wait_duration_seconds",
map[string]string{
"status": "timeout",
},
)
require.NoError(tCtx, err)
require.Equal(tCtx, uint64(1), hist.GetAggregatedSampleCount(), "expected one timeout sample in wait duration histogram")
},
},
"bound-claim-with-mixed-binding-conditions": {
enableDRADeviceBindingConditions: true,
@ -2942,6 +3000,14 @@ func setupMetrics(features feature.Features) compbasemetrics.KubeRegistry {
testRegistry.MustRegister(metrics.ResourceClaimCreatesTotal)
metrics.ResourceClaimCreatesTotal.Reset()
}
// DRA DeviceBindingConditions metrics.
if features.EnableDRADeviceBindingConditions {
testRegistry.MustRegister(metrics.DRABindingConditionsAllocationsTotal)
testRegistry.MustRegister(metrics.DRABindingConditionsPreBindDuration)
metrics.DRABindingConditionsAllocationsTotal.Reset()
metrics.DRABindingConditionsPreBindDuration.Reset()
}
return testRegistry
}

View file

@ -115,6 +115,14 @@ const (
BatchFlushPodNotBatchable = "pod_not_batchable"
)
// DRADeviceBindingConditions status labels
const (
BindingConditionsStatusSuccess = "success"
BindingConditionsStatusFailed = "failure"
BindingConditionsStatusTimeout = "timeout"
BindingConditionsStatusError = "error"
)
// All the histogram based metrics have 1ms as size for the smallest bucket.
var (
scheduleAttempts *metrics.CounterVec
@ -161,6 +169,9 @@ var (
podGroupScheduleAttempts *metrics.CounterVec
podGroupSchedulingLatency *metrics.HistogramVec
PodGroupSchedulingAlgorithmLatency *metrics.Histogram
// The below are only available when the DRADeviceBindingConditions feature gate is enabled.
DRABindingConditionsAllocationsTotal *metrics.CounterVec
DRABindingConditionsPreBindDuration *metrics.HistogramVec
// metricsList is a list of all metrics that should be registered always, regardless of any feature gate's value.
metricsList []metrics.Registerable
@ -199,6 +210,12 @@ func Register() {
PodGroupSchedulingAlgorithmLatency,
)
}
if utilfeature.DefaultFeatureGate.Enabled(features.DRADeviceBindingConditions) {
RegisterMetrics(
DRABindingConditionsAllocationsTotal,
DRABindingConditionsPreBindDuration,
)
}
})
}
@ -453,6 +470,27 @@ func InitMetrics() {
},
[]string{"status"})
DRABindingConditionsAllocationsTotal = metrics.NewCounterVec(
&metrics.CounterOpts{
Subsystem: SchedulerSubsystem,
Name: "dra_bindingconditions_allocations_total",
Help: "Number of allocations using devices with BindingConditions, counted per driver per scheduling attempt",
StabilityLevel: metrics.ALPHA,
},
[]string{"profile", "driver", "status"},
)
DRABindingConditionsPreBindDuration = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Subsystem: SchedulerSubsystem,
Name: "dra_bindingconditions_wait_duration_seconds",
Help: "Time in seconds spent waiting for BindingConditions to be satisfied during PreBind.",
Buckets: metrics.ExponentialBuckets(0.1, 2, 14),
StabilityLevel: metrics.ALPHA,
},
[]string{"profile", "driver", "status"},
)
GetNodeHintDuration = metrics.NewHistogramVec(
&metrics.HistogramOpts{
Subsystem: SchedulerSubsystem,