Promote scheduler metrics to beta

This commit is contained in:
Prathamesh Bhope 2026-01-11 03:15:00 -08:00
parent 9cfdbc0d6e
commit d66ae77959
4 changed files with 208 additions and 4 deletions

View file

@ -18,7 +18,12 @@ package parallelize
import (
"fmt"
"sync"
"testing"
"k8s.io/component-base/metrics/testutil"
"k8s.io/klog/v2/ktesting"
"k8s.io/kubernetes/pkg/scheduler/metrics"
)
func TestChunkSize(t *testing.T) {
@ -52,3 +57,36 @@ func TestChunkSize(t *testing.T) {
})
}
}
// TestGoroutinesMetric verifies that the scheduler "goroutines" gauge reports
// a positive value for the given operation label while Parallelizer.Until is
// executing work pieces.
//
// Each work piece samples the gauge and the test keeps the highest value seen;
// the test fails if that peak never rises above zero.
func TestGoroutinesMetric(t *testing.T) {
	metrics.Register()
	// Clear any series left behind by other tests so only this run is observed.
	metrics.Goroutines.Reset()

	const (
		operation = "test-operation"
		pieces    = 32
	)

	var (
		// mu guards peakValue, which is updated from the Until callback; the
		// callback may be invoked concurrently from worker goroutines.
		mu        sync.Mutex
		peakValue float64
	)

	_, ctx := ktesting.NewTestContext(t)
	p := NewParallelizer(DefaultParallelism)
	p.Until(ctx, pieces, func(_ int) {
		val, err := testutil.GetGaugeMetricValue(metrics.Goroutines.WithLabelValues(operation))
		if err != nil {
			// t.Fatalf/FailNow must only be called from the goroutine running
			// the test function. This callback may run on a worker goroutine,
			// so record the failure with Errorf and abandon this piece instead.
			t.Errorf("failed to read goroutines metric inside Until: %v", err)
			return
		}

		mu.Lock()
		defer mu.Unlock()
		if val > peakValue {
			peakValue = val
		}
	}, operation)

	// A read failure was already reported above; the peak check would only add
	// a misleading secondary error on top of it.
	if t.Failed() {
		return
	}

	if peakValue <= 0 {
		t.Errorf("expected goroutines metric to be >0 during Until, peak was %v", peakValue)
	}
}

View file

@ -4205,3 +4205,119 @@ func TestRunPlacementScorePlugins(t *testing.T) {
})
}
}
// TestPluginEvaluationTotalMetric runs the PreFilter, Filter, PreScore and
// Score extension points of one profile, plus Filter of a second profile, and
// then compares the gathered scheduler_plugin_evaluation_total counter with
// the expected exposition text: one sample of value 1 per
// (extension_point, plugin, profile) combination that was exercised.
func TestPluginEvaluationTotalMetric(t *testing.T) {
	_, ctx := ktesting.NewTestContext(t)
	ctx, cancel := context.WithCancel(ctx)
	defer cancel()

	// Start from an empty counter so the expected output below is exact.
	metrics.PluginEvaluationTotal.Reset()

	const (
		preFilterPluginName = "plugin-eval-prefilter"
		filterPluginNameA   = "plugin-eval-filter-a"
		filterPluginNameB   = "plugin-eval-filter-b"
		preScorePluginName  = "plugin-eval-prescore"
		scorePluginName     = "plugin-eval-score"
		profileName2        = "test-profile-2"
	)

	// factoryFor builds a plugin factory that always hands back the given,
	// already constructed plugin instance.
	factoryFor := func(pl fwk.Plugin) func(context.Context, runtime.Object, fwk.Handle) (fwk.Plugin, error) {
		return func(_ context.Context, _ runtime.Object, _ fwk.Handle) (fwk.Plugin, error) {
			return pl, nil
		}
	}

	// Register one test plugin per extension point (two for Filter, so each
	// profile can use its own).
	registry := Registry{}

	preFilterPlugin := &TestPlugin{name: preFilterPluginName, inj: injectedResult{PreFilterStatus: int(fwk.Success)}}
	err := registry.Register(preFilterPluginName, factoryFor(preFilterPlugin))
	if err != nil {
		t.Fatalf("failed to register prefilter plugin %q: %v", preFilterPluginName, err)
	}

	filterPluginA := &TestPlugin{name: filterPluginNameA, inj: injectedResult{FilterStatus: int(fwk.Success)}}
	err = registry.Register(filterPluginNameA, factoryFor(filterPluginA))
	if err != nil {
		t.Fatalf("failed to register filter plugin %q: %v", filterPluginNameA, err)
	}

	filterPluginB := &TestPlugin{name: filterPluginNameB, inj: injectedResult{FilterStatus: int(fwk.Success)}}
	err = registry.Register(filterPluginNameB, factoryFor(filterPluginB))
	if err != nil {
		t.Fatalf("failed to register filter plugin %q: %v", filterPluginNameB, err)
	}

	preScorePlugin := &TestPlugin{name: preScorePluginName, inj: injectedResult{PreScoreStatus: int(fwk.Success)}}
	err = registry.Register(preScorePluginName, factoryFor(preScorePlugin))
	if err != nil {
		t.Fatalf("failed to register prescore plugin %q: %v", preScorePluginName, err)
	}

	scorePlugin := &TestPlugin{name: scorePluginName, inj: injectedResult{}}
	err = registry.Register(scorePluginName, factoryFor(scorePlugin))
	if err != nil {
		t.Fatalf("failed to register score plugin %q: %v", scorePluginName, err)
	}

	// Profile 1: exercise PreFilter, Filter, PreScore and Score extension points.
	firstPlugins := &config.Plugins{}
	firstPlugins.PreFilter.Enabled = append(firstPlugins.PreFilter.Enabled, config.Plugin{Name: preFilterPluginName})
	firstPlugins.Filter.Enabled = append(firstPlugins.Filter.Enabled, config.Plugin{Name: filterPluginNameA})
	firstPlugins.PreScore.Enabled = append(firstPlugins.PreScore.Enabled, config.Plugin{Name: preScorePluginName})
	firstPlugins.Score.Enabled = append(firstPlugins.Score.Enabled, config.Plugin{Name: scorePluginName})
	firstProfile := config.KubeSchedulerProfile{
		SchedulerName: testProfileName,
		Plugins:       firstPlugins,
	}

	f1, err := newFrameworkWithQueueSortAndBind(ctx, registry, firstProfile, WithSnapshotSharedLister(cache.NewEmptySnapshot()))
	if err != nil {
		t.Fatalf("failed to create framework (profile=%q): %v", testProfileName, err)
	}
	defer func() { _ = f1.Close() }()

	firstState := framework.NewCycleState()

	_, preFilterStatus, _ := f1.RunPreFilterPlugins(ctx, firstState, pod)
	if preFilterStatus != nil && !preFilterStatus.IsSuccess() {
		t.Fatalf("RunPreFilterPlugins returned unexpected status: %v", preFilterStatus)
	}

	filterStatus := f1.RunFilterPlugins(ctx, firstState, pod, nil)
	if filterStatus != nil && !filterStatus.IsSuccess() {
		t.Fatalf("RunFilterPlugins returned unexpected status: %v", filterStatus)
	}

	preScoreStatus := f1.RunPreScorePlugins(ctx, firstState, pod, nil)
	if preScoreStatus != nil && !preScoreStatus.IsSuccess() {
		t.Fatalf("RunPreScorePlugins returned unexpected status: %v", preScoreStatus)
	}

	_, scoreStatus := f1.RunScorePlugins(ctx, firstState, pod, BuildNodeInfos(nodes))
	if scoreStatus != nil && !scoreStatus.IsSuccess() {
		t.Fatalf("RunScorePlugins returned unexpected status: %v", scoreStatus)
	}

	// Profile 2: exercise a different plugin and profile label on Filter.
	secondPlugins := &config.Plugins{}
	secondPlugins.Filter.Enabled = append(secondPlugins.Filter.Enabled, config.Plugin{Name: filterPluginNameB})
	secondProfile := config.KubeSchedulerProfile{
		SchedulerName: profileName2,
		Plugins:       secondPlugins,
	}

	f2, err := newFrameworkWithQueueSortAndBind(ctx, registry, secondProfile, WithSnapshotSharedLister(cache.NewEmptySnapshot()))
	if err != nil {
		t.Fatalf("failed to create framework (profile=%q): %v", profileName2, err)
	}
	defer func() { _ = f2.Close() }()

	secondState := framework.NewCycleState()

	secondFilterStatus := f2.RunFilterPlugins(ctx, secondState, pod, nil)
	if secondFilterStatus != nil && !secondFilterStatus.IsSuccess() {
		t.Fatalf("RunFilterPlugins returned unexpected status: %v", secondFilterStatus)
	}

	// Expected exposition text for the counter after the runs above.
	expected := `# HELP scheduler_plugin_evaluation_total Number of attempts to schedule pods by each plugin and the extension point (available only in PreFilter, Filter, PreScore, and Score).
# TYPE scheduler_plugin_evaluation_total counter
scheduler_plugin_evaluation_total{extension_point="Filter",plugin="plugin-eval-filter-a",profile="test-profile"} 1
scheduler_plugin_evaluation_total{extension_point="Filter",plugin="plugin-eval-filter-b",profile="test-profile-2"} 1
scheduler_plugin_evaluation_total{extension_point="PreFilter",plugin="plugin-eval-prefilter",profile="test-profile"} 1
scheduler_plugin_evaluation_total{extension_point="PreScore",plugin="plugin-eval-prescore",profile="test-profile"} 1
scheduler_plugin_evaluation_total{extension_point="Score",plugin="plugin-eval-score",profile="test-profile"} 1
`
	compareErr := testutil.GatherAndCompare(metrics.GetGather(), strings.NewReader(expected), metrics.PluginEvaluationTotal.Name)
	if compareErr != nil {
		t.Fatalf("unexpected plugin_evaluation_total metric output:\n%v", compareErr)
	}
}

View file

@ -290,7 +290,7 @@ func InitMetrics() {
Subsystem: SchedulerSubsystem,
Name: "goroutines",
Help: "Number of running goroutines split by the work they do such as binding.",
StabilityLevel: metrics.ALPHA,
StabilityLevel: metrics.BETA,
}, []string{"operation"})
BatchAttemptStats = metrics.NewCounterVec(
&metrics.CounterOpts{
@ -385,7 +385,7 @@ func InitMetrics() {
Name: "permit_wait_duration_seconds",
Help: "Duration of waiting on permit.",
Buckets: metrics.ExponentialBuckets(0.001, 2, 15),
StabilityLevel: metrics.ALPHA,
StabilityLevel: metrics.BETA,
},
[]string{"result"})
@ -402,7 +402,7 @@ func InitMetrics() {
Subsystem: SchedulerSubsystem,
Name: "unschedulable_pods",
Help: "The number of unschedulable pods broken down by plugin name. A pod will increment the gauge for all plugins that caused it to not schedule and so this metric have meaning only when broken down by plugin.",
StabilityLevel: metrics.ALPHA,
StabilityLevel: metrics.BETA,
}, []string{"plugin", "profile"})
PluginEvaluationTotal = metrics.NewCounterVec(
@ -410,7 +410,7 @@ func InitMetrics() {
Subsystem: SchedulerSubsystem,
Name: "plugin_evaluation_total",
Help: "Number of attempts to schedule pods by each plugin and the extension point (available only in PreFilter, Filter, PreScore, and Score).",
StabilityLevel: metrics.ALPHA,
StabilityLevel: metrics.BETA,
}, []string{"plugin", "extension_point", "profile"})
PreemptionGoroutinesDuration = metrics.NewHistogramVec(

View file

@ -422,6 +422,46 @@
labels:
- manager
- name
- name: goroutines
subsystem: scheduler
help: Number of running goroutines split by the work they do such as binding.
type: Gauge
stabilityLevel: BETA
labels:
- operation
- name: permit_wait_duration_seconds
subsystem: scheduler
help: Duration of waiting on permit.
type: Histogram
stabilityLevel: BETA
labels:
- result
buckets:
- 0.001
- 0.002
- 0.004
- 0.008
- 0.016
- 0.032
- 0.064
- 0.128
- 0.256
- 0.512
- 1.024
- 2.048
- 4.096
- 8.192
- 16.384
- name: plugin_evaluation_total
subsystem: scheduler
help: Number of attempts to schedule pods by each plugin and the extension point
(available only in PreFilter, Filter, PreScore, and Score).
type: Counter
stabilityLevel: BETA
labels:
- extension_point
- plugin
- profile
- name: pod_scheduling_sli_duration_seconds
subsystem: scheduler
help: E2e latency for a pod being scheduled, from the time the pod enters the scheduling
@ -451,6 +491,16 @@
- 1310.72
- 2621.44
- 5242.88
- name: unschedulable_pods
subsystem: scheduler
help: The number of unschedulable pods broken down by plugin name. A pod will increment
the gauge for all plugins that caused it to not schedule and so this metric have
meaning only when broken down by plugin.
type: Gauge
stabilityLevel: BETA
labels:
- plugin
- profile
- name: adds_total
subsystem: workqueue
help: Total number of adds handled by workqueue