diff --git a/pkg/scheduler/framework/parallelize/parallelism_test.go b/pkg/scheduler/framework/parallelize/parallelism_test.go
index 8449c4f4510..7ed5b7c04e1 100644
--- a/pkg/scheduler/framework/parallelize/parallelism_test.go
+++ b/pkg/scheduler/framework/parallelize/parallelism_test.go
@@ -18,7 +18,12 @@ package parallelize
 
 import (
 	"fmt"
+	"sync"
 	"testing"
+
+	"k8s.io/component-base/metrics/testutil"
+	"k8s.io/klog/v2/ktesting"
+	"k8s.io/kubernetes/pkg/scheduler/metrics"
 )
 
 func TestChunkSize(t *testing.T) {
@@ -52,3 +57,43 @@ func TestChunkSize(t *testing.T) {
 		})
 	}
 }
+
+// TestGoroutinesMetric verifies that the scheduler goroutines gauge for the
+// given operation is observed with a positive value while Parallelizer.Until
+// is executing work pieces.
+func TestGoroutinesMetric(t *testing.T) {
+	metrics.Register()
+	metrics.Goroutines.Reset()
+
+	const (
+		operation = "test-operation"
+		pieces    = 32
+	)
+
+	var (
+		mu        sync.Mutex
+		peakValue float64
+	)
+
+	_, ctx := ktesting.NewTestContext(t)
+	p := NewParallelizer(DefaultParallelism)
+	p.Until(ctx, pieces, func(_ int) {
+		val, err := testutil.GetGaugeMetricValue(metrics.Goroutines.WithLabelValues(operation))
+		if err != nil {
+			// This callback may run on a worker goroutine; t.Fatalf (FailNow)
+			// must only be called from the goroutine running the test, so
+			// record the failure with t.Errorf and skip this piece instead.
+			t.Errorf("failed to read goroutines metric inside Until: %v", err)
+			return
+		}
+		mu.Lock()
+		if val > peakValue {
+			peakValue = val
+		}
+		mu.Unlock()
+	}, operation)
+
+	if peakValue <= 0 {
+		t.Errorf("expected goroutines metric to be >0 during Until, peak was %v", peakValue)
+	}
+}
diff --git a/pkg/scheduler/framework/runtime/framework_test.go b/pkg/scheduler/framework/runtime/framework_test.go
index 4922635b3ea..666c0f893e0 100644
--- a/pkg/scheduler/framework/runtime/framework_test.go
+++ b/pkg/scheduler/framework/runtime/framework_test.go
@@ -4205,3 +4205,123 @@ func TestRunPlacementScorePlugins(t *testing.T) {
 		})
 	}
 }
+
+// TestPluginEvaluationTotalMetric runs PreFilter, Filter, PreScore and Score
+// plugins on one profile and a Filter plugin on a second profile, then checks
+// that scheduler_plugin_evaluation_total reports exactly one evaluation per
+// (extension_point, plugin, profile) combination.
+func TestPluginEvaluationTotalMetric(t *testing.T) {
+	_, ctx := ktesting.NewTestContext(t)
+	ctx, cancel := context.WithCancel(ctx)
+	defer cancel()
+
+	metrics.PluginEvaluationTotal.Reset()
+
+	registry := Registry{}
+
+	const (
+		preFilterPluginName = "plugin-eval-prefilter"
+		filterPluginNameA   = "plugin-eval-filter-a"
+		filterPluginNameB   = "plugin-eval-filter-b"
+		preScorePluginName  = "plugin-eval-prescore"
+		scorePluginName     = "plugin-eval-score"
+		profileName2        = "test-profile-2"
+	)
+
+	preFilterPl := &TestPlugin{name: preFilterPluginName, inj: injectedResult{PreFilterStatus: int(fwk.Success)}}
+	if err := registry.Register(preFilterPluginName, func(_ context.Context, _ runtime.Object, _ fwk.Handle) (fwk.Plugin, error) {
+		return preFilterPl, nil
+	}); err != nil {
+		t.Fatalf("failed to register prefilter plugin %q: %v", preFilterPluginName, err)
+	}
+
+	filterPlA := &TestPlugin{name: filterPluginNameA, inj: injectedResult{FilterStatus: int(fwk.Success)}}
+	if err := registry.Register(filterPluginNameA, func(_ context.Context, _ runtime.Object, _ fwk.Handle) (fwk.Plugin, error) {
+		return filterPlA, nil
+	}); err != nil {
+		t.Fatalf("failed to register filter plugin %q: %v", filterPluginNameA, err)
+	}
+
+	filterPlB := &TestPlugin{name: filterPluginNameB, inj: injectedResult{FilterStatus: int(fwk.Success)}}
+	if err := registry.Register(filterPluginNameB, func(_ context.Context, _ runtime.Object, _ fwk.Handle) (fwk.Plugin, error) {
+		return filterPlB, nil
+	}); err != nil {
+		t.Fatalf("failed to register filter plugin %q: %v", filterPluginNameB, err)
+	}
+
+	preScorePl := &TestPlugin{name: preScorePluginName, inj: injectedResult{PreScoreStatus: int(fwk.Success)}}
+	if err := registry.Register(preScorePluginName, func(_ context.Context, _ runtime.Object, _ fwk.Handle) (fwk.Plugin, error) {
+		return preScorePl, nil
+	}); err != nil {
+		t.Fatalf("failed to register prescore plugin %q: %v", preScorePluginName, err)
+	}
+
+	scorePl := &TestPlugin{name: scorePluginName, inj: injectedResult{}}
+	if err := registry.Register(scorePluginName, func(_ context.Context, _ runtime.Object, _ fwk.Handle) (fwk.Plugin, error) {
+		return scorePl, nil
+	}); err != nil {
+		t.Fatalf("failed to register score plugin %q: %v", scorePluginName, err)
+	}
+
+	// Profile 1: exercise PreFilter, Filter, PreScore and Score extension points.
+	cfgPls1 := &config.Plugins{}
+	cfgPls1.PreFilter.Enabled = append(cfgPls1.PreFilter.Enabled, config.Plugin{Name: preFilterPluginName})
+	cfgPls1.Filter.Enabled = append(cfgPls1.Filter.Enabled, config.Plugin{Name: filterPluginNameA})
+	cfgPls1.PreScore.Enabled = append(cfgPls1.PreScore.Enabled, config.Plugin{Name: preScorePluginName})
+	cfgPls1.Score.Enabled = append(cfgPls1.Score.Enabled, config.Plugin{Name: scorePluginName})
+	profile1 := config.KubeSchedulerProfile{
+		SchedulerName: testProfileName,
+		Plugins:       cfgPls1,
+	}
+
+	f1, err := newFrameworkWithQueueSortAndBind(ctx, registry, profile1, WithSnapshotSharedLister(cache.NewEmptySnapshot()))
+	if err != nil {
+		t.Fatalf("failed to create framework (profile=%q): %v", testProfileName, err)
+	}
+	defer func() { _ = f1.Close() }()
+
+	state1 := framework.NewCycleState()
+	if _, st, _ := f1.RunPreFilterPlugins(ctx, state1, pod); st != nil && !st.IsSuccess() {
+		t.Fatalf("RunPreFilterPlugins returned unexpected status: %v", st)
+	}
+	if st := f1.RunFilterPlugins(ctx, state1, pod, nil); st != nil && !st.IsSuccess() {
+		t.Fatalf("RunFilterPlugins returned unexpected status: %v", st)
+	}
+	if st := f1.RunPreScorePlugins(ctx, state1, pod, nil); st != nil && !st.IsSuccess() {
+		t.Fatalf("RunPreScorePlugins returned unexpected status: %v", st)
+	}
+	if _, st := f1.RunScorePlugins(ctx, state1, pod, BuildNodeInfos(nodes)); st != nil && !st.IsSuccess() {
+		t.Fatalf("RunScorePlugins returned unexpected status: %v", st)
+	}
+
+	// Profile 2: exercise a different plugin and profile label on Filter.
+	cfgPls2 := &config.Plugins{}
+	cfgPls2.Filter.Enabled = append(cfgPls2.Filter.Enabled, config.Plugin{Name: filterPluginNameB})
+	profile2 := config.KubeSchedulerProfile{
+		SchedulerName: profileName2,
+		Plugins:       cfgPls2,
+	}
+
+	f2, err := newFrameworkWithQueueSortAndBind(ctx, registry, profile2, WithSnapshotSharedLister(cache.NewEmptySnapshot()))
+	if err != nil {
+		t.Fatalf("failed to create framework (profile=%q): %v", profileName2, err)
+	}
+	defer func() { _ = f2.Close() }()
+
+	state2 := framework.NewCycleState()
+	if st := f2.RunFilterPlugins(ctx, state2, pod, nil); st != nil && !st.IsSuccess() {
+		t.Fatalf("RunFilterPlugins returned unexpected status: %v", st)
+	}
+
+	want := `# HELP scheduler_plugin_evaluation_total Number of attempts to schedule pods by each plugin and the extension point (available only in PreFilter, Filter, PreScore, and Score).
+# TYPE scheduler_plugin_evaluation_total counter
+scheduler_plugin_evaluation_total{extension_point="Filter",plugin="plugin-eval-filter-a",profile="test-profile"} 1
+scheduler_plugin_evaluation_total{extension_point="Filter",plugin="plugin-eval-filter-b",profile="test-profile-2"} 1
+scheduler_plugin_evaluation_total{extension_point="PreFilter",plugin="plugin-eval-prefilter",profile="test-profile"} 1
+scheduler_plugin_evaluation_total{extension_point="PreScore",plugin="plugin-eval-prescore",profile="test-profile"} 1
+scheduler_plugin_evaluation_total{extension_point="Score",plugin="plugin-eval-score",profile="test-profile"} 1
+`
+	if err := testutil.GatherAndCompare(metrics.GetGather(), strings.NewReader(want), metrics.PluginEvaluationTotal.Name); err != nil {
+		t.Fatalf("unexpected plugin_evaluation_total metric output:\n%v", err)
+	}
+}
diff --git a/pkg/scheduler/metrics/metrics.go b/pkg/scheduler/metrics/metrics.go
index 2dfa551f1bb..8b4ff617909 100644
--- a/pkg/scheduler/metrics/metrics.go
+++ b/pkg/scheduler/metrics/metrics.go
@@ -290,7 +290,7 @@ func InitMetrics() {
 			Subsystem:      SchedulerSubsystem,
 			Name:           "goroutines",
 			Help:           "Number of running goroutines split by the work they do such as binding.",
-			StabilityLevel: metrics.ALPHA,
+			StabilityLevel: metrics.BETA,
 		}, []string{"operation"})
 	BatchAttemptStats = metrics.NewCounterVec(
 		&metrics.CounterOpts{
@@ -385,7 +385,7 @@ func InitMetrics() {
 			Name:           "permit_wait_duration_seconds",
 			Help:           "Duration of waiting on permit.",
 			Buckets:        metrics.ExponentialBuckets(0.001, 2, 15),
-			StabilityLevel: metrics.ALPHA,
+			StabilityLevel: metrics.BETA,
 		},
 		[]string{"result"})
 
@@ -402,7 +402,7 @@ func InitMetrics() {
 			Subsystem:      SchedulerSubsystem,
 			Name:           "unschedulable_pods",
 			Help:           "The number of unschedulable pods broken down by plugin name. A pod will increment the gauge for all plugins that caused it to not schedule and so this metric have meaning only when broken down by plugin.",
-			StabilityLevel: metrics.ALPHA,
+			StabilityLevel: metrics.BETA,
 		}, []string{"plugin", "profile"})
 
 	PluginEvaluationTotal = metrics.NewCounterVec(
@@ -410,7 +410,7 @@ func InitMetrics() {
 			Subsystem:      SchedulerSubsystem,
 			Name:           "plugin_evaluation_total",
 			Help:           "Number of attempts to schedule pods by each plugin and the extension point (available only in PreFilter, Filter, PreScore, and Score).",
-			StabilityLevel: metrics.ALPHA,
+			StabilityLevel: metrics.BETA,
 		}, []string{"plugin", "extension_point", "profile"})
 
 	PreemptionGoroutinesDuration = metrics.NewHistogramVec(
diff --git a/test/instrumentation/testdata/stable-metrics-list.yaml b/test/instrumentation/testdata/stable-metrics-list.yaml
index 44275a61f4b..08b4d99c43e 100644
--- a/test/instrumentation/testdata/stable-metrics-list.yaml
+++ b/test/instrumentation/testdata/stable-metrics-list.yaml
@@ -422,6 +422,46 @@
   labels:
   - manager
   - name
+- name: goroutines
+  subsystem: scheduler
+  help: Number of running goroutines split by the work they do such as binding.
+  type: Gauge
+  stabilityLevel: BETA
+  labels:
+  - operation
+- name: permit_wait_duration_seconds
+  subsystem: scheduler
+  help: Duration of waiting on permit.
+  type: Histogram
+  stabilityLevel: BETA
+  labels:
+  - result
+  buckets:
+  - 0.001
+  - 0.002
+  - 0.004
+  - 0.008
+  - 0.016
+  - 0.032
+  - 0.064
+  - 0.128
+  - 0.256
+  - 0.512
+  - 1.024
+  - 2.048
+  - 4.096
+  - 8.192
+  - 16.384
+- name: plugin_evaluation_total
+  subsystem: scheduler
+  help: Number of attempts to schedule pods by each plugin and the extension point
+    (available only in PreFilter, Filter, PreScore, and Score).
+  type: Counter
+  stabilityLevel: BETA
+  labels:
+  - extension_point
+  - plugin
+  - profile
 - name: pod_scheduling_sli_duration_seconds
   subsystem: scheduler
   help: E2e latency for a pod being scheduled, from the time the pod enters the scheduling
@@ -451,6 +491,16 @@
   - 1310.72
   - 2621.44
   - 5242.88
+- name: unschedulable_pods
+  subsystem: scheduler
+  help: The number of unschedulable pods broken down by plugin name. A pod will increment
+    the gauge for all plugins that caused it to not schedule and so this metric have
+    meaning only when broken down by plugin.
+  type: Gauge
+  stabilityLevel: BETA
+  labels:
+  - plugin
+  - profile
 - name: adds_total
   subsystem: workqueue
   help: Total number of adds handled by workqueue