diff --git a/test/integration/scheduler_perf/executor.go b/test/integration/scheduler_perf/executor.go index 20895c64a25..099bcd3a4bb 100644 --- a/test/integration/scheduler_perf/executor.go +++ b/test/integration/scheduler_perf/executor.go @@ -265,7 +265,14 @@ func (e *WorkloadExecutor) runCreatePodsOp(tCtx ktesting.TContext, opIndex int, return err } default: - if err := waitUntilPodsScheduledInNamespace(tCtx, e.podInformer, nil, namespace, op.Count); err != nil { + // Default timeout is 10 minutes because even at the lowest observed QPS of ~10 pods/sec, + // a standard 5000-node test completes. Heavy test suites (e.g. TAS) can configure a custom + // podsSchedulingTimeout option to avoid meeting this strict default ceiling. + timeout := 10 * time.Minute + if e.opts != nil && e.opts.podsSchedulingTimeout > 0 { + timeout = e.opts.podsSchedulingTimeout + } + if err := waitUntilPodsScheduledInNamespace(tCtx, e.podInformer, nil, namespace, op.Count, timeout); err != nil { return fmt.Errorf("error in waiting for pods to get scheduled: %w", err) } } @@ -809,7 +816,7 @@ func waitUntilPodsScheduled(tCtx ktesting.TContext, podInformer coreinformers.Po if !ok { return fmt.Errorf("unknown namespace %s", namespace) } - if err := waitUntilPodsScheduledInNamespace(tCtx, podInformer, labelSelector, namespace, wantCount); err != nil { + if err := waitUntilPodsScheduledInNamespace(tCtx, podInformer, labelSelector, namespace, wantCount, 10*time.Minute); err != nil { return fmt.Errorf("error waiting for pods in namespace %q: %w", namespace, err) } } @@ -900,12 +907,14 @@ func getNodePreparer(prefix string, cno *createNodesOp, clientset clientset.Inte } // waitUntilPodsScheduledInNamespace blocks until all pods in the given -// namespace are scheduled. Times out after 10 minutes because even at the +// namespace are scheduled. Times out after 10 minutes by default because even at the // lowest observed QPS of ~10 pods/sec, a 5000-node test should complete. 
-func waitUntilPodsScheduledInNamespace(tCtx ktesting.TContext, podInformer coreinformers.PodInformer, labelSelector map[string]string, namespace string, wantCount int) error { +// Complex test suites (e.g. TAS where each pod gets scheduled multiple times for placements) +// may override this timeout via schedulerPerfOptions. +func waitUntilPodsScheduledInNamespace(tCtx ktesting.TContext, podInformer coreinformers.PodInformer, labelSelector map[string]string, namespace string, wantCount int, timeout time.Duration) error { var pendingPod *v1.Pod - err := wait.PollUntilContextTimeout(tCtx, 1*time.Second, 10*time.Minute, true, func(ctx context.Context) (bool, error) { + err := wait.PollUntilContextTimeout(tCtx, 1*time.Second, timeout, true, func(ctx context.Context) (bool, error) { select { case <-ctx.Done(): return true, ctx.Err() diff --git a/test/integration/scheduler_perf/gangscheduling/performance-config.yaml b/test/integration/scheduler_perf/gangscheduling/performance-config.yaml index 2f696ad1f18..50bfd3bbe54 100644 --- a/test/integration/scheduler_perf/gangscheduling/performance-config.yaml +++ b/test/integration/scheduler_perf/gangscheduling/performance-config.yaml @@ -20,6 +20,8 @@ countParam: $initPodGroups namespace: gang-0 templatePath: templates/podgroup.yaml + templateParams: + podsPerGroup: $podsPerGroup - opcode: waitForPodGroups # Wait for the scheduler's informer cache to reflect the newly created PodGroup objects. 
namespace: gang-0 @@ -48,46 +50,37 @@ podsPerGroup: 3 - name: 5000Nodes_1000Gangs_3000Pods labels: [performance] - # https://perf-dash.k8s.io/#/?jobname=scheduler-perf-benchmark&metriccategoryname=Scheduler&metricname=BenchmarkPerfScheduling&Metric=scheduler_podgroup_scheduling_attempt_duration_seconds&Name=BenchmarkPerfScheduling%2FGangScheduling%2F5000Nodes_1000Gangs_3000Pods%2Ftest&event=not%20applicable&extension_point=not%20applicable&plugin=not%20applicable&result=not%20applicable - # Measured scheduler_podgroup_scheduling_attempt_duration_seconds/Average ~3.7 ms; threshold set conservatively at 8. - threshold: 8 - thresholdMetricSelector: - name: scheduler_podgroup_scheduling_attempt_duration_seconds - labels: - result: scheduled - dataBucket: Average - expectLower: true params: initNodes: 5000 initPodGroups: 1000 podsPerGroup: 3 - name: 5000Nodes_2000Gangs_6000Pods labels: [performance] - # https://perf-dash.k8s.io/#/?jobname=scheduler-perf-benchmark&metriccategoryname=Scheduler&metricname=BenchmarkPerfScheduling&Metric=scheduler_podgroup_scheduling_attempt_duration_seconds&Name=BenchmarkPerfScheduling%2FGangScheduling%2F5000Nodes_2000Gangs_6000Pods%2Ftest&event=not%20applicable&extension_point=not%20applicable&plugin=not%20applicable&result=not%20applicable - # Measured scheduler_podgroup_scheduling_attempt_duration_seconds/Average ~5.0 ms; threshold set conservatively at 10. 
- threshold: 10 - thresholdMetricSelector: - name: scheduler_podgroup_scheduling_attempt_duration_seconds - labels: - result: scheduled - dataBucket: Average - expectLower: true params: initNodes: 5000 initPodGroups: 2000 podsPerGroup: 3 - name: 5000Nodes_3000Gangs_9000Pods labels: [performance] - # https://perf-dash.k8s.io/#/?jobname=scheduler-perf-benchmark&metriccategoryname=Scheduler&metricname=BenchmarkPerfScheduling&Metric=scheduler_podgroup_scheduling_attempt_duration_seconds&Name=BenchmarkPerfScheduling%2FGangScheduling%2F5000Nodes_3000Gangs_9000Pods%2Ftest&event=not%20applicable&extension_point=not%20applicable&plugin=not%20applicable&result=not%20applicable - # Measured scheduler_podgroup_scheduling_attempt_duration_seconds/Average ~5.7 ms; threshold set conservatively at 12. - threshold: 12 - thresholdMetricSelector: - name: scheduler_podgroup_scheduling_attempt_duration_seconds - labels: - result: scheduled - dataBucket: Average - expectLower: true params: initNodes: 5000 initPodGroups: 3000 podsPerGroup: 3 + - name: 5000Nodes_3Gangs_3000Pods_1000PerGroup + labels: [performance] + params: + initNodes: 5000 + initPodGroups: 3 + podsPerGroup: 1000 + - name: 5000Nodes_6Gangs_6000Pods_1000PerGroup + labels: [performance] + params: + initNodes: 5000 + initPodGroups: 6 + podsPerGroup: 1000 + - name: 5000Nodes_9Gangs_9000Pods_1000PerGroup + labels: [performance] + params: + initNodes: 5000 + initPodGroups: 9 + podsPerGroup: 1000 diff --git a/test/integration/scheduler_perf/gangscheduling/templates/gang-pod.yaml b/test/integration/scheduler_perf/gangscheduling/templates/gang-pod.yaml index 3f87739ff07..ad637bdbe92 100644 --- a/test/integration/scheduler_perf/gangscheduling/templates/gang-pod.yaml +++ b/test/integration/scheduler_perf/gangscheduling/templates/gang-pod.yaml @@ -4,7 +4,6 @@ metadata: name: test-gang-scheduling-{{.Index}} spec: schedulingGroup: - # Three pods share the same pod group. 
podGroupName: gang-{{DivideInt .Index .podsPerGroup}} containers: - image: registry.k8s.io/pause:3.10.1 diff --git a/test/integration/scheduler_perf/gangscheduling/templates/podgroup.yaml b/test/integration/scheduler_perf/gangscheduling/templates/podgroup.yaml index a26cf435b0f..63e5ce59663 100644 --- a/test/integration/scheduler_perf/gangscheduling/templates/podgroup.yaml +++ b/test/integration/scheduler_perf/gangscheduling/templates/podgroup.yaml @@ -5,4 +5,4 @@ metadata: spec: schedulingPolicy: gang: - minCount: 3 + minCount: {{.podsPerGroup}} diff --git a/test/integration/scheduler_perf/options.go b/test/integration/scheduler_perf/options.go index 208be5078d0..d86e58f3bb3 100644 --- a/test/integration/scheduler_perf/options.go +++ b/test/integration/scheduler_perf/options.go @@ -17,6 +17,8 @@ limitations under the License. package benchmark import ( + "time" + v1 "k8s.io/api/core/v1" "k8s.io/kubernetes/pkg/scheduler" frameworkruntime "k8s.io/kubernetes/pkg/scheduler/framework/runtime" @@ -42,6 +44,7 @@ type schedulerPerfOptions struct { preRunFn PreRunFn prepareFn HookFn nodeUpdateFn NodeUpdateFn + podsSchedulingTimeout time.Duration } // WithPrepareFn is the option to set a function that is called @@ -69,3 +72,11 @@ func WithPreRunFn(preRunFn PreRunFn) SchedulerPerfOption { s.preRunFn = preRunFn } } + +// WithPodsSchedulingTimeout is the option to set a custom timeout +// specifically for waiting for pods to be scheduled. 
+func WithPodsSchedulingTimeout(timeout time.Duration) SchedulerPerfOption { + return func(s *schedulerPerfOptions) { + s.podsSchedulingTimeout = timeout + } +} diff --git a/test/integration/scheduler_perf/tas/performance-config.yaml b/test/integration/scheduler_perf/tas/performance-config.yaml new file mode 100644 index 00000000000..6b7b6633693 --- /dev/null +++ b/test/integration/scheduler_perf/tas/performance-config.yaml @@ -0,0 +1,90 @@ +# The following labels are used in this file: +# +# - integration-test: test cases to run as the integration test. +# - performance: test cases to run in the performance test. +# - short: supplemental label for the above two labels (must not be used alone), which literally means short execution time test cases. + +- name: TopologyAwareScheduling + featureGates: + GenericWorkload: true + GangScheduling: true + TopologyAwareWorkloadScheduling: true + workloadTemplate: + - opcode: createNodes + countParam: $initNodes + nodeTemplatePath: templates/node.yaml + - opcode: createNamespaces + prefix: tas + count: 1 + - opcode: createAny + # Create pod groups (gangs), each has a min count policy and topology constraint specified in pod group template. + # Each pod group is named gang-0, gang-1, ... gang-(n-1). + countParam: $initPodGroups + namespace: tas-0 + templatePath: templates/podgroup.yaml + templateParams: + podsPerGroup: $podsPerGroup + - opcode: waitForPodGroups + # Wait for the scheduler's informer cache to reflect the newly created PodGroup objects. + namespace: tas-0 + countParam: $initPodGroups + - opcode: createPods + # Create pods with reference to the pod groups (gangs) according to their indices (e.g., with podsPerGroup 3: pods 0-2 → gang-0, pods 3-5 → gang-1, etc.). 
+ countParam: $initPodGroups + countMultiplierParam: $podsPerGroup + namespace: tas-0 + podTemplatePath: templates/gang-pod.yaml + collectMetrics: true + templateParams: + podsPerGroup: $podsPerGroup + workloads: + - name: 10Nodes_3Gangs + labels: [integration-test, short] + params: + initNodes: 10 + initPodGroups: 3 + podsPerGroup: 3 + - name: 100Nodes_10Gangs + labels: [integration-test] + params: + initNodes: 100 + initPodGroups: 10 + podsPerGroup: 3 + - name: 5000Nodes_750Gangs_3000Pods + labels: [performance] + params: + initNodes: 5000 + initPodGroups: 750 + podsPerGroup: 4 + - name: 5000Nodes_1500Gangs_6000Pods + labels: [performance] + params: + initNodes: 5000 + initPodGroups: 1500 + podsPerGroup: 4 + - name: 5000Nodes_2250Gangs_9000Pods + labels: [performance] + params: + initNodes: 5000 + initPodGroups: 2250 + podsPerGroup: 4 + - name: 5000Nodes_3Gangs_3000Pods_1000PerGroup + labels: [performance] + params: + initNodes: 5000 + initPodGroups: 3 + podsPerGroup: 1000 + - name: 5000Nodes_6Gangs_6000Pods_1000PerGroup + labels: [performance] + params: + initNodes: 5000 + initPodGroups: 6 + podsPerGroup: 1000 + - name: 5000Nodes_9Gangs_9000Pods_1000PerGroup + labels: [performance] + params: + initNodes: 5000 + initPodGroups: 9 + podsPerGroup: 1000 + + diff --git a/test/integration/scheduler_perf/tas/tas_test.go b/test/integration/scheduler_perf/tas/tas_test.go new file mode 100644 index 00000000000..cb384b17eca --- /dev/null +++ b/test/integration/scheduler_perf/tas/tas_test.go @@ -0,0 +1,44 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package tas + +import ( + "fmt" + "os" + "testing" + "time" + + _ "k8s.io/component-base/logs/json/register" + perf "k8s.io/kubernetes/test/integration/scheduler_perf" +) + +func TestMain(m *testing.M) { + if err := perf.InitTests(); err != nil { + fmt.Fprintf(os.Stderr, "%v\n", err) + os.Exit(1) + } + + m.Run() +} + +func TestSchedulerPerf(t *testing.T) { + perf.RunIntegrationPerfScheduling(t, "performance-config.yaml", perf.WithPodsSchedulingTimeout(20*time.Minute)) +} + +func BenchmarkPerfScheduling(b *testing.B) { + perf.RunBenchmarkPerfScheduling(b, "performance-config.yaml", "tas", nil, perf.WithPodsSchedulingTimeout(20*time.Minute)) +} diff --git a/test/integration/scheduler_perf/tas/templates/gang-pod.yaml b/test/integration/scheduler_perf/tas/templates/gang-pod.yaml new file mode 100644 index 00000000000..df259e54104 --- /dev/null +++ b/test/integration/scheduler_perf/tas/templates/gang-pod.yaml @@ -0,0 +1,14 @@ +apiVersion: v1 +kind: Pod +metadata: + name: test-tas-scheduling-{{.Index}} +spec: + schedulingGroup: + podGroupName: gang-{{DivideInt .Index .podsPerGroup}} + containers: + - image: registry.k8s.io/pause:3.10.1 + name: pause + resources: + requests: + cpu: 100m + memory: 100Mi diff --git a/test/integration/scheduler_perf/tas/templates/node.yaml b/test/integration/scheduler_perf/tas/templates/node.yaml new file mode 100644 index 00000000000..15f2dbcffec --- /dev/null +++ b/test/integration/scheduler_perf/tas/templates/node.yaml @@ -0,0 +1,21 @@ +apiVersion: v1 +kind: Node +metadata: + name: node-{{.Index}} + labels: + kubernetes.io/hostname: node-{{.Index}} + topology.kubernetes.io/zone: zone-{{DivideInt .Index 100}} + topology.kubernetes.io/rack: rack-{{DivideInt .Index 100}} +status: + capacity: + cpu: "4" + memory: 32Gi + pods: "110" + allocatable: + cpu: "4" + memory: 32Gi + pods: "110" + phase: Running + conditions: + - type: Ready + 
status: "True" diff --git a/test/integration/scheduler_perf/tas/templates/podgroup.yaml b/test/integration/scheduler_perf/tas/templates/podgroup.yaml new file mode 100644 index 00000000000..4c7984b1064 --- /dev/null +++ b/test/integration/scheduler_perf/tas/templates/podgroup.yaml @@ -0,0 +1,11 @@ +apiVersion: scheduling.k8s.io/v1alpha2 +kind: PodGroup +metadata: + name: gang-{{.Index}} +spec: + schedulingPolicy: + gang: + minCount: {{.podsPerGroup}} + schedulingConstraints: + topology: + - key: topology.kubernetes.io/rack