From 59c9f75133f22ed24e944cf0b8a1a68e1c49af32 Mon Sep 17 00:00:00 2001 From: Antoni Zawodny Date: Thu, 12 Mar 2026 17:25:06 +0100 Subject: [PATCH 1/2] Add Workload-Aware Preemption fields to Workload and PodGroup APIs Co-authored-by: Omar Sayed --- api/openapi-spec/swagger.json | 26 ++ ...__scheduling.k8s.io__v1alpha2_openapi.json | 27 ++ pkg/api/testing/defaulting_test.go | 2 + pkg/apis/scheduling/fuzzer/fuzzer.go | 6 + pkg/apis/scheduling/types.go | 82 ++++ .../v1alpha2/zz_generated.conversion.go | 12 + .../v1alpha2/zz_generated.defaults.go | 17 + .../v1alpha2/zz_generated.validations.go | 235 +++++++++- pkg/apis/scheduling/validation/validation.go | 22 +- .../scheduling/validation/validation_test.go | 132 ------ pkg/apis/scheduling/zz_generated.deepcopy.go | 20 + pkg/generated/openapi/zz_generated.openapi.go | 45 ++ .../podgroup/declarative_validation_test.go | 336 +++++++++++--- pkg/registry/scheduling/podgroup/strategy.go | 118 ++++- .../scheduling/podgroup/strategy_test.go | 338 +++++++++----- .../workload/declarative_validation_test.go | 335 +++++++++++--- pkg/registry/scheduling/workload/strategy.go | 87 +++- .../scheduling/workload/strategy_test.go | 416 +++++++++++------- plugin/pkg/admission/priority/admission.go | 37 +- .../pkg/admission/priority/admission_test.go | 163 +++++++ .../api/scheduling/v1alpha2/generated.pb.go | 228 ++++++++++ .../api/scheduling/v1alpha2/generated.proto | 103 ++++- .../k8s.io/api/scheduling/v1alpha2/types.go | 117 ++++- .../v1alpha2/types_swagger_doc_generated.go | 6 + .../v1alpha2/zz_generated.deepcopy.go | 20 + .../scheduling.k8s.io.v1alpha2.PodGroup.json | 5 +- .../scheduling.k8s.io.v1alpha2.PodGroup.pb | Bin 654 -> 701 bytes .../scheduling.k8s.io.v1alpha2.PodGroup.yaml | 3 + .../scheduling.k8s.io.v1alpha2.Workload.json | 5 +- .../scheduling.k8s.io.v1alpha2.Workload.pb | Bin 552 -> 600 bytes .../scheduling.k8s.io.v1alpha2.Workload.yaml | 5 +- .../applyconfigurations/internal/internal.go | 19 + .../scheduling/v1alpha2/podgroupspec.go | 53 +++ .../scheduling/v1alpha2/podgrouptemplate.go | 48 ++ 34 files changed, 2429 insertions(+), 639 deletions(-) diff --git a/api/openapi-spec/swagger.json b/api/openapi-spec/swagger.json index 5f1c61bdecc..fe0a7b9b1da 100644 --- a/api/openapi-spec/swagger.json +++ b/api/openapi-spec/swagger.json @@ -20008,10 +20008,23 @@ "io.k8s.api.scheduling.v1alpha2.PodGroupSpec": { "description": "PodGroupSpec defines the desired state of a PodGroup.", "properties": { + "disruptionMode": { + "description": "DisruptionMode defines the mode in which a given PodGroup can be disrupted. Controllers are expected to fill this field by copying it from a PodGroupTemplate. One of Pod, PodGroup. Defaults to Pod if unset. This field is immutable. This field is available only when the WorkloadAwarePreemption feature gate is enabled.", + "type": "string" + }, "podGroupTemplateRef": { "$ref": "#/definitions/io.k8s.api.scheduling.v1alpha2.PodGroupTemplateReference", "description": "PodGroupTemplateRef references an optional PodGroup template within other object (e.g. Workload) that was used to create the PodGroup. This field is immutable." }, + "priority": { + "description": "Priority is the value of priority of this pod group. Various system components use this field to find the priority of the pod group. When Priority Admission Controller is enabled, it prevents users from setting this field. The admission controller populates this field from PriorityClassName. The higher the value, the higher the priority. This field is immutable. This field is available only when the WorkloadAwarePreemption feature gate is enabled.", + "format": "int32", + "type": "integer" + }, + "priorityClassName": { + "description": "PriorityClassName defines the priority that should be considered when scheduling this pod group. Controllers are expected to fill this field by copying it from a PodGroupTemplate. Otherwise, it is validated and resolved similarly to the PriorityClassName on PodGroupTemplate (i.e. if no priority class is specified, admission control can set this to the global default priority class if it exists. Otherwise, the pod group's priority will be zero). This field is immutable. This field is available only when the WorkloadAwarePreemption feature gate is enabled.", + "type": "string" + }, "resourceClaims": { "description": "ResourceClaims defines which ResourceClaims may be shared among Pods in the group. Pods consume the devices allocated to a PodGroup's claim by defining a claim in its own Spec.ResourceClaims that matches the PodGroup's claim exactly. The claim must have the same name and refer to the same ResourceClaim or ResourceClaimTemplate.\n\nThis is an alpha-level field and requires that the DRAWorkloadResourceClaims feature gate is enabled.\n\nThis field is immutable.", "items": { @@ -20074,10 +20087,23 @@ "io.k8s.api.scheduling.v1alpha2.PodGroupTemplate": { "description": "PodGroupTemplate represents a template for a set of pods with a scheduling policy.", "properties": { + "disruptionMode": { + "description": "DisruptionMode defines the mode in which a given PodGroup can be disrupted. One of Pod, PodGroup. This field is available only when the WorkloadAwarePreemption feature gate is enabled.", + "type": "string" + }, "name": { "description": "Name is a unique identifier for the PodGroupTemplate within the Workload. It must be a DNS label. This field is immutable.", "type": "string" }, + "priority": { + "description": "Priority is the value of priority of pod groups created from this template. Various system components use this field to find the priority of the pod group. When Priority Admission Controller is enabled, it prevents users from setting this field. The admission controller populates this field from PriorityClassName. The higher the value, the higher the priority. This field is available only when the WorkloadAwarePreemption feature gate is enabled.", + "format": "int32", + "type": "integer" + }, + "priorityClassName": { + "description": "PriorityClassName indicates the priority that should be considered when scheduling a pod group created from this template. If no priority class is specified, admission control can set this to the global default priority class if it exists. Otherwise, pod groups created from this template will have the priority set to zero. This field is available only when the WorkloadAwarePreemption feature gate is enabled.", + "type": "string" + }, "resourceClaims": { "description": "ResourceClaims defines which ResourceClaims may be shared among Pods in the group. Pods consume the devices allocated to a PodGroup's claim by defining a claim in its own Spec.ResourceClaims that matches the PodGroup's claim exactly. The claim must have the same name and refer to the same ResourceClaim or ResourceClaimTemplate.\n\nThis is an alpha-level field and requires that the DRAWorkloadResourceClaims feature gate is enabled.\n\nThis field is immutable.", "items": { diff --git a/api/openapi-spec/v3/apis__scheduling.k8s.io__v1alpha2_openapi.json b/api/openapi-spec/v3/apis__scheduling.k8s.io__v1alpha2_openapi.json index e1103e9229a..842d458042f 100644 --- a/api/openapi-spec/v3/apis__scheduling.k8s.io__v1alpha2_openapi.json +++ b/api/openapi-spec/v3/apis__scheduling.k8s.io__v1alpha2_openapi.json @@ -208,6 +208,11 @@ "io.k8s.api.scheduling.v1alpha2.PodGroupSpec": { "description": "PodGroupSpec defines the desired state of a PodGroup.", "properties": { + "disruptionMode": { + "default": "Pod", + "description": "DisruptionMode defines the mode in which a given PodGroup can be disrupted. Controllers are expected to fill this field by copying it from a PodGroupTemplate. One of Pod, PodGroup. Defaults to Pod if unset. This field is immutable. This field is available only when the WorkloadAwarePreemption feature gate is enabled.", + "type": "string" + }, "podGroupTemplateRef": { "allOf": [ { @@ -216,6 +221,15 @@ ], "description": "PodGroupTemplateRef references an optional PodGroup template within other object (e.g. Workload) that was used to create the PodGroup. This field is immutable." }, + "priority": { + "description": "Priority is the value of priority of this pod group. Various system components use this field to find the priority of the pod group. When Priority Admission Controller is enabled, it prevents users from setting this field. The admission controller populates this field from PriorityClassName. The higher the value, the higher the priority. This field is immutable. This field is available only when the WorkloadAwarePreemption feature gate is enabled.", + "format": "int32", + "type": "integer" + }, + "priorityClassName": { + "description": "PriorityClassName defines the priority that should be considered when scheduling this pod group. Controllers are expected to fill this field by copying it from a PodGroupTemplate. Otherwise, it is validated and resolved similarly to the PriorityClassName on PodGroupTemplate (i.e. if no priority class is specified, admission control can set this to the global default priority class if it exists. Otherwise, the pod group's priority will be zero). This field is immutable. This field is available only when the WorkloadAwarePreemption feature gate is enabled.", + "type": "string" + }, "resourceClaims": { "description": "ResourceClaims defines which ResourceClaims may be shared among Pods in the group. Pods consume the devices allocated to a PodGroup's claim by defining a claim in its own Spec.ResourceClaims that matches the PodGroup's claim exactly. The claim must have the same name and refer to the same ResourceClaim or ResourceClaimTemplate.\n\nThis is an alpha-level field and requires that the DRAWorkloadResourceClaims feature gate is enabled.\n\nThis field is immutable.", "items": { @@ -302,11 +316,24 @@ "io.k8s.api.scheduling.v1alpha2.PodGroupTemplate": { "description": "PodGroupTemplate represents a template for a set of pods with a scheduling policy.", "properties": { + "disruptionMode": { + "description": "DisruptionMode defines the mode in which a given PodGroup can be disrupted. One of Pod, PodGroup. This field is available only when the WorkloadAwarePreemption feature gate is enabled.", + "type": "string" + }, "name": { "default": "", "description": "Name is a unique identifier for the PodGroupTemplate within the Workload. It must be a DNS label. This field is immutable.", "type": "string" }, + "priority": { + "description": "Priority is the value of priority of pod groups created from this template. Various system components use this field to find the priority of the pod group. When Priority Admission Controller is enabled, it prevents users from setting this field. The admission controller populates this field from PriorityClassName. The higher the value, the higher the priority. This field is available only when the WorkloadAwarePreemption feature gate is enabled.", + "format": "int32", + "type": "integer" + }, + "priorityClassName": { + "description": "PriorityClassName indicates the priority that should be considered when scheduling a pod group created from this template. If no priority class is specified, admission control can set this to the global default priority class if it exists. Otherwise, pod groups created from this template will have the priority set to zero. This field is available only when the WorkloadAwarePreemption feature gate is enabled.", + "type": "string" + }, "resourceClaims": { "description": "ResourceClaims defines which ResourceClaims may be shared among Pods in the group. Pods consume the devices allocated to a PodGroup's claim by defining a claim in its own Spec.ResourceClaims that matches the PodGroup's claim exactly. The claim must have the same name and refer to the same ResourceClaim or ResourceClaimTemplate.\n\nThis is an alpha-level field and requires that the DRAWorkloadResourceClaims feature gate is enabled.\n\nThis field is immutable.", "items": { diff --git a/pkg/api/testing/defaulting_test.go b/pkg/api/testing/defaulting_test.go index ba06263df47..f3f7cdce7ee 100644 --- a/pkg/api/testing/defaulting_test.go +++ b/pkg/api/testing/defaulting_test.go @@ -219,6 +219,8 @@ func TestDefaulting(t *testing.T) { {Group: "storage.k8s.io", Version: "v1beta1", Kind: "VolumeAttachment"}: {}, {Group: "storage.k8s.io", Version: "v1beta1", Kind: "VolumeAttachmentList"}: {}, {Group: "authentication.k8s.io", Version: "v1", Kind: "TokenRequest"}: {}, + {Group: "scheduling.k8s.io", Version: "v1alpha2", Kind: "PodGroup"}: {}, + {Group: "scheduling.k8s.io", Version: "v1alpha2", Kind: "PodGroupList"}: {}, {Group: "scheduling.k8s.io", Version: "v1beta1", Kind: "PriorityClass"}: {}, {Group: "scheduling.k8s.io", Version: "v1", Kind: "PriorityClass"}: {}, {Group: "scheduling.k8s.io", Version: "v1beta1", Kind: "PriorityClassList"}: {}, diff --git a/pkg/apis/scheduling/fuzzer/fuzzer.go b/pkg/apis/scheduling/fuzzer/fuzzer.go index 794723bc93c..8142ef2e48c 100644 --- a/pkg/apis/scheduling/fuzzer/fuzzer.go +++ b/pkg/apis/scheduling/fuzzer/fuzzer.go @@ -33,5 +33,11 @@ var Funcs = func(codecs runtimeserializer.CodecFactory) []interface{} { s.PreemptionPolicy = &preemptLowerPriority } }, + func(s *scheduling.PodGroup, c randfill.Continue) { + c.FillNoCustom(s) + if s.Spec.DisruptionMode == nil { + s.Spec.DisruptionMode = new(scheduling.DisruptionModePod) + } + }, } } diff --git a/pkg/apis/scheduling/types.go b/pkg/apis/scheduling/types.go index 553e820b430..dd7950f702b 100644 --- a/pkg/apis/scheduling/types.go +++ b/pkg/apis/scheduling/types.go @@ -212,6 +212,38 @@ type PodGroupTemplate struct { // +listMapKey=name // +featureGate=DRAWorkloadResourceClaims ResourceClaims []PodGroupResourceClaim + + // DisruptionMode defines the mode in which a given PodGroup can be disrupted. + // One of Pod, PodGroup. + // This field is available only when the WorkloadAwarePreemption feature gate + // is enabled. + // + // +featureGate=WorkloadAwarePreemption + // +optional + DisruptionMode *DisruptionMode + + // PriorityClassName indicates the priority that should be considered when scheduling + // a pod group created from this template. If no priority class is specified, admission + // control can set this to the global default priority class if it exists. Otherwise, + // pod groups created from this template will have the priority set to zero. + // This field is available only when the WorkloadAwarePreemption feature gate + // is enabled. + // + // +featureGate=WorkloadAwarePreemption + // +optional + PriorityClassName string + + // Priority is the value of priority of pod groups created from this template. Various + // system components use this field to find the priority of the pod group. When + // Priority Admission Controller is enabled, it prevents users from setting this field. + // The admission controller populates this field from PriorityClassName. + // The higher the value, the higher the priority. + // This field is available only when the WorkloadAwarePreemption feature gate + // is enabled. + // + // +featureGate=WorkloadAwarePreemption + // +optional + Priority *int32 } // PodGroupSchedulingPolicy defines the scheduling configuration for a PodGroup. @@ -295,6 +327,19 @@ type PodGroupResourceClaim struct { ResourceClaimTemplateName *string } +// DisruptionMode describes the mode in which a PodGroup can be disrupted (e.g. preempted). +// +enum +type DisruptionMode string + +const ( + // DisruptionModePod means that individual pods can be disrupted or preempted independently. + // It doesn't depend on exact set of pods currently running in this PodGroup. + DisruptionModePod DisruptionMode = "Pod" + // DisruptionModePodGroup means that the whole PodGroup needs to be disrupted + // or preempted together. + DisruptionModePodGroup DisruptionMode = "PodGroup" +) + // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object // PodGroup represents a runtime instance of pods grouped together. @@ -374,6 +419,43 @@ type PodGroupSpec struct { // +listMapKey=name // +featureGate=DRAWorkloadResourceClaims ResourceClaims []PodGroupResourceClaim + + // DisruptionMode defines the mode in which a given PodGroup can be disrupted. + // Controllers are expected to fill this field by copying it from a PodGroupTemplate. + // One of Pod, PodGroup. Defaults to Pod if unset. + // This field is immutable. + // This field is available only when the WorkloadAwarePreemption feature gate + // is enabled. + // + // +featureGate=WorkloadAwarePreemption + // +optional + DisruptionMode *DisruptionMode + + // PriorityClassName defines the priority that should be considered when scheduling this pod group. + // Controllers are expected to fill this field by copying it from a PodGroupTemplate. + // Otherwise, it is validated and resolved similarly to the PriorityClassName on PodGroupTemplate + // (i.e. if no priority class is specified, admission control can set this to the global default + // priority class if it exists. Otherwise, the pod group's priority will be zero). + // This field is immutable. + // This field is available only when the WorkloadAwarePreemption feature gate + // is enabled. + // + // +featureGate=WorkloadAwarePreemption + // +optional + PriorityClassName string + + // Priority is the value of priority of this pod group. Various system components + // use this field to find the priority of the pod group. When Priority Admission + // Controller is enabled, it prevents users from setting this field. The admission + // controller populates this field from PriorityClassName. + // The higher the value, the higher the priority. + // This field is immutable. + // This field is available only when the WorkloadAwarePreemption feature gate + // is enabled. + // + // +featureGate=WorkloadAwarePreemption + // +optional + Priority *int32 } // PodGroupStatus represents information about the status of a pod group. diff --git a/pkg/apis/scheduling/v1alpha2/zz_generated.conversion.go b/pkg/apis/scheduling/v1alpha2/zz_generated.conversion.go index 9eb188af026..673ca5b222c 100644 --- a/pkg/apis/scheduling/v1alpha2/zz_generated.conversion.go +++ b/pkg/apis/scheduling/v1alpha2/zz_generated.conversion.go @@ -408,6 +408,9 @@ func autoConvert_v1alpha2_PodGroupSpec_To_scheduling_PodGroupSpec(in *scheduling } out.SchedulingConstraints = (*scheduling.PodGroupSchedulingConstraints)(unsafe.Pointer(in.SchedulingConstraints)) out.ResourceClaims = *(*[]scheduling.PodGroupResourceClaim)(unsafe.Pointer(&in.ResourceClaims)) + out.DisruptionMode = (*scheduling.DisruptionMode)(unsafe.Pointer(in.DisruptionMode)) + out.PriorityClassName = in.PriorityClassName + out.Priority = (*int32)(unsafe.Pointer(in.Priority)) return nil } @@ -423,6 +426,9 @@ func autoConvert_scheduling_PodGroupSpec_To_v1alpha2_PodGroupSpec(in *scheduling } out.SchedulingConstraints = (*schedulingv1alpha2.PodGroupSchedulingConstraints)(unsafe.Pointer(in.SchedulingConstraints)) out.ResourceClaims = *(*[]schedulingv1alpha2.PodGroupResourceClaim)(unsafe.Pointer(&in.ResourceClaims)) + out.DisruptionMode = (*schedulingv1alpha2.DisruptionMode)(unsafe.Pointer(in.DisruptionMode)) + out.PriorityClassName = in.PriorityClassName + out.Priority = (*int32)(unsafe.Pointer(in.Priority)) return nil } @@ -460,6 +466,9 @@ func autoConvert_v1alpha2_PodGroupTemplate_To_scheduling_PodGroupTemplate(in *sc } out.SchedulingConstraints = (*scheduling.PodGroupSchedulingConstraints)(unsafe.Pointer(in.SchedulingConstraints)) out.ResourceClaims = *(*[]scheduling.PodGroupResourceClaim)(unsafe.Pointer(&in.ResourceClaims)) + out.DisruptionMode = (*scheduling.DisruptionMode)(unsafe.Pointer(in.DisruptionMode)) + out.PriorityClassName = in.PriorityClassName + out.Priority = (*int32)(unsafe.Pointer(in.Priority)) return nil } @@ -475,6 +484,9 @@ func autoConvert_scheduling_PodGroupTemplate_To_v1alpha2_PodGroupTemplate(in *sc } out.SchedulingConstraints = (*schedulingv1alpha2.PodGroupSchedulingConstraints)(unsafe.Pointer(in.SchedulingConstraints)) out.ResourceClaims = *(*[]schedulingv1alpha2.PodGroupResourceClaim)(unsafe.Pointer(&in.ResourceClaims)) + out.DisruptionMode = (*schedulingv1alpha2.DisruptionMode)(unsafe.Pointer(in.DisruptionMode)) + out.PriorityClassName = in.PriorityClassName + out.Priority = (*int32)(unsafe.Pointer(in.Priority)) return nil } diff --git a/pkg/apis/scheduling/v1alpha2/zz_generated.defaults.go b/pkg/apis/scheduling/v1alpha2/zz_generated.defaults.go index 7e0a05edc95..cd1725ad90f 100644 --- a/pkg/apis/scheduling/v1alpha2/zz_generated.defaults.go +++ b/pkg/apis/scheduling/v1alpha2/zz_generated.defaults.go @@ -22,6 +22,7 @@ limitations under the License. package v1alpha2 import ( + schedulingv1alpha2 "k8s.io/api/scheduling/v1alpha2" runtime "k8s.io/apimachinery/pkg/runtime" ) @@ -29,5 +30,21 @@ import ( // Public to allow building arbitrary schemes. // All generated defaulters are covering - they call all nested defaulters. func RegisterDefaults(scheme *runtime.Scheme) error { + scheme.AddTypeDefaultingFunc(&schedulingv1alpha2.PodGroup{}, func(obj interface{}) { SetObjectDefaults_PodGroup(obj.(*schedulingv1alpha2.PodGroup)) }) + scheme.AddTypeDefaultingFunc(&schedulingv1alpha2.PodGroupList{}, func(obj interface{}) { SetObjectDefaults_PodGroupList(obj.(*schedulingv1alpha2.PodGroupList)) }) return nil } + +func SetObjectDefaults_PodGroup(in *schedulingv1alpha2.PodGroup) { + if in.Spec.DisruptionMode == nil { + var ptrVar1 schedulingv1alpha2.DisruptionMode = "Pod" + in.Spec.DisruptionMode = &ptrVar1 + } +} + +func SetObjectDefaults_PodGroupList(in *schedulingv1alpha2.PodGroupList) { + for i := range in.Items { + a := &in.Items[i] + SetObjectDefaults_PodGroup(a) + } +} diff --git a/pkg/apis/scheduling/v1alpha2/zz_generated.validations.go b/pkg/apis/scheduling/v1alpha2/zz_generated.validations.go index b04ebccffab..e93231a5d44 100644 --- a/pkg/apis/scheduling/v1alpha2/zz_generated.validations.go +++ b/pkg/apis/scheduling/v1alpha2/zz_generated.validations.go @@ -31,6 +31,7 @@ import ( safe "k8s.io/apimachinery/pkg/api/safe" validate "k8s.io/apimachinery/pkg/api/validate" runtime "k8s.io/apimachinery/pkg/runtime" + sets "k8s.io/apimachinery/pkg/util/sets" field "k8s.io/apimachinery/pkg/util/validation/field" ) @@ -58,6 +59,16 @@ func RegisterValidations(scheme *runtime.Scheme) error { return nil } +var symbolsForDisruptionMode = sets.New(schedulingv1alpha2.DisruptionModePod, schedulingv1alpha2.DisruptionModePodGroup) + +// Validate_DisruptionMode validates an instance of DisruptionMode according +// to declarative validation rules in the API schema. +func Validate_DisruptionMode(ctx context.Context, op operation.Operation, fldPath *field.Path, obj, oldObj *schedulingv1alpha2.DisruptionMode) (errs field.ErrorList) { + errs = append(errs, validate.Enum(ctx, op, fldPath, obj, oldObj, symbolsForDisruptionMode, nil)...) + + return errs +} + // Validate_GangSchedulingPolicy validates an instance of GangSchedulingPolicy according // to declarative validation rules in the API schema. func Validate_GangSchedulingPolicy(ctx context.Context, op operation.Operation, fldPath *field.Path, obj, oldObj *schedulingv1alpha2.GangSchedulingPolicy) (errs field.ErrorList) { @@ -330,11 +341,11 @@ func Validate_PodGroupSpec(ctx context.Context, op operation.Operation, fldPath } // call field-attached validations earlyReturn := false - if e := validate.OptionalPointer(ctx, op, fldPath, obj, oldObj); len(e) != 0 { + if e := validate.Immutable(ctx, op, fldPath, obj, oldObj); len(e) != 0 { + errs = append(errs, e...) earlyReturn = true } - if e := validate.Immutable(ctx, op, fldPath, obj, oldObj).MarkAlpha(); len(e) != 0 { - errs = append(errs, e...) + if e := validate.OptionalPointer(ctx, op, fldPath, obj, oldObj); len(e) != 0 { earlyReturn = true } if earlyReturn { @@ -356,7 +367,7 @@ func Validate_PodGroupSpec(ctx context.Context, op operation.Operation, fldPath } // call field-attached validations earlyReturn := false - if e := validate.Immutable(ctx, op, fldPath, obj, oldObj).MarkAlpha(); len(e) != 0 { + if e := validate.Immutable(ctx, op, fldPath, obj, oldObj); len(e) != 0 { errs = append(errs, e...) earlyReturn = true } @@ -412,6 +423,10 @@ func Validate_PodGroupSpec(ctx context.Context, op operation.Operation, fldPath } // call field-attached validations earlyReturn := false + if e := validate.Immutable(ctx, op, fldPath, obj, oldObj); len(e) != 0 { + errs = append(errs, e...) + earlyReturn = true + } if e := validate.MaxItems(ctx, op, fldPath, obj, oldObj, 4); len(e) != 0 { errs = append(errs, e...) earlyReturn = true @@ -419,10 +434,6 @@ func Validate_PodGroupSpec(ctx context.Context, op operation.Operation, fldPath if e := validate.OptionalSlice(ctx, op, fldPath, obj, oldObj); len(e) != 0 { earlyReturn = true } - if e := validate.Immutable(ctx, op, fldPath, obj, oldObj).MarkAlpha(); len(e) != 0 { - errs = append(errs, e...) - earlyReturn = true - } if earlyReturn { return // do not proceed } @@ -439,6 +450,106 @@ func Validate_PodGroupSpec(ctx context.Context, op operation.Operation, fldPath return oldObj.ResourceClaims }), oldObj != nil)...) + // field schedulingv1alpha2.PodGroupSpec.DisruptionMode + errs = append(errs, + func(fldPath *field.Path, obj, oldObj *schedulingv1alpha2.DisruptionMode, oldValueCorrelated bool) (errs field.ErrorList) { + // don't revalidate unchanged data + if oldValueCorrelated && op.Type == operation.Update && (obj == oldObj || (obj != nil && oldObj != nil && *obj == *oldObj)) { + return nil + } + // call field-attached validations + earlyReturn := false + if e := validate.IfOption(ctx, op, fldPath, obj, oldObj, "WorkloadAwarePreemption", false, validate.ForbiddenPointer); len(e) != 0 { + errs = append(errs, e...) + earlyReturn = true + } + if e := validate.IfOption(ctx, op, fldPath, obj, oldObj, "WorkloadAwarePreemption", false, validate.OptionalPointer); len(e) != 0 { + earlyReturn = true + } + if e := validate.IfOption(ctx, op, fldPath, obj, oldObj, "WorkloadAwarePreemption", true, // optional fields with default values are effectively required + validate.RequiredPointer); len(e) != 0 { + errs = append(errs, e...) + earlyReturn = true + } + if e := validate.IfOption(ctx, op, fldPath, obj, oldObj, "WorkloadAwarePreemption", true, validate.Immutable); len(e) != 0 { + errs = append(errs, e...) + earlyReturn = true + } + if earlyReturn { + return // do not proceed + } + // call the type's validation function + errs = append(errs, Validate_DisruptionMode(ctx, op, fldPath, obj, oldObj)...) + return + }(fldPath.Child("disruptionMode"), obj.DisruptionMode, safe.Field(oldObj, func(oldObj *schedulingv1alpha2.PodGroupSpec) *schedulingv1alpha2.DisruptionMode { + return oldObj.DisruptionMode + }), oldObj != nil)...) + + // field schedulingv1alpha2.PodGroupSpec.PriorityClassName + errs = append(errs, + func(fldPath *field.Path, obj, oldObj *string, oldValueCorrelated bool) (errs field.ErrorList) { + // don't revalidate unchanged data + if oldValueCorrelated && op.Type == operation.Update && (obj == oldObj || (obj != nil && oldObj != nil && *obj == *oldObj)) { + return nil + } + // call field-attached validations + earlyReturn := false + if e := validate.IfOption(ctx, op, fldPath, obj, oldObj, "WorkloadAwarePreemption", false, validate.ForbiddenValue); len(e) != 0 { + errs = append(errs, e...) + earlyReturn = true + } + if e := validate.IfOption(ctx, op, fldPath, obj, oldObj, "WorkloadAwarePreemption", false, validate.OptionalValue); len(e) != 0 { + earlyReturn = true + } + if e := validate.IfOption(ctx, op, fldPath, obj, oldObj, "WorkloadAwarePreemption", true, validate.OptionalValue); len(e) != 0 { + earlyReturn = true + } + if e := validate.IfOption(ctx, op, fldPath, obj, oldObj, "WorkloadAwarePreemption", true, validate.Immutable); len(e) != 0 { + errs = append(errs, e...) + earlyReturn = true + } + if earlyReturn { + return // do not proceed + } + errs = append(errs, validate.IfOption(ctx, op, fldPath, obj, oldObj, "WorkloadAwarePreemption", true, validate.LongName)...) + return + }(fldPath.Child("priorityClassName"), &obj.PriorityClassName, safe.Field(oldObj, func(oldObj *schedulingv1alpha2.PodGroupSpec) *string { return &oldObj.PriorityClassName }), oldObj != nil)...) + + // field schedulingv1alpha2.PodGroupSpec.Priority + errs = append(errs, + func(fldPath *field.Path, obj, oldObj *int32, oldValueCorrelated bool) (errs field.ErrorList) { + // don't revalidate unchanged data + if oldValueCorrelated && op.Type == operation.Update && (obj == oldObj || (obj != nil && oldObj != nil && *obj == *oldObj)) { + return nil + } + // call field-attached validations + earlyReturn := false + if e := validate.IfOption(ctx, op, fldPath, obj, oldObj, "WorkloadAwarePreemption", false, validate.ForbiddenPointer); len(e) != 0 { + errs = append(errs, e...) + earlyReturn = true + } + if e := validate.IfOption(ctx, op, fldPath, obj, oldObj, "WorkloadAwarePreemption", false, validate.OptionalPointer); len(e) != 0 { + earlyReturn = true + } + if e := validate.IfOption(ctx, op, fldPath, obj, oldObj, "WorkloadAwarePreemption", true, validate.OptionalPointer); len(e) != 0 { + earlyReturn = true + } + if e := validate.IfOption(ctx, op, fldPath, obj, oldObj, "WorkloadAwarePreemption", true, validate.Immutable); len(e) != 0 { + errs = append(errs, e...) + earlyReturn = true + } + if earlyReturn { + return // do not proceed + } + errs = append(errs, validate.IfOption(ctx, op, fldPath, obj, oldObj, "WorkloadAwarePreemption", true, func(ctx context.Context, op operation.Operation, fldPath *field.Path, obj, oldObj *int32) field.ErrorList { + return validate.Maximum(ctx, op, fldPath, obj, oldObj, 1000000000) + })...) + errs = append(errs, validate.IfOption(ctx, op, fldPath, obj, oldObj, "WorkloadAwarePreemption", true, func(ctx context.Context, op operation.Operation, fldPath *field.Path, obj, oldObj *int32) field.ErrorList { + return validate.Minimum(ctx, op, fldPath, obj, oldObj, -2147483648) + })...) + return + }(fldPath.Child("priority"), obj.Priority, safe.Field(oldObj, func(oldObj *schedulingv1alpha2.PodGroupSpec) *int32 { return oldObj.Priority }), oldObj != nil)...) + return errs } @@ -557,6 +668,10 @@ func Validate_PodGroupTemplate(ctx context.Context, op operation.Operation, fldP } // call field-attached validations earlyReturn := false + if e := validate.Immutable(ctx, op, fldPath, obj, oldObj); len(e) != 0 { + errs = append(errs, e...) + earlyReturn = true + } if e := validate.MaxItems(ctx, op, fldPath, obj, oldObj, 4); len(e) != 0 { errs = append(errs, e...) earlyReturn = true @@ -564,10 +679,6 @@ func Validate_PodGroupTemplate(ctx context.Context, op operation.Operation, fldP if e := validate.OptionalSlice(ctx, op, fldPath, obj, oldObj); len(e) != 0 { earlyReturn = true } - if e := validate.Immutable(ctx, op, fldPath, obj, oldObj).MarkAlpha(); len(e) != 0 { - errs = append(errs, e...) - earlyReturn = true - } if earlyReturn { return // do not proceed } @@ -584,6 +695,92 @@ func Validate_PodGroupTemplate(ctx context.Context, op operation.Operation, fldP return oldObj.ResourceClaims }), oldObj != nil)...) + // field schedulingv1alpha2.PodGroupTemplate.DisruptionMode + errs = append(errs, + func(fldPath *field.Path, obj, oldObj *schedulingv1alpha2.DisruptionMode, oldValueCorrelated bool) (errs field.ErrorList) { + // don't revalidate unchanged data + if oldValueCorrelated && op.Type == operation.Update && (obj == oldObj || (obj != nil && oldObj != nil && *obj == *oldObj)) { + return nil + } + // call field-attached validations + earlyReturn := false + if e := validate.IfOption(ctx, op, fldPath, obj, oldObj, "WorkloadAwarePreemption", false, validate.ForbiddenPointer); len(e) != 0 { + errs = append(errs, e...) + earlyReturn = true + } + if e := validate.IfOption(ctx, op, fldPath, obj, oldObj, "WorkloadAwarePreemption", false, validate.OptionalPointer); len(e) != 0 { + earlyReturn = true + } + if e := validate.IfOption(ctx, op, fldPath, obj, oldObj, "WorkloadAwarePreemption", true, validate.OptionalPointer); len(e) != 0 { + earlyReturn = true + } + if earlyReturn { + return // do not proceed + } + // call the type's validation function + errs = append(errs, Validate_DisruptionMode(ctx, op, fldPath, obj, oldObj)...) + return + }(fldPath.Child("disruptionMode"), obj.DisruptionMode, safe.Field(oldObj, func(oldObj *schedulingv1alpha2.PodGroupTemplate) *schedulingv1alpha2.DisruptionMode { + return oldObj.DisruptionMode + }), oldObj != nil)...) + + // field schedulingv1alpha2.PodGroupTemplate.PriorityClassName + errs = append(errs, + func(fldPath *field.Path, obj, oldObj *string, oldValueCorrelated bool) (errs field.ErrorList) { + // don't revalidate unchanged data + if oldValueCorrelated && op.Type == operation.Update && (obj == oldObj || (obj != nil && oldObj != nil && *obj == *oldObj)) { + return nil + } + // call field-attached validations + earlyReturn := false + if e := validate.IfOption(ctx, op, fldPath, obj, oldObj, "WorkloadAwarePreemption", false, validate.ForbiddenValue); len(e) != 0 { + errs = append(errs, e...) + earlyReturn = true + } + if e := validate.IfOption(ctx, op, fldPath, obj, oldObj, "WorkloadAwarePreemption", false, validate.OptionalValue); len(e) != 0 { + earlyReturn = true + } + if e := validate.IfOption(ctx, op, fldPath, obj, oldObj, "WorkloadAwarePreemption", true, validate.OptionalValue); len(e) != 0 { + earlyReturn = true + } + if earlyReturn { + return // do not proceed + } + errs = append(errs, validate.IfOption(ctx, op, fldPath, obj, oldObj, "WorkloadAwarePreemption", true, validate.LongName)...) + return + }(fldPath.Child("priorityClassName"), &obj.PriorityClassName, safe.Field(oldObj, func(oldObj *schedulingv1alpha2.PodGroupTemplate) *string { return &oldObj.PriorityClassName }), oldObj != nil)...) + + // field schedulingv1alpha2.PodGroupTemplate.Priority + errs = append(errs, + func(fldPath *field.Path, obj, oldObj *int32, oldValueCorrelated bool) (errs field.ErrorList) { + // don't revalidate unchanged data + if oldValueCorrelated && op.Type == operation.Update && (obj == oldObj || (obj != nil && oldObj != nil && *obj == *oldObj)) { + return nil + } + // call field-attached validations + earlyReturn := false + if e := validate.IfOption(ctx, op, fldPath, obj, oldObj, "WorkloadAwarePreemption", false, validate.ForbiddenPointer); len(e) != 0 { + errs = append(errs, e...) + earlyReturn = true + } + if e := validate.IfOption(ctx, op, fldPath, obj, oldObj, "WorkloadAwarePreemption", false, validate.OptionalPointer); len(e) != 0 { + earlyReturn = true + } + if e := validate.IfOption(ctx, op, fldPath, obj, oldObj, "WorkloadAwarePreemption", true, validate.OptionalPointer); len(e) != 0 { + earlyReturn = true + } + if earlyReturn { + return // do not proceed + } + errs = append(errs, validate.IfOption(ctx, op, fldPath, obj, oldObj, "WorkloadAwarePreemption", true, func(ctx context.Context, op operation.Operation, fldPath *field.Path, obj, oldObj *int32) field.ErrorList { + return validate.Maximum(ctx, op, fldPath, obj, oldObj, 1000000000) + })...) + errs = append(errs, validate.IfOption(ctx, op, fldPath, obj, oldObj, "WorkloadAwarePreemption", true, func(ctx context.Context, op operation.Operation, fldPath *field.Path, obj, oldObj *int32) field.ErrorList { + return validate.Minimum(ctx, op, fldPath, obj, oldObj, -2147483648) + })...) + return + }(fldPath.Child("priority"), obj.Priority, safe.Field(oldObj, func(oldObj *schedulingv1alpha2.PodGroupTemplate) *int32 { return oldObj.Priority }), oldObj != nil)...) + return errs } @@ -798,11 +995,11 @@ func Validate_WorkloadSpec(ctx context.Context, op operation.Operation, fldPath } // call field-attached validations earlyReturn := false - if e := validate.OptionalPointer(ctx, op, fldPath, obj, oldObj); len(e) != 0 { + if e := validate.Immutable(ctx, op, fldPath, obj, oldObj); len(e) != 0 { + errs = append(errs, e...) earlyReturn = true } - if e := validate.Immutable(ctx, op, fldPath, obj, oldObj).MarkAlpha(); len(e) != 0 { - errs = append(errs, e...) + if e := validate.OptionalPointer(ctx, op, fldPath, obj, oldObj); len(e) != 0 { earlyReturn = true } if earlyReturn { @@ -824,6 +1021,10 @@ func Validate_WorkloadSpec(ctx context.Context, op operation.Operation, fldPath } // call field-attached validations earlyReturn := false + if e := validate.Immutable(ctx, op, fldPath, obj, oldObj); len(e) != 0 { + errs = append(errs, e...) + earlyReturn = true + } if e := validate.MaxItems(ctx, op, fldPath, obj, oldObj, 8); len(e) != 0 { errs = append(errs, e...) earlyReturn = true @@ -832,10 +1033,6 @@ func Validate_WorkloadSpec(ctx context.Context, op operation.Operation, fldPath errs = append(errs, e...) earlyReturn = true } - if e := validate.Immutable(ctx, op, fldPath, obj, oldObj).MarkAlpha(); len(e) != 0 { - errs = append(errs, e...) - earlyReturn = true - } if earlyReturn { return // do not proceed } diff --git a/pkg/apis/scheduling/validation/validation.go b/pkg/apis/scheduling/validation/validation.go index 09ca17a3172..16807cefefd 100644 --- a/pkg/apis/scheduling/validation/validation.go +++ b/pkg/apis/scheduling/validation/validation.go @@ -75,16 +75,7 @@ func ValidatePodGroup(podGroup *scheduling.PodGroup) field.ErrorList { // ValidatePodGroupUpdate tests if an update to PodGroup is valid. func ValidatePodGroupUpdate(podGroup, oldPodGroup *scheduling.PodGroup) field.ErrorList { - allErrs := apivalidation.ValidateObjectMetaUpdate(&podGroup.ObjectMeta, &oldPodGroup.ObjectMeta, field.NewPath("metadata")) - allErrs = append(allErrs, validatePodGroupSpecUpdate(&podGroup.Spec, &oldPodGroup.Spec, field.NewPath("spec"))...) - return allErrs -} - -func validatePodGroupSpecUpdate(spec, oldSpec *scheduling.PodGroupSpec, fldPath *field.Path) field.ErrorList { - allErrs := apivalidation.ValidateImmutableField(spec.PodGroupTemplateRef, oldSpec.PodGroupTemplateRef, fldPath.Child("podGroupTemplateRef")).WithOrigin("immutable").MarkCoveredByDeclarative() - allErrs = append(allErrs, apivalidation.ValidateImmutableField(spec.SchedulingPolicy, oldSpec.SchedulingPolicy, fldPath.Child("schedulingPolicy")).WithOrigin("immutable").MarkCoveredByDeclarative()...) - allErrs = append(allErrs, apivalidation.ValidateImmutableField(spec.ResourceClaims, oldSpec.ResourceClaims, fldPath.Child("resourceClaims")).WithOrigin("immutable").MarkCoveredByDeclarative()...) - return allErrs + return apivalidation.ValidateObjectMetaUpdate(&podGroup.ObjectMeta, &oldPodGroup.ObjectMeta, field.NewPath("metadata")) } // ValidateWorkload tests if all fields in a Workload are set correctly. @@ -94,16 +85,7 @@ func ValidateWorkload(workload *scheduling.Workload) field.ErrorList { // ValidateWorkloadUpdate tests if an update to Workload is valid. func ValidateWorkloadUpdate(workload, oldWorkload *scheduling.Workload) field.ErrorList { - allErrs := apivalidation.ValidateObjectMetaUpdate(&workload.ObjectMeta, &oldWorkload.ObjectMeta, field.NewPath("metadata")) - allErrs = append(allErrs, validateWorkloadSpecUpdate(&workload.Spec, &oldWorkload.Spec, field.NewPath("spec"))...) - return allErrs -} - -func validateWorkloadSpecUpdate(spec, oldSpec *scheduling.WorkloadSpec, fldPath *field.Path) field.ErrorList { - var allErrs field.ErrorList - allErrs = append(allErrs, apimachineryvalidation.ValidateImmutableField(spec.ControllerRef, oldSpec.ControllerRef, fldPath.Child("controllerRef")).WithOrigin("immutable").MarkCoveredByDeclarative()...) - allErrs = append(allErrs, apivalidation.ValidateImmutableField(spec.PodGroupTemplates, oldSpec.PodGroupTemplates, fldPath.Child("podGroupTemplates")).WithOrigin("immutable").MarkCoveredByDeclarative()...) - return allErrs + return apivalidation.ValidateObjectMetaUpdate(&workload.ObjectMeta, &oldWorkload.ObjectMeta, field.NewPath("metadata")) } // ValidatePodGroupStatusUpdate tests if an update to the status of a PodGroup is valid. diff --git a/pkg/apis/scheduling/validation/validation_test.go b/pkg/apis/scheduling/validation/validation_test.go index 7d0e79b67dc..9060b4cbd23 100644 --- a/pkg/apis/scheduling/validation/validation_test.go +++ b/pkg/apis/scheduling/validation/validation_test.go @@ -305,91 +305,6 @@ func TestValidateWorkloadUpdate(t *testing.T) { w.Namespace += "bar" }), }, - "set controller ref": { - old: mkWorkload(func(w *scheduling.Workload) { - w.Spec.ControllerRef = nil - }), - update: mkWorkload(), - }, - "unset controller ref": { - old: mkWorkload(), - update: mkWorkload(func(w *scheduling.Workload) { - w.Spec.ControllerRef = nil - }), - }, - "change pod group name": { - old: mkWorkload(), - update: mkWorkload(func(w *scheduling.Workload) { - w.Spec.PodGroupTemplates[0].Name += "bar" - }), - }, - "add pod group": { - old: mkWorkload(), - update: mkWorkload(func(w *scheduling.Workload) { - w.Spec.PodGroupTemplates = append(w.Spec.PodGroupTemplates, scheduling.PodGroupTemplate{ - Name: "group3", - SchedulingPolicy: scheduling.PodGroupSchedulingPolicy{ - Basic: &scheduling.BasicSchedulingPolicy{}, - }, - }) - }), - }, - "delete pod group": { - old: mkWorkload(), - update: mkWorkload(func(w *scheduling.Workload) { - w.Spec.PodGroupTemplates = w.Spec.PodGroupTemplates[:1] - }), - }, - "change gang min count": { - old: mkWorkload(), - update: mkWorkload(func(w *scheduling.Workload) { - w.Spec.PodGroupTemplates[1].SchedulingPolicy.Gang.MinCount = 5 - }), - }, - "change controllerRef": { - old: mkWorkload(), - update: mkWorkload(func(w *scheduling.Workload) { - w.Spec.ControllerRef.Kind += "bar" - }), - }, - "delete controllerRef": { - old: mkWorkload(), - update: mkWorkload(func(w *scheduling.Workload) { - w.Spec.ControllerRef = nil - }), - }, - "add resource claim": { - old: mkWorkload(), - update: mkWorkload(func(w *scheduling.Workload) { - w.Spec.PodGroupTemplates[0].ResourceClaims = append(w.Spec.PodGroupTemplates[0].ResourceClaims, scheduling.PodGroupResourceClaim{ - Name: "my-claim", - ResourceClaimName: new("my-claim-name"), - }) - }), - }, - "remove resource claim": { - old: mkWorkload(func(w *scheduling.Workload) { - w.Spec.PodGroupTemplates[0].ResourceClaims = append(w.Spec.PodGroupTemplates[0].ResourceClaims, scheduling.PodGroupResourceClaim{ - Name: "my-claim", - ResourceClaimName: new("my-claim-name"), - }) - }), - update: mkWorkload(), - }, - "change resource claim": { - old: mkWorkload(func(w *scheduling.Workload) { - w.Spec.PodGroupTemplates[0].ResourceClaims = append(w.Spec.PodGroupTemplates[0].ResourceClaims, scheduling.PodGroupResourceClaim{ - Name: "my-claim", - ResourceClaimName: new("my-claim-name"), - }) - }), - update: mkWorkload(func(w *scheduling.Workload) { - w.Spec.PodGroupTemplates[0].ResourceClaims = append(w.Spec.PodGroupTemplates[0].ResourceClaims, scheduling.PodGroupResourceClaim{ - Name: "my-claim", - ResourceClaimName: new("my-otherclaim-name"), - }) - }), - }, } for name, tc := range failureCases { tc.old.ResourceVersion = "0" @@ -574,53 +489,6 @@ func TestValidatePodGroupUpdate(t *testing.T) { pg.Namespace += "bar" }), }, - "change podGroup template ref name": { - old: mkPodGroup(), - update: mkPodGroup(func(pg *scheduling.PodGroup) { - pg.Spec.PodGroupTemplateRef.Workload.PodGroupTemplateName = "new-template" - }), - }, - "change podGroup template ref workload name": { - old: mkPodGroup(), - update: mkPodGroup(func(pg *scheduling.PodGroup) { - pg.Spec.PodGroupTemplateRef.Workload.WorkloadName = "new-workload" - }), - }, - "delete podGroup template ref": { - old: mkPodGroup(), - update: mkPodGroup(func(pg *scheduling.PodGroup) { - pg.Spec.PodGroupTemplateRef = nil - }), - }, - "change gang min count": { - old: mkPodGroup(), - update: mkPodGroup(func(pg *scheduling.PodGroup) { - pg.Spec.SchedulingPolicy.Gang.MinCount = 10 - }), - }, - "change scheduling policy": { - old: mkPodGroup(), - update: mkPodGroup(func(pg *scheduling.PodGroup) { - pg.Spec.SchedulingPolicy = scheduling.PodGroupSchedulingPolicy{ - Basic: &scheduling.BasicSchedulingPolicy{}, - } - }), - }, - "multiple scheduling policies": { - old: mkPodGroup(), - update: mkPodGroup(func(pg *scheduling.PodGroup) { - pg.Spec.SchedulingPolicy.Basic = &scheduling.BasicSchedulingPolicy{} - }), - }, - "change resource claims": { - old: mkPodGroup(), - update: mkPodGroup(func(pg *scheduling.PodGroup) { - pg.Spec.ResourceClaims = append(pg.Spec.ResourceClaims, scheduling.PodGroupResourceClaim{ - Name: "new-claim", - ResourceClaimName: new("resource-claim"), - }) - }), - }, } for name, tc := range failureCases { tc.old.ResourceVersion = "0" diff --git a/pkg/apis/scheduling/zz_generated.deepcopy.go b/pkg/apis/scheduling/zz_generated.deepcopy.go index 72ebee5f655..636a9590aec 100644 --- a/pkg/apis/scheduling/zz_generated.deepcopy.go +++ b/pkg/apis/scheduling/zz_generated.deepcopy.go @@ -235,6 +235,16 @@ func (in *PodGroupSpec) DeepCopyInto(out *PodGroupSpec) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.DisruptionMode != nil { + in, out := &in.DisruptionMode, &out.DisruptionMode + *out = new(DisruptionMode) + **out = **in + } + if in.Priority != nil { + in, out := &in.Priority, &out.Priority + *out = new(int32) + **out = **in + } return } @@ -294,6 +304,16 @@ func (in *PodGroupTemplate) DeepCopyInto(out *PodGroupTemplate) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.DisruptionMode != nil { + in, out := &in.DisruptionMode, &out.DisruptionMode + *out = new(DisruptionMode) + **out = **in + } + if in.Priority != nil { + in, out := &in.Priority, &out.Priority + *out = new(int32) + **out = **in + } return } diff --git a/pkg/generated/openapi/zz_generated.openapi.go b/pkg/generated/openapi/zz_generated.openapi.go index 2414e93e14c..1bd9d0c2143 100644 --- a/pkg/generated/openapi/zz_generated.openapi.go +++ b/pkg/generated/openapi/zz_generated.openapi.go @@ -54984,6 +54984,29 @@ func schema_k8sio_api_scheduling_v1alpha2_PodGroupSpec(ref common.ReferenceCallb }, }, }, + "disruptionMode": { + SchemaProps: spec.SchemaProps{ + Description: "DisruptionMode defines the mode in which a given PodGroup can be disrupted. Controllers are expected to fill this field by copying it from a PodGroupTemplate. One of Pod, PodGroup. Defaults to Pod if unset. This field is immutable. This field is available only when the WorkloadAwarePreemption feature gate is enabled.\n\n\nPossible enum values:\n - `\"Pod\"` means that individual pods can be disrupted or preempted independently. It doesn't depend on exact set of pods currently running in this PodGroup.\n - `\"PodGroup\"` means that the whole PodGroup needs to be disrupted or preempted together.", + Default: "Pod", + Type: []string{"string"}, + Format: "", + Enum: []interface{}{"Pod", "PodGroup"}, + }, + }, + "priorityClassName": { + SchemaProps: spec.SchemaProps{ + Description: "PriorityClassName defines the priority that should be considered when scheduling this pod group. Controllers are expected to fill this field by copying it from a PodGroupTemplate. Otherwise, it is validated and resolved similarly to the PriorityClassName on PodGroupTemplate (i.e. if no priority class is specified, admission control can set this to the global default priority class if it exists. Otherwise, the pod group's priority will be zero). This field is immutable. This field is available only when the WorkloadAwarePreemption feature gate is enabled.", + Type: []string{"string"}, + Format: "", + }, + }, + "priority": { + SchemaProps: spec.SchemaProps{ + Description: "Priority is the value of priority of this pod group. Various system components use this field to find the priority of the pod group. When Priority Admission Controller is enabled, it prevents users from setting this field. The admission controller populates this field from PriorityClassName. The higher the value, the higher the priority. This field is immutable. This field is available only when the WorkloadAwarePreemption feature gate is enabled.", + Type: []string{"integer"}, + Format: "int32", + }, + }, }, Required: []string{"schedulingPolicy"}, }, @@ -55108,6 +55131,28 @@ func schema_k8sio_api_scheduling_v1alpha2_PodGroupTemplate(ref common.ReferenceC }, }, }, + "disruptionMode": { + SchemaProps: spec.SchemaProps{ + Description: "DisruptionMode defines the mode in which a given PodGroup can be disrupted. One of Pod, PodGroup. This field is available only when the WorkloadAwarePreemption feature gate is enabled.\n\n\nPossible enum values:\n - `\"Pod\"` means that individual pods can be disrupted or preempted independently. It doesn't depend on exact set of pods currently running in this PodGroup.\n - `\"PodGroup\"` means that the whole PodGroup needs to be disrupted or preempted together.", + Type: []string{"string"}, + Format: "", + Enum: []interface{}{"Pod", "PodGroup"}, + }, + }, + "priorityClassName": { + SchemaProps: spec.SchemaProps{ + Description: "PriorityClassName indicates the priority that should be considered when scheduling a pod group created from this template. If no priority class is specified, admission control can set this to the global default priority class if it exists. Otherwise, pod groups created from this template will have the priority set to zero. This field is available only when the WorkloadAwarePreemption feature gate is enabled.", + Type: []string{"string"}, + Format: "", + }, + }, + "priority": { + SchemaProps: spec.SchemaProps{ + Description: "Priority is the value of priority of pod groups created from this template. Various system components use this field to find the priority of the pod group. When Priority Admission Controller is enabled, it prevents users from setting this field. The admission controller populates this field from PriorityClassName. The higher the value, the higher the priority. This field is available only when the WorkloadAwarePreemption feature gate is enabled.", + Type: []string{"integer"}, + Format: "int32", + }, + }, }, Required: []string{"name", "schedulingPolicy"}, }, diff --git a/pkg/registry/scheduling/podgroup/declarative_validation_test.go b/pkg/registry/scheduling/podgroup/declarative_validation_test.go index ded1016f236..2ffa55429c2 100644 --- a/pkg/registry/scheduling/podgroup/declarative_validation_test.go +++ b/pkg/registry/scheduling/podgroup/declarative_validation_test.go @@ -22,6 +22,7 @@ import ( "testing" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/validation/field" "k8s.io/apimachinery/pkg/util/version" genericapirequest "k8s.io/apiserver/pkg/endpoints/request" @@ -35,6 +36,11 @@ import ( _ "k8s.io/kubernetes/pkg/apis/scheduling/install" ) +var allowedDisruptionModes = sets.New( + scheduling.DisruptionModePod, + scheduling.DisruptionModePodGroup, +) + func TestDeclarativeValidate(t *testing.T) { apiVersions := []string{"v1alpha2"} // PodGroup is currently only in v1alpha2 for _, apiVersion := range apiVersions { @@ -55,9 +61,11 @@ func testDeclarativeValidate(t *testing.T, apiVersion string) { strategy := NewStrategy() testCases := map[string]struct { - input scheduling.PodGroup - expectedErrs field.ErrorList - tasEnabled bool + input scheduling.PodGroup + enableTopologyAwareScheduling bool + enableDRAWorkloadResourceClaims bool + enableWorkloadAwarePreemption bool + expectedErrs field.ErrorList }{ "valid": { input: mkValidPodGroup(), @@ -73,14 +81,14 @@ func testDeclarativeValidate(t *testing.T, apiVersion string) { }, "gang minCount negative": { input: mkValidPodGroup(setPodGroupMinCount(-1)), - expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "schedulingPolicy", "gang", "minCount"), -1, "").WithOrigin("minimum")}, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "schedulingPolicy", "gang", "minCount"), nil, "").WithOrigin("minimum")}, }, "no podGroupTemplateRef": { input: mkValidPodGroup(unsetPodGroupTemplateRef()), }, "empty podGroupTemplateRef": { input: mkValidPodGroup(setEmptyPodGroupTemplateRef()), - expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplateRef"), "", "must specify one of: `workload`").WithOrigin("union")}, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplateRef"), nil, "").WithOrigin("union")}, }, "podGroupTemplateRef with empty template name": { input: mkValidPodGroup(setPodGroupTemplateRef("", "workload")), @@ -119,50 +127,126 @@ func testDeclarativeValidate(t *testing.T, apiVersion string) { expectedErrs: field.ErrorList{field.Forbidden(field.NewPath("spec", "schedulingConstraints"), "")}, }, "valid with schedulingConstraints": { - input: mkValidPodGroup(addTopologyConstraint("foo")), - tasEnabled: true, + input: mkValidPodGroup(addTopologyConstraint("foo")), + enableTopologyAwareScheduling: true, }, "schedulingConstraints with multiple topology constraints": { - input: mkValidPodGroup(addTopologyConstraint("foo"), addTopologyConstraint("bar")), - expectedErrs: field.ErrorList{field.TooMany(field.NewPath("spec", "schedulingConstraints", "topology"), 2, 1).WithOrigin("maxItems")}, - tasEnabled: true, + input: mkValidPodGroup(addTopologyConstraint("foo"), addTopologyConstraint("bar")), + enableTopologyAwareScheduling: true, + expectedErrs: field.ErrorList{field.TooMany(field.NewPath("spec", "schedulingConstraints", "topology"), 2, 1).WithOrigin("maxItems")}, }, "valid with empty schedulingConstraints": { - input: mkValidPodGroup(setSchedulingConstraints()), - tasEnabled: true, + input: mkValidPodGroup(setSchedulingConstraints()), + enableTopologyAwareScheduling: true, }, "topologyConstraint with empty topology key": { - input: mkValidPodGroup(addTopologyConstraint("")), - expectedErrs: field.ErrorList{field.Required(field.NewPath("spec", "schedulingConstraints", "topology").Index(0).Child("key"), "")}, - tasEnabled: true, + input: mkValidPodGroup(addTopologyConstraint("")), + enableTopologyAwareScheduling: true, + expectedErrs: field.ErrorList{field.Required(field.NewPath("spec", "schedulingConstraints", "topology").Index(0).Child("key"), "")}, }, "valid with topology key with DNS prefix": { - input: mkValidPodGroup(addTopologyConstraint("example.com/Foo")), - tasEnabled: true, + input: mkValidPodGroup(addTopologyConstraint("example.com/Foo")), + enableTopologyAwareScheduling: true, }, "valid with topology key with prefix with max length": { - input: mkValidPodGroup(addTopologyConstraint(strings.Repeat("a", 253) + "/" + strings.Repeat("b", 63))), - tasEnabled: true, + input: mkValidPodGroup(addTopologyConstraint(strings.Repeat("a", 253) + "/" + strings.Repeat("b", 63))), + enableTopologyAwareScheduling: true, }, "with topology key with prefix exceending max prefix length": { - input: mkValidPodGroup(addTopologyConstraint(strings.Repeat("a", 254) + "/foo")), - expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "schedulingConstraints", "topology").Index(0).Child("key"), nil, "").WithOrigin("format=k8s-label-key")}, - tasEnabled: true, + input: mkValidPodGroup(addTopologyConstraint(strings.Repeat("a", 254) + "/foo")), + enableTopologyAwareScheduling: true, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "schedulingConstraints", "topology").Index(0).Child("key"), nil, "").WithOrigin("format=k8s-label-key")}, }, "with topology key with prefix exceending max name length": { - input: mkValidPodGroup(addTopologyConstraint("foo/" + strings.Repeat("b", 64))), - expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "schedulingConstraints", "topology").Index(0).Child("key"), nil, "").WithOrigin("format=k8s-label-key")}, - tasEnabled: true, + input: mkValidPodGroup(addTopologyConstraint("foo/" + strings.Repeat("b", 64))), + enableTopologyAwareScheduling: true, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "schedulingConstraints", "topology").Index(0).Child("key"), nil, "").WithOrigin("format=k8s-label-key")}, }, "with topology key without prefix exceeding max length": { - input: mkValidPodGroup(addTopologyConstraint(strings.Repeat("b", 64))), - expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "schedulingConstraints", "topology").Index(0).Child("key"), nil, "").WithOrigin("format=k8s-label-key")}, - tasEnabled: true, + input: mkValidPodGroup(addTopologyConstraint(strings.Repeat("b", 64))), + enableTopologyAwareScheduling: true, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "schedulingConstraints", "topology").Index(0).Child("key"), nil, "").WithOrigin("format=k8s-label-key")}, }, "with topology key with invalid characters": { - input: mkValidPodGroup(addTopologyConstraint("Example.com/Foo")), - expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "schedulingConstraints", "topology").Index(0).Child("key"), nil, "").WithOrigin("format=k8s-label-key")}, - tasEnabled: true, + input: mkValidPodGroup(addTopologyConstraint("Example.com/Foo")), + enableTopologyAwareScheduling: true, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "schedulingConstraints", "topology").Index(0).Child("key"), nil, "").WithOrigin("format=k8s-label-key")}, + }, + "pod disruption mode, workload aware preemption enabled": { + input: mkValidPodGroup(setDisruptionMode(scheduling.DisruptionModePod)), + enableWorkloadAwarePreemption: true, + }, + "pod disruption mode, workload aware preemption disabled": { + input: mkValidPodGroup(setDisruptionMode(scheduling.DisruptionModePod)), + expectedErrs: field.ErrorList{field.Forbidden(field.NewPath("spec", "disruptionMode"), "")}, + }, + "pod group disruption mode, workload aware preemption enabled": { + input: mkValidPodGroup(setDisruptionMode(scheduling.DisruptionModePodGroup)), + enableWorkloadAwarePreemption: true, + }, + "pod group disruption mode, workload aware preemption disabled": { + input: mkValidPodGroup(setDisruptionMode(scheduling.DisruptionModePodGroup)), + expectedErrs: field.ErrorList{field.Forbidden(field.NewPath("spec", "disruptionMode"), "")}, + }, + "invalid disruption mode, workload aware preemption enabled": { + input: mkValidPodGroup(setDisruptionMode("Invalid")), + enableWorkloadAwarePreemption: true, + expectedErrs: field.ErrorList{field.NotSupported(field.NewPath("spec", "disruptionMode"), "Invalid", sets.List(allowedDisruptionModes))}, + }, + "invalid disruption mode, workload aware preemption disabled": { + input: mkValidPodGroup(setDisruptionMode("Invalid")), + expectedErrs: field.ErrorList{field.Forbidden(field.NewPath("spec", "disruptionMode"), "")}, + }, + "valid pod group without disruption mode, workload aware preemption enabled": { + input: mkValidPodGroup(), + enableWorkloadAwarePreemption: true, + expectedErrs: field.ErrorList{field.Required(field.NewPath("spec", "disruptionMode"), "")}, + }, + "valid priority class name, workload aware preemption enabled": { + input: mkValidPodGroup(setDisruptionMode(scheduling.DisruptionModePod), setPriorityClassName("high-priority")), + enableWorkloadAwarePreemption: true, + }, + "valid priority class name, workload aware preemption disabled": { + input: mkValidPodGroup(setPriorityClassName("high-priority")), + expectedErrs: field.ErrorList{field.Forbidden(field.NewPath("spec", "priorityClassName"), "")}, + }, + "invalid priority class name, workload aware preemption enabled": { + input: mkValidPodGroup(setDisruptionMode(scheduling.DisruptionModePod), setPriorityClassName("high/priority")), + enableWorkloadAwarePreemption: true, + expectedErrs: field.ErrorList{ + field.Invalid(field.NewPath("spec", "priorityClassName"), nil, "").WithOrigin("format=k8s-long-name"), + }, + }, + "invalid priority class name, workload aware preemption disabled": { + input: mkValidPodGroup(setPriorityClassName("high/priority")), + expectedErrs: field.ErrorList{field.Forbidden(field.NewPath("spec", "priorityClassName"), "")}, + }, + "valid priority, workload aware preemption enabled": { + input: mkValidPodGroup(setDisruptionMode(scheduling.DisruptionModePod), setPriority(1000)), + enableWorkloadAwarePreemption: true, + }, + "valid priority, workload aware preemption disabled": { + input: mkValidPodGroup(setPriority(1000)), + expectedErrs: field.ErrorList{field.Forbidden(field.NewPath("spec", "priority"), "")}, + }, + "valid negative priority, workload aware preemption enabled": { + input: mkValidPodGroup(setDisruptionMode(scheduling.DisruptionModePod), setPriority(-2147483648)), + enableWorkloadAwarePreemption: true, + }, + "valid negative priority, workload aware preemption disabled": { + input: mkValidPodGroup(setPriority(-2147483648)), + expectedErrs: field.ErrorList{field.Forbidden(field.NewPath("spec", "priority"), "")}, + }, + "too high priority, workload aware preemption enabled": { + input: mkValidPodGroup(setDisruptionMode(scheduling.DisruptionModePod), setPriority(scheduling.HighestUserDefinablePriority+1)), + enableWorkloadAwarePreemption: true, + expectedErrs: field.ErrorList{ + field.Invalid(field.NewPath("spec", "priority"), nil, "").WithOrigin("maximum"), + }, + }, + "too high priority, workload aware preemption disabled": { + input: mkValidPodGroup(setPriority(scheduling.HighestUserDefinablePriority + 1)), + expectedErrs: field.ErrorList{field.Forbidden(field.NewPath("spec", "priority"), "")}, }, "ok resourceClaimName reference": { input: mkValidPodGroup(addResourceClaims(scheduling.PodGroupResourceClaim{Name: "claim", ResourceClaimName: new("resource-claim")})), @@ -251,8 +335,11 @@ func testDeclarativeValidate(t *testing.T, apiVersion string) { for k, tc := range testCases { t.Run(k, func(t *testing.T) { featuregatetesting.SetFeatureGatesDuringTest(t, utilfeature.DefaultFeatureGate, featuregatetesting.FeatureOverrides{ - features.GenericWorkload: tc.tasEnabled, - features.TopologyAwareWorkloadScheduling: tc.tasEnabled, + features.GenericWorkload: true, + features.TopologyAwareWorkloadScheduling: tc.enableTopologyAwareScheduling, + features.DRAWorkloadResourceClaims: tc.enableDRAWorkloadResourceClaims, + features.GangScheduling: tc.enableWorkloadAwarePreemption, + features.WorkloadAwarePreemption: tc.enableWorkloadAwarePreemption, }) apitesting.VerifyValidationEquivalence(t, ctx, &tc.input, strategy.Validate, tc.expectedErrs, apitesting.WithMinEmulationVersion(version.MustParse("1.36"))) }) @@ -279,10 +366,12 @@ func testDeclarativeValidateUpdate(t *testing.T, apiVersion string) { Verb: "update", }) testCases := map[string]struct { - oldObj scheduling.PodGroup - updateObj scheduling.PodGroup - expectedErrs field.ErrorList - tasEnabled bool + oldObj scheduling.PodGroup + updateObj scheduling.PodGroup + enableTopologyAwareScheduling bool + enableDRAWorkloadResourceClaims bool + enableWorkloadAwarePreemption bool + expectedErrs field.ErrorList }{ "valid update": { oldObj: mkValidPodGroup(setResourceVersion("1")), @@ -292,90 +381,189 @@ func testDeclarativeValidateUpdate(t *testing.T, apiVersion string) { oldObj: mkValidPodGroup(setResourceVersion("1")), updateObj: mkValidPodGroup(setResourceVersion("1"), setEmptyPodGroupTemplateRef()), expectedErrs: field.ErrorList{ - field.Invalid(field.NewPath("spec", "podGroupTemplateRef"), nil, "field is immutable").WithOrigin("immutable").MarkAlpha(), + field.Invalid(field.NewPath("spec", "podGroupTemplateRef"), nil, "").WithOrigin("immutable"), }, }, "invalid update unset podGroupTemplateRef": { oldObj: mkValidPodGroup(setResourceVersion("1")), updateObj: mkValidPodGroup(setResourceVersion("1"), unsetPodGroupTemplateRef()), expectedErrs: field.ErrorList{ - field.Invalid(field.NewPath("spec", "podGroupTemplateRef"), nil, "field is immutable").WithOrigin("immutable").MarkAlpha(), + field.Invalid(field.NewPath("spec", "podGroupTemplateRef"), nil, "").WithOrigin("immutable"), }, }, - "invalid update setPodGroupTemplateRef": { + "invalid update change podGroupTemplateRef template name": { oldObj: mkValidPodGroup(setResourceVersion("1")), - updateObj: mkValidPodGroup(setResourceVersion("1"), setPodGroupTemplateRef("other-template", "other-workload")), + updateObj: mkValidPodGroup(setResourceVersion("1"), setPodGroupTemplateRef("other-template", "workload")), expectedErrs: field.ErrorList{ - field.Invalid(field.NewPath("spec", "podGroupTemplateRef"), nil, "field is immutable").WithOrigin("immutable").MarkAlpha(), + field.Invalid(field.NewPath("spec", "podGroupTemplateRef"), nil, "").WithOrigin("immutable"), + }, + }, + "invalid update change podGroupTemplateRef workload name": { + oldObj: mkValidPodGroup(setResourceVersion("1")), + updateObj: mkValidPodGroup(setResourceVersion("1"), setPodGroupTemplateRef("template", "other-workload")), + expectedErrs: field.ErrorList{ + field.Invalid(field.NewPath("spec", "podGroupTemplateRef"), nil, "").WithOrigin("immutable"), }, }, "invalid update with neither basic nor gang": { oldObj: mkValidPodGroup(setResourceVersion("1")), updateObj: mkValidPodGroup(setResourceVersion("1"), clearPodGroupPolicy()), expectedErrs: field.ErrorList{ - field.Invalid(field.NewPath("spec", "schedulingPolicy"), nil, "field is immutable").WithOrigin("immutable").MarkAlpha(), + field.Invalid(field.NewPath("spec", "schedulingPolicy"), nil, "").WithOrigin("immutable"), }, }, "invalid update with both basic and gang": { oldObj: mkValidPodGroup(setResourceVersion("1")), updateObj: mkValidPodGroup(setResourceVersion("1"), setBothPolicies()), expectedErrs: field.ErrorList{ - field.Invalid(field.NewPath("spec", "schedulingPolicy"), nil, "field is immutable").WithOrigin("immutable").MarkAlpha(), + field.Invalid(field.NewPath("spec", "schedulingPolicy"), nil, "").WithOrigin("immutable"), + }, + }, + "invalid update of gang minCount": { + oldObj: mkValidPodGroup(setResourceVersion("1")), + updateObj: mkValidPodGroup(setResourceVersion("1"), setPodGroupMinCount(10)), + expectedErrs: field.ErrorList{ + field.Invalid(field.NewPath("spec", "schedulingPolicy"), nil, "").WithOrigin("immutable"), }, }, "invalid update from gang to basic policy": { oldObj: mkValidPodGroup(setResourceVersion("1")), updateObj: mkValidPodGroup(setResourceVersion("1"), setBasicPolicy()), - expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "schedulingPolicy"), nil, "field is immutable").WithOrigin("immutable").MarkAlpha()}, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "schedulingPolicy"), nil, "").WithOrigin("immutable")}, }, "valid update with unchanged scheduling constraints and TAS disabled": { oldObj: mkValidPodGroup(setResourceVersion("1"), addTopologyConstraint("foo")), updateObj: mkValidPodGroup(setResourceVersion("1"), addTopologyConstraint("foo")), }, "valid update with unchanged scheduling constraints": { - oldObj: mkValidPodGroup(setResourceVersion("1"), addTopologyConstraint("foo")), - updateObj: mkValidPodGroup(setResourceVersion("1"), addTopologyConstraint("foo")), - tasEnabled: true, + oldObj: mkValidPodGroup(setResourceVersion("1"), addTopologyConstraint("foo")), + updateObj: mkValidPodGroup(setResourceVersion("1"), addTopologyConstraint("foo")), + enableTopologyAwareScheduling: true, }, "invalid update to scheduling constraints": { - oldObj: mkValidPodGroup(setResourceVersion("1"), addTopologyConstraint("foo")), - updateObj: mkValidPodGroup(setResourceVersion("1"), setSchedulingConstraints()), - expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "schedulingConstraints"), nil, "field is immutable").WithOrigin("immutable")}, - tasEnabled: true, + oldObj: mkValidPodGroup(setResourceVersion("1"), addTopologyConstraint("foo")), + updateObj: mkValidPodGroup(setResourceVersion("1"), setSchedulingConstraints()), + enableTopologyAwareScheduling: true, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "schedulingConstraints"), nil, "field is immutable").WithOrigin("immutable")}, }, "invalid update to topology constraints": { - oldObj: mkValidPodGroup(setResourceVersion("1"), addTopologyConstraint("foo")), - updateObj: mkValidPodGroup(setResourceVersion("1"), addTopologyConstraint("foo"), addTopologyConstraint("bar")), - expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "schedulingConstraints"), nil, "field is immutable").WithOrigin("immutable")}, - tasEnabled: true, + oldObj: mkValidPodGroup(setResourceVersion("1"), addTopologyConstraint("foo")), + updateObj: mkValidPodGroup(setResourceVersion("1"), addTopologyConstraint("foo"), addTopologyConstraint("bar")), + enableTopologyAwareScheduling: true, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "schedulingConstraints"), nil, "field is immutable").WithOrigin("immutable")}, }, "invalid update to topology key": { - oldObj: mkValidPodGroup(setResourceVersion("1"), addTopologyConstraint("foo")), - updateObj: mkValidPodGroup(setResourceVersion("1"), addTopologyConstraint("bar")), - expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "schedulingConstraints"), nil, "field is immutable").WithOrigin("immutable")}, - tasEnabled: true, + oldObj: mkValidPodGroup(setResourceVersion("1"), addTopologyConstraint("foo")), + updateObj: mkValidPodGroup(setResourceVersion("1"), addTopologyConstraint("bar")), + enableTopologyAwareScheduling: true, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "schedulingConstraints"), nil, "field is immutable").WithOrigin("immutable")}, }, "invalid update to scheduling constraints with TAS disabled": { oldObj: mkValidPodGroup(setResourceVersion("1"), addTopologyConstraint("foo")), updateObj: mkValidPodGroup(setResourceVersion("1"), setSchedulingConstraints()), - expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "schedulingConstraints"), nil, "field is immutable").WithOrigin("immutable")}, + expectedErrs: field.ErrorList{field.Forbidden(field.NewPath("spec", "schedulingConstraints"), "")}, }, "invalid update to topology constraints with TAS disabled": { oldObj: mkValidPodGroup(setResourceVersion("1"), addTopologyConstraint("foo")), updateObj: mkValidPodGroup(setResourceVersion("1"), addTopologyConstraint("foo"), addTopologyConstraint("bar")), - expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "schedulingConstraints"), nil, "field is immutable").WithOrigin("immutable")}, + expectedErrs: field.ErrorList{field.Forbidden(field.NewPath("spec", "schedulingConstraints"), "")}, }, "invalid update to topology key with TAS disabled": { oldObj: mkValidPodGroup(setResourceVersion("1"), addTopologyConstraint("foo")), updateObj: mkValidPodGroup(setResourceVersion("1"), addTopologyConstraint("bar")), - expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "schedulingConstraints"), nil, "field is immutable").WithOrigin("immutable")}, + expectedErrs: field.ErrorList{field.Forbidden(field.NewPath("spec", "schedulingConstraints"), "")}, + }, + "invalid add of resource claims, DRA workload resource claims disabled": { + oldObj: mkValidPodGroup(setResourceVersion("1")), + updateObj: mkValidPodGroup(setResourceVersion("1"), addResourceClaims( + scheduling.PodGroupResourceClaim{Name: "my-claim", ResourceClaimTemplateName: new("my-template")}, + )), + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "resourceClaims"), nil, "").WithOrigin("immutable")}, + }, + "invalid add of resource claims, DRA workload resource claims enabled": { + oldObj: mkValidPodGroup(setResourceVersion("1")), + updateObj: mkValidPodGroup(setResourceVersion("1"), addResourceClaims( + scheduling.PodGroupResourceClaim{Name: "my-claim", ResourceClaimTemplateName: new("my-template")}, + )), + enableDRAWorkloadResourceClaims: true, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "resourceClaims"), nil, "").WithOrigin("immutable")}, + }, + "invalid update of resource claims, DRA workload resource claims disabled": { + oldObj: mkValidPodGroup(setResourceVersion("1"), addResourceClaims( + scheduling.PodGroupResourceClaim{Name: "my-claim", ResourceClaimTemplateName: new("my-template")}, + )), + updateObj: mkValidPodGroup(setResourceVersion("1"), addResourceClaims( + scheduling.PodGroupResourceClaim{Name: "my-other-claim", ResourceClaimTemplateName: new("my-template")}, + )), + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "resourceClaims"), nil, "").WithOrigin("immutable")}, + }, + "invalid update of resource claims, DRA workload resource claims enabled": { + oldObj: mkValidPodGroup(setResourceVersion("1"), addResourceClaims( + scheduling.PodGroupResourceClaim{Name: "my-claim", ResourceClaimTemplateName: new("my-template")}, + )), + updateObj: mkValidPodGroup(setResourceVersion("1"), addResourceClaims( + scheduling.PodGroupResourceClaim{Name: "my-other-claim", ResourceClaimTemplateName: new("my-template")}, + )), + enableDRAWorkloadResourceClaims: true, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "resourceClaims"), nil, "").WithOrigin("immutable")}, + }, + "invalid remove of resource claims, DRA workload resource claims disabled": { + oldObj: mkValidPodGroup(setResourceVersion("1"), addResourceClaims( + scheduling.PodGroupResourceClaim{Name: "my-claim", ResourceClaimTemplateName: new("my-template")}, + )), + updateObj: mkValidPodGroup(setResourceVersion("1")), + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "resourceClaims"), nil, "").WithOrigin("immutable")}, + }, + "invalid remove of resource claims, DRA workload resource claims enabled": { + oldObj: mkValidPodGroup(setResourceVersion("1"), addResourceClaims( + scheduling.PodGroupResourceClaim{Name: "my-claim", ResourceClaimTemplateName: new("my-template")}, + )), + updateObj: mkValidPodGroup(setResourceVersion("1")), + enableDRAWorkloadResourceClaims: true, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "resourceClaims"), nil, "").WithOrigin("immutable")}, + }, + "invalid update of disruption mode, workload aware preemption enabled": { + oldObj: mkValidPodGroup(setResourceVersion("1"), setDisruptionMode(scheduling.DisruptionModePod)), + updateObj: mkValidPodGroup(setResourceVersion("1"), setDisruptionMode(scheduling.DisruptionModePodGroup)), + enableWorkloadAwarePreemption: true, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "disruptionMode"), nil, "").WithOrigin("immutable")}, + }, + "invalid update of disruption mode, workload aware preemption disabled": { + oldObj: mkValidPodGroup(setResourceVersion("1"), setDisruptionMode(scheduling.DisruptionModePod)), + updateObj: mkValidPodGroup(setResourceVersion("1"), setDisruptionMode(scheduling.DisruptionModePodGroup)), + expectedErrs: field.ErrorList{field.Forbidden(field.NewPath("spec", "disruptionMode"), "")}, + }, + "invalid update of priority class name, workload aware preemption enabled": { + oldObj: mkValidPodGroup(setResourceVersion("1"), setPriorityClassName("low-priority")), + updateObj: mkValidPodGroup(setResourceVersion("1"), setPriorityClassName("high-priority")), + enableWorkloadAwarePreemption: true, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "priorityClassName"), nil, "").WithOrigin("immutable")}, + }, + "invalid update of priority class name, workload aware preemption disabled": { + oldObj: mkValidPodGroup(setResourceVersion("1"), setPriorityClassName("low-priority")), + updateObj: mkValidPodGroup(setResourceVersion("1"), setPriorityClassName("high-priority")), + expectedErrs: field.ErrorList{field.Forbidden(field.NewPath("spec", "priorityClassName"), "")}, + }, + "invalid update of priority, workload aware preemption enabled": { + oldObj: mkValidPodGroup(setResourceVersion("1"), setPriority(1000)), + updateObj: mkValidPodGroup(setResourceVersion("1"), setPriority(2000)), + enableWorkloadAwarePreemption: true, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "priority"), nil, "").WithOrigin("immutable")}, + }, + "invalid update of priority, workload aware preemption disabled": { + oldObj: mkValidPodGroup(setResourceVersion("1"), setPriority(1000)), + updateObj: mkValidPodGroup(setResourceVersion("1"), setPriority(2000)), + expectedErrs: field.ErrorList{field.Forbidden(field.NewPath("spec", "priority"), "")}, }, } for k, tc := range testCases { t.Run(k, func(t *testing.T) { featuregatetesting.SetFeatureGatesDuringTest(t, utilfeature.DefaultFeatureGate, featuregatetesting.FeatureOverrides{ - features.GenericWorkload: tc.tasEnabled, - features.TopologyAwareWorkloadScheduling: tc.tasEnabled, + features.GenericWorkload: true, + features.TopologyAwareWorkloadScheduling: tc.enableTopologyAwareScheduling, + features.DRAWorkloadResourceClaims: tc.enableDRAWorkloadResourceClaims, + features.GangScheduling: tc.enableWorkloadAwarePreemption, + features.WorkloadAwarePreemption: tc.enableWorkloadAwarePreemption, }) strategy := NewStrategy() apitesting.VerifyUpdateValidationEquivalence(t, ctx, &tc.updateObj, &tc.oldObj, strategy.ValidateUpdate, tc.expectedErrs, apitesting.WithMinEmulationVersion(version.MustParse("1.36"))) @@ -626,3 +814,21 @@ func addTopologyConstraint(value string) func(obj *scheduling.PodGroup) { obj.Spec.SchedulingConstraints.Topology = append(obj.Spec.SchedulingConstraints.Topology, scheduling.TopologyConstraint{Key: value}) } } + +func setDisruptionMode(mode scheduling.DisruptionMode) func(obj *scheduling.PodGroup) { + return func(obj *scheduling.PodGroup) { + obj.Spec.DisruptionMode = &mode + } +} + +func setPriorityClassName(priorityClassName string) func(obj *scheduling.PodGroup) { + return func(obj *scheduling.PodGroup) { + obj.Spec.PriorityClassName = priorityClassName + } +} + +func setPriority(priority int32) func(obj *scheduling.PodGroup) { + return func(obj *scheduling.PodGroup) { + obj.Spec.Priority = new(priority) + } +} diff --git a/pkg/registry/scheduling/podgroup/strategy.go b/pkg/registry/scheduling/podgroup/strategy.go index 42c3b7986ff..ef86023e167 100644 --- a/pkg/registry/scheduling/podgroup/strategy.go +++ b/pkg/registry/scheduling/podgroup/strategy.go @@ -75,9 +75,15 @@ func (*podGroupStrategy) Validate(ctx context.Context, obj runtime.Object) field podGroup := obj.(*scheduling.PodGroup) allErrs := validation.ValidatePodGroup(podGroup) opts := []string{} - if schedulingConstraintsInUse(nil) { + if utilfeature.DefaultFeatureGate.Enabled(features.TopologyAwareWorkloadScheduling) { opts = append(opts, string(features.TopologyAwareWorkloadScheduling)) } + if utilfeature.DefaultFeatureGate.Enabled(features.DRAWorkloadResourceClaims) { + opts = append(opts, string(features.DRAWorkloadResourceClaims)) + } + if utilfeature.DefaultFeatureGate.Enabled(features.WorkloadAwarePreemption) { + opts = append(opts, string(features.WorkloadAwarePreemption)) + } return rest.ValidateDeclarativelyWithMigrationChecks(ctx, legacyscheme.Scheme, obj, nil, allErrs, operation.Create, rest.WithDeclarativeEnforcement(), rest.WithOptions(opts)) } @@ -103,9 +109,19 @@ func (*podGroupStrategy) ValidateUpdate(ctx context.Context, obj, old runtime.Ob oldPodGroup := old.(*scheduling.PodGroup) allErrs := validation.ValidatePodGroupUpdate(newPodGroup, oldPodGroup) opts := []string{} - if schedulingConstraintsInUse(oldPodGroup) { + // Declarative validation will always allow fields to remain unchanged, so if any + // of the fields which are covered by these gates are set, we will not re-validate them + // (even if the gates are disabled) as long as they do not change values. If a gate + // is disabled, they will not be allowed to change values. + if utilfeature.DefaultFeatureGate.Enabled(features.TopologyAwareWorkloadScheduling) { opts = append(opts, string(features.TopologyAwareWorkloadScheduling)) } + if utilfeature.DefaultFeatureGate.Enabled(features.DRAWorkloadResourceClaims) { + opts = append(opts, string(features.DRAWorkloadResourceClaims)) + } + if utilfeature.DefaultFeatureGate.Enabled(features.WorkloadAwarePreemption) { + opts = append(opts, string(features.WorkloadAwarePreemption)) + } return rest.ValidateDeclarativelyWithMigrationChecks(ctx, legacyscheme.Scheme, newPodGroup, oldPodGroup, allErrs, operation.Update, rest.WithDeclarativeEnforcement(), rest.WithOptions(opts)) } @@ -160,46 +176,104 @@ func (*podGroupStatusStrategy) WarningsOnUpdate(ctx context.Context, obj, old ru } // dropDisabledPodGroupFields removes fields which are covered by a feature gate. -func dropDisabledPodGroupFields(newPodGroup, oldPodGroup *scheduling.PodGroup) { - dropDisabledSchedulingConstraintsFields(newPodGroup, oldPodGroup) - dropDisabledDRAWorkloadResourceClaimsFields(newPodGroup, oldPodGroup) +func dropDisabledPodGroupFields(podGroup, oldPodGroup *scheduling.PodGroup) { + var podGroupSpec, oldPodGroupSpec *scheduling.PodGroupSpec + if podGroup != nil { + podGroupSpec = &podGroup.Spec + } + if oldPodGroup != nil { + oldPodGroupSpec = &oldPodGroup.Spec + } + dropDisabledPodGroupSpecFields(podGroupSpec, oldPodGroupSpec) +} + +func dropDisabledPodGroupSpecFields(podGroupSpec, oldPodGroupSpec *scheduling.PodGroupSpec) { + dropDisabledSchedulingConstraintsFields(podGroupSpec, oldPodGroupSpec) + dropDisabledDRAWorkloadResourceClaimsFields(podGroupSpec, oldPodGroupSpec) + dropDisabledDisruptionModeField(podGroupSpec, oldPodGroupSpec) + dropDisabledPriorityClassNameField(podGroupSpec, oldPodGroupSpec) + dropDisabledPriorityField(podGroupSpec, oldPodGroupSpec) } func dropDisabledPodGroupStatusFields(newPodGroup, oldPodGroup *scheduling.PodGroup) { - if newPodGroup == nil { + var oldPodGroupSpec *scheduling.PodGroupSpec + if oldPodGroup != nil { + oldPodGroupSpec = &oldPodGroup.Spec + } + if utilfeature.DefaultFeatureGate.Enabled(features.DRAWorkloadResourceClaims) || draWorkloadResourceClaimsInUse(oldPodGroupSpec) { + // No need to drop anything. return } - - if !utilfeature.DefaultFeatureGate.Enabled(features.DRAWorkloadResourceClaims) && !draWorkloadResourceClaimsInUse(oldPodGroup) { - newPodGroup.Status.ResourceClaimStatuses = nil - } + newPodGroup.Status.ResourceClaimStatuses = nil } // dropDisabledSchedulingConstraintsFields drops the SchedulingConstraints field // from the new PodGroup if the TopologyAwareWorkloadScheduling feature gate is disabled // and it was not used in the old PodGroup. -func dropDisabledSchedulingConstraintsFields(newPodGroup, oldPodGroup *scheduling.PodGroup) { - if schedulingConstraintsInUse(oldPodGroup) { +func dropDisabledSchedulingConstraintsFields(podGroupSpec, oldPodGroupSpec *scheduling.PodGroupSpec) { + if utilfeature.DefaultFeatureGate.Enabled(features.TopologyAwareWorkloadScheduling) || schedulingConstraintsInUse(oldPodGroupSpec) { // No need to drop anything. return } - newPodGroup.Spec.SchedulingConstraints = nil -} - -func schedulingConstraintsInUse(pg *scheduling.PodGroup) bool { - return utilfeature.DefaultFeatureGate.Enabled(features.TopologyAwareWorkloadScheduling) || (pg != nil && pg.Spec.SchedulingConstraints != nil) + podGroupSpec.SchedulingConstraints = nil } // dropDisabledDRAWorkloadResourceClaimsFields removes resource claim references // unless they are already used by the old PodGroup spec. -func dropDisabledDRAWorkloadResourceClaimsFields(podGroup, oldPodGroup *scheduling.PodGroup) { - if draWorkloadResourceClaimsInUse(oldPodGroup) { +func dropDisabledDRAWorkloadResourceClaimsFields(podGroupSpec, oldPodGroupSpec *scheduling.PodGroupSpec) { + if utilfeature.DefaultFeatureGate.Enabled(features.DRAWorkloadResourceClaims) || draWorkloadResourceClaimsInUse(oldPodGroupSpec) { // No need to drop anything. return } - podGroup.Spec.ResourceClaims = nil + podGroupSpec.ResourceClaims = nil } -func draWorkloadResourceClaimsInUse(pg *scheduling.PodGroup) bool { - return utilfeature.DefaultFeatureGate.Enabled(features.DRAWorkloadResourceClaims) || (pg != nil && len(pg.Spec.ResourceClaims) > 0) +// dropDisabledDisruptionModeField removes the DisruptionMode field unless it is +// already used in the old PodGroup spec. +func dropDisabledDisruptionModeField(podGroupSpec, oldPodGroupSpec *scheduling.PodGroupSpec) { + if utilfeature.DefaultFeatureGate.Enabled(features.WorkloadAwarePreemption) || disruptionModeInUse(oldPodGroupSpec) { + // No need to drop anything. + return + } + podGroupSpec.DisruptionMode = nil +} + +// dropDisabledPriorityClassNameField removes the PriorityClassName field unless +// it is already used in the old PodGroup spec. +func dropDisabledPriorityClassNameField(podGroupSpec, oldPodGroupSpec *scheduling.PodGroupSpec) { + if utilfeature.DefaultFeatureGate.Enabled(features.WorkloadAwarePreemption) || priorityClassNameInUse(oldPodGroupSpec) { + // No need to drop anything. + return + } + podGroupSpec.PriorityClassName = "" +} + +// dropDisabledPriorityField removes the Priority field unless it is already used +// in the old PodGroup spec. +func dropDisabledPriorityField(podGroupSpec, oldPodGroupSpec *scheduling.PodGroupSpec) { + if utilfeature.DefaultFeatureGate.Enabled(features.WorkloadAwarePreemption) || priorityInUse(oldPodGroupSpec) { + // No need to drop anything. + return + } + podGroupSpec.Priority = nil +} + +func schedulingConstraintsInUse(podGroupSpec *scheduling.PodGroupSpec) bool { + return podGroupSpec != nil && podGroupSpec.SchedulingConstraints != nil +} + +func draWorkloadResourceClaimsInUse(podGroupSpec *scheduling.PodGroupSpec) bool { + return podGroupSpec != nil && len(podGroupSpec.ResourceClaims) > 0 +} + +func disruptionModeInUse(podGroupSpec *scheduling.PodGroupSpec) bool { + return podGroupSpec != nil && podGroupSpec.DisruptionMode != nil +} + +func priorityClassNameInUse(podGroupSpec *scheduling.PodGroupSpec) bool { + return podGroupSpec != nil && podGroupSpec.PriorityClassName != "" +} + +func priorityInUse(podGroupSpec *scheduling.PodGroupSpec) bool { + return podGroupSpec != nil && podGroupSpec.Priority != nil } diff --git a/pkg/registry/scheduling/podgroup/strategy_test.go b/pkg/registry/scheduling/podgroup/strategy_test.go index 325fb76606b..c7f33fb412c 100644 --- a/pkg/registry/scheduling/podgroup/strategy_test.go +++ b/pkg/registry/scheduling/podgroup/strategy_test.go @@ -18,12 +18,10 @@ package podgroup import ( "context" - "fmt" "strings" "testing" "github.com/google/go-cmp/cmp" - "github.com/stretchr/testify/assert" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" genericapirequest "k8s.io/apiserver/pkg/endpoints/request" utilfeature "k8s.io/apiserver/pkg/util/feature" @@ -52,29 +50,22 @@ var podGroup = &scheduling.PodGroup{ }, } -var podGroupWithSchedulingConstraints = &scheduling.PodGroup{ - ObjectMeta: metav1.ObjectMeta{ - Name: "foo", - Namespace: metav1.NamespaceDefault, - }, - Spec: scheduling.PodGroupSpec{ - PodGroupTemplateRef: &scheduling.PodGroupTemplateReference{ - Workload: &scheduling.WorkloadPodGroupTemplateReference{ - WorkloadName: "w", - PodGroupTemplateName: "t", - }, - }, - SchedulingPolicy: scheduling.PodGroupSchedulingPolicy{ - Gang: &scheduling.GangSchedulingPolicy{ - MinCount: 5, - }, - }, - SchedulingConstraints: &scheduling.PodGroupSchedulingConstraints{ - Topology: []scheduling.TopologyConstraint{ - {Key: "foo"}, - }, - }, - }, +func podGroupWithSchedulingConstraints(keys ...string) *scheduling.PodGroup { + pg := podGroup.DeepCopy() + pg.Spec.SchedulingConstraints = &scheduling.PodGroupSchedulingConstraints{ + Topology: []scheduling.TopologyConstraint{}, + } + for _, key := range keys { + constraint := scheduling.TopologyConstraint{Key: key} + pg.Spec.SchedulingConstraints.Topology = append(pg.Spec.SchedulingConstraints.Topology, constraint) + } + return pg +} + +func podGroupWithDisruptionMode(mode scheduling.DisruptionMode) *scheduling.PodGroup { + pg := podGroup.DeepCopy() + pg.Spec.DisruptionMode = &mode + return pg } var ( @@ -82,6 +73,11 @@ var ( minCountError = "must be greater than or equal to 1" oneOfError = "must specify one of: `basic`, `gang`" multipleFieldsSetError = "must specify exactly one of: `basic`, `gang`" + tooManyItemsError = "must have at most 1 item" + maximumError = "must be less than or equal to 1000000000" + subdomainNameError = "lowercase RFC 1123 subdomain must consist of lower case alphanumeric characters" + forbiddenError = "Forbidden" + supportedModesError = `supported values: "Pod", "PodGroup"` ) func TestStrategy(t *testing.T) { @@ -106,13 +102,13 @@ func ctxWithRequestInfo() context.Context { func TestStrategyCreate(t *testing.T) { ctx := ctxWithRequestInfo() now := metav1.Now() - type testCase struct { - obj *scheduling.PodGroup - expectObj *scheduling.PodGroup - expectValidationError string - tasEnabled bool - } - testCases := map[string]testCase{ + testCases := map[string]struct { + obj *scheduling.PodGroup + expectObj *scheduling.PodGroup + enableTopologyAwareScheduling bool + enableWorkloadAwarePreemption bool + expectValidationError string + }{ "simple": { obj: podGroup, expectObj: podGroup, @@ -157,51 +153,116 @@ func TestStrategyCreate(t *testing.T) { }(), expectObj: podGroup, }, - "multiple topology constraints": { - obj: func() *scheduling.PodGroup { - newPodGroup := podGroupWithSchedulingConstraints.DeepCopy() - newPodGroup.Spec.SchedulingConstraints.Topology = []scheduling.TopologyConstraint{ - {Key: "foo"}, - {Key: "bar"}, - } - return newPodGroup - }(), - expectValidationError: "must have at most 1 item", - tasEnabled: true, + "multiple topology constraints, topology aware scheduling enabled": { + obj: podGroupWithSchedulingConstraints("foo", "bar"), + enableTopologyAwareScheduling: true, + expectValidationError: tooManyItemsError, }, - "invalid topology key": { - obj: func() *scheduling.PodGroup { - newPodGroup := podGroupWithSchedulingConstraints.DeepCopy() - newPodGroup.Spec.SchedulingConstraints.Topology[0].Key = "foo-" - return newPodGroup - }(), - expectValidationError: "Invalid value: \"foo-\"", - tasEnabled: true, + "multiple topology constraints, topology aware scheduling disabled": { + obj: podGroupWithSchedulingConstraints("foo", "bar"), + expectObj: podGroup, + }, + "invalid topology key, topology aware scheduling enabled": { + obj: podGroupWithSchedulingConstraints("foo-"), + enableTopologyAwareScheduling: true, + expectValidationError: "Invalid value: \"foo-\"", + }, + "invalid topology key, topology aware scheduling disabled": { + obj: podGroupWithSchedulingConstraints("foo-"), + expectObj: podGroup, }, "with TAS feature gate disabled, drop scheduling constraints on creation": { - obj: podGroupWithSchedulingConstraints.DeepCopy(), - expectObj: podGroup, - tasEnabled: false, + obj: podGroupWithSchedulingConstraints("foo-"), + expectObj: podGroup, + }, + "workload aware preemption disabled - drop disruption mode": { + obj: podGroupWithDisruptionMode(scheduling.DisruptionModePod), + expectObj: podGroup, + }, + "workload aware preemption enabled - preserve disruption mode (pod)": { + obj: podGroupWithDisruptionMode(scheduling.DisruptionModePod), + expectObj: podGroupWithDisruptionMode(scheduling.DisruptionModePod), + enableWorkloadAwarePreemption: true, + }, + "workload aware preemption enabled - preserve disruption mode (pod group)": { + obj: podGroupWithDisruptionMode(scheduling.DisruptionModePodGroup), + expectObj: podGroupWithDisruptionMode(scheduling.DisruptionModePodGroup), + enableWorkloadAwarePreemption: true, + }, + "workload aware preemption enabled - unknown disruption mode": { + obj: podGroupWithDisruptionMode(scheduling.DisruptionMode("Invalid")), + enableWorkloadAwarePreemption: true, + expectValidationError: supportedModesError, + }, + "workload aware preemption disabled - drop priorityClassName": { + obj: func() *scheduling.PodGroup { + pg := podGroupWithDisruptionMode(scheduling.DisruptionModePod) + pg.Spec.PriorityClassName = "high-priority" + return pg + }(), + expectObj: podGroup, + }, + "workload aware preemption enabled - invalid priorityClassName": { + obj: func() *scheduling.PodGroup { + pg := podGroupWithDisruptionMode(scheduling.DisruptionModePod) + pg.Spec.PriorityClassName = "invalid/priority/class/name" + return pg + }(), + enableWorkloadAwarePreemption: true, + expectValidationError: subdomainNameError, + }, + "workload aware preemption enabled - preserve priorityClassName": { + obj: func() *scheduling.PodGroup { + pg := podGroupWithDisruptionMode(scheduling.DisruptionModePod) + pg.Spec.PriorityClassName = "high-priority" + return pg + }(), + expectObj: func() *scheduling.PodGroup { + pg := podGroupWithDisruptionMode(scheduling.DisruptionModePod) + pg.Spec.PriorityClassName = "high-priority" + return pg + }(), + enableWorkloadAwarePreemption: true, + }, + "workload aware preemption disabled - drop priority": { + obj: func() *scheduling.PodGroup { + pg := podGroupWithDisruptionMode(scheduling.DisruptionModePod) + pg.Spec.Priority = new(int32(1000)) + return pg + }(), + expectObj: podGroup, + }, + "workload aware preemption enabled - preserve priority": { + obj: func() *scheduling.PodGroup { + pg := podGroupWithDisruptionMode(scheduling.DisruptionModePod) + pg.Spec.Priority = new(int32(1000)) + return pg + }(), + expectObj: func() *scheduling.PodGroup { + pg := podGroupWithDisruptionMode(scheduling.DisruptionModePod) + pg.Spec.Priority = new(int32(1000)) + return pg + }(), + enableWorkloadAwarePreemption: true, + }, + "workload aware preemption enabled - too high priority": { + obj: func() *scheduling.PodGroup { + pg := podGroupWithDisruptionMode(scheduling.DisruptionModePod) + pg.Spec.Priority = new(int32(scheduling.HighestUserDefinablePriority + 1)) + return pg + }(), + enableWorkloadAwarePreemption: true, + expectValidationError: maximumError, }, } - allTestCases := make(map[string]testCase) for name, tc := range testCases { - allTestCases[name] = tc - if tc.tasEnabled { - newTc := testCase{ - obj: tc.obj, - expectObj: podGroup, - } - allTestCases[fmt.Sprintf("drops scheduling constraints, originally %s", name)] = newTc - } - } - - for name, tc := range allTestCases { t.Run(name, func(t *testing.T) { featuregatetesting.SetFeatureGatesDuringTest(t, utilfeature.DefaultFeatureGate, featuregatetesting.FeatureOverrides{ - features.GenericWorkload: tc.tasEnabled, - features.TopologyAwareWorkloadScheduling: tc.tasEnabled, + features.GenericWorkload: true, + features.TopologyAwareWorkloadScheduling: tc.enableTopologyAwareScheduling, + features.GangScheduling: tc.enableWorkloadAwarePreemption, + features.WorkloadAwarePreemption: tc.enableWorkloadAwarePreemption, }) podGroup := tc.obj.DeepCopy() @@ -225,6 +286,7 @@ func TestStrategyCreate(t *testing.T) { if warnings := strategy.WarningsOnCreate(ctx, podGroup); len(warnings) != 0 { t.Fatalf("unexpected warnings: %q", warnings) } + strategy.Canonicalize(podGroup) if tc.expectObj != nil { if diff := cmp.Diff(tc.expectObj, podGroup); diff != "" { t.Errorf("got unexpected podGroup object (-want, +got): %s", diff) @@ -237,10 +299,11 @@ func TestStrategyCreate(t *testing.T) { func TestStrategyUpdate(t *testing.T) { ctx := ctxWithRequestInfo() testCases := map[string]struct { - oldObj *scheduling.PodGroup - newObj *scheduling.PodGroup - expectValidationError string - tasEnabled bool + oldObj *scheduling.PodGroup + newObj *scheduling.PodGroup + enableTopologyAwareScheduling bool + enableWorkloadAwarePreemption bool + expectValidationError string }{ "no changes": { oldObj: podGroup, @@ -290,69 +353,104 @@ func TestStrategyUpdate(t *testing.T) { expectValidationError: fieldImmutableError, }, "changing scheduling constraints not allowed": { - oldObj: podGroupWithSchedulingConstraints, - newObj: func() *scheduling.PodGroup { - newPodGroup := podGroupWithSchedulingConstraints.DeepCopy() - newPodGroup.Spec.SchedulingConstraints = &scheduling.PodGroupSchedulingConstraints{} - return newPodGroup - }(), - expectValidationError: fieldImmutableError, - tasEnabled: true, + oldObj: podGroupWithSchedulingConstraints("foo"), + newObj: podGroupWithSchedulingConstraints(), + enableTopologyAwareScheduling: true, + expectValidationError: fieldImmutableError, }, "changing topology constraints not allowed": { - oldObj: podGroupWithSchedulingConstraints, - newObj: func() *scheduling.PodGroup { - newPodGroup := podGroupWithSchedulingConstraints.DeepCopy() - newPodGroup.Spec.SchedulingConstraints.Topology = []scheduling.TopologyConstraint{} - return newPodGroup - }(), - expectValidationError: fieldImmutableError, - tasEnabled: true, + oldObj: podGroupWithSchedulingConstraints("foo"), + newObj: podGroupWithSchedulingConstraints(), + enableTopologyAwareScheduling: true, + expectValidationError: fieldImmutableError, }, "changing topology key not allowed": { - oldObj: podGroupWithSchedulingConstraints, - newObj: func() *scheduling.PodGroup { - newPodGroup := podGroupWithSchedulingConstraints.DeepCopy() - newPodGroup.Spec.SchedulingConstraints.Topology[0].Key = "foobar" - return newPodGroup - }(), - expectValidationError: fieldImmutableError, - tasEnabled: true, + oldObj: podGroupWithSchedulingConstraints("foo"), + newObj: podGroupWithSchedulingConstraints("foobar"), + enableTopologyAwareScheduling: true, + expectValidationError: fieldImmutableError, }, "changing scheduling constraints not allowed with TAS disabled": { - oldObj: podGroupWithSchedulingConstraints, - newObj: func() *scheduling.PodGroup { - newPodGroup := podGroupWithSchedulingConstraints.DeepCopy() - newPodGroup.Spec.SchedulingConstraints = &scheduling.PodGroupSchedulingConstraints{} - return newPodGroup - }(), - expectValidationError: fieldImmutableError, + oldObj: podGroupWithSchedulingConstraints("foo"), + newObj: podGroupWithSchedulingConstraints(), + expectValidationError: forbiddenError, }, "changing topology constraints not allowed with TAS disabled": { - oldObj: podGroupWithSchedulingConstraints, - newObj: func() *scheduling.PodGroup { - newPodGroup := podGroupWithSchedulingConstraints.DeepCopy() - newPodGroup.Spec.SchedulingConstraints.Topology = []scheduling.TopologyConstraint{} - return newPodGroup - }(), - expectValidationError: fieldImmutableError, + oldObj: podGroupWithSchedulingConstraints("foo"), + newObj: podGroupWithSchedulingConstraints(), + expectValidationError: forbiddenError, }, "changing topology key not allowed with TAS disabled": { - oldObj: podGroupWithSchedulingConstraints, - newObj: func() *scheduling.PodGroup { - newPodGroup := podGroupWithSchedulingConstraints.DeepCopy() - newPodGroup.Spec.SchedulingConstraints.Topology[0].Key = "foobar" - return newPodGroup + oldObj: podGroupWithSchedulingConstraints("foo"), + newObj: podGroupWithSchedulingConstraints("foobar"), + expectValidationError: forbiddenError, + }, + "disruption mode update, workload aware preemption disabled": { + oldObj: podGroupWithDisruptionMode(scheduling.DisruptionModePod), + newObj: podGroupWithDisruptionMode(scheduling.DisruptionModePodGroup), + expectValidationError: forbiddenError, + }, + "disruption mode update, workload aware preemption enabled": { + oldObj: podGroupWithDisruptionMode(scheduling.DisruptionModePod), + newObj: podGroupWithDisruptionMode(scheduling.DisruptionModePodGroup), + enableWorkloadAwarePreemption: true, + expectValidationError: fieldImmutableError, + }, + "priority class name update, workload aware preemption disabled": { + oldObj: func() *scheduling.PodGroup { + pg := podGroupWithDisruptionMode(scheduling.DisruptionModePod) + pg.Spec.PriorityClassName = "low-priority" + return pg }(), - expectValidationError: fieldImmutableError, + newObj: func() *scheduling.PodGroup { + pg := podGroupWithDisruptionMode(scheduling.DisruptionModePod) + pg.Spec.PriorityClassName = "high-priority" + return pg + }(), + expectValidationError: forbiddenError, + }, + "priority class name update, workload aware preemption enabled": { + oldObj: podGroupWithDisruptionMode(scheduling.DisruptionModePod), + newObj: func() *scheduling.PodGroup { + pg := podGroupWithDisruptionMode(scheduling.DisruptionModePod) + pg.Spec.PriorityClassName = "high-priority" + return pg + }(), + enableWorkloadAwarePreemption: true, + expectValidationError: fieldImmutableError, + }, + "priority update, workload aware preemption disabled": { + oldObj: func() *scheduling.PodGroup { + pg := podGroupWithDisruptionMode(scheduling.DisruptionModePod) + pg.Spec.Priority = new(int32(1000)) + return pg + }(), + newObj: func() *scheduling.PodGroup { + pg := podGroupWithDisruptionMode(scheduling.DisruptionModePod) + pg.Spec.Priority = new(int32(2000)) + return pg + }(), + expectValidationError: forbiddenError, + }, + "priority update, workload aware preemption enabled": { + oldObj: podGroupWithDisruptionMode(scheduling.DisruptionModePod), + newObj: func() *scheduling.PodGroup { + pg := podGroupWithDisruptionMode(scheduling.DisruptionModePod) + pg.Spec.Priority = new(int32(2000)) + return pg + }(), + enableWorkloadAwarePreemption: true, + expectValidationError: fieldImmutableError, }, } for name, tc := range testCases { t.Run(name, func(t *testing.T) { featuregatetesting.SetFeatureGatesDuringTest(t, utilfeature.DefaultFeatureGate, featuregatetesting.FeatureOverrides{ - features.GenericWorkload: tc.tasEnabled, - features.TopologyAwareWorkloadScheduling: tc.tasEnabled, + features.GenericWorkload: true, + features.TopologyAwareWorkloadScheduling: tc.enableTopologyAwareScheduling, + features.GangScheduling: tc.enableWorkloadAwarePreemption, + features.WorkloadAwarePreemption: tc.enableWorkloadAwarePreemption, }) podGroup := tc.oldObj.DeepCopy() newPodGroup := tc.newObj.DeepCopy() @@ -478,7 +576,9 @@ func TestStatusStrategyUpdate(t *testing.T) { expectObj := tc.expectObj.DeepCopy() expectObj.ResourceVersion = "4" - assert.Equal(t, expectObj, newObj) + if diff := cmp.Diff(expectObj, newObj); diff != "" { + t.Errorf("PodGroup mismatch (-want +got):\n%s", diff) + } }) } } diff --git a/pkg/registry/scheduling/workload/declarative_validation_test.go b/pkg/registry/scheduling/workload/declarative_validation_test.go index e731eaf1e8e..bb58d82331e 100644 --- a/pkg/registry/scheduling/workload/declarative_validation_test.go +++ b/pkg/registry/scheduling/workload/declarative_validation_test.go @@ -23,6 +23,7 @@ import ( "testing" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/util/sets" "k8s.io/apimachinery/pkg/util/validation/field" "k8s.io/apimachinery/pkg/util/version" genericapirequest "k8s.io/apiserver/pkg/endpoints/request" @@ -36,6 +37,11 @@ import ( _ "k8s.io/kubernetes/pkg/apis/scheduling/install" ) +var allowedDisruptionModes = sets.New( + scheduling.DisruptionModePod, + scheduling.DisruptionModePodGroup, +) + func TestDeclarativeValidate(t *testing.T) { apiVersions := []string{"v1alpha2"} // Workload is currently only in v1alpha2 for _, apiVersion := range apiVersions { @@ -55,9 +61,11 @@ func testDeclarativeValidate(t *testing.T, apiVersion string) { }) testCases := map[string]struct { - input scheduling.Workload - expectedErrs field.ErrorList - tasEnabled bool + input scheduling.Workload + enableTopologyAwareScheduling bool + enableDRAWorkloadResourceClaims bool + enableWorkloadAwarePreemption bool + expectedErrs field.ErrorList }{ "valid": { input: mkValidWorkload(), @@ -142,50 +150,122 @@ func testDeclarativeValidate(t *testing.T, apiVersion string) { input: mkValidWorkload(setBasicPolicy(0)), }, "valid with schedulingConstraints": { - input: mkValidWorkload(addTopologyConstraint(0, "foo")), - tasEnabled: true, + input: mkValidWorkload(addTopologyConstraint(0, "foo")), + enableTopologyAwareScheduling: true, }, "valid with empty schedulingConstraints": { - input: mkValidWorkload(setSchedulingConstraints(0)), - tasEnabled: true, + input: mkValidWorkload(setSchedulingConstraints(0)), + enableTopologyAwareScheduling: true, }, "with multiple topology constraints": { - input: mkValidWorkload(addTopologyConstraint(0, "foo"), addTopologyConstraint(0, "bar")), - expectedErrs: field.ErrorList{field.TooMany(field.NewPath("spec", "podGroupTemplates").Index(0).Child("schedulingConstraints", "topology"), 2, 1).WithOrigin("maxItems")}, - tasEnabled: true, + input: mkValidWorkload(addTopologyConstraint(0, "foo"), addTopologyConstraint(0, "bar")), + enableTopologyAwareScheduling: true, + expectedErrs: field.ErrorList{field.TooMany(field.NewPath("spec", "podGroupTemplates").Index(0).Child("schedulingConstraints", "topology"), 2, 1).WithOrigin("maxItems")}, }, "with empty topology key": { - input: mkValidWorkload(addTopologyConstraint(0, "")), - expectedErrs: field.ErrorList{field.Required(field.NewPath("spec", "podGroupTemplates").Index(0).Child("schedulingConstraints", "topology").Index(0).Child("key"), "")}, - tasEnabled: true, + input: mkValidWorkload(addTopologyConstraint(0, "")), + enableTopologyAwareScheduling: true, + expectedErrs: field.ErrorList{field.Required(field.NewPath("spec", "podGroupTemplates").Index(0).Child("schedulingConstraints", "topology").Index(0).Child("key"), "")}, }, "valid with topology key with DNS prefix": { - input: mkValidWorkload(addTopologyConstraint(0, "example.com/Foo")), - tasEnabled: true, + input: mkValidWorkload(addTopologyConstraint(0, "example.com/Foo")), + enableTopologyAwareScheduling: true, }, "valid with topology key with prefix with max length": { - input: mkValidWorkload(addTopologyConstraint(0, strings.Repeat("a", 253)+"/"+strings.Repeat("b", 63))), - tasEnabled: true, + input: mkValidWorkload(addTopologyConstraint(0, strings.Repeat("a", 253)+"/"+strings.Repeat("b", 63))), + enableTopologyAwareScheduling: true, }, "with topology key with prefix exceending max prefix length": { - input: mkValidWorkload(addTopologyConstraint(0, strings.Repeat("a", 254)+"/foo")), - expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates").Index(0).Child("schedulingConstraints", "topology").Index(0).Child("key"), nil, "").WithOrigin("format=k8s-label-key")}, - tasEnabled: true, + input: mkValidWorkload(addTopologyConstraint(0, strings.Repeat("a", 254)+"/foo")), + enableTopologyAwareScheduling: true, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates").Index(0).Child("schedulingConstraints", "topology").Index(0).Child("key"), nil, "").WithOrigin("format=k8s-label-key")}, }, "with topology key with prefix exceending max name length": { - input: mkValidWorkload(addTopologyConstraint(0, "foo/"+strings.Repeat("b", 64))), - expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates").Index(0).Child("schedulingConstraints", "topology").Index(0).Child("key"), nil, "").WithOrigin("format=k8s-label-key")}, - tasEnabled: true, + input: mkValidWorkload(addTopologyConstraint(0, "foo/"+strings.Repeat("b", 64))), + enableTopologyAwareScheduling: true, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates").Index(0).Child("schedulingConstraints", "topology").Index(0).Child("key"), nil, "").WithOrigin("format=k8s-label-key")}, }, "with topology key without prefix exceeding max length": { - input: mkValidWorkload(addTopologyConstraint(0, strings.Repeat("b", 64))), - expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates").Index(0).Child("schedulingConstraints", "topology").Index(0).Child("key"), nil, "").WithOrigin("format=k8s-label-key")}, - tasEnabled: true, + input: mkValidWorkload(addTopologyConstraint(0, strings.Repeat("b", 64))), + enableTopologyAwareScheduling: true, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates").Index(0).Child("schedulingConstraints", "topology").Index(0).Child("key"), nil, "").WithOrigin("format=k8s-label-key")}, }, "with topology key with invalid characters": { - input: mkValidWorkload(addTopologyConstraint(0, "Example.com/Foo")), - expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates").Index(0).Child("schedulingConstraints", "topology").Index(0).Child("key"), nil, "").WithOrigin("format=k8s-label-key")}, - tasEnabled: true, + input: mkValidWorkload(addTopologyConstraint(0, "Example.com/Foo")), + enableTopologyAwareScheduling: true, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates").Index(0).Child("schedulingConstraints", "topology").Index(0).Child("key"), nil, "").WithOrigin("format=k8s-label-key")}, + }, + "pod disruption mode, workload aware preemption enabled": { + input: mkValidWorkload(setDisruptionMode(0, podDisruptionMode)), + enableWorkloadAwarePreemption: true, + }, + "pod disruption mode, workload aware preemption disabled": { + input: mkValidWorkload(setDisruptionMode(0, podDisruptionMode)), + expectedErrs: field.ErrorList{field.Forbidden(field.NewPath("spec", "podGroupTemplates").Index(0).Child("disruptionMode"), "")}, + }, + "pod group disruption mode, workload aware preemption enabled": { + input: mkValidWorkload(setDisruptionMode(0, podGroupDisruptionMode)), + enableWorkloadAwarePreemption: true, + }, + "pod group disruption mode, workload aware preemption disabled": { + input: mkValidWorkload(setDisruptionMode(0, podGroupDisruptionMode)), + expectedErrs: field.ErrorList{field.Forbidden(field.NewPath("spec", "podGroupTemplates").Index(0).Child("disruptionMode"), "")}, + }, + "invalid disruption mode, workload aware preemption enabled": { + input: mkValidWorkload(setDisruptionMode(0, invalidDisruptionMode)), + enableWorkloadAwarePreemption: true, + expectedErrs: field.ErrorList{field.NotSupported(field.NewPath("spec", "podGroupTemplates").Index(0).Child("disruptionMode"), invalidDisruptionMode, sets.List(allowedDisruptionModes))}, + }, + "invalid disruption mode, workload aware preemption disabled": { + input: mkValidWorkload(setDisruptionMode(0, invalidDisruptionMode)), + expectedErrs: field.ErrorList{field.Forbidden(field.NewPath("spec", "podGroupTemplates").Index(0).Child("disruptionMode"), "")}, + }, + "valid priorityClassName, workload aware preemption enabled": { + input: mkValidWorkload(setPriorityClassName(0, "high-priority")), + enableWorkloadAwarePreemption: true, + }, + "valid priorityClassName, workload aware preemption disabled": { + input: mkValidWorkload(setPriorityClassName(0, "high-priority")), + expectedErrs: field.ErrorList{field.Forbidden(field.NewPath("spec", "podGroupTemplates").Index(0).Child("priorityClassName"), "")}, + }, + "invalid priorityClassName, workload aware preemption enabled": { + input: mkValidWorkload(setPriorityClassName(0, "high/priority")), + enableWorkloadAwarePreemption: true, + expectedErrs: field.ErrorList{ + field.Invalid(field.NewPath("spec", "podGroupTemplates").Index(0).Child("priorityClassName"), nil, "").WithOrigin("format=k8s-long-name"), + }, + }, + "invalid priorityClassName, workload aware preemption disabled": { + input: mkValidWorkload(setPriorityClassName(0, "high/priority")), + expectedErrs: field.ErrorList{field.Forbidden(field.NewPath("spec", "podGroupTemplates").Index(0).Child("priorityClassName"), "")}, + }, + + "valid priority, workload aware preemption enabled": { + input: mkValidWorkload(setPriority(0, 1000)), + enableWorkloadAwarePreemption: true, + }, + "valid priority, workload aware preemption disabled": { + input: mkValidWorkload(setPriority(0, 1000)), + expectedErrs: field.ErrorList{field.Forbidden(field.NewPath("spec", "podGroupTemplates").Index(0).Child("priority"), "")}, + }, + "valid negative priority, workload aware preemption enabled": { + input: mkValidWorkload(setPriority(0, -2147483648)), + enableWorkloadAwarePreemption: true, + }, + "valid negative priority, workload aware preemption disabled": { + input: mkValidWorkload(setPriority(0, -2147483648)), + expectedErrs: field.ErrorList{field.Forbidden(field.NewPath("spec", "podGroupTemplates").Index(0).Child("priority"), "")}, + }, + "too high priority, workload aware preemption enabled": { + input: mkValidWorkload(setPriority(0, scheduling.HighestUserDefinablePriority+1)), + enableWorkloadAwarePreemption: true, + expectedErrs: field.ErrorList{ + field.Invalid(field.NewPath("spec", "podGroupTemplates").Index(0).Child("priority"), nil, "").WithOrigin("maximum"), + }, + }, + "too high priority, workload aware preemption disabled": { + input: mkValidWorkload(setPriority(0, scheduling.HighestUserDefinablePriority+1)), + expectedErrs: field.ErrorList{field.Forbidden(field.NewPath("spec", "podGroupTemplates").Index(0).Child("priority"), "")}, }, "ok resourceClaimName reference": { input: mkValidWorkload(addResourceClaims(scheduling.PodGroupResourceClaim{Name: "claim", ResourceClaimName: new("resource-claim")})), @@ -273,8 +353,11 @@ func testDeclarativeValidate(t *testing.T, apiVersion string) { for k, tc := range testCases { t.Run(k, func(t *testing.T) { featuregatetesting.SetFeatureGatesDuringTest(t, utilfeature.DefaultFeatureGate, featuregatetesting.FeatureOverrides{ - features.GenericWorkload: tc.tasEnabled, - features.TopologyAwareWorkloadScheduling: tc.tasEnabled, + features.GenericWorkload: true, + features.TopologyAwareWorkloadScheduling: tc.enableTopologyAwareScheduling, + features.DRAWorkloadResourceClaims: tc.enableDRAWorkloadResourceClaims, + features.GangScheduling: tc.enableWorkloadAwarePreemption, + features.WorkloadAwarePreemption: tc.enableWorkloadAwarePreemption, }) apitesting.VerifyValidationEquivalence(t, ctx, &tc.input, Strategy.Validate, tc.expectedErrs, apitesting.WithMinEmulationVersion(version.MustParse("1.36"))) }) @@ -292,10 +375,12 @@ func TestDeclarativeValidateUpdate(t *testing.T) { func testDeclarativeValidateUpdate(t *testing.T, apiVersion string) { testCases := map[string]struct { - oldObj scheduling.Workload - updateObj scheduling.Workload - tasEnabled bool - expectedErrs field.ErrorList + oldObj scheduling.Workload + updateObj scheduling.Workload + enableTopologyAwareScheduling bool + enableDRAWorkloadResourceClaims bool + enableWorkloadAwarePreemption bool + expectedErrs field.ErrorList }{ "valid update": { oldObj: mkValidWorkload(setResourceVersion("1")), @@ -308,24 +393,31 @@ func testDeclarativeValidateUpdate(t *testing.T, apiVersion string) { "set controllerRef": { oldObj: mkValidWorkload(setResourceVersion("1")), updateObj: mkValidWorkload(setResourceVersion("1"), setControllerRef("apps", "Deployment", "different-deployment")), - expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "controllerRef"), nil, "field is immutable").WithOrigin("immutable").MarkAlpha()}, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "controllerRef"), nil, "").WithOrigin("immutable")}, }, "invalid update controllerRef": { oldObj: mkValidWorkload(setResourceVersion("1"), setControllerRef("apps", "Deployment", "my-deployment")), updateObj: mkValidWorkload(setResourceVersion("1"), setControllerRef("apps", "Deployment", "different-deployment")), - expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "controllerRef"), nil, "field is immutable").WithOrigin("immutable").MarkAlpha()}, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "controllerRef"), nil, "").WithOrigin("immutable")}, }, "unset controllerRef": { oldObj: mkValidWorkload(setResourceVersion("1"), setControllerRef("apps", "Deployment", "different-deployment")), updateObj: mkValidWorkload(setResourceVersion("1")), - expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "controllerRef"), nil, "field is immutable").WithOrigin("immutable").MarkAlpha()}, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "controllerRef"), nil, "").WithOrigin("immutable")}, }, "invalid update empty podGroupTemplates": { oldObj: mkValidWorkload(setResourceVersion("1")), updateObj: mkValidWorkload(setResourceVersion("1"), setEmptyPodGroupTemplates()), expectedErrs: field.ErrorList{ field.Required(field.NewPath("spec", "podGroupTemplates"), "must have at least one item"), - field.Invalid(field.NewPath("spec", "podGroupTemplates"), []scheduling.PodGroupTemplate{}, "field is immutable").WithOrigin("immutable").MarkAlpha(), + field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "").WithOrigin("immutable"), + }, + }, + "change podGroupTemplate name": { + oldObj: mkValidWorkload(setResourceVersion("1"), addPodGroupTemplate("worker1")), + updateObj: mkValidWorkload(setResourceVersion("1"), addPodGroupTemplate("worker2")), + expectedErrs: field.ErrorList{ + field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "").WithOrigin("immutable"), }, }, "invalid update too many podGroupTemplates": { @@ -333,55 +425,65 @@ func testDeclarativeValidateUpdate(t *testing.T, apiVersion string) { updateObj: mkValidWorkload(setResourceVersion("1"), setManyPodGroupTemplates(scheduling.WorkloadMaxPodGroupTemplates+1)), expectedErrs: field.ErrorList{ field.TooMany(field.NewPath("spec", "podGroupTemplates"), scheduling.WorkloadMaxPodGroupTemplates+1, scheduling.WorkloadMaxPodGroupTemplates).WithOrigin("maxItems"), - field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "field is immutable").WithOrigin("immutable").MarkAlpha(), + field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "").WithOrigin("immutable"), }, }, - "invalid update podGroupTemplate": { + "add podGroupTemplate": { oldObj: mkValidWorkload(setResourceVersion("1")), updateObj: mkValidWorkload(setResourceVersion("1"), addPodGroupTemplate("worker1")), - expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "field is immutable").WithOrigin("immutable").MarkAlpha()}, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "").WithOrigin("immutable")}, + }, + "remove podGroupTemplate": { + oldObj: mkValidWorkload(setResourceVersion("1"), addPodGroupTemplate("worker1")), + updateObj: mkValidWorkload(setResourceVersion("1")), + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "").WithOrigin("immutable")}, }, "invalid update with neither basic nor gang": { oldObj: mkValidWorkload(setResourceVersion("1")), updateObj: mkValidWorkload(setResourceVersion("1"), clearPodGroupPolicy(0)), expectedErrs: field.ErrorList{ - field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "field is immutable").WithOrigin("immutable").MarkAlpha(), + field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "").WithOrigin("immutable"), }, }, "invalid update with both basic and gang": { oldObj: mkValidWorkload(setResourceVersion("1")), updateObj: mkValidWorkload(setResourceVersion("1"), setBothPolicies(0)), expectedErrs: field.ErrorList{ - field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "field is immutable").WithOrigin("immutable").MarkAlpha(), + field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "").WithOrigin("immutable"), }, }, + "invalid update of gang minCount": { + oldObj: mkValidWorkload(setResourceVersion("1")), + updateObj: mkValidWorkload(setResourceVersion("1"), setPodGroupMinCount(0, 10)), + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "").WithOrigin("immutable")}, + }, "valid update from gang to basic policy": { oldObj: mkValidWorkload(setResourceVersion("1")), updateObj: mkValidWorkload(setResourceVersion("1"), setBasicPolicy(0)), - expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "field is immutable").WithOrigin("immutable").MarkAlpha()}, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "").WithOrigin("immutable")}, }, "valid update with unchanged scheduling constraints": { - oldObj: mkValidWorkload(setResourceVersion("1"), addTopologyConstraint(0, "foo")), - updateObj: mkValidWorkload(setResourceVersion("1"), addTopologyConstraint(0, "foo")), - tasEnabled: true, + oldObj: mkValidWorkload(setResourceVersion("1"), addTopologyConstraint(0, "foo")), + updateObj: mkValidWorkload(setResourceVersion("1"), addTopologyConstraint(0, "foo")), + enableTopologyAwareScheduling: true, }, "invalid update to scheduling constraints": { - oldObj: mkValidWorkload(setResourceVersion("1"), addTopologyConstraint(0, "foo")), - updateObj: mkValidWorkload(setResourceVersion("1"), setSchedulingConstraints(0)), - expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "field is immutable").WithOrigin("immutable").MarkAlpha()}, - tasEnabled: true, + oldObj: mkValidWorkload(setResourceVersion("1"), addTopologyConstraint(0, "foo")), + updateObj: mkValidWorkload(setResourceVersion("1"), setSchedulingConstraints(0)), + enableTopologyAwareScheduling: true, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "field is immutable").WithOrigin("immutable")}, }, "invalid update to topology constraints": { - oldObj: mkValidWorkload(setResourceVersion("1"), addTopologyConstraint(0, "foo")), - updateObj: mkValidWorkload(setResourceVersion("1"), addTopologyConstraint(0, "foo"), addTopologyConstraint(0, "bar")), - expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "field is immutable").WithOrigin("immutable").MarkAlpha()}, - tasEnabled: true, + oldObj: mkValidWorkload(setResourceVersion("1"), addTopologyConstraint(0, "foo")), + updateObj: mkValidWorkload(setResourceVersion("1"), addTopologyConstraint(0, "foo"), addTopologyConstraint(0, "bar")), + enableTopologyAwareScheduling: true, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "field is immutable").WithOrigin("immutable")}, }, "invalid update to topology key": { - oldObj: mkValidWorkload(setResourceVersion("1"), addTopologyConstraint(0, "foo")), - updateObj: mkValidWorkload(setResourceVersion("1"), addTopologyConstraint(0, "bar")), - expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "field is immutable").WithOrigin("immutable").MarkAlpha()}, - tasEnabled: true, + oldObj: mkValidWorkload(setResourceVersion("1"), addTopologyConstraint(0, "foo")), + updateObj: mkValidWorkload(setResourceVersion("1"), addTopologyConstraint(0, "bar")), + enableTopologyAwareScheduling: true, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "field is immutable").WithOrigin("immutable")}, }, "valid update with unchanged scheduling constraints with TAS disabled": { oldObj: mkValidWorkload(setResourceVersion("1"), addTopologyConstraint(0, "foo")), @@ -390,24 +492,109 @@ func testDeclarativeValidateUpdate(t *testing.T, apiVersion string) { "invalid update to scheduling constraints with TAS disabled": { oldObj: mkValidWorkload(setResourceVersion("1"), addTopologyConstraint(0, "foo")), updateObj: mkValidWorkload(setResourceVersion("1"), setSchedulingConstraints(0)), - expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "field is immutable").WithOrigin("immutable").MarkAlpha()}, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "field is immutable").WithOrigin("immutable")}, }, "invalid update to topology constraints with TAS disabled": { oldObj: mkValidWorkload(setResourceVersion("1"), addTopologyConstraint(0, "foo")), updateObj: mkValidWorkload(setResourceVersion("1"), addTopologyConstraint(0, "foo"), addTopologyConstraint(0, "bar")), - expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "field is immutable").WithOrigin("immutable").MarkAlpha()}, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "field is immutable").WithOrigin("immutable")}, }, "invalid update to topology key with TAS disabled": { oldObj: mkValidWorkload(setResourceVersion("1"), addTopologyConstraint(0, "foo")), updateObj: mkValidWorkload(setResourceVersion("1"), addTopologyConstraint(0, "bar")), - expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "field is immutable").WithOrigin("immutable").MarkAlpha()}, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "field is immutable").WithOrigin("immutable")}, + }, + "invalid add of resource claims, DRA workload resource claims disabled": { + oldObj: mkValidWorkload(setResourceVersion("1")), + updateObj: mkValidWorkload(setResourceVersion("1"), addResourceClaims( + scheduling.PodGroupResourceClaim{Name: "my-claim", ResourceClaimTemplateName: new("my-template")}, + )), + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "").WithOrigin("immutable")}, + }, + "invalid add of resource claims, DRA workload resource claims enabled": { + oldObj: mkValidWorkload(setResourceVersion("1")), + updateObj: mkValidWorkload(setResourceVersion("1"), addResourceClaims( + scheduling.PodGroupResourceClaim{Name: "my-claim", ResourceClaimTemplateName: new("my-template")}, + )), + enableDRAWorkloadResourceClaims: true, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "").WithOrigin("immutable")}, + }, + "invalid update of resource claims, DRA workload resource claims disabled": { + oldObj: mkValidWorkload(setResourceVersion("1"), addResourceClaims( + scheduling.PodGroupResourceClaim{Name: "my-claim", ResourceClaimTemplateName: new("my-template")}, + )), + updateObj: mkValidWorkload(setResourceVersion("1"), addResourceClaims( + scheduling.PodGroupResourceClaim{Name: "my-other-claim", ResourceClaimTemplateName: new("my-template")}, + )), + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "").WithOrigin("immutable")}, + }, + "invalid update of resource claims, DRA workload resource claims enabled": { + oldObj: mkValidWorkload(setResourceVersion("1"), addResourceClaims( + scheduling.PodGroupResourceClaim{Name: "my-claim", ResourceClaimTemplateName: new("my-template")}, + )), + updateObj: mkValidWorkload(setResourceVersion("1"), addResourceClaims( + scheduling.PodGroupResourceClaim{Name: "my-other-claim", ResourceClaimTemplateName: new("my-template")}, + )), + enableDRAWorkloadResourceClaims: true, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "").WithOrigin("immutable")}, + }, + "invalid remove of resource claims, DRA workload resource claims disabled": { + oldObj: mkValidWorkload(setResourceVersion("1"), addResourceClaims( + scheduling.PodGroupResourceClaim{Name: "my-claim", ResourceClaimTemplateName: new("my-template")}, + )), + updateObj: mkValidWorkload(setResourceVersion("1")), + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "").WithOrigin("immutable")}, + }, + "invalid remove of resource claims, DRA workload resource claims enabled": { + oldObj: mkValidWorkload(setResourceVersion("1"), addResourceClaims( + scheduling.PodGroupResourceClaim{Name: "my-claim", ResourceClaimTemplateName: new("my-template")}, + )), + updateObj: mkValidWorkload(setResourceVersion("1")), + enableDRAWorkloadResourceClaims: true, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "").WithOrigin("immutable")}, + }, + "invalid update of disruption mode, workload aware preemption enabled": { + oldObj: mkValidWorkload(setResourceVersion("1"), setDisruptionMode(0, podDisruptionMode)), + updateObj: mkValidWorkload(setResourceVersion("1"), setDisruptionMode(0, podGroupDisruptionMode)), + enableWorkloadAwarePreemption: true, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "").WithOrigin("immutable")}, + }, + "invalid update of disruption mode, workload aware preemption disabled": { + oldObj: mkValidWorkload(setResourceVersion("1"), setDisruptionMode(0, podDisruptionMode)), + updateObj: mkValidWorkload(setResourceVersion("1"), setDisruptionMode(0, podGroupDisruptionMode)), + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "").WithOrigin("immutable")}, + }, + "invalid update of priority class name, workload aware preemption enabled": { + oldObj: mkValidWorkload(setResourceVersion("1"), setPriorityClassName(0, "low-priority")), + updateObj: mkValidWorkload(setResourceVersion("1"), setPriorityClassName(0, "high-priority")), + enableWorkloadAwarePreemption: true, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "").WithOrigin("immutable")}, + }, + "invalid update of priority class name, workload aware preemption disabled": { + oldObj: mkValidWorkload(setResourceVersion("1"), setPriorityClassName(0, "low-priority")), + updateObj: mkValidWorkload(setResourceVersion("1"), setPriorityClassName(0, "high-priority")), + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "").WithOrigin("immutable")}, + }, + "invalid update of priority, workload aware preemption enabled": { + oldObj: mkValidWorkload(setResourceVersion("1"), setPriority(0, 1000)), + updateObj: mkValidWorkload(setResourceVersion("1"), setPriority(0, 2000)), + enableWorkloadAwarePreemption: true, + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "").WithOrigin("immutable")}, + }, + "invalid update of priority, workload aware preemption disabled": { + oldObj: mkValidWorkload(setResourceVersion("1"), setPriority(0, 1000)), + updateObj: mkValidWorkload(setResourceVersion("1"), setPriority(0, 2000)), + expectedErrs: field.ErrorList{field.Invalid(field.NewPath("spec", "podGroupTemplates"), nil, "").WithOrigin("immutable")}, }, } for k, tc := range testCases { t.Run(k, func(t *testing.T) { featuregatetesting.SetFeatureGatesDuringTest(t, utilfeature.DefaultFeatureGate, featuregatetesting.FeatureOverrides{ - features.GenericWorkload: tc.tasEnabled, - features.TopologyAwareWorkloadScheduling: tc.tasEnabled, + features.GenericWorkload: true, + features.TopologyAwareWorkloadScheduling: tc.enableTopologyAwareScheduling, + features.DRAWorkloadResourceClaims: tc.enableDRAWorkloadResourceClaims, + features.GangScheduling: tc.enableWorkloadAwarePreemption, + features.WorkloadAwarePreemption: tc.enableWorkloadAwarePreemption, }) ctx := genericapirequest.WithRequestInfo(genericapirequest.NewDefaultContext(), &genericapirequest.RequestInfo{ APIPrefix: "apis", @@ -561,3 +748,21 @@ func addResourceClaims(claims ...scheduling.PodGroupResourceClaim) func(obj *sch obj.Spec.PodGroupTemplates[0].ResourceClaims = append(obj.Spec.PodGroupTemplates[0].ResourceClaims, claims...) } } + +func setDisruptionMode(pgIdx int, mode scheduling.DisruptionMode) func(obj *scheduling.Workload) { + return func(obj *scheduling.Workload) { + obj.Spec.PodGroupTemplates[pgIdx].DisruptionMode = &mode + } +} + +func setPriorityClassName(pgIdx int, priorityClassName string) func(obj *scheduling.Workload) { + return func(obj *scheduling.Workload) { + obj.Spec.PodGroupTemplates[pgIdx].PriorityClassName = priorityClassName + } +} + +func setPriority(pgIdx int, priority int32) func(obj *scheduling.Workload) { + return func(obj *scheduling.Workload) { + obj.Spec.PodGroupTemplates[pgIdx].Priority = new(priority) + } +} diff --git a/pkg/registry/scheduling/workload/strategy.go b/pkg/registry/scheduling/workload/strategy.go index 9477f615c00..b42531fb494 100644 --- a/pkg/registry/scheduling/workload/strategy.go +++ b/pkg/registry/scheduling/workload/strategy.go @@ -52,9 +52,15 @@ func (workloadStrategy) Validate(ctx context.Context, obj runtime.Object) field. workloadScheduling := obj.(*scheduling.Workload) allErrs := validation.ValidateWorkload(workloadScheduling) opts := []string{} - if utilfeature.DefaultFeatureGate.Enabled(features.TopologyAwareWorkloadScheduling) || anySchedulingConstraintsInUse(nil) { + if utilfeature.DefaultFeatureGate.Enabled(features.TopologyAwareWorkloadScheduling) { opts = append(opts, string(features.TopologyAwareWorkloadScheduling)) } + if utilfeature.DefaultFeatureGate.Enabled(features.DRAWorkloadResourceClaims) { + opts = append(opts, string(features.DRAWorkloadResourceClaims)) + } + if utilfeature.DefaultFeatureGate.Enabled(features.WorkloadAwarePreemption) { + opts = append(opts, string(features.WorkloadAwarePreemption)) + } return rest.ValidateDeclarativelyWithMigrationChecks(ctx, legacyscheme.Scheme, obj, nil, allErrs, operation.Create, rest.WithDeclarativeEnforcement(), rest.WithOptions(opts)) } @@ -75,9 +81,19 @@ func (workloadStrategy) PrepareForUpdate(ctx context.Context, obj, old runtime.O func (workloadStrategy) ValidateUpdate(ctx context.Context, obj, old runtime.Object) field.ErrorList { allErrs := validation.ValidateWorkloadUpdate(obj.(*scheduling.Workload), old.(*scheduling.Workload)) opts := []string{} - if utilfeature.DefaultFeatureGate.Enabled(features.TopologyAwareWorkloadScheduling) || anySchedulingConstraintsInUse(old.(*scheduling.Workload)) { + // Declarative validation will always allow fields to remain unchanged, so if any + // of the fields which are covered by these gates are set, we will not re-validate them + // (even if the gates are disabled) as long as they do not change values. If a gate + // is disabled, they will not be allowed to change values. + if utilfeature.DefaultFeatureGate.Enabled(features.TopologyAwareWorkloadScheduling) { opts = append(opts, string(features.TopologyAwareWorkloadScheduling)) } + if utilfeature.DefaultFeatureGate.Enabled(features.DRAWorkloadResourceClaims) { + opts = append(opts, string(features.DRAWorkloadResourceClaims)) + } + if utilfeature.DefaultFeatureGate.Enabled(features.WorkloadAwarePreemption) { + opts = append(opts, string(features.WorkloadAwarePreemption)) + } return rest.ValidateDeclarativelyWithMigrationChecks(ctx, legacyscheme.Scheme, obj, old, allErrs, operation.Update, rest.WithDeclarativeEnforcement(), rest.WithOptions(opts)) } @@ -122,6 +138,9 @@ func dropDisabledPodGroupTemplatesFields(templates, oldTemplates []scheduling.Po template := &templates[i] dropDisabledSchedulingConstraintsFields(template, oldTemplate) dropDisabledDRAWorkloadResourceClaimsFields(template, oldTemplate) + dropDisabledDisruptionModeField(template, oldTemplate) + dropDisabledPriorityClassNameField(template, oldTemplate) + dropDisabledPriorityField(template, oldTemplate) } } @@ -131,27 +150,9 @@ func dropDisabledSchedulingConstraintsFields(template, oldTemplate *scheduling.P if utilfeature.DefaultFeatureGate.Enabled(features.TopologyAwareWorkloadScheduling) || schedulingConstraintsInUse(oldTemplate) { return } - template.SchedulingConstraints = nil } -func anySchedulingConstraintsInUse(workload *scheduling.Workload) bool { - if workload == nil { - return false - } - - for i := range workload.Spec.PodGroupTemplates { - if schedulingConstraintsInUse(&workload.Spec.PodGroupTemplates[i]) { - return true - } - } - return false -} - -func schedulingConstraintsInUse(pgt *scheduling.PodGroupTemplate) bool { - return pgt != nil && pgt.SchedulingConstraints != nil -} - // dropDisabledDRAWorkloadResourceClaimsFields removes resource claim references from // podGroupTemplates unless they are already used by the old Workload spec. func dropDisabledDRAWorkloadResourceClaimsFields(template, oldTemplate *scheduling.PodGroupTemplate) { @@ -161,6 +162,52 @@ func dropDisabledDRAWorkloadResourceClaimsFields(template, oldTemplate *scheduli template.ResourceClaims = nil } +// dropDisabledDisruptionModeField removes the DisruptionMode field from a template +// unless it is already used in the old template. +func dropDisabledDisruptionModeField(template, oldTemplate *scheduling.PodGroupTemplate) { + if utilfeature.DefaultFeatureGate.Enabled(features.WorkloadAwarePreemption) || disruptionModeInUse(oldTemplate) { + // No need to drop anything. + return + } + template.DisruptionMode = nil +} + +// dropDisabledPriorityClassNameField removes the PriorityClassName field from a template +// unless it is already used in the old template. +func dropDisabledPriorityClassNameField(template, oldTemplate *scheduling.PodGroupTemplate) { + if utilfeature.DefaultFeatureGate.Enabled(features.WorkloadAwarePreemption) || priorityClassNameInUse(oldTemplate) { + // No need to drop anything. + return + } + template.PriorityClassName = "" +} + +// dropDisabledPriorityField removes the Priority field from a template unless it is +// already used in the old template. +func dropDisabledPriorityField(template, oldTemplate *scheduling.PodGroupTemplate) { + if utilfeature.DefaultFeatureGate.Enabled(features.WorkloadAwarePreemption) || priorityInUse(oldTemplate) { + // No need to drop anything. + return + } + template.Priority = nil +} + +func schedulingConstraintsInUse(pgt *scheduling.PodGroupTemplate) bool { + return pgt != nil && pgt.SchedulingConstraints != nil +} + func draWorkloadResourceClaimsInUse(pgt *scheduling.PodGroupTemplate) bool { return pgt != nil && len(pgt.ResourceClaims) > 0 } + +func disruptionModeInUse(pgt *scheduling.PodGroupTemplate) bool { + return pgt != nil && pgt.DisruptionMode != nil +} + +func priorityClassNameInUse(pgt *scheduling.PodGroupTemplate) bool { + return pgt != nil && pgt.PriorityClassName != "" +} + +func priorityInUse(pgt *scheduling.PodGroupTemplate) bool { + return pgt != nil && pgt.Priority != nil +} diff --git a/pkg/registry/scheduling/workload/strategy_test.go b/pkg/registry/scheduling/workload/strategy_test.go index d1a632a412e..0e1eecb5ca4 100644 --- a/pkg/registry/scheduling/workload/strategy_test.go +++ b/pkg/registry/scheduling/workload/strategy_test.go @@ -30,24 +30,37 @@ import ( "k8s.io/kubernetes/pkg/features" ) -var workload = &scheduling.Workload{ - ObjectMeta: metav1.ObjectMeta{ - Name: "foo", - Namespace: metav1.NamespaceDefault, - }, - Spec: scheduling.WorkloadSpec{ - PodGroupTemplates: []scheduling.PodGroupTemplate{ - { - Name: "bar", - SchedulingPolicy: scheduling.PodGroupSchedulingPolicy{ - Gang: &scheduling.GangSchedulingPolicy{ - MinCount: 5, +var ( + workload = &scheduling.Workload{ + ObjectMeta: metav1.ObjectMeta{ + Name: "foo", + Namespace: metav1.NamespaceDefault, + }, + Spec: scheduling.WorkloadSpec{ + PodGroupTemplates: []scheduling.PodGroupTemplate{ + { + Name: "bar", + SchedulingPolicy: scheduling.PodGroupSchedulingPolicy{ + Gang: &scheduling.GangSchedulingPolicy{ + MinCount: 5, + }, }, }, }, }, - }, -} + } + + podDisruptionMode = scheduling.DisruptionModePod + podGroupDisruptionMode = scheduling.DisruptionModePodGroup + invalidDisruptionMode = scheduling.DisruptionMode("Invalid") + + fieldImmutableError = "field is immutable" + minCountError = "must be greater than or equal to 1" + tooManyItemsError = "must have at most 1 item" + requiredError = "Required value" + subdomainNameError = "lowercase RFC 1123 subdomain must consist of lower case alphanumeric characters" + supportedModesError = `supported values: "Pod", "PodGroup"` +) func TestWorkloadStrategy(t *testing.T) { if !Strategy.NamespaceScoped() { @@ -67,38 +80,28 @@ func ctxWithRequestInfo() context.Context { }) } -func TestPodSchedulingStrategyCreate(t *testing.T) { - t.Run("simple", func(t *testing.T) { - ctx := ctxWithRequestInfo() - workload := workload.DeepCopy() +func TestStrategyCreate(t *testing.T) { + ctx := ctxWithRequestInfo() - Strategy.PrepareForCreate(ctx, workload) - errs := Strategy.Validate(ctx, workload) - if len(errs) != 0 { - t.Errorf("Unexpected validation error: %v", errs) - } - }) - - t.Run("failed validation", func(t *testing.T) { - ctx := ctxWithRequestInfo() - workload := workload.DeepCopy() - workload.Spec.PodGroupTemplates[0].SchedulingPolicy.Gang.MinCount = -1 - - Strategy.PrepareForCreate(ctx, workload) - errs := Strategy.Validate(ctx, workload) - if len(errs) == 0 { - t.Errorf("Expected validation error") - } - }) -} - -func TestPodSchedulingStrategyCreate_SchedulingConstraints(t *testing.T) { testCases := map[string]struct { - obj *scheduling.Workload - expectObj *scheduling.Workload - expectValidationError string - tasEnabled bool + obj *scheduling.Workload + expectObj *scheduling.Workload + enableTopologyAwareScheduling bool + enableWorkloadAwarePreemption bool + expectValidationError string }{ + "simple": { + obj: workload, + expectObj: workload, + }, + "failed validation": { + obj: func() *scheduling.Workload { + w := workload.DeepCopy() + w.Spec.PodGroupTemplates[0].SchedulingPolicy.Gang.MinCount = -1 + return w + }(), + expectValidationError: minCountError, + }, "drops field with SchedulingConstraints set and TAS disabled": { obj: func() *scheduling.Workload { workload := workload.DeepCopy() @@ -107,8 +110,7 @@ func TestPodSchedulingStrategyCreate_SchedulingConstraints(t *testing.T) { } return workload }(), - expectObj: workload, - tasEnabled: false, + expectObj: workload, }, "valid with SchedulingConstraints set and TAS enabled": { obj: func() *scheduling.Workload { @@ -125,7 +127,7 @@ func TestPodSchedulingStrategyCreate_SchedulingConstraints(t *testing.T) { } return workload }(), - tasEnabled: true, + enableTopologyAwareScheduling: true, }, "invalid with multiple topology constraints": { obj: func() *scheduling.Workload { @@ -138,8 +140,8 @@ func TestPodSchedulingStrategyCreate_SchedulingConstraints(t *testing.T) { } return workload }(), - expectValidationError: "must have at most 1 item", - tasEnabled: true, + enableTopologyAwareScheduling: true, + expectValidationError: tooManyItemsError, }, "invalid with invalid topology key": { obj: func() *scheduling.Workload { @@ -151,20 +153,95 @@ func TestPodSchedulingStrategyCreate_SchedulingConstraints(t *testing.T) { } return workload }(), - expectValidationError: "Required value", - tasEnabled: true, + enableTopologyAwareScheduling: true, + expectValidationError: requiredError, + }, + "workload aware preemption disabled - drop disruption mode": { + obj: func() *scheduling.Workload { + w := workload.DeepCopy() + w.Spec.PodGroupTemplates[0].DisruptionMode = &podDisruptionMode + return w + }(), + expectObj: workload, + }, + "workload aware preemption enabled - preserve disruption mode (pod)": { + obj: func() *scheduling.Workload { + w := workload.DeepCopy() + w.Spec.PodGroupTemplates[0].DisruptionMode = &podDisruptionMode + return w + }(), + enableWorkloadAwarePreemption: true, + expectObj: func() *scheduling.Workload { + w := workload.DeepCopy() + w.Spec.PodGroupTemplates[0].DisruptionMode = &podDisruptionMode + return w + }(), + }, + "workload aware preemption enabled - preserve disruption mode (pod group)": { + obj: func() *scheduling.Workload { + w := workload.DeepCopy() + w.Spec.PodGroupTemplates[0].DisruptionMode = &podGroupDisruptionMode + return w + }(), + enableWorkloadAwarePreemption: true, + expectObj: func() *scheduling.Workload { + w := workload.DeepCopy() + w.Spec.PodGroupTemplates[0].DisruptionMode = &podGroupDisruptionMode + return w + }(), + }, + "workload aware preemption enabled - unknown disruption mode": { + obj: func() *scheduling.Workload { + w := workload.DeepCopy() + w.Spec.PodGroupTemplates[0].DisruptionMode = &invalidDisruptionMode + return w + }(), + enableWorkloadAwarePreemption: true, + expectValidationError: supportedModesError, + }, + "workload aware preemption enabled - preserve priorityClassName": { + obj: func() *scheduling.Workload { + w := workload.DeepCopy() + w.Spec.PodGroupTemplates[0].PriorityClassName = "high-priority" + return w + }(), + enableWorkloadAwarePreemption: true, + expectObj: func() *scheduling.Workload { + w := workload.DeepCopy() + w.Spec.PodGroupTemplates[0].PriorityClassName = "high-priority" + return w + }(), + }, + "workload aware preemption disabled - drop priorityClassName": { + obj: func() *scheduling.Workload { + w := workload.DeepCopy() + w.Spec.PodGroupTemplates[0].PriorityClassName = "high-priority" + return w + }(), + expectObj: workload, + }, + "workload aware preemption enabled - invalid priorityClassName": { + obj: func() *scheduling.Workload { + w := workload.DeepCopy() + w.Spec.PodGroupTemplates[0].PriorityClassName = "invalid/priority/class/name" + return w + }(), + enableWorkloadAwarePreemption: true, + expectValidationError: subdomainNameError, }, } for name, tc := range testCases { t.Run(name, func(t *testing.T) { - featuregatetesting.SetFeatureGatesDuringTest(t, utilfeature.DefaultFeatureGate, featuregatetesting.FeatureOverrides{ - features.GenericWorkload: tc.tasEnabled, - features.TopologyAwareWorkloadScheduling: tc.tasEnabled, - }) - ctx := ctxWithRequestInfo() workload := tc.obj.DeepCopy() + featuregatetesting.SetFeatureGatesDuringTest(t, utilfeature.DefaultFeatureGate, featuregatetesting.FeatureOverrides{ + features.GenericWorkload: true, + features.TopologyAwareWorkloadScheduling: tc.enableTopologyAwareScheduling, + features.GangScheduling: tc.enableWorkloadAwarePreemption, + features.WorkloadAwarePreemption: tc.enableWorkloadAwarePreemption, + }) + Strategy.PrepareForCreate(ctx, workload) if errs := Strategy.Validate(ctx, workload); len(errs) != 0 { if tc.expectValidationError == "" { @@ -186,74 +263,52 @@ func TestPodSchedulingStrategyCreate_SchedulingConstraints(t *testing.T) { } } -func TestPodSchedulingStrategyUpdate(t *testing.T) { - t.Run("no changes", func(t *testing.T) { - ctx := ctxWithRequestInfo() - workload := workload.DeepCopy() - newWorkload := workload.DeepCopy() - newWorkload.ResourceVersion = "4" +func TestStrategyUpdate(t *testing.T) { + ctx := ctxWithRequestInfo() - Strategy.PrepareForUpdate(ctx, newWorkload, workload) - errs := Strategy.ValidateUpdate(ctx, newWorkload, workload) - if len(errs) != 0 { - t.Errorf("Unexpected validation error: %v", errs) - } - }) - - t.Run("name update", func(t *testing.T) { - ctx := ctxWithRequestInfo() - workload := workload.DeepCopy() - newWorkload := workload.DeepCopy() - newWorkload.Name += "bar" - newWorkload.ResourceVersion = "4" - - Strategy.PrepareForUpdate(ctx, newWorkload, workload) - errs := Strategy.ValidateUpdate(ctx, newWorkload, workload) - if len(errs) == 0 { - t.Errorf("Expected validation error") - } - }) - - t.Run("invalid spec update - controllerRef", func(t *testing.T) { - ctx := ctxWithRequestInfo() - workload := workload.DeepCopy() - newWorkload := workload.DeepCopy() - newWorkload.Spec.ControllerRef = &scheduling.TypedLocalObjectReference{ - Kind: "foo", - Name: "baz", - } - newWorkload.ResourceVersion = "4" - - Strategy.PrepareForUpdate(ctx, newWorkload, workload) - errs := Strategy.ValidateUpdate(ctx, newWorkload, workload) - if len(errs) == 0 { - t.Errorf("Expected validation error") - } - }) - - t.Run("invalid spec update - podGroupTemplates", func(t *testing.T) { - ctx := ctxWithRequestInfo() - workload := workload.DeepCopy() - newWorkload := workload.DeepCopy() - newWorkload.Spec.PodGroupTemplates[0].SchedulingPolicy.Gang.MinCount = 4 - newWorkload.ResourceVersion = "4" - - Strategy.PrepareForUpdate(ctx, newWorkload, workload) - errs := Strategy.ValidateUpdate(ctx, newWorkload, workload) - if len(errs) == 0 { - t.Errorf("Expected validation error") - } - }) -} - -func TestPodSchedulingStrategyUpdate_SchedulingConstraints(t *testing.T) { testCases := map[string]struct { - oldObj *scheduling.Workload - newObj *scheduling.Workload - expectObj *scheduling.Workload - expectValidationError string - tasEnabled bool + oldObj *scheduling.Workload + newObj *scheduling.Workload + enableTopologyAwareScheduling bool + enableWorkloadAwarePreemption bool + expectValidationError string + expectWorkload *scheduling.Workload }{ + "no changes": { + oldObj: workload, + newObj: workload, + expectWorkload: workload, + }, + "name update": { + oldObj: workload, + newObj: func() *scheduling.Workload { + w := workload.DeepCopy() + w.Name += "bar" + return w + }(), + expectValidationError: fieldImmutableError, + }, + "invalid spec update - controllerRef": { + oldObj: workload, + newObj: func() *scheduling.Workload { + w := workload.DeepCopy() + w.Spec.ControllerRef = &scheduling.TypedLocalObjectReference{ + Kind: "foo", + Name: "baz", + } + return w + }(), + expectValidationError: fieldImmutableError, + }, + "invalid spec update - podGroupTemplates": { + oldObj: workload, + newObj: func() *scheduling.Workload { + w := workload.DeepCopy() + w.Spec.PodGroupTemplates[0].SchedulingPolicy.Gang.MinCount = 4 + return w + }(), + expectValidationError: fieldImmutableError, + }, "valid update with scheduling constraints unchanged and TAS disabled": { oldObj: func() *scheduling.Workload { workload := workload.DeepCopy() @@ -271,7 +326,7 @@ func TestPodSchedulingStrategyUpdate_SchedulingConstraints(t *testing.T) { } return workload }(), - expectObj: func() *scheduling.Workload { + expectWorkload: func() *scheduling.Workload { workload := workload.DeepCopy() workload.Spec.PodGroupTemplates = append(workload.Spec.PodGroupTemplates, *workload.Spec.PodGroupTemplates[0].DeepCopy()) workload.Spec.PodGroupTemplates[0].SchedulingConstraints = &scheduling.PodGroupSchedulingConstraints{ @@ -279,7 +334,6 @@ func TestPodSchedulingStrategyUpdate_SchedulingConstraints(t *testing.T) { } return workload }(), - tasEnabled: false, }, "valid update with scheduling constraints unchanged and TAS enabled": { oldObj: func() *scheduling.Workload { @@ -298,7 +352,7 @@ func TestPodSchedulingStrategyUpdate_SchedulingConstraints(t *testing.T) { } return workload }(), - expectObj: func() *scheduling.Workload { + expectWorkload: func() *scheduling.Workload { workload := workload.DeepCopy() workload.Spec.PodGroupTemplates = append(workload.Spec.PodGroupTemplates, *workload.Spec.PodGroupTemplates[0].DeepCopy()) workload.Spec.PodGroupTemplates[0].SchedulingConstraints = &scheduling.PodGroupSchedulingConstraints{ @@ -306,7 +360,7 @@ func TestPodSchedulingStrategyUpdate_SchedulingConstraints(t *testing.T) { } return workload }(), - tasEnabled: true, + enableTopologyAwareScheduling: true, }, "changing topology key not allowed with TAS disabled": { oldObj: func() *scheduling.Workload { @@ -323,8 +377,7 @@ func TestPodSchedulingStrategyUpdate_SchedulingConstraints(t *testing.T) { } return workload }(), - expectValidationError: "field is immutable", - tasEnabled: false, + expectValidationError: fieldImmutableError, }, "changing topology key not allowed with TAS enabled": { oldObj: func() *scheduling.Workload { @@ -341,31 +394,19 @@ func TestPodSchedulingStrategyUpdate_SchedulingConstraints(t *testing.T) { } return workload }(), - expectValidationError: "field is immutable", - tasEnabled: true, + enableTopologyAwareScheduling: true, + expectValidationError: fieldImmutableError, }, - "changing topology constraints not allowed with TAS disabled": { - oldObj: func() *scheduling.Workload { - workload := workload.DeepCopy() - workload.Spec.PodGroupTemplates = append(workload.Spec.PodGroupTemplates, *workload.Spec.PodGroupTemplates[0].DeepCopy()) - workload.Spec.PodGroupTemplates[0].SchedulingConstraints = &scheduling.PodGroupSchedulingConstraints{ - Topology: []scheduling.TopologyConstraint{{Key: "foo"}}, - } - return workload - }(), + "topology constraint addition is dropped with TAS disabled": { + oldObj: workload, newObj: func() *scheduling.Workload { workload := workload.DeepCopy() - workload.Spec.PodGroupTemplates = append(workload.Spec.PodGroupTemplates, *workload.Spec.PodGroupTemplates[0].DeepCopy()) workload.Spec.PodGroupTemplates[0].SchedulingConstraints = &scheduling.PodGroupSchedulingConstraints{ Topology: []scheduling.TopologyConstraint{{Key: "foo"}}, } - workload.Spec.PodGroupTemplates[1].SchedulingConstraints = &scheduling.PodGroupSchedulingConstraints{ - Topology: []scheduling.TopologyConstraint{{Key: "foo"}}, - } return workload }(), - expectValidationError: "field is immutable", - tasEnabled: false, + expectWorkload: workload, }, "changing topology constraints not allowed with TAS enabled": { oldObj: func() *scheduling.Workload { @@ -387,8 +428,8 @@ func TestPodSchedulingStrategyUpdate_SchedulingConstraints(t *testing.T) { } return workload }(), - expectValidationError: "field is immutable", - tasEnabled: true, + enableTopologyAwareScheduling: true, + expectValidationError: fieldImmutableError, }, "adding scheduling constraints not allowed with TAS disabled": { oldObj: func() *scheduling.Workload { @@ -409,8 +450,7 @@ func TestPodSchedulingStrategyUpdate_SchedulingConstraints(t *testing.T) { } return workload }(), - expectValidationError: "field is immutable", - tasEnabled: false, + expectValidationError: fieldImmutableError, }, "adding scheduling constraints not allowed with TAS enabled": { oldObj: func() *scheduling.Workload { @@ -431,22 +471,72 @@ func TestPodSchedulingStrategyUpdate_SchedulingConstraints(t *testing.T) { } return workload }(), - expectValidationError: "field is immutable", - tasEnabled: true, + enableTopologyAwareScheduling: true, + expectValidationError: fieldImmutableError, + }, + "disruption mode update, workload aware preemption disabled": { + oldObj: func() *scheduling.Workload { + w := workload.DeepCopy() + w.Spec.PodGroupTemplates[0].DisruptionMode = &podGroupDisruptionMode + return w + }(), + newObj: func() *scheduling.Workload { + w := workload.DeepCopy() + w.Spec.PodGroupTemplates[0].DisruptionMode = &podDisruptionMode + return w + }(), + expectValidationError: fieldImmutableError, + }, + "disruption mode update, workload aware preemption enabled": { + oldObj: workload, + newObj: func() *scheduling.Workload { + w := workload.DeepCopy() + w.Spec.PodGroupTemplates[0].DisruptionMode = &podDisruptionMode + return w + }(), + enableWorkloadAwarePreemption: true, + expectValidationError: fieldImmutableError, + }, + "priorityClassName update, workload aware preemption disabled": { + oldObj: func() *scheduling.Workload { + w := workload.DeepCopy() + w.Spec.PodGroupTemplates[0].PriorityClassName = "high-priority" + return w + }(), + newObj: func() *scheduling.Workload { + w := workload.DeepCopy() + w.Spec.PodGroupTemplates[0].PriorityClassName = "low-priority" + return w + }(), + expectValidationError: fieldImmutableError, + }, + "priorityClassName update, workload aware preemption enabled": { + oldObj: func() *scheduling.Workload { + w := workload.DeepCopy() + w.Spec.PodGroupTemplates[0].PriorityClassName = "high-priority" + return w + }(), + newObj: func() *scheduling.Workload { + w := workload.DeepCopy() + w.Spec.PodGroupTemplates[0].PriorityClassName = "low-priority" + return w + }(), + enableWorkloadAwarePreemption: true, + expectValidationError: fieldImmutableError, }, } for name, tc := range testCases { t.Run(name, func(t *testing.T) { featuregatetesting.SetFeatureGatesDuringTest(t, utilfeature.DefaultFeatureGate, featuregatetesting.FeatureOverrides{ - features.GenericWorkload: tc.tasEnabled, - features.TopologyAwareWorkloadScheduling: tc.tasEnabled, + features.GenericWorkload: true, + features.TopologyAwareWorkloadScheduling: tc.enableTopologyAwareScheduling, + features.GangScheduling: tc.enableWorkloadAwarePreemption, + features.WorkloadAwarePreemption: tc.enableWorkloadAwarePreemption, }) - ctx := ctxWithRequestInfo() oldWorkload := tc.oldObj.DeepCopy() - oldWorkload.ResourceVersion = "1" newWorkload := tc.newObj.DeepCopy() - newWorkload.ResourceVersion = "2" + newWorkload.ResourceVersion = "4" Strategy.PrepareForUpdate(ctx, newWorkload, oldWorkload) if errs := Strategy.ValidateUpdate(ctx, newWorkload, oldWorkload); len(errs) != 0 { @@ -459,12 +549,20 @@ func TestPodSchedulingStrategyUpdate_SchedulingConstraints(t *testing.T) { if errMsg := errs[0].Error(); !strings.Contains(errMsg, tc.expectValidationError) { t.Fatalf("error %#v does not contain the expected message %q", errMsg, tc.expectValidationError) } + return } - if tc.expectObj != nil { - tc.expectObj.ResourceVersion = newWorkload.ResourceVersion - if diff := cmp.Diff(tc.expectObj, newWorkload); diff != "" { - t.Errorf("got unexpected workload object (-want, +got): %s", diff) - } + if tc.expectValidationError != "" { + t.Fatal("expected validation error(s), got none") + } + if warnings := Strategy.WarningsOnUpdate(ctx, newWorkload, oldWorkload); len(warnings) != 0 { + t.Fatalf("unexpected warnings: %q", warnings) + } + Strategy.Canonicalize(newWorkload) + + expectWorkload := tc.expectWorkload.DeepCopy() + expectWorkload.ResourceVersion = "4" + if diff := cmp.Diff(expectWorkload, newWorkload); diff != "" { + t.Errorf("Workload mismatch (-want +got):\n%s", diff) } }) } diff --git a/plugin/pkg/admission/priority/admission.go b/plugin/pkg/admission/priority/admission.go index 106fbaac283..91101f4f533 100644 --- a/plugin/pkg/admission/priority/admission.go +++ b/plugin/pkg/admission/priority/admission.go @@ -27,11 +27,13 @@ import ( "k8s.io/apimachinery/pkg/labels" "k8s.io/apiserver/pkg/admission" genericadmissioninitializers "k8s.io/apiserver/pkg/admission/initializer" + utilfeature "k8s.io/apiserver/pkg/util/feature" "k8s.io/client-go/informers" "k8s.io/client-go/kubernetes" schedulingv1listers "k8s.io/client-go/listers/scheduling/v1" "k8s.io/kubernetes/pkg/apis/core" "k8s.io/kubernetes/pkg/apis/scheduling" + "k8s.io/kubernetes/pkg/features" ) const ( @@ -90,11 +92,11 @@ func (p *Plugin) SetExternalKubeInformerFactory(f informers.SharedInformerFactor var ( podResource = core.Resource("pods") + podGroupResource = scheduling.Resource("podgroups") priorityClassResource = scheduling.Resource("priorityclasses") ) -// Admit checks Pods and admits or rejects them. It also resolves the priority of pods based on their PriorityClass. -// Note that pod validation mechanism prevents update of a pod priority. +// Admit checks Pods and PodGroups and admits or rejects them. It also resolves the priority of pods and pod groups based on their PriorityClass. func (p *Plugin) Admit(ctx context.Context, a admission.Attributes, o admission.ObjectInterfaces) error { operation := a.GetOperation() // Ignore all calls to subresources @@ -107,7 +109,11 @@ func (p *Plugin) Admit(ctx context.Context, a admission.Attributes, o admission. return p.admitPod(a) } return nil - + case podGroupResource: + if operation == admission.Create { + return p.admitPodGroup(a) + } + return nil default: return nil } @@ -185,6 +191,31 @@ func (p *Plugin) admitPod(a admission.Attributes) error { return nil } +// admitPodGroup makes sure a new pod group does not set spec.Priority field. It also makes sure that +// the PriorityClassName exists if it is provided and resolves the pod group priority from the PriorityClassName. +func (p *Plugin) admitPodGroup(attributes admission.Attributes) error { + if !utilfeature.DefaultFeatureGate.Enabled(features.WorkloadAwarePreemption) { + return nil + } + + pg, ok := attributes.GetObject().(*scheduling.PodGroup) + if !ok { + return errors.NewBadRequest("resource was marked with kind PodGroup but was unable to be converted") + } + + priorityClassName, priority, _, err := p.establishPriority(attributes, &pg.Spec.PriorityClassName) + if err != nil { + return err + } + // Reject if the pod group already contained a priority that differs from the one computed from the priority class. + if pg.Spec.Priority != nil && *pg.Spec.Priority != priority { + return admission.NewForbidden(attributes, fmt.Errorf("priority set in the pod group (%d) must match the priority computed (%d) based on the priority class set in the spec", *pg.Spec.Priority, priority)) + } + pg.Spec.Priority = &priority + pg.Spec.PriorityClassName = priorityClassName + return nil +} + // validatePriorityClass ensures that the value field is not larger than the highest user definable priority. If the GlobalDefault is set, it ensures that there is no other PriorityClass whose GlobalDefault is set. func (p *Plugin) validatePriorityClass(a admission.Attributes) error { operation := a.GetOperation() diff --git a/plugin/pkg/admission/priority/admission_test.go b/plugin/pkg/admission/priority/admission_test.go index 1e66a26f7c6..887fa9c32da 100644 --- a/plugin/pkg/admission/priority/admission_test.go +++ b/plugin/pkg/admission/priority/admission_test.go @@ -20,18 +20,24 @@ import ( "context" "testing" + "github.com/google/go-cmp/cmp" "k8s.io/klog/v2" + "k8s.io/klog/v2/ktesting" schedulingv1 "k8s.io/api/scheduling/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" "k8s.io/apiserver/pkg/admission" admissiontesting "k8s.io/apiserver/pkg/admission/testing" "k8s.io/apiserver/pkg/authentication/user" + "k8s.io/apiserver/pkg/util/feature" "k8s.io/client-go/informers" + featuregatetesting "k8s.io/component-base/featuregate/testing" api "k8s.io/kubernetes/pkg/apis/core" "k8s.io/kubernetes/pkg/apis/scheduling" v1 "k8s.io/kubernetes/pkg/apis/scheduling/v1" "k8s.io/kubernetes/pkg/controller" + "k8s.io/kubernetes/pkg/features" ) func addPriorityClasses(ctrl *Plugin, priorityClasses []*scheduling.PriorityClass) error { @@ -760,3 +766,160 @@ func TestPodAdmission(t *testing.T) { } } } + +func TestAdmitPodGroup(t *testing.T) { + podGroup := func(priorityClassName string) *scheduling.PodGroup { + return &scheduling.PodGroup{ + ObjectMeta: metav1.ObjectMeta{ + Name: "test-podgroup", + Namespace: metav1.NamespaceDefault, + }, + Spec: scheduling.PodGroupSpec{ + SchedulingPolicy: scheduling.PodGroupSchedulingPolicy{ + Basic: &scheduling.BasicSchedulingPolicy{}, + }, + PriorityClassName: priorityClassName, + }, + } + } + + podGroupWithPriority := func(priorityClassName string, priority int32) *scheduling.PodGroup { + pg := podGroup(priorityClassName) + pg.Spec.Priority = new(priority) + return pg + } + + attributes := func(podGroup *scheduling.PodGroup, operation admission.Operation) admission.Attributes { + var oldPodGroup runtime.Object + var options runtime.Object = &metav1.CreateOptions{} + if operation == admission.Update { + oldPodGroup = podGroup.DeepCopy() + options = &metav1.UpdateOptions{} + } + return admission.NewAttributesRecord( + podGroup, + oldPodGroup, + scheduling.Kind("PodGroup").WithVersion("v1alpha2"), + podGroup.ObjectMeta.Namespace, + "", + scheduling.Resource("podgroups").WithVersion("v1alpha2"), + "", + operation, + options, + false, + nil, + ) + } + + testCases := []struct { + name string + priorityClasses []*scheduling.PriorityClass + preparePodGroup *scheduling.PodGroup + operation admission.Operation + expectedPriorityClass string + expectedPriority int32 + enableWorkloadAwarePreemption bool + expectError bool + }{ + { + name: "pod group with empty priorityClassName, accepted and set to global default", + priorityClasses: []*scheduling.PriorityClass{defaultClass1, nondefaultClass1}, + preparePodGroup: podGroup("" /* empty priorityClassName */), + operation: admission.Create, + expectedPriorityClass: "default1", + expectedPriority: defaultClass1.Value, + enableWorkloadAwarePreemption: true, + }, + { + name: "pod group with explicit priorityClassName, accepted", + priorityClasses: []*scheduling.PriorityClass{defaultClass1, nondefaultClass1}, + preparePodGroup: podGroup("nondefault1"), + operation: admission.Create, + expectedPriorityClass: "nondefault1", + expectedPriority: nondefaultClass1.Value, + enableWorkloadAwarePreemption: true, + }, + { + name: "pod group with non-existent priorityClassName, rejected", + priorityClasses: []*scheduling.PriorityClass{defaultClass1, nondefaultClass1}, + preparePodGroup: podGroup("non-existent"), + operation: admission.Create, + enableWorkloadAwarePreemption: true, + expectError: true, + }, + { + name: "pod group with any priorityClassName but feature gate disabled, skips validation", + priorityClasses: []*scheduling.PriorityClass{defaultClass1, nondefaultClass1}, + preparePodGroup: podGroup("non-existent"), + operation: admission.Create, + }, + { + name: "pod group with no priorityClassName and no global default, accepted and priority should be zero", + priorityClasses: []*scheduling.PriorityClass{nondefaultClass1}, + preparePodGroup: podGroup("" /* empty priorityClassName */), + operation: admission.Create, + expectedPriorityClass: "", + expectedPriority: 0, + enableWorkloadAwarePreemption: true, + }, + { + name: "pod group create with pre-set Priority matching computed value, accepted", + priorityClasses: []*scheduling.PriorityClass{defaultClass1, nondefaultClass1}, + preparePodGroup: podGroupWithPriority("nondefault1", nondefaultClass1.Value), + operation: admission.Create, + expectedPriorityClass: "nondefault1", + expectedPriority: nondefaultClass1.Value, + enableWorkloadAwarePreemption: true, + }, + { + name: "pod group create with pre-set Priority not matching computed value, rejected", + priorityClasses: []*scheduling.PriorityClass{defaultClass1, nondefaultClass1}, + preparePodGroup: podGroupWithPriority("nondefault1", int32(9999)), + operation: admission.Create, + enableWorkloadAwarePreemption: true, + expectError: true, + }, + { + name: "update operation is a no-op, admission does not mutate pod group on update", + priorityClasses: []*scheduling.PriorityClass{defaultClass1, nondefaultClass1}, + preparePodGroup: podGroup("non-existent"), + operation: admission.Update, + enableWorkloadAwarePreemption: true, + }, + } + + for _, tt := range testCases { + t.Run(tt.name, func(t *testing.T) { + featuregatetesting.SetFeatureGatesDuringTest(t, feature.DefaultFeatureGate, featuregatetesting.FeatureOverrides{ + features.GenericWorkload: true, + features.GangScheduling: true, + features.WorkloadAwarePreemption: tt.enableWorkloadAwarePreemption, + }) + + admissionPlugin := NewPlugin() + if err := addPriorityClasses(admissionPlugin, tt.priorityClasses); err != nil { + t.Fatalf("unable to configure priority classes: %v", err) + } + + _, ctx := ktesting.NewTestContext(t) + podGroupCopy := tt.preparePodGroup.DeepCopy() + err := admissionPlugin.Admit(ctx, attributes(tt.preparePodGroup, tt.operation), nil) + if (err != nil) != tt.expectError { + t.Errorf("PodGroup Admit(), error = %v, want = %v", err, tt.expectError) + } + if !tt.expectError && tt.operation == admission.Create && tt.enableWorkloadAwarePreemption && tt.preparePodGroup.Spec.PodGroupTemplateRef == nil { + if tt.preparePodGroup.Spec.PriorityClassName != tt.expectedPriorityClass { + t.Errorf("PodGroup Admit(), priorityClassName = %v, want = %v", tt.preparePodGroup.Spec.PriorityClassName, tt.expectedPriorityClass) + } + if *tt.preparePodGroup.Spec.Priority != tt.expectedPriority { + t.Errorf("PodGroup Admit(), Priority = %v, want = %v", *tt.preparePodGroup.Spec.Priority, tt.expectedPriority) + } + } + if tt.operation != admission.Create { + if diff := cmp.Diff(tt.preparePodGroup, podGroupCopy); len(diff) > 0 { + t.Errorf("PodGroup Admit() should not modify the PodGroup (-want +got):\n%s", diff) + } + } + }) + } +} diff --git a/staging/src/k8s.io/api/scheduling/v1alpha2/generated.pb.go b/staging/src/k8s.io/api/scheduling/v1alpha2/generated.pb.go index 74ee4485df9..12402df3d27 100644 --- a/staging/src/k8s.io/api/scheduling/v1alpha2/generated.pb.go +++ b/staging/src/k8s.io/api/scheduling/v1alpha2/generated.pb.go @@ -397,6 +397,23 @@ func (m *PodGroupSpec) MarshalToSizedBuffer(dAtA []byte) (int, error) { _ = i var l int _ = l + if m.Priority != nil { + i = encodeVarintGenerated(dAtA, i, uint64(*m.Priority)) + i-- + dAtA[i] = 0x38 + } + i -= len(m.PriorityClassName) + copy(dAtA[i:], m.PriorityClassName) + i = encodeVarintGenerated(dAtA, i, uint64(len(m.PriorityClassName))) + i-- + dAtA[i] = 0x32 + if m.DisruptionMode != nil { + i -= len(*m.DisruptionMode) + copy(dAtA[i:], *m.DisruptionMode) + i = encodeVarintGenerated(dAtA, i, uint64(len(*m.DisruptionMode))) + i-- + dAtA[i] = 0x2a + } if len(m.ResourceClaims) > 0 { for iNdEx := len(m.ResourceClaims) - 1; iNdEx >= 0; iNdEx-- { { @@ -519,6 +536,23 @@ func (m *PodGroupTemplate) MarshalToSizedBuffer(dAtA []byte) (int, error) { _ = i var l int _ = l + if m.Priority != nil { + i = encodeVarintGenerated(dAtA, i, uint64(*m.Priority)) + i-- + dAtA[i] = 0x38 + } + i -= len(m.PriorityClassName) + copy(dAtA[i:], m.PriorityClassName) + i = encodeVarintGenerated(dAtA, i, uint64(len(m.PriorityClassName))) + i-- + dAtA[i] = 0x32 + if m.DisruptionMode != nil { + i -= len(*m.DisruptionMode) + copy(dAtA[i:], *m.DisruptionMode) + i = encodeVarintGenerated(dAtA, i, uint64(len(*m.DisruptionMode))) + i-- + dAtA[i] = 0x2a + } if len(m.ResourceClaims) > 0 { for iNdEx := len(m.ResourceClaims) - 1; iNdEx >= 0; iNdEx-- { { @@ -986,6 +1020,15 @@ func (m *PodGroupSpec) Size() (n int) { n += 1 + l + sovGenerated(uint64(l)) } } + if m.DisruptionMode != nil { + l = len(*m.DisruptionMode) + n += 1 + l + sovGenerated(uint64(l)) + } + l = len(m.PriorityClassName) + n += 1 + l + sovGenerated(uint64(l)) + if m.Priority != nil { + n += 1 + sovGenerated(uint64(*m.Priority)) + } return n } @@ -1030,6 +1073,15 @@ func (m *PodGroupTemplate) Size() (n int) { n += 1 + l + sovGenerated(uint64(l)) } } + if m.DisruptionMode != nil { + l = len(*m.DisruptionMode) + n += 1 + l + sovGenerated(uint64(l)) + } + l = len(m.PriorityClassName) + n += 1 + l + sovGenerated(uint64(l)) + if m.Priority != nil { + n += 1 + sovGenerated(uint64(*m.Priority)) + } return n } @@ -1250,6 +1302,9 @@ func (this *PodGroupSpec) String() string { `SchedulingPolicy:` + strings.Replace(strings.Replace(this.SchedulingPolicy.String(), "PodGroupSchedulingPolicy", "PodGroupSchedulingPolicy", 1), `&`, ``, 1) + `,`, `SchedulingConstraints:` + strings.Replace(this.SchedulingConstraints.String(), "PodGroupSchedulingConstraints", "PodGroupSchedulingConstraints", 1) + `,`, `ResourceClaims:` + repeatedStringForResourceClaims + `,`, + `DisruptionMode:` + valueToStringGenerated(this.DisruptionMode) + `,`, + `PriorityClassName:` + fmt.Sprintf("%v", this.PriorityClassName) + `,`, + `Priority:` + valueToStringGenerated(this.Priority) + `,`, `}`, }, "") return s @@ -1289,6 +1344,9 @@ func (this *PodGroupTemplate) String() string { `SchedulingPolicy:` + strings.Replace(strings.Replace(this.SchedulingPolicy.String(), "PodGroupSchedulingPolicy", "PodGroupSchedulingPolicy", 1), `&`, ``, 1) + `,`, `SchedulingConstraints:` + strings.Replace(this.SchedulingConstraints.String(), "PodGroupSchedulingConstraints", "PodGroupSchedulingConstraints", 1) + `,`, `ResourceClaims:` + repeatedStringForResourceClaims + `,`, + `DisruptionMode:` + valueToStringGenerated(this.DisruptionMode) + `,`, + `PriorityClassName:` + fmt.Sprintf("%v", this.PriorityClassName) + `,`, + `Priority:` + valueToStringGenerated(this.Priority) + `,`, `}`, }, "") return s @@ -2409,6 +2467,91 @@ func (m *PodGroupSpec) Unmarshal(dAtA []byte) error { return err } iNdEx = postIndex + case 5: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field DisruptionMode", wireType) + } + var stringLen uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowGenerated + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + stringLen |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + intStringLen := int(stringLen) + if intStringLen < 0 { + return ErrInvalidLengthGenerated + } + postIndex := iNdEx + intStringLen + if postIndex < 0 { + return ErrInvalidLengthGenerated + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + s := DisruptionMode(dAtA[iNdEx:postIndex]) + m.DisruptionMode = &s + iNdEx = postIndex + case 6: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field PriorityClassName", wireType) + } + var stringLen uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowGenerated + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + stringLen |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + intStringLen := int(stringLen) + if intStringLen < 0 { + return ErrInvalidLengthGenerated + } + postIndex := iNdEx + intStringLen + if postIndex < 0 { + return ErrInvalidLengthGenerated + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.PriorityClassName = string(dAtA[iNdEx:postIndex]) + iNdEx = postIndex + case 7: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field Priority", wireType) + } + var v int32 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowGenerated + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + v |= int32(b&0x7F) << shift + if b < 0x80 { + break + } + } + m.Priority = &v default: iNdEx = preIndex skippy, err := skipGenerated(dAtA[iNdEx:]) @@ -2712,6 +2855,91 @@ func (m *PodGroupTemplate) Unmarshal(dAtA []byte) error { return err } iNdEx = postIndex + case 5: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field DisruptionMode", wireType) + } + var stringLen uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowGenerated + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + stringLen |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + intStringLen := int(stringLen) + if intStringLen < 0 { + return ErrInvalidLengthGenerated + } + postIndex := iNdEx + intStringLen + if postIndex < 0 { + return ErrInvalidLengthGenerated + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + s := DisruptionMode(dAtA[iNdEx:postIndex]) + m.DisruptionMode = &s + iNdEx = postIndex + case 6: + if wireType != 2 { + return fmt.Errorf("proto: wrong wireType = %d for field PriorityClassName", wireType) + } + var stringLen uint64 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowGenerated + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + stringLen |= uint64(b&0x7F) << shift + if b < 0x80 { + break + } + } + intStringLen := int(stringLen) + if intStringLen < 0 { + return ErrInvalidLengthGenerated + } + postIndex := iNdEx + intStringLen + if postIndex < 0 { + return ErrInvalidLengthGenerated + } + if postIndex > l { + return io.ErrUnexpectedEOF + } + m.PriorityClassName = string(dAtA[iNdEx:postIndex]) + iNdEx = postIndex + case 7: + if wireType != 0 { + return fmt.Errorf("proto: wrong wireType = %d for field Priority", wireType) + } + var v int32 + for shift := uint(0); ; shift += 7 { + if shift >= 64 { + return ErrIntOverflowGenerated + } + if iNdEx >= l { + return io.ErrUnexpectedEOF + } + b := dAtA[iNdEx] + iNdEx++ + v |= int32(b&0x7F) << shift + if b < 0x80 { + break + } + } + m.Priority = &v default: iNdEx = preIndex skippy, err := skipGenerated(dAtA[iNdEx:]) diff --git a/staging/src/k8s.io/api/scheduling/v1alpha2/generated.proto b/staging/src/k8s.io/api/scheduling/v1alpha2/generated.proto index 81583e06f1f..73b3308e38b 100644 --- a/staging/src/k8s.io/api/scheduling/v1alpha2/generated.proto +++ b/staging/src/k8s.io/api/scheduling/v1alpha2/generated.proto @@ -194,7 +194,7 @@ message PodGroupSpec { // // +optional // +k8s:optional - // +k8s:alpha(since:"1.36")=+k8s:immutable + // +k8s:immutable optional PodGroupTemplateReference podGroupTemplateRef = 1; // SchedulingPolicy defines the scheduling policy for this instance of the PodGroup. @@ -202,7 +202,7 @@ message PodGroupSpec { // This field is immutable. // // +required - // +k8s:alpha(since:"1.36")=+k8s:immutable + // +k8s:immutable optional PodGroupSchedulingPolicy schedulingPolicy = 2; // SchedulingConstraints defines optional scheduling constraints (e.g. topology) for this PodGroup. @@ -237,9 +237,59 @@ message PodGroupSpec { // +k8s:listType=map // +k8s:listMapKey=name // +k8s:maxItems=4 - // +k8s:alpha(since:"1.36")=+k8s:immutable + // +k8s:immutable // +featureGate=DRAWorkloadResourceClaims repeated PodGroupResourceClaim resourceClaims = 4; + + // DisruptionMode defines the mode in which a given PodGroup can be disrupted. + // Controllers are expected to fill this field by copying it from a PodGroupTemplate. + // One of Pod, PodGroup. Defaults to Pod if unset. + // This field is immutable. + // This field is available only when the WorkloadAwarePreemption feature gate + // is enabled. + // + // +featureGate=WorkloadAwarePreemption + // +optional + // +k8s:ifDisabled("WorkloadAwarePreemption")=+k8s:forbidden + // +k8s:ifEnabled("WorkloadAwarePreemption")=+k8s:optional + // +k8s:ifEnabled("WorkloadAwarePreemption")=+k8s:immutable + // +default="Pod" + optional string disruptionMode = 5; + + // PriorityClassName defines the priority that should be considered when scheduling this pod group. + // Controllers are expected to fill this field by copying it from a PodGroupTemplate. + // Otherwise, it is validated and resolved similarly to the PriorityClassName on PodGroupTemplate + // (i.e. if no priority class is specified, admission control can set this to the global default + // priority class if it exists. Otherwise, the pod group's priority will be zero). + // This field is immutable. + // This field is available only when the WorkloadAwarePreemption feature gate + // is enabled. + // + // +featureGate=WorkloadAwarePreemption + // +optional + // +k8s:ifDisabled("WorkloadAwarePreemption")=+k8s:forbidden + // +k8s:ifEnabled("WorkloadAwarePreemption")=+k8s:optional + // +k8s:ifEnabled("WorkloadAwarePreemption")=+k8s:format=k8s-long-name + // +k8s:ifEnabled("WorkloadAwarePreemption")=+k8s:immutable + optional string priorityClassName = 6; + + // Priority is the value of priority of this pod group. Various system components + // use this field to find the priority of the pod group. When Priority Admission + // Controller is enabled, it prevents users from setting this field. The admission + // controller populates this field from PriorityClassName. + // The higher the value, the higher the priority. + // This field is immutable. + // This field is available only when the WorkloadAwarePreemption feature gate + // is enabled. + // + // +featureGate=WorkloadAwarePreemption + // +optional + // +k8s:ifDisabled("WorkloadAwarePreemption")=+k8s:forbidden + // +k8s:ifEnabled("WorkloadAwarePreemption")=+k8s:optional + // +k8s:ifEnabled("WorkloadAwarePreemption")=+k8s:immutable + // +k8s:ifEnabled("WorkloadAwarePreemption")=+k8s:maximum=1000000000 # HighestUserDefinablePriority + // +k8s:ifEnabled("WorkloadAwarePreemption")=+k8s:minimum=-2147483648 + optional int32 priority = 7; } // PodGroupStatus represents information about the status of a pod group. @@ -326,9 +376,50 @@ message PodGroupTemplate { // +k8s:listType=map // +k8s:listMapKey=name // +k8s:maxItems=4 - // +k8s:alpha(since:"1.36")=+k8s:immutable + // +k8s:immutable // +featureGate=DRAWorkloadResourceClaims repeated PodGroupResourceClaim resourceClaims = 4; + + // DisruptionMode defines the mode in which a given PodGroup can be disrupted. + // One of Pod, PodGroup. + // This field is available only when the WorkloadAwarePreemption feature gate + // is enabled. + // + // +featureGate=WorkloadAwarePreemption + // +optional + // +k8s:ifDisabled("WorkloadAwarePreemption")=+k8s:forbidden + // +k8s:ifEnabled("WorkloadAwarePreemption")=+k8s:optional + optional string disruptionMode = 5; + + // PriorityClassName indicates the priority that should be considered when scheduling + // a pod group created from this template. If no priority class is specified, admission + // control can set this to the global default priority class if it exists. Otherwise, + // pod groups created from this template will have the priority set to zero. + // This field is available only when the WorkloadAwarePreemption feature gate + // is enabled. + // + // +featureGate=WorkloadAwarePreemption + // +optional + // +k8s:ifDisabled("WorkloadAwarePreemption")=+k8s:forbidden + // +k8s:ifEnabled("WorkloadAwarePreemption")=+k8s:optional + // +k8s:ifEnabled("WorkloadAwarePreemption")=+k8s:format=k8s-long-name + optional string priorityClassName = 6; + + // Priority is the value of priority of pod groups created from this template. Various + // system components use this field to find the priority of the pod group. When + // Priority Admission Controller is enabled, it prevents users from setting this field. + // The admission controller populates this field from PriorityClassName. + // The higher the value, the higher the priority. + // This field is available only when the WorkloadAwarePreemption feature gate + // is enabled. + // + // +featureGate=WorkloadAwarePreemption + // +optional + // +k8s:ifDisabled("WorkloadAwarePreemption")=+k8s:forbidden + // +k8s:ifEnabled("WorkloadAwarePreemption")=+k8s:optional + // +k8s:ifEnabled("WorkloadAwarePreemption")=+k8s:maximum=1000000000 # HighestUserDefinablePriority + // +k8s:ifEnabled("WorkloadAwarePreemption")=+k8s:minimum=-2147483648 + optional int32 priority = 7; } // PodGroupTemplateReference references a PodGroup template defined in some object (e.g. Workload). @@ -440,7 +531,7 @@ message WorkloadSpec { // // +optional // +k8s:optional - // +k8s:alpha(since:"1.36")=+k8s:immutable + // +k8s:immutable optional TypedLocalObjectReference controllerRef = 1; // PodGroupTemplates is the list of templates that make up the Workload. @@ -453,7 +544,7 @@ message WorkloadSpec { // +k8s:listType=map // +k8s:listMapKey=name // +k8s:maxItems=8 - // +k8s:alpha(since:"1.36")=+k8s:immutable + // +k8s:immutable repeated PodGroupTemplate podGroupTemplates = 2; } diff --git a/staging/src/k8s.io/api/scheduling/v1alpha2/types.go b/staging/src/k8s.io/api/scheduling/v1alpha2/types.go index b9cde8faadd..1f84f2c43d3 100644 --- a/staging/src/k8s.io/api/scheduling/v1alpha2/types.go +++ b/staging/src/k8s.io/api/scheduling/v1alpha2/types.go @@ -67,7 +67,7 @@ type WorkloadSpec struct { // // +optional // +k8s:optional - // +k8s:alpha(since:"1.36")=+k8s:immutable + // +k8s:immutable ControllerRef *TypedLocalObjectReference `json:"controllerRef,omitempty" protobuf:"bytes,1,opt,name=controllerRef"` // PodGroupTemplates is the list of templates that make up the Workload. @@ -80,7 +80,7 @@ type WorkloadSpec struct { // +k8s:listType=map // +k8s:listMapKey=name // +k8s:maxItems=8 - // +k8s:alpha(since:"1.36")=+k8s:immutable + // +k8s:immutable PodGroupTemplates []PodGroupTemplate `json:"podGroupTemplates" protobuf:"bytes,2,rep,name=podGroupTemplates"` } @@ -159,9 +159,50 @@ type PodGroupTemplate struct { // +k8s:listType=map // +k8s:listMapKey=name // +k8s:maxItems=4 - // +k8s:alpha(since:"1.36")=+k8s:immutable + // +k8s:immutable // +featureGate=DRAWorkloadResourceClaims ResourceClaims []PodGroupResourceClaim `json:"resourceClaims,omitempty" patchStrategy:"merge,retainKeys" patchMergeKey:"name" protobuf:"bytes,4,rep,name=resourceClaims"` + + // DisruptionMode defines the mode in which a given PodGroup can be disrupted. + // One of Pod, PodGroup. + // This field is available only when the WorkloadAwarePreemption feature gate + // is enabled. + // + // +featureGate=WorkloadAwarePreemption + // +optional + // +k8s:ifDisabled("WorkloadAwarePreemption")=+k8s:forbidden + // +k8s:ifEnabled("WorkloadAwarePreemption")=+k8s:optional + DisruptionMode *DisruptionMode `json:"disruptionMode,omitempty" protobuf:"bytes,5,opt,name=disruptionMode,casttype=DisruptionMode"` + + // PriorityClassName indicates the priority that should be considered when scheduling + // a pod group created from this template. If no priority class is specified, admission + // control can set this to the global default priority class if it exists. Otherwise, + // pod groups created from this template will have the priority set to zero. + // This field is available only when the WorkloadAwarePreemption feature gate + // is enabled. + // + // +featureGate=WorkloadAwarePreemption + // +optional + // +k8s:ifDisabled("WorkloadAwarePreemption")=+k8s:forbidden + // +k8s:ifEnabled("WorkloadAwarePreemption")=+k8s:optional + // +k8s:ifEnabled("WorkloadAwarePreemption")=+k8s:format=k8s-long-name + PriorityClassName string `json:"priorityClassName,omitempty" protobuf:"bytes,6,opt,name=priorityClassName"` + + // Priority is the value of priority of pod groups created from this template. Various + // system components use this field to find the priority of the pod group. When + // Priority Admission Controller is enabled, it prevents users from setting this field. + // The admission controller populates this field from PriorityClassName. + // The higher the value, the higher the priority. + // This field is available only when the WorkloadAwarePreemption feature gate + // is enabled. + // + // +featureGate=WorkloadAwarePreemption + // +optional + // +k8s:ifDisabled("WorkloadAwarePreemption")=+k8s:forbidden + // +k8s:ifEnabled("WorkloadAwarePreemption")=+k8s:optional + // +k8s:ifEnabled("WorkloadAwarePreemption")=+k8s:maximum=1000000000 # HighestUserDefinablePriority + // +k8s:ifEnabled("WorkloadAwarePreemption")=+k8s:minimum=-2147483648 + Priority *int32 `json:"priority,omitempty" protobuf:"varint,7,opt,name=priority"` } // PodGroupSchedulingPolicy defines the scheduling configuration for a PodGroup. @@ -259,6 +300,20 @@ type PodGroupResourceClaim struct { ResourceClaimTemplateName *string `json:"resourceClaimTemplateName,omitempty" protobuf:"bytes,3,opt,name=resourceClaimTemplateName"` } +// DisruptionMode describes the mode in which a PodGroup can be disrupted (e.g. preempted). +// +enum +// +k8s:enum +type DisruptionMode string + +const ( + // DisruptionModePod means that individual pods can be disrupted or preempted independently. + // It doesn't depend on exact set of pods currently running in this PodGroup. + DisruptionModePod DisruptionMode = "Pod" + // DisruptionModePodGroup means that the whole PodGroup needs to be disrupted + // or preempted together. + DisruptionModePodGroup DisruptionMode = "PodGroup" +) + // +genclient // +k8s:deepcopy-gen:interfaces=k8s.io/apimachinery/pkg/runtime.Object // +k8s:supportsSubresource="/status" @@ -307,7 +362,7 @@ type PodGroupSpec struct { // // +optional // +k8s:optional - // +k8s:alpha(since:"1.36")=+k8s:immutable + // +k8s:immutable PodGroupTemplateRef *PodGroupTemplateReference `json:"podGroupTemplateRef" protobuf:"bytes,1,opt,name=podGroupTemplateRef"` // SchedulingPolicy defines the scheduling policy for this instance of the PodGroup. @@ -315,7 +370,7 @@ type PodGroupSpec struct { // This field is immutable. // // +required - // +k8s:alpha(since:"1.36")=+k8s:immutable + // +k8s:immutable SchedulingPolicy PodGroupSchedulingPolicy `json:"schedulingPolicy" protobuf:"bytes,2,opt,name=schedulingPolicy"` // SchedulingConstraints defines optional scheduling constraints (e.g. topology) for this PodGroup. @@ -350,9 +405,59 @@ type PodGroupSpec struct { // +k8s:listType=map // +k8s:listMapKey=name // +k8s:maxItems=4 - // +k8s:alpha(since:"1.36")=+k8s:immutable + // +k8s:immutable // +featureGate=DRAWorkloadResourceClaims ResourceClaims []PodGroupResourceClaim `json:"resourceClaims,omitempty" patchStrategy:"merge,retainKeys" patchMergeKey:"name" protobuf:"bytes,4,rep,name=resourceClaims"` + + // DisruptionMode defines the mode in which a given PodGroup can be disrupted. + // Controllers are expected to fill this field by copying it from a PodGroupTemplate. + // One of Pod, PodGroup. Defaults to Pod if unset. + // This field is immutable. + // This field is available only when the WorkloadAwarePreemption feature gate + // is enabled. + // + // +featureGate=WorkloadAwarePreemption + // +optional + // +k8s:ifDisabled("WorkloadAwarePreemption")=+k8s:forbidden + // +k8s:ifEnabled("WorkloadAwarePreemption")=+k8s:optional + // +k8s:ifEnabled("WorkloadAwarePreemption")=+k8s:immutable + // +default="Pod" + DisruptionMode *DisruptionMode `json:"disruptionMode,omitempty" protobuf:"bytes,5,opt,name=disruptionMode,casttype=DisruptionMode"` + + // PriorityClassName defines the priority that should be considered when scheduling this pod group. + // Controllers are expected to fill this field by copying it from a PodGroupTemplate. + // Otherwise, it is validated and resolved similarly to the PriorityClassName on PodGroupTemplate + // (i.e. if no priority class is specified, admission control can set this to the global default + // priority class if it exists. Otherwise, the pod group's priority will be zero). + // This field is immutable. + // This field is available only when the WorkloadAwarePreemption feature gate + // is enabled. + // + // +featureGate=WorkloadAwarePreemption + // +optional + // +k8s:ifDisabled("WorkloadAwarePreemption")=+k8s:forbidden + // +k8s:ifEnabled("WorkloadAwarePreemption")=+k8s:optional + // +k8s:ifEnabled("WorkloadAwarePreemption")=+k8s:format=k8s-long-name + // +k8s:ifEnabled("WorkloadAwarePreemption")=+k8s:immutable + PriorityClassName string `json:"priorityClassName,omitempty" protobuf:"bytes,6,opt,name=priorityClassName"` + + // Priority is the value of priority of this pod group. Various system components + // use this field to find the priority of the pod group. When Priority Admission + // Controller is enabled, it prevents users from setting this field. The admission + // controller populates this field from PriorityClassName. + // The higher the value, the higher the priority. + // This field is immutable. + // This field is available only when the WorkloadAwarePreemption feature gate + // is enabled. + // + // +featureGate=WorkloadAwarePreemption + // +optional + // +k8s:ifDisabled("WorkloadAwarePreemption")=+k8s:forbidden + // +k8s:ifEnabled("WorkloadAwarePreemption")=+k8s:optional + // +k8s:ifEnabled("WorkloadAwarePreemption")=+k8s:immutable + // +k8s:ifEnabled("WorkloadAwarePreemption")=+k8s:maximum=1000000000 # HighestUserDefinablePriority + // +k8s:ifEnabled("WorkloadAwarePreemption")=+k8s:minimum=-2147483648 + Priority *int32 `json:"priority,omitempty" protobuf:"varint,7,opt,name=priority"` } // PodGroupStatus represents information about the status of a pod group. diff --git a/staging/src/k8s.io/api/scheduling/v1alpha2/types_swagger_doc_generated.go b/staging/src/k8s.io/api/scheduling/v1alpha2/types_swagger_doc_generated.go index 749eddb2c02..2dc06b1c6f7 100644 --- a/staging/src/k8s.io/api/scheduling/v1alpha2/types_swagger_doc_generated.go +++ b/staging/src/k8s.io/api/scheduling/v1alpha2/types_swagger_doc_generated.go @@ -111,6 +111,9 @@ var map_PodGroupSpec = map[string]string{ "schedulingPolicy": "SchedulingPolicy defines the scheduling policy for this instance of the PodGroup. Controllers are expected to fill this field by copying it from a PodGroupTemplate. This field is immutable.", "schedulingConstraints": "SchedulingConstraints defines optional scheduling constraints (e.g. topology) for this PodGroup. Controllers are expected to fill this field by copying it from a PodGroupTemplate. This field is immutable. This field is only available when the TopologyAwareWorkloadScheduling feature gate is enabled.", "resourceClaims": "ResourceClaims defines which ResourceClaims may be shared among Pods in the group. Pods consume the devices allocated to a PodGroup's claim by defining a claim in its own Spec.ResourceClaims that matches the PodGroup's claim exactly. The claim must have the same name and refer to the same ResourceClaim or ResourceClaimTemplate.\n\nThis is an alpha-level field and requires that the DRAWorkloadResourceClaims feature gate is enabled.\n\nThis field is immutable.", + "disruptionMode": "DisruptionMode defines the mode in which a given PodGroup can be disrupted. Controllers are expected to fill this field by copying it from a PodGroupTemplate. One of Pod, PodGroup. Defaults to Pod if unset. This field is immutable. This field is available only when the WorkloadAwarePreemption feature gate is enabled.", + "priorityClassName": "PriorityClassName defines the priority that should be considered when scheduling this pod group. Controllers are expected to fill this field by copying it from a PodGroupTemplate. Otherwise, it is validated and resolved similarly to the PriorityClassName on PodGroupTemplate (i.e. if no priority class is specified, admission control can set this to the global default priority class if it exists. Otherwise, the pod group's priority will be zero). This field is immutable. This field is available only when the WorkloadAwarePreemption feature gate is enabled.", + "priority": "Priority is the value of priority of this pod group. Various system components use this field to find the priority of the pod group. When Priority Admission Controller is enabled, it prevents users from setting this field. The admission controller populates this field from PriorityClassName. The higher the value, the higher the priority. This field is immutable. This field is available only when the WorkloadAwarePreemption feature gate is enabled.", } func (PodGroupSpec) SwaggerDoc() map[string]string { @@ -133,6 +136,9 @@ var map_PodGroupTemplate = map[string]string{ "schedulingPolicy": "SchedulingPolicy defines the scheduling policy for this PodGroupTemplate.", "schedulingConstraints": "SchedulingConstraints defines optional scheduling constraints (e.g. topology) for this PodGroupTemplate. This field is only available when the TopologyAwareWorkloadScheduling feature gate is enabled.", "resourceClaims": "ResourceClaims defines which ResourceClaims may be shared among Pods in the group. Pods consume the devices allocated to a PodGroup's claim by defining a claim in its own Spec.ResourceClaims that matches the PodGroup's claim exactly. The claim must have the same name and refer to the same ResourceClaim or ResourceClaimTemplate.\n\nThis is an alpha-level field and requires that the DRAWorkloadResourceClaims feature gate is enabled.\n\nThis field is immutable.", + "disruptionMode": "DisruptionMode defines the mode in which a given PodGroup can be disrupted. One of Pod, PodGroup. This field is available only when the WorkloadAwarePreemption feature gate is enabled.", + "priorityClassName": "PriorityClassName indicates the priority that should be considered when scheduling a pod group created from this template. If no priority class is specified, admission control can set this to the global default priority class if it exists. Otherwise, pod groups created from this template will have the priority set to zero. This field is available only when the WorkloadAwarePreemption feature gate is enabled.", + "priority": "Priority is the value of priority of pod groups created from this template. Various system components use this field to find the priority of the pod group. When Priority Admission Controller is enabled, it prevents users from setting this field. The admission controller populates this field from PriorityClassName. The higher the value, the higher the priority. This field is available only when the WorkloadAwarePreemption feature gate is enabled.", } func (PodGroupTemplate) SwaggerDoc() map[string]string { diff --git a/staging/src/k8s.io/api/scheduling/v1alpha2/zz_generated.deepcopy.go b/staging/src/k8s.io/api/scheduling/v1alpha2/zz_generated.deepcopy.go index 36a8b605cc4..901bb0a3ad9 100644 --- a/staging/src/k8s.io/api/scheduling/v1alpha2/zz_generated.deepcopy.go +++ b/staging/src/k8s.io/api/scheduling/v1alpha2/zz_generated.deepcopy.go @@ -234,6 +234,16 @@ func (in *PodGroupSpec) DeepCopyInto(out *PodGroupSpec) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.DisruptionMode != nil { + in, out := &in.DisruptionMode, &out.DisruptionMode + *out = new(DisruptionMode) + **out = **in + } + if in.Priority != nil { + in, out := &in.Priority, &out.Priority + *out = new(int32) + **out = **in + } return } @@ -293,6 +303,16 @@ func (in *PodGroupTemplate) DeepCopyInto(out *PodGroupTemplate) { (*in)[i].DeepCopyInto(&(*out)[i]) } } + if in.DisruptionMode != nil { + in, out := &in.DisruptionMode, &out.DisruptionMode + *out = new(DisruptionMode) + **out = **in + } + if in.Priority != nil { + in, out := &in.Priority, &out.Priority + *out = new(int32) + **out = **in + } return } diff --git a/staging/src/k8s.io/api/testdata/HEAD/scheduling.k8s.io.v1alpha2.PodGroup.json b/staging/src/k8s.io/api/testdata/HEAD/scheduling.k8s.io.v1alpha2.PodGroup.json index 6872a89766e..bab4a230a3a 100644 --- a/staging/src/k8s.io/api/testdata/HEAD/scheduling.k8s.io.v1alpha2.PodGroup.json +++ b/staging/src/k8s.io/api/testdata/HEAD/scheduling.k8s.io.v1alpha2.PodGroup.json @@ -69,7 +69,10 @@ "resourceClaimName": "resourceClaimNameValue", "resourceClaimTemplateName": "resourceClaimTemplateNameValue" } - ] + ], + "disruptionMode": "disruptionModeValue", + "priorityClassName": "priorityClassNameValue", + "priority": 7 }, "status": { "conditions": [ diff --git a/staging/src/k8s.io/api/testdata/HEAD/scheduling.k8s.io.v1alpha2.PodGroup.pb b/staging/src/k8s.io/api/testdata/HEAD/scheduling.k8s.io.v1alpha2.PodGroup.pb index b9ebedde83536e86e683e3fded0f5d6f0adb300e..4813dba502602bd65d7852b15d8ce4672f0c620f 100644 GIT binary patch delta 71 zcmeBU-OD;bm#K?&qkak_FyGV_Zv WODdgn5{rxd5_2Kq7VMLim~sGvtQk81 delta 23 fcmdnX+Q&LUm+3akM*S2<#@@+O82cs%GUWgOUR(#y diff --git a/staging/src/k8s.io/api/testdata/HEAD/scheduling.k8s.io.v1alpha2.PodGroup.yaml b/staging/src/k8s.io/api/testdata/HEAD/scheduling.k8s.io.v1alpha2.PodGroup.yaml index 37e11ee4251..169b493f20b 100644 --- a/staging/src/k8s.io/api/testdata/HEAD/scheduling.k8s.io.v1alpha2.PodGroup.yaml +++ b/staging/src/k8s.io/api/testdata/HEAD/scheduling.k8s.io.v1alpha2.PodGroup.yaml @@ -33,10 +33,13 @@ metadata: selfLink: selfLinkValue uid: uidValue spec: + disruptionMode: disruptionModeValue podGroupTemplateRef: workload: podGroupTemplateName: podGroupTemplateNameValue workloadName: workloadNameValue + priority: 7 + priorityClassName: priorityClassNameValue resourceClaims: - name: nameValue resourceClaimName: resourceClaimNameValue diff --git a/staging/src/k8s.io/api/testdata/HEAD/scheduling.k8s.io.v1alpha2.Workload.json b/staging/src/k8s.io/api/testdata/HEAD/scheduling.k8s.io.v1alpha2.Workload.json index 0fcb88663b3..672365d0ad8 100644 --- a/staging/src/k8s.io/api/testdata/HEAD/scheduling.k8s.io.v1alpha2.Workload.json +++ b/staging/src/k8s.io/api/testdata/HEAD/scheduling.k8s.io.v1alpha2.Workload.json @@ -71,7 +71,10 @@ "resourceClaimName": "resourceClaimNameValue", "resourceClaimTemplateName": "resourceClaimTemplateNameValue" } - ] + ], + "disruptionMode": "disruptionModeValue", + "priorityClassName": "priorityClassNameValue", + "priority": 7 } ] } diff --git a/staging/src/k8s.io/api/testdata/HEAD/scheduling.k8s.io.v1alpha2.Workload.pb b/staging/src/k8s.io/api/testdata/HEAD/scheduling.k8s.io.v1alpha2.Workload.pb index f8233dc98e44831a02243faa1eb197db0bf3ecd9..d26486caaa9c5cca78172cec15f716e4842f75cc 100644 GIT binary patch delta 78 zcmZ3%a)V`pF4Iz$jru8!jQb}~Vbo-r$~bu+W16v+a7t!zQE5R*W`3S;eoAUsVoqtQ ekyt@dW`0p-Nu_g6VsWuwVlG77f?bM1i2(pE`x=t~ delta 30 mcmcb?vVvuTF4I@$jru8!jQx|RFlsWUP2SI##v;X_!~g)D+X%)0 diff --git a/staging/src/k8s.io/api/testdata/HEAD/scheduling.k8s.io.v1alpha2.Workload.yaml b/staging/src/k8s.io/api/testdata/HEAD/scheduling.k8s.io.v1alpha2.Workload.yaml index 92d274ec03d..b145b727bf7 100644 --- a/staging/src/k8s.io/api/testdata/HEAD/scheduling.k8s.io.v1alpha2.Workload.yaml +++ b/staging/src/k8s.io/api/testdata/HEAD/scheduling.k8s.io.v1alpha2.Workload.yaml @@ -38,7 +38,10 @@ spec: kind: kindValue name: nameValue podGroupTemplates: - - name: nameValue + - disruptionMode: disruptionModeValue + name: nameValue + priority: 7 + priorityClassName: priorityClassNameValue resourceClaims: - name: nameValue resourceClaimName: resourceClaimNameValue diff --git a/staging/src/k8s.io/client-go/applyconfigurations/internal/internal.go b/staging/src/k8s.io/client-go/applyconfigurations/internal/internal.go index 3736e88bc9d..c0b255cc739 100644 --- a/staging/src/k8s.io/client-go/applyconfigurations/internal/internal.go +++ b/staging/src/k8s.io/client-go/applyconfigurations/internal/internal.go @@ -14801,9 +14801,19 @@ var schemaYAML = typed.YAMLObject(`types: - name: io.k8s.api.scheduling.v1alpha2.PodGroupSpec map: fields: + - name: disruptionMode + type: + scalar: string + default: Pod - name: podGroupTemplateRef type: namedType: io.k8s.api.scheduling.v1alpha2.PodGroupTemplateReference + - name: priority + type: + scalar: numeric + - name: priorityClassName + type: + scalar: string - name: resourceClaims type: list: @@ -14841,10 +14851,19 @@ var schemaYAML = typed.YAMLObject(`types: - name: io.k8s.api.scheduling.v1alpha2.PodGroupTemplate map: fields: + - name: disruptionMode + type: + scalar: string - name: name type: scalar: string default: "" + - name: priority + type: + scalar: numeric + - name: priorityClassName + type: + scalar: string - name: resourceClaims type: list: diff --git a/staging/src/k8s.io/client-go/applyconfigurations/scheduling/v1alpha2/podgroupspec.go b/staging/src/k8s.io/client-go/applyconfigurations/scheduling/v1alpha2/podgroupspec.go index dc0bdd8108a..76538c6221c 100644 --- a/staging/src/k8s.io/client-go/applyconfigurations/scheduling/v1alpha2/podgroupspec.go +++ b/staging/src/k8s.io/client-go/applyconfigurations/scheduling/v1alpha2/podgroupspec.go @@ -18,6 +18,10 @@ limitations under the License. package v1alpha2 +import ( + schedulingv1alpha2 "k8s.io/api/scheduling/v1alpha2" +) + // PodGroupSpecApplyConfiguration represents a declarative configuration of the PodGroupSpec type for use // with apply. // @@ -46,6 +50,31 @@ type PodGroupSpecApplyConfiguration struct { // // This field is immutable. ResourceClaims []PodGroupResourceClaimApplyConfiguration `json:"resourceClaims,omitempty"` + // DisruptionMode defines the mode in which a given PodGroup can be disrupted. + // Controllers are expected to fill this field by copying it from a PodGroupTemplate. + // One of Pod, PodGroup. Defaults to Pod if unset. + // This field is immutable. + // This field is available only when the WorkloadAwarePreemption feature gate + // is enabled. + DisruptionMode *schedulingv1alpha2.DisruptionMode `json:"disruptionMode,omitempty"` + // PriorityClassName defines the priority that should be considered when scheduling this pod group. + // Controllers are expected to fill this field by copying it from a PodGroupTemplate. + // Otherwise, it is validated and resolved similarly to the PriorityClassName on PodGroupTemplate + // (i.e. if no priority class is specified, admission control can set this to the global default + // priority class if it exists. Otherwise, the pod group's priority will be zero). + // This field is immutable. + // This field is available only when the WorkloadAwarePreemption feature gate + // is enabled. + PriorityClassName *string `json:"priorityClassName,omitempty"` + // Priority is the value of priority of this pod group. Various system components + // use this field to find the priority of the pod group. When Priority Admission + // Controller is enabled, it prevents users from setting this field. The admission + // controller populates this field from PriorityClassName. + // The higher the value, the higher the priority. + // This field is immutable. + // This field is available only when the WorkloadAwarePreemption feature gate + // is enabled. + Priority *int32 `json:"priority,omitempty"` } // PodGroupSpecApplyConfiguration constructs a declarative configuration of the PodGroupSpec type for use with @@ -90,3 +119,27 @@ func (b *PodGroupSpecApplyConfiguration) WithResourceClaims(values ...*PodGroupR } return b } + +// WithDisruptionMode sets the DisruptionMode field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the DisruptionMode field is set to the value of the last call. +func (b *PodGroupSpecApplyConfiguration) WithDisruptionMode(value schedulingv1alpha2.DisruptionMode) *PodGroupSpecApplyConfiguration { + b.DisruptionMode = &value + return b +} + +// WithPriorityClassName sets the PriorityClassName field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the PriorityClassName field is set to the value of the last call. +func (b *PodGroupSpecApplyConfiguration) WithPriorityClassName(value string) *PodGroupSpecApplyConfiguration { + b.PriorityClassName = &value + return b +} + +// WithPriority sets the Priority field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Priority field is set to the value of the last call. +func (b *PodGroupSpecApplyConfiguration) WithPriority(value int32) *PodGroupSpecApplyConfiguration { + b.Priority = &value + return b +} diff --git a/staging/src/k8s.io/client-go/applyconfigurations/scheduling/v1alpha2/podgrouptemplate.go b/staging/src/k8s.io/client-go/applyconfigurations/scheduling/v1alpha2/podgrouptemplate.go index d3e5b2065a4..695caf2997e 100644 --- a/staging/src/k8s.io/client-go/applyconfigurations/scheduling/v1alpha2/podgrouptemplate.go +++ b/staging/src/k8s.io/client-go/applyconfigurations/scheduling/v1alpha2/podgrouptemplate.go @@ -18,6 +18,10 @@ limitations under the License. package v1alpha2 +import ( + schedulingv1alpha2 "k8s.io/api/scheduling/v1alpha2" +) + // PodGroupTemplateApplyConfiguration represents a declarative configuration of the PodGroupTemplate type for use // with apply. // @@ -42,6 +46,26 @@ type PodGroupTemplateApplyConfiguration struct { // // This field is immutable. ResourceClaims []PodGroupResourceClaimApplyConfiguration `json:"resourceClaims,omitempty"` + // DisruptionMode defines the mode in which a given PodGroup can be disrupted. + // One of Pod, PodGroup. + // This field is available only when the WorkloadAwarePreemption feature gate + // is enabled. + DisruptionMode *schedulingv1alpha2.DisruptionMode `json:"disruptionMode,omitempty"` + // PriorityClassName indicates the priority that should be considered when scheduling + // a pod group created from this template. If no priority class is specified, admission + // control can set this to the global default priority class if it exists. Otherwise, + // pod groups created from this template will have the priority set to zero. + // This field is available only when the WorkloadAwarePreemption feature gate + // is enabled. + PriorityClassName *string `json:"priorityClassName,omitempty"` + // Priority is the value of priority of pod groups created from this template. Various + // system components use this field to find the priority of the pod group. When + // Priority Admission Controller is enabled, it prevents users from setting this field. + // The admission controller populates this field from PriorityClassName. + // The higher the value, the higher the priority. + // This field is available only when the WorkloadAwarePreemption feature gate + // is enabled. + Priority *int32 `json:"priority,omitempty"` } // PodGroupTemplateApplyConfiguration constructs a declarative configuration of the PodGroupTemplate type for use with @@ -86,3 +110,27 @@ func (b *PodGroupTemplateApplyConfiguration) WithResourceClaims(values ...*PodGr } return b } + +// WithDisruptionMode sets the DisruptionMode field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the DisruptionMode field is set to the value of the last call. +func (b *PodGroupTemplateApplyConfiguration) WithDisruptionMode(value schedulingv1alpha2.DisruptionMode) *PodGroupTemplateApplyConfiguration { + b.DisruptionMode = &value + return b +} + +// WithPriorityClassName sets the PriorityClassName field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the PriorityClassName field is set to the value of the last call. +func (b *PodGroupTemplateApplyConfiguration) WithPriorityClassName(value string) *PodGroupTemplateApplyConfiguration { + b.PriorityClassName = &value + return b +} + +// WithPriority sets the Priority field in the declarative configuration to the given value +// and returns the receiver, so that objects can be built by chaining "With" function invocations. +// If called multiple times, the Priority field is set to the value of the last call. +func (b *PodGroupTemplateApplyConfiguration) WithPriority(value int32) *PodGroupTemplateApplyConfiguration { + b.Priority = &value + return b +} From ccaaf9d3a509f54b69c021002afb102b70824e22 Mon Sep 17 00:00:00 2001 From: Antoni Zawodny Date: Wed, 18 Mar 2026 16:25:24 +0100 Subject: [PATCH 2/2] Marry WAP logic with the new API fields --- .../default_preemption_test.go | 8 +- .../framework/preemption/executor.go | 10 +- .../preemption/podgrouppreemption.go | 15 +- .../preemption/podgrouppreemption_test.go | 417 +++++++++++++++++- pkg/scheduler/framework/preemption/types.go | 63 ++- .../framework/preemption/types_test.go | 263 +++++++++++ pkg/scheduler/testing/wrappers.go | 12 + pkg/scheduler/util/utils.go | 12 + .../scheduler/podgroup/podgroup_test.go | 20 +- .../preemption/podgrouppreemption_test.go | 121 ++++- 10 files changed, 883 insertions(+), 58 deletions(-) create mode 100644 pkg/scheduler/framework/preemption/types_test.go diff --git a/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption_test.go b/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption_test.go index 43692098652..4aa81652b4c 100644 --- a/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption_test.go +++ b/pkg/scheduler/framework/plugins/defaultpreemption/default_preemption_test.go @@ -2499,7 +2499,7 @@ func TestPreEnqueue(t *testing.T) { podToTriggerPreemption: st.MakePod().Name("p").UID("p").Namespace(v1.NamespaceDefault).PodGroupName("pg1").Priority(highPriority).Obj(), podToCheck: st.MakePod().Name("p_other").UID("p_other").Namespace(v1.NamespaceDefault).PodGroupName("pg1").Priority(highPriority).Obj(), pgs: []*v1alpha2.PodGroup{ - st.MakePodGroup().Name("pg1").UID("pg1").Namespace(v1.NamespaceDefault).Obj(), + st.MakePodGroup().Name("pg1").UID("pg1").Namespace(v1.NamespaceDefault).Priority(highPriority).Obj(), }, features: feature.Features{EnableAsyncPreemption: true, EnableWorkloadAwarePreemption: true}, expectPreemption: true, @@ -2521,8 +2521,8 @@ func TestPreEnqueue(t *testing.T) { podToTriggerPreemption: st.MakePod().Name("p").UID("p").Namespace(v1.NamespaceDefault).PodGroupName("pg1").Priority(highPriority).Obj(), podToCheck: st.MakePod().Name("p_other").UID("p_other").Namespace(v1.NamespaceDefault).PodGroupName("pg2").Priority(highPriority).Obj(), pgs: []*v1alpha2.PodGroup{ - st.MakePodGroup().Name("pg1").UID("pg1").Namespace(v1.NamespaceDefault).Obj(), - st.MakePodGroup().Name("pg2").UID("pg2").Namespace(v1.NamespaceDefault).Obj(), + st.MakePodGroup().Name("pg1").UID("pg1").Namespace(v1.NamespaceDefault).Priority(highPriority).Obj(), + st.MakePodGroup().Name("pg2").UID("pg2").Namespace(v1.NamespaceDefault).Priority(highPriority).Obj(), }, features: feature.Features{EnableAsyncPreemption: true, EnableWorkloadAwarePreemption: true}, expectPreemption: true, @@ -2533,7 +2533,7 @@ func TestPreEnqueue(t *testing.T) { podToTriggerPreemption: st.MakePod().Name("p").UID("p").Namespace(v1.NamespaceDefault).PodGroupName("pg1").Priority(highPriority).Obj(), podToCheck: st.MakePod().Name("p_other").UID("p_other").Namespace(v1.NamespaceDefault).PodGroupName("pg_missing").Priority(highPriority).Obj(), pgs: []*v1alpha2.PodGroup{ - st.MakePodGroup().Name("pg1").UID("pg1").Namespace(v1.NamespaceDefault).Obj(), + st.MakePodGroup().Name("pg1").UID("pg1").Namespace(v1.NamespaceDefault).Priority(highPriority).Obj(), }, features: feature.Features{EnableAsyncPreemption: true, EnableWorkloadAwarePreemption: true}, expectPreemption: false, diff --git a/pkg/scheduler/framework/preemption/executor.go b/pkg/scheduler/framework/preemption/executor.go index ee2692b2275..319e9a90279 100644 --- a/pkg/scheduler/framework/preemption/executor.go +++ b/pkg/scheduler/framework/preemption/executor.go @@ -496,15 +496,7 @@ func (p *podGroupExecutorPreemptor) Obj() runtime.Object { } func (p *podGroupExecutorPreemptor) Priority() int32 { - // TODO(Argh4k): Replace it with v1alpha2 pod group priority - // once it's available. - maxPrio := int32(0) - for _, pod := range p.pods { - if prio := corev1helpers.PodPriority(pod); prio > maxPrio { - maxPrio = prio - } - } - return maxPrio + return util.PodGroupPriority(p.pg) } func (p *podGroupExecutorPreemptor) Pods() map[string]*v1.Pod { diff --git a/pkg/scheduler/framework/preemption/podgrouppreemption.go b/pkg/scheduler/framework/preemption/podgrouppreemption.go index 1f8667894f8..65d41a37a27 100644 --- a/pkg/scheduler/framework/preemption/podgrouppreemption.go +++ b/pkg/scheduler/framework/preemption/podgrouppreemption.go @@ -26,6 +26,7 @@ import ( policy "k8s.io/api/policy/v1" schedulingapi "k8s.io/api/scheduling/v1alpha2" policylisters "k8s.io/client-go/listers/policy/v1" + schedulinglisters "k8s.io/client-go/listers/scheduling/v1alpha2" "k8s.io/kubernetes/pkg/scheduler/framework/plugins/names" "k8s.io/klog/v2" @@ -36,8 +37,9 @@ import ( // PodGroupEvaluator is a preemption evaluator that knows how to run // preemption where a preemptor is a pod group and the domain is the whole cluster. type PodGroupEvaluator struct { - Handle fwk.Handle - pdbLister policylisters.PodDisruptionBudgetLister + Handle fwk.Handle + pdbLister policylisters.PodDisruptionBudgetLister + podGroupLister schedulinglisters.PodGroupLister Executor *Executor } @@ -45,9 +47,10 @@ type PodGroupEvaluator struct { // NewPodGroupEvaluator creates a new PodGroupEvaluator. func NewPodGroupEvaluator(fh fwk.Handle, executor *Executor) *PodGroupEvaluator { return &PodGroupEvaluator{ - Handle: fh, - pdbLister: fh.SharedInformerFactory().Policy().V1().PodDisruptionBudgets().Lister(), - Executor: executor, + Handle: fh, + pdbLister: fh.SharedInformerFactory().Policy().V1().PodDisruptionBudgets().Lister(), + podGroupLister: fh.SharedInformerFactory().Scheduling().V1alpha2().PodGroups().Lister(), + Executor: executor, } } @@ -68,7 +71,7 @@ func (ev *PodGroupEvaluator) Preempt(ctx context.Context, pg *schedulingapi.PodG if err != nil { return fwk.AsStatus(fmt.Errorf("failed to list node infos: %w", err)) } - domain := newDomainForWorkloadPreemption(allNodes, "cluster-domain") + domain := newDomainForWorkloadPreemption(allNodes, ev.podGroupLister, "cluster-domain") preemptor := newPodGroupPreemptor(pg, pods) pdbs, err := getPodDisruptionBudgets(ev.pdbLister) if err != nil { diff --git a/pkg/scheduler/framework/preemption/podgrouppreemption_test.go b/pkg/scheduler/framework/preemption/podgrouppreemption_test.go index 592e57d5c72..700eb6cf304 100644 --- a/pkg/scheduler/framework/preemption/podgrouppreemption_test.go +++ b/pkg/scheduler/framework/preemption/podgrouppreemption_test.go @@ -18,6 +18,7 @@ package preemption import ( "context" + "fmt" "testing" "time" @@ -27,12 +28,35 @@ import ( schedulingapi "k8s.io/api/scheduling/v1alpha2" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/util/sets" + schedulinglisters "k8s.io/client-go/listers/scheduling/v1alpha2" "k8s.io/klog/v2/ktesting" fwk "k8s.io/kube-scheduler/framework" "k8s.io/kubernetes/pkg/scheduler/framework" st "k8s.io/kubernetes/pkg/scheduler/testing" ) +type mockPodGroupLister struct { + schedulinglisters.PodGroupLister + podGroups map[string]*schedulingapi.PodGroup +} + +func (m *mockPodGroupLister) PodGroups(namespace string) schedulinglisters.PodGroupNamespaceLister { + return &mockPodGroupNamespaceLister{podGroups: m.podGroups, namespace: namespace} +} + +type mockPodGroupNamespaceLister struct { + schedulinglisters.PodGroupNamespaceLister + podGroups map[string]*schedulingapi.PodGroup + namespace string +} + +func (m *mockPodGroupNamespaceLister) Get(name string) (*schedulingapi.PodGroup, error) { + if pg, ok := m.podGroups[name]; ok { + return pg, nil + } + return nil, fmt.Errorf("pod group %s not found", name) +} + func TestPodGroupEvaluator_SelectVictimsOnDomain(t *testing.T) { // blockingRule mocks complex scheduling constraints for tests. // It states: "Node 'nodeName' can host 'capacity' preempting pods, @@ -47,6 +71,7 @@ func TestPodGroupEvaluator_SelectVictimsOnDomain(t *testing.T) { name string nodeNames []string initPods []*v1.Pod + initPodGroups []*schedulingapi.PodGroup preemptor *podGroupPreemptor pdbs []*policy.PodDisruptionBudget blockingRules []blockingRule @@ -61,8 +86,12 @@ func TestPodGroupEvaluator_SelectVictimsOnDomain(t *testing.T) { st.MakePod().Name("p2").UID("v2").Node("node2").Priority(lowPriority).PodGroupName("pg1").Obj(), st.MakePod().Name("p3").UID("v3").Node("node3").Priority(lowPriority).PodGroupName("pg2").Obj(), }, + initPodGroups: []*schedulingapi.PodGroup{ + st.MakePodGroup().Name("pg1").UID("pg1").DisruptionMode(schedulingapi.DisruptionModePod).Obj(), + st.MakePodGroup().Name("pg2").UID("pg2").DisruptionMode(schedulingapi.DisruptionModePod).Obj(), + }, preemptor: newPodGroupPreemptor( - &schedulingapi.PodGroup{ObjectMeta: metav1.ObjectMeta{Name: "preemptor-pg"}}, + st.MakePodGroup().Name("preemptor-pg").Priority(highPriority).Obj(), []*v1.Pod{st.MakePod().Name("p").UID("p").Priority(highPriority).Obj()}, ), blockingRules: []blockingRule{ @@ -81,8 +110,11 @@ func TestPodGroupEvaluator_SelectVictimsOnDomain(t *testing.T) { st.MakePod().Name("p2").UID("v2").Node("node2").Priority(lowPriority).PodGroupName("pg1").StartTime(metav1.Unix(0, 0)).Obj(), st.MakePod().Name("p3").UID("v3").Node("node3").Priority(midPriority).Obj(), }, + initPodGroups: []*schedulingapi.PodGroup{ + st.MakePodGroup().Name("pg1").UID("pg1").DisruptionMode(schedulingapi.DisruptionModePod).Obj(), + }, preemptor: newPodGroupPreemptor( - &schedulingapi.PodGroup{ObjectMeta: metav1.ObjectMeta{Name: "preemptor-pg"}}, + st.MakePodGroup().Name("preemptor-pg").Priority(highPriority).Obj(), []*v1.Pod{st.MakePod().Name("p").UID("p").Priority(highPriority).Obj()}, ), blockingRules: []blockingRule{ @@ -100,8 +132,11 @@ func TestPodGroupEvaluator_SelectVictimsOnDomain(t *testing.T) { st.MakePod().Name("p1").UID("v1").Node("node1").Priority(lowPriority).PodGroupName("pg1").StartTime(metav1.Unix(1, 0)).Obj(), st.MakePod().Name("p2").UID("v2").Node("node2").Priority(lowPriority).PodGroupName("pg1").StartTime(metav1.Unix(0, 0)).Obj(), }, + initPodGroups: []*schedulingapi.PodGroup{ + st.MakePodGroup().Name("pg1").UID("pg1").DisruptionMode(schedulingapi.DisruptionModePod).Obj(), + }, preemptor: newPodGroupPreemptor( - &schedulingapi.PodGroup{ObjectMeta: metav1.ObjectMeta{Name: "preemptor-pg"}}, + st.MakePodGroup().Name("preemptor-pg").Priority(highPriority).Obj(), []*v1.Pod{st.MakePod().Name("p").UID("p").Priority(highPriority).Obj()}, ), blockingRules: []blockingRule{ @@ -121,8 +156,13 @@ func TestPodGroupEvaluator_SelectVictimsOnDomain(t *testing.T) { st.MakePod().Name("p4").UID("v4").Node("node4").Priority(midPriority).Obj(), st.MakePod().Name("p5").UID("v5").Node("node5").Priority(highPriority).PodGroupName("pg3").StartTime(metav1.Unix(0, 0)).Obj(), }, + initPodGroups: []*schedulingapi.PodGroup{ + st.MakePodGroup().Name("pg1").UID("pg1").DisruptionMode(schedulingapi.DisruptionModePod).Obj(), + st.MakePodGroup().Name("pg2").UID("pg2").DisruptionMode(schedulingapi.DisruptionModePod).Obj(), + st.MakePodGroup().Name("pg3").UID("pg3").DisruptionMode(schedulingapi.DisruptionModePod).Obj(), + }, preemptor: newPodGroupPreemptor( - &schedulingapi.PodGroup{ObjectMeta: metav1.ObjectMeta{Name: "preemptor-pg"}}, + st.MakePodGroup().Name("preemptor-pg").Priority(highPriority).Obj(), []*v1.Pod{ st.MakePod().Name("p-1").UID("p-1").Priority(highPriority).Obj(), st.MakePod().Name("p-2").UID("p-2").Priority(highPriority).Obj(), @@ -153,7 +193,7 @@ func TestPodGroupEvaluator_SelectVictimsOnDomain(t *testing.T) { }, }, preemptor: newPodGroupPreemptor( - &schedulingapi.PodGroup{ObjectMeta: metav1.ObjectMeta{Name: "preemptor-pg"}}, + st.MakePodGroup().Name("preemptor-pg").Priority(highPriority).Obj(), []*v1.Pod{st.MakePod().Name("p").UID("p").Priority(highPriority).Obj()}, ), blockingRules: []blockingRule{ @@ -172,7 +212,7 @@ func TestPodGroupEvaluator_SelectVictimsOnDomain(t *testing.T) { st.MakePod().Name("p3").UID("v3").Node("node3").Priority(lowPriority).PodGroupName("pg2").Obj(), }, preemptor: newPodGroupPreemptor( - &schedulingapi.PodGroup{ObjectMeta: metav1.ObjectMeta{Name: "preemptor-pg"}}, + st.MakePodGroup().Name("preemptor-pg").Priority(highPriority).Obj(), []*v1.Pod{ st.MakePod().Name("p-1").UID("p-1").Priority(highPriority).PreemptionPolicy(v1.PreemptNever).Obj(), st.MakePod().Name("p-2").UID("p-2").Priority(highPriority).Obj(), @@ -187,6 +227,358 @@ func TestPodGroupEvaluator_SelectVictimsOnDomain(t *testing.T) { expectedPods: []string{}, expectedStatus: fwk.NewStatus(fwk.Unschedulable), }, + { + name: "Preempt single lower priority pod", + nodeNames: []string{"node1"}, + initPods: []*v1.Pod{ + st.MakePod().Name("p1").UID("v1").Node("node1").Priority(lowPriority).Obj(), + }, + preemptor: newPodGroupPreemptor( + st.MakePodGroup().Name("preemptor-pg").Priority(highPriority).Obj(), + []*v1.Pod{st.MakePod().Name("p").UID("p").Priority(highPriority).Obj()}, + ), + blockingRules: []blockingRule{ + {nodeName: "node1", blockingVictims: sets.New("p1"), capacity: 1}, + }, + expectedPods: []string{"p1"}, + expectedStatus: fwk.NewStatus(fwk.Success), + }, + { + name: "Priority: Prefer lower priority victim", + nodeNames: []string{"node1"}, + initPods: []*v1.Pod{ + st.MakePod().Name("p1").UID("v1").Node("node1").Priority(lowPriority).Obj(), + st.MakePod().Name("p2").UID("v2").Node("node1").Priority(midPriority).Obj(), + st.MakePod().Name("p3").UID("v3").Node("node1").Priority(highPriority).Obj(), + }, + preemptor: newPodGroupPreemptor( + st.MakePodGroup().Name("preemptor-pg").Priority(highPriority).Obj(), + []*v1.Pod{st.MakePod().Name("p").UID("p").Priority(highPriority).Obj()}, + ), + blockingRules: []blockingRule{ + {nodeName: "node1", blockingVictims: sets.New("p1"), capacity: 1}, + {nodeName: "node1", blockingVictims: sets.New("p2"), capacity: 1}, + {nodeName: "node1", blockingVictims: sets.New("p3"), capacity: 1}, + }, + expectedPods: []string{"p1"}, + expectedStatus: fwk.NewStatus(fwk.Success), + }, + { + name: "Efficiency: Preempt minimum number of victims", + nodeNames: []string{"node1"}, + initPods: []*v1.Pod{ + st.MakePod().Name("p1").UID("v1").Node("node1").Priority(lowPriority).Obj(), + st.MakePod().Name("p2").UID("v2").Node("node1").Priority(lowPriority).Obj(), + st.MakePod().Name("p3").UID("v3").Node("node1").Priority(lowPriority).Obj(), + st.MakePod().Name("p4").UID("v4").Node("node1").Priority(lowPriority).Obj(), + }, + preemptor: newPodGroupPreemptor( + st.MakePodGroup().Name("preemptor-pg").Priority(highPriority).Obj(), + []*v1.Pod{st.MakePod().Name("p").UID("p").Priority(highPriority).Obj()}, + ), + blockingRules: []blockingRule{ + {nodeName: "node1", blockingVictims: sets.New("p1", "p2"), capacity: 1}, + {nodeName: "node1", blockingVictims: sets.New("p1"), capacity: 1}, + }, + expectedPods: []string{"p1"}, + expectedStatus: fwk.NewStatus(fwk.Success), + }, + { + name: "PDB: Prefer non-violating victim", + nodeNames: []string{"node1"}, + initPods: []*v1.Pod{ + st.MakePod().Name("victim-pdb").UID("v1").Node("node1").Label("app", "foo").Priority(lowPriority).Obj(), + st.MakePod().Name("victim-no-pdb").UID("v2").Node("node1").Priority(lowPriority).Obj(), + }, + pdbs: []*policy.PodDisruptionBudget{ + { + Spec: policy.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "foo"}}}, + Status: policy.PodDisruptionBudgetStatus{DisruptionsAllowed: 0}, + }, + }, + preemptor: newPodGroupPreemptor( + st.MakePodGroup().Name("preemptor-pg").Priority(highPriority).Obj(), + []*v1.Pod{st.MakePod().Name("p").UID("p").Priority(highPriority).Obj()}, + ), + blockingRules: []blockingRule{ + {nodeName: "node1", blockingVictims: sets.New("victim-pdb"), capacity: 1}, + {nodeName: "node1", blockingVictims: sets.New("victim-no-pdb"), capacity: 1}, + }, + expectedPods: []string{"victim-no-pdb"}, + expectedStatus: fwk.NewStatus(fwk.Success), + }, + { + name: "PodGroup with PreemptNever does not perform preemption", + nodeNames: []string{"node1", "node2", "node3"}, + initPods: []*v1.Pod{ + st.MakePod().Name("p1").UID("v1").Node("node1").Priority(lowPriority).Obj(), + st.MakePod().Name("p2").UID("v2").Node("node2").Priority(lowPriority).PodGroupName("pg1").Obj(), + st.MakePod().Name("p3").UID("v3").Node("node3").Priority(lowPriority).PodGroupName("pg2").Obj(), + }, + initPodGroups: []*schedulingapi.PodGroup{ + st.MakePodGroup().Name("pg1").UID("pg1").DisruptionMode(schedulingapi.DisruptionModePod).Priority(lowPriority).Obj(), + st.MakePodGroup().Name("pg2").UID("pg2").DisruptionMode(schedulingapi.DisruptionModePod).Priority(lowPriority).Obj(), + }, + preemptor: newPodGroupPreemptor( + st.MakePodGroup().Name("preemptor-pg").Priority(highPriority).Obj(), + []*v1.Pod{ + st.MakePod().Name("p-1").UID("p-1").Priority(highPriority).PreemptionPolicy(v1.PreemptNever).Obj(), + st.MakePod().Name("p-2").UID("p-2").Priority(highPriority).Obj(), + }, + ), + blockingRules: []blockingRule{ + {nodeName: "node1", capacity: 1, blockingVictims: sets.New("p1")}, + {nodeName: "node2", capacity: 1, blockingVictims: sets.New("p2")}, + {nodeName: "node3", capacity: 1, blockingVictims: sets.New("p3")}, + }, + expectedPods: []string{}, + expectedStatus: fwk.NewStatus(fwk.Unschedulable), + }, + { + name: "PDB: Prefer lower priority pod for preemption, when preemption without pdb violation is not possible", + nodeNames: []string{"node1"}, + initPods: []*v1.Pod{ + st.MakePod().Name("p1").UID("v1").Node("node1").Label("app", "foo").Priority(lowPriority).Obj(), + st.MakePod().Name("p2").UID("v2").Node("node1").Label("app", "foo").Priority(midPriority).Obj(), + }, + pdbs: []*policy.PodDisruptionBudget{ + { + Spec: policy.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "foo"}}}, + Status: policy.PodDisruptionBudgetStatus{DisruptionsAllowed: 0}, + }, + }, + preemptor: newPodGroupPreemptor( + st.MakePodGroup().Name("preemptor-pg").Priority(highPriority).Obj(), + []*v1.Pod{st.MakePod().Name("p").UID("p").Priority(highPriority).Obj()}, + ), + blockingRules: []blockingRule{ + {nodeName: "node1", blockingVictims: sets.New("p1"), capacity: 1}, + {nodeName: "node1", blockingVictims: sets.New("p2"), capacity: 1}, + }, + expectedPods: []string{"p1"}, + expectedStatus: fwk.NewStatus(fwk.Success), + }, + { + name: "PodGroup: Preempt group as a whole", + nodeNames: []string{"node1", "node2"}, + initPods: []*v1.Pod{ + st.MakePod().Name("p1").UID("v1").Node("node1").Priority(lowPriority).PodGroupName("pg1").Obj(), + st.MakePod().Name("p2").UID("v2").Node("node2").Priority(lowPriority).PodGroupName("pg1").Obj(), + }, + initPodGroups: []*schedulingapi.PodGroup{ + st.MakePodGroup().Name("pg1").UID("pg1").DisruptionMode(schedulingapi.DisruptionModePodGroup).Priority(lowPriority).Obj(), + }, + preemptor: newPodGroupPreemptor( + st.MakePodGroup().Name("preemptor-pg").Priority(highPriority).Obj(), + []*v1.Pod{st.MakePod().Name("p").UID("p").Priority(highPriority).Obj()}, + ), + blockingRules: []blockingRule{ + {nodeName: "node1", capacity: 1, blockingVictims: sets.New("p1")}, + }, + expectedPods: []string{"p1", "p2"}, + expectedStatus: fwk.NewStatus(fwk.Success), + }, + { + name: "PodGroup: Prefer single pod over podGroup for preemption candidate", + nodeNames: []string{"node1"}, + initPods: []*v1.Pod{ + st.MakePod().Name("p1").UID("p1").Node("node1").Priority(lowPriority).Obj(), + st.MakePod().Name("g1-1").UID("g1").Node("node1").PodGroupName("pg1").Priority(lowPriority).Obj(), + st.MakePod().Name("g1-2").UID("g2").Node("node1").PodGroupName("pg1").Priority(lowPriority).Obj(), + }, + initPodGroups: []*schedulingapi.PodGroup{ + st.MakePodGroup().Name("pg1").UID("pg1").DisruptionMode(schedulingapi.DisruptionModePodGroup).Priority(lowPriority).Obj(), + }, + preemptor: newPodGroupPreemptor( + st.MakePodGroup().Name("preemptor-pg").Priority(highPriority).Obj(), + []*v1.Pod{st.MakePod().Name("p").UID("p").Priority(highPriority).Obj()}, + ), + blockingRules: []blockingRule{ + {nodeName: "node1", blockingVictims: sets.New("g1-1", "g1-2"), capacity: 1}, + {nodeName: "node1", blockingVictims: sets.New("p1"), capacity: 1}, + }, + expectedPods: []string{"p1"}, + expectedStatus: fwk.NewStatus(fwk.Success), + }, + { + name: "PodGroup: Preempt group as a whole on single node", + nodeNames: []string{"node1"}, + initPods: []*v1.Pod{ + st.MakePod().Name("g1-1").UID("g1").Node("node1").PodGroupName("pg1").Priority(lowPriority).Obj(), + st.MakePod().Name("g1-2").UID("g2").Node("node1").PodGroupName("pg1").Priority(lowPriority).Obj(), + st.MakePod().Name("p1").UID("p1").Node("node1").Priority(midPriority).Obj(), + }, + initPodGroups: []*schedulingapi.PodGroup{ + st.MakePodGroup().Name("pg1").UID("pg1").DisruptionMode(schedulingapi.DisruptionModePodGroup).Priority(lowPriority).Obj(), + }, + preemptor: newPodGroupPreemptor( + st.MakePodGroup().Name("preemptor-pg").Priority(highPriority).Obj(), + []*v1.Pod{st.MakePod().Name("p").UID("p").Priority(highPriority).Obj()}, + ), + blockingRules: []blockingRule{ + {nodeName: "node1", capacity: 1, blockingVictims: sets.New("g1-1")}, // Only g1-1 is blocking + }, + expectedPods: []string{"g1-1", "g1-2"}, // Both must be preempted + expectedStatus: fwk.NewStatus(fwk.Success), + }, + { + name: "PDB: Unit violation if any member violates", + nodeNames: []string{"node1"}, + initPods: []*v1.Pod{ + st.MakePod().Name("g1-1").UID("g1").Node("node1").Label("app", "foo").PodGroupName("pg1").Priority(lowPriority).Obj(), + st.MakePod().Name("g1-2").UID("g2").Node("node1").PodGroupName("pg1").Priority(lowPriority).Obj(), + st.MakePod().Name("p1").UID("p1").Node("node1").Priority(lowPriority).Obj(), + }, + initPodGroups: []*schedulingapi.PodGroup{ + st.MakePodGroup().Name("pg1").UID("pg1").DisruptionMode(schedulingapi.DisruptionModePodGroup).Priority(lowPriority).Obj(), + }, + pdbs: []*policy.PodDisruptionBudget{ + { + Spec: policy.PodDisruptionBudgetSpec{Selector: &metav1.LabelSelector{MatchLabels: map[string]string{"app": "foo"}}}, + Status: policy.PodDisruptionBudgetStatus{DisruptionsAllowed: 0}, + }, + }, + preemptor: newPodGroupPreemptor( + st.MakePodGroup().Name("preemptor-pg").Priority(highPriority).Obj(), + []*v1.Pod{st.MakePod().Name("p").UID("p").Priority(highPriority).Obj()}, + ), + blockingRules: []blockingRule{ + {nodeName: "node1", capacity: 1, blockingVictims: sets.New("g1-1")}, // Only g1-1 is blocking + {nodeName: "node1", capacity: 1, blockingVictims: sets.New("p1")}, // p1 is also blocking + }, + expectedPods: []string{"p1"}, // p1 is preferred because pg1 unit-violates PDB (via g1-1) + expectedStatus: fwk.NewStatus(fwk.Success), + }, + { + name: "PodGroup: Prefer preempting single pod over group of same priority", + nodeNames: []string{"node1"}, + initPods: []*v1.Pod{ + st.MakePod().Name("p1").UID("p1").Node("node1").Priority(lowPriority).Obj(), + st.MakePod().Name("g1-1").UID("g1").Node("node1").PodGroupName("pg1").Priority(lowPriority).Obj(), + st.MakePod().Name("g1-2").UID("g2").Node("node1").PodGroupName("pg1").Priority(lowPriority).Obj(), + }, + initPodGroups: []*schedulingapi.PodGroup{ + st.MakePodGroup().Name("pg1").UID("pg1").DisruptionMode(schedulingapi.DisruptionModePodGroup).Priority(lowPriority).Obj(), + }, + preemptor: newPodGroupPreemptor( + st.MakePodGroup().Name("preemptor-pg").Priority(highPriority).Obj(), + []*v1.Pod{st.MakePod().Name("p").UID("p").Priority(highPriority).Obj()}, + ), + blockingRules: []blockingRule{ + {nodeName: "node1", capacity: 1, blockingVictims: sets.New("g1-1", "g1-2")}, + {nodeName: "node1", capacity: 1, blockingVictims: sets.New("p1")}, + }, + expectedPods: []string{"p1"}, // p1 is preempted because the PodGroup is "more important" at the same priority level + expectedStatus: fwk.NewStatus(fwk.Success), + }, + { + name: "Failure: Cannot preempt the victim with higher priority", + nodeNames: []string{"node1"}, + initPods: []*v1.Pod{ + st.MakePod().Name("p1").UID("v1").Node("node1").Priority(highPriority).Obj(), + }, + preemptor: newPodGroupPreemptor( + st.MakePodGroup().Name("preemptor-pg").Priority(midPriority).Obj(), + []*v1.Pod{st.MakePod().Name("p").UID("p").Priority(midPriority).Obj()}, + ), + blockingRules: []blockingRule{ + {nodeName: "node1", blockingVictims: sets.New("p1")}, + }, + expectedPods: []string{}, + expectedStatus: fwk.NewStatus(fwk.UnschedulableAndUnresolvable), + }, + { + name: "Failure: Cannot preempt if node is empty", + nodeNames: []string{"node1"}, + initPods: []*v1.Pod{}, + preemptor: newPodGroupPreemptor( + st.MakePodGroup().Name("preemptor-pg").Priority(midPriority).Obj(), + []*v1.Pod{st.MakePod().Name("p").UID("p").Priority(midPriority).Obj()}, + ), + blockingRules: []blockingRule{}, + expectedPods: []string{}, + expectedStatus: fwk.NewStatus(fwk.UnschedulableAndUnresolvable), + }, + { + name: "Priority divergence: candidate victim PodGroup has lower priority than the Pods from that group", + nodeNames: []string{"node1", "node2"}, + initPods: []*v1.Pod{ + st.MakePod().Name("p1").UID("v1").Node("node1").Priority(highPriority).PodGroupName("pg1").Obj(), + st.MakePod().Name("p2").UID("v2").Node("node2").Priority(highPriority).PodGroupName("pg1").Obj(), + }, + initPodGroups: []*schedulingapi.PodGroup{ + st.MakePodGroup().Name("pg1").UID("pg1").DisruptionMode(schedulingapi.DisruptionModePodGroup).Priority(midPriority).Obj(), + }, + preemptor: newPodGroupPreemptor( + st.MakePodGroup().Name("preemptor-pg").Priority(highPriority).Obj(), + []*v1.Pod{st.MakePod().Name("p").UID("p").Priority(highPriority).Obj()}, + ), + blockingRules: []blockingRule{ + {nodeName: "node1", capacity: 1, blockingVictims: sets.New("p1")}, + }, + expectedPods: []string{"p1", "p2"}, + expectedStatus: fwk.NewStatus(fwk.Success), + }, + { + name: "Priority divergence: candidate victim PodGroup has higher priority than the Pods from that group", + nodeNames: []string{"node1", "node2"}, + initPods: []*v1.Pod{ + st.MakePod().Name("p1").UID("v1").Node("node1").Priority(lowPriority).PodGroupName("pg1").Obj(), + st.MakePod().Name("p2").UID("v2").Node("node2").Priority(lowPriority).PodGroupName("pg1").Obj(), + }, + initPodGroups: []*schedulingapi.PodGroup{ + st.MakePodGroup().Name("pg1").UID("pg1").DisruptionMode(schedulingapi.DisruptionModePodGroup).Priority(midPriority).Obj(), + }, + preemptor: newPodGroupPreemptor( + st.MakePodGroup().Name("preemptor-pg").Priority(midPriority).Obj(), + []*v1.Pod{st.MakePod().Name("p").UID("p").Priority(midPriority).Obj()}, + ), + blockingRules: []blockingRule{ + {nodeName: "node1", capacity: 1, blockingVictims: sets.New("p1")}, + }, + expectedPods: []string{}, + expectedStatus: fwk.NewStatus(fwk.UnschedulableAndUnresolvable), + }, + { + name: "Priority divergence: preemptor PodGroup has higher priority than the Pod from preemptor PodGroup", + nodeNames: []string{"node1", "node2"}, + initPods: []*v1.Pod{ + st.MakePod().Name("p1").UID("v1").Node("node1").Priority(midPriority).PodGroupName("pg1").Obj(), + st.MakePod().Name("p2").UID("v2").Node("node2").Priority(midPriority).PodGroupName("pg1").Obj(), + }, + initPodGroups: []*schedulingapi.PodGroup{ + st.MakePodGroup().Name("pg1").UID("pg1").DisruptionMode(schedulingapi.DisruptionModePodGroup).Priority(midPriority).Obj(), + }, + preemptor: newPodGroupPreemptor( + st.MakePodGroup().Name("preemptor-pg").Priority(highPriority).Obj(), + []*v1.Pod{st.MakePod().Name("p").UID("p").Priority(lowPriority).Obj()}, + ), + blockingRules: []blockingRule{ + {nodeName: "node1", capacity: 1, blockingVictims: sets.New("p1")}, + }, + expectedPods: []string{"p1", "p2"}, + expectedStatus: fwk.NewStatus(fwk.Success), + }, + { + name: "Priority divergence: preemptor PodGroup has lower priority than the Pod from preemptor PodGroup", + nodeNames: []string{"node1", "node2"}, + initPods: []*v1.Pod{ + st.MakePod().Name("p1").UID("v1").Node("node1").Priority(midPriority).PodGroupName("pg1").Obj(), + st.MakePod().Name("p2").UID("v2").Node("node2").Priority(midPriority).PodGroupName("pg1").Obj(), + }, + initPodGroups: []*schedulingapi.PodGroup{ + st.MakePodGroup().Name("pg1").UID("pg1").DisruptionMode(schedulingapi.DisruptionModePodGroup).Priority(midPriority).Obj(), + }, + preemptor: newPodGroupPreemptor( + st.MakePodGroup().Name("preemptor-pg").Priority(lowPriority).Obj(), + []*v1.Pod{st.MakePod().Name("p").UID("p").Priority(highPriority).Obj()}, + ), + blockingRules: []blockingRule{ + {nodeName: "node1", capacity: 1, blockingVictims: sets.New("p1")}, + }, + expectedPods: []string{}, + expectedStatus: fwk.NewStatus(fwk.UnschedulableAndUnresolvable), + }, } for _, tt := range tests { @@ -213,7 +605,12 @@ func TestPodGroupEvaluator_SelectVictimsOnDomain(t *testing.T) { domainNodes = append(domainNodes, nodeInfos[name]) } - domain := newDomainForWorkloadPreemption(domainNodes, "test-domain") + podGroups := make(map[string]*schedulingapi.PodGroup) + for _, pg := range tt.initPodGroups { + podGroups[pg.Name] = pg + } + pgLister := &mockPodGroupLister{podGroups: podGroups} + domain := newDomainForWorkloadPreemption(domainNodes, pgLister, "test-domain") // Create a mock podGroupSchedulingFunc. // This simulates whether the preempting PodGroup can schedule given the current @@ -268,10 +665,12 @@ func TestPodGroupEvaluator_SelectVictimsOnDomain(t *testing.T) { if wantCode != fwk.Success { return } + if victims == nil { + t.Fatalf("expected non-nil victims on success") + } - gotPods := victims.Pods gotNames := sets.Set[string]{} - for _, p := range gotPods { + for _, p := range victims.Pods { gotNames.Insert(p.Name) } wantNames := sets.New(tt.expectedPods...) diff --git a/pkg/scheduler/framework/preemption/types.go b/pkg/scheduler/framework/preemption/types.go index ba8900d8ce0..f8034e03c19 100644 --- a/pkg/scheduler/framework/preemption/types.go +++ b/pkg/scheduler/framework/preemption/types.go @@ -17,11 +17,15 @@ limitations under the License. package preemption import ( + "maps" + "slices" "sync/atomic" v1 "k8s.io/api/core/v1" schedulingapi "k8s.io/api/scheduling/v1alpha2" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + schedulinglisters "k8s.io/client-go/listers/scheduling/v1alpha2" corev1helpers "k8s.io/component-helpers/scheduling/corev1" extenderv1 "k8s.io/kube-scheduler/extender/v1" fwk "k8s.io/kube-scheduler/framework" @@ -37,21 +41,14 @@ type podGroupPreemptor struct { } func newPodGroupPreemptor(pg *schedulingapi.PodGroup, pods []*v1.Pod) *podGroupPreemptor { - prio := int32(0) preemptionPolicy := v1.PreemptLowerPriority - // TODO(Argh4k): Replace it with pg.Spec.Priority once it's implemented: - // https://github.com/kubernetes/kubernetes/pull/136589 for _, pod := range pods { - if p := corev1helpers.PodPriority(pod); p > prio { - prio = p - } if p := pod.Spec.PreemptionPolicy; p != nil && *p == v1.PreemptNever { preemptionPolicy = *p } } - return &podGroupPreemptor{ - priority: prio, + priority: util.PodGroupPriority(pg), pods: pods, podGroup: pg, preemptionPolicy: preemptionPolicy, @@ -86,19 +83,53 @@ type domain struct { allPossibleVictims []*victim } -func newDomainForWorkloadPreemption(nodes []fwk.NodeInfo, name string) *domain { - // TODO(Argh4k): PodGroups with a DisruptionMode == DisruptionModePodGroup - // should be treated as a single Victim. - // https://github.com/kubernetes/kubernetes/pull/136589 - // TODO(Argh4k): For pod groups victims use pg.Spec.Priority once it's implemented: - // https://github.com/kubernetes/kubernetes/pull/136589 - allPossibleVictims := make([]*victim, 0, len(nodes)) +// isPodGroupPreemptiblePod checks if a pod is a part of a pod group that should +// be treated as a single unit for preemption purposes. +// If the pod is a part of such a pod group, it returns the pod group and true. +// In all other cases, it returns nil and false. +func isPodGroupPreemptiblePod(p *v1.Pod, pgLister schedulinglisters.PodGroupLister) (*schedulingapi.PodGroup, bool) { + if p.Spec.SchedulingGroup == nil { + return nil, false + } + pgName := p.Spec.SchedulingGroup.PodGroupName + pg, err := pgLister.PodGroups(p.Namespace).Get(*pgName) + if err != nil { + return nil, false + } + if mode := pg.Spec.DisruptionMode; mode == nil || *mode != schedulingapi.DisruptionModePodGroup { + return nil, false + } + return pg, true +} + +// newDomainForWorkloadPreemption creates a new domain for workload preemption. +// The domain is the whole cluster and it contains victims that are computed based +// on the pods and their scheduling groups. +// Pods that are part of a pod group with the PodGroup disruption mode are grouped +// together into a single victim. Otherwise, they are treated as individual victims. +func newDomainForWorkloadPreemption(nodes []fwk.NodeInfo, pgLister schedulinglisters.PodGroupLister, name string) *domain { + victimMap := map[types.UID]*victim{} for _, node := range nodes { for _, p := range node.GetPods() { - allPossibleVictims = append(allPossibleVictims, newVictim([]fwk.PodInfo{p}, corev1helpers.PodPriority(p.GetPod()), []fwk.NodeInfo{node})) + // TODO: Calling the lister here is not ideal given we do this + // for every pod in the cluster. Instead, we should be getting + // this information from the snapshot. + pg, ok := isPodGroupPreemptiblePod(p.GetPod(), pgLister) + if !ok { + victimMap[p.GetPod().UID] = newVictim([]fwk.PodInfo{p}, corev1helpers.PodPriority(p.GetPod()), []fwk.NodeInfo{node}) + continue + } + victim, ok := victimMap[pg.UID] + if ok { + victim.pods = append(victim.pods, p) + victim.affectedNodes[node.Node().Name] = node + continue + } + victimMap[pg.UID] = newVictim([]fwk.PodInfo{p}, util.PodGroupPriority(pg), []fwk.NodeInfo{node}) } } + allPossibleVictims := slices.Collect(maps.Values(victimMap)) return &domain{ nodes: nodes, allPossibleVictims: allPossibleVictims, diff --git a/pkg/scheduler/framework/preemption/types_test.go b/pkg/scheduler/framework/preemption/types_test.go new file mode 100644 index 00000000000..c53349f4962 --- /dev/null +++ b/pkg/scheduler/framework/preemption/types_test.go @@ -0,0 +1,263 @@ +/* +Copyright The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package preemption + +import ( + "sort" + "testing" + + "github.com/google/go-cmp/cmp" + v1 "k8s.io/api/core/v1" + schedulingapi "k8s.io/api/scheduling/v1alpha2" + "k8s.io/apimachinery/pkg/util/sets" + fwk "k8s.io/kube-scheduler/framework" + "k8s.io/kubernetes/pkg/scheduler/framework" + st "k8s.io/kubernetes/pkg/scheduler/testing" +) + +func TestIsPodGroupPreemptiblePod(t *testing.T) { + tests := []struct { + name string + pod *v1.Pod + podGroups map[string]*schedulingapi.PodGroup + wantPodGroup *schedulingapi.PodGroup + wantOk bool + }{ + { + name: "pod without scheduling group", + pod: st.MakePod().Name("p1").Namespace("default").Obj(), + wantPodGroup: nil, + wantOk: false, + }, + { + name: "pod group not found", + pod: st.MakePod().Name("p1").Namespace("default").PodGroupName("pg1").Obj(), + podGroups: map[string]*schedulingapi.PodGroup{}, + wantPodGroup: nil, + wantOk: false, + }, + { + name: "pod group with nil disruption mode", + pod: st.MakePod().Name("p1").Namespace("default").PodGroupName("pg1").Obj(), + podGroups: map[string]*schedulingapi.PodGroup{ + "pg1": st.MakePodGroup().Name("pg1").Namespace("default").Obj(), + }, + wantPodGroup: nil, + wantOk: false, + }, + { + name: "pod group with DisruptionModePod", + pod: st.MakePod().Name("p1").Namespace("default").PodGroupName("pg1").Obj(), + podGroups: map[string]*schedulingapi.PodGroup{ + "pg1": st.MakePodGroup().Name("pg1").Namespace("default").DisruptionMode(schedulingapi.DisruptionModePod).Obj(), + }, + wantPodGroup: nil, + wantOk: false, + }, + { + name: "pod group with DisruptionModePodGroup", + pod: st.MakePod().Name("p1").Namespace("default").PodGroupName("pg1").Obj(), + podGroups: map[string]*schedulingapi.PodGroup{ + "pg1": st.MakePodGroup().Name("pg1").Namespace("default").DisruptionMode(schedulingapi.DisruptionModePodGroup).Obj(), + }, + wantPodGroup: st.MakePodGroup().Name("pg1").Namespace("default").DisruptionMode(schedulingapi.DisruptionModePodGroup).Obj(), + wantOk: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + pgLister := &mockPodGroupLister{podGroups: tt.podGroups} + podGroup, ok := isPodGroupPreemptiblePod(tt.pod, pgLister) + if ok != tt.wantOk { + t.Errorf("isPodGroupPreemptiblePod() gotOk = %v, want %v", ok, tt.wantOk) + } + if diff := cmp.Diff(tt.wantPodGroup, podGroup); diff != "" { + t.Errorf("isPodGroupPreemptiblePod() gotPodGroup mismatch (-want +got):\n%s", diff) + } + }) + } +} + +type expectedVictim struct { + pods sets.Set[string] + affectedNodes sets.Set[string] + priority int32 +} + +func TestNewDomainForWorkloadPreemption(t *testing.T) { + tests := []struct { + name string + nodes []*v1.Node + pods []*v1.Pod + podGroups map[string]*schedulingapi.PodGroup + domainName string + wantVictims []expectedVictim + }{ + { + name: "no pods", + nodes: []*v1.Node{ + st.MakeNode().Name("node1").Obj(), + }, + pods: nil, + podGroups: nil, + domainName: "test-domain", + wantVictims: nil, + }, + { + name: "pods without pod groups", + nodes: []*v1.Node{ + st.MakeNode().Name("node1").Obj(), + st.MakeNode().Name("node2").Obj(), + }, + pods: []*v1.Pod{ + st.MakePod().Name("p1").UID("p1").Node("node1").Priority(10).Obj(), + st.MakePod().Name("p2").UID("p2").Node("node2").Priority(20).Obj(), + }, + podGroups: nil, + domainName: "test-domain", + wantVictims: []expectedVictim{ + {pods: sets.New("p1"), affectedNodes: sets.New("node1"), priority: 10}, + {pods: sets.New("p2"), affectedNodes: sets.New("node2"), priority: 20}, + }, + }, + { + name: "pods with pod group (DisruptionModePodGroup)", + nodes: []*v1.Node{ + st.MakeNode().Name("node1").Obj(), + st.MakeNode().Name("node2").Obj(), + }, + pods: []*v1.Pod{ + st.MakePod().Name("p1").UID("p1").Node("node1").PodGroupName("pg1").Priority(10).Obj(), + st.MakePod().Name("p2").UID("p2").Node("node2").PodGroupName("pg1").Priority(10).Obj(), + }, + podGroups: map[string]*schedulingapi.PodGroup{ + "pg1": st.MakePodGroup().Name("pg1").UID("pg1").DisruptionMode(schedulingapi.DisruptionModePodGroup).Priority(50).Obj(), + }, + domainName: "test-domain", + wantVictims: []expectedVictim{ + {pods: sets.New("p1", "p2"), affectedNodes: sets.New("node1", "node2"), priority: 50}, + }, + }, + { + name: "pods with pod group (DisruptionModePod)", + nodes: []*v1.Node{ + st.MakeNode().Name("node1").Obj(), + st.MakeNode().Name("node2").Obj(), + }, + pods: []*v1.Pod{ + st.MakePod().Name("p1").UID("p1").Node("node1").PodGroupName("pg1").Priority(10).Obj(), + st.MakePod().Name("p2").UID("p2").Node("node2").PodGroupName("pg1").Priority(20).Obj(), + }, + podGroups: map[string]*schedulingapi.PodGroup{ + "pg1": st.MakePodGroup().Name("pg1").UID("pg1").DisruptionMode(schedulingapi.DisruptionModePod).Priority(50).Obj(), + }, + domainName: "test-domain", + wantVictims: []expectedVictim{ + {pods: sets.New("p1"), affectedNodes: sets.New("node1"), priority: 10}, + {pods: sets.New("p2"), affectedNodes: sets.New("node2"), priority: 20}, + }, + }, + { + name: "mix of pod groups and individual pods", + nodes: []*v1.Node{ + st.MakeNode().Name("node1").Obj(), + st.MakeNode().Name("node2").Obj(), + }, + pods: []*v1.Pod{ + st.MakePod().Name("p1").UID("p1").Node("node1").PodGroupName("pg1").Priority(10).Obj(), + st.MakePod().Name("p2").UID("p2").Node("node2").PodGroupName("pg1").Priority(10).Obj(), + st.MakePod().Name("p3").UID("p3").Node("node1").PodGroupName("pg2").Priority(20).Obj(), + st.MakePod().Name("p4").UID("p4").Node("node2").Priority(30).Obj(), + }, + podGroups: map[string]*schedulingapi.PodGroup{ + "pg1": st.MakePodGroup().Name("pg1").UID("pg1").DisruptionMode(schedulingapi.DisruptionModePodGroup).Priority(50).Obj(), + "pg2": st.MakePodGroup().Name("pg2").UID("pg2").DisruptionMode(schedulingapi.DisruptionModePod).Priority(60).Obj(), + }, + domainName: "test-domain", + wantVictims: []expectedVictim{ + {pods: sets.New("p1", "p2"), affectedNodes: sets.New("node1", "node2"), priority: 50}, + {pods: sets.New("p3"), affectedNodes: sets.New("node1"), priority: 20}, + {pods: sets.New("p4"), affectedNodes: sets.New("node2"), priority: 30}, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + nodeInfos := make(map[string]fwk.NodeInfo) + for _, node := range tt.nodes { + ni := framework.NewNodeInfo() + ni.SetNode(node) + nodeInfos[node.Name] = ni + } + for _, p := range tt.pods { + if ni, ok := nodeInfos[p.Spec.NodeName]; ok { + pi, _ := framework.NewPodInfo(p) + ni.AddPodInfo(pi) + } + } + + var domainNodes []fwk.NodeInfo + for _, node := range tt.nodes { + if ni, ok := nodeInfos[node.Name]; ok { + domainNodes = append(domainNodes, ni) + } + } + + pgLister := &mockPodGroupLister{podGroups: tt.podGroups} + domain := newDomainForWorkloadPreemption(domainNodes, pgLister, tt.domainName) + + if domain.GetName() != tt.domainName { + t.Errorf("expected domain name %q, got %q", tt.domainName, domain.GetName()) + } + + victims := domain.GetAllPossibleVictims() + + var gotVictims []expectedVictim + for _, v := range victims { + ev := expectedVictim{ + pods: sets.New[string](), + affectedNodes: sets.New[string](), + priority: v.Priority(), + } + for _, p := range v.Pods() { + ev.pods.Insert(p.GetPod().Name) + } + for n := range v.AffectedNodes() { + ev.affectedNodes.Insert(n) + } + gotVictims = append(gotVictims, ev) + } + + sortVictims := func(vs []expectedVictim) { + sort.Slice(vs, func(i, j int) bool { + if vs[i].pods.Len() == 0 || vs[j].pods.Len() == 0 { + return vs[i].pods.Len() < vs[j].pods.Len() + } + return sets.List(vs[i].pods)[0] < sets.List(vs[j].pods)[0] + }) + } + sortVictims(gotVictims) + sortVictims(tt.wantVictims) + + if diff := cmp.Diff(tt.wantVictims, gotVictims, cmp.AllowUnexported(expectedVictim{})); diff != "" { + t.Errorf("victims mismatch (-want +got):\n%s", diff) + } + }) + } +} diff --git a/pkg/scheduler/testing/wrappers.go b/pkg/scheduler/testing/wrappers.go index cae98691acc..f8dcaee4e26 100644 --- a/pkg/scheduler/testing/wrappers.go +++ b/pkg/scheduler/testing/wrappers.go @@ -1654,6 +1654,18 @@ func (wrapper *PodGroupWrapper) ResourceClaimStatuses(statuses ...schedulingapi. return wrapper } +// DisruptionMode sets the disruption mode of the inner PodGroup. +func (wrapper *PodGroupWrapper) DisruptionMode(mode schedulingapi.DisruptionMode) *PodGroupWrapper { + wrapper.PodGroup.Spec.DisruptionMode = &mode + return wrapper +} + +// Priority sets the priority of the inner PodGroup. +func (wrapper *PodGroupWrapper) Priority(priority int32) *PodGroupWrapper { + wrapper.PodGroup.Spec.Priority = &priority + return wrapper +} + // WorkloadWrapper wraps a Workload inside. type WorkloadWrapper struct{ schedulingapi.Workload } diff --git a/pkg/scheduler/util/utils.go b/pkg/scheduler/util/utils.go index b7320f59fdc..8605c60359d 100644 --- a/pkg/scheduler/util/utils.go +++ b/pkg/scheduler/util/utils.go @@ -257,3 +257,15 @@ func GetHostPorts(pod *v1.Pod) []v1.ContainerPort { } return ports } + +// PodGroupPriority returns priority of a given pod group. +func PodGroupPriority(pg *schedulingv1alpha2.PodGroup) int32 { + if pg.Spec.Priority != nil { + return *pg.Spec.Priority + } + // When priority of a pod group is nil, it means it was created at a time + // that there was no global default priority class and the priority class + // name of the pod group was empty (or when the WorkloadAwarePreemption + // feature gate was disabled). So, we resolve to the static default priority. + return 0 +} diff --git a/test/integration/scheduler/podgroup/podgroup_test.go b/test/integration/scheduler/podgroup/podgroup_test.go index ad4b3a582cc..a595f0056ed 100644 --- a/test/integration/scheduler/podgroup/podgroup_test.go +++ b/test/integration/scheduler/podgroup/podgroup_test.go @@ -68,9 +68,11 @@ func TestPodGroupScheduling(t *testing.T) { PodGroupTemplate(st.MakePodGroupTemplate().Name("t").MinCount(3).Obj()). Obj() - gangPodGroup := st.MakePodGroup().Name("pg1").TemplateRef("t1", "workload").MinCount(3).Obj() + gangPodGroup := st.MakePodGroup().Name("pg1").TemplateRef("t1", "workload"). + Priority(100).MinCount(3).Obj() - otherGangPodGroup := st.MakePodGroup().Name("pg2").TemplateRef("t", "other-workload").MinCount(3).Obj() + otherGangPodGroup := st.MakePodGroup().Name("pg2").TemplateRef("t", "other-workload"). + Priority(100).MinCount(3).Obj() basicPodGroup := st.MakePodGroup().Name("pg1").TemplateRef("t2", "workload").BasicPolicy().Obj() @@ -108,8 +110,10 @@ func TestPodGroupScheduling(t *testing.T) { midP2 := st.MakePod().Name("mid-p2").Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Container("image"). PodGroupName("mid-pg").Priority(50).Obj() - midPodGroup := st.MakePodGroup().Name("mid-pg").TemplateRef("t-mid", "workload").MinCount(2).Obj() - midPodGroupWithConstraint := st.MakePodGroup().Name("mid-pg").TemplateRef("t-mid", "workload").MinCount(2).TopologyKey("topology.kubernetes.io/zone").Obj() + midPodGroup := st.MakePodGroup().Name("mid-pg").TemplateRef("t-mid", "workload"). + Priority(50).MinCount(2).Obj() + midPodGroupWithConstraint := st.MakePodGroup().Name("mid-pg").TemplateRef("t-mid", "workload"). + Priority(50).MinCount(2).TopologyKey("topology.kubernetes.io/zone").Obj() otherP1 := st.MakePod().Name("other-p1").Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Container("image"). PodGroupName("pg2").Priority(100).Obj() @@ -792,8 +796,10 @@ func TestWorkloadAwarePreemptionInvocation(t *testing.T) { }) node := st.MakeNode().Name("node").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).Obj() + workload := st.MakeWorkload().Name("workload").PodGroupTemplate(st.MakePodGroupTemplate().Name("t1").MinCount(3).Obj()).Obj() - pg := st.MakePodGroup().Namespace("default").Name("pg1").TemplateRef("t1", "workload").MinCount(3).Obj() + pg := st.MakePodGroup().Namespace("default").Name("pg1").TemplateRef("t1", "workload"). + DisruptionMode(schedulingapi.DisruptionModePodGroup).Priority(100).MinCount(3).Obj() // Low priority pods taking up all resources lowPods := []*v1.Pod{ @@ -901,8 +907,10 @@ func TestPostFilterInvocationCount(t *testing.T) { }) node := st.MakeNode().Name("node").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "4"}).Obj() + workload := st.MakeWorkload().Name("workload").PodGroupTemplate(st.MakePodGroupTemplate().Name("t1").MinCount(3).Obj()).Obj() - pg := st.MakePodGroup().Namespace("default").Name("pg1").TemplateRef("t1", "workload").MinCount(3).Obj() + pg := st.MakePodGroup().Namespace("default").Name("pg1").TemplateRef("t1", "workload"). + DisruptionMode(schedulingapi.DisruptionModePodGroup).Priority(100).MinCount(3).Obj() // Low priority pods taking up all resources lowPods := []*v1.Pod{ diff --git a/test/integration/scheduler/preemption/podgrouppreemption_test.go b/test/integration/scheduler/preemption/podgrouppreemption_test.go index be9ff6490a9..1857a7f7393 100644 --- a/test/integration/scheduler/preemption/podgrouppreemption_test.go +++ b/test/integration/scheduler/preemption/podgrouppreemption_test.go @@ -64,7 +64,7 @@ func TestPodGroupPreemption(t *testing.T) { st.MakeNode().Name("node1").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "3", v1.ResourceMemory: "4Gi", v1.ResourcePods: "32"}).Obj(), }, podGroups: []*schedulingapi.PodGroup{ - st.MakePodGroup().Name("pg1").Namespace("default").MinCount(3).Obj(), + st.MakePodGroup().Name("pg1").Namespace("default").Priority(100).MinCount(3).Obj(), }, initialPods: []*v1.Pod{ st.MakePod().Name("low-1").Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Container("image").ZeroTerminationGracePeriod().Priority(10).Obj(), @@ -86,7 +86,7 @@ func TestPodGroupPreemption(t *testing.T) { st.MakeNode().Name("node1").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "3", v1.ResourceMemory: "4Gi", v1.ResourcePods: "32"}).Obj(), }, podGroups: []*schedulingapi.PodGroup{ - st.MakePodGroup().Name("pg1").Namespace("default").BasicPolicy().Obj(), + st.MakePodGroup().Name("pg1").Namespace("default").Priority(100).BasicPolicy().Obj(), }, initialPods: []*v1.Pod{ st.MakePod().Name("low-1").Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Container("image").ZeroTerminationGracePeriod().Priority(10).Obj(), @@ -109,7 +109,7 @@ func TestPodGroupPreemption(t *testing.T) { st.MakeNode().Name("node2").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "2", v1.ResourceMemory: "4Gi", v1.ResourcePods: "32"}).Obj(), }, podGroups: []*schedulingapi.PodGroup{ - st.MakePodGroup().Name("pg1").Namespace("default").MinCount(3).Obj(), + st.MakePodGroup().Name("pg1").Namespace("default").Priority(100).MinCount(3).Obj(), }, initialPods: []*v1.Pod{ // low-1 takes all CPU on node1 @@ -135,7 +135,7 @@ func TestPodGroupPreemption(t *testing.T) { st.MakeNode().Name("node2").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "2", v1.ResourceMemory: "4Gi", v1.ResourcePods: "32"}).Obj(), }, podGroups: []*schedulingapi.PodGroup{ - st.MakePodGroup().Name("pg1").Namespace("default").BasicPolicy().Obj(), + st.MakePodGroup().Name("pg1").Namespace("default").Priority(100).BasicPolicy().Obj(), }, initialPods: []*v1.Pod{ // low-1 takes half CPU on node1 @@ -160,7 +160,7 @@ func TestPodGroupPreemption(t *testing.T) { st.MakeNode().Name("node1").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "3", v1.ResourceMemory: "4Gi", v1.ResourcePods: "32"}).Obj(), }, podGroups: []*schedulingapi.PodGroup{ - st.MakePodGroup().Name("pg1").Namespace("default").MinCount(2).Obj(), + st.MakePodGroup().Name("pg1").Namespace("default").Priority(100).MinCount(2).Obj(), }, initialPods: []*v1.Pod{ st.MakePod().Name("low-1").Label("app", "foo").Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Container("image").ZeroTerminationGracePeriod().Priority(10).Obj(), @@ -189,7 +189,7 @@ func TestPodGroupPreemption(t *testing.T) { st.MakeNode().Name("node2").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "2", v1.ResourceMemory: "4Gi", v1.ResourcePods: "32"}).Obj(), }, podGroups: []*schedulingapi.PodGroup{ - st.MakePodGroup().Name("pg1").Namespace("default").MinCount(4).Obj(), + st.MakePodGroup().Name("pg1").Namespace("default").Priority(100).MinCount(4).Obj(), }, initialPods: []*v1.Pod{ st.MakePod().Name("low-1").Node("node1").Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Container("image").ZeroTerminationGracePeriod().Priority(10).Obj(), @@ -213,7 +213,7 @@ func TestPodGroupPreemption(t *testing.T) { st.MakeNode().Name("node1").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "3", v1.ResourceMemory: "4Gi", v1.ResourcePods: "32"}).Obj(), }, podGroups: []*schedulingapi.PodGroup{ - st.MakePodGroup().Name("pg1").Namespace("default").MinCount(3).Obj(), + st.MakePodGroup().Name("pg1").Namespace("default").Priority(100).MinCount(3).Obj(), }, initialPods: []*v1.Pod{ st.MakePod().Name("mid-1").Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Container("image").ZeroTerminationGracePeriod().Priority(500).Obj(), @@ -236,7 +236,7 @@ func TestPodGroupPreemption(t *testing.T) { st.MakeNode().Name("node1").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "3", v1.ResourceMemory: "4Gi", v1.ResourcePods: "32"}).Obj(), }, podGroups: []*schedulingapi.PodGroup{ - st.MakePodGroup().Name("pg1").Namespace("default").MinCount(2).Obj(), + st.MakePodGroup().Name("pg1").Namespace("default").Priority(100).MinCount(2).Obj(), }, initialPods: []*v1.Pod{ st.MakePod().Name("mid-1").Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Container("image").ZeroTerminationGracePeriod().Priority(50).Obj(), @@ -251,6 +251,111 @@ func TestPodGroupPreemption(t *testing.T) { expectedPreempted: []string{"low-1", "low-2"}, expectedPodsPreemptedByWAP: 2, }, + { + name: "Preempt the whole PodGroup even if preempting a single Pod would suffice", + nodes: []*v1.Node{ + st.MakeNode().Name("node1").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "3", v1.ResourceMemory: "4Gi", v1.ResourcePods: "32"}).Obj(), + }, + podGroups: []*schedulingapi.PodGroup{ + st.MakePodGroup().Name("pg1").Namespace("default").Priority(100).MinCount(1).Obj(), + st.MakePodGroup().Name("pg2").Namespace("default").DisruptionMode(schedulingapi.DisruptionModePodGroup).Priority(10).MinCount(3).Obj(), + }, + initialPods: []*v1.Pod{ + st.MakePod().Name("low-1").Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Container("image").PodGroupName("pg2").ZeroTerminationGracePeriod().Priority(10).Obj(), + st.MakePod().Name("low-2").Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Container("image").PodGroupName("pg2").ZeroTerminationGracePeriod().Priority(10).Obj(), + st.MakePod().Name("low-3").Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Container("image").PodGroupName("pg2").ZeroTerminationGracePeriod().Priority(10).Obj(), + }, + preemptorPods: []*v1.Pod{ + st.MakePod().Name("high-1").Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Container("image").PodGroupName("pg1").ZeroTerminationGracePeriod().Priority(100).Obj(), + }, + expectedScheduled: []string{"high-1"}, + expectedPreempted: []string{"low-1", "low-2", "low-3"}, + expectedPodsPreemptedByWAP: 3, + }, + { + name: "Preempt the whole basic PodGroup with a PodGroup disruption mode", + nodes: []*v1.Node{ + st.MakeNode().Name("node1").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "3", v1.ResourceMemory: "4Gi", v1.ResourcePods: "32"}).Obj(), + }, + podGroups: []*schedulingapi.PodGroup{ + st.MakePodGroup().Name("pg1").Namespace("default").Priority(100).MinCount(1).Obj(), + st.MakePodGroup().Name("pg2").Namespace("default").DisruptionMode(schedulingapi.DisruptionModePodGroup).Priority(10).BasicPolicy().Obj(), + }, + initialPods: []*v1.Pod{ + st.MakePod().Name("low-1").Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Container("image").PodGroupName("pg2").ZeroTerminationGracePeriod().Priority(10).Obj(), + st.MakePod().Name("low-2").Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Container("image").PodGroupName("pg2").ZeroTerminationGracePeriod().Priority(10).Obj(), + st.MakePod().Name("low-3").Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Container("image").PodGroupName("pg2").ZeroTerminationGracePeriod().Priority(10).Obj(), + }, + preemptorPods: []*v1.Pod{ + st.MakePod().Name("high-1").Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Container("image").PodGroupName("pg1").ZeroTerminationGracePeriod().Priority(100).Obj(), + }, + expectedScheduled: []string{"high-1"}, + expectedPreempted: []string{"low-1", "low-2", "low-3"}, + expectedPodsPreemptedByWAP: 3, + }, + { + name: "Priority divergence in PodGroups - preemptor PodGroup has higher priority than the victim candidate PodGroup", + nodes: []*v1.Node{ + st.MakeNode().Name("node1").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "3", v1.ResourceMemory: "4Gi", v1.ResourcePods: "32"}).Obj(), + }, + podGroups: []*schedulingapi.PodGroup{ + st.MakePodGroup().Name("pg1").Namespace("default").Priority(100).MinCount(1).Obj(), + st.MakePodGroup().Name("pg2").Namespace("default").DisruptionMode(schedulingapi.DisruptionModePodGroup).Priority(10).MinCount(3).Obj(), + }, + initialPods: []*v1.Pod{ + st.MakePod().Name("high-1").Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Container("image").PodGroupName("pg2").ZeroTerminationGracePeriod().Priority(100).Obj(), + st.MakePod().Name("high-2").Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Container("image").PodGroupName("pg2").ZeroTerminationGracePeriod().Priority(100).Obj(), + st.MakePod().Name("high-3").Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Container("image").PodGroupName("pg2").ZeroTerminationGracePeriod().Priority(100).Obj(), + }, + preemptorPods: []*v1.Pod{ + st.MakePod().Name("low-1").Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Container("image").PodGroupName("pg1").ZeroTerminationGracePeriod().Priority(10).Obj(), + }, + expectedScheduled: []string{"low-1"}, + expectedPreempted: []string{"high-1", "high-2", "high-3"}, + expectedPodsPreemptedByWAP: 3, + }, + { + name: "Priority divergence in PodGroups - preemptor PodGroup has too low priority", + nodes: []*v1.Node{ + st.MakeNode().Name("node1").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "3", v1.ResourceMemory: "4Gi", v1.ResourcePods: "32"}).Obj(), + }, + podGroups: []*schedulingapi.PodGroup{ + st.MakePodGroup().Name("pg1").Namespace("default").Priority(10).MinCount(1).Obj(), + st.MakePodGroup().Name("pg2").Namespace("default").DisruptionMode(schedulingapi.DisruptionModePodGroup).Priority(100).MinCount(3).Obj(), + }, + initialPods: []*v1.Pod{ + st.MakePod().Name("low-1").Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Container("image").PodGroupName("pg2").ZeroTerminationGracePeriod().Priority(10).Obj(), + st.MakePod().Name("low-2").Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Container("image").PodGroupName("pg2").ZeroTerminationGracePeriod().Priority(10).Obj(), + st.MakePod().Name("low-3").Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Container("image").PodGroupName("pg2").ZeroTerminationGracePeriod().Priority(10).Obj(), + }, + preemptorPods: []*v1.Pod{ + st.MakePod().Name("high-1").Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Container("image").PodGroupName("pg1").ZeroTerminationGracePeriod().Priority(100).Obj(), + }, + expectedScheduled: []string{"low-1", "low-2", "low-3"}, + expectedPreempted: []string{}, + expectedUnschedulable: []string{"high-1"}, + expectedPodsPreemptedByWAP: 0, + }, + { + name: "Preemptor Pod without PodGroupName does not respect the PodGroup disruption mode", + nodes: []*v1.Node{ + st.MakeNode().Name("node1").Capacity(map[v1.ResourceName]string{v1.ResourceCPU: "3", v1.ResourceMemory: "4Gi", v1.ResourcePods: "32"}).Obj(), + }, + podGroups: []*schedulingapi.PodGroup{ + st.MakePodGroup().Name("pg1").Namespace("default").Priority(10).MinCount(1).Obj(), + }, + initialPods: []*v1.Pod{ + st.MakePod().Name("low-1").Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Container("image").PodGroupName("pg1").ZeroTerminationGracePeriod().Priority(30).Obj(), + st.MakePod().Name("low-2").Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Container("image").PodGroupName("pg1").ZeroTerminationGracePeriod().Priority(20).Obj(), + st.MakePod().Name("low-3").Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Container("image").PodGroupName("pg1").ZeroTerminationGracePeriod().Priority(10).Obj(), + }, + preemptorPods: []*v1.Pod{ + st.MakePod().Name("high-1").Req(map[v1.ResourceName]string{v1.ResourceCPU: "1"}).Container("image").ZeroTerminationGracePeriod().Priority(100).Obj(), + }, + expectedScheduled: []string{"high-1", "low-1", "low-2"}, + expectedPreempted: []string{"low-3"}, + expectedPodsPreemptedByWAP: 0, + }, } for _, tt := range tests {