mirror of
https://github.com/kubernetes/kubernetes.git
synced 2026-06-09 08:55:55 -04:00
DRA upgrade/downgrade: add DeviceTaints test
This automatically tests a few scenarios across cluster upgrade/downgrade.
This commit is contained in:
parent
d3a1be8586
commit
aa3f79d4c9
4 changed files with 164 additions and 19 deletions
|
|
@ -60,7 +60,7 @@ func (b *Builder) ExtendedResourceName(i int) string {
|
|||
case SingletonIndex:
|
||||
return e2enode.SampleDeviceResourceName
|
||||
default:
|
||||
return b.driver.Name + "/resource" + fmt.Sprintf("-%d", i)
|
||||
return b.Driver.Name + "/resource" + fmt.Sprintf("-%d", i)
|
||||
}
|
||||
}
|
||||
|
||||
|
|
@ -68,7 +68,7 @@ func (b *Builder) ExtendedResourceName(i int) string {
|
|||
// namespace.
|
||||
type Builder struct {
|
||||
namespace string
|
||||
driver *Driver
|
||||
Driver *Driver
|
||||
UseExtendedResourceName bool
|
||||
|
||||
podCounter int
|
||||
|
|
@ -79,7 +79,7 @@ type Builder struct {
|
|||
|
||||
// ClassName returns the default device class name.
|
||||
func (b *Builder) ClassName() string {
|
||||
return b.namespace + b.driver.NameSuffix + "-class"
|
||||
return b.namespace + b.Driver.NameSuffix + "-class"
|
||||
}
|
||||
|
||||
// SingletonIndex causes Builder.Class and ExtendedResourceName to create a
|
||||
|
|
@ -115,14 +115,14 @@ func (b *Builder) Class(i int) *resourceapi.DeviceClass {
|
|||
}
|
||||
class.Spec.Selectors = []resourceapi.DeviceSelector{{
|
||||
CEL: &resourceapi.CELDeviceSelector{
|
||||
Expression: fmt.Sprintf(`device.driver == "%s"`, b.driver.Name),
|
||||
Expression: fmt.Sprintf(`device.driver == "%s"`, b.Driver.Name),
|
||||
},
|
||||
}}
|
||||
if b.ClassParameters != "" {
|
||||
class.Spec.Config = []resourceapi.DeviceClassConfiguration{{
|
||||
DeviceConfiguration: resourceapi.DeviceConfiguration{
|
||||
Opaque: &resourceapi.OpaqueDeviceConfiguration{
|
||||
Driver: b.driver.Name,
|
||||
Driver: b.Driver.Name,
|
||||
Parameters: runtime.RawExtension{Raw: []byte(b.ClassParameters)},
|
||||
},
|
||||
},
|
||||
|
|
@ -135,7 +135,7 @@ func (b *Builder) Class(i int) *resourceapi.DeviceClass {
|
|||
// that test pods can reference
|
||||
func (b *Builder) ExternalClaim() *resourceapi.ResourceClaim {
|
||||
b.claimCounter++
|
||||
name := "external-claim" + b.driver.NameSuffix // This is what podExternal expects.
|
||||
name := "external-claim" + b.Driver.NameSuffix // This is what podExternal expects.
|
||||
if b.claimCounter > 1 {
|
||||
name += fmt.Sprintf("-%d", b.claimCounter)
|
||||
}
|
||||
|
|
@ -160,7 +160,7 @@ func (b *Builder) claimSpecWithV1beta1() resourcev1beta1.ResourceClaimSpec {
|
|||
Config: []resourcev1beta1.DeviceClaimConfiguration{{
|
||||
DeviceConfiguration: resourcev1beta1.DeviceConfiguration{
|
||||
Opaque: &resourcev1beta1.OpaqueDeviceConfiguration{
|
||||
Driver: b.driver.Name,
|
||||
Driver: b.Driver.Name,
|
||||
Parameters: runtime.RawExtension{
|
||||
Raw: []byte(parameters),
|
||||
},
|
||||
|
|
@ -188,7 +188,7 @@ func (b *Builder) claimSpecWithV1beta2() resourcev1beta2.ResourceClaimSpec {
|
|||
Config: []resourcev1beta2.DeviceClaimConfiguration{{
|
||||
DeviceConfiguration: resourcev1beta2.DeviceConfiguration{
|
||||
Opaque: &resourcev1beta2.OpaqueDeviceConfiguration{
|
||||
Driver: b.driver.Name,
|
||||
Driver: b.Driver.Name,
|
||||
Parameters: runtime.RawExtension{
|
||||
Raw: []byte(parameters),
|
||||
},
|
||||
|
|
@ -216,7 +216,7 @@ func (b *Builder) ClaimSpec() resourceapi.ResourceClaimSpec {
|
|||
Config: []resourceapi.DeviceClaimConfiguration{{
|
||||
DeviceConfiguration: resourceapi.DeviceConfiguration{
|
||||
Opaque: &resourceapi.OpaqueDeviceConfiguration{
|
||||
Driver: b.driver.Name,
|
||||
Driver: b.Driver.Name,
|
||||
Parameters: runtime.RawExtension{
|
||||
Raw: []byte(parameters),
|
||||
},
|
||||
|
|
@ -249,7 +249,7 @@ func (b *Builder) Pod() *v1.Pod {
|
|||
pod.Spec.RestartPolicy = v1.RestartPolicyNever
|
||||
pod.GenerateName = ""
|
||||
b.podCounter++
|
||||
pod.Name = fmt.Sprintf("tester%s-%d", b.driver.NameSuffix, b.podCounter)
|
||||
pod.Name = fmt.Sprintf("tester%s-%d", b.Driver.NameSuffix, b.podCounter)
|
||||
return pod
|
||||
}
|
||||
|
||||
|
|
@ -314,11 +314,15 @@ func (b *Builder) PodInlineMultiple() (*v1.Pod, *resourceapi.ResourceClaimTempla
|
|||
}
|
||||
|
||||
// PodExternal adds a pod that references external resource claim with default class name and parameters.
|
||||
//
|
||||
// Note that this references *the initial* result of ExternalClaim. When generating multiple such
|
||||
// external claims, pod.Spec.ResourceClaims[0].ResourceClaimName must be adapted by the caller,
|
||||
// if desired.
|
||||
func (b *Builder) PodExternal() *v1.Pod {
|
||||
pod := b.Pod()
|
||||
pod.Spec.Containers[0].Name = "with-resource"
|
||||
podClaimName := "resource-claim"
|
||||
externalClaimName := "external-claim" + b.driver.NameSuffix
|
||||
externalClaimName := "external-claim" + b.Driver.NameSuffix
|
||||
pod.Spec.ResourceClaims = []v1.PodResourceClaim{
|
||||
{
|
||||
Name: podClaimName,
|
||||
|
|
@ -417,7 +421,7 @@ func (b *Builder) DeletePodAndWaitForNotFound(tCtx ktesting.TContext, pod *v1.Po
|
|||
func (b *Builder) TestPod(tCtx ktesting.TContext, pod *v1.Pod, env ...string) {
|
||||
tCtx.Helper()
|
||||
|
||||
if !b.driver.WithKubelet {
|
||||
if !b.Driver.WithKubelet {
|
||||
// Less testing when we cannot rely on the kubelet to actually run the pod.
|
||||
err := e2epod.WaitForPodScheduled(tCtx, tCtx.Client(), pod.Namespace, pod.Name)
|
||||
tCtx.ExpectNoError(err, "schedule pod")
|
||||
|
|
@ -474,7 +478,7 @@ func TestContainerEnv(tCtx ktesting.TContext, pod *v1.Pod, containerName string,
|
|||
}
|
||||
|
||||
func NewBuilder(f *framework.Framework, driver *Driver) *Builder {
|
||||
b := &Builder{driver: driver}
|
||||
b := &Builder{Driver: driver}
|
||||
ginkgo.BeforeEach(func() {
|
||||
b.setUp(f.TContext(context.Background()))
|
||||
})
|
||||
|
|
@ -482,7 +486,7 @@ func NewBuilder(f *framework.Framework, driver *Driver) *Builder {
|
|||
}
|
||||
|
||||
func NewBuilderNow(tCtx ktesting.TContext, driver *Driver) *Builder {
|
||||
b := &Builder{driver: driver}
|
||||
b := &Builder{Driver: driver}
|
||||
b.setUp(tCtx)
|
||||
return b
|
||||
}
|
||||
|
|
@ -542,7 +546,7 @@ func (b *Builder) tearDown(tCtx ktesting.TContext) {
|
|||
}
|
||||
}
|
||||
|
||||
for host, plugin := range b.driver.Nodes {
|
||||
for host, plugin := range b.Driver.Nodes {
|
||||
tCtx.Logf("Waiting for resources on %s to be unprepared", host)
|
||||
tCtx.Eventually(func(ktesting.TContext) []app.ClaimID { return plugin.GetPreparedResources() }).WithTimeout(time.Minute).Should(gomega.BeEmpty(), "prepared claims on host %s", host)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1038,6 +1038,7 @@ func (d *Driver) TearDown(tCtx ktesting.TContext) {
|
|||
//
|
||||
// Only use this in tests where kubelet support for DRA is guaranteed.
|
||||
func (d *Driver) IsGone(tCtx ktesting.TContext) {
|
||||
tCtx.Helper()
|
||||
tCtx.Logf("Waiting for ResourceSlices of driver %s to be removed...", d.Name)
|
||||
tCtx.Eventually(d.NewGetSlices()).WithTimeout(2 * time.Minute).Should(gomega.HaveField("Items", gomega.BeEmpty()))
|
||||
}
|
||||
|
|
|
|||
137
test/e2e_dra/devicetaints_test.go
Normal file
137
test/e2e_dra/devicetaints_test.go
Normal file
|
|
@ -0,0 +1,137 @@
|
|||
/*
|
||||
Copyright The Kubernetes Authors.
|
||||
|
||||
Licensed under the Apache License, Version 2.0 (the "License");
|
||||
you may not use this file except in compliance with the License.
|
||||
You may obtain a copy of the License at
|
||||
|
||||
http://www.apache.org/licenses/LICENSE-2.0
|
||||
|
||||
Unless required by applicable law or agreed to in writing, software
|
||||
distributed under the License is distributed on an "AS IS" BASIS,
|
||||
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
See the License for the specific language governing permissions and
|
||||
limitations under the License.
|
||||
*/
|
||||
|
||||
package e2edra
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"github.com/onsi/gomega"
|
||||
resourceapi "k8s.io/api/resource/v1"
|
||||
resourcealpha "k8s.io/api/resource/v1alpha3"
|
||||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
|
||||
drautils "k8s.io/kubernetes/test/e2e/dra/utils"
|
||||
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
|
||||
"k8s.io/kubernetes/test/utils/ktesting"
|
||||
"k8s.io/utils/ptr"
|
||||
)
|
||||
|
||||
// deviceTaints checks that:
|
||||
// - A pod which gets scheduled on the previous release because of a toleration is kept running after an upgrade.
|
||||
// - A DeviceTaintRule created to evict the pod before a downgrade prevents pod scheduling after a downgrade.
|
||||
func deviceTaints(tCtx ktesting.TContext, b *drautils.Builder) upgradedTestFunc {
|
||||
namespace := tCtx.Namespace()
|
||||
taintKey := "devicetaints"
|
||||
taintValueFromSlice := "from-slice"
|
||||
taintValueFromRule := "from-rule"
|
||||
taintedDevice := "tainted-device"
|
||||
|
||||
// We need additional devices which are only used by this test.
|
||||
// We achieve that with cluster-scoped devices that start out with
|
||||
// a taint.
|
||||
slice := &resourceapi.ResourceSlice{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "devicetaints",
|
||||
},
|
||||
Spec: resourceapi.ResourceSliceSpec{
|
||||
Driver: b.Driver.Name,
|
||||
Pool: resourceapi.ResourcePool{
|
||||
Name: "devicetaints",
|
||||
ResourceSliceCount: 1,
|
||||
},
|
||||
AllNodes: ptr.To(true),
|
||||
Devices: []resourceapi.Device{{
|
||||
Name: taintedDevice,
|
||||
Attributes: map[resourceapi.QualifiedName]resourceapi.DeviceAttribute{
|
||||
"example.com/type": {
|
||||
StringValue: ptr.To("devicetaints"),
|
||||
},
|
||||
},
|
||||
Taints: []resourceapi.DeviceTaint{{
|
||||
Key: taintKey,
|
||||
Value: taintValueFromSlice,
|
||||
Effect: resourceapi.DeviceTaintEffectNoSchedule,
|
||||
}},
|
||||
}},
|
||||
},
|
||||
}
|
||||
_, err := tCtx.Client().ResourceV1().ResourceSlices().Create(tCtx, slice, metav1.CreateOptions{})
|
||||
tCtx.ExpectNoError(err)
|
||||
|
||||
tCtx.Log("The pod wants exactly the tainted device -> not schedulable.")
|
||||
claim := b.ExternalClaim()
|
||||
pod := b.PodExternal()
|
||||
pod.Spec.ResourceClaims[0].ResourceClaimName = &claim.Name
|
||||
claim.Spec.Devices.Requests[0].Exactly.Selectors = []resourceapi.DeviceSelector{{
|
||||
CEL: &resourceapi.CELDeviceSelector{
|
||||
Expression: `device.attributes["example.com"].?type.orValue("") == "devicetaints"`,
|
||||
},
|
||||
}}
|
||||
b.Create(tCtx, claim, pod)
|
||||
tCtx.ExpectNoError(e2epod.WaitForPodNameUnschedulableInNamespace(tCtx, tCtx.Client(), pod.Name, namespace))
|
||||
|
||||
tCtx.Log("Adding a toleration makes the pod schedulable.")
|
||||
claim.Spec.Devices.Requests[0].Exactly.Tolerations = []resourceapi.DeviceToleration{{
|
||||
Key: taintKey,
|
||||
Value: taintValueFromSlice,
|
||||
Effect: resourceapi.DeviceTaintEffectNoSchedule,
|
||||
}}
|
||||
tCtx.ExpectNoError(tCtx.Client().ResourceV1().ResourceClaims(namespace).Delete(tCtx, claim.Name, metav1.DeleteOptions{}))
|
||||
_, err = tCtx.Client().ResourceV1().ResourceClaims(namespace).Create(tCtx, claim, metav1.CreateOptions{})
|
||||
tCtx.ExpectNoError(err)
|
||||
b.TestPod(tCtx, pod)
|
||||
|
||||
return func(tCtx ktesting.TContext) downgradedTestFunc {
|
||||
tCtx.Log("Pod running consistently after upgrade.")
|
||||
tCtx.Consistently(func(tCtx ktesting.TContext) error {
|
||||
return e2epod.WaitForPodRunningInNamespace(tCtx, tCtx.Client(), pod)
|
||||
}).WithTimeout(30 * time.Second).WithPolling(5 * time.Second).Should(gomega.Succeed())
|
||||
|
||||
tCtx.Logf("Evict pod through DeviceTaintRule.")
|
||||
rule := &resourcealpha.DeviceTaintRule{
|
||||
ObjectMeta: metav1.ObjectMeta{
|
||||
Name: "device-taint-rule",
|
||||
},
|
||||
Spec: resourcealpha.DeviceTaintRuleSpec{
|
||||
DeviceSelector: &resourcealpha.DeviceTaintSelector{
|
||||
Driver: &b.Driver.Name,
|
||||
Pool: &slice.Spec.Pool.Name,
|
||||
Device: &taintedDevice,
|
||||
},
|
||||
Taint: resourcealpha.DeviceTaint{
|
||||
Key: taintKey,
|
||||
Value: taintValueFromRule,
|
||||
Effect: resourcealpha.DeviceTaintEffectNoExecute,
|
||||
},
|
||||
},
|
||||
}
|
||||
_, err := tCtx.Client().ResourceV1alpha3().DeviceTaintRules().Create(tCtx, rule, metav1.CreateOptions{})
|
||||
tCtx.ExpectNoError(err)
|
||||
tCtx.ExpectNoError(e2epod.WaitForPodNotFoundInNamespace(tCtx, tCtx.Client(), pod.Name, namespace, 5*time.Minute))
|
||||
|
||||
return func(tCtx ktesting.TContext) {
|
||||
tCtx.Log("DeviceTaintRule still in effect.")
|
||||
b.Create(tCtx, pod)
|
||||
tCtx.ExpectNoError(e2epod.WaitForPodNameUnschedulableInNamespace(tCtx, tCtx.Client(), pod.Name, namespace))
|
||||
|
||||
// We must clean up manually, otherwise the code which checks for ResourceSlice deletion after
|
||||
// driver removal gets stuck waiting for the removal of special ResourceSlice.
|
||||
// This cannot be scheduled via tCtx.Cleanup after creating it because then it would be removed
|
||||
// after the first sub-test.
|
||||
tCtx.ExpectNoError(tCtx.Client().ResourceV1().ResourceSlices().Delete(tCtx, slice.Name, metav1.DeleteOptions{}))
|
||||
}
|
||||
}
|
||||
}
|
||||
|
|
@ -66,7 +66,8 @@ func init() {
|
|||
// sub-test. That function then returns the next piece of code, which then
|
||||
// returns the final code. Each callback function is executed as a sub-test.
|
||||
// The builder is configured to not delete objects when that sub-test ends,
|
||||
// so objects persist until the entire test is done.
|
||||
// so objects persist until the entire test is done. The same DRA driver
|
||||
// is used for all sub-tests.
|
||||
//
|
||||
// Each sub-test must be self-contained. They intentionally run in a random
|
||||
// order. However, they share the same cluster and the 8 devices which are
|
||||
|
|
@ -74,6 +75,7 @@ func init() {
|
|||
var subTests = map[string]initialTestFunc{
|
||||
"core DRA": coreDRA,
|
||||
"ResourceClaim device status": resourceClaimDeviceStatus,
|
||||
"DeviceTaints": deviceTaints,
|
||||
}
|
||||
|
||||
type initialTestFunc func(tCtx ktesting.TContext, builder *drautils.Builder) upgradedTestFunc
|
||||
|
|
@ -210,8 +212,8 @@ func testUpgradeDowngrade(tCtx ktesting.TContext) {
|
|||
tCtx.Step(fmt.Sprintf("bring up v%d.%d", major, previousMinor), func(tCtx ktesting.TContext) {
|
||||
cluster = localupcluster.New(tCtx)
|
||||
localUpClusterEnv := map[string]string{
|
||||
"RUNTIME_CONFIG": "resource.k8s.io/v1beta1,resource.k8s.io/v1beta2",
|
||||
"FEATURE_GATES": "DynamicResourceAllocation=true",
|
||||
"RUNTIME_CONFIG": "resource.k8s.io/v1beta1,resource.k8s.io/v1beta2,resource.k8s.io/v1alpha3",
|
||||
"FEATURE_GATES": "DynamicResourceAllocation=true,DRADeviceTaintRules=true,DRADeviceTaints=true",
|
||||
// *not* needed because driver will run in "local filesystem" mode (= driver.IsLocal): "ALLOW_PRIVILEGED": "1",
|
||||
}
|
||||
cluster.Start(tCtx, binDir, localUpClusterEnv)
|
||||
|
|
@ -247,6 +249,7 @@ func testUpgradeDowngrade(tCtx ktesting.TContext) {
|
|||
})
|
||||
}
|
||||
})
|
||||
numSlices := len(driver.NewGetSlices()(tCtx).Items)
|
||||
|
||||
// We could split this up into first updating the apiserver, then control plane components, then restarting kubelet.
|
||||
// For the purpose of this test here we we primarily care about full before/after comparisons, so not done yet.
|
||||
|
|
@ -255,7 +258,7 @@ func testUpgradeDowngrade(tCtx ktesting.TContext) {
|
|||
|
||||
// The kubelet wipes all ResourceSlices on a restart because it doesn't know which drivers were running.
|
||||
// Wait for the ResourceSlice controller in the driver to notice and recreate the ResourceSlices.
|
||||
tCtx.WithStep("wait for ResourceSlices").Eventually(driver.NewGetSlices()).WithTimeout(5 * time.Minute).Should(gomega.HaveField("Items", gomega.HaveLen(len(nodes.NodeNames))))
|
||||
tCtx.WithStep("wait for ResourceSlices").Eventually(driver.NewGetSlices()).WithTimeout(5 * time.Minute).Should(gomega.HaveField("Items", gomega.HaveLen(numSlices)))
|
||||
|
||||
downgradedTestFuncs := make(map[string]downgradedTestFunc, len(subTests))
|
||||
tCtx.Run("after-cluster-upgrade", func(tCtx ktesting.TContext) {
|
||||
|
|
|
|||
Loading…
Reference in a new issue