From 03e337cfb7a31ccdca774320fbe824891d31468d Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Thu, 25 Sep 2025 17:59:17 +0200 Subject: [PATCH 1/2] ktesting: support for synctest A "sync test" runs the test inside a testing/synctest bubble. Time moves forward in a deterministic fashion when all goroutines are blocked waiting for time to progress. This simplifies testing concurrent behavior. ktesting enables writing such tests with a new SyncTest method: it can start a new sub-test (similar to Run) or turn an existing test (typically a top-level Test) into a sync test when no new name is given. TContext.IsSyncTest can be used to check the mode of the current test, which may be useful in common helper code. TContext.Wait directly maps to synctest.Wait. This new functionality is limited to tests which use an underlying testing.T instance. --- test/utils/ktesting/clientcontext.go | 6 +- test/utils/ktesting/errorcontext.go | 6 +- test/utils/ktesting/tcontext.go | 91 ++++++++++++++++++++++++---- test/utils/ktesting/withcontext.go | 6 +- 4 files changed, 93 insertions(+), 16 deletions(-) diff --git a/test/utils/ktesting/clientcontext.go b/test/utils/ktesting/clientcontext.go index 36da9793bdd..47c5a4547bf 100644 --- a/test/utils/ktesting/clientcontext.go +++ b/test/utils/ktesting/clientcontext.go @@ -87,7 +87,11 @@ func (cCtx clientContext) ExpectNoError(err error, explain ...interface{}) { } func (cCtx clientContext) Run(name string, cb func(tCtx TContext)) bool { - return run(cCtx, name, cb) + return run(cCtx, name, false, cb) +} + +func (cCtx clientContext) SyncTest(name string, cb func(tCtx TContext)) bool { + return run(cCtx, name, true, cb) } func (cCtx clientContext) Logger() klog.Logger { diff --git a/test/utils/ktesting/errorcontext.go b/test/utils/ktesting/errorcontext.go index c3adc913e6b..d6d22ba2436 100644 --- a/test/utils/ktesting/errorcontext.go +++ b/test/utils/ktesting/errorcontext.go @@ -173,7 +173,11 @@ func (eCtx *errorContext) ExpectNoError(err error, explain ...interface{}) { } func (cCtx *errorContext) Run(name string, cb func(tCtx TContext)) bool { - return run(cCtx, name, cb) + return run(cCtx, name, false, cb) +} + +func (cCtx *errorContext) SyncTest(name string, cb func(tCtx TContext)) bool { + return run(cCtx, name, true, cb) } func (eCtx *errorContext) Logger() klog.Logger { diff --git a/test/utils/ktesting/tcontext.go b/test/utils/ktesting/tcontext.go index 69c2fe1f196..deb44a5cb97 100644 --- a/test/utils/ktesting/tcontext.go +++ b/test/utils/ktesting/tcontext.go @@ -22,6 +22,7 @@ import ( "fmt" "strings" "testing" + "testing/synctest" "time" "github.com/onsi/gomega" @@ -92,6 +93,31 @@ type TContext interface { // test when called elsewhere. Run(name string, f func(tCtx TContext)) bool + // SyncTest uses [synctest.Test] to execute the callback inside a bubble. + // Creates a new subtest if the name is non-empty, otherwise it creates + // the bubble directly in the current test context. + // + // Only works in Go unit tests. + SyncTest(name string, f func(tCtx TContext)) bool + + // IsSyncTest returns true if the context was created by SyncTest. + // + // Inside such a context, Wait is usable. This can be used in + // code which runs inside synctest bubbles and outside: + // - Inside a bubble, Wait can be used to block until + // background activity has settled down (= "durably blocked"). + // Eventually and Consistently both call Wait and then check + // the condition. + // - Outside, polling or some synchronization mechanism has to be used. + IsSyncTest() bool + + // Wait calls [synctest.Wait] and thus ensures that all background + // activity has settled down (= "durably blocked"). + // + // Only works inside a bubble started by SyncTest (can be checked with + // IsSyncTest), panics elsewhere. + Wait() + // Cancel can be invoked to cancel the context before the test is completed. // Tests which use the context to control goroutines and then wait for // termination of those goroutines must call Cancel to avoid a deadlock. @@ -358,7 +384,8 @@ func InitCtx(ctx context.Context, tb TB, _ ...InitOption) TContext { // A simpler API is to use TContext.Run as replacement // for [testing.T.Run]. func WithTB(parentCtx TContext, tb TB) TContext { - tCtx := InitCtx(klog.NewContext(parentCtx, newLogger(tb, false /* don't buffer log output */)), tb) + // TODO: honor the initial settings for logging. + tCtx := InitCtx(klog.NewContext(parentCtx, newLogger(tb, false /* don't buffer log output */)), withKlogHeader(tb)) tCtx = WithCancel(tCtx) tCtx = WithClients(tCtx, @@ -371,27 +398,53 @@ func WithTB(parentCtx TContext, tb TB) TContext { return tCtx } -// run implements the different Run methods. It's not an exported +// run implements the different Run and SyncTest methods. It's not an exported // method because tCtx.Run is more discoverable (same usage as // with normal Go). -func run(tCtx TContext, name string, cb func(tCtx TContext)) bool { +func run(tCtx TContext, name string, syncTest bool, cb func(tCtx TContext)) bool { tCtx.Helper() switch tb := tCtx.TB().(type) { - case interface { - Run(string, func(t *testing.T)) bool - }: + case *testing.T: + if syncTest { + f := func(t *testing.T) { + // We must not propagate the parent's + // cancellation channel into the bubble, + // it causes "panic: receive on synctest channel from outside bubble". + // + // Sync tests shouldn't need the overall suite timeout, + // so this seems okay. + cb(synctestContext{TContext: WithTB(WithoutCancel(tCtx), t)}) + } + if name != "" { + return tb.Run(name, func(t *testing.T) { synctest.Test(t, f) }) + } + synctest.Test(tb, f) + return true + } return tb.Run(name, func(t *testing.T) { cb(WithTB(tCtx, t)) }) - case interface { - Run(string, func(t *testing.B)) bool - }: - return tb.Run(name, func(b *testing.B) { cb(WithTB(tCtx, b)) }) - default: - tCtx.Fatalf("Run not implemented, underlying %T does not support it", tCtx.TB()) + case *testing.B: + if !syncTest { + return tb.Run(name, func(b *testing.B) { cb(WithTB(tCtx, b)) }) + } } + what := "Run" + if syncTest { + what = "SyncTest" + } + tCtx.Fatalf("%s not implemented, underlying %T does not support it", what, tCtx.TB()) + return false } +type synctestContext struct { + TContext +} + +func (s synctestContext) IsSyncTest() bool { + return true +} + // WithContext constructs a new TContext with a different Context instance. // This can be used in callbacks which receive a Context, for example // from Gomega: @@ -482,7 +535,19 @@ func cleanupCtx(tCtx TContext, cb func(TContext)) { } func (cCtx tContext) Run(name string, cb func(tCtx TContext)) bool { - return run(cCtx, name, cb) + return run(cCtx, name, false, cb) +} + +func (cCtx tContext) SyncTest(name string, cb func(tCtx TContext)) bool { + return run(cCtx, name, true, cb) +} + +func (cCtx tContext) IsSyncTest() bool { + return false +} + +func (cCtx tContext) Wait() { + synctest.Wait() } func (tCtx tContext) Logger() klog.Logger { diff --git a/test/utils/ktesting/withcontext.go b/test/utils/ktesting/withcontext.go index 6f1ddfa7221..80b02e2d87c 100644 --- a/test/utils/ktesting/withcontext.go +++ b/test/utils/ktesting/withcontext.go @@ -107,7 +107,11 @@ func (wCtx withContext) ExpectNoError(err error, explain ...interface{}) { } func (cCtx withContext) Run(name string, cb func(tCtx TContext)) bool { - return run(cCtx, name, cb) + return run(cCtx, name, false, cb) +} + +func (cCtx withContext) SyncTest(name string, cb func(tCtx TContext)) bool { + return run(cCtx, name, true, cb) } func (wCtx withContext) Logger() klog.Logger { From 8d3bc085ce4da294001d4730d468036975e2db47 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Thu, 30 Oct 2025 12:49:19 +0100 Subject: [PATCH 2/2] DRA integration: refactor helper functions They now live in a separate file, use ktesting+Gomega consistently, and follow (mostly) best practices: - strict validation during create - wait for completion of delete The latter is not complete yet because it was found that some binding condition test keeps a ResourceCLaim in an undeletable state. That's okayish (namespace stays around until apiserver gets restarted), but is worth looking into later. --- test/integration/dra/dra_test.go | 167 +++-------------------- test/integration/dra/helpers_test.go | 190 +++++++++++++++++++++++++++ 2 files changed, 211 insertions(+), 146 deletions(-) create mode 100644 test/integration/dra/helpers_test.go diff --git a/test/integration/dra/dra_test.go b/test/integration/dra/dra_test.go index 3000b1b0eb4..b788332e73a 100644 --- a/test/integration/dra/dra_test.go +++ b/test/integration/dra/dra_test.go @@ -22,7 +22,6 @@ import ( "encoding/json" "errors" "fmt" - "regexp" "sort" "strings" "sync" @@ -51,7 +50,6 @@ import ( utilfeature "k8s.io/apiserver/pkg/util/feature" resourceapiac "k8s.io/client-go/applyconfigurations/resource/v1" "k8s.io/client-go/informers" - "k8s.io/client-go/kubernetes" "k8s.io/client-go/tools/cache" "k8s.io/client-go/util/retry" "k8s.io/component-base/featuregate" @@ -121,105 +119,6 @@ var ( numNodes = 2 ) -// createTestNamespace creates a namespace with a name that is derived from the -// current test name: -// - Non-alpha-numeric characters replaced by hyphen. -// - Truncated in the middle to make it short enough for GenerateName. -// - Hyphen plus random suffix added by the apiserver. -func createTestNamespace(tCtx ktesting.TContext, labels map[string]string) string { - tCtx.Helper() - name := regexp.MustCompile(`[^[:alnum:]_-]`).ReplaceAllString(tCtx.Name(), "-") - name = strings.ToLower(name) - if len(name) > 63 { - name = name[:30] + "--" + name[len(name)-30:] - } - ns := &v1.Namespace{ObjectMeta: metav1.ObjectMeta{GenerateName: name + "-"}} - ns.Labels = labels - ns, err := tCtx.Client().CoreV1().Namespaces().Create(tCtx, ns, metav1.CreateOptions{}) - tCtx.ExpectNoError(err, "create test namespace") - tCtx.CleanupCtx(func(tCtx ktesting.TContext) { - tCtx.ExpectNoError(tCtx.Client().CoreV1().Namespaces().Delete(tCtx, ns.Name, metav1.DeleteOptions{}), "delete test namespace") - }) - return ns.Name -} - -// createSlice creates the given ResourceSlice and removes it when the test is done. -func createSlice(tCtx ktesting.TContext, slice *resourceapi.ResourceSlice) *resourceapi.ResourceSlice { - tCtx.Helper() - slice, err := tCtx.Client().ResourceV1().ResourceSlices().Create(tCtx, slice, metav1.CreateOptions{}) - tCtx.ExpectNoError(err, "create ResourceSlice") - tCtx.CleanupCtx(func(tCtx ktesting.TContext) { - tCtx.Log("Cleaning up ResourceSlice...") - err := tCtx.Client().ResourceV1().ResourceSlices().Delete(tCtx, slice.Name, metav1.DeleteOptions{}) - tCtx.ExpectNoError(err, "delete ResourceSlice") - }) - return slice -} - -// createTestClass creates a DeviceClass with a driver name derived from the test namespace -func createTestClass(tCtx ktesting.TContext, namespace string) (*resourceapi.DeviceClass, string) { - tCtx.Helper() - driverName := namespace + ".driver" - class := class.DeepCopy() - class.Name = namespace + ".class" - class.Spec.Selectors = []resourceapi.DeviceSelector{{ - CEL: &resourceapi.CELDeviceSelector{ - Expression: fmt.Sprintf("device.driver == %q", driverName), - }, - }} - _, err := tCtx.Client().ResourceV1().DeviceClasses().Create(tCtx, class, metav1.CreateOptions{}) - tCtx.ExpectNoError(err, "create class") - tCtx.CleanupCtx(func(tCtx ktesting.TContext) { - tCtx.Log("Cleaning up DeviceClass...") - err := tCtx.Client().ResourceV1().DeviceClasses().Delete(tCtx, class.Name, metav1.DeleteOptions{}) - tCtx.ExpectNoError(err, "delete class") - }) - - return class, driverName -} - -// createClaim creates a claim and in the namespace. -// The class must already exist and is used for all requests. -func createClaim(tCtx ktesting.TContext, namespace string, suffix string, class *resourceapi.DeviceClass, claim *resourceapi.ResourceClaim) *resourceapi.ResourceClaim { - tCtx.Helper() - claim = claim.DeepCopy() - claim.Namespace = namespace - claim.Name += suffix - claimName := claim.Name - for i := range claim.Spec.Devices.Requests { - request := &claim.Spec.Devices.Requests[i] - if request.Exactly != nil && request.Exactly.DeviceClassName != "" { - request.Exactly.DeviceClassName = class.Name - continue - } - for e := range request.FirstAvailable { - subRequest := &request.FirstAvailable[e] - subRequest.DeviceClassName = class.Name - } - } - claim, err := tCtx.Client().ResourceV1().ResourceClaims(namespace).Create(tCtx, claim, metav1.CreateOptions{}) - tCtx.ExpectNoError(err, "create claim "+claimName) - return claim -} - -// createPod create a pod in the namespace, referencing the given claim. -func createPod(tCtx ktesting.TContext, namespace string, suffix string, claim *resourceapi.ResourceClaim, pod *v1.Pod) *v1.Pod { - tCtx.Helper() - pod = pod.DeepCopy() - pod.Name += suffix - podName := pod.Name - pod.Namespace = namespace - pod.Spec.ResourceClaims[0].ResourceClaimName = &claim.Name - pod, err := tCtx.Client().CoreV1().Pods(namespace).Create(tCtx, pod, metav1.CreateOptions{}) - tCtx.ExpectNoError(err, "create pod "+podName) - tCtx.CleanupCtx(func(tCtx ktesting.TContext) { - tCtx.Log("Cleaning up Pod...") - err := tCtx.Client().CoreV1().Pods(namespace).Delete(tCtx, pod.Name, metav1.DeleteOptions{}) - tCtx.ExpectNoError(err, "delete Pod") - }) - return pod -} - func TestDRA(t *testing.T) { // Each sub-test brings up the API server in a certain // configuration. These sub-tests must run sequentially because they @@ -375,7 +274,7 @@ func createNodes(tCtx ktesting.TContext) { Name: fmt.Sprintf("worker-%d", i), }, } - node, err := tCtx.Client().CoreV1().Nodes().Create(tCtx, node, metav1.CreateOptions{}) + node, err := tCtx.Client().CoreV1().Nodes().Create(tCtx, node, metav1.CreateOptions{FieldValidation: "Strict"}) tCtx.ExpectNoError(err, fmt.Sprintf("creating node #%d", i)) // Make the node ready. @@ -523,7 +422,7 @@ func testPod(tCtx ktesting.TContext, draEnabled bool) { namespace := createTestNamespace(tCtx, nil) podWithClaimName := podWithClaimName.DeepCopy() podWithClaimName.Namespace = namespace - pod, err := tCtx.Client().CoreV1().Pods(namespace).Create(tCtx, podWithClaimName, metav1.CreateOptions{}) + pod, err := tCtx.Client().CoreV1().Pods(namespace).Create(tCtx, podWithClaimName, metav1.CreateOptions{FieldValidation: "Strict"}) tCtx.ExpectNoError(err, "create pod") if draEnabled { assert.NotEmpty(tCtx, pod.Spec.ResourceClaims, "should store resource claims in pod spec") @@ -535,7 +434,7 @@ func testPod(tCtx ktesting.TContext, draEnabled bool) { // testAPIDisabled checks that the resource.k8s.io API is disabled. func testAPIDisabled(tCtx ktesting.TContext) { tCtx.Parallel() - _, err := tCtx.Client().ResourceV1().ResourceClaims(claim.Namespace).Create(tCtx, claim, metav1.CreateOptions{}) + _, err := tCtx.Client().ResourceV1().ResourceClaims(claim.Namespace).Create(tCtx, claim, metav1.CreateOptions{FieldValidation: "Strict"}) if !apierrors.IsNotFound(err) { tCtx.Fatalf("expected 'resource not found' error, got %v", err) } @@ -547,7 +446,7 @@ func testConvert(tCtx ktesting.TContext) { namespace := createTestNamespace(tCtx, nil) claim := claim.DeepCopy() claim.Namespace = namespace - claim, err := tCtx.Client().ResourceV1().ResourceClaims(namespace).Create(tCtx, claim, metav1.CreateOptions{}) + claim, err := tCtx.Client().ResourceV1().ResourceClaims(namespace).Create(tCtx, claim, metav1.CreateOptions{FieldValidation: "Strict"}) tCtx.ExpectNoError(err, "create claim") claimBeta2, err := tCtx.Client().ResourceV1beta2().ResourceClaims(namespace).Get(tCtx, claim.Name, metav1.GetOptions{}) tCtx.ExpectNoError(err, "get claim") @@ -565,7 +464,7 @@ func testAdminAccess(tCtx ktesting.TContext, adminAccessEnabled bool) { claim1.Namespace = namespace claim1.Spec.Devices.Requests[0].Exactly.AdminAccess = ptr.To(true) // create claim with AdminAccess in non-admin namespace - _, err := tCtx.Client().ResourceV1().ResourceClaims(namespace).Create(tCtx, claim1, metav1.CreateOptions{}) + _, err := tCtx.Client().ResourceV1().ResourceClaims(namespace).Create(tCtx, claim1, metav1.CreateOptions{FieldValidation: "Strict"}) if adminAccessEnabled { if err != nil { // should result in validation error @@ -581,7 +480,7 @@ func testAdminAccess(tCtx ktesting.TContext, adminAccessEnabled bool) { claim2.Namespace = adminNS claim2.Name = "claim2" claim2.Spec.Devices.Requests[0].Exactly.AdminAccess = ptr.To(true) - claim2, err := tCtx.Client().ResourceV1().ResourceClaims(adminNS).Create(tCtx, claim2, metav1.CreateOptions{}) + claim2, err := tCtx.Client().ResourceV1().ResourceClaims(adminNS).Create(tCtx, claim2, metav1.CreateOptions{FieldValidation: "Strict"}) tCtx.ExpectNoError(err, "create claim") if !ptr.Deref(claim2.Spec.Devices.Requests[0].Exactly.AdminAccess, true) { tCtx.Fatalf("should store AdminAccess in ResourceClaim %v", claim2) @@ -705,7 +604,7 @@ func testPrioritizedList(tCtx ktesting.TContext, enabled bool) { func testExtendedResource(tCtx ktesting.TContext, enabled bool) { tCtx.Parallel() - c, err := tCtx.Client().ResourceV1().DeviceClasses().Create(tCtx, classWithExtendedResource, metav1.CreateOptions{}) + c, err := tCtx.Client().ResourceV1().DeviceClasses().Create(tCtx, classWithExtendedResource, metav1.CreateOptions{FieldValidation: "Strict"}) tCtx.ExpectNoError(err, "create class") if enabled { require.NotEmpty(tCtx, c.Spec.ExtendedResourceName, "should store ExtendedResourceName") @@ -716,7 +615,7 @@ func testExtendedResource(tCtx ktesting.TContext, enabled bool) { pod := podWithExtendedResource.DeepCopy() pod.Namespace = namespace - _, err := tCtx.Client().CoreV1().Pods(namespace).Create(tCtx, pod, metav1.CreateOptions{}) + _, err := tCtx.Client().CoreV1().Pods(namespace).Create(tCtx, pod, metav1.CreateOptions{FieldValidation: "Strict"}) tCtx.ExpectNoError(err, "create pod") schedulingAttempted := gomega.HaveField("Status.Conditions", gomega.ContainElement( gstruct.MatchFields(gstruct.IgnoreExtras, gstruct.Fields{ @@ -1094,7 +993,7 @@ func testResourceClaimDeviceStatus(tCtx ktesting.TContext, enabled bool) { }, } - claim, err := tCtx.Client().ResourceV1().ResourceClaims(namespace).Create(tCtx, claim, metav1.CreateOptions{}) + claim, err := tCtx.Client().ResourceV1().ResourceClaims(namespace).Create(tCtx, claim, metav1.CreateOptions{FieldValidation: "Strict"}) tCtx.ExpectNoError(err, "create ResourceClaim") deviceStatus := []resourceapi.AllocatedDeviceStatus{{ @@ -1247,8 +1146,7 @@ func testResourceClaimDeviceStatus(tCtx ktesting.TContext, enabled bool) { // and prints some information about it. func testMaxResourceSlice(tCtx ktesting.TContext) { slice := NewMaxResourceSlice() - createdSlice, err := tCtx.Client().ResourceV1().ResourceSlices().Create(tCtx, slice, metav1.CreateOptions{}) - tCtx.ExpectNoError(err) + createdSlice := createSlice(tCtx, slice) totalSize := createdSlice.Size() var managedFieldsSize int for _, f := range createdSlice.ManagedFields { @@ -1323,7 +1221,7 @@ func testControllerManagerMetrics(tCtx ktesting.TContext) { }, } - _, err := tCtx.Client().ResourceV1().ResourceClaimTemplates(namespace).Create(tCtx, template1, metav1.CreateOptions{}) + _, err := tCtx.Client().ResourceV1().ResourceClaimTemplates(namespace).Create(tCtx, template1, metav1.CreateOptions{FieldValidation: "Strict"}) tCtx.ExpectNoError(err, "create ResourceClaimTemplate without admin access") pod1 := &v1.Pod{ @@ -1343,7 +1241,7 @@ func testControllerManagerMetrics(tCtx ktesting.TContext) { }, } - _, err = tCtx.Client().CoreV1().Pods(namespace).Create(tCtx, pod1, metav1.CreateOptions{}) + _, err = tCtx.Client().CoreV1().Pods(namespace).Create(tCtx, pod1, metav1.CreateOptions{FieldValidation: "Strict"}) tCtx.ExpectNoError(err, "create Pod with ResourceClaimTemplate without admin access") time.Sleep(200 * time.Millisecond) @@ -1377,7 +1275,7 @@ func testControllerManagerMetrics(tCtx ktesting.TContext) { }, } - _, err = tCtx.Client().ResourceV1().ResourceClaimTemplates(adminNS).Create(tCtx, template2, metav1.CreateOptions{}) + _, err = tCtx.Client().ResourceV1().ResourceClaimTemplates(adminNS).Create(tCtx, template2, metav1.CreateOptions{FieldValidation: "Strict"}) tCtx.ExpectNoError(err, "create ResourceClaimTemplate with admin access") pod2 := &v1.Pod{ @@ -1397,7 +1295,7 @@ func testControllerManagerMetrics(tCtx ktesting.TContext) { }, } - _, err = tCtx.Client().CoreV1().Pods(adminNS).Create(tCtx, pod2, metav1.CreateOptions{}) + _, err = tCtx.Client().CoreV1().Pods(adminNS).Create(tCtx, pod2, metav1.CreateOptions{FieldValidation: "Strict"}) tCtx.ExpectNoError(err, "create Pod with ResourceClaimTemplate with admin access in admin namespace") time.Sleep(200 * time.Millisecond) @@ -1428,7 +1326,7 @@ func testControllerManagerMetrics(tCtx ktesting.TContext) { }, } - _, err = tCtx.Client().ResourceV1().ResourceClaimTemplates(namespace).Create(tCtx, invalidTemplate, metav1.CreateOptions{}) + _, err = tCtx.Client().ResourceV1().ResourceClaimTemplates(namespace).Create(tCtx, invalidTemplate, metav1.CreateOptions{FieldValidation: "Strict"}) require.Error(tCtx, err, "should fail to create ResourceClaimTemplate with AdminAccess in non-admin namespace") require.ErrorContains(tCtx, err, "admin access to devices requires the `resource.kubernetes.io/admin-access: true` label on the containing namespace") @@ -1452,7 +1350,7 @@ func testControllerManagerMetrics(tCtx ktesting.TContext) { }, } - _, err = tCtx.Client().ResourceV1().ResourceClaimTemplates(namespace).Create(tCtx, template4, metav1.CreateOptions{}) + _, err = tCtx.Client().ResourceV1().ResourceClaimTemplates(namespace).Create(tCtx, template4, metav1.CreateOptions{FieldValidation: "Strict"}) tCtx.ExpectNoError(err, "create second ResourceClaimTemplate without admin access") pod4 := &v1.Pod{ @@ -1472,7 +1370,7 @@ func testControllerManagerMetrics(tCtx ktesting.TContext) { }, } - _, err = tCtx.Client().CoreV1().Pods(namespace).Create(tCtx, pod4, metav1.CreateOptions{}) + _, err = tCtx.Client().CoreV1().Pods(namespace).Create(tCtx, pod4, metav1.CreateOptions{FieldValidation: "Strict"}) tCtx.ExpectNoError(err, "create second Pod with ResourceClaimTemplate without admin access") time.Sleep(200 * time.Millisecond) @@ -1529,7 +1427,7 @@ func testDeviceBindingConditions(tCtx ktesting.TContext, enabled bool) { }, }, } - slice, err := tCtx.Client().ResourceV1().ResourceSlices().Create(tCtx, slice, metav1.CreateOptions{}) + slice, err := tCtx.Client().ResourceV1().ResourceSlices().Create(tCtx, slice, metav1.CreateOptions{FieldValidation: "Strict"}) tCtx.ExpectNoError(err, "create slice") haveBindingConditionFields := len(slice.Spec.Devices[0].BindingConditions) > 0 || len(slice.Spec.Devices[0].BindingFailureConditions) > 0 @@ -1561,7 +1459,7 @@ func testDeviceBindingConditions(tCtx ktesting.TContext, enabled bool) { }, }, } - _, err = tCtx.Client().ResourceV1().ResourceSlices().Create(tCtx, sliceWithoutBinding, metav1.CreateOptions{}) + _, err = tCtx.Client().ResourceV1().ResourceSlices().Create(tCtx, sliceWithoutBinding, metav1.CreateOptions{FieldValidation: "Strict"}) tCtx.ExpectNoError(err, "create slice without binding conditions") // Schedule first pod and wait for the scheduler to reach the binding phase, which marks the claim as allocated. @@ -1590,8 +1488,7 @@ func testDeviceBindingConditions(tCtx ktesting.TContext, enabled bool) { )), }))), "first allocated claim") - err = waitForPodScheduled(tCtx, tCtx.Client(), namespace, pod.Name) - tCtx.ExpectNoError(err, "first pod scheduled") + waitForPodScheduled(tCtx, namespace, pod.Name) // Second pod should get the device with binding conditions. claim2 := createClaim(tCtx, namespace, "-b", class, claim) @@ -1684,27 +1581,5 @@ func testDeviceBindingConditions(tCtx ktesting.TContext, enabled bool) { return err }) tCtx.ExpectNoError(err, "add binding condition to second claim") - err = waitForPodScheduled(tCtx, tCtx.Client(), namespace, pod.Name) - tCtx.ExpectNoError(err, "second pod scheduled") -} - -func waitForPodScheduled(ctx context.Context, client kubernetes.Interface, namespace, podName string) error { - timeout := time.After(60 * time.Second) - tick := time.Tick(1 * time.Second) - for { - select { - case <-timeout: - return fmt.Errorf("timed out waiting for pod %s/%s to be scheduled", namespace, podName) - case <-tick: - pod, err := client.CoreV1().Pods(namespace).Get(ctx, podName, metav1.GetOptions{}) - if err != nil { - continue - } - for _, cond := range pod.Status.Conditions { - if cond.Type == v1.PodScheduled && cond.Status == v1.ConditionTrue { - return nil - } - } - } - } + waitForPodScheduled(tCtx, namespace, pod.Name) } diff --git a/test/integration/dra/helpers_test.go b/test/integration/dra/helpers_test.go new file mode 100644 index 00000000000..7d696ef318c --- /dev/null +++ b/test/integration/dra/helpers_test.go @@ -0,0 +1,190 @@ +/* +Copyright 2025 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package dra + +import ( + "context" + "fmt" + "regexp" + "strings" + "time" + + "github.com/onsi/gomega" + + v1 "k8s.io/api/core/v1" + resourceapi "k8s.io/api/resource/v1" + apierrors "k8s.io/apimachinery/pkg/api/errors" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/kubernetes/test/utils/ktesting" + "k8s.io/utils/ptr" +) + +// must can be wrapped around a Create/Update/Patch/Get/Delete call and handles the error checking: +// +// pod = must(tCtx, tCtx.Client().CoreV1().Pods(namespace).Create, pod, metav1.CreateOptions{}) +func must[R, P, O any](tCtx ktesting.TContext, call func(context.Context, P, O) (*R, error), p P, o O) *R { + tCtx.Helper() + r, err := call(tCtx, p, o) + tCtx.ExpectNoError(err) + return r +} + +// createTestNamespace creates a namespace with a name that is derived from the +// current test name: +// - Non-alpha-numeric characters replaced by hyphen. +// - Truncated in the middle to make it short enough for GenerateName. +// - Hyphen plus random suffix added by the apiserver. +func createTestNamespace(tCtx ktesting.TContext, labels map[string]string) string { + tCtx.Helper() + name := regexp.MustCompile(`[^[:alnum:]_-]`).ReplaceAllString(tCtx.Name(), "-") + name = strings.ToLower(name) + if len(name) > 63 { + name = name[:30] + "--" + name[len(name)-30:] + } + ns := &v1.Namespace{ObjectMeta: metav1.ObjectMeta{GenerateName: name + "-"}} + ns.Labels = labels + ns, err := tCtx.Client().CoreV1().Namespaces().Create(tCtx, ns, metav1.CreateOptions{}) + tCtx.ExpectNoError(err, "create test namespace") + tCtx.CleanupCtx(func(tCtx ktesting.TContext) { + tCtx.ExpectNoError(tCtx.Client().CoreV1().Namespaces().Delete(tCtx, ns.Name, metav1.DeleteOptions{})) + // *Not* waiting here. Deleting namespaces is slow. + }) + return ns.Name +} + +// createSlice creates the given ResourceSlice and removes it when the test is done. +func createSlice(tCtx ktesting.TContext, slice *resourceapi.ResourceSlice) *resourceapi.ResourceSlice { + tCtx.Helper() + slice, err := tCtx.Client().ResourceV1().ResourceSlices().Create(tCtx, slice, metav1.CreateOptions{}) + tCtx.ExpectNoError(err, "create ResourceSlice") + tCtx.CleanupCtx(func(tCtx ktesting.TContext) { + tCtx.Log("Cleaning up ResourceSlice...") + deleteAndWait(tCtx, tCtx.Client().ResourceV1().ResourceSlices().Delete, tCtx.Client().ResourceV1().ResourceSlices().Get, slice.Name) + }) + return slice +} + +// createTestClass creates a DeviceClass with a driver name derived from the test namespace +func createTestClass(tCtx ktesting.TContext, namespace string) (*resourceapi.DeviceClass, string) { + tCtx.Helper() + driverName := namespace + ".driver" + class := class.DeepCopy() + class.Name = namespace + ".class" + class.Spec.Selectors = []resourceapi.DeviceSelector{{ + CEL: &resourceapi.CELDeviceSelector{ + Expression: fmt.Sprintf("device.driver == %q", driverName), + }, + }} + _, err := tCtx.Client().ResourceV1().DeviceClasses().Create(tCtx, class, metav1.CreateOptions{}) + tCtx.ExpectNoError(err, "create class") + tCtx.CleanupCtx(func(tCtx ktesting.TContext) { + tCtx.Log("Cleaning up DeviceClass...") + deleteAndWait(tCtx, tCtx.Client().ResourceV1().DeviceClasses().Delete, tCtx.Client().ResourceV1().DeviceClasses().Get, class.Name) + }) + + return class, driverName +} + +// createClaim creates a claim and in the namespace. +// The class must already exist and is used for all requests. +func createClaim(tCtx ktesting.TContext, namespace string, suffix string, class *resourceapi.DeviceClass, claim *resourceapi.ResourceClaim) *resourceapi.ResourceClaim { + tCtx.Helper() + claim = claim.DeepCopy() + claim.Namespace = namespace + claim.Name += suffix + claimName := claim.Name + for i := range claim.Spec.Devices.Requests { + request := &claim.Spec.Devices.Requests[i] + if request.Exactly != nil && request.Exactly.DeviceClassName != "" { + request.Exactly.DeviceClassName = class.Name + continue + } + for e := range request.FirstAvailable { + subRequest := &request.FirstAvailable[e] + subRequest.DeviceClassName = class.Name + } + } + claim, err := tCtx.Client().ResourceV1().ResourceClaims(namespace).Create(tCtx, claim, metav1.CreateOptions{}) + tCtx.ExpectNoError(err, "create claim "+claimName) + // TODO: some tests leak claims. Probably they need to be fixed... later. + // tCtx.CleanupCtx(func(tCtx ktesting.TContext) { + // // We want to know when tearing this down gets stuck. + // deleteAndWait(tCtx, tCtx.Client().ResourceV1().ResourceClaims(namespace).Delete, tCtx.Client().ResourceV1().ResourceClaims(namespace).Get, claim.Name) + // }) + return claim +} + +// createPod create a pod in the namespace, referencing the given claim. +func createPod(tCtx ktesting.TContext, namespace string, suffix string, claim *resourceapi.ResourceClaim, pod *v1.Pod) *v1.Pod { + tCtx.Helper() + pod = pod.DeepCopy() + pod.Name += suffix + podName := pod.Name + pod.Namespace = namespace + pod.Spec.ResourceClaims[0].ResourceClaimName = &claim.Name + pod, err := tCtx.Client().CoreV1().Pods(namespace).Create(tCtx, pod, metav1.CreateOptions{}) + tCtx.ExpectNoError(err, "create pod "+podName) + tCtx.CleanupCtx(func(tCtx ktesting.TContext) { + tCtx.Log("Cleaning up Pod...") + // We must delete pods before uninstalling our driver. + // Also, we want to know when stopping it gets stuck. + deleteAndWait(tCtx, tCtx.Client().CoreV1().Pods(namespace).Delete, tCtx.Client().CoreV1().Pods(namespace).Get, pod.Name) + }) + return pod +} + +func waitForPodScheduled(tCtx ktesting.TContext, namespace, podName string) { + tCtx.Helper() + + ktesting.Eventually(tCtx, func(tCtx ktesting.TContext) *v1.Pod { + return must(tCtx, tCtx.Client().CoreV1().Pods(namespace).Get, podName, metav1.GetOptions{}) + }).WithTimeout(60*time.Second).Should( + gomega.HaveField("Status.Conditions", gomega.ContainElement( + gomega.And( + gomega.HaveField("Type", gomega.Equal(v1.PodScheduled)), + gomega.HaveField("Status", gomega.Equal(v1.ConditionTrue)), + ), + )), + "Pod %s should have been scheduled.", podName, + ) +} + +func deleteAndWait[T any](tCtx ktesting.TContext, del func(context.Context, string, metav1.DeleteOptions) error, get func(context.Context, string, metav1.GetOptions) (T, error), name string) { + tCtx.Helper() + + var t T + var anyT any = t + var options metav1.DeleteOptions + if _, ok := anyT.(*v1.Pod); ok { + // Special case for pods: we don't have a kubelet which acknowledges + // shutdown of a scheduled pod, so we have to force-delete. + options.GracePeriodSeconds = ptr.To(int64(0)) + } + + tCtx.ExpectNoError(del(tCtx, name, options), fmt.Sprintf("delete %T %s", t, name)) + waitForNotFound(tCtx, get, name) +} + +func waitForNotFound[T any](tCtx ktesting.TContext, get func(context.Context, string, metav1.GetOptions) (T, error), name string) { + tCtx.Helper() + + var t T + ktesting.Eventually(tCtx, func(tCtx ktesting.TContext) error { + _, err := get(tCtx, name, metav1.GetOptions{}) + return err + }).WithTimeout(60*time.Second).Should(gomega.MatchError(apierrors.IsNotFound, "IsNotFound"), "Object %T %s should have been removed.", t, name) +}