From de477148792c421372b8727fbfcda0d8d6c09463 Mon Sep 17 00:00:00 2001 From: Patrick Ohly Date: Wed, 10 Dec 2025 12:34:04 +0100 Subject: [PATCH] DRA upgrade/downgrade: rewrite as Go unit test tCtx.Run and sub-tests make it much simpler to separate the different steps than with Ginkgo because unless a test runs tCtx.Parallel (which we don't do here), everything runs sequentially in a deterministic order. Right now we get: ... localupcluster.go:285: I1210 12:24:22.067524] bring up v1.34: stopping kubelet localupcluster.go:285: I1210 12:24:22.067548] bring up v1.34: stopping kube-scheduler localupcluster.go:285: I1210 12:24:22.067570] bring up v1.34: stopping kube-controller-manager localupcluster.go:285: I1210 12:24:22.067589] bring up v1.34: stopping kube-apiserver --- PASS: TestUpgradeDowngrade (94.78s) --- PASS: TestUpgradeDowngrade/after-cluster-creation (2.07s) --- PASS: TestUpgradeDowngrade/after-cluster-creation/core_DRA (2.05s) --- PASS: TestUpgradeDowngrade/after-cluster-creation/ResourceClaim_device_status (0.02s) --- PASS: TestUpgradeDowngrade/after-cluster-upgrade (4.10s) --- PASS: TestUpgradeDowngrade/after-cluster-upgrade/core_DRA (4.09s) --- PASS: TestUpgradeDowngrade/after-cluster-upgrade/ResourceClaim_device_status (0.01s) --- PASS: TestUpgradeDowngrade/after-cluster-downgrade (1.24s) --- PASS: TestUpgradeDowngrade/after-cluster-downgrade/core_DRA (1.21s) --- PASS: TestUpgradeDowngrade/after-cluster-downgrade/ResourceClaim_device_status (0.02s) PASS It's even possible to use `-failfast` and e.g. `-run=TestUpgradeDowngrade/after-cluster-creation/core_DRA`: `go test` then runs everything up to that sub-test or any failing sub-test, then stops and cleans up. --- test/e2e_dra/README.md | 22 +- test/e2e_dra/coredra_test.go | 21 +- test/e2e_dra/resourceclaimstatus_test.go | 17 +- test/e2e_dra/upgradedowngrade_test.go | 436 +++++++++-------------- 4 files changed, 197 insertions(+), 299 deletions(-) diff --git a/test/e2e_dra/README.md b/test/e2e_dra/README.md index a3e8abbd446..74eb638a2d1 100644 --- a/test/e2e_dra/README.md +++ b/test/e2e_dra/README.md @@ -1,12 +1,13 @@ This directory contains a testsuite with automatic upgrade/downgrade tests for DRA. Conceptually this is like an integration test, in the sense that it -starts/stops cluster components and runs tests against them. +starts/stops cluster components and runs tests against them. It has its own +directory because it needs to be started differently than other integration +tests or unit tests, which makes it more like an E2E suite. The difference is that it starts Kubernetes components by running the actual binaries, relying on local-up-cluster.sh for the logic and configuration -steps. Because local-up-cluster.sh needs additional permissions and -preparations on the host, the test cannot run in "make test-integration" and -just skips itself there. +steps. local-up-cluster.sh needs additional permissions and +preparations on the host. To run it: - Make sure that hack/local-up-cluster.sh works: @@ -23,21 +24,16 @@ To run it: Otherwise a test tmp directory is used. - Invoke as a Go test (no need for the ginkgo CLI), for example: - go test -v -count=1 -timeout=1h ./test/e2e_dra -args -ginkgo.v - dlv test ./test/e2e_dra -- -ginkgo.v - make test KUBE_TIMEOUT=-timeout=1h WHAT=test/e2e_dra FULL_LOG=true KUBE_TEST_ARGS="-count=1 -args -ginkgo.v" + go test -v -count=1 -timeout=1h ./test/e2e_dra + dlv test ./test/e2e_dra -- -test.v + make test KUBE_TIMEOUT=-timeout=1h WHAT=test/e2e_dra FULL_LOG=true KUBE_TEST_ARGS="-count=1" `make test` instead of `make test-integration` is intentional: `local-up-cluster.sh` itself wants to start etcd. `-count=1` ensures that test runs each time it is invoked. -`-v` and `-ginkgo.v` make the test output visible while the test runs. +`-v`/`-test.v`/`FULL_LOG=true` make the test output visible while the test runs. To simplify starting from scratch, `./test/e2e_dra/run.sh` cleans up, sets permissions, and then invokes whatever command is specified on the command line: ./test/e2e_dra/run.sh go test ./test/e2e_dra - -The test is implemented as a Ginkgo suite because that allows reusing the same -helper code as in E2E tests. Long-term the goal is to port that helper code to -ktesting, support ktesting in test/e2e, and turn this test into a normal Go -test. diff --git a/test/e2e_dra/coredra_test.go b/test/e2e_dra/coredra_test.go index d5f0a928644..336c3dd200b 100644 --- a/test/e2e_dra/coredra_test.go +++ b/test/e2e_dra/coredra_test.go @@ -17,28 +17,29 @@ limitations under the License. package e2edra import ( + "time" + "github.com/onsi/gomega" v1 "k8s.io/api/core/v1" apierrors "k8s.io/apimachinery/pkg/api/errors" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" drautils "k8s.io/kubernetes/test/e2e/dra/utils" - "k8s.io/kubernetes/test/e2e/framework" e2epod "k8s.io/kubernetes/test/e2e/framework/pod" "k8s.io/kubernetes/test/utils/ktesting" ) -func coreDRA(tCtx ktesting.TContext, f *framework.Framework, b *drautils.Builder) step2Func { - namespace := f.Namespace.Name +func coreDRA(tCtx ktesting.TContext, b *drautils.Builder) upgradedTestFunc { + namespace := tCtx.Namespace() claim := b.ExternalClaim() pod := b.PodExternal() b.Create(tCtx, claim, pod) b.TestPod(tCtx, pod) - return func(tCtx ktesting.TContext) step3Func { + return func(tCtx ktesting.TContext) downgradedTestFunc { // Remove pod prepared in step 1. - framework.ExpectNoError(f.ClientSet.ResourceV1beta1().ResourceClaims(namespace).Delete(tCtx, claim.Name, metav1.DeleteOptions{})) - framework.ExpectNoError(f.ClientSet.CoreV1().Pods(namespace).Delete(tCtx, pod.Name, metav1.DeleteOptions{})) - framework.ExpectNoError(e2epod.WaitForPodNotFoundInNamespace(tCtx, f.ClientSet, pod.Name, namespace, f.Timeouts.PodDelete)) + tCtx.ExpectNoError(tCtx.Client().ResourceV1beta1().ResourceClaims(namespace).Delete(tCtx, claim.Name, metav1.DeleteOptions{})) + tCtx.ExpectNoError(tCtx.Client().CoreV1().Pods(namespace).Delete(tCtx, pod.Name, metav1.DeleteOptions{})) + tCtx.ExpectNoError(e2epod.WaitForPodNotFoundInNamespace(tCtx, tCtx.Client(), pod.Name, namespace, 3*time.Minute)) // Create another claim and pod, this time using the latest Kubernetes. claim = b.ExternalClaim() @@ -56,13 +57,13 @@ func coreDRA(tCtx ktesting.TContext, f *framework.Framework, b *drautils.Builder // or (even weirder) with // getting *v1.Pod: pods "tester-2" is forbidden: User "kubernetes-admin" cannot get resource "pods" in API group "" in the namespace "dra-9021" ktesting.Eventually(tCtx, func(tCtx ktesting.TContext) error { - return f.ClientSet.ResourceV1beta1().ResourceClaims(namespace).Delete(tCtx, claim.Name, metav1.DeleteOptions{}) + return tCtx.Client().ResourceV1beta1().ResourceClaims(namespace).Delete(tCtx, claim.Name, metav1.DeleteOptions{}) }).Should(gomega.Succeed(), "delete claim after downgrade") ktesting.Eventually(tCtx, func(tCtx ktesting.TContext) error { - return f.ClientSet.CoreV1().Pods(namespace).Delete(tCtx, pod.Name, metav1.DeleteOptions{}) + return tCtx.Client().CoreV1().Pods(namespace).Delete(tCtx, pod.Name, metav1.DeleteOptions{}) }).Should(gomega.Succeed(), "delete pod after downgrade") ktesting.Eventually(tCtx, func(tCtx ktesting.TContext) *v1.Pod { - pod, err := f.ClientSet.CoreV1().Pods(namespace).Get(tCtx, pod.Name, metav1.GetOptions{}) + pod, err := tCtx.Client().CoreV1().Pods(namespace).Get(tCtx, pod.Name, metav1.GetOptions{}) if apierrors.IsNotFound(err) { return nil } diff --git a/test/e2e_dra/resourceclaimstatus_test.go b/test/e2e_dra/resourceclaimstatus_test.go index d4a65624628..b49bdd45599 100644 --- a/test/e2e_dra/resourceclaimstatus_test.go +++ b/test/e2e_dra/resourceclaimstatus_test.go @@ -31,15 +31,12 @@ import ( resourceapiacv1beta2 "k8s.io/client-go/applyconfigurations/resource/v1beta2" draapiv1beta2 "k8s.io/dynamic-resource-allocation/api/v1beta2" drautils "k8s.io/kubernetes/test/e2e/dra/utils" - "k8s.io/kubernetes/test/e2e/framework" "k8s.io/kubernetes/test/utils/ktesting" ) -// resourceClaimDeviceStatus corresponds to testResourceClaimDeviceStatus in test/integration/dra -// and was copied from there, therefore the unit-test style with tCtx and require. -// This is the preferred style for new tests. -func resourceClaimDeviceStatus(tCtx ktesting.TContext, f *framework.Framework, b *drautils.Builder) step2Func { - namespace := f.Namespace.Name +// resourceClaimDeviceStatus corresponds to testResourceClaimDeviceStatus in test/integration/dra. +func resourceClaimDeviceStatus(tCtx ktesting.TContext, b *drautils.Builder) upgradedTestFunc { + namespace := tCtx.Namespace() claimName := "claim-with-device-status" claim := &resourceapiv1beta2.ResourceClaim{ ObjectMeta: metav1.ObjectMeta{ @@ -111,7 +108,6 @@ func resourceClaimDeviceStatus(tCtx ktesting.TContext, f *framework.Framework, b err = client.ResourceClaims(namespace).Delete(tCtx, claim.Name, metav1.DeleteOptions{}) tCtx.ExpectNoError(err, "delete claim") } - tCtx.CleanupCtx(removeClaim) claim, err = tCtx.Client().ResourceV1beta2().ResourceClaims(namespace).UpdateStatus(tCtx, claim, metav1.UpdateOptions{}) tCtx.ExpectNoError(err, "add allocation result") @@ -193,7 +189,7 @@ func resourceClaimDeviceStatus(tCtx ktesting.TContext, f *framework.Framework, b tCtx.ExpectNoError(encoder.Encode(claim)) tCtx.Logf("Final ResourceClaim:\n%s", buffer.String()) - return func(tCtx ktesting.TContext) step3Func { + return func(tCtx ktesting.TContext) downgradedTestFunc { // Update one entry, remove the other. deviceStatusAC := resourceapiac.AllocatedDeviceStatus(). WithDriver("two"). @@ -227,8 +223,9 @@ func resourceClaimDeviceStatus(tCtx ktesting.TContext, f *framework.Framework, b tCtx.ExpectNoError(err, "remove device status three") require.Equal(tCtx, deviceStatus, claim.Status.Devices, "after removing device status three") - // The cleanup order is so that we have to run this explicitly now. - // The tCtx.CleanupCtx is more for the sake of completeness. + // This was created in a prior sub-test, so we have to + // clean up manually for a proper termination of the + // overall test. removeClaim(tCtx) } } diff --git a/test/e2e_dra/upgradedowngrade_test.go b/test/e2e_dra/upgradedowngrade_test.go index ea8d867a5b8..6048bd9e3cf 100644 --- a/test/e2e_dra/upgradedowngrade_test.go +++ b/test/e2e_dra/upgradedowngrade_test.go @@ -19,9 +19,7 @@ package e2edra import ( "archive/tar" "compress/gzip" - "context" _ "embed" - "flag" "fmt" "io" "net/http" @@ -34,19 +32,16 @@ import ( "testing" "time" - "github.com/onsi/ginkgo/v2" "github.com/onsi/gomega" "k8s.io/apimachinery/pkg/util/version" restclient "k8s.io/client-go/rest" "k8s.io/kubernetes/cmd/kubeadm/app/util/errors" drautils "k8s.io/kubernetes/test/e2e/dra/utils" - "k8s.io/kubernetes/test/e2e/framework" e2enode "k8s.io/kubernetes/test/e2e/framework/node" e2etestfiles "k8s.io/kubernetes/test/e2e/framework/testfiles" "k8s.io/kubernetes/test/utils/ktesting" "k8s.io/kubernetes/test/utils/localupcluster" - admissionapi "k8s.io/pod-security-admission/api" ) var errHTTP404 = errors.New("resource not found (404)") @@ -69,31 +64,23 @@ func init() { // // The "test code" gets registered here with a single function for each // sub-test. That function then returns the next piece of code, which then -// returns the final code. -// -// For performance reasons there is only a single `It("works")` which -// runs everything. Failures in sub-tests are reported separately *if they -// are reported via the TContext*. Failures reported via ginkgo.Fail -// currently abort the entire test. This will be addressed by converting -// everything to ktesting-based unit tests. +// returns the final code. Each callback function is executed as a sub-test. +// The builder is configured to not delete objects when that sub-test ends, +// so objects persist until the entire test is done. // // Each sub-test must be self-contained. They intentionally run in a random // order. However, they share the same cluster and the 8 devices which are // available there. -var subTests = map[string]step1Func{ +var subTests = map[string]initialTestFunc{ "core DRA": coreDRA, "ResourceClaim device status": resourceClaimDeviceStatus, } -// step1Func is passed everything that is needed to run -type step1Func func(tCtx ktesting.TContext, f *framework.Framework, builder *drautils.Builder) step2Func +type initialTestFunc func(tCtx ktesting.TContext, builder *drautils.Builder) upgradedTestFunc -type step2Func func(tCtx ktesting.TContext) step3Func +type upgradedTestFunc func(tCtx ktesting.TContext) downgradedTestFunc -type step3Func func(tCtx ktesting.TContext) - -// steps has the names for the actual ginkgo.It where sub-test results are provided. -var steps = []string{"before downgrade", "after downgrade", "after upgrade"} +type downgradedTestFunc func(tCtx ktesting.TContext) var repoRoot = repoRootDefault() @@ -117,272 +104,189 @@ func repoRootDefault() string { return "../../" } -func TestUpgradeDowngrade(t *testing.T) { - suiteConfig, reporterConfig := framework.CreateGinkgoConfig() - suiteConfig.RandomizeAllSpecs = false - ginkgo.RunSpecs(t, "DRA", suiteConfig, reporterConfig) -} - -var _ = ginkgo.Describe("DRA upgrade/downgrade", func() { - // Initialize the default values by registering flags. We don't actually expose those flags. - var fs flag.FlagSet - framework.RegisterCommonFlags(&fs) - framework.RegisterClusterFlags(&fs) - +func TestUpgradeDowngrade(t *testing.T) { testUpgradeDowngrade(ktesting.Init(t)) } +func testUpgradeDowngrade(tCtx ktesting.TContext) { // Some other things normally done by test/e2e. e2etestfiles.AddFileSource(e2etestfiles.RootFileSource{Root: repoRoot}) - gomega.RegisterFailHandler(ginkgo.Fail) - // sub-test -> step -> failure - // - // Initially each failure string is empty. - results := make(map[string]map[string]string, len(subTests)) - for subTest := range subTests { - results[subTest] = make(map[string]string, len(steps)) + // Ideally we shouldn't have any code which directly calls gomega.Expect, + // but we are not there yet (e.g. e2epod.MakePod). So for now we install + // one fail handler which records failures in the main test context. + gomega.RegisterFailHandler(func(message string, callerSkip ...int) { + tCtx.Helper() + tCtx.Fatal(message) + }) + + envName, dir := currentBinDir() + if dir == "" { + tCtx.Fatalf("%s must be set to test DRA upgrade/downgrade scenarios.", envName) } - ginkgo.It("works", func(ctx context.Context) { - // TODO: replace with helper code from https://github.com/kubernetes/kubernetes/pull/122481 should that get merged. - tCtx := ktesting.Init(GinkgoContextTB()) - tCtx = ktesting.WithContext(tCtx, ctx) + // Determine what we need to downgrade to. + tCtx = ktesting.Begin(tCtx, "get source code version") + gitVersion, _, err := sourceVersion(tCtx, repoRoot) + tCtx.ExpectNoError(err, "determine source code version for repo root %q", repoRoot) + version, err := version.ParseGeneric(gitVersion) + tCtx.ExpectNoError(err, "parse version %s of repo root %q", gitVersion, repoRoot) + major, previousMinor := version.Major(), version.Minor()-1 + if strings.Contains(gitVersion, "-alpha.0") { + // All version up to and including x.y.z-alpha.0 are treated as if we were + // still the previous minor version x.(y-1). There are two reason for this: + // + // - During code freeze around (at?) -rc.0, the master branch already + // identfies itself as the next release with -alpha.0. Without this + // special case, we would change the version skew testing from what + // has been tested and been known to work to something else, which + // can and at least once did break. + // + // - Early in the next cycle the differences compared to the previous + // release are small, so it's more interesting to go back further. + previousMinor-- + } + tCtx.Logf("got version: major: %d, minor: %d, previous minor: %d", major, version.Minor(), previousMinor) + tCtx = ktesting.End(tCtx) - envName, dir := currentBinDir() - if dir == "" { - tCtx.Fatalf("%s must be set to test DRA upgrade/downgrade scenarios.", envName) + // KUBERNETES_SERVER_CACHE_DIR can be set to keep downloaded files across test restarts. + binDir, cacheBinaries := os.LookupEnv("KUBERNETES_SERVER_CACHE_DIR") + if !cacheBinaries { + binDir = tCtx.TempDir() + } + haveBinaries := false + + // Get the previous release. + tCtx = ktesting.Begin(tCtx, "get previous release info") + tCtx.Logf("stable release %d.%d", major, previousMinor) + previousURL, previousVersion, err := serverDownloadURL(tCtx, "stable", major, previousMinor) + if errors.Is(err, errHTTP404) { + tCtx.Logf("stable doesn't exist, get latest release %d.%d", major, previousMinor) + previousURL, previousVersion, err = serverDownloadURL(tCtx, "latest", major, previousMinor) + } + tCtx.ExpectNoError(err) + tCtx.Logf("got previous release version: %s, URL: %s", previousVersion, previousURL) + tCtx = ktesting.End(tCtx) + + if cacheBinaries { + binDir = path.Join(binDir, previousVersion) + _, err := os.Stat(path.Join(binDir, string(localupcluster.KubeClusterComponents[0]))) + if err == nil { + haveBinaries = true } - - // Determine what we need to downgrade to. - tCtx = ktesting.Begin(tCtx, "get source code version") - gitVersion, _, err := sourceVersion(tCtx, repoRoot) - tCtx.ExpectNoError(err, "determine source code version for repo root %q", repoRoot) - version, err := version.ParseGeneric(gitVersion) - tCtx.ExpectNoError(err, "parse version %s of repo root %q", gitVersion, repoRoot) - major, previousMinor := version.Major(), version.Minor()-1 - if strings.Contains(gitVersion, "-alpha.0") { - // All version up to and including x.y.z-alpha.0 are treated as if we were - // still the previous minor version x.(y-1). There are two reason for this: - // - // - During code freeze around (at?) -rc.0, the master branch already - // identfies itself as the next release with -alpha.0. Without this - // special case, we would change the version skew testing from what - // has been tested and been known to work to something else, which - // can and at least once did break. - // - // - Early in the next cycle the differences compared to the previous - // release are small, so it's more interesting to go back further. - previousMinor-- - } - tCtx.Logf("got version: major: %d, minor: %d, previous minor: %d", major, version.Minor(), previousMinor) - tCtx = ktesting.End(tCtx) - - // KUBERNETES_SERVER_CACHE_DIR can be set to keep downloaded files across test restarts. - binDir, cacheBinaries := os.LookupEnv("KUBERNETES_SERVER_CACHE_DIR") - if !cacheBinaries { - binDir = tCtx.TempDir() - } - haveBinaries := false - - // Get the previous release. - tCtx = ktesting.Begin(tCtx, "get previous release info") - tCtx.Logf("stable release %d.%d", major, previousMinor) - previousURL, previousVersion, err := serverDownloadURL(tCtx, "stable", major, previousMinor) - if errors.Is(err, errHTTP404) { - tCtx.Logf("stable doesn't exist, get latest release %d.%d", major, previousMinor) - previousURL, previousVersion, err = serverDownloadURL(tCtx, "latest", major, previousMinor) - } - tCtx.ExpectNoError(err) - tCtx.Logf("got previous release version: %s, URL: %s", previousVersion, previousURL) - tCtx = ktesting.End(tCtx) - - if cacheBinaries { - binDir = path.Join(binDir, previousVersion) - _, err := os.Stat(path.Join(binDir, string(localupcluster.KubeClusterComponents[0]))) - if err == nil { - haveBinaries = true + } + if !haveBinaries { + tCtx = ktesting.Begin(tCtx, fmt.Sprintf("download and unpack %s", previousURL)) + req, err := http.NewRequestWithContext(tCtx, http.MethodGet, previousURL, nil) + tCtx.ExpectNoError(err, "construct request") + response, err := http.DefaultClient.Do(req) + tCtx.ExpectNoError(err, "download") + defer func() { + _ = response.Body.Close() + }() + decompress, err := gzip.NewReader(response.Body) + tCtx.ExpectNoError(err, "construct gzip reader") + unpack := tar.NewReader(decompress) + for { + header, err := unpack.Next() + if err == io.EOF { + break + } + base := path.Base(header.Name) + if slices.Contains(localupcluster.KubeClusterComponents, localupcluster.KubeComponentName(base)) { + data, err := io.ReadAll(unpack) + tCtx.ExpectNoError(err, fmt.Sprintf("read content of %s", header.Name)) + tCtx.ExpectNoError(os.MkdirAll(binDir, 0755), "create directory for binaries") + tCtx.ExpectNoError(os.WriteFile(path.Join(binDir, base), data, 0555), fmt.Sprintf("write content of %s", header.Name)) } } - if !haveBinaries { - tCtx = ktesting.Begin(tCtx, fmt.Sprintf("download and unpack %s", previousURL)) - req, err := http.NewRequestWithContext(tCtx, http.MethodGet, previousURL, nil) - tCtx.ExpectNoError(err, "construct request") - response, err := http.DefaultClient.Do(req) - tCtx.ExpectNoError(err, "download") - defer func() { - _ = response.Body.Close() - }() - decompress, err := gzip.NewReader(response.Body) - tCtx.ExpectNoError(err, "construct gzip reader") - unpack := tar.NewReader(decompress) - for { - header, err := unpack.Next() - if err == io.EOF { - break - } - base := path.Base(header.Name) - if slices.Contains(localupcluster.KubeClusterComponents, localupcluster.KubeComponentName(base)) { - data, err := io.ReadAll(unpack) - tCtx.ExpectNoError(err, fmt.Sprintf("read content of %s", header.Name)) - tCtx.ExpectNoError(os.MkdirAll(binDir, 0755), "create directory for binaries") - tCtx.ExpectNoError(os.WriteFile(path.Join(binDir, base), data, 0555), fmt.Sprintf("write content of %s", header.Name)) - } - } - tCtx = ktesting.End(tCtx) - } - - tCtx = ktesting.Begin(tCtx, fmt.Sprintf("bring up v%d.%d", major, previousMinor)) - cluster := localupcluster.New(tCtx) - localUpClusterEnv := map[string]string{ - "RUNTIME_CONFIG": "resource.k8s.io/v1beta1,resource.k8s.io/v1beta2", - "FEATURE_GATES": "DynamicResourceAllocation=true", - // *not* needed because driver will run in "local filesystem" mode (= driver.IsLocal): "ALLOW_PRIVILEGED": "1", - } - cluster.Start(tCtx, binDir, localUpClusterEnv) tCtx = ktesting.End(tCtx) + } - restConfig := cluster.LoadConfig(tCtx) - restConfig.UserAgent = fmt.Sprintf("%s -- dra", restclient.DefaultKubernetesUserAgent()) - tCtx = ktesting.WithRESTConfig(tCtx, restConfig) - // TODO: rewrite all DRA test code to use ktesting.TContext once https://github.com/kubernetes/kubernetes/pull/122481 is - // merged, then we don't need to fake a Framework instance. - f := &framework.Framework{ - BaseName: "dra", - Timeouts: framework.NewTimeoutContext(), - ClientSet: tCtx.Client(), - DynamicClient: tCtx.Dynamic(), + tCtx = ktesting.Begin(tCtx, fmt.Sprintf("bring up v%d.%d", major, previousMinor)) + cluster := localupcluster.New(tCtx) + localUpClusterEnv := map[string]string{ + "RUNTIME_CONFIG": "resource.k8s.io/v1beta1,resource.k8s.io/v1beta2", + "FEATURE_GATES": "DynamicResourceAllocation=true", + // *not* needed because driver will run in "local filesystem" mode (= driver.IsLocal): "ALLOW_PRIVILEGED": "1", + } + cluster.Start(tCtx, binDir, localUpClusterEnv) + tCtx = ktesting.End(tCtx) - // The driver containers have to run with sufficient privileges to - // modify /var/lib/kubelet/plugins. - NamespacePodSecurityLevel: admissionapi.LevelPrivileged, - } - f.SetClientConfig(restConfig) + restConfig := cluster.LoadConfig(tCtx) + restConfig.UserAgent = fmt.Sprintf("%s -- dra", restclient.DefaultKubernetesUserAgent()) + tCtx = tCtx.WithRESTConfig(restConfig).WithNamespace("default") - namespace, err := f.CreateNamespace(tCtx, f.BaseName, map[string]string{ - "e2e-framework": f.BaseName, - }) - tCtx.ExpectNoError(err, "create namespace") - f.Namespace = namespace - f.UniqueName = namespace.Name + tCtx = ktesting.Begin(tCtx, fmt.Sprintf("v%d.%d", major, previousMinor)) - tCtx = ktesting.Begin(tCtx, fmt.Sprintf("v%d.%d", major, previousMinor)) + tCtx.ExpectNoError(e2enode.WaitForAllNodesSchedulable(tCtx, tCtx.Client(), 5*time.Minute), "wait for all nodes to be schedulable") + nodes := drautils.NewNodesNow(tCtx, 1, 1) - tCtx.ExpectNoError(e2enode.WaitForAllNodesSchedulable(tCtx, tCtx.Client(), f.Timeouts.NodeSchedulable), "wait for all nodes to be schedulable") - nodes := drautils.NewNodesNow(tCtx, 1, 1) + // Opening sockets locally avoids intermittent errors and delays caused by proxying through the restarted apiserver. + // We could speed up testing by shortening the sync delay in the ResourceSlice controller, but let's better + // test the defaults. + driver := drautils.NewDriverInstance(tCtx) + driver.IsLocal = true + driver.Run(tCtx, "/var/lib/kubelet", nodes, drautils.DriverResourcesNow(nodes, 8)) + b := drautils.NewBuilderNow(tCtx, driver) + b.SkipCleanup = true - // Opening sockets locally avoids intermittent errors and delays caused by proxying through the restarted apiserver. - // We could speed up testing by shortening the sync delay in the ResourceSlice controller, but let's better - // test the defaults. - driver := drautils.NewDriverInstance(tCtx) - driver.IsLocal = true - driver.Run(tCtx, nodes, drautils.DriverResourcesNow(nodes, 8)) - b := drautils.NewBuilderNow(tCtx, driver) + tCtx = ktesting.End(tCtx) - tCtx = ktesting.End(tCtx) - - steps2 := make(map[string]step2Func, len(subTests)) - for subTest, step1 := range subTests { - tCtx = ktesting.Begin(tCtx, subTest) - var result error - func() { - tCtx, finalize := ktesting.WithError(tCtx, &result) - defer finalize() - - // This only gets set in case of success. - steps2[subTest] = step1(tCtx, f, b) - }() - if result != nil { - results[subTest][steps[0]] = result.Error() - } - tCtx = ktesting.End(tCtx) - } - - tCtx = ktesting.Begin(tCtx, fmt.Sprintf("update to %s", gitVersion)) - // We could split this up into first updating the apiserver, then control plane components, then restarting kubelet. - // For the purpose of this test here we we primarily care about full before/after comparisons, so not done yet. - // TODO - restoreOptions := cluster.Modify(tCtx, localupcluster.ModifyOptions{Upgrade: true, BinDir: dir}) - tCtx = ktesting.End(tCtx) - - // The kubelet wipes all ResourceSlices on a restart because it doesn't know which drivers were running. - // Wait for the ResourceSlice controller in the driver to notice and recreate the ResourceSlices. - tCtx = ktesting.Begin(tCtx, "wait for ResourceSlices") - ktesting.Eventually(tCtx, driver.NewGetSlices()).WithTimeout(5 * time.Minute).Should(gomega.HaveField("Items", gomega.HaveLen(len(nodes.NodeNames)))) - tCtx = ktesting.End(tCtx) - - steps3 := make(map[string]step3Func, len(subTests)) - for subTest := range subTests { - step2 := steps2[subTest] - if step2 == nil { - continue - } - tCtx = ktesting.Begin(tCtx, subTest) - var result error - func() { - tCtx, finalize := ktesting.WithError(tCtx, &result) - defer finalize() - - // This only gets set in case of success. - steps3[subTest] = step2(tCtx) - }() - if result != nil { - results[subTest][steps[0]] = result.Error() - } - tCtx = ktesting.End(tCtx) - } - - // Roll back. - tCtx = ktesting.Begin(tCtx, "downgrade") - cluster.Modify(tCtx, restoreOptions) - tCtx = ktesting.End(tCtx) - - // TODO: ensure that kube-controller-manager is up-and-running. - // This works around https://github.com/kubernetes/kubernetes/issues/132334 and can be removed - // once a fix for that is backported. - tCtx = ktesting.Begin(tCtx, "wait for kube-controller-manager") - ktesting.Eventually(tCtx, func(tCtx ktesting.TContext) string { - output, _ := cluster.GetSystemLogs(tCtx, localupcluster.KubeControllerManager) - return output - }).Should(gomega.ContainSubstring(`"Caches are synced" controller="resource_claim"`)) - tCtx = ktesting.End(tCtx) - - for subTest := range subTests { - step3 := steps3[subTest] - if step3 == nil { - continue - } - tCtx = ktesting.Begin(tCtx, subTest) - var result error - func() { - tCtx, finalize := ktesting.WithError(tCtx, &result) - defer finalize() - - step3(tCtx) - }() - if result != nil { - results[subTest][steps[0]] = result.Error() - } - tCtx = ktesting.End(tCtx) + upgradedTestFuncs := make(map[string]upgradedTestFunc, len(subTests)) + tCtx.Run("after-cluster-creation", func(tCtx ktesting.TContext) { + for subTest, f := range subTests { + tCtx.Run(subTest, func(tCtx ktesting.TContext) { + // This only gets set if f doesn't panic because of a fatal error, + // so below we won't continue if step 1 already failed. + // Other sub-tests are not affected. + upgradedTestFuncs[subTest] = f(tCtx, b) + }) } }) - // This runs last because by default Ginkgo does not randomize within - // a top-level container. - for subTest := range subTests { - ginkgo.Context(subTest, func() { - for _, step := range steps { - ginkgo.It(step, func() { - failure := results[subTest][step] - if failure != "" { - // Source code location will be useless here. We can't have both: - // separate test results *and* correct source code location. - // This will become better with testing.T-based unit tests. - _, _ = ginkgo.GinkgoWriter.Write([]byte("For log output see 'DRA upgrade/downgrade works'\n")) - ginkgo.Fail(failure) - } - }) - } - }) - } -}) + tCtx = ktesting.Begin(tCtx, fmt.Sprintf("update to %s", gitVersion)) + // We could split this up into first updating the apiserver, then control plane components, then restarting kubelet. + // For the purpose of this test here we we primarily care about full before/after comparisons, so not done yet. + // TODO + restoreOptions := cluster.Modify(tCtx, localupcluster.ModifyOptions{Upgrade: true, BinDir: dir}) + tCtx = ktesting.End(tCtx) + + // The kubelet wipes all ResourceSlices on a restart because it doesn't know which drivers were running. + // Wait for the ResourceSlice controller in the driver to notice and recreate the ResourceSlices. + tCtx = ktesting.Begin(tCtx, "wait for ResourceSlices") + ktesting.Eventually(tCtx, driver.NewGetSlices()).WithTimeout(5 * time.Minute).Should(gomega.HaveField("Items", gomega.HaveLen(len(nodes.NodeNames)))) + tCtx = ktesting.End(tCtx) + + downgradedTestFuncs := make(map[string]downgradedTestFunc, len(subTests)) + tCtx.Run("after-cluster-upgrade", func(tCtx ktesting.TContext) { + for subTest, f := range upgradedTestFuncs { + tCtx.Run(subTest, func(tCtx ktesting.TContext) { + downgradedTestFuncs[subTest] = f(tCtx) + }) + } + }) + + // Roll back. + tCtx = ktesting.Begin(tCtx, "downgrade") + cluster.Modify(tCtx, restoreOptions) + tCtx = ktesting.End(tCtx) + + // TODO: ensure that kube-controller-manager is up-and-running. + // This works around https://github.com/kubernetes/kubernetes/issues/132334 and can be removed + // once a fix for that is backported. + tCtx = ktesting.Begin(tCtx, "wait for kube-controller-manager") + ktesting.Eventually(tCtx, func(tCtx ktesting.TContext) string { + output, _ := cluster.GetSystemLogs(tCtx, localupcluster.KubeControllerManager) + return output + }).Should(gomega.ContainSubstring(`"Caches are synced" controller="resource_claim"`)) + tCtx = ktesting.End(tCtx) + + tCtx.Run("after-cluster-downgrade", func(tCtx ktesting.TContext) { + for subTest, f := range downgradedTestFuncs { + tCtx.Run(subTest, func(tCtx ktesting.TContext) { + f(tCtx) + }) + } + }) +} // sourceVersion identifies the Kubernetes git version based on hack/print-workspace-status.sh. //