From 24c232d6b1aeab01407c073334a99bb3fa463902 Mon Sep 17 00:00:00 2001
From: Ed Bartosh <eduard.bartosh@intel.com>
Date: Sun, 22 Mar 2026 18:41:37 +0200
Subject: [PATCH 1/3] localupcluster: properly query /readyz and /healthz

Despite being called checkReadiness, the function was only performing
a liveness check: /healthz was polled over HTTPS without verifying the
certificate or authenticating, and any HTTP response was accepted as a
signal that the component was up. The only exception was kubelet,
where a node readiness check was added on top.

Switch to /readyz for kube-apiserver and kube-scheduler, keep /healthz
for the rest, and require HTTP 200 in all cases.

This ensures that the kube-apiserver is fully initialized before
dependent components are started.
---
 test/utils/localupcluster/localupcluster.go | 52 +++++++++++++++------
 1 file changed, 37 insertions(+), 15 deletions(-)

diff --git a/test/utils/localupcluster/localupcluster.go b/test/utils/localupcluster/localupcluster.go
index 9dd1c45ec5c..3191b20166c 100644
--- a/test/utils/localupcluster/localupcluster.go
+++ b/test/utils/localupcluster/localupcluster.go
@@ -462,17 +462,34 @@ func (c *Cluster) checkReadiness(tCtx ktesting.TContext, cmd *Cmd) {
 	tCtx = tCtx.WithRESTConfig(restConfig)
 	tCtx = tCtx.WithStep(fmt.Sprintf("wait for %s readiness", cmd.Name))
 
+	// For the apiserver we use the admin client certificate with the cluster CA.
+	tlsConfig, err := restclient.TLSConfigFor(restConfig)
+	if err != nil {
+		tCtx.Fatalf("get TLS config for readiness check: %v", err)
+	}
+
+	// The kubelet requires client authentication for /healthz. Use the admin client
+	// certificate with InsecureSkipVerify because the kubelet uses a self-signed cert.
+	tlsConfigWithClientCert := tlsConfig.Clone()
+	tlsConfigWithClientCert.InsecureSkipVerify = true
+
+	// For other components we can skip TLS verification because they use self-signed certs.
+	insecureTLSConfig := &tls.Config{InsecureSkipVerify: true}
+
 	switch {
 	case strings.HasPrefix(cmd.Name, string(KubeAPIServer)):
-		c.checkHealthz(tCtx, cmd, "https", c.settings["API_HOST_IP"], c.settings["API_SECURE_PORT"])
+		c.checkReadyz(tCtx, cmd, "https", c.settings["API_HOST_IP"], c.settings["API_SECURE_PORT"], tlsConfig)
 	case strings.HasPrefix(cmd.Name, string(KubeScheduler)):
-		c.checkHealthz(tCtx, cmd, "https", c.settings["API_HOST_IP"], c.settings["SCHEDULER_SECURE_PORT"])
+		c.checkReadyz(tCtx, cmd, "https", c.settings["API_HOST_IP"], c.settings["SCHEDULER_SECURE_PORT"], insecureTLSConfig)
 	case strings.HasPrefix(cmd.Name, string(KubeControllerManager)):
-		c.checkHealthz(tCtx, cmd, "https", c.settings["API_HOST_IP"], c.settings["KCM_SECURE_PORT"])
+		// TODO: switch to /readyz once it is implemented and available in all tested releases.
+		c.checkHealthz(tCtx, cmd, "https", c.settings["API_HOST_IP"], c.settings["KCM_SECURE_PORT"], insecureTLSConfig)
 	case strings.HasPrefix(cmd.Name, string(KubeProxy)):
-		c.checkHealthz(tCtx, cmd, "http" /* not an error! */, c.settings["API_HOST_IP"], c.settings["PROXY_HEALTHZ_PORT"])
+		// TODO: switch to /readyz once it is implemented and available in all tested releases.
+		c.checkHealthz(tCtx, cmd, "http" /* not an error! */, c.settings["API_HOST_IP"], c.settings["PROXY_HEALTHZ_PORT"], insecureTLSConfig)
 	case strings.HasPrefix(cmd.Name, string(Kubelet)):
-		c.checkHealthz(tCtx, cmd, "https", c.settings["KUBELET_HOST"], c.settings["KUBELET_PORT"])
+		// TODO: switch to /readyz once it is implemented and available in all tested releases.
+		c.checkHealthz(tCtx, cmd, "https", c.settings["KUBELET_HOST"], c.settings["KUBELET_PORT"], tlsConfigWithClientCert)
 
 		// Also wait for the node to be ready.
 		tCtx.WithStep("wait for node ready").Eventually(func(tCtx ktesting.TContext) (*corev1.NodeList, error) {
@@ -484,22 +501,25 @@ func (c *Cluster) checkReadiness(tCtx ktesting.TContext, cmd *Cmd) {
 	}
 }
 
-func (c *Cluster) checkHealthz(tCtx ktesting.TContext, cmd *Cmd, method, hostIP, port string) {
-	url := fmt.Sprintf("%s://%s:%s/healthz", method, hostIP, port)
-	tCtx.WithStep(fmt.Sprintf("check health %s", url)).Eventually(func(tCtx ktesting.TContext) error {
+func (c *Cluster) checkHealthz(tCtx ktesting.TContext, cmd *Cmd, scheme, hostIP, port string, tlsConfig *tls.Config) {
+	c.checkEndpoint(tCtx, cmd, scheme, hostIP, port, "/healthz", tlsConfig)
+}
+
+func (c *Cluster) checkReadyz(tCtx ktesting.TContext, cmd *Cmd, scheme, hostIP, port string, tlsConfig *tls.Config) {
+	c.checkEndpoint(tCtx, cmd, scheme, hostIP, port, "/readyz", tlsConfig)
+}
+
+func (c *Cluster) checkEndpoint(tCtx ktesting.TContext, cmd *Cmd, scheme, hostIP, port, path string, tlsConfig *tls.Config) {
+	url := fmt.Sprintf("%s://%s:%s%s", scheme, hostIP, port, path)
+	tCtx.WithStep(fmt.Sprintf("check %s", url)).Eventually(func(tCtx ktesting.TContext) error {
 		if !cmd.Running() {
 			return gomega.StopTrying(fmt.Sprintf("%s stopped unexpectedly", cmd.Name))
 		}
-		// Like kube::util::wait_for_url in local-up-cluster.sh we use https,
-		// but don't check the certificate.
 		req, err := http.NewRequestWithContext(tCtx, http.MethodGet, url, nil)
 		if err != nil {
 			return fmt.Errorf("create request: %w", err)
 		}
-		tr := &http.Transport{
-			TLSClientConfig: &tls.Config{InsecureSkipVerify: true},
-		}
-		client := &http.Client{Transport: tr}
+		client := &http.Client{Transport: &http.Transport{TLSClientConfig: tlsConfig}}
 		resp, err := client.Do(req)
 		if err != nil {
 			return fmt.Errorf("get %s: %w", url, err)
@@ -507,7 +527,9 @@ func (c *Cluster) checkHealthz(tCtx ktesting.TContext, cmd *Cmd, method, hostIP,
 		if err := resp.Body.Close(); err != nil {
 			return fmt.Errorf("close GET response: %w", err)
 		}
-		// Any response is fine, we just need to get here. In practice, we get a 403 Forbidden.
+		if resp.StatusCode != http.StatusOK {
+			return fmt.Errorf("%s returned %d, waiting for 200", url, resp.StatusCode)
+		}
 		return nil
 	}).Should(gomega.Succeed(), fmt.Sprintf("HTTP GET %s", url))
 }

From 51d0c8843d6789ccbc056a2f5d41c5cfa23147db Mon Sep 17 00:00:00 2001
From: Ed Bartosh <eduard.bartosh@intel.com>
Date: Sun, 22 Mar 2026 18:41:51 +0200
Subject: [PATCH 2/3] localupcluster: set readiness polling interval to 1
 second

Without an explicit interval, Gomega's default polling is very frequent,
generating a large volume of /readyz and /healthz requests in the component
logs. Set an explicit 1-second interval to reduce noise while still
detecting readiness promptly.
---
 test/utils/localupcluster/localupcluster.go | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/utils/localupcluster/localupcluster.go b/test/utils/localupcluster/localupcluster.go
index 3191b20166c..f3a71c3e537 100644
--- a/test/utils/localupcluster/localupcluster.go
+++ b/test/utils/localupcluster/localupcluster.go
@@ -531,7 +531,7 @@ func (c *Cluster) checkEndpoint(tCtx ktesting.TContext, cmd *Cmd, scheme, hostIP
 			return fmt.Errorf("%s returned %d, waiting for 200", url, resp.StatusCode)
 		}
 		return nil
-	}).Should(gomega.Succeed(), fmt.Sprintf("HTTP GET %s", url))
+	}).WithPolling(time.Second).Should(gomega.Succeed(), fmt.Sprintf("HTTP GET %s", url))
 }
 
 func dumpProcesses(tCtx ktesting.TContext) {

From e3aa2b9b2930403456d50d80421b6e241788e695 Mon Sep 17 00:00:00 2001
From: Ed Bartosh <eduard.bartosh@intel.com>
Date: Thu, 26 Mar 2026 11:34:22 +0200
Subject: [PATCH 3/3] test/localupcluster: stop all components before starting
 replacements
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Modify() was replacing components one at a time: stop X, start X, stop Y,
start Y, ... in version-skew order (apiserver last on downgrade). This
caused a crash during downgrade: KCM-1.35 started against the still-
running apiserver-1.36, passed its /healthz, and then immediately lost
its connection when apiserver-1.36 was killed by localupcluster.
KCM-1.35 then reconnected to the not-yet-ready apiserver-1.35, hit a
403 RBAC error during controller initialization, and exited — because
that initialization phase does not retry on RBAC errors.

Fix by splitting Modify() into two phases:

  Phase 1 — stop all components to be replaced, in reverse startup order
  (kube-proxy down to apiserver), so dependent components release their
  connections before the apiserver is stopped.

  Phase 2 — start all replacement components in standard startup order
  (apiserver first), so each component connects to a fully-ready apiserver.
---
 test/e2e_dra/upgradedowngrade_test.go       |  2 +-
 test/utils/localupcluster/localupcluster.go | 45 +++++++++++----------
 2 files changed, 24 insertions(+), 23 deletions(-)

diff --git a/test/e2e_dra/upgradedowngrade_test.go b/test/e2e_dra/upgradedowngrade_test.go
index 5237ea6d07a..d32afeade65 100644
--- a/test/e2e_dra/upgradedowngrade_test.go
+++ b/test/e2e_dra/upgradedowngrade_test.go
@@ -315,7 +315,7 @@ func testUpgradeDowngrade(tCtx ktesting.TContext) {
 	// We could split this up into first updating the apiserver, then control plane components, then restarting kubelet.
 	// For the purpose of this test here we we primarily care about full before/after comparisons, so not done yet.
 	// TODO
-	restoreOptions := cluster.Modify(tCtx.WithStep(fmt.Sprintf("update to %s", gitVersion)), "1-"+gitVersion, localupcluster.ModifyOptions{Upgrade: true, BinDir: dir})
+	restoreOptions := cluster.Modify(tCtx.WithStep(fmt.Sprintf("update to %s", gitVersion)), "1-"+gitVersion, localupcluster.ModifyOptions{BinDir: dir})
 
 	// kubelet wipes all resource slices because it doesn't know which drivers were running.
 	// We need to wait for them to be recreated.
diff --git a/test/utils/localupcluster/localupcluster.go b/test/utils/localupcluster/localupcluster.go
index f3a71c3e537..1e9fa99b9c8 100644
--- a/test/utils/localupcluster/localupcluster.go
+++ b/test/utils/localupcluster/localupcluster.go
@@ -342,10 +342,6 @@ type ModifyOptions struct {
 
 	// FileByComponent overrides BinDir for those components which are specified here.
 	FileByComponent map[KubeComponentName]string
-
-	// Upgrade determines whether the apiserver gets updated first (upgrade)
-	// or last (downgrade).
-	Upgrade bool
 }
 
 func (m ModifyOptions) GetComponentFile(component KubeComponentName) string {
@@ -373,24 +369,19 @@ func (c *Cluster) Modify(tCtx ktesting.TContext, state string, options ModifyOpt
 		FileByComponent: make(map[KubeComponentName]string),
 	}
 
-	restore.Upgrade = !options.Upgrade
-	components := slices.Clone(KubeClusterComponents)
-	if !options.Upgrade {
-		slices.Reverse(components)
-	}
-	for _, component := range components {
-		c.modifyComponent(tCtx, state, options, component, &restore)
-	}
-	return restore
-}
-
-func (c *Cluster) modifyComponent(tCtx ktesting.TContext, state string, options ModifyOptions, component KubeComponentName, restore *ModifyOptions) {
-	tCtx.Helper()
-	tCtx = tCtx.WithStep(fmt.Sprintf("modify %s", component))
-
 	// We could also do things like turning feature gates on or off.
 	// For now we only support replacing the file.
-	if fileName := options.GetComponentFile(component); fileName != "" {
+	updated := make(map[KubeComponentName]*Cmd)
+
+	// Phase 1: stop all components that need modification in reverse order
+	// so that dependent components (KCM, scheduler) are stopped before the
+	// apiserver they depend on.
+	for _, component := range slices.Backward(KubeClusterComponents) {
+		fileName := options.GetComponentFile(component)
+		if fileName == "" {
+			continue
+		}
+		tCtx := tCtx.WithStep(fmt.Sprintf("stop %s", component))
 		cmd, ok := c.running[component]
 		if !ok {
 			tCtx.Fatal("not running")
@@ -418,9 +409,19 @@ func (c *Cluster) modifyComponent(tCtx ktesting.TContext, state string, options
 		cmd.Name = string(component) + "-" + state
 		cmd.CommandLine = cmdLine
 		cmd.LogFile = path.Join(c.dir, fmt.Sprintf("%s-%s.log", component, state))
-
-		c.runComponentWithRetry(tCtx, component, cmd)
+		updated[component] = cmd
 	}
+
+	// Phase 2: start all stopped components in the standard startup order
+	// (apiserver first) so that each component starts against a fully-ready
+	// apiserver.
+	for _, component := range KubeClusterComponents {
+		if cmd, ok := updated[component]; ok {
+			c.runComponentWithRetry(tCtx, component, cmd)
+		}
+	}
+
+	return restore
 }
 
 func (c *Cluster) runComponentWithRetry(tCtx ktesting.TContext, component KubeComponentName, cmd *Cmd) {