diff --git a/test/e2e/node/gpu.go b/test/e2e/node/gpu.go index d1cd1037972..1f1a4a409bf 100644 --- a/test/e2e/node/gpu.go +++ b/test/e2e/node/gpu.go @@ -153,13 +153,12 @@ var _ = SIGDescribe(feature.GPUDevicePlugin, framework.WithSerial(), "Test using func createAndValidatePod(ctx context.Context, f *framework.Framework, podClient *e2epod.PodClient, pod *v1.Pod) { pod = podClient.Create(ctx, pod) - ginkgo.By("Watching for error events or started pod") - ev, err := podClient.WaitForErrorEventOrSuccessWithTimeout(ctx, pod, framework.PodStartTimeout*6) + ginkgo.By("Waiting for pod to start") + err := e2epod.WaitTimeoutForPodRunningInNamespace(ctx, f.ClientSet, pod.Name, f.Namespace.Name, framework.PodStartTimeout*6) framework.ExpectNoError(err) - gomega.Expect(ev).To(gomega.BeNil()) ginkgo.By("Waiting for pod completion") - err = e2epod.WaitForPodNoLongerRunningInNamespace(ctx, f.ClientSet, pod.Name, f.Namespace.Name) + err = e2epod.WaitTimeoutForPodNoLongerRunningInNamespace(ctx, f.ClientSet, pod.Name, f.Namespace.Name, framework.PodStartTimeout*6) framework.ExpectNoError(err) pod, err = podClient.Get(ctx, pod.Name, metav1.GetOptions{}) framework.ExpectNoError(err) @@ -184,9 +183,26 @@ func testNvidiaCLIPod() *v1.Pod { "bash", "-c", ` -nvidia-smi -apt-get update -y && \ - DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated cuda-demo-suite-12-5 +set -euo pipefail + +nvidia_smi_ready=false +for i in $(seq 1 12); do + nvidia_smi_output="$(nvidia-smi 2>&1 || true)" + echo "${nvidia_smi_output}" + if [[ "${nvidia_smi_output}" == *"NVIDIA-SMI"* ]]; then + nvidia_smi_ready=true + break + fi + echo "nvidia-smi did not become ready yet (attempt ${i}/12), retrying in 10s" + sleep 10 +done +if [[ "${nvidia_smi_ready}" != "true" ]]; then + echo "nvidia-smi never became ready" + exit 1 +fi + +apt-get update -y -o Acquire::Retries=5 && \ + DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated -o Acquire::Retries=5 cuda-demo-suite-12-5 /usr/local/cuda/extras/demo_suite/deviceQuery /usr/local/cuda/extras/demo_suite/vectorAdd /usr/local/cuda/extras/demo_suite/bandwidthTest --device=all --csv