test/e2e/node: reduce flakiness in GPU nvidia-smi test

This commit is contained in:
Jiefeng Xu 2026-02-08 22:40:45 -08:00
parent 97a2334637
commit 6e203664eb

View file

@ -153,13 +153,12 @@ var _ = SIGDescribe(feature.GPUDevicePlugin, framework.WithSerial(), "Test using
func createAndValidatePod(ctx context.Context, f *framework.Framework, podClient *e2epod.PodClient, pod *v1.Pod) {
pod = podClient.Create(ctx, pod)
ginkgo.By("Watching for error events or started pod")
ev, err := podClient.WaitForErrorEventOrSuccessWithTimeout(ctx, pod, framework.PodStartTimeout*6)
ginkgo.By("Waiting for pod to start")
err := e2epod.WaitTimeoutForPodRunningInNamespace(ctx, f.ClientSet, pod.Name, f.Namespace.Name, framework.PodStartTimeout*6)
framework.ExpectNoError(err)
gomega.Expect(ev).To(gomega.BeNil())
ginkgo.By("Waiting for pod completion")
err = e2epod.WaitForPodNoLongerRunningInNamespace(ctx, f.ClientSet, pod.Name, f.Namespace.Name)
err = e2epod.WaitTimeoutForPodNoLongerRunningInNamespace(ctx, f.ClientSet, pod.Name, f.Namespace.Name, framework.PodStartTimeout*6)
framework.ExpectNoError(err)
pod, err = podClient.Get(ctx, pod.Name, metav1.GetOptions{})
framework.ExpectNoError(err)
@ -184,9 +183,26 @@ func testNvidiaCLIPod() *v1.Pod {
"bash",
"-c",
`
nvidia-smi
apt-get update -y && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated cuda-demo-suite-12-5
set -euo pipefail
nvidia_smi_ready=false
for i in $(seq 1 12); do
nvidia_smi_output="$(nvidia-smi 2>&1 || true)"
echo "${nvidia_smi_output}"
if [[ "${nvidia_smi_output}" == *"NVIDIA-SMI"* ]]; then
nvidia_smi_ready=true
break
fi
echo "nvidia-smi did not become ready yet (attempt ${i}/12), retrying in 10s"
sleep 10
done
if [[ "${nvidia_smi_ready}" != "true" ]]; then
echo "nvidia-smi never became ready"
exit 1
fi
apt-get update -y -o Acquire::Retries=5 && \
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated -o Acquire::Retries=5 cuda-demo-suite-12-5
/usr/local/cuda/extras/demo_suite/deviceQuery
/usr/local/cuda/extras/demo_suite/vectorAdd
/usr/local/cuda/extras/demo_suite/bandwidthTest --device=all --csv