mirror of
https://github.com/kubernetes/kubernetes.git
synced 2026-05-28 04:04:39 -04:00
test/e2e/node: reduce flakiness in GPU nvidia-smi test
This commit is contained in:
parent
97a2334637
commit
6e203664eb
1 changed file with 23 additions and 7 deletions
|
|
@@ -153,13 +153,12 @@ var _ = SIGDescribe(feature.GPUDevicePlugin, framework.WithSerial(), "Test using
|
|||
func createAndValidatePod(ctx context.Context, f *framework.Framework, podClient *e2epod.PodClient, pod *v1.Pod) {
|
||||
pod = podClient.Create(ctx, pod)
|
||||
|
||||
ginkgo.By("Watching for error events or started pod")
|
||||
ev, err := podClient.WaitForErrorEventOrSuccessWithTimeout(ctx, pod, framework.PodStartTimeout*6)
|
||||
ginkgo.By("Waiting for pod to start")
|
||||
err := e2epod.WaitTimeoutForPodRunningInNamespace(ctx, f.ClientSet, pod.Name, f.Namespace.Name, framework.PodStartTimeout*6)
|
||||
framework.ExpectNoError(err)
|
||||
gomega.Expect(ev).To(gomega.BeNil())
|
||||
|
||||
ginkgo.By("Waiting for pod completion")
|
||||
err = e2epod.WaitForPodNoLongerRunningInNamespace(ctx, f.ClientSet, pod.Name, f.Namespace.Name)
|
||||
err = e2epod.WaitTimeoutForPodNoLongerRunningInNamespace(ctx, f.ClientSet, pod.Name, f.Namespace.Name, framework.PodStartTimeout*6)
|
||||
framework.ExpectNoError(err)
|
||||
pod, err = podClient.Get(ctx, pod.Name, metav1.GetOptions{})
|
||||
framework.ExpectNoError(err)
|
||||
|
|
@@ -184,9 +183,26 @@ func testNvidiaCLIPod() *v1.Pod {
|
|||
"bash",
|
||||
"-c",
|
||||
`
|
||||
nvidia-smi
|
||||
apt-get update -y && \
|
||||
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated cuda-demo-suite-12-5
|
||||
set -euo pipefail
|
||||
|
||||
nvidia_smi_ready=false
|
||||
for i in $(seq 1 12); do
|
||||
nvidia_smi_output="$(nvidia-smi 2>&1 || true)"
|
||||
echo "${nvidia_smi_output}"
|
||||
if [[ "${nvidia_smi_output}" == *"NVIDIA-SMI"* ]]; then
|
||||
nvidia_smi_ready=true
|
||||
break
|
||||
fi
|
||||
echo "nvidia-smi did not become ready yet (attempt ${i}/12), retrying in 10s"
|
||||
sleep 10
|
||||
done
|
||||
if [[ "${nvidia_smi_ready}" != "true" ]]; then
|
||||
echo "nvidia-smi never became ready"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
apt-get update -y -o Acquire::Retries=5 && \
|
||||
DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated -o Acquire::Retries=5 cuda-demo-suite-12-5
|
||||
/usr/local/cuda/extras/demo_suite/deviceQuery
|
||||
/usr/local/cuda/extras/demo_suite/vectorAdd
|
||||
/usr/local/cuda/extras/demo_suite/bandwidthTest --device=all --csv
|
||||
|
|
|
|||
Loading…
Reference in a new issue