/*
Copyright 2024 The Kubernetes Authors.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/
package node
import (
"context"
2025-03-13 18:14:21 -04:00
"fmt"
2024-09-18 16:57:27 -04:00
"os"
2024-09-19 10:11:01 -04:00
"regexp"
2024-09-18 16:57:27 -04:00
"time"
appsv1 "k8s.io/api/apps/v1"
2024-09-17 13:56:04 -04:00
v1 "k8s.io/api/core/v1"
"k8s.io/apimachinery/pkg/api/resource"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/util/uuid"
clientset "k8s.io/client-go/kubernetes"
2024-09-18 16:57:27 -04:00
extensionsinternal "k8s.io/kubernetes/pkg/apis/extensions"
2024-09-17 13:56:04 -04:00
"k8s.io/kubernetes/test/e2e/feature"
"k8s.io/kubernetes/test/e2e/framework"
e2egpu "k8s.io/kubernetes/test/e2e/framework/gpu"
2024-09-19 12:32:57 -04:00
e2ejob "k8s.io/kubernetes/test/e2e/framework/job"
2024-09-18 16:57:27 -04:00
e2emanifest "k8s.io/kubernetes/test/e2e/framework/manifest"
2024-09-17 13:56:04 -04:00
e2enode "k8s.io/kubernetes/test/e2e/framework/node"
e2epod "k8s.io/kubernetes/test/e2e/framework/pod"
2024-09-18 16:57:27 -04:00
e2eresource "k8s.io/kubernetes/test/e2e/framework/resource"
2024-09-17 13:56:04 -04:00
e2eskipper "k8s.io/kubernetes/test/e2e/framework/skipper"
2024-09-18 16:57:27 -04:00
e2etestfiles "k8s.io/kubernetes/test/e2e/framework/testfiles"
2024-09-17 13:56:04 -04:00
admissionapi "k8s.io/pod-security-admission/api"
"github.com/onsi/ginkgo/v2"
"github.com/onsi/gomega"
)
2024-09-19 12:32:57 -04:00
// NOTE: All the tests in this file are run serially because they share a limited set of GPU(s), please inspect
// the CI job definitions to see how many GPU(s) are available in the environment
// Currently the CI jobs have 2 nodes each with 4 Nvidia T4's across both GCE and AWS harness(es).
var _ = SIGDescribe ( feature . GPUDevicePlugin , framework . WithSerial ( ) , "Sanity test using nvidia-smi" , func ( ) {
2024-09-17 13:56:04 -04:00
2024-09-19 12:32:57 -04:00
f := framework . NewDefaultFramework ( "nvidia-gpu1" )
2024-09-17 13:56:04 -04:00
f . NamespacePodSecurityLevel = admissionapi . LevelPrivileged
var podClient * e2epod . PodClient
ginkgo . BeforeEach ( func ( ) {
2024-09-18 16:57:27 -04:00
e2eskipper . SkipUnlessProviderIs ( "aws" , "gce" )
2024-09-17 13:56:04 -04:00
podClient = e2epod . NewPodClient ( f )
} )
2024-09-20 11:40:33 -04:00
f . It ( "should run nvidia-smi and cuda-demo-suite" , func ( ctx context . Context ) {
2024-09-18 16:57:27 -04:00
SetupEnvironmentAndSkipIfNeeded ( ctx , f , f . ClientSet )
2024-09-17 13:56:04 -04:00
pod := testNvidiaCLIPod ( )
ginkgo . By ( "Creating a pod that runs nvidia-smi" )
createAndValidatePod ( ctx , f , podClient , pod )
ginkgo . By ( "Getting logs from the pod" )
log , err := e2epod . GetPodLogs ( ctx , f . ClientSet , f . Namespace . Name , pod . Name , pod . Spec . Containers [ 0 ] . Name )
framework . ExpectNoError ( err )
ginkgo . By ( "Checking output from nvidia-smi" )
gomega . Expect ( log ) . To ( gomega . ContainSubstring ( "NVIDIA-SMI" ) )
gomega . Expect ( log ) . To ( gomega . ContainSubstring ( "Driver Version:" ) )
gomega . Expect ( log ) . To ( gomega . ContainSubstring ( "CUDA Version:" ) )
} )
2024-09-19 12:32:57 -04:00
} )
var _ = SIGDescribe ( feature . GPUDevicePlugin , framework . WithSerial ( ) , "Test using a Pod" , func ( ) {
f := framework . NewDefaultFramework ( "nvidia-gpu2" )
f . NamespacePodSecurityLevel = admissionapi . LevelPrivileged
var podClient * e2epod . PodClient
ginkgo . BeforeEach ( func ( ) {
e2eskipper . SkipUnlessProviderIs ( "aws" , "gce" )
podClient = e2epod . NewPodClient ( f )
} )
2024-09-17 13:56:04 -04:00
f . It ( "should run gpu based matrix multiplication" , func ( ctx context . Context ) {
2024-09-18 16:57:27 -04:00
SetupEnvironmentAndSkipIfNeeded ( ctx , f , f . ClientSet )
2024-09-17 13:56:04 -04:00
pod := testMatrixMultiplicationPod ( )
ginkgo . By ( "Creating a pod that runs matrix multiplication" )
createAndValidatePod ( ctx , f , podClient , pod )
ginkgo . By ( "Getting logs from the pod" )
log , err := e2epod . GetPodLogs ( ctx , f . ClientSet , f . Namespace . Name , pod . Name , pod . Spec . Containers [ 0 ] . Name )
framework . ExpectNoError ( err )
ginkgo . By ( "Checking output from nvidia-smi" )
2024-09-20 11:40:33 -04:00
framework . Logf ( "Got container logs for %s:\n%v" , pod . Spec . Containers [ 0 ] . Name , log )
2024-09-17 13:56:04 -04:00
gomega . Expect ( log ) . To ( gomega . ContainSubstring ( "TensorFlow version" ) )
gomega . Expect ( log ) . To ( gomega . ContainSubstring ( "Matrix multiplication result:" ) )
gomega . Expect ( log ) . To ( gomega . ContainSubstring ( "Time taken for 5000x5000 matrix multiplication" ) )
} )
2024-09-19 12:32:57 -04:00
} )
var _ = SIGDescribe ( feature . GPUDevicePlugin , framework . WithSerial ( ) , "Test using a Job" , func ( ) {
f := framework . NewDefaultFramework ( "nvidia-gpu2" )
f . NamespacePodSecurityLevel = admissionapi . LevelPrivileged
ginkgo . BeforeEach ( func ( ) {
e2eskipper . SkipUnlessProviderIs ( "aws" , "gce" )
} )
2024-09-19 10:11:01 -04:00
f . It ( "should run gpu based jobs" , func ( ctx context . Context ) {
SetupEnvironmentAndSkipIfNeeded ( ctx , f , f . ClientSet )
// Job set to have 5 completions with parallelism of 1 to ensure that it lasts long enough to experience the node recreation
completions := int32 ( 5 )
ginkgo . By ( "Starting GPU job" )
StartJob ( ctx , f , completions )
job , err := e2ejob . GetJob ( ctx , f . ClientSet , f . Namespace . Name , "cuda-add" )
framework . ExpectNoError ( err )
// make sure job is running by waiting for its first pod to start running
2024-09-22 17:24:42 -04:00
err = e2ejob . WaitForJobPodsRunningWithTimeout ( ctx , f . ClientSet , f . Namespace . Name , job . Name , 1 , e2ejob . JobTimeout * 2 )
2024-09-19 10:11:01 -04:00
framework . ExpectNoError ( err )
numNodes , err := e2enode . TotalRegistered ( ctx , f . ClientSet )
framework . ExpectNoError ( err )
_ , err = e2enode . CheckReady ( ctx , f . ClientSet , numNodes , framework . NodeReadyInitialTimeout )
framework . ExpectNoError ( err )
ginkgo . By ( "Waiting for gpu job to finish" )
2024-09-22 17:24:42 -04:00
err = e2ejob . WaitForJobFinishWithTimeout ( ctx , f . ClientSet , f . Namespace . Name , job . Name , e2ejob . JobTimeout * 2 )
2024-09-19 10:11:01 -04:00
framework . ExpectNoError ( err )
ginkgo . By ( "Done with gpu job" )
gomega . Expect ( job . Status . Failed ) . To ( gomega . BeZero ( ) , "Job pods failed during node recreation: %v" , job . Status . Failed )
VerifyJobNCompletions ( ctx , f , completions )
} )
2024-09-17 13:56:04 -04:00
} )
func createAndValidatePod ( ctx context . Context , f * framework . Framework , podClient * e2epod . PodClient , pod * v1 . Pod ) {
pod = podClient . Create ( ctx , pod )
2026-03-01 14:50:57 -05:00
ginkgo . By ( "Waiting for pod to start or complete" )
err := e2epod . WaitForPodCondition ( ctx , f . ClientSet , f . Namespace . Name , pod . Name , "started or completed" , framework . PodStartTimeout * 6 , func ( p * v1 . Pod ) ( bool , error ) {
switch p . Status . Phase {
case v1 . PodRunning , v1 . PodSucceeded , v1 . PodFailed :
return true , nil
default :
return false , nil
}
} )
2024-09-17 13:56:04 -04:00
framework . ExpectNoError ( err )
ginkgo . By ( "Waiting for pod completion" )
2026-02-09 01:40:45 -05:00
err = e2epod . WaitTimeoutForPodNoLongerRunningInNamespace ( ctx , f . ClientSet , pod . Name , f . Namespace . Name , framework . PodStartTimeout * 6 )
2024-09-17 13:56:04 -04:00
framework . ExpectNoError ( err )
pod , err = podClient . Get ( ctx , pod . Name , metav1 . GetOptions { } )
framework . ExpectNoError ( err )
ginkgo . By ( "Checking that the pod succeeded" )
gomega . Expect ( pod . Status . Phase ) . To ( gomega . Equal ( v1 . PodSucceeded ) )
}
// testNvidiaCLIPod builds a pod spec that requests one Nvidia GPU and runs a
// bash script which first polls nvidia-smi until the driver responds (up to
// 12 attempts, 10s apart), then installs and runs the CUDA demo-suite
// sample binaries. The pod never restarts; its logs are inspected by the
// caller for the nvidia-smi banner lines.
func testNvidiaCLIPod() *v1.Pod {
	// Unique name so repeated runs in the same namespace cannot collide.
	podName := "gpu-cli-" + string(uuid.NewUUID())
	pod := v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:        podName,
			Annotations: map[string]string{},
		},
		Spec: v1.PodSpec{
			Containers: []v1.Container{
				{
					Name:  "nvidia-smi",
					Image: "nvidia/cuda:12.5.0-devel-ubuntu22.04",
					Command: []string{
						"bash",
						"-c",
						`
set -euo pipefail
nvidia_smi_ready=false
for i in $(seq 1 12); do
  nvidia_smi_output="$(nvidia-smi 2>&1 || true)"
  echo "${nvidia_smi_output}"
  if [[ "${nvidia_smi_output}" == *"NVIDIA-SMI"* ]]; then
    nvidia_smi_ready=true
    break
  fi
  echo "nvidia-smi did not become ready yet (attempt ${i}/12), retrying in 10s"
  sleep 10
done
if [[ "${nvidia_smi_ready}" != "true" ]]; then
  echo "nvidia-smi never became ready"
  exit 1
fi
apt-get update -y -o Acquire::Retries=5 && \
  DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated -o Acquire::Retries=5 cuda-demo-suite-12-5
/usr/local/cuda/extras/demo_suite/deviceQuery
/usr/local/cuda/extras/demo_suite/vectorAdd
/usr/local/cuda/extras/demo_suite/bandwidthTest --device=all --csv
/usr/local/cuda/extras/demo_suite/busGrind -a
`,
					},
					// Requesting the GPU only in limits; kubelet treats
					// extended resources as limits==requests.
					Resources: v1.ResourceRequirements{
						Limits: v1.ResourceList{
							"nvidia.com/gpu": resource.MustParse("1"),
						},
					},
				},
			},
			RestartPolicy: v1.RestartPolicyNever,
		},
	}
	return &pod
}
// testMatrixMultiplicationPod builds a pod spec that requests one Nvidia GPU
// and runs an inline TensorFlow script: a small correctness matmul followed
// by a timed 5000x5000 matmul. The caller greps the pod logs for the
// "TensorFlow version", "Matrix multiplication result:" and "Time taken"
// markers printed below.
func testMatrixMultiplicationPod() *v1.Pod {
	// Unique name so repeated runs in the same namespace cannot collide.
	podName := "gpu-matmul-" + string(uuid.NewUUID())
	pod := v1.Pod{
		ObjectMeta: metav1.ObjectMeta{
			Name:        podName,
			Annotations: map[string]string{},
		},
		Spec: v1.PodSpec{
			Containers: []v1.Container{
				{
					Name:  "gpu-matmul",
					Image: "tensorflow/tensorflow:latest-gpu",
					Command: []string{
						"python",
						"-c",
						`
import tensorflow as tf
import time
print("TensorFlow version:", tf.__version__)
print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))
# Simple matrix multiplication test
with tf.device('/GPU:0'):
    a = tf.constant([[1.0, 2.0, 3.0], [4.0, 5.0, 6.0]])
    b = tf.constant([[1.0, 2.0], [3.0, 4.0], [5.0, 6.0]])
    c = tf.matmul(a, b)
    print("Matrix multiplication result:", c.numpy())
# Performance test
n = 5000
start_time = time.time()
with tf.device('/GPU:0'):
    matrix1 = tf.random.normal((n, n))
    matrix2 = tf.random.normal((n, n))
    result = tf.matmul(matrix1, matrix2)
end_time = time.time()
print(f"Time taken for {n}x{n} matrix multiplication: {end_time - start_time:.2f} seconds")
`,
					},
					// Extended resources are set via limits; requests default
					// to the same value for "nvidia.com/gpu".
					Resources: v1.ResourceRequirements{
						Limits: v1.ResourceList{
							"nvidia.com/gpu": resource.MustParse("1"),
						},
					},
				},
			},
			RestartPolicy: v1.RestartPolicyNever,
		},
	}
	return &pod
}
2024-09-18 16:57:27 -04:00
func SetupEnvironmentAndSkipIfNeeded ( ctx context . Context , f * framework . Framework , clientSet clientset . Interface ) {
if framework . ProviderIs ( "gce" ) {
2024-09-22 12:46:51 -04:00
SetupNVIDIAGPUNode ( ctx , f )
2024-09-24 09:11:18 -04:00
} else if framework . ProviderIs ( "aws" ) {
// see nvidia-device-plugin.yml in https://github.com/NVIDIA/k8s-device-plugin/tree/main/deployments/static
waitForGPUs ( ctx , f , "kube-system" , "nvidia-device-plugin-daemonset" )
2024-09-18 16:57:27 -04:00
}
2024-09-24 09:11:18 -04:00
2024-09-17 13:56:04 -04:00
nodes , err := e2enode . GetReadySchedulableNodes ( ctx , clientSet )
framework . ExpectNoError ( err )
capacity := 0
allocatable := 0
for _ , node := range nodes . Items {
val , ok := node . Status . Capacity [ e2egpu . NVIDIAGPUResourceName ]
if ! ok {
continue
}
capacity += int ( val . Value ( ) )
val , ok = node . Status . Allocatable [ e2egpu . NVIDIAGPUResourceName ]
if ! ok {
continue
}
allocatable += int ( val . Value ( ) )
}
if capacity == 0 {
2024-09-24 09:11:18 -04:00
framework . Failf ( "%d ready nodes do not have any Nvidia GPU(s). Bailing out..." , len ( nodes . Items ) )
2024-09-17 13:56:04 -04:00
}
if allocatable == 0 {
2024-09-24 09:11:18 -04:00
framework . Failf ( "%d ready nodes do not have any allocatable Nvidia GPU(s). Bailing out..." , len ( nodes . Items ) )
2024-09-17 13:56:04 -04:00
}
}
2024-09-18 16:57:27 -04:00
2025-06-19 03:32:23 -04:00
// isControlPlaneNode reports whether the node is a control-plane node,
// detected either by the well-known role label or by a matching taint key.
func isControlPlaneNode(node v1.Node) bool {
	const controlPlaneKey = "node-role.kubernetes.io/control-plane"

	if _, labeled := node.Labels[controlPlaneKey]; labeled {
		framework.Logf("Node: %q is a control-plane node (label)", node.Name)
		return true
	}
	for i := range node.Spec.Taints {
		if node.Spec.Taints[i].Key == controlPlaneKey {
			framework.Logf("Node: %q is a control-plane node (taint)", node.Name)
			return true
		}
	}
	framework.Logf("Node: %q is NOT a control-plane node", node.Name)
	return false
}
2025-03-13 18:14:21 -04:00
func areGPUsAvailableOnAllSchedulableNodes ( ctx context . Context , clientSet clientset . Interface ) error {
2024-09-18 16:57:27 -04:00
framework . Logf ( "Getting list of Nodes from API server" )
nodeList , err := clientSet . CoreV1 ( ) . Nodes ( ) . List ( ctx , metav1 . ListOptions { } )
2024-11-22 16:26:59 -05:00
if err != nil {
2025-03-13 18:14:21 -04:00
return fmt . Errorf ( "unexpected error getting node list: %w" , err )
2024-11-22 16:26:59 -05:00
}
2024-09-18 16:57:27 -04:00
for _ , node := range nodeList . Items {
2025-06-19 03:32:23 -04:00
if node . Spec . Unschedulable || isControlPlaneNode ( node ) {
2024-09-24 10:09:25 -04:00
continue
}
2024-09-18 16:57:27 -04:00
framework . Logf ( "gpuResourceName %s" , e2egpu . NVIDIAGPUResourceName )
if val , ok := node . Status . Capacity [ e2egpu . NVIDIAGPUResourceName ] ; ! ok || val . Value ( ) == 0 {
2025-03-13 18:14:21 -04:00
return fmt . Errorf ( "nvidia GPUs not available on Node: %q" , node . Name )
2024-09-18 16:57:27 -04:00
}
}
framework . Logf ( "Nvidia GPUs exist on all schedulable nodes" )
2025-03-13 18:14:21 -04:00
return nil
2024-09-18 16:57:27 -04:00
}
// logOSImages logs the OS image of every node in the cluster, as a debugging
// aid for driver-installation failures.
func logOSImages(ctx context.Context, f *framework.Framework) {
	nodeList, err := f.ClientSet.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
	framework.ExpectNoError(err, "getting node list")
	for i := range nodeList.Items {
		framework.Logf("Nodename: %v, OS Image: %v", nodeList.Items[i].Name, nodeList.Items[i].Status.NodeInfo.OSImage)
	}
}
const (
	// driverInstallTimeout bounds the Eventually poll in waitForGPUs.
	// Nvidia driver installation can take upwards of 5 minutes.
	driverInstallTimeout = 10 * time.Minute
)
// SetupNVIDIAGPUNode install Nvidia Drivers and wait for Nvidia GPUs to be available on nodes
func SetupNVIDIAGPUNode(ctx context.Context, f *framework.Framework) {
	logOSImages(ctx, f)

	// Resolve the nvidia-driver-installer daemonset manifest: either from the
	// URL in NVIDIA_DRIVER_INSTALLER_DAEMONSET or from the local test manifest.
	var err error
	var ds *appsv1.DaemonSet
	dsYamlURLFromEnv := os.Getenv("NVIDIA_DRIVER_INSTALLER_DAEMONSET")
	if dsYamlURLFromEnv != "" {
		// Using DaemonSet from remote URL
		framework.Logf("Using remote nvidia-driver-installer daemonset manifest from %v", dsYamlURLFromEnv)
		ds, err = e2emanifest.DaemonSetFromURL(ctx, dsYamlURLFromEnv)
		framework.ExpectNoError(err, "failed get remote")
	} else {
		// Using default local DaemonSet
		framework.Logf("Using default local nvidia-driver-installer daemonset manifest.")
		// NOTE: `data, err :=` deliberately shadows the outer err inside this
		// branch; only `ds` escapes to the enclosing scope.
		data, err := e2etestfiles.Read("test/e2e/testing-manifests/gpu/gce/nvidia-driver-installer.yaml")
		framework.ExpectNoError(err, "failed to read local manifest for nvidia-driver-installer daemonset")
		ds, err = e2emanifest.DaemonSetFromData(data)
		framework.ExpectNoError(err, "failed to parse local manifest for nvidia-driver-installer daemonset")
	}

	// Create the driver installer in the test namespace unless a previous run
	// already left one behind (Get succeeding means it exists).
	prev, err := f.ClientSet.AppsV1().DaemonSets(f.Namespace.Name).Get(ctx, ds.Name, metav1.GetOptions{})
	if err == nil && prev != nil {
		framework.Logf("nvidia-driver-installer Daemonset already installed, skipping...")
	} else {
		ds.Namespace = f.Namespace.Name
		_, err = f.ClientSet.AppsV1().DaemonSets(f.Namespace.Name).Create(ctx, ds, metav1.CreateOptions{})
		framework.ExpectNoError(err, "failed to create nvidia-driver-installer daemonset")
		framework.Logf("Successfully created daemonset to install Nvidia drivers.")
	}

	// Second phase: the GPU device-plugin daemonset. `ds` is reused; this
	// manifest carries its own namespace, which is also used below.
	data, err := e2etestfiles.Read("test/e2e/testing-manifests/gpu/gce/nvidia-gpu-device-plugin.yaml")
	framework.ExpectNoError(err, "failed to read local manifest for nvidia-gpu-device-plugin daemonset")
	ds, err = e2emanifest.DaemonSetFromData(data)
	framework.ExpectNoError(err, "failed to parse local manifest for nvidia-gpu-device-plugin daemonset")
	prev, err = f.ClientSet.AppsV1().DaemonSets(ds.Namespace).Get(ctx, ds.Name, metav1.GetOptions{})
	if err == nil && prev != nil {
		framework.Logf("nvidia-gpu-device-plugin Daemonset already installed, skipping...")
	} else {
		_, err = f.ClientSet.AppsV1().DaemonSets(ds.Namespace).Create(ctx, ds, metav1.CreateOptions{})
		framework.ExpectNoError(err, "failed to create nvidia-gpu-device-plugin daemonset")
		framework.Logf("Successfully created daemonset to install Nvidia device plugin.")
	}

	// Block until every schedulable node reports Nvidia GPU capacity.
	waitForGPUs(ctx, f, ds.Namespace, ds.Name)
}
func waitForGPUs ( ctx context . Context , f * framework . Framework , namespace , name string ) {
pods , err := e2eresource . WaitForControlledPods ( ctx , f . ClientSet , namespace , name , extensionsinternal . Kind ( "DaemonSet" ) )
2024-09-18 16:57:27 -04:00
framework . ExpectNoError ( err , "failed to get pods controlled by the nvidia-driver-installer daemonset" )
devicepluginPods , err := e2eresource . WaitForControlledPods ( ctx , f . ClientSet , "kube-system" , "nvidia-gpu-device-plugin" , extensionsinternal . Kind ( "DaemonSet" ) )
if err == nil {
framework . Logf ( "Adding deviceplugin addon pod." )
pods . Items = append ( pods . Items , devicepluginPods . Items ... )
}
// Wait for Nvidia GPUs to be available on nodes
framework . Logf ( "Waiting for drivers to be installed and GPUs to be available in Node Capacity..." )
2025-03-13 18:14:21 -04:00
gomega . Eventually ( ctx , func ( ctx context . Context ) error {
2024-09-18 16:57:27 -04:00
return areGPUsAvailableOnAllSchedulableNodes ( ctx , f . ClientSet )
2025-03-13 18:14:21 -04:00
} , driverInstallTimeout , time . Second ) . Should ( gomega . Succeed ( ) )
2024-09-18 16:57:27 -04:00
}
2024-09-19 10:11:01 -04:00
// StartJob starts a simple CUDA job that requests gpu and the specified number of completions
func StartJob(ctx context.Context, f *framework.Framework, completions int32) {
	// Generous active deadline (1h) so completions are not cut short while
	// nodes are being recreated during the test.
	var activeSeconds int64 = 3600
	testJob := e2ejob.NewTestJob("succeed", "cuda-add", v1.RestartPolicyAlways, 1, completions, &activeSeconds, 6)
	// Replace the template's default container with a cupy vector-addition
	// workload that requests one Nvidia GPU; its "Test PASSED"/"FAILED"
	// output is what VerifyJobNCompletions greps for.
	testJob.Spec.Template.Spec = v1.PodSpec{
		RestartPolicy: v1.RestartPolicyOnFailure,
		Containers: []v1.Container{
			{
				Name:  "vector-addition",
				Image: "cupy/cupy:v13.3.0",
				Command: []string{
					"python3",
					"-c",
					`
import cupy as cp
import numpy as np
import time

# Set the number of elements to test
num_elements_list = [10, 100, 1000, 10000, 100000, 1000000]

for num_elements in num_elements_list:
    # Create random input vectors on the CPU
    h_A = np.random.rand(num_elements).astype(np.float32)
    h_B = np.random.rand(num_elements).astype(np.float32)

    # Transfer the input vectors to the GPU
    d_A = cp.asarray(h_A)
    d_B = cp.asarray(h_B)

    # Perform vector addition on the GPU
    start_gpu = time.time()
    d_C = d_A + d_B
    gpu_time = time.time() - start_gpu

    # Transfer the result back to the CPU
    h_C = cp.asnumpy(d_C)

    # Compute the expected result on the CPU
    start_cpu = time.time()
    h_C_expected = h_A + h_B
    cpu_time = time.time() - start_cpu

    # Verify the result
    if np.allclose(h_C_expected, h_C, atol=1e-5):
        print(f"GPU time: {gpu_time:.6f} seconds")
        print(f"CPU time: {cpu_time:.6f} seconds")
        print(f"GPU speedup: {cpu_time / gpu_time:.2f}x")
    else:
        print(f"Test FAILED for {num_elements} elements.")

    # Print the first few elements for verification
    print("First few elements of A:", h_A[:5])
    print("First few elements of B:", h_B[:5])
    print("First few elements of C:", h_C[:5])

    print(f"Test PASSED")
`,
				},
				Resources: v1.ResourceRequirements{
					Limits: v1.ResourceList{
						e2egpu.NVIDIAGPUResourceName: *resource.NewQuantity(1, resource.DecimalSI),
					},
				},
			},
		},
	}
	ns := f.Namespace.Name
	_, err := e2ejob.CreateJob(ctx, f.ClientSet, ns, testJob)
	framework.ExpectNoError(err)
	framework.Logf("Created job %v", testJob)
}
// podNames returns the names of the given pods, in order.
func podNames(pods []v1.Pod) []string {
	names := make([]string, 0, len(pods))
	for i := range pods {
		names = append(names, pods[i].ObjectMeta.Name)
	}
	return names
}
// VerifyJobNCompletions verifies that the job has completions number of successful pods
func VerifyJobNCompletions(ctx context.Context, f *framework.Framework, completions int32) {
	ns := f.Namespace.Name
	pods, err := e2ejob.GetJobPods(ctx, f.ClientSet, f.Namespace.Name, "cuda-add")
	framework.ExpectNoError(err)

	createdPodNames := podNames(pods.Items)
	framework.Logf("Got the following pods for job cuda-add: %v", createdPodNames)

	passedRE := regexp.MustCompile("PASSED")
	var successes int32
	for _, podName := range createdPodNames {
		// Let each pod finish before reading its logs.
		e2epod.NewPodClient(f).WaitForFinish(ctx, podName, 5*time.Minute)
		logs, err := e2epod.GetPodLogs(ctx, f.ClientSet, ns, podName, "vector-addition")
		framework.ExpectNoError(err, "Should be able to get logs for pod %v", podName)
		if passedRE.MatchString(logs) {
			successes++
		}
		// Any FAILED marker in any pod is an immediate test failure.
		gomega.Expect(logs).To(gomega.Not(gomega.ContainSubstring("FAILED")))
	}

	if successes != completions {
		framework.Failf("Only got %v completions. Expected %v completions.", successes, completions)
	}
}