Skip to content

Commit 994d6a3

Browse files
Change CUDA install validation test to support both v11.4 and v11.x > 11.4 on ARM (#4305)
Starting with CUDA Toolkit v11.5, the samples, which include the deviceQuery source code, were moved out of the CUDA install package and into their own GitHub repository. The structure of the sample directory tree also changed, thus requiring different paths for compiling and executing the deviceQuery utility. Co-authored-by: Luca Carrogu <[email protected]>
1 parent a32bafb commit 994d6a3

File tree

1 file changed

+34
-4
lines changed

1 file changed

+34
-4
lines changed

cli/src/pcluster/resources/imagebuilder/parallelcluster_validate.yaml

Lines changed: 34 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -167,6 +167,36 @@ phases:
167167
VERSION=$(echo ${PATTERN} | tr -d '\n' | cut -d = -f 2 | xargs)
168168
echo ${VERSION}
169169
170+
- name: CudaSamplesSrcDir
171+
action: ExecuteBash
172+
inputs:
173+
commands:
174+
- |
175+
set -v
176+
cuda_ver="{{ validate.CudaVersion.outputs.stdout }}"
177+
if [ ${cuda_ver} \> '11.4' ]; then
178+
PATTERN=$(grep -F "default['cluster']['nvidia']['cuda_sample_version']" {{ CookbookDefaultFile }})
179+
VERSION=$(echo ${PATTERN} | tr -d '\n' | cut -d = -f 2 | xargs)
180+
echo cuda-samples-${VERSION}/Samples
181+
else
182+
echo cuda-${cuda_ver}/samples
183+
fi
184+
185+
- name: CudaSamplesBinDir
186+
action: ExecuteBash
187+
inputs:
188+
commands:
189+
- |
190+
set -v
191+
cuda_ver="{{ validate.CudaVersion.outputs.stdout }}"
192+
if [ ${cuda_ver} \> '11.4' ]; then
193+
PATTERN=$(grep -F "default['cluster']['nvidia']['cuda_sample_version']" {{ CookbookDefaultFile }})
194+
VERSION=$(echo ${PATTERN} | tr -d '\n' | cut -d = -f 2 | xargs)
195+
echo cuda-samples-${VERSION}/bin
196+
else
197+
echo cuda-${cuda_ver}/samples/bin
198+
fi
199+
170200
- name: ArmPLVersion
171201
action: ExecuteBash
172202
inputs:
@@ -280,7 +310,7 @@ phases:
280310
apt list --installed | grep "nvidia-fabric.*manager" | grep "${nvidia_driver_version}" || exit 1
281311
apt-mark showhold | grep "nvidia-fabric.*manager" || exit 1
282312
fi
283-
echo "Fabric Manager match Nvidia driver and version is locked"
313+
echo "Fabric Manager match Nvidia driver and version is locked"
284314
fi
285315
286316
echo "Testing CUDA installation with nvcc"
@@ -294,11 +324,11 @@ phases:
294324
echo "Testing CUDA with deviceQuery..."
295325
if [ {{ validate.OperatingSystemArchitecture.outputs.stdout }} != 'arm64' ]; then
296326
/usr/local/cuda-${cuda_ver}/extras/demo_suite/deviceQuery | grep -o "Result = PASS"
297-
[[ $? -ne 0 ]] && echo "CUDA deviceQuery test failed" && exit 1
327+
[[ $? -ne 0 ]] && echo "CUDA deviceQuery test failed" && exit 1
298328
else
299-
cd /usr/local/cuda-${cuda_ver}/samples/1_Utilities/deviceQuery
329+
cd /usr/local/{{ validate.CudaSamplesSrcDir.outputs.stdout }}/1_Utilities/deviceQuery
300330
make
301-
/usr/local/cuda-${cuda_ver}/samples/bin/sbsa/linux/release/deviceQuery | grep -o "Result = PASS"
331+
/usr/local/{{ validate.CudaSamplesBinDir.outputs.stdout }}/sbsa/linux/release/deviceQuery | grep -o "Result = PASS"
302332
[[ $? -ne 0 ]] && echo "CUDA deviceQuery test failed" && exit 1
303333
fi
304334
echo "CUDA deviceQuery test passed"

0 commit comments

Comments
 (0)