Skip to content

Commit 19ed4f7

Browse files
committed
Upgrade NVIDIA CUDA toolkit from 11.4.4 to 11.7.1
1 parent 43e79e9 commit 19ed4f7

File tree

3 files changed

+29
-2
lines changed

3 files changed

+29
-2
lines changed

CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste
1212
**CHANGES**
1313
- Upgrade NVIDIA driver to version 470.141.03.
1414
- Upgrade NVIDIA Fabric Manager to version 470.141.03.
15+
- Upgrade NVIDIA CUDA Toolkit to version 11.7.1.
1516
- Disable cron job tasks man-db and mlocate, which may have a negative impact on node performance.
1617
- Add support for generating Slurm Configuration files for Compute Resources with Multiple Instance Types.
1718
- Reduce timeout from 50 to a maximum of 5min in case of DynamoDB connection issues at compute node bootstrap.

attributes/default.rb

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -180,11 +180,13 @@
180180
# NVIDIA
# Default attributes for the NVIDIA driver, CUDA toolkit and CUDA samples downloads.
181181
default['cluster']['nvidia']['enabled'] = 'no'
182182
default['cluster']['nvidia']['driver_version'] = '470.141.03'
183-
default['cluster']['nvidia']['cuda_version'] = '11.4'
183+
# The install recipe interpolates this value into the "/usr/local/cuda-<cuda_version>" path.
default['cluster']['nvidia']['cuda_version'] = '11.7'
184+
# Version of the CUDA samples archive; a separate attribute from cuda_version, and its value differs.
# It is interpolated into sample_url below as the "v<version>" tag.
default['cluster']['nvidia']['cuda_sample_version'] = '11.6'
184185
default['cluster']['nvidia']['driver_url_architecture_id'] = arm_instance? ? 'aarch64' : 'x86_64'
185186
default['cluster']['nvidia']['cuda_url_architecture_id'] = arm_instance? ? 'linux_sbsa' : 'linux'
186187
default['cluster']['nvidia']['driver_url'] = "https://us.download.nvidia.com/tesla/#{node['cluster']['nvidia']['driver_version']}/NVIDIA-Linux-#{node['cluster']['nvidia']['driver_url_architecture_id']}-#{node['cluster']['nvidia']['driver_version']}.run"
187-
default['cluster']['nvidia']['cuda_url'] = "https://developer.download.nvidia.com/compute/cuda/11.4.4/local_installers/cuda_11.4.4_470.82.01_#{node['cluster']['nvidia']['cuda_url_architecture_id']}.run"
188+
# NOTE: the full release (11.7.1) and the 515.65.01 component are literal text in this URL; only the
# architecture id is interpolated, so the URL must be edited by hand whenever cuda_version changes.
default['cluster']['nvidia']['cuda_url'] = "https://developer.download.nvidia.com/compute/cuda/11.7.1/local_installers/cuda_11.7.1_515.65.01_#{node['cluster']['nvidia']['cuda_url_architecture_id']}.run"
189+
# Tarball of the NVIDIA/cuda-samples GitHub repository at the tag "v<cuda_sample_version>".
default['cluster']['nvidia']['sample_url'] = "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v#{node['cluster']['nvidia']['cuda_sample_version']}.tar.gz"
188190

189191
# NVIDIA fabric-manager
190192
# The package name of Fabric Manager for alinux2 and centos7 is nvidia-fabric-manager-version

cookbooks/aws-parallelcluster-install/recipes/nvidia.rb

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,30 @@
8383
creates "/usr/local/cuda-#{node['cluster']['nvidia']['cuda_version']}"
8484
end
8585

86+
# Get CUDA Sample Files
#
# Downloads the samples tarball from node['cluster']['nvidia']['sample_url'] and moves the extracted
# "cuda-samples-<cuda_sample_version>" directory under /usr/local.
#
# Both resources are guarded on the directory this script actually produces
# (/usr/local/cuda-samples-<cuda_sample_version>). The guard used to check
# /usr/local/cuda-<cuda_version>/samples, a path these resources never create, so every
# converge re-downloaded the archive and the `mv` then failed on the already-existing target.
# NOTE(review): assumes no other step populates /usr/local/cuda-<cuda_version>/samples -- confirm.
cuda_sample_version = node['cluster']['nvidia']['cuda_sample_version']
cuda_samples_dir = "/usr/local/cuda-samples-#{cuda_sample_version}"
cuda_tmp_samplefile = "/tmp/cuda-sample.tar.gz"
remote_file cuda_tmp_samplefile do
  source node['cluster']['nvidia']['sample_url']
  mode '0644'
  retries 3
  retry_delay 5
  # Skip the download when the samples are already installed.
  not_if { ::File.exist?(cuda_samples_dir) }
end

# Unpack CUDA Samples
bash 'cuda.sample install' do
  user 'root'
  group 'root'
  cwd '/tmp'
  code <<-CUDA
    set -e
    tar xf #{cuda_tmp_samplefile}
    mv "./cuda-samples-#{cuda_sample_version}" "/usr/local/"
    rm -f #{cuda_tmp_samplefile}
  CUDA
  # Idempotency guard: must name the directory the `mv` above creates.
  creates cuda_samples_dir
end
109+
86110
cookbook_file 'blacklist-nouveau.conf' do
87111
source 'nvidia/blacklist-nouveau.conf'
88112
path '/etc/modprobe.d/blacklist-nouveau.conf'

0 commit comments

Comments
 (0)