File tree Expand file tree Collapse file tree 3 files changed +29
-2
lines changed
cookbooks/aws-parallelcluster-install/recipes Expand file tree Collapse file tree 3 files changed +29
-2
lines changed Original file line number Diff line number Diff line change @@ -12,6 +12,7 @@ This file is used to list changes made in each version of the AWS ParallelCluste
1212** CHANGES**
1313- Upgrade NVIDIA driver to version 470.141.03.
1414- Upgrade NVIDIA Fabric Manager to version 470.141.03.
15+ - Upgrade NVIDIA CUDA Toolkit to version 11.7.1.
1516- Disable cron job tasks man-db and mlocate, which may have a negative impact on node performance.
1617- Add support for generating Slurm Configuration files for Compute Resources with Multiple Instance Types.
1718- Reduce timeout from 50 to a maximum of 5min in case of DynamoDB connection issues at compute node bootstrap.
Original file line number Diff line number Diff line change 180180# NVIDIA
# NVIDIA driver / CUDA defaults.
# Only the current values are kept: a later `default[...] =` for the same key overrides an
# earlier one, so the superseded 11.4 / 11.4.4 assignments had no effect.
default['cluster']['nvidia']['enabled'] = 'no'
default['cluster']['nvidia']['driver_version'] = '470.141.03'
# CUDA toolkit version (major.minor); the install recipe interpolates it into the
# /usr/local/cuda-<version> path.
default['cluster']['nvidia']['cuda_version'] = '11.7'
# Tag of the NVIDIA/cuda-samples GitHub repository to fetch (interpolated into sample_url below).
# NOTE(review): this is intentionally a separate attribute from cuda_version - confirm that a
# samples tag matching the toolkit version is (un)available before aligning the two.
default['cluster']['nvidia']['cuda_sample_version'] = '11.6'
default['cluster']['nvidia']['driver_url_architecture_id'] = arm_instance? ? 'aarch64' : 'x86_64'
default['cluster']['nvidia']['cuda_url_architecture_id'] = arm_instance? ? 'linux_sbsa' : 'linux'
default['cluster']['nvidia']['driver_url'] = "https://us.download.nvidia.com/tesla/#{node['cluster']['nvidia']['driver_version']}/NVIDIA-Linux-#{node['cluster']['nvidia']['driver_url_architecture_id']}-#{node['cluster']['nvidia']['driver_version']}.run"
# The installer URL pins the full patch release (11.7.1) and the build string embedded in the
# file name (515.65.01); neither is derived from cuda_version, so update them together with it.
default['cluster']['nvidia']['cuda_url'] = "https://developer.download.nvidia.com/compute/cuda/11.7.1/local_installers/cuda_11.7.1_515.65.01_#{node['cluster']['nvidia']['cuda_url_architecture_id']}.run"
default['cluster']['nvidia']['sample_url'] = "https://github.com/NVIDIA/cuda-samples/archive/refs/tags/v#{node['cluster']['nvidia']['cuda_sample_version']}.tar.gz"
188190
189191# NVIDIA fabric-manager
190192# The package name of Fabric Manager for alinux2 and centos7 is nvidia-fabric-manager-version
Original file line number Diff line number Diff line change 8383 creates "/usr/local/cuda-#{ node [ 'cluster' ] [ 'nvidia' ] [ 'cuda_version' ] } "
8484 end
8585
# Get CUDA Sample Files
#
# The samples archive unpacks to a top-level "cuda-samples-<cuda_sample_version>" directory,
# which the script below moves under /usr/local. Both resources are guarded on that final
# directory (the path the script actually produces), so a node that already has the samples
# neither downloads the archive again nor re-runs the unpack step. Guarding on any other path
# makes the script run on every converge, where `mv` fails because the target already exists.
cuda_tmp_samplefile = '/tmp/cuda-sample.tar.gz'
cuda_samples_dirname = "cuda-samples-#{node['cluster']['nvidia']['cuda_sample_version']}"
cuda_samples_dir = "/usr/local/#{cuda_samples_dirname}"

remote_file cuda_tmp_samplefile do
  source node['cluster']['nvidia']['sample_url']
  mode '0644'
  retries 3
  retry_delay 5
  # Skip the download once the samples are installed; the tarball is deleted after unpacking.
  not_if { ::File.exist?(cuda_samples_dir) }
end

# Unpack CUDA Samples
bash 'cuda.sample install' do
  user 'root'
  group 'root'
  cwd '/tmp'
  code <<-CUDA
    set -e
    tar xf "#{cuda_tmp_samplefile}"
    mv "./#{cuda_samples_dirname}" "/usr/local/"
    rm -f "#{cuda_tmp_samplefile}"
  CUDA
  # Must name the directory produced by the `mv` above, otherwise the resource is never idempotent.
  creates cuda_samples_dir
end
109+
86110 cookbook_file 'blacklist-nouveau.conf' do
87111 source 'nvidia/blacklist-nouveau.conf'
88112 path '/etc/modprobe.d/blacklist-nouveau.conf'
You can’t perform that action at this time.
0 commit comments