Skip to content

Commit 13334d8

Browse files
Add workaround for fabric_manager on redhat8
Signed-off-by: Francesco Giordano <[email protected]>
1 parent 9560de6 commit 13334d8

File tree

2 files changed

+23
-2
lines changed

2 files changed

+23
-2
lines changed

cookbooks/aws-parallelcluster-install/recipes/nvidia.rb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
repo_uri = node['cluster']['nvidia']['cuda']['repository_uri'].gsub('_domain_', repo_domain)
2525
add_package_repository("nvidia-repo", repo_uri, "#{repo_uri}/#{node['cluster']['nvidia']['fabricmanager']['repository_key']}", "/")
2626

27-
fabric_manager 'Install Nvidia Fabric Manager' unless redhat8?
27+
fabric_manager 'Install Nvidia Fabric Manager'
2828

2929
nvidia_dcgm 'install datacenter-gpu-manager'
3030

cookbooks/aws-parallelcluster-install/resources/fabric_manager/fabric_manager_redhat8.rb

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,4 +17,25 @@
1717
end
1818

1919
use 'partial/_fabric_manager_common.rb'
20-
use 'partial/_fabric_manager_install_rhel.rb'
20+
# Temporarily commented out to enable the workaround
21+
# use 'partial/_fabric_manager_install_rhel.rb'
22+
23+
# Workaround to download and install nvidia fabric_manager on redhat8 due to bug https://partners.nvidia.com/Bug/ViewBug/4056528
24+
# rpm_package = https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/
25+
# Installs the fabric manager package from an RPM fetched directly from the
# configured CUDA repository URI instead of going through the package
# repository.
action :install_package do
  fm_attrs = node['cluster']['nvidia']['fabricmanager']
  rpm_package = "#{fm_attrs['package']}-#{fm_attrs['version']}-1.x86_64.rpm"

  # Use an absolute destination so the download and the install always agree
  # on where the file lives, regardless of the directory chef-client was
  # started from. Resource names are kept as the bare RPM file name.
  rpm_path = ::File.join(Chef::Config[:file_cache_path], rpm_package)

  # Regions whose name starts with "cn-" use the "cn" domain; all others "com".
  repo_domain = node['cluster']['region'].start_with?("cn-") ? "cn" : "com"
  repo_uri = node['cluster']['nvidia']['cuda']['repository_uri'].gsub('_domain_', repo_domain)

  # Fetch the RPM only when it is not already present at rpm_path.
  remote_file rpm_package do
    path rpm_path
    source "#{repo_uri}/#{rpm_package}"
    mode '0644'
    retries 3
    retry_delay 5
    action :create_if_missing
  end

  # Install from the downloaded local file.
  package rpm_package do
    retries 3
    retry_delay 5
    source rpm_path
  end
end

0 commit comments

Comments
 (0)