Skip to content

Commit a29492d

Browse files
authored
Adds infra to use nvidia dependencies from pypi and cleans up patches (#1248)
* Installs NCCL from redist, uses system NCCL, and adds pypi RPATH
* Cleans up nvrtc patches and adds it using main script
* Fixes typo
* Adds more dependencies and builds torch with dynamic linking
* NCCL dirs have to be specified; otherwise a different version gets picked up
* Handles 11.8
* Adds echo message for nccl 2.15
* Fixes logic for 11.8 and adds missing names for DEPS_SONAME
1 parent 2b8d2eb commit a29492d

File tree

3 files changed

+62
-80
lines changed

3 files changed

+62
-80
lines changed

common/install_cuda.sh

Lines changed: 22 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ function install_116 {
2424
}
2525

2626
function install_117 {
27-
echo "Installing CUDA 11.7 and CuDNN 8.5"
27+
echo "Installing CUDA 11.7 and CuDNN 8.5 and NCCL 2.14"
2828
rm -rf /usr/local/cuda-11.7 /usr/local/cuda
2929
# install CUDA 11.7.0 in the same container
3030
wget -q https://developer.download.nvidia.com/compute/cuda/11.7.0/local_installers/cuda_11.7.0_515.43.04_linux.run
@@ -42,10 +42,20 @@ function install_117 {
4242
cd ..
4343
rm -rf tmp_cudnn
4444
ldconfig
45+
46+
# NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
47+
mkdir tmp_nccl && cd tmp_nccl
48+
wget -q https://developer.download.nvidia.com/compute/redist/nccl/v2.14/nccl_2.14.3-1+cuda11.7_x86_64.txz
49+
tar xf nccl_2.14.3-1+cuda11.7_x86_64.txz
50+
cp -a nccl_2.14.3-1+cuda11.7_x86_64/include/* /usr/local/cuda/include/
51+
cp -a nccl_2.14.3-1+cuda11.7_x86_64/lib/* /usr/local/cuda/lib64/
52+
cd ..
53+
rm -rf tmp_nccl
54+
ldconfig
4555
}
4656

4757
function install_118 {
48-
echo "Installing CUDA 11.8 and cuDNN 8.5"
58+
echo "Installing CUDA 11.8 and cuDNN 8.5 and NCCL 2.15"
4959
rm -rf /usr/local/cuda-11.8 /usr/local/cuda
5060
# install CUDA 11.8.0 in the same container
5161
wget -q https://developer.download.nvidia.com/compute/cuda/11.8.0/local_installers/cuda_11.8.0_520.61.05_linux.run
@@ -63,6 +73,16 @@ function install_118 {
6373
cd ..
6474
rm -rf tmp_cudnn
6575
ldconfig
76+
77+
# NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
78+
mkdir tmp_nccl && cd tmp_nccl
79+
wget -q https://developer.download.nvidia.com/compute/redist/nccl/v2.15.5/nccl_2.15.5-1+cuda11.8_x86_64.txz
80+
tar xf nccl_2.15.5-1+cuda11.8_x86_64.txz
81+
cp -a nccl_2.15.5-1+cuda11.8_x86_64/include/* /usr/local/cuda/include/
82+
cp -a nccl_2.15.5-1+cuda11.8_x86_64/lib/* /usr/local/cuda/lib64/
83+
cd ..
84+
rm -rf tmp_nccl
85+
ldconfig
6686
}
6787

6888
function prune_116 {

manywheel/build_cuda.sh

Lines changed: 40 additions & 64 deletions
Original file line numberDiff line numberDiff line change
@@ -142,76 +142,14 @@ DEPS_SONAME=(
142142
"libcublasLt.so.11"
143143
"libgomp.so.1"
144144
)
145-
elif [[ $CUDA_VERSION == "11.7" ]]; then
145+
elif [[ $CUDA_VERSION == "11.7" || $CUDA_VERSION == "11.8" ]]; then
146146
export USE_STATIC_CUDNN=0
147147
# Try parallelizing nvcc as well
148148
export TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2"
149149
DEPS_LIST=(
150-
"/usr/local/cuda/lib64/libcudart.so.11.0"
151-
"/usr/local/cuda/lib64/libnvToolsExt.so.1"
152-
"/usr/local/cuda/lib64/libnvrtc.so.11.2" # this is not a mistake for 11.7, it links to 11.7.50
153-
"/usr/local/cuda/lib64/libnvrtc-builtins.so.11.7"
154150
"$LIBGOMP_PATH"
155151
)
156152
DEPS_SONAME=(
157-
"libcudart.so.11.0"
158-
"libnvToolsExt.so.1"
159-
"libnvrtc.so.11.2"
160-
"libnvrtc-builtins.so.11.7"
161-
"libgomp.so.1"
162-
)
163-
164-
if [[ -z "$PYTORCH_EXTRA_INSTALL_REQUIREMENTS" ]]; then
165-
echo "Bundling with cudnn and cublas."
166-
DEPS_LIST+=(
167-
"/usr/local/cuda/lib64/libcudnn_adv_infer.so.8"
168-
"/usr/local/cuda/lib64/libcudnn_adv_train.so.8"
169-
"/usr/local/cuda/lib64/libcudnn_cnn_infer.so.8"
170-
"/usr/local/cuda/lib64/libcudnn_cnn_train.so.8"
171-
"/usr/local/cuda/lib64/libcudnn_ops_infer.so.8"
172-
"/usr/local/cuda/lib64/libcudnn_ops_train.so.8"
173-
"/usr/local/cuda/lib64/libcudnn.so.8"
174-
"/usr/local/cuda/lib64/libcublas.so.11"
175-
"/usr/local/cuda/lib64/libcublasLt.so.11"
176-
)
177-
DEPS_SONAME+=(
178-
"libcudnn_adv_infer.so.8"
179-
"libcudnn_adv_train.so.8"
180-
"libcudnn_cnn_infer.so.8"
181-
"libcudnn_cnn_train.so.8"
182-
"libcudnn_ops_infer.so.8"
183-
"libcudnn_ops_train.so.8"
184-
"libcudnn.so.8"
185-
"libcublas.so.11"
186-
"libcublasLt.so.11"
187-
)
188-
else
189-
echo "Using cudnn and cublas from pypi."
190-
CUDA_RPATHS=(
191-
'$ORIGIN/../../nvidia/cublas/lib'
192-
'$ORIGIN/../../nvidia/cudnn/lib'
193-
)
194-
CUDA_RPATHS=$(IFS=: ; echo "${CUDA_RPATHS[*]}")
195-
export C_SO_RPATH=$CUDA_RPATHS':$ORIGIN:$ORIGIN/lib'
196-
export LIB_SO_RPATH=$CUDA_RPATHS':$ORIGIN'
197-
export FORCE_RPATH="--force-rpath"
198-
fi
199-
elif [[ $CUDA_VERSION == "11.8" ]]; then
200-
export USE_STATIC_CUDNN=0
201-
# Try parallelizing nvcc as well
202-
export TORCH_NVCC_FLAGS="-Xfatbin -compress-all --threads 2"
203-
DEPS_LIST=(
204-
"/usr/local/cuda/lib64/libcudart.so.11.0"
205-
"/usr/local/cuda/lib64/libnvToolsExt.so.1"
206-
"/usr/local/cuda/lib64/libnvrtc.so.11.2" # this is not a mistake for 11.8, it links to 11.8.89
207-
"/usr/local/cuda/lib64/libnvrtc-builtins.so.11.8"
208-
"$LIBGOMP_PATH"
209-
)
210-
DEPS_SONAME=(
211-
"libcudart.so.11.0"
212-
"libnvToolsExt.so.1"
213-
"libnvrtc.so.11.2"
214-
"libnvrtc-builtins.so.11.8"
215153
"libgomp.so.1"
216154
)
217155

@@ -227,6 +165,9 @@ elif [[ $CUDA_VERSION == "11.8" ]]; then
227165
"/usr/local/cuda/lib64/libcudnn.so.8"
228166
"/usr/local/cuda/lib64/libcublas.so.11"
229167
"/usr/local/cuda/lib64/libcublasLt.so.11"
168+
"/usr/local/cuda/lib64/libcudart.so.11.0"
169+
"/usr/local/cuda/lib64/libnvToolsExt.so.1"
170+
"/usr/local/cuda/lib64/libnvrtc.so.11.2" # this is not a mistake, it links to more specific cuda version
230171
)
231172
DEPS_SONAME+=(
232173
"libcudnn_adv_infer.so.8"
@@ -238,17 +179,52 @@ elif [[ $CUDA_VERSION == "11.8" ]]; then
238179
"libcudnn.so.8"
239180
"libcublas.so.11"
240181
"libcublasLt.so.11"
182+
"libcudart.so.11.0"
183+
"libnvToolsExt.so.1"
184+
"libnvrtc.so.11.2"
241185
)
186+
if [[ $CUDA_VERSION == "11.7" ]]; then
187+
DEPS_LIST+=(
188+
"/usr/local/cuda/lib64/libnvrtc-builtins.so.11.7"
189+
)
190+
DEPS_SONAME+=(
191+
"libnvrtc-builtins.so.11.7"
192+
)
193+
fi
194+
if [[ $CUDA_VERSION == "11.8" ]]; then
195+
DEPS_LIST+=(
196+
"/usr/local/cuda/lib64/libnvrtc-builtins.so.11.8"
197+
)
198+
DEPS_SONAME+=(
199+
"libnvrtc-builtins.so.11.8"
200+
)
201+
fi
242202
else
243-
echo "Using cudnn and cublas from pypi."
203+
echo "Using nvidia libs from pypi."
244204
CUDA_RPATHS=(
245205
'$ORIGIN/../../nvidia/cublas/lib'
206+
'$ORIGIN/../../nvidia/cuda_cupti/lib'
207+
'$ORIGIN/../../nvidia/cuda_nvrtc/lib'
208+
'$ORIGIN/../../nvidia/cuda_runtime/lib'
246209
'$ORIGIN/../../nvidia/cudnn/lib'
210+
'$ORIGIN/../../nvidia/cufft/lib'
211+
'$ORIGIN/../../nvidia/curand/lib'
212+
'$ORIGIN/../../nvidia/cusolver/lib'
213+
'$ORIGIN/../../nvidia/cusparse/lib'
214+
'$ORIGIN/../../nvidia/nccl/lib'
215+
'$ORIGIN/../../nvidia/nvtx/lib'
247216
)
248217
CUDA_RPATHS=$(IFS=: ; echo "${CUDA_RPATHS[*]}")
249218
export C_SO_RPATH=$CUDA_RPATHS':$ORIGIN:$ORIGIN/lib'
250219
export LIB_SO_RPATH=$CUDA_RPATHS':$ORIGIN'
251220
export FORCE_RPATH="--force-rpath"
221+
export USE_STATIC_NCCL=0
222+
export USE_SYSTEM_NCCL=1
223+
export ATEN_STATIC_CUDA=0
224+
export USE_CUDA_STATIC_LINK=0
225+
export USE_CUPTI_SO=1
226+
export NCCL_INCLUDE_DIR="/usr/local/cuda/include/"
227+
export NCCL_LIB_DIR="/usr/local/cuda/lib64/"
252228
fi
253229
else
254230
echo "Unknown cuda version $CUDA_VERSION"

release/pypi/prep_binary_for_pypi.sh

Lines changed: 0 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -56,22 +56,8 @@ for whl_file in "$@"; do
5656
if [[ $whl_file == *"with.pypi.cudnn"* ]]; then
5757
rm -rf "${whl_dir}/caffe2"
5858
rm -rf "${whl_dir}"/torch/lib/libnvrtc*
59-
sed -i -e "s/Requires-Dist: nvidia-cuda-runtime-cu11/Requires-Dist: nvidia-cuda-runtime-cu11 (==11.7.99)/" "${whl_dir}"/*/METADATA
60-
sed -i -e "/^Requires-Dist: nvidia-cublas-cu11 (==11.10.3.66).*/a Requires-Dist: nvidia-cuda-nvrtc-cu11 (==11.7.99) ; platform_system == \"Linux\"" "${whl_dir}"/*/METADATA
6159

6260
sed -i -e "s/-with-pypi-cudnn//g" "${whl_dir}/torch/version.py"
63-
find "${whl_dir}/torch/" -maxdepth 1 -type f -name "*.so*" | while read sofile; do
64-
patchelf --set-rpath '$ORIGIN/../../nvidia/cublas/lib:$ORIGIN/../../nvidia/cudnn/lib:$ORIGIN/../../nvidia/cuda_nvrtc/lib:$ORIGIN:$ORIGIN/lib' \
65-
--force-rpath $sofile
66-
patchelf --print-rpath $sofile
67-
done
68-
69-
find "${whl_dir}/torch/lib" -maxdepth 1 -type f -name "*.so*" | while read sofile; do
70-
patchelf --set-rpath '$ORIGIN/../../nvidia/cublas/lib:$ORIGIN/../../nvidia/cudnn/lib:$ORIGIN/../../nvidia/cuda_nvrtc/lib:$ORIGIN' \
71-
--force-rpath $sofile
72-
patchelf --print-rpath $sofile
73-
done
74-
patchelf --replace-needed libnvrtc-d833c4f3.so.11.2 libnvrtc.so.11.2 "${whl_dir}/torch/lib/libcaffe2_nvrtc.so"
7561
fi
7662

7763
find "${dist_info_folder}" -type f -exec sed -i "s!${version_with_suffix}!${version_no_suffix}!" {} \;

0 commit comments

Comments
 (0)