
Commit a79e1ce

tinglvv and malfet authored
[aarch64] Add CUDA 12.4 build script for ARM wheel (#1775)
Add cuda_aarch64 ARM wheel build script with CUDA 12.4. Reference #1302.

Co-authored-by: Nikita Shulga <[email protected]>
1 parent 4e10974 commit a79e1ce
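
How the pieces fit together: the new workflow job exports GPU_ARCH_TYPE=cuda-aarch64 and GPU_ARCH_VERSION, manywheel/build_docker.sh builds the image (presumably from the new manywheel/Dockerfile_cuda_aarch64, which would in turn run common/install_cuda_aarch64.sh; that Dockerfile is not shown in this excerpt), and inside that image aarch64_ci_build.sh sees GPU_ARCH_VERSION and passes --enable-cuda to aarch64_wheel_ci_build.py, which bundles the CUDA libraries into the finished wheel. A minimal sketch of driving the same entry points by hand, assuming the scripts consume the same variables they do in CI:

    # Build the CUDA aarch64 manylinux image the way the new CI job does
    export GPU_ARCH_TYPE=cuda-aarch64
    export GPU_ARCH_VERSION=12.4
    manywheel/build_docker.sh

    # Inside the resulting container, the wheel build then takes the CUDA branch
    bash aarch64_linux/aarch64_ci_build.sh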

6 files changed: 348 additions & 42 deletions

.github/workflows/build-manywheel-images.yml

Lines changed: 21 additions & 0 deletions
@@ -13,6 +13,7 @@ on:
       - .github/workflows/build-manywheel-images.yml
       - manywheel/Dockerfile
       - manywheel/Dockerfile_aarch64
+      - manywheel/Dockerfile_cuda_aarch64
       - manywheel/Dockerfile_cxx11-abi
       - manywheel/build_docker.sh
       - 'common/*'
@@ -21,6 +22,7 @@ on:
       - .github/workflows/build-manywheel-images.yml
       - manywheel/Dockerfile
       - manywheel/Dockerfile_aarch64
+      - manywheel/Dockerfile_cuda_aarch64
       - manywheel/Dockerfile_cxx11-abi
       - 'common/*'
       - manywheel/build_docker.sh
@@ -54,6 +56,25 @@ jobs:
       - name: Build Docker Image
         run: |
           manywheel/build_docker.sh
+  build-docker-cuda-aarch64:
+    runs-on: linux.arm64.2xlarge
+    strategy:
+      matrix:
+        cuda_version: ["12.4"]
+    env:
+      GPU_ARCH_TYPE: cuda-aarch64
+      GPU_ARCH_VERSION: ${{ matrix.cuda_version }}
+    steps:
+      - name: Checkout PyTorch
+        uses: actions/checkout@v3
+      - name: Authenticate if WITH_PUSH
+        run: |
+          if [[ "${WITH_PUSH}" == true ]]; then
+            echo "${DOCKER_TOKEN}" | docker login -u "${DOCKER_ID}" --password-stdin
+          fi
+      - name: Build Docker Image
+        run: |
+          manywheel/build_docker.sh
   build-docker-rocm:
     runs-on: linux.12xlarge
     strategy:

aarch64_linux/aarch64_ci_build.sh

Lines changed: 7 additions & 1 deletion
@@ -26,4 +26,10 @@ cd /
 git config --global --add safe.directory /pytorch
 pip install -r /pytorch/requirements.txt
 pip install auditwheel
-python /builder/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
+if [ -n "$GPU_ARCH_VERSION" ]; then
+    echo "BASE_CUDA_VERSION is set to: $GPU_ARCH_VERSION"
+    python /builder/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn --enable-cuda
+else
+    echo "BASE_CUDA_VERSION is not set."
+    python /builder/aarch64_linux/aarch64_wheel_ci_build.py --enable-mkldnn
+fi
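
The wrapper now dispatches solely on whether GPU_ARCH_VERSION is non-empty (note the echo labels it BASE_CUDA_VERSION, a naming mismatch worth noting). A quick sketch of exercising both branches by hand, assuming the container layout the CI uses:

    # CUDA wheel: any non-empty GPU_ARCH_VERSION adds --enable-cuda
    GPU_ARCH_VERSION=12.4 bash aarch64_linux/aarch64_ci_build.sh

    # CPU-only wheel: with the variable unset, the original mkldnn-only path runs
    unset GPU_ARCH_VERSION
    bash aarch64_linux/aarch64_ci_build.sh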

aarch64_linux/aarch64_wheel_ci_build.py

Lines changed: 139 additions & 41 deletions
@@ -9,103 +9,201 @@
 
 
 def list_dir(path: str) -> List[str]:
-    ''''
+    """'
     Helper for getting paths for Python
-    '''
+    """
     return check_output(["ls", "-1", path]).decode().split("\n")
 
 
 def build_ArmComputeLibrary() -> None:
-    '''
+    """
     Using ArmComputeLibrary for aarch64 PyTorch
-    '''
-    print('Building Arm Compute Library')
-    acl_build_flags=["debug=0", "neon=1", "opencl=0", "os=linux", "openmp=1", "cppthreads=0",
-                     "arch=armv8a", "multi_isa=1", "fixed_format_kernels=1", "build=native"]
-    acl_install_dir="/acl"
-    acl_checkout_dir="ComputeLibrary"
+    """
+    print("Building Arm Compute Library")
+    acl_build_flags = [
+        "debug=0",
+        "neon=1",
+        "opencl=0",
+        "os=linux",
+        "openmp=1",
+        "cppthreads=0",
+        "arch=armv8a",
+        "multi_isa=1",
+        "fixed_format_kernels=1",
+        "build=native",
+    ]
+    acl_install_dir = "/acl"
+    acl_checkout_dir = "ComputeLibrary"
     os.makedirs(acl_install_dir)
-    check_call(["git", "clone", "https://github.com/ARM-software/ComputeLibrary.git", "-b", "v23.08",
-                "--depth", "1", "--shallow-submodules"])
-    check_call(["scons", "Werror=1", "-j8", f"build_dir=/{acl_install_dir}/build"] + acl_build_flags,
-               cwd=acl_checkout_dir)
+    check_call(
+        [
+            "git",
+            "clone",
+            "https://github.com/ARM-software/ComputeLibrary.git",
+            "-b",
+            "v23.08",
+            "--depth",
+            "1",
+            "--shallow-submodules",
+        ]
+    )
+    check_call(
+        ["scons", "Werror=1", "-j8", f"build_dir=/{acl_install_dir}/build"]
+        + acl_build_flags,
+        cwd=acl_checkout_dir,
+    )
     for d in ["arm_compute", "include", "utils", "support", "src"]:
         shutil.copytree(f"{acl_checkout_dir}/{d}", f"{acl_install_dir}/{d}")
 
 
+def update_wheel(wheel_path) -> None:
+    """
+    Update the cuda wheel libraries
+    """
+    folder = os.path.dirname(wheel_path)
+    wheelname = os.path.basename(wheel_path)
+    os.mkdir(f"{folder}/tmp")
+    os.system(f"unzip {wheel_path} -d {folder}/tmp")
+    libs_to_copy = [
+        "/usr/local/cuda/extras/CUPTI/lib64/libcupti.so.12",
+        "/usr/local/cuda/lib64/libcudnn.so.8",
+        "/usr/local/cuda/lib64/libcublas.so.12",
+        "/usr/local/cuda/lib64/libcublasLt.so.12",
+        "/usr/local/cuda/lib64/libcudart.so.12",
+        "/usr/local/cuda/lib64/libcufft.so.11",
+        "/usr/local/cuda/lib64/libcusparse.so.12",
+        "/usr/local/cuda/lib64/libcusparseLt.so.0",
+        "/usr/local/cuda/lib64/libcusolver.so.11",
+        "/usr/local/cuda/lib64/libcurand.so.10",
+        "/usr/local/cuda/lib64/libnvToolsExt.so.1",
+        "/usr/local/cuda/lib64/libnvJitLink.so.12",
+        "/usr/local/cuda/lib64/libnvrtc.so.12",
+        "/usr/local/cuda/lib64/libnvrtc-builtins.so.12.4",
+        "/usr/local/cuda/lib64/libcudnn_adv_infer.so.8",
+        "/usr/local/cuda/lib64/libcudnn_adv_train.so.8",
+        "/usr/local/cuda/lib64/libcudnn_cnn_infer.so.8",
+        "/usr/local/cuda/lib64/libcudnn_cnn_train.so.8",
+        "/usr/local/cuda/lib64/libcudnn_ops_infer.so.8",
+        "/usr/local/cuda/lib64/libcudnn_ops_train.so.8",
+        "/opt/conda/envs/aarch64_env/lib/libopenblas.so.0",
+        "/opt/conda/envs/aarch64_env/lib/libgfortran.so.5",
+        "/opt/conda/envs/aarch64_env/lib/libgomp.so.1",
+        "/acl/build/libarm_compute.so",
+        "/acl/build/libarm_compute_graph.so",
+        "/acl/build/libarm_compute_core.so",
+    ]
+    # Copy libraries to unzipped_folder/a/lib
+    for lib_path in libs_to_copy:
+        lib_name = os.path.basename(lib_path)
+        shutil.copy2(lib_path, f"{folder}/tmp/torch/lib/{lib_name}")
+    os.system(
+        f"cd {folder}/tmp/torch/lib/; patchelf --set-rpath '$ORIGIN' {folder}/tmp/torch/lib/libtorch_cuda.so"
+    )
+    os.mkdir(f"{folder}/cuda_wheel")
+    os.system(f"cd {folder}/tmp/; zip -r {folder}/cuda_wheel/{wheelname} *")
+    shutil.move(
+        f"{folder}/cuda_wheel/{wheelname}",
+        f"/dist/{wheelname}",
+        copy_function=shutil.copy2,
+    )
+    os.system(f"rm -rf {folder}/tmp {folder}/dist/cuda_wheel/")
+
+
 def complete_wheel(folder: str) -> str:
-    '''
+    """
     Complete wheel build and put in artifact location
-    '''
+    """
     wheel_name = list_dir(f"/{folder}/dist")[0]
 
-    if "pytorch" in folder:
+    if "pytorch" in folder and not enable_cuda:
         print("Repairing Wheel with AuditWheel")
-        check_call(["auditwheel","repair", f"dist/{wheel_name}"], cwd=folder)
+        check_call(["auditwheel", "repair", f"dist/{wheel_name}"], cwd=folder)
         repaired_wheel_name = list_dir(f"/{folder}/wheelhouse")[0]
 
         print(f"Moving {repaired_wheel_name} wheel to /{folder}/dist")
-        os.rename(f"/{folder}/wheelhouse/{repaired_wheel_name}", f"/{folder}/dist/{repaired_wheel_name}")
+        os.rename(
+            f"/{folder}/wheelhouse/{repaired_wheel_name}",
+            f"/{folder}/dist/{repaired_wheel_name}",
+        )
     else:
         repaired_wheel_name = wheel_name
 
-    print(f"Copying {repaired_wheel_name} to artfacts")
-    shutil.copy2(f"/{folder}/dist/{repaired_wheel_name}", f"/artifacts/{repaired_wheel_name}")
+    print(f"Copying {repaired_wheel_name} to artifacts")
+    shutil.copy2(
+        f"/{folder}/dist/{repaired_wheel_name}", f"/artifacts/{repaired_wheel_name}"
+    )
 
     return repaired_wheel_name
 
 
 def parse_arguments():
-    '''
+    """
     Parse inline arguments
-    '''
+    """
     from argparse import ArgumentParser
+
     parser = ArgumentParser("AARCH64 wheels python CD")
     parser.add_argument("--debug", action="store_true")
     parser.add_argument("--build-only", action="store_true")
     parser.add_argument("--test-only", type=str)
     parser.add_argument("--enable-mkldnn", action="store_true")
+    parser.add_argument("--enable-cuda", action="store_true")
     return parser.parse_args()
 
 
-if __name__ == '__main__':
-    '''
+if __name__ == "__main__":
+    """
     Entry Point
-    '''
+    """
     args = parse_arguments()
     enable_mkldnn = args.enable_mkldnn
-    repo = Repository('/pytorch')
+    enable_cuda = args.enable_cuda
+    repo = Repository("/pytorch")
     branch = repo.head.name
-    if branch == 'HEAD':
-        branch = 'master'
-
+    if branch == "HEAD":
+        branch = "master"
 
-    print('Building PyTorch wheel')
+    print("Building PyTorch wheel")
     build_vars = "CMAKE_SHARED_LINKER_FLAGS=-Wl,-z,max-page-size=0x10000 "
     os.system("python setup.py clean")
 
     override_package_version = os.getenv("OVERRIDE_PACKAGE_VERSION")
     if override_package_version is not None:
         version = override_package_version
-        build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version} PYTORCH_BUILD_NUMBER=1 "
-    elif branch in ['nightly', 'master']:
-        build_date = check_output(['git', 'log', '--pretty=format:%cs', '-1'], cwd='/pytorch').decode().replace('-', '')
-        version = check_output(['cat', 'version.txt'], cwd='/pytorch').decode().strip()[:-2]
+        build_vars += (
+            f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version} PYTORCH_BUILD_NUMBER=1 "
+        )
+    elif branch in ["nightly", "master"]:
+        build_date = (
+            check_output(["git", "log", "--pretty=format:%cs", "-1"], cwd="/pytorch")
+            .decode()
+            .replace("-", "")
+        )
+        version = (
+            check_output(["cat", "version.txt"], cwd="/pytorch").decode().strip()[:-2]
+        )
         build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={version}.dev{build_date} PYTORCH_BUILD_NUMBER=1 "
     elif branch.startswith(("v1.", "v2.")):
         build_vars += f"BUILD_TEST=0 PYTORCH_BUILD_VERSION={branch[1:branch.find('-')]} PYTORCH_BUILD_NUMBER=1 "
 
     if enable_mkldnn:
         build_ArmComputeLibrary()
         print("build pytorch with mkldnn+acl backend")
-        build_vars += "USE_MKLDNN=ON USE_MKLDNN_ACL=ON " \
-                      "ACL_ROOT_DIR=/acl " \
-                      "LD_LIBRARY_PATH=/pytorch/build/lib:/acl/build:$LD_LIBRARY_PATH " \
-                      "ACL_INCLUDE_DIR=/acl/build " \
-                      "ACL_LIBRARY=/acl/build "
+        build_vars += (
+            "USE_MKLDNN=ON USE_MKLDNN_ACL=ON "
+            "ACL_ROOT_DIR=/acl "
+            "LD_LIBRARY_PATH=/pytorch/build/lib:/acl/build:$LD_LIBRARY_PATH "
+            "ACL_INCLUDE_DIR=/acl/build "
+            "ACL_LIBRARY=/acl/build "
+        )
     else:
         print("build pytorch without mkldnn backend")
 
     os.system(f"cd /pytorch; {build_vars} python3 setup.py bdist_wheel")
-    pytorch_wheel_name = complete_wheel("pytorch")
-    print(f"Build Compelete. Created {pytorch_wheel_name}..")
+    if enable_cuda:
+        print("Updating Cuda Dependency")
+        filename = os.listdir("/pytorch/dist/")
+        wheel_path = f"/pytorch/dist/{filename[0]}"
+        update_wheel(wheel_path)
+    pytorch_wheel_name = complete_wheel("/pytorch/")
+    print(f"Build Complete. Created {pytorch_wheel_name}..")

common/install_cuda_aarch64.sh

Lines changed: 90 additions & 0 deletions
@@ -0,0 +1,90 @@
+#!/bin/bash
+
+set -ex
+
+function install_cusparselt_052 {
+    # cuSparseLt license: https://docs.nvidia.com/cuda/cusparselt/license.html
+    mkdir tmp_cusparselt && pushd tmp_cusparselt
+    wget -q https://developer.download.nvidia.com/compute/cusparselt/redist/libcusparse_lt/linux-sbsa/libcusparse_lt-linux-sbsa-0.5.2.1-archive.tar.xz
+    tar xf libcusparse_lt-linux-sbsa-0.5.2.1-archive.tar.xz
+    cp -a libcusparse_lt-linux-sbsa-0.5.2.1-archive/include/* /usr/local/cuda/include/
+    cp -a libcusparse_lt-linux-sbsa-0.5.2.1-archive/lib/* /usr/local/cuda/lib64/
+    popd
+    rm -rf tmp_cusparselt
+}
+
+function install_124 {
+    echo "Installing CUDA 12.4 and cuDNN 8.9 and NCCL 2.20.5 and cuSparseLt-0.5.2"
+    rm -rf /usr/local/cuda-12.4 /usr/local/cuda
+    # install CUDA 12.4.0 in the same container
+    wget -q https://developer.download.nvidia.com/compute/cuda/12.4.0/local_installers/cuda_12.4.0_550.54.14_linux_sbsa.run
+    chmod +x cuda_12.4.0_550.54.14_linux_sbsa.run
+    ./cuda_12.4.0_550.54.14_linux_sbsa.run --toolkit --silent
+    rm -f cuda_12.4.0_550.54.14_linux_sbsa.run
+    rm -f /usr/local/cuda && ln -s /usr/local/cuda-12.4 /usr/local/cuda
+
+    # cuDNN license: https://developer.nvidia.com/cudnn/license_agreement
+    mkdir tmp_cudnn && cd tmp_cudnn
+    wget -q https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/linux-sbsa/cudnn-linux-sbsa-8.9.2.26_cuda12-archive.tar.xz -O cudnn-linux-sbsa-8.9.2.26_cuda12-archive.tar.xz
+    tar xf cudnn-linux-sbsa-8.9.2.26_cuda12-archive.tar.xz
+    cp -a cudnn-linux-sbsa-8.9.2.26_cuda12-archive/include/* /usr/local/cuda/include/
+    cp -a cudnn-linux-sbsa-8.9.2.26_cuda12-archive/lib/* /usr/local/cuda/lib64/
+    cd ..
+    rm -rf tmp_cudnn
+
+    # NCCL license: https://docs.nvidia.com/deeplearning/nccl/#licenses
+    # Follow build: https://github.com/NVIDIA/nccl/tree/master?tab=readme-ov-file#build
+    git clone -b v2.20.5-1 --depth 1 https://github.com/NVIDIA/nccl.git
+    cd nccl && make -j src.build
+    cp -a build/include/* /usr/local/cuda/include/
+    cp -a build/lib/* /usr/local/cuda/lib64/
+    cd ..
+    rm -rf nccl
+
+    install_cusparselt_052
+
+    ldconfig
+}
+
+function prune_124 {
+    echo "Pruning CUDA 12.4"
+    #####################################################################################
+    # CUDA 12.4 prune static libs
+    #####################################################################################
+    export NVPRUNE="/usr/local/cuda-12.4/bin/nvprune"
+    export CUDA_LIB_DIR="/usr/local/cuda-12.4/lib64"
+
+    export GENCODE="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+    export GENCODE_CUDNN="-gencode arch=compute_50,code=sm_50 -gencode arch=compute_60,code=sm_60 -gencode arch=compute_61,code=sm_61 -gencode arch=compute_70,code=sm_70 -gencode arch=compute_75,code=sm_75 -gencode arch=compute_80,code=sm_80 -gencode arch=compute_86,code=sm_86 -gencode arch=compute_90,code=sm_90"
+
+    if [[ -n "$OVERRIDE_GENCODE" ]]; then
+        export GENCODE=$OVERRIDE_GENCODE
+    fi
+
+    # all CUDA libs except CuDNN and CuBLAS
+    ls $CUDA_LIB_DIR/ | grep "\.a" | grep -v "culibos" | grep -v "cudart" | grep -v "cudnn" | grep -v "cublas" | grep -v "metis" \
+        | xargs -I {} bash -c \
+            "echo {} && $NVPRUNE $GENCODE $CUDA_LIB_DIR/{} -o $CUDA_LIB_DIR/{}"
+
+    # prune CuDNN and CuBLAS
+    $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublas_static.a -o $CUDA_LIB_DIR/libcublas_static.a
+    $NVPRUNE $GENCODE_CUDNN $CUDA_LIB_DIR/libcublasLt_static.a -o $CUDA_LIB_DIR/libcublasLt_static.a
+
+    #####################################################################################
+    # CUDA 12.1 prune visual tools
+    #####################################################################################
+    export CUDA_BASE="/usr/local/cuda-12.4/"
+    rm -rf $CUDA_BASE/libnvvp $CUDA_BASE/nsightee_plugins $CUDA_BASE/nsight-compute-2024.1.0 $CUDA_BASE/nsight-systems-2023.4.4/
+}
+
+# idiomatic parameter and option handling in sh
+while test $# -gt 0
+do
+    case "$1" in
+    12.4) install_124; prune_124
+        ;;
+    *) echo "bad argument $1"; exit 1
+        ;;
+    esac
+    shift
+done
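
The installer takes the CUDA version as its only positional argument and fails fast on anything else; prune_124 additionally honors an OVERRIDE_GENCODE environment variable to narrow which SASS targets survive pruning. Typical usage as wired up by this commit:

    # Install CUDA 12.4 + cuDNN + NCCL + cuSparseLt for linux-sbsa, then prune
    bash common/install_cuda_aarch64.sh 12.4

    # Optionally retain only one architecture in the pruned static libs
    OVERRIDE_GENCODE="-gencode arch=compute_90,code=sm_90" bash common/install_cuda_aarch64.sh 12.4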
