Skip to content

CI: Install latest compute-sanitizer separately from CTK #594

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 25 commits into from
May 3, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 29 additions & 22 deletions .github/actions/fetch_ctk/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ inputs:
description: "A list of the CTK components to install as a comma-separated list. e.g. 'cuda_nvcc,cuda_nvrtc,cuda_cudart'"
required: false
type: string
default: "cuda_nvcc,cuda_cudart,cuda_nvrtc,cuda_profiler_api,cuda_cccl,cuda_sanitizer_api,libnvjitlink"
default: "cuda_nvcc,cuda_cudart,cuda_nvrtc,cuda_profiler_api,cuda_cccl,libnvjitlink"

runs:
using: composite
Expand Down Expand Up @@ -50,38 +50,40 @@ runs:
if: ${{ steps.ctk-get-cache.outputs.cache-hit != 'true' }}
shell: bash --noprofile --norc -xeuo pipefail {0}
run: |
CUDA_PATH="./cuda_toolkit"
mkdir $CUDA_PATH
# Everything under this folder is packed and stored in the GitHub Cache space,
# and unpacked after retrieving from the cache.
CACHE_TMP_DIR="./cache_tmp_dir"
rm -rf $CACHE_TMP_DIR
mkdir $CACHE_TMP_DIR

# The binary archives (redist) are guaranteed to be updated as part of the release posting.
CTK_BASE_URL="https://developer.download.nvidia.com/compute/cuda/redist/"
CTK_JSON_URL="$CTK_BASE_URL/redistrib_${{ inputs.cuda-version }}.json"
if [[ "${{ inputs.host-platform }}" == linux* ]]; then
if [[ "${{ inputs.host-platform }}" == "linux-64" ]]; then
CTK_SUBDIR="linux-x86_64"
elif [[ "${{ inputs.host-platform }}" == "linux-aarch64" ]]; then
CTK_SUBDIR="linux-sbsa"
fi
function extract() {
tar -xvf $1 -C $CUDA_PATH --strip-components=1
tar -xvf $1 -C $CACHE_TMP_DIR --strip-components=1
}
elif [[ "${{ inputs.host-platform }}" == "win-64" ]]; then
CTK_SUBDIR="windows-x86_64"
function extract() {
_TEMP_DIR_=$(mktemp -d)
unzip $1 -d $_TEMP_DIR_
cp -r $_TEMP_DIR_/*/* $CUDA_PATH
cp -r $_TEMP_DIR_/*/* $CACHE_TMP_DIR
rm -rf $_TEMP_DIR_
}
fi

function populate_cuda_path() {
# take the component name as an argument
function download() {
curl -kLSs $1 -o $2
}
local CTK_COMPONENT=$1
local CTK_VERSION=$2
CTK_COMPONENT_REL_PATH="$(curl -s ${CTK_BASE_URL}/redistrib_${CTK_VERSION}.json |
CTK_COMPONENT=$1
CTK_COMPONENT_REL_PATH="$(curl -s $CTK_JSON_URL |
python -c "import sys, json; print(json.load(sys.stdin)['${CTK_COMPONENT}']['${CTK_SUBDIR}']['relative_path'])")"
CTK_COMPONENT_URL="${CTK_BASE_URL}/${CTK_COMPONENT_REL_PATH}"
CTK_COMPONENT_COMPONENT_FILENAME="$(basename $CTK_COMPONENT_REL_PATH)"
Expand All @@ -98,23 +100,23 @@ runs:
CTK_CACHE_COMPONENTS="${CTK_CACHE_COMPONENTS//,,/,}"
# Get headers and shared libraries in place
for item in $(echo $CTK_CACHE_COMPONENTS | tr ',' ' '); do
ctk_version="${{ inputs.cuda-version }}"
if [[ "$item" == "cuda_sanitizer_api" ]]; then
# Always use latest CTK for cuda_sanitizer_api
# FIXME: Automatically track latest CTK version
CUDA_MAJOR="$(cut -d '.' -f 1 <<< ${{ inputs.cuda-version }})"
if [[ "$CUDA_MAJOR" == "12" ]]; then
# TODO: Automatically track latest CTK minor version
ctk_version="12.8.0"
fi
fi
populate_cuda_path "$item" "$ctk_version"
populate_cuda_path "$item"
done
ls -l $CUDA_PATH
ls -l $CACHE_TMP_DIR

# Prepare the cache
# Note: try to escape | and > ...
tar -czvf ${CTK_CACHE_FILENAME} ${CUDA_PATH}
tar -czvf ${CTK_CACHE_FILENAME} ${CACHE_TMP_DIR}

# "Move" files from temp dir to CUDA_PATH
CUDA_PATH="./cuda_toolkit"
mkdir -p $CUDA_PATH
# Unfortunately we cannot use "rsync -av $CACHE_TMP_DIR/ $CUDA_PATH" because
# not all runners have rsync pre-installed (or even installable, such as
# Git Bash). We do it in the dumb way.
cp -r $CACHE_TMP_DIR/* $CUDA_PATH
rm -rf $CACHE_TMP_DIR
ls -l $CUDA_PATH

- name: Upload CTK cache
if: ${{ always() &&
Expand All @@ -129,8 +131,13 @@ runs:
shell: bash --noprofile --norc -xeuo pipefail {0}
run: |
ls -l
CACHE_TMP_DIR="./cache_tmp_dir"
CUDA_PATH="./cuda_toolkit"
mkdir -p $CUDA_PATH
tar -xzvf $CTK_CACHE_FILENAME
# Can't use rsync here, see above
cp -r $CACHE_TMP_DIR/* $CUDA_PATH
rm -rf $CACHE_TMP_DIR $CTK_CACHE_FILENAME
ls -l $CUDA_PATH
if [ ! -d "$CUDA_PATH/include" ]; then
exit 1
Expand Down
33 changes: 33 additions & 0 deletions .github/workflows/guess_latest.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#!/bin/bash
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# SPDX-License-Identifier: Apache-2.0

# Base URL of the directory listing that get_latest_version downloads and scans
# for redistrib_<major>.<minor>.<patch>.json file names.
URL="https://developer.download.nvidia.com/compute/cuda/redist/"

# Fetch the directory listing and extract the latest version number
#######################################
# Print the highest X.Y.Z version for which a redistrib_X.Y.Z.json entry
# appears in the listing served at $URL.
# Globals:   URL (read) - address of the listing to download
# Arguments: none
# Outputs:   the latest version on stdout; diagnostics on stderr only, so a
#            caller capturing stdout never mistakes an error for a version
# Returns:   0 on success, 1 if the download fails or no entry matches
#######################################
get_latest_version() {
  # Declare separately from the assignments: "local var=$(cmd)" would report
  # the status of "local" and hide a failing command.
  local html_content versions

  # Get the HTML content of the page
  if ! html_content=$(wget -q -O - "$URL"); then
    echo "Failed to fetch the directory listing from $URL" >&2
    return 1
  fi

  # \K drops the "redistrib_" prefix and the lookahead drops ".json", leaving
  # only the version numbers, one per line. grep exits non-zero on no match.
  if ! versions=$(grep -oP 'redistrib_\K[0-9]+\.[0-9]+\.[0-9]+(?=\.json)' <<<"$html_content"); then
    echo "No files matching the pattern were found." >&2
    return 1
  fi

  # Version-aware sort so that e.g. 12.10.0 ranks above 12.9.0
  sort -V <<<"$versions" | tail -n 1
}

# Resolve the latest version and print it on stdout. If the lookup fails, exit
# non-zero and keep stdout empty so that a caller capturing this script's
# output does not receive a diagnostic message in place of a version.
if ! latest_version=$(get_latest_version); then
  # Anything the function wrote to stdout on failure is a diagnostic, not a
  # result; forward it to stderr instead of dropping it.
  if [ -n "$latest_version" ]; then
    printf '%s\n' "$latest_version" >&2
  fi
  exit 1
fi
printf '%s\n' "$latest_version"
6 changes: 4 additions & 2 deletions .github/workflows/install_gpu_driver.ps1
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
#Requires -RunAsAdministrator
# Copyright (c) 2025, NVIDIA CORPORATION & AFFILIATES. ALL RIGHTS RESERVED.
#
# SPDX-License-Identifier: Apache-2.0

# Install the driver
function Install-Driver {
Expand All @@ -23,7 +25,7 @@ function Install-Driver {
$ProgressPreference = $ProgressPreference_tmp
Write-Output 'Download complete!'

# Install the file with the specified path from earlier as well as the RunAs admin option
# Install the file with the specified path from earlier
Write-Output 'Running the driver installer...'
Start-Process -FilePath $file_dir -ArgumentList $install_args -Wait
Write-Output 'Done!'
Expand Down
41 changes: 29 additions & 12 deletions .github/workflows/test-wheel-linux.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,14 @@ jobs:
with:
fetch-depth: 0

- name: Install dependencies
uses: ./.github/actions/install_unix_deps
continue-on-error: false
with:
# gcc for Cython tests, jq/wget for artifact fetching
dependencies: "build-essential jq wget"
dependent_exes: "gcc jq wget"

- name: Set environment variables
run: |
PYTHON_VERSION_FORMATTED=$(echo '${{ inputs.python-version }}' | tr -d '.')
Expand All @@ -78,6 +86,17 @@ jobs:
fi
fi

# We don't test compute-sanitizer on CTK<12 because backporting fixes is too much effort
# We only test compute-sanitizer on python 3.12 arbitrarily; we don't need to use sanitizer on the entire matrix
# Only local ctk installs have compute-sanitizer; there is no wheel for it
if [[ "${{ inputs.python-version }}" == "3.12" && "${{ inputs.cuda-version }}" != "11.8.0" && "${{ inputs.local-ctk }}" == 1 ]]; then
SETUP_SANITIZER=1
echo "LATEST_CUDA_VERSION=$(bash .github/workflows/guess_latest.sh)" >> $GITHUB_ENV
else
SETUP_SANITIZER=0
fi
echo "SETUP_SANITIZER=${SETUP_SANITIZER}" >> $GITHUB_ENV

# make outputs from the previous job as env vars
CUDA_CORE_ARTIFACT_BASENAME="cuda-core-python${PYTHON_VERSION_FORMATTED}-${{ inputs.host-platform }}"
echo "PYTHON_VERSION_FORMATTED=${PYTHON_VERSION_FORMATTED}" >> $GITHUB_ENV
Expand All @@ -91,14 +110,6 @@ jobs:
echo "SKIP_CUDA_BINDINGS_TEST=${SKIP_CUDA_BINDINGS_TEST}" >> $GITHUB_ENV
echo "SKIP_CUDA_CORE_CYTHON_TEST=${SKIP_CUDA_CORE_CYTHON_TEST}" >> $GITHUB_ENV

- name: Install dependencies
uses: ./.github/actions/install_unix_deps
continue-on-error: false
with:
# gcc for Cython tests, jq/wget for artifact fetching
dependencies: "build-essential jq wget"
dependent_exes: "gcc jq wget"

- name: Download cuda-python build artifacts
if: ${{ env.SKIP_CUDA_BINDINGS_TEST == '0'}}
uses: actions/download-artifact@v4
Expand Down Expand Up @@ -184,12 +195,18 @@ jobs:
host-platform: ${{ inputs.host-platform }}
cuda-version: ${{ inputs.cuda-version }}

- name: Set up latest cuda_sanitizer_api
if: ${{ env.SETUP_SANITIZER == '1' }}
uses: ./.github/actions/fetch_ctk
continue-on-error: false
with:
host-platform: ${{ inputs.host-platform }}
cuda-version: ${{ env.LATEST_CUDA_VERSION }}
cuda-components: "cuda_sanitizer_api"

- name: Set up compute-sanitizer
run: |
# We don't test compute-sanitizer on CTK<12 because backporting fixes is too much effort
# We only test compute-sanitizer on python 3.12 arbitrarily; we don't need to use sanitizer on the entire matrix
# Only local ctk installs have compute-sanitizer; there is no wheel for it
if [[ "${{ inputs.python-version }}" == "3.12" && "${{ inputs.cuda-version }}" != "11.8.0" && "${{ inputs.local-ctk }}" == 1 ]]; then
if [[ "${SETUP_SANITIZER}" == 1 ]]; then
COMPUTE_SANITIZER="${CUDA_HOME}/bin/compute-sanitizer"
COMPUTE_SANITIZER_VERSION=$(${COMPUTE_SANITIZER} --version | grep -Eo "[0-9]{4}\.[0-9]\.[0-9]" | sed -e 's/\.//g')
SANITIZER_CMD="${COMPUTE_SANITIZER} --target-processes=all --launch-timeout=0 --tool=memcheck --error-exitcode=1"
Expand Down