# Patch summary
# (Layout reconstructed from a whitespace-collapsed copy: indentation follows the
# conventional GitHub Actions / Apptainer layout, and blank context lines are
# placed so that every hunk matches the counts in its @@ header.)
#
# - Renames the three Iris CI workflows (dev, external validation, package tests).
# - Runs every job over a rocm_version matrix ("6.3.1", "7.0").
# - Rebuilds ~/apptainer/iris-dev-rocm<version>.sif only when the image is missing,
#   its .sha256 file is missing, or the sha256 of apptainer/iris-rocm<version>.def
#   differs from the stored value.
# - Adds the two Apptainer definition files. Relative to the original patch, their
#   %post scripts now start with `set -e`, both files end with a newline, and the
#   ROCm 7.0 %runscript carries a review note. The blob ids on the index lines of
#   those two files are carried over unchanged and will not match the amended
#   content; regenerate the patch with git if exact ids are required.

diff --git a/.github/workflows/iris-tests-apptainer.yml b/.github/workflows/iris-tests-dev.yml
similarity index 69%
rename from .github/workflows/iris-tests-apptainer.yml
rename to .github/workflows/iris-tests-dev.yml
index effc30b1..036828c6 100644
--- a/.github/workflows/iris-tests-apptainer.yml
+++ b/.github/workflows/iris-tests-dev.yml
@@ -1,4 +1,4 @@
-name: Iris Tests with Apptainer
+name: Iris Development Tests
 
 on:
   push:
@@ -15,6 +15,9 @@ jobs:
   build-apptainer-image:
     runs-on: [self-hosted, mi3008x]
     timeout-minutes: 90
+    strategy:
+      matrix:
+        rocm_version: ["6.3.1", "7.0"]
 
     steps:
       - name: Checkout repository
@@ -31,22 +34,52 @@ jobs:
           # Create persistent Apptainer directory
           mkdir -p ~/apptainer
 
-          # Build Apptainer image from definition file (only if it doesn't exist)
-          if [ ! -f ~/apptainer/iris-dev.sif ]; then
-            echo "Building new Apptainer image..."
-            apptainer build ~/apptainer/iris-dev.sif apptainer/iris.def
+          # Compute hash of the definition file
+          DEF_FILE="apptainer/iris-rocm${{ matrix.rocm_version }}.def"
+          CURRENT_HASH=$(sha256sum "$DEF_FILE" | awk '{print $1}')
+          HASH_FILE=~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif.sha256
+
+          # Check if we need to rebuild
+          REBUILD=false
+          if [ ! -f ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif ]; then
+            echo "Apptainer image not found. Building new image for ROCm ${{ matrix.rocm_version }}..."
+            REBUILD=true
+          elif [ ! -f "$HASH_FILE" ]; then
+            echo "Hash file not found. Rebuilding image for ROCm ${{ matrix.rocm_version }}..."
+            REBUILD=true
           else
-            echo "Using existing Apptainer image"
+            STORED_HASH=$(cat "$HASH_FILE")
+            if [ "$CURRENT_HASH" != "$STORED_HASH" ]; then
+              echo "Definition file has changed. Rebuilding image for ROCm ${{ matrix.rocm_version }}..."
+              echo " Previous hash: $STORED_HASH"
+              echo " Current hash: $CURRENT_HASH"
+              REBUILD=true
+            else
+              echo "Using existing Apptainer image for ROCm ${{ matrix.rocm_version }} (hash: $CURRENT_HASH)"
+            fi
           fi
+
+          # Build if needed
+          if [ "$REBUILD" = true ]; then
+            apptainer build --force ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif "$DEF_FILE"
+            echo "$CURRENT_HASH" > "$HASH_FILE"
+            echo "Successfully built and stored hash: $CURRENT_HASH"
+          fi
+
   test-1-2-4-ranks:
-    name: Test 1/2/4 Ranks (Parallel)
+    name: Test 1/2/4 Ranks (Parallel) - ROCm ${{ matrix.rocm_version }}
     needs: build-apptainer-image
     runs-on: [self-hosted, mi3008x]
     timeout-minutes: 20
+    strategy:
+      matrix:
+        rocm_version: ["6.3.1", "7.0"]
 
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4
+        with:
+          fetch-depth: 0
 
       - name: Run 1, 2, 4 rank tests in parallel
         run: |
@@ -70,7 +103,7 @@ jobs:
           echo "Starting 1-rank test on GPUs 0,1..."
           apptainer exec --overlay "${OVERLAY_1}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="0,1" \
             --bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
-            ~/apptainer/iris-dev.sif bash -c "
+            ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif bash -c "
               pip install -e .
               bash .github/scripts/run_tests.sh 1
             " &
@@ -79,7 +112,7 @@ jobs:
           echo "Starting 2-rank test on GPUs 2,3..."
           apptainer exec --overlay "${OVERLAY_2}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="2,3" \
             --bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
-            ~/apptainer/iris-dev.sif bash -c "
+            ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif bash -c "
               pip install -e .
               bash .github/scripts/run_tests.sh 2
             " &
@@ -88,7 +121,7 @@ jobs:
           echo "Starting 4-rank test on GPUs 4,5,6,7..."
           apptainer exec --overlay "${OVERLAY_4}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="4,5,6,7" \
             --bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
-            ~/apptainer/iris-dev.sif bash -c "
+            ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif bash -c "
               pip install -e .
               bash .github/scripts/run_tests.sh 4
             " &
@@ -118,15 +151,19 @@ jobs:
           echo "✅ All parallel tests (1, 2, 4 ranks) passed!"
 
   test-8-ranks:
-    name: Test 8 Ranks
+    name: Test 8 Ranks - ROCm ${{ matrix.rocm_version }}
     needs: build-apptainer-image
     runs-on: [self-hosted, mi3008x]
-    timeout-minutes: 15
+    timeout-minutes: 30
+    strategy:
+      matrix:
+        rocm_version: ["6.3.1", "7.0"]
 
     steps:
       - name: Checkout repository
         uses: actions/checkout@v4
-
+        with:
+          fetch-depth: 0
       - name: Run 8-rank test
         run: |
           # Create unique overlay image for isolation
@@ -139,7 +176,7 @@ jobs:
           echo "::group::Running 8-rank test on all GPUs"
           apptainer exec --overlay "${OVERLAY_8}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
             --bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
-            ~/apptainer/iris-dev.sif bash -c "
+            ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif bash -c "
               pip install -e .
               bash .github/scripts/run_tests.sh 8
             "
diff --git a/.github/workflows/iris-external-validation-test.yml b/.github/workflows/iris-tests-external.yml
similarity index 54%
rename from .github/workflows/iris-external-validation-test.yml
rename to .github/workflows/iris-tests-external.yml
index 1dbbe977..ff3427a4 100644
--- a/.github/workflows/iris-external-validation-test.yml
+++ b/.github/workflows/iris-tests-external.yml
@@ -1,4 +1,4 @@
-name: Iris External Validation Test
+name: Iris External Validation
 
 on:
   push:
@@ -15,6 +15,9 @@ jobs:
   build-apptainer-image:
     runs-on: [self-hosted, mi3008x]
     timeout-minutes: 90
+    strategy:
+      matrix:
+        rocm_version: ["6.3.1", "7.0"]
 
     steps:
       - name: Checkout repository
@@ -31,19 +34,46 @@ jobs:
           # Create persistent Apptainer directory
           mkdir -p ~/apptainer
 
-          # Build Apptainer image from definition file (only if it doesn't exist)
-          if [ ! -f ~/apptainer/iris-dev.sif ]; then
-            echo "Building new Apptainer image..."
-            apptainer build ~/apptainer/iris-dev.sif apptainer/iris.def
+          # Compute hash of the definition file
+          DEF_FILE="apptainer/iris-rocm${{ matrix.rocm_version }}.def"
+          CURRENT_HASH=$(sha256sum "$DEF_FILE" | awk '{print $1}')
+          HASH_FILE=~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif.sha256
+
+          # Check if we need to rebuild
+          REBUILD=false
+          if [ ! -f ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif ]; then
+            echo "Apptainer image not found. Building new image for ROCm ${{ matrix.rocm_version }}..."
+            REBUILD=true
+          elif [ ! -f "$HASH_FILE" ]; then
+            echo "Hash file not found. Rebuilding image for ROCm ${{ matrix.rocm_version }}..."
+            REBUILD=true
           else
-            echo "Using existing Apptainer image"
+            STORED_HASH=$(cat "$HASH_FILE")
+            if [ "$CURRENT_HASH" != "$STORED_HASH" ]; then
+              echo "Definition file has changed. Rebuilding image for ROCm ${{ matrix.rocm_version }}..."
+              echo " Previous hash: $STORED_HASH"
+              echo " Current hash: $CURRENT_HASH"
+              REBUILD=true
+            else
+              echo "Using existing Apptainer image for ROCm ${{ matrix.rocm_version }} (hash: $CURRENT_HASH)"
+            fi
+          fi
+
+          # Build if needed
+          if [ "$REBUILD" = true ]; then
+            apptainer build --force ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif "$DEF_FILE"
+            echo "$CURRENT_HASH" > "$HASH_FILE"
+            echo "Successfully built and stored hash: $CURRENT_HASH"
           fi
 
   external-validation-test:
-    name: External Validation Test
+    name: External Validation Test - ROCm ${{ matrix.rocm_version }}
     needs: build-apptainer-image
     runs-on: [self-hosted, mi3008x]
     timeout-minutes: 30
+    strategy:
+      matrix:
+        rocm_version: ["6.3.1", "7.0"]
 
     steps:
       - name: Checkout repository
@@ -61,7 +91,7 @@ jobs:
           echo "::group::Running external validation test"
           apptainer exec --overlay "${OVERLAY}" --no-home --cleanenv \
             --bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
-            ~/apptainer/iris-dev.sif bash -c "
+            ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif bash -c "
               pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }}
               wget -O test_iris_distributed.py https://gist.githubusercontent.com/mawad-amd/6375dc078e39e256828f379e03310ec7/raw/a527c3192bee4615292769e340b1c73676f6945a/test_iris_distributed.py
               python test_iris_distributed.py
diff --git a/.github/workflows/iris-pip-install-test.yml b/.github/workflows/iris-tests-package.yml
similarity index 71%
rename from .github/workflows/iris-pip-install-test.yml
rename to .github/workflows/iris-tests-package.yml
index 48cec6c9..31671e3b 100644
--- a/.github/workflows/iris-pip-install-test.yml
+++ b/.github/workflows/iris-tests-package.yml
@@ -1,4 +1,4 @@
-name: Iris Pip Install Test
+name: Iris Package Tests
 
 on:
   push:
@@ -15,6 +15,9 @@ jobs:
   build-apptainer-image:
     runs-on: [self-hosted, mi3008x]
     timeout-minutes: 90
+    strategy:
+      matrix:
+        rocm_version: ["6.3.1", "7.0"]
 
     steps:
       - name: Checkout repository
@@ -31,18 +34,45 @@ jobs:
           # Create persistent Apptainer directory
           mkdir -p ~/apptainer
 
-          # Build Apptainer image from definition file (only if it doesn't exist)
-          if [ ! -f ~/apptainer/iris-dev.sif ]; then
-            echo "Building new Apptainer image..."
-            apptainer build ~/apptainer/iris-dev.sif apptainer/iris.def
+          # Compute hash of the definition file
+          DEF_FILE="apptainer/iris-rocm${{ matrix.rocm_version }}.def"
+          CURRENT_HASH=$(sha256sum "$DEF_FILE" | awk '{print $1}')
+          HASH_FILE=~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif.sha256
+
+          # Check if we need to rebuild
+          REBUILD=false
+          if [ ! -f ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif ]; then
+            echo "Apptainer image not found. Building new image for ROCm ${{ matrix.rocm_version }}..."
+            REBUILD=true
+          elif [ ! -f "$HASH_FILE" ]; then
+            echo "Hash file not found. Rebuilding image for ROCm ${{ matrix.rocm_version }}..."
+            REBUILD=true
           else
-            echo "Using existing Apptainer image"
+            STORED_HASH=$(cat "$HASH_FILE")
+            if [ "$CURRENT_HASH" != "$STORED_HASH" ]; then
+              echo "Definition file has changed. Rebuilding image for ROCm ${{ matrix.rocm_version }}..."
+              echo " Previous hash: $STORED_HASH"
+              echo " Current hash: $CURRENT_HASH"
+              REBUILD=true
+            else
+              echo "Using existing Apptainer image for ROCm ${{ matrix.rocm_version }} (hash: $CURRENT_HASH)"
+            fi
+          fi
+
+          # Build if needed
+          if [ "$REBUILD" = true ]; then
+            apptainer build --force ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif "$DEF_FILE"
+            echo "$CURRENT_HASH" > "$HASH_FILE"
+            echo "Successfully built and stored hash: $CURRENT_HASH"
           fi
   test-1-2-4-ranks:
-    name: Pip Install Test 1/2/4 Ranks (Parallel)
+    name: Pip Install Test 1/2/4 Ranks (Parallel) - ROCm ${{ matrix.rocm_version }}
     needs: build-apptainer-image
     runs-on: [self-hosted, mi3008x]
     timeout-minutes: 30
+    strategy:
+      matrix:
+        rocm_version: ["6.3.1", "7.0"]
 
     steps:
       - name: Checkout repository
@@ -72,7 +102,7 @@ jobs:
           echo "Starting 1-rank test on GPUs 0,1..."
           apptainer exec --overlay "${OVERLAY_1}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="0,1" \
             --bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
-            ~/apptainer/iris-dev.sif bash -c "
+            ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif bash -c "
               pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }}
               bash .github/scripts/run_tests.sh 1
             " &
@@ -81,7 +111,7 @@ jobs:
           echo "Starting 2-rank test on GPUs 2,3..."
           apptainer exec --overlay "${OVERLAY_2}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="2,3" \
             --bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
-            ~/apptainer/iris-dev.sif bash -c "
+            ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif bash -c "
               pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }}
               bash .github/scripts/run_tests.sh 2
             " &
@@ -90,7 +120,7 @@ jobs:
           echo "Starting 4-rank test on GPUs 4,5,6,7..."
           apptainer exec --overlay "${OVERLAY_4}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="4,5,6,7" \
             --bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
-            ~/apptainer/iris-dev.sif bash -c "
+            ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif bash -c "
               pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }}
               bash .github/scripts/run_tests.sh 4
             " &
@@ -120,10 +150,13 @@ jobs:
           echo "✅ All parallel tests (1, 2, 4 ranks) passed!"
 
   test-8-ranks:
-    name: Pip Install Test 8 Ranks
+    name: Pip Install Test 8 Ranks - ROCm ${{ matrix.rocm_version }}
     needs: build-apptainer-image
     runs-on: [self-hosted, mi3008x]
     timeout-minutes: 30
+    strategy:
+      matrix:
+        rocm_version: ["6.3.1", "7.0"]
 
     steps:
       - name: Checkout repository
@@ -143,7 +176,7 @@ jobs:
           echo "::group::Running 8-rank test on all GPUs"
           apptainer exec --overlay "${OVERLAY_8}" --no-home --cleanenv --env HIP_VISIBLE_DEVICES="0,1,2,3,4,5,6,7" \
             --bind "${PWD}:/iris_workspace" --cwd /iris_workspace \
-            ~/apptainer/iris-dev.sif bash -c "
+            ~/apptainer/iris-dev-rocm${{ matrix.rocm_version }}.sif bash -c "
               pip install git+https://github.com/${{ github.repository }}.git@${{ github.sha }}
               bash .github/scripts/run_tests.sh 8
             "
diff --git a/apptainer/iris-rocm6.3.1.def b/apptainer/iris-rocm6.3.1.def
new file mode 100644
index 00000000..9974fd34
--- /dev/null
+++ b/apptainer/iris-rocm6.3.1.def
@@ -0,0 +1,38 @@
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+
+Bootstrap: docker
+From: rocm/pytorch:rocm6.3.1_ubuntu22.04_py3.10_pytorch
+
+%post
+    # set -e makes the build fail at the first failing command; without it only
+    # the exit status of the last command in the bash -c script is reported.
+    /bin/bash -c "
+    set -e
+    apt-get update && apt-get install -y git
+    export TRITON_PATH=/workspace/triton
+    conda env list
+    source /opt/conda/bin/activate py_3.10
+    conda install -y -n py_3.10 -c conda-forge jupyter ninja cmake wheel
+    git clone https://github.com/triton-lang/triton.git \$TRITON_PATH
+    cd \$TRITON_PATH
+    git checkout dd5823453bcc7973eabadb65f9d827c43281c434
+    pip install -e .
+    wget https://github.com/ROCm/rocprofiler-systems/releases/download/rocm-6.3.1/rocprofiler-systems-install.py
+    python3 ./rocprofiler-systems-install.py --prefix /opt/rocprofiler-systems --rocm 6.3
+    "
+
+%environment
+    # Define environment variables
+    export TRITON_PATH=/workspace/triton
+    export PYTHONPATH=$TRITON_PATH/python/
+    export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
+    export ROCM_PATH=/opt/rocm
+    export PATH=/opt/conda/envs/py_3.10/bin:/opt/rocm/bin:$PATH
+    export OMPI_MCA_mtl="^ofi"
+    export OMPI_MCA_pml="ob1"
+
+%runscript
+    echo "Welcome to the ROCm-aware Apptainer image!"
+    source /opt/conda/bin/activate py_3.10
+    exec "$@"
diff --git a/apptainer/iris-rocm7.0.def b/apptainer/iris-rocm7.0.def
new file mode 100644
index 00000000..1989575d
--- /dev/null
+++ b/apptainer/iris-rocm7.0.def
@@ -0,0 +1,39 @@
+# SPDX-License-Identifier: MIT
+# Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved.
+
+Bootstrap: docker
+From: rocm/pytorch:rocm7.0_ubuntu22.04_py3.10_pytorch_release_2.8.0
+
+%post
+    # set -e makes the build fail at the first failing command; without it only
+    # the exit status of the last command in the bash -c script is reported.
+    /bin/bash -c "
+    set -e
+    apt-get update && apt-get install -y git
+    export TRITON_PATH=/workspace/triton
+    #conda env list
+    #source /opt/conda/bin/activate py_3.10
+    #conda install -y -n py_3.10 -c conda-forge jupyter ninja cmake wheel
+    git clone https://github.com/triton-lang/triton.git \$TRITON_PATH
+    cd \$TRITON_PATH
+    git checkout aafec417bded34db6308f5b3d6023daefae43905
+    pip install -e .
+    "
+
+%environment
+    # Define environment variables
+    export TRITON_PATH=/workspace/triton
+    export PYTHONPATH=$TRITON_PATH/python/
+    export LD_LIBRARY_PATH=/opt/rocm/lib:$LD_LIBRARY_PATH
+    export ROCM_PATH=/opt/rocm
+    export PATH=/opt/conda/envs/py_3.10/bin:/opt/rocm/bin:$PATH
+    export OMPI_MCA_mtl="^ofi"
+    export OMPI_MCA_pml="ob1"
+
+%runscript
+    echo "Welcome to the ROCm-aware Apptainer image!"
+    # NOTE(review): the conda steps in %post are commented out for this image, yet
+    # this script and PATH above still reference the py_3.10 conda env. Confirm the
+    # ROCm 7.0 base image ships /opt/conda with py_3.10, or drop the activation.
+    source /opt/conda/bin/activate py_3.10
+    exec "$@"