From 7e60a70a5d9beec0e6b9fcb470b12f31c4ed66c7 Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 1 Aug 2025 12:13:27 +0100 Subject: [PATCH 01/13] test new version of axlearn --- .github/container/manifest.yaml | 4 ++-- .github/workflows/ci.yaml | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/container/manifest.yaml b/.github/container/manifest.yaml index e122c4338..96aeaaf25 100644 --- a/.github/container/manifest.yaml +++ b/.github/container/manifest.yaml @@ -102,6 +102,6 @@ pathwaysutils: latest_verified_commit: 359776d454940ffaa337c36d1df16308d44a95a9 mode: pip-vcs axlearn: - url: https://github.com/Steboss/axlearn.git - tracking_ref: sbosisio/working_branch + url: https://github.com/apple/axlearn.git + tracking_ref: main mode: git-clone diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 1a6f53ec4..2a3f581e5 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -210,7 +210,7 @@ jobs: CUDA_IMAGE: ${{ needs.metadata.outputs.CUDA_IMAGE }} MANIFEST_ARTIFACT_NAME: ${{ needs.metadata.outputs.MANIFEST_ARTIFACT_NAME }} SOURCE_URLREFS: ${{ needs.bump-manifest.outputs.SOURCE_URLREFS }} - MODE: ${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }} + MODE: "axlearn" #${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }} secrets: inherit arm64: @@ -222,7 +222,7 @@ jobs: CUDA_IMAGE: ${{ needs.metadata.outputs.CUDA_IMAGE }} MANIFEST_ARTIFACT_NAME: ${{ needs.metadata.outputs.MANIFEST_ARTIFACT_NAME }} SOURCE_URLREFS: ${{ needs.bump-manifest.outputs.SOURCE_URLREFS }} - MODE: ${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }} + MODE: "axlearn" #${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }} secrets: inherit # Only merge if everything succeeds From fd721ecadd289d26d9a4d256d1e0215f26acbe7b Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 1 Aug 2025 14:53:38 +0100 Subject: [PATCH 02/13] fix the exclusion list --- .github/container/test-axlearn.sh | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index 288a03a4e..26f3172e5 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -191,15 +191,28 @@ TEST_8_DEVICES_FILES=("gda_test.py" "trainer_test.py" "utils_test.py" ) +# we do not need to test the following TEST_8_DEVICES_WITH_PATHS=() for file in "${TEST_8_DEVICES_FILES[@]}"; do - found_files=$(find . -name "$file" -type f 2>/dev/null) - if [[ -n "$found_files" ]]; then - while IFS= read -r found_file; do + # Handle the ambiguous 'utils_test.py' as a special case. + if [[ "$file" == "utils_test.py" ]]; then + # Find the one specific 'utils_test.py' we want by its full path. + # Adjust the path if your target file is located elsewhere. + found_file=$(find . -path '*/axlearn/common/utils_test.py' -type f 2>/dev/null | head -n 1) + if [[ -n "$found_file" ]]; then TEST_8_DEVICES_WITH_PATHS+=("$found_file") - done <<< "$found_files" + else + echo "Warning: Desired utils_test.py not found at '*/axlearn/common/utils_test.py'" + fi else - echo "Warning: Test file $file not found in current directory structure" + # For all other (unambiguous) files, find them by name. + # This will add all found files to the array. + readarray -t found_files < <(find . -name "$file" -type f 2>/dev/null) + if [ ${#found_files[@]} -gt 0 ]; then + TEST_8_DEVICES_WITH_PATHS+=( "${found_files[@]}" ) + else + echo "Warning: Test file '$file' not found in current directory structure" + fi fi done From 2498214779f4e1c53d7291dd37f785b3be8871b2 Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 1 Aug 2025 14:57:38 +0100 Subject: [PATCH 03/13] fix comment --- .github/container/test-axlearn.sh | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index 26f3172e5..50ba554c3 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -191,13 +191,11 @@ TEST_8_DEVICES_FILES=("gda_test.py" "trainer_test.py" "utils_test.py" ) -# we do not need to test the following TEST_8_DEVICES_WITH_PATHS=() for file in "${TEST_8_DEVICES_FILES[@]}"; do # Handle the ambiguous 'utils_test.py' as a special case. if [[ "$file" == "utils_test.py" ]]; then - # Find the one specific 'utils_test.py' we want by its full path. - # Adjust the path if your target file is located elsewhere. + # We do not need to test cli or gcloud utils_test found_file=$(find . -path '*/axlearn/common/utils_test.py' -type f 2>/dev/null | head -n 1) if [[ -n "$found_file" ]]; then TEST_8_DEVICES_WITH_PATHS+=("$found_file") From bbd74833d24984023a411494901e1c6ae3a6c56b Mon Sep 17 00:00:00 2001 From: Steboss Date: Fri, 1 Aug 2025 15:54:39 +0100 Subject: [PATCH 04/13] reset standard ci' --- .github/workflows/ci.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 2a3f581e5..1a6f53ec4 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -210,7 +210,7 @@ jobs: CUDA_IMAGE: ${{ needs.metadata.outputs.CUDA_IMAGE }} MANIFEST_ARTIFACT_NAME: ${{ needs.metadata.outputs.MANIFEST_ARTIFACT_NAME }} SOURCE_URLREFS: ${{ needs.bump-manifest.outputs.SOURCE_URLREFS }} - MODE: "axlearn" #${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }} + MODE: ${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }} secrets: inherit arm64: @@ -222,7 +222,7 @@ jobs: CUDA_IMAGE: ${{ needs.metadata.outputs.CUDA_IMAGE }} MANIFEST_ARTIFACT_NAME: ${{ needs.metadata.outputs.MANIFEST_ARTIFACT_NAME }} SOURCE_URLREFS: ${{ needs.bump-manifest.outputs.SOURCE_URLREFS }} - MODE: "axlearn" #${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }} + MODE: ${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }} secrets: inherit # Only merge if everything succeeds From 084fb673f4dc5c0e303dd98a8b2d585f910c22d8 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 4 Aug 2025 11:53:39 +0100 Subject: [PATCH 05/13] re-test new axlearn testings --- .github/container/test-axlearn.sh | 52 ++++++++----------- .../axlearn/axlearn-job.yml | 2 +- .github/workflows/ci.yaml | 4 +- 3 files changed, 26 insertions(+), 32 deletions(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index 50ba554c3..60ef2fc03 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -11,9 +11,9 @@ usage() { echo "" echo " OPTIONS DESCRIPTION" echo " -d, --directory DIR Directory to run tests in." - echo " Default: 'axlearn/axlearn/common'." + echo " Default: 'opt/axlearn'." echo " -t, --test-files FILES Pattern for test files to run." - echo " Default: '*_test.py'." + echo " Default: 'axlearn/common/*_test.py'." echo " -o, --output DIRECTORY Output directory for logs and summary." echo " Default: 'test_runs/'." echo " -h, --help Show this help message and exit." @@ -39,7 +39,7 @@ run_tests() { } # DEFAULT VALUES -DIR='/opt/axlearn/axlearn/common' +DIR='/opt/axlearn' TEST_FILES=() OUTPUT_DIRECTORY='' @@ -95,15 +95,6 @@ LOG_DIRECTORY="${OUTPUT_DIRECTORY}/logs" mkdir -p "${LOG_DIRECTORY}" -if [ "${#TEST_FILES[@]}" -gt 0 ]; then - echo " Test Files:" - for f in "${TEST_FILES[@]}"; do - echo " $f" - done -else - echo " Test Files Pattern: '*_test.py' (default)" -fi - # DEPENDENCIES pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu pip install timm transformers scikit-learn grain evaluate prefixed wandb @@ -115,26 +106,28 @@ curl https://huggingface.co/FacebookAI/roberta-base/raw/main/merges.txt -o /opt/ curl https://huggingface.co/FacebookAI/roberta-base/raw/main/vocab.json -o /opt/axlearn/axlearn/data/tokenizers/bpe/roberta-base-vocab.json # RETRIEVE TEST FILES +expanded_test_files=() if [ "${#TEST_FILES[@]}" -eq 0 ]; then - TEST_FILES=("*_test.py") + # if we are not giving anything for --test-files than we can match all those *_test.py files + readarray -t expanded_test_files < <(find . -name "*_test.py" -type f) + # otherwise let's check in the --test-files pattern +else + for pattern in "${TEST_FILES[@]}"; do + readarray -t found_files < <(find . -name "$pattern" -type f) + if [ ${#found_files[@]} -gt 0 ]; then + expanded_test_files+=( "${found_files[@]}" ) + else + echo "Warning: No files found matching pattern '$pattern'" + fi + done fi -expanded_test_files=() -for pattern in "${TEST_FILES[@]}"; do - # retrieve all the files - files=( $pattern ) - if [ "${#files[@]}" -gt 0 ]; then - expanded_test_files+=( "${files[@]}" ) - else - echo "Warning: No files matched pattern '$pattern'" - fi -done - if [ "${#expanded_test_files[@]}" -eq 0 ]; then echo "No test files found to run." exit 1 fi +# EXCLUDE PATTERNS EXCLUDE_PATTERNS=("array_serialization_test.py" "t5_test.py" # tensorflow bug "loss_test.py" @@ -185,11 +178,12 @@ done # RUN TESTS -TEST_8_DEVICES_FILES=("gda_test.py" - "input_base_test.py" - "input_dispatch_test.py" - "trainer_test.py" - "utils_test.py" +TEST_8_DEVICES_FILES=( + "axlearn/common/gda_test.py" + "axlearn/common/input_base_test.py" + "axlearn/common/input_dispatch_test.py" + "axlearn/common/trainer_test.py" + "axlearn/common/utils_test.py" ) TEST_8_DEVICES_WITH_PATHS=() for file in "${TEST_8_DEVICES_FILES[@]}"; do diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index 24e7ca8e7..900a0d4ab 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -22,7 +22,7 @@ spec: LOG_DIR="/output/${RUN_ID}" mkdir -p ${LOG_DIR} # test on JAX, make sure 8 devices are visible - pytest-xdist.sh 8 4 ${LOG_DIR}/axlearn-unittests.jsonl test-axlearn.sh --directory "." --output ${LOG_DIR} --test-files "/opt/axlearn/axlearn/common/*_test.py" + pytest-xdist.sh 8 4 ${LOG_DIR}/axlearn-unittests.jsonl test-axlearn.sh --directory "." --output ${LOG_DIR} --test-files "axlearn/common/*_test.py" env: - name: RUN_ID value: PLACEHOLDER diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 1a6f53ec4..2a3f581e5 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -210,7 +210,7 @@ jobs: CUDA_IMAGE: ${{ needs.metadata.outputs.CUDA_IMAGE }} MANIFEST_ARTIFACT_NAME: ${{ needs.metadata.outputs.MANIFEST_ARTIFACT_NAME }} SOURCE_URLREFS: ${{ needs.bump-manifest.outputs.SOURCE_URLREFS }} - MODE: ${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }} + MODE: "axlearn" #${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }} secrets: inherit arm64: @@ -222,7 +222,7 @@ jobs: CUDA_IMAGE: ${{ needs.metadata.outputs.CUDA_IMAGE }} MANIFEST_ARTIFACT_NAME: ${{ needs.metadata.outputs.MANIFEST_ARTIFACT_NAME }} SOURCE_URLREFS: ${{ needs.bump-manifest.outputs.SOURCE_URLREFS }} - MODE: ${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }} + MODE: "axlearn" #${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }} secrets: inherit # Only merge if everything succeeds From 87f0edbbf980473e1cb1a7a096e2e2f4d2bc1043 Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 4 Aug 2025 13:35:37 +0100 Subject: [PATCH 06/13] correct execution --- .github/eks-workflow-files/axlearn/axlearn-job.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index 900a0d4ab..fbc5603f0 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -21,6 +21,7 @@ spec: LOG_DIR="/output/${RUN_ID}" mkdir -p ${LOG_DIR} + cd /opt/axlearn # test on JAX, make sure 8 devices are visible pytest-xdist.sh 8 4 ${LOG_DIR}/axlearn-unittests.jsonl test-axlearn.sh --directory "." --output ${LOG_DIR} --test-files "axlearn/common/*_test.py" env: From a9dd58a5272c4be7daa3b44ad8db35759f1b484b Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 4 Aug 2025 17:09:04 +0100 Subject: [PATCH 07/13] weird error here --- .github/eks-workflow-files/axlearn/axlearn-job.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index fbc5603f0..4fffede30 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -21,9 +21,9 @@ spec: LOG_DIR="/output/${RUN_ID}" mkdir -p ${LOG_DIR} - cd /opt/axlearn + pwd # test on JAX, make sure 8 devices are visible - pytest-xdist.sh 8 4 ${LOG_DIR}/axlearn-unittests.jsonl test-axlearn.sh --directory "." --output ${LOG_DIR} --test-files "axlearn/common/*_test.py" + pytest-xdist.sh 8 4 ${LOG_DIR}/axlearn-unittests.jsonl test-axlearn.sh --output ${LOG_DIR} --test-files "axlearn/common/*_test.py" env: - name: RUN_ID value: PLACEHOLDER From 8f48ea247f6738e7a26ca6f6e19229c84935ad4d Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 4 Aug 2025 20:45:45 +0100 Subject: [PATCH 08/13] weird error here --- .github/container/test-axlearn.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index 60ef2fc03..57cbac0c0 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -113,6 +113,8 @@ if [ "${#TEST_FILES[@]}" -eq 0 ]; then # otherwise let's check in the --test-files pattern else for pattern in "${TEST_FILES[@]}"; do + echo "looking for pattern: $pattern" + echo "Cmd: find . -name \"$pattern\" -type f" readarray -t found_files < <(find . -name "$pattern" -type f) if [ ${#found_files[@]} -gt 0 ]; then expanded_test_files+=( "${found_files[@]}" ) From 91e5f7387937550983a49711cd666ae1407472de Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 4 Aug 2025 21:48:21 +0100 Subject: [PATCH 09/13] Fix error it s path not name --- .github/container/test-axlearn.sh | 2 +- .github/eks-workflow-files/axlearn/axlearn-job.yml | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index 57cbac0c0..6bf941287 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -115,7 +115,7 @@ else for pattern in "${TEST_FILES[@]}"; do echo "looking for pattern: $pattern" echo "Cmd: find . -name \"$pattern\" -type f" - readarray -t found_files < <(find . -name "$pattern" -type f) + readarray -t found_files < <(find . -path "./$pattern" -type f) if [ ${#found_files[@]} -gt 0 ]; then expanded_test_files+=( "${found_files[@]}" ) else diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index 4fffede30..c28fdbf8c 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -21,9 +21,8 @@ spec: LOG_DIR="/output/${RUN_ID}" mkdir -p ${LOG_DIR} - pwd # test on JAX, make sure 8 devices are visible - pytest-xdist.sh 8 4 ${LOG_DIR}/axlearn-unittests.jsonl test-axlearn.sh --output ${LOG_DIR} --test-files "axlearn/common/*_test.py" + pytest-xdist.sh 8 4 ${LOG_DIR}/axlearn-unittests.jsonl test-axlearn.sh --output ${LOG_DIR} --test-files "*_test.py" env: - name: RUN_ID value: PLACEHOLDER From ccee3f0fb0c080e17af24fc447297c4568be461c Mon Sep 17 00:00:00 2001 From: Steboss Date: Mon, 4 Aug 2025 21:49:46 +0100 Subject: [PATCH 10/13] fix Docker and CI --- .github/container/Dockerfile.axlearn | 2 +- .github/workflows/ci.yaml | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/container/Dockerfile.axlearn b/.github/container/Dockerfile.axlearn index 736e04a50..8a44385a0 100644 --- a/.github/container/Dockerfile.axlearn +++ b/.github/container/Dockerfile.axlearn @@ -1,6 +1,6 @@ # syntax=docker/dockerfile:1-labs ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax -ARG URLREF_AXLEARN=https://github.com/Steboss/axlearn.git#sbosisio/working_branch +ARG URLREF_AXLEARN=https://github.com/apple/axlearn.git ARG SRC_PATH_AXLEARN=/opt/axlearn ############################################################################### diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 2a3f581e5..1a6f53ec4 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -210,7 +210,7 @@ jobs: CUDA_IMAGE: ${{ needs.metadata.outputs.CUDA_IMAGE }} MANIFEST_ARTIFACT_NAME: ${{ needs.metadata.outputs.MANIFEST_ARTIFACT_NAME }} SOURCE_URLREFS: ${{ needs.bump-manifest.outputs.SOURCE_URLREFS }} - MODE: "axlearn" #${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }} + MODE: ${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }} secrets: inherit arm64: @@ -222,7 +222,7 @@ jobs: CUDA_IMAGE: ${{ needs.metadata.outputs.CUDA_IMAGE }} MANIFEST_ARTIFACT_NAME: ${{ needs.metadata.outputs.MANIFEST_ARTIFACT_NAME }} SOURCE_URLREFS: ${{ needs.bump-manifest.outputs.SOURCE_URLREFS }} - MODE: "axlearn" #${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }} + MODE: ${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }} secrets: inherit # Only merge if everything succeeds From 026c984d02c235b8118e2c5503b97458fffafff6 Mon Sep 17 00:00:00 2001 From: Steboss Date: Tue, 5 Aug 2025 09:07:58 +0100 Subject: [PATCH 11/13] fix the test files --- .github/eks-workflow-files/axlearn/axlearn-job.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/eks-workflow-files/axlearn/axlearn-job.yml b/.github/eks-workflow-files/axlearn/axlearn-job.yml index c28fdbf8c..0a77f4c86 100644 --- a/.github/eks-workflow-files/axlearn/axlearn-job.yml +++ b/.github/eks-workflow-files/axlearn/axlearn-job.yml @@ -22,7 +22,7 @@ spec: LOG_DIR="/output/${RUN_ID}" mkdir -p ${LOG_DIR} # test on JAX, make sure 8 devices are visible - pytest-xdist.sh 8 4 ${LOG_DIR}/axlearn-unittests.jsonl test-axlearn.sh --output ${LOG_DIR} --test-files "*_test.py" + pytest-xdist.sh 8 4 ${LOG_DIR}/axlearn-unittests.jsonl test-axlearn.sh --output ${LOG_DIR} --test-files "axlearn/common/*_test.py" env: - name: RUN_ID value: PLACEHOLDER From 711f0f0fb60657c0ef0379b2ce5455c4fc7918fb Mon Sep 17 00:00:00 2001 From: Steboss Date: Thu, 7 Aug 2025 18:19:45 +0100 Subject: [PATCH 12/13] fix comment --- .github/container/test-axlearn.sh | 34 ++++++------------------------- 1 file changed, 6 insertions(+), 28 deletions(-) diff --git a/.github/container/test-axlearn.sh b/.github/container/test-axlearn.sh index 6bf941287..4cd21cd9d 100755 --- a/.github/container/test-axlearn.sh +++ b/.github/container/test-axlearn.sh @@ -180,35 +180,13 @@ done # RUN TESTS -TEST_8_DEVICES_FILES=( - "axlearn/common/gda_test.py" - "axlearn/common/input_base_test.py" - "axlearn/common/input_dispatch_test.py" - "axlearn/common/trainer_test.py" - "axlearn/common/utils_test.py" +TEST_8_DEVICES_WITH_PATHS=( + "./axlearn/common/gda_test.py" + "./axlearn/common/input_base_test.py" + "./axlearn/common/input_dispatch_test.py" + "./axlearn/common/trainer_test.py" + "./axlearn/common/utils_test.py" ) -TEST_8_DEVICES_WITH_PATHS=() -for file in "${TEST_8_DEVICES_FILES[@]}"; do - # Handle the ambiguous 'utils_test.py' as a special case. - if [[ "$file" == "utils_test.py" ]]; then - # We do not need to test cli or gcloud utils_test - found_file=$(find . -path '*/axlearn/common/utils_test.py' -type f 2>/dev/null | head -n 1) - if [[ -n "$found_file" ]]; then - TEST_8_DEVICES_WITH_PATHS+=("$found_file") - else - echo "Warning: Desired utils_test.py not found at '*/axlearn/common/utils_test.py'" - fi - else - # For all other (unambiguous) files, find them by name. - # This will add all found files to the array. - readarray -t found_files < <(find . -name "$file" -type f 2>/dev/null) - if [ ${#found_files[@]} -gt 0 ]; then - TEST_8_DEVICES_WITH_PATHS+=( "${found_files[@]}" ) - else - echo "Warning: Test file '$file' not found in current directory structure" - fi - fi -done run_tests "" "for_8_devices" "8_dev" "${TEST_8_DEVICES_WITH_PATHS[@]}" # All the other tests From 83de72805c291e88064dd0705f68dc2cf2f2a4bc Mon Sep 17 00:00:00 2001 From: Steboss Date: Thu, 14 Aug 2025 18:30:45 +0200 Subject: [PATCH 13/13] trigger nccl tests --- .github/eks-workflow-files/mpi-nccl-test.yml | 2 +- .github/workflows/_ci.yaml | 7 ++++++- .github/workflows/ci.yaml | 7 ++++--- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/.github/eks-workflow-files/mpi-nccl-test.yml b/.github/eks-workflow-files/mpi-nccl-test.yml index 0e34cb7a2..e02ddf45c 100644 --- a/.github/eks-workflow-files/mpi-nccl-test.yml +++ b/.github/eks-workflow-files/mpi-nccl-test.yml @@ -71,7 +71,7 @@ spec: resources: limits: nvidia.com/gpu: 8 - hugepages-2Mi: 5120Mi + #hugepages-2Mi: 5120Mi vpc.amazonaws.com/efa: 32 memory: 32000Mi imagePullSecrets: diff --git a/.github/workflows/_ci.yaml b/.github/workflows/_ci.yaml index aafc32fdb..86ee9b239 100644 --- a/.github/workflows/_ci.yaml +++ b/.github/workflows/_ci.yaml @@ -54,7 +54,12 @@ jobs: secrets: inherit test-nccl: - if: inputs.ARCHITECTURE == 'amd64' # build only amd64 + if: >- + inputs.ARCHITECTURE == 'amd64' && + ( + inputs.MODE == 'full' || + inputs.MODE == 'nccl' + ) needs: build-base uses: ./.github/workflows/_test_nccl.yaml with: diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 1a6f53ec4..3553e46fd 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -51,7 +51,8 @@ on: - t5x - run build rosetta - maxtext - run only the tests for maxtext - axlearn - run only the tests for axlearn - options: [full, jax, te, t5x, maxtext, axlearn] + - nccl - run only the nccl tests + options: [full, jax, te, t5x, maxtext, axlearn, nccl] default: full concurrency: @@ -210,7 +211,7 @@ jobs: CUDA_IMAGE: ${{ needs.metadata.outputs.CUDA_IMAGE }} MANIFEST_ARTIFACT_NAME: ${{ needs.metadata.outputs.MANIFEST_ARTIFACT_NAME }} SOURCE_URLREFS: ${{ needs.bump-manifest.outputs.SOURCE_URLREFS }} - MODE: ${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }} + MODE: "nccl" # ${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }} secrets: inherit arm64: @@ -222,7 +223,7 @@ jobs: CUDA_IMAGE: ${{ needs.metadata.outputs.CUDA_IMAGE }} MANIFEST_ARTIFACT_NAME: ${{ needs.metadata.outputs.MANIFEST_ARTIFACT_NAME }} SOURCE_URLREFS: ${{ needs.bump-manifest.outputs.SOURCE_URLREFS }} - MODE: ${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }} + MODE: "nccl" # ${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }} secrets: inherit # Only merge if everything succeeds