Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/container/Dockerfile.axlearn
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# syntax=docker/dockerfile:1-labs
ARG BASE_IMAGE=ghcr.io/nvidia/jax-mealkit:jax
ARG URLREF_AXLEARN=https://github.com/Steboss/axlearn.git#sbosisio/working_branch
ARG URLREF_AXLEARN=https://github.com/apple/axlearn.git
ARG SRC_PATH_AXLEARN=/opt/axlearn

###############################################################################
Expand Down
4 changes: 2 additions & 2 deletions .github/container/manifest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -102,8 +102,8 @@ pathwaysutils:
latest_verified_commit: 359776d454940ffaa337c36d1df16308d44a95a9
mode: pip-vcs
axlearn:
url: https://github.com/Steboss/axlearn.git
tracking_ref: sbosisio/working_branch
url: https://github.com/apple/axlearn.git
tracking_ref: main
mode: git-clone
qwix:
url: https://github.com/google/qwix.git
Expand Down
65 changes: 25 additions & 40 deletions .github/container/test-axlearn.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,9 @@ usage() {
echo ""
echo " OPTIONS DESCRIPTION"
echo " -d, --directory DIR Directory to run tests in."
echo " Default: 'axlearn/axlearn/common'."
echo " Default: 'opt/axlearn'."
echo " -t, --test-files FILES Pattern for test files to run."
echo " Default: '*_test.py'."
echo " Default: 'axlearn/common/*_test.py'."
echo " -o, --output DIRECTORY Output directory for logs and summary."
echo " Default: 'test_runs/<timestamp>'."
echo " -h, --help Show this help message and exit."
Expand All @@ -39,7 +39,7 @@ run_tests() {
}

# DEFAULT VALUES
DIR='/opt/axlearn/axlearn/common'
DIR='/opt/axlearn'
TEST_FILES=()
OUTPUT_DIRECTORY=''

Expand Down Expand Up @@ -95,15 +95,6 @@ LOG_DIRECTORY="${OUTPUT_DIRECTORY}/logs"

mkdir -p "${LOG_DIRECTORY}"

if [ "${#TEST_FILES[@]}" -gt 0 ]; then
echo " Test Files:"
for f in "${TEST_FILES[@]}"; do
echo " $f"
done
else
echo " Test Files Pattern: '*_test.py' (default)"
fi

# DEPENDENCIES
pip install torch torchvision --index-url https://download.pytorch.org/whl/cpu
pip install timm transformers scikit-learn grain evaluate prefixed wandb
Expand All @@ -115,26 +106,30 @@ curl https://huggingface.co/FacebookAI/roberta-base/raw/main/merges.txt -o /opt/
curl https://huggingface.co/FacebookAI/roberta-base/raw/main/vocab.json -o /opt/axlearn/axlearn/data/tokenizers/bpe/roberta-base-vocab.json

# RETRIEVE TEST FILES
expanded_test_files=()
if [ "${#TEST_FILES[@]}" -eq 0 ]; then
TEST_FILES=("*_test.py")
# if we are not giving anything for --test-files then we can match all those *_test.py files
readarray -t expanded_test_files < <(find . -name "*_test.py" -type f)
# otherwise let's check in the --test-files pattern
else
for pattern in "${TEST_FILES[@]}"; do
echo "looking for pattern: $pattern"
echo "Cmd: find . -name \"$pattern\" -type f"
readarray -t found_files < <(find . -path "./$pattern" -type f)
if [ ${#found_files[@]} -gt 0 ]; then
expanded_test_files+=( "${found_files[@]}" )
else
echo "Warning: No files found matching pattern '$pattern'"
fi
done
fi

#######################################
# Expand test-file patterns into concrete paths.
#
# Each argument is word-split and glob-expanded relative to the current
# working directory. Only expansions that exist on disk are kept: without
# `nullglob`, Bash leaves an unmatched glob as the literal pattern text, so
# checking the array length alone can never detect "no match".
#
# Globals:   expanded_test_files (appended to)
# Arguments: $@ - patterns (e.g. 'axlearn/common/*_test.py')
# Outputs:   a warning on stdout for every pattern that matches nothing
#######################################
expand_test_patterns() {
  local pattern candidate
  local -a matched
  for pattern in "$@"; do
    matched=()
    # Intentionally unquoted so the shell performs word-splitting and
    # filename expansion on the pattern, as the caller expects.
    for candidate in $pattern; do
      # Drop the literal pattern left behind by a glob that matched nothing
      # (and any plain path that does not exist).
      if [ -e "$candidate" ]; then
        matched+=("$candidate")
      fi
    done
    if [ "${#matched[@]}" -gt 0 ]; then
      expanded_test_files+=("${matched[@]}")
    else
      echo "Warning: No files matched pattern '$pattern'"
    fi
  done
}

expanded_test_files=()
expand_test_patterns "${TEST_FILES[@]}"

if [ "${#expanded_test_files[@]}" -eq 0 ]; then
echo "No test files found to run."
exit 1
fi

# EXCLUDE PATTERNS
EXCLUDE_PATTERNS=("array_serialization_test.py"
"t5_test.py" # tensorflow bug
"loss_test.py"
Expand Down Expand Up @@ -185,23 +180,13 @@ done


# RUN TESTS
TEST_8_DEVICES_FILES=("gda_test.py"
"input_base_test.py"
"input_dispatch_test.py"
"trainer_test.py"
"utils_test.py"
TEST_8_DEVICES_WITH_PATHS=(
"./axlearn/common/gda_test.py"
"./axlearn/common/input_base_test.py"
"./axlearn/common/input_dispatch_test.py"
"./axlearn/common/trainer_test.py"
"./axlearn/common/utils_test.py"
)
TEST_8_DEVICES_WITH_PATHS=()
for file in "${TEST_8_DEVICES_FILES[@]}"; do
found_files=$(find . -name "$file" -type f 2>/dev/null)
if [[ -n "$found_files" ]]; then
while IFS= read -r found_file; do
TEST_8_DEVICES_WITH_PATHS+=("$found_file")
done <<< "$found_files"
else
echo "Warning: Test file $file not found in current directory structure"
fi
done

run_tests "" "for_8_devices" "8_dev" "${TEST_8_DEVICES_WITH_PATHS[@]}"
# All the other tests
Expand Down
2 changes: 1 addition & 1 deletion .github/eks-workflow-files/axlearn/axlearn-job.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ spec:
LOG_DIR="/output/${RUN_ID}"
mkdir -p ${LOG_DIR}
# test on JAX, make sure 8 devices are visible
pytest-xdist.sh 8 4 ${LOG_DIR}/axlearn-unittests.jsonl test-axlearn.sh --directory "." --output ${LOG_DIR} --test-files "/opt/axlearn/axlearn/common/*_test.py"
pytest-xdist.sh 8 4 ${LOG_DIR}/axlearn-unittests.jsonl test-axlearn.sh --output ${LOG_DIR} --test-files "axlearn/common/*_test.py"
env:
- name: RUN_ID
value: PLACEHOLDER
Expand Down
2 changes: 1 addition & 1 deletion .github/eks-workflow-files/mpi-nccl-test.yml
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ spec:
resources:
limits:
nvidia.com/gpu: 8
hugepages-2Mi: 5120Mi
#hugepages-2Mi: 5120Mi
vpc.amazonaws.com/efa: 32
memory: 32000Mi
imagePullSecrets:
Expand Down
7 changes: 6 additions & 1 deletion .github/workflows/_ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,12 @@ jobs:
secrets: inherit

test-nccl:
if: inputs.ARCHITECTURE == 'amd64' # build only amd64
if: >-
inputs.ARCHITECTURE == 'amd64' &&
(
inputs.MODE == 'full' ||
inputs.MODE == 'nccl'
)
needs: build-base
uses: ./.github/workflows/_test_nccl.yaml
with:
Expand Down
7 changes: 4 additions & 3 deletions .github/workflows/ci.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,8 @@ on:
- t5x - run build rosetta
- maxtext - run only the tests for maxtext
- axlearn - run only the tests for axlearn
options: [full, jax, te, t5x, maxtext, axlearn]
- nccl - run only the nccl tests
options: [full, jax, te, t5x, maxtext, axlearn, nccl]
default: full

concurrency:
Expand Down Expand Up @@ -210,7 +211,7 @@ jobs:
CUDA_IMAGE: ${{ needs.metadata.outputs.CUDA_IMAGE }}
MANIFEST_ARTIFACT_NAME: ${{ needs.metadata.outputs.MANIFEST_ARTIFACT_NAME }}
SOURCE_URLREFS: ${{ needs.bump-manifest.outputs.SOURCE_URLREFS }}
MODE: ${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }}
MODE: "nccl" # ${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }}
secrets: inherit

arm64:
Expand All @@ -222,7 +223,7 @@ jobs:
CUDA_IMAGE: ${{ needs.metadata.outputs.CUDA_IMAGE }}
MANIFEST_ARTIFACT_NAME: ${{ needs.metadata.outputs.MANIFEST_ARTIFACT_NAME }}
SOURCE_URLREFS: ${{ needs.bump-manifest.outputs.SOURCE_URLREFS }}
MODE: ${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }}
MODE: "nccl" # ${{ github.event_name == 'workflow_dispatch' && inputs.MODE || 'full' }}
secrets: inherit

# Only merge if everything succeeds
Expand Down
Loading