Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 7 additions & 12 deletions .github/actions/linux-testenv/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,11 @@ name: Setup Test Environment
inputs:
pytorch:
type: string
default: 'main'
default: 'https://github.com/daisyden/pytorch.git@distributed_2.10'
description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch'
torch_xpu_ops:
type: string
default: 'main'
default: 'daisyden/distributed_2.10'
description: Torch-xpu-ops version, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin
python:
type: string
Expand Down Expand Up @@ -69,9 +69,9 @@ runs:
fi
TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)')
if [[ "${{ inputs.pytorch }}" == *"https://"* ]];then
PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')"
PYTORCH_REPO="https://github.com/daisyden/pytorch.git"
else
PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
PYTORCH_REPO="https://github.com/daisyden/pytorch.git"
fi
git clone ${PYTORCH_REPO} pytorch
cd pytorch
Expand All @@ -97,14 +97,9 @@ runs:
TORCH_XPU_OPS_COMMIT="${{ inputs.torch_xpu_ops }}"
fi
fi
if [ "${{ github.event_name }}" == "pull_request" ] && [[ "${{ inputs.pytorch }}" != *"_wheel" ]];then
cp -r ${{ github.workspace }}/torch-xpu-ops third_party/torch-xpu-ops
cd third_party/torch-xpu-ops
else
git clone ${TORCH_XPU_OPS_REPO} third_party/torch-xpu-ops
cd third_party/torch-xpu-ops
git checkout ${TORCH_XPU_OPS_COMMIT}
fi
git clone ${TORCH_XPU_OPS_REPO} third_party/torch-xpu-ops
cd third_party/torch-xpu-ops
git checkout ${TORCH_XPU_OPS_COMMIT}
git status && git diff && git show -s
- name: Install E2E Requirements
shell: bash -xe {0}
Expand Down
8 changes: 3 additions & 5 deletions .github/actions/linux-uttest/action.yml
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ runs:
tee ${{ github.workspace }}/ut_log/op_ut/op_ut_with_skip_test.log
ls -al
cp *.xml ${{ github.workspace }}/ut_log
find op_ut_with_skip_nn op_ut_with_skip_quantization/core -type f -exec sh -c '
find op_ut_with_skip_nn op_ut_with_skip_quantization/core op_ut_with_all_functorch -type f -exec sh -c '
dir_path=$(dirname "$1");
case "$dir_path" in
*"op_ut_with_skip_quantization/core"*)
Expand All @@ -90,6 +90,7 @@ runs:
ls -al op_ut_with_skip_nn op_ut_with_skip_quantization/core
cp op_ut_with_skip_nn/*.xml ${{ github.workspace }}/ut_log
cp op_ut_with_skip_quantization/core/*.xml ${{ github.workspace }}/ut_log
cp op_ut_with_all_functorch/*.xml ${{ github.workspace }}/ut_log
# Cases run with an on-demand white list, since some suites are too
# slow to go through all operators on CPU. So add cases on-demand
# when XPU implementation is done.
Expand Down Expand Up @@ -180,10 +181,7 @@ runs:
echo -e "[ERROR] XCCL is not enabled"
exit 1
fi
export CCL_ROOT=$(dirname $(which python))/../
export PATH="${CCL_ROOT}/bin/libfabric:${PATH}"
export LD_LIBRARY_PATH="${CCL_ROOT}/lib:${LD_LIBRARY_PATH}"
python run_distributed.py \
python run_distributed_local.py \
2> ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \
tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log
find ../ -type f -name "*.xml" -exec cp {} ${{ github.workspace }}/ut_log/ \;
Expand Down
45 changes: 22 additions & 23 deletions .github/scripts/build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ done

# Set pytorch
rm -rf ${WORKSPACE}/pytorch
git clone ${PYTORCH_REPO} ${WORKSPACE}/pytorch
git clone https://github.com/daisyden/pytorch.git ${WORKSPACE}/pytorch
cd ${WORKSPACE}/pytorch
git checkout ${PYTORCH_COMMIT}
git remote -v && git branch && git show -s
Expand All @@ -44,33 +44,32 @@ git remote -v && git branch && git show -s
# Pre Build
cd ${WORKSPACE}/pytorch
python -m pip install requests
python third_party/torch-xpu-ops/.github/scripts/apply_torch_pr.py
git submodule sync && git submodule update --init --recursive
python -m pip install -r requirements.txt
python -m pip install mkl-static==2025.2.0 mkl-include==2025.2.0
python -m pip install mkl-static mkl-include
export USE_STATIC_MKL=1
if [ "${XPU_ONEAPI_PATH}" == "" ];then
export PYTORCH_EXTRA_INSTALL_REQUIREMENTS=" \
intel-cmplr-lib-rt==2025.2.1 | \
intel-cmplr-lib-ur==2025.2.1 | \
intel-cmplr-lic-rt==2025.2.1 | \
intel-sycl-rt==2025.2.1 | \
oneccl-devel==2021.16.1 | \
oneccl==2021.16.1 | \
impi-rt==2021.16.1 | \
onemkl-sycl-blas==2025.2.0 | \
onemkl-sycl-dft==2025.2.0 | \
onemkl-sycl-lapack==2025.2.0 | \
onemkl-sycl-rng==2025.2.0 | \
onemkl-sycl-sparse==2025.2.0 | \
dpcpp-cpp-rt==2025.2.1 | \
intel-opencl-rt==2025.2.1 | \
mkl==2025.2.0 | \
intel-openmp==2025.2.1 | \
tbb==2022.2.0 | \
tcmlib==1.4.0 | \
umf==0.11.0 | \
intel-pti==0.13.1
intel-cmplr-lib-rt | \
intel-cmplr-lib-ur | \
intel-cmplr-lic-rt | \
intel-sycl-rt | \
oneccl-devel | \
oneccl | \
impi-rt | \
onemkl-sycl-blas | \
onemkl-sycl-dft | \
onemkl-sycl-lapack | \
onemkl-sycl-rng | \
onemkl-sycl-sparse | \
dpcpp-cpp-rt | \
intel-opencl-rt | \
mkl | \
intel-openmp | \
tbb | \
tcmlib | \
umf | \
intel-pti
"
fi

Expand Down
36 changes: 19 additions & 17 deletions .github/workflows/_linux_build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,11 @@ on:
description: Runner label
pytorch:
type: string
default: 'main'
default: 'https://github.com/daisyden/pytorch.git@distributed_2.10'
description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch'
torch_xpu_ops:
type: string
default: 'main'
default: 'daisyden/distributed_2.10'
description: Torch-xpu-ops main by default, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin
triton:
required: false
Expand Down Expand Up @@ -59,7 +59,7 @@ jobs:
if: ${{ ! endsWith(inputs.pytorch, '_wheel') }}
runs-on: ${{ needs.runner.outputs.runner_id }}
container:
image: 'pytorch/manylinux2_28-builder:xpu-2.9'
image: 'intelgpu/ubuntu-22.04-lts2:2523.31'
volumes:
- ${{ github.workspace }}:${{ github.workspace }}
env:
Expand All @@ -72,21 +72,30 @@ jobs:
steps:
- name: Install gh-cli
run: |
rm -rf ./*.whl ./*.log
cat /etc/os-release
hostname && id
# install gh
dnf install -y 'dnf-command(config-manager)'
dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo
dnf install -y gh --repo gh-cli
gh --version
sudo apt update
sudo apt install -y gpg-agent wget curl cmake git unzip zip libgl1 zlib1g-dev numactl \
libglib2.0-dev rsync jq gcc-11 g++-11 python3-dev python3-venv gh
- name: Setup python-${{ inputs.python }}
run: |
rm -rf /tmp/xpu-tool/myvenv
local_python=$(echo ${{ inputs.python }} |awk -F. '{printf("cp%s%s-cp%s%s", $1, $2, $1, $2)}')
/opt/python/${local_python}/bin/python -m venv /tmp/xpu-tool/myvenv
curl -LsSf https://astral.sh/uv/install.sh | sh
source $HOME/.local/bin/env
uv venv /tmp/xpu-tool/myvenv --python 3.10 --clear
source /tmp/xpu-tool/myvenv/bin/activate
which python && python -V
which pip && pip list
pip install -U pip wheel setuptools
uv pip install -U pip wheel setuptools
- name: Install oneapi
run: |
rm -rf /opt/intel/oneapi
wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/aa5447b5-3644-43c8-8ec4-72d53f6ecc19/intel-deep-learning-essentials-2025.3.0.338_offline.sh
sudo bash intel-deep-learning-essentials-2025.3.0.338_offline.sh -a -s --eula accept
source /opt/intel/oneapi/setvars.sh
icpx --version
- name: Checkout torch-xpu-ops
uses: actions/checkout@v4
with:
Expand All @@ -113,8 +122,6 @@ jobs:
TORCH_XPU_OPS_REPO="https://github.com/intel/torch-xpu-ops.git"
TORCH_XPU_OPS_COMMIT="${{ inputs.torch_xpu_ops }}"
fi
# gcc 11
source /opt/rh/gcc-toolset-11/enable
source ${{ github.workspace }}/torch-xpu-ops/.github/scripts/env.sh
${{ github.workspace }}/torch-xpu-ops/.github/scripts/build.sh \
--WORKSPACE="${{ github.workspace }}" \
Expand All @@ -129,8 +136,6 @@ jobs:
fi
- name: Build Torchvision and Torchaudio
run: |
# gcc 11
source /opt/rh/gcc-toolset-11/enable
cd ./pytorch
TORCHVISION_COMMIT_ID="$(cat .github/ci_commit_pins/vision.txt)"
TORCHAUDIO_COMMIT_ID="$(cat .github/ci_commit_pins/audio.txt)"
Expand Down Expand Up @@ -177,9 +182,6 @@ jobs:
curl -sSL https://raw.githubusercontent.com/intel/intel-xpu-backend-for-triton/${TRITON_COMMIT_ID}/python/triton/__init__.py 2>&1 |\
grep '__version__' |head -n 1 |awk -F "'" '{print $2}'
)"
# gcc 13
dnf install -y gcc-toolset-13-gcc-c++ zlib-devel
source /opt/rh/gcc-toolset-13/enable
pip install cmake ninja pybind11
python .github/scripts/build_triton_wheel.py --device xpu --commit-hash ${TRITON_COMMIT_ID} --triton-version ${TRITON_VERSION_NAME} \
2>&1 |tee ${{ github.workspace }}/build_triton_${TRITON_COMMIT_ID}.log
Expand Down
8 changes: 4 additions & 4 deletions .github/workflows/_linux_ut.yml
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,11 @@ on:
description: Runner label
pytorch:
type: string
default: 'main'
default: 'https://github.com/daisyden/pytorch.git@distributed_2.10'
description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch'
torch_xpu_ops:
type: string
default: 'main'
default: 'daisyden/distributed_2.10'
description: Torch-xpu-ops version, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin
python:
type: string
Expand Down Expand Up @@ -97,12 +97,12 @@ jobs:

test-in-baremetal:
needs: runner
timeout-minutes: 600
timeout-minutes: 1200
if: ${{ contains(inputs.ut, 'distributed') }}
runs-on: ${{ needs.runner.outputs.runner_id }}
env:
AGENT_TOOLSDIRECTORY: /tmp/xpu-tool
PYTEST_ADDOPTS: -v --timeout 3600 --timeout_method=thread -n 1
PYTEST_ADDOPTS: -v
steps:
- name: Checkout torch-xpu-ops
uses: actions/checkout@v4
Expand Down
9 changes: 4 additions & 5 deletions .github/workflows/pull.yml
Original file line number Diff line number Diff line change
Expand Up @@ -100,8 +100,8 @@ jobs:
build: [build]
uses: ./.github/workflows/_linux_build.yml
with:
runner: pvc_rolling
pytorch: ${{ needs.conditions-filter.outputs.pytorch }}
pytorch: distributed_2.10
runner: PVC-7358

linux-ut:
needs: [conditions-filter, linux-build]
Expand All @@ -128,9 +128,8 @@ jobs:
ut_name: [xpu_distributed]
uses: ./.github/workflows/_linux_ut.yml
with:
runner: pvc_rolling
pytorch: ${{ needs.conditions-filter.outputs.pytorch }}
torch_xpu_ops: ${{ needs.conditions-filter.outputs.pytorch == 'nightly_wheel' && 'pinned' || 'main' }}
runner: PVC-7358
pytorch: distributed_2.10
ut: ${{ matrix.ut_name }}

linux-e2e:
Expand Down
Loading