diff --git a/.github/actions/linux-testenv/action.yml b/.github/actions/linux-testenv/action.yml
index ba5eb8a25..c19d12c99 100644
--- a/.github/actions/linux-testenv/action.yml
+++ b/.github/actions/linux-testenv/action.yml
@@ -3,11 +3,11 @@ name: Setup Test Environment
 inputs:
   pytorch:
     type: string
-    default: 'main'
+    default: 'https://github.com/daisyden/pytorch.git@distributed_2.10'
     description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch'
   torch_xpu_ops:
     type: string
-    default: 'main'
+    default: 'daisyden/distributed_2.10'
     description: Torch-xpu-ops version, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin
   python:
     type: string
@@ -69,9 +69,9 @@ runs:
         fi
         TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)')
         if [[ "${{ inputs.pytorch }}" == *"https://"* ]];then
-          PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')"
+          PYTORCH_REPO="https://github.com/daisyden/pytorch.git"
         else
-          PYTORCH_REPO="https://github.com/pytorch/pytorch.git"
+          PYTORCH_REPO="https://github.com/daisyden/pytorch.git"
         fi
         git clone ${PYTORCH_REPO} pytorch
         cd pytorch
@@ -97,14 +97,9 @@ runs:
             TORCH_XPU_OPS_COMMIT="${{ inputs.torch_xpu_ops }}"
           fi
         fi
-        if [ "${{ github.event_name }}" == "pull_request" ] && [[ "${{ inputs.pytorch }}" != *"_wheel" ]];then
-          cp -r ${{ github.workspace }}/torch-xpu-ops third_party/torch-xpu-ops
-          cd third_party/torch-xpu-ops
-        else
-          git clone ${TORCH_XPU_OPS_REPO} third_party/torch-xpu-ops
-          cd third_party/torch-xpu-ops
-          git checkout ${TORCH_XPU_OPS_COMMIT}
-        fi
+        git clone ${TORCH_XPU_OPS_REPO} third_party/torch-xpu-ops
+        cd third_party/torch-xpu-ops
+        git checkout ${TORCH_XPU_OPS_COMMIT}
         git status && git diff && git show -s
     - name: Install E2E Requirements
       shell: bash -xe {0}
diff --git a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml
index 3bc1729ba..9657c3667 100644
--- a/.github/actions/linux-uttest/action.yml
+++ b/.github/actions/linux-uttest/action.yml
@@ -77,7 +77,7 @@ runs:
           tee ${{ github.workspace }}/ut_log/op_ut/op_ut_with_skip_test.log
         ls -al
         cp *.xml ${{ github.workspace }}/ut_log
-        find op_ut_with_skip_nn op_ut_with_skip_quantization/core -type f -exec sh -c '
+        find op_ut_with_skip_nn op_ut_with_skip_quantization/core op_ut_with_all_functorch -type f -exec sh -c '
          dir_path=$(dirname "$1");
          case "$dir_path" in
            *"op_ut_with_skip_quantization/core"*)
@@ -90,6 +90,7 @@ runs:
         ls -al op_ut_with_skip_nn op_ut_with_skip_quantization/core
         cp op_ut_with_skip_nn/*.xml ${{ github.workspace }}/ut_log
         cp op_ut_with_skip_quantization/core/*.xml ${{ github.workspace }}/ut_log
+        cp op_ut_with_all_functorch/*.xml ${{ github.workspace }}/ut_log
         # Cases run with a on-demand white list, since some suites are too
         # slow to go through all operators on CPU. So add cases on-demand
         # when XPU implementatoin is done.
@@ -180,10 +181,7 @@ runs:
           echo -e "[ERROR] XCCL is not enabled"
           exit 1
         fi
-        export CCL_ROOT=$(dirname $(which python))/../
-        export PATH="${CCL_ROOT}/bin/libfabric:${PATH}"
-        export LD_LIBRARY_PATH="${CCL_ROOT}/lib:${LD_LIBRARY_PATH}"
-        python run_distributed.py \
+        python run_distributed_local.py \
           2> ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \
           tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log
         find ../ -type f -name "*.xml" -exec cp {} ${{ github.workspace }}/ut_log/ \;
diff --git a/.github/scripts/build.sh b/.github/scripts/build.sh
index b4f526297..27f324ed1 100755
--- a/.github/scripts/build.sh
+++ b/.github/scripts/build.sh
@@ -19,7 +19,7 @@ done
 
 # Set pytorch
 rm -rf ${WORKSPACE}/pytorch
-git clone ${PYTORCH_REPO} ${WORKSPACE}/pytorch
+git clone https://github.com/daisyden/pytorch.git ${WORKSPACE}/pytorch
 cd ${WORKSPACE}/pytorch
 git checkout ${PYTORCH_COMMIT}
 git remote -v && git branch && git show -s
@@ -44,33 +44,32 @@ git remote -v && git branch && git show -s
 # Pre Build
 cd ${WORKSPACE}/pytorch
 python -m pip install requests
-python third_party/torch-xpu-ops/.github/scripts/apply_torch_pr.py
 git submodule sync && git submodule update --init --recursive
 python -m pip install -r requirements.txt
-python -m pip install mkl-static==2025.2.0 mkl-include==2025.2.0
+python -m pip install mkl-static mkl-include
 export USE_STATIC_MKL=1
 if [ "${XPU_ONEAPI_PATH}" == "" ];then
     export PYTORCH_EXTRA_INSTALL_REQUIREMENTS=" \
-        intel-cmplr-lib-rt==2025.2.1 | \
-        intel-cmplr-lib-ur==2025.2.1 | \
-        intel-cmplr-lic-rt==2025.2.1 | \
-        intel-sycl-rt==2025.2.1 | \
-        oneccl-devel==2021.16.1 | \
-        oneccl==2021.16.1 | \
-        impi-rt==2021.16.1 | \
-        onemkl-sycl-blas==2025.2.0 | \
-        onemkl-sycl-dft==2025.2.0 | \
-        onemkl-sycl-lapack==2025.2.0 | \
-        onemkl-sycl-rng==2025.2.0 | \
-        onemkl-sycl-sparse==2025.2.0 | \
-        dpcpp-cpp-rt==2025.2.1 | \
-        intel-opencl-rt==2025.2.1 | \
-        mkl==2025.2.0 | \
-        intel-openmp==2025.2.1 | \
-        tbb==2022.2.0 | \
-        tcmlib==1.4.0 | \
-        umf==0.11.0 | \
-        intel-pti==0.13.1
+        intel-cmplr-lib-rt | \
+        intel-cmplr-lib-ur | \
+        intel-cmplr-lic-rt | \
+        intel-sycl-rt | \
+        oneccl-devel | \
+        oneccl | \
+        impi-rt | \
+        onemkl-sycl-blas | \
+        onemkl-sycl-dft | \
+        onemkl-sycl-lapack | \
+        onemkl-sycl-rng | \
+        onemkl-sycl-sparse | \
+        dpcpp-cpp-rt | \
+        intel-opencl-rt | \
+        mkl | \
+        intel-openmp | \
+        tbb | \
+        tcmlib | \
+        umf | \
+        intel-pti
     "
 fi
 
diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml
index d0f5e983d..d867851a2 100644
--- a/.github/workflows/_linux_build.yml
+++ b/.github/workflows/_linux_build.yml
@@ -10,11 +10,11 @@ on:
         description: Runner label
       pytorch:
         type: string
-        default: 'main'
+        default: 'https://github.com/daisyden/pytorch.git@distributed_2.10'
         description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch'
       torch_xpu_ops:
         type: string
-        default: 'main'
+        default: 'daisyden/distributed_2.10'
         description: Torch-xpu-ops main by default, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin
       triton:
         required: false
@@ -59,7 +59,7 @@ jobs:
     if: ${{ ! endsWith(inputs.pytorch, '_wheel') }}
     runs-on: ${{ needs.runner.outputs.runner_id }}
     container:
-      image: 'pytorch/manylinux2_28-builder:xpu-2.9'
+      image: 'intelgpu/ubuntu-22.04-lts2:2523.31'
       volumes:
         - ${{ github.workspace }}:${{ github.workspace }}
       env:
@@ -72,21 +72,30 @@
     steps:
       - name: Install gh-cli
         run: |
+          rm -rf ./*.whl ./*.log
           cat /etc/os-release
           hostname && id
           # install gh
-          dnf install -y 'dnf-command(config-manager)'
-          dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo
-          dnf install -y gh --repo gh-cli
-          gh --version
+          sudo apt update
+          sudo apt install -y gpg-agent wget curl cmake git unzip zip libgl1 zlib1g-dev numactl \
+                  libglib2.0-dev rsync jq gcc-11 g++-11 python3-dev python3-venv gh
       - name: Setup python-${{ inputs.python }}
         run: |
           rm -rf /tmp/xpu-tool/myvenv
-          local_python=$(echo ${{ inputs.python }} |awk -F. '{printf("cp%s%s-cp%s%s", $1, $2, $1, $2)}')
-          /opt/python/${local_python}/bin/python -m venv /tmp/xpu-tool/myvenv
+          curl -LsSf https://astral.sh/uv/install.sh | sh
+          source $HOME/.local/bin/env
+          uv venv /tmp/xpu-tool/myvenv --python 3.10 --clear
+          source /tmp/xpu-tool/myvenv/bin/activate
           which python && python -V
           which pip && pip list
-          pip install -U pip wheel setuptools
+          uv pip install -U pip wheel setuptools
+      - name: Install oneapi
+        run: |
+          rm -rf /opt/intel/oneapi
+          wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/aa5447b5-3644-43c8-8ec4-72d53f6ecc19/intel-deep-learning-essentials-2025.3.0.338_offline.sh
+          sudo bash intel-deep-learning-essentials-2025.3.0.338_offline.sh -a -s --eula accept
+          source /opt/intel/oneapi/setvars.sh
+          icpx --version
       - name: Checkout torch-xpu-ops
         uses: actions/checkout@v4
         with:
@@ -113,8 +122,6 @@ jobs:
             TORCH_XPU_OPS_REPO="https://github.com/intel/torch-xpu-ops.git"
             TORCH_XPU_OPS_COMMIT="${{ inputs.torch_xpu_ops }}"
           fi
-          # gcc 11
-          source /opt/rh/gcc-toolset-11/enable
           source ${{ github.workspace }}/torch-xpu-ops/.github/scripts/env.sh
           ${{ github.workspace }}/torch-xpu-ops/.github/scripts/build.sh \
             --WORKSPACE="${{ github.workspace }}" \
@@ -129,8 +136,6 @@ jobs:
           fi
       - name: Build Torchvision and Torchaudio
         run: |
-          # gcc 11
-          source /opt/rh/gcc-toolset-11/enable
           cd ./pytorch
           TORCHVISION_COMMIT_ID="$(cat .github/ci_commit_pins/vision.txt)"
           TORCHAUDIO_COMMIT_ID="$(cat .github/ci_commit_pins/audio.txt)"
@@ -177,9 +182,6 @@ jobs:
             curl -sSL https://raw.githubusercontent.com/intel/intel-xpu-backend-for-triton/${TRITON_COMMIT_ID}/python/triton/__init__.py 2>&1 |\
               grep '__version__' |head -n 1 |awk -F "'" '{print $2}'
           )"
-          # gcc 13
-          dnf install -y gcc-toolset-13-gcc-c++ zlib-devel
-          source /opt/rh/gcc-toolset-13/enable
           pip install cmake ninja pybind11
           python .github/scripts/build_triton_wheel.py --device xpu --commit-hash ${TRITON_COMMIT_ID} --triton-version ${TRITON_VERSION_NAME} \
             2>&1 |tee ${{ github.workspace }}/build_triton_${TRITON_COMMIT_ID}.log
diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml
index 6abbd2470..c88cf34ff 100644
--- a/.github/workflows/_linux_ut.yml
+++ b/.github/workflows/_linux_ut.yml
@@ -9,11 +9,11 @@ on:
         description: Runner label
       pytorch:
         type: string
-        default: 'main'
+        default: 'https://github.com/daisyden/pytorch.git@distributed_2.10'
         description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch'
       torch_xpu_ops:
         type: string
-        default: 'main'
+        default: 'daisyden/distributed_2.10'
         description: Torch-xpu-ops version, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin
       python:
         type: string
@@ -97,12 +97,12 @@ jobs:
 
   test-in-baremetal:
     needs: runner
-    timeout-minutes: 600
+    timeout-minutes: 1200
     if: ${{ contains(inputs.ut, 'distributed') }}
     runs-on: ${{ needs.runner.outputs.runner_id }}
     env:
       AGENT_TOOLSDIRECTORY: /tmp/xpu-tool
-      PYTEST_ADDOPTS: -v --timeout 3600 --timeout_method=thread -n 1
+      PYTEST_ADDOPTS: -v
     steps:
       - name: Checkout torch-xpu-ops
         uses: actions/checkout@v4
diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml
index bad665086..9e44d4738 100644
--- a/.github/workflows/pull.yml
+++ b/.github/workflows/pull.yml
@@ -100,8 +100,8 @@ jobs:
         build: [build]
     uses: ./.github/workflows/_linux_build.yml
     with:
-      runner: pvc_rolling
-      pytorch: ${{ needs.conditions-filter.outputs.pytorch }}
+      pytorch: distributed_2.10
+      runner: PVC-7358
 
   linux-ut:
     needs: [conditions-filter, linux-build]
@@ -128,9 +128,8 @@ jobs:
         ut_name: [xpu_distributed]
     uses: ./.github/workflows/_linux_ut.yml
     with:
-      runner: pvc_rolling
-      pytorch: ${{ needs.conditions-filter.outputs.pytorch }}
-      torch_xpu_ops: ${{ needs.conditions-filter.outputs.pytorch == 'nightly_wheel' && 'pinned' || 'main' }}
+      runner: PVC-7358
+      pytorch: distributed_2.10
       ut: ${{ matrix.ut_name }}
 
   linux-e2e: