From 5b26b45638c29d11fb55918dbe9108f3f27c6daa Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Wed, 9 Jul 2025 15:24:57 +0000 Subject: [PATCH 01/12] distributed_weekly test --- .github/actions/linux-testenv/action.yml | 4 ++-- .github/actions/linux-uttest/action.yml | 6 ++++-- .github/scripts/build.sh | 1 - .github/workflows/_linux_build.yml | 4 ++-- .github/workflows/_linux_ut.yml | 5 ++--- .github/workflows/pull.yml | 9 ++++----- 6 files changed, 14 insertions(+), 15 deletions(-) diff --git a/.github/actions/linux-testenv/action.yml b/.github/actions/linux-testenv/action.yml index ba5eb8a256..f2f202ce91 100644 --- a/.github/actions/linux-testenv/action.yml +++ b/.github/actions/linux-testenv/action.yml @@ -3,11 +3,11 @@ name: Setup Test Environment inputs: pytorch: type: string - default: 'main' + default: 'https://github.com/daisyden/pytorch.git@distributed_2.9' description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' torch_xpu_ops: type: string - default: 'main' + default: 'daisyden/distributed_2.9' description: Torch-xpu-ops version, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin python: type: string diff --git a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml index 3bc1729bac..28da2ffaf2 100644 --- a/.github/actions/linux-uttest/action.yml +++ b/.github/actions/linux-uttest/action.yml @@ -169,11 +169,13 @@ runs: tee ${{ github.workspace }}/ut_log/xpu_profiling/test_profiler_tree.log - name: xpu_distributed - shell: timeout 36000 bash -xeu -o pipefail {0} + shell: bash -xeu -o pipefail {0} if: ${{ inputs.ut_name == 'xpu_distributed' }} run: | xpu-smi topology -m mkdir -p ut_log/xpu_distributed + pip install pytest pytest-timeout xmlrunner unittest-xml-reporting zstandard transformers + cp ../pytorch/third_party/torch-xpu-ops/.github/scripts/ut_result_check.sh ut_log/xpu_distributed/ cd pytorch/third_party/torch-xpu-ops/test/xpu XCCL_ENABLE=$(python -c "import torch;print(torch.distributed.is_xccl_available())") if [[ "${XCCL_ENABLE,,}" == 'false' ]] || [[ "${XCCL_ENABLE}" == '0' ]]; then @@ -183,7 +185,7 @@ runs: export CCL_ROOT=$(dirname $(which python))/../ export PATH="${CCL_ROOT}/bin/libfabric:${PATH}" export LD_LIBRARY_PATH="${CCL_ROOT}/lib:${LD_LIBRARY_PATH}" - python run_distributed.py \ + python run_distributed_local.py \ 2> ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \ tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log find ../ -type f -name "*.xml" -exec cp {} ${{ github.workspace }}/ut_log/ \; diff --git a/.github/scripts/build.sh b/.github/scripts/build.sh index b4f5262979..b0b7f17b2c 100755 --- a/.github/scripts/build.sh +++ b/.github/scripts/build.sh @@ -44,7 +44,6 @@ git remote -v && git branch && git show -s # Pre Build cd ${WORKSPACE}/pytorch python -m pip install requests -python third_party/torch-xpu-ops/.github/scripts/apply_torch_pr.py git submodule sync && git submodule update --init --recursive python -m pip install -r requirements.txt python -m pip install mkl-static==2025.2.0 mkl-include==2025.2.0 diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index d0f5e983d7..24dcbf7967 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -10,11 +10,11 @@ on: description: Runner label pytorch: type: string - default: 'main' + default: 'https://github.com/daisyden/pytorch.git@distributed_2.9' description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' torch_xpu_ops: type: string - default: 'main' + default: 'daisyden/distributed_2.9' description: Torch-xpu-ops main by default, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin triton: required: false diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 6abbd24703..197f930d74 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -9,11 +9,11 @@ on: description: Runner label pytorch: type: string - default: 'main' + default: 'https://github.com/daisyden/pytorch.git@distributed_2.9' description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' torch_xpu_ops: type: string - default: 'main' + default: 'daisyden/distributed_2.9' description: Torch-xpu-ops version, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin python: type: string @@ -166,7 +166,6 @@ jobs: else ut_list="${{ inputs.ut }}" fi - cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ for ut_name in ${ut_list} do cp Known_issue.log.tmp Known_issue.log diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index bad665086d..8f2f483a61 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -100,8 +100,8 @@ jobs: build: [build] uses: ./.github/workflows/_linux_build.yml with: - runner: pvc_rolling - pytorch: ${{ needs.conditions-filter.outputs.pytorch }} + pytorch: distributed_2.9 + runner: PVC-7358 linux-ut: needs: [conditions-filter, linux-build] @@ -128,9 +128,8 @@ jobs: ut_name: [xpu_distributed] uses: ./.github/workflows/_linux_ut.yml with: - runner: pvc_rolling - pytorch: ${{ needs.conditions-filter.outputs.pytorch }} - torch_xpu_ops: ${{ needs.conditions-filter.outputs.pytorch == 'nightly_wheel' && 'pinned' || 'main' }} + runner: PVC-7358 + pytorch: distributed_2.9 ut: ${{ matrix.ut_name }} linux-e2e: From ae96e86d8b32cbdd56c7bb0fc847a6154fcd88a0 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Mon, 28 Jul 2025 17:23:17 +0800 Subject: [PATCH 02/12] update --- .github/workflows/pull.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 8f2f483a61..0ba9f028de 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -100,8 +100,13 @@ jobs: build: [build] uses: ./.github/workflows/_linux_build.yml with: +<<<<<<< HEAD pytorch: distributed_2.9 runner: PVC-7358 +======= + pytorch: distributed_2.8 + runner: pvc_e2e +>>>>>>> 680c2cce (update) linux-ut: needs: [conditions-filter, linux-build] From 1cd97059d39aa4e124b7ae0cb111ed5e484951af Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Mon, 28 Jul 2025 23:18:35 +0800 Subject: [PATCH 03/12] update --- .github/workflows/pull.yml | 5 ----- 1 file changed, 5 deletions(-) diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 0ba9f028de..8f2f483a61 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -100,13 +100,8 @@ jobs: build: [build] uses: ./.github/workflows/_linux_build.yml with: -<<<<<<< HEAD pytorch: distributed_2.9 runner: PVC-7358 -======= - pytorch: distributed_2.8 - runner: pvc_e2e ->>>>>>> 680c2cce (update) linux-ut: needs: [conditions-filter, linux-build] From d21a12d753c3602233bc7338fab1b9e713d1d1d0 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Fri, 29 Aug 2025 17:15:27 +0800 Subject: [PATCH 04/12] update --- .github/actions/linux-testenv/action.yml | 4 ++-- .github/scripts/build.sh | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/actions/linux-testenv/action.yml b/.github/actions/linux-testenv/action.yml index f2f202ce91..dc42b53709 100644 --- a/.github/actions/linux-testenv/action.yml +++ b/.github/actions/linux-testenv/action.yml @@ -69,9 +69,9 @@ runs: fi TORCH_COMMIT_ID=$(python -c 'import torch; print(torch.version.git_version)') if [[ "${{ inputs.pytorch }}" == *"https://"* ]];then - PYTORCH_REPO="$(echo ${{ inputs.pytorch }} |sed 's/@.*//')" + PYTORCH_REPO="https://github.com/daisyden/pytorch.git" else - PYTORCH_REPO="https://github.com/pytorch/pytorch.git" + PYTORCH_REPO="https://github.com/daisyden/pytorch.git" fi git clone ${PYTORCH_REPO} pytorch cd pytorch diff --git a/.github/scripts/build.sh b/.github/scripts/build.sh index b0b7f17b2c..44ae14a35c 100755 --- a/.github/scripts/build.sh +++ b/.github/scripts/build.sh @@ -19,7 +19,7 @@ done # Set pytorch rm -rf ${WORKSPACE}/pytorch -git clone ${PYTORCH_REPO} ${WORKSPACE}/pytorch +git clone https://github.com/daisyden/pytorch.git ${WORKSPACE}/pytorch cd ${WORKSPACE}/pytorch git checkout ${PYTORCH_COMMIT} git remote -v && git branch && git show -s From af4a869b8863d1a1915b0a6931265860c474ade4 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Fri, 29 Aug 2025 22:34:14 +0800 Subject: [PATCH 05/12] update --- .github/actions/linux-testenv/action.yml | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/.github/actions/linux-testenv/action.yml b/.github/actions/linux-testenv/action.yml index dc42b53709..0bb76ed359 100644 --- a/.github/actions/linux-testenv/action.yml +++ b/.github/actions/linux-testenv/action.yml @@ -97,14 +97,9 @@ runs: TORCH_XPU_OPS_COMMIT="${{ inputs.torch_xpu_ops }}" fi fi - if [ "${{ github.event_name }}" == "pull_request" ] && [[ "${{ inputs.pytorch }}" != *"_wheel" ]];then - cp -r ${{ github.workspace }}/torch-xpu-ops third_party/torch-xpu-ops - cd third_party/torch-xpu-ops - else - git clone ${TORCH_XPU_OPS_REPO} third_party/torch-xpu-ops - cd third_party/torch-xpu-ops - git checkout ${TORCH_XPU_OPS_COMMIT} - fi + git clone ${TORCH_XPU_OPS_REPO} third_party/torch-xpu-ops + cd third_party/torch-xpu-ops + git checkout ${TORCH_XPU_OPS_COMMIT} git status && git diff && git show -s - name: Install E2E Requirements shell: bash -xe {0} From 0ab3dbfeec2770afbf2d68095bcdd52753c2b8ca Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Fri, 29 Aug 2025 23:29:04 +0800 Subject: [PATCH 06/12] update --- .github/workflows/_linux_ut.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 197f930d74..4d12a5ed59 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -102,7 +102,7 @@ jobs: runs-on: ${{ needs.runner.outputs.runner_id }} env: AGENT_TOOLSDIRECTORY: /tmp/xpu-tool - PYTEST_ADDOPTS: -v --timeout 3600 --timeout_method=thread -n 1 + PYTEST_ADDOPTS: -v steps: - name: Checkout torch-xpu-ops uses: actions/checkout@v4 From b3b5023f1de68df120d8c27402e18cef1780ccb0 Mon Sep 17 00:00:00 2001 From: xiangdong <40376367+zxd1997066@users.noreply.github.com> Date: Sat, 30 Aug 2025 12:49:51 +0800 Subject: [PATCH 07/12] Update _linux_ut.yml --- .github/workflows/_linux_ut.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 4d12a5ed59..e2b39f74e6 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -97,7 +97,7 @@ jobs: test-in-baremetal: needs: runner - timeout-minutes: 600 + timeout-minutes: 1200 if: ${{ contains(inputs.ut, 'distributed') }} runs-on: ${{ needs.runner.outputs.runner_id }} env: From 5a493d538ec6538ecaff788917d3bc6a14f77428 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Thu, 11 Sep 2025 17:26:31 +0800 Subject: [PATCH 08/12] update for rc --- .github/actions/linux-testenv/action.yml | 2 +- .github/workflows/_linux_build.yml | 2 +- .github/workflows/_linux_ut.yml | 2 +- .github/workflows/pull.yml | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/actions/linux-testenv/action.yml b/.github/actions/linux-testenv/action.yml index 0bb76ed359..57b89961e5 100644 --- a/.github/actions/linux-testenv/action.yml +++ b/.github/actions/linux-testenv/action.yml @@ -3,7 +3,7 @@ name: Setup Test Environment inputs: pytorch: type: string - default: 'https://github.com/daisyden/pytorch.git@distributed_2.9' + default: 'https://github.com/daisyden/pytorch.git@release/2.9' description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' torch_xpu_ops: type: string diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 24dcbf7967..accf64860a 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -10,7 +10,7 @@ on: description: Runner label pytorch: type: string - default: 'https://github.com/daisyden/pytorch.git@distributed_2.9' + default: 'https://github.com/daisyden/pytorch.git@release/2.9' description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' torch_xpu_ops: type: string diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index e2b39f74e6..d437fbb192 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -9,7 +9,7 @@ on: description: Runner label pytorch: type: string - default: 'https://github.com/daisyden/pytorch.git@distributed_2.9' + default: 'https://github.com/daisyden/pytorch.git@release/2.9' description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' torch_xpu_ops: type: string diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 8f2f483a61..8e41aa6793 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -100,7 +100,7 @@ jobs: build: [build] uses: ./.github/workflows/_linux_build.yml with: - pytorch: distributed_2.9 + pytorch: release/2.9 runner: PVC-7358 linux-ut: @@ -129,7 +129,7 @@ jobs: uses: ./.github/workflows/_linux_ut.yml with: runner: PVC-7358 - pytorch: distributed_2.9 + pytorch: release/2.9 ut: ${{ matrix.ut_name }} linux-e2e: From f68c348c5d61a712cf7dcd76ef86451a678e8294 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Fri, 12 Sep 2025 17:12:54 +0800 Subject: [PATCH 09/12] update --- .github/actions/linux-testenv/action.yml | 2 +- .github/workflows/_linux_build.yml | 2 +- .github/workflows/_linux_ut.yml | 2 +- .github/workflows/pull.yml | 4 ++-- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/actions/linux-testenv/action.yml b/.github/actions/linux-testenv/action.yml index 57b89961e5..0bb76ed359 100644 --- a/.github/actions/linux-testenv/action.yml +++ b/.github/actions/linux-testenv/action.yml @@ -3,7 +3,7 @@ name: Setup Test Environment inputs: pytorch: type: string - default: 'https://github.com/daisyden/pytorch.git@release/2.9' + default: 'https://github.com/daisyden/pytorch.git@distributed_2.9' description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' torch_xpu_ops: type: string diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index accf64860a..24dcbf7967 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -10,7 +10,7 @@ on: description: Runner label pytorch: type: string - default: 'https://github.com/daisyden/pytorch.git@release/2.9' + default: 'https://github.com/daisyden/pytorch.git@distributed_2.9' description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' torch_xpu_ops: type: string diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index d437fbb192..e2b39f74e6 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -9,7 +9,7 @@ on: description: Runner label pytorch: type: string - default: 'https://github.com/daisyden/pytorch.git@release/2.9' + default: 'https://github.com/daisyden/pytorch.git@distributed_2.9' description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' torch_xpu_ops: type: string diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 8e41aa6793..8f2f483a61 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -100,7 +100,7 @@ jobs: build: [build] uses: ./.github/workflows/_linux_build.yml with: - pytorch: release/2.9 + pytorch: distributed_2.9 runner: PVC-7358 linux-ut: @@ -129,7 +129,7 @@ jobs: uses: ./.github/workflows/_linux_ut.yml with: runner: PVC-7358 - pytorch: release/2.9 + pytorch: distributed_2.9 ut: ${{ matrix.ut_name }} linux-e2e: From a5e8b6e339535fe7e6aa3e16571272270da452b0 Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Fri, 10 Oct 2025 15:24:50 +0800 Subject: [PATCH 10/12] update for 2.10 --- .github/actions/linux-testenv/action.yml | 4 ++-- .github/workflows/_linux_build.yml | 4 ++-- .github/workflows/_linux_ut.yml | 4 ++-- .github/workflows/pull.yml | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/.github/actions/linux-testenv/action.yml b/.github/actions/linux-testenv/action.yml index 0bb76ed359..c19d12c991 100644 --- a/.github/actions/linux-testenv/action.yml +++ b/.github/actions/linux-testenv/action.yml @@ -3,11 +3,11 @@ name: Setup Test Environment inputs: pytorch: type: string - default: 'https://github.com/daisyden/pytorch.git@distributed_2.9' + default: 'https://github.com/daisyden/pytorch.git@distributed_2.10' description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' torch_xpu_ops: type: string - default: 'daisyden/distributed_2.9' + default: 'daisyden/distributed_2.10' description: Torch-xpu-ops version, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin python: type: string diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 24dcbf7967..94fc4fe5d4 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -10,11 +10,11 @@ on: description: Runner label pytorch: type: string - default: 'https://github.com/daisyden/pytorch.git@distributed_2.9' + default: 'https://github.com/daisyden/pytorch.git@distributed_2.10' description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' torch_xpu_ops: type: string - default: 'daisyden/distributed_2.9' + default: 'daisyden/distributed_2.10' description: Torch-xpu-ops main by default, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin triton: required: false diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index e2b39f74e6..e7b1b6f6ce 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -9,11 +9,11 @@ on: description: Runner label pytorch: type: string - default: 'https://github.com/daisyden/pytorch.git@distributed_2.9' + default: 'https://github.com/daisyden/pytorch.git@distributed_2.10' description: Pytorch main by default, or 'commit/branch', or 'repo@commit/repo@branch' torch_xpu_ops: type: string - default: 'daisyden/distributed_2.9' + default: 'daisyden/distributed_2.10' description: Torch-xpu-ops version, 'commit/branch', or 'repo@commit/repo@branch', or 'pinned' for pytorch pin python: type: string diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 8f2f483a61..9e44d4738c 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -100,7 +100,7 @@ jobs: build: [build] uses: ./.github/workflows/_linux_build.yml with: - pytorch: distributed_2.9 + pytorch: distributed_2.10 runner: PVC-7358 linux-ut: @@ -129,7 +129,7 @@ jobs: uses: ./.github/workflows/_linux_ut.yml with: runner: PVC-7358 - pytorch: distributed_2.9 + pytorch: distributed_2.10 ut: ${{ matrix.ut_name }} linux-e2e: From 43062c0953bf4409ff9a3fc1090ee6605509568f Mon Sep 17 00:00:00 2001 From: "Zeng, Xiangdong" Date: Fri, 10 Oct 2025 15:24:50 +0800 Subject: [PATCH 11/12] update for 2.10 --- .github/actions/linux-uttest/action.yml | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml index 28da2ffaf2..c700539d82 100644 --- a/.github/actions/linux-uttest/action.yml +++ b/.github/actions/linux-uttest/action.yml @@ -77,7 +77,7 @@ runs: tee ${{ github.workspace }}/ut_log/op_ut/op_ut_with_skip_test.log ls -al cp *.xml ${{ github.workspace }}/ut_log - find op_ut_with_skip_nn op_ut_with_skip_quantization/core -type f -exec sh -c ' + find op_ut_with_skip_nn op_ut_with_skip_quantization/core op_ut_with_all_functorch -type f -exec sh -c ' dir_path=$(dirname "$1"); case "$dir_path" in *"op_ut_with_skip_quantization/core"*) @@ -90,6 +90,7 @@ runs: ls -al op_ut_with_skip_nn op_ut_with_skip_quantization/core cp op_ut_with_skip_nn/*.xml ${{ github.workspace }}/ut_log cp op_ut_with_skip_quantization/core/*.xml ${{ github.workspace }}/ut_log + cp op_ut_with_all_functorch/*.xml ${{ github.workspace }}/ut_log # Cases run with a on-demand white list, since some suites are too # slow to go through all operators on CPU. So add cases on-demand # when XPU implementatoin is done. @@ -169,13 +170,11 @@ runs: tee ${{ github.workspace }}/ut_log/xpu_profiling/test_profiler_tree.log - name: xpu_distributed - shell: bash -xeu -o pipefail {0} + shell: timeout 36000 bash -xeu -o pipefail {0} if: ${{ inputs.ut_name == 'xpu_distributed' }} run: | xpu-smi topology -m mkdir -p ut_log/xpu_distributed - pip install pytest pytest-timeout xmlrunner unittest-xml-reporting zstandard transformers - cp ../pytorch/third_party/torch-xpu-ops/.github/scripts/ut_result_check.sh ut_log/xpu_distributed/ cd pytorch/third_party/torch-xpu-ops/test/xpu XCCL_ENABLE=$(python -c "import torch;print(torch.distributed.is_xccl_available())") if [[ "${XCCL_ENABLE,,}" == 'false' ]] || [[ "${XCCL_ENABLE}" == '0' ]]; then From c36046745f3f12c2878a2c488299ecf947a673fd Mon Sep 17 00:00:00 2001 From: mengfei25 Date: Tue, 4 Nov 2025 14:51:37 +0800 Subject: [PATCH 12/12] test oneapi 2025.3 --- .github/actions/linux-uttest/action.yml | 3 -- .github/scripts/build.sh | 42 ++++++++++++------------- .github/workflows/_linux_build.yml | 32 ++++++++++--------- .github/workflows/_linux_ut.yml | 1 + 4 files changed, 39 insertions(+), 39 deletions(-) diff --git a/.github/actions/linux-uttest/action.yml b/.github/actions/linux-uttest/action.yml index c700539d82..9657c3667d 100644 --- a/.github/actions/linux-uttest/action.yml +++ b/.github/actions/linux-uttest/action.yml @@ -181,9 +181,6 @@ runs: echo -e "[ERROR] XCCL is not enabled" exit 1 fi - export CCL_ROOT=$(dirname $(which python))/../ - export PATH="${CCL_ROOT}/bin/libfabric:${PATH}" - export LD_LIBRARY_PATH="${CCL_ROOT}/lib:${LD_LIBRARY_PATH}" python run_distributed_local.py \ 2> ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | \ tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log diff --git a/.github/scripts/build.sh b/.github/scripts/build.sh index 44ae14a35c..27f324ed1c 100755 --- a/.github/scripts/build.sh +++ b/.github/scripts/build.sh @@ -46,30 +46,30 @@ cd ${WORKSPACE}/pytorch python -m pip install requests git submodule sync && git submodule update --init --recursive python -m pip install -r requirements.txt -python -m pip install mkl-static==2025.2.0 mkl-include==2025.2.0 +python -m pip install mkl-static mkl-include export USE_STATIC_MKL=1 if [ "${XPU_ONEAPI_PATH}" == "" ];then export PYTORCH_EXTRA_INSTALL_REQUIREMENTS=" \ - intel-cmplr-lib-rt==2025.2.1 | \ - intel-cmplr-lib-ur==2025.2.1 | \ - intel-cmplr-lic-rt==2025.2.1 | \ - intel-sycl-rt==2025.2.1 | \ - oneccl-devel==2021.16.1 | \ - oneccl==2021.16.1 | \ - impi-rt==2021.16.1 | \ - onemkl-sycl-blas==2025.2.0 | \ - onemkl-sycl-dft==2025.2.0 | \ - onemkl-sycl-lapack==2025.2.0 | \ - onemkl-sycl-rng==2025.2.0 | \ - onemkl-sycl-sparse==2025.2.0 | \ - dpcpp-cpp-rt==2025.2.1 | \ - intel-opencl-rt==2025.2.1 | \ - mkl==2025.2.0 | \ - intel-openmp==2025.2.1 | \ - tbb==2022.2.0 | \ - tcmlib==1.4.0 | \ - umf==0.11.0 | \ - intel-pti==0.13.1 + intel-cmplr-lib-rt | \ + intel-cmplr-lib-ur | \ + intel-cmplr-lic-rt | \ + intel-sycl-rt | \ + oneccl-devel | \ + oneccl | \ + impi-rt | \ + onemkl-sycl-blas | \ + onemkl-sycl-dft | \ + onemkl-sycl-lapack | \ + onemkl-sycl-rng | \ + onemkl-sycl-sparse | \ + dpcpp-cpp-rt | \ + intel-opencl-rt | \ + mkl | \ + intel-openmp | \ + tbb | \ + tcmlib | \ + umf | \ + intel-pti " fi diff --git a/.github/workflows/_linux_build.yml b/.github/workflows/_linux_build.yml index 94fc4fe5d4..d867851a29 100644 --- a/.github/workflows/_linux_build.yml +++ b/.github/workflows/_linux_build.yml @@ -59,7 +59,7 @@ jobs: if: ${{ ! endsWith(inputs.pytorch, '_wheel') }} runs-on: ${{ needs.runner.outputs.runner_id }} container: - image: 'pytorch/manylinux2_28-builder:xpu-2.9' + image: 'intelgpu/ubuntu-22.04-lts2:2523.31' volumes: - ${{ github.workspace }}:${{ github.workspace }} env: @@ -72,21 +72,30 @@ jobs: steps: - name: Install gh-cli run: | + rm -rf ./*.whl ./*.log cat /etc/os-release hostname && id # install gh - dnf install -y 'dnf-command(config-manager)' - dnf config-manager --add-repo https://cli.github.com/packages/rpm/gh-cli.repo - dnf install -y gh --repo gh-cli - gh --version + sudo apt update + sudo apt install -y gpg-agent wget curl cmake git unzip zip libgl1 zlib1g-dev numactl \ + libglib2.0-dev rsync jq gcc-11 g++-11 python3-dev python3-venv gh - name: Setup python-${{ inputs.python }} run: | rm -rf /tmp/xpu-tool/myvenv - local_python=$(echo ${{ inputs.python }} |awk -F. '{printf("cp%s%s-cp%s%s", $1, $2, $1, $2)}') - /opt/python/${local_python}/bin/python -m venv /tmp/xpu-tool/myvenv + curl -LsSf https://astral.sh/uv/install.sh | sh + source $HOME/.local/bin/env + uv venv /tmp/xpu-tool/myvenv --python 3.10 --clear + source /tmp/xpu-tool/myvenv/bin/activate which python && python -V which pip && pip list - pip install -U pip wheel setuptools + uv pip install -U pip wheel setuptools + - name: Install oneapi + run: | + rm -rf /opt/intel/oneapi + wget https://registrationcenter-download.intel.com/akdlm/IRC_NAS/aa5447b5-3644-43c8-8ec4-72d53f6ecc19/intel-deep-learning-essentials-2025.3.0.338_offline.sh + sudo bash intel-deep-learning-essentials-2025.3.0.338_offline.sh -a -s --eula accept + source /opt/intel/oneapi/setvars.sh + icpx --version - name: Checkout torch-xpu-ops uses: actions/checkout@v4 with: @@ -113,8 +122,6 @@ jobs: TORCH_XPU_OPS_REPO="https://github.com/intel/torch-xpu-ops.git" TORCH_XPU_OPS_COMMIT="${{ inputs.torch_xpu_ops }}" fi - # gcc 11 - source /opt/rh/gcc-toolset-11/enable source ${{ github.workspace }}/torch-xpu-ops/.github/scripts/env.sh ${{ github.workspace }}/torch-xpu-ops/.github/scripts/build.sh \ --WORKSPACE="${{ github.workspace }}" \ @@ -129,8 +136,6 @@ jobs: fi - name: Build Torchvision and Torchaudio run: | - # gcc 11 - source /opt/rh/gcc-toolset-11/enable cd ./pytorch TORCHVISION_COMMIT_ID="$(cat .github/ci_commit_pins/vision.txt)" TORCHAUDIO_COMMIT_ID="$(cat .github/ci_commit_pins/audio.txt)" @@ -177,9 +182,6 @@ jobs: curl -sSL https://raw.githubusercontent.com/intel/intel-xpu-backend-for-triton/${TRITON_COMMIT_ID}/python/triton/__init__.py 2>&1 |\ grep '__version__' |head -n 1 |awk -F "'" '{print $2}' )" - # gcc 13 - dnf install -y gcc-toolset-13-gcc-c++ zlib-devel - source /opt/rh/gcc-toolset-13/enable pip install cmake ninja pybind11 python .github/scripts/build_triton_wheel.py --device xpu --commit-hash ${TRITON_COMMIT_ID} --triton-version ${TRITON_VERSION_NAME} \ 2>&1 |tee ${{ github.workspace }}/build_triton_${TRITON_COMMIT_ID}.log diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index e7b1b6f6ce..c88cf34ff8 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -166,6 +166,7 @@ jobs: else ut_list="${{ inputs.ut }}" fi + cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./ for ut_name in ${ut_list} do cp Known_issue.log.tmp Known_issue.log