Skip to content

Commit f0072c4

Browse files
committed
try standalone job with same setup as PyTorch symm-mem job
1 parent a17723c commit f0072c4

File tree

2 files changed

+121
-10
lines changed

2 files changed

+121
-10
lines changed

.github/matrix.json

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -52,16 +52,6 @@
5252
"pytorch-version": "pytorch-nightly",
5353
"alias": "h100"
5454
},
55-
{
56-
"runner": "linux.aws.h100.4",
57-
"python-version": "3.12",
58-
"ref-eager": false,
59-
"image": "nvidia/cuda:13.0.1-devel-ubuntu24.04",
60-
"runtime-version": "cu130",
61-
"container-options": "--gpus all",
62-
"pytorch-version": "pytorch-nightly",
63-
"alias": "h100-distributed"
64-
},
6555
{
6656
"runner": "linux.dgx.b200",
6757
"python-version": "3.12",
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
# Workflow header: display name, triggers, concurrency policy and token permissions.
name: Symmetric Memory (H100)
2+
3+
# Triggers: pull requests that modify this workflow file, manual dispatch,
# pushes of tags matching ciflow/h100-symm-mem/*, and a daily schedule.
on:
4+
pull_request:
5+
paths:
6+
- .github/workflows/h100-symm-mem.yml
7+
workflow_dispatch:
8+
push:
9+
tags:
10+
- ciflow/h100-symm-mem/*
11+
schedule:
12+
# Five-field cron: minute 22, hour 8, every day (GitHub evaluates schedules in UTC).
- cron: 22 8 * * *
13+
14+
# Runs are grouped by workflow name plus PR number (or commit SHA when there is no PR).
# The two trailing booleans put workflow_dispatch and schedule runs into their own
# groups, so they are not cancelled by (and do not cancel) push/PR runs of the same SHA.
concurrency:
15+
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
16+
# A newer run in the same group cancels the one still in progress.
cancel-in-progress: true
17+
18+
# GITHUB_TOKEN scopes: may request an OIDC id-token; repository contents are read-only.
permissions:
19+
id-token: write
20+
contents: read
21+
22+
jobs:
23+
# Helper job: derives a hash from the pytorch/pytorch checkout and exposes it as
# the job output `hash` for downstream jobs.
resolve-pytorch-ci-image:
24+
name: resolve-pytorch-ci-image
25+
runs-on: ubuntu-latest
26+
outputs:
27+
# Forwarded from the step with id `hash` below.
hash: ${{ steps.hash.outputs.hash }}
28+
steps:
29+
# No `ref` is given, so actions/checkout fetches the default branch of
# pytorch/pytorch; depth 1 is enough because only the HEAD tree is inspected.
- name: Checkout PyTorch (for CI docker hash)
30+
uses: actions/checkout@v5
31+
with:
32+
repository: pytorch/pytorch
33+
fetch-depth: 1
34+
path: pytorch-ci-src
35+
36+
# `git rev-parse HEAD:.ci/docker` prints the git object id of the .ci/docker
# directory tree at HEAD; it is written to GITHUB_OUTPUT under the key `hash`.
- name: Compute docker image hash
37+
id: hash
38+
run: |
39+
set -eux
40+
cd pytorch-ci-src
41+
HASH=$(git rev-parse HEAD:.ci/docker)
42+
echo "hash=${HASH}" >> "${GITHUB_OUTPUT}"
43+
echo "Resolved PyTorch CI image hash: ${HASH}"
44+
45+
# Main job: runs test/test_examples_dist.py inside a PyTorch CI container on a
# linux.aws.h100.4 runner, after installing NVSHMEM, nightly PyTorch (cu128) and Helion.
h100-symm-mem:
46+
name: linux-jammy-cuda12.8-py3.12-gcc11-sm90-symm
47+
needs: resolve-pytorch-ci-image
48+
runs-on: linux.aws.h100.4
49+
timeout-minutes: 360
50+
container:
51+
# NOTE(review): the tag is only the .ci/docker tree hash from the resolve job;
# confirm ghcr.io/pytorch/ci-image publishes tags without an image-name prefix.
image: ghcr.io/pytorch/ci-image:${{ needs.resolve-pytorch-ci-image.outputs.hash }}
52+
# Folded scalar: the lines below are joined into one `docker create` option string.
options: >-
53+
--gpus all
54+
--ipc=host
55+
--cap-add=SYS_PTRACE
56+
--shm-size=4g
57+
-e NVIDIA_DRIVER_CAPABILITIES=all
58+
defaults:
59+
run:
60+
# Every `run` step uses a login shell (-l) that exits on the first error (-e).
shell: bash -le {0}
61+
steps:
62+
- name: Checkout Helion
63+
uses: actions/checkout@v5
64+
65+
- name: Install system dependencies
66+
run: |
67+
set -eux
68+
export DEBIAN_FRONTEND=noninteractive
69+
apt-get update
70+
apt-get install -y --no-install-recommends \
71+
libdw1 curl wget git pkg-config zlib1g-dev build-essential pciutils psmisc jq unzip ca-certificates
72+
73+
- name: Verify NVIDIA GPUs
74+
run: nvidia-smi
75+
76+
- name: Install uv
77+
uses: astral-sh/setup-uv@v7
78+
with:
79+
python-version: "3.12"
80+
enable-cache: true
81+
82+
- name: Create virtual environment
83+
run: |
84+
uv venv --python 3.12
85+
86+
# The job targets CUDA 12.8 (job name, cu128 PyTorch wheels below), so NVSHMEM is
# installed for CUDA major version 12 to match. The step first fails fast when
# fewer than 4 GPUs are visible. `curl -f` makes an HTTP error abort the step
# instead of saving an error page that would then be sourced as shell code.
- name: Install NVSHMEM 3.4.5 for CUDA 12
87+
run: |
88+
set -euxo pipefail
89+
GPU_COUNT=$(nvidia-smi -L | wc -l)
90+
if [ "$GPU_COUNT" -lt 4 ]; then
91+
echo "Error: Expected at least 4 GPUs but found $GPU_COUNT"
92+
exit 1
93+
fi
94+
curl -fL https://raw.githubusercontent.com/pytorch/pytorch/main/.ci/docker/common/install_cuda.sh -o install_cuda.sh
95+
chmod +x install_cuda.sh
96+
source install_cuda.sh
97+
install_nvshmem 12 3.4.5
98+
99+
- name: Install PyTorch via nightly cu128 channel
100+
run: |
101+
set -eux
102+
source .venv/bin/activate
103+
uv pip install -U --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128
104+
python -c "import torch; print(f'PyTorch version: {torch.__version__}')"
105+
106+
# SETUPTOOLS_SCM_PRETEND_VERSION pins the package version for the editable install
# instead of letting setuptools-scm derive it from git metadata.
- name: Install Helion (editable) with dev extras
107+
run: |
108+
set -eux
109+
source .venv/bin/activate
110+
uv pip install setuptools
111+
SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0" uv pip install -e .'[dev]'
112+
python -c "import helion; print(f'Helion version: {helion.__version__}')"
113+
114+
# NOTE(review): the two env vars presumably select NVSHMEM as the symmetric-memory
# backend for NCCL/PyTorch; confirm against the PyTorch symm-mem job.
- name: Run Symmetric Memory distributed tests
115+
env:
116+
NCCL_NVSHMEM_ENABLE: "1"
117+
TORCH_SYMMMEM: "NVSHMEM"
118+
run: |
119+
set -eux
120+
source .venv/bin/activate
121+
pytest -rf -vs --timeout=120 test/test_examples_dist.py

0 commit comments

Comments
 (0)