try standalone job with same setup as PyTorch symm-mem job

yf225 · yf225 · commit f0072c4a9da6 · 2025-11-09T02:15:07.000-08:00
diff --git a/.github/matrix.json b/.github/matrix.json
@@ -52,16 +52,6 @@
       "pytorch-version": "pytorch-nightly",
       "alias": "h100"
     },
-    {
-      "runner": "linux.aws.h100.4",
-      "python-version": "3.12",
-      "ref-eager": false,
-      "image": "nvidia/cuda:13.0.1-devel-ubuntu24.04",
-      "runtime-version": "cu130",
-      "container-options": "--gpus all",
-      "pytorch-version": "pytorch-nightly",
-      "alias": "h100-distributed"
-    },
     {
       "runner": "linux.dgx.b200",
       "python-version": "3.12",
diff --git a/.github/workflows/h100-symm-mem.yml b/.github/workflows/h100-symm-mem.yml
@@ -0,0 +1,121 @@
+name: Symmetric Memory (H100)
+# Standalone workflow that runs Helion's symmetric-memory tests on the linux.aws.h100.4 runner.
+on:
+  pull_request:  # only for PRs that touch this workflow file
+    paths:
+      - ".github/workflows/h100-symm-mem.yml"
+  workflow_dispatch:  # manual runs
+  push:
+    tags:
+      - "ciflow/h100-symm-mem/*"
+  schedule:
+    - cron: "22 8 * * *"  # once a day, at minute 22 of hour 8
+
+concurrency:  # a newer run in the same group cancels the one still in progress
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
+  cancel-in-progress: true
+
+permissions:  # token scopes granted to the jobs of this workflow
+  id-token: write
+  contents: read
+
+jobs:
+  resolve-pytorch-ci-image:  # exposes the `hash` output consumed by the GPU job's container image tag
+    name: resolve-pytorch-ci-image
+    runs-on: ubuntu-latest
+    outputs:
+      hash: ${{ steps.hash.outputs.hash }}
+    steps:
+      - name: Checkout PyTorch (for CI docker hash)
+        uses: actions/checkout@v5
+        with:
+          repository: pytorch/pytorch
+          path: pytorch-ci-src
+          fetch-depth: 1
+
+      - name: Compute docker image hash
+        id: hash
+        run: |
+          set -eux
+          # Git object id of the .ci/docker tree at the checked-out PyTorch commit.
+          HASH="$(git -C pytorch-ci-src rev-parse HEAD:.ci/docker)"
+          echo "hash=${HASH}" >> "${GITHUB_OUTPUT}"
+          echo "Resolved PyTorch CI image hash: ${HASH}"
+
+  h100-symm-mem:  # GPU job: installs NVSHMEM, nightly PyTorch and Helion, then runs test/test_examples_dist.py
+    name: linux-jammy-cuda12.8-py3.12-gcc11-sm90-symm
+    needs: resolve-pytorch-ci-image
+    runs-on: linux.aws.h100.4
+    timeout-minutes: 360
+    container:
+      # NOTE(review): the tag is only the bare .ci/docker tree hash; confirm ghcr.io/pytorch/ci-image publishes a tag of exactly that form.
+      image: ghcr.io/pytorch/ci-image:${{ needs.resolve-pytorch-ci-image.outputs.hash }}
+      options: >-
+        --gpus all
+        --ipc=host
+        --cap-add=SYS_PTRACE
+        --shm-size=4g
+        -e NVIDIA_DRIVER_CAPABILITIES=all
+    defaults:
+      run:
+        shell: bash -le {0}  # every `run:` step below uses a bash login shell with errexit
+    steps:
+      - name: Checkout Helion
+        uses: actions/checkout@v5
+
+      - name: Install system dependencies
+        run: |
+          set -eux
+          export DEBIAN_FRONTEND=noninteractive  # keep apt from prompting inside the container
+          apt-get update
+          apt-get install -y --no-install-recommends \
+            libdw1 curl wget git pkg-config zlib1g-dev build-essential pciutils psmisc jq unzip ca-certificates
+
+      - name: Verify NVIDIA GPUs
+        run: nvidia-smi
+
+      - name: Install uv
+        uses: astral-sh/setup-uv@v7
+        with:
+          python-version: "3.12"
+          enable-cache: true
+
+      - name: Create virtual environment
+        run: uv venv --python 3.12
+
+      - name: Install NVSHMEM 3.4.5 for CUDA 12
+        run: |
+          set -euxo pipefail
+          GPU_COUNT=$(nvidia-smi -L | wc -l)  # `nvidia-smi -L` lists one GPU per line
+          if [ "$GPU_COUNT" -lt 4 ]; then
+            echo "Error: Expected at least 4 GPUs but found $GPU_COUNT"
+            exit 1
+          fi
+          # -f makes an HTTP error fail the step instead of saving an error page that would then be sourced; no chmod needed for `source`.
+          curl -fsSL https://raw.githubusercontent.com/pytorch/pytorch/main/.ci/docker/common/install_cuda.sh -o install_cuda.sh
+          source install_cuda.sh
+          install_nvshmem 12 3.4.5  # CUDA major 12, matching the cu128 wheels and the cuda12.8 job name (was 13)
+
+      - name: Install PyTorch via nightly cu128 channel
+        run: |
+          set -eux
+          source .venv/bin/activate
+          uv pip install -U --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128
+          python -c "import torch; print(f'PyTorch version: {torch.__version__}')"
+
+      - name: Install Helion (editable) with dev extras
+        run: |
+          set -eux
+          source .venv/bin/activate
+          uv pip install setuptools
+          SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0" uv pip install -e .'[dev]'
+          python -c "import helion; print(f'Helion version: {helion.__version__}')"
+
+      - name: Run Symmetric Memory distributed tests
+        env:
+          NCCL_NVSHMEM_ENABLE: "1"
+          TORCH_SYMMMEM: "NVSHMEM"
+        run: |
+          set -eux
+          source .venv/bin/activate
+          pytest -rf -vs --timeout=120 test/test_examples_dist.py