Skip to content

Commit f24febc

Browse files
committed
try standalone job with same setup as PyTorch symm-mem job
1 parent a17723c commit f24febc

File tree

2 files changed

+126
-10
lines changed

2 files changed

+126
-10
lines changed

.github/matrix.json

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -52,16 +52,6 @@
5252
"pytorch-version": "pytorch-nightly",
5353
"alias": "h100"
5454
},
55-
{
56-
"runner": "linux.aws.h100.4",
57-
"python-version": "3.12",
58-
"ref-eager": false,
59-
"image": "nvidia/cuda:13.0.1-devel-ubuntu24.04",
60-
"runtime-version": "cu130",
61-
"container-options": "--gpus all",
62-
"pytorch-version": "pytorch-nightly",
63-
"alias": "h100-distributed"
64-
},
6555
{
6656
"runner": "linux.dgx.b200",
6757
"python-version": "3.12",
Lines changed: 126 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,126 @@
1+
# Standalone workflow that runs the symmetric-memory distributed tests on a
# multi-GPU H100 runner.
name: Symmetric Memory (H100)

on:
  # Nightly run at 08:22 (GitHub evaluates cron schedules in UTC).
  schedule:
    - cron: "22 8 * * *"
  # Tag-triggered runs: pushing a tag matching ciflow/h100-symm-mem/* starts the workflow.
  push:
    tags:
      - "ciflow/h100-symm-mem/*"
  # On pull requests, run only when this workflow file itself is modified.
  pull_request:
    paths:
      - ".github/workflows/h100-symm-mem.yml"
  # Allow manual runs from the Actions UI / API.
  workflow_dispatch: {}

# One active run per (workflow, PR number or commit SHA, is-manual, is-scheduled)
# tuple; a newer run in the same group cancels the one still in progress.
concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}
  cancel-in-progress: true

# Token scopes granted to every job in this workflow.
permissions:
  contents: read
  id-token: write
21+
22+
jobs:
  # Computes the git tree hash of `.ci/docker` at the current HEAD of
  # pytorch/pytorch and exposes it as the `hash` output, which the test job
  # uses as the container image tag.
  resolve-pytorch-ci-image:
    name: resolve-pytorch-ci-image
    runs-on: ubuntu-latest
    outputs:
      hash: ${{ steps.hash.outputs.hash }}
    steps:
      - name: Checkout PyTorch (for CI docker hash)
        uses: actions/checkout@v5
        with:
          repository: pytorch/pytorch
          path: pytorch-ci-src
          # `git rev-parse HEAD:.ci/docker` only needs the tree of HEAD, so a
          # shallow clone is enough; full history is not required.
          fetch-depth: 1

      - name: Compute docker image hash
        id: hash
        run: |
          set -eux
          cd pytorch-ci-src
          HASH=$(git rev-parse HEAD:.ci/docker)
          echo "hash=${HASH}" >> "${GITHUB_OUTPUT}"
          echo "Resolved PyTorch CI image hash: ${HASH}"

  # Runs the distributed symmetric-memory tests inside the PyTorch CI container
  # on a runner that is expected to expose at least 4 GPUs.
  h100-symm-mem:
    name: linux-jammy-cuda12.8-py3.12-gcc11-sm90-symm
    needs: resolve-pytorch-ci-image
    runs-on: linux.aws.h100.4
    timeout-minutes: 360
    container:
      # NOTE(review): the tag here is the bare `.ci/docker` tree hash. PyTorch CI
      # images are usually tagged `<image-name>-<hash>`; confirm this tag
      # actually exists on ghcr.io before relying on it.
      image: ghcr.io/pytorch/ci-image:${{ needs.resolve-pytorch-ci-image.outputs.hash }}
      # Folded scalar: the lines below are joined into a single option string.
      options: >-
        --gpus all
        --ipc=host
        --cap-add=SYS_PTRACE
        --shm-size=4g
        -e NVIDIA_DRIVER_CAPABILITIES=all
    defaults:
      run:
        # Login shell (-l) with exit-on-error (-e) for every `run` step.
        shell: bash -le {0}
    steps:
      - name: Checkout Helion
        uses: actions/checkout@v5

      - name: Install system dependencies
        run: |
          set -eux
          export DEBIAN_FRONTEND=noninteractive
          apt-get update
          apt-get install -y --no-install-recommends \
            libdw1 curl wget git pkg-config zlib1g-dev build-essential pciutils psmisc jq unzip ca-certificates

      # Fail fast, before any install work, if the runner does not expose the
      # GPUs the distributed tests need.
      - name: Verify NVIDIA GPUs
        run: |
          set -euxo pipefail
          nvidia-smi
          GPU_COUNT=$(nvidia-smi -L | wc -l)
          if [ "$GPU_COUNT" -lt 4 ]; then
            echo "Error: Expected at least 4 GPUs but found $GPU_COUNT"
            exit 1
          fi

      - name: Install uv
        uses: astral-sh/setup-uv@v7
        with:
          python-version: "3.12"
          enable-cache: true

      - name: Create virtual environment
        run: |
          uv venv --python 3.12

      # The CUDA major version passed to install_nvshmem must match the CUDA
      # 12.8 toolchain this job targets (job name and the cu128 PyTorch wheel).
      - name: Install NVSHMEM 3.4.5 for CUDA 12
        run: |
          set -euxo pipefail
          # -f makes curl exit non-zero on an HTTP error instead of saving the
          # error page, which would otherwise be sourced as a shell script.
          curl -fsSL https://raw.githubusercontent.com/pytorch/pytorch/main/.ci/docker/common/install_cuda.sh -o install_cuda.sh
          chmod +x install_cuda.sh
          # Sourced (not executed) so the install_nvshmem function becomes
          # available in this shell.
          source install_cuda.sh
          install_nvshmem 12 3.4.5

      - name: Install PyTorch via nightly cu128 channel
        run: |
          set -eux
          source .venv/bin/activate
          uv pip install -U --pre torch --index-url https://download.pytorch.org/whl/nightly/cu128
          python -c "import torch; print(f'PyTorch version: {torch.__version__}')"

      - name: Install Helion (editable) with dev extras
        run: |
          set -eux
          source .venv/bin/activate
          uv pip install setuptools
          SETUPTOOLS_SCM_PRETEND_VERSION="0.0.0" uv pip install -e .'[dev]'
          python -c "import helion; print(f'Helion version: {helion.__version__}')"

      - name: Run Symmetric Memory distributed tests
        env:
          # NOTE(review): presumably these select NVSHMEM as the
          # symmetric-memory backend; confirm against the PyTorch symm-mem job.
          NCCL_NVSHMEM_ENABLE: "1"
          TORCH_SYMMMEM: "NVSHMEM"
        run: |
          set -eux
          source .venv/bin/activate
          # --timeout is assumed to come from pytest-timeout in the dev extras.
          pytest -rf -vs --timeout=120 test/test_examples_dist.py

0 commit comments

Comments
 (0)