Skip to content

Commit d7e007e

Browse files
committed
Add distributed CI job
1 parent 0d01365 commit d7e007e

File tree

2 files changed

+32
-2
lines changed

2 files changed

+32
-2
lines changed

.github/matrix.json

Lines changed: 10 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -52,6 +52,16 @@
5252
"pytorch-version": "pytorch-nightly",
5353
"alias": "h100"
5454
},
55+
{
56+
"runner": "linux.aws.h100.4",
57+
"python-version": "3.12",
58+
"ref-eager": false,
59+
"image": "nvidia/cuda:13.0.1-devel-ubuntu24.04",
60+
"runtime-version": "cu130",
61+
"container-options": "--gpus all",
62+
"pytorch-version": "pytorch-nightly",
63+
"alias": "h100-distributed"
64+
},
5565
{
5666
"runner": "linux.dgx.b200",
5767
"python-version": "3.12",

.github/workflows/test.yml

Lines changed: 22 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -62,6 +62,22 @@ jobs:
6262
- name: Check out code
6363
uses: actions/checkout@v5
6464

65+
- name: Install system dependencies
66+
if: contains(matrix.alias, 'distributed')
67+
run: |
68+
set -eux
69+
apt-get update
70+
apt-get install -y curl wget git pkg-config zlib1g-dev build-essential
71+
72+
- name: Install NVSHMEM
73+
if: contains(matrix.alias, 'distributed')
74+
run: |
75+
set -euxo pipefail
76+
curl -L https://raw.githubusercontent.com/pytorch/pytorch/main/.ci/docker/common/install_cuda.sh -o install_cuda.sh
77+
chmod +x install_cuda.sh
78+
source install_cuda.sh
79+
install_nvshmem 13 3.4.5
80+
6581
- name: Install uv
6682
uses: astral-sh/setup-uv@v7
6783
with:
@@ -97,7 +113,7 @@ jobs:
97113
fi
98114
99115
- name: Install Triton
100-
if: contains(matrix.alias, 'cpu') || (steps.cache.outputs.cache-hit != 'true' && matrix.pytorch-version != 'pytorch-2.9')
116+
if: contains(matrix.alias, 'cpu') || (steps.cache.outputs.cache-hit != 'true' && matrix.pytorch-version != 'pytorch-2.9') || contains(matrix.alias, 'distributed')
101117
run: |
102118
set -x
103119
source .venv/bin/activate
@@ -139,7 +155,11 @@ jobs:
139155
if [[ "${{ contains(matrix.alias, 'cpu') }}" == "true" ]]; then export TRITON_CPU_BACKEND=1; fi
140156
# -rf: print failed tests
141157
# --timeout: max allowed time for each test
142-
pytest -rf --timeout=60
158+
if [[ "${{ matrix.alias }}" == *distributed* ]]; then
159+
pytest -rf --timeout=120 test/test_distributed.py
160+
else
161+
pytest -rf --timeout=60
162+
fi
143163
144164
test-notebooks:
145165
name: test-notebooks-cu128-py3.12-pytorch-2.9-a10g

0 commit comments

Comments
 (0)