Skip to content

Commit 5618769

Browse files
authored
Migrated gpu-tests ci to pytorch infra (#2934)
* Try another gpu infra * Removed actions/setup-python As does not work on centos * another attempt * fix syntax error * yet another attempt * another try * fix small problem * fixed bugs * fixed exported vars * restructure steps * remove -i flag * update container run flags * added missing cwd into container * another try
1 parent d1f1512 commit 5618769

File tree

1 file changed

+130
-60
lines changed

1 file changed

+130
-60
lines changed

.github/workflows/gpu-tests.yml

Lines changed: 130 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -16,95 +16,165 @@ concurrency:
1616
group: gpu-tests-${{ github.ref_name }}-${{ !(github.ref_protected) || github.sha }}
1717
cancel-in-progress: true
1818

19+
# Cherry-picked from https://github.com/pytorch/test-infra/blob/main/.github/workflows/linux_job.yml
20+
1921
jobs:
20-
gpu-tests:
21-
runs-on: [self-hosted, 2-gpus]
22-
timeout-minutes: 45
23-
defaults:
24-
run:
25-
shell: bash
22+
gpu-tests:
2623
strategy:
27-
max-parallel: 1
28-
fail-fast: true
2924
matrix:
3025
pytorch-channel: [pytorch, pytorch-nightly]
26+
fail-fast: false
3127
env:
32-
AGENT_TOOLSDIRECTORY: /tmp/python
28+
DOCKER_IMAGE: "pytorch/conda-builder:cuda11.7"
29+
REPOSITORY: ${{ github.repository }}
30+
PR_NUMBER: ${{ github.event.pull_request.number }}
31+
runs-on: linux.8xlarge.nvidia.gpu
32+
timeout-minutes: 45
3333

3434
steps:
35-
- uses: actions/checkout@v3
36-
37-
- name: Clean python tool path
35+
- name: Clean workspace
3836
run: |
39-
rm -rf ${AGENT_TOOLSDIRECTORY}
37+
echo "::group::Cleanup debug output"
38+
sudo rm -rfv "${GITHUB_WORKSPACE}"
39+
mkdir -p "${GITHUB_WORKSPACE}"
40+
echo "::endgroup::"
41+
42+
- name: Checkout repository (pytorch/test-infra)
43+
uses: actions/checkout@v3
44+
with:
45+
# Support the use case where we need to checkout someone's fork
46+
repository: pytorch/test-infra
47+
path: test-infra
48+
49+
- name: Setup Linux
50+
uses: ./test-infra/.github/actions/setup-linux
4051

41-
- uses: actions/setup-python@v4
52+
- name: Pull docker image
53+
uses: ./test-infra/.github/actions/pull-docker-image
4254
with:
43-
python-version: 3.9
55+
docker-image: ${{ env.DOCKER_IMAGE }}
4456

45-
- name: Install PyTorch
46-
# https://pytorch.org/get-started/locally/
47-
if: ${{ matrix.pytorch-channel == 'pytorch' }}
57+
- name: Checkout repository (${{ github.repository }})
58+
uses: actions/checkout@v3
59+
with:
60+
# Support the use case where we need to checkout someone's fork
61+
repository: ${{ github.repository }}
62+
ref: ${{ github.ref }}
63+
path: ${{ github.repository }}
64+
fetch-depth: 1
65+
66+
- name: Start Pytorch container
67+
working-directory: ${{ github.repository }}
4868
run: |
49-
pip install --upgrade torch torchvision --extra-index-url https://download.pytorch.org/whl/cu117
50-
nvidia-smi
51-
python -c "import torch; print(torch.__version__, ', CUDA is available: ', torch.cuda.is_available()); exit(not torch.cuda.is_available())"
52-
pip list
69+
docker run --name pthd --gpus=all --rm \
70+
--cap-add=SYS_PTRACE \
71+
--detach \
72+
--ipc=host \
73+
--security-opt seccomp=unconfined \
74+
--shm-size=2g \
75+
--tty \
76+
--ulimit stack=10485760:83886080 \
77+
-v $PWD:/work \
78+
-w /work \
79+
${DOCKER_IMAGE}
80+
81+
script=$(cat << EOF
82+
83+
set -x
84+
85+
nvidia-smi
86+
ls -alh
5387
54-
- name: Install PyTorch (nightly)
55-
# https://pytorch.org/get-started/locally/
56-
if: ${{ matrix.pytorch-channel == 'pytorch-nightly' }}
88+
conda --version
89+
python --version
90+
91+
EOF
92+
)
93+
docker exec -t pthd /bin/bash -c "${script}"
94+
95+
- name: Install PyTorch and dependencies
96+
continue-on-error: false
5797
run: |
58-
pip install --upgrade --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu117
98+
99+
script=$(cat << EOF
100+
101+
set -x
102+
103+
# Install PyTorch
104+
if [ "${{ matrix.pytorch-channel }}" == "pytorch" ]; then
105+
pip install --upgrade torch torchvision --extra-index-url https://download.pytorch.org/whl/cu117
106+
else
107+
pip install --upgrade --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu117
108+
fi
109+
59110
nvidia-smi
60111
python -c "import torch; print(torch.__version__, ', CUDA is available: ', torch.cuda.is_available()); exit(not torch.cuda.is_available())"
61112
pip list
62113
63-
- name: Install dependencies
64-
run: |
114+
# Install dependencies
65115
pip install -r requirements-dev.txt
66116
pip install -e .
67117
118+
EOF
119+
)
120+
121+
docker exec -t pthd /bin/bash -c "${script}"
122+
68123
- name: Run 1 Node 2 GPUs Unit Tests
124+
continue-on-error: false
69125
run: |
126+
127+
script=$(cat << EOF
128+
129+
set -x
130+
70131
bash tests/run_gpu_tests.sh 2
71132
133+
EOF
134+
)
135+
136+
docker exec -t pthd /bin/bash -c "${script}"
137+
72138
- name: Upload coverage to Codecov
73139
uses: codecov/codecov-action@v3
74140
with:
75-
file: ./coverage.xml
141+
file: ${{ github.repository }}/coverage.xml
76142
flags: gpu-2
77143
fail_ci_if_error: false
78144

79-
- name: Install additional example dependencies
80-
run: pip install fire
81-
82-
- name: Check training on cifar10, run without backend
83-
run: |
84-
export example_path="examples/contrib/cifar10"
85-
# initial run
86-
export stop_cmd="--stop_iteration=500"
87-
CI=1 python ${example_path}/main.py run --checkpoint_every=200 ${stop_cmd}
88-
# resume
89-
export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-None-1_stop-on-500/training_checkpoint_400.pt"
90-
CI=1 python ${example_path}/main.py run --checkpoint_every=200 --num_epochs=7 ${resume_opt}
91-
92-
- name: Check training on cifar10, run with NCCL backend using torchrun
93-
run: |
94-
export example_path="examples/contrib/cifar10"
95-
# initial run
96-
export stop_cmd="--stop_iteration=500"
97-
CI=1 torchrun --nproc_per_node=2 ${example_path}/main.py run --backend=nccl --checkpoint_every=200 ${stop_cmd}
98-
# resume
99-
export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt"
100-
CI=1 torchrun --nproc_per_node=2 ${example_path}/main.py run --backend=nccl --checkpoint_every=200 --num_epochs=7 ${resume_opt}
101-
102-
- name: Check training on cifar10, run with NCCL backend using spawn
145+
- name: Run examples in container
146+
continue-on-error: false
103147
run: |
104-
export example_path="examples/contrib/cifar10"
105-
# initial run
106-
export stop_cmd="--stop_iteration=500"
107-
CI=1 python -u ${example_path}/main.py run --backend=nccl --nproc_per_node=2 --checkpoint_every=200 ${stop_cmd}
108-
# resume
109-
export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt"
110-
CI=1 python -u ${example_path}/main.py run --backend=nccl --nproc_per_node=2 --checkpoint_every=200 --num_epochs=7 ${resume_opt}
148+
script=$(cat << EOF
149+
150+
set -x
151+
152+
# Install additional example dependencies
153+
pip install fire
154+
155+
# Check training on cifar10, run without backend
156+
## initial run
157+
CI=1 python examples/contrib/cifar10/main.py run --checkpoint_every=200 --stop_iteration=500
158+
## resume
159+
CI=1 python examples/contrib/cifar10/main.py run --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-None-1_stop-on-500/training_checkpoint_400.pt
160+
161+
# Check training on cifar10, run with NCCL backend using torchrun
162+
## initial run
163+
CI=1 torchrun --nproc_per_node=2 examples/contrib/cifar10/main.py run --backend=nccl --checkpoint_every=200 --stop_iteration=500
164+
## resume
165+
CI=1 torchrun --nproc_per_node=2 examples/contrib/cifar10/main.py run --backend=nccl --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt
166+
167+
# Check training on cifar10, run with NCCL backend using spawn
168+
## initial run
169+
CI=1 python -u examples/contrib/cifar10/main.py run --backend=nccl --nproc_per_node=2 --checkpoint_every=200 --stop_iteration=500
170+
## resume
171+
CI=1 python -u examples/contrib/cifar10/main.py run --backend=nccl --nproc_per_node=2 --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt
172+
173+
EOF
174+
)
175+
176+
docker exec -t pthd /bin/bash -c "${script}"
177+
178+
- name: Teardown Linux
179+
if: ${{ always() }}
180+
uses: ./test-infra/.github/actions/teardown-linux

0 commit comments

Comments
 (0)