@@ -16,95 +16,165 @@ concurrency:
   group: gpu-tests-${{ github.ref_name }}-${{ !(github.ref_protected) || github.sha }}
   cancel-in-progress: true
 
+# Cherry-picked from https://github.com/pytorch/test-infra/blob/main/.github/workflows/linux_job.yml
+
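+# The job now runs on a test-infra GPU runner and executes every build/test command
+# inside a CUDA Docker container via `docker exec`, instead of directly on a self-hosted 2-GPU machine.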
 jobs:
-  gpu-tests:
-    runs-on: [self-hosted, 2-gpus]
-    timeout-minutes: 45
-    defaults:
-      run:
-        shell: bash
+  gpu-tests:
     strategy:
-      max-parallel: 1
-      fail-fast: true
       matrix:
         pytorch-channel: [pytorch, pytorch-nightly]
+      fail-fast: false
     env:
-      AGENT_TOOLSDIRECTORY: /tmp/python
+      DOCKER_IMAGE: "pytorch/conda-builder:cuda11.7"
+      REPOSITORY: ${{ github.repository }}
+      PR_NUMBER: ${{ github.event.pull_request.number }}
+    runs-on: linux.8xlarge.nvidia.gpu
+    timeout-minutes: 45
 
     steps:
-      - uses: actions/checkout@v3
-
-      - name: Clean python tool path
+      - name: Clean workspace
         run: |
-          rm -rf ${AGENT_TOOLSDIRECTORY}
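+          # ::group:: / ::endgroup:: are GitHub Actions workflow commands that collapse this cleanup output in the job log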
+          echo "::group::Cleanup debug output"
+          sudo rm -rfv "${GITHUB_WORKSPACE}"
+          mkdir -p "${GITHUB_WORKSPACE}"
+          echo "::endgroup::"
+
+      - name: Checkout repository (pytorch/test-infra)
+        uses: actions/checkout@v3
+        with:
+          # Support the use case where we need to checkout someone's fork
+          repository: pytorch/test-infra
+          path: test-infra
+
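+      # setup-linux and pull-docker-image are reusable actions from pytorch/test-infra:
+      # they prepare the runner and fetch the image configured in env.DOCKER_IMAGE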
+      - name: Setup Linux
+        uses: ./test-infra/.github/actions/setup-linux
 
-      - uses: actions/setup-python@v4
+      - name: Pull docker image
+        uses: ./test-infra/.github/actions/pull-docker-image
         with:
-          python-version: 3.9
+          docker-image: ${{ env.DOCKER_IMAGE }}
 
-      - name: Install PyTorch
-        # https://pytorch.org/get-started/locally/
-        if: ${{ matrix.pytorch-channel == 'pytorch' }}
+      - name: Checkout repository (${{ github.repository }})
+        uses: actions/checkout@v3
+        with:
+          # Support the use case where we need to checkout someone's fork
+          repository: ${{ github.repository }}
+          ref: ${{ github.ref }}
+          path: ${{ github.repository }}
+          fetch-depth: 1
+
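+      # Start a long-lived container named "pthd"; the install/test/example steps below run inside it via `docker exec -t pthd`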
+      - name: Start Pytorch container
+        working-directory: ${{ github.repository }}
         run: |
-          pip install --upgrade torch torchvision --extra-index-url https://download.pytorch.org/whl/cu117
-          nvidia-smi
-          python -c "import torch; print(torch.__version__, ', CUDA is available: ', torch.cuda.is_available()); exit(not torch.cuda.is_available())"
-          pip list
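+          # --ipc=host and a larger --shm-size provide shared memory for PyTorch dataloader workers;
+          # SYS_PTRACE + seccomp=unconfined make debuggers usable inside the container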
+          docker run --name pthd --gpus=all --rm \
+            --cap-add=SYS_PTRACE \
+            --detach \
+            --ipc=host \
+            --security-opt seccomp=unconfined \
+            --shm-size=2g \
+            --tty \
+            --ulimit stack=10485760:83886080 \
+            -v $PWD:/work \
+            -w /work \
+            ${DOCKER_IMAGE}
+
+          script=$(cat << EOF
+
+          set -x
+
+          nvidia-smi
+          ls -alh
 
-      - name: Install PyTorch (nightly)
-        # https://pytorch.org/get-started/locally/
-        if: ${{ matrix.pytorch-channel == 'pytorch-nightly' }}
+          conda --version
+          python --version
+
+          EOF
+          )
+          docker exec -t pthd /bin/bash -c "${script}"
+
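+      # Install stable or nightly torch/torchvision (per the matrix channel) plus the repo's dev requirements, all inside the container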
+      - name: Install PyTorch and dependencies
+        continue-on-error: false
         run: |
-          pip install --upgrade --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu117
+
+          script=$(cat << EOF
+
+          set -x
+
+          # Install PyTorch
+          if [ "${{ matrix.pytorch-channel }}" == "pytorch" ]; then
+              pip install --upgrade torch torchvision --extra-index-url https://download.pytorch.org/whl/cu117
+          else
+              pip install --upgrade --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu117
+          fi
+
           nvidia-smi
           python -c "import torch; print(torch.__version__, ', CUDA is available: ', torch.cuda.is_available()); exit(not torch.cuda.is_available())"
           pip list
 
-      - name: Install dependencies
-        run: |
+          # Install dependencies
           pip install -r requirements-dev.txt
           pip install -e .
 
+          EOF
+          )
+
+          docker exec -t pthd /bin/bash -c "${script}"
+
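+      # The existing GPU test suite invocation (tests/run_gpu_tests.sh 2) is unchanged, but now executes inside the container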
       - name: Run 1 Node 2 GPUs Unit Tests
+        continue-on-error: false
         run: |
+
+          script=$(cat << EOF
+
+          set -x
+
           bash tests/run_gpu_tests.sh 2
 
+          EOF
+          )
+
+          docker exec -t pthd /bin/bash -c "${script}"
+
       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v3
         with:
-          file: ./coverage.xml
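+          # coverage.xml is now written inside the repository checkout, which lives under ${{ github.repository }} in the workspace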
+          file: ${{ github.repository }}/coverage.xml
           flags: gpu-2
           fail_ci_if_error: false
 
-      - name: Install additional example dependencies
-        run: pip install fire
-
-      - name: Check training on cifar10, run without backend
-        run: |
-          export example_path="examples/contrib/cifar10"
-          # initial run
-          export stop_cmd="--stop_iteration=500"
-          CI=1 python ${example_path}/main.py run --checkpoint_every=200 ${stop_cmd}
-          # resume
-          export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-None-1_stop-on-500/training_checkpoint_400.pt"
-          CI=1 python ${example_path}/main.py run --checkpoint_every=200 --num_epochs=7 ${resume_opt}
-
-      - name: Check training on cifar10, run with NCCL backend using torchrun
-        run: |
-          export example_path="examples/contrib/cifar10"
-          # initial run
-          export stop_cmd="--stop_iteration=500"
-          CI=1 torchrun --nproc_per_node=2 ${example_path}/main.py run --backend=nccl --checkpoint_every=200 ${stop_cmd}
-          # resume
-          export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt"
-          CI=1 torchrun --nproc_per_node=2 ${example_path}/main.py run --backend=nccl --checkpoint_every=200 --num_epochs=7 ${resume_opt}
-
-      - name: Check training on cifar10, run with NCCL backend using spawn
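+      # The three cifar10 example checks (no backend, NCCL via torchrun, NCCL via spawn),
+      # each with a checkpoint/resume pass, are folded into a single in-container step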
+      - name: Run examples in container
+        continue-on-error: false
         run: |
-          export example_path="examples/contrib/cifar10"
-          # initial run
-          export stop_cmd="--stop_iteration=500"
-          CI=1 python -u ${example_path}/main.py run --backend=nccl --nproc_per_node=2 --checkpoint_every=200 ${stop_cmd}
-          # resume
-          export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt"
-          CI=1 python -u ${example_path}/main.py run --backend=nccl --nproc_per_node=2 --checkpoint_every=200 --num_epochs=7 ${resume_opt}
+          script=$(cat << EOF
+
+          set -x
+
+          # Install additional example dependencies
+          pip install fire
+
+          # Check training on cifar10, run without backend
+          ## initial run
+          CI=1 python examples/contrib/cifar10/main.py run --checkpoint_every=200 --stop_iteration=500
+          ## resume
+          CI=1 python examples/contrib/cifar10/main.py run --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-None-1_stop-on-500/training_checkpoint_400.pt
+
+          # Check training on cifar10, run with NCCL backend using torchrun
+          ## initial run
+          CI=1 torchrun --nproc_per_node=2 examples/contrib/cifar10/main.py run --backend=nccl --checkpoint_every=200 --stop_iteration=500
+          ## resume
+          CI=1 torchrun --nproc_per_node=2 examples/contrib/cifar10/main.py run --backend=nccl --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt
+
+          # Check training on cifar10, run with NCCL backend using spawn
+          ## initial run
+          CI=1 python -u examples/contrib/cifar10/main.py run --backend=nccl --nproc_per_node=2 --checkpoint_every=200 --stop_iteration=500
+          ## resume
+          CI=1 python -u examples/contrib/cifar10/main.py run --backend=nccl --nproc_per_node=2 --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt
+
+          EOF
+          )
+
+          docker exec -t pthd /bin/bash -c "${script}"
+
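+      # Always run the test-infra teardown, even if earlier steps failed, so the runner is left clean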
+      - name: Teardown Linux
+        if: ${{ always() }}
+        uses: ./test-infra/.github/actions/teardown-linux