@@ -16,95 +16,165 @@ concurrency:
   group: gpu-tests-${{ github.ref_name }}-${{ !(github.ref_protected) || github.sha }}
   cancel-in-progress: true
 
+# Cherry-picked from https://github.com/pytorch/test-infra/blob/main/.github/workflows/linux_job.yml
+
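+# The job now runs on a test-infra GPU runner and executes every build/test command
+# inside a CUDA Docker container via `docker exec`, instead of directly on a self-hosted 2-GPU machine.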
 jobs:
-  gpu-tests:
-    runs-on: [self-hosted, 2-gpus]
-    timeout-minutes: 45
-    defaults:
-      run:
-        shell: bash
+  gpu-tests:
     strategy:
-      max-parallel: 1
-      fail-fast: true
       matrix:
         pytorch-channel: [pytorch, pytorch-nightly]
+      fail-fast: false
     env:
-      AGENT_TOOLSDIRECTORY: /tmp/python
+      DOCKER_IMAGE: "pytorch/conda-builder:cuda11.7"
+      REPOSITORY: ${{ github.repository }}
+      PR_NUMBER: ${{ github.event.pull_request.number }}
+    runs-on: linux.8xlarge.nvidia.gpu
+    timeout-minutes: 45
 
     steps:
-      - uses: actions/checkout@v3
-
-      - name: Clean python tool path
+      - name: Clean workspace
         run: |
-          rm -rf ${AGENT_TOOLSDIRECTORY}
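+          # ::group:: / ::endgroup:: are GitHub Actions workflow commands that collapse this cleanup output in the job log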
+          echo "::group::Cleanup debug output"
+          sudo rm -rfv "${GITHUB_WORKSPACE}"
+          mkdir -p "${GITHUB_WORKSPACE}"
+          echo "::endgroup::"
+
+      - name: Checkout repository (pytorch/test-infra)
+        uses: actions/checkout@v3
+        with:
+          # Support the use case where we need to checkout someone's fork
+          repository: pytorch/test-infra
+          path: test-infra
+
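+      # setup-linux and pull-docker-image are reusable actions from pytorch/test-infra:
+      # they prepare the runner and fetch the image configured in env.DOCKER_IMAGE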
+      - name: Setup Linux
+        uses: ./test-infra/.github/actions/setup-linux
 
-      - uses: actions/setup-python@v4
+      - name: Pull docker image
+        uses: ./test-infra/.github/actions/pull-docker-image
         with:
-          python-version: 3.9
+          docker-image: ${{ env.DOCKER_IMAGE }}
 
-      - name: Install PyTorch
-        # https://pytorch.org/get-started/locally/
-        if: ${{ matrix.pytorch-channel == 'pytorch' }}
+      - name: Checkout repository (${{ github.repository }})
+        uses: actions/checkout@v3
+        with:
+          # Support the use case where we need to checkout someone's fork
+          repository: ${{ github.repository }}
+          ref: ${{ github.ref }}
+          path: ${{ github.repository }}
+          fetch-depth: 1
+
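+      # Start a long-lived container named "pthd"; the install/test/example steps below run inside it via `docker exec -t pthd`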
+      - name: Start Pytorch container
+        working-directory: ${{ github.repository }}
         run: |
-          pip install --upgrade torch torchvision --extra-index-url https://download.pytorch.org/whl/cu117
-          nvidia-smi
-          python -c "import torch; print(torch.__version__, ', CUDA is available: ', torch.cuda.is_available()); exit(not torch.cuda.is_available())"
-          pip list
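+          # --ipc=host and a larger --shm-size provide shared memory for PyTorch dataloader workers;
+          # SYS_PTRACE + seccomp=unconfined make debuggers usable inside the container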
+          docker run --name pthd --gpus=all --rm \
+            --cap-add=SYS_PTRACE \
+            --detach \
+            --ipc=host \
+            --security-opt seccomp=unconfined \
+            --shm-size=2g \
+            --tty \
+            --ulimit stack=10485760:83886080 \
+            -v $PWD:/work \
+            -w /work \
+            ${DOCKER_IMAGE}
+
+          script=$(cat << EOF
+
+          set -x
+
+          nvidia-smi
+          ls -alh
 
-      - name: Install PyTorch (nightly)
-        # https://pytorch.org/get-started/locally/
-        if: ${{ matrix.pytorch-channel == 'pytorch-nightly' }}
+          conda --version
+          python --version
+
+          EOF
+          )
+          docker exec -t pthd /bin/bash -c "${script}"
+
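+      # Install stable or nightly torch/torchvision (per the matrix channel) plus the repo's dev requirements, all inside the container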
+      - name: Install PyTorch and dependencies
+        continue-on-error: false
         run: |
-          pip install --upgrade --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu117
+
+          script=$(cat << EOF
+
+          set -x
+
+          # Install PyTorch
+          if [ "${{ matrix.pytorch-channel }}" == "pytorch" ]; then
+              pip install --upgrade torch torchvision --extra-index-url https://download.pytorch.org/whl/cu117
+          else
+              pip install --upgrade --pre torch torchvision --extra-index-url https://download.pytorch.org/whl/nightly/cu117
+          fi
+
           nvidia-smi
           python -c "import torch; print(torch.__version__, ', CUDA is available: ', torch.cuda.is_available()); exit(not torch.cuda.is_available())"
           pip list
 
-      - name: Install dependencies
-        run: |
+          # Install dependencies
           pip install -r requirements-dev.txt
           pip install -e .
 
+          EOF
+          )
+
+          docker exec -t pthd /bin/bash -c "${script}"
+
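+      # The existing GPU test suite invocation (tests/run_gpu_tests.sh 2) is unchanged, but now executes inside the container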
       - name: Run 1 Node 2 GPUs Unit Tests
+        continue-on-error: false
         run: |
+
+          script=$(cat << EOF
+
+          set -x
+
           bash tests/run_gpu_tests.sh 2
 
+          EOF
+          )
+
+          docker exec -t pthd /bin/bash -c "${script}"
+
       - name: Upload coverage to Codecov
         uses: codecov/codecov-action@v3
         with:
-          file: ./coverage.xml
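+          # coverage.xml is now written inside the repository checkout, which lives under ${{ github.repository }} in the workspace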
+          file: ${{ github.repository }}/coverage.xml
           flags: gpu-2
           fail_ci_if_error: false
 
-      - name: Install additional example dependencies
-        run: pip install fire
-
-      - name: Check training on cifar10, run without backend
-        run: |
-          export example_path="examples/contrib/cifar10"
-          # initial run
-          export stop_cmd="--stop_iteration=500"
-          CI=1 python ${example_path}/main.py run --checkpoint_every=200 ${stop_cmd}
-          # resume
-          export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-None-1_stop-on-500/training_checkpoint_400.pt"
-          CI=1 python ${example_path}/main.py run --checkpoint_every=200 --num_epochs=7 ${resume_opt}
-
-      - name: Check training on cifar10, run with NCCL backend using torchrun
-        run: |
-          export example_path="examples/contrib/cifar10"
-          # initial run
-          export stop_cmd="--stop_iteration=500"
-          CI=1 torchrun --nproc_per_node=2 ${example_path}/main.py run --backend=nccl --checkpoint_every=200 ${stop_cmd}
-          # resume
-          export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt"
-          CI=1 torchrun --nproc_per_node=2 ${example_path}/main.py run --backend=nccl --checkpoint_every=200 --num_epochs=7 ${resume_opt}
-
-      - name: Check training on cifar10, run with NCCL backend using spawn
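+      # The three cifar10 example checks (no backend, NCCL via torchrun, NCCL via spawn),
+      # each with a checkpoint/resume pass, are folded into a single in-container step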
+      - name: Run examples in container
+        continue-on-error: false
         run: |
-          export example_path="examples/contrib/cifar10"
-          # initial run
-          export stop_cmd="--stop_iteration=500"
-          CI=1 python -u ${example_path}/main.py run --backend=nccl --nproc_per_node=2 --checkpoint_every=200 ${stop_cmd}
-          # resume
-          export resume_opt="--resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt"
-          CI=1 python -u ${example_path}/main.py run --backend=nccl --nproc_per_node=2 --checkpoint_every=200 --num_epochs=7 ${resume_opt}
+          script=$(cat << EOF
+
+          set -x
+
+          # Install additional example dependencies
+          pip install fire
+
+          # Check training on cifar10, run without backend
+          ## initial run
+          CI=1 python examples/contrib/cifar10/main.py run --checkpoint_every=200 --stop_iteration=500
+          ## resume
+          CI=1 python examples/contrib/cifar10/main.py run --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-None-1_stop-on-500/training_checkpoint_400.pt
+
+          # Check training on cifar10, run with NCCL backend using torchrun
+          ## initial run
+          CI=1 torchrun --nproc_per_node=2 examples/contrib/cifar10/main.py run --backend=nccl --checkpoint_every=200 --stop_iteration=500
+          ## resume
+          CI=1 torchrun --nproc_per_node=2 examples/contrib/cifar10/main.py run --backend=nccl --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt
+
+          # Check training on cifar10, run with NCCL backend using spawn
+          ## initial run
+          CI=1 python -u examples/contrib/cifar10/main.py run --backend=nccl --nproc_per_node=2 --checkpoint_every=200 --stop_iteration=500
+          ## resume
+          CI=1 python -u examples/contrib/cifar10/main.py run --backend=nccl --nproc_per_node=2 --checkpoint_every=200 --num_epochs=7 --resume-from=/tmp/output-cifar10/resnet18_backend-nccl-2_stop-on-500/training_checkpoint_400.pt
+
+          EOF
+          )
+
+          docker exec -t pthd /bin/bash -c "${script}"
+
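+      # Always run the test-infra teardown, even if earlier steps failed, so the runner is left clean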
+      - name: Teardown Linux
+        if: ${{ always() }}
+        uses: ./test-infra/.github/actions/teardown-linux