From 596dc43d295545133295dc8e6634224f6992700d Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 22 Nov 2022 12:43:32 +0100 Subject: [PATCH 01/13] enable CUDA tests on GHA --- test/common_utils.py | 4 ++-- test/conftest.py | 10 +++++----- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/test/common_utils.py b/test/common_utils.py index 9e919a14935..b76158b6c9c 100644 --- a/test/common_utils.py +++ b/test/common_utils.py @@ -13,11 +13,11 @@ import __main__ # noqa: 401 -IN_CIRCLE_CI = os.getenv("CIRCLECI", False) == "true" +IN_OSS_CI = any(os.getenv(var) == "true" for var in ["CIRCLECI", "GITHUB_ACTIONS"]) IN_RE_WORKER = os.environ.get("INSIDE_RE_WORKER") is not None IN_FBCODE = os.environ.get("IN_FBCODE_TORCHVISION") == "1" CUDA_NOT_AVAILABLE_MSG = "CUDA device not available" -CIRCLECI_GPU_NO_CUDA_MSG = "We're in a CircleCI GPU machine, and this test doesn't need cuda." +OSS_CI_GPU_NO_CUDA_MSG = "We're in an OSS GPU machine, and this test doesn't need cuda." @contextlib.contextmanager diff --git a/test/conftest.py b/test/conftest.py index 1a9b2db7f5c..54441f8430f 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -3,7 +3,7 @@ import numpy as np import pytest import torch -from common_utils import CIRCLECI_GPU_NO_CUDA_MSG, CUDA_NOT_AVAILABLE_MSG, IN_CIRCLE_CI, IN_FBCODE, IN_RE_WORKER +from common_utils import OSS_CI_GPU_NO_CUDA_MSG, CUDA_NOT_AVAILABLE_MSG, IN_OSS_CI, IN_FBCODE, IN_RE_WORKER def pytest_configure(config): @@ -18,7 +18,7 @@ def pytest_collection_modifyitems(items): # # Typically here, we try to optimize CI time. In particular, the GPU CI instances don't need to run the # tests that don't need CUDA, because those tests are extensively tested in the CPU CI instances already. - # This is true for both CircleCI and the fbcode internal CI. + # This is true for both OSS CI and the fbcode internal CI. # In the fbcode CI, we have an additional constraint: we try to avoid skipping tests. So instead of relying on # pytest.mark.skip, in fbcode we literally just remove those tests from the `items` list, and it's as if # these tests never existed. @@ -49,12 +49,12 @@ def pytest_collection_modifyitems(items): # TODO: something more robust would be to do that only in a sandcastle instance, # so that we can still see the test being skipped when testing locally from a devvm continue - elif IN_CIRCLE_CI: + elif IN_OSS_CI: # Here we're not in fbcode, so we can safely collect and skip tests. if not needs_cuda and torch.cuda.is_available(): - # Similar to what happens in RE workers: we don't need the CircleCI GPU machines + # Similar to what happens in RE workers: we don't need the OSS CI GPU machines # to run the CPU-only tests. - item.add_marker(pytest.mark.skip(reason=CIRCLECI_GPU_NO_CUDA_MSG)) + item.add_marker(pytest.mark.skip(reason=OSS_CI_GPU_NO_CUDA_MSG)) if item.get_closest_marker("dont_collect") is not None: # currently, this is only used for some tests we're sure we dont want to run on fbcode From 5199577f18a361cc985411ca38ea0b844b77fac5 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 22 Nov 2022 13:28:40 +0100 Subject: [PATCH 02/13] debug env vars --- .github/workflows/test-linux-gpu.yml | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/.github/workflows/test-linux-gpu.yml b/.github/workflows/test-linux-gpu.yml index a4d938f23ed..3545b79c630 100644 --- a/.github/workflows/test-linux-gpu.yml +++ b/.github/workflows/test-linux-gpu.yml @@ -46,16 +46,18 @@ jobs: conda create -yp ci_env python="${PYTHON_VERSION}" numpy libpng jpeg scipy conda activate /work/ci_env - # Install PyTorch, Torchvision, and testing libraries - set -ex - conda install \ - --yes \ - -c "pytorch-${CHANNEL}" \ - -c nvidia "pytorch-${CHANNEL}"::pytorch[build="*${VERSION}*"] \ - "${CUDATOOLKIT}" - python3 setup.py develop - python3 -m pip install pytest pytest-mock 'av<10' - - # Run Tests - python3 -m torch.utils.collect_env - python3 -m pytest --junitxml=test-results/junit.xml -v --durations 20 + python3 -c "import os; print('\n'.join(f'{key}: {value}' for key, value in sorted(os.environ.items())))" + +# # Install PyTorch, Torchvision, and testing libraries +# set -ex +# conda install \ +# --yes \ +# -c "pytorch-${CHANNEL}" \ +# -c nvidia "pytorch-${CHANNEL}"::pytorch[build="*${VERSION}*"] \ +# "${CUDATOOLKIT}" +# python3 setup.py develop +# python3 -m pip install pytest pytest-mock 'av<10' +# +# # Run Tests +# python3 -m torch.utils.collect_env +# python3 -m pytest --junitxml=test-results/junit.xml -v --durations 20 From 336d875a9ae52d6fd3f9559b9cbb5ba0c098f3bc Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 22 Nov 2022 13:51:25 +0100 Subject: [PATCH 03/13] add debug tests --- .github/workflows/test-linux-gpu.yml | 22 ++++++++++++---------- test/test_gpu_ci.py | 15 +++++++++++++++ 2 files changed, 27 insertions(+), 10 deletions(-) create mode 100644 test/test_gpu_ci.py diff --git a/.github/workflows/test-linux-gpu.yml b/.github/workflows/test-linux-gpu.yml index 3545b79c630..08a9c6b9665 100644 --- a/.github/workflows/test-linux-gpu.yml +++ b/.github/workflows/test-linux-gpu.yml @@ -48,16 +48,18 @@ jobs: python3 -c "import os; print('\n'.join(f'{key}: {value}' for key, value in sorted(os.environ.items())))" -# # Install PyTorch, Torchvision, and testing libraries -# set -ex -# conda install \ -# --yes \ -# -c "pytorch-${CHANNEL}" \ -# -c nvidia "pytorch-${CHANNEL}"::pytorch[build="*${VERSION}*"] \ -# "${CUDATOOLKIT}" -# python3 setup.py develop -# python3 -m pip install pytest pytest-mock 'av<10' -# + # Install PyTorch, Torchvision, and testing libraries + set -ex + conda install \ + --yes \ + -c "pytorch-${CHANNEL}" \ + -c nvidia "pytorch-${CHANNEL}"::pytorch[build="*${VERSION}*"] \ + "${CUDATOOLKIT}" + python3 setup.py develop + python3 -m pip install pytest pytest-mock 'av<10' + + python3 -m pytest -vrA test/test_gpu_ci.py + # # Run Tests # python3 -m torch.utils.collect_env # python3 -m pytest --junitxml=test-results/junit.xml -v --durations 20 diff --git a/test/test_gpu_ci.py b/test/test_gpu_ci.py new file mode 100644 index 00000000000..349b6973d7b --- /dev/null +++ b/test/test_gpu_ci.py @@ -0,0 +1,15 @@ +import os + +from common_utils import needs_cuda, cpu_and_gpu +import pytest + + +@pytest.mark.parametrize("device", cpu_and_gpu()) +def test_cpu_and_gpu(device): + print(os.getenv("GITHUB_ACTIONS")) + assert device != "cpu", "This should not be run on a GPU machine" + + +@needs_cuda +def test_needs_cuda(): + assert True From 2126c0d6ddc1d0df1bcfb9fe2322dbc1259b9f0e Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 22 Nov 2022 14:09:13 +0100 Subject: [PATCH 04/13] more debug output --- test/test_gpu_ci.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/test/test_gpu_ci.py b/test/test_gpu_ci.py index 349b6973d7b..77789a91134 100644 --- a/test/test_gpu_ci.py +++ b/test/test_gpu_ci.py @@ -5,11 +5,14 @@ @pytest.mark.parametrize("device", cpu_and_gpu()) -def test_cpu_and_gpu(device): +def test_cpu_and_gpu(request, device): print(os.getenv("GITHUB_ACTIONS")) + for mark in request.node.own_markers: + print(mark) assert device != "cpu", "This should not be run on a GPU machine" @needs_cuda -def test_needs_cuda(): - assert True +def test_needs_cuda(request): + for mark in request.node.own_markers: + print(mark) From 56bc60a56020b3d7841be15f24948f53be3d0bbe Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 22 Nov 2022 14:39:52 +0100 Subject: [PATCH 05/13] check if CUDA is available --- .github/workflows/test-linux-gpu.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test-linux-gpu.yml b/.github/workflows/test-linux-gpu.yml index 08a9c6b9665..5607efa900b 100644 --- a/.github/workflows/test-linux-gpu.yml +++ b/.github/workflows/test-linux-gpu.yml @@ -55,8 +55,12 @@ jobs: -c "pytorch-${CHANNEL}" \ -c nvidia "pytorch-${CHANNEL}"::pytorch[build="*${VERSION}*"] \ "${CUDATOOLKIT}" + + python3 -m torch.utils.collect_env + python3 -c "import torch; exit(not torch.cuda.is_available())" + python3 setup.py develop - python3 -m pip install pytest pytest-mock 'av<10' + python3 -m pip install --progress-bar=off pytest pytest-mock 'av<10' python3 -m pytest -vrA test/test_gpu_ci.py From 0a520ca47f2654ec1027ec199d978677775a04b6 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 22 Nov 2022 14:58:47 +0100 Subject: [PATCH 06/13] try access nvidia driver --- .github/workflows/test-linux-gpu.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/test-linux-gpu.yml b/.github/workflows/test-linux-gpu.yml index 5607efa900b..b0f24ea3b48 100644 --- a/.github/workflows/test-linux-gpu.yml +++ b/.github/workflows/test-linux-gpu.yml @@ -29,6 +29,9 @@ jobs: script: | # Mark Build Directory Safe git config --global --add safe.directory /__w/vision/vision + + nvidia-smi + exit 0 # Set up Environment Variables export PYTHON_VERSION="${{ matrix.python_version }}" From 350aa70f5c82675a4950db43faec5672466221f8 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 22 Nov 2022 15:06:59 +0100 Subject: [PATCH 07/13] try modinfo --- .github/workflows/test-linux-gpu.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/test-linux-gpu.yml b/.github/workflows/test-linux-gpu.yml index b0f24ea3b48..3cec3c3e01a 100644 --- a/.github/workflows/test-linux-gpu.yml +++ b/.github/workflows/test-linux-gpu.yml @@ -30,7 +30,8 @@ jobs: # Mark Build Directory Safe git config --global --add safe.directory /__w/vision/vision - nvidia-smi + modinfo nvidia || true + nvidia-smi || true exit 0 # Set up Environment Variables From cc9595a2693d3887314b204f6486bfe913170fa9 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 22 Nov 2022 15:18:26 +0100 Subject: [PATCH 08/13] revert debug --- .github/workflows/build-wheels-linux.yml | 4 +++- .github/workflows/test-linux-gpu.yml | 20 ++++---------------- test/test_gpu_ci.py | 18 ------------------ 3 files changed, 7 insertions(+), 35 deletions(-) delete mode 100644 test/test_gpu_ci.py diff --git a/.github/workflows/build-wheels-linux.yml b/.github/workflows/build-wheels-linux.yml index d2000cae995..16a9a9f5f3c 100644 --- a/.github/workflows/build-wheels-linux.yml +++ b/.github/workflows/build-wheels-linux.yml @@ -38,7 +38,9 @@ jobs: post-script: ${{ matrix.post-script }} package-name: ${{ matrix.package-name }} smoke-test-script: ${{ matrix.smoke-test-script }} - trigger-event: ${{ github.event_name }} + # Using "development" as trigger event so these binaries are not uploaded + # to official channels yet + trigger-event: development secrets: AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} diff --git a/.github/workflows/test-linux-gpu.yml b/.github/workflows/test-linux-gpu.yml index 3cec3c3e01a..a4d938f23ed 100644 --- a/.github/workflows/test-linux-gpu.yml +++ b/.github/workflows/test-linux-gpu.yml @@ -29,10 +29,6 @@ jobs: script: | # Mark Build Directory Safe git config --global --add safe.directory /__w/vision/vision - - modinfo nvidia || true - nvidia-smi || true - exit 0 # Set up Environment Variables export PYTHON_VERSION="${{ matrix.python_version }}" @@ -50,8 +46,6 @@ jobs: conda create -yp ci_env python="${PYTHON_VERSION}" numpy libpng jpeg scipy conda activate /work/ci_env - python3 -c "import os; print('\n'.join(f'{key}: {value}' for key, value in sorted(os.environ.items())))" - # Install PyTorch, Torchvision, and testing libraries set -ex conda install \ @@ -59,15 +53,9 @@ jobs: -c "pytorch-${CHANNEL}" \ -c nvidia "pytorch-${CHANNEL}"::pytorch[build="*${VERSION}*"] \ "${CUDATOOLKIT}" - - python3 -m torch.utils.collect_env - python3 -c "import torch; exit(not torch.cuda.is_available())" - python3 setup.py develop - python3 -m pip install --progress-bar=off pytest pytest-mock 'av<10' - - python3 -m pytest -vrA test/test_gpu_ci.py + python3 -m pip install pytest pytest-mock 'av<10' -# # Run Tests -# python3 -m torch.utils.collect_env -# python3 -m pytest --junitxml=test-results/junit.xml -v --durations 20 + # Run Tests + python3 -m torch.utils.collect_env + python3 -m pytest --junitxml=test-results/junit.xml -v --durations 20 diff --git a/test/test_gpu_ci.py b/test/test_gpu_ci.py deleted file mode 100644 index 77789a91134..00000000000 --- a/test/test_gpu_ci.py +++ /dev/null @@ -1,18 +0,0 @@ -import os - -from common_utils import needs_cuda, cpu_and_gpu -import pytest - - -@pytest.mark.parametrize("device", cpu_and_gpu()) -def test_cpu_and_gpu(request, device): - print(os.getenv("GITHUB_ACTIONS")) - for mark in request.node.own_markers: - print(mark) - assert device != "cpu", "This should not be run on a GPU machine" - - -@needs_cuda -def test_needs_cuda(request): - for mark in request.node.own_markers: - print(mark) From 1a2efbee6fe97a91659dc1e8ab8449e438aad4da Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 22 Nov 2022 15:25:09 +0100 Subject: [PATCH 09/13] fix revert --- .github/workflows/build-wheels-linux.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/.github/workflows/build-wheels-linux.yml b/.github/workflows/build-wheels-linux.yml index 16a9a9f5f3c..d2000cae995 100644 --- a/.github/workflows/build-wheels-linux.yml +++ b/.github/workflows/build-wheels-linux.yml @@ -38,9 +38,7 @@ jobs: post-script: ${{ matrix.post-script }} package-name: ${{ matrix.package-name }} smoke-test-script: ${{ matrix.smoke-test-script }} - # Using "development" as trigger event so these binaries are not uploaded - # to official channels yet - trigger-event: development + trigger-event: ${{ github.event_name }} secrets: AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} From 5d6a57b76ffca7589cc5bacbd575a0f77ca39784 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Tue, 22 Nov 2022 15:30:47 +0100 Subject: [PATCH 10/13] lint --- test/conftest.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/conftest.py b/test/conftest.py index 54441f8430f..bb01342b494 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -3,7 +3,7 @@ import numpy as np import pytest import torch -from common_utils import OSS_CI_GPU_NO_CUDA_MSG, CUDA_NOT_AVAILABLE_MSG, IN_OSS_CI, IN_FBCODE, IN_RE_WORKER +from common_utils import CUDA_NOT_AVAILABLE_MSG, IN_FBCODE, IN_OSS_CI, IN_RE_WORKER, OSS_CI_GPU_NO_CUDA_MSG def pytest_configure(config): From 8d35989f324b350181ed37983fc9b0c97eefda81 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 8 Dec 2022 13:21:25 +0100 Subject: [PATCH 11/13] set smoke test for CUDA --- .github/workflows/test-linux-gpu.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/test-linux-gpu.yml b/.github/workflows/test-linux-gpu.yml index a4d938f23ed..1ea289cb5c1 100644 --- a/.github/workflows/test-linux-gpu.yml +++ b/.github/workflows/test-linux-gpu.yml @@ -53,6 +53,8 @@ jobs: -c "pytorch-${CHANNEL}" \ -c nvidia "pytorch-${CHANNEL}"::pytorch[build="*${VERSION}*"] \ "${CUDATOOLKIT}" + python -c 'import torch; exit(not torch.cuda.is_available())' + python3 setup.py develop python3 -m pip install pytest pytest-mock 'av<10' From df22a674c6b17a224a9f316cc5e68b6b4e73a103 Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 9 Feb 2023 15:42:55 +0100 Subject: [PATCH 12/13] cleanup --- .github/workflows/test-linux-gpu.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/test-linux-gpu.yml b/.github/workflows/test-linux-gpu.yml index dbb847ab970..b14600c8db0 100644 --- a/.github/workflows/test-linux-gpu.yml +++ b/.github/workflows/test-linux-gpu.yml @@ -53,7 +53,6 @@ jobs: -c "pytorch-${CHANNEL}" \ -c nvidia "pytorch-${CHANNEL}"::pytorch[build="*${VERSION}*"] \ "${CUDATOOLKIT}" - python -c 'import torch; exit(not torch.cuda.is_available())' python3 setup.py develop python3 -m pip install pytest pytest-mock 'av<10' From 9bc64e7e6a445c26789fbd943d4c791b3bd2f2db Mon Sep 17 00:00:00 2001 From: Philip Meier Date: Thu, 9 Feb 2023 15:44:18 +0100 Subject: [PATCH 13/13] revert unrelated --- .github/workflows/test-linux-gpu.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/test-linux-gpu.yml b/.github/workflows/test-linux-gpu.yml index b14600c8db0..831de27e350 100644 --- a/.github/workflows/test-linux-gpu.yml +++ b/.github/workflows/test-linux-gpu.yml @@ -53,7 +53,6 @@ jobs: -c "pytorch-${CHANNEL}" \ -c nvidia "pytorch-${CHANNEL}"::pytorch[build="*${VERSION}*"] \ "${CUDATOOLKIT}" - python3 setup.py develop python3 -m pip install pytest pytest-mock 'av<10'