From a06810434f3c42059712aa36f11a50e6be9f841a Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Fri, 15 Oct 2021 09:45:00 +0100 Subject: [PATCH 1/4] upgrade torch image to 2109 Signed-off-by: Wenqi Li --- .github/workflows/cron.yml | 6 +++--- .github/workflows/pythonapp-gpu.yml | 4 ++-- Dockerfile | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/cron.yml b/.github/workflows/cron.yml index 77d43d35bd..c864b4603e 100644 --- a/.github/workflows/cron.yml +++ b/.github/workflows/cron.yml @@ -62,7 +62,7 @@ jobs: if: github.repository == 'Project-MONAI/MONAI' strategy: matrix: - container: ["pytorch:21.02", "pytorch:21.08"] # 21.02 for backward comp. + container: ["pytorch:21.02", "pytorch:21.09"] # 21.02 for backward comp. container: image: nvcr.io/nvidia/${{ matrix.container }}-py3 # testing with the latest pytorch base image options: "--gpus all" @@ -106,7 +106,7 @@ jobs: if: github.repository == 'Project-MONAI/MONAI' strategy: matrix: - container: ["pytorch:21.02", "pytorch:21.08"] # 21.02 for backward comp. + container: ["pytorch:21.02", "pytorch:21.09"] # 21.02 for backward comp. container: image: nvcr.io/nvidia/${{ matrix.container }}-py3 # testing with the latest pytorch base image options: "--gpus all" @@ -204,7 +204,7 @@ jobs: if: github.repository == 'Project-MONAI/MONAI' needs: cron-gpu # so that monai itself is verified first container: - image: nvcr.io/nvidia/pytorch:21.08-py3 # testing with the latest pytorch base image + image: nvcr.io/nvidia/pytorch:21.09-py3 # testing with the latest pytorch base image options: "--gpus all --ipc=host" runs-on: [self-hosted, linux, x64, common] steps: diff --git a/.github/workflows/pythonapp-gpu.yml b/.github/workflows/pythonapp-gpu.yml index edaa2487ce..8f285af859 100644 --- a/.github/workflows/pythonapp-gpu.yml +++ b/.github/workflows/pythonapp-gpu.yml @@ -43,9 +43,9 @@ jobs: - environment: PT19+CUDA114 # we explicitly set pytorch to -h to avoid pip install error # https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes - # 21.08: 1.10.0a0+3fd9dcf + # 21.09: 1.10.0a0+3fd9dcf pytorch: "-h" - base: "nvcr.io/nvidia/pytorch:21.08-py3" + base: "nvcr.io/nvidia/pytorch:21.09-py3" - environment: PT19+CUDA102 pytorch: "torch==1.9.1 torchvision==0.10.1" base: "nvcr.io/nvidia/cuda:10.2-devel-ubuntu18.04" diff --git a/Dockerfile b/Dockerfile index 77fe1f828f..ce4306c639 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,7 +11,7 @@ # To build with a different base image # please run `docker build` using the `--build-arg PYTORCH_IMAGE=...` flag. -ARG PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:21.08-py3 +ARG PYTORCH_IMAGE=nvcr.io/nvidia/pytorch:21.09-py3 FROM ${PYTORCH_IMAGE} LABEL maintainer="monai.contact@gmail.com" From c6d84e82884f4b83c170ff1b112e202d6d4d16ba Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Fri, 15 Oct 2021 09:47:36 +0100 Subject: [PATCH 2/4] temp tests Signed-off-by: Wenqi Li --- .github/workflows/cron.yml | 59 +++-------------------------- .github/workflows/pythonapp-gpu.yml | 6 +-- 2 files changed, 6 insertions(+), 59 deletions(-) diff --git a/.github/workflows/cron.yml b/.github/workflows/cron.yml index c864b4603e..f9c43b9e97 100644 --- a/.github/workflows/cron.yml +++ b/.github/workflows/cron.yml @@ -5,64 +5,16 @@ on: # - cron: "0 2 * * *" # at 02:00 UTC # Allows you to run this workflow manually from the Actions tab workflow_dispatch: + push: + branches: + - test-2109 jobs: - cron-gpu: - if: github.repository == 'Project-MONAI/MONAI' - container: - image: nvcr.io/nvidia/pytorch:20.03-py3 # CUDA 10.2 - options: "--gpus all" - runs-on: [self-hosted, linux, x64, common] - strategy: - matrix: - pytorch-version: [1.5.1, 1.6.0, 1.7.1, 1.8.1, latest] - steps: - - uses: actions/checkout@v2 - - name: Install the dependencies - run: | - which python - python -m pip install --upgrade pip wheel - python -m pip uninstall -y torch torchvision - if [ ${{ matrix.pytorch-version }} == "latest" ]; then - python -m pip install torch torchvision - elif [ ${{ matrix.pytorch-version }} == "1.5.1" ]; then - python -m pip install torch==1.5.1 torchvision==0.6.1 - elif [ ${{ matrix.pytorch-version }} == "1.6.0" ]; then - python -m pip install torch==1.6.0 torchvision==0.7.0 - elif [ ${{ matrix.pytorch-version }} == "1.7.1" ]; then - python -m pip install torch==1.7.1 torchvision==0.8.2 - elif [ ${{ matrix.pytorch-version }} == "1.8.1" ]; then - python -m pip install torch==1.8.1 torchvision==0.9.1 - fi - python -m pip install -r requirements-dev.txt - python -m pip list - - name: Run tests report coverage - run: | - export LAUNCH_DELAY=$[ $RANDOM % 16 * 60 ] - echo "Sleep $LAUNCH_DELAY" - sleep $LAUNCH_DELAY - nvidia-smi - export CUDA_VISIBLE_DEVICES=$(python -m tests.utils) - echo $CUDA_VISIBLE_DEVICES - trap 'if pgrep python; then pkill python; fi;' ERR - python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null & - python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))" - python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))' - BUILD_MONAI=1 ./runtests.sh --coverage --unittests # unit tests with coverage report - BUILD_MONAI=1 ./runtests.sh --coverage --net # integration tests with coverage report - coverage xml - if pgrep python; then pkill python; fi - - name: Upload coverage - uses: codecov/codecov-action@v1 - with: - fail_ci_if_error: false - file: ./coverage.xml - cron-pt-image: if: github.repository == 'Project-MONAI/MONAI' strategy: matrix: - container: ["pytorch:21.02", "pytorch:21.09"] # 21.02 for backward comp. + container: ["pytorch:21.09"] # 21.02 for backward comp. container: image: nvcr.io/nvidia/${{ matrix.container }}-py3 # testing with the latest pytorch base image options: "--gpus all" @@ -106,7 +58,7 @@ jobs: if: github.repository == 'Project-MONAI/MONAI' strategy: matrix: - container: ["pytorch:21.02", "pytorch:21.09"] # 21.02 for backward comp. + container: ["pytorch:21.09"] # 21.02 for backward comp. container: image: nvcr.io/nvidia/${{ matrix.container }}-py3 # testing with the latest pytorch base image options: "--gpus all" @@ -202,7 +154,6 @@ jobs: cron-tutorial-notebooks: if: github.repository == 'Project-MONAI/MONAI' - needs: cron-gpu # so that monai itself is verified first container: image: nvcr.io/nvidia/pytorch:21.09-py3 # testing with the latest pytorch base image options: "--gpus all --ipc=host" diff --git a/.github/workflows/pythonapp-gpu.yml b/.github/workflows/pythonapp-gpu.yml index 8f285af859..3f015d1627 100644 --- a/.github/workflows/pythonapp-gpu.yml +++ b/.github/workflows/pythonapp-gpu.yml @@ -6,6 +6,7 @@ on: branches: - main - releasing/* + - test-2109 pull_request: concurrency: @@ -19,12 +20,7 @@ jobs: strategy: matrix: environment: - - "PT16+CUDA110" - - "PT17+CUDA102" - - "PT17+CUDA110" - - "PT18+CUDA102" - "PT19+CUDA114" - - "PT19+CUDA102" include: - environment: PT16+CUDA110 # we explicitly set pytorch to -h to avoid pip install error From e35b7d1f204677426c7e579e83b23370c2ddb737 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Fri, 15 Oct 2021 11:48:00 +0100 Subject: [PATCH 3/4] more time for fast training Signed-off-by: Wenqi Li --- tests/test_integration_fast_train.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_integration_fast_train.py b/tests/test_integration_fast_train.py index 2cf7eee479..9fd37a0897 100644 --- a/tests/test_integration_fast_train.py +++ b/tests/test_integration_fast_train.py @@ -76,7 +76,7 @@ def tearDown(self): shutil.rmtree(self.data_dir) # test the fast training speed is as expected - @TimedCall(seconds=30, daemon=False) + @TimedCall(seconds=100, daemon=False, force_quit=False) def test_train_timing(self): images = sorted(glob(os.path.join(self.data_dir, "img*.nii.gz"))) segs = sorted(glob(os.path.join(self.data_dir, "seg*.nii.gz"))) From ff47fa4869da4cda9e6e12574bd977fe102c25e6 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Fri, 15 Oct 2021 14:45:59 +0100 Subject: [PATCH 4/4] Revert "temp tests" This reverts commit a5b4447b44edb0bacefeab6c4171309f1437b196. Signed-off-by: Wenqi Li --- .github/workflows/cron.yml | 59 ++++++++++++++++++++++++++--- .github/workflows/pythonapp-gpu.yml | 6 ++- 2 files changed, 59 insertions(+), 6 deletions(-) diff --git a/.github/workflows/cron.yml b/.github/workflows/cron.yml index f9c43b9e97..c864b4603e 100644 --- a/.github/workflows/cron.yml +++ b/.github/workflows/cron.yml @@ -5,16 +5,64 @@ on: # - cron: "0 2 * * *" # at 02:00 UTC # Allows you to run this workflow manually from the Actions tab workflow_dispatch: - push: - branches: - - test-2109 jobs: + cron-gpu: + if: github.repository == 'Project-MONAI/MONAI' + container: + image: nvcr.io/nvidia/pytorch:20.03-py3 # CUDA 10.2 + options: "--gpus all" + runs-on: [self-hosted, linux, x64, common] + strategy: + matrix: + pytorch-version: [1.5.1, 1.6.0, 1.7.1, 1.8.1, latest] + steps: + - uses: actions/checkout@v2 + - name: Install the dependencies + run: | + which python + python -m pip install --upgrade pip wheel + python -m pip uninstall -y torch torchvision + if [ ${{ matrix.pytorch-version }} == "latest" ]; then + python -m pip install torch torchvision + elif [ ${{ matrix.pytorch-version }} == "1.5.1" ]; then + python -m pip install torch==1.5.1 torchvision==0.6.1 + elif [ ${{ matrix.pytorch-version }} == "1.6.0" ]; then + python -m pip install torch==1.6.0 torchvision==0.7.0 + elif [ ${{ matrix.pytorch-version }} == "1.7.1" ]; then + python -m pip install torch==1.7.1 torchvision==0.8.2 + elif [ ${{ matrix.pytorch-version }} == "1.8.1" ]; then + python -m pip install torch==1.8.1 torchvision==0.9.1 + fi + python -m pip install -r requirements-dev.txt + python -m pip list + - name: Run tests report coverage + run: | + export LAUNCH_DELAY=$[ $RANDOM % 16 * 60 ] + echo "Sleep $LAUNCH_DELAY" + sleep $LAUNCH_DELAY + nvidia-smi + export CUDA_VISIBLE_DEVICES=$(python -m tests.utils) + echo $CUDA_VISIBLE_DEVICES + trap 'if pgrep python; then pkill python; fi;' ERR + python -c $'import torch\na,b=torch.zeros(1,device="cuda:0"),torch.zeros(1,device="cuda:1");\nwhile True:print(a,b)' > /dev/null & + python -c "import torch; print(torch.__version__); print('{} of GPUs available'.format(torch.cuda.device_count()))" + python -c 'import torch; print(torch.rand(5, 3, device=torch.device("cuda:0")))' + BUILD_MONAI=1 ./runtests.sh --coverage --unittests # unit tests with coverage report + BUILD_MONAI=1 ./runtests.sh --coverage --net # integration tests with coverage report + coverage xml + if pgrep python; then pkill python; fi + - name: Upload coverage + uses: codecov/codecov-action@v1 + with: + fail_ci_if_error: false + file: ./coverage.xml + cron-pt-image: if: github.repository == 'Project-MONAI/MONAI' strategy: matrix: - container: ["pytorch:21.09"] # 21.02 for backward comp. + container: ["pytorch:21.02", "pytorch:21.09"] # 21.02 for backward comp. container: image: nvcr.io/nvidia/${{ matrix.container }}-py3 # testing with the latest pytorch base image options: "--gpus all" @@ -58,7 +106,7 @@ jobs: if: github.repository == 'Project-MONAI/MONAI' strategy: matrix: - container: ["pytorch:21.09"] # 21.02 for backward comp. + container: ["pytorch:21.02", "pytorch:21.09"] # 21.02 for backward comp. container: image: nvcr.io/nvidia/${{ matrix.container }}-py3 # testing with the latest pytorch base image options: "--gpus all" @@ -154,6 +202,7 @@ jobs: cron-tutorial-notebooks: if: github.repository == 'Project-MONAI/MONAI' + needs: cron-gpu # so that monai itself is verified first container: image: nvcr.io/nvidia/pytorch:21.09-py3 # testing with the latest pytorch base image options: "--gpus all --ipc=host" diff --git a/.github/workflows/pythonapp-gpu.yml b/.github/workflows/pythonapp-gpu.yml index 3f015d1627..8f285af859 100644 --- a/.github/workflows/pythonapp-gpu.yml +++ b/.github/workflows/pythonapp-gpu.yml @@ -6,7 +6,6 @@ on: branches: - main - releasing/* - - test-2109 pull_request: concurrency: @@ -20,7 +19,12 @@ jobs: strategy: matrix: environment: + - "PT16+CUDA110" + - "PT17+CUDA102" + - "PT17+CUDA110" + - "PT18+CUDA102" - "PT19+CUDA114" + - "PT19+CUDA102" include: - environment: PT16+CUDA110 # we explicitly set pytorch to -h to avoid pip install error