From 80bcc70ab027e7fbbb558483b3e34519d9da4ac2 Mon Sep 17 00:00:00 2001 From: Anton Schwaighofer Date: Wed, 26 Jan 2022 15:20:11 +0000 Subject: [PATCH 01/16] workaround --- .../datamodules_and_datasets/datamodules.py | 29 ++++++++++++++----- InnerEye/ML/SSL/encoders.py | 7 +++-- .../SSL/lightning_containers/ssl_container.py | 2 +- InnerEye/ML/model_training.py | 15 ++++++++-- Tests/SSL/test_ssl_containers.py | 7 +++-- 5 files changed, 44 insertions(+), 16 deletions(-) diff --git a/InnerEye/ML/SSL/datamodules_and_datasets/datamodules.py b/InnerEye/ML/SSL/datamodules_and_datasets/datamodules.py index c399b1b4b..4ec1aabe2 100644 --- a/InnerEye/ML/SSL/datamodules_and_datasets/datamodules.py +++ b/InnerEye/ML/SSL/datamodules_and_datasets/datamodules.py @@ -5,14 +5,14 @@ import logging import os -from typing import Any, Callable, Optional, Sized, Union +from typing import Any, Callable, Optional, Sized, Union, Dict import numpy as np import torch from pl_bolts.datamodules.vision_datamodule import VisionDataModule from pytorch_lightning import LightningDataModule from pytorch_lightning.trainer.supporters import CombinedLoader -from torch.utils.data import Dataset +from torch.utils.data import Dataset, DataLoader from InnerEye.ML.SSL.utils import SSLDataModuleType @@ -122,16 +122,27 @@ def __init__(self, if use_balanced_loss_linear_head: self.class_weights = self.linear_head_module.compute_class_weights() self.batch_size = self.encoder_module.batch_size + self.train_loader_cycle_mode: Optional[str] = None + self._is_prepared = False def prepare_data(self, *args: Any, **kwargs: Any) -> None: """ Saves files to data_dir """ + if self._is_prepared: + return self.encoder_module.prepare_data() self.linear_head_module.prepare_data() - logging.info(f"Length of encoder train dataloader {len(self.encoder_module.train_dataloader())}") - logging.info(f"Length of linear head train dataloader {len(self.linear_head_module.train_dataloader())}") + len_encoder_train = len(self.encoder_module.train_dataloader()) + len_linear_head_train = len(self.linear_head_module.train_dataloader()) + logging.info(f"Length of encoder train dataloader {len_encoder_train}") + logging.info(f"Length of linear head train dataloader {len_linear_head_train}") logging.info(f"Length of total train dataloader {len(self.train_dataloader())}") + self.train_loader_cycle_mode = self._cycle_mode(len_encoder_train, len_linear_head_train) + self._is_prepared = True + + def _cycle_mode(self, len_encoder: int, len_linear_head: int) -> str: + return "max_size_cycle" if len_encoder > len_linear_head else "min_size" def get_combined_loader(self, encoder_loader: Sized, linear_head_loader: Sized) -> CombinedLoader: """ @@ -140,19 +151,21 @@ def get_combined_loader(self, encoder_loader: Sized, linear_head_loader: Sized) :param encoder_loader: The dataloader to use for the SSL encoder. :param linear_head_loader: The dataloader to use for the linear head. """ - mode = "max_size_cycle" if len(encoder_loader) > len(linear_head_loader) else "min_size" + mode = self._cycle_mode(len(encoder_loader), len(linear_head_loader)) dataloaders = { SSLDataModuleType.ENCODER: encoder_loader, SSLDataModuleType.LINEAR_HEAD: linear_head_loader } return CombinedLoader(dataloaders, mode=mode) - def train_dataloader(self, *args: Any, **kwargs: Any) -> CombinedLoader: # type: ignore + def train_dataloader(self, *args: Any, **kwargs: Any) -> Dict[str, DataLoader]: # type: ignore """ The train dataloaders """ - return self.get_combined_loader(encoder_loader=self.encoder_module.train_dataloader(), - linear_head_loader=self.linear_head_module.train_dataloader()) + return { + SSLDataModuleType.ENCODER: self.encoder_module.train_dataloader(), + SSLDataModuleType.LINEAR_HEAD: self.linear_head_module.train_dataloader() + } def val_dataloader(self, *args: Any, **kwargs: Any) -> CombinedLoader: # type: ignore """ diff --git a/InnerEye/ML/SSL/encoders.py b/InnerEye/ML/SSL/encoders.py index ab308e7fa..db0c6bd87 100644 --- a/InnerEye/ML/SSL/encoders.py +++ b/InnerEye/ML/SSL/encoders.py @@ -79,9 +79,10 @@ def get_encoder_output_dim( from InnerEye.ML.SSL.lightning_modules.ssl_online_evaluator import ( SSLOnlineEvaluatorInnerEye, ) - - batch = next(iter(dm.train_dataloader())) - batch = batch[SSLDataModuleType.LINEAR_HEAD] if isinstance(batch, dict) else batch # type: ignore + dataloaders = dm.train_dataloader() + dataloader = dataloaders[SSLDataModuleType.LINEAR_HEAD] \ + if isinstance(dataloaders, dict) else dataloaders # type: ignore + batch = next(iter(dataloader)) x, _ = SSLOnlineEvaluatorInnerEye.to_device(batch, device) else: x = torch.rand((1, 3, 256, 256)).to(device) diff --git a/InnerEye/ML/SSL/lightning_containers/ssl_container.py b/InnerEye/ML/SSL/lightning_containers/ssl_container.py index 3257110db..3747c9519 100644 --- a/InnerEye/ML/SSL/lightning_containers/ssl_container.py +++ b/InnerEye/ML/SSL/lightning_containers/ssl_container.py @@ -6,7 +6,7 @@ from dataclasses import dataclass from enum import Enum from pathlib import Path -from typing import Any, List, Optional, Tuple, Union +from typing import Any, Dict, List, Optional, Tuple, Union import param from pytorch_lightning import Callback, LightningModule diff --git a/InnerEye/ML/model_training.py b/InnerEye/ML/model_training.py index f247dcf2d..bd481685a 100644 --- a/InnerEye/ML/model_training.py +++ b/InnerEye/ML/model_training.py @@ -17,6 +17,7 @@ from InnerEye.Azure.azure_util import RUN_CONTEXT, is_offline_run_context from InnerEye.Common.common_util import SUBJECT_METRICS_FILE_NAME, change_working_directory from InnerEye.Common.resource_monitor import ResourceMonitor +from InnerEye.ML.SSL.datamodules_and_datasets.datamodules import CombinedDataModule from InnerEye.ML.common import ARGS_TXT, AUTOSAVE_CHECKPOINT_FILE_NAME, ModelExecutionMode, \ VISUALIZATION_FOLDER from InnerEye.ML.lightning_base import InnerEyeContainer, InnerEyeLightning @@ -57,7 +58,8 @@ def write_args_file(config: Any, outputs_folder: Path) -> None: def create_lightning_trainer(container: LightningContainer, resume_from_checkpoint: Optional[Path] = None, - num_nodes: int = 1) -> \ + num_nodes: int = 1, + multiple_trainloader_mode: str = "max_size_cycle") -> \ Tuple[Trainer, StoringLogger]: """ Creates a Pytorch Lightning Trainer object for the given model configuration. It creates checkpoint handlers @@ -174,6 +176,7 @@ def create_lightning_trainer(container: LightningContainer, detect_anomaly=container.detect_anomaly, profiler=container.pl_profiler, resume_from_checkpoint=str(resume_from_checkpoint) if resume_from_checkpoint else None, + multiple_trainloader_mode=multiple_trainloader_mode, **additional_args) return trainer, storing_logger @@ -230,6 +233,13 @@ def model_train(checkpoint_path: Optional[Path], container.before_training_on_local_rank_zero() container.before_training_on_all_ranks() + # Workaround for a bug in PL 1.5.5: We need to pass the cycle mode for the training data as a trainer argument + # because training data that uses a CombinedLoader is not split correctly in DDP + multiple_trainloader_mode = "max_size_cycle" + if isinstance(data_module, CombinedDataModule): + data_module.prepare_data() + multiple_trainloader_mode = data_module.train_loader_cycle_mode + # Create the trainer object. Backup the environment variables before doing that, in case we need to run a second # training in the unit tests.d old_environ = dict(os.environ) @@ -238,7 +248,8 @@ def model_train(checkpoint_path: Optional[Path], seed_everything(container.get_effective_random_seed()) trainer, storing_logger = create_lightning_trainer(container, checkpoint_path, - num_nodes=num_nodes) + num_nodes=num_nodes, + multiple_trainloader_mode=multiple_trainloader_mode) rank_info = ", ".join(f"{env}: {os.getenv(env)}" for env in [ENV_GLOBAL_RANK, ENV_LOCAL_RANK, ENV_NODE_RANK]) logging.info(f"Environment variables: {rank_info}. trainer.global_rank: {trainer.global_rank}") diff --git a/Tests/SSL/test_ssl_containers.py b/Tests/SSL/test_ssl_containers.py index b158f7bed..52cf29084 100644 --- a/Tests/SSL/test_ssl_containers.py +++ b/Tests/SSL/test_ssl_containers.py @@ -615,8 +615,11 @@ def test_simclr_dataset_length(test_output_dirs: OutputFolderForTests, model = container.create_model() expected_num_train_iters = (num_encoder_images * 0.9) // encoder_batch_size assert model.train_iters_per_epoch == expected_num_train_iters - train_loaders = container.get_data_module().train_dataloader() - assert isinstance(train_loaders, CombinedLoader) + data_module = container.get_data_module() + data_module.prepare_data() + train_loaders_dict = data_module.train_dataloader() + assert isinstance(train_loaders_dict, dict) + train_loaders = CombinedLoader(train_loaders_dict, mode=data_module.train_loader_cycle_mode) assert len(train_loaders) == expected_num_train_iters expected_num_val_iters = (num_encoder_images * 0.1) // encoder_batch_size val_loaders = container.get_data_module().val_dataloader() From 66df3747232699047ace0151c0cbf90f5c63c2c2 Mon Sep 17 00:00:00 2001 From: Anton Schwaighofer Date: Wed, 26 Jan 2022 20:38:50 +0000 Subject: [PATCH 02/16] next as method --- InnerEye/ML/SSL/encoders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/InnerEye/ML/SSL/encoders.py b/InnerEye/ML/SSL/encoders.py index db0c6bd87..7718d9969 100644 --- a/InnerEye/ML/SSL/encoders.py +++ b/InnerEye/ML/SSL/encoders.py @@ -82,7 +82,7 @@ def get_encoder_output_dim( dataloaders = dm.train_dataloader() dataloader = dataloaders[SSLDataModuleType.LINEAR_HEAD] \ if isinstance(dataloaders, dict) else dataloaders # type: ignore - batch = next(iter(dataloader)) + batch = iter(dataloader).next() x, _ = SSLOnlineEvaluatorInnerEye.to_device(batch, device) else: x = torch.rand((1, 3, 256, 256)).to(device) From f07a7f075eb1d55466a6efd04cc5e09b5b3b7ad6 Mon Sep 17 00:00:00 2001 From: Anton Schwaighofer Date: Wed, 26 Jan 2022 22:07:15 +0000 Subject: [PATCH 03/16] max size both --- InnerEye/ML/SSL/datamodules_and_datasets/datamodules.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/InnerEye/ML/SSL/datamodules_and_datasets/datamodules.py b/InnerEye/ML/SSL/datamodules_and_datasets/datamodules.py index 4ec1aabe2..88d3e4d19 100644 --- a/InnerEye/ML/SSL/datamodules_and_datasets/datamodules.py +++ b/InnerEye/ML/SSL/datamodules_and_datasets/datamodules.py @@ -142,7 +142,7 @@ def prepare_data(self, *args: Any, **kwargs: Any) -> None: self._is_prepared = True def _cycle_mode(self, len_encoder: int, len_linear_head: int) -> str: - return "max_size_cycle" if len_encoder > len_linear_head else "min_size" + return "max_size_cycle" # if len_encoder > len_linear_head else "min_size" def get_combined_loader(self, encoder_loader: Sized, linear_head_loader: Sized) -> CombinedLoader: """ From b3256361b8075823951ac23fcb6868d67da6ca52 Mon Sep 17 00:00:00 2001 From: Anton Schwaighofer Date: Wed, 26 Jan 2022 22:09:06 +0000 Subject: [PATCH 04/16] val_minsize_train_maxsize --- InnerEye/ML/SSL/datamodules_and_datasets/datamodules.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/InnerEye/ML/SSL/datamodules_and_datasets/datamodules.py b/InnerEye/ML/SSL/datamodules_and_datasets/datamodules.py index 88d3e4d19..abe265045 100644 --- a/InnerEye/ML/SSL/datamodules_and_datasets/datamodules.py +++ b/InnerEye/ML/SSL/datamodules_and_datasets/datamodules.py @@ -156,7 +156,7 @@ def get_combined_loader(self, encoder_loader: Sized, linear_head_loader: Sized) SSLDataModuleType.ENCODER: encoder_loader, SSLDataModuleType.LINEAR_HEAD: linear_head_loader } - return CombinedLoader(dataloaders, mode=mode) + return CombinedLoader(dataloaders, mode="min_size") def train_dataloader(self, *args: Any, **kwargs: Any) -> Dict[str, DataLoader]: # type: ignore """ From 97df1d98faae3f5e384bfaa19d51b1bc43b71a9c Mon Sep 17 00:00:00 2001 From: Anton Schwaighofer Date: Wed, 26 Jan 2022 22:19:24 +0000 Subject: [PATCH 05/16] val_maxsize_train_minsize --- InnerEye/ML/SSL/datamodules_and_datasets/datamodules.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/InnerEye/ML/SSL/datamodules_and_datasets/datamodules.py b/InnerEye/ML/SSL/datamodules_and_datasets/datamodules.py index abe265045..9860c0c9a 100644 --- a/InnerEye/ML/SSL/datamodules_and_datasets/datamodules.py +++ b/InnerEye/ML/SSL/datamodules_and_datasets/datamodules.py @@ -142,7 +142,7 @@ def prepare_data(self, *args: Any, **kwargs: Any) -> None: self._is_prepared = True def _cycle_mode(self, len_encoder: int, len_linear_head: int) -> str: - return "max_size_cycle" # if len_encoder > len_linear_head else "min_size" + return "min_size" # if len_encoder > len_linear_head else "min_size" def get_combined_loader(self, encoder_loader: Sized, linear_head_loader: Sized) -> CombinedLoader: """ @@ -156,7 +156,7 @@ def get_combined_loader(self, encoder_loader: Sized, linear_head_loader: Sized) SSLDataModuleType.ENCODER: encoder_loader, SSLDataModuleType.LINEAR_HEAD: linear_head_loader } - return CombinedLoader(dataloaders, mode="min_size") + return CombinedLoader(dataloaders, mode="max_size_cycle") def train_dataloader(self, *args: Any, **kwargs: Any) -> Dict[str, DataLoader]: # type: ignore """ From dd2ba015ea2b4501fdc9ceb678490e5f2c1766fb Mon Sep 17 00:00:00 2001 From: Anton Schwaighofer Date: Thu, 27 Jan 2022 10:07:33 +0000 Subject: [PATCH 06/16] back to next() --- InnerEye/ML/SSL/encoders.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/InnerEye/ML/SSL/encoders.py b/InnerEye/ML/SSL/encoders.py index 7718d9969..db0c6bd87 100644 --- a/InnerEye/ML/SSL/encoders.py +++ b/InnerEye/ML/SSL/encoders.py @@ -82,7 +82,7 @@ def get_encoder_output_dim( dataloaders = dm.train_dataloader() dataloader = dataloaders[SSLDataModuleType.LINEAR_HEAD] \ if isinstance(dataloaders, dict) else dataloaders # type: ignore - batch = iter(dataloader).next() + batch = next(iter(dataloader)) x, _ = SSLOnlineEvaluatorInnerEye.to_device(batch, device) else: x = torch.rand((1, 3, 256, 256)).to(device) From ab09c22773872d5f1ba5a79bbd5d509286becf4e Mon Sep 17 00:00:00 2001 From: Anton Schwaighofer Date: Thu, 27 Jan 2022 10:28:30 +0000 Subject: [PATCH 07/16] metrics update --- Tests/SSL/test_ssl_containers.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Tests/SSL/test_ssl_containers.py b/Tests/SSL/test_ssl_containers.py index 52cf29084..7879a3886 100644 --- a/Tests/SSL/test_ssl_containers.py +++ b/Tests/SSL/test_ssl_containers.py @@ -130,8 +130,8 @@ def test_innereye_ssl_container_cifar10_resnet_simclr() -> None: # Check the metrics that were recorded during training # Note: It is possible that after the PyTorch 1.10 upgrade, we can't get parity between local runs and runs on # the hosted build agents. If that suspicion is confirmed, we need to add branching for local and cloud results. - expected_metrics = {'simclr/val/loss': 2.8797268867492676, - 'ssl_online_evaluator/val/loss': 2.272602081298828, + expected_metrics = {'simclr/val/loss': 2.8736934661865234, + 'ssl_online_evaluator/val/loss': 2.2684895992279053, 'ssl_online_evaluator/val/AccuracyAtThreshold05': 0.20000000298023224, 'simclr/train/loss': 3.6261773109436035, 'simclr/learning_rate': 0.0, From 8a275df91e604800c355635164103c08d7a5dfee Mon Sep 17 00:00:00 2001 From: Anton Schwaighofer Date: Thu, 27 Jan 2022 10:31:01 +0000 Subject: [PATCH 08/16] changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 68e1707a7..2cf43ff80 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -84,6 +84,7 @@ gets uploaded to AzureML, by skipping all test folders. ### Fixed - ([#606](https://github.com/microsoft/InnerEye-DeepLearning/pull/606)) Bug fix: registered models do not include the hi-ml submodule +- ([#646](https://github.com/microsoft/InnerEye-DeepLearning/pull/646)) Workaround for bug in PL: CombinedLoader cannot be used for training data when using DDP - ([#593](https://github.com/microsoft/InnerEye-DeepLearning/pull/593)) Bug fix for hi-ml 0.1.11 issue (#130): empty mount point is turned into ".", which fails the AML job - ([#587](https://github.com/microsoft/InnerEye-DeepLearning/pull/587)) Bug fix for regression in AzureML's handling of environments: upgrade to hi-ml 0.1.11 - ([#625](https://github.com/microsoft/InnerEye-DeepLearning/pull/625)) updates to PandaDeepMIL to enable the use of a SSL pre-trained checkpoint and updated commit to hi-ml From 880d0d500f723f49ac49d68a7d136458b474e6f0 Mon Sep 17 00:00:00 2001 From: Anton Schwaighofer Date: Thu, 27 Jan 2022 11:02:06 +0000 Subject: [PATCH 09/16] undo all changes for experiments --- InnerEye/ML/SSL/datamodules_and_datasets/datamodules.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/InnerEye/ML/SSL/datamodules_and_datasets/datamodules.py b/InnerEye/ML/SSL/datamodules_and_datasets/datamodules.py index 9860c0c9a..f19f78931 100644 --- a/InnerEye/ML/SSL/datamodules_and_datasets/datamodules.py +++ b/InnerEye/ML/SSL/datamodules_and_datasets/datamodules.py @@ -5,7 +5,7 @@ import logging import os -from typing import Any, Callable, Optional, Sized, Union, Dict +from typing import Any, Callable, Dict, Optional, Sized, Union import numpy as np import torch @@ -138,11 +138,14 @@ def prepare_data(self, *args: Any, **kwargs: Any) -> None: logging.info(f"Length of encoder train dataloader {len_encoder_train}") logging.info(f"Length of linear head train dataloader {len_linear_head_train}") logging.info(f"Length of total train dataloader {len(self.train_dataloader())}") + # Workaround for a bug in PL: We can't use a CombinedLoader for the training data. Instead, + # need to return a dictionary and set a cycle mode flag on the trainer. This flag can only be computed + # once the data is prepared. We read this flag out later before we construct the Trainer object. self.train_loader_cycle_mode = self._cycle_mode(len_encoder_train, len_linear_head_train) self._is_prepared = True def _cycle_mode(self, len_encoder: int, len_linear_head: int) -> str: - return "min_size" # if len_encoder > len_linear_head else "min_size" + return "max_size_cycle" if len_encoder > len_linear_head else "min_size" def get_combined_loader(self, encoder_loader: Sized, linear_head_loader: Sized) -> CombinedLoader: """ @@ -156,7 +159,7 @@ def get_combined_loader(self, encoder_loader: Sized, linear_head_loader: Sized) SSLDataModuleType.ENCODER: encoder_loader, SSLDataModuleType.LINEAR_HEAD: linear_head_loader } - return CombinedLoader(dataloaders, mode="max_size_cycle") + return CombinedLoader(dataloaders, mode=mode) def train_dataloader(self, *args: Any, **kwargs: Any) -> Dict[str, DataLoader]: # type: ignore """ From 5bb57fc4062169aef4b9f85650222f921429fa52 Mon Sep 17 00:00:00 2001 From: Anton Schwaighofer Date: Thu, 27 Jan 2022 11:13:22 +0000 Subject: [PATCH 10/16] mypy flake --- InnerEye/ML/SSL/encoders.py | 7 +++---- InnerEye/ML/SSL/lightning_containers/ssl_container.py | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/InnerEye/ML/SSL/encoders.py b/InnerEye/ML/SSL/encoders.py index db0c6bd87..6455db840 100644 --- a/InnerEye/ML/SSL/encoders.py +++ b/InnerEye/ML/SSL/encoders.py @@ -79,10 +79,9 @@ def get_encoder_output_dim( from InnerEye.ML.SSL.lightning_modules.ssl_online_evaluator import ( SSLOnlineEvaluatorInnerEye, ) - dataloaders = dm.train_dataloader() - dataloader = dataloaders[SSLDataModuleType.LINEAR_HEAD] \ - if isinstance(dataloaders, dict) else dataloaders # type: ignore - batch = next(iter(dataloader)) + loaders = dm.train_dataloader() + loader = loaders[SSLDataModuleType.LINEAR_HEAD] if isinstance(loaders, dict) else loaders # type: ignore + batch = next(iter(loader)) x, _ = SSLOnlineEvaluatorInnerEye.to_device(batch, device) else: x = torch.rand((1, 3, 256, 256)).to(device) diff --git a/InnerEye/ML/SSL/lightning_containers/ssl_container.py b/InnerEye/ML/SSL/lightning_containers/ssl_container.py index 3747c9519..3257110db 100644 --- a/InnerEye/ML/SSL/lightning_containers/ssl_container.py +++ b/InnerEye/ML/SSL/lightning_containers/ssl_container.py @@ -6,7 +6,7 @@ from dataclasses import dataclass from enum import Enum from pathlib import Path -from typing import Any, Dict, List, Optional, Tuple, Union +from typing import Any, List, Optional, Tuple, Union import param from pytorch_lightning import Callback, LightningModule From 5d65235417a7b71fb3cb7eaf023d0eac030564e7 Mon Sep 17 00:00:00 2001 From: Anton Schwaighofer Date: Thu, 27 Jan 2022 11:16:49 +0000 Subject: [PATCH 11/16] mypy flake --- InnerEye/ML/SSL/datamodules_and_datasets/datamodules.py | 2 +- InnerEye/ML/model_training.py | 1 + Tests/SSL/test_ssl_containers.py | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/InnerEye/ML/SSL/datamodules_and_datasets/datamodules.py b/InnerEye/ML/SSL/datamodules_and_datasets/datamodules.py index f19f78931..b96d03bf2 100644 --- a/InnerEye/ML/SSL/datamodules_and_datasets/datamodules.py +++ b/InnerEye/ML/SSL/datamodules_and_datasets/datamodules.py @@ -161,7 +161,7 @@ def get_combined_loader(self, encoder_loader: Sized, linear_head_loader: Sized) } return CombinedLoader(dataloaders, mode=mode) - def train_dataloader(self, *args: Any, **kwargs: Any) -> Dict[str, DataLoader]: # type: ignore + def train_dataloader(self, *args: Any, **kwargs: Any) -> Dict[SSLDataModuleType, DataLoader]: # type: ignore """ The train dataloaders """ diff --git a/InnerEye/ML/model_training.py b/InnerEye/ML/model_training.py index bd481685a..89aab6ea8 100644 --- a/InnerEye/ML/model_training.py +++ b/InnerEye/ML/model_training.py @@ -238,6 +238,7 @@ def model_train(checkpoint_path: Optional[Path], multiple_trainloader_mode = "max_size_cycle" if isinstance(data_module, CombinedDataModule): data_module.prepare_data() + assert data_module.train_loader_cycle_mode is not None, "This field should be computed during prepare_data" multiple_trainloader_mode = data_module.train_loader_cycle_mode # Create the trainer object. Backup the environment variables before doing that, in case we need to run a second diff --git a/Tests/SSL/test_ssl_containers.py b/Tests/SSL/test_ssl_containers.py index 7879a3886..95968be36 100644 --- a/Tests/SSL/test_ssl_containers.py +++ b/Tests/SSL/test_ssl_containers.py @@ -619,6 +619,7 @@ def test_simclr_dataset_length(test_output_dirs: OutputFolderForTests, data_module.prepare_data() train_loaders_dict = data_module.train_dataloader() assert isinstance(train_loaders_dict, dict) + assert data_module.train_loader_cycle_mode train_loaders = CombinedLoader(train_loaders_dict, mode=data_module.train_loader_cycle_mode) assert len(train_loaders) == expected_num_train_iters expected_num_val_iters = (num_encoder_images * 0.1) // encoder_batch_size From 802674fbb668106816e8697d65aed54703ee4c93 Mon Sep 17 00:00:00 2001 From: Anton Schwaighofer Date: Thu, 27 Jan 2022 14:02:08 +0000 Subject: [PATCH 12/16] reduce default number of workers clean up iterator --- InnerEye/ML/SSL/encoders.py | 5 ++++- InnerEye/ML/SSL/lightning_containers/ssl_container.py | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/InnerEye/ML/SSL/encoders.py b/InnerEye/ML/SSL/encoders.py index 6455db840..b169a9d2b 100644 --- a/InnerEye/ML/SSL/encoders.py +++ b/InnerEye/ML/SSL/encoders.py @@ -81,7 +81,10 @@ def get_encoder_output_dim( ) loaders = dm.train_dataloader() loader = loaders[SSLDataModuleType.LINEAR_HEAD] if isinstance(loaders, dict) else loaders # type: ignore - batch = next(iter(loader)) + iterator = iter(loader) + batch = next(iterator) + # Dataloaders are often multi-process, we don't want to leave any process running for nothing + del iterator x, _ = SSLOnlineEvaluatorInnerEye.to_device(batch, device) else: x = torch.rand((1, 3, 256, 256)).to(device) diff --git a/InnerEye/ML/SSL/lightning_containers/ssl_container.py b/InnerEye/ML/SSL/lightning_containers/ssl_container.py index 3257110db..e2b21a646 100644 --- a/InnerEye/ML/SSL/lightning_containers/ssl_container.py +++ b/InnerEye/ML/SSL/lightning_containers/ssl_container.py @@ -83,7 +83,7 @@ class SSLContainer(LightningContainer): use_balanced_binary_loss_for_linear_head = param.Boolean(default=False, doc="Whether to use a balanced loss for the training of " "the linear head") - num_workers = param.Integer(default=6, doc="Number of workers to use for dataloader processes.") + num_workers = param.Integer(default=4, doc="Number of workers to use for dataloader processes.") is_debug_model = param.Boolean(default=False, doc="If True, the training will be restricted to 1 batch per epoch." "Used for debugging and tests.") From 35dfa9357196feeb8feeb78952042073ba5312a3 Mon Sep 17 00:00:00 2001 From: Anton Schwaighofer Date: Thu, 27 Jan 2022 22:31:29 +0000 Subject: [PATCH 13/16] undo del iter --- InnerEye/ML/SSL/encoders.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/InnerEye/ML/SSL/encoders.py b/InnerEye/ML/SSL/encoders.py index b169a9d2b..9a83836de 100644 --- a/InnerEye/ML/SSL/encoders.py +++ b/InnerEye/ML/SSL/encoders.py @@ -83,8 +83,6 @@ def get_encoder_output_dim( loader = loaders[SSLDataModuleType.LINEAR_HEAD] if isinstance(loaders, dict) else loaders # type: ignore iterator = iter(loader) batch = next(iterator) - # Dataloaders are often multi-process, we don't want to leave any process running for nothing - del iterator x, _ = SSLOnlineEvaluatorInnerEye.to_device(batch, device) else: x = torch.rand((1, 3, 256, 256)).to(device) From 14f0cb704689cb852e10dcbd6505b3d0cac22847 Mon Sep 17 00:00:00 2001 From: Anton Schwaighofer Date: Fri, 28 Jan 2022 08:54:00 +0000 Subject: [PATCH 14/16] fix test --- Tests/SSL/test_data_modules.py | 3 --- 1 file changed, 3 deletions(-) diff --git a/Tests/SSL/test_data_modules.py b/Tests/SSL/test_data_modules.py index 2fa750f93..85f165b0f 100644 --- a/Tests/SSL/test_data_modules.py +++ b/Tests/SSL/test_data_modules.py @@ -216,9 +216,6 @@ def test_combined_data_module() -> None: assert torch.isclose(combined_loader.class_weights, torch.tensor([0.21, 0.79], dtype=torch.float32), atol=1e-3).all() - train_dataloader = combined_loader.train_dataloader() - assert isinstance(train_dataloader, CombinedLoader) - indices_classifier_module_short = [] val_dataloader = combined_loader.val_dataloader() assert isinstance(val_dataloader, CombinedLoader) From 38f73e4e71737687dd68ab62a0ccb38de524a4d3 Mon Sep 17 00:00:00 2001 From: Anton Schwaighofer Date: Fri, 28 Jan 2022 11:27:53 +0000 Subject: [PATCH 15/16] Fix for PR build hanging --- environment.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/environment.yml b/environment.yml index 374f17b3c..a845d2e1e 100644 --- a/environment.yml +++ b/environment.yml @@ -69,6 +69,8 @@ dependencies: - tabulate==0.8.7 - tensorboard==2.3.0 - tensorboardX==2.1 + # Version 0.13 causes PR builds to hang + - terminado==0.12.1 - torchmetrics==0.6.0 - umap-learn==0.5.2 - yacs==0.1.8 From ba5c6b49edf1d0129a07c31c87abd8e5a781068a Mon Sep 17 00:00:00 2001 From: Anton Schwaighofer Date: Fri, 28 Jan 2022 12:06:43 +0000 Subject: [PATCH 16/16] Switching to Ubuntu 20 --- azure-pipelines/build-pr.yml | 2 +- environment.yml | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/azure-pipelines/build-pr.yml b/azure-pipelines/build-pr.yml index 04b85fee6..da6454f42 100644 --- a/azure-pipelines/build-pr.yml +++ b/azure-pipelines/build-pr.yml @@ -31,7 +31,7 @@ jobs: - job: Linux pool: - vmImage: 'ubuntu-18.04' + vmImage: 'ubuntu-20.04' steps: - template: build.yaml diff --git a/environment.yml b/environment.yml index a845d2e1e..374f17b3c 100644 --- a/environment.yml +++ b/environment.yml @@ -69,8 +69,6 @@ dependencies: - tabulate==0.8.7 - tensorboard==2.3.0 - tensorboardX==2.1 - # Version 0.13 causes PR builds to hang - - terminado==0.12.1 - torchmetrics==0.6.0 - umap-learn==0.5.2 - yacs==0.1.8