From 3aa6a8412b5ee0c2da3fc7d013943cabc0c4d1e9 Mon Sep 17 00:00:00 2001 From: Anton Schwaighofer Date: Tue, 14 Dec 2021 20:41:35 +0000 Subject: [PATCH 1/2] fix --- .idea/InnerEye-DeepLearning.iml | 2 +- InnerEye/ML/deep_learning_config.py | 11 +++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/.idea/InnerEye-DeepLearning.iml b/.idea/InnerEye-DeepLearning.iml index 76e3fef5c..d052cf6ee 100644 --- a/.idea/InnerEye-DeepLearning.iml +++ b/.idea/InnerEye-DeepLearning.iml @@ -3,10 +3,10 @@ - + diff --git a/InnerEye/ML/deep_learning_config.py b/InnerEye/ML/deep_learning_config.py index e4724898a..bb8804b87 100644 --- a/InnerEye/ML/deep_learning_config.py +++ b/InnerEye/ML/deep_learning_config.py @@ -22,6 +22,7 @@ from InnerEye.ML.common import CHECKPOINT_FOLDER, DATASET_CSV_FILE_NAME, \ ModelExecutionMode, VISUALIZATION_FOLDER, \ create_unique_timestamp_id, get_best_checkpoint_path +from health_azure.utils import is_global_rank_zero @unique @@ -135,8 +136,14 @@ def create(project_root: Path, else: logging.info("All results will be written to a subfolder of the project root folder.") root = project_root.absolute() / DEFAULT_AML_UPLOAD_DIR - timestamp = create_unique_timestamp_id() - run_folder = root / f"{timestamp}_{model_name}" + if is_global_rank_zero(): + timestamp = create_unique_timestamp_id() + run_folder = root / f"{timestamp}_{model_name}" + else: + # Handle the case where there are multiple DDP processes on the same machine outside AML. + # Each child process will be started with the current working directory set to be the output + # folder of the rank 0 process. We want all other processes to write to that same folder. 
+ run_folder = Path.cwd().absolute() outputs_folder = run_folder logs_folder = run_folder / DEFAULT_LOGS_DIR_NAME else: From c44411f85d7dd894e73c6824750570781d44d146 Mon Sep 17 00:00:00 2001 From: Anton Schwaighofer Date: Tue, 14 Dec 2021 20:43:54 +0000 Subject: [PATCH 2/2] changelog --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 8a709138c..b8ad69e91 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -95,6 +95,7 @@ in inference-only runs when using lightning containers. - ([#553](https://github.com/microsoft/InnerEye-DeepLearning/pull/553)) Fix incomplete test data module setup in Lightning inference. - ([#557](https://github.com/microsoft/InnerEye-DeepLearning/pull/557)) Fix issue where learning rate was not set correctly in the SimCLR module +- ([#622](https://github.com/microsoft/InnerEye-DeepLearning/pull/622)) Fix issue with multi-GPU jobs on a VM: each process tries to create a folder structure - ([#558](https://github.com/microsoft/InnerEye-DeepLearning/pull/558)) Fix issue with the CovidModel config where model weights from a finetuning run were incompatible with the model architecture created for non-finetuning runs. - ([#604](https://github.com/microsoft/InnerEye-DeepLearning/pull/604)) Fix issue where runs on a VM would download the dataset even when a local dataset is provided.