From e4c4dcb27e34ff418cfb4f0fd2bed5258ac13b69 Mon Sep 17 00:00:00 2001 From: Shruthi42 <13177030+Shruthi42@users.noreply.github.com> Date: Mon, 24 May 2021 13:37:09 +0100 Subject: [PATCH 1/7] Ensure no overlap between subjects in splits --- .../classification/CovidHierarchicalModel.py | 38 ++++++++++++++----- 1 file changed, 28 insertions(+), 10 deletions(-) diff --git a/InnerEye/ML/configs/classification/CovidHierarchicalModel.py b/InnerEye/ML/configs/classification/CovidHierarchicalModel.py index b6a176742..a5008a5d3 100644 --- a/InnerEye/ML/configs/classification/CovidHierarchicalModel.py +++ b/InnerEye/ML/configs/classification/CovidHierarchicalModel.py @@ -62,7 +62,6 @@ class CovidHierarchicalModel(ScalarModelBase): "is assumed to contain unique ids.") def __init__(self, covid_dataset_id: str = COVID_DATASET_ID, **kwargs: Any): - learning_rate = 1e-5 if self.use_pretrained_model else 1e-4 super().__init__(target_names=['CVX03vs12', 'CVX0vs3', 'CVX1vs2'], loss_type=ScalarLoss.CustomClassification, class_names=['CVX0', 'CVX1', 'CVX2', 'CVX3'], @@ -81,10 +80,16 @@ def __init__(self, covid_dataset_id: str = COVID_DATASET_ID, **kwargs: Any): num_epochs=50, l_rate_scheduler=LRSchedulerType.Step, l_rate_step_gamma=1.0, - l_rate=learning_rate, l_rate_multi_step_milestones=None, **kwargs) self.num_classes = 3 + + @property + def l_rate(self) -> float: + return 1e-5 if self.use_pretrained_model else 1e-4 + + def validate(self) -> None: + super().validate() if not self.use_pretrained_model and self.freeze_encoder: raise ValueError("No encoder to freeze when training from scratch. You requested training from scratch and" "encoder freezing.") @@ -94,15 +99,28 @@ def should_generate_multilabel_report(self) -> bool: def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> DatasetSplits: if self.test_set_ids_csv: - test_df = pd.read_csv(self.local_dataset / self.test_set_ids_csv) - in_test_set = dataset_df.series.isin(test_df.series) - train_ids = dataset_df.series[~in_test_set].values - test_ids = dataset_df.series[in_test_set].values + test_set_ids_csv = self.local_dataset / self.test_set_ids_csv + test_series = pd.read_csv(test_set_ids_csv).series + + all_series = dataset_df.series.values + check_all_test_series = all([series in all_series for series in test_series]) + if not check_all_test_series: + raise ValueError(f"Not all test series from {test_set_ids_csv} were found in the dataset.") + + test_set_subjects = dataset_df[dataset_df.series.isin(test_series)].subject.values + train_and_val_series = dataset_df[~dataset_df.subject.isin(test_set_subjects)].series.values num_val_samples = 400 - val_ids = train_ids[:num_val_samples] - train_ids = train_ids[num_val_samples:] - return DatasetSplits.from_subject_ids(dataset_df, train_ids=train_ids, val_ids=val_ids, test_ids=test_ids, - subject_column="series", group_column="subject") + val_series = train_and_val_series[:num_val_samples] + train_series = train_and_val_series[num_val_samples:] + + logging.info(f"Dropped {len(all_series) - (len(test_series) + len(train_and_val_series))} series " + f"due to subject overlap with test set.") + return DatasetSplits.from_subject_ids(dataset_df, + train_ids=train_series, + val_ids=val_series, + test_ids=test_series, + subject_column="series", + group_column="subject") else: return DatasetSplits.from_proportions(dataset_df, proportion_train=0.8, From 712d41b57c424db336eee0c84e29798cb139488a Mon Sep 17 00:00:00 2001 From: Shruthi42 <13177030+Shruthi42@users.noreply.github.com> Date: Fri, 28 May 2021 09:58:33 +0100 Subject: [PATCH 2/7] mypy --- InnerEye/ML/configs/classification/CovidHierarchicalModel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/InnerEye/ML/configs/classification/CovidHierarchicalModel.py b/InnerEye/ML/configs/classification/CovidHierarchicalModel.py index a5008a5d3..511d7f265 100644 --- a/InnerEye/ML/configs/classification/CovidHierarchicalModel.py +++ b/InnerEye/ML/configs/classification/CovidHierarchicalModel.py @@ -85,7 +85,7 @@ def __init__(self, covid_dataset_id: str = COVID_DATASET_ID, **kwargs: Any): self.num_classes = 3 @property - def l_rate(self) -> float: + def l_rate(self) -> float: # type: ignore return 1e-5 if self.use_pretrained_model else 1e-4 def validate(self) -> None: From 3e39b793fa5f72a4e39a741f8796b59eda8a5e8b Mon Sep 17 00:00:00 2001 From: Shruthi42 <13177030+Shruthi42@users.noreply.github.com> Date: Fri, 28 May 2021 10:49:18 +0100 Subject: [PATCH 3/7] Refactor --- InnerEye/ML/configs/classification/CovidHierarchicalModel.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/InnerEye/ML/configs/classification/CovidHierarchicalModel.py b/InnerEye/ML/configs/classification/CovidHierarchicalModel.py index 511d7f265..1a0cd02d6 100644 --- a/InnerEye/ML/configs/classification/CovidHierarchicalModel.py +++ b/InnerEye/ML/configs/classification/CovidHierarchicalModel.py @@ -84,11 +84,8 @@ def __init__(self, covid_dataset_id: str = COVID_DATASET_ID, **kwargs: Any): **kwargs) self.num_classes = 3 - @property - def l_rate(self) -> float: # type: ignore - return 1e-5 if self.use_pretrained_model else 1e-4 - def validate(self) -> None: + self.l_rate = 1e-5 if self.use_pretrained_model else 1e-4 super().validate() if not self.use_pretrained_model and self.freeze_encoder: raise ValueError("No encoder to freeze when training from scratch. You requested training from scratch and" From 42aed0c20a56d080fef93660244fdca7249e496d Mon Sep 17 00:00:00 2001 From: Shruthi42 <13177030+Shruthi42@users.noreply.github.com> Date: Fri, 4 Jun 2021 09:26:01 +0100 Subject: [PATCH 4/7] Address PR comments --- InnerEye/ML/configs/classification/CovidHierarchicalModel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/InnerEye/ML/configs/classification/CovidHierarchicalModel.py b/InnerEye/ML/configs/classification/CovidHierarchicalModel.py index 1a0cd02d6..5554ced06 100644 --- a/InnerEye/ML/configs/classification/CovidHierarchicalModel.py +++ b/InnerEye/ML/configs/classification/CovidHierarchicalModel.py @@ -100,7 +100,7 @@ def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> Datas test_series = pd.read_csv(test_set_ids_csv).series all_series = dataset_df.series.values - check_all_test_series = all([series in all_series for series in test_series]) + check_all_test_series = all(test_series.isin(all_series)) if not check_all_test_series: raise ValueError(f"Not all test series from {test_set_ids_csv} were found in the dataset.") From c4ba2a4e248d258094c2248193bd6d77664df9a3 Mon Sep 17 00:00:00 2001 From: Shruthi42 <13177030+Shruthi42@users.noreply.github.com> Date: Fri, 4 Jun 2021 10:13:48 +0100 Subject: [PATCH 5/7] Address PR comments --- .../ML/configs/classification/CovidHierarchicalModel.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/InnerEye/ML/configs/classification/CovidHierarchicalModel.py b/InnerEye/ML/configs/classification/CovidHierarchicalModel.py index 5554ced06..c891fcf6b 100644 --- a/InnerEye/ML/configs/classification/CovidHierarchicalModel.py +++ b/InnerEye/ML/configs/classification/CovidHierarchicalModel.py @@ -1,6 +1,8 @@ import codecs import logging import pickle +import random +import math from pathlib import Path from typing import Any, Callable @@ -106,7 +108,9 @@ def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> Datas test_set_subjects = dataset_df[dataset_df.series.isin(test_series)].subject.values train_and_val_series = dataset_df[~dataset_df.subject.isin(test_set_subjects)].series.values - num_val_samples = 400 + random.seed(42) + random.shuffle(train_and_val_series) + num_val_samples = math.floor(0.1*len(train_and_val_series)) val_series = train_and_val_series[:num_val_samples] train_series = train_and_val_series[num_val_samples:] From 27da1bba6ea6faf0e50a3eeea516d1de0a527784 Mon Sep 17 00:00:00 2001 From: Shruthi42 <13177030+Shruthi42@users.noreply.github.com> Date: Mon, 7 Jun 2021 09:49:48 +0100 Subject: [PATCH 6/7] Address PR comments --- InnerEye/ML/configs/classification/CovidHierarchicalModel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/InnerEye/ML/configs/classification/CovidHierarchicalModel.py b/InnerEye/ML/configs/classification/CovidHierarchicalModel.py index f541a5d3a..b038a14ea 100644 --- a/InnerEye/ML/configs/classification/CovidHierarchicalModel.py +++ b/InnerEye/ML/configs/classification/CovidHierarchicalModel.py @@ -110,7 +110,7 @@ def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> Datas train_and_val_series = dataset_df[~dataset_df.subject.isin(test_set_subjects)].series.values random.seed(42) random.shuffle(train_and_val_series) - num_val_samples = math.floor(0.1*len(train_and_val_series)) + num_val_samples = math.floor(0.11*len(train_and_val_series)) val_series = train_and_val_series[:num_val_samples] train_series = train_and_val_series[num_val_samples:] From 55af7ed670e47e1c6e24e11914f763ef9e2518b7 Mon Sep 17 00:00:00 2001 From: Shruthi42 <13177030+Shruthi42@users.noreply.github.com> Date: Mon, 7 Jun 2021 09:59:36 +0100 Subject: [PATCH 7/7] Address PR comments --- InnerEye/ML/configs/classification/CovidHierarchicalModel.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/InnerEye/ML/configs/classification/CovidHierarchicalModel.py b/InnerEye/ML/configs/classification/CovidHierarchicalModel.py index c9fe9459b..d7b99c343 100644 --- a/InnerEye/ML/configs/classification/CovidHierarchicalModel.py +++ b/InnerEye/ML/configs/classification/CovidHierarchicalModel.py @@ -110,7 +110,7 @@ def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> Datas train_and_val_series = dataset_df[~dataset_df.subject.isin(test_set_subjects)].series.values random.seed(42) random.shuffle(train_and_val_series) - num_val_samples = math.floor(0.11*len(train_and_val_series)) + num_val_samples = math.floor(len(train_and_val_series) / 9) val_series = train_and_val_series[:num_val_samples] train_series = train_and_val_series[num_val_samples:]