Skip to content
This repository was archived by the owner on Mar 21, 2024. It is now read-only.
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ jobs that run in AzureML.

### Fixed

- ([#495](https://github.com/microsoft/InnerEye-DeepLearning/pull/495)) Fix model comparison.
- ([#482](https://github.com/microsoft/InnerEye-DeepLearning/pull/482)) Check bool parameter is either true or false.
- ([#475](https://github.com/microsoft/InnerEye-DeepLearning/pull/475)) Bug in AML SDK meant that we could not train
any large models anymore because data loaders ran out of memory.
Expand Down
2 changes: 0 additions & 2 deletions InnerEye/Common/common_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,6 @@
empty_string_to_none = lambda x: None if (x is None or len(x.strip()) == 0) else x
string_to_path = lambda x: None if (x is None or len(x.strip()) == 0) else Path(x)

# File name pattern that will match anything returned by epoch_folder_name.
EPOCH_FOLDER_NAME_PATTERN = "epoch_[0-9][0-9][0-9]"

SUBJECT_METRICS_FILE_NAME = "metrics.csv"
EPOCH_METRICS_FILE_NAME = "epoch_metrics.csv"
Expand Down
30 changes: 10 additions & 20 deletions InnerEye/ML/baselines_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,8 @@
from InnerEye.Common import common_util
from InnerEye.Common.Statistics import wilcoxon_signed_rank_test
from InnerEye.Common.Statistics.wilcoxon_signed_rank_test import WilcoxonTestConfig
from InnerEye.Common.common_util import BASELINE_WILCOXON_RESULTS_FILE, ENSEMBLE_SPLIT_NAME, \
EPOCH_FOLDER_NAME_PATTERN, FULL_METRICS_DATAFRAME_FILE, ModelProcessing, OTHER_RUNS_SUBDIR_NAME, \
SUBJECT_METRICS_FILE_NAME, remove_file_or_directory
from InnerEye.Common.common_util import BASELINE_WILCOXON_RESULTS_FILE, FULL_METRICS_DATAFRAME_FILE, ModelProcessing, \
SUBJECT_METRICS_FILE_NAME, get_best_epoch_results_path, remove_file_or_directory
from InnerEye.Common.fixed_paths import DEFAULT_AML_UPLOAD_DIR
from InnerEye.ML.common import DATASET_CSV_FILE_NAME, ModelExecutionMode
from InnerEye.ML.config import SegmentationModelBase
Expand Down Expand Up @@ -65,27 +64,18 @@ def compare_scores_against_baselines(model_config: SegmentationModelBase, azure_
the Wilcoxon results file.
"""
# The attribute will only be present for a segmentation model; and it might be None or empty even for that.
comparison_blob_storage_paths = getattr(model_config, 'comparison_blob_storage_paths')
comparison_blob_storage_paths = model_config.comparison_blob_storage_paths
if not comparison_blob_storage_paths:
return
outputs_path = model_config.outputs_folder
if model_proc == ModelProcessing.ENSEMBLE_CREATION:
outputs_path = outputs_path / OTHER_RUNS_SUBDIR_NAME / ENSEMBLE_SPLIT_NAME
model_epoch_paths = sorted(outputs_path.glob(EPOCH_FOLDER_NAME_PATTERN))
if not model_epoch_paths:
logging.warning("Cannot compare scores against baselines: no matches found for "
f"{outputs_path}/{EPOCH_FOLDER_NAME_PATTERN}")
return
# Use the last (highest-numbered) epoch path for the current run.
model_epoch_path = model_epoch_paths[-1]
model_metrics_path = model_epoch_path / ModelExecutionMode.TEST.value / SUBJECT_METRICS_FILE_NAME
model_dataset_path = model_epoch_path / ModelExecutionMode.TEST.value / DATASET_CSV_FILE_NAME
outputs_path = model_config.outputs_folder / get_best_epoch_results_path(ModelExecutionMode.TEST, model_proc)
if not outputs_path.is_dir():
raise FileNotFoundError(f"Cannot compare scores against baselines: no best epoch results found at {outputs_path}")
model_metrics_path = outputs_path / SUBJECT_METRICS_FILE_NAME
model_dataset_path = outputs_path / DATASET_CSV_FILE_NAME
if not model_dataset_path.exists():
logging.warning(f"Not comparing with baselines because no {model_dataset_path} file found for this run")
return
raise FileNotFoundError(f"Not comparing with baselines because no {model_dataset_path} file found for this run")
if not model_metrics_path.exists():
logging.warning(f"Not comparing with baselines because no {model_metrics_path} file found for this run")
return
raise FileNotFoundError(f"Not comparing with baselines because no {model_metrics_path} file found for this run")
model_metrics_df = pd.read_csv(model_metrics_path)
model_dataset_df = pd.read_csv(model_dataset_path)
comparison_result = download_and_compare_scores(outputs_path,
Expand Down
5 changes: 5 additions & 0 deletions InnerEye/ML/configs/segmentation/BasicModel2Epochs.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,13 @@

fg_classes = ["spinalcord", "lung_r", "lung_l"]

default_single_comparison_blob = "refs_pull_483_merge_1624269679_90b1d23c/outputs/best_validation_epoch/Test"


class BasicModel2Epochs(SegmentationModelBase):
def __init__(self, **kwargs: Any) -> None:
comparison_blob_storage_paths = kwargs.pop("comparison_blob_storage_paths",
[("Single", default_single_comparison_blob)])
super().__init__(
should_validate=False,
architecture="Basic",
Expand All @@ -39,6 +43,7 @@ def __init__(self, **kwargs: Any) -> None:
recovery_checkpoint_save_interval=1,
use_mixed_precision=True,
azure_dataset_id=AZURE_DATASET_ID,
comparison_blob_storage_paths=comparison_blob_storage_paths,
dataset_mountpoint="/tmp/innereye",
# Use an LR scheduler with a pronounced and clearly visible decay, to be able to easily see if that
# is applied correctly in run recovery.
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
There were not enough data points for any statistically meaningful comparisons.
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
,Patient,Structure,Dice,HausdorffDistance_mm,MeanDistance_mm,seriesId,institutionId,split,mode
0,5,lung_l,0.0,inf,inf,402ba5d42f37357f18f29af17b0846cbca4c430fea5b15a6baad5ec29dd6c9ba,b7f757fb-12e0-489e-a6da-f64895cdd229,CURRENT,Test
6,5,lung_r,0.0,inf,inf,402ba5d42f37357f18f29af17b0846cbca4c430fea5b15a6baad5ec29dd6c9ba,b7f757fb-12e0-489e-a6da-f64895cdd229,CURRENT,Test
12,5,spinalcord,0.0,inf,inf,402ba5d42f37357f18f29af17b0846cbca4c430fea5b15a6baad5ec29dd6c9ba,b7f757fb-12e0-489e-a6da-f64895cdd229,CURRENT,Test
0,5,lung_l,0.0,inf,inf,402ba5d42f37357f18f29af17b0846cbca4c430fea5b15a6baad5ec29dd6c9ba,b7f757fb-12e0-489e-a6da-f64895cdd229,Single,Test
6,5,lung_r,0.0,inf,inf,402ba5d42f37357f18f29af17b0846cbca4c430fea5b15a6baad5ec29dd6c9ba,b7f757fb-12e0-489e-a6da-f64895cdd229,Single,Test
12,5,spinalcord,0.0,inf,inf,402ba5d42f37357f18f29af17b0846cbca4c430fea5b15a6baad5ec29dd6c9ba,b7f757fb-12e0-489e-a6da-f64895cdd229,Single,Test
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
There were not enough data points for any statistically meaningful comparisons.
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
,Patient,Structure,Dice,HausdorffDistance_mm,MeanDistance_mm,seriesId,institutionId,split,mode
0,4,lung_l,0.0,inf,inf,aee46d8ee00ed5a5b5d763957e58b8d8cd27f36b9722ada79937fead949f8581,b7f757fb-12e0-489e-a6da-f64895cdd229,CURRENT,Test
6,4,lung_r,0.0,inf,inf,aee46d8ee00ed5a5b5d763957e58b8d8cd27f36b9722ada79937fead949f8581,b7f757fb-12e0-489e-a6da-f64895cdd229,CURRENT,Test
12,4,spinalcord,0.0,inf,inf,aee46d8ee00ed5a5b5d763957e58b8d8cd27f36b9722ada79937fead949f8581,b7f757fb-12e0-489e-a6da-f64895cdd229,CURRENT,Test
18,5,lung_l,0.0,inf,inf,402ba5d42f37357f18f29af17b0846cbca4c430fea5b15a6baad5ec29dd6c9ba,b7f757fb-12e0-489e-a6da-f64895cdd229,CURRENT,Test
24,5,lung_r,0.0,inf,inf,402ba5d42f37357f18f29af17b0846cbca4c430fea5b15a6baad5ec29dd6c9ba,b7f757fb-12e0-489e-a6da-f64895cdd229,CURRENT,Test
30,5,spinalcord,0.0,inf,inf,402ba5d42f37357f18f29af17b0846cbca4c430fea5b15a6baad5ec29dd6c9ba,b7f757fb-12e0-489e-a6da-f64895cdd229,CURRENT,Test
36,6,lung_l,0.0,inf,inf,dfa85a2f00323404eac4fd784713e6e2dc980dc9e33fd9f5d3cda7b82c7e6ab2,b7f757fb-12e0-489e-a6da-f64895cdd229,CURRENT,Test
42,6,lung_r,0.0,inf,inf,dfa85a2f00323404eac4fd784713e6e2dc980dc9e33fd9f5d3cda7b82c7e6ab2,b7f757fb-12e0-489e-a6da-f64895cdd229,CURRENT,Test
48,6,spinalcord,0.0,inf,inf,dfa85a2f00323404eac4fd784713e6e2dc980dc9e33fd9f5d3cda7b82c7e6ab2,b7f757fb-12e0-489e-a6da-f64895cdd229,CURRENT,Test
54,7,lung_l,0.0,inf,inf,8e1c5884be4d4d6581242e862641b08ecb8b2a13d71b4a708f35a951db12793c,b7f757fb-12e0-489e-a6da-f64895cdd229,CURRENT,Test
60,7,lung_r,0.0,inf,inf,8e1c5884be4d4d6581242e862641b08ecb8b2a13d71b4a708f35a951db12793c,b7f757fb-12e0-489e-a6da-f64895cdd229,CURRENT,Test
66,7,spinalcord,0.0,inf,inf,8e1c5884be4d4d6581242e862641b08ecb8b2a13d71b4a708f35a951db12793c,b7f757fb-12e0-489e-a6da-f64895cdd229,CURRENT,Test
0,5,lung_l,0.0,inf,inf,402ba5d42f37357f18f29af17b0846cbca4c430fea5b15a6baad5ec29dd6c9ba,b7f757fb-12e0-489e-a6da-f64895cdd229,Single,Test
6,5,lung_r,0.0,inf,inf,402ba5d42f37357f18f29af17b0846cbca4c430fea5b15a6baad5ec29dd6c9ba,b7f757fb-12e0-489e-a6da-f64895cdd229,Single,Test
12,5,spinalcord,0.0,inf,inf,402ba5d42f37357f18f29af17b0846cbca4c430fea5b15a6baad5ec29dd6c9ba,b7f757fb-12e0-489e-a6da-f64895cdd229,Single,Test
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
154 changes: 144 additions & 10 deletions Tests/ML/runners/test_compare_against_baselines.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,41 +5,96 @@

import pandas as pd
import pytest
from typing import List
from unittest import mock

from InnerEye.Common import common_util
from InnerEye.Common.common_util import BEST_EPOCH_FOLDER_NAME
from InnerEye.Common.common_util import BASELINE_WILCOXON_RESULTS_FILE, BEST_EPOCH_FOLDER_NAME, ENSEMBLE_SPLIT_NAME, \
FULL_METRICS_DATAFRAME_FILE, OTHER_RUNS_SUBDIR_NAME, SUBJECT_METRICS_FILE_NAME, ModelProcessing
from InnerEye.Common.fixed_paths import DEFAULT_AML_UPLOAD_DIR
from InnerEye.Common.output_directories import OutputFolderForTests
from InnerEye.ML.baselines_util import ComparisonBaseline, get_comparison_baselines, perform_score_comparisons
from InnerEye.ML.common import ModelExecutionMode
from InnerEye.ML.baselines_util import ComparisonBaseline, compare_scores_against_baselines, get_comparison_baselines, \
perform_score_comparisons
from InnerEye.ML.common import DATASET_CSV_FILE_NAME, ModelExecutionMode
from InnerEye.ML.config import SegmentationModelBase
from Tests.ML.util import get_default_azure_config
from Tests.AfterTraining.test_after_training import get_most_recent_run_id


@pytest.mark.skipif(common_util.is_windows(), reason="Loading tk sometimes fails on Windows")
def test_perform_score_comparisons() -> None:
def create_dataset_df() -> pd.DataFrame:
"""
Create a test dataframe for DATASET_CSV_FILE_NAME.

:return: Test dataframe.
"""
dataset_df = pd.DataFrame()
dataset_df['subject'] = list(range(10))
dataset_df['seriesId'] = [f"s{i}" for i in range(10)]
dataset_df['institutionId'] = ["xyz"] * 10
return dataset_df


def create_metrics_df() -> pd.DataFrame:
"""
Create a test dataframe for SUBJECT_METRICS_FILE_NAME.

:return: Test dataframe.
"""
metrics_df = pd.DataFrame()
metrics_df['Patient'] = list(range(10))
metrics_df['Structure'] = ['appendix'] * 10
metrics_df['Dice'] = [0.5 + i * 0.02 for i in range(10)]
return metrics_df


def create_comparison_metrics_df() -> pd.DataFrame:
"""
Create a test dataframe for comparison metrics.

:return: Test dataframe.
"""
comparison_metrics_df = pd.DataFrame()
comparison_metrics_df['Patient'] = list(range(10))
comparison_metrics_df['Structure'] = ['appendix'] * 10
comparison_metrics_df['Dice'] = [0.51 + i * 0.02 for i in range(10)]
return comparison_metrics_df


def create_comparison_baseline(dataset_df: pd.DataFrame) -> ComparisonBaseline:
"""
Create a test ComparisonBaseline.

:param dataset_df: Dataset dataframe.
:return: New test ComparisonBaseline
"""
comparison_metrics_df = create_comparison_metrics_df()
comparison_name = "DefaultName"
comparison_run_rec_id = "DefaultRunRecId"
baseline = ComparisonBaseline(comparison_name, dataset_df, comparison_metrics_df, comparison_run_rec_id)
return ComparisonBaseline(comparison_name, dataset_df, comparison_metrics_df, comparison_run_rec_id)


def check_wilcoxon_lines(wilcoxon_lines: List[str], baseline: ComparisonBaseline) -> None:
"""
Assert that the wilcoxon lines are as expected.

:param wilcoxon_lines: Lines to test.
:param baseline: Expected comparison baseline.
"""
assert wilcoxon_lines[0] == f"Run 1: {baseline.name}"
assert wilcoxon_lines[1] == "Run 2: CURRENT"
assert wilcoxon_lines[3].find("WORSE") > 0


@pytest.mark.skipif(common_util.is_windows(), reason="Loading tk sometimes fails on Windows")
def test_perform_score_comparisons() -> None:
dataset_df = create_dataset_df()
metrics_df = create_metrics_df()
baseline = create_comparison_baseline(dataset_df)
result = perform_score_comparisons(dataset_df, metrics_df, [baseline])
assert result.did_comparisons
assert len(result.wilcoxon_lines) == 5
assert result.wilcoxon_lines[0] == f"Run 1: {comparison_name}"
assert result.wilcoxon_lines[1] == "Run 2: CURRENT"
assert result.wilcoxon_lines[3].find("WORSE") > 0
assert list(result.plots.keys()) == [f"{comparison_name}_vs_CURRENT"]
check_wilcoxon_lines(result.wilcoxon_lines, baseline)
assert list(result.plots.keys()) == [f"{baseline.name}_vs_CURRENT"]


@pytest.mark.after_training_single_run
Expand All @@ -52,3 +107,82 @@ def test_get_comparison_data(test_output_dirs: OutputFolderForTests) -> None:
azure_config, [(comparison_name, comparison_path)])
assert len(baselines) == 1
assert baselines[0].name == comparison_name


@pytest.mark.parametrize('model_proc', [ModelProcessing.DEFAULT, ModelProcessing.ENSEMBLE_CREATION])
def test_compare_scores_against_baselines_throws(model_proc: ModelProcessing,
test_output_dirs: OutputFolderForTests) -> None:
"""
Test that exceptions are raised if the files necessary for baseline comparison are missing,
then test that when all required files are present that baseline comparison files are written.

:param model_proc: Model processing to test.
:param test_output_dirs: Test output directories.
:return: None.
"""
config = SegmentationModelBase(should_validate=False,
comparison_blob_storage_paths=[
('Single', 'dummy_blob_single/outputs/epoch_120/Test'),
('5fold', 'dummy_blob_ensemble/outputs/epoch_120/Test')])
config.set_output_to(test_output_dirs.root_dir)

azure_config = get_default_azure_config()

# If the BEST_EPOCH_FOLDER_NAME folder is missing, expect an exception to be raised.
with pytest.raises(FileNotFoundError) as ex:
compare_scores_against_baselines(
model_config=config,
azure_config=azure_config, model_proc=model_proc)
assert "Cannot compare scores against baselines: no best epoch results found at" in str(ex)

best_epoch_folder_path = config.outputs_folder
if model_proc == ModelProcessing.ENSEMBLE_CREATION:
best_epoch_folder_path = best_epoch_folder_path / OTHER_RUNS_SUBDIR_NAME / ENSEMBLE_SPLIT_NAME
best_epoch_folder_path = best_epoch_folder_path / BEST_EPOCH_FOLDER_NAME / ModelExecutionMode.TEST.value

best_epoch_folder_path.mkdir(parents=True)

# If the BEST_EPOCH_FOLDER_NAME folder exists but DATASET_CSV_FILE_NAME is missing,
# expect an exception to be raised.
with pytest.raises(FileNotFoundError) as ex:
compare_scores_against_baselines(
model_config=config,
azure_config=azure_config, model_proc=model_proc)
assert "Not comparing with baselines because no " in str(ex)
assert DATASET_CSV_FILE_NAME in str(ex)

model_dataset_path = best_epoch_folder_path / DATASET_CSV_FILE_NAME
dataset_df = create_dataset_df()
dataset_df.to_csv(model_dataset_path)

# If the BEST_EPOCH_FOLDER_NAME folder exists but SUBJECT_METRICS_FILE_NAME is missing,
# expect an exception to be raised.
with pytest.raises(FileNotFoundError) as ex:
compare_scores_against_baselines(
model_config=config,
azure_config=azure_config, model_proc=model_proc)
assert "Not comparing with baselines because no " in str(ex)
assert SUBJECT_METRICS_FILE_NAME in str(ex)

model_metrics_path = best_epoch_folder_path / SUBJECT_METRICS_FILE_NAME
metrics_df = create_metrics_df()
metrics_df.to_csv(model_metrics_path)

baseline = create_comparison_baseline(dataset_df)

# Patch get_comparison_baselines to return the baseline above.
with mock.patch('InnerEye.ML.baselines_util.get_comparison_baselines', return_value=[baseline]):
compare_scores_against_baselines(
model_config=config,
azure_config=azure_config, model_proc=model_proc)

# Check the wilcoxoon results file is present and has expected contents.
wilcoxon_path = best_epoch_folder_path / BASELINE_WILCOXON_RESULTS_FILE
assert wilcoxon_path.is_file()

wilcoxon_lines = [line.strip() for line in wilcoxon_path.read_text().splitlines()]
check_wilcoxon_lines(wilcoxon_lines, baseline)

# Check the full metrics results file is present.
full_metrics_path = best_epoch_folder_path / FULL_METRICS_DATAFRAME_FILE
assert full_metrics_path.is_file()