diff --git a/CHANGELOG.md b/CHANGELOG.md index 921117d48..0b3ea7937 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -21,6 +21,7 @@ jobs that run in AzureML. ### Fixed +- ([#495](https://github.com/microsoft/InnerEye-DeepLearning/pull/495)) Fix model comparison. - ([#482](https://github.com/microsoft/InnerEye-DeepLearning/pull/482)) Check bool parameter is either true or false. - ([#475](https://github.com/microsoft/InnerEye-DeepLearning/pull/475)) Bug in AML SDK meant that we could not train any large models anymore because data loaders ran out of memory. diff --git a/InnerEye/Common/common_util.py b/InnerEye/Common/common_util.py index a727fd698..2bd8a3db2 100644 --- a/InnerEye/Common/common_util.py +++ b/InnerEye/Common/common_util.py @@ -24,8 +24,6 @@ empty_string_to_none = lambda x: None if (x is None or len(x.strip()) == 0) else x string_to_path = lambda x: None if (x is None or len(x.strip()) == 0) else Path(x) -# File name pattern that will match anything returned by epoch_folder_name. -EPOCH_FOLDER_NAME_PATTERN = "epoch_[0-9][0-9][0-9]" SUBJECT_METRICS_FILE_NAME = "metrics.csv" EPOCH_METRICS_FILE_NAME = "epoch_metrics.csv" diff --git a/InnerEye/ML/baselines_util.py b/InnerEye/ML/baselines_util.py index 1d2768826..945c77bf9 100755 --- a/InnerEye/ML/baselines_util.py +++ b/InnerEye/ML/baselines_util.py @@ -20,9 +20,8 @@ from InnerEye.Common import common_util from InnerEye.Common.Statistics import wilcoxon_signed_rank_test from InnerEye.Common.Statistics.wilcoxon_signed_rank_test import WilcoxonTestConfig -from InnerEye.Common.common_util import BASELINE_WILCOXON_RESULTS_FILE, ENSEMBLE_SPLIT_NAME, \ - EPOCH_FOLDER_NAME_PATTERN, FULL_METRICS_DATAFRAME_FILE, ModelProcessing, OTHER_RUNS_SUBDIR_NAME, \ - SUBJECT_METRICS_FILE_NAME, remove_file_or_directory +from InnerEye.Common.common_util import BASELINE_WILCOXON_RESULTS_FILE, FULL_METRICS_DATAFRAME_FILE, ModelProcessing, \ + SUBJECT_METRICS_FILE_NAME, get_best_epoch_results_path, remove_file_or_directory from InnerEye.Common.fixed_paths import DEFAULT_AML_UPLOAD_DIR from InnerEye.ML.common import DATASET_CSV_FILE_NAME, ModelExecutionMode from InnerEye.ML.config import SegmentationModelBase @@ -65,27 +64,18 @@ def compare_scores_against_baselines(model_config: SegmentationModelBase, azure_ the Wilcoxon results file. """ # The attribute will only be present for a segmentation model; and it might be None or empty even for that. - comparison_blob_storage_paths = getattr(model_config, 'comparison_blob_storage_paths') + comparison_blob_storage_paths = model_config.comparison_blob_storage_paths if not comparison_blob_storage_paths: return - outputs_path = model_config.outputs_folder - if model_proc == ModelProcessing.ENSEMBLE_CREATION: - outputs_path = outputs_path / OTHER_RUNS_SUBDIR_NAME / ENSEMBLE_SPLIT_NAME - model_epoch_paths = sorted(outputs_path.glob(EPOCH_FOLDER_NAME_PATTERN)) - if not model_epoch_paths: - logging.warning("Cannot compare scores against baselines: no matches found for " - f"{outputs_path}/{EPOCH_FOLDER_NAME_PATTERN}") - return - # Use the last (highest-numbered) epoch path for the current run. - model_epoch_path = model_epoch_paths[-1] - model_metrics_path = model_epoch_path / ModelExecutionMode.TEST.value / SUBJECT_METRICS_FILE_NAME - model_dataset_path = model_epoch_path / ModelExecutionMode.TEST.value / DATASET_CSV_FILE_NAME + outputs_path = model_config.outputs_folder / get_best_epoch_results_path(ModelExecutionMode.TEST, model_proc) + if not outputs_path.is_dir(): + raise FileNotFoundError(f"Cannot compare scores against baselines: no best epoch results found at {outputs_path}") + model_metrics_path = outputs_path / SUBJECT_METRICS_FILE_NAME + model_dataset_path = outputs_path / DATASET_CSV_FILE_NAME if not model_dataset_path.exists(): - logging.warning(f"Not comparing with baselines because no {model_dataset_path} file found for this run") - return + raise FileNotFoundError(f"Not comparing with baselines because no {model_dataset_path} file found for this run") if not model_metrics_path.exists(): - logging.warning(f"Not comparing with baselines because no {model_metrics_path} file found for this run") - return + raise FileNotFoundError(f"Not comparing with baselines because no {model_metrics_path} file found for this run") model_metrics_df = pd.read_csv(model_metrics_path) model_dataset_df = pd.read_csv(model_dataset_path) comparison_result = download_and_compare_scores(outputs_path, diff --git a/InnerEye/ML/configs/segmentation/BasicModel2Epochs.py b/InnerEye/ML/configs/segmentation/BasicModel2Epochs.py index 5acfaaf01..efc4d1d10 100644 --- a/InnerEye/ML/configs/segmentation/BasicModel2Epochs.py +++ b/InnerEye/ML/configs/segmentation/BasicModel2Epochs.py @@ -13,9 +13,13 @@ fg_classes = ["spinalcord", "lung_r", "lung_l"] +default_single_comparison_blob = "refs_pull_483_merge_1624269679_90b1d23c/outputs/best_validation_epoch/Test" + class BasicModel2Epochs(SegmentationModelBase): def __init__(self, **kwargs: Any) -> None: + comparison_blob_storage_paths = kwargs.pop("comparison_blob_storage_paths", + [("Single", default_single_comparison_blob)]) super().__init__( should_validate=False, architecture="Basic", @@ -39,6 +43,7 @@ def __init__(self, **kwargs: Any) -> None: recovery_checkpoint_save_interval=1, use_mixed_precision=True, azure_dataset_id=AZURE_DATASET_ID, + comparison_blob_storage_paths=comparison_blob_storage_paths, dataset_mountpoint="/tmp/innereye", # Use an LR scheduler with a pronounced and clearly visible decay, to be able to easily see if that # is applied correctly in run recovery. diff --git a/RegressionTestResults/PR_BasicModel2Epochs/OUTPUT/best_validation_epoch/Test/BaselineComparisonWilcoxonSignedRankTestResults.txt b/RegressionTestResults/PR_BasicModel2Epochs/OUTPUT/best_validation_epoch/Test/BaselineComparisonWilcoxonSignedRankTestResults.txt new file mode 100644 index 000000000..3e7fbfc54 --- /dev/null +++ b/RegressionTestResults/PR_BasicModel2Epochs/OUTPUT/best_validation_epoch/Test/BaselineComparisonWilcoxonSignedRankTestResults.txt @@ -0,0 +1 @@ +There were not enough data points for any statistically meaningful comparisons. \ No newline at end of file diff --git a/RegressionTestResults/PR_BasicModel2Epochs/OUTPUT/best_validation_epoch/Test/MetricsAcrossAllRuns.csv b/RegressionTestResults/PR_BasicModel2Epochs/OUTPUT/best_validation_epoch/Test/MetricsAcrossAllRuns.csv new file mode 100644 index 000000000..1f0fa576a --- /dev/null +++ b/RegressionTestResults/PR_BasicModel2Epochs/OUTPUT/best_validation_epoch/Test/MetricsAcrossAllRuns.csv @@ -0,0 +1,7 @@ +,Patient,Structure,Dice,HausdorffDistance_mm,MeanDistance_mm,seriesId,institutionId,split,mode +0,5,lung_l,0.0,inf,inf,402ba5d42f37357f18f29af17b0846cbca4c430fea5b15a6baad5ec29dd6c9ba,b7f757fb-12e0-489e-a6da-f64895cdd229,CURRENT,Test +6,5,lung_r,0.0,inf,inf,402ba5d42f37357f18f29af17b0846cbca4c430fea5b15a6baad5ec29dd6c9ba,b7f757fb-12e0-489e-a6da-f64895cdd229,CURRENT,Test +12,5,spinalcord,0.0,inf,inf,402ba5d42f37357f18f29af17b0846cbca4c430fea5b15a6baad5ec29dd6c9ba,b7f757fb-12e0-489e-a6da-f64895cdd229,CURRENT,Test +0,5,lung_l,0.0,inf,inf,402ba5d42f37357f18f29af17b0846cbca4c430fea5b15a6baad5ec29dd6c9ba,b7f757fb-12e0-489e-a6da-f64895cdd229,Single,Test +6,5,lung_r,0.0,inf,inf,402ba5d42f37357f18f29af17b0846cbca4c430fea5b15a6baad5ec29dd6c9ba,b7f757fb-12e0-489e-a6da-f64895cdd229,Single,Test +12,5,spinalcord,0.0,inf,inf,402ba5d42f37357f18f29af17b0846cbca4c430fea5b15a6baad5ec29dd6c9ba,b7f757fb-12e0-489e-a6da-f64895cdd229,Single,Test diff --git a/RegressionTestResults/PR_BasicModel2Epochs/OUTPUT/best_validation_epoch/Test/scatterplots/Single_vs_CURRENT.png b/RegressionTestResults/PR_BasicModel2Epochs/OUTPUT/best_validation_epoch/Test/scatterplots/Single_vs_CURRENT.png new file mode 100644 index 000000000..7800cc7dc --- /dev/null +++ b/RegressionTestResults/PR_BasicModel2Epochs/OUTPUT/best_validation_epoch/Test/scatterplots/Single_vs_CURRENT.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:527930cc54b6e3581f87da7b9056b005cccd7c242f7afcc7382b73690d98f49d +size 56806 diff --git a/RegressionTestResults/PR_Train2Nodes/OUTPUT/best_validation_epoch/Test/BaselineComparisonWilcoxonSignedRankTestResults.txt b/RegressionTestResults/PR_Train2Nodes/OUTPUT/best_validation_epoch/Test/BaselineComparisonWilcoxonSignedRankTestResults.txt new file mode 100644 index 000000000..3e7fbfc54 --- /dev/null +++ b/RegressionTestResults/PR_Train2Nodes/OUTPUT/best_validation_epoch/Test/BaselineComparisonWilcoxonSignedRankTestResults.txt @@ -0,0 +1 @@ +There were not enough data points for any statistically meaningful comparisons. \ No newline at end of file diff --git a/RegressionTestResults/PR_Train2Nodes/OUTPUT/best_validation_epoch/Test/MetricsAcrossAllRuns.csv b/RegressionTestResults/PR_Train2Nodes/OUTPUT/best_validation_epoch/Test/MetricsAcrossAllRuns.csv new file mode 100644 index 000000000..5965bb8de --- /dev/null +++ b/RegressionTestResults/PR_Train2Nodes/OUTPUT/best_validation_epoch/Test/MetricsAcrossAllRuns.csv @@ -0,0 +1,16 @@ +,Patient,Structure,Dice,HausdorffDistance_mm,MeanDistance_mm,seriesId,institutionId,split,mode +0,4,lung_l,0.0,inf,inf,aee46d8ee00ed5a5b5d763957e58b8d8cd27f36b9722ada79937fead949f8581,b7f757fb-12e0-489e-a6da-f64895cdd229,CURRENT,Test +6,4,lung_r,0.0,inf,inf,aee46d8ee00ed5a5b5d763957e58b8d8cd27f36b9722ada79937fead949f8581,b7f757fb-12e0-489e-a6da-f64895cdd229,CURRENT,Test +12,4,spinalcord,0.0,inf,inf,aee46d8ee00ed5a5b5d763957e58b8d8cd27f36b9722ada79937fead949f8581,b7f757fb-12e0-489e-a6da-f64895cdd229,CURRENT,Test +18,5,lung_l,0.0,inf,inf,402ba5d42f37357f18f29af17b0846cbca4c430fea5b15a6baad5ec29dd6c9ba,b7f757fb-12e0-489e-a6da-f64895cdd229,CURRENT,Test +24,5,lung_r,0.0,inf,inf,402ba5d42f37357f18f29af17b0846cbca4c430fea5b15a6baad5ec29dd6c9ba,b7f757fb-12e0-489e-a6da-f64895cdd229,CURRENT,Test +30,5,spinalcord,0.0,inf,inf,402ba5d42f37357f18f29af17b0846cbca4c430fea5b15a6baad5ec29dd6c9ba,b7f757fb-12e0-489e-a6da-f64895cdd229,CURRENT,Test +36,6,lung_l,0.0,inf,inf,dfa85a2f00323404eac4fd784713e6e2dc980dc9e33fd9f5d3cda7b82c7e6ab2,b7f757fb-12e0-489e-a6da-f64895cdd229,CURRENT,Test +42,6,lung_r,0.0,inf,inf,dfa85a2f00323404eac4fd784713e6e2dc980dc9e33fd9f5d3cda7b82c7e6ab2,b7f757fb-12e0-489e-a6da-f64895cdd229,CURRENT,Test +48,6,spinalcord,0.0,inf,inf,dfa85a2f00323404eac4fd784713e6e2dc980dc9e33fd9f5d3cda7b82c7e6ab2,b7f757fb-12e0-489e-a6da-f64895cdd229,CURRENT,Test +54,7,lung_l,0.0,inf,inf,8e1c5884be4d4d6581242e862641b08ecb8b2a13d71b4a708f35a951db12793c,b7f757fb-12e0-489e-a6da-f64895cdd229,CURRENT,Test +60,7,lung_r,0.0,inf,inf,8e1c5884be4d4d6581242e862641b08ecb8b2a13d71b4a708f35a951db12793c,b7f757fb-12e0-489e-a6da-f64895cdd229,CURRENT,Test +66,7,spinalcord,0.0,inf,inf,8e1c5884be4d4d6581242e862641b08ecb8b2a13d71b4a708f35a951db12793c,b7f757fb-12e0-489e-a6da-f64895cdd229,CURRENT,Test +0,5,lung_l,0.0,inf,inf,402ba5d42f37357f18f29af17b0846cbca4c430fea5b15a6baad5ec29dd6c9ba,b7f757fb-12e0-489e-a6da-f64895cdd229,Single,Test +6,5,lung_r,0.0,inf,inf,402ba5d42f37357f18f29af17b0846cbca4c430fea5b15a6baad5ec29dd6c9ba,b7f757fb-12e0-489e-a6da-f64895cdd229,Single,Test +12,5,spinalcord,0.0,inf,inf,402ba5d42f37357f18f29af17b0846cbca4c430fea5b15a6baad5ec29dd6c9ba,b7f757fb-12e0-489e-a6da-f64895cdd229,Single,Test diff --git a/RegressionTestResults/PR_Train2Nodes/OUTPUT/best_validation_epoch/Test/scatterplots/Single_vs_CURRENT.png b/RegressionTestResults/PR_Train2Nodes/OUTPUT/best_validation_epoch/Test/scatterplots/Single_vs_CURRENT.png new file mode 100644 index 000000000..7800cc7dc --- /dev/null +++ b/RegressionTestResults/PR_Train2Nodes/OUTPUT/best_validation_epoch/Test/scatterplots/Single_vs_CURRENT.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:527930cc54b6e3581f87da7b9056b005cccd7c242f7afcc7382b73690d98f49d +size 56806 diff --git a/Tests/ML/runners/test_compare_against_baselines.py b/Tests/ML/runners/test_compare_against_baselines.py index 0f6388e50..69e5c854e 100644 --- a/Tests/ML/runners/test_compare_against_baselines.py +++ b/Tests/ML/runners/test_compare_against_baselines.py @@ -5,41 +5,96 @@ import pandas as pd import pytest +from typing import List +from unittest import mock from InnerEye.Common import common_util -from InnerEye.Common.common_util import BEST_EPOCH_FOLDER_NAME +from InnerEye.Common.common_util import BASELINE_WILCOXON_RESULTS_FILE, BEST_EPOCH_FOLDER_NAME, ENSEMBLE_SPLIT_NAME, \ + FULL_METRICS_DATAFRAME_FILE, OTHER_RUNS_SUBDIR_NAME, SUBJECT_METRICS_FILE_NAME, ModelProcessing from InnerEye.Common.fixed_paths import DEFAULT_AML_UPLOAD_DIR from InnerEye.Common.output_directories import OutputFolderForTests -from InnerEye.ML.baselines_util import ComparisonBaseline, get_comparison_baselines, perform_score_comparisons -from InnerEye.ML.common import ModelExecutionMode +from InnerEye.ML.baselines_util import ComparisonBaseline, compare_scores_against_baselines, get_comparison_baselines, \ + perform_score_comparisons +from InnerEye.ML.common import DATASET_CSV_FILE_NAME, ModelExecutionMode +from InnerEye.ML.config import SegmentationModelBase from Tests.ML.util import get_default_azure_config from Tests.AfterTraining.test_after_training import get_most_recent_run_id -@pytest.mark.skipif(common_util.is_windows(), reason="Loading tk sometimes fails on Windows") -def test_perform_score_comparisons() -> None: +def create_dataset_df() -> pd.DataFrame: + """ + Create a test dataframe for DATASET_CSV_FILE_NAME. + + :return: Test dataframe. + """ dataset_df = pd.DataFrame() dataset_df['subject'] = list(range(10)) dataset_df['seriesId'] = [f"s{i}" for i in range(10)] dataset_df['institutionId'] = ["xyz"] * 10 + return dataset_df + + +def create_metrics_df() -> pd.DataFrame: + """ + Create a test dataframe for SUBJECT_METRICS_FILE_NAME. + + :return: Test dataframe. + """ metrics_df = pd.DataFrame() metrics_df['Patient'] = list(range(10)) metrics_df['Structure'] = ['appendix'] * 10 metrics_df['Dice'] = [0.5 + i * 0.02 for i in range(10)] + return metrics_df + + +def create_comparison_metrics_df() -> pd.DataFrame: + """ + Create a test dataframe for comparison metrics. + + :return: Test dataframe. + """ comparison_metrics_df = pd.DataFrame() comparison_metrics_df['Patient'] = list(range(10)) comparison_metrics_df['Structure'] = ['appendix'] * 10 comparison_metrics_df['Dice'] = [0.51 + i * 0.02 for i in range(10)] + return comparison_metrics_df + + +def create_comparison_baseline(dataset_df: pd.DataFrame) -> ComparisonBaseline: + """ + Create a test ComparisonBaseline. + + :param dataset_df: Dataset dataframe. + :return: New test ComparisonBaseline + """ + comparison_metrics_df = create_comparison_metrics_df() comparison_name = "DefaultName" comparison_run_rec_id = "DefaultRunRecId" - baseline = ComparisonBaseline(comparison_name, dataset_df, comparison_metrics_df, comparison_run_rec_id) + return ComparisonBaseline(comparison_name, dataset_df, comparison_metrics_df, comparison_run_rec_id) + + +def check_wilcoxon_lines(wilcoxon_lines: List[str], baseline: ComparisonBaseline) -> None: + """ + Assert that the wilcoxon lines are as expected. + + :param wilcoxon_lines: Lines to test. + :param baseline: Expected comparison baseline. + """ + assert wilcoxon_lines[0] == f"Run 1: {baseline.name}" + assert wilcoxon_lines[1] == "Run 2: CURRENT" + assert wilcoxon_lines[3].find("WORSE") > 0 + + +@pytest.mark.skipif(common_util.is_windows(), reason="Loading tk sometimes fails on Windows") +def test_perform_score_comparisons() -> None: + dataset_df = create_dataset_df() + metrics_df = create_metrics_df() + baseline = create_comparison_baseline(dataset_df) result = perform_score_comparisons(dataset_df, metrics_df, [baseline]) assert result.did_comparisons assert len(result.wilcoxon_lines) == 5 - assert result.wilcoxon_lines[0] == f"Run 1: {comparison_name}" - assert result.wilcoxon_lines[1] == "Run 2: CURRENT" - assert result.wilcoxon_lines[3].find("WORSE") > 0 - assert list(result.plots.keys()) == [f"{comparison_name}_vs_CURRENT"] + check_wilcoxon_lines(result.wilcoxon_lines, baseline) + assert list(result.plots.keys()) == [f"{baseline.name}_vs_CURRENT"] @pytest.mark.after_training_single_run @@ -52,3 +107,82 @@ def test_get_comparison_data(test_output_dirs: OutputFolderForTests) -> None: azure_config, [(comparison_name, comparison_path)]) assert len(baselines) == 1 assert baselines[0].name == comparison_name + + +@pytest.mark.parametrize('model_proc', [ModelProcessing.DEFAULT, ModelProcessing.ENSEMBLE_CREATION]) +def test_compare_scores_against_baselines_throws(model_proc: ModelProcessing, + test_output_dirs: OutputFolderForTests) -> None: + """ + Test that exceptions are raised if the files necessary for baseline comparison are missing, + then test that when all required files are present that baseline comparison files are written. + + :param model_proc: Model processing to test. + :param test_output_dirs: Test output directories. + :return: None. + """ + config = SegmentationModelBase(should_validate=False, + comparison_blob_storage_paths=[ + ('Single', 'dummy_blob_single/outputs/epoch_120/Test'), + ('5fold', 'dummy_blob_ensemble/outputs/epoch_120/Test')]) + config.set_output_to(test_output_dirs.root_dir) + + azure_config = get_default_azure_config() + + # If the BEST_EPOCH_FOLDER_NAME folder is missing, expect an exception to be raised. + with pytest.raises(FileNotFoundError) as ex: + compare_scores_against_baselines( + model_config=config, + azure_config=azure_config, model_proc=model_proc) + assert "Cannot compare scores against baselines: no best epoch results found at" in str(ex) + + best_epoch_folder_path = config.outputs_folder + if model_proc == ModelProcessing.ENSEMBLE_CREATION: + best_epoch_folder_path = best_epoch_folder_path / OTHER_RUNS_SUBDIR_NAME / ENSEMBLE_SPLIT_NAME + best_epoch_folder_path = best_epoch_folder_path / BEST_EPOCH_FOLDER_NAME / ModelExecutionMode.TEST.value + + best_epoch_folder_path.mkdir(parents=True) + + # If the BEST_EPOCH_FOLDER_NAME folder exists but DATASET_CSV_FILE_NAME is missing, + # expect an exception to be raised. + with pytest.raises(FileNotFoundError) as ex: + compare_scores_against_baselines( + model_config=config, + azure_config=azure_config, model_proc=model_proc) + assert "Not comparing with baselines because no " in str(ex) + assert DATASET_CSV_FILE_NAME in str(ex) + + model_dataset_path = best_epoch_folder_path / DATASET_CSV_FILE_NAME + dataset_df = create_dataset_df() + dataset_df.to_csv(model_dataset_path) + + # If the BEST_EPOCH_FOLDER_NAME folder exists but SUBJECT_METRICS_FILE_NAME is missing, + # expect an exception to be raised. + with pytest.raises(FileNotFoundError) as ex: + compare_scores_against_baselines( + model_config=config, + azure_config=azure_config, model_proc=model_proc) + assert "Not comparing with baselines because no " in str(ex) + assert SUBJECT_METRICS_FILE_NAME in str(ex) + + model_metrics_path = best_epoch_folder_path / SUBJECT_METRICS_FILE_NAME + metrics_df = create_metrics_df() + metrics_df.to_csv(model_metrics_path) + + baseline = create_comparison_baseline(dataset_df) + + # Patch get_comparison_baselines to return the baseline above. + with mock.patch('InnerEye.ML.baselines_util.get_comparison_baselines', return_value=[baseline]): + compare_scores_against_baselines( + model_config=config, + azure_config=azure_config, model_proc=model_proc) + + # Check the wilcoxoon results file is present and has expected contents. + wilcoxon_path = best_epoch_folder_path / BASELINE_WILCOXON_RESULTS_FILE + assert wilcoxon_path.is_file() + + wilcoxon_lines = [line.strip() for line in wilcoxon_path.read_text().splitlines()] + check_wilcoxon_lines(wilcoxon_lines, baseline) + + # Check the full metrics results file is present. + full_metrics_path = best_epoch_folder_path / FULL_METRICS_DATAFRAME_FILE + assert full_metrics_path.is_file()