diff --git a/CHANGELOG.md b/CHANGELOG.md index af057d167..bbe10f2cd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -32,6 +32,7 @@ any large models anymore because data loaders ran out of memory. ### Added +- ([#489](https://github.com/microsoft/InnerEye-DeepLearning/pull/489)) Remove portal query for outliers. - ([#488](https://github.com/microsoft/InnerEye-DeepLearning/pull/488)) Better handling of missing seriesId in segmentation cross validation reports. - ([#454](https://github.com/microsoft/InnerEye-DeepLearning/pull/454)) Checking that labels are mutually exclusive. - ([#447](https://github.com/microsoft/InnerEye-DeepLearning/pull/447/)) Added a sanity check to ensure there are no diff --git a/InnerEye/ML/visualizers/plot_cross_validation.py b/InnerEye/ML/visualizers/plot_cross_validation.py index dc2e35b61..b8fc8cac1 100644 --- a/InnerEye/ML/visualizers/plot_cross_validation.py +++ b/InnerEye/ML/visualizers/plot_cross_validation.py @@ -51,8 +51,6 @@ MAX_STRUCTURES_PER_PLOT = 7 DRIVER_LOG_BASENAME = "70_driver_log.txt" RUN_RECOVERY_ID_KEY = 'run_recovery_id' -# noinspection SQL -PORTAL_QUERY_TEMPLATE = "SELECT * FROM ROOT as r WHERE true AND ({}) AND ({})" WILCOXON_RESULTS_FILE = "CrossValidationWilcoxonSignedRankTestResults.txt" MANN_WHITNEY_RESULTS_FILE = "CrossValidationMannWhitneyTestResults.txt" METRICS_BY_MODE_AND_STRUCTURE_FILE = "ResultsByModeAndStructure.csv" @@ -655,18 +653,20 @@ def plot_metrics(config: PlotCrossValidationConfig, def save_outliers(config: PlotCrossValidationConfig, - dataset_split_metrics: Dict[ModelExecutionMode, pd.DataFrame], root: Path) -> None: + dataset_split_metrics: Dict[ModelExecutionMode, pd.DataFrame], + root: Path) -> Dict[ModelExecutionMode, Path]: """ Given the dataframe for the downloaded metrics identifies outliers (score < mean - 3sd) across the splits and saves them in a file outlier.csv in the provided root. :param config: PlotCrossValidationConfig :param dataset_split_metrics: Mapping between model execution mode and a dataframe containing all metrics for it :param root: Root directory to the results for Train/Test and Val datasets - :return: + :return: Dictionary of mode and file path. """ stats_columns = ['count', 'mean', 'min', 'max'] + outliers_paths = {} for mode, df in dataset_split_metrics.items(): - outliers_std = str(root / "{}_outliers.txt".format(mode.value)) + outliers_std = root / "{}_outliers.txt".format(mode.value) with open(outliers_std, 'w') as f: # to make sure no columns or rows are truncated with DEFAULT_PD_DISPLAY_CONTEXT: @@ -679,36 +679,23 @@ def save_outliers(config: PlotCrossValidationConfig, f.write(f"\n\n=== METRIC: {metric_type} ===\n\n") if len(outliers) > 0: - # If running inside institution there may be no CSV_SERIES_HEADER and CSV_INSTITUTION_HEADER columns + # If running inside institution there may be no CSV_SERIES_HEADER or CSV_INSTITUTION_HEADER columns groupby_columns = [MetricsFileColumns.Patient.value, MetricsFileColumns.Structure.value] - if CSV_SERIES_HEADER in outliers.columns and CSV_INSTITUTION_HEADER in outliers.columns: - groupby_columns += [CSV_SERIES_HEADER, CSV_INSTITUTION_HEADER] + if CSV_SERIES_HEADER in outliers.columns: + groupby_columns.append(CSV_SERIES_HEADER) + if CSV_INSTITUTION_HEADER in outliers.columns: + groupby_columns.append(CSV_INSTITUTION_HEADER) outliers_summary = str(outliers.groupby(groupby_columns) .describe()[metric_type][stats_columns] .sort_values(stats_columns, ascending=False)) f.write(outliers_summary) - if CSV_INSTITUTION_HEADER in outliers.columns and CSV_SERIES_HEADER in outliers.columns: - f.write("\n\n") - f.write(create_portal_query_for_outliers(outliers)) else: f.write("No outliers found") print("Saved outliers to: {}".format(outliers_std)) + outliers_paths[mode] = outliers_std - -def create_portal_query_for_outliers(df: pd.DataFrame) -> str: - """ - Create a portal query string as a conjunction of the disjunctions of the unique InstitutionId and seriesId values. - - The passed data frame must have CSV_INSTITUTION_HEADER and CSV_SERIES_HEADER columns - """ - if CSV_INSTITUTION_HEADER not in df.columns or CSV_SERIES_HEADER not in df.columns: - raise ValueError(f"Data frame must have columns {CSV_INSTITUTION_HEADER} and {CSV_SERIES_HEADER}") - return PORTAL_QUERY_TEMPLATE.format( - " OR ".join(map(lambda x: 'r.InstitutionId = "{}"'.format(x), df[CSV_INSTITUTION_HEADER].unique())), - " OR ".join(map(lambda x: 'STARTSWITH(r.VersionedDicomImageSeries.Latest.Series.InstanceUID,"{}")'.format(x), - df[CSV_SERIES_HEADER].unique())) - ) + return outliers_paths def create_results_breakdown(df: pd.DataFrame, root_folder: Path) -> Tuple[Path, Path]: diff --git a/Tests/ML/visualizers/test_plot_cross_validation.py b/Tests/ML/visualizers/test_plot_cross_validation.py index 6bbf3ed4e..9536957ec 100644 --- a/Tests/ML/visualizers/test_plot_cross_validation.py +++ b/Tests/ML/visualizers/test_plot_cross_validation.py @@ -8,7 +8,6 @@ import pandas as pd import pytest -from pytest import raises from azureml.core import Run from pandas.core.dtypes.common import is_string_dtype @@ -25,8 +24,8 @@ from InnerEye.ML.utils.csv_util import CSV_INSTITUTION_HEADER, CSV_SERIES_HEADER from InnerEye.ML.visualizers.plot_cross_validation import COL_MODE, \ METRICS_BY_MODE_AND_STRUCTURE_FILE, METRICS_BY_MODE_FILE, \ - OfflineCrossvalConfigAndFiles, PORTAL_QUERY_TEMPLATE, PlotCrossValidationConfig, RUN_RECOVERY_ID_KEY, \ - RunResultFiles, add_comparison_data, check_result_file_counts, create_portal_query_for_outliers, \ + OfflineCrossvalConfigAndFiles, PlotCrossValidationConfig, RUN_RECOVERY_ID_KEY, \ + RunResultFiles, add_comparison_data, check_result_file_counts, \ create_results_breakdown, download_crossval_result_files, get_split_id, load_dataframes, \ plot_cross_validation_from_files, save_outliers from Tests.AfterTraining.test_after_training import get_most_recent_run_id @@ -331,38 +330,16 @@ def test_save_outliers(test_config: PlotCrossValidationConfig, test_config.outlier_range = 0 assert test_config.run_recovery_id dataset_split_metrics = {x: _get_metrics_df(test_config.run_recovery_id, x) for x in [ModelExecutionMode.VAL]} - save_outliers(test_config, dataset_split_metrics, test_config.outputs_directory) + outliers_paths = save_outliers(test_config, dataset_split_metrics, test_config.outputs_directory) filename = f"{ModelExecutionMode.VAL.value}_outliers.txt" - assert_text_files_match(full_file=test_config.outputs_directory / filename, expected_file=full_ml_test_data_path(filename)) + assert_text_files_match(full_file=outliers_paths[ModelExecutionMode.VAL], expected_file=full_ml_test_data_path(filename)) # Now test without the CSV_INSTITUTION_HEADER and CSV_SERIES_HEADER columns, which will be missing in institutions' environments dataset_split_metrics_pruned = { x: _get_metrics_df(test_config.run_recovery_id, x).drop(columns=[CSV_INSTITUTION_HEADER, CSV_SERIES_HEADER], errors="ignore") for x in [ModelExecutionMode.VAL]} - save_outliers(test_config, dataset_split_metrics_pruned, test_config.outputs_directory) + outliers_paths = save_outliers(test_config, dataset_split_metrics_pruned, test_config.outputs_directory) test_data_filename = f"{ModelExecutionMode.VAL.value}_outliers_pruned.txt" - assert_text_files_match(full_file=test_config.outputs_directory / filename, expected_file=full_ml_test_data_path(test_data_filename)) - - -def test_create_portal_query_for_outliers() -> None: - test_df = pd.DataFrame({ - CSV_INSTITUTION_HEADER: range(2), - CSV_SERIES_HEADER: range(3, 5), - "other": range(2) - }) - expected = PORTAL_QUERY_TEMPLATE.format('r.InstitutionId = "0" OR r.InstitutionId = "1"', - 'STARTSWITH(r.VersionedDicomImageSeries.Latest.Series.InstanceUID,"3") OR ' - 'STARTSWITH(r.VersionedDicomImageSeries.Latest.Series.InstanceUID,"4")') - assert expected == create_portal_query_for_outliers(test_df) - with raises(ValueError) as institution_column_missing_error: - test_df_pruned = test_df.drop(columns=[CSV_INSTITUTION_HEADER]) - create_portal_query_for_outliers(test_df_pruned) - error_message = str(institution_column_missing_error.value) - assert CSV_INSTITUTION_HEADER in error_message - with raises(ValueError) as series_column_missing_error: - test_df_pruned = test_df.drop(columns=[CSV_SERIES_HEADER]) - create_portal_query_for_outliers(test_df_pruned) - error_message = str(series_column_missing_error.value) - assert CSV_SERIES_HEADER in error_message + assert_text_files_match(full_file=outliers_paths[ModelExecutionMode.VAL], expected_file=full_ml_test_data_path(test_data_filename)) def test_create_summary(test_output_dirs: OutputFolderForTests) -> None: