From f929e98bb210604f59b40c937b3cbb9dbbbcdeda Mon Sep 17 00:00:00 2001 From: Behrooz <3968947+behxyz@users.noreply.github.com> Date: Mon, 22 Mar 2021 23:51:37 +0000 Subject: [PATCH 01/22] Implement PatchWSIDataset and SmartCachePathWSIDataset Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com> --- monai/apps/pathology/datasets.py | 145 +++++++++++++++++++++++++++++++ 1 file changed, 145 insertions(+) create mode 100644 monai/apps/pathology/datasets.py diff --git a/monai/apps/pathology/datasets.py b/monai/apps/pathology/datasets.py new file mode 100644 index 0000000000..53eae60c3b --- /dev/null +++ b/monai/apps/pathology/datasets.py @@ -0,0 +1,145 @@ +# Copyright 2020 - 2021 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +from typing import Callable, Dict, List, Optional, Sequence, Union, Tuple + +import numpy as np + +from monai.data import Dataset, SmartCacheDataset +from monai.data.image_reader import WSIReader + +__all__ = ["PatchWSIDataset", "SmartCachePatchWSIDataset"] + + +class PatchWSIDataset(Dataset): + """ + This dataset read whole slide images, extract regions, and crate patches. + It reads labels for each patch and privide each patch with its associated class labels. + + Args: + data: The input image directory and the label file + [{"image": "path/to/image/directory", "label": "path/to/label/file.txt"}] + region_size: the region to be extracted from the whole slide image + grid_size: the grid size on which the patches should be extracted + patch_size: the patches extracted from the region on the grid + image_reader_name: (cuCIM is default) + transform: + + """ + + def __init__( + self, + data: dict, + region_size: Union[int, Tuple[int, int]], + grid_size: Union[int, Tuple[int, int]], + patch_size: Union[int, Tuple[int, int]], + image_reader_name="cuCIM", + transform=None, + ): + if type(region_size) == int: + self.region_size = (region_size, region_size) + else: + self.region_size = region_size + + if type(grid_size) == int: + self.grid_size = (grid_size, grid_size) + else: + self.grid_size = grid_size + + self.patch_size = patch_size + self.sub_region_size = (self.region_size[0] / self.grid_size[0], self.region_size[1] / self.grid_size[1]) + + self.transform = transform + self.image_base_path = data[0]["image"] + self.samples = self.load_samples(data[0]["label"]) + self.image_path_list = {x[0] for x in self.samples} + self.num_samples = len(self.samples) + + self.image_reader_name = image_reader_name + self.image_reader = WSIReader(image_reader_name) + self.wsi_object_dict = {} + self._fetch_wsi_objects() + + def _fetch_wsi_objects(self): + for image_path in self.image_path_list: + self.wsi_object_dict[image_path] = self.image_reader.read(image_path) + + def process_label_row(self, row): + row = row.strip("\n").split(",") + # create full image path + image_name = row[0] + ".tif" + image_path = os.path.join(self.image_base_path, image_name) + # change center locations to upper left location + location = (int(row[2]) - self.region_size[1] // 2, int(row[1]) - self.region_size[0] // 2) + # convert labels to float32 and add empty HxW channel to label + labels = tuple(int(lbl) for lbl in row[3:]) + labels = np.array(labels, dtype=np.float32)[:, np.newaxis, np.newaxis] + return image_path, location, labels + + def load_samples(self, loc_path): + with open(loc_path) as label_file: + rows = [self.process_label_row(row) for row in label_file.readlines()] + return rows + + def __len__(self): + return self.num_samples + + def __getitem__(self, index): + image_path, location, labels = self.samples[index] + # OpenSlide causes issue if using the stored image objects + if self.image_reader_name == "openslide": + img_obj = self.image_reader.read(image_path) + else: + img_obj = self.wsi_object_dict[image_path] + images, _ = self.image_reader.get_data( + img=img_obj, + location=location, + size=self.region_size, + grid_shape=self.grid_size, + patch_size=self.patch_size, + ) + samples = [{"image": images[i], "label": labels[i]} for i in range(labels.shape[0])] + if self.transform: + samples = self.transform(samples) + return samples + + +class SmartCachePatchWSIDataset(SmartCacheDataset): + """ + Add SmartCache functionality to PatchWSIDataset + """ + + def __init__( + self, + data, + region_size, + grid_size, + patch_size, + transform, + replace_rate, + cache_num, + cache_rate=1.0, + num_init_workers=None, + num_replace_workers=0, + image_reader_name="cuCIM", + ): + extractor = PatchWSIDataset(data, region_size, grid_size, patch_size, image_reader_name) + super().__init__( + data=extractor, + transform=transform, + replace_rate=replace_rate, + cache_num=cache_num, + cache_rate=cache_rate, + num_init_workers=num_init_workers, + num_replace_workers=num_replace_workers, + ) From ea0b51547a62a551850d5bd044c29ae8e994206c Mon Sep 17 00:00:00 2001 From: Behrooz <3968947+behxyz@users.noreply.github.com> Date: Tue, 23 Mar 2021 18:12:56 +0000 Subject: [PATCH 02/22] Remove label preprocessing and adopt new type of inputs Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com> --- monai/apps/pathology/datasets.py | 30 +++++++----------------------- 1 file changed, 7 insertions(+), 23 deletions(-) diff --git a/monai/apps/pathology/datasets.py b/monai/apps/pathology/datasets.py index 53eae60c3b..4ed654468d 100644 --- a/monai/apps/pathology/datasets.py +++ b/monai/apps/pathology/datasets.py @@ -11,6 +11,7 @@ import os import sys +import json from typing import Callable, Dict, List, Optional, Sequence, Union, Tuple import numpy as np @@ -28,7 +29,7 @@ class PatchWSIDataset(Dataset): Args: data: The input image directory and the label file - [{"image": "path/to/image/directory", "label": "path/to/label/file.txt"}] + [{"image": "path/to/image1", "label": [0,0,0,1,0,1,0,0,1]}, "location": [200, 500]] region_size: the region to be extracted from the whole slide image grid_size: the grid size on which the patches should be extracted patch_size: the patches extracted from the region on the grid @@ -60,37 +61,20 @@ def __init__( self.sub_region_size = (self.region_size[0] / self.grid_size[0], self.region_size[1] / self.grid_size[1]) self.transform = transform - self.image_base_path = data[0]["image"] - self.samples = self.load_samples(data[0]["label"]) - self.image_path_list = {x[0] for x in self.samples} + self.samples = data self.num_samples = len(self.samples) + self.image_path_list = list({x['image'] for x in self.samples}) self.image_reader_name = image_reader_name self.image_reader = WSIReader(image_reader_name) - self.wsi_object_dict = {} + self.wsi_object_dict = None self._fetch_wsi_objects() def _fetch_wsi_objects(self): + self.wsi_object_dict = {} for image_path in self.image_path_list: self.wsi_object_dict[image_path] = self.image_reader.read(image_path) - def process_label_row(self, row): - row = row.strip("\n").split(",") - # create full image path - image_name = row[0] + ".tif" - image_path = os.path.join(self.image_base_path, image_name) - # change center locations to upper left location - location = (int(row[2]) - self.region_size[1] // 2, int(row[1]) - self.region_size[0] // 2) - # convert labels to float32 and add empty HxW channel to label - labels = tuple(int(lbl) for lbl in row[3:]) - labels = np.array(labels, dtype=np.float32)[:, np.newaxis, np.newaxis] - return image_path, location, labels - - def load_samples(self, loc_path): - with open(loc_path) as label_file: - rows = [self.process_label_row(row) for row in label_file.readlines()] - return rows - def __len__(self): return self.num_samples @@ -130,7 +114,7 @@ def __init__( cache_num, cache_rate=1.0, num_init_workers=None, - num_replace_workers=0, + num_replace_workers=None, image_reader_name="cuCIM", ): extractor = PatchWSIDataset(data, region_size, grid_size, patch_size, image_reader_name) From 1d7ec171465c5aa2ee6d339e2ccce024ef5d65c2 Mon Sep 17 00:00:00 2001 From: Behrooz <3968947+behxyz@users.noreply.github.com> Date: Tue, 23 Mar 2021 19:46:15 +0000 Subject: [PATCH 03/22] Update type hints Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com> --- monai/apps/pathology/datasets.py | 36 ++++++++++++++------------------ 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/monai/apps/pathology/datasets.py b/monai/apps/pathology/datasets.py index 4ed654468d..77f9436cc9 100644 --- a/monai/apps/pathology/datasets.py +++ b/monai/apps/pathology/datasets.py @@ -9,12 +9,8 @@ # See the License for the specific language governing permissions and # limitations under the License. -import os import sys -import json -from typing import Callable, Dict, List, Optional, Sequence, Union, Tuple - -import numpy as np +from typing import Callable, List, Optional, Sequence, Tuple, Union from monai.data import Dataset, SmartCacheDataset from monai.data.image_reader import WSIReader @@ -40,12 +36,12 @@ class PatchWSIDataset(Dataset): def __init__( self, - data: dict, + data: List, region_size: Union[int, Tuple[int, int]], grid_size: Union[int, Tuple[int, int]], patch_size: Union[int, Tuple[int, int]], - image_reader_name="cuCIM", - transform=None, + image_reader_name: str = "cuCIM", + transform: Union[Sequence[Callable], Callable] = None, ): if type(region_size) == int: self.region_size = (region_size, region_size) @@ -63,7 +59,7 @@ def __init__( self.transform = transform self.samples = data self.num_samples = len(self.samples) - self.image_path_list = list({x['image'] for x in self.samples}) + self.image_path_list = list({x["image"] for x in self.samples}) self.image_reader_name = image_reader_name self.image_reader = WSIReader(image_reader_name) @@ -105,17 +101,17 @@ class SmartCachePatchWSIDataset(SmartCacheDataset): def __init__( self, - data, - region_size, - grid_size, - patch_size, - transform, - replace_rate, - cache_num, - cache_rate=1.0, - num_init_workers=None, - num_replace_workers=None, - image_reader_name="cuCIM", + data: List, + region_size: Union[int, Tuple[int, int]], + grid_size: Union[int, Tuple[int, int]], + patch_size: Union[int, Tuple[int, int]], + image_reader_name: str = "cuCIM", + transform: Union[Sequence[Callable], Callable] = None, + replace_rate: float = 0.5, + cache_num: int = sys.maxsize, + cache_rate: float = 1.0, + num_init_workers: Optional[int] = None, + num_replace_workers: Optional[int] = None, ): extractor = PatchWSIDataset(data, region_size, grid_size, patch_size, image_reader_name) super().__init__( From 9065fe05ae06a0df8de92c4a2e35ffbe88e6bf49 Mon Sep 17 00:00:00 2001 From: Behrooz <3968947+behxyz@users.noreply.github.com> Date: Tue, 23 Mar 2021 19:53:10 +0000 Subject: [PATCH 04/22] Add init file Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com> --- monai/apps/pathology/__init__.py | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 monai/apps/pathology/__init__.py diff --git a/monai/apps/pathology/__init__.py b/monai/apps/pathology/__init__.py new file mode 100644 index 0000000000..14ae193634 --- /dev/null +++ b/monai/apps/pathology/__init__.py @@ -0,0 +1,10 @@ +# Copyright 2020 - 2021 MONAI Consortium +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. From ce4150df1bdccb1aab61197ce34a613554eabc13 Mon Sep 17 00:00:00 2001 From: Behrooz <3968947+behxyz@users.noreply.github.com> Date: Tue, 23 Mar 2021 23:12:57 +0000 Subject: [PATCH 05/22] Change grid_size to grid_shape Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com> --- monai/apps/pathology/datasets.py | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/monai/apps/pathology/datasets.py b/monai/apps/pathology/datasets.py index 77f9436cc9..d2df1cf28c 100644 --- a/monai/apps/pathology/datasets.py +++ b/monai/apps/pathology/datasets.py @@ -27,7 +27,7 @@ class PatchWSIDataset(Dataset): data: The input image directory and the label file [{"image": "path/to/image1", "label": [0,0,0,1,0,1,0,0,1]}, "location": [200, 500]] region_size: the region to be extracted from the whole slide image - grid_size: the grid size on which the patches should be extracted + grid_shape: the grid shape on which the patches should be extracted patch_size: the patches extracted from the region on the grid image_reader_name: (cuCIM is default) transform: @@ -38,7 +38,7 @@ def __init__( self, data: List, region_size: Union[int, Tuple[int, int]], - grid_size: Union[int, Tuple[int, int]], + grid_shape: Union[int, Tuple[int, int]], patch_size: Union[int, Tuple[int, int]], image_reader_name: str = "cuCIM", transform: Union[Sequence[Callable], Callable] = None, @@ -48,13 +48,13 @@ def __init__( else: self.region_size = region_size - if type(grid_size) == int: - self.grid_size = (grid_size, grid_size) + if type(grid_shape) == int: + self.grid_shape = (grid_shape, grid_shape) else: - self.grid_size = grid_size + self.grid_shape = grid_shape self.patch_size = patch_size - self.sub_region_size = (self.region_size[0] / self.grid_size[0], self.region_size[1] / self.grid_size[1]) + self.sub_region_size = (self.region_size[0] / self.grid_shape[0], self.region_size[1] / self.grid_shape[1]) self.transform = transform self.samples = data @@ -75,20 +75,20 @@ def __len__(self): return self.num_samples def __getitem__(self, index): - image_path, location, labels = self.samples[index] + data = self.samples[index] # OpenSlide causes issue if using the stored image objects if self.image_reader_name == "openslide": - img_obj = self.image_reader.read(image_path) + img_obj = self.image_reader.read(data["image"]) else: - img_obj = self.wsi_object_dict[image_path] + img_obj = self.wsi_object_dict[data["image"]] images, _ = self.image_reader.get_data( img=img_obj, - location=location, + location=data["location"], size=self.region_size, - grid_shape=self.grid_size, + grid_shape=self.grid_shape, patch_size=self.patch_size, ) - samples = [{"image": images[i], "label": labels[i]} for i in range(labels.shape[0])] + samples = [{"image": images[i], "label": data["label"][i]} for i in range(len(data["label"]))] if self.transform: samples = self.transform(samples) return samples @@ -103,7 +103,7 @@ def __init__( self, data: List, region_size: Union[int, Tuple[int, int]], - grid_size: Union[int, Tuple[int, int]], + grid_shape: Union[int, Tuple[int, int]], patch_size: Union[int, Tuple[int, int]], image_reader_name: str = "cuCIM", transform: Union[Sequence[Callable], Callable] = None, @@ -113,7 +113,7 @@ def __init__( num_init_workers: Optional[int] = None, num_replace_workers: Optional[int] = None, ): - extractor = PatchWSIDataset(data, region_size, grid_size, patch_size, image_reader_name) + extractor = PatchWSIDataset(data, region_size, grid_shape, patch_size, image_reader_name) super().__init__( data=extractor, transform=transform, From f2c83069452a78a1778e6f684a633ff46bc6a2be Mon Sep 17 00:00:00 2001 From: Behrooz <3968947+behxyz@users.noreply.github.com> Date: Tue, 23 Mar 2021 23:13:44 +0000 Subject: [PATCH 06/22] Add a unittest for PatchWSIDataset Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com> --- tests/test_patch_wsi_dataset.py | 55 +++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 tests/test_patch_wsi_dataset.py diff --git a/tests/test_patch_wsi_dataset.py b/tests/test_patch_wsi_dataset.py new file mode 100644 index 0000000000..fc0e879f1d --- /dev/null +++ b/tests/test_patch_wsi_dataset.py @@ -0,0 +1,55 @@ +import os +import unittest +from unittest import skipUnless +from urllib import request + +import numpy as np +from numpy.testing import assert_array_equal +from parameterized import parameterized + +from monai.apps.pathology.datasets import PatchWSIDataset + +FILE_URL = "http://openslide.cs.cmu.edu/download/openslide-testdata/Generic-TIFF/CMU-1.tiff" + +TEST_CASE_1 = [ + FILE_URL, + { + "data": [{"image": "./CMU-1.tiff", "location": [10000, 20000], "label": [0, 0, 0, 1]}], + "region_size": (8, 8), + "grid_shape": (2, 2), + "patch_size": 1, + "image_reader_name": "cuCIM", + }, + [ + {"image": np.array([[[247]], [[245]], [[248]]], dtype=np.uint8), "label": 0}, + {"image": np.array([[[245]], [[247]], [[244]]], dtype=np.uint8), "label": 0}, + {"image": np.array([[[246]], [[246]], [[246]]], dtype=np.uint8), "label": 0}, + {"image": np.array([[[246]], [[246]], [[246]]], dtype=np.uint8), "label": 1}, + ], +] + + +class TestCuCIMReader(unittest.TestCase): + @parameterized.expand([TEST_CASE_1]) + def test_read_patches(self, file_url, input_parameters, expected): + self.camelyon_data_download(file_url) + dataset = PatchWSIDataset(**input_parameters) + samples = dataset[0] + image_compare = [ + assert_array_equal(samples[i]["image"], expected[i]["image"]) is None for i in range(len(samples)) + ] + label_compare = [ + assert_array_equal(samples[i]["label"], expected[i]["label"]) is None for i in range(len(samples)) + ] + self.assertTrue(all(image_compare) and all(label_compare)) + + def camelyon_data_download(self, file_url): + filename = os.path.basename(file_url) + if not os.path.exists(filename): + print(f"Test image [{filename}] does not exist. Downloading...") + request.urlretrieve(file_url, filename) + return filename + + +if __name__ == "__main__": + unittest.main() From 3506740266ce6acebbf86aedbce7745b02a0228a Mon Sep 17 00:00:00 2001 From: Behrooz <3968947+behxyz@users.noreply.github.com> Date: Tue, 23 Mar 2021 23:46:39 +0000 Subject: [PATCH 07/22] Add more unittests Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com> --- tests/test_patch_wsi_dataset.py | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/tests/test_patch_wsi_dataset.py b/tests/test_patch_wsi_dataset.py index fc0e879f1d..95a2eea7a0 100644 --- a/tests/test_patch_wsi_dataset.py +++ b/tests/test_patch_wsi_dataset.py @@ -8,9 +8,30 @@ from parameterized import parameterized from monai.apps.pathology.datasets import PatchWSIDataset +from monai.utils import optional_import + +_, has_cim = optional_import("cucim") FILE_URL = "http://openslide.cs.cmu.edu/download/openslide-testdata/Generic-TIFF/CMU-1.tiff" +TEST_CASE_0 = [ + FILE_URL, + { + "data": [ + {"image": "./CMU-1.tiff", "location": [10000, 20000], "label": [1]}, + {"image": "./CMU-1.tiff", "location": [0, 0], "label": [0]}, + ], + "region_size": (1, 1), + "grid_shape": (1, 1), + "patch_size": 1, + "image_reader_name": "cuCIM", + }, + [ + {"image": np.array([[[246]], [[245]], [[250]]], dtype=np.uint8), "label": 1}, + {"image": np.array([[[239]], [[239]], [[239]]], dtype=np.uint8), "label": 0}, + ], +] + TEST_CASE_1 = [ FILE_URL, { @@ -18,7 +39,6 @@ "region_size": (8, 8), "grid_shape": (2, 2), "patch_size": 1, - "image_reader_name": "cuCIM", }, [ {"image": np.array([[[247]], [[245]], [[248]]], dtype=np.uint8), "label": 0}, @@ -29,8 +49,9 @@ ] -class TestCuCIMReader(unittest.TestCase): - @parameterized.expand([TEST_CASE_1]) +class TestPatchWSIDataset(unittest.TestCase): + @parameterized.expand([TEST_CASE_0, TEST_CASE_1]) + @skipUnless(has_cim, "Requires CuCIM") def test_read_patches(self, file_url, input_parameters, expected): self.camelyon_data_download(file_url) dataset = PatchWSIDataset(**input_parameters) From d81ef163e2a19c7e7e16af44e3fa689c48c7e40b Mon Sep 17 00:00:00 2001 From: Behrooz <3968947+behxyz@users.noreply.github.com> Date: Wed, 24 Mar 2021 01:21:59 +0000 Subject: [PATCH 08/22] Update docstrings and make minor changes Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com> --- monai/apps/pathology/datasets.py | 56 ++++++++++++++++++++++---------- 1 file changed, 38 insertions(+), 18 deletions(-) diff --git a/monai/apps/pathology/datasets.py b/monai/apps/pathology/datasets.py index d2df1cf28c..96a3638b7a 100644 --- a/monai/apps/pathology/datasets.py +++ b/monai/apps/pathology/datasets.py @@ -12,6 +12,8 @@ import sys from typing import Callable, List, Optional, Sequence, Tuple, Union +import numpy as np + from monai.data import Dataset, SmartCacheDataset from monai.data.image_reader import WSIReader @@ -21,16 +23,33 @@ class PatchWSIDataset(Dataset): """ This dataset read whole slide images, extract regions, and crate patches. - It reads labels for each patch and privide each patch with its associated class labels. + It also reads labels for each patch and privide each patch with its associated class labels. Args: - data: The input image directory and the label file - [{"image": "path/to/image1", "label": [0,0,0,1,0,1,0,0,1]}, "location": [200, 500]] + data: the list of input samples including image, location, and label (see below for more details) region_size: the region to be extracted from the whole slide image grid_shape: the grid shape on which the patches should be extracted patch_size: the patches extracted from the region on the grid - image_reader_name: (cuCIM is default) - transform: + image_reader_name: the name of library to be used for loading whole slide imaging, + either CuCIM or OpenSlide (the default is CuCIM) + transform: transforms to be executed on input data. + + Note: + The input data has the following form as an example: + [{"image": "path/to/image1.tiff", "location": [200, 500], "label": [0,0,0,1]}]. + + This means from "image1.tiff" extract a region at "location" with the side of "region_size", and + then extract patches with the size of "patch_size" from a square grid with the shape of "grid_shape". + Be aware the the "grid_shape" should construct a grid with the same number of element as "labels", so + for this example the "grid_size" should be (2, 2). + + The output will look like the following: + [ + {"image": np.array([...], dtype=np.uint8), "label": 0}, + {"image": np.array([...], dtype=np.uint8), "label": 0}, + {"image": np.array([...], dtype=np.uint8), "label": 0}, + {"image": np.array([...], dtype=np.uint8), "label": 1}, + ] """ @@ -39,7 +58,7 @@ def __init__( data: List, region_size: Union[int, Tuple[int, int]], grid_shape: Union[int, Tuple[int, int]], - patch_size: Union[int, Tuple[int, int]], + patch_size: int, image_reader_name: str = "cuCIM", transform: Union[Sequence[Callable], Callable] = None, ): @@ -58,13 +77,14 @@ def __init__( self.transform = transform self.samples = data - self.num_samples = len(self.samples) self.image_path_list = list({x["image"] for x in self.samples}) self.image_reader_name = image_reader_name self.image_reader = WSIReader(image_reader_name) self.wsi_object_dict = None - self._fetch_wsi_objects() + if self.image_reader_name != "openslide": + # OpenSlide causes memeory issue if we prefetch image objects + self._fetch_wsi_objects() def _fetch_wsi_objects(self): self.wsi_object_dict = {} @@ -72,26 +92,26 @@ def _fetch_wsi_objects(self): self.wsi_object_dict[image_path] = self.image_reader.read(image_path) def __len__(self): - return self.num_samples + return len(self.samples) def __getitem__(self, index): - data = self.samples[index] - # OpenSlide causes issue if using the stored image objects + sample = self.samples[index] if self.image_reader_name == "openslide": - img_obj = self.image_reader.read(data["image"]) + img_obj = self.image_reader.read(sample["image"]) else: - img_obj = self.wsi_object_dict[data["image"]] + img_obj = self.wsi_object_dict[sample["image"]] images, _ = self.image_reader.get_data( img=img_obj, - location=data["location"], + location=sample["location"], size=self.region_size, grid_shape=self.grid_shape, patch_size=self.patch_size, ) - samples = [{"image": images[i], "label": data["label"][i]} for i in range(len(data["label"]))] + labels = np.array(sample["label"], dtype=np.float32)[:, np.newaxis, np.newaxis] + patches = [{"image": images[i], "label": labels[i]} for i in range(len(sample["label"]))] if self.transform: - samples = self.transform(samples) - return samples + patches = self.transform(patches) + return patches class SmartCachePatchWSIDataset(SmartCacheDataset): @@ -104,7 +124,7 @@ def __init__( data: List, region_size: Union[int, Tuple[int, int]], grid_shape: Union[int, Tuple[int, int]], - patch_size: Union[int, Tuple[int, int]], + patch_size: int, image_reader_name: str = "cuCIM", transform: Union[Sequence[Callable], Callable] = None, replace_rate: float = 0.5, From 472c4212de2f0eb72f7df1adbfb7658e75be3f97 Mon Sep 17 00:00:00 2001 From: Behrooz <3968947+behxyz@users.noreply.github.com> Date: Wed, 24 Mar 2021 01:22:35 +0000 Subject: [PATCH 09/22] Convert labels to numpy to match the change in dataset Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com> --- tests/test_patch_wsi_dataset.py | 26 +++++++++++--------------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/tests/test_patch_wsi_dataset.py b/tests/test_patch_wsi_dataset.py index 95a2eea7a0..b36ffb8b87 100644 --- a/tests/test_patch_wsi_dataset.py +++ b/tests/test_patch_wsi_dataset.py @@ -18,8 +18,7 @@ FILE_URL, { "data": [ - {"image": "./CMU-1.tiff", "location": [10000, 20000], "label": [1]}, - {"image": "./CMU-1.tiff", "location": [0, 0], "label": [0]}, + {"image": "./CMU-1.tiff", "location": [0, 0], "label": [1]}, ], "region_size": (1, 1), "grid_shape": (1, 1), @@ -27,8 +26,7 @@ "image_reader_name": "cuCIM", }, [ - {"image": np.array([[[246]], [[245]], [[250]]], dtype=np.uint8), "label": 1}, - {"image": np.array([[[239]], [[239]], [[239]]], dtype=np.uint8), "label": 0}, + {"image": np.array([[[239]], [[239]], [[239]]], dtype=np.uint8), "label": np.array([[1]])}, ], ] @@ -41,10 +39,10 @@ "patch_size": 1, }, [ - {"image": np.array([[[247]], [[245]], [[248]]], dtype=np.uint8), "label": 0}, - {"image": np.array([[[245]], [[247]], [[244]]], dtype=np.uint8), "label": 0}, - {"image": np.array([[[246]], [[246]], [[246]]], dtype=np.uint8), "label": 0}, - {"image": np.array([[[246]], [[246]], [[246]]], dtype=np.uint8), "label": 1}, + {"image": np.array([[[247]], [[245]], [[248]]], dtype=np.uint8), "label": np.array([[0]])}, + {"image": np.array([[[245]], [[247]], [[244]]], dtype=np.uint8), "label": np.array([[0]])}, + {"image": np.array([[[246]], [[246]], [[246]]], dtype=np.uint8), "label": np.array([[0]])}, + {"image": np.array([[[246]], [[246]], [[246]]], dtype=np.uint8), "label": np.array([[1]])}, ], ] @@ -56,13 +54,11 @@ def test_read_patches(self, file_url, input_parameters, expected): self.camelyon_data_download(file_url) dataset = PatchWSIDataset(**input_parameters) samples = dataset[0] - image_compare = [ - assert_array_equal(samples[i]["image"], expected[i]["image"]) is None for i in range(len(samples)) - ] - label_compare = [ - assert_array_equal(samples[i]["label"], expected[i]["label"]) is None for i in range(len(samples)) - ] - self.assertTrue(all(image_compare) and all(label_compare)) + for i in range(len(samples)): + self.assertTupleEqual(samples[i]["label"].shape, expected[i]["label"].shape) + self.assertTupleEqual(samples[i]["image"].shape, expected[i]["image"].shape) + self.assertIsNone(assert_array_equal(samples[i]["label"], expected[i]["label"])) + self.assertIsNone(assert_array_equal(samples[i]["image"], expected[i]["image"])) def camelyon_data_download(self, file_url): filename = os.path.basename(file_url) From 79c35b5fdb5b6006b71cdb0aa9ac9a5d499fbb77 Mon Sep 17 00:00:00 2001 From: Behrooz <3968947+behxyz@users.noreply.github.com> Date: Wed, 24 Mar 2021 01:49:00 +0000 Subject: [PATCH 10/22] Update location from center to corner Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com> --- monai/apps/pathology/datasets.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/monai/apps/pathology/datasets.py b/monai/apps/pathology/datasets.py index 96a3638b7a..d5f8d1dac2 100644 --- a/monai/apps/pathology/datasets.py +++ b/monai/apps/pathology/datasets.py @@ -100,9 +100,10 @@ def __getitem__(self, index): img_obj = self.image_reader.read(sample["image"]) else: img_obj = self.wsi_object_dict[sample["image"]] + location = [sample["location"][i] - self.region_size[i] // 2 for i in range(len(self.region_size))] images, _ = self.image_reader.get_data( img=img_obj, - location=sample["location"], + location=location, size=self.region_size, grid_shape=self.grid_shape, patch_size=self.patch_size, From c8542bd1887c945b1f160b59645d48358f088109 Mon Sep 17 00:00:00 2001 From: Behrooz <3968947+behxyz@users.noreply.github.com> Date: Wed, 24 Mar 2021 02:32:06 +0000 Subject: [PATCH 11/22] Update unittests locations from center to corner Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com> --- tests/test_patch_wsi_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_patch_wsi_dataset.py b/tests/test_patch_wsi_dataset.py index b36ffb8b87..a1b1d2d2c0 100644 --- a/tests/test_patch_wsi_dataset.py +++ b/tests/test_patch_wsi_dataset.py @@ -33,7 +33,7 @@ TEST_CASE_1 = [ FILE_URL, { - "data": [{"image": "./CMU-1.tiff", "location": [10000, 20000], "label": [0, 0, 0, 1]}], + "data": [{"image": "./CMU-1.tiff", "location": [10004, 20004], "label": [0, 0, 0, 1]}], "region_size": (8, 8), "grid_shape": (2, 2), "patch_size": 1, From 854ceb58509c56e63f80dd2b94f5a045078e8bea Mon Sep 17 00:00:00 2001 From: Behrooz <3968947+behxyz@users.noreply.github.com> Date: Wed, 24 Mar 2021 02:36:50 +0000 Subject: [PATCH 12/22] Update docs for pathology datasets Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com> --- docs/source/apps.rst | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/docs/source/apps.rst b/docs/source/apps.rst index 1c4f4c3dfb..3396dc2a82 100644 --- a/docs/source/apps.rst +++ b/docs/source/apps.rst @@ -62,3 +62,13 @@ Applications :members: .. autoclass:: Fetch2DSliced :members: + + +`Pathology` +---------- + +.. automodule:: monai.apps.pathology.datasets +.. autoclass:: PatchWSIDataset + :members: +.. autoclass:: SmartCachePatchWSIDataset + :members: From c9ffa443e0979fa816450b3a163cd3d9c08821c5 Mon Sep 17 00:00:00 2001 From: Behrooz <3968947+behxyz@users.noreply.github.com> Date: Wed, 24 Mar 2021 04:32:06 +0000 Subject: [PATCH 13/22] Update type hint and doc Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com> --- docs/source/apps.rst | 3 +-- monai/apps/pathology/datasets.py | 15 +++++++-------- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/docs/source/apps.rst b/docs/source/apps.rst index 3396dc2a82..4c45a5fb39 100644 --- a/docs/source/apps.rst +++ b/docs/source/apps.rst @@ -63,9 +63,8 @@ Applications .. autoclass:: Fetch2DSliced :members: - `Pathology` ----------- +----------- .. automodule:: monai.apps.pathology.datasets .. autoclass:: PatchWSIDataset diff --git a/monai/apps/pathology/datasets.py b/monai/apps/pathology/datasets.py index d5f8d1dac2..fd86acdd5a 100644 --- a/monai/apps/pathology/datasets.py +++ b/monai/apps/pathology/datasets.py @@ -35,6 +35,7 @@ class PatchWSIDataset(Dataset): transform: transforms to be executed on input data. Note: + The input data has the following form as an example: [{"image": "path/to/image1.tiff", "location": [200, 500], "label": [0,0,0,1]}]. @@ -60,14 +61,14 @@ def __init__( grid_shape: Union[int, Tuple[int, int]], patch_size: int, image_reader_name: str = "cuCIM", - transform: Union[Sequence[Callable], Callable] = None, + transform: Optional[Callable] = None, ): - if type(region_size) == int: + if isinstance(region_size, int): self.region_size = (region_size, region_size) else: - self.region_size = region_size + self.region_size = region_size - if type(grid_shape) == int: + if isinstance(grid_shape, int): self.grid_shape = (grid_shape, grid_shape) else: self.grid_shape = grid_shape @@ -116,9 +117,7 @@ def __getitem__(self, index): class SmartCachePatchWSIDataset(SmartCacheDataset): - """ - Add SmartCache functionality to PatchWSIDataset - """ + """Add SmartCache functionality to PatchWSIDataset.""" def __init__( self, @@ -126,8 +125,8 @@ def __init__( region_size: Union[int, Tuple[int, int]], grid_shape: Union[int, Tuple[int, int]], patch_size: int, + transform: Union[Sequence[Callable], Callable], image_reader_name: str = "cuCIM", - transform: Union[Sequence[Callable], Callable] = None, replace_rate: float = 0.5, cache_num: int = sys.maxsize, cache_rate: float = 1.0, From 96b6df5510514e83d5b39ef5e79d4cb9ea5c8f3c Mon Sep 17 00:00:00 2001 From: Behrooz <3968947+behxyz@users.noreply.github.com> Date: Wed, 24 Mar 2021 17:43:41 +0000 Subject: [PATCH 14/22] Update docstrings Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com> --- monai/apps/pathology/datasets.py | 62 ++++++++++++++++++++------------ 1 file changed, 39 insertions(+), 23 deletions(-) diff --git a/monai/apps/pathology/datasets.py b/monai/apps/pathology/datasets.py index fd86acdd5a..2bf29e3325 100644 --- a/monai/apps/pathology/datasets.py +++ b/monai/apps/pathology/datasets.py @@ -26,31 +26,23 @@ class PatchWSIDataset(Dataset): It also reads labels for each patch and privide each patch with its associated class labels. Args: - data: the list of input samples including image, location, and label (see below for more details) - region_size: the region to be extracted from the whole slide image - grid_shape: the grid shape on which the patches should be extracted - patch_size: the patches extracted from the region on the grid - image_reader_name: the name of library to be used for loading whole slide imaging, - either CuCIM or OpenSlide (the default is CuCIM) + data: the list of input samples including image, location, and label (see below for more details). + region_size: the region to be extracted from the whole slide image. + grid_shape: the grid shape on which the patches should be extracted. + patch_size: the patches extracted from the region on the grid. + image_reader_name: the name of library to be used for loading whole slide imaging, either CuCIM or OpenSlide. + Defaults to CuCIM. transform: transforms to be executed on input data. Note: - The input data has the following form as an example: - [{"image": "path/to/image1.tiff", "location": [200, 500], "label": [0,0,0,1]}]. - - This means from "image1.tiff" extract a region at "location" with the side of "region_size", and - then extract patches with the size of "patch_size" from a square grid with the shape of "grid_shape". - Be aware the the "grid_shape" should construct a grid with the same number of element as "labels", so - for this example the "grid_size" should be (2, 2). + `[{"image": "path/to/image1.tiff", "location": [200, 500], "label": [0,0,0,1]}]`. - The output will look like the following: - [ - {"image": np.array([...], dtype=np.uint8), "label": 0}, - {"image": np.array([...], dtype=np.uint8), "label": 0}, - {"image": np.array([...], dtype=np.uint8), "label": 0}, - {"image": np.array([...], dtype=np.uint8), "label": 1}, - ] + This means from "image1.tiff" extract a region centered at the given location `location` + with the size of `region_size`, and then extract patches with the size of `patch_size` + from a square grid with the shape of `grid_shape`. + Be aware the the `grid_shape` should construct a grid with the same number of element as `labels`, + so for this example the `grid_shape` should be (2, 2). """ @@ -66,7 +58,7 @@ def __init__( if isinstance(region_size, int): self.region_size = (region_size, region_size) else: - self.region_size = region_size + self.region_size = region_size if isinstance(grid_shape, int): self.grid_shape = (grid_shape, grid_shape) @@ -88,6 +80,8 @@ def __init__( self._fetch_wsi_objects() def _fetch_wsi_objects(self): + """Load all the image objects and reuse them when asked for an item. + """ self.wsi_object_dict = {} for image_path in self.image_path_list: self.wsi_object_dict[image_path] = self.image_reader.read(image_path) @@ -117,7 +111,29 @@ def __getitem__(self, index): class SmartCachePatchWSIDataset(SmartCacheDataset): - """Add SmartCache functionality to PatchWSIDataset.""" + """Add SmartCache functionality to `PatchWSIDataset`. + + Args: + data: the list of input samples including image, location, and label (see `PatchWSIDataset` for more details) + region_size: the region to be extracted from the whole slide image. + grid_shape: the grid shape on which the patches should be extracted. + patch_size: the patches extracted from the region on the grid. + image_reader_name: the name of library to be used for loading whole slide imaging, either CuCIM or OpenSlide. + Defaults to CuCIM. + transform: transforms to be executed on input data. + replace_rate: percentage of the cached items to be replaced in every epoch. + cache_num: number of items to be cached. Default is `sys.maxsize`. + will take the minimum of (cache_num, data_length x cache_rate, data_length). + cache_rate: percentage of cached data in total, default is 1.0 (cache all). + will take the minimum of (cache_num, data_length x cache_rate, data_length). + num_init_workers: the number of worker threads to initialize the cache for first epoch. + If num_init_workers is None then the number returned by os.cpu_count() is used. + num_replace_workers: the number of worker threads to prepare the replacement cache for every epoch. + If num_replace_workers is None then the number returned by os.cpu_count() is used. + progress: whether to display a progress bar when caching for the first epoch. + + + """ def __init__( self, @@ -135,7 +151,7 @@ def __init__( ): extractor = PatchWSIDataset(data, region_size, grid_shape, patch_size, image_reader_name) super().__init__( - data=extractor, + data=extractor, # type: ignore transform=transform, replace_rate=replace_rate, cache_num=cache_num, From 2d56c98384770b50a428961fde8b34f631ff57a9 Mon Sep 17 00:00:00 2001 From: Behrooz <3968947+behxyz@users.noreply.github.com> Date: Wed, 24 Mar 2021 21:23:35 +0000 Subject: [PATCH 15/22] Format docstring Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com> --- monai/apps/pathology/datasets.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/monai/apps/pathology/datasets.py b/monai/apps/pathology/datasets.py index 2bf29e3325..b1605a4fb5 100644 --- a/monai/apps/pathology/datasets.py +++ b/monai/apps/pathology/datasets.py @@ -80,8 +80,7 @@ def __init__( self._fetch_wsi_objects() def _fetch_wsi_objects(self): - """Load all the image objects and reuse them when asked for an item. - """ + """Load all the image objects and reuse them when asked for an item.""" self.wsi_object_dict = {} for image_path in self.image_path_list: self.wsi_object_dict[image_path] = self.image_reader.read(image_path) From de3b67bd3ce02f9e863c456ab352e986bb77d047 Mon Sep 17 00:00:00 2001 From: Behrooz <3968947+behxyz@users.noreply.github.com> Date: Wed, 24 Mar 2021 22:34:25 +0000 Subject: [PATCH 16/22] Update length of the smartcache dataset Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com> --- monai/apps/pathology/datasets.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/monai/apps/pathology/datasets.py b/monai/apps/pathology/datasets.py index b1605a4fb5..dc79be9484 100644 --- a/monai/apps/pathology/datasets.py +++ b/monai/apps/pathology/datasets.py @@ -131,7 +131,6 @@ class SmartCachePatchWSIDataset(SmartCacheDataset): If num_replace_workers is None then the number returned by os.cpu_count() is used. progress: whether to display a progress bar when caching for the first epoch. - """ def __init__( @@ -148,9 +147,10 @@ def __init__( num_init_workers: Optional[int] = None, num_replace_workers: Optional[int] = None, ): - extractor = PatchWSIDataset(data, region_size, grid_shape, patch_size, image_reader_name) + patch_wsi_dataset = PatchWSIDataset(data, region_size, grid_shape, patch_size, image_reader_name) + self.len_dataset = len(patch_wsi_dataset) super().__init__( - data=extractor, # type: ignore + data=patch_wsi_dataset, # type: ignore transform=transform, replace_rate=replace_rate, cache_num=cache_num, @@ -158,3 +158,6 @@ def __init__( num_init_workers=num_init_workers, num_replace_workers=num_replace_workers, ) + + def __len__(self): + return self.len_dataset From ef8daf92f391ce3e72a3df83ba781e849b73434e Mon Sep 17 00:00:00 2001 From: Behrooz <3968947+behxyz@users.noreply.github.com> Date: Wed, 24 Mar 2021 22:35:02 +0000 Subject: [PATCH 17/22] Add unittest for SmartCachePatchWSIDataset Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com> --- tests/test_smartcache_patch_wsi_dataset.py | 93 ++++++++++++++++++++++ 1 file changed, 93 insertions(+) create mode 100644 tests/test_smartcache_patch_wsi_dataset.py diff --git a/tests/test_smartcache_patch_wsi_dataset.py b/tests/test_smartcache_patch_wsi_dataset.py new file mode 100644 index 0000000000..3e370e4c2b --- /dev/null +++ b/tests/test_smartcache_patch_wsi_dataset.py @@ -0,0 +1,93 @@ +import os +import unittest +from unittest import skipUnless +from urllib import request + +import numpy as np +from numpy.testing import assert_array_equal +from parameterized import parameterized + +from monai.apps.pathology.datasets import SmartCachePatchWSIDataset +from monai.utils import optional_import + +_, has_cim = optional_import("cucim") + +FILE_URL = "http://openslide.cs.cmu.edu/download/openslide-testdata/Generic-TIFF/CMU-1.tiff" + +TEST_CASE_0 = [ + FILE_URL, + { + "data": [ + {"image": "./CMU-1.tiff", "location": [0, 0], "label": [1]}, + {"image": "./CMU-1.tiff", "location": [0, 0], "label": [1]}, + {"image": "./CMU-1.tiff", "location": [0, 0], "label": [1]}, + {"image": "./CMU-1.tiff", "location": [0, 0], "label": [1]}, + ], + "region_size": (1, 1), + "grid_shape": (1, 1), + "patch_size": 1, + "transform": lambda x: x, + "image_reader_name": "cuCIM", + "replace_rate": 0.5, + "cache_num": 2, + "num_init_workers": 1, + "num_replace_workers": 1, + }, + [ + {"image": np.array([[[239]], [[239]], [[239]]], dtype=np.uint8), "label": np.array([[1]])}, + ], +] + +TEST_CASE_1 = [ + FILE_URL, + { + "data": [ + {"image": "./CMU-1.tiff", "location": [10004, 20004], "label": [0, 0, 0, 1]}, + {"image": "./CMU-1.tiff", "location": [10004, 20004], "label": [0, 0, 0, 1]}, + {"image": "./CMU-1.tiff", "location": [10004, 20004], "label": [0, 0, 0, 1]}, + {"image": "./CMU-1.tiff", "location": [10004, 20004], "label": [0, 0, 0, 1]}, + {"image": "./CMU-1.tiff", "location": [10004, 20004], "label": [0, 0, 0, 1]}, + {"image": "./CMU-1.tiff", "location": [10004, 20004], "label": [0, 0, 0, 1]}, + ], + "region_size": (8, 8), + "grid_shape": (2, 2), + "patch_size": 1, + "transform": lambda x: x, + "replace_rate": 0.5, + "cache_num": 2, + "num_init_workers": 1, + "num_replace_workers": 1, + }, + [ + {"image": np.array([[[247]], [[245]], [[248]]], dtype=np.uint8), "label": np.array([[0]])}, + {"image": np.array([[[245]], [[247]], [[244]]], dtype=np.uint8), "label": np.array([[0]])}, + {"image": np.array([[[246]], [[246]], [[246]]], dtype=np.uint8), "label": np.array([[0]])}, + {"image": np.array([[[246]], [[246]], [[246]]], dtype=np.uint8), "label": np.array([[1]])}, + ], +] + + +class TestSmartCachePatchWSIDataset(unittest.TestCase): + @parameterized.expand([TEST_CASE_0, TEST_CASE_1]) + @skipUnless(has_cim, "Requires CuCIM") + def test_read_patches(self, file_url, input_parameters, expected): + self.camelyon_data_download(file_url) + dataset = SmartCachePatchWSIDataset(**input_parameters) + self.assertEqual(len(dataset), len(input_parameters["data"])) + for samples in dataset: + for i in range(len(samples)): + self.assertTupleEqual(samples[i]["label"].shape, expected[i]["label"].shape) + self.assertTupleEqual(samples[i]["image"].shape, expected[i]["image"].shape) + self.assertIsNone(assert_array_equal(samples[i]["label"], expected[i]["label"])) + self.assertIsNone(assert_array_equal(samples[i]["image"], expected[i]["image"])) + + def camelyon_data_download(self, file_url): + filename = os.path.basename(file_url) + if not os.path.exists(filename): + print(f"Test image [{filename}] does not exist. Downloading...") + request.urlretrieve(file_url, filename) + return filename + + +if __name__ == "__main__": + unittest.main() From 3f130f04c0a5074205307410fd56bfe2b209cdb1 Mon Sep 17 00:00:00 2001 From: Behrooz <3968947+behxyz@users.noreply.github.com> Date: Thu, 25 Mar 2021 00:09:30 +0000 Subject: [PATCH 18/22] Minor changes and fixes Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com> --- monai/apps/pathology/__init__.py | 2 ++ monai/apps/pathology/datasets.py | 19 +++++++++---------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/monai/apps/pathology/__init__.py b/monai/apps/pathology/__init__.py index 14ae193634..2040d510d1 100644 --- a/monai/apps/pathology/__init__.py +++ b/monai/apps/pathology/__init__.py @@ -8,3 +8,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from .datasets import PatchWSIDataset, SmartCacheDataset \ No newline at end of file diff --git a/monai/apps/pathology/datasets.py b/monai/apps/pathology/datasets.py index dc79be9484..6dcf240870 100644 --- a/monai/apps/pathology/datasets.py +++ b/monai/apps/pathology/datasets.py @@ -22,8 +22,8 @@ class PatchWSIDataset(Dataset): """ - This dataset read whole slide images, extract regions, and crate patches. - It also reads labels for each patch and privide each patch with its associated class labels. + This dataset reads whole slide images, extracts regions, and creates patches. + It also reads labels for each patch and provides each patch with its associated class labels. Args: data: the list of input samples including image, location, and label (see below for more details). @@ -55,6 +55,8 @@ def __init__( image_reader_name: str = "cuCIM", transform: Optional[Callable] = None, ): + super().__init__(data, transform) + if isinstance(region_size, int): self.region_size = (region_size, region_size) else: @@ -68,15 +70,13 @@ def __init__( self.patch_size = patch_size self.sub_region_size = (self.region_size[0] / self.grid_shape[0], self.region_size[1] / self.grid_shape[1]) - self.transform = transform - self.samples = data - self.image_path_list = list({x["image"] for x in self.samples}) + self.image_path_list = list({x["image"] for x in self.data}) self.image_reader_name = image_reader_name self.image_reader = WSIReader(image_reader_name) self.wsi_object_dict = None if self.image_reader_name != "openslide": - # OpenSlide causes memeory issue if we prefetch image objects + # OpenSlide causes memory issue if we prefetch image objects self._fetch_wsi_objects() def _fetch_wsi_objects(self): @@ -85,11 +85,8 @@ def _fetch_wsi_objects(self): for image_path in self.image_path_list: self.wsi_object_dict[image_path] = self.image_reader.read(image_path) - def __len__(self): - return len(self.samples) - def __getitem__(self, index): - sample = self.samples[index] + sample = self.data[index] if self.image_reader_name == "openslide": img_obj = self.image_reader.read(sample["image"]) else: @@ -146,6 +143,7 @@ def __init__( cache_rate: float = 1.0, num_init_workers: Optional[int] = None, num_replace_workers: Optional[int] = None, + progress: bool = True, ): patch_wsi_dataset = PatchWSIDataset(data, region_size, grid_shape, patch_size, image_reader_name) self.len_dataset = len(patch_wsi_dataset) @@ -157,6 +155,7 @@ def __init__( cache_rate=cache_rate, num_init_workers=num_init_workers, num_replace_workers=num_replace_workers, + progress=progress, ) def __len__(self): From 9c4e158229e6d5317a80309be90773a93eecfd12 Mon Sep 17 00:00:00 2001 From: Behrooz <3968947+behxyz@users.noreply.github.com> Date: Wed, 24 Mar 2021 20:18:23 -0400 Subject: [PATCH 19/22] Add unnittest for OpenSlide option Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com> --- tests/test_patch_wsi_dataset.py | 68 ++++++++++++++++++++++++++++++++- 1 file changed, 66 insertions(+), 2 deletions(-) diff --git a/tests/test_patch_wsi_dataset.py b/tests/test_patch_wsi_dataset.py index a1b1d2d2c0..730519ed52 100644 --- a/tests/test_patch_wsi_dataset.py +++ b/tests/test_patch_wsi_dataset.py @@ -11,6 +11,7 @@ from monai.utils import optional_import _, has_cim = optional_import("cucim") +_, has_osl = optional_import("openslide") FILE_URL = "http://openslide.cs.cmu.edu/download/openslide-testdata/Generic-TIFF/CMU-1.tiff" @@ -37,6 +38,57 @@ "region_size": (8, 8), "grid_shape": (2, 2), "patch_size": 1, + "image_reader_name": "cuCIM", + }, + [ + {"image": np.array([[[247]], [[245]], [[248]]], dtype=np.uint8), "label": np.array([[0]])}, + {"image": np.array([[[245]], [[247]], [[244]]], dtype=np.uint8), "label": np.array([[0]])}, + {"image": np.array([[[246]], [[246]], [[246]]], dtype=np.uint8), "label": np.array([[0]])}, + {"image": np.array([[[246]], [[246]], [[246]]], dtype=np.uint8), "label": np.array([[1]])}, + ], +] + +TEST_CASE_2 = [ + FILE_URL, + { + "data": [ + {"image": "./CMU-1.tiff", "location": [0, 0], "label": [1]}, + ], + "region_size": 1, + "grid_shape": 1, + "patch_size": 1, + "image_reader_name": "cuCIM", + }, + [ + {"image": np.array([[[239]], [[239]], [[239]]], dtype=np.uint8), "label": np.array([[1]])}, + ], +] + + +TEST_CASE_OPENSLIDE_0 = [ + FILE_URL, + { + "data": [ + {"image": "./CMU-1.tiff", "location": [0, 0], "label": [1]}, + ], + "region_size": (1, 1), + "grid_shape": (1, 1), + "patch_size": 1, + "image_reader_name": "OpenSlide", + }, + [ + {"image": np.array([[[239]], [[239]], [[239]]], dtype=np.uint8), "label": np.array([[1]])}, + ], +] + +TEST_CASE_OPENSLIDE_1 = [ + FILE_URL, + { + "data": [{"image": "./CMU-1.tiff", "location": [10004, 20004], "label": [0, 0, 0, 1]}], + "region_size": (8, 8), + "grid_shape": (2, 2), + "patch_size": 1, + "image_reader_name": "OpenSlide", }, [ {"image": np.array([[[247]], [[245]], [[248]]], dtype=np.uint8), "label": np.array([[0]])}, @@ -48,9 +100,21 @@ class TestPatchWSIDataset(unittest.TestCase): - @parameterized.expand([TEST_CASE_0, TEST_CASE_1]) + @parameterized.expand([TEST_CASE_0, TEST_CASE_1, TEST_CASE_2]) @skipUnless(has_cim, "Requires CuCIM") - def test_read_patches(self, file_url, input_parameters, expected): + def test_read_patches_cucim(self, file_url, input_parameters, expected): + self.camelyon_data_download(file_url) + dataset = PatchWSIDataset(**input_parameters) + samples = dataset[0] + for i in range(len(samples)): + self.assertTupleEqual(samples[i]["label"].shape, expected[i]["label"].shape) + self.assertTupleEqual(samples[i]["image"].shape, expected[i]["image"].shape) + self.assertIsNone(assert_array_equal(samples[i]["label"], expected[i]["label"])) + self.assertIsNone(assert_array_equal(samples[i]["image"], expected[i]["image"])) + + @parameterized.expand([TEST_CASE_OPENSLIDE_0, TEST_CASE_OPENSLIDE_1]) + @skipUnless(has_osl, "Requires OpenSlide") + def test_read_patches_openslide(self, file_url, input_parameters, expected): self.camelyon_data_download(file_url) dataset = PatchWSIDataset(**input_parameters) samples = dataset[0] From 27aac0c6e961d7c905ff31d56f665743a3f83445 Mon Sep 17 00:00:00 2001 From: Behrooz <3968947+behxyz@users.noreply.github.com> Date: Wed, 24 Mar 2021 20:22:56 -0400 Subject: [PATCH 20/22] Add new line Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com> --- monai/apps/pathology/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/monai/apps/pathology/__init__.py b/monai/apps/pathology/__init__.py index 2040d510d1..bbdb812c03 100644 --- a/monai/apps/pathology/__init__.py +++ b/monai/apps/pathology/__init__.py @@ -9,4 +9,4 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .datasets import PatchWSIDataset, SmartCacheDataset \ No newline at end of file +from .datasets import PatchWSIDataset, SmartCacheDataset From 11210d406f9cedbb74902275b610e2a067bdbc51 Mon Sep 17 00:00:00 2001 From: Behrooz <3968947+behxyz@users.noreply.github.com> Date: Thu, 25 Mar 2021 00:17:01 -0400 Subject: [PATCH 21/22] Remove SmartCachePatchWSIDataset test to fix it Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com> --- monai/apps/pathology/datasets.py | 4 - tests/test_smartcache_patch_wsi_dataset.py | 93 ---------------------- 2 files changed, 97 deletions(-) delete mode 100644 tests/test_smartcache_patch_wsi_dataset.py diff --git a/monai/apps/pathology/datasets.py b/monai/apps/pathology/datasets.py index 6dcf240870..f9ce0bc62b 100644 --- a/monai/apps/pathology/datasets.py +++ b/monai/apps/pathology/datasets.py @@ -146,7 +146,6 @@ def __init__( progress: bool = True, ): patch_wsi_dataset = PatchWSIDataset(data, region_size, grid_shape, patch_size, image_reader_name) - self.len_dataset = len(patch_wsi_dataset) super().__init__( data=patch_wsi_dataset, # type: ignore transform=transform, @@ -157,6 +156,3 @@ def __init__( num_replace_workers=num_replace_workers, progress=progress, ) - - def __len__(self): - return self.len_dataset diff --git a/tests/test_smartcache_patch_wsi_dataset.py b/tests/test_smartcache_patch_wsi_dataset.py deleted file mode 100644 index 3e370e4c2b..0000000000 --- a/tests/test_smartcache_patch_wsi_dataset.py +++ /dev/null @@ -1,93 +0,0 @@ -import os -import unittest -from unittest import skipUnless -from urllib import request - -import numpy as np -from numpy.testing import assert_array_equal -from parameterized import parameterized - -from monai.apps.pathology.datasets import SmartCachePatchWSIDataset -from monai.utils import optional_import - -_, has_cim = optional_import("cucim") - -FILE_URL = "http://openslide.cs.cmu.edu/download/openslide-testdata/Generic-TIFF/CMU-1.tiff" - -TEST_CASE_0 = [ - FILE_URL, - { - "data": [ - {"image": "./CMU-1.tiff", "location": [0, 0], "label": [1]}, - {"image": "./CMU-1.tiff", "location": [0, 0], "label": [1]}, - {"image": "./CMU-1.tiff", "location": [0, 0], "label": [1]}, - {"image": "./CMU-1.tiff", "location": [0, 0], "label": [1]}, - ], - "region_size": (1, 1), - "grid_shape": (1, 1), - "patch_size": 1, - "transform": lambda x: x, - "image_reader_name": "cuCIM", - "replace_rate": 0.5, - "cache_num": 2, - "num_init_workers": 1, - "num_replace_workers": 1, - }, - [ - {"image": np.array([[[239]], [[239]], [[239]]], dtype=np.uint8), "label": np.array([[1]])}, - ], -] - -TEST_CASE_1 = [ - FILE_URL, - { - "data": [ - {"image": "./CMU-1.tiff", "location": [10004, 20004], "label": [0, 0, 0, 1]}, - {"image": "./CMU-1.tiff", "location": [10004, 20004], "label": [0, 0, 0, 1]}, - {"image": "./CMU-1.tiff", "location": [10004, 20004], "label": [0, 0, 0, 1]}, - {"image": "./CMU-1.tiff", "location": [10004, 20004], "label": [0, 0, 0, 1]}, - {"image": "./CMU-1.tiff", "location": [10004, 20004], "label": [0, 0, 0, 1]}, - {"image": "./CMU-1.tiff", "location": [10004, 20004], "label": [0, 0, 0, 1]}, - ], - "region_size": (8, 8), - "grid_shape": (2, 2), - "patch_size": 1, - "transform": lambda x: x, - "replace_rate": 0.5, - "cache_num": 2, - "num_init_workers": 1, - "num_replace_workers": 1, - }, - [ - {"image": np.array([[[247]], [[245]], [[248]]], dtype=np.uint8), "label": np.array([[0]])}, - {"image": np.array([[[245]], [[247]], [[244]]], dtype=np.uint8), "label": np.array([[0]])}, - {"image": np.array([[[246]], [[246]], [[246]]], dtype=np.uint8), "label": np.array([[0]])}, - {"image": np.array([[[246]], [[246]], [[246]]], dtype=np.uint8), "label": np.array([[1]])}, - ], -] - - -class TestSmartCachePatchWSIDataset(unittest.TestCase): - @parameterized.expand([TEST_CASE_0, TEST_CASE_1]) - @skipUnless(has_cim, "Requires CuCIM") - def test_read_patches(self, file_url, input_parameters, expected): - self.camelyon_data_download(file_url) - dataset = SmartCachePatchWSIDataset(**input_parameters) - self.assertEqual(len(dataset), len(input_parameters["data"])) - for samples in dataset: - for i in range(len(samples)): - self.assertTupleEqual(samples[i]["label"].shape, expected[i]["label"].shape) - self.assertTupleEqual(samples[i]["image"].shape, expected[i]["image"].shape) - self.assertIsNone(assert_array_equal(samples[i]["label"], expected[i]["label"])) - self.assertIsNone(assert_array_equal(samples[i]["image"], expected[i]["image"])) - - def camelyon_data_download(self, file_url): - filename = os.path.basename(file_url) - if not os.path.exists(filename): - print(f"Test image [{filename}] does not exist. Downloading...") - request.urlretrieve(file_url, filename) - return filename - - -if __name__ == "__main__": - unittest.main() From 7ec1e50c9c771880a1eae145a15e514ba5a89630 Mon Sep 17 00:00:00 2001 From: Richard Brown <33289025+rijobro@users.noreply.github.com> Date: Thu, 25 Mar 2021 08:39:26 +0000 Subject: [PATCH 22/22] move init docstring to class docstring Signed-off-by: Richard Brown <33289025+rijobro@users.noreply.github.com> --- monai/data/dataset.py | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/monai/data/dataset.py b/monai/data/dataset.py index 813008e3a8..9a4e932160 100644 --- a/monai/data/dataset.py +++ b/monai/data/dataset.py @@ -582,6 +582,21 @@ class SmartCacheDataset(Randomizable, CacheDataset): This replacement will not work if setting the `multiprocessing_context` of DataLoader to `spawn` or on windows(the default multiprocessing method is `spawn`) and setting `num_workers` greater than 0. + Args: + data: input data to load and transform to generate dataset for model. + transform: transforms to execute operations on input data. + replace_rate: percentage of the cached items to be replaced in every epoch. + cache_num: number of items to be cached. Default is `sys.maxsize`. + will take the minimum of (cache_num, data_length x cache_rate, data_length). + cache_rate: percentage of cached data in total, default is 1.0 (cache all). + will take the minimum of (cache_num, data_length x cache_rate, data_length). + num_init_workers: the number of worker threads to initialize the cache for first epoch. + If num_init_workers is None then the number returned by os.cpu_count() is used. + num_replace_workers: the number of worker threads to prepare the replacement cache for every epoch. + If num_replace_workers is None then the number returned by os.cpu_count() is used. + progress: whether to display a progress bar when caching for the first epoch. + shuffle: whether to shuffle the whole data list before preparing the cache content for first epoch. + seed: random seed if shuffle is `True`, default to `0`. """ def __init__( @@ -597,24 +612,6 @@ def __init__( shuffle: bool = True, seed: int = 0, ) -> None: - """ - Args: - data: input data to load and transform to generate dataset for model. - transform: transforms to execute operations on input data. - replace_rate: percentage of the cached items to be replaced in every epoch. - cache_num: number of items to be cached. Default is `sys.maxsize`. - will take the minimum of (cache_num, data_length x cache_rate, data_length). - cache_rate: percentage of cached data in total, default is 1.0 (cache all). - will take the minimum of (cache_num, data_length x cache_rate, data_length). - num_init_workers: the number of worker threads to initialize the cache for first epoch. - If num_init_workers is None then the number returned by os.cpu_count() is used. - num_replace_workers: the number of worker threads to prepare the replacement cache for every epoch. - If num_replace_workers is None then the number returned by os.cpu_count() is used. - progress: whether to display a progress bar when caching for the first epoch. - shuffle: whether to shuffle the whole data list before preparing the cache content for first epoch. - seed: random seed if shuffle is `True`, default to `0`. - - """ if shuffle: self.set_random_state(seed=seed) self.randomize(data)