From f929e98bb210604f59b40c937b3cbb9dbbbcdeda Mon Sep 17 00:00:00 2001
From: Behrooz <3968947+behxyz@users.noreply.github.com>
Date: Mon, 22 Mar 2021 23:51:37 +0000
Subject: [PATCH 01/22] Implement PatchWSIDataset and SmartCachePatchWSIDataset

Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com>
---
 monai/apps/pathology/datasets.py | 145 +++++++++++++++++++++++++++++++
 1 file changed, 145 insertions(+)
 create mode 100644 monai/apps/pathology/datasets.py

diff --git a/monai/apps/pathology/datasets.py b/monai/apps/pathology/datasets.py
new file mode 100644
index 0000000000..53eae60c3b
--- /dev/null
+++ b/monai/apps/pathology/datasets.py
@@ -0,0 +1,145 @@
+# Copyright 2020 - 2021 MONAI Consortium
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+from typing import Callable, Dict, List, Optional, Sequence, Union, Tuple
+
+import numpy as np
+
+from monai.data import Dataset, SmartCacheDataset
+from monai.data.image_reader import WSIReader
+
+__all__ = ["PatchWSIDataset", "SmartCachePatchWSIDataset"]
+
+
+class PatchWSIDataset(Dataset):
+    """
+    This dataset read whole slide images, extract regions, and crate patches.
+    It reads labels for each patch and privide each patch with its associated class labels.
+
+    Args:
+        data: The input image directory and the label file
+        [{"image": "path/to/image/directory", "label": "path/to/label/file.txt"}]
+        region_size: the region to be extracted from the whole slide image
+        grid_size: the grid size on which the patches should be extracted
+        patch_size: the patches extracted from the region on the grid
+        image_reader_name: (cuCIM is default)
+        transform:
+
+    """
+
+    def __init__(
+        self,
+        data: dict,
+        region_size: Union[int, Tuple[int, int]],
+        grid_size: Union[int, Tuple[int, int]],
+        patch_size: Union[int, Tuple[int, int]],
+        image_reader_name="cuCIM",
+        transform=None,
+    ):
+        if type(region_size) == int:
+            self.region_size = (region_size, region_size)
+        else:
+            self.region_size = region_size
+
+        if type(grid_size) == int:
+            self.grid_size = (grid_size, grid_size)
+        else:
+            self.grid_size = grid_size
+
+        self.patch_size = patch_size
+        self.sub_region_size = (self.region_size[0] / self.grid_size[0], self.region_size[1] / self.grid_size[1])
+
+        self.transform = transform
+        self.image_base_path = data[0]["image"]
+        self.samples = self.load_samples(data[0]["label"])
+        self.image_path_list = {x[0] for x in self.samples}
+        self.num_samples = len(self.samples)
+
+        self.image_reader_name = image_reader_name
+        self.image_reader = WSIReader(image_reader_name)
+        self.wsi_object_dict = {}
+        self._fetch_wsi_objects()
+
+    def _fetch_wsi_objects(self):
+        for image_path in self.image_path_list:
+            self.wsi_object_dict[image_path] = self.image_reader.read(image_path)
+
+    def process_label_row(self, row):
+        row = row.strip("\n").split(",")
+        # create full image path
+        image_name = row[0] + ".tif"
+        image_path = os.path.join(self.image_base_path, image_name)
+        # change center locations to upper left location
+        location = (int(row[2]) - self.region_size[1] // 2, int(row[1]) - self.region_size[0] // 2)
+        # convert labels to float32 and add empty HxW channel to label
+        labels = tuple(int(lbl) for lbl in row[3:])
+        labels = np.array(labels, dtype=np.float32)[:, np.newaxis, np.newaxis]
+        return image_path, location, labels
+
+    def load_samples(self, loc_path):
+        with open(loc_path) as label_file:
+            rows = [self.process_label_row(row) for row in label_file.readlines()]
+        return rows
+
+    def __len__(self):
+        return self.num_samples
+
+    def __getitem__(self, index):
+        image_path, location, labels = self.samples[index]
+        # OpenSlide causes issue if using the stored image objects
+        if self.image_reader_name == "openslide":
+            img_obj = self.image_reader.read(image_path)
+        else:
+            img_obj = self.wsi_object_dict[image_path]
+        images, _ = self.image_reader.get_data(
+            img=img_obj,
+            location=location,
+            size=self.region_size,
+            grid_shape=self.grid_size,
+            patch_size=self.patch_size,
+        )
+        samples = [{"image": images[i], "label": labels[i]} for i in range(labels.shape[0])]
+        if self.transform:
+            samples = self.transform(samples)
+        return samples
+
+
+class SmartCachePatchWSIDataset(SmartCacheDataset):
+    """
+    Add SmartCache functionality to PatchWSIDataset
+    """
+
+    def __init__(
+        self,
+        data,
+        region_size,
+        grid_size,
+        patch_size,
+        transform,
+        replace_rate,
+        cache_num,
+        cache_rate=1.0,
+        num_init_workers=None,
+        num_replace_workers=0,
+        image_reader_name="cuCIM",
+    ):
+        extractor = PatchWSIDataset(data, region_size, grid_size, patch_size, image_reader_name)
+        super().__init__(
+            data=extractor,
+            transform=transform,
+            replace_rate=replace_rate,
+            cache_num=cache_num,
+            cache_rate=cache_rate,
+            num_init_workers=num_init_workers,
+            num_replace_workers=num_replace_workers,
+        )

From ea0b51547a62a551850d5bd044c29ae8e994206c Mon Sep 17 00:00:00 2001
From: Behrooz <3968947+behxyz@users.noreply.github.com>
Date: Tue, 23 Mar 2021 18:12:56 +0000
Subject: [PATCH 02/22] Remove label preprocessing and adopt new type of inputs

Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com>
---
 monai/apps/pathology/datasets.py | 30 +++++++-----------------------
 1 file changed, 7 insertions(+), 23 deletions(-)

diff --git a/monai/apps/pathology/datasets.py b/monai/apps/pathology/datasets.py
index 53eae60c3b..4ed654468d 100644
--- a/monai/apps/pathology/datasets.py
+++ b/monai/apps/pathology/datasets.py
@@ -11,6 +11,7 @@
 
 import os
 import sys
+import json
 from typing import Callable, Dict, List, Optional, Sequence, Union, Tuple
 
 import numpy as np
@@ -28,7 +29,7 @@ class PatchWSIDataset(Dataset):
 
     Args:
         data: The input image directory and the label file
-        [{"image": "path/to/image/directory", "label": "path/to/label/file.txt"}]
+        [{"image": "path/to/image1", "label": [0,0,0,1,0,1,0,0,1]}, "location": [200, 500]]
         region_size: the region to be extracted from the whole slide image
         grid_size: the grid size on which the patches should be extracted
         patch_size: the patches extracted from the region on the grid
@@ -60,37 +61,20 @@ def __init__(
         self.sub_region_size = (self.region_size[0] / self.grid_size[0], self.region_size[1] / self.grid_size[1])
 
         self.transform = transform
-        self.image_base_path = data[0]["image"]
-        self.samples = self.load_samples(data[0]["label"])
-        self.image_path_list = {x[0] for x in self.samples}
+        self.samples = data
         self.num_samples = len(self.samples)
+        self.image_path_list = list({x['image'] for x in self.samples})
 
         self.image_reader_name = image_reader_name
         self.image_reader = WSIReader(image_reader_name)
-        self.wsi_object_dict = {}
+        self.wsi_object_dict = None
         self._fetch_wsi_objects()
 
     def _fetch_wsi_objects(self):
+        self.wsi_object_dict = {}
         for image_path in self.image_path_list:
             self.wsi_object_dict[image_path] = self.image_reader.read(image_path)
 
-    def process_label_row(self, row):
-        row = row.strip("\n").split(",")
-        # create full image path
-        image_name = row[0] + ".tif"
-        image_path = os.path.join(self.image_base_path, image_name)
-        # change center locations to upper left location
-        location = (int(row[2]) - self.region_size[1] // 2, int(row[1]) - self.region_size[0] // 2)
-        # convert labels to float32 and add empty HxW channel to label
-        labels = tuple(int(lbl) for lbl in row[3:])
-        labels = np.array(labels, dtype=np.float32)[:, np.newaxis, np.newaxis]
-        return image_path, location, labels
-
-    def load_samples(self, loc_path):
-        with open(loc_path) as label_file:
-            rows = [self.process_label_row(row) for row in label_file.readlines()]
-        return rows
-
     def __len__(self):
         return self.num_samples
 
@@ -130,7 +114,7 @@ def __init__(
         cache_num,
         cache_rate=1.0,
         num_init_workers=None,
-        num_replace_workers=0,
+        num_replace_workers=None,
         image_reader_name="cuCIM",
     ):
         extractor = PatchWSIDataset(data, region_size, grid_size, patch_size, image_reader_name)

From 1d7ec171465c5aa2ee6d339e2ccce024ef5d65c2 Mon Sep 17 00:00:00 2001
From: Behrooz <3968947+behxyz@users.noreply.github.com>
Date: Tue, 23 Mar 2021 19:46:15 +0000
Subject: [PATCH 03/22] Update type hints

Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com>
---
 monai/apps/pathology/datasets.py | 36 ++++++++++++++------------------
 1 file changed, 16 insertions(+), 20 deletions(-)

diff --git a/monai/apps/pathology/datasets.py b/monai/apps/pathology/datasets.py
index 4ed654468d..77f9436cc9 100644
--- a/monai/apps/pathology/datasets.py
+++ b/monai/apps/pathology/datasets.py
@@ -9,12 +9,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-import os
 import sys
-import json
-from typing import Callable, Dict, List, Optional, Sequence, Union, Tuple
-
-import numpy as np
+from typing import Callable, List, Optional, Sequence, Tuple, Union
 
 from monai.data import Dataset, SmartCacheDataset
 from monai.data.image_reader import WSIReader
@@ -40,12 +36,12 @@ class PatchWSIDataset(Dataset):
 
     def __init__(
         self,
-        data: dict,
+        data: List,
         region_size: Union[int, Tuple[int, int]],
         grid_size: Union[int, Tuple[int, int]],
         patch_size: Union[int, Tuple[int, int]],
-        image_reader_name="cuCIM",
-        transform=None,
+        image_reader_name: str = "cuCIM",
+        transform: Union[Sequence[Callable], Callable] = None,
     ):
         if type(region_size) == int:
             self.region_size = (region_size, region_size)
@@ -63,7 +59,7 @@ def __init__(
         self.transform = transform
         self.samples = data
         self.num_samples = len(self.samples)
-        self.image_path_list = list({x['image'] for x in self.samples})
+        self.image_path_list = list({x["image"] for x in self.samples})
 
         self.image_reader_name = image_reader_name
         self.image_reader = WSIReader(image_reader_name)
@@ -105,17 +101,17 @@ class SmartCachePatchWSIDataset(SmartCacheDataset):
 
     def __init__(
         self,
-        data,
-        region_size,
-        grid_size,
-        patch_size,
-        transform,
-        replace_rate,
-        cache_num,
-        cache_rate=1.0,
-        num_init_workers=None,
-        num_replace_workers=None,
-        image_reader_name="cuCIM",
+        data: List,
+        region_size: Union[int, Tuple[int, int]],
+        grid_size: Union[int, Tuple[int, int]],
+        patch_size: Union[int, Tuple[int, int]],
+        image_reader_name: str = "cuCIM",
+        transform: Union[Sequence[Callable], Callable] = None,
+        replace_rate: float = 0.5,
+        cache_num: int = sys.maxsize,
+        cache_rate: float = 1.0,
+        num_init_workers: Optional[int] = None,
+        num_replace_workers: Optional[int] = None,
     ):
         extractor = PatchWSIDataset(data, region_size, grid_size, patch_size, image_reader_name)
         super().__init__(

From 9065fe05ae06a0df8de92c4a2e35ffbe88e6bf49 Mon Sep 17 00:00:00 2001
From: Behrooz <3968947+behxyz@users.noreply.github.com>
Date: Tue, 23 Mar 2021 19:53:10 +0000
Subject: [PATCH 04/22] Add init file

Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com>
---
 monai/apps/pathology/__init__.py | 10 ++++++++++
 1 file changed, 10 insertions(+)
 create mode 100644 monai/apps/pathology/__init__.py

diff --git a/monai/apps/pathology/__init__.py b/monai/apps/pathology/__init__.py
new file mode 100644
index 0000000000..14ae193634
--- /dev/null
+++ b/monai/apps/pathology/__init__.py
@@ -0,0 +1,10 @@
+# Copyright 2020 - 2021 MONAI Consortium
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#     http://www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.

From ce4150df1bdccb1aab61197ce34a613554eabc13 Mon Sep 17 00:00:00 2001
From: Behrooz <3968947+behxyz@users.noreply.github.com>
Date: Tue, 23 Mar 2021 23:12:57 +0000
Subject: [PATCH 05/22] Change grid_size to grid_shape

Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com>
---
 monai/apps/pathology/datasets.py | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/monai/apps/pathology/datasets.py b/monai/apps/pathology/datasets.py
index 77f9436cc9..d2df1cf28c 100644
--- a/monai/apps/pathology/datasets.py
+++ b/monai/apps/pathology/datasets.py
@@ -27,7 +27,7 @@ class PatchWSIDataset(Dataset):
         data: The input image directory and the label file
         [{"image": "path/to/image1", "label": [0,0,0,1,0,1,0,0,1]}, "location": [200, 500]]
         region_size: the region to be extracted from the whole slide image
-        grid_size: the grid size on which the patches should be extracted
+        grid_shape: the grid shape on which the patches should be extracted
         patch_size: the patches extracted from the region on the grid
         image_reader_name: (cuCIM is default)
         transform:
@@ -38,7 +38,7 @@ def __init__(
         self,
         data: List,
         region_size: Union[int, Tuple[int, int]],
-        grid_size: Union[int, Tuple[int, int]],
+        grid_shape: Union[int, Tuple[int, int]],
         patch_size: Union[int, Tuple[int, int]],
         image_reader_name: str = "cuCIM",
         transform: Union[Sequence[Callable], Callable] = None,
@@ -48,13 +48,13 @@ def __init__(
         else:
             self.region_size = region_size
 
-        if type(grid_size) == int:
-            self.grid_size = (grid_size, grid_size)
+        if type(grid_shape) == int:
+            self.grid_shape = (grid_shape, grid_shape)
         else:
-            self.grid_size = grid_size
+            self.grid_shape = grid_shape
 
         self.patch_size = patch_size
-        self.sub_region_size = (self.region_size[0] / self.grid_size[0], self.region_size[1] / self.grid_size[1])
+        self.sub_region_size = (self.region_size[0] / self.grid_shape[0], self.region_size[1] / self.grid_shape[1])
 
         self.transform = transform
         self.samples = data
@@ -75,20 +75,20 @@ def __len__(self):
         return self.num_samples
 
     def __getitem__(self, index):
-        image_path, location, labels = self.samples[index]
+        data = self.samples[index]
         # OpenSlide causes issue if using the stored image objects
         if self.image_reader_name == "openslide":
-            img_obj = self.image_reader.read(image_path)
+            img_obj = self.image_reader.read(data["image"])
         else:
-            img_obj = self.wsi_object_dict[image_path]
+            img_obj = self.wsi_object_dict[data["image"]]
         images, _ = self.image_reader.get_data(
             img=img_obj,
-            location=location,
+            location=data["location"],
             size=self.region_size,
-            grid_shape=self.grid_size,
+            grid_shape=self.grid_shape,
             patch_size=self.patch_size,
         )
-        samples = [{"image": images[i], "label": labels[i]} for i in range(labels.shape[0])]
+        samples = [{"image": images[i], "label": data["label"][i]} for i in range(len(data["label"]))]
         if self.transform:
             samples = self.transform(samples)
         return samples
@@ -103,7 +103,7 @@ def __init__(
         self,
         data: List,
         region_size: Union[int, Tuple[int, int]],
-        grid_size: Union[int, Tuple[int, int]],
+        grid_shape: Union[int, Tuple[int, int]],
         patch_size: Union[int, Tuple[int, int]],
         image_reader_name: str = "cuCIM",
         transform: Union[Sequence[Callable], Callable] = None,
@@ -113,7 +113,7 @@ def __init__(
         num_init_workers: Optional[int] = None,
         num_replace_workers: Optional[int] = None,
     ):
-        extractor = PatchWSIDataset(data, region_size, grid_size, patch_size, image_reader_name)
+        extractor = PatchWSIDataset(data, region_size, grid_shape, patch_size, image_reader_name)
         super().__init__(
             data=extractor,
             transform=transform,

From f2c83069452a78a1778e6f684a633ff46bc6a2be Mon Sep 17 00:00:00 2001
From: Behrooz <3968947+behxyz@users.noreply.github.com>
Date: Tue, 23 Mar 2021 23:13:44 +0000
Subject: [PATCH 06/22] Add a unittest for PatchWSIDataset

Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com>
---
 tests/test_patch_wsi_dataset.py | 55 +++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)
 create mode 100644 tests/test_patch_wsi_dataset.py

diff --git a/tests/test_patch_wsi_dataset.py b/tests/test_patch_wsi_dataset.py
new file mode 100644
index 0000000000..fc0e879f1d
--- /dev/null
+++ b/tests/test_patch_wsi_dataset.py
@@ -0,0 +1,55 @@
+import os
+import unittest
+from unittest import skipUnless
+from urllib import request
+
+import numpy as np
+from numpy.testing import assert_array_equal
+from parameterized import parameterized
+
+from monai.apps.pathology.datasets import PatchWSIDataset
+
+FILE_URL = "http://openslide.cs.cmu.edu/download/openslide-testdata/Generic-TIFF/CMU-1.tiff"
+
+TEST_CASE_1 = [
+    FILE_URL,
+    {
+        "data": [{"image": "./CMU-1.tiff", "location": [10000, 20000], "label": [0, 0, 0, 1]}],
+        "region_size": (8, 8),
+        "grid_shape": (2, 2),
+        "patch_size": 1,
+        "image_reader_name": "cuCIM",
+    },
+    [
+        {"image": np.array([[[247]], [[245]], [[248]]], dtype=np.uint8), "label": 0},
+        {"image": np.array([[[245]], [[247]], [[244]]], dtype=np.uint8), "label": 0},
+        {"image": np.array([[[246]], [[246]], [[246]]], dtype=np.uint8), "label": 0},
+        {"image": np.array([[[246]], [[246]], [[246]]], dtype=np.uint8), "label": 1},
+    ],
+]
+
+
+class TestCuCIMReader(unittest.TestCase):
+    @parameterized.expand([TEST_CASE_1])
+    def test_read_patches(self, file_url, input_parameters, expected):
+        self.camelyon_data_download(file_url)
+        dataset = PatchWSIDataset(**input_parameters)
+        samples = dataset[0]
+        image_compare = [
+            assert_array_equal(samples[i]["image"], expected[i]["image"]) is None for i in range(len(samples))
+        ]
+        label_compare = [
+            assert_array_equal(samples[i]["label"], expected[i]["label"]) is None for i in range(len(samples))
+        ]
+        self.assertTrue(all(image_compare) and all(label_compare))
+
+    def camelyon_data_download(self, file_url):
+        filename = os.path.basename(file_url)
+        if not os.path.exists(filename):
+            print(f"Test image [{filename}] does not exist. Downloading...")
+            request.urlretrieve(file_url, filename)
+        return filename
+
+
+if __name__ == "__main__":
+    unittest.main()

From 3506740266ce6acebbf86aedbce7745b02a0228a Mon Sep 17 00:00:00 2001
From: Behrooz <3968947+behxyz@users.noreply.github.com>
Date: Tue, 23 Mar 2021 23:46:39 +0000
Subject: [PATCH 07/22] Add more unittests

Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com>
---
 tests/test_patch_wsi_dataset.py | 27 ++++++++++++++++++++++++---
 1 file changed, 24 insertions(+), 3 deletions(-)

diff --git a/tests/test_patch_wsi_dataset.py b/tests/test_patch_wsi_dataset.py
index fc0e879f1d..95a2eea7a0 100644
--- a/tests/test_patch_wsi_dataset.py
+++ b/tests/test_patch_wsi_dataset.py
@@ -8,9 +8,30 @@
 from parameterized import parameterized
 
 from monai.apps.pathology.datasets import PatchWSIDataset
+from monai.utils import optional_import
+
+_, has_cim = optional_import("cucim")
 
 FILE_URL = "http://openslide.cs.cmu.edu/download/openslide-testdata/Generic-TIFF/CMU-1.tiff"
 
+TEST_CASE_0 = [
+    FILE_URL,
+    {
+        "data": [
+            {"image": "./CMU-1.tiff", "location": [10000, 20000], "label": [1]},
+            {"image": "./CMU-1.tiff", "location": [0, 0], "label": [0]},
+        ],
+        "region_size": (1, 1),
+        "grid_shape": (1, 1),
+        "patch_size": 1,
+        "image_reader_name": "cuCIM",
+    },
+    [
+        {"image": np.array([[[246]], [[245]], [[250]]], dtype=np.uint8), "label": 1},
+        {"image": np.array([[[239]], [[239]], [[239]]], dtype=np.uint8), "label": 0},
+    ],
+]
+
 TEST_CASE_1 = [
     FILE_URL,
     {
@@ -18,7 +39,6 @@
         "region_size": (8, 8),
         "grid_shape": (2, 2),
         "patch_size": 1,
-        "image_reader_name": "cuCIM",
     },
     [
         {"image": np.array([[[247]], [[245]], [[248]]], dtype=np.uint8), "label": 0},
@@ -29,8 +49,9 @@
 ]
 
 
-class TestCuCIMReader(unittest.TestCase):
-    @parameterized.expand([TEST_CASE_1])
+class TestPatchWSIDataset(unittest.TestCase):
+    @parameterized.expand([TEST_CASE_0, TEST_CASE_1])
+    @skipUnless(has_cim, "Requires CuCIM")
     def test_read_patches(self, file_url, input_parameters, expected):
         self.camelyon_data_download(file_url)
         dataset = PatchWSIDataset(**input_parameters)

From d81ef163e2a19c7e7e16af44e3fa689c48c7e40b Mon Sep 17 00:00:00 2001
From: Behrooz <3968947+behxyz@users.noreply.github.com>
Date: Wed, 24 Mar 2021 01:21:59 +0000
Subject: [PATCH 08/22] Update docstrings and make minor changes

Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com>
---
 monai/apps/pathology/datasets.py | 56 ++++++++++++++++++++++----------
 1 file changed, 38 insertions(+), 18 deletions(-)

diff --git a/monai/apps/pathology/datasets.py b/monai/apps/pathology/datasets.py
index d2df1cf28c..96a3638b7a 100644
--- a/monai/apps/pathology/datasets.py
+++ b/monai/apps/pathology/datasets.py
@@ -12,6 +12,8 @@
 import sys
 from typing import Callable, List, Optional, Sequence, Tuple, Union
 
+import numpy as np
+
 from monai.data import Dataset, SmartCacheDataset
 from monai.data.image_reader import WSIReader
 
@@ -21,16 +23,33 @@
 class PatchWSIDataset(Dataset):
     """
     This dataset read whole slide images, extract regions, and crate patches.
-    It reads labels for each patch and privide each patch with its associated class labels.
+    It also reads labels for each patch and privide each patch with its associated class labels.
 
     Args:
-        data: The input image directory and the label file
-        [{"image": "path/to/image1", "label": [0,0,0,1,0,1,0,0,1]}, "location": [200, 500]]
+        data: the list of input samples including image, location, and label (see below for more details)
         region_size: the region to be extracted from the whole slide image
         grid_shape: the grid shape on which the patches should be extracted
         patch_size: the patches extracted from the region on the grid
-        image_reader_name: (cuCIM is default)
-        transform:
+        image_reader_name: the name of library to be used for loading whole slide imaging,
+            either CuCIM or OpenSlide (the default is CuCIM)
+        transform: transforms to be executed on input data.
+
+    Note:
+        The input data has the following form as an example:
+        [{"image": "path/to/image1.tiff", "location": [200, 500], "label": [0,0,0,1]}].
+
+        This means from "image1.tiff" extract a region at "location" with the side of "region_size", and
+        then extract patches with the size of "patch_size" from a square grid with the shape of "grid_shape".
+        Be aware the the "grid_shape" should construct a grid with the same number of element as "labels", so
+        for this example the "grid_size" should be (2, 2).
+
+        The output will look like the following:
+        [
+            {"image": np.array([...], dtype=np.uint8), "label": 0},
+            {"image": np.array([...], dtype=np.uint8), "label": 0},
+            {"image": np.array([...], dtype=np.uint8), "label": 0},
+            {"image": np.array([...], dtype=np.uint8), "label": 1},
+        ]
 
     """
 
@@ -39,7 +58,7 @@ def __init__(
         data: List,
         region_size: Union[int, Tuple[int, int]],
         grid_shape: Union[int, Tuple[int, int]],
-        patch_size: Union[int, Tuple[int, int]],
+        patch_size: int,
         image_reader_name: str = "cuCIM",
         transform: Union[Sequence[Callable], Callable] = None,
     ):
@@ -58,13 +77,14 @@ def __init__(
 
         self.transform = transform
         self.samples = data
-        self.num_samples = len(self.samples)
         self.image_path_list = list({x["image"] for x in self.samples})
 
         self.image_reader_name = image_reader_name
         self.image_reader = WSIReader(image_reader_name)
         self.wsi_object_dict = None
-        self._fetch_wsi_objects()
+        if self.image_reader_name != "openslide":
+            # OpenSlide causes memeory issue if we prefetch image objects
+            self._fetch_wsi_objects()
 
     def _fetch_wsi_objects(self):
         self.wsi_object_dict = {}
@@ -72,26 +92,26 @@ def _fetch_wsi_objects(self):
             self.wsi_object_dict[image_path] = self.image_reader.read(image_path)
 
     def __len__(self):
-        return self.num_samples
+        return len(self.samples)
 
     def __getitem__(self, index):
-        data = self.samples[index]
-        # OpenSlide causes issue if using the stored image objects
+        sample = self.samples[index]
         if self.image_reader_name == "openslide":
-            img_obj = self.image_reader.read(data["image"])
+            img_obj = self.image_reader.read(sample["image"])
         else:
-            img_obj = self.wsi_object_dict[data["image"]]
+            img_obj = self.wsi_object_dict[sample["image"]]
         images, _ = self.image_reader.get_data(
             img=img_obj,
-            location=data["location"],
+            location=sample["location"],
             size=self.region_size,
             grid_shape=self.grid_shape,
             patch_size=self.patch_size,
         )
-        samples = [{"image": images[i], "label": data["label"][i]} for i in range(len(data["label"]))]
+        labels = np.array(sample["label"], dtype=np.float32)[:, np.newaxis, np.newaxis]
+        patches = [{"image": images[i], "label": labels[i]} for i in range(len(sample["label"]))]
         if self.transform:
-            samples = self.transform(samples)
-        return samples
+            patches = self.transform(patches)
+        return patches
 
 
 class SmartCachePatchWSIDataset(SmartCacheDataset):
@@ -104,7 +124,7 @@ def __init__(
         data: List,
         region_size: Union[int, Tuple[int, int]],
         grid_shape: Union[int, Tuple[int, int]],
-        patch_size: Union[int, Tuple[int, int]],
+        patch_size: int,
         image_reader_name: str = "cuCIM",
         transform: Union[Sequence[Callable], Callable] = None,
         replace_rate: float = 0.5,

From 472c4212de2f0eb72f7df1adbfb7658e75be3f97 Mon Sep 17 00:00:00 2001
From: Behrooz <3968947+behxyz@users.noreply.github.com>
Date: Wed, 24 Mar 2021 01:22:35 +0000
Subject: [PATCH 09/22] Convert labels to numpy to match the change in dataset

Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com>
---
 tests/test_patch_wsi_dataset.py | 26 +++++++++++---------------
 1 file changed, 11 insertions(+), 15 deletions(-)

diff --git a/tests/test_patch_wsi_dataset.py b/tests/test_patch_wsi_dataset.py
index 95a2eea7a0..b36ffb8b87 100644
--- a/tests/test_patch_wsi_dataset.py
+++ b/tests/test_patch_wsi_dataset.py
@@ -18,8 +18,7 @@
     FILE_URL,
     {
         "data": [
-            {"image": "./CMU-1.tiff", "location": [10000, 20000], "label": [1]},
-            {"image": "./CMU-1.tiff", "location": [0, 0], "label": [0]},
+            {"image": "./CMU-1.tiff", "location": [0, 0], "label": [1]},
         ],
         "region_size": (1, 1),
         "grid_shape": (1, 1),
@@ -27,8 +26,7 @@
         "image_reader_name": "cuCIM",
     },
     [
-        {"image": np.array([[[246]], [[245]], [[250]]], dtype=np.uint8), "label": 1},
-        {"image": np.array([[[239]], [[239]], [[239]]], dtype=np.uint8), "label": 0},
+        {"image": np.array([[[239]], [[239]], [[239]]], dtype=np.uint8), "label": np.array([[1]])},
     ],
 ]
 
@@ -41,10 +39,10 @@
         "patch_size": 1,
     },
     [
-        {"image": np.array([[[247]], [[245]], [[248]]], dtype=np.uint8), "label": 0},
-        {"image": np.array([[[245]], [[247]], [[244]]], dtype=np.uint8), "label": 0},
-        {"image": np.array([[[246]], [[246]], [[246]]], dtype=np.uint8), "label": 0},
-        {"image": np.array([[[246]], [[246]], [[246]]], dtype=np.uint8), "label": 1},
+        {"image": np.array([[[247]], [[245]], [[248]]], dtype=np.uint8), "label": np.array([[0]])},
+        {"image": np.array([[[245]], [[247]], [[244]]], dtype=np.uint8), "label": np.array([[0]])},
+        {"image": np.array([[[246]], [[246]], [[246]]], dtype=np.uint8), "label": np.array([[0]])},
+        {"image": np.array([[[246]], [[246]], [[246]]], dtype=np.uint8), "label": np.array([[1]])},
     ],
 ]
 
@@ -56,13 +54,11 @@ def test_read_patches(self, file_url, input_parameters, expected):
         self.camelyon_data_download(file_url)
         dataset = PatchWSIDataset(**input_parameters)
         samples = dataset[0]
-        image_compare = [
-            assert_array_equal(samples[i]["image"], expected[i]["image"]) is None for i in range(len(samples))
-        ]
-        label_compare = [
-            assert_array_equal(samples[i]["label"], expected[i]["label"]) is None for i in range(len(samples))
-        ]
-        self.assertTrue(all(image_compare) and all(label_compare))
+        for i in range(len(samples)):
+            self.assertTupleEqual(samples[i]["label"].shape, expected[i]["label"].shape)
+            self.assertTupleEqual(samples[i]["image"].shape, expected[i]["image"].shape)
+            self.assertIsNone(assert_array_equal(samples[i]["label"], expected[i]["label"]))
+            self.assertIsNone(assert_array_equal(samples[i]["image"], expected[i]["image"]))
 
     def camelyon_data_download(self, file_url):
         filename = os.path.basename(file_url)

From 79c35b5fdb5b6006b71cdb0aa9ac9a5d499fbb77 Mon Sep 17 00:00:00 2001
From: Behrooz <3968947+behxyz@users.noreply.github.com>
Date: Wed, 24 Mar 2021 01:49:00 +0000
Subject: [PATCH 10/22] Update location from center to corner

Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com>
---
 monai/apps/pathology/datasets.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/monai/apps/pathology/datasets.py b/monai/apps/pathology/datasets.py
index 96a3638b7a..d5f8d1dac2 100644
--- a/monai/apps/pathology/datasets.py
+++ b/monai/apps/pathology/datasets.py
@@ -100,9 +100,10 @@ def __getitem__(self, index):
             img_obj = self.image_reader.read(sample["image"])
         else:
             img_obj = self.wsi_object_dict[sample["image"]]
+        location = [sample["location"][i] - self.region_size[i] // 2 for i in range(len(self.region_size))]
         images, _ = self.image_reader.get_data(
             img=img_obj,
-            location=sample["location"],
+            location=location,
             size=self.region_size,
             grid_shape=self.grid_shape,
             patch_size=self.patch_size,

From c8542bd1887c945b1f160b59645d48358f088109 Mon Sep 17 00:00:00 2001
From: Behrooz <3968947+behxyz@users.noreply.github.com>
Date: Wed, 24 Mar 2021 02:32:06 +0000
Subject: [PATCH 11/22] Update unittests locations from center to corner

Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com>
---
 tests/test_patch_wsi_dataset.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_patch_wsi_dataset.py b/tests/test_patch_wsi_dataset.py
index b36ffb8b87..a1b1d2d2c0 100644
--- a/tests/test_patch_wsi_dataset.py
+++ b/tests/test_patch_wsi_dataset.py
@@ -33,7 +33,7 @@
 TEST_CASE_1 = [
     FILE_URL,
     {
-        "data": [{"image": "./CMU-1.tiff", "location": [10000, 20000], "label": [0, 0, 0, 1]}],
+        "data": [{"image": "./CMU-1.tiff", "location": [10004, 20004], "label": [0, 0, 0, 1]}],
         "region_size": (8, 8),
         "grid_shape": (2, 2),
         "patch_size": 1,

From 854ceb58509c56e63f80dd2b94f5a045078e8bea Mon Sep 17 00:00:00 2001
From: Behrooz <3968947+behxyz@users.noreply.github.com>
Date: Wed, 24 Mar 2021 02:36:50 +0000
Subject: [PATCH 12/22] Update docs for pathology datasets

Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com>
---
 docs/source/apps.rst | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/docs/source/apps.rst b/docs/source/apps.rst
index 1c4f4c3dfb..3396dc2a82 100644
--- a/docs/source/apps.rst
+++ b/docs/source/apps.rst
@@ -62,3 +62,13 @@ Applications
     :members:
 .. autoclass:: Fetch2DSliced
     :members:
+
+
+`Pathology`
+----------
+
+.. automodule:: monai.apps.pathology.datasets
+.. autoclass:: PatchWSIDataset
+    :members:
+.. autoclass:: SmartCachePatchWSIDataset
+    :members:

From c9ffa443e0979fa816450b3a163cd3d9c08821c5 Mon Sep 17 00:00:00 2001
From: Behrooz <3968947+behxyz@users.noreply.github.com>
Date: Wed, 24 Mar 2021 04:32:06 +0000
Subject: [PATCH 13/22] Update type hint and doc

Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com>
---
 docs/source/apps.rst             |  3 +--
 monai/apps/pathology/datasets.py | 15 +++++++--------
 2 files changed, 8 insertions(+), 10 deletions(-)

diff --git a/docs/source/apps.rst b/docs/source/apps.rst
index 3396dc2a82..4c45a5fb39 100644
--- a/docs/source/apps.rst
+++ b/docs/source/apps.rst
@@ -63,9 +63,8 @@ Applications
 .. autoclass:: Fetch2DSliced
     :members:
 
-
 `Pathology`
-----------
+-----------
 
 .. automodule:: monai.apps.pathology.datasets
 .. autoclass:: PatchWSIDataset
diff --git a/monai/apps/pathology/datasets.py b/monai/apps/pathology/datasets.py
index d5f8d1dac2..fd86acdd5a 100644
--- a/monai/apps/pathology/datasets.py
+++ b/monai/apps/pathology/datasets.py
@@ -35,6 +35,7 @@ class PatchWSIDataset(Dataset):
         transform: transforms to be executed on input data.
 
     Note:
+
         The input data has the following form as an example:
         [{"image": "path/to/image1.tiff", "location": [200, 500], "label": [0,0,0,1]}].
 
@@ -60,14 +61,14 @@ def __init__(
         grid_shape: Union[int, Tuple[int, int]],
         patch_size: int,
         image_reader_name: str = "cuCIM",
-        transform: Union[Sequence[Callable], Callable] = None,
+        transform: Optional[Callable] = None,
     ):
-        if type(region_size) == int:
+        if isinstance(region_size, int):
             self.region_size = (region_size, region_size)
         else:
-            self.region_size = region_size
+            self.region_size  = region_size
 
-        if type(grid_shape) == int:
+        if isinstance(grid_shape, int):
             self.grid_shape = (grid_shape, grid_shape)
         else:
             self.grid_shape = grid_shape
@@ -116,9 +117,7 @@ def __getitem__(self, index):
 
 
 class SmartCachePatchWSIDataset(SmartCacheDataset):
-    """
-    Add SmartCache functionality to PatchWSIDataset
-    """
+    """Add SmartCache functionality to PatchWSIDataset."""
 
     def __init__(
         self,
@@ -126,8 +125,8 @@ def __init__(
         region_size: Union[int, Tuple[int, int]],
         grid_shape: Union[int, Tuple[int, int]],
         patch_size: int,
+        transform: Union[Sequence[Callable], Callable],
         image_reader_name: str = "cuCIM",
-        transform: Union[Sequence[Callable], Callable] = None,
         replace_rate: float = 0.5,
         cache_num: int = sys.maxsize,
         cache_rate: float = 1.0,

From 96b6df5510514e83d5b39ef5e79d4cb9ea5c8f3c Mon Sep 17 00:00:00 2001
From: Behrooz <3968947+behxyz@users.noreply.github.com>
Date: Wed, 24 Mar 2021 17:43:41 +0000
Subject: [PATCH 14/22] Update docstrings

Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com>
---
 monai/apps/pathology/datasets.py | 62 ++++++++++++++++++++------------
 1 file changed, 39 insertions(+), 23 deletions(-)

diff --git a/monai/apps/pathology/datasets.py b/monai/apps/pathology/datasets.py
index fd86acdd5a..2bf29e3325 100644
--- a/monai/apps/pathology/datasets.py
+++ b/monai/apps/pathology/datasets.py
@@ -26,31 +26,23 @@ class PatchWSIDataset(Dataset):
     It also reads labels for each patch and privide each patch with its associated class labels.
 
     Args:
-        data: the list of input samples including image, location, and label (see below for more details)
-        region_size: the region to be extracted from the whole slide image
-        grid_shape: the grid shape on which the patches should be extracted
-        patch_size: the patches extracted from the region on the grid
-        image_reader_name: the name of library to be used for loading whole slide imaging,
-            either CuCIM or OpenSlide (the default is CuCIM)
+        data: the list of input samples including image, location, and label (see below for more details).
+        region_size: the region to be extracted from the whole slide image.
+        grid_shape: the grid shape on which the patches should be extracted.
+        patch_size: the patches extracted from the region on the grid.
+        image_reader_name: the name of library to be used for loading whole slide imaging, either CuCIM or OpenSlide.
+            Defaults to CuCIM.
         transform: transforms to be executed on input data.
 
     Note:
-
         The input data has the following form as an example:
-        [{"image": "path/to/image1.tiff", "location": [200, 500], "label": [0,0,0,1]}].
-
-        This means from "image1.tiff" extract a region at "location" with the side of "region_size", and
-        then extract patches with the size of "patch_size" from a square grid with the shape of "grid_shape".
-        Be aware the the "grid_shape" should construct a grid with the same number of element as "labels", so
-        for this example the "grid_size" should be (2, 2).
+        `[{"image": "path/to/image1.tiff", "location": [200, 500], "label": [0,0,0,1]}]`.
 
-        The output will look like the following:
-        [
-            {"image": np.array([...], dtype=np.uint8), "label": 0},
-            {"image": np.array([...], dtype=np.uint8), "label": 0},
-            {"image": np.array([...], dtype=np.uint8), "label": 0},
-            {"image": np.array([...], dtype=np.uint8), "label": 1},
-        ]
+        This means from "image1.tiff" extract a region centered at the given location `location`
+        with the size of `region_size`, and then extract patches with the size of `patch_size`
+        from a square grid with the shape of `grid_shape`.
+        Be aware that the `grid_shape` should construct a grid with the same number of elements as `labels`,
+        so for this example the `grid_shape` should be (2, 2).
 
     """
 
@@ -66,7 +58,7 @@ def __init__(
         if isinstance(region_size, int):
             self.region_size = (region_size, region_size)
         else:
-            self.region_size  = region_size
+            self.region_size = region_size
 
         if isinstance(grid_shape, int):
             self.grid_shape = (grid_shape, grid_shape)
@@ -88,6 +80,8 @@ def __init__(
             self._fetch_wsi_objects()
 
     def _fetch_wsi_objects(self):
+        """Load all the image objects and reuse them when asked for an item.
+        """
         self.wsi_object_dict = {}
         for image_path in self.image_path_list:
             self.wsi_object_dict[image_path] = self.image_reader.read(image_path)
@@ -117,7 +111,29 @@ def __getitem__(self, index):
 
 
 class SmartCachePatchWSIDataset(SmartCacheDataset):
-    """Add SmartCache functionality to PatchWSIDataset."""
+    """Add SmartCache functionality to `PatchWSIDataset`.
+
+    Args:
+        data: the list of input samples including image, location, and label (see `PatchWSIDataset` for more details)
+        region_size: the region to be extracted from the whole slide image.
+        grid_shape: the grid shape on which the patches should be extracted.
+        patch_size: the patches extracted from the region on the grid.
+        image_reader_name: the name of library to be used for loading whole slide imaging, either CuCIM or OpenSlide.
+            Defaults to CuCIM.
+        transform: transforms to be executed on input data.
+        replace_rate: percentage of the cached items to be replaced in every epoch.
+        cache_num: number of items to be cached. Default is `sys.maxsize`.
+            will take the minimum of (cache_num, data_length x cache_rate, data_length).
+        cache_rate: percentage of cached data in total, default is 1.0 (cache all).
+            will take the minimum of (cache_num, data_length x cache_rate, data_length).
+        num_init_workers: the number of worker threads to initialize the cache for first epoch.
+            If num_init_workers is None then the number returned by os.cpu_count() is used.
+        num_replace_workers: the number of worker threads to prepare the replacement cache for every epoch.
+            If num_replace_workers is None then the number returned by os.cpu_count() is used.
+        progress: whether to display a progress bar when caching for the first epoch.
+
+
+    """
 
     def __init__(
         self,
@@ -135,7 +151,7 @@ def __init__(
     ):
         extractor = PatchWSIDataset(data, region_size, grid_shape, patch_size, image_reader_name)
         super().__init__(
-            data=extractor,
+            data=extractor,  # type: ignore
             transform=transform,
             replace_rate=replace_rate,
             cache_num=cache_num,

From 2d56c98384770b50a428961fde8b34f631ff57a9 Mon Sep 17 00:00:00 2001
From: Behrooz <3968947+behxyz@users.noreply.github.com>
Date: Wed, 24 Mar 2021 21:23:35 +0000
Subject: [PATCH 15/22] Format docstring

Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com>
---
 monai/apps/pathology/datasets.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/monai/apps/pathology/datasets.py b/monai/apps/pathology/datasets.py
index 2bf29e3325..b1605a4fb5 100644
--- a/monai/apps/pathology/datasets.py
+++ b/monai/apps/pathology/datasets.py
@@ -80,8 +80,7 @@ def __init__(
             self._fetch_wsi_objects()
 
     def _fetch_wsi_objects(self):
-        """Load all the image objects and reuse them when asked for an item.
-        """
+        """Load all the image objects and reuse them when asked for an item."""
         self.wsi_object_dict = {}
         for image_path in self.image_path_list:
             self.wsi_object_dict[image_path] = self.image_reader.read(image_path)

From de3b67bd3ce02f9e863c456ab352e986bb77d047 Mon Sep 17 00:00:00 2001
From: Behrooz <3968947+behxyz@users.noreply.github.com>
Date: Wed, 24 Mar 2021 22:34:25 +0000
Subject: [PATCH 16/22] Update length of the smartcache dataset

Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com>
---
 monai/apps/pathology/datasets.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/monai/apps/pathology/datasets.py b/monai/apps/pathology/datasets.py
index b1605a4fb5..dc79be9484 100644
--- a/monai/apps/pathology/datasets.py
+++ b/monai/apps/pathology/datasets.py
@@ -131,7 +131,6 @@ class SmartCachePatchWSIDataset(SmartCacheDataset):
             If num_replace_workers is None then the number returned by os.cpu_count() is used.
         progress: whether to display a progress bar when caching for the first epoch.
 
-
     """
 
     def __init__(
@@ -148,9 +147,10 @@ def __init__(
         num_init_workers: Optional[int] = None,
         num_replace_workers: Optional[int] = None,
     ):
-        extractor = PatchWSIDataset(data, region_size, grid_shape, patch_size, image_reader_name)
+        patch_wsi_dataset = PatchWSIDataset(data, region_size, grid_shape, patch_size, image_reader_name)
+        self.len_dataset = len(patch_wsi_dataset)
         super().__init__(
-            data=extractor,  # type: ignore
+            data=patch_wsi_dataset,  # type: ignore
             transform=transform,
             replace_rate=replace_rate,
             cache_num=cache_num,
@@ -158,3 +158,6 @@ def __init__(
             num_init_workers=num_init_workers,
             num_replace_workers=num_replace_workers,
         )
+
+    def __len__(self):
+        return self.len_dataset

From ef8daf92f391ce3e72a3df83ba781e849b73434e Mon Sep 17 00:00:00 2001
From: Behrooz <3968947+behxyz@users.noreply.github.com>
Date: Wed, 24 Mar 2021 22:35:02 +0000
Subject: [PATCH 17/22] Add unittest for SmartCachePatchWSIDataset

Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com>
---
 tests/test_smartcache_patch_wsi_dataset.py | 93 ++++++++++++++++++++++
 1 file changed, 93 insertions(+)
 create mode 100644 tests/test_smartcache_patch_wsi_dataset.py

diff --git a/tests/test_smartcache_patch_wsi_dataset.py b/tests/test_smartcache_patch_wsi_dataset.py
new file mode 100644
index 0000000000..3e370e4c2b
--- /dev/null
+++ b/tests/test_smartcache_patch_wsi_dataset.py
@@ -0,0 +1,93 @@
+import os
+import unittest
+from unittest import skipUnless
+from urllib import request
+
+import numpy as np
+from numpy.testing import assert_array_equal
+from parameterized import parameterized
+
+from monai.apps.pathology.datasets import SmartCachePatchWSIDataset
+from monai.utils import optional_import
+
+_, has_cim = optional_import("cucim")
+
+FILE_URL = "http://openslide.cs.cmu.edu/download/openslide-testdata/Generic-TIFF/CMU-1.tiff"
+
+TEST_CASE_0 = [
+    FILE_URL,
+    {
+        "data": [
+            {"image": "./CMU-1.tiff", "location": [0, 0], "label": [1]},
+            {"image": "./CMU-1.tiff", "location": [0, 0], "label": [1]},
+            {"image": "./CMU-1.tiff", "location": [0, 0], "label": [1]},
+            {"image": "./CMU-1.tiff", "location": [0, 0], "label": [1]},
+        ],
+        "region_size": (1, 1),
+        "grid_shape": (1, 1),
+        "patch_size": 1,
+        "transform": lambda x: x,
+        "image_reader_name": "cuCIM",
+        "replace_rate": 0.5,
+        "cache_num": 2,
+        "num_init_workers": 1,
+        "num_replace_workers": 1,
+    },
+    [
+        {"image": np.array([[[239]], [[239]], [[239]]], dtype=np.uint8), "label": np.array([[1]])},
+    ],
+]
+
+TEST_CASE_1 = [
+    FILE_URL,
+    {
+        "data": [
+            {"image": "./CMU-1.tiff", "location": [10004, 20004], "label": [0, 0, 0, 1]},
+            {"image": "./CMU-1.tiff", "location": [10004, 20004], "label": [0, 0, 0, 1]},
+            {"image": "./CMU-1.tiff", "location": [10004, 20004], "label": [0, 0, 0, 1]},
+            {"image": "./CMU-1.tiff", "location": [10004, 20004], "label": [0, 0, 0, 1]},
+            {"image": "./CMU-1.tiff", "location": [10004, 20004], "label": [0, 0, 0, 1]},
+            {"image": "./CMU-1.tiff", "location": [10004, 20004], "label": [0, 0, 0, 1]},
+        ],
+        "region_size": (8, 8),
+        "grid_shape": (2, 2),
+        "patch_size": 1,
+        "transform": lambda x: x,
+        "replace_rate": 0.5,
+        "cache_num": 2,
+        "num_init_workers": 1,
+        "num_replace_workers": 1,
+    },
+    [
+        {"image": np.array([[[247]], [[245]], [[248]]], dtype=np.uint8), "label": np.array([[0]])},
+        {"image": np.array([[[245]], [[247]], [[244]]], dtype=np.uint8), "label": np.array([[0]])},
+        {"image": np.array([[[246]], [[246]], [[246]]], dtype=np.uint8), "label": np.array([[0]])},
+        {"image": np.array([[[246]], [[246]], [[246]]], dtype=np.uint8), "label": np.array([[1]])},
+    ],
+]
+
+
+class TestSmartCachePatchWSIDataset(unittest.TestCase):
+    @parameterized.expand([TEST_CASE_0, TEST_CASE_1])
+    @skipUnless(has_cim, "Requires CuCIM")
+    def test_read_patches(self, file_url, input_parameters, expected):
+        self.camelyon_data_download(file_url)
+        dataset = SmartCachePatchWSIDataset(**input_parameters)
+        self.assertEqual(len(dataset), len(input_parameters["data"]))
+        for samples in dataset:
+            for i in range(len(samples)):
+                self.assertTupleEqual(samples[i]["label"].shape, expected[i]["label"].shape)
+                self.assertTupleEqual(samples[i]["image"].shape, expected[i]["image"].shape)
+                self.assertIsNone(assert_array_equal(samples[i]["label"], expected[i]["label"]))
+                self.assertIsNone(assert_array_equal(samples[i]["image"], expected[i]["image"]))
+
+    def camelyon_data_download(self, file_url):
+        filename = os.path.basename(file_url)
+        if not os.path.exists(filename):
+            print(f"Test image [{filename}] does not exist. Downloading...")
+            request.urlretrieve(file_url, filename)
+        return filename
+
+
+if __name__ == "__main__":
+    unittest.main()

From 3f130f04c0a5074205307410fd56bfe2b209cdb1 Mon Sep 17 00:00:00 2001
From: Behrooz <3968947+behxyz@users.noreply.github.com>
Date: Thu, 25 Mar 2021 00:09:30 +0000
Subject: [PATCH 18/22] Minor changes and fixes

Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com>
---
 monai/apps/pathology/__init__.py |  2 ++
 monai/apps/pathology/datasets.py | 19 +++++++++----------
 2 files changed, 11 insertions(+), 10 deletions(-)

diff --git a/monai/apps/pathology/__init__.py b/monai/apps/pathology/__init__.py
index 14ae193634..2040d510d1 100644
--- a/monai/apps/pathology/__init__.py
+++ b/monai/apps/pathology/__init__.py
@@ -8,3 +8,5 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+from .datasets import PatchWSIDataset, SmartCacheDataset
\ No newline at end of file
diff --git a/monai/apps/pathology/datasets.py b/monai/apps/pathology/datasets.py
index dc79be9484..6dcf240870 100644
--- a/monai/apps/pathology/datasets.py
+++ b/monai/apps/pathology/datasets.py
@@ -22,8 +22,8 @@
 
 class PatchWSIDataset(Dataset):
     """
-    This dataset read whole slide images, extract regions, and crate patches.
-    It also reads labels for each patch and privide each patch with its associated class labels.
+    This dataset reads whole slide images, extracts regions, and creates patches.
+    It also reads labels for each patch and provides each patch with its associated class labels.
 
     Args:
         data: the list of input samples including image, location, and label (see below for more details).
@@ -55,6 +55,8 @@ def __init__(
         image_reader_name: str = "cuCIM",
         transform: Optional[Callable] = None,
     ):
+        super().__init__(data, transform)
+
         if isinstance(region_size, int):
             self.region_size = (region_size, region_size)
         else:
@@ -68,15 +70,13 @@ def __init__(
         self.patch_size = patch_size
         self.sub_region_size = (self.region_size[0] / self.grid_shape[0], self.region_size[1] / self.grid_shape[1])
 
-        self.transform = transform
-        self.samples = data
-        self.image_path_list = list({x["image"] for x in self.samples})
+        self.image_path_list = list({x["image"] for x in self.data})
 
         self.image_reader_name = image_reader_name
         self.image_reader = WSIReader(image_reader_name)
         self.wsi_object_dict = None
         if self.image_reader_name != "openslide":
-            # OpenSlide causes memeory issue if we prefetch image objects
+            # OpenSlide causes memory issue if we prefetch image objects
             self._fetch_wsi_objects()
 
     def _fetch_wsi_objects(self):
@@ -85,11 +85,8 @@ def _fetch_wsi_objects(self):
         for image_path in self.image_path_list:
             self.wsi_object_dict[image_path] = self.image_reader.read(image_path)
 
-    def __len__(self):
-        return len(self.samples)
-
     def __getitem__(self, index):
-        sample = self.samples[index]
+        sample = self.data[index]
         if self.image_reader_name == "openslide":
             img_obj = self.image_reader.read(sample["image"])
         else:
@@ -146,6 +143,7 @@ def __init__(
         cache_rate: float = 1.0,
         num_init_workers: Optional[int] = None,
         num_replace_workers: Optional[int] = None,
+        progress: bool = True,
     ):
         patch_wsi_dataset = PatchWSIDataset(data, region_size, grid_shape, patch_size, image_reader_name)
         self.len_dataset = len(patch_wsi_dataset)
@@ -157,6 +155,7 @@ def __init__(
             cache_rate=cache_rate,
             num_init_workers=num_init_workers,
             num_replace_workers=num_replace_workers,
+            progress=progress,
         )
 
     def __len__(self):

From 9c4e158229e6d5317a80309be90773a93eecfd12 Mon Sep 17 00:00:00 2001
From: Behrooz <3968947+behxyz@users.noreply.github.com>
Date: Wed, 24 Mar 2021 20:18:23 -0400
Subject: [PATCH 19/22] Add unittest for OpenSlide option

Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com>
---
 tests/test_patch_wsi_dataset.py | 68 ++++++++++++++++++++++++++++++++-
 1 file changed, 66 insertions(+), 2 deletions(-)

diff --git a/tests/test_patch_wsi_dataset.py b/tests/test_patch_wsi_dataset.py
index a1b1d2d2c0..730519ed52 100644
--- a/tests/test_patch_wsi_dataset.py
+++ b/tests/test_patch_wsi_dataset.py
@@ -11,6 +11,7 @@
 from monai.utils import optional_import
 
 _, has_cim = optional_import("cucim")
+_, has_osl = optional_import("openslide")
 
 FILE_URL = "http://openslide.cs.cmu.edu/download/openslide-testdata/Generic-TIFF/CMU-1.tiff"
 
@@ -37,6 +38,57 @@
         "region_size": (8, 8),
         "grid_shape": (2, 2),
         "patch_size": 1,
+        "image_reader_name": "cuCIM",
+    },
+    [
+        {"image": np.array([[[247]], [[245]], [[248]]], dtype=np.uint8), "label": np.array([[0]])},
+        {"image": np.array([[[245]], [[247]], [[244]]], dtype=np.uint8), "label": np.array([[0]])},
+        {"image": np.array([[[246]], [[246]], [[246]]], dtype=np.uint8), "label": np.array([[0]])},
+        {"image": np.array([[[246]], [[246]], [[246]]], dtype=np.uint8), "label": np.array([[1]])},
+    ],
+]
+
+TEST_CASE_2 = [
+    FILE_URL,
+    {
+        "data": [
+            {"image": "./CMU-1.tiff", "location": [0, 0], "label": [1]},
+        ],
+        "region_size": 1,
+        "grid_shape": 1,
+        "patch_size": 1,
+        "image_reader_name": "cuCIM",
+    },
+    [
+        {"image": np.array([[[239]], [[239]], [[239]]], dtype=np.uint8), "label": np.array([[1]])},
+    ],
+]
+
+
+TEST_CASE_OPENSLIDE_0 = [
+    FILE_URL,
+    {
+        "data": [
+            {"image": "./CMU-1.tiff", "location": [0, 0], "label": [1]},
+        ],
+        "region_size": (1, 1),
+        "grid_shape": (1, 1),
+        "patch_size": 1,
+        "image_reader_name": "OpenSlide",
+    },
+    [
+        {"image": np.array([[[239]], [[239]], [[239]]], dtype=np.uint8), "label": np.array([[1]])},
+    ],
+]
+
+TEST_CASE_OPENSLIDE_1 = [
+    FILE_URL,
+    {
+        "data": [{"image": "./CMU-1.tiff", "location": [10004, 20004], "label": [0, 0, 0, 1]}],
+        "region_size": (8, 8),
+        "grid_shape": (2, 2),
+        "patch_size": 1,
+        "image_reader_name": "OpenSlide",
     },
     [
         {"image": np.array([[[247]], [[245]], [[248]]], dtype=np.uint8), "label": np.array([[0]])},
@@ -48,9 +100,21 @@
 
 
 class TestPatchWSIDataset(unittest.TestCase):
-    @parameterized.expand([TEST_CASE_0, TEST_CASE_1])
+    @parameterized.expand([TEST_CASE_0, TEST_CASE_1, TEST_CASE_2])
     @skipUnless(has_cim, "Requires CuCIM")
-    def test_read_patches(self, file_url, input_parameters, expected):
+    def test_read_patches_cucim(self, file_url, input_parameters, expected):
+        self.camelyon_data_download(file_url)
+        dataset = PatchWSIDataset(**input_parameters)
+        samples = dataset[0]
+        for i in range(len(samples)):
+            self.assertTupleEqual(samples[i]["label"].shape, expected[i]["label"].shape)
+            self.assertTupleEqual(samples[i]["image"].shape, expected[i]["image"].shape)
+            self.assertIsNone(assert_array_equal(samples[i]["label"], expected[i]["label"]))
+            self.assertIsNone(assert_array_equal(samples[i]["image"], expected[i]["image"]))
+
+    @parameterized.expand([TEST_CASE_OPENSLIDE_0, TEST_CASE_OPENSLIDE_1])
+    @skipUnless(has_osl, "Requires OpenSlide")
+    def test_read_patches_openslide(self, file_url, input_parameters, expected):
         self.camelyon_data_download(file_url)
         dataset = PatchWSIDataset(**input_parameters)
         samples = dataset[0]

From 27aac0c6e961d7c905ff31d56f665743a3f83445 Mon Sep 17 00:00:00 2001
From: Behrooz <3968947+behxyz@users.noreply.github.com>
Date: Wed, 24 Mar 2021 20:22:56 -0400
Subject: [PATCH 20/22] Add new line

Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com>
---
 monai/apps/pathology/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/monai/apps/pathology/__init__.py b/monai/apps/pathology/__init__.py
index 2040d510d1..bbdb812c03 100644
--- a/monai/apps/pathology/__init__.py
+++ b/monai/apps/pathology/__init__.py
@@ -9,4 +9,4 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
-from .datasets import PatchWSIDataset, SmartCacheDataset
\ No newline at end of file
+from .datasets import PatchWSIDataset, SmartCachePatchWSIDataset

From 11210d406f9cedbb74902275b610e2a067bdbc51 Mon Sep 17 00:00:00 2001
From: Behrooz <3968947+behxyz@users.noreply.github.com>
Date: Thu, 25 Mar 2021 00:17:01 -0400
Subject: [PATCH 21/22] Remove SmartCachePatchWSIDataset test to fix it

Signed-off-by: Behrooz <3968947+behxyz@users.noreply.github.com>
---
 monai/apps/pathology/datasets.py           |  4 -
 tests/test_smartcache_patch_wsi_dataset.py | 93 ----------------------
 2 files changed, 97 deletions(-)
 delete mode 100644 tests/test_smartcache_patch_wsi_dataset.py

diff --git a/monai/apps/pathology/datasets.py b/monai/apps/pathology/datasets.py
index 6dcf240870..f9ce0bc62b 100644
--- a/monai/apps/pathology/datasets.py
+++ b/monai/apps/pathology/datasets.py
@@ -146,7 +146,6 @@ def __init__(
         progress: bool = True,
     ):
         patch_wsi_dataset = PatchWSIDataset(data, region_size, grid_shape, patch_size, image_reader_name)
-        self.len_dataset = len(patch_wsi_dataset)
         super().__init__(
             data=patch_wsi_dataset,  # type: ignore
             transform=transform,
@@ -157,6 +156,3 @@ def __init__(
             num_replace_workers=num_replace_workers,
             progress=progress,
         )
-
-    def __len__(self):
-        return self.len_dataset
diff --git a/tests/test_smartcache_patch_wsi_dataset.py b/tests/test_smartcache_patch_wsi_dataset.py
deleted file mode 100644
index 3e370e4c2b..0000000000
--- a/tests/test_smartcache_patch_wsi_dataset.py
+++ /dev/null
@@ -1,93 +0,0 @@
-import os
-import unittest
-from unittest import skipUnless
-from urllib import request
-
-import numpy as np
-from numpy.testing import assert_array_equal
-from parameterized import parameterized
-
-from monai.apps.pathology.datasets import SmartCachePatchWSIDataset
-from monai.utils import optional_import
-
-_, has_cim = optional_import("cucim")
-
-FILE_URL = "http://openslide.cs.cmu.edu/download/openslide-testdata/Generic-TIFF/CMU-1.tiff"
-
-TEST_CASE_0 = [
-    FILE_URL,
-    {
-        "data": [
-            {"image": "./CMU-1.tiff", "location": [0, 0], "label": [1]},
-            {"image": "./CMU-1.tiff", "location": [0, 0], "label": [1]},
-            {"image": "./CMU-1.tiff", "location": [0, 0], "label": [1]},
-            {"image": "./CMU-1.tiff", "location": [0, 0], "label": [1]},
-        ],
-        "region_size": (1, 1),
-        "grid_shape": (1, 1),
-        "patch_size": 1,
-        "transform": lambda x: x,
-        "image_reader_name": "cuCIM",
-        "replace_rate": 0.5,
-        "cache_num": 2,
-        "num_init_workers": 1,
-        "num_replace_workers": 1,
-    },
-    [
-        {"image": np.array([[[239]], [[239]], [[239]]], dtype=np.uint8), "label": np.array([[1]])},
-    ],
-]
-
-TEST_CASE_1 = [
-    FILE_URL,
-    {
-        "data": [
-            {"image": "./CMU-1.tiff", "location": [10004, 20004], "label": [0, 0, 0, 1]},
-            {"image": "./CMU-1.tiff", "location": [10004, 20004], "label": [0, 0, 0, 1]},
-            {"image": "./CMU-1.tiff", "location": [10004, 20004], "label": [0, 0, 0, 1]},
-            {"image": "./CMU-1.tiff", "location": [10004, 20004], "label": [0, 0, 0, 1]},
-            {"image": "./CMU-1.tiff", "location": [10004, 20004], "label": [0, 0, 0, 1]},
-            {"image": "./CMU-1.tiff", "location": [10004, 20004], "label": [0, 0, 0, 1]},
-        ],
-        "region_size": (8, 8),
-        "grid_shape": (2, 2),
-        "patch_size": 1,
-        "transform": lambda x: x,
-        "replace_rate": 0.5,
-        "cache_num": 2,
-        "num_init_workers": 1,
-        "num_replace_workers": 1,
-    },
-    [
-        {"image": np.array([[[247]], [[245]], [[248]]], dtype=np.uint8), "label": np.array([[0]])},
-        {"image": np.array([[[245]], [[247]], [[244]]], dtype=np.uint8), "label": np.array([[0]])},
-        {"image": np.array([[[246]], [[246]], [[246]]], dtype=np.uint8), "label": np.array([[0]])},
-        {"image": np.array([[[246]], [[246]], [[246]]], dtype=np.uint8), "label": np.array([[1]])},
-    ],
-]
-
-
-class TestSmartCachePatchWSIDataset(unittest.TestCase):
-    @parameterized.expand([TEST_CASE_0, TEST_CASE_1])
-    @skipUnless(has_cim, "Requires CuCIM")
-    def test_read_patches(self, file_url, input_parameters, expected):
-        self.camelyon_data_download(file_url)
-        dataset = SmartCachePatchWSIDataset(**input_parameters)
-        self.assertEqual(len(dataset), len(input_parameters["data"]))
-        for samples in dataset:
-            for i in range(len(samples)):
-                self.assertTupleEqual(samples[i]["label"].shape, expected[i]["label"].shape)
-                self.assertTupleEqual(samples[i]["image"].shape, expected[i]["image"].shape)
-                self.assertIsNone(assert_array_equal(samples[i]["label"], expected[i]["label"]))
-                self.assertIsNone(assert_array_equal(samples[i]["image"], expected[i]["image"]))
-
-    def camelyon_data_download(self, file_url):
-        filename = os.path.basename(file_url)
-        if not os.path.exists(filename):
-            print(f"Test image [{filename}] does not exist. Downloading...")
-            request.urlretrieve(file_url, filename)
-        return filename
-
-
-if __name__ == "__main__":
-    unittest.main()

From 7ec1e50c9c771880a1eae145a15e514ba5a89630 Mon Sep 17 00:00:00 2001
From: Richard Brown <33289025+rijobro@users.noreply.github.com>
Date: Thu, 25 Mar 2021 08:39:26 +0000
Subject: [PATCH 22/22] move init docstring to class docstring

Signed-off-by: Richard Brown <33289025+rijobro@users.noreply.github.com>
---
 monai/data/dataset.py | 33 +++++++++++++++------------------
 1 file changed, 15 insertions(+), 18 deletions(-)

diff --git a/monai/data/dataset.py b/monai/data/dataset.py
index 813008e3a8..9a4e932160 100644
--- a/monai/data/dataset.py
+++ b/monai/data/dataset.py
@@ -582,6 +582,21 @@ class SmartCacheDataset(Randomizable, CacheDataset):
         This replacement will not work if setting the `multiprocessing_context` of DataLoader to `spawn`
         or on windows(the default multiprocessing method is `spawn`) and setting `num_workers` greater than 0.
 
+    Args:
+        data: input data to load and transform to generate dataset for model.
+        transform: transforms to execute operations on input data.
+        replace_rate: percentage of the cached items to be replaced in every epoch.
+        cache_num: number of items to be cached. Default is `sys.maxsize`.
+            The actual cache size is the minimum of (cache_num, data_length x cache_rate, data_length).
+        cache_rate: percentage of cached data in total, default is 1.0 (cache all).
+            The actual cache size is the minimum of (cache_num, data_length x cache_rate, data_length).
+        num_init_workers: the number of worker threads to initialize the cache for first epoch.
+            If num_init_workers is None then the number returned by os.cpu_count() is used.
+        num_replace_workers: the number of worker threads to prepare the replacement cache for every epoch.
+            If num_replace_workers is None then the number returned by os.cpu_count() is used.
+        progress: whether to display a progress bar when caching for the first epoch.
+        shuffle: whether to shuffle the whole data list before preparing the cache content for first epoch.
+        seed: random seed used when shuffle is `True`, defaults to `0`.
     """
 
     def __init__(
@@ -597,24 +612,6 @@ def __init__(
         shuffle: bool = True,
         seed: int = 0,
     ) -> None:
-        """
-        Args:
-            data: input data to load and transform to generate dataset for model.
-            transform: transforms to execute operations on input data.
-            replace_rate: percentage of the cached items to be replaced in every epoch.
-            cache_num: number of items to be cached. Default is `sys.maxsize`.
-                will take the minimum of (cache_num, data_length x cache_rate, data_length).
-            cache_rate: percentage of cached data in total, default is 1.0 (cache all).
-                will take the minimum of (cache_num, data_length x cache_rate, data_length).
-            num_init_workers: the number of worker threads to initialize the cache for first epoch.
-                If num_init_workers is None then the number returned by os.cpu_count() is used.
-            num_replace_workers: the number of worker threads to prepare the replacement cache for every epoch.
-                If num_replace_workers is None then the number returned by os.cpu_count() is used.
-            progress: whether to display a progress bar when caching for the first epoch.
-            shuffle: whether to shuffle the whole data list before preparing the cache content for first epoch.
-            seed: random seed if shuffle is `True`, default to `0`.
-
-        """
         if shuffle:
             self.set_random_state(seed=seed)
             self.randomize(data)