From 1ba2a83903d19576e47a29c0add90ab31a12cf03 Mon Sep 17 00:00:00 2001 From: Seyed-Ahmad Ahmadi <85887144+nvahmadi@users.noreply.github.com> Date: Tue, 21 Dec 2021 15:22:50 +0100 Subject: [PATCH 1/3] Update CSVDataset in dataset.py `kwargs` argument currently addresses merge operation. There is no way to change `kwargs` for `read_csv()` call (e.g. to change delimiter from standard `,` to `\t`). Suggestion: Have `kwargs_read_csv` and `kwargs_merge` arguments to allow separate and finer user-control over both loading & merging mechanism. --- monai/data/dataset.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/monai/data/dataset.py b/monai/data/dataset.py index cbb534f04a..1b5bb4877e 100644 --- a/monai/data/dataset.py +++ b/monai/data/dataset.py @@ -1314,22 +1314,23 @@ def __init__( col_types: Optional[Dict[str, Optional[Dict[str, Any]]]] = None, col_groups: Optional[Dict[str, Sequence[str]]] = None, transform: Optional[Callable] = None, - **kwargs, + **kwargs_read_csv: Optional[Dict] = {}, + **kwargs_merge: Optional[Dict] = {}, ): srcs = (src,) if not isinstance(src, (tuple, list)) else src dfs: List = [] for i in srcs: if isinstance(i, str): - dfs.append(pd.read_csv(i)) + dfs.append(pd.read_csv(i, **kwargs_read_csv)) elif isinstance(i, pd.DataFrame): dfs.append(i) else: raise ValueError("`src` must be file path or pandas `DataFrame`.") - # in case treating deprecated arg `filename` as kwargs, remove it from `kwargs` - kwargs.pop("filename", None) + # in case treating deprecated arg `filename` as kwargs, remove it from `kwargs_merge` + kwargs_merge.pop("filename", None) data = convert_tables_to_dicts( - dfs=dfs, row_indices=row_indices, col_names=col_names, col_types=col_types, col_groups=col_groups, **kwargs + dfs=dfs, row_indices=row_indices, col_names=col_names, col_types=col_types, col_groups=col_groups, **kwargs_merge ) super().__init__(data=data, transform=transform) From eb454705e1ebb4956c453930a3315b8f52acbb4e Mon Sep 17 
00:00:00 2001 From: Wenqi Li Date: Tue, 1 Mar 2022 12:28:53 +0000 Subject: [PATCH 2/3] Update dataset.py --- monai/data/dataset.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/monai/data/dataset.py b/monai/data/dataset.py index 1b5bb4877e..016c8f2efd 100644 --- a/monai/data/dataset.py +++ b/monai/data/dataset.py @@ -1298,6 +1298,7 @@ class CSVDataset(Dataset): be the new column name, the `value` is the names of columns to combine. for example: `col_groups={"ehr": [f"ehr_{i}" for i in range(10)], "meta": ["meta_1", "meta_2"]}` transform: transform to apply on the loaded items of a dictionary data. + kwargs_read_csv: dictionary args to pass to pandas `read_csv` function. kwargs: additional arguments for `pandas.merge()` API to join tables. .. deprecated:: 0.8.0 @@ -1314,23 +1315,23 @@ def __init__( col_types: Optional[Dict[str, Optional[Dict[str, Any]]]] = None, col_groups: Optional[Dict[str, Sequence[str]]] = None, transform: Optional[Callable] = None, - **kwargs_read_csv: Optional[Dict] = {}, - **kwargs_merge: Optional[Dict] = {}, + kwargs_read_csv: Optional[Dict] = None, + **kwargs, ): srcs = (src,) if not isinstance(src, (tuple, list)) else src dfs: List = [] for i in srcs: if isinstance(i, str): - dfs.append(pd.read_csv(i, **kwargs_read_csv)) + dfs.append(pd.read_csv(i, **kwargs_read_csv) if kwargs_read_csv else pd.read_csv(i)) elif isinstance(i, pd.DataFrame): dfs.append(i) else: raise ValueError("`src` must be file path or pandas `DataFrame`.") - # in case treating deprecated arg `filename` as kwargs, remove it from `kwargs_merge` - kwargs_merge.pop("filename", None) + # in case treating deprecated arg `filename` as kwargs, remove it from `kwargs` + kwargs.pop("filename", None) data = convert_tables_to_dicts( - dfs=dfs, row_indices=row_indices, col_names=col_names, col_types=col_types, col_groups=col_groups, **kwargs_merge + dfs=dfs, row_indices=row_indices, col_names=col_names, col_types=col_types, col_groups=col_groups, 
**kwargs ) super().__init__(data=data, transform=transform) From ec9ae200677af2f90036542cd1396b26b61bce64 Mon Sep 17 00:00:00 2001 From: Wenqi Li Date: Tue, 1 Mar 2022 12:34:47 +0000 Subject: [PATCH 3/3] Update iterable_dataset.py --- monai/data/iterable_dataset.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/monai/data/iterable_dataset.py b/monai/data/iterable_dataset.py index 19efc925fc..957d0e8a56 100644 --- a/monai/data/iterable_dataset.py +++ b/monai/data/iterable_dataset.py @@ -176,6 +176,7 @@ class CSVIterableDataset(IterableDataset): seed: random seed to initialize the random state for all the workers if `shuffle` is True, set `seed += 1` in every iter() call, refer to the PyTorch idea: https://github.com/pytorch/pytorch/blob/v1.10.0/torch/utils/data/distributed.py#L98. + kwargs_read_csv: dictionary args to pass to pandas `read_csv` function. Default to ``{"chunksize": chunksize}``. kwargs: additional arguments for `pandas.merge()` API to join tables. .. deprecated:: 0.8.0 @@ -195,6 +196,7 @@ def __init__( transform: Optional[Callable] = None, shuffle: bool = False, seed: int = 0, + kwargs_read_csv: Optional[Dict] = None, **kwargs, ): self.src = src @@ -205,6 +207,7 @@ def __init__( self.col_groups = col_groups self.shuffle = shuffle self.seed = seed + self.kwargs_read_csv = kwargs_read_csv or {"chunksize": chunksize} # in case treating deprecated arg `filename` as kwargs, remove it from `kwargs` kwargs.pop("filename", None) self.kwargs = kwargs @@ -230,7 +233,7 @@ def reset(self, src: Optional[Union[Union[str, Sequence[str]], Union[Iterable, S self.iters = [] for i in srcs: if isinstance(i, str): - self.iters.append(pd.read_csv(i, chunksize=self.chunksize)) + self.iters.append(pd.read_csv(i, **self.kwargs_read_csv)) elif isinstance(i, Iterable): self.iters.append(i) else: