diff --git a/cobra/preprocessing/kbins_discretizer.py b/cobra/preprocessing/kbins_discretizer.py
index 080a315..0fbb29d 100644
--- a/cobra/preprocessing/kbins_discretizer.py
+++ b/cobra/preprocessing/kbins_discretizer.py
@@ -17,6 +17,7 @@
 from typing import List
 import numbers
 import logging
+import math
 
 # third party imports
 import numpy as np
@@ -24,7 +25,6 @@
 from tqdm.auto import tqdm
 from sklearn.base import BaseEstimator
 from sklearn.exceptions import NotFittedError
-#from sklearn.cluster import KMeans
 
 log = logging.getLogger(__name__)
 
@@ -70,12 +70,12 @@ class KBinsDiscretizer(BaseEstimator):
                     "starting_precision", "label_format",
                     "change_endpoint_format"]
 
-    def __init__(self, n_bins: int=10, strategy: str="quantile",
-                 closed: str="right",
-                 auto_adapt_bins: bool=False,
-                 starting_precision: int=0,
-                 label_format: str="{} - {}",
-                 change_endpoint_format: bool=False):
+    def __init__(self, n_bins: int = 10, strategy: str = "quantile",
+                 closed: str = "right",
+                 auto_adapt_bins: bool = False,
+                 starting_precision: int = 0,
+                 label_format: str = "{} - {}",
+                 change_endpoint_format: bool = False):
 
         # validate number of bins
         self._validate_n_bins(n_bins)
@@ -163,7 +163,7 @@ def set_attributes_from_dict(self, params: dict):
         self.set_params(**params)
 
         self._bins_by_column = {
-            key: ([tuple(l) for l in value] if value else None)
+            key: ([tuple(v) for v in value] if value else None)
             for key, value in _bins_by_column.items()
         }
@@ -215,7 +215,6 @@ def _fit_column(self, data: pd.DataFrame,
         List[tuple]
             list of bins as tuples
         """
-
         col_min, col_max = data[column_name].min(), data[column_name].max()
 
         if col_min == col_max:
@@ -223,6 +222,22 @@
                         "will be ignored in computation".format(column_name))
             return None
 
+        prop_inf = (np.sum(np.isinf(data[column_name]))
+                    / data[column_name].shape[0])
+
+        if prop_inf > 0:
+            log.warning(f"Column {column_name} has "
+                        f"{prop_inf:.1%} inf values and will be skipped. "
+                        f"Consider dropping or transforming it.")
+            return None
+
+        prop_nan = data[column_name].isna().sum() / data[column_name].shape[0]
+
+        if prop_nan >= 0.99:
+            log.warning(f"Column {column_name} is "
+                        f"{prop_nan:.1%} NaNs, "
+                        f"consider dropping or transforming it.")
+
         n_bins = self.n_bins
         if self.auto_adapt_bins:
             size = len(data.index)
@@ -403,8 +418,22 @@ def _compute_bin_edges(self, data: pd.DataFrame, column_name: str,
             # bin_edges = (centers[1:] + centers[:-1]) * 0.5
             # bin_edges = np.r_[col_min, bin_edges, col_max]
 
-        # Make sure the bin_edges are unique and sorted
-        return sorted(list(set(bin_edges)))
+        # NaNs lead to unexpected behavior during sorting;
+        # replacing them with inf ensures they stay at the
+        # outermost edges
+        if math.isnan(bin_edges[0]):
+            bin_edges[0] = -np.inf
+
+        if math.isnan(bin_edges[-1]):
+            bin_edges[-1] = np.inf
+
+        if np.isnan(bin_edges).sum() > 0:
+            log.warning(f"Column {column_name} "
+                        "has NaNs present in bin definitions")
+
+        # Make sure the bin_edges are unique
+        # and that their order is preserved
+        return list(dict.fromkeys(bin_edges))
 
     def _compute_minimal_precision_of_bin_edges(self, bin_edges: list) -> int:
         """Compute the minimal precision of a list of bin_edges so that we end
@@ -468,7 +497,7 @@ def _compute_bins_from_edges(self, bin_edges: list) -> List[tuple]:
     @staticmethod
     def _create_index(intervals: List[tuple],
-                      closed: str="right") -> pd.IntervalIndex:
+                      closed: str = "right") -> pd.IntervalIndex:
         """Create an pd.IntervalIndex based on a list of tuples.
         This is basically a wrapper around pd.IntervalIndex.from_tuples
         However, the lower bound of the first entry in the list (the lower bin)
diff --git a/tests/preprocessing/test_kbins_discretizer.py b/tests/preprocessing/test_kbins_discretizer.py
index ced0ddc..5b0aeeb 100644
--- a/tests/preprocessing/test_kbins_discretizer.py
+++ b/tests/preprocessing/test_kbins_discretizer.py
@@ -3,6 +3,7 @@
 import numpy as np
 import pandas as pd
+import math
 
 from cobra.preprocessing.kbins_discretizer import KBinsDiscretizer
@@ -14,7 +15,7 @@ def does_not_raise():
 
 class TestKBinsDiscretizer:
 
-    ################# Test for public methods #################
+    # ---------------- Test for public methods ----------------
 
     def test_attributes_to_dict(self):
        discretizer = KBinsDiscretizer()
@@ -118,7 +119,7 @@ def test_transform(self, scenario, expectation):
         actual = discretizer.transform(data, ["variable"])
         pd.testing.assert_frame_equal(actual, expected)
 
-    ################# Test for private methods #################
+    # ---------------- Test for private methods ----------------
 
     @pytest.mark.parametrize("n_bins, expectation",
                              [(1, pytest.raises(ValueError)),
                               (10.5, pytest.raises(ValueError)),
@@ -163,9 +164,12 @@ def test_transform_column(self):
                               (10, False,  # almost constant
                                pd.DataFrame({"variable": [0] + ([1] * 100)}),
+                               None),
+                              (2, False,
+                               pd.DataFrame({"variable": [5.4, 9.3, np.inf]}),
                                None)],
                              ids=["regular", "auto_adapt_bins",
-                                  "two bin edges"])
+                                  "two bin edges", "infs"])
     def test_fit_column(self, n_bins, auto_adapt_bins, data, expected):
         discretizer = KBinsDiscretizer(n_bins=n_bins,
                                        auto_adapt_bins=auto_adapt_bins)
@@ -218,7 +222,9 @@ def test_compute_minimal_precision_of_bin_edges(self, bin_edges,
     @pytest.mark.parametrize("bin_edges, expected",
                              [([0, 1, 1.5, 2], [(0, 1), (1, 1.5), (1.5, 2)]),
-                              ([0, 1, 1.5, 3], [(0, 1), (1, 2), (2, 3)])])
+                              ([0, 1, 1.5, 3], [(0, 1), (1, 2), (2, 3)]),
+                              ([np.inf, 0.0, -np.inf],
+                               [(np.inf, 0.0), (0.0, -np.inf)])])
     def test_compute_bins_from_edges(self, bin_edges, expected):
         discretizer = KBinsDiscretizer()