Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 41 additions & 12 deletions cobra/preprocessing/kbins_discretizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,14 @@
from typing import List
import numbers
import logging
import math

# third party imports
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.base import BaseEstimator
from sklearn.exceptions import NotFittedError
#from sklearn.cluster import KMeans

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -70,12 +70,12 @@ class KBinsDiscretizer(BaseEstimator):
"starting_precision", "label_format",
"change_endpoint_format"]

def __init__(self, n_bins: int=10, strategy: str="quantile",
closed: str="right",
auto_adapt_bins: bool=False,
starting_precision: int=0,
label_format: str="{} - {}",
change_endpoint_format: bool=False):
def __init__(self, n_bins: int = 10, strategy: str = "quantile",
closed: str = "right",
auto_adapt_bins: bool = False,
starting_precision: int = 0,
label_format: str = "{} - {}",
change_endpoint_format: bool = False):

# validate number of bins
self._validate_n_bins(n_bins)
Expand Down Expand Up @@ -163,7 +163,7 @@ def set_attributes_from_dict(self, params: dict):
self.set_params(**params)

self._bins_by_column = {
key: ([tuple(l) for l in value] if value else None)
key: ([tuple(v) for v in value] if value else None)
for key, value in _bins_by_column.items()
}

Expand Down Expand Up @@ -215,14 +215,29 @@ def _fit_column(self, data: pd.DataFrame,
List[tuple]
list of bins as tuples
"""

col_min, col_max = data[column_name].min(), data[column_name].max()

if col_min == col_max:
log.warning("Predictor '{}' is constant and "
"will be ignored in computation".format(column_name))
return None

prop_inf = (np.sum(np.isinf(data[column_name]))
/ data[column_name].shape[0])

if prop_inf > 0:
log.warning(f"Column {column_name} has "
f"{prop_inf:.1%} inf values, thus it was skipped. "
f"Consider dropping or transforming it.")
return None

prop_nan = data[column_name].isna().sum() / data[column_name].shape[0]

if prop_nan >= 0.99:
log.warning(f"Column {column_name} is"
                        f" {prop_nan:.1%} NaNs, "
f"consider dropping or transforming it.")

n_bins = self.n_bins
if self.auto_adapt_bins:
size = len(data.index)
Expand Down Expand Up @@ -403,8 +418,22 @@ def _compute_bin_edges(self, data: pd.DataFrame, column_name: str,
# bin_edges = (centers[1:] + centers[:-1]) * 0.5
# bin_edges = np.r_[col_min, bin_edges, col_max]

# Make sure the bin_edges are unique and sorted
return sorted(list(set(bin_edges)))
# nans lead to unexpected behavior during sorting,
# by replacing with inf we ensure these stay at the
# outermost edges
if math.isnan(bin_edges[0]):
bin_edges[0] = -np.inf

if math.isnan(bin_edges[-1]):
bin_edges[-1] = np.inf

if np.isnan(bin_edges).sum() > 0:
log.warning(f"Column {column_name} "
"has NaNs present in bin definitions")

# Make sure the bin_edges are unique
# and order remains the same
return list(dict.fromkeys(bin_edges))

def _compute_minimal_precision_of_bin_edges(self, bin_edges: list) -> int:
"""Compute the minimal precision of a list of bin_edges so that we end
Expand Down Expand Up @@ -468,7 +497,7 @@ def _compute_bins_from_edges(self, bin_edges: list) -> List[tuple]:

@staticmethod
def _create_index(intervals: List[tuple],
closed: str="right") -> pd.IntervalIndex:
closed: str = "right") -> pd.IntervalIndex:
"""Create an pd.IntervalIndex based on a list of tuples.
This is basically a wrapper around pd.IntervalIndex.from_tuples
However, the lower bound of the first entry in the list (the lower bin)
Expand Down
14 changes: 10 additions & 4 deletions tests/preprocessing/test_kbins_discretizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import numpy as np
import pandas as pd
import math

from cobra.preprocessing.kbins_discretizer import KBinsDiscretizer

Expand All @@ -14,7 +15,7 @@ def does_not_raise():

class TestKBinsDiscretizer:

################# Test for public methods #################
# ---------------- Test for public methods ----------------
def test_attributes_to_dict(self):

discretizer = KBinsDiscretizer()
Expand Down Expand Up @@ -118,7 +119,7 @@ def test_transform(self, scenario, expectation):
actual = discretizer.transform(data, ["variable"])
pd.testing.assert_frame_equal(actual, expected)

################# Test for private methods #################
# ---------------- Test for private methods ----------------
@pytest.mark.parametrize("n_bins, expectation",
[(1, pytest.raises(ValueError)),
(10.5, pytest.raises(ValueError)),
Expand Down Expand Up @@ -163,9 +164,12 @@ def test_transform_column(self):
(10, False,
# almost constant
pd.DataFrame({"variable": [0] + ([1] * 100)}),
None),
(2, False,
pd.DataFrame({"variable": [5.4, 9.3, np.inf]}),
None)],
ids=["regular", "auto_adapt_bins",
"two bin edges"])
"two bin edges", "infs"])
def test_fit_column(self, n_bins, auto_adapt_bins, data, expected):
discretizer = KBinsDiscretizer(n_bins=n_bins,
auto_adapt_bins=auto_adapt_bins)
Expand Down Expand Up @@ -218,7 +222,9 @@ def test_compute_minimal_precision_of_bin_edges(self, bin_edges,

@pytest.mark.parametrize("bin_edges, expected",
[([0, 1, 1.5, 2], [(0, 1), (1, 1.5), (1.5, 2)]),
([0, 1, 1.5, 3], [(0, 1), (1, 2), (2, 3)])])
([0, 1, 1.5, 3], [(0, 1), (1, 2), (2, 3)]),
([np.inf, 0.0, -np.inf],
[(np.inf, 0.0), (0.0, -np.inf)])])
def test_compute_bins_from_edges(self, bin_edges, expected):

discretizer = KBinsDiscretizer()
Expand Down