Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 41 additions & 12 deletions cobra/preprocessing/kbins_discretizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,14 @@
from typing import List
import numbers
import logging
import math

# third party imports
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from sklearn.base import BaseEstimator
from sklearn.exceptions import NotFittedError
#from sklearn.cluster import KMeans

log = logging.getLogger(__name__)

Expand Down Expand Up @@ -70,12 +70,12 @@ class KBinsDiscretizer(BaseEstimator):
"starting_precision", "label_format",
"change_endpoint_format"]

def __init__(self, n_bins: int=10, strategy: str="quantile",
closed: str="right",
auto_adapt_bins: bool=False,
starting_precision: int=0,
label_format: str="{} - {}",
change_endpoint_format: bool=False):
def __init__(self, n_bins: int = 10, strategy: str = "quantile",
closed: str = "right",
auto_adapt_bins: bool = False,
starting_precision: int = 0,
label_format: str = "{} - {}",
change_endpoint_format: bool = False):

# validate number of bins
self._validate_n_bins(n_bins)
Expand Down Expand Up @@ -163,7 +163,7 @@ def set_attributes_from_dict(self, params: dict):
self.set_params(**params)

self._bins_by_column = {
key: ([tuple(l) for l in value] if value else None)
key: ([tuple(v) for v in value] if value else None)
for key, value in _bins_by_column.items()
}

Expand Down Expand Up @@ -215,14 +215,29 @@ def _fit_column(self, data: pd.DataFrame,
List[tuple]
list of bins as tuples
"""

col_min, col_max = data[column_name].min(), data[column_name].max()

if col_min == col_max:
log.warning("Predictor '{}' is constant and "
"will be ignored in computation".format(column_name))
return None

prop_inf = (np.sum(np.isinf(data[column_name]))
/ data[column_name].shape[0])

if prop_inf > 0:
log.warning(f"Column {column_name} has "
f"{prop_inf:.1%} inf values, thus it was skipped. "
f"Consider dropping or transforming it.")
return None

prop_nan = data[column_name].isna().sum() / data[column_name].shape[0]

if prop_nan >= 0.99:
log.warning(f"Column {column_name} is"
                        f" {prop_nan:.1%} NaNs, "
f"consider dropping or transforming it.")

n_bins = self.n_bins
if self.auto_adapt_bins:
size = len(data.index)
Expand Down Expand Up @@ -403,8 +418,22 @@ def _compute_bin_edges(self, data: pd.DataFrame, column_name: str,
# bin_edges = (centers[1:] + centers[:-1]) * 0.5
# bin_edges = np.r_[col_min, bin_edges, col_max]

# Make sure the bin_edges are unique and sorted
return sorted(list(set(bin_edges)))
# nans lead to unexpected behavior during sorting,
# by replacing with inf we ensure these stay at the
# outermost edges
if math.isnan(bin_edges[0]):
bin_edges[0] = -np.inf

if math.isnan(bin_edges[-1]):
bin_edges[-1] = np.inf

if np.isnan(bin_edges).sum() > 0:
log.warning(f"Column {column_name} "
"has NaNs present in bin definitions")

# Make sure the bin_edges are unique
# and order remains the same
return list(dict.fromkeys(bin_edges))

def _compute_minimal_precision_of_bin_edges(self, bin_edges: list) -> int:
"""Compute the minimal precision of a list of bin_edges so that we end
Expand Down Expand Up @@ -468,7 +497,7 @@ def _compute_bins_from_edges(self, bin_edges: list) -> List[tuple]:

@staticmethod
def _create_index(intervals: List[tuple],
closed: str="right") -> pd.IntervalIndex:
closed: str = "right") -> pd.IntervalIndex:
"""Create an pd.IntervalIndex based on a list of tuples.
This is basically a wrapper around pd.IntervalIndex.from_tuples
However, the lower bound of the first entry in the list (the lower bin)
Expand Down
14 changes: 10 additions & 4 deletions tests/preprocessing/test_kbins_discretizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

import numpy as np
import pandas as pd
import math

from cobra.preprocessing.kbins_discretizer import KBinsDiscretizer

Expand All @@ -14,7 +15,7 @@ def does_not_raise():

class TestKBinsDiscretizer:

################# Test for public methods #################
# ---------------- Test for public methods ----------------
def test_attributes_to_dict(self):

discretizer = KBinsDiscretizer()
Expand Down Expand Up @@ -118,7 +119,7 @@ def test_transform(self, scenario, expectation):
actual = discretizer.transform(data, ["variable"])
pd.testing.assert_frame_equal(actual, expected)

################# Test for private methods #################
# ---------------- Test for private methods ----------------
@pytest.mark.parametrize("n_bins, expectation",
[(1, pytest.raises(ValueError)),
(10.5, pytest.raises(ValueError)),
Expand Down Expand Up @@ -163,9 +164,12 @@ def test_transform_column(self):
(10, False,
# almost constant
pd.DataFrame({"variable": [0] + ([1] * 100)}),
None),
(2, False,
pd.DataFrame({"variable": [5.4, 9.3, np.inf]}),
None)],
ids=["regular", "auto_adapt_bins",
"two bin edges"])
"two bin edges", "infs"])
def test_fit_column(self, n_bins, auto_adapt_bins, data, expected):
discretizer = KBinsDiscretizer(n_bins=n_bins,
auto_adapt_bins=auto_adapt_bins)
Expand Down Expand Up @@ -218,7 +222,9 @@ def test_compute_minimal_precision_of_bin_edges(self, bin_edges,

@pytest.mark.parametrize("bin_edges, expected",
[([0, 1, 1.5, 2], [(0, 1), (1, 1.5), (1.5, 2)]),
([0, 1, 1.5, 3], [(0, 1), (1, 2), (2, 3)])])
([0, 1, 1.5, 3], [(0, 1), (1, 2), (2, 3)]),
([np.inf, 0.0, -np.inf],
[(np.inf, 0.0), (0.0, -np.inf)])])
def test_compute_bins_from_edges(self, bin_edges, expected):

discretizer = KBinsDiscretizer()
Expand Down