Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 29 additions & 22 deletions cobra/preprocessing/categorical_data_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,6 @@
import re
from typing import Optional

import logging
log = logging.getLogger(__name__)

# third party imports
import numpy as np
import pandas as pd
Expand All @@ -29,6 +26,9 @@
from sklearn.base import BaseEstimator
from sklearn.exceptions import NotFittedError

import logging
log = logging.getLogger(__name__)


class CategoricalDataProcessor(BaseEstimator):
"""
Expand Down Expand Up @@ -58,12 +58,12 @@ class CategoricalDataProcessor(BaseEstimator):
"category_size_threshold", "p_value_threshold",
"scale_contingency_table", "forced_categories"]

def __init__(self, regroup: bool=True, regroup_name: str="Other",
keep_missing: bool=True,
category_size_threshold: int=5,
p_value_threshold: float=0.001,
scale_contingency_table: bool=True,
forced_categories: dict={}):
def __init__(self, regroup: bool = True, regroup_name: str = "Other",
keep_missing: bool = True,
category_size_threshold: int = 5,
p_value_threshold: float = 0.001,
scale_contingency_table: bool = True,
forced_categories: dict = {}):

self.regroup = regroup
self.regroup_name = regroup_name
Expand Down Expand Up @@ -305,7 +305,8 @@ def _transform_column(self, data: pd.DataFrame,
data.loc[:, column_name_clean] = (CategoricalDataProcessor
._replace_categories(
data[column_name_clean],
categories))
categories,
self.regroup_name))

# change data to categorical
data.loc[:, column_name_clean] = (data[column_name_clean]
Expand Down Expand Up @@ -366,7 +367,7 @@ def _get_small_categories(predictor_series: pd.Series,

@staticmethod
def _replace_missings(data: pd.DataFrame,
column_names: Optional[list]=None) -> pd.DataFrame:
column_names: Optional[list] = None) -> pd.DataFrame:
"""Replace missing values (incl empty strings)

Parameters
Expand Down Expand Up @@ -398,23 +399,25 @@ def _replace_missings(data: pd.DataFrame,
@staticmethod
def _compute_p_value(X: pd.Series, y: pd.Series, category: str,
scale_contingency_table: bool) -> float:
"""Summary
"""Calculates p-value in contingency table (chi-square test) in
order to evaluate whether category of interest is significantly
different from the rest of the categories, given the target variable.

Parameters
----------
X : pd.Series
Description
Variables data.
y : pd.Series
Description
Target data.
category : str
Description
Category for which we carry out the test
scale_contingency_table : bool
Description
Whether we scale contingency table with incidence rate

Returns
-------
float
Description
p-value of chi-square test
"""
df = pd.concat([X, y], axis=1)
df["other_categories"] = np.where(X == category, 0, 1)
Expand All @@ -434,20 +437,24 @@ def _compute_p_value(X: pd.Series, y: pd.Series, category: str,
return stats.chi2_contingency(contigency_table, correction=False)[1]

@staticmethod
def _replace_categories(data: pd.Series, categories: set,
                        replace_with: str) -> pd.Series:
    """Replace categories NOT present in the given set with the
    ``replace_with`` label, and cast the kept categories to strings
    to avoid type errors later on in the pipeline.

    Parameters
    ----------
    data : pd.Series
        Series containing the categorical variable to be cleaned.
    categories : set
        Cleaned categories that are kept as-is (after casting to str).
    replace_with : str
        Replacement label for any category outside ``categories``
        (typically the processor's ``regroup_name``).

    Returns
    -------
    pd.Series
        Series in which every category outside ``categories`` has been
        replaced by ``replace_with``.
    """
    # Keep (and stringify) known categories; regroup everything else
    # under the replacement label.
    return data.apply(
        lambda x: str(x) if x in categories else replace_with)
47 changes: 24 additions & 23 deletions cobra/preprocessing/preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@
from datetime import datetime
import time

import logging
log = logging.getLogger(__name__)
# third party imports
import pandas as pd
from sklearn.model_selection import train_test_split
Expand All @@ -27,6 +25,9 @@
from cobra.preprocessing import TargetEncoder
from cobra.preprocessing import CategoricalDataProcessor

import logging
log = logging.getLogger(__name__)


class PreProcessor(BaseEstimator):

Expand Down Expand Up @@ -56,7 +57,7 @@ class PreProcessor(BaseEstimator):
def __init__(self, categorical_data_processor: CategoricalDataProcessor,
discretizer: KBinsDiscretizer,
target_encoder: TargetEncoder,
is_fitted: bool=False):
is_fitted: bool = False):

self._categorical_data_processor = categorical_data_processor
self._discretizer = discretizer
Expand All @@ -66,22 +67,22 @@ def __init__(self, categorical_data_processor: CategoricalDataProcessor,

@classmethod
def from_params(cls,
n_bins: int=10,
strategy: str="quantile",
closed: str="right",
auto_adapt_bins: bool=False,
starting_precision: int=0,
label_format: str="{} - {}",
change_endpoint_format: bool=False,
regroup: bool=True,
regroup_name: str="Other",
keep_missing: bool=True,
category_size_threshold: int=5,
p_value_threshold: float=0.001,
scale_contingency_table: bool=True,
forced_categories: dict={},
weight: float=0.0,
imputation_strategy: str="mean"):
n_bins: int = 10,
strategy: str = "quantile",
closed: str = "right",
auto_adapt_bins: bool = False,
starting_precision: int = 0,
label_format: str = "{} - {}",
change_endpoint_format: bool = False,
regroup: bool = True,
regroup_name: str = "Other",
keep_missing: bool = True,
category_size_threshold: int = 5,
p_value_threshold: float = 0.001,
scale_contingency_table: bool = True,
forced_categories: dict = {},
weight: float = 0.0,
imputation_strategy: str = "mean"):
"""Constructor to instantiate PreProcessor from all the parameters
that can be set in all its required (attribute) classes
along with good default values.
Expand Down Expand Up @@ -338,10 +339,10 @@ def fit_transform(self, train_data: pd.DataFrame, continuous_vars: list,
@staticmethod
def train_selection_validation_split(data: pd.DataFrame,
target_column_name: str,
train_prop: float=0.6,
selection_prop: float=0.2,
validation_prop: float=0.2,
stratify_split=True)->pd.DataFrame:
train_prop: float = 0.6,
selection_prop: float = 0.2,
validation_prop: float = 0.2,
stratify_split=True) -> pd.DataFrame:
"""Split dataset into train-selection-validation datasets and merge
them into one big DataFrame with an additional column "split"
indicating to which dataset the corresponding row belongs to.
Expand Down
134 changes: 133 additions & 1 deletion tests/preprocessing/test_categorical_data_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,138 @@ def test_replace_categories(self, cleaned_categories, expected):
data = pd.Series(data=["c1", "c2", "c3", "c4"])

actual = (CategoricalDataProcessor
._replace_categories(data, cleaned_categories))
._replace_categories(data, cleaned_categories, 'Other'))

pd.testing.assert_series_equal(actual, expected)

def test_all_cats_not_significant(self):
    """When no category passes the (very strict) p-value threshold,
    the processed column must simply mirror the raw input column."""
    raw_values = ['A'] * 4 + ['B'] * 4 + ['C'] * 4
    targets = [1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0]

    data = pd.DataFrame({'categorical_var': raw_values,
                         'target': targets})

    # No regrouping happens, so the processed column equals the input.
    expected = data.copy()
    expected['categorical_var_processed'] = raw_values

    processor = CategoricalDataProcessor(
        category_size_threshold=0,
        p_value_threshold=0.0001)

    processor.fit(data, ['categorical_var'], 'target')
    actual = processor.transform(data, ['categorical_var'])

    pd.testing.assert_frame_equal(actual, expected)

def test_regroup_name(self):
    """Regrouped categories must be labelled with the custom
    ``regroup_name`` instead of the default 'Other'."""
    raw_values = ['A'] * 6 + ['B'] * 6 + ['C'] * 6
    targets = [1] * 6 + [0] * 6 + [1, 0] * 3

    data = pd.DataFrame({'categorical_var': raw_values,
                         'target': targets})

    # 'C' is not significant and gets regrouped under the custom label.
    expected = data.copy()
    expected['categorical_var_processed'] = pd.Series(
        ['A'] * 6 + ['B'] * 6 + ['OTH'] * 6).astype("category")

    processor = CategoricalDataProcessor(
        category_size_threshold=0,
        regroup_name='OTH',
        p_value_threshold=0.05)

    processor.fit(data, ['categorical_var'], 'target')
    actual = processor.transform(data, ['categorical_var'])

    pd.testing.assert_frame_equal(actual, expected)

def test_force_category(self):
    """A forced category must survive regrouping even though it would
    otherwise fail the significance test and be regrouped."""
    raw_values = ['A'] * 6 + ['B'] * 6 + ['C'] * 6
    targets = [1] * 6 + [0] * 6 + [1, 0] * 3

    data = pd.DataFrame({'categorical_var': raw_values,
                         'target': targets})

    # 'C' is forced, so every category is kept unchanged.
    expected = data.copy()
    expected['categorical_var_processed'] = pd.Series(
        raw_values).astype("category")

    processor = CategoricalDataProcessor(
        category_size_threshold=0,
        forced_categories={'categorical_var': ['C']},
        p_value_threshold=0.05)

    processor.fit(data, ['categorical_var'], 'target')
    actual = processor.transform(data, ['categorical_var'])

    pd.testing.assert_frame_equal(actual, expected)