diff --git a/cobra/preprocessing/categorical_data_processor.py b/cobra/preprocessing/categorical_data_processor.py
index 68c41a3..bc6dc4a 100644
--- a/cobra/preprocessing/categorical_data_processor.py
+++ b/cobra/preprocessing/categorical_data_processor.py
@@ -18,9 +18,6 @@ import re
 from typing import Optional
 
-import logging
-log = logging.getLogger(__name__)
-
 
 # third party imports
 import numpy as np
 import pandas as pd
@@ -29,6 +26,9 @@ from sklearn.base import BaseEstimator
 from sklearn.exceptions import NotFittedError
 
+import logging
+log = logging.getLogger(__name__)
+
 
 class CategoricalDataProcessor(BaseEstimator):
 
     """
@@ -58,12 +58,12 @@ class CategoricalDataProcessor(BaseEstimator):
                   "category_size_threshold", "p_value_threshold",
                   "scale_contingency_table", "forced_categories"]
 
-    def __init__(self, regroup: bool=True, regroup_name: str="Other",
-                 keep_missing: bool=True,
-                 category_size_threshold: int=5,
-                 p_value_threshold: float=0.001,
-                 scale_contingency_table: bool=True,
-                 forced_categories: dict={}):
+    def __init__(self, regroup: bool = True, regroup_name: str = "Other",
+                 keep_missing: bool = True,
+                 category_size_threshold: int = 5,
+                 p_value_threshold: float = 0.001,
+                 scale_contingency_table: bool = True,
+                 forced_categories: dict = {}):
 
         self.regroup = regroup
         self.regroup_name = regroup_name
@@ -305,7 +305,8 @@ def _transform_column(self, data: pd.DataFrame,
         data.loc[:, column_name_clean] = (CategoricalDataProcessor
                                           ._replace_categories(
                                               data[column_name_clean],
-                                              categories))
+                                              categories,
+                                              self.regroup_name))
 
         # change data to categorical
         data.loc[:, column_name_clean] = (data[column_name_clean]
@@ -366,7 +367,7 @@ def _get_small_categories(predictor_series: pd.Series,
     @staticmethod
     def _replace_missings(data: pd.DataFrame,
-                          column_names: Optional[list]=None) -> pd.DataFrame:
+                          column_names: Optional[list] = None) -> pd.DataFrame:
         """Replace missing values (incl empty strings)
 
         Parameters
         ----------
@@ -398,23 +399,25 @@ def _replace_missings(data: pd.DataFrame,
     @staticmethod
     def _compute_p_value(X: pd.Series, y: pd.Series, category: str,
                          scale_contingency_table: bool) -> float:
-        """Summary
+        """Calculates the p-value of a chi-square test on a contingency table,
+        to evaluate whether the category of interest differs significantly
+        from the rest of the categories, given the target variable.
 
         Parameters
        ----------
         X : pd.Series
-            Description
+            Data of the categorical variable.
         y : pd.Series
-            Description
+            Target data.
         category : str
-            Description
+            Category for which the test is carried out.
         scale_contingency_table : bool
-            Description
+            Whether to scale the contingency table with the incidence rate.
 
         Returns
         -------
         float
-            Description
+            p-value of the chi-square test.
         """
         df = pd.concat([X, y], axis=1)
         df["other_categories"] = np.where(X == category, 0, 1)
@@ -434,20 +437,24 @@ def _compute_p_value(X: pd.Series, y: pd.Series, category: str,
         return stats.chi2_contingency(contigency_table, correction=False)[1]
 
     @staticmethod
-    def _replace_categories(data: pd.Series, categories: set) -> pd.Series:
+    def _replace_categories(data: pd.Series, categories: set,
+                            replace_with: str) -> pd.Series:
         """replace categories in set with "Other" and transform the
         remaining categories to strings to avoid type errors later
         on in the pipeline
 
         Parameters
         ----------
         data : pd.Series
-            Description
+            Series with the categories to be replaced.
         categories : set
-            Description
+            Cleaned categories.
+        replace_with: str
+            String to be used as replacement for regrouped categories.
 
         Returns
         -------
         pd.Series
-            Description
+            Series with replaced categories
         """
-        return data.apply(lambda x: str(x) if x in categories else "Other")
+        return data.apply(
+            lambda x: str(x) if x in categories else replace_with)
diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py
index 7061d2a..e185ba6 100644
--- a/cobra/preprocessing/preprocessor.py
+++ b/cobra/preprocessing/preprocessor.py
@@ -15,8 +15,6 @@ from datetime import datetime
 import time
 
-import logging
-log = logging.getLogger(__name__)
 
 # third party imports
 import pandas as pd
 from sklearn.model_selection import train_test_split
@@ -27,6 +25,9 @@ from cobra.preprocessing import TargetEncoder
 from cobra.preprocessing import CategoricalDataProcessor
 
+import logging
+log = logging.getLogger(__name__)
+
 
 class PreProcessor(BaseEstimator):
 
@@ -56,7 +57,7 @@ class PreProcessor(BaseEstimator):
 
     def __init__(self, categorical_data_processor: CategoricalDataProcessor,
                  discretizer: KBinsDiscretizer, target_encoder: TargetEncoder,
-                 is_fitted: bool=False):
+                 is_fitted: bool = False):
 
         self._categorical_data_processor = categorical_data_processor
         self._discretizer = discretizer
@@ -66,22 +67,22 @@ def __init__(self, categorical_data_processor: CategoricalDataProcessor,
 
     @classmethod
     def from_params(cls,
-                    n_bins: int=10,
-                    strategy: str="quantile",
-                    closed: str="right",
-                    auto_adapt_bins: bool=False,
-                    starting_precision: int=0,
-                    label_format: str="{} - {}",
-                    change_endpoint_format: bool=False,
-                    regroup: bool=True,
-                    regroup_name: str="Other",
-                    keep_missing: bool=True,
-                    category_size_threshold: int=5,
-                    p_value_threshold: float=0.001,
-                    scale_contingency_table: bool=True,
-                    forced_categories: dict={},
-                    weight: float=0.0,
-                    imputation_strategy: str="mean"):
+                    n_bins: int = 10,
+                    strategy: str = "quantile",
+                    closed: str = "right",
+                    auto_adapt_bins: bool = False,
+                    starting_precision: int = 0,
+                    label_format: str = "{} - {}",
+                    change_endpoint_format: bool = False,
+                    regroup: bool = True,
+                    regroup_name: str = "Other",
+                    keep_missing: bool = True,
+                    category_size_threshold: int = 5,
+                    p_value_threshold: float = 0.001,
+                    scale_contingency_table: bool = True,
+                    forced_categories: dict = {},
+                    weight: float = 0.0,
+                    imputation_strategy: str = "mean"):
         """Constructor to instantiate PreProcessor from all the parameters
         that can be set in all its required (attribute) classes
         along with good default values.
@@ -338,10 +339,10 @@ def fit_transform(self, train_data: pd.DataFrame, continuous_vars: list,
     @staticmethod
     def train_selection_validation_split(data: pd.DataFrame,
                                          target_column_name: str,
-                                         train_prop: float=0.6,
-                                         selection_prop: float=0.2,
-                                         validation_prop: float=0.2,
-                                         stratify_split=True)->pd.DataFrame:
+                                         train_prop: float = 0.6,
+                                         selection_prop: float = 0.2,
+                                         validation_prop: float = 0.2,
+                                         stratify_split=True) -> pd.DataFrame:
         """Split dataset into train-selection-validation datasets and merge
         them into one big DataFrame with an additional column "split"
         indicating to which dataset the corresponding row belongs to.
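
Note on the change above: _replace_categories no longer hard-codes "Other"; the grouping label is passed in as replace_with, and _transform_column forwards self.regroup_name. A minimal sketch of the new call follows; the toy series and category set are invented for illustration and are not part of this diff.

import pandas as pd

from cobra.preprocessing import CategoricalDataProcessor

# Toy example: "c3" and "c4" are not in the kept set, so they are mapped
# to the label passed as replace_with instead of a hard-coded "Other".
series = pd.Series(["c1", "c2", "c3", "c4"])
kept_categories = {"c1", "c2"}

# Direct call to the (private) static helper with the new signature.
relabelled = CategoricalDataProcessor._replace_categories(
    series, kept_categories, "OTH")

print(relabelled.tolist())  # ['c1', 'c2', 'OTH', 'OTH']
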
diff --git a/tests/preprocessing/test_categorical_data_processor.py b/tests/preprocessing/test_categorical_data_processor.py
index 95ebc56..953cdde 100644
--- a/tests/preprocessing/test_categorical_data_processor.py
+++ b/tests/preprocessing/test_categorical_data_processor.py
@@ -116,6 +116,138 @@ def test_replace_categories(self, cleaned_categories, expected):
         data = pd.Series(data=["c1", "c2", "c3", "c4"])
 
         actual = (CategoricalDataProcessor
-                  ._replace_categories(data, cleaned_categories))
+                  ._replace_categories(data, cleaned_categories, 'Other'))
 
         pd.testing.assert_series_equal(actual, expected)
+
+    def test_all_cats_not_significant(self):
+        # Expected
+        e = {'categorical_var': ['A', 'A', 'A', 'A',
+                                 'B', 'B', 'B', 'B',
+                                 'C', 'C', 'C', 'C'],
+             'target': [1, 1, 1, 1,
+                        0, 0, 0, 0,
+                        1, 0, 1, 0],
+             'categorical_var_processed': ['A', 'A', 'A', 'A',
+                                           'B', 'B', 'B', 'B',
+                                           'C', 'C', 'C', 'C']}
+
+        # data -> actual
+        d = {'categorical_var': ['A', 'A', 'A', 'A',
+                                 'B', 'B', 'B', 'B',
+                                 'C', 'C', 'C', 'C'],
+             'target': [1, 1, 1, 1,
+                        0, 0, 0, 0,
+                        1, 0, 1, 0]}
+
+        discrete_vars = ['categorical_var']
+        target_column_name = 'target'
+
+        data = pd.DataFrame(d, columns=['categorical_var', 'target'])
+        expected = pd.DataFrame(e, columns=['categorical_var',
+                                            'target',
+                                            'categorical_var_processed'])
+
+        categorical_data_processor = CategoricalDataProcessor(
+            category_size_threshold=0,
+            p_value_threshold=0.0001)
+
+        categorical_data_processor.fit(data,
+                                       discrete_vars,
+                                       target_column_name)
+
+        actual = categorical_data_processor.transform(data,
+                                                      discrete_vars)
+
+        pd.testing.assert_frame_equal(actual, expected)
+
+    def test_regroup_name(self):
+        # Expected
+        e = {'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A',
+                                 'B', 'B', 'B', 'B', 'B', 'B',
+                                 'C', 'C', 'C', 'C', 'C', 'C'],
+             'target': [1, 1, 1, 1, 1, 1,
+                        0, 0, 0, 0, 0, 0,
+                        1, 0, 1, 0, 1, 0],
+             'categorical_var_processed': [
+                 'A', 'A', 'A', 'A', 'A', 'A',
+                 'B', 'B', 'B', 'B', 'B', 'B',
+                 'OTH', 'OTH', 'OTH', 'OTH', 'OTH', 'OTH']}
+
+        # data -> actual
+        d = {'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A',
+                                 'B', 'B', 'B', 'B', 'B', 'B',
+                                 'C', 'C', 'C', 'C', 'C', 'C'],
+             'target': [1, 1, 1, 1, 1, 1,
+                        0, 0, 0, 0, 0, 0,
+                        1, 0, 1, 0, 1, 0]}
+
+        discrete_vars = ['categorical_var']
+        target_column_name = 'target'
+
+        data = pd.DataFrame(d, columns=['categorical_var', 'target'])
+        expected = pd.DataFrame(e, columns=['categorical_var',
+                                            'target',
+                                            'categorical_var_processed'])
+
+        expected['categorical_var_processed'] = (
+            expected['categorical_var_processed'].astype("category"))
+
+        categorical_data_processor = CategoricalDataProcessor(
+            category_size_threshold=0,
+            regroup_name='OTH',
+            p_value_threshold=0.05)
+
+        categorical_data_processor.fit(data,
+                                       discrete_vars,
+                                       target_column_name)
+
+        actual = categorical_data_processor.transform(data,
+                                                      discrete_vars)
+
+        pd.testing.assert_frame_equal(actual, expected)
+
+    def test_force_category(self):
+        # Expected
+        e = {'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A',
+                                 'B', 'B', 'B', 'B', 'B', 'B',
+                                 'C', 'C', 'C', 'C', 'C', 'C'],
+             'target': [1, 1, 1, 1, 1, 1,
+                        0, 0, 0, 0, 0, 0,
+                        1, 0, 1, 0, 1, 0],
+             'categorical_var_processed': ['A', 'A', 'A', 'A', 'A', 'A',
+                                           'B', 'B', 'B', 'B', 'B', 'B',
+                                           'C', 'C', 'C', 'C', 'C', 'C']}
+
+        # data -> actual
+        d = {'categorical_var': ['A', 'A', 'A', 'A', 'A', 'A',
+                                 'B', 'B', 'B', 'B', 'B', 'B',
+                                 'C', 'C', 'C', 'C', 'C', 'C'],
+             'target': [1, 1, 1, 1, 1, 1,
+                        0, 0, 0, 0, 0, 0,
+                        1, 0, 1, 0, 1, 0]}
+
+        discrete_vars = ['categorical_var']
+        target_column_name = 'target'
+
+        data = pd.DataFrame(d, columns=['categorical_var', 'target'])
+        expected = pd.DataFrame(e, columns=['categorical_var',
+                                            'target',
+                                            'categorical_var_processed'])
+
+        expected['categorical_var_processed'] = (
+            expected['categorical_var_processed'].astype("category"))
+
+        categorical_data_processor = CategoricalDataProcessor(
+            category_size_threshold=0,
+            forced_categories={'categorical_var': ['C']},
+            p_value_threshold=0.05)
+
+        categorical_data_processor.fit(data,
+                                       discrete_vars,
+                                       target_column_name)
+
+        actual = categorical_data_processor.transform(data,
+                                                      discrete_vars)
+
+        pd.testing.assert_frame_equal(actual, expected)
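
The tests above exercise the new behaviour through the public fit/transform API. For reference, a minimal usage sketch of the regroup_name and forced_categories options together; the toy data below is invented here, mirroring the test fixtures, and is not part of this diff.

import pandas as pd

from cobra.preprocessing import CategoricalDataProcessor

# Toy data, mirroring the fixtures above: "C" is the non-significant category.
data = pd.DataFrame({
    "categorical_var": ["A"] * 6 + ["B"] * 6 + ["C"] * 6,
    "target": [1] * 6 + [0] * 6 + [1, 0, 1, 0, 1, 0],
})

processor = CategoricalDataProcessor(
    category_size_threshold=0,
    regroup_name="OTH",                            # label used instead of "Other"
    forced_categories={"categorical_var": ["C"]},  # keep "C" even if not significant
    p_value_threshold=0.05)

processor.fit(data, ["categorical_var"], "target")
result = processor.transform(data, ["categorical_var"])

# With forced_categories, "C" is kept as-is; drop that argument and "C" is
# regrouped into "OTH", as in test_regroup_name above.
print(result["categorical_var_processed"].unique())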