Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
51 changes: 29 additions & 22 deletions cobra/preprocessing/categorical_data_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,6 @@
import re
from typing import Optional

import logging
log = logging.getLogger(__name__)

# third party imports
import numpy as np
import pandas as pd
Expand All @@ -29,6 +26,9 @@
from sklearn.base import BaseEstimator
from sklearn.exceptions import NotFittedError

import logging
log = logging.getLogger(__name__)


class CategoricalDataProcessor(BaseEstimator):
"""
Expand Down Expand Up @@ -58,12 +58,12 @@ class CategoricalDataProcessor(BaseEstimator):
"category_size_threshold", "p_value_threshold",
"scale_contingency_table", "forced_categories"]

def __init__(self, regroup: bool=True, regroup_name: str="Other",
keep_missing: bool=True,
category_size_threshold: int=5,
p_value_threshold: float=0.001,
scale_contingency_table: bool=True,
forced_categories: dict={}):
def __init__(self, regroup: bool = True, regroup_name: str = "Other",
keep_missing: bool = True,
category_size_threshold: int = 5,
p_value_threshold: float = 0.001,
scale_contingency_table: bool = True,
forced_categories: dict = {}):

self.regroup = regroup
self.regroup_name = regroup_name
Expand Down Expand Up @@ -305,7 +305,8 @@ def _transform_column(self, data: pd.DataFrame,
data.loc[:, column_name_clean] = (CategoricalDataProcessor
._replace_categories(
data[column_name_clean],
categories))
categories,
self.regroup_name))

# change data to categorical
data.loc[:, column_name_clean] = (data[column_name_clean]
Expand Down Expand Up @@ -366,7 +367,7 @@ def _get_small_categories(predictor_series: pd.Series,

@staticmethod
def _replace_missings(data: pd.DataFrame,
column_names: Optional[list]=None) -> pd.DataFrame:
column_names: Optional[list] = None) -> pd.DataFrame:
"""Replace missing values (incl empty strings)

Parameters
Expand Down Expand Up @@ -398,23 +399,25 @@ def _replace_missings(data: pd.DataFrame,
@staticmethod
def _compute_p_value(X: pd.Series, y: pd.Series, category: str,
scale_contingency_table: bool) -> float:
"""Summary
"""Calculates p-value in contingency table (chi-square test) in
order to evaluate whether category of interest is significantly
different from the rest of the categories, given the target variable.

Parameters
----------
X : pd.Series
Description
Variables data.
y : pd.Series
Description
Target data.
category : str
Description
Category for which we carry out the test
scale_contingency_table : bool
Description
Whether we scale contingency table with incidence rate

Returns
-------
float
Description
p-value of chi-square test
"""
df = pd.concat([X, y], axis=1)
df["other_categories"] = np.where(X == category, 0, 1)
Expand All @@ -434,20 +437,24 @@ def _compute_p_value(X: pd.Series, y: pd.Series, category: str,
return stats.chi2_contingency(contigency_table, correction=False)[1]

@staticmethod
def _replace_categories(data: pd.Series, categories: set,
                        replace_with: str) -> pd.Series:
    """Replace categories NOT present in the given set with the
    ``replace_with`` label, and cast the kept categories to strings
    to avoid type errors later on in the pipeline.

    Parameters
    ----------
    data : pd.Series
        Series containing the categorical variable to be cleaned.
    categories : set
        Cleaned categories that are kept as-is (after casting to str).
    replace_with : str
        Replacement label for any category outside ``categories``
        (typically the processor's ``regroup_name``).

    Returns
    -------
    pd.Series
        Series in which every category outside ``categories`` has been
        replaced by ``replace_with``.
    """
    # Keep (and stringify) known categories; regroup everything else
    # under the replacement label.
    return data.apply(
        lambda x: str(x) if x in categories else replace_with)
47 changes: 24 additions & 23 deletions cobra/preprocessing/preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@
from datetime import datetime
import time

import logging
log = logging.getLogger(__name__)
# third party imports
import pandas as pd
from sklearn.model_selection import train_test_split
Expand All @@ -27,6 +25,9 @@
from cobra.preprocessing import TargetEncoder
from cobra.preprocessing import CategoricalDataProcessor

import logging
log = logging.getLogger(__name__)


class PreProcessor(BaseEstimator):

Expand Down Expand Up @@ -56,7 +57,7 @@ class PreProcessor(BaseEstimator):
def __init__(self, categorical_data_processor: CategoricalDataProcessor,
discretizer: KBinsDiscretizer,
target_encoder: TargetEncoder,
is_fitted: bool=False):
is_fitted: bool = False):

self._categorical_data_processor = categorical_data_processor
self._discretizer = discretizer
Expand All @@ -66,22 +67,22 @@ def __init__(self, categorical_data_processor: CategoricalDataProcessor,

@classmethod
def from_params(cls,
n_bins: int=10,
strategy: str="quantile",
closed: str="right",
auto_adapt_bins: bool=False,
starting_precision: int=0,
label_format: str="{} - {}",
change_endpoint_format: bool=False,
regroup: bool=True,
regroup_name: str="Other",
keep_missing: bool=True,
category_size_threshold: int=5,
p_value_threshold: float=0.001,
scale_contingency_table: bool=True,
forced_categories: dict={},
weight: float=0.0,
imputation_strategy: str="mean"):
n_bins: int = 10,
strategy: str = "quantile",
closed: str = "right",
auto_adapt_bins: bool = False,
starting_precision: int = 0,
label_format: str = "{} - {}",
change_endpoint_format: bool = False,
regroup: bool = True,
regroup_name: str = "Other",
keep_missing: bool = True,
category_size_threshold: int = 5,
p_value_threshold: float = 0.001,
scale_contingency_table: bool = True,
forced_categories: dict = {},
weight: float = 0.0,
imputation_strategy: str = "mean"):
"""Constructor to instantiate PreProcessor from all the parameters
that can be set in all its required (attribute) classes
along with good default values.
Expand Down Expand Up @@ -338,10 +339,10 @@ def fit_transform(self, train_data: pd.DataFrame, continuous_vars: list,
@staticmethod
def train_selection_validation_split(data: pd.DataFrame,
target_column_name: str,
train_prop: float=0.6,
selection_prop: float=0.2,
validation_prop: float=0.2,
stratify_split=True)->pd.DataFrame:
train_prop: float = 0.6,
selection_prop: float = 0.2,
validation_prop: float = 0.2,
stratify_split=True) -> pd.DataFrame:
"""Split dataset into train-selection-validation datasets and merge
them into one big DataFrame with an additional column "split"
indicating to which dataset the corresponding row belongs to.
Expand Down
134 changes: 133 additions & 1 deletion tests/preprocessing/test_categorical_data_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,138 @@ def test_replace_categories(self, cleaned_categories, expected):
data = pd.Series(data=["c1", "c2", "c3", "c4"])

actual = (CategoricalDataProcessor
._replace_categories(data, cleaned_categories))
._replace_categories(data, cleaned_categories, 'Other'))

pd.testing.assert_series_equal(actual, expected)

def test_all_cats_not_significant(self):
    """When no category passes the (very strict) p-value threshold,
    the processed column must simply mirror the raw input column."""
    raw_values = ['A'] * 4 + ['B'] * 4 + ['C'] * 4
    targets = [1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0]

    data = pd.DataFrame({'categorical_var': raw_values,
                         'target': targets})

    # No regrouping happens, so the processed column equals the input.
    expected = data.copy()
    expected['categorical_var_processed'] = raw_values

    processor = CategoricalDataProcessor(
        category_size_threshold=0,
        p_value_threshold=0.0001)

    processor.fit(data, ['categorical_var'], 'target')
    actual = processor.transform(data, ['categorical_var'])

    pd.testing.assert_frame_equal(actual, expected)

def test_regroup_name(self):
    """Regrouped categories must be labelled with the custom
    ``regroup_name`` instead of the default 'Other'."""
    raw_values = ['A'] * 6 + ['B'] * 6 + ['C'] * 6
    targets = [1] * 6 + [0] * 6 + [1, 0] * 3

    data = pd.DataFrame({'categorical_var': raw_values,
                         'target': targets})

    # 'C' is not significant and gets regrouped under the custom label.
    expected = data.copy()
    expected['categorical_var_processed'] = pd.Series(
        ['A'] * 6 + ['B'] * 6 + ['OTH'] * 6).astype("category")

    processor = CategoricalDataProcessor(
        category_size_threshold=0,
        regroup_name='OTH',
        p_value_threshold=0.05)

    processor.fit(data, ['categorical_var'], 'target')
    actual = processor.transform(data, ['categorical_var'])

    pd.testing.assert_frame_equal(actual, expected)

def test_force_category(self):
    """A forced category must survive regrouping even though it would
    otherwise fail the significance test and be regrouped."""
    raw_values = ['A'] * 6 + ['B'] * 6 + ['C'] * 6
    targets = [1] * 6 + [0] * 6 + [1, 0] * 3

    data = pd.DataFrame({'categorical_var': raw_values,
                         'target': targets})

    # 'C' is forced, so every category is kept unchanged.
    expected = data.copy()
    expected['categorical_var_processed'] = pd.Series(
        raw_values).astype("category")

    processor = CategoricalDataProcessor(
        category_size_threshold=0,
        forced_categories={'categorical_var': ['C']},
        p_value_threshold=0.05)

    processor.fit(data, ['categorical_var'], 'target')
    actual = processor.transform(data, ['categorical_var'])

    pd.testing.assert_frame_equal(actual, expected)