diff --git a/cobra/preprocessing/preprocessor.py b/cobra/preprocessing/preprocessor.py index e0e01cc..e80f798 100644 --- a/cobra/preprocessing/preprocessor.py +++ b/cobra/preprocessing/preprocessor.py @@ -223,6 +223,54 @@ def from_pipeline(cls, pipeline: dict): target_encoder, is_fitted=pipeline["_is_fitted"], ) + + def get_continous_and_discreate_columns( + self, + df : pd.DataFrame, + id_col_name : str, + target_column_name :str + ) -> tuple: + """Filters the continuous and discrete variables out of a dataframe and returns a tuple containing lists of column names + It assumes that numerical columns with less than or equal to 10 different values are categorical + + Parameters + ---------- + df : pd.DataFrame + DataFrame that you want to divide into discrete and continuous variables + id_col_name : str + column name of the id column, can be None + target_column_name : str + column name of the target column + + Returns + ------- + tuple + tuple containing 2 lists of column names. (continuous_vars, discrete_vars) + """ + if id_col_name == None: + log.warning("id_col_name is equal to None. 
If there is no id column ignore this warning") + + # find continuous_vars and discrete_vars in the dataframe + col_dtypes = df.dtypes + discrete_vars = [col for col in col_dtypes[col_dtypes==object].index.tolist() if col not in [id_col_name, target_column_name]] + + + for col in df.columns: + if col not in discrete_vars and col not in [id_col_name, target_column_name]: # omit discrete because a string, and target + val_counts = df[col].nunique() + if val_counts > 1 and val_counts <= 10: # the column contains at most 10 different values + discrete_vars.append(col) + + continuous_vars = list(set(df.columns) + - set(discrete_vars) + - set([id_col_name, target_column_name])) + log.warning( + f"""Cobra automaticaly assumes that following variables are + discrete: {discrete_vars} + continuous: {continuous_vars} + If you want to change this behaviour you can specify the discrete/continuous variables yourself with the continuous_vars and discrete_vars keywords. \nIt assumes that numerical comumns with less than or equal to 10 different values are categorical""" + ) + return continuous_vars, discrete_vars def fit( self, @@ -230,21 +278,32 @@ def fit( continuous_vars: list, discrete_vars: list, target_column_name: str, + id_col_name: str = None ): """Fit the data to the preprocessing pipeline. + If you put continuous_vars and discrete_vars equal to `None` and give the id_col_name, Cobra will guess which variables are continuous and which are not Parameters ---------- train_data : pd.DataFrame Data to be preprocessed. - continuous_vars : list - List of continuous variables. - discrete_vars : list - List of discrete variables. + continuous_vars : list | None + List of continuous variables, can be None. + discrete_vars : list | None + List of discrete variables, can be None. target_column_name : str Column name of the target. 
+ id_col_name : str, optional + column name of the id column, by default None """ + if not (continuous_vars and discrete_vars): + continuous_vars, discrete_vars = self.get_continous_and_discreate_columns( + df=train_data, + id_col_name=id_col_name, + target_column_name=target_column_name + ) + # get list of all variables preprocessed_variable_names = PreProcessor._get_variable_list( continuous_vars, discrete_vars @@ -359,27 +418,38 @@ def fit_transform( continuous_vars: list, discrete_vars: list, target_column_name: str, + id_col_name: str = None ) -> pd.DataFrame: + """Fit preprocessing pipeline and transform the data. + If you put continuous_vars and discrete_vars equal to `None` and give the id_col_name, Cobra will guess which variables are continuous and which are not Parameters ---------- train_data : pd.DataFrame Data to be preprocessed continuous_vars : list - List of continuous variables. + List of continuous variables, can be None. discrete_vars : list - List of discrete variables. + List of discrete variables, can be None. target_column_name : str Column name of the target. + id_col_name : str, optional + column name of the id column, by default None Returns ------- pd.DataFrame Transformed (preprocessed) data. 
""" + if not (continuous_vars and discrete_vars) and id_col_name: + continuous_vars, discrete_vars = self.get_continous_and_discreate_columns( + df=train_data, + id_col_name=id_col_name, + target_column_name=target_column_name - self.fit(train_data, continuous_vars, discrete_vars, target_column_name) + ) + self.fit(train_data, continuous_vars, discrete_vars, target_column_name, id_col_name) return self.transform(train_data, continuous_vars, discrete_vars) diff --git a/requirements.txt b/requirements.txt index f2c226f..054b9fe 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,7 +1,7 @@ numpy>=1.19.4 pandas>=1.1.5 scipy>=1.5.4 -scikit-learn>=0.24.1 +scikit-learn>=1.2.0 matplotlib>=3.4.3 seaborn>=0.11.0 tqdm>=4.62.2 \ No newline at end of file diff --git a/tests/model_building/test_models.py b/tests/model_building/test_models.py index 4e4db78..7eca6e6 100644 --- a/tests/model_building/test_models.py +++ b/tests/model_building/test_models.py @@ -225,7 +225,6 @@ def test_serialize(self): "copy_X": True, "fit_intercept": True, "n_jobs": None, - "normalize": "deprecated", "positive": False } } @@ -244,7 +243,6 @@ def test_deserialize(self): "copy_X": True, "fit_intercept": True, "n_jobs": None, - "normalize": "deprecated", "positive": False }, "coef_": [[0.5, 0.75]], diff --git a/tests/preprocessing/test_preprocessor.py b/tests/preprocessing/test_preprocessor.py index 1239e50..cd2f43b 100644 --- a/tests/preprocessing/test_preprocessor.py +++ b/tests/preprocessing/test_preprocessor.py @@ -160,6 +160,95 @@ def test_get_variable_list( assert actual == expected + @pytest.mark.parametrize( + ("input, expected"), + [ + # example 1 + ( + pd.DataFrame({ + "ID": list(range(20)), + "A": [1,2,3,4,5,6,7,8,9,9,8,9,8,9,6,5,6,6,9,8], + "B": ["Cat"] *5 + ["Dog"]*10 + ["Fish"]*5, + "C": [1,2,3,4,9,10,11,12,13,5,6,7,8,15,19,18,14,16,13,17], + "Target": [1]*2 + [0]*5 + [1]*3 + [0]*5 + [1]*5 + } + ), + pd.DataFrame({ + 'ID': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 
17, 18, 19], + 'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8], + 'B': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'], + 'C': [1, 2, 3, 4, 9, 10, 11, 12, 13, 5, 6, 7, 8, 15, 19, 18, 14, 16, 13, 17], + 'Target': [1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1], + 'C_bin': ['1.0 - 3.0','1.0 - 3.0','1.0 - 3.0','3.0 - 5.0','7.0 - 9.0','9.0 - 10.0','10.0 - 12.0','10.0 - 12.0','12.0 - 13.0','3.0 - 5.0','5.0 - 7.0','5.0 - 7.0','7.0 - 9.0','13.0 - 15.0','17.0 - 19.0','17.0 - 19.0','13.0 - 15.0','15.0 - 17.0','12.0 - 13.0','15.0 - 17.0'], + 'B_processed': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'], + 'A_processed': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8], + 'B_enc': [0.4,0.4,0.4,0.4,0.4,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,1.0,1.0,1.0,1.0,1.0], + 'A_enc': [1.0,1.0,0.0,0.0,0.5,0.5,0.0,0.5,0.6,0.6,0.5,0.6,0.5,0.6,0.5,0.5,0.5,0.5,0.6,0.5], + 'C_enc': [0.6666666666666666,0.6666666666666666,0.6666666666666666,0.5,0.0,0.0,0.5,0.5,1.0,0.5,0.0,0.0,0.0,0.5,0.5,0.5,0.5,1.0,1.0,1.0] + } + ), + ) + ] + ) + def test_fit_transform_without_id_col_name(self, input, expected): + + preprocessor = PreProcessor.from_params(model_type="classification") + + continuous_vars, discrete_vars = preprocessor.get_continous_and_discreate_columns(input, "ID","Target") + + calculated = preprocessor.fit_transform( + input, + continuous_vars=continuous_vars, + discrete_vars=discrete_vars, + target_column_name="Target" + ) + pd.testing.assert_frame_equal(calculated, expected, check_dtype=False, check_categorical=False) + + @pytest.mark.parametrize( + ("input, expected"), + [ + # example 1 + ( + pd.DataFrame({ + "ID": list(range(20)), + "A": [1,2,3,4,5,6,7,8,9,9,8,9,8,9,6,5,6,6,9,8], + "B": ["Cat"] *5 + ["Dog"]*10 + ["Fish"]*5, + "C": [1,2,3,4,9,10,11,12,13,5,6,7,8,15,19,18,14,16,13,17], + 
"Target": [1]*2 + [0]*5 + [1]*3 + [0]*5 + [1]*5 + } + ), + pd.DataFrame({ + 'ID': [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19], + 'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8], + 'B': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'], + 'C': [1, 2, 3, 4, 9, 10, 11, 12, 13, 5, 6, 7, 8, 15, 19, 18, 14, 16, 13, 17], + 'Target': [1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1], + 'C_bin': ['1.0 - 3.0','1.0 - 3.0','1.0 - 3.0','3.0 - 5.0','7.0 - 9.0','9.0 - 10.0','10.0 - 12.0','10.0 - 12.0','12.0 - 13.0','3.0 - 5.0','5.0 - 7.0','5.0 - 7.0','7.0 - 9.0','13.0 - 15.0','17.0 - 19.0','17.0 - 19.0','13.0 - 15.0','15.0 - 17.0','12.0 - 13.0','15.0 - 17.0'], + 'B_processed': ['Cat','Cat','Cat','Cat','Cat','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Dog','Fish','Fish','Fish','Fish','Fish'], + 'A_processed': [1, 2, 3, 4, 5, 6, 7, 8, 9, 9, 8, 9, 8, 9, 6, 5, 6, 6, 9, 8], + 'B_enc': [0.4,0.4,0.4,0.4,0.4,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,0.3,1.0,1.0,1.0,1.0,1.0], + 'A_enc': [1.0,1.0,0.0,0.0,0.5,0.5,0.0,0.5,0.6,0.6,0.5,0.6,0.5,0.6,0.5,0.5,0.5,0.5,0.6,0.5], + 'C_enc': [0.6666666666666666,0.6666666666666666,0.6666666666666666,0.5,0.0,0.0,0.5,0.5,1.0,0.5,0.0,0.0,0.0,0.5,0.5,0.5,0.5,1.0,1.0,1.0] + } + ), + ) + ] + ) + def test_fit_transform_with_id_col_name(self, input, expected): + + preprocessor = PreProcessor.from_params(model_type="classification") + + # continuous_vars, discrete_vars = preprocessor.get_continous_and_discreate_columns(input, "ID","Target") + + calculated = preprocessor.fit_transform( + input, + continuous_vars=None, + discrete_vars=None, + target_column_name="Target", + id_col_name="ID" + ) + pd.testing.assert_frame_equal(calculated, expected, check_dtype=False, check_categorical=False) + @staticmethod def mock_transform(df: pd.DataFrame, args): """Mock the transform method."""