From 353b9bccfc375d7b28bc2852d1b10046cb193978 Mon Sep 17 00:00:00 2001
From: Patrick Leonardy <patrick.leonardy@pythonpredictions.com>
Date: Fri, 2 Dec 2022 15:35:14 +0100
Subject: [PATCH 1/3] Defaults id_column to None for PIGs & tests

---
 cobra/evaluation/pigs_tables.py        | 44 ++++++++++++++++----------
 tests/preprocessing/test_pig_tables.py | 39 +++++++++++++++++++++++
 2 files changed, 66 insertions(+), 17 deletions(-)
 create mode 100644 tests/preprocessing/test_pig_tables.py

diff --git a/cobra/evaluation/pigs_tables.py b/cobra/evaluation/pigs_tables.py
index 4c58eaa..6cca2d0 100644
--- a/cobra/evaluation/pigs_tables.py
+++ b/cobra/evaluation/pigs_tables.py
@@ -8,9 +8,9 @@
 import cobra.utils as utils
 
 def generate_pig_tables(basetable: pd.DataFrame,
-                        id_column_name: str,
                         target_column_name: str,
-                        preprocessed_predictors: list) -> pd.DataFrame:
+                        preprocessed_predictors: list,
+                        id_column_name: str = None) -> pd.DataFrame:
     """Compute PIG tables for all predictors in preprocessed_predictors.
 
     The output is a DataFrame with columns ``variable``, ``label``,
@@ -20,35 +20,41 @@ def generate_pig_tables(basetable: pd.DataFrame,
     ----------
     basetable : pd.DataFrame
         Basetable to compute PIG tables from.
-    id_column_name : str
-        Name of the basetable column containing the IDs of the basetable rows
-        (e.g. customernumber).
     target_column_name : str
         Name of the basetable column containing the target values to predict.
     preprocessed_predictors: list
         List of basetable column names containing preprocessed predictors.
-
+    id_column_name : str, default=None
+        Name of the basetable column containing the IDs of the basetable rows
+        (e.g. customernumber). 
     Returns
     -------
     pd.DataFrame
         DataFrame containing a PIG table for all predictors.
     """
+
+    #check if there is a id-column and define no_predictor accordingly
+    if id_column_name == None:
+        no_predictor = [target_column_name]
+    else:
+        no_predictor = [id_column_name, target_column_name]
+    
+
     pigs = [
         compute_pig_table(basetable,
                           column_name,
                           target_column_name,
-                          id_column_name)
+                          )
         for column_name in sorted(preprocessed_predictors)
-        if column_name not in [id_column_name, target_column_name]
+        if column_name not in no_predictor
     ]
-    output = pd.concat(pigs)
+    output = pd.concat(pigs, ignore_index=True)
     return output
 
 
 def compute_pig_table(basetable: pd.DataFrame,
                       predictor_column_name: str,
-                      target_column_name: str,
-                      id_column_name: str) -> pd.DataFrame:
+                      target_column_name: str) -> pd.DataFrame:
     """Compute the PIG table of a given predictor for a given target.
 
     Parameters
@@ -59,8 +65,6 @@ def compute_pig_table(basetable: pd.DataFrame,
         Predictor name of which to compute the pig table.
     target_column_name : str
         Name of the target variable.
-    id_column_name : str
-        Name of the id column (used to count population size).
 
     Returns
     -------
@@ -72,12 +76,18 @@ def compute_pig_table(basetable: pd.DataFrame,
     # group by the binned variable, compute the incidence
     # (= mean of the target for the given bin) and compute the bin size
     # (e.g. COUNT(id_column_name)). After that, rename the columns
+
     res = (basetable.groupby(predictor_column_name)
-           .agg({target_column_name: "mean", id_column_name: "size"})
+           .agg(
+                avg_target = (target_column_name, "mean"),
+                pop_size = (target_column_name, "size")
+           )
            .reset_index()
-           .rename(columns={predictor_column_name: "label",
-                            target_column_name: "avg_target",
-                            id_column_name: "pop_size"}))
+           .rename(
+                columns={predictor_column_name: "label"}
+           )
+    )
+
 
     # add the column name to a variable column
     # add the average incidence
diff --git a/tests/preprocessing/test_pig_tables.py b/tests/preprocessing/test_pig_tables.py
new file mode 100644
index 0000000..33c6d82
--- /dev/null
+++ b/tests/preprocessing/test_pig_tables.py
@@ -0,0 +1,39 @@
+import pytest
+
+import pandas as pd
+from cobra.evaluation.pigs_tables import generate_pig_tables
+
+class TestPigTablesGeneration:
+
+    @pytest.mark.parametrize("id_col_name", [None, "col_id"]) # test None as this is the default value in generate pig tabels 
+    def test_col_id(self, id_col_name):
+        
+        # input
+        data = pd.DataFrame({
+            'col_id': [0, 1, 3, 4, 6],
+            'survived': [0, 1, 1, 0, 0],
+            'pclass': [3, 1, 1, 3, 1],
+            'sex': ['male', 'female', 'female', 'male', 'male'],
+            'age': [22.0, 38.0, 35.0, 35.0, 54.0]
+        })
+        target = "survived"
+        prep_col = ["pclass", "sex", "age"]
+        
+        # output
+        out = generate_pig_tables(
+            basetable= data,
+            target_column_name=target,
+            preprocessed_predictors=prep_col,
+            id_column_name=id_col_name
+        )
+        
+        # expected
+        expected = pd.DataFrame({
+            'variable': ['age', 'age', 'age', 'age', 'pclass', 'pclass', 'sex', 'sex'],
+            'label': [22.0, 35.0, 38.0, 54.0, 1, 3, 'female', 'male'],
+            'pop_size': [0.2, 0.4, 0.2, 0.2, 0.6, 0.4, 0.4, 0.6],
+            'global_avg_target': [0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4],
+            'avg_target': [0.0, 0.5, 1.0, 0.0, 0.6666666666666666, 0.0, 1.0, 0.0]
+        })
+
+        pd.testing.assert_frame_equal(out, expected)
\ No newline at end of file

From a9c21caa12c7a5afda01f22b1c16ac2d3e30a927 Mon Sep 17 00:00:00 2001
From: Patrick Leonardy <patrick.leonardy@pythonpredictions.com>
Date: Wed, 7 Dec 2022 11:47:09 +0100
Subject: [PATCH 2/3] added type hint for test_col_id

---
 tests/preprocessing/test_pig_tables.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tests/preprocessing/test_pig_tables.py b/tests/preprocessing/test_pig_tables.py
index 33c6d82..47af512 100644
--- a/tests/preprocessing/test_pig_tables.py
+++ b/tests/preprocessing/test_pig_tables.py
@@ -3,10 +3,12 @@
 import pandas as pd
 from cobra.evaluation.pigs_tables import generate_pig_tables
 
+from typing import Optional
+
 class TestPigTablesGeneration:
 
     @pytest.mark.parametrize("id_col_name", [None, "col_id"]) # test None as this is the default value in generate pig tabels 
-    def test_col_id(self, id_col_name):
+    def test_col_id(self, id_col_name: Optional[str]):
         
         # input
         data = pd.DataFrame({
@@ -36,4 +38,5 @@ def test_col_id(self, id_col_name):
             'avg_target': [0.0, 0.5, 1.0, 0.0, 0.6666666666666666, 0.0, 1.0, 0.0]
         })
 
-        pd.testing.assert_frame_equal(out, expected)
\ No newline at end of file
+        pd.testing.assert_frame_equal(out, expected)
+        
\ No newline at end of file

From 88205803549a72d288c106570a3646efff455e45 Mon Sep 17 00:00:00 2001
From: Patrick Leonardy <patrick.leonardy@pythonpredictions.com>
Date: Wed, 7 Dec 2022 12:05:50 +0100
Subject: [PATCH 3/3] Add new line at end of file, Format with black

---
 tests/preprocessing/test_pig_tables.py | 58 ++++++++++++++++----------
 1 file changed, 36 insertions(+), 22 deletions(-)

diff --git a/tests/preprocessing/test_pig_tables.py b/tests/preprocessing/test_pig_tables.py
index 47af512..3b1e6a7 100644
--- a/tests/preprocessing/test_pig_tables.py
+++ b/tests/preprocessing/test_pig_tables.py
@@ -5,38 +5,52 @@
 
 from typing import Optional
 
-class TestPigTablesGeneration:
 
-    @pytest.mark.parametrize("id_col_name", [None, "col_id"]) # test None as this is the default value in generate pig tabels 
+class TestPigTablesGeneration:
+    @pytest.mark.parametrize(
+        "id_col_name", [None, "col_id"]
+    )  # test None as this is the default value in generate pig tabels
     def test_col_id(self, id_col_name: Optional[str]):
-        
+
         # input
-        data = pd.DataFrame({
-            'col_id': [0, 1, 3, 4, 6],
-            'survived': [0, 1, 1, 0, 0],
-            'pclass': [3, 1, 1, 3, 1],
-            'sex': ['male', 'female', 'female', 'male', 'male'],
-            'age': [22.0, 38.0, 35.0, 35.0, 54.0]
-        })
+        data = pd.DataFrame(
+            {
+                "col_id": [0, 1, 3, 4, 6],
+                "survived": [0, 1, 1, 0, 0],
+                "pclass": [3, 1, 1, 3, 1],
+                "sex": ["male", "female", "female", "male", "male"],
+                "age": [22.0, 38.0, 35.0, 35.0, 54.0],
+            }
+        )
         target = "survived"
         prep_col = ["pclass", "sex", "age"]
-        
+
         # output
         out = generate_pig_tables(
-            basetable= data,
+            basetable=data,
             target_column_name=target,
             preprocessed_predictors=prep_col,
-            id_column_name=id_col_name
+            id_column_name=id_col_name,
         )
-        
+
         # expected
-        expected = pd.DataFrame({
-            'variable': ['age', 'age', 'age', 'age', 'pclass', 'pclass', 'sex', 'sex'],
-            'label': [22.0, 35.0, 38.0, 54.0, 1, 3, 'female', 'male'],
-            'pop_size': [0.2, 0.4, 0.2, 0.2, 0.6, 0.4, 0.4, 0.6],
-            'global_avg_target': [0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4],
-            'avg_target': [0.0, 0.5, 1.0, 0.0, 0.6666666666666666, 0.0, 1.0, 0.0]
-        })
+        expected = pd.DataFrame(
+            {
+                "variable": [
+                    "age",
+                    "age",
+                    "age",
+                    "age",
+                    "pclass",
+                    "pclass",
+                    "sex",
+                    "sex",
+                ],
+                "label": [22.0, 35.0, 38.0, 54.0, 1, 3, "female", "male"],
+                "pop_size": [0.2, 0.4, 0.2, 0.2, 0.6, 0.4, 0.4, 0.6],
+                "global_avg_target": [0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4],
+                "avg_target": [0.0, 0.5, 1.0, 0.0, 0.6666666666666666, 0.0, 1.0, 0.0],
+            }
+        )
 
         pd.testing.assert_frame_equal(out, expected)
-        
\ No newline at end of file