diff --git a/CHANGELOG.md b/CHANGELOG.md index 8e29ca37..27f3c106 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,9 +9,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased ### Added -- `CatalogCoverage` metric ([#266](https://github.com/MobileTeleSystems/RecTools/pull/266)) +- `CatalogCoverage` metric ([#266](https://github.com/MobileTeleSystems/RecTools/pull/266), [#267](https://github.com/MobileTeleSystems/RecTools/pull/267)) - `divide_by_achievable` argument to `NDCG` metric ([#266](https://github.com/MobileTeleSystems/RecTools/pull/266)) +### Changed +- Interactions extra columns are not dropped in `Dataset.filter_interactions` method [#267](https://github.com/MobileTeleSystems/RecTools/pull/267) + ## [0.11.0] - 17.02.2025 ### Added diff --git a/rectools/dataset/dataset.py b/rectools/dataset/dataset.py index 7e2942bc..6d7a7d52 100644 --- a/rectools/dataset/dataset.py +++ b/rectools/dataset/dataset.py @@ -401,7 +401,8 @@ def filter_interactions( # 1x internal -> 2x internal user_id_map = IdMap.from_values(interactions_df[Columns.User].values) item_id_map = IdMap.from_values(interactions_df[Columns.Item].values) - interactions = Interactions.from_raw(interactions_df, user_id_map, item_id_map) + # We shouldn't drop extra columns if they are present + interactions = Interactions.from_raw(interactions_df, user_id_map, item_id_map, keep_extra_cols=True) def _handle_features( features: tp.Optional[Features], target_id_map: IdMap, dataset_id_map: IdMap diff --git a/rectools/metrics/catalog.py b/rectools/metrics/catalog.py index f00e7969..31468413 100644 --- a/rectools/metrics/catalog.py +++ b/rectools/metrics/catalog.py @@ -16,6 +16,7 @@ import typing as tp +import attr import pandas as pd from rectools import Columns @@ -23,16 +24,21 @@ from .base import Catalog, MetricAtK +@attr.s class CatalogCoverage(MetricAtK): """ - Share of items in catalog that is present in recommendations for all users. 
+ Count (or share) of items from catalog that are present in recommendations for all users. Parameters ---------- k : int Number of items at the top of recommendations list that will be used to calculate metric. + normalize: bool, default ``False`` + Whether to normalize the metric by the catalog size. """ + normalize: bool = attr.ib(default=False) + def calc(self, reco: pd.DataFrame, catalog: Catalog) -> float: """ Calculate metric value. @@ -49,7 +55,10 @@ def calc(self, reco: pd.DataFrame, catalog: Catalog) -> float: float Value of metric (aggregated for all users). """ - return reco.loc[reco[Columns.Rank] <= self.k, Columns.Item].nunique() / len(catalog) + res = reco.loc[reco[Columns.Rank] <= self.k, Columns.Item].nunique() + if self.normalize: + return res / len(catalog) + return res CatalogMetric = CatalogCoverage diff --git a/rectools/metrics/ranking.py b/rectools/metrics/ranking.py index ea3518e1..5d44457e 100644 --- a/rectools/metrics/ranking.py +++ b/rectools/metrics/ranking.py @@ -314,28 +314,27 @@ class NDCG(_RankingMetric): r""" Normalized Discounted Cumulative Gain at k (NDCG@k). - Estimates relevance of recommendations taking in account their order. + Estimates relevance of recommendations taking into account their order. `"Discounted Gain"` + means that original item relevance is being discounted based on this + item's rank. The closer the item is to the top, the more gain is achieved. + `"Cumulative"` means that all items' discounted gains from ``k`` ranks are being summed. + `"Normalized"` means that the actual value of DCG is being divided by the `"Ideal DCG"` (IDCG). + This is the maximum possible value of `DCG@k`, used as normalization coefficient to ensure that + `NDCG@k` values lie in ``[0, 1]``. .. math:: NDCG@k=\frac{1}{|U|}\sum_{u \in U}\frac{DCG_u@k}{IDCG_u@k} + DCG_u@k = \sum_{i=1}^{k} \frac{rel_u(i)}{log(i + 1)} + where - - :math:`DCG_u@k` is "Discounted Cumulative Gain" at k for user u. 
- - `"Gain"` stands for relevance of item at position i to user. It equals to ``1`` if this item - is relevant, ``0`` otherwise - - `"Discounted Gain"` means that original item relevance is being discounted based on this - items rank. The closer is item to the top the, the more gain is achieved. - - `"Discounted Cumulative Gain"` means that discounted gains are summed together. - - :math:`IDCG_u@k` is `"Ideal Discounted Cumulative Gain"` at k for user u. This is maximum - possible value of `DCG@k`, used as normalization coefficient to ensure that `NDCG@k` - values lie in ``[0, 1]``. - - When `divide_by_achievable` is set to ``False`` (default) `IDCG_u@k` is the same value for all - users and is equal to: - :math:`IDCG_u@k = \sum_{i=1}^{k} \frac{1}{log(i + 1)}` - When `divide_by_achievable` is set to ``True``, the formula for IDCG depends - on number of each user relevant items in the test set. The formula is: - :math:`IDCG_u@k = \sum_{i=1}^{\min (|R(u)|, k)} \frac{1}{log(i + 1)}` + - :math:`IDCG_u@k = \sum_{i=1}^{k} \frac{1}{log(i + 1)}` when `divide_by_achievable` is set + to ``False`` (default). + - :math:`IDCG_u@k = \sum_{i=1}^{\min (|R(u)|, k)} \frac{1}{log(i + 1)}` when + `divide_by_achievable` is set to ``True``. + - :math:`rel_u(i)` is `"Gain"`. Here it is an indicator function: it equals ``1`` if the + item at rank ``i`` is relevant to user ``u``, ``0`` otherwise. + - :math:`|R(u)|` is the number of relevant (ground truth) items for user ``u``. 
Parameters ---------- diff --git a/tests/dataset/test_dataset.py b/tests/dataset/test_dataset.py index 2582d082..01de48a6 100644 --- a/tests/dataset/test_dataset.py +++ b/tests/dataset/test_dataset.py @@ -362,19 +362,19 @@ def dataset_to_filter(self) -> Dataset: user_id_map = IdMap.from_values([10, 11, 12, 13, 14]) df = pd.DataFrame( [ - [0, 0, 1, "2021-09-01"], - [4, 2, 1, "2021-09-02"], - [2, 1, 1, "2021-09-02"], - [2, 2, 1, "2021-09-03"], - [3, 2, 1, "2021-09-03"], - [3, 3, 1, "2021-09-03"], - [3, 4, 1, "2021-09-04"], - [1, 2, 1, "2021-09-04"], - [3, 1, 1, "2021-09-05"], - [4, 2, 1, "2021-09-05"], - [3, 3, 1, "2021-09-06"], + [0, 0, 1, "2021-09-01", 1], + [4, 2, 1, "2021-09-02", 1], + [2, 1, 1, "2021-09-02", 1], + [2, 2, 1, "2021-09-03", 1], + [3, 2, 1, "2021-09-03", 1], + [3, 3, 1, "2021-09-03", 1], + [3, 4, 1, "2021-09-04", 1], + [1, 2, 1, "2021-09-04", 1], + [3, 1, 1, "2021-09-05", 1], + [4, 2, 1, "2021-09-05", 1], + [3, 3, 1, "2021-09-06", 1], ], - columns=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime], + columns=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime, "extra"], ).astype({Columns.Datetime: "datetime64[ns]"}) interactions = Interactions(df) return Dataset(user_id_map, item_id_map, interactions) @@ -426,12 +426,12 @@ def test_filter_dataset_interactions_df_rows_without_features( ) expected_interactions_2x_internal_df = pd.DataFrame( [ - [0, 0, 1, "2021-09-01"], - [1, 1, 1, "2021-09-02"], - [2, 2, 1, "2021-09-02"], - [2, 1, 1, "2021-09-03"], + [0, 0, 1, "2021-09-01", 1], + [1, 1, 1, "2021-09-02", 1], + [2, 2, 1, "2021-09-02", 1], + [2, 1, 1, "2021-09-03", 1], ], - columns=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime], + columns=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime, "extra"], ).astype({Columns.Datetime: "datetime64[ns]", Columns.Weight: float}) np.testing.assert_equal(filtered_dataset.user_id_map.external_ids, expected_external_user_ids) 
np.testing.assert_equal(filtered_dataset.item_id_map.external_ids, expected_external_item_ids) @@ -464,12 +464,12 @@ def test_filter_dataset_interactions_df_rows_with_features( ) expected_interactions_2x_internal_df = pd.DataFrame( [ - [0, 0, 1, "2021-09-01"], - [1, 1, 1, "2021-09-02"], - [2, 2, 1, "2021-09-02"], - [2, 1, 1, "2021-09-03"], + [0, 0, 1, "2021-09-01", 1], + [1, 1, 1, "2021-09-02", 1], + [2, 2, 1, "2021-09-02", 1], + [2, 1, 1, "2021-09-03", 1], ], - columns=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime], + columns=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime, "extra"], ).astype({Columns.Datetime: "datetime64[ns]", Columns.Weight: float}) np.testing.assert_equal(filtered_dataset.user_id_map.external_ids, expected_external_user_ids) np.testing.assert_equal(filtered_dataset.item_id_map.external_ids, expected_external_item_ids) diff --git a/tests/metrics/test_catalog.py b/tests/metrics/test_catalog.py index 3a8e7f01..543b2433 100644 --- a/tests/metrics/test_catalog.py +++ b/tests/metrics/test_catalog.py @@ -16,6 +16,7 @@ import numpy as np import pandas as pd +import pytest from rectools import Columns from rectools.metrics import CatalogCoverage @@ -23,7 +24,6 @@ class TestCatalogCoverage: def setup_method(self) -> None: - self.metric = CatalogCoverage(k=2) self.reco = pd.DataFrame( { Columns.User: [1, 1, 1, 2, 2, 3, 4], @@ -32,7 +32,8 @@ def setup_method(self) -> None: } ) - def test_calc(self) -> None: + @pytest.mark.parametrize("normalize,expected", ((True, 0.4), (False, 2.0))) + def test_calc(self, normalize: bool, expected: float) -> None: catalog = np.arange(5) - expected = 0.4 - assert self.metric.calc(self.reco, catalog) == expected + metric = CatalogCoverage(k=2, normalize=normalize) + assert metric.calc(self.reco, catalog) == expected diff --git a/tests/metrics/test_scoring.py b/tests/metrics/test_scoring.py index 17dfe4c6..8366ea26 100644 --- a/tests/metrics/test_scoring.py +++ b/tests/metrics/test_scoring.py @@ 
-119,7 +119,7 @@ def test_success(self) -> None: "sufficient": SufficientReco(k=2), "unrepeated": UnrepeatedReco(k=2), "covered_users": CoveredUsers(k=2), - "catalog_coverage": CatalogCoverage(k=2), + "catalog_coverage": CatalogCoverage(k=2, normalize=True), } with pytest.warns(UserWarning, match="Custom metrics are not supported"): actual = calc_metrics(