5 changes: 4 additions & 1 deletion CHANGELOG.md
@@ -9,9 +9,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
## Unreleased

### Added
- `CatalogCoverage` metric ([#266](https://github.com/MobileTeleSystems/RecTools/pull/266))
- `CatalogCoverage` metric ([#266](https://github.com/MobileTeleSystems/RecTools/pull/266), [#267](https://github.com/MobileTeleSystems/RecTools/pull/267))
- `divide_by_achievable` argument to `NDCG` metric ([#266](https://github.com/MobileTeleSystems/RecTools/pull/266))

### Changed
- Extra columns in interactions are no longer dropped in `Dataset.filter_interactions` method ([#267](https://github.com/MobileTeleSystems/RecTools/pull/267))

## [0.11.0] - 17.02.2025

### Added
3 changes: 2 additions & 1 deletion rectools/dataset/dataset.py
@@ -401,7 +401,8 @@ def filter_interactions(
# 1x internal -> 2x internal
user_id_map = IdMap.from_values(interactions_df[Columns.User].values)
item_id_map = IdMap.from_values(interactions_df[Columns.Item].values)
interactions = Interactions.from_raw(interactions_df, user_id_map, item_id_map)
# We shouldn't drop extra columns if they are present
interactions = Interactions.from_raw(interactions_df, user_id_map, item_id_map, keep_extra_cols=True)

def _handle_features(
features: tp.Optional[Features], target_id_map: IdMap, dataset_id_map: IdMap
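A minimal sketch of the behaviour this hunk enables, mirroring the test fixture further down: the id values are illustrative, and the assumption that the first argument of `filter_interactions` is an array of interaction row indexes to keep is inferred from the tests rather than shown in this diff.

```python
import numpy as np
import pandas as pd

from rectools import Columns
from rectools.dataset import Dataset, IdMap, Interactions

# Interactions carry an "extra" column on top of the required ones.
user_id_map = IdMap.from_values([10, 11, 12])
item_id_map = IdMap.from_values([100, 101, 102])
df = pd.DataFrame(
    [
        [0, 0, 1, "2021-09-01", 1],
        [1, 1, 1, "2021-09-02", 1],
        [2, 2, 1, "2021-09-03", 1],
    ],
    columns=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime, "extra"],
).astype({Columns.Datetime: "datetime64[ns]"})
dataset = Dataset(user_id_map, item_id_map, Interactions(df))

# Keep only the first two interaction rows (assumed call signature).
filtered = dataset.filter_interactions(np.array([0, 1]))
assert "extra" in filtered.interactions.df.columns  # preserved after this change
```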
13 changes: 11 additions & 2 deletions rectools/metrics/catalog.py
@@ -16,23 +16,29 @@

import typing as tp

import attr
import pandas as pd

from rectools import Columns

from .base import Catalog, MetricAtK


@attr.s
class CatalogCoverage(MetricAtK):
"""
Share of items in catalog that is present in recommendations for all users.
Count (or share) of items from the catalog that are present in recommendations across all users.

Parameters
----------
k : int
Number of items at the top of recommendations list that will be used to calculate metric.
normalize : bool, default ``False``
Whether to divide the number of covered items by the catalog size.
If ``False``, the raw count of unique recommended items is returned.
"""

normalize: bool = attr.ib(default=False)

def calc(self, reco: pd.DataFrame, catalog: Catalog) -> float:
"""
Calculate metric value.
@@ -49,7 +55,10 @@ def calc(self, reco: pd.DataFrame, catalog: Catalog) -> float:
float
Value of metric (aggregated for all users).
"""
return reco.loc[reco[Columns.Rank] <= self.k, Columns.Item].nunique() / len(catalog)
res = reco.loc[reco[Columns.Rank] <= self.k, Columns.Item].nunique()
if self.normalize:
return res / len(catalog)
return res


CatalogMetric = CatalogCoverage
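To illustrate the new `normalize` flag, here is a small sketch. The item and rank values are made up (the original fixture values are collapsed in the diff above) and are chosen so that two distinct items appear within the top ``2`` ranks.

```python
import numpy as np
import pandas as pd

from rectools import Columns
from rectools.metrics import CatalogCoverage

reco = pd.DataFrame(
    {
        Columns.User: [1, 1, 1, 2, 2, 3, 4],
        Columns.Item: [1, 2, 3, 1, 2, 1, 1],
        Columns.Rank: [1, 2, 3, 1, 2, 1, 1],
    }
)
catalog = np.arange(5)  # 5 items in the catalog

# Without normalization the metric is the number of distinct items
# within the top-2 ranks across all users: {1, 2} -> 2
print(CatalogCoverage(k=2).calc(reco, catalog))

# With normalize=True it is divided by the catalog size: 2 / 5 = 0.4
print(CatalogCoverage(k=2, normalize=True).calc(reco, catalog))
```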
33 changes: 16 additions & 17 deletions rectools/metrics/ranking.py
@@ -314,28 +314,27 @@ class NDCG(_RankingMetric):
r"""
Normalized Discounted Cumulative Gain at k (NDCG@k).

Estimates relevance of recommendations taking in account their order.
Estimates relevance of recommendations taking into account their order. `"Discounted Gain"`
means that the original item relevance is discounted based on the item's rank:
the closer an item is to the top, the more gain is achieved.
`"Cumulative"` means that the discounted gains of all items from the top ``k`` ranks are summed.
`"Normalized"` means that the actual DCG value is divided by the `"Ideal DCG"` (IDCG).
This is the maximum possible value of `DCG@k`, used as a normalization coefficient to ensure that
`NDCG@k` values lie in ``[0, 1]``.

.. math::
NDCG@k=\frac{1}{|U|}\sum_{u \in U}\frac{DCG_u@k}{IDCG_u@k}

DCG_u@k = \sum_{i=1}^{k} \frac{rel_u(i)}{log(i + 1)}

where
- :math:`DCG_u@k` is "Discounted Cumulative Gain" at k for user u.
- `"Gain"` stands for relevance of item at position i to user. It equals to ``1`` if this item
is relevant, ``0`` otherwise
- `"Discounted Gain"` means that original item relevance is being discounted based on this
items rank. The closer is item to the top the, the more gain is achieved.
- `"Discounted Cumulative Gain"` means that discounted gains are summed together.
- :math:`IDCG_u@k` is `"Ideal Discounted Cumulative Gain"` at k for user u. This is maximum
possible value of `DCG@k`, used as normalization coefficient to ensure that `NDCG@k`
values lie in ``[0, 1]``.

When `divide_by_achievable` is set to ``False`` (default) `IDCG_u@k` is the same value for all
users and is equal to:
:math:`IDCG_u@k = \sum_{i=1}^{k} \frac{1}{log(i + 1)}`
When `divide_by_achievable` is set to ``True``, the formula for IDCG depends
on number of each user relevant items in the test set. The formula is:
:math:`IDCG_u@k = \sum_{i=1}^{\min (|R(u)|, k)} \frac{1}{log(i + 1)}`
- :math:`IDCG_u@k = \sum_{i=1}^{k} \frac{1}{log(i + 1)}` when `divide_by_achievable` is set
to ``False`` (default).
- :math:`IDCG_u@k = \sum_{i=1}^{\min (|R(u)|, k)} \frac{1}{log(i + 1)}` when
`divide_by_achievable` is set to ``True``.
- :math:`rel_u(i)` is `"Gain"`. Here it is an indicator function: it equals ``1`` if the
item at rank ``i`` is relevant to user ``u`` and ``0`` otherwise.
- :math:`|R(u)|` is the number of relevant (ground truth) items for user ``u``.

Parameters
----------
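A short sketch of the new `divide_by_achievable` argument in use. The reco and ground-truth values are invented for illustration, and the `calc(reco, interactions)` call signature is assumed from the other ranking metrics rather than shown in this diff.

```python
import pandas as pd

from rectools import Columns
from rectools.metrics import NDCG

reco = pd.DataFrame(
    {
        Columns.User: [1, 1, 1],
        Columns.Item: [7, 8, 9],
        Columns.Rank: [1, 2, 3],
    }
)
# The user has a single relevant item, recommended at rank 2.
interactions = pd.DataFrame({Columns.User: [1], Columns.Item: [8]})

# Default behaviour: IDCG sums discounts over all k positions,
# so a user with only one relevant item can never reach NDCG@k = 1.
print(NDCG(k=3).calc(reco, interactions))

# With divide_by_achievable=True IDCG sums over min(|R(u)|, k) = 1 position,
# so placing the single relevant item at rank 1 would give exactly 1.0.
print(NDCG(k=3, divide_by_achievable=True).calc(reco, interactions))
```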
44 changes: 22 additions & 22 deletions tests/dataset/test_dataset.py
@@ -362,19 +362,19 @@ def dataset_to_filter(self) -> Dataset:
user_id_map = IdMap.from_values([10, 11, 12, 13, 14])
df = pd.DataFrame(
[
[0, 0, 1, "2021-09-01"],
[4, 2, 1, "2021-09-02"],
[2, 1, 1, "2021-09-02"],
[2, 2, 1, "2021-09-03"],
[3, 2, 1, "2021-09-03"],
[3, 3, 1, "2021-09-03"],
[3, 4, 1, "2021-09-04"],
[1, 2, 1, "2021-09-04"],
[3, 1, 1, "2021-09-05"],
[4, 2, 1, "2021-09-05"],
[3, 3, 1, "2021-09-06"],
[0, 0, 1, "2021-09-01", 1],
[4, 2, 1, "2021-09-02", 1],
[2, 1, 1, "2021-09-02", 1],
[2, 2, 1, "2021-09-03", 1],
[3, 2, 1, "2021-09-03", 1],
[3, 3, 1, "2021-09-03", 1],
[3, 4, 1, "2021-09-04", 1],
[1, 2, 1, "2021-09-04", 1],
[3, 1, 1, "2021-09-05", 1],
[4, 2, 1, "2021-09-05", 1],
[3, 3, 1, "2021-09-06", 1],
],
columns=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime],
columns=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime, "extra"],
).astype({Columns.Datetime: "datetime64[ns]"})
interactions = Interactions(df)
return Dataset(user_id_map, item_id_map, interactions)
@@ -426,12 +426,12 @@ def test_filter_dataset_interactions_df_rows_without_features(
)
expected_interactions_2x_internal_df = pd.DataFrame(
[
[0, 0, 1, "2021-09-01"],
[1, 1, 1, "2021-09-02"],
[2, 2, 1, "2021-09-02"],
[2, 1, 1, "2021-09-03"],
[0, 0, 1, "2021-09-01", 1],
[1, 1, 1, "2021-09-02", 1],
[2, 2, 1, "2021-09-02", 1],
[2, 1, 1, "2021-09-03", 1],
],
columns=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime],
columns=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime, "extra"],
).astype({Columns.Datetime: "datetime64[ns]", Columns.Weight: float})
np.testing.assert_equal(filtered_dataset.user_id_map.external_ids, expected_external_user_ids)
np.testing.assert_equal(filtered_dataset.item_id_map.external_ids, expected_external_item_ids)
@@ -464,12 +464,12 @@ def test_filter_dataset_interactions_df_rows_with_features(
)
expected_interactions_2x_internal_df = pd.DataFrame(
[
[0, 0, 1, "2021-09-01"],
[1, 1, 1, "2021-09-02"],
[2, 2, 1, "2021-09-02"],
[2, 1, 1, "2021-09-03"],
[0, 0, 1, "2021-09-01", 1],
[1, 1, 1, "2021-09-02", 1],
[2, 2, 1, "2021-09-02", 1],
[2, 1, 1, "2021-09-03", 1],
],
columns=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime],
columns=[Columns.User, Columns.Item, Columns.Weight, Columns.Datetime, "extra"],
).astype({Columns.Datetime: "datetime64[ns]", Columns.Weight: float})
np.testing.assert_equal(filtered_dataset.user_id_map.external_ids, expected_external_user_ids)
np.testing.assert_equal(filtered_dataset.item_id_map.external_ids, expected_external_item_ids)
9 changes: 5 additions & 4 deletions tests/metrics/test_catalog.py
@@ -16,14 +16,14 @@

import numpy as np
import pandas as pd
import pytest

from rectools import Columns
from rectools.metrics import CatalogCoverage


class TestCatalogCoverage:
def setup_method(self) -> None:
self.metric = CatalogCoverage(k=2)
self.reco = pd.DataFrame(
{
Columns.User: [1, 1, 1, 2, 2, 3, 4],
@@ -32,7 +32,8 @@ def setup_method(self) -> None:
}
)

def test_calc(self) -> None:
@pytest.mark.parametrize("normalize,expected", ((True, 0.4), (False, 2.0)))
def test_calc(self, normalize: bool, expected: float) -> None:
catalog = np.arange(5)
expected = 0.4
assert self.metric.calc(self.reco, catalog) == expected
metric = CatalogCoverage(k=2, normalize=normalize)
assert metric.calc(self.reco, catalog) == expected
2 changes: 1 addition & 1 deletion tests/metrics/test_scoring.py
@@ -119,7 +119,7 @@ def test_success(self) -> None:
"sufficient": SufficientReco(k=2),
"unrepeated": UnrepeatedReco(k=2),
"covered_users": CoveredUsers(k=2),
"catalog_coverage": CatalogCoverage(k=2),
"catalog_coverage": CatalogCoverage(k=2, normalize=True),
}
with pytest.warns(UserWarning, match="Custom metrics are not supported"):
actual = calc_metrics(