diff --git a/lenskit/algorithms/knn/item.py b/lenskit/algorithms/knn/item.py index e95b466ec..2dd89ecaf 100644 --- a/lenskit/algorithms/knn/item.py +++ b/lenskit/algorithms/knn/item.py @@ -8,6 +8,7 @@ Item-based k-NN collaborative filtering. """ +# pyright: basic from __future__ import annotations import logging @@ -20,6 +21,7 @@ import torch from lenskit import ConfigWarning, DataWarning, util +from lenskit.data import FeedbackType from lenskit.data.matrix import normalize_sparse_rows, sparse_ratings from lenskit.parallel import ensure_parallel_init from lenskit.util.logging import pbh_update, progress_handle @@ -50,15 +52,18 @@ class ItemItem(Predictor): items, regardless of the rating the user gave the items. Args: - nnbrs(int): + nnbrs: the maximum number of neighbors for scoring each item (``None`` for unlimited) - min_nbrs(int): the minimum number of neighbors for scoring each item - min_sim(float): minimum similarity threshold for considering a neighbor - save_nbrs(float): + min_nbrs: the minimum number of neighbors for scoring each item + min_sim: + Minimum similarity threshold for considering a neighbor. Must be + positive; if less than the smallest 32-bit normal (:math:`1.175 + \\times 10^{-38}`), is clamped to that value. + save_nbrs: the number of neighbors to save per item in the trained model (``None`` for unlimited) - feedback(str): + feedback: Control how feedback should be interpreted. Specifies defaults for the other settings, which can be overridden individually; can be one of the following values: @@ -72,15 +77,15 @@ class ItemItem(Predictor): Configure for implicit-feedback mode: ignore rating values, do not center ratings, and use the ``sum`` aggregate method for prediction. - center(bool): + center: whether to normalize (mean-center) rating vectors prior to computing similarities and aggregating user rating values. Defaults to ``True``; turn this off when working with unary data and other data types that don't respond well to centering. 
- aggregate(str): + aggregate: the type of aggregation to do. Can be ``weighted-average`` (the default) or ``sum``. - use_ratings(bool): + use_ratings: whether or not to use the rating values. If ``False``, it ignores rating values and considers an implicit feedback signal of 1 for every (user,item) pair present. @@ -99,6 +104,9 @@ class ItemItem(Predictor): save_nbrs: int | None feedback: Literal["explicit", "implicit"] block_size: int + center: bool + aggregate: str + use_ratings: bool item_index_: pd.Index "The index of item IDs." @@ -119,7 +127,7 @@ def __init__( min_nbrs: int = 1, min_sim: float = 1.0e-6, save_nbrs: int | None = None, - feedback: Literal["explicit", "implicit"] = "explicit", + feedback: FeedbackType = "explicit", block_size: int = 250, **kwargs, ): @@ -175,8 +183,14 @@ def _check_setup(self): if self.min_sim < 0: _log.warning("item-item does not currently support negative similarities") warnings.warn("item-item does not currently support negative similarities") + elif self.min_sim < np.finfo("f4").smallest_normal: + f4i = np.finfo("f4") + _log.warning( + "minimum similarity %e is too low, using %e", self.min_sim, f4i.smallest_normal + ) + self.min_sim = float(f4i.smallest_normal) - def fit(self, ratings, **kwargs): + def fit(self, ratings: pd.DataFrame, **kwargs): """ Train a model. @@ -184,7 +198,7 @@ def fit(self, ratings, **kwargs): algorithm parameters. Args: - ratings(pandas.DataFrame): + ratings: (user,item,rating) data for computing item similarities. """ ensure_parallel_init() diff --git a/lenskit/data/__init__.py b/lenskit/data/__init__.py index bdf84d83b..7db51d280 100644 --- a/lenskit/data/__init__.py +++ b/lenskit/data/__init__.py @@ -4,4 +4,8 @@ # Licensed under the MIT license, see LICENSE.md for details. # SPDX-License-Identifier: MIT +from typing import Literal, TypeAlias + from .matrix import RatingMatrix, sparse_ratings # noqa: F401 + +FeedbackType: TypeAlias = Literal["explicit", "implicit"]