Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Improve types and documentation for item k-NN #405

Merged
merged 5 commits on May 28, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 25 additions & 11 deletions lenskit/algorithms/knn/item.py
Original file line number Diff line number Diff line change
@@ -8,6 +8,7 @@
Item-based k-NN collaborative filtering.
"""

# pyright: basic
from __future__ import annotations

import logging
@@ -20,6 +21,7 @@
import torch

from lenskit import ConfigWarning, DataWarning, util
from lenskit.data import FeedbackType
from lenskit.data.matrix import normalize_sparse_rows, sparse_ratings
from lenskit.parallel import ensure_parallel_init
from lenskit.util.logging import pbh_update, progress_handle
@@ -50,15 +52,18 @@ class ItemItem(Predictor):
items, regardless of the rating the user gave the items.

Args:
nnbrs(int):
nnbrs:
the maximum number of neighbors for scoring each item (``None`` for
unlimited)
min_nbrs(int): the minimum number of neighbors for scoring each item
min_sim(float): minimum similarity threshold for considering a neighbor
save_nbrs(float):
min_nbrs: the minimum number of neighbors for scoring each item
min_sim:
Minimum similarity threshold for considering a neighbor. Must be
positive; if less than the smallest 32-bit normal (:math:`1.175
\\times 10^{-38}`), is clamped to that value.
save_nbrs:
the number of neighbors to save per item in the trained model
(``None`` for unlimited)
feedback(str):
feedback:
Control how feedback should be interpreted. Specifies defaults for
the other settings, which can be overridden individually; can be one
of the following values:
@@ -72,15 +77,15 @@ class ItemItem(Predictor):
Configure for implicit-feedback mode: ignore rating values, do
not center ratings, and use the ``sum`` aggregate method for
prediction.
center(bool):
center:
whether to normalize (mean-center) rating vectors prior to computing
similarities and aggregating user rating values. Defaults to
``True``; turn this off when working with unary data and other data
types that don't respond well to centering.
aggregate(str):
aggregate:
the type of aggregation to do. Can be ``weighted-average`` (the
default) or ``sum``.
use_ratings(bool):
use_ratings:
whether or not to use the rating values. If ``False``, it ignores
rating values and considers an implicit feedback signal of 1 for
every (user,item) pair present.
@@ -99,6 +104,9 @@ class ItemItem(Predictor):
save_nbrs: int | None
feedback: Literal["explicit", "implicit"]
block_size: int
center: bool
aggregate: str
use_ratings: bool

item_index_: pd.Index
"The index of item IDs."
@@ -119,7 +127,7 @@ def __init__(
min_nbrs: int = 1,
min_sim: float = 1.0e-6,
save_nbrs: int | None = None,
feedback: Literal["explicit", "implicit"] = "explicit",
feedback: FeedbackType = "explicit",
block_size: int = 250,
**kwargs,
):
@@ -175,16 +183,22 @@ def _check_setup(self):
if self.min_sim < 0:
_log.warning("item-item does not currently support negative similarities")
warnings.warn("item-item does not currently support negative similarities")
elif self.min_sim == 0:
f4i = np.finfo("f4")
_log.warn(
"minimum similarity %e is too low, using %e", self.min_sim, f4i.smallest_normal
)
self.min_sim = float(f4i.smallest_normal)

def fit(self, ratings, **kwargs):
def fit(self, ratings: pd.DataFrame, **kwargs):
"""
Train a model.

The model-training process depends on ``save_nbrs`` and ``min_sim``, but *not* on other
algorithm parameters.

Args:
ratings(pandas.DataFrame):
ratings:
(user,item,rating) data for computing item similarities.
"""
ensure_parallel_init()
4 changes: 4 additions & 0 deletions lenskit/data/__init__.py
Original file line number Diff line number Diff line change
@@ -4,4 +4,8 @@
# Licensed under the MIT license, see LICENSE.md for details.
# SPDX-License-Identifier: MIT

from typing import Literal, TypeAlias

from .matrix import RatingMatrix, sparse_ratings # noqa: F401

FeedbackType: TypeAlias = Literal["explicit", "implicit"]
Loading