Merge pull request #582 from mdekstrand/feature/cleanup-bulk
Add predict_pipeline, remove data.bulk module, and clean up batch functions
mdekstrand authored Dec 31, 2024
2 parents 4725505 + c0d4133 commit 48eb645
Showing 11 changed files with 183 additions and 267 deletions.
13 changes: 8 additions & 5 deletions lenskit-funksvd/tests/test_funksvd.py
@@ -13,10 +13,11 @@

 from pytest import approx, mark

-from lenskit.data import Dataset, ItemList, from_interactions_df
-from lenskit.data.bulk import dict_to_df, iter_item_lists
+from lenskit import batch
+from lenskit.data import Dataset, ItemList, ItemListCollection, UserIDKey, from_interactions_df
 from lenskit.funksvd import FunkSVDScorer
 from lenskit.metrics import call_metric, quick_measure_model
+from lenskit.pipeline.common import predict_pipeline
 from lenskit.testing import BasicComponentTests, ScorerTests, wantjit

 _log = logging.getLogger(__name__)
@@ -169,15 +170,17 @@ def test_fsvd_save_load(ml_ds: Dataset):
 def test_fsvd_known_preds(ml_ds: Dataset):
     algo = FunkSVDScorer(15, iterations=125, lrate=0.001)
     _log.info("training %s on ml data", algo)
-    algo.train(ml_ds)
+    pipe = predict_pipeline(algo, fallback=False)
+    pipe.train(ml_ds)

     dir = Path(__file__).parent
     pred_file = dir / "funksvd-preds.csv"
     _log.info("reading known predictions from %s", pred_file)
     known_preds = pd.read_csv(str(pred_file))
+    known = ItemListCollection.from_df(known_preds, UserIDKey)

-    preds = {u: algo(u, il) for (u, il) in iter_item_lists(known_preds)}
-    preds = dict_to_df(preds)
+    preds = batch.predict(pipe, known, n_jobs=1)
+    preds = preds.to_df().drop(columns=["prediction"], errors="ignore")

     known_preds.rename(columns={"prediction": "expected"}, inplace=True)
     merged = pd.merge(known_preds, preds)
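
The updated test drives the scorer through a full prediction pipeline instead of calling it directly. A rough sketch of that pattern, mirroring the diff above (the training dataset `ml_ds` and the CSV path are stand-ins from the test context, not part of the public API):

    # Sketch of the new prediction-test flow; `ml_ds` is an assumed lenskit.data.Dataset.
    import pandas as pd

    from lenskit import batch
    from lenskit.data import ItemListCollection, UserIDKey
    from lenskit.funksvd import FunkSVDScorer
    from lenskit.pipeline.common import predict_pipeline

    scorer = FunkSVDScorer(15, iterations=125, lrate=0.001)
    pipe = predict_pipeline(scorer, fallback=False)  # no fallback predictor, as in the test
    pipe.train(ml_ds)

    # Known user/item pairs become an ItemListCollection keyed by user ID ...
    known_preds = pd.read_csv("funksvd-preds.csv")
    known = ItemListCollection.from_df(known_preds, UserIDKey)

    # ... and batch.predict scores every list, returning another collection.
    preds = batch.predict(pipe, known, n_jobs=1)
    preds_df = preds.to_df()
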
76 changes: 74 additions & 2 deletions lenskit/lenskit/batch/__init__.py
@@ -10,9 +10,81 @@

 from __future__ import annotations

-from ._predict import predict
-from ._recommend import recommend
+from typing import Mapping, Sequence
+
+from lenskit.data import ID, GenericKey, ItemList, ItemListCollection, UserIDKey
+from lenskit.pipeline import Pipeline
+
 from ._results import BatchResults
 from ._runner import BatchPipelineRunner, InvocationSpec

 __all__ = ["BatchPipelineRunner", "BatchResults", "InvocationSpec", "predict", "recommend"]
+
+
+def predict(
+    pipeline: Pipeline,
+    test: ItemListCollection[GenericKey] | Mapping[ID, ItemList],
+    *,
+    n_jobs: int | None = None,
+    **kwargs,
+) -> ItemListCollection[GenericKey]:
+    """
+    Convenience function to batch-generate rating predictions (or other per-item
+    scores) from a pipeline. This is a batch version of :func:`lenskit.predict`.
+
+    Stability:
+        Caller
+    """
+
+    runner = BatchPipelineRunner(n_jobs=n_jobs)
+    runner.predict()
+    outs = runner.run(pipeline, test)
+    return outs.output("predictions")  # type: ignore
+
+
+def score(
+    pipeline: Pipeline,
+    test: ItemListCollection[GenericKey] | Mapping[ID, ItemList],
+    *,
+    n_jobs: int | None = None,
+    **kwargs,
+) -> ItemListCollection[GenericKey]:
+    """
+    Convenience function to batch-generate personalized scores from a pipeline.
+    This is a batch version of :func:`lenskit.predict`.
+
+    Stability:
+        Caller
+    """
+
+    runner = BatchPipelineRunner(n_jobs=n_jobs)
+    runner.score()
+    outs = runner.run(pipeline, test)
+    return outs.output("scores")  # type: ignore
+
+
+def recommend(
+    pipeline: Pipeline,
+    users: Sequence[ID | UserIDKey],
+    n: int | None = None,
+    candidates=None,
+    *,
+    n_jobs: int | None = None,
+    **kwargs,
+) -> ItemListCollection[UserIDKey]:
+    """
+    Convenience function to batch-generate recommendations from a pipeline. This
+    is a batch version of :func:`lenskit.recommend`.
+
+    .. todo::
+        Support more inputs than just user IDs.
+
+    Stability:
+        Caller
+    """
+
+    runner = BatchPipelineRunner(n_jobs=n_jobs)
+    runner.recommend(n=n)
+    outs = runner.run(pipeline, users)
+    return outs.output("recommendations")  # type: ignore
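
The three wrappers share one shape: construct a BatchPipelineRunner, request a single invocation, run it, and pull the matching output. A minimal usage sketch, assuming a trained pipeline `pipe`, a test ItemListCollection `test_data`, and a sequence of user IDs `user_ids` (only the function signatures come from the code above):

    # Hypothetical inputs: `pipe`, `test_data`, and `user_ids` are placeholders.
    from lenskit import batch

    preds = batch.predict(pipe, test_data, n_jobs=4)        # rating predictions
    scores = batch.score(pipe, test_data, n_jobs=4)         # raw per-item scores
    recs = batch.recommend(pipe, user_ids, n=10, n_jobs=4)  # top-10 list per user

    # Each call returns an ItemListCollection; convert to a frame for analysis.
    preds_df = preds.to_df()
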
37 changes: 0 additions & 37 deletions lenskit/lenskit/batch/_predict.py

This file was deleted.

38 changes: 0 additions & 38 deletions lenskit/lenskit/batch/_recommend.py

This file was deleted.

17 changes: 15 additions & 2 deletions lenskit/lenskit/batch/_runner.py
@@ -68,17 +68,30 @@ def __init__(self, *, n_jobs: int | None = None):
     def add_invocation(self, inv: InvocationSpec):
         self.invocations.append(inv)

+    def score(self, component: str = "scorer", *, output: str = "scores"):
+        """
+        Request the batch run to generate test item scores.
+
+        Args:
+            component:
+                The name of the rating predictor component to run.
+            output:
+                The name of the results in the output dictionary.
+        """
+        self.add_invocation(InvocationSpec("score", {component: output}, "test-items"))
+
     def predict(self, component: str = "rating-predictor", *, output: str = "predictions"):
         """
-        Request the batch run to generate test item scores or rating predictins.
+        Request the batch run to generate test item rating predictions. It is identical
+        to :meth:`score` but with different defaults.

         Args:
             component:
                 The name of the rating predictor component to run.
             output:
                 The name of the results in the output dictionary.
         """
-        self.add_invocation(InvocationSpec("predict-ratings", {component: output}, "test-items"))
+        return self.score(component, output=output)

     def recommend(
         self, component: str = "recommender", *, output: str = "recommendations", **extra: Any
125 changes: 0 additions & 125 deletions lenskit/lenskit/data/bulk.py

This file was deleted.
