Merge pull request #596 from mdekstrand/feature/pydantic-config
Use Pydantic models for component configurations
mdekstrand authored Jan 11, 2025
2 parents f077dc7 + c6186d9 commit d1afbcb
Showing 66 changed files with 1,227 additions and 986 deletions.
7 changes: 5 additions & 2 deletions .github/workflows/test.yml
@@ -611,9 +611,12 @@ jobs:
- name: Download ML data
run: |
python -m lenskit.data.fetch ml-100k ml-1m ml-10m ml-20m
- name: "📕 Validate documentation examples"
- name: "📕 Validate code examples"
run: |
pytest --cov=lenskit/lenskit --cov=lenskit-funksvd/lenskit --cov=lenskit-implicit/lenskit --cov=lenskit-hpf/lenskit --nbval-lax --doctest-glob='*.rst' --ignore='docs/_ext' --log-file test-docs.log docs */lenskit
sphinx-build -b doctest docs build/doc
- name: "📕 Validate example notebooks"
run: |
pytest --cov=lenskit/lenskit --cov=lenskit-funksvd/lenskit --cov=lenskit-implicit/lenskit --cov=lenskit-hpf/lenskit --nbval-lax --log-file test-notebooks.log docs
- name: "📐 Coverage results"
if: '${{ !cancelled() }}'
run: |
3 changes: 3 additions & 0 deletions .vscode/ltex.dictionary.en-US.txt
@@ -22,3 +22,6 @@ RecSys
PyArrow
Numba
DuckDB
ItemList
Pydantic
dataclass
2 changes: 1 addition & 1 deletion conftest.py
@@ -16,10 +16,10 @@
from pytest import fixture, skip

from lenskit.parallel import ensure_parallel_init
from lenskit.random import set_global_rng

# bring common fixtures into scope
from lenskit.testing import ml_100k, ml_ds, ml_ds_unchecked, ml_ratings # noqa: F401
from lenskit.util.random import set_global_rng

logging.getLogger("numba").setLevel(logging.INFO)

2 changes: 1 addition & 1 deletion docs/api/data-types.rst
@@ -17,4 +17,4 @@ Entity Identifiers
Containers
~~~~~~~~~~

.. autoclass:: UITuple
.. autoclass:: UIPair
2 changes: 1 addition & 1 deletion docs/api/index.rst
@@ -12,7 +12,6 @@ Core Abstractions
lenskit.pipeline
lenskit.diagnostics
lenskit.operations
lenskit.types

.. toctree::
:caption: Core
@@ -81,3 +80,4 @@ and may be useful in building new models and components for LensKit.
lenskit.parallel
lenskit.testing
lenskit.util
lenskit.random
1 change: 0 additions & 1 deletion docs/api/pipeline.rst
@@ -31,7 +31,6 @@ LensKit components.

~lenskit.pipeline.Component
~lenskit.pipeline.Trainable
~lenskit.pipeline.Configurable

Standard Pipelines
------------------
13 changes: 10 additions & 3 deletions docs/conf.py
@@ -4,8 +4,10 @@
# Licensed under the MIT license, see LICENSE.md for details.
# SPDX-License-Identifier: MIT

import doctest
import sys
from importlib.metadata import version
from os import fspath
from pathlib import Path

from packaging.version import Version
@@ -25,6 +27,7 @@
"sphinx.ext.napoleon",
"sphinx.ext.autodoc",
"sphinx.ext.autosummary",
"sphinx.ext.doctest",
"sphinx.ext.intersphinx",
"sphinx.ext.mathjax",
"sphinx.ext.extlinks",
@@ -102,9 +105,9 @@
autodoc_typehints = "description"
autodoc_type_aliases = {
"ArrayLike": "numpy.typing.ArrayLike",
"SeedLike": "lenskit.types.SeedLike",
"RNGLike": "lenskit.types.RNGLike",
"RNGInput": "lenskit.types.RNGInput",
"SeedLike": "lenskit.random.SeedLike",
"RNGLike": "lenskit.random.RNGLike",
"RNGInput": "lenskit.random.RNGInput",
"IDSequence": "lenskit.data.types.IDSequence",
}
# autosummary_generate_overwrite = False
@@ -133,6 +136,10 @@

bibtex_bibfiles = ["lenskit.bib"]
nb_execution_mode = "off"
doctest_path = [fspath((Path(__file__).parent / "guide" / "examples").resolve())]
doctest_default_flags = (
doctest.ELLIPSIS | doctest.IGNORE_EXCEPTION_DETAIL | doctest.NORMALIZE_WHITESPACE
)

mermaid_d3_zoom = True

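The doctest flags configured in conf.py above can be exercised with the standard library alone; a minimal sketch (the `sample` function is illustrative, not from LensKit):

```python
import doctest

# The same relaxations set in docs/conf.py above.
flags = (
    doctest.ELLIPSIS | doctest.IGNORE_EXCEPTION_DETAIL | doctest.NORMALIZE_WHITESPACE
)


def sample():
    """With ELLIPSIS, "..." in expected output matches any span of text.

    >>> print(list(range(6)))
    [0, 1, ...]
    """


runner = doctest.DocTestRunner(optionflags=flags)
for test in doctest.DocTestFinder().find(sample, "sample"):
    runner.run(test)
print(runner.failures)  # 0 - the ellipsis absorbed "2, 3, 4, 5]"
```

Without `doctest.ELLIPSIS`, the same example would fail, which is why the Sphinx doctest builder needs these defaults for abbreviated outputs like the RBP table in `docs/guide/batch.rst`.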
28 changes: 14 additions & 14 deletions docs/guide/GettingStarted.ipynb

Large diffs are not rendered by default.

6 changes: 3 additions & 3 deletions docs/guide/batch.rst
@@ -39,7 +39,7 @@ For an example, let's start with importing things to run a quick batch:
Load and split some data:

>>> data = load_movielens('data/ml-100k.zip')
>>> split = sample_users(data, 150, SampleN(5))
>>> split = sample_users(data, 150, SampleN(5, rng=1024), rng=42)

Configure and train the model:

@@ -62,9 +62,9 @@ And measure their results:
>>> measure.add_metric(RBP())
>>> scores = measure.compute(recs, split.test)
>>> scores.list_summary() # doctest: +ELLIPSIS
mean median std
mean median std
metric
RBP 0.07... 0.0... 0.1...
RBP 0.09... 0.0... 0.1...


The :py:func:`predict` function works similarly, but for rating predictions;
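The RBP values in the hunk above are rank-biased precision (Moffat & Zobel). A back-of-the-envelope sketch of the formula, with an assumed patience of p = 0.85 (not necessarily the default of `lenskit.metrics.RBP`):

```python
def rbp(relevance, p=0.85):
    """Rank-biased precision: (1 - p) * sum of rel_k * p**k over 0-indexed ranks k."""
    return (1 - p) * sum(rel * p**k for k, rel in enumerate(relevance))


# Relevant items at the first and third positions of a recommendation list:
print(round(rbp([1, 0, 1]), 6))  # 0.258375
```

Because p**k decays geometrically, relevant items near the top of the list dominate the score, which is why the summary means above stay well below 1.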
6 changes: 3 additions & 3 deletions docs/guide/conventions.rst
@@ -42,11 +42,11 @@ splitting support <./splitting>`_.

Now that `SPEC 7`_ has standardized RNG seeding across the scientific Python
ecosystem, we use that with some lightweight helpers in the
:mod:`lenskit.util.random` module instead of using SeedBank.
:mod:`lenskit.random` module instead of using SeedBank.

LensKit extends SPEC 7 with a global RNG that components can use as a fallback,
to make it easier to configure system-wide generation for things like tests.
This is configured with :func:`~lenskit.util.random.set_global_rng`.
This is configured with :func:`~lenskit.random.set_global_rng`.

When implementing a component that uses randomness in its training, we recommend
deferring conversion of the provided RNG into an actual generator until
@@ -56,7 +56,7 @@ When using the RNG to create initial state for e.g. training a model with
PyTorch, it can be useful to create that state in NumPy and then convert to a
tensor, so that components are consistent in their random number generation
behavior instead of having variation between NumPy and other backends.
Components can use the :func:`~lenskit.util.random_generator` function to
Components can use the :func:`~lenskit.random_generator` function to
convert seed material or a generator into a NumPy generator, falling back to the
global RNG if none is specified.
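The SPEC 7 seeding pattern described here can be sketched with NumPy alone; the `make_rng` helper below is hypothetical, not LensKit's API:

```python
import numpy as np


# SPEC 7-style seeding: accept seed material (None, int, SeedSequence)
# or an existing Generator, and normalize it to a Generator.
# np.random.default_rng passes an existing Generator through unchanged.
def make_rng(seed=None):
    return np.random.default_rng(seed)


# Identical seed material yields reproducible streams:
a = make_rng(42).integers(0, 100, 5)
b = make_rng(42).integers(0, 100, 5)
print(bool((a == b).all()))  # True
```

A LensKit-style fallback would additionally check for a configured global generator when `seed` is `None`, which is what `set_global_rng` enables in tests.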

43 changes: 43 additions & 0 deletions docs/guide/examples/blendcomp.py
@@ -0,0 +1,43 @@
from pydantic import BaseModel

from lenskit.data import ItemList
from lenskit.pipeline import Component


class LinearBlendConfig(BaseModel):
"Configuration for :class:`LinearBlendScorer`."

# define the parameter with a type, default value, and docstring.
mix_weight: float = 0.5
r"""
Linear blending mixture weight :math:`\alpha`.
"""


class LinearBlendScorer(Component):
r"""
Score items with a linear blend of two other scores.
Given a mixture weight :math:`\alpha` and two scores
:math:`s_i^{\mathrm{left}}` and :math:`s_i^{\mathrm{right}}`, this
computes :math:`s_i = \alpha s_i^{\mathrm{left}} + (1 - \alpha)
s_i^{\mathrm{right}}`. Missing values propagate, so only items
scored in both inputs have scores in the output.
"""

# define the configuration attribute, with a docstring to make sure
# it shows up in component docs.
config: LinearBlendConfig
"Configuration parameters for the linear blend."

# the __call__ method defines the component's operation
def __call__(self, left: ItemList, right: ItemList) -> ItemList:
"""
Blend the scores of two item lists.
"""
ls = left.scores("pandas", index="ids")
rs = right.scores("pandas", index="ids")
ls, rs = ls.align(rs)
alpha = self.config.mix_weight
combined = ls * alpha + rs * (1 - alpha)
return ItemList(item_ids=combined.index, scores=combined.values)
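The align-and-blend logic in `__call__` can be seen in isolation with plain pandas Series (the scores here are made up, not LensKit data):

```python
import pandas as pd

# Hypothetical per-item scores from two upstream scorers, indexed by item ID.
left = pd.Series({"a": 1.0, "b": 0.5})
right = pd.Series({"b": 1.0, "c": 0.8})

alpha = 0.5
ls, rs = left.align(right)  # align on the union of indexes; gaps become NaN
combined = ls * alpha + rs * (1 - alpha)

print(combined["b"])  # 0.75 - "b" is scored in both inputs
print(bool(combined[["a", "c"]].isna().all()))  # True - NaN propagates
```

This is the behavior the docstring promises: only items scored in both inputs receive a blended score, because NaN in either operand yields NaN in the sum.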
15 changes: 14 additions & 1 deletion docs/guide/migrating.rst
@@ -71,6 +71,11 @@ New code should use :py:func:`lenskit.data.from_interactions_df` to convert a Pandas
data frame into a :py:func:`~lenskit.data.Dataset`, or one of the standard loaders
such as :py:func:`lenskit.data.load_movielens`.

While most LensKit data frame code still recognizes the legacy ``user`` and
``item`` columns from LensKit 0.14 and earlier, data frames of LensKit data
should use the column names ``user_id`` and ``item_id`` instead, to
unambiguously distinguish them from user and item numbers.

Additional dataset construction support and possible implementations (e.g.
database-backed datasets) are coming, but this is the migration path for the
typical code patterns used in LensKit 0.14 and earlier.
@@ -180,10 +185,18 @@ them for very different ways of turning scoring models into full recommenders.
.. note::

Since 2025, we no longer use the term “algorithm” in LensKit, as it is
ambiguous and promotes confusion about very different things. Instead we
ambiguous and promotes confusion about very different things. Instead, we
have “pipelines” consisting of “components”, some of which may be “models”
(for scoring, ranking, etc.).

Configuration Components
........................

Individual components now use Pydantic_ models to represent their configuration
(e.g. hyperparameters). This is to reduce redundancy, improve documentation,
enable consistent serialization, and validate parameter values in a consistent
and automated fashion. See :ref:`component-config` for details.
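As a sketch of what this migration looks like in practice, assuming Pydantic v2 (the `BiasConfig` model and its `damping` field are hypothetical, not a real LensKit class):

```python
from pydantic import BaseModel, ValidationError


class BiasConfig(BaseModel):
    # Hypothetical hyperparameter with a default; Pydantic validates
    # and coerces values automatically.
    damping: float = 0.0


cfg = BiasConfig(damping=5)  # ints are coerced to float
print(cfg.damping)           # 5.0
print(cfg.model_dump())      # {'damping': 5.0} - consistent serialization

try:
    BiasConfig(damping="lots")  # non-numeric strings are rejected
except ValidationError:
    print("rejected")
```

The `model_dump()` round-trip is what gives components the consistent serialization mentioned above, and validation errors surface misconfigured hyperparameters at construction time rather than mid-training.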

Obtaining Recommendations
-------------------------
