terrier-org · cmacdonald · Aug 21, 2024 · Nov 9, 2023 · Nov 9, 2023 · Aug 20, 2024
diff --git a/docs/apply.rst b/docs/apply.rst
@@ -43,6 +43,8 @@ function.
 +       |         |             |                  +---------------------------+----------------------+-----------------------+
 |       |         |             |                  | `pt.apply.by_query()`     | dataframe for 1 query| dataframe for 1 query |
 +-------+---------+-------------+------------------+---------------------------+----------------------+-----------------------+
+|   D   |  None   |  N to 0     | Indexing         | `pt.apply.indexer()`      | iterable dictionary  | anything              | 
++-------+---------+-------------+------------------+---------------------------+----------------------+-----------------------+
 
 In each case, the result from calling a pyterrier.apply method is another PyTerrier transformer 
 (i.e. extends ``pt.Transformer``), which can be used for experimentation or combined with other 

diff --git a/pyterrier/apply.py b/pyterrier/apply.py
@@ -1,9 +1,9 @@
 from functools import partial
-from typing import Callable, Any, Dict, Union, Sequence
+from typing import Callable, Any, Dict, Union, Iterator, Sequence
 import numpy.typing as npt
 import pandas as pd
 import pyterrier as pt
-from pyterrier.apply_base import ApplyDocumentScoringTransformer, ApplyQueryTransformer, ApplyDocFeatureTransformer, ApplyForEachQuery, ApplyGenericTransformer
+from pyterrier.apply_base import ApplyDocumentScoringTransformer, ApplyQueryTransformer, ApplyDocFeatureTransformer, ApplyForEachQuery, ApplyGenericTransformer, ApplyIndexer
 
 def _bind(instance, func, as_name=None):
     """
@@ -87,6 +87,10 @@ def _doclen(df):
 
             pipe = pt.terrier.Retriever(index) >> pt.apply.doc_score(_doclen, batch_size=128)
 
+        Can also be used to create individual features that are combined using the ``**`` feature-union operator::
+
+            pipeline = bm25 >> ( some_features ** pt.apply.doc_score(_doclen) )
+
     """
     return ApplyDocumentScoringTransformer(fn, *args, batch_size=batch_size, **kwargs)
 
@@ -116,9 +120,36 @@ def _features(row):
             p = pt.terrier.Retriever(index, wmodel="BM25") >> 
                 pt.apply.doc_features(_features )
 
+        NB: If you only want to calculate a single feature to add to existing features, it is better to use ``pt.apply.doc_score()`` 
+        and the ``**`` feature union operator::
+
+            pipeline = bm25 >> ( some_features ** pt.apply.doc_score(one_feature) )
+
     """
     return ApplyDocFeatureTransformer(fn, *args, **kwargs)
 
+def indexer(fn : Callable[[Iterator[Dict[str,Any]]], Any], **kwargs) -> pt.Indexer:
+    """
+        Create an instance of pt.Indexer using a function that takes as input an iterable of dictionaries.
+
+        The supplied function is called once per call to ``index()``. It may optionally return something (typically a reference to the "index").
+
+        Arguments:
+            fn(Callable): the function that consumes the documents.
+
+        Example::
+
+            # make a pt.Indexer that returns the number of documents consumed
+            def _counter(iter_dict):
+                count = 0
+                for d in iter_dict:
+                    count += 1
+                return count
+            indexer = pt.apply.indexer(_counter)
+            rtr = indexer.index([ {'docno' : 'd1'}, {'docno' : 'd2'}])
+    """
+    return ApplyIndexer(fn, **kwargs)
+
 def rename(columns : Dict[str,str], *args, errors='raise', **kwargs) -> pt.Transformer:
     """
         Creates a transformer that renames columns in a dataframe. 
@@ -170,9 +201,10 @@ def __init__(self):
         _bind(self, lambda self, fn, *args, **kwargs : query(fn, *args, **kwargs), as_name='query')
         _bind(self, lambda self, fn, *args, **kwargs : doc_score(fn, *args, **kwargs), as_name='doc_score')
         _bind(self, lambda self, fn, *args, **kwargs : doc_features(fn, *args, **kwargs), as_name='doc_features')
+        _bind(self, lambda self, fn, *args, **kwargs : indexer(fn, *args, **kwargs), as_name='indexer')
         _bind(self, lambda self, fn, *args, **kwargs : rename(fn, *args, **kwargs), as_name='rename')
         _bind(self, lambda self, fn, *args, **kwargs : by_query(fn, *args, **kwargs), as_name='by_query')
-        _bind(self, lambda self, fn, *args, **kwargs : generic(fn, *args, **kwargs), as_name='generic')
+        _bind(self, lambda self, fn, *args, **kwargs : generic(fn, *args, **kwargs), as_name='generic')     
 
     def __getattr__(self, item):
         return partial(generic_apply, item)

diff --git a/pyterrier/apply_base.py b/pyterrier/apply_base.py
@@ -1,4 +1,5 @@
-from .transformer import Transformer
+
+from .transformer import Transformer, Indexer
 from .model import add_ranks, split_df
 import pandas as pd
 import pyterrier as pt
@@ -268,3 +269,14 @@ def transform(self, inputRes):
         rtr = pd.concat([self.fn(chunk_df) for chunk_df in iterator])
         return rtr
 
+class ApplyIndexer(Indexer):
+    """
+    Allows arbitrary indexer pipeline components to be written as functions.
+    """
+
+    def __init__(self, fn,  *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.fn = fn
+
+    def index(self, iter_dict):
+        return self.fn(iter_dict)
diff --git a/tests/test_apply.py b/tests/test_apply.py
@@ -18,6 +18,16 @@ def test_drop_columns(self):
         rtr = p(testDF)
         self.assertTrue("Bla" not in rtr.columns)
 
+    def test_index_apply(self):
+        def _counter(iter_dict):
+            count = 0
+            for d in iter_dict:
+                count += 1
+            return count
+        indexer = pt.apply.indexer(_counter)
+        rtr = indexer.index([ {'docno' : 'd1'}, {'docno' : 'd2'}])
+        self.assertEqual(2, rtr)
+
     def test_make_columns(self):
         from pyterrier.transformer import Transformer
         testDF = pd.DataFrame([["q1", "the bear and the wolf", 1]], columns=["qid", "query", "Bla"])