Skip to content

Commit

Permalink
TerrierIndex artifact
Browse files Browse the repository at this point in the history
  • Loading branch information
seanmacavaney committed Sep 24, 2024
1 parent 25b82b1 commit 3104cd0
Show file tree
Hide file tree
Showing 4 changed files with 105 additions and 0 deletions.
4 changes: 4 additions & 0 deletions pyterrier/terrier/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# java stuff
from pyterrier.terrier import java
from pyterrier.terrier._index import TerrierIndex
from pyterrier.terrier._text_loader import TerrierTextLoader, terrier_text_loader
from pyterrier.terrier.java import configure, set_version, set_helper_version, extend_classpath, J, set_property, set_properties, run, version, check_version, check_helper_version
from pyterrier.terrier.retriever import RetrieverBase, Retriever, FeaturesRetriever, TextScorer
Expand Down Expand Up @@ -49,6 +50,9 @@ class BatchRetrieveBase(RetrieverBase):
# java stuff
'java', 'configure', 'set_version', 'set_helper_version', 'extend_classpath', 'J', 'version', 'check_version', 'check_helper_version',

# High-level API
'TerrierIndex',

# retrieval
'BatchRetrieveBase', 'Retriever', 'RetrieverBase', 'BatchRetrieve', 'TerrierRetrieve', 'FeaturesRetriever', 'FeaturesBatchRetrieve', 'TerrierRetrieve', 'TextScorer',

Expand Down
80 changes: 80 additions & 0 deletions pyterrier/terrier/_index.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
import os
from typing import Dict, List, Optional

import pyterrier as pt


@pt.java.required
class TerrierIndex(pt.Artifact):
    """A Terrier index.

    The Java index reference and the Java index object are created lazily on
    first use (see :meth:`index_ref` and :meth:`index_obj`) and then cached on
    the instance.
    """

    ARTIFACT_TYPE = 'sparse_index'
    ARTIFACT_FORMAT = 'terrier'

    def __init__(self, path):
        """Initialises a TerrierIndex for the given path.

        Args:
            path: Location of the index; forwarded unchanged to ``pt.Artifact``.
        """
        super().__init__(path)
        # Populated on demand by index_ref() / index_obj().
        self._index_ref = None
        self._index_obj = None

    def retriever(
        self,
        *,
        controls: Optional[Dict] = None,
        properties: Optional[Dict] = None,
        metadata: Optional[List[str]] = None,
        num_results: int = 1000,
        wmodel: str = 'DPH',
        threads: int = 1,
    ) -> pt.Transformer:
        """Creates a ``pt.terrier.Retriever`` object for this index.

        Args:
            controls: The controls to set for this retriever. Controls are specific settings for a given search request.
            properties: The properties to use for this retriever. Properties are settings that apply globally to the index.
            metadata: The metadata fields to return for each search result. Defaults to ``["docno"]``.
            num_results: The maximum number of results to return per query.
            wmodel: The weighting model to use for scoring.
            threads: The number of threads to use during retrieval.

        Returns:
            A retriever object for this index.
        """
        if metadata is None:
            # Build a fresh list per call rather than sharing one mutable default.
            metadata = ['docno']
        # Keyword arguments keep the call robust to parameter ordering and make
        # sure ``threads`` actually reaches the retriever.
        return pt.terrier.Retriever(
            self.index_obj(),
            controls=controls,
            properties=properties,
            metadata=metadata,
            num_results=num_results,
            wmodel=wmodel,
            threads=threads,
        )

    def bm25(
        self,
        *,
        k1: float = 1.2,
        b: float = 0.75,
        num_results: int = 1000,
        threads: int = 1,
    ) -> pt.Transformer:
        """Creates a BM25 retriever for this index.

        Args:
            k1: BM25's ``k1`` parameter, which controls TF saturation.
            b: BM25's ``b`` parameter, which controls the length penalty.
            num_results: The maximum number of results to return per query.
            threads: The number of threads to use during retrieval.

        Returns:
            A retriever object for this index that uses the ``BM25`` weighting model.
        """
        return self.retriever(
            wmodel='BM25',
            controls={'bm25.k_1': k1, 'bm25.b': b},
            num_results=num_results,
            threads=threads,
        )

    def __repr__(self):
        """Returns a debugging representation that includes the index path."""
        return f'TerrierIndex({self.path!r})'

    def index_ref(self):
        """Returns the internal Java index reference object for this index."""
        if self._index_ref is None:
            # The reference is built from the symlink-resolved path of the index.
            self._index_ref = pt.IndexRef.of(os.path.realpath(self.path))
        return self._index_ref

    def index_obj(self):
        """Returns the internal Java index object for this index."""
        if self._index_obj is None:
            self._index_obj = pt.IndexFactory.of(self.index_ref())
        return self._index_obj
15 changes: 15 additions & 0 deletions pyterrier/terrier/_metadata_adapter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from typing import List


def terrier_artifact_metadata_adapter(path: str, dir_listing: List[str]):
    """Guess whether this path is a terrier index.

    Some terrier indexes are missing pt_meta.json, but we can assume they are terrier indexes based on the
    presence of a data.properties file.

    Args:
        path: The path of the candidate artifact. Not inspected here; only ``dir_listing`` is consulted.
        dir_listing: The names of the entries found at ``path``.

    Returns:
        A metadata dict with ``type``, ``format`` and ``package_hint`` keys when ``data.properties`` is
        listed, otherwise ``None``.
    """
    if 'data.properties' not in dir_listing:
        # Nothing identifies this as a terrier index; state the "no guess" result explicitly.
        return None
    return {
        'type': 'sparse_index',
        'format': 'terrier',
        'package_hint': 'python-terrier',
    }
6 changes: 6 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,12 @@ def get_version(rel_path):
'pyterrier.java = pyterrier.java:CoreJavaInit',
'pyterrier.terrier.java = pyterrier.terrier.java:TerrierJavaInit',
],
'pyterrier.artifact': [
'sparse_index.terrier = pyterrier.terrier:TerrierIndex',
],
'pyterrier.artifact.metadata_adapter': [
'terrier = pyterrier.terrier._metadata_adapter:terrier_artifact_metadata_adapter',
],
'pyterrier.artifact.url_protocol_resolver': [
'hf = pyterrier._artifact:_hf_url_resolver'
],
Expand Down

0 comments on commit 3104cd0

Please sign in to comment.