diff --git a/.github/workflows/style.yml b/.github/workflows/style.yml index 96231ca8..4be34222 100644 --- a/.github/workflows/style.yml +++ b/.github/workflows/style.yml @@ -1,4 +1,4 @@ -name: Code Style Checks +name: style on: push: @@ -7,7 +7,7 @@ on: branches: [ master ] jobs: - build: + flake8: runs-on: 'ubuntu-latest' steps: - uses: actions/checkout@v4 @@ -24,3 +24,21 @@ jobs: - name: pt.java.required checks run: | flake8 ./pyterrier --select=PT --show-source --statistics --count + + mypy: + runs-on: 'ubuntu-latest' + steps: + - uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: '3.10' + + - name: Install + run: | + pip install mypy --upgrade -r requirements.txt -r requirements-test.txt + pip install -e . + + - name: MyPy + run: 'mypy --disable-error-code=import-untyped pyterrier || true' diff --git a/pyterrier/__init__.py b/pyterrier/__init__.py index 76e580a1..b02e1bb6 100644 --- a/pyterrier/__init__.py +++ b/pyterrier/__init__.py @@ -31,6 +31,8 @@ # will be set in terrier.terrier.java once java is loaded IndexRef = None +# will be set by utils.set_tqdm() once _() runs +tqdm = None # deprecated functions explored to the main namespace, which will be removed in a future version diff --git a/pyterrier/_ops.py b/pyterrier/_ops.py index 311d4966..63223ac7 100644 --- a/pyterrier/_ops.py +++ b/pyterrier/_ops.py @@ -32,7 +32,7 @@ def __len__(self) -> int: def _flatten(transformers: Iterable[Transformer], cls: type) -> Tuple[Transformer]: return list(chain.from_iterable( - (t._transformers if isinstance(t, cls) else [t]) + (t._transformers if isinstance(t, cls) else [t]) # type: ignore for t in transformers )) @@ -193,6 +193,7 @@ def fuse_left(self, left: Transformer) -> Optional[Transformer]: # If the preceding component supports a native rank cutoff (via fuse_rank_cutoff), apply it. if isinstance(left, SupportsFuseRankCutoff): return left.fuse_rank_cutoff(self.k) + return None class FeatureUnion(NAryTransformerBase): """ @@ -295,7 +296,7 @@ def compile(self) -> Transformer: """ Returns a new transformer that fuses feature unions where possible.
""" - out = deque() + out : deque = deque() inp = deque([t.compile() for t in self._transformers]) while inp: right = inp.popleft() @@ -382,7 +383,7 @@ def compile(self, verbose: bool = False) -> Transformer: """Returns a new transformer that iteratively fuses adjacent transformers to form a more efficient pipeline.""" # compile constituent transformers (flatten allows complie() to return Compose pipelines) inp = deque(_flatten((t.compile() for t in self._transformers), Compose)) - out = deque() + out : deque = deque() counter = 1 while inp: if verbose: diff --git a/pyterrier/apply_base.py b/pyterrier/apply_base.py index 186a11e9..c8187c6e 100644 --- a/pyterrier/apply_base.py +++ b/pyterrier/apply_base.py @@ -1,7 +1,7 @@ from typing import Callable, Any, Union, Optional, Iterable import itertools import more_itertools -import numpy as np +import numpy.typing as npt import pandas as pd import pyterrier as pt @@ -92,7 +92,7 @@ def transform(self, inp: pd.DataFrame) -> pd.DataFrame: # batching iterator = pt.model.split_df(inp, batch_size=self.batch_size) if self.verbose: - iterator = pt.tqdm(iterator, desc="pt.apply", unit='row') + iterator = pt.tqdm(iterator, desc="pt.apply", unit='row') # type: ignore return pd.concat([self._apply_df(chunk_df) for chunk_df in iterator]) def _apply_df(self, inp: pd.DataFrame) -> pd.DataFrame: @@ -148,7 +148,7 @@ def transform(self, res: pd.DataFrame) -> pd.DataFrame: it = res.groupby("qid") lastqid = None if self.verbose: - it = pt.tqdm(it, unit='query') + it = pt.tqdm(it, unit='query') # type: ignore try: if self.batch_size is None: query_dfs = [] @@ -163,7 +163,7 @@ def transform(self, res: pd.DataFrame) -> pd.DataFrame: iterator = pt.model.split_df(group, batch_size=self.batch_size) query_dfs.append( pd.concat([self.fn(chunk_df) for chunk_df in iterator]) ) except Exception as a: - raise Exception("Problem applying %s for qid %s" % (self.fn, lastqid)) from a + raise Exception("Problem applying %r for qid %s" % (self.fn, lastqid)) from a # %r because its a function with bytes representation (mypy) if self.add_ranks: try: @@ -253,7 +253,7 @@ def __repr__(self): def _transform_rowwise(self, outputRes): if self.verbose: pt.tqdm.pandas(desc="pt.apply.doc_score", unit="d") - outputRes["score"] = outputRes.progress_apply(self.fn, axis=1).astype('float64') + outputRes["score"] = outputRes.progress_apply(self.fn, axis=1).astype('float64') # type: ignore else: outputRes["score"] = outputRes.apply(self.fn, axis=1).astype('float64') outputRes = pt.model.add_ranks(outputRes) @@ -275,7 +275,7 @@ def transform(self, inp: pd.DataFrame) -> pd.DataFrame: iterator = pt.model.split_df(outputRes, batch_size=self.batch_size) if self.verbose: - iterator = pt.tqdm(iterator, desc="pt.apply", unit='row') + iterator = pt.tqdm(iterator, desc="pt.apply", unit='row') # type: ignore rtr = pd.concat([self._transform_batchwise(chunk_df) for chunk_df in iterator]) rtr = pt.model.add_ranks(rtr) return rtr @@ -294,7 +294,7 @@ def _feature_fn(row): pipe = pt.terrier.Retriever(index) >> pt.apply.doc_features(_feature_fn) >> pt.LTRpipeline(xgBoost()) """ def __init__(self, - fn: Callable[[Union[pd.Series, pt.model.IterDictRecord]], np.array], + fn: Callable[[Union[pd.Series, pt.model.IterDictRecord]], npt.NDArray], *, verbose: bool = False ): @@ -313,7 +313,7 @@ def transform_iter(self, inp: pt.model.IterDict) -> pt.model.IterDict: # we assume that the function can take a dictionary as well as a pandas.Series. 
As long as [""] notation is used # to access fields, both should work if self.verbose: - inp = pt.tqdm(inp, desc="pt.apply.doc_features") + inp = pt.tqdm(inp, desc="pt.apply.doc_features") # type: ignore for row in inp: row["features"] = self.fn(row) yield row @@ -322,8 +322,8 @@ def transform(self, inp: pd.DataFrame) -> pd.DataFrame: fn = self.fn outputRes = inp.copy() if self.verbose: - pt.tqdm.pandas(desc="pt.apply.doc_features", unit="d") - outputRes["features"] = outputRes.progress_apply(fn, axis=1) + pt.tqdm.pandas(desc="pt.apply.doc_features", unit="d") # type: ignore + outputRes["features"] = outputRes.progress_apply(fn, axis=1) # type: ignore else: outputRes["features"] = outputRes.apply(fn, axis=1) return outputRes @@ -368,7 +368,7 @@ def transform_iter(self, inp: pt.model.IterDict) -> pt.model.IterDict: # we assume that the function can take a dictionary as well as a pandas.Series. As long as [""] notation is used # to access fields, both should work if self.verbose: - inp = pt.tqdm(inp, desc="pt.apply.query") + inp = pt.tqdm(inp, desc="pt.apply.query") # type: ignore for row in inp: row = row.copy() if "query" in row: @@ -384,8 +384,8 @@ def transform(self, inp: pd.DataFrame) -> pd.DataFrame: outputRes = inp.copy() try: if self.verbose: - pt.tqdm.pandas(desc="pt.apply.query", unit="d") - outputRes["query"] = outputRes.progress_apply(self.fn, axis=1) + pt.tqdm.pandas(desc="pt.apply.query", unit="d") # type: ignore + outputRes["query"] = outputRes.progress_apply(self.fn, axis=1) # type: ignore else: outputRes["query"] = outputRes.apply(self.fn, axis=1) except ValueError as ve: @@ -444,7 +444,7 @@ def transform(self, inp: pd.DataFrame) -> pd.DataFrame: # batching iterator = pt.model.split_df(inp, batch_size=self.batch_size) if self.verbose: - iterator = pt.tqdm(iterator, desc="pt.apply", unit='row') + iterator = pt.tqdm(iterator, desc="pt.apply", unit='row') # type: ignore rtr = pd.concat([self.fn(chunk_df) for chunk_df in iterator]) return rtr diff --git a/pyterrier/datasets.py b/pyterrier/datasets.py index 79932eaa..deba064c 100644 --- a/pyterrier/datasets.py +++ b/pyterrier/datasets.py @@ -3,8 +3,9 @@ import json import pandas as pd from .transformer import is_lambda +from abc import abstractmethod import types -from typing import Union, Tuple, Iterator, Dict, Any, List, Literal +from typing import Union, Tuple, Iterator, Dict, Any, List, Literal, Optional from warnings import warn import requests from .io import autoopen, touch @@ -54,12 +55,13 @@ def get_corpus(self): """ pass + @abstractmethod def get_corpus_iter(self, verbose=True) -> pt.model.IterDict: """ Returns an iter of dicts for this collection. If verbose=True, a tqdm pbar shows the progress over this iterator. """ pass - + def get_corpus_lang(self) -> Union[str,None]: """ Returns the ISO 639-1 language code for the corpus, or None for multiple/other/unknown @@ -72,6 +74,7 @@ def get_index(self, variant=None, **kwargs): """ pass + @abstractmethod def get_topics(self, variant=None) -> pd.DataFrame: """ Returns the topics, as a dataframe, ready for retrieval. @@ -84,6 +87,7 @@ def get_topics_lang(self) -> Union[str,None]: """ return None + @abstractmethod def get_qrels(self, variant=None) -> pd.DataFrame: """ Returns the qrels, as a dataframe, ready for evaluation. @@ -109,7 +113,7 @@ def get_results(self, variant=None) -> pd.DataFrame: """ Returns a standard result set provided by the dataset. This is useful for re-ranking experiments. 
""" - pass + return None class RemoteDataset(Dataset): @@ -139,7 +143,7 @@ def download(URLs : Union[str,List[str]], filename : str, **kwargs): r = requests.get(url, allow_redirects=True, stream=True, **kwargs) r.raise_for_status() total = int(r.headers.get('content-length', 0)) - with pt.io.finalized_open(filename, 'b') as file, pt.tqdm( + with pt.io.finalized_open(filename, 'b') as file, pt.tqdm( # type: ignore desc=basename, total=total, unit='iB', @@ -507,7 +511,7 @@ def get_results(self, variant=None) -> pd.DataFrame: result.sort_values(by=['qid', 'score', 'docno'], ascending=[True, False, True], inplace=True) # ensure data is sorted by qid, -score, did # result doesn't yet contain queries (only qids) so load and merge them in topics = self.get_topics(variant) - result = pd.merge(result, topics, how='left', on='qid', copy=False) + result = pd.merge(result, topics, how='left', on='qid') return result def _describe_component(self, component): @@ -610,7 +614,7 @@ def transform(self, inp: pd.DataFrame) -> pd.DataFrame: set_docnos = set(docnos) it = (tuple(getattr(doc, f) for f in fields) for doc in docstore.get_many_iter(set_docnos)) if self.verbose: - it = pd.tqdm(it, unit='d', total=len(set_docnos), desc='IRDSTextLoader') + it = pt.tqdm(it, unit='d', total=len(set_docnos), desc='IRDSTextLoader') # type: ignore metadata = pd.DataFrame(list(it), columns=fields).set_index('doc_id') metadata_frame = metadata.loc[docnos].reset_index(drop=True) @@ -1104,7 +1108,7 @@ def _merge_years(self, component, variant): "corpus_iter" : lambda dataset, **kwargs : pt.index.treccollection2textgen(dataset.get_corpus(), num_docs=11429, verbose=kwargs.get("verbose", False)) } -DATASET_MAP = { +DATASET_MAP : Dict[str, Dataset] = { # used for UGlasgow teaching "50pct" : RemoteDataset("50pct", FIFTY_PCT_FILES), # umass antique corpus - see http://ciir.cs.umass.edu/downloads/Antique/ @@ -1222,7 +1226,7 @@ def list_datasets(en_only=True): def transformer_from_dataset( dataset : Union[str, Dataset], clz, - variant: str = None, + variant: Optional[str] = None, version: str = 'latest', **kwargs) -> pt.Transformer: """Returns a Transformer instance of type ``clz`` for the provided index of variant ``variant``.""" diff --git a/pyterrier/debug.py b/pyterrier/debug.py index 6704072a..0ff5ea1a 100644 --- a/pyterrier/debug.py +++ b/pyterrier/debug.py @@ -1,7 +1,7 @@ from . 
import Transformer -from typing import List +from typing import List, Optional -def print_columns(by_query : bool = False, message : str = None) -> Transformer: +def print_columns(by_query : Optional[bool] = False, message : Optional[str] = None) -> Transformer: """ Returns a transformer that can be inserted into pipelines that can print the column names of the dataframe at this stage in the pipeline: @@ -82,8 +82,8 @@ def print_rows( by_query : bool = True, jupyter: bool = True, head : int = 2, - message : str = None, - columns : List[str] = None) -> Transformer: + message : Optional[str] = None, + columns : Optional[List[str]] = None) -> Transformer: """ Returns a transformer that can be inserted into pipelines that can print some of the dataframe at this stage in the pipeline: diff --git a/pyterrier/java/_core.py b/pyterrier/java/_core.py index c3490c0e..01f1d46c 100644 --- a/pyterrier/java/_core.py +++ b/pyterrier/java/_core.py @@ -1,3 +1,4 @@ +# type: ignore import os from pyterrier.java import required_raise, required, before_init, started, mavenresolver, JavaClasses, JavaInitializer, register_config from typing import Optional @@ -153,7 +154,7 @@ def add_jar(jar_path): @before_init -def add_package(org_name: str = None, package_name: str = None, version: str = None, file_type='jar'): +def add_package(org_name : str, package_name : str, version : Optional[str] = None, file_type : str = 'jar'): if version is None or version == 'snapshot': version = mavenresolver.latest_version_num(org_name, package_name) file_name = mavenresolver.get_package_jar(org_name, package_name, version, artifact=file_type) diff --git a/pyterrier/java/_utils.py b/pyterrier/java/_utils.py index 2366d301..5eb71d7d 100644 --- a/pyterrier/java/_utils.py +++ b/pyterrier/java/_utils.py @@ -1,3 +1,4 @@ +# type: ignore import sys import warnings from functools import wraps @@ -387,7 +388,7 @@ def register_config(name, config: Dict[str, Any]): class JavaClasses: def __init__(self, **mapping: Union[str, Callable[[], str]]): self._mapping = mapping - self._cache = {} + self._cache : Dict[str, Callable]= {} def __dir__(self): return list(self._mapping.keys()) diff --git a/pyterrier/model.py b/pyterrier/model.py index 724ed09f..6aa1245c 100644 --- a/pyterrier/model.py +++ b/pyterrier/model.py @@ -239,6 +239,7 @@ def split_df(df : pd.DataFrame, N: Optional[int] = None, *, batch_size: Optional assert (N is None) != (batch_size is None), "Either N or batch_size should be provided (and not both)" if N is None: + assert batch_size is not None N = math.ceil(len(df) / batch_size) type = None diff --git a/pyterrier/new.py b/pyterrier/new.py index 67d3e27d..35ab093e 100644 --- a/pyterrier/new.py +++ b/pyterrier/new.py @@ -1,5 +1,5 @@ -from typing import Sequence, Union +from typing import Sequence, Union, Optional, cast, Iterable import pandas as pd from .model import add_ranks @@ -9,7 +9,7 @@ def empty_Q() -> pd.DataFrame: """ return pd.DataFrame(columns=["qid", "query"]) -def queries(queries : Union[str, Sequence[str]], qid : Union[str, Sequence[str]] = None, **others) -> pd.DataFrame: +def queries(queries : Union[str, Sequence[str]], qid : Optional[Union[str, Iterable[str]]] = None, **others) -> pd.DataFrame: """ Creates a new queries dataframe. Will return a dataframe with the columns `["qid", "query"]`. Any further lists in others will also be added. 
@@ -40,7 +40,7 @@ def queries(queries : Union[str, Sequence[str]], qid : Union[str, Sequence[str]] assert type(qid) == str return pd.DataFrame({"qid" : [qid], "query" : [queries], **others}) if qid is None: - qid = map(str, range(1, len(queries)+1)) + qid = cast(Iterable[str], map(str, range(1, len(queries)+1))) # noqa: PT100 (this is typing.cast, not jinus.cast) return pd.DataFrame({"qid" : qid, "query" : queries, **others}) Q = queries @@ -53,8 +53,8 @@ def empty_R() -> pd.DataFrame: def ranked_documents( scores : Sequence[Sequence[float]], - qid : Sequence[str] = None, - docno=None, + qid : Optional[Sequence[str]] = None, + docno : Optional[Sequence[Sequence[str]]] = None, **others) -> pd.DataFrame: """ Creates a new ranked documents dataframe. Will return a dataframe with the columns `["qid", "docno", "score", "rank"]`. @@ -120,4 +120,4 @@ def ranked_documents( raise ValueError("We assume multiple documents, for now") return add_ranks(rtr) -R = ranked_documents \ No newline at end of file +R = ranked_documents diff --git a/pyterrier/pipelines.py b/pyterrier/pipelines.py index 1cac0af0..a5901436 100644 --- a/pyterrier/pipelines.py +++ b/pyterrier/pipelines.py @@ -2,19 +2,22 @@ import os import pandas as pd import numpy as np -from typing import Callable, Union, Dict, List, Tuple, Sequence, Any, Literal, Optional +from typing import Callable, Iterator, Union, Dict, List, Tuple, Sequence, Any, Literal, Optional +import types from . import Transformer from .model import coerce_dataframe_types import ir_measures -from ir_measures.measures import BaseMeasure +import tqdm as tqdm_module +from ir_measures import Measure, Metric import pyterrier as pt -MEASURE_TYPE=Union[str,BaseMeasure] +MEASURE_TYPE=Union[str,Measure] MEASURES_TYPE=Sequence[MEASURE_TYPE] SAVEMODE_TYPE=Literal['reuse', 'overwrite', 'error', 'warn'] SYSTEM_OR_RESULTS_TYPE = Union[Transformer, pd.DataFrame] +SAVEFORMAT_TYPE = Union[Literal['trec'], types.ModuleType] -def _bold_cols(data, col_type): +def _bold_cols(data : pd.Series, col_type): if not data.name in col_type: return [''] * len(data) @@ -29,7 +32,7 @@ def _bold_cols(data, col_type): is_max[len(data) - list(reversed(data)).index(max_value) - 1] = colormaxlast_attr return is_max -def _color_cols(data, col_type, +def _color_cols(data : pd.Series, col_type, colormax='antiquewhite', colormaxlast='lightgreen', colormin='antiquewhite', colorminlast='lightgreen' ): if not data.name in col_type: @@ -75,12 +78,12 @@ def _mean_of_measures(result, measures=None, num_q = None): mean_dict[measure] = value / (1 if measure in measures_no_mean else num_q) return mean_dict -def _convert_measures(metrics : MEASURES_TYPE) -> Tuple[Sequence[BaseMeasure], Dict[BaseMeasure,str]]: +def _convert_measures(metrics : MEASURES_TYPE) -> Tuple[Sequence[Measure], Dict[Measure,str]]: from ir_measures import parse_trec_measure rtr = [] rev_mapping = {} for m in metrics: - if isinstance(m, BaseMeasure): + if isinstance(m, Measure): rtr.append(m) continue elif isinstance(m, str): @@ -102,69 +105,80 @@ def _convert_measures(metrics : MEASURES_TYPE) -> Tuple[Sequence[BaseMeasure], D #list(iter_calc([ir_measures.AP], qrels, run)) #[Metric(query_id='Q0', measure=AP, value=1.0), Metric(query_id='Q1', measure=AP, value=1.0)] def _ir_measures_to_dict( - seq : Sequence, - metrics: Sequence[BaseMeasure], - rev_mapping : Dict[BaseMeasure,str], + seq : Iterator[Metric], + measures: Sequence[Measure], + rev_mapping : Dict[Measure,str], num_q : int, perquery : bool = True, - backfill_qids : Sequence[str] = None): + 
backfill_qids : Optional[Sequence[str]] = None) -> Union[ Dict[str, Dict[str, float]], Dict[str, float]]: from collections import defaultdict if perquery: # qid -> measure -> value - rtr=defaultdict(dict) - for m in seq: - metric = m.measure - metric = rev_mapping.get(metric, str(metric)) - rtr[m.query_id][metric] = m.value + rtr_perquery : Dict[str, Dict[str, float]] = defaultdict(dict) + for metric in seq: + measure = metric.measure + measure_name = rev_mapping.get(measure, str(measure)) + rtr_perquery[metric.query_id][measure_name] = metric.value # When reporting per-query results, it can desirable to show something for topics that were executed # do not have corresponding qrels. If the caller passes in backfill_qids, we'll ensure that these # qids are present, and if not add placeholders with NaN values for all measures. if backfill_qids is not None: backfill_count = 0 for qid in backfill_qids: - if qid not in rtr: + if qid not in rtr_perquery: backfill_count += 1 - for metric in metrics: - rtr[qid][rev_mapping.get(metric, str(metric))] = float('NaN') + for m in measures: + rtr_perquery[qid][rev_mapping.get(m, str(m))] = float('NaN') if backfill_count > 0: warn(f'{backfill_count} topic(s) not found in qrels. Scores for these topics are given as NaN and should not contribute to averages.') - return rtr + return rtr_perquery assert backfill_qids is None, "backfill_qids only supported when perquery=True" - # measure -> value - rtr = {rev_mapping.get(m, str(m)): m.aggregator() for m in metrics} - for m in seq: - metric = m.measure - metric = rev_mapping.get(metric, str(metric)) - rtr[metric].add(m.value) - for m in rtr: - rtr[m] = rtr[m].result() - return rtr + + metric_agg = {rev_mapping.get(m, str(m)): m.aggregator() for m in measures} + for metric in seq: + measure_name = rev_mapping.get(metric.measure, str(metric.measure)) + metric_agg[measure_name].add(metric.value) + + rtr_aggregated : Dict[str,float] = {} # measure -> value + for m_name in metric_agg: + rtr_aggregated[m_name] = metric_agg[m_name].result() + return rtr_aggregated def _run_and_evaluate( system : SYSTEM_OR_RESULTS_TYPE, - topics : pd.DataFrame, + topics : Optional[pd.DataFrame], qrels: pd.DataFrame, metrics : MEASURES_TYPE, - pbar = None, - save_mode : SAVEMODE_TYPE = None, - save_file : str = None, + pbar : Optional[tqdm_module.tqdm] = None, + save_mode : Optional[SAVEMODE_TYPE] = None, + save_file : Optional[str] = None, + save_format : SAVEFORMAT_TYPE = 'trec', perquery : bool = False, batch_size : Optional[int] = None, - backfill_qids : Sequence[str] = None): + backfill_qids : Optional[Sequence[str]] = None): from .io import read_results, write_results if pbar is None: - pbar = pt.tqdm(disable=True) + pbar = pt.tqdm(disable=True) # type: ignore metrics, rev_mapping = _convert_measures(metrics) qrels = qrels.rename(columns={'qid': 'query_id', 'docno': 'doc_id', 'label': 'relevance'}) from timeit import default_timer as timer - runtime = 0 + runtime : float = 0. 
num_q = qrels['query_id'].nunique() if save_file is not None and os.path.exists(save_file): if save_mode == 'reuse': - system = read_results(save_file) + if save_format == 'trec': + system = read_results(save_file) + elif type(save_format) == types.ModuleType: + with pt.io.autoopen(save_file, 'rb') as fin: + system = save_format.load(fin) + elif isinstance(save_format, tuple) and len(save_format) == 2: + with pt.io.autoopen(save_file, 'rb') as fin: + system = save_format[0](fin) + else: + raise ValueError("Unknown save_format %s" % str(save_format)) elif save_mode == 'overwrite': os.remove(save_file) elif save_mode == 'warn': @@ -178,12 +192,16 @@ def _run_and_evaluate( else: raise ValueError("Unknown save_mode argument '%s', valid options are 'error', 'warn', 'reuse' or 'overwrite'." % save_mode) + res : pd.DataFrame # if its a DataFrame, use it as the results if isinstance(system, pd.DataFrame): res = system res = coerce_dataframe_types(res) if len(res) == 0: - raise ValueError("%d topics, but no results in dataframe" % len(topics)) + if topics is None: + raise ValueError("No topics specified, and no results in dataframe") + else: + raise ValueError("%d topics, but no results in dataframe" % len(topics)) evalMeasuresDict = _ir_measures_to_dict( ir_measures.iter_calc(metrics, qrels, res.rename(columns=_irmeasures_columns)), metrics, @@ -194,6 +212,8 @@ def _run_and_evaluate( pbar.update() elif batch_size is None: + + assert topics is not None, "topics must be specified" #transformer, evaluate all queries at once starttime = timer() @@ -203,8 +223,16 @@ def _run_and_evaluate( # write results to save_file; we can be sure this file does not exist if save_file is not None: - write_results(res, save_file) - + if save_format == 'trec': + write_results(res, save_file) + elif type(save_format) == types.ModuleType: + with pt.io.autoopen(save_file, 'wb') as fout: + save_format.dump(res, fout) + elif isinstance(save_format, tuple) and len(save_format) == 2: + with pt.io.autoopen(save_file, 'wb') as fout: + save_format[1](res, fout) + else: + raise ValueError("Unknown save_format %s" % str(save_format)) res = coerce_dataframe_types(res) if len(res) == 0: @@ -219,12 +247,18 @@ def _run_and_evaluate( backfill_qids) pbar.update() else: + assert topics is not None, "topics must be specified" + if save_file is not None: + # only + assert save_format == 'trec', 'save_format=%s is not supported when save_dir is enabled and batch_size is not None' % str(save_format) + #transformer, evaluate queries in batches assert batch_size > 0 starttime = timer() evalMeasuresDict = {} remaining_qrel_qids = set(qrels.query_id) try: + batch_topics : pd.DataFrame for i, (res, batch_topics) in enumerate( system.transform_gen(topics, batch_size=batch_size, output_topics=True)): if len(res) == 0: raise ValueError("batch of %d topics, but no results received in batch %d from %s" % (len(batch_topics), i, str(system) ) ) @@ -267,6 +301,7 @@ def _run_and_evaluate( if not perquery: # aggregate measures if not in per query mode aggregators = {rev_mapping.get(m, str(m)): m.aggregator() for m in metrics} + q : str for q in evalMeasuresDict: for metric in metrics: s_metric = rev_mapping.get(metric, str(metric)) @@ -282,21 +317,22 @@ def Experiment( topics : pd.DataFrame, qrels : pd.DataFrame, eval_metrics : MEASURES_TYPE, - names : Sequence[str] = None, + names : Optional[Sequence[str]] = None, perquery : bool = False, dataframe : bool = True, batch_size : Optional[int] = None, filter_by_qrels : bool = False, filter_by_topics : bool = 
True, - baseline : int = None, + baseline : Optional[int] = None, test : Union[str,TEST_FN_TYPE] = "t", - correction : str = None, + correction : Optional[str] = None, correction_alpha : float = 0.05, - highlight : str = None, - round : Union[int,Dict[str,int]] = None, + highlight : Optional[str] = None, + round : Optional[Union[int,Dict[str,int]]] = None, verbose : bool = False, - save_dir : str = None, + save_dir : Optional[str] = None, save_mode : SAVEMODE_TYPE = 'warn', + save_format : SAVEFORMAT_TYPE = 'trec', **kwargs): """ Allows easy comparison of multiple retrieval transformer pipelines using a common set of topics, and @@ -324,6 +360,9 @@ def Experiment( save_mode(str): Defines how existing files are used when ``save_dir`` is set. If set to "reuse", then files will be preferred over transformers for evaluation. If set to "overwrite", existing files will be replaced. If set to "warn" or "error", the presence of any existing file will cause a warning or error, respectively. Default is "warn". + save_format(str): How results are saved. Defaults to 'trec', which uses ``pt.io.read_results()`` and ``pt.io.write_results()`` for saving system outputs. + If the TREC results format is insufficient, set ``save_format=pickle`` (or any module providing ``dump()`` and ``load()``). Alternatively, a tuple of read and write functions can be specified, for instance + ``save_format=(pandas.read_csv, pandas.DataFrame.to_csv)``, or even ``save_format=(pandas.read_parquet, pandas.DataFrame.to_parquet)``. dataframe(bool): If True return results as a dataframe, else as a dictionary of dictionaries. Default=True. baseline(int): If set to the index of an item of the retr_system list, will calculate the number of queries improved, degraded and the statistical significance (paired t-test p value) for each measure. @@ -420,10 +459,14 @@ def _apply_round(measure, value): raise ValueError('There is no overlap between the qids found in the topics and qrels.
If this is intentional, set filter_by_topics=False and filter_by_qrels=False.') from scipy import stats + test_fn : TEST_FN_TYPE if test == "t": - test = stats.ttest_rel - if test == "wilcoxon": - test = stats.wilcoxon + test_fn = stats.ttest_rel + elif test == "wilcoxon": + test_fn = stats.wilcoxon + else: + assert not isinstance(test, str), "Unknown test function name %s" % test + test_fn = test # obtain system names if not specified if names is None: @@ -453,7 +496,7 @@ def _apply_round(measure, value): mrt_needed = False if "mrt" in eval_metrics: mrt_needed = True - eval_metrics = eval_metrics.copy() + eval_metrics = list(eval_metrics).copy() eval_metrics.remove("mrt") # progress bar construction @@ -469,12 +512,20 @@ def _apply_round(measure, value): # round number of batches up for each system tqdm_args['total'] = math.ceil((len(topics) / batch_size)) * len(retr_systems) - with pt.tqdm(**tqdm_args) as pbar: + with pt.tqdm(**tqdm_args) as pbar: # type: ignore # run and evaluate each system for name, system in zip(names, retr_systems): save_file = None if save_dir is not None: - save_file = os.path.join(save_dir, "%s.res.gz" % name) + if save_format == 'trec': + save_ext = 'res.gz' + elif type(save_format) == types.ModuleType: + save_ext = 'mod' + elif isinstance(save_format, tuple): + save_ext = 'custom' + else: + raise ValueError("Unrecognised save_format %s" % str(save_format)) + save_file = os.path.join(save_dir, "%s.%s" % (name, save_ext)) time, evalMeasuresDict = _run_and_evaluate( system, topics, qrels, eval_metrics, @@ -483,6 +534,7 @@ def _apply_round(measure, value): backfill_qids=all_topic_qids if perquery else None, save_file=save_file, save_mode=save_mode, + save_format=save_format, pbar=pbar) if baseline is not None: @@ -518,7 +570,7 @@ def _apply_round(measure, value): if dataframe: if perquery: df = pd.DataFrame(evalsRows, columns=["name", "qid", "measure", "value"]).sort_values(['name', 'qid']) - if round is not None: + if round is not None and isinstance(round, int): df["value"] = df["value"].round(round) return df @@ -526,7 +578,7 @@ def _apply_round(measure, value): if mrt_needed: highlight_cols["mrt"] = "-" - p_col_names=[] + p_col_names : List[str] = [] if baseline is not None: assert len(evalDictsPerQ) == len(retr_systems) baselinePerQuery={} @@ -547,7 +599,7 @@ def _apply_round(measure, value): perQuery = np.array( [ evalDictsPerQ[i][q][m] for q in evalDictsPerQ[baseline] ]) delta_plus = (perQuery > baselinePerQuery[m]).sum() delta_minus = (perQuery < baselinePerQuery[m]).sum() - p = test(perQuery, baselinePerQuery[m])[1] + p = test_fn(perQuery, baselinePerQuery[m])[1] additionals.extend([delta_plus, delta_minus, p]) evalsRows[i].extend(additionals) delta_names=[] @@ -565,12 +617,12 @@ def _apply_round(measure, value): # multiple testing correction.
This adds two new columns for each measure experience statistical significance testing if baseline is not None and correction is not None: - import statsmodels.stats.multitest + import statsmodels.stats.multitest # type: ignore for pcol in p_col_names: pcol_reject = pcol.replace("p-value", "reject") pcol_corrected = pcol + " corrected" reject, corrected, _, _ = statsmodels.stats.multitest.multipletests(df[pcol].drop(df.index[baseline]), alpha=correction_alpha, method=correction) - insert_pos = df.columns.get_loc(pcol) + insert_pos : int = df.columns.get_loc(pcol) # add reject/corrected values for the baseline reject = np.insert(reject, baseline, False) corrected = np.insert(corrected, baseline, np.nan) @@ -579,9 +631,9 @@ def _apply_round(measure, value): df.insert(insert_pos+2, pcol_corrected, corrected) if highlight == "color" or highlight == "colour" : - df = df.style.apply(_color_cols, axis=0, col_type=highlight_cols) + df = df.style.apply(_color_cols, axis=0, col_type=highlight_cols) # type: ignore elif highlight == "bold": - df = df.style.apply(_bold_cols, axis=0, col_type=highlight_cols) + df = df.style.apply(_bold_cols, axis=0, col_type=highlight_cols) # type: ignore return df return evalDict @@ -597,6 +649,12 @@ def _apply_round(measure, value): List[GRID_SCAN_PARAM_SETTING] ] +GRID_SEARCH_RETURN_TYPE_BOTH = Tuple[ + Transformer, + float, + List[GRID_SCAN_PARAM_SETTING] +] + def _save_state(param_dict): rtr = [] for tran, param_set in param_dict.items(): @@ -635,7 +693,7 @@ def KFoldGridSearch( jobs : int = 1, backend='joblib', verbose: bool = False, - batch_size = None) -> Tuple[pd.DataFrame, GRID_SEARCH_RETURN_TYPE_SETTING]: + batch_size : Optional[int] = None) -> Tuple[pd.DataFrame, GRID_SEARCH_RETURN_TYPE_SETTING]: """ Applies a GridSearch using different folds. It returns the *results* of the tuned transformer pipeline on the test topics. The number of topics dataframes passed @@ -684,7 +742,7 @@ def KFoldGridSearch( qrels = [qrels] * num_folds FOLDS=list(range(0, num_folds)) - results=[] + results : List[pd.DataFrame] = [] settings=[] # save state @@ -704,6 +762,9 @@ def KFoldGridSearch( # safety - give the GridSearch a stable initial setting _restore_state(initial_state) + optPipe: Transformer + max_measure: float + max_setting: List[GRID_SCAN_PARAM_SETTING] optPipe, max_measure, max_setting = GridSearch( pipeline, params, @@ -732,9 +793,9 @@ def GridSearch( jobs : int = 1, backend='joblib', verbose: bool = False, - batch_size = None, - return_type : str = "opt_pipeline" - ) -> Union[Transformer,GRID_SEARCH_RETURN_TYPE_SETTING]: + batch_size : Optional[int] = None, + return_type : Literal['opt_pipeline', 'best_setting', 'both'] = "opt_pipeline" + ) -> Union[Transformer,GRID_SEARCH_RETURN_TYPE_SETTING,GRID_SEARCH_RETURN_TYPE_BOTH]: """ GridSearch is essentially, an argmax GridScan(), i.e. it returns an instance of the pipeline to tune with the best parameter settings among params, that were found that were obtained using the specified @@ -772,11 +833,12 @@ def GridSearch( verbose, batch_size, dataframe=False) + assert not isinstance(grid_outcomes, pd.DataFrame) assert len(grid_outcomes) > 0, "GridScan returned 0 rows" max_measure = grid_outcomes[0][1][metric] max_setting = grid_outcomes[0][0] - for setting, measures in grid_outcomes: + for setting, measures in grid_outcomes: # TODO what is the type of this iteration? 
if measures[metric] > max_measure: max_measure = measures[metric] max_setting = setting @@ -794,7 +856,7 @@ def GridSearch( for tran, param, value in max_setting: tran.set_parameter(param, value) return (pipeline, max_measure, max_setting) - + raise ValueError("Unknown return_type option %s" % return_type) def GridScan( pipeline : Transformer, @@ -807,7 +869,7 @@ def GridScan( verbose: bool = False, batch_size = None, dataframe = True, - ) -> Union[pd.DataFrame, List [ Tuple [ List[ GRID_SCAN_PARAM_SETTING ] , Dict[str,float] ] ] ]: + ) -> Union[pd.DataFrame, List [ Tuple [ List[ GRID_SCAN_PARAM_SETTING ], Dict[str,float] ] ] ]: """ GridScan applies a set of named parameters on a given pipeline and evaluates the outcome. The topics and qrels must be specified. The trec_eval measure names can be optionally specified. @@ -900,7 +962,7 @@ def _evaluate_several_settings(inputs : List[Tuple]): eval_list = [] #for each combination of parameter values if jobs == 1: - for v in pt.tqdm(combinations, total=len(combinations), desc="GridScan", mininterval=0.3) if verbose else combinations: + for v in pt.tqdm(combinations, total=len(combinations), desc="GridScan", mininterval=0.3) if verbose else combinations: # type: ignore parameter_list, eval_scores = _evaluate_one_setting(keys, v) eval_list.append( (parameter_list, eval_scores) ) else: diff --git a/pyterrier/py.typed b/pyterrier/py.typed new file mode 100644 index 00000000..e69de29b diff --git a/pyterrier/terrier/__init__.py b/pyterrier/terrier/__init__.py index 2ad96fe7..4ca2903c 100644 --- a/pyterrier/terrier/__init__.py +++ b/pyterrier/terrier/__init__.py @@ -1,3 +1,4 @@ +# type: ignore # java stuff from pyterrier.terrier import java from pyterrier.terrier._text_loader import TerrierTextLoader, terrier_text_loader diff --git a/pyterrier/terrier/_text_loader.py b/pyterrier/terrier/_text_loader.py index 3d095b07..777c81de 100644 --- a/pyterrier/terrier/_text_loader.py +++ b/pyterrier/terrier/_text_loader.py @@ -1,3 +1,4 @@ +# type: ignore from typing import List, Union, Literal import pandas as pd diff --git a/pyterrier/terrier/index.py b/pyterrier/terrier/index.py index d3b64be7..2b6dcee5 100644 --- a/pyterrier/terrier/index.py +++ b/pyterrier/terrier/index.py @@ -1,3 +1,4 @@ +# type: ignore """ This file contains all the indexers. 
""" diff --git a/pyterrier/terrier/index_factory.py b/pyterrier/terrier/index_factory.py index ad1d1dbc..23f19520 100644 --- a/pyterrier/terrier/index_factory.py +++ b/pyterrier/terrier/index_factory.py @@ -1,3 +1,4 @@ +# type: ignore from typing import Union, List import pyterrier as pt diff --git a/pyterrier/terrier/java.py b/pyterrier/terrier/java.py index 47252487..b93651ca 100644 --- a/pyterrier/terrier/java.py +++ b/pyterrier/terrier/java.py @@ -1,3 +1,4 @@ +# type: ignore import os import sys import json diff --git a/pyterrier/terrier/retriever.py b/pyterrier/terrier/retriever.py index 9f143142..d3e39102 100644 --- a/pyterrier/terrier/retriever.py +++ b/pyterrier/terrier/retriever.py @@ -113,7 +113,7 @@ def matchop(t, w=1): @staticmethod def from_dataset(dataset : Union[str,Dataset], - variant : str = None, + variant : Optional[str] = None, version='latest', **kwargs): """ @@ -469,6 +469,7 @@ def fuse_feature_union(self, other: pt.Transformer, is_left: bool) -> Optional[p del controls['wmodel'] return FeaturesRetriever(self.indexref, features, controls=controls, properties=self.properties, metadata=self.metadata, threads=self.threads, verbose=self.verbose) + return None @pt.java.required @@ -672,7 +673,7 @@ def __setstate__(self, d): @staticmethod def from_dataset(dataset : Union[str,Dataset], - variant : str = None, + variant : Optional[str] = None, version='latest', **kwargs): return pt.datasets.transformer_from_dataset(dataset, variant=variant, version=version, clz=FeaturesRetriever, **kwargs) @@ -833,6 +834,7 @@ def fuse_left(self, left: pt.Transformer) -> Optional[pt.Transformer]: threads=self.threads, wmodel=left.controls['wmodel'], ) + return None def fuse_rank_cutoff(self, k: int) -> Optional[pt.Transformer]: """ @@ -862,3 +864,5 @@ def fuse_feature_union(self, other: pt.Transformer, is_left: bool) -> Optional[p features = self.features + ["WMODEL:" + other.controls['wmodel']] if is_left else ["WMODEL:" + other.controls['wmodel']] + self.features return FeaturesRetriever(self.indexref, features, controls=self.controls, properties=self.properties, threads=self.threads, wmodel=self.wmodel, verbose=self.verbose) + + return None diff --git a/pyterrier/terrier/rewrite.py b/pyterrier/terrier/rewrite.py index 376a84a2..aa60a629 100644 --- a/pyterrier/terrier/rewrite.py +++ b/pyterrier/terrier/rewrite.py @@ -1,3 +1,4 @@ +# type: ignore import pandas as pd from warnings import warn from typing import List,Union diff --git a/pyterrier/text.py b/pyterrier/text.py index 2eb0fe1e..d9a3b393 100644 --- a/pyterrier/text.py +++ b/pyterrier/text.py @@ -63,6 +63,7 @@ def get_text( if not isinstance(indexlike, HasTextLoader): raise ValueError('indexlike must provide a .text_loader() method.') + result : pt.Transformer result = indexlike.text_loader(metadata, verbose=verbose and not by_query, **kwargs) if by_query: diff --git a/pyterrier/utils.py b/pyterrier/utils.py index 2ee631e8..91d0f778 100644 --- a/pyterrier/utils.py +++ b/pyterrier/utils.py @@ -1,6 +1,6 @@ import inspect import sys -from typing import Callable, Tuple, List, Callable +from typing import Callable, Tuple, List, Callable, Dict, Set import platform from functools import wraps from importlib.metadata import EntryPoint @@ -39,7 +39,7 @@ def convert_qrels_to_dataframe(qrels_dict) -> pd.DataFrame: Returns: pd.DataFrame: columns=['qid', 'docno', 'label'] """ - result = {'qid': [], 'docno': [], 'label': []} + result : Dict[str,List[str]] = {'qid': [], 'docno': [], 'label': []} for qid in qrels_dict: for docno, label in 
qrels_dict[qid]: result['qid'].append(qid) @@ -171,7 +171,7 @@ def get_class_methods(cls) -> List[Tuple[str, Callable]]: """ all_attrs = inspect.getmembers(cls, predicate=inspect.isfunction) - base_attrs = set() + base_attrs : Set[str] = set() for base in cls.__bases__: base_attrs.update(name for name, _ in inspect.getmembers(base, predicate=inspect.isfunction)) diff --git a/tests/test_experiment.py b/tests/test_experiment.py index 20f70faa..b27f8fa5 100644 --- a/tests/test_experiment.py +++ b/tests/test_experiment.py @@ -149,25 +149,34 @@ def test_save(self): ] topics = pt.datasets.get_dataset("vaswani").get_topics().head(10) qrels = pt.datasets.get_dataset("vaswani").get_qrels() - df1 = pt.Experiment(brs, topics, qrels, eval_metrics=["map", "mrt"], save_dir=self.test_dir) - # check save_dir files are there - self.assertTrue(os.path.exists(os.path.join(self.test_dir, "TerrierRetr(DPH).res.gz"))) - self.assertTrue(os.path.exists(os.path.join(self.test_dir, "TerrierRetr(BM25).res.gz"))) - # check for warning - with pytest.warns(UserWarning): - # reuse only kicks in when save_mode is set. - df2 = pt.Experiment(brs, topics, qrels, eval_metrics=["map", "mrt"], save_dir=self.test_dir) + import pickle + for name, format, ext in [ + ('trec', 'trec', 'res.gz'), + ('pkl_manual', pickle, 'mod'), + ('pandas', (pd.read_csv, pd.DataFrame.to_csv), 'custom') + ]: + with self.subTest(name): + df1 = pt.Experiment(brs, topics, qrels, eval_metrics=["map", "mrt"], save_dir=self.test_dir, save_format=format) + print("\n". join(os.listdir(self.test_dir))) + # check save_dir files are there + self.assertTrue(os.path.exists(os.path.join(self.test_dir, "TerrierRetr(DPH)." + ext)), os.path.join(self.test_dir, "TerrierRetr(DPH)." + ext) + " not found") + self.assertTrue(os.path.exists(os.path.join(self.test_dir, "TerrierRetr(BM25)." + ext)), os.path.join(self.test_dir, "TerrierRetr(BM25)." + ext) + " not found") - # check for error when save_mode='error' - with self.assertRaises(ValueError): - # reuse only kicks in when save_mode is set. - df2 = pt.Experiment(brs, topics, qrels, eval_metrics=["map", "mrt"], save_dir=self.test_dir, save_mode='error') + # check for warning + with pytest.warns(UserWarning): + # reuse only kicks in when save_mode is set. + df2 = pt.Experiment(brs, topics, qrels, eval_metrics=["map", "mrt"], save_dir=self.test_dir, save_format=format) + + # check for error when save_mode='error' + with self.assertRaises(ValueError): + # reuse only kicks in when save_mode is set. + df2 = pt.Experiment(brs, topics, qrels, eval_metrics=["map", "mrt"], save_dir=self.test_dir, save_mode='error', save_format=format) - # allow it to reuse - df2 = pt.Experiment(brs, topics, qrels, eval_metrics=["map", "mrt"], save_dir=self.test_dir, save_mode='reuse') - # a successful experiment using save_dir should be faster - self.assertTrue(df2.iloc[0]["mrt"] < df1.iloc[0]["mrt"]) + # allow it to reuse + df2 = pt.Experiment(brs, topics, qrels, eval_metrics=["map", "mrt"], save_dir=self.test_dir, save_mode='reuse', save_format=format) + # a successful experiment using save_dir should be faster + self.assertTrue(df2.iloc[0]["mrt"] < df1.iloc[0]["mrt"]) def test_empty(self): df1 = pt.new.ranked_documents([[1]]).head(0)
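
A minimal usage sketch of the new ``save_format`` argument to ``pt.Experiment`` introduced in this diff. The Vaswani dataset, the retriever setup and the ``./results`` directory are illustrative assumptions; only the ``save_format`` values follow the behaviour documented and tested above.

import pickle
import pandas as pd
import pyterrier as pt

dataset = pt.datasets.get_dataset("vaswani")  # illustrative dataset choice
bm25 = pt.terrier.Retriever.from_dataset("vaswani", "terrier_stemmed", wmodel="BM25")  # illustrative retriever

# default: runs are cached as TREC result files via pt.io.write_results()/pt.io.read_results()
pt.Experiment([bm25], dataset.get_topics(), dataset.get_qrels(), ["map"],
              save_dir="./results", save_mode="reuse")

# any module exposing dump()/load(), such as pickle, can be used instead of the TREC format
pt.Experiment([bm25], dataset.get_topics(), dataset.get_qrels(), ["map"],
              save_dir="./results", save_mode="reuse", save_format=pickle)

# or a (read_fn, write_fn) tuple, called as read_fn(file) and write_fn(results, file)
pt.Experiment([bm25], dataset.get_topics(), dataset.get_qrels(), ["map"],
              save_dir="./results", save_mode="reuse",
              save_format=(pd.read_csv, pd.DataFrame.to_csv))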