diff --git a/README.md b/README.md
index 93c1b1a925..ca9361b895 100644
--- a/README.md
+++ b/README.md
@@ -89,9 +89,12 @@ See individual pages for details!
| SPLADE++ CoCondenser-SelfDistil | [✓](docs/regressions/regressions-msmarco-passage-splade-pp-sd.md) | [✓](docs/regressions/regressions-dl19-passage-splade-pp-sd.md) | [✓](docs/regressions/regressions-dl20-passage-splade-pp-sd.md) |
| SPLADE++ CoCondenser-SelfDistil (ONNX) | [✓](docs/regressions/regressions-msmarco-passage-splade-pp-sd-onnx.md) | [✓](docs/regressions/regressions-dl19-passage-splade-pp-sd-onnx.md) | [✓](docs/regressions/regressions-dl20-passage-splade-pp-sd-onnx.md) |
| **Learned Dense** (HNSW) | | | |
-| cosDPR-distil w/ HNSW | [✓](docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw.md) | [✓](docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw.md) | [✓](docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw.md) |
-| cosDPR-distil w/ HSNW (ONNX) | [✓](docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-onnx.md) | [✓](docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-onnx.md) | [✓](docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-onnx.md) |
-| OpenAI-ada2 w/ HNSW | [✓](docs/regressions/regressions-msmarco-passage-openai-ada2.md) | [✓](docs/regressions/regressions-dl19-passage-openai-ada2.md) | [✓](docs/regressions/regressions-dl20-passage-openai-ada2.md) |
+| cosDPR-distil w/ HNSW fp32 | [✓](docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw.md) | [✓](docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw.md) | [✓](docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw.md) |
+| cosDPR-distil w/ HNSW int8 | [✓](docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-int8.md) | [✓](docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-int8.md) | [✓](docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-int8.md) |
+| cosDPR-distil w/ HNSW fp32 (ONNX) | [✓](docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-onnx.md) | [✓](docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-onnx.md) | [✓](docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-onnx.md) |
+| cosDPR-distil w/ HNSW int8 (ONNX) | [✓](docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.md) | [✓](docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-int8-onnx.md) | [✓](docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-int8-onnx.md) |
+| OpenAI Ada2 w/ HNSW fp32 | [✓](docs/regressions/regressions-msmarco-passage-openai-ada2.md) | [✓](docs/regressions/regressions-dl19-passage-openai-ada2.md) | [✓](docs/regressions/regressions-dl20-passage-openai-ada2.md) |
+| OpenAI Ada2 w/ HNSW int8 | [✓](docs/regressions/regressions-msmarco-passage-openai-ada2-int8.md) | [✓](docs/regressions/regressions-dl19-passage-openai-ada2-int8.md) | [✓](docs/regressions/regressions-dl20-passage-openai-ada2-int8.md) |
| **Learned Dense** (Inverted; experimental) | | | |
| cosDPR-distil w/ "fake words" | [✓](docs/regressions/regressions-msmarco-passage-cos-dpr-distil-fw.md) | [✓](docs/regressions/regressions-dl19-passage-cos-dpr-distil-fw.md) | [✓](docs/regressions/regressions-dl20-passage-cos-dpr-distil-fw.md) |
| cosDPR-distil w/ "LexLSH" | [✓](docs/regressions/regressions-msmarco-passage-cos-dpr-distil-lexlsh.md) | [✓](docs/regressions/regressions-dl19-passage-cos-dpr-distil-lexlsh.md) | [✓](docs/regressions/regressions-dl20-passage-cos-dpr-distil-lexlsh.md) |
diff --git a/docs/regressions.md b/docs/regressions.md
index dec59541f6..36f7eb59f6 100644
--- a/docs/regressions.md
+++ b/docs/regressions.md
@@ -51,13 +51,15 @@ nohup python src/main/python/run_regression.py --index --verify --search --regre
nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-splade-pp-ed >& logs/log.msmarco-passage-splade-pp-ed &
nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-splade-pp-sd >& logs/log.msmarco-passage-splade-pp-sd &
nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw >& logs/log.msmarco-passage-cos-dpr-distil-hnsw &
+nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-int8 >& logs/log.msmarco-passage-cos-dpr-distil-hnsw-int8 &
nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-fw >& logs/log.msmarco-passage-cos-dpr-distil-fw &
nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-lexlsh >& logs/log.msmarco-passage-cos-dpr-distil-lexlsh &
nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-openai-ada2 >& logs/log.msmarco-passage-openai-ada2 &
-nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression msmarco-passage-splade-pp-ed-onnx >& logs/log.msmarco-passage-splade-pp-ed-onnx &
-nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression msmarco-passage-splade-pp-sd-onnx >& logs/log.msmarco-passage-splade-pp-sd-onnx &
-nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-onnx >& logs/log.msmarco-passage-cos-dpr-distil-hnsw-onnx &
+nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-splade-pp-ed-onnx >& logs/log.msmarco-passage-splade-pp-ed-onnx &
+nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-splade-pp-sd-onnx >& logs/log.msmarco-passage-splade-pp-sd-onnx &
+nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-onnx >& logs/log.msmarco-passage-cos-dpr-distil-hnsw-onnx &
+nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-int8-onnx >& logs/log.msmarco-passage-cos-dpr-distil-hnsw-int8-onnx &
nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-doc >& logs/log.msmarco-doc &
nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-doc-wp >& logs/log.msmarco-doc-wp &
@@ -83,13 +85,15 @@ nohup python src/main/python/run_regression.py --index --verify --search --regre
nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-splade-pp-ed >& logs/log.dl19-passage-splade-pp-ed &
nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-splade-pp-sd >& logs/log.dl19-passage-splade-pp-sd &
nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw >& logs/log.dl19-passage-cos-dpr-distil-hnsw &
+nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-int8 >& logs/log.dl19-passage-cos-dpr-distil-hnsw-int8 &
nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-fw >& logs/log.dl19-passage-cos-dpr-distil-fw &
nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-lexlsh >& logs/log.dl19-passage-cos-dpr-distil-lexlsh &
nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-openai-ada2 >& logs/log.dl19-passage-openai-ada2 &
-nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression dl19-passage-splade-pp-ed-onnx >& logs/log.dl19-passage-splade-pp-ed-onnx &
-nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression dl19-passage-splade-pp-sd-onnx >& logs/log.dl19-passage-splade-pp-sd-onnx &
-nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-onnx >& logs/log.dl19-passage-cos-dpr-distil-hnsw-onnx &
+nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-splade-pp-ed-onnx >& logs/log.dl19-passage-splade-pp-ed-onnx &
+nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-splade-pp-sd-onnx >& logs/log.dl19-passage-splade-pp-sd-onnx &
+nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-onnx >& logs/log.dl19-passage-cos-dpr-distil-hnsw-onnx &
+nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-int8-onnx >& logs/log.dl19-passage-cos-dpr-distil-hnsw-int8-onnx &
nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-doc >& logs/log.dl19-doc &
nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-doc-wp >& logs/log.dl19-doc-wp &
@@ -115,13 +119,15 @@ nohup python src/main/python/run_regression.py --index --verify --search --regre
nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-splade-pp-ed >& logs/log.dl20-passage-splade-pp-ed &
nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-splade-pp-sd >& logs/log.dl20-passage-splade-pp-sd &
nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw >& logs/log.dl20-passage-cos-dpr-distil-hnsw &
+nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-int8 >& logs/log.dl20-passage-cos-dpr-distil-hnsw-int8 &
nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-fw >& logs/log.dl20-passage-cos-dpr-distil-fw &
nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-lexlsh >& logs/log.dl20-passage-cos-dpr-distil-lexlsh &
nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-openai-ada2 >& logs/log.dl20-passage-openai-ada2 &
-nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression dl20-passage-splade-pp-ed-onnx >& logs/log.dl20-passage-splade-pp-ed-onnx &
-nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression dl20-passage-splade-pp-sd-onnx >& logs/log.dl20-passage-splade-pp-sd-onnx &
-nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-onnx >& logs/log.dl20-passage-cos-dpr-distil-hnsw-onnx &
+nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-splade-pp-ed-onnx >& logs/log.dl20-passage-splade-pp-ed-onnx &
+nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-splade-pp-sd-onnx >& logs/log.dl20-passage-splade-pp-sd-onnx &
+nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-onnx >& logs/log.dl20-passage-cos-dpr-distil-hnsw-onnx &
+nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-int8-onnx >& logs/log.dl20-passage-cos-dpr-distil-hnsw-int8-onnx &
nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-doc >& logs/log.dl20-doc &
nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-doc-wp >& logs/log.dl20-doc-wp &
diff --git a/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-int8-onnx.md b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-int8-onnx.md
new file mode 100644
index 0000000000..24f529ef99
--- /dev/null
+++ b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-int8-onnx.md
@@ -0,0 +1,123 @@
+# Anserini Regressions: TREC 2019 Deep Learning Track (Passage)
+
+**Model**: cosDPR-distil with HNSW indexes (using ONNX for on-the-fly query encoding)
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper:
+
+> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom.
+
+In these experiments, we are performing query inference "on-the-fly" with ONNX.
+
+Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast).
+For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md).
+
+The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-int8-onnx.yaml).
+Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-int8-onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-int8-onnx
+```
+
+We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil.
+
+From any machine, the following command will download the corpus and perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-int8-onnx
+```
+
+The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results.
+
+## Corpus Download
+
+Download the corpus and unpack into `collections/`:
+
+```bash
+wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar -P collections/
+tar xvf collections/msmarco-passage-cos-dpr-distil.tar -C collections/
+```
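+
+If you want to confirm the integrity of the download before proceeding, a quick check along the following lines should suffice (this assumes `md5sum` is available, as on most Linux distributions; on macOS, `md5` plays the same role):
+
+```bash
+# Compare the output against the expected checksum given below.
+md5sum collections/msmarco-passage-cos-dpr-distil.tar
+```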
+
+To confirm, `msmarco-passage-cos-dpr-distil.tar` is 57 GB and has MD5 checksum `e20ffbc8b5e7f760af31298aefeaebbd`.
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-int8-onnx \
+ --corpus-path collections/msmarco-passage-cos-dpr-distil
+```
+
+## Indexing
+
+Sample indexing command, building HNSW indexes:
+
+```bash
+target/appassembler/bin/IndexHnswDenseVectors \
+ -collection JsonDenseVectorCollection \
+ -input /path/to/msmarco-passage-cos-dpr-distil \
+ -generator HnswDenseVectorDocumentGenerator \
+ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ \
+ -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 \
+ >& logs/log.msmarco-passage-cos-dpr-distil &
+```
+
+The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes, since we're storing both the int8 quantized vectors and the original float32 vectors; however, only the int8 quantized vectors need to be loaded into memory.
+See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact.
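+
+As a rough way to see the footprint trade-off described above, you can compare the on-disk size of this int8 index against the unquantized index produced by the corresponding non-int8 regression (a quick check, assuming both indexes have been built under `indexes/`):
+
+```bash
+# The int8 index stores both the quantized and the original float32 vectors,
+# so expect it to be somewhat larger on disk than the fp32-only index.
+du -sh indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ \
+       indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/
+```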
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track.
+The original data can be found [here](https://trec.nist.gov/data/deep2019.html).
+
+After indexing has completed, you should be able to perform retrieval as follows:
+
+```bash
+target/appassembler/bin/SearchHnswDenseVectors \
+ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ \
+ -topics tools/topics-and-qrels/topics.dl19-passage.txt \
+ -topicReader TsvInt \
+ -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.txt \
+ -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil &
+```
+
+Note that we are performing query inference "on-the-fly" with ONNX in these experiments.
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+tools/eval/trec_eval.9.0.4/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.txt
+tools/eval/trec_eval.9.0.4/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.txt
+tools/eval/trec_eval.9.0.4/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.txt
+tools/eval/trec_eval.9.0.4/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.txt
+```
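+
+As a convenience, the three metrics that share the `-l 2` relevance threshold can also be computed in a single `trec_eval` invocation by repeating the `-m` option (just a shortcut; the per-metric commands above remain the reference):
+
+```bash
+tools/eval/trec_eval.9.0.4/trec_eval -c -l 2 -m map -m recall.100 -m recall.1000 \
+  tools/topics-and-qrels/qrels.dl19-passage.txt \
+  runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.txt
+```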
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+| **AP@1000** | **cosDPR-distil**|
+|:-------------------------------------------------------------------------------------------------------------|-----------|
+| [DL19 (Passage)](https://trec.nist.gov/data/deep2019.html) | 0.458 |
+| **nDCG@10** | **cosDPR-distil**|
+| [DL19 (Passage)](https://trec.nist.gov/data/deep2019.html) | 0.717 |
+| **R@100** | **cosDPR-distil**|
+| [DL19 (Passage)](https://trec.nist.gov/data/deep2019.html) | 0.605 |
+| **R@1000** | **cosDPR-distil**|
+| [DL19 (Passage)](https://trec.nist.gov/data/deep2019.html) | 0.805 |
+
+Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run.
+Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-int8-onnx.yaml).
+
+Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking).
+Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`).
+The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820).
+
+## Reproduction Log[*](reproducibility.md)
+
+To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-int8-onnx.template) and run `bin/build.sh` to rebuild the documentation.
diff --git a/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-int8.md b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-int8.md
new file mode 100644
index 0000000000..bbad60de68
--- /dev/null
+++ b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-int8.md
@@ -0,0 +1,121 @@
+# Anserini Regressions: TREC 2019 Deep Learning Track (Passage)
+
+**Model**: cosDPR-distil with HNSW indexes (using pre-encoded queries)
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper:
+
+> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom.
+
+In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding).
+
+Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast).
+For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md).
+
+The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-int8.yaml).
+Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-int8.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-int8
+```
+
+We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil.
+
+From any machine, the following command will download the corpus and perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-int8
+```
+
+The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results.
+
+## Corpus Download
+
+Download the corpus and unpack into `collections/`:
+
+```bash
+wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar -P collections/
+tar xvf collections/msmarco-passage-cos-dpr-distil.tar -C collections/
+```
+
+To confirm, `msmarco-passage-cos-dpr-distil.tar` is 57 GB and has MD5 checksum `e20ffbc8b5e7f760af31298aefeaebbd`.
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-int8 \
+ --corpus-path collections/msmarco-passage-cos-dpr-distil
+```
+
+## Indexing
+
+Sample indexing command, building HNSW indexes:
+
+```bash
+target/appassembler/bin/IndexHnswDenseVectors \
+ -collection JsonDenseVectorCollection \
+ -input /path/to/msmarco-passage-cos-dpr-distil \
+ -generator HnswDenseVectorDocumentGenerator \
+ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ \
+ -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 \
+ >& logs/log.msmarco-passage-cos-dpr-distil &
+```
+
+The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes, since we're storing both the int8 quantized vectors and the original float32 vectors; however, only the int8 quantized vectors need to be loaded into memory.
+See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track.
+The original data can be found [here](https://trec.nist.gov/data/deep2019.html).
+
+After indexing has completed, you should be able to perform retrieval as follows:
+
+```bash
+target/appassembler/bin/SearchHnswDenseVectors \
+ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ \
+ -topics tools/topics-and-qrels/topics.dl19-passage.cos-dpr-distil.jsonl.gz \
+ -topicReader JsonIntVector \
+ -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.cos-dpr-distil.jsonl.txt \
+ -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 &
+```
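+
+If you are curious what the pre-encoded queries look like, you can peek at the first record of the topics file; each line should be a JSON object holding a topic id along with its encoded query vector, which is what the `JsonIntVector` topic reader consumes:
+
+```bash
+gunzip -c tools/topics-and-qrels/topics.dl19-passage.cos-dpr-distil.jsonl.gz | head -n 1
+```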
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+tools/eval/trec_eval.9.0.4/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.cos-dpr-distil.jsonl.txt
+tools/eval/trec_eval.9.0.4/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.cos-dpr-distil.jsonl.txt
+tools/eval/trec_eval.9.0.4/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.cos-dpr-distil.jsonl.txt
+tools/eval/trec_eval.9.0.4/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.cos-dpr-distil.jsonl.txt
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+| **AP@1000** | **cosDPR-distil**|
+|:-------------------------------------------------------------------------------------------------------------|-----------|
+| [DL19 (Passage)](https://trec.nist.gov/data/deep2019.html) | 0.458 |
+| **nDCG@10** | **cosDPR-distil**|
+| [DL19 (Passage)](https://trec.nist.gov/data/deep2019.html) | 0.717 |
+| **R@100** | **cosDPR-distil**|
+| [DL19 (Passage)](https://trec.nist.gov/data/deep2019.html) | 0.605 |
+| **R@1000** | **cosDPR-distil**|
+| [DL19 (Passage)](https://trec.nist.gov/data/deep2019.html) | 0.805 |
+
+Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run.
+Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-int8.yaml).
+
+Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking).
+Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`).
+The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820).
+
+## Reproduction Log[*](reproducibility.md)
+
+To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-int8.template) and run `bin/build.sh` to rebuild the documentation.
diff --git a/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-onnx.md b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-onnx.md
index b08012b307..5335899358 100644
--- a/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-onnx.md
+++ b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-onnx.md
@@ -6,7 +6,7 @@ This page describes regression experiments, integrated into Anserini's regressio
> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom.
-In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding).
+In these experiments, we are performing query inference "on-the-fly" with ONNX.
Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast).
For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md).
@@ -57,14 +57,16 @@ target/appassembler/bin/IndexHnswDenseVectors \
-input /path/to/msmarco-passage-cos-dpr-distil \
-generator HnswDenseVectorDocumentGenerator \
-index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ \
- -threads 16 -M 16 -efC 100 -memoryBuffer 65536 \
+ -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge \
>& logs/log.msmarco-passage-cos-dpr-distil &
```
The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above.
-
Upon completion, we should have an index with 8,841,823 documents.
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+
## Retrieval
Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
@@ -82,6 +84,8 @@ target/appassembler/bin/SearchHnswDenseVectors \
-generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil &
```
+Note that we are performing query inference "on-the-fly" with ONNX in these experiments.
+
Evaluation can be performed using `trec_eval`:
```bash
diff --git a/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw.md b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw.md
index dc625e14d0..9b533ff518 100644
--- a/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw.md
+++ b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw.md
@@ -57,14 +57,16 @@ target/appassembler/bin/IndexHnswDenseVectors \
-input /path/to/msmarco-passage-cos-dpr-distil \
-generator HnswDenseVectorDocumentGenerator \
-index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ \
- -threads 16 -M 16 -efC 100 -memoryBuffer 65536 \
+ -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge \
>& logs/log.msmarco-passage-cos-dpr-distil &
```
The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above.
-
Upon completion, we should have an index with 8,841,823 documents.
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+
## Retrieval
Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
diff --git a/docs/regressions/regressions-dl19-passage-openai-ada2-int8.md b/docs/regressions/regressions-dl19-passage-openai-ada2-int8.md
new file mode 100644
index 0000000000..e71b7c03b8
--- /dev/null
+++ b/docs/regressions/regressions-dl19-passage-openai-ada2-int8.md
@@ -0,0 +1,123 @@
+# Anserini Regressions: TREC 2019 Deep Learning Track (Passage)
+
+**NOTE:** We're currently having issues with this regression, which throws "Retried waiting for GCLocker too often" errors.
+
+**Model**: OpenAI-ada2 embeddings (using pre-encoded queries) with HNSW indexes
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper:
+
+> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023.
+
+In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding).
+
+Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast).
+For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md).
+
+The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage-openai-ada2-int8.yaml).
+Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage-openai-ada2-int8.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-openai-ada2-int8
+```
+
+We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model.
+
+From any machine, the following command will download the corpus and perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage-openai-ada2-int8
+```
+
+The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results.
+
+## Corpus Download
+
+Download the corpus and unpack into `collections/`:
+
+```bash
+wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.tar -P collections/
+tar xvf collections/msmarco-passage-openai-ada2.tar -C collections/
+```
+
+To confirm, `msmarco-passage-openai-ada2.tar` is 109 GB and has MD5 checksum `a4d843d522ff3a3af7edbee789a63402`.
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-openai-ada2-int8 \
+ --corpus-path collections/msmarco-passage-openai-ada2
+```
+
+## Indexing
+
+Sample indexing command, building HNSW indexes:
+
+```bash
+target/appassembler/bin/IndexHnswDenseVectors \
+ -collection JsonDenseVectorCollection \
+ -input /path/to/msmarco-passage-openai-ada2 \
+ -generator HnswDenseVectorDocumentGenerator \
+ -index indexes/lucene-hnsw.msmarco-passage-openai-ada2-int8/ \
+ -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 \
+ >& logs/log.msmarco-passage-openai-ada2 &
+```
+
+The path `/path/to/msmarco-passage-openai-ada2/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes, since we're storing both the int8 quantized vectors and the original float32 vectors; however, only the int8 quantized vectors need to be loaded into memory.
+See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track.
+The original data can be found [here](https://trec.nist.gov/data/deep2019.html).
+
+After indexing has completed, you should be able to perform retrieval as follows:
+
+```bash
+target/appassembler/bin/SearchHnswDenseVectors \
+ -index indexes/lucene-hnsw.msmarco-passage-openai-ada2-int8/ \
+ -topics tools/topics-and-qrels/topics.dl19-passage.openai-ada2.jsonl.gz \
+ -topicReader JsonIntVector \
+ -output runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl19-passage.openai-ada2.jsonl.txt \
+ -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 &
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+tools/eval/trec_eval.9.0.4/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl19-passage.openai-ada2.jsonl.txt
+tools/eval/trec_eval.9.0.4/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl19-passage.openai-ada2.jsonl.txt
+tools/eval/trec_eval.9.0.4/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl19-passage.openai-ada2.jsonl.txt
+tools/eval/trec_eval.9.0.4/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl19-passage.openai-ada2.jsonl.txt
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+| **AP@1000** | **OpenAI-ada2**|
+|:-------------------------------------------------------------------------------------------------------------|-----------|
+| [DL19 (Passage)](https://trec.nist.gov/data/deep2019.html) | 0.479 |
+| **nDCG@10** | **OpenAI-ada2**|
+| [DL19 (Passage)](https://trec.nist.gov/data/deep2019.html) | 0.704 |
+| **R@100** | **OpenAI-ada2**|
+| [DL19 (Passage)](https://trec.nist.gov/data/deep2019.html) | 0.624 |
+| **R@1000** | **OpenAI-ada2**|
+| [DL19 (Passage)](https://trec.nist.gov/data/deep2019.html) | 0.857 |
+
+Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run.
+Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl19-passage-openai-ada2-int8.yaml).
+
+Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking).
+Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`).
+The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820).
+
+## Reproduction Log[*](reproducibility.md)
+
+To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage-openai-ada2-int8.template) and run `bin/build.sh` to rebuild the documentation.
diff --git a/docs/regressions/regressions-dl19-passage-openai-ada2.md b/docs/regressions/regressions-dl19-passage-openai-ada2.md
index c5382dc3c5..57f5ab8932 100644
--- a/docs/regressions/regressions-dl19-passage-openai-ada2.md
+++ b/docs/regressions/regressions-dl19-passage-openai-ada2.md
@@ -57,14 +57,16 @@ target/appassembler/bin/IndexHnswDenseVectors \
-input /path/to/msmarco-passage-openai-ada2 \
-generator HnswDenseVectorDocumentGenerator \
-index indexes/lucene-hnsw.msmarco-passage-openai-ada2/ \
- -threads 16 -M 16 -efC 100 -memoryBuffer 65536 \
+ -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge \
>& logs/log.msmarco-passage-openai-ada2 &
```
The path `/path/to/msmarco-passage-openai-ada2/` should point to the corpus downloaded above.
-
Upon completion, we should have an index with 8,841,823 documents.
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+
## Retrieval
Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
diff --git a/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-int8-onnx.md b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-int8-onnx.md
new file mode 100644
index 0000000000..21c2f8cd12
--- /dev/null
+++ b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-int8-onnx.md
@@ -0,0 +1,123 @@
+# Anserini Regressions: TREC 2020 Deep Learning Track (Passage)
+
+**Model**: cosDPR-distil with HNSW indexes (using ONNX for on-the-fly query encoding)
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2020.html), as described in the following paper:
+
+> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom.
+
+In these experiments, we are performing query inference "on-the-fly" with ONNX.
+
+Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast).
+For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md).
+
+The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-int8-onnx.yaml).
+Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-int8-onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-int8-onnx
+```
+
+We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil.
+
+From any machine, the following command will download the corpus and perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-int8-onnx
+```
+
+The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results.
+
+## Corpus Download
+
+Download the corpus and unpack into `collections/`:
+
+```bash
+wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar -P collections/
+tar xvf collections/msmarco-passage-cos-dpr-distil.tar -C collections/
+```
+
+To confirm, `msmarco-passage-cos-dpr-distil.tar` is 57 GB and has MD5 checksum `e20ffbc8b5e7f760af31298aefeaebbd`.
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-int8-onnx \
+ --corpus-path collections/msmarco-passage-cos-dpr-distil
+```
+
+## Indexing
+
+Sample indexing command, building HNSW indexes:
+
+```bash
+target/appassembler/bin/IndexHnswDenseVectors \
+ -collection JsonDenseVectorCollection \
+ -input /path/to/msmarco-passage-cos-dpr-distil \
+ -generator HnswDenseVectorDocumentGenerator \
+ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ \
+ -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 \
+ >& logs/log.msmarco-passage-cos-dpr-distil &
+```
+
+The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes, since we're storing both the int8 quantized vectors and the original float32 vectors; however, only the int8 quantized vectors need to be loaded into memory.
+See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track.
+The original data can be found [here](https://trec.nist.gov/data/deep2020.html).
+
+After indexing has completed, you should be able to perform retrieval as follows:
+
+```bash
+target/appassembler/bin/SearchHnswDenseVectors \
+ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ \
+ -topics tools/topics-and-qrels/topics.dl20.txt \
+ -topicReader TsvInt \
+ -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.txt \
+ -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil &
+```
+
+Note that we are performing query inference "on-the-fly" with ONNX in these experiments.
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+tools/eval/trec_eval.9.0.4/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.txt
+tools/eval/trec_eval.9.0.4/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.txt
+tools/eval/trec_eval.9.0.4/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.txt
+tools/eval/trec_eval.9.0.4/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.txt
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+| **AP@1000** | **cosDPR-distil**|
+|:-------------------------------------------------------------------------------------------------------------|-----------|
+| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.482 |
+| **nDCG@10** | **cosDPR-distil**|
+| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.701 |
+| **R@100** | **cosDPR-distil**|
+| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.712 |
+| **R@1000** | **cosDPR-distil**|
+| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.843 |
+
+Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run.
+Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-int8-onnx.yaml).
+
+Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking).
+Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`).
+The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820).
+
+## Reproduction Log[*](reproducibility.md)
+
+To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-int8-onnx.template) and run `bin/build.sh` to rebuild the documentation.
diff --git a/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-int8.md b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-int8.md
new file mode 100644
index 0000000000..cc9c2f14b6
--- /dev/null
+++ b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-int8.md
@@ -0,0 +1,121 @@
+# Anserini Regressions: TREC 2020 Deep Learning Track (Passage)
+
+**Model**: cosDPR-distil with HNSW indexes (using pre-encoded queries)
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2020.html), as described in the following paper:
+
+> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom.
+
+In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding).
+
+Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast).
+For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md).
+
+The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-int8.yaml).
+Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-int8.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-int8
+```
+
+We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil.
+
+From any machine, the following command will download the corpus and perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-int8
+```
+
+The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results.
+
+## Corpus Download
+
+Download the corpus and unpack into `collections/`:
+
+```bash
+wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar -P collections/
+tar xvf collections/msmarco-passage-cos-dpr-distil.tar -C collections/
+```
+
+To confirm, `msmarco-passage-cos-dpr-distil.tar` is 57 GB and has MD5 checksum `e20ffbc8b5e7f760af31298aefeaebbd`.
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-int8 \
+ --corpus-path collections/msmarco-passage-cos-dpr-distil
+```
+
+## Indexing
+
+Sample indexing command, building HNSW indexes:
+
+```bash
+target/appassembler/bin/IndexHnswDenseVectors \
+ -collection JsonDenseVectorCollection \
+ -input /path/to/msmarco-passage-cos-dpr-distil \
+ -generator HnswDenseVectorDocumentGenerator \
+ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ \
+ -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 \
+ >& logs/log.msmarco-passage-cos-dpr-distil &
+```
+
+The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes, since we're storing both the int8 quantized vectors and the original float32 vectors; however, only the int8 quantized vectors need to be loaded into memory.
+See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track.
+The original data can be found [here](https://trec.nist.gov/data/deep2020.html).
+
+After indexing has completed, you should be able to perform retrieval as follows:
+
+```bash
+target/appassembler/bin/SearchHnswDenseVectors \
+ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ \
+ -topics tools/topics-and-qrels/topics.dl20.cos-dpr-distil.jsonl.gz \
+ -topicReader JsonIntVector \
+ -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.cos-dpr-distil.jsonl.txt \
+ -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 &
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+tools/eval/trec_eval.9.0.4/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.cos-dpr-distil.jsonl.txt
+tools/eval/trec_eval.9.0.4/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.cos-dpr-distil.jsonl.txt
+tools/eval/trec_eval.9.0.4/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.cos-dpr-distil.jsonl.txt
+tools/eval/trec_eval.9.0.4/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.cos-dpr-distil.jsonl.txt
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+| **AP@1000** | **cosDPR-distil**|
+|:-------------------------------------------------------------------------------------------------------------|-----------|
+| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.482 |
+| **nDCG@10** | **cosDPR-distil**|
+| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.701 |
+| **R@100** | **cosDPR-distil**|
+| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.712 |
+| **R@1000** | **cosDPR-distil**|
+| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.843 |
+
+Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run.
+Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-int8.yaml).
+
+Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking).
+Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`).
+The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820).
+
+## Reproduction Log[*](reproducibility.md)
+
+To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-int8.template) and run `bin/build.sh` to rebuild the documentation.
diff --git a/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-onnx.md b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-onnx.md
index f040a9ce41..a802d3370e 100644
--- a/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-onnx.md
+++ b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-onnx.md
@@ -6,7 +6,7 @@ This page describes regression experiments, integrated into Anserini's regressio
> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom.
-In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding).
+In these experiments, we are performing query inference "on-the-fly" with ONNX.
Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast).
For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md).
@@ -57,14 +57,16 @@ target/appassembler/bin/IndexHnswDenseVectors \
-input /path/to/msmarco-passage-cos-dpr-distil \
-generator HnswDenseVectorDocumentGenerator \
-index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ \
- -threads 16 -M 16 -efC 100 -memoryBuffer 65536 \
+ -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge \
>& logs/log.msmarco-passage-cos-dpr-distil &
```
The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above.
-
Upon completion, we should have an index with 8,841,823 documents.
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+
## Retrieval
Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
@@ -82,6 +84,8 @@ target/appassembler/bin/SearchHnswDenseVectors \
-generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil &
```
+Note that we are performing query inference "on-the-fly" with ONNX in these experiments.
+
Evaluation can be performed using `trec_eval`:
```bash
diff --git a/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw.md b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw.md
index c2f46b422c..d67487d1e4 100644
--- a/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw.md
+++ b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw.md
@@ -57,14 +57,16 @@ target/appassembler/bin/IndexHnswDenseVectors \
-input /path/to/msmarco-passage-cos-dpr-distil \
-generator HnswDenseVectorDocumentGenerator \
-index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ \
- -threads 16 -M 16 -efC 100 -memoryBuffer 65536 \
+ -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge \
>& logs/log.msmarco-passage-cos-dpr-distil &
```
The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above.
-
Upon completion, we should have an index with 8,841,823 documents.
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+
## Retrieval
Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
diff --git a/docs/regressions/regressions-dl20-passage-openai-ada2-int8.md b/docs/regressions/regressions-dl20-passage-openai-ada2-int8.md
new file mode 100644
index 0000000000..beb86feb50
--- /dev/null
+++ b/docs/regressions/regressions-dl20-passage-openai-ada2-int8.md
@@ -0,0 +1,123 @@
+# Anserini Regressions: TREC 2020 Deep Learning Track (Passage)
+
+**NOTE:** We're currently having issues with this regression, which throws "Retried waiting for GCLocker too often" errors.
+
+**Model**: OpenAI-ada2 embeddings (using pre-encoded queries) with quantized (int8) HNSW indexes
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2020.html), as described in the following paper:
+
+> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023.
+
+In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding).
+
+Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast).
+For additional instructions on working with the MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md).
+
+The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage-openai-ada2-int8.yaml).
+Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage-openai-ada2-int8.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-openai-ada2-int8
+```
+
+We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model.
+
+From any machine, the following command will download the corpus and perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage-openai-ada2-int8
+```
+
+The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results.
+
+## Corpus Download
+
+Download the corpus and unpack into `collections/`:
+
+```bash
+wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.tar -P collections/
+tar xvf collections/msmarco-passage-openai-ada2.tar -C collections/
+```
+
+To confirm, `msmarco-passage-openai-ada2.tar` is 109 GB and has MD5 checksum `a4d843d522ff3a3af7edbee789a63402`.
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-openai-ada2-int8 \
+ --corpus-path collections/msmarco-passage-openai-ada2
+```
+
+## Indexing
+
+Sample indexing command, building HNSW indexes:
+
+```bash
+target/appassembler/bin/IndexHnswDenseVectors \
+ -collection JsonDenseVectorCollection \
+ -input /path/to/msmarco-passage-openai-ada2 \
+ -generator HnswDenseVectorDocumentGenerator \
+ -index indexes/lucene-hnsw.msmarco-passage-openai-ada2-int8/ \
+ -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 \
+ >& logs/log.msmarco-passage-openai-ada2 &
+```
+
+The path `/path/to/msmarco-passage-openai-ada2/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we store both the int8-quantized vectors and the original float32 vectors; however, only the int8-quantized vectors need to be loaded into memory.
+See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact.
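+
+As a rough intuition for where the memory savings come from, the toy sketch below (illustration only, not Lucene's actual quantization scheme, which derives its scale from the distribution of the indexed vectors) shows scalar int8 quantization replacing each 4-byte float32 component with a single byte; for the 1536-dimensional Ada2 embeddings, this shrinks the in-memory size of each vector from roughly 6 KB to about 1.5 KB:
+
+```java
+public class ScalarQuantizationSketch {
+  // Toy scalar quantization (illustration only, not Lucene's actual algorithm):
+  // map each float component in [min, max] onto an integer in [0, 127].
+  static byte[] quantize(float[] vector, float min, float max) {
+    byte[] quantized = new byte[vector.length];
+    float scale = 127f / (max - min);
+    for (int i = 0; i < vector.length; i++) {
+      quantized[i] = (byte) Math.round((vector[i] - min) * scale);
+    }
+    return quantized;
+  }
+}
+```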
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track.
+The original data can be found [here](https://trec.nist.gov/data/deep2020.html).
+
+After indexing has completed, you should be able to perform retrieval as follows:
+
+```bash
+target/appassembler/bin/SearchHnswDenseVectors \
+ -index indexes/lucene-hnsw.msmarco-passage-openai-ada2-int8/ \
+ -topics tools/topics-and-qrels/topics.dl20-passage.openai-ada2.jsonl.gz \
+ -topicReader JsonIntVector \
+ -output runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl20-passage.openai-ada2.jsonl.txt \
+ -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 &
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+tools/eval/trec_eval.9.0.4/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl20-passage.openai-ada2.jsonl.txt
+tools/eval/trec_eval.9.0.4/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl20-passage.openai-ada2.jsonl.txt
+tools/eval/trec_eval.9.0.4/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl20-passage.openai-ada2.jsonl.txt
+tools/eval/trec_eval.9.0.4/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.dl20-passage.openai-ada2.jsonl.txt
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+| **AP@1000** | **OpenAI-ada2**|
+|:-------------------------------------------------------------------------------------------------------------|-----------|
+| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.477 |
+| **nDCG@10** | **OpenAI-ada2**|
+| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.676 |
+| **R@100** | **OpenAI-ada2**|
+| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.723 |
+| **R@1000** | **OpenAI-ada2**|
+| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.867 |
+
+Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run.
+Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl20-passage-openai-ada2-int8.yaml).
+
+Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking).
+Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`).
+The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820).
+
+## Reproduction Log[*](reproducibility.md)
+
+To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage-openai-ada2-int8.template) and run `bin/build.sh` to rebuild the documentation.
diff --git a/docs/regressions/regressions-dl20-passage-openai-ada2.md b/docs/regressions/regressions-dl20-passage-openai-ada2.md
index f3e93c63ef..ed3f4be6d7 100644
--- a/docs/regressions/regressions-dl20-passage-openai-ada2.md
+++ b/docs/regressions/regressions-dl20-passage-openai-ada2.md
@@ -57,14 +57,16 @@ target/appassembler/bin/IndexHnswDenseVectors \
-input /path/to/msmarco-passage-openai-ada2 \
-generator HnswDenseVectorDocumentGenerator \
-index indexes/lucene-hnsw.msmarco-passage-openai-ada2/ \
- -threads 16 -M 16 -efC 100 -memoryBuffer 65536 \
+ -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge \
>& logs/log.msmarco-passage-openai-ada2 &
```
The path `/path/to/msmarco-passage-openai-ada2/` should point to the corpus downloaded above.
-
Upon completion, we should have an index with 8,841,823 documents.
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+
## Retrieval
Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
diff --git a/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.md b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.md
new file mode 100644
index 0000000000..f130af6512
--- /dev/null
+++ b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.md
@@ -0,0 +1,115 @@
+# Anserini Regressions: MS MARCO Passage Ranking
+
+**Model**: cosDPR-distil with quantized (int8) HNSW indexes (using ONNX for on-the-fly query encoding)
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper:
+
+> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom.
+
+In these experiments, we are performing query inference "on-the-fly" with ONNX.
+
+The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.yaml).
+Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-int8-onnx
+```
+
+We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil.
+
+From any machine, the following command will download the corpus and perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-int8-onnx
+```
+
+The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results.
+
+## Corpus Download
+
+Download the corpus and unpack into `collections/`:
+
+```bash
+wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar -P collections/
+tar xvf collections/msmarco-passage-cos-dpr-distil.tar -C collections/
+```
+
+To confirm, `msmarco-passage-cos-dpr-distil.tar` is 57 GB and has MD5 checksum `e20ffbc8b5e7f760af31298aefeaebbd`.
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-int8-onnx \
+ --corpus-path collections/msmarco-passage-cos-dpr-distil
+```
+
+## Indexing
+
+Sample indexing command, building HNSW indexes:
+
+```bash
+target/appassembler/bin/IndexHnswDenseVectors \
+ -collection JsonDenseVectorCollection \
+ -input /path/to/msmarco-passage-cos-dpr-distil \
+ -generator HnswDenseVectorDocumentGenerator \
+ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ \
+ -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 \
+ >& logs/log.msmarco-passage-cos-dpr-distil &
+```
+
+The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we store both the int8-quantized vectors and the original float32 vectors; however, only the int8-quantized vectors need to be loaded into memory.
+See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details.
+
+After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes:
+
+```bash
+target/appassembler/bin/SearchHnswDenseVectors \
+ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ \
+ -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.txt \
+ -topicReader TsvInt \
+ -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.txt \
+ -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil &
+```
+
+Note that we are performing query inference "on-the-fly" with ONNX in these experiments.
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+tools/eval/trec_eval.9.0.4/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.txt
+tools/eval/trec_eval.9.0.4/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.txt
+tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.txt
+tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.txt
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+| **AP@1000** | **cosDPR-distil**|
+|:-------------------------------------------------------------------------------------------------------------|-----------|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.393 |
+| **RR@10** | **cosDPR-distil**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.388 |
+| **R@100** | **cosDPR-distil**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.903 |
+| **R@1000** | **cosDPR-distil**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.974 |
+
+Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run.
+Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.yaml).
+
+## Reproduction Log[*](../../docs/reproducibility.md)
+
+To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.template) and run `bin/build.sh` to rebuild the documentation.
diff --git a/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-int8.md b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-int8.md
new file mode 100644
index 0000000000..e97fb49c17
--- /dev/null
+++ b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-int8.md
@@ -0,0 +1,115 @@
+# Anserini Regressions: MS MARCO Passage Ranking
+
+**Model**: cosDPR-distil with quantized (int8) HNSW indexes (using pre-encoded queries)
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper:
+
+> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom.
+
+In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding).
+
+The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8.yaml).
+Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-int8.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-int8
+```
+
+We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil.
+
+From any machine, the following command will download the corpus and perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-int8
+```
+
+The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results.
+
+## Corpus Download
+
+Download the corpus and unpack into `collections/`:
+
+```bash
+wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar -P collections/
+tar xvf collections/msmarco-passage-cos-dpr-distil.tar -C collections/
+```
+
+To confirm, `msmarco-passage-cos-dpr-distil.tar` is 57 GB and has MD5 checksum `e20ffbc8b5e7f760af31298aefeaebbd`.
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-int8 \
+ --corpus-path collections/msmarco-passage-cos-dpr-distil
+```
+
+## Indexing
+
+Sample indexing command, building HNSW indexes:
+
+```bash
+target/appassembler/bin/IndexHnswDenseVectors \
+ -collection JsonDenseVectorCollection \
+ -input /path/to/msmarco-passage-cos-dpr-distil \
+ -generator HnswDenseVectorDocumentGenerator \
+ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ \
+ -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 \
+ >& logs/log.msmarco-passage-cos-dpr-distil &
+```
+
+The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we store both the int8-quantized vectors and the original float32 vectors; however, only the int8-quantized vectors need to be loaded into memory.
+See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details.
+
+After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes:
+
+```bash
+target/appassembler/bin/SearchHnswDenseVectors \
+ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/ \
+ -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.gz \
+ -topicReader JsonIntVector \
+ -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt \
+ -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 &
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+tools/eval/trec_eval.9.0.4/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt
+tools/eval/trec_eval.9.0.4/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt
+tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt
+tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+| **AP@1000** | **cosDPR-distil**|
+|:-------------------------------------------------------------------------------------------------------------|-----------|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.393 |
+| **RR@10** | **cosDPR-distil**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.388 |
+| **R@100** | **cosDPR-distil**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.903 |
+| **R@1000** | **cosDPR-distil**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.974 |
+
+Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run.
+Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8.yaml).
+
+## Reproduction Log[*](../../docs/reproducibility.md)
+
+To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-int8.template) and run `bin/build.sh` to rebuild the documentation.
+
++ Results reproduced by [@yilinjz](https://github.com/yilinjz) on 2023-09-01 (commit [`4ae518b`](https://github.com/castorini/anserini/commit/4ae518bb284ebcba0b273a473bc8774735cb7d19))
\ No newline at end of file
diff --git a/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-onnx.md b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-onnx.md
index 6df98970f1..a8f41de6ee 100644
--- a/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-onnx.md
+++ b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-onnx.md
@@ -6,7 +6,7 @@ This page describes regression experiments, integrated into Anserini's regressio
> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom.
-In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding).
+In these experiments, we are performing query inference "on-the-fly" with ONNX.
The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-onnx.yaml).
Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
@@ -54,14 +54,16 @@ target/appassembler/bin/IndexHnswDenseVectors \
-input /path/to/msmarco-passage-cos-dpr-distil \
-generator HnswDenseVectorDocumentGenerator \
-index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ \
- -threads 16 -M 16 -efC 100 -memoryBuffer 65536 \
+ -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge \
>& logs/log.msmarco-passage-cos-dpr-distil &
```
The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above.
-
Upon completion, we should have an index with 8,841,823 documents.
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+
## Retrieval
Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
@@ -78,6 +80,8 @@ target/appassembler/bin/SearchHnswDenseVectors \
-generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil &
```
+Note that we are performing query inference "on-the-fly" with ONNX in these experiments.
+
Evaluation can be performed using `trec_eval`:
```bash
diff --git a/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw.md b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw.md
index cf41e9645a..0d98114e80 100644
--- a/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw.md
+++ b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw.md
@@ -54,14 +54,16 @@ target/appassembler/bin/IndexHnswDenseVectors \
-input /path/to/msmarco-passage-cos-dpr-distil \
-generator HnswDenseVectorDocumentGenerator \
-index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ \
- -threads 16 -M 16 -efC 100 -memoryBuffer 65536 \
+ -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge \
>& logs/log.msmarco-passage-cos-dpr-distil &
```
The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above.
-
Upon completion, we should have an index with 8,841,823 documents.
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+
## Retrieval
Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
diff --git a/docs/regressions/regressions-msmarco-passage-openai-ada2-int8.md b/docs/regressions/regressions-msmarco-passage-openai-ada2-int8.md
new file mode 100644
index 0000000000..7b2053dec9
--- /dev/null
+++ b/docs/regressions/regressions-msmarco-passage-openai-ada2-int8.md
@@ -0,0 +1,116 @@
+# Anserini Regressions: MS MARCO Passage Ranking
+
+**NOTE:** We're currently having issues with this regression, which throws "Retried waiting for GCLocker too often" errors.
+
+**Model**: OpenAI-ada2 embeddings (using pre-encoded queries) with quantized (int8) HNSW indexes
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper:
+
+> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023.
+
+In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding).
+
+The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-passage-openai-ada2-int8.yaml).
+Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-passage-openai-ada2-int8.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-openai-ada2-int8
+```
+
+We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model.
+
+From any machine, the following command will download the corpus and perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-passage-openai-ada2-int8
+```
+
+The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results.
+
+## Corpus Download
+
+Download the corpus and unpack into `collections/`:
+
+```bash
+wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.tar -P collections/
+tar xvf collections/msmarco-passage-openai-ada2.tar -C collections/
+```
+
+To confirm, `msmarco-passage-openai-ada2.tar` is 109 GB and has MD5 checksum `a4d843d522ff3a3af7edbee789a63402`.
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-openai-ada2-int8 \
+ --corpus-path collections/msmarco-passage-openai-ada2
+```
+
+## Indexing
+
+Sample indexing command, building HNSW indexes:
+
+```bash
+target/appassembler/bin/IndexHnswDenseVectors \
+ -collection JsonDenseVectorCollection \
+ -input /path/to/msmarco-passage-openai-ada2 \
+ -generator HnswDenseVectorDocumentGenerator \
+ -index indexes/lucene-hnsw.msmarco-passage-openai-ada2-int8/ \
+ -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8 \
+ >& logs/log.msmarco-passage-openai-ada2 &
+```
+
+The path `/path/to/msmarco-passage-openai-ada2/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we store both the int8-quantized vectors and the original float32 vectors; however, only the int8-quantized vectors need to be loaded into memory.
+See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details.
+
+After indexing has completed, you should be able to perform retrieval as follows using HNSW indexes:
+
+```bash
+target/appassembler/bin/SearchHnswDenseVectors \
+ -index indexes/lucene-hnsw.msmarco-passage-openai-ada2-int8/ \
+ -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.openai-ada2.jsonl.gz \
+ -topicReader JsonIntVector \
+ -output runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt \
+ -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000 &
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+tools/eval/trec_eval.9.0.4/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt
+tools/eval/trec_eval.9.0.4/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt
+tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt
+tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-openai-ada2.openai-ada2.topics.msmarco-passage.dev-subset.openai-ada2.jsonl.txt
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+| **AP@1000** | **OpenAI-ada2**|
+|:-------------------------------------------------------------------------------------------------------------|-----------|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.350 |
+| **RR@10** | **OpenAI-ada2**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.343 |
+| **R@100** | **OpenAI-ada2**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.898 |
+| **R@1000** | **OpenAI-ada2**|
+| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.985 |
+
+Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run.
+Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/msmarco-passage-openai-ada2-int8.yaml).
+
+## Reproduction Log[*](../../docs/reproducibility.md)
+
+To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-passage-openai-ada2-int8.template) and run `bin/build.sh` to rebuild the documentation.
+
diff --git a/docs/regressions/regressions-msmarco-passage-openai-ada2.md b/docs/regressions/regressions-msmarco-passage-openai-ada2.md
index c6ca60e2bc..8f58da3521 100644
--- a/docs/regressions/regressions-msmarco-passage-openai-ada2.md
+++ b/docs/regressions/regressions-msmarco-passage-openai-ada2.md
@@ -54,14 +54,16 @@ target/appassembler/bin/IndexHnswDenseVectors \
-input /path/to/msmarco-passage-openai-ada2 \
-generator HnswDenseVectorDocumentGenerator \
-index indexes/lucene-hnsw.msmarco-passage-openai-ada2/ \
- -threads 16 -M 16 -efC 100 -memoryBuffer 65536 \
+ -threads 16 -M 16 -efC 100 -memoryBuffer 65536 -noMerge \
>& logs/log.msmarco-passage-openai-ada2 &
```
The path `/path/to/msmarco-passage-openai-ada2/` should point to the corpus downloaded above.
-
Upon completion, we should have an index with 8,841,823 documents.
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+
## Retrieval
Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
diff --git a/pom.xml b/pom.xml
index 8027e32204..9b43d57aa3 100644
--- a/pom.xml
+++ b/pom.xml
@@ -26,7 +26,7 @@
-    <lucene.version>9.8.0</lucene.version>
+    <lucene.version>9.9.1</lucene.version>
     <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
@@ -463,5 +463,15 @@
      <artifactId>api</artifactId>
      <version>0.21.0</version>
    </dependency>
+    <dependency>
+      <groupId>me.tongfei</groupId>
+      <artifactId>progressbar</artifactId>
+      <version>0.10.0</version>
+    </dependency>
+    <dependency>
+      <groupId>commons-codec</groupId>
+      <artifactId>commons-codec</artifactId>
+      <version>1.15</version>
+    </dependency>
diff --git a/src/main/java/io/anserini/index/IndexCollection.java b/src/main/java/io/anserini/index/IndexCollection.java
index f8775d92f0..2242837027 100644
--- a/src/main/java/io/anserini/index/IndexCollection.java
+++ b/src/main/java/io/anserini/index/IndexCollection.java
@@ -31,6 +31,7 @@
import org.apache.logging.log4j.Logger;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
+import org.apache.lucene.codecs.lucene99.Lucene99Codec;
import org.apache.lucene.index.ConcurrentMergeScheduler;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
@@ -161,20 +162,6 @@ public static class Args extends AbstractIndexer.Args {
public IndexCollection(Args args) throws Exception {
super(args);
- LOG.info("IndexCollection settings:");
- LOG.info(" + Generator: " + args.generatorClass);
- LOG.info(" + Language: " + args.language);
- LOG.info(" + Stemmer: " + args.stemmer);
- LOG.info(" + Keep stopwords? " + args.keepStopwords);
- LOG.info(" + Stopwords: " + args.stopwords);
- LOG.info(" + Store positions? " + args.storePositions);
- LOG.info(" + Store docvectors? " + args.storeDocvectors);
- LOG.info(" + Store document \"contents\" field? " + args.storeContents);
- LOG.info(" + Store document \"raw\" field? " + args.storeRaw);
- LOG.info(" + Additional fields to index: " + Arrays.toString(args.fields));
- LOG.info(" + Whitelist: " + args.whitelist);
- LOG.info(" + Pretokenized?: " + args.pretokenized);
-
try {
super.generatorClass = (Class<LuceneDocumentGenerator<? extends SourceDocument>>)
Class.forName("io.anserini.index.generator." + args.generatorClass);
@@ -206,6 +193,21 @@ public IndexCollection(Args args) throws Exception {
config.setMergeScheduler(new ConcurrentMergeScheduler());
super.writer = new IndexWriter(dir, config);
+
+ LOG.info("IndexCollection settings:");
+ LOG.info(" + Generator: " + args.generatorClass);
+ LOG.info(" + Language: " + args.language);
+ LOG.info(" + Stemmer: " + args.stemmer);
+ LOG.info(" + Keep stopwords? " + args.keepStopwords);
+ LOG.info(" + Stopwords: " + args.stopwords);
+ LOG.info(" + Store positions? " + args.storePositions);
+ LOG.info(" + Store docvectors? " + args.storeDocvectors);
+ LOG.info(" + Store document \"contents\" field? " + args.storeContents);
+ LOG.info(" + Store document \"raw\" field? " + args.storeRaw);
+ LOG.info(" + Additional fields to index: " + Arrays.toString(args.fields));
+ LOG.info(" + Whitelist: " + args.whitelist);
+ LOG.info(" + Pretokenized?: " + args.pretokenized);
+ LOG.info(" + Codec: " + this.writer.getConfig().getCodec());
}
private Analyzer getAnalyzer() {
diff --git a/src/main/java/io/anserini/index/IndexHnswDenseVectors.java b/src/main/java/io/anserini/index/IndexHnswDenseVectors.java
index 10e7c15640..161fbc4fac 100644
--- a/src/main/java/io/anserini/index/IndexHnswDenseVectors.java
+++ b/src/main/java/io/anserini/index/IndexHnswDenseVectors.java
@@ -23,11 +23,13 @@
import org.apache.lucene.codecs.KnnVectorsFormat;
import org.apache.lucene.codecs.KnnVectorsReader;
import org.apache.lucene.codecs.KnnVectorsWriter;
-import org.apache.lucene.codecs.lucene95.Lucene95Codec;
-import org.apache.lucene.codecs.lucene95.Lucene95HnswVectorsFormat;
+import org.apache.lucene.codecs.lucene99.Lucene99Codec;
+import org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat;
+import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat;
import org.apache.lucene.index.ConcurrentMergeScheduler;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.NoMergePolicy;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.TieredMergePolicy;
@@ -56,20 +58,34 @@ public static final class Args extends AbstractIndexer.Args {
@Option(name = "-efC", metaVar = "[num]", usage = "HNSW parameters ef Construction")
public int efC = 100;
+ @Option(name = "-quantize.int8", usage = "Quantize vectors into int8.")
+ public boolean quantizeInt8 = false;
+
@Option(name = "-storeVectors", usage = "Boolean switch to store raw raw vectors.")
public boolean storeVectors = false;
+
+ @Option(name = "-noMerge", usage = "Do not merge segments (fast indexing, slow retrieval).")
+ public boolean noMerge = false;
+
+ @Option(name = "-maxThreadMemoryBeforeFlush", metaVar = "[num]", usage = "Maximum memory consumption per thread before triggering a forced flush (in MB); must be smaller than 2048.")
+ public int maxThreadMemoryBeforeFlush = 2047;
+ // This is the most aggressive possible setting; default is 1945.
+ // If the setting is too aggressive, it may result in GCLocker issues.
+
+ @Option(name = "-maxMergedSegmentSize", metaVar = "[num]", usage = "Maximum sized segment to produce during normal merging (in MB).")
+ public int maxMergedSegmentSize = 1024 * 16;
+
+ @Option(name = "-segmentsPerTier", metaVar = "[num]", usage = "Allowed number of segments per tier.")
+ public int segmentsPerTier = 10;
+
+ @Option(name = "-maxMergeAtOnce", metaVar = "[num]", usage = "Maximum number of segments to be merged at a time during \"normal\" merging.")
+ public int maxMergeAtOnce = 10;
}
@SuppressWarnings("unchecked")
public IndexHnswDenseVectors(Args args) throws Exception {
super(args);
- LOG.info("HnswIndexer settings:");
- LOG.info(" + Generator: " + args.generatorClass);
- LOG.info(" + M: " + args.M);
- LOG.info(" + efC: " + args.efC);
- LOG.info(" + Store document vectors? " + args.storeVectors);
-
try {
super.generatorClass = (Class<LuceneDocumentGenerator<? extends SourceDocument>>)
Class.forName("io.anserini.index.generator." + args.generatorClass);
@@ -79,26 +95,49 @@ public IndexHnswDenseVectors(Args args) throws Exception {
try {
final Directory dir = FSDirectory.open(Paths.get(args.index));
- final IndexWriterConfig config = new IndexWriterConfig().setCodec(
- new Lucene95Codec() {
- @Override
- public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
- return new DelegatingKnnVectorsFormat(
- new Lucene95HnswVectorsFormat(args.M, args.efC), 4096);
- }
- });
+ final IndexWriterConfig config;
+
+ if (args.quantizeInt8) {
+ config = new IndexWriterConfig().setCodec(
+ new Lucene99Codec() {
+ @Override
+ public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
+ return new DelegatingKnnVectorsFormat(
+ new Lucene99HnswScalarQuantizedVectorsFormat(args.M, args.efC), 4096);
+ }
+ });
+ } else {
+ config = new IndexWriterConfig().setCodec(
+ new Lucene99Codec() {
+ @Override
+ public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
+ return new DelegatingKnnVectorsFormat(
+ new Lucene99HnswVectorsFormat(args.M, args.efC), 4096);
+ }
+ });
+ }
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
config.setRAMBufferSizeMB(args.memoryBuffer);
+ config.setRAMPerThreadHardLimitMB(args.maxThreadMemoryBeforeFlush);
config.setUseCompoundFile(false);
config.setMergeScheduler(new ConcurrentMergeScheduler());
- if (args.optimize) {
- // If we're going to merge down into a single segment at the end, skip intermediate merges,
- // since they are a waste of time.
+ if (args.noMerge) {
+ config.setMergePolicy(NoMergePolicy.INSTANCE);
+ } else {
TieredMergePolicy mergePolicy = new TieredMergePolicy();
- mergePolicy.setMaxMergeAtOnce(256);
- mergePolicy.setSegmentsPerTier(256);
+ if (args.optimize) {
+ // If we're going to merge down into a single segment at the end, skip intermediate merges,
+ // since they are a waste of time.
+ mergePolicy.setMaxMergeAtOnce(256);
+ mergePolicy.setSegmentsPerTier(256);
+ } else {
+ mergePolicy.setFloorSegmentMB(1024);
+ mergePolicy.setMaxMergedSegmentMB(args.maxMergedSegmentSize);
+ mergePolicy.setSegmentsPerTier(args.segmentsPerTier);
+ mergePolicy.setMaxMergeAtOnce(args.maxMergeAtOnce);
+ }
config.setMergePolicy(mergePolicy);
}
@@ -106,6 +145,24 @@ public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
} catch (Exception e) {
throw new IllegalArgumentException(String.format("Unable to create IndexWriter: %s.", e.getMessage()));
}
+
+ LOG.info("HnswIndexer settings:");
+ LOG.info(" + Generator: " + args.generatorClass);
+ LOG.info(" + M: " + args.M);
+ LOG.info(" + efC: " + args.efC);
+ LOG.info(" + Store document vectors? " + args.storeVectors);
+ LOG.info(" + Codec: " + this.writer.getConfig().getCodec());
+ LOG.info(" + MemoryBuffer: " + args.memoryBuffer);
+ LOG.info(" + MaxThreadMemoryBeforeFlush: " + args.maxThreadMemoryBeforeFlush);
+
+ if (args.noMerge) {
+ LOG.info(" + MergePolicy: NoMerge");
+ } else {
+ LOG.info(" + MergePolicy: TieredMergePolicy");
+ LOG.info(" + MaxMergedSegmentSize: " + args.maxMergedSegmentSize);
+ LOG.info(" + SegmentsPerTier: " + args.segmentsPerTier);
+ LOG.info(" + MaxMergeAtOnce: " + args.maxMergeAtOnce);
+ }
}
// Solution provided by Solr, see https://www.mail-archive.com/java-user@lucene.apache.org/msg52149.html
diff --git a/src/main/java/io/anserini/index/IndexInfo.java b/src/main/java/io/anserini/index/IndexInfo.java
new file mode 100644
index 0000000000..767493ef8b
--- /dev/null
+++ b/src/main/java/io/anserini/index/IndexInfo.java
@@ -0,0 +1,102 @@
+/*
+ * Anserini: A Lucene toolkit for reproducible information retrieval research
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.anserini.index;
+
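+/**
+ * Registry of prebuilt Anserini indexes. Each entry records the index name, a short
+ * description, the archive filename and download URLs, the MD5 checksum and size of the
+ * archive, and basic collection statistics (total terms, documents, and unique terms).
+ */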
+public enum IndexInfo {
+ MSMARCO_V1_PASSAGE("msmarco-v1-passage",
+ "Lucene index of the MS MARCO V1 passage corpus. (Lucene 9)",
+ "lucene-index.msmarco-v1-passage.20221004.252b5e.tar.gz",
+ "lucene-index.msmarco-v1-passage.20221004.252b5e.README.md",
+ new String[] {
+ "https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-passage.20221004.252b5e.tar.gz" },
+ "c697b18c9a0686ca760583e615dbe450", "2170758938", "352316036", "8841823",
+ "2660824", false),
+
+ CACM("cacm",
+ "Lucene index of the CACM corpus. (Lucene 9)",
+ "lucene-index.cacm.tar.gz",
+ new String[] {
+ "https://github.com/castorini/anserini-data/raw/master/CACM/lucene-index.cacm.20221005.252b5e.tar.gz" },
+ "cfe14d543c6a27f4d742fb2d0099b8e0",
+ "2347197",
+ "320968",
+ "3204",
+ "14363");
+
+ public final String indexName;
+ public final String description;
+ public final String filename;
+ public final String readme;
+ public final String[] urls;
+ public final String md5;
+ public final String size;
+ public final String totalTerms;
+ public final String totalDocs;
+ public final String totalUniqueTerms;
+ public final boolean downloaded;
+
+ // constructor with all 11 fields
+ IndexInfo(String indexName, String description, String filename, String readme, String[] urls, String md5,
+ String size, String totalTerms, String totalDocs, String totalUniqueTerms, boolean downloaded) {
+ this.indexName = indexName;
+ this.description = description;
+ this.filename = filename;
+ this.readme = readme;
+ this.urls = urls;
+ this.md5 = md5;
+ this.size = size;
+ this.totalTerms = totalTerms;
+ this.totalDocs = totalDocs;
+ this.totalUniqueTerms = totalUniqueTerms;
+ this.downloaded = downloaded;
+ }
+
+ // constructor with 9 fields
+ IndexInfo(String indexName, String description, String filename, String[] urls, String md5, String size,
+ String totalTerms, String totalDocs, String totalUniqueTerms) {
+ this.indexName = indexName;
+ this.description = description;
+ this.filename = filename;
+ this.readme = "";
+ this.urls = urls;
+ this.md5 = md5;
+ this.size = size;
+ this.totalTerms = totalTerms;
+ this.totalDocs = totalDocs;
+ this.totalUniqueTerms = totalUniqueTerms;
+ this.downloaded = false;
+ }
+
+ public static boolean contains(String indexName) {
+ for (IndexInfo indexInfo : IndexInfo.values()) {
+ if (indexInfo.indexName.equals(indexName)) {
+ return true;
+ }
+ }
+ return false;
+ }
+
+ public static IndexInfo get(String indexName) {
+ for (IndexInfo indexInfo : IndexInfo.values()) {
+ if (indexInfo.indexName.equals(indexName)) {
+ return indexInfo;
+ }
+ }
+ throw new IllegalArgumentException("Index name " + indexName + " not found!");
+ }
+
+}
diff --git a/src/main/java/io/anserini/index/IndexInvertedDenseVectors.java b/src/main/java/io/anserini/index/IndexInvertedDenseVectors.java
index e3a0c8e810..249c626a97 100644
--- a/src/main/java/io/anserini/index/IndexInvertedDenseVectors.java
+++ b/src/main/java/io/anserini/index/IndexInvertedDenseVectors.java
@@ -25,7 +25,7 @@
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.codecs.lucene95.Lucene95Codec;
+import org.apache.lucene.codecs.lucene99.Lucene99Codec;
import org.apache.lucene.index.ConcurrentMergeScheduler;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
@@ -78,10 +78,6 @@ public static final class Args extends AbstractIndexer.Args {
public IndexInvertedDenseVectors(Args args) {
super(args);
- LOG.info("InvertedDenseIndexer settings:");
- LOG.info(" + Generator: " + args.generatorClass);
- LOG.info(" + Encoding: " + args.encoding);
-
try {
super.generatorClass = (Class<LuceneDocumentGenerator<? extends SourceDocument>>)
Class.forName("io.anserini.index.generator." + args.generatorClass);
@@ -104,7 +100,7 @@ public IndexInvertedDenseVectors(Args args) {
try {
final Directory dir = FSDirectory.open(Paths.get(args.index));
- final IndexWriterConfig config = new IndexWriterConfig(analyzer).setCodec(new Lucene95Codec());
+ final IndexWriterConfig config = new IndexWriterConfig(analyzer);
config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
config.setRAMBufferSizeMB(args.memoryBuffer);
config.setUseCompoundFile(false);
@@ -113,6 +109,11 @@ public IndexInvertedDenseVectors(Args args) {
} catch (Exception e) {
throw new IllegalArgumentException(String.format("Unable to create IndexWriter: %s.", e.getMessage()));
}
+
+ LOG.info("InvertedDenseIndexer settings:");
+ LOG.info(" + Generator: " + args.generatorClass);
+ LOG.info(" + Encoding: " + args.encoding);
+ LOG.info(" + Codec: " + this.writer.getConfig().getCodec());
}
public static void main(String[] args) throws Exception {
diff --git a/src/main/java/io/anserini/search/SearchCollection.java b/src/main/java/io/anserini/search/SearchCollection.java
index d94027cf3f..271070bb47 100644
--- a/src/main/java/io/anserini/search/SearchCollection.java
+++ b/src/main/java/io/anserini/search/SearchCollection.java
@@ -44,6 +44,8 @@
import io.anserini.search.topicreader.BackgroundLinkingTopicReader;
import io.anserini.search.topicreader.TopicReader;
import io.anserini.search.topicreader.Topics;
+import io.anserini.util.PrebuiltIndexHandler;
+
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.commons.lang3.time.DurationFormatUtils;
@@ -989,12 +991,27 @@ public void run() {
public SearchCollection(Args args) throws IOException {
this.args = args;
- Path indexPath = Paths.get(args.index);
+ Path indexPath = Path.of(args.index);
+ PrebuiltIndexHandler indexHandler = new PrebuiltIndexHandler(args.index);
+ if (!Files.exists(indexPath)) {
+ // it doesn't exist locally, we try to download it from remote
+ try {
+ indexHandler.initialize();
+ indexHandler.download();
+ indexPath = Path.of(indexHandler.decompressIndex());
+ } catch (IOException e) {
+ throw new RuntimeException("MD5 checksum does not match!");
+ } catch (Exception e) {
+ throw new IllegalArgumentException(String.format("Index path '%s' does not exist or is not a directory.", args.index));
+ }
+ } else {
+ // if it exists locally, we use it
+ indexPath = Paths.get(args.index);
+ }
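+ // At this point indexPath refers either to the user-supplied local index or to the
+ // downloaded-and-decompressed prebuilt index; the check below validates it in both cases.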
if (!Files.exists(indexPath) || !Files.isDirectory(indexPath) || !Files.isReadable(indexPath)) {
throw new IllegalArgumentException(String.format("Index path '%s' does not exist or is not a directory.", args.index));
}
-
LOG.info("============ Initializing Searcher ============");
LOG.info("Index: " + indexPath);
this.reader = args.inmem ? DirectoryReader.open(MMapDirectory.open(indexPath)) :
diff --git a/src/main/java/io/anserini/util/PrebuiltIndexHandler.java b/src/main/java/io/anserini/util/PrebuiltIndexHandler.java
new file mode 100644
index 0000000000..e9ad12a1f5
--- /dev/null
+++ b/src/main/java/io/anserini/util/PrebuiltIndexHandler.java
@@ -0,0 +1,206 @@
+/*
+ * Anserini: A Lucene toolkit for reproducible information retrieval research
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.anserini.util;
+
+import me.tongfei.progressbar.ProgressBar;
+
+import org.apache.commons.codec.digest.DigestUtils;
+import org.apache.commons.io.IOUtils;
+import org.apache.commons.io.input.CountingInputStream;
+
+import io.anserini.index.IndexInfo;
+
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.HttpURLConnection;
+import java.net.URL;
+import java.nio.file.Files;
+import java.nio.file.Path;
+import java.nio.file.Paths;
+
+public class PrebuiltIndexHandler {
+ private String indexName;
+ private String saveRootPath;
+ private IndexInfo info = null;
+ private Path indexFolderPath = null;
+ private boolean initialized = false;
+ private Path savePath;
+
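+ /*
+ * Usage sketch (mirrors how SearchCollection consumes this class in this same change):
+ *
+ *   PrebuiltIndexHandler handler = new PrebuiltIndexHandler("msmarco-v1-passage");
+ *   handler.initialize();
+ *   handler.download();
+ *   String indexDir = handler.decompressIndex();
+ */
+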
+ public PrebuiltIndexHandler(String indexName) {
+ this.indexName = indexName;
+ this.saveRootPath = getCache();
+ }
+
+ private String getCache() {
+ /*
+ * Get the pyserini cache path first to avoid double downloads. If the pyserini
+ * cache path does not exist, use the anserini cache path.
+ */
+ final Path pyseriniPath = Paths.get(System.getProperty("user.home"), ".cache", "pyserini", "indexes");
+ final Path anseriniPath = Paths.get(System.getProperty("user.home"), ".cache", "anserini", "indexes");
+ if (checkFileExist(pyseriniPath)) {
+ return pyseriniPath.toString();
+ } else {
+ return anseriniPath.toString();
+ }
+ }
+
+ private static boolean checkFileExist(Path path) {
+ return path.toFile().exists();
+ }
+
+ private boolean checkIndexFileExist() {
+ /*
+ * Check if the index file exists. If the index file exists, return true.
+ * Otherwise, return false.
+ */
+ return checkFileExist(savePath) || checkFileExist(Paths.get(savePath.toString().replace(".gz", "")))
+ || checkFileExist(Paths.get(savePath.toString().replace(".tar.gz", "")));
+ }
+
+ private static IndexInfo getIndexInfo(String indexName) {
+ /*
+ * Get the index info from the index name.
+ */
+ try {
+ IndexInfo info = IndexInfo.get(indexName);
+ return info;
+ } catch (IllegalArgumentException e) {
+ throw new IllegalArgumentException("Index not found!" + e.getMessage());
+ }
+ }
+
+ private static boolean checkMD5(InputStream st, String md5) throws IOException {
+ /*
+ * Check the MD5 checksum of the index file.
+ */
+ String generatedChecksum = DigestUtils.md5Hex(st);
+ return generatedChecksum.equals(md5);
+ }
+
+ public void initialize() {
+ if (initialized) {
+ return;
+ }
+ info = getIndexInfo(indexName);
+ // check if saveRootPath exists
+ if (!checkFileExist(Paths.get(saveRootPath))) {
+ try {
+ Files.createDirectories(Paths.get(saveRootPath));
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }
+ savePath = Paths.get(saveRootPath, info.filename);
+ initialized = true;
+ }
+
+ public void download() throws IOException {
+ /*
+ * Download the index file to the save path. If the file already exists, do
+ * nothing. If the file does not exist, download the file and check the MD5
+ * checksum.
+ */
+ if (!initialized) {
+ throw new IllegalStateException("Handler not initialized!");
+ }
+ if (checkIndexFileExist()) {
+ System.out.println("Index file already exists! Skip downloading.");
+ return;
+ }
+
+ URL url = new URL(info.urls[0]);
+ HttpURLConnection httpConnection = (HttpURLConnection) (url.openConnection());
+ long completeFileSize = httpConnection.getContentLengthLong();
+
+ try (InputStream inputStream = url.openStream();
+ CountingInputStream cis = new CountingInputStream(inputStream);
+ FileOutputStream fileOS = new FileOutputStream(savePath.toFile());
+ ProgressBar pb = new ProgressBar(indexName, Math.floorDiv(completeFileSize, 1000))) {
+
+ pb.setExtraMessage("Downloading...");
+
+ new Thread(() -> {
+ try {
+ IOUtils.copyLarge(cis, fileOS);
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ }).start();
+
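+ // Poll the byte count to advance the progress bar while the copy runs on the background thread.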
+ while (cis.getByteCount() < completeFileSize) {
+ pb.stepTo(Math.floorDiv(cis.getByteCount(), 1000));
+ }
+
+ pb.stepTo(Math.floorDiv(cis.getByteCount(), 1000));
+ pb.close();
+
+ InputStream is = Files.newInputStream(savePath);
+ if (!checkMD5(is, info.md5)) {
+ throw new IOException("MD5 check failed!");
+ }
+ }
+ }
+
+ public String decompressIndex() throws Exception {
+ /*
+ * Decompress the tar.gz or tar index file to an archive folder. If the folder
+ * already exists, do nothing.
+ */
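+ // Note: decompression shells out to the host's `gzip` and `tar` binaries, so both must be available on the PATH.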
+ if (!initialized) {
+ throw new IllegalStateException("Handler not initialized!");
+ }
+ if (!checkIndexFileExist()) {
+ throw new Exception("Index file does not exist!");
+ }
+
+ String indexFolder = savePath.toString().replace(".tar.gz", "");
+ if (checkFileExist(Paths.get(indexFolder))) {
+ System.out.println("Index folder already exists!");
+ return indexFolder;
+ }
+ System.out.println("Decompressing index...");
+
+ if (checkFileExist(Paths.get(savePath.toString()))) {
+ ProcessBuilder pbGZIP = new ProcessBuilder("gzip", "-d", savePath.toString());
+ Process pGZIP = pbGZIP.start();
+ pGZIP.waitFor();
+ }
+
+ if (checkFileExist(Paths.get(savePath.toString().replace(".gz", "")))) {
+ ProcessBuilder pbTAR = new ProcessBuilder("tar", "-xvf",
+ savePath.toString().substring(0, savePath.toString().length() - 3), "-C", saveRootPath);
+ Process pTar = pbTAR.start();
+ pTar.waitFor();
+
+ // delete the tar file to save space
+ Files.delete(Path.of(savePath.toString().replace(".gz", "")));
+ }
+
+ System.out.println("Index decompressed successfully!");
+ this.indexFolderPath = Paths.get(indexFolder);
+ return indexFolder;
+ }
+
+ public Path getIndexFolderPath() {
+ return this.indexFolderPath;
+ }
+}
diff --git a/src/main/python/regressions-batch03.txt b/src/main/python/regressions-batch03.txt
index baab7fc298..4a38ff5dfc 100644
--- a/src/main/python/regressions-batch03.txt
+++ b/src/main/python/regressions-batch03.txt
@@ -1,8 +1,10 @@
+python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw > logs/log.msmarco-passage-cos-dpr-distil-hnsw 2>&1
+python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-int8 > logs/log.msmarco-passage-cos-dpr-distil-hnsw-int8 2>&1
+python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-openai-ada2 > logs/log.msmarco-passage-openai-ada2 2>&1
+
# MS MARCO V1 passage
python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-splade-pp-ed > logs/log.msmarco-passage-splade-pp-ed 2>&1
python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-splade-pp-sd > logs/log.msmarco-passage-splade-pp-sd 2>&1
-python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-openai-ada2 > logs/log.msmarco-passage-openai-ada2 2>&1
-python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw > logs/log.msmarco-passage-cos-dpr-distil-hnsw 2>&1
python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-fw > logs/log.msmarco-passage-cos-dpr-distil-fw 2>&1
python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-lexlsh > logs/log.msmarco-passage-cos-dpr-distil-lexlsh 2>&1
@@ -20,6 +22,10 @@ python src/main/python/run_regression.py --index --verify --search --regression
python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-distill-splade-max > logs/log.msmarco-passage-distill-splade-max 2>&1
python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-splade-distil-cocodenser-medium > logs/log.msmarco-passage-splade-distil-cocodenser-medium 2>&1
+# HNSW search-only
+python src/main/python/run_regression.py --search --regression msmarco-passage-cos-dpr-distil-hnsw-onnx > logs/log.msmarco-passage-cos-dpr-distil-hnsw-onnx 2>&1
+python src/main/python/run_regression.py --search --regression msmarco-passage-cos-dpr-distil-hnsw-int8-onnx > logs/log.msmarco-passage-cos-dpr-distil-hnsw-int8-onnx 2>&1
+
# MS MARCO V1 doc
python src/main/python/run_regression.py --index --verify --search --regression msmarco-doc > logs/log.msmarco-doc 2>&1
python src/main/python/run_regression.py --index --verify --search --regression msmarco-doc-wp > logs/log.msmarco-doc-wp 2>&1
@@ -34,9 +40,8 @@ python src/main/python/run_regression.py --index --verify --search --regression
python src/main/python/run_regression.py --index --verify --search --regression msmarco-doc-segmented-unicoil-noexp > logs/log.msmarco-doc-segmented-unicoil-noexp 2>&1
# MS MARCO V1 passage ONNX runs - uses same index, so need to make sure previous runs finish
-python src/main/python/run_regression.py --search-pool 1 --verify --search --regression msmarco-passage-splade-pp-ed-onnx > logs/log.msmarco-passage-splade-pp-ed-onnx 2>&1
-python src/main/python/run_regression.py --search-pool 1 --verify --search --regression msmarco-passage-splade-pp-sd-onnx > logs/log.msmarco-passage-splade-pp-sd-onnx 2>&1
-python src/main/python/run_regression.py --search-pool 1 --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-onnx > logs/log.msmarco-passage-cos-dpr-distil-hnsw-onnx 2>&1
+python src/main/python/run_regression.py --search --regression msmarco-passage-splade-pp-ed-onnx > logs/log.msmarco-passage-splade-pp-ed-onnx 2>&1
+python src/main/python/run_regression.py --search --regression msmarco-passage-splade-pp-sd-onnx > logs/log.msmarco-passage-splade-pp-sd-onnx 2>&1
# MIRACL
python src/main/python/run_regression.py --index --verify --search --regression miracl-v1.0-ar > logs/log.miracl-v1.0-ar 2>&1
@@ -121,107 +126,107 @@ python src/main/python/run_regression.py --index --verify --search --regression
python src/main/python/run_regression.py --index --verify --search --regression mrtydi-v1.1-te-aca > logs/log.mrtydi-v1.1-te-aca 2>&1
python src/main/python/run_regression.py --index --verify --search --regression mrtydi-v1.1-th-aca > logs/log.mrtydi-v1.1-th-aca 2>&1
-# DL19 - ONNX
-python src/main/python/run_regression.py --search-pool 1 --verify --search --regression dl19-passage-splade-pp-ed-onnx > logs/log.dl19-passage-splade-pp-ed-onnx 2>&1
-python src/main/python/run_regression.py --search-pool 1 --verify --search --regression dl19-passage-splade-pp-sd-onnx > logs/log.dl19-passage-splade-pp-sd-onnx 2>&1
-python src/main/python/run_regression.py --search-pool 1 --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-onnx > logs/log.dl19-passage-cos-dpr-distil-hnsw-onnx 2>&1
-
-# Other DL19
-python src/main/python/run_regression.py --verify --search --regression dl19-passage > logs/log.dl19-passage 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl19-passage-bm25-b8 > logs/log.dl19-passage-bm25-b8 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl19-passage-ca > logs/log.dl19-passage-ca 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl19-passage-wp > logs/log.dl19-passage-wp 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl19-passage-hgf-wp > logs/log.dl19-passage-hgf-wp 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl19-passage-docTTTTTquery > logs/log.dl19-passage-docTTTTTquery 2>&1
-
-python src/main/python/run_regression.py --verify --search --regression dl19-passage-unicoil > logs/log.dl19-passage-unicoil 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl19-passage-unicoil-noexp > logs/log.dl19-passage-unicoil-noexp 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl19-passage-splade-distil-cocodenser-medium > logs/log.dl19-passage-splade-distil-cocodenser-medium 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl19-passage-splade-pp-ed > logs/log.dl19-passage-splade-pp-ed 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl19-passage-splade-pp-sd > logs/log.dl19-passage-splade-pp-sd 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl19-passage-cos-dpr-distil-hnsw > logs/log.dl19-passage-cos-dpr-distil-hnsw 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl19-passage-cos-dpr-distil-fw > logs/log.dl19-passage-cos-dpr-distil-fw 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl19-passage-cos-dpr-distil-lexlsh > logs/log.dl19-passage-cos-dpr-distil-lexlsh 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl19-passage-openai-ada2 > logs/log.dl19-passage-openai-ada2 2>&1
-
-python src/main/python/run_regression.py --verify --search --regression dl19-doc > logs/log.dl19-doc 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl19-doc-ca > logs/log.dl19-doc-ca 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl19-doc-wp > logs/log.dl19-doc-wp 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl19-doc-hgf-wp > logs/log.dl19-doc-hgf-wp 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl19-doc-segmented-wp > logs/log.dl19-doc-segmented-wp 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl19-doc-docTTTTTquery > logs/log.dl19-doc-docTTTTTquery 2>&1
-
-python src/main/python/run_regression.py --verify --search --regression dl19-doc-segmented > logs/log.dl19-doc-segmented 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl19-doc-segmented-ca > logs/log.dl19-doc-segmented-ca 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl19-doc-segmented-docTTTTTquery > logs/log.dl19-doc-segmented-docTTTTTquery 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl19-doc-segmented-unicoil > logs/log.dl19-doc-segmented-unicoil 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl19-doc-segmented-unicoil-noexp > logs/log.dl19-doc-segmented-unicoil-noexp 2>&1
-
-# DL20 - ONNX
-python src/main/python/run_regression.py --search-pool 1 --verify --search --regression dl20-passage-splade-pp-ed-onnx > logs/log.dl20-passage-splade-pp-ed-onnx 2>&1
-python src/main/python/run_regression.py --search-pool 1 --verify --search --regression dl20-passage-splade-pp-sd-onnx > logs/log.dl20-passage-splade-pp-sd-onnx 2>&1
-python src/main/python/run_regression.py --search-pool 1 --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-onnx > logs/log.dl20-passage-cos-dpr-distil-hnsw-onnx 2>&1
-
-# Other DL20
-python src/main/python/run_regression.py --verify --search --regression dl20-passage > logs/log.dl20-passage 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl20-passage-bm25-b8 > logs/log.dl20-passage-bm25-b8 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl20-passage-ca > logs/log.dl20-passage-ca 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl20-passage-wp > logs/log.dl20-passage-wp 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl20-passage-hgf-wp > logs/log.dl20-passage-hgf-wp 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl20-passage-docTTTTTquery > logs/log.dl20-passage-docTTTTTquery 2>&1
-
-python src/main/python/run_regression.py --verify --search --regression dl20-passage-unicoil > logs/log.dl20-passage-unicoil 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl20-passage-unicoil-noexp > logs/log.dl20-passage-unicoil-noexp 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl20-passage-splade-distil-cocodenser-medium > logs/log.dl20-passage-splade-distil-cocodenser-medium 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl20-passage-splade-pp-ed > logs/log.dl20-passage-splade-pp-ed 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl20-passage-splade-pp-sd > logs/log.dl20-passage-splade-pp-sd 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl20-passage-cos-dpr-distil-hnsw > logs/log.dl20-passage-cos-dpr-distil-hnsw 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl20-passage-cos-dpr-distil-fw > logs/log.dl20-passage-cos-dpr-distil-fw 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl20-passage-cos-dpr-distil-lexlsh > logs/log.dl20-passage-cos-dpr-distil-lexlsh 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl20-passage-openai-ada2 > logs/log.dl20-passage-openai-ada2 2>&1
-
-python src/main/python/run_regression.py --verify --search --regression dl20-doc > logs/log.dl20-doc 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl20-doc-ca > logs/log.dl20-doc-ca 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl20-doc-wp > logs/log.dl20-doc-wp 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl20-doc-hgf-wp > logs/log.dl20-doc-hgf-wp 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl20-doc-docTTTTTquery > logs/log.dl20-doc-docTTTTTquery 2>&1
-
-python src/main/python/run_regression.py --verify --search --regression dl20-doc-segmented > logs/log.dl20-doc-segmented 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl20-doc-segmented-ca > logs/log.dl20-doc-segmented-ca 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl20-doc-segmented-wp > logs/log.dl20-doc-segmented-wp 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl20-doc-segmented-docTTTTTquery > logs/log.dl20-doc-segmented-docTTTTTquery 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl20-doc-segmented-unicoil > logs/log.dl20-doc-segmented-unicoil 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl20-doc-segmented-unicoil-noexp > logs/log.dl20-doc-segmented-unicoil-noexp 2>&1
+# DL19
+python src/main/python/run_regression.py --search --regression dl19-passage > logs/log.dl19-passage 2>&1
+python src/main/python/run_regression.py --search --regression dl19-passage-bm25-b8 > logs/log.dl19-passage-bm25-b8 2>&1
+python src/main/python/run_regression.py --search --regression dl19-passage-ca > logs/log.dl19-passage-ca 2>&1
+python src/main/python/run_regression.py --search --regression dl19-passage-wp > logs/log.dl19-passage-wp 2>&1
+python src/main/python/run_regression.py --search --regression dl19-passage-hgf-wp > logs/log.dl19-passage-hgf-wp 2>&1
+python src/main/python/run_regression.py --search --regression dl19-passage-docTTTTTquery > logs/log.dl19-passage-docTTTTTquery 2>&1
+python src/main/python/run_regression.py --search --regression dl19-passage-splade-pp-ed-onnx > logs/log.dl19-passage-splade-pp-ed-onnx 2>&1
+python src/main/python/run_regression.py --search --regression dl19-passage-splade-pp-sd-onnx > logs/log.dl19-passage-splade-pp-sd-onnx 2>&1
+
+python src/main/python/run_regression.py --search --regression dl19-passage-unicoil > logs/log.dl19-passage-unicoil 2>&1
+python src/main/python/run_regression.py --search --regression dl19-passage-unicoil-noexp > logs/log.dl19-passage-unicoil-noexp 2>&1
+python src/main/python/run_regression.py --search --regression dl19-passage-splade-distil-cocodenser-medium > logs/log.dl19-passage-splade-distil-cocodenser-medium 2>&1
+python src/main/python/run_regression.py --search --regression dl19-passage-splade-pp-ed > logs/log.dl19-passage-splade-pp-ed 2>&1
+python src/main/python/run_regression.py --search --regression dl19-passage-splade-pp-sd > logs/log.dl19-passage-splade-pp-sd 2>&1
+python src/main/python/run_regression.py --search --regression dl19-passage-cos-dpr-distil-hnsw > logs/log.dl19-passage-cos-dpr-distil-hnsw 2>&1
+python src/main/python/run_regression.py --search --regression dl19-passage-cos-dpr-distil-hnsw-int8 > logs/log.dl19-passage-cos-dpr-distil-hnsw-int8 2>&1
+python src/main/python/run_regression.py --search --regression dl19-passage-cos-dpr-distil-hnsw-onnx > logs/log.dl19-passage-cos-dpr-distil-hnsw-onnx 2>&1
+python src/main/python/run_regression.py --search --regression dl19-passage-cos-dpr-distil-hnsw-int8-onnx > logs/log.dl19-passage-cos-dpr-distil-hnsw-int8-onnx 2>&1
+python src/main/python/run_regression.py --search --regression dl19-passage-cos-dpr-distil-fw > logs/log.dl19-passage-cos-dpr-distil-fw 2>&1
+python src/main/python/run_regression.py --search --regression dl19-passage-cos-dpr-distil-lexlsh > logs/log.dl19-passage-cos-dpr-distil-lexlsh 2>&1
+python src/main/python/run_regression.py --search --regression dl19-passage-openai-ada2 > logs/log.dl19-passage-openai-ada2 2>&1
+
+python src/main/python/run_regression.py --search --regression dl19-doc > logs/log.dl19-doc 2>&1
+python src/main/python/run_regression.py --search --regression dl19-doc-ca > logs/log.dl19-doc-ca 2>&1
+python src/main/python/run_regression.py --search --regression dl19-doc-wp > logs/log.dl19-doc-wp 2>&1
+python src/main/python/run_regression.py --search --regression dl19-doc-hgf-wp > logs/log.dl19-doc-hgf-wp 2>&1
+python src/main/python/run_regression.py --search --regression dl19-doc-segmented-wp > logs/log.dl19-doc-segmented-wp 2>&1
+python src/main/python/run_regression.py --search --regression dl19-doc-docTTTTTquery > logs/log.dl19-doc-docTTTTTquery 2>&1
+
+python src/main/python/run_regression.py --search --regression dl19-doc-segmented > logs/log.dl19-doc-segmented 2>&1
+python src/main/python/run_regression.py --search --regression dl19-doc-segmented-ca > logs/log.dl19-doc-segmented-ca 2>&1
+python src/main/python/run_regression.py --search --regression dl19-doc-segmented-docTTTTTquery > logs/log.dl19-doc-segmented-docTTTTTquery 2>&1
+python src/main/python/run_regression.py --search --regression dl19-doc-segmented-unicoil > logs/log.dl19-doc-segmented-unicoil 2>&1
+python src/main/python/run_regression.py --search --regression dl19-doc-segmented-unicoil-noexp > logs/log.dl19-doc-segmented-unicoil-noexp 2>&1
+
+# DL20
+python src/main/python/run_regression.py --search --regression dl20-passage > logs/log.dl20-passage 2>&1
+python src/main/python/run_regression.py --search --regression dl20-passage-bm25-b8 > logs/log.dl20-passage-bm25-b8 2>&1
+python src/main/python/run_regression.py --search --regression dl20-passage-ca > logs/log.dl20-passage-ca 2>&1
+python src/main/python/run_regression.py --search --regression dl20-passage-wp > logs/log.dl20-passage-wp 2>&1
+python src/main/python/run_regression.py --search --regression dl20-passage-hgf-wp > logs/log.dl20-passage-hgf-wp 2>&1
+python src/main/python/run_regression.py --search --regression dl20-passage-docTTTTTquery > logs/log.dl20-passage-docTTTTTquery 2>&1
+python src/main/python/run_regression.py --search --regression dl20-passage-splade-pp-ed-onnx > logs/log.dl20-passage-splade-pp-ed-onnx 2>&1
+python src/main/python/run_regression.py --search --regression dl20-passage-splade-pp-sd-onnx > logs/log.dl20-passage-splade-pp-sd-onnx 2>&1
+
+python src/main/python/run_regression.py --search --regression dl20-passage-unicoil > logs/log.dl20-passage-unicoil 2>&1
+python src/main/python/run_regression.py --search --regression dl20-passage-unicoil-noexp > logs/log.dl20-passage-unicoil-noexp 2>&1
+python src/main/python/run_regression.py --search --regression dl20-passage-splade-distil-cocodenser-medium > logs/log.dl20-passage-splade-distil-cocodenser-medium 2>&1
+python src/main/python/run_regression.py --search --regression dl20-passage-splade-pp-ed > logs/log.dl20-passage-splade-pp-ed 2>&1
+python src/main/python/run_regression.py --search --regression dl20-passage-splade-pp-sd > logs/log.dl20-passage-splade-pp-sd 2>&1
+python src/main/python/run_regression.py --search --regression dl20-passage-cos-dpr-distil-hnsw > logs/log.dl20-passage-cos-dpr-distil-hnsw 2>&1
+python src/main/python/run_regression.py --search --regression dl20-passage-cos-dpr-distil-hnsw-int8 > logs/log.dl20-passage-cos-dpr-distil-hnsw-int8 2>&1
+python src/main/python/run_regression.py --search --regression dl20-passage-cos-dpr-distil-hnsw-onnx > logs/log.dl20-passage-cos-dpr-distil-hnsw-onnx 2>&1
+python src/main/python/run_regression.py --search --regression dl20-passage-cos-dpr-distil-hnsw-int8-onnx > logs/log.dl20-passage-cos-dpr-distil-hnsw-int8-onnx 2>&1
+python src/main/python/run_regression.py --search --regression dl20-passage-cos-dpr-distil-fw > logs/log.dl20-passage-cos-dpr-distil-fw 2>&1
+python src/main/python/run_regression.py --search --regression dl20-passage-cos-dpr-distil-lexlsh > logs/log.dl20-passage-cos-dpr-distil-lexlsh 2>&1
+python src/main/python/run_regression.py --search --regression dl20-passage-openai-ada2 > logs/log.dl20-passage-openai-ada2 2>&1
+
+python src/main/python/run_regression.py --search --regression dl20-doc > logs/log.dl20-doc 2>&1
+python src/main/python/run_regression.py --search --regression dl20-doc-ca > logs/log.dl20-doc-ca 2>&1
+python src/main/python/run_regression.py --search --regression dl20-doc-wp > logs/log.dl20-doc-wp 2>&1
+python src/main/python/run_regression.py --search --regression dl20-doc-hgf-wp > logs/log.dl20-doc-hgf-wp 2>&1
+python src/main/python/run_regression.py --search --regression dl20-doc-docTTTTTquery > logs/log.dl20-doc-docTTTTTquery 2>&1
+
+python src/main/python/run_regression.py --search --regression dl20-doc-segmented > logs/log.dl20-doc-segmented 2>&1
+python src/main/python/run_regression.py --search --regression dl20-doc-segmented-ca > logs/log.dl20-doc-segmented-ca 2>&1
+python src/main/python/run_regression.py --search --regression dl20-doc-segmented-wp > logs/log.dl20-doc-segmented-wp 2>&1
+python src/main/python/run_regression.py --search --regression dl20-doc-segmented-docTTTTTquery > logs/log.dl20-doc-segmented-docTTTTTquery 2>&1
+python src/main/python/run_regression.py --search --regression dl20-doc-segmented-unicoil > logs/log.dl20-doc-segmented-unicoil 2>&1
+python src/main/python/run_regression.py --search --regression dl20-doc-segmented-unicoil-noexp > logs/log.dl20-doc-segmented-unicoil-noexp 2>&1
# DL21/22
-python src/main/python/run_regression.py --verify --search --regression dl21-passage > logs/log.dl21-passage 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl21-passage-d2q-t5 > logs/log.dl21-passage-d2q-t5 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl21-passage-augmented > logs/log.dl21-passage-augmented 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl21-passage-augmented-d2q-t5 > logs/log.dl21-passage-augmented-d2q-t5 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl21-passage-unicoil-noexp-0shot > logs/log.dl21-passage-unicoil-noexp-0shot 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl21-passage-unicoil-0shot > logs/log.dl21-passage-unicoil-0shot 2>&1
-
-python src/main/python/run_regression.py --verify --search --regression dl21-passage-splade-pp-ed > logs/log.dl21-passage-splade-pp-ed 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl21-passage-splade-pp-sd > logs/log.dl21-passage-splade-pp-sd 2>&1
-
-python src/main/python/run_regression.py --verify --search --regression dl21-doc > logs/log.dl21-doc 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl21-doc-d2q-t5 > logs/log.dl21-doc-d2q-t5 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl21-doc-segmented > logs/log.dl21-doc-segmented 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl21-doc-segmented-d2q-t5 > logs/log.dl21-doc-segmented-d2q-t5 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl21-doc-segmented-unicoil-noexp-0shot > logs/log.dl21-doc-segmented-unicoil-noexp-0shot 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl21-doc-segmented-unicoil-noexp-0shot-v2 > logs/log.dl21-doc-segmented-unicoil-noexp-0shot-v2 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl21-doc-segmented-unicoil-0shot > logs/log.dl21-doc-segmented-unicoil-0shot 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl21-doc-segmented-unicoil-0shot-v2 > logs/log.dl21-doc-segmented-unicoil-0shot-v2 2>&1
-
-python src/main/python/run_regression.py --verify --search --regression dl22-passage > logs/log.dl22-passage 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl22-passage-d2q-t5 > logs/log.dl22-passage-d2q-t5 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl22-passage-augmented > logs/log.dl22-passage-augmented 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl22-passage-augmented-d2q-t5 > logs/log.dl22-passage-augmented-d2q-t5 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl22-passage-unicoil-noexp-0shot > logs/log.dl22-passage-unicoil-noexp-0shot 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl22-passage-unicoil-0shot > logs/log.dl22-passage-unicoil-0shot 2>&1
-
-python src/main/python/run_regression.py --verify --search --regression dl22-passage-splade-pp-ed > logs/log.dl22-passage-splade-pp-ed 2>&1
-python src/main/python/run_regression.py --verify --search --regression dl22-passage-splade-pp-sd > logs/log.dl22-passage-splade-pp-sd 2>&1
+python src/main/python/run_regression.py --search --regression dl21-passage > logs/log.dl21-passage 2>&1
+python src/main/python/run_regression.py --search --regression dl21-passage-d2q-t5 > logs/log.dl21-passage-d2q-t5 2>&1
+python src/main/python/run_regression.py --search --regression dl21-passage-augmented > logs/log.dl21-passage-augmented 2>&1
+python src/main/python/run_regression.py --search --regression dl21-passage-augmented-d2q-t5 > logs/log.dl21-passage-augmented-d2q-t5 2>&1
+python src/main/python/run_regression.py --search --regression dl21-passage-unicoil-noexp-0shot > logs/log.dl21-passage-unicoil-noexp-0shot 2>&1
+python src/main/python/run_regression.py --search --regression dl21-passage-unicoil-0shot > logs/log.dl21-passage-unicoil-0shot 2>&1
+
+python src/main/python/run_regression.py --search --regression dl21-passage-splade-pp-ed > logs/log.dl21-passage-splade-pp-ed 2>&1
+python src/main/python/run_regression.py --search --regression dl21-passage-splade-pp-sd > logs/log.dl21-passage-splade-pp-sd 2>&1
+
+python src/main/python/run_regression.py --search --regression dl21-doc > logs/log.dl21-doc 2>&1
+python src/main/python/run_regression.py --search --regression dl21-doc-d2q-t5 > logs/log.dl21-doc-d2q-t5 2>&1
+python src/main/python/run_regression.py --search --regression dl21-doc-segmented > logs/log.dl21-doc-segmented 2>&1
+python src/main/python/run_regression.py --search --regression dl21-doc-segmented-d2q-t5 > logs/log.dl21-doc-segmented-d2q-t5 2>&1
+python src/main/python/run_regression.py --search --regression dl21-doc-segmented-unicoil-noexp-0shot > logs/log.dl21-doc-segmented-unicoil-noexp-0shot 2>&1
+python src/main/python/run_regression.py --search --regression dl21-doc-segmented-unicoil-noexp-0shot-v2 > logs/log.dl21-doc-segmented-unicoil-noexp-0shot-v2 2>&1
+python src/main/python/run_regression.py --search --regression dl21-doc-segmented-unicoil-0shot > logs/log.dl21-doc-segmented-unicoil-0shot 2>&1
+python src/main/python/run_regression.py --search --regression dl21-doc-segmented-unicoil-0shot-v2 > logs/log.dl21-doc-segmented-unicoil-0shot-v2 2>&1
+
+python src/main/python/run_regression.py --search --regression dl22-passage > logs/log.dl22-passage 2>&1
+python src/main/python/run_regression.py --search --regression dl22-passage-d2q-t5 > logs/log.dl22-passage-d2q-t5 2>&1
+python src/main/python/run_regression.py --search --regression dl22-passage-augmented > logs/log.dl22-passage-augmented 2>&1
+python src/main/python/run_regression.py --search --regression dl22-passage-augmented-d2q-t5 > logs/log.dl22-passage-augmented-d2q-t5 2>&1
+python src/main/python/run_regression.py --search --regression dl22-passage-unicoil-noexp-0shot > logs/log.dl22-passage-unicoil-noexp-0shot 2>&1
+python src/main/python/run_regression.py --search --regression dl22-passage-unicoil-0shot > logs/log.dl22-passage-unicoil-0shot 2>&1
+
+python src/main/python/run_regression.py --search --regression dl22-passage-splade-pp-ed > logs/log.dl22-passage-splade-pp-ed 2>&1
+python src/main/python/run_regression.py --search --regression dl22-passage-splade-pp-sd > logs/log.dl22-passage-splade-pp-sd 2>&1
# CIRAL
python src/main/python/run_regression.py --index --verify --search --regression ciral-v1.0-ha > logs/log.ciral-v1.0-ha 2>&1
diff --git a/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-int8-onnx.template b/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-int8-onnx.template
new file mode 100644
index 0000000000..fd1fa8fa91
--- /dev/null
+++ b/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-int8-onnx.template
@@ -0,0 +1,101 @@
+# Anserini Regressions: TREC 2019 Deep Learning Track (Passage)
+
+**Model**: cosDPR-distil with HNSW indexes (using ONNX for on-the-fly query encoding)
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper:
+
+> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom.
+
+In these experiments, we are performing query inference "on-the-fly" with ONNX.
+
+Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast).
+For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md).
+
+The exact configurations for these regressions are stored in [this YAML file](${yaml}).
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name}
+```
+
+We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil.
+
+From any machine, the following command will download the corpus and perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name}
+```
+
+The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results.
+
+## Corpus Download
+
+Download the corpus and unpack into `collections/`:
+
+```bash
+wget ${download_url} -P collections/
+tar xvf collections/${corpus}.tar -C collections/
+```
+
+To confirm, `${corpus}.tar` is 57 GB and has MD5 checksum `${download_checksum}`.
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \
+ --corpus-path collections/${corpus}
+```
+
+## Indexing
+
+Sample indexing command, building HNSW indexes:
+
+```bash
+${index_cmds}
+```
+
+The path `/path/to/${corpus}/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory.
+See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact.
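+
+As a rough illustration of what this combination looks like at the Lucene API level, here is a minimal sketch (not Anserini's actual indexing code) of an `IndexWriterConfig` that plugs an int8 scalar-quantized HNSW vectors format into the codec and disables segment merging; the class name and the `maxConn`/`beamWidth` values are illustrative placeholders:
+
+```java
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.codecs.KnnVectorsFormat;
+import org.apache.lucene.codecs.lucene99.Lucene99Codec;
+import org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.NoMergePolicy;
+
+public class HnswInt8ConfigSketch {
+  public static IndexWriterConfig config() {
+    return new IndexWriterConfig(new StandardAnalyzer())
+        // Per-field vectors format: HNSW with int8 scalar quantization.
+        .setCodec(new Lucene99Codec() {
+          @Override
+          public KnnVectorsFormat getKnnVectorsFormatForField(String field) {
+            return new Lucene99HnswScalarQuantizedVectorsFormat(16, 100); // maxConn, beamWidth (illustrative)
+          }
+        })
+        // Suppress all segment merging, per the note above.
+        .setMergePolicy(NoMergePolicy.INSTANCE);
+  }
+}
+```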
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track.
+The original data can be found [here](https://trec.nist.gov/data/deep2019.html).
+
+After indexing has completed, you should be able to perform retrieval as follows:
+
+```bash
+${ranking_cmds}
+```
+
+Note that we are performing query inference "on-the-fly" with ONNX in these experiments.
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+${eval_cmds}
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+${effectiveness}
+
+Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run.
+Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}).
+
+Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking).
+Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`).
+The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820).
+
+## Reproduction Log[*](reproducibility.md)
+
+To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation.
diff --git a/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-int8.template b/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-int8.template
new file mode 100644
index 0000000000..0900647c40
--- /dev/null
+++ b/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-int8.template
@@ -0,0 +1,99 @@
+# Anserini Regressions: TREC 2019 Deep Learning Track (Passage)
+
+**Model**: cosDPR-distil with HNSW indexes (using pre-encoded queries)
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper:
+
+> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom.
+
+In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding).
+
+Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast).
+For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md).
+
+The exact configurations for these regressions are stored in [this YAML file](${yaml}).
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name}
+```
+
+We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil.
+
+From any machine, the following command will download the corpus and perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name}
+```
+
+The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results.
+
+## Corpus Download
+
+Download the corpus and unpack into `collections/`:
+
+```bash
+wget ${download_url} -P collections/
+tar xvf collections/${corpus}.tar -C collections/
+```
+
+To confirm, `${corpus}.tar` is 57 GB and has MD5 checksum `${download_checksum}`.
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \
+ --corpus-path collections/${corpus}
+```
+
+## Indexing
+
+Sample indexing command, building HNSW indexes:
+
+```bash
+${index_cmds}
+```
+
+The path `/path/to/${corpus}/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory.
+See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track.
+The original data can be found [here](https://trec.nist.gov/data/deep2019.html).
+
+After indexing has completed, you should be able to perform retrieval as follows:
+
+```bash
+${ranking_cmds}
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+${eval_cmds}
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+${effectiveness}
+
+Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run.
+Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}).
+
+Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking).
+Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`).
+The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820).
+
+## Reproduction Log[*](reproducibility.md)
+
+To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation.
diff --git a/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-onnx.template b/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-onnx.template
index 355d54348f..07322676fc 100644
--- a/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-onnx.template
+++ b/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-onnx.template
@@ -6,7 +6,7 @@ This page describes regression experiments, integrated into Anserini's regressio
> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom.
-In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding).
+In these experiments, we are performing query inference "on-the-fly" with ONNX.
Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast).
For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md).
@@ -56,9 +56,11 @@ ${index_cmds}
```
The path `/path/to/${corpus}/` should point to the corpus downloaded above.
-
Upon completion, we should have an index with 8,841,823 documents.
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+
## Retrieval
Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
@@ -71,6 +73,8 @@ After indexing has completed, you should be able to perform retrieval as follows
${ranking_cmds}
```
+Note that we are performing query inference "on-the-fly" with ONNX in these experiments.
+
Evaluation can be performed using `trec_eval`:
```bash
diff --git a/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw.template b/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw.template
index 81c68f7ec2..a1839cf6df 100644
--- a/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw.template
+++ b/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw.template
@@ -56,9 +56,11 @@ ${index_cmds}
```
The path `/path/to/${corpus}/` should point to the corpus downloaded above.
-
Upon completion, we should have an index with 8,841,823 documents.
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+
## Retrieval
Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
diff --git a/src/main/resources/docgen/templates/dl19-passage-openai-ada2-int8.template b/src/main/resources/docgen/templates/dl19-passage-openai-ada2-int8.template
new file mode 100644
index 0000000000..99454b9d31
--- /dev/null
+++ b/src/main/resources/docgen/templates/dl19-passage-openai-ada2-int8.template
@@ -0,0 +1,101 @@
+# Anserini Regressions: TREC 2019 Deep Learning Track (Passage)
+
+**NOTE:** We're currently having issues with this regression, which throws "Retried waiting for GCLocker too often" errors.
+
+**Model**: OpenAI-ada2 embeddings (using pre-encoded queries) with HNSW indexes
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html), as described in the following paper:
+
+> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023.
+
+In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding).
+
+Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast).
+For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md).
+
+The exact configurations for these regressions are stored in [this YAML file](${yaml}).
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name}
+```
+
+We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model.
+
+From any machine, the following command will download the corpus and perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name}
+```
+
+The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results.
+
+## Corpus Download
+
+Download the corpus and unpack into `collections/`:
+
+```bash
+wget ${download_url} -P collections/
+tar xvf collections/${corpus}.tar -C collections/
+```
+
+To confirm, `${corpus}.tar` is 109 GB and has MD5 checksum `${download_checksum}`.
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \
+ --corpus-path collections/${corpus}
+```
+
+## Indexing
+
+Sample indexing command, building HNSW indexes:
+
+```bash
+${index_cmds}
+```
+
+The path `/path/to/${corpus}/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory.
+See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track.
+The original data can be found [here](https://trec.nist.gov/data/deep2019.html).
+
+After indexing has completed, you should be able to perform retrieval as follows:
+
+```bash
+${ranking_cmds}
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+${eval_cmds}
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+${effectiveness}
+
+Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run.
+Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}).
+
+Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking).
+Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`).
+The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820).
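+
+To make the `-l 2` convention concrete, the two kinds of `trec_eval` invocations look roughly like the following (file paths and run names are illustrative; the exact commands are generated into the evaluation commands above):
+
+```bash
+# AP is computed with -l 2, so passages judged with relevance grade 1 count as non-relevant:
+tools/eval/trec_eval.9.0.4/trec_eval -c -l 2 -m map qrels.dl19-passage.txt runs/run.dl19-passage.txt
+# nDCG@10 keeps qrels of all relevance grades, so -l 2 is omitted:
+tools/eval/trec_eval.9.0.4/trec_eval -c -m ndcg_cut.10 qrels.dl19-passage.txt runs/run.dl19-passage.txt
+```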
+
+## Reproduction Log[*](reproducibility.md)
+
+To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation.
diff --git a/src/main/resources/docgen/templates/dl19-passage-openai-ada2.template b/src/main/resources/docgen/templates/dl19-passage-openai-ada2.template
index 15fe5a605a..f84f7d2d1d 100644
--- a/src/main/resources/docgen/templates/dl19-passage-openai-ada2.template
+++ b/src/main/resources/docgen/templates/dl19-passage-openai-ada2.template
@@ -56,9 +56,11 @@ ${index_cmds}
```
The path `/path/to/${corpus}/` should point to the corpus downloaded above.
-
Upon completion, we should have an index with 8,841,823 documents.
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+
## Retrieval
Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
diff --git a/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-int8-onnx.template b/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-int8-onnx.template
new file mode 100644
index 0000000000..9179919a51
--- /dev/null
+++ b/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-int8-onnx.template
@@ -0,0 +1,101 @@
+# Anserini Regressions: TREC 2020 Deep Learning Track (Passage)
+
+**Model**: cosDPR-distil with HNSW indexes (using ONNX for on-the-fly query encoding)
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2020.html), as described in the following paper:
+
+> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom.
+
+In these experiments, we are performing query inference "on-the-fly" with ONNX.
+
+Note that the NIST relevance judgments provide far more relevant passages per topic than the "sparse" judgments provided by Microsoft (the NIST judgments are sometimes called "dense" judgments to emphasize this contrast).
+For additional instructions on working with the MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md).
+
+The exact configurations for these regressions are stored in [this YAML file](${yaml}).
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name}
+```
+
+We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil.
+
+From any machine, the following command will download the corpus and perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name}
+```
+
+The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results.
+
+## Corpus Download
+
+Download the corpus and unpack into `collections/`:
+
+```bash
+wget ${download_url} -P collections/
+tar xvf collections/${corpus}.tar -C collections/
+```
+
+To confirm, `${corpus}.tar` is 57 GB and has MD5 checksum `${download_checksum}`.
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \
+ --corpus-path collections/${corpus}
+```
+
+## Indexing
+
+Sample indexing command, building HNSW indexes:
+
+```bash
+${index_cmds}
+```
+
+The path `/path/to/${corpus}/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory.
+See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track.
+The original data can be found [here](https://trec.nist.gov/data/deep2020.html).
+
+After indexing has completed, you should be able to perform retrieval as follows:
+
+```bash
+${ranking_cmds}
+```
+
+Note that we are performing query inference "on-the-fly" with ONNX in these experiments.
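+
+Concretely, on-the-fly query encoding is enabled by the `-encoder CosDprDistil` search option; the retrieval parameters recorded in this page's regression YAML configuration are:
+
+```yaml
+# Excerpt from the regression YAML: queries are read from the raw topic titles
+# and encoded at search time with the ONNX CosDprDistil encoder.
+params: -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil
+```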
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+${eval_cmds}
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+${effectiveness}
+
+Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run.
+Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}).
+
+Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking).
+Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`).
+The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820).
+
+## Reproduction Log[*](reproducibility.md)
+
+To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation.
diff --git a/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-int8.template b/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-int8.template
new file mode 100644
index 0000000000..e9e46d5fd7
--- /dev/null
+++ b/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-int8.template
@@ -0,0 +1,99 @@
+# Anserini Regressions: TREC 2020 Deep Learning Track (Passage)
+
+**Model**: cosDPR-distil with HNSW indexes (using pre-encoded queries)
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2020.html), as described in the following paper:
+
+> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom.
+
+In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding).
+
+Note that the NIST relevance judgments provide far more relevant passages per topic than the "sparse" judgments provided by Microsoft (the NIST judgments are sometimes called "dense" judgments to emphasize this contrast).
+For additional instructions on working with the MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md).
+
+The exact configurations for these regressions are stored in [this YAML file](${yaml}).
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name}
+```
+
+We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil.
+
+From any machine, the following command will download the corpus and perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name}
+```
+
+The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results.
+
+## Corpus Download
+
+Download the corpus and unpack into `collections/`:
+
+```bash
+wget ${download_url} -P collections/
+tar xvf collections/${corpus}.tar -C collections/
+```
+
+To confirm, `${corpus}.tar` is 57 GB and has MD5 checksum `${download_checksum}`.
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \
+ --corpus-path collections/${corpus}
+```
+
+## Indexing
+
+Sample indexing command, building HNSW indexes:
+
+```bash
+${index_cmds}
+```
+
+The path `/path/to/${corpus}/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory.
+See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track.
+The original data can be found [here](https://trec.nist.gov/data/deep2020.html).
+
+After indexing has completed, you should be able to perform retrieval as follows:
+
+```bash
+${ranking_cmds}
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+${eval_cmds}
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+${effectiveness}
+
+Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run.
+Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}).
+
+Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking).
+Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`).
+The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820).
+
+## Reproduction Log[*](reproducibility.md)
+
+To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation.
diff --git a/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-onnx.template b/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-onnx.template
index 36fee4a8b0..e5b80bf511 100644
--- a/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-onnx.template
+++ b/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-onnx.template
@@ -6,7 +6,7 @@ This page describes regression experiments, integrated into Anserini's regressio
> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom.
-In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding).
+In these experiments, we are performing query inference "on-the-fly" with ONNX.
Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast).
For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md).
@@ -56,9 +56,11 @@ ${index_cmds}
```
The path `/path/to/${corpus}/` should point to the corpus downloaded above.
-
Upon completion, we should have an index with 8,841,823 documents.
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+
## Retrieval
Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
@@ -71,6 +73,8 @@ After indexing has completed, you should be able to perform retrieval as follows
${ranking_cmds}
```
+Note that we are performing query inference "on-the-fly" with ONNX in these experiments.
+
Evaluation can be performed using `trec_eval`:
```bash
diff --git a/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw.template b/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw.template
index e3f6969a9f..2ec64f9a41 100644
--- a/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw.template
+++ b/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw.template
@@ -56,9 +56,11 @@ ${index_cmds}
```
The path `/path/to/${corpus}/` should point to the corpus downloaded above.
-
Upon completion, we should have an index with 8,841,823 documents.
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+
## Retrieval
Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
diff --git a/src/main/resources/docgen/templates/dl20-passage-openai-ada2-int8.template b/src/main/resources/docgen/templates/dl20-passage-openai-ada2-int8.template
new file mode 100644
index 0000000000..eea224f03e
--- /dev/null
+++ b/src/main/resources/docgen/templates/dl20-passage-openai-ada2-int8.template
@@ -0,0 +1,101 @@
+# Anserini Regressions: TREC 2020 Deep Learning Track (Passage)
+
+**NOTE:** We're currently having issues with this regression, which throws "Retried waiting for GCLocker too often" errors.
+
+**Model**: OpenAI-ada2 embeddings (using pre-encoded queries) with HNSW indexes
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2020.html), as described in the following paper:
+
+> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023.
+
+In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding).
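+
+Concretely, "pre-encoded queries" means that the topics ship as query vectors rather than raw text; the relevant excerpts from this page's regression YAML configuration are:
+
+```yaml
+# Excerpts from the regression YAML: topics are read as JSON vectors and
+# retrieval searches on the vector field directly (no query encoder is needed).
+topic_reader: JsonIntVector
+params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000
+```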
+
+Note that the NIST relevance judgments provide far more relevant passages per topic than the "sparse" judgments provided by Microsoft (the NIST judgments are sometimes called "dense" judgments to emphasize this contrast).
+For additional instructions on working with the MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md).
+
+The exact configurations for these regressions are stored in [this YAML file](${yaml}).
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name}
+```
+
+We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model.
+
+From any machine, the following command will download the corpus and perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name}
+```
+
+The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results.
+
+## Corpus Download
+
+Download the corpus and unpack into `collections/`:
+
+```bash
+wget ${download_url} -P collections/
+tar xvf collections/${corpus}.tar -C collections/
+```
+
+To confirm, `${corpus}.tar` is 109 GB and has MD5 checksum `${download_checksum}`.
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \
+ --corpus-path collections/${corpus}
+```
+
+## Indexing
+
+Sample indexing command, building HNSW indexes:
+
+```bash
+${index_cmds}
+```
+
+The path `/path/to/${corpus}/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory.
+See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track.
+The original data can be found [here](https://trec.nist.gov/data/deep2020.html).
+
+After indexing has completed, you should be able to perform retrieval as follows:
+
+```bash
+${ranking_cmds}
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+${eval_cmds}
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+${effectiveness}
+
+Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run.
+Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}).
+
+Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking).
+Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`).
+The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820).
+
+## Reproduction Log[*](reproducibility.md)
+
+To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation.
diff --git a/src/main/resources/docgen/templates/dl20-passage-openai-ada2.template b/src/main/resources/docgen/templates/dl20-passage-openai-ada2.template
index 4cd598d31b..069c3d41fe 100644
--- a/src/main/resources/docgen/templates/dl20-passage-openai-ada2.template
+++ b/src/main/resources/docgen/templates/dl20-passage-openai-ada2.template
@@ -56,9 +56,11 @@ ${index_cmds}
```
The path `/path/to/${corpus}/` should point to the corpus downloaded above.
-
Upon completion, we should have an index with 8,841,823 documents.
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+
## Retrieval
Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
diff --git a/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.template b/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.template
new file mode 100644
index 0000000000..a7e21103f1
--- /dev/null
+++ b/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.template
@@ -0,0 +1,93 @@
+# Anserini Regressions: MS MARCO Passage Ranking
+
+**Model**: cosDPR-distil with HNSW indexes (using ONNX for on-the-fly query encoding)
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper:
+
+> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom.
+
+In these experiments, we are performing query inference "on-the-fly" with ONNX.
+
+The exact configurations for these regressions are stored in [this YAML file](${yaml}).
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name}
+```
+
+We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil.
+
+From any machine, the following command will download the corpus and perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name}
+```
+
+The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results.
+
+## Corpus Download
+
+Download the corpus and unpack into `collections/`:
+
+```bash
+wget ${download_url} -P collections/
+tar xvf collections/${corpus}.tar -C collections/
+```
+
+To confirm, `${corpus}.tar` is 57 GB and has MD5 checksum `${download_checksum}`.
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \
+ --corpus-path collections/${corpus}
+```
+
+## Indexing
+
+Sample indexing command, building HNSW indexes:
+
+```bash
+${index_cmds}
+```
+
+The path `/path/to/${corpus}/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory.
+See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 6980 dev set questions; see [this page](${root_path}/docs/experiments-msmarco-passage.md) for more details.
+
+After indexing has completed, you should be able to perform retrieval on the HNSW indexes as follows:
+
+```bash
+${ranking_cmds}
+```
+
+Note that we are performing query inference "on-the-fly" with ONNX in these experiments.
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+${eval_cmds}
+```
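+
+For the development-set regressions, the headline metric is RR@10; as a rough sketch (the run name is an illustrative placeholder; the exact commands are generated into the evaluation commands above), the corresponding `trec_eval` invocation configured in the YAML looks like:
+
+```bash
+# RR@10: reciprocal rank with the evaluation depth capped at 10 hits per query (-M 10)
+tools/eval/trec_eval.9.0.4/trec_eval -c -M 10 -m recip_rank qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage.dev.txt
+```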
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+${effectiveness}
+
+Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run.
+Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}).
+
+## Reproduction Log[*](${root_path}/docs/reproducibility.md)
+
+To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation.
diff --git a/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-int8.template b/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-int8.template
new file mode 100644
index 0000000000..ae04c97479
--- /dev/null
+++ b/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-int8.template
@@ -0,0 +1,93 @@
+# Anserini Regressions: MS MARCO Passage Ranking
+
+**Model**: cosDPR-distil with HNSW indexes (using pre-encoded queries)
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper:
+
+> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom.
+
+In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding).
+
+The exact configurations for these regressions are stored in [this YAML file](${yaml}).
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name}
+```
+
+We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil.
+
+From any machine, the following command will download the corpus and perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name}
+```
+
+The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results.
+
+## Corpus Download
+
+Download the corpus and unpack into `collections/`:
+
+```bash
+wget ${download_url} -P collections/
+tar xvf collections/${corpus}.tar -C collections/
+```
+
+To confirm, `${corpus}.tar` is 57 GB and has MD5 checksum `${download_checksum}`.
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \
+ --corpus-path collections/${corpus}
+```
+
+## Indexing
+
+Sample indexing command, building HNSW indexes:
+
+```bash
+${index_cmds}
+```
+
+The path `/path/to/${corpus}/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory.
+See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 6980 dev set questions; see [this page](${root_path}/docs/experiments-msmarco-passage.md) for more details.
+
+After indexing has completed, you should be able to perform retrieval on the HNSW indexes as follows:
+
+```bash
+${ranking_cmds}
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+${eval_cmds}
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+${effectiveness}
+
+Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run.
+Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}).
+
+## Reproduction Log[*](${root_path}/docs/reproducibility.md)
+
+To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation.
+
++ Results reproduced by [@yilinjz](https://github.com/yilinjz) on 2023-09-01 (commit [`4ae518b`](https://github.com/castorini/anserini/commit/4ae518bb284ebcba0b273a473bc8774735cb7d19))
\ No newline at end of file
diff --git a/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-onnx.template b/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-onnx.template
index bbc6f5a298..cad852f311 100644
--- a/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-onnx.template
+++ b/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-onnx.template
@@ -6,7 +6,7 @@ This page describes regression experiments, integrated into Anserini's regressio
> Xueguang Ma, Tommaso Teofili, and Jimmy Lin. [Anserini Gets Dense Retrieval: Integration of Lucene's HNSW Indexes.](https://dl.acm.org/doi/10.1145/3583780.3615112) _Proceedings of the 32nd International Conference on Information and Knowledge Management (CIKM 2023)_, October 2023, pages 5366–5370, Birmingham, the United Kingdom.
-In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding).
+In these experiments, we are performing query inference "on-the-fly" with ONNX.
The exact configurations for these regressions are stored in [this YAML file](${yaml}).
Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
@@ -53,9 +53,11 @@ ${index_cmds}
```
The path `/path/to/${corpus}/` should point to the corpus downloaded above.
-
Upon completion, we should have an index with 8,841,823 documents.
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+
## Retrieval
Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
@@ -67,6 +69,8 @@ After indexing has completed, you should be able to perform retrieval as follows
${ranking_cmds}
```
+Note that we are performing query inference "on-the-fly" with ONNX in these experiments.
+
Evaluation can be performed using `trec_eval`:
```bash
diff --git a/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw.template b/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw.template
index 98b1ed5b42..f5d7400267 100644
--- a/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw.template
+++ b/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw.template
@@ -53,9 +53,11 @@ ${index_cmds}
```
The path `/path/to/${corpus}/` should point to the corpus downloaded above.
-
Upon completion, we should have an index with 8,841,823 documents.
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+
## Retrieval
Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
diff --git a/src/main/resources/docgen/templates/msmarco-passage-openai-ada2-int8.template b/src/main/resources/docgen/templates/msmarco-passage-openai-ada2-int8.template
new file mode 100644
index 0000000000..b9e3a3c5e5
--- /dev/null
+++ b/src/main/resources/docgen/templates/msmarco-passage-openai-ada2-int8.template
@@ -0,0 +1,94 @@
+# Anserini Regressions: MS MARCO Passage Ranking
+
+**NOTE:** We're currently having issues with this regression, which throws "Retried waiting for GCLocker too often" errors.
+
+**Model**: OpenAI-ada2 embeddings (using pre-encoded queries) with HNSW indexes
+
+This page describes regression experiments, integrated into Anserini's regression testing framework, using OpenAI-ada2 embeddings on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking), as described in the following paper:
+
+> Jimmy Lin, Ronak Pradeep, Tommaso Teofili, and Jasper Xian. [Vector Search with OpenAI Embeddings: Lucene Is All You Need.](https://arxiv.org/abs/2308.14963) _arXiv:2308.14963_, 2023.
+
+In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding).
+
+The exact configurations for these regressions are stored in [this YAML file](${yaml}).
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
+
+From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name}
+```
+
+We make available a version of the MS MARCO Passage Corpus that has already been encoded with the OpenAI-ada2 embedding model.
+
+From any machine, the following command will download the corpus and perform the complete regression, end to end:
+
+```bash
+python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name}
+```
+
+The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results.
+
+## Corpus Download
+
+Download the corpus and unpack into `collections/`:
+
+```bash
+wget ${download_url} -P collections/
+tar xvf collections/${corpus}.tar -C collections/
+```
+
+To confirm, `${corpus}.tar` is 109 GB and has MD5 checksum `${download_checksum}`.
+With the corpus downloaded, the following command will perform the remaining steps below:
+
+```bash
+python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \
+ --corpus-path collections/${corpus}
+```
+
+## Indexing
+
+Sample indexing command, building HNSW indexes:
+
+```bash
+${index_cmds}
+```
+
+The path `/path/to/${corpus}/` should point to the corpus downloaded above.
+Upon completion, we should have an index with 8,841,823 documents.
+
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+Furthermore, we are using Lucene's [Automatic Byte Quantization](https://www.elastic.co/search-labs/blog/articles/scalar-quantization-in-lucene) feature, which increases the on-disk footprint of the indexes since we're storing both the int8 quantized vectors and the float32 vectors, but only the int8 quantized vectors need to be loaded into memory.
+See [issue #2292](https://github.com/castorini/anserini/issues/2292) for some experiments reporting the performance impact.
+
+## Retrieval
+
+Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
+The regression experiments here evaluate on the 6980 dev set questions; see [this page](${root_path}/docs/experiments-msmarco-passage.md) for more details.
+
+After indexing has completed, you should be able to perform retrieval on the HNSW indexes as follows:
+
+```bash
+${ranking_cmds}
+```
+
+Evaluation can be performed using `trec_eval`:
+
+```bash
+${eval_cmds}
+```
+
+## Effectiveness
+
+With the above commands, you should be able to reproduce the following results:
+
+${effectiveness}
+
+Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run.
+Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}).
+
+## Reproduction Log[*](${root_path}/docs/reproducibility.md)
+
+To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation.
+
diff --git a/src/main/resources/docgen/templates/msmarco-passage-openai-ada2.template b/src/main/resources/docgen/templates/msmarco-passage-openai-ada2.template
index c1b658c309..7bf567dda7 100644
--- a/src/main/resources/docgen/templates/msmarco-passage-openai-ada2.template
+++ b/src/main/resources/docgen/templates/msmarco-passage-openai-ada2.template
@@ -53,9 +53,11 @@ ${index_cmds}
```
The path `/path/to/${corpus}/` should point to the corpus downloaded above.
-
Upon completion, we should have an index with 8,841,823 documents.
+Note that here we are explicitly using Lucene's `NoMergePolicy` merge policy, which suppresses any merging of index segments.
+This is because merging index segments is a costly operation and not worthwhile given our query set.
+
## Retrieval
Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule.
diff --git a/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-int8-onnx.yaml b/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-int8-onnx.yaml
new file mode 100644
index 0000000000..cae9fc745e
--- /dev/null
+++ b/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-int8-onnx.yaml
@@ -0,0 +1,65 @@
+---
+corpus: msmarco-passage-cos-dpr-distil
+corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil/
+
+download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar
+download_checksum: e20ffbc8b5e7f760af31298aefeaebbd
+
+index_path: indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/
+index_type: hnsw
+collection_class: JsonDenseVectorCollection
+generator_class: HnswDenseVectorDocumentGenerator
+index_threads: 16
+index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8
+
+metrics:
+ - metric: AP@1000
+ command: tools/eval/trec_eval.9.0.4/trec_eval
+ params: -m map -c -l 2
+ separator: "\t"
+ parse_index: 2
+ metric_precision: 4
+ can_combine: false
+ - metric: nDCG@10
+ command: tools/eval/trec_eval.9.0.4/trec_eval
+ params: -m ndcg_cut.10 -c
+ separator: "\t"
+ parse_index: 2
+ metric_precision: 4
+ can_combine: false
+ - metric: R@100
+ command: tools/eval/trec_eval.9.0.4/trec_eval
+ params: -m recall.100 -c -l 2
+ separator: "\t"
+ parse_index: 2
+ metric_precision: 4
+ can_combine: false
+ - metric: R@1000
+ command: tools/eval/trec_eval.9.0.4/trec_eval
+ params: -m recall.1000 -c -l 2
+ separator: "\t"
+ parse_index: 2
+ metric_precision: 4
+ can_combine: false
+
+topic_reader: TsvInt
+topics:
+ - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2019.html)"
+ id: dl19
+ path: topics.dl19-passage.txt
+ qrel: qrels.dl19-passage.txt
+
+models:
+ - name: cos-dpr-distil-hnsw
+ display: cosDPR-distil
+ type: hnsw
+ params: -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil
+ results:
+ AP@1000:
+ - 0.458
+ nDCG@10:
+ - 0.717
+ R@100:
+ - 0.605
+ R@1000:
+ - 0.805
diff --git a/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-int8.yaml b/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-int8.yaml
new file mode 100644
index 0000000000..596ff276ed
--- /dev/null
+++ b/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-int8.yaml
@@ -0,0 +1,65 @@
+---
+corpus: msmarco-passage-cos-dpr-distil
+corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil/
+
+download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar
+download_checksum: e20ffbc8b5e7f760af31298aefeaebbd
+
+index_path: indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/
+index_type: hnsw
+collection_class: JsonDenseVectorCollection
+generator_class: HnswDenseVectorDocumentGenerator
+index_threads: 16
+index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8
+
+metrics:
+ - metric: AP@1000
+ command: tools/eval/trec_eval.9.0.4/trec_eval
+ params: -m map -c -l 2
+ separator: "\t"
+ parse_index: 2
+ metric_precision: 4
+ can_combine: false
+ - metric: nDCG@10
+ command: tools/eval/trec_eval.9.0.4/trec_eval
+ params: -m ndcg_cut.10 -c
+ separator: "\t"
+ parse_index: 2
+ metric_precision: 4
+ can_combine: false
+ - metric: R@100
+ command: tools/eval/trec_eval.9.0.4/trec_eval
+ params: -m recall.100 -c -l 2
+ separator: "\t"
+ parse_index: 2
+ metric_precision: 4
+ can_combine: false
+ - metric: R@1000
+ command: tools/eval/trec_eval.9.0.4/trec_eval
+ params: -m recall.1000 -c -l 2
+ separator: "\t"
+ parse_index: 2
+ metric_precision: 4
+ can_combine: false
+
+topic_reader: JsonIntVector
+topics:
+ - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2019.html)"
+ id: dl19
+ path: topics.dl19-passage.cos-dpr-distil.jsonl.gz
+ qrel: qrels.dl19-passage.txt
+
+models:
+ - name: cos-dpr-distil-hnsw
+ display: cosDPR-distil
+ type: hnsw
+ params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000
+ results:
+ AP@1000:
+ - 0.458
+ nDCG@10:
+ - 0.717
+ R@100:
+ - 0.605
+ R@1000:
+ - 0.805
diff --git a/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-onnx.yaml b/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-onnx.yaml
index 31966a5457..8422854271 100644
--- a/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-onnx.yaml
+++ b/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-onnx.yaml
@@ -10,7 +10,7 @@ index_type: hnsw
collection_class: JsonDenseVectorCollection
generator_class: HnswDenseVectorDocumentGenerator
index_threads: 16
-index_options: -M 16 -efC 100 -memoryBuffer 65536
+index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge
metrics:
- metric: AP@1000
diff --git a/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw.yaml b/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw.yaml
index 8b38c5c1b2..fa55081708 100644
--- a/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw.yaml
+++ b/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw.yaml
@@ -10,7 +10,7 @@ index_type: hnsw
collection_class: JsonDenseVectorCollection
generator_class: HnswDenseVectorDocumentGenerator
index_threads: 16
-index_options: -M 16 -efC 100 -memoryBuffer 65536
+index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge
metrics:
- metric: AP@1000
diff --git a/src/main/resources/regression/dl19-passage-openai-ada2-int8.yaml b/src/main/resources/regression/dl19-passage-openai-ada2-int8.yaml
new file mode 100644
index 0000000000..f05d1f2290
--- /dev/null
+++ b/src/main/resources/regression/dl19-passage-openai-ada2-int8.yaml
@@ -0,0 +1,65 @@
+---
+corpus: msmarco-passage-openai-ada2
+corpus_path: collections/msmarco/msmarco-passage-openai-ada2/
+
+download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.tar
+download_checksum: a4d843d522ff3a3af7edbee789a63402
+
+index_path: indexes/lucene-hnsw.msmarco-passage-openai-ada2-int8/
+index_type: hnsw
+collection_class: JsonDenseVectorCollection
+generator_class: HnswDenseVectorDocumentGenerator
+index_threads: 16
+index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8
+
+metrics:
+ - metric: AP@1000
+ command: tools/eval/trec_eval.9.0.4/trec_eval
+ params: -m map -c -l 2
+ separator: "\t"
+ parse_index: 2
+ metric_precision: 4
+ can_combine: false
+ - metric: nDCG@10
+ command: tools/eval/trec_eval.9.0.4/trec_eval
+ params: -m ndcg_cut.10 -c
+ separator: "\t"
+ parse_index: 2
+ metric_precision: 4
+ can_combine: false
+ - metric: R@100
+ command: tools/eval/trec_eval.9.0.4/trec_eval
+ params: -m recall.100 -c -l 2
+ separator: "\t"
+ parse_index: 2
+ metric_precision: 4
+ can_combine: false
+ - metric: R@1000
+ command: tools/eval/trec_eval.9.0.4/trec_eval
+ params: -m recall.1000 -c -l 2
+ separator: "\t"
+ parse_index: 2
+ metric_precision: 4
+ can_combine: false
+
+topic_reader: JsonIntVector
+topics:
+ - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2019.html)"
+ id: dl19
+ path: topics.dl19-passage.openai-ada2.jsonl.gz
+ qrel: qrels.dl19-passage.txt
+
+models:
+ - name: openai-ada2
+ display: OpenAI-ada2
+ type: hnsw
+ params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000
+ results:
+ AP@1000:
+ - 0.479
+ nDCG@10:
+ - 0.704
+ R@100:
+ - 0.624
+ R@1000:
+ - 0.857
diff --git a/src/main/resources/regression/dl19-passage-openai-ada2.yaml b/src/main/resources/regression/dl19-passage-openai-ada2.yaml
index 9667d0907c..2c4b0796f7 100644
--- a/src/main/resources/regression/dl19-passage-openai-ada2.yaml
+++ b/src/main/resources/regression/dl19-passage-openai-ada2.yaml
@@ -10,7 +10,7 @@ index_type: hnsw
collection_class: JsonDenseVectorCollection
generator_class: HnswDenseVectorDocumentGenerator
index_threads: 16
-index_options: -M 16 -efC 100 -memoryBuffer 65536
+index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge
metrics:
- metric: AP@1000
diff --git a/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-int8-onnx.yaml b/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-int8-onnx.yaml
new file mode 100644
index 0000000000..4e64494b59
--- /dev/null
+++ b/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-int8-onnx.yaml
@@ -0,0 +1,65 @@
+---
+corpus: msmarco-passage-cos-dpr-distil
+corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil/
+
+download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar
+download_checksum: e20ffbc8b5e7f760af31298aefeaebbd
+
+index_path: indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/
+index_type: hnsw
+collection_class: JsonDenseVectorCollection
+generator_class: HnswDenseVectorDocumentGenerator
+index_threads: 16
+index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8
+
+metrics:
+ - metric: AP@1000
+ command: tools/eval/trec_eval.9.0.4/trec_eval
+ params: -m map -c -l 2
+ separator: "\t"
+ parse_index: 2
+ metric_precision: 4
+ can_combine: false
+ - metric: nDCG@10
+ command: tools/eval/trec_eval.9.0.4/trec_eval
+ params: -m ndcg_cut.10 -c
+ separator: "\t"
+ parse_index: 2
+ metric_precision: 4
+ can_combine: false
+ - metric: R@100
+ command: tools/eval/trec_eval.9.0.4/trec_eval
+ params: -m recall.100 -c -l 2
+ separator: "\t"
+ parse_index: 2
+ metric_precision: 4
+ can_combine: false
+ - metric: R@1000
+ command: tools/eval/trec_eval.9.0.4/trec_eval
+ params: -m recall.1000 -c -l 2
+ separator: "\t"
+ parse_index: 2
+ metric_precision: 4
+ can_combine: false
+
+topic_reader: TsvInt
+topics:
+ - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)"
+ id: dl20
+ path: topics.dl20.txt
+ qrel: qrels.dl20-passage.txt
+
+models:
+ - name: cos-dpr-distil-hnsw
+ display: cosDPR-distil
+ type: hnsw
+ params: -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil
+ results:
+ AP@1000:
+ - 0.482
+ nDCG@10:
+ - 0.701
+ R@100:
+ - 0.712
+ R@1000:
+ - 0.843
diff --git a/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-int8.yaml b/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-int8.yaml
new file mode 100644
index 0000000000..98968a983f
--- /dev/null
+++ b/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-int8.yaml
@@ -0,0 +1,65 @@
+---
+corpus: msmarco-passage-cos-dpr-distil
+corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil/
+
+download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar
+download_checksum: e20ffbc8b5e7f760af31298aefeaebbd
+
+index_path: indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/
+index_type: hnsw
+collection_class: JsonDenseVectorCollection
+generator_class: HnswDenseVectorDocumentGenerator
+index_threads: 16
+index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8
+
+metrics:
+ - metric: AP@1000
+ command: tools/eval/trec_eval.9.0.4/trec_eval
+ params: -m map -c -l 2
+ separator: "\t"
+ parse_index: 2
+ metric_precision: 4
+ can_combine: false
+ - metric: nDCG@10
+ command: tools/eval/trec_eval.9.0.4/trec_eval
+ params: -m ndcg_cut.10 -c
+ separator: "\t"
+ parse_index: 2
+ metric_precision: 4
+ can_combine: false
+ - metric: R@100
+ command: tools/eval/trec_eval.9.0.4/trec_eval
+ params: -m recall.100 -c -l 2
+ separator: "\t"
+ parse_index: 2
+ metric_precision: 4
+ can_combine: false
+ - metric: R@1000
+ command: tools/eval/trec_eval.9.0.4/trec_eval
+ params: -m recall.1000 -c -l 2
+ separator: "\t"
+ parse_index: 2
+ metric_precision: 4
+ can_combine: false
+
+topic_reader: JsonIntVector
+topics:
+ - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)"
+ id: dl20
+ path: topics.dl20.cos-dpr-distil.jsonl.gz
+ qrel: qrels.dl20-passage.txt
+
+models:
+ - name: cos-dpr-distil-hnsw
+ display: cosDPR-distil
+ type: hnsw
+ params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000
+ results:
+ AP@1000:
+ - 0.482
+ nDCG@10:
+ - 0.701
+ R@100:
+ - 0.712
+ R@1000:
+ - 0.843
diff --git a/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-onnx.yaml b/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-onnx.yaml
index d2e0f89991..055abefa4b 100644
--- a/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-onnx.yaml
+++ b/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-onnx.yaml
@@ -10,7 +10,7 @@ index_type: hnsw
collection_class: JsonDenseVectorCollection
generator_class: HnswDenseVectorDocumentGenerator
index_threads: 16
-index_options: -M 16 -efC 100 -memoryBuffer 65536
+index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge
metrics:
- metric: AP@1000
diff --git a/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw.yaml b/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw.yaml
index f120eadd61..f149679351 100644
--- a/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw.yaml
+++ b/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw.yaml
@@ -10,7 +10,7 @@ index_type: hnsw
collection_class: JsonDenseVectorCollection
generator_class: HnswDenseVectorDocumentGenerator
index_threads: 16
-index_options: -M 16 -efC 100 -memoryBuffer 65536
+index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge
metrics:
- metric: AP@1000
diff --git a/src/main/resources/regression/dl20-passage-openai-ada2-int8.yaml b/src/main/resources/regression/dl20-passage-openai-ada2-int8.yaml
new file mode 100644
index 0000000000..6f26a14fe7
--- /dev/null
+++ b/src/main/resources/regression/dl20-passage-openai-ada2-int8.yaml
@@ -0,0 +1,65 @@
+---
+corpus: msmarco-passage-openai-ada2
+corpus_path: collections/msmarco/msmarco-passage-openai-ada2/
+
+download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.tar
+download_checksum: a4d843d522ff3a3af7edbee789a63402
+
+index_path: indexes/lucene-hnsw.msmarco-passage-openai-ada2-int8/
+index_type: hnsw
+collection_class: JsonDenseVectorCollection
+generator_class: HnswDenseVectorDocumentGenerator
+index_threads: 16
+index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8
+
+metrics:
+ - metric: AP@1000
+ command: tools/eval/trec_eval.9.0.4/trec_eval
+ params: -m map -c -l 2
+ separator: "\t"
+ parse_index: 2
+ metric_precision: 4
+ can_combine: false
+ - metric: nDCG@10
+ command: tools/eval/trec_eval.9.0.4/trec_eval
+ params: -m ndcg_cut.10 -c
+ separator: "\t"
+ parse_index: 2
+ metric_precision: 4
+ can_combine: false
+ - metric: R@100
+ command: tools/eval/trec_eval.9.0.4/trec_eval
+ params: -m recall.100 -c -l 2
+ separator: "\t"
+ parse_index: 2
+ metric_precision: 4
+ can_combine: false
+ - metric: R@1000
+ command: tools/eval/trec_eval.9.0.4/trec_eval
+ params: -m recall.1000 -c -l 2
+ separator: "\t"
+ parse_index: 2
+ metric_precision: 4
+ can_combine: false
+
+topic_reader: JsonIntVector
+topics:
+ - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)"
+ id: dl20
+ path: topics.dl20-passage.openai-ada2.jsonl.gz
+ qrel: qrels.dl20-passage.txt
+
+models:
+ - name: openai-ada2
+ display: OpenAI-ada2
+ type: hnsw
+ params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000
+ results:
+ AP@1000:
+ - 0.477
+ nDCG@10:
+ - 0.676
+ R@100:
+ - 0.723
+ R@1000:
+ - 0.867
diff --git a/src/main/resources/regression/dl20-passage-openai-ada2.yaml b/src/main/resources/regression/dl20-passage-openai-ada2.yaml
index 152d18765c..ff7d16aa64 100644
--- a/src/main/resources/regression/dl20-passage-openai-ada2.yaml
+++ b/src/main/resources/regression/dl20-passage-openai-ada2.yaml
@@ -10,7 +10,7 @@ index_type: hnsw
collection_class: JsonDenseVectorCollection
generator_class: HnswDenseVectorDocumentGenerator
index_threads: 16
-index_options: -M 16 -efC 100 -memoryBuffer 65536
+index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge
metrics:
- metric: AP@1000
diff --git a/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.yaml b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.yaml
new file mode 100644
index 0000000000..1691c197c8
--- /dev/null
+++ b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8-onnx.yaml
@@ -0,0 +1,65 @@
+---
+corpus: msmarco-passage-cos-dpr-distil
+corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil/
+
+download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar
+download_checksum: e20ffbc8b5e7f760af31298aefeaebbd
+
+index_path: indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/
+index_type: hnsw
+collection_class: JsonDenseVectorCollection
+generator_class: HnswDenseVectorDocumentGenerator
+index_threads: 16
+index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8
+
+metrics:
+ - metric: AP@1000
+ command: tools/eval/trec_eval.9.0.4/trec_eval
+ params: -c -m map
+ separator: "\t"
+ parse_index: 2
+ metric_precision: 4
+ can_combine: false
+ - metric: RR@10
+ command: tools/eval/trec_eval.9.0.4/trec_eval
+ params: -c -M 10 -m recip_rank
+ separator: "\t"
+ parse_index: 2
+ metric_precision: 4
+ can_combine: false
+ - metric: R@100
+ command: tools/eval/trec_eval.9.0.4/trec_eval
+ params: -c -m recall.100
+ separator: "\t"
+ parse_index: 2
+ metric_precision: 4
+ can_combine: false
+ - metric: R@1000
+ command: tools/eval/trec_eval.9.0.4/trec_eval
+ params: -c -m recall.1000
+ separator: "\t"
+ parse_index: 2
+ metric_precision: 4
+ can_combine: false
+
+topic_reader: TsvInt
+topics:
+ - name: "[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)"
+ id: dev
+ path: topics.msmarco-passage.dev-subset.txt
+ qrel: qrels.msmarco-passage.dev-subset.txt
+
+models:
+ - name: cos-dpr-distil-hnsw
+ display: cosDPR-distil
+ type: hnsw
+ params: -generator VectorQueryGenerator -topicField title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil
+ results:
+ AP@1000:
+ - 0.393
+ RR@10:
+ - 0.388
+ R@100:
+ - 0.903
+ R@1000:
+ - 0.974
diff --git a/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8.yaml b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8.yaml
new file mode 100644
index 0000000000..6f93e3b017
--- /dev/null
+++ b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-int8.yaml
@@ -0,0 +1,65 @@
+---
+corpus: msmarco-passage-cos-dpr-distil
+corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil/
+
+download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar
+download_checksum: e20ffbc8b5e7f760af31298aefeaebbd
+
+index_path: indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil-int8/
+index_type: hnsw
+collection_class: JsonDenseVectorCollection
+generator_class: HnswDenseVectorDocumentGenerator
+index_threads: 16
+index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8
+
+metrics:
+ - metric: AP@1000
+ command: tools/eval/trec_eval.9.0.4/trec_eval
+ params: -c -m map
+ separator: "\t"
+ parse_index: 2
+ metric_precision: 4
+ can_combine: false
+ - metric: RR@10
+ command: tools/eval/trec_eval.9.0.4/trec_eval
+ params: -c -M 10 -m recip_rank
+ separator: "\t"
+ parse_index: 2
+ metric_precision: 4
+ can_combine: false
+ - metric: R@100
+ command: tools/eval/trec_eval.9.0.4/trec_eval
+ params: -c -m recall.100
+ separator: "\t"
+ parse_index: 2
+ metric_precision: 4
+ can_combine: false
+ - metric: R@1000
+ command: tools/eval/trec_eval.9.0.4/trec_eval
+ params: -c -m recall.1000
+ separator: "\t"
+ parse_index: 2
+ metric_precision: 4
+ can_combine: false
+
+topic_reader: JsonIntVector
+topics:
+ - name: "[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)"
+ id: dev
+ path: topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.gz
+ qrel: qrels.msmarco-passage.dev-subset.txt
+
+models:
+ - name: cos-dpr-distil-hnsw
+ display: cosDPR-distil
+ type: hnsw
+ params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000
+ results:
+ AP@1000:
+ - 0.393
+ RR@10:
+ - 0.388
+ R@100:
+ - 0.903
+ R@1000:
+ - 0.974
diff --git a/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-onnx.yaml b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-onnx.yaml
index 4746f389f3..372d40a67b 100644
--- a/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-onnx.yaml
+++ b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-onnx.yaml
@@ -10,7 +10,7 @@ index_type: hnsw
collection_class: JsonDenseVectorCollection
generator_class: HnswDenseVectorDocumentGenerator
index_threads: 16
-index_options: -M 16 -efC 100 -memoryBuffer 65536
+index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge
metrics:
- metric: AP@1000
diff --git a/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw.yaml b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw.yaml
index c63c3b7972..a235e7823e 100644
--- a/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw.yaml
+++ b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw.yaml
@@ -10,7 +10,7 @@ index_type: hnsw
collection_class: JsonDenseVectorCollection
generator_class: HnswDenseVectorDocumentGenerator
index_threads: 16
-index_options: -M 16 -efC 100 -memoryBuffer 65536
+index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge
metrics:
- metric: AP@1000
diff --git a/src/main/resources/regression/msmarco-passage-openai-ada2-int8.yaml b/src/main/resources/regression/msmarco-passage-openai-ada2-int8.yaml
new file mode 100644
index 0000000000..9332504916
--- /dev/null
+++ b/src/main/resources/regression/msmarco-passage-openai-ada2-int8.yaml
@@ -0,0 +1,65 @@
+---
+corpus: msmarco-passage-openai-ada2
+corpus_path: collections/msmarco/msmarco-passage-openai-ada2/
+
+download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-ada2.tar
+download_checksum: a4d843d522ff3a3af7edbee789a63402
+
+index_path: indexes/lucene-hnsw.msmarco-passage-openai-ada2-int8/
+index_type: hnsw
+collection_class: JsonDenseVectorCollection
+generator_class: HnswDenseVectorDocumentGenerator
+index_threads: 16
+index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge -quantize.int8
+
+metrics:
+ - metric: AP@1000
+ command: tools/eval/trec_eval.9.0.4/trec_eval
+ params: -c -m map
+ separator: "\t"
+ parse_index: 2
+ metric_precision: 4
+ can_combine: false
+ - metric: RR@10
+ command: tools/eval/trec_eval.9.0.4/trec_eval
+ params: -c -M 10 -m recip_rank
+ separator: "\t"
+ parse_index: 2
+ metric_precision: 4
+ can_combine: false
+ - metric: R@100
+ command: tools/eval/trec_eval.9.0.4/trec_eval
+ params: -c -m recall.100
+ separator: "\t"
+ parse_index: 2
+ metric_precision: 4
+ can_combine: false
+ - metric: R@1000
+ command: tools/eval/trec_eval.9.0.4/trec_eval
+ params: -c -m recall.1000
+ separator: "\t"
+ parse_index: 2
+ metric_precision: 4
+ can_combine: false
+
+topic_reader: JsonIntVector
+topics:
+ - name: "[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)"
+ id: dev
+ path: topics.msmarco-passage.dev-subset.openai-ada2.jsonl.gz
+ qrel: qrels.msmarco-passage.dev-subset.txt
+
+models:
+ - name: openai-ada2
+ display: OpenAI-ada2
+ type: hnsw
+ params: -generator VectorQueryGenerator -topicField vector -threads 16 -hits 1000 -efSearch 1000
+ results:
+ AP@1000:
+ - 0.350
+ RR@10:
+ - 0.343
+ R@100:
+ - 0.898
+ R@1000:
+ - 0.985
diff --git a/src/main/resources/regression/msmarco-passage-openai-ada2.yaml b/src/main/resources/regression/msmarco-passage-openai-ada2.yaml
index 08289ce2c0..5bd13b6d28 100644
--- a/src/main/resources/regression/msmarco-passage-openai-ada2.yaml
+++ b/src/main/resources/regression/msmarco-passage-openai-ada2.yaml
@@ -10,7 +10,7 @@ index_type: hnsw
collection_class: JsonDenseVectorCollection
generator_class: HnswDenseVectorDocumentGenerator
index_threads: 16
-index_options: -M 16 -efC 100 -memoryBuffer 65536
+index_options: -M 16 -efC 100 -memoryBuffer 65536 -noMerge
metrics:
- metric: AP@1000
diff --git a/src/test/java/io/anserini/index/IndexHnswDenseVectorsTest.java b/src/test/java/io/anserini/index/IndexHnswDenseVectorsTest.java
index bade9e0366..535475aee5 100644
--- a/src/test/java/io/anserini/index/IndexHnswDenseVectorsTest.java
+++ b/src/test/java/io/anserini/index/IndexHnswDenseVectorsTest.java
@@ -152,4 +152,26 @@ public void test1() throws Exception {
assertNotNull(results);
assertEquals(100, results.get("documents"));
}
+
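+  // Sanity check for int8 scalar quantization: indexing with -quantize.int8 should still index all 100 sample documents.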
+ @Test
+ public void testQuantizedInt8() throws Exception {
+ String indexPath = "target/idx-sample-hnsw" + System.currentTimeMillis();
+ String[] indexArgs = new String[] {
+ "-collection", "JsonDenseVectorCollection",
+ "-input", "src/test/resources/sample_docs/openai_ada2/json_vector",
+ "-index", indexPath,
+ "-generator", "HnswDenseVectorDocumentGenerator",
+ "-threads", "1",
+ "-M", "16", "-efC", "100", "-quantize.int8"
+ };
+
+ IndexHnswDenseVectors.main(indexArgs);
+
+ IndexReader reader = IndexReaderUtils.getReader(indexPath);
+ assertNotNull(reader);
+
+ Map results = IndexReaderUtils.getIndexStats(reader, Constants.VECTOR);
+ assertNotNull(results);
+ assertEquals(100, results.get("documents"));
+ }
}
\ No newline at end of file
diff --git a/src/test/java/io/anserini/index/PrebuiltIndexHandlerTest.java b/src/test/java/io/anserini/index/PrebuiltIndexHandlerTest.java
new file mode 100644
index 0000000000..a1ba830f19
--- /dev/null
+++ b/src/test/java/io/anserini/index/PrebuiltIndexHandlerTest.java
@@ -0,0 +1,58 @@
+/*
+ * Anserini: A Lucene toolkit for reproducible information retrieval research
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.anserini.index;
+
+import java.io.IOException;
+
+import org.junit.After;
+import org.junit.Before;
+import org.junit.Test;
+
+import io.anserini.util.PrebuiltIndexHandler;
+
+public class PrebuiltIndexHandlerTest {
+ private PrebuiltIndexHandler handler;
+
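+  // Smoke test: download a small prebuilt index ("cacm") and decompress it; any failure fails the test.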
+ @Test
+ public void testHandler() throws Exception {
+    try {
+      handler.download();
+    } catch (IOException e) {
+      throw new Exception("Failed to download index.", e);
+    }
+    try {
+      handler.decompressIndex();
+    } catch (Exception e) {
+      throw new Exception("Failed to decompress index.", e);
+    }
+ }
+
+ @Before
+ public void setUp() throws Exception {
+ handler = new PrebuiltIndexHandler("cacm"); // we use a lightweight index for testing
+ handler.initialize();
+ }
+
+ @After
+ public void tearDown() throws Exception {
+ // delete the index downloaded
+ if (handler.getIndexFolderPath() != null && handler.getIndexFolderPath().toFile().exists()) {
+ handler.getIndexFolderPath().toFile().delete();
+ }
+ }
+
+}
diff --git a/src/test/java/io/anserini/index/PrebuiltIndexTest.java b/src/test/java/io/anserini/index/PrebuiltIndexTest.java
new file mode 100644
index 0000000000..c00aed4f23
--- /dev/null
+++ b/src/test/java/io/anserini/index/PrebuiltIndexTest.java
@@ -0,0 +1,61 @@
+/*
+ * Anserini: A Lucene toolkit for reproducible information retrieval research
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package io.anserini.index;
+
+import java.io.IOException;
+import java.net.HttpURLConnection;
+import java.net.URL;
+
+import org.junit.Test;
+
+public class PrebuiltIndexTest {
+
+ // test checksum validity
+ @Test
+ public void testChecksum() {
+ for (IndexInfo info : IndexInfo.values()) {
+ // check each checksum is valid
+ assert info.md5.length() == 32;
+ assert info.md5.matches("[a-fA-F0-9]+");
+ }
+ }
+
+ // test url validity
+ @Test
+ public void testUrls() {
+ for (IndexInfo info : IndexInfo.values()) {
+ for (String url : info.urls) {
+ // check each url status code is 200
+ try {
+          final URL requestUrl = new URL(url);
+ final HttpURLConnection con = (HttpURLConnection) requestUrl.openConnection();
+ assert con.getResponseCode() == 200;
+ } catch (IOException e) {
+ throw new RuntimeException("Error connecting to " + url, e);
+ } catch (Exception e) {
+ throw new RuntimeException("Malformed URL: " + url, e);
+ }
+ }
+ }
+ }
+
+ // test number of prebuilt-indexes
+ @Test
+ public void testNumPrebuiltIndexes() {
+ assert IndexInfo.values().length == 2;
+ }
+}
diff --git a/src/test/java/io/anserini/search/SearchHnswDenseVectorsTest.java b/src/test/java/io/anserini/search/SearchHnswDenseVectorsTest.java
index 683af4c779..2a9e2a9fab 100644
--- a/src/test/java/io/anserini/search/SearchHnswDenseVectorsTest.java
+++ b/src/test/java/io/anserini/search/SearchHnswDenseVectorsTest.java
@@ -305,13 +305,13 @@ public void testBasicAda2() throws Exception {
"-hits", "5"};
SearchHnswDenseVectors.main(searchArgs);
- TestUtils.checkFile(runfile, new String[] {
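+    // Expected scores can differ in the last decimal place, so compare the run file approximately.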
+ TestUtils.checkRunFileApproximate(runfile, new String[] {
"160885 Q0 45 1 0.863064 Anserini",
"160885 Q0 44 2 0.861596 Anserini",
"160885 Q0 40 3 0.858651 Anserini",
"160885 Q0 48 4 0.858514 Anserini",
- "160885 Q0 41 5 0.856264 Anserini",
- "867490 Q0 10 1 0.850332 Anserini",
+ "160885 Q0 41 5 0.856265 Anserini",
+ "867490 Q0 10 1 0.850331 Anserini",
"867490 Q0 45 2 0.846281 Anserini",
"867490 Q0 44 3 0.845236 Anserini",
"867490 Q0 95 4 0.845013 Anserini",
@@ -393,7 +393,7 @@ public void testBasicWithOnnx() throws Exception {
SearchHnswDenseVectors.main(searchArgs);
// Note output is slightly different from pre-encoded query vectors.
- TestUtils.checkFile(runfile, new String[] {
+ TestUtils.checkRunFileApproximate(runfile, new String[] {
"2 Q0 208 1 0.578723 Anserini",
"2 Q0 224 2 0.578716 Anserini",
"2 Q0 384 3 0.573913 Anserini",
@@ -437,7 +437,7 @@ public void testRemoveQuery() throws Exception {
"-removeQuery"};
SearchHnswDenseVectors.main(searchArgs);
- TestUtils.checkFile(runfile, new String[] {
+ TestUtils.checkRunFileApproximate(runfile, new String[] {
"10 Q0 45 1 0.846281 Anserini",
"10 Q0 44 2 0.845236 Anserini",
"10 Q0 95 3 0.845013 Anserini",
@@ -480,7 +480,7 @@ public void testPassage() throws Exception {
"-hits", "10"};
SearchHnswDenseVectors.main(searchArgs);
- TestUtils.checkFile(runfile, new String[] {
+ TestUtils.checkRunFileApproximate(runfile, new String[] {
"160885 Q0 44 1 0.863064 Anserini",
"160885 Q0 40 2 0.858651 Anserini",
"160885 Q0 48 3 0.858514 Anserini",
diff --git a/src/test/java/io/anserini/search/SearchInvertedDenseVectorsTest.java b/src/test/java/io/anserini/search/SearchInvertedDenseVectorsTest.java
index e0071e0d02..f9524aded0 100644
--- a/src/test/java/io/anserini/search/SearchInvertedDenseVectorsTest.java
+++ b/src/test/java/io/anserini/search/SearchInvertedDenseVectorsTest.java
@@ -110,7 +110,6 @@ public void testInvalidIndex2() throws Exception {
"-hits", "5",
"-encoding", "fw"};
SearchInvertedDenseVectors.main(searchArgs);
-
assertEquals("Error: \"src/\" does not appear to be a valid index.\n", err.toString());
restoreStderr();
}
@@ -310,4 +309,4 @@ public void searchLLTest() throws Exception {
new File(runfile).delete();
}
-}
\ No newline at end of file
+}