From 6369184625c84dfa45b775ddedeb2bc31ceb84c7 Mon Sep 17 00:00:00 2001 From: Jimmy Lin Date: Sat, 18 Nov 2023 07:44:11 -0500 Subject: [PATCH] Clean up of dense inverted indexing code (#2263) --- README.md | 8 +- docs/regressions.md | 18 +- ...ressions-dl19-passage-cos-dpr-distil-fw.md | 114 ++++ ...-dl19-passage-cos-dpr-distil-hnsw-onnx.md} | 26 +- ...sions-dl19-passage-cos-dpr-distil-hnsw.md} | 24 +- ...ions-dl19-passage-cos-dpr-distil-lexlsh.md | 114 ++++ ...ressions-dl20-passage-cos-dpr-distil-fw.md | 114 ++++ ...-dl20-passage-cos-dpr-distil-hnsw-onnx.md} | 26 +- ...sions-dl20-passage-cos-dpr-distil-hnsw.md} | 24 +- ...ions-dl20-passage-cos-dpr-distil-lexlsh.md | 114 ++++ ...sions-msmarco-passage-cos-dpr-distil-fw.md | 108 ++++ ...marco-passage-cos-dpr-distil-hnsw-onnx.md} | 32 +- ...ns-msmarco-passage-cos-dpr-distil-hnsw.md} | 24 +- ...s-msmarco-passage-cos-dpr-distil-lexlsh.md | 108 ++++ pom.xml | 20 +- src/main/java/io/anserini/index/Counters.java | 51 ++ .../io/anserini/index/IndexCollection.java | 30 -- .../anserini/index/IndexHnswDenseVectors.java | 34 +- .../index/IndexInvertedDenseVectors.java | 504 +++++------------- .../io/anserini/index/IndexReaderUtils.java | 19 +- .../InvertedDenseVectorDocumentGenerator.java | 24 +- .../search/EvaluateInvertedDenseVectors.java | 256 --------- .../search/SearchInvertedDenseVectors.java | 137 +---- .../io/anserini/search/SearchVectorArgs.java | 103 ---- src/main/python/regressions-batch03.txt | 20 +- src/main/python/run_regression.py | 18 +- .../dl19-passage-cos-dpr-distil-fw.template | 92 ++++ ...passage-cos-dpr-distil-hnsw-onnx.template} | 0 ...dl19-passage-cos-dpr-distil-hnsw.template} | 0 ...l19-passage-cos-dpr-distil-lexlsh.template | 92 ++++ .../dl20-passage-cos-dpr-distil-fw.template | 92 ++++ ...passage-cos-dpr-distil-hnsw-onnx.template} | 0 ...dl20-passage-cos-dpr-distil-hnsw.template} | 0 ...l20-passage-cos-dpr-distil-lexlsh.template | 92 ++++ ...msmarco-passage-cos-dpr-distil-fw.template | 86 +++ 
...passage-cos-dpr-distil-hnsw-onnx.template} | 0 ...arco-passage-cos-dpr-distil-hnsw.template} | 0 ...rco-passage-cos-dpr-distil-lexlsh.template | 86 +++ .../dl19-passage-cos-dpr-distil-fw.yaml | 69 +++ ...l19-passage-cos-dpr-distil-hnsw-onnx.yaml} | 6 +- ... => dl19-passage-cos-dpr-distil-hnsw.yaml} | 4 +- .../dl19-passage-cos-dpr-distil-lexlsh.yaml | 69 +++ .../regression/dl19-passage-openai-ada2.yaml | 2 + .../dl20-passage-cos-dpr-distil-fw.yaml | 69 +++ ...l20-passage-cos-dpr-distil-hnsw-onnx.yaml} | 6 +- ... => dl20-passage-cos-dpr-distil-hnsw.yaml} | 4 +- .../dl20-passage-cos-dpr-distil-lexlsh.yaml | 69 +++ .../regression/dl20-passage-openai-ada2.yaml | 2 + .../msmarco-passage-cos-dpr-distil-fw.yaml | 69 +++ ...rco-passage-cos-dpr-distil-hnsw-onnx.yaml} | 6 +- ... msmarco-passage-cos-dpr-distil-hnsw.yaml} | 4 +- ...msmarco-passage-cos-dpr-distil-lexlsh.yaml | 69 +++ .../msmarco-passage-openai-ada2.yaml | 2 + src/test/java/io/anserini/doc/DataModel.java | 29 +- .../doc/GenerateRegressionDocsTest.java | 1 - .../index/IndexInvertedDenseVectorsTest.java | 161 ++++-- .../EvaluateInvertedDenseVectorsTest.java | 47 -- .../SearchInvertedDenseVectorsTest.java | 117 +++- src/test/resources/mini-word-vectors.txt | 4 - ...s.msmarco-passage-dev-cos-dpr-distil.jsonl | 2 + 60 files changed, 2244 insertions(+), 1177 deletions(-) create mode 100644 docs/regressions/regressions-dl19-passage-cos-dpr-distil-fw.md rename docs/regressions/{regressions-dl19-passage-cos-dpr-distil-onnx.md => regressions-dl19-passage-cos-dpr-distil-hnsw-onnx.md} (88%) rename docs/regressions/{regressions-dl19-passage-cos-dpr-distil.md => regressions-dl19-passage-cos-dpr-distil-hnsw.md} (88%) create mode 100644 docs/regressions/regressions-dl19-passage-cos-dpr-distil-lexlsh.md create mode 100644 docs/regressions/regressions-dl20-passage-cos-dpr-distil-fw.md rename docs/regressions/{regressions-dl20-passage-cos-dpr-distil-onnx.md => regressions-dl20-passage-cos-dpr-distil-hnsw-onnx.md} (88%) rename 
docs/regressions/{regressions-dl20-passage-cos-dpr-distil.md => regressions-dl20-passage-cos-dpr-distil-hnsw.md} (88%) create mode 100644 docs/regressions/regressions-dl20-passage-cos-dpr-distil-lexlsh.md create mode 100644 docs/regressions/regressions-msmarco-passage-cos-dpr-distil-fw.md rename docs/regressions/{regressions-msmarco-passage-cos-dpr-distil-onnx.md => regressions-msmarco-passage-cos-dpr-distil-hnsw-onnx.md} (87%) rename docs/regressions/{regressions-msmarco-passage-cos-dpr-distil.md => regressions-msmarco-passage-cos-dpr-distil-hnsw.md} (86%) create mode 100644 docs/regressions/regressions-msmarco-passage-cos-dpr-distil-lexlsh.md create mode 100644 src/main/java/io/anserini/index/Counters.java delete mode 100644 src/main/java/io/anserini/search/EvaluateInvertedDenseVectors.java delete mode 100644 src/main/java/io/anserini/search/SearchVectorArgs.java create mode 100644 src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-fw.template rename src/main/resources/docgen/templates/{dl19-passage-cos-dpr-distil-onnx.template => dl19-passage-cos-dpr-distil-hnsw-onnx.template} (100%) rename src/main/resources/docgen/templates/{dl19-passage-cos-dpr-distil.template => dl19-passage-cos-dpr-distil-hnsw.template} (100%) create mode 100644 src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-lexlsh.template create mode 100644 src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-fw.template rename src/main/resources/docgen/templates/{dl20-passage-cos-dpr-distil-onnx.template => dl20-passage-cos-dpr-distil-hnsw-onnx.template} (100%) rename src/main/resources/docgen/templates/{dl20-passage-cos-dpr-distil.template => dl20-passage-cos-dpr-distil-hnsw.template} (100%) create mode 100644 src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-lexlsh.template create mode 100644 src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-fw.template rename 
src/main/resources/docgen/templates/{msmarco-passage-cos-dpr-distil-onnx.template => msmarco-passage-cos-dpr-distil-hnsw-onnx.template} (100%) rename src/main/resources/docgen/templates/{msmarco-passage-cos-dpr-distil.template => msmarco-passage-cos-dpr-distil-hnsw.template} (100%) create mode 100644 src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-lexlsh.template create mode 100644 src/main/resources/regression/dl19-passage-cos-dpr-distil-fw.yaml rename src/main/resources/regression/{dl19-passage-cos-dpr-distil-onnx.yaml => dl19-passage-cos-dpr-distil-hnsw-onnx.yaml} (94%) rename src/main/resources/regression/{dl19-passage-cos-dpr-distil.yaml => dl19-passage-cos-dpr-distil-hnsw.yaml} (96%) create mode 100644 src/main/resources/regression/dl19-passage-cos-dpr-distil-lexlsh.yaml create mode 100644 src/main/resources/regression/dl20-passage-cos-dpr-distil-fw.yaml rename src/main/resources/regression/{dl20-passage-cos-dpr-distil-onnx.yaml => dl20-passage-cos-dpr-distil-hnsw-onnx.yaml} (94%) rename src/main/resources/regression/{dl20-passage-cos-dpr-distil.yaml => dl20-passage-cos-dpr-distil-hnsw.yaml} (96%) create mode 100644 src/main/resources/regression/dl20-passage-cos-dpr-distil-lexlsh.yaml create mode 100644 src/main/resources/regression/msmarco-passage-cos-dpr-distil-fw.yaml rename src/main/resources/regression/{msmarco-passage-cos-dpr-distil-onnx.yaml => msmarco-passage-cos-dpr-distil-hnsw-onnx.yaml} (95%) rename src/main/resources/regression/{msmarco-passage-cos-dpr-distil.yaml => msmarco-passage-cos-dpr-distil-hnsw.yaml} (96%) create mode 100644 src/main/resources/regression/msmarco-passage-cos-dpr-distil-lexlsh.yaml delete mode 100644 src/test/java/io/anserini/search/EvaluateInvertedDenseVectorsTest.java delete mode 100644 src/test/resources/mini-word-vectors.txt create mode 100644 src/test/resources/sample_topics/sample-topics.msmarco-passage-dev-cos-dpr-distil.jsonl diff --git a/README.md b/README.md index 875ae803b8..be5703721c 100644 
--- a/README.md +++ b/README.md @@ -89,9 +89,11 @@ See individual pages for details! | SPLADE++ CoCondenser-SelfDistil | [✓](docs/regressions/regressions-msmarco-passage-splade-pp-sd.md) | [✓](docs/regressions/regressions-dl19-passage-splade-pp-sd.md) | [✓](docs/regressions/regressions-dl20-passage-splade-pp-sd.md) | | SPLADE++ CoCondenser-SelfDistil (ONNX) | [✓](docs/regressions/regressions-msmarco-passage-splade-pp-sd-onnx.md) | [✓](docs/regressions/regressions-dl19-passage-splade-pp-sd-onnx.md) | [✓](docs/regressions/regressions-dl20-passage-splade-pp-sd-onnx.md) | | **Learned Dense** | | | | -| cosDPR-distil | [✓](docs/regressions/regressions-msmarco-passage-cos-dpr-distil.md) | [✓](docs/regressions/regressions-dl19-passage-cos-dpr-distil.md) | [✓](docs/regressions/regressions-dl20-passage-cos-dpr-distil.md) | -| cosDPR-distil (ONNX) | [✓](docs/regressions/regressions-msmarco-passage-cos-dpr-distil-onnx.md) | [✓](docs/regressions/regressions-dl19-passage-cos-dpr-distil-onnx.md) | [✓](docs/regressions/regressions-dl20-passage-cos-dpr-distil-onnx.md) | -| OpenAI-ada2 | [✓](docs/regressions/regressions-msmarco-passage-openai-ada2.md) | [✓](docs/regressions/regressions-dl19-passage-openai-ada2.md) | [✓](docs/regressions/regressions-dl20-passage-openai-ada2.md) | +| cosDPR-distil w/ HNSW | [✓](docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw.md) | [✓](docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw.md) | [✓](docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw.md) | +| cosDPR-distil w/ HNSW (ONNX) | [✓](docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-onnx.md) | [✓](docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-onnx.md) | [✓](docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-onnx.md) | +| cosDPR-distil w/ "fake words" | [✓](docs/regressions/regressions-msmarco-passage-cos-dpr-distil-fw.md) | [✓](docs/regressions/regressions-dl19-passage-cos-dpr-distil-fw.md) | 
[✓](docs/regressions/regressions-dl20-passage-cos-dpr-distil-fw.md) | +| cosDPR-distil w/ "LexLSH" | [✓](docs/regressions/regressions-msmarco-passage-cos-dpr-distil-lexlsh.md) | [✓](docs/regressions/regressions-dl19-passage-cos-dpr-distil-lexlsh.md) | [✓](docs/regressions/regressions-dl20-passage-cos-dpr-distil-lexlsh.md) | +| OpenAI-ada2 w/ HNSW | [✓](docs/regressions/regressions-msmarco-passage-openai-ada2.md) | [✓](docs/regressions/regressions-dl19-passage-openai-ada2.md) | [✓](docs/regressions/regressions-dl20-passage-openai-ada2.md) | ### Available Corpora for Download diff --git a/docs/regressions.md b/docs/regressions.md index 3cac4d7a93..a806407475 100644 --- a/docs/regressions.md +++ b/docs/regressions.md @@ -50,12 +50,14 @@ nohup python src/main/python/run_regression.py --index --verify --search --regre nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-splade-distil-cocodenser-medium >& logs/log.msmarco-passage-splade-distil-cocodenser-medium & nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-splade-pp-ed >& logs/log.msmarco-passage-splade-pp-ed & nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-splade-pp-sd >& logs/log.msmarco-passage-splade-pp-sd & -nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil >& logs/log.msmarco-passage-cos-dpr-distil & +nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw >& logs/log.msmarco-passage-cos-dpr-distil-hnsw & +nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-fw >& logs/log.msmarco-passage-cos-dpr-distil-fw & +nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-lexlsh >& 
logs/log.msmarco-passage-cos-dpr-distil-lexlsh & nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-openai-ada2 >& logs/log.msmarco-passage-openai-ada2 & nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression msmarco-passage-splade-pp-ed-onnx >& logs/log.msmarco-passage-splade-pp-ed-onnx & nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression msmarco-passage-splade-pp-sd-onnx >& logs/log.msmarco-passage-splade-pp-sd-onnx & -nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression msmarco-passage-cos-dpr-distil-onnx >& logs/log.msmarco-passage-cos-dpr-distil-onnx & +nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-onnx >& logs/log.msmarco-passage-cos-dpr-distil-hnsw-onnx & nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-doc >& logs/log.msmarco-doc & nohup python src/main/python/run_regression.py --index --verify --search --regression msmarco-doc-wp >& logs/log.msmarco-doc-wp & @@ -80,12 +82,14 @@ nohup python src/main/python/run_regression.py --index --verify --search --regre nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-splade-distil-cocodenser-medium >& logs/log.dl19-passage-splade-distil-cocodenser-medium & nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-splade-pp-ed >& logs/log.dl19-passage-splade-pp-ed & nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-splade-pp-sd >& logs/log.dl19-passage-splade-pp-sd & -nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil >& logs/log.dl19-passage-cos-dpr-distil & +nohup python 
src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw >& logs/log.dl19-passage-cos-dpr-distil-hnsw & +nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-fw >& logs/log.dl19-passage-cos-dpr-distil-fw & +nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-lexlsh >& logs/log.dl19-passage-cos-dpr-distil-lexlsh & nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-openai-ada2 >& logs/log.dl19-passage-openai-ada2 & nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression dl19-passage-splade-pp-ed-onnx >& logs/log.dl19-passage-splade-pp-ed-onnx & nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression dl19-passage-splade-pp-sd-onnx >& logs/log.dl19-passage-splade-pp-sd-onnx & -nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression dl19-passage-cos-dpr-distil-onnx >& logs/log.dl19-passage-cos-dpr-distil-onnx & +nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-onnx >& logs/log.dl19-passage-cos-dpr-distil-hnsw-onnx & nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-doc >& logs/log.dl19-doc & nohup python src/main/python/run_regression.py --index --verify --search --regression dl19-doc-wp >& logs/log.dl19-doc-wp & @@ -110,12 +114,14 @@ nohup python src/main/python/run_regression.py --index --verify --search --regre nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-splade-distil-cocodenser-medium >& logs/log.dl20-passage-splade-distil-cocodenser-medium & nohup python src/main/python/run_regression.py --index --verify --search --regression 
dl20-passage-splade-pp-ed >& logs/log.dl20-passage-splade-pp-ed & nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-splade-pp-sd >& logs/log.dl20-passage-splade-pp-sd & -nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil >& logs/log.dl20-passage-cos-dpr-distil & +nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw >& logs/log.dl20-passage-cos-dpr-distil-hnsw & +nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-fw >& logs/log.dl20-passage-cos-dpr-distil-fw & +nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-lexlsh >& logs/log.dl20-passage-cos-dpr-distil-lexlsh & nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-openai-ada2 >& logs/log.dl20-passage-openai-ada2 & nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression dl20-passage-splade-pp-ed-onnx >& logs/log.dl20-passage-splade-pp-ed-onnx & nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression dl20-passage-splade-pp-sd-onnx >& logs/log.dl20-passage-splade-pp-sd-onnx & -nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression dl20-passage-cos-dpr-distil-onnx >& logs/log.dl20-passage-cos-dpr-distil-onnx & +nohup python src/main/python/run_regression.py --search-pool 1 --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-onnx >& logs/log.dl20-passage-cos-dpr-distil-hnsw-onnx & nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-doc >& logs/log.dl20-doc & nohup python src/main/python/run_regression.py --index --verify --search --regression dl20-doc-wp >& logs/log.dl20-doc-wp & 
diff --git a/docs/regressions/regressions-dl19-passage-cos-dpr-distil-fw.md b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-fw.md new file mode 100644 index 0000000000..b355cfba42 --- /dev/null +++ b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-fw.md @@ -0,0 +1,114 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with inverted indexes using the "fake-words" technique (q=40); pre-encoded queries + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html). +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage-cos-dpr-distil-fw.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-fw.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-fw +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. 
+ +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage-cos-dpr-distil-fw +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.tar` is 57 GB and has MD5 checksum `e20ffbc8b5e7f760af31298aefeaebbd`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-fw \ + --corpus-path collections/msmarco-passage-cos-dpr-distil +``` + +## Indexing + +Sample indexing command, applying inverted indexes to dense vectors using the "fake-words" technique: + +```bash +target/appassembler/bin/IndexInvertedDenseVectors \ + -collection JsonDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil \ + -index indexes/lucene-index.msmarco-passage-cos-dpr-distil.fw-40/ \ + -generator InvertedDenseVectorDocumentGenerator \ + -threads 16 -encoding fw -fw.q 40 \ + >& logs/log.msmarco-passage-cos-dpr-distil & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above. + +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. 
+The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +target/appassembler/bin/SearchInvertedDenseVectors \ + -index indexes/lucene-index.msmarco-passage-cos-dpr-distil.fw-40/ \ + -topics tools/topics-and-qrels/topics.dl19-passage.cos-dpr-distil.jsonl.gz \ + -topicreader JsonIntVector \ + -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-fw-40.topics.dl19-passage.cos-dpr-distil.jsonl.txt \ + -topicfield vector -encoding fw -fw.q 40 -hits 1000 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +tools/eval/trec_eval.9.0.4/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-fw-40.topics.dl19-passage.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-fw-40.topics.dl19-passage.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-fw-40.topics.dl19-passage.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-fw-40.topics.dl19-passage.cos-dpr-distil.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distil**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2019.html) | 0.4271 | +| **nDCG@10** | 
**cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2019.html) | 0.6857 | +| **R@100** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2019.html) | 0.5766 | +| **R@1000** | **cosDPR-distil**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2019.html) | 0.7902 | + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl19-passage-cos-dpr-distil-fw.yaml). + +Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-fw.template) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/docs/regressions/regressions-dl19-passage-cos-dpr-distil-onnx.md b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-onnx.md similarity index 88% rename from docs/regressions/regressions-dl19-passage-cos-dpr-distil-onnx.md rename to docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-onnx.md index 5f9e5390fc..d68a6c8754 100644 --- a/docs/regressions/regressions-dl19-passage-cos-dpr-distil-onnx.md +++ b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw-onnx.md @@ -11,13 +11,13 @@ In these experiments, we are using pre-encoded queries (i.e., cached results of Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage-cos-dpr-distil-onnx.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-onnx +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-onnx ``` We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. @@ -25,7 +25,7 @@ We make available a version of the MS MARCO Passage Corpus that has already been From any machine, the following command will download the corpus and perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage-cos-dpr-distil-onnx +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-onnx ``` The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
@@ -43,7 +43,7 @@ To confirm, `msmarco-passage-cos-dpr-distil.tar` is 57 GB and has MD5 checksum ` With the corpus downloaded, the following command will perform the remaining steps below: ```bash -python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-onnx \ +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-onnx \ --corpus-path collections/msmarco-passage-cos-dpr-distil ``` @@ -57,7 +57,7 @@ target/appassembler/bin/IndexHnswDenseVectors \ -input /path/to/msmarco-passage-cos-dpr-distil \ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ \ -generator LuceneDenseVectorDocumentGenerator \ - -threads 16 -M 16 -efC 100 -encoder CosDprDistil \ + -threads 16 -M 16 -efC 100 \ >& logs/log.msmarco-passage-cos-dpr-distil & ``` @@ -78,17 +78,17 @@ target/appassembler/bin/SearchHnswDenseVectors \ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ \ -topics tools/topics-and-qrels/topics.dl19-passage.txt \ -topicreader TsvInt \ - -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil.topics.dl19-passage.txt \ + -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.txt \ -querygenerator VectorQueryGenerator -topicfield title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil & ``` Evaluation can be performed using `trec_eval`: ```bash -tools/eval/trec_eval.9.0.4/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil.topics.dl19-passage.txt -tools/eval/trec_eval.9.0.4/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil.topics.dl19-passage.txt -tools/eval/trec_eval.9.0.4/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil.topics.dl19-passage.txt 
-tools/eval/trec_eval.9.0.4/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil.topics.dl19-passage.txt +tools/eval/trec_eval.9.0.4/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.txt +tools/eval/trec_eval.9.0.4/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.txt ``` ## Effectiveness @@ -106,7 +106,7 @@ With the above commands, you should be able to reproduce the following results: | [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.805 | Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl19-passage-cos-dpr-distil-onnx.yaml). +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-onnx.yaml). Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). 
@@ -114,4 +114,4 @@ The experimental results reported here are directly comparable to the results re ## Reproduction Log[*](reproducibility.md) -To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-onnx.template) and run `bin/build.sh` to rebuild the documentation. +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-onnx.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl19-passage-cos-dpr-distil.md b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw.md similarity index 88% rename from docs/regressions/regressions-dl19-passage-cos-dpr-distil.md rename to docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw.md index 85da7c9e5b..c86f607c0a 100644 --- a/docs/regressions/regressions-dl19-passage-cos-dpr-distil.md +++ b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-hnsw.md @@ -11,13 +11,13 @@ In these experiments, we are using pre-encoded queries (i.e., cached results of Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage-cos-dpr-distil.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
+The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw ``` We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. @@ -25,7 +25,7 @@ We make available a version of the MS MARCO Passage Corpus that has already been From any machine, the following command will download the corpus and perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage-cos-dpr-distil +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw ``` The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
@@ -43,7 +43,7 @@ To confirm, `msmarco-passage-cos-dpr-distil.tar` is 57 GB and has MD5 checksum ` With the corpus downloaded, the following command will perform the remaining steps below: ```bash -python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil \ +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-hnsw \ --corpus-path collections/msmarco-passage-cos-dpr-distil ``` @@ -78,17 +78,17 @@ target/appassembler/bin/SearchHnswDenseVectors \ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ \ -topics tools/topics-and-qrels/topics.dl19-passage.cos-dpr-distil.jsonl.gz \ -topicreader JsonIntVector \ - -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil.topics.dl19-passage.cos-dpr-distil.jsonl.txt \ + -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.cos-dpr-distil.jsonl.txt \ -querygenerator VectorQueryGenerator -topicfield vector -threads 16 -hits 1000 -efSearch 1000 & ``` Evaluation can be performed using `trec_eval`: ```bash -tools/eval/trec_eval.9.0.4/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil.topics.dl19-passage.cos-dpr-distil.jsonl.txt -tools/eval/trec_eval.9.0.4/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil.topics.dl19-passage.cos-dpr-distil.jsonl.txt -tools/eval/trec_eval.9.0.4/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil.topics.dl19-passage.cos-dpr-distil.jsonl.txt -tools/eval/trec_eval.9.0.4/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil.topics.dl19-passage.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m map -c -l 2 
tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl19-passage.cos-dpr-distil.jsonl.txt ``` ## Effectiveness @@ -106,7 +106,7 @@ With the above commands, you should be able to reproduce the following results: | [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.805 | Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl19-passage-cos-dpr-distil.yaml). +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw.yaml). Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). 
@@ -114,4 +114,4 @@ The experimental results reported here are directly comparable to the results re ## Reproduction Log[*](reproducibility.md) -To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil.template) and run `bin/build.sh` to rebuild the documentation. +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl19-passage-cos-dpr-distil-lexlsh.md b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-lexlsh.md new file mode 100644 index 0000000000..8e4431311e --- /dev/null +++ b/docs/regressions/regressions-dl19-passage-cos-dpr-distil-lexlsh.md @@ -0,0 +1,114 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with inverted indexes using the "LexLSH" technique (b=600); pre-encoded queries + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html). +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl19-passage-cos-dpr-distil-lexlsh.yaml). 
+Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-lexlsh.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-lexlsh +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl19-passage-cos-dpr-distil-lexlsh +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.tar` is 57 GB and has MD5 checksum `e20ffbc8b5e7f760af31298aefeaebbd`. 
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl19-passage-cos-dpr-distil-lexlsh \ + --corpus-path collections/msmarco-passage-cos-dpr-distil +``` + +## Indexing + +Sample indexing command, applying inverted indexes to dense vectors using the "LexLSH" technique: + +```bash +target/appassembler/bin/IndexInvertedDenseVectors \ + -collection JsonDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil \ + -index indexes/lucene-index.msmarco-passage-cos-dpr-distil.lexlsh-600/ \ + -generator InvertedDenseVectorDocumentGenerator \ + -threads 16 -encoding lexlsh -lexlsh.b 600 \ + >& logs/log.msmarco-passage-cos-dpr-distil & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above. + +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). 
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +target/appassembler/bin/SearchInvertedDenseVectors \ + -index indexes/lucene-index.msmarco-passage-cos-dpr-distil.lexlsh-600/ \ + -topics tools/topics-and-qrels/topics.dl19-passage.cos-dpr-distil.jsonl.gz \ + -topicreader JsonIntVector \ + -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-lexlsh-600.topics.dl19-passage.cos-dpr-distil.jsonl.txt \ + -topicfield vector -encoding lexlsh -lexlsh.b 600 -hits 1000 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +tools/eval/trec_eval.9.0.4/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-lexlsh-600.topics.dl19-passage.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-lexlsh-600.topics.dl19-passage.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-lexlsh-600.topics.dl19-passage.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl19-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-lexlsh-600.topics.dl19-passage.cos-dpr-distil.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distill**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.4118 | +| **nDCG@10** | **cosDPR-distill**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.6716 | +| **R@100** | **cosDPR-distill**| +| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) 
| 0.5545 |
+| **R@1000** | **cosDPR-distill**|
+| [DL19 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.7610 |
+
+Note that due to the non-deterministic nature of multi-threaded indexing, results may differ slightly between each experimental run.
+Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl19-passage-cos-dpr-distil-lexlsh.yaml).
+
+Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking).
+Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`).
+The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820).
+
+## Reproduction Log[*](reproducibility.md)
+
+To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-lexlsh.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl20-passage-cos-dpr-distil-fw.md b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-fw.md new file mode 100644 index 0000000000..1b165bcdfc --- /dev/null +++ b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-fw.md @@ -0,0 +1,114 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with inverted indexes using the "fake-words" technique (q=40); pre-encoded queries + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2020.html). +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding).
+ +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage-cos-dpr-distil-fw.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-fw.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-fw +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage-cos-dpr-distil-fw +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
+ +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.tar` is 57 GB and has MD5 checksum `e20ffbc8b5e7f760af31298aefeaebbd`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-fw \ + --corpus-path collections/msmarco-passage-cos-dpr-distil +``` + +## Indexing + +Sample indexing command, applying inverted indexes to dense vectors using the "fake-words" technique: + +```bash +target/appassembler/bin/IndexInvertedDenseVectors \ + -collection JsonDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil \ + -index indexes/lucene-index.msmarco-passage-cos-dpr-distil.fw-40/ \ + -generator InvertedDenseVectorDocumentGenerator \ + -threads 16 -encoding fw -fw.q 40 \ + >& logs/log.msmarco-passage-cos-dpr-distil & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above. + +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). 
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +target/appassembler/bin/SearchInvertedDenseVectors \ + -index indexes/lucene-index.msmarco-passage-cos-dpr-distil.fw-40/ \ + -topics tools/topics-and-qrels/topics.dl20.cos-dpr-distil.jsonl.gz \ + -topicreader JsonIntVector \ + -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-fw-40.topics.dl20.cos-dpr-distil.jsonl.txt \ + -topicfield vector -encoding fw -fw.q 40 -hits 1000 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +tools/eval/trec_eval.9.0.4/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-fw-40.topics.dl20.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-fw-40.topics.dl20.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-fw-40.topics.dl20.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-fw-40.topics.dl20.cos-dpr-distil.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distill**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.4597 | +| **nDCG@10** | **cosDPR-distill**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.6666 | +| **R@100** | **cosDPR-distill**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.6909 | +| **R@1000** | **cosDPR-distill**| +| [DL20 
(Passage)](https://trec.nist.gov/data/deep2020.html) | 0.8194 | + +Note that due to the non-deterministic nature of multi-threaded indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl20-passage-cos-dpr-distil-fw.yaml). + +Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-fw.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl20-passage-cos-dpr-distil-onnx.md b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-onnx.md similarity index 88% rename from docs/regressions/regressions-dl20-passage-cos-dpr-distil-onnx.md rename to docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-onnx.md index 9e54349093..3230dd9f27 100644 --- a/docs/regressions/regressions-dl20-passage-cos-dpr-distil-onnx.md +++ b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw-onnx.md @@ -11,13 +11,13 @@ In these experiments, we are using pre-encoded queries (i.e., cached results of Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast).
For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage-cos-dpr-distil-onnx.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-onnx +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-onnx ``` We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. 
@@ -25,7 +25,7 @@ We make available a version of the MS MARCO Passage Corpus that has already been From any machine, the following command will download the corpus and perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage-cos-dpr-distil-onnx +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-onnx ``` The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. @@ -43,7 +43,7 @@ To confirm, `msmarco-passage-cos-dpr-distil.tar` is 57 GB and has MD5 checksum ` With the corpus downloaded, the following command will perform the remaining steps below: ```bash -python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-onnx \ +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-onnx \ --corpus-path collections/msmarco-passage-cos-dpr-distil ``` @@ -57,7 +57,7 @@ target/appassembler/bin/IndexHnswDenseVectors \ -input /path/to/msmarco-passage-cos-dpr-distil \ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ \ -generator LuceneDenseVectorDocumentGenerator \ - -threads 16 -M 16 -efC 100 -encoder CosDprDistil \ + -threads 16 -M 16 -efC 100 \ >& logs/log.msmarco-passage-cos-dpr-distil & ``` @@ -78,17 +78,17 @@ target/appassembler/bin/SearchHnswDenseVectors \ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ \ -topics tools/topics-and-qrels/topics.dl20.txt \ -topicreader TsvInt \ - -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil.topics.dl20.txt \ + -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.txt \ -querygenerator VectorQueryGenerator -topicfield title -threads 16 -hits 1000 -efSearch 1000 
-encoder CosDprDistil & ``` Evaluation can be performed using `trec_eval`: ```bash -tools/eval/trec_eval.9.0.4/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil.topics.dl20.txt -tools/eval/trec_eval.9.0.4/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil.topics.dl20.txt -tools/eval/trec_eval.9.0.4/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil.topics.dl20.txt -tools/eval/trec_eval.9.0.4/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil.topics.dl20.txt +tools/eval/trec_eval.9.0.4/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.txt +tools/eval/trec_eval.9.0.4/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.txt ``` ## Effectiveness @@ -106,7 +106,7 @@ With the above commands, you should be able to reproduce the following results: | [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.843 | Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. 
-Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl20-passage-cos-dpr-distil-onnx.yaml). +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-onnx.yaml). Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). @@ -114,4 +114,4 @@ The experimental results reported here are directly comparable to the results re ## Reproduction Log[*](reproducibility.md) -To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-onnx.template) and run `bin/build.sh` to rebuild the documentation. +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-onnx.template) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/docs/regressions/regressions-dl20-passage-cos-dpr-distil.md b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw.md similarity index 88% rename from docs/regressions/regressions-dl20-passage-cos-dpr-distil.md rename to docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw.md index 7abb0e4d63..64196afc58 100644 --- a/docs/regressions/regressions-dl20-passage-cos-dpr-distil.md +++ b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-hnsw.md @@ -11,13 +11,13 @@ In these experiments, we are using pre-encoded queries (i.e., cached results of Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage-cos-dpr-distil.yaml). -Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw ``` We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. @@ -25,7 +25,7 @@ We make available a version of the MS MARCO Passage Corpus that has already been From any machine, the following command will download the corpus and perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage-cos-dpr-distil +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw ``` The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
@@ -43,7 +43,7 @@ To confirm, `msmarco-passage-cos-dpr-distil.tar` is 57 GB and has MD5 checksum ` With the corpus downloaded, the following command will perform the remaining steps below: ```bash -python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil \ +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-hnsw \ --corpus-path collections/msmarco-passage-cos-dpr-distil ``` @@ -78,17 +78,17 @@ target/appassembler/bin/SearchHnswDenseVectors \ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ \ -topics tools/topics-and-qrels/topics.dl20.cos-dpr-distil.jsonl.gz \ -topicreader JsonIntVector \ - -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil.topics.dl20.cos-dpr-distil.jsonl.txt \ + -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.cos-dpr-distil.jsonl.txt \ -querygenerator VectorQueryGenerator -topicfield vector -threads 16 -hits 1000 -efSearch 1000 & ``` Evaluation can be performed using `trec_eval`: ```bash -tools/eval/trec_eval.9.0.4/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil.topics.dl20.cos-dpr-distil.jsonl.txt -tools/eval/trec_eval.9.0.4/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil.topics.dl20.cos-dpr-distil.jsonl.txt -tools/eval/trec_eval.9.0.4/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil.topics.dl20.cos-dpr-distil.jsonl.txt -tools/eval/trec_eval.9.0.4/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil.topics.dl20.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt 
runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.dl20.cos-dpr-distil.jsonl.txt ``` ## Effectiveness @@ -106,7 +106,7 @@ With the above commands, you should be able to reproduce the following results: | [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.843 | Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl20-passage-cos-dpr-distil.yaml). +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw.yaml). Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). 
@@ -114,4 +114,4 @@ The experimental results reported here are directly comparable to the results re ## Reproduction Log[*](reproducibility.md) -To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil.template) and run `bin/build.sh` to rebuild the documentation. +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-dl20-passage-cos-dpr-distil-lexlsh.md b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-lexlsh.md new file mode 100644 index 0000000000..e2777b4798 --- /dev/null +++ b/docs/regressions/regressions-dl20-passage-cos-dpr-distil-lexlsh.md @@ -0,0 +1,114 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with inverted indexes using the "LexLSH" technique (b=600); pre-encoded queries + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2020.html). +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/dl20-passage-cos-dpr-distil-lexlsh.yaml).
+Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-lexlsh.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-lexlsh +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression dl20-passage-cos-dpr-distil-lexlsh +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.tar` is 57 GB and has MD5 checksum `e20ffbc8b5e7f760af31298aefeaebbd`. 
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression dl20-passage-cos-dpr-distil-lexlsh \ + --corpus-path collections/msmarco-passage-cos-dpr-distil +``` + +## Indexing + +Sample indexing command, applying inverted indexes to dense vectors using the "LexLSH" technique: + +```bash +target/appassembler/bin/IndexInvertedDenseVectors \ + -collection JsonDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil \ + -index indexes/lucene-index.msmarco-passage-cos-dpr-distil.lexlsh-600/ \ + -generator InvertedDenseVectorDocumentGenerator \ + -threads 16 -encoding lexlsh -lexlsh.b 600 \ + >& logs/log.msmarco-passage-cos-dpr-distil & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above. + +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). 
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +target/appassembler/bin/SearchInvertedDenseVectors \ + -index indexes/lucene-index.msmarco-passage-cos-dpr-distil.lexlsh-600/ \ + -topics tools/topics-and-qrels/topics.dl20.cos-dpr-distil.jsonl.gz \ + -topicreader JsonIntVector \ + -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-lexlsh-600.topics.dl20.cos-dpr-distil.jsonl.txt \ + -topicfield vector -encoding lexlsh -lexlsh.b 600 -hits 1000 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +tools/eval/trec_eval.9.0.4/trec_eval -m map -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-lexlsh-600.topics.dl20.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m ndcg_cut.10 -c tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-lexlsh-600.topics.dl20.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.100 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-lexlsh-600.topics.dl20.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -m recall.1000 -c -l 2 tools/topics-and-qrels/qrels.dl20-passage.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-lexlsh-600.topics.dl20.cos-dpr-distil.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distil**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.4486 | +| **nDCG@10** | **cosDPR-distil**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.6569 | +| **R@100** | **cosDPR-distil**| +| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.6662 | +| **R@1000** | **cosDPR-distil**|
+| [DL20 (Passage)](https://trec.nist.gov/data/deep2020.html) | 0.8131 | + +Note that results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/dl20-passage-cos-dpr-distil-lexlsh.yaml). + +Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-lexlsh.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-fw.md b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-fw.md new file mode 100644 index 0000000000..91a3bece3f --- /dev/null +++ b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-fw.md @@ -0,0 +1,108 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: cosDPR-distil with inverted indexes using the "fake-words" technique (q=40); pre-encoded queries + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking). +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding).
+ +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-passage-cos-dpr-distil-fw.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-fw.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-fw +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-passage-cos-dpr-distil-fw +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.tar` is 57 GB and has MD5 checksum `e20ffbc8b5e7f760af31298aefeaebbd`. 
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-fw \ + --corpus-path collections/msmarco-passage-cos-dpr-distil +``` + +## Indexing + +Sample indexing command, applying inverted indexes to dense vectors using the "fake-words" technique: + +```bash +target/appassembler/bin/IndexInvertedDenseVectors \ + -collection JsonDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil \ + -index indexes/lucene-index.msmarco-passage-cos-dpr-distil.fw-40/ \ + -generator InvertedDenseVectorDocumentGenerator \ + -threads 16 -encoding fw -fw.q 40 \ + >& logs/log.msmarco-passage-cos-dpr-distil & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above. + +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details. 
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +target/appassembler/bin/SearchInvertedDenseVectors \ + -index indexes/lucene-index.msmarco-passage-cos-dpr-distil.fw-40/ \ + -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.gz \ + -topicreader JsonIntVector \ + -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-fw-40.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt \ + -topicfield vector -encoding fw -fw.q 40 -hits 1000 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +tools/eval/trec_eval.9.0.4/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-fw-40.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-fw-40.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-fw-40.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-fw-40.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distil**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.3654 | +| **RR@10** | **cosDPR-distil**| +| [MS MARCO Passage:
Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.3605 | +| **R@100** | **cosDPR-distil**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.8711 | +| **R@1000** | **cosDPR-distil**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.9668 | + +Note that results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/msmarco-passage-cos-dpr-distil-fw.yaml). + +## Reproduction Log[*](../../docs/reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-fw.template) and run `bin/build.sh` to rebuild the documentation. + ++ Results reproduced by [@yilinjz](https://github.com/yilinjz) on 2023-09-01 (commit [`4ae518b`](https://github.com/castorini/anserini/commit/4ae518bb284ebcba0b273a473bc8774735cb7d19)) \ No newline at end of file diff --git a/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-onnx.md b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-onnx.md similarity index 87% rename from docs/regressions/regressions-msmarco-passage-cos-dpr-distil-onnx.md rename to docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-onnx.md index 9ef24c1c89..bbcefaea81 100644 --- a/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-onnx.md +++ b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw-onnx.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-passage-cos-dpr-distil-onnx.yaml).
-Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-onnx.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-onnx.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-onnx +python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-onnx ``` We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. @@ -22,7 +22,7 @@ We make available a version of the MS MARCO Passage Corpus that has already been From any machine, the following command will download the corpus and perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-passage-cos-dpr-distil-onnx +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-onnx ``` The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
@@ -40,7 +40,7 @@ To confirm, `msmarco-passage-cos-dpr-distil.tar` is 57 GB and has MD5 checksum ` With the corpus downloaded, the following command will perform the remaining steps below: ```bash -python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-onnx \ +python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-onnx \ --corpus-path collections/msmarco-passage-cos-dpr-distil ``` @@ -74,36 +74,36 @@ target/appassembler/bin/SearchHnswDenseVectors \ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ \ -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.txt \ -topicreader TsvInt \ - -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-onnx.topics.msmarco-passage.dev-subset.txt \ + -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.txt \ -querygenerator VectorQueryGenerator -topicfield title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil & ``` Evaluation can be performed using `trec_eval`: ```bash -tools/eval/trec_eval.9.0.4/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-onnx.topics.msmarco-passage.dev-subset.txt -tools/eval/trec_eval.9.0.4/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-onnx.topics.msmarco-passage.dev-subset.txt -tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-onnx.topics.msmarco-passage.dev-subset.txt -tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-onnx.topics.msmarco-passage.dev-subset.txt 
+tools/eval/trec_eval.9.0.4/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.txt ``` ## Effectiveness With the above commands, you should be able to reproduce the following results: -| **AP@1000** | **cosDPR-distil (ONNX)**| +| **AP@1000** | **cosDPR-distil**| |:-------------------------------------------------------------------------------------------------------------|-----------| | [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.393 | -| **RR@10** | **cosDPR-distil (ONNX)**| +| **RR@10** | **cosDPR-distil**| | [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.388 | -| **R@100** | **cosDPR-distil (ONNX)**| +| **R@100** | **cosDPR-distil**| | [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.903 | -| **R@1000** | **cosDPR-distil (ONNX)**| +| **R@1000** | **cosDPR-distil**| | [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.974 | Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. 
-Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/msmarco-passage-cos-dpr-distil-onnx.yaml). +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-onnx.yaml). ## Reproduction Log[*](../../docs/reproducibility.md) -To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-onnx.template) and run `bin/build.sh` to rebuild the documentation. +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-onnx.template) and run `bin/build.sh` to rebuild the documentation. diff --git a/docs/regressions/regressions-msmarco-passage-cos-dpr-distil.md b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw.md similarity index 86% rename from docs/regressions/regressions-msmarco-passage-cos-dpr-distil.md rename to docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw.md index 2cfba698f9..6422f440d7 100644 --- a/docs/regressions/regressions-msmarco-passage-cos-dpr-distil.md +++ b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-hnsw.md @@ -8,13 +8,13 @@ This page describes regression experiments, integrated into Anserini's regressio In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). -The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-passage-cos-dpr-distil.yaml). 
-Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil +python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw ``` We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. @@ -22,7 +22,7 @@ We make available a version of the MS MARCO Passage Corpus that has already been From any machine, the following command will download the corpus and perform the complete regression, end to end: ```bash -python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-passage-cos-dpr-distil +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw ``` The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
@@ -40,7 +40,7 @@ To confirm, `msmarco-passage-cos-dpr-distil.tar` is 57 GB and has MD5 checksum ` With the corpus downloaded, the following command will perform the remaining steps below: ```bash -python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil \ +python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw \ --corpus-path collections/msmarco-passage-cos-dpr-distil ``` @@ -74,17 +74,17 @@ target/appassembler/bin/SearchHnswDenseVectors \ -index indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ \ -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.gz \ -topicreader JsonIntVector \ - -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt \ + -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt \ -querygenerator VectorQueryGenerator -topicfield vector -threads 16 -hits 1000 -efSearch 1000 & ``` Evaluation can be performed using `trec_eval`: ```bash -tools/eval/trec_eval.9.0.4/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt -tools/eval/trec_eval.9.0.4/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt -tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt -tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt 
runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-hnsw.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt ``` ## Effectiveness @@ -102,10 +102,10 @@ With the above commands, you should be able to reproduce the following results: | [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.974 | Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. -Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/msmarco-passage-cos-dpr-distil.yaml). +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw.yaml). 
## Reproduction Log[*](../../docs/reproducibility.md) -To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil.template) and run `bin/build.sh` to rebuild the documentation. +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw.template) and run `bin/build.sh` to rebuild the documentation. + Results reproduced by [@yilinjz](https://github.com/yilinjz) on 2023-09-01 (commit [`4ae518b`](https://github.com/castorini/anserini/commit/4ae518bb284ebcba0b273a473bc8774735cb7d19)) \ No newline at end of file diff --git a/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-lexlsh.md b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-lexlsh.md new file mode 100644 index 0000000000..b49b71feee --- /dev/null +++ b/docs/regressions/regressions-msmarco-passage-cos-dpr-distil-lexlsh.md @@ -0,0 +1,108 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: cosDPR-distil with inverted indexes using the "LexLSH" technique (b=600); pre-encoded queries + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking). +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](../../src/main/resources/regression/msmarco-passage-cos-dpr-distil-lexlsh.yaml). +Note that this page is automatically generated from [this template](../../src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-lexlsh.template) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. 
+ +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-lexlsh +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression msmarco-passage-cos-dpr-distil-lexlsh +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar -P collections/ +tar xvf collections/msmarco-passage-cos-dpr-distil.tar -C collections/ +``` + +To confirm, `msmarco-passage-cos-dpr-distil.tar` is 57 GB and has MD5 checksum `e20ffbc8b5e7f760af31298aefeaebbd`. 
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-lexlsh \ + --corpus-path collections/msmarco-passage-cos-dpr-distil +``` + +## Indexing + +Sample indexing command, applying inverted indexes to dense vectors using the "LexLSH" technique: + +```bash +target/appassembler/bin/IndexInvertedDenseVectors \ + -collection JsonDenseVectorCollection \ + -input /path/to/msmarco-passage-cos-dpr-distil \ + -index indexes/lucene-index.msmarco-passage-cos-dpr-distil.lexlsh-600/ \ + -generator InvertedDenseVectorDocumentGenerator \ + -threads 16 -encoding lexlsh -lexlsh.b 600 \ + >& logs/log.msmarco-passage-cos-dpr-distil & +``` + +The path `/path/to/msmarco-passage-cos-dpr-distil/` should point to the corpus downloaded above. + +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](../../docs/experiments-msmarco-passage.md) for more details. 
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +target/appassembler/bin/SearchInvertedDenseVectors \ + -index indexes/lucene-index.msmarco-passage-cos-dpr-distil.lexlsh-600/ \ + -topics tools/topics-and-qrels/topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.gz \ + -topicreader JsonIntVector \ + -output runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-lexlsh-600.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt \ + -topicfield vector -encoding lexlsh -lexlsh.b 600 -hits 1000 & +``` + +Evaluation can be performed using `trec_eval`: + +```bash +tools/eval/trec_eval.9.0.4/trec_eval -c -m map tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-lexlsh-600.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -M 10 -m recip_rank tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-lexlsh-600.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.100 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-lexlsh-600.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt +tools/eval/trec_eval.9.0.4/trec_eval -c -m recall.1000 tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt runs/run.msmarco-passage-cos-dpr-distil.cos-dpr-distil-lexlsh-600.topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.txt +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +| **AP@1000** | **cosDPR-distil**| +|:-------------------------------------------------------------------------------------------------------------|-----------| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.3509 | +| **RR@10** | **cosDPR-distil**| +|
[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.3457 | +| **R@100** | **cosDPR-distil**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.8615 | +| **R@1000** | **cosDPR-distil**| +| [MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking) | 0.9596 | + +Scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](../../src/main/resources/regression/msmarco-passage-cos-dpr-distil-lexlsh.yaml). + +## Reproduction Log[*](../../docs/reproducibility.md) + +To add to this reproduction log, modify [this template](../../src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-lexlsh.template) and run `bin/build.sh` to rebuild the documentation. + ++ Results reproduced by [@yilinjz](https://github.com/yilinjz) on 2023-09-01 (commit [`4ae518b`](https://github.com/castorini/anserini/commit/4ae518bb284ebcba0b273a473bc8774735cb7d19)) \ No newline at end of file diff --git a/pom.xml b/pom.xml index 9972fdee53..bb986658bb 100644 --- a/pom.xml +++ b/pom.xml @@ -118,6 +118,10 @@ io.anserini.index.IndexHnswDenseVectors IndexHnswDenseVectors + + io.anserini.index.IndexInvertedDenseVectors + IndexInvertedDenseVectors + io.anserini.search.SearchCollection SearchCollection @@ -126,6 +130,10 @@ io.anserini.search.SearchHnswDenseVectors SearchHnswDenseVectors + + io.anserini.search.SearchInvertedDenseVectors + SearchInvertedDenseVectors + io.anserini.index.IndexReaderUtils IndexReaderUtils @@ -150,18 +158,6 @@ io.anserini.util.ExtractTopDfTerms ExtractTopDfTerms - - io.anserini.index.IndexInvertedDenseVectors - IndexVectors - - - io.anserini.search.SearchInvertedDenseVectors - ApproximateNearestNeighborSearch - - - io.anserini.search.EvaluateInvertedDenseVectors - ApproximateNearestNeighborEval - diff --git
a/src/main/java/io/anserini/index/Counters.java b/src/main/java/io/anserini/index/Counters.java new file mode 100644 index 0000000000..17e2b59f9f --- /dev/null +++ b/src/main/java/io/anserini/index/Counters.java @@ -0,0 +1,51 @@ +/* + * Anserini: A Lucene toolkit for reproducible information retrieval research + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package io.anserini.index; + +import io.anserini.collection.SourceDocument; + +import java.util.concurrent.atomic.AtomicLong; + +public final class Counters { + /** + * Counter for successfully indexed documents. + */ + public AtomicLong indexed = new AtomicLong(); + + /** + * Counter for empty documents that are not indexed. Empty documents are not necessary errors; + * it could be the case, for example, that a document is comprised solely of stopwords. + */ + public AtomicLong empty = new AtomicLong(); + + /** + * Counter for unindexable documents. These are cases where {@link SourceDocument#indexable()} + * returns false. + */ + public AtomicLong unindexable = new AtomicLong(); + + /** + * Counter for skipped documents. These are cases documents are skipped as part of normal + * processing logic, e.g., using a whitelist, not indexing retweets or deleted tweets. + */ + public AtomicLong skipped = new AtomicLong(); + + /** + * Counter for unexpected errors. 
+ */ + public AtomicLong errors = new AtomicLong(); +} \ No newline at end of file diff --git a/src/main/java/io/anserini/index/IndexCollection.java b/src/main/java/io/anserini/index/IndexCollection.java index 26e9cb263f..e2d6baa996 100644 --- a/src/main/java/io/anserini/index/IndexCollection.java +++ b/src/main/java/io/anserini/index/IndexCollection.java @@ -223,36 +223,6 @@ public static class Args { public int shardCurrent = -1; } - public final class Counters { - /** - * Counter for successfully indexed documents. - */ - public AtomicLong indexed = new AtomicLong(); - - /** - * Counter for empty documents that are not indexed. Empty documents are not necessary errors; - * it could be the case, for example, that a document is comprised solely of stopwords. - */ - public AtomicLong empty = new AtomicLong(); - - /** - * Counter for unindexable documents. These are cases where {@link SourceDocument#indexable()} - * returns false. - */ - public AtomicLong unindexable = new AtomicLong(); - - /** - * Counter for skipped documents. These are cases documents are skipped as part of normal - * processing logic, e.g., using a whitelist, not indexing retweets or deleted tweets. - */ - public AtomicLong skipped = new AtomicLong(); - - /** - * Counter for unexpected errors. 
- */ - public AtomicLong errors = new AtomicLong(); - } - private final class LocalIndexerThread extends Thread { final private Path inputFile; final private IndexWriter writer; diff --git a/src/main/java/io/anserini/index/IndexHnswDenseVectors.java b/src/main/java/io/anserini/index/IndexHnswDenseVectors.java index 38d7525dc3..aac83e60cc 100644 --- a/src/main/java/io/anserini/index/IndexHnswDenseVectors.java +++ b/src/main/java/io/anserini/index/IndexHnswDenseVectors.java @@ -64,6 +64,7 @@ import java.util.concurrent.atomic.AtomicLong; public final class IndexHnswDenseVectors { + private static final Logger LOG = LogManager.getLogger(IndexHnswDenseVectors.class); public static final class Args { @@ -171,39 +172,6 @@ public static final class Args { } - private static final Logger LOG = LogManager.getLogger(IndexHnswDenseVectors.class); - - // This is the default analyzer used, unless another stemming algorithm or language is specified. - public final class Counters { - /** - * Counter for successfully indexed documents. - */ - public AtomicLong indexed = new AtomicLong(); - - /** - * Counter for empty documents that are not indexed. Empty documents are not necessary errors; - * it could be the case, for example, that a document is comprised solely of stopwords. - */ - public AtomicLong empty = new AtomicLong(); - - /** - * Counter for unindexable documents. These are cases where {@link SourceDocument#indexable()} - * returns false. - */ - public AtomicLong unindexable = new AtomicLong(); - - /** - * Counter for skipped documents. These are cases documents are skipped as part of normal - * processing logic, e.g., using a whitelist, not indexing retweets or deleted tweets. - */ - public AtomicLong skipped = new AtomicLong(); - - /** - * Counter for unexpected errors. 
- */ - public AtomicLong errors = new AtomicLong(); - } - private final class LocalIndexerThread extends Thread { final private Path inputFile; final private IndexWriter writer; diff --git a/src/main/java/io/anserini/index/IndexInvertedDenseVectors.java b/src/main/java/io/anserini/index/IndexInvertedDenseVectors.java index b2acf4f019..4e580be848 100644 --- a/src/main/java/io/anserini/index/IndexInvertedDenseVectors.java +++ b/src/main/java/io/anserini/index/IndexInvertedDenseVectors.java @@ -16,23 +16,6 @@ package io.anserini.index; -import java.io.File; -import java.io.FileReader; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.HashMap; -import java.util.HashSet; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.concurrent.Executors; -import java.util.concurrent.ThreadPoolExecutor; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicLong; - import io.anserini.analysis.fw.FakeWordsEncoderAnalyzer; import io.anserini.analysis.lexlsh.LexicalLshAnalyzer; import io.anserini.collection.DocumentCollection; @@ -42,7 +25,6 @@ import io.anserini.index.generator.InvalidDocumentException; import io.anserini.index.generator.LuceneDocumentGenerator; import io.anserini.index.generator.SkippedDocumentException; -import org.apache.commons.io.FileUtils; import org.apache.commons.lang3.time.DurationFormatUtils; import org.apache.logging.log4j.Level; import org.apache.logging.log4j.LogManager; @@ -51,15 +33,10 @@ import org.apache.lucene.analysis.Analyzer; import org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper; import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.backward_codecs.lucene94.Lucene94Codec; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.Field; -import org.apache.lucene.document.StringField; -import 
org.apache.lucene.document.TextField; +import org.apache.lucene.codecs.lucene95.Lucene95Codec; import org.apache.lucene.index.ConcurrentMergeScheduler; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.index.Term; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.kohsuke.args4j.CmdLineException; @@ -67,10 +44,22 @@ import org.kohsuke.args4j.Option; import org.kohsuke.args4j.OptionHandlerFilter; import org.kohsuke.args4j.ParserProperties; -import org.kohsuke.args4j.spi.StringArrayOptionHandler; + +import java.io.File; +import java.io.IOException; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.Executors; +import java.util.concurrent.ThreadPoolExecutor; +import java.util.concurrent.TimeUnit; public final class IndexInvertedDenseVectors { + private static final Logger LOG = LogManager.getLogger(IndexInvertedDenseVectors.class); public static final String FIELD_ID = "id"; public static final String FIELD_VECTOR = "vector"; @@ -78,162 +67,62 @@ public final class IndexInvertedDenseVectors { public static final String FW = "fw"; public static final String LEXLSH = "lexlsh"; - public static final class Args { + @Option(name = "-collection", metaVar = "[class]", required = true, usage = "Collection class in io.anserini.collection.") + public String collectionClass; - // This is the name of the field in the Lucene document where the docid is stored. - public static final String ID = "id"; - - // This is the name of the field in the Lucene document that should be searched by default. - public static final String CONTENTS = "contents"; - - // This is the name of the field in the Lucene document where the raw document is stored. 
- public static final String RAW = "raw"; - - @Option(name = "-input", metaVar = "[path]", required = true, - usage = "Location of input collection.") + @Option(name = "-input", metaVar = "[path]", required = true, usage = "Input collection.") public String input; - @Option(name = "-threads", metaVar = "[num]", - usage = "Number of indexing threads.") - public int threads = 1; - - @Option(name = "-collection", metaVar = "[class]", - usage = "Collection class in package 'io.anserini.collection'.") - public String collectionClass; - - @Option(name = "-generator", metaVar = "[class]", - usage = "Document generator class in package 'io.anserini.index.generator'.") + @Option(name = "-generator", metaVar = "[class]", usage = "Document generator class in io.anserini.index.generator.") public String generatorClass = "InvertedDenseVectorDocumentGenerator"; - // optional general arguments - - @Option(name = "-verbose", forbids = {"-quiet"}, - usage = "Enables verbose logging for each indexing thread; can be noisy if collection has many small file segments.") - public boolean verbose = false; - - @Option(name = "-quiet", forbids = {"-verbose"}, - usage = "Turns off all logging.") - public boolean quiet = false; - - // optional arguments - - @Option(name = "-index", metaVar = "[path]", usage = "Index path.", required = true) + @Option(name = "-index", metaVar = "[path]", required = true, usage = "Index path.") public String index; - @Option(name = "-fields", handler = StringArrayOptionHandler.class, - usage = "List of fields to index (space separated), in addition to the default 'contents' field.") - public String[] fields = new String[]{}; - - @Option(name = "-storePositions", - usage = "Boolean switch to index store term positions; needed for phrase queries.") - public boolean storePositions = false; - - @Option(name = "-storeDocvectors", - usage = "Boolean switch to store document vectors; needed for (pseudo) relevance feedback.") - public boolean storeDocvectors = false; - 
- @Option(name = "-storeContents", - usage = "Boolean switch to store document contents.") - public boolean storeContents = false; - - @Option(name = "-storeRaw", - usage = "Boolean switch to store raw source documents.") - public boolean storeRaw = false; - - @Option(name = "-optimize", - usage = "Boolean switch to optimize index (i.e., force merge) into a single segment; costly for large collections.") - public boolean optimize = false; - - @Option(name = "-uniqueDocid", - usage = "Removes duplicate documents with the same docid during indexing. This significantly slows indexing throughput " + - "but may be needed for tweet collections since the streaming API might deliver a tweet multiple times.") - public boolean uniqueDocid = false; - - @Option(name = "-memorybuffer", metaVar = "[mb]", - usage = "Memory buffer size (in MB).") - public int memorybufferSize = 2048; - - @Option(name = "-whitelist", metaVar = "[file]", - usage = "File containing list of docids, one per line; only these docids will be indexed.") - public String whitelist = null; - - @Option(name = "-encoding", metaVar = "[word]", required = true, usage = "encoding must be one of {fw, lexlsh}") + @Option(name = "-encoding", metaVar = "[word]", usage = "Encoding method: {'fw', 'lexlsh'}.") public String encoding = FW; - @Option(name = "-stored", metaVar = "[boolean]", usage = "store vectors") - public boolean stored = false; + @Option(name = "-fw.q", metaVar = "[int]", usage = "Fake Words encoding: quantization factor.") + public int q = FakeWordsEncoderAnalyzer.DEFAULT_Q; - @Option(name = "-lexlsh.n", metaVar = "[int]", usage = "ngrams") + @Option(name = "-lexlsh.n", metaVar = "[int]", usage = "LexLSH encoding: n-gram size.") public int ngrams = 2; - @Option(name = "-lexlsh.d", metaVar = "[int]", usage = "decimals") + @Option(name = "-lexlsh.d", metaVar = "[int]", usage = "LexLSH encoding: decimal digits.") public int decimals = 1; - @Option(name = "-lexlsh.hsize", metaVar = "[int]", usage = "hash 
set size") + @Option(name = "-lexlsh.hsize", metaVar = "[int]", usage = "LexLSH encoding: hash set size.") public int hashSetSize = 1; - @Option(name = "-lexlsh.h", metaVar = "[int]", usage = "hash count") + @Option(name = "-lexlsh.h", metaVar = "[int]", usage = "LexLSH encoding: hash count.") public int hashCount = 1; - @Option(name = "-lexlsh.b", metaVar = "[int]", usage = "bucket count") + @Option(name = "-lexlsh.b", metaVar = "[int]", usage = "LexLSH encoding: bucket count.") public int bucketCount = 300; - @Option(name = "-fw.q", metaVar = "[int]", usage = "quantization factor") - public int q = FakeWordsEncoderAnalyzer.DEFAULT_Q; - // Sharding options + @Option(name = "-memorybuffer", metaVar = "[mb]", usage = "Memory buffer size in MB.") + public int memorybufferSize = 4096; - @Option(name = "-shard.count", metaVar = "[n]", - usage = "Number of shards to partition the document collection into.") - public int shardCount = -1; + @Option(name = "-threads", metaVar = "[num]", usage = "Number of indexing threads.") + public int threads = 4; - @Option(name = "-shard.current", metaVar = "[n]", - usage = "The current shard number to generate (indexed from 0).") - public int shardCurrent = -1; - } + @Option(name = "-verbose", forbids = {"-quiet"}, usage = "Enables verbose logging for each indexing thread.") + public boolean verbose = false; - private static final Logger LOG = LogManager.getLogger(IndexInvertedDenseVectors.class); + @Option(name = "-quiet", forbids = {"-verbose"}, usage = "Turns off all logging.") + public boolean quiet = false; - // This is the default analyzer used, unless another stemming algorithm or language is specified. - public final class Counters { - - /** - * Counter for successfully indexed documents. - */ - public AtomicLong indexed = new AtomicLong(); - - /** - * Counter for empty documents that are not indexed. 
Empty documents are not necessary errors; - * it could be the case, for example, that a document is comprised solely of stopwords. - */ - public AtomicLong empty = new AtomicLong(); - - /** - * Counter for unindexable documents. These are cases where {@link SourceDocument#indexable()} - * returns false. - */ - public AtomicLong unindexable = new AtomicLong(); - - /** - * Counter for skipped documents. These are cases documents are skipped as part of normal - * processing logic, e.g., using a whitelist, not indexing retweets or deleted tweets. - */ - public AtomicLong skipped = new AtomicLong(); - - /** - * Counter for unexpected errors. - */ - public AtomicLong errors = new AtomicLong(); + @Option(name = "-optimize", usage = "Optimizes index by merging into a single index segment.") + public boolean optimize = false; } private final class LocalIndexerThread extends Thread { - final private Path inputFile; final private IndexWriter writer; - final private DocumentCollection collection; - private FileSegment fileSegment; + final private DocumentCollection collection; - private LocalIndexerThread(IndexWriter writer, DocumentCollection collection, Path inputFile) { + private LocalIndexerThread(IndexWriter writer, DocumentCollection collection, Path inputFile) { this.writer = writer; this.collection = collection; this.inputFile = inputFile; @@ -241,10 +130,12 @@ private LocalIndexerThread(IndexWriter writer, DocumentCollection collection, Pa } @Override - @SuppressWarnings("unchecked") public void run() { + FileSegment segment = null; + try { - LuceneDocumentGenerator generator = (LuceneDocumentGenerator) + @SuppressWarnings("unchecked") + LuceneDocumentGenerator generator = (LuceneDocumentGenerator) generatorClass.getDeclaredConstructor(Args.class).newInstance(args); // We keep track of two separate counts: the total count of documents in this file segment (cnt), @@ -254,9 +145,7 @@ public void run() { int cnt = 0; int batch = 0; - FileSegment segment = 
collection.createFileSegment(inputFile); - // in order to call close() and clean up resources in case of exception - this.fileSegment = segment; + segment = collection.createFileSegment(inputFile); for (SourceDocument d : segment) { if (!d.indexable()) { @@ -264,9 +153,11 @@ public void run() { continue; } - Document doc; try { - doc = generator.createDocument(d); + writer.addDocument(generator.createDocument(d)); + + cnt++; + batch++; } catch (EmptyDocumentException e1) { counters.empty.incrementAndGet(); continue; @@ -278,20 +169,7 @@ public void run() { continue; } - if (whitelistDocids != null && !whitelistDocids.contains(d.id())) { - counters.skipped.incrementAndGet(); - continue; - } - - if (args.uniqueDocid) { - writer.updateDocument(new Term("id", d.id()), doc); - } else { - writer.addDocument(doc); - } - cnt++; - batch++; - - // And the counts from this batch, reset batch counter. + // Add the counts from this batch, reset batch counter. if (batch % 10000 == 0) { counters.indexed.addAndGet(batch); batch = 0; @@ -303,39 +181,34 @@ public void run() { int skipped = segment.getSkippedCount(); if (skipped > 0) { - // When indexing tweets, this is normal, because there are delete messages that are skipped over. counters.skipped.addAndGet(skipped); LOG.warn(inputFile.getParent().getFileName().toString() + File.separator + - inputFile.getFileName().toString() + ": " + skipped + " docs skipped."); + inputFile.getFileName().toString() + ": " + skipped + " docs skipped."); } if (segment.getErrorStatus()) { counters.errors.incrementAndGet(); LOG.error(inputFile.getParent().getFileName().toString() + File.separator + - inputFile.getFileName().toString() + ": error iterating through segment."); + inputFile.getFileName().toString() + ": error iterating through segment."); } // Log at the debug level because this can be quite noisy if there are lots of file segments. 
LOG.debug(inputFile.getParent().getFileName().toString() + File.separator + - inputFile.getFileName().toString() + ": " + cnt + " docs added."); + inputFile.getFileName().toString() + ": " + cnt + " docs added."); } catch (Exception e) { LOG.error(Thread.currentThread().getName() + ": Unexpected Exception:", e); } finally { - if (fileSegment != null) { - fileSegment.close(); - } + segment.close(); } } } private final Args args; private final Path collectionPath; - private final Set whitelistDocids; - private Class collectionClass; - private final Class generatorClass; - private DocumentCollection collection; + private final Class> generatorClass; + private final DocumentCollection collection; private final Counters counters; - private Path indexPath; + private final Path indexPath; @SuppressWarnings("unchecked") public IndexInvertedDenseVectors(Args args) throws Exception { @@ -356,237 +229,138 @@ public IndexInvertedDenseVectors(Args args) throws Exception { LOG.info("Starting indexer..."); LOG.info("============ Loading Parameters ============"); - LOG.info("DocumentCollection path: " + args.input); - LOG.info("CollectionClass: " + args.collectionClass); + LOG.info("Collection class: " + args.collectionClass); + LOG.info("Collection path: " + args.input); LOG.info("Generator: " + args.generatorClass); - LOG.info("Threads: " + args.threads); - LOG.info("Store document \"contents\" field? " + args.storeContents); - LOG.info("Store document \"raw\" field? " + args.storeRaw); - LOG.info("Optimize (merge segments)? " + args.optimize); - LOG.info("Whitelist: " + args.whitelist); LOG.info("Index path: " + args.index); + LOG.info("Encoding: " + args.encoding); + LOG.info("Threads: " + args.threads); + LOG.info("Optimize? 
" + args.optimize); - if (args.index != null) { - this.indexPath = Paths.get(args.index); - if (!Files.exists(this.indexPath)) { - Files.createDirectories(this.indexPath); - } + this.indexPath = Paths.get(args.index); + if (!Files.exists(this.indexPath)) { + Files.createDirectories(this.indexPath); } - // Our documentation uses /path/to/foo as a convention: to make copy and paste of the commands work, we assume - // collections/ as the path location. + // Our documentation uses /path/to/foo as a convention: to make copy and paste of the commands work, + // we assume collections/ as the path location. String pathStr = args.input; if (pathStr.startsWith("/path/to")) { pathStr = pathStr.replace("/path/to", "collections"); } - collectionPath = Paths.get(pathStr); + this.collectionPath = Paths.get(pathStr); if (!Files.exists(collectionPath) || !Files.isReadable(collectionPath)) { - throw new RuntimeException("Document directory " + collectionPath + " does not exist or is not readable, please check the path"); + throw new RuntimeException("Collection path " + collectionPath + " does not exist or is not readable!"); } - if (Files.isDirectory(collectionPath) && args.collectionClass == null) { - throw new RuntimeException("Collection class must be defined, got `null` instead"); - } + Class> collectionClass = (Class>) + Class.forName("io.anserini.collection." + args.collectionClass); + this.collection = collectionClass.getConstructor(Path.class).newInstance(collectionPath); - this.generatorClass = Class.forName("io.anserini.index.generator." + args.generatorClass); - if (args.collectionClass != null) { - this.collectionClass = Class.forName("io.anserini.collection." + args.collectionClass); - // Initialize the collection. 
- collection = (DocumentCollection) this.collectionClass.getConstructor(Path.class).newInstance(collectionPath); - } - - if (args.whitelist != null) { - List lines = FileUtils.readLines(new File(args.whitelist), "utf-8"); - this.whitelistDocids = new HashSet<>(lines); - } else { - this.whitelistDocids = null; - } + this.generatorClass = (Class>) + Class.forName("io.anserini.index.generator." + args.generatorClass); this.counters = new Counters(); } public Counters run() throws IOException { - final long start = System.nanoTime(); - LOG.info("============ Indexing Collection ============"); + final long start = System.nanoTime(); - int numThreads = args.threads; - IndexWriter writer = null; Analyzer vectorAnalyzer; if (args.encoding.equalsIgnoreCase(FW)) { vectorAnalyzer = new FakeWordsEncoderAnalyzer(args.q); } else if (args.encoding.equalsIgnoreCase(LEXLSH)) { - vectorAnalyzer = new LexicalLshAnalyzer(args.decimals, args.ngrams, args.hashCount, - args.bucketCount, args.hashSetSize); + vectorAnalyzer = new LexicalLshAnalyzer(args.decimals, args.ngrams, args.hashCount, args.bucketCount, args.hashSetSize); } else { - vectorAnalyzer = null; - System.err.println("error!"); + throw new RuntimeException("Invalid encoding scheme!"); } + Map map = new HashMap<>(); map.put(FIELD_VECTOR, vectorAnalyzer); Analyzer analyzer = new PerFieldAnalyzerWrapper(new StandardAnalyzer(), map); - // Used for LocalIndexThread - if (indexPath != null) { - final Directory dir = FSDirectory.open(indexPath); - final IndexWriterConfig config = new IndexWriterConfig(analyzer).setCodec(new Lucene94Codec()); - config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); - config.setRAMBufferSizeMB(args.memorybufferSize); - config.setUseCompoundFile(false); - config.setMergeScheduler(new ConcurrentMergeScheduler()); - writer = new IndexWriter(dir, config); - } - - if (Files.isDirectory(collectionPath)) { - final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(numThreads); 
- LOG.info("Thread pool with " + numThreads + " threads initialized."); + final Directory dir = FSDirectory.open(indexPath); + final IndexWriterConfig config = new IndexWriterConfig(analyzer).setCodec(new Lucene95Codec()); + config.setOpenMode(IndexWriterConfig.OpenMode.CREATE); + config.setRAMBufferSizeMB(args.memorybufferSize); + config.setUseCompoundFile(false); + config.setMergeScheduler(new ConcurrentMergeScheduler()); + IndexWriter writer = new IndexWriter(dir, config); - LOG.info("Initializing collection in " + collectionPath); + final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(args.threads); + LOG.info("Thread pool with " + args.threads + " threads initialized."); + LOG.info("Initializing collection in " + collectionPath); - List segmentPaths = collection.getSegmentPaths(); - // when we want sharding to be done - if (args.shardCount > 1) { - segmentPaths = collection.getSegmentPaths(args.shardCount, args.shardCurrent); - } - final int segmentCnt = segmentPaths.size(); + List segmentPaths = collection.getSegmentPaths(); + final int segmentCnt = segmentPaths.size(); - LOG.info(String.format("%,d %s found", segmentCnt, (segmentCnt == 1 ? "file" : "files"))); - LOG.info("Starting to index..."); + LOG.info(String.format("%,d %s found", segmentCnt, (segmentCnt == 1 ? 
"file" : "files"))); + LOG.info("Starting to index..."); - for (Object segmentPath : segmentPaths) { - executor.execute(new LocalIndexerThread(writer, collection, (Path) segmentPath)); - } + segmentPaths.forEach((segmentPath) -> executor.execute(new LocalIndexerThread(writer, collection, segmentPath))); - executor.shutdown(); + executor.shutdown(); - try { - // Wait for existing tasks to terminate - while (!executor.awaitTermination(1, TimeUnit.MINUTES)) { - if (segmentCnt == 1) { - LOG.info(String.format("%,d documents indexed", counters.indexed.get())); - } else { - LOG.info(String.format("%.2f%% of files completed, %,d documents indexed", - (double) executor.getCompletedTaskCount() / segmentCnt * 100.0d, counters.indexed.get())); - } + try { + // Wait for existing tasks to terminate. + while (!executor.awaitTermination(1, TimeUnit.MINUTES)) { + if (segmentCnt == 1) { + LOG.info(String.format("%,d documents indexed", counters.indexed.get())); + } else { + LOG.info(String.format("%.2f%% of files completed, %,d documents indexed", + (double) executor.getCompletedTaskCount() / segmentCnt * 100.0d, counters.indexed.get())); } - } catch (InterruptedException ie) { - // (Re-)Cancel if current thread also interrupted - executor.shutdownNow(); - // Preserve interrupt status - Thread.currentThread().interrupt(); } + } catch (InterruptedException ie) { + // (Re-)Cancel if current thread also interrupted. + executor.shutdownNow(); + // Preserve interrupt status. 
+ Thread.currentThread().interrupt(); + } - if (segmentCnt != executor.getCompletedTaskCount()) { - throw new RuntimeException("totalFiles = " + segmentCnt + - " is not equal to completedTaskCount = " + executor.getCompletedTaskCount()); - } + if (segmentCnt != executor.getCompletedTaskCount()) { + throw new RuntimeException("totalFiles = " + segmentCnt + + " is not equal to completedTaskCount = " + executor.getCompletedTaskCount()); + } - long numIndexed = writer.getDocStats().maxDoc; + long numIndexed = writer.getDocStats().maxDoc; - // Do a final commit - try { - if (writer != null) { - writer.commit(); - if (args.optimize) { - writer.forceMerge(1); - } - } - } finally { - try { - if (writer != null) { - writer.close(); - } - } catch (IOException e) { - // It is possible that this happens... but nothing much we can do at this point, - // so just log the error and move on. - LOG.error(e); - } - } - - if (numIndexed != counters.indexed.get()) { - LOG.warn("Unexpected difference between number of indexed documents and index maxDoc."); + // Do a final commit. + try { + writer.commit(); + if (args.optimize) { + writer.forceMerge(1); } - - LOG.info(String.format("Indexing Complete! 
%,d documents indexed", numIndexed)); - LOG.info("============ Final Counter Values ============"); - LOG.info(String.format("indexed: %,12d", counters.indexed.get())); - LOG.info(String.format("unindexable: %,12d", counters.unindexable.get())); - LOG.info(String.format("empty: %,12d", counters.empty.get())); - LOG.info(String.format("skipped: %,12d", counters.skipped.get())); - LOG.info(String.format("errors: %,12d", counters.errors.get())); - - final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS); - LOG.info(String.format("Total %,d documents indexed in %s", numIndexed, - DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss"))); - } else { - Map> vectors = readGloVe(new File(args.input)); - for (Map.Entry> entry : vectors.entrySet()) { - for (float[] vector: entry.getValue()) { - Document doc = new Document(); - doc.add(new StringField(FIELD_ID, entry.getKey(), Field.Store.YES)); - StringBuilder sb = new StringBuilder(); - for (double fv : vector) { - if (sb.length() > 0) { - sb.append(' '); - } - sb.append(fv); - } - doc.add(new TextField(FIELD_VECTOR, sb.toString(), args.stored ? Field.Store.YES : Field.Store.NO)); - try { - writer.addDocument(doc); - long cur = counters.indexed.incrementAndGet(); - if (cur % 100000 == 0) { - System.out.println(String.format("%s docs added", counters.indexed.get())); - } - } catch (IOException e) { - System.err.println("Error while indexing: " + e.getLocalizedMessage()); - counters.errors.incrementAndGet(); - } - } + } finally { + try { + writer.close(); + } catch (IOException e) { + // It is possible that this happens... but nothing much we can do at this point, + // so just log the error and move on. + LOG.error(e); } + } - writer.commit(); - writer.close(); + if (numIndexed != counters.indexed.get()) { + LOG.warn("Unexpected difference between number of indexed documents and index maxDoc."); + } - LOG.info(String.format("Indexing Complete! 
%,d documents indexed", counters.indexed.get())); + LOG.info(String.format("Indexing Complete! %,d documents indexed", numIndexed)); + LOG.info("============ Final Counter Values ============"); + LOG.info(String.format("indexed: %,12d", counters.indexed.get())); + LOG.info(String.format("unindexable: %,12d", counters.unindexable.get())); + LOG.info(String.format("empty: %,12d", counters.empty.get())); + LOG.info(String.format("skipped: %,12d", counters.skipped.get())); + LOG.info(String.format("errors: %,12d", counters.errors.get())); - final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS); - LOG.info(String.format("Total %,d documents indexed in %s", counters.indexed.get(), - DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss"))); - } + final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS); + LOG.info(String.format("Total %,d documents indexed in %s", numIndexed, + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss"))); return counters; } - public static Map> readGloVe(File input) throws IOException { - Map> vectors = new HashMap<>(); - for (String line : org.apache.commons.io.IOUtils.readLines(new FileReader(input))) { - String[] s = line.split("\\s+"); - if (s.length > 2) { - String key = s[0]; - float[] vector = new float[s.length - 1]; - float norm = 0f; - for (int i = 1; i < s.length; i++) { - float f = Float.parseFloat(s[i]); - vector[i - 1] = f; - norm += Math.pow(f, 2); - } - norm = (float) Math.sqrt(norm); - for (int i = 0; i < vector.length; i++) { - vector[i] = vector[i] / norm; - } - if (vectors.containsKey(key)) { - List floats = new LinkedList<>(vectors.get(key)); - floats.add(vector); - vectors.put(key, floats); - } else { - vectors.put(key, List.of(vector)); - } - } - } - return vectors; - } - public static void main(String[] args) throws Exception { Args indexCollectionArgs = new Args(); CmdLineParser parser = new 
CmdLineParser(indexCollectionArgs, ParserProperties.defaults().withUsageWidth(100)); @@ -597,7 +371,7 @@ public static void main(String[] args) throws Exception { System.err.println(e.getMessage()); parser.printUsage(System.err); System.err.println("Example: " + IndexInvertedDenseVectors.class.getSimpleName() + - parser.printExample(OptionHandlerFilter.REQUIRED)); + parser.printExample(OptionHandlerFilter.REQUIRED)); return; } diff --git a/src/main/java/io/anserini/index/IndexReaderUtils.java b/src/main/java/io/anserini/index/IndexReaderUtils.java index 222314e23d..3fc468e96c 100755 --- a/src/main/java/io/anserini/index/IndexReaderUtils.java +++ b/src/main/java/io/anserini/index/IndexReaderUtils.java @@ -779,15 +779,15 @@ public static String convertLuceneDocidToDocid(IndexReader reader, int docid) { * @param reader index reader * @return map from name of statistic to its value */ - public static Map getIndexStats(IndexReader reader) { + public static Map getIndexStats(IndexReader reader, String field) { Map indexStats = new HashMap<>(); try { - Terms terms = MultiTerms.getTerms(reader, Constants.CONTENTS); + Terms terms = MultiTerms.getTerms(reader, field); indexStats.put("documents", reader.numDocs()); - indexStats.put("non_empty_documents", reader.getDocCount(Constants.CONTENTS)); + indexStats.put("non_empty_documents", reader.getDocCount(field)); indexStats.put("unique_terms", terms.size()); - indexStats.put("total_terms", reader.getSumTotalTermFreq(Constants.CONTENTS)); + indexStats.put("total_terms", reader.getSumTotalTermFreq(field)); } catch (IOException e) { // Eat any exceptions and just return null. return null; @@ -795,6 +795,10 @@ public static Map getIndexStats(IndexReader reader) { return indexStats; } + public static Map getIndexStats(IndexReader reader) { + return getIndexStats(reader, Constants.CONTENTS); + } + /** * Returns {@code FieldInfo} for indexed fields. 
* @@ -835,6 +839,9 @@ public static final class Args { @Option(name = "-index", metaVar = "[Path]", required = true, usage = "index path") String index; + @Option(name = "-field", metaVar = "[field]", usage = "field") + String field = Constants.CONTENTS; + @Option(name = "-stats", usage = "print index statistics") boolean stats; } @@ -851,11 +858,9 @@ public static void main(String[] argv) throws Exception { } IndexReader reader = IndexReaderUtils.getReader(args.index); - Map results = IndexReaderUtils.getIndexStats(reader); + Map results = IndexReaderUtils.getIndexStats(reader, args.field); if (args.stats) { - Terms terms = MultiTerms.getTerms(reader, Constants.CONTENTS); - System.out.println("Index statistics"); System.out.println("----------------"); System.out.println("documents: " + results.get("documents")); diff --git a/src/main/java/io/anserini/index/generator/InvertedDenseVectorDocumentGenerator.java b/src/main/java/io/anserini/index/generator/InvertedDenseVectorDocumentGenerator.java index 233b5b6b6a..ca2f7d6f05 100644 --- a/src/main/java/io/anserini/index/generator/InvertedDenseVectorDocumentGenerator.java +++ b/src/main/java/io/anserini/index/generator/InvertedDenseVectorDocumentGenerator.java @@ -16,29 +16,24 @@ package io.anserini.index.generator; -import java.util.ArrayList; - import com.fasterxml.jackson.core.JsonProcessingException; import com.fasterxml.jackson.core.type.TypeReference; -import com.fasterxml.jackson.databind.JsonMappingException; import com.fasterxml.jackson.databind.ObjectMapper; import io.anserini.collection.SourceDocument; import io.anserini.index.IndexInvertedDenseVectors; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; -import org.apache.lucene.document.StoredField; import org.apache.lucene.document.StringField; import org.apache.lucene.document.TextField; -import static io.anserini.index.IndexInvertedDenseVectors.Args.RAW; +import java.util.ArrayList; /** - * Converts a {@link 
SourceDocument} into a Lucene {@link Document}, ready to be indexed for ANN search. + * Converts a {@link SourceDocument} into a Lucene {@link Document}. * * @param type of the source document */ public class InvertedDenseVectorDocumentGenerator implements LuceneDocumentGenerator { - protected IndexInvertedDenseVectors.Args args; protected InvertedDenseVectorDocumentGenerator() { @@ -53,9 +48,9 @@ public InvertedDenseVectorDocumentGenerator(IndexInvertedDenseVectors.Args args) this.args = args; } - private float[] convertJsonArray(String vectorString) throws JsonMappingException, JsonProcessingException { + private float[] convertJsonArray(String vectorString) throws JsonProcessingException { ObjectMapper mapper = new ObjectMapper(); - ArrayList denseVector = mapper.readValue(vectorString, new TypeReference>(){}); + ArrayList denseVector = mapper.readValue(vectorString, new TypeReference<>() {}); int length = denseVector.size(); float[] vector = new float[length]; int i = 0; @@ -75,6 +70,7 @@ public Document createDocument(T src) throws InvalidDocumentException { } catch (Exception e) { throw new InvalidDocumentException(); } + StringBuilder sb = new StringBuilder(); for (double fv : contents) { if (sb.length() > 0) { @@ -82,15 +78,11 @@ public Document createDocument(T src) throws InvalidDocumentException { } sb.append(fv); } - // Make a new, empty document. - final Document document = new Document(); - // Store the collection docid. + final Document document = new Document(); document.add(new StringField(IndexInvertedDenseVectors.FIELD_ID, id, Field.Store.YES)); - document.add(new TextField(IndexInvertedDenseVectors.FIELD_VECTOR, sb.toString(), args.stored ? 
Field.Store.YES : Field.Store.NO)); - if (args.storeRaw) { - document.add(new StoredField(RAW, src.raw())); - } + document.add(new TextField(IndexInvertedDenseVectors.FIELD_VECTOR, sb.toString(), Field.Store.NO)); + return document; } } \ No newline at end of file diff --git a/src/main/java/io/anserini/search/EvaluateInvertedDenseVectors.java b/src/main/java/io/anserini/search/EvaluateInvertedDenseVectors.java deleted file mode 100644 index 63b16f0e20..0000000000 --- a/src/main/java/io/anserini/search/EvaluateInvertedDenseVectors.java +++ /dev/null @@ -1,256 +0,0 @@ -/* - * Anserini: A Lucene toolkit for reproducible information retrieval research - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -package io.anserini.search; - -import com.google.common.collect.Sets; -import io.anserini.analysis.AnalyzerUtils; -import io.anserini.analysis.fw.FakeWordsEncoderAnalyzer; -import io.anserini.analysis.lexlsh.LexicalLshAnalyzer; -import io.anserini.index.IndexInvertedDenseVectors; -import io.anserini.search.topicreader.TrecTopicReader; -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.standard.StandardAnalyzer; -import org.apache.lucene.document.Document; -import org.apache.lucene.index.DirectoryReader; -import org.apache.lucene.index.StoredFields; -import org.apache.lucene.index.Term; -import org.apache.lucene.queries.CommonTermsQuery; -import org.apache.lucene.search.IndexSearcher; -import org.apache.lucene.search.ScoreDoc; -import org.apache.lucene.search.TopScoreDocCollector; -import org.apache.lucene.search.similarities.ClassicSimilarity; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.FSDirectory; -import org.kohsuke.args4j.CmdLineException; -import org.kohsuke.args4j.CmdLineParser; -import org.kohsuke.args4j.Option; -import org.kohsuke.args4j.OptionHandlerFilter; -import org.kohsuke.args4j.ParserProperties; - -import java.io.File; -import java.io.IOException; -import java.nio.file.Files; -import java.nio.file.Path; -import java.util.Collection; -import java.util.HashSet; -import java.util.LinkedList; -import java.util.List; -import java.util.Map; -import java.util.Set; -import java.util.TreeSet; - -import static org.apache.lucene.search.BooleanClause.Occur.SHOULD; - -public class EvaluateInvertedDenseVectors { - private static final String FW = "fw"; - private static final String LEXLSH = "lexlsh"; - - public static final class Args { - @Option(name = "-input", metaVar = "[file]", required = true, usage = "vectors model") - public File input; - - @Option(name = "-index", metaVar = "[path]", required = true, usage = "index path") - public Path path; - - @Option(name = "-topics", metaVar = 
"[file]", required = true, usage = "path to TREC topics file") - public Path topicsPath; - - @Option(name = "-topN", metaVar = "[int]", usage = "topN recall") - public int topN = 10; - - @Option(name = "-encoding", metaVar = "[word]", required = true, usage = "encoding must be one of {fw, lexlsh}") - public String encoding; - - @Option(name = "-depth", metaVar = "[int]", usage = "retrieval depth") - public int depth = 10; - - @Option(name = "-samples", metaVar = "[int]", usage = "no. of samples") - public int samples = Integer.MAX_VALUE; - - @Option(name = "-lexlsh.n", metaVar = "[int]", usage = "n-grams") - public int ngrams = 2; - - @Option(name = "-lexlsh.d", metaVar = "[int]", usage = "decimals") - public int decimals = 1; - - @Option(name = "-lexlsh.hsize", metaVar = "[int]", usage = "hash set size") - public int hashSetSize = 1; - - @Option(name = "-lexlsh.h", metaVar = "[int]", usage = "hash count") - public int hashCount = 1; - - @Option(name = "-lexlsh.b", metaVar = "[int]", usage = "bucket count") - public int bucketCount = 300; - - @Option(name = "-fw.q", metaVar = "[int]", usage = "quantization factor") - public int q = 60; - - @Option(name = "-cutoff", metaVar = "[float]", usage = "tf cutoff factor") - public float cutoff = 0.999f; - - @Option(name = "-msm", metaVar = "[float]", usage = "minimum should match") - public float msm = 0; - } - - public static void main(String[] args) throws Exception { - EvaluateInvertedDenseVectors.Args indexArgs = new EvaluateInvertedDenseVectors.Args(); - CmdLineParser parser = new CmdLineParser(indexArgs, ParserProperties.defaults().withUsageWidth(90)); - - try { - parser.parseArgument(args); - } catch (CmdLineException e) { - System.err.println(e.getMessage()); - parser.printUsage(System.err); - System.err.println("Example: " + EvaluateInvertedDenseVectors.class.getSimpleName() + - parser.printExample(OptionHandlerFilter.REQUIRED)); - return; - } - Analyzer vectorAnalyzer; - if 
(indexArgs.encoding.equalsIgnoreCase(FW)) { - vectorAnalyzer = new FakeWordsEncoderAnalyzer(indexArgs.q); - } else if (indexArgs.encoding.equalsIgnoreCase(LEXLSH)) { - vectorAnalyzer = new LexicalLshAnalyzer(indexArgs.decimals, indexArgs.ngrams, indexArgs.hashCount, - indexArgs.bucketCount, indexArgs.hashSetSize); - } else { - parser.printUsage(System.err); - System.err.println("Example: " + EvaluateInvertedDenseVectors.class.getSimpleName() + - parser.printExample(OptionHandlerFilter.REQUIRED)); - return; - } - - System.out.println(String.format("Loading model %s", indexArgs.input)); - - Map> wordVectors = IndexInvertedDenseVectors.readGloVe(indexArgs.input); - - Path indexDir = indexArgs.path; - if (!Files.exists(indexDir)) { - Files.createDirectories(indexDir); - } - - System.out.println(String.format("Reading index at %s", indexArgs.path)); - - Directory d = FSDirectory.open(indexDir); - DirectoryReader reader = DirectoryReader.open(d); - IndexSearcher searcher = new IndexSearcher(reader); - if (indexArgs.encoding.equalsIgnoreCase(FW)) { - searcher.setSimilarity(new ClassicSimilarity()); - } - - StandardAnalyzer standardAnalyzer = new StandardAnalyzer(); - double recall = 0; - double time = 0d; - System.out.println("Evaluating at retrieval depth: " + indexArgs.depth); - TrecTopicReader trecTopicReader = new TrecTopicReader(indexArgs.topicsPath); - Collection words = new LinkedList<>(); - trecTopicReader.read().values().forEach(e -> words.addAll(AnalyzerUtils.analyze(standardAnalyzer, e.get("title")))); - StoredFields storedFields = reader.storedFields(); - int queryCount = 0; - for (String word : words) { - if (wordVectors.containsKey(word)) { - Set truth = nearestVector(wordVectors, word, indexArgs.topN); - try { - List vectors = wordVectors.get(word); - for (float[] vector : vectors) { - StringBuilder sb = new StringBuilder(); - for (double fv : vector) { - if (sb.length() > 0) { - sb.append(' '); - } - sb.append(fv); - } - String fvString = sb.toString(); - 
- CommonTermsQuery simQuery = new CommonTermsQuery(SHOULD, SHOULD, indexArgs.cutoff); - if (indexArgs.msm > 0) { - simQuery.setLowFreqMinimumNumberShouldMatch(indexArgs.msm); - } - for (String token : AnalyzerUtils.analyze(vectorAnalyzer, fvString)) { - simQuery.add(new Term(IndexInvertedDenseVectors.FIELD_VECTOR, token)); - } - - long start = System.currentTimeMillis(); - TopScoreDocCollector results = TopScoreDocCollector.create(indexArgs.depth, Integer.MAX_VALUE); - searcher.search(simQuery, results); - time += System.currentTimeMillis() - start; - - Set observations = new HashSet<>(); - for (ScoreDoc sd : results.topDocs().scoreDocs) { - Document document = storedFields.document(sd.doc); - String wordValue = document.get(IndexInvertedDenseVectors.FIELD_ID); - observations.add(wordValue); - } - double intersection = Sets.intersection(truth, observations).size(); - double localRecall = intersection / (double) truth.size(); - recall += localRecall; - queryCount++; - } - } catch (IOException e) { - System.err.println("search for '" + word + "' failed " + e.getLocalizedMessage()); - } - } - if (queryCount >= indexArgs.samples) { - break; - } - } - recall /= queryCount; - time /= queryCount; - - System.out.println(String.format("R@%d: %.4f", indexArgs.depth, recall)); - System.out.println(String.format("avg query time: %s ms", time)); - - reader.close(); - d.close(); - } - - /** - * Calculate the nearest N words for a given input word. 
- * - * @param vectors vectors, keyed by word - * @param word the input word - * @param topN the number of similar word vectors to output - * @return the {@code topN} similar words of the input word - */ - private static Set nearestVector(Map> vectors, String word, int topN) { - Set intermediate = new TreeSet<>(); - List inputs = vectors.get(word); - String separateToken = "__"; - for (Map.Entry> entry : vectors.entrySet()) { - for (float[] value : entry.getValue()) { - for (float[] input : inputs) { - float sim = 0; - for (int i = 0; i < value.length; i++) { - sim += value[i] * input[i]; - } - // store the words, sorted by decreasing distance using natural order (in the $dist__$word format) - intermediate.add((1 - sim) + separateToken + entry.getKey()); - } - } - } - Set result = new HashSet<>(); - int i = 0; - for (String w : intermediate) { - if (i == topN) { - break; - } - // only add actual word String (not the distance) to the result collection - result.add(w.substring(w.indexOf(separateToken) + 2)); - i++; - } - return result; - } - -} diff --git a/src/main/java/io/anserini/search/SearchInvertedDenseVectors.java b/src/main/java/io/anserini/search/SearchInvertedDenseVectors.java index b5d19ba484..868a2dbe9b 100644 --- a/src/main/java/io/anserini/search/SearchInvertedDenseVectors.java +++ b/src/main/java/io/anserini/search/SearchInvertedDenseVectors.java @@ -16,49 +16,18 @@ package io.anserini.search; -import java.io.Closeable; -import java.io.File; -import java.io.IOException; -import java.io.PrintWriter; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.util.Collection; -import java.util.HashSet; -import java.util.LinkedList; -import java.util.List; -import java.util.Locale; -import java.util.Map; -import java.util.Set; -import java.util.SortedMap; -import java.util.TreeMap; -import java.util.concurrent.CompletionException; -import 
java.util.concurrent.ConcurrentSkipListMap; -import java.util.concurrent.ExecutorService; -import java.util.concurrent.Executors; -import java.util.concurrent.ThreadPoolExecutor; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicInteger; - import io.anserini.analysis.fw.FakeWordsEncoderAnalyzer; -import io.anserini.index.IndexInvertedDenseVectors; +import io.anserini.index.Constants; import io.anserini.rerank.ScoredDocuments; import io.anserini.search.query.InvertedDenseVectorQueryGenerator; import io.anserini.search.topicreader.TopicReader; import org.apache.commons.lang3.time.DurationFormatUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; -import org.apache.lucene.document.Document; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.StoredFields; -import org.apache.lucene.index.Term; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; -import org.apache.lucene.search.ScoreDoc; -import org.apache.lucene.search.TermQuery; -import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TopScoreDocCollector; import org.apache.lucene.search.similarities.ClassicSimilarity; import org.apache.lucene.search.similarities.Similarity; @@ -70,26 +39,37 @@ import org.kohsuke.args4j.ParserProperties; import org.kohsuke.args4j.spi.StringArrayOptionHandler; +import java.io.Closeable; +import java.io.File; +import java.io.IOException; +import java.io.PrintWriter; +import java.nio.charset.StandardCharsets; +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; +import java.util.HashSet; +import java.util.Locale; +import java.util.Map; +import java.util.Set; +import java.util.SortedMap; +import java.util.TreeMap; +import java.util.concurrent.CompletionException; +import java.util.concurrent.ConcurrentSkipListMap; +import java.util.concurrent.ExecutorService; +import 
java.util.concurrent.Executors; +import java.util.concurrent.ThreadPoolExecutor; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; + import static io.anserini.index.IndexInvertedDenseVectors.FW; /** * Main entry point for inverted dense vector search. */ public final class SearchInvertedDenseVectors implements Closeable { - private static final Logger LOG = LogManager.getLogger(SearchInvertedDenseVectors.class); public static class Args { - - @Option(name = "-input", metaVar = "[file]", usage = "vectors model") - public File input; - - @Option(name = "-word", metaVar = "[word]", usage = "input word") - public String word; - - @Option(name = "-stored", metaVar = "[boolean]", usage = "fetch stored vectors from index") - public boolean stored; - @Option(name = "-index", metaVar = "[path]", required = true, usage = "Path to Lucene index") public String index; @@ -102,6 +82,10 @@ public static class Args { @Option(name = "-topicreader", usage = "TopicReader to use.") public String topicReader; + @Option(name = "-topicfield", usage = "Which field of the query should be used, default \"title\"." + + " For TREC ad hoc topics, description or narrative can be used.") + public String topicfield = "title"; + @Option(name = "-encoding", metaVar = "[word]", required = true, usage = "encoding must be one of {fw, lexlsh}") public String encoding; @@ -149,10 +133,6 @@ public static class Args { @Option(name = "-inmem", usage = "Boolean switch to read index in memory") public Boolean inmem = false; - @Option(name = "-topicfield", usage = "Which field of the query should be used, default \"title\"." 
+ - " For TREC ad hoc topics, description or narrative can be used.") - public String topicfield = "title"; - @Option(name = "-runtag", metaVar = "[tag]", usage = "runtag") public String runtag = null; @@ -246,7 +226,7 @@ public void run() { int rank = 1; for (int i = 0; i < docs.documents.length; i++) { - String docid = docs.documents[i].get(IndexInvertedDenseVectors.Args.ID); + String docid = docs.documents[i].get(Constants.ID); if (args.selectMaxPassage) { docid = docid.split(args.selectMaxPassage_delimiter)[0]; @@ -452,70 +432,11 @@ public static void main(String[] args) throws Exception { } if (searchArgs.topicReader != null && searchArgs.topics != null) { searcher.runTopics(); - } else if (searchArgs.word != null) { - searcher.runWord(); - } else { - System.err.println("Either " + searchArgs.word + " or " + searchArgs.topicReader + " must be set"); - return; } + searcher.close(); final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS); LOG.info("Total run time: " + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss")); } - private void runWord() throws IOException { - generator = new InvertedDenseVectorQueryGenerator(args, false); - Collection vectorStrings = new LinkedList<>(); - IndexSearcher searcher = new IndexSearcher(reader); - if (args.encoding.equalsIgnoreCase(FW)) { - searcher.setSimilarity(new ClassicSimilarity()); - } - StoredFields storedFields = reader.storedFields(); - if (args.stored) { - TopDocs topDocs = searcher.search(new TermQuery(new Term(IndexInvertedDenseVectors.FIELD_ID, args.word)), args.hits); - for (ScoreDoc scoreDoc : topDocs.scoreDocs) { - vectorStrings.add(storedFields.document(scoreDoc.doc).get(IndexInvertedDenseVectors.FIELD_VECTOR)); - } - } else { - System.out.println(String.format("Loading model %s", args.input)); - - Map> wordVectors = IndexInvertedDenseVectors.readGloVe(args.input); - - if (wordVectors.containsKey(args.word)) { - List vectors = 
wordVectors.get(args.word); - for (float[] vector : vectors) { - StringBuilder sb = new StringBuilder(); - for (double fv : vector) { - if (sb.length() > 0) { - sb.append(' '); - } - sb.append(fv); - } - String vectorString = sb.toString(); - vectorStrings.add(vectorString); - } - } - } - - for (String vectorString : vectorStrings) { - Query query = generator.buildQuery(vectorString); - - long start = System.currentTimeMillis(); - TopScoreDocCollector results = TopScoreDocCollector.create(args.hits, Integer.MAX_VALUE); - searcher.search(query, results); - long time = System.currentTimeMillis() - start; - - System.out.println(String.format("%d nearest neighbors of '%s':", args.hits, args.word)); - - int rank = 1; - for (ScoreDoc sd : results.topDocs().scoreDocs) { - Document document = storedFields.document(sd.doc); - String word = document.get(IndexInvertedDenseVectors.FIELD_ID); - System.out.println(String.format("%d. %s (%.3f)", rank, word, sd.score)); - rank++; - } - System.out.println(String.format("Search time: %dms", time)); - } - reader.close(); - } } \ No newline at end of file diff --git a/src/main/java/io/anserini/search/SearchVectorArgs.java b/src/main/java/io/anserini/search/SearchVectorArgs.java deleted file mode 100644 index a1004427f9..0000000000 --- a/src/main/java/io/anserini/search/SearchVectorArgs.java +++ /dev/null @@ -1,103 +0,0 @@ -/* - * Anserini: A Lucene toolkit for reproducible information retrieval research - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.anserini.search; - -import org.kohsuke.args4j.Option; -import org.kohsuke.args4j.spi.StringArrayOptionHandler; - - -public class SearchVectorArgs { - // required arguments - @Option(name = "-index", metaVar = "[path]", required = true, usage = "Path to Lucene index") - public String index; - - @Option(name = "-topics", metaVar = "[file]", handler = StringArrayOptionHandler.class, required = true, usage = "topics file") - public String[] topics; - - @Option(name = "-output", metaVar = "[file]", required = true, usage = "output file") - public String output; - - @Option(name = "-topicreader", required = true, usage = "TopicReader to use.") - public String topicReader; - - // optional arguments - @Option(name = "-querygenerator", usage = "QueryGenerator to use.") - public String queryGenerator = "BagOfWordsQueryGenerator"; - - @Option(name = "-threads", metaVar = "[int]", usage = "Number of threads to use for running different parameter configurations.") - public int threads = 1; - - @Option(name = "-parallelism", metaVar = "[int]", usage = "Number of threads to use for each individual parameter configuration.") - public int parallelism = 8; - - @Option(name = "-removeQuery", usage = "Remove docids that have the query id when writing final run output.") - public Boolean removeQuery = false; - - // Note that this option is set to false by default because duplicate documents usually indicate some underlying - // indexing issues, and we don't want to just eat errors silently. 
- @Option(name = "-removedups", usage = "Remove duplicate docids when writing final run output.") - public Boolean removedups = false; - - @Option(name = "-skipexists", usage = "When enabled, will skip if the run file exists") - public Boolean skipexists = false; - - @Option(name = "-hits", metaVar = "[number]", required = false, usage = "max number of hits to return") - public int hits = 1000; - - @Option(name = "-efSearch", metaVar = "[number]", required = false, usage = "efSearch parameter for HNSW search") - public int efSearch = 100; - - @Option(name = "-inmem", usage = "Boolean switch to read index in memory") - public Boolean inmem = false; - - @Option(name = "-topicfield", usage = "Which field of the query should be used, default \"title\"." + - " For TREC ad hoc topics, description or narrative can be used.") - public String topicfield = "title"; - - @Option(name = "-runtag", metaVar = "[tag]", usage = "runtag") - public String runtag = null; - - @Option(name = "-format", metaVar = "[output format]", usage = "Output format, default \"trec\", alternative \"msmarco\".") - public String format = "trec"; - - // --------------------------------------------- - // Simple built-in support for passage retrieval - // --------------------------------------------- - - // A simple approach to passage retrieval is to pre-segment documents in the corpus into passages and index those - // passages. At retrieval time, we retain only the max scoring passage from each document; this is often called MaxP, - // from Dai and Callan (SIGIR 2019) in the context of BERT, although the general approach dates back to Callan - // (SIGIR 1994), Hearst and Plaunt (SIGIR 1993), and lots of other papers from the 1990s and even earlier. - // - // One common convention is to label the passages of a docid as "docid.00000", "docid.00001", "docid.00002", ... - // We use this convention in CORD-19. 
Alternatively, in document expansion for the MS MARCO document corpus, we use - // '#' as the delimiter. - // - // The options below control various aspects of this behavior. - - @Option(name = "-selectMaxPassage", usage = "Select and retain only the max scoring segment from each document.") - public Boolean selectMaxPassage = false; - - @Option(name = "-selectMaxPassage.delimiter", metaVar = "[regexp]", - usage = "The delimiter (as a regular regression) for splitting the segment id from the doc id.") - public String selectMaxPassage_delimiter = "\\."; - - @Option(name = "-selectMaxPassage.hits", metaVar = "[int]", - usage = "Maximum number of hits to return per topic after segment id removal. " + - "Note that this is different from '-hits', which specifies the number of hits including the segment id. ") - public int selectMaxPassage_hits = Integer.MAX_VALUE; -} diff --git a/src/main/python/regressions-batch03.txt b/src/main/python/regressions-batch03.txt index 96a4afffb3..baab7fc298 100644 --- a/src/main/python/regressions-batch03.txt +++ b/src/main/python/regressions-batch03.txt @@ -2,7 +2,9 @@ python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-splade-pp-ed > logs/log.msmarco-passage-splade-pp-ed 2>&1 python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-splade-pp-sd > logs/log.msmarco-passage-splade-pp-sd 2>&1 python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-openai-ada2 > logs/log.msmarco-passage-openai-ada2 2>&1 -python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil > logs/log.msmarco-passage-cos-dpr-distil 2>&1 +python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw > logs/log.msmarco-passage-cos-dpr-distil-hnsw 2>&1 +python src/main/python/run_regression.py --index --verify --search --regression 
msmarco-passage-cos-dpr-distil-fw > logs/log.msmarco-passage-cos-dpr-distil-fw 2>&1 +python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-cos-dpr-distil-lexlsh > logs/log.msmarco-passage-cos-dpr-distil-lexlsh 2>&1 python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage > logs/log.msmarco-passage 2>&1 python src/main/python/run_regression.py --index --verify --search --regression msmarco-passage-bm25-b8 > logs/log.msmarco-passage-bm25-b8 2>&1 @@ -34,7 +36,7 @@ python src/main/python/run_regression.py --index --verify --search --regression # MS MARCO V1 passage ONNX runs - uses same index, so need to make sure previous runs finish python src/main/python/run_regression.py --search-pool 1 --verify --search --regression msmarco-passage-splade-pp-ed-onnx > logs/log.msmarco-passage-splade-pp-ed-onnx 2>&1 python src/main/python/run_regression.py --search-pool 1 --verify --search --regression msmarco-passage-splade-pp-sd-onnx > logs/log.msmarco-passage-splade-pp-sd-onnx 2>&1 -python src/main/python/run_regression.py --search-pool 1 --verify --search --regression msmarco-passage-cos-dpr-distil-onnx > logs/log.msmarco-passage-cos-dpr-distil-onnx 2>&1 +python src/main/python/run_regression.py --search-pool 1 --verify --search --regression msmarco-passage-cos-dpr-distil-hnsw-onnx > logs/log.msmarco-passage-cos-dpr-distil-hnsw-onnx 2>&1 # MIRACL python src/main/python/run_regression.py --index --verify --search --regression miracl-v1.0-ar > logs/log.miracl-v1.0-ar 2>&1 @@ -122,7 +124,7 @@ python src/main/python/run_regression.py --index --verify --search --regression # DL19 - ONNX python src/main/python/run_regression.py --search-pool 1 --verify --search --regression dl19-passage-splade-pp-ed-onnx > logs/log.dl19-passage-splade-pp-ed-onnx 2>&1 python src/main/python/run_regression.py --search-pool 1 --verify --search --regression dl19-passage-splade-pp-sd-onnx > 
logs/log.dl19-passage-splade-pp-sd-onnx 2>&1 -python src/main/python/run_regression.py --search-pool 1 --verify --search --regression dl19-passage-cos-dpr-distil-onnx > logs/log.dl19-passage-cos-dpr-distil-onnx 2>&1 +python src/main/python/run_regression.py --search-pool 1 --verify --search --regression dl19-passage-cos-dpr-distil-hnsw-onnx > logs/log.dl19-passage-cos-dpr-distil-hnsw-onnx 2>&1 # Other DL19 python src/main/python/run_regression.py --verify --search --regression dl19-passage > logs/log.dl19-passage 2>&1 @@ -137,7 +139,9 @@ python src/main/python/run_regression.py --verify --search --regression dl19-pas python src/main/python/run_regression.py --verify --search --regression dl19-passage-splade-distil-cocodenser-medium > logs/log.dl19-passage-splade-distil-cocodenser-medium 2>&1 python src/main/python/run_regression.py --verify --search --regression dl19-passage-splade-pp-ed > logs/log.dl19-passage-splade-pp-ed 2>&1 python src/main/python/run_regression.py --verify --search --regression dl19-passage-splade-pp-sd > logs/log.dl19-passage-splade-pp-sd 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl19-passage-cos-dpr-distil > logs/log.dl19-passage-cos-dpr-distil 2>&1 +python src/main/python/run_regression.py --verify --search --regression dl19-passage-cos-dpr-distil-hnsw > logs/log.dl19-passage-cos-dpr-distil-hnsw 2>&1 +python src/main/python/run_regression.py --verify --search --regression dl19-passage-cos-dpr-distil-fw > logs/log.dl19-passage-cos-dpr-distil-fw 2>&1 +python src/main/python/run_regression.py --verify --search --regression dl19-passage-cos-dpr-distil-lexlsh > logs/log.dl19-passage-cos-dpr-distil-lexlsh 2>&1 python src/main/python/run_regression.py --verify --search --regression dl19-passage-openai-ada2 > logs/log.dl19-passage-openai-ada2 2>&1 python src/main/python/run_regression.py --verify --search --regression dl19-doc > logs/log.dl19-doc 2>&1 @@ -156,7 +160,7 @@ python src/main/python/run_regression.py 
--verify --search --regression dl19-doc # DL20 - ONNX python src/main/python/run_regression.py --search-pool 1 --verify --search --regression dl20-passage-splade-pp-ed-onnx > logs/log.dl20-passage-splade-pp-ed-onnx 2>&1 python src/main/python/run_regression.py --search-pool 1 --verify --search --regression dl20-passage-splade-pp-sd-onnx > logs/log.dl20-passage-splade-pp-sd-onnx 2>&1 -python src/main/python/run_regression.py --search-pool 1 --verify --search --regression dl20-passage-cos-dpr-distil-onnx > logs/log.dl20-passage-cos-dpr-distil-onnx 2>&1 +python src/main/python/run_regression.py --search-pool 1 --verify --search --regression dl20-passage-cos-dpr-distil-hnsw-onnx > logs/log.dl20-passage-cos-dpr-distil-hnsw-onnx 2>&1 # Other DL20 python src/main/python/run_regression.py --verify --search --regression dl20-passage > logs/log.dl20-passage 2>&1 @@ -171,18 +175,20 @@ python src/main/python/run_regression.py --verify --search --regression dl20-pas python src/main/python/run_regression.py --verify --search --regression dl20-passage-splade-distil-cocodenser-medium > logs/log.dl20-passage-splade-distil-cocodenser-medium 2>&1 python src/main/python/run_regression.py --verify --search --regression dl20-passage-splade-pp-ed > logs/log.dl20-passage-splade-pp-ed 2>&1 python src/main/python/run_regression.py --verify --search --regression dl20-passage-splade-pp-sd > logs/log.dl20-passage-splade-pp-sd 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-passage-cos-dpr-distil > logs/log.dl20-passage-cos-dpr-distil 2>&1 +python src/main/python/run_regression.py --verify --search --regression dl20-passage-cos-dpr-distil-hnsw > logs/log.dl20-passage-cos-dpr-distil-hnsw 2>&1 +python src/main/python/run_regression.py --verify --search --regression dl20-passage-cos-dpr-distil-fw > logs/log.dl20-passage-cos-dpr-distil-fw 2>&1 +python src/main/python/run_regression.py --verify --search --regression dl20-passage-cos-dpr-distil-lexlsh > 
logs/log.dl20-passage-cos-dpr-distil-lexlsh 2>&1 python src/main/python/run_regression.py --verify --search --regression dl20-passage-openai-ada2 > logs/log.dl20-passage-openai-ada2 2>&1 python src/main/python/run_regression.py --verify --search --regression dl20-doc > logs/log.dl20-doc 2>&1 python src/main/python/run_regression.py --verify --search --regression dl20-doc-ca > logs/log.dl20-doc-ca 2>&1 python src/main/python/run_regression.py --verify --search --regression dl20-doc-wp > logs/log.dl20-doc-wp 2>&1 python src/main/python/run_regression.py --verify --search --regression dl20-doc-hgf-wp > logs/log.dl20-doc-hgf-wp 2>&1 -python src/main/python/run_regression.py --verify --search --regression dl20-doc-segmented-wp > logs/log.dl20-doc-segmented-wp 2>&1 python src/main/python/run_regression.py --verify --search --regression dl20-doc-docTTTTTquery > logs/log.dl20-doc-docTTTTTquery 2>&1 python src/main/python/run_regression.py --verify --search --regression dl20-doc-segmented > logs/log.dl20-doc-segmented 2>&1 python src/main/python/run_regression.py --verify --search --regression dl20-doc-segmented-ca > logs/log.dl20-doc-segmented-ca 2>&1 +python src/main/python/run_regression.py --verify --search --regression dl20-doc-segmented-wp > logs/log.dl20-doc-segmented-wp 2>&1 python src/main/python/run_regression.py --verify --search --regression dl20-doc-segmented-docTTTTTquery > logs/log.dl20-doc-segmented-docTTTTTquery 2>&1 python src/main/python/run_regression.py --verify --search --regression dl20-doc-segmented-unicoil > logs/log.dl20-doc-segmented-unicoil 2>&1 python src/main/python/run_regression.py --verify --search --regression dl20-doc-segmented-unicoil-noexp > logs/log.dl20-doc-segmented-unicoil-noexp 2>&1 diff --git a/src/main/python/run_regression.py b/src/main/python/run_regression.py index c16253e037..dbf81a9187 100644 --- a/src/main/python/run_regression.py +++ b/src/main/python/run_regression.py @@ -55,11 +55,13 @@ INDEX_COMMAND = 
'target/appassembler/bin/IndexCollection' INDEX_HNSW_COMMAND = 'target/appassembler/bin/IndexHnswDenseVectors' +INDEX_INVERTED_DENSE_COMMAND = 'target/appassembler/bin/IndexInvertedDenseVectors' INDEX_STATS_COMMAND = 'target/appassembler/bin/IndexReaderUtils' SEARCH_COMMAND = 'target/appassembler/bin/SearchCollection' SEARCH_HNSW_COMMAND = 'target/appassembler/bin/SearchHnswDenseVectors' +SEARCH_INVERTED_DENSE_COMMAND = 'target/appassembler/bin/SearchInvertedDenseVectors' def is_close(a, b, rel_tol=1e-09, abs_tol=0.0): @@ -115,7 +117,9 @@ def construct_indexing_command(yaml_data, args): if not os.path.exists('indexes'): os.makedirs('indexes') - if yaml_data['collection_class'] == 'JsonDenseVectorCollection': + if yaml_data.get('index_type') == 'inverted-dense': + root_cmd = INDEX_INVERTED_DENSE_COMMAND + elif yaml_data.get('index_type') == 'hnsw': root_cmd = INDEX_HNSW_COMMAND else: root_cmd = INDEX_COMMAND @@ -140,7 +144,7 @@ def construct_runfile_path(corpus, id, model_name): def construct_search_commands(yaml_data): ranking_commands = [ [ - SEARCH_HNSW_COMMAND if 'VectorQueryGenerator' in model['params'] else SEARCH_COMMAND, + SEARCH_INVERTED_DENSE_COMMAND if model.get('type') == 'inverted-dense' else SEARCH_HNSW_COMMAND if model.get('type') == 'hnsw' else SEARCH_COMMAND, '-index', construct_index_path(yaml_data), '-topics', os.path.join('tools/topics-and-qrels', topic_set['path']), '-topicreader', topic_set['topic_reader'] if 'topic_reader' in topic_set and topic_set['topic_reader'] else yaml_data['topic_reader'], @@ -368,14 +372,16 @@ def download_url(url, save_dir, local_filename=None, md5=None, force=False, verb # Verify index statistics. 
if args.verify: logger.info('='*10 + ' Verifying Index ' + '='*10) - if yaml_data['collection_class'] == 'JsonDenseVectorCollection': + if yaml_data.get('index_type') == 'hnsw': logger.info('Skipping verification step for HNSW dense indexes.') else: - index_utils_command = [INDEX_STATS_COMMAND, '-index', construct_index_path(yaml_data), '-stats'] - verification_command = ' '.join(index_utils_command) + verification_command_args = [INDEX_STATS_COMMAND, '-index', construct_index_path(yaml_data), '-stats'] + if yaml_data.get('index_type') == 'inverted-dense': + verification_command_args.extend(['-field', 'vector']) + verification_command = ' '.join(verification_command_args) logger.info(verification_command) if not args.dry_run: - out = check_output(' '.join(index_utils_command)).decode('utf-8').split('\n') + out = check_output(verification_command).decode('utf-8').split('\n') for line in out: stat = line.split(':')[0] if stat in yaml_data['index_stats']: diff --git a/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-fw.template b/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-fw.template new file mode 100644 index 0000000000..50d456e7e2 --- /dev/null +++ b/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-fw.template @@ -0,0 +1,92 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with inverted indexes using the "fake-words" technique (q=40); pre-encoded queries + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html). +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). 
+ +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 57 GB and has MD5 checksum `${download_checksum}`. 
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, applying inverted indexes to dense vectors using the "fake-words" technique: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. + +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +Note that, unlike HNSW indexing, the inverted indexing technique applied here is deterministic, so results should be stable from run to run. +Scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). + +Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`).
+The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-onnx.template b/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-onnx.template similarity index 100% rename from src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-onnx.template rename to src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw-onnx.template diff --git a/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil.template b/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw.template similarity index 100% rename from src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil.template rename to src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-hnsw.template diff --git a/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-lexlsh.template b/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-lexlsh.template new file mode 100644 index 0000000000..9fca7f9358 --- /dev/null +++ b/src/main/resources/docgen/templates/dl19-passage-cos-dpr-distil-lexlsh.template @@ -0,0 +1,92 @@ +# Anserini Regressions: TREC 2019 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with inverted indexes using the "LexLSH" technique (b=600); pre-encoded queries + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2019 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2019.html). +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). 
+ +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 57 GB and has MD5 checksum `${download_checksum}`. 
+With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, applying inverted indexes to dense vectors using the "LexLSH" technique: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. + +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 43 topics for which NIST has provided judgments as part of the TREC 2019 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2019.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +Note that, unlike HNSW indexing, the inverted indexing technique applied here is deterministic, so results should be stable from run to run. +Scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). + +Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`).
+The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. diff --git a/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-fw.template b/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-fw.template new file mode 100644 index 0000000000..72d438f675 --- /dev/null +++ b/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-fw.template @@ -0,0 +1,92 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with inverted indexes using the "fake-words" technique (q=40); pre-encoded queries + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2020.html). +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation.
+ +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 57 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, applying inverted indexes to dense vectors using the "fake-words" technique: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. + +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). 
+ +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +Note that, unlike HNSW indexing, the inverted indexing technique applied here is deterministic, so results should be stable from run to run. +Scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). + +Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation.
diff --git a/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-onnx.template b/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-onnx.template similarity index 100% rename from src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-onnx.template rename to src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw-onnx.template diff --git a/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil.template b/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw.template similarity index 100% rename from src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil.template rename to src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-hnsw.template diff --git a/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-lexlsh.template b/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-lexlsh.template new file mode 100644 index 0000000000..e97ee592f2 --- /dev/null +++ b/src/main/resources/docgen/templates/dl20-passage-cos-dpr-distil-lexlsh.template @@ -0,0 +1,92 @@ +# Anserini Regressions: TREC 2020 Deep Learning Track (Passage) + +**Model**: cosDPR-distil with inverted indexes using the "LexLSH" technique (b=600); pre-encoded queries + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [TREC 2020 Deep Learning Track passage ranking task](https://trec.nist.gov/data/deep2020.html). +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). + +Note that the NIST relevance judgments provide far more relevant passages per topic, unlike the "sparse" judgments provided by Microsoft (these are sometimes called "dense" judgments to emphasize this contrast). +For additional instructions on working with MS MARCO passage collection, refer to [this page](experiments-msmarco-passage.md).
+ +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 57 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, applying inverted indexes to dense vectors using the "LexLSH" technique: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. + +Upon completion, we should have an index with 8,841,823 documents. 
+ +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 54 topics for which NIST has provided judgments as part of the TREC 2020 Deep Learning Track. +The original data can be found [here](https://trec.nist.gov/data/deep2020.html). + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). + +Also note that retrieval metrics are computed to depth 1000 hits per query (as opposed to 100 hits per query for document ranking). +Also, for computing nDCG, remember that we keep qrels of _all_ relevance grades, whereas for other metrics (e.g., AP), relevance grade 1 is considered not relevant (i.e., use the `-l 2` option in `trec_eval`). +The experimental results reported here are directly comparable to the results reported in the [track overview paper](https://arxiv.org/abs/2003.07820). + +## Reproduction Log[*](reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation. 
diff --git a/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-fw.template b/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-fw.template new file mode 100644 index 0000000000..20ce757e0c --- /dev/null +++ b/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-fw.template @@ -0,0 +1,86 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: cosDPR-distil with inverted indexes using the "fake-words" technique (q=40); pre-encoded queries + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking). +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). +Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. 
+ +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 57 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, applying inverted indexes to dense vectors using the "fake-words" technique: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. + +Upon completion, we should have an index with 8,841,823 documents. + +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](${root_path}/docs/experiments-msmarco-passage.md) for more details. + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). + +## Reproduction Log[*](${root_path}/docs/reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation.
+ ++ Results reproduced by [@yilinjz](https://github.com/yilinjz) on 2023-09-01 (commit [`4ae518b`](https://github.com/castorini/anserini/commit/4ae518bb284ebcba0b273a473bc8774735cb7d19)) \ No newline at end of file diff --git a/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-onnx.template b/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-onnx.template similarity index 100% rename from src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-onnx.template rename to src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw-onnx.template diff --git a/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil.template b/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw.template similarity index 100% rename from src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil.template rename to src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-hnsw.template diff --git a/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-lexlsh.template b/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-lexlsh.template new file mode 100644 index 0000000000..cbd0c07afd --- /dev/null +++ b/src/main/resources/docgen/templates/msmarco-passage-cos-dpr-distil-lexlsh.template @@ -0,0 +1,86 @@ +# Anserini Regressions: MS MARCO Passage Ranking + +**Model**: cosDPR-distil with inverted indexes using the "LexLSH" technique (b=600); pre-encoded queries + +This page describes regression experiments, integrated into Anserini's regression testing framework, using the cosDPR-distil model on the [MS MARCO passage ranking task](https://github.com/microsoft/MSMARCO-Passage-Ranking). +In these experiments, we are using pre-encoded queries (i.e., cached results of query encoding). + +The exact configurations for these regressions are stored in [this YAML file](${yaml}). 
+Note that this page is automatically generated from [this template](${template}) as part of Anserini's regression pipeline, so do not modify this page directly; modify the template instead and then run `bin/build.sh` to rebuild the documentation. + +From one of our Waterloo servers (e.g., `orca`), the following command will perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} +``` + +We make available a version of the MS MARCO Passage Corpus that has already been encoded with cosDPR-distil. + +From any machine, the following command will download the corpus and perform the complete regression, end to end: + +```bash +python src/main/python/run_regression.py --download --index --verify --search --regression ${test_name} +``` + +The `run_regression.py` script automates the following steps, but if you want to perform each step manually, simply copy/paste from the commands below and you'll obtain the same regression results. + +## Corpus Download + +Download the corpus and unpack into `collections/`: + +```bash +wget ${download_url} -P collections/ +tar xvf collections/${corpus}.tar -C collections/ +``` + +To confirm, `${corpus}.tar` is 57 GB and has MD5 checksum `${download_checksum}`. +With the corpus downloaded, the following command will perform the remaining steps below: + +```bash +python src/main/python/run_regression.py --index --verify --search --regression ${test_name} \ + --corpus-path collections/${corpus} +``` + +## Indexing + +Sample indexing command, applying inverted indexes to dense vectors using the "LexLSH" technique: + +```bash +${index_cmds} +``` + +The path `/path/to/${corpus}/` should point to the corpus downloaded above. + +Upon completion, we should have an index with 8,841,823 documents. 
+ +## Retrieval + +Topics and qrels are stored [here](https://github.com/castorini/anserini-tools/tree/master/topics-and-qrels), which is linked to the Anserini repo as a submodule. +The regression experiments here evaluate on the 6980 dev set questions; see [this page](${root_path}/docs/experiments-msmarco-passage.md) for more details. + +After indexing has completed, you should be able to perform retrieval as follows: + +```bash +${ranking_cmds} +``` + +Evaluation can be performed using `trec_eval`: + +```bash +${eval_cmds} +``` + +## Effectiveness + +With the above commands, you should be able to reproduce the following results: + +${effectiveness} + +Note that due to the non-deterministic nature of HNSW indexing, results may differ slightly between each experimental run. +Nevertheless, scores are generally within 0.005 of the reference values recorded in [our YAML configuration file](${yaml}). + +## Reproduction Log[*](${root_path}/docs/reproducibility.md) + +To add to this reproduction log, modify [this template](${template}) and run `bin/build.sh` to rebuild the documentation.
+ ++ Results reproduced by [@yilinjz](https://github.com/yilinjz) on 2023-09-01 (commit [`4ae518b`](https://github.com/castorini/anserini/commit/4ae518bb284ebcba0b273a473bc8774735cb7d19)) \ No newline at end of file diff --git a/src/main/resources/regression/dl19-passage-cos-dpr-distil-fw.yaml b/src/main/resources/regression/dl19-passage-cos-dpr-distil-fw.yaml new file mode 100644 index 0000000000..9e1896dc71 --- /dev/null +++ b/src/main/resources/regression/dl19-passage-cos-dpr-distil-fw.yaml @@ -0,0 +1,69 @@ +--- +corpus: msmarco-passage-cos-dpr-distil +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar +download_checksum: e20ffbc8b5e7f760af31298aefeaebbd + +index_path: indexes/lucene-index.msmarco-passage-cos-dpr-distil.fw-40/ +index_type: inverted-dense +collection_class: JsonDenseVectorCollection +generator_class: InvertedDenseVectorDocumentGenerator +index_threads: 16 +index_options: -encoding fw -fw.q 40 +index_stats: + documents: 8841823 + documents (non-empty): 8841823 + total terms: 2358948522 + +metrics: + - metric: AP@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2019.html)" + id: dl19 + path:
topics.dl19-passage.cos-dpr-distil.jsonl.gz + qrel: qrels.dl19-passage.txt + +models: + - name: cos-dpr-distil-fw-40 + display: cosDPR-distill + type: inverted-dense + params: -topicfield vector -encoding fw -fw.q 40 -hits 1000 + results: + AP@1000: + - 0.4271 + nDCG@10: + - 0.6857 + R@100: + - 0.5766 + R@1000: + - 0.7902 diff --git a/src/main/resources/regression/dl19-passage-cos-dpr-distil-onnx.yaml b/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-onnx.yaml similarity index 94% rename from src/main/resources/regression/dl19-passage-cos-dpr-distil-onnx.yaml rename to src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-onnx.yaml index 954624d244..2e86181ba7 100644 --- a/src/main/resources/regression/dl19-passage-cos-dpr-distil-onnx.yaml +++ b/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw-onnx.yaml @@ -6,10 +6,11 @@ download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr- download_checksum: e20ffbc8b5e7f760af31298aefeaebbd index_path: indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ +index_type: hnsw collection_class: JsonDenseVectorCollection generator_class: LuceneDenseVectorDocumentGenerator index_threads: 16 -index_options: -M 16 -efC 100 -encoder CosDprDistil +index_options: -M 16 -efC 100 metrics: - metric: AP@1000 @@ -49,8 +50,9 @@ topics: qrel: qrels.dl19-passage.txt models: - - name: cos-dpr-distil + - name: cos-dpr-distil-hnsw display: cosDPR-distil + type: hnsw params: -querygenerator VectorQueryGenerator -topicfield title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil results: AP@1000: diff --git a/src/main/resources/regression/dl19-passage-cos-dpr-distil.yaml b/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw.yaml similarity index 96% rename from src/main/resources/regression/dl19-passage-cos-dpr-distil.yaml rename to src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw.yaml index 5e6e395f37..b727ca45a2 100644 --- 
a/src/main/resources/regression/dl19-passage-cos-dpr-distil.yaml +++ b/src/main/resources/regression/dl19-passage-cos-dpr-distil-hnsw.yaml @@ -6,6 +6,7 @@ download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr- download_checksum: e20ffbc8b5e7f760af31298aefeaebbd index_path: indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ +index_type: hnsw collection_class: JsonDenseVectorCollection generator_class: LuceneDenseVectorDocumentGenerator index_threads: 16 @@ -49,8 +50,9 @@ topics: qrel: qrels.dl19-passage.txt models: - - name: cos-dpr-distil + - name: cos-dpr-distil-hnsw display: cosDPR-distil + type: hnsw params: -querygenerator VectorQueryGenerator -topicfield vector -threads 16 -hits 1000 -efSearch 1000 results: AP@1000: diff --git a/src/main/resources/regression/dl19-passage-cos-dpr-distil-lexlsh.yaml b/src/main/resources/regression/dl19-passage-cos-dpr-distil-lexlsh.yaml new file mode 100644 index 0000000000..053bdcae16 --- /dev/null +++ b/src/main/resources/regression/dl19-passage-cos-dpr-distil-lexlsh.yaml @@ -0,0 +1,69 @@ +--- +corpus: msmarco-passage-cos-dpr-distil +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar +download_checksum: e20ffbc8b5e7f760af31298aefeaebbd + +index_path: indexes/lucene-index.msmarco-passage-cos-dpr-distil.lexlsh-600/ +index_type: inverted-dense +collection_class: JsonDenseVectorCollection +generator_class: InvertedDenseVectorDocumentGenerator +index_threads: 16 +index_options: -encoding lexlsh -lexlsh.b 600 +index_stats: + documents: 8841823 + documents (non-empty): 8841823 + total terms: 5305093800 + +metrics: + - metric: AP@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + 
parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL19 (Passage)](https://trec.nist.gov/data/deep2019.html)" + id: dl19 + path: topics.dl19-passage.cos-dpr-distil.jsonl.gz + qrel: qrels.dl19-passage.txt + +models: + - name: cos-dpr-distil-lexlsh-600 + display: cosDPR-distil + type: inverted-dense + params: -topicfield vector -encoding lexlsh -lexlsh.b 600 -hits 1000 + results: + AP@1000: + - 0.4118 + nDCG@10: + - 0.6716 + R@100: + - 0.5545 + R@1000: + - 0.7610 diff --git a/src/main/resources/regression/dl19-passage-openai-ada2.yaml b/src/main/resources/regression/dl19-passage-openai-ada2.yaml index cf63f02762..85c214bb22 100644 --- a/src/main/resources/regression/dl19-passage-openai-ada2.yaml +++ b/src/main/resources/regression/dl19-passage-openai-ada2.yaml @@ -6,6 +6,7 @@ download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-a download_checksum: a4d843d522ff3a3af7edbee789a63402 index_path: indexes/lucene-hnsw.msmarco-passage-openai-ada2/ +index_type: hnsw collection_class: JsonDenseVectorCollection generator_class: LuceneDenseVectorDocumentGenerator index_threads: 16 @@ -51,6 +52,7 @@ topics: models: - name: openai-ada2 display: OpenAI-ada2 + type: hnsw params: -querygenerator VectorQueryGenerator -topicfield vector -threads 16 -hits 1000 -efSearch 1000 results: AP@1000: diff --git a/src/main/resources/regression/dl20-passage-cos-dpr-distil-fw.yaml b/src/main/resources/regression/dl20-passage-cos-dpr-distil-fw.yaml new file mode 100644 index 0000000000..fb8737455a --- /dev/null +++
b/src/main/resources/regression/dl20-passage-cos-dpr-distil-fw.yaml @@ -0,0 +1,69 @@ +--- +corpus: msmarco-passage-cos-dpr-distil +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar +download_checksum: e20ffbc8b5e7f760af31298aefeaebbd + +index_path: indexes/lucene-index.msmarco-passage-cos-dpr-distil.fw-40/ +index_type: inverted-dense +collection_class: JsonDenseVectorCollection +generator_class: InvertedDenseVectorDocumentGenerator +index_threads: 16 +index_options: -encoding fw -fw.q 40 +index_stats: + documents: 8841823 + documents (non-empty): 8841823 + total terms: 2358948522 + +metrics: + - metric: AP@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl20 + path: topics.dl20.cos-dpr-distil.jsonl.gz + qrel: qrels.dl20-passage.txt + +models: + - name: cos-dpr-distil-fw-40 + display: cosDPR-distill + type: inverted-dense + params: -topicfield vector -encoding fw -fw.q 40 -hits 1000 + results: + AP@1000: + - 0.4597 + nDCG@10: + - 0.6666 + R@100: + - 0.6909 + R@1000: + - 0.8194 diff --git a/src/main/resources/regression/dl20-passage-cos-dpr-distil-onnx.yaml 
b/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-onnx.yaml similarity index 94% rename from src/main/resources/regression/dl20-passage-cos-dpr-distil-onnx.yaml rename to src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-onnx.yaml index cd91340869..3cd457b2ee 100644 --- a/src/main/resources/regression/dl20-passage-cos-dpr-distil-onnx.yaml +++ b/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw-onnx.yaml @@ -6,10 +6,11 @@ download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr- download_checksum: e20ffbc8b5e7f760af31298aefeaebbd index_path: indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ +index_type: hnsw collection_class: JsonDenseVectorCollection generator_class: LuceneDenseVectorDocumentGenerator index_threads: 16 -index_options: -M 16 -efC 100 -encoder CosDprDistil +index_options: -M 16 -efC 100 metrics: - metric: AP@1000 @@ -49,8 +50,9 @@ topics: qrel: qrels.dl20-passage.txt models: - - name: cos-dpr-distil + - name: cos-dpr-distil-hnsw display: cosDPR-distil + type: hnsw params: -querygenerator VectorQueryGenerator -topicfield title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil results: AP@1000: diff --git a/src/main/resources/regression/dl20-passage-cos-dpr-distil.yaml b/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw.yaml similarity index 96% rename from src/main/resources/regression/dl20-passage-cos-dpr-distil.yaml rename to src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw.yaml index 406897a20d..5c4ddd872e 100644 --- a/src/main/resources/regression/dl20-passage-cos-dpr-distil.yaml +++ b/src/main/resources/regression/dl20-passage-cos-dpr-distil-hnsw.yaml @@ -6,6 +6,7 @@ download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr- download_checksum: e20ffbc8b5e7f760af31298aefeaebbd index_path: indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ +index_type: hnsw collection_class: JsonDenseVectorCollection generator_class: 
LuceneDenseVectorDocumentGenerator index_threads: 16 @@ -49,8 +50,9 @@ topics: qrel: qrels.dl20-passage.txt models: - - name: cos-dpr-distil + - name: cos-dpr-distil-hnsw display: cosDPR-distil + type: hnsw params: -querygenerator VectorQueryGenerator -topicfield vector -threads 16 -hits 1000 -efSearch 1000 results: AP@1000: diff --git a/src/main/resources/regression/dl20-passage-cos-dpr-distil-lexlsh.yaml b/src/main/resources/regression/dl20-passage-cos-dpr-distil-lexlsh.yaml new file mode 100644 index 0000000000..8cd6132361 --- /dev/null +++ b/src/main/resources/regression/dl20-passage-cos-dpr-distil-lexlsh.yaml @@ -0,0 +1,69 @@ +--- +corpus: msmarco-passage-cos-dpr-distil +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar +download_checksum: e20ffbc8b5e7f760af31298aefeaebbd + +index_path: indexes/lucene-index.msmarco-passage-cos-dpr-distil.lexlsh-600/ +index_type: inverted-dense +collection_class: JsonDenseVectorCollection +generator_class: InvertedDenseVectorDocumentGenerator +index_threads: 16 +index_options: -encoding lexlsh -lexlsh.b 600 +index_stats: + documents: 8841823 + documents (non-empty): 8841823 + total terms: 5305093800 + +metrics: + - metric: AP@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m map -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: nDCG@10 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m ndcg_cut.10 -c + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m recall.100 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -m recall.1000 -c -l 2 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: 
JsonIntVector +topics: + - name: "[DL20 (Passage)](https://trec.nist.gov/data/deep2020.html)" + id: dl20 + path: topics.dl20.cos-dpr-distil.jsonl.gz + qrel: qrels.dl20-passage.txt + +models: + - name: cos-dpr-distil-lexlsh-600 + display: cosDPR-distill + type: inverted-dense + params: -topicfield vector -encoding lexlsh -lexlsh.b 600 -hits 1000 + results: + AP@1000: + - 0.4486 + nDCG@10: + - 0.6569 + R@100: + - 0.6662 + R@1000: + - 0.8131 diff --git a/src/main/resources/regression/dl20-passage-openai-ada2.yaml b/src/main/resources/regression/dl20-passage-openai-ada2.yaml index 97c696f79c..be195093d3 100644 --- a/src/main/resources/regression/dl20-passage-openai-ada2.yaml +++ b/src/main/resources/regression/dl20-passage-openai-ada2.yaml @@ -6,6 +6,7 @@ download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-a download_checksum: a4d843d522ff3a3af7edbee789a63402 index_path: indexes/lucene-hnsw.msmarco-passage-openai-ada2/ +index_type: hnsw collection_class: JsonDenseVectorCollection generator_class: LuceneDenseVectorDocumentGenerator index_threads: 16 @@ -51,6 +52,7 @@ topics: models: - name: openai-ada2 display: OpenAI-ada2 + type: hnsw params: -querygenerator VectorQueryGenerator -topicfield vector -threads 16 -hits 1000 -efSearch 1000 results: AP@1000: diff --git a/src/main/resources/regression/msmarco-passage-cos-dpr-distil-fw.yaml b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-fw.yaml new file mode 100644 index 0000000000..f0e9bed0ba --- /dev/null +++ b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-fw.yaml @@ -0,0 +1,69 @@ +--- +corpus: msmarco-passage-cos-dpr-distil +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar +download_checksum: e20ffbc8b5e7f760af31298aefeaebbd + +index_path: indexes/lucene-index.msmarco-passage-cos-dpr-distil.fw-40/ +index_type: inverted-dense +collection_class: 
JsonDenseVectorCollection +generator_class: InvertedDenseVectorDocumentGenerator +index_threads: 16 +index_options: -encoding fw -fw.q 40 +index_stats: + documents: 8841823 + documents (non-empty): 8841823 + total terms: 2358948522 + +metrics: + - metric: AP@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: RR@10 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -M 10 -m recip_rank + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m recall.100 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)" + id: dev + path: topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.gz + qrel: qrels.msmarco-passage.dev-subset.txt + +models: + - name: cos-dpr-distil-fw-40 + display: cosDPR-distill + type: inverted-dense + params: -topicfield vector -encoding fw -fw.q 40 -hits 1000 + results: + AP@1000: + - 0.3654 + RR@10: + - 0.3605 + R@100: + - 0.8711 + R@1000: + - 0.9668 diff --git a/src/main/resources/regression/msmarco-passage-cos-dpr-distil-onnx.yaml b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-onnx.yaml similarity index 95% rename from src/main/resources/regression/msmarco-passage-cos-dpr-distil-onnx.yaml rename to src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-onnx.yaml index f2ef571b1c..d405ed98b6 100644 --- a/src/main/resources/regression/msmarco-passage-cos-dpr-distil-onnx.yaml +++ b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw-onnx.yaml @@ -6,6 +6,7 @@ 
download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr- download_checksum: e20ffbc8b5e7f760af31298aefeaebbd index_path: indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ +index_type: hnsw collection_class: JsonDenseVectorCollection generator_class: LuceneDenseVectorDocumentGenerator index_threads: 16 @@ -49,8 +50,9 @@ topics: qrel: qrels.msmarco-passage.dev-subset.txt models: - - name: cos-dpr-distil-onnx - display: cosDPR-distil (ONNX) + - name: cos-dpr-distil-hnsw + display: cosDPR-distil + type: hnsw params: -querygenerator VectorQueryGenerator -topicfield title -threads 16 -hits 1000 -efSearch 1000 -encoder CosDprDistil results: AP@1000: diff --git a/src/main/resources/regression/msmarco-passage-cos-dpr-distil.yaml b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw.yaml similarity index 96% rename from src/main/resources/regression/msmarco-passage-cos-dpr-distil.yaml rename to src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw.yaml index a73e5ef516..9464a90c38 100644 --- a/src/main/resources/regression/msmarco-passage-cos-dpr-distil.yaml +++ b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-hnsw.yaml @@ -6,6 +6,7 @@ download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr- download_checksum: e20ffbc8b5e7f760af31298aefeaebbd index_path: indexes/lucene-hnsw.msmarco-passage-cos-dpr-distil/ +index_type: hnsw collection_class: JsonDenseVectorCollection generator_class: LuceneDenseVectorDocumentGenerator index_threads: 16 @@ -49,8 +50,9 @@ topics: qrel: qrels.msmarco-passage.dev-subset.txt models: - - name: cos-dpr-distil + - name: cos-dpr-distil-hnsw display: cosDPR-distil + type: hnsw params: -querygenerator VectorQueryGenerator -topicfield vector -threads 16 -hits 1000 -efSearch 1000 results: AP@1000: diff --git a/src/main/resources/regression/msmarco-passage-cos-dpr-distil-lexlsh.yaml b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-lexlsh.yaml new file 
mode 100644 index 0000000000..57bdbf833e --- /dev/null +++ b/src/main/resources/regression/msmarco-passage-cos-dpr-distil-lexlsh.yaml @@ -0,0 +1,69 @@ +--- +corpus: msmarco-passage-cos-dpr-distil +corpus_path: collections/msmarco/msmarco-passage-cos-dpr-distil/ + +download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-cos-dpr-distil.tar +download_checksum: e20ffbc8b5e7f760af31298aefeaebbd + +index_path: indexes/lucene-index.msmarco-passage-cos-dpr-distil.lexlsh-600/ +index_type: inverted-dense +collection_class: JsonDenseVectorCollection +generator_class: InvertedDenseVectorDocumentGenerator +index_threads: 16 +index_options: -encoding lexlsh -lexlsh.b 600 +index_stats: + documents: 8841823 + documents (non-empty): 8841823 + total terms: 5305093800 + +metrics: + - metric: AP@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m map + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: RR@10 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -M 10 -m recip_rank + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@100 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m recall.100 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + - metric: R@1000 + command: tools/eval/trec_eval.9.0.4/trec_eval + params: -c -m recall.1000 + separator: "\t" + parse_index: 2 + metric_precision: 4 + can_combine: false + +topic_reader: JsonIntVector +topics: + - name: "[MS MARCO Passage: Dev](https://github.com/microsoft/MSMARCO-Passage-Ranking)" + id: dev + path: topics.msmarco-passage.dev-subset.cos-dpr-distil.jsonl.gz + qrel: qrels.msmarco-passage.dev-subset.txt + +models: + - name: cos-dpr-distil-lexlsh-600 + display: cosDPR-distill + type: inverted-dense + params: -topicfield vector -encoding lexlsh -lexlsh.b 600 -hits 1000 + results: + AP@1000: + - 0.3509 + RR@10: + - 0.3457 + R@100: + - 0.8615 + R@1000: + - 0.9596 diff --git 
a/src/main/resources/regression/msmarco-passage-openai-ada2.yaml b/src/main/resources/regression/msmarco-passage-openai-ada2.yaml index 77549dc6be..20ff6f3899 100644 --- a/src/main/resources/regression/msmarco-passage-openai-ada2.yaml +++ b/src/main/resources/regression/msmarco-passage-openai-ada2.yaml @@ -6,6 +6,7 @@ download_url: https://rgw.cs.uwaterloo.ca/pyserini/data/msmarco-passage-openai-a download_checksum: a4d843d522ff3a3af7edbee789a63402 index_path: indexes/lucene-hnsw.msmarco-passage-openai-ada2/ +index_type: hnsw collection_class: JsonDenseVectorCollection generator_class: LuceneDenseVectorDocumentGenerator index_threads: 16 @@ -51,6 +52,7 @@ topics: models: - name: openai-ada2 display: OpenAI-ada2 + type: hnsw params: -querygenerator VectorQueryGenerator -topicfield vector -threads 16 -hits 1000 -efSearch 1000 results: AP@1000: diff --git a/src/test/java/io/anserini/doc/DataModel.java b/src/test/java/io/anserini/doc/DataModel.java index 144c0d7b9d..c51dffe4d2 100755 --- a/src/test/java/io/anserini/doc/DataModel.java +++ b/src/test/java/io/anserini/doc/DataModel.java @@ -27,9 +27,11 @@ public class DataModel { private static final String INDEX_COMMAND = "target/appassembler/bin/IndexCollection"; private static final String INDEX_HNSW_COMMAND = "target/appassembler/bin/IndexHnswDenseVectors"; + private static final String INDEX_INVERTED_DENSE_COMMAND = "target/appassembler/bin/IndexInvertedDenseVectors"; private static final String SEARCH_COMMAND = "target/appassembler/bin/SearchCollection"; private static final String SEARCH_HNSW_COMMAND = "target/appassembler/bin/SearchHnswDenseVectors"; + private static final String SEARCH_INVERTED_DENSE_COMMAND = "target/appassembler/bin/SearchInvertedDenseVectors"; private String corpus; private String corpus_path; @@ -79,6 +81,7 @@ public void setDownload_corpus(String download_corpus) { } private String index_path; + private String index_type; private String collection_class; private String generator_class; 
private int index_threads; @@ -93,6 +96,14 @@ public void setIndex_path(String index_path) { this.index_path = index_path; } + public String getIndex_type() { + return index_type; + } + + public void setIndex_type(String index_type) { + this.index_type = index_type; + } + public String getCollection_class() { return collection_class; } @@ -205,14 +216,18 @@ static class Topic { static class Model { private String name; private String display; + private String type; private String params; private Map> results; public String getName() { return name; } public void setName(String name) { this.name = name; } - public Map> getResults() { return results; } - public void setDisplay(String display) { this.display = display; } public String getDisplay() { return display; } + public void setDisplay(String display) { this.display = display; } + public String getType() { return type; } + public void setType(String type) { this.type = type; } + + public Map> getResults() { return results; } public void setResults(Map> results) { this.results = results; } public String getParams() { return params; } public void setParams(String params) { this.params = params; } @@ -263,8 +278,10 @@ static class Metric { public String generateIndexingCommand(String collection) { String indexCommand = INDEX_COMMAND; - if (getCollection_class().equals("JsonDenseVectorCollection")) { + if ("hnsw".equals(getIndex_type())) { indexCommand = INDEX_HNSW_COMMAND; + } else if ("inverted-dense".equals(getIndex_type())) { + indexCommand = INDEX_INVERTED_DENSE_COMMAND; } StringBuilder builder = new StringBuilder(); @@ -300,8 +317,10 @@ public String generateRankingCommand(String collection) { for (Model model : getModels()) { for (Topic topic : getTopics()) { String searchCommand = SEARCH_COMMAND; - if (model.getParams().contains("VectorQueryGenerator")) { + if ("hnsw".equals(model.getType())) { searchCommand = SEARCH_HNSW_COMMAND; + } else if ("inverted-dense".equals(model.getType())) { + searchCommand = 
SEARCH_INVERTED_DENSE_COMMAND; } builder.append(searchCommand).append(" \\\n"); builder.append(" -index").append(" ").append(getIndex_path()).append(" \\\n"); @@ -416,7 +435,7 @@ public String generateEffectiveness(String collection) { builder.append(String.format("| %1$-109s|", topic.getName())); for (Model model : getModels()) { // 3 digits for HNSW, 4 otherwise: - if (getCollection_class().equals("JsonDenseVectorCollection")) { + if ("hnsw".equals(getIndex_type())) { builder.append(String.format(" %-10.3f|", model.getResults().get(eval.getMetric()).get(i))); } else { builder.append(String.format(" %-10.4f|", model.getResults().get(eval.getMetric()).get(i))); diff --git a/src/test/java/io/anserini/doc/GenerateRegressionDocsTest.java b/src/test/java/io/anserini/doc/GenerateRegressionDocsTest.java index 4a46af5c24..07f85b3d3d 100755 --- a/src/test/java/io/anserini/doc/GenerateRegressionDocsTest.java +++ b/src/test/java/io/anserini/doc/GenerateRegressionDocsTest.java @@ -35,7 +35,6 @@ public void main() throws Exception { URL templatesRoot = GenerateRegressionDocsTest.class.getResource("/docgen/templates/"); for (final File fileEntry : new File(templatesRoot.toURI()).listFiles()) { - String fileName = fileEntry.getName(); // This is the name of the test, which can be different from the name of the collection, // e.g., multiple topics run on the same collection. 
String testName = fileEntry.getName().replaceAll(".template", ""); diff --git a/src/test/java/io/anserini/index/IndexInvertedDenseVectorsTest.java b/src/test/java/io/anserini/index/IndexInvertedDenseVectorsTest.java index fe8955216b..7d51c16044 100644 --- a/src/test/java/io/anserini/index/IndexInvertedDenseVectorsTest.java +++ b/src/test/java/io/anserini/index/IndexInvertedDenseVectorsTest.java @@ -21,13 +21,18 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import org.apache.logging.log4j.core.config.Configurator; +import org.apache.lucene.index.IndexReader; import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; +import java.io.ByteArrayOutputStream; +import java.io.PrintStream; import java.util.LinkedList; import java.util.List; +import java.util.Map; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; /** @@ -37,6 +42,19 @@ public class IndexInvertedDenseVectorsTest { private static final Logger LOGGER = LogManager.getLogger(IndexInvertedDenseVectors.class); private static CustomAppender APPENDER; + private final ByteArrayOutputStream err = new ByteArrayOutputStream(); + private PrintStream save; + + private void redirectStderr() { + save = System.err; + err.reset(); + System.setErr(new PrintStream(err)); + } + + private void restoreStderr() { + System.setErr(save); + } + @BeforeClass public static void setupClass() { APPENDER = new CustomAppender("CustomAppender"); @@ -48,78 +66,113 @@ public static void setupClass() { } @Test - public void indexFWTest() throws Exception { - createIndex("target/idx-sample-fw" + System.currentTimeMillis(), "fw", false); - assertTrue(APPENDER.getLastLog().contains("Total 4 documents indexed")); + public void testEmptyInvocation() throws Exception { + redirectStderr(); + String[] indexArgs = new String[] {}; + + err.reset(); + IndexInvertedDenseVectors.main(indexArgs); + assertTrue(err.toString().contains("Example: 
IndexInvertedDenseVectors")); + + restoreStderr(); } - @Test - public void indexFWStoredTest() throws Exception { - createIndex("target/idx-sample-fw" + System.currentTimeMillis(), "fw", false); - assertTrue(APPENDER.getLastLog().contains("Total 4 documents indexed")); + @Test(expected = ClassNotFoundException.class) + public void testInvalidCollection() throws Exception { + String[] indexArgs = new String[] { + "-collection", "FakeCollection", + "-input", "src/test/resources/sample_docs/openai_ada2/json_vector", + "-generator", "InvertedDenseVectorDocumentGenerator", + "-index", "target/idx-sample-ll-vector" + System.currentTimeMillis(), + "-encoding", "lexlsh" + }; + + IndexInvertedDenseVectors.main(indexArgs); } - @Test - public void indexLLTest() throws Exception { - createIndex("target/idx-sample-ll" + System.currentTimeMillis(), "lexlsh", false); - assertTrue(APPENDER.getLastLog().contains("Total 4 documents indexed")); + @Test(expected = RuntimeException.class) + public void testCollectionPath() throws Exception { + String[] indexArgs = new String[] { + "-collection", "JsonDenseVectorCollection", + "-input", "invalid/path", + "-generator", "InvertedDenseVectorDocumentGenerator", + "-index", "target/idx-sample-ll-vector" + System.currentTimeMillis(), + "-encoding", "lexlsh" + }; + + IndexInvertedDenseVectors.main(indexArgs); } - @Test - public void indexLLStoredTest() throws Exception { - createIndex("target/idx-sample-ll" + System.currentTimeMillis(), "lexlsh", false); - assertTrue(APPENDER.getLastLog().contains("Total 4 documents indexed")); + @Test(expected = ClassNotFoundException.class) + public void testInvalidGenerator() throws Exception { + String[] indexArgs = new String[] { + "-collection", "JsonDenseVectorCollection", + "-input", "src/test/resources/sample_docs/openai_ada2/json_vector", + "-generator", "FakeGenerator", + "-index", "target/idx-sample-ll-vector" + System.currentTimeMillis(), + "-encoding", "lexlsh" + }; + + 
IndexInvertedDenseVectors.main(indexArgs); } - public static void createIndex(String path, String encoding, boolean stored) throws Exception { - List args = new LinkedList<>(); - args.add("-encoding"); - args.add(encoding); - args.add("-input"); - args.add("src/test/resources/mini-word-vectors.txt"); - args.add("-index"); - args.add(path); - if (stored) { - args.add("-stored"); - } - - IndexInvertedDenseVectors.main(args.toArray(new String[0])); + @Test(expected = RuntimeException.class) + public void testInvalidEncoding() throws Exception { + String[] indexArgs = new String[] { + "-collection", "JsonDenseVectorCollection", + "-input", "src/test/resources/sample_docs/openai_ada2/json_vector", + "-generator", "InvertedDenseVectorDocumentGenerator", + "-index", "target/idx-sample-ll-vector" + System.currentTimeMillis(), + "-encoding", "xxx" + }; + + IndexInvertedDenseVectors.main(indexArgs); } @Test public void testLLCollection() throws Exception { - List args = new LinkedList<>(); - args.add("-collection"); - args.add("JsonDenseVectorCollection"); - args.add("-encoding"); - args.add("lexlsh"); - args.add("-input"); - args.add("src/test/resources/sample_docs/openai_ada2/json_vector"); - args.add("-index"); - args.add("target/idx-sample-ll-vector" + System.currentTimeMillis()); - args.add("-stored"); - - IndexInvertedDenseVectors.main(args.toArray(new String[0])); - + String indexPath = "target/idx-sample-ll-vector" + System.currentTimeMillis(); + String[] indexArgs = new String[] { + "-collection", "JsonDenseVectorCollection", + "-input", "src/test/resources/sample_docs/openai_ada2/json_vector", + "-generator", "InvertedDenseVectorDocumentGenerator", + "-index", indexPath, + "-encoding", "lexlsh" + }; + + IndexInvertedDenseVectors.main(indexArgs); assertTrue(APPENDER.getLastLog().contains("Total 100 documents indexed")); + + IndexReader reader = IndexReaderUtils.getReader(indexPath); + Map results = IndexReaderUtils.getIndexStats(reader, 
IndexInvertedDenseVectors.FIELD_VECTOR); + + assertEquals(100, results.get("documents")); + assertEquals(100, results.get("non_empty_documents")); + assertEquals(4081, (int) ((Long) results.get("unique_terms")).longValue()); + assertEquals(30000, (int) ((Long) results.get("total_terms")).longValue()); } @Test public void testFWCollection() throws Exception { - List args = new LinkedList<>(); - args.add("-collection"); - args.add("JsonDenseVectorCollection"); - args.add("-encoding"); - args.add("fw"); - args.add("-input"); - args.add("src/test/resources/sample_docs/openai_ada2/json_vector"); - args.add("-index"); - args.add("target/idx-sample-fw-vector" + System.currentTimeMillis()); - args.add("-stored"); - - IndexInvertedDenseVectors.main(args.toArray(new String[0])); - + String indexPath = "target/idx-sample-fw-vector" + System.currentTimeMillis(); + String[] indexArgs = new String[] { + "-collection", "JsonDenseVectorCollection", + "-input", "src/test/resources/sample_docs/openai_ada2/json_vector", + "-generator", "InvertedDenseVectorDocumentGenerator", + "-index", indexPath, + "-encoding", "fw" + }; + + IndexInvertedDenseVectors.main(indexArgs); assertTrue(APPENDER.getLastLog().contains("Total 100 documents indexed")); + + IndexReader reader = IndexReaderUtils.getReader(indexPath); + Map results = IndexReaderUtils.getIndexStats(reader, IndexInvertedDenseVectors.FIELD_VECTOR); + + assertEquals(100, results.get("documents")); + assertEquals(100, results.get("non_empty_documents")); + assertEquals(1460, (int) ((Long) results.get("unique_terms")).longValue()); + assertEquals(53817, (int) ((Long) results.get("total_terms")).longValue()); } @AfterClass diff --git a/src/test/java/io/anserini/search/EvaluateInvertedDenseVectorsTest.java b/src/test/java/io/anserini/search/EvaluateInvertedDenseVectorsTest.java deleted file mode 100644 index de4bb02b24..0000000000 --- a/src/test/java/io/anserini/search/EvaluateInvertedDenseVectorsTest.java +++ /dev/null @@ -1,47 +0,0 @@ 
-/* - * Anserini: A Lucene toolkit for reproducible information retrieval research - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package io.anserini.search; - -import io.anserini.index.IndexInvertedDenseVectorsTest; -import org.junit.Test; - -/** - * Tests for {@link EvaluateInvertedDenseVectors} - */ -public class EvaluateInvertedDenseVectorsTest { - - @Test - public void evalFWTest() throws Exception { - String path = "target/idx-sample-fw"; - String encoding = "fw"; - IndexInvertedDenseVectorsTest.createIndex(path, encoding, false); - String[] args = new String[]{"-encoding", encoding, "-input", "src/test/resources/mini-word-vectors.txt", "-index", - path, "-topics", "src/test/resources/sample_topics/Trec"}; - EvaluateInvertedDenseVectors.main(args); - } - - @Test - public void evalLLTest() throws Exception { - String path = "target/idx-sample-ll"; - String encoding = "lexlsh"; - IndexInvertedDenseVectorsTest.createIndex(path, encoding, false); - String[] args = new String[]{"-encoding", encoding, "-input", "src/test/resources/mini-word-vectors.txt", "-index", - path, "-topics", "src/test/resources/sample_topics/Trec"}; - EvaluateInvertedDenseVectors.main(args); - } - -} \ No newline at end of file diff --git a/src/test/java/io/anserini/search/SearchInvertedDenseVectorsTest.java b/src/test/java/io/anserini/search/SearchInvertedDenseVectorsTest.java index 1123455c6c..6a41a9451c 100644 --- 
a/src/test/java/io/anserini/search/SearchInvertedDenseVectorsTest.java +++ b/src/test/java/io/anserini/search/SearchInvertedDenseVectorsTest.java @@ -16,10 +16,20 @@ package io.anserini.search; +import io.anserini.index.IndexInvertedDenseVectors; import io.anserini.index.IndexInvertedDenseVectorsTest; import io.anserini.search.SearchInvertedDenseVectors; import org.junit.Test; +import java.io.BufferedReader; +import java.io.File; +import java.io.FileReader; +import java.io.IOException; +import java.util.LinkedList; +import java.util.List; + +import static org.junit.Assert.assertEquals; + /** * Tests for {@link SearchInvertedDenseVectors} */ @@ -27,40 +37,93 @@ public class SearchInvertedDenseVectorsTest { @Test public void searchFWTest() throws Exception { - String path = "target/idx-sample-fw" + System.currentTimeMillis(); - String encoding = "fw"; - IndexInvertedDenseVectorsTest.createIndex(path, encoding, false); - String[] args = new String[]{"-encoding", encoding, "-input", "src/test/resources/mini-word-vectors.txt", "-index", - path, "-word", "foo"}; - SearchInvertedDenseVectors.main(args); - } + String indexPath = "target/idx-sample-fw-vector-" + System.currentTimeMillis(); + String[] indexArgs = new String[] { + "-collection", "JsonDenseVectorCollection", + "-input", "src/test/resources/sample_docs/openai_ada2/json_vector", + "-generator", "InvertedDenseVectorDocumentGenerator", + "-index", indexPath, + "-encoding", "fw" + }; + IndexInvertedDenseVectors.main(indexArgs); - @Test - public void searchFWStoredTest() throws Exception { - String path = "target/idx-sample-fw-stored" + System.currentTimeMillis(); - String encoding = "fw"; - IndexInvertedDenseVectorsTest.createIndex(path, encoding, true); - String[] args = new String[]{"-encoding", encoding, "-stored", "-index", path, "-word", "foo"}; - SearchInvertedDenseVectors.main(args); + String runfile = "target/run-" + System.currentTimeMillis(); + String[] searchArgs = new String[] { + "-index", indexPath, + 
"-topics", "src/test/resources/sample_topics/sample-topics.msmarco-passage-dev-cos-dpr-distil.jsonl", + "-output", runfile, + "-topicreader", "JsonIntVector", + "-topicfield", "vector", + "-hits", "5", + "-encoding", "fw"}; + SearchInvertedDenseVectors.main(searchArgs); + + check(runfile, new String[] { + "2 Q0 26 1 21.478451 Anserini", + "2 Q0 122 2 19.947021 Anserini", + "2 Q0 71 3 19.537197 Anserini", + "2 Q0 80 4 19.263186 Anserini", + "2 Q0 74 5 19.188883 Anserini", + "1048585 Q0 30 1 21.119457 Anserini", + "1048585 Q0 114 2 20.725464 Anserini", + "1048585 Q0 36 3 20.413668 Anserini", + "1048585 Q0 4 4 20.092403 Anserini", + "1048585 Q0 13 5 20.087444 Anserini" + }); + + new File(runfile).delete(); } @Test public void searchLLTest() throws Exception { - String path = "target/idx-sample-ll" + System.currentTimeMillis(); - String encoding = "lexlsh"; - IndexInvertedDenseVectorsTest.createIndex(path, encoding, false); - String[] args = new String[]{"-encoding", encoding, "-input", "src/test/resources/mini-word-vectors.txt", "-index", - path, "-word", "foo"}; - SearchInvertedDenseVectors.main(args); + String indexPath = "target/idx-sample-fw-vector-" + System.currentTimeMillis(); + String[] indexArgs = new String[] { + "-collection", "JsonDenseVectorCollection", + "-input", "src/test/resources/sample_docs/openai_ada2/json_vector", + "-generator", "InvertedDenseVectorDocumentGenerator", + "-index", indexPath, + "-encoding", "lexlsh" + }; + IndexInvertedDenseVectors.main(indexArgs); + + String runfile = "target/run-" + System.currentTimeMillis(); + String[] searchArgs = new String[] { + "-index", indexPath, + "-topics", "src/test/resources/sample_topics/sample-topics.msmarco-passage-dev-cos-dpr-distil.jsonl", + "-output", runfile, + "-topicreader", "JsonIntVector", + "-topicfield", "vector", + "-hits", "5", + "-encoding", "lexlsh"}; + SearchInvertedDenseVectors.main(searchArgs); + + check(runfile, new String[] { + "2 Q0 14 1 43.783421 Anserini", + "2 Q0 17 2 
42.912968 Anserini", + "2 Q0 5 3 42.801838 Anserini", + "2 Q0 6 4 41.686707 Anserini", + "2 Q0 65 5 41.679508 Anserini", + "1048585 Q0 99 1 44.071457 Anserini", + "1048585 Q0 50 2 40.613106 Anserini", + "1048585 Q0 4 3 39.676960 Anserini", + "1048585 Q0 10 4 39.406578 Anserini", + "1048585 Q0 6 5 38.794933 Anserini" + }); + + new File(runfile).delete(); } - @Test - public void searchLLStoredTest() throws Exception { - String path = "target/idx-sample-ll" + System.currentTimeMillis(); - String encoding = "lexlsh"; - IndexInvertedDenseVectorsTest.createIndex(path, encoding, true); - String[] args = new String[]{"-encoding", encoding, "-stored", "-index", path, "-word", "foo"}; - SearchInvertedDenseVectors.main(args); + protected void check(String output, String[] ref) throws IOException { + BufferedReader br = new BufferedReader(new FileReader(output)); + + int cnt = 0; + String s; + while ((s = br.readLine()) != null) { + assertEquals(ref[cnt], s); + cnt++; + } + + assertEquals(cnt, ref.length); } } \ No newline at end of file diff --git a/src/test/resources/mini-word-vectors.txt b/src/test/resources/mini-word-vectors.txt deleted file mode 100644 index cec4c9739a..0000000000 --- a/src/test/resources/mini-word-vectors.txt +++ /dev/null @@ -1,4 +0,0 @@ -simple 0.3 0.2 0.2 0.9 -foo 0.1 0.2 0.4 0.4 -text 0.2 0.2 0.1 0.9 -simple 0.3 0.2 0.1 0.9 \ No newline at end of file diff --git a/src/test/resources/sample_topics/sample-topics.msmarco-passage-dev-cos-dpr-distil.jsonl b/src/test/resources/sample_topics/sample-topics.msmarco-passage-dev-cos-dpr-distil.jsonl new file mode 100644 index 0000000000..3e2e5e55b3 --- /dev/null +++ b/src/test/resources/sample_topics/sample-topics.msmarco-passage-dev-cos-dpr-distil.jsonl @@ -0,0 +1,2 @@ +{"qid": "1048585", "vector": [0.04641808196902275, 0.003154488280415535, -0.015572609379887581, -0.019383687525987625, 0.010062959976494312, 0.015643978491425514, -0.03910994529724121, -0.07936178892850876, 0.011376140639185905, 
-0.019526425749063492, 0.08278747648000717, -0.03745419532060623, 0.046360988169908524, 0.00815742090344429, 0.0671434998512268, 0.013124669902026653, 0.00408941600471735, -0.014673366211354733, -0.003067061770707369, 0.029832039028406143, -0.04581858590245247, -0.02714858204126358, -0.006687229499220848, 0.018912656232714653, -0.0017868891591206193, 0.03545587882399559, -0.010883698239922523, -0.00017161897267214954, -0.024393757805228233, 0.020596953108906746, -0.05078583583235741, -0.001912676147185266, 0.015458419919013977, -0.028647322207689285, -0.0758219063282013, 0.020211562514305115, 0.060977257788181305, 0.016900064423680305, -0.03828207030892372, -0.08278747648000717, -0.01767084375023842, -0.03037443943321705, 0.03793950006365776, 0.03405705466866493, -0.008100326173007488, -0.02597813867032528, -0.008278748020529747, 0.027919363230466843, 0.052669964730739594, 0.044790878891944885, -0.004649658687412739, -0.009277907200157642, -0.0331435389816761, 0.02723422460258007, -0.009227949194610119, 0.013096122071146965, 0.03936687111854553, -0.001852012937888503, 0.013345912098884583, 0.03223001956939697, -0.014816103503108025, -0.03551297262310982, 0.04250708594918251, -0.007650704588741064, 0.03157342970371246, -0.013110395520925522, -0.035484425723552704, 0.052099015563726425, -0.00486019579693675, 0.02627788670361042, -0.04070860147476196, 0.013517196290194988, 0.051014214754104614, 0.03816788271069527, -0.004778122063726187, 0.035998277366161346, 0.00942064356058836, -0.08313004672527313, 0.03402850776910782, 0.032001640647649765, 0.04873042181134224, -0.025778306648135185, -0.00026941613759845495, -0.029346732422709465, 0.023037755861878395, 0.008235926739871502, -0.012132647447288036, -0.024036915972828865, -0.008371526375412941, -0.027648162096738815, 0.03976653516292572, 0.010262791998684406, -0.008357252925634384, 0.02663472853600979, -0.021981501951813698, 0.059321507811546326, 0.03728291019797325, -0.008842558600008488, 0.06788572669029236, 
0.018413076177239418, -0.01477328222244978, 0.009627612307667732, -0.05315526947379112, -0.0009144090581685305, -0.001293554320000112, -0.0016245257575064898, 0.001562970457598567, 0.028204835951328278, -0.029575111344456673, 0.006651545409113169, 0.0004250887141097337, -0.0178421288728714, 0.016286294907331467, 0.027434056624770164, 0.014309386722743511, -0.06211915239691734, 0.077763132750988, 0.04875896871089935, 0.025121716782450676, 0.04921572655439377, -0.03322917968034744, 0.028647322207689285, 0.03211583197116852, 0.029261089861392975, -0.03014606051146984, -0.01918385736644268, 0.03534168750047684, 0.032258566468954086, 0.025078896433115005, -0.0228379238396883, 0.037654027342796326, -0.06857086718082428, -0.009934497065842152, -0.005527491215616465, 0.006480261217802763, 0.005227743647992611, 0.010605361312627792, 0.044134289026260376, -0.05960698425769806, -0.010127192363142967, -0.014644819311797619, 0.007008388172835112, -0.02132491208612919, 0.03248694911599159, -0.028090646490454674, -0.008799737319350243, -0.017128443345427513, 0.04319222271442413, -0.04795964062213898, 0.02519308589398861, -0.07850536704063416, 0.004200037103146315, -0.008628453128039837, 0.025735486298799515, 0.024736326187849045, -0.026748917996883392, 0.021110806614160538, 0.03793950006365776, -0.00927076954394579, 0.042135968804359436, -0.043877363204956055, 0.033172085881233215, -0.006972703617066145, 0.005138533189892769, 0.015558335930109024, -0.026477718725800514, -0.013410143554210663, -0.006701503414660692, -0.009227949194610119, -0.017171263694763184, 0.055324871093034744, 0.012410984374582767, -0.04473378509283066, 0.0007841615588404238, 0.030260249972343445, -0.002499682130292058, -0.056409671902656555, -0.019997457042336464, -0.07108303904533386, -0.037397101521492004, 0.0601208359003067, -0.0350562147796154, 0.01455204002559185, 0.0031437829602509737, 0.08021821081638336, 0.007069051265716553, -0.07148270308971405, -0.05843653902411461, -0.03808223828673363, 
-0.031002482399344444, 0.018341707065701485, -0.06240462884306908, -0.044362667948007584, 0.05897894129157066, -0.004788827151060104, -0.004899448249489069, -0.015715347602963448, 0.02245253510773182, -0.006790713872760534, 0.005234880372881889, -0.01159738376736641, -0.03679760545492172, -0.03194454684853554, -0.024550769478082657, 0.013781259767711163, -0.018413076177239418, -0.019212404265999794, -0.051927730441093445, 0.008407210931181908, -0.015387051738798618, -0.05755157023668289, -0.022152787074446678, -0.02258099801838398, 0.0356842577457428, 0.03248694911599159, 0.0026888088323175907, -0.014366481453180313, -0.037225816398859024, 0.023694345727562904, 0.05660950392484665, -0.0041572158224880695, -0.011611657217144966, 0.012525173835456371, 0.015187219716608524, 0.01744246482849121, -0.01748528517782688, 0.00867841113358736, 0.02370862103998661, 0.015315682627260685, -0.03517040237784386, 0.014958840794861317, 0.014073871076107025, -0.025806855410337448, 0.061319828033447266, 0.03220147266983986, -0.016200652346014977, -0.014352208003401756, -0.039395418018102646, 0.04521908983588219, -0.03771112114191055, -0.022766556590795517, -0.0341997928917408, -0.03659777343273163, -0.011369003914296627, 0.06440294533967972, 0.05738028511404991, -0.029603660106658936, -0.041850496083498, 0.0936925858259201, 0.008821148425340652, -0.050100695341825485, 0.0911233201622963, 0.04145083203911781, 0.04045167192816734, -0.034485265612602234, -0.009042390622198582, 0.020811058580875397, 0.0548110194504261, -0.07684961706399918, 0.03454235941171646, 0.014287976548075676, 0.004292815923690796, -0.009392096661031246, 0.05718045309185982, -0.016157831996679306, -0.036911796778440475, -0.03434252738952637, 0.03688324615359306, 0.026206517592072487, 0.049187179654836655, 0.009620475582778454, -0.08529964834451675, -0.03853899613022804, -0.015101577155292034, -0.007329546380788088, -0.0031919567845761776, 0.023994093760848045, 0.019683435559272766, -0.03885301947593689, 
-0.048102378845214844, 0.01835598237812519, -0.0582081601023674, 0.04410574212670326, 0.028861427679657936, -0.00412153173238039, 0.07153979688882828, -0.03128795698285103, -0.05472537502646446, -0.010726687498390675, 0.030659914016723633, 0.06126273050904274, 0.003004614496603608, -0.01992608979344368, -0.03842480853199959, 0.03796805068850517, -0.02266664057970047, -0.04447685927152634, 0.0016825126949697733, -0.059150222688913345, -0.01017715036869049, 0.040879882872104645, -0.06799992173910141, 0.07656414061784744, -0.07610738277435303, 0.009377822279930115, -0.027005845680832863, -0.03163052350282669, -0.016471853479743004, 0.04815947264432907, -0.02487906441092491, -0.061719492077827454, -0.027005845680832863, 0.004952974617481232, 0.024507947266101837, -0.04924427345395088, 0.026078054681420326, 0.0038431943394243717, 0.03988072648644447, -0.01276782713830471, -0.03702598437666893, 0.038881566375494, -0.09106622636318207, -0.021267816424369812, -0.016771601513028145, -0.02036857418715954, -0.05149951949715614, -0.011533151380717754, -0.037225816398859024, -0.0539545975625515, 0.08512835949659348, -0.0017628022469580173, 0.067543163895607, -0.08861114829778671, -0.023994093760848045, 0.0335717499256134, -0.054354261606931686, 0.021067984402179718, 0.015101577155292034, 0.014901746064424515, -0.019326593726873398, -0.02627788670361042, 0.025778306648135185, -0.007197514642030001, -0.058579277247190475, -0.0178421288728714, 0.0264491718262434, 0.015401325188577175, -0.032915160059928894, -0.020825332030653954, 0.0322871170938015, -0.006537355948239565, -0.13223157823085785, 0.06834249198436737, -0.024393757805228233, -0.046275343745946884, -0.017556654289364815, -0.030517175793647766, -0.04459104686975479, -0.022523902356624603, -0.049929410219192505, 0.02680601365864277, -0.027048666030168533, 0.005006501451134682, 0.012039868161082268, 0.018998298794031143, 0.03537023440003395, -0.05378331243991852, 0.04607551172375679, 0.01896975003182888, 
0.013317364268004894, -0.039138492196798325, -0.02075396291911602, 0.019298046827316284, -0.029660753905773163, -0.009185127913951874, -0.03268677741289139, -0.018898382782936096, 0.046532269567251205, -0.005866491701453924, 0.016828695312142372, 0.004699616692960262, 0.0037825312465429306, -0.007700662594288588, -0.014844650402665138, 0.026520539075136185, 0.022638091817498207, 0.00634109228849411, -0.0409940741956234, -0.021467648446559906, -0.05087147653102875, 0.08524255454540253, -0.0012230778811499476, 0.03768257424235344, -0.021681753918528557, -0.013688480481505394, -0.03990927338600159, -0.0495011992752552, -0.01614355854690075, -0.07062628120183945, -0.006283997558057308, -0.04225016012787819, 0.025478558614850044, -0.0008546379394829273, 0.019155308604240417, -0.06046340614557266, -0.01425229199230671, -0.0020197289995849133, -0.07325264066457748, -0.04390591010451317, -0.02370862103998661, 0.04302094131708145, -0.05703771486878395, -0.023480240255594254, -0.012446668930351734, -0.019726257771253586, 0.07068337500095367, 0.038310617208480835, 0.01144037302583456, -0.02141055464744568, -0.00471032178029418, 0.02132491208612919, 0.00324726733379066, -0.019911814481019974, -0.045790039002895355, -0.021738849580287933, -0.024821968749165535, -0.03945251554250717, -0.006583745591342449, -0.01744246482849121, -0.06263300776481628, 0.05729464441537857, -0.02710576169192791, -0.0424785390496254, -0.08861114829778671, -0.06742896884679794, 0.018070507794618607, 0.025735486298799515, -0.009235085919499397, 0.01692861132323742, -0.02279510349035263, -0.05187063664197922, -0.032087285071611404, 0.005406165029853582, 0.018898382782936096, -0.006184081546962261, 0.0019001866457983851, -0.015058756805956364, 0.0013800886226817966, 0.02727704495191574, 0.04159357026219368, 0.048102378845214844, 0.021567564457654953, -0.04441976174712181, -0.0467321015894413, 0.057066261768341064, -0.003832489252090454, 0.0743374451994896, 0.010641044937074184, 0.02328040823340416, 
0.048102378845214844, 0.02205287106335163, -0.01089797168970108, -0.03494202345609665, 0.008136010728776455, 0.039309777319431305, 0.054496996104717255, 0.0028154877945780754, 0.020211562514305115, -0.05994955077767372, -0.04621824994683266, 0.0014773282455280423, 0.005017206538468599, -0.019083941355347633, 0.013338775373995304, 0.012996206060051918, 0.012753553688526154, 0.03833916410803795, 0.006351797841489315, 0.02744833007454872, 0.07182527333498001, 0.011782941408455372, -0.003067061770707369, 0.012389574199914932, 0.0038110786117613316, 0.03671196475625038, 0.07770603895187378, 0.0009411723003722727, -0.014801830053329468, -0.04550456255674362, 0.0031277250964194536, -0.01922667771577835, 0.025721212849020958, -0.024550769478082657, -0.05321236327290535, -0.016543220728635788, -0.045618753880262375, -0.021396281197667122, -0.030574271455407143, 0.009470601566135883, -0.019826173782348633, -0.01731400191783905, -0.03351465240120888, 0.03340046480298042, -0.05344074219465256, -0.04484797269105911, -0.0028476037550717592, 0.06720059365034103, 0.051014214754104614, -0.030745554715394974, 0.028433214873075485, 0.021824492141604424, -0.04598987102508545, 0.046703554689884186, 0.009756076149642467, 0.021167900413274765, 0.03023170307278633, -0.007261746097356081, 0.06183367967605591, -0.04293529689311981, -0.010227108374238014, 0.016628863289952278, -0.052669964730739594, -0.031915999948978424, 0.0076292939484119415, -0.050186339765787125, -0.05346928909420967, -0.047445788979530334, 0.021267816424369812, 0.05038617178797722, 0.022266976535320282, 0.06708640605211258, 0.04730305075645447, -0.06737187504768372, -0.002076823730021715, 0.08981014043092728, 0.010091507807374, -0.027819447219371796, -0.0025942453648895025, 0.005092143546789885, -0.01052685547620058, -0.03006041795015335, 0.014480671845376492, 0.023294683545827866, 0.026706097647547722, 0.059892456978559494, 0.02610660158097744, 0.0036415783688426018, 0.04002346098423004, 0.0346280038356781, 
0.032915160059928894, 0.025878222659230232, 0.03488492965698242, 0.010070097632706165, -0.04242144525051117, -0.0066194296814501286, 0.06057759374380112, 0.006305408198386431, -0.018641455098986626, 0.007001250982284546, -0.002592461183667183, 0.017428191378712654, -0.01076950877904892, -0.030488628894090652, -0.035826992243528366, -0.0031651936005800962, 0.07981854677200317, -0.014330797828733921, -0.03214437887072563, -0.04456249997019768, -0.015672525390982628, 0.018598634749650955, 0.014916019514203072, 0.00034613729803822935, 0.03434252738952637, -0.02266664057970047, -0.02079678513109684, -0.04501925781369209, 0.015558335930109024, -0.010291339829564095, -0.00090013537555933, -0.05181353911757469, -0.04613260552287102, -0.008607042953372002, -0.008228790014982224, 0.0035184677690267563, 0.008771190419793129, 0.05763721093535423, -0.017128443345427513, -0.026520539075136185, 0.011654478497803211, 0.012796374037861824, 0.029860585927963257, -0.0092921806499362, -0.01129049900919199, -0.007736346684396267, -0.007090461906045675, 0.05047181248664856, 0.0014434282202273607, -0.0020393552258610725, -0.04975812882184982, 0.006473124027252197, 0.013167491182684898, 0.015030208975076675, 0.036226656287908554, -0.024593589827418327, 0.01070527732372284, -0.02292356640100479, -0.019569246098399162, -0.030517175793647766, 0.08210233598947525, -0.04242144525051117, -0.03605537489056587, -0.01141896191984415, 0.029203996062278748, 0.006248313467949629, 0.024679232388734818, 0.032172925770282745, -0.04070860147476196, 0.05943569913506508, 0.04193613678216934, 0.049615390598773956, 0.08273038268089294, -0.042706917971372604, -0.02332323044538498, -0.016557496041059494, -0.022152787074446678, 0.019198130816221237, 0.006134123541414738, 0.012296794913709164, 0.02702011913061142, 0.03163052350282669, -0.001768154907040298, -0.022695187479257584, 0.04282110929489136, 0.04493361711502075, -0.04050876945257187, 0.048444945365190506, 0.018498718738555908, -0.0019180288072675467, 
0.01141896191984415, 0.06040630862116814, 0.028290478512644768, 0.0760502889752388, -0.005117122549563646, 0.032344210892915726, 0.02574975974857807, -0.05038617178797722, 0.008906790055334568, 0.035998277366161346, 0.014730460941791534, -0.0013560017105191946, 0.05718045309185982, 0.01209696289151907, -0.03554151952266693, -0.05421152338385582, -0.001317641232162714, -0.045133449137210846, 0.033343371003866196, -0.04701757803559303, 0.004542605951428413, -0.016414757817983627, 0.021396281197667122, 0.005809396971017122, 0.048273663967847824, -0.062461722642183304, -0.011547425761818886, 0.047274503856897354, 0.04116535931825638, -0.029432374984025955, -0.01605791598558426, 0.03405705466866493, 0.002767314203083515, 0.027705257758498192, 0.008128874003887177, -0.009834581054747105, 0.008414347656071186, 0.022081417962908745, -0.036397941410541534, 0.005181354004889727, 0.061719492077827454, -0.015158671885728836, -0.030774103477597237, -0.08158848434686661, -0.012817785143852234, -0.01061963476240635, -0.007450873032212257, 0.06257591396570206, 0.002158897463232279, -0.07536514848470688, -0.05032907426357269, -0.06343233585357666, 0.04202178120613098, 0.028033552691340446, 0.04998650774359703, -0.04267837107181549, -0.03105957619845867, 0.007800578605383635, 0.024607863277196884, -0.024094009771943092, -0.025036074221134186, 0.031830355525016785, -0.022266976535320282, 0.012610816396772861, -0.012418121099472046, 0.05521068349480629, 0.011112077161669731, 0.03439962491393089, 0.08923918753862381, 0.011718709953129292, 0.03194454684853554, -0.02071114256978035, -0.0318589061498642, 0.006344660650938749, -0.025421464815735817, 0.0267631933093071, -0.034913476556539536, 0.006209060549736023, 0.027205677703022957, -0.020154468715190887, -9.807595051825047e-05, 0.11144907027482986, -0.07616447657346725, 0.04833075776696205, 0.050842929631471634, -0.06417457014322281, -0.054496996104717255, -0.012917701154947281, 0.06731478124856949, -0.01744246482849121, 
0.0119470888748765, 0.03485638275742531, -0.0057844179682433605, 0.009013843722641468, 0.0024854084476828575, 0.03322917968034744, 0.0471603125333786, 0.009434917941689491, -0.05852217972278595, -0.0422787070274353, 0.0020518447272479534, 0.05378331243991852, -0.009456328116357327, -0.009641885757446289, -0.000578531005885452, 0.0005254506831988692, -0.009256496094167233, 0.013866902329027653, -0.004917290527373552, -0.009185127913951874, 0.0524701327085495, -0.0501292422413826, 0.04210742190480232, 0.025050347670912743, 0.05406878516077995, -0.016871517524123192, 0.002146407961845398, -0.011369003914296627, -0.0055381967686116695, -0.07707799226045609, -0.014102417975664139, 0.01988326758146286, -0.04302094131708145, 0.016086462885141373, -0.008814011700451374, -0.11316191405057907, -0.04930137097835541, 0.005727323237806559, -0.010926519520580769]} +{"qid": "2", "vector": [-0.007401271723210812, 0.028172582387924194, -0.00571675319224596, 0.01391828153282404, 0.035423532128334045, 0.01966598443686962, -0.0368560329079628, -0.02608572505414486, 0.009275906719267368, 0.07399503141641617, -0.020886264741420746, -0.09316582977771759, -0.04509735479950905, -0.008515441790223122, -0.008568497374653816, 0.03837696462869644, 0.021858952939510345, 0.020620986819267273, -0.05358627066016197, 0.006525852717459202, 0.013900596648454666, -0.0581490620970726, -0.016040509566664696, -0.04042845219373703, -0.01610240899026394, 0.005084505770355463, 0.01527120266109705, -0.003892962820827961, 0.012255869805812836, -0.06020054966211319, -0.017207736149430275, 0.022831641137599945, 0.0281018428504467, -0.03367269039154053, -0.020090429112315178, -0.039685674011707306, 3.438604107941501e-05, -0.013555734418332577, 0.1008058562874794, -0.07219114154577255, -0.0862332135438919, -0.015607221983373165, -0.02357442118227482, 0.001297654234804213, -0.004180348012596369, -0.0009340015822090209, 0.027712766081094742, 0.013776799663901329, -0.018551813438534737, -0.048917368054389954, 
0.007931828498840332, -0.030206385999917984, -0.014944025315344334, 0.006163305137306452, -0.023892754688858986, -0.009629611857235432, -0.0649755597114563, 0.012335453182458878, 0.04902347922325134, -0.033301301300525665, -0.031072961166501045, 0.02647479996085167, -0.06479871273040771, -0.005049135070294142, 0.00771076325327158, 0.00409634318202734, -0.036113254725933075, 0.011628043837845325, -0.010451975278556347, 0.02560822293162346, 0.08361580222845078, 0.019648298621177673, -0.03629010543227196, 0.01879940740764141, -0.006375527940690517, 0.049483295530080795, 0.083403579890728, 0.02840249054133892, 0.017587969079613686, 0.01601398177444935, 0.060837216675281525, -0.07385355234146118, 0.021929694339632988, 0.012786426581442356, -0.021186914294958115, 0.07887616008520126, 0.07675392925739288, -0.03093148022890091, -0.07731986045837402, -0.010372391901910305, -0.0041936119087040424, -0.01774713583290577, 0.04113586246967316, 0.022071175277233124, 0.049200329929590225, 0.033301301300525665, 0.01269800029695034, 0.020514875650405884, -0.009470444172620773, 0.08326210081577301, 0.03646695986390114, -0.01998431794345379, -0.01251230575144291, 0.02520146407186985, -0.0034928342793136835, -0.02918064221739769, -0.01814505271613598, -0.005402840208262205, 0.028473231941461563, -0.0022681315895169973, 0.028933048248291016, -0.09196323156356812, -6.752859189873561e-05, 0.03179805725812912, 0.0037293743807822466, 0.05284348875284195, -0.0368206650018692, -0.05294959992170334, 0.0008057836093939841, -0.013661845587193966, -0.010425447486341, -0.007715184707194567, 0.025095351040363312, 0.05556701496243477, -0.0023145554587244987, 0.008281112648546696, 0.017685236409306526, -0.02201811969280243, 0.010089428164064884, -0.034751489758491516, -0.007914143614470959, -0.07484392076730728, -0.04431920498609543, -0.07717837393283844, 0.046016987413167953, 0.019612928852438927, -0.020391078665852547, -0.0901239663362503, 0.005920133087784052, 0.04216160625219345, 
0.002997647738084197, -0.01367068849503994, -0.05747701972723007, -0.001225808053277433, -0.006552380509674549, 0.013679531402885914, 0.020302653312683105, 0.03409713879227638, 0.022743215784430504, 0.026545541360974312, 0.014501894824206829, -0.02541368640959263, 0.02318534627556801, 0.049377184361219406, 0.023008493706583977, -0.04796236380934715, -0.04506198316812515, -0.010982532054185867, -0.024759331718087196, 0.07484392076730728, -0.025625908747315407, 0.002374243224039674, -0.026793135330080986, -0.023698218166828156, 0.009726880118250847, -0.05111033469438553, -0.003572417888790369, -0.0005675855791196227, -0.010531558655202389, 0.03363732248544693, 0.006808816455304623, -0.05606220290064812, 0.057441651821136475, -0.007874351926147938, 0.015377313829958439, 0.008378380909562111, -0.0074852765537798405, 0.015616064891219139, -0.02444099821150303, -0.04110049083828926, 0.003483991837128997, -0.001391607103869319, 0.011831424199044704, -0.002528988989070058, 0.01986052095890045, 0.009382018819451332, -0.03441547229886055, 0.03742196038365364, 0.00077815045369789, -0.021151544526219368, -0.001071062171831727, -0.020214226096868515, 0.011141699738800526, 0.06667334586381912, -0.02258404716849327, -0.03763418644666672, -0.011513089761137962, -0.03749270364642143, 0.051428671926259995, -0.04148956760764122, -0.025944242253899574, -0.03802325949072838, -0.010345864109694958, 0.01821579411625862, 0.013591105118393898, -0.01899394579231739, 0.00820152834057808, -0.04088826850056648, -0.05054440721869469, 0.005429368000477552, 0.039402708411216736, -0.05217145010828972, 0.08644544333219528, -0.01443999633193016, -0.00028351644868962467, -0.03809400275349617, 0.05294959992170334, 0.0016756762051954865, 0.06037740036845207, -0.022354140877723694, 0.06214592233300209, 0.012229342013597488, 0.005274622235447168, 0.026421744376420975, -0.025820447131991386, 0.016924772411584854, 0.0019564293324947357, 0.011955220252275467, -0.015120877884328365, -0.014634533785283566, 
0.012680315412580967, -0.0016314631793648005, 0.00035121774999424815, 0.001478928024880588, 0.09259990602731705, 0.0285970289260149, 0.03142666816711426, 0.07809800654649734, 0.06356074661016464, -0.013458465225994587, -0.09613694995641708, -0.050650518387556076, -0.012087860144674778, -0.0010666408343240619, -0.02123996987938881, 0.023485995829105377, 0.011875636875629425, 0.006919349078088999, -0.0329299122095108, 0.051534783095121384, -0.015474583022296429, 0.0455571711063385, -0.015324258245527744, -0.03259389102458954, 0.0756220743060112, -0.014068606309592724, -0.014033235609531403, 0.052772749215364456, -0.03713899850845337, 0.005035871174186468, -0.027695082128047943, -0.04951866343617439, -0.07746133953332901, -0.043187350034713745, 0.0203733928501606, -0.1115584746003151, 0.04835144057869911, 0.02092163637280464, 0.07647096365690231, -4.3625888793030754e-05, -0.08488913625478745, -0.00508892722427845, -0.025838131085038185, 0.001900057657621801, 0.028172582387924194, 0.08977026492357254, 0.03848307579755783, 0.03703288733959198, -0.03632547706365585, -0.02822563797235489, -0.01362647581845522, 0.03629010543227196, -0.03531741723418236, -0.035724177956581116, 0.019807465374469757, -0.03759881481528282, 0.029145270586013794, -0.008904516696929932, -0.0016248312313109636, -0.012848325073719025, 0.003932754509150982, -0.05669886991381645, 0.025183778256177902, 0.020284967496991158, 0.03965030238032341, -0.004934180993586779, 0.048245325684547424, -0.031851112842559814, 0.06126166135072708, 0.0006051667151041329, -0.002595308469608426, -0.024670906364917755, -0.0030551247764378786, 0.024564795196056366, -0.024582479149103165, 0.019648298621177673, 0.04683050885796547, 0.006601014640182257, 0.009850677102804184, 0.045238837599754333, 0.025944242253899574, 0.028190268203616142, 0.01803009957075119, 0.008351853117346764, 0.01821579411625862, -0.04060530662536621, -0.025732019916176796, -0.055390164256095886, 0.00681323790922761, 0.05634516477584839, 
-0.020196540281176567, -0.029587402939796448, -0.05507183074951172, 0.062393516302108765, 0.04718421399593353, 0.018083155155181885, -0.00273900106549263, 0.02666933834552765, 0.010619984939694405, 0.058007579296827316, -0.031143702566623688, -0.010363548994064331, 0.021204600110650063, 0.004889968317002058, -0.026633966714143753, 0.003775798249989748, -0.008055625483393669, -0.008312061429023743, -0.021469878032803535, 0.010929476469755173, -0.005654854699969292, 0.03752807527780533, 0.025272203609347343, -0.005367469508200884, -0.042515311390161514, -0.013785642571747303, -0.017110466957092285, -0.04194938391447067, 0.004505314398556948, 0.02307923510670662, -0.031072961166501045, -0.0015507742064073682, 0.03823548182845116, 0.00596434623003006, -0.008418172597885132, -0.027748137712478638, -0.011247810907661915, 0.02578507550060749, -0.017667552456259727, -0.0014944025315344334, -0.022230343893170357, -0.012521147727966309, -0.013246242888271809, 0.03890752047300339, -0.07731986045837402, -0.04573402553796768, 0.023415254428982735, 0.047573287039995193, -0.017959358170628548, -0.022937752306461334, -0.021080803126096725, 0.0184103325009346, -0.002639521611854434, -0.020302653312683105, 0.01716352254152298, -0.031355924904346466, -0.04113586246967316, 0.057547762989997864, -0.05033218488097191, -0.02511303685605526, -0.053550899028778076, 0.08347432315349579, -0.02451173961162567, 0.024653220549225807, 0.005508951377123594, -0.04011011868715286, 0.010787994600832462, -0.03206333518028259, -0.06288870424032211, 0.03653769940137863, 0.04743180796504021, -0.018640240654349327, -0.08375728130340576, 0.0008919991669245064, -0.026404058560729027, -0.041701789945364, 0.004054340533912182, 0.028561659157276154, -0.02442331239581108, 0.024741647765040398, 0.024016551673412323, -0.0164737980812788, 0.02705841325223446, 0.00019246511510573328, 0.03770492598414421, 0.006556801963597536, -0.009921418502926826, -0.0455925427377224, 0.022707844153046608, -0.020514875650405884, 
-0.054647382348775864, 0.052772749215364456, -0.009488129988312721, 0.054435160011053085, -0.06048351153731346, -0.0465475432574749, 0.018180424347519875, -0.03692677617073059, 0.04272753372788429, 0.010478503070771694, 0.0025533060543239117, 0.007007775362581015, 0.05998832359910011, -0.02424645982682705, -0.00608372176066041, 0.023804329335689545, -0.025643594563007355, -0.0687955766916275, -0.03574186563491821, -0.02355673536658287, -0.011026745662093163, -0.01230892539024353, -0.029923422262072563, 0.022141916677355766, -0.00042610368109308183, -0.04481439292430878, 0.035228993743658066, 0.0581844300031662, 0.01637652888894081, 0.03409713879227638, -0.03370806202292442, -0.03943808004260063, -0.054152198135852814, 0.06794667989015579, -0.08764803409576416, 0.012866009958088398, -0.07201428711414337, -0.039897896349430084, 0.07141298800706863, 0.06440963596105576, 0.029445920139551163, 0.025661278516054153, 0.024476367980241776, 0.02102774754166603, -0.0034530425909906626, -0.04177252948284149, 0.037669554352760315, 0.014599163085222244, -0.04449605569243431, -0.03112601675093174, 0.020497189834713936, -0.01589902862906456, -0.05775998532772064, -0.010823365300893784, -0.021063117310404778, 0.006114671006798744, -0.02180589735507965, 0.016420742496848106, 0.003090495243668556, 0.010266279801726341, 0.018852462992072105, -0.041348084807395935, 0.002668260131031275, -0.005323256365954876, 0.060943327844142914, 0.06950297951698303, -0.005486845038831234, -0.013635317794978619, 0.0465121753513813, 0.046299953013658524, 0.042126234620809555, -0.016146622598171234, -0.008634816855192184, -0.061226293444633484, -0.02987036667764187, 0.006503745913505554, 0.018162738531827927, 0.02046182006597519, 0.05875035747885704, 0.016624122858047485, 0.04049919173121452, 0.08764803409576416, 0.025643594563007355, -0.05057977885007858, 0.010310493409633636, 0.01784440502524376, -0.009567713364958763, -0.06967983394861221, -0.01506782229989767, -0.02472396194934845, 
0.03209870681166649, -0.018083155155181885, 0.028349434956908226, 0.04127734526991844, 0.027995729818940163, -0.007551596499979496, 0.006096985656768084, 0.05669886991381645, -0.005097769666463137, 0.005243672989308834, -0.02578507550060749, 0.023238401859998703, -0.04721958190202713, -0.022424880415201187, 0.04293975606560707, 0.04216160625219345, 0.012627259828150272, -0.009337805211544037, -0.10299882292747498, 0.02258404716849327, 0.024953870102763176, 0.028844622895121574, 0.006525852717459202, 0.005071241874247789, -0.005460317246615887, -0.018357276916503906, 0.03270000219345093, 0.0006322472472675145, -0.0025643594563007355, -0.02822563797235489, 0.0012622837675735354, 0.023309143260121346, -0.01784440502524376, -0.0064949034713208675, 0.010540401563048363, -0.04092364013195038, 0.003539258148521185, 0.010257437825202942, 0.010478503070771694, -0.010558086447417736, -0.025732019916176796, 0.06440963596105576, -0.02375127375125885, -0.046865880489349365, 0.024016551673412323, 0.0005499003455042839, -0.00011764827650040388, 0.05935165658593178, -0.0227962713688612, -0.012149757705628872, -0.03314213454723358, 0.06370222568511963, -0.029127586632966995, -0.004708694294095039, 0.13362964987754822, -0.018870148807764053, -0.03257620707154274, -0.003928333520889282, -0.0047263796441257, 0.08156431466341019, 0.01830422133207321, -0.02180589735507965, 0.0044213091023266315, 0.004025602247565985, -0.02327377162873745, -0.025714335963129997, 0.017305005341768265, -0.01420124527066946, -0.03554732725024223, 0.016615280881524086, -0.006057193968445063, -0.01192869246006012, -0.0009831886272877455, -0.002462669275701046, -0.03170963004231453, 0.07477318495512009, 0.05164089426398277, 0.034362416714429855, 0.04838680848479271, 0.02626257762312889, 0.03982715308666229, 0.05224218964576721, -0.0227962713688612, -0.031568147242069244, 0.052878860384225845, -0.024653220549225807, 0.011017902754247189, 0.07307539880275726, -0.015509952791035175, -0.011380449868738651, 
-0.03517593815922737, 0.053374044597148895, 0.01109748613089323, -0.044920504093170166, 0.02394581213593483, -0.02249562181532383, 0.04799773544073105, -0.017623338848352432, 0.0076400223188102245, 0.006211939733475447, 0.008643659763038158, -0.026421744376420975, 0.025537483394145966, -0.06858334690332413, -0.006853029597550631, 0.0455571711063385, -0.007171363569796085, -0.039685674011707306, -0.007679814472794533, -0.004357200115919113, 0.0030794418416917324, 0.06193369999527931, 0.006782288663089275, 0.04732569679617882, -0.01657990925014019, -0.00011371055006748065, 0.0504382960498333, -0.001120801898650825, 0.017667552456259727, -0.013511521741747856, -0.03063083067536354, 0.04976625740528107, -0.008020254783332348, -0.01668602228164673, -0.021469878032803535, 0.007591388188302517, -0.040463823825120926, 0.02228339947760105, 0.04884662479162216, -0.02965814247727394, -0.0040278127416968346, -0.028738509863615036, 0.029905736446380615, 0.055107198655605316, 0.041701789945364, 0.04247993975877762, 0.025519797578454018, -0.05256052687764168, -0.02490081451833248, -0.08835544437170029, 0.05227756127715111, -0.019842837005853653, -0.07548059523105621, -0.031249813735485077, 0.07233262062072754, 0.047254953533411026, -0.042020123451948166, 0.0669209361076355, -0.018640240654349327, 0.0089619942009449, 0.039013635367155075, 0.00984183419495821, -0.054824236780405045, -0.03323056176304817, -0.02210654690861702, -0.006906085181981325, 0.07427799701690674, 0.037563443183898926, 0.09245841950178146, -0.04187864065170288, -0.013025177642703056, 0.038447704166173935, -0.010027529671788216, 0.020815525203943253, -0.004894389305263758, 0.006826501805335283, -0.015642592683434486, 0.04007474705576897, 0.10356474667787552, 0.025820447131991386, -0.024494053795933723, 0.011221283115446568, -0.006618699990212917, 0.029392864555120468, -0.007626758422702551, -0.047644030302762985, 0.005915712099522352, 0.007777083199471235, 0.04637069255113602, -0.03333667293190956, 
0.033601950854063034, 0.009382018819451332, 0.02619183622300625, 0.029817309230566025, -0.01314013171941042, -0.008409330621361732, -0.037952519953250885, -0.008219214156270027, 0.018746351823210716, -0.029552031308412552, -0.020426448434591293, 0.02645711414515972, -0.09514657407999039, -0.052702005952596664, -0.01590787060558796, -0.01656222529709339, -0.029552031308412552, 0.01628810353577137, -0.011256653815507889, -0.05447053164243698, -0.019418390467762947, -0.0008146262262016535, 0.07724911719560623, -0.005668118596076965, 0.016500325873494148, -0.07091780006885529, -0.04442531615495682, 0.05082737281918526, -0.029552031308412552, 0.06260573863983154, 0.039013635367155075, 0.03625473752617836, -0.0011279865866526961, 0.03922585770487785, -0.0203557088971138, -0.020108114928007126, -0.02306154929101467, -0.015138562768697739, -0.01581060141324997, 0.01464337669312954, 0.0649755597114563, 0.005911290645599365, 0.016756761819124222, -0.039897896349430084, 0.027995729818940163, 0.014192403294146061, -0.016367686912417412, 0.010858736000955105, 0.02773045189678669, -0.009125582873821259, 0.01512972079217434, -0.006419741082936525, -0.034556955099105835, -0.10214993357658386, -0.00422013970091939, -0.022513307631015778, 0.002639521611854434, -4.390221874928102e-05, -0.06854797899723053, 0.03112601675093174, -0.024670906364917755, 0.00311260181479156, 0.04226771742105484, 0.03791714832186699, -0.028862306848168373, 0.04110049083828926, 0.0009356595692224801, 0.004186979960650206, -0.001129091833718121, -0.037174370139837265, -0.0397210419178009, -0.016553381457924843, -0.009576556272804737, 0.018100840970873833, -0.03422093391418457, 0.030860738828778267, 0.01755259744822979, 0.003302718047052622, 0.03285917267203331, 0.003119233762845397, -0.03646695986390114, 0.001717678620480001, -0.02316766045987606, -0.043965499848127365, 0.003528204746544361, 0.05273737758398056, -0.0011738575994968414, 0.013193187303841114]}