Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reproduce e2e sparse+dense Lucene #2317

Merged
merged 5 commits into from
Dec 29, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 17 additions & 39 deletions src/main/java/io/anserini/index/IndexInfo.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,65 +20,43 @@ public enum IndexInfo {
MSMARCO_V1_PASSAGE("msmarco-v1-passage",
"Lucene index of the MS MARCO V1 passage corpus. (Lucene 9)",
"lucene-index.msmarco-v1-passage.20221004.252b5e.tar.gz",
"lucene-index.msmarco-v1-passage.20221004.252b5e.README.md",
new String[] {
"https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-passage.20221004.252b5e.tar.gz" },
"c697b18c9a0686ca760583e615dbe450", "2170758938", "352316036", "8841823",
"2660824", false),
"c697b18c9a0686ca760583e615dbe450"),

CACM("cacm",
"Lucene index of the CACM corpus. (Lucene 9)",
"lucene-index.cacm.tar.gz",
new String[] {
"https://github.com/castorini/anserini-data/raw/master/CACM/lucene-index.cacm.20221005.252b5e.tar.gz" },
"cfe14d543c6a27f4d742fb2d0099b8e0",
"2347197",
"320968",
"3204",
"14363");
"cfe14d543c6a27f4d742fb2d0099b8e0"),

MSMARCO_V1_PASSAGE_COS_DPR_DISTIL("msmarco-v1-passage-cos-dpr-distil",
"Lucene index of the MS MARCO V1 passage corpus encoded by cos-DPR Distil. (Lucene 9)",
"lucene-hnsw.msmarco-v1-passage-cos-dpr-distil.20231124.9d3427.tar.gz",
new String[] {
"https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-hnsw.msmarco-v1-passage-cos-dpr-distil.20231124.9d3427.tar.gz" },
"7aa825e292a411abbe1585fb4d9f20ee"),

MSMARCO_V1_PASSAGE_SPLADE_PP_ED("msmarco-v1-passage-splade-pp-ed",
"Lucene impact index of the MS MARCO passage corpus encoded by SPLADE++ CoCondenser-EnsembleDistil. (Lucene 9)",
"lucene-index.msmarco-v1-passage-splade-pp-ed.20230524.a59610.tar.gz",
new String[] {
"https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-passage-splade-pp-ed.20230524.a59610.tar.gz" },
"4b3c969033cbd017306df42ce134c395");

public final String indexName;
public final String description;
public final String filename;
public final String readme;
public final String[] urls;
public final String md5;
public final String size;
public final String totalTerms;
public final String totalDocs;
public final String totalUniqueTerms;
public final boolean downloaded;

// constructor with all 11 fields
IndexInfo(String indexName, String description, String filename, String readme, String[] urls, String md5,
String size, String totalTerms, String totalDocs, String totalUniqueTerms, boolean downloaded) {
this.indexName = indexName;
this.description = description;
this.filename = filename;
this.readme = readme;
this.urls = urls;
this.md5 = md5;
this.size = size;
this.totalTerms = totalTerms;
this.totalDocs = totalDocs;
this.totalUniqueTerms = totalUniqueTerms;
this.downloaded = downloaded;
}

// constructor with 9 fields
IndexInfo(String indexName, String description, String filename, String[] urls, String md5, String size,
String totalTerms, String totalDocs, String totalUniqueTerms) {
IndexInfo(String indexName, String description, String filename, String[] urls, String md5) {
this.indexName = indexName;
this.description = description;
this.filename = filename;
this.readme = "";
this.urls = urls;
this.md5 = md5;
this.size = size;
this.totalTerms = totalTerms;
this.totalDocs = totalDocs;
this.totalUniqueTerms = totalUniqueTerms;
this.downloaded = false;
}

public static boolean contains(String indexName) {
Expand Down
24 changes: 23 additions & 1 deletion src/main/java/io/anserini/search/HnswDenseSearcher.java
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,8 @@
import io.anserini.encoder.dense.DenseEncoder;
import io.anserini.index.Constants;
import io.anserini.search.query.VectorQueryGenerator;
import io.anserini.util.PrebuiltIndexHandler;

import org.apache.commons.lang3.time.DurationFormatUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
Expand All @@ -36,6 +38,8 @@
import javax.annotation.Nullable;
import java.io.Closeable;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.List;
import java.util.SortedMap;
Expand Down Expand Up @@ -77,8 +81,26 @@
// We might not be able to successfully create a reader for a variety of reasons, anything from path doesn't exist
// to corrupt index. Gather all possible exceptions together as an unchecked exception to make initialization and
// error reporting clearer.
Path indexPath = Path.of(args.index);
PrebuiltIndexHandler indexHandler = new PrebuiltIndexHandler(args.index);
if (!Files.exists(indexPath)) {
// it doesn't exist locally, we try to download it from remote
try {
indexHandler.initialize();
indexHandler.download();
indexPath = Path.of(indexHandler.decompressIndex());
} catch (IOException e) {
throw new RuntimeException("MD5 checksum does not match!");
} catch (Exception e) {
throw new IllegalArgumentException(String.format("\"%s\" does not appear to be a valid index.", args.index));

Check warning on line 95 in src/main/java/io/anserini/search/HnswDenseSearcher.java

View check run for this annotation

Codecov / codecov/patch

src/main/java/io/anserini/search/HnswDenseSearcher.java#L91-L95

Added lines #L91 - L95 were not covered by tests
}
} else {
// if it exists locally, we use it

Check warning on line 98 in src/main/java/io/anserini/search/HnswDenseSearcher.java

View check run for this annotation

Codecov / codecov/patch

src/main/java/io/anserini/search/HnswDenseSearcher.java#L98

Added line #L98 was not covered by tests
indexPath = Paths.get(args.index);
}

try {
this.reader = DirectoryReader.open(FSDirectory.open(Paths.get(args.index)));
this.reader = DirectoryReader.open(FSDirectory.open(indexPath));
} catch (IOException e) {
throw new IllegalArgumentException(String.format("\"%s\" does not appear to be a valid index.", args.index));
}
Expand Down
92 changes: 92 additions & 0 deletions src/main/python/e2e_sparse_dense_lucene/reproduction.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,92 @@
# Anserini: A toolkit for reproducible information retrieval research built on Lucene
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import yaml
from typing import Union, Dict, List, Optional, Any
import os
import subprocess
TOPIC_NAMES = ['msmarco-passage-dev-subset', 'dl19-passage', 'dl20-passage']
EVAL_CMD_MAP = {
'map': '-m map -c -l 2', # AP
'ndcg_cut_10': '-m ndcg_cut.10 -c', # nDCG@10
'recall_1000_msmarco': '-c -m recall.1000', # R@1000 for MS MARCO
'recall_1000': '-m recall.1000 -c -l 2', # R@1000
'recip_rank': '-c -M 10 -m recip_rank' # RR@10
}
TOPIC_EVAL_MAP = {
'msmarco-passage-dev-subset': ['recip_rank', 'recall_1000_msmarco'],
'dl19-passage': ['map', 'ndcg_cut_10', 'recall_1000'],
'dl20-passage': ['map', 'ndcg_cut_10', 'recall_1000']
}


def get_output_run_file_name(topic: str, name: str):
return f'runs/{topic}_{name}.txt'


def get_search_command(model_name: str, cmd_template: str, topics: List[str]):
outputs = [get_output_run_file_name(
topic_name, model_name) for topic_name in TOPIC_NAMES]

for topic, output in zip(topics, outputs):
cmd = cmd_template.format(topic=topic, output=output)
yield cmd


def get_eval_command(param: str, qrel: str, run_file: str, cmd_template: str):
cmd = cmd_template.format(
param=param, qrel=qrel, output=run_file)
yield cmd


def main(config):
# print all search commands
for model_name, model_config in config['collections'].items():
print("running model: ", model_name)
# # search
# for cmd in get_search_command(model_name, model_config['search_command'], model_config['topics']):
# p = subprocess.Popen(
# cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
# stdout, stderr = p.communicate()
# if stderr:
# print(stderr.decode('utf-8'))

# eval
expected_results = model_config['results']
run_files = [get_output_run_file_name(
topic_name, model_name) for topic_name in TOPIC_NAMES]
eval_cmd = model_config['eval_command']
metric_precision = model_config['metric_precision']

for run_file, topic_name, qrel in zip(run_files, TOPIC_NAMES, model_config['qrels']):
for metric in TOPIC_EVAL_MAP[topic_name]:
for cmd in get_eval_command(EVAL_CMD_MAP[metric], qrel, run_file, eval_cmd):
p = subprocess.Popen(
cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = p.communicate()
stdout = [out.strip()
for out in stdout.decode('utf-8').split('\t')]
actual_result = round(float(stdout[-1]), metric_precision)
expected_result = expected_results[topic_name][metric]
assert actual_result == expected_result, f'{model_name} {topic_name} {metric} {actual_result} != {expected_result}, expected: {expected_results[topic_name]}'
print(
f"{topic_name} {metric} {actual_result} == {expected_result}")
print(f"{model_name} passed!")
print("="*50)


if __name__ == '__main__':
with open('src/main/resources/e2e_sparse_dense_lucene/pre-encoded.yaml') as f:
config = yaml.load(f, Loader=yaml.FullLoader)
main(config)
78 changes: 78 additions & 0 deletions src/main/resources/e2e_sparse_dense_lucene/pre-encoded.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
---
collections:
bm25:
name: bm25
search_command: target/appassembler/bin/SearchCollection -index msmarco-v1-passage -topicReader TsvInt -topics {topic} -output {output} -bm25 -parallelism 12
topics:
- tools/topics-and-qrels/topics.msmarco-passage.dev-subset.txt
- tools/topics-and-qrels/topics.dl19-passage.txt
- tools/topics-and-qrels/topics.dl20.txt
qrels:
- tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt
- tools/topics-and-qrels/qrels.dl19-passage.txt
- tools/topics-and-qrels/qrels.dl20-passage.txt

eval_command: tools/eval/trec_eval.9.0.4/trec_eval {param} {qrel} {output}
results:
msmarco-passage-dev-subset:
recip_rank: 0.184
recall_1000_msmarco: 0.853
dl19-passage:
map: 0.301
ndcg_cut_10: 0.506
recall_1000: 0.750
dl20-passage:
map: 0.286
ndcg_cut_10: 0.480
recall_1000: 0.786
metric_precision: 3
cosdpr-distil:
name: cosdpr-distil
search_command: target/appassembler/bin/SearchHnswDenseVectors -index msmarco-v1-passage-cos-dpr-distil -topicReader TsvInt -topics {topic} -output {output} -generator VectorQueryGenerator -topicField title -threads 12 -hits 1000 -efSearch 1000 -encoder CosDprDistil
topics:
- tools/topics-and-qrels/topics.msmarco-passage.dev-subset.txt
- tools/topics-and-qrels/topics.dl19-passage.txt
- tools/topics-and-qrels/topics.dl20.txt
qrels:
- tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt
- tools/topics-and-qrels/qrels.dl19-passage.txt
- tools/topics-and-qrels/qrels.dl20-passage.txt
eval_command: tools/eval/trec_eval.9.0.4/trec_eval {param} {qrel} {output}
results:
msmarco-passage-dev-subset:
recip_rank: 0.389
recall_1000_msmarco: 0.975
dl19-passage:
map: 0.466
ndcg_cut_10: 0.725
recall_1000: 0.822
dl20-passage:
map: 0.487
ndcg_cut_10: 0.703
recall_1000: 0.852
metric_precision: 3
splade-pp-ed:
name: splade-pp-ed
search_command: target/appassembler/bin/SearchCollection -index msmarco-v1-passage-splade-pp-ed -topicReader TsvInt -topics {topic} -output {output} -impact -pretokenized -parallelism 12 -encoder SpladePlusPlusEnsembleDistil
topics:
- tools/topics-and-qrels/topics.msmarco-passage.dev-subset.txt
- tools/topics-and-qrels/topics.dl19-passage.txt
- tools/topics-and-qrels/topics.dl20.txt
qrels:
- tools/topics-and-qrels/qrels.msmarco-passage.dev-subset.txt
- tools/topics-and-qrels/qrels.dl19-passage.txt
- tools/topics-and-qrels/qrels.dl20-passage.txt
eval_command: tools/eval/trec_eval.9.0.4/trec_eval {param} {qrel} {output}
results:
msmarco-passage-dev-subset:
recip_rank: 0.383
recall_1000_msmarco: 0.983
dl19-passage:
map: 0.505
ndcg_cut_10: 0.731
recall_1000: 0.873
dl20-passage:
map: 0.500
ndcg_cut_10: 0.720
recall_1000: 0.900
metric_precision: 3
2 changes: 1 addition & 1 deletion src/test/java/io/anserini/index/PrebuiltIndexTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,6 @@ public void testUrls() {
// test number of prebuilt-indexes
@Test
public void testNumPrebuiltIndexes() {
assert IndexInfo.values().length == 2;
assert IndexInfo.values().length == 4;
}
}
Loading