From 3a8c31a381c77ddf26c4c1bdfadb95153080b88b Mon Sep 17 00:00:00 2001 From: lintool Date: Wed, 20 Dec 2023 12:44:47 -0500 Subject: [PATCH] Added docs; more tests. --- .../io/anserini/search/BaseSearchArgs.java | 4 +- .../java/io/anserini/search/BaseSearcher.java | 125 +++++++++++++++++- .../io/anserini/search/HnswDenseSearcher.java | 11 +- .../search/InvertedDenseSearcher.java | 9 +- .../java/io/anserini/search/ScoredDoc.java | 3 + .../java/io/anserini/search/ScoredDocs.java | 8 +- .../io/anserini/search/SearchCollection.java | 34 ++--- .../io/anserini/search/BaseSearcherTest.java | 15 ++- 8 files changed, 170 insertions(+), 39 deletions(-) diff --git a/src/main/java/io/anserini/search/BaseSearchArgs.java b/src/main/java/io/anserini/search/BaseSearchArgs.java index bdd4e6d20f..ec888ebc9a 100644 --- a/src/main/java/io/anserini/search/BaseSearchArgs.java +++ b/src/main/java/io/anserini/search/BaseSearchArgs.java @@ -31,11 +31,13 @@ public class BaseSearchArgs { @Option(name = "-threads", metaVar = "[int]", usage = "Number of threads for running queries in parallel.") public int threads = 4; + // In some test collections, a document is used as a query, usually denoted by setting the qid as the docid. In this + // case, we want to remove the docid from the ranked list. @Option(name = "-removeQuery", usage = "Remove docids that have the query id when writing final run output.") public Boolean removeQuery = false; // Note that this option is set to false by default because duplicate documents usually indicate some underlying - // indexing issues, and we don't want to just eat errors silently. + // corpus or indexing issues, and we don't want to just eat errors silently. 
@Option(name = "-removeDuplicates", usage = "Remove duplicate docids when writing final run output.") public Boolean removeDuplicates = false; diff --git a/src/main/java/io/anserini/search/BaseSearcher.java b/src/main/java/io/anserini/search/BaseSearcher.java index 9cbeb7b26d..1084253fea 100644 --- a/src/main/java/io/anserini/search/BaseSearcher.java +++ b/src/main/java/io/anserini/search/BaseSearcher.java @@ -20,6 +20,7 @@ import org.apache.lucene.document.Document; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.TopDocs; +import org.jetbrains.annotations.NotNull; import java.io.IOException; import java.util.ArrayList; @@ -27,14 +28,98 @@ import java.util.List; import java.util.Set; +/** + *

This class provides a base for all Lucene searchers, handling three common post-processing operations + * (duplicate removal, docid-as-qid removal, and MaxP) on ranked lists based on the supplied configuration.

+ * + *

In more detail:

+ * + * + * + * @param type of qid, typically string or integer + */ public class BaseSearcher> { protected final BaseSearchArgs args; + private IndexSearcher searcher; + /** + * Creates an instance of this class with supplied arguments. + * + * @param args configuration for duplicate removal, docid-as-qid removal, and MaxP + */ public BaseSearcher(BaseSearchArgs args) { this.args = args; } - public ScoredDoc[] processLuceneTopDocs(IndexSearcher searcher, K qid, TopDocs docs) throws IOException { + /** + * Creates an instance of this class with supplied arguments. + * + * @param args configuration for duplicate removal, docid-as-qid removal, and MaxP + * @param searcher {@link IndexSearcher} used for accessing documents from the index + */ + public BaseSearcher(BaseSearchArgs args, IndexSearcher searcher) { + this.args = args; + this.searcher = searcher; + } + + /** + * Sets the {@link IndexSearcher} used for accessing documents from the index. + * + * @param searcher the {@link IndexSearcher} used for accessing documents from the index + */ + protected void setIndexSearcher(IndexSearcher searcher) { + this.searcher = searcher; + } + + /** + * Gets the {@link IndexSearcher} used for accessing documents from the index. + * + * @return the {@link IndexSearcher} used for accessing documents from the index + */ + protected IndexSearcher getIndexSearcher() { + return this.searcher; + } + + /** + * Processes Lucene {@link TopDocs} for a query based on the configuration for duplicate removal, docid-as-qid + * removal, and MaxP. By default, retains references to the original Lucene docs (which can be memory intensive for + * long ranked lists). 
+ * + * @param qid query id + * @param docs Lucene {@link TopDocs} + * @return processed ranked list + */ + public ScoredDoc[] processLuceneTopDocs(K qid, TopDocs docs) { + return processLuceneTopDocs(qid, docs, true); + } + + /** + * Processes Lucene {@link TopDocs} for a query based on the configuration for duplicate removal, docid-as-qid + * removal, and MaxP. Explicitly supports control over whether to retain references to the original Lucene docs + * (and hence memory usage). + * + * @param qid query id + * @param docs Lucene {@link TopDocs} + * @param keepLuceneDocument whether to retain references to the original Lucene docs + * @return processed ranked list + */ + public ScoredDoc[] processLuceneTopDocs(K qid, @NotNull TopDocs docs, boolean keepLuceneDocument) { List results = new ArrayList<>(); // For removing duplicate docids. Set docids = new HashSet<>(); @@ -42,7 +127,12 @@ public ScoredDoc[] processLuceneTopDocs(IndexSearcher searcher, K qid, TopDocs d int rank = 1; for (int i = 0; i < docs.scoreDocs.length; i++) { int lucene_docid = docs.scoreDocs[i].doc; - Document lucene_document = searcher.storedFields().document(docs.scoreDocs[i].doc); + Document lucene_document; + try { + lucene_document = searcher.storedFields().document(docs.scoreDocs[i].doc); + } catch (IOException e) { + throw new RuntimeException(String.format("Unable to fetch document %d", docs.scoreDocs[i].doc), e); + } String docid = lucene_document.get(Constants.ID); if (args.selectMaxPassage) { @@ -56,7 +146,11 @@ public ScoredDoc[] processLuceneTopDocs(IndexSearcher searcher, K qid, TopDocs d if (args.removeQuery && docid.equals(qid)) continue; - results.add(new ScoredDoc(docid, lucene_docid, docs.scoreDocs[i].score, lucene_document)); + // Note that if keepLuceneDocument == true, then we're retaining references to a lot of objects that cannot be + // garbage collected. If we're running lots of queries, e.g., from SearchCollection, this can easily exhaust + // the heap. 
+ results.add(new ScoredDoc(docid, lucene_docid, docs.scoreDocs[i].score, + keepLuceneDocument ? lucene_document : null)); // Note that this option is set to false by default because duplicate documents usually indicate some // underlying indexing issues, and we don't want to just eat errors silently. @@ -77,7 +171,30 @@ public ScoredDoc[] processLuceneTopDocs(IndexSearcher searcher, K qid, TopDocs d return results.toArray(new ScoredDoc[0]); } - public ScoredDoc[] processScoredDocs(K qid, ScoredDocs docs, boolean keepLuceneDocument) { + /** + * Processes {@link ScoredDocs} for a query based on the configuration for duplicate removal, docid-as-qid removal, + * and MaxP. By default, retains references to the original Lucene docs (which can be memory intensive for long + * ranked lists). + * + * @param qid query id + * @param docs {@link ScoredDocs} to process + * @return processed ranked list + */ + public ScoredDoc[] processScoredDocs(K qid, ScoredDocs docs) { + return processScoredDocs(qid, docs, true); + } + + /** + * Processes {@link ScoredDocs} for a query based on the configuration for duplicate removal, docid-as-qid removal, + * and MaxP. Explicitly supports control over whether to retain references to the original Lucene docs (and hence + * memory usage). + * + * @param qid query id + * @param docs {@link ScoredDocs} to process + * @param keepLuceneDocument whether to retain references to the original Lucene docs + * @return processed ranked list + */ + public ScoredDoc[] processScoredDocs(K qid, @NotNull ScoredDocs docs, boolean keepLuceneDocument) { List results = new ArrayList<>(); // For removing duplicate docids. 
Set docids = new HashSet<>(); diff --git a/src/main/java/io/anserini/search/HnswDenseSearcher.java b/src/main/java/io/anserini/search/HnswDenseSearcher.java index b5eb18cd5e..c3b29a207e 100644 --- a/src/main/java/io/anserini/search/HnswDenseSearcher.java +++ b/src/main/java/io/anserini/search/HnswDenseSearcher.java @@ -70,7 +70,6 @@ public static class Args extends BaseSearchArgs { } private final IndexReader reader; - private final IndexSearcher searcher; private final VectorQueryGenerator generator; private final DenseEncoder encoder; @@ -86,7 +85,7 @@ public HnswDenseSearcher(Args args) { throw new IllegalArgumentException(String.format("\"%s\" does not appear to be a valid index.", args.index)); } - this.searcher = new IndexSearcher(this.reader); + setIndexSearcher(new IndexSearcher(this.reader)); try { this.generator = (VectorQueryGenerator) Class @@ -161,9 +160,9 @@ public ScoredDoc[] search(float[] queryFloat, int hits) throws IOException { public ScoredDoc[] search(@Nullable K qid, float[] queryFloat, int hits) throws IOException { KnnFloatVectorQuery query = new KnnFloatVectorQuery(Constants.VECTOR, queryFloat, ((Args) args).efSearch); - TopDocs topDocs = searcher.search(query, hits, BREAK_SCORE_TIES_BY_DOCID, true); + TopDocs topDocs = getIndexSearcher().search(query, hits, BREAK_SCORE_TIES_BY_DOCID, true); - return super.processLuceneTopDocs(this.searcher, qid, topDocs); + return super.processLuceneTopDocs(qid, topDocs); } public ScoredDoc[] search(String queryString, int hits) throws IOException { @@ -180,9 +179,9 @@ public ScoredDoc[] search(@Nullable K qid, String queryString, int hits) throws } KnnFloatVectorQuery query = generator.buildQuery(Constants.VECTOR, queryString, ((Args) args).efSearch); - TopDocs topDocs = searcher.search(query, hits, BREAK_SCORE_TIES_BY_DOCID, true); + TopDocs topDocs = getIndexSearcher().search(query, hits, BREAK_SCORE_TIES_BY_DOCID, true); - return super.processLuceneTopDocs(this.searcher, qid, topDocs); + return 
super.processLuceneTopDocs(qid, topDocs); } @Override diff --git a/src/main/java/io/anserini/search/InvertedDenseSearcher.java b/src/main/java/io/anserini/search/InvertedDenseSearcher.java index d1666f488c..d3e826d687 100644 --- a/src/main/java/io/anserini/search/InvertedDenseSearcher.java +++ b/src/main/java/io/anserini/search/InvertedDenseSearcher.java @@ -83,7 +83,6 @@ public static class Args extends BaseSearchArgs { } private final IndexReader reader; - private final IndexSearcher searcher; private final InvertedDenseVectorQueryGenerator generator; public InvertedDenseSearcher(Args args) { @@ -98,9 +97,9 @@ public InvertedDenseSearcher(Args args) { throw new IllegalArgumentException(String.format("\"%s\" does not appear to be a valid index.", args.index)); } - this.searcher = new IndexSearcher(this.reader); + setIndexSearcher(new IndexSearcher(this.reader)); if (args.encoding.equalsIgnoreCase(FW)) { - searcher.setSimilarity(new ClassicSimilarity()); + getIndexSearcher().setSimilarity(new ClassicSimilarity()); } this.generator = new InvertedDenseVectorQueryGenerator(args, true); @@ -158,9 +157,9 @@ public ScoredDoc[] search(String queryString, int hits) throws IOException { public ScoredDoc[] search(K qid, String queryString, int hits) throws IOException { Query query = generator.buildQuery(queryString); - TopDocs topDocs = searcher.search(query, hits, BREAK_SCORE_TIES_BY_DOCID, true); + TopDocs topDocs = getIndexSearcher().search(query, hits, BREAK_SCORE_TIES_BY_DOCID, true); - return super.processLuceneTopDocs(this.searcher, qid, topDocs); + return super.processLuceneTopDocs(qid, topDocs); } @Override diff --git a/src/main/java/io/anserini/search/ScoredDoc.java b/src/main/java/io/anserini/search/ScoredDoc.java index 5b09b0acfa..746c8f1377 100644 --- a/src/main/java/io/anserini/search/ScoredDoc.java +++ b/src/main/java/io/anserini/search/ScoredDoc.java @@ -18,6 +18,9 @@ import org.apache.lucene.document.Document; +/** + * See documentation for {@link 
ScoredDocs}. + */ public class ScoredDoc { public String docid; public int lucene_docid; diff --git a/src/main/java/io/anserini/search/ScoredDocs.java b/src/main/java/io/anserini/search/ScoredDocs.java index e3cf2e9834..bc6cdfde60 100644 --- a/src/main/java/io/anserini/search/ScoredDocs.java +++ b/src/main/java/io/anserini/search/ScoredDocs.java @@ -36,7 +36,12 @@ import java.util.Map; /** - * ScoredDocuments object that converts TopDocs from the searcher into an Anserini format + * This class, {@link ScoredDocs} and its cousin {@link ScoredDoc} are closely related and should be discussed in + * the same context. Both are designed to be wrappers around Lucene's {@link TopDocs} object, which is the raw results + * from a search. Both ScoredDocs and ScoredDoc[] hold exactly the same information, except + * that the first is an object of arrays, whereas the second is an array of objects. In the development of Anserini, + * ScoredDocs seemed more natural for reranking, but when passing results over to Python, + * ScoredDoc[] seemed more natural. 
*/ public class ScoredDocs { private static final Logger LOG = LogManager.getLogger(ScoredDocs.class); @@ -106,5 +111,4 @@ public static ScoredDocs fromQrels(Map qrels, IndexReader reade return scoredDocs; } - } diff --git a/src/main/java/io/anserini/search/SearchCollection.java b/src/main/java/io/anserini/search/SearchCollection.java index 0ec74c16c5..50e40af2af 100644 --- a/src/main/java/io/anserini/search/SearchCollection.java +++ b/src/main/java/io/anserini/search/SearchCollection.java @@ -636,7 +636,6 @@ public Args searchTweets() { } private final class Searcher> extends BaseSearcher { - private final IndexSearcher searcher; private final QueryGenerator generator; private final SdmQueryGenerator sdmQueryGenerator; private final Args args; @@ -644,8 +643,9 @@ private final class Searcher> extends BaseSearcher { public Searcher(IndexSearcher searcher, TaggedSimilarity taggedSimilarity, BaseSearchArgs args) { super(args); - this.searcher = searcher; - this.searcher.setSimilarity(taggedSimilarity.getSimilarity()); + setIndexSearcher(searcher); + getIndexSearcher().setSimilarity(taggedSimilarity.getSimilarity()); + this.sdmQueryGenerator = new SdmQueryGenerator(((Args) args).sdm_tw, ((Args) args).sdm_ow, ((Args) args).sdm_uw); try { @@ -675,26 +675,26 @@ public ScoredDocs search(K qid, String queryString, TopDocs rs = new TopDocs(new TotalHits(0, TotalHits.Relation.EQUAL_TO), new ScoreDoc[]{}); if (!isRerank || (args.rerankcutoff > 0 && args.rf_qrels == null) || (args.rf_qrels != null && !hasRelDocs)) { if (args.arbitraryScoreTieBreak) {// Figure out how to break the scoring ties. - rs = searcher.search(query, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : args.hits); + rs = getIndexSearcher().search(query, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : args.hits); } else { - rs = searcher.search(query, (isRerank && args.rf_qrels == null) ? 
args.rerankcutoff : args.hits, BREAK_SCORE_TIES_BY_DOCID, true); + rs = getIndexSearcher().search(query, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : args.hits, BREAK_SCORE_TIES_BY_DOCID, true); } } List queryTokens = AnalyzerUtils.analyze(analyzer, queryString); - RerankerContext context = new RerankerContext<>(searcher, qid, query, null, queryString, queryTokens, null, args); + RerankerContext context = new RerankerContext<>(getIndexSearcher(), qid, query, null, queryString, queryTokens, null, args); ScoredDocs scoredFbDocs; if (isRerank && args.rf_qrels != null) { if (hasRelDocs) { scoredFbDocs = queryQrels; } else {//if no relevant documents, only perform score based tie breaking next LOG.info("No relevant documents for " + qid.toString()); - scoredFbDocs = ScoredDocs.fromTopDocs(rs, searcher); + scoredFbDocs = ScoredDocs.fromTopDocs(rs, getIndexSearcher()); cascade = new RerankerCascade(); cascade.add(new ScoreTiesAdjusterReranker()); } } else { - scoredFbDocs = ScoredDocs.fromTopDocs(rs, searcher); + scoredFbDocs = ScoredDocs.fromTopDocs(rs, getIndexSearcher()); } return cascade.run(scoredFbDocs, context); @@ -726,17 +726,17 @@ WashingtonPostGenerator.WashingtonPostField.KICKER.name, new BytesRef("Opinions" // Search using constructed query. TopDocs rs; if (args.arbitraryScoreTieBreak) { - rs = searcher.search(query, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : args.hits); + rs = getIndexSearcher().search(query, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : args.hits); } else { - rs = searcher.search(query, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : + rs = getIndexSearcher().search(query, (isRerank && args.rf_qrels == null) ? 
args.rerankcutoff : args.hits, BREAK_SCORE_TIES_BY_DOCID, true); } - RerankerContext context = new RerankerContext<>(searcher, qid, query, docid, + RerankerContext context = new RerankerContext<>(getIndexSearcher(), qid, query, docid, StringUtils.join(", ", terms), terms, null, args); // Run the existing cascade. - ScoredDocs docs = cascade.run(ScoredDocs.fromTopDocs(rs, searcher), context); + ScoredDocs docs = cascade.run(ScoredDocs.fromTopDocs(rs, getIndexSearcher()), context); // Perform post-processing (e.g., date filter, dedupping, etc.) as a final step. return new NewsBackgroundLinkingReranker(analyzer, collectionClass).rerank(docs, context); @@ -775,25 +775,25 @@ public ScoredDocs searchTweets(K qid, TopDocs rs = new TopDocs(new TotalHits(0, TotalHits.Relation.EQUAL_TO), new ScoreDoc[]{}); if (!isRerank || (args.rerankcutoff > 0 && args.rf_qrels == null) || (args.rf_qrels != null && !hasRelDocs)) { if (args.arbitraryScoreTieBreak) {// Figure out how to break the scoring ties. - rs = searcher.search(compositeQuery, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : args.hits); + rs = getIndexSearcher().search(compositeQuery, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : args.hits); } else { - rs = searcher.search(compositeQuery, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : args.hits, + rs = getIndexSearcher().search(compositeQuery, (isRerank && args.rf_qrels == null) ? 
args.rerankcutoff : args.hits, BREAK_SCORE_TIES_BY_TWEETID, true); } } - RerankerContext context = new RerankerContext<>(searcher, qid, keywordQuery, null, queryString, queryTokens, filter, args); + RerankerContext context = new RerankerContext<>(getIndexSearcher(), qid, keywordQuery, null, queryString, queryTokens, filter, args); ScoredDocs scoredFbDocs; if (isRerank && args.rf_qrels != null) { if (hasRelDocs) { scoredFbDocs = queryQrels; } else {//if no relevant documents, only perform score based tie breaking next - scoredFbDocs = ScoredDocs.fromTopDocs(rs, searcher); + scoredFbDocs = ScoredDocs.fromTopDocs(rs, getIndexSearcher()); cascade = new RerankerCascade(); cascade.add(new ScoreTiesAdjusterReranker()); } } else { - scoredFbDocs = ScoredDocs.fromTopDocs(rs, searcher); + scoredFbDocs = ScoredDocs.fromTopDocs(rs, getIndexSearcher()); } return cascade.run(scoredFbDocs, context); diff --git a/src/test/java/io/anserini/search/BaseSearcherTest.java b/src/test/java/io/anserini/search/BaseSearcherTest.java index c358c80ffc..73799aa412 100644 --- a/src/test/java/io/anserini/search/BaseSearcherTest.java +++ b/src/test/java/io/anserini/search/BaseSearcherTest.java @@ -64,8 +64,8 @@ public void testProcessLuceneTopDocs() throws Exception { assertEquals(0.57024956f, topDocs.scoreDocs[0].score, 10e-8); // Now we can test BaseSearcher itself: - BaseSearcher baseSearcher = new BaseSearcher<>(new BaseSearchArgs()); - ScoredDoc[] scoredDocs = baseSearcher.processLuceneTopDocs(indexSearcher, "q1", topDocs); + BaseSearcher baseSearcher = new BaseSearcher<>(new BaseSearchArgs(), indexSearcher); + ScoredDoc[] scoredDocs = baseSearcher.processLuceneTopDocs("q1", topDocs); assertEquals(1, scoredDocs.length); assertEquals(2, scoredDocs[0].lucene_docid); @@ -73,6 +73,13 @@ public void testProcessLuceneTopDocs() throws Exception { assertEquals("doc3", scoredDocs[0].docid); assertEquals("here is a test", scoredDocs[0].lucene_document.get(Constants.CONTENTS)); 
assertEquals("{\"contents\": \"here is a test\"}", scoredDocs[0].lucene_document.get(Constants.RAW)); + + scoredDocs = baseSearcher.processLuceneTopDocs("q1", topDocs, false); + assertEquals(1, scoredDocs.length); + assertEquals(2, scoredDocs[0].lucene_docid); + assertEquals(0.57024956f, scoredDocs[0].score, 10e-8); + assertEquals("doc3", scoredDocs[0].docid); + assertNull(scoredDocs[0].lucene_document); } @Test @@ -104,7 +111,7 @@ public void processScoredDocs() throws Exception { assertEquals(0.57024956f, topDocs.scoreDocs[0].score, 10e-8); // Now we can test BaseSearcher itself: - BaseSearcher baseSearcher = new BaseSearcher<>(new BaseSearchArgs()); + BaseSearcher baseSearcher = new BaseSearcher<>(new BaseSearchArgs(), indexSearcher); ScoredDoc[] scoredDocs = baseSearcher.processScoredDocs("q1", ScoredDocs.fromTopDocs(topDocs, indexSearcher), true); assertEquals(1, scoredDocs.length); @@ -119,6 +126,6 @@ public void processScoredDocs() throws Exception { assertEquals(2, scoredDocs[0].lucene_docid); assertEquals(0.57024956f, scoredDocs[0].score, 10e-8); assertEquals("doc3", scoredDocs[0].docid); - assertEquals(null, scoredDocs[0].lucene_document); + assertNull(scoredDocs[0].lucene_document); } }