Added docs; more tests.
lintool committed Dec 20, 2023
1 parent de53f6c commit 3a8c31a
Showing 8 changed files with 170 additions and 39 deletions.
4 changes: 3 additions & 1 deletion src/main/java/io/anserini/search/BaseSearchArgs.java
@@ -31,11 +31,13 @@ public class BaseSearchArgs {
@Option(name = "-threads", metaVar = "[int]", usage = "Number of threads for running queries in parallel.")
public int threads = 4;

// In some test collections, a document is used as a query, usually denoted by setting the qid as the docid. In this
// case, we want to remove the docid from the ranked list.
@Option(name = "-removeQuery", usage = "Remove docids that have the query id when writing final run output.")
public Boolean removeQuery = false;

// Note that this option is set to false by default because duplicate documents usually indicate some underlying
- // indexing issues, and we don't want to just eat errors silently.
+ // corpus or indexing issues, and we don't want to just eat errors silently.
@Option(name = "-removeDuplicates", usage = "Remove duplicate docids when writing final run output.")
public Boolean removeDuplicates = false;

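Since the options above are plain args4j @Option annotations (their name/metaVar/usage signature matches org.kohsuke.args4j), a minimal parsing sketch looks like the following. The demo class and sample flags are hypothetical, not part of this commit:

import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;

public class ParseArgsDemo {
  public static void main(String[] argv) {
    // Sample flags; boolean options like -removeDuplicates act as bare switches.
    String[] sample = {"-threads", "8", "-removeDuplicates"};

    BaseSearchArgs args = new BaseSearchArgs();
    CmdLineParser parser = new CmdLineParser(args);
    try {
      parser.parseArgument(sample);
    } catch (CmdLineException e) {
      // Usage text is generated from the @Option annotations.
      parser.printUsage(System.err);
      return;
    }
    System.out.println("threads=" + args.threads + ", removeDuplicates=" + args.removeDuplicates);
  }
}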
125 changes: 121 additions & 4 deletions src/main/java/io/anserini/search/BaseSearcher.java
@@ -20,29 +20,119 @@
import org.apache.lucene.document.Document;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TopDocs;
import org.jetbrains.annotations.NotNull;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
* <p>This class provides a base for all Lucene searchers, handling three common post-processing operations
* (duplicate removal, docid-as-qid removal, and MaxP) on ranked lists, based on the supplied configuration.</p>
*
* <p>In more detail:</p>
*
* <ul>
* <li><b>Duplicate removal.</b> If the <code>-removeDuplicates</code> flag is set, then we remove duplicate docids in
* the ranked list. This is set to false by default because duplicate documents usually indicate some underlying corpus
* or indexing issues, and we don't want to just eat errors silently.</li>
*
* <li><b>Docid-as-qid removal.</b> In some test collections, a document is used as a query, usually denoted by
* setting the qid as the docid. If the <code>-removeQuery</code> flag is set, then we remove the docid from the
* ranked list.</li>
*
* <li><b>MaxP.</b> If the <code>-selectMaxPassage</code> flag is set, then we select the max scoring passage from a
* document as the score for that document. This technique dates from Dai and Callan (SIGIR 2019) in the context of
* BERT, although the general approach dates back to Callan (SIGIR 1994). We take <code>-selectMaxPassage.delimiter</code>
* as the doc/passage delimiter, which defaults to "." (dot), so the passages within a docid are labeled as "docid.00000",
* "docid.00001", "docid.00002", etc. Using "#" (hash) is a common alternative, e.g., "docid#0". The number of docs
* to return in the final ranked list is controlled by the parameter <code>-selectMaxPassage.hits</code>.</li>
* </ul>
*
* @param <K> type of qid, typically string or integer
*/
public class BaseSearcher<K extends Comparable<K>> {
protected final BaseSearchArgs args;
private IndexSearcher searcher;

/**
* Creates an instance of this class with supplied arguments.
*
* @param args configuration for duplicate removal, docid-as-qid removal, and MaxP
*/
public BaseSearcher(BaseSearchArgs args) {
this.args = args;
}

- public ScoredDoc[] processLuceneTopDocs(IndexSearcher searcher, K qid, TopDocs docs) throws IOException {
/**
* Creates an instance of this class with supplied arguments.
*
* @param args configuration for duplicate removal, docid-as-qid removal, and MaxP
* @param searcher {@link IndexSearcher} used for accessing documents from the index
*/
public BaseSearcher(BaseSearchArgs args, IndexSearcher searcher) {
this.args = args;
this.searcher = searcher;
}

/**
* Sets the {@link IndexSearcher} used for accessing documents from the index.
*
* @param searcher the {@link IndexSearcher} used for accessing documents from the index
*/
protected void setIndexSearcher(IndexSearcher searcher) {
this.searcher = searcher;
}

/**
* Gets the {@link IndexSearcher} used for accessing documents from the index.
*
* @return the {@link IndexSearcher} used for accessing documents from the index
*/
protected IndexSearcher getIndexSearcher() {
return this.searcher;
}

/**
* Processes Lucene {@link TopDocs} for a query based on the configuration for duplicate removal, docid-as-qid
* removal, and MaxP. By default, retains references to the original Lucene docs (which can be memory intensive for
* long ranked lists).
*
* @param qid query id
* @param docs Lucene {@link TopDocs}
* @return processed ranked list
*/
public ScoredDoc[] processLuceneTopDocs(K qid, TopDocs docs) {
return processLuceneTopDocs(qid, docs, true);
}

/**
* Processes Lucene {@link TopDocs} for a query based on the configuration for duplicate removal, docid-as-qid
* removal, and MaxP. Explicitly supports control over whether to retain references to the original Lucene docs
* (and hence memory usage).
*
* @param qid query id
* @param docs Lucene {@link TopDocs}
* @param keepLuceneDocument whether to retain references to the original Lucene docs
* @return processed ranked list
*/
public ScoredDoc[] processLuceneTopDocs(K qid, @NotNull TopDocs docs, boolean keepLuceneDocument) {
List<ScoredDoc> results = new ArrayList<>();
// For removing duplicate docids.
Set<String> docids = new HashSet<>();

int rank = 1;
for (int i = 0; i < docs.scoreDocs.length; i++) {
int lucene_docid = docs.scoreDocs[i].doc;
- Document lucene_document = searcher.storedFields().document(docs.scoreDocs[i].doc);
+ Document lucene_document;
+ try {
+   lucene_document = searcher.storedFields().document(docs.scoreDocs[i].doc);
+ } catch (IOException e) {
+   throw new RuntimeException(String.format("Unable to fetch document %d", docs.scoreDocs[i].doc), e);
+ }
String docid = lucene_document.get(Constants.ID);

if (args.selectMaxPassage) {
@@ -56,7 +146,11 @@ public ScoredDoc[] processLuceneTopDocs(IndexSearcher searcher, K qid, TopDocs d
if (args.removeQuery && docid.equals(qid))
continue;

- results.add(new ScoredDoc(docid, lucene_docid, docs.scoreDocs[i].score, lucene_document));
+ // Note that if keepLuceneDocument == true, then we're retaining references to a lot of objects that cannot be
+ // garbage collected. If we're running lots of queries, e.g., from SearchCollection, this can easily exhaust
+ // the heap.
+ results.add(new ScoredDoc(docid, lucene_docid, docs.scoreDocs[i].score,
+     keepLuceneDocument ? lucene_document : null));

// Note that this option is set to false by default because duplicate documents usually indicate some
// underlying indexing issues, and we don't want to just eat errors silently.
@@ -77,7 +171,30 @@ public ScoredDoc[] processLuceneTopDocs(IndexSearcher searcher, K qid, TopDocs d
return results.toArray(new ScoredDoc[0]);
}

- public ScoredDoc[] processScoredDocs(K qid, ScoredDocs docs, boolean keepLuceneDocument) {
/**
* Processes {@link ScoredDocs} for a query based on the configuration for duplicate removal, docid-as-qid removal,
* and MaxP. By default, retains references to the original Lucene docs (which can be memory intensive for long
* ranked lists).
*
* @param qid query id
* @param docs {@link ScoredDocs} to process
* @return processed ranked list
*/
public ScoredDoc[] processScoredDocs(K qid, ScoredDocs docs) {
return processScoredDocs(qid, docs, true);
}

/**
* Processes {@link ScoredDocs} for a query based on the configuration for duplicate removal, docid-as-qid removal,
* and MaxP. Explicitly supports control over whether to retain references to the original Lucene docs (and hence
* memory usage).
*
* @param qid query id
* @param docs {@link ScoredDocs} to process
* @param keepLuceneDocument whether to retain references to the original Lucene docs
* @return processed ranked list
*/
public ScoredDoc[] processScoredDocs(K qid, @NotNull ScoredDocs docs, boolean keepLuceneDocument) {
List<ScoredDoc> results = new ArrayList<>();
// For removing duplicate docids.
Set<String> docids = new HashSet<>();
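To make the post-processing contract concrete, here is a minimal usage sketch of the new API. This is hypothetical driver code, not part of the commit: the index path, qid, and query are placeholders, and it assumes the file lives alongside the io.anserini.search classes. Passing keepLuceneDocument = false follows the memory note in the Javadoc above.

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

import java.nio.file.Paths;

public class BaseSearcherDemo {
  public static void main(String[] argv) throws Exception {
    // Hypothetical path to an Anserini index with stored docids.
    IndexSearcher searcher = new IndexSearcher(
        DirectoryReader.open(FSDirectory.open(Paths.get("indexes/demo-index"))));

    BaseSearchArgs args = new BaseSearchArgs();
    args.removeDuplicates = true;  // surface each docid at most once
    // args.selectMaxPassage would additionally collapse "docid.00000", "docid.00001", ... into "docid".

    BaseSearcher<String> base = new BaseSearcher<>(args, searcher);
    TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 100);

    // keepLuceneDocument = false: don't retain stored Document references,
    // which keeps heap usage bounded when running many queries.
    ScoredDoc[] results = base.processLuceneTopDocs("q1", topDocs, false);
    for (ScoredDoc r : results) {
      System.out.println(r.docid + " " + r.score);
    }
  }
}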
11 changes: 5 additions & 6 deletions src/main/java/io/anserini/search/HnswDenseSearcher.java
@@ -70,7 +70,6 @@ public static class Args extends BaseSearchArgs {
}

private final IndexReader reader;
- private final IndexSearcher searcher;
private final VectorQueryGenerator generator;
private final DenseEncoder encoder;

@@ -86,7 +85,7 @@ public HnswDenseSearcher(Args args) {
throw new IllegalArgumentException(String.format("\"%s\" does not appear to be a valid index.", args.index));
}

- this.searcher = new IndexSearcher(this.reader);
+ setIndexSearcher(new IndexSearcher(this.reader));

try {
this.generator = (VectorQueryGenerator) Class
@@ -161,9 +160,9 @@ public ScoredDoc[] search(float[] queryFloat, int hits) throws IOException {

public ScoredDoc[] search(@Nullable K qid, float[] queryFloat, int hits) throws IOException {
KnnFloatVectorQuery query = new KnnFloatVectorQuery(Constants.VECTOR, queryFloat, ((Args) args).efSearch);
- TopDocs topDocs = searcher.search(query, hits, BREAK_SCORE_TIES_BY_DOCID, true);
+ TopDocs topDocs = getIndexSearcher().search(query, hits, BREAK_SCORE_TIES_BY_DOCID, true);

- return super.processLuceneTopDocs(this.searcher, qid, topDocs);
+ return super.processLuceneTopDocs(qid, topDocs);
}

public ScoredDoc[] search(String queryString, int hits) throws IOException {
@@ -180,9 +179,9 @@ public ScoredDoc[] search(@Nullable K qid, String queryString, int hits) throws
}

KnnFloatVectorQuery query = generator.buildQuery(Constants.VECTOR, queryString, ((Args) args).efSearch);
- TopDocs topDocs = searcher.search(query, hits, BREAK_SCORE_TIES_BY_DOCID, true);
+ TopDocs topDocs = getIndexSearcher().search(query, hits, BREAK_SCORE_TIES_BY_DOCID, true);

- return super.processLuceneTopDocs(this.searcher, qid, topDocs);
+ return super.processLuceneTopDocs(qid, topDocs);
}

@Override
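A usage sketch of the refactored searcher follows. The index path, qid, and query are placeholders; it assumes -index and -efSearch are exposed as public fields (as the constructor's args.index and efSearch references suggest) and that close() is the method overridden at the end of the excerpt above.

public class HnswSearchDemo {
  public static void main(String[] argv) throws Exception {
    HnswDenseSearcher.Args args = new HnswDenseSearcher.Args();
    args.index = "indexes/demo-hnsw";  // hypothetical path to an Anserini HNSW index
    args.efSearch = 100;               // breadth of the HNSW graph traversal at query time

    HnswDenseSearcher<String> searcher = new HnswDenseSearcher<>(args);

    // String queries go through the configured VectorQueryGenerator (and dense
    // encoder, if one is configured); pre-encoded queries can be passed as float[].
    ScoredDoc[] hits = searcher.search("q1", "what is a lobster roll?", 10);
    for (ScoredDoc hit : hits) {
      System.out.println(hit.docid + " " + hit.score);
    }

    searcher.close();
  }
}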
9 changes: 4 additions & 5 deletions src/main/java/io/anserini/search/InvertedDenseSearcher.java
@@ -83,7 +83,6 @@ public static class Args extends BaseSearchArgs {
}

private final IndexReader reader;
- private final IndexSearcher searcher;
private final InvertedDenseVectorQueryGenerator generator;

public InvertedDenseSearcher(Args args) {
@@ -98,9 +97,9 @@ public InvertedDenseSearcher(Args args) {
throw new IllegalArgumentException(String.format("\"%s\" does not appear to be a valid index.", args.index));
}

- this.searcher = new IndexSearcher(this.reader);
+ setIndexSearcher(new IndexSearcher(this.reader));
  if (args.encoding.equalsIgnoreCase(FW)) {
-   searcher.setSimilarity(new ClassicSimilarity());
+   getIndexSearcher().setSimilarity(new ClassicSimilarity());
}

this.generator = new InvertedDenseVectorQueryGenerator(args, true);
@@ -158,9 +157,9 @@ public ScoredDoc[] search(String queryString, int hits) throws IOException {

public ScoredDoc[] search(K qid, String queryString, int hits) throws IOException {
Query query = generator.buildQuery(queryString);
- TopDocs topDocs = searcher.search(query, hits, BREAK_SCORE_TIES_BY_DOCID, true);
+ TopDocs topDocs = getIndexSearcher().search(query, hits, BREAK_SCORE_TIES_BY_DOCID, true);

- return super.processLuceneTopDocs(this.searcher, qid, topDocs);
+ return super.processLuceneTopDocs(qid, topDocs);
}

@Override
3 changes: 3 additions & 0 deletions src/main/java/io/anserini/search/ScoredDoc.java
@@ -18,6 +18,9 @@

import org.apache.lucene.document.Document;

/**
* See documentation for {@link ScoredDocs}.
*/
public class ScoredDoc {
public String docid;
public int lucene_docid;
8 changes: 6 additions & 2 deletions src/main/java/io/anserini/search/ScoredDocs.java
@@ -36,7 +36,12 @@
import java.util.Map;

/**
- * ScoredDocuments object that converts TopDocs from the searcher into an Anserini format
+ * This class, {@link ScoredDocs}, and its cousin {@link ScoredDoc} are closely related and should be discussed in
+ * the same context. Both are designed to be wrappers around Lucene's {@link TopDocs} object, which holds the raw
+ * results of a search. Both <code>ScoredDocs</code> and <code>ScoredDoc[]</code> hold exactly the same information,
+ * except that the first is an object of arrays, whereas the second is an array of objects. In the development of
+ * Anserini, <code>ScoredDocs</code> seemed more natural for reranking, but when passing results over to Python,
+ * <code>ScoredDoc[]</code> seemed more natural.
*/
public class ScoredDocs {
private static final Logger LOG = LogManager.getLogger(ScoredDocs.class);
@@ -106,5 +111,4 @@ public static ScoredDocs fromQrels(Map<String, Integer> qrels, IndexReader reade

return scoredDocs;
}

}
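The object-of-arrays vs. array-of-objects distinction is easiest to see in a toy conversion. This is a schematic sketch: the parallel arrays stand in for the fields of ScoredDocs (whose declarations are elided above), and it assumes the four-argument ScoredDoc constructor invoked in BaseSearcher is accessible.

public class LayoutDemo {
  public static void main(String[] argv) {
    // Object of arrays: parallel arrays indexed by rank (the ScoredDocs layout,
    // natural for reranking). These locals stand in for the actual fields.
    String[] docids = {"doc1", "doc2", "doc3"};
    int[] luceneDocids = {42, 17, 99};
    float[] scores = {3.2f, 2.7f, 1.9f};

    // Array of objects: one ScoredDoc per result (natural for passing to Python).
    ScoredDoc[] results = new ScoredDoc[docids.length];
    for (int i = 0; i < docids.length; i++) {
      // Constructor per the call in BaseSearcher; null means no retained Lucene document.
      results[i] = new ScoredDoc(docids[i], luceneDocids[i], scores[i], null);
    }

    System.out.println(results[0].docid + " " + results[0].score);
  }
}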
34 changes: 17 additions & 17 deletions src/main/java/io/anserini/search/SearchCollection.java
@@ -636,16 +636,16 @@ public Args searchTweets() {
}

private final class Searcher<K extends Comparable<K>> extends BaseSearcher<K> {
- private final IndexSearcher searcher;
private final QueryGenerator generator;
private final SdmQueryGenerator sdmQueryGenerator;
private final Args args;

public Searcher(IndexSearcher searcher, TaggedSimilarity taggedSimilarity, BaseSearchArgs args) {
super(args);

- this.searcher = searcher;
- this.searcher.setSimilarity(taggedSimilarity.getSimilarity());
+ setIndexSearcher(searcher);
+ getIndexSearcher().setSimilarity(taggedSimilarity.getSimilarity());

this.sdmQueryGenerator = new SdmQueryGenerator(((Args) args).sdm_tw, ((Args) args).sdm_ow, ((Args) args).sdm_uw);

try {
@@ -675,26 +675,26 @@ public ScoredDocs search(K qid, String queryString,
TopDocs rs = new TopDocs(new TotalHits(0, TotalHits.Relation.EQUAL_TO), new ScoreDoc[]{});
if (!isRerank || (args.rerankcutoff > 0 && args.rf_qrels == null) || (args.rf_qrels != null && !hasRelDocs)) {
if (args.arbitraryScoreTieBreak) {// Figure out how to break the scoring ties.
- rs = searcher.search(query, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : args.hits);
+ rs = getIndexSearcher().search(query, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : args.hits);
} else {
- rs = searcher.search(query, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : args.hits, BREAK_SCORE_TIES_BY_DOCID, true);
+ rs = getIndexSearcher().search(query, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : args.hits, BREAK_SCORE_TIES_BY_DOCID, true);
}
}

List<String> queryTokens = AnalyzerUtils.analyze(analyzer, queryString);
- RerankerContext context = new RerankerContext<>(searcher, qid, query, null, queryString, queryTokens, null, args);
+ RerankerContext context = new RerankerContext<>(getIndexSearcher(), qid, query, null, queryString, queryTokens, null, args);
ScoredDocs scoredFbDocs;
if (isRerank && args.rf_qrels != null) {
if (hasRelDocs) {
scoredFbDocs = queryQrels;
} else {//if no relevant documents, only perform score based tie breaking next
LOG.info("No relevant documents for " + qid.toString());
- scoredFbDocs = ScoredDocs.fromTopDocs(rs, searcher);
+ scoredFbDocs = ScoredDocs.fromTopDocs(rs, getIndexSearcher());
cascade = new RerankerCascade();
cascade.add(new ScoreTiesAdjusterReranker());
}
} else {
- scoredFbDocs = ScoredDocs.fromTopDocs(rs, searcher);
+ scoredFbDocs = ScoredDocs.fromTopDocs(rs, getIndexSearcher());
}

return cascade.run(scoredFbDocs, context);
@@ -726,17 +726,17 @@ WashingtonPostGenerator.WashingtonPostField.KICKER.name, new BytesRef("Opinions"
// Search using constructed query.
TopDocs rs;
if (args.arbitraryScoreTieBreak) {
- rs = searcher.search(query, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : args.hits);
+ rs = getIndexSearcher().search(query, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : args.hits);
} else {
- rs = searcher.search(query, (isRerank && args.rf_qrels == null) ? args.rerankcutoff :
+ rs = getIndexSearcher().search(query, (isRerank && args.rf_qrels == null) ? args.rerankcutoff :
    args.hits, BREAK_SCORE_TIES_BY_DOCID, true);
}

- RerankerContext context = new RerankerContext<>(searcher, qid, query, docid,
+ RerankerContext context = new RerankerContext<>(getIndexSearcher(), qid, query, docid,
    StringUtils.join(", ", terms), terms, null, args);

// Run the existing cascade.
- ScoredDocs docs = cascade.run(ScoredDocs.fromTopDocs(rs, searcher), context);
+ ScoredDocs docs = cascade.run(ScoredDocs.fromTopDocs(rs, getIndexSearcher()), context);

// Perform post-processing (e.g., date filter, dedupping, etc.) as a final step.
return new NewsBackgroundLinkingReranker(analyzer, collectionClass).rerank(docs, context);
@@ -775,25 +775,25 @@ public ScoredDocs searchTweets(K qid,
TopDocs rs = new TopDocs(new TotalHits(0, TotalHits.Relation.EQUAL_TO), new ScoreDoc[]{});
if (!isRerank || (args.rerankcutoff > 0 && args.rf_qrels == null) || (args.rf_qrels != null && !hasRelDocs)) {
if (args.arbitraryScoreTieBreak) {// Figure out how to break the scoring ties.
- rs = searcher.search(compositeQuery, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : args.hits);
+ rs = getIndexSearcher().search(compositeQuery, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : args.hits);
} else {
- rs = searcher.search(compositeQuery, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : args.hits,
+ rs = getIndexSearcher().search(compositeQuery, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : args.hits,
BREAK_SCORE_TIES_BY_TWEETID, true);
}
}

- RerankerContext context = new RerankerContext<>(searcher, qid, keywordQuery, null, queryString, queryTokens, filter, args);
+ RerankerContext context = new RerankerContext<>(getIndexSearcher(), qid, keywordQuery, null, queryString, queryTokens, filter, args);
ScoredDocs scoredFbDocs;
if (isRerank && args.rf_qrels != null) {
if (hasRelDocs) {
scoredFbDocs = queryQrels;
} else {//if no relevant documents, only perform score based tie breaking next
- scoredFbDocs = ScoredDocs.fromTopDocs(rs, searcher);
+ scoredFbDocs = ScoredDocs.fromTopDocs(rs, getIndexSearcher());
cascade = new RerankerCascade();
cascade.add(new ScoreTiesAdjusterReranker());
}
} else {
- scoredFbDocs = ScoredDocs.fromTopDocs(rs, searcher);
+ scoredFbDocs = ScoredDocs.fromTopDocs(rs, getIndexSearcher());
}

return cascade.run(scoredFbDocs, context);
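All three search methods above share the same retrieve-then-rerank shape: first-stage Lucene results are wrapped via ScoredDocs.fromTopDocs and pushed through a RerankerCascade. The following condensed sketch mirrors the no-feedback branch; the standalone method shape and the io.anserini.rerank import locations are assumptions, while the individual calls are taken from the code above.

import io.anserini.rerank.RerankerCascade;  // package locations assumed
import io.anserini.rerank.RerankerContext;
import io.anserini.rerank.lib.ScoreTiesAdjusterReranker;

import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;

import java.util.List;

public class CascadeFlowSketch {
  // Illustrative standalone method; SearchCollection inlines this logic per query.
  public static <K extends Comparable<K>> ScoredDocs searchThenRerank(
      IndexSearcher searcher, K qid, Query query, String queryString,
      List<String> queryTokens, SearchCollection.Args args) throws Exception {
    // First-stage retrieval.
    TopDocs rs = searcher.search(query, args.hits);

    // Context carrying the query and configuration to each reranker.
    RerankerContext<K> context =
        new RerankerContext<>(searcher, qid, query, null, queryString, queryTokens, null, args);

    // Minimal cascade, as in the no-feedback branch: just adjust score ties.
    RerankerCascade cascade = new RerankerCascade();
    cascade.add(new ScoreTiesAdjusterReranker());

    return cascade.run(ScoredDocs.fromTopDocs(rs, searcher), context);
  }
}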