Added docs; more tests.
lintool committed Dec 20, 2023
1 parent de53f6c commit 3a8c31a
Showing 8 changed files with 170 additions and 39 deletions.
4 changes: 3 additions & 1 deletion src/main/java/io/anserini/search/BaseSearchArgs.java
@@ -31,11 +31,13 @@ public class BaseSearchArgs {
@Option(name = "-threads", metaVar = "[int]", usage = "Number of threads for running queries in parallel.")
public int threads = 4;

// In some test collections, a document is used as a query, usually denoted by setting the qid as the docid. In this
// case, we want to remove the docid from the ranked list.
@Option(name = "-removeQuery", usage = "Remove docids that have the query id when writing final run output.")
public Boolean removeQuery = false;

// Note that this option is set to false by default because duplicate documents usually indicate some underlying
- // indexing issues, and we don't want to just eat errors silently.
+ // corpus or indexing issues, and we don't want to just eat errors silently.
@Option(name = "-removeDuplicates", usage = "Remove duplicate docids when writing final run output.")
public Boolean removeDuplicates = false;

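Since the options above are plain args4j @Option annotations (their name/metaVar/usage signature matches org.kohsuke.args4j), a minimal parsing sketch looks like the following. The demo class and sample flags are hypothetical, not part of this commit:

import org.kohsuke.args4j.CmdLineException;
import org.kohsuke.args4j.CmdLineParser;

public class ParseArgsDemo {
  public static void main(String[] argv) {
    // Sample flags; boolean options like -removeDuplicates act as bare switches.
    String[] sample = {"-threads", "8", "-removeDuplicates"};

    BaseSearchArgs args = new BaseSearchArgs();
    CmdLineParser parser = new CmdLineParser(args);
    try {
      parser.parseArgument(sample);
    } catch (CmdLineException e) {
      // Usage text is generated from the @Option annotations.
      parser.printUsage(System.err);
      return;
    }
    System.out.println("threads=" + args.threads + ", removeDuplicates=" + args.removeDuplicates);
  }
}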
125 changes: 121 additions & 4 deletions src/main/java/io/anserini/search/BaseSearcher.java
@@ -20,29 +20,119 @@
import org.apache.lucene.document.Document;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.TopDocs;
import org.jetbrains.annotations.NotNull;

import java.io.IOException;
import java.util.ArrayList;
import java.util.HashSet;
import java.util.List;
import java.util.Set;

/**
* <p>This class provides a base for all Lucene searchers, handling three common post-processing operations
* (duplicate removal, docid-as-qid removal, and MaxP) on ranked lists, based on the supplied configuration.</p>
*
* <p>In more detail:</p>
*
* <ul>
* <li><b>Duplicate removal.</b> If the <code>-removeDuplicates</code> flag is set, then we remove duplicate docids in
* the ranked list. This is set to false by default because duplicate documents usually indicate some underlying corpus
* or indexing issues, and we don't want to just eat errors silently.</li>
*
* <li><b>Docid-as-qid removal.</b> In some test collections, a document is used as a query, usually denoted by
* setting the qid as the docid. If the <code>-removeQuery</code> flag is set, then we remove the docid from the
* ranked list.</li>
*
* <li><b>MaxP.</b> If the <code>-selectMaxPassage</code> flag is set, then we select the max scoring passage from a
* document as the score for that document. This technique dates from Dai and Callan (SIGIR 2019) in the context of
* BERT, although the general approach dates back to Callan (SIGIR 1994). We take <code>-selectMaxPassage.delimiter</code>
* as the doc/passage delimiter, which defaults to "." (dot), so the passages within a docid are labeled as "docid.00000",
* "docid.00001", "docid.00002", etc. Using "#" (hash) is a common alternative, e.g., "docid#0". The number of docs
* to return in the final ranked list is controlled by the parameter <code>-selectMaxPassage.hits</code>.</li>
* </ul>
*
* @param <K> type of qid, typically string or integer
*/
public class BaseSearcher<K extends Comparable<K>> {
protected final BaseSearchArgs args;
private IndexSearcher searcher;

/**
* Creates an instance of this class with supplied arguments.
*
* @param args configuration for duplicate removal, docid-as-qid removal, and MaxP
*/
public BaseSearcher(BaseSearchArgs args) {
this.args = args;
}

- public ScoredDoc[] processLuceneTopDocs(IndexSearcher searcher, K qid, TopDocs docs) throws IOException {
/**
* Creates an instance of this class with supplied arguments.
*
* @param args configuration for duplicate removal, docid-as-qid removal, and MaxP
* @param searcher {@link IndexSearcher} used for accessing documents from the index
*/
public BaseSearcher(BaseSearchArgs args, IndexSearcher searcher) {
this.args = args;
this.searcher = searcher;
}

/**
* Sets the {@link IndexSearcher} used for accessing documents from the index.
*
* @param searcher the {@link IndexSearcher} used for accessing documents from the index
*/
protected void setIndexSearcher(IndexSearcher searcher) {
this.searcher = searcher;
}

/**
* Gets the {@link IndexSearcher} used for accessing documents from the index.
*
* @return the {@link IndexSearcher} used for accessing documents from the index
*/
protected IndexSearcher getIndexSearcher() {
return this.searcher;
}

/**
* Processes Lucene {@link TopDocs} for a query based on the configuration for duplicate removal, docid-as-qid
* removal, and MaxP. By default, retains references to the original Lucene docs (which can be memory intensive for
* long ranked lists).
*
* @param qid query id
* @param docs Lucene {@link TopDocs}
* @return processed ranked list
*/
public ScoredDoc[] processLuceneTopDocs(K qid, TopDocs docs) {
return processLuceneTopDocs(qid, docs, true);
}

/**
* Processes Lucene {@link TopDocs} for a query based on the configuration for duplicate removal, docid-as-qid
* removal, and MaxP. Explicitly supports control over whether to retain references to the original Lucene docs
* (and hence memory usage).
*
* @param qid query id
* @param docs Lucene {@link TopDocs}
* @param keepLuceneDocument whether to retain references to the original Lucene docs
* @return processed ranked list
*/
public ScoredDoc[] processLuceneTopDocs(K qid, @NotNull TopDocs docs, boolean keepLuceneDocument) {
List<ScoredDoc> results = new ArrayList<>();
// For removing duplicate docids.
Set<String> docids = new HashSet<>();

int rank = 1;
for (int i = 0; i < docs.scoreDocs.length; i++) {
int lucene_docid = docs.scoreDocs[i].doc;
- Document lucene_document = searcher.storedFields().document(docs.scoreDocs[i].doc);
+ Document lucene_document;
+ try {
+   lucene_document = searcher.storedFields().document(docs.scoreDocs[i].doc);
+ } catch (IOException e) {
+   throw new RuntimeException(String.format("Unable to fetch document %d", docs.scoreDocs[i].doc), e);
+ }
String docid = lucene_document.get(Constants.ID);

if (args.selectMaxPassage) {
@@ -56,7 +146,11 @@ public ScoredDoc[] processLuceneTopDocs(IndexSearcher searcher, K qid, TopDocs d
if (args.removeQuery && docid.equals(qid))
continue;

- results.add(new ScoredDoc(docid, lucene_docid, docs.scoreDocs[i].score, lucene_document));
+ // Note that if keepLuceneDocument == true, then we're retaining references to a lot of objects that cannot be
+ // garbage collected. If we're running lots of queries, e.g., from SearchCollection, this can easily exhaust
+ // the heap.
+ results.add(new ScoredDoc(docid, lucene_docid, docs.scoreDocs[i].score,
+     keepLuceneDocument ? lucene_document : null));

// Note that this option is set to false by default because duplicate documents usually indicate some
// underlying indexing issues, and we don't want to just eat errors silently.
@@ -77,7 +171,30 @@ public ScoredDoc[] processLuceneTopDocs(IndexSearcher searcher, K qid, TopDocs d
return results.toArray(new ScoredDoc[0]);
}

- public ScoredDoc[] processScoredDocs(K qid, ScoredDocs docs, boolean keepLuceneDocument) {
/**
* Processes {@link ScoredDocs} for a query based on the configuration for duplicate removal, docid-as-qid removal,
* and MaxP. By default, retains references to the original Lucene docs (which can be memory intensive for long
* ranked lists).
*
* @param qid query id
* @param docs {@link ScoredDocs} to process
* @return processed ranked list
*/
public ScoredDoc[] processScoredDocs(K qid, ScoredDocs docs) {
return processScoredDocs(qid, docs, true);
}

/**
* Processes {@link ScoredDocs} for a query based on the configuration for duplicate removal, docid-as-qid removal,
* and MaxP. Explicitly supports control over whether to retain references to the original Lucene docs (and hence
* memory usage).
*
* @param qid query id
* @param docs {@link ScoredDocs} to process
* @param keepLuceneDocument whether to retain references to the original Lucene docs
* @return processed ranked list
*/
public ScoredDoc[] processScoredDocs(K qid, @NotNull ScoredDocs docs, boolean keepLuceneDocument) {
List<ScoredDoc> results = new ArrayList<>();
// For removing duplicate docids.
Set<String> docids = new HashSet<>();
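To make the post-processing contract concrete, here is a minimal usage sketch of the new API. This is hypothetical driver code, not part of the commit: the index path, qid, and query are placeholders, and it assumes the file lives alongside the io.anserini.search classes. Passing keepLuceneDocument = false follows the memory note in the Javadoc above.

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.MatchAllDocsQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.FSDirectory;

import java.nio.file.Paths;

public class BaseSearcherDemo {
  public static void main(String[] argv) throws Exception {
    // Hypothetical path to an Anserini index with stored docids.
    IndexSearcher searcher = new IndexSearcher(
        DirectoryReader.open(FSDirectory.open(Paths.get("indexes/demo-index"))));

    BaseSearchArgs args = new BaseSearchArgs();
    args.removeDuplicates = true;  // surface each docid at most once
    // args.selectMaxPassage would additionally collapse "docid.00000", "docid.00001", ... into "docid".

    BaseSearcher<String> base = new BaseSearcher<>(args, searcher);
    TopDocs topDocs = searcher.search(new MatchAllDocsQuery(), 100);

    // keepLuceneDocument = false: don't retain stored Document references,
    // which keeps heap usage bounded when running many queries.
    ScoredDoc[] results = base.processLuceneTopDocs("q1", topDocs, false);
    for (ScoredDoc r : results) {
      System.out.println(r.docid + " " + r.score);
    }
  }
}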
11 changes: 5 additions & 6 deletions src/main/java/io/anserini/search/HnswDenseSearcher.java
@@ -70,7 +70,6 @@ public static class Args extends BaseSearchArgs {
}

private final IndexReader reader;
- private final IndexSearcher searcher;
private final VectorQueryGenerator generator;
private final DenseEncoder encoder;

@@ -86,7 +85,7 @@ public HnswDenseSearcher(Args args) {
throw new IllegalArgumentException(String.format("\"%s\" does not appear to be a valid index.", args.index));
}

- this.searcher = new IndexSearcher(this.reader);
+ setIndexSearcher(new IndexSearcher(this.reader));

try {
this.generator = (VectorQueryGenerator) Class
@@ -161,9 +160,9 @@ public ScoredDoc[] search(float[] queryFloat, int hits) throws IOException {

public ScoredDoc[] search(@Nullable K qid, float[] queryFloat, int hits) throws IOException {
KnnFloatVectorQuery query = new KnnFloatVectorQuery(Constants.VECTOR, queryFloat, ((Args) args).efSearch);
- TopDocs topDocs = searcher.search(query, hits, BREAK_SCORE_TIES_BY_DOCID, true);
+ TopDocs topDocs = getIndexSearcher().search(query, hits, BREAK_SCORE_TIES_BY_DOCID, true);

- return super.processLuceneTopDocs(this.searcher, qid, topDocs);
+ return super.processLuceneTopDocs(qid, topDocs);
}

public ScoredDoc[] search(String queryString, int hits) throws IOException {
@@ -180,9 +179,9 @@ public ScoredDoc[] search(@Nullable K qid, String queryString, int hits) throws
}

KnnFloatVectorQuery query = generator.buildQuery(Constants.VECTOR, queryString, ((Args) args).efSearch);
- TopDocs topDocs = searcher.search(query, hits, BREAK_SCORE_TIES_BY_DOCID, true);
+ TopDocs topDocs = getIndexSearcher().search(query, hits, BREAK_SCORE_TIES_BY_DOCID, true);

- return super.processLuceneTopDocs(this.searcher, qid, topDocs);
+ return super.processLuceneTopDocs(qid, topDocs);
}

@Override
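A usage sketch of the refactored searcher follows. The index path, qid, and query are placeholders; it assumes -index and -efSearch are exposed as public fields (as the constructor's args.index and efSearch references suggest) and that close() is the method overridden at the end of the excerpt above.

public class HnswSearchDemo {
  public static void main(String[] argv) throws Exception {
    HnswDenseSearcher.Args args = new HnswDenseSearcher.Args();
    args.index = "indexes/demo-hnsw";  // hypothetical path to an Anserini HNSW index
    args.efSearch = 100;               // breadth of the HNSW graph traversal at query time

    HnswDenseSearcher<String> searcher = new HnswDenseSearcher<>(args);

    // String queries go through the configured VectorQueryGenerator (and dense
    // encoder, if one is configured); pre-encoded queries can be passed as float[].
    ScoredDoc[] hits = searcher.search("q1", "what is a lobster roll?", 10);
    for (ScoredDoc hit : hits) {
      System.out.println(hit.docid + " " + hit.score);
    }

    searcher.close();
  }
}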
9 changes: 4 additions & 5 deletions src/main/java/io/anserini/search/InvertedDenseSearcher.java
@@ -83,7 +83,6 @@ public static class Args extends BaseSearchArgs {
}

private final IndexReader reader;
- private final IndexSearcher searcher;
private final InvertedDenseVectorQueryGenerator generator;

public InvertedDenseSearcher(Args args) {
@@ -98,9 +97,9 @@ public InvertedDenseSearcher(Args args) {
throw new IllegalArgumentException(String.format("\"%s\" does not appear to be a valid index.", args.index));
}

- this.searcher = new IndexSearcher(this.reader);
+ setIndexSearcher(new IndexSearcher(this.reader));
  if (args.encoding.equalsIgnoreCase(FW)) {
-   searcher.setSimilarity(new ClassicSimilarity());
+   getIndexSearcher().setSimilarity(new ClassicSimilarity());
}

this.generator = new InvertedDenseVectorQueryGenerator(args, true);
@@ -158,9 +157,9 @@ public ScoredDoc[] search(String queryString, int hits) throws IOException {

public ScoredDoc[] search(K qid, String queryString, int hits) throws IOException {
Query query = generator.buildQuery(queryString);
- TopDocs topDocs = searcher.search(query, hits, BREAK_SCORE_TIES_BY_DOCID, true);
+ TopDocs topDocs = getIndexSearcher().search(query, hits, BREAK_SCORE_TIES_BY_DOCID, true);

- return super.processLuceneTopDocs(this.searcher, qid, topDocs);
+ return super.processLuceneTopDocs(qid, topDocs);
}

@Override
3 changes: 3 additions & 0 deletions src/main/java/io/anserini/search/ScoredDoc.java
@@ -18,6 +18,9 @@

import org.apache.lucene.document.Document;

/**
* See documentation for {@link ScoredDocs}.
*/
public class ScoredDoc {
public String docid;
public int lucene_docid;
8 changes: 6 additions & 2 deletions src/main/java/io/anserini/search/ScoredDocs.java
@@ -36,7 +36,12 @@
import java.util.Map;

/**
- * ScoredDocuments object that converts TopDocs from the searcher into an Anserini format
+ * This class, {@link ScoredDocs}, and its cousin {@link ScoredDoc} are closely related and should be discussed in
+ * the same context. Both are designed to be wrappers around Lucene's {@link TopDocs} object, which holds the raw
+ * results of a search. Both <code>ScoredDocs</code> and <code>ScoredDoc[]</code> hold exactly the same information,
+ * except that the first is an object of arrays, whereas the second is an array of objects. In the development of
+ * Anserini, <code>ScoredDocs</code> seemed more natural for reranking, but when passing results over to Python,
+ * <code>ScoredDoc[]</code> seemed more natural.
*/
public class ScoredDocs {
private static final Logger LOG = LogManager.getLogger(ScoredDocs.class);
@@ -106,5 +111,4 @@ public static ScoredDocs fromQrels(Map<String, Integer> qrels, IndexReader reade

return scoredDocs;
}

}
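The object-of-arrays vs. array-of-objects distinction is easiest to see in a toy conversion. This is a schematic sketch: the parallel arrays stand in for the fields of ScoredDocs (whose declarations are elided above), and it assumes the four-argument ScoredDoc constructor invoked in BaseSearcher is accessible.

public class LayoutDemo {
  public static void main(String[] argv) {
    // Object of arrays: parallel arrays indexed by rank (the ScoredDocs layout,
    // natural for reranking). These locals stand in for the actual fields.
    String[] docids = {"doc1", "doc2", "doc3"};
    int[] luceneDocids = {42, 17, 99};
    float[] scores = {3.2f, 2.7f, 1.9f};

    // Array of objects: one ScoredDoc per result (natural for passing to Python).
    ScoredDoc[] results = new ScoredDoc[docids.length];
    for (int i = 0; i < docids.length; i++) {
      // Constructor per the call in BaseSearcher; null means no retained Lucene document.
      results[i] = new ScoredDoc(docids[i], luceneDocids[i], scores[i], null);
    }

    System.out.println(results[0].docid + " " + results[0].score);
  }
}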
34 changes: 17 additions & 17 deletions src/main/java/io/anserini/search/SearchCollection.java
@@ -636,16 +636,16 @@ public Args searchTweets() {
}

private final class Searcher<K extends Comparable<K>> extends BaseSearcher<K> {
- private final IndexSearcher searcher;
private final QueryGenerator generator;
private final SdmQueryGenerator sdmQueryGenerator;
private final Args args;

public Searcher(IndexSearcher searcher, TaggedSimilarity taggedSimilarity, BaseSearchArgs args) {
super(args);

- this.searcher = searcher;
- this.searcher.setSimilarity(taggedSimilarity.getSimilarity());
+ setIndexSearcher(searcher);
+ getIndexSearcher().setSimilarity(taggedSimilarity.getSimilarity());

this.sdmQueryGenerator = new SdmQueryGenerator(((Args) args).sdm_tw, ((Args) args).sdm_ow, ((Args) args).sdm_uw);

try {
@@ -675,26 +675,26 @@ public ScoredDocs search(K qid, String queryString,
TopDocs rs = new TopDocs(new TotalHits(0, TotalHits.Relation.EQUAL_TO), new ScoreDoc[]{});
if (!isRerank || (args.rerankcutoff > 0 && args.rf_qrels == null) || (args.rf_qrels != null && !hasRelDocs)) {
if (args.arbitraryScoreTieBreak) {// Figure out how to break the scoring ties.
- rs = searcher.search(query, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : args.hits);
+ rs = getIndexSearcher().search(query, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : args.hits);
} else {
- rs = searcher.search(query, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : args.hits, BREAK_SCORE_TIES_BY_DOCID, true);
+ rs = getIndexSearcher().search(query, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : args.hits, BREAK_SCORE_TIES_BY_DOCID, true);
}
}

List<String> queryTokens = AnalyzerUtils.analyze(analyzer, queryString);
- RerankerContext context = new RerankerContext<>(searcher, qid, query, null, queryString, queryTokens, null, args);
+ RerankerContext context = new RerankerContext<>(getIndexSearcher(), qid, query, null, queryString, queryTokens, null, args);
ScoredDocs scoredFbDocs;
if (isRerank && args.rf_qrels != null) {
if (hasRelDocs) {
scoredFbDocs = queryQrels;
} else {//if no relevant documents, only perform score based tie breaking next
LOG.info("No relevant documents for " + qid.toString());
- scoredFbDocs = ScoredDocs.fromTopDocs(rs, searcher);
+ scoredFbDocs = ScoredDocs.fromTopDocs(rs, getIndexSearcher());
cascade = new RerankerCascade();
cascade.add(new ScoreTiesAdjusterReranker());
}
} else {
- scoredFbDocs = ScoredDocs.fromTopDocs(rs, searcher);
+ scoredFbDocs = ScoredDocs.fromTopDocs(rs, getIndexSearcher());
}

return cascade.run(scoredFbDocs, context);
@@ -726,17 +726,17 @@ WashingtonPostGenerator.WashingtonPostField.KICKER.name, new BytesRef("Opinions"
// Search using constructed query.
TopDocs rs;
if (args.arbitraryScoreTieBreak) {
- rs = searcher.search(query, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : args.hits);
+ rs = getIndexSearcher().search(query, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : args.hits);
} else {
- rs = searcher.search(query, (isRerank && args.rf_qrels == null) ? args.rerankcutoff :
+ rs = getIndexSearcher().search(query, (isRerank && args.rf_qrels == null) ? args.rerankcutoff :
    args.hits, BREAK_SCORE_TIES_BY_DOCID, true);
}

- RerankerContext context = new RerankerContext<>(searcher, qid, query, docid,
+ RerankerContext context = new RerankerContext<>(getIndexSearcher(), qid, query, docid,
    StringUtils.join(", ", terms), terms, null, args);

// Run the existing cascade.
- ScoredDocs docs = cascade.run(ScoredDocs.fromTopDocs(rs, searcher), context);
+ ScoredDocs docs = cascade.run(ScoredDocs.fromTopDocs(rs, getIndexSearcher()), context);

// Perform post-processing (e.g., date filter, dedupping, etc.) as a final step.
return new NewsBackgroundLinkingReranker(analyzer, collectionClass).rerank(docs, context);
@@ -775,25 +775,25 @@ public ScoredDocs searchTweets(K qid,
TopDocs rs = new TopDocs(new TotalHits(0, TotalHits.Relation.EQUAL_TO), new ScoreDoc[]{});
if (!isRerank || (args.rerankcutoff > 0 && args.rf_qrels == null) || (args.rf_qrels != null && !hasRelDocs)) {
if (args.arbitraryScoreTieBreak) {// Figure out how to break the scoring ties.
- rs = searcher.search(compositeQuery, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : args.hits);
+ rs = getIndexSearcher().search(compositeQuery, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : args.hits);
} else {
- rs = searcher.search(compositeQuery, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : args.hits,
+ rs = getIndexSearcher().search(compositeQuery, (isRerank && args.rf_qrels == null) ? args.rerankcutoff : args.hits,
BREAK_SCORE_TIES_BY_TWEETID, true);
}
}

- RerankerContext context = new RerankerContext<>(searcher, qid, keywordQuery, null, queryString, queryTokens, filter, args);
+ RerankerContext context = new RerankerContext<>(getIndexSearcher(), qid, keywordQuery, null, queryString, queryTokens, filter, args);
ScoredDocs scoredFbDocs;
if (isRerank && args.rf_qrels != null) {
if (hasRelDocs) {
scoredFbDocs = queryQrels;
} else {//if no relevant documents, only perform score based tie breaking next
- scoredFbDocs = ScoredDocs.fromTopDocs(rs, searcher);
+ scoredFbDocs = ScoredDocs.fromTopDocs(rs, getIndexSearcher());
cascade = new RerankerCascade();
cascade.add(new ScoreTiesAdjusterReranker());
}
} else {
- scoredFbDocs = ScoredDocs.fromTopDocs(rs, searcher);
+ scoredFbDocs = ScoredDocs.fromTopDocs(rs, getIndexSearcher());
}

return cascade.run(scoredFbDocs, context);
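All three search methods above share the same retrieve-then-rerank shape: first-stage Lucene results are wrapped via ScoredDocs.fromTopDocs and pushed through a RerankerCascade. The following condensed sketch mirrors the no-feedback branch; the standalone method shape and the io.anserini.rerank import locations are assumptions, while the individual calls are taken from the code above.

import io.anserini.rerank.RerankerCascade;  // package locations assumed
import io.anserini.rerank.RerankerContext;
import io.anserini.rerank.lib.ScoreTiesAdjusterReranker;

import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.TopDocs;

import java.util.List;

public class CascadeFlowSketch {
  // Illustrative standalone method; SearchCollection inlines this logic per query.
  public static <K extends Comparable<K>> ScoredDocs searchThenRerank(
      IndexSearcher searcher, K qid, Query query, String queryString,
      List<String> queryTokens, SearchCollection.Args args) throws Exception {
    // First-stage retrieval.
    TopDocs rs = searcher.search(query, args.hits);

    // Context carrying the query and configuration to each reranker.
    RerankerContext<K> context =
        new RerankerContext<>(searcher, qid, query, null, queryString, queryTokens, null, args);

    // Minimal cascade, as in the no-feedback branch: just adjust score ties.
    RerankerCascade cascade = new RerankerCascade();
    cascade.add(new ScoreTiesAdjusterReranker());

    return cascade.run(ScoredDocs.fromTopDocs(rs, searcher), context);
  }
}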