Skip to content

Commit

Permalink
Use ScoreTracker to avoid wasteful searching for very large k (#384)
Browse files Browse the repository at this point in the history
* clarify

* use scoreTracker to short-circuit new edge evaluation once we hit a local maximum
  • Loading branch information
jbellis authored Jan 9, 2025
1 parent dcb4139 commit 7300b1c
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 15 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -264,11 +264,8 @@ private SearchResult resume(int initialVisited, int topK, int rerankK, float thr
rerankedResults.setMaxSize(topK);

int numVisited = initialVisited;
// A bound that holds the minimum similarity to the query vector that a candidate vector must
// have to be considered -- will be set to the lowest score in the results queue once the queue is full.
var minAcceptedSimilarity = Float.NEGATIVE_INFINITY;
// track scores to predict when we are done with threshold queries
var scoreTracker = threshold > 0 ? new ScoreTracker.TwoPhaseTracker(threshold) : ScoreTracker.NO_OP;
var scoreTracker = threshold > 0 ? new ScoreTracker.TwoPhaseTracker(threshold) : new ScoreTracker.TwoPhaseTracker(1.0);
VectorFloat<?> similarities = null;

// add evicted results from the last call back to the candidates
Expand All @@ -283,37 +280,36 @@ private SearchResult resume(int initialVisited, int topK, int rerankK, float thr
while (candidates.size() > 0) {
// we're done when we have rerankK results and the best candidate is worse than the worst result so far
float topCandidateScore = candidates.topScore();
if (topCandidateScore < minAcceptedSimilarity) {
if (approximateResults.size() >= rerankK && topCandidateScore < approximateResults.topScore()) {
break;
}
// when querying by threshold, also stop when we are probabilistically unlikely to find more qualifying results
if (scoreTracker.shouldStop()) {
if (threshold > 0 && scoreTracker.shouldStop()) {
break;
}

// process the top candidate
int topCandidateNode = candidates.pop();
if (acceptOrds.get(topCandidateNode) && topCandidateScore >= threshold) {
addTopCandidate(topCandidateNode, topCandidateScore, rerankK);

// update minAcceptedSimilarity if we've found K results
if (approximateResults.size() >= rerankK) {
minAcceptedSimilarity = approximateResults.topScore();
}
}

// if this candidate came from evictedResults, we don't need to evaluate its neighbors again
if (previouslyEvicted.get(topCandidateNode)) {
continue;
}

// skip edge loading if we've found a local maximum and the queued candidates can already fill the remaining result slots
if (scoreTracker.shouldStop() && candidates.size() >= rerankK - approximateResults.size()) {
continue;
}

// score the neighbors of the top candidate and add them to the queue
var scoreFunction = scoreProvider.scoreFunction();
var useEdgeLoading = scoreFunction.supportsEdgeLoadingSimilarity();
if (useEdgeLoading) {
similarities = scoreFunction.edgeLoadingSimilarityTo(topCandidateNode);
}

var it = view.getNeighborsIterator(topCandidateNode);
for (int i = 0; i < it.size(); i++) {
var friendOrd = it.nextInt();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ class TwoPhaseTracker implements ScoreTracker {
private int recentEntryIndex;

// Heap of the best scores seen so far
AbstractLongHeap bestScores;
BoundedLongHeap bestScores;

// observation count
private int observationCount;
Expand Down Expand Up @@ -87,8 +87,10 @@ public boolean shouldStop() {
return false;
}

// we're in phase 2 if the 99th percentile of the recent scores is worse than the best score
// (paper suggests median, but experimentally that is too prone to false positives.
// We're in phase 2 if the 99th percentile of the recent scores evaluated is lower
// than the worst of the best scores seen.
//
// (The paper suggests using the median of recent scores, but experimentally that is too prone to false positives.
// The 90th percentile does seem to be enough, but the 99th doesn't result in much extra work, so we'll be conservative.)
double windowMedian = StatUtils.percentile(recentScores, 99);
double worstBest = sortableIntToFloat((int) bestScores.top());
Expand Down

0 comments on commit 7300b1c

Please sign in to comment.