Skip to content

Commit eff7353

Browse files
committed
add API contract verification
1 parent 32b0d86 commit eff7353

File tree

4 files changed

+39
-11
lines changed

4 files changed

+39
-11
lines changed

src/docset.rs

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,8 @@ pub trait DocSet: Send {
4040
/// of `DocSet` should support it.
4141
///
4242
/// Calling `seek(TERMINATED)` is also legal and is the normal way to consume a `DocSet`.
43+
///
44+
/// `target` has to be greater than or equal to `.doc()` when calling `seek`.
4345
fn seek(&mut self, target: DocId) -> DocId {
4446
let mut doc = self.doc();
4547
debug_assert!(doc <= target);
@@ -58,11 +60,19 @@ pub trait DocSet: Send {
5860
///
5961
/// ## API Behaviour
6062
/// If `seek_exact` returns true, a call to `doc()` has to return `target`.
61-
/// If `seek_exact` is returning false, a call to `doc()` may return the previous doc,
62-
/// which may be lower than target.
63+
/// If `seek_exact` is returning false, a call to `doc()` may return any doc and should not be
64+
/// used until `seek_exact` returns true again.
65+
///
66+
/// Consecutive calls are not allowed to have decreasing `target` values.
67+
///
68+
/// # Warning
69+
/// This is an advanced API used by intersection. The API contract is tricky, avoid using it.
6370
fn seek_exact(&mut self, target: DocId) -> bool {
64-
let doc = self.seek(target);
65-
doc == target
71+
let current_doc = self.doc();
72+
if current_doc < target {
73+
self.seek(target);
74+
}
75+
self.doc() == target
6676
}
6777

6878
/// Fills a given mutable buffer with the next doc ids from the
@@ -103,8 +113,11 @@ pub trait DocSet: Send {
103113
/// length of the docset.
104114
fn size_hint(&self) -> u32;
105115

106-
/// Returns a best-effort hint of the
107-
/// cost to drive the docset.
116+
/// Returns a best-effort hint of the cost to consume the entire docset.
117+
///
118+
/// Consuming means calling advance until [`TERMINATED`] is returned.
119+
/// The cost should be relative to the cost of driving a Term query,
120+
/// which would be the number of documents in the DocSet.
108121
///
109122
/// By default this returns `size_hint()`.
110123
///

src/query/intersection.rs

Lines changed: 9 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,13 @@ use crate::{DocId, Score};
1212
/// For better performance, the function uses a
1313
/// specialized implementation if the two
1414
/// shortest scorers are `TermScorer`s.
15-
pub fn intersect_scorers(mut scorers: Vec<Box<dyn Scorer>>, num_docs: u32) -> Box<dyn Scorer> {
15+
///
16+
/// `num_docs_segment` is the number of documents in the segment. It is used for estimating the
17+
/// `size_hint` of the intersection.
18+
pub fn intersect_scorers(
19+
mut scorers: Vec<Box<dyn Scorer>>,
20+
num_docs_segment: u32,
21+
) -> Box<dyn Scorer> {
1622
if scorers.is_empty() {
1723
return Box::new(EmptyScorer);
1824
}
@@ -35,14 +41,14 @@ pub fn intersect_scorers(mut scorers: Vec<Box<dyn Scorer>>, num_docs: u32) -> Bo
3541
left: *(left.downcast::<TermScorer>().map_err(|_| ()).unwrap()),
3642
right: *(right.downcast::<TermScorer>().map_err(|_| ()).unwrap()),
3743
others: scorers,
38-
num_docs,
44+
num_docs: num_docs_segment,
3945
});
4046
}
4147
Box::new(Intersection {
4248
left,
4349
right,
4450
others: scorers,
45-
num_docs,
51+
num_docs: num_docs_segment,
4652
})
4753
}
4854

src/query/range_query/fast_field_range_doc_set.rs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -183,7 +183,14 @@ impl<T: Send + Sync + PartialOrd + Copy + Debug + 'static> DocSet for RangeDocSe
183183
fn cost(&self) -> u64 {
184184
// Advancing the docset is pretty expensive since it scans the whole column, there is no
185185
// index currently (will change with a k-d tree)
186-
// Since we use SIMD to scan the fast field range query we lower the cost a little bit.
186+
// Since we use SIMD to scan the fast field range query we lower the cost a little bit,
187+
// assuming that we hit 10% of the docs like in size_hint.
188+
//
189+
// If we returned a cost higher than num_docs, we would never choose the ff range query as
190+
// the driver in a DocSet, when intersecting a term query with a fast field. But
191+
// it's the faster choice when the term query has a lot of docids and the range
192+
// query does not.
193+
//
187194
// Ideally this would take the fast field codec into account
188195
(self.column.num_docs() as f64 * 0.8) as u64
189196
}

src/query/size_hint.rs

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,9 @@
1010
/// The estimated number of documents in the intersection.
1111
pub fn estimate_intersection<I>(mut docset_sizes: I, max_docs: u32) -> u32
1212
where I: Iterator<Item = u32> {
13-
if max_doc == 0u32 { return 0u32; }
13+
if max_docs == 0u32 {
14+
return 0u32;
15+
}
1416
// Terms tend to be not really randomly distributed.
1517
// This factor is used to adjust the estimate.
1618
let mut co_loc_factor: f64 = 1.3;

0 commit comments

Comments
 (0)