Skip to content

Commit 7235d4d

Browse files
committed
seek_exact + cost based intersection
Adds `seek_exact` and `cost` to `DocSet` for a more efficient intersection. Unlike `seek`, `seek_exact` does not require the DocSet to advance to the next hit, if the target does not exist. `cost` allows to address the different DocSet types and their cost model and is used to determine the DocSet that drives the intersection. E.g. fast field range queries may do a full scan. Phrase queries load the positions to check if a we have a hit. They both have a higher cost than their size_hint would suggest. Improves `size_hint` estimation for intersection and union, by having a estimation based on random distribution with a co-location factor. Refactor range query benchmark. Closes #2531 *Future Work* Implement `seek_exact` for BufferedUnionScorer and RangeDocSet (fast field range queries) Evaluate replacing `seek` with `seek_exact` to reduce code complexity
1 parent c35a782 commit 7235d4d

21 files changed

+748
-522
lines changed

Cargo.toml

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,7 @@ fnv = "1.0.7"
7272
winapi = "0.3.9"
7373

7474
[dev-dependencies]
75-
binggan = "0.14.0"
75+
binggan = "0.14.2"
7676
rand = "0.8.5"
7777
maplit = "1.0.2"
7878
matches = "0.1.9"
@@ -162,3 +162,8 @@ harness = false
162162
[[bench]]
163163
name = "agg_bench"
164164
harness = false
165+
166+
[[bench]]
167+
name = "range_query"
168+
harness = false
169+

benches/range_query.rs

Lines changed: 260 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,260 @@
1+
use std::fmt::Display;
2+
use std::net::Ipv6Addr;
3+
use std::ops::RangeInclusive;
4+
5+
use binggan::plugins::PeakMemAllocPlugin;
6+
use binggan::{black_box, BenchRunner, OutputValue, PeakMemAlloc, INSTRUMENTED_SYSTEM};
7+
use columnar::MonotonicallyMappableToU128;
8+
use rand::rngs::StdRng;
9+
use rand::{Rng, SeedableRng};
10+
use tantivy::collector::{Count, TopDocs};
11+
use tantivy::query::QueryParser;
12+
use tantivy::schema::*;
13+
use tantivy::{doc, Index};
14+
15+
#[global_allocator]
16+
pub static GLOBAL: &PeakMemAlloc<std::alloc::System> = &INSTRUMENTED_SYSTEM;
17+
18+
fn main() {
19+
bench_range_query();
20+
}
21+
22+
fn bench_range_query() {
23+
let index = get_index_0_to_100();
24+
let mut runner = BenchRunner::new();
25+
runner.add_plugin(PeakMemAllocPlugin::new(GLOBAL));
26+
27+
runner.set_name("range_query on u64");
28+
let field_name_and_descr: Vec<_> = vec![
29+
("id", "Single Valued Range Field"),
30+
("ids", "Multi Valued Range Field"),
31+
];
32+
let range_num_hits = vec![
33+
("90_percent", get_90_percent()),
34+
("10_percent", get_10_percent()),
35+
("1_percent", get_1_percent()),
36+
];
37+
38+
test_range(&mut runner, &index, &field_name_and_descr, range_num_hits);
39+
40+
runner.set_name("range_query on ip");
41+
let field_name_and_descr: Vec<_> = vec![
42+
("ip", "Single Valued Range Field"),
43+
("ips", "Multi Valued Range Field"),
44+
];
45+
let range_num_hits = vec![
46+
("90_percent", get_90_percent_ip()),
47+
("10_percent", get_10_percent_ip()),
48+
("1_percent", get_1_percent_ip()),
49+
];
50+
51+
test_range(&mut runner, &index, &field_name_and_descr, range_num_hits);
52+
}
53+
54+
fn test_range<T: Display>(
55+
runner: &mut BenchRunner,
56+
index: &Index,
57+
field_name_and_descr: &[(&str, &str)],
58+
range_num_hits: Vec<(&str, RangeInclusive<T>)>,
59+
) {
60+
for (field, suffix) in field_name_and_descr {
61+
let term_num_hits = vec![
62+
("", ""),
63+
("1_percent", "veryfew"),
64+
("10_percent", "few"),
65+
("90_percent", "most"),
66+
];
67+
let mut group = runner.new_group();
68+
group.set_name(suffix);
69+
// all intersect combinations
70+
for (range_name, range) in &range_num_hits {
71+
for (term_name, term) in &term_num_hits {
72+
let index = &index;
73+
let test_name = if term_name.is_empty() {
74+
format!("id_range_hit_{}", range_name)
75+
} else {
76+
format!(
77+
"id_range_hit_{}_intersect_with_term_{}",
78+
range_name, term_name
79+
)
80+
};
81+
group.register(test_name, move |_| {
82+
let query = if term_name.is_empty() {
83+
"".to_string()
84+
} else {
85+
format!("AND id_name:{}", term)
86+
};
87+
black_box(execute_query(field, range, &query, index));
88+
});
89+
}
90+
}
91+
group.run();
92+
}
93+
}
94+
95+
fn get_index_0_to_100() -> Index {
96+
let mut rng = StdRng::from_seed([1u8; 32]);
97+
let num_vals = 100_000;
98+
let docs: Vec<_> = (0..num_vals)
99+
.map(|_i| {
100+
let id_name = if rng.gen_bool(0.01) {
101+
"veryfew".to_string() // 1%
102+
} else if rng.gen_bool(0.1) {
103+
"few".to_string() // 9%
104+
} else {
105+
"most".to_string() // 90%
106+
};
107+
Doc {
108+
id_name,
109+
id: rng.gen_range(0..100),
110+
// Multiply by 1000, so that we create most buckets in the compact space
111+
// The benches depend on this range to select n-percent of elements with the
112+
// methods below.
113+
ip: Ipv6Addr::from_u128(rng.gen_range(0..100) * 1000),
114+
}
115+
})
116+
.collect();
117+
118+
create_index_from_docs(&docs)
119+
}
120+
121+
#[derive(Clone, Debug)]
122+
pub struct Doc {
123+
pub id_name: String,
124+
pub id: u64,
125+
pub ip: Ipv6Addr,
126+
}
127+
128+
pub fn create_index_from_docs(docs: &[Doc]) -> Index {
129+
let mut schema_builder = Schema::builder();
130+
let id_u64_field = schema_builder.add_u64_field("id", INDEXED | STORED | FAST);
131+
let ids_u64_field =
132+
schema_builder.add_u64_field("ids", NumericOptions::default().set_fast().set_indexed());
133+
134+
let id_f64_field = schema_builder.add_f64_field("id_f64", INDEXED | STORED | FAST);
135+
let ids_f64_field = schema_builder.add_f64_field(
136+
"ids_f64",
137+
NumericOptions::default().set_fast().set_indexed(),
138+
);
139+
140+
let id_i64_field = schema_builder.add_i64_field("id_i64", INDEXED | STORED | FAST);
141+
let ids_i64_field = schema_builder.add_i64_field(
142+
"ids_i64",
143+
NumericOptions::default().set_fast().set_indexed(),
144+
);
145+
146+
let text_field = schema_builder.add_text_field("id_name", STRING | STORED);
147+
let text_field2 = schema_builder.add_text_field("id_name_fast", STRING | STORED | FAST);
148+
149+
let ip_field = schema_builder.add_ip_addr_field("ip", FAST);
150+
let ips_field = schema_builder.add_ip_addr_field("ips", FAST);
151+
152+
let schema = schema_builder.build();
153+
154+
let index = Index::create_in_ram(schema);
155+
156+
{
157+
let mut index_writer = index.writer_with_num_threads(1, 50_000_000).unwrap();
158+
for doc in docs.iter() {
159+
index_writer
160+
.add_document(doc!(
161+
ids_i64_field => doc.id as i64,
162+
ids_i64_field => doc.id as i64,
163+
ids_f64_field => doc.id as f64,
164+
ids_f64_field => doc.id as f64,
165+
ids_u64_field => doc.id,
166+
ids_u64_field => doc.id,
167+
id_u64_field => doc.id,
168+
id_f64_field => doc.id as f64,
169+
id_i64_field => doc.id as i64,
170+
text_field => doc.id_name.to_string(),
171+
text_field2 => doc.id_name.to_string(),
172+
ips_field => doc.ip,
173+
ips_field => doc.ip,
174+
ip_field => doc.ip,
175+
))
176+
.unwrap();
177+
}
178+
179+
index_writer.commit().unwrap();
180+
}
181+
index
182+
}
183+
184+
fn get_90_percent() -> RangeInclusive<u64> {
185+
0..=90
186+
}
187+
188+
fn get_10_percent() -> RangeInclusive<u64> {
189+
0..=10
190+
}
191+
192+
fn get_1_percent() -> RangeInclusive<u64> {
193+
10..=10
194+
}
195+
196+
fn get_90_percent_ip() -> RangeInclusive<Ipv6Addr> {
197+
let start = Ipv6Addr::from_u128(0);
198+
let end = Ipv6Addr::from_u128(90 * 1000);
199+
start..=end
200+
}
201+
202+
fn get_10_percent_ip() -> RangeInclusive<Ipv6Addr> {
203+
let start = Ipv6Addr::from_u128(0);
204+
let end = Ipv6Addr::from_u128(10 * 1000);
205+
start..=end
206+
}
207+
208+
fn get_1_percent_ip() -> RangeInclusive<Ipv6Addr> {
209+
let start = Ipv6Addr::from_u128(10 * 1000);
210+
let end = Ipv6Addr::from_u128(10 * 1000);
211+
start..=end
212+
}
213+
214+
struct NumHits {
215+
count: usize,
216+
}
217+
impl OutputValue for NumHits {
218+
fn column_title() -> &'static str {
219+
"NumHits"
220+
}
221+
fn format(&self) -> Option<String> {
222+
Some(self.count.to_string())
223+
}
224+
}
225+
226+
fn execute_query<T: Display>(
227+
field: &str,
228+
id_range: &RangeInclusive<T>,
229+
suffix: &str,
230+
index: &Index,
231+
) -> NumHits {
232+
let gen_query_inclusive = |from: &T, to: &T| {
233+
format!(
234+
"{}:[{} TO {}] {}",
235+
field,
236+
&from.to_string(),
237+
&to.to_string(),
238+
suffix
239+
)
240+
};
241+
242+
let query = gen_query_inclusive(id_range.start(), id_range.end());
243+
execute_query_(&query, index)
244+
}
245+
246+
fn execute_query_(query: &str, index: &Index) -> NumHits {
247+
let query_from_text = |text: &str| {
248+
QueryParser::for_index(index, vec![])
249+
.parse_query(text)
250+
.unwrap()
251+
};
252+
let query = query_from_text(query);
253+
let reader = index.reader().unwrap();
254+
let searcher = reader.searcher();
255+
let num_hits = searcher
256+
.search(&query, &(TopDocs::with_limit(10), Count))
257+
.unwrap()
258+
.1;
259+
NumHits { count: num_hits }
260+
}

src/docset.rs

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,25 @@ pub trait DocSet: Send {
4949
doc
5050
}
5151

52+
/// Seeks to the target if possible and returns true if the target is in the DocSet.
53+
///
54+
/// Implementations may choose to advance past the target if target does not exist.
55+
///
56+
/// DocSets that already have an efficient `seek` method don't need to implement `seek_exact`.
57+
/// All wapper DocSets should forward `seek_exact` to the underlying DocSet.
58+
///
59+
/// ## API Behaviour
60+
/// If `seek_exact` is returning true, a call to `doc()` has to return target.
61+
/// If `seek_exact` is returning false, a call to `doc()` may return the previous doc,
62+
/// which may be lower than target.
63+
fn seek_exact(&mut self, target: DocId) -> bool {
64+
let current_doc = self.doc();
65+
if current_doc < target {
66+
self.seek(target);
67+
}
68+
self.doc() == target
69+
}
70+
5271
/// Fills a given mutable buffer with the next doc ids from the
5372
/// `DocSet`
5473
///
@@ -87,6 +106,23 @@ pub trait DocSet: Send {
87106
/// length of the docset.
88107
fn size_hint(&self) -> u32;
89108

109+
/// Returns a best-effort hint of the
110+
/// cost to drive the docset.
111+
///
112+
/// By default this returns `size_hint()`.
113+
///
114+
/// DocSets may have vastly different cost depending on their type,
115+
/// e.g. an intersection with 10 hits is much cheaper than
116+
/// a phrase search with 10 hits, since it needs to load positions.
117+
///
118+
/// ### Future Work
119+
/// We may want to differentiate `DocSet` costs more more granular, e.g.
120+
/// creation_cost, advance_cost, seek_cost on to get a good estimation
121+
/// what query types to choose.
122+
fn cost(&self) -> u64 {
123+
self.size_hint() as u64
124+
}
125+
90126
/// Returns the number documents matching.
91127
/// Calling this method consumes the `DocSet`.
92128
fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 {
@@ -126,6 +162,10 @@ impl DocSet for &mut dyn DocSet {
126162
(**self).seek(target)
127163
}
128164

165+
fn seek_exact(&mut self, target: DocId) -> bool {
166+
(**self).seek_exact(target)
167+
}
168+
129169
fn doc(&self) -> u32 {
130170
(**self).doc()
131171
}
@@ -134,6 +174,10 @@ impl DocSet for &mut dyn DocSet {
134174
(**self).size_hint()
135175
}
136176

177+
fn cost(&self) -> u64 {
178+
(**self).cost()
179+
}
180+
137181
fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 {
138182
(**self).count(alive_bitset)
139183
}
@@ -154,6 +198,11 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
154198
unboxed.seek(target)
155199
}
156200

201+
fn seek_exact(&mut self, target: DocId) -> bool {
202+
let unboxed: &mut TDocSet = self.borrow_mut();
203+
unboxed.seek_exact(target)
204+
}
205+
157206
fn fill_buffer(&mut self, buffer: &mut [DocId; COLLECT_BLOCK_BUFFER_LEN]) -> usize {
158207
let unboxed: &mut TDocSet = self.borrow_mut();
159208
unboxed.fill_buffer(buffer)
@@ -169,6 +218,11 @@ impl<TDocSet: DocSet + ?Sized> DocSet for Box<TDocSet> {
169218
unboxed.size_hint()
170219
}
171220

221+
fn cost(&self) -> u64 {
222+
let unboxed: &TDocSet = self.borrow();
223+
unboxed.cost()
224+
}
225+
172226
fn count(&mut self, alive_bitset: &AliveBitSet) -> u32 {
173227
let unboxed: &mut TDocSet = self.borrow_mut();
174228
unboxed.count(alive_bitset)

src/postings/mod.rs

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -667,12 +667,15 @@ mod bench {
667667
.read_postings(&TERM_D, IndexRecordOption::Basic)
668668
.unwrap()
669669
.unwrap();
670-
let mut intersection = Intersection::new(vec![
671-
segment_postings_a,
672-
segment_postings_b,
673-
segment_postings_c,
674-
segment_postings_d,
675-
]);
670+
let mut intersection = Intersection::new(
671+
vec![
672+
segment_postings_a,
673+
segment_postings_b,
674+
segment_postings_c,
675+
segment_postings_d,
676+
],
677+
reader.searcher().num_docs() as u32,
678+
);
676679
while intersection.advance() != TERMINATED {}
677680
});
678681
}

0 commit comments

Comments
 (0)