Skip to content

Commit

Permalink
implement search after (#3941)
Browse files Browse the repository at this point in the history
* emit sort value for hits on ES api

* impl from/to string for GlobalDocAddress

* initial support for search after

miss error handling and tests, but works

* fix scroll test

it's mocking made it unrepresentative of what happends since recent sorting changes

* add unit test for search after

* add integration test

* document possible leaf_cache optimization on search_after

* pre-compute split_id ordering

* prevent use of mixed scroll+search_after
  • Loading branch information
trinity-1686a authored Oct 19, 2023
1 parent 359a2f2 commit 3bab372
Show file tree
Hide file tree
Showing 14 changed files with 634 additions and 85 deletions.
2 changes: 1 addition & 1 deletion quickwit/quickwit-common/src/binary_heap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ pub trait SortKeyMapper<Value> {
#[derive(Clone)]
pub struct TopK<T, O: Ord, S> {
heap: BinaryHeap<Reverse<OrderItemPair<O, T>>>,
sort_key_mapper: S,
pub sort_key_mapper: S,
k: usize,
}

Expand Down
1 change: 1 addition & 0 deletions quickwit/quickwit-proto/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,7 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
"#[serde(default, skip_serializing_if = \"Option::is_none\")]",
)
.type_attribute(".", "#[derive(Serialize, Deserialize, utoipa::ToSchema)]")
.type_attribute("PartialHit", "#[derive(Eq, Hash)]")
.type_attribute("PartialHit.sort_value", "#[derive(Copy)]")
.type_attribute("SearchRequest", "#[derive(Eq, Hash)]")
.type_attribute("Shard", "#[derive(Eq)]")
Expand Down
5 changes: 5 additions & 0 deletions quickwit/quickwit-proto/protos/quickwit/search.proto
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,11 @@ message SearchRequest {
// that will make it possible to paginate through the results
// in a consistent manner.
optional uint32 scroll_ttl_secs = 15;

// Document with sort tuple smaller or equal to this are discarded to
// enable pagination.
// If split_id is empty, no comparison with _shard_doc should be done
optional PartialHit search_after = 16;
}

message SortField {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,11 @@ pub struct SearchRequest {
/// in a consistent manner.
#[prost(uint32, optional, tag = "15")]
pub scroll_ttl_secs: ::core::option::Option<u32>,
/// Document with sort tuple smaller or equal to this are discarded to
/// enable pagination.
/// If split_id is empty, no comparison with _shard_doc should be done
#[prost(message, optional, tag = "16")]
pub search_after: ::core::option::Option<PartialHit>,
}
#[derive(Serialize, Deserialize, utoipa::ToSchema)]
#[derive(Eq, Hash)]
Expand Down Expand Up @@ -249,6 +254,7 @@ pub struct Hit {
/// - the segment_ord,
/// - the doc id.
#[derive(Serialize, Deserialize, utoipa::ToSchema)]
#[derive(Eq, Hash)]
#[allow(clippy::derive_partial_eq_without_eq)]
#[derive(Clone, PartialEq, ::prost::Message)]
pub struct PartialHit {
Expand Down
115 changes: 109 additions & 6 deletions quickwit/quickwit-proto/src/search/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -52,11 +52,6 @@ impl fmt::Display for SplitSearchError {
}
}

// !!! Disclaimer !!!
//
// Prost imposes the PartialEq derived implementation.
// This is terrible because this means Eq, PartialEq are not really in line with Ord's
// implementation. if in presence of NaN.
impl Eq for SortByValue {}
impl Copy for SortByValue {}
impl From<SortValue> for SortByValue {
Expand All @@ -67,8 +62,61 @@ impl From<SortValue> for SortByValue {
}
}

impl Copy for SortValue {}
impl std::hash::Hash for SortByValue {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
self.sort_value.hash(state);
}
}

impl SortByValue {
pub fn into_json(self) -> serde_json::Value {
use serde_json::Value::*;
match self.sort_value {
Some(SortValue::U64(num)) => Number(num.into()),
Some(SortValue::I64(num)) => Number(num.into()),
Some(SortValue::F64(num)) => {
if let Some(num) = serde_json::Number::from_f64(num) {
Number(num)
} else {
// TODO is there a better way to handle infinite/nan?
Null
}
}
Some(SortValue::Boolean(b)) => Bool(b),
None => Null,
}
}

pub fn try_from_json(value: serde_json::Value) -> Option<Self> {
use serde_json::Value::*;
let sort_value = match value {
Null => None,
Bool(b) => Some(SortValue::Boolean(b)),
Number(number) => {
if let Some(number) = number.as_u64() {
Some(SortValue::U64(number))
} else if let Some(number) = number.as_i64() {
Some(SortValue::I64(number))
} else if let Some(number) = number.as_f64() {
Some(SortValue::F64(number))
} else {
// this should never happen as we don't emit such number ourselves
return None;
}
}
String(_) | Array(_) | Object(_) => return None,
};
Some(SortByValue { sort_value })
}
}

// !!! Disclaimer !!!
//
// Prost imposes the PartialEq derived implementation.
// This is terrible because this means Eq, PartialEq are not really in line with Ord's
// implementation. if in presence of NaN.
impl Eq for SortValue {}
impl Copy for SortValue {}

impl Ord for SortValue {
fn cmp(&self, other: &Self) -> Ordering {
Expand Down Expand Up @@ -118,6 +166,61 @@ impl PartialOrd for SortValue {
}
}

impl std::hash::Hash for SortValue {
fn hash<H: std::hash::Hasher>(&self, state: &mut H) {
let this = self.normalize();
match this {
SortValue::U64(number) => {
0u8.hash(state);
number.hash(state);
}
SortValue::I64(number) => {
1u8.hash(state);
number.hash(state);
}
SortValue::F64(number) => {
2u8.hash(state);
number.to_bits().hash(state);
}
SortValue::Boolean(b) => {
3u8.hash(state);
b.hash(state);
}
}
}
}

impl SortValue {
/// Where multiple variant could represent the same logical value, convert to a canonical form.
///
/// For number, we prefer to represent them, in order, as i64, then as u64 and finaly as f64.
pub fn normalize(&self) -> Self {
match self {
SortValue::I64(_) => *self,
SortValue::Boolean(_) => *self,
SortValue::U64(number) => {
if let Ok(number) = (*number).try_into() {
SortValue::I64(number)
} else {
*self
}
}
SortValue::F64(number) => {
let number = *number;
if number.ceil() == number {
// number is not NaN, and is a natural number
if number >= i64::MIN as f64 && number <= i64::MAX as f64 {
return SortValue::I64(number as i64);
} else if number.is_sign_positive() && number <= u64::MAX as f64 {
return SortValue::U64(number as u64);
}
}
*self
}
}
}
}

impl PartialHit {
/// Helper to get access to the 1st sort value
pub fn sort_value(&self) -> Option<SortValue> {
Expand Down
Loading

0 comments on commit 3bab372

Please sign in to comment.