From df4497d9012da2c124b891cfb2706d797020599b Mon Sep 17 00:00:00 2001 From: Chris Tam Date: Tue, 15 Aug 2023 08:56:57 -0400 Subject: [PATCH] Support lenient parser --- Cargo.lock | 145 +++---- Cargo.toml | 3 +- src/index.rs | 62 +++ src/lib.rs | 53 ++- src/parser_error.rs | 972 ++++++++++++++++++++++++++++++++++++++++++ src/searcher.rs | 2 +- tests/tantivy_test.py | 29 +- 7 files changed, 1172 insertions(+), 94 deletions(-) create mode 100644 src/parser_error.rs diff --git a/Cargo.lock b/Cargo.lock index 7374aefc6..1d2f738a7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -22,6 +22,12 @@ dependencies = [ "memchr", ] +[[package]] +name = "allocator-api2" +version = "0.2.16" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5" + [[package]] name = "android_system_properties" version = "0.1.5" @@ -139,15 +145,6 @@ dependencies = [ "unicode-width", ] -[[package]] -name = "combine" -version = "4.6.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "35ed6e9d84f0b51a7f52daf1c7d71dd136fd7a3f41a8462b8cdb8c78d920fad4" -dependencies = [ - "memchr", -] - [[package]] name = "core-foundation-sys" version = "0.8.3" @@ -289,17 +286,6 @@ dependencies = [ "libc", ] -[[package]] -name = "fail" -version = "0.5.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fe5e43d0f78a42ad591453aedb1d7ae631ce7ee445c7643691055a9ed8d3b01c" -dependencies = [ - "log", - "once_cell", - "rand", -] - [[package]] name = "fastdivide" version = "0.4.0" @@ -446,11 +432,12 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.13.2" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e" +checksum = "2c6201b9ff9fd90a5a3bac2e56a830d0caa509576f0e503818ee82c181b3437a" dependencies = [ "ahash", + "allocator-api2", ] [[package]] @@ -519,6 +506,15 @@ 
dependencies = [ "either", ] +[[package]] +name = "itertools" +version = "0.11.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b1c173a5686ce8bfa551b3563d0c2170bf24ca44da99c7ca4bfdab5418c3fe57" +dependencies = [ + "either", +] + [[package]] name = "itoa" version = "1.0.5" @@ -611,18 +607,18 @@ dependencies = [ [[package]] name = "lru" -version = "0.10.1" +version = "0.11.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "718e8fae447df0c7e1ba7f5189829e63fd536945c8988d61444c19039f16b670" +checksum = "eedb2bdbad7e0634f83989bf596f497b070130daaa398ab22d84c39e266deec5" dependencies = [ "hashbrown", ] [[package]] name = "lz4_flex" -version = "0.10.0" +version = "0.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8b8c72594ac26bfd34f2d99dfced2edfaddfe8a476e3ff2ca0eb293d925c4f83" +checksum = "3ea9b256699eda7b0387ffbc776dd625e28bde3918446381781245b7a50349d8" [[package]] name = "matchers" @@ -651,9 +647,9 @@ checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" [[package]] name = "memmap2" -version = "0.6.2" +version = "0.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6d28bba84adfe6646737845bc5ebbfa2c08424eb1c37e94a1fd2a82adb56a872" +checksum = "f49388d20533534cd19360ad3d6a7dadc885944aa802ba3995040c5ec11288c6" dependencies = [ "libc", ] @@ -676,12 +672,28 @@ dependencies = [ "autocfg", ] +[[package]] +name = "minimal-lexical" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a" + [[package]] name = "murmurhash32" version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d9380db4c04d219ac5c51d14996bbf2c2e9a15229771b53f8671eb6c83cf44df" +[[package]] +name = "nom" +version = "7.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"d273983c5a657a70a3e8f2a01329822f3b8c8172b73826411a55751e404a0a4a" +dependencies = [ + "memchr", + "minimal-lexical", +] + [[package]] name = "nu-ansi-term" version = "0.46.0" @@ -745,8 +757,7 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" [[package]] name = "ownedbytes" version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c718e498b20704d5fb5d51d07f414a22f61c19254c1708e117b93fd76860739c" +source = "git+https://github.com/quickwit-oss/tantivy.git#47b315ff18e1569c214e86796abf8e8cb462834d" dependencies = [ "stable_deref_trait", ] @@ -792,12 +803,6 @@ version = "0.3.27" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "26072860ba924cbfa98ea39c8c19b4dd6a4a25423dbdf219c1eca91aa0cf6964" -[[package]] -name = "ppv-lite86" -version = "0.2.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" - [[package]] name = "proc-macro2" version = "1.0.51" @@ -877,36 +882,6 @@ dependencies = [ "proc-macro2", ] -[[package]] -name = "rand" -version = "0.8.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "34af8d1a0e25924bc5b7c43c079c942339d8f0a8b57c39049bef581b46327404" -dependencies = [ - "libc", - "rand_chacha", - "rand_core", -] - -[[package]] -name = "rand_chacha" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e6c10a63a0fa32252be49d21e7709d4d4baf8d231c2dbce1eaa8141b9b127d88" -dependencies = [ - "ppv-lite86", - "rand_core", -] - -[[package]] -name = "rand_core" -version = "0.6.4" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ec0be4795e2f6a28069bec0b5ff3e2ac9bafc99e6a9a7dc3547996c5c816922c" -dependencies = [ - "getrandom", -] - [[package]] name = "rayon" version = "1.6.1" @@ -1115,9 +1090,10 @@ dependencies = [ name = "tantivy" version = "0.20.1" dependencies = [ + "base64", 
"chrono", "futures", - "itertools", + "itertools 0.10.5", "pyo3", "pyo3-build-config", "serde_json", @@ -1127,8 +1103,7 @@ dependencies = [ [[package]] name = "tantivy" version = "0.20.2" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "aec540e9cebc88f523f67f596dee213e491f0c55961de013566f267a0c31f5e9" +source = "git+https://github.com/quickwit-oss/tantivy.git#47b315ff18e1569c214e86796abf8e8cb462834d" dependencies = [ "aho-corasick", "arc-swap", @@ -1140,11 +1115,10 @@ dependencies = [ "crc32fast", "crossbeam-channel", "downcast-rs", - "fail", "fastdivide", "fs4", "htmlescape", - "itertools", + "itertools 0.11.0", "levenshtein_automata", "log", "lru", @@ -1180,8 +1154,7 @@ dependencies = [ [[package]] name = "tantivy-bitpacker" version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16099e96f0ede682084469b80d6909dc170aa2b11d2a45538b5b36b2a90090b9" +source = "git+https://github.com/quickwit-oss/tantivy.git#47b315ff18e1569c214e86796abf8e8cb462834d" dependencies = [ "bitpacking", ] @@ -1189,12 +1162,11 @@ dependencies = [ [[package]] name = "tantivy-columnar" version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "56e32b024b26eab93eb8648faf08004356bf9d47376557ee4409f4b210163656" +source = "git+https://github.com/quickwit-oss/tantivy.git#47b315ff18e1569c214e86796abf8e8cb462834d" dependencies = [ "fastdivide", "fnv", - "itertools", + "itertools 0.11.0", "serde", "tantivy-bitpacker", "tantivy-common", @@ -1205,8 +1177,7 @@ dependencies = [ [[package]] name = "tantivy-common" version = "0.5.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e7d12fdd6ec0f7e0962f129c03c696a85ec567734950cbb2b89af4a293ce342f" +source = "git+https://github.com/quickwit-oss/tantivy.git#47b315ff18e1569c214e86796abf8e8cb462834d" dependencies = [ "async-trait", "byteorder", @@ -1229,19 +1200,15 @@ dependencies = [ [[package]] name = "tantivy-query-grammar" 
version = "0.20.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "106d8f78ad1da4f0fdd526a0760c326c0573510d4dedabeb1962d35a35879797" +source = "git+https://github.com/quickwit-oss/tantivy.git#47b315ff18e1569c214e86796abf8e8cb462834d" dependencies = [ - "combine", - "once_cell", - "regex", + "nom", ] [[package]] name = "tantivy-sstable" version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eda34243d3ee64bd8f9ba74a3b0d05f4d07beff7767a727212e9b5a19c13dde7" +source = "git+https://github.com/quickwit-oss/tantivy.git#47b315ff18e1569c214e86796abf8e8cb462834d" dependencies = [ "tantivy-common", "tantivy-fst", @@ -1251,8 +1218,7 @@ dependencies = [ [[package]] name = "tantivy-stacker" version = "0.1.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "67b9e9470301b026ad3b95f79a791a2a3ee81f3ab16fbe412a9dd81ff834acf5" +source = "git+https://github.com/quickwit-oss/tantivy.git#47b315ff18e1569c214e86796abf8e8cb462834d" dependencies = [ "murmurhash32", "tantivy-common", @@ -1260,9 +1226,8 @@ dependencies = [ [[package]] name = "tantivy-tokenizer-api" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64186801b6e06b3a1c4275e23b517835ff4ecbb707318b838dc9de457c062200" +version = "0.1.0" +source = "git+https://github.com/quickwit-oss/tantivy.git#47b315ff18e1569c214e86796abf8e8cb462834d" dependencies = [ "serde", ] diff --git a/Cargo.toml b/Cargo.toml index a46cc1885..3e6fcce41 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -14,8 +14,9 @@ crate-type = ["cdylib"] pyo3-build-config = "0.19.1" [dependencies] +base64 = "0.21" chrono = "0.4.23" -tantivy = "0.20.1" +tantivy = { git = "https://github.com/quickwit-oss/tantivy.git" } itertools = "0.10.5" futures = "0.3.26" serde_json = "1.0.91" diff --git a/src/index.rs b/src/index.rs index cac739e07..1db5b2802 100644 --- a/src/index.rs +++ b/src/index.rs @@ -5,6 +5,7 @@ use pyo3::{exceptions, 
prelude::*, types::PyAny}; use crate::{ document::{extract_value, Document}, get_field, + parser_error::QueryParserErrorIntoPy, query::Query, schema::Schema, searcher::Searcher, @@ -394,6 +395,67 @@ impl Index { Ok(Query { inner: query }) } + + /// Parse a query leniently. + /// + /// This variant parses an invalid query on a best-effort basis. If some parts of the query can't + /// reasonably be executed (range query without field, searching on a non existing field, + /// searching without specifying a field when no default field is provided...), they may get turned + /// into a "match-nothing" subquery. + /// + /// Args: + /// query: the query, following the tantivy query language. + /// default_field_names (List[str]): A list of field names used to search if no + /// field is specified in the query. + /// + /// Returns a tuple containing the parsed query and a list of errors. + /// + /// Raises ValueError if a field in `default_field_names` is not defined or marked as indexed. + #[pyo3(signature = (query, default_field_names = None))] + pub fn parse_query_lenient( + &self, + query: &str, + default_field_names: Option>, + ) -> PyResult<(Query, Vec)> { + let mut default_fields = vec![]; + let schema = self.index.schema(); + + if let Some(default_field_names_vec) = default_field_names { + for default_field_name in &default_field_names_vec { + if let Ok(field) = schema.get_field(default_field_name) { + let field_entry = schema.get_field_entry(field); + if !field_entry.is_indexed() { + return Err(exceptions::PyValueError::new_err( + format!( + "Field `{default_field_name}` is not set as indexed in the schema." + ), + )); + } + default_fields.push(field); + } else { + return Err(exceptions::PyValueError::new_err(format!( + "Field `{default_field_name}` is not defined in the schema."
+ ))); + } + } + } else { + for (field, field_entry) in self.index.schema().fields() { + if field_entry.is_indexed() { + default_fields.push(field); + } + } + } + let parser = + tv::query::QueryParser::for_index(&self.index, default_fields); + let (query, errors) = parser.parse_query_lenient(query); + + Python::with_gil(|py| { + let errors = + errors.into_iter().map(|err| err.into_py(py)).collect(); + + Ok((Query { inner: query }, errors)) + }) + } } impl Index { diff --git a/src/lib.rs b/src/lib.rs index 7fe6c2af2..c5cb649ff 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,9 +1,10 @@ use ::tantivy as tv; -use pyo3::{exceptions, prelude::*}; +use pyo3::{exceptions, prelude::*, wrap_pymodule}; mod document; mod facet; mod index; +mod parser_error; mod query; mod schema; mod schemabuilder; @@ -75,6 +76,56 @@ fn tantivy(_py: Python, m: &PyModule) -> PyResult<()> { m.add_class::()?; m.add_class::()?; m.add_class::()?; + + m.add_wrapped(wrap_pymodule!(query_parser_error))?; + + Ok(()) +} + +/// Submodule containing all the possible errors that can be raised during +/// query parsing. 
+/// +/// Example: +/// >>> import tantivy +/// >>> from tantivy import query_parser_error +/// +/// >>> builder = tantivy.SchemaBuilder() +/// +/// >>> title = builder.add_text_field("title", stored=True) +/// >>> body = builder.add_text_field("body") +/// >>> id = builder.add_unsigned_field("id") +/// >>> rating = builder.add_float_field("rating") +/// +/// >>> schema = builder.build() +/// >>> index = tantivy.Index(schema) +/// +/// >>> query, errors = index.parse_query_lenient( +/// "bod:'world' AND id:<3.5 AND rating:5.0" +/// ) +/// +/// >>> assert len(errors) == 2 +/// >>> assert isinstance(errors[0], query_parser_error.FieldDoesNotExistError) +/// >>> assert isinstance(errors[1], query_parser_error.ExpectedIntError) +#[pymodule] +fn query_parser_error(_py: Python, m: &PyModule) -> PyResult<()> { + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + m.add_class::()?; + Ok(()) } diff --git a/src/parser_error.rs b/src/parser_error.rs new file mode 100644 index 000000000..b971d610a --- /dev/null +++ b/src/parser_error.rs @@ -0,0 +1,972 @@ +use std::{ + convert::TryFrom, + net::AddrParseError, + num::{IntErrorKind, ParseFloatError, ParseIntError}, + str::ParseBoolError, +}; + +use pyo3::prelude::*; +use tantivy::{self as tv, schema::FacetParseError}; + +// TODO(https://github.com/PyO3/pyo3/issues/1190): Expose this to bindings once trait <-> ABC is +// supported in PyO3. +pub(crate) trait QueryParserError { + fn full_message(&self) -> String; +} + +/// A crate local version of the [`IntoPy`] trait to implement for +/// [`QueryParserError`](tv::query::QueryParserError). 
+pub(crate) trait QueryParserErrorIntoPy { + fn into_py(self, py: Python) -> PyObject; +} + +impl QueryParserErrorIntoPy for tv::query::QueryParserError { + fn into_py(self, py: Python) -> PyObject { + match self { + tv::query::QueryParserError::SyntaxError(message) => { + SyntaxError { message }.into_py(py) + } + tv::query::QueryParserError::UnsupportedQuery(message) => { + UnsupportedQueryError { message }.into_py(py) + } + tv::query::QueryParserError::FieldDoesNotExist(field) => { + FieldDoesNotExistError { field }.into_py(py) + } + tv::query::QueryParserError::FieldDoesNotHavePositionsIndexed( + field, + ) => FieldDoesNotHavePositionsIndexedError { field }.into_py(py), + tv::query::QueryParserError::ExpectedInt(parse_int_error) => { + ExpectedIntError { parse_int_error }.into_py(py) + } + tv::query::QueryParserError::ExpectedFloat(parse_float_error) => { + ExpectedFloatError { parse_float_error }.into_py(py) + } + tv::query::QueryParserError::ExpectedBool(parse_bool_error) => { + ExpectedBoolError { parse_bool_error }.into_py(py) + } + tv::query::QueryParserError::ExpectedBase64(decode_error) => { + ExpectedBase64Error { decode_error }.into_py(py) + } + tv::query::QueryParserError::AllButQueryForbidden => { + AllButQueryForbiddenError.into_py(py) + } + tv::query::QueryParserError::NoDefaultFieldDeclared => { + NoDefaultFieldDeclaredError.into_py(py) + } + tv::query::QueryParserError::FieldNotIndexed(field) => { + FieldNotIndexedError { field }.into_py(py) + } + tv::query::QueryParserError::PhrasePrefixRequiresAtLeastTwoTerms { + phrase, + tokenizer, + } => { + PhrasePrefixRequiresAtLeastTwoTermsError { phrase, tokenizer }.into_py(py) + } + tv::query::QueryParserError::UnknownTokenizer { tokenizer, field } => { + UnknownTokenizerError { tokenizer, field }.into_py(py) + } + tv::query::QueryParserError::RangeMustNotHavePhrase => { + RangeMustNotHavePhraseError.into_py(py) + } + tv::query::QueryParserError::DateFormatError(_) => { + DateFormatError { inner: self 
}.into_py(py) + } + tv::query::QueryParserError::FacetFormatError(facet_parse_error) => { + FacetFormatError { facet_parse_error }.into_py(py) + } + tv::query::QueryParserError::IpFormatError(addr_parse_error) => { + IpFormatError { addr_parse_error }.into_py(py) + } + } + } +} + +/// Error in the query syntax. +#[pyclass(frozen)] +pub(crate) struct SyntaxError { + message: String, +} + +#[pymethods] +impl SyntaxError { + #[getter] + fn inner_message(&self) -> &str { + self.message.as_str() + } + + fn __repr__(&self) -> String { + self.full_message() + } + + fn __str__(&self) -> String { + self.full_message() + } +} + +impl QueryParserError for SyntaxError { + fn full_message(&self) -> String { + format!("Syntax Error: {0}", self.message) + } +} + +impl From for tv::query::QueryParserError { + fn from(error: SyntaxError) -> Self { + tv::query::QueryParserError::SyntaxError(error.message) + } +} + +impl TryFrom for SyntaxError { + type Error = String; + + fn try_from( + error: tv::query::QueryParserError, + ) -> Result { + match error { + tv::query::QueryParserError::SyntaxError(message) => { + Ok(Self { message }) + } + _ => Err(format!("{error} is not a SyntaxError")), + } + } +} + +/// This query is unsupported. 
+#[pyclass(frozen)] +pub(crate) struct UnsupportedQueryError { + message: String, +} + +#[pymethods] +impl UnsupportedQueryError { + #[getter] + fn inner_message(&self) -> &str { + self.message.as_str() + } + + fn __repr__(&self) -> String { + self.full_message() + } + + fn __str__(&self) -> String { + self.full_message() + } +} + +impl QueryParserError for UnsupportedQueryError { + fn full_message(&self) -> String { + format!("Unsupported query: {0}", self.message) + } +} + +impl From for tv::query::QueryParserError { + fn from(error: UnsupportedQueryError) -> Self { + tv::query::QueryParserError::UnsupportedQuery(error.message) + } +} + +impl TryFrom for UnsupportedQueryError { + type Error = String; + + fn try_from( + error: tv::query::QueryParserError, + ) -> Result { + match error { + tv::query::QueryParserError::UnsupportedQuery(message) => { + Ok(Self { message }) + } + _ => Err(format!("{error} is not an UnsupportedQuery error")), + } + } +} + +/// The query references a field that is not in the schema. +#[pyclass(frozen)] +pub struct FieldDoesNotExistError { + field: String, +} + +#[pymethods] +impl FieldDoesNotExistError { + /// The name of the field causing the error.
+ #[getter] + fn field(&self) -> &str { + self.field.as_str() + } + + fn __repr__(&self) -> String { + self.full_message() + } + + fn __str__(&self) -> String { + self.full_message() + } +} + +impl QueryParserError for FieldDoesNotExistError { + fn full_message(&self) -> String { + format!("Field does not exist: '{0}'", self.field) + } +} + +impl From for tv::query::QueryParserError { + fn from(error: FieldDoesNotExistError) -> Self { + tv::query::QueryParserError::FieldDoesNotExist(error.field) + } +} + +impl TryFrom for FieldDoesNotExistError { + type Error = String; + + fn try_from( + error: tv::query::QueryParserError, + ) -> Result { + match error { + tv::query::QueryParserError::FieldDoesNotExist(field) => { + Ok(Self { field }) + } + _ => Err(format!("{error} is not a FieldDoesNotExist error")), + } + } +} + +/// The query contains a term for a `u64` or `i64`-field, but the value is neither. +#[pyclass(frozen)] +pub(crate) struct ExpectedIntError { + parse_int_error: ParseIntError, +} + +#[pymethods] +impl ExpectedIntError { + /// If `true`, the value being parsed was empty. + fn caused_by_empty(&self) -> bool { + self.parse_int_error.kind() == &IntErrorKind::Empty + } + + /// If `true`, an invalid digit was found. + fn caused_by_invalid_digit(&self) -> bool { + self.parse_int_error.kind() == &IntErrorKind::InvalidDigit + } + + /// If `true`, the value being parsed was too large. + fn caused_by_pos_overflow(&self) -> bool { + self.parse_int_error.kind() == &IntErrorKind::PosOverflow + } + + /// If `true`, the value being parsed was too small. 
+ fn caused_by_neg_overflow(&self) -> bool { + self.parse_int_error.kind() == &IntErrorKind::NegOverflow + } + + fn __repr__(&self) -> String { + self.full_message() + } + + fn __str__(&self) -> String { + self.full_message() + } +} + +impl QueryParserError for ExpectedIntError { + fn full_message(&self) -> String { + format!("Expected a valid integer: '{0:?}'", self.parse_int_error) + } +} + +impl From for tv::query::QueryParserError { + fn from(error: ExpectedIntError) -> Self { + tv::query::QueryParserError::ExpectedInt(error.parse_int_error) + } +} + +impl TryFrom for ExpectedIntError { + type Error = String; + + fn try_from( + error: tv::query::QueryParserError, + ) -> Result { + match error { + tv::query::QueryParserError::ExpectedInt(parse_int_error) => { + Ok(Self { parse_int_error }) + } + _ => Err(format!("{error} is not an ExpectedInt error")), + } + } +} + +/// The query contains a term that was expected to be base64-encoded, but decoding it failed. +#[pyclass(frozen)] +pub(crate) struct ExpectedBase64Error { + decode_error: base64::DecodeError, +} + +#[pymethods] +impl ExpectedBase64Error { + /// If `true`, an invalid byte was found in the query. Padding characters (`=`) interspersed in + /// the encoded form will be treated as invalid bytes. + fn caused_by_invalid_byte(&self) -> bool { + match self.decode_error { + base64::DecodeError::InvalidByte { .. } => true, + _ => false, + } + } + + /// If the error was caused by an invalid byte, returns the offset and offending byte. + fn invalid_byte_info(&self) -> Option<(usize, u8)> { + match self.decode_error { + base64::DecodeError::InvalidByte(position, byte) => { + Some((position, byte)) + } + _ => None, + } + } + + /// If `true`, the length of the base64 string was invalid. + fn caused_by_invalid_length(&self) -> bool { + match self.decode_error { + base64::DecodeError::InvalidLength => true, + _ => false, + } + } + + /// The last non-padding input symbol's encoded 6 bits have nonzero bits that will be discarded.
+ /// If `true`, this is indicative of corrupted or truncated Base64. + fn caused_by_invalid_last_symbol(&self) -> bool { + match self.decode_error { + base64::DecodeError::InvalidLastSymbol { .. } => true, + _ => false, + } + } + + /// If the error was caused by an invalid last symbol, returns the offset and offending byte. + fn invalid_last_symbol_info(&self) -> Option<(usize, u8)> { + match self.decode_error { + base64::DecodeError::InvalidLastSymbol(position, byte) => { + Some((position, byte)) + } + _ => None, + } + } + + /// The nature of the padding was not as configured: absent or incorrect when it must be + /// canonical, or present when it must be absent, etc. + fn caused_by_invalid_padding(&self) -> bool { + match self.decode_error { + base64::DecodeError::InvalidPadding => true, + _ => false, + } + } + + fn __repr__(&self) -> String { + self.full_message() + } + + fn __str__(&self) -> String { + self.full_message() + } +} + +impl QueryParserError for ExpectedBase64Error { + fn full_message(&self) -> String { + format!("Expected base64: {0:?}", self.decode_error) + } +} + +impl From for tv::query::QueryParserError { + fn from(error: ExpectedBase64Error) -> Self { + tv::query::QueryParserError::ExpectedBase64(error.decode_error) + } +} + +impl TryFrom for ExpectedBase64Error { + type Error = String; + + fn try_from( + error: tv::query::QueryParserError, + ) -> Result { + match error { + tv::query::QueryParserError::ExpectedBase64(decode_error) => { + Ok(Self { decode_error }) + } + _ => Err(format!("{error} is not an ExpectedBase64 error")), + } + } +} + +/// The query contains a term for a `f64`-field, but the value is not a f64. 
+#[pyclass(frozen)] +pub(crate) struct ExpectedFloatError { + parse_float_error: ParseFloatError, +} + +#[pymethods] +impl ExpectedFloatError { + fn __repr__(&self) -> String { + self.full_message() + } + + fn __str__(&self) -> String { + self.full_message() + } +} + +impl QueryParserError for ExpectedFloatError { + fn full_message(&self) -> String { + format!("Expected a float value: '{0:?}'", self.parse_float_error) + } +} + +impl From for tv::query::QueryParserError { + fn from(error: ExpectedFloatError) -> Self { + tv::query::QueryParserError::ExpectedFloat(error.parse_float_error) + } +} + +impl TryFrom for ExpectedFloatError { + type Error = String; + + fn try_from( + error: tv::query::QueryParserError, + ) -> Result { + match error { + tv::query::QueryParserError::ExpectedFloat(parse_float_error) => { + Ok(Self { parse_float_error }) + } + _ => Err(format!("{error} is not an ExpectedFloat error")), + } + } +} + +/// The query contains a term for a `bool`-field, but the value is not a bool. +#[pyclass(frozen)] +pub(crate) struct ExpectedBoolError { + parse_bool_error: ParseBoolError, +} + +#[pymethods] +impl ExpectedBoolError { + fn __repr__(&self) -> String { + self.full_message() + } + + fn __str__(&self) -> String { + self.full_message() + } +} + +impl QueryParserError for ExpectedBoolError { + fn full_message(&self) -> String { + format!("Expected a bool value: '{0:?}'", self.parse_bool_error) + } +} + +impl From for tv::query::QueryParserError { + fn from(error: ExpectedBoolError) -> Self { + tv::query::QueryParserError::ExpectedBool(error.parse_bool_error) + } +} + +impl TryFrom for ExpectedBoolError { + type Error = String; + + fn try_from( + error: tv::query::QueryParserError, + ) -> Result { + match error { + tv::query::QueryParserError::ExpectedBool(parse_bool_error) => { + Ok(Self { parse_bool_error }) + } + _ => Err(format!("{error} is not an ExpectedBool error")), + } + } +} + +/// It is forbidden queries that are only "excluding". (e.g. 
-title:pop) +#[pyclass(frozen)] +pub(crate) struct AllButQueryForbiddenError; + +#[pymethods] +impl AllButQueryForbiddenError { + fn __repr__(&self) -> String { + self.full_message() + } + + fn __str__(&self) -> String { + self.full_message() + } +} + +impl QueryParserError for AllButQueryForbiddenError { + fn full_message(&self) -> String { + "Invalid query: Only excluding terms given".to_string() + } +} + +impl From for tv::query::QueryParserError { + fn from(_error: AllButQueryForbiddenError) -> Self { + tv::query::QueryParserError::AllButQueryForbidden + } +} + +impl TryFrom for AllButQueryForbiddenError { + type Error = String; + + fn try_from( + error: tv::query::QueryParserError, + ) -> Result { + match error { + tv::query::QueryParserError::AllButQueryForbidden => Ok(Self {}), + _ => Err(format!("{error} is not an AllButQueryForbidden error")), + } + } +} + +/// If no default field is declared, running a query without any field specified is forbidden. +#[pyclass(frozen)] +pub(crate) struct NoDefaultFieldDeclaredError; + +#[pymethods] +impl NoDefaultFieldDeclaredError { + fn __repr__(&self) -> String { + self.full_message() + } + + fn __str__(&self) -> String { + self.full_message() + } +} + +impl QueryParserError for NoDefaultFieldDeclaredError { + fn full_message(&self) -> String { + "No default field declared and no field specified in query".to_string() + } +} + +impl From for tv::query::QueryParserError { + fn from(_error: NoDefaultFieldDeclaredError) -> Self { + tv::query::QueryParserError::NoDefaultFieldDeclared + } +} + +impl TryFrom for NoDefaultFieldDeclaredError { + type Error = String; + + fn try_from( + error: tv::query::QueryParserError, + ) -> Result { + match error { + tv::query::QueryParserError::NoDefaultFieldDeclared => Ok(Self {}), + _ => Err(format!("{error} is not a NoDefaultFieldDeclared error")), + } + } +} + +/// The field searched for is not declared as indexed in the schema.
+#[pyclass(frozen)] +pub(crate) struct FieldNotIndexedError { + field: String, +} + +#[pymethods] +impl FieldNotIndexedError { + fn field(&self) -> &str { + self.field.as_str() + } + + fn __repr__(&self) -> String { + self.full_message() + } + + fn __str__(&self) -> String { + self.full_message() + } +} + +impl QueryParserError for FieldNotIndexedError { + fn full_message(&self) -> String { + format!("The field '{0}' is not declared as indexed", self.field) + } +} + +impl From for tv::query::QueryParserError { + fn from(error: FieldNotIndexedError) -> Self { + tv::query::QueryParserError::FieldNotIndexed(error.field) + } +} + +impl TryFrom for FieldNotIndexedError { + type Error = String; + + fn try_from( + error: tv::query::QueryParserError, + ) -> Result { + match error { + tv::query::QueryParserError::FieldNotIndexed(field) => { + Ok(Self { field }) + } + _ => Err(format!("{error} is not an FieldNotIndexed error")), + } + } +} + +/// A phrase query was requested for a field that does not have any positions indexed. 
+#[pyclass(frozen)]
+pub(crate) struct FieldDoesNotHavePositionsIndexedError {
+    /// Field name carried by the tantivy error variant.
+    field: String,
+}
+
+#[pymethods]
+impl FieldDoesNotHavePositionsIndexedError {
+    /// The field name stored in this error.
+    fn field(&self) -> &str {
+        self.field.as_str()
+    }
+
+    /// Same text as `__str__`: the full error message.
+    fn __repr__(&self) -> String {
+        self.full_message()
+    }
+
+    /// The full error message.
+    fn __str__(&self) -> String {
+        self.full_message()
+    }
+}
+
+impl QueryParserError for FieldDoesNotHavePositionsIndexedError {
+    fn full_message(&self) -> String {
+        format!(
+            "The field '{0}' does not have positions indexed",
+            self.field
+        )
+    }
+}
+
+/// Rebuilds the tantivy error variant from the wrapper.
+impl From<FieldDoesNotHavePositionsIndexedError>
+    for tv::query::QueryParserError
+{
+    fn from(error: FieldDoesNotHavePositionsIndexedError) -> Self {
+        tv::query::QueryParserError::FieldDoesNotHavePositionsIndexed(
+            error.field,
+        )
+    }
+}
+
+/// Succeeds only for the `FieldDoesNotHavePositionsIndexed` variant; any
+/// other variant yields a `String` describing the mismatch.
+impl TryFrom<tv::query::QueryParserError>
+    for FieldDoesNotHavePositionsIndexedError
+{
+    type Error = String;
+
+    fn try_from(
+        error: tv::query::QueryParserError,
+    ) -> Result<Self, Self::Error> {
+        match error {
+            tv::query::QueryParserError::FieldDoesNotHavePositionsIndexed(
+                field,
+            ) => Ok(Self { field }),
+            _ => Err(format!(
+                "{error} is not a FieldDoesNotHavePositionsIndexed error"
+            )),
+        }
+    }
+}
+
+/// A phrase-prefix query requires at least two terms
+#[pyclass(frozen)]
+pub(crate) struct PhrasePrefixRequiresAtLeastTwoTermsError {
+    /// The phrase which triggered the issue.
+    phrase: String,
+    /// The tokenizer configured for the field.
+    tokenizer: String,
+}
+
+#[pymethods]
+impl PhrasePrefixRequiresAtLeastTwoTermsError {
+    /// The phrase stored in this error.
+    fn phrase(&self) -> &str {
+        self.phrase.as_str()
+    }
+
+    /// The tokenizer name stored in this error.
+    fn tokenizer(&self) -> &str {
+        self.tokenizer.as_str()
+    }
+
+    /// Same text as `__str__`: the full error message.
+    fn __repr__(&self) -> String {
+        self.full_message()
+    }
+
+    /// The full error message.
+    fn __str__(&self) -> String {
+        self.full_message()
+    }
+}
+
+impl QueryParserError for PhrasePrefixRequiresAtLeastTwoTermsError {
+    fn full_message(&self) -> String {
+        format!(
+            "The phrase '{0:?}' does not produce at least two terms using the tokenizer '{1:?}'",
+            self.phrase, self.tokenizer
+        )
+    }
+}
+
+/// Rebuilds the tantivy error variant from the wrapper.
+impl From<PhrasePrefixRequiresAtLeastTwoTermsError>
+    for tv::query::QueryParserError
+{
+    fn from(error: PhrasePrefixRequiresAtLeastTwoTermsError) -> Self {
+        tv::query::QueryParserError::PhrasePrefixRequiresAtLeastTwoTerms {
+            phrase: error.phrase,
+            tokenizer: error.tokenizer,
+        }
+    }
+}
+
+/// Succeeds only for the `PhrasePrefixRequiresAtLeastTwoTerms` variant; any
+/// other variant yields a `String` describing the mismatch.
+impl TryFrom<tv::query::QueryParserError>
+    for PhrasePrefixRequiresAtLeastTwoTermsError
+{
+    type Error = String;
+
+    fn try_from(
+        error: tv::query::QueryParserError,
+    ) -> Result<Self, Self::Error> {
+        match error {
+            tv::query::QueryParserError::PhrasePrefixRequiresAtLeastTwoTerms {
+                phrase,
+                tokenizer,
+            } => Ok(Self { phrase, tokenizer }),
+            _ => Err(format!(
+                "{error} is not a PhrasePrefixRequiresAtLeastTwoTerms error"
+            )),
+        }
+    }
+}
+
+/// The tokenizer for the given field is unknown.
+#[pyclass(frozen)]
+pub(crate) struct UnknownTokenizerError {
+    /// The name of the tokenizer.
+    tokenizer: String,
+    /// The field name.
+    field: String,
+}
+
+#[pymethods]
+impl UnknownTokenizerError {
+    /// The tokenizer name stored in this error.
+    fn tokenizer(&self) -> &str {
+        self.tokenizer.as_str()
+    }
+
+    /// The field name stored in this error.
+    fn field(&self) -> &str {
+        self.field.as_str()
+    }
+
+    /// Same text as `__str__`: the full error message.
+    fn __repr__(&self) -> String {
+        self.full_message()
+    }
+
+    /// The full error message.
+    fn __str__(&self) -> String {
+        self.full_message()
+    }
+}
+
+impl QueryParserError for UnknownTokenizerError {
+    fn full_message(&self) -> String {
+        format!(
+            "The tokenizer '{0:?}' for the field '{1:?}' is unknown",
+            self.tokenizer, self.field
+        )
+    }
+}
+
+/// Rebuilds the tantivy error variant from the wrapper.
+impl From<UnknownTokenizerError> for tv::query::QueryParserError {
+    fn from(error: UnknownTokenizerError) -> Self {
+        tv::query::QueryParserError::UnknownTokenizer {
+            tokenizer: error.tokenizer,
+            field: error.field,
+        }
+    }
+}
+
+/// Succeeds only for the `UnknownTokenizer` variant; any other variant
+/// yields a `String` describing the mismatch.
+impl TryFrom<tv::query::QueryParserError> for UnknownTokenizerError {
+    type Error = String;
+
+    fn try_from(
+        error: tv::query::QueryParserError,
+    ) -> Result<Self, Self::Error> {
+        match error {
+            tv::query::QueryParserError::UnknownTokenizer {
+                tokenizer,
+                field,
+            } => Ok(Self { tokenizer, field }),
+            _ => Err(format!("{error} is not an UnknownTokenizer error")),
+        }
+    }
+}
+
+/// The query contains a range query with a phrase as one of the bounds. Only terms can be used as
+/// bounds.
+#[pyclass(frozen)]
+pub(crate) struct RangeMustNotHavePhraseError;
+
+#[pymethods]
+impl RangeMustNotHavePhraseError {
+    /// Same text as `__str__`: the full error message.
+    fn __repr__(&self) -> String {
+        self.full_message()
+    }
+
+    /// The full error message.
+    fn __str__(&self) -> String {
+        self.full_message()
+    }
+}
+
+impl QueryParserError for RangeMustNotHavePhraseError {
+    fn full_message(&self) -> String {
+        "A range query cannot have a phrase as one of the bounds".to_string()
+    }
+}
+
+/// Rebuilds the tantivy error variant; the wrapper carries no data.
+impl From<RangeMustNotHavePhraseError> for tv::query::QueryParserError {
+    fn from(_error: RangeMustNotHavePhraseError) -> Self {
+        tv::query::QueryParserError::RangeMustNotHavePhrase
+    }
+}
+
+/// Succeeds only for the `RangeMustNotHavePhrase` variant; any other variant
+/// yields a `String` describing the mismatch.
+impl TryFrom<tv::query::QueryParserError> for RangeMustNotHavePhraseError {
+    type Error = String;
+
+    fn try_from(
+        error: tv::query::QueryParserError,
+    ) -> Result<Self, Self::Error> {
+        match error {
+            tv::query::QueryParserError::RangeMustNotHavePhrase => Ok(Self),
+            _ => Err(format!("{error} is not a RangeMustNotHavePhrase error")),
+        }
+    }
+}
+
+/// The format for the date field is not RFC 3339 compliant.
+#[pyclass(frozen)]
+pub(crate) struct DateFormatError {
+    // Keep around the entire `QueryParserError` to avoid importing the `time` crate.
+    inner: tv::query::QueryParserError,
+}
+
+#[pymethods]
+impl DateFormatError {
+    /// Same text as `__str__`: the full error message.
+    fn __repr__(&self) -> String {
+        self.full_message()
+    }
+
+    /// The full error message.
+    fn __str__(&self) -> String {
+        self.full_message()
+    }
+}
+
+impl QueryParserError for DateFormatError {
+    // Fixed text: the wrapped `inner` error is not consulted here.
+    fn full_message(&self) -> String {
+        "The date field has an invalid format".to_string()
+    }
+}
+
+/// Hands back the original tantivy error that was stored at conversion time.
+impl From<DateFormatError> for tv::query::QueryParserError {
+    fn from(error: DateFormatError) -> Self {
+        error.inner
+    }
+}
+
+/// Succeeds only for the `DateFormatError` variant, storing the whole error
+/// as `inner`; any other variant yields a `String` describing the mismatch.
+impl TryFrom<tv::query::QueryParserError> for DateFormatError {
+    type Error = String;
+
+    fn try_from(
+        error: tv::query::QueryParserError,
+    ) -> Result<Self, Self::Error> {
+        match error {
+            tv::query::QueryParserError::DateFormatError { .. } => {
+                Ok(Self { inner: error })
+            }
+            _ => Err(format!("{error} is not a DateFormatError")),
+        }
+    }
+}
+
+/// The format for the facet field is invalid.
+#[pyclass(frozen)]
+pub(crate) struct FacetFormatError {
+    /// Parse error carried by the tantivy `FacetFormatError` variant.
+    facet_parse_error: FacetParseError,
+}
+
+#[pymethods]
+impl FacetFormatError {
+    /// Same text as `__str__`: the full error message.
+    fn __repr__(&self) -> String {
+        self.full_message()
+    }
+
+    /// The full error message.
+    fn __str__(&self) -> String {
+        self.full_message()
+    }
+}
+
+impl QueryParserError for FacetFormatError {
+    fn full_message(&self) -> String {
+        format!("The facet field is malformed: {0}", self.facet_parse_error)
+    }
+}
+
+/// Rebuilds the tantivy error variant from the wrapper.
+impl From<FacetFormatError> for tv::query::QueryParserError {
+    fn from(error: FacetFormatError) -> Self {
+        tv::query::QueryParserError::FacetFormatError(error.facet_parse_error)
+    }
+}
+
+/// Succeeds only for the `FacetFormatError` variant; any other variant
+/// yields a `String` describing the mismatch.
+impl TryFrom<tv::query::QueryParserError> for FacetFormatError {
+    type Error = String;
+
+    fn try_from(
+        error: tv::query::QueryParserError,
+    ) -> Result<Self, Self::Error> {
+        match error {
+            tv::query::QueryParserError::FacetFormatError(
+                facet_parse_error,
+            ) => Ok(Self { facet_parse_error }),
+            _ => Err(format!("{error} is not a FacetFormatError")),
+        }
+    }
+}
+
+/// The format for the ip field is invalid.
+#[pyclass(frozen)]
+pub(crate) struct IpFormatError {
+    /// Parse error carried by the tantivy `IpFormatError` variant.
+    addr_parse_error: AddrParseError,
+}
+
+#[pymethods]
+impl IpFormatError {
+    /// Same text as `__str__`: the full error message.
+    fn __repr__(&self) -> String {
+        self.full_message()
+    }
+
+    /// The full error message.
+    fn __str__(&self) -> String {
+        self.full_message()
+    }
+}
+
+impl QueryParserError for IpFormatError {
+    // The message names the ip field, since this error wraps an address
+    // parse failure rather than a facet one.
+    fn full_message(&self) -> String {
+        format!("The ip field is malformed: {0}", self.addr_parse_error)
+    }
+}
+
+/// Rebuilds the tantivy error variant from the wrapper.
+impl From<IpFormatError> for tv::query::QueryParserError {
+    fn from(error: IpFormatError) -> Self {
+        tv::query::QueryParserError::IpFormatError(error.addr_parse_error)
+    }
+}
+
+/// Succeeds only for the `IpFormatError` variant; any other variant yields a
+/// `String` describing the mismatch.
+impl TryFrom<tv::query::QueryParserError> for IpFormatError {
+    type Error = String;
+
+    fn try_from(
+        error: tv::query::QueryParserError,
+    ) -> Result<Self, Self::Error> {
+        match error {
+            tv::query::QueryParserError::IpFormatError(addr_parse_error) => {
+                Ok(Self { addr_parse_error })
+            }
+            _ => Err(format!("{error} is not an IpFormatError")),
+        }
+    }
+}
diff --git a/src/searcher.rs b/src/searcher.rs
index ae37fa500..774b9264d 100644
--- a/src/searcher.rs
+++ b/src/searcher.rs
@@
-129,7 +129,7 @@ impl Searcher { if let Some(order_by) = order_by_field { let collector = TopDocs::with_limit(limit) .and_offset(offset) - .order_by_u64_field(order_by); + .order_by_fast_field(order_by, tv::Order::Desc); let top_docs_handle = multicollector.add_collector(collector); let ret = self.inner.search(query.get(), &multicollector); diff --git a/tests/tantivy_test.py b/tests/tantivy_test.py index efa23d5e9..3257c0f80 100644 --- a/tests/tantivy_test.py +++ b/tests/tantivy_test.py @@ -283,6 +283,30 @@ def test_query_errors(self, ram_index): with pytest.raises(ValueError): index.parse_query("bod:men", ["title", "body"]) + def test_query_lenient(self, ram_index_numeric_fields): + from tantivy import query_parser_error + + index = ram_index_numeric_fields + + query, errors = index.parse_query_lenient("rating:3.5") + assert len(errors) == 0 + assert repr(query) == """Query(TermQuery(Term(field=1, type=F64, 3.5)))""" + + _, errors = index.parse_query_lenient("bod:men") + assert len(errors) == 1 + assert isinstance(errors[0], query_parser_error.FieldDoesNotExistError) + + query, errors = index.parse_query_lenient( + "body:'hello' AND id:<3.5 OR rating:'hi'" + ) + assert len(errors) == 2 + assert isinstance(errors[0], query_parser_error.ExpectedIntError) + assert isinstance(errors[1], query_parser_error.ExpectedFloatError) + assert ( + repr(query) + == """Query(BooleanQuery { subqueries: [(Should, BooleanQuery { subqueries: [(Must, TermQuery(Term(field=3, type=Str, "hello")))] })] })""" + ) + def test_order_by_search(self): schema = ( SchemaBuilder() @@ -544,7 +568,10 @@ def test_create_readers(self): class TestSearcher(object): def test_searcher_repr(self, ram_index, ram_index_numeric_fields): assert repr(ram_index.searcher()) == "Searcher(num_docs=3, num_segments=1)" - assert repr(ram_index_numeric_fields.searcher()) == "Searcher(num_docs=2, num_segments=1)" + assert ( + repr(ram_index_numeric_fields.searcher()) + == "Searcher(num_docs=2, num_segments=1)" + ) 
class TestDocument(object):