diff --git a/Cargo.lock b/Cargo.lock index 91e3e9c42b3b4..981a50080903a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -6373,7 +6373,7 @@ dependencies = [ [[package]] name = "jsonb" version = "0.2.3" -source = "git+https://github.com/datafuselabs/jsonb?rev=d81fbee#d81fbee0b6005bceb2bc92d1f877cfac68e6008f" +source = "git+https://github.com/datafuselabs/jsonb?rev=fe6835a#fe6835a0a6813279366550db7ba6cd64aed317d8" dependencies = [ "byteorder", "fast-float", diff --git a/Cargo.toml b/Cargo.toml index 51a1a08d275f3..0753e1c9c28b3 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -115,7 +115,7 @@ opendal = { version = "0.37", features = [ ] } ethnum = { version = "1.3.2" } ordered-float = { version = "3.6.0", default-features = false } -jsonb = { git = "https://github.com/datafuselabs/jsonb", rev = "d81fbee" } +jsonb = { git = "https://github.com/datafuselabs/jsonb", rev = "fe6835a" } # openraft = { version = "0.8.2", features = ["compat-07"] } # For debugging diff --git a/src/query/functions/src/scalars/comparison.rs b/src/query/functions/src/scalars/comparison.rs index 3025d6bb2f3f0..20ba75ce57a47 100644 --- a/src/query/functions/src/scalars/comparison.rs +++ b/src/query/functions/src/scalars/comparison.rs @@ -441,6 +441,56 @@ fn register_tuple_cmp(registry: &mut FunctionRegistry) { fn register_like(registry: &mut FunctionRegistry) { registry.register_aliases("regexp", &["rlike"]); + registry.register_passthrough_nullable_2_arg::( + "like", + |_, _, _| FunctionDomain::Full, + variant_vectorize_like(|val, pat, _, pattern_type| { + match &pattern_type { + PatternType::OrdinalStr => { + if let Some(s) = jsonb::as_str(val) { + s.as_bytes() == pat + } else { + false + } + } + PatternType::EndOfPercent => { + // fast path, can use starts_with + if let Some(s) = jsonb::as_str(val) { + let v = s.as_bytes(); + v.starts_with(&pat[..pat.len() - 1]) + } else { + false + } + } + PatternType::StartOfPercent => { + // fast path, can use ends_with + if let Some(s) = jsonb::as_str(val) { + let v = s.as_bytes(); + v.ends_with(&pat[1..]) + } else { + false + } + } + PatternType::SurroundByPercent => { + jsonb::traverse_check_string(val, |v| { + if pat.len() > 2 { + memmem::find(v, &pat[1..pat.len() - 1]).is_some() + } else { + // true for empty '%%' pattern, which follows pg/mysql way + true + } + }) + } + PatternType::SimplePattern(simple_pattern) => { + jsonb::traverse_check_string(val, |v| { + simple_like(v, simple_pattern.0, simple_pattern.1, &simple_pattern.2) + }) + } + PatternType::ComplexPattern => jsonb::traverse_check_string(val, |v| like(v, pat)), + } + }), + ); + registry.register_passthrough_nullable_2_arg::( "like", |_, lhs, rhs| { @@ -583,6 +633,55 @@ fn vectorize_like( } } +fn variant_vectorize_like( + func: impl Fn(&[u8], &[u8], &mut EvalContext, &PatternType) -> bool + Copy, +) -> impl Fn(ValueRef, ValueRef, &mut EvalContext) -> Value + Copy +{ + move |arg1, arg2, ctx| match (arg1, arg2) { + (ValueRef::Scalar(arg1), ValueRef::Scalar(arg2)) => { + let pattern_type = check_pattern_type(arg2, false); + Value::Scalar(func(arg1, arg2, ctx, &pattern_type)) + } + (ValueRef::Column(arg1), ValueRef::Scalar(arg2)) => { + let arg1_iter = VariantType::iter_column(&arg1); + + let pattern_type = check_pattern_type(arg2, false); + // faster path for memmem to have a single instance of Finder + if pattern_type == PatternType::SurroundByPercent && arg2.len() > 2 { + let finder = memmem::Finder::new(&arg2[1..arg2.len() - 1]); + let it = arg1_iter.map(|arg1| finder.find(arg1).is_some()); + let bitmap = BooleanType::column_from_iter(it, &[]); + return Value::Column(bitmap); + } + + let mut builder = MutableBitmap::with_capacity(arg1.len()); + for arg1 in arg1_iter { + builder.push(func(arg1, arg2, ctx, &pattern_type)); + } + Value::Column(builder.into()) + } + (ValueRef::Scalar(arg1), ValueRef::Column(arg2)) => { + let arg2_iter = VariantType::iter_column(&arg2); + let mut builder = MutableBitmap::with_capacity(arg2.len()); + for arg2 in arg2_iter { + let pattern_type = check_pattern_type(arg2, false); + builder.push(func(arg1, arg2, ctx, &pattern_type)); + } + Value::Column(builder.into()) + } + (ValueRef::Column(arg1), ValueRef::Column(arg2)) => { + let arg1_iter = VariantType::iter_column(&arg1); + let arg2_iter = VariantType::iter_column(&arg2); + let mut builder = MutableBitmap::with_capacity(arg2.len()); + for (arg1, arg2) in arg1_iter.zip(arg2_iter) { + let pattern_type = check_pattern_type(arg2, false); + builder.push(func(arg1, arg2, ctx, &pattern_type)); + } + Value::Column(builder.into()) + } + } +} + fn vectorize_regexp( func: impl Fn( &[u8], diff --git a/src/query/functions/tests/it/scalars/comparison.rs b/src/query/functions/tests/it/scalars/comparison.rs index 9663805fa2426..46b595aad887a 100644 --- a/src/query/functions/tests/it/scalars/comparison.rs +++ b/src/query/functions/tests/it/scalars/comparison.rs @@ -372,6 +372,21 @@ fn test_like(file: &mut impl Write) { ("rhs", StringType::from_data(vec!["a%", "_b_", "abe", "a"])), ]; run_ast(file, "lhs like rhs", &columns); + + run_ast(file, "parse_json('\"hello\"') like 'h%'", &[]); + run_ast(file, "parse_json('{\"abc\":1,\"def\":22}') like '%e%'", &[]); + run_ast( + file, + "parse_json('{\"k1\":\"abc\",\"k2\":\"def\"}') like '%e%'", + &[], + ); + + let columns = [( + "lhs", + StringType::from_data(vec!["\"abc\"", "{\"abd\":12}", "[\"abe\",\"abf\"]"]), + )]; + run_ast(file, "parse_json(lhs) like 'a%'", &columns); + run_ast(file, "parse_json(lhs) like '%ab%'", &columns); } fn test_regexp(file: &mut impl Write) { diff --git a/src/query/functions/tests/it/scalars/testdata/comparison.txt b/src/query/functions/tests/it/scalars/testdata/comparison.txt index 7cfd50619efc1..be812df8518a7 100644 --- a/src/query/functions/tests/it/scalars/testdata/comparison.txt +++ b/src/query/functions/tests/it/scalars/testdata/comparison.txt @@ -1334,6 +1334,77 @@ evaluation (internal): +--------+------------------------------------------------------------------------------+ +ast : parse_json('"hello"') like 'h%' +raw expr : like(parse_json('"hello"'), 'h%') +checked expr : like(parse_json("\"hello\""), "h%") +optimized expr : true +output type : Boolean +output domain : {TRUE} +output : true + + +ast : parse_json('{"abc":1,"def":22}') like '%e%' +raw expr : like(parse_json('{"abc":1,"def":22}'), '%e%') +checked expr : like(parse_json("{\"abc\":1,\"def\":22}"), "%e%") +optimized expr : true +output type : Boolean +output domain : {TRUE} +output : true + + +ast : parse_json('{"k1":"abc","k2":"def"}') like '%e%' +raw expr : like(parse_json('{"k1":"abc","k2":"def"}'), '%e%') +checked expr : like(parse_json("{\"k1\":\"abc\",\"k2\":\"def\"}"), "%e%") +optimized expr : true +output type : Boolean +output domain : {TRUE} +output : true + + +ast : parse_json(lhs) like 'a%' +raw expr : like(parse_json(lhs::String), 'a%') +checked expr : like(parse_json(lhs), "a%") +evaluation: ++--------+------------------------------+---------+ +| | lhs | Output | ++--------+------------------------------+---------+ +| Type | String | Boolean | +| Domain | {"\"abc\""..="{\"abd\":12}"} | Unknown | +| Row 0 | '"abc"' | true | +| Row 1 | '{"abd":12}' | false | +| Row 2 | '["abe","abf"]' | false | ++--------+------------------------------+---------+ +evaluation (internal): ++--------+------------------------------------------------------------------------------------------------------------+ +| Column | Data | ++--------+------------------------------------------------------------------------------------------------------------+ +| lhs | StringColumn { data: 0x22616263227b22616264223a31327d5b22616265222c22616266225d, offsets: [0, 5, 15, 28] } | +| Output | Boolean([0b_____001]) | ++--------+------------------------------------------------------------------------------------------------------------+ + + +ast : parse_json(lhs) like '%ab%' +raw expr : like(parse_json(lhs::String), '%ab%') +checked expr : like(parse_json(lhs), "%ab%") +evaluation: ++--------+------------------------------+---------+ +| | lhs | Output | ++--------+------------------------------+---------+ +| Type | String | Boolean | +| Domain | {"\"abc\""..="{\"abd\":12}"} | Unknown | +| Row 0 | '"abc"' | true | +| Row 1 | '{"abd":12}' | true | +| Row 2 | '["abe","abf"]' | true | ++--------+------------------------------+---------+ +evaluation (internal): ++--------+------------------------------------------------------------------------------------------------------------+ +| Column | Data | ++--------+------------------------------------------------------------------------------------------------------------+ +| lhs | StringColumn { data: 0x22616263227b22616264223a31327d5b22616265222c22616266225d, offsets: [0, 5, 15, 28] } | +| Output | Boolean([0b_____111]) | ++--------+------------------------------------------------------------------------------------------------------------+ + + ast : lhs regexp rhs raw expr : regexp(lhs::String, rhs::String) checked expr : regexp(lhs, rhs) diff --git a/src/query/functions/tests/it/scalars/testdata/function_list.txt b/src/query/functions/tests/it/scalars/testdata/function_list.txt index b2319863ffbf2..28b23e0fac940 100644 --- a/src/query/functions/tests/it/scalars/testdata/function_list.txt +++ b/src/query/functions/tests/it/scalars/testdata/function_list.txt @@ -1776,8 +1776,10 @@ Functions overloads: 4 length(Array(T0) NULL) :: UInt64 NULL 5 length(String) :: UInt64 6 length(String NULL) :: UInt64 NULL -0 like(String, String) :: Boolean -1 like(String NULL, String NULL) :: Boolean NULL +0 like(Variant, String) :: Boolean +1 like(Variant NULL, String NULL) :: Boolean NULL +2 like(String, String) :: Boolean +3 like(String NULL, String NULL) :: Boolean NULL 0 ln(UInt8) :: Float64 1 ln(UInt8 NULL) :: Float64 NULL 2 ln(UInt16) :: Float64 diff --git a/tests/sqllogictests/suites/query/02_function/02_0005_function_compare b/tests/sqllogictests/suites/query/02_function/02_0005_function_compare index cb3039c3245a6..de7b31643ab39 100644 --- a/tests/sqllogictests/suites/query/02_function/02_0005_function_compare +++ b/tests/sqllogictests/suites/query/02_function/02_0005_function_compare @@ -1012,7 +1012,10 @@ SELECT parse_json('"cd"') like 'ab' ---- 0 - +query B +select parse_json('{"name":"jcs.sol"}') like '%.sol%'; +---- +1 query B SELECT parse_json('"ab"') not like 'ab' @@ -1026,7 +1029,10 @@ SELECT parse_json('"cd"') not like 'ab' ---- 1 - +query B +select parse_json('{"name":"jcs.sol"}') not like '%.sol%'; +---- +0 query B SELECT parse_json('"ab"') regexp '.*'