Skip to content

Commit

Permalink
feat(query): Like function support variant type (databendlabs#12064)
Browse files Browse the repository at this point in the history
* feat(query): Like function support variant type

* fix
  • Loading branch information
b41sh authored Jul 12, 2023
1 parent ad81886 commit ee97a48
Show file tree
Hide file tree
Showing 7 changed files with 199 additions and 6 deletions.
2 changes: 1 addition & 1 deletion Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,7 @@ opendal = { version = "0.37", features = [
] }
ethnum = { version = "1.3.2" }
ordered-float = { version = "3.6.0", default-features = false }
jsonb = { git = "https://github.com/datafuselabs/jsonb", rev = "d81fbee" }
jsonb = { git = "https://github.com/datafuselabs/jsonb", rev = "fe6835a" }

# openraft = { version = "0.8.2", features = ["compat-07"] }
# For debugging
Expand Down
99 changes: 99 additions & 0 deletions src/query/functions/src/scalars/comparison.rs
Original file line number Diff line number Diff line change
Expand Up @@ -441,6 +441,56 @@ fn register_tuple_cmp(registry: &mut FunctionRegistry) {
fn register_like(registry: &mut FunctionRegistry) {
registry.register_aliases("regexp", &["rlike"]);

registry.register_passthrough_nullable_2_arg::<VariantType, StringType, BooleanType, _, _>(
"like",
|_, _, _| FunctionDomain::Full,
variant_vectorize_like(|val, pat, _, pattern_type| {
match &pattern_type {
PatternType::OrdinalStr => {
if let Some(s) = jsonb::as_str(val) {
s.as_bytes() == pat
} else {
false
}
}
PatternType::EndOfPercent => {
// fast path, can use starts_with
if let Some(s) = jsonb::as_str(val) {
let v = s.as_bytes();
v.starts_with(&pat[..pat.len() - 1])
} else {
false
}
}
PatternType::StartOfPercent => {
// fast path, can use ends_with
if let Some(s) = jsonb::as_str(val) {
let v = s.as_bytes();
v.ends_with(&pat[1..])
} else {
false
}
}
PatternType::SurroundByPercent => {
jsonb::traverse_check_string(val, |v| {
if pat.len() > 2 {
memmem::find(v, &pat[1..pat.len() - 1]).is_some()
} else {
// true for empty '%%' pattern, which follows pg/mysql way
true
}
})
}
PatternType::SimplePattern(simple_pattern) => {
jsonb::traverse_check_string(val, |v| {
simple_like(v, simple_pattern.0, simple_pattern.1, &simple_pattern.2)
})
}
PatternType::ComplexPattern => jsonb::traverse_check_string(val, |v| like(v, pat)),
}
}),
);

registry.register_passthrough_nullable_2_arg::<StringType, StringType, BooleanType, _, _>(
"like",
|_, lhs, rhs| {
Expand Down Expand Up @@ -583,6 +633,55 @@ fn vectorize_like(
}
}

fn variant_vectorize_like(
func: impl Fn(&[u8], &[u8], &mut EvalContext, &PatternType) -> bool + Copy,
) -> impl Fn(ValueRef<VariantType>, ValueRef<StringType>, &mut EvalContext) -> Value<BooleanType> + Copy
{
move |arg1, arg2, ctx| match (arg1, arg2) {
(ValueRef::Scalar(arg1), ValueRef::Scalar(arg2)) => {
let pattern_type = check_pattern_type(arg2, false);
Value::Scalar(func(arg1, arg2, ctx, &pattern_type))
}
(ValueRef::Column(arg1), ValueRef::Scalar(arg2)) => {
let arg1_iter = VariantType::iter_column(&arg1);

let pattern_type = check_pattern_type(arg2, false);
// faster path for memmem to have a single instance of Finder
if pattern_type == PatternType::SurroundByPercent && arg2.len() > 2 {
let finder = memmem::Finder::new(&arg2[1..arg2.len() - 1]);
let it = arg1_iter.map(|arg1| finder.find(arg1).is_some());
let bitmap = BooleanType::column_from_iter(it, &[]);
return Value::Column(bitmap);
}

let mut builder = MutableBitmap::with_capacity(arg1.len());
for arg1 in arg1_iter {
builder.push(func(arg1, arg2, ctx, &pattern_type));
}
Value::Column(builder.into())
}
(ValueRef::Scalar(arg1), ValueRef::Column(arg2)) => {
let arg2_iter = VariantType::iter_column(&arg2);
let mut builder = MutableBitmap::with_capacity(arg2.len());
for arg2 in arg2_iter {
let pattern_type = check_pattern_type(arg2, false);
builder.push(func(arg1, arg2, ctx, &pattern_type));
}
Value::Column(builder.into())
}
(ValueRef::Column(arg1), ValueRef::Column(arg2)) => {
let arg1_iter = VariantType::iter_column(&arg1);
let arg2_iter = VariantType::iter_column(&arg2);
let mut builder = MutableBitmap::with_capacity(arg2.len());
for (arg1, arg2) in arg1_iter.zip(arg2_iter) {
let pattern_type = check_pattern_type(arg2, false);
builder.push(func(arg1, arg2, ctx, &pattern_type));
}
Value::Column(builder.into())
}
}
}

fn vectorize_regexp(
func: impl Fn(
&[u8],
Expand Down
15 changes: 15 additions & 0 deletions src/query/functions/tests/it/scalars/comparison.rs
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,21 @@ fn test_like(file: &mut impl Write) {
("rhs", StringType::from_data(vec!["a%", "_b_", "abe", "a"])),
];
run_ast(file, "lhs like rhs", &columns);

run_ast(file, "parse_json('\"hello\"') like 'h%'", &[]);
run_ast(file, "parse_json('{\"abc\":1,\"def\":22}') like '%e%'", &[]);
run_ast(
file,
"parse_json('{\"k1\":\"abc\",\"k2\":\"def\"}') like '%e%'",
&[],
);

let columns = [(
"lhs",
StringType::from_data(vec!["\"abc\"", "{\"abd\":12}", "[\"abe\",\"abf\"]"]),
)];
run_ast(file, "parse_json(lhs) like 'a%'", &columns);
run_ast(file, "parse_json(lhs) like '%ab%'", &columns);
}

fn test_regexp(file: &mut impl Write) {
Expand Down
71 changes: 71 additions & 0 deletions src/query/functions/tests/it/scalars/testdata/comparison.txt
Original file line number Diff line number Diff line change
Expand Up @@ -1334,6 +1334,77 @@ evaluation (internal):
+--------+------------------------------------------------------------------------------+


ast : parse_json('"hello"') like 'h%'
raw expr : like(parse_json('"hello"'), 'h%')
checked expr : like<Variant, String>(parse_json<String>("\"hello\""), "h%")
optimized expr : true
output type : Boolean
output domain : {TRUE}
output : true


ast : parse_json('{"abc":1,"def":22}') like '%e%'
raw expr : like(parse_json('{"abc":1,"def":22}'), '%e%')
checked expr : like<Variant, String>(parse_json<String>("{\"abc\":1,\"def\":22}"), "%e%")
optimized expr : true
output type : Boolean
output domain : {TRUE}
output : true


ast : parse_json('{"k1":"abc","k2":"def"}') like '%e%'
raw expr : like(parse_json('{"k1":"abc","k2":"def"}'), '%e%')
checked expr : like<Variant, String>(parse_json<String>("{\"k1\":\"abc\",\"k2\":\"def\"}"), "%e%")
optimized expr : true
output type : Boolean
output domain : {TRUE}
output : true


ast : parse_json(lhs) like 'a%'
raw expr : like(parse_json(lhs::String), 'a%')
checked expr : like<Variant, String>(parse_json<String>(lhs), "a%")
evaluation:
+--------+------------------------------+---------+
| | lhs | Output |
+--------+------------------------------+---------+
| Type | String | Boolean |
| Domain | {"\"abc\""..="{\"abd\":12}"} | Unknown |
| Row 0 | '"abc"' | true |
| Row 1 | '{"abd":12}' | false |
| Row 2 | '["abe","abf"]' | false |
+--------+------------------------------+---------+
evaluation (internal):
+--------+------------------------------------------------------------------------------------------------------------+
| Column | Data |
+--------+------------------------------------------------------------------------------------------------------------+
| lhs | StringColumn { data: 0x22616263227b22616264223a31327d5b22616265222c22616266225d, offsets: [0, 5, 15, 28] } |
| Output | Boolean([0b_____001]) |
+--------+------------------------------------------------------------------------------------------------------------+


ast : parse_json(lhs) like '%ab%'
raw expr : like(parse_json(lhs::String), '%ab%')
checked expr : like<Variant, String>(parse_json<String>(lhs), "%ab%")
evaluation:
+--------+------------------------------+---------+
| | lhs | Output |
+--------+------------------------------+---------+
| Type | String | Boolean |
| Domain | {"\"abc\""..="{\"abd\":12}"} | Unknown |
| Row 0 | '"abc"' | true |
| Row 1 | '{"abd":12}' | true |
| Row 2 | '["abe","abf"]' | true |
+--------+------------------------------+---------+
evaluation (internal):
+--------+------------------------------------------------------------------------------------------------------------+
| Column | Data |
+--------+------------------------------------------------------------------------------------------------------------+
| lhs | StringColumn { data: 0x22616263227b22616264223a31327d5b22616265222c22616266225d, offsets: [0, 5, 15, 28] } |
| Output | Boolean([0b_____111]) |
+--------+------------------------------------------------------------------------------------------------------------+


ast : lhs regexp rhs
raw expr : regexp(lhs::String, rhs::String)
checked expr : regexp<String, String>(lhs, rhs)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1776,8 +1776,10 @@ Functions overloads:
4 length(Array(T0) NULL) :: UInt64 NULL
5 length(String) :: UInt64
6 length(String NULL) :: UInt64 NULL
0 like(String, String) :: Boolean
1 like(String NULL, String NULL) :: Boolean NULL
0 like(Variant, String) :: Boolean
1 like(Variant NULL, String NULL) :: Boolean NULL
2 like(String, String) :: Boolean
3 like(String NULL, String NULL) :: Boolean NULL
0 ln(UInt8) :: Float64
1 ln(UInt8 NULL) :: Float64 NULL
2 ln(UInt16) :: Float64
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1012,7 +1012,10 @@ SELECT parse_json('"cd"') like 'ab'
----
0


query B
select parse_json('{"name":"jcs.sol"}') like '%.sol%';
----
1

query B
SELECT parse_json('"ab"') not like 'ab'
Expand All @@ -1026,7 +1029,10 @@ SELECT parse_json('"cd"') not like 'ab'
----
1


query B
select parse_json('{"name":"jcs.sol"}') not like '%.sol%';
----
0

query B
SELECT parse_json('"ab"') regexp '.*'
Expand Down

0 comments on commit ee97a48

Please sign in to comment.