Skip to content

Commit

Permalink
mango: add $beginsWith operator
Browse files Browse the repository at this point in the history
Adds a `$beginsWith` operator to selectors, with json and text index
support. This is a complement / precursor to optimising `$regex`
support as proposed in #4776.

For `json` indexes, a $beginsWith operator translates into a key
range query, as is common practice for _view queries. For example,
to find all rows with a key beginning with "W", we can use a range
`start_key="W", end_key="W\ufff0"`. Given Mango uses compound keys,
this is slightly more complex in practice, but the idea is the same.
As with other range operators (`$gt`, `$gte`, etc), `$beginsWith`
can be used in combination with equality operators and result sorting
but must result in a contiguous key range. That is, a range of
`start_key=[10, "W"], end_key=[10, "W\ufff0", {}]` would be valid,
but `start_key=["W", 10], end_key=["W\ufff0", 10, {}]` would not,
because the second element of the key may result in a non-contiguous
range.

For text indexes, `$beginsWith` translates to a Lucene prefix query
(for example `W*`) on the specified field.

If a non-string operand is provided to `$beginsWith`, the request will
fail with a 400 / `invalid_operator` error.
  • Loading branch information
willholley committed Oct 17, 2023
1 parent 6c2e503 commit 85e8a9d
Show file tree
Hide file tree
Showing 5 changed files with 169 additions and 0 deletions.
6 changes: 6 additions & 0 deletions src/mango/src/mango_idx_view.erl
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,8 @@ indexable({[{<<"$gt">>, _}]}) ->
true;
indexable({[{<<"$gte">>, _}]}) ->
true;
indexable({[{<<"$beginsWith">>, _}]}) ->
true;
% This is required to improve index selection for covering indexes.
% Making `$exists` indexable should not cause problems in other cases.
indexable({[{<<"$exists">>, _}]}) ->
Expand Down Expand Up @@ -412,6 +414,10 @@ range(_, _, LCmp, Low, HCmp, High) ->
% operators but it's all straightforward once you figure out how
% we're basically just narrowing our logical ranges.

% beginsWith requires both a high and low bound
range({[{<<"$beginsWith">>, Arg}]}, LCmp, Low, HCmp, High) ->
{LCmp0, Low0, HCmp0, High0} = range({[{<<"$gte">>, Arg}]}, LCmp, Low, HCmp, High),
range({[{<<"$lte">>, <<Arg/binary, 16#10FFFF>>}]}, LCmp0, Low0, HCmp0, High0);
range({[{<<"$lt">>, Arg}]}, LCmp, Low, HCmp, High) ->
case range_pos(Low, Arg, High) of
min ->
Expand Down
32 changes: 32 additions & 0 deletions src/mango/src/mango_selector.erl
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,8 @@ norm_ops({[{<<"$text">>, Arg}]}) when
{[{<<"$default">>, {[{<<"$text">>, Arg}]}}]};
norm_ops({[{<<"$text">>, Arg}]}) ->
?MANGO_ERROR({bad_arg, '$text', Arg});
norm_ops({[{<<"$beginsWith">>, Arg}]} = Cond) when is_binary(Arg) ->
Cond;
% Not technically an operator but we pass it through here
% so that this function accepts its own output. This exists
% so that $text can have a field name value which simplifies
Expand Down Expand Up @@ -514,6 +516,11 @@ match({[{<<"$mod">>, [D, R]}]}, Value, _Cmp) when is_integer(Value) ->
Value rem D == R;
match({[{<<"$mod">>, _}]}, _Value, _Cmp) ->
false;
match({[{<<"$beginsWith">>, Prefix}]}, Value, _Cmp) when is_binary(Prefix), is_binary(Value) ->
string:prefix(Value, Prefix) /= nomatch;
% When Value is not a string, do not match
match({[{<<"$beginsWith">>, Prefix}]}, _, _Cmp) when is_binary(Prefix) ->
false;
match({[{<<"$regex">>, Regex}]}, Value, _Cmp) when is_binary(Value) ->
try
match == re:run(Value, Regex, [{capture, none}])
Expand Down Expand Up @@ -1054,4 +1061,29 @@ fields_nor_test() ->
},
?assertEqual([<<"field1">>, <<"field2">>], fields_of(Selector2)).

%% EUnit test for in-memory matching of the `$beginsWith` operator.
%% Covers three cases: a string field that starts with the prefix, a
%% non-string field (never matches), and a non-string prefix (rejected
%% with an `invalid_operator` error).
match_beginswith_test() ->
    Doc =
        {[
            {<<"_id">>, <<"foo">>},
            {<<"_rev">>, <<"bar">>},
            {<<"user_id">>, 11}
        ]},
    Check = fun(Field, Prefix) ->
        Normalized = mango_selector:normalize(
            {[{Field, {[{<<"$beginsWith">>, Prefix}]}}]}
        ),
        % match_int/2 is called directly to avoid the ERROR logged for a
        % missing metric, which is confusing in the middle of test output.
        match_int(Normalized, Doc)
    end,
    [
        % `_id` is <<"foo">>, which starts with <<"f">>
        ?assertEqual(true, Check(<<"_id">>, <<"f">>)),
        % `user_id` is an integer, so no prefix can match it
        ?assertEqual(false, Check(<<"user_id">>, <<"f">>)),
        % an integer prefix is not a valid `$beginsWith` operand
        ?assertThrow(
            {mango_error, mango_selector, {invalid_operator, <<"$beginsWith">>}},
            Check(<<"user_id">>, 1)
        )
    ].

-endif.
3 changes: 3 additions & 0 deletions src/mango/src/mango_selector_text.erl
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,9 @@ convert(Path, {[{<<"$exists">>, ShouldExist}]}) ->
true -> FieldExists;
false -> {op_not, {FieldExists, false}}
end;
convert(Path, {[{<<"$beginsWith">>, Arg}]}) ->
PrefixSearch = [value_str(Arg), <<"*">>],
{op_field, {make_field(Path, Arg), PrefixSearch}};
% We're not checking the actual type here, just looking for
% anything that has a possibility of matching by checking
% for the field name. We use the same logic for $exists on
Expand Down
16 changes: 16 additions & 0 deletions src/mango/test/03-operator-test.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,22 @@ def test_exists_false_returns_missing_but_not_null(self):
for d in docs:
self.assertNotIn("twitter", d)

def test_beginswith(self):
docs = self.db.find({"location.state": {"$beginsWith": "New"}})
self.assertEqual(len(docs), 2)
self.assertUserIds([2, 10], docs)

# non-string prefixes should return an error
def test_beginswith_invalid_prefix(self):
docs = self.db.find({"location.state": {"$beginsWith": 123}})
self.assertEqual(len(docs), 2)

# non-string values in documents should not match the prefix,
# but should not error
def test_beginswith_invalid_prefix(self):
docs = self.db.find({"user_id": {"$beginsWith": "Foo"}})
self.assertEqual(len(docs), 0)


class OperatorJSONTests(mango.UserDocsTests, BaseOperatorTests.Common):
# START: text indexes do not support range queries across type boundaries so only
Expand Down
112 changes: 112 additions & 0 deletions src/mango/test/25-beginswith-test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
# Licensed under the Apache License, Version 2.0 (the "License"); you may not
# use this file except in compliance with the License. You may obtain a copy of
# the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
# License for the specific language governing permissions and limitations under
# the License.

import copy
import mango

# Fixture documents for the `$beginsWith` tests below. Two `location`
# values start with "A" ("AUS", "AND"), two `_id` values start with "a"
# ("aaa", "abc"), and `age` is numeric so it can serve as a non-string field.
DOCS = [
    {"_id": "aaa", "name": "Jimi", "location": "AUS", "age": 27},
    {"_id": "abc", "name": "Eddie", "location": "AND", "age": 65},
    {"_id": "bbb", "name": "Harry", "location": "CAN", "age": 21},
    {"_id": "ccc", "name": "Eddie", "location": "DEN", "age": 37},
    {"_id": "ddd", "name": "Jones", "location": "ETH", "age": 49},
]


def to_utf8_bytes(list):
    """Return a new list holding `item.encode()` for every item of `list`.

    With `str` items this yields their UTF-8 byte strings, which lets tests
    compare keys containing non-ASCII characters as plain byte literals.
    """
    encoded = []
    for item in list:
        encoded.append(item.encode())
    return encoded


class BeginsWithOperator(mango.DbPerClass):
    """Tests for the `$beginsWith` selector operator against JSON indexes.

    Most tests inspect the `mrargs` of an explain response to check that the
    operator is turned into a key range: `start_key` is the prefix and
    `end_key` is the prefix followed by a high sentinel, which the explain
    output reports as U+FFFD (b"\\xef\\xbf\\xbd" once UTF-8 encoded).
    """

    def setUp(self):
        # Rebuild the database with the fixture docs, a single-field index on
        # `location` and a compound index on (`name`, `location`).
        self.db.recreate()
        self.db.save_docs(copy.deepcopy(DOCS))
        self.db.create_index(["location"])
        self.db.create_index(["name", "location"])

    def assertDocIds(self, user_ids, docs):
        """Assert that the `_id`s of `docs` equal `user_ids`, ignoring order."""
        # sorted() builds new lists, so the caller's `user_ids` is not
        # reordered as a side effect of the assertion.
        self.assertEqual(sorted(user_ids), sorted(d["_id"] for d in docs))

    def test_basic(self):
        docs = self.db.find({"location": {"$beginsWith": "A"}})

        self.assertEqual(len(docs), 2)
        self.assertDocIds(["aaa", "abc"], docs)

    def test_json_range(self):
        explain = self.db.find({"location": {"$beginsWith": "A"}}, explain=True)
        self.assertEqual(explain["mrargs"]["start_key"], ["A"])
        end_key_bytes = to_utf8_bytes(explain["mrargs"]["end_key"])
        self.assertEqual(end_key_bytes, [b"A\xef\xbf\xbd", b"<MAX>"])

    def test_compound_key(self):
        # An equality on the leading index field combined with a prefix on the
        # next field still yields one contiguous key range.
        selector = {"name": "Eddie", "location": {"$beginsWith": "A"}}
        explain = self.db.find(selector, explain=True)

        self.assertEqual(explain["mrargs"]["start_key"], ["Eddie", "A"])
        end_key_bytes = to_utf8_bytes(explain["mrargs"]["end_key"])
        self.assertEqual(end_key_bytes, [b"Eddie", b"A\xef\xbf\xbd", b"<MAX>"])

        docs = self.db.find(selector)
        self.assertEqual(len(docs), 1)
        self.assertDocIds(["abc"], docs)

    def test_sort_asc(self):
        selector = {"location": {"$beginsWith": "A"}}
        explain = self.db.find(selector, sort=["location"], explain=True)

        self.assertEqual(explain["mrargs"]["start_key"], ["A"])
        end_key_bytes = to_utf8_bytes(explain["mrargs"]["end_key"])
        self.assertEqual(end_key_bytes, [b"A\xef\xbf\xbd", b"<MAX>"])
        self.assertEqual(explain["mrargs"]["direction"], "fwd")

    def test_sort_desc(self):
        selector = {"location": {"$beginsWith": "A"}}
        explain = self.db.find(selector, sort=[{"location": "desc"}], explain=True)

        # A descending sort reads the index in reverse, so the bounds are
        # swapped relative to test_sort_asc: the upper bound is reported as
        # start_key and the bare prefix as end_key. Both are checked here;
        # previously start_key was never inspected because end_key was read
        # twice.
        start_key_bytes = to_utf8_bytes(explain["mrargs"]["start_key"])
        self.assertEqual(start_key_bytes, [b"A\xef\xbf\xbd", b"<MAX>"])
        self.assertEqual(explain["mrargs"]["end_key"], ["A"])
        self.assertEqual(explain["mrargs"]["direction"], "rev")

    def test_all_docs_range(self):
        explain = self.db.find({"_id": {"$beginsWith": "a"}}, explain=True)
        self.assertEqual(explain["mrargs"]["start_key"], "a")
        # For `_id` the keys are plain strings rather than lists, so the end
        # key is encoded as a whole instead of element by element.
        end_key_bytes = explain["mrargs"]["end_key"].encode()
        self.assertEqual(end_key_bytes, b"a\xef\xbf\xbd")

    def test_no_index(self):
        # `foo` is not covered by any index created in setUp, so the query is
        # expected to use the "special" index with an unbounded key range.
        selector = {"foo": {"$beginsWith": "a"}}
        resp_explain = self.db.find(selector, explain=True)

        self.assertEqual(resp_explain["index"]["type"], "special")
        self.assertEqual(resp_explain["mrargs"]["start_key"], None)
        self.assertEqual(resp_explain["mrargs"]["end_key"], "<MAX>")

    def test_invalid_operand(self):
        # A non-string operand must be rejected with 400 / invalid_operator.
        with self.assertRaises(Exception) as ctx:
            self.db.find({"_id": {"$beginsWith": True}})
        response = ctx.exception.response
        self.assertEqual(response.status_code, 400)
        self.assertEqual(response.json()["error"], "invalid_operator")

    def test_does_not_match_non_string_value(self):
        # `age` holds integers in every fixture doc, so nothing can match.
        selector = {"age": {"$beginsWith": "a"}}
        docs = self.db.find(selector)

        self.assertEqual(len(docs), 0)

0 comments on commit 85e8a9d

Please sign in to comment.