Skip to content

Commit

Permalink
mango: fix $beginsWith range
Browse files Browse the repository at this point in the history
In the initial implementation of $beginsWith, the range calculation
for json indexes mistakenly appends an integer with the size of
8 bits which gets maxed out at FF, rather than building a binary
with an extra 3 bytes at the end.

This commit fixes `mango_idx_view:range/5` by correctly appending
the `U+FFFF` code point to create a utf-8 encoded binary. Additionally,
the Erlang `utf8` binary type ensures the result
is a valid utf8 string. If `Arg` is not a utf8 binary, this will
throw a badarg error.

We expect `Arg` strings to be a valid utf8 but, to be safe,
`mango_selector:norm_ops/1` is enhanced to verify
that any argument to `$beginsWith` is a utf8 string.
  • Loading branch information
willholley committed Nov 2, 2023
1 parent e9d703c commit cc4fa13
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 9 deletions.
9 changes: 7 additions & 2 deletions src/mango/src/mango_idx_view.erl
Original file line number Diff line number Diff line change
Expand Up @@ -347,7 +347,7 @@ range(Selector, Index) ->
range(Selector, Index, '$gt', mango_json:min(), '$lt', mango_json:max()).

% Adjust Low and High based on values found for the
% givend Index in Selector.
% given Index in Selector.
range({[{<<"$and">>, Args}]}, Index, LCmp, Low, HCmp, High) ->
lists:foldl(
fun
Expand Down Expand Up @@ -417,7 +417,8 @@ range(_, _, LCmp, Low, HCmp, High) ->
% beginsWith requires both a high and low bound
range({[{<<"$beginsWith">>, Arg}]}, LCmp, Low, HCmp, High) ->
{LCmp0, Low0, HCmp0, High0} = range({[{<<"$gte">>, Arg}]}, LCmp, Low, HCmp, High),
range({[{<<"$lte">>, <<Arg/binary, 16#10FFFF>>}]}, LCmp0, Low0, HCmp0, High0);
% we use FFFF here as the highest sortable code point
range({[{<<"$lte">>, <<Arg/binary, 16#FFFF/utf8>>}]}, LCmp0, Low0, HCmp0, High0);
range({[{<<"$lt">>, Arg}]}, LCmp, Low, HCmp, High) ->
case range_pos(Low, Arg, High) of
min ->
Expand Down Expand Up @@ -624,6 +625,10 @@ indexable_fields_gte_test() ->
Selector = #{<<"field">> => #{<<"$gte">> => undefined}},
?assertEqual([<<"field">>], indexable_fields_of(Selector)).

% A field constrained only by $beginsWith is expected to be listed
% by indexable_fields_of/1.
indexable_fields_beginswith_test() ->
    Expected = [<<"field">>],
    Actual = indexable_fields_of(#{<<"field">> => #{<<"$beginsWith">> => undefined}}),
    ?assertEqual(Expected, Actual).

% A field constrained only by $gt is expected to be listed
% by indexable_fields_of/1.
indexable_fields_gt_test() ->
    Expected = [<<"field">>],
    Actual = indexable_fields_of(#{<<"field">> => #{<<"$gt">> => undefined}}),
    ?assertEqual(Expected, Actual).
Expand Down
13 changes: 11 additions & 2 deletions src/mango/src/mango_selector.erl
Original file line number Diff line number Diff line change
Expand Up @@ -136,7 +136,10 @@ norm_ops({[{<<"$text">>, Arg}]}) when
norm_ops({[{<<"$text">>, Arg}]}) ->
?MANGO_ERROR({bad_arg, '$text', Arg});
norm_ops({[{<<"$beginsWith">>, Arg}]} = Cond) when is_binary(Arg) ->
Cond;
case couch_util:validate_utf8(Arg) of
true -> Cond;
false -> ?MANGO_ERROR({bad_arg, '$beginsWith', Arg})
end;
% Not technically an operator but we pass it through here
% so that this function accepts its own output. This exists
% so that $text can have a field name value which simplifies
Expand Down Expand Up @@ -1070,12 +1073,18 @@ check_beginswith(Field, Prefix) ->
% Exercises $beginsWith through check_beginswith/2: a matching prefix,
% a non-matching field, a non-binary prefix, and a binary prefix that
% is not valid UTF-8.
match_beginswith_test() ->
    % matching
    ?assertEqual(true, check_beginswith(<<"_id">>, <<"f">>)),
    % no match (the user_id field in the test doc contains an integer,
    % not a binary string)
    ?assertEqual(false, check_beginswith(<<"user_id">>, <<"f">>)),
    % invalid (prefix is not a binary string)
    ?assertThrow(
        {mango_error, mango_selector, {invalid_operator, <<"$beginsWith">>}},
        check_beginswith(<<"user_id">>, 1)
    ),
    % invalid (prefix is not a utf8 string)
    % 16#FF can never appear in well-formed UTF-8, so a single FF byte is
    % an unambiguous invalid argument. (The earlier literal 32#FFFFF was a
    % base-32 integer that the default 8-bit segment truncated to 16#EF.)
    InvalidArg = <<16#FF>>,
    ?assertThrow(
        {mango_error, mango_selector, {bad_arg, '$beginsWith', InvalidArg}},
        check_beginswith(<<"user_id">>, InvalidArg)
    ).

-endif.
10 changes: 5 additions & 5 deletions src/mango/test/25-beginswith-test.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,15 +54,15 @@ def test_json_range(self):

self.assertEqual(mrargs["start_key"], ["A"])
end_key_bytes = to_utf8_bytes(mrargs["end_key"])
self.assertEqual(end_key_bytes, [b"A\xef\xbf\xbd", b"<MAX>"])
self.assertEqual(end_key_bytes, [b"A\xef\xbf\xbf", b"<MAX>"])

def test_compound_key(self):
selector = {"name": "Eddie", "location": {"$beginsWith": "A"}}
mrargs = self.get_mrargs(selector)

self.assertEqual(mrargs["start_key"], ["Eddie", "A"])
end_key_bytes = to_utf8_bytes(mrargs["end_key"])
self.assertEqual(end_key_bytes, [b"Eddie", b"A\xef\xbf\xbd", b"<MAX>"])
self.assertEqual(end_key_bytes, [b"Eddie", b"A\xef\xbf\xbf", b"<MAX>"])

docs = self.db.find(selector)
self.assertEqual(len(docs), 1)
Expand All @@ -74,12 +74,12 @@ def test_sort(self):
{
"sort": ["location"],
"start_key": [b"A"],
"end_key": [b"A\xef\xbf\xbd", b"<MAX>"],
"end_key": [b"A\xef\xbf\xbf", b"<MAX>"],
"direction": "fwd",
},
{
"sort": [{"location": "desc"}],
"start_key": [b"A\xef\xbf\xbd", b"<MAX>"],
"start_key": [b"A\xef\xbf\xbf", b"<MAX>"],
"end_key": [b"A"],
"direction": "rev",
},
Expand All @@ -97,7 +97,7 @@ def test_all_docs_range(self):

self.assertEqual(mrargs["start_key"], "a")
end_key_bytes = to_utf8_bytes(mrargs["end_key"])
self.assertEqual(end_key_bytes, [b"a", b"\xef\xbf\xbd"])
self.assertEqual(end_key_bytes, [b"a", b"\xef\xbf\xbf"])

def test_no_index(self):
selector = {"foo": {"$beginsWith": "a"}}
Expand Down

0 comments on commit cc4fa13

Please sign in to comment.