From 359cc38de15db34d73d3a5776dc9e21867ac932f Mon Sep 17 00:00:00 2001 From: Will Holley Date: Fri, 3 Nov 2023 11:07:00 +0000 Subject: [PATCH] mango: fix $beginsWith range (#4828) In the intial implementation of $beginsWith, the range calculation for json indexes mistakenly appends an integer with the size of 8 bits which gets maxed out at FF, rather than building a binary with an extra 3 bytes at the end. This commit fixes the `mango_idx_view:range/5` by correctly appending the `U+FFFF` code point to create a utf-8 encoded binary. Additionally, the Erlang `utf8` binary type ensures the result is a valid utf8 string. If `Arg` is not a utf8 binary, this will throw a badarg error. We expect `Arg` strings to be a valid utf8 but, to be safe, `mango_selector:norm_ops/1` is enhanced to verify that any argument to `$beginsWith` is a utf8 string. --- src/mango/src/mango_idx_view.erl | 10 ++++++++-- src/mango/src/mango_selector.erl | 13 +++++++++++-- src/mango/test/25-beginswith-test.py | 10 +++++----- 3 files changed, 24 insertions(+), 9 deletions(-) diff --git a/src/mango/src/mango_idx_view.erl b/src/mango/src/mango_idx_view.erl index d1650e987cc..95e1fc5affe 100644 --- a/src/mango/src/mango_idx_view.erl +++ b/src/mango/src/mango_idx_view.erl @@ -347,7 +347,7 @@ range(Selector, Index) -> range(Selector, Index, '$gt', mango_json:min(), '$lt', mango_json:max()). % Adjust Low and High based on values found for the -% givend Index in Selector. +% given Index in Selector. range({[{<<"$and">>, Args}]}, Index, LCmp, Low, HCmp, High) -> lists:foldl( fun @@ -417,7 +417,9 @@ range(_, _, LCmp, Low, HCmp, High) -> % beginsWith requires both a high and low bound range({[{<<"$beginsWith">>, Arg}]}, LCmp, Low, HCmp, High) -> {LCmp0, Low0, HCmp0, High0} = range({[{<<"$gte">>, Arg}]}, LCmp, Low, HCmp, High), - range({[{<<"$lte">>, <>}]}, LCmp0, Low0, HCmp0, High0); + % U+FFFF is the highest sorting code point according to the collator rules, + % even though it's not the highest code point in UTF8. + range({[{<<"$lte">>, <>}]}, LCmp0, Low0, HCmp0, High0); range({[{<<"$lt">>, Arg}]}, LCmp, Low, HCmp, High) -> case range_pos(Low, Arg, High) of min -> @@ -624,6 +626,10 @@ indexable_fields_gte_test() -> Selector = #{<<"field">> => #{<<"$gte">> => undefined}}, ?assertEqual([<<"field">>], indexable_fields_of(Selector)). +indexable_fields_beginswith_test() -> + Selector = #{<<"field">> => #{<<"$beginsWith">> => undefined}}, + ?assertEqual([<<"field">>], indexable_fields_of(Selector)). + indexable_fields_gt_test() -> Selector = #{<<"field">> => #{<<"$gt">> => undefined}}, ?assertEqual([<<"field">>], indexable_fields_of(Selector)). diff --git a/src/mango/src/mango_selector.erl b/src/mango/src/mango_selector.erl index 93d3b10ca6c..42031b7569d 100644 --- a/src/mango/src/mango_selector.erl +++ b/src/mango/src/mango_selector.erl @@ -136,7 +136,10 @@ norm_ops({[{<<"$text">>, Arg}]}) when norm_ops({[{<<"$text">>, Arg}]}) -> ?MANGO_ERROR({bad_arg, '$text', Arg}); norm_ops({[{<<"$beginsWith">>, Arg}]} = Cond) when is_binary(Arg) -> - Cond; + case couch_util:validate_utf8(Arg) of + true -> Cond; + false -> ?MANGO_ERROR({bad_arg, '$beginsWith', Arg}) + end; % Not technically an operator but we pass it through here % so that this function accepts its own output. This exists % so that $text can have a field name value which simplifies @@ -1070,12 +1073,18 @@ check_beginswith(Field, Prefix) -> match_beginswith_test() -> % matching ?assertEqual(true, check_beginswith(<<"_id">>, <<"f">>)), - % no match (user_id is not a binary string) + % no match (user_id field in the test doc contains an integer) ?assertEqual(false, check_beginswith(<<"user_id">>, <<"f">>)), % invalid (prefix is not a binary string) ?assertThrow( {mango_error, mango_selector, {invalid_operator, <<"$beginsWith">>}}, check_beginswith(<<"user_id">>, 1) + ), + % invalid (prefix is not a utf8 string) + InvalidArg = <<16#EF>>, + ?assertThrow( + {mango_error, mango_selector, {bad_arg, '$beginsWith', InvalidArg}}, + check_beginswith(<<"user_id">>, InvalidArg) ). -endif. diff --git a/src/mango/test/25-beginswith-test.py b/src/mango/test/25-beginswith-test.py index 3b5134b6514..df96560b4e0 100644 --- a/src/mango/test/25-beginswith-test.py +++ b/src/mango/test/25-beginswith-test.py @@ -54,7 +54,7 @@ def test_json_range(self): self.assertEqual(mrargs["start_key"], ["A"]) end_key_bytes = to_utf8_bytes(mrargs["end_key"]) - self.assertEqual(end_key_bytes, [b"A\xef\xbf\xbd", b""]) + self.assertEqual(end_key_bytes, [b"A\xef\xbf\xbf", b""]) def test_compound_key(self): selector = {"name": "Eddie", "location": {"$beginsWith": "A"}} @@ -62,7 +62,7 @@ def test_compound_key(self): self.assertEqual(mrargs["start_key"], ["Eddie", "A"]) end_key_bytes = to_utf8_bytes(mrargs["end_key"]) - self.assertEqual(end_key_bytes, [b"Eddie", b"A\xef\xbf\xbd", b""]) + self.assertEqual(end_key_bytes, [b"Eddie", b"A\xef\xbf\xbf", b""]) docs = self.db.find(selector) self.assertEqual(len(docs), 1) @@ -74,12 +74,12 @@ def test_sort(self): { "sort": ["location"], "start_key": [b"A"], - "end_key": [b"A\xef\xbf\xbd", b""], + "end_key": [b"A\xef\xbf\xbf", b""], "direction": "fwd", }, { "sort": [{"location": "desc"}], - "start_key": [b"A\xef\xbf\xbd", b""], + "start_key": [b"A\xef\xbf\xbf", b""], "end_key": [b"A"], "direction": "rev", }, @@ -97,7 +97,7 @@ def test_all_docs_range(self): self.assertEqual(mrargs["start_key"], "a") end_key_bytes = to_utf8_bytes(mrargs["end_key"]) - self.assertEqual(end_key_bytes, [b"a", b"\xef\xbf\xbd"]) + self.assertEqual(end_key_bytes, [b"a", b"\xef\xbf\xbf"]) def test_no_index(self): selector = {"foo": {"$beginsWith": "a"}}