diff --git a/core/analysis/multi_delimited_token_stream.cpp b/core/analysis/multi_delimited_token_stream.cpp index c97540049..f95b02e86 100644 --- a/core/analysis/multi_delimited_token_stream.cpp +++ b/core/analysis/multi_delimited_token_stream.cpp @@ -85,7 +85,7 @@ class MultiDelimitedTokenStreamSingleCharsBase public: auto FindNextDelim() { auto where = static_cast(this)->FindNextDelim(); - return std::make_pair(where, 1); + return std::make_pair(where, size_t{1}); } }; @@ -409,7 +409,7 @@ irs::analysis::analyzer::ptr Make(MultiDelimitedAnalyser::Options&& opts) { return std::make_unique(std::move(opts)); } -constexpr std::string_view kDelimiterParamName{"delimiter"}; +constexpr std::string_view kDelimiterParamName{"delimiters"}; bool ParseVpackOptions(VPackSlice slice, MultiDelimitedAnalyser::Options& options) { @@ -418,46 +418,42 @@ bool ParseVpackOptions(VPackSlice slice, "Slice for multi_delimited_token_stream is not an object or string"); return false; } + auto delim_array_slice = slice.get(kDelimiterParamName); + if (!delim_array_slice.isArray()) { + IRS_LOG_WARN( + absl::StrCat("Invalid type or missing '", kDelimiterParamName, + "' (array expected) for multi_delimited_token_stream from " + "VPack arguments")); + return false; + } - if (auto delim_array_slice = slice.get(kDelimiterParamName); - !delim_array_slice.isNone()) { - if (!delim_array_slice.isArray()) { - IRS_LOG_WARN( - absl::StrCat("Invalid type '", kDelimiterParamName, - "' (array expected) for multi_delimited_token_stream from " - "VPack arguments")); + for (auto delim : VPackArrayIterator(delim_array_slice)) { + if (!delim.isString()) { + IRS_LOG_WARN(absl::StrCat( + "Invalid type in '", kDelimiterParamName, + "' (string expected) for multi_delimited_token_stream from " + "VPack arguments")); return false; } + auto view = ViewCast(delim.stringView()); - for (auto delim : VPackArrayIterator(delim_array_slice)) { - if (!delim.isString()) { - IRS_LOG_WARN(absl::StrCat( - "Invalid type in '", kDelimiterParamName, - "' (string expected) for multi_delimited_token_stream from " - "VPack arguments")); - return false; - } - auto view = ViewCast(delim.stringView()); + if (view.empty()) { + IRS_LOG_ERROR("Delimiter list contains an empty string."); + return false; + } - if (view.empty()) { - IRS_LOG_ERROR("Delimiter list contains an empty string."); + for (const auto& known : options.delimiters) { + if (view.starts_with(known) || known.starts_with(view)) { + IRS_LOG_ERROR( + absl::StrCat("Some delimiters are a prefix of others. See `", + ViewCast(bytes_view{known}), "` and `", + delim.stringView(), "`")); return false; } - - for (const auto& known : options.delimiters) { - if (view.starts_with(known) || known.starts_with(view)) { - IRS_LOG_ERROR( - absl::StrCat("Some delimiters are a prefix of others. See `", - ViewCast(bytes_view{known}), "` and `", - delim.stringView(), "`")); - return false; - } - } - - options.delimiters.emplace_back(view); } - } + options.delimiters.emplace_back(view); + } return true; } diff --git a/core/formats/columnstore2.cpp b/core/formats/columnstore2.cpp index dc2884cb2..eb6fbe81f 100644 --- a/core/formats/columnstore2.cpp +++ b/core/formats/columnstore2.cpp @@ -393,9 +393,11 @@ class column_base : public column_reader, private util::noncopyable { if (irs::IsNull(column_name)) { column_name = ""; } - IRS_LOG_WARN( - absl::StrCat("Failed to allocate memory for buffered column id ", - header().id, " name: ", column_name, " of size ", size)); + IRS_LOG_INFO(absl::StrCat( + "Failed to allocate memory for buffered column id ", header().id, + " name: ", column_name, " of size ", (size + mappings), + ". This can happen if no columns cache was configured or the " + "column data size exceeds the columns cache size.")); return false; } diff --git a/tests/analysis/multi_delimited_token_stream_tests.cpp b/tests/analysis/multi_delimited_token_stream_tests.cpp index 05b93de1c..e5c996f04 100644 --- a/tests/analysis/multi_delimited_token_stream_tests.cpp +++ b/tests/analysis/multi_delimited_token_stream_tests.cpp @@ -21,6 +21,10 @@ #include "analysis/multi_delimited_token_stream.hpp" #include "gtest/gtest.h" #include "tests_config.hpp" +#include "velocypack/Parser.h" + +using namespace arangodb::velocypack; +using namespace irs::analysis; namespace { @@ -30,6 +34,9 @@ irs::bstring operator""_b(const char* ptr, std::size_t size) { } class multi_delimited_token_stream_tests : public ::testing::Test { + public: + static void SetUpTestCase() { MultiDelimitedAnalyser::init(); } + virtual void SetUp() { // Code here will be called immediately after the constructor (right before // each test). @@ -48,15 +55,12 @@ class multi_delimited_token_stream_tests : public ::testing::Test { // ----------------------------------------------------------------------------- TEST_F(multi_delimited_token_stream_tests, consts) { - static_assert("multi_delimiter" == - irs::type::name()); + static_assert("multi_delimiter" == irs::type::name()); } TEST_F(multi_delimited_token_stream_tests, test_delimiter) { - auto stream = - irs::analysis::MultiDelimitedAnalyser::Make({.delimiters = {"a"_b}}); - ASSERT_EQ(irs::type::id(), - stream->type()); + auto stream = MultiDelimitedAnalyser::Make({.delimiters = {"a"_b}}); + ASSERT_EQ(irs::type::id(), stream->type()); ASSERT_TRUE(stream->reset("baccaad")); @@ -83,10 +87,8 @@ TEST_F(multi_delimited_token_stream_tests, test_delimiter) { } TEST_F(multi_delimited_token_stream_tests, test_delimiter_empty_match) { - auto stream = - irs::analysis::MultiDelimitedAnalyser::Make({.delimiters = {"."_b}}); - ASSERT_EQ(irs::type::id(), - stream->type()); + auto stream = MultiDelimitedAnalyser::Make({.delimiters = {"."_b}}); + ASSERT_EQ(irs::type::id(), stream->type()); ASSERT_TRUE(stream->reset("..")); @@ -97,10 +99,9 @@ TEST_F(multi_delimited_token_stream_tests, test_delimiter_empty_match) { } TEST_F(multi_delimited_token_stream_tests, test_delimiter_3) { - auto stream = irs::analysis::MultiDelimitedAnalyser::Make( - {.delimiters = {";"_b, ","_b, "|"_b}}); - ASSERT_EQ(irs::type::id(), - stream->type()); + auto stream = + MultiDelimitedAnalyser::Make({.delimiters = {";"_b, ","_b, "|"_b}}); + ASSERT_EQ(irs::type::id(), stream->type()); ASSERT_TRUE(stream->reset("a;b||c|d,ff")); @@ -133,10 +134,9 @@ TEST_F(multi_delimited_token_stream_tests, test_delimiter_3) { } TEST_F(multi_delimited_token_stream_tests, test_delimiter_5) { - auto stream = irs::analysis::MultiDelimitedAnalyser::Make( + auto stream = MultiDelimitedAnalyser::Make( {.delimiters = {";"_b, ","_b, "|"_b, "."_b, ":"_b}}); - ASSERT_EQ(irs::type::id(), - stream->type()); + ASSERT_EQ(irs::type::id(), stream->type()); ASSERT_TRUE(stream->reset("a:b||c.d,ff.")); @@ -169,10 +169,8 @@ TEST_F(multi_delimited_token_stream_tests, test_delimiter_5) { } TEST_F(multi_delimited_token_stream_tests, test_delimiter_single_long) { - auto stream = - irs::analysis::MultiDelimitedAnalyser::Make({.delimiters = {"foo"_b}}); - ASSERT_EQ(irs::type::id(), - stream->type()); + auto stream = MultiDelimitedAnalyser::Make({.delimiters = {"foo"_b}}); + ASSERT_EQ(irs::type::id(), stream->type()); ASSERT_TRUE(stream->reset("foobarfoobazbarfoobar")); @@ -197,9 +195,8 @@ TEST_F(multi_delimited_token_stream_tests, test_delimiter_single_long) { } TEST_F(multi_delimited_token_stream_tests, no_delimiter) { - auto stream = irs::analysis::MultiDelimitedAnalyser::Make({.delimiters = {}}); - ASSERT_EQ(irs::type::id(), - stream->type()); + auto stream = MultiDelimitedAnalyser::Make({.delimiters = {}}); + ASSERT_EQ(irs::type::id(), stream->type()); ASSERT_TRUE(stream->reset("foobar")); @@ -216,10 +213,9 @@ TEST_F(multi_delimited_token_stream_tests, no_delimiter) { } TEST_F(multi_delimited_token_stream_tests, multi_words) { - auto stream = irs::analysis::MultiDelimitedAnalyser::Make( - {.delimiters = {"foo"_b, "bar"_b, "baz"_b}}); - ASSERT_EQ(irs::type::id(), - stream->type()); + auto stream = + MultiDelimitedAnalyser::Make({.delimiters = {"foo"_b, "bar"_b, "baz"_b}}); + ASSERT_EQ(irs::type::id(), stream->type()); ASSERT_TRUE(stream->reset("fooxyzbarbazz")); @@ -240,10 +236,9 @@ TEST_F(multi_delimited_token_stream_tests, multi_words) { } TEST_F(multi_delimited_token_stream_tests, multi_words_2) { - auto stream = irs::analysis::MultiDelimitedAnalyser::Make( - {.delimiters = {"foo"_b, "bar"_b, "baz"_b}}); - ASSERT_EQ(irs::type::id(), - stream->type()); + auto stream = + MultiDelimitedAnalyser::Make({.delimiters = {"foo"_b, "bar"_b, "baz"_b}}); + ASSERT_EQ(irs::type::id(), stream->type()); ASSERT_TRUE(stream->reset("foobarbaz")); @@ -254,10 +249,9 @@ TEST_F(multi_delimited_token_stream_tests, multi_words_2) { } TEST_F(multi_delimited_token_stream_tests, trick_matching_1) { - auto stream = irs::analysis::MultiDelimitedAnalyser::Make( - {.delimiters = {"foo"_b, "ffa"_b}}); - ASSERT_EQ(irs::type::id(), - stream->type()); + auto stream = + MultiDelimitedAnalyser::Make({.delimiters = {"foo"_b, "ffa"_b}}); + ASSERT_EQ(irs::type::id(), stream->type()); ASSERT_TRUE(stream->reset("abcffoobar")); @@ -276,3 +270,58 @@ TEST_F(multi_delimited_token_stream_tests, trick_matching_1) { ASSERT_EQ(offset->end, 10); ASSERT_FALSE(stream->next()); } + +TEST_F(multi_delimited_token_stream_tests, construct) { + // wrong name + { + auto builder = Parser::fromJson(R"({"delimiter":["a", "b"]})"); + std::string in_str; + in_str.assign(builder->slice().startAs(), + builder->slice().byteSize()); + auto stream = analyzers::get( + "multi_delimiter", irs::type::get(), in_str); + ASSERT_EQ(nullptr, stream); + } + + // wrong type + { + auto builder = Parser::fromJson(R"({"delimiters":1})"); + std::string in_str; + in_str.assign(builder->slice().startAs(), + builder->slice().byteSize()); + auto stream = analyzers::get( + "multi_delimiter", irs::type::get(), in_str); + ASSERT_EQ(nullptr, stream); + } + + { + auto builder = Parser::fromJson(R"({"delimiters":["a", "b"]})"); + std::string in_str; + in_str.assign(builder->slice().startAs(), + builder->slice().byteSize()); + auto stream = analyzers::get( + "multi_delimiter", irs::type::get(), in_str); + ASSERT_NE(nullptr, stream); + ASSERT_TRUE(stream->reset("aib")); + ASSERT_TRUE(stream->next()); + auto* term = irs::get(*stream); + ASSERT_EQ("i", irs::ViewCast(term->value)); + ASSERT_FALSE(stream->next()); + } + { + auto builder = Parser::fromJson(R"({"delimiters":["a", "b", "c", "d"]})"); + std::string in_str; + in_str.assign(builder->slice().startAs(), + builder->slice().byteSize()); + std::string actual; + auto stream = + analyzers::normalize(actual, "multi_delimiter", + irs::type::get(), in_str); + + auto slice = Slice(reinterpret_cast(actual.data())); + ASSERT_TRUE(slice.isObject()); + auto delimiters = slice.get("delimiters"); + ASSERT_TRUE(delimiters.isArray()); + ASSERT_EQ(4, delimiters.length()); + } +}