Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RCORE-2064 String EQ/NEQ optimisations for compressed strings #7820

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
71a2318
No unique ptrs for string interner + limit number of interners
nicola-cab Jun 12, 2024
7265c53
point fix client-reset test
nicola-cab Jun 13, 2024
219dbef
minor stuff
nicola-cab Jun 13, 2024
aec41a1
string compression tests
nicola-cab Jun 14, 2024
59e1a5a
Merge branch 'feature/string-compression' of github.com:realm/realm-c…
nicola-cab Jun 17, 2024
2e7f996
Merge branch 'feature/string-compression' of github.com:realm/realm-c…
nicola-cab Jun 17, 2024
2c45256
point fix check string ids + more tests
nicola-cab Jun 17, 2024
b27b52e
new node addition failure for string interner
nicola-cab Jun 17, 2024
7b8ffe7
point fix lookup compressed string id when rehashing
nicola-cab Jun 18, 2024
4d573e1
find_first optimization for compressed strings
nicola-cab Jun 18, 2024
ad2cfa4
Merge branch 'nc/string_interner_tests' into nc/eq_neq_string_compres…
nicola-cab Jun 18, 2024
50a9287
Merge branch 'feature/string-compression' of github.com:realm/realm-c…
nicola-cab Jun 19, 2024
71b34ed
core test passing
nicola-cab Jun 18, 2024
264aae4
compression tests for collection of strings
nicola-cab Jun 19, 2024
050fb75
Merge branch 'nc/string_interner_tests' into nc/eq_neq_string_compres…
nicola-cab Jun 19, 2024
508bbb7
code review
nicola-cab Jul 5, 2024
eedfa69
Merge branch 'feature/string-compression' of github.com:realm/realm-c…
nicola-cab Jul 8, 2024
9653f0f
Fixes (#7872)
jedelbo Jul 10, 2024
057a46c
Merge branch 'feature/string-compression' into nc/eq_neq_string_compr…
jedelbo Jul 10, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 20 additions & 3 deletions src/realm/array_string.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -192,6 +192,14 @@ StringData ArrayString::get(size_t ndx) const
return {};
}

std::optional<StringID> realm::ArrayString::get_string_id(size_t ndx) const
{
if (m_type == Type::interned_strings) {
return StringID(static_cast<Array*>(m_arr)->get(ndx));
}
return m_string_interner->lookup(get(ndx));
}

Mixed ArrayString::get_any(size_t ndx) const
{
return Mixed(get(ndx));
Expand Down Expand Up @@ -274,6 +282,16 @@ void ArrayString::clear()
}

size_t ArrayString::find_first(StringData value, size_t begin, size_t end) const noexcept
{
// This should only be called if we don't have a string id for this particular array (aka no string interner)
std::optional<StringID> id;
if (m_type == Type::interned_strings)
id = m_string_interner->lookup(value);

return find_first(value, begin, end, id);
}

size_t ArrayString::find_first(StringData value, size_t begin, size_t end, std::optional<StringID> id) const noexcept
{
switch (m_type) {
case Type::small_strings:
Expand All @@ -289,14 +307,13 @@ size_t ArrayString::find_first(StringData value, size_t begin, size_t end) const
break;
}
case Type::interned_strings: {
// we need a way to avoid this lookup for each leaf array. The lookup must appear
// higher up the call stack and passed down.
auto id = m_string_interner->lookup(value);
if (id) {
return static_cast<Array*>(m_arr)->find_first(*id, begin, end);
}
nicola-cab marked this conversation as resolved.
Show resolved Hide resolved
break;
}
default:
break;
}
return not_found;
}
Expand Down
9 changes: 9 additions & 0 deletions src/realm/array_string.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <realm/array_string_short.hpp>
#include <realm/array_blobs_small.hpp>
#include <realm/array_blobs_big.hpp>
#include <realm/string_interner.hpp>

namespace realm {

Expand Down Expand Up @@ -74,6 +75,10 @@ class ArrayString : public ArrayPayload {
{
m_string_interner = string_interner;
}
bool is_compressed() const
{
return m_type == Type::interned_strings;
}

void update_parent()
{
Expand All @@ -99,6 +104,7 @@ class ArrayString : public ArrayPayload {
}
void insert(size_t ndx, StringData value);
StringData get(size_t ndx) const;
std::optional<StringID> get_string_id(size_t ndx) const;
Mixed get_any(size_t ndx) const override;
bool is_null(size_t ndx) const;
void erase(size_t ndx);
Expand All @@ -107,6 +113,9 @@ class ArrayString : public ArrayPayload {

size_t find_first(StringData value, size_t begin, size_t end) const noexcept;

/// Special version for searching in an array or compressed strings.
size_t find_first(StringData value, size_t begin, size_t end, std::optional<StringID>) const noexcept;

size_t lower_bound(StringData value);

/// Get the specified element without the cost of constructing an
Expand Down
21 changes: 13 additions & 8 deletions src/realm/query_engine.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,7 @@ bool StringNode<Equal>::do_consume_condition(ParentNode& node)
size_t StringNode<Equal>::_find_first_local(size_t start, size_t end)
{
if (m_needles.empty()) {
return m_leaf->find_first(m_string_value, start, end);
return m_leaf->find_first(m_string_value, start, end, m_interned_string_id);
}
else {
if (end == npos)
Expand Down Expand Up @@ -505,7 +505,8 @@ size_t StringNode<EqualIns>::_find_first_local(size_t start, size_t end)
}

StringNodeFulltext::StringNodeFulltext(StringData v, ColKey column, std::unique_ptr<LinkMap> lm)
: StringNodeEqualBase(v, column)
: m_value(v)
, m_col(column)
, m_link_map(std::move(lm))
{
if (!m_link_map)
Expand All @@ -518,17 +519,21 @@ void StringNodeFulltext::table_changed()
}

StringNodeFulltext::StringNodeFulltext(const StringNodeFulltext& other)
: StringNodeEqualBase(other)
: ParentNode(other)
, m_value(other.m_value)
, m_col(other.m_col)
, m_link_map(std::make_unique<LinkMap>(*other.m_link_map))
{
m_link_map = std::make_unique<LinkMap>(*other.m_link_map);
}

void StringNodeFulltext::_search_index_init()
void StringNodeFulltext::init(bool will_query_ranges)
{
StringIndex* index = m_link_map->get_target_table()->get_string_index(ParentNode::m_condition_column_key);
ParentNode::init(will_query_ranges);

StringIndex* index = m_link_map->get_target_table()->get_string_index(m_col);
REALM_ASSERT(index && index->is_fulltext_index());
m_index_matches.clear();
index->find_all_fulltext(m_index_matches, StringNodeBase::m_string_value);
index->find_all_fulltext(m_index_matches, m_value);

// If links exists, use backlinks to find the original objects
if (m_link_map->links_exist()) {
Expand All @@ -541,7 +546,7 @@ void StringNodeFulltext::_search_index_init()
}

m_index_evaluator = IndexEvaluator{};
m_index_evaluator->init(&m_index_matches);
m_index_evaluator.init(&m_index_matches);
}

std::unique_ptr<ArrayPayload> TwoColumnsNodeBase::update_cached_leaf_pointers_for_column(Allocator& alloc,
Expand Down
48 changes: 40 additions & 8 deletions src/realm/query_engine.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,8 @@ class ParentNode {
{
m_dD = 100.0;

if (m_condition_column_key)
m_table->check_column(m_condition_column_key);
if (m_child)
m_child->init(will_query_ranges);
}
Expand Down Expand Up @@ -1647,6 +1649,11 @@ class StringNodeBase : public ParentNode {
m_dT = 10.0;
}

void table_changed() override
{
m_string_interner = m_table.unchecked_ptr()->get_string_interner(m_condition_column_key);
}

void cluster_changed() override
{
m_leaf.emplace(m_table.unchecked_ptr()->get_alloc());
Expand All @@ -1662,6 +1669,7 @@ class StringNodeBase : public ParentNode {
m_end_s = 0;
m_leaf_start = 0;
m_leaf_end = 0;
m_interned_string_id = m_string_interner->lookup(m_value);
}

virtual void clear_leaf_state()
Expand All @@ -1673,6 +1681,8 @@ class StringNodeBase : public ParentNode {
: ParentNode(from)
, m_value(from.m_value)
, m_string_value(m_value)
, m_string_interner(from.m_string_interner)
, m_interned_string_id(from.m_interned_string_id)
{
}

Expand All @@ -1687,6 +1697,8 @@ class StringNodeBase : public ParentNode {
std::optional<std::string> m_value;
std::optional<ArrayString> m_leaf;
StringData m_string_value;
StringInterner* m_string_interner = nullptr;
std::optional<StringID> m_interned_string_id;

size_t m_end_s = 0;
size_t m_leaf_start = 0;
Expand All @@ -1703,7 +1715,7 @@ template <class TConditionFunction>
class StringNode : public StringNodeBase {
public:
constexpr static bool case_sensitive_comparison =
is_any_v<TConditionFunction, Greater, GreaterEqual, Less, LessEqual>;
is_any_v<TConditionFunction, NotEqual, Greater, GreaterEqual, Less, LessEqual>;
StringNode(StringData v, ColKey column)
: StringNodeBase(v, column)
{
Expand Down Expand Up @@ -1732,8 +1744,21 @@ class StringNode : public StringNodeBase {
TConditionFunction cond;

for (size_t s = start; s < end; ++s) {
if constexpr (std::is_same_v<TConditionFunction, NotEqual>) {
if (m_leaf->is_compressed()) {
if (m_interned_string_id) {
// The search string has been interned, so there might be a match
// We can compare the string IDs directly
const auto id = m_leaf->get_string_id(s);
if (m_string_interner->compare(*m_interned_string_id, *id) == 0) {
// The value matched, so we continue to the next value
continue;
}
}
return s;
}
}
StringData t = get_string(s);

if constexpr (case_sensitive_comparison) {
// case insensitive not implemented for: >, >=, <, <=
if (cond(t, m_string_value))
Expand Down Expand Up @@ -2061,20 +2086,24 @@ class StringNode<EqualIns> : public StringNodeEqualBase {
size_t _find_first_local(size_t start, size_t end) override;
};


class StringNodeFulltext : public StringNodeEqualBase {
class StringNodeFulltext : public ParentNode {
public:
StringNodeFulltext(StringData v, ColKey column, std::unique_ptr<LinkMap> lm = {});

void table_changed() override;

void _search_index_init() override;
void init(bool will_query_ranges) override;

bool has_search_index() const override
{
return true; // it's a required precondition for fulltext queries
}

const IndexEvaluator* index_based_keys() override
{
return &m_index_evaluator;
}

std::unique_ptr<ParentNode> clone() const override
{
return std::unique_ptr<ParentNode>(new StringNodeFulltext(*this));
Expand All @@ -2086,13 +2115,16 @@ class StringNodeFulltext : public StringNodeEqualBase {
}

private:
std::vector<ObjKey> m_index_matches;
std::string m_value;
ColKey m_col;
std::unique_ptr<LinkMap> m_link_map;
IndexEvaluator m_index_evaluator;
std::vector<ObjKey> m_index_matches;
StringNodeFulltext(const StringNodeFulltext&);

size_t _find_first_local(size_t, size_t) override
size_t find_first_local(size_t start, size_t end) override
{
REALM_UNREACHABLE();
return m_index_evaluator.do_search_index(m_cluster, start, end);
}
};

Expand Down
20 changes: 18 additions & 2 deletions src/realm/table.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1735,9 +1735,25 @@ ObjKey Table::find_first(ColKey col_key, T value) const
using LeafType = typename ColumnTypeTraits<T>::cluster_leaf_type;
LeafType leaf(get_alloc());

nicola-cab marked this conversation as resolved.
Show resolved Hide resolved
auto f = [&key, &col_key, &value, &leaf](const Cluster* cluster) {
// In case of a string column we can try to look up the StringID of the search string,
// and search for that in case the leaf is compressed.
std::optional<StringID> string_id;
if constexpr (std::is_same_v<T, StringData>) {
auto string_interner = get_string_interner(col_key);
REALM_ASSERT(string_interner != nullptr);
string_id = string_interner->lookup(value);
}

auto f = [&](const Cluster* cluster) {
cluster->init_leaf(col_key, &leaf);
size_t row = leaf.find_first(value, 0, cluster->node_size());
size_t row;
if constexpr (std::is_same_v<T, StringData>) {
row = leaf.find_first(value, 0, cluster->node_size(), string_id);
}
else {
row = leaf.find_first(value, 0, cluster->node_size());
}

if (row != realm::npos) {
key = cluster->get_real_key(row);
return IteratorControl::Stop;
Expand Down
40 changes: 35 additions & 5 deletions test/test_query.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -330,10 +330,13 @@ columns or queries involved
*/


TEST(Query_NextGen_StringConditions)
TEST_TYPES(Query_NextGen_StringConditions, std::true_type, std::false_type)
{
Group group;
TableRef table1 = group.add_table("table1");
SHARED_GROUP_TEST_PATH(path);

auto db = DB::create(make_in_realm_history(), path);
auto wt = db->start_write();
TableRef table1 = wt->add_table("table1");
auto col_str1 = table1->add_column(type_String, "str1");
auto col_str2 = table1->add_column(type_String, "str2");

Expand All @@ -342,6 +345,11 @@ TEST(Query_NextGen_StringConditions)
table1->create_object().set_all("!", "x").get_key();
ObjKey key_1_2 = table1->create_object().set_all("bar", "r").get_key();

if (TEST_TYPE::value) {
wt->commit_and_continue_as_read();
wt->promote_to_write();
}

ObjKey m;
// Equal
m = table1->column<String>(col_str1).equal("bar", false).find();
Expand Down Expand Up @@ -433,7 +441,7 @@ TEST(Query_NextGen_StringConditions)
CHECK_EQUAL(m, null_key);

// Test various compare operations with null
TableRef table2 = group.add_table("table2");
TableRef table2 = wt->add_table("table2");
auto col_str3 = table2->add_column(type_String, "str3", true);

ObjKey key_2_0 = table2->create_object().set(col_str3, "foo").get_key();
Expand All @@ -442,6 +450,11 @@ TEST(Query_NextGen_StringConditions)
ObjKey key_2_3 = table2->create_object().set(col_str3, "bar").get_key();
ObjKey key_2_4 = table2->create_object().set(col_str3, "").get_key();

if (TEST_TYPE::value) {
wt->commit_and_continue_as_read();
wt->promote_to_write();
}

size_t cnt;
cnt = table2->column<String>(col_str3).contains(StringData("")).count();
CHECK_EQUAL(cnt, 4);
Expand Down Expand Up @@ -522,6 +535,12 @@ TEST(Query_NextGen_StringConditions)
}
};

// not equal
check_results((table2->column<String>(col_str3) != StringData("")), {StringData(), "foo", "bar", "!"});
check_results((table2->column<String>(col_str3) != StringData()), {"", "foo", "bar", "!"});
check_results((table2->column<String>(col_str3) != StringData("foo")), {StringData(), "", "bar", "!"});
check_results((table2->column<String>(col_str3) != StringData("barr")), {StringData(), "", "foo", "bar", "!"});

// greater
check_results((table2->column<String>(col_str3) > StringData("")), {"foo", "bar", "!"});
check_results((table2->column<String>(col_str3) > StringData("b")), {"foo", "bar"});
Expand Down Expand Up @@ -553,7 +572,7 @@ TEST(Query_NextGen_StringConditions)
check_results((table2->column<String>(col_str3) <= StringData("barrrr")), {"bar", "", "!", StringData()});
check_results((table2->column<String>(col_str3) <= StringData("z")), {"foo", "bar", "", "!", StringData()});

TableRef table3 = group.add_table(StringData("table3"));
TableRef table3 = wt->add_table(StringData("table3"));
auto col_link1 = table3->add_column(*table2, "link1");

table3->create_object().set(col_link1, key_2_0);
Expand All @@ -562,6 +581,11 @@ TEST(Query_NextGen_StringConditions)
table3->create_object().set(col_link1, key_2_3);
table3->create_object().set(col_link1, key_2_4);

if (TEST_TYPE::value) {
wt->commit_and_continue_as_read();
wt->promote_to_write();
}

cnt = table3->link(col_link1).column<String>(col_str3).contains(StringData("")).count();
CHECK_EQUAL(cnt, 4);

Expand Down Expand Up @@ -638,8 +662,14 @@ TEST(Query_NextGen_StringConditions)
"This is a long search string that does not contain the word being searched for!, "
"This is a long search string that does not contain the word being searched for!, "
"needle";

table2->create_object().set(col_str3, long_string).get_key();

if (TEST_TYPE::value) {
wt->commit_and_continue_as_read();
wt->promote_to_write();
}

cnt = table2->column<String>(col_str3).contains(search_1, false).count();
CHECK_EQUAL(cnt, 1);

Expand Down
Loading