From fcb50cdf87b0d9f733cccf3443c88cdcf0c8ea8c Mon Sep 17 00:00:00 2001 From: Finn Schiermer Andersen Date: Thu, 6 Jun 2024 17:26:07 +0200 Subject: [PATCH 01/14] Squashed into single commit relative to next-major --- src/realm/CMakeLists.txt | 4 + src/realm/array.cpp | 12 +- src/realm/array.hpp | 31 +- src/realm/array_integer.cpp | 2 + src/realm/array_integer.hpp | 1 + src/realm/array_string.cpp | 95 ++++- src/realm/array_string.hpp | 18 +- src/realm/array_timestamp.hpp | 3 +- src/realm/array_unsigned.cpp | 24 +- src/realm/cluster.cpp | 49 +++ src/realm/cluster.hpp | 2 + src/realm/cluster_tree.cpp | 9 + src/realm/cluster_tree.hpp | 1 + src/realm/db.hpp | 2 + src/realm/group.cpp | 6 +- src/realm/group.hpp | 2 +- src/realm/group_writer.cpp | 7 +- src/realm/node.hpp | 6 + src/realm/obj.cpp | 34 +- src/realm/obj.hpp | 2 + src/realm/string_compressor.cpp | 357 +++++++++++++++++ src/realm/string_compressor.hpp | 100 +++++ src/realm/string_interner.cpp | 681 ++++++++++++++++++++++++++++++++ src/realm/string_interner.hpp | 96 +++++ src/realm/table.cpp | 132 ++++++- src/realm/table.hpp | 15 +- src/realm/transaction.hpp | 1 + src/realm/utilities.hpp | 4 +- test/test_shared.cpp | 84 ++++ test/test_unresolved_links.cpp | 1 + test/test_upgrade_database.cpp | 1 + 31 files changed, 1725 insertions(+), 57 deletions(-) create mode 100644 src/realm/string_compressor.cpp create mode 100644 src/realm/string_compressor.hpp create mode 100644 src/realm/string_interner.cpp create mode 100644 src/realm/string_interner.hpp diff --git a/src/realm/CMakeLists.txt b/src/realm/CMakeLists.txt index 18583f3549a..5a67cdabc15 100644 --- a/src/realm/CMakeLists.txt +++ b/src/realm/CMakeLists.txt @@ -62,6 +62,8 @@ set(REALM_SOURCES table.cpp table_ref.cpp obj_list.cpp + string_interner.cpp + string_compressor.cpp object_id.cpp table_view.cpp tokenizer.cpp @@ -178,6 +180,8 @@ set(REALM_INSTALL_HEADERS null.hpp obj.hpp obj_list.hpp + string_interner.hpp + string_compressor.hpp object_id.hpp path.hpp 
owned_data.hpp diff --git a/src/realm/array.cpp b/src/realm/array.cpp index be70388bb2b..b95d081f4d5 100644 --- a/src/realm/array.cpp +++ b/src/realm/array.cpp @@ -294,7 +294,7 @@ void Array::set_type(Type type) set_hasrefs_in_header(init_has_refs, header); } -void Array::destroy_children(size_t offset) noexcept +void Array::destroy_children(size_t offset, bool ro_only) noexcept { for (size_t i = offset; i != m_size; ++i) { int64_t value = get(i); @@ -310,7 +310,7 @@ void Array::destroy_children(size_t offset) noexcept continue; ref_type ref = to_ref(value); - destroy_deep(ref, m_alloc); + destroy_deep(ref, m_alloc, ro_only); } } @@ -607,6 +607,14 @@ void Array::do_ensure_minimum_width(int_fast64_t value) } } +size_t Array::size() const noexcept +{ + // in case the array is in compressed format. Never read directly + // from the header the size, since it will result very likely in a cache miss. + // For compressed arrays m_size should always be kept updated, due to init_from_mem + return m_size; +} + bool Array::compress_array(Array& arr) const { if (m_integer_compressor.get_encoding() == NodeHeader::Encoding::WTypBits) { diff --git a/src/realm/array.hpp b/src/realm/array.hpp index 6b9569ebd82..ecad1b8fc63 100644 --- a/src/realm/array.hpp +++ b/src/realm/array.hpp @@ -117,7 +117,7 @@ class Array : public Node, public ArrayParent { /// pointer. void init_from_mem(MemRef) noexcept; - /// Same as `init_from_ref(get_ref_from_parent())`. + /// Same as `init_from_ref(ref_from_parent())`. void init_from_parent() noexcept { ref_type ref = get_ref_from_parent(); @@ -210,6 +210,8 @@ class Array : public Node, public ArrayParent { update_width_cache_from_header(); } + size_t size() const noexcept; + bool is_empty() const noexcept { return size() == 0; @@ -362,7 +364,8 @@ class Array : public Node, public ArrayParent { /// state (as if calling detach()), then free the allocated memory. 
If this /// accessor is already in the detached state, this function has no effect /// (idempotency). - void destroy_deep() noexcept; + /// If 'ro_only', only free space in read-only memory (the file) + void destroy_deep(bool ro_only = false) noexcept; /// check if the array is encoded (in B format) inline bool is_compressed() const; @@ -377,13 +380,13 @@ class Array : public Node, public ArrayParent { bool try_decompress(); /// Shorthand for `destroy_deep(MemRef(ref, alloc), alloc)`. - static void destroy_deep(ref_type ref, Allocator& alloc) noexcept; + static void destroy_deep(ref_type ref, Allocator& alloc, bool ro_only = false) noexcept; /// Destroy the specified array node and all of its children, recursively. /// /// This is done by freeing the specified array node after calling /// destroy_deep() for every contained 'ref' element. - static void destroy_deep(MemRef, Allocator&) noexcept; + static void destroy_deep(MemRef, Allocator&, bool ro_only = false) noexcept; // Clone deep static MemRef clone(MemRef, Allocator& from_alloc, Allocator& target_alloc); @@ -540,7 +543,7 @@ class Array : public Node, public ArrayParent { // Overriding method in ArrayParent ref_type get_child_ref(size_t) const noexcept override; - void destroy_children(size_t offset = 0) noexcept; + void destroy_children(size_t offset = 0, bool ro_only = false) noexcept; protected: // Getters and Setters for adaptive-packed arrays @@ -912,16 +915,17 @@ inline void Array::set_context_flag(bool value) noexcept } } -inline void Array::destroy_deep() noexcept +inline void Array::destroy_deep(bool ro_only) noexcept { if (!is_attached()) return; if (m_has_refs) - destroy_children(); + destroy_children(0, ro_only); char* header = get_header_from_data(m_data); - m_alloc.free_(m_ref, header); + if (!ro_only || is_read_only()) + m_alloc.free_(m_ref, header); m_data = nullptr; } @@ -964,20 +968,21 @@ inline void Array::clear_and_destroy_children() truncate_and_destroy_children(0); } -inline void 
Array::destroy_deep(ref_type ref, Allocator& alloc) noexcept +inline void Array::destroy_deep(ref_type ref, Allocator& alloc, bool ro_only) noexcept { - destroy_deep(MemRef(ref, alloc), alloc); + destroy_deep(MemRef(ref, alloc), alloc, ro_only); } -inline void Array::destroy_deep(MemRef mem, Allocator& alloc) noexcept +inline void Array::destroy_deep(MemRef mem, Allocator& alloc, bool ro_only) noexcept { if (!get_hasrefs_from_header(mem.get_addr())) { - alloc.free_(mem); + if (!ro_only || alloc.is_read_only(mem.get_ref())) + alloc.free_(mem); return; } Array array(alloc); array.init_from_mem(mem); - array.destroy_deep(); + array.destroy_deep(ro_only); } diff --git a/src/realm/array_integer.cpp b/src/realm/array_integer.cpp index f86871c3225..b39ade6e940 100644 --- a/src/realm/array_integer.cpp +++ b/src/realm/array_integer.cpp @@ -22,6 +22,8 @@ #include #include +#include + using namespace realm; ArrayInteger::ArrayInteger(Allocator& allocator) noexcept diff --git a/src/realm/array_integer.hpp b/src/realm/array_integer.hpp index b8739414091..22d729e2e29 100644 --- a/src/realm/array_integer.hpp +++ b/src/realm/array_integer.hpp @@ -174,6 +174,7 @@ inline ArrayIntNull::~ArrayIntNull() noexcept {} inline size_t ArrayIntNull::size() const noexcept { + // this cannot be right, what if size is 0 return Array::size() - 1; } diff --git a/src/realm/array_string.cpp b/src/realm/array_string.cpp index 636a60a2865..cb2aa6fb3f5 100644 --- a/src/realm/array_string.cpp +++ b/src/realm/array_string.cpp @@ -17,6 +17,7 @@ **************************************************************************/ #include +#include #include #include @@ -52,14 +53,24 @@ void ArrayString::init_from_mem(MemRef mem) noexcept else { auto arr = new (&m_storage) Array(m_alloc); arr->init_from_mem(mem); - m_string_enum_values = std::make_unique(m_alloc); - ArrayParent* p; - REALM_ASSERT(m_spec != nullptr); - REALM_ASSERT(m_col_ndx != realm::npos); - ref_type r = m_spec->get_enumkeys_ref(m_col_ndx, p); - 
m_string_enum_values->init_from_ref(r); - m_string_enum_values->set_parent(p, m_col_ndx); - m_type = Type::enum_strings; + // The context flag is used to indicate interned strings vs old enum strings + // (in conjunction with has_refs() == false) + if (arr->get_context_flag_from_header(arr->get_header())) { + // init for new interned strings (replacing old enum strings) + m_type = Type::interned_strings; + // consider if we want this invariant: REALM_ASSERT_DEBUG(m_string_interner); + } + else { + // init for old enum strings + m_string_enum_values = std::make_unique(m_alloc); + ArrayParent* p; + REALM_ASSERT(m_spec != nullptr); + REALM_ASSERT(m_col_ndx != realm::npos); + ref_type r = m_spec->get_enumkeys_ref(m_col_ndx, p); + m_string_enum_values->init_from_ref(r); + m_string_enum_values->set_parent(p, m_col_ndx); + m_type = Type::enum_strings; + } } } else { @@ -111,6 +122,7 @@ size_t ArrayString::size() const case Type::big_strings: return static_cast(m_arr)->size(); case Type::enum_strings: + case Type::interned_strings: return static_cast(m_arr)->size(); } return {}; @@ -128,7 +140,8 @@ void ArrayString::add(StringData value) case Type::big_strings: static_cast(m_arr)->add_string(value); break; - case Type::enum_strings: { + case Type::enum_strings: + case Type::interned_strings: { auto a = static_cast(m_arr); size_t ndx = a->size(); a->add(0); @@ -150,6 +163,11 @@ void ArrayString::set(size_t ndx, StringData value) case Type::big_strings: static_cast(m_arr)->set_string(ndx, value); break; + case Type::interned_strings: { + auto id = m_string_interner->intern(value); + static_cast(m_arr)->set(ndx, id); + break; + } case Type::enum_strings: { size_t sz = m_string_enum_values->size(); size_t res = m_string_enum_values->find_first(value, 0, sz); @@ -178,6 +196,12 @@ void ArrayString::insert(size_t ndx, StringData value) case Type::enum_strings: { static_cast(m_arr)->insert(ndx, 0); set(ndx, value); + break; + } + case Type::interned_strings: { + 
static_cast(m_arr)->insert(ndx, 0); + set(ndx, value); + break; } } } @@ -195,6 +219,10 @@ StringData ArrayString::get(size_t ndx) const size_t index = size_t(static_cast(m_arr)->get(ndx)); return m_string_enum_values->get(index); } + case Type::interned_strings: { + size_t id = size_t(static_cast(m_arr)->get(ndx)); + return m_string_interner->get(id); + } } return {}; } @@ -212,6 +240,10 @@ StringData ArrayString::get_legacy(size_t ndx) const size_t index = size_t(static_cast(m_arr)->get(ndx)); return m_string_enum_values->get(index); } + case Type::interned_strings: { + size_t id = size_t(static_cast(m_arr)->get(ndx)); + return m_string_interner->get(id); + } } return {}; } @@ -231,8 +263,12 @@ bool ArrayString::is_null(size_t ndx) const case Type::big_strings: return static_cast(m_arr)->is_null(ndx); case Type::enum_strings: { - size_t index = size_t(static_cast(m_arr)->get(ndx)); - return m_string_enum_values->is_null(index); + size_t id = size_t(static_cast(m_arr)->get(ndx)); + return m_string_enum_values->is_null(id); + } + case Type::interned_strings: { + size_t id = size_t(static_cast(m_arr)->get(ndx)); + return id == 0; } } return {}; @@ -250,6 +286,7 @@ void ArrayString::erase(size_t ndx) case Type::big_strings: static_cast(m_arr)->erase(ndx); break; + case Type::interned_strings: case Type::enum_strings: static_cast(m_arr)->erase(ndx); break; @@ -277,6 +314,9 @@ void ArrayString::move(ArrayString& dst, size_t ndx) // this operation will never be called for enumerated columns REALM_UNREACHABLE(); break; + case Type::interned_strings: + m_arr->truncate(ndx); + break; } } @@ -293,6 +333,7 @@ void ArrayString::clear() static_cast(m_arr)->clear(); break; case Type::enum_strings: + case Type::interned_strings: static_cast(m_arr)->clear(); break; } @@ -321,6 +362,15 @@ size_t ArrayString::find_first(StringData value, size_t begin, size_t end) const } break; } + case Type::interned_strings: { + // we need a way to avoid this lookup for each leaf array. 
The lookup must appear + // higher up the call stack and passed down. + auto id = m_string_interner->lookup(value); + if (id) { + return static_cast(m_arr)->find_first(*id, begin, end); + } + break; + } } return not_found; } @@ -371,6 +421,9 @@ size_t ArrayString::lower_bound(StringData value) return lower_bound_string(static_cast(m_arr), value); case Type::enum_strings: break; + case Type::interned_strings: + REALM_UNREACHABLE(); + break; } return realm::npos; } @@ -383,6 +436,9 @@ ArrayString::Type ArrayString::upgrade_leaf(size_t value_size) if (m_type == Type::enum_strings) return Type::enum_strings; + if (m_type == Type::interned_strings) + return Type::interned_strings; + if (m_type == Type::medium_strings) { if (value_size <= medium_string_max_size) return Type::medium_strings; @@ -473,8 +529,25 @@ void ArrayString::verify() const static_cast(m_arr)->verify(); break; case Type::enum_strings: + case Type::interned_strings: static_cast(m_arr)->verify(); break; } #endif } + +ref_type ArrayString::write(_impl::ArrayWriterBase& out, StringInterner* interner) +{ + REALM_ASSERT(interner); + // we have to write out all, modified or not, to match the total cleanup + Array interned(Allocator::get_default()); + auto sz = size(); + interned.create(NodeHeader::type_Normal, true, sz); + for (size_t i = 0; i < sz; ++i) { + interned.set(i, interner->intern(get(i))); + } + auto retval = interned.write(out, false, false, out.compress); + interned.destroy(); + return retval; + // return m_arr->write(out, true, false, false); +} diff --git a/src/realm/array_string.hpp b/src/realm/array_string.hpp index 4dc96646378..df121c50b2c 100644 --- a/src/realm/array_string.hpp +++ b/src/realm/array_string.hpp @@ -66,6 +66,14 @@ class ArrayString : public ArrayPayload { { m_arr->set_parent(p, n); } + bool need_string_interner() const override + { + return true; + } + void set_string_interner(StringInterner* string_interner) const override + { + m_string_interner = string_interner; + } bool 
need_spec() const override { return true; @@ -118,6 +126,10 @@ class ArrayString : public ArrayPayload { static StringData get(const char* header, size_t ndx, Allocator& alloc) noexcept; void verify() const; + // Write to 'out', if needed using 'interner' to intern any strings. + // An interner of 0 will disable interning. Interned values may be further + // compressed using leaf compression for integer arrays. + ref_type write(_impl::ArrayWriterBase& out, StringInterner* interner); private: static constexpr size_t small_string_max_size = 15; // ArrayStringShort @@ -127,18 +139,18 @@ class ArrayString : public ArrayPayload { static constexpr size_t storage_size = std::max({sizeof(ArrayStringShort), sizeof(ArraySmallBlobs), sizeof(ArrayBigBlobs), sizeof(Array)}); - enum class Type { small_strings, medium_strings, big_strings, enum_strings }; + enum class Type { small_strings, medium_strings, big_strings, enum_strings, interned_strings }; Type m_type = Type::small_strings; Allocator& m_alloc; alignas(storage_alignment) std::byte m_storage[storage_size]; Array* m_arr; + bool m_nullable = true; mutable Spec* m_spec = nullptr; mutable size_t m_col_ndx = realm::npos; - bool m_nullable = true; - std::unique_ptr m_string_enum_values; + mutable StringInterner* m_string_interner = nullptr; Type upgrade_leaf(size_t value_size); }; diff --git a/src/realm/array_timestamp.hpp b/src/realm/array_timestamp.hpp index 1fad36144f0..cfa4268cd11 100644 --- a/src/realm/array_timestamp.hpp +++ b/src/realm/array_timestamp.hpp @@ -76,7 +76,8 @@ class ArrayTimestamp : public ArrayPayload, private Array { Timestamp get(size_t ndx) const { util::Optional seconds = m_seconds.get(ndx); - return seconds ? Timestamp(*seconds, int32_t(m_nanoseconds.get(ndx))) : Timestamp{}; + int32_t nano = (int32_t)m_nanoseconds.get(ndx); + return seconds ? 
Timestamp(*seconds, nano) : Timestamp{}; } Mixed get_any(size_t ndx) const final { diff --git a/src/realm/array_unsigned.cpp b/src/realm/array_unsigned.cpp index 938fe5aece8..55f030522b9 100644 --- a/src/realm/array_unsigned.cpp +++ b/src/realm/array_unsigned.cpp @@ -92,23 +92,25 @@ void ArrayUnsigned::update_from_parent() noexcept size_t ArrayUnsigned::lower_bound(uint64_t value) const noexcept { - if (m_width == 8) { + auto width = get_width_from_header(get_header()); + + if (width == 8) { uint8_t* arr = reinterpret_cast(m_data); uint8_t* pos = std::lower_bound(arr, arr + m_size, value); return pos - arr; } - else if (m_width == 16) { + else if (width == 16) { uint16_t* arr = reinterpret_cast(m_data); uint16_t* pos = std::lower_bound(arr, arr + m_size, value); return pos - arr; } - else if (m_width == 32) { + else if (width == 32) { uint32_t* arr = reinterpret_cast(m_data); uint32_t* pos = std::lower_bound(arr, arr + m_size, value); return pos - arr; } - else if (m_width < 8) { - switch (m_width) { + else if (width < 8) { + switch (width) { case 0: return realm::lower_bound<0>(m_data, m_size, value); case 1: @@ -130,23 +132,25 @@ size_t ArrayUnsigned::lower_bound(uint64_t value) const noexcept size_t ArrayUnsigned::upper_bound(uint64_t value) const noexcept { - if (m_width == 8) { + auto width = get_width_from_header(get_header()); + + if (width == 8) { uint8_t* arr = reinterpret_cast(m_data); uint8_t* pos = std::upper_bound(arr, arr + m_size, value); return pos - arr; } - else if (m_width == 16) { + else if (width == 16) { uint16_t* arr = reinterpret_cast(m_data); uint16_t* pos = std::upper_bound(arr, arr + m_size, value); return pos - arr; } - else if (m_width == 32) { + else if (width == 32) { uint32_t* arr = reinterpret_cast(m_data); uint32_t* pos = std::upper_bound(arr, arr + m_size, value); return pos - arr; } - else if (m_width < 8) { - switch (m_width) { + else if (width < 8) { + switch (width) { case 0: return realm::upper_bound<0>(m_data, m_size, 
value); case 1: diff --git a/src/realm/cluster.cpp b/src/realm/cluster.cpp index 4922c54f9b2..75deb0707c2 100644 --- a/src/realm/cluster.cpp +++ b/src/realm/cluster.cpp @@ -250,6 +250,17 @@ size_t Cluster::node_size_from_header(Allocator& alloc, const char* header) } } +template +inline void Cluster::set_string_interner(T&, ColKey) const +{ +} + +template <> +inline void Cluster::set_string_interner(ArrayString& arr, ColKey col_key) const +{ + m_tree_top.set_string_interner(arr, col_key); +} + template inline void Cluster::set_spec(T&, ColKey::Idx) const { @@ -270,6 +281,7 @@ inline void Cluster::do_insert_row(size_t ndx, ColKey col, Mixed init_val, bool auto col_ndx = col.get_index(); arr.set_parent(this, col_ndx.val + s_first_col_index); set_spec(arr, col_ndx); + set_string_interner(arr, col); arr.init_from_parent(); if (init_val.is_null()) { arr.insert(ndx, T::default_value(nullable)); @@ -446,10 +458,12 @@ inline void Cluster::do_move(size_t ndx, ColKey col_key, Cluster* to) T src(m_alloc); src.set_parent(this, col_ndx); src.init_from_parent(); + set_string_interner(src, col_key); T dst(m_alloc); dst.set_parent(to, col_ndx); dst.init_from_parent(); + set_string_interner(dst, col_key); src.move(dst, ndx); } @@ -760,6 +774,7 @@ inline void Cluster::do_erase(size_t ndx, ColKey col_key) T values(m_alloc); values.set_parent(this, col_ndx.val + s_first_col_index); set_spec(values, col_ndx); + set_string_interner(values, col_key); values.init_from_parent(); if constexpr (std::is_same_v) { if (ObjLink link = values.get(ndx)) { @@ -1031,6 +1046,7 @@ void Cluster::upgrade_string_to_enum(ColKey col_key, ArrayString& keys) indexes.create(Array::type_Normal, false); ArrayString values(m_alloc); ref_type ref = Array::get_as_ref(col_ndx.val + s_first_col_index); + set_string_interner(values, col_key); values.init_from_ref(ref); size_t sz = values.size(); for (size_t i = 0; i < sz; i++) { @@ -1052,6 +1068,9 @@ void Cluster::init_leaf(ColKey col_key, ArrayPayload* leaf) const 
if (auto t = m_tree_top.get_owning_table()) t->check_column(col_key); ref_type ref = to_ref(Array::get(col_ndx.val + 1)); + if (leaf->need_string_interner()) { + m_tree_top.set_string_interner(*leaf, col_key); + } if (leaf->need_spec()) { m_tree_top.set_spec(*leaf, col_ndx); } @@ -1071,6 +1090,10 @@ void Cluster::verify(ref_type ref, size_t index, util::Optional& sz) con { ArrayType arr(get_alloc()); set_spec(arr, ColKey::Idx{unsigned(index) - 1}); + auto table = get_owning_table(); + REALM_ASSERT(index <= table->m_leaf_ndx2colkey.size()); + auto col_key = table->m_leaf_ndx2colkey[index - 1]; + set_string_interner(arr, col_key); arr.set_parent(const_cast(this), index); arr.init_from_ref(ref); arr.verify(); @@ -1409,6 +1432,7 @@ void Cluster::dump_objects(int64_t key_offset, std::string lead) const case col_type_String: { ArrayString arr(m_alloc); set_spec(arr, col.get_index()); + set_string_interner(arr, col); ref_type ref = Array::get_as_ref(j); arr.init_from_ref(ref); std::cout << ", " << arr.get(i); @@ -1628,6 +1652,31 @@ ref_type Cluster::typed_write(ref_type ref, _impl::ArrayWriterBase& out) const // Columns auto col_key = out.table->m_leaf_ndx2colkey[j - 1]; auto col_type = col_key.get_type(); + // String columns are interned at this point + if (out.compress && col_type == col_type_String && !col_key.is_collection()) { + ArrayRef leaf(m_alloc); + leaf.init_from_ref(ref); + auto header = leaf.get_header(); + if (NodeHeader::get_hasrefs_from_header(header) || + NodeHeader::get_wtype_from_header(header) == wtype_Multiply) { + // We're interning these strings + ArrayString as(m_alloc); + as.init_from_ref(leaf_rot.get_as_ref()); + written_cluster.set_as_ref(j, as.write(out, out.table->get_string_interner(col_key))); + // in a transactional setting: + // Destroy all sub-arrays if present, in order to release memory in file + // This is contrary to the rest of the handling in this function, but needed + // here since sub-arrays may not have been COW'ed and therefore 
not freed in file. + // We rely on 'only_modified' to indicate that we're in a transactional setting. + if (only_modified) + leaf.destroy_deep(true); + continue; + } + // whether it's the old enum strings or the new interned strings, + // just write out the array using integer leaf compression + written_cluster.set_as_ref(j, leaf.write(out, false, false, false)); + continue; + } if (col_key.is_collection()) { ArrayRef arr_ref(m_alloc); arr_ref.init_from_ref(ref); diff --git a/src/realm/cluster.hpp b/src/realm/cluster.hpp index 9b106f436ea..365ad3a8634 100644 --- a/src/realm/cluster.hpp +++ b/src/realm/cluster.hpp @@ -365,6 +365,8 @@ class Cluster : public ClusterNode { void do_insert_mixed(size_t ndx, ColKey col_key, Mixed init_value, ObjKey origin_key); template void set_spec(T&, ColKey::Idx) const; + template + void set_string_interner(T&, ColKey) const; template void verify(ref_type ref, size_t index, util::Optional& sz) const; }; diff --git a/src/realm/cluster_tree.cpp b/src/realm/cluster_tree.cpp index 29d5f52ce84..3021f684911 100644 --- a/src/realm/cluster_tree.cpp +++ b/src/realm/cluster_tree.cpp @@ -1135,6 +1135,15 @@ void ClusterTree::update(UpdateFunction func) } } +void ClusterTree::set_string_interner(ArrayPayload& arr, ColKey col_key) const +{ + // Check for owner. This function may be called in context of DictionaryClusterTree + // in which case m_owner is null (and spec never needed). + if (m_owner) { + arr.set_string_interner(_impl::TableFriend::get_string_interner(*m_owner, col_key)); + } +} + void ClusterTree::set_spec(ArrayPayload& arr, ColKey::Idx col_ndx) const { // Check for owner. 
This function may be called in context of DictionaryClusterTree diff --git a/src/realm/cluster_tree.hpp b/src/realm/cluster_tree.hpp index 43d796c995e..15829f991bc 100644 --- a/src/realm/cluster_tree.hpp +++ b/src/realm/cluster_tree.hpp @@ -181,6 +181,7 @@ class ClusterTree { void update(UpdateFunction func); void set_spec(ArrayPayload& arr, ColKey::Idx col_ndx) const; + void set_string_interner(ArrayPayload& arr, ColKey col_key) const; virtual std::unique_ptr get_root_from_parent(); diff --git a/src/realm/db.hpp b/src/realm/db.hpp index 1a8a1b67461..2059da3b21c 100644 --- a/src/realm/db.hpp +++ b/src/realm/db.hpp @@ -515,6 +515,8 @@ class DB : public std::enable_shared_from_this { std::shared_ptr m_logger; std::mutex m_commit_listener_mutex; std::vector m_commit_listeners; + std::unordered_map*> m_string_interners; + std::mutex m_string_interners_mutex; bool m_is_sync_agent = false; // Id for this DB to be used in logging. We will just use some bits from the pointer. // The path cannot be used as this would not allow us to distinguish between two DBs opening diff --git a/src/realm/group.cpp b/src/realm/group.cpp index eeecbaed4f5..9b99971dc71 100644 --- a/src/realm/group.cpp +++ b/src/realm/group.cpp @@ -1372,7 +1372,7 @@ void Group::flush_accessors_for_commit() acc->flush_for_commit(); } -void Group::refresh_dirty_accessors() +void Group::refresh_dirty_accessors(bool writable) { if (!m_tables.is_attached()) { m_table_accessors.clear(); @@ -1402,7 +1402,7 @@ void Group::refresh_dirty_accessors() same_table = true; } if (same_table) { - table_accessor->refresh_accessor_tree(); + table_accessor->refresh_accessor_tree(writable); } else { table_accessor->detach(Table::cookie_removed); @@ -1460,7 +1460,7 @@ void Group::advance_transact(ref_type new_top_ref, util::InputStream* in, bool w m_top.detach(); // Soft detach bool create_group_when_missing = false; // See Group::attach_shared(). 
attach(new_top_ref, writable, create_group_when_missing); // Throws - refresh_dirty_accessors(); // Throws + refresh_dirty_accessors(writable); // Throws if (schema_changed) send_schema_change_notification(); diff --git a/src/realm/group.hpp b/src/realm/group.hpp index 08ddd9acd44..7204f26b258 100644 --- a/src/realm/group.hpp +++ b/src/realm/group.hpp @@ -681,7 +681,7 @@ class Group : public ArrayParent { /// Memory mappings must have been updated to reflect any growth in filesize before /// calling advance_transact() void advance_transact(ref_type new_top_ref, util::InputStream*, bool writable); - void refresh_dirty_accessors(); + void refresh_dirty_accessors(bool writable); void flush_accessors_for_commit(); /// \brief The version of the format of the node structure (in file or in diff --git a/src/realm/group_writer.cpp b/src/realm/group_writer.cpp index 4ce470fec62..cf866d9f98e 100644 --- a/src/realm/group_writer.cpp +++ b/src/realm/group_writer.cpp @@ -647,6 +647,7 @@ ref_type GroupWriter::write_group() { ALLOC_DBG_COUT("Commit nr " << m_current_version << " ( from " << m_oldest_reachable_version << " )" << std::endl); + // m_group.typed_print(""); read_in_freelist(); // Now, 'm_size_map' holds all free elements candidate for recycling @@ -710,7 +711,7 @@ ref_type GroupWriter::write_group() top.set_as_ref(Group::s_evacuation_point_ndx, ref); } else if (ref) { - Array::destroy(ref, m_alloc); + Array::destroy(ref_type(ref), m_alloc); top.set(Group::s_evacuation_point_ndx, 0); } } @@ -788,7 +789,9 @@ ref_type GroupWriter::write_group() top.set(Group::s_file_size_ndx, RefOrTagged::make_tagged(m_logical_size)); auto ref = top.get_as_ref(Group::s_evacuation_point_ndx); REALM_ASSERT(ref); - Array::destroy(ref, m_alloc); + Array destroy_array(m_alloc); + destroy_array.init_from_ref(ref); + destroy_array.destroy(); top.set(Group::s_evacuation_point_ndx, 0); m_evacuation_limit = 0; diff --git a/src/realm/node.hpp b/src/realm/node.hpp index 8a4b862a701..21ee61eddde 100644 
--- a/src/realm/node.hpp +++ b/src/realm/node.hpp @@ -21,6 +21,7 @@ #include #include +#include #include @@ -357,6 +358,11 @@ class ArrayPayload { virtual void init_from_ref(ref_type) noexcept = 0; virtual void set_parent(ArrayParent* parent, size_t ndx_in_parent) noexcept = 0; virtual Mixed get_any(size_t ndx) const = 0; + virtual bool need_string_interner() const + { + return false; + } + virtual void set_string_interner(StringInterner*) const {} virtual bool need_spec() const { return false; diff --git a/src/realm/obj.cpp b/src/realm/obj.cpp index eb8138dd8f5..fc34b755d57 100644 --- a/src/realm/obj.cpp +++ b/src/realm/obj.cpp @@ -613,7 +613,11 @@ StringData Obj::_get(ColKey::Idx col_ndx) const return values.get(m_row_ndx); } else { - return ArrayString::get(alloc.translate(ref), m_row_ndx, alloc); + ArrayString values(get_alloc()); + auto col_key = m_table->leaf_ndx2colkey(col_ndx); + values.set_string_interner(m_table->get_string_interner(col_key)); + values.init_from_ref(ref); + return values.get(m_row_ndx); } } @@ -738,9 +742,12 @@ inline bool Obj::do_is_null(ColKey::Idx col_ndx) const template <> inline bool Obj::do_is_null(ColKey::Idx col_ndx) const { + REALM_ASSERT(false); // Don't come here, you're falling from a cliff.... 
ArrayString values(get_alloc()); ref_type ref = to_ref(Array::get(m_mem.get_addr(), col_ndx.val + 1)); values.set_spec(const_cast(&get_spec()), m_table->leaf_ndx2spec_ndx(col_ndx)); + // TODO: Set string interner if needed + // values.set_string_interner(m_table->get_string_interner(col_key)); values.init_from_ref(ref); return values.is_null(m_row_ndx); } @@ -765,8 +772,16 @@ bool Obj::is_null(ColKey col_key) const return do_is_null(col_ndx); case col_type_Double: return do_is_null(col_ndx); - case col_type_String: - return do_is_null(col_ndx); + case col_type_String: { + ArrayString values(get_alloc()); + ref_type ref = to_ref(Array::get(m_mem.get_addr(), col_ndx.val + 1)); + values.set_spec(const_cast(&get_spec()), m_table->leaf_ndx2spec_ndx(col_ndx)); + // TODO: Set string interner if needed + values.set_string_interner(m_table->get_string_interner(col_key)); + values.init_from_ref(ref); + return values.is_null(m_row_ndx); + } + // return do_is_null(col_ndx); case col_type_Binary: return do_is_null(col_ndx); case col_type_Mixed: @@ -1588,6 +1603,17 @@ inline void check_range(const BinaryData& val) } } // namespace +// helper functions for filtering out calls to set_string_interner() +template +inline void Obj::set_string_interner(T&, ColKey) +{ +} +template <> +inline void Obj::set_string_interner(ArrayString& values, ColKey col_key) +{ + values.set_string_interner(m_table->get_string_interner(col_key)); +} + // helper functions for filtering out calls to set_spec() template inline void Obj::set_spec(T&, ColKey) @@ -1685,6 +1711,7 @@ Obj& Obj::set(ColKey col_key, T value, bool is_default) LeafType values(alloc); values.set_parent(&fields, col_ndx.val + 1); set_spec(values, col_key); + set_string_interner(values, col_key); values.init_from_parent(); values.set(m_row_ndx, value); @@ -2296,6 +2323,7 @@ inline void Obj::do_set_null(ColKey col_key) ArrayString values(alloc); values.set_parent(&fields, col_ndx.val + 1); values.set_spec(const_cast(&get_spec()), 
spec_ndx); + values.set_string_interner(m_table->get_string_interner(col_key)); values.init_from_parent(); values.set_null(m_row_ndx); diff --git a/src/realm/obj.hpp b/src/realm/obj.hpp index 67c82a0cada..8711e590dac 100644 --- a/src/realm/obj.hpp +++ b/src/realm/obj.hpp @@ -392,6 +392,8 @@ class Obj { void nullify_link(ColKey origin_col, ObjLink target_key) &&; template inline void set_spec(T&, ColKey); + template + inline void set_string_interner(T&, ColKey); template inline void nullify_single_link(ColKey col, ValueType target); diff --git a/src/realm/string_compressor.cpp b/src/realm/string_compressor.cpp new file mode 100644 index 00000000000..99dcb50dac5 --- /dev/null +++ b/src/realm/string_compressor.cpp @@ -0,0 +1,357 @@ +/************************************************************************* + * + * Copyright 2016 Realm Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + **************************************************************************/ + +#include +#include + +#include + +#include +namespace realm { + +StringCompressor::StringCompressor(Allocator& alloc, Array& parent, size_t index, bool writable) +{ + m_compression_map.resize(16); // start with a very small compression map + m_symbols.reserve(65536); + m_data = std::make_unique(alloc); + m_data->set_parent(&parent, index); + refresh(writable); +} + +void StringCompressor::refresh(bool writable) +{ + // we assume that compressors are only created from a valid parent. 
+ // String interners in 'dead' mode should never instantiate a string compressor. + if (m_data->get_ref_from_parent() == 0) { + REALM_ASSERT(writable); + m_data->create(0, 65535); + m_data->update_parent(); + } + else { + if (m_data->is_attached()) + m_data->update_from_parent(); + else + m_data->init_from_ref(m_data->get_ref_from_parent()); + } + rebuild_internal(); +} + +static size_t symbol_pair_hash(CompressionSymbol a, CompressionSymbol b) +{ + // range of return value must match size of encoding table + uint32_t tmp = a + 3; + tmp *= b + 7; + return (tmp ^ (tmp >> 16)) & 0xFFFF; +} + +void StringCompressor::add_expansion(SymbolDef def) +{ + // compute expansion size: + size_t exp_size = 0; + if (def.expansion_a < 256) + exp_size = 1; + else + exp_size = m_symbols[def.expansion_a - 256].expansion.size(); + if (def.expansion_b < 256) + exp_size += 1; + else + exp_size += m_symbols[def.expansion_b - 256].expansion.size(); + // make sure there is room in active storage chunk: + if (m_expansion_storage.size() == 0 || m_expansion_storage.back().size() + exp_size + 1 >= storage_chunk_size) { + m_expansion_storage.push_back({}); + m_expansion_storage.back().reserve(storage_chunk_size); + } + // construct expansion at end of chunk: + auto& chunk = m_expansion_storage.back(); + auto start_index = (uint32_t)chunk.size(); + if (def.expansion_a < 256) + chunk.push_back((char)def.expansion_a); + else + chunk.append(m_symbols[def.expansion_a - 256].expansion); + if (def.expansion_b < 256) + chunk.push_back((char)def.expansion_b); + else + chunk.append(m_symbols[def.expansion_b - 256].expansion); + std::string_view expansion(chunk.data() + start_index, exp_size); + m_symbols.push_back({def, expansion, (uint32_t)m_expansion_storage.size() - 1, start_index}); +} + +void StringCompressor::expand_compression_map() +{ + size_t old_size = m_compression_map.size(); + REALM_ASSERT(old_size <= 16384); + size_t new_size = 4 * old_size; + std::vector map(new_size); + for (size_t i = 
0; i < m_compression_map.size(); ++i) { + auto& entry = m_compression_map[i]; + if (entry.id == 0) + continue; + auto hash = symbol_pair_hash(entry.expansion_a, entry.expansion_b); + auto new_hash = hash & (new_size - 1); + REALM_ASSERT(map[new_hash].id == 0); + map[new_hash] = entry; + } + m_compression_map.swap(map); +} + +void StringCompressor::rebuild_internal() +{ + auto num_symbols = m_data->size(); + if (num_symbols == m_symbols.size()) + return; + if (num_symbols < m_symbols.size()) { + // fewer symbols (likely a rollback) -- remove last ones added + while (num_symbols < m_symbols.size()) { + auto& symbol = m_symbols.back(); + auto hash = symbol_pair_hash(symbol.def.expansion_a, symbol.def.expansion_b); + hash &= m_compression_map.size() - 1; + REALM_ASSERT(m_compression_map[hash].id == symbol.def.id); + m_compression_map[hash] = {0, 0, 0}; + if (symbol.storage_index < m_expansion_storage.size() - 1) { + m_expansion_storage.resize(symbol.storage_index + 1); + } + m_expansion_storage[symbol.storage_index].resize(symbol.storage_offset); + m_symbols.pop_back(); + } + return; + } + // we have new symbols to add + for (size_t i = m_symbols.size(); i < num_symbols; ++i) { + auto pair = m_data->get(i); + SymbolDef def; + def.id = (CompressionSymbol)(i + 256); + def.expansion_a = 0xFFFF & (pair >> 16); + def.expansion_b = 0xFFFF & pair; + auto hash = symbol_pair_hash(def.expansion_a, def.expansion_b); + while (m_compression_map[hash & (m_compression_map.size() - 1)].id) { + expand_compression_map(); + } + // REALM_ASSERT_DEBUG(m_compression_map[hash].id == 0); + m_compression_map[hash & (m_compression_map.size() - 1)] = def; + add_expansion(def); + } +} + +StringCompressor::~StringCompressor() {} + +CompressedString StringCompressor::compress(StringData sd, bool learn) +{ + CompressedString result(sd.size()); + // expand string into array of symbols + const char* d = sd.data(); + const size_t limit = sd.size(); + if (limit == 0) + return {}; + size_t i = 0; + while 
(i < limit) { + result[i++] = 0xFF & *d++; + } + // iteratively compress array of symbols. Each run compresses pairs into single symbols. + // 6 runs give a max compression of 64x - on average it will be much less :-) + constexpr int run_limit = 6; + CompressionSymbol* to; + for (int run = 0; run < run_limit; ++run) { + CompressionSymbol* from = to = result.data(); + CompressionSymbol* limit = from + result.size() - 1; + while (from < limit) { + auto hash = symbol_pair_hash(from[0], from[1]); + hash &= m_compression_map.size() - 1; + auto& def = m_compression_map[hash]; + if (def.id) { + // existing symbol + if (def.expansion_a == from[0] && def.expansion_b == from[1]) { + // matching symbol + *to++ = def.id; + from += 2; + } + else if (m_compression_map.size() < 65536) { + // Conflict: some other symbol is defined here - but we can expand the compression map + // and hope to find room! + expand_compression_map(); + // simply retry: + continue; + } + else { + // also conflict: some other symbol is defined here, we can't compress + *to++ = *from++; + // In a normal hash table we'd have buckets and add a translation + // to a bucket. This is slower generally, but yields better compression. + } + } + else { + // free entry we can use for new symbol (and we're learning) + if (m_symbols.size() < (65536 - 256) && learn) { + // define a new symbol for this entry and use it. 
+ REALM_ASSERT_DEBUG(m_compression_map[hash].id == 0); + REALM_ASSERT_DEBUG(m_symbols.size() == m_data->size()); + REALM_ASSERT_DEBUG(m_data->is_attached()); + CompressionSymbol id = (CompressionSymbol)(256 + m_symbols.size()); + SymbolDef def{id, from[0], from[1]}; + m_compression_map[hash] = def; + add_expansion(def); + m_data->add(((uint64_t)from[0]) << 16 | from[1]); + // std::cerr << id << " = {" << from[0] << ", " << from[1] << "}" << std::endl; + *to++ = id; + from += 2; + } + else { + // no more symbol space, so can't compress + *to++ = *from++; + } + } + } + if (from == limit) { + // copy over trailing symbol + *to++ = *from++; + } + REALM_ASSERT_DEBUG(to > result.data()); + size_t sz = to - result.data(); + REALM_ASSERT_DEBUG(sz <= sd.size()); + result.resize(sz); + if (from == to) // no compression took place in last iteration + break; + } + return result; +} + +std::string StringCompressor::decompress(CompressedStringView& c_str) +{ + CompressionSymbol* ptr = c_str.data; + CompressionSymbol* limit = ptr + c_str.size; + // compute size of decompressed string first to avoid allocations as string grows + size_t result_size = 0; + while (ptr < limit) { + if (*ptr < 256) + result_size += 1; + else + result_size += m_symbols[*ptr - 256].expansion.size(); + ++ptr; + } + std::string result2; + result2.reserve(result_size); + // generate result + ptr = c_str.data; + while (ptr < limit) { + if (*ptr < 256) + result2.push_back((char)*ptr); + else + result2.append(m_symbols[*ptr - 256].expansion); + ptr++; + } +#ifdef REALM_DEBUG + std::string result; + { + auto decompress = [&](CompressionSymbol symbol, auto& decompress) -> void { + if (symbol < 256) { + result.push_back((char)symbol); + } + else { + auto& s = m_symbols[symbol - 256]; + decompress(s.def.expansion_a, decompress); + decompress(s.def.expansion_b, decompress); + } + }; + + CompressionSymbol* ptr = c_str.data; + CompressionSymbol* limit = ptr + c_str.size; + while (ptr < limit) { + decompress(*ptr, 
decompress); + ++ptr; + } + } + REALM_ASSERT_DEBUG(result == result2); +#endif + return result2; +} + +int StringCompressor::compare(CompressedStringView& A, CompressedStringView& B) +{ + auto A_ptr = A.data; + auto A_limit = A_ptr + A.size; + auto B_ptr = B.data; + auto B_limit = B_ptr + B.size; + while (A_ptr < A_limit && B_ptr < B_limit) { + auto code_A = *A_ptr++; + auto code_B = *B_ptr++; + if (code_A == code_B) + continue; + // symbols did not match: + // 1. both symbols are single characters + if (code_A < 256 && code_B < 256) + return code_B - code_A; + std::string a_str(code_A, 1); + auto str_A = std::string_view(code_A < 256 ? a_str : m_symbols[code_A - 256].expansion); + std::string b_str(code_B, 1); + auto str_B = std::string_view(code_B < 256 ? b_str : m_symbols[code_B - 256].expansion); + // to ensure comparison as StringData we need to convert the stringviews + StringData sd_a(str_A.data(), str_A.size()); + StringData sd_b(str_B.data(), str_B.size()); + REALM_ASSERT_DEBUG(sd_a != sd_b); + if (sd_a < sd_b) + return 1; + else + return -1; + } + // The compressed strings are identical or one is the prefix of the other + return B.size - A.size; + // ^ a faster way of producing same positive / negative / zero as: + // if (A.size() < B.size()) + // return 1; + // if (A.size() > B.size()) + // return -1; + // return 0; +} + +int StringCompressor::compare(StringData sd, CompressedStringView& B) +{ + auto B_size = B.size; + // make sure comparisons are unsigned, even though StringData does not specify signedness + const unsigned char* A_ptr = reinterpret_cast(sd.data()); + auto A_limit = A_ptr + sd.size(); + for (size_t i = 0; i < B_size; ++i) { + if (A_ptr == A_limit) { + // sd ended first, so B is bigger + return -1; + } + auto code = B.data[i]; + if (code < 256) { + if (code < *A_ptr) + return 1; + if (code > *A_ptr) + return -1; + ++A_ptr; + continue; + } + auto& expansion = m_symbols[code - 256]; + for (size_t disp = 0; disp < expansion.expansion.size(); 
++disp) { + uint8_t c = expansion.expansion[disp]; + if (c < *A_ptr) + return 1; + if (c > *A_ptr) + return -1; + ++A_ptr; + } + } + // if sd is longer than B, sd is the biggest string + if (A_ptr < A_limit) + return 1; + return 0; +} + + +} // namespace realm diff --git a/src/realm/string_compressor.hpp b/src/realm/string_compressor.hpp new file mode 100644 index 00000000000..2c866ecb781 --- /dev/null +++ b/src/realm/string_compressor.hpp @@ -0,0 +1,100 @@ +/************************************************************************* + * + * Copyright 2016 Realm Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
// A compression symbol: values below 256 are literal bytes, higher values
// refer to a symbol definition held by the StringCompressor.
using CompressionSymbol = uint16_t;
// An owned, compressed string: a sequence of compression symbols.
using CompressedString = std::vector<CompressionSymbol>;

// Non-owning view of a run of compression symbols.
// The view does not extend the lifetime of the storage it points into.
struct CompressedStringView {
    CompressionSymbol* data = nullptr;
    uint32_t size = 0; // number of symbols (not bytes)

    CompressedStringView() = default;
    CompressedStringView(CompressionSymbol* c_ptr, size_t s)
        : data(c_ptr)
        , size(uint32_t(s))
    {
    }
    explicit CompressedStringView(CompressedString& cs)
        : data(cs.data())
        , size(uint32_t(cs.size()))
    {
    }
    // Element-wise equality. Const-correct so that const views and
    // temporaries can be compared as well.
    bool operator==(const CompressedStringView& other) const
    {
        if (size != other.size)
            return false;
        for (size_t i = 0; i < size; ++i) {
            if (data[i] != other.data[i])
                return false;
        }
        return true;
    }
};
+ }; + + void rebuild_internal(); + void expand_compression_map(); + void add_expansion(SymbolDef def); + std::vector m_symbols; // map from symbol -> symbolpair, 2 elements pr entry + std::vector m_compression_map; // perfect hash from symbolpair to its symbol + + std::unique_ptr m_data; + constexpr static size_t storage_chunk_size = 4096; + std::vector m_expansion_storage; +}; + +} // namespace realm + +#endif diff --git a/src/realm/string_interner.cpp b/src/realm/string_interner.cpp new file mode 100644 index 00000000000..fb801b1fd6a --- /dev/null +++ b/src/realm/string_interner.cpp @@ -0,0 +1,681 @@ +/************************************************************************* + * + * Copyright 2016 Realm Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + **************************************************************************/ + +#include +#include + +#include +#include + +namespace realm { + +// Fast mapping of strings (or rather hash of strings) to string IDs. +// +// We use a tree where: +// * All interior nodes are radix nodes with a fan-out of 256. +// * Leaf nodes with up to 16 entries are just lists, searched linearly +// * Leaf nodes with more than 16 entries and less than 1K are hash tables. +// Hash tables use linear search starting from the entry found by hashing. 
+// +constexpr static size_t linear_search_limit = 16; +constexpr static size_t hash_node_min_size = 32; +constexpr static size_t hash_node_max_size = 1024; +constexpr static size_t radix_node_consumes_bits = 8; +constexpr static size_t radix_node_size = 1ULL << radix_node_consumes_bits; + +// helpers +struct HashMapIter { + Array& m_array; + uint32_t hash_filter; + uint16_t index; + uint16_t left_to_search; + uint8_t hash_size; + HashMapIter(Array& array, uint32_t hash, uint8_t hash_size) + : m_array(array) + , hash_filter(hash) + , hash_size(hash_size) + { + set_index(0); + } + HashMapIter(Array& dummy) + : m_array(dummy) + { + left_to_search = 0; + } + inline uint32_t get() + { + return (uint32_t)(m_array.get(index) >> hash_size); + } + inline bool empty() + { + auto element = m_array.get(index); + return (element >> hash_size) == 0; + } + inline void set(uint64_t element) + { + m_array.set(index, element); + } + inline bool matches() + { + auto mask = 0xFFFFFFFFUL >> (32 - hash_size); + auto element = m_array.get(index); + return ((element & mask) == hash_filter) && (element >> hash_size); + } + inline bool is_valid() + { + return left_to_search != 0; + } + inline void set_index(size_t i, size_t search_limit = linear_search_limit) + { + index = (uint16_t)i; + left_to_search = (uint16_t)std::min(m_array.size(), (size_t)search_limit); + } + void operator++() + { + if (is_valid()) { + left_to_search--; + index++; + if (index == m_array.size()) { + index = 0; + } + } + } +}; + +// Attempt to build a hash leaf from a smaller hash leaf or a non-hash leaf. 
+static bool rehash(Array& from, Array& to, uint8_t hash_size) +{ + REALM_ASSERT_DEBUG(from.size() * 2 == to.size()); + + for (size_t i = 0; i < from.size(); ++i) { + auto entry = (size_t)from.get(i); + if ((entry >> hash_size) == 0) + continue; + size_t starting_index = entry & (to.size() - 1); + HashMapIter it(to, 0, hash_size); + it.set_index(starting_index); + while (it.is_valid() && !it.empty()) { + ++it; + } + if (!it.is_valid()) { + // abort rehashing, we need a larger to-space + return false; + } + REALM_ASSERT(it.empty()); + it.set(entry); + } + return true; +} + +// Add a binding from hash value to id. +static void add_to_hash_map(Array& node, uint64_t hash, uint64_t id, uint8_t hash_size) +{ + REALM_ASSERT(node.is_attached()); + if (!node.has_refs()) { + // it's a leaf. + if (node.size() < linear_search_limit) { + // it's a list with room to grow + node.add(((uint64_t)id << hash_size) | hash); + return; + } + if (node.size() == linear_search_limit) { + // it's a full list, must be converted to a hash table + Array new_node(node.get_alloc()); + new_node.create(NodeHeader::type_Normal, false, hash_node_min_size, 0); + new_node.set_parent(node.get_parent(), node.get_ndx_in_parent()); + new_node.update_parent(); + // transform existing list into hash table + rehash(node, new_node, hash_size); + node.destroy(); + node.init_from_parent(); + } + // it's a hash table. 
Grow if needed up till 'hash_node_max_size' entries + while (node.size() < hash_node_max_size) { + auto size = node.size(); + size_t start_index = hash & (size - 1); + HashMapIter it(node, 0, hash_size); + it.set_index(start_index); + while (it.is_valid() && !it.empty()) { + ++it; + } + if (it.is_valid()) { + // found an empty spot within search range + it.set(((uint64_t)id << hash_size) | hash); + return; + } + if (node.size() >= hash_node_max_size) + break; + // No free spot found - rehash into bigger and bigger tables + auto new_size = node.size(); + bool need_to_rehash = true; + Array new_node(node.get_alloc()); + while (need_to_rehash && new_size < hash_node_max_size) { + new_size *= 2; + new_node.create(NodeHeader::type_Normal, false, new_size, 0); + need_to_rehash = !rehash(node, new_node, hash_size); + if (need_to_rehash) { // we failed, try again - or shift to radix + // I find it counter-intuitive. But it CAN happen. + new_node.destroy(); + } + } + if (need_to_rehash) + break; + new_node.set_parent(node.get_parent(), node.get_ndx_in_parent()); + new_node.update_parent(); + node.destroy(); + node.init_from_parent(); + } + // we ran out of space. 
Rewrite as a radix node with subtrees + Array new_node(node.get_alloc()); + new_node.create(NodeHeader::type_HasRefs, false, radix_node_size, 0); + new_node.set_parent(node.get_parent(), node.get_ndx_in_parent()); + new_node.update_parent(); + for (size_t index = 0; index < node.size(); ++index) { + auto element = node.get(index); + auto hash = element & (0xFFFFFFFF >> (32 - hash_size)); + auto string_id = element >> hash_size; + if (string_id == 0) + continue; + auto remaining_hash = hash >> radix_node_consumes_bits; + add_to_hash_map(new_node, remaining_hash, string_id, hash_size - 8); + } + node.destroy(); + node.init_from_parent(); + } + // We have a radix node and need to insert the new binding into the proper subtree + size_t index = hash & (radix_node_size - 1); + auto rot = node.get_as_ref_or_tagged(index); + REALM_ASSERT(!rot.is_tagged()); + Array subtree(node.get_alloc()); + if (rot.get_as_ref() == 0) { + // no subtree present, create an empty one + subtree.set_parent(&node, index); + subtree.create(NodeHeader::type_Normal); + subtree.update_parent(); + } + else { + // subtree already present + subtree.set_parent(&node, index); + subtree.init_from_parent(); + } + // recurse into subtree + add_to_hash_map(subtree, hash >> radix_node_consumes_bits, id, hash_size - radix_node_consumes_bits); +} + +static std::vector hash_to_id(Array& node, uint32_t hash, uint8_t hash_size) +{ + std::vector result; + REALM_ASSERT(node.is_attached()); + if (!node.has_refs()) { + // it's a leaf - default is a list, search starts from index 0. + HashMapIter it(node, hash, hash_size); + if (node.size() > hash_node_min_size) { + // it is a hash table, so use hash to select index to start searching + // table size must be power of two! 
+ size_t index = hash & (node.size() - 1); + it.set_index(index); + } + // collect all matching values within allowed range + while (it.is_valid()) { + if (it.matches()) { + result.push_back(it.get()); + } + ++it; + } + return result; + } + else { + // it's a radix node + size_t index = hash & (node.size() - 1); + auto rot = node.get_as_ref_or_tagged(index); + REALM_ASSERT(rot.is_ref()); + if (rot.get_as_ref() == 0) { + // no subtree, return empty vector + return result; + } + // descend into subtree + Array subtree(node.get_alloc()); + subtree.set_parent(&node, index); + subtree.init_from_parent(); + return hash_to_id(subtree, hash >> radix_node_consumes_bits, hash_size - radix_node_consumes_bits); + } +} + + +enum positions { Pos_Version, Pos_ColKey, Pos_Size, Pos_Compressor, Pos_Data, Pos_Map, Top_Size }; +struct StringInterner::DataLeaf { + std::vector m_compressed; + ref_type m_leaf_ref = 0; + bool m_is_loaded = false; + DataLeaf() {} + DataLeaf(ref_type ref) + : m_leaf_ref(ref) + { + } +}; + +StringInterner::StringInterner(Allocator& alloc, Array& parent, ColKey col_key, bool writable) + : m_parent(parent) +{ + REALM_ASSERT_DEBUG(col_key != ColKey()); + size_t index = col_key.get_index().val; + // ensure that m_top and m_data is well defined and reflect any existing data + // We'll have to extend this to handle no defined backing + m_top = std::make_unique(alloc); + m_top->set_parent(&parent, index); + m_data = std::make_unique(alloc); + m_data->set_parent(m_top.get(), Pos_Data); + m_hash_map = std::make_unique(alloc); + m_hash_map->set_parent(m_top.get(), Pos_Map); + m_current_string_leaf = std::make_unique(alloc); + m_col_key = col_key; + update_from_parent(writable); +} + +void StringInterner::update_from_parent(bool writable) +{ + auto parent_idx = m_top->get_ndx_in_parent(); + bool valid_top_ref_spot = m_parent.is_attached() && parent_idx < m_parent.size(); + bool valid_top = valid_top_ref_spot && m_parent.get_as_ref(parent_idx); + if (valid_top) { + 
m_top->update_from_parent(); + m_data->update_from_parent(); + m_hash_map->update_from_parent(); + } + else if (writable && valid_top_ref_spot) { + m_top->create(NodeHeader::type_HasRefs, false, Top_Size, 0); + m_top->set(Pos_Version, (1 << 1) + 1); // version number 1. + m_top->set(Pos_Size, (0 << 1) + 1); // total size 0 + m_top->set(Pos_ColKey, (m_col_key.value << 1) + 1); + m_top->set(Pos_Compressor, 0); + // create first level of data tree here (to simplify other stuff) + m_data = std::make_unique(m_parent.get_alloc()); + m_data->set_parent(m_top.get(), Pos_Data); + m_data->create(NodeHeader::type_HasRefs, false, 0); + m_data->update_parent(); + m_hash_map = std::make_unique(m_parent.get_alloc()); + m_hash_map->set_parent(m_top.get(), Pos_Map); + m_hash_map->create(NodeHeader::type_Normal); + m_hash_map->update_parent(); + m_top->update_parent(); + valid_top = true; + } + if (!valid_top) { + // We're lacking part of underlying data and not allowed to create it, so enter "dead" mode + m_compressor.reset(); + m_compressed_leafs.clear(); + // m_compressed_string_map.clear(); + m_top->detach(); // <-- indicates "dead" mode + m_data->detach(); + m_hash_map->detach(); + m_compressor.reset(); + return; + } + // validate we're accessing data for the correct column. A combination of column erase + // and insert could lead to an interner being paired with wrong data in the file. + // If so, we clear internal data forcing rebuild_internal() to rebuild from scratch. 
+ int64_t data_colkey = m_top->get_as_ref_or_tagged(Pos_ColKey).get_as_int(); + if (m_col_key.value != data_colkey) { + // new column, new data + m_compressor.reset(); + m_decompressed_strings.clear(); + } + if (!m_compressor) + m_compressor = std::make_unique(m_top->get_alloc(), *m_top, Pos_Compressor, writable); + else + m_compressor->refresh(writable); + if (m_data->size()) { + auto ref_to_write_buffer = m_data->get_as_ref(m_data->size() - 1); + const char* header = m_top->get_alloc().translate(ref_to_write_buffer); + bool is_array_of_cprs = NodeHeader::get_hasrefs_from_header(header); + if (is_array_of_cprs) { + m_current_long_string_node = std::make_unique(m_top->get_alloc()); + m_current_long_string_node->set_parent(m_data.get(), m_data->size() - 1); + m_current_long_string_node->update_from_parent(); + } + else { + m_current_long_string_node.reset(); + } + } + else + m_current_long_string_node.reset(); // just in case... + + // rebuild internal structures...... + rebuild_internal(); + m_current_string_leaf->detach(); +} + +void StringInterner::rebuild_internal() +{ + std::lock_guard lock(m_mutex); + // release old decompressed strings + for (size_t idx = 0; idx < m_in_memory_strings.size(); ++idx) { + StringID id = m_in_memory_strings[idx]; + if (id > m_decompressed_strings.size()) { + m_in_memory_strings[idx] = m_in_memory_strings.back(); + m_in_memory_strings.pop_back(); + continue; + } + if (auto& w = m_decompressed_strings[id - 1].m_weight) { + w >>= 1; + } + else { + m_decompressed_strings[id - 1].m_decompressed.reset(); + m_in_memory_strings[idx] = m_in_memory_strings.back(); + m_in_memory_strings.pop_back(); + continue; + } + } + + size_t target_size = (size_t)m_top->get_as_ref_or_tagged(Pos_Size).get_as_int(); + m_decompressed_strings.resize(target_size); + if (m_data->size() != m_compressed_leafs.size()) { + m_compressed_leafs.resize(m_data->size()); + } + // allways force new setup of all leafs: + // update m_compressed_leafs to reflect m_data + 
for (size_t idx = 0; idx < m_compressed_leafs.size(); ++idx) { + auto ref = m_data->get_as_ref(idx); + auto& leaf_meta = m_compressed_leafs[idx]; + // if (ref != leaf_meta.m_leaf_ref) { + leaf_meta.m_is_loaded = false; + leaf_meta.m_compressed.clear(); + leaf_meta.m_leaf_ref = ref; + //} + } +} + +StringInterner::~StringInterner() {} + +StringID StringInterner::intern(StringData sd) +{ + REALM_ASSERT(m_top->is_attached()); + std::lock_guard lock(m_mutex); + // special case for null string + if (sd.data() == nullptr) + return 0; + uint32_t h = (uint32_t)sd.hash(); + auto candidates = hash_to_id(*m_hash_map.get(), h, 32); + for (auto& candidate : candidates) { + auto candidate_cpr = get_compressed(candidate); + if (m_compressor->compare(sd, candidate_cpr) == 0) + return candidate; + } + // it's a new string + bool learn = true; + auto c_str = m_compressor->compress(sd, learn); + m_decompressed_strings.push_back({64, std::make_unique(sd)}); + auto id = m_decompressed_strings.size(); + m_in_memory_strings.push_back(id); + add_to_hash_map(*m_hash_map.get(), h, id, 32); + size_t index = (size_t)m_top->get_as_ref_or_tagged(Pos_Size).get_as_int(); + REALM_ASSERT_DEBUG(index == id - 1); + bool need_long_string_node = c_str.size() >= 65536; + + // TODO: update_internal must set up m_current_long_string_node if it is in use + + if (need_long_string_node && !m_current_long_string_node) { + if ((index & 0xFF) == 0) { + // if we're starting on a new leaf, extend parent array for it + m_data->add(0); + m_compressed_leafs.push_back({}); + m_current_long_string_node = std::make_unique(m_top->get_alloc()); + m_current_long_string_node->set_parent(m_data.get(), m_data->size() - 1); + m_current_long_string_node->create(NodeHeader::type_HasRefs); + m_current_long_string_node->update_parent(); + REALM_ASSERT_DEBUG(!m_current_string_leaf->is_attached() || m_current_string_leaf->size() == 0); + m_current_string_leaf->detach(); + } + else { + // we have been building an existing leaf and 
need to shift representation. + // but first we need to update leaf accessor for existing leaf + if (m_current_string_leaf->is_attached()) { + m_current_string_leaf->update_from_parent(); + } + else { + m_current_string_leaf->init_from_ref(m_current_string_leaf->get_ref_from_parent()); + } + REALM_ASSERT_DEBUG(m_current_string_leaf->size() > 0); + m_current_long_string_node = std::make_unique(m_top->get_alloc()); + m_current_long_string_node->set_parent(m_data.get(), m_data->size() - 1); + m_current_long_string_node->create(NodeHeader::type_HasRefs); + m_current_long_string_node->update_parent(); + // convert the current leaf into a long string node. (array of strings in separate arrays) + for (auto s : m_compressed_leafs.back().m_compressed) { + ArrayUnsigned arr(m_top->get_alloc()); + arr.create(s.size, 65535); + unsigned short* dest = reinterpret_cast(arr.m_data); + std::copy_n(s.data, s.size, dest); + m_current_long_string_node->add(arr.get_ref()); + } + m_current_string_leaf->destroy(); + m_current_string_leaf->detach(); + // force later reload of leaf + m_compressed_leafs.back().m_is_loaded = false; + // m_compressed_leafs.back().m_leaf_ref = m_data->get_as_ref(m_data->size() - 1); + } + } + if (m_current_long_string_node) { + ArrayUnsigned arr(m_top->get_alloc()); + arr.create(c_str.size(), 65535); + unsigned short* begin = c_str.data(); + if (begin) { + // if the compressed string is empty, 'begin' is zero and we don't copy + size_t n = c_str.size(); + unsigned short* dest = reinterpret_cast(arr.m_data); + std::copy_n(begin, n, dest); + } + m_current_long_string_node->add(arr.get_ref()); + m_current_long_string_node->update_parent(); + if (m_current_long_string_node->size() == 256) { + // exit from "long string mode" + m_current_long_string_node.reset(); + } + CompressionSymbol* p_start = reinterpret_cast(arr.m_data); + m_compressed_leafs.back().m_compressed.push_back({p_start, arr.size()}); + } + else { + // Append to leaf with up to 256 entries. 
+ // First create a new leaf if needed (limit number of entries to 256 pr leaf) + bool need_leaf_update = !m_current_string_leaf->is_attached() || (index & 0xFF) == 0; + if (need_leaf_update) { + m_current_string_leaf->set_parent(m_data.get(), index >> 8); + if ((index & 0xFF) == 0) { + // create new leaf + m_current_string_leaf->create(0, 65535); + m_data->add(m_current_string_leaf->get_ref()); + m_compressed_leafs.push_back({}); + } + else { + // just setup leaf accessor + if (m_current_string_leaf->is_attached()) { + m_current_string_leaf->update_from_parent(); + } + else { + m_current_string_leaf->init_from_ref(m_current_string_leaf->get_ref_from_parent()); + } + } + } + REALM_ASSERT(c_str.size() < 65535); + // Add compressed string at end of leaf + m_current_string_leaf->add(c_str.size()); + for (auto c : c_str) { + m_current_string_leaf->add(c); + } + REALM_ASSERT_DEBUG(m_compressed_leafs.size()); + CompressionSymbol* p = reinterpret_cast(m_current_string_leaf->m_data); + auto p_limit = p + m_current_string_leaf->size(); + auto p_start = p_limit - c_str.size(); + m_compressed_leafs.back().m_compressed.push_back({p_start, c_str.size()}); + REALM_ASSERT(m_compressed_leafs.back().m_compressed.size() <= 256); + } + m_top->adjust(Pos_Size, 2); // type is has_Refs, so increment is by 2 + load_leaf_if_new_ref(m_compressed_leafs.back(), m_data->get_as_ref(m_data->size() - 1)); +#ifdef REALM_DEBUG + auto csv = get_compressed(id); + CompressedStringView csv2(c_str); + REALM_ASSERT(csv == csv2); +#endif + return id; +} + +bool StringInterner::load_leaf_if_needed(DataLeaf& leaf) +{ + if (!leaf.m_is_loaded) { + // start with an empty leaf: + leaf.m_compressed.clear(); + leaf.m_compressed.reserve(256); + + // must interpret leaf first - the leaf is either a single array holding all strings, + // or an array with each (compressed) string placed in its own array. 
+ const char* header = m_top->get_alloc().translate(leaf.m_leaf_ref); + bool is_single_array = !NodeHeader::get_hasrefs_from_header(header); + if (is_single_array) { + size_t leaf_offset = 0; + ArrayUnsigned leaf_array(m_top->get_alloc()); + leaf_array.init_from_ref(leaf.m_leaf_ref); + REALM_ASSERT(NodeHeader::get_encoding(leaf_array.get_header()) == NodeHeader::Encoding::WTypBits); + REALM_ASSERT(NodeHeader::get_width_from_header(leaf_array.get_header()) == 16); + // This is dangerous if the leaf is for some reason not in the assumed format + CompressionSymbol* c = reinterpret_cast(leaf_array.m_data); + auto leaf_size = leaf_array.size(); + while (leaf_offset < leaf_size) { + size_t length = c[leaf_offset]; + REALM_ASSERT_DEBUG(length == leaf_array.get(leaf_offset)); + leaf_offset++; + leaf.m_compressed.push_back({c + leaf_offset, length}); + REALM_ASSERT_DEBUG(leaf.m_compressed.size() <= 256); + leaf_offset += length; + } + } + else { + // Not a single leaf - instead an array of strings + Array arr(m_top->get_alloc()); + arr.init_from_ref(leaf.m_leaf_ref); + for (size_t idx = 0; idx < arr.size(); ++idx) { + ArrayUnsigned str_array(m_top->get_alloc()); + ref_type ref = arr.get_as_ref(idx); + str_array.init_from_ref(ref); + REALM_ASSERT(NodeHeader::get_encoding(str_array.get_header()) == NodeHeader::Encoding::WTypBits); + REALM_ASSERT(NodeHeader::get_width_from_header(str_array.get_header()) == 16); + CompressionSymbol* c = reinterpret_cast(str_array.m_data); + leaf.m_compressed.push_back({c, str_array.size()}); + } + } + leaf.m_is_loaded = true; + return true; + } + return false; +} + +// Danger: Only to be used if you know that a change in content ==> different ref +bool StringInterner::load_leaf_if_new_ref(DataLeaf& leaf, ref_type new_ref) +{ + if (leaf.m_leaf_ref != new_ref) { + leaf.m_leaf_ref = new_ref; + leaf.m_is_loaded = false; + leaf.m_compressed.resize(0); + } + return load_leaf_if_needed(leaf); +} + +CompressedStringView& 
StringInterner::get_compressed(StringID id) +{ + auto index = id - 1; // 0 represents null + auto hi = index >> 8; + auto lo = index & 0xFFUL; + DataLeaf& leaf = m_compressed_leafs[hi]; + load_leaf_if_needed(leaf); + REALM_ASSERT_DEBUG(lo < leaf.m_compressed.size()); + return leaf.m_compressed[lo]; +} + +std::optional StringInterner::lookup(StringData sd) +{ + if (!m_top->is_attached()) { + // "dead" mode + return {}; + } + std::lock_guard lock(m_mutex); + if (sd.data() == nullptr) + return 0; + uint32_t h = (uint32_t)sd.hash(); + auto candidates = hash_to_id(*m_hash_map.get(), h, 32); + for (auto& candidate : candidates) { + auto candidate_cpr = get_compressed(candidate); + if (m_compressor->compare(sd, candidate_cpr) == 0) + return candidate; + } + return {}; +} + +int StringInterner::compare(StringID A, StringID B) +{ + std::lock_guard lock(m_mutex); + REALM_ASSERT_DEBUG(A < m_decompressed_strings.size()); + REALM_ASSERT_DEBUG(B < m_decompressed_strings.size()); + // comparisons against null + if (A == B && A == 0) + return 0; + if (A == 0) + return -1; + if (B == 0) + return 1; + // ok, no nulls. 
+ REALM_ASSERT(m_compressor); + return m_compressor->compare(get_compressed(A), get_compressed(B)); +} + +int StringInterner::compare(StringData s, StringID A) +{ + std::lock_guard lock(m_mutex); + REALM_ASSERT_DEBUG(A < m_decompressed_strings.size()); + // comparisons against null + if (s.data() == nullptr && A == 0) + return 0; + if (s.data() == nullptr) + return 1; + if (A == 0) + return -1; + // ok, no nulls + REALM_ASSERT(m_compressor); + return m_compressor->compare(s, get_compressed(A)); +} + + +StringData StringInterner::get(StringID id) +{ + REALM_ASSERT(m_compressor); + std::lock_guard lock(m_mutex); + if (id == 0) + return StringData{nullptr}; + REALM_ASSERT_DEBUG(id <= m_decompressed_strings.size()); + CachedString& cs = m_decompressed_strings[id - 1]; + if (cs.m_decompressed) { + std::string* ref_str = cs.m_decompressed.get(); + if (cs.m_weight < 128) + cs.m_weight += 64; + return {ref_str->c_str(), ref_str->size()}; + } + cs.m_weight = 64; + cs.m_decompressed = std::make_unique(m_compressor->decompress(get_compressed(id))); + m_in_memory_strings.push_back(id); + return {cs.m_decompressed->c_str(), cs.m_decompressed->size()}; +} + +} // namespace realm diff --git a/src/realm/string_interner.hpp b/src/realm/string_interner.hpp new file mode 100644 index 00000000000..2a36c9e38dc --- /dev/null +++ b/src/realm/string_interner.hpp @@ -0,0 +1,96 @@ +/************************************************************************* + * + * Copyright 2016 Realm Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + * + **************************************************************************/ + +#ifndef REALM_STRING_INTERNER_HPP +#define REALM_STRING_INTERNER_HPP + +#include +#include +#include +#include + +#include +#include +#include + + +namespace realm { + + +using StringID = size_t; + +class Array; +class ArrayUnsigned; +class Allocator; +struct CachedString { + uint8_t m_weight = 0; + std::unique_ptr m_decompressed; +}; + +class StringInterner { +public: + // To be used exclusively from Table + StringInterner(Allocator& alloc, Array& parent, ColKey col_key, bool writable); + void update_from_parent(bool writable); + ~StringInterner(); + + // To be used from Obj and for searching + StringID intern(StringData); + std::optional lookup(StringData); + int compare(StringID A, StringID B); + int compare(StringData, StringID A); + StringData get(StringID); + +private: + Array& m_parent; // need to be able to check if this is attached or not + std::unique_ptr m_top; + // Compressed strings are stored in blocks of 256. + // One array holds refs to all blocks: + std::unique_ptr m_data; + // In-memory representation of a block. Either only the ref to it, + // or a full vector of views into the block. + struct DataLeaf; + // in-memory metadata for faster access to compressed strings. Mirrors m_data. + std::vector m_compressed_leafs; + // 'm_hash_map' is used for mapping hash of uncompressed string to string id. + std::unique_ptr m_hash_map; + // the block of compressed strings we're currently appending to: + std::unique_ptr m_current_string_leaf; + // an array of strings we're currently appending to. This is used instead + // when ever we meet a string too large to be placed inline. 
+ std::unique_ptr m_current_long_string_node; + void rebuild_internal(); + CompressedStringView& get_compressed(StringID id); + // return true if the leaf was reloaded + bool load_leaf_if_needed(DataLeaf& leaf); + // return 'true' if the new ref was different and forced a reload + bool load_leaf_if_new_ref(DataLeaf& leaf, ref_type new_ref); + ColKey m_col_key; // for validation + std::unique_ptr m_compressor; + // At the moment we need to keep decompressed strings around if they've been + // returned to the caller, since we're handing + // out StringData references to their storage. This is a temporary solution. + std::vector m_decompressed_strings; + std::vector m_in_memory_strings; + // Mutual exclusion is needed for frozen transactions only. Live objects are + // only used in single threaded contexts so don't need them. For now, just use always. + std::mutex m_mutex; +}; +} // namespace realm + +#endif diff --git a/src/realm/table.cpp b/src/realm/table.cpp index ad9435f45ca..977339ade0d 100644 --- a/src/realm/table.cpp +++ b/src/realm/table.cpp @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -356,6 +357,7 @@ Table::Table(Allocator& alloc) , m_index_refs(m_alloc) , m_opposite_table(m_alloc) , m_opposite_column(m_alloc) + , m_interner_data(m_alloc) , m_repl(&g_dummy_replication) , m_own_ref(this, alloc.get_instance_version()) { @@ -363,7 +365,7 @@ Table::Table(Allocator& alloc) m_index_refs.set_parent(&m_top, top_position_for_search_indexes); m_opposite_table.set_parent(&m_top, top_position_for_opposite_table); m_opposite_column.set_parent(&m_top, top_position_for_opposite_column); - + m_interner_data.set_parent(&m_top, top_position_for_interners); ref_type ref = create_empty_table(m_alloc); // Throws ArrayParent* parent = nullptr; size_t ndx_in_parent = 0; @@ -378,6 +380,7 @@ Table::Table(Replication* const* repl, Allocator& alloc) , m_index_refs(m_alloc) , m_opposite_table(m_alloc) , m_opposite_column(m_alloc) + , 
m_interner_data(m_alloc) , m_repl(repl) , m_own_ref(this, alloc.get_instance_version()) { @@ -385,6 +388,8 @@ Table::Table(Replication* const* repl, Allocator& alloc) m_index_refs.set_parent(&m_top, top_position_for_search_indexes); m_opposite_table.set_parent(&m_top, top_position_for_opposite_table); m_opposite_column.set_parent(&m_top, top_position_for_opposite_column); + m_opposite_column.set_parent(&m_top, top_position_for_opposite_column); + m_interner_data.set_parent(&m_top, top_position_for_interners); m_cookie = cookie_created; } @@ -535,6 +540,9 @@ void Table::remove_column(ColKey col_key) erase_root_column(col_key); // Throws m_has_any_embedded_objects.reset(); + auto i = col_key.get_index().val; + if (i < m_string_interners.size() && m_string_interners[i]) + m_string_interners[i].reset(); } @@ -653,6 +661,14 @@ void Table::init(ref_type top_ref, ArrayParent* parent, size_t ndx_in_parent, bo else { m_tombstones = nullptr; } + if (m_top.size() > top_position_for_interners && m_top.get_as_ref(top_position_for_interners)) { + // Interner data exist + m_interner_data.init_from_parent(); + } + else { + REALM_ASSERT_DEBUG(!m_interner_data.is_attached()); + } + refresh_string_interners(is_writable); m_cookie = cookie_initialized; } @@ -1054,7 +1070,19 @@ ColKey Table::do_insert_root_column(ColKey col_key, ColumnType type, StringData if (m_tombstones) { m_tombstones->insert_column(col_key); } - + // create string interners internal rep as well as data area + REALM_ASSERT_DEBUG(m_interner_data.is_attached()); + while (col_ndx >= m_string_interners.size()) { + m_string_interners.push_back({}); + } + while (col_ndx >= m_interner_data.size()) { + m_interner_data.add(0); + } + REALM_ASSERT(!m_string_interners[col_ndx]); + // FIXME: Limit creation of interners to EXACTLY the columns, where they can be + // relevant. 
+ // if (col_key.get_type() == col_type_String) + m_string_interners[col_ndx] = std::make_unique(m_alloc, m_interner_data, col_key, true); bump_storage_version(); return col_key; @@ -1086,6 +1114,17 @@ void Table::do_erase_root_column(ColKey col_key) REALM_ASSERT(m_index_accessors.back() == nullptr); m_index_accessors.pop_back(); } + REALM_ASSERT_DEBUG(col_ndx < m_string_interners.size()); + if (m_string_interners[col_ndx]) { + REALM_ASSERT_DEBUG(m_interner_data.is_attached()); + REALM_ASSERT_DEBUG(col_ndx < m_interner_data.size()); + auto data_ref = m_interner_data.get_as_ref(col_ndx); + if (data_ref) + Array::destroy_deep(data_ref, m_alloc); + m_interner_data.set(col_ndx, 0); + // m_string_interners[col_ndx]->update_from_parent(true); + m_string_interners[col_ndx].reset(); + } bump_content_version(); bump_storage_version(); } @@ -1239,6 +1278,9 @@ void Table::detach(LifeCycleCookie cookie) noexcept { m_cookie = cookie; m_alloc.bump_instance_version(); + // release string interners + m_string_interners.clear(); + m_interner_data.detach(); } void Table::fully_detach() noexcept @@ -1249,6 +1291,7 @@ void Table::fully_detach() noexcept m_opposite_table.detach(); m_opposite_column.detach(); m_index_accessors.clear(); + m_string_interners.clear(); } @@ -1465,6 +1508,7 @@ ref_type Table::create_empty_table(Allocator& alloc, TableKey key) top.add(0); // pk col key top.add(0); // flags top.add(0); // tombstones + top.add(0); // string interners REALM_ASSERT(top.size() == top_array_size); @@ -1976,6 +2020,13 @@ void Table::update_from_parent() noexcept refresh_content_version(); m_has_any_embedded_objects.reset(); + if (m_top.size() > top_position_for_interners) { + if (m_top.get_as_ref(top_position_for_interners)) + m_interner_data.update_from_parent(); + else + m_interner_data.detach(); + } + refresh_string_interners(false); } m_alloc.bump_storage_version(); } @@ -2104,7 +2155,7 @@ void Table::refresh_content_version() // Called when Group is moved to another version - 
either a rollback or an advance. // The content of the table is potentially different, so make no assumptions. -void Table::refresh_accessor_tree() +void Table::refresh_accessor_tree(bool writable) { REALM_ASSERT(m_cookie == cookie_initialized); REALM_ASSERT(m_top.is_attached()); @@ -2134,12 +2185,78 @@ void Table::refresh_accessor_tree() else { m_tombstones = nullptr; } + if (writable) { + while (m_top.size() < top_position_for_interners) + m_top.add(0); + } + if (m_top.size() > top_position_for_interners) { + if (m_top.get_as_ref(top_position_for_interners)) + m_interner_data.init_from_parent(); + else + m_interner_data.detach(); + } refresh_content_version(); bump_storage_version(); build_column_mapping(); + refresh_string_interners(writable); refresh_index_accessors(); } +void Table::refresh_string_interners(bool writable) +{ + if (writable) { + // if we're in a write transaction, make sure interner arrays are created which will allow + // string interners to expand with their own data when "learning" + while (m_top.size() <= top_position_for_interners) { + m_top.add(0); + } + } + if (m_top.size() > top_position_for_interners && m_top.get_as_ref(top_position_for_interners)) + m_interner_data.update_from_parent(); + else + m_interner_data.detach(); + if (writable) { + if (!m_interner_data.is_attached()) { + m_interner_data.create(NodeHeader::type_HasRefs); + m_interner_data.update_parent(); + } + } + // bring string interners in line with underlying data. + // Precondition: we rely on the col keys in m_leaf_ndx2colkey[] being up to date. 
+ for (size_t idx = 0; idx < m_leaf_ndx2colkey.size(); ++idx) { + auto col_key = m_leaf_ndx2colkey[idx]; + if (col_key == ColKey()) { + // deleted column, we really don't want a string interner for this + if (idx < m_string_interners.size() && m_string_interners[idx]) + m_string_interners[idx].reset(); + continue; + } + REALM_ASSERT_DEBUG(col_key.get_index().val == idx); + // maintain sufficient size of interner arrays to cover all columns + while (idx >= m_string_interners.size()) { + m_string_interners.push_back({}); + } + while (writable && idx >= m_interner_data.size()) { // m_interner_data.is_attached() per above + m_interner_data.add(0); + } + if (m_string_interners[idx]) { + // existing interner + m_string_interners[idx]->update_from_parent(writable); + } + else { + // new interner. Note: if not in a writable state, the interner will not have a valid + // underlying data array. The interner will be set in a state, where it cannot "learn", + // and searches will not find any matching interned strings. + m_string_interners[idx] = std::make_unique(m_alloc, m_interner_data, col_key, writable); + } + } + if (m_string_interners.size() > m_leaf_ndx2colkey.size()) { + // remove any string interners which are no longer reachable, + // e.g. 
after a rollback + m_string_interners.resize(m_leaf_ndx2colkey.size()); + } +} + void Table::refresh_index_accessors() { // Refresh search index accessors @@ -3407,3 +3524,12 @@ void Table::typed_print(std::string prefix, ref_type ref) const } std::cout << prefix << "}" << std::endl; } + +StringInterner* Table::get_string_interner(ColKey col_key) const +{ + auto idx = col_key.get_index().val; + REALM_ASSERT(idx < m_string_interners.size()); + auto interner = m_string_interners[idx].get(); + REALM_ASSERT(interner); + return interner; +} diff --git a/src/realm/table.hpp b/src/realm/table.hpp index 0830d7c733f..1f02e0540ac 100644 --- a/src/realm/table.hpp +++ b/src/realm/table.hpp @@ -573,7 +573,7 @@ class Table { ColKey::Idx spec_ndx2leaf_ndx(size_t idx) const; ColKey leaf_ndx2colkey(ColKey::Idx idx) const; ColKey spec_ndx2colkey(size_t ndx) const; - + StringInterner* get_string_interner(ColKey col_key) const; // Queries // Using where(tv) is the new method to perform queries on TableView. The 'tv' can have any order; it does not // need to be sorted, and, resulting view retains its order. @@ -737,6 +737,7 @@ class Table { Array m_index_refs; // 5th slot in m_top Array m_opposite_table; // 7th slot in m_top Array m_opposite_column; // 8th slot in m_top + Array m_interner_data; // 14th slot in m_top std::vector> m_index_accessors; ColKey m_primary_key_col; Replication* const* m_repl; @@ -848,8 +849,9 @@ class Table { /// Refresh the part of the accessor tree that is rooted at this /// table. 
- void refresh_accessor_tree(); + void refresh_accessor_tree(bool writable); void refresh_index_accessors(); + void refresh_string_interners(bool writable); void refresh_content_version(); void flush_for_commit(); @@ -861,6 +863,7 @@ class Table { std::vector m_leaf_ndx2colkey; std::vector m_spec_ndx2leaf_ndx; std::vector m_leaf_ndx2spec_ndx; + mutable std::vector> m_string_interners; Type m_table_type = Type::TopLevel; uint64_t m_in_file_version_at_transaction_boundary = 0; AtomicLifeCycleCookie m_cookie; @@ -880,7 +883,8 @@ class Table { static constexpr int top_position_for_flags = 12; // flags contents: bit 0-1 - table type static constexpr int top_position_for_tombstones = 13; - static constexpr int top_array_size = 14; + static constexpr int top_position_for_interners = 14; + static constexpr int top_array_size = 15; enum { s_collision_map_lo = 0, s_collision_map_hi = 1, s_collision_map_local_id = 2, s_collision_map_num_slots }; @@ -1413,6 +1417,11 @@ class _impl::TableFriend { return table.m_spec; } + static StringInterner* get_string_interner(const Table& table, ColKey col_key) + { + return table.get_string_interner(col_key); + } + static TableRef get_opposite_link_table(const Table& table, ColKey col_key); static Group* get_parent_group(const Table& table) noexcept diff --git a/src/realm/transaction.hpp b/src/realm/transaction.hpp index 4da316c0d2e..e4db3c8a586 100644 --- a/src/realm/transaction.hpp +++ b/src/realm/transaction.hpp @@ -217,6 +217,7 @@ class Transaction : public Group { friend class DB; friend class DisableReplication; + friend class Table; }; /* diff --git a/src/realm/utilities.hpp b/src/realm/utilities.hpp index fc3a9c5bd1a..c5e698436b6 100644 --- a/src/realm/utilities.hpp +++ b/src/realm/utilities.hpp @@ -69,8 +69,8 @@ typedef SSIZE_T ssize_t; #if defined(REALM_PTR_64) && defined(REALM_X86_OR_X64) && !REALM_WATCHOS -#define REALM_COMPILER_SSE // Compiler supports SSE 4.2 through __builtin_ accessors or back-end assembler -#define 
REALM_COMPILER_AVX +// #define REALM_COMPILER_SSE // Compiler supports SSE 4.2 through __builtin_ accessors or back-end assembler +// #define REALM_COMPILER_AVX #endif namespace realm { diff --git a/test/test_shared.cpp b/test/test_shared.cpp index 85c3de4f8ab..ff32519f0ae 100644 --- a/test/test_shared.cpp +++ b/test/test_shared.cpp @@ -2289,6 +2289,89 @@ TEST(Shared_EncryptionPageReadFailure) #endif // REALM_ENABLE_ENCRYPTION +TEST(Shared_MaxStrings) +{ + SHARED_GROUP_TEST_PATH(path); + DBRef sg = get_test_db(path); + auto trans = sg->start_write(); + auto t = trans->add_table("MyTable"); + ColKey ck = t->add_column(type_String, "MyStrings"); + std::string str_a(16 * 1024 * 1024 - 257, 'a'); + std::string str_b(16 * 1024 * 1024 - 257, 'b'); + // make it harder to compress: + for (auto& e : str_a) { + e = std::rand() % 256; + } + for (auto& e : str_b) { + e = std::rand() % 256; + } + auto o = t->create_object(); + o.set(ck, str_a); + trans->commit_and_continue_as_read(); + auto v = o.get(ck); + CHECK_EQUAL(str_a, v); + trans->promote_to_write(); + auto o2 = t->create_object(); + o2.set(ck, str_b); + trans->commit_and_continue_as_read(); + v = o.get(ck); + auto v2 = o2.get(ck); + CHECK_EQUAL(v, str_a); + CHECK_EQUAL(v2, str_b); + trans->close(); + sg.reset(); +} + +TEST(Shared_RandomMaxStrings) +{ + + SHARED_GROUP_TEST_PATH(path); + DBRef sg = get_test_db(path); + auto trans = sg->start_write(); + auto t = trans->add_table("MyTable"); + ColKey ck = t->add_column(type_String, "MyStrings"); + trans->commit_and_continue_as_read(); + for (int run = 0; run < 10; ++run) { + trans->promote_to_write(); + size_t str_length = std::rand() % (16 * 1024 * 1024 - 257); + std::string str(str_length, 'X'); + for (auto& e : str) { + e = std::rand() % 256; + } + auto o = t->create_object(); + o.set(ck, str); + trans->commit_and_continue_as_read(); + } + trans->close(); +} + +TEST(Shared_RandomSmallStrings) +{ + + SHARED_GROUP_TEST_PATH(path); + DBRef sg = get_test_db(path); + // 
std::cout << "Writing " << path << std::endl; + auto trans = sg->start_write(); + auto t = trans->add_table("MyTable"); + ColKey ck = t->add_column(type_String, "MyStrings"); + trans->commit_and_continue_as_read(); + std::string str(500, 'X'); + // insert a million objects with at most 4000 different strings + for (int run = 0; run < 100; ++run) { + trans->promote_to_write(); + for (int i = 0; i < 1000; ++i) { + // size_t str_length = std::rand() % (1 + 500); + // std::string str(str_length, 'X'); + size_t offset = std::rand() % str.size(); + str[offset] = 'a' + (std::rand() & 0x7); + auto o = t->create_object(); + o.set(ck, str); + } + trans->commit_and_continue_as_read(); + } + trans->close(); +} + TEST(Shared_VersionCount) { SHARED_GROUP_TEST_PATH(path); @@ -2469,6 +2552,7 @@ TEST(Shared_MovingSearchIndex) // Remove the padding column to shift the indexed columns { WriteTransaction wt(sg); + wt.get_group().verify(); TableRef table = wt.get_table("foo"); CHECK(table->has_search_index(int_col)); diff --git a/test/test_unresolved_links.cpp b/test/test_unresolved_links.cpp index 60f50ee3488..b47c68fa313 100644 --- a/test/test_unresolved_links.cpp +++ b/test/test_unresolved_links.cpp @@ -870,6 +870,7 @@ TEST(Unresolved_PerformanceLinkList) tr->commit_and_continue_as_read(); CHECK(t2 > t1); tr->promote_to_write(); + // fails in compressed format because of unsigned/signed interpretation. tr->verify(); } diff --git a/test/test_upgrade_database.cpp b/test/test_upgrade_database.cpp index da76d0542bf..8408a327e01 100644 --- a/test/test_upgrade_database.cpp +++ b/test/test_upgrade_database.cpp @@ -166,6 +166,7 @@ TEST(Upgrade_Disabled) TEST(Upgrade_DatabaseWithUnsupportedOldFileFormat) { + // Not core 6, thus kind is not set. And assetion is triggered. 
std::string path = test_util::get_test_resource_path() + "test_upgrade_database_1000_1.realm"; CHECK_OR_RETURN(File::exists(path)); From 22d15d94ca181e065c4b8ba246a3ad260c0eca32 Mon Sep 17 00:00:00 2001 From: nicola cabiddu Date: Mon, 17 Jun 2024 12:42:19 +0100 Subject: [PATCH 02/14] No Unique PTRs for `string interner` and `string compressor` (#7807) * No unique ptrs for string interner + limit number of interners * point fix client-reset test * code review --- src/realm/array.cpp | 20 --- src/realm/array.hpp | 2 - src/realm/array_string.cpp | 2 +- src/realm/node.hpp | 2 +- src/realm/string_compressor.cpp | 29 ++-- src/realm/string_compressor.hpp | 9 +- src/realm/string_interner.cpp | 192 +++++++++++------------- src/realm/string_interner.hpp | 21 +-- src/realm/table.cpp | 50 +++--- test/object-store/sync/client_reset.cpp | 2 +- 10 files changed, 150 insertions(+), 179 deletions(-) diff --git a/src/realm/array.cpp b/src/realm/array.cpp index b95d081f4d5..7486c8092c9 100644 --- a/src/realm/array.cpp +++ b/src/realm/array.cpp @@ -314,18 +314,6 @@ void Array::destroy_children(size_t offset, bool ro_only) noexcept } } -// size_t Array::get_byte_size() const noexcept -//{ -// const auto header = get_header(); -// auto num_bytes = get_byte_size_from_header(header); -// auto read_only = m_alloc.is_read_only(m_ref) == true; -// auto capacity = get_capacity_from_header(header); -// auto bytes_ok = num_bytes <= capacity; -// REALM_ASSERT(read_only || bytes_ok); -// REALM_ASSERT_7(m_alloc.is_read_only(m_ref), ==, true, ||, num_bytes, <=, get_capacity_from_header(header)); -// return num_bytes; -// } - ref_type Array::do_write_shallow(_impl::ArrayWriterBase& out) const { // here we might want to compress the array and write down. @@ -607,14 +595,6 @@ void Array::do_ensure_minimum_width(int_fast64_t value) } } -size_t Array::size() const noexcept -{ - // in case the array is in compressed format. 
Never read directly - // from the header the size, since it will result very likely in a cache miss. - // For compressed arrays m_size should always be kept updated, due to init_from_mem - return m_size; -} - bool Array::compress_array(Array& arr) const { if (m_integer_compressor.get_encoding() == NodeHeader::Encoding::WTypBits) { diff --git a/src/realm/array.hpp b/src/realm/array.hpp index 0611068bd12..047349846f8 100644 --- a/src/realm/array.hpp +++ b/src/realm/array.hpp @@ -210,8 +210,6 @@ class Array : public Node, public ArrayParent { update_width_cache_from_header(); } - size_t size() const noexcept; - bool is_empty() const noexcept { return size() == 0; diff --git a/src/realm/array_string.cpp b/src/realm/array_string.cpp index cb2aa6fb3f5..8731c99fac9 100644 --- a/src/realm/array_string.cpp +++ b/src/realm/array_string.cpp @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -549,5 +550,4 @@ ref_type ArrayString::write(_impl::ArrayWriterBase& out, StringInterner* interne auto retval = interned.write(out, false, false, out.compress); interned.destroy(); return retval; - // return m_arr->write(out, true, false, false); } diff --git a/src/realm/node.hpp b/src/realm/node.hpp index 21ee61eddde..9b684f25246 100644 --- a/src/realm/node.hpp +++ b/src/realm/node.hpp @@ -21,7 +21,6 @@ #include #include -#include #include @@ -352,6 +351,7 @@ class ArrayWriterBase; } /// Base class for all nodes holding user data +class StringInterner; class ArrayPayload { public: virtual ~ArrayPayload(); diff --git a/src/realm/string_compressor.cpp b/src/realm/string_compressor.cpp index 99dcb50dac5..13de772b6a3 100644 --- a/src/realm/string_compressor.cpp +++ b/src/realm/string_compressor.cpp @@ -17,19 +17,18 @@ **************************************************************************/ #include +#include #include -#include - #include namespace realm { StringCompressor::StringCompressor(Allocator& alloc, Array& parent, size_t index, bool writable) + : m_data(alloc) { 
m_compression_map.resize(16); // start with a very small compression map m_symbols.reserve(65536); - m_data = std::make_unique(alloc); - m_data->set_parent(&parent, index); + m_data.set_parent(&parent, index); refresh(writable); } @@ -37,16 +36,16 @@ void StringCompressor::refresh(bool writable) { // we assume that compressors are only created from a valid parent. // String interners in 'dead' mode should never instantiate a string compressor. - if (m_data->get_ref_from_parent() == 0) { + if (m_data.get_ref_from_parent() == 0) { REALM_ASSERT(writable); - m_data->create(0, 65535); - m_data->update_parent(); + m_data.create(0, 65535); + m_data.update_parent(); } else { - if (m_data->is_attached()) - m_data->update_from_parent(); + if (m_data.is_attached()) + m_data.update_from_parent(); else - m_data->init_from_ref(m_data->get_ref_from_parent()); + m_data.init_from_ref(m_data.get_ref_from_parent()); } rebuild_internal(); } @@ -111,7 +110,7 @@ void StringCompressor::expand_compression_map() void StringCompressor::rebuild_internal() { - auto num_symbols = m_data->size(); + auto num_symbols = m_data.size(); if (num_symbols == m_symbols.size()) return; if (num_symbols < m_symbols.size()) { @@ -132,7 +131,7 @@ void StringCompressor::rebuild_internal() } // we have new symbols to add for (size_t i = m_symbols.size(); i < num_symbols; ++i) { - auto pair = m_data->get(i); + auto pair = m_data.get(i); SymbolDef def; def.id = (CompressionSymbol)(i + 256); def.expansion_a = 0xFFFF & (pair >> 16); @@ -198,13 +197,13 @@ CompressedString StringCompressor::compress(StringData sd, bool learn) if (m_symbols.size() < (65536 - 256) && learn) { // define a new symbol for this entry and use it. 
REALM_ASSERT_DEBUG(m_compression_map[hash].id == 0); - REALM_ASSERT_DEBUG(m_symbols.size() == m_data->size()); - REALM_ASSERT_DEBUG(m_data->is_attached()); + REALM_ASSERT_DEBUG(m_symbols.size() == m_data.size()); + REALM_ASSERT_DEBUG(m_data.is_attached()); CompressionSymbol id = (CompressionSymbol)(256 + m_symbols.size()); SymbolDef def{id, from[0], from[1]}; m_compression_map[hash] = def; add_expansion(def); - m_data->add(((uint64_t)from[0]) << 16 | from[1]); + m_data.add(((uint64_t)from[0]) << 16 | from[1]); // std::cerr << id << " = {" << from[0] << ", " << from[1] << "}" << std::endl; *to++ = id; from += 2; diff --git a/src/realm/string_compressor.hpp b/src/realm/string_compressor.hpp index 2c866ecb781..bd10948e25c 100644 --- a/src/realm/string_compressor.hpp +++ b/src/realm/string_compressor.hpp @@ -19,11 +19,13 @@ #ifndef REALM_STRING_COMPRESSOR_HPP #define REALM_STRING_COMPRESSOR_HPP +#include #include #include using CompressionSymbol = uint16_t; using CompressedString = std::vector; + struct CompressedStringView { CompressionSymbol* data = 0; uint32_t size = 0; @@ -51,11 +53,6 @@ struct CompressedStringView { }; namespace realm { - -class ArrayUnsigned; -class Array; -class Allocator; - class StringCompressor { public: StringCompressor(Allocator& alloc, Array& parent, size_t index, bool writable); @@ -90,7 +87,7 @@ class StringCompressor { std::vector m_symbols; // map from symbol -> symbolpair, 2 elements pr entry std::vector m_compression_map; // perfect hash from symbolpair to its symbol - std::unique_ptr m_data; + ArrayUnsigned m_data; constexpr static size_t storage_chunk_size = 4096; std::vector m_expansion_storage; }; diff --git a/src/realm/string_interner.cpp b/src/realm/string_interner.cpp index fb801b1fd6a..a3e898c8236 100644 --- a/src/realm/string_interner.cpp +++ b/src/realm/string_interner.cpp @@ -17,9 +17,8 @@ **************************************************************************/ #include +#include #include - -#include #include namespace 
realm { @@ -100,7 +99,7 @@ struct HashMapIter { // Attempt to build a hash leaf from a smaller hash leaf or a non-hash leaf. static bool rehash(Array& from, Array& to, uint8_t hash_size) { - REALM_ASSERT_DEBUG(from.size() * 2 == to.size()); + REALM_ASSERT_DEBUG(from.size() * 2 <= to.size()); for (size_t i = 0; i < from.size(); ++i) { auto entry = (size_t)from.get(i); @@ -271,48 +270,47 @@ struct StringInterner::DataLeaf { StringInterner::StringInterner(Allocator& alloc, Array& parent, ColKey col_key, bool writable) : m_parent(parent) + , m_top(alloc) + , m_data(alloc) + , m_hash_map(alloc) + , m_current_string_leaf(alloc) + , m_current_long_string_node(alloc) { REALM_ASSERT_DEBUG(col_key != ColKey()); size_t index = col_key.get_index().val; // ensure that m_top and m_data is well defined and reflect any existing data // We'll have to extend this to handle no defined backing - m_top = std::make_unique(alloc); - m_top->set_parent(&parent, index); - m_data = std::make_unique(alloc); - m_data->set_parent(m_top.get(), Pos_Data); - m_hash_map = std::make_unique(alloc); - m_hash_map->set_parent(m_top.get(), Pos_Map); - m_current_string_leaf = std::make_unique(alloc); + m_top.set_parent(&parent, index); + m_data.set_parent(&m_top, Pos_Data); + m_hash_map.set_parent(&m_top, Pos_Map); m_col_key = col_key; update_from_parent(writable); } void StringInterner::update_from_parent(bool writable) { - auto parent_idx = m_top->get_ndx_in_parent(); + auto parent_idx = m_top.get_ndx_in_parent(); bool valid_top_ref_spot = m_parent.is_attached() && parent_idx < m_parent.size(); bool valid_top = valid_top_ref_spot && m_parent.get_as_ref(parent_idx); if (valid_top) { - m_top->update_from_parent(); - m_data->update_from_parent(); - m_hash_map->update_from_parent(); + m_top.update_from_parent(); + m_data.update_from_parent(); + m_hash_map.update_from_parent(); } else if (writable && valid_top_ref_spot) { - m_top->create(NodeHeader::type_HasRefs, false, Top_Size, 0); - 
m_top->set(Pos_Version, (1 << 1) + 1); // version number 1. - m_top->set(Pos_Size, (0 << 1) + 1); // total size 0 - m_top->set(Pos_ColKey, (m_col_key.value << 1) + 1); - m_top->set(Pos_Compressor, 0); + m_top.create(NodeHeader::type_HasRefs, false, Top_Size, 0); + m_top.set(Pos_Version, (1 << 1) + 1); // version number 1. + m_top.set(Pos_Size, (0 << 1) + 1); // total size 0 + m_top.set(Pos_ColKey, (m_col_key.value << 1) + 1); + m_top.set(Pos_Compressor, 0); + // create first level of data tree here (to simplify other stuff) - m_data = std::make_unique(m_parent.get_alloc()); - m_data->set_parent(m_top.get(), Pos_Data); - m_data->create(NodeHeader::type_HasRefs, false, 0); - m_data->update_parent(); - m_hash_map = std::make_unique(m_parent.get_alloc()); - m_hash_map->set_parent(m_top.get(), Pos_Map); - m_hash_map->create(NodeHeader::type_Normal); - m_hash_map->update_parent(); - m_top->update_parent(); + m_data.create(NodeHeader::type_HasRefs, false, 0); + m_data.update_parent(); + + m_hash_map.create(NodeHeader::type_Normal); + m_hash_map.update_parent(); + m_top.update_parent(); valid_top = true; } if (!valid_top) { @@ -320,44 +318,43 @@ void StringInterner::update_from_parent(bool writable) m_compressor.reset(); m_compressed_leafs.clear(); // m_compressed_string_map.clear(); - m_top->detach(); // <-- indicates "dead" mode - m_data->detach(); - m_hash_map->detach(); + m_top.detach(); + m_data.detach(); + m_hash_map.detach(); m_compressor.reset(); return; } // validate we're accessing data for the correct column. A combination of column erase // and insert could lead to an interner being paired with wrong data in the file. // If so, we clear internal data forcing rebuild_internal() to rebuild from scratch. 
- int64_t data_colkey = m_top->get_as_ref_or_tagged(Pos_ColKey).get_as_int(); + int64_t data_colkey = m_top.get_as_ref_or_tagged(Pos_ColKey).get_as_int(); if (m_col_key.value != data_colkey) { // new column, new data m_compressor.reset(); m_decompressed_strings.clear(); } if (!m_compressor) - m_compressor = std::make_unique(m_top->get_alloc(), *m_top, Pos_Compressor, writable); + m_compressor = std::make_unique(m_top.get_alloc(), m_top, Pos_Compressor, writable); else m_compressor->refresh(writable); - if (m_data->size()) { - auto ref_to_write_buffer = m_data->get_as_ref(m_data->size() - 1); - const char* header = m_top->get_alloc().translate(ref_to_write_buffer); + if (m_data.size()) { + auto ref_to_write_buffer = m_data.get_as_ref(m_data.size() - 1); + const char* header = m_top.get_alloc().translate(ref_to_write_buffer); bool is_array_of_cprs = NodeHeader::get_hasrefs_from_header(header); if (is_array_of_cprs) { - m_current_long_string_node = std::make_unique(m_top->get_alloc()); - m_current_long_string_node->set_parent(m_data.get(), m_data->size() - 1); - m_current_long_string_node->update_from_parent(); + m_current_long_string_node.set_parent(&m_data, m_data.size() - 1); + m_current_long_string_node.update_from_parent(); } else { - m_current_long_string_node.reset(); + m_current_long_string_node.detach(); } } else - m_current_long_string_node.reset(); // just in case... + m_current_long_string_node.detach(); // just in case... // rebuild internal structures...... 
rebuild_internal(); - m_current_string_leaf->detach(); + m_current_string_leaf.detach(); } void StringInterner::rebuild_internal() @@ -382,21 +379,19 @@ void StringInterner::rebuild_internal() } } - size_t target_size = (size_t)m_top->get_as_ref_or_tagged(Pos_Size).get_as_int(); + size_t target_size = (size_t)m_top.get_as_ref_or_tagged(Pos_Size).get_as_int(); m_decompressed_strings.resize(target_size); - if (m_data->size() != m_compressed_leafs.size()) { - m_compressed_leafs.resize(m_data->size()); + if (m_data.size() != m_compressed_leafs.size()) { + m_compressed_leafs.resize(m_data.size()); } - // allways force new setup of all leafs: + // always force new setup of all leafs: // update m_compressed_leafs to reflect m_data for (size_t idx = 0; idx < m_compressed_leafs.size(); ++idx) { - auto ref = m_data->get_as_ref(idx); + auto ref = m_data.get_as_ref(idx); auto& leaf_meta = m_compressed_leafs[idx]; - // if (ref != leaf_meta.m_leaf_ref) { leaf_meta.m_is_loaded = false; leaf_meta.m_compressed.clear(); leaf_meta.m_leaf_ref = ref; - //} } } @@ -404,13 +399,13 @@ StringInterner::~StringInterner() {} StringID StringInterner::intern(StringData sd) { - REALM_ASSERT(m_top->is_attached()); + REALM_ASSERT(m_top.is_attached()); std::lock_guard lock(m_mutex); // special case for null string if (sd.data() == nullptr) return 0; uint32_t h = (uint32_t)sd.hash(); - auto candidates = hash_to_id(*m_hash_map.get(), h, 32); + auto candidates = hash_to_id(m_hash_map, h, 32); for (auto& candidate : candidates) { auto candidate_cpr = get_compressed(candidate); if (m_compressor->compare(sd, candidate_cpr) == 0) @@ -422,56 +417,52 @@ StringID StringInterner::intern(StringData sd) m_decompressed_strings.push_back({64, std::make_unique(sd)}); auto id = m_decompressed_strings.size(); m_in_memory_strings.push_back(id); - add_to_hash_map(*m_hash_map.get(), h, id, 32); - size_t index = (size_t)m_top->get_as_ref_or_tagged(Pos_Size).get_as_int(); + add_to_hash_map(m_hash_map, h, id, 32); + 
size_t index = (size_t)m_top.get_as_ref_or_tagged(Pos_Size).get_as_int(); REALM_ASSERT_DEBUG(index == id - 1); bool need_long_string_node = c_str.size() >= 65536; // TODO: update_internal must set up m_current_long_string_node if it is in use + if (need_long_string_node && !m_current_long_string_node.is_attached()) { + + m_current_long_string_node.create(NodeHeader::type_HasRefs); - if (need_long_string_node && !m_current_long_string_node) { if ((index & 0xFF) == 0) { // if we're starting on a new leaf, extend parent array for it - m_data->add(0); + m_data.add(0); m_compressed_leafs.push_back({}); - m_current_long_string_node = std::make_unique(m_top->get_alloc()); - m_current_long_string_node->set_parent(m_data.get(), m_data->size() - 1); - m_current_long_string_node->create(NodeHeader::type_HasRefs); - m_current_long_string_node->update_parent(); - REALM_ASSERT_DEBUG(!m_current_string_leaf->is_attached() || m_current_string_leaf->size() == 0); - m_current_string_leaf->detach(); + m_current_long_string_node.set_parent(&m_data, m_data.size() - 1); + m_current_long_string_node.update_parent(); + REALM_ASSERT_DEBUG(!m_current_string_leaf.is_attached() || m_current_string_leaf.size() == 0); + m_current_string_leaf.detach(); } else { // we have been building an existing leaf and need to shift representation. 
// but first we need to update leaf accessor for existing leaf - if (m_current_string_leaf->is_attached()) { - m_current_string_leaf->update_from_parent(); + if (m_current_string_leaf.is_attached()) { + m_current_string_leaf.update_from_parent(); } else { - m_current_string_leaf->init_from_ref(m_current_string_leaf->get_ref_from_parent()); + m_current_string_leaf.init_from_ref(m_current_string_leaf.get_ref_from_parent()); } - REALM_ASSERT_DEBUG(m_current_string_leaf->size() > 0); - m_current_long_string_node = std::make_unique(m_top->get_alloc()); - m_current_long_string_node->set_parent(m_data.get(), m_data->size() - 1); - m_current_long_string_node->create(NodeHeader::type_HasRefs); - m_current_long_string_node->update_parent(); + REALM_ASSERT_DEBUG(m_current_string_leaf.size() > 0); + m_current_long_string_node.set_parent(&m_data, m_data.size() - 1); + m_current_long_string_node.update_parent(); // convert the current leaf into a long string node. (array of strings in separate arrays) for (auto s : m_compressed_leafs.back().m_compressed) { - ArrayUnsigned arr(m_top->get_alloc()); + ArrayUnsigned arr(m_top.get_alloc()); arr.create(s.size, 65535); unsigned short* dest = reinterpret_cast(arr.m_data); std::copy_n(s.data, s.size, dest); - m_current_long_string_node->add(arr.get_ref()); + m_current_long_string_node.add(arr.get_ref()); } - m_current_string_leaf->destroy(); - m_current_string_leaf->detach(); + m_current_string_leaf.destroy(); // force later reload of leaf m_compressed_leafs.back().m_is_loaded = false; - // m_compressed_leafs.back().m_leaf_ref = m_data->get_as_ref(m_data->size() - 1); } } - if (m_current_long_string_node) { - ArrayUnsigned arr(m_top->get_alloc()); + if (m_current_long_string_node.is_attached()) { + ArrayUnsigned arr(m_top.get_alloc()); arr.create(c_str.size(), 65535); unsigned short* begin = c_str.data(); if (begin) { @@ -480,11 +471,11 @@ StringID StringInterner::intern(StringData sd) unsigned short* dest = reinterpret_cast(arr.m_data); 
std::copy_n(begin, n, dest); } - m_current_long_string_node->add(arr.get_ref()); - m_current_long_string_node->update_parent(); - if (m_current_long_string_node->size() == 256) { + m_current_long_string_node.add(arr.get_ref()); + m_current_long_string_node.update_parent(); + if (m_current_long_string_node.size() == 256) { // exit from "long string mode" - m_current_long_string_node.reset(); + m_current_long_string_node.detach(); } CompressionSymbol* p_start = reinterpret_cast(arr.m_data); m_compressed_leafs.back().m_compressed.push_back({p_start, arr.size()}); @@ -492,40 +483,40 @@ StringID StringInterner::intern(StringData sd) else { // Append to leaf with up to 256 entries. // First create a new leaf if needed (limit number of entries to 256 pr leaf) - bool need_leaf_update = !m_current_string_leaf->is_attached() || (index & 0xFF) == 0; + bool need_leaf_update = !m_current_string_leaf.is_attached() || (index & 0xFF) == 0; if (need_leaf_update) { - m_current_string_leaf->set_parent(m_data.get(), index >> 8); + m_current_string_leaf.set_parent(&m_data, index >> 8); if ((index & 0xFF) == 0) { // create new leaf - m_current_string_leaf->create(0, 65535); - m_data->add(m_current_string_leaf->get_ref()); + m_current_string_leaf.create(0, 65535); + m_data.add(m_current_string_leaf.get_ref()); m_compressed_leafs.push_back({}); } else { // just setup leaf accessor - if (m_current_string_leaf->is_attached()) { - m_current_string_leaf->update_from_parent(); + if (m_current_string_leaf.is_attached()) { + m_current_string_leaf.update_from_parent(); } else { - m_current_string_leaf->init_from_ref(m_current_string_leaf->get_ref_from_parent()); + m_current_string_leaf.init_from_ref(m_current_string_leaf.get_ref_from_parent()); } } } REALM_ASSERT(c_str.size() < 65535); // Add compressed string at end of leaf - m_current_string_leaf->add(c_str.size()); + m_current_string_leaf.add(c_str.size()); for (auto c : c_str) { - m_current_string_leaf->add(c); + m_current_string_leaf.add(c); 
} REALM_ASSERT_DEBUG(m_compressed_leafs.size()); - CompressionSymbol* p = reinterpret_cast(m_current_string_leaf->m_data); - auto p_limit = p + m_current_string_leaf->size(); + CompressionSymbol* p = reinterpret_cast(m_current_string_leaf.m_data); + auto p_limit = p + m_current_string_leaf.size(); auto p_start = p_limit - c_str.size(); m_compressed_leafs.back().m_compressed.push_back({p_start, c_str.size()}); REALM_ASSERT(m_compressed_leafs.back().m_compressed.size() <= 256); } - m_top->adjust(Pos_Size, 2); // type is has_Refs, so increment is by 2 - load_leaf_if_new_ref(m_compressed_leafs.back(), m_data->get_as_ref(m_data->size() - 1)); + m_top.adjust(Pos_Size, 2); // type is has_Refs, so increment is by 2 + load_leaf_if_new_ref(m_compressed_leafs.back(), m_data.get_as_ref(m_data.size() - 1)); #ifdef REALM_DEBUG auto csv = get_compressed(id); CompressedStringView csv2(c_str); @@ -543,11 +534,11 @@ bool StringInterner::load_leaf_if_needed(DataLeaf& leaf) // must interpret leaf first - the leaf is either a single array holding all strings, // or an array with each (compressed) string placed in its own array. 
- const char* header = m_top->get_alloc().translate(leaf.m_leaf_ref); + const char* header = m_top.get_alloc().translate(leaf.m_leaf_ref); bool is_single_array = !NodeHeader::get_hasrefs_from_header(header); if (is_single_array) { size_t leaf_offset = 0; - ArrayUnsigned leaf_array(m_top->get_alloc()); + ArrayUnsigned leaf_array(m_top.get_alloc()); leaf_array.init_from_ref(leaf.m_leaf_ref); REALM_ASSERT(NodeHeader::get_encoding(leaf_array.get_header()) == NodeHeader::Encoding::WTypBits); REALM_ASSERT(NodeHeader::get_width_from_header(leaf_array.get_header()) == 16); @@ -565,10 +556,10 @@ bool StringInterner::load_leaf_if_needed(DataLeaf& leaf) } else { // Not a single leaf - instead an array of strings - Array arr(m_top->get_alloc()); + Array arr(m_top.get_alloc()); arr.init_from_ref(leaf.m_leaf_ref); for (size_t idx = 0; idx < arr.size(); ++idx) { - ArrayUnsigned str_array(m_top->get_alloc()); + ArrayUnsigned str_array(m_top.get_alloc()); ref_type ref = arr.get_as_ref(idx); str_array.init_from_ref(ref); REALM_ASSERT(NodeHeader::get_encoding(str_array.get_header()) == NodeHeader::Encoding::WTypBits); @@ -607,7 +598,7 @@ CompressedStringView& StringInterner::get_compressed(StringID id) std::optional StringInterner::lookup(StringData sd) { - if (!m_top->is_attached()) { + if (!m_top.is_attached()) { // "dead" mode return {}; } @@ -615,7 +606,7 @@ std::optional StringInterner::lookup(StringData sd) if (sd.data() == nullptr) return 0; uint32_t h = (uint32_t)sd.hash(); - auto candidates = hash_to_id(*m_hash_map.get(), h, 32); + auto candidates = hash_to_id(m_hash_map, h, 32); for (auto& candidate : candidates) { auto candidate_cpr = get_compressed(candidate); if (m_compressor->compare(sd, candidate_cpr) == 0) @@ -667,10 +658,9 @@ StringData StringInterner::get(StringID id) REALM_ASSERT_DEBUG(id <= m_decompressed_strings.size()); CachedString& cs = m_decompressed_strings[id - 1]; if (cs.m_decompressed) { - std::string* ref_str = cs.m_decompressed.get(); if (cs.m_weight < 
128) cs.m_weight += 64; - return {ref_str->c_str(), ref_str->size()}; + return {cs.m_decompressed->c_str(), cs.m_decompressed->size()}; } cs.m_weight = 64; cs.m_decompressed = std::make_unique(m_compressor->decompress(get_compressed(id))); diff --git a/src/realm/string_interner.hpp b/src/realm/string_interner.hpp index 2a36c9e38dc..93c1eec45be 100644 --- a/src/realm/string_interner.hpp +++ b/src/realm/string_interner.hpp @@ -19,24 +19,25 @@ #ifndef REALM_STRING_INTERNER_HPP #define REALM_STRING_INTERNER_HPP +#include #include -#include +#include #include #include #include #include #include +#include +struct CompressedStringView; namespace realm { - using StringID = size_t; -class Array; -class ArrayUnsigned; -class Allocator; +class StringCompressor; + struct CachedString { uint8_t m_weight = 0; std::unique_ptr m_decompressed; @@ -58,22 +59,22 @@ class StringInterner { private: Array& m_parent; // need to be able to check if this is attached or not - std::unique_ptr m_top; + Array m_top; // Compressed strings are stored in blocks of 256. // One array holds refs to all blocks: - std::unique_ptr m_data; + Array m_data; // In-memory representation of a block. Either only the ref to it, // or a full vector of views into the block. struct DataLeaf; // in-memory metadata for faster access to compressed strings. Mirrors m_data. std::vector m_compressed_leafs; // 'm_hash_map' is used for mapping hash of uncompressed string to string id. - std::unique_ptr m_hash_map; + Array m_hash_map; // the block of compressed strings we're currently appending to: - std::unique_ptr m_current_string_leaf; + ArrayUnsigned m_current_string_leaf; // an array of strings we're currently appending to. This is used instead // when ever we meet a string too large to be placed inline. 
- std::unique_ptr m_current_long_string_node; + Array m_current_long_string_node; void rebuild_internal(); CompressedStringView& get_compressed(StringID id); // return true if the leaf was reloaded diff --git a/src/realm/table.cpp b/src/realm/table.cpp index 977339ade0d..210bc049bc3 100644 --- a/src/realm/table.cpp +++ b/src/realm/table.cpp @@ -36,6 +36,7 @@ #include #include #include +#include #include @@ -541,6 +542,7 @@ void Table::remove_column(ColKey col_key) erase_root_column(col_key); // Throws m_has_any_embedded_objects.reset(); auto i = col_key.get_index().val; + if (i < m_string_interners.size() && m_string_interners[i]) m_string_interners[i].reset(); } @@ -1070,19 +1072,19 @@ ColKey Table::do_insert_root_column(ColKey col_key, ColumnType type, StringData if (m_tombstones) { m_tombstones->insert_column(col_key); } - // create string interners internal rep as well as data area - REALM_ASSERT_DEBUG(m_interner_data.is_attached()); - while (col_ndx >= m_string_interners.size()) { - m_string_interners.push_back({}); - } - while (col_ndx >= m_interner_data.size()) { - m_interner_data.add(0); + if (col_key.get_type() == col_type_String || col_key.get_type() == col_type_Mixed) { + // create string interners internal rep as well as data area + REALM_ASSERT_DEBUG(m_interner_data.is_attached()); + while (col_ndx >= m_string_interners.size()) { + m_string_interners.push_back({}); + } + while (col_ndx >= m_interner_data.size()) { + m_interner_data.add(0); + } + REALM_ASSERT(!m_string_interners[col_ndx]); + m_string_interners[col_ndx] = std::make_unique(m_alloc, m_interner_data, col_key, true); } - REALM_ASSERT(!m_string_interners[col_ndx]); - // FIXME: Limit creation of interners to EXACTLY the columns, where they can be - // relevant. 
- // if (col_key.get_type() == col_type_String) - m_string_interners[col_ndx] = std::make_unique(m_alloc, m_interner_data, col_key, true); + bump_storage_version(); return col_key; @@ -1114,16 +1116,16 @@ void Table::do_erase_root_column(ColKey col_key) REALM_ASSERT(m_index_accessors.back() == nullptr); m_index_accessors.pop_back(); } - REALM_ASSERT_DEBUG(col_ndx < m_string_interners.size()); - if (m_string_interners[col_ndx]) { - REALM_ASSERT_DEBUG(m_interner_data.is_attached()); - REALM_ASSERT_DEBUG(col_ndx < m_interner_data.size()); - auto data_ref = m_interner_data.get_as_ref(col_ndx); - if (data_ref) - Array::destroy_deep(data_ref, m_alloc); - m_interner_data.set(col_ndx, 0); - // m_string_interners[col_ndx]->update_from_parent(true); - m_string_interners[col_ndx].reset(); + if (col_key.get_type() == col_type_String || col_key.get_type() == col_type_Mixed) { + if (col_ndx < m_string_interners.size() && m_string_interners[col_ndx]) { + REALM_ASSERT_DEBUG(m_interner_data.is_attached()); + REALM_ASSERT_DEBUG(col_ndx < m_interner_data.size()); + auto data_ref = m_interner_data.get_as_ref(col_ndx); + if (data_ref) + Array::destroy_deep(data_ref, m_alloc); + m_interner_data.set(col_ndx, 0); + m_string_interners[col_ndx].reset(); + } } bump_content_version(); bump_storage_version(); @@ -2231,6 +2233,10 @@ void Table::refresh_string_interners(bool writable) m_string_interners[idx].reset(); continue; } + + if (col_key.get_type() != col_type_String && col_key.get_type() != col_type_Mixed) + continue; + REALM_ASSERT_DEBUG(col_key.get_index().val == idx); // maintain sufficient size of interner arrays to cover all columns while (idx >= m_string_interners.size()) { diff --git a/test/object-store/sync/client_reset.cpp b/test/object-store/sync/client_reset.cpp index e75a61d5bcb..d2bcd6c893a 100644 --- a/test/object-store/sync/client_reset.cpp +++ b/test/object-store/sync/client_reset.cpp @@ -1046,7 +1046,7 @@ TEST_CASE("sync: client reset", "[sync][pbs][client reset][baas]") 
{ realm->cancel_transaction(); return value == 6; }, - std::chrono::seconds(20)); + std::chrono::seconds(20), std::chrono::milliseconds(500)); } auto session = test_app_session.sync_manager()->get_existing_session(local_config.path); if (session) { From 8f1d4722783abc96595f4a5422c38b4f42bcd620 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B8rgen=20Edelbo?= Date: Mon, 17 Jun 2024 15:02:17 +0200 Subject: [PATCH 03/14] Fix compilation --- src/realm/db.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/realm/db.hpp b/src/realm/db.hpp index 7613a4c367b..e46ba6742c3 100644 --- a/src/realm/db.hpp +++ b/src/realm/db.hpp @@ -510,8 +510,6 @@ class DB : public std::enable_shared_from_this { std::shared_ptr m_logger; std::mutex m_commit_listener_mutex; std::vector m_commit_listeners; - std::unordered_map*> m_string_interners; - std::mutex m_string_interners_mutex; bool m_is_sync_agent = false; // Id for this DB to be used in logging. We will just use some bits from the pointer. // The path cannot be used as this would not allow us to distinguish between two DBs opening From fc3111735aac2b53ea9a27052fd906aeee5228c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B8rgen=20Edelbo?= Date: Wed, 19 Jun 2024 11:34:12 +0200 Subject: [PATCH 04/14] RCORE-2162: Add compression of strings in Mixed, Lst and Dictionary (#7804) --- src/realm/array_backlink.cpp | 4 +- src/realm/array_mixed.cpp | 76 +++++++++++++------------ src/realm/array_mixed.hpp | 9 +++ src/realm/array_string.cpp | 46 +++++++++++---- src/realm/array_string.hpp | 6 +- src/realm/bplustree.hpp | 27 ++++++++- src/realm/cluster.cpp | 35 ++++-------- src/realm/cluster_tree.cpp | 2 +- src/realm/collection.cpp | 3 + src/realm/collection.hpp | 8 ++- src/realm/dictionary.cpp | 7 ++- src/realm/impl/array_writer.hpp | 2 + src/realm/list.cpp | 6 +- src/realm/list.hpp | 4 ++ src/realm/obj.cpp | 68 +++++++++++++--------- src/realm/query_expression.hpp | 18 ++++-- src/realm/set.hpp | 3 + src/realm/table.cpp | 40 ++++++------- 
src/realm/table.hpp | 11 ++-- src/realm/to_json.cpp | 4 +- test/object-store/sync/client_reset.cpp | 9 ++- test/test_lang_bind_helper.cpp | 4 +- test/test_list.cpp | 10 +++- test/test_query.cpp | 1 + 24 files changed, 246 insertions(+), 157 deletions(-) diff --git a/src/realm/array_backlink.cpp b/src/realm/array_backlink.cpp index bf4cfddb8da..4190a648a1b 100644 --- a/src/realm/array_backlink.cpp +++ b/src/realm/array_backlink.cpp @@ -225,12 +225,12 @@ void ArrayBacklink::verify() const REALM_ASSERT(src_obj.get(src_col_key).get_link() == target_link); } else if (val.is_type(type_List)) { - DummyParent parent(src_table, val.get_ref()); + DummyParent parent(src_table, val.get_ref(), src_col_key); Lst list(parent, 0); REALM_ASSERT(list.find_any(target_link) != npos); } else if (val.is_type(type_Dictionary)) { - DummyParent parent(src_table, val.get_ref()); + DummyParent parent(src_table, val.get_ref(), src_col_key); Dictionary dict(parent, 0); REALM_ASSERT(dict.find_any(target_link) != npos); } diff --git a/src/realm/array_mixed.cpp b/src/realm/array_mixed.cpp index 7d00991ad5b..9a8a14d93f7 100644 --- a/src/realm/array_mixed.cpp +++ b/src/realm/array_mixed.cpp @@ -360,9 +360,8 @@ ref_type ArrayMixed::typed_write(ref_type top_ref, _impl::ArrayWriterBase& out, 2. int and pair int arrays, they are used for storing integers, timestamps, floats, doubles, decimals, links. In general we can compress them, but we need to be careful, controlling the col_type should prevent compressing data that we want to leave in the current format. - 3. string array is for strings and binary data (no compression for now) - 4. ref array is actually storing refs to collections. they can only be BPlusTree or - BPlusTree. + 3. string array is for strings and binary data + 4. ref array is actually storing refs to collections. They can only be Lst or Dictionary. 5. 
key array stores unique identifiers for collections in mixed (integers that can be compressed) */ Array composite(alloc); @@ -372,41 +371,48 @@ ref_type ArrayMixed::typed_write(ref_type top_ref, _impl::ArrayWriterBase& out, auto ref = top.get(i); ref_type new_ref = ref; if (ref && !(out.only_modified && alloc.is_read_only(ref))) { - if (i < 3) { // int, and pair_int - // integer arrays - new_ref = Array::write(ref, alloc, out, out.only_modified, out.compress); - } - else if (i == 4) { // collection in mixed - ArrayRef arr_ref(alloc); - arr_ref.init_from_ref(ref); - auto ref_sz = arr_ref.size(); - TempArray written_ref_leaf(ref_sz); - - for (size_t k = 0; k < ref_sz; k++) { - ref_type new_sub_ref = 0; - if (auto sub_ref = arr_ref.get(k)) { - auto header = alloc.translate(sub_ref); - // Now we have to find out if the nested collection is a - // dictionary or a list. If the top array has a size of 2 - // and it is not a BplusTree inner node, then it is a dictionary - if (NodeHeader::get_size_from_header(header) == 2 && - !NodeHeader::get_is_inner_bptree_node_from_header(header)) { - new_sub_ref = Dictionary::typed_write(sub_ref, out, alloc); - } - else { - new_sub_ref = BPlusTree::typed_write(sub_ref, out, alloc); + switch (i) { + case payload_idx_int: + // integer array + new_ref = Array::write(ref, alloc, out, out.only_modified, out.compress); + break; + case payload_idx_pair: + // integer array + new_ref = Array::write(ref, alloc, out, out.only_modified, out.compress); + break; + case payload_idx_str: + new_ref = ArrayString::typed_write(ref, out, alloc); + break; + case payload_idx_ref: { + // collection in mixed + ArrayRef arr_ref(alloc); + arr_ref.init_from_ref(ref); + auto ref_sz = arr_ref.size(); + TempArray written_ref_leaf(ref_sz); + + for (size_t k = 0; k < ref_sz; k++) { + ref_type new_sub_ref = 0; + if (auto sub_ref = arr_ref.get(k)) { + auto header = alloc.translate(sub_ref); + // Now we have to find out if the nested collection is a + // dictionary or a 
list. If the top array has a size of 2 + // and it is not a BplusTree inner node, then it is a dictionary + if (NodeHeader::get_size_from_header(header) == 2 && + !NodeHeader::get_is_inner_bptree_node_from_header(header)) { + new_sub_ref = Dictionary::typed_write(sub_ref, out, alloc); + } + else { + new_sub_ref = BPlusTree::typed_write(sub_ref, out, alloc); + } } + written_ref_leaf.set_as_ref(k, new_sub_ref); } - written_ref_leaf.set_as_ref(k, new_sub_ref); + new_ref = written_ref_leaf.write(out); + break; } - new_ref = written_ref_leaf.write(out); - } - else if (i == 5) { // unique keys associated to collections in mixed - new_ref = Array::write(ref, alloc, out, out.only_modified, out.compress); - } - else { - // all the rest we don't want to compress it, at least for now (strings will be needed) - new_ref = Array::write(ref, alloc, out, out.only_modified, false); + case payload_idx_key: + new_ref = Array::write(ref, alloc, out, out.only_modified, out.compress); + break; } } written_leaf.set(i, new_ref); diff --git a/src/realm/array_mixed.hpp b/src/realm/array_mixed.hpp index a0de93b8339..7fc544bc870 100644 --- a/src/realm/array_mixed.hpp +++ b/src/realm/array_mixed.hpp @@ -64,6 +64,15 @@ class ArrayMixed : public ArrayPayload, private Array { { Array::set_parent(parent, ndx_in_parent); } + bool need_string_interner() const override + { + return true; + } + virtual void set_string_interner(StringInterner* interner) const override + { + m_strings.set_string_interner(interner); + } + void init_from_parent() { ref_type ref = get_ref_from_parent(); diff --git a/src/realm/array_string.cpp b/src/realm/array_string.cpp index 8731c99fac9..72df74c524f 100644 --- a/src/realm/array_string.cpp +++ b/src/realm/array_string.cpp @@ -18,8 +18,8 @@ #include #include +#include #include -#include #include using namespace realm; @@ -537,17 +537,39 @@ void ArrayString::verify() const #endif } -ref_type ArrayString::write(_impl::ArrayWriterBase& out, StringInterner* interner) +template 
<> +ref_type ArrayString::typed_write(ref_type ref, _impl::ArrayWriterBase& out, Allocator& alloc) { - REALM_ASSERT(interner); - // we have to write out all, modified or not, to match the total cleanup - Array interned(Allocator::get_default()); - auto sz = size(); - interned.create(NodeHeader::type_Normal, true, sz); - for (size_t i = 0; i < sz; ++i) { - interned.set(i, interner->intern(get(i))); + Array leaf(alloc); + leaf.init_from_ref(ref); + ref_type ret_val; + auto header = leaf.get_header(); + if (NodeHeader::get_hasrefs_from_header(header) || + NodeHeader::get_wtype_from_header(header) == NodeHeader::wtype_Multiply) { + // We're interning these strings + ArrayString as(alloc); + as.init_from_ref(ref); + StringInterner* interner = out.table->get_string_interner(out.col_key); + auto sz = as.size(); + Array interned(Allocator::get_default()); + interned.create(NodeHeader::type_Normal, true, sz); + for (size_t i = 0; i < sz; ++i) { + interned.set(i, interner->intern(as.get(i))); + } + ret_val = interned.write(out, false, false, out.compress); + interned.destroy(); + // in a transactional setting: + // Destroy all sub-arrays if present, in order to release memory in file + // This is contrary to the rest of the handling in this function, but needed + // here since sub-arrays may not have been COW'ed and therefore not freed in file. + // We rely on 'only_modified' to indicate that we're in a transactional setting. 
+ if (out.only_modified) + leaf.destroy_deep(true); + } + else { + // whether it's the old enum strings or the new interned strings, + // just write out the array using integer leaf compression + ret_val = leaf.write(out, false, out.only_modified, out.compress); } - auto retval = interned.write(out, false, false, out.compress); - interned.destroy(); - return retval; + return ret_val; } diff --git a/src/realm/array_string.hpp b/src/realm/array_string.hpp index df121c50b2c..6c8400c4055 100644 --- a/src/realm/array_string.hpp +++ b/src/realm/array_string.hpp @@ -126,10 +126,8 @@ class ArrayString : public ArrayPayload { static StringData get(const char* header, size_t ndx, Allocator& alloc) noexcept; void verify() const; - // Write to 'out', if needed using 'interner' to intern any strings. - // An interner of 0 will disable interning. Interned values may be further - // compressed using leaf compression for integer arrays. - ref_type write(_impl::ArrayWriterBase& out, StringInterner* interner); + template + static ref_type typed_write(ref_type ref, T& out, Allocator& alloc); private: static constexpr size_t small_string_max_size = 15; // ArrayStringShort diff --git a/src/realm/bplustree.hpp b/src/realm/bplustree.hpp index 1f78d32ac26..e5ef8cac8db 100644 --- a/src/realm/bplustree.hpp +++ b/src/realm/bplustree.hpp @@ -30,6 +30,7 @@ namespace realm { class BPlusTreeBase; class BPlusTreeInner; +class StringInterner; /*****************************************************************************/ /* BPlusTreeNode */ @@ -207,6 +208,16 @@ class BPlusTreeBase { m_root->bp_set_parent(parent, ndx_in_parent); } + void set_interner(StringInterner* interner) + { + m_interner = interner; + } + + StringInterner* get_interner() + { + return m_interner; + } + virtual void erase(size_t) = 0; virtual void clear() = 0; virtual void swap(size_t, size_t) = 0; @@ -234,6 +245,7 @@ class BPlusTreeBase { std::unique_ptr m_root; Allocator& m_alloc; ArrayParent* m_parent = nullptr; + 
StringInterner* m_interner = nullptr; size_t m_ndx_in_parent = 0; size_t m_size = 0; size_t m_cached_leaf_begin; @@ -300,6 +312,9 @@ class BPlusTree : public BPlusTreeBase { void init_from_ref(ref_type ref) noexcept override { LeafArray::init_from_ref(ref); + if constexpr (realm::is_any_v) { + LeafArray::set_string_interner(m_tree->get_interner()); + } } ref_type get_ref() const override @@ -574,19 +589,25 @@ class BPlusTree : public BPlusTreeBase { std::unique_ptr create_leaf_node() override { - std::unique_ptr leaf = std::make_unique(this); - static_cast(leaf.get())->create(); + auto leaf = std::make_unique(this); + leaf->create(); + if constexpr (realm::is_any_v) { + leaf->set_string_interner(m_interner); + } return leaf; } std::unique_ptr init_leaf_node(ref_type ref) override { - std::unique_ptr leaf = std::make_unique(this); + auto leaf = std::make_unique(this); leaf->init_from_ref(ref); return leaf; } BPlusTreeLeaf* cache_leaf(MemRef mem) override { m_leaf_cache.init_from_mem(mem); + if constexpr (realm::is_any_v) { + m_leaf_cache.LeafArray::set_string_interner(m_interner); + } return &m_leaf_cache; } void replace_root(std::unique_ptr new_root) override diff --git a/src/realm/cluster.cpp b/src/realm/cluster.cpp index 75deb0707c2..6edec52a9e2 100644 --- a/src/realm/cluster.cpp +++ b/src/realm/cluster.cpp @@ -261,6 +261,12 @@ inline void Cluster::set_string_interner(ArrayString& arr, ColKey col_key) const m_tree_top.set_string_interner(arr, col_key); } +template <> +inline void Cluster::set_string_interner(ArrayMixed& arr, ColKey col_key) const +{ + m_tree_top.set_string_interner(arr, col_key); +} + template inline void Cluster::set_spec(T&, ColKey::Idx) const { @@ -314,6 +320,7 @@ inline void Cluster::do_insert_mixed(size_t ndx, ColKey col_key, Mixed init_valu { ArrayMixed arr(m_alloc); arr.set_parent(this, col_key.get_index().val + s_first_col_index); + set_string_interner(arr, col_key); arr.init_from_parent(); arr.insert(ndx, init_value); @@ -798,6 +805,7 @@ 
inline void Cluster::do_erase_mixed(size_t ndx, ColKey col_key, ObjKey key, Casc ArrayMixed values(m_alloc); values.set_parent(this, col_ndx.val + s_first_col_index); + set_string_interner(values, col_key); values.init_from_parent(); Mixed value = values.get(ndx); @@ -1447,6 +1455,7 @@ void Cluster::dump_objects(int64_t key_offset, std::string lead) const } case col_type_Mixed: { ArrayMixed arr(m_alloc); + set_string_interner(arr, col); ref_type ref = Array::get_as_ref(j); arr.init_from_ref(ref); std::cout << ", " << arr.get(i); @@ -1651,32 +1660,8 @@ ref_type Cluster::typed_write(ref_type ref, _impl::ArrayWriterBase& out) const else { // Columns auto col_key = out.table->m_leaf_ndx2colkey[j - 1]; + out.col_key = col_key; auto col_type = col_key.get_type(); - // String columns are interned at this point - if (out.compress && col_type == col_type_String && !col_key.is_collection()) { - ArrayRef leaf(m_alloc); - leaf.init_from_ref(ref); - auto header = leaf.get_header(); - if (NodeHeader::get_hasrefs_from_header(header) || - NodeHeader::get_wtype_from_header(header) == wtype_Multiply) { - // We're interning these strings - ArrayString as(m_alloc); - as.init_from_ref(leaf_rot.get_as_ref()); - written_cluster.set_as_ref(j, as.write(out, out.table->get_string_interner(col_key))); - // in a transactional setting: - // Destroy all sub-arrays if present, in order to release memory in file - // This is contrary to the rest of the handling in this function, but needed - // here since sub-arrays may not have been COW'ed and therefore not freed in file. - // We rely on 'only_modified' to indicate that we're in a transactional setting. 
- if (only_modified) - leaf.destroy_deep(true); - continue; - } - // whether it's the old enum strings or the new interned strings, - // just write out the array using integer leaf compression - written_cluster.set_as_ref(j, leaf.write(out, false, false, false)); - continue; - } if (col_key.is_collection()) { ArrayRef arr_ref(m_alloc); arr_ref.init_from_ref(ref); diff --git a/src/realm/cluster_tree.cpp b/src/realm/cluster_tree.cpp index 3021f684911..5821a9f465b 100644 --- a/src/realm/cluster_tree.cpp +++ b/src/realm/cluster_tree.cpp @@ -1140,7 +1140,7 @@ void ClusterTree::set_string_interner(ArrayPayload& arr, ColKey col_key) const // Check for owner. This function may be called in context of DictionaryClusterTree // in which case m_owner is null (and spec never needed). if (m_owner) { - arr.set_string_interner(_impl::TableFriend::get_string_interner(*m_owner, col_key)); + arr.set_string_interner(m_owner->get_string_interner(col_key)); } } diff --git a/src/realm/collection.cpp b/src/realm/collection.cpp index f0eabf95d46..24d261622a6 100644 --- a/src/realm/collection.cpp +++ b/src/realm/collection.cpp @@ -155,6 +155,7 @@ void Collection::get_any(QueryCtrlBlock& ctrl, Mixed val, size_t index) BPlusTree keys(*ctrl.alloc); keys.set_parent(&top, 0); + keys.set_interner(ctrl.interner); keys.init_from_parent(); size_t start = 0; if (size_t finish = keys.size()) { @@ -177,6 +178,7 @@ void Collection::get_any(QueryCtrlBlock& ctrl, Mixed val, size_t index) } BPlusTree values(*ctrl.alloc); values.set_parent(&top, 1); + values.set_interner(ctrl.interner); values.init_from_parent(); for (; start < finish; start++) { val = values.get(start); @@ -194,6 +196,7 @@ void Collection::get_any(QueryCtrlBlock& ctrl, Mixed val, size_t index) if (!ref) return; BPlusTree list(*ctrl.alloc); + list.set_interner(ctrl.interner); list.init_from_ref(ref); if (size_t sz = list.size()) { size_t start = 0; diff --git a/src/realm/collection.hpp b/src/realm/collection.hpp index 
e26158a63ee..bc2d7f6b99d 100644 --- a/src/realm/collection.hpp +++ b/src/realm/collection.hpp @@ -12,15 +12,17 @@ namespace realm { +class StringInterner; template struct CollectionIterator; // Used in Cluster when removing owning object class DummyParent : public CollectionParent { public: - DummyParent(TableRef t, ref_type ref) + DummyParent(TableRef t, ref_type ref, ColKey ck) : m_obj(t, MemRef(), ObjKey(), 0) , m_ref(ref) + , m_col_key(ck) { } FullPath get_path() const noexcept final @@ -37,7 +39,7 @@ class DummyParent : public CollectionParent { } ColKey get_col_key() const noexcept final { - return {}; + return m_col_key; } void add_index(Path&, const Index&) const noexcept final {} size_t find_index(const Index&) const noexcept final @@ -62,6 +64,7 @@ class DummyParent : public CollectionParent { protected: Obj m_obj; ref_type m_ref; + ColKey m_col_key; UpdateStatus update_if_needed() const final { return UpdateStatus::Updated; @@ -111,6 +114,7 @@ class Collection { bool path_only_unary_keys = false; // Not from list Allocator* alloc = nullptr; Group* group = nullptr; + StringInterner* interner = nullptr; }; static void get_any(QueryCtrlBlock&, Mixed, size_t); }; diff --git a/src/realm/dictionary.cpp b/src/realm/dictionary.cpp index 7c9f3b76801..03d1d5be024 100644 --- a/src/realm/dictionary.cpp +++ b/src/realm/dictionary.cpp @@ -850,9 +850,11 @@ UpdateStatus Dictionary::init_from_parent(bool allow_create) const Allocator& alloc = get_alloc(); m_dictionary_top.reset(new Array(alloc)); m_dictionary_top->set_parent(const_cast(this), 0); + StringInterner* interner = m_col_key ? 
get_table()->get_string_interner(m_col_key) : nullptr; switch (m_key_type) { case type_String: { m_keys.reset(new BPlusTree(alloc)); + m_keys->set_interner(interner); break; } case type_Int: { @@ -865,6 +867,7 @@ UpdateStatus Dictionary::init_from_parent(bool allow_create) const m_keys->set_parent(m_dictionary_top.get(), 0); m_values.reset(new BPlusTreeMixed(alloc)); m_values->set_parent(m_dictionary_top.get(), 1); + m_values->set_interner(interner); } if (ref) { @@ -1153,12 +1156,12 @@ void Dictionary::to_json(std::ostream& out, JSONOutputMode output_mode, fn(val); } else if (val.is_type(type_Dictionary)) { - DummyParent parent(this->get_table(), val.get_ref()); + DummyParent parent(this->get_table(), val.get_ref(), m_col_key); Dictionary dict(parent, 0); dict.to_json(out, output_mode, fn); } else if (val.is_type(type_List)) { - DummyParent parent(this->get_table(), val.get_ref()); + DummyParent parent(this->get_table(), val.get_ref(), m_col_key); Lst list(parent, 0); list.to_json(out, output_mode, fn); } diff --git a/src/realm/impl/array_writer.hpp b/src/realm/impl/array_writer.hpp index 4096805e0fa..c6a7e18e413 100644 --- a/src/realm/impl/array_writer.hpp +++ b/src/realm/impl/array_writer.hpp @@ -20,6 +20,7 @@ #define REALM_ARRAY_WRITER_HPP #include +#include namespace realm { class Table; @@ -30,6 +31,7 @@ class ArrayWriterBase { bool only_modified = true; bool compress = true; const Table* table; + ColKey col_key; virtual ~ArrayWriterBase() { } diff --git a/src/realm/list.cpp b/src/realm/list.cpp index d3f94bd895a..4d35e033cbf 100644 --- a/src/realm/list.cpp +++ b/src/realm/list.cpp @@ -386,6 +386,8 @@ UpdateStatus Lst::init_from_parent(bool allow_create) const m_tree.reset(new BPlusTreeMixed(get_alloc())); const ArrayParent* parent = this; m_tree->set_parent(const_cast(parent), 0); + if (m_col_key) + m_tree->set_interner(get_table()->get_string_interner(m_col_key)); } try { return do_init_from_parent(m_tree.get(), Base::get_collection_ref(), allow_create); @@ 
-744,12 +746,12 @@ void Lst::to_json(std::ostream& out, JSONOutputMode output_mode, fn(val); } else if (val.is_type(type_Dictionary)) { - DummyParent parent(this->get_table(), val.get_ref()); + DummyParent parent(this->get_table(), val.get_ref(), m_col_key); Dictionary dict(parent, i); dict.to_json(out, output_mode, fn); } else if (val.is_type(type_List)) { - DummyParent parent(this->get_table(), val.get_ref()); + DummyParent parent(this->get_table(), val.get_ref(), m_col_key); Lst list(parent, i); list.to_json(out, output_mode, fn); } diff --git a/src/realm/list.hpp b/src/realm/list.hpp index f0646d2a176..398988e02bc 100644 --- a/src/realm/list.hpp +++ b/src/realm/list.hpp @@ -258,6 +258,10 @@ class Lst final : public CollectionBaseImpl { m_tree.reset(new BPlusTree(get_alloc())); const ArrayParent* parent = this; m_tree->set_parent(const_cast(parent), 0); + if constexpr (realm::is_any_v) { + if (m_col_key) + m_tree->set_interner(get_table()->get_string_interner(m_col_key)); + } } Base::update_content_version(); return do_init_from_parent(m_tree.get(), Base::get_collection_ref(), allow_create); diff --git a/src/realm/obj.cpp b/src/realm/obj.cpp index fc34b755d57..3f3e23b03bf 100644 --- a/src/realm/obj.cpp +++ b/src/realm/obj.cpp @@ -239,13 +239,15 @@ bool Obj::compare_list_in_mixed(Lst& val1, Lst& val2, ColKey ck, O auto m1 = val1.get_any(i); auto m2 = val2.get_any(i); + auto other_table = other.get_table(); + auto other_col_key = other_table->get_column_key(col_name); if (m1.is_type(type_List) && m2.is_type(type_List)) { - DummyParent parent(other.get_table(), m2.get_ref()); + DummyParent parent(other_table, m2.get_ref(), other_col_key); Lst list(parent, 0); return compare_list_in_mixed(*val1.get_list(i), list, ck, other, col_name); } else if (m1.is_type(type_Dictionary) && m2.is_type(type_Dictionary)) { - DummyParent parent(other.get_table(), m2.get_ref()); + DummyParent parent(other_table, m2.get_ref(), other_col_key); Dictionary dict(parent, 0); return 
compare_dict_in_mixed(*val1.get_dictionary(i), dict, ck, other, col_name); } @@ -268,13 +270,15 @@ bool Obj::compare_dict_in_mixed(Dictionary& val1, Dictionary& val2, ColKey ck, O if (k1 != k2) return false; + auto other_table = other.get_table(); + auto other_col_key = other_table->get_column_key(col_name); if (m1.is_type(type_List) && m2.is_type(type_List)) { - DummyParent parent(other.get_table(), m2.get_ref()); + DummyParent parent(other_table, m2.get_ref(), other_col_key); Lst list(parent, 0); return compare_list_in_mixed(*val1.get_list(k1.get_string()), list, ck, other, col_name); } else if (m1.is_type(type_Dictionary) && m2.is_type(type_Dictionary)) { - DummyParent parent(other.get_table(), m2.get_ref()); + DummyParent parent(other_table, m2.get_ref(), other_col_key); Dictionary dict(parent, 0); return compare_dict_in_mixed(*val1.get_dictionary(k1.get_string()), dict, ck, other, col_name); } @@ -495,6 +499,7 @@ Mixed Obj::get_unfiltered_mixed(ColKey::Idx col_ndx) const ArrayMixed values(get_alloc()); ref_type ref = to_ref(Array::get(m_mem.get_addr(), col_ndx.val + 1)); values.init_from_ref(ref); + values.set_string_interner(m_table->get_string_interner(col_ndx)); return values.get(m_row_ndx); } @@ -1168,6 +1173,35 @@ REALM_FORCEINLINE void Obj::sync(Node& arr) } } +// helper functions for filtering out calls to set_string_interner() +template +inline void Obj::set_string_interner(T&, ColKey) +{ +} +template <> +inline void Obj::set_string_interner(ArrayString& values, ColKey col_key) +{ + values.set_string_interner(m_table->get_string_interner(col_key)); +} +template <> +inline void Obj::set_string_interner(ArrayMixed& values, ColKey col_key) +{ + values.set_string_interner(m_table->get_string_interner(col_key)); +} + +// helper functions for filtering out calls to set_spec() +template +inline void Obj::set_spec(T&, ColKey) +{ +} +template <> +inline void Obj::set_spec(ArrayString& values, ColKey col_key) +{ + size_t spec_ndx = 
m_table->colkey2spec_ndx(col_key); + Spec* spec = const_cast(&get_spec()); + values.set_spec(spec, spec_ndx); +} + template <> Obj& Obj::set(ColKey col_key, Mixed value, bool is_default) { @@ -1225,6 +1259,7 @@ Obj& Obj::set(ColKey col_key, Mixed value, bool is_default) REALM_ASSERT(col_ndx.val + 1 < fields.size()); ArrayMixed values(alloc); values.set_parent(&fields, col_ndx.val + 1); + set_string_interner(values, col_key); values.init_from_parent(); values.set(m_row_ndx, value); if (value.is_type(type_Dictionary, type_List)) { @@ -1369,6 +1404,7 @@ Obj& Obj::add_int(ColKey col_key, int64_t value) if (col_key.get_type() == col_type_Mixed) { ArrayMixed values(alloc); values.set_parent(&fields, col_ndx.val + 1); + set_string_interner(values, col_key); values.init_from_parent(); Mixed old = values.get(m_row_ndx); if (old.is_type(type_Int)) { @@ -1603,30 +1639,6 @@ inline void check_range(const BinaryData& val) } } // namespace -// helper functions for filtering out calls to set_string_interner() -template -inline void Obj::set_string_interner(T&, ColKey) -{ -} -template <> -inline void Obj::set_string_interner(ArrayString& values, ColKey col_key) -{ - values.set_string_interner(m_table->get_string_interner(col_key)); -} - -// helper functions for filtering out calls to set_spec() -template -inline void Obj::set_spec(T&, ColKey) -{ -} -template <> -inline void Obj::set_spec(ArrayString& values, ColKey col_key) -{ - size_t spec_ndx = m_table->colkey2spec_ndx(col_key); - Spec* spec = const_cast(&get_spec()); - values.set_spec(spec, spec_ndx); -} - #if REALM_ENABLE_GEOSPATIAL template <> diff --git a/src/realm/query_expression.hpp b/src/realm/query_expression.hpp index 24b43af8c66..33d330c3fe4 100644 --- a/src/realm/query_expression.hpp +++ b/src/realm/query_expression.hpp @@ -1977,7 +1977,7 @@ class SimpleQuerySupport : public ObjPropertyExpr { return TypeOfValueOperator(this->clone()); } -private: +protected: using ObjPropertyExpr::m_link_map; using 
ObjPropertyExpr::m_column_key; @@ -2053,8 +2053,10 @@ class Columns : public SimpleQuerySupport { void set_base_table(ConstTableRef table) override { SimpleQuerySupport::set_base_table(table); - m_ctrl.alloc = &get_link_map().get_target_table()->get_alloc(); + auto target_table = get_link_map().get_target_table(); + m_ctrl.alloc = &target_table->get_alloc(); m_ctrl.group = table->get_parent_group(); + m_ctrl.interner = target_table->get_string_interner(m_column_key); } void evaluate(Subexpr::Index& index, ValueBase& destination) override @@ -2626,12 +2628,12 @@ class SizeOperator : public Subexpr2 { destination.set(i, int64_t(elem.get_string().size())); } else if (elem.is_type(type_List)) { - DummyParent parent(m_expr->get_base_table().cast_away_const(), elem.get_ref()); + DummyParent parent(m_expr->get_base_table().cast_away_const(), elem.get_ref(), ColKey()); Lst list(parent, 0); destination.set(i, int64_t(list.size())); } else if (elem.is_type(type_Dictionary)) { - DummyParent parent(m_expr->get_base_table().cast_away_const(), elem.get_ref()); + DummyParent parent(m_expr->get_base_table().cast_away_const(), elem.get_ref(), ColKey()); Dictionary dict(parent, 0); destination.set(i, int64_t(dict.size())); } @@ -3309,8 +3311,10 @@ class Columns> : public ColumnsCollection { void set_base_table(ConstTableRef table) override { ColumnsCollection::set_base_table(table); - m_ctrl.alloc = &m_link_map.get_target_table()->get_alloc(); + auto target_table = m_link_map.get_target_table(); + m_ctrl.alloc = &target_table->get_alloc(); m_ctrl.group = table->get_parent_group(); + m_ctrl.interner = target_table->get_string_interner(m_column_key); } void evaluate(Subexpr::Index& index, ValueBase& destination) override @@ -3407,8 +3411,10 @@ class Columns : public ColumnsCollection { void set_base_table(ConstTableRef table) override { ColumnsCollection::set_base_table(table); - m_ctrl.alloc = &m_link_map.get_target_table()->get_alloc(); + auto target_table = 
m_link_map.get_target_table(); + m_ctrl.alloc = &target_table->get_alloc(); m_ctrl.group = table->get_parent_group(); + m_ctrl.interner = target_table->get_string_interner(m_column_key); } SizeOperator size() override; std::unique_ptr get_element_length() override diff --git a/src/realm/set.hpp b/src/realm/set.hpp index e3d7fac3d60..dd42bfd26d1 100644 --- a/src/realm/set.hpp +++ b/src/realm/set.hpp @@ -532,6 +532,9 @@ UpdateStatus Set::init_from_parent(bool allow_create) const m_tree.reset(new BPlusTree(get_alloc())); const ArrayParent* parent = this; m_tree->set_parent(const_cast(parent), 0); + if constexpr (realm::is_any_v) { + m_tree->set_interner(get_table()->get_string_interner(m_col_key)); + } } return do_init_from_parent(m_tree.get(), Base::get_collection_ref(), allow_create); } diff --git a/src/realm/table.cpp b/src/realm/table.cpp index 210bc049bc3..be78abe7122 100644 --- a/src/realm/table.cpp +++ b/src/realm/table.cpp @@ -265,6 +265,11 @@ using namespace realm::util; Replication* Table::g_dummy_replication = nullptr; +static inline bool needs_string_interner(ColKey col_key) +{ + return col_key.get_type() == col_type_String || col_key.get_type() == col_type_Mixed || col_key.is_dictionary(); +} + bool TableVersions::operator==(const TableVersions& other) const { if (size() != other.size()) @@ -541,10 +546,6 @@ void Table::remove_column(ColKey col_key) erase_root_column(col_key); // Throws m_has_any_embedded_objects.reset(); - auto i = col_key.get_index().val; - - if (i < m_string_interners.size() && m_string_interners[i]) - m_string_interners[i].reset(); } @@ -1072,7 +1073,7 @@ ColKey Table::do_insert_root_column(ColKey col_key, ColumnType type, StringData if (m_tombstones) { m_tombstones->insert_column(col_key); } - if (col_key.get_type() == col_type_String || col_key.get_type() == col_type_Mixed) { + if (needs_string_interner(col_key)) { // create string interners internal rep as well as data area REALM_ASSERT_DEBUG(m_interner_data.is_attached()); while 
(col_ndx >= m_string_interners.size()) { @@ -1084,7 +1085,6 @@ ColKey Table::do_insert_root_column(ColKey col_key, ColumnType type, StringData REALM_ASSERT(!m_string_interners[col_ndx]); m_string_interners[col_ndx] = std::make_unique(m_alloc, m_interner_data, col_key, true); } - bump_storage_version(); return col_key; @@ -1116,16 +1116,14 @@ void Table::do_erase_root_column(ColKey col_key) REALM_ASSERT(m_index_accessors.back() == nullptr); m_index_accessors.pop_back(); } - if (col_key.get_type() == col_type_String || col_key.get_type() == col_type_Mixed) { - if (col_ndx < m_string_interners.size() && m_string_interners[col_ndx]) { - REALM_ASSERT_DEBUG(m_interner_data.is_attached()); - REALM_ASSERT_DEBUG(col_ndx < m_interner_data.size()); - auto data_ref = m_interner_data.get_as_ref(col_ndx); - if (data_ref) - Array::destroy_deep(data_ref, m_alloc); - m_interner_data.set(col_ndx, 0); - m_string_interners[col_ndx].reset(); - } + if (col_ndx < m_string_interners.size() && m_string_interners[col_ndx]) { + REALM_ASSERT_DEBUG(m_interner_data.is_attached()); + REALM_ASSERT_DEBUG(col_ndx < m_interner_data.size()); + auto data_ref = m_interner_data.get_as_ref(col_ndx); + if (data_ref) + Array::destroy_deep(data_ref, m_alloc); + m_interner_data.set(col_ndx, 0); + m_string_interners[col_ndx].reset(); } bump_content_version(); bump_storage_version(); @@ -2233,8 +2231,7 @@ void Table::refresh_string_interners(bool writable) m_string_interners[idx].reset(); continue; } - - if (col_key.get_type() != col_type_String && col_key.get_type() != col_type_Mixed) + if (!needs_string_interner(col_key)) continue; REALM_ASSERT_DEBUG(col_key.get_index().val == idx); @@ -3531,11 +3528,10 @@ void Table::typed_print(std::string prefix, ref_type ref) const std::cout << prefix << "}" << std::endl; } -StringInterner* Table::get_string_interner(ColKey col_key) const +StringInterner* Table::get_string_interner(ColKey::Idx idx) const { - auto idx = col_key.get_index().val; - REALM_ASSERT(idx < 
m_string_interners.size()); - auto interner = m_string_interners[idx].get(); + REALM_ASSERT(idx.val < m_string_interners.size()); + auto interner = m_string_interners[idx.val].get(); REALM_ASSERT(interner); return interner; } diff --git a/src/realm/table.hpp b/src/realm/table.hpp index 1f02e0540ac..3635d265835 100644 --- a/src/realm/table.hpp +++ b/src/realm/table.hpp @@ -573,7 +573,11 @@ class Table { ColKey::Idx spec_ndx2leaf_ndx(size_t idx) const; ColKey leaf_ndx2colkey(ColKey::Idx idx) const; ColKey spec_ndx2colkey(size_t ndx) const; - StringInterner* get_string_interner(ColKey col_key) const; + StringInterner* get_string_interner(ColKey::Idx idx) const; + StringInterner* get_string_interner(ColKey col_key) const + { + return get_string_interner(col_key.get_index()); + } // Queries // Using where(tv) is the new method to perform queries on TableView. The 'tv' can have any order; it does not // need to be sorted, and, resulting view retains its order. @@ -1417,11 +1421,6 @@ class _impl::TableFriend { return table.m_spec; } - static StringInterner* get_string_interner(const Table& table, ColKey col_key) - { - return table.get_string_interner(col_key); - } - static TableRef get_opposite_link_table(const Table& table, ColKey col_key); static Group* get_parent_group(const Table& table) noexcept diff --git a/src/realm/to_json.cpp b/src/realm/to_json.cpp index e9b9b049d72..ed1a11839d0 100644 --- a/src/realm/to_json.cpp +++ b/src/realm/to_json.cpp @@ -297,12 +297,12 @@ void Obj::to_json(std::ostream& out, JSONOutputMode output_mode) const print_link(val); } else if (val.is_type(type_Dictionary)) { - DummyParent parent(m_table, val.get_ref()); + DummyParent parent(m_table, val.get_ref(), ck); Dictionary dict(parent, 0); dict.to_json(out, output_mode, print_link); } else if (val.is_type(type_List)) { - DummyParent parent(m_table, val.get_ref()); + DummyParent parent(m_table, val.get_ref(), ck); Lst list(parent, 0); list.to_json(out, output_mode, print_link); } diff --git 
a/test/object-store/sync/client_reset.cpp b/test/object-store/sync/client_reset.cpp index d2bcd6c893a..99341a48e08 100644 --- a/test/object-store/sync/client_reset.cpp +++ b/test/object-store/sync/client_reset.cpp @@ -1046,8 +1046,15 @@ TEST_CASE("sync: client reset", "[sync][pbs][client reset][baas]") { realm->cancel_transaction(); return value == 6; }, - std::chrono::seconds(20), std::chrono::milliseconds(500)); + std::chrono::seconds(20)); } + // We can't be sure that the 'after' callback has been called yet + timed_sleeping_wait_for( + [&]() -> bool { + std::lock_guard lock(mtx); + return after_callback_invocations == 1; + }, + std::chrono::milliseconds(20)); auto session = test_app_session.sync_manager()->get_existing_session(local_config.path); if (session) { session->shutdown_and_wait(); diff --git a/test/test_lang_bind_helper.cpp b/test/test_lang_bind_helper.cpp index 7bdd650d225..04467f781d0 100644 --- a/test/test_lang_bind_helper.cpp +++ b/test/test_lang_bind_helper.cpp @@ -521,7 +521,7 @@ TEST(LangBindHelper_AdvanceReadTransact_Basics) rt->verify(); CHECK_EQUAL(0, rt->size()); - // Create a table via the other SharedGroup + // Create a table in a separate transaction ObjKey k0; { WriteTransaction wt(sg); @@ -542,7 +542,7 @@ TEST(LangBindHelper_AdvanceReadTransact_Basics) CHECK_EQUAL(0, foo->get_object(k0).get(cols[0])); uint_fast64_t version = foo->get_content_version(); - // Modify the table via the other SharedGroup + // Modify the table in a separate transaction ObjKey k1; { WriteTransaction wt(sg); diff --git a/test/test_list.cpp b/test/test_list.cpp index d8e3f1fc1de..f3c5dd0b938 100644 --- a/test/test_list.cpp +++ b/test/test_list.cpp @@ -108,9 +108,12 @@ TEST(List_basic) TEST(List_SimpleTypes) { - Group g; + SHARED_GROUP_TEST_PATH(path); + DBRef db = DB::create(make_in_realm_history(), path); + + auto tr = db->start_write(); std::vector lists; - TableRef t = g.add_table("table"); + TableRef t = tr->add_table("table"); ColKey int_col = 
t->add_column_list(type_Int, "integers"); ColKey bool_col = t->add_column_list(type_Bool, "booleans"); ColKey string_col = t->add_column_list(type_String, "strings"); @@ -135,6 +138,9 @@ TEST(List_SimpleTypes) Timestamp(seconds_since_epoc + 60, 0)}; obj.set_list_values(timestamp_col, timestamp_vector); + tr->commit_and_continue_as_read(); + tr->promote_to_write(); + auto int_list = obj.get_list(int_col); lists.push_back(&int_list); std::vector vec(int_list.size()); diff --git a/test/test_query.cpp b/test/test_query.cpp index 226c169d714..ab44c8f7b17 100644 --- a/test/test_query.cpp +++ b/test/test_query.cpp @@ -5541,6 +5541,7 @@ TEST(Query_LinkToDictionary) { Group g; auto target = g.add_table("target"); + target->add_column(type_Int, "dummy"); // Ensure that dict_col get index 1 auto dict_col = target->add_column_dictionary(type_String, "string", true); auto source = g.add_table("source"); auto link_col = source->add_column(*target, "link"); From 499e63031de92e626cc93f5d8d81399e546ceeff Mon Sep 17 00:00:00 2001 From: nicola cabiddu Date: Mon, 8 Jul 2024 18:56:53 +0100 Subject: [PATCH 05/14] RCORE-2170 String compression tests (#7812) --- evergreen/config.yml | 11 +- src/realm/string_interner.cpp | 10 +- test/CMakeLists.txt | 1 + test/test_group.cpp | 171 ++++++++++++++++++++++++++++++ test/test_string_compression.cpp | 173 +++++++++++++++++++++++++++++++ 5 files changed, 356 insertions(+), 10 deletions(-) create mode 100644 test/test_string_compression.cpp diff --git a/evergreen/config.yml b/evergreen/config.yml index c18cd110529..055977f7312 100644 --- a/evergreen/config.yml +++ b/evergreen/config.yml @@ -1863,19 +1863,18 @@ buildvariants: - name: finalize_coverage_data - name: macos-array-compression - display_name: "MacOS 11 arm64 (Compress Arrays)" - run_on: macos-1100-arm64 + display_name: "MacOS 14 arm64 (Compress Arrays)" + run_on: macos-14-arm64 expansions: - cmake_url: 
"https://s3.amazonaws.com/static.realm.io/evergreen-assets/cmake-3.26.3-macos-universal.tar.gz" - cmake_bindir: "./cmake_binaries/CMake.app/Contents/bin" + cmake_bindir: "/opt/homebrew/bin" cmake_toolchain_file: "./tools/cmake/xcode.toolchain.cmake" + cmake_build_tool_options: "-sdk macosx" cmake_generator: Xcode max_jobs: $(sysctl -n hw.logicalcpu) - xcode_developer_dir: /Applications/Xcode13.1.app/Contents/Developer + xcode_developer_dir: /Applications/Xcode15.2.app/Contents/Developer extra_flags: -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_OSX_ARCHITECTURES=arm64 compress: On cmake_build_type: Debug - coveralls_flag_name: "macos-arm64" tasks: - name: compile_test diff --git a/src/realm/string_interner.cpp b/src/realm/string_interner.cpp index a3e898c8236..17dc3663b2d 100644 --- a/src/realm/string_interner.cpp +++ b/src/realm/string_interner.cpp @@ -223,7 +223,7 @@ static std::vector hash_to_id(Array& node, uint32_t hash, uint8_t hash if (!node.has_refs()) { // it's a leaf - default is a list, search starts from index 0. HashMapIter it(node, hash, hash_size); - if (node.size() > hash_node_min_size) { + if (node.size() >= hash_node_min_size) { // it is a hash table, so use hash to select index to start searching // table size must be power of two! size_t index = hash & (node.size() - 1); @@ -590,6 +590,7 @@ CompressedStringView& StringInterner::get_compressed(StringID id) auto index = id - 1; // 0 represents null auto hi = index >> 8; auto lo = index & 0xFFUL; + DataLeaf& leaf = m_compressed_leafs[hi]; load_leaf_if_needed(leaf); REALM_ASSERT_DEBUG(lo < leaf.m_compressed.size()); @@ -618,8 +619,9 @@ std::optional StringInterner::lookup(StringData sd) int StringInterner::compare(StringID A, StringID B) { std::lock_guard lock(m_mutex); - REALM_ASSERT_DEBUG(A < m_decompressed_strings.size()); - REALM_ASSERT_DEBUG(B < m_decompressed_strings.size()); + // 0 is null, the first index starts from 1. 
+ REALM_ASSERT_DEBUG(A <= m_decompressed_strings.size()); + REALM_ASSERT_DEBUG(B <= m_decompressed_strings.size()); // comparisons against null if (A == B && A == 0) return 0; @@ -635,7 +637,7 @@ int StringInterner::compare(StringID A, StringID B) int StringInterner::compare(StringData s, StringID A) { std::lock_guard lock(m_mutex); - REALM_ASSERT_DEBUG(A < m_decompressed_strings.size()); + REALM_ASSERT_DEBUG(A <= m_decompressed_strings.size()); // comparisons against null if (s.data() == nullptr && A == 0) return 0; diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 9b3e6dd2f68..98cabac58c6 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -76,6 +76,7 @@ set(CORE_TEST_SOURCES test_shared.cpp test_status.cpp test_string_data.cpp + test_string_compression.cpp test_table_view.cpp test_thread.cpp test_transactions.cpp diff --git a/test/test_group.cpp b/test/test_group.cpp index c3c11c16ed5..f7133fe05c4 100644 --- a/test/test_group.cpp +++ b/test/test_group.cpp @@ -2508,5 +2508,176 @@ TEST(Group_ArrayCompression_Correctness_Random_Input) #endif } +TEST(Group_ArrayCompression_Strings) +{ + GROUP_TEST_PATH(path); + + // create a bunch of string related properties that are going to be compressed and verify write/read machinery + // and string correctness. 
+ Group to_disk; + TableRef table = to_disk.add_table("test"); + auto col_key_string = table->add_column(type_String, "string"); + auto col_key_list_string = table->add_column_list(type_String, "list_strings"); + auto col_key_set_string = table->add_column_set(type_String, "set_strings"); + auto col_key_dict_string = table->add_column_dictionary(type_String, "dict_strings"); + auto obj = table->create_object(); + + + obj.set_any(col_key_string, {"Test"}); + auto list_s = obj.get_list(col_key_list_string); + auto set_s = obj.get_set(col_key_set_string); + auto dictionary_s = obj.get_dictionary(col_key_dict_string); + + std::string tmp{"aabbbcccaaaaddfwregfgklnjytojfs"}; + for (size_t i = 0; i < 10; ++i) { + list_s.add({tmp + std::to_string(i)}); + } + for (size_t i = 0; i < 10; ++i) { + set_s.insert({tmp + std::to_string(i)}); + } + for (size_t i = 0; i < 10; ++i) { + const auto key_value = tmp + std::to_string(i); + dictionary_s.insert({key_value}, {key_value}); + } + + CHECK(list_s.size() == 10); + CHECK(set_s.size() == 10); + CHECK(dictionary_s.size() == 10); + + // Serialize to disk (compression should happen when the proper leaf array is serialized to disk) + to_disk.write(path, crypt_key()); + +#ifdef REALM_DEBUG + to_disk.verify(); +#endif + + // Load the tables + Group from_disk(path, crypt_key()); + TableRef read_table = from_disk.get_table("test"); + auto obj1 = read_table->get_object(0); + + auto list_s1 = obj.get_list("list_strings"); + auto set_s1 = obj.get_set("set_strings"); + auto dictionary_s1 = obj.get_dictionary("dict_strings"); + + CHECK(obj1.get_any("string") == obj.get_any("string")); + + + CHECK(list_s1.size() == list_s.size()); + CHECK(set_s1.size() == set_s.size()); + CHECK(dictionary_s1.size() == dictionary_s.size()); + + CHECK(*read_table == *table); + + for (size_t i = 0; i < list_s1.size(); ++i) { + CHECK_EQUAL(list_s1.get_any(i), list_s.get_any(i)); + } + + for (size_t i = 0; i < set_s1.size(); ++i) { + CHECK_EQUAL(set_s1.get_any(i), 
set_s.get_any(i)); + } + + for (size_t i = 0; i < dictionary_s1.size(); ++i) { + CHECK_EQUAL(dictionary_s1.get_key(i), dictionary_s.get_key(i)); + CHECK_EQUAL(dictionary_s1.get_any(i), dictionary_s.get_any(i)); + } + +#ifdef REALM_DEBUG + from_disk.verify(); +#endif +} + +TEST(Test_Commit_Compression_Strings) +{ + auto generate_random_str_len = []() { + std::random_device rd; + std::mt19937 generator(rd()); + std::uniform_int_distribution<> distribution(1, 100); + return distribution(generator); + }; + + auto generate_random_string = [](size_t length) { + const std::string alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuv" + "wxyz0123456789"; + std::random_device rd; + std::mt19937 generator(rd()); + std::uniform_int_distribution<> distribution(0, (int)alphabet.size() - 1); + + std::string random_str; + for (size_t i = 0; i < length; ++i) + random_str += alphabet[distribution(generator)]; + + return random_str; + }; + + SHARED_GROUP_TEST_PATH(path); + auto hist = make_in_realm_history(); + DBRef db = DB::create(*hist, path); + ColKey col_key_string, col_key_list_string, col_key_set_string, col_key_dict_string; + ObjKey obj_key; + TableKey table_key; + + auto rt = db->start_read(); + { + WriteTransaction wt(db); + auto table = wt.add_table("test"); + table_key = table->get_key(); + col_key_string = table->add_column(type_String, "string"); + col_key_list_string = table->add_column_list(type_String, "list_strings"); + col_key_set_string = table->add_column_set(type_String, "set_strings"); + col_key_dict_string = table->add_column_dictionary(type_String, "dict_strings"); + Obj obj = table->create_object(); + obj_key = obj.get_key(); + wt.commit(); + } + // verify that the columns have been created + rt->advance_read(); + rt->verify(); + + // commit random strings in all the string based columns and verify interner updates + + for (size_t i = 0; i < 50; ++i) { + + // some string + const auto str = generate_random_string(generate_random_str_len()); + + rt =
db->start_read(); + { + WriteTransaction wt(db); + auto table = wt.get_table(table_key); + auto obj = table->get_object(obj_key); + + obj.set_any(col_key_string, {str}); + auto list_s = obj.get_list(col_key_list_string); + auto set_s = obj.get_set(col_key_set_string); + auto dictionary_s = obj.get_dictionary(col_key_dict_string); + + list_s.add({str}); + set_s.insert({str}); + dictionary_s.insert({str}, {str}); + + wt.commit(); + } + rt->advance_read(); + rt->verify(); + + auto table = rt->get_table(table_key); + auto obj = table->get_object(obj_key); + const auto current_str = obj.get_any(col_key_string).get_string(); + CHECK_EQUAL(current_str, str); + + auto list_s = obj.get_list(col_key_list_string); + auto set_s = obj.get_set(col_key_set_string); + auto dictionary_s = obj.get_dictionary(col_key_dict_string); + + CHECK_EQUAL(list_s.size(), i + 1); + CHECK_EQUAL(set_s.size(), i + 1); + CHECK_EQUAL(dictionary_s.size(), i + 1); + + CHECK_EQUAL(list_s.get_any(i), str); + CHECK_NOT_EQUAL(set_s.find_any(str), not_found); + CHECK_NOT_EQUAL(dictionary_s.find_any(str), not_found); + } +} #endif // TEST_GROUP diff --git a/test/test_string_compression.cpp b/test/test_string_compression.cpp new file mode 100644 index 00000000000..83eede427e4 --- /dev/null +++ b/test/test_string_compression.cpp @@ -0,0 +1,173 @@ +/************************************************************************* + * + * Copyright 2024 Realm Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + **************************************************************************/ + +#include "testsettings.hpp" + +#include +#include +#include + +#include +#include +#include +#include + +#include "test.hpp" + +using namespace realm; + + +TEST(StringInterner_Basic_Creation) +{ + Array parent(Allocator::get_default()); + parent.create(NodeHeader::type_HasRefs, false, 1, 0); + StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); + StringData my_string = "aaaaaaaaaaaaaaa"; + + auto id = interner.intern(my_string); + + const auto stored_id = interner.lookup(my_string); + CHECK(stored_id); + CHECK(*stored_id == id); + + CHECK(interner.compare(my_string, *stored_id) == 0); // should be equal + const auto origin_string = interner.get(id); + CHECK_EQUAL(my_string, origin_string); + + CHECK(interner.compare(*stored_id, id) == 0); // compare against self. + parent.destroy_deep(); +} + +TEST(StringInterner_InternMultipleStrings) +{ + Array parent(Allocator::get_default()); + parent.create(NodeHeader::type_HasRefs, false, 1, 0); + StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); + + std::vector strings; + for (size_t i = 0; i < 100; i++) + strings.push_back("aaaaaaaaaaaaa" + std::to_string(i)); + + size_t i = 0; + for (const auto& s : strings) { + const auto id = interner.intern(s); + const auto& str = interner.get(id); + CHECK(str == strings[i++]); + auto stored_id = interner.lookup(str); + CHECK_EQUAL(*stored_id, id); + CHECK_EQUAL(interner.compare(str, id), 0); + } + parent.destroy_deep(); +} + +TEST(StringInterner_TestLookup) +{ + Array parent(Allocator::get_default()); + parent.create(NodeHeader::type_HasRefs, false, 1, 0); + StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); + + std::vector strings; + for (size_t i = 0; i < 500; ++i) { + std::string my_string = "aaaaaaaaaaaaaaa" + std::to_string(i); + strings.push_back(my_string); + } + std::random_device rd; + std::mt19937 g(rd()); 
std::shuffle(strings.begin(), strings.end(), g); + + for (const auto& s : strings) { + interner.intern(s); + auto id = interner.lookup(StringData(s)); + CHECK(id); + CHECK(interner.compare(StringData(s), *id) == 0); + } + + parent.destroy_deep(); +} + +TEST(StringInterner_VerifyInterningNull) +{ + Array parent(Allocator::get_default()); + parent.create(NodeHeader::type_HasRefs, false, 1, 0); + StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); + auto null_id = interner.intern({}); + CHECK_EQUAL(null_id, 0); + CHECK_EQUAL(interner.get(null_id), StringData{}); + const auto stored_id = interner.lookup({}); + CHECK_EQUAL(stored_id, 0); + // comparison StringID vs StringID + CHECK_EQUAL(interner.compare({}, 0), 0); + // interned string id vs null id + auto str_id = interner.intern(StringData("test")); + CHECK_EQUAL(interner.compare(str_id, null_id), 1); + // null id vs interned string id + CHECK_EQUAL(interner.compare(null_id, str_id), -1); + + // comparison String vs StringID + CHECK_EQUAL(interner.compare(StringData{}, null_id), 0); + CHECK_EQUAL(interner.compare(StringData{}, str_id), 1); + CHECK_EQUAL(interner.compare(StringData{"test"}, null_id), -1); + + parent.destroy_deep(); +} + +TEST(StringInterner_VerifyLongString) +{ + Array parent(Allocator::get_default()); + parent.create(NodeHeader::type_HasRefs, false, 1, 0); + StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); + + const auto N = 7000000; // a lot of characters for triggering long string handling. 
+ std::string long_string(N, 'a'); + + const auto id = interner.intern(StringData(long_string)); + CHECK_EQUAL(id, 1); + const auto stored_id = interner.lookup(StringData(long_string)); + CHECK_EQUAL(stored_id, 1); + CHECK(interner.compare(StringData(long_string), *stored_id) == 0); + + parent.destroy_deep(); +} + +TEST(StringInterner_VerifyExpansionFromSmallStringToLongString) +{ + Array parent(Allocator::get_default()); + parent.create(NodeHeader::type_HasRefs, false, 1, 0); + StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); + + const auto M = 1000; + std::string small_string = ""; + for (size_t i = 0; i < M; ++i) + small_string += 'a'; + + auto id = interner.intern(StringData(small_string)); + CHECK_EQUAL(id, 1); + auto stored_id = interner.lookup(StringData(small_string)); + CHECK_EQUAL(stored_id, 1); + CHECK(interner.compare(StringData(small_string), *stored_id) == 0); + + const auto N = 7000000; // a lot of characters for triggering long string handling. 
+ std::string long_string(N, 'b'); + id = interner.intern(StringData(long_string)); + CHECK_EQUAL(id, 2); + stored_id = interner.lookup(StringData(long_string)); + CHECK_EQUAL(stored_id, id); + CHECK(interner.compare(StringData(long_string), *stored_id) == 0); + + parent.destroy_deep(); +} From ce6f1964f513382ff0d3d14d127521d09479777b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B8rgen=20Edelbo?= Date: Wed, 10 Jul 2024 10:50:14 +0200 Subject: [PATCH 06/14] Remove enum string feature (#7858) --- CHANGELOG.md | 2 +- src/realm/array_string.cpp | 88 +----- src/realm/array_string.hpp | 14 +- src/realm/cluster.cpp | 53 +--- src/realm/cluster.hpp | 1 - src/realm/cluster_tree.cpp | 55 ---- src/realm/cluster_tree.hpp | 3 - src/realm/exec/CMakeLists.txt | 11 - src/realm/exec/realm_enumerate.cpp | 139 ---------- src/realm/node.hpp | 5 - src/realm/obj.cpp | 39 +-- src/realm/obj.hpp | 2 - src/realm/query_engine.cpp | 6 +- src/realm/query_engine.hpp | 8 - src/realm/spec.cpp | 93 +------ src/realm/spec.hpp | 14 +- src/realm/table.cpp | 30 -- src/realm/table.hpp | 8 - test/expect_json.json | 432 ++++++++++++++++++++++++++++- test/expect_xjson.json | 15 - test/expect_xjson_plus.json | 15 - test/fuzz_group.cpp | 14 - test/realm-fuzzer/fuzz_engine.cpp | 3 - test/realm-fuzzer/fuzz_object.cpp | 14 - test/realm-fuzzer/fuzz_object.hpp | 1 - test/realm-fuzzer/util.hpp | 3 +- test/test_group.cpp | 47 ---- test/test_index_string.cpp | 106 ++----- test/test_json.cpp | 15 - test/test_lang_bind_helper.cpp | 97 ------- test/test_links.cpp | 11 - test/test_query.cpp | 90 +----- test/test_query2.cpp | 44 +-- test/test_shared.cpp | 41 --- test/test_table.cpp | 248 +---------------- test/test_table_view.cpp | 40 --- test/test_transactions.cpp | 37 --- 37 files changed, 486 insertions(+), 1358 deletions(-) delete mode 100644 src/realm/exec/realm_enumerate.cpp diff --git a/CHANGELOG.md b/CHANGELOG.md index 9753fd2f453..ebcc7300a4f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,7 +17,7 @@ 
----------- ### Internals -* None. +* Ability to enumerate a string column has been removed. ---------------------------------------------- diff --git a/src/realm/array_string.cpp b/src/realm/array_string.cpp index 72df74c524f..1eb2fdaa969 100644 --- a/src/realm/array_string.cpp +++ b/src/realm/array_string.cpp @@ -54,24 +54,9 @@ void ArrayString::init_from_mem(MemRef mem) noexcept else { auto arr = new (&m_storage) Array(m_alloc); arr->init_from_mem(mem); - // The context flag is used to indicate interned strings vs old enum strings - // (in conjunction with has_refs() == false) - if (arr->get_context_flag_from_header(arr->get_header())) { - // init for new interned strings (replacing old enum strings) - m_type = Type::interned_strings; - // consider if we want this invariant: REALM_ASSERT_DEBUG(m_string_interner); - } - else { - // init for old enum strings - m_string_enum_values = std::make_unique(m_alloc); - ArrayParent* p; - REALM_ASSERT(m_spec != nullptr); - REALM_ASSERT(m_col_ndx != realm::npos); - ref_type r = m_spec->get_enumkeys_ref(m_col_ndx, p); - m_string_enum_values->init_from_ref(r); - m_string_enum_values->set_parent(p, m_col_ndx); - m_type = Type::enum_strings; - } + // init for new interned strings + m_type = Type::interned_strings; + // consider if we want this invariant: REALM_ASSERT_DEBUG(m_string_interner); } } else { @@ -122,7 +107,6 @@ size_t ArrayString::size() const return static_cast(m_arr)->size(); case Type::big_strings: return static_cast(m_arr)->size(); - case Type::enum_strings: case Type::interned_strings: return static_cast(m_arr)->size(); } @@ -141,7 +125,6 @@ void ArrayString::add(StringData value) case Type::big_strings: static_cast(m_arr)->add_string(value); break; - case Type::enum_strings: case Type::interned_strings: { auto a = static_cast(m_arr); size_t ndx = a->size(); @@ -169,16 +152,6 @@ void ArrayString::set(size_t ndx, StringData value) static_cast(m_arr)->set(ndx, id); break; } - case Type::enum_strings: { - size_t sz 
= m_string_enum_values->size(); - size_t res = m_string_enum_values->find_first(value, 0, sz); - if (res == realm::not_found) { - m_string_enum_values->add(value); - res = sz; - } - static_cast(m_arr)->set(ndx, res); - break; - } } } @@ -194,11 +167,6 @@ void ArrayString::insert(size_t ndx, StringData value) case Type::big_strings: static_cast(m_arr)->insert_string(ndx, value); break; - case Type::enum_strings: { - static_cast(m_arr)->insert(ndx, 0); - set(ndx, value); - break; - } case Type::interned_strings: { static_cast(m_arr)->insert(ndx, 0); set(ndx, value); @@ -216,31 +184,6 @@ StringData ArrayString::get(size_t ndx) const return static_cast(m_arr)->get_string(ndx); case Type::big_strings: return static_cast(m_arr)->get_string(ndx); - case Type::enum_strings: { - size_t index = size_t(static_cast(m_arr)->get(ndx)); - return m_string_enum_values->get(index); - } - case Type::interned_strings: { - size_t id = size_t(static_cast(m_arr)->get(ndx)); - return m_string_interner->get(id); - } - } - return {}; -} - -StringData ArrayString::get_legacy(size_t ndx) const -{ - switch (m_type) { - case Type::small_strings: - return static_cast(m_arr)->get(ndx); - case Type::medium_strings: - return static_cast(m_arr)->get_string_legacy(ndx); - case Type::big_strings: - return static_cast(m_arr)->get_string(ndx); - case Type::enum_strings: { - size_t index = size_t(static_cast(m_arr)->get(ndx)); - return m_string_enum_values->get(index); - } case Type::interned_strings: { size_t id = size_t(static_cast(m_arr)->get(ndx)); return m_string_interner->get(id); @@ -263,10 +206,6 @@ bool ArrayString::is_null(size_t ndx) const return static_cast(m_arr)->is_null(ndx); case Type::big_strings: return static_cast(m_arr)->is_null(ndx); - case Type::enum_strings: { - size_t id = size_t(static_cast(m_arr)->get(ndx)); - return m_string_enum_values->is_null(id); - } case Type::interned_strings: { size_t id = size_t(static_cast(m_arr)->get(ndx)); return id == 0; @@ -288,7 +227,6 @@ void 
ArrayString::erase(size_t ndx) static_cast(m_arr)->erase(ndx); break; case Type::interned_strings: - case Type::enum_strings: static_cast(m_arr)->erase(ndx); break; } @@ -311,10 +249,6 @@ void ArrayString::move(ArrayString& dst, size_t ndx) case Type::big_strings: static_cast(m_arr)->truncate(ndx); break; - case Type::enum_strings: - // this operation will never be called for enumerated columns - REALM_UNREACHABLE(); - break; case Type::interned_strings: m_arr->truncate(ndx); break; @@ -333,7 +267,6 @@ void ArrayString::clear() case Type::big_strings: static_cast(m_arr)->clear(); break; - case Type::enum_strings: case Type::interned_strings: static_cast(m_arr)->clear(); break; @@ -355,14 +288,6 @@ size_t ArrayString::find_first(StringData value, size_t begin, size_t end) const return static_cast(m_arr)->find_first(as_binary, true, begin, end); break; } - case Type::enum_strings: { - size_t sz = m_string_enum_values->size(); - size_t res = m_string_enum_values->find_first(value, 0, sz); - if (res != realm::not_found) { - return static_cast(m_arr)->find_first(res, begin, end); - } - break; - } case Type::interned_strings: { // we need a way to avoid this lookup for each leaf array. The lookup must appear // higher up the call stack and passed down. 
@@ -420,8 +345,6 @@ size_t ArrayString::lower_bound(StringData value) return lower_bound_string(static_cast(m_arr), value); case Type::big_strings: return lower_bound_string(static_cast(m_arr), value); - case Type::enum_strings: - break; case Type::interned_strings: REALM_UNREACHABLE(); break; @@ -434,9 +357,6 @@ ArrayString::Type ArrayString::upgrade_leaf(size_t value_size) if (m_type == Type::big_strings) return Type::big_strings; - if (m_type == Type::enum_strings) - return Type::enum_strings; - if (m_type == Type::interned_strings) return Type::interned_strings; @@ -529,7 +449,6 @@ void ArrayString::verify() const case Type::big_strings: static_cast(m_arr)->verify(); break; - case Type::enum_strings: case Type::interned_strings: static_cast(m_arr)->verify(); break; @@ -567,7 +486,6 @@ ref_type ArrayString::typed_write(ref_type ref, _impl::ArrayWriterBase& out, All leaf.destroy_deep(true); } else { - // whether it's the old enum strings or the new interned strings, // just write out the array using integer leaf compression ret_val = leaf.write(out, false, out.only_modified, out.compress); } diff --git a/src/realm/array_string.hpp b/src/realm/array_string.hpp index 6c8400c4055..cdf8ccededa 100644 --- a/src/realm/array_string.hpp +++ b/src/realm/array_string.hpp @@ -74,15 +74,6 @@ class ArrayString : public ArrayPayload { { m_string_interner = string_interner; } - bool need_spec() const override - { - return true; - } - void set_spec(Spec* spec, size_t col_ndx) const override - { - m_spec = spec; - m_col_ndx = col_ndx; - } void update_parent() { @@ -108,7 +99,6 @@ class ArrayString : public ArrayPayload { } void insert(size_t ndx, StringData value); StringData get(size_t ndx) const; - StringData get_legacy(size_t ndx) const; Mixed get_any(size_t ndx) const override; bool is_null(size_t ndx) const; void erase(size_t ndx); @@ -137,7 +127,7 @@ class ArrayString : public ArrayPayload { static constexpr size_t storage_size = std::max({sizeof(ArrayStringShort), 
sizeof(ArraySmallBlobs), sizeof(ArrayBigBlobs), sizeof(Array)}); - enum class Type { small_strings, medium_strings, big_strings, enum_strings, interned_strings }; + enum class Type { small_strings, medium_strings, big_strings, interned_strings }; Type m_type = Type::small_strings; @@ -145,8 +135,6 @@ class ArrayString : public ArrayPayload { alignas(storage_alignment) std::byte m_storage[storage_size]; Array* m_arr; bool m_nullable = true; - mutable Spec* m_spec = nullptr; - mutable size_t m_col_ndx = realm::npos; std::unique_ptr m_string_enum_values; mutable StringInterner* m_string_interner = nullptr; diff --git a/src/realm/cluster.cpp b/src/realm/cluster.cpp index 4c7a161cbeb..65e7c8ef89c 100644 --- a/src/realm/cluster.cpp +++ b/src/realm/cluster.cpp @@ -154,12 +154,7 @@ void Cluster::create() do_create(col_key); break; case col_type_String: { - if (m_tree_top.is_string_enum_type(col_ndx)) { - do_create(col_key); - } - else { - do_create(col_key); - } + do_create(col_key); break; } case col_type_Binary: @@ -267,17 +262,6 @@ inline void Cluster::set_string_interner(ArrayMixed& arr, ColKey col_key) const m_tree_top.set_string_interner(arr, col_key); } -template -inline void Cluster::set_spec(T&, ColKey::Idx) const -{ -} - -template <> -inline void Cluster::set_spec(ArrayString& arr, ColKey::Idx col_ndx) const -{ - m_tree_top.set_spec(arr, col_ndx); -} - template inline void Cluster::do_insert_row(size_t ndx, ColKey col, Mixed init_val, bool nullable) { @@ -286,7 +270,6 @@ inline void Cluster::do_insert_row(size_t ndx, ColKey col, Mixed init_val, bool T arr(m_alloc); auto col_ndx = col.get_index(); arr.set_parent(this, col_ndx.val + s_first_col_index); - set_spec(arr, col_ndx); set_string_interner(arr, col); arr.init_from_parent(); if (init_val.is_null()) { @@ -507,13 +490,9 @@ void Cluster::move(size_t ndx, ClusterNode* new_node, int64_t offset) case col_type_Double: do_move(ndx, col_key, new_leaf); break; - case col_type_String: { - if 
(m_tree_top.is_string_enum_type(col_key.get_index())) - do_move(ndx, col_key, new_leaf); - else - do_move(ndx, col_key, new_leaf); + case col_type_String: + do_move(ndx, col_key, new_leaf); break; - } case col_type_Binary: do_move(ndx, col_key, new_leaf); break; @@ -781,7 +760,6 @@ inline void Cluster::do_erase(size_t ndx, ColKey col_key) auto col_ndx = col_key.get_index(); T values(m_alloc); values.set_parent(this, col_ndx.val + s_first_col_index); - set_spec(values, col_ndx); set_string_interner(values, col_key); values.init_from_parent(); if constexpr (std::is_same_v) { @@ -1048,26 +1026,6 @@ void Cluster::nullify_incoming_links(RowKey key, CascadeState& state) m_tree_top.get_owning_table()->for_each_backlink_column(nullify_fwd_links); } -void Cluster::upgrade_string_to_enum(ColKey col_key, ArrayString& keys) -{ - auto col_ndx = col_key.get_index(); - Array indexes(m_alloc); - indexes.create(Array::type_Normal, false); - ArrayString values(m_alloc); - ref_type ref = Array::get_as_ref(col_ndx.val + s_first_col_index); - set_string_interner(values, col_key); - values.init_from_ref(ref); - size_t sz = values.size(); - for (size_t i = 0; i < sz; i++) { - auto v = values.get(i); - size_t pos = keys.lower_bound(v); - REALM_ASSERT_3(pos, !=, keys.size()); - indexes.add(pos); - } - Array::set(col_ndx.val + s_first_col_index, indexes.get_ref()); - Array::destroy_deep(ref, m_alloc); -} - void Cluster::init_leaf(ColKey col_key, ArrayPayload* leaf) const { auto col_ndx = col_key.get_index(); @@ -1080,9 +1038,6 @@ void Cluster::init_leaf(ColKey col_key, ArrayPayload* leaf) const if (leaf->need_string_interner()) { m_tree_top.set_string_interner(*leaf, col_key); } - if (leaf->need_spec()) { - m_tree_top.set_spec(*leaf, col_ndx); - } leaf->init_from_ref(ref); leaf->set_parent(const_cast(this), col_ndx.val + 1); } @@ -1098,7 +1053,6 @@ template void Cluster::verify(ref_type ref, size_t index, util::Optional& sz) const { ArrayType arr(get_alloc()); - set_spec(arr, 
ColKey::Idx{unsigned(index) - 1}); auto table = get_owning_table(); REALM_ASSERT(index <= table->m_leaf_ndx2colkey.size()); auto col_key = table->m_leaf_ndx2colkey[index - 1]; @@ -1440,7 +1394,6 @@ void Cluster::dump_objects(int64_t key_offset, std::string lead) const } case col_type_String: { ArrayString arr(m_alloc); - set_spec(arr, col.get_index()); set_string_interner(arr, col); ref_type ref = Array::get_as_ref(j); arr.init_from_ref(ref); diff --git a/src/realm/cluster.hpp b/src/realm/cluster.hpp index 4d73f42b540..477ef36b65c 100644 --- a/src/realm/cluster.hpp +++ b/src/realm/cluster.hpp @@ -321,7 +321,6 @@ class Cluster : public ClusterNode { size_t get_ndx(RowKey key, size_t ndx) const noexcept override; size_t erase(RowKey k, CascadeState& state) override; void nullify_incoming_links(RowKey key, CascadeState& state) override; - void upgrade_string_to_enum(ColKey col, ArrayString& keys); void init_leaf(ColKey col, ArrayPayload* leaf) const; void add_leaf(ColKey col, ref_type ref); diff --git a/src/realm/cluster_tree.cpp b/src/realm/cluster_tree.cpp index 2ff8e8810fe..fd5e1991dad 100644 --- a/src/realm/cluster_tree.cpp +++ b/src/realm/cluster_tree.cpp @@ -928,45 +928,6 @@ void ClusterTree::clear(CascadeState& state) m_size = 0; } -void ClusterTree::enumerate_string_column(ColKey col_key) -{ - Allocator& alloc = get_alloc(); - - ArrayString keys(alloc); - ArrayString leaf(alloc); - keys.create(); - - auto collect_strings = [col_key, &leaf, &keys](const Cluster* cluster) { - cluster->init_leaf(col_key, &leaf); - size_t sz = leaf.size(); - size_t key_size = keys.size(); - for (size_t i = 0; i < sz; i++) { - auto v = leaf.get(i); - size_t pos = keys.lower_bound(v); - if (pos == key_size || keys.get(pos) != v) { - keys.insert(pos, v); // Throws - key_size++; - } - } - - return IteratorControl::AdvanceToNext; - }; - - auto upgrade = [col_key, &keys](Cluster* cluster) { - cluster->upgrade_string_to_enum(col_key, keys); - }; - - // Populate 'keys' array - 
traverse(collect_strings); - - // Store key strings in spec - size_t spec_ndx = m_owner->colkey2spec_ndx(col_key); - const_cast(&m_owner->m_spec)->upgrade_string_to_enum(spec_ndx, keys.get_ref()); - - // Replace column in all clusters - update(upgrade); -} - void ClusterTree::replace_root(std::unique_ptr new_root) { if (new_root != m_root) { @@ -1141,16 +1102,6 @@ void ClusterTree::set_string_interner(ArrayPayload& arr, ColKey col_key) const } } -void ClusterTree::set_spec(ArrayPayload& arr, ColKey::Idx col_ndx) const -{ - // Check for owner. This function may be called in context of DictionaryClusterTree - // in which case m_owner is null (and spec never needed). - if (m_owner) { - auto spec_ndx = m_owner->leaf_ndx2spec_ndx(col_ndx); - arr.set_spec(&m_owner->m_spec, spec_ndx); - } -} - TableRef ClusterTree::get_table_ref() const { REALM_ASSERT(m_owner != nullptr); @@ -1180,12 +1131,6 @@ void ClusterTree::nullify_incoming_links(ObjKey obj_key, CascadeState& state) m_root->nullify_incoming_links(ClusterNode::RowKey(obj_key), state); } -bool ClusterTree::is_string_enum_type(ColKey::Idx col_ndx) const -{ - size_t spec_ndx = m_owner->leaf_ndx2spec_ndx(col_ndx); - return m_owner->m_spec.is_string_enum_type(spec_ndx); -} - void ClusterTree::remove_all_links(CascadeState& state) { Allocator& alloc = get_alloc(); diff --git a/src/realm/cluster_tree.hpp b/src/realm/cluster_tree.hpp index 5144e581771..44afb09c5a2 100644 --- a/src/realm/cluster_tree.hpp +++ b/src/realm/cluster_tree.hpp @@ -153,7 +153,6 @@ class ClusterTree { } void clear(CascadeState&); - void enumerate_string_column(ColKey col_key); const Table* get_owning_table() const noexcept { @@ -180,7 +179,6 @@ class ClusterTree { // Visit all leaves and call the supplied function. The function can modify the leaf. 
void update(UpdateFunction func); - void set_spec(ArrayPayload& arr, ColKey::Idx col_ndx) const; void set_string_interner(ArrayPayload& arr, ColKey col_key) const; virtual std::unique_ptr get_root_from_parent(); @@ -225,7 +223,6 @@ class ClusterTree { std::unique_ptr create_root_from_parent(ArrayParent* parent, size_t ndx_in_parent); std::unique_ptr get_node(ArrayParent* parent, size_t ndx_in_parent) const; TableRef get_table_ref() const; - bool is_string_enum_type(ColKey::Idx col_ndx) const; void remove_all_links(CascadeState&); }; diff --git a/src/realm/exec/CMakeLists.txt b/src/realm/exec/CMakeLists.txt index 969b45a9e10..28a8a3fe19d 100644 --- a/src/realm/exec/CMakeLists.txt +++ b/src/realm/exec/CMakeLists.txt @@ -46,17 +46,6 @@ endif() target_link_libraries(ClickQuery Storage) -add_executable(RealmEnumerate realm_enumerate.cpp) -set_target_properties(RealmEnumerate PROPERTIES - OUTPUT_NAME "realm-enumerate" - DEBUG_POSTFIX ${CMAKE_DEBUG_POSTFIX} -) -target_link_libraries(RealmEnumerate ObjectStore) -# FIXME can be fixed for others, but requires link and install fixes for libuv target -if (NOT APPLE) - set_target_properties(RealmEnumerate PROPERTIES EXCLUDE_FROM_ALL TRUE) -endif() - add_executable(RealmDecrypt realm_decrypt.cpp) set_target_properties(RealmDecrypt PROPERTIES OUTPUT_NAME "realm-decrypt" diff --git a/src/realm/exec/realm_enumerate.cpp b/src/realm/exec/realm_enumerate.cpp deleted file mode 100644 index 44f534c8454..00000000000 --- a/src/realm/exec/realm_enumerate.cpp +++ /dev/null @@ -1,139 +0,0 @@ -/* - * Useage: realm-enumerate [--key crypt_key] [--threshold 0.xx] - * Changes string columns which pass the threshold of unique values to enumerated columns - * and compacts the Realm in place. 
- */ - -#include -#include -#include -#include -#include -#include - -#include -#include - -static void enumerate_strings(realm::SharedRealm realm, double threshold) -{ - auto& group = realm->read_group(); - auto table_keys = group.get_table_keys(); - for (auto table_key : table_keys) { - realm::TableRef t = group.get_table(table_key); - size_t table_size = t->size(); - realm::util::format(std::cout, "Begin table '%1' of size %2:\n", t->get_name(), table_size); - if (table_size == 0) - continue; - bool found_str_col = false; - auto do_convert = [&realm, &t](realm::ColKey col) { - auto start = std::chrono::steady_clock::now(); - std::cout << "[converting]" << std::flush; - realm->begin_transaction(); - t->enumerate_string_column(col); - realm->commit_transaction(); - std::chrono::duration diff = std::chrono::steady_clock::now() - start; - std::cout << " (" << diff.count() << " seconds)" << std::endl; - }; - t->for_each_public_column([&](realm::ColKey col_key) { - if (col_key.get_type() == realm::col_type_String && !col_key.is_collection()) { - found_str_col = true; - realm::util::format(std::cout, "\tcolumn '%1' ", t->get_column_name(col_key)); - std::cout << std::flush; - if (t->is_enumerated(col_key)) { - std::cout << "[already enumerated]" << std::endl; - } - else if (t->get_primary_key_column() == col_key) { - std::cout << "[pk - skipping]" << std::endl; - } - else if (threshold >= 100) { - do_convert(col_key); - } - else if (threshold < 100 && threshold > 0) { - std::unique_ptr distinct = - std::make_unique(); - distinct->append_distinct(realm::DistinctDescriptor({{col_key}})); - size_t uniques = t->where().count(*distinct.get()); - double utilization = uniques / double(table_size); - realm::util::format(std::cout, "contains %1 unique values (%2%%) ", uniques, utilization * 100.0); - std::cout << std::flush; - if (utilization <= threshold / 100) { - do_convert(col_key); - } - else { - std::cout << "[skipping due to threshold]" << std::endl; - } - } - else { - 
std::cout << "[skipping due to threshold]" << std::endl; - } - } - return realm::IteratorControl::AdvanceToNext; - }); - if (!found_str_col) { - std::cout << "\tNo string columns found." << std::endl; - } - } -} - -int main(int argc, const char* argv[]) -{ - if (argc > 1) { - try { - const char* key_ptr = nullptr; - char key[64]; - double threshold = 0; // by default don't convert, just compact - for (int curr_arg = 1; curr_arg < argc; curr_arg++) { - if (strcmp(argv[curr_arg], "--key") == 0) { - std::ifstream key_file(argv[curr_arg + 1]); - key_file.read(key, sizeof(key)); - key_ptr = key; - curr_arg++; - } - else if (strcmp(argv[curr_arg], "--threshold") == 0) { - threshold = strtod(argv[curr_arg + 1], nullptr); - curr_arg++; - } - else { - realm::util::format(std::cout, "File name '%1' for threshold %2%%\n", argv[curr_arg], threshold); - auto start = std::chrono::steady_clock::now(); - realm::Realm::Config config; - config.path = argv[curr_arg]; - if (key_ptr) { - config.encryption_key.resize(64); - memcpy(&config.encryption_key[0], &key_ptr[0], 64); - } - realm::SharedRealm realm; - try { - realm = realm::Realm::get_shared_realm(config); - } - catch (const realm::FileAccessError& e) { - std::cout << "trying to open as a sync Realm\n" << e.what() << "\n" << std::endl; - config.force_sync_history = true; - realm = realm::Realm::get_shared_realm(config); - } - enumerate_strings(realm, threshold); - realm->compact(); - std::chrono::duration diff = std::chrono::steady_clock::now() - start; - std::cout << "Done in " << diff.count() << " seconds." << std::endl; - std::cout << std::endl; - return 0; - } - } - } - catch (const std::exception& e) { - std::cout << e.what() << std::endl; - } - } - else { - std::cout << "Usage: realm-enumerate [--key crypt_key] [--threshold 0.xx] " << std::endl; - std::cout << "The optional crypt_key arg is a filename which contains the 64 byte key." 
<< std::endl; - std::cout - << "The optional threshold is a number between [0, 100] indicating the percentage of unique strings " - "below which columns will be converted. At a value of 100, all columns will be converted. " - "For value of 50 only columns which have 50% or fewer unique values will be converted." - "If not set, the threshold default is 0 which just compacts the file without converting anything." - << std::endl; - } - - return 0; -} diff --git a/src/realm/node.hpp b/src/realm/node.hpp index 9b684f25246..57630f4812a 100644 --- a/src/realm/node.hpp +++ b/src/realm/node.hpp @@ -363,11 +363,6 @@ class ArrayPayload { return false; } virtual void set_string_interner(StringInterner*) const {} - virtual bool need_spec() const - { - return false; - } - virtual void set_spec(Spec*, size_t) const {} static ref_type typed_write(ref_type ref, _impl::ArrayWriterBase& out, Allocator& alloc); }; diff --git a/src/realm/obj.cpp b/src/realm/obj.cpp index beaa138a4c2..25027dc3f08 100644 --- a/src/realm/obj.cpp +++ b/src/realm/obj.cpp @@ -608,22 +608,11 @@ StringData Obj::_get(ColKey::Idx col_ndx) const } ref_type ref = to_ref(Array::get(m_mem.get_addr(), col_ndx.val + 1)); - auto spec_ndx = m_table->leaf_ndx2spec_ndx(col_ndx); - auto& spec = get_spec(); - if (spec.is_string_enum_type(spec_ndx)) { - ArrayString values(get_alloc()); - values.set_spec(const_cast(&spec), spec_ndx); - values.init_from_ref(ref); - - return values.get(m_row_ndx); - } - else { - ArrayString values(get_alloc()); - auto col_key = m_table->leaf_ndx2colkey(col_ndx); - values.set_string_interner(m_table->get_string_interner(col_key)); - values.init_from_ref(ref); - return values.get(m_row_ndx); - } + ArrayString values(get_alloc()); + auto col_key = m_table->leaf_ndx2colkey(col_ndx); + values.set_string_interner(m_table->get_string_interner(col_key)); + values.init_from_ref(ref); + return values.get(m_row_ndx); } template <> @@ -750,7 +739,6 @@ inline bool Obj::do_is_null(ColKey::Idx col_ndx) const 
REALM_ASSERT(false); // Don't come here, you're falling from a cliff.... ArrayString values(get_alloc()); ref_type ref = to_ref(Array::get(m_mem.get_addr(), col_ndx.val + 1)); - values.set_spec(const_cast(&get_spec()), m_table->leaf_ndx2spec_ndx(col_ndx)); // TODO: Set string interner if needed // values.set_string_interner(m_table->get_string_interner(col_key)); values.init_from_ref(ref); @@ -780,7 +768,6 @@ bool Obj::is_null(ColKey col_key) const case col_type_String: { ArrayString values(get_alloc()); ref_type ref = to_ref(Array::get(m_mem.get_addr(), col_ndx.val + 1)); - values.set_spec(const_cast(&get_spec()), m_table->leaf_ndx2spec_ndx(col_ndx)); // TODO: Set string interner if needed values.set_string_interner(m_table->get_string_interner(col_key)); values.init_from_ref(ref); @@ -1189,19 +1176,6 @@ inline void Obj::set_string_interner(ArrayMixed& values, ColKey col_key) values.set_string_interner(m_table->get_string_interner(col_key)); } -// helper functions for filtering out calls to set_spec() -template -inline void Obj::set_spec(T&, ColKey) -{ -} -template <> -inline void Obj::set_spec(ArrayString& values, ColKey col_key) -{ - size_t spec_ndx = m_table->colkey2spec_ndx(col_key); - Spec* spec = const_cast(&get_spec()); - values.set_spec(spec, spec_ndx); -} - template <> Obj& Obj::set(ColKey col_key, Mixed value, bool is_default) { @@ -1722,7 +1696,6 @@ Obj& Obj::set(ColKey col_key, T value, bool is_default) using LeafType = typename ColumnTypeTraits::cluster_leaf_type; LeafType values(alloc); values.set_parent(&fields, col_ndx.val + 1); - set_spec(values, col_key); set_string_interner(values, col_key); values.init_from_parent(); values.set(m_row_ndx, value); @@ -2326,7 +2299,6 @@ template <> inline void Obj::do_set_null(ColKey col_key) { ColKey::Idx col_ndx = col_key.get_index(); - size_t spec_ndx = m_table->leaf_ndx2spec_ndx(col_ndx); Allocator& alloc = get_alloc(); alloc.bump_content_version(); Array fallback(alloc); @@ -2334,7 +2306,6 @@ inline void 
Obj::do_set_null(ColKey col_key) ArrayString values(alloc); values.set_parent(&fields, col_ndx.val + 1); - values.set_spec(const_cast(&get_spec()), spec_ndx); values.set_string_interner(m_table->get_string_interner(col_key)); values.init_from_parent(); values.set_null(m_row_ndx); diff --git a/src/realm/obj.hpp b/src/realm/obj.hpp index 8711e590dac..cffc1b56a70 100644 --- a/src/realm/obj.hpp +++ b/src/realm/obj.hpp @@ -391,8 +391,6 @@ class Obj { bool remove_one_backlink(ColKey backlink_col, ObjKey origin_key); void nullify_link(ColKey origin_col, ObjLink target_key) &&; template - inline void set_spec(T&, ColKey); - template inline void set_string_interner(T&, ColKey); template inline void nullify_single_link(ColKey col, ValueType target); diff --git a/src/realm/query_engine.cpp b/src/realm/query_engine.cpp index 6af55085725..3a9c375f6d0 100644 --- a/src/realm/query_engine.cpp +++ b/src/realm/query_engine.cpp @@ -272,10 +272,7 @@ void StringNodeEqualBase::init(bool will_query_ranges) StringNodeBase::init(will_query_ranges); const bool uses_index = has_search_index(); - if (m_is_string_enum) { - m_dT = 1.0; - } - else if (uses_index) { + if (uses_index) { m_dT = 0.0; } else { @@ -517,7 +514,6 @@ StringNodeFulltext::StringNodeFulltext(StringData v, ColKey column, std::unique_ void StringNodeFulltext::table_changed() { - StringNodeEqualBase::table_changed(); m_link_map->set_base_table(m_table); } diff --git a/src/realm/query_engine.hpp b/src/realm/query_engine.hpp index 94cc9612a48..3a428c04d4d 100644 --- a/src/realm/query_engine.hpp +++ b/src/realm/query_engine.hpp @@ -1647,11 +1647,6 @@ class StringNodeBase : public ParentNode { m_dT = 10.0; } - void table_changed() override - { - m_is_string_enum = m_table.unchecked_ptr()->is_enumerated(m_condition_column_key); - } - void cluster_changed() override { m_leaf.emplace(m_table.unchecked_ptr()->get_alloc()); @@ -1678,7 +1673,6 @@ class StringNodeBase : public ParentNode { : ParentNode(from) , m_value(from.m_value) , 
m_string_value(m_value) - , m_is_string_enum(from.m_is_string_enum) { } @@ -1694,8 +1688,6 @@ class StringNodeBase : public ParentNode { std::optional m_leaf; StringData m_string_value; - bool m_is_string_enum = false; - size_t m_end_s = 0; size_t m_leaf_start = 0; size_t m_leaf_end = 0; diff --git a/src/realm/spec.cpp b/src/realm/spec.cpp index b2746f3c1c2..1a6b4dfefaa 100644 --- a/src/realm/spec.cpp +++ b/src/realm/spec.cpp @@ -51,14 +51,6 @@ void Spec::init(MemRef mem) noexcept m_top.add(0); } - // Enumkeys array is only there when there are StringEnum columns - if (auto ref = m_top.get_as_ref(s_enum_keys_ndx)) { - m_enumkeys.init_from_ref(ref); - } - else { - m_enumkeys.detach(); - } - if (m_top.get_as_ref(s_col_keys_ndx) == 0) { // This is an upgrade - create column key array MemRef mem_ref = Array::create_empty_array(Array::type_Normal, false, m_top.get_alloc()); // Throws @@ -96,14 +88,6 @@ void Spec::update_from_parent() noexcept m_types.update_from_parent(); m_names.update_from_parent(); m_attr.update_from_parent(); - - if (m_top.get_as_ref(s_enum_keys_ndx) != 0) { - m_enumkeys.update_from_parent(); - } - else { - m_enumkeys.detach(); - } - m_keys.update_from_parent(); update_internals(); @@ -115,36 +99,25 @@ MemRef Spec::create_empty_spec(Allocator& alloc) // The 'spec_set' contains the specification (types and names) of // all columns and sub-tables Array spec_set(alloc); - _impl::DeepArrayDestroyGuard dg(&spec_set); spec_set.create(Array::type_HasRefs); // Throws - _impl::DeepArrayRefDestroyGuard dg_2(alloc); { // One type for each column bool context_flag = false; MemRef mem = Array::create_empty_array(Array::type_Normal, context_flag, alloc); // Throws - dg_2.reset(mem.get_ref()); - int_fast64_t v(from_ref(mem.get_ref())); - spec_set.add(v); // Throws - dg_2.release(); + spec_set.add(from_ref(mem.get_ref())); // Throws } { size_t size = 0; // One name for each column MemRef mem = ArrayStringShort::create_array(size, alloc); // Throws - 
dg_2.reset(mem.get_ref()); - int_fast64_t v = from_ref(mem.get_ref()); - spec_set.add(v); // Throws - dg_2.release(); + spec_set.add(from_ref(mem.get_ref())); // Throws } { // One attrib set for each column bool context_flag = false; MemRef mem = Array::create_empty_array(Array::type_Normal, context_flag, alloc); // Throws - dg_2.reset(mem.get_ref()); - int_fast64_t v = from_ref(mem.get_ref()); - spec_set.add(v); // Throws - dg_2.release(); + spec_set.add(from_ref(mem.get_ref())); // Throws } spec_set.add(0); // Nested collections array spec_set.add(0); // Enumkeys array @@ -152,13 +125,9 @@ MemRef Spec::create_empty_spec(Allocator& alloc) // One key for each column bool context_flag = false; MemRef mem = Array::create_empty_array(Array::type_Normal, context_flag, alloc); // Throws - dg_2.reset(mem.get_ref()); - int_fast64_t v = from_ref(mem.get_ref()); - spec_set.add(v); // Throws - dg_2.release(); + spec_set.add(from_ref(mem.get_ref())); // Throws } - dg.release(); return spec_set.get_mem(); } @@ -204,10 +173,6 @@ void Spec::insert_column(size_t column_ndx, ColKey col_key, ColumnType type, Str m_attr.insert(column_ndx, attr); // Throws m_keys.insert(column_ndx, col_key.value); - if (m_enumkeys.is_attached() && type != col_type_BackLink) { - m_enumkeys.insert(column_ndx, 0); - } - update_internals(); } @@ -216,28 +181,6 @@ void Spec::erase_column(size_t column_ndx) REALM_ASSERT(column_ndx < m_types.size()); if (ColumnType(int(m_types.get(column_ndx))) != col_type_BackLink) { - if (is_string_enum_type(column_ndx)) { - // Enum columns do also have a separate key list - ref_type keys_ref = m_enumkeys.get_as_ref(column_ndx); - Array::destroy_deep(keys_ref, m_top.get_alloc()); - m_enumkeys.set(column_ndx, 0); - } - - // Remove this column from the enum keys lookup and clean it up if it's now empty - if (m_enumkeys.is_attached()) { - m_enumkeys.erase(column_ndx); // Throws - bool all_empty = true; - for (size_t i = 0; i < m_enumkeys.size(); i++) { - if 
(m_enumkeys.get(i) != 0) { - all_empty = false; - break; - } - } - if (all_empty) { - m_enumkeys.destroy_deep(); - m_top.set(4, 0); - } - } m_num_public_columns--; m_names.erase(column_ndx); // Throws } @@ -250,34 +193,6 @@ void Spec::erase_column(size_t column_ndx) update_internals(); } -void Spec::upgrade_string_to_enum(size_t column_ndx, ref_type keys_ref) -{ - REALM_ASSERT(get_column_type(column_ndx) == col_type_String); - - // Create the enumkeys list if needed - if (!m_enumkeys.is_attached()) { - m_enumkeys.create(Array::type_HasRefs, false, m_num_public_columns); - m_top.set(4, m_enumkeys.get_ref()); - m_enumkeys.set_parent(&m_top, 4); - } - - // Insert the new key list - m_enumkeys.set(column_ndx, keys_ref); -} - -bool Spec::is_string_enum_type(size_t column_ndx) const noexcept -{ - return m_enumkeys.is_attached() ? (m_enumkeys.get(column_ndx) != 0) : false; -} - -ref_type Spec::get_enumkeys_ref(size_t column_ndx, ArrayParent*& keys_parent) noexcept -{ - // We also need to return parent info - keys_parent = &m_enumkeys; - - return m_enumkeys.get_as_ref(column_ndx); -} - namespace { template diff --git a/src/realm/spec.hpp b/src/realm/spec.hpp index d1a17072f67..c539dee7b93 100644 --- a/src/realm/spec.hpp +++ b/src/realm/spec.hpp @@ -65,12 +65,6 @@ class Spec { void set_dictionary_key_type(size_t column_ndx, DataType key_type); DataType get_dictionary_key_type(size_t column_ndx) const; - // Auto Enumerated string columns - void upgrade_string_to_enum(size_t column_ndx, ref_type keys_ref); - size_t _get_enumkeys_ndx(size_t column_ndx) const noexcept; - bool is_string_enum_type(size_t column_ndx) const noexcept; - ref_type get_enumkeys_ref(size_t column_ndx, ArrayParent*& keys_parent) noexcept; - //@{ /// Compare two table specs for equality. 
bool operator==(const Spec&) const noexcept; @@ -97,7 +91,7 @@ class Spec { static constexpr size_t s_names_ndx = 1; static constexpr size_t s_attributes_ndx = 2; static constexpr size_t s_vacant_1 = 3; - static constexpr size_t s_enum_keys_ndx = 4; + // static constexpr size_t s_enum_keys_ndx = 4; static constexpr size_t s_col_keys_ndx = 5; static constexpr size_t s_spec_max_size = 6; @@ -105,8 +99,8 @@ class Spec { Array m_types; // 1st slot in m_top ArrayStringShort m_names; // 2nd slot in m_top Array m_attr; // 3rd slot in m_top - // 4th slot in m_top not cached - Array m_enumkeys; // 5th slot in m_top + // 4th slot in m_top, old subspecs. Not used since v6.0.0 + // 5th slot in m_top, old enum keys which was never released Array m_keys; // 6th slot in m_top size_t m_num_public_columns = 0; @@ -156,13 +150,11 @@ inline Spec::Spec(Allocator& alloc) noexcept , m_types(alloc) , m_names(alloc) , m_attr(alloc) - , m_enumkeys(alloc) , m_keys(alloc) { m_types.set_parent(&m_top, s_types_ndx); m_names.set_parent(&m_top, s_names_ndx); m_attr.set_parent(&m_top, s_attributes_ndx); - m_enumkeys.set_parent(&m_top, s_enum_keys_ndx); m_keys.set_parent(&m_top, s_col_keys_ndx); } diff --git a/src/realm/table.cpp b/src/realm/table.cpp index be78abe7122..56c34c999f4 100644 --- a/src/realm/table.cpp +++ b/src/realm/table.cpp @@ -991,36 +991,6 @@ void Table::remove_search_index(ColKey col_key) m_spec.set_column_attr(spec_ndx, attr); // Throws } -void Table::enumerate_string_column(ColKey col_key) -{ - check_column(col_key); - size_t column_ndx = colkey2spec_ndx(col_key); - ColumnType type = col_key.get_type(); - if (type == col_type_String && !col_key.is_collection() && !m_spec.is_string_enum_type(column_ndx)) { - m_clusters.enumerate_string_column(col_key); - } -} - -bool Table::is_enumerated(ColKey col_key) const noexcept -{ - size_t col_ndx = colkey2spec_ndx(col_key); - return m_spec.is_string_enum_type(col_ndx); -} - -size_t Table::get_num_unique_values(ColKey col_key) const -{ - 
if (!is_enumerated(col_key)) - return 0; - - ArrayParent* parent; - ref_type ref = const_cast(m_spec).get_enumkeys_ref(colkey2spec_ndx(col_key), parent); - BPlusTree col(get_alloc()); - col.init_from_ref(ref); - - return col.size(); -} - - void Table::erase_root_column(ColKey col_key) { ColumnType col_type = col_key.get_type(); diff --git a/src/realm/table.hpp b/src/realm/table.hpp index 3635d265835..fd6e72c6bde 100644 --- a/src/realm/table.hpp +++ b/src/realm/table.hpp @@ -248,18 +248,10 @@ class Table { } void remove_search_index(ColKey col_key); - void enumerate_string_column(ColKey col_key); - bool is_enumerated(ColKey col_key) const noexcept; bool contains_unique_values(ColKey col_key) const; //@} - /// If the specified column is optimized to store only unique values, then - /// this function returns the number of unique values currently - /// stored. Otherwise it returns zero. This function is mainly intended for - /// debugging purposes. - size_t get_num_unique_values(ColKey col_key) const; - template Columns column(ColKey col_key, util::Optional = util::none) const; template diff --git a/test/expect_json.json b/test/expect_json.json index 4f4d0e227c3..5929348e35c 100644 --- a/test/expect_json.json +++ b/test/expect_json.json @@ -1 +1,431 @@ -[{"_key":0,"int":0,"bool":false,"date":"1970-01-01 03:25:45","float":1.2345600e+02,"double":9.8765432099999998e+03,"string":"string0","string_long":"string0 very long string.........","string_big_blobs":"string0 very long string......... 
big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs","string_enum":"enum1","binary":"YmluYXJ5AA==","oid":"000000000000000000000000","decimal":"1.2345","integers":[],"strings":[],"dictionary":{"a":2},"set":[123],"uuid":"00000000-0000-0000-0000-000000000000"},{"_key":1,"int":-1,"bool":true,"date":"1970-01-01 03:25:45","float":-1.2345600e+02,"double":-9.8765432099999998e+03,"string":"string1","string_long":"string1 very long string.........","string_big_blobs":"","string_enum":"enum2","binary":"YmluYXJ5AA==","oid":"000000000000000000000000","decimal":"1.2345","integers":[-123],"strings":["sub_-123"],"dictionary":{"a":2},"set":[123],"uuid":"00000000-0000-0000-0000-000000000000"},{"_key":2,"int":2,"bool":false,"date":"1970-01-01 03:25:45","float":1.2345600e+02,"double":9.8765432099999998e+03,"string":"string2","string_long":"string2 very long string.........","string_big_blobs":"string2 very long string......... big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs","string_enum":"enum3","binary":"YmluYXJ5AA==","oid":"000000000000000000000000","decimal":"1.2345","integers":[-123,2345],"strings":["sub_-123","sub_2345"],"dictionary":{"a":2},"set":[123],"uuid":"00000000-0000-0000-0000-000000000000"},{"_key":3,"int":-3,"bool":true,"date":"1970-01-01 03:25:45","float":-1.2345600e+02,"double":-9.8765432099999998e+03,"string":"string3","string_long":"string3 very long string.........","string_big_blobs":"","string_enum":"enum1","binary":"YmluYXJ5AA==","oid":"000000000000000000000000","decimal":"1.2345","integers":[-123,-3825,-7527],"strings":["sub_-123","sub_-3825","sub_-7527"],"dictionary":{"a":2},"set":[123],"uuid":"00000000-0000-0000-0000-000000000000"},{"_key":4,"int":4,"bool":false,"date":"1970-01-01 03:25:45","float":1.2345600e+02,"double":9.8765432099999998e+03,"string":"string4","string_long":"string4 very long 
string.........","string_big_blobs":"string4 very long string......... big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs","string_enum":"enum2","binary":"YmluYXJ5AA==","oid":"000000000000000000000000","decimal":"1.2345","integers":[-123,4813,9749,14685],"strings":["sub_-123","sub_4813","sub_9749","sub_14685"],"dictionary":{"a":2},"set":[123],"uuid":"00000000-0000-0000-0000-000000000000"},{"_key":5,"int":-5,"bool":true,"date":"1970-01-01 03:25:45","float":-1.2345600e+02,"double":-9.8765432099999998e+03,"string":"string5","string_long":"string5 very long string.........","string_big_blobs":"","string_enum":"enum3","binary":"YmluYXJ5AA==","oid":"000000000000000000000000","decimal":"1.2345","integers":[],"strings":[],"dictionary":{"a":2},"set":[123],"uuid":"00000000-0000-0000-0000-000000000000"},{"_key":6,"int":6,"bool":false,"date":"1970-01-01 03:25:45","float":1.2345600e+02,"double":9.8765432099999998e+03,"string":"string6","string_long":"string6 very long string.........","string_big_blobs":"string6 very long string......... 
big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs","string_enum":"enum1","binary":"YmluYXJ5AA==","oid":"000000000000000000000000","decimal":"1.2345","integers":[-123],"strings":["sub_-123"],"dictionary":{"a":2},"set":[123],"uuid":"00000000-0000-0000-0000-000000000000"},{"_key":7,"int":-7,"bool":true,"date":"1970-01-01 03:25:45","float":-1.2345600e+02,"double":-9.8765432099999998e+03,"string":"string7","string_long":"string7 very long string.........","string_big_blobs":"","string_enum":"enum2","binary":"YmluYXJ5AA==","oid":"000000000000000000000000","decimal":"1.2345","integers":[-123,-8761],"strings":["sub_-123","sub_-8761"],"dictionary":{"a":2},"set":[123],"uuid":"00000000-0000-0000-0000-000000000000"},{"_key":8,"int":8,"bool":false,"date":"1970-01-01 03:25:45","float":1.2345600e+02,"double":9.8765432099999998e+03,"string":"string8","string_long":"string8 very long string.........","string_big_blobs":"string8 very long string......... 
big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs","string_enum":"enum3","binary":"YmluYXJ5AA==","oid":"000000000000000000000000","decimal":"1.2345","integers":[-123,9749,19621],"strings":["sub_-123","sub_9749","sub_19621"],"dictionary":{"a":2},"set":[123],"uuid":"00000000-0000-0000-0000-000000000000"},{"_key":9,"int":-9,"bool":true,"date":"1970-01-01 03:25:45","float":-1.2345600e+02,"double":-9.8765432099999998e+03,"string":"string9","string_long":"string9 very long string.........","string_big_blobs":"","string_enum":"enum1","binary":"YmluYXJ5AA==","oid":"000000000000000000000000","decimal":"1.2345","integers":[-123,-11229,-22335,-33441],"strings":["sub_-123","sub_-11229","sub_-22335","sub_-33441"],"dictionary":{"a":2},"set":[123],"uuid":"00000000-0000-0000-0000-000000000000"},{"_key":10,"int":10,"bool":false,"date":"1970-01-01 03:25:45","float":1.2345600e+02,"double":9.8765432099999998e+03,"string":"string10","string_long":"string10 very long string.........","string_big_blobs":"string10 very long string......... 
big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs","string_enum":"enum2","binary":"YmluYXJ5AA==","oid":"000000000000000000000000","decimal":"1.2345","integers":[],"strings":[],"dictionary":{"a":2},"set":[123],"uuid":"00000000-0000-0000-0000-000000000000"},{"_key":11,"int":-11,"bool":true,"date":"1970-01-01 03:25:45","float":-1.2345600e+02,"double":-9.8765432099999998e+03,"string":"string11","string_long":"string11 very long string.........","string_big_blobs":"","string_enum":"enum3","binary":"YmluYXJ5AA==","oid":"000000000000000000000000","decimal":"1.2345","integers":[-123],"strings":["sub_-123"],"dictionary":{"a":2},"set":[123],"uuid":"00000000-0000-0000-0000-000000000000"},{"_key":12,"int":12,"bool":false,"date":"1970-01-01 03:25:45","float":1.2345600e+02,"double":9.8765432099999998e+03,"string":"string12","string_long":"string12 very long string.........","string_big_blobs":"string12 very long string......... big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs","string_enum":"enum1","binary":"YmluYXJ5AA==","oid":"000000000000000000000000","decimal":"1.2345","integers":[-123,14685],"strings":["sub_-123","sub_14685"],"dictionary":{"a":2},"set":[123],"uuid":"00000000-0000-0000-0000-000000000000"},{"_key":13,"int":-13,"bool":true,"date":"1970-01-01 03:25:45","float":-1.2345600e+02,"double":-9.8765432099999998e+03,"string":"string13","string_long":"string13 very long string.........","string_big_blobs":"","string_enum":"enum2","binary":"YmluYXJ5AA==","oid":"000000000000000000000000","decimal":"1.2345","integers":[-123,-16165,-32207],"strings":["sub_-123","sub_-16165","sub_-32207"],"dictionary":{"a":2},"set":[123],"uuid":"00000000-0000-0000-0000-000000000000"},{"_key":14,"int":14,"bool":false,"date":"1970-01-01 03:25:45","float":1.2345600e+02,"double":9.8765432099999998e+03,"string":"string14","string_long":"string14 very long 
string.........","string_big_blobs":"string14 very long string......... big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs","string_enum":"enum3","binary":"YmluYXJ5AA==","oid":"000000000000000000000000","decimal":"1.2345","integers":[-123,17153,34429,51705],"strings":["sub_-123","sub_17153","sub_34429","sub_51705"],"dictionary":{"a":2},"set":[123],"uuid":"00000000-0000-0000-0000-000000000000"}] \ No newline at end of file +[ + { + "_key": 0, + "int": 0, + "bool": false, + "date": "1970-01-01 03:25:45", + "float": 123.456, + "double": 9876.54321, + "string": "string0", + "string_long": "string0 very long string.........", + "string_big_blobs": "string0 very long string......... big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", + "binary": "YmluYXJ5AA==", + "oid": "000000000000000000000000", + "decimal": "1.2345", + "integers": [], + "strings": [], + "dictionary": { + "a": 2 + }, + "set": [ + 123 + ], + "uuid": "00000000-0000-0000-0000-000000000000" + }, + { + "_key": 1, + "int": -1, + "bool": true, + "date": "1970-01-01 03:25:45", + "float": -123.456, + "double": -9876.54321, + "string": "string1", + "string_long": "string1 very long string.........", + "string_big_blobs": "", + "binary": "YmluYXJ5AA==", + "oid": "000000000000000000000000", + "decimal": "1.2345", + "integers": [ + -123 + ], + "strings": [ + "sub_-123" + ], + "dictionary": { + "a": 2 + }, + "set": [ + 123 + ], + "uuid": "00000000-0000-0000-0000-000000000000" + }, + { + "_key": 2, + "int": 2, + "bool": false, + "date": "1970-01-01 03:25:45", + "float": 123.456, + "double": 9876.54321, + "string": "string2", + "string_long": "string2 very long string.........", + "string_big_blobs": "string2 very long string......... 
big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", + "binary": "YmluYXJ5AA==", + "oid": "000000000000000000000000", + "decimal": "1.2345", + "integers": [ + -123, + 2345 + ], + "strings": [ + "sub_-123", + "sub_2345" + ], + "dictionary": { + "a": 2 + }, + "set": [ + 123 + ], + "uuid": "00000000-0000-0000-0000-000000000000" + }, + { + "_key": 3, + "int": -3, + "bool": true, + "date": "1970-01-01 03:25:45", + "float": -123.456, + "double": -9876.54321, + "string": "string3", + "string_long": "string3 very long string.........", + "string_big_blobs": "", + "binary": "YmluYXJ5AA==", + "oid": "000000000000000000000000", + "decimal": "1.2345", + "integers": [ + -123, + -3825, + -7527 + ], + "strings": [ + "sub_-123", + "sub_-3825", + "sub_-7527" + ], + "dictionary": { + "a": 2 + }, + "set": [ + 123 + ], + "uuid": "00000000-0000-0000-0000-000000000000" + }, + { + "_key": 4, + "int": 4, + "bool": false, + "date": "1970-01-01 03:25:45", + "float": 123.456, + "double": 9876.54321, + "string": "string4", + "string_long": "string4 very long string.........", + "string_big_blobs": "string4 very long string......... 
big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", + "binary": "YmluYXJ5AA==", + "oid": "000000000000000000000000", + "decimal": "1.2345", + "integers": [ + -123, + 4813, + 9749, + 14685 + ], + "strings": [ + "sub_-123", + "sub_4813", + "sub_9749", + "sub_14685" + ], + "dictionary": { + "a": 2 + }, + "set": [ + 123 + ], + "uuid": "00000000-0000-0000-0000-000000000000" + }, + { + "_key": 5, + "int": -5, + "bool": true, + "date": "1970-01-01 03:25:45", + "float": -123.456, + "double": -9876.54321, + "string": "string5", + "string_long": "string5 very long string.........", + "string_big_blobs": "", + "binary": "YmluYXJ5AA==", + "oid": "000000000000000000000000", + "decimal": "1.2345", + "integers": [], + "strings": [], + "dictionary": { + "a": 2 + }, + "set": [ + 123 + ], + "uuid": "00000000-0000-0000-0000-000000000000" + }, + { + "_key": 6, + "int": 6, + "bool": false, + "date": "1970-01-01 03:25:45", + "float": 123.456, + "double": 9876.54321, + "string": "string6", + "string_long": "string6 very long string.........", + "string_big_blobs": "string6 very long string......... 
big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", + "binary": "YmluYXJ5AA==", + "oid": "000000000000000000000000", + "decimal": "1.2345", + "integers": [ + -123 + ], + "strings": [ + "sub_-123" + ], + "dictionary": { + "a": 2 + }, + "set": [ + 123 + ], + "uuid": "00000000-0000-0000-0000-000000000000" + }, + { + "_key": 7, + "int": -7, + "bool": true, + "date": "1970-01-01 03:25:45", + "float": -123.456, + "double": -9876.54321, + "string": "string7", + "string_long": "string7 very long string.........", + "string_big_blobs": "", + "binary": "YmluYXJ5AA==", + "oid": "000000000000000000000000", + "decimal": "1.2345", + "integers": [ + -123, + -8761 + ], + "strings": [ + "sub_-123", + "sub_-8761" + ], + "dictionary": { + "a": 2 + }, + "set": [ + 123 + ], + "uuid": "00000000-0000-0000-0000-000000000000" + }, + { + "_key": 8, + "int": 8, + "bool": false, + "date": "1970-01-01 03:25:45", + "float": 123.456, + "double": 9876.54321, + "string": "string8", + "string_long": "string8 very long string.........", + "string_big_blobs": "string8 very long string......... 
big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", + "binary": "YmluYXJ5AA==", + "oid": "000000000000000000000000", + "decimal": "1.2345", + "integers": [ + -123, + 9749, + 19621 + ], + "strings": [ + "sub_-123", + "sub_9749", + "sub_19621" + ], + "dictionary": { + "a": 2 + }, + "set": [ + 123 + ], + "uuid": "00000000-0000-0000-0000-000000000000" + }, + { + "_key": 9, + "int": -9, + "bool": true, + "date": "1970-01-01 03:25:45", + "float": -123.456, + "double": -9876.54321, + "string": "string9", + "string_long": "string9 very long string.........", + "string_big_blobs": "", + "binary": "YmluYXJ5AA==", + "oid": "000000000000000000000000", + "decimal": "1.2345", + "integers": [ + -123, + -11229, + -22335, + -33441 + ], + "strings": [ + "sub_-123", + "sub_-11229", + "sub_-22335", + "sub_-33441" + ], + "dictionary": { + "a": 2 + }, + "set": [ + 123 + ], + "uuid": "00000000-0000-0000-0000-000000000000" + }, + { + "_key": 10, + "int": 10, + "bool": false, + "date": "1970-01-01 03:25:45", + "float": 123.456, + "double": 9876.54321, + "string": "string10", + "string_long": "string10 very long string.........", + "string_big_blobs": "string10 very long string......... 
big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", + "binary": "YmluYXJ5AA==", + "oid": "000000000000000000000000", + "decimal": "1.2345", + "integers": [], + "strings": [], + "dictionary": { + "a": 2 + }, + "set": [ + 123 + ], + "uuid": "00000000-0000-0000-0000-000000000000" + }, + { + "_key": 11, + "int": -11, + "bool": true, + "date": "1970-01-01 03:25:45", + "float": -123.456, + "double": -9876.54321, + "string": "string11", + "string_long": "string11 very long string.........", + "string_big_blobs": "", + "binary": "YmluYXJ5AA==", + "oid": "000000000000000000000000", + "decimal": "1.2345", + "integers": [ + -123 + ], + "strings": [ + "sub_-123" + ], + "dictionary": { + "a": 2 + }, + "set": [ + 123 + ], + "uuid": "00000000-0000-0000-0000-000000000000" + }, + { + "_key": 12, + "int": 12, + "bool": false, + "date": "1970-01-01 03:25:45", + "float": 123.456, + "double": 9876.54321, + "string": "string12", + "string_long": "string12 very long string.........", + "string_big_blobs": "string12 very long string......... 
big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", + "binary": "YmluYXJ5AA==", + "oid": "000000000000000000000000", + "decimal": "1.2345", + "integers": [ + -123, + 14685 + ], + "strings": [ + "sub_-123", + "sub_14685" + ], + "dictionary": { + "a": 2 + }, + "set": [ + 123 + ], + "uuid": "00000000-0000-0000-0000-000000000000" + }, + { + "_key": 13, + "int": -13, + "bool": true, + "date": "1970-01-01 03:25:45", + "float": -123.456, + "double": -9876.54321, + "string": "string13", + "string_long": "string13 very long string.........", + "string_big_blobs": "", + "binary": "YmluYXJ5AA==", + "oid": "000000000000000000000000", + "decimal": "1.2345", + "integers": [ + -123, + -16165, + -32207 + ], + "strings": [ + "sub_-123", + "sub_-16165", + "sub_-32207" + ], + "dictionary": { + "a": 2 + }, + "set": [ + 123 + ], + "uuid": "00000000-0000-0000-0000-000000000000" + }, + { + "_key": 14, + "int": 14, + "bool": false, + "date": "1970-01-01 03:25:45", + "float": 123.456, + "double": 9876.54321, + "string": "string14", + "string_long": "string14 very long string.........", + "string_big_blobs": "string14 very long string......... big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", + "binary": "YmluYXJ5AA==", + "oid": "000000000000000000000000", + "decimal": "1.2345", + "integers": [ + -123, + 17153, + 34429, + 51705 + ], + "strings": [ + "sub_-123", + "sub_17153", + "sub_34429", + "sub_51705" + ], + "dictionary": { + "a": 2 + }, + "set": [ + 123 + ], + "uuid": "00000000-0000-0000-0000-000000000000" + } +] diff --git a/test/expect_xjson.json b/test/expect_xjson.json index 0a551ccc102..4f035b9c82d 100644 --- a/test/expect_xjson.json +++ b/test/expect_xjson.json @@ -18,7 +18,6 @@ "string": "string0", "string_long": "string0 very long string.........", "string_big_blobs": "string0 very long string......... 
big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", - "string_enum": "enum1", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -69,7 +68,6 @@ "string": "string1", "string_long": "string1 very long string.........", "string_big_blobs": "", - "string_enum": "enum2", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -126,7 +124,6 @@ "string": "string2", "string_long": "string2 very long string.........", "string_big_blobs": "string2 very long string......... big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", - "string_enum": "enum3", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -187,7 +184,6 @@ "string": "string3", "string_long": "string3 very long string.........", "string_big_blobs": "", - "string_enum": "enum1", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -252,7 +248,6 @@ "string": "string4", "string_long": "string4 very long string.........", "string_big_blobs": "string4 very long string......... big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", - "string_enum": "enum2", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -321,7 +316,6 @@ "string": "string5", "string_long": "string5 very long string.........", "string_big_blobs": "", - "string_enum": "enum3", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -372,7 +366,6 @@ "string": "string6", "string_long": "string6 very long string.........", "string_big_blobs": "string6 very long string......... 
big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", - "string_enum": "enum1", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -429,7 +422,6 @@ "string": "string7", "string_long": "string7 very long string.........", "string_big_blobs": "", - "string_enum": "enum2", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -490,7 +482,6 @@ "string": "string8", "string_long": "string8 very long string.........", "string_big_blobs": "string8 very long string......... big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", - "string_enum": "enum3", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -555,7 +546,6 @@ "string": "string9", "string_long": "string9 very long string.........", "string_big_blobs": "", - "string_enum": "enum1", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -624,7 +614,6 @@ "string": "string10", "string_long": "string10 very long string.........", "string_big_blobs": "string10 very long string......... big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", - "string_enum": "enum2", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -675,7 +664,6 @@ "string": "string11", "string_long": "string11 very long string.........", "string_big_blobs": "", - "string_enum": "enum3", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -732,7 +720,6 @@ "string": "string12", "string_long": "string12 very long string.........", "string_big_blobs": "string12 very long string......... 
big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", - "string_enum": "enum1", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -793,7 +780,6 @@ "string": "string13", "string_long": "string13 very long string.........", "string_big_blobs": "", - "string_enum": "enum2", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -858,7 +844,6 @@ "string": "string14", "string_long": "string14 very long string.........", "string_big_blobs": "string14 very long string......... big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", - "string_enum": "enum3", "binary": { "$binary": { "base64": "YmluYXJ5AA==", diff --git a/test/expect_xjson_plus.json b/test/expect_xjson_plus.json index 4b2f456835e..c0bcf5616e9 100644 --- a/test/expect_xjson_plus.json +++ b/test/expect_xjson_plus.json @@ -18,7 +18,6 @@ "string": "string0", "string_long": "string0 very long string.........", "string_big_blobs": "string0 very long string......... big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", - "string_enum": "enum1", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -73,7 +72,6 @@ "string": "string1", "string_long": "string1 very long string.........", "string_big_blobs": "", - "string_enum": "enum2", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -134,7 +132,6 @@ "string": "string2", "string_long": "string2 very long string.........", "string_big_blobs": "string2 very long string......... 
big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", - "string_enum": "enum3", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -199,7 +196,6 @@ "string": "string3", "string_long": "string3 very long string.........", "string_big_blobs": "", - "string_enum": "enum1", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -268,7 +264,6 @@ "string": "string4", "string_long": "string4 very long string.........", "string_big_blobs": "string4 very long string......... big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", - "string_enum": "enum2", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -341,7 +336,6 @@ "string": "string5", "string_long": "string5 very long string.........", "string_big_blobs": "", - "string_enum": "enum3", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -396,7 +390,6 @@ "string": "string6", "string_long": "string6 very long string.........", "string_big_blobs": "string6 very long string......... big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", - "string_enum": "enum1", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -457,7 +450,6 @@ "string": "string7", "string_long": "string7 very long string.........", "string_big_blobs": "", - "string_enum": "enum2", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -522,7 +514,6 @@ "string": "string8", "string_long": "string8 very long string.........", "string_big_blobs": "string8 very long string......... 
big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", - "string_enum": "enum3", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -591,7 +582,6 @@ "string": "string9", "string_long": "string9 very long string.........", "string_big_blobs": "", - "string_enum": "enum1", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -664,7 +654,6 @@ "string": "string10", "string_long": "string10 very long string.........", "string_big_blobs": "string10 very long string......... big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", - "string_enum": "enum2", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -719,7 +708,6 @@ "string": "string11", "string_long": "string11 very long string.........", "string_big_blobs": "", - "string_enum": "enum3", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -780,7 +768,6 @@ "string": "string12", "string_long": "string12 very long string.........", "string_big_blobs": "string12 very long string......... big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", - "string_enum": "enum1", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -845,7 +832,6 @@ "string": "string13", "string_long": "string13 very long string.........", "string_big_blobs": "", - "string_enum": "enum2", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -914,7 +900,6 @@ "string": "string14", "string_long": "string14 very long string.........", "string_big_blobs": "string14 very long string......... 
big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", - "string_enum": "enum3", "binary": { "$binary": { "base64": "YmluYXJ5AA==", diff --git a/test/fuzz_group.cpp b/test/fuzz_group.cpp index f4c4e3c3c26..f19f995d98f 100644 --- a/test/fuzz_group.cpp +++ b/test/fuzz_group.cpp @@ -79,7 +79,6 @@ enum INS { CREATE_TABLE_VIEW, COMPACT, IS_NULL, - ENUMERATE_COLUMN, COUNT }; @@ -597,19 +596,6 @@ void parse_and_apply_instructions(std::string& in, const std::string& path, std: t->remove_object_recursive(key); } } - else if (instr == ENUMERATE_COLUMN && wt->size() > 0) { - TableKey table_key = wt->get_table_keys()[get_next(s) % wt->size()]; - TableRef t = wt->get_table(table_key); - auto all_col_keys = t->get_column_keys(); - if (!all_col_keys.empty()) { - size_t ndx = get_next(s) % all_col_keys.size(); - ColKey col = all_col_keys[ndx]; - if (log) { - *log << "wt->get_table(" << table_key << ")->enumerate_string_column(" << col << ");\n"; - } - wt->get_table(table_key)->enumerate_string_column(col); - } - } else if (instr == COMMIT) { if (log) { *log << "wt->commit_and_continue_as_read();\n"; diff --git a/test/realm-fuzzer/fuzz_engine.cpp b/test/realm-fuzzer/fuzz_engine.cpp index 896dbdceb4a..82951a6c5bf 100644 --- a/test/realm-fuzzer/fuzz_engine.cpp +++ b/test/realm-fuzzer/fuzz_engine.cpp @@ -151,9 +151,6 @@ void FuzzEngine::do_fuzz(FuzzConfigurator& cnf) else if (instr == Remove_Recursive && group.size() > 0) { fuzzer.remove_recursive(group, log, state); } - else if (instr == Enumerate_Column && group.size() > 0) { - fuzzer.enumerate_column(group, log, state); - } else if (instr == Commit) { fuzzer.commit(shared_realm, log); } diff --git a/test/realm-fuzzer/fuzz_object.cpp b/test/realm-fuzzer/fuzz_object.cpp index 80f28bb65dd..c1d66ea455a 100644 --- a/test/realm-fuzzer/fuzz_object.cpp +++ b/test/realm-fuzzer/fuzz_object.cpp @@ -324,20 +324,6 @@ void FuzzObject::remove_recursive(Group& group, FuzzLog& log, 
State& s) } } -void FuzzObject::enumerate_column(Group& group, FuzzLog& log, State& s) -{ - log << "FuzzObject::enumerate_column();\n"; - TableKey table_key = group.get_table_keys()[get_next_token(s) % group.size()]; - TableRef t = group.get_table(table_key); - auto all_col_keys = t->get_column_keys(); - if (!all_col_keys.empty()) { - size_t ndx = get_next_token(s) % all_col_keys.size(); - ColKey col = all_col_keys[ndx]; - log << "group.get_table(" << table_key << ")->enumerate_string_column(" << col << ");\n"; - group.get_table(table_key)->enumerate_string_column(col); - } -} - void FuzzObject::get_all_column_names(Group& group, FuzzLog& log) { log << "FuzzObject::get_all_column_names();\n"; diff --git a/test/realm-fuzzer/fuzz_object.hpp b/test/realm-fuzzer/fuzz_object.hpp index 469f76a109f..2fd31f79a7b 100644 --- a/test/realm-fuzzer/fuzz_object.hpp +++ b/test/realm-fuzzer/fuzz_object.hpp @@ -43,7 +43,6 @@ class FuzzObject { void set_obj(realm::Group& group, FuzzLog& log, State& s); void remove_obj(realm::Group& group, FuzzLog& log, State& s); void remove_recursive(realm::Group& group, FuzzLog& log, State& s); - void enumerate_column(realm::Group& group, FuzzLog& log, State& s); void get_all_column_names(realm::Group& group, FuzzLog& log); void commit(realm::SharedRealm shared_realm, FuzzLog& log); void rollback(realm::SharedRealm shared_realm, realm::Group& group, FuzzLog& log); diff --git a/test/realm-fuzzer/util.hpp b/test/realm-fuzzer/util.hpp index deb6946bdfb..1f1d6a5b754 100644 --- a/test/realm-fuzzer/util.hpp +++ b/test/realm-fuzzer/util.hpp @@ -52,8 +52,7 @@ enum Instruction { Create_Table_View = 20, Compact = 21, Is_Null = 22, - Enumerate_Column = 23, - Count = 24 + Count = 23 }; diff --git a/test/test_group.cpp b/test/test_group.cpp index f7133fe05c4..3e58f9c274f 100644 --- a/test/test_group.cpp +++ b/test/test_group.cpp @@ -916,53 +916,6 @@ TEST(Group_Close) Group from_mem(buffer); } -TEST(Group_Serialize_Optimized) -{ - // Create group with one table 
- Group to_mem; - TableRef table = to_mem.add_table("test"); - test_table_add_columns(table); - - for (size_t i = 0; i < 5; ++i) { - table->create_object().set_all("abd", 1, true, int(Mon)); - table->create_object().set_all("eftg", 2, true, int(Tue)); - table->create_object().set_all("hijkl", 5, true, int(Wed)); - table->create_object().set_all("mnopqr", 8, true, int(Thu)); - table->create_object().set_all("stuvxyz", 9, true, int(Fri)); - } - - ColKey col_string = table->get_column_keys()[0]; - table->enumerate_string_column(col_string); - -#ifdef REALM_DEBUG - to_mem.verify(); -#endif - - // Serialize to memory (we now own the buffer) - BinaryData buffer = to_mem.write_to_mem(); - - // Load the table - Group from_mem(buffer); - TableRef t = from_mem.get_table("test"); - - CHECK_EQUAL(4, t->get_column_count()); - - // Verify that original values are there - CHECK(*table == *t); - - // Add a row with a known (but unique) value - auto k = table->create_object().set_all("search_target", 9, true, int(Fri)).get_key(); - - const auto res = table->find_first_string(col_string, "search_target"); - CHECK_EQUAL(k, res); - -#ifdef REALM_DEBUG - to_mem.verify(); - from_mem.verify(); -#endif -} - - TEST(Group_Serialize_All) { // Create group with one table diff --git a/test/test_index_string.cpp b/test/test_index_string.cpp index 83ed8c342ec..4a588cd0666 100644 --- a/test/test_index_string.cpp +++ b/test/test_index_string.cpp @@ -150,13 +150,10 @@ class column { std::vector m_keys; }; - column(bool nullable = false, bool enumerated = false) + column(bool nullable = false) : m_column(this) { m_col_key = m_table.add_column(ColumnTypeTraits::id, "values", nullable); - if (enumerated) { - m_table.enumerate_string_column(m_col_key); - } } ColumnTestType& get_column() { @@ -172,62 +169,24 @@ class column { class string_column : public column { public: string_column() - : column(false, false) + : column(false) { } static bool is_nullable() { return false; } - static bool 
is_enumerated() - { - return false; - } }; class nullable_string_column : public column { public: nullable_string_column() - : column(true, false) - { - } - static bool is_nullable() - { - return true; - } - static bool is_enumerated() - { - return false; - } -}; -class enum_column : public column { -public: - enum_column() - : column(false, true) - { - } - static bool is_nullable() - { - return false; - } - static bool is_enumerated() - { - return true; - } -}; -class nullable_enum_column : public column { -public: - nullable_enum_column() - : column(true, true) + : column(true) { } static bool is_nullable() { return true; } - static bool is_enumerated() - { - return true; - } }; // disable to avoid warnings about not being used - enable when tests @@ -300,7 +259,7 @@ TEST(StringIndex_NonIndexable) } } -TEST_TYPES(StringIndex_BuildIndex, string_column, nullable_string_column, enum_column, nullable_enum_column) +TEST_TYPES(StringIndex_BuildIndex, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); @@ -331,7 +290,7 @@ TEST_TYPES(StringIndex_BuildIndex, string_column, nullable_string_column, enum_c CHECK_EQUAL(6, r6.value); } -TEST_TYPES(StringIndex_DeleteAll, string_column, nullable_string_column, enum_column, nullable_enum_column) +TEST_TYPES(StringIndex_DeleteAll, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); @@ -379,7 +338,7 @@ TEST_TYPES(StringIndex_DeleteAll, string_column, nullable_string_column, enum_co CHECK(ndx.is_empty()); } -TEST_TYPES(StringIndex_Delete, string_column, nullable_string_column, enum_column, nullable_enum_column) +TEST_TYPES(StringIndex_Delete, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); @@ -424,7 +383,7 @@ TEST_TYPES(StringIndex_Delete, string_column, 
nullable_string_column, enum_colum } -TEST_TYPES(StringIndex_ClearEmpty, string_column, nullable_string_column, enum_column, nullable_enum_column) +TEST_TYPES(StringIndex_ClearEmpty, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); @@ -437,7 +396,7 @@ TEST_TYPES(StringIndex_ClearEmpty, string_column, nullable_string_column, enum_c CHECK(ndx.is_empty()); } -TEST_TYPES(StringIndex_Clear, string_column, nullable_string_column, enum_column, nullable_enum_column) +TEST_TYPES(StringIndex_Clear, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); @@ -482,7 +441,7 @@ TEST_TYPES(StringIndex_Clear, string_column, nullable_string_column, enum_column } -TEST_TYPES(StringIndex_Set, string_column, nullable_string_column, enum_column, nullable_enum_column) +TEST_TYPES(StringIndex_Set, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); @@ -527,7 +486,7 @@ TEST_TYPES(StringIndex_Set, string_column, nullable_string_column, enum_column, CHECK_EQUAL(4, col.find_first(s6)); } -TEST_TYPES(StringIndex_Count, string_column, nullable_string_column, enum_column, nullable_enum_column) +TEST_TYPES(StringIndex_Count, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); @@ -559,7 +518,7 @@ TEST_TYPES(StringIndex_Count, string_column, nullable_string_column, enum_column CHECK_EQUAL(4, c4); } -TEST_TYPES(StringIndex_Distinct, string_column, nullable_string_column, enum_column, nullable_enum_column) +TEST_TYPES(StringIndex_Distinct, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); @@ -580,7 +539,7 @@ TEST_TYPES(StringIndex_Distinct, string_column, 
nullable_string_column, enum_col CHECK(ndx->has_duplicate_values()); } -TEST_TYPES(StringIndex_FindAllNoCopy, string_column, nullable_string_column, enum_column, nullable_enum_column) +TEST_TYPES(StringIndex_FindAllNoCopy, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); @@ -706,8 +665,7 @@ TEST(StringIndex_FindAllNoCopy2_IntNull) CHECK_EQUAL(results.payload, col.size() - 1); } -TEST_TYPES(StringIndex_FindAllNoCopyCommonPrefixStrings, string_column, nullable_string_column, enum_column, - nullable_enum_column) +TEST_TYPES(StringIndex_FindAllNoCopyCommonPrefixStrings, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); @@ -951,7 +909,7 @@ TEST_TYPES_IF(StringIndex_EmbeddedZeroesCombinations, TEST_DURATION > 1, string_ } // Tests for a bug with strings containing zeroes -TEST_TYPES(StringIndex_EmbeddedZeroes, string_column, nullable_string_column, enum_column, nullable_enum_column) +TEST_TYPES(StringIndex_EmbeddedZeroes, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col2 = test_resources.get_column(); @@ -982,10 +940,10 @@ TEST_TYPES(StringIndex_EmbeddedZeroes, string_column, nullable_string_column, en CHECK_EQUAL(f, null_key); } -TEST_TYPES(StringIndex_Null, nullable_string_column, nullable_enum_column) +TEST(StringIndex_Null) { - TEST_TYPE test_resources; - typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); + nullable_string_column test_resources; + auto& col = test_resources.get_column(); col.add(""); col.add(realm::null()); @@ -997,7 +955,7 @@ TEST_TYPES(StringIndex_Null, nullable_string_column, nullable_enum_column) } -TEST_TYPES(StringIndex_Zero_Crash, string_column, nullable_string_column, enum_column, nullable_enum_column) +TEST_TYPES(StringIndex_Zero_Crash, string_column, nullable_string_column) { bool nullable 
= TEST_TYPE::is_nullable(); @@ -1010,9 +968,6 @@ TEST_TYPES(StringIndex_Zero_Crash, string_column, nullable_string_column, enum_c auto k2 = table.create_object().set(col, StringData("\0\0", 2)).get_key(); table.add_search_index(col); - if (TEST_TYPE::is_enumerated()) - table.enumerate_string_column(col); - ObjKey t; t = table.find_first_string(col, StringData("")); @@ -1154,7 +1109,7 @@ TEST(StringIndex_Integer_Increasing) } } -TEST_TYPES(StringIndex_Duplicate_Values, string_column, nullable_string_column, enum_column, nullable_enum_column) +TEST_TYPES(StringIndex_Duplicate_Values, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); @@ -1222,7 +1177,7 @@ TEST_TYPES(StringIndex_Duplicate_Values, string_column, nullable_string_column, CHECK(col.size() == 0); } -TEST_TYPES(StringIndex_MaxBytes, string_column, nullable_string_column, enum_column, nullable_enum_column) +TEST_TYPES(StringIndex_MaxBytes, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); @@ -1267,7 +1222,7 @@ TEST_TYPES(StringIndex_MaxBytes, string_column, nullable_string_column, enum_col // for the characters at the end (they have an identical very // long prefix). This was causing a stack overflow because of // the recursive nature of the insert function. 
-TEST_TYPES(StringIndex_InsertLongPrefix, string_column, nullable_string_column, enum_column, nullable_enum_column) +TEST_TYPES(StringIndex_InsertLongPrefix, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); @@ -1346,8 +1301,7 @@ TEST_TYPES(StringIndex_InsertLongPrefix, string_column, nullable_string_column, col.clear(); // calls recursive function Array::destroy_deep() } -TEST_TYPES(StringIndex_InsertLongPrefixAndQuery, string_column, nullable_string_column, enum_column, - nullable_enum_column) +TEST_TYPES(StringIndex_InsertLongPrefixAndQuery, string_column, nullable_string_column) { constexpr int half_node_size = REALM_MAX_BPNODE_SIZE / 2; bool nullable_column = TEST_TYPE::is_nullable(); @@ -1379,8 +1333,6 @@ TEST_TYPES(StringIndex_InsertLongPrefixAndQuery, string_column, nullable_string_ index->to_dot(o, ""); } */ - if (TEST_TYPE::is_enumerated()) - t->enumerate_string_column(col); auto ndx_a = t->where().equal(col, StringData(str_a)).find(); auto cnt = t->count_string(col, StringData(str_a)); @@ -1514,7 +1466,7 @@ void check_result_order(const std::vector& results, TestContext& test_co } // end anonymous namespace -TEST_TYPES(StringIndex_Insensitive, string_column, nullable_string_column, enum_column, nullable_enum_column) +TEST_TYPES(StringIndex_Insensitive, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); @@ -1681,7 +1633,7 @@ TEST_TYPES(StringIndex_Insensitive_Unicode, non_nullable, nullable) */ -TEST_TYPES(StringIndex_45, string_column, nullable_string_column, enum_column, nullable_enum_column) +TEST_TYPES(StringIndex_45, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); @@ -1715,8 +1667,7 @@ std::string create_random_a_string(size_t max_len) // Excluded when run with valgrind because it takes a 
long time -TEST_TYPES_IF(StringIndex_Insensitive_Fuzz, TEST_DURATION > 1, string_column, nullable_string_column, enum_column, - nullable_enum_column) +TEST_TYPES_IF(StringIndex_Insensitive_Fuzz, TEST_DURATION > 1, string_column, nullable_string_column) { const size_t max_str_len = 9; const size_t iters = 3; @@ -1763,8 +1714,7 @@ TEST_TYPES_IF(StringIndex_Insensitive_Fuzz, TEST_DURATION > 1, string_column, nu // Exercise the StringIndex case insensitive search for strings with very long, common prefixes // to cover the special case code paths where different strings are stored in a list. -TEST_TYPES(StringIndex_Insensitive_VeryLongStrings, string_column, nullable_string_column, enum_column, - nullable_enum_column) +TEST_TYPES(StringIndex_Insensitive_VeryLongStrings, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); @@ -1800,7 +1750,7 @@ TEST_TYPES(StringIndex_Insensitive_VeryLongStrings, string_column, nullable_stri // Bug with case insensitive search on numbers that gives duplicate results -TEST_TYPES(StringIndex_Insensitive_Numbers, string_column, nullable_string_column, enum_column, nullable_enum_column) +TEST_TYPES(StringIndex_Insensitive_Numbers, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); @@ -1819,7 +1769,7 @@ TEST_TYPES(StringIndex_Insensitive_Numbers, string_column, nullable_string_colum } -TEST_TYPES(StringIndex_Rover, string_column, nullable_string_column, enum_column, nullable_enum_column) +TEST_TYPES(StringIndex_Rover, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); diff --git a/test/test_json.cpp b/test/test_json.cpp index 39f61d2a981..c90aeea913a 100644 --- a/test/test_json.cpp +++ b/test/test_json.cpp @@ -93,7 +93,6 @@ void setup_multi_table(Table& table, size_t rows) 
table.add_column(type_String, "string"); // 5 table.add_column(type_String, "string_long"); // 6 ColKey col_string_big = table.add_column(type_String, "string_big_blobs"); // 7 - ColKey col_string_enum = table.add_column(type_String, "string_enum"); // 8 - becomes StringEnumColumn ColKey col_binary = table.add_column(type_Binary, "binary"); // 9 ColKey col_oid = table.add_column(type_ObjectId, "oid"); // 10 ColKey col_decimal = table.add_column(type_Decimal, "decimal"); // 11 @@ -128,17 +127,6 @@ void setup_multi_table(Table& table, size_t rows) obj.set(col_string_big, ""); break; } - switch (i % 3) { - case 0: - obj.set(col_string_enum, "enum1"); - break; - case 1: - obj.set(col_string_enum, "enum2"); - break; - case 2: - obj.set(col_string_enum, "enum3"); - break; - } obj.set(col_binary, BinaryData("binary", 7)); obj.set(col_oid, ObjectId()); obj.set(col_decimal, Decimal128("1.2345")); @@ -158,9 +146,6 @@ void setup_multi_table(Table& table, size_t rows) auto set = obj.get_set(col_set); set.insert(123); } - - // We also want a StringEnumColumn - table.enumerate_string_column(col_string_enum); } bool json_test(std::string json, std::string expected_file, bool generate) diff --git a/test/test_lang_bind_helper.cpp b/test/test_lang_bind_helper.cpp index 04467f781d0..925eb7ec852 100644 --- a/test/test_lang_bind_helper.cpp +++ b/test/test_lang_bind_helper.cpp @@ -989,54 +989,6 @@ TEST(LangBindHelper_AdvanceReadTransact_LinkColumnInNewTable) } -TEST(LangBindHelper_AdvanceReadTransact_EnumeratedStrings) -{ - SHARED_GROUP_TEST_PATH(path); - ShortCircuitHistory hist; - DBRef sg = DB::create(hist, path, DBOptions(crypt_key())); - ColKey c0, c1, c2; - - // Start a read transaction (to be repeatedly advanced) - auto rt = sg->start_read(); - CHECK_EQUAL(0, rt->size()); - - // Create 3 string columns, one primed for conversion to "unique string - // enumeration" representation - { - WriteTransaction wt(sg); - TableRef table_w = wt.add_table("t"); - c0 = 
table_w->add_column(type_String, "a"); - c1 = table_w->add_column(type_String, "b"); - c2 = table_w->add_column(type_String, "c"); - for (int i = 0; i < 1000; ++i) { - std::ostringstream out; - out << i; - std::string str = out.str(); - table_w->create_object(ObjKey{}, {{c0, str}, {c1, "foo"}, {c2, str}}); - } - wt.commit(); - } - rt->advance_read(); - rt->verify(); - ConstTableRef table = rt->get_table("t"); - CHECK_EQUAL(0, table->get_num_unique_values(c0)); - CHECK_EQUAL(0, table->get_num_unique_values(c1)); // Not yet "optimized" - CHECK_EQUAL(0, table->get_num_unique_values(c2)); - - // Optimize - { - WriteTransaction wt(sg); - TableRef table_w = wt.get_table("t"); - table_w->enumerate_string_column(c1); - wt.commit(); - } - rt->advance_read(); - rt->verify(); - CHECK_EQUAL(0, table->get_num_unique_values(c0)); - CHECK_NOT_EQUAL(0, table->get_num_unique_values(c1)); // Must be "optimized" now - CHECK_EQUAL(0, table->get_num_unique_values(c2)); -} - NONCONCURRENT_TEST_IF(LangBindHelper_AdvanceReadTransact_SearchIndex, testing_supports_spawn_process) { SHARED_GROUP_TEST_PATH(path); @@ -5593,28 +5545,6 @@ TEST(LangBindHelper_CopyOnWriteOverflow) } -TEST(LangBindHelper_RollbackOptimize) -{ - SHARED_GROUP_TEST_PATH(path); - const char* key = crypt_key(); - std::unique_ptr hist_w(make_in_realm_history()); - DBRef sg_w = DB::create(*hist_w, path, DBOptions(key)); - auto g = sg_w->start_write(); - - auto table = g->add_table("t0"); - auto col = table->add_column(type_String, "str_col_0", true); - g->commit_and_continue_as_read(); - g->verify(); - g->promote_to_write(); - g->verify(); - std::vector keys; - table->create_objects(198, keys); - table->enumerate_string_column(col); - g->rollback_and_continue_as_read(); - g->verify(); -} - - TEST(LangBindHelper_BinaryReallocOverMax) { SHARED_GROUP_TEST_PATH(path); @@ -5667,33 +5597,6 @@ TEST(LangBindHelper_OpenAsEncrypted) #endif -// Test case generated in [realm-core-4.0.4] on Mon Dec 18 13:33:24 2017. 
-// Adding 0 rows to a StringEnumColumn would add the default value to the keys -// but not the indexes creating an inconsistency. -TEST(LangBindHelper_EnumColumnAddZeroRows) -{ - SHARED_GROUP_TEST_PATH(path); - const char* key = nullptr; - std::unique_ptr hist(make_in_realm_history()); - DBRef sg = DB::create(*hist, path, DBOptions(key)); - auto g = sg->start_write(); - auto g_r = sg->start_read(); - auto table = g->add_table(""); - - auto col = table->add_column(DataType(2), "table", false); - table->enumerate_string_column(col); - g->commit_and_continue_as_read(); - g->verify(); - g->promote_to_write(); - g->verify(); - table->create_object(); - g->commit_and_continue_as_read(); - g_r->advance_read(); - g_r->verify(); - g->verify(); -} - - TEST(LangBindHelper_RemoveObject) { SHARED_GROUP_TEST_PATH(path); diff --git a/test/test_links.cpp b/test/test_links.cpp index 7561364089b..6e1eed71f47 100644 --- a/test/test_links.cpp +++ b/test/test_links.cpp @@ -752,17 +752,6 @@ TEST(ListList_Clear) CHECK_EQUAL(links2->size(), 0); } -TEST(Links_AddBacklinkToTableWithEnumColumns) -{ - Group g; - auto table = g.add_table("fshno"); - auto col = table->add_column(type_String, "strings", false); - table->create_object(); - table->add_column(*table, "link1"); - table->enumerate_string_column(col); - table->add_column(*table, "link2"); -} - TEST(Links_LinkList_Inserts) { Group group; diff --git a/test/test_query.cpp b/test/test_query.cpp index 313312f6ea7..c2d7215d35f 100644 --- a/test/test_query.cpp +++ b/test/test_query.cpp @@ -2124,15 +2124,12 @@ TEST_TYPES(Query_ListOfPrimitives_MinMax, int64_t, float, double, Decimal128, Ti validate_aggregate_results(test_context, table, col, value, max); } -TEST_TYPES(Query_StringIndexCommonPrefix, std::true_type, std::false_type) +TEST(Query_StringIndexCommonPrefix) { Group group; TableRef table = group.add_table("test"); auto col_str = table->add_column(type_String, "first"); table->add_search_index(col_str); - if (TEST_TYPE::value == 
true) { - table->enumerate_string_column(col_str); - } auto test_prefix_find = [&](std::string prefix) { std::string prefix_b = prefix + "b"; @@ -2822,8 +2819,6 @@ TEST(Query_Huge) for (size_t t = 0; t < 4; t++) { if (t == 1) { - tt.enumerate_string_column(col_str0); - tt.enumerate_string_column(col_str1); } else if (t == 2) { tt.add_search_index(col_str0); @@ -3007,8 +3002,6 @@ TEST_IF(Query_StrIndex3, TEST_DURATION > 0) for (size_t t = 0; t < vec.size(); t++) CHECK_EQUAL(vec[t], v.get_key(t)); - ttt.enumerate_string_column(col_str); - // Linear scan over enum, plus linear integer column scan v = ttt.where().equal(col_str, "AA").equal(col_int, 0).find_all(); CHECK_EQUAL(vec.size(), v.size()); @@ -3059,34 +3052,6 @@ TEST(Query_StrIndex2) CHECK_EQUAL(0, s); } -TEST(Query_StrEnum) -{ - Random random(random_int()); // Seed from slow global generator - Table ttt; - ttt.add_column(type_Int, "1"); - auto col_str = ttt.add_column(type_String, "2"); - - int aa; - int64_t s; - - for (int i = 0; i < 100; ++i) { - ttt.clear(); - aa = 0; - for (size_t t = 0; t < REALM_MAX_BPNODE_SIZE * 2; ++t) { - if (random.chance(1, 3)) { - ttt.create_object().set_all(1, "AA"); - ++aa; - } - else { - ttt.create_object().set_all(1, "BB"); - } - } - ttt.enumerate_string_column(col_str); - s = ttt.where().equal(col_str, "AA").count(); - CHECK_EQUAL(aa, s); - } -} - TEST(Query_StrIndex) { Random random(random_int()); // Seed from slow global generator @@ -3121,10 +3086,6 @@ TEST(Query_StrIndex) s = ttt.where().equal(str_col, "AA").count(); CHECK_EQUAL(aa, s); - ttt.enumerate_string_column(str_col); - s = ttt.where().equal(str_col, "AA").count(); - CHECK_EQUAL(aa, s); - ttt.add_search_index(str_col); s = ttt.where().equal(str_col, "AA").count(); CHECK_EQUAL(aa, s); @@ -3212,49 +3173,6 @@ TEST(Query_StrIndexUpdating) CHECK_EQUAL(tv_ins.size(), 0); } -TEST(Query_GA_Crash) -{ - GROUP_TEST_PATH(path); - Random random(random_int()); // Seed from slow global generator - { - Group g; - TableRef t = 
g.add_table("firstevents"); - auto col_str0 = t->add_column(type_String, "1"); - auto col_str1 = t->add_column(type_String, "2"); - auto col_str2 = t->add_column(type_String, "3"); - t->add_column(type_Int, "4"); - t->add_column(type_Int, "5"); - - for (size_t i = 0; i < 100; ++i) { - int64_t r1 = random.draw_int_mod(100); - int64_t r2 = random.draw_int_mod(100); - - t->create_object().set_all("10", "US", "1.0", r1, r2); - } - t->enumerate_string_column(col_str0); - t->enumerate_string_column(col_str1); - t->enumerate_string_column(col_str2); - g.write(path); - } - - Group g(path); - TableRef t = g.get_table("firstevents"); - auto col_str1 = t->get_column_key("2"); - - Query q = t->where().equal(col_str1, "US"); - - size_t c1 = 0; - for (size_t i = 0; i < 100; ++i) - c1 += t->count_string(col_str1, "US"); - - size_t c2 = 0; - for (size_t i = 0; i < 100; ++i) - c2 += q.count(); - - CHECK_EQUAL(c1, t->size() * 100); - CHECK_EQUAL(c1, c2); -} - TEST(Query_Float3) { Table t; @@ -3563,7 +3481,7 @@ TEST(Query_DoubleCoordinates) } -TEST_TYPES(Query_StrIndexed, std::true_type, std::false_type) +TEST(Query_StrIndexed) { Table ttt; auto col_int = ttt.add_column(type_Int, "1"); @@ -3578,10 +3496,6 @@ TEST_TYPES(Query_StrIndexed, std::true_type, std::false_type) ttt.create_object().set_all(4, "c"); } - if (TEST_TYPE::value == true) { - ttt.enumerate_string_column(col_str); - } - ttt.add_search_index(col_str); auto s = *ttt.where().equal(col_str, "a").sum(col_int); diff --git a/test/test_query2.cpp b/test/test_query2.cpp index b0dc4f75165..462fbcfc3dc 100644 --- a/test/test_query2.cpp +++ b/test/test_query2.cpp @@ -661,35 +661,6 @@ TEST(Query_Binary) } } -TEST(Query_Enums) -{ - Table table; - auto col_int = table.add_column(type_Int, "1"); - auto col_str = table.add_column(type_String, "2"); - - - for (size_t i = 0; i < 5; ++i) { - table.create_object().set_all(1, "abd"); - table.create_object().set_all(2, "eftg"); - table.create_object().set_all(5, "hijkl"); - 
table.create_object().set_all(8, "mnopqr"); - table.create_object().set_all(9, "stuvxyz"); - } - - table.enumerate_string_column(col_str); - - Query q1 = table.where().equal(col_str, "eftg"); - TableView tv1 = q1.find_all(); - - CHECK_EQUAL(5, tv1.size()); - CHECK_EQUAL(2, tv1[0].get(col_int)); - CHECK_EQUAL(2, tv1[1].get(col_int)); - CHECK_EQUAL(2, tv1[2].get(col_int)); - CHECK_EQUAL(2, tv1[3].get(col_int)); - CHECK_EQUAL(2, tv1[4].get(col_int)); -} - - TEST_TYPES(Query_CaseSensitivity, std::true_type, std::false_type) { constexpr bool nullable = TEST_TYPE::value; @@ -1422,29 +1393,16 @@ TEST(Query_NullStrings) TEST(Query_Nulls_Fuzzy) { - for (int attributes = 1; attributes < 5; attributes++) { + for (int attributes = 1; attributes < 3; attributes++) { Random random(random_int()); for (size_t t = 0; t < 10; t++) { Table table; auto col = table.add_column(type_String, "string", true); - if (attributes == 0) { - } if (attributes == 1) { table.add_search_index(col); } - else if (attributes == 2) { - table.enumerate_string_column(col); - } - else if (attributes == 3) { - table.add_search_index(col); - table.enumerate_string_column(col); - } - else if (attributes == 4) { - table.enumerate_string_column(col); - table.add_search_index(col); - } // map that is kept in sync with the column so that we can compare with it std::map v; diff --git a/test/test_shared.cpp b/test/test_shared.cpp index 8bbfc070a62..c8f75c47fed 100644 --- a/test/test_shared.cpp +++ b/test/test_shared.cpp @@ -2571,10 +2571,6 @@ TEST(Shared_MovingSearchIndex) obj.set(enum_col, "bar"); } table->get_object(obj_keys.back()).set(enum_col, "bar63"); - table->enumerate_string_column(enum_col); - CHECK_EQUAL(0, table->get_num_unique_values(int_col)); - CHECK_EQUAL(0, table->get_num_unique_values(str_col)); - CHECK_EQUAL(2, table->get_num_unique_values(enum_col)); table->add_search_index(int_col); table->add_search_index(str_col); @@ -2597,9 +2593,6 @@ TEST(Shared_MovingSearchIndex) 
CHECK(table->has_search_index(int_col)); CHECK(table->has_search_index(str_col)); CHECK(table->has_search_index(enum_col)); - CHECK_EQUAL(0, table->get_num_unique_values(int_col)); - CHECK_EQUAL(0, table->get_num_unique_values(str_col)); - CHECK_EQUAL(2, table->get_num_unique_values(enum_col)); CHECK_EQUAL(ObjKey(), table->find_first_int(int_col, 100)); CHECK_EQUAL(ObjKey(), table->find_first_string(str_col, "bad")); CHECK_EQUAL(ObjKey(), table->find_first_string(enum_col, "bad")); @@ -2613,9 +2606,6 @@ TEST(Shared_MovingSearchIndex) CHECK(table->has_search_index(int_col)); CHECK(table->has_search_index(str_col)); CHECK(table->has_search_index(enum_col)); - CHECK_EQUAL(0, table->get_num_unique_values(int_col)); - CHECK_EQUAL(0, table->get_num_unique_values(str_col)); - CHECK_EQUAL(2, table->get_num_unique_values(enum_col)); CHECK_EQUAL(ObjKey(), table->find_first_int(int_col, 100)); CHECK_EQUAL(ObjKey(), table->find_first_string(str_col, "bad")); CHECK_EQUAL(ObjKey(), table->find_first_string(enum_col, "bad")); @@ -2632,9 +2622,6 @@ TEST(Shared_MovingSearchIndex) CHECK(table->has_search_index(int_col)); CHECK(table->has_search_index(str_col)); CHECK(table->has_search_index(enum_col)); - CHECK_EQUAL(0, table->get_num_unique_values(int_col)); - CHECK_EQUAL(0, table->get_num_unique_values(str_col)); - CHECK_EQUAL(3, table->get_num_unique_values(enum_col)); CHECK_EQUAL(ObjKey(), table->find_first_int(int_col, 100)); CHECK_EQUAL(ObjKey(), table->find_first_string(str_col, "bad")); CHECK_EQUAL(ObjKey(), table->find_first_string(enum_col, "bad")); @@ -3632,34 +3619,6 @@ TEST(Shared_OpenAfterClose) db_w->close(); } -TEST(Shared_RemoveTableWithEnumAndLinkColumns) -{ - // Test case generated with fuzzer - SHARED_GROUP_TEST_PATH(path); - DBRef db_w = DB::create(path); - TableKey tk; - { - auto wt = db_w->start_write(); - wt->add_table("Table_2"); - wt->commit(); - } - { - auto wt = db_w->start_write(); - auto table = wt->get_table("Table_2"); - tk = table->get_key(); - auto 
col_key = table->add_column(DataType(2), "string_3", false); - table->enumerate_string_column(col_key); - table->add_column(*table, "link_5"); - table->add_search_index(col_key); - wt->commit(); - } - { - auto wt = db_w->start_write(); - wt->remove_table(tk); - wt->commit(); - } -} - TEST(Shared_GenerateObjectIdAfterRollback) { // Test case generated in [realm-core-6.0.0-alpha.0] on Mon Aug 13 14:43:06 2018. diff --git a/test/test_table.cpp b/test/test_table.cpp index 52e06fb2659..4029043e0e1 100644 --- a/test/test_table.cpp +++ b/test/test_table.cpp @@ -366,18 +366,6 @@ TEST(Table_DeleteCrash) table->remove_object(k1); } -TEST(Table_OptimizeCrash) -{ - // This will crash at the .add() method - Table ttt; - ttt.add_column(type_Int, "first"); - auto col = ttt.add_column(type_String, "second"); - ttt.enumerate_string_column(col); - ttt.add_search_index(col); - ttt.clear(); - ttt.create_object().set_all(1, "AA"); -} - TEST(Table_DateTimeMinMax) { Group g; @@ -1002,7 +990,6 @@ void setup_multi_table(Table& table, size_t rows, std::vector& keys, std auto string_col = table.add_column(type_String, "string"); // 4 auto string_long_col = table.add_column(type_String, "string_long"); // 5 auto string_big_col = table.add_column(type_String, "string_big_blobs"); // 6 - auto string_enum_col = table.add_column(type_String, "string_enum"); // 7 - becomes StringEnumColumn auto bin_col = table.add_column(type_Binary, "binary"); // 8 auto int_null_col = table.add_column(type_Int, "int_null", true); // 9, nullable = true column_keys.push_back(int_col); @@ -1012,7 +999,6 @@ void setup_multi_table(Table& table, size_t rows, std::vector& keys, std column_keys.push_back(string_col); column_keys.push_back(string_long_col); column_keys.push_back(string_big_col); - column_keys.push_back(string_enum_col); column_keys.push_back(bin_col); column_keys.push_back(int_null_col); @@ -1061,23 +1047,8 @@ void setup_multi_table(Table& table, size_t rows, std::vector& keys, std obj.set(string_big_col, 
StringData("")); break; } - // enum - switch (i % 3) { - case 0: - obj.set(string_enum_col, "enum1"); - break; - case 1: - obj.set(string_enum_col, "enum2"); - break; - case 2: - obj.set(string_enum_col, "enum3"); - break; - } obj.set(bin_col, BinaryData("binary", 7)); } - - // We also want a StringEnumColumn - table.enumerate_string_column(string_enum_col); } } // anonymous namespace @@ -1584,145 +1555,6 @@ TEST(Table_IndexInt) #endif } -TEST(Table_AutoEnumeration) -{ - Table table; - - auto col_int = table.add_column(type_Int, "first"); - auto col_str = table.add_column(type_String, "second"); - - for (size_t i = 0; i < 5; ++i) { - table.create_object().set_all(1, "abd"); - table.create_object().set_all(2, "eftg"); - table.create_object().set_all(5, "hijkl"); - table.create_object().set_all(8, "mnopqr"); - table.create_object().set_all(9, "stuvxyz"); - } - - table.enumerate_string_column(col_str); - - for (size_t i = 0; i < 5; ++i) { - const size_t n = i * 5; - CHECK_EQUAL(1, table.get_object(ObjKey(0 + n)).get(col_int)); - CHECK_EQUAL(2, table.get_object(ObjKey(1 + n)).get(col_int)); - CHECK_EQUAL(5, table.get_object(ObjKey(2 + n)).get(col_int)); - CHECK_EQUAL(8, table.get_object(ObjKey(3 + n)).get(col_int)); - CHECK_EQUAL(9, table.get_object(ObjKey(4 + n)).get(col_int)); - - CHECK_EQUAL("abd", table.get_object(ObjKey(0 + n)).get(col_str)); - CHECK_EQUAL("eftg", table.get_object(ObjKey(1 + n)).get(col_str)); - CHECK_EQUAL("hijkl", table.get_object(ObjKey(2 + n)).get(col_str)); - CHECK_EQUAL("mnopqr", table.get_object(ObjKey(3 + n)).get(col_str)); - CHECK_EQUAL("stuvxyz", table.get_object(ObjKey(4 + n)).get(col_str)); - } - - // Verify counts - const size_t count1 = table.count_string(col_str, "abd"); - const size_t count2 = table.count_string(col_str, "eftg"); - const size_t count3 = table.count_string(col_str, "hijkl"); - const size_t count4 = table.count_string(col_str, "mnopqr"); - const size_t count5 = table.count_string(col_str, "stuvxyz"); - CHECK_EQUAL(5, 
count1); - CHECK_EQUAL(5, count2); - CHECK_EQUAL(5, count3); - CHECK_EQUAL(5, count4); - CHECK_EQUAL(5, count5); - - ObjKey t = table.find_first_string(col_str, "eftg"); - CHECK_EQUAL(ObjKey(1), t); - - auto tv = table.find_all_string(col_str, "eftg"); - CHECK_EQUAL(5, tv.size()); - CHECK_EQUAL("eftg", tv.get_object(0).get(col_str)); - CHECK_EQUAL("eftg", tv.get_object(1).get(col_str)); - CHECK_EQUAL("eftg", tv.get_object(2).get(col_str)); - CHECK_EQUAL("eftg", tv.get_object(3).get(col_str)); - CHECK_EQUAL("eftg", tv.get_object(4).get(col_str)); - - Obj obj = table.create_object(); - CHECK_EQUAL(0, obj.get(col_int)); - CHECK_EQUAL("", obj.get(col_str)); -} - - -TEST(Table_AutoEnumerationOptimize) -{ - Table t; - auto col0 = t.add_column(type_String, "col1"); - auto col1 = t.add_column(type_String, "col2"); - auto col2 = t.add_column(type_String, "col3"); - auto col3 = t.add_column(type_String, "col4"); - - // Insert non-optimizable strings - std::string s; - std::vector keys; - t.create_objects(10, keys); - for (Obj o : t) { - o.set_all(s.c_str(), s.c_str(), s.c_str(), s.c_str()); - s += "x"; - } - - // AutoEnumerate in reverse order - for (Obj o : t) { - o.set(col3, "test"); - } - t.enumerate_string_column(col3); - for (Obj o : t) { - o.set(col2, "test"); - } - t.enumerate_string_column(col2); - for (Obj o : t) { - o.set(col1, "test"); - } - t.enumerate_string_column(col1); - for (Obj o : t) { - o.set(col0, "test"); - } - t.enumerate_string_column(col0); - - for (Obj o : t) { - CHECK_EQUAL("test", o.get(col0)); - CHECK_EQUAL("test", o.get(col1)); - CHECK_EQUAL("test", o.get(col2)); - CHECK_EQUAL("test", o.get(col3)); - } - -#ifdef REALM_DEBUG - t.verify(); -#endif -} - -TEST(Table_OptimizeCompare) -{ - Table t1, t2; - auto col_t1 = t1.add_column(type_String, "str"); - auto col_t2 = t2.add_column(type_String, "str"); - - std::vector keys_t1; - std::vector keys_t2; - t1.create_objects(100, keys_t1); - for (Obj o : t1) { - o.set(col_t1, "foo"); - } - 
t2.create_objects(100, keys_t2); - for (Obj o : t2) { - o.set(col_t2, "foo"); - } - t1.enumerate_string_column(col_t1); - CHECK(t1 == t2); - Obj obj1 = t1.get_object(keys_t1[50]); - Obj obj2 = t2.get_object(keys_t2[50]); - obj1.set(col_t1, "bar"); - CHECK(t1 != t2); - obj1.set(col_t1, "foo"); - CHECK(t1 == t2); - obj2.set(col_t2, "bar"); - CHECK(t1 != t2); - obj2.set(col_t2, "foo"); - CHECK(t1 == t2); -} - - TEST(Table_SlabAlloc) { SlabAlloc alloc; @@ -1756,56 +1588,6 @@ TEST(Table_SlabAlloc) #endif } -TEST(Table_NullInEnum) -{ - Group group; - TableRef table = group.add_table("test"); - auto col = table->add_column(type_String, "second", true); - - for (size_t c = 0; c < 100; c++) { - table->create_object().set(col, "hello"); - } - - size_t r; - - r = table->where().equal(col, "hello").count(); - CHECK_EQUAL(100, r); - - Obj obj50 = table->get_object(ObjKey(50)); - obj50.set(col, realm::null()); - r = table->where().equal(col, "hello").count(); - CHECK_EQUAL(99, r); - - table->enumerate_string_column(col); - - obj50.set(col, realm::null()); - r = table->where().equal(col, "hello").count(); - CHECK_EQUAL(99, r); - - obj50.set(col, "hello"); - r = table->where().equal(col, "hello").count(); - CHECK_EQUAL(100, r); - - obj50.set(col, realm::null()); - r = table->where().equal(col, "hello").count(); - CHECK_EQUAL(99, r); - - r = table->where().equal(col, realm::null()).count(); - CHECK_EQUAL(1, r); - - table->get_object(ObjKey(55)).set(col, realm::null()); - r = table->where().equal(col, realm::null()).count(); - CHECK_EQUAL(2, r); - - r = table->where().equal(col, "hello").count(); - CHECK_EQUAL(98, r); - - table->remove_object(ObjKey(55)); - r = table->where().equal(col, realm::null()).count(); - CHECK_EQUAL(1, r); -} - - TEST(Table_DateAndBinary) { Table t; @@ -2122,22 +1904,6 @@ TEST(Table_EmptyMinmax) CHECK(is_null); } -TEST(Table_EnumStringInsertEmptyRow) -{ - Table table; - auto col_str = table.add_column(type_String, "strings"); - for (int i = 0; i < 128; ++i) 
- table.create_object().set(col_str, "foo"); - - CHECK_EQUAL(0, table.get_num_unique_values(col_str)); - table.enumerate_string_column(col_str); - // Make sure we now have an enumerated strings column - CHECK_EQUAL(1, table.get_num_unique_values(col_str)); - Obj obj = table.create_object(); - CHECK_EQUAL("", obj.get(col_str)); - CHECK_EQUAL(2, table.get_num_unique_values(col_str)); -} - TEST(Table_AddColumnWithThreeLevelBptree) { Table table; @@ -2256,24 +2022,14 @@ TEST(Table_NullableChecks) TEST(Table_Nulls) { - // 'round' lets us run this entire test both with and without index and with/without optimize/enum - for (size_t round = 0; round < 5; round++) { + // 'round' lets us run this entire test both with and without index + for (size_t round = 0; round < 2; round++) { Table t; TableView tv; auto col_str = t.add_column(type_String, "str", true /*nullable*/); if (round == 1) t.add_search_index(col_str); - else if (round == 2) - t.enumerate_string_column(col_str); - else if (round == 3) { - t.add_search_index(col_str); - t.enumerate_string_column(col_str); - } - else if (round == 4) { - t.enumerate_string_column(col_str); - t.add_search_index(col_str); - } std::vector keys; t.create_objects(3, keys); diff --git a/test/test_table_view.cpp b/test/test_table_view.cpp index a7f9e1ac8da..72ce797cd5f 100644 --- a/test/test_table_view.cpp +++ b/test/test_table_view.cpp @@ -891,32 +891,6 @@ TEST(TableView_QueryCopyStringOr) CHECK_EQUAL(after_copy_count, 4); } -TEST(TableView_SortEnum) -{ - Table table; - auto col = table.add_column(type_String, "str"); - - table.create_object().set_all("foo"); - table.create_object().set_all("foo"); - table.create_object().set_all("foo"); - - table.enumerate_string_column(col); - - table.create_object().set_all("bbb"); - table.create_object().set_all("aaa"); - table.create_object().set_all("baz"); - - TableView tv = table.where().find_all(); - tv.sort(col); - - CHECK_EQUAL(tv[0].get(col), "aaa"); - CHECK_EQUAL(tv[1].get(col), "baz"); - 
CHECK_EQUAL(tv[2].get(col), "bbb"); - CHECK_EQUAL(tv[3].get(col), "foo"); - CHECK_EQUAL(tv[4].get(col), "foo"); - CHECK_EQUAL(tv[5].get(col), "foo"); -} - TEST(TableView_Backlinks) { Group group; @@ -1319,20 +1293,6 @@ TEST_TYPES(TableView_Distinct, DistinctDirect, DistinctOverLink) CHECK_EQUAL(h.get_key(tv, 3), k0); CHECK_EQUAL(h.get_key(tv, 4), k1); - - // Same as previous test, but with string column being Enum - t.enumerate_string_column(col_str); - tv = h.find_all(); - tv.distinct(h.get_distinct({col_str, col_int})); - tv.sort(h.get_sort({col_str}, {false})); - CHECK_EQUAL(tv.size(), 5); - CHECK_EQUAL(h.get_key(tv, 0), k4); - CHECK_EQUAL(h.get_key(tv, 1), k5); - CHECK_EQUAL(h.get_key(tv, 2), k6); - CHECK_EQUAL(h.get_key(tv, 3), k0); - CHECK_EQUAL(h.get_key(tv, 4), k1); - - // Now test sync_if_needed() tv = h.find_all(); // "", null, "", null, "foo", "foo", "bar" diff --git a/test/test_transactions.cpp b/test/test_transactions.cpp index fbbdd314379..f45a576e9cd 100644 --- a/test/test_transactions.cpp +++ b/test/test_transactions.cpp @@ -544,41 +544,4 @@ TEST(Transactions_Continuous_SerialWrites) } } - -// Check that enumeration is gone after -// rolling back the insertion of a string enum column -TEST(LangBindHelper_RollbackStringEnumInsert) -{ - SHARED_GROUP_TEST_PATH(path); - std::unique_ptr hist_w(make_in_realm_history()); - auto sg_w = DB::create(*hist_w, path); - auto g = sg_w->start_write(); - auto t = g->add_table("t1"); - auto col = t->add_column(type_String, "t1_col0_string"); - - auto populate_with_string_enum = [&]() { - t->create_object().set_all("simple_string"); - t->create_object().set_all("duplicate"); - t->create_object().set_all("duplicate"); - t->enumerate_string_column(col); // upgrade to internal string enum column type - CHECK(t->is_enumerated(col)); - CHECK_EQUAL(t->get_num_unique_values(col), 2); - }; - - g->commit_and_continue_as_read(); - g->promote_to_write(); - - populate_with_string_enum(); - - g->rollback_and_continue_as_read(); - 
g->promote_to_write(); - CHECK(!t->is_enumerated(col)); - populate_with_string_enum(); - - t->begin()->set(col, "duplicate"); - - g->commit_and_continue_as_read(); - CHECK(t->is_enumerated(col)); -} - #endif // TEST_TRANSACTIONS From 1afc39ebb3fb8b50814f4e6fc8485dd9993a5dfd Mon Sep 17 00:00:00 2001 From: nicola cabiddu Date: Wed, 10 Jul 2024 11:51:50 +0100 Subject: [PATCH 07/14] RCORE-2064 String EQ/NEQ optimisations for compressed strings (#7820) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * find_first optimization for compressed strings * core test passing * compression tests for collection of strings * code review * Fixes (#7872) --------- Co-authored-by: Jørgen Edelbo --- src/realm/array_string.cpp | 23 +++++++++++++++--- src/realm/array_string.hpp | 9 +++++++ src/realm/query_engine.cpp | 21 ++++++++++------- src/realm/query_engine.hpp | 48 +++++++++++++++++++++++++++++++------- src/realm/table.cpp | 20 ++++++++++++++-- test/test_query.cpp | 40 +++++++++++++++++++++++++++---- 6 files changed, 135 insertions(+), 26 deletions(-) diff --git a/src/realm/array_string.cpp b/src/realm/array_string.cpp index 1eb2fdaa969..d7277faedf1 100644 --- a/src/realm/array_string.cpp +++ b/src/realm/array_string.cpp @@ -192,6 +192,14 @@ StringData ArrayString::get(size_t ndx) const return {}; } +std::optional realm::ArrayString::get_string_id(size_t ndx) const +{ + if (m_type == Type::interned_strings) { + return StringID(static_cast(m_arr)->get(ndx)); + } + return m_string_interner->lookup(get(ndx)); +} + Mixed ArrayString::get_any(size_t ndx) const { return Mixed(get(ndx)); @@ -274,6 +282,16 @@ void ArrayString::clear() } size_t ArrayString::find_first(StringData value, size_t begin, size_t end) const noexcept +{ + // This should only be called if we don't have a string id for this particular array (aka no string interner) + std::optional id; + if (m_type == Type::interned_strings) + id = m_string_interner->lookup(value); + + return 
find_first(value, begin, end, id); +} + +size_t ArrayString::find_first(StringData value, size_t begin, size_t end, std::optional id) const noexcept { switch (m_type) { case Type::small_strings: @@ -289,14 +307,13 @@ size_t ArrayString::find_first(StringData value, size_t begin, size_t end) const break; } case Type::interned_strings: { - // we need a way to avoid this lookup for each leaf array. The lookup must appear - // higher up the call stack and passed down. - auto id = m_string_interner->lookup(value); if (id) { return static_cast(m_arr)->find_first(*id, begin, end); } break; } + default: + break; } return not_found; } diff --git a/src/realm/array_string.hpp b/src/realm/array_string.hpp index cdf8ccededa..0e5a5cc3895 100644 --- a/src/realm/array_string.hpp +++ b/src/realm/array_string.hpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace realm { @@ -74,6 +75,10 @@ class ArrayString : public ArrayPayload { { m_string_interner = string_interner; } + bool is_compressed() const + { + return m_type == Type::interned_strings; + } void update_parent() { @@ -99,6 +104,7 @@ class ArrayString : public ArrayPayload { } void insert(size_t ndx, StringData value); StringData get(size_t ndx) const; + std::optional get_string_id(size_t ndx) const; Mixed get_any(size_t ndx) const override; bool is_null(size_t ndx) const; void erase(size_t ndx); @@ -107,6 +113,9 @@ class ArrayString : public ArrayPayload { size_t find_first(StringData value, size_t begin, size_t end) const noexcept; + /// Special version for searching in an array or compressed strings. 
+ size_t find_first(StringData value, size_t begin, size_t end, std::optional) const noexcept; + size_t lower_bound(StringData value); /// Get the specified element without the cost of constructing an diff --git a/src/realm/query_engine.cpp b/src/realm/query_engine.cpp index 3a9c375f6d0..03cec8674ad 100644 --- a/src/realm/query_engine.cpp +++ b/src/realm/query_engine.cpp @@ -453,7 +453,7 @@ bool StringNode::do_consume_condition(ParentNode& node) size_t StringNode::_find_first_local(size_t start, size_t end) { if (m_needles.empty()) { - return m_leaf->find_first(m_string_value, start, end); + return m_leaf->find_first(m_string_value, start, end, m_interned_string_id); } else { if (end == npos) @@ -505,7 +505,8 @@ size_t StringNode::_find_first_local(size_t start, size_t end) } StringNodeFulltext::StringNodeFulltext(StringData v, ColKey column, std::unique_ptr lm) - : StringNodeEqualBase(v, column) + : m_value(v) + , m_col(column) , m_link_map(std::move(lm)) { if (!m_link_map) @@ -518,17 +519,21 @@ void StringNodeFulltext::table_changed() } StringNodeFulltext::StringNodeFulltext(const StringNodeFulltext& other) - : StringNodeEqualBase(other) + : ParentNode(other) + , m_value(other.m_value) + , m_col(other.m_col) + , m_link_map(std::make_unique(*other.m_link_map)) { - m_link_map = std::make_unique(*other.m_link_map); } -void StringNodeFulltext::_search_index_init() +void StringNodeFulltext::init(bool will_query_ranges) { - StringIndex* index = m_link_map->get_target_table()->get_string_index(ParentNode::m_condition_column_key); + ParentNode::init(will_query_ranges); + + StringIndex* index = m_link_map->get_target_table()->get_string_index(m_col); REALM_ASSERT(index && index->is_fulltext_index()); m_index_matches.clear(); - index->find_all_fulltext(m_index_matches, StringNodeBase::m_string_value); + index->find_all_fulltext(m_index_matches, m_value); // If links exists, use backlinks to find the original objects if (m_link_map->links_exist()) { @@ -541,7 +546,7 @@ void 
StringNodeFulltext::_search_index_init() } m_index_evaluator = IndexEvaluator{}; - m_index_evaluator->init(&m_index_matches); + m_index_evaluator.init(&m_index_matches); } std::unique_ptr TwoColumnsNodeBase::update_cached_leaf_pointers_for_column(Allocator& alloc, diff --git a/src/realm/query_engine.hpp b/src/realm/query_engine.hpp index 3a428c04d4d..a72a067876c 100644 --- a/src/realm/query_engine.hpp +++ b/src/realm/query_engine.hpp @@ -151,6 +151,8 @@ class ParentNode { { m_dD = 100.0; + if (m_condition_column_key) + m_table->check_column(m_condition_column_key); if (m_child) m_child->init(will_query_ranges); } @@ -1647,6 +1649,11 @@ class StringNodeBase : public ParentNode { m_dT = 10.0; } + void table_changed() override + { + m_string_interner = m_table.unchecked_ptr()->get_string_interner(m_condition_column_key); + } + void cluster_changed() override { m_leaf.emplace(m_table.unchecked_ptr()->get_alloc()); @@ -1662,6 +1669,7 @@ class StringNodeBase : public ParentNode { m_end_s = 0; m_leaf_start = 0; m_leaf_end = 0; + m_interned_string_id = m_string_interner->lookup(m_value); } virtual void clear_leaf_state() @@ -1673,6 +1681,8 @@ class StringNodeBase : public ParentNode { : ParentNode(from) , m_value(from.m_value) , m_string_value(m_value) + , m_string_interner(from.m_string_interner) + , m_interned_string_id(from.m_interned_string_id) { } @@ -1687,6 +1697,8 @@ class StringNodeBase : public ParentNode { std::optional m_value; std::optional m_leaf; StringData m_string_value; + StringInterner* m_string_interner = nullptr; + std::optional m_interned_string_id; size_t m_end_s = 0; size_t m_leaf_start = 0; @@ -1703,7 +1715,7 @@ template class StringNode : public StringNodeBase { public: constexpr static bool case_sensitive_comparison = - is_any_v; + is_any_v; StringNode(StringData v, ColKey column) : StringNodeBase(v, column) { @@ -1732,8 +1744,21 @@ class StringNode : public StringNodeBase { TConditionFunction cond; for (size_t s = start; s < end; ++s) { + if 
constexpr (std::is_same_v) { + if (m_leaf->is_compressed()) { + if (m_interned_string_id) { + // The search string has been interned, so there might be a match + // We can compare the string IDs directly + const auto id = m_leaf->get_string_id(s); + if (m_string_interner->compare(*m_interned_string_id, *id) == 0) { + // The value matched, so we continue to the next value + continue; + } + } + return s; + } + } StringData t = get_string(s); - if constexpr (case_sensitive_comparison) { // case insensitive not implemented for: >, >=, <, <= if (cond(t, m_string_value)) @@ -2061,20 +2086,24 @@ class StringNode : public StringNodeEqualBase { size_t _find_first_local(size_t start, size_t end) override; }; - -class StringNodeFulltext : public StringNodeEqualBase { +class StringNodeFulltext : public ParentNode { public: StringNodeFulltext(StringData v, ColKey column, std::unique_ptr lm = {}); void table_changed() override; - void _search_index_init() override; + void init(bool will_query_ranges) override; bool has_search_index() const override { return true; // it's a required precondition for fulltext queries } + const IndexEvaluator* index_based_keys() override + { + return &m_index_evaluator; + } + std::unique_ptr clone() const override { return std::unique_ptr(new StringNodeFulltext(*this)); @@ -2086,13 +2115,16 @@ class StringNodeFulltext : public StringNodeEqualBase { } private: - std::vector m_index_matches; + std::string m_value; + ColKey m_col; std::unique_ptr m_link_map; + IndexEvaluator m_index_evaluator; + std::vector m_index_matches; StringNodeFulltext(const StringNodeFulltext&); - size_t _find_first_local(size_t, size_t) override + size_t find_first_local(size_t start, size_t end) override { - REALM_UNREACHABLE(); + return m_index_evaluator.do_search_index(m_cluster, start, end); } }; diff --git a/src/realm/table.cpp b/src/realm/table.cpp index 56c34c999f4..b279d2a5205 100644 --- a/src/realm/table.cpp +++ b/src/realm/table.cpp @@ -1735,9 +1735,25 @@ ObjKey 
Table::find_first(ColKey col_key, T value) const using LeafType = typename ColumnTypeTraits::cluster_leaf_type; LeafType leaf(get_alloc()); - auto f = [&key, &col_key, &value, &leaf](const Cluster* cluster) { + // In case of a string column we can try to look up the StringID of the search string, + // and search for that in case the leaf is compressed. + std::optional string_id; + if constexpr (std::is_same_v) { + auto string_interner = get_string_interner(col_key); + REALM_ASSERT(string_interner != nullptr); + string_id = string_interner->lookup(value); + } + + auto f = [&](const Cluster* cluster) { cluster->init_leaf(col_key, &leaf); - size_t row = leaf.find_first(value, 0, cluster->node_size()); + size_t row; + if constexpr (std::is_same_v) { + row = leaf.find_first(value, 0, cluster->node_size(), string_id); + } + else { + row = leaf.find_first(value, 0, cluster->node_size()); + } + if (row != realm::npos) { key = cluster->get_real_key(row); return IteratorControl::Stop; diff --git a/test/test_query.cpp b/test/test_query.cpp index c2d7215d35f..e8093cbb735 100644 --- a/test/test_query.cpp +++ b/test/test_query.cpp @@ -330,10 +330,13 @@ columns or queries involved */ -TEST(Query_NextGen_StringConditions) +TEST_TYPES(Query_NextGen_StringConditions, std::true_type, std::false_type) { - Group group; - TableRef table1 = group.add_table("table1"); + SHARED_GROUP_TEST_PATH(path); + + auto db = DB::create(make_in_realm_history(), path); + auto wt = db->start_write(); + TableRef table1 = wt->add_table("table1"); auto col_str1 = table1->add_column(type_String, "str1"); auto col_str2 = table1->add_column(type_String, "str2"); @@ -342,6 +345,11 @@ TEST(Query_NextGen_StringConditions) table1->create_object().set_all("!", "x").get_key(); ObjKey key_1_2 = table1->create_object().set_all("bar", "r").get_key(); + if (TEST_TYPE::value) { + wt->commit_and_continue_as_read(); + wt->promote_to_write(); + } + ObjKey m; // Equal m = table1->column(col_str1).equal("bar", false).find(); 
@@ -433,7 +441,7 @@ TEST(Query_NextGen_StringConditions) CHECK_EQUAL(m, null_key); // Test various compare operations with null - TableRef table2 = group.add_table("table2"); + TableRef table2 = wt->add_table("table2"); auto col_str3 = table2->add_column(type_String, "str3", true); ObjKey key_2_0 = table2->create_object().set(col_str3, "foo").get_key(); @@ -442,6 +450,11 @@ TEST(Query_NextGen_StringConditions) ObjKey key_2_3 = table2->create_object().set(col_str3, "bar").get_key(); ObjKey key_2_4 = table2->create_object().set(col_str3, "").get_key(); + if (TEST_TYPE::value) { + wt->commit_and_continue_as_read(); + wt->promote_to_write(); + } + size_t cnt; cnt = table2->column(col_str3).contains(StringData("")).count(); CHECK_EQUAL(cnt, 4); @@ -522,6 +535,12 @@ TEST(Query_NextGen_StringConditions) } }; + // not equal + check_results((table2->column(col_str3) != StringData("")), {StringData(), "foo", "bar", "!"}); + check_results((table2->column(col_str3) != StringData()), {"", "foo", "bar", "!"}); + check_results((table2->column(col_str3) != StringData("foo")), {StringData(), "", "bar", "!"}); + check_results((table2->column(col_str3) != StringData("barr")), {StringData(), "", "foo", "bar", "!"}); + // greater check_results((table2->column(col_str3) > StringData("")), {"foo", "bar", "!"}); check_results((table2->column(col_str3) > StringData("b")), {"foo", "bar"}); @@ -553,7 +572,7 @@ TEST(Query_NextGen_StringConditions) check_results((table2->column(col_str3) <= StringData("barrrr")), {"bar", "", "!", StringData()}); check_results((table2->column(col_str3) <= StringData("z")), {"foo", "bar", "", "!", StringData()}); - TableRef table3 = group.add_table(StringData("table3")); + TableRef table3 = wt->add_table(StringData("table3")); auto col_link1 = table3->add_column(*table2, "link1"); table3->create_object().set(col_link1, key_2_0); @@ -562,6 +581,11 @@ TEST(Query_NextGen_StringConditions) table3->create_object().set(col_link1, key_2_3); 
table3->create_object().set(col_link1, key_2_4); + if (TEST_TYPE::value) { + wt->commit_and_continue_as_read(); + wt->promote_to_write(); + } + cnt = table3->link(col_link1).column(col_str3).contains(StringData("")).count(); CHECK_EQUAL(cnt, 4); @@ -638,8 +662,14 @@ TEST(Query_NextGen_StringConditions) "This is a long search string that does not contain the word being searched for!, " "This is a long search string that does not contain the word being searched for!, " "needle"; + table2->create_object().set(col_str3, long_string).get_key(); + if (TEST_TYPE::value) { + wt->commit_and_continue_as_read(); + wt->promote_to_write(); + } + cnt = table2->column(col_str3).contains(search_1, false).count(); CHECK_EQUAL(cnt, 1); From aec4ca0c6097e7b8a6b3a72e1bd8374c7a5f61df Mon Sep 17 00:00:00 2001 From: nicola cabiddu Date: Tue, 16 Jul 2024 10:25:56 +0100 Subject: [PATCH 08/14] RCORE-2065 use compressed string view for quick comparison if the leaf is compressed (#7880) * use CompressedViewString cmp for compressed leaves * fix asan * code review * fix redudant code --- src/realm/query_engine.hpp | 18 ++++--- src/realm/string_compressor.cpp | 12 ++--- src/realm/string_interner.cpp | 4 +- test/test_query.cpp | 1 - test/test_string_compression.cpp | 85 +++++++++++++++++++++++++++++++- 5 files changed, 101 insertions(+), 19 deletions(-) diff --git a/src/realm/query_engine.hpp b/src/realm/query_engine.hpp index a72a067876c..a2d81a96738 100644 --- a/src/realm/query_engine.hpp +++ b/src/realm/query_engine.hpp @@ -1744,20 +1744,21 @@ class StringNode : public StringNodeBase { TConditionFunction cond; for (size_t s = start; s < end; ++s) { - if constexpr (std::is_same_v) { + + // special handling for !=, <, <= , >, >= if the leaf is compressed and we have got a compressed string + // id. 
+ if constexpr (realm::is_any_v) { if (m_leaf->is_compressed()) { if (m_interned_string_id) { - // The search string has been interned, so there might be a match - // We can compare the string IDs directly const auto id = m_leaf->get_string_id(s); - if (m_string_interner->compare(*m_interned_string_id, *id) == 0) { - // The value matched, so we continue to the next value + if (cond(m_string_interner->compare(*id, *m_interned_string_id), 0)) + return s; + else continue; - } } - return s; } } + StringData t = get_string(s); if constexpr (case_sensitive_comparison) { // case insensitive not implemented for: >, >=, <, <= @@ -1765,8 +1766,9 @@ class StringNode : public StringNodeBase { return s; } else { - if (cond(m_string_value, m_ucase.c_str(), m_lcase.c_str(), t)) + if (cond(m_string_value, m_ucase.c_str(), m_lcase.c_str(), t)) { return s; + } } } return not_found; diff --git a/src/realm/string_compressor.cpp b/src/realm/string_compressor.cpp index 13de772b6a3..b086f076493 100644 --- a/src/realm/string_compressor.cpp +++ b/src/realm/string_compressor.cpp @@ -292,7 +292,7 @@ int StringCompressor::compare(CompressedStringView& A, CompressedStringView& B) // symbols did not match: // 1. both symbols are single characters if (code_A < 256 && code_B < 256) - return code_B - code_A; + return code_A - code_B; std::string a_str(code_A, 1); auto str_A = std::string_view(code_A < 256 ? 
a_str : m_symbols[code_A - 256].expansion); std::string b_str(code_B, 1); @@ -302,17 +302,17 @@ int StringCompressor::compare(CompressedStringView& A, CompressedStringView& B) StringData sd_b(str_B.data(), str_B.size()); REALM_ASSERT_DEBUG(sd_a != sd_b); if (sd_a < sd_b) - return 1; - else return -1; + else + return 1; } // The compressed strings are identical or one is the prefix of the other - return B.size - A.size; + return static_cast(A.size - B.size); // ^ a faster way of producing same positive / negative / zero as: // if (A.size() < B.size()) - // return 1; - // if (A.size() > B.size()) // return -1; + // if (A.size() > B.size()) + // return 1; // return 0; } diff --git a/src/realm/string_interner.cpp b/src/realm/string_interner.cpp index 17dc3663b2d..bff3130901f 100644 --- a/src/realm/string_interner.cpp +++ b/src/realm/string_interner.cpp @@ -642,9 +642,9 @@ int StringInterner::compare(StringData s, StringID A) if (s.data() == nullptr && A == 0) return 0; if (s.data() == nullptr) - return 1; - if (A == 0) return -1; + if (A == 0) + return 1; // ok, no nulls REALM_ASSERT(m_compressor); return m_compressor->compare(s, get_compressed(A)); diff --git a/test/test_query.cpp b/test/test_query.cpp index e8093cbb735..6520085aeb5 100644 --- a/test/test_query.cpp +++ b/test/test_query.cpp @@ -329,7 +329,6 @@ See TEST(StringData_Substrings) for more unit tests for null, isolated to using columns or queries involved */ - TEST_TYPES(Query_NextGen_StringConditions, std::true_type, std::false_type) { SHARED_GROUP_TEST_PATH(path); diff --git a/test/test_string_compression.cpp b/test/test_string_compression.cpp index 83eede427e4..017e81968c5 100644 --- a/test/test_string_compression.cpp +++ b/test/test_string_compression.cpp @@ -100,6 +100,83 @@ TEST(StringInterner_TestLookup) parent.destroy_deep(); } +TEST(StringInterner_VerifyComparison) +{ + Array parent(Allocator::get_default()); + parent.create(NodeHeader::type_HasRefs, false, 1, 0); + StringInterner 
interner(Allocator::get_default(), parent, ColKey(0), true); + + auto null_id = interner.intern({}); + auto test_lower_case_id = interner.intern({"test"}); + auto test_upper_case_id = interner.intern({"TEST"}); + + // check NULL vs empty string + auto res = interner.compare("", null_id); + CHECK_GREATER(StringData(""), StringData()); + CHECK_EQUAL(res, 1); + + // check that NULL filtering actually works + res = interner.compare(test_lower_case_id, null_id); + CHECK_GREATER(interner.get(test_lower_case_id), StringData()); + CHECK_EQUAL(res, 1); + + res = interner.compare(null_id, test_lower_case_id); + CHECK_LESS(StringData(), interner.get(test_lower_case_id)); + CHECK_EQUAL(res, -1); + + //"aaa" < "test" + res = interner.compare({"aaa"}, test_lower_case_id); + CHECK_LESS(StringData("aaa"), interner.get(test_lower_case_id)); + CHECK_EQUAL(res, -1); + + //"zzz" > "test" + res = interner.compare({"zzz"}, test_lower_case_id); + CHECK_GREATER(StringData("zzz"), interner.get(test_lower_case_id)); + CHECK_EQUAL(res, 1); + + //"AAA" < "test" + res = interner.compare({"AAA"}, test_lower_case_id); + CHECK_LESS(StringData("AAA"), interner.get(test_lower_case_id)); + CHECK_EQUAL(res, -1); + + //"ZZZ" < "test" + res = interner.compare({"ZZZ"}, test_lower_case_id); + CHECK_LESS(StringData("ZZZ"), interner.get(test_lower_case_id)); + CHECK_EQUAL(res, -1); + + //"aaa" > "TEST" + res = interner.compare({"aaa"}, test_upper_case_id); + CHECK_GREATER(StringData("aaa"), interner.get(test_upper_case_id)); + CHECK_EQUAL(res, 1); + + //"zzz" > "TEST" + res = interner.compare({"zzz"}, test_upper_case_id); + CHECK_GREATER(StringData("zzz"), interner.get(test_upper_case_id)); + CHECK_EQUAL(res, 1); + + //"AAA" < "TEST" + res = interner.compare({"AAA"}, test_upper_case_id); + CHECK_LESS(StringData("AAA"), interner.get(test_upper_case_id)); + CHECK_EQUAL(res, -1); + + //"ZZZ" > "TEST" + res = interner.compare({"ZZZ"}, test_upper_case_id); + CHECK_GREATER(StringData("ZZZ"), 
interner.get(test_upper_case_id)); + CHECK_EQUAL(res, 1); + + // test > TEST + res = interner.compare(test_lower_case_id, test_upper_case_id); + CHECK_GREATER(interner.get(test_lower_case_id), interner.get(test_upper_case_id)); + CHECK_EQUAL(res, 1); + + // TEST < test + res = interner.compare(test_upper_case_id, test_lower_case_id); + CHECK_LESS(interner.get(test_upper_case_id), interner.get(test_lower_case_id)); + CHECK_EQUAL(res, -1); + + parent.destroy_deep(); +} + TEST(StringInterner_VerifyInterningNull) { Array parent(Allocator::get_default()); @@ -115,13 +192,17 @@ TEST(StringInterner_VerifyInterningNull) // interned string id vs null id auto str_id = interner.intern(StringData("test")); CHECK_EQUAL(interner.compare(str_id, null_id), 1); + CHECK_GREATER(interner.get(str_id), interner.get(null_id)); // compare via StringData // null id vs interned string id CHECK_EQUAL(interner.compare(null_id, str_id), -1); + CHECK_LESS(interner.get(null_id), interner.get(str_id)); // comparison String vs StringID CHECK_EQUAL(interner.compare(StringData{}, null_id), 0); - CHECK_EQUAL(interner.compare(StringData{}, str_id), 1); - CHECK_EQUAL(interner.compare(StringData{"test"}, null_id), -1); + CHECK_EQUAL(interner.compare(StringData{}, str_id), -1); + CHECK_LESS(StringData{}, interner.get(str_id)); // compare via StringData + CHECK_EQUAL(interner.compare(StringData{"test"}, null_id), 1); + CHECK_GREATER(StringData{"test"}, interner.get(null_id)); parent.destroy_deep(); } From 9278ebe5e1fb1286e80d80f887d3f4ee9d33f214 Mon Sep 17 00:00:00 2001 From: nicola cabiddu Date: Thu, 1 Aug 2024 20:46:35 +0200 Subject: [PATCH 09/14] RCORE-2157 Avoid to decompress Strings while sorting them. 
Instead use fast comparison provided by the interner (#7892) * initial test * add logic for fetching stringID to Descriptors (partially working) * fixes core tests * oops I cannot use C++20 :-) * fix test * fix handling for array mixed when it holds strings * fix improper use of mixed when it is not string * Compare only if both string ids are available * works for strings * enable mixed and fix bug in cmp function * code cleanup * remove alias dup * fix cmp function + simplify comparison function for sorting * lint * more readable cmp function * test refactoring, still missing cmp over links + mixed of diff type cmp * tests * code review * more tests for utf8 * code review * code review --- src/realm/array_mixed.cpp | 16 ++ src/realm/array_mixed.hpp | 1 + src/realm/array_string.cpp | 3 +- src/realm/obj.cpp | 30 +++ src/realm/obj.hpp | 5 + src/realm/path.hpp | 4 + src/realm/sort_descriptor.cpp | 71 +++++- src/realm/sort_descriptor.hpp | 19 ++ src/realm/string_compressor.cpp | 15 +- src/realm/string_data.hpp | 5 + src/realm/string_interner.cpp | 4 +- src/realm/string_interner.hpp | 2 - test/benchmark-common-tasks/main.cpp | 69 ++++++ test/test_group.cpp | 4 - test/test_query.cpp | 336 ++++++++++++++++++++++++++- test/test_string_compression.cpp | 19 +- test/test_utf8.cpp | 66 +++++- 17 files changed, 621 insertions(+), 48 deletions(-) diff --git a/src/realm/array_mixed.cpp b/src/realm/array_mixed.cpp index 37870cc300c..af7e0e0174b 100644 --- a/src/realm/array_mixed.cpp +++ b/src/realm/array_mixed.cpp @@ -118,6 +118,22 @@ void ArrayMixed::set_null(size_t ndx) } } +std::optional ArrayMixed::get_string_id(size_t ndx) const +{ + int64_t val = m_composite.get(ndx); + if (val) { + const int64_t int_val = val >> s_data_shift; + const size_t payload_ndx{(size_t)int_val}; + const DataType type((val & s_data_type_mask) - 1); + if (type == type_String) { + ensure_string_array(); + REALM_ASSERT(size_t(int_val) < m_strings.size()); + return m_strings.get_string_id(payload_ndx); +
} + } + return {}; +} + Mixed ArrayMixed::get(size_t ndx) const { int64_t val = m_composite.get(ndx); diff --git a/src/realm/array_mixed.hpp b/src/realm/array_mixed.hpp index 7fc544bc870..95afc264fef 100644 --- a/src/realm/array_mixed.hpp +++ b/src/realm/array_mixed.hpp @@ -97,6 +97,7 @@ class ArrayMixed : public ArrayPayload, private Array { { return m_composite.get(ndx) == 0; } + std::optional get_string_id(size_t ndx) const; void clear(); void erase(size_t ndx); diff --git a/src/realm/array_string.cpp b/src/realm/array_string.cpp index d7277faedf1..7d47862be72 100644 --- a/src/realm/array_string.cpp +++ b/src/realm/array_string.cpp @@ -20,6 +20,7 @@ #include #include #include +#include #include using namespace realm; @@ -192,7 +193,7 @@ StringData ArrayString::get(size_t ndx) const return {}; } -std::optional realm::ArrayString::get_string_id(size_t ndx) const +std::optional ArrayString::get_string_id(size_t ndx) const { if (m_type == Type::interned_strings) { return StringID(static_cast(m_arr)->get(ndx)); diff --git a/src/realm/obj.cpp b/src/realm/obj.cpp index 25027dc3f08..ece4b291d9a 100644 --- a/src/realm/obj.cpp +++ b/src/realm/obj.cpp @@ -629,6 +629,36 @@ BinaryData Obj::_get(ColKey::Idx col_ndx) const return ArrayBinary::get(alloc.translate(ref), m_row_ndx, alloc); } +std::optional Obj::get_string_id(ColKey col_key) const +{ + // we return a string id only if the property is string or mixed. + // And it got compressed. + + // only strings and mixed can have an interner + if (col_key.get_type() != col_type_String && col_key.get_type() != col_type_Mixed) + return {}; + + m_table->check_column(col_key); + _update_if_needed(); + + const auto col_ndx = col_key.get_index(); + const auto interner = m_table->get_string_interner(col_ndx); + ref_type ref = to_ref(Array::get(m_mem.get_addr(), col_ndx.val + 1)); + + if (col_key.get_type() == col_type_Mixed) { + // mixed handling. 
Only strings in mixed may have a string id + ArrayMixed values(get_alloc()); + values.set_string_interner(interner); + values.init_from_ref(ref); + return values.get_string_id(m_row_ndx); + } + // must be string. + ArrayString values(get_alloc()); + values.set_string_interner(interner); + values.init_from_ref(ref); + return values.get_string_id(m_row_ndx); +} + Mixed Obj::get_any(ColKey col_key) const { m_table->check_column(col_key); diff --git a/src/realm/obj.hpp b/src/realm/obj.hpp index cffc1b56a70..bb320f02c9f 100644 --- a/src/realm/obj.hpp +++ b/src/realm/obj.hpp @@ -117,6 +117,11 @@ class Obj { template U get(ColKey col_key) const; + std::optional get_string_id(ColKey) const; + std::optional get_string_id(StringData col_name) const + { + return get_string_id(get_column_key(col_name)); + } Mixed get_any(ColKey col_key) const; Mixed get_any(StringData col_name) const { diff --git a/src/realm/path.hpp b/src/realm/path.hpp index 6124590271c..0dfb83f96cc 100644 --- a/src/realm/path.hpp +++ b/src/realm/path.hpp @@ -256,6 +256,10 @@ class ExtendedColumnKey { ObjKey get_link_target(const Obj& obj) const; Mixed get_value(const Obj& obj) const; + // get String ID for the obj, it makes sense to call this method only if the col_key type is either Mixed or + // String. + std::optional get_string_id(const Obj& obj) const; + private: ColKey m_colkey; PathElement m_index; diff --git a/src/realm/sort_descriptor.cpp b/src/realm/sort_descriptor.cpp index 4d0e97c2bcb..8e1258be048 100644 --- a/src/realm/sort_descriptor.cpp +++ b/src/realm/sort_descriptor.cpp @@ -23,9 +23,51 @@ #include #include #include +#include using namespace realm; +namespace { + +template +int compare(const T& i, const T& j, const Col& col) +{ + Mixed m_i = i.get_value(); + Mixed m_j = j.get_value(); + + // 1. not compressed + if (!i.compressed && !j.compressed) + return m_i.compare(m_j); + + ColKey ck{col.col_key}; + StringInterner* interner = col.table->get_string_interner(ck); + + // 2. 
two compressed strings + if (i.compressed && j.compressed) { + return interner->compare((StringID)m_i.get_int(), (StringID)m_j.get_int()); + } + + // 3. one index is a compressed string, and the other one is mixed. + if (i.compressed || j.compressed) { + if (m_i.is_type(type_String)) + return interner->compare(m_i.get_string(), (StringID)m_j.get_int()); + + if (m_j.is_type(type_String)) + return -interner->compare(m_j.get_string(), (StringID)m_i.get_int()); + } + + // 4. compare string vs any other non-string (since value comparison is triggered only if the type matches, we can + // skip fetching the actual values) + if (i.compressed) + m_i = Mixed{""}; + else + m_j = Mixed{""}; + + return m_i.compare(m_j); +} + +} // namespace + ConstTableRef ExtendedColumnKey::get_target_table(const Table* table) const { return (m_colkey.get_type() == col_type_Link) ? table->get_link_target(m_colkey) : ConstTableRef{}; @@ -85,6 +127,14 @@ Mixed ExtendedColumnKey::get_value(const Obj& obj) const return {}; } +std::optional ExtendedColumnKey::get_string_id(const Obj& obj) const +{ + const auto type = m_colkey.get_type(); + if (type != col_type_String && type != col_type_Mixed) + return {}; + return obj.get_string_id(m_colkey); +} + LinkPathPart::LinkPathPart(ColKey col_key, ConstTableRef source) : column_key(col_key) , from(source->get_key()) @@ -419,9 +469,8 @@ bool BaseDescriptor::Sorter::operator()(IndexPair i, IndexPair j, bool total_ord } int c; - if (t == 0) { - c = i.cached_value.compare(j.cached_value); + c = compare(i, j, m_columns[t]); } else { if (m_cache[t - 1].empty()) { @@ -434,20 +483,25 @@ bool BaseDescriptor::Sorter::operator()(IndexPair i, IndexPair j, bool total_ord const auto& obj = m_columns[t].table->get_object(key_i); const auto& col_key = m_columns[t].col_key; - cache_i.value = col_key.get_value(obj); + // store stringID instead of the actual string if possible cache_i.key = key_i; + const std::optional string_id = col_key.get_string_id(obj); + 
cache_i.compressed = string_id ? true : false; + cache_i.value = cache_i.compressed ? static_cast(*string_id) : col_key.get_value(obj); } - Mixed val_i = cache_i.value; if (cache_j.key != key_j) { const auto& obj = m_columns[t].table->get_object(key_j); const auto& col_key = m_columns[t].col_key; - cache_j.value = col_key.get_value(obj); + // store stringID instead of the actual string if possible cache_j.key = key_j; + const std::optional string_id = col_key.get_string_id(obj); + cache_j.compressed = string_id ? true : false; + cache_j.value = cache_j.compressed ? static_cast(*string_id) : col_key.get_value(obj); } - c = val_i.compare(cache_j.value); + c = compare(cache_i, cache_j, m_columns[t]); } // if c is negative i comes before j if (c) { @@ -476,9 +530,10 @@ void BaseDescriptor::Sorter::cache_first_column(IndexPairs& v) continue; } } - const auto obj = col.table->get_object(key); - index.cached_value = ck.get_value(obj); + const std::optional string_id = ck.get_string_id(obj); + index.compressed = string_id ? true : false; + index.cached_value = index.compressed ? 
static_cast(*string_id) : ck.get_value(obj); } } diff --git a/src/realm/sort_descriptor.hpp b/src/realm/sort_descriptor.hpp index 0224ea5de6b..a3f03390b54 100644 --- a/src/realm/sort_descriptor.hpp +++ b/src/realm/sort_descriptor.hpp @@ -66,9 +66,18 @@ class BaseDescriptor { { return index_in_view < other.index_in_view; } + ObjKey get_key() const + { + return key_for_object; + } + Mixed get_value() const + { + return cached_value; + } ObjKey key_for_object; size_t index_in_view; Mixed cached_value; + bool compressed = false; }; class IndexPairs : public std::vector { public: @@ -115,6 +124,16 @@ class BaseDescriptor { struct ObjCache { ObjKey key; Mixed value; + bool compressed = false; + + ObjKey get_key() const + { + return key; + } + Mixed get_value() const + { + return value; + } }; using TableCache = std::vector; mutable std::vector m_cache; diff --git a/src/realm/string_compressor.cpp b/src/realm/string_compressor.cpp index b086f076493..9f88cc24206 100644 --- a/src/realm/string_compressor.cpp +++ b/src/realm/string_compressor.cpp @@ -290,16 +290,17 @@ int StringCompressor::compare(CompressedStringView& A, CompressedStringView& B) if (code_A == code_B) continue; // symbols did not match: + // 1. both symbols are single characters if (code_A < 256 && code_B < 256) return code_A - code_B; - std::string a_str(code_A, 1); - auto str_A = std::string_view(code_A < 256 ? a_str : m_symbols[code_A - 256].expansion); - std::string b_str(code_B, 1); - auto str_B = std::string_view(code_B < 256 ? b_str : m_symbols[code_B - 256].expansion); - // to ensure comparison as StringData we need to convert the stringviews - StringData sd_a(str_A.data(), str_A.size()); - StringData sd_b(str_B.data(), str_B.size()); + + // 2. all the other possible cases + std::string str_a{(char)code_A, 1}; + std::string str_b{(char)code_B, 1}; + StringData sd_a = code_A < 256 ? str_a : m_symbols[code_A - 256].expansion; + StringData sd_b = code_B < 256 ? 
str_b : m_symbols[code_B - 256].expansion; + REALM_ASSERT_DEBUG(sd_a != sd_b); if (sd_a < sd_b) return -1; diff --git a/src/realm/string_data.hpp b/src/realm/string_data.hpp index 46e1df0713d..63578350e77 100644 --- a/src/realm/string_data.hpp +++ b/src/realm/string_data.hpp @@ -34,6 +34,11 @@ namespace realm { +// Compressed strings have unique IDs, this defines a global alias +// for this. A StringID is an entry inside an array of N compressed strings. +// 0 means null, all the other ids [1, N-1] represent a valid string. +using StringID = size_t; + /// Selects CityHash64 on 64-bit platforms, and Murmur2 on 32-bit platforms. /// This is what libc++ does, and it is a good general choice for a /// non-cryptographic hash function (suitable for std::unordered_map etc.). diff --git a/src/realm/string_interner.cpp b/src/realm/string_interner.cpp index bff3130901f..dd6c27cab2f 100644 --- a/src/realm/string_interner.cpp +++ b/src/realm/string_interner.cpp @@ -401,7 +401,7 @@ StringID StringInterner::intern(StringData sd) { REALM_ASSERT(m_top.is_attached()); std::lock_guard lock(m_mutex); - // special case for null string + // special case for null string if (sd.data() == nullptr) return 0; uint32_t h = (uint32_t)sd.hash(); @@ -619,7 +619,7 @@ std::optional StringInterner::lookup(StringData sd) int StringInterner::compare(StringID A, StringID B) { std::lock_guard lock(m_mutex); - // 0 is null, the first index starts from 1. + // 0 is null, the first index starts from 1. 
REALM_ASSERT_DEBUG(A <= m_decompressed_strings.size()); REALM_ASSERT_DEBUG(B <= m_decompressed_strings.size()); // comparisons against null diff --git a/src/realm/string_interner.hpp b/src/realm/string_interner.hpp index 93c1eec45be..7d53c120349 100644 --- a/src/realm/string_interner.hpp +++ b/src/realm/string_interner.hpp @@ -34,8 +34,6 @@ struct CompressedStringView; namespace realm { -using StringID = size_t; - class StringCompressor; struct CachedString { diff --git a/test/benchmark-common-tasks/main.cpp b/test/benchmark-common-tasks/main.cpp index b8e52fc5d70..2db7982a39d 100644 --- a/test/benchmark-common-tasks/main.cpp +++ b/test/benchmark-common-tasks/main.cpp @@ -1630,6 +1630,70 @@ struct BenchmarkSort : BenchmarkWithStrings { } }; +// benchmark for testing compressed string sorting. +// N is the size of the string to generate +// M is the number of times we want store the string (number of dups) +template +struct BenchmarkSortCompressed : BenchmarkWithStringsTable { + std::string compressed_benchmark_name; + + BenchmarkSortCompressed() + : BenchmarkWithStringsTable() + { + if constexpr (N <= 15) { + compressed_benchmark_name = util::format("SortCompressedSmall(%1,%2)", N, M); + } + else if constexpr (N <= 63) { + compressed_benchmark_name = util::format("SortCompressedMedium(%1,%2)", N, M); + } + else { + compressed_benchmark_name = util::format("SortCompressedLarge(%1,%2)", N, M); + } + } + + const char* name() const + { + return compressed_benchmark_name.c_str(); + } + + void before_all(DBRef group) + { + BenchmarkWithStringsTable::before_all(group); + WriteTransaction tr(group); + TableRef t = tr.get_table(name()); + + auto gen_string = [](size_t length) { + const std::string alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuv" + "wxyz0123456789"; + std::random_device rd; + std::mt19937 generator(rd()); + std::uniform_int_distribution<> distribution(0, (int)alphabet.size() - 1); + + std::string random_str; + for (size_t i = 0; i < length; ++i) 
+ random_str += alphabet[distribution(generator)]; + + return random_str; + }; + + std::string str = ""; + for (size_t i = 0; i < BASE_SIZE; ++i) { + if (i % M == 0) + str = gen_string(N); + + Obj obj = t->create_object(); + obj.set(m_col, str); + } + tr.commit(); + } + + void operator()(DBRef) + { + ConstTableRef table = m_table; + TableView view = table->get_sorted_view(m_col); + } +}; + struct BenchmarkEmptyCommit : Benchmark { const char* name() const { @@ -2663,6 +2727,11 @@ int benchmark_common_tasks_main() BENCH(IterateTableByIteratorIndex); BENCH(BenchmarkSort); + BENCH(BenchmarkSortCompressed<10, 500>); + BENCH(BenchmarkSortCompressed<50, 500>); + BENCH(BenchmarkSortCompressed<100, 500>); + BENCH(BenchmarkSortCompressed<1000, 5000>); + BENCH(BenchmarkSortInt); BENCH(BenchmarkSortIntList); BENCH(BenchmarkSortIntDictionary); diff --git a/test/test_group.cpp b/test/test_group.cpp index 3e58f9c274f..27c94702224 100644 --- a/test/test_group.cpp +++ b/test/test_group.cpp @@ -2623,10 +2623,6 @@ TEST(Test_Commit_Compression_Strings) auto set_s = obj.get_set(col_key_set_string); auto dictionary_s = obj.get_dictionary(col_key_dict_string); - CHECK_EQUAL(list_s.size(), i + 1); - CHECK_EQUAL(set_s.size(), i + 1); - CHECK_EQUAL(dictionary_s.size(), i + 1); - CHECK_EQUAL(list_s.get_any(i), str); CHECK_NOT_EQUAL(set_s.find_any(str), not_found); CHECK_NOT_EQUAL(dictionary_s.find_any(str), not_found); diff --git a/test/test_query.cpp b/test/test_query.cpp index 6520085aeb5..37b109bfaad 100644 --- a/test/test_query.cpp +++ b/test/test_query.cpp @@ -4107,7 +4107,6 @@ TEST(Query_LinkChainSortErrors) CHECK_LOGIC_ERROR(t1->get_sorted_view(SortDescriptor({{t1_linklist_col}})), ErrorCodes::InvalidSortDescriptor); } - TEST(Query_EmptyDescriptors) { Group g; @@ -5773,4 +5772,339 @@ TEST_TYPES(Query_IntCompressed, Equal, NotEqual, Less, LessEqual, Greater, Great } } +// Many of our tests just test the correctness of sorting strings. 
+// For compressed strings we can use the string ids to perform the +// same task. We just need to commit first and then run the query. +// These tests are mainly verifying the following: +// +// 1. Store N strings inside a Mixed. Commit and sort. +// 2. Store inside a Mixed integers and Strings. Commit and sort +// 3. Store N strings in compressed format inside a Mixed property, store another N strings uncompressed in another +// column. Sort using both columns. +// 4. Store N compressed strings inside a table, linked by another table. Sort over links. + +// Note: Strings and Mixed use the same logic for compressing strings. Thus these tests are solely using Mixed +// columns. + + +static int gen_random_int(int min = 1, int max = 100) +{ + std::random_device rd; + std::mt19937 generator(rd()); + std::uniform_int_distribution<> distribution(min, max); + return distribution(generator); +} + +static std::string gen_random_string(size_t length) +{ + const std::string alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuv" + "wxyz0123456789"; + std::random_device rd; + std::mt19937 generator(rd()); + std::uniform_int_distribution<> distribution(0, (int)alphabet.size() - 1); + + std::string random_str; + for (size_t i = 0; i < length; ++i) + random_str += alphabet[distribution(generator)]; + + return random_str; +} + +TEST_TYPES(CompressedStrings_Sort, std::true_type, std::false_type) +{ + constexpr size_t N = 10; + using type = typename TEST_TYPE::type; + SHARED_GROUP_TEST_PATH(path); + std::unique_ptr hist(make_in_realm_history()); + DBRef sg = DB::create(*hist, path, DBOptions(crypt_key())); + TransactionRef rt = sg->start_read(); + + auto commit = [&sg](auto f) { + WriteTransaction wt(sg); + f(wt); + wt.commit(); + }; + + commit([&](auto& wt) { + auto t = wt.add_table("Table"); + t->add_column(type_Mixed, "any"); + }); + rt->advance_read(); + + std::vector strings; + for (size_t i = 0; i < N; ++i) { + const auto size = gen_random_int(); + 
strings.push_back(gen_random_string(size)); + } + + commit([&](auto& wt) { + TableRef t = wt.get_table("Table"); + ColKey col = t->get_column_key("any"); + for (auto& s : strings) { + t->create_object().set(col, Mixed{s}); + } + }); + rt->advance_read(); + + // sort and verify + TableRef table = rt->get_table("Table"); + ColKey col_key = table->get_column_key("any"); + bool ascending = type::value; + + auto cmp = [&ascending](auto& s1, auto& s2) { + Mixed m1{s1}; + Mixed m2{s2}; + return ascending ? m1 < m2 : m1 > m2; + }; + + std::sort(strings.begin(), strings.end(), cmp); + + TableView tv = table->where().find_all(); + tv.sort(col_key, ascending); + CHECK(tv.size() == strings.size()); + for (size_t i = 0; i < N; ++i) { + CHECK_EQUAL(tv[i].get_any(col_key), strings[i]); + } +} + +TEST_TYPES(CompressedStringsAndOtherMixed_Sort, std::true_type, std::false_type) +{ + constexpr size_t N = 10; + using type = typename TEST_TYPE::type; + SHARED_GROUP_TEST_PATH(path); + std::unique_ptr hist(make_in_realm_history()); + DBRef sg = DB::create(*hist, path, DBOptions(crypt_key())); + TransactionRef rt = sg->start_read(); + + auto commit = [&sg](auto f) { + WriteTransaction wt(sg); + f(wt); + wt.commit(); + }; + + commit([&](auto& wt) { + auto t = wt.add_table("Table"); + t->add_column(type_Mixed, "any"); + }); + rt->advance_read(); + + std::vector strings; + std::vector ints; + for (size_t i = 0; i < N; ++i) { + const auto size = gen_random_int(); + ints.push_back(size); + strings.push_back(gen_random_string(size)); + } + + commit([&](auto& wt) { + TableRef t = wt.get_table("Table"); + ColKey col_any = t->get_column_key("any"); + for (auto& s : strings) { + Obj obj = t->create_object(); + obj.set(col_any, Mixed{s}); + } + for (auto i : ints) { + Obj obj = t->create_object(); + obj.set(col_any, Mixed{i}); + } + }); + rt->advance_read(); + + // sort and verify + TableRef table = rt->get_table("Table"); + ColKey col_key = table->get_column_key("any"); + bool ascending = 
type::value; + + auto cmp = [&ascending](Mixed m1, Mixed m2) { + return ascending ? m1 < m2 : m1 > m2; + }; + + std::vector data; + for (auto& str : strings) + data.push_back(Mixed{str}); + for (auto i : ints) + data.push_back(Mixed{i}); + + std::sort(data.begin(), data.end(), cmp); + + TableView tv = table->where().find_all(); + tv.sort(col_key, ascending); + CHECK(tv.size() == data.size()); + for (size_t i = 0; i < N; ++i) { + CHECK_EQUAL(tv[i].get_any(col_key), data[i]); + } +} + +TEST_TYPES(CompressedStrings_CompressedAndUncompressedStringColumns, std::true_type, std::false_type) +{ + constexpr size_t N = 10; + using type = typename TEST_TYPE::type; + SHARED_GROUP_TEST_PATH(path); + std::unique_ptr hist(make_in_realm_history()); + DBRef sg = DB::create(*hist, path, DBOptions(crypt_key())); + TransactionRef rt = sg->start_read(); + + auto commit = [&sg](auto f) { + WriteTransaction wt(sg); + f(wt); + wt.commit(); + }; + + commit([&](auto& wt) { + auto t = wt.add_table("Table"); + t->add_column(type_Mixed, "any_compressed"); + t->add_column(type_Mixed, "any_uncompressed"); + }); + rt->advance_read(); + + std::vector strings; + for (size_t i = 0; i < N; ++i) { + const auto size = gen_random_int(); + strings.push_back(gen_random_string(size)); + } + + commit([&](auto& wt) { + TableRef t = wt.get_table("Table"); + ColKey col = t->get_column_key("any_compressed"); + for (auto& s : strings) { + t->create_object().set(col, Mixed{s}); + } + }); + // any_compressed Mixed is now using compressed strings + + rt->advance_read(); + + commit([&](auto& wt) { + TableRef t = wt.get_table("Table"); + ColKey col_compressed = t->get_column_key("any_compressed"); + ColKey col_uncompressed = t->get_column_key("any_uncompressed"); + + // add N new objects but as long as we don't commit these strings + // will be in uncompressed format + for (auto& s : strings) { + t->create_object().set(col_uncompressed, Mixed{s}); + } + + // sort and verify both columns + bool ascending = 
type::value; + auto cmp = [&ascending](auto& s1, auto& s2) { + Mixed m1{s1}; + Mixed m2{s2}; + return ascending ? m1 < m2 : m1 > m2; + }; + std::sort(strings.begin(), strings.end(), cmp); + + TableView tv = t->where().find_all(); + + tv.sort(SortDescriptor({{col_compressed}, {col_uncompressed}}, {ascending, ascending})); + + CHECK(tv.size() == strings.size() * 2); + for (size_t i = 0; i < 2 * N; ++i) { + auto compressed_str = tv[i].get_any(col_compressed); + auto uncompressed_str = tv[i].get_any(col_uncompressed); + if (!compressed_str.is_null()) { + CHECK_EQUAL(compressed_str, strings[i % N]); + } + if (!uncompressed_str.is_null()) { + CHECK_EQUAL(uncompressed_str, strings[i % N]); + } + } + }); + // after this point both columns will be in compressed format +} + +TEST_TYPES(CompressedStrings_SortOverLinks, std::true_type, std::false_type) +{ + constexpr size_t N = 10; + using type = typename TEST_TYPE::type; + SHARED_GROUP_TEST_PATH(path); + std::unique_ptr hist(make_in_realm_history()); + DBRef sg = DB::create(*hist, path, DBOptions(crypt_key())); + TransactionRef rt = sg->start_read(); + + auto commit = [&sg](auto f) { + WriteTransaction wt(sg); + f(wt); + wt.commit(); + }; + + commit([&](auto& wt) { + auto t = wt.add_table("Table"); + auto o = wt.add_table("Other"); + // store N ints in T.any + t->add_column(type_Mixed, "any"); + // link O to T + t->add_column(*o, "link"); + // store N compressed strings in O.any + o->add_column(type_Mixed, "any"); + }); + rt->advance_read(); + + std::vector ints; + std::vector strings; + for (size_t i = 0; i < N; ++i) { + const auto size = gen_random_int(); + ints.push_back(size); + strings.push_back(gen_random_string(size)); + } + + commit([&](auto& wt) { + // store N strings in O + TableRef table = wt.get_table("Other"); + ColKey col = table->get_column_key("any"); + for (const auto& s : strings) { + table->create_object().set(col, Mixed{s}); + } + }); + + rt->advance_read(); + + commit([&](auto& wt) { + TableRef table = 
wt.get_table("Table"); + TableRef other = wt.get_table("Other"); + + ColKey col = table->get_column_key("any"); + ColKey link = table->get_column_key("link"); + + // set N ints + for (auto i : ints) + table->create_object().set(col, Mixed{i}); + + // set N links to Other's objects storing compressed strings. + size_t i = 0; + for (Obj o : *other) + table->get_object(i++).set(link, o.get_key()); + }); + // any Mixed that contains strings is now pointing to a compressed string + rt->advance_read(); + + std::vector> data; + for (size_t i = 0; i < N; ++i) { + auto p = std::make_pair(Mixed{ints[i]}, Mixed{strings[i]}); + data.push_back(p); + } + + bool ascending = type::value; + auto cmp = [&ascending](auto& p1, auto& p2) { + // sort based on strings + Mixed m1 = p1.second; + Mixed m2 = p2.second; + return ascending ? m1 < m2 : m1 > m2; + }; + std::sort(data.begin(), data.end(), cmp); + + TableRef table = rt->get_table("Table"); + TableRef other = rt->get_table("Other"); + ColKey t_any = table->get_column_key("any"); + ColKey link = table->get_column_key("link"); + ColKey o_any = other->get_column_key("any"); + + TableView tv = table->where().find_all(); + tv.sort(SortDescriptor({{link, o_any}}, {ascending})); + CHECK(tv.size() == data.size()); + for (size_t i = 0; i < N; ++i) { + CHECK_EQUAL(tv[i].get_any(t_any), data[i].first); + } +} + #endif // TEST_QUERY diff --git a/test/test_string_compression.cpp b/test/test_string_compression.cpp index 017e81968c5..cf104445b6c 100644 --- a/test/test_string_compression.cpp +++ b/test/test_string_compression.cpp @@ -35,6 +35,7 @@ using namespace realm; TEST(StringInterner_Basic_Creation) { Array parent(Allocator::get_default()); + _impl::DeepArrayDestroyGuard dg(&parent); parent.create(NodeHeader::type_HasRefs, false, 1, 0); StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); StringData my_string = "aaaaaaaaaaaaaaa"; @@ -50,12 +51,12 @@ TEST(StringInterner_Basic_Creation) CHECK_EQUAL(my_string, 
origin_string); CHECK(interner.compare(*stored_id, id) == 0); // compare agaist self. - parent.destroy_deep(); } TEST(StringInterner_InternMultipleStrings) { Array parent(Allocator::get_default()); + _impl::DeepArrayDestroyGuard dg(&parent); parent.create(NodeHeader::type_HasRefs, false, 1, 0); StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); @@ -72,12 +73,12 @@ TEST(StringInterner_InternMultipleStrings) CHECK_EQUAL(*stored_id, id); CHECK_EQUAL(interner.compare(str, id), 0); } - parent.destroy_deep(); } TEST(StringInterner_TestLookup) { Array parent(Allocator::get_default()); + _impl::DeepArrayDestroyGuard dg(&parent); parent.create(NodeHeader::type_HasRefs, false, 1, 0); StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); @@ -96,13 +97,12 @@ TEST(StringInterner_TestLookup) CHECK(id); CHECK(interner.compare(StringData(s), *id) == 0); } - - parent.destroy_deep(); } TEST(StringInterner_VerifyComparison) { Array parent(Allocator::get_default()); + _impl::DeepArrayDestroyGuard dg(&parent); parent.create(NodeHeader::type_HasRefs, false, 1, 0); StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); @@ -173,13 +173,12 @@ TEST(StringInterner_VerifyComparison) res = interner.compare(test_upper_case_id, test_lower_case_id); CHECK_LESS(interner.get(test_upper_case_id), interner.get(test_lower_case_id)); CHECK_EQUAL(res, -1); - - parent.destroy_deep(); } TEST(StringInterner_VerifyInterningNull) { Array parent(Allocator::get_default()); + _impl::DeepArrayDestroyGuard dg(&parent); parent.create(NodeHeader::type_HasRefs, false, 1, 0); StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); auto null_id = interner.intern({}); @@ -203,13 +202,12 @@ TEST(StringInterner_VerifyInterningNull) CHECK_LESS(StringData{}, interner.get(str_id)); // compare via StringData CHECK_EQUAL(interner.compare(StringData{"test"}, null_id), 1); CHECK_GREATER(StringData{"test"}, interner.get(null_id)); - - 
parent.destroy_deep(); } TEST(StringInterner_VerifyLongString) { Array parent(Allocator::get_default()); + _impl::DeepArrayDestroyGuard dg(&parent); parent.create(NodeHeader::type_HasRefs, false, 1, 0); StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); @@ -221,13 +219,12 @@ TEST(StringInterner_VerifyLongString) const auto stored_id = interner.lookup(StringData(long_string)); CHECK_EQUAL(stored_id, 1); CHECK(interner.compare(StringData(long_string), *stored_id) == 0); - - parent.destroy_deep(); } TEST(StringInterner_VerifyExpansionFromSmallStringToLongString) { Array parent(Allocator::get_default()); + _impl::DeepArrayDestroyGuard dg(&parent); parent.create(NodeHeader::type_HasRefs, false, 1, 0); StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); @@ -249,6 +246,4 @@ TEST(StringInterner_VerifyExpansionFromSmallStringToLongString) stored_id = interner.lookup(StringData(long_string)); CHECK_EQUAL(stored_id, id); CHECK(interner.compare(StringData(long_string), *stored_id) == 0); - - parent.destroy_deep(); } diff --git a/test/test_utf8.cpp b/test/test_utf8.cpp index ec4c913eafd..de456b2d69e 100644 --- a/test/test_utf8.cpp +++ b/test/test_utf8.cpp @@ -24,10 +24,13 @@ #include #include #include +#include #include -#include #include +#include +#include +#include #include "test.hpp" @@ -86,10 +89,22 @@ const char* u16sur2 = "\xF0\xA0\x9C\xB1"; // same as above, with larger unicode TEST(UTF8_Compare_Strings) { + Array parent(Allocator::get_default()); + _impl::DeepArrayDestroyGuard dg(&parent); + + parent.create(NodeHeader::type_HasRefs, false, 1, 0); + StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); + // Useful line for creating new unit test cases: // bool ret = std::locale("us_EN")(string("a"), std::string("b")); - auto str_compare = [](StringData a, StringData b) { - return a < b; + + auto str_compare = [&interner, this](StringData a, StringData b) { + auto id1 = interner.intern(a); + auto id2 
= interner.intern(b); + const auto ret_interner_cmp = interner.compare(id1, id2) < 0; + const auto ret_cmp = a < b; + CHECK_EQUAL(ret_cmp, ret_interner_cmp); + return ret_cmp; }; // simplest test @@ -141,9 +156,21 @@ TEST(UTF8_Compare_Strings) TEST(UTF8_Compare_Core_utf8) { - auto str_compare = [](StringData a, StringData b) { - return a < b; + Array parent(Allocator::get_default()); + _impl::DeepArrayDestroyGuard dg(&parent); + + parent.create(NodeHeader::type_HasRefs, false, 1, 0); + StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); + + auto str_compare = [&interner, this](StringData a, StringData b) { + auto id1 = interner.intern(a); + auto id2 = interner.intern(b); + const auto ret_interner_cmp = interner.compare(id1, id2) < 0; + const auto ret_cmp = a < b; + CHECK_EQUAL(ret_cmp, ret_interner_cmp); + return ret_cmp; }; + // single utf16 code points (tests mostly Windows) CHECK_EQUAL(false, str_compare(uae, uae)); CHECK_EQUAL(false, str_compare(uAE, uAE)); @@ -169,7 +196,6 @@ TEST(UTF8_Compare_Core_utf8) CHECK_EQUAL(false, str_compare(u16sur2, u16sur2)); } - TEST(UTF8_Compare_Core_utf8_invalid) { // Test that invalid utf8 won't make decisions on data beyond Realm payload. Do @@ -194,8 +220,17 @@ TEST(UTF8_Compare_Core_utf8_invalid) // that return value is arbitrary for invalid utf8 bool ret = i1 < i2; CHECK_EQUAL(ret, i2 < i1); // must sort the same as before regardless of succeeding data -} + // the same applies if the strings are interned. 
+ Array parent(Allocator::get_default()); + _impl::DeepArrayDestroyGuard dg(&parent); + parent.create(NodeHeader::type_HasRefs, false, 1, 0); + StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); + auto id1 = interner.intern(invalid1); + auto id2 = interner.intern(invalid2); + bool ret_interned = interner.compare(id1, id2) < 0; + CHECK_EQUAL(ret_interned, ret); +} TEST(Compare_Core_utf8_invalid_crash) { @@ -218,12 +253,22 @@ TEST(Compare_Core_utf8_invalid_crash) } } - TEST(UTF8_Compare_Core_utf8_zero) { - auto str_compare = [](StringData a, StringData b) { - return a < b; + Array parent(Allocator::get_default()); + _impl::DeepArrayDestroyGuard dg(&parent); + parent.create(NodeHeader::type_HasRefs, false, 1, 0); + StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); + + auto str_compare = [&interner, this](StringData a, StringData b) { + auto id1 = interner.intern(a); + auto id2 = interner.intern(b); + const auto ret_interner_cmp = interner.compare(id1, id2) < 0; + const auto ret_cmp = a < b; + CHECK_EQUAL(ret_cmp, ret_interner_cmp); + return ret_cmp; }; + // Realm must support 0 characters in utf8 strings CHECK_EQUAL(false, str_compare(StringData("\0", 1), StringData("\0", 1))); CHECK_EQUAL(true, str_compare(StringData("\0", 1), StringData("a"))); @@ -238,7 +283,6 @@ TEST(UTF8_Compare_Core_utf8_zero) CHECK_EQUAL(true, str_compare(StringData("a\0", 2), StringData("a\0\0", 3))); CHECK_EQUAL(false, str_compare(StringData("a\0\0", 3), StringData("a\0", 2))); } - } // anonymous namespace #endif // TEST_UTF8 From 7b6159eaf06d46d745073c3db307f58f6b1191de Mon Sep 17 00:00:00 2001 From: Finn Schiermer Andersen Date: Tue, 6 Aug 2024 14:37:15 +0200 Subject: [PATCH 10/14] reduce locking for StringInterner lookup and compare methods --- src/realm/string_interner.cpp | 53 ++++++++++++++++++++++------------- src/realm/string_interner.hpp | 2 +- 2 files changed, 35 insertions(+), 20 deletions(-) diff --git 
a/src/realm/string_interner.cpp b/src/realm/string_interner.cpp index dd6c27cab2f..9f2811c98fc 100644 --- a/src/realm/string_interner.cpp +++ b/src/realm/string_interner.cpp @@ -258,14 +258,20 @@ static std::vector hash_to_id(Array& node, uint32_t hash, uint8_t hash enum positions { Pos_Version, Pos_ColKey, Pos_Size, Pos_Compressor, Pos_Data, Pos_Map, Top_Size }; struct StringInterner::DataLeaf { - std::vector m_compressed; ref_type m_leaf_ref = 0; - bool m_is_loaded = false; + std::vector m_compressed; + std::atomic m_is_loaded = false; DataLeaf() {} DataLeaf(ref_type ref) : m_leaf_ref(ref) { } + DataLeaf(const DataLeaf&& other) + : m_leaf_ref(other.m_leaf_ref) + , m_compressed(other.m_compressed) + , m_is_loaded(other.m_is_loaded.load(std::memory_order_acquire)) + { + } }; StringInterner::StringInterner(Allocator& alloc, Array& parent, ColKey col_key, bool writable) @@ -359,7 +365,7 @@ void StringInterner::update_from_parent(bool writable) void StringInterner::rebuild_internal() { - std::lock_guard lock(m_mutex); + // std::lock_guard lock(m_mutex); // release old decompressed strings for (size_t idx = 0; idx < m_in_memory_strings.size(); ++idx) { StringID id = m_in_memory_strings[idx]; @@ -400,7 +406,7 @@ StringInterner::~StringInterner() {} StringID StringInterner::intern(StringData sd) { REALM_ASSERT(m_top.is_attached()); - std::lock_guard lock(m_mutex); + // std::lock_guard lock(m_mutex); // special case for null string if (sd.data() == nullptr) return 0; @@ -527,7 +533,7 @@ StringID StringInterner::intern(StringData sd) bool StringInterner::load_leaf_if_needed(DataLeaf& leaf) { - if (!leaf.m_is_loaded) { + if (!leaf.m_is_loaded.load(std::memory_order_relaxed)) { // start with an empty leaf: leaf.m_compressed.clear(); leaf.m_compressed.reserve(256); @@ -568,7 +574,7 @@ bool StringInterner::load_leaf_if_needed(DataLeaf& leaf) leaf.m_compressed.push_back({c, str_array.size()}); } } - leaf.m_is_loaded = true; + leaf.m_is_loaded.store(true, 
std::memory_order_release); return true; } return false; @@ -585,14 +591,23 @@ bool StringInterner::load_leaf_if_new_ref(DataLeaf& leaf, ref_type new_ref) return load_leaf_if_needed(leaf); } -CompressedStringView& StringInterner::get_compressed(StringID id) +CompressedStringView& StringInterner::get_compressed(StringID id, bool lock_if_mutating) { auto index = id - 1; // 0 represents null auto hi = index >> 8; auto lo = index & 0xFFUL; DataLeaf& leaf = m_compressed_leafs[hi]; - load_leaf_if_needed(leaf); + if (leaf.m_is_loaded.load(std::memory_order_acquire)) { + return leaf.m_compressed[lo]; + } + if (lock_if_mutating) { + std::lock_guard lock(m_mutex); + load_leaf_if_needed(leaf); + } + else { + load_leaf_if_needed(leaf); + } REALM_ASSERT_DEBUG(lo < leaf.m_compressed.size()); return leaf.m_compressed[lo]; } @@ -603,13 +618,13 @@ std::optional StringInterner::lookup(StringData sd) // "dead" mode return {}; } - std::lock_guard lock(m_mutex); + // std::lock_guard lock(m_mutex); if (sd.data() == nullptr) return 0; uint32_t h = (uint32_t)sd.hash(); auto candidates = hash_to_id(m_hash_map, h, 32); for (auto& candidate : candidates) { - auto candidate_cpr = get_compressed(candidate); + auto candidate_cpr = get_compressed(candidate, true); if (m_compressor->compare(sd, candidate_cpr) == 0) return candidate; } @@ -618,10 +633,6 @@ std::optional StringInterner::lookup(StringData sd) int StringInterner::compare(StringID A, StringID B) { - std::lock_guard lock(m_mutex); - // 0 is null, the first index starts from 1. - REALM_ASSERT_DEBUG(A <= m_decompressed_strings.size()); - REALM_ASSERT_DEBUG(B <= m_decompressed_strings.size()); // comparisons against null if (A == B && A == 0) return 0; @@ -630,14 +641,16 @@ int StringInterner::compare(StringID A, StringID B) if (B == 0) return 1; // ok, no nulls. + // std::lock_guard lock(m_mutex); + // 0 is null, the first index starts from 1. 
+ REALM_ASSERT_DEBUG(A <= m_decompressed_strings.size()); + REALM_ASSERT_DEBUG(B <= m_decompressed_strings.size()); REALM_ASSERT(m_compressor); - return m_compressor->compare(get_compressed(A), get_compressed(B)); + return m_compressor->compare(get_compressed(A, true), get_compressed(B, true)); } int StringInterner::compare(StringData s, StringID A) { - std::lock_guard lock(m_mutex); - REALM_ASSERT_DEBUG(A <= m_decompressed_strings.size()); // comparisons against null if (s.data() == nullptr && A == 0) return 0; @@ -646,19 +659,21 @@ int StringInterner::compare(StringData s, StringID A) if (A == 0) return 1; // ok, no nulls + // std::lock_guard lock(m_mutex); + REALM_ASSERT_DEBUG(A <= m_decompressed_strings.size()); REALM_ASSERT(m_compressor); - return m_compressor->compare(s, get_compressed(A)); + return m_compressor->compare(s, get_compressed(A, true)); } StringData StringInterner::get(StringID id) { REALM_ASSERT(m_compressor); - std::lock_guard lock(m_mutex); if (id == 0) return StringData{nullptr}; REALM_ASSERT_DEBUG(id <= m_decompressed_strings.size()); CachedString& cs = m_decompressed_strings[id - 1]; + std::lock_guard lock(m_mutex); if (cs.m_decompressed) { if (cs.m_weight < 128) cs.m_weight += 64; diff --git a/src/realm/string_interner.hpp b/src/realm/string_interner.hpp index 7d53c120349..f0aac0d317c 100644 --- a/src/realm/string_interner.hpp +++ b/src/realm/string_interner.hpp @@ -74,7 +74,7 @@ class StringInterner { // when ever we meet a string too large to be placed inline. 
Array m_current_long_string_node; void rebuild_internal(); - CompressedStringView& get_compressed(StringID id); + CompressedStringView& get_compressed(StringID id, bool lock_if_mutating = false); // return true if the leaf was reloaded bool load_leaf_if_needed(DataLeaf& leaf); // return 'true' if the new ref was different and forced a reload From f9d1021565f76f1172f9078aa95c2b860795b1cc Mon Sep 17 00:00:00 2001 From: Finn Schiermer Andersen Date: Tue, 6 Aug 2024 16:24:39 +0200 Subject: [PATCH 11/14] reduced locking in StringInterner::get() --- src/realm/string_interner.cpp | 18 +++++++++++------- src/realm/string_interner.hpp | 13 ++++++++++++- 2 files changed, 23 insertions(+), 8 deletions(-) diff --git a/src/realm/string_interner.cpp b/src/realm/string_interner.cpp index 9f2811c98fc..9c22b2b1a5a 100644 --- a/src/realm/string_interner.cpp +++ b/src/realm/string_interner.cpp @@ -375,7 +375,7 @@ void StringInterner::rebuild_internal() continue; } if (auto& w = m_decompressed_strings[id - 1].m_weight) { - w >>= 1; + w = w >> 1; } else { m_decompressed_strings[id - 1].m_decompressed.reset(); @@ -420,7 +420,7 @@ StringID StringInterner::intern(StringData sd) // it's a new string bool learn = true; auto c_str = m_compressor->compress(sd, learn); - m_decompressed_strings.push_back({64, std::make_unique(sd)}); + m_decompressed_strings.emplace_back(64, std::make_unique(sd)); auto id = m_decompressed_strings.size(); m_in_memory_strings.push_back(id); add_to_hash_map(m_hash_map, h, id, 32); @@ -673,15 +673,19 @@ StringData StringInterner::get(StringID id) return StringData{nullptr}; REALM_ASSERT_DEBUG(id <= m_decompressed_strings.size()); CachedString& cs = m_decompressed_strings[id - 1]; - std::lock_guard lock(m_mutex); - if (cs.m_decompressed) { - if (cs.m_weight < 128) - cs.m_weight += 64; + if (auto weight = cs.m_weight.load(std::memory_order_acquire)) { + REALM_ASSERT_DEBUG(cs.m_decompressed); + if (weight < 128) { + // ignore if this fails - that happens if some other 
thread bumps the value + // concurrently. And if so, we can live with losing our own "bump" + cs.m_weight.compare_exchange_strong(weight, weight + 64, std::memory_order_acq_rel); + } return {cs.m_decompressed->c_str(), cs.m_decompressed->size()}; } - cs.m_weight = 64; + std::lock_guard lock(m_mutex); cs.m_decompressed = std::make_unique(m_compressor->decompress(get_compressed(id))); m_in_memory_strings.push_back(id); + cs.m_weight.store(64, std::memory_order_release); return {cs.m_decompressed->c_str(), cs.m_decompressed->size()}; } diff --git a/src/realm/string_interner.hpp b/src/realm/string_interner.hpp index f0aac0d317c..8b1ed278d05 100644 --- a/src/realm/string_interner.hpp +++ b/src/realm/string_interner.hpp @@ -37,8 +37,19 @@ namespace realm { class StringCompressor; struct CachedString { - uint8_t m_weight = 0; + std::atomic m_weight = 0; std::unique_ptr m_decompressed; + CachedString() {} + CachedString(CachedString&& other) + { + m_decompressed = std::move(other.m_decompressed); + m_weight.store(other.m_weight.load(std::memory_order_relaxed), std::memory_order_relaxed); + } + CachedString(uint8_t init_weight, std::unique_ptr&& ptr) + { + m_decompressed = std::move(ptr); + m_weight.store(init_weight, std::memory_order_relaxed); + } }; class StringInterner { From 8325c81882db30f766d2f939ba912c40a413ff76 Mon Sep 17 00:00:00 2001 From: Finn Schiermer Andersen Date: Thu, 8 Aug 2024 11:48:59 +0200 Subject: [PATCH 12/14] minor fixes and comments --- src/realm/string_interner.cpp | 13 ++++++------- src/realm/string_interner.hpp | 17 ++++++++++++++--- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/src/realm/string_interner.cpp b/src/realm/string_interner.cpp index 9c22b2b1a5a..8b93d09fe4d 100644 --- a/src/realm/string_interner.cpp +++ b/src/realm/string_interner.cpp @@ -365,7 +365,6 @@ void StringInterner::update_from_parent(bool writable) void StringInterner::rebuild_internal() { - // std::lock_guard lock(m_mutex); // release old decompressed
strings for (size_t idx = 0; idx < m_in_memory_strings.size(); ++idx) { StringID id = m_in_memory_strings[idx]; @@ -406,7 +405,6 @@ StringInterner::~StringInterner() {} StringID StringInterner::intern(StringData sd) { REALM_ASSERT(m_top.is_attached()); - // std::lock_guard lock(m_mutex); // special case for null string if (sd.data() == nullptr) return 0; @@ -428,7 +426,6 @@ StringID StringInterner::intern(StringData sd) REALM_ASSERT_DEBUG(index == id - 1); bool need_long_string_node = c_str.size() >= 65536; - // TODO: update_internal must set up m_current_long_string_node if it is in use if (need_long_string_node && !m_current_long_string_node.is_attached()) { m_current_long_string_node.create(NodeHeader::type_HasRefs); @@ -597,6 +594,8 @@ CompressedStringView& StringInterner::get_compressed(StringID id, bool lock_if_m auto hi = index >> 8; auto lo = index & 0xFFUL; + // This is an instance of the "double checked locking" idiom, chosen to minimize + // locking in the common case of the leaf already being fully initialized. DataLeaf& leaf = m_compressed_leafs[hi]; if (leaf.m_is_loaded.load(std::memory_order_acquire)) { return leaf.m_compressed[lo]; @@ -618,7 +617,6 @@ std::optional StringInterner::lookup(StringData sd) // "dead" mode return {}; } - // std::lock_guard lock(m_mutex); if (sd.data() == nullptr) return 0; uint32_t h = (uint32_t)sd.hash(); @@ -641,8 +639,7 @@ int StringInterner::compare(StringID A, StringID B) if (B == 0) return 1; // ok, no nulls. - // std::lock_guard lock(m_mutex); - // 0 is null, the first index starts from 1. + // StringID 0 is null, the first true index starts from 1. 
REALM_ASSERT_DEBUG(A <= m_decompressed_strings.size()); REALM_ASSERT_DEBUG(B <= m_decompressed_strings.size()); REALM_ASSERT(m_compressor); @@ -659,7 +656,6 @@ int StringInterner::compare(StringData s, StringID A) if (A == 0) return 1; // ok, no nulls - // std::lock_guard lock(m_mutex); REALM_ASSERT_DEBUG(A <= m_decompressed_strings.size()); REALM_ASSERT(m_compressor); return m_compressor->compare(s, get_compressed(A, true)); @@ -672,6 +668,9 @@ StringData StringInterner::get(StringID id) if (id == 0) return StringData{nullptr}; REALM_ASSERT_DEBUG(id <= m_decompressed_strings.size()); + + // Avoid taking a lock in the (presumably) common case, where the leaf with the compressed + // strings has already been loaded. Such leafs have "m_weight" > 0. CachedString& cs = m_decompressed_strings[id - 1]; if (auto weight = cs.m_weight.load(std::memory_order_acquire)) { REALM_ASSERT_DEBUG(cs.m_decompressed); diff --git a/src/realm/string_interner.hpp b/src/realm/string_interner.hpp index 8b1ed278d05..dbc984ede55 100644 --- a/src/realm/string_interner.hpp +++ b/src/realm/string_interner.hpp @@ -54,13 +54,23 @@ struct CachedString { class StringInterner { public: - // To be used exclusively from Table + // Use of the StringInterner must honour the restrictions on concurrency given + // below. Currently this is ensured by only using concurrent access on frozen + // objects. + // + // Limitations wrt concurrency: + // + // To be used exclusively from Table and in a non-concurrent setting: StringInterner(Allocator& alloc, Array& parent, ColKey col_key, bool writable); void update_from_parent(bool writable); ~StringInterner(); - // To be used from Obj and for searching + // To be used from Obj within a write transaction or during commit. + // To be used only in a non-concurrent setting: StringID intern(StringData); + + // The following four methods can be used in a concurrent setting with each other, + // but not concurrently with any of the above methods.
std::optional lookup(StringData); int compare(StringID A, StringID B); int compare(StringData, StringID A); @@ -98,7 +108,8 @@ class StringInterner { std::vector m_decompressed_strings; std::vector m_in_memory_strings; // Mutual exclusion is needed for frozen transactions only. Live objects are - // only used in single threaded contexts so don't need them. For now, just use always. + // only used in single threaded contexts so don't need them. For now, we don't + // distinguish, assuming that locking is sufficiently low in both scenarios. std::mutex m_mutex; }; } // namespace realm From ddc557e4dc3edbeb4868b9521a1dff8f64891946 Mon Sep 17 00:00:00 2001 From: Finn Schiermer Andersen Date: Thu, 8 Aug 2024 12:22:49 +0200 Subject: [PATCH 13/14] optimize memory ordering a bit --- src/realm/string_interner.cpp | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/realm/string_interner.cpp b/src/realm/string_interner.cpp index 8b93d09fe4d..47d21e506d8 100644 --- a/src/realm/string_interner.cpp +++ b/src/realm/string_interner.cpp @@ -374,7 +374,9 @@ void StringInterner::rebuild_internal() continue; } if (auto& w = m_decompressed_strings[id - 1].m_weight) { - w = w >> 1; + auto val = w.load(std::memory_order_acquire); + val = val >> 1; + w.store(val, std::memory_order_release); } else { m_decompressed_strings[id - 1].m_decompressed.reset(); @@ -394,7 +396,7 @@ void StringInterner::rebuild_internal() for (size_t idx = 0; idx < m_compressed_leafs.size(); ++idx) { auto ref = m_data.get_as_ref(idx); auto& leaf_meta = m_compressed_leafs[idx]; - leaf_meta.m_is_loaded = false; + leaf_meta.m_is_loaded.store(false, std::memory_order_release); leaf_meta.m_compressed.clear(); leaf_meta.m_leaf_ref = ref; } @@ -461,7 +463,7 @@ StringID StringInterner::intern(StringData sd) } m_current_string_leaf.destroy(); // force later reload of leaf - m_compressed_leafs.back().m_is_loaded = false; + m_compressed_leafs.back().m_is_loaded.store(false, std::memory_order_release); 
} } if (m_current_long_string_node.is_attached()) { From 9fe448f29447a611490e260194d5d48874eba001 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B8rgen=20Edelbo?= Date: Wed, 21 Aug 2024 13:07:26 +0200 Subject: [PATCH 14/14] Small fix to Table::typed_write When writing the realm to a new file from a write transaction, the Table may be COW so that the top ref is changed. So don't use the ref that is present in the group when the operation starts. --- src/realm/group.cpp | 12 +++++------- src/realm/table.cpp | 4 ++-- src/realm/table.hpp | 2 +- 3 files changed, 8 insertions(+), 10 deletions(-) diff --git a/src/realm/group.cpp b/src/realm/group.cpp index 960e07d3d61..7064b511d2c 100644 --- a/src/realm/group.cpp +++ b/src/realm/group.cpp @@ -943,19 +943,17 @@ ref_type Group::typed_write_tables(_impl::ArrayWriterBase& out) const ref_type ref = m_top.get_as_ref(1); if (out.only_modified && m_alloc.is_read_only(ref)) return ref; - Array a(m_alloc); - a.init_from_ref(ref); - REALM_ASSERT_DEBUG(a.has_refs()); - TempArray dest(a.size()); - for (unsigned j = 0; j < a.size(); ++j) { - RefOrTagged rot = a.get_as_ref_or_tagged(j); + auto num_tables = m_tables.size(); + TempArray dest(num_tables); + for (unsigned j = 0; j < num_tables; ++j) { + RefOrTagged rot = m_tables.get_as_ref_or_tagged(j); if (rot.is_tagged()) { dest.set(j, rot); } else { auto table = do_get_table(j); REALM_ASSERT_DEBUG(table); - dest.set_as_ref(j, table->typed_write(rot.get_as_ref(), out)); + dest.set_as_ref(j, table->typed_write(out)); } } return dest.write(out); diff --git a/src/realm/table.cpp b/src/realm/table.cpp index acbb234fb0b..6c66c4feda2 100644 --- a/src/realm/table.cpp +++ b/src/realm/table.cpp @@ -3456,9 +3456,9 @@ ColKey Table::find_opposite_column(ColKey col_key) const return ColKey(); } -ref_type Table::typed_write(ref_type ref, _impl::ArrayWriterBase& out) const +ref_type Table::typed_write(_impl::ArrayWriterBase& out) const { - REALM_ASSERT(ref == m_top.get_mem().get_ref()); + auto ref = 
m_top.get_ref(); if (out.only_modified && m_alloc.is_read_only(ref)) return ref; out.table = this; diff --git a/src/realm/table.hpp b/src/realm/table.hpp index d2aaea6038a..86a78901cd8 100644 --- a/src/realm/table.hpp +++ b/src/realm/table.hpp @@ -687,7 +687,7 @@ class Table { Replication* const* m_repl; }; - ref_type typed_write(ref_type ref, _impl::ArrayWriterBase& out) const; + ref_type typed_write(_impl::ArrayWriterBase& out) const; private: enum LifeCycleCookie {