diff --git a/CHANGELOG.md b/CHANGELOG.md index 162c7983f86..4576f00bc43 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -17,7 +17,7 @@ ----------- ### Internals -* None. +* Ability to enumerate a string column has been removed. ---------------------------------------------- diff --git a/src/realm/CMakeLists.txt b/src/realm/CMakeLists.txt index 18583f3549a..5a67cdabc15 100644 --- a/src/realm/CMakeLists.txt +++ b/src/realm/CMakeLists.txt @@ -62,6 +62,8 @@ set(REALM_SOURCES table.cpp table_ref.cpp obj_list.cpp + string_interner.cpp + string_compressor.cpp object_id.cpp table_view.cpp tokenizer.cpp @@ -178,6 +180,8 @@ set(REALM_INSTALL_HEADERS null.hpp obj.hpp obj_list.hpp + string_interner.hpp + string_compressor.hpp object_id.hpp path.hpp owned_data.hpp diff --git a/src/realm/array.cpp b/src/realm/array.cpp index a2f61e4491c..b942413a8d6 100644 --- a/src/realm/array.cpp +++ b/src/realm/array.cpp @@ -294,7 +294,7 @@ void Array::set_type(Type type) set_hasrefs_in_header(init_has_refs, header); } -void Array::destroy_children(size_t offset) noexcept +void Array::destroy_children(size_t offset, bool ro_only) noexcept { for (size_t i = offset; i != m_size; ++i) { int64_t value = get(i); @@ -310,22 +310,10 @@ void Array::destroy_children(size_t offset) noexcept continue; ref_type ref = to_ref(value); - destroy_deep(ref, m_alloc); + destroy_deep(ref, m_alloc, ro_only); } } -// size_t Array::get_byte_size() const noexcept -//{ -// const auto header = get_header(); -// auto num_bytes = get_byte_size_from_header(header); -// auto read_only = m_alloc.is_read_only(m_ref) == true; -// auto capacity = get_capacity_from_header(header); -// auto bytes_ok = num_bytes <= capacity; -// REALM_ASSERT(read_only || bytes_ok); -// REALM_ASSERT_7(m_alloc.is_read_only(m_ref), ==, true, ||, num_bytes, <=, get_capacity_from_header(header)); -// return num_bytes; -// } - ref_type Array::do_write_shallow(_impl::ArrayWriterBase& out) const { // here we might want to compress the array and write 
down. diff --git a/src/realm/array.hpp b/src/realm/array.hpp index 858915fe27f..aee1168920b 100644 --- a/src/realm/array.hpp +++ b/src/realm/array.hpp @@ -117,7 +117,7 @@ class Array : public Node, public ArrayParent { /// pointer. void init_from_mem(MemRef) noexcept; - /// Same as `init_from_ref(get_ref_from_parent())`. + /// Same as `init_from_ref(ref_from_parent())`. void init_from_parent() noexcept { ref_type ref = get_ref_from_parent(); @@ -362,7 +362,8 @@ class Array : public Node, public ArrayParent { /// state (as if calling detach()), then free the allocated memory. If this /// accessor is already in the detached state, this function has no effect /// (idempotency). - void destroy_deep() noexcept; + /// If 'ro_only', only free space in read-only memory (the file) + void destroy_deep(bool ro_only = false) noexcept; /// check if the array is encoded (in B format) inline bool is_compressed() const; @@ -377,13 +378,13 @@ class Array : public Node, public ArrayParent { bool try_decompress(); /// Shorthand for `destroy_deep(MemRef(ref, alloc), alloc)`. - static void destroy_deep(ref_type ref, Allocator& alloc) noexcept; + static void destroy_deep(ref_type ref, Allocator& alloc, bool ro_only = false) noexcept; /// Destroy the specified array node and all of its children, recursively. /// /// This is done by freeing the specified array node after calling /// destroy_deep() for every contained 'ref' element. 
- static void destroy_deep(MemRef, Allocator&) noexcept; + static void destroy_deep(MemRef, Allocator&, bool ro_only = false) noexcept; // Clone deep static MemRef clone(MemRef, Allocator& from_alloc, Allocator& target_alloc); @@ -544,7 +545,7 @@ class Array : public Node, public ArrayParent { // Overriding method in ArrayParent ref_type get_child_ref(size_t) const noexcept override; - void destroy_children(size_t offset = 0) noexcept; + void destroy_children(size_t offset = 0, bool ro_only = false) noexcept; protected: // Getters and Setters for adaptive-packed arrays @@ -916,16 +917,17 @@ inline void Array::set_context_flag(bool value) noexcept } } -inline void Array::destroy_deep() noexcept +inline void Array::destroy_deep(bool ro_only) noexcept { if (!is_attached()) return; if (m_has_refs) - destroy_children(); + destroy_children(0, ro_only); char* header = get_header_from_data(m_data); - m_alloc.free_(m_ref, header); + if (!ro_only || is_read_only()) + m_alloc.free_(m_ref, header); m_data = nullptr; } @@ -968,20 +970,21 @@ inline void Array::clear_and_destroy_children() truncate_and_destroy_children(0); } -inline void Array::destroy_deep(ref_type ref, Allocator& alloc) noexcept +inline void Array::destroy_deep(ref_type ref, Allocator& alloc, bool ro_only) noexcept { - destroy_deep(MemRef(ref, alloc), alloc); + destroy_deep(MemRef(ref, alloc), alloc, ro_only); } -inline void Array::destroy_deep(MemRef mem, Allocator& alloc) noexcept +inline void Array::destroy_deep(MemRef mem, Allocator& alloc, bool ro_only) noexcept { if (!get_hasrefs_from_header(mem.get_addr())) { - alloc.free_(mem); + if (!ro_only || alloc.is_read_only(mem.get_ref())) + alloc.free_(mem); return; } Array array(alloc); array.init_from_mem(mem); - array.destroy_deep(); + array.destroy_deep(ro_only); } diff --git a/src/realm/array_backlink.cpp b/src/realm/array_backlink.cpp index bf4cfddb8da..4190a648a1b 100644 --- a/src/realm/array_backlink.cpp +++ b/src/realm/array_backlink.cpp @@ -225,12 
+225,12 @@ void ArrayBacklink::verify() const REALM_ASSERT(src_obj.get(src_col_key).get_link() == target_link); } else if (val.is_type(type_List)) { - DummyParent parent(src_table, val.get_ref()); + DummyParent parent(src_table, val.get_ref(), src_col_key); Lst list(parent, 0); REALM_ASSERT(list.find_any(target_link) != npos); } else if (val.is_type(type_Dictionary)) { - DummyParent parent(src_table, val.get_ref()); + DummyParent parent(src_table, val.get_ref(), src_col_key); Dictionary dict(parent, 0); REALM_ASSERT(dict.find_any(target_link) != npos); } diff --git a/src/realm/array_integer.cpp b/src/realm/array_integer.cpp index 7cf99ff8dff..dca47b7069f 100644 --- a/src/realm/array_integer.cpp +++ b/src/realm/array_integer.cpp @@ -22,6 +22,8 @@ #include #include +#include + using namespace realm; ArrayInteger::ArrayInteger(Allocator& allocator) noexcept diff --git a/src/realm/array_integer.hpp b/src/realm/array_integer.hpp index 9c31e5f6bf9..ed680588f51 100644 --- a/src/realm/array_integer.hpp +++ b/src/realm/array_integer.hpp @@ -176,6 +176,7 @@ inline ArrayIntNull::~ArrayIntNull() noexcept {} inline size_t ArrayIntNull::size() const noexcept { + // this cannot be right, what if size is 0 return Array::size() - 1; } diff --git a/src/realm/array_mixed.cpp b/src/realm/array_mixed.cpp index 3f65498c844..af7e0e0174b 100644 --- a/src/realm/array_mixed.cpp +++ b/src/realm/array_mixed.cpp @@ -118,6 +118,22 @@ void ArrayMixed::set_null(size_t ndx) } } +std::optional ArrayMixed::get_string_id(size_t ndx) const +{ + int64_t val = m_composite.get(ndx); + if (val) { + const int64_t int_val = val >> s_data_shift; + const size_t payload_ndx{(size_t)int_val}; + const DataType type((val & s_data_type_mask) - 1); + if (type == type_String) { + ensure_string_array(); + REALM_ASSERT(size_t(int_val) < m_strings.size()); + return m_strings.get_string_id(payload_ndx); + } + } + return {}; +} + Mixed ArrayMixed::get(size_t ndx) const { int64_t val = m_composite.get(ndx); @@ -363,9 
+379,8 @@ ref_type ArrayMixed::typed_write(ref_type top_ref, _impl::ArrayWriterBase& out, 2. int and pair int arrays, they are used for storing integers, timestamps, floats, doubles, decimals, links. In general we can compress them, but we need to be careful, controlling the col_type should prevent compressing data that we want to leave in the current format. - 3. string array is for strings and binary data (no compression for now) - 4. ref array is actually storing refs to collections. they can only be BPlusTree or - BPlusTree. + 3. string array is for strings and binary data + 4. ref array is actually storing refs to collections. They can only be Lst or Dictionary. 5. key array stores unique identifiers for collections in mixed (integers that can be compressed) */ Array composite(alloc); @@ -375,41 +390,48 @@ ref_type ArrayMixed::typed_write(ref_type top_ref, _impl::ArrayWriterBase& out, auto ref = top.get(i); ref_type new_ref = ref; if (ref && !(out.only_modified && alloc.is_read_only(ref))) { - if (i < 3) { // int, and pair_int - // integer arrays - new_ref = Array::write(ref, alloc, out, out.only_modified, out.compress); - } - else if (i == 4) { // collection in mixed - ArrayRef arr_ref(alloc); - arr_ref.init_from_ref(ref); - auto ref_sz = arr_ref.size(); - TempArray written_ref_leaf(ref_sz); - - for (size_t k = 0; k < ref_sz; k++) { - ref_type new_sub_ref = 0; - if (auto sub_ref = arr_ref.get(k)) { - auto header = alloc.translate(sub_ref); - // Now we have to find out if the nested collection is a - // dictionary or a list. 
If the top array has a size of 2 - // and it is not a BplusTree inner node, then it is a dictionary - if (NodeHeader::get_size_from_header(header) == 2 && - !NodeHeader::get_is_inner_bptree_node_from_header(header)) { - new_sub_ref = Dictionary::typed_write(sub_ref, out, alloc); - } - else { - new_sub_ref = BPlusTree::typed_write(sub_ref, out, alloc); + switch (i) { + case payload_idx_int: + // integer array + new_ref = Array::write(ref, alloc, out, out.only_modified, out.compress); + break; + case payload_idx_pair: + // integer array + new_ref = Array::write(ref, alloc, out, out.only_modified, out.compress); + break; + case payload_idx_str: + new_ref = ArrayString::typed_write(ref, out, alloc); + break; + case payload_idx_ref: { + // collection in mixed + ArrayRef arr_ref(alloc); + arr_ref.init_from_ref(ref); + auto ref_sz = arr_ref.size(); + TempArray written_ref_leaf(ref_sz); + + for (size_t k = 0; k < ref_sz; k++) { + ref_type new_sub_ref = 0; + if (auto sub_ref = arr_ref.get(k)) { + auto header = alloc.translate(sub_ref); + // Now we have to find out if the nested collection is a + // dictionary or a list. 
If the top array has a size of 2 + // and it is not a BplusTree inner node, then it is a dictionary + if (NodeHeader::get_size_from_header(header) == 2 && + !NodeHeader::get_is_inner_bptree_node_from_header(header)) { + new_sub_ref = Dictionary::typed_write(sub_ref, out, alloc); + } + else { + new_sub_ref = BPlusTree::typed_write(sub_ref, out, alloc); + } } + written_ref_leaf.set_as_ref(k, new_sub_ref); } - written_ref_leaf.set_as_ref(k, new_sub_ref); + new_ref = written_ref_leaf.write(out); + break; } - new_ref = written_ref_leaf.write(out); - } - else if (i == 5) { // unique keys associated to collections in mixed - new_ref = Array::write(ref, alloc, out, out.only_modified, out.compress); - } - else { - // all the rest we don't want to compress it, at least for now (strings will be needed) - new_ref = Array::write(ref, alloc, out, out.only_modified, false); + case payload_idx_key: + new_ref = Array::write(ref, alloc, out, out.only_modified, out.compress); + break; } } written_leaf.set(i, new_ref); diff --git a/src/realm/array_mixed.hpp b/src/realm/array_mixed.hpp index a0de93b8339..95afc264fef 100644 --- a/src/realm/array_mixed.hpp +++ b/src/realm/array_mixed.hpp @@ -64,6 +64,15 @@ class ArrayMixed : public ArrayPayload, private Array { { Array::set_parent(parent, ndx_in_parent); } + bool need_string_interner() const override + { + return true; + } + virtual void set_string_interner(StringInterner* interner) const override + { + m_strings.set_string_interner(interner); + } + void init_from_parent() { ref_type ref = get_ref_from_parent(); @@ -88,6 +97,7 @@ class ArrayMixed : public ArrayPayload, private Array { { return m_composite.get(ndx) == 0; } + std::optional get_string_id(size_t ndx) const; void clear(); void erase(size_t ndx); diff --git a/src/realm/array_string.cpp b/src/realm/array_string.cpp index 636a60a2865..7d47862be72 100644 --- a/src/realm/array_string.cpp +++ b/src/realm/array_string.cpp @@ -17,7 +17,10 @@ 
**************************************************************************/ #include -#include +#include +#include +#include +#include #include using namespace realm; @@ -52,14 +55,9 @@ void ArrayString::init_from_mem(MemRef mem) noexcept else { auto arr = new (&m_storage) Array(m_alloc); arr->init_from_mem(mem); - m_string_enum_values = std::make_unique(m_alloc); - ArrayParent* p; - REALM_ASSERT(m_spec != nullptr); - REALM_ASSERT(m_col_ndx != realm::npos); - ref_type r = m_spec->get_enumkeys_ref(m_col_ndx, p); - m_string_enum_values->init_from_ref(r); - m_string_enum_values->set_parent(p, m_col_ndx); - m_type = Type::enum_strings; + // init for new interned strings + m_type = Type::interned_strings; + // consider if we want this invariant: REALM_ASSERT_DEBUG(m_string_interner); } } else { @@ -110,7 +108,7 @@ size_t ArrayString::size() const return static_cast(m_arr)->size(); case Type::big_strings: return static_cast(m_arr)->size(); - case Type::enum_strings: + case Type::interned_strings: return static_cast(m_arr)->size(); } return {}; @@ -128,7 +126,7 @@ void ArrayString::add(StringData value) case Type::big_strings: static_cast(m_arr)->add_string(value); break; - case Type::enum_strings: { + case Type::interned_strings: { auto a = static_cast(m_arr); size_t ndx = a->size(); a->add(0); @@ -150,14 +148,9 @@ void ArrayString::set(size_t ndx, StringData value) case Type::big_strings: static_cast(m_arr)->set_string(ndx, value); break; - case Type::enum_strings: { - size_t sz = m_string_enum_values->size(); - size_t res = m_string_enum_values->find_first(value, 0, sz); - if (res == realm::not_found) { - m_string_enum_values->add(value); - res = sz; - } - static_cast(m_arr)->set(ndx, res); + case Type::interned_strings: { + auto id = m_string_interner->intern(value); + static_cast(m_arr)->set(ndx, id); break; } } @@ -175,9 +168,10 @@ void ArrayString::insert(size_t ndx, StringData value) case Type::big_strings: static_cast(m_arr)->insert_string(ndx, value); break; - 
case Type::enum_strings: { + case Type::interned_strings: { static_cast(m_arr)->insert(ndx, 0); set(ndx, value); + break; } } } @@ -191,29 +185,20 @@ StringData ArrayString::get(size_t ndx) const return static_cast(m_arr)->get_string(ndx); case Type::big_strings: return static_cast(m_arr)->get_string(ndx); - case Type::enum_strings: { - size_t index = size_t(static_cast(m_arr)->get(ndx)); - return m_string_enum_values->get(index); + case Type::interned_strings: { + size_t id = size_t(static_cast(m_arr)->get(ndx)); + return m_string_interner->get(id); } } return {}; } -StringData ArrayString::get_legacy(size_t ndx) const +std::optional ArrayString::get_string_id(size_t ndx) const { - switch (m_type) { - case Type::small_strings: - return static_cast(m_arr)->get(ndx); - case Type::medium_strings: - return static_cast(m_arr)->get_string_legacy(ndx); - case Type::big_strings: - return static_cast(m_arr)->get_string(ndx); - case Type::enum_strings: { - size_t index = size_t(static_cast(m_arr)->get(ndx)); - return m_string_enum_values->get(index); - } + if (m_type == Type::interned_strings) { + return StringID(static_cast(m_arr)->get(ndx)); } - return {}; + return m_string_interner->lookup(get(ndx)); } Mixed ArrayString::get_any(size_t ndx) const @@ -230,9 +215,9 @@ bool ArrayString::is_null(size_t ndx) const return static_cast(m_arr)->is_null(ndx); case Type::big_strings: return static_cast(m_arr)->is_null(ndx); - case Type::enum_strings: { - size_t index = size_t(static_cast(m_arr)->get(ndx)); - return m_string_enum_values->is_null(index); + case Type::interned_strings: { + size_t id = size_t(static_cast(m_arr)->get(ndx)); + return id == 0; } } return {}; @@ -250,7 +235,7 @@ void ArrayString::erase(size_t ndx) case Type::big_strings: static_cast(m_arr)->erase(ndx); break; - case Type::enum_strings: + case Type::interned_strings: static_cast(m_arr)->erase(ndx); break; } @@ -273,9 +258,8 @@ void ArrayString::move(ArrayString& dst, size_t ndx) case Type::big_strings: 
static_cast(m_arr)->truncate(ndx); break; - case Type::enum_strings: - // this operation will never be called for enumerated columns - REALM_UNREACHABLE(); + case Type::interned_strings: + m_arr->truncate(ndx); break; } } @@ -292,13 +276,23 @@ void ArrayString::clear() case Type::big_strings: static_cast(m_arr)->clear(); break; - case Type::enum_strings: + case Type::interned_strings: static_cast(m_arr)->clear(); break; } } size_t ArrayString::find_first(StringData value, size_t begin, size_t end) const noexcept +{ + // This should only be called if we don't have a string id for this particular array (aka no string interner) + std::optional id; + if (m_type == Type::interned_strings) + id = m_string_interner->lookup(value); + + return find_first(value, begin, end, id); +} + +size_t ArrayString::find_first(StringData value, size_t begin, size_t end, std::optional id) const noexcept { switch (m_type) { case Type::small_strings: @@ -313,14 +307,14 @@ size_t ArrayString::find_first(StringData value, size_t begin, size_t end) const return static_cast(m_arr)->find_first(as_binary, true, begin, end); break; } - case Type::enum_strings: { - size_t sz = m_string_enum_values->size(); - size_t res = m_string_enum_values->find_first(value, 0, sz); - if (res != realm::not_found) { - return static_cast(m_arr)->find_first(res, begin, end); + case Type::interned_strings: { + if (id) { + return static_cast(m_arr)->find_first(*id, begin, end); } break; } + default: + break; } return not_found; } @@ -369,7 +363,8 @@ size_t ArrayString::lower_bound(StringData value) return lower_bound_string(static_cast(m_arr), value); case Type::big_strings: return lower_bound_string(static_cast(m_arr), value); - case Type::enum_strings: + case Type::interned_strings: + REALM_UNREACHABLE(); break; } return realm::npos; @@ -380,8 +375,8 @@ ArrayString::Type ArrayString::upgrade_leaf(size_t value_size) if (m_type == Type::big_strings) return Type::big_strings; - if (m_type == Type::enum_strings) - 
return Type::enum_strings; + if (m_type == Type::interned_strings) + return Type::interned_strings; if (m_type == Type::medium_strings) { if (value_size <= medium_string_max_size) @@ -472,9 +467,45 @@ void ArrayString::verify() const case Type::big_strings: static_cast(m_arr)->verify(); break; - case Type::enum_strings: + case Type::interned_strings: static_cast(m_arr)->verify(); break; } #endif } + +template <> +ref_type ArrayString::typed_write(ref_type ref, _impl::ArrayWriterBase& out, Allocator& alloc) +{ + Array leaf(alloc); + leaf.init_from_ref(ref); + ref_type ret_val; + auto header = leaf.get_header(); + if (NodeHeader::get_hasrefs_from_header(header) || + NodeHeader::get_wtype_from_header(header) == NodeHeader::wtype_Multiply) { + // We're interning these strings + ArrayString as(alloc); + as.init_from_ref(ref); + StringInterner* interner = out.table->get_string_interner(out.col_key); + auto sz = as.size(); + Array interned(Allocator::get_default()); + interned.create(NodeHeader::type_Normal, true, sz); + for (size_t i = 0; i < sz; ++i) { + interned.set(i, interner->intern(as.get(i))); + } + ret_val = interned.write(out, false, false, out.compress); + interned.destroy(); + // in a transactional setting: + // Destroy all sub-arrays if present, in order to release memory in file + // This is contrary to the rest of the handling in this function, but needed + // here since sub-arrays may not have been COW'ed and therefore not freed in file. + // We rely on 'only_modified' to indicate that we're in a transactional setting. 
+ if (out.only_modified) + leaf.destroy_deep(true); + } + else { + // just write out the array using integer leaf compression + ret_val = leaf.write(out, false, out.only_modified, out.compress); + } + return ret_val; +} diff --git a/src/realm/array_string.hpp b/src/realm/array_string.hpp index 4dc96646378..0e5a5cc3895 100644 --- a/src/realm/array_string.hpp +++ b/src/realm/array_string.hpp @@ -22,6 +22,7 @@ #include #include #include +#include namespace realm { @@ -66,14 +67,17 @@ class ArrayString : public ArrayPayload { { m_arr->set_parent(p, n); } - bool need_spec() const override + bool need_string_interner() const override { return true; } - void set_spec(Spec* spec, size_t col_ndx) const override + void set_string_interner(StringInterner* string_interner) const override { - m_spec = spec; - m_col_ndx = col_ndx; + m_string_interner = string_interner; + } + bool is_compressed() const + { + return m_type == Type::interned_strings; } void update_parent() @@ -100,7 +104,7 @@ class ArrayString : public ArrayPayload { } void insert(size_t ndx, StringData value); StringData get(size_t ndx) const; - StringData get_legacy(size_t ndx) const; + std::optional get_string_id(size_t ndx) const; Mixed get_any(size_t ndx) const override; bool is_null(size_t ndx) const; void erase(size_t ndx); @@ -109,6 +113,9 @@ class ArrayString : public ArrayPayload { size_t find_first(StringData value, size_t begin, size_t end) const noexcept; + /// Special version for searching in an array or compressed strings. 
+ size_t find_first(StringData value, size_t begin, size_t end, std::optional) const noexcept; + size_t lower_bound(StringData value); /// Get the specified element without the cost of constructing an @@ -118,6 +125,8 @@ class ArrayString : public ArrayPayload { static StringData get(const char* header, size_t ndx, Allocator& alloc) noexcept; void verify() const; + template + static ref_type typed_write(ref_type ref, T& out, Allocator& alloc); private: static constexpr size_t small_string_max_size = 15; // ArrayStringShort @@ -127,18 +136,16 @@ class ArrayString : public ArrayPayload { static constexpr size_t storage_size = std::max({sizeof(ArrayStringShort), sizeof(ArraySmallBlobs), sizeof(ArrayBigBlobs), sizeof(Array)}); - enum class Type { small_strings, medium_strings, big_strings, enum_strings }; + enum class Type { small_strings, medium_strings, big_strings, interned_strings }; Type m_type = Type::small_strings; Allocator& m_alloc; alignas(storage_alignment) std::byte m_storage[storage_size]; Array* m_arr; - mutable Spec* m_spec = nullptr; - mutable size_t m_col_ndx = realm::npos; bool m_nullable = true; - std::unique_ptr m_string_enum_values; + mutable StringInterner* m_string_interner = nullptr; Type upgrade_leaf(size_t value_size); }; diff --git a/src/realm/array_timestamp.hpp b/src/realm/array_timestamp.hpp index e2982934080..c486df2479a 100644 --- a/src/realm/array_timestamp.hpp +++ b/src/realm/array_timestamp.hpp @@ -76,7 +76,8 @@ class ArrayTimestamp : public ArrayPayload, private Array { Timestamp get(size_t ndx) const { util::Optional seconds = m_seconds.get(ndx); - return seconds ? Timestamp(*seconds, int32_t(m_nanoseconds.get(ndx))) : Timestamp{}; + int32_t nano = (int32_t)m_nanoseconds.get(ndx); + return seconds ? 
Timestamp(*seconds, nano) : Timestamp{}; } Mixed get_any(size_t ndx) const final { diff --git a/src/realm/array_unsigned.cpp b/src/realm/array_unsigned.cpp index 938fe5aece8..55f030522b9 100644 --- a/src/realm/array_unsigned.cpp +++ b/src/realm/array_unsigned.cpp @@ -92,23 +92,25 @@ void ArrayUnsigned::update_from_parent() noexcept size_t ArrayUnsigned::lower_bound(uint64_t value) const noexcept { - if (m_width == 8) { + auto width = get_width_from_header(get_header()); + + if (width == 8) { uint8_t* arr = reinterpret_cast(m_data); uint8_t* pos = std::lower_bound(arr, arr + m_size, value); return pos - arr; } - else if (m_width == 16) { + else if (width == 16) { uint16_t* arr = reinterpret_cast(m_data); uint16_t* pos = std::lower_bound(arr, arr + m_size, value); return pos - arr; } - else if (m_width == 32) { + else if (width == 32) { uint32_t* arr = reinterpret_cast(m_data); uint32_t* pos = std::lower_bound(arr, arr + m_size, value); return pos - arr; } - else if (m_width < 8) { - switch (m_width) { + else if (width < 8) { + switch (width) { case 0: return realm::lower_bound<0>(m_data, m_size, value); case 1: @@ -130,23 +132,25 @@ size_t ArrayUnsigned::lower_bound(uint64_t value) const noexcept size_t ArrayUnsigned::upper_bound(uint64_t value) const noexcept { - if (m_width == 8) { + auto width = get_width_from_header(get_header()); + + if (width == 8) { uint8_t* arr = reinterpret_cast(m_data); uint8_t* pos = std::upper_bound(arr, arr + m_size, value); return pos - arr; } - else if (m_width == 16) { + else if (width == 16) { uint16_t* arr = reinterpret_cast(m_data); uint16_t* pos = std::upper_bound(arr, arr + m_size, value); return pos - arr; } - else if (m_width == 32) { + else if (width == 32) { uint32_t* arr = reinterpret_cast(m_data); uint32_t* pos = std::upper_bound(arr, arr + m_size, value); return pos - arr; } - else if (m_width < 8) { - switch (m_width) { + else if (width < 8) { + switch (width) { case 0: return realm::upper_bound<0>(m_data, m_size, 
value); case 1: diff --git a/src/realm/bplustree.hpp b/src/realm/bplustree.hpp index 5f763892b7a..5eb92512993 100644 --- a/src/realm/bplustree.hpp +++ b/src/realm/bplustree.hpp @@ -30,6 +30,7 @@ namespace realm { class BPlusTreeBase; class BPlusTreeInner; +class StringInterner; /*****************************************************************************/ /* BPlusTreeNode */ @@ -207,6 +208,16 @@ class BPlusTreeBase { m_root->bp_set_parent(parent, ndx_in_parent); } + void set_interner(StringInterner* interner) + { + m_interner = interner; + } + + StringInterner* get_interner() + { + return m_interner; + } + virtual void erase(size_t) = 0; virtual void clear() = 0; virtual void swap(size_t, size_t) = 0; @@ -232,6 +243,7 @@ class BPlusTreeBase { std::unique_ptr m_root; Allocator& m_alloc; ArrayParent* m_parent = nullptr; + StringInterner* m_interner = nullptr; size_t m_ndx_in_parent = 0; size_t m_size = 0; size_t m_cached_leaf_begin; @@ -298,6 +310,9 @@ class BPlusTree : public BPlusTreeBase { void init_from_ref(ref_type ref) noexcept override { LeafArray::init_from_ref(ref); + if constexpr (realm::is_any_v) { + LeafArray::set_string_interner(m_tree->get_interner()); + } } ref_type get_ref() const override @@ -572,19 +587,25 @@ class BPlusTree : public BPlusTreeBase { std::unique_ptr create_leaf_node() override { - std::unique_ptr leaf = std::make_unique(this); - static_cast(leaf.get())->create(); + auto leaf = std::make_unique(this); + leaf->create(); + if constexpr (realm::is_any_v) { + leaf->set_string_interner(m_interner); + } return leaf; } std::unique_ptr init_leaf_node(ref_type ref) override { - std::unique_ptr leaf = std::make_unique(this); + auto leaf = std::make_unique(this); leaf->init_from_ref(ref); return leaf; } BPlusTreeLeaf* cache_leaf(MemRef mem) override { m_leaf_cache.init_from_mem(mem); + if constexpr (realm::is_any_v) { + m_leaf_cache.LeafArray::set_string_interner(m_interner); + } return &m_leaf_cache; } void replace_root(std::unique_ptr 
new_root) override diff --git a/src/realm/cluster.cpp b/src/realm/cluster.cpp index 1734993ebce..c1207f0d54f 100644 --- a/src/realm/cluster.cpp +++ b/src/realm/cluster.cpp @@ -154,12 +154,7 @@ void Cluster::create() do_create(col_key); break; case col_type_String: { - if (m_tree_top.is_string_enum_type(col_ndx)) { - do_create(col_key); - } - else { - do_create(col_key); - } + do_create(col_key); break; } case col_type_Binary: @@ -251,14 +246,20 @@ size_t Cluster::node_size_from_header(Allocator& alloc, const char* header) } template -inline void Cluster::set_spec(T&, ColKey::Idx) const +inline void Cluster::set_string_interner(T&, ColKey) const { } template <> -inline void Cluster::set_spec(ArrayString& arr, ColKey::Idx col_ndx) const +inline void Cluster::set_string_interner(ArrayString& arr, ColKey col_key) const { - m_tree_top.set_spec(arr, col_ndx); + m_tree_top.set_string_interner(arr, col_key); +} + +template <> +inline void Cluster::set_string_interner(ArrayMixed& arr, ColKey col_key) const +{ + m_tree_top.set_string_interner(arr, col_key); } template @@ -269,7 +270,7 @@ inline void Cluster::do_insert_row(size_t ndx, ColKey col, Mixed init_val, bool T arr(m_alloc); auto col_ndx = col.get_index(); arr.set_parent(this, col_ndx.val + s_first_col_index); - set_spec(arr, col_ndx); + set_string_interner(arr, col); arr.init_from_parent(); if (init_val.is_null()) { arr.insert(ndx, T::default_value(nullable)); @@ -302,6 +303,7 @@ inline void Cluster::do_insert_mixed(size_t ndx, ColKey col_key, Mixed init_valu { ArrayMixed arr(m_alloc); arr.set_parent(this, col_key.get_index().val + s_first_col_index); + set_string_interner(arr, col_key); arr.init_from_parent(); arr.insert(ndx, init_value); @@ -447,10 +449,12 @@ inline void Cluster::do_move(size_t ndx, ColKey col_key, Cluster* to) T src(m_alloc); src.set_parent(this, col_ndx); src.init_from_parent(); + set_string_interner(src, col_key); T dst(m_alloc); dst.set_parent(to, col_ndx); dst.init_from_parent(); + 
set_string_interner(dst, col_key); src.move(dst, ndx); } @@ -486,13 +490,9 @@ void Cluster::move(size_t ndx, ClusterNode* new_node, int64_t offset) case col_type_Double: do_move(ndx, col_key, new_leaf); break; - case col_type_String: { - if (m_tree_top.is_string_enum_type(col_key.get_index())) - do_move(ndx, col_key, new_leaf); - else - do_move(ndx, col_key, new_leaf); + case col_type_String: + do_move(ndx, col_key, new_leaf); break; - } case col_type_Binary: do_move(ndx, col_key, new_leaf); break; @@ -760,7 +760,7 @@ inline void Cluster::do_erase(size_t ndx, ColKey col_key) auto col_ndx = col_key.get_index(); T values(m_alloc); values.set_parent(this, col_ndx.val + s_first_col_index); - set_spec(values, col_ndx); + set_string_interner(values, col_key); values.init_from_parent(); if constexpr (std::is_same_v) { if (ObjLink link = values.get(ndx)) { @@ -784,6 +784,7 @@ inline void Cluster::do_erase_mixed(size_t ndx, ColKey col_key, CascadeState& st ArrayMixed values(m_alloc); values.set_parent(this, col_ndx.val + s_first_col_index); + set_string_interner(values, col_key); values.init_from_parent(); Mixed value = values.get(ndx); @@ -1025,25 +1026,6 @@ void Cluster::nullify_incoming_links(RowKey key, CascadeState& state) m_tree_top.get_owning_table()->for_each_backlink_column(nullify_fwd_links); } -void Cluster::upgrade_string_to_enum(ColKey col_key, ArrayString& keys) -{ - auto col_ndx = col_key.get_index(); - Array indexes(m_alloc); - indexes.create(Array::type_Normal, false); - ArrayString values(m_alloc); - ref_type ref = Array::get_as_ref(col_ndx.val + s_first_col_index); - values.init_from_ref(ref); - size_t sz = values.size(); - for (size_t i = 0; i < sz; i++) { - auto v = values.get(i); - size_t pos = keys.lower_bound(v); - REALM_ASSERT_3(pos, !=, keys.size()); - indexes.add(pos); - } - Array::set(col_ndx.val + s_first_col_index, indexes.get_ref()); - Array::destroy_deep(ref, m_alloc); -} - void Cluster::init_leaf(ColKey col_key, ArrayPayload* leaf) const { 
auto col_ndx = col_key.get_index(); @@ -1053,8 +1035,8 @@ void Cluster::init_leaf(ColKey col_key, ArrayPayload* leaf) const if (auto t = m_tree_top.get_owning_table()) t->check_column(col_key); ref_type ref = to_ref(Array::get(col_ndx.val + 1)); - if (leaf->need_spec()) { - m_tree_top.set_spec(*leaf, col_ndx); + if (leaf->need_string_interner()) { + m_tree_top.set_string_interner(*leaf, col_key); } leaf->init_from_ref(ref); leaf->set_parent(const_cast(this), col_ndx.val + 1); @@ -1071,7 +1053,10 @@ template void Cluster::verify(ref_type ref, size_t index, util::Optional& sz) const { ArrayType arr(get_alloc()); - set_spec(arr, ColKey::Idx{unsigned(index) - 1}); + auto table = get_owning_table(); + REALM_ASSERT(index <= table->m_leaf_ndx2colkey.size()); + auto col_key = table->m_leaf_ndx2colkey[index - 1]; + set_string_interner(arr, col_key); arr.set_parent(const_cast(this), index); arr.init_from_ref(ref); arr.verify(); @@ -1409,7 +1394,7 @@ void Cluster::dump_objects(int64_t key_offset, std::string lead) const } case col_type_String: { ArrayString arr(m_alloc); - set_spec(arr, col.get_index()); + set_string_interner(arr, col); ref_type ref = Array::get_as_ref(j); arr.init_from_ref(ref); std::cout << ", " << arr.get(i); @@ -1424,6 +1409,7 @@ void Cluster::dump_objects(int64_t key_offset, std::string lead) const } case col_type_Mixed: { ArrayMixed arr(m_alloc); + set_string_interner(arr, col); ref_type ref = Array::get_as_ref(j); arr.init_from_ref(ref); std::cout << ", " << arr.get(i); @@ -1628,6 +1614,7 @@ ref_type Cluster::typed_write(ref_type ref, _impl::ArrayWriterBase& out) const else { // Columns auto col_key = out.table->m_leaf_ndx2colkey[j - 1]; + out.col_key = col_key; auto col_type = col_key.get_type(); if (col_key.is_collection()) { ArrayRef arr_ref(m_alloc); diff --git a/src/realm/cluster.hpp b/src/realm/cluster.hpp index 564f3aa107e..ae0f166a68c 100644 --- a/src/realm/cluster.hpp +++ b/src/realm/cluster.hpp @@ -314,7 +314,6 @@ class Cluster : public 
ClusterNode { size_t get_ndx(RowKey key, size_t ndx) const noexcept override; size_t erase(RowKey k, CascadeState& state) override; void nullify_incoming_links(RowKey key, CascadeState& state) override; - void upgrade_string_to_enum(ColKey col, ArrayString& keys); void init_leaf(ColKey col, ArrayPayload* leaf) const; void add_leaf(ColKey col, ref_type ref); @@ -366,6 +365,8 @@ class Cluster : public ClusterNode { void do_insert_mixed(size_t ndx, ColKey col_key, Mixed init_value, ObjKey origin_key); template void set_spec(T&, ColKey::Idx) const; + template + void set_string_interner(T&, ColKey) const; template void verify(ref_type ref, size_t index, util::Optional& sz) const; }; diff --git a/src/realm/cluster_tree.cpp b/src/realm/cluster_tree.cpp index 9c6d7fb1fe7..6884cf9b8ca 100644 --- a/src/realm/cluster_tree.cpp +++ b/src/realm/cluster_tree.cpp @@ -891,45 +891,6 @@ void ClusterTree::clear(CascadeState& state) m_size = 0; } -void ClusterTree::enumerate_string_column(ColKey col_key) -{ - Allocator& alloc = get_alloc(); - - ArrayString keys(alloc); - ArrayString leaf(alloc); - keys.create(); - - auto collect_strings = [col_key, &leaf, &keys](const Cluster* cluster) { - cluster->init_leaf(col_key, &leaf); - size_t sz = leaf.size(); - size_t key_size = keys.size(); - for (size_t i = 0; i < sz; i++) { - auto v = leaf.get(i); - size_t pos = keys.lower_bound(v); - if (pos == key_size || keys.get(pos) != v) { - keys.insert(pos, v); // Throws - key_size++; - } - } - - return IteratorControl::AdvanceToNext; - }; - - auto upgrade = [col_key, &keys](Cluster* cluster) { - cluster->upgrade_string_to_enum(col_key, keys); - }; - - // Populate 'keys' array - traverse(collect_strings); - - // Store key strings in spec - size_t spec_ndx = m_owner->colkey2spec_ndx(col_key); - const_cast(&m_owner->m_spec)->upgrade_string_to_enum(spec_ndx, keys.get_ref()); - - // Replace column in all clusters - update(upgrade); -} - void ClusterTree::replace_root(std::unique_ptr new_root) { if 
(new_root != m_root) { @@ -1095,13 +1056,12 @@ void ClusterTree::update(UpdateFunction func) } } -void ClusterTree::set_spec(ArrayPayload& arr, ColKey::Idx col_ndx) const +void ClusterTree::set_string_interner(ArrayPayload& arr, ColKey col_key) const { // Check for owner. This function may be called in context of DictionaryClusterTree // in which case m_owner is null (and spec never needed). if (m_owner) { - auto spec_ndx = m_owner->leaf_ndx2spec_ndx(col_ndx); - arr.set_spec(&m_owner->m_spec, spec_ndx); + arr.set_string_interner(m_owner->get_string_interner(col_key)); } } @@ -1134,12 +1094,6 @@ void ClusterTree::nullify_incoming_links(ObjKey obj_key, CascadeState& state) m_root->nullify_incoming_links(ClusterNode::RowKey(obj_key), state); } -bool ClusterTree::is_string_enum_type(ColKey::Idx col_ndx) const -{ - size_t spec_ndx = m_owner->leaf_ndx2spec_ndx(col_ndx); - return m_owner->m_spec.is_string_enum_type(spec_ndx); -} - void ClusterTree::remove_all_links(CascadeState& state) { Allocator& alloc = get_alloc(); diff --git a/src/realm/cluster_tree.hpp b/src/realm/cluster_tree.hpp index 1b0d05c759b..f0bcb9c48bc 100644 --- a/src/realm/cluster_tree.hpp +++ b/src/realm/cluster_tree.hpp @@ -153,7 +153,6 @@ class ClusterTree { } void clear(CascadeState&); - void enumerate_string_column(ColKey col_key); const Table* get_owning_table() const noexcept { @@ -180,7 +179,7 @@ class ClusterTree { // Visit all leaves and call the supplied function. The function can modify the leaf. 
void update(UpdateFunction func); - void set_spec(ArrayPayload& arr, ColKey::Idx col_ndx) const; + void set_string_interner(ArrayPayload& arr, ColKey col_key) const; virtual std::unique_ptr get_root_from_parent(); @@ -213,7 +212,6 @@ class ClusterTree { std::unique_ptr create_root_from_parent(ArrayParent* parent, size_t ndx_in_parent); std::unique_ptr get_node(ArrayParent* parent, size_t ndx_in_parent) const; TableRef get_table_ref() const; - bool is_string_enum_type(ColKey::Idx col_ndx) const; void remove_all_links(CascadeState&); }; diff --git a/src/realm/collection.cpp b/src/realm/collection.cpp index f0eabf95d46..24d261622a6 100644 --- a/src/realm/collection.cpp +++ b/src/realm/collection.cpp @@ -155,6 +155,7 @@ void Collection::get_any(QueryCtrlBlock& ctrl, Mixed val, size_t index) BPlusTree keys(*ctrl.alloc); keys.set_parent(&top, 0); + keys.set_interner(ctrl.interner); keys.init_from_parent(); size_t start = 0; if (size_t finish = keys.size()) { @@ -177,6 +178,7 @@ void Collection::get_any(QueryCtrlBlock& ctrl, Mixed val, size_t index) } BPlusTree values(*ctrl.alloc); values.set_parent(&top, 1); + values.set_interner(ctrl.interner); values.init_from_parent(); for (; start < finish; start++) { val = values.get(start); @@ -194,6 +196,7 @@ void Collection::get_any(QueryCtrlBlock& ctrl, Mixed val, size_t index) if (!ref) return; BPlusTree list(*ctrl.alloc); + list.set_interner(ctrl.interner); list.init_from_ref(ref); if (size_t sz = list.size()) { size_t start = 0; diff --git a/src/realm/collection.hpp b/src/realm/collection.hpp index e26158a63ee..bc2d7f6b99d 100644 --- a/src/realm/collection.hpp +++ b/src/realm/collection.hpp @@ -12,15 +12,17 @@ namespace realm { +class StringInterner; template struct CollectionIterator; // Used in Cluster when removing owning object class DummyParent : public CollectionParent { public: - DummyParent(TableRef t, ref_type ref) + DummyParent(TableRef t, ref_type ref, ColKey ck) : m_obj(t, MemRef(), ObjKey(), 0) , m_ref(ref) + , 
m_col_key(ck) { } FullPath get_path() const noexcept final @@ -37,7 +39,7 @@ class DummyParent : public CollectionParent { } ColKey get_col_key() const noexcept final { - return {}; + return m_col_key; } void add_index(Path&, const Index&) const noexcept final {} size_t find_index(const Index&) const noexcept final @@ -62,6 +64,7 @@ class DummyParent : public CollectionParent { protected: Obj m_obj; ref_type m_ref; + ColKey m_col_key; UpdateStatus update_if_needed() const final { return UpdateStatus::Updated; @@ -111,6 +114,7 @@ class Collection { bool path_only_unary_keys = false; // Not from list Allocator* alloc = nullptr; Group* group = nullptr; + StringInterner* interner = nullptr; }; static void get_any(QueryCtrlBlock&, Mixed, size_t); }; diff --git a/src/realm/dictionary.cpp b/src/realm/dictionary.cpp index 63851d0e72b..788a1dfcb65 100644 --- a/src/realm/dictionary.cpp +++ b/src/realm/dictionary.cpp @@ -852,9 +852,11 @@ UpdateStatus Dictionary::init_from_parent(bool allow_create) const Allocator& alloc = get_alloc(); m_dictionary_top.reset(new Array(alloc)); m_dictionary_top->set_parent(const_cast(this), 0); + StringInterner* interner = m_col_key ? 
get_table()->get_string_interner(m_col_key) : nullptr; switch (m_key_type) { case type_String: { m_keys.reset(new BPlusTree(alloc)); + m_keys->set_interner(interner); break; } case type_Int: { @@ -867,6 +869,7 @@ UpdateStatus Dictionary::init_from_parent(bool allow_create) const m_keys->set_parent(m_dictionary_top.get(), 0); m_values.reset(new BPlusTreeMixed(alloc)); m_values->set_parent(m_dictionary_top.get(), 1); + m_values->set_interner(interner); } if (ref) { @@ -1155,12 +1158,12 @@ void Dictionary::to_json(std::ostream& out, JSONOutputMode output_mode, fn(val); } else if (val.is_type(type_Dictionary)) { - DummyParent parent(this->get_table(), val.get_ref()); + DummyParent parent(this->get_table(), val.get_ref(), m_col_key); Dictionary dict(parent, 0); dict.to_json(out, output_mode, fn); } else if (val.is_type(type_List)) { - DummyParent parent(this->get_table(), val.get_ref()); + DummyParent parent(this->get_table(), val.get_ref(), m_col_key); Lst list(parent, 0); list.to_json(out, output_mode, fn); } diff --git a/src/realm/exec/CMakeLists.txt b/src/realm/exec/CMakeLists.txt index 969b45a9e10..28a8a3fe19d 100644 --- a/src/realm/exec/CMakeLists.txt +++ b/src/realm/exec/CMakeLists.txt @@ -46,17 +46,6 @@ endif() target_link_libraries(ClickQuery Storage) -add_executable(RealmEnumerate realm_enumerate.cpp) -set_target_properties(RealmEnumerate PROPERTIES - OUTPUT_NAME "realm-enumerate" - DEBUG_POSTFIX ${CMAKE_DEBUG_POSTFIX} -) -target_link_libraries(RealmEnumerate ObjectStore) -# FIXME can be fixed for others, but requires link and install fixes for libuv target -if (NOT APPLE) - set_target_properties(RealmEnumerate PROPERTIES EXCLUDE_FROM_ALL TRUE) -endif() - add_executable(RealmDecrypt realm_decrypt.cpp) set_target_properties(RealmDecrypt PROPERTIES OUTPUT_NAME "realm-decrypt" diff --git a/src/realm/exec/realm_enumerate.cpp b/src/realm/exec/realm_enumerate.cpp deleted file mode 100644 index 44f534c8454..00000000000 --- a/src/realm/exec/realm_enumerate.cpp +++ 
/dev/null @@ -1,139 +0,0 @@ -/* - * Useage: realm-enumerate [--key crypt_key] [--threshold 0.xx] - * Changes string columns which pass the threshold of unique values to enumerated columns - * and compacts the Realm in place. - */ - -#include -#include -#include -#include -#include -#include - -#include -#include - -static void enumerate_strings(realm::SharedRealm realm, double threshold) -{ - auto& group = realm->read_group(); - auto table_keys = group.get_table_keys(); - for (auto table_key : table_keys) { - realm::TableRef t = group.get_table(table_key); - size_t table_size = t->size(); - realm::util::format(std::cout, "Begin table '%1' of size %2:\n", t->get_name(), table_size); - if (table_size == 0) - continue; - bool found_str_col = false; - auto do_convert = [&realm, &t](realm::ColKey col) { - auto start = std::chrono::steady_clock::now(); - std::cout << "[converting]" << std::flush; - realm->begin_transaction(); - t->enumerate_string_column(col); - realm->commit_transaction(); - std::chrono::duration diff = std::chrono::steady_clock::now() - start; - std::cout << " (" << diff.count() << " seconds)" << std::endl; - }; - t->for_each_public_column([&](realm::ColKey col_key) { - if (col_key.get_type() == realm::col_type_String && !col_key.is_collection()) { - found_str_col = true; - realm::util::format(std::cout, "\tcolumn '%1' ", t->get_column_name(col_key)); - std::cout << std::flush; - if (t->is_enumerated(col_key)) { - std::cout << "[already enumerated]" << std::endl; - } - else if (t->get_primary_key_column() == col_key) { - std::cout << "[pk - skipping]" << std::endl; - } - else if (threshold >= 100) { - do_convert(col_key); - } - else if (threshold < 100 && threshold > 0) { - std::unique_ptr distinct = - std::make_unique(); - distinct->append_distinct(realm::DistinctDescriptor({{col_key}})); - size_t uniques = t->where().count(*distinct.get()); - double utilization = uniques / double(table_size); - realm::util::format(std::cout, "contains %1 unique 
values (%2%%) ", uniques, utilization * 100.0); - std::cout << std::flush; - if (utilization <= threshold / 100) { - do_convert(col_key); - } - else { - std::cout << "[skipping due to threshold]" << std::endl; - } - } - else { - std::cout << "[skipping due to threshold]" << std::endl; - } - } - return realm::IteratorControl::AdvanceToNext; - }); - if (!found_str_col) { - std::cout << "\tNo string columns found." << std::endl; - } - } -} - -int main(int argc, const char* argv[]) -{ - if (argc > 1) { - try { - const char* key_ptr = nullptr; - char key[64]; - double threshold = 0; // by default don't convert, just compact - for (int curr_arg = 1; curr_arg < argc; curr_arg++) { - if (strcmp(argv[curr_arg], "--key") == 0) { - std::ifstream key_file(argv[curr_arg + 1]); - key_file.read(key, sizeof(key)); - key_ptr = key; - curr_arg++; - } - else if (strcmp(argv[curr_arg], "--threshold") == 0) { - threshold = strtod(argv[curr_arg + 1], nullptr); - curr_arg++; - } - else { - realm::util::format(std::cout, "File name '%1' for threshold %2%%\n", argv[curr_arg], threshold); - auto start = std::chrono::steady_clock::now(); - realm::Realm::Config config; - config.path = argv[curr_arg]; - if (key_ptr) { - config.encryption_key.resize(64); - memcpy(&config.encryption_key[0], &key_ptr[0], 64); - } - realm::SharedRealm realm; - try { - realm = realm::Realm::get_shared_realm(config); - } - catch (const realm::FileAccessError& e) { - std::cout << "trying to open as a sync Realm\n" << e.what() << "\n" << std::endl; - config.force_sync_history = true; - realm = realm::Realm::get_shared_realm(config); - } - enumerate_strings(realm, threshold); - realm->compact(); - std::chrono::duration diff = std::chrono::steady_clock::now() - start; - std::cout << "Done in " << diff.count() << " seconds." 
<< std::endl; - std::cout << std::endl; - return 0; - } - } - } - catch (const std::exception& e) { - std::cout << e.what() << std::endl; - } - } - else { - std::cout << "Usage: realm-enumerate [--key crypt_key] [--threshold 0.xx] " << std::endl; - std::cout << "The optional crypt_key arg is a filename which contains the 64 byte key." << std::endl; - std::cout - << "The optional threshold is a number between [0, 100] indicating the percentage of unique strings " - "below which columns will be converted. At a value of 100, all columns will be converted. " - "For value of 50 only columns which have 50% or fewer unique values will be converted." - "If not set, the threshold default is 0 which just compacts the file without converting anything." - << std::endl; - } - - return 0; -} diff --git a/src/realm/group.cpp b/src/realm/group.cpp index b6703b3af53..7064b511d2c 100644 --- a/src/realm/group.cpp +++ b/src/realm/group.cpp @@ -943,19 +943,17 @@ ref_type Group::typed_write_tables(_impl::ArrayWriterBase& out) const ref_type ref = m_top.get_as_ref(1); if (out.only_modified && m_alloc.is_read_only(ref)) return ref; - Array a(m_alloc); - a.init_from_ref(ref); - REALM_ASSERT_DEBUG(a.has_refs()); - TempArray dest(a.size()); - for (unsigned j = 0; j < a.size(); ++j) { - RefOrTagged rot = a.get_as_ref_or_tagged(j); + auto num_tables = m_tables.size(); + TempArray dest(num_tables); + for (unsigned j = 0; j < num_tables; ++j) { + RefOrTagged rot = m_tables.get_as_ref_or_tagged(j); if (rot.is_tagged()) { dest.set(j, rot); } else { auto table = do_get_table(j); REALM_ASSERT_DEBUG(table); - dest.set_as_ref(j, table->typed_write(rot.get_as_ref(), out)); + dest.set_as_ref(j, table->typed_write(out)); } } return dest.write(out); @@ -1326,7 +1324,7 @@ void Group::flush_accessors_for_commit() acc->flush_for_commit(); } -void Group::refresh_dirty_accessors() +void Group::refresh_dirty_accessors(bool writable) { if (!m_tables.is_attached()) { m_table_accessors.clear(); @@ -1356,7 +1354,7 
@@ void Group::refresh_dirty_accessors() same_table = true; } if (same_table) { - table_accessor->refresh_accessor_tree(); + table_accessor->refresh_accessor_tree(writable); } else { table_accessor->detach(Table::cookie_removed); @@ -1414,7 +1412,7 @@ void Group::advance_transact(ref_type new_top_ref, util::InputStream* in, bool w m_top.detach(); // Soft detach bool create_group_when_missing = false; // See Group::attach_shared(). attach(new_top_ref, writable, create_group_when_missing); // Throws - refresh_dirty_accessors(); // Throws + refresh_dirty_accessors(writable); // Throws if (schema_changed) send_schema_change_notification(); diff --git a/src/realm/group.hpp b/src/realm/group.hpp index 352c5bd25fb..9145d2b9c5a 100644 --- a/src/realm/group.hpp +++ b/src/realm/group.hpp @@ -679,7 +679,7 @@ class Group : public ArrayParent { /// Memory mappings must have been updated to reflect any growth in filesize before /// calling advance_transact() void advance_transact(ref_type new_top_ref, util::InputStream*, bool writable); - void refresh_dirty_accessors(); + void refresh_dirty_accessors(bool writable); void flush_accessors_for_commit(); /// \brief The version of the format of the node structure (in file or in diff --git a/src/realm/group_writer.cpp b/src/realm/group_writer.cpp index 22ce7db93ac..533565f39d2 100644 --- a/src/realm/group_writer.cpp +++ b/src/realm/group_writer.cpp @@ -647,6 +647,7 @@ ref_type GroupWriter::write_group() { ALLOC_DBG_COUT("Commit nr " << m_current_version << " ( from " << m_oldest_reachable_version << " )" << std::endl); + // m_group.typed_print(""); read_in_freelist(); // Now, 'm_size_map' holds all free elements candidate for recycling @@ -710,7 +711,7 @@ ref_type GroupWriter::write_group() top.set_as_ref(Group::s_evacuation_point_ndx, ref); } else if (ref) { - Array::destroy(ref, m_alloc); + Array::destroy(ref_type(ref), m_alloc); top.set(Group::s_evacuation_point_ndx, 0); } } @@ -788,7 +789,9 @@ ref_type GroupWriter::write_group() 
top.set(Group::s_file_size_ndx, RefOrTagged::make_tagged(m_logical_size)); auto ref = top.get_as_ref(Group::s_evacuation_point_ndx); REALM_ASSERT(ref); - Array::destroy(ref, m_alloc); + Array destroy_array(m_alloc); + destroy_array.init_from_ref(ref); + destroy_array.destroy(); top.set(Group::s_evacuation_point_ndx, 0); m_evacuation_limit = 0; diff --git a/src/realm/impl/array_writer.hpp b/src/realm/impl/array_writer.hpp index 4096805e0fa..c6a7e18e413 100644 --- a/src/realm/impl/array_writer.hpp +++ b/src/realm/impl/array_writer.hpp @@ -20,6 +20,7 @@ #define REALM_ARRAY_WRITER_HPP #include +#include namespace realm { class Table; @@ -30,6 +31,7 @@ class ArrayWriterBase { bool only_modified = true; bool compress = true; const Table* table; + ColKey col_key; virtual ~ArrayWriterBase() { } diff --git a/src/realm/list.cpp b/src/realm/list.cpp index 2a9b61b63ee..e782198e7a1 100644 --- a/src/realm/list.cpp +++ b/src/realm/list.cpp @@ -386,6 +386,8 @@ UpdateStatus Lst::init_from_parent(bool allow_create) const m_tree.reset(new BPlusTreeMixed(get_alloc())); const ArrayParent* parent = this; m_tree->set_parent(const_cast(parent), 0); + if (m_col_key) + m_tree->set_interner(get_table()->get_string_interner(m_col_key)); } try { return do_init_from_parent(m_tree.get(), Base::get_collection_ref(), allow_create); @@ -746,12 +748,12 @@ void Lst::to_json(std::ostream& out, JSONOutputMode output_mode, fn(val); } else if (val.is_type(type_Dictionary)) { - DummyParent parent(this->get_table(), val.get_ref()); + DummyParent parent(this->get_table(), val.get_ref(), m_col_key); Dictionary dict(parent, i); dict.to_json(out, output_mode, fn); } else if (val.is_type(type_List)) { - DummyParent parent(this->get_table(), val.get_ref()); + DummyParent parent(this->get_table(), val.get_ref(), m_col_key); Lst list(parent, i); list.to_json(out, output_mode, fn); } diff --git a/src/realm/list.hpp b/src/realm/list.hpp index f0646d2a176..398988e02bc 100644 --- a/src/realm/list.hpp +++ 
b/src/realm/list.hpp @@ -258,6 +258,10 @@ class Lst final : public CollectionBaseImpl { m_tree.reset(new BPlusTree(get_alloc())); const ArrayParent* parent = this; m_tree->set_parent(const_cast(parent), 0); + if constexpr (realm::is_any_v) { + if (m_col_key) + m_tree->set_interner(get_table()->get_string_interner(m_col_key)); + } } Base::update_content_version(); return do_init_from_parent(m_tree.get(), Base::get_collection_ref(), allow_create); diff --git a/src/realm/node.hpp b/src/realm/node.hpp index 8d606b37708..e00ea30691d 100644 --- a/src/realm/node.hpp +++ b/src/realm/node.hpp @@ -346,17 +346,18 @@ class ArrayWriterBase; } /// Base class for all nodes holding user data +class StringInterner; class ArrayPayload { public: virtual ~ArrayPayload(); virtual void init_from_ref(ref_type) noexcept = 0; virtual void set_parent(ArrayParent* parent, size_t ndx_in_parent) noexcept = 0; virtual Mixed get_any(size_t ndx) const = 0; - virtual bool need_spec() const + virtual bool need_string_interner() const { return false; } - virtual void set_spec(Spec*, size_t) const {} + virtual void set_string_interner(StringInterner*) const {} static ref_type typed_write(ref_type ref, _impl::ArrayWriterBase& out, Allocator& alloc); }; diff --git a/src/realm/obj.cpp b/src/realm/obj.cpp index 8d968971fc7..ece4b291d9a 100644 --- a/src/realm/obj.cpp +++ b/src/realm/obj.cpp @@ -239,13 +239,15 @@ bool Obj::compare_list_in_mixed(Lst& val1, Lst& val2, ColKey ck, O auto m1 = val1.get_any(i); auto m2 = val2.get_any(i); + auto other_table = other.get_table(); + auto other_col_key = other_table->get_column_key(col_name); if (m1.is_type(type_List) && m2.is_type(type_List)) { - DummyParent parent(other.get_table(), m2.get_ref()); + DummyParent parent(other_table, m2.get_ref(), other_col_key); Lst list(parent, 0); return compare_list_in_mixed(*val1.get_list(i), list, ck, other, col_name); } else if (m1.is_type(type_Dictionary) && m2.is_type(type_Dictionary)) { - DummyParent 
parent(other.get_table(), m2.get_ref()); + DummyParent parent(other_table, m2.get_ref(), other_col_key); Dictionary dict(parent, 0); return compare_dict_in_mixed(*val1.get_dictionary(i), dict, ck, other, col_name); } @@ -268,13 +270,15 @@ bool Obj::compare_dict_in_mixed(Dictionary& val1, Dictionary& val2, ColKey ck, O if (k1 != k2) return false; + auto other_table = other.get_table(); + auto other_col_key = other_table->get_column_key(col_name); if (m1.is_type(type_List) && m2.is_type(type_List)) { - DummyParent parent(other.get_table(), m2.get_ref()); + DummyParent parent(other_table, m2.get_ref(), other_col_key); Lst list(parent, 0); return compare_list_in_mixed(*val1.get_list(k1.get_string()), list, ck, other, col_name); } else if (m1.is_type(type_Dictionary) && m2.is_type(type_Dictionary)) { - DummyParent parent(other.get_table(), m2.get_ref()); + DummyParent parent(other_table, m2.get_ref(), other_col_key); Dictionary dict(parent, 0); return compare_dict_in_mixed(*val1.get_dictionary(k1.get_string()), dict, ck, other, col_name); } @@ -495,6 +499,7 @@ Mixed Obj::get_unfiltered_mixed(ColKey::Idx col_ndx) const ArrayMixed values(get_alloc()); ref_type ref = to_ref(Array::get(m_mem.get_addr(), col_ndx.val + 1)); values.init_from_ref(ref); + values.set_string_interner(m_table->get_string_interner(col_ndx)); return values.get(m_row_ndx); } @@ -603,18 +608,11 @@ StringData Obj::_get(ColKey::Idx col_ndx) const } ref_type ref = to_ref(Array::get(m_mem.get_addr(), col_ndx.val + 1)); - auto spec_ndx = m_table->leaf_ndx2spec_ndx(col_ndx); - auto& spec = get_spec(); - if (spec.is_string_enum_type(spec_ndx)) { - ArrayString values(get_alloc()); - values.set_spec(const_cast(&spec), spec_ndx); - values.init_from_ref(ref); - - return values.get(m_row_ndx); - } - else { - return ArrayString::get(alloc.translate(ref), m_row_ndx, alloc); - } + ArrayString values(get_alloc()); + auto col_key = m_table->leaf_ndx2colkey(col_ndx); + 
values.set_string_interner(m_table->get_string_interner(col_key)); + values.init_from_ref(ref); + return values.get(m_row_ndx); } template <> @@ -631,6 +629,36 @@ BinaryData Obj::_get(ColKey::Idx col_ndx) const return ArrayBinary::get(alloc.translate(ref), m_row_ndx, alloc); } +std::optional Obj::get_string_id(ColKey col_key) const +{ + // we return a string id only if the property is string or mixed. + // And it got compressed. + + // only strings and mixed can have an interner + if (col_key.get_type() != col_type_String && col_key.get_type() != col_type_Mixed) + return {}; + + m_table->check_column(col_key); + _update_if_needed(); + + const auto col_ndx = col_key.get_index(); + const auto interner = m_table->get_string_interner(col_ndx); + ref_type ref = to_ref(Array::get(m_mem.get_addr(), col_ndx.val + 1)); + + if (col_key.get_type() == col_type_Mixed) { + // mixed handling. Only strings in mixed may have a string id + ArrayMixed values(get_alloc()); + values.set_string_interner(interner); + values.init_from_ref(ref); + return values.get_string_id(m_row_ndx); + } + // must be string. + ArrayString values(get_alloc()); + values.set_string_interner(interner); + values.init_from_ref(ref); + return values.get_string_id(m_row_ndx); +} + Mixed Obj::get_any(ColKey col_key) const { m_table->check_column(col_key); @@ -738,9 +766,11 @@ inline bool Obj::do_is_null(ColKey::Idx col_ndx) const template <> inline bool Obj::do_is_null(ColKey::Idx col_ndx) const { + REALM_ASSERT(false); // Don't come here, you're falling from a cliff.... 
ArrayString values(get_alloc()); ref_type ref = to_ref(Array::get(m_mem.get_addr(), col_ndx.val + 1)); - values.set_spec(const_cast(&get_spec()), m_table->leaf_ndx2spec_ndx(col_ndx)); + // TODO: Set string interner if needed + // values.set_string_interner(m_table->get_string_interner(col_key)); values.init_from_ref(ref); return values.is_null(m_row_ndx); } @@ -765,8 +795,15 @@ bool Obj::is_null(ColKey col_key) const return do_is_null(col_ndx); case col_type_Double: return do_is_null(col_ndx); - case col_type_String: - return do_is_null(col_ndx); + case col_type_String: { + ArrayString values(get_alloc()); + ref_type ref = to_ref(Array::get(m_mem.get_addr(), col_ndx.val + 1)); + // TODO: Set string interner if needed + values.set_string_interner(m_table->get_string_interner(col_key)); + values.init_from_ref(ref); + return values.is_null(m_row_ndx); + } + // return do_is_null(col_ndx); case col_type_Binary: return do_is_null(col_ndx); case col_type_Mixed: @@ -1153,6 +1190,22 @@ REALM_FORCEINLINE void Obj::sync(Node& arr) } } +// helper functions for filtering out calls to set_string_interner() +template +inline void Obj::set_string_interner(T&, ColKey) +{ +} +template <> +inline void Obj::set_string_interner(ArrayString& values, ColKey col_key) +{ + values.set_string_interner(m_table->get_string_interner(col_key)); +} +template <> +inline void Obj::set_string_interner(ArrayMixed& values, ColKey col_key) +{ + values.set_string_interner(m_table->get_string_interner(col_key)); +} + template <> Obj& Obj::set(ColKey col_key, Mixed value, bool is_default) { @@ -1210,6 +1263,7 @@ Obj& Obj::set(ColKey col_key, Mixed value, bool is_default) REALM_ASSERT(col_ndx.val + 1 < fields.size()); ArrayMixed values(alloc); values.set_parent(&fields, col_ndx.val + 1); + set_string_interner(values, col_key); values.init_from_parent(); values.set(m_row_ndx, value); if (value.is_type(type_Dictionary, type_List)) { @@ -1354,6 +1408,7 @@ Obj& Obj::add_int(ColKey col_key, int64_t value) if 
(col_key.get_type() == col_type_Mixed) { ArrayMixed values(alloc); values.set_parent(&fields, col_ndx.val + 1); + set_string_interner(values, col_key); values.init_from_parent(); Mixed old = values.get(m_row_ndx); if (old.is_type(type_Int)) { @@ -1588,19 +1643,6 @@ inline void check_range(const BinaryData& val) } } // namespace -// helper functions for filtering out calls to set_spec() -template -inline void Obj::set_spec(T&, ColKey) -{ -} -template <> -inline void Obj::set_spec(ArrayString& values, ColKey col_key) -{ - size_t spec_ndx = m_table->colkey2spec_ndx(col_key); - Spec* spec = const_cast(&get_spec()); - values.set_spec(spec, spec_ndx); -} - #if REALM_ENABLE_GEOSPATIAL template <> @@ -1684,7 +1726,7 @@ Obj& Obj::set(ColKey col_key, T value, bool is_default) using LeafType = typename ColumnTypeTraits::cluster_leaf_type; LeafType values(alloc); values.set_parent(&fields, col_ndx.val + 1); - set_spec(values, col_key); + set_string_interner(values, col_key); values.init_from_parent(); values.set(m_row_ndx, value); @@ -2287,7 +2329,6 @@ template <> inline void Obj::do_set_null(ColKey col_key) { ColKey::Idx col_ndx = col_key.get_index(); - size_t spec_ndx = m_table->leaf_ndx2spec_ndx(col_ndx); Allocator& alloc = get_alloc(); alloc.bump_content_version(); Array fallback(alloc); @@ -2295,7 +2336,7 @@ inline void Obj::do_set_null(ColKey col_key) ArrayString values(alloc); values.set_parent(&fields, col_ndx.val + 1); - values.set_spec(const_cast(&get_spec()), spec_ndx); + values.set_string_interner(m_table->get_string_interner(col_key)); values.init_from_parent(); values.set_null(m_row_ndx); diff --git a/src/realm/obj.hpp b/src/realm/obj.hpp index 67c82a0cada..bb320f02c9f 100644 --- a/src/realm/obj.hpp +++ b/src/realm/obj.hpp @@ -117,6 +117,11 @@ class Obj { template U get(ColKey col_key) const; + std::optional get_string_id(ColKey) const; + std::optional get_string_id(StringData col_name) const + { + return get_string_id(get_column_key(col_name)); + } Mixed 
get_any(ColKey col_key) const; Mixed get_any(StringData col_name) const { @@ -391,7 +396,7 @@ class Obj { bool remove_one_backlink(ColKey backlink_col, ObjKey origin_key); void nullify_link(ColKey origin_col, ObjLink target_key) &&; template - inline void set_spec(T&, ColKey); + inline void set_string_interner(T&, ColKey); template inline void nullify_single_link(ColKey col, ValueType target); diff --git a/src/realm/path.hpp b/src/realm/path.hpp index 6124590271c..0dfb83f96cc 100644 --- a/src/realm/path.hpp +++ b/src/realm/path.hpp @@ -256,6 +256,10 @@ class ExtendedColumnKey { ObjKey get_link_target(const Obj& obj) const; Mixed get_value(const Obj& obj) const; + // get String ID for the obj, it makes sense to call this method only if the col_key type is either Mixed or + // String. + std::optional get_string_id(const Obj& obj) const; + private: ColKey m_colkey; PathElement m_index; diff --git a/src/realm/query_engine.cpp b/src/realm/query_engine.cpp index 6af55085725..03cec8674ad 100644 --- a/src/realm/query_engine.cpp +++ b/src/realm/query_engine.cpp @@ -272,10 +272,7 @@ void StringNodeEqualBase::init(bool will_query_ranges) StringNodeBase::init(will_query_ranges); const bool uses_index = has_search_index(); - if (m_is_string_enum) { - m_dT = 1.0; - } - else if (uses_index) { + if (uses_index) { m_dT = 0.0; } else { @@ -456,7 +453,7 @@ bool StringNode::do_consume_condition(ParentNode& node) size_t StringNode::_find_first_local(size_t start, size_t end) { if (m_needles.empty()) { - return m_leaf->find_first(m_string_value, start, end); + return m_leaf->find_first(m_string_value, start, end, m_interned_string_id); } else { if (end == npos) @@ -508,7 +505,8 @@ size_t StringNode::_find_first_local(size_t start, size_t end) } StringNodeFulltext::StringNodeFulltext(StringData v, ColKey column, std::unique_ptr lm) - : StringNodeEqualBase(v, column) + : m_value(v) + , m_col(column) , m_link_map(std::move(lm)) { if (!m_link_map) @@ -517,22 +515,25 @@ 
StringNodeFulltext::StringNodeFulltext(StringData v, ColKey column, std::unique_ void StringNodeFulltext::table_changed() { - StringNodeEqualBase::table_changed(); m_link_map->set_base_table(m_table); } StringNodeFulltext::StringNodeFulltext(const StringNodeFulltext& other) - : StringNodeEqualBase(other) + : ParentNode(other) + , m_value(other.m_value) + , m_col(other.m_col) + , m_link_map(std::make_unique(*other.m_link_map)) { - m_link_map = std::make_unique(*other.m_link_map); } -void StringNodeFulltext::_search_index_init() +void StringNodeFulltext::init(bool will_query_ranges) { - StringIndex* index = m_link_map->get_target_table()->get_string_index(ParentNode::m_condition_column_key); + ParentNode::init(will_query_ranges); + + StringIndex* index = m_link_map->get_target_table()->get_string_index(m_col); REALM_ASSERT(index && index->is_fulltext_index()); m_index_matches.clear(); - index->find_all_fulltext(m_index_matches, StringNodeBase::m_string_value); + index->find_all_fulltext(m_index_matches, m_value); // If links exists, use backlinks to find the original objects if (m_link_map->links_exist()) { @@ -545,7 +546,7 @@ void StringNodeFulltext::_search_index_init() } m_index_evaluator = IndexEvaluator{}; - m_index_evaluator->init(&m_index_matches); + m_index_evaluator.init(&m_index_matches); } std::unique_ptr TwoColumnsNodeBase::update_cached_leaf_pointers_for_column(Allocator& alloc, diff --git a/src/realm/query_engine.hpp b/src/realm/query_engine.hpp index 94cc9612a48..a2d81a96738 100644 --- a/src/realm/query_engine.hpp +++ b/src/realm/query_engine.hpp @@ -151,6 +151,8 @@ class ParentNode { { m_dD = 100.0; + if (m_condition_column_key) + m_table->check_column(m_condition_column_key); if (m_child) m_child->init(will_query_ranges); } @@ -1649,7 +1651,7 @@ class StringNodeBase : public ParentNode { void table_changed() override { - m_is_string_enum = m_table.unchecked_ptr()->is_enumerated(m_condition_column_key); + m_string_interner = 
m_table.unchecked_ptr()->get_string_interner(m_condition_column_key); } void cluster_changed() override @@ -1667,6 +1669,7 @@ class StringNodeBase : public ParentNode { m_end_s = 0; m_leaf_start = 0; m_leaf_end = 0; + m_interned_string_id = m_string_interner->lookup(m_value); } virtual void clear_leaf_state() @@ -1678,7 +1681,8 @@ class StringNodeBase : public ParentNode { : ParentNode(from) , m_value(from.m_value) , m_string_value(m_value) - , m_is_string_enum(from.m_is_string_enum) + , m_string_interner(from.m_string_interner) + , m_interned_string_id(from.m_interned_string_id) { } @@ -1693,8 +1697,8 @@ class StringNodeBase : public ParentNode { std::optional m_value; std::optional m_leaf; StringData m_string_value; - - bool m_is_string_enum = false; + StringInterner* m_string_interner = nullptr; + std::optional m_interned_string_id; size_t m_end_s = 0; size_t m_leaf_start = 0; @@ -1711,7 +1715,7 @@ template class StringNode : public StringNodeBase { public: constexpr static bool case_sensitive_comparison = - is_any_v; + is_any_v; StringNode(StringData v, ColKey column) : StringNodeBase(v, column) { @@ -1740,16 +1744,31 @@ class StringNode : public StringNodeBase { TConditionFunction cond; for (size_t s = start; s < end; ++s) { - StringData t = get_string(s); + // special handling for !=, <, <= , >, >= if the leaf is compressed and we have got a compressed string + // id. 
+ if constexpr (realm::is_any_v) { + if (m_leaf->is_compressed()) { + if (m_interned_string_id) { + const auto id = m_leaf->get_string_id(s); + if (cond(m_string_interner->compare(*id, *m_interned_string_id), 0)) + return s; + else + continue; + } + } + } + + StringData t = get_string(s); if constexpr (case_sensitive_comparison) { // case insensitive not implemented for: >, >=, <, <= if (cond(t, m_string_value)) return s; } else { - if (cond(m_string_value, m_ucase.c_str(), m_lcase.c_str(), t)) + if (cond(m_string_value, m_ucase.c_str(), m_lcase.c_str(), t)) { return s; + } } } return not_found; @@ -2069,20 +2088,24 @@ class StringNode : public StringNodeEqualBase { size_t _find_first_local(size_t start, size_t end) override; }; - -class StringNodeFulltext : public StringNodeEqualBase { +class StringNodeFulltext : public ParentNode { public: StringNodeFulltext(StringData v, ColKey column, std::unique_ptr lm = {}); void table_changed() override; - void _search_index_init() override; + void init(bool will_query_ranges) override; bool has_search_index() const override { return true; // it's a required precondition for fulltext queries } + const IndexEvaluator* index_based_keys() override + { + return &m_index_evaluator; + } + std::unique_ptr clone() const override { return std::unique_ptr(new StringNodeFulltext(*this)); @@ -2094,13 +2117,16 @@ class StringNodeFulltext : public StringNodeEqualBase { } private: - std::vector m_index_matches; + std::string m_value; + ColKey m_col; std::unique_ptr m_link_map; + IndexEvaluator m_index_evaluator; + std::vector m_index_matches; StringNodeFulltext(const StringNodeFulltext&); - size_t _find_first_local(size_t, size_t) override + size_t find_first_local(size_t start, size_t end) override { - REALM_UNREACHABLE(); + return m_index_evaluator.do_search_index(m_cluster, start, end); } }; diff --git a/src/realm/query_expression.hpp b/src/realm/query_expression.hpp index a7ffcc155f1..df6e1a2e873 100644 --- 
a/src/realm/query_expression.hpp +++ b/src/realm/query_expression.hpp @@ -1977,7 +1977,7 @@ class SimpleQuerySupport : public ObjPropertyExpr { return TypeOfValueOperator(this->clone()); } -private: +protected: using ObjPropertyExpr::m_link_map; using ObjPropertyExpr::m_column_key; @@ -2053,8 +2053,10 @@ class Columns : public SimpleQuerySupport { void set_base_table(ConstTableRef table) override { SimpleQuerySupport::set_base_table(table); - m_ctrl.alloc = &get_link_map().get_target_table()->get_alloc(); + auto target_table = get_link_map().get_target_table(); + m_ctrl.alloc = &target_table->get_alloc(); m_ctrl.group = table->get_parent_group(); + m_ctrl.interner = target_table->get_string_interner(m_column_key); } void evaluate(Subexpr::Index& index, ValueBase& destination) override @@ -2626,12 +2628,12 @@ class SizeOperator : public Subexpr2 { destination.set(i, int64_t(elem.get_string().size())); } else if (elem.is_type(type_List)) { - DummyParent parent(m_expr->get_base_table().cast_away_const(), elem.get_ref()); + DummyParent parent(m_expr->get_base_table().cast_away_const(), elem.get_ref(), ColKey()); Lst list(parent, 0); destination.set(i, int64_t(list.size())); } else if (elem.is_type(type_Dictionary)) { - DummyParent parent(m_expr->get_base_table().cast_away_const(), elem.get_ref()); + DummyParent parent(m_expr->get_base_table().cast_away_const(), elem.get_ref(), ColKey()); Dictionary dict(parent, 0); destination.set(i, int64_t(dict.size())); } @@ -3309,8 +3311,10 @@ class Columns> : public ColumnsCollection { void set_base_table(ConstTableRef table) override { ColumnsCollection::set_base_table(table); - m_ctrl.alloc = &m_link_map.get_target_table()->get_alloc(); + auto target_table = m_link_map.get_target_table(); + m_ctrl.alloc = &target_table->get_alloc(); m_ctrl.group = table->get_parent_group(); + m_ctrl.interner = target_table->get_string_interner(m_column_key); } void evaluate(Subexpr::Index& index, ValueBase& destination) override @@ -3407,8 
+3411,10 @@ class Columns : public ColumnsCollection { void set_base_table(ConstTableRef table) override { ColumnsCollection::set_base_table(table); - m_ctrl.alloc = &m_link_map.get_target_table()->get_alloc(); + auto target_table = m_link_map.get_target_table(); + m_ctrl.alloc = &target_table->get_alloc(); m_ctrl.group = table->get_parent_group(); + m_ctrl.interner = target_table->get_string_interner(m_column_key); } SizeOperator size() override; std::unique_ptr get_element_length() override diff --git a/src/realm/set.hpp b/src/realm/set.hpp index e3d7fac3d60..dd42bfd26d1 100644 --- a/src/realm/set.hpp +++ b/src/realm/set.hpp @@ -532,6 +532,9 @@ UpdateStatus Set::init_from_parent(bool allow_create) const m_tree.reset(new BPlusTree(get_alloc())); const ArrayParent* parent = this; m_tree->set_parent(const_cast(parent), 0); + if constexpr (realm::is_any_v) { + m_tree->set_interner(get_table()->get_string_interner(m_col_key)); + } } return do_init_from_parent(m_tree.get(), Base::get_collection_ref(), allow_create); } diff --git a/src/realm/sort_descriptor.cpp b/src/realm/sort_descriptor.cpp index 4d0e97c2bcb..8e1258be048 100644 --- a/src/realm/sort_descriptor.cpp +++ b/src/realm/sort_descriptor.cpp @@ -23,9 +23,51 @@ #include #include #include +#include using namespace realm; +namespace { + +template +int compare(const T& i, const T& j, const Col& col) +{ + Mixed m_i = i.get_value(); + Mixed m_j = j.get_value(); + + // 1. not compressed + if (!i.compressed && !j.compressed) + return m_i.compare(m_j); + + ColKey ck{col.col_key}; + StringInterner* interner = col.table->get_string_interner(ck); + + // 2. two compressed strings + if (i.compressed && j.compressed) { + return interner->compare((StringID)m_i.get_int(), (StringID)m_j.get_int()); + } + + // 3. one index is a compressed string, and the other one is mixed. 
+ if (i.compressed || j.compressed) { + if (m_i.is_type(type_String)) + return interner->compare(m_i.get_string(), (StringID)m_j.get_int()); + + if (m_j.is_type(type_String)) + return -interner->compare(m_j.get_string(), (StringID)m_i.get_int()); + } + + // 4. compare string vs any other non-string (since value comparison is triggered only if the type matches, we can + // skip fetching the actual values) + if (i.compressed) + m_i = Mixed{""}; + else + m_j = Mixed{""}; + + return m_i.compare(m_j); +} + +} // namespace + ConstTableRef ExtendedColumnKey::get_target_table(const Table* table) const { return (m_colkey.get_type() == col_type_Link) ? table->get_link_target(m_colkey) : ConstTableRef{}; @@ -85,6 +127,14 @@ Mixed ExtendedColumnKey::get_value(const Obj& obj) const return {}; } +std::optional ExtendedColumnKey::get_string_id(const Obj& obj) const +{ + const auto type = m_colkey.get_type(); + if (type != col_type_String && type != col_type_Mixed) + return {}; + return obj.get_string_id(m_colkey); +} + LinkPathPart::LinkPathPart(ColKey col_key, ConstTableRef source) : column_key(col_key) , from(source->get_key()) @@ -419,9 +469,8 @@ bool BaseDescriptor::Sorter::operator()(IndexPair i, IndexPair j, bool total_ord } int c; - if (t == 0) { - c = i.cached_value.compare(j.cached_value); + c = compare(i, j, m_columns[t]); } else { if (m_cache[t - 1].empty()) { @@ -434,20 +483,25 @@ bool BaseDescriptor::Sorter::operator()(IndexPair i, IndexPair j, bool total_ord const auto& obj = m_columns[t].table->get_object(key_i); const auto& col_key = m_columns[t].col_key; - cache_i.value = col_key.get_value(obj); + // store stringID instead of the actual string if possible cache_i.key = key_i; + const std::optional string_id = col_key.get_string_id(obj); + cache_i.compressed = string_id ? true : false; + cache_i.value = cache_i.compressed ? 
static_cast(*string_id) : col_key.get_value(obj); } - Mixed val_i = cache_i.value; if (cache_j.key != key_j) { const auto& obj = m_columns[t].table->get_object(key_j); const auto& col_key = m_columns[t].col_key; - cache_j.value = col_key.get_value(obj); + // store stringID instead of the actual string if possible cache_j.key = key_j; + const std::optional string_id = col_key.get_string_id(obj); + cache_j.compressed = string_id ? true : false; + cache_j.value = cache_j.compressed ? static_cast(*string_id) : col_key.get_value(obj); } - c = val_i.compare(cache_j.value); + c = compare(cache_i, cache_j, m_columns[t]); } // if c is negative i comes before j if (c) { @@ -476,9 +530,10 @@ void BaseDescriptor::Sorter::cache_first_column(IndexPairs& v) continue; } } - const auto obj = col.table->get_object(key); - index.cached_value = ck.get_value(obj); + const std::optional string_id = ck.get_string_id(obj); + index.compressed = string_id ? true : false; + index.cached_value = index.compressed ? 
static_cast(*string_id) : ck.get_value(obj); } } diff --git a/src/realm/sort_descriptor.hpp b/src/realm/sort_descriptor.hpp index 0224ea5de6b..a3f03390b54 100644 --- a/src/realm/sort_descriptor.hpp +++ b/src/realm/sort_descriptor.hpp @@ -66,9 +66,18 @@ class BaseDescriptor { { return index_in_view < other.index_in_view; } + ObjKey get_key() const + { + return key_for_object; + } + Mixed get_value() const + { + return cached_value; + } ObjKey key_for_object; size_t index_in_view; Mixed cached_value; + bool compressed = false; }; class IndexPairs : public std::vector { public: @@ -115,6 +124,16 @@ class BaseDescriptor { struct ObjCache { ObjKey key; Mixed value; + bool compressed = false; + + ObjKey get_key() const + { + return key; + } + Mixed get_value() const + { + return value; + } }; using TableCache = std::vector; mutable std::vector m_cache; diff --git a/src/realm/spec.cpp b/src/realm/spec.cpp index b2746f3c1c2..1a6b4dfefaa 100644 --- a/src/realm/spec.cpp +++ b/src/realm/spec.cpp @@ -51,14 +51,6 @@ void Spec::init(MemRef mem) noexcept m_top.add(0); } - // Enumkeys array is only there when there are StringEnum columns - if (auto ref = m_top.get_as_ref(s_enum_keys_ndx)) { - m_enumkeys.init_from_ref(ref); - } - else { - m_enumkeys.detach(); - } - if (m_top.get_as_ref(s_col_keys_ndx) == 0) { // This is an upgrade - create column key array MemRef mem_ref = Array::create_empty_array(Array::type_Normal, false, m_top.get_alloc()); // Throws @@ -96,14 +88,6 @@ void Spec::update_from_parent() noexcept m_types.update_from_parent(); m_names.update_from_parent(); m_attr.update_from_parent(); - - if (m_top.get_as_ref(s_enum_keys_ndx) != 0) { - m_enumkeys.update_from_parent(); - } - else { - m_enumkeys.detach(); - } - m_keys.update_from_parent(); update_internals(); @@ -115,36 +99,25 @@ MemRef Spec::create_empty_spec(Allocator& alloc) // The 'spec_set' contains the specification (types and names) of // all columns and sub-tables Array spec_set(alloc); - 
_impl::DeepArrayDestroyGuard dg(&spec_set); spec_set.create(Array::type_HasRefs); // Throws - _impl::DeepArrayRefDestroyGuard dg_2(alloc); { // One type for each column bool context_flag = false; MemRef mem = Array::create_empty_array(Array::type_Normal, context_flag, alloc); // Throws - dg_2.reset(mem.get_ref()); - int_fast64_t v(from_ref(mem.get_ref())); - spec_set.add(v); // Throws - dg_2.release(); + spec_set.add(from_ref(mem.get_ref())); // Throws } { size_t size = 0; // One name for each column MemRef mem = ArrayStringShort::create_array(size, alloc); // Throws - dg_2.reset(mem.get_ref()); - int_fast64_t v = from_ref(mem.get_ref()); - spec_set.add(v); // Throws - dg_2.release(); + spec_set.add(from_ref(mem.get_ref())); // Throws } { // One attrib set for each column bool context_flag = false; MemRef mem = Array::create_empty_array(Array::type_Normal, context_flag, alloc); // Throws - dg_2.reset(mem.get_ref()); - int_fast64_t v = from_ref(mem.get_ref()); - spec_set.add(v); // Throws - dg_2.release(); + spec_set.add(from_ref(mem.get_ref())); // Throws } spec_set.add(0); // Nested collections array spec_set.add(0); // Enumkeys array @@ -152,13 +125,9 @@ MemRef Spec::create_empty_spec(Allocator& alloc) // One key for each column bool context_flag = false; MemRef mem = Array::create_empty_array(Array::type_Normal, context_flag, alloc); // Throws - dg_2.reset(mem.get_ref()); - int_fast64_t v = from_ref(mem.get_ref()); - spec_set.add(v); // Throws - dg_2.release(); + spec_set.add(from_ref(mem.get_ref())); // Throws } - dg.release(); return spec_set.get_mem(); } @@ -204,10 +173,6 @@ void Spec::insert_column(size_t column_ndx, ColKey col_key, ColumnType type, Str m_attr.insert(column_ndx, attr); // Throws m_keys.insert(column_ndx, col_key.value); - if (m_enumkeys.is_attached() && type != col_type_BackLink) { - m_enumkeys.insert(column_ndx, 0); - } - update_internals(); } @@ -216,28 +181,6 @@ void Spec::erase_column(size_t column_ndx) REALM_ASSERT(column_ndx < 
m_types.size()); if (ColumnType(int(m_types.get(column_ndx))) != col_type_BackLink) { - if (is_string_enum_type(column_ndx)) { - // Enum columns do also have a separate key list - ref_type keys_ref = m_enumkeys.get_as_ref(column_ndx); - Array::destroy_deep(keys_ref, m_top.get_alloc()); - m_enumkeys.set(column_ndx, 0); - } - - // Remove this column from the enum keys lookup and clean it up if it's now empty - if (m_enumkeys.is_attached()) { - m_enumkeys.erase(column_ndx); // Throws - bool all_empty = true; - for (size_t i = 0; i < m_enumkeys.size(); i++) { - if (m_enumkeys.get(i) != 0) { - all_empty = false; - break; - } - } - if (all_empty) { - m_enumkeys.destroy_deep(); - m_top.set(4, 0); - } - } m_num_public_columns--; m_names.erase(column_ndx); // Throws } @@ -250,34 +193,6 @@ void Spec::erase_column(size_t column_ndx) update_internals(); } -void Spec::upgrade_string_to_enum(size_t column_ndx, ref_type keys_ref) -{ - REALM_ASSERT(get_column_type(column_ndx) == col_type_String); - - // Create the enumkeys list if needed - if (!m_enumkeys.is_attached()) { - m_enumkeys.create(Array::type_HasRefs, false, m_num_public_columns); - m_top.set(4, m_enumkeys.get_ref()); - m_enumkeys.set_parent(&m_top, 4); - } - - // Insert the new key list - m_enumkeys.set(column_ndx, keys_ref); -} - -bool Spec::is_string_enum_type(size_t column_ndx) const noexcept -{ - return m_enumkeys.is_attached() ? 
(m_enumkeys.get(column_ndx) != 0) : false; -} - -ref_type Spec::get_enumkeys_ref(size_t column_ndx, ArrayParent*& keys_parent) noexcept -{ - // We also need to return parent info - keys_parent = &m_enumkeys; - - return m_enumkeys.get_as_ref(column_ndx); -} - namespace { template diff --git a/src/realm/spec.hpp b/src/realm/spec.hpp index c9f3ff0c230..188ea05f20d 100644 --- a/src/realm/spec.hpp +++ b/src/realm/spec.hpp @@ -65,12 +65,6 @@ class Spec { void set_dictionary_key_type(size_t column_ndx, DataType key_type); DataType get_dictionary_key_type(size_t column_ndx) const; - // Auto Enumerated string columns - void upgrade_string_to_enum(size_t column_ndx, ref_type keys_ref); - size_t _get_enumkeys_ndx(size_t column_ndx) const noexcept; - bool is_string_enum_type(size_t column_ndx) const noexcept; - ref_type get_enumkeys_ref(size_t column_ndx, ArrayParent*& keys_parent) noexcept; - //@{ /// Compare two table specs for equality. bool operator==(const Spec&) const noexcept; @@ -92,7 +86,7 @@ class Spec { static constexpr size_t s_names_ndx = 1; static constexpr size_t s_attributes_ndx = 2; static constexpr size_t s_vacant_1 = 3; - static constexpr size_t s_enum_keys_ndx = 4; + // static constexpr size_t s_enum_keys_ndx = 4; static constexpr size_t s_col_keys_ndx = 5; static constexpr size_t s_spec_max_size = 6; @@ -100,8 +94,8 @@ class Spec { Array m_types; // 1st slot in m_top ArrayStringShort m_names; // 2nd slot in m_top Array m_attr; // 3rd slot in m_top - // 4th slot in m_top not cached - Array m_enumkeys; // 5th slot in m_top + // 4th slot in m_top, old subspecs. 
Not used since v6.0.0 + // 5th slot in m_top, old enum keys which was never released Array m_keys; // 6th slot in m_top size_t m_num_public_columns = 0; @@ -151,13 +145,11 @@ inline Spec::Spec(Allocator& alloc) noexcept , m_types(alloc) , m_names(alloc) , m_attr(alloc) - , m_enumkeys(alloc) , m_keys(alloc) { m_types.set_parent(&m_top, s_types_ndx); m_names.set_parent(&m_top, s_names_ndx); m_attr.set_parent(&m_top, s_attributes_ndx); - m_enumkeys.set_parent(&m_top, s_enum_keys_ndx); m_keys.set_parent(&m_top, s_col_keys_ndx); } diff --git a/src/realm/string_compressor.cpp b/src/realm/string_compressor.cpp new file mode 100644 index 00000000000..9f88cc24206 --- /dev/null +++ b/src/realm/string_compressor.cpp @@ -0,0 +1,357 @@ +/************************************************************************* + * + * Copyright 2016 Realm Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + * + **************************************************************************/ + +#include +#include +#include + +#include +namespace realm { + +StringCompressor::StringCompressor(Allocator& alloc, Array& parent, size_t index, bool writable) + : m_data(alloc) +{ + m_compression_map.resize(16); // start with a very small compression map + m_symbols.reserve(65536); + m_data.set_parent(&parent, index); + refresh(writable); +} + +void StringCompressor::refresh(bool writable) +{ + // we assume that compressors are only created from a valid parent. 
+ // String interners in 'dead' mode should never instantiate a string compressor. + if (m_data.get_ref_from_parent() == 0) { + REALM_ASSERT(writable); + m_data.create(0, 65535); + m_data.update_parent(); + } + else { + if (m_data.is_attached()) + m_data.update_from_parent(); + else + m_data.init_from_ref(m_data.get_ref_from_parent()); + } + rebuild_internal(); +} + +static size_t symbol_pair_hash(CompressionSymbol a, CompressionSymbol b) +{ + // range of return value must match size of encoding table + uint32_t tmp = a + 3; + tmp *= b + 7; + return (tmp ^ (tmp >> 16)) & 0xFFFF; +} + +void StringCompressor::add_expansion(SymbolDef def) +{ + // compute expansion size: + size_t exp_size = 0; + if (def.expansion_a < 256) + exp_size = 1; + else + exp_size = m_symbols[def.expansion_a - 256].expansion.size(); + if (def.expansion_b < 256) + exp_size += 1; + else + exp_size += m_symbols[def.expansion_b - 256].expansion.size(); + // make sure there is room in active storage chunk: + if (m_expansion_storage.size() == 0 || m_expansion_storage.back().size() + exp_size + 1 >= storage_chunk_size) { + m_expansion_storage.push_back({}); + m_expansion_storage.back().reserve(storage_chunk_size); + } + // construct expansion at end of chunk: + auto& chunk = m_expansion_storage.back(); + auto start_index = (uint32_t)chunk.size(); + if (def.expansion_a < 256) + chunk.push_back((char)def.expansion_a); + else + chunk.append(m_symbols[def.expansion_a - 256].expansion); + if (def.expansion_b < 256) + chunk.push_back((char)def.expansion_b); + else + chunk.append(m_symbols[def.expansion_b - 256].expansion); + std::string_view expansion(chunk.data() + start_index, exp_size); + m_symbols.push_back({def, expansion, (uint32_t)m_expansion_storage.size() - 1, start_index}); +} + +void StringCompressor::expand_compression_map() +{ + size_t old_size = m_compression_map.size(); + REALM_ASSERT(old_size <= 16384); + size_t new_size = 4 * old_size; + std::vector map(new_size); + for (size_t i = 0; i < 
m_compression_map.size(); ++i) { + auto& entry = m_compression_map[i]; + if (entry.id == 0) + continue; + auto hash = symbol_pair_hash(entry.expansion_a, entry.expansion_b); + auto new_hash = hash & (new_size - 1); + REALM_ASSERT(map[new_hash].id == 0); + map[new_hash] = entry; + } + m_compression_map.swap(map); +} + +void StringCompressor::rebuild_internal() +{ + auto num_symbols = m_data.size(); + if (num_symbols == m_symbols.size()) + return; + if (num_symbols < m_symbols.size()) { + // fewer symbols (likely a rollback) -- remove last ones added + while (num_symbols < m_symbols.size()) { + auto& symbol = m_symbols.back(); + auto hash = symbol_pair_hash(symbol.def.expansion_a, symbol.def.expansion_b); + hash &= m_compression_map.size() - 1; + REALM_ASSERT(m_compression_map[hash].id == symbol.def.id); + m_compression_map[hash] = {0, 0, 0}; + if (symbol.storage_index < m_expansion_storage.size() - 1) { + m_expansion_storage.resize(symbol.storage_index + 1); + } + m_expansion_storage[symbol.storage_index].resize(symbol.storage_offset); + m_symbols.pop_back(); + } + return; + } + // we have new symbols to add + for (size_t i = m_symbols.size(); i < num_symbols; ++i) { + auto pair = m_data.get(i); + SymbolDef def; + def.id = (CompressionSymbol)(i + 256); + def.expansion_a = 0xFFFF & (pair >> 16); + def.expansion_b = 0xFFFF & pair; + auto hash = symbol_pair_hash(def.expansion_a, def.expansion_b); + while (m_compression_map[hash & (m_compression_map.size() - 1)].id) { + expand_compression_map(); + } + // REALM_ASSERT_DEBUG(m_compression_map[hash].id == 0); + m_compression_map[hash & (m_compression_map.size() - 1)] = def; + add_expansion(def); + } +} + +StringCompressor::~StringCompressor() {} + +CompressedString StringCompressor::compress(StringData sd, bool learn) +{ + CompressedString result(sd.size()); + // expand string into array of symbols + const char* d = sd.data(); + const size_t limit = sd.size(); + if (limit == 0) + return {}; + size_t i = 0; + while (i < 
limit) { + result[i++] = 0xFF & *d++; + } + // iteratively compress array of symbols. Each run compresses pairs into single symbols. + // 6 runs give a max compression of 64x - on average it will be much less :-) + constexpr int run_limit = 6; + CompressionSymbol* to; + for (int run = 0; run < run_limit; ++run) { + CompressionSymbol* from = to = result.data(); + CompressionSymbol* limit = from + result.size() - 1; + while (from < limit) { + auto hash = symbol_pair_hash(from[0], from[1]); + hash &= m_compression_map.size() - 1; + auto& def = m_compression_map[hash]; + if (def.id) { + // existing symbol + if (def.expansion_a == from[0] && def.expansion_b == from[1]) { + // matching symbol + *to++ = def.id; + from += 2; + } + else if (m_compression_map.size() < 65536) { + // Conflict: some other symbol is defined here - but we can expand the compression map + // and hope to find room! + expand_compression_map(); + // simply retry: + continue; + } + else { + // also conflict: some other symbol is defined here, we can't compress + *to++ = *from++; + // In a normal hash table we'd have buckets and add a translation + // to a bucket. This is slower generally, but yields better compression. + } + } + else { + // free entry we can use for new symbol (and we're learning) + if (m_symbols.size() < (65536 - 256) && learn) { + // define a new symbol for this entry and use it. 
+ REALM_ASSERT_DEBUG(m_compression_map[hash].id == 0); + REALM_ASSERT_DEBUG(m_symbols.size() == m_data.size()); + REALM_ASSERT_DEBUG(m_data.is_attached()); + CompressionSymbol id = (CompressionSymbol)(256 + m_symbols.size()); + SymbolDef def{id, from[0], from[1]}; + m_compression_map[hash] = def; + add_expansion(def); + m_data.add(((uint64_t)from[0]) << 16 | from[1]); + // std::cerr << id << " = {" << from[0] << ", " << from[1] << "}" << std::endl; + *to++ = id; + from += 2; + } + else { + // no more symbol space, so can't compress + *to++ = *from++; + } + } + } + if (from == limit) { + // copy over trailing symbol + *to++ = *from++; + } + REALM_ASSERT_DEBUG(to > result.data()); + size_t sz = to - result.data(); + REALM_ASSERT_DEBUG(sz <= sd.size()); + result.resize(sz); + if (from == to) // no compression took place in last iteration + break; + } + return result; +} + +std::string StringCompressor::decompress(CompressedStringView& c_str) +{ + CompressionSymbol* ptr = c_str.data; + CompressionSymbol* limit = ptr + c_str.size; + // compute size of decompressed string first to avoid allocations as string grows + size_t result_size = 0; + while (ptr < limit) { + if (*ptr < 256) + result_size += 1; + else + result_size += m_symbols[*ptr - 256].expansion.size(); + ++ptr; + } + std::string result2; + result2.reserve(result_size); + // generate result + ptr = c_str.data; + while (ptr < limit) { + if (*ptr < 256) + result2.push_back((char)*ptr); + else + result2.append(m_symbols[*ptr - 256].expansion); + ptr++; + } +#ifdef REALM_DEBUG + std::string result; + { + auto decompress = [&](CompressionSymbol symbol, auto& decompress) -> void { + if (symbol < 256) { + result.push_back((char)symbol); + } + else { + auto& s = m_symbols[symbol - 256]; + decompress(s.def.expansion_a, decompress); + decompress(s.def.expansion_b, decompress); + } + }; + + CompressionSymbol* ptr = c_str.data; + CompressionSymbol* limit = ptr + c_str.size; + while (ptr < limit) { + decompress(*ptr, 
decompress); + ++ptr; + } + } + REALM_ASSERT_DEBUG(result == result2); +#endif + return result2; +} + +int StringCompressor::compare(CompressedStringView& A, CompressedStringView& B) +{ + auto A_ptr = A.data; + auto A_limit = A_ptr + A.size; + auto B_ptr = B.data; + auto B_limit = B_ptr + B.size; + while (A_ptr < A_limit && B_ptr < B_limit) { + auto code_A = *A_ptr++; + auto code_B = *B_ptr++; + if (code_A == code_B) + continue; + // symbols did not match: + + // 1. both symbols are single characters + if (code_A < 256 && code_B < 256) + return code_A - code_B; + + // 2. all the other possible cases + std::string str_a{(char)code_A, 1}; + std::string str_b{(char)code_B, 1}; + StringData sd_a = code_A < 256 ? str_a : m_symbols[code_A - 256].expansion; + StringData sd_b = code_B < 256 ? str_b : m_symbols[code_B - 256].expansion; + + REALM_ASSERT_DEBUG(sd_a != sd_b); + if (sd_a < sd_b) + return -1; + else + return 1; + } + // The compressed strings are identical or one is the prefix of the other + return static_cast(A.size - B.size); + // ^ a faster way of producing same positive / negative / zero as: + // if (A.size() < B.size()) + // return -1; + // if (A.size() > B.size()) + // return 1; + // return 0; +} + +int StringCompressor::compare(StringData sd, CompressedStringView& B) +{ + auto B_size = B.size; + // make sure comparisons are unsigned, even though StringData does not specify signedness + const unsigned char* A_ptr = reinterpret_cast(sd.data()); + auto A_limit = A_ptr + sd.size(); + for (size_t i = 0; i < B_size; ++i) { + if (A_ptr == A_limit) { + // sd ended first, so B is bigger + return -1; + } + auto code = B.data[i]; + if (code < 256) { + if (code < *A_ptr) + return 1; + if (code > *A_ptr) + return -1; + ++A_ptr; + continue; + } + auto& expansion = m_symbols[code - 256]; + for (size_t disp = 0; disp < expansion.expansion.size(); ++disp) { + uint8_t c = expansion.expansion[disp]; + if (c < *A_ptr) + return 1; + if (c > *A_ptr) + return -1; + ++A_ptr; + } 
+ } + // if sd is longer than B, sd is the biggest string + if (A_ptr < A_limit) + return 1; + return 0; +} + + +} // namespace realm diff --git a/src/realm/string_compressor.hpp b/src/realm/string_compressor.hpp new file mode 100644 index 00000000000..bd10948e25c --- /dev/null +++ b/src/realm/string_compressor.hpp @@ -0,0 +1,97 @@ +/************************************************************************* + * + * Copyright 2016 Realm Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + **************************************************************************/ + +#ifndef REALM_STRING_COMPRESSOR_HPP +#define REALM_STRING_COMPRESSOR_HPP + +#include +#include +#include + +using CompressionSymbol = uint16_t; +using CompressedString = std::vector; + +struct CompressedStringView { + CompressionSymbol* data = 0; + uint32_t size = 0; + CompressedStringView() = default; + CompressedStringView(CompressionSymbol* c_ptr, size_t s) + : data(c_ptr) + , size(uint32_t(s)) + { + } + explicit CompressedStringView(CompressedString& cs) + : data(cs.data()) + , size(uint32_t(cs.size())) + { + } + bool operator==(CompressedStringView& other) + { + if (size != other.size) + return false; + for (size_t i = 0; i < size; ++i) { + if (data[i] != other.data[i]) + return false; + } + return true; + } +}; + +namespace realm { +class StringCompressor { +public: + StringCompressor(Allocator& alloc, Array& parent, size_t index, bool writable); + void refresh(bool writable); + ~StringCompressor(); + + int compare(CompressedStringView& A, CompressedStringView& B); + int compare(StringData sd, CompressedStringView& B); + + CompressedString compress(StringData, bool learn); + std::string decompress(CompressedStringView& c_str); + +private: + struct SymbolDef { + CompressionSymbol id = 0; + CompressionSymbol expansion_a = 0; + CompressionSymbol expansion_b = 0; + }; + + struct ExpandedSymbolDef { + SymbolDef def; + std::string_view expansion; + // ^ points into storage managed by m_expansion_storage + // we need the following 2 values to facilitate rollback of allocated storage + uint32_t storage_index; // index into m_expansion_storage + uint32_t storage_offset; // offset into block. 
+ }; + + void rebuild_internal(); + void expand_compression_map(); + void add_expansion(SymbolDef def); + std::vector m_symbols; // map from symbol -> symbolpair, 2 elements pr entry + std::vector m_compression_map; // perfect hash from symbolpair to its symbol + + ArrayUnsigned m_data; + constexpr static size_t storage_chunk_size = 4096; + std::vector m_expansion_storage; +}; + +} // namespace realm + +#endif diff --git a/src/realm/string_data.hpp b/src/realm/string_data.hpp index 46e1df0713d..63578350e77 100644 --- a/src/realm/string_data.hpp +++ b/src/realm/string_data.hpp @@ -34,6 +34,11 @@ namespace realm { +// Compressed strings have unique IDs, this defines a global alias +// for this. A StringID is an entry inside an array of N compressed strings. +// 0 means null, all the other ids [1, N-1] represent a valid string. +using StringID = size_t; + /// Selects CityHash64 on 64-bit platforms, and Murmur2 on 32-bit platforms. /// This is what libc++ does, and it is a good general choice for a /// non-cryptographic hash function (suitable for std::unordered_map etc.). diff --git a/src/realm/string_interner.cpp b/src/realm/string_interner.cpp new file mode 100644 index 00000000000..47d21e506d8 --- /dev/null +++ b/src/realm/string_interner.cpp @@ -0,0 +1,693 @@ +/************************************************************************* + * + * Copyright 2016 Realm Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + **************************************************************************/ + +#include +#include +#include +#include + +namespace realm { + +// Fast mapping of strings (or rather hash of strings) to string IDs. +// +// We use a tree where: +// * All interior nodes are radix nodes with a fan-out of 256. +// * Leaf nodes with up to 16 entries are just lists, searched linearly +// * Leaf nodes with more than 16 entries and less than 1K are hash tables. +// Hash tables use linear search starting from the entry found by hashing. +// +constexpr static size_t linear_search_limit = 16; +constexpr static size_t hash_node_min_size = 32; +constexpr static size_t hash_node_max_size = 1024; +constexpr static size_t radix_node_consumes_bits = 8; +constexpr static size_t radix_node_size = 1ULL << radix_node_consumes_bits; + +// helpers +struct HashMapIter { + Array& m_array; + uint32_t hash_filter; + uint16_t index; + uint16_t left_to_search; + uint8_t hash_size; + HashMapIter(Array& array, uint32_t hash, uint8_t hash_size) + : m_array(array) + , hash_filter(hash) + , hash_size(hash_size) + { + set_index(0); + } + HashMapIter(Array& dummy) + : m_array(dummy) + { + left_to_search = 0; + } + inline uint32_t get() + { + return (uint32_t)(m_array.get(index) >> hash_size); + } + inline bool empty() + { + auto element = m_array.get(index); + return (element >> hash_size) == 0; + } + inline void set(uint64_t element) + { + m_array.set(index, element); + } + inline bool matches() + { + auto mask = 0xFFFFFFFFUL >> (32 - hash_size); + auto element = m_array.get(index); + return ((element & mask) == hash_filter) && (element >> hash_size); + } + inline bool is_valid() + { + return left_to_search != 0; + } + inline void set_index(size_t i, size_t search_limit = linear_search_limit) + { + index = (uint16_t)i; + left_to_search = (uint16_t)std::min(m_array.size(), (size_t)search_limit); + } + void operator++() + { + if (is_valid()) { + left_to_search--; + index++; + if (index == 
m_array.size()) {
+                index = 0;
+            }
+        }
+    }
+};
+
+// Attempt to build a hash leaf from a smaller hash leaf or a non-hash leaf.
+//
+// Every non-empty entry of 'from' (id in the bits above 'hash_size', hash in the
+// bits below) is placed in the first free slot of 'to', probing linearly from the
+// slot selected by the low bits of the entry. The size of 'to' must be a power of two.
+// Returns false if an entry finds no free slot within the search range. 'to' is then
+// only partially filled and the caller has to retry with a larger table.
+static bool rehash(Array& from, Array& to, uint8_t hash_size)
+{
+    REALM_ASSERT_DEBUG(from.size() * 2 <= to.size());
+
+    for (size_t i = 0; i < from.size(); ++i) {
+        // Keep all 64 bits: the id is stored above the hash bits, so narrowing the
+        // entry to size_t would drop it on targets where size_t is 32 bits wide.
+        const uint64_t entry = static_cast<uint64_t>(from.get(i));
+        if ((entry >> hash_size) == 0)
+            continue;
+        const size_t starting_index = static_cast<size_t>(entry & (to.size() - 1));
+        HashMapIter it(to, 0, hash_size);
+        it.set_index(starting_index);
+        while (it.is_valid() && !it.empty()) {
+            ++it;
+        }
+        if (!it.is_valid()) {
+            // abort rehashing, we need a larger to-space
+            return false;
+        }
+        REALM_ASSERT(it.empty());
+        it.set(entry);
+    }
+    return true;
+}
+
+// Add a binding from hash value to id.
+//
+// 'node' is the root of a (sub)tree, 'hash' holds the 'hash_size' hash bits that
+// have not yet been consumed by radix nodes above 'node', and 'id' is the string id.
+// A leaf grows from a plain list to a hash table and is finally replaced by a radix
+// node, at which point the binding is inserted into one of its subtrees.
+static void add_to_hash_map(Array& node, uint64_t hash, uint64_t id, uint8_t hash_size)
+{
+    REALM_ASSERT(node.is_attached());
+    if (!node.has_refs()) {
+        // it's a leaf.
+        if (node.size() < linear_search_limit) {
+            // it's a list with room to grow
+            node.add(((uint64_t)id << hash_size) | hash);
+            return;
+        }
+        if (node.size() == linear_search_limit) {
+            // it's a full list, must be converted to a hash table
+            Array new_node(node.get_alloc());
+            new_node.create(NodeHeader::type_Normal, false, hash_node_min_size, 0);
+            new_node.set_parent(node.get_parent(), node.get_ndx_in_parent());
+            new_node.update_parent();
+            // transform existing list into hash table
+            rehash(node, new_node, hash_size);
+            node.destroy();
+            node.init_from_parent();
+        }
+        // it's a hash table. Grow if needed up till 'hash_node_max_size' entries
+        while (node.size() < hash_node_max_size) {
+            auto size = node.size();
+            size_t start_index = hash & (size - 1);
+            HashMapIter it(node, 0, hash_size);
+            it.set_index(start_index);
+            while (it.is_valid() && !it.empty()) {
+                ++it;
+            }
+            if (it.is_valid()) {
+                // found an empty spot within search range
+                it.set(((uint64_t)id << hash_size) | hash);
+                return;
+            }
+            if (node.size() >= hash_node_max_size)
+                break;
+            // No free spot found - rehash into bigger and bigger tables
+            auto new_size = node.size();
+            bool need_to_rehash = true;
+            Array new_node(node.get_alloc());
+            while (need_to_rehash && new_size < hash_node_max_size) {
+                new_size *= 2;
+                new_node.create(NodeHeader::type_Normal, false, new_size, 0);
+                need_to_rehash = !rehash(node, new_node, hash_size);
+                if (need_to_rehash) { // we failed, try again - or shift to radix
+                    // I find it counter-intuitive. But it CAN happen.
+                    new_node.destroy();
+                }
+            }
+            if (need_to_rehash)
+                break;
+            new_node.set_parent(node.get_parent(), node.get_ndx_in_parent());
+            new_node.update_parent();
+            node.destroy();
+            node.init_from_parent();
+        }
+        // we ran out of space. Rewrite as a radix node with subtrees
+        Array new_node(node.get_alloc());
+        new_node.create(NodeHeader::type_HasRefs, false, radix_node_size, 0);
+        new_node.set_parent(node.get_parent(), node.get_ndx_in_parent());
+        new_node.update_parent();
+        // Mask selecting the 'hash_size' low bits. Computed in 64 bits so that the
+        // shift count never reaches the width of the operand.
+        const uint64_t hash_mask = (uint64_t(1) << hash_size) - 1;
+        for (size_t index = 0; index < node.size(); ++index) {
+            const uint64_t element = static_cast<uint64_t>(node.get(index));
+            const uint64_t string_id = element >> hash_size;
+            if (string_id == 0)
+                continue;
+            // Re-insert with the hash and hash size of *this* level. 'new_node' is a
+            // radix node, so the call picks the subtree from the low bits and strips
+            // them before recursing, which is the path hash_to_id() follows on lookup.
+            // Stripping the bits here as well would file the entry in the wrong
+            // subtree and with the wrong hash size.
+            add_to_hash_map(new_node, element & hash_mask, string_id, hash_size);
+        }
+        node.destroy();
+        node.init_from_parent();
+    }
+    // We have a radix node and need to insert the new binding into the proper subtree
+    size_t index = hash & (radix_node_size - 1);
+    auto rot = node.get_as_ref_or_tagged(index);
+    REALM_ASSERT(!rot.is_tagged());
+    Array subtree(node.get_alloc());
+    if (rot.get_as_ref() == 0) {
+        // no subtree present, create an empty one
+        subtree.set_parent(&node, index);
+        subtree.create(NodeHeader::type_Normal);
+        subtree.update_parent();
+    }
+    else {
+        // subtree already present
+        subtree.set_parent(&node, index);
+        subtree.init_from_parent();
+    }
+    // recurse into subtree
+    add_to_hash_map(subtree, hash >> radix_node_consumes_bits, id, hash_size - radix_node_consumes_bits);
+}
+
+// Collect the ids of all entries in the tree rooted at 'node' whose stored hash bits
+// equal 'hash'. Several ids can be returned; the caller must compare the strings.
+static std::vector<uint32_t> hash_to_id(Array& node, uint32_t hash, uint8_t hash_size)
+{
+    std::vector<uint32_t> result;
+    REALM_ASSERT(node.is_attached());
+    if (!node.has_refs()) {
+        // it's a leaf - default is a list, search starts from index 0.
+        HashMapIter it(node, hash, hash_size);
+        if (node.size() >= hash_node_min_size) {
+            // it is a hash table, so use hash to select index to start searching
+            // table size must be power of two!
+            size_t index = hash & (node.size() - 1);
+            it.set_index(index);
+        }
+        // collect all matching values within allowed range
+        while (it.is_valid()) {
+            if (it.matches()) {
+                result.push_back(it.get());
+            }
+            ++it;
+        }
+        return result;
+    }
+    else {
+        // it's a radix node
+        size_t index = hash & (node.size() - 1);
+        auto rot = node.get_as_ref_or_tagged(index);
+        REALM_ASSERT(rot.is_ref());
+        if (rot.get_as_ref() == 0) {
+            // no subtree, return empty vector
+            return result;
+        }
+        // descend into subtree
+        Array subtree(node.get_alloc());
+        subtree.set_parent(&node, index);
+        subtree.init_from_parent();
+        return hash_to_id(subtree, hash >> radix_node_consumes_bits, hash_size - radix_node_consumes_bits);
+    }
+}
+
+
+// Slots of the interner's top array.
+enum positions { Pos_Version, Pos_ColKey, Pos_Size, Pos_Compressor, Pos_Data, Pos_Map, Top_Size };
+
+// In-memory mirror of one entry of m_data: the ref of the leaf plus, once loaded,
+// one view per compressed string stored in that leaf.
+struct StringInterner::DataLeaf {
+    ref_type m_leaf_ref = 0;
+    std::vector<CompressedStringView> m_compressed;
+    std::atomic<bool> m_is_loaded = false;
+    DataLeaf() {}
+    DataLeaf(ref_type ref)
+        : m_leaf_ref(ref)
+    {
+    }
+    // std::atomic is neither copyable nor movable, so the move has to be spelled out.
+    // Taking a non-const rvalue lets the vector of views be moved instead of copied.
+    DataLeaf(DataLeaf&& other) noexcept
+        : m_leaf_ref(other.m_leaf_ref)
+        , m_compressed(std::move(other.m_compressed))
+        , m_is_loaded(other.m_is_loaded.load(std::memory_order_acquire))
+    {
+    }
+};
+
+// Bind the interner for column 'col_key' to the slot of 'parent' selected by the
+// column index, then bring all in-memory state in line with the underlying data.
+StringInterner::StringInterner(Allocator& alloc, Array& parent, ColKey col_key, bool writable)
+    : m_parent(parent)
+    , m_top(alloc)
+    , m_data(alloc)
+    , m_hash_map(alloc)
+    , m_current_string_leaf(alloc)
+    , m_current_long_string_node(alloc)
+{
+    REALM_ASSERT_DEBUG(col_key != ColKey());
+    size_t index = col_key.get_index().val;
+    // ensure that m_top and m_data is well defined and reflect any existing data
+    // We'll have to extend this to handle no defined backing
+    m_top.set_parent(&parent, index);
+    m_data.set_parent(&m_top, Pos_Data);
+    m_hash_map.set_parent(&m_top, Pos_Map);
+    m_col_key = col_key;
+    update_from_parent(writable);
+}
+
+// Re-attach all accessors to the data found through the parent. If there is no data
+// and 'writable' is set, an empty structure is created. If there is no data and it
+// may not be created, the interner is detached ("dead" mode).
+void StringInterner::update_from_parent(bool writable)
+{
+    auto parent_idx = m_top.get_ndx_in_parent();
+    bool valid_top_ref_spot = m_parent.is_attached() && parent_idx < m_parent.size();
+    bool valid_top = valid_top_ref_spot && m_parent.get_as_ref(parent_idx);
+    if (valid_top) {
+        m_top.update_from_parent();
+        m_data.update_from_parent();
+        m_hash_map.update_from_parent();
+    }
+    else if (writable && valid_top_ref_spot) {
+        m_top.create(NodeHeader::type_HasRefs, false, Top_Size, 0);
+        m_top.set(Pos_Version, (1 << 1) + 1); // version number 1.
+        m_top.set(Pos_Size, (0 << 1) + 1);    // total size 0
+        m_top.set(Pos_ColKey, (m_col_key.value << 1) + 1);
+        m_top.set(Pos_Compressor, 0);
+
+        // create first level of data tree here (to simplify other stuff)
+        m_data.create(NodeHeader::type_HasRefs, false, 0);
+        m_data.update_parent();
+
+        m_hash_map.create(NodeHeader::type_Normal);
+        m_hash_map.update_parent();
+        m_top.update_parent();
+        valid_top = true;
+    }
+    if (!valid_top) {
+        // We're lacking part of underlying data and not allowed to create it, so enter "dead" mode
+        m_compressor.reset();
+        m_compressed_leafs.clear();
+        m_top.detach();
+        m_data.detach();
+        m_hash_map.detach();
+        return;
+    }
+    // validate we're accessing data for the correct column. A combination of column erase
+    // and insert could lead to an interner being paired with wrong data in the file.
+    // If so, we clear internal data forcing rebuild_internal() to rebuild from scratch.
+    int64_t data_colkey = m_top.get_as_ref_or_tagged(Pos_ColKey).get_as_int();
+    if (m_col_key.value != data_colkey) {
+        // new column, new data
+        m_compressor.reset();
+        m_decompressed_strings.clear();
+    }
+    if (!m_compressor)
+        m_compressor = std::make_unique<StringCompressor>(m_top.get_alloc(), m_top, Pos_Compressor, writable);
+    else
+        m_compressor->refresh(writable);
+    if (m_data.size()) {
+        auto ref_to_write_buffer = m_data.get_as_ref(m_data.size() - 1);
+        const char* header = m_top.get_alloc().translate(ref_to_write_buffer);
+        bool is_array_of_cprs = NodeHeader::get_hasrefs_from_header(header);
+        if (is_array_of_cprs) {
+            m_current_long_string_node.set_parent(&m_data, m_data.size() - 1);
+            m_current_long_string_node.update_from_parent();
+            // A leaf holds at most 256 strings and intern() leaves "long string mode"
+            // when that limit is reached. Do not re-enter it for a node that is
+            // already full, or the next string would be appended as entry 257.
+            if (m_current_long_string_node.size() >= 256)
+                m_current_long_string_node.detach();
+        }
+        else {
+            m_current_long_string_node.detach();
+        }
+    }
+    else
+        m_current_long_string_node.detach(); // just in case...
+
+    // rebuild internal structures......
+    rebuild_internal();
+    m_current_string_leaf.detach();
+}
+
+// Age the cache of decompressed strings and reset the per-leaf metadata so that it
+// mirrors m_data. The compressed views of every leaf are dropped and reloaded lazily.
+void StringInterner::rebuild_internal()
+{
+    // release old decompressed strings
+    // 'idx' only advances when the entry is kept: a removed entry is replaced by the
+    // last one, which must be examined in the same position.
+    for (size_t idx = 0; idx < m_in_memory_strings.size();) {
+        StringID id = m_in_memory_strings[idx];
+        if (id > m_decompressed_strings.size()) {
+            // id no longer exists
+            m_in_memory_strings[idx] = m_in_memory_strings.back();
+            m_in_memory_strings.pop_back();
+            continue;
+        }
+        if (auto& w = m_decompressed_strings[id - 1].m_weight) {
+            // still in use: halve the weight
+            auto val = w.load(std::memory_order_acquire);
+            val = val >> 1;
+            w.store(val, std::memory_order_release);
+            ++idx;
+        }
+        else {
+            // weight has decayed to zero: release the decompressed string
+            m_decompressed_strings[id - 1].m_decompressed.reset();
+            m_in_memory_strings[idx] = m_in_memory_strings.back();
+            m_in_memory_strings.pop_back();
+        }
+    }
+
+    size_t target_size = (size_t)m_top.get_as_ref_or_tagged(Pos_Size).get_as_int();
+    m_decompressed_strings.resize(target_size);
+    if (m_data.size() != m_compressed_leafs.size()) {
+        m_compressed_leafs.resize(m_data.size());
+    }
+    // always force new setup of all leafs:
+    // update m_compressed_leafs to reflect m_data
+    for (size_t idx = 0; idx
< m_compressed_leafs.size(); ++idx) {
+        auto ref = m_data.get_as_ref(idx);
+        auto& leaf_meta = m_compressed_leafs[idx];
+        leaf_meta.m_is_loaded.store(false, std::memory_order_release);
+        leaf_meta.m_compressed.clear();
+        leaf_meta.m_leaf_ref = ref;
+    }
+}
+
+StringInterner::~StringInterner() {}
+
+// Return the id of 'sd', adding the string if it is not already present.
+// The null string always has id 0. A new string gets the next id in sequence and
+// is appended to the current leaf: inline in a shared array of 16-bit symbols, or
+// in an array of its own if some string of the leaf is too long to be stored inline.
+StringID StringInterner::intern(StringData sd)
+{
+    REALM_ASSERT(m_top.is_attached());
+    // special case for null string
+    if (sd.data() == nullptr)
+        return 0;
+    uint32_t h = (uint32_t)sd.hash();
+    auto candidates = hash_to_id(m_hash_map, h, 32);
+    for (auto& candidate : candidates) {
+        auto candidate_cpr = get_compressed(candidate);
+        if (m_compressor->compare(sd, candidate_cpr) == 0)
+            return candidate;
+    }
+    // it's a new string
+    bool learn = true;
+    auto c_str = m_compressor->compress(sd, learn);
+    m_decompressed_strings.emplace_back(64, std::make_unique<std::string>(sd));
+    auto id = m_decompressed_strings.size();
+    m_in_memory_strings.push_back(id);
+    add_to_hash_map(m_hash_map, h, id, 32);
+    size_t index = (size_t)m_top.get_as_ref_or_tagged(Pos_Size).get_as_int();
+    REALM_ASSERT_DEBUG(index == id - 1);
+    // The inline representation below requires a length strictly below 65535, so
+    // everything from 65535 symbols and up has to go into a long string node.
+    bool need_long_string_node = c_str.size() >= 65535;
+
+    if (need_long_string_node && !m_current_long_string_node.is_attached()) {
+
+        m_current_long_string_node.create(NodeHeader::type_HasRefs);
+
+        if ((index & 0xFF) == 0) {
+            // if we're starting on a new leaf, extend parent array for it
+            m_data.add(0);
+            m_compressed_leafs.push_back({});
+            m_current_long_string_node.set_parent(&m_data, m_data.size() - 1);
+            m_current_long_string_node.update_parent();
+            REALM_ASSERT_DEBUG(!m_current_string_leaf.is_attached() || m_current_string_leaf.size() == 0);
+            m_current_string_leaf.detach();
+        }
+        else {
+            // we have been building an existing leaf and need to shift representation.
+            // but first we need to update leaf accessor for existing leaf
+            // (the accessor may never have been given a parent, so set it first)
+            m_current_string_leaf.set_parent(&m_data, index >> 8);
+            if (m_current_string_leaf.is_attached()) {
+                m_current_string_leaf.update_from_parent();
+            }
+            else {
+                m_current_string_leaf.init_from_ref(m_current_string_leaf.get_ref_from_parent());
+            }
+            REALM_ASSERT_DEBUG(m_current_string_leaf.size() > 0);
+            m_current_long_string_node.set_parent(&m_data, m_data.size() - 1);
+            m_current_long_string_node.update_parent();
+            // The views of the leaf are dropped by rebuild_internal() and reloaded
+            // lazily. Make sure they are present before copying from them, or the
+            // strings already in the leaf would be lost in the conversion.
+            load_leaf_if_needed(m_compressed_leafs.back());
+            // convert the current leaf into a long string node. (array of strings in separate arrays)
+            for (auto s : m_compressed_leafs.back().m_compressed) {
+                ArrayUnsigned arr(m_top.get_alloc());
+                arr.create(s.size, 65535);
+                unsigned short* dest = reinterpret_cast<unsigned short*>(arr.m_data);
+                std::copy_n(s.data, s.size, dest);
+                m_current_long_string_node.add(arr.get_ref());
+            }
+            m_current_string_leaf.destroy();
+            // force later reload of leaf
+            m_compressed_leafs.back().m_is_loaded.store(false, std::memory_order_release);
+        }
+    }
+    if (m_current_long_string_node.is_attached()) {
+        ArrayUnsigned arr(m_top.get_alloc());
+        arr.create(c_str.size(), 65535);
+        unsigned short* begin = c_str.data();
+        if (begin) {
+            // if the compressed string is empty, 'begin' is zero and we don't copy
+            size_t n = c_str.size();
+            unsigned short* dest = reinterpret_cast<unsigned short*>(arr.m_data);
+            std::copy_n(begin, n, dest);
+        }
+        m_current_long_string_node.add(arr.get_ref());
+        m_current_long_string_node.update_parent();
+        if (m_current_long_string_node.size() == 256) {
+            // exit from "long string mode"
+            m_current_long_string_node.detach();
+        }
+        CompressionSymbol* p_start = reinterpret_cast<CompressionSymbol*>(arr.m_data);
+        m_compressed_leafs.back().m_compressed.push_back({p_start, arr.size()});
+    }
+    else {
+        // Append to leaf with up to 256 entries.
+        // First create a new leaf if needed (limit number of entries to 256 pr leaf)
+        bool need_leaf_update = !m_current_string_leaf.is_attached() || (index & 0xFF) == 0;
+        if (need_leaf_update) {
+            m_current_string_leaf.set_parent(&m_data, index >> 8);
+            if ((index & 0xFF) == 0) {
+                // create new leaf
+                m_current_string_leaf.create(0, 65535);
+                m_data.add(m_current_string_leaf.get_ref());
+                m_compressed_leafs.push_back({});
+            }
+            else {
+                // just setup leaf accessor
+                if (m_current_string_leaf.is_attached()) {
+                    m_current_string_leaf.update_from_parent();
+                }
+                else {
+                    m_current_string_leaf.init_from_ref(m_current_string_leaf.get_ref_from_parent());
+                }
+            }
+        }
+        REALM_ASSERT(c_str.size() < 65535);
+        // Add compressed string at end of leaf
+        m_current_string_leaf.add(c_str.size());
+        for (auto c : c_str) {
+            m_current_string_leaf.add(c);
+        }
+        REALM_ASSERT_DEBUG(m_compressed_leafs.size());
+        CompressionSymbol* p = reinterpret_cast<CompressionSymbol*>(m_current_string_leaf.m_data);
+        auto p_limit = p + m_current_string_leaf.size();
+        auto p_start = p_limit - c_str.size();
+        m_compressed_leafs.back().m_compressed.push_back({p_start, c_str.size()});
+        REALM_ASSERT(m_compressed_leafs.back().m_compressed.size() <= 256);
+    }
+    m_top.adjust(Pos_Size, 2); // type is has_Refs, so increment is by 2
+    load_leaf_if_new_ref(m_compressed_leafs.back(), m_data.get_as_ref(m_data.size() - 1));
+#ifdef REALM_DEBUG
+    auto csv = get_compressed(id);
+    CompressedStringView csv2(c_str);
+    REALM_ASSERT(csv == csv2);
+#endif
+    return id;
+}
+
+// Build the views into 'leaf' unless that has already been done.
+// Returns true if the leaf was (re)loaded by this call.
+bool StringInterner::load_leaf_if_needed(DataLeaf& leaf)
+{
+    if (!leaf.m_is_loaded.load(std::memory_order_relaxed)) {
+        // start with an empty leaf:
+        leaf.m_compressed.clear();
+        leaf.m_compressed.reserve(256);
+
+        // must interpret leaf first - the leaf is either a single array holding all strings,
+        // or an array with each (compressed) string placed in its own array.
+        const char* header = m_top.get_alloc().translate(leaf.m_leaf_ref);
+        bool is_single_array = !NodeHeader::get_hasrefs_from_header(header);
+        if (is_single_array) {
+            // layout: [length, symbol * length] repeated for every string in the leaf
+            size_t leaf_offset = 0;
+            ArrayUnsigned leaf_array(m_top.get_alloc());
+            leaf_array.init_from_ref(leaf.m_leaf_ref);
+            REALM_ASSERT(NodeHeader::get_encoding(leaf_array.get_header()) == NodeHeader::Encoding::WTypBits);
+            REALM_ASSERT(NodeHeader::get_width_from_header(leaf_array.get_header()) == 16);
+            // This is dangerous if the leaf is for some reason not in the assumed format
+            CompressionSymbol* c = reinterpret_cast<CompressionSymbol*>(leaf_array.m_data);
+            auto leaf_size = leaf_array.size();
+            while (leaf_offset < leaf_size) {
+                size_t length = c[leaf_offset];
+                REALM_ASSERT_DEBUG(length == leaf_array.get(leaf_offset));
+                leaf_offset++;
+                // the string must end inside the leaf
+                REALM_ASSERT_DEBUG(leaf_offset + length <= leaf_size);
+                leaf.m_compressed.push_back({c + leaf_offset, length});
+                REALM_ASSERT_DEBUG(leaf.m_compressed.size() <= 256);
+                leaf_offset += length;
+            }
+        }
+        else {
+            // Not a single leaf - instead an array of strings
+            Array arr(m_top.get_alloc());
+            arr.init_from_ref(leaf.m_leaf_ref);
+            for (size_t idx = 0; idx < arr.size(); ++idx) {
+                ArrayUnsigned str_array(m_top.get_alloc());
+                ref_type ref = arr.get_as_ref(idx);
+                str_array.init_from_ref(ref);
+                REALM_ASSERT(NodeHeader::get_encoding(str_array.get_header()) == NodeHeader::Encoding::WTypBits);
+                REALM_ASSERT(NodeHeader::get_width_from_header(str_array.get_header()) == 16);
+                CompressionSymbol* c = reinterpret_cast<CompressionSymbol*>(str_array.m_data);
+                leaf.m_compressed.push_back({c, str_array.size()});
+            }
+        }
+        // publish the views only after they are complete
+        leaf.m_is_loaded.store(true, std::memory_order_release);
+        return true;
+    }
+    return false;
+}
+
+// Point 'leaf' at 'new_ref' and reload it if the ref differs from the cached one
+// or if the leaf has not been loaded. Returns true if the leaf was (re)loaded.
+// Danger: Only to be used if you know that a change in content ==> different ref
+bool StringInterner::load_leaf_if_new_ref(DataLeaf& leaf, ref_type new_ref)
+{
+    if (leaf.m_leaf_ref != new_ref) {
+        leaf.m_leaf_ref = new_ref;
+        leaf.m_is_loaded = false;
+        leaf.m_compressed.resize(0);
+    }
+    return load_leaf_if_needed(leaf);
+}
+
+// Return the compressed form of the string with the given (non-null) id, loading
+// the leaf holding it if required. With 'lock_if_mutating' set, the load is done
+// while holding m_mutex.
+CompressedStringView& StringInterner::get_compressed(StringID id, bool lock_if_mutating)
+{
+    auto index = id - 1; // 0 represents null
+    auto hi = index >> 8;    // leaf number
+    auto lo = index & 0xFFUL; // position within leaf
+
+    // This is an instance of the "double checked locking" idiom, chosen to minimize
+    // locking in the common case of the leaf already being fully initialized.
+    DataLeaf& leaf = m_compressed_leafs[hi];
+    if (leaf.m_is_loaded.load(std::memory_order_acquire)) {
+        return leaf.m_compressed[lo];
+    }
+    if (lock_if_mutating) {
+        std::lock_guard lock(m_mutex);
+        load_leaf_if_needed(leaf);
+    }
+    else {
+        load_leaf_if_needed(leaf);
+    }
+    REALM_ASSERT_DEBUG(lo < leaf.m_compressed.size());
+    return leaf.m_compressed[lo];
+}
+
+// Find the id of 'sd' without adding it. Returns an empty optional if the string
+// is not present or the interner is in "dead" mode. The null string has id 0.
+std::optional<StringID> StringInterner::lookup(StringData sd)
+{
+    if (!m_top.is_attached()) {
+        // "dead" mode
+        return {};
+    }
+    if (sd.data() == nullptr)
+        return 0;
+    uint32_t h = (uint32_t)sd.hash();
+    auto candidates = hash_to_id(m_hash_map, h, 32);
+    for (auto& candidate : candidates) {
+        auto candidate_cpr = get_compressed(candidate, true);
+        if (m_compressor->compare(sd, candidate_cpr) == 0)
+            return candidate;
+    }
+    return {};
+}
+
+// Three-way comparison of the strings with ids A and B. Null orders before
+// everything else.
+int StringInterner::compare(StringID A, StringID B)
+{
+    // comparisons against null
+    if (A == B && A == 0)
+        return 0;
+    if (A == 0)
+        return -1;
+    if (B == 0)
+        return 1;
+    // ok, no nulls.
+    // StringID 0 is null, the first true index starts from 1.
+    REALM_ASSERT_DEBUG(A <= m_decompressed_strings.size());
+    REALM_ASSERT_DEBUG(B <= m_decompressed_strings.size());
+    REALM_ASSERT(m_compressor);
+    return m_compressor->compare(get_compressed(A, true), get_compressed(B, true));
+}
+
+// Three-way comparison of the string 's' and the string with id A. Null orders
+// before everything else.
+int StringInterner::compare(StringData s, StringID A)
+{
+    // comparisons against null
+    if (s.data() == nullptr && A == 0)
+        return 0;
+    if (s.data() == nullptr)
+        return -1;
+    if (A == 0)
+        return 1;
+    // ok, no nulls
+    REALM_ASSERT_DEBUG(A <= m_decompressed_strings.size());
+    REALM_ASSERT(m_compressor);
+    return m_compressor->compare(s, get_compressed(A, true));
+}
+
+
+// Return the string with the given id; id 0 gives the null string. The returned
+// StringData refers to a decompressed copy owned by the interner.
+StringData StringInterner::get(StringID id)
+{
+    REALM_ASSERT(m_compressor);
+    if (id == 0)
+        return StringData{nullptr};
+    REALM_ASSERT_DEBUG(id <= m_decompressed_strings.size());
+
+    // Avoid taking a lock in the (presumably) common case, where the leaf with the compressed
+    // strings have already been loaded. Such leafs have "m_weight" > 0.
+    CachedString& cs = m_decompressed_strings[id - 1];
+    if (auto weight = cs.m_weight.load(std::memory_order_acquire)) {
+        REALM_ASSERT_DEBUG(cs.m_decompressed);
+        if (weight < 128) {
+            // ignore if this fails - that happens if some other thread bumps the value
+            // concurrently. And if so, we can live with losing our own "bump"
+            cs.m_weight.compare_exchange_strong(weight, weight + 64, std::memory_order_acq_rel);
+        }
+        return {cs.m_decompressed->c_str(), cs.m_decompressed->size()};
+    }
+    std::lock_guard lock(m_mutex);
+    // Check again now that we hold the lock: another thread may have made the string
+    // available while we were waiting. Replacing it would free a buffer which that
+    // thread has already handed out.
+    if (cs.m_weight.load(std::memory_order_acquire) == 0) {
+        if (!cs.m_decompressed) {
+            cs.m_decompressed = std::make_unique<std::string>(m_compressor->decompress(get_compressed(id)));
+            m_in_memory_strings.push_back(id);
+        }
+        // else: the weight has decayed to zero but the string has not been released
+        // yet, and its id is still listed in m_in_memory_strings. Reuse it.
+        cs.m_weight.store(64, std::memory_order_release);
+    }
+    return {cs.m_decompressed->c_str(), cs.m_decompressed->size()};
+}
+
+} // namespace realm
diff --git a/src/realm/string_interner.hpp b/src/realm/string_interner.hpp
new file mode 100644
index 00000000000..dbc984ede55
--- /dev/null
+++ b/src/realm/string_interner.hpp
@@ -0,0 +1,117 @@
+/*************************************************************************
+ *
+ * Copyright 2016 Realm Inc.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * + **************************************************************************/ + +#ifndef REALM_STRING_INTERNER_HPP +#define REALM_STRING_INTERNER_HPP + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +struct CompressedStringView; + +namespace realm { + +class StringCompressor; + +struct CachedString { + std::atomic m_weight = 0; + std::unique_ptr m_decompressed; + CachedString() {} + CachedString(CachedString&& other) + { + m_decompressed = std::move(other.m_decompressed); + m_weight.store(other.m_weight.load(std::memory_order_relaxed), std::memory_order_relaxed); + } + CachedString(uint8_t init_weight, std::unique_ptr&& ptr) + { + m_decompressed = std::move(ptr); + m_weight.store(init_weight, std::memory_order_relaxed); + } +}; + +class StringInterner { +public: + // Use of the StringInterner must honour the restrictions on concurrency given + // below. Currently this is ensured by only using concurrent access on frozen + // objects. + // + // Limitations wrt concurrency: + // + // To be used exclusively from Table and in a non-concurrent setting: + StringInterner(Allocator& alloc, Array& parent, ColKey col_key, bool writable); + void update_from_parent(bool writable); + ~StringInterner(); + + // To be used from Obj within a write transaction or during commit. + // To be used only in a non-concurrent setting: + StringID intern(StringData); + + // The following four methods can be used in a concurrent setting with each other, + // but not concurrently with any of the above methods. + std::optional lookup(StringData); + int compare(StringID A, StringID B); + int compare(StringData, StringID A); + StringData get(StringID); + +private: + Array& m_parent; // need to be able to check if this is attached or not + Array m_top; + // Compressed strings are stored in blocks of 256. + // One array holds refs to all blocks: + Array m_data; + // In-memory representation of a block. 
Either only the ref to it, + // or a full vector of views into the block. + struct DataLeaf; + // in-memory metadata for faster access to compressed strings. Mirrors m_data. + std::vector m_compressed_leafs; + // 'm_hash_map' is used for mapping hash of uncompressed string to string id. + Array m_hash_map; + // the block of compressed strings we're currently appending to: + ArrayUnsigned m_current_string_leaf; + // an array of strings we're currently appending to. This is used instead + // when ever we meet a string too large to be placed inline. + Array m_current_long_string_node; + void rebuild_internal(); + CompressedStringView& get_compressed(StringID id, bool lock_if_mutating = false); + // return true if the leaf was reloaded + bool load_leaf_if_needed(DataLeaf& leaf); + // return 'true' if the new ref was different and forced a reload + bool load_leaf_if_new_ref(DataLeaf& leaf, ref_type new_ref); + ColKey m_col_key; // for validation + std::unique_ptr m_compressor; + // At the moment we need to keep decompressed strings around if they've been + // returned to the caller, since we're handing + // out StringData references to their storage. This is a temporary solution. + std::vector m_decompressed_strings; + std::vector m_in_memory_strings; + // Mutual exclusion is needed for frozen transactions only. Live objects are + // only used in single threaded contexts so don't need them. For now, we don't + // distinguish, assuming that locking is sufficiently low in both scenarios. 
+ std::mutex m_mutex; +}; +} // namespace realm + +#endif diff --git a/src/realm/table.cpp b/src/realm/table.cpp index a21820997d6..6c66c4feda2 100644 --- a/src/realm/table.cpp +++ b/src/realm/table.cpp @@ -35,6 +35,7 @@ #include #include #include +#include #include @@ -263,6 +264,11 @@ using namespace realm::util; Replication* Table::g_dummy_replication = nullptr; +static inline bool needs_string_interner(ColKey col_key) +{ + return col_key.get_type() == col_type_String || col_key.get_type() == col_type_Mixed || col_key.is_dictionary(); +} + bool TableVersions::operator==(const TableVersions& other) const { if (size() != other.size()) @@ -356,6 +362,7 @@ Table::Table(Allocator& alloc) , m_index_refs(m_alloc) , m_opposite_table(m_alloc) , m_opposite_column(m_alloc) + , m_interner_data(m_alloc) , m_repl(&g_dummy_replication) , m_own_ref(this, alloc.get_instance_version()) { @@ -363,7 +370,7 @@ Table::Table(Allocator& alloc) m_index_refs.set_parent(&m_top, top_position_for_search_indexes); m_opposite_table.set_parent(&m_top, top_position_for_opposite_table); m_opposite_column.set_parent(&m_top, top_position_for_opposite_column); - + m_interner_data.set_parent(&m_top, top_position_for_interners); ref_type ref = create_empty_table(m_alloc); // Throws ArrayParent* parent = nullptr; size_t ndx_in_parent = 0; @@ -378,6 +385,7 @@ Table::Table(Replication* const* repl, Allocator& alloc) , m_index_refs(m_alloc) , m_opposite_table(m_alloc) , m_opposite_column(m_alloc) + , m_interner_data(m_alloc) , m_repl(repl) , m_own_ref(this, alloc.get_instance_version()) { @@ -385,6 +393,8 @@ Table::Table(Replication* const* repl, Allocator& alloc) m_index_refs.set_parent(&m_top, top_position_for_search_indexes); m_opposite_table.set_parent(&m_top, top_position_for_opposite_table); m_opposite_column.set_parent(&m_top, top_position_for_opposite_column); + m_opposite_column.set_parent(&m_top, top_position_for_opposite_column); + m_interner_data.set_parent(&m_top, 
top_position_for_interners); m_cookie = cookie_created; } @@ -653,6 +663,14 @@ void Table::init(ref_type top_ref, ArrayParent* parent, size_t ndx_in_parent, bo else { m_tombstones = nullptr; } + if (m_top.size() > top_position_for_interners && m_top.get_as_ref(top_position_for_interners)) { + // Interner data exist + m_interner_data.init_from_parent(); + } + else { + REALM_ASSERT_DEBUG(!m_interner_data.is_attached()); + } + refresh_string_interners(is_writable); m_cookie = cookie_initialized; } @@ -972,36 +990,6 @@ void Table::remove_search_index(ColKey col_key) m_spec.set_column_attr(spec_ndx, attr); // Throws } -void Table::enumerate_string_column(ColKey col_key) -{ - check_column(col_key); - size_t column_ndx = colkey2spec_ndx(col_key); - ColumnType type = col_key.get_type(); - if (type == col_type_String && !col_key.is_collection() && !m_spec.is_string_enum_type(column_ndx)) { - m_clusters.enumerate_string_column(col_key); - } -} - -bool Table::is_enumerated(ColKey col_key) const noexcept -{ - size_t col_ndx = colkey2spec_ndx(col_key); - return m_spec.is_string_enum_type(col_ndx); -} - -size_t Table::get_num_unique_values(ColKey col_key) const -{ - if (!is_enumerated(col_key)) - return 0; - - ArrayParent* parent; - ref_type ref = const_cast(m_spec).get_enumkeys_ref(colkey2spec_ndx(col_key), parent); - BPlusTree col(get_alloc()); - col.init_from_ref(ref); - - return col.size(); -} - - void Table::erase_root_column(ColKey col_key) { ColumnType col_type = col_key.get_type(); @@ -1054,7 +1042,18 @@ ColKey Table::do_insert_root_column(ColKey col_key, ColumnType type, StringData if (m_tombstones) { m_tombstones->insert_column(col_key); } - + if (needs_string_interner(col_key)) { + // create string interners internal rep as well as data area + REALM_ASSERT_DEBUG(m_interner_data.is_attached()); + while (col_ndx >= m_string_interners.size()) { + m_string_interners.push_back({}); + } + while (col_ndx >= m_interner_data.size()) { + m_interner_data.add(0); + } + 
REALM_ASSERT(!m_string_interners[col_ndx]); + m_string_interners[col_ndx] = std::make_unique(m_alloc, m_interner_data, col_key, true); + } bump_storage_version(); return col_key; @@ -1086,6 +1085,15 @@ void Table::do_erase_root_column(ColKey col_key) REALM_ASSERT(m_index_accessors.back() == nullptr); m_index_accessors.pop_back(); } + if (col_ndx < m_string_interners.size() && m_string_interners[col_ndx]) { + REALM_ASSERT_DEBUG(m_interner_data.is_attached()); + REALM_ASSERT_DEBUG(col_ndx < m_interner_data.size()); + auto data_ref = m_interner_data.get_as_ref(col_ndx); + if (data_ref) + Array::destroy_deep(data_ref, m_alloc); + m_interner_data.set(col_ndx, 0); + m_string_interners[col_ndx].reset(); + } bump_content_version(); bump_storage_version(); } @@ -1239,6 +1247,9 @@ void Table::detach(LifeCycleCookie cookie) noexcept { m_cookie = cookie; m_alloc.bump_instance_version(); + // release string interners + m_string_interners.clear(); + m_interner_data.detach(); } void Table::fully_detach() noexcept @@ -1249,6 +1260,7 @@ void Table::fully_detach() noexcept m_opposite_table.detach(); m_opposite_column.detach(); m_index_accessors.clear(); + m_string_interners.clear(); } @@ -1465,6 +1477,7 @@ ref_type Table::create_empty_table(Allocator& alloc, TableKey key) top.add(0); // pk col key top.add(0); // flags top.add(0); // tombstones + top.add(0); // string interners REALM_ASSERT(top.size() == top_array_size); @@ -1721,9 +1734,25 @@ ObjKey Table::find_first(ColKey col_key, T value) const using LeafType = typename ColumnTypeTraits::cluster_leaf_type; LeafType leaf(get_alloc()); - auto f = [&key, &col_key, &value, &leaf](const Cluster* cluster) { + // In case of a string column we can try to look up the StringID of the search string, + // and search for that in case the leaf is compressed. 
+ std::optional string_id; + if constexpr (std::is_same_v) { + auto string_interner = get_string_interner(col_key); + REALM_ASSERT(string_interner != nullptr); + string_id = string_interner->lookup(value); + } + + auto f = [&](const Cluster* cluster) { cluster->init_leaf(col_key, &leaf); - size_t row = leaf.find_first(value, 0, cluster->node_size()); + size_t row; + if constexpr (std::is_same_v) { + row = leaf.find_first(value, 0, cluster->node_size(), string_id); + } + else { + row = leaf.find_first(value, 0, cluster->node_size()); + } + if (row != realm::npos) { key = cluster->get_real_key(row); return IteratorControl::Stop; @@ -1976,6 +2005,13 @@ void Table::update_from_parent() noexcept refresh_content_version(); m_has_any_embedded_objects.reset(); + if (m_top.size() > top_position_for_interners) { + if (m_top.get_as_ref(top_position_for_interners)) + m_interner_data.update_from_parent(); + else + m_interner_data.detach(); + } + refresh_string_interners(false); } m_alloc.bump_storage_version(); } @@ -2104,7 +2140,7 @@ void Table::refresh_content_version() // Called when Group is moved to another version - either a rollback or an advance. // The content of the table is potentially different, so make no assumptions. 
-void Table::refresh_accessor_tree() +void Table::refresh_accessor_tree(bool writable) { REALM_ASSERT(m_cookie == cookie_initialized); REALM_ASSERT(m_top.is_attached()); @@ -2134,12 +2170,81 @@ void Table::refresh_accessor_tree() else { m_tombstones = nullptr; } + if (writable) { + while (m_top.size() < top_position_for_interners) + m_top.add(0); + } + if (m_top.size() > top_position_for_interners) { + if (m_top.get_as_ref(top_position_for_interners)) + m_interner_data.init_from_parent(); + else + m_interner_data.detach(); + } refresh_content_version(); bump_storage_version(); build_column_mapping(); + refresh_string_interners(writable); refresh_index_accessors(); } +void Table::refresh_string_interners(bool writable) +{ + if (writable) { + // if we're in a write transaction, make sure interner arrays are created which will allow + // string interners to expand with their own data when "learning" + while (m_top.size() <= top_position_for_interners) { + m_top.add(0); + } + } + if (m_top.size() > top_position_for_interners && m_top.get_as_ref(top_position_for_interners)) + m_interner_data.update_from_parent(); + else + m_interner_data.detach(); + if (writable) { + if (!m_interner_data.is_attached()) { + m_interner_data.create(NodeHeader::type_HasRefs); + m_interner_data.update_parent(); + } + } + // bring string interners in line with underlying data. + // Precondition: we rely on the col keys in m_leaf_ndx2colkey[] being up to date. 
+ for (size_t idx = 0; idx < m_leaf_ndx2colkey.size(); ++idx) { + auto col_key = m_leaf_ndx2colkey[idx]; + if (col_key == ColKey()) { + // deleted column, we really don't want a string interner for this + if (idx < m_string_interners.size() && m_string_interners[idx]) + m_string_interners[idx].reset(); + continue; + } + if (!needs_string_interner(col_key)) + continue; + + REALM_ASSERT_DEBUG(col_key.get_index().val == idx); + // maintain sufficient size of interner arrays to cover all columns + while (idx >= m_string_interners.size()) { + m_string_interners.push_back({}); + } + while (writable && idx >= m_interner_data.size()) { // m_interner_data.is_attached() per above + m_interner_data.add(0); + } + if (m_string_interners[idx]) { + // existing interner + m_string_interners[idx]->update_from_parent(writable); + } + else { + // new interner. Note: if not in a writable state, the interner will not have a valid + // underlying data array. The interner will be set in a state, where it cannot "learn", + // and searches will not find any matching interned strings. + m_string_interners[idx] = std::make_unique(m_alloc, m_interner_data, col_key, writable); + } + } + if (m_string_interners.size() > m_leaf_ndx2colkey.size()) { + // remove any string interners which are no longer reachable, + // e.g. 
after a rollback + m_string_interners.resize(m_leaf_ndx2colkey.size()); + } +} + void Table::refresh_index_accessors() { // Refresh search index accessors @@ -3351,9 +3456,9 @@ ColKey Table::find_opposite_column(ColKey col_key) const return ColKey(); } -ref_type Table::typed_write(ref_type ref, _impl::ArrayWriterBase& out) const +ref_type Table::typed_write(_impl::ArrayWriterBase& out) const { - REALM_ASSERT(ref == m_top.get_mem().get_ref()); + auto ref = m_top.get_ref(); if (out.only_modified && m_alloc.is_read_only(ref)) return ref; out.table = this; @@ -3381,3 +3486,11 @@ ref_type Table::typed_write(ref_type ref, _impl::ArrayWriterBase& out) const } return dest.write(out); } + +StringInterner* Table::get_string_interner(ColKey::Idx idx) const +{ + REALM_ASSERT(idx.val < m_string_interners.size()); + auto interner = m_string_interners[idx.val].get(); + REALM_ASSERT(interner); + return interner; +} diff --git a/src/realm/table.hpp b/src/realm/table.hpp index ac0faa5140a..86a78901cd8 100644 --- a/src/realm/table.hpp +++ b/src/realm/table.hpp @@ -248,18 +248,10 @@ class Table { } void remove_search_index(ColKey col_key); - void enumerate_string_column(ColKey col_key); - bool is_enumerated(ColKey col_key) const noexcept; bool contains_unique_values(ColKey col_key) const; //@} - /// If the specified column is optimized to store only unique values, then - /// this function returns the number of unique values currently - /// stored. Otherwise it returns zero. This function is mainly intended for - /// debugging purposes. 
- size_t get_num_unique_values(ColKey col_key) const; - template Columns column(ColKey col_key, util::Optional = util::none) const; template @@ -572,7 +564,11 @@ class Table { ColKey::Idx spec_ndx2leaf_ndx(size_t idx) const; ColKey leaf_ndx2colkey(ColKey::Idx idx) const; ColKey spec_ndx2colkey(size_t ndx) const; - + StringInterner* get_string_interner(ColKey::Idx idx) const; + StringInterner* get_string_interner(ColKey col_key) const + { + return get_string_interner(col_key.get_index()); + } // Queries // Using where(tv) is the new method to perform queries on TableView. The 'tv' can have any order; it does not // need to be sorted, and, resulting view retains its order. @@ -691,7 +687,7 @@ class Table { Replication* const* m_repl; }; - ref_type typed_write(ref_type ref, _impl::ArrayWriterBase& out) const; + ref_type typed_write(_impl::ArrayWriterBase& out) const; private: enum LifeCycleCookie { @@ -736,6 +732,7 @@ class Table { Array m_index_refs; // 5th slot in m_top Array m_opposite_table; // 7th slot in m_top Array m_opposite_column; // 8th slot in m_top + Array m_interner_data; // 14th slot in m_top std::vector> m_index_accessors; ColKey m_primary_key_col; Replication* const* m_repl; @@ -847,8 +844,9 @@ class Table { /// Refresh the part of the accessor tree that is rooted at this /// table. 
- void refresh_accessor_tree(); + void refresh_accessor_tree(bool writable); void refresh_index_accessors(); + void refresh_string_interners(bool writable); void refresh_content_version(); void flush_for_commit(); @@ -860,6 +858,7 @@ class Table { std::vector m_leaf_ndx2colkey; std::vector m_spec_ndx2leaf_ndx; std::vector m_leaf_ndx2spec_ndx; + mutable std::vector> m_string_interners; Type m_table_type = Type::TopLevel; uint64_t m_in_file_version_at_transaction_boundary = 0; AtomicLifeCycleCookie m_cookie; @@ -879,7 +878,8 @@ class Table { static constexpr int top_position_for_flags = 12; // flags contents: bit 0-1 - table type static constexpr int top_position_for_tombstones = 13; - static constexpr int top_array_size = 14; + static constexpr int top_position_for_interners = 14; + static constexpr int top_array_size = 15; enum { s_collision_map_lo = 0, s_collision_map_hi = 1, s_collision_map_local_id = 2, s_collision_map_num_slots }; diff --git a/src/realm/to_json.cpp b/src/realm/to_json.cpp index e9b9b049d72..ed1a11839d0 100644 --- a/src/realm/to_json.cpp +++ b/src/realm/to_json.cpp @@ -297,12 +297,12 @@ void Obj::to_json(std::ostream& out, JSONOutputMode output_mode) const print_link(val); } else if (val.is_type(type_Dictionary)) { - DummyParent parent(m_table, val.get_ref()); + DummyParent parent(m_table, val.get_ref(), ck); Dictionary dict(parent, 0); dict.to_json(out, output_mode, print_link); } else if (val.is_type(type_List)) { - DummyParent parent(m_table, val.get_ref()); + DummyParent parent(m_table, val.get_ref(), ck); Lst list(parent, 0); list.to_json(out, output_mode, print_link); } diff --git a/src/realm/transaction.hpp b/src/realm/transaction.hpp index 4da316c0d2e..e4db3c8a586 100644 --- a/src/realm/transaction.hpp +++ b/src/realm/transaction.hpp @@ -217,6 +217,7 @@ class Transaction : public Group { friend class DB; friend class DisableReplication; + friend class Table; }; /* diff --git a/src/realm/utilities.hpp b/src/realm/utilities.hpp index 
2125fe2c2fa..badc4d772b6 100644 --- a/src/realm/utilities.hpp +++ b/src/realm/utilities.hpp @@ -69,8 +69,8 @@ typedef SSIZE_T ssize_t; #if defined(REALM_PTR_64) && defined(REALM_X86_OR_X64) && !REALM_WATCHOS -#define REALM_COMPILER_SSE // Compiler supports SSE 4.2 through __builtin_ accessors or back-end assembler -#define REALM_COMPILER_AVX +// #define REALM_COMPILER_SSE // Compiler supports SSE 4.2 through __builtin_ accessors or back-end assembler +// #define REALM_COMPILER_AVX #endif namespace realm { diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 97829133ecd..2377d36404e 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -76,6 +76,7 @@ set(CORE_TEST_SOURCES test_shared.cpp test_status.cpp test_string_data.cpp + test_string_compression.cpp test_table_view.cpp test_thread.cpp test_transactions.cpp diff --git a/test/benchmark-common-tasks/main.cpp b/test/benchmark-common-tasks/main.cpp index b8e52fc5d70..2db7982a39d 100644 --- a/test/benchmark-common-tasks/main.cpp +++ b/test/benchmark-common-tasks/main.cpp @@ -1630,6 +1630,70 @@ struct BenchmarkSort : BenchmarkWithStrings { } }; +// benchmark for testing compressed string sorting. 
+// N is the size of the string to generate +// M is the number of times we want store the string (number of dups) +template +struct BenchmarkSortCompressed : BenchmarkWithStringsTable { + std::string compressed_benchmark_name; + + BenchmarkSortCompressed() + : BenchmarkWithStringsTable() + { + if constexpr (N <= 15) { + compressed_benchmark_name = util::format("SortCompressedSmall(%1,%2)", N, M); + } + else if constexpr (N <= 63) { + compressed_benchmark_name = util::format("SortCompressedMedium(%1,%2)", N, M); + } + else { + compressed_benchmark_name = util::format("SortCompressedLarge(%1,%2)", N, M); + } + } + + const char* name() const + { + return compressed_benchmark_name.c_str(); + } + + void before_all(DBRef group) + { + BenchmarkWithStringsTable::before_all(group); + WriteTransaction tr(group); + TableRef t = tr.get_table(name()); + + auto gen_string = [](size_t length) { + const std::string alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuv" + "wxyz0123456789"; + std::random_device rd; + std::mt19937 generator(rd()); + std::uniform_int_distribution<> distribution(0, (int)alphabet.size() - 1); + + std::string random_str; + for (size_t i = 0; i < length; ++i) + random_str += alphabet[distribution(generator)]; + + return random_str; + }; + + std::string str = ""; + for (size_t i = 0; i < BASE_SIZE; ++i) { + if (i % M == 0) + str = gen_string(N); + + Obj obj = t->create_object(); + obj.set(m_col, str); + } + tr.commit(); + } + + void operator()(DBRef) + { + ConstTableRef table = m_table; + TableView view = table->get_sorted_view(m_col); + } +}; + struct BenchmarkEmptyCommit : Benchmark { const char* name() const { @@ -2663,6 +2727,11 @@ int benchmark_common_tasks_main() BENCH(IterateTableByIteratorIndex); BENCH(BenchmarkSort); + BENCH(BenchmarkSortCompressed<10, 500>); + BENCH(BenchmarkSortCompressed<50, 500>); + BENCH(BenchmarkSortCompressed<100, 500>); + BENCH(BenchmarkSortCompressed<1000, 5000>); + BENCH(BenchmarkSortInt); 
BENCH(BenchmarkSortIntList); BENCH(BenchmarkSortIntDictionary); diff --git a/test/expect_json.json b/test/expect_json.json index 4f4d0e227c3..5929348e35c 100644 --- a/test/expect_json.json +++ b/test/expect_json.json @@ -1 +1,431 @@ -[{"_key":0,"int":0,"bool":false,"date":"1970-01-01 03:25:45","float":1.2345600e+02,"double":9.8765432099999998e+03,"string":"string0","string_long":"string0 very long string.........","string_big_blobs":"string0 very long string......... big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs","string_enum":"enum1","binary":"YmluYXJ5AA==","oid":"000000000000000000000000","decimal":"1.2345","integers":[],"strings":[],"dictionary":{"a":2},"set":[123],"uuid":"00000000-0000-0000-0000-000000000000"},{"_key":1,"int":-1,"bool":true,"date":"1970-01-01 03:25:45","float":-1.2345600e+02,"double":-9.8765432099999998e+03,"string":"string1","string_long":"string1 very long string.........","string_big_blobs":"","string_enum":"enum2","binary":"YmluYXJ5AA==","oid":"000000000000000000000000","decimal":"1.2345","integers":[-123],"strings":["sub_-123"],"dictionary":{"a":2},"set":[123],"uuid":"00000000-0000-0000-0000-000000000000"},{"_key":2,"int":2,"bool":false,"date":"1970-01-01 03:25:45","float":1.2345600e+02,"double":9.8765432099999998e+03,"string":"string2","string_long":"string2 very long string.........","string_big_blobs":"string2 very long string......... 
big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs","string_enum":"enum3","binary":"YmluYXJ5AA==","oid":"000000000000000000000000","decimal":"1.2345","integers":[-123,2345],"strings":["sub_-123","sub_2345"],"dictionary":{"a":2},"set":[123],"uuid":"00000000-0000-0000-0000-000000000000"},{"_key":3,"int":-3,"bool":true,"date":"1970-01-01 03:25:45","float":-1.2345600e+02,"double":-9.8765432099999998e+03,"string":"string3","string_long":"string3 very long string.........","string_big_blobs":"","string_enum":"enum1","binary":"YmluYXJ5AA==","oid":"000000000000000000000000","decimal":"1.2345","integers":[-123,-3825,-7527],"strings":["sub_-123","sub_-3825","sub_-7527"],"dictionary":{"a":2},"set":[123],"uuid":"00000000-0000-0000-0000-000000000000"},{"_key":4,"int":4,"bool":false,"date":"1970-01-01 03:25:45","float":1.2345600e+02,"double":9.8765432099999998e+03,"string":"string4","string_long":"string4 very long string.........","string_big_blobs":"string4 very long string......... 
big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs","string_enum":"enum2","binary":"YmluYXJ5AA==","oid":"000000000000000000000000","decimal":"1.2345","integers":[-123,4813,9749,14685],"strings":["sub_-123","sub_4813","sub_9749","sub_14685"],"dictionary":{"a":2},"set":[123],"uuid":"00000000-0000-0000-0000-000000000000"},{"_key":5,"int":-5,"bool":true,"date":"1970-01-01 03:25:45","float":-1.2345600e+02,"double":-9.8765432099999998e+03,"string":"string5","string_long":"string5 very long string.........","string_big_blobs":"","string_enum":"enum3","binary":"YmluYXJ5AA==","oid":"000000000000000000000000","decimal":"1.2345","integers":[],"strings":[],"dictionary":{"a":2},"set":[123],"uuid":"00000000-0000-0000-0000-000000000000"},{"_key":6,"int":6,"bool":false,"date":"1970-01-01 03:25:45","float":1.2345600e+02,"double":9.8765432099999998e+03,"string":"string6","string_long":"string6 very long string.........","string_big_blobs":"string6 very long string......... 
big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs","string_enum":"enum1","binary":"YmluYXJ5AA==","oid":"000000000000000000000000","decimal":"1.2345","integers":[-123],"strings":["sub_-123"],"dictionary":{"a":2},"set":[123],"uuid":"00000000-0000-0000-0000-000000000000"},{"_key":7,"int":-7,"bool":true,"date":"1970-01-01 03:25:45","float":-1.2345600e+02,"double":-9.8765432099999998e+03,"string":"string7","string_long":"string7 very long string.........","string_big_blobs":"","string_enum":"enum2","binary":"YmluYXJ5AA==","oid":"000000000000000000000000","decimal":"1.2345","integers":[-123,-8761],"strings":["sub_-123","sub_-8761"],"dictionary":{"a":2},"set":[123],"uuid":"00000000-0000-0000-0000-000000000000"},{"_key":8,"int":8,"bool":false,"date":"1970-01-01 03:25:45","float":1.2345600e+02,"double":9.8765432099999998e+03,"string":"string8","string_long":"string8 very long string.........","string_big_blobs":"string8 very long string......... 
big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs","string_enum":"enum3","binary":"YmluYXJ5AA==","oid":"000000000000000000000000","decimal":"1.2345","integers":[-123,9749,19621],"strings":["sub_-123","sub_9749","sub_19621"],"dictionary":{"a":2},"set":[123],"uuid":"00000000-0000-0000-0000-000000000000"},{"_key":9,"int":-9,"bool":true,"date":"1970-01-01 03:25:45","float":-1.2345600e+02,"double":-9.8765432099999998e+03,"string":"string9","string_long":"string9 very long string.........","string_big_blobs":"","string_enum":"enum1","binary":"YmluYXJ5AA==","oid":"000000000000000000000000","decimal":"1.2345","integers":[-123,-11229,-22335,-33441],"strings":["sub_-123","sub_-11229","sub_-22335","sub_-33441"],"dictionary":{"a":2},"set":[123],"uuid":"00000000-0000-0000-0000-000000000000"},{"_key":10,"int":10,"bool":false,"date":"1970-01-01 03:25:45","float":1.2345600e+02,"double":9.8765432099999998e+03,"string":"string10","string_long":"string10 very long string.........","string_big_blobs":"string10 very long string......... 
big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs","string_enum":"enum2","binary":"YmluYXJ5AA==","oid":"000000000000000000000000","decimal":"1.2345","integers":[],"strings":[],"dictionary":{"a":2},"set":[123],"uuid":"00000000-0000-0000-0000-000000000000"},{"_key":11,"int":-11,"bool":true,"date":"1970-01-01 03:25:45","float":-1.2345600e+02,"double":-9.8765432099999998e+03,"string":"string11","string_long":"string11 very long string.........","string_big_blobs":"","string_enum":"enum3","binary":"YmluYXJ5AA==","oid":"000000000000000000000000","decimal":"1.2345","integers":[-123],"strings":["sub_-123"],"dictionary":{"a":2},"set":[123],"uuid":"00000000-0000-0000-0000-000000000000"},{"_key":12,"int":12,"bool":false,"date":"1970-01-01 03:25:45","float":1.2345600e+02,"double":9.8765432099999998e+03,"string":"string12","string_long":"string12 very long string.........","string_big_blobs":"string12 very long string......... big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs","string_enum":"enum1","binary":"YmluYXJ5AA==","oid":"000000000000000000000000","decimal":"1.2345","integers":[-123,14685],"strings":["sub_-123","sub_14685"],"dictionary":{"a":2},"set":[123],"uuid":"00000000-0000-0000-0000-000000000000"},{"_key":13,"int":-13,"bool":true,"date":"1970-01-01 03:25:45","float":-1.2345600e+02,"double":-9.8765432099999998e+03,"string":"string13","string_long":"string13 very long string.........","string_big_blobs":"","string_enum":"enum2","binary":"YmluYXJ5AA==","oid":"000000000000000000000000","decimal":"1.2345","integers":[-123,-16165,-32207],"strings":["sub_-123","sub_-16165","sub_-32207"],"dictionary":{"a":2},"set":[123],"uuid":"00000000-0000-0000-0000-000000000000"},{"_key":14,"int":14,"bool":false,"date":"1970-01-01 03:25:45","float":1.2345600e+02,"double":9.8765432099999998e+03,"string":"string14","string_long":"string14 very long 
string.........","string_big_blobs":"string14 very long string......... big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs","string_enum":"enum3","binary":"YmluYXJ5AA==","oid":"000000000000000000000000","decimal":"1.2345","integers":[-123,17153,34429,51705],"strings":["sub_-123","sub_17153","sub_34429","sub_51705"],"dictionary":{"a":2},"set":[123],"uuid":"00000000-0000-0000-0000-000000000000"}] \ No newline at end of file +[ + { + "_key": 0, + "int": 0, + "bool": false, + "date": "1970-01-01 03:25:45", + "float": 123.456, + "double": 9876.54321, + "string": "string0", + "string_long": "string0 very long string.........", + "string_big_blobs": "string0 very long string......... big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", + "binary": "YmluYXJ5AA==", + "oid": "000000000000000000000000", + "decimal": "1.2345", + "integers": [], + "strings": [], + "dictionary": { + "a": 2 + }, + "set": [ + 123 + ], + "uuid": "00000000-0000-0000-0000-000000000000" + }, + { + "_key": 1, + "int": -1, + "bool": true, + "date": "1970-01-01 03:25:45", + "float": -123.456, + "double": -9876.54321, + "string": "string1", + "string_long": "string1 very long string.........", + "string_big_blobs": "", + "binary": "YmluYXJ5AA==", + "oid": "000000000000000000000000", + "decimal": "1.2345", + "integers": [ + -123 + ], + "strings": [ + "sub_-123" + ], + "dictionary": { + "a": 2 + }, + "set": [ + 123 + ], + "uuid": "00000000-0000-0000-0000-000000000000" + }, + { + "_key": 2, + "int": 2, + "bool": false, + "date": "1970-01-01 03:25:45", + "float": 123.456, + "double": 9876.54321, + "string": "string2", + "string_long": "string2 very long string.........", + "string_big_blobs": "string2 very long string......... 
big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", + "binary": "YmluYXJ5AA==", + "oid": "000000000000000000000000", + "decimal": "1.2345", + "integers": [ + -123, + 2345 + ], + "strings": [ + "sub_-123", + "sub_2345" + ], + "dictionary": { + "a": 2 + }, + "set": [ + 123 + ], + "uuid": "00000000-0000-0000-0000-000000000000" + }, + { + "_key": 3, + "int": -3, + "bool": true, + "date": "1970-01-01 03:25:45", + "float": -123.456, + "double": -9876.54321, + "string": "string3", + "string_long": "string3 very long string.........", + "string_big_blobs": "", + "binary": "YmluYXJ5AA==", + "oid": "000000000000000000000000", + "decimal": "1.2345", + "integers": [ + -123, + -3825, + -7527 + ], + "strings": [ + "sub_-123", + "sub_-3825", + "sub_-7527" + ], + "dictionary": { + "a": 2 + }, + "set": [ + 123 + ], + "uuid": "00000000-0000-0000-0000-000000000000" + }, + { + "_key": 4, + "int": 4, + "bool": false, + "date": "1970-01-01 03:25:45", + "float": 123.456, + "double": 9876.54321, + "string": "string4", + "string_long": "string4 very long string.........", + "string_big_blobs": "string4 very long string......... 
big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", + "binary": "YmluYXJ5AA==", + "oid": "000000000000000000000000", + "decimal": "1.2345", + "integers": [ + -123, + 4813, + 9749, + 14685 + ], + "strings": [ + "sub_-123", + "sub_4813", + "sub_9749", + "sub_14685" + ], + "dictionary": { + "a": 2 + }, + "set": [ + 123 + ], + "uuid": "00000000-0000-0000-0000-000000000000" + }, + { + "_key": 5, + "int": -5, + "bool": true, + "date": "1970-01-01 03:25:45", + "float": -123.456, + "double": -9876.54321, + "string": "string5", + "string_long": "string5 very long string.........", + "string_big_blobs": "", + "binary": "YmluYXJ5AA==", + "oid": "000000000000000000000000", + "decimal": "1.2345", + "integers": [], + "strings": [], + "dictionary": { + "a": 2 + }, + "set": [ + 123 + ], + "uuid": "00000000-0000-0000-0000-000000000000" + }, + { + "_key": 6, + "int": 6, + "bool": false, + "date": "1970-01-01 03:25:45", + "float": 123.456, + "double": 9876.54321, + "string": "string6", + "string_long": "string6 very long string.........", + "string_big_blobs": "string6 very long string......... 
big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", + "binary": "YmluYXJ5AA==", + "oid": "000000000000000000000000", + "decimal": "1.2345", + "integers": [ + -123 + ], + "strings": [ + "sub_-123" + ], + "dictionary": { + "a": 2 + }, + "set": [ + 123 + ], + "uuid": "00000000-0000-0000-0000-000000000000" + }, + { + "_key": 7, + "int": -7, + "bool": true, + "date": "1970-01-01 03:25:45", + "float": -123.456, + "double": -9876.54321, + "string": "string7", + "string_long": "string7 very long string.........", + "string_big_blobs": "", + "binary": "YmluYXJ5AA==", + "oid": "000000000000000000000000", + "decimal": "1.2345", + "integers": [ + -123, + -8761 + ], + "strings": [ + "sub_-123", + "sub_-8761" + ], + "dictionary": { + "a": 2 + }, + "set": [ + 123 + ], + "uuid": "00000000-0000-0000-0000-000000000000" + }, + { + "_key": 8, + "int": 8, + "bool": false, + "date": "1970-01-01 03:25:45", + "float": 123.456, + "double": 9876.54321, + "string": "string8", + "string_long": "string8 very long string.........", + "string_big_blobs": "string8 very long string......... 
big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", + "binary": "YmluYXJ5AA==", + "oid": "000000000000000000000000", + "decimal": "1.2345", + "integers": [ + -123, + 9749, + 19621 + ], + "strings": [ + "sub_-123", + "sub_9749", + "sub_19621" + ], + "dictionary": { + "a": 2 + }, + "set": [ + 123 + ], + "uuid": "00000000-0000-0000-0000-000000000000" + }, + { + "_key": 9, + "int": -9, + "bool": true, + "date": "1970-01-01 03:25:45", + "float": -123.456, + "double": -9876.54321, + "string": "string9", + "string_long": "string9 very long string.........", + "string_big_blobs": "", + "binary": "YmluYXJ5AA==", + "oid": "000000000000000000000000", + "decimal": "1.2345", + "integers": [ + -123, + -11229, + -22335, + -33441 + ], + "strings": [ + "sub_-123", + "sub_-11229", + "sub_-22335", + "sub_-33441" + ], + "dictionary": { + "a": 2 + }, + "set": [ + 123 + ], + "uuid": "00000000-0000-0000-0000-000000000000" + }, + { + "_key": 10, + "int": 10, + "bool": false, + "date": "1970-01-01 03:25:45", + "float": 123.456, + "double": 9876.54321, + "string": "string10", + "string_long": "string10 very long string.........", + "string_big_blobs": "string10 very long string......... 
big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", + "binary": "YmluYXJ5AA==", + "oid": "000000000000000000000000", + "decimal": "1.2345", + "integers": [], + "strings": [], + "dictionary": { + "a": 2 + }, + "set": [ + 123 + ], + "uuid": "00000000-0000-0000-0000-000000000000" + }, + { + "_key": 11, + "int": -11, + "bool": true, + "date": "1970-01-01 03:25:45", + "float": -123.456, + "double": -9876.54321, + "string": "string11", + "string_long": "string11 very long string.........", + "string_big_blobs": "", + "binary": "YmluYXJ5AA==", + "oid": "000000000000000000000000", + "decimal": "1.2345", + "integers": [ + -123 + ], + "strings": [ + "sub_-123" + ], + "dictionary": { + "a": 2 + }, + "set": [ + 123 + ], + "uuid": "00000000-0000-0000-0000-000000000000" + }, + { + "_key": 12, + "int": 12, + "bool": false, + "date": "1970-01-01 03:25:45", + "float": 123.456, + "double": 9876.54321, + "string": "string12", + "string_long": "string12 very long string.........", + "string_big_blobs": "string12 very long string......... 
big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", + "binary": "YmluYXJ5AA==", + "oid": "000000000000000000000000", + "decimal": "1.2345", + "integers": [ + -123, + 14685 + ], + "strings": [ + "sub_-123", + "sub_14685" + ], + "dictionary": { + "a": 2 + }, + "set": [ + 123 + ], + "uuid": "00000000-0000-0000-0000-000000000000" + }, + { + "_key": 13, + "int": -13, + "bool": true, + "date": "1970-01-01 03:25:45", + "float": -123.456, + "double": -9876.54321, + "string": "string13", + "string_long": "string13 very long string.........", + "string_big_blobs": "", + "binary": "YmluYXJ5AA==", + "oid": "000000000000000000000000", + "decimal": "1.2345", + "integers": [ + -123, + -16165, + -32207 + ], + "strings": [ + "sub_-123", + "sub_-16165", + "sub_-32207" + ], + "dictionary": { + "a": 2 + }, + "set": [ + 123 + ], + "uuid": "00000000-0000-0000-0000-000000000000" + }, + { + "_key": 14, + "int": 14, + "bool": false, + "date": "1970-01-01 03:25:45", + "float": 123.456, + "double": 9876.54321, + "string": "string14", + "string_long": "string14 very long string.........", + "string_big_blobs": "string14 very long string......... big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", + "binary": "YmluYXJ5AA==", + "oid": "000000000000000000000000", + "decimal": "1.2345", + "integers": [ + -123, + 17153, + 34429, + 51705 + ], + "strings": [ + "sub_-123", + "sub_17153", + "sub_34429", + "sub_51705" + ], + "dictionary": { + "a": 2 + }, + "set": [ + 123 + ], + "uuid": "00000000-0000-0000-0000-000000000000" + } +] diff --git a/test/expect_xjson.json b/test/expect_xjson.json index 0a551ccc102..4f035b9c82d 100644 --- a/test/expect_xjson.json +++ b/test/expect_xjson.json @@ -18,7 +18,6 @@ "string": "string0", "string_long": "string0 very long string.........", "string_big_blobs": "string0 very long string......... 
big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", - "string_enum": "enum1", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -69,7 +68,6 @@ "string": "string1", "string_long": "string1 very long string.........", "string_big_blobs": "", - "string_enum": "enum2", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -126,7 +124,6 @@ "string": "string2", "string_long": "string2 very long string.........", "string_big_blobs": "string2 very long string......... big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", - "string_enum": "enum3", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -187,7 +184,6 @@ "string": "string3", "string_long": "string3 very long string.........", "string_big_blobs": "", - "string_enum": "enum1", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -252,7 +248,6 @@ "string": "string4", "string_long": "string4 very long string.........", "string_big_blobs": "string4 very long string......... big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", - "string_enum": "enum2", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -321,7 +316,6 @@ "string": "string5", "string_long": "string5 very long string.........", "string_big_blobs": "", - "string_enum": "enum3", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -372,7 +366,6 @@ "string": "string6", "string_long": "string6 very long string.........", "string_big_blobs": "string6 very long string......... 
big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", - "string_enum": "enum1", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -429,7 +422,6 @@ "string": "string7", "string_long": "string7 very long string.........", "string_big_blobs": "", - "string_enum": "enum2", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -490,7 +482,6 @@ "string": "string8", "string_long": "string8 very long string.........", "string_big_blobs": "string8 very long string......... big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", - "string_enum": "enum3", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -555,7 +546,6 @@ "string": "string9", "string_long": "string9 very long string.........", "string_big_blobs": "", - "string_enum": "enum1", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -624,7 +614,6 @@ "string": "string10", "string_long": "string10 very long string.........", "string_big_blobs": "string10 very long string......... big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", - "string_enum": "enum2", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -675,7 +664,6 @@ "string": "string11", "string_long": "string11 very long string.........", "string_big_blobs": "", - "string_enum": "enum3", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -732,7 +720,6 @@ "string": "string12", "string_long": "string12 very long string.........", "string_big_blobs": "string12 very long string......... 
big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", - "string_enum": "enum1", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -793,7 +780,6 @@ "string": "string13", "string_long": "string13 very long string.........", "string_big_blobs": "", - "string_enum": "enum2", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -858,7 +844,6 @@ "string": "string14", "string_long": "string14 very long string.........", "string_big_blobs": "string14 very long string......... big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", - "string_enum": "enum3", "binary": { "$binary": { "base64": "YmluYXJ5AA==", diff --git a/test/expect_xjson_plus.json b/test/expect_xjson_plus.json index 4b2f456835e..c0bcf5616e9 100644 --- a/test/expect_xjson_plus.json +++ b/test/expect_xjson_plus.json @@ -18,7 +18,6 @@ "string": "string0", "string_long": "string0 very long string.........", "string_big_blobs": "string0 very long string......... big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", - "string_enum": "enum1", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -73,7 +72,6 @@ "string": "string1", "string_long": "string1 very long string.........", "string_big_blobs": "", - "string_enum": "enum2", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -134,7 +132,6 @@ "string": "string2", "string_long": "string2 very long string.........", "string_big_blobs": "string2 very long string......... 
big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", - "string_enum": "enum3", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -199,7 +196,6 @@ "string": "string3", "string_long": "string3 very long string.........", "string_big_blobs": "", - "string_enum": "enum1", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -268,7 +264,6 @@ "string": "string4", "string_long": "string4 very long string.........", "string_big_blobs": "string4 very long string......... big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", - "string_enum": "enum2", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -341,7 +336,6 @@ "string": "string5", "string_long": "string5 very long string.........", "string_big_blobs": "", - "string_enum": "enum3", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -396,7 +390,6 @@ "string": "string6", "string_long": "string6 very long string.........", "string_big_blobs": "string6 very long string......... big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", - "string_enum": "enum1", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -457,7 +450,6 @@ "string": "string7", "string_long": "string7 very long string.........", "string_big_blobs": "", - "string_enum": "enum2", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -522,7 +514,6 @@ "string": "string8", "string_long": "string8 very long string.........", "string_big_blobs": "string8 very long string......... 
big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", - "string_enum": "enum3", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -591,7 +582,6 @@ "string": "string9", "string_long": "string9 very long string.........", "string_big_blobs": "", - "string_enum": "enum1", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -664,7 +654,6 @@ "string": "string10", "string_long": "string10 very long string.........", "string_big_blobs": "string10 very long string......... big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", - "string_enum": "enum2", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -719,7 +708,6 @@ "string": "string11", "string_long": "string11 very long string.........", "string_big_blobs": "", - "string_enum": "enum3", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -780,7 +768,6 @@ "string": "string12", "string_long": "string12 very long string.........", "string_big_blobs": "string12 very long string......... big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", - "string_enum": "enum1", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -845,7 +832,6 @@ "string": "string13", "string_long": "string13 very long string.........", "string_big_blobs": "", - "string_enum": "enum2", "binary": { "$binary": { "base64": "YmluYXJ5AA==", @@ -914,7 +900,6 @@ "string": "string14", "string_long": "string14 very long string.........", "string_big_blobs": "string14 very long string......... 
big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs big blobs", - "string_enum": "enum3", "binary": { "$binary": { "base64": "YmluYXJ5AA==", diff --git a/test/fuzz_group.cpp b/test/fuzz_group.cpp index f4c4e3c3c26..f19f995d98f 100644 --- a/test/fuzz_group.cpp +++ b/test/fuzz_group.cpp @@ -79,7 +79,6 @@ enum INS { CREATE_TABLE_VIEW, COMPACT, IS_NULL, - ENUMERATE_COLUMN, COUNT }; @@ -597,19 +596,6 @@ void parse_and_apply_instructions(std::string& in, const std::string& path, std: t->remove_object_recursive(key); } } - else if (instr == ENUMERATE_COLUMN && wt->size() > 0) { - TableKey table_key = wt->get_table_keys()[get_next(s) % wt->size()]; - TableRef t = wt->get_table(table_key); - auto all_col_keys = t->get_column_keys(); - if (!all_col_keys.empty()) { - size_t ndx = get_next(s) % all_col_keys.size(); - ColKey col = all_col_keys[ndx]; - if (log) { - *log << "wt->get_table(" << table_key << ")->enumerate_string_column(" << col << ");\n"; - } - wt->get_table(table_key)->enumerate_string_column(col); - } - } else if (instr == COMMIT) { if (log) { *log << "wt->commit_and_continue_as_read();\n"; diff --git a/test/object-store/sync/client_reset.cpp b/test/object-store/sync/client_reset.cpp index 05352002d39..13f5fbb9177 100644 --- a/test/object-store/sync/client_reset.cpp +++ b/test/object-store/sync/client_reset.cpp @@ -1061,6 +1061,13 @@ TEST_CASE("sync: client reset", "[sync][pbs][client reset][baas]") { }, std::chrono::seconds(20)); } + // We can't be sure that the 'after' callback has been called yet + timed_sleeping_wait_for( + [&]() -> bool { + std::lock_guard lock(mtx); + return after_callback_invocations == 1; + }, + std::chrono::milliseconds(20)); auto session = test_app_session.sync_manager()->get_existing_session(local_config.path); if (session) { session->shutdown_and_wait(); diff --git a/test/realm-fuzzer/fuzz_engine.cpp b/test/realm-fuzzer/fuzz_engine.cpp index 896dbdceb4a..82951a6c5bf 100644 --- 
a/test/realm-fuzzer/fuzz_engine.cpp +++ b/test/realm-fuzzer/fuzz_engine.cpp @@ -151,9 +151,6 @@ void FuzzEngine::do_fuzz(FuzzConfigurator& cnf) else if (instr == Remove_Recursive && group.size() > 0) { fuzzer.remove_recursive(group, log, state); } - else if (instr == Enumerate_Column && group.size() > 0) { - fuzzer.enumerate_column(group, log, state); - } else if (instr == Commit) { fuzzer.commit(shared_realm, log); } diff --git a/test/realm-fuzzer/fuzz_object.cpp b/test/realm-fuzzer/fuzz_object.cpp index 80f28bb65dd..c1d66ea455a 100644 --- a/test/realm-fuzzer/fuzz_object.cpp +++ b/test/realm-fuzzer/fuzz_object.cpp @@ -324,20 +324,6 @@ void FuzzObject::remove_recursive(Group& group, FuzzLog& log, State& s) } } -void FuzzObject::enumerate_column(Group& group, FuzzLog& log, State& s) -{ - log << "FuzzObject::enumerate_column();\n"; - TableKey table_key = group.get_table_keys()[get_next_token(s) % group.size()]; - TableRef t = group.get_table(table_key); - auto all_col_keys = t->get_column_keys(); - if (!all_col_keys.empty()) { - size_t ndx = get_next_token(s) % all_col_keys.size(); - ColKey col = all_col_keys[ndx]; - log << "group.get_table(" << table_key << ")->enumerate_string_column(" << col << ");\n"; - group.get_table(table_key)->enumerate_string_column(col); - } -} - void FuzzObject::get_all_column_names(Group& group, FuzzLog& log) { log << "FuzzObject::get_all_column_names();\n"; diff --git a/test/realm-fuzzer/fuzz_object.hpp b/test/realm-fuzzer/fuzz_object.hpp index 469f76a109f..2fd31f79a7b 100644 --- a/test/realm-fuzzer/fuzz_object.hpp +++ b/test/realm-fuzzer/fuzz_object.hpp @@ -43,7 +43,6 @@ class FuzzObject { void set_obj(realm::Group& group, FuzzLog& log, State& s); void remove_obj(realm::Group& group, FuzzLog& log, State& s); void remove_recursive(realm::Group& group, FuzzLog& log, State& s); - void enumerate_column(realm::Group& group, FuzzLog& log, State& s); void get_all_column_names(realm::Group& group, FuzzLog& log); void commit(realm::SharedRealm 
shared_realm, FuzzLog& log); void rollback(realm::SharedRealm shared_realm, realm::Group& group, FuzzLog& log); diff --git a/test/realm-fuzzer/util.hpp b/test/realm-fuzzer/util.hpp index deb6946bdfb..1f1d6a5b754 100644 --- a/test/realm-fuzzer/util.hpp +++ b/test/realm-fuzzer/util.hpp @@ -52,8 +52,7 @@ enum Instruction { Create_Table_View = 20, Compact = 21, Is_Null = 22, - Enumerate_Column = 23, - Count = 24 + Count = 23 }; diff --git a/test/test_group.cpp b/test/test_group.cpp index c3c11c16ed5..27c94702224 100644 --- a/test/test_group.cpp +++ b/test/test_group.cpp @@ -916,53 +916,6 @@ TEST(Group_Close) Group from_mem(buffer); } -TEST(Group_Serialize_Optimized) -{ - // Create group with one table - Group to_mem; - TableRef table = to_mem.add_table("test"); - test_table_add_columns(table); - - for (size_t i = 0; i < 5; ++i) { - table->create_object().set_all("abd", 1, true, int(Mon)); - table->create_object().set_all("eftg", 2, true, int(Tue)); - table->create_object().set_all("hijkl", 5, true, int(Wed)); - table->create_object().set_all("mnopqr", 8, true, int(Thu)); - table->create_object().set_all("stuvxyz", 9, true, int(Fri)); - } - - ColKey col_string = table->get_column_keys()[0]; - table->enumerate_string_column(col_string); - -#ifdef REALM_DEBUG - to_mem.verify(); -#endif - - // Serialize to memory (we now own the buffer) - BinaryData buffer = to_mem.write_to_mem(); - - // Load the table - Group from_mem(buffer); - TableRef t = from_mem.get_table("test"); - - CHECK_EQUAL(4, t->get_column_count()); - - // Verify that original values are there - CHECK(*table == *t); - - // Add a row with a known (but unique) value - auto k = table->create_object().set_all("search_target", 9, true, int(Fri)).get_key(); - - const auto res = table->find_first_string(col_string, "search_target"); - CHECK_EQUAL(k, res); - -#ifdef REALM_DEBUG - to_mem.verify(); - from_mem.verify(); -#endif -} - - TEST(Group_Serialize_All) { // Create group with one table @@ -2508,5 +2461,172 @@ 
TEST(Group_ArrayCompression_Correctness_Random_Input) #endif } +TEST(Group_ArrayCompression_Strings) +{ + GROUP_TEST_PATH(path); + + // create a bunch of string related properties that are going to be compressed and verify write/read machinery + // and string correctness. + Group to_disk; + TableRef table = to_disk.add_table("test"); + auto col_key_string = table->add_column(type_String, "string"); + auto col_key_list_string = table->add_column_list(type_String, "list_strings"); + auto col_key_set_string = table->add_column_set(type_String, "set_strings"); + auto col_key_dict_string = table->add_column_dictionary(type_String, "dict_strings"); + auto obj = table->create_object(); + + + obj.set_any(col_key_string, {"Test"}); + auto list_s = obj.get_list(col_key_list_string); + auto set_s = obj.get_set(col_key_set_string); + auto dictionary_s = obj.get_dictionary(col_key_dict_string); + + std::string tmp{"aabbbcccaaaaddfwregfgklnjytojfs"}; + for (size_t i = 0; i < 10; ++i) { + list_s.add({tmp + std::to_string(i)}); + } + for (size_t i = 0; i < 10; ++i) { + set_s.insert({tmp + std::to_string(i)}); + } + for (size_t i = 0; i < 10; ++i) { + const auto key_value = tmp + std::to_string(i); + dictionary_s.insert({key_value}, {key_value}); + } + + CHECK(list_s.size() == 10); + CHECK(set_s.size() == 10); + CHECK(dictionary_s.size() == 10); + + // Serialize to disk (compression should happen when the proper leaf array is serialized to disk) + to_disk.write(path, crypt_key()); + +#ifdef REALM_DEBUG + to_disk.verify(); +#endif + + // Load the tables + Group from_disk(path, crypt_key()); + TableRef read_table = from_disk.get_table("test"); + auto obj1 = read_table->get_object(0); + + auto list_s1 = obj.get_list("list_strings"); + auto set_s1 = obj.get_set("set_strings"); + auto dictionary_s1 = obj.get_dictionary("dict_strings"); + + CHECK(obj1.get_any("string") == obj.get_any("string")); + + + CHECK(list_s1.size() == list_s.size()); + CHECK(set_s1.size() == set_s.size()); + 
CHECK(dictionary_s1.size() == dictionary_s.size()); + + CHECK(*read_table == *table); + + for (size_t i = 0; i < list_s1.size(); ++i) { + CHECK_EQUAL(list_s1.get_any(i), list_s.get_any(i)); + } + + for (size_t i = 0; i < set_s1.size(); ++i) { + CHECK_EQUAL(set_s1.get_any(i), set_s.get_any(i)); + } + + for (size_t i = 0; i < dictionary_s1.size(); ++i) { + CHECK_EQUAL(dictionary_s1.get_key(i), dictionary_s.get_key(i)); + CHECK_EQUAL(dictionary_s1.get_any(i), dictionary_s.get_any(i)); + } + +#ifdef REALM_DEBUG + from_disk.verify(); +#endif +} + +TEST(Test_Commit_Compression_Strings) +{ + auto generate_random_str_len = []() { + std::random_device rd; + std::mt19937 generator(rd()); + std::uniform_int_distribution<> distribution(1, 100); + return distribution(generator); + }; + + auto generate_random_string = [](size_t length) { + const std::string alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuv" + "wxyz0123456789"; + std::random_device rd; + std::mt19937 generator(rd()); + std::uniform_int_distribution<> distribution(0, (int)alphabet.size() - 1); + + std::string random_str; + for (size_t i = 0; i < length; ++i) + random_str += alphabet[distribution(generator)]; + + return random_str; + }; + + SHARED_GROUP_TEST_PATH(path); + auto hist = make_in_realm_history(); + DBRef db = DB::create(*hist, path); + ColKey col_key_string, col_key_list_string, col_key_set_string, col_key_dict_string; + ObjKey obj_key; + TableKey table_key; + + auto rt = db->start_read(); + { + WriteTransaction wt(db); + auto table = wt.add_table("test"); + table_key = table->get_key(); + col_key_string = table->add_column(type_String, "string"); + col_key_list_string = table->add_column_list(type_String, "list_strings"); + col_key_set_string = table->add_column_set(type_String, "set_strings"); + col_key_dict_string = table->add_column_dictionary(type_String, "dict_strings"); + Obj obj = table->create_object(); + obj_key = obj.get_key(); + wt.commit(); + } + // check verify that columns have 
been created + rt->advance_read(); + rt->verify(); + + // commit random strings in all the string based columns and verify interner updates + + for (size_t i = 0; i < 50; ++i) { + + // some string + const auto str = generate_random_string(generate_random_str_len()); + + rt = db->start_read(); + { + WriteTransaction wt(db); + auto table = wt.get_table(table_key); + auto obj = table->get_object(obj_key); + + obj.set_any(col_key_string, {str}); + auto list_s = obj.get_list(col_key_list_string); + auto set_s = obj.get_set(col_key_set_string); + auto dictionary_s = obj.get_dictionary(col_key_dict_string); + + list_s.add({str}); + set_s.insert({str}); + dictionary_s.insert({str}, {str}); + + wt.commit(); + } + rt->advance_read(); + rt->verify(); + + auto table = rt->get_table(table_key); + auto obj = table->get_object(obj_key); + const auto current_str = obj.get_any(col_key_string).get_string(); + CHECK_EQUAL(current_str, str); + + auto list_s = obj.get_list(col_key_list_string); + auto set_s = obj.get_set(col_key_set_string); + auto dictionary_s = obj.get_dictionary(col_key_dict_string); + + CHECK_EQUAL(list_s.get_any(i), str); + CHECK_NOT_EQUAL(set_s.find_any(str), not_found); + CHECK_NOT_EQUAL(dictionary_s.find_any(str), not_found); + } +} #endif // TEST_GROUP diff --git a/test/test_index_string.cpp b/test/test_index_string.cpp index 83ed8c342ec..4a588cd0666 100644 --- a/test/test_index_string.cpp +++ b/test/test_index_string.cpp @@ -150,13 +150,10 @@ class column { std::vector m_keys; }; - column(bool nullable = false, bool enumerated = false) + column(bool nullable = false) : m_column(this) { m_col_key = m_table.add_column(ColumnTypeTraits::id, "values", nullable); - if (enumerated) { - m_table.enumerate_string_column(m_col_key); - } } ColumnTestType& get_column() { @@ -172,62 +169,24 @@ class column { class string_column : public column { public: string_column() - : column(false, false) + : column(false) { } static bool is_nullable() { return false; } - static bool 
is_enumerated() - { - return false; - } }; class nullable_string_column : public column { public: nullable_string_column() - : column(true, false) - { - } - static bool is_nullable() - { - return true; - } - static bool is_enumerated() - { - return false; - } -}; -class enum_column : public column { -public: - enum_column() - : column(false, true) - { - } - static bool is_nullable() - { - return false; - } - static bool is_enumerated() - { - return true; - } -}; -class nullable_enum_column : public column { -public: - nullable_enum_column() - : column(true, true) + : column(true) { } static bool is_nullable() { return true; } - static bool is_enumerated() - { - return true; - } }; // disable to avoid warnings about not being used - enable when tests @@ -300,7 +259,7 @@ TEST(StringIndex_NonIndexable) } } -TEST_TYPES(StringIndex_BuildIndex, string_column, nullable_string_column, enum_column, nullable_enum_column) +TEST_TYPES(StringIndex_BuildIndex, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); @@ -331,7 +290,7 @@ TEST_TYPES(StringIndex_BuildIndex, string_column, nullable_string_column, enum_c CHECK_EQUAL(6, r6.value); } -TEST_TYPES(StringIndex_DeleteAll, string_column, nullable_string_column, enum_column, nullable_enum_column) +TEST_TYPES(StringIndex_DeleteAll, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); @@ -379,7 +338,7 @@ TEST_TYPES(StringIndex_DeleteAll, string_column, nullable_string_column, enum_co CHECK(ndx.is_empty()); } -TEST_TYPES(StringIndex_Delete, string_column, nullable_string_column, enum_column, nullable_enum_column) +TEST_TYPES(StringIndex_Delete, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); @@ -424,7 +383,7 @@ TEST_TYPES(StringIndex_Delete, string_column, 
nullable_string_column, enum_colum } -TEST_TYPES(StringIndex_ClearEmpty, string_column, nullable_string_column, enum_column, nullable_enum_column) +TEST_TYPES(StringIndex_ClearEmpty, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); @@ -437,7 +396,7 @@ TEST_TYPES(StringIndex_ClearEmpty, string_column, nullable_string_column, enum_c CHECK(ndx.is_empty()); } -TEST_TYPES(StringIndex_Clear, string_column, nullable_string_column, enum_column, nullable_enum_column) +TEST_TYPES(StringIndex_Clear, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); @@ -482,7 +441,7 @@ TEST_TYPES(StringIndex_Clear, string_column, nullable_string_column, enum_column } -TEST_TYPES(StringIndex_Set, string_column, nullable_string_column, enum_column, nullable_enum_column) +TEST_TYPES(StringIndex_Set, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); @@ -527,7 +486,7 @@ TEST_TYPES(StringIndex_Set, string_column, nullable_string_column, enum_column, CHECK_EQUAL(4, col.find_first(s6)); } -TEST_TYPES(StringIndex_Count, string_column, nullable_string_column, enum_column, nullable_enum_column) +TEST_TYPES(StringIndex_Count, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); @@ -559,7 +518,7 @@ TEST_TYPES(StringIndex_Count, string_column, nullable_string_column, enum_column CHECK_EQUAL(4, c4); } -TEST_TYPES(StringIndex_Distinct, string_column, nullable_string_column, enum_column, nullable_enum_column) +TEST_TYPES(StringIndex_Distinct, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); @@ -580,7 +539,7 @@ TEST_TYPES(StringIndex_Distinct, string_column, 
nullable_string_column, enum_col CHECK(ndx->has_duplicate_values()); } -TEST_TYPES(StringIndex_FindAllNoCopy, string_column, nullable_string_column, enum_column, nullable_enum_column) +TEST_TYPES(StringIndex_FindAllNoCopy, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); @@ -706,8 +665,7 @@ TEST(StringIndex_FindAllNoCopy2_IntNull) CHECK_EQUAL(results.payload, col.size() - 1); } -TEST_TYPES(StringIndex_FindAllNoCopyCommonPrefixStrings, string_column, nullable_string_column, enum_column, - nullable_enum_column) +TEST_TYPES(StringIndex_FindAllNoCopyCommonPrefixStrings, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); @@ -951,7 +909,7 @@ TEST_TYPES_IF(StringIndex_EmbeddedZeroesCombinations, TEST_DURATION > 1, string_ } // Tests for a bug with strings containing zeroes -TEST_TYPES(StringIndex_EmbeddedZeroes, string_column, nullable_string_column, enum_column, nullable_enum_column) +TEST_TYPES(StringIndex_EmbeddedZeroes, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col2 = test_resources.get_column(); @@ -982,10 +940,10 @@ TEST_TYPES(StringIndex_EmbeddedZeroes, string_column, nullable_string_column, en CHECK_EQUAL(f, null_key); } -TEST_TYPES(StringIndex_Null, nullable_string_column, nullable_enum_column) +TEST(StringIndex_Null) { - TEST_TYPE test_resources; - typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); + nullable_string_column test_resources; + auto& col = test_resources.get_column(); col.add(""); col.add(realm::null()); @@ -997,7 +955,7 @@ TEST_TYPES(StringIndex_Null, nullable_string_column, nullable_enum_column) } -TEST_TYPES(StringIndex_Zero_Crash, string_column, nullable_string_column, enum_column, nullable_enum_column) +TEST_TYPES(StringIndex_Zero_Crash, string_column, nullable_string_column) { bool nullable 
= TEST_TYPE::is_nullable(); @@ -1010,9 +968,6 @@ TEST_TYPES(StringIndex_Zero_Crash, string_column, nullable_string_column, enum_c auto k2 = table.create_object().set(col, StringData("\0\0", 2)).get_key(); table.add_search_index(col); - if (TEST_TYPE::is_enumerated()) - table.enumerate_string_column(col); - ObjKey t; t = table.find_first_string(col, StringData("")); @@ -1154,7 +1109,7 @@ TEST(StringIndex_Integer_Increasing) } } -TEST_TYPES(StringIndex_Duplicate_Values, string_column, nullable_string_column, enum_column, nullable_enum_column) +TEST_TYPES(StringIndex_Duplicate_Values, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); @@ -1222,7 +1177,7 @@ TEST_TYPES(StringIndex_Duplicate_Values, string_column, nullable_string_column, CHECK(col.size() == 0); } -TEST_TYPES(StringIndex_MaxBytes, string_column, nullable_string_column, enum_column, nullable_enum_column) +TEST_TYPES(StringIndex_MaxBytes, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); @@ -1267,7 +1222,7 @@ TEST_TYPES(StringIndex_MaxBytes, string_column, nullable_string_column, enum_col // for the characters at the end (they have an identical very // long prefix). This was causing a stack overflow because of // the recursive nature of the insert function. 
-TEST_TYPES(StringIndex_InsertLongPrefix, string_column, nullable_string_column, enum_column, nullable_enum_column) +TEST_TYPES(StringIndex_InsertLongPrefix, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); @@ -1346,8 +1301,7 @@ TEST_TYPES(StringIndex_InsertLongPrefix, string_column, nullable_string_column, col.clear(); // calls recursive function Array::destroy_deep() } -TEST_TYPES(StringIndex_InsertLongPrefixAndQuery, string_column, nullable_string_column, enum_column, - nullable_enum_column) +TEST_TYPES(StringIndex_InsertLongPrefixAndQuery, string_column, nullable_string_column) { constexpr int half_node_size = REALM_MAX_BPNODE_SIZE / 2; bool nullable_column = TEST_TYPE::is_nullable(); @@ -1379,8 +1333,6 @@ TEST_TYPES(StringIndex_InsertLongPrefixAndQuery, string_column, nullable_string_ index->to_dot(o, ""); } */ - if (TEST_TYPE::is_enumerated()) - t->enumerate_string_column(col); auto ndx_a = t->where().equal(col, StringData(str_a)).find(); auto cnt = t->count_string(col, StringData(str_a)); @@ -1514,7 +1466,7 @@ void check_result_order(const std::vector& results, TestContext& test_co } // end anonymous namespace -TEST_TYPES(StringIndex_Insensitive, string_column, nullable_string_column, enum_column, nullable_enum_column) +TEST_TYPES(StringIndex_Insensitive, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); @@ -1681,7 +1633,7 @@ TEST_TYPES(StringIndex_Insensitive_Unicode, non_nullable, nullable) */ -TEST_TYPES(StringIndex_45, string_column, nullable_string_column, enum_column, nullable_enum_column) +TEST_TYPES(StringIndex_45, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); @@ -1715,8 +1667,7 @@ std::string create_random_a_string(size_t max_len) // Excluded when run with valgrind because it takes a 
long time -TEST_TYPES_IF(StringIndex_Insensitive_Fuzz, TEST_DURATION > 1, string_column, nullable_string_column, enum_column, - nullable_enum_column) +TEST_TYPES_IF(StringIndex_Insensitive_Fuzz, TEST_DURATION > 1, string_column, nullable_string_column) { const size_t max_str_len = 9; const size_t iters = 3; @@ -1763,8 +1714,7 @@ TEST_TYPES_IF(StringIndex_Insensitive_Fuzz, TEST_DURATION > 1, string_column, nu // Exercise the StringIndex case insensitive search for strings with very long, common prefixes // to cover the special case code paths where different strings are stored in a list. -TEST_TYPES(StringIndex_Insensitive_VeryLongStrings, string_column, nullable_string_column, enum_column, - nullable_enum_column) +TEST_TYPES(StringIndex_Insensitive_VeryLongStrings, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); @@ -1800,7 +1750,7 @@ TEST_TYPES(StringIndex_Insensitive_VeryLongStrings, string_column, nullable_stri // Bug with case insensitive search on numbers that gives duplicate results -TEST_TYPES(StringIndex_Insensitive_Numbers, string_column, nullable_string_column, enum_column, nullable_enum_column) +TEST_TYPES(StringIndex_Insensitive_Numbers, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); @@ -1819,7 +1769,7 @@ TEST_TYPES(StringIndex_Insensitive_Numbers, string_column, nullable_string_colum } -TEST_TYPES(StringIndex_Rover, string_column, nullable_string_column, enum_column, nullable_enum_column) +TEST_TYPES(StringIndex_Rover, string_column, nullable_string_column) { TEST_TYPE test_resources; typename TEST_TYPE::ColumnTestType& col = test_resources.get_column(); diff --git a/test/test_json.cpp b/test/test_json.cpp index 39f61d2a981..c90aeea913a 100644 --- a/test/test_json.cpp +++ b/test/test_json.cpp @@ -93,7 +93,6 @@ void setup_multi_table(Table& table, size_t rows) 
table.add_column(type_String, "string"); // 5 table.add_column(type_String, "string_long"); // 6 ColKey col_string_big = table.add_column(type_String, "string_big_blobs"); // 7 - ColKey col_string_enum = table.add_column(type_String, "string_enum"); // 8 - becomes StringEnumColumn ColKey col_binary = table.add_column(type_Binary, "binary"); // 9 ColKey col_oid = table.add_column(type_ObjectId, "oid"); // 10 ColKey col_decimal = table.add_column(type_Decimal, "decimal"); // 11 @@ -128,17 +127,6 @@ void setup_multi_table(Table& table, size_t rows) obj.set(col_string_big, ""); break; } - switch (i % 3) { - case 0: - obj.set(col_string_enum, "enum1"); - break; - case 1: - obj.set(col_string_enum, "enum2"); - break; - case 2: - obj.set(col_string_enum, "enum3"); - break; - } obj.set(col_binary, BinaryData("binary", 7)); obj.set(col_oid, ObjectId()); obj.set(col_decimal, Decimal128("1.2345")); @@ -158,9 +146,6 @@ void setup_multi_table(Table& table, size_t rows) auto set = obj.get_set(col_set); set.insert(123); } - - // We also want a StringEnumColumn - table.enumerate_string_column(col_string_enum); } bool json_test(std::string json, std::string expected_file, bool generate) diff --git a/test/test_lang_bind_helper.cpp b/test/test_lang_bind_helper.cpp index 7bdd650d225..925eb7ec852 100644 --- a/test/test_lang_bind_helper.cpp +++ b/test/test_lang_bind_helper.cpp @@ -521,7 +521,7 @@ TEST(LangBindHelper_AdvanceReadTransact_Basics) rt->verify(); CHECK_EQUAL(0, rt->size()); - // Create a table via the other SharedGroup + // Create a table in a separate transaction ObjKey k0; { WriteTransaction wt(sg); @@ -542,7 +542,7 @@ TEST(LangBindHelper_AdvanceReadTransact_Basics) CHECK_EQUAL(0, foo->get_object(k0).get(cols[0])); uint_fast64_t version = foo->get_content_version(); - // Modify the table via the other SharedGroup + // Modify the table in a separate transaction ObjKey k1; { WriteTransaction wt(sg); @@ -989,54 +989,6 @@ 
TEST(LangBindHelper_AdvanceReadTransact_LinkColumnInNewTable) } -TEST(LangBindHelper_AdvanceReadTransact_EnumeratedStrings) -{ - SHARED_GROUP_TEST_PATH(path); - ShortCircuitHistory hist; - DBRef sg = DB::create(hist, path, DBOptions(crypt_key())); - ColKey c0, c1, c2; - - // Start a read transaction (to be repeatedly advanced) - auto rt = sg->start_read(); - CHECK_EQUAL(0, rt->size()); - - // Create 3 string columns, one primed for conversion to "unique string - // enumeration" representation - { - WriteTransaction wt(sg); - TableRef table_w = wt.add_table("t"); - c0 = table_w->add_column(type_String, "a"); - c1 = table_w->add_column(type_String, "b"); - c2 = table_w->add_column(type_String, "c"); - for (int i = 0; i < 1000; ++i) { - std::ostringstream out; - out << i; - std::string str = out.str(); - table_w->create_object(ObjKey{}, {{c0, str}, {c1, "foo"}, {c2, str}}); - } - wt.commit(); - } - rt->advance_read(); - rt->verify(); - ConstTableRef table = rt->get_table("t"); - CHECK_EQUAL(0, table->get_num_unique_values(c0)); - CHECK_EQUAL(0, table->get_num_unique_values(c1)); // Not yet "optimized" - CHECK_EQUAL(0, table->get_num_unique_values(c2)); - - // Optimize - { - WriteTransaction wt(sg); - TableRef table_w = wt.get_table("t"); - table_w->enumerate_string_column(c1); - wt.commit(); - } - rt->advance_read(); - rt->verify(); - CHECK_EQUAL(0, table->get_num_unique_values(c0)); - CHECK_NOT_EQUAL(0, table->get_num_unique_values(c1)); // Must be "optimized" now - CHECK_EQUAL(0, table->get_num_unique_values(c2)); -} - NONCONCURRENT_TEST_IF(LangBindHelper_AdvanceReadTransact_SearchIndex, testing_supports_spawn_process) { SHARED_GROUP_TEST_PATH(path); @@ -5593,28 +5545,6 @@ TEST(LangBindHelper_CopyOnWriteOverflow) } -TEST(LangBindHelper_RollbackOptimize) -{ - SHARED_GROUP_TEST_PATH(path); - const char* key = crypt_key(); - std::unique_ptr hist_w(make_in_realm_history()); - DBRef sg_w = DB::create(*hist_w, path, DBOptions(key)); - auto g = sg_w->start_write(); - - 
auto table = g->add_table("t0"); - auto col = table->add_column(type_String, "str_col_0", true); - g->commit_and_continue_as_read(); - g->verify(); - g->promote_to_write(); - g->verify(); - std::vector keys; - table->create_objects(198, keys); - table->enumerate_string_column(col); - g->rollback_and_continue_as_read(); - g->verify(); -} - - TEST(LangBindHelper_BinaryReallocOverMax) { SHARED_GROUP_TEST_PATH(path); @@ -5667,33 +5597,6 @@ TEST(LangBindHelper_OpenAsEncrypted) #endif -// Test case generated in [realm-core-4.0.4] on Mon Dec 18 13:33:24 2017. -// Adding 0 rows to a StringEnumColumn would add the default value to the keys -// but not the indexes creating an inconsistency. -TEST(LangBindHelper_EnumColumnAddZeroRows) -{ - SHARED_GROUP_TEST_PATH(path); - const char* key = nullptr; - std::unique_ptr hist(make_in_realm_history()); - DBRef sg = DB::create(*hist, path, DBOptions(key)); - auto g = sg->start_write(); - auto g_r = sg->start_read(); - auto table = g->add_table(""); - - auto col = table->add_column(DataType(2), "table", false); - table->enumerate_string_column(col); - g->commit_and_continue_as_read(); - g->verify(); - g->promote_to_write(); - g->verify(); - table->create_object(); - g->commit_and_continue_as_read(); - g_r->advance_read(); - g_r->verify(); - g->verify(); -} - - TEST(LangBindHelper_RemoveObject) { SHARED_GROUP_TEST_PATH(path); diff --git a/test/test_links.cpp b/test/test_links.cpp index 7561364089b..6e1eed71f47 100644 --- a/test/test_links.cpp +++ b/test/test_links.cpp @@ -752,17 +752,6 @@ TEST(ListList_Clear) CHECK_EQUAL(links2->size(), 0); } -TEST(Links_AddBacklinkToTableWithEnumColumns) -{ - Group g; - auto table = g.add_table("fshno"); - auto col = table->add_column(type_String, "strings", false); - table->create_object(); - table->add_column(*table, "link1"); - table->enumerate_string_column(col); - table->add_column(*table, "link2"); -} - TEST(Links_LinkList_Inserts) { Group group; diff --git a/test/test_list.cpp 
b/test/test_list.cpp index d8e3f1fc1de..f3c5dd0b938 100644 --- a/test/test_list.cpp +++ b/test/test_list.cpp @@ -108,9 +108,12 @@ TEST(List_basic) TEST(List_SimpleTypes) { - Group g; + SHARED_GROUP_TEST_PATH(path); + DBRef db = DB::create(make_in_realm_history(), path); + + auto tr = db->start_write(); std::vector lists; - TableRef t = g.add_table("table"); + TableRef t = tr->add_table("table"); ColKey int_col = t->add_column_list(type_Int, "integers"); ColKey bool_col = t->add_column_list(type_Bool, "booleans"); ColKey string_col = t->add_column_list(type_String, "strings"); @@ -135,6 +138,9 @@ TEST(List_SimpleTypes) Timestamp(seconds_since_epoc + 60, 0)}; obj.set_list_values(timestamp_col, timestamp_vector); + tr->commit_and_continue_as_read(); + tr->promote_to_write(); + auto int_list = obj.get_list(int_col); lists.push_back(&int_list); std::vector vec(int_list.size()); diff --git a/test/test_query.cpp b/test/test_query.cpp index 1c380e409cb..37b109bfaad 100644 --- a/test/test_query.cpp +++ b/test/test_query.cpp @@ -329,11 +329,13 @@ See TEST(StringData_Substrings) for more unit tests for null, isolated to using columns or queries involved */ - -TEST(Query_NextGen_StringConditions) +TEST_TYPES(Query_NextGen_StringConditions, std::true_type, std::false_type) { - Group group; - TableRef table1 = group.add_table("table1"); + SHARED_GROUP_TEST_PATH(path); + + auto db = DB::create(make_in_realm_history(), path); + auto wt = db->start_write(); + TableRef table1 = wt->add_table("table1"); auto col_str1 = table1->add_column(type_String, "str1"); auto col_str2 = table1->add_column(type_String, "str2"); @@ -342,6 +344,11 @@ TEST(Query_NextGen_StringConditions) table1->create_object().set_all("!", "x").get_key(); ObjKey key_1_2 = table1->create_object().set_all("bar", "r").get_key(); + if (TEST_TYPE::value) { + wt->commit_and_continue_as_read(); + wt->promote_to_write(); + } + ObjKey m; // Equal m = table1->column(col_str1).equal("bar", false).find(); @@ -433,7 +440,7 @@ 
TEST(Query_NextGen_StringConditions) CHECK_EQUAL(m, null_key); // Test various compare operations with null - TableRef table2 = group.add_table("table2"); + TableRef table2 = wt->add_table("table2"); auto col_str3 = table2->add_column(type_String, "str3", true); ObjKey key_2_0 = table2->create_object().set(col_str3, "foo").get_key(); @@ -442,6 +449,11 @@ TEST(Query_NextGen_StringConditions) ObjKey key_2_3 = table2->create_object().set(col_str3, "bar").get_key(); ObjKey key_2_4 = table2->create_object().set(col_str3, "").get_key(); + if (TEST_TYPE::value) { + wt->commit_and_continue_as_read(); + wt->promote_to_write(); + } + size_t cnt; cnt = table2->column(col_str3).contains(StringData("")).count(); CHECK_EQUAL(cnt, 4); @@ -522,6 +534,12 @@ TEST(Query_NextGen_StringConditions) } }; + // not equal + check_results((table2->column(col_str3) != StringData("")), {StringData(), "foo", "bar", "!"}); + check_results((table2->column(col_str3) != StringData()), {"", "foo", "bar", "!"}); + check_results((table2->column(col_str3) != StringData("foo")), {StringData(), "", "bar", "!"}); + check_results((table2->column(col_str3) != StringData("barr")), {StringData(), "", "foo", "bar", "!"}); + // greater check_results((table2->column(col_str3) > StringData("")), {"foo", "bar", "!"}); check_results((table2->column(col_str3) > StringData("b")), {"foo", "bar"}); @@ -553,7 +571,7 @@ TEST(Query_NextGen_StringConditions) check_results((table2->column(col_str3) <= StringData("barrrr")), {"bar", "", "!", StringData()}); check_results((table2->column(col_str3) <= StringData("z")), {"foo", "bar", "", "!", StringData()}); - TableRef table3 = group.add_table(StringData("table3")); + TableRef table3 = wt->add_table(StringData("table3")); auto col_link1 = table3->add_column(*table2, "link1"); table3->create_object().set(col_link1, key_2_0); @@ -562,6 +580,11 @@ TEST(Query_NextGen_StringConditions) table3->create_object().set(col_link1, key_2_3); table3->create_object().set(col_link1, key_2_4); 
+ if (TEST_TYPE::value) { + wt->commit_and_continue_as_read(); + wt->promote_to_write(); + } + cnt = table3->link(col_link1).column(col_str3).contains(StringData("")).count(); CHECK_EQUAL(cnt, 4); @@ -638,8 +661,14 @@ TEST(Query_NextGen_StringConditions) "This is a long search string that does not contain the word being searched for!, " "This is a long search string that does not contain the word being searched for!, " "needle"; + table2->create_object().set(col_str3, long_string).get_key(); + if (TEST_TYPE::value) { + wt->commit_and_continue_as_read(); + wt->promote_to_write(); + } + cnt = table2->column(col_str3).contains(search_1, false).count(); CHECK_EQUAL(cnt, 1); @@ -2124,15 +2153,12 @@ TEST_TYPES(Query_ListOfPrimitives_MinMax, int64_t, float, double, Decimal128, Ti validate_aggregate_results(test_context, table, col, value, max); } -TEST_TYPES(Query_StringIndexCommonPrefix, std::true_type, std::false_type) +TEST(Query_StringIndexCommonPrefix) { Group group; TableRef table = group.add_table("test"); auto col_str = table->add_column(type_String, "first"); table->add_search_index(col_str); - if (TEST_TYPE::value == true) { - table->enumerate_string_column(col_str); - } auto test_prefix_find = [&](std::string prefix) { std::string prefix_b = prefix + "b"; @@ -2822,8 +2848,6 @@ TEST(Query_Huge) for (size_t t = 0; t < 4; t++) { if (t == 1) { - tt.enumerate_string_column(col_str0); - tt.enumerate_string_column(col_str1); } else if (t == 2) { tt.add_search_index(col_str0); @@ -3007,8 +3031,6 @@ TEST_IF(Query_StrIndex3, TEST_DURATION > 0) for (size_t t = 0; t < vec.size(); t++) CHECK_EQUAL(vec[t], v.get_key(t)); - ttt.enumerate_string_column(col_str); - // Linear scan over enum, plus linear integer column scan v = ttt.where().equal(col_str, "AA").equal(col_int, 0).find_all(); CHECK_EQUAL(vec.size(), v.size()); @@ -3059,34 +3081,6 @@ TEST(Query_StrIndex2) CHECK_EQUAL(0, s); } -TEST(Query_StrEnum) -{ - Random random(random_int()); // Seed from slow global generator - 
Table ttt; - ttt.add_column(type_Int, "1"); - auto col_str = ttt.add_column(type_String, "2"); - - int aa; - int64_t s; - - for (int i = 0; i < 100; ++i) { - ttt.clear(); - aa = 0; - for (size_t t = 0; t < REALM_MAX_BPNODE_SIZE * 2; ++t) { - if (random.chance(1, 3)) { - ttt.create_object().set_all(1, "AA"); - ++aa; - } - else { - ttt.create_object().set_all(1, "BB"); - } - } - ttt.enumerate_string_column(col_str); - s = ttt.where().equal(col_str, "AA").count(); - CHECK_EQUAL(aa, s); - } -} - TEST(Query_StrIndex) { Random random(random_int()); // Seed from slow global generator @@ -3121,10 +3115,6 @@ TEST(Query_StrIndex) s = ttt.where().equal(str_col, "AA").count(); CHECK_EQUAL(aa, s); - ttt.enumerate_string_column(str_col); - s = ttt.where().equal(str_col, "AA").count(); - CHECK_EQUAL(aa, s); - ttt.add_search_index(str_col); s = ttt.where().equal(str_col, "AA").count(); CHECK_EQUAL(aa, s); @@ -3212,49 +3202,6 @@ TEST(Query_StrIndexUpdating) CHECK_EQUAL(tv_ins.size(), 0); } -TEST(Query_GA_Crash) -{ - GROUP_TEST_PATH(path); - Random random(random_int()); // Seed from slow global generator - { - Group g; - TableRef t = g.add_table("firstevents"); - auto col_str0 = t->add_column(type_String, "1"); - auto col_str1 = t->add_column(type_String, "2"); - auto col_str2 = t->add_column(type_String, "3"); - t->add_column(type_Int, "4"); - t->add_column(type_Int, "5"); - - for (size_t i = 0; i < 100; ++i) { - int64_t r1 = random.draw_int_mod(100); - int64_t r2 = random.draw_int_mod(100); - - t->create_object().set_all("10", "US", "1.0", r1, r2); - } - t->enumerate_string_column(col_str0); - t->enumerate_string_column(col_str1); - t->enumerate_string_column(col_str2); - g.write(path); - } - - Group g(path); - TableRef t = g.get_table("firstevents"); - auto col_str1 = t->get_column_key("2"); - - Query q = t->where().equal(col_str1, "US"); - - size_t c1 = 0; - for (size_t i = 0; i < 100; ++i) - c1 += t->count_string(col_str1, "US"); - - size_t c2 = 0; - for (size_t i = 0; i < 100; 
++i) - c2 += q.count(); - - CHECK_EQUAL(c1, t->size() * 100); - CHECK_EQUAL(c1, c2); -} - TEST(Query_Float3) { Table t; @@ -3563,7 +3510,7 @@ TEST(Query_DoubleCoordinates) } -TEST_TYPES(Query_StrIndexed, std::true_type, std::false_type) +TEST(Query_StrIndexed) { Table ttt; auto col_int = ttt.add_column(type_Int, "1"); @@ -3578,10 +3525,6 @@ TEST_TYPES(Query_StrIndexed, std::true_type, std::false_type) ttt.create_object().set_all(4, "c"); } - if (TEST_TYPE::value == true) { - ttt.enumerate_string_column(col_str); - } - ttt.add_search_index(col_str); auto s = *ttt.where().equal(col_str, "a").sum(col_int); @@ -4164,7 +4107,6 @@ TEST(Query_LinkChainSortErrors) CHECK_LOGIC_ERROR(t1->get_sorted_view(SortDescriptor({{t1_linklist_col}})), ErrorCodes::InvalidSortDescriptor); } - TEST(Query_EmptyDescriptors) { Group g; @@ -5563,6 +5505,7 @@ TEST(Query_LinkToDictionary) { Group g; auto target = g.add_table("target"); + target->add_column(type_Int, "dummy"); // Ensure that dict_col get index 1 auto dict_col = target->add_column_dictionary(type_String, "string", true); auto source = g.add_table("source"); auto link_col = source->add_column(*target, "link"); @@ -5829,4 +5772,339 @@ TEST_TYPES(Query_IntCompressed, Equal, NotEqual, Less, LessEqual, Greater, Great } } +// Many of our tests just test the correctness of sorting strings. +// For compressed strings we can use the string ids to perform the +// same task. We just need to commit first and then run the query. +// These tests are mainly verifying the following: +// +// 1. Store N strings inside a Mixed. Commit and sort. +// 2. Store inside a Mixed integers and Strings. Commit and sort +// 3. Store N strings in compressed format inside a Mixed property, store another N strings uncompressed in another +// column. Sort using both columns. +// 4. Store N compressed strings inside a table, linked by another table. Sort over links. + +// Note: Strings and Mixed use the same logic for compressing strings. 
Thus these tests are solely using Mixed +// columns. + + +static int gen_random_int(int min = 1, int max = 100) +{ + std::random_device rd; + std::mt19937 generator(rd()); + std::uniform_int_distribution<> distribution(min, max); + return distribution(generator); +} + +static std::string gen_random_string(size_t length) +{ + const std::string alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuv" + "wxyz0123456789"; + std::random_device rd; + std::mt19937 generator(rd()); + std::uniform_int_distribution<> distribution(0, (int)alphabet.size() - 1); + + std::string random_str; + for (size_t i = 0; i < length; ++i) + random_str += alphabet[distribution(generator)]; + + return random_str; +} + +TEST_TYPES(CompressedStrings_Sort, std::true_type, std::false_type) +{ + constexpr size_t N = 10; + using type = typename TEST_TYPE::type; + SHARED_GROUP_TEST_PATH(path); + std::unique_ptr hist(make_in_realm_history()); + DBRef sg = DB::create(*hist, path, DBOptions(crypt_key())); + TransactionRef rt = sg->start_read(); + + auto commit = [&sg](auto f) { + WriteTransaction wt(sg); + f(wt); + wt.commit(); + }; + + commit([&](auto& wt) { + auto t = wt.add_table("Table"); + t->add_column(type_Mixed, "any"); + }); + rt->advance_read(); + + std::vector strings; + for (size_t i = 0; i < N; ++i) { + const auto size = gen_random_int(); + strings.push_back(gen_random_string(size)); + } + + commit([&](auto& wt) { + TableRef t = wt.get_table("Table"); + ColKey col = t->get_column_key("any"); + for (auto& s : strings) { + t->create_object().set(col, Mixed{s}); + } + }); + rt->advance_read(); + + // sort and verify + TableRef table = rt->get_table("Table"); + ColKey col_key = table->get_column_key("any"); + bool ascending = type::value; + + auto cmp = [&ascending](auto& s1, auto& s2) { + Mixed m1{s1}; + Mixed m2{s2}; + return ascending ? 
m1 < m2 : m1 > m2; + }; + + std::sort(strings.begin(), strings.end(), cmp); + + TableView tv = table->where().find_all(); + tv.sort(col_key, ascending); + CHECK(tv.size() == strings.size()); + for (size_t i = 0; i < N; ++i) { + CHECK_EQUAL(tv[i].get_any(col_key), strings[i]); + } +} + +TEST_TYPES(CompressedStringsAndOtherMixed_Sort, std::true_type, std::false_type) +{ + constexpr size_t N = 10; + using type = typename TEST_TYPE::type; + SHARED_GROUP_TEST_PATH(path); + std::unique_ptr hist(make_in_realm_history()); + DBRef sg = DB::create(*hist, path, DBOptions(crypt_key())); + TransactionRef rt = sg->start_read(); + + auto commit = [&sg](auto f) { + WriteTransaction wt(sg); + f(wt); + wt.commit(); + }; + + commit([&](auto& wt) { + auto t = wt.add_table("Table"); + t->add_column(type_Mixed, "any"); + }); + rt->advance_read(); + + std::vector strings; + std::vector ints; + for (size_t i = 0; i < N; ++i) { + const auto size = gen_random_int(); + ints.push_back(size); + strings.push_back(gen_random_string(size)); + } + + commit([&](auto& wt) { + TableRef t = wt.get_table("Table"); + ColKey col_any = t->get_column_key("any"); + for (auto& s : strings) { + Obj obj = t->create_object(); + obj.set(col_any, Mixed{s}); + } + for (auto i : ints) { + Obj obj = t->create_object(); + obj.set(col_any, Mixed{i}); + } + }); + rt->advance_read(); + + // sort and verify + TableRef table = rt->get_table("Table"); + ColKey col_key = table->get_column_key("any"); + bool ascending = type::value; + + auto cmp = [&ascending](Mixed m1, Mixed m2) { + return ascending ? 
m1 < m2 : m1 > m2; + }; + + std::vector data; + for (auto& str : strings) + data.push_back(Mixed{str}); + for (auto i : ints) + data.push_back(Mixed{i}); + + std::sort(data.begin(), data.end(), cmp); + + TableView tv = table->where().find_all(); + tv.sort(col_key, ascending); + CHECK(tv.size() == data.size()); + for (size_t i = 0; i < data.size(); ++i) { // check all 2 * N rows (strings and ints), not just the first N + CHECK_EQUAL(tv[i].get_any(col_key), data[i]); + } +} + +TEST_TYPES(CompressedStrings_CompressedAndUncompressedStringColumns, std::true_type, std::false_type) +{ + constexpr size_t N = 10; + using type = typename TEST_TYPE::type; + SHARED_GROUP_TEST_PATH(path); + std::unique_ptr hist(make_in_realm_history()); + DBRef sg = DB::create(*hist, path, DBOptions(crypt_key())); + TransactionRef rt = sg->start_read(); + + auto commit = [&sg](auto f) { + WriteTransaction wt(sg); + f(wt); + wt.commit(); + }; + + commit([&](auto& wt) { + auto t = wt.add_table("Table"); + t->add_column(type_Mixed, "any_compressed"); + t->add_column(type_Mixed, "any_uncompressed"); + }); + rt->advance_read(); + + std::vector strings; + for (size_t i = 0; i < N; ++i) { + const auto size = gen_random_int(); + strings.push_back(gen_random_string(size)); + } + + commit([&](auto& wt) { + TableRef t = wt.get_table("Table"); + ColKey col = t->get_column_key("any_compressed"); + for (auto& s : strings) { + t->create_object().set(col, Mixed{s}); + } + }); + // any_compressed Mixed is now using compressed strings + + rt->advance_read(); + + commit([&](auto& wt) { + TableRef t = wt.get_table("Table"); + ColKey col_compressed = t->get_column_key("any_compressed"); + ColKey col_uncompressed = t->get_column_key("any_uncompressed"); + + // add N new objects but as long as we don't commit these strings + // will be in uncompressed format + for (auto& s : strings) { + t->create_object().set(col_uncompressed, Mixed{s}); + } + + // sort and verify both columns + bool ascending = type::value; + auto cmp = [&ascending](auto& s1, auto& s2) { + Mixed m1{s1}; + Mixed m2{s2}; + 
return ascending ? m1 < m2 : m1 > m2; + }; + std::sort(strings.begin(), strings.end(), cmp); + + TableView tv = t->where().find_all(); + + tv.sort(SortDescriptor({{col_compressed}, {col_uncompressed}}, {ascending, ascending})); + + CHECK(tv.size() == strings.size() * 2); + for (size_t i = 0; i < 2 * N; ++i) { + auto compressed_str = tv[i].get_any(col_compressed); + auto uncompressed_str = tv[i].get_any(col_uncompressed); + if (!compressed_str.is_null()) { + CHECK_EQUAL(compressed_str, strings[i % N]); + } + if (!uncompressed_str.is_null()) { + CHECK_EQUAL(uncompressed_str, strings[i % N]); + } + } + }); + // after this point both columns will be in compressed format +} + +TEST_TYPES(CompressedStrings_SortOverLinks, std::true_type, std::false_type) +{ + constexpr size_t N = 10; + using type = typename TEST_TYPE::type; + SHARED_GROUP_TEST_PATH(path); + std::unique_ptr hist(make_in_realm_history()); + DBRef sg = DB::create(*hist, path, DBOptions(crypt_key())); + TransactionRef rt = sg->start_read(); + + auto commit = [&sg](auto f) { + WriteTransaction wt(sg); + f(wt); + wt.commit(); + }; + + commit([&](auto& wt) { + auto t = wt.add_table("Table"); + auto o = wt.add_table("Other"); + // store N ints in T.any + t->add_column(type_Mixed, "any"); + // link O to T + t->add_column(*o, "link"); + // store N compressed strings in O.any + o->add_column(type_Mixed, "any"); + }); + rt->advance_read(); + + std::vector ints; + std::vector strings; + for (size_t i = 0; i < N; ++i) { + const auto size = gen_random_int(); + ints.push_back(size); + strings.push_back(gen_random_string(size)); + } + + commit([&](auto& wt) { + // store N strings in O + TableRef table = wt.get_table("Other"); + ColKey col = table->get_column_key("any"); + for (const auto& s : strings) { + table->create_object().set(col, Mixed{s}); + } + }); + + rt->advance_read(); + + commit([&](auto& wt) { + TableRef table = wt.get_table("Table"); + TableRef other = wt.get_table("Other"); + + ColKey col = 
table->get_column_key("any"); + ColKey link = table->get_column_key("link"); + + // set N ints + for (auto i : ints) + table->create_object().set(col, Mixed{i}); + + // set N links to Other's objects storing compressed strings. + size_t i = 0; + for (Obj o : *other) + table->get_object(i++).set(link, o.get_key()); + }); + // any Mixed that contains strings is now pointing to a compressed string + rt->advance_read(); + + std::vector> data; + for (size_t i = 0; i < N; ++i) { + auto p = std::make_pair(Mixed{ints[i]}, Mixed{strings[i]}); + data.push_back(p); + } + + bool ascending = type::value; + auto cmp = [&ascending](auto& p1, auto& p2) { + // sort based on strings + Mixed m1 = p1.second; + Mixed m2 = p2.second; + return ascending ? m1 < m2 : m1 > m2; + }; + std::sort(data.begin(), data.end(), cmp); + + TableRef table = rt->get_table("Table"); + TableRef other = rt->get_table("Other"); + ColKey t_any = table->get_column_key("any"); + ColKey link = table->get_column_key("link"); + ColKey o_any = other->get_column_key("any"); + + TableView tv = table->where().find_all(); + tv.sort(SortDescriptor({{link, o_any}}, {ascending})); + CHECK(tv.size() == data.size()); + for (size_t i = 0; i < N; ++i) { + CHECK_EQUAL(tv[i].get_any(t_any), data[i].first); + } +} + #endif // TEST_QUERY diff --git a/test/test_query2.cpp b/test/test_query2.cpp index b0dc4f75165..462fbcfc3dc 100644 --- a/test/test_query2.cpp +++ b/test/test_query2.cpp @@ -661,35 +661,6 @@ TEST(Query_Binary) } } -TEST(Query_Enums) -{ - Table table; - auto col_int = table.add_column(type_Int, "1"); - auto col_str = table.add_column(type_String, "2"); - - - for (size_t i = 0; i < 5; ++i) { - table.create_object().set_all(1, "abd"); - table.create_object().set_all(2, "eftg"); - table.create_object().set_all(5, "hijkl"); - table.create_object().set_all(8, "mnopqr"); - table.create_object().set_all(9, "stuvxyz"); - } - - table.enumerate_string_column(col_str); - - Query q1 = table.where().equal(col_str, "eftg"); - 
TableView tv1 = q1.find_all(); - - CHECK_EQUAL(5, tv1.size()); - CHECK_EQUAL(2, tv1[0].get(col_int)); - CHECK_EQUAL(2, tv1[1].get(col_int)); - CHECK_EQUAL(2, tv1[2].get(col_int)); - CHECK_EQUAL(2, tv1[3].get(col_int)); - CHECK_EQUAL(2, tv1[4].get(col_int)); -} - - TEST_TYPES(Query_CaseSensitivity, std::true_type, std::false_type) { constexpr bool nullable = TEST_TYPE::value; @@ -1422,29 +1393,16 @@ TEST(Query_NullStrings) TEST(Query_Nulls_Fuzzy) { - for (int attributes = 1; attributes < 5; attributes++) { + for (int attributes = 1; attributes < 3; attributes++) { Random random(random_int()); for (size_t t = 0; t < 10; t++) { Table table; auto col = table.add_column(type_String, "string", true); - if (attributes == 0) { - } if (attributes == 1) { table.add_search_index(col); } - else if (attributes == 2) { - table.enumerate_string_column(col); - } - else if (attributes == 3) { - table.add_search_index(col); - table.enumerate_string_column(col); - } - else if (attributes == 4) { - table.enumerate_string_column(col); - table.add_search_index(col); - } // map that is kept in sync with the column so that we can compare with it std::map v; diff --git a/test/test_shared.cpp b/test/test_shared.cpp index df9ae2f778d..c8f75c47fed 100644 --- a/test/test_shared.cpp +++ b/test/test_shared.cpp @@ -2328,6 +2328,89 @@ TEST(Shared_KeyWithNulBytes) #endif // REALM_ENABLE_ENCRYPTION +TEST(Shared_MaxStrings) +{ + SHARED_GROUP_TEST_PATH(path); + DBRef sg = get_test_db(path); + auto trans = sg->start_write(); + auto t = trans->add_table("MyTable"); + ColKey ck = t->add_column(type_String, "MyStrings"); + std::string str_a(16 * 1024 * 1024 - 257, 'a'); + std::string str_b(16 * 1024 * 1024 - 257, 'b'); + // make it harder to compress: + for (auto& e : str_a) { + e = std::rand() % 256; + } + for (auto& e : str_b) { + e = std::rand() % 256; + } + auto o = t->create_object(); + o.set(ck, str_a); + trans->commit_and_continue_as_read(); + auto v = o.get(ck); + CHECK_EQUAL(str_a, v); + 
trans->promote_to_write(); + auto o2 = t->create_object(); + o2.set(ck, str_b); + trans->commit_and_continue_as_read(); + v = o.get(ck); + auto v2 = o2.get(ck); + CHECK_EQUAL(v, str_a); + CHECK_EQUAL(v2, str_b); + trans->close(); + sg.reset(); +} + +TEST(Shared_RandomMaxStrings) +{ + + SHARED_GROUP_TEST_PATH(path); + DBRef sg = get_test_db(path); + auto trans = sg->start_write(); + auto t = trans->add_table("MyTable"); + ColKey ck = t->add_column(type_String, "MyStrings"); + trans->commit_and_continue_as_read(); + for (int run = 0; run < 10; ++run) { + trans->promote_to_write(); + size_t str_length = std::rand() % (16 * 1024 * 1024 - 257); + std::string str(str_length, 'X'); + for (auto& e : str) { + e = std::rand() % 256; + } + auto o = t->create_object(); + o.set(ck, str); + trans->commit_and_continue_as_read(); + } + trans->close(); +} + +TEST(Shared_RandomSmallStrings) +{ + + SHARED_GROUP_TEST_PATH(path); + DBRef sg = get_test_db(path); + // std::cout << "Writing " << path << std::endl; + auto trans = sg->start_write(); + auto t = trans->add_table("MyTable"); + ColKey ck = t->add_column(type_String, "MyStrings"); + trans->commit_and_continue_as_read(); + std::string str(500, 'X'); + // insert 100 * 1000 = 100,000 objects; each string differs from the previous one in at most one character + for (int run = 0; run < 100; ++run) { + trans->promote_to_write(); + for (int i = 0; i < 1000; ++i) { + // size_t str_length = std::rand() % (1 + 500); + // std::string str(str_length, 'X'); + size_t offset = std::rand() % str.size(); + str[offset] = 'a' + (std::rand() & 0x7); + auto o = t->create_object(); + o.set(ck, str); + } + trans->commit_and_continue_as_read(); + } + trans->close(); +} + TEST(Shared_VersionCount) { SHARED_GROUP_TEST_PATH(path); @@ -2488,10 +2571,6 @@ TEST(Shared_MovingSearchIndex) obj.set(enum_col, "bar"); } table->get_object(obj_keys.back()).set(enum_col, "bar63"); - table->enumerate_string_column(enum_col); - CHECK_EQUAL(0, table->get_num_unique_values(int_col)); - CHECK_EQUAL(0, 
table->get_num_unique_values(str_col)); - CHECK_EQUAL(2, table->get_num_unique_values(enum_col)); table->add_search_index(int_col); table->add_search_index(str_col); @@ -2508,14 +2587,12 @@ TEST(Shared_MovingSearchIndex) // Remove the padding column to shift the indexed columns { WriteTransaction wt(sg); + wt.get_group().verify(); TableRef table = wt.get_table("foo"); CHECK(table->has_search_index(int_col)); CHECK(table->has_search_index(str_col)); CHECK(table->has_search_index(enum_col)); - CHECK_EQUAL(0, table->get_num_unique_values(int_col)); - CHECK_EQUAL(0, table->get_num_unique_values(str_col)); - CHECK_EQUAL(2, table->get_num_unique_values(enum_col)); CHECK_EQUAL(ObjKey(), table->find_first_int(int_col, 100)); CHECK_EQUAL(ObjKey(), table->find_first_string(str_col, "bad")); CHECK_EQUAL(ObjKey(), table->find_first_string(enum_col, "bad")); @@ -2529,9 +2606,6 @@ TEST(Shared_MovingSearchIndex) CHECK(table->has_search_index(int_col)); CHECK(table->has_search_index(str_col)); CHECK(table->has_search_index(enum_col)); - CHECK_EQUAL(0, table->get_num_unique_values(int_col)); - CHECK_EQUAL(0, table->get_num_unique_values(str_col)); - CHECK_EQUAL(2, table->get_num_unique_values(enum_col)); CHECK_EQUAL(ObjKey(), table->find_first_int(int_col, 100)); CHECK_EQUAL(ObjKey(), table->find_first_string(str_col, "bad")); CHECK_EQUAL(ObjKey(), table->find_first_string(enum_col, "bad")); @@ -2548,9 +2622,6 @@ TEST(Shared_MovingSearchIndex) CHECK(table->has_search_index(int_col)); CHECK(table->has_search_index(str_col)); CHECK(table->has_search_index(enum_col)); - CHECK_EQUAL(0, table->get_num_unique_values(int_col)); - CHECK_EQUAL(0, table->get_num_unique_values(str_col)); - CHECK_EQUAL(3, table->get_num_unique_values(enum_col)); CHECK_EQUAL(ObjKey(), table->find_first_int(int_col, 100)); CHECK_EQUAL(ObjKey(), table->find_first_string(str_col, "bad")); CHECK_EQUAL(ObjKey(), table->find_first_string(enum_col, "bad")); @@ -3548,34 +3619,6 @@ TEST(Shared_OpenAfterClose) 
db_w->close(); } -TEST(Shared_RemoveTableWithEnumAndLinkColumns) -{ - // Test case generated with fuzzer - SHARED_GROUP_TEST_PATH(path); - DBRef db_w = DB::create(path); - TableKey tk; - { - auto wt = db_w->start_write(); - wt->add_table("Table_2"); - wt->commit(); - } - { - auto wt = db_w->start_write(); - auto table = wt->get_table("Table_2"); - tk = table->get_key(); - auto col_key = table->add_column(DataType(2), "string_3", false); - table->enumerate_string_column(col_key); - table->add_column(*table, "link_5"); - table->add_search_index(col_key); - wt->commit(); - } - { - auto wt = db_w->start_write(); - wt->remove_table(tk); - wt->commit(); - } -} - TEST(Shared_GenerateObjectIdAfterRollback) { // Test case generated in [realm-core-6.0.0-alpha.0] on Mon Aug 13 14:43:06 2018. diff --git a/test/test_string_compression.cpp b/test/test_string_compression.cpp new file mode 100644 index 00000000000..cf104445b6c --- /dev/null +++ b/test/test_string_compression.cpp @@ -0,0 +1,249 @@ +/************************************************************************* + * + * Copyright 2024 Realm Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ * + **************************************************************************/ + +#include "testsettings.hpp" + +#include +#include +#include + +#include +#include +#include +#include + +#include "test.hpp" + +using namespace realm; + + +TEST(StringInterner_Basic_Creation) +{ + Array parent(Allocator::get_default()); + _impl::DeepArrayDestroyGuard dg(&parent); + parent.create(NodeHeader::type_HasRefs, false, 1, 0); + StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); + StringData my_string = "aaaaaaaaaaaaaaa"; + + auto id = interner.intern(my_string); + + const auto stored_id = interner.lookup(my_string); + CHECK(stored_id); + CHECK(*stored_id == id); + + CHECK(interner.compare(my_string, *stored_id) == 0); // should be equal + const auto origin_string = interner.get(id); + CHECK_EQUAL(my_string, origin_string); + + CHECK(interner.compare(*stored_id, id) == 0); // compare against self. +} + +TEST(StringInterner_InternMultipleStrings) +{ + Array parent(Allocator::get_default()); + _impl::DeepArrayDestroyGuard dg(&parent); + parent.create(NodeHeader::type_HasRefs, false, 1, 0); + StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); + + std::vector strings; + for (size_t i = 0; i < 100; i++) + strings.push_back("aaaaaaaaaaaaa" + std::to_string(i)); + + size_t i = 0; + for (const auto& s : strings) { + const auto id = interner.intern(s); + const auto& str = interner.get(id); + CHECK(str == strings[i++]); + auto stored_id = interner.lookup(str); + CHECK_EQUAL(*stored_id, id); + CHECK_EQUAL(interner.compare(str, id), 0); + } +} + +TEST(StringInterner_TestLookup) +{ + Array parent(Allocator::get_default()); + _impl::DeepArrayDestroyGuard dg(&parent); + parent.create(NodeHeader::type_HasRefs, false, 1, 0); + StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); + + std::vector strings; + for (size_t i = 0; i < 500; ++i) { + std::string my_string = "aaaaaaaaaaaaaaa" + std::to_string(i); + 
strings.push_back(my_string); + } + std::random_device rd; + std::mt19937 g(rd()); + std::shuffle(strings.begin(), strings.end(), g); + + for (const auto& s : strings) { + interner.intern(s); + auto id = interner.lookup(StringData(s)); + CHECK(id); + CHECK(interner.compare(StringData(s), *id) == 0); + } +} + +TEST(StringInterner_VerifyComparison) +{ + Array parent(Allocator::get_default()); + _impl::DeepArrayDestroyGuard dg(&parent); + parent.create(NodeHeader::type_HasRefs, false, 1, 0); + StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); + + auto null_id = interner.intern({}); + auto test_lower_case_id = interner.intern({"test"}); + auto test_upper_case_id = interner.intern({"TEST"}); + + // check NULL vs empty string + auto res = interner.compare("", null_id); + CHECK_GREATER(StringData(""), StringData()); + CHECK_EQUAL(res, 1); + + // check that NULL filtering actually works + res = interner.compare(test_lower_case_id, null_id); + CHECK_GREATER(interner.get(test_lower_case_id), StringData()); + CHECK_EQUAL(res, 1); + + res = interner.compare(null_id, test_lower_case_id); + CHECK_LESS(StringData(), interner.get(test_lower_case_id)); + CHECK_EQUAL(res, -1); + + //"aaa" < "test" + res = interner.compare({"aaa"}, test_lower_case_id); + CHECK_LESS(StringData("aaa"), interner.get(test_lower_case_id)); + CHECK_EQUAL(res, -1); + + //"zzz" > "test" + res = interner.compare({"zzz"}, test_lower_case_id); + CHECK_GREATER(StringData("zzz"), interner.get(test_lower_case_id)); + CHECK_EQUAL(res, 1); + + //"AAA" < "test" + res = interner.compare({"AAA"}, test_lower_case_id); + CHECK_LESS(StringData("AAA"), interner.get(test_lower_case_id)); + CHECK_EQUAL(res, -1); + + //"ZZZ" < "test" + res = interner.compare({"ZZZ"}, test_lower_case_id); + CHECK_LESS(StringData("ZZZ"), interner.get(test_lower_case_id)); + CHECK_EQUAL(res, -1); + + //"aaa" > "TEST" + res = interner.compare({"aaa"}, test_upper_case_id); + CHECK_GREATER(StringData("aaa"), 
interner.get(test_upper_case_id)); + CHECK_EQUAL(res, 1); + + //"zzz" > "TEST" + res = interner.compare({"zzz"}, test_upper_case_id); + CHECK_GREATER(StringData("zzz"), interner.get(test_upper_case_id)); + CHECK_EQUAL(res, 1); + + //"AAA" < "TEST" + res = interner.compare({"AAA"}, test_upper_case_id); + CHECK_LESS(StringData("AAA"), interner.get(test_upper_case_id)); + CHECK_EQUAL(res, -1); + + //"ZZZ" > "TEST" + res = interner.compare({"ZZZ"}, test_upper_case_id); + CHECK_GREATER(StringData("ZZZ"), interner.get(test_upper_case_id)); + CHECK_EQUAL(res, 1); + + // test > TEST + res = interner.compare(test_lower_case_id, test_upper_case_id); + CHECK_GREATER(interner.get(test_lower_case_id), interner.get(test_upper_case_id)); + CHECK_EQUAL(res, 1); + + // TEST < test + res = interner.compare(test_upper_case_id, test_lower_case_id); + CHECK_LESS(interner.get(test_upper_case_id), interner.get(test_lower_case_id)); + CHECK_EQUAL(res, -1); +} + +TEST(StringInterner_VerifyInterningNull) +{ + Array parent(Allocator::get_default()); + _impl::DeepArrayDestroyGuard dg(&parent); + parent.create(NodeHeader::type_HasRefs, false, 1, 0); + StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); + auto null_id = interner.intern({}); + CHECK_EQUAL(null_id, 0); + CHECK_EQUAL(interner.get(null_id), StringData{}); + const auto stored_id = interner.lookup({}); + CHECK_EQUAL(stored_id, 0); + // comparison StringID vs StringID + CHECK_EQUAL(interner.compare({}, 0), 0); + // interned string id vs null id + auto str_id = interner.intern(StringData("test")); + CHECK_EQUAL(interner.compare(str_id, null_id), 1); + CHECK_GREATER(interner.get(str_id), interner.get(null_id)); // compare via StringData + // null id vs interned string id + CHECK_EQUAL(interner.compare(null_id, str_id), -1); + CHECK_LESS(interner.get(null_id), interner.get(str_id)); + + // comparison String vs StringID + CHECK_EQUAL(interner.compare(StringData{}, null_id), 0); + 
CHECK_EQUAL(interner.compare(StringData{}, str_id), -1); + CHECK_LESS(StringData{}, interner.get(str_id)); // compare via StringData + CHECK_EQUAL(interner.compare(StringData{"test"}, null_id), 1); + CHECK_GREATER(StringData{"test"}, interner.get(null_id)); +} + +TEST(StringInterner_VerifyLongString) +{ + Array parent(Allocator::get_default()); + _impl::DeepArrayDestroyGuard dg(&parent); + parent.create(NodeHeader::type_HasRefs, false, 1, 0); + StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); + + const auto N = 7000000; // a lot of characters for triggering long string handling. + std::string long_string(N, 'a'); + + const auto id = interner.intern(StringData(long_string)); + CHECK_EQUAL(id, 1); + const auto stored_id = interner.lookup(StringData(long_string)); + CHECK_EQUAL(stored_id, 1); + CHECK(interner.compare(StringData(long_string), *stored_id) == 0); +} + +TEST(StringInterner_VerifyExpansionFromSmallStringToLongString) +{ + Array parent(Allocator::get_default()); + _impl::DeepArrayDestroyGuard dg(&parent); + parent.create(NodeHeader::type_HasRefs, false, 1, 0); + StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); + + const auto M = 1000; + std::string small_string = ""; + for (size_t i = 0; i < M; ++i) + small_string += 'a'; + + auto id = interner.intern(StringData(small_string)); + CHECK_EQUAL(id, 1); + auto stored_id = interner.lookup(StringData(small_string)); + CHECK_EQUAL(stored_id, 1); + CHECK(interner.compare(StringData(small_string), *stored_id) == 0); + + const auto N = 7000000; // a lot of characters for triggering long string handling. 
+ std::string long_string(N, 'b'); + id = interner.intern(StringData(long_string)); + CHECK_EQUAL(id, 2); + stored_id = interner.lookup(StringData(long_string)); + CHECK_EQUAL(stored_id, id); + CHECK(interner.compare(StringData(long_string), *stored_id) == 0); +} diff --git a/test/test_table.cpp b/test/test_table.cpp index 52e06fb2659..4029043e0e1 100644 --- a/test/test_table.cpp +++ b/test/test_table.cpp @@ -366,18 +366,6 @@ TEST(Table_DeleteCrash) table->remove_object(k1); } -TEST(Table_OptimizeCrash) -{ - // This will crash at the .add() method - Table ttt; - ttt.add_column(type_Int, "first"); - auto col = ttt.add_column(type_String, "second"); - ttt.enumerate_string_column(col); - ttt.add_search_index(col); - ttt.clear(); - ttt.create_object().set_all(1, "AA"); -} - TEST(Table_DateTimeMinMax) { Group g; @@ -1002,7 +990,6 @@ void setup_multi_table(Table& table, size_t rows, std::vector& keys, std auto string_col = table.add_column(type_String, "string"); // 4 auto string_long_col = table.add_column(type_String, "string_long"); // 5 auto string_big_col = table.add_column(type_String, "string_big_blobs"); // 6 - auto string_enum_col = table.add_column(type_String, "string_enum"); // 7 - becomes StringEnumColumn auto bin_col = table.add_column(type_Binary, "binary"); // 8 auto int_null_col = table.add_column(type_Int, "int_null", true); // 9, nullable = true column_keys.push_back(int_col); @@ -1012,7 +999,6 @@ void setup_multi_table(Table& table, size_t rows, std::vector& keys, std column_keys.push_back(string_col); column_keys.push_back(string_long_col); column_keys.push_back(string_big_col); - column_keys.push_back(string_enum_col); column_keys.push_back(bin_col); column_keys.push_back(int_null_col); @@ -1061,23 +1047,8 @@ void setup_multi_table(Table& table, size_t rows, std::vector& keys, std obj.set(string_big_col, StringData("")); break; } - // enum - switch (i % 3) { - case 0: - obj.set(string_enum_col, "enum1"); - break; - case 1: - obj.set(string_enum_col, 
"enum2"); - break; - case 2: - obj.set(string_enum_col, "enum3"); - break; - } obj.set(bin_col, BinaryData("binary", 7)); } - - // We also want a StringEnumColumn - table.enumerate_string_column(string_enum_col); } } // anonymous namespace @@ -1584,145 +1555,6 @@ TEST(Table_IndexInt) #endif } -TEST(Table_AutoEnumeration) -{ - Table table; - - auto col_int = table.add_column(type_Int, "first"); - auto col_str = table.add_column(type_String, "second"); - - for (size_t i = 0; i < 5; ++i) { - table.create_object().set_all(1, "abd"); - table.create_object().set_all(2, "eftg"); - table.create_object().set_all(5, "hijkl"); - table.create_object().set_all(8, "mnopqr"); - table.create_object().set_all(9, "stuvxyz"); - } - - table.enumerate_string_column(col_str); - - for (size_t i = 0; i < 5; ++i) { - const size_t n = i * 5; - CHECK_EQUAL(1, table.get_object(ObjKey(0 + n)).get(col_int)); - CHECK_EQUAL(2, table.get_object(ObjKey(1 + n)).get(col_int)); - CHECK_EQUAL(5, table.get_object(ObjKey(2 + n)).get(col_int)); - CHECK_EQUAL(8, table.get_object(ObjKey(3 + n)).get(col_int)); - CHECK_EQUAL(9, table.get_object(ObjKey(4 + n)).get(col_int)); - - CHECK_EQUAL("abd", table.get_object(ObjKey(0 + n)).get(col_str)); - CHECK_EQUAL("eftg", table.get_object(ObjKey(1 + n)).get(col_str)); - CHECK_EQUAL("hijkl", table.get_object(ObjKey(2 + n)).get(col_str)); - CHECK_EQUAL("mnopqr", table.get_object(ObjKey(3 + n)).get(col_str)); - CHECK_EQUAL("stuvxyz", table.get_object(ObjKey(4 + n)).get(col_str)); - } - - // Verify counts - const size_t count1 = table.count_string(col_str, "abd"); - const size_t count2 = table.count_string(col_str, "eftg"); - const size_t count3 = table.count_string(col_str, "hijkl"); - const size_t count4 = table.count_string(col_str, "mnopqr"); - const size_t count5 = table.count_string(col_str, "stuvxyz"); - CHECK_EQUAL(5, count1); - CHECK_EQUAL(5, count2); - CHECK_EQUAL(5, count3); - CHECK_EQUAL(5, count4); - CHECK_EQUAL(5, count5); - - ObjKey t = 
table.find_first_string(col_str, "eftg"); - CHECK_EQUAL(ObjKey(1), t); - - auto tv = table.find_all_string(col_str, "eftg"); - CHECK_EQUAL(5, tv.size()); - CHECK_EQUAL("eftg", tv.get_object(0).get(col_str)); - CHECK_EQUAL("eftg", tv.get_object(1).get(col_str)); - CHECK_EQUAL("eftg", tv.get_object(2).get(col_str)); - CHECK_EQUAL("eftg", tv.get_object(3).get(col_str)); - CHECK_EQUAL("eftg", tv.get_object(4).get(col_str)); - - Obj obj = table.create_object(); - CHECK_EQUAL(0, obj.get(col_int)); - CHECK_EQUAL("", obj.get(col_str)); -} - - -TEST(Table_AutoEnumerationOptimize) -{ - Table t; - auto col0 = t.add_column(type_String, "col1"); - auto col1 = t.add_column(type_String, "col2"); - auto col2 = t.add_column(type_String, "col3"); - auto col3 = t.add_column(type_String, "col4"); - - // Insert non-optimizable strings - std::string s; - std::vector keys; - t.create_objects(10, keys); - for (Obj o : t) { - o.set_all(s.c_str(), s.c_str(), s.c_str(), s.c_str()); - s += "x"; - } - - // AutoEnumerate in reverse order - for (Obj o : t) { - o.set(col3, "test"); - } - t.enumerate_string_column(col3); - for (Obj o : t) { - o.set(col2, "test"); - } - t.enumerate_string_column(col2); - for (Obj o : t) { - o.set(col1, "test"); - } - t.enumerate_string_column(col1); - for (Obj o : t) { - o.set(col0, "test"); - } - t.enumerate_string_column(col0); - - for (Obj o : t) { - CHECK_EQUAL("test", o.get(col0)); - CHECK_EQUAL("test", o.get(col1)); - CHECK_EQUAL("test", o.get(col2)); - CHECK_EQUAL("test", o.get(col3)); - } - -#ifdef REALM_DEBUG - t.verify(); -#endif -} - -TEST(Table_OptimizeCompare) -{ - Table t1, t2; - auto col_t1 = t1.add_column(type_String, "str"); - auto col_t2 = t2.add_column(type_String, "str"); - - std::vector keys_t1; - std::vector keys_t2; - t1.create_objects(100, keys_t1); - for (Obj o : t1) { - o.set(col_t1, "foo"); - } - t2.create_objects(100, keys_t2); - for (Obj o : t2) { - o.set(col_t2, "foo"); - } - t1.enumerate_string_column(col_t1); - CHECK(t1 == t2); - Obj 
obj1 = t1.get_object(keys_t1[50]); - Obj obj2 = t2.get_object(keys_t2[50]); - obj1.set(col_t1, "bar"); - CHECK(t1 != t2); - obj1.set(col_t1, "foo"); - CHECK(t1 == t2); - obj2.set(col_t2, "bar"); - CHECK(t1 != t2); - obj2.set(col_t2, "foo"); - CHECK(t1 == t2); -} - - TEST(Table_SlabAlloc) { SlabAlloc alloc; @@ -1756,56 +1588,6 @@ TEST(Table_SlabAlloc) #endif } -TEST(Table_NullInEnum) -{ - Group group; - TableRef table = group.add_table("test"); - auto col = table->add_column(type_String, "second", true); - - for (size_t c = 0; c < 100; c++) { - table->create_object().set(col, "hello"); - } - - size_t r; - - r = table->where().equal(col, "hello").count(); - CHECK_EQUAL(100, r); - - Obj obj50 = table->get_object(ObjKey(50)); - obj50.set(col, realm::null()); - r = table->where().equal(col, "hello").count(); - CHECK_EQUAL(99, r); - - table->enumerate_string_column(col); - - obj50.set(col, realm::null()); - r = table->where().equal(col, "hello").count(); - CHECK_EQUAL(99, r); - - obj50.set(col, "hello"); - r = table->where().equal(col, "hello").count(); - CHECK_EQUAL(100, r); - - obj50.set(col, realm::null()); - r = table->where().equal(col, "hello").count(); - CHECK_EQUAL(99, r); - - r = table->where().equal(col, realm::null()).count(); - CHECK_EQUAL(1, r); - - table->get_object(ObjKey(55)).set(col, realm::null()); - r = table->where().equal(col, realm::null()).count(); - CHECK_EQUAL(2, r); - - r = table->where().equal(col, "hello").count(); - CHECK_EQUAL(98, r); - - table->remove_object(ObjKey(55)); - r = table->where().equal(col, realm::null()).count(); - CHECK_EQUAL(1, r); -} - - TEST(Table_DateAndBinary) { Table t; @@ -2122,22 +1904,6 @@ TEST(Table_EmptyMinmax) CHECK(is_null); } -TEST(Table_EnumStringInsertEmptyRow) -{ - Table table; - auto col_str = table.add_column(type_String, "strings"); - for (int i = 0; i < 128; ++i) - table.create_object().set(col_str, "foo"); - - CHECK_EQUAL(0, table.get_num_unique_values(col_str)); - table.enumerate_string_column(col_str); 
- // Make sure we now have an enumerated strings column - CHECK_EQUAL(1, table.get_num_unique_values(col_str)); - Obj obj = table.create_object(); - CHECK_EQUAL("", obj.get(col_str)); - CHECK_EQUAL(2, table.get_num_unique_values(col_str)); -} - TEST(Table_AddColumnWithThreeLevelBptree) { Table table; @@ -2256,24 +2022,14 @@ TEST(Table_NullableChecks) TEST(Table_Nulls) { - // 'round' lets us run this entire test both with and without index and with/without optimize/enum - for (size_t round = 0; round < 5; round++) { + // 'round' lets us run this entire test both with and without index + for (size_t round = 0; round < 2; round++) { Table t; TableView tv; auto col_str = t.add_column(type_String, "str", true /*nullable*/); if (round == 1) t.add_search_index(col_str); - else if (round == 2) - t.enumerate_string_column(col_str); - else if (round == 3) { - t.add_search_index(col_str); - t.enumerate_string_column(col_str); - } - else if (round == 4) { - t.enumerate_string_column(col_str); - t.add_search_index(col_str); - } std::vector keys; t.create_objects(3, keys); diff --git a/test/test_table_view.cpp b/test/test_table_view.cpp index a7f9e1ac8da..72ce797cd5f 100644 --- a/test/test_table_view.cpp +++ b/test/test_table_view.cpp @@ -891,32 +891,6 @@ TEST(TableView_QueryCopyStringOr) CHECK_EQUAL(after_copy_count, 4); } -TEST(TableView_SortEnum) -{ - Table table; - auto col = table.add_column(type_String, "str"); - - table.create_object().set_all("foo"); - table.create_object().set_all("foo"); - table.create_object().set_all("foo"); - - table.enumerate_string_column(col); - - table.create_object().set_all("bbb"); - table.create_object().set_all("aaa"); - table.create_object().set_all("baz"); - - TableView tv = table.where().find_all(); - tv.sort(col); - - CHECK_EQUAL(tv[0].get(col), "aaa"); - CHECK_EQUAL(tv[1].get(col), "baz"); - CHECK_EQUAL(tv[2].get(col), "bbb"); - CHECK_EQUAL(tv[3].get(col), "foo"); - CHECK_EQUAL(tv[4].get(col), "foo"); - CHECK_EQUAL(tv[5].get(col), 
"foo"); -} - TEST(TableView_Backlinks) { Group group; @@ -1319,20 +1293,6 @@ TEST_TYPES(TableView_Distinct, DistinctDirect, DistinctOverLink) CHECK_EQUAL(h.get_key(tv, 3), k0); CHECK_EQUAL(h.get_key(tv, 4), k1); - - // Same as previous test, but with string column being Enum - t.enumerate_string_column(col_str); - tv = h.find_all(); - tv.distinct(h.get_distinct({col_str, col_int})); - tv.sort(h.get_sort({col_str}, {false})); - CHECK_EQUAL(tv.size(), 5); - CHECK_EQUAL(h.get_key(tv, 0), k4); - CHECK_EQUAL(h.get_key(tv, 1), k5); - CHECK_EQUAL(h.get_key(tv, 2), k6); - CHECK_EQUAL(h.get_key(tv, 3), k0); - CHECK_EQUAL(h.get_key(tv, 4), k1); - - // Now test sync_if_needed() tv = h.find_all(); // "", null, "", null, "foo", "foo", "bar" diff --git a/test/test_transactions.cpp b/test/test_transactions.cpp index fbbdd314379..f45a576e9cd 100644 --- a/test/test_transactions.cpp +++ b/test/test_transactions.cpp @@ -544,41 +544,4 @@ TEST(Transactions_Continuous_SerialWrites) } } - -// Check that enumeration is gone after -// rolling back the insertion of a string enum column -TEST(LangBindHelper_RollbackStringEnumInsert) -{ - SHARED_GROUP_TEST_PATH(path); - std::unique_ptr hist_w(make_in_realm_history()); - auto sg_w = DB::create(*hist_w, path); - auto g = sg_w->start_write(); - auto t = g->add_table("t1"); - auto col = t->add_column(type_String, "t1_col0_string"); - - auto populate_with_string_enum = [&]() { - t->create_object().set_all("simple_string"); - t->create_object().set_all("duplicate"); - t->create_object().set_all("duplicate"); - t->enumerate_string_column(col); // upgrade to internal string enum column type - CHECK(t->is_enumerated(col)); - CHECK_EQUAL(t->get_num_unique_values(col), 2); - }; - - g->commit_and_continue_as_read(); - g->promote_to_write(); - - populate_with_string_enum(); - - g->rollback_and_continue_as_read(); - g->promote_to_write(); - CHECK(!t->is_enumerated(col)); - populate_with_string_enum(); - - t->begin()->set(col, "duplicate"); - - 
g->commit_and_continue_as_read(); - CHECK(t->is_enumerated(col)); -} - #endif // TEST_TRANSACTIONS diff --git a/test/test_unresolved_links.cpp b/test/test_unresolved_links.cpp index 60f50ee3488..b47c68fa313 100644 --- a/test/test_unresolved_links.cpp +++ b/test/test_unresolved_links.cpp @@ -870,6 +870,7 @@ TEST(Unresolved_PerformanceLinkList) tr->commit_and_continue_as_read(); CHECK(t2 > t1); tr->promote_to_write(); + // fails in compressed format because of unsigned/signed interpretation. tr->verify(); } diff --git a/test/test_upgrade_database.cpp b/test/test_upgrade_database.cpp index ae95d1a02da..04bf2e533b4 100644 --- a/test/test_upgrade_database.cpp +++ b/test/test_upgrade_database.cpp @@ -166,6 +166,7 @@ TEST(Upgrade_Disabled) TEST(Upgrade_DatabaseWithUnsupportedOldFileFormat) { + // Not core 6, thus kind is not set, and an assertion is triggered. std::string path = test_util::get_test_resource_path() + "test_upgrade_database_1000_1.realm"; CHECK_OR_RETURN(File::exists(path)); diff --git a/test/test_utf8.cpp b/test/test_utf8.cpp index ec4c913eafd..de456b2d69e 100644 --- a/test/test_utf8.cpp +++ b/test/test_utf8.cpp @@ -24,10 +24,13 @@ #include #include #include +#include #include -#include #include +#include +#include +#include #include "test.hpp" @@ -86,10 +89,22 @@ const char* u16sur2 = "\xF0\xA0\x9C\xB1"; // same as above, with larger unicode TEST(UTF8_Compare_Strings) { + Array parent(Allocator::get_default()); + _impl::DeepArrayDestroyGuard dg(&parent); + + parent.create(NodeHeader::type_HasRefs, false, 1, 0); + StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); + // Useful line for creating new unit test cases: // bool ret = std::locale("us_EN")(string("a"), std::string("b")); - auto str_compare = [](StringData a, StringData b) { - return a < b; + + auto str_compare = [&interner, this](StringData a, StringData b) { + auto id1 = interner.intern(a); + auto id2 = interner.intern(b); + const auto ret_interner_cmp = interner.compare(id1,
id2) < 0; + const auto ret_cmp = a < b; + CHECK_EQUAL(ret_cmp, ret_interner_cmp); + return ret_cmp; }; // simplest test @@ -141,9 +156,21 @@ TEST(UTF8_Compare_Strings) TEST(UTF8_Compare_Core_utf8) { - auto str_compare = [](StringData a, StringData b) { - return a < b; + Array parent(Allocator::get_default()); + _impl::DeepArrayDestroyGuard dg(&parent); + + parent.create(NodeHeader::type_HasRefs, false, 1, 0); + StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); + + auto str_compare = [&interner, this](StringData a, StringData b) { + auto id1 = interner.intern(a); + auto id2 = interner.intern(b); + const auto ret_interner_cmp = interner.compare(id1, id2) < 0; + const auto ret_cmp = a < b; + CHECK_EQUAL(ret_cmp, ret_interner_cmp); + return ret_cmp; }; + // single utf16 code points (tests mostly Windows) CHECK_EQUAL(false, str_compare(uae, uae)); CHECK_EQUAL(false, str_compare(uAE, uAE)); @@ -169,7 +196,6 @@ TEST(UTF8_Compare_Core_utf8) CHECK_EQUAL(false, str_compare(u16sur2, u16sur2)); } - TEST(UTF8_Compare_Core_utf8_invalid) { // Test that invalid utf8 won't make decisions on data beyond Realm payload. Do @@ -194,8 +220,17 @@ TEST(UTF8_Compare_Core_utf8_invalid) // that return value is arbitrary for invalid utf8 bool ret = i1 < i2; CHECK_EQUAL(ret, i2 < i1); // must sort the same as before regardless of succeeding data -} + // the same applies if the strings are interned. 
+ Array parent(Allocator::get_default()); + _impl::DeepArrayDestroyGuard dg(&parent); + parent.create(NodeHeader::type_HasRefs, false, 1, 0); + StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); + auto id1 = interner.intern(invalid1); + auto id2 = interner.intern(invalid2); + bool ret_interned = interner.compare(id1, id2) < 0; + CHECK_EQUAL(ret_interned, ret); +} TEST(Compare_Core_utf8_invalid_crash) { @@ -218,12 +253,22 @@ TEST(Compare_Core_utf8_invalid_crash) } } - TEST(UTF8_Compare_Core_utf8_zero) { - auto str_compare = [](StringData a, StringData b) { - return a < b; + Array parent(Allocator::get_default()); + _impl::DeepArrayDestroyGuard dg(&parent); + parent.create(NodeHeader::type_HasRefs, false, 1, 0); + StringInterner interner(Allocator::get_default(), parent, ColKey(0), true); + + auto str_compare = [&interner, this](StringData a, StringData b) { + auto id1 = interner.intern(a); + auto id2 = interner.intern(b); + const auto ret_interner_cmp = interner.compare(id1, id2) < 0; + const auto ret_cmp = a < b; + CHECK_EQUAL(ret_cmp, ret_interner_cmp); + return ret_cmp; }; + // Realm must support 0 characters in utf8 strings CHECK_EQUAL(false, str_compare(StringData("\0", 1), StringData("\0", 1))); CHECK_EQUAL(true, str_compare(StringData("\0", 1), StringData("a"))); @@ -238,7 +283,6 @@ TEST(UTF8_Compare_Core_utf8_zero) CHECK_EQUAL(true, str_compare(StringData("a\0", 2), StringData("a\0\0", 3))); CHECK_EQUAL(false, str_compare(StringData("a\0\0", 3), StringData("a\0", 2))); } - } // anonymous namespace #endif // TEST_UTF8