Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/string compression #7803

Draft
wants to merge 23 commits into
base: next-major
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
fcb50cd
Squashed into single commit relative to next-major
finnschiermer Jun 6, 2024
9f4d51c
Merge branch 'next-major' into fsa/string-interning
nicola-cab Jun 11, 2024
22d15d9
No Unique PTRs for `string interner` and `string compressor` (#7807)
nicola-cab Jun 17, 2024
8f1d472
Fix compilation
jedelbo Jun 17, 2024
fc31117
RCORE-2162: Add compression of strings in Mixed, Lst<String> and Dict…
jedelbo Jun 19, 2024
1d89781
Merge branch 'next-major' into feature/string-compression
jedelbo Jul 1, 2024
3e8a751
Merge branch 'next-major' into feature/string-compression
nicola-cab Jul 3, 2024
5112b13
Merge branch 'next-major' into feature/string-compression
jedelbo Jul 8, 2024
499e630
RCORE-2170 String compression tests (#7812)
nicola-cab Jul 8, 2024
ce6f196
Remove enum string feature (#7858)
jedelbo Jul 10, 2024
1afc39e
RCORE-2064 String EQ/NEQ optimisations for compressed strings (#7820)
nicola-cab Jul 10, 2024
d35c8fc
Merge branch 'next-major' into feature/string-compression
nicola-cab Jul 12, 2024
aec4ca0
RCORE-2065 use compressed string view for quick comparison if the lea…
nicola-cab Jul 16, 2024
10d4535
Merge branch 'next-major' into feature/string-compression
nicola-cab Jul 22, 2024
533a0f3
Merge branch 'next-major' into feature/string-compression
nicola-cab Jul 23, 2024
9278ebe
RCORE-2157 Avoid to decompress Strings while sorting them. Instead us…
nicola-cab Aug 1, 2024
8f6b79b
Merge branch 'next-major' into feature/string-compression
nicola-cab Aug 1, 2024
7b6159e
reduce locking for StringInterner lookup and compare methods
finnschiermer Aug 6, 2024
f9d1021
reduced locking in StringInterner::get()
finnschiermer Aug 6, 2024
8325c81
minor fixes and comments
finnschiermer Aug 8, 2024
ddc557e
optimize memory ordering a bit
finnschiermer Aug 8, 2024
80b95d0
Merge pull request #7954 from realm/fsa/reduce-string-interner-locking
finnschiermer Aug 9, 2024
9fe448f
Small fix to Table::typed_write
jedelbo Aug 21, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
-----------

### Internals
* None.
* Ability to enumerate a string column has been removed.

----------------------------------------------

Expand Down
4 changes: 4 additions & 0 deletions src/realm/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,8 @@ set(REALM_SOURCES
table.cpp
table_ref.cpp
obj_list.cpp
string_interner.cpp
string_compressor.cpp
object_id.cpp
table_view.cpp
tokenizer.cpp
Expand Down Expand Up @@ -178,6 +180,8 @@ set(REALM_INSTALL_HEADERS
null.hpp
obj.hpp
obj_list.hpp
string_interner.hpp
string_compressor.hpp
object_id.hpp
path.hpp
owned_data.hpp
Expand Down
16 changes: 2 additions & 14 deletions src/realm/array.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -294,7 +294,7 @@ void Array::set_type(Type type)
set_hasrefs_in_header(init_has_refs, header);
}

void Array::destroy_children(size_t offset) noexcept
void Array::destroy_children(size_t offset, bool ro_only) noexcept
{
for (size_t i = offset; i != m_size; ++i) {
int64_t value = get(i);
Expand All @@ -310,22 +310,10 @@ void Array::destroy_children(size_t offset) noexcept
continue;

ref_type ref = to_ref(value);
destroy_deep(ref, m_alloc);
destroy_deep(ref, m_alloc, ro_only);
}
}

// size_t Array::get_byte_size() const noexcept
//{
// const auto header = get_header();
// auto num_bytes = get_byte_size_from_header(header);
// auto read_only = m_alloc.is_read_only(m_ref) == true;
// auto capacity = get_capacity_from_header(header);
// auto bytes_ok = num_bytes <= capacity;
// REALM_ASSERT(read_only || bytes_ok);
// REALM_ASSERT_7(m_alloc.is_read_only(m_ref), ==, true, ||, num_bytes, <=, get_capacity_from_header(header));
// return num_bytes;
// }

ref_type Array::do_write_shallow(_impl::ArrayWriterBase& out) const
{
// here we might want to compress the array and write down.
Expand Down
29 changes: 16 additions & 13 deletions src/realm/array.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ class Array : public Node, public ArrayParent {
/// pointer.
void init_from_mem(MemRef) noexcept;

/// Same as `init_from_ref(get_ref_from_parent())`.
/// Same as `init_from_ref(ref_from_parent())`.
void init_from_parent() noexcept
{
ref_type ref = get_ref_from_parent();
Expand Down Expand Up @@ -362,7 +362,8 @@ class Array : public Node, public ArrayParent {
/// state (as if calling detach()), then free the allocated memory. If this
/// accessor is already in the detached state, this function has no effect
/// (idempotency).
void destroy_deep() noexcept;
/// If `ro_only` is true, only memory located in the read-only part (the file) is freed.
void destroy_deep(bool ro_only = false) noexcept;

/// check if the array is encoded (in B format)
inline bool is_compressed() const;
Expand All @@ -377,13 +378,13 @@ class Array : public Node, public ArrayParent {
bool try_decompress();

/// Shorthand for `destroy_deep(MemRef(ref, alloc), alloc, ro_only)`.
static void destroy_deep(ref_type ref, Allocator& alloc) noexcept;
static void destroy_deep(ref_type ref, Allocator& alloc, bool ro_only = false) noexcept;

/// Destroy the specified array node and all of its children, recursively.
///
/// This is done by freeing the specified array node after calling
/// destroy_deep() for every contained 'ref' element.
static void destroy_deep(MemRef, Allocator&) noexcept;
static void destroy_deep(MemRef, Allocator&, bool ro_only = false) noexcept;

// Clone deep
static MemRef clone(MemRef, Allocator& from_alloc, Allocator& target_alloc);
Expand Down Expand Up @@ -544,7 +545,7 @@ class Array : public Node, public ArrayParent {
// Overriding method in ArrayParent
ref_type get_child_ref(size_t) const noexcept override;

void destroy_children(size_t offset = 0) noexcept;
void destroy_children(size_t offset = 0, bool ro_only = false) noexcept;

protected:
// Getters and Setters for adaptive-packed arrays
Expand Down Expand Up @@ -916,16 +917,17 @@ inline void Array::set_context_flag(bool value) noexcept
}
}

inline void Array::destroy_deep() noexcept
inline void Array::destroy_deep(bool ro_only) noexcept
{
if (!is_attached())
return;

if (m_has_refs)
destroy_children();
destroy_children(0, ro_only);

char* header = get_header_from_data(m_data);
m_alloc.free_(m_ref, header);
if (!ro_only || is_read_only())
m_alloc.free_(m_ref, header);
m_data = nullptr;
}

Expand Down Expand Up @@ -968,20 +970,21 @@ inline void Array::clear_and_destroy_children()
truncate_and_destroy_children(0);
}

inline void Array::destroy_deep(ref_type ref, Allocator& alloc) noexcept
inline void Array::destroy_deep(ref_type ref, Allocator& alloc, bool ro_only) noexcept
{
destroy_deep(MemRef(ref, alloc), alloc);
destroy_deep(MemRef(ref, alloc), alloc, ro_only);
}

inline void Array::destroy_deep(MemRef mem, Allocator& alloc) noexcept
inline void Array::destroy_deep(MemRef mem, Allocator& alloc, bool ro_only) noexcept
{
if (!get_hasrefs_from_header(mem.get_addr())) {
alloc.free_(mem);
if (!ro_only || alloc.is_read_only(mem.get_ref()))
alloc.free_(mem);
return;
}
Array array(alloc);
array.init_from_mem(mem);
array.destroy_deep();
array.destroy_deep(ro_only);
}


Expand Down
4 changes: 2 additions & 2 deletions src/realm/array_backlink.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -225,12 +225,12 @@ void ArrayBacklink::verify() const
REALM_ASSERT(src_obj.get<Mixed>(src_col_key).get_link() == target_link);
}
else if (val.is_type(type_List)) {
DummyParent parent(src_table, val.get_ref());
DummyParent parent(src_table, val.get_ref(), src_col_key);
Lst<Mixed> list(parent, 0);
REALM_ASSERT(list.find_any(target_link) != npos);
}
else if (val.is_type(type_Dictionary)) {
DummyParent parent(src_table, val.get_ref());
DummyParent parent(src_table, val.get_ref(), src_col_key);
Dictionary dict(parent, 0);
REALM_ASSERT(dict.find_any(target_link) != npos);
}
Expand Down
2 changes: 2 additions & 0 deletions src/realm/array_integer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
#include <realm/impl/destroy_guard.hpp>
#include <realm/column_integer.hpp>

#include <iostream>

using namespace realm;

ArrayInteger::ArrayInteger(Allocator& allocator) noexcept
Expand Down
1 change: 1 addition & 0 deletions src/realm/array_integer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,7 @@ inline ArrayIntNull::~ArrayIntNull() noexcept {}

inline size_t ArrayIntNull::size() const noexcept
{
    // The reported size is one less than the size of the underlying Array.
    // NOTE(review): if the underlying size were 0, this subtraction would wrap
    // around instead of yielding 0 - presumably the underlying array always
    // holds at least one extra element; confirm against the class invariants.
    const auto underlying_size = Array::size();
    return underlying_size - 1;
}

Expand Down
92 changes: 57 additions & 35 deletions src/realm/array_mixed.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,22 @@ void ArrayMixed::set_null(size_t ndx)
}
}

std::optional<StringID> ArrayMixed::get_string_id(size_t ndx) const
{
int64_t val = m_composite.get(ndx);
if (val) {
const int64_t int_val = val >> s_data_shift;
const size_t payload_ndx{(size_t)int_val};
const DataType type((val & s_data_type_mask) - 1);
if (type == type_String) {
ensure_string_array();
REALM_ASSERT(size_t(int_val) < m_strings.size());
return m_strings.get_string_id(payload_ndx);
}
}
return {};
}

Mixed ArrayMixed::get(size_t ndx) const
{
int64_t val = m_composite.get(ndx);
Expand Down Expand Up @@ -363,9 +379,8 @@ ref_type ArrayMixed::typed_write(ref_type top_ref, _impl::ArrayWriterBase& out,
2. int and pair int arrays, they are used for storing integers, timestamps, floats, doubles,
decimals, links. In general we can compress them, but we need to be careful, controlling the col_type
should prevent compressing data that we want to leave in the current format.
3. string array is for strings and binary data (no compression for now)
4. ref array is actually storing refs to collections. they can only be BPlusTree<int, Mixed> or
BPlusTree<string, Mixed>.
3. string array is for strings and binary data
4. ref array is actually storing refs to collections. They can only be Lst<Mixed> or Dictionary.
5. key array stores unique identifiers for collections in mixed (integers that can be compressed)
*/
Array composite(alloc);
Expand All @@ -375,41 +390,48 @@ ref_type ArrayMixed::typed_write(ref_type top_ref, _impl::ArrayWriterBase& out,
auto ref = top.get(i);
ref_type new_ref = ref;
if (ref && !(out.only_modified && alloc.is_read_only(ref))) {
if (i < 3) { // int, and pair_int
// integer arrays
new_ref = Array::write(ref, alloc, out, out.only_modified, out.compress);
}
else if (i == 4) { // collection in mixed
ArrayRef arr_ref(alloc);
arr_ref.init_from_ref(ref);
auto ref_sz = arr_ref.size();
TempArray written_ref_leaf(ref_sz);

for (size_t k = 0; k < ref_sz; k++) {
ref_type new_sub_ref = 0;
if (auto sub_ref = arr_ref.get(k)) {
auto header = alloc.translate(sub_ref);
// Now we have to find out if the nested collection is a
// dictionary or a list. If the top array has a size of 2
// and it is not a BplusTree inner node, then it is a dictionary
if (NodeHeader::get_size_from_header(header) == 2 &&
!NodeHeader::get_is_inner_bptree_node_from_header(header)) {
new_sub_ref = Dictionary::typed_write(sub_ref, out, alloc);
}
else {
new_sub_ref = BPlusTree<Mixed>::typed_write(sub_ref, out, alloc);
switch (i) {
case payload_idx_int:
// integer array
new_ref = Array::write(ref, alloc, out, out.only_modified, out.compress);
break;
case payload_idx_pair:
// integer array
new_ref = Array::write(ref, alloc, out, out.only_modified, out.compress);
break;
case payload_idx_str:
new_ref = ArrayString::typed_write(ref, out, alloc);
break;
case payload_idx_ref: {
// collection in mixed
ArrayRef arr_ref(alloc);
arr_ref.init_from_ref(ref);
auto ref_sz = arr_ref.size();
TempArray written_ref_leaf(ref_sz);

for (size_t k = 0; k < ref_sz; k++) {
ref_type new_sub_ref = 0;
if (auto sub_ref = arr_ref.get(k)) {
auto header = alloc.translate(sub_ref);
// Now we have to find out if the nested collection is a
// dictionary or a list. If the top array has a size of 2
// and it is not a BplusTree inner node, then it is a dictionary
if (NodeHeader::get_size_from_header(header) == 2 &&
!NodeHeader::get_is_inner_bptree_node_from_header(header)) {
new_sub_ref = Dictionary::typed_write(sub_ref, out, alloc);
}
else {
new_sub_ref = BPlusTree<Mixed>::typed_write(sub_ref, out, alloc);
}
}
written_ref_leaf.set_as_ref(k, new_sub_ref);
}
written_ref_leaf.set_as_ref(k, new_sub_ref);
new_ref = written_ref_leaf.write(out);
break;
}
new_ref = written_ref_leaf.write(out);
}
else if (i == 5) { // unique keys associated to collections in mixed
new_ref = Array::write(ref, alloc, out, out.only_modified, out.compress);
}
else {
// all the rest we don't want to compress it, at least for now (strings will be needed)
new_ref = Array::write(ref, alloc, out, out.only_modified, false);
case payload_idx_key:
new_ref = Array::write(ref, alloc, out, out.only_modified, out.compress);
break;
}
}
written_leaf.set(i, new_ref);
Expand Down
10 changes: 10 additions & 0 deletions src/realm/array_mixed.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,15 @@ class ArrayMixed : public ArrayPayload, private Array {
{
Array::set_parent(parent, ndx_in_parent);
}
bool need_string_interner() const override
{
return true;
}
virtual void set_string_interner(StringInterner* interner) const override
{
m_strings.set_string_interner(interner);
}

void init_from_parent()
{
ref_type ref = get_ref_from_parent();
Expand All @@ -88,6 +97,7 @@ class ArrayMixed : public ArrayPayload, private Array {
{
return m_composite.get(ndx) == 0;
}
std::optional<StringID> get_string_id(size_t ndx) const;

void clear();
void erase(size_t ndx);
Expand Down
Loading
Loading