Skip to content

Commit

Permalink
Merge branch 'feature/string-compression' into je/string-interning
Browse files Browse the repository at this point in the history
  • Loading branch information
jedelbo committed Jun 17, 2024
2 parents 9c8d1c8 + 8f1d472 commit 07fde03
Show file tree
Hide file tree
Showing 11 changed files with 123 additions and 158 deletions.
20 changes: 0 additions & 20 deletions src/realm/array.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -314,18 +314,6 @@ void Array::destroy_children(size_t offset, bool ro_only) noexcept
}
}

// size_t Array::get_byte_size() const noexcept
//{
// const auto header = get_header();
// auto num_bytes = get_byte_size_from_header(header);
// auto read_only = m_alloc.is_read_only(m_ref) == true;
// auto capacity = get_capacity_from_header(header);
// auto bytes_ok = num_bytes <= capacity;
// REALM_ASSERT(read_only || bytes_ok);
// REALM_ASSERT_7(m_alloc.is_read_only(m_ref), ==, true, ||, num_bytes, <=, get_capacity_from_header(header));
// return num_bytes;
// }

ref_type Array::do_write_shallow(_impl::ArrayWriterBase& out) const
{
// Here we might want to compress the array and then write it out.
Expand Down Expand Up @@ -607,14 +595,6 @@ void Array::do_ensure_minimum_width(int_fast64_t value)
}
}

/// Returns the size of the array as cached in m_size.
size_t Array::size() const noexcept
{
// Always answer from the cached m_size, also when the array is in compressed
// format. Never read the size directly from the header: doing so would very
// likely result in a cache miss. For compressed arrays, m_size is expected to
// be kept up to date by init_from_mem.
return m_size;
}

bool Array::compress_array(Array& arr) const
{
if (m_integer_compressor.get_encoding() == NodeHeader::Encoding::WTypBits) {
Expand Down
2 changes: 0 additions & 2 deletions src/realm/array.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -210,8 +210,6 @@ class Array : public Node, public ArrayParent {
update_width_cache_from_header();
}

size_t size() const noexcept;

bool is_empty() const noexcept
{
return size() == 0;
Expand Down
1 change: 1 addition & 0 deletions src/realm/array_string.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include <realm/array_string.hpp>
#include <realm/impl/array_writer.hpp>
#include <realm/table.hpp>
#include <realm/string_interner.hpp>
#include <realm/mixed.hpp>

using namespace realm;
Expand Down
2 changes: 0 additions & 2 deletions src/realm/db.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -510,8 +510,6 @@ class DB : public std::enable_shared_from_this<DB> {
std::shared_ptr<util::Logger> m_logger;
std::mutex m_commit_listener_mutex;
std::vector<CommitListener*> m_commit_listeners;
std::unordered_map<TableKey, std::vector<StringInterner*>*> m_string_interners;
std::mutex m_string_interners_mutex;
bool m_is_sync_agent = false;
// Id for this DB to be used in logging. We will just use some bits from the pointer.
// The path cannot be used as this would not allow us to distinguish between two DBs opening
Expand Down
2 changes: 1 addition & 1 deletion src/realm/node.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@

#include <realm/node_header.hpp>
#include <realm/alloc.hpp>
#include <realm/string_interner.hpp>

#include <iostream>

Expand Down Expand Up @@ -352,6 +351,7 @@ class ArrayWriterBase;
}

/// Base class for all nodes holding user data
class StringInterner;
class ArrayPayload {
public:
virtual ~ArrayPayload();
Expand Down
29 changes: 14 additions & 15 deletions src/realm/string_compressor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -17,36 +17,35 @@
**************************************************************************/

#include <realm/string_compressor.hpp>
#include <realm/string_interner.hpp>
#include <realm/string_data.hpp>

#include <realm/array_unsigned.hpp>

#include <iostream>
namespace realm {

/// Sets up a compressor whose symbol array (m_data) is registered at slot
/// `index` of `parent`, then calls refresh(writable) to attach to (or create)
/// the underlying array.
StringCompressor::StringCompressor(Allocator& alloc, Array& parent, size_t index, bool writable)
: m_data(alloc)
{
m_compression_map.resize(16); // start with a very small compression map
// Reserve room for 65536 entries up front.
// NOTE(review): presumably done so m_symbols never reallocates while symbols are added - confirm.
m_symbols.reserve(65536);
m_data = std::make_unique<ArrayUnsigned>(alloc);
m_data->set_parent(&parent, index);
m_data.set_parent(&parent, index);
// refresh() consults the parent for a ref, so it has to run after set_parent().
refresh(writable);
}

/// Brings m_data in line with the ref stored in its parent and then calls
/// rebuild_internal(). If the parent holds no ref (0), the array is created
/// here and the parent is updated; in that case `writable` is asserted to be
/// true.
void StringCompressor::refresh(bool writable)
{
// we assume that compressors are only created from a valid parent.
// String interners in 'dead' mode should never instantiate a string compressor.
if (m_data->get_ref_from_parent() == 0) {
if (m_data.get_ref_from_parent() == 0) {
// The parent has no ref yet: the array must be created, which is only
// permitted when the caller declared the compressor writable.
REALM_ASSERT(writable);
// NOTE(review): arguments presumably mean "initial size 0, upper bound 65535" -
// confirm against ArrayUnsigned::create.
m_data->create(0, 65535);
m_data->update_parent();
m_data.create(0, 65535);
m_data.update_parent();
}
else {
// The parent already holds a ref: update an attached accessor from the
// parent, otherwise initialize the accessor from that ref.
if (m_data->is_attached())
m_data->update_from_parent();
if (m_data.is_attached())
m_data.update_from_parent();
else
m_data->init_from_ref(m_data->get_ref_from_parent());
m_data.init_from_ref(m_data.get_ref_from_parent());
}
// Bring the in-memory state in line with the (possibly changed) contents of m_data.
rebuild_internal();
}
Expand Down Expand Up @@ -111,7 +110,7 @@ void StringCompressor::expand_compression_map()

void StringCompressor::rebuild_internal()
{
auto num_symbols = m_data->size();
auto num_symbols = m_data.size();
if (num_symbols == m_symbols.size())
return;
if (num_symbols < m_symbols.size()) {
Expand All @@ -132,7 +131,7 @@ void StringCompressor::rebuild_internal()
}
// we have new symbols to add
for (size_t i = m_symbols.size(); i < num_symbols; ++i) {
auto pair = m_data->get(i);
auto pair = m_data.get(i);
SymbolDef def;
def.id = (CompressionSymbol)(i + 256);
def.expansion_a = 0xFFFF & (pair >> 16);
Expand Down Expand Up @@ -198,13 +197,13 @@ CompressedString StringCompressor::compress(StringData sd, bool learn)
if (m_symbols.size() < (65536 - 256) && learn) {
// define a new symbol for this entry and use it.
REALM_ASSERT_DEBUG(m_compression_map[hash].id == 0);
REALM_ASSERT_DEBUG(m_symbols.size() == m_data->size());
REALM_ASSERT_DEBUG(m_data->is_attached());
REALM_ASSERT_DEBUG(m_symbols.size() == m_data.size());
REALM_ASSERT_DEBUG(m_data.is_attached());
CompressionSymbol id = (CompressionSymbol)(256 + m_symbols.size());
SymbolDef def{id, from[0], from[1]};
m_compression_map[hash] = def;
add_expansion(def);
m_data->add(((uint64_t)from[0]) << 16 | from[1]);
m_data.add(((uint64_t)from[0]) << 16 | from[1]);
// std::cerr << id << " = {" << from[0] << ", " << from[1] << "}" << std::endl;
*to++ = id;
from += 2;
Expand Down
9 changes: 3 additions & 6 deletions src/realm/string_compressor.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,13 @@
#ifndef REALM_STRING_COMPRESSOR_HPP
#define REALM_STRING_COMPRESSOR_HPP

#include <realm/array_unsigned.hpp>
#include <realm/utilities.hpp>
#include <vector>

using CompressionSymbol = uint16_t;
using CompressedString = std::vector<CompressionSymbol>;

struct CompressedStringView {
CompressionSymbol* data = 0;
uint32_t size = 0;
Expand Down Expand Up @@ -51,11 +53,6 @@ struct CompressedStringView {
};

namespace realm {

class ArrayUnsigned;
class Array;
class Allocator;

class StringCompressor {
public:
StringCompressor(Allocator& alloc, Array& parent, size_t index, bool writable);
Expand Down Expand Up @@ -90,7 +87,7 @@ class StringCompressor {
std::vector<ExpandedSymbolDef> m_symbols; // map from symbol -> symbolpair, 2 elements pr entry
std::vector<SymbolDef> m_compression_map; // perfect hash from symbolpair to its symbol

std::unique_ptr<ArrayUnsigned> m_data;
ArrayUnsigned m_data;
constexpr static size_t storage_chunk_size = 4096;
std::vector<std::string> m_expansion_storage;
};
Expand Down
Loading

0 comments on commit 07fde03

Please sign in to comment.