Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Vector comparisons #2119

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 13 additions & 5 deletions cpp/arcticdb/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -383,6 +383,11 @@ set(arcticdb_srcs
util/type_traits.hpp
util/variant.hpp
util/gil_safe_py_none.hpp
util/min_max_integer.hpp
util/mean.hpp
util/min_max_float.hpp
util/sum.hpp
util/vector_common.hpp
version/de_dup_map.hpp
version/op_log.hpp
version/schema_checks.hpp
Expand Down Expand Up @@ -760,8 +765,8 @@ if (SSL_LINK)
find_package(OpenSSL REQUIRED)
list(APPEND arcticdb_core_libraries OpenSSL::SSL)
if (NOT WIN32)
list(APPEND arcticdb_core_libraries ${KERBEROS_LIBRARY})
list(APPEND arcticdb_core_includes ${KERBEROS_INCLUDE_DIR})
#list(APPEND arcticdb_core_libraries ${KERBEROS_LIBRARY})
#list(APPEND arcticdb_core_includes ${KERBEROS_INCLUDE_DIR})
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Uncomment

endif()
endif ()
target_link_libraries(arcticdb_core_object PUBLIC ${arcticdb_core_libraries})
Expand Down Expand Up @@ -979,6 +984,10 @@ if(${TEST})
util/test/test_storage_lock.cpp
util/test/test_string_pool.cpp
util/test/test_string_utils.cpp
util/test/test_min_max_float.cpp
util/test/test_min_max_integer.cpp
util/test/test_sum.cpp
util/test/test_mean.cpp
util/test/test_tracing_allocator.cpp
version/test/test_append.cpp
version/test/test_key_block.cpp
Expand All @@ -991,8 +1000,7 @@ if(${TEST})
version/test/test_version_map_batch.cpp
version/test/test_version_store.cpp
version/test/version_map_model.hpp
python/python_handlers.cpp
)
python/python_handlers.cpp)

set(EXECUTABLE_PERMS OWNER_WRITE OWNER_READ OWNER_EXECUTE GROUP_READ GROUP_EXECUTE WORLD_READ WORLD_EXECUTE) # 755

Expand Down Expand Up @@ -1097,7 +1105,7 @@ if(${TEST})
util/test/rapidcheck_string_pool.cpp
util/test/rapidcheck_main.cpp
util/test/rapidcheck_lru_cache.cpp
version/test/rapidcheck_version_map.cpp)
version/test/rapidcheck_version_map.cpp util/test/test_min_max_integer.cpp)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not needed here, although this code is a good candidate for rapidcheck testing


add_executable(arcticdb_rapidcheck_tests ${rapidcheck_srcs})
install(TARGETS arcticdb_rapidcheck_tests RUNTIME
Expand Down
21 changes: 12 additions & 9 deletions cpp/arcticdb/column_store/block.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
namespace arcticdb {

struct MemBlock {
static const size_t Align = 128;
static const size_t MinSize = 64;
using magic_t = arcticdb::util::MagicNum<'M', 'e', 'm', 'b'>;
magic_t magic_;
Expand Down Expand Up @@ -140,17 +139,21 @@ struct MemBlock {
bool owns_external_data_ = false;

static const size_t HeaderDataSize =
sizeof(magic_) + // 8 bytes
sizeof(bytes_) + // 8 bytes
sizeof(capacity_) + // 8 bytes
sizeof(magic_) +
sizeof(bytes_) +
sizeof(capacity_) +
sizeof(external_data_) +
sizeof(offset_) +
sizeof(timestamp_) +
sizeof(timestamp_) +
sizeof(owns_external_data_);

uint8_t pad[Align - HeaderDataSize];
static const size_t HeaderSize = HeaderDataSize + sizeof(pad);
static_assert(HeaderSize == Align);
uint8_t data_[MinSize];
static const size_t DataAlignment = 64;
static const size_t PadSize = (DataAlignment - (HeaderDataSize % DataAlignment)) % DataAlignment;
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

mod operator is a no-op, (DataAlignment - (HeaderDataSize % DataAlignment)) < DataAlignment, so (DataAlignment - (HeaderDataSize % DataAlignment)) % DataAlignment == (DataAlignment - (HeaderDataSize % DataAlignment))


uint8_t pad[PadSize];
static const size_t HeaderSize = HeaderDataSize + PadSize;
static_assert(HeaderSize % DataAlignment == 0, "Header size must be aligned to 64 bytes");

alignas(DataAlignment) uint8_t data_[MinSize];
};
}
4 changes: 2 additions & 2 deletions cpp/arcticdb/column_store/chunked_buffer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ std::vector<ChunkedBufferImpl<BlockSize>> split(const ChunkedBufferImpl<BlockSiz
}

template std::vector<ChunkedBufferImpl<64>> split(const ChunkedBufferImpl<64>& input, size_t nbytes);
template std::vector<ChunkedBufferImpl<3968>> split(const ChunkedBufferImpl<3968>& input, size_t nbytes);
template std::vector<ChunkedBufferImpl<4032ul>> split(const ChunkedBufferImpl<4032ul>& input, size_t nbytes);
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Was this just too small in error before?

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

If there's any data that has been written into storage in 3968 byte blocks, will they still be readable without further modifications?


// Inclusive of start_byte, exclusive of end_byte
template <size_t BlockSize>
Expand Down Expand Up @@ -112,6 +112,6 @@ ChunkedBufferImpl<BlockSize> truncate(const ChunkedBufferImpl<BlockSize>& input,
}

template ChunkedBufferImpl<64> truncate(const ChunkedBufferImpl<64>& input, size_t start_byte, size_t end_byte);
template ChunkedBufferImpl<3968> truncate(const ChunkedBufferImpl<3968>& input, size_t start_byte, size_t end_byte);
template ChunkedBufferImpl<4032ul> truncate(const ChunkedBufferImpl<4032ul>& input, size_t start_byte, size_t end_byte);

} //namespace arcticdb
1 change: 0 additions & 1 deletion cpp/arcticdb/column_store/chunked_buffer.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ class ChunkedBufferImpl {

using BlockType = MemBlock;

static_assert(sizeof(BlockType) == BlockType::Align + BlockType::MinSize);
static_assert(DefaultBlockSize >= BlockType::MinSize);

public:
Expand Down
3 changes: 1 addition & 2 deletions cpp/arcticdb/column_store/test/test_column.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
*/

#include <gtest/gtest.h>
#include <cstdint>
#include <limits>

#include <arcticdb/column_store/memory_segment.hpp>
Expand Down Expand Up @@ -410,7 +409,7 @@ TEST(ColumnStats, MultipleBlocks) {
EXPECT_TRUE(stats.has_max());
EXPECT_TRUE(stats.has_unique());

EXPECT_EQ(single_col.buffer().num_blocks(), 2017);
EXPECT_EQ(single_col.buffer().num_blocks(), 1985);

EXPECT_EQ(stats.get_min<uint64_t>(), 0);
EXPECT_EQ(stats.get_max<int32_t>(), 999'999);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,7 @@ class ColumnDataRandomAccessorTest : public testing::Test {
input_data.resize(n);
std::iota(input_data.begin(), input_data.end(), 42);
}
// 3968 bytes == 496 int64s per block, so 3 blocks here
size_t n{1000};
size_t n{1100};
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Comment is useful, it's now 504 int64s per block

std::vector<int64_t> input_data;
TypeDescriptor type_descriptor{static_cast<TypeDescriptor>(TDT{})};
};
Expand Down
1 change: 0 additions & 1 deletion cpp/arcticdb/storage/s3/s3_client_wrapper.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,6 @@ folly::Future<S3Result<std::monostate>> S3ClientTestWrapper::delete_object(

// Using a fixed page size since it's only being used for simple tests.
// If we ever need to configure it we should move it to the s3 proto config instead.
constexpr auto page_size = 10;
S3Result<ListObjectsOutput> S3ClientTestWrapper::list_objects(
const std::string& name_prefix,
const std::string& bucket_name,
Expand Down
123 changes: 123 additions & 0 deletions cpp/arcticdb/util/mean.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
#include <cstdint>
#include <limits>
#include <type_traits>
#include <cstddef>

#include <arcticdb/util/vector_common.hpp>
#include <arcticdb/util/preconditions.hpp>

namespace arcticdb {

#if HAS_VECTOR_EXTENSIONS

template<typename Vec, typename U>
inline void fill_vector(Vec &vec, U value) {
U* p = reinterpret_cast<U*>(&vec);
constexpr size_t count = sizeof(Vec) / sizeof(U);
for (size_t i = 0; i < count; i++) {
p[i] = value;
}
}

template<typename T>
class MeanFinder {
static_assert(is_supported_int<T>::value || is_supported_float<T>::value, "Unsupported type");
public:
static double find(const T* __restrict data, size_t n) {
util::check(n > 0, "Empty array provided");
using VectorType = vector_type<T>;
using AccumVectorType = vector_type<double>;
constexpr size_t elements_per_vector = sizeof(VectorType) / sizeof(T);
constexpr size_t doubles_per_vector = sizeof(AccumVectorType) / sizeof(double);
constexpr size_t vectors_per_acc = elements_per_vector / doubles_per_vector;
AccumVectorType vsum;
double* vsum_ptr = reinterpret_cast<double*>(&vsum);
fill_vector<AccumVectorType, double>(vsum, 0.0);
size_t valid_count = 0;
const VectorType* vdata = reinterpret_cast<const VectorType*>(data);
const size_t vector_len = n / elements_per_vector;
for (size_t i = 0; i < vector_len; i++) {
VectorType v = vdata[i];
const T* v_arr = reinterpret_cast<const T*>(&v);
if constexpr (std::is_floating_point_v<T>) {
bool has_nan = false;
for (size_t j = 0; j < elements_per_vector; j++) {
if (std::isnan(v_arr[j])) { has_nan = true; break; }
}
if (!has_nan) {
for (size_t chunk = 0; chunk < vectors_per_acc; chunk++) {
size_t base = chunk * doubles_per_vector;
for (size_t j = 0; j < doubles_per_vector; j++) {
if constexpr (std::is_same_v<T, double>)
vsum_ptr[j] += v_arr[base + j];
else
vsum_ptr[j] += static_cast<double>(v_arr[base + j]);
}
}
valid_count += elements_per_vector;
} else {
for (size_t chunk = 0; chunk < vectors_per_acc; chunk++) {
size_t base = chunk * doubles_per_vector;
for (size_t j = 0; j < doubles_per_vector; j++) {
size_t idx = base + j;
if (!std::isnan(v_arr[idx])) {
if constexpr (std::is_same_v<T, double>)
vsum_ptr[j] += v_arr[idx];
else
vsum_ptr[j] += static_cast<double>(v_arr[idx]);
valid_count++;
}
}
}
}
} else {
for (size_t chunk = 0; chunk < vectors_per_acc; chunk++) {
size_t base = chunk * doubles_per_vector;
for (size_t j = 0; j < doubles_per_vector; j++) {
vsum_ptr[j] += static_cast<double>(v_arr[base + j]);
}
}
valid_count += elements_per_vector;
}
}
double total = 0.0;
for (size_t j = 0; j < doubles_per_vector; j++) {
total += vsum_ptr[j];
}
const T* remain = data + (vector_len * elements_per_vector);
size_t rem = n % elements_per_vector;
for (size_t i = 0; i < rem; i++) {
if constexpr (std::is_floating_point_v<T>) {
if (!std::isnan(remain[i])) {
if constexpr (std::is_same_v<T, double>)
total += remain[i];
else
total += static_cast<double>(remain[i]);
valid_count++;
}
} else {
total += static_cast<double>(remain[i]);
valid_count++;
}
}
return valid_count > 0 ? total / static_cast<double>(valid_count) : 0.0;
}
};

template<typename T>
double find_mean(const T* __restrict data, size_t n) {
return MeanFinder<T>::find(data, n);
}

#else

template <typename T, typename = typename std::enable_if<std::is_arithmetic<T>::value>::type>
double find_mean(const T* data, std::size_t size) {
util::check(size > 0, "Cannot compute mean of an empty array.");
double sum = std::accumulate(data, data + size, 0.0);
return sum / size;
}

#endif

} // namespace arcticdb
93 changes: 93 additions & 0 deletions cpp/arcticdb/util/min_max_float.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
#include <cstdint>
#include <limits>
#include <type_traits>
#include <cstddef>
#include <algorithm>

#include <arcticdb/util/vector_common.hpp>

namespace arcticdb {

#if HAS_VECTOR_EXTENSIONS

template<typename T, typename Comparator>
class FloatExtremumFinder {
static_assert(is_supported_float<T>::value, "Type must be float or double");
static_assert(std::is_floating_point_v<T>, "Type must be floating point");
public:
static T find(const T* data, size_t n) {
if (n == 0)
return Comparator::identity();
using vec_t = vector_type<T>;
constexpr size_t lane_count = sizeof(vec_t) / sizeof(T);
vec_t vext;
for (size_t i = 0; i < lane_count; i++)
reinterpret_cast<T*>(&vext)[i] = Comparator::identity();

const vec_t* vdata = reinterpret_cast<const vec_t*>(data);
size_t vlen = n / lane_count;
for (size_t i = 0; i < vlen; i++) {
vec_t v = vdata[i];
if constexpr (Comparator::is_min)
vext = (v < vext) ? v : vext;
else
vext = (v > vext) ? v : vext;
}
T result = Comparator::identity();
const T* lanes = reinterpret_cast<const T*>(&vext);
for (size_t i = 0; i < lane_count; i++) {
if (lanes[i] == lanes[i])
result = Comparator::compare(lanes[i], result);
}
const T* remain = data + (vlen * lane_count);
size_t remain_count = n % lane_count;
for (size_t i = 0; i < remain_count; i++) {
if (remain[i] == remain[i])
result = Comparator::compare(remain[i], result);
}
return result;
}
};

template<typename T>
struct FloatMinComparator {
static constexpr bool is_min = true;
static T identity() { return std::numeric_limits<T>::infinity(); }
static T compare(T a, T b) { return std::min(a, b); }
};

template<typename T>
struct FloatMaxComparator {
static constexpr bool is_min = false;
static T identity() { return -std::numeric_limits<T>::infinity(); }
static T compare(T a, T b) { return std::max(a, b); }
};

template<typename T>
T find_float_min(const T* data, size_t n) {
return FloatExtremumFinder<T, FloatMinComparator<T>>::find(data, n);
}

template<typename T>
T find_float_max(const T* data, size_t n) {
return FloatExtremumFinder<T, FloatMaxComparator<T>>::find(data, n);
}

#else
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same comments below the #else as for min_max_integer.hpp


template<typename T>
typename std::enable_if<std::is_floating_point<T>::value, T>::type
find_float_min(const T *data, size_t n) {
util::check(size != 0, "Got zero size in find_float_min");

Check failure on line 81 in cpp/arcticdb/util/min_max_float.hpp

View workflow job for this annotation

GitHub Actions / Windows C++ Tests / compile (windows, windows-cl, win_amd64, C:/cpp_build, C:/vcpkg_packages, *.pdb, *.lib, *.ilk, *....

'util': identifier not found

Check failure on line 81 in cpp/arcticdb/util/min_max_float.hpp

View workflow job for this annotation

GitHub Actions / Windows C++ Tests / compile (windows, windows-cl, win_amd64, C:/cpp_build, C:/vcpkg_packages, *.pdb, *.lib, *.ilk, *....

'size': undeclared identifier

Check failure on line 81 in cpp/arcticdb/util/min_max_float.hpp

View workflow job for this annotation

GitHub Actions / Windows C++ Tests / compile (windows, windows-cl, win_amd64, C:/cpp_build, C:/vcpkg_packages, *.pdb, *.lib, *.ilk, *....

'size': identifier not found

Check failure on line 81 in cpp/arcticdb/util/min_max_float.hpp

View workflow job for this annotation

GitHub Actions / Windows C++ Tests / compile (windows, windows-cl, win_amd64, C:/cpp_build, C:/vcpkg_packages, *.pdb, *.lib, *.ilk, *....

'util': undeclared identifier
return *std::min_element(data, data + n);
}

template<typename T>
typename std::enable_if<std::is_floating_point<T>::value, T>::type
find_float_max(const T *data, size_t n) {
util::check(size != 0, "Got zero size in find_float_max");

Check failure on line 88 in cpp/arcticdb/util/min_max_float.hpp

View workflow job for this annotation

GitHub Actions / Windows C++ Tests / compile (windows, windows-cl, win_amd64, C:/cpp_build, C:/vcpkg_packages, *.pdb, *.lib, *.ilk, *....

'util': identifier not found

Check failure on line 88 in cpp/arcticdb/util/min_max_float.hpp

View workflow job for this annotation

GitHub Actions / Windows C++ Tests / compile (windows, windows-cl, win_amd64, C:/cpp_build, C:/vcpkg_packages, *.pdb, *.lib, *.ilk, *....

'size': undeclared identifier

Check failure on line 88 in cpp/arcticdb/util/min_max_float.hpp

View workflow job for this annotation

GitHub Actions / Windows C++ Tests / compile (windows, windows-cl, win_amd64, C:/cpp_build, C:/vcpkg_packages, *.pdb, *.lib, *.ilk, *....

'size': identifier not found

Check failure on line 88 in cpp/arcticdb/util/min_max_float.hpp

View workflow job for this annotation

GitHub Actions / Windows C++ Tests / compile (windows, windows-cl, win_amd64, C:/cpp_build, C:/vcpkg_packages, *.pdb, *.lib, *.ilk, *....

'util': undeclared identifier
return *std::max_element(data, data + n);
}

#endif
} // namespace arcticdb
Loading
Loading