Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add getFileRawDataSize Test Utilities #73

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions dwio/nimble/encodings/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,9 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
add_library(nimble_encodings_tests_utils TestUtils.cpp)
target_link_libraries(nimble_encodings_tests_utils nimble_encodings)

add_executable(
nimble_encodings_tests
ConstantEncodingTests.cpp
Expand All @@ -27,6 +30,7 @@ add_test(nimble_encodings_tests nimble_encodings_tests)

target_link_libraries(
nimble_encodings_tests
nimble_encodings_tests_utils
nimble_encodings
nimble_common
nimble_tools_common
Expand Down
76 changes: 71 additions & 5 deletions dwio/nimble/encodings/tests/EncodingSelectionTests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#include "dwio/nimble/encodings/EncodingFactory.h"
#include "dwio/nimble/encodings/EncodingSelectionPolicy.h"
#include "dwio/nimble/encodings/NullableEncoding.h"
#include "dwio/nimble/encodings/tests/TestUtils.h"
#include "dwio/nimble/tools/EncodingUtilities.h"

using namespace ::facebook;
Expand Down Expand Up @@ -112,6 +113,12 @@ void test(std::span<const T> values, std::vector<EncodingDetails> expected) {
auto serialized =
nimble::EncodingFactory::encode<T>(std::move(policy), values, buffer);

// test getRawDataSize
auto size =
facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized);
auto expectedSize = values.size_bytes();
ASSERT_EQ(size, expectedSize);

LOG(INFO) << "Final size: " << serialized.size();

ASSERT_GT(expected.size(), 0);
Expand Down Expand Up @@ -554,6 +561,12 @@ TEST(EncodingSelectionBoolTests, SelectTrivial) {
auto serialized =
nimble::EncodingFactory::encode<T>(std::move(policy), values, buffer);

// test getRawDataSize
auto size =
facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized);
auto expectedSize = values.size() * sizeof(T);
ASSERT_EQ(size, expectedSize);

LOG(INFO) << "Final size: " << serialized.size();

verifyEncodingTree(
Expand Down Expand Up @@ -599,6 +612,12 @@ TEST(EncodingSelectionBoolTests, SelectRunLength) {
auto serialized =
nimble::EncodingFactory::encode<T>(std::move(policy), values, buffer);

// test getRawDataSize
auto size =
facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized);
auto expectedSize = values.size() * sizeof(T);
ASSERT_EQ(size, expectedSize);

LOG(INFO) << "Final size: " << serialized.size();

verifyEncodingTree(
Expand Down Expand Up @@ -635,6 +654,12 @@ TEST(EncodingSelectionStringTests, SelectConst) {
auto serialized =
nimble::EncodingFactory::encode<T>(std::move(policy), values, buffer);

// test getRawDataSize
auto size =
facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized);
auto expectedSize = value.size() * values.size();
ASSERT_EQ(size, expectedSize);

LOG(INFO) << "Final size: " << serialized.size();

verifyEncodingTree(
Expand All @@ -659,24 +684,35 @@ TEST(EncodingSelectionStringTests, SelectMainlyConst) {
std::string(5000, '\0'),
}) {
std::vector<T> values;
values.resize(1000);
auto expectedSize = 0;

auto resize = 1000;
values.resize(resize);
for (auto i = 0; i < values.size(); ++i) {
values[i] = value;
}
expectedSize += resize * value.size();

std::vector<std::string> uncommonValues;
for (auto i = 0; i < values.size() / 20; ++i) {
uncommonValues.emplace_back(i, 'b');
}

for (auto i = 0; i < uncommonValues.size(); ++i) {
values[i * 20] = uncommonValues[i];
std::string_view val = uncommonValues[i];
values[i * 20] = val;
expectedSize += val.size() - value.size();
}

auto policy = getRootManualSelectionPolicy<T>();
auto serialized =
nimble::EncodingFactory::encode<T>(std::move(policy), values, buffer);

// test getRawDataSize
auto size =
facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized);
ASSERT_EQ(size, expectedSize);

LOG(INFO) << "Final size: " << serialized.size();

verifyEncodingTree(
Expand Down Expand Up @@ -730,14 +766,21 @@ TEST(EncodingSelectionStringTests, SelectTrivial) {
}

std::vector<T> values;
auto expectedSize = 0;
values.resize(cache.size());
for (auto i = 0; i < cache.size(); ++i) {
values[i] = cache[i];
expectedSize += cache[i].size();
}

auto serialized =
nimble::EncodingFactory::encode<T>(std::move(policy), values, buffer);

// test getRawDataSize
auto size =
facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized);
ASSERT_EQ(size, expectedSize);

LOG(INFO) << "Final size: " << serialized.size();

verifyEncodingTree(
Expand Down Expand Up @@ -769,14 +812,22 @@ TEST(EncodingSelectionStringTests, SelectDictionary) {
auto policy = getRootManualSelectionPolicy<T>();

std::vector<T> values;
auto expectedSize = 0;
values.resize(10000);
for (auto i = 0; i < values.size(); ++i) {
values[i] = uniqueValues[folly::Random::rand32(rng) % uniqueValues.size()];
T val = uniqueValues[folly::Random::rand32(rng) % uniqueValues.size()];
values[i] = val;
expectedSize += val.size();
}

auto serialized =
nimble::EncodingFactory::encode<T>(std::move(policy), values, buffer);

// test getRawDataSize
auto size =
facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized);
ASSERT_EQ(size, expectedSize);

LOG(INFO) << "Final size: " << serialized.size();

verifyEncodingTree(
Expand Down Expand Up @@ -821,12 +872,15 @@ TEST(EncodingSelectionStringTests, SelectRunLength) {
}

std::vector<T> values;
auto expectedSize = 0;
values.reserve(valueCount);
auto index = 0;
for (const auto length : runLengths) {
for (auto i = 0; i < length; ++i) {
values.emplace_back(
index % 2 == 0 ? "abcdefghijklmnopqrstuvwxyz" : "1234567890");
std::string_view val =
((index % 2 == 0) ? "abcdefghijklmnopqrstuvwxyz" : "1234567890");
values.emplace_back(val);
expectedSize += val.size();
}
++index;
}
Expand All @@ -835,6 +889,11 @@ TEST(EncodingSelectionStringTests, SelectRunLength) {
auto serialized =
nimble::EncodingFactory::encode<T>(std::move(policy), values, buffer);

// test getRawDataSize
auto size =
facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized);
ASSERT_EQ(size, expectedSize);

LOG(INFO) << "Final size: " << serialized.size();

verifyEncodingTree(
Expand Down Expand Up @@ -878,5 +937,12 @@ TEST(EncodingSelectionTests, TestNullable) {

auto serialized = nimble::EncodingFactory::encodeNullable<T>(
std::move(policy), data, nulls, buffer);

// test getRawDataSize
auto size =
facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized);
auto expectedSize = 15 + 6; // 15 bytes for string data, 6 bytes for nulls
ASSERT_EQ(size, expectedSize);

LOG(INFO) << "Final size: " << serialized.size();
}
138 changes: 138 additions & 0 deletions dwio/nimble/encodings/tests/TestUtils.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
/*
* Copyright (c) Meta Platforms, Inc. and its affiliates.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "dwio/nimble/encodings/tests/TestUtils.h"
#include "dwio/nimble/encodings/EncodingUtils.h"

namespace facebook::nimble::test {

namespace {

// Byte offset, from the start of a serialized encoding, at which
// getRawDataSize() peeks the encoding's uint32 row count.
constexpr int kRowCountOffset{2};

// Number of leading bytes getRawDataSize() skips as the encoding prefix
// before reading encoding-specific payload.
constexpr int kPrefixSize{6};

// Number of bytes skipped after the prefix of a Trivial string encoding
// before its lengths section (presumably the compression-type tag).
constexpr int kCompressionTypeSize{1};

} // namespace

/// Computes the raw (un-encoded) size, in bytes, of the data stored in a
/// serialized encoding.
///
/// - Fixed-width data types: element size times row count.
/// - Strings: the sum of the lengths of all logical string values,
///   reconstructed from the encoding-specific layout (Trivial, Constant,
///   MainlyConstant, Dictionary, RLE).
/// - Nullable encodings: raw size of the nested non-null values plus one
///   byte for every null row.
///
/// @param memoryPool Pool handed to EncodingFactory::decode for this encoding
///     and any nested encodings that have to be materialized.
/// @param encodingStr Serialized bytes of exactly one encoding.
/// @return Raw data size in bytes.
///
/// Raises via NIMBLE_NOT_SUPPORTED for Sentinel encodings and for string
/// encodings whose type is not handled below.
uint64_t TestUtils::getRawDataSize(
    velox::memory::MemoryPool& memoryPool,
    std::string_view encodingStr) {
  auto encoding = EncodingFactory::decode(memoryPool, encodingStr);
  const EncodingType encodingType = encoding->encodingType();
  const DataType dataType = encoding->dataType();
  const uint32_t rowCount = encoding->rowCount();

  if (encodingType == EncodingType::Sentinel) {
    NIMBLE_NOT_SUPPORTED("Sentinel encoding is not supported");
  }

  if (encodingType == EncodingType::Nullable) {
    auto pos = encodingStr.data() + kPrefixSize;
    // readUint32 advances pos past the size field, leaving it at the start
    // of the nested non-nulls encoding.
    const auto nonNullsSize = encoding::readUint32(pos);
    const auto nonNullsCount = encoding::peek<uint32_t>(pos + kRowCountOffset);
    // We do not count the bits indicating non-null, therefore we only
    // include the size of the null bits and the non-null values.
    return getRawDataSize(memoryPool, {pos, nonNullsSize}) +
        (rowCount - nonNullsCount);
  }

  if (dataType != DataType::String) {
    const auto typeSize = nimble::detail::dataTypeSize(dataType);
    // Widen before multiplying so the product cannot wrap in a narrower type.
    return static_cast<uint64_t>(typeSize) * rowCount;
  }

  auto pos = encodingStr.data() + kPrefixSize; // Skip the prefix.
  uint64_t result = 0;

  switch (encodingType) {
    case EncodingType::Trivial: {
      pos += kCompressionTypeSize;
      const auto lengthsSize = encoding::readUint32(pos);
      auto lengths = EncodingFactory::decode(memoryPool, {pos, lengthsSize});
      std::vector<uint32_t> buffer(rowCount);
      lengths->materialize(rowCount, buffer.data());
      // The init value fixes the accumulator type: use 64 bits so the sum of
      // the 32-bit lengths cannot overflow.
      result += std::accumulate(buffer.begin(), buffer.end(), uint64_t{0});
      break;
    }

    case EncodingType::Constant: {
      const auto valueSize = encoding::readUint32(pos);
      // 64-bit multiply: rowCount * valueSize may exceed 32 bits.
      result += static_cast<uint64_t>(rowCount) * valueSize;
      break;
    }

    case EncodingType::MainlyConstant: {
      // Layout walked here: [isCommon size][isCommon][otherValues size]
      // [otherValues][constant value size]...
      const auto isCommonSize = encoding::readUint32(pos);
      pos += isCommonSize;
      const auto otherValuesSize = encoding::readUint32(pos);
      const auto otherValuesOffset = pos;
      const auto otherValuesCount =
          encoding::peek<uint32_t>(pos + kRowCountOffset);
      pos += otherValuesSize;
      const auto constantValueSize = encoding::readUint32(pos);
      // Rows that are not "other" values each carry the constant value.
      result += static_cast<uint64_t>(rowCount - otherValuesCount) *
          constantValueSize;
      result +=
          getRawDataSize(memoryPool, {otherValuesOffset, otherValuesSize});
      break;
    }

    case EncodingType::Dictionary: {
      const auto alphabetSize = encoding::readUint32(pos);
      const auto alphabetCount =
          encoding::peek<uint32_t>(pos + kRowCountOffset);
      auto alphabet = EncodingFactory::decode(memoryPool, {pos, alphabetSize});
      std::vector<std::string_view> alphabetBuffer(alphabetCount);
      alphabet->materialize(alphabetCount, alphabetBuffer.data());

      pos += alphabetSize;
      // The indices encoding occupies the remainder of the buffer.
      const auto indicesSize =
          encodingStr.length() - (pos - encodingStr.data());
      auto indices = EncodingFactory::decode(memoryPool, {pos, indicesSize});
      std::vector<uint32_t> indicesBuffer(rowCount);
      indices->materialize(rowCount, indicesBuffer.data());
      for (const auto index : indicesBuffer) {
        result += alphabetBuffer[index].size();
      }
      break;
    }

    case EncodingType::RLE: {
      const auto runLengthsSize = encoding::readUint32(pos);
      const auto runLengthsCount =
          encoding::peek<uint32_t>(pos + kRowCountOffset);
      auto runLengths =
          EncodingFactory::decode(memoryPool, {pos, runLengthsSize});
      std::vector<uint32_t> runLengthsBuffer(runLengthsCount);
      runLengths->materialize(runLengthsCount, runLengthsBuffer.data());

      pos += runLengthsSize;
      // The run values encoding occupies the remainder of the buffer.
      const auto runValuesSize =
          encodingStr.length() - (pos - encodingStr.data());
      auto runValues =
          EncodingFactory::decode(memoryPool, {pos, runValuesSize});
      std::vector<std::string_view> runValuesBuffer(runLengthsCount);
      runValues->materialize(runLengthsCount, runValuesBuffer.data());

      for (uint32_t i = 0; i < runLengthsCount; ++i) {
        result += static_cast<uint64_t>(runLengthsBuffer[i]) *
            runValuesBuffer[i].size();
      }
      break;
    }

    default:
      NIMBLE_NOT_SUPPORTED("Encoding type does not support strings.");
  }
  return result;
}
} // namespace facebook::nimble::test
8 changes: 7 additions & 1 deletion dwio/nimble/encodings/tests/TestUtils.h
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,6 @@
#pragma once

#include "dwio/nimble/encodings/ConstantEncoding.h"
#include "dwio/nimble/encodings/DeltaEncoding.h"
#include "dwio/nimble/encodings/DictionaryEncoding.h"
#include "dwio/nimble/encodings/Encoding.h"
#include "dwio/nimble/encodings/EncodingFactory.h"
Expand Down Expand Up @@ -236,4 +235,11 @@ class Encoder {
encodeNullable(buffer, values, nulls, compressionType));
}
};

// Static helpers shared by the encoding tests.
class TestUtils {
public:
// Returns the raw (un-encoded) size in bytes of the data held by the
// serialized encoding `encodingStr`. As implemented in TestUtils.cpp:
//  - fixed-width data types: element size * row count;
//  - strings: sum of the lengths of all logical string values;
//  - nullable encodings: raw size of the non-null values plus one byte
//    per null row.
// `memoryPool` is passed to EncodingFactory::decode when decoding
// `encodingStr` and its nested encodings.
// Sentinel encodings, and string encodings of an unhandled type, are
// rejected via NIMBLE_NOT_SUPPORTED.
static uint64_t getRawDataSize(
velox::memory::MemoryPool& memoryPool,
std::string_view encodingStr);
};
} // namespace facebook::nimble::test
Loading