From 7a848567786e7f8d950441e433048cda373d9fb0 Mon Sep 17 00:00:00 2001 From: Stephanie Han Date: Thu, 19 Sep 2024 13:06:33 -0700 Subject: [PATCH] Add getRawDataSize to Test Utilities (#73) Summary: Pull Request resolved: https://github.com/facebookincubator/nimble/pull/73 # Changes Adds functions to the TestUtils.h headers in the tests folders of both `encodings` and `velox`, which allow us to calculate the raw data size of files by reading them. - Note the special handling for Nullable encoding types and String data types in the `getRawDataSize` function. # Context This will be used by nimble_dump later on. RawDataSize is for streams and encodings, so this captures the raw data size *after* some optimizations such as deduping are applied, but *before* compression and serde. Reviewed By: sdruzkin, helfman Differential Revision: D61050482 --- dwio/nimble/encodings/tests/CMakeLists.txt | 4 + .../tests/EncodingSelectionTests.cpp | 76 +++++++++- dwio/nimble/encodings/tests/TestUtils.cpp | 138 ++++++++++++++++++ dwio/nimble/encodings/tests/TestUtils.h | 8 +- dwio/nimble/velox/tests/TestUtils.h | 52 +++++++ 5 files changed, 272 insertions(+), 6 deletions(-) create mode 100644 dwio/nimble/encodings/tests/TestUtils.cpp create mode 100644 dwio/nimble/velox/tests/TestUtils.h diff --git a/dwio/nimble/encodings/tests/CMakeLists.txt b/dwio/nimble/encodings/tests/CMakeLists.txt index 0751320..ed23d61 100644 --- a/dwio/nimble/encodings/tests/CMakeLists.txt +++ b/dwio/nimble/encodings/tests/CMakeLists.txt @@ -11,6 +11,9 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
+add_library(nimble_encodings_tests_utils TestUtils.cpp) +target_link_libraries(nimble_encodings_tests_utils nimble_encodings) + add_executable( nimble_encodings_tests ConstantEncodingTests.cpp @@ -27,6 +30,7 @@ add_test(nimble_encodings_tests nimble_encodings_tests) target_link_libraries( nimble_encodings_tests + nimble_encodings_tests_utils nimble_encodings nimble_common nimble_tools_common diff --git a/dwio/nimble/encodings/tests/EncodingSelectionTests.cpp b/dwio/nimble/encodings/tests/EncodingSelectionTests.cpp index 110c9dd..5c5aa94 100644 --- a/dwio/nimble/encodings/tests/EncodingSelectionTests.cpp +++ b/dwio/nimble/encodings/tests/EncodingSelectionTests.cpp @@ -21,6 +21,7 @@ #include "dwio/nimble/encodings/EncodingFactory.h" #include "dwio/nimble/encodings/EncodingSelectionPolicy.h" #include "dwio/nimble/encodings/NullableEncoding.h" +#include "dwio/nimble/encodings/tests/TestUtils.h" #include "dwio/nimble/tools/EncodingUtilities.h" using namespace ::facebook; @@ -112,6 +113,12 @@ void test(std::span values, std::vector expected) { auto serialized = nimble::EncodingFactory::encode(std::move(policy), values, buffer); + // test getRawDataSize + auto size = + facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized); + auto expectedSize = values.size_bytes(); + ASSERT_EQ(size, expectedSize); + LOG(INFO) << "Final size: " << serialized.size(); ASSERT_GT(expected.size(), 0); @@ -554,6 +561,12 @@ TEST(EncodingSelectionBoolTests, SelectTrivial) { auto serialized = nimble::EncodingFactory::encode(std::move(policy), values, buffer); + // test getRawDataSize + auto size = + facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized); + auto expectedSize = values.size() * sizeof(T); + ASSERT_EQ(size, expectedSize); + LOG(INFO) << "Final size: " << serialized.size(); verifyEncodingTree( @@ -599,6 +612,12 @@ TEST(EncodingSelectionBoolTests, SelectRunLength) { auto serialized = nimble::EncodingFactory::encode(std::move(policy), values, buffer); + // 
test getRawDataSize + auto size = + facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized); + auto expectedSize = values.size() * sizeof(T); + ASSERT_EQ(size, expectedSize); + LOG(INFO) << "Final size: " << serialized.size(); verifyEncodingTree( @@ -635,6 +654,12 @@ TEST(EncodingSelectionStringTests, SelectConst) { auto serialized = nimble::EncodingFactory::encode(std::move(policy), values, buffer); + // test getRawDataSize + auto size = + facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized); + auto expectedSize = value.size() * values.size(); + ASSERT_EQ(size, expectedSize); + LOG(INFO) << "Final size: " << serialized.size(); verifyEncodingTree( @@ -659,10 +684,14 @@ TEST(EncodingSelectionStringTests, SelectMainlyConst) { std::string(5000, '\0'), }) { std::vector values; - values.resize(1000); + auto expectedSize = 0; + + auto resize = 1000; + values.resize(resize); for (auto i = 0; i < values.size(); ++i) { values[i] = value; } + expectedSize += resize * value.size(); std::vector uncommonValues; for (auto i = 0; i < values.size() / 20; ++i) { @@ -670,13 +699,20 @@ TEST(EncodingSelectionStringTests, SelectMainlyConst) { } for (auto i = 0; i < uncommonValues.size(); ++i) { - values[i * 20] = uncommonValues[i]; + std::string_view val = uncommonValues[i]; + values[i * 20] = val; + expectedSize += val.size() - value.size(); } auto policy = getRootManualSelectionPolicy(); auto serialized = nimble::EncodingFactory::encode(std::move(policy), values, buffer); + // test getRawDataSize + auto size = + facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized); + ASSERT_EQ(size, expectedSize); + LOG(INFO) << "Final size: " << serialized.size(); verifyEncodingTree( @@ -730,14 +766,21 @@ TEST(EncodingSelectionStringTests, SelectTrivial) { } std::vector values; + auto expectedSize = 0; values.resize(cache.size()); for (auto i = 0; i < cache.size(); ++i) { values[i] = cache[i]; + expectedSize += cache[i].size(); } auto serialized = 
nimble::EncodingFactory::encode(std::move(policy), values, buffer); + // test getRawDataSize + auto size = + facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized); + ASSERT_EQ(size, expectedSize); + LOG(INFO) << "Final size: " << serialized.size(); verifyEncodingTree( @@ -769,14 +812,22 @@ TEST(EncodingSelectionStringTests, SelectDictionary) { auto policy = getRootManualSelectionPolicy(); std::vector values; + auto expectedSize = 0; values.resize(10000); for (auto i = 0; i < values.size(); ++i) { - values[i] = uniqueValues[folly::Random::rand32(rng) % uniqueValues.size()]; + T val = uniqueValues[folly::Random::rand32(rng) % uniqueValues.size()]; + values[i] = val; + expectedSize += val.size(); } auto serialized = nimble::EncodingFactory::encode(std::move(policy), values, buffer); + // test getRawDataSize + auto size = + facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized); + ASSERT_EQ(size, expectedSize); + LOG(INFO) << "Final size: " << serialized.size(); verifyEncodingTree( @@ -821,12 +872,15 @@ TEST(EncodingSelectionStringTests, SelectRunLength) { } std::vector values; + auto expectedSize = 0; values.reserve(valueCount); auto index = 0; for (const auto length : runLengths) { for (auto i = 0; i < length; ++i) { - values.emplace_back( - index % 2 == 0 ? "abcdefghijklmnopqrstuvwxyz" : "1234567890"); + std::string_view val = + ((index % 2 == 0) ? 
"abcdefghijklmnopqrstuvwxyz" : "1234567890"); + values.emplace_back(val); + expectedSize += val.size(); } ++index; } @@ -835,6 +889,11 @@ TEST(EncodingSelectionStringTests, SelectRunLength) { auto serialized = nimble::EncodingFactory::encode(std::move(policy), values, buffer); + // test getRawDataSize + auto size = + facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized); + ASSERT_EQ(size, expectedSize); + LOG(INFO) << "Final size: " << serialized.size(); verifyEncodingTree( @@ -878,5 +937,12 @@ TEST(EncodingSelectionTests, TestNullable) { auto serialized = nimble::EncodingFactory::encodeNullable( std::move(policy), data, nulls, buffer); + + // test getRawDataSize + auto size = + facebook::nimble::test::TestUtils::getRawDataSize(*pool, serialized); + auto expectedSize = 15 + 6; // 15 bytes for string data, 6 bytes for nulls + ASSERT_EQ(size, expectedSize); + LOG(INFO) << "Final size: " << serialized.size(); } diff --git a/dwio/nimble/encodings/tests/TestUtils.cpp b/dwio/nimble/encodings/tests/TestUtils.cpp new file mode 100644 index 0000000..162967e --- /dev/null +++ b/dwio/nimble/encodings/tests/TestUtils.cpp @@ -0,0 +1,138 @@ +/* + * Copyright (c) Meta Platforms, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "dwio/nimble/encodings/tests/TestUtils.h"
+#include <numeric>
+#include "dwio/nimble/encodings/EncodingUtils.h"
+
+namespace facebook::nimble::test {
+
+// Byte offsets used to walk a serialized encoding. NOTE(review): presumably
+// the 6-byte prefix is encoding type, data type and a 4-byte row count, which
+// would put the row count at offset 2 -- confirm against the Encoding layout.
+static constexpr int kRowCountOffset = 2;
+static constexpr int kPrefixSize = 6;
+static constexpr int kCompressionTypeSize = 1;
+
+// Returns the raw size, in bytes, of the data stored in the serialized
+// encoding `encodingStr`; `memoryPool` is used for decoding.
+//  - Nullable: raw size of the nested non-null values plus 1 per null row.
+//  - Non-string data: element size of the data type times the row count.
+//  - String data: sum of the string lengths, recovered per encoding type.
+// Sentinel encodings, and string data in an encoding type without a case
+// below, are rejected with NIMBLE_NOT_SUPPORTED. Products and sums are
+// carried out in 64 bits so that large inputs cannot wrap around at 2^32.
+uint64_t TestUtils::getRawDataSize(
+    velox::memory::MemoryPool& memoryPool,
+    std::string_view encodingStr) {
+  auto encoding = EncodingFactory::decode(memoryPool, encodingStr);
+  const EncodingType encodingType = encoding->encodingType();
+  const DataType dataType = encoding->dataType();
+  const uint32_t rowCount = encoding->rowCount();
+
+  if (encodingType == EncodingType::Sentinel) {
+    NIMBLE_NOT_SUPPORTED("Sentinel encoding is not supported");
+  }
+
+  if (encodingType == EncodingType::Nullable) {
+    auto pos = encodingStr.data() + kPrefixSize;
+    auto nonNullsSize = encoding::readUint32(pos);
+    auto nonNullsCount = encoding::peek<uint32_t>(pos + kRowCountOffset);
+    // The bits marking rows as non-null are not counted: the total is the
+    // nested non-null values plus one unit for every null row.
+    return getRawDataSize(memoryPool, {pos, nonNullsSize}) +
+        (rowCount - nonNullsCount);
+  }
+
+  if (dataType != DataType::String) {
+    const auto typeSize = nimble::detail::dataTypeSize(dataType);
+    return static_cast<uint64_t>(typeSize) * rowCount;
+  }
+
+  auto pos = encodingStr.data() + kPrefixSize; // Skip the prefix.
+  uint64_t result = 0;
+  switch (encodingType) {
+    case EncodingType::Trivial: {
+      pos += kCompressionTypeSize;
+      auto lengthsSize = encoding::readUint32(pos);
+      auto lengths = EncodingFactory::decode(memoryPool, {pos, lengthsSize});
+      std::vector<uint32_t> buffer(rowCount);
+      lengths->materialize(rowCount, buffer.data());
+      // A `0u` seed would make std::accumulate sum (and wrap) in 32 bits.
+      result += std::accumulate(buffer.begin(), buffer.end(), uint64_t{0});
+      break;
+    }
+    case EncodingType::Constant: {
+      auto valueSize = encoding::readUint32(pos);
+      result += static_cast<uint64_t>(rowCount) * valueSize;
+      break;
+    }
+    case EncodingType::MainlyConstant: {
+      auto isCommonSize = encoding::readUint32(pos);
+      pos += isCommonSize;
+      auto otherValuesSize = encoding::readUint32(pos);
+      auto otherValuesOffset = pos;
+      auto otherValuesCount = encoding::peek<uint32_t>(pos + kRowCountOffset);
+      pos += otherValuesSize;
+      auto constantValueSize = encoding::readUint32(pos);
+      const uint64_t commonCount = rowCount - otherValuesCount;
+      result += commonCount * constantValueSize;
+      result +=
+          getRawDataSize(memoryPool, {otherValuesOffset, otherValuesSize});
+      break;
+    }
+    case EncodingType::Dictionary: {
+      auto alphabetSize = encoding::readUint32(pos);
+      auto alphabetCount = encoding::peek<uint32_t>(pos + kRowCountOffset);
+      auto alphabet = EncodingFactory::decode(memoryPool, {pos, alphabetSize});
+      std::vector<std::string_view> alphabetBuffer(alphabetCount);
+      alphabet->materialize(alphabetCount, alphabetBuffer.data());
+      // The indices stream takes up the remainder of the buffer.
+      pos += alphabetSize;
+      auto indicesSize = encodingStr.length() - (pos - encodingStr.data());
+      auto indices = EncodingFactory::decode(memoryPool, {pos, indicesSize});
+      std::vector<uint32_t> indicesBuffer(rowCount);
+      indices->materialize(rowCount, indicesBuffer.data());
+      for (const auto index : indicesBuffer) {
+        result += alphabetBuffer[index].size();
+      }
+      break;
+    }
+    case EncodingType::RLE: {
+      auto runLengthsSize = encoding::readUint32(pos);
+      auto runLengthsCount = encoding::peek<uint32_t>(pos + kRowCountOffset);
+      auto runLengths =
+          EncodingFactory::decode(memoryPool, {pos, runLengthsSize});
+      std::vector<uint32_t> runLengthsBuffer(runLengthsCount);
+      runLengths->materialize(runLengthsCount, runLengthsBuffer.data());
+      // The run values stream takes up the remainder of the buffer.
+      pos += runLengthsSize;
+      auto runValuesSize = encodingStr.length() - (pos - encodingStr.data());
+      auto runValues =
+          EncodingFactory::decode(memoryPool, {pos, runValuesSize});
+      std::vector<std::string_view> runValuesBuffer(runLengthsCount);
+      runValues->materialize(runLengthsCount, runValuesBuffer.data());
+      for (uint32_t i = 0; i < runLengthsCount; ++i) {
+        result += uint64_t{runLengthsBuffer[i]} * runValuesBuffer[i].size();
+      }
+      break;
+    }
+    default:
+      NIMBLE_NOT_SUPPORTED("Encoding type does not support strings.");
+  }
+  return result;
+}
+} // namespace facebook::nimble::test
diff --git a/dwio/nimble/encodings/tests/TestUtils.h b/dwio/nimble/encodings/tests/TestUtils.h
index 8f09677..2738b14 100644
--- a/dwio/nimble/encodings/tests/TestUtils.h
+++ b/dwio/nimble/encodings/tests/TestUtils.h
@@ -16,7 +16,6 @@
 #pragma once
 
 #include "dwio/nimble/encodings/ConstantEncoding.h"
-#include "dwio/nimble/encodings/DeltaEncoding.h"
 #include "dwio/nimble/encodings/DictionaryEncoding.h"
 #include "dwio/nimble/encodings/Encoding.h"
 #include "dwio/nimble/encodings/EncodingFactory.h"
@@ -236,4 +235,11 @@ class Encoder {
         encodeNullable(buffer, values, nulls, compressionType));
   }
 };
+
+class TestUtils {
+ public:
+  static uint64_t getRawDataSize(
+      velox::memory::MemoryPool& memoryPool,
+      std::string_view encodingStr);
+};
 } // namespace facebook::nimble::test
diff --git a/dwio/nimble/velox/tests/TestUtils.h b/dwio/nimble/velox/tests/TestUtils.h
new file mode 100644
index 0000000..4703238
--- /dev/null
+++ b/dwio/nimble/velox/tests/TestUtils.h
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <numeric>
+#include "dwio/nimble/encodings/tests/TestUtils.h"
+#include "dwio/nimble/tablet/TabletReader.h"
+#include "dwio/nimble/velox/ChunkedStream.h"
+
+namespace facebook::nimble::test {
+// Returns the raw data size of a file: TestUtils::getRawDataSize summed over
+// every chunk of every non-null stream of every stripe in `tablet`.
+inline std::uint64_t getRawStreamSize(
+    velox::memory::MemoryPool& pool,
+    nimble::TabletReader& tablet) {
+  std::uint64_t rawSize = 0;
+  for (uint32_t i = 0; i < tablet.stripeCount(); ++i) {
+    auto stripeIdentifier = tablet.getStripeIdentifier(i);
+    // Request every stream of the stripe: identifiers 0..numStreams-1.
+    auto numStreams = tablet.streamCount(stripeIdentifier);
+    std::vector<uint32_t> identifiers(numStreams);
+    std::iota(identifiers.begin(), identifiers.end(), 0);
+    auto streams = tablet.load(stripeIdentifier, identifiers);
+    for (auto& stream : streams) {
+      // Skip nullStreams indicated by nullptr.
+      if (stream == nullptr) {
+        continue;
+      }
+      nimble::InMemoryChunkedStream chunkedStream{pool, std::move(stream)};
+      while (chunkedStream.hasNext()) {
+        auto chunk = chunkedStream.nextChunk();
+        rawSize += TestUtils::getRawDataSize(pool, chunk);
+      }
+    }
+  }
+  return rawSize;
+}
+
+} // namespace facebook::nimble::test