From c900fd81b26533982dbf43b027cab79305879719 Mon Sep 17 00:00:00 2001
From: stephoenix
Date: Tue, 20 Aug 2024 14:44:23 -0700
Subject: [PATCH] Add getFileRawDataSize Test Utilities (#73)

Summary:
Pull Request resolved: https://github.com/facebookincubator/nimble/pull/73

# Changes
Add functions to the test utilities (`TestUtils.h`) in the tests folders of both `encodings` and `velox` that let us calculate the raw data size of a file by reading it back.
- Note the special handling for Nullable encoding types and String data types in the `getRawChunkSize` function.

# Context
This is necessary to check the correctness of the raw data size stat added in the following diff, which calculates the size when writing. This change will also be used by nimble_dump later on (to be implemented in a later diff).

Differential Revision: D61050482
---
 dwio/nimble/encodings/tests/TestUtils.cpp | 134 ++++++++++++++++++++++
 dwio/nimble/encodings/tests/TestUtils.h   |   8 +-
 dwio/nimble/velox/tests/TestUtils.h       |  53 +++++++++
 3 files changed, 194 insertions(+), 1 deletion(-)
 create mode 100644 dwio/nimble/encodings/tests/TestUtils.cpp
 create mode 100644 dwio/nimble/velox/tests/TestUtils.h

diff --git a/dwio/nimble/encodings/tests/TestUtils.cpp b/dwio/nimble/encodings/tests/TestUtils.cpp
new file mode 100644
index 0000000..6a2e232
--- /dev/null
+++ b/dwio/nimble/encodings/tests/TestUtils.cpp
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "dwio/nimble/encodings/tests/TestUtils.h"
+#include "dwio/nimble/encodings/EncodingUtils.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <stdexcept>
+#include <string_view>
+
+namespace facebook::nimble::test {
+
+// Byte offset, from the start of an encoding, at which its row count is read.
+static constexpr int kRowCountOffset = 2;
+// Number of prefix bytes skipped before reading an encoding's payload.
+static constexpr int kPrefixSize = 6;
+// Number of bytes skipped after the prefix of a Trivial string encoding
+// (presumably the compression type marker, going by the name).
+static constexpr int kCompressionTypeSize = 1;
+
+namespace {
+
+// Reads the row count of the nested encoding that starts at `encodingStart`.
+// Works on a local cursor, so the caller's position is not advanced.
+uint32_t peekRowCount(const char* encodingStart) {
+  auto cursor = encodingStart + kRowCountOffset;
+  return encoding::readUint32(cursor);
+}
+
+// Returns the total number of bytes of all string values stored in `chunk`.
+//
+// `encodingType` and `rowCount` are the values reported by the decoded chunk.
+// Throws std::runtime_error when `encodingType` is not one of the string
+// encodings handled below.
+uint64_t getRawStringChunkSize(
+    velox::memory::MemoryPool& memoryPool,
+    std::string_view chunk,
+    EncodingType encodingType,
+    uint32_t rowCount) {
+  const char* const chunkEnd = chunk.data() + chunk.size();
+  auto pos = chunk.data() + kPrefixSize; // Skip the prefix.
+  // Accumulate in 64 bits: the result is returned as uint64_t and the
+  // products below can exceed 32 bits.
+  uint64_t result{0};
+
+  switch (encodingType) {
+    case EncodingType::Trivial: {
+      pos += kCompressionTypeSize;
+      const auto lengthsSize = encoding::readUint32(pos);
+      auto lengths = EncodingFactory::decode(memoryPool, {pos, lengthsSize});
+      Vector<uint32_t> lengthsBuffer{&memoryPool, rowCount};
+      lengths->materialize(rowCount, lengthsBuffer.data());
+      for (uint32_t i = 0; i < rowCount; ++i) {
+        result += lengthsBuffer[i];
+      }
+      break;
+    }
+
+    case EncodingType::Constant: {
+      const auto valueLen = encoding::readUint32(pos);
+      result += static_cast<uint64_t>(rowCount) * valueLen;
+      break;
+    }
+
+    case EncodingType::MainlyConstant: {
+      // Skip the first size-prefixed section to reach the "other values".
+      const auto commonSize = encoding::readUint32(pos);
+      pos += commonSize;
+      const auto otherValuesSize = encoding::readUint32(pos);
+      const char* const otherValuesOffset = pos;
+      const auto otherValuesCount = peekRowCount(otherValuesOffset);
+      pos += otherValuesSize;
+      // What follows the other values is the length of the common value.
+      const auto valueLen = encoding::readUint32(pos);
+      result += static_cast<uint64_t>(rowCount - otherValuesCount) * valueLen;
+      result += TestUtils::getRawChunkSize(
+          memoryPool, {otherValuesOffset, otherValuesSize});
+      break;
+    }
+
+    case EncodingType::Dictionary: {
+      const auto alphabetSize = encoding::readUint32(pos);
+      const auto alphabetCount = peekRowCount(pos);
+      auto alphabet = EncodingFactory::decode(memoryPool, {pos, alphabetSize});
+      Vector<std::string_view> alphabetBuffer{&memoryPool, alphabetCount};
+      alphabet->materialize(alphabetCount, alphabetBuffer.data());
+
+      pos += alphabetSize;
+      // The indices take the rest of the chunk. Pass the length explicitly: a
+      // string_view built from a bare pointer would measure binary data with
+      // strlen.
+      auto indices = EncodingFactory::decode(
+          memoryPool, {pos, static_cast<size_t>(chunkEnd - pos)});
+      Vector<uint32_t> indicesBuffer{&memoryPool, rowCount};
+      indices->materialize(rowCount, indicesBuffer.data());
+      for (uint32_t i = 0; i < rowCount; ++i) {
+        result += alphabetBuffer[indicesBuffer[i]].size();
+      }
+      break;
+    }
+
+    case EncodingType::RLE: {
+      const auto runLengthsSize = encoding::readUint32(pos);
+      const auto runCount = peekRowCount(pos);
+      auto runLengths =
+          EncodingFactory::decode(memoryPool, {pos, runLengthsSize});
+      Vector<uint32_t> runLengthsBuffer{&memoryPool, runCount};
+      runLengths->materialize(runCount, runLengthsBuffer.data());
+
+      pos += runLengthsSize;
+      // NOTE(review): this assumes the run values are preceded by a 4-byte
+      // size, like the run lengths are. Confirm against the RLE layout.
+      const auto runValuesSize = encoding::readUint32(pos);
+      auto runValues =
+          EncodingFactory::decode(memoryPool, {pos, runValuesSize});
+      Vector<std::string_view> runValuesBuffer{&memoryPool, runCount};
+      runValues->materialize(runCount, runValuesBuffer.data());
+      for (uint32_t i = 0; i < runCount; ++i) {
+        result += static_cast<uint64_t>(runLengthsBuffer[i]) *
+            runValuesBuffer[i].size();
+      }
+      break;
+    }
+
+    default:
+      throw std::runtime_error("Encoding type does not support strings.");
+  }
+  return result;
+}
+
+} // namespace
+
+// Computes the raw data size, in bytes, of a single encoded chunk.
+//
+// The chunk is decoded once to obtain its encoding type, data type and row
+// count, and is then sized as follows:
+//  - Nullable: size of the nested non-null encoding plus one byte per null.
+//  - String data: sum of the lengths of all string values.
+//  - Any other data type: dataTypeSize(dataType) * rowCount.
+//
+// Throws std::runtime_error for Sentinel encodings and for string chunks
+// whose encoding type is not handled.
+std::uint64_t TestUtils::getRawChunkSize(
+    velox::memory::MemoryPool& memoryPool,
+    std::string_view chunk) {
+  auto encoding = EncodingFactory::decode(memoryPool, chunk);
+  const EncodingType encodingType = encoding->encodingType();
+  const DataType dataType = encoding->dataType();
+  const uint32_t rowCount = encoding->rowCount();
+
+  if (encodingType == EncodingType::Sentinel) {
+    throw std::runtime_error("Sentinel encoding is not supported");
+  }
+
+  if (encodingType == EncodingType::Nullable) {
+    auto pos = chunk.data() + kPrefixSize;
+    const auto nonNullsSize = encoding::readUint32(pos);
+    const auto nonNullsCount = peekRowCount(pos);
+    // Every null row adds one byte on top of the non-null payload.
+    return getRawChunkSize(memoryPool, {pos, nonNullsSize}) +
+        (rowCount - nonNullsCount);
+  }
+
+  if (dataType == DataType::String) {
+    return getRawStringChunkSize(memoryPool, chunk, encodingType, rowCount);
+  }
+
+  // Fixed-width values: widen before multiplying so the product cannot wrap.
+  return static_cast<uint64_t>(nimble::detail::dataTypeSize(dataType)) *
+      rowCount;
+}
+} // namespace facebook::nimble::test
diff --git a/dwio/nimble/encodings/tests/TestUtils.h b/dwio/nimble/encodings/tests/TestUtils.h
index 8f09677..9b6b683 100644
--- a/dwio/nimble/encodings/tests/TestUtils.h
+++ b/dwio/nimble/encodings/tests/TestUtils.h
@@ -16,7 +16,6 @@
 #pragma once
 
 #include "dwio/nimble/encodings/ConstantEncoding.h"
-#include "dwio/nimble/encodings/DeltaEncoding.h"
 #include "dwio/nimble/encodings/DictionaryEncoding.h"
 #include "dwio/nimble/encodings/Encoding.h"
 #include "dwio/nimble/encodings/EncodingFactory.h"
@@ -236,4 +235,11 @@ class Encoder {
         encodeNullable(buffer, values, nulls, compressionType));
   }
 };
+
+class TestUtils {
+ public:
+  static std::uint64_t getRawChunkSize(
+      velox::memory::MemoryPool& memoryPool,
+      std::string_view chunk);
+};
 } // namespace facebook::nimble::test
diff --git a/dwio/nimble/velox/tests/TestUtils.h b/dwio/nimble/velox/tests/TestUtils.h
new file mode 100644
index 0000000..425b6d4
--- /dev/null
+++ b/dwio/nimble/velox/tests/TestUtils.h
@@ -0,0 +1,53 @@
+/*
+ * Copyright (c) Meta Platforms, Inc. and its affiliates.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cstdint>
+#include <numeric>
+#include <utility>
+#include <vector>
+
+#include "dwio/nimble/encodings/tests/TestUtils.h"
+#include "dwio/nimble/tablet/TabletReader.h"
+#include "dwio/nimble/velox/ChunkedStream.h"
+
+namespace facebook::nimble::test {
+
+// Calculates the raw data size of a file by reading it back.
+//
+// For every stripe of `tablet`, loads the stripe's streams, walks each
+// non-null stream chunk by chunk, and sums TestUtils::getRawChunkSize() over
+// all chunks. `pool` is used for the chunked stream and for decoding.
+//
+// Returns the sum over all stripes, streams and chunks.
+inline std::uint64_t getFileRawDataSize(
+    nimble::TabletReader& tablet,
+    velox::memory::MemoryPool& pool) {
+  std::uint64_t rawDataSize{0};
+  for (std::uint32_t stripe = 0; stripe < tablet.stripeCount(); ++stripe) {
+    auto stripeIdentifier = tablet.getStripeIdentifier(stripe);
+
+    const auto numStreams = tablet.streamCount(stripeIdentifier);
+    // NOTE(review): ids 0..numStreams inclusive are requested, one more than
+    // streamCount() reports. Kept as-is; confirm whether the extra id is
+    // intentional. Streams that come back as nullptr are skipped below.
+    std::vector<std::uint32_t> identifiers(numStreams + 1);
+    std::iota(identifiers.begin(), identifiers.end(), 0);
+    auto streams = tablet.load(stripeIdentifier, identifiers);
+
+    for (auto& stream : streams) {
+      // Null streams are indicated by nullptr and contribute nothing.
+      if (stream == nullptr) {
+        continue;
+      }
+      nimble::InMemoryChunkedStream chunkedStream{pool, std::move(stream)};
+      while (chunkedStream.hasNext()) {
+        rawDataSize +=
+            TestUtils::getRawChunkSize(pool, chunkedStream.nextChunk());
+      }
+    }
+  }
+  return rawDataSize;
+}
+
+} // namespace facebook::nimble::test