diff --git a/dwio/nimble/encodings/tests/TestUtils.cpp b/dwio/nimble/encodings/tests/TestUtils.cpp new file mode 100644 index 0000000..6a2e232 --- /dev/null +++ b/dwio/nimble/encodings/tests/TestUtils.cpp @@ -0,0 +1,134 @@ +/* + * Copyright (c) Meta Platforms, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "dwio/nimble/encodings/tests/TestUtils.h" +#include "dwio/nimble/encodings/EncodingUtils.h" + +namespace facebook::nimble::test { + +static constexpr int kRowCountOffset = 2; +static constexpr int kPrefixSize = 6; +static constexpr int kCompressionTypeSize = 1; + +std::uint64_t TestUtils::getRawChunkSize( + velox::memory::MemoryPool& memoryPool, + std::string_view chunk) { + auto encoding = EncodingFactory::decode(memoryPool, chunk); + EncodingType encodingType = encoding->encodingType(); + DataType dataType = encoding->dataType(); + uint32_t rowCount = encoding->rowCount(); + + if (encodingType == EncodingType::Sentinel) { + throw std::runtime_error("Sentinel encoding is not supported"); + + } else if (encodingType == EncodingType::Nullable) { + auto pos = chunk.data() + kPrefixSize; + auto nonNullsSize = encoding::readUint32(pos); + auto posNonNullCountOffset = pos + kRowCountOffset; + auto nonNullsCount = encoding::readUint32(posNonNullCountOffset); + return getRawChunkSize(memoryPool, {pos, nonNullsSize}) + + (rowCount - nonNullsCount); + } else { + if (dataType == DataType::String) { + auto pos = chunk.data() + kPrefixSize; // Skip the prefix. + auto result{0}; + + switch (encodingType) { + case EncodingType::Trivial: { + pos += kCompressionTypeSize; + auto lengthsSize = encoding::readUint32(pos); + auto lengths = + EncodingFactory::decode(memoryPool, {pos, lengthsSize}); + Vector buffer{&memoryPool, rowCount}; + lengths->materialize(rowCount, buffer.data()); + for (int i = 0; i < rowCount; ++i) { + result += buffer[i]; + } + } break; + + case EncodingType::Constant: { + auto valueLen = encoding::readUint32(pos); + result += rowCount * valueLen; + } break; + + case EncodingType::MainlyConstant: { + auto commonSize = encoding::readUint32(pos); + pos += commonSize; + auto otherValuesSize = encoding::readUint32(pos); + auto otherValuesOffset = pos; + auto otherValuesCount = *reinterpret_cast( + otherValuesOffset + kRowCountOffset); + pos += otherValuesSize; + auto valueLen = encoding::readUint32(pos); + result += (rowCount - otherValuesCount) * valueLen; + result += + getRawChunkSize(memoryPool, {otherValuesOffset, otherValuesSize}); + } break; + + case EncodingType::Dictionary: { + auto alphabetSize = encoding::readUint32(pos); + auto alphabetCount = *reinterpret_cast(pos + 2); + auto alphabet = + EncodingFactory::decode(memoryPool, {pos, alphabetSize}); + Vector buffer{&memoryPool, alphabetCount}; + alphabet->materialize(alphabetCount, buffer.data()); + uint32_t alphabetLens[alphabetCount]; + for (int i = 0; i < alphabetCount; ++i) { + alphabetLens[i] = buffer[i].size(); + } + + pos += alphabetSize; + auto indices = EncodingFactory::decode(memoryPool, {pos}); + Vector indicesBuffer{&memoryPool, rowCount}; + indices->materialize(rowCount, indicesBuffer.data()); + for (int i = 0; i < rowCount; ++i) { + result += alphabetLens[indicesBuffer[i]]; + } + } break; + + case EncodingType::RLE: { + auto runLengthsSize = encoding::readUint32(pos); + auto rlCount = *reinterpret_cast(pos + 2); + auto runLengths = + EncodingFactory::decode(memoryPool, {pos, runLengthsSize}); + Vector buffer{&memoryPool, rlCount}; + runLengths->materialize(rlCount, buffer.data()); + + pos += runLengthsSize; + auto runValuesSize = encoding::readUint32(pos); + auto runValues = + EncodingFactory::decode(memoryPool, {pos, runValuesSize}); + Vector rvBuffer{&memoryPool, rlCount}; + runValues->materialize(rlCount, rvBuffer.data()); + for (int i = 0; i < rlCount; ++i) { + result += buffer[i] * rvBuffer[i].size(); + } + } break; + + default: + throw std::runtime_error("Encoding type does not support strings."); + } + return result; + + } else { + auto typeSize = nimble::detail::dataTypeSize(dataType); + auto result = typeSize * rowCount; + + return result; + } + } +} +} // namespace facebook::nimble::test diff --git a/dwio/nimble/encodings/tests/TestUtils.h b/dwio/nimble/encodings/tests/TestUtils.h index 8f09677..9b6b683 100644 --- a/dwio/nimble/encodings/tests/TestUtils.h +++ b/dwio/nimble/encodings/tests/TestUtils.h @@ -16,7 +16,6 @@ #pragma once #include "dwio/nimble/encodings/ConstantEncoding.h" -#include "dwio/nimble/encodings/DeltaEncoding.h" #include "dwio/nimble/encodings/DictionaryEncoding.h" #include "dwio/nimble/encodings/Encoding.h" #include "dwio/nimble/encodings/EncodingFactory.h" @@ -236,4 +235,11 @@ class Encoder { encodeNullable(buffer, values, nulls, compressionType)); } }; + +class TestUtils { + public: + static std::uint64_t getRawChunkSize( + velox::memory::MemoryPool& memoryPool, + std::string_view chunk); +}; } // namespace facebook::nimble::test diff --git a/dwio/nimble/velox/tests/TestUtils.h b/dwio/nimble/velox/tests/TestUtils.h new file mode 100644 index 0000000..425b6d4 --- /dev/null +++ b/dwio/nimble/velox/tests/TestUtils.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) Meta Platforms, Inc. and its affiliates. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "dwio/nimble/encodings/tests/TestUtils.h" +#include "dwio/nimble/tablet/TabletReader.h" +#include "dwio/nimble/velox/ChunkedStream.h" + +namespace facebook::nimble::test { +// Calculate the rawDataSize of a file. +inline std::uint64_t getFileRawDataSize( + nimble::TabletReader& tablet, + velox::memory::MemoryPool& pool) { + // Calculate expected size by summing stream sizes. + uint64_t expected{0}; + for (auto i = 0; i < tablet.stripeCount(); ++i) { + auto stripeIdentifier = tablet.getStripeIdentifier(i); + + auto numStreams = tablet.streamCount(stripeIdentifier); + std::vector identifiers(numStreams + 1); + std::iota(identifiers.begin(), identifiers.end(), 0); + auto streams = tablet.load(stripeIdentifier, identifiers); + + // Skip nullStreams indicated by nullptr. + for (auto j = 0; j < streams.size(); ++j) { + if (streams[j] == nullptr) { + continue; + } + nimble::InMemoryChunkedStream chunkedStream{pool, std::move(streams[j])}; + while (chunkedStream.hasNext()) { + auto chunk = chunkedStream.nextChunk(); + auto size = TestUtils::getRawChunkSize(pool, chunk); + expected += size; + } + } + } + return expected; +} + +} // namespace facebook::nimble::test