From a19ba3b8da2d80be93fd079019a9e16f7edf802a Mon Sep 17 00:00:00 2001 From: Stephanie Han Date: Fri, 20 Sep 2024 11:49:17 -0700 Subject: [PATCH] Add Stats to Tablet Constants and Renaming (#74) Summary: Pull Request resolved: https://github.com/facebookincubator/nimble/pull/74 Raw size will be implemented in a followup task, to better reflect logical data size that is file format agnostic. # Changes - Adding the optional Stats section to Tablet - ~~`VeloxWriter.cpp`: Calculating the raw data size in bytes via running sum of memory used right before encoding and flushing~~ - ~~`FieldWriter.h`: Added `rawSize()` virtual function to handle string and nullable cases.~~ # Important Notes - In `VeloxWriter.cpp` we have special handling for chunked null streams, which is why `materialize` is called in an if statement. Differential Revision: D60534808 --- dwio/nimble/encodings/RleEncoding.h | 2 +- dwio/nimble/tablet/Constants.h | 1 + dwio/nimble/velox/VeloxWriter.cpp | 23 +++++++++++++++-------- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/dwio/nimble/encodings/RleEncoding.h b/dwio/nimble/encodings/RleEncoding.h index 614c297..3217d01 100644 --- a/dwio/nimble/encodings/RleEncoding.h +++ b/dwio/nimble/encodings/RleEncoding.h @@ -174,7 +174,7 @@ class RLEEncodingBase } // namespace internal -// Handles the numeric cases. Bools and strings are templated below. +// Handles the numeric and string cases. Bools are templated below. // Data layout is: // RLEEncodingBase bytes // 4 * sizeof(physicalType) bytes: run values diff --git a/dwio/nimble/tablet/Constants.h b/dwio/nimble/tablet/Constants.h index a269048..d9dc390 100644 --- a/dwio/nimble/tablet/Constants.h +++ b/dwio/nimble/tablet/Constants.h @@ -31,5 +31,6 @@ constexpr uint32_t kPostscriptChecksumedSize = 5; constexpr std::string_view kSchemaSection = "columnar.schema"; constexpr std::string_view kMetadataSection = "columnar.metadata"; +constexpr std::string_view kStatsSection = "columnar.stats"; } // namespace facebook::nimble diff --git a/dwio/nimble/velox/VeloxWriter.cpp b/dwio/nimble/velox/VeloxWriter.cpp index 9980969..a9947d8 100644 --- a/dwio/nimble/velox/VeloxWriter.cpp +++ b/dwio/nimble/velox/VeloxWriter.cpp @@ -15,14 +15,11 @@ */ #include "dwio/nimble/velox/VeloxWriter.h" -#include #include #include "dwio/nimble/common/Exceptions.h" #include "dwio/nimble/common/Types.h" -#include "dwio/nimble/encodings/Encoding.h" #include "dwio/nimble/encodings/EncodingSelectionPolicy.h" -#include "dwio/nimble/encodings/SentinelEncoding.h" #include "dwio/nimble/tablet/Constants.h" #include "dwio/nimble/velox/BufferGrowthPolicy.h" #include "dwio/nimble/velox/ChunkedStreamWriter.h" @@ -557,6 +554,8 @@ void VeloxWriter::close() { builder.GetSize()}); } + // TODO: logical raw size. + { SchemaSerializer serializer; writer_.writeOptionalSection( @@ -621,14 +620,12 @@ void VeloxWriter::writeChunk(bool lastChunk) { streams_.resize(context_->schemaBuilder.nodeCount()); // When writing null streams, we write the nulls as data, and the stream - // itself is non-nullable. This adpater class is how we expose the nulls as - // values. + // itself is non-nullable. This adapter class is how we expose the nulls + // as values. class NullsAsDataStreamData : public StreamData { public: explicit NullsAsDataStreamData(StreamData& streamData) - : StreamData(streamData.descriptor()), streamData_{streamData} { - streamData_.materialize(); - } + : StreamData(streamData.descriptor()), streamData_{streamData} {} inline virtual std::string_view data() const override { return { @@ -692,6 +689,16 @@ void VeloxWriter::writeChunk(bool lastChunk) { streamData.nonNulls().size() > minStreamSize) || (lastChunk && !streamData.empty() && !streams_[offset].content.empty())) { + // If we ended up in this code block, it means that we either saw + // enough nulls to flush a chunk, or that we are on the last chunk of + // a stream containing nulls. + // If previous chunks in the same stream contained nulls, then + // regardless of the presence of nulls in the last chunk we must flush + // it. If the last chunk doesn't contain nulls, the DataStream (which + // is a NullsStreamData) will contain an empty nulls buffer, so we + // must materialize it before we write. + streamData.materialize(); + encoder(streamData, true); } } else {