From 7dddc55d77093579fca9f915874bc31a311d57f5 Mon Sep 17 00:00:00 2001 From: Sutou Kouhei Date: Mon, 9 Sep 2024 10:52:00 +0900 Subject: [PATCH] GH-44008: [C++][Parquet] Add support for arrow::ArrayStatistics: boolean --- .../parquet/arrow/arrow_statistics_test.cc | 4 +++ cpp/src/parquet/arrow/reader_internal.cc | 30 +++++++++++++------ 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_statistics_test.cc b/cpp/src/parquet/arrow/arrow_statistics_test.cc index 5011bf89112c6..a8e2287d37085 100644 --- a/cpp/src/parquet/arrow/arrow_statistics_test.cc +++ b/cpp/src/parquet/arrow/arrow_statistics_test.cc @@ -248,6 +248,10 @@ void TestStatisticsReadArray(std::shared_ptr<::arrow::DataType> arrow_type) { } } // namespace +TEST(TestStatisticsRead, Boolean) { + TestStatisticsReadArray<::arrow::BooleanType, bool>(::arrow::boolean()); +} + TEST(TestStatisticsRead, Int8) { TestStatisticsReadArray<::arrow::Int8Type, int64_t>(::arrow::int8()); } diff --git a/cpp/src/parquet/arrow/reader_internal.cc b/cpp/src/parquet/arrow/reader_internal.cc index aa84a7a92bbe1..42a8cb202eb2a 100644 --- a/cpp/src/parquet/arrow/reader_internal.cc +++ b/cpp/src/parquet/arrow/reader_internal.cc @@ -342,10 +342,13 @@ void AttachStatistics(::arrow::ArrayData* data, static_cast<::parquet::TypedStatistics*>(statistics); const ArrowCType min = typed_statistics->min(); const ArrowCType max = typed_statistics->max(); - if (std::is_floating_point::value) { + if constexpr (std::is_same::value) { + array_statistics->min = static_cast(min); + array_statistics->max = static_cast(max); + } else if constexpr (std::is_floating_point::value) { array_statistics->min = static_cast(min); array_statistics->max = static_cast(max); - } else if (std::is_signed::value) { + } else if constexpr (std::is_signed::value) { array_statistics->min = static_cast(min); array_statistics->max = static_cast(max); } else { @@ -414,11 +417,13 @@ std::shared_ptr TransferZeroCopy( return ::arrow::MakeArray(std::move(data)); } -Status TransferBool(RecordReader* reader, bool nullable, MemoryPool* pool, Datum* out) { +Status TransferBool(RecordReader* reader, + std::unique_ptr<::parquet::ColumnChunkMetaData> metadata, + const ReaderContext* ctx, bool nullable, Datum* out) { int64_t length = reader->values_written(); const int64_t buffer_size = bit_util::BytesForBits(length); - ARROW_ASSIGN_OR_RAISE(auto data, ::arrow::AllocateBuffer(buffer_size, pool)); + ARROW_ASSIGN_OR_RAISE(auto data, ::arrow::AllocateBuffer(buffer_size, ctx->pool)); // Transfer boolean values to packed bitmap auto values = reinterpret_cast(reader->values()); @@ -431,13 +436,19 @@ Status TransferBool(RecordReader* reader, bool nullable, MemoryPool* pool, Datum } } + std::shared_ptr<::arrow::ArrayData> array_data; if (nullable) { - *out = std::make_shared(length, std::move(data), - reader->ReleaseIsValid(), reader->null_count()); + array_data = ::arrow::ArrayData::Make(::arrow::boolean(), length, + {reader->ReleaseIsValid(), std::move(data)}, + reader->null_count()); } else { - *out = std::make_shared(length, std::move(data), - /*null_bitmap=*/nullptr, /*null_count=*/0); + array_data = ::arrow::ArrayData::Make(::arrow::boolean(), length, + {/*null_bitmap=*/nullptr, std::move(data)}, + /*null_count=*/0); } + AttachStatistics<::arrow::BooleanType, BooleanType>(array_data.get(), + std::move(metadata), ctx); + *out = std::make_shared(std::move(array_data)); return Status::OK(); } @@ -833,7 +844,8 @@ Status TransferColumnData(RecordReader* reader, reader, std::move(metadata), ctx, value_field); break; case ::arrow::Type::BOOL: - RETURN_NOT_OK(TransferBool(reader, value_field->nullable(), pool, &result)); + RETURN_NOT_OK(TransferBool(reader, std::move(metadata), ctx, + value_field->nullable(), &result)); break; TRANSFER_INT32(UINT8, ::arrow::UInt8Type); TRANSFER_INT32(INT8, ::arrow::Int8Type);