Skip to content

Commit

Permalink
Move statistics to Array from ArrayData
Browse files Browse the repository at this point in the history
  • Loading branch information
kou committed Jul 11, 2024
1 parent 31fdf8b commit c2ba4ed
Show file tree
Hide file tree
Showing 5 changed files with 51 additions and 47 deletions.
18 changes: 13 additions & 5 deletions cpp/src/arrow/array/array_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <vector>

#include "arrow/array/data.h"
#include "arrow/array/statistics.h"
#include "arrow/buffer.h"
#include "arrow/compare.h"
#include "arrow/result.h"
Expand Down Expand Up @@ -232,13 +233,17 @@ class ARROW_EXPORT Array {
/// \return DeviceAllocationType
DeviceAllocationType device_type() const { return data_->device_type(); }

/// \brief Return the statistics of this Array
/// \brief Set the statistics of this Array
///
/// This just delegates to calling statistics on the underlying ArrayData
/// object which backs this Array.
/// \param[in] statistics the statistics of this Array
void SetStatistics(std::shared_ptr<ArrayStatistics> statistics) {
  // `statistics` is a by-value sink parameter: swapping hands the new pointer
  // to the member, and whatever was attached before is released when the
  // parameter goes out of scope at the end of this call.
  statistics_.swap(statistics);
}

/// \brief Return the statistics of this Array
///
/// \return const ArrayStatistics&
const ArrayStatistics& statistics() const { return data_->statistics; }
/// \return std::shared_ptr<ArrayStatistics>
std::shared_ptr<ArrayStatistics> GetStatistics() const { return statistics_; }

protected:
Array() = default;
Expand All @@ -257,6 +262,9 @@ class ARROW_EXPORT Array {
data_ = data;
}

// The statistics for this Array.
std::shared_ptr<ArrayStatistics> statistics_;

private:
ARROW_DISALLOW_COPY_AND_ASSIGN(Array);

Expand Down
8 changes: 4 additions & 4 deletions cpp/src/arrow/array/array_primitive.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,8 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray {
IteratorType end() const { return IteratorType(*this, length()); }

/// \brief Return the statistics for boolean.
const BooleanArrayStatistics& statistics() const {
return static_cast<const BooleanArrayStatistics&>(Array::statistics());
std::shared_ptr<BooleanArrayStatistics> GetStatistics() const {
  // Fetch the untyped statistics from the base class, then view them through
  // the boolean-specific statistics type.
  // NOTE(review): static_pointer_cast performs no runtime type check, so this
  // relies on the attached object being usable as BooleanArrayStatistics --
  // confirm against the code that calls SetStatistics().
  const auto base_statistics = Array::GetStatistics();
  return std::static_pointer_cast<BooleanArrayStatistics>(base_statistics);
}

protected:
Expand Down Expand Up @@ -125,8 +125,8 @@ class NumericArray : public PrimitiveArray {
IteratorType end() const { return IteratorType(*this, length()); }

/// \brief Return the typed statistics.
const TypedArrayStatistics<TYPE>& statistics() const {
return static_cast<const TypedArrayStatistics<TYPE>&>(Array::statistics());
std::shared_ptr<TypedArrayStatistics<TYPE>> GetStatistics() const {
  // Fetch the untyped statistics from the base class, then view them through
  // the statistics type matching this array's TYPE.
  // NOTE(review): static_pointer_cast performs no runtime type check, so this
  // relies on the attached object being usable as TypedArrayStatistics<TYPE>
  // -- confirm against the code that calls SetStatistics().
  const auto base_statistics = Array::GetStatistics();
  return std::static_pointer_cast<TypedArrayStatistics<TYPE>>(base_statistics);
}

protected:
Expand Down
12 changes: 2 additions & 10 deletions cpp/src/arrow/array/data.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@
#include <utility>
#include <vector>

#include "arrow/array/statistics.h"
#include "arrow/buffer.h"
#include "arrow/result.h"
#include "arrow/type.h"
Expand Down Expand Up @@ -153,8 +152,7 @@ struct ARROW_EXPORT ArrayData {
offset(other.offset),
buffers(std::move(other.buffers)),
child_data(std::move(other.child_data)),
dictionary(std::move(other.dictionary)),
statistics(std::move(other.statistics)) {
dictionary(std::move(other.dictionary)) {
SetNullCount(other.null_count);
}

Expand All @@ -165,8 +163,7 @@ struct ARROW_EXPORT ArrayData {
offset(other.offset),
buffers(other.buffers),
child_data(other.child_data),
dictionary(other.dictionary),
statistics(other.statistics) {
dictionary(other.dictionary) {
SetNullCount(other.null_count);
}

Expand All @@ -179,7 +176,6 @@ struct ARROW_EXPORT ArrayData {
buffers = std::move(other.buffers);
child_data = std::move(other.child_data);
dictionary = std::move(other.dictionary);
statistics = std::move(other.statistics);
return *this;
}

Expand All @@ -192,7 +188,6 @@ struct ARROW_EXPORT ArrayData {
buffers = other.buffers;
child_data = other.child_data;
dictionary = other.dictionary;
statistics = other.statistics;
return *this;
}

Expand Down Expand Up @@ -395,9 +390,6 @@ struct ARROW_EXPORT ArrayData {

// The dictionary for this Array, if any. Only used for dictionary type
std::shared_ptr<ArrayData> dictionary;

// The statistics for this Array.
ArrayStatistics statistics{};
};

/// \brief A non-owning Buffer reference
Expand Down
32 changes: 16 additions & 16 deletions cpp/src/parquet/arrow/arrow_statistics_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -186,28 +186,28 @@ TEST(TestStatisticsRead, Boolean) {
ASSERT_OK_AND_ASSIGN(auto array,
StatisticsReadArray(::arrow::boolean(), R"([true, null, true])"));
auto typed_array = std::static_pointer_cast<::arrow::BooleanArray>(array);
auto statistics = typed_array->statistics();
ASSERT_EQ(true, statistics.null_count.has_value());
ASSERT_EQ(1, statistics.null_count.value());
ASSERT_EQ(false, statistics.distinct_count.has_value());
ASSERT_EQ(true, statistics.min().has_value());
ASSERT_EQ(true, statistics.min().value());
ASSERT_EQ(true, statistics.max().has_value());
ASSERT_EQ(true, statistics.max().value());
auto statistics = typed_array->GetStatistics();
ASSERT_EQ(true, statistics->null_count.has_value());
ASSERT_EQ(1, statistics->null_count.value());
ASSERT_EQ(false, statistics->distinct_count.has_value());
ASSERT_EQ(true, statistics->min().has_value());
ASSERT_EQ(true, statistics->min().value());
ASSERT_EQ(true, statistics->max().has_value());
ASSERT_EQ(true, statistics->max().value());
}

TEST(TestStatisticsRead, Int8) {
ASSERT_OK_AND_ASSIGN(auto array,
StatisticsReadArray(::arrow::int8(), R"([1, null, -1, 1])"));
auto typed_array = std::static_pointer_cast<::arrow::Int8Array>(array);
auto statistics = typed_array->statistics();
ASSERT_EQ(true, statistics.null_count.has_value());
ASSERT_EQ(1, statistics.null_count.value());
ASSERT_EQ(false, statistics.distinct_count.has_value());
ASSERT_EQ(true, statistics.min().has_value());
ASSERT_EQ(-1, statistics.min().value());
ASSERT_EQ(true, statistics.max().has_value());
ASSERT_EQ(1, statistics.max().value());
auto statistics = typed_array->GetStatistics();
ASSERT_EQ(true, statistics->null_count.has_value());
ASSERT_EQ(1, statistics->null_count.value());
ASSERT_EQ(false, statistics->distinct_count.has_value());
ASSERT_EQ(true, statistics->min().has_value());
ASSERT_EQ(-1, statistics->min().value());
ASSERT_EQ(true, statistics->max().has_value());
ASSERT_EQ(1, statistics->max().value());
}

} // namespace parquet::arrow
28 changes: 16 additions & 12 deletions cpp/src/parquet/arrow/reader_internal.cc
Original file line number Diff line number Diff line change
Expand Up @@ -339,22 +339,23 @@ Status TransferInt(RecordReader* reader,
}
auto array_data =
::arrow::ArrayData::Make(field->type(), length, std::move(buffers), null_count);
array_data->statistics.null_count = null_count;
auto array_statistics = std::make_shared<::arrow::ArrayStatistics>();
array_statistics->null_count = null_count;
auto statistics = metadata->statistics().get();
if (statistics) {
if (statistics->HasDistinctCount()) {
array_data->statistics.distinct_count = statistics->distinct_count();
array_statistics->distinct_count = statistics->distinct_count();
}
if (statistics->HasMinMax()) {
auto typed_statistics =
static_cast<::parquet::TypedStatistics<ParquetType>*>(statistics);
array_data->statistics.min_buffer =
static_cast<ArrowCType>(typed_statistics->min());
array_data->statistics.max_buffer =
static_cast<ArrowCType>(typed_statistics->max());
array_statistics->min_buffer = static_cast<ArrowCType>(typed_statistics->min());
array_statistics->max_buffer = static_cast<ArrowCType>(typed_statistics->max());
}
}
*out = std::make_shared<ArrayType<ArrowType>>(std::move(array_data));
auto array = std::make_shared<ArrayType<ArrowType>>(std::move(array_data));
array->SetStatistics(std::move(array_statistics));
*out = std::move(array);
return Status::OK();
}

Expand Down Expand Up @@ -401,19 +402,22 @@ Status TransferBool(RecordReader* reader,
}
auto array_data = ::arrow::ArrayData::Make(::arrow::boolean(), length,
std::move(buffers), null_count);
array_data->statistics.null_count = null_count;
auto array_statistics = std::make_shared<::arrow::ArrayStatistics>();
array_statistics->null_count = null_count;
auto statistics = metadata->statistics().get();
if (statistics) {
if (statistics->HasDistinctCount()) {
array_data->statistics.distinct_count = statistics->distinct_count();
array_statistics->distinct_count = statistics->distinct_count();
}
if (statistics->HasMinMax()) {
auto bool_statistics = static_cast<::parquet::BoolStatistics*>(statistics);
array_data->statistics.min_buffer = bool_statistics->min();
array_data->statistics.max_buffer = bool_statistics->max();
array_statistics->min_buffer = bool_statistics->min();
array_statistics->max_buffer = bool_statistics->max();
}
}
*out = std::make_shared<BooleanArray>(std::move(array_data));
auto array = std::make_shared<BooleanArray>(std::move(array_data));
array->SetStatistics(std::move(array_statistics));
*out = std::move(array);
return Status::OK();
}

Expand Down

0 comments on commit c2ba4ed

Please sign in to comment.