Skip to content

Commit

Permalink
apacheGH-41909: PoC: [C++] Add arrow::ArrayStatistics
Browse files Browse the repository at this point in the history
  • Loading branch information
kou committed Jun 13, 2024
1 parent 8ae1edb commit f362ccc
Show file tree
Hide file tree
Showing 9 changed files with 257 additions and 36 deletions.
8 changes: 8 additions & 0 deletions cpp/src/arrow/array/array_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -232,6 +232,14 @@ class ARROW_EXPORT Array {
/// \return DeviceAllocationType
DeviceAllocationType device_type() const { return data_->device_type(); }

/// \brief Return the statistics of this Array
///
/// This just delegates to calling statistics on the underlying ArrayData
/// object which backs this Array.
///
/// \return const ArrayStatistics&
const ArrayStatistics& statistics() const { return data_->statistics; }

protected:
Array() = default;
ARROW_DEFAULT_MOVE_AND_ASSIGN(Array);
Expand Down
10 changes: 10 additions & 0 deletions cpp/src/arrow/array/array_primitive.h
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,11 @@ class ARROW_EXPORT BooleanArray : public PrimitiveArray {

IteratorType end() const { return IteratorType(*this, length()); }

/// \brief Return the statistics for boolean.
const BooleanArrayStatistics& statistics() const {
return static_cast<const BooleanArrayStatistics&>(Array::statistics());
}

protected:
using PrimitiveArray::PrimitiveArray;
};
Expand Down Expand Up @@ -119,6 +124,11 @@ class NumericArray : public PrimitiveArray {

IteratorType end() const { return IteratorType(*this, length()); }

/// \brief Return the typed statistics.
const TypedArrayStatistics<TYPE>& statistics() const {
return static_cast<const TypedArrayStatistics<TYPE>&>(Array::statistics());
}

protected:
using PrimitiveArray::PrimitiveArray;
};
Expand Down
12 changes: 10 additions & 2 deletions cpp/src/arrow/array/data.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <utility>
#include <vector>

#include "arrow/array/statistics.h"
#include "arrow/buffer.h"
#include "arrow/result.h"
#include "arrow/type.h"
Expand Down Expand Up @@ -152,7 +153,8 @@ struct ARROW_EXPORT ArrayData {
offset(other.offset),
buffers(std::move(other.buffers)),
child_data(std::move(other.child_data)),
dictionary(std::move(other.dictionary)) {
dictionary(std::move(other.dictionary)),
statistics(std::move(other.statistics)) {
SetNullCount(other.null_count);
}

Expand All @@ -163,7 +165,8 @@ struct ARROW_EXPORT ArrayData {
offset(other.offset),
buffers(other.buffers),
child_data(other.child_data),
dictionary(other.dictionary) {
dictionary(other.dictionary),
statistics(other.statistics) {
SetNullCount(other.null_count);
}

Expand All @@ -176,6 +179,7 @@ struct ARROW_EXPORT ArrayData {
buffers = std::move(other.buffers);
child_data = std::move(other.child_data);
dictionary = std::move(other.dictionary);
statistics = std::move(other.statistics);
return *this;
}

Expand All @@ -188,6 +192,7 @@ struct ARROW_EXPORT ArrayData {
buffers = other.buffers;
child_data = other.child_data;
dictionary = other.dictionary;
statistics = other.statistics;
return *this;
}

Expand Down Expand Up @@ -390,6 +395,9 @@ struct ARROW_EXPORT ArrayData {

// The dictionary for this Array, if any. Only used for dictionary type
std::shared_ptr<ArrayData> dictionary;

// The statistics for this Array.
ArrayStatistics statistics{};
};

/// \brief A non-owning Buffer reference
Expand Down
84 changes: 84 additions & 0 deletions cpp/src/arrow/array/statistics.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <optional>
#include <variant>

#include "arrow/util/visibility.h"

namespace arrow {

/// \brief Statistics for an Array
///
/// Apache Arrow format doesn't have statistics but data source such
/// as Apache Parquet may have statistics. Statistics associate with
/// data source can be read unified API via this class.
struct ARROW_EXPORT ArrayStatistics {
public:
using ElementBufferType = std::variant<bool, int8_t, uint8_t, int16_t, uint16_t,
int32_t, uint32_t, int64_t, uint64_t>;

ArrayStatistics() = default;
~ArrayStatistics() = default;

/// \brief The number of null values, may not be set
std::optional<int64_t> null_count = std::nullopt;

/// \brief The number of distinct values, may not be set
std::optional<int64_t> distinct_count = std::nullopt;

/// \brief The current minimum value buffer, may not be set
std::optional<ElementBufferType> min_buffer = std::nullopt;

/// \brief The current maximum value buffer, may not be set
std::optional<ElementBufferType> max_buffer = std::nullopt;

/// \brief Check two Statistics for equality
bool Equals(const ArrayStatistics& other) const {
return null_count == other.null_count && distinct_count == other.distinct_count &&
min_buffer == other.min_buffer && max_buffer == other.max_buffer;
}
};

/// \brief A typed implementation of ArrayStatistics
template <typename TypeClass>
class TypedArrayStatistics : public ArrayStatistics {
public:
using ElementType = typename TypeClass::c_type;

/// \brief The current minimum value, may not be set
std::optional<ElementType> min() const {
if (min_buffer && std::holds_alternative<ElementType>(*min_buffer)) {
return std::get<ElementType>(*min_buffer);
} else {
return std::nullopt;
}
}

/// \brief The current maximum value, may not be set
std::optional<ElementType> max() const {
if (max_buffer && std::holds_alternative<ElementType>(*max_buffer)) {
return std::get<ElementType>(*max_buffer);
} else {
return std::nullopt;
}
}
};

} // namespace arrow
15 changes: 10 additions & 5 deletions cpp/src/arrow/type_fwd.h
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,9 @@ using ChunkedArrayVector = std::vector<std::shared_ptr<ChunkedArray>>;
using RecordBatchVector = std::vector<std::shared_ptr<RecordBatch>>;
using RecordBatchIterator = Iterator<std::shared_ptr<RecordBatch>>;

template <typename TypeClass>
class TypedArrayStatistics;

class DictionaryType;
class DictionaryArray;
struct DictionaryScalar;
Expand All @@ -102,6 +105,7 @@ class FixedWidthType;

class BooleanType;
class BooleanArray;
using BooleanArrayStatistics = TypedArrayStatistics<BooleanType>;
class BooleanBuilder;
struct BooleanScalar;

Expand Down Expand Up @@ -215,11 +219,12 @@ class NumericBuilder;
template <typename TypeClass>
class NumericTensor;

#define _NUMERIC_TYPE_DECL(KLASS) \
class KLASS##Type; \
using KLASS##Array = NumericArray<KLASS##Type>; \
using KLASS##Builder = NumericBuilder<KLASS##Type>; \
struct KLASS##Scalar; \
#define _NUMERIC_TYPE_DECL(KLASS) \
class KLASS##Type; \
using KLASS##Array = NumericArray<KLASS##Type>; \
using KLASS##ArrayStatistics = TypedArrayStatistics<KLASS##Type>; \
using KLASS##Builder = NumericBuilder<KLASS##Type>; \
struct KLASS##Scalar; \
using KLASS##Tensor = NumericTensor<KLASS##Type>;

_NUMERIC_TYPE_DECL(Int8)
Expand Down
54 changes: 54 additions & 0 deletions cpp/src/parquet/arrow/arrow_statistics_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,14 @@

#include "gtest/gtest.h"

#include "arrow/array.h"
#include "arrow/table.h"
#include "arrow/testing/gtest_util.h"

#include "parquet/api/reader.h"
#include "parquet/api/writer.h"

#include "parquet/arrow/reader.h"
#include "parquet/arrow/schema.h"
#include "parquet/arrow/writer.h"
#include "parquet/file_writer.h"
Expand Down Expand Up @@ -156,4 +158,56 @@ INSTANTIATE_TEST_SUITE_P(
/*expected_min=*/"z",
/*expected_max=*/"z"}));

namespace {
::arrow::Result<std::shared_ptr<::arrow::Array>> StatisticsReadArray(
std::shared_ptr<::arrow::DataType> data_type, const std::string& json) {
auto schema = ::arrow::schema({::arrow::field("column", data_type)});
auto array = ::arrow::ArrayFromJSON(data_type, json);
auto record_batch = ::arrow::RecordBatch::Make(schema, array->length(), {array});
ARROW_ASSIGN_OR_RAISE(auto sink, ::arrow::io::BufferOutputStream::Create());
ARROW_ASSIGN_OR_RAISE(auto writer,
FileWriter::Open(*schema, ::arrow::default_memory_pool(), sink));
ARROW_RETURN_NOT_OK(writer->WriteRecordBatch(*record_batch));
ARROW_RETURN_NOT_OK(writer->Close());
ARROW_ASSIGN_OR_RAISE(auto buffer, sink->Finish());

auto reader =
ParquetFileReader::Open(std::make_shared<::arrow::io::BufferReader>(buffer));
std::unique_ptr<FileReader> file_reader;
ARROW_RETURN_NOT_OK(
FileReader::Make(::arrow::default_memory_pool(), std::move(reader), &file_reader));
std::shared_ptr<::arrow::ChunkedArray> chunked_array;
ARROW_RETURN_NOT_OK(file_reader->ReadColumn(0, &chunked_array));
return chunked_array->chunk(0);
}
} // namespace

TEST(TestStatisticsRead, Boolean) {
ASSERT_OK_AND_ASSIGN(auto array,
StatisticsReadArray(::arrow::boolean(), R"([true, null, true])"));
auto typed_array = std::static_pointer_cast<::arrow::BooleanArray>(array);
auto statistics = typed_array->statistics();
ASSERT_EQ(true, statistics.null_count.has_value());
ASSERT_EQ(1, statistics.null_count.value());
ASSERT_EQ(false, statistics.distinct_count.has_value());
ASSERT_EQ(true, statistics.min().has_value());
ASSERT_EQ(true, statistics.min().value());
ASSERT_EQ(true, statistics.max().has_value());
ASSERT_EQ(true, statistics.max().value());
}

TEST(TestStatisticsRead, Int8) {
ASSERT_OK_AND_ASSIGN(auto array,
StatisticsReadArray(::arrow::int8(), R"([1, null, -1, 1])"));
auto typed_array = std::static_pointer_cast<::arrow::Int8Array>(array);
auto statistics = typed_array->statistics();
ASSERT_EQ(true, statistics.null_count.has_value());
ASSERT_EQ(1, statistics.null_count.value());
ASSERT_EQ(false, statistics.distinct_count.has_value());
ASSERT_EQ(true, statistics.min().has_value());
ASSERT_EQ(-1, statistics.min().value());
ASSERT_EQ(true, statistics.max().has_value());
ASSERT_EQ(1, statistics.max().value());
}

} // namespace parquet::arrow
5 changes: 3 additions & 2 deletions cpp/src/parquet/arrow/reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -485,8 +485,9 @@ class LeafReader : public ColumnReaderImpl {
NextRowGroup();
}
}
RETURN_NOT_OK(
TransferColumnData(record_reader_.get(), field_, descr_, ctx_->pool, &out_));
RETURN_NOT_OK(TransferColumnData(record_reader_.get(),
input_->column_chunk_metadata(), field_, descr_,
ctx_->pool, &out_));
return Status::OK();
END_PARQUET_CATCH_EXCEPTIONS
}
Expand Down
Loading

0 comments on commit f362ccc

Please sign in to comment.