Skip to content

Commit

Permalink
apacheGH-43666: [C++] Attach arrow::ArrayStatistics to arrow::Array
Browse files Browse the repository at this point in the history
New public APIs:

* `arrow::Array::statistics()`: It returns the associated statistics.
  It can't be changed after an array is created. A sliced array
  doesn't have parent array's statistics because parent array's
  statistics isn't valid for sliced array.
* Add new optional `arrow::ArrayStatistics` argument to all
  `arrow::*Array(ArrayData)` constructors: `arrow::*Array(ArrayData,
  ArrayStatistics = NULLPTR)`

New internal APIs:

* `arrow::Array::Init()`: All array constructors must call this to
  attach `arrow::ArrayData` and `arrow::ArrayStatistics`. Note that
  calling this via parent's constructor isn't
  allowed. Array constructors don't need to call
  `arrow::Array::SetData()` directly. It's called in
  `arrow::Array::Init()`.
* `arrow::Array::SetStatitics()`: It attaches `arrow::ArrayStatistics`
  to `arrow::Array`. In general, this is not called directly. This is
  called from `arrow::Array::Init()` internally.

Changed internal APIs:

* `arrow::Array::SetData()`: It becomes a virtual method. So
  `arrow::Array::Init()` must be called by each array's constructor.
  • Loading branch information
kou committed Aug 15, 2024
1 parent 483bc7b commit abef9fc
Show file tree
Hide file tree
Showing 13 changed files with 304 additions and 296 deletions.
54 changes: 37 additions & 17 deletions cpp/src/arrow/array/array_base.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
#include <vector>

#include "arrow/array/data.h"
#include "arrow/array/statistics.h"
#include "arrow/buffer.h"
#include "arrow/compare.h"
#include "arrow/result.h"
Expand Down Expand Up @@ -232,15 +233,32 @@ class ARROW_EXPORT Array {
/// \return DeviceAllocationType
DeviceAllocationType device_type() const { return data_->device_type(); }

/// \brief Return the statistics of this Array
///
/// \return const std::shared_ptr<ArrayStatistics>&
const std::shared_ptr<ArrayStatistics>& statistics() const { return statistics_; }

protected:
Array() = default;
ARROW_DEFAULT_MOVE_AND_ASSIGN(Array);

std::shared_ptr<ArrayData> data_;
const uint8_t* null_bitmap_data_ = NULLPTR;

/// Protected method for constructors
void SetData(const std::shared_ptr<ArrayData>& data) {
/// Protected method for constructors. This must be called from each
/// array class to call its SetData(). You can't use call this in a
/// parent array class.
void Init(const std::shared_ptr<ArrayData>& data,
const std::shared_ptr<ArrayStatistics>& statistics) {
SetData(data);
if (statistics) {
SetStatistics(statistics);
}
}

/// Protected method for constructors. Don't call this method
/// directly. This should be called from Init().
virtual void SetData(const std::shared_ptr<ArrayData>& data) {
if (data->buffers.size() > 0) {
null_bitmap_data_ = data->GetValuesSafe<uint8_t>(0, /*offset=*/0);
} else {
Expand All @@ -249,6 +267,15 @@ class ARROW_EXPORT Array {
data_ = data;
}

// The statistics for this Array.
std::shared_ptr<ArrayStatistics> statistics_;

/// Protected method for constructors. Don't call this method
/// directly. This should be called from Init().
void SetStatistics(const std::shared_ptr<ArrayStatistics>& statistics) {
statistics_ = statistics;
}

private:
ARROW_DISALLOW_COPY_AND_ASSIGN(Array);

Expand All @@ -261,32 +288,22 @@ static inline std::ostream& operator<<(std::ostream& os, const Array& x) {
}

/// Base class for non-nested arrays
class ARROW_EXPORT FlatArray : public Array {
protected:
using Array::Array;
};
class ARROW_EXPORT FlatArray : public Array {};

/// Base class for arrays of fixed-size logical types
class ARROW_EXPORT PrimitiveArray : public FlatArray {
public:
PrimitiveArray(const std::shared_ptr<DataType>& type, int64_t length,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap = NULLPTR,
int64_t null_count = kUnknownNullCount, int64_t offset = 0);

/// Does not account for any slice offset
const std::shared_ptr<Buffer>& values() const { return data_->buffers[1]; }

protected:
PrimitiveArray() : raw_values_(NULLPTR) {}

void SetData(const std::shared_ptr<ArrayData>& data) {
this->Array::SetData(data);
void SetData(const std::shared_ptr<ArrayData>& data) override {
Array::SetData(data);
raw_values_ = data->GetValuesSafe<uint8_t>(1, /*offset=*/0);
}

explicit PrimitiveArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }

const uint8_t* raw_values_;
};

Expand All @@ -295,11 +312,14 @@ class ARROW_EXPORT NullArray : public FlatArray {
public:
using TypeClass = NullType;

explicit NullArray(const std::shared_ptr<ArrayData>& data) { SetData(data); }
explicit NullArray(const std::shared_ptr<ArrayData>& data,
const std::shared_ptr<ArrayStatistics>& statistics = NULLPTR) {
Init(data, statistics);
}
explicit NullArray(int64_t length);

private:
void SetData(const std::shared_ptr<ArrayData>& data) {
void SetData(const std::shared_ptr<ArrayData>& data) override {
null_bitmap_data_ = NULLPTR;
data->null_count = data->length;
data_ = data;
Expand Down
78 changes: 33 additions & 45 deletions cpp/src/arrow/array/array_binary.cc
Original file line number Diff line number Diff line change
Expand Up @@ -32,36 +32,19 @@ namespace arrow {

using internal::checked_cast;

BinaryArray::BinaryArray(const std::shared_ptr<ArrayData>& data) {
void BinaryArray::SetData(const std::shared_ptr<ArrayData>& data) {
ARROW_CHECK(is_binary_like(data->type->id()));
SetData(data);
BaseBinaryArray<BinaryType>::SetData(data);
}

BinaryArray::BinaryArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap, int64_t null_count,
int64_t offset) {
SetData(ArrayData::Make(binary(), length, {null_bitmap, value_offsets, data},
null_count, offset));
}

LargeBinaryArray::LargeBinaryArray(const std::shared_ptr<ArrayData>& data) {
void LargeBinaryArray::SetData(const std::shared_ptr<ArrayData>& data) {
ARROW_CHECK(is_large_binary_like(data->type->id()));
SetData(data);
BaseBinaryArray<LargeBinaryType>::SetData(data);
}

LargeBinaryArray::LargeBinaryArray(int64_t length,
const std::shared_ptr<Buffer>& value_offsets,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap,
int64_t null_count, int64_t offset) {
SetData(ArrayData::Make(large_binary(), length, {null_bitmap, value_offsets, data},
null_count, offset));
}

StringArray::StringArray(const std::shared_ptr<ArrayData>& data) {
void StringArray::SetData(const std::shared_ptr<ArrayData>& data) {
ARROW_CHECK_EQ(data->type->id(), Type::STRING);
SetData(data);
BinaryArray::SetData(data);
}

StringArray::StringArray(int64_t length, const std::shared_ptr<Buffer>& value_offsets,
Expand All @@ -74,9 +57,9 @@ StringArray::StringArray(int64_t length, const std::shared_ptr<Buffer>& value_of

Status StringArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); }

LargeStringArray::LargeStringArray(const std::shared_ptr<ArrayData>& data) {
void LargeStringArray::SetData(const std::shared_ptr<ArrayData>& data) {
ARROW_CHECK_EQ(data->type->id(), Type::LARGE_STRING);
SetData(data);
LargeBinaryArray::SetData(data);
}

LargeStringArray::LargeStringArray(int64_t length,
Expand All @@ -90,45 +73,50 @@ LargeStringArray::LargeStringArray(int64_t length,

Status LargeStringArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); }

BinaryViewArray::BinaryViewArray(std::shared_ptr<ArrayData> data) {
ARROW_CHECK_EQ(data->type->id(), Type::BINARY_VIEW);
SetData(std::move(data));
void BinaryViewArray::SetData(const std::shared_ptr<ArrayData>& data) {
ARROW_CHECK_EQ(data->type->id(), expected_type_id());
FlatArray::SetData(data);
raw_values_ = data_->GetValuesSafe<c_type>(1);
}

namespace {
void InitViewArrayBuffers(BufferVector& buffers, std::shared_ptr<Buffer> views,
std::shared_ptr<Buffer> null_bitmap) {
buffers.insert(buffers.begin(), std::move(views));
buffers.insert(buffers.begin(), std::move(null_bitmap));
}
}; // namespace

BinaryViewArray::BinaryViewArray(std::shared_ptr<DataType> type, int64_t length,
std::shared_ptr<Buffer> views, BufferVector buffers,
std::shared_ptr<Buffer> null_bitmap, int64_t null_count,
int64_t offset) {
buffers.insert(buffers.begin(), std::move(views));
buffers.insert(buffers.begin(), std::move(null_bitmap));
SetData(
ArrayData::Make(std::move(type), length, std::move(buffers), null_count, offset));
InitViewArrayBuffers(buffers, std::move(views), std::move(null_bitmap));
Init(ArrayData::Make(std::move(type), length, std::move(buffers), null_count, offset),
nullptr);
}

std::string_view BinaryViewArray::GetView(int64_t i) const {
const std::shared_ptr<Buffer>* data_buffers = data_->buffers.data() + 2;
return util::FromBinaryView(raw_values_[i], data_buffers);
}

StringViewArray::StringViewArray(std::shared_ptr<ArrayData> data) {
ARROW_CHECK_EQ(data->type->id(), Type::STRING_VIEW);
SetData(std::move(data));
StringViewArray::StringViewArray(std::shared_ptr<DataType> type, int64_t length,
std::shared_ptr<Buffer> views, BufferVector buffers,
std::shared_ptr<Buffer> null_bitmap, int64_t null_count,
int64_t offset) {
InitViewArrayBuffers(buffers, std::move(views), std::move(null_bitmap));
Init(ArrayData::Make(std::move(type), length, std::move(buffers), null_count, offset),
nullptr);
}

Status StringViewArray::ValidateUTF8() const { return internal::ValidateUTF8(*data_); }

FixedSizeBinaryArray::FixedSizeBinaryArray(const std::shared_ptr<ArrayData>& data) {
SetData(data);
void FixedSizeBinaryArray::SetData(const std::shared_ptr<ArrayData>& data) {
PrimitiveArray::SetData(data);
byte_width_ = internal::checked_cast<const FixedSizeBinaryType&>(*type()).byte_width();
}

FixedSizeBinaryArray::FixedSizeBinaryArray(const std::shared_ptr<DataType>& type,
int64_t length,
const std::shared_ptr<Buffer>& data,
const std::shared_ptr<Buffer>& null_bitmap,
int64_t null_count, int64_t offset)
: PrimitiveArray(type, length, data, null_bitmap, null_count, offset),
byte_width_(checked_cast<const FixedSizeBinaryType&>(*type).byte_width()) {}

const uint8_t* FixedSizeBinaryArray::GetValue(int64_t i) const {
return raw_values_ + (i + data_->offset) * byte_width_;
}
Expand Down
Loading

0 comments on commit abef9fc

Please sign in to comment.