Skip to content

Commit

Permalink
add StripeStatistic api and test
Browse files Browse the repository at this point in the history
  • Loading branch information
wushap committed Feb 25, 2025
1 parent da6cfc1 commit f31ebb9
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 1 deletion.
9 changes: 8 additions & 1 deletion c++/include/orc/Reader.hh
Original file line number Diff line number Diff line change
Expand Up @@ -495,10 +495,17 @@ namespace orc {
*/
virtual uint64_t getNumberOfStripeStatistics() const = 0;

/**
 * Get the stripe-level statistics about a stripe, without reading the
 * row group index statistics.
 * @param stripeIndex the index of the stripe (0 to N-1) to get statistics about
 * @return the statistics about that stripe without reading row group index statistics
 */
virtual std::unique_ptr<Statistics> getStripeStatisticsOnly(uint64_t stripeIndex) const = 0;

/**
 * Get the statistics about a stripe.
 * @param stripeIndex the index of the stripe (0 to N-1) to get statistics about
 * @return the statistics about that stripe and row group index statistics
 */
virtual std::unique_ptr<StripeStatistics> getStripeStatistics(uint64_t stripeIndex) const = 0;

Expand Down
14 changes: 14 additions & 0 deletions c++/src/Reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -751,6 +751,20 @@ namespace orc {
return *(contents_->schema.get());
}

std::unique_ptr<Statistics>
ReaderImpl::getStripeStatisticsOnly(uint64_t stripeIndex) const {
if (!isMetadataLoaded_) {
readMetadata();
}
if (contents_->metadata == nullptr) {
throw std::logic_error("No stripe statistics in file");
}
StatContext statContext(hasCorrectStatistics());
return std::unique_ptr<Statistics>(new StatisticsImpl(
contents_->metadata->stripestats(static_cast<int>(stripeIndex)),
statContext));
}

std::unique_ptr<StripeStatistics> ReaderImpl::getStripeStatistics(uint64_t stripeIndex) const {
if (!isMetadataLoaded_) {
readMetadata();
Expand Down
2 changes: 2 additions & 0 deletions c++/src/Reader.hh
Original file line number Diff line number Diff line change
Expand Up @@ -346,6 +346,8 @@ namespace orc {

std::unique_ptr<ColumnStatistics> getColumnStatistics(uint32_t columnId) const override;

// Stripe-level statistics for the given stripe, built from the stripe
// statistics stored in the file metadata.
std::unique_ptr<Statistics> getStripeStatisticsOnly(uint64_t stripeIndex) const override;

std::string getSerializedFileTail() const override;

const Type& getType() const override;
Expand Down
25 changes: 25 additions & 0 deletions c++/test/TestStripeIndexStatistics.cc
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,31 @@ namespace orc {
"length: "
"8000\n",
stringColStats->toString());

std::unique_ptr<orc::Statistics> stripeLevelStats = reader->getStripeStatisticsOnly(0);
const orc::IntegerColumnStatistics* stripeLevelIntColStats;
stripeLevelIntColStats = reinterpret_cast<const orc::IntegerColumnStatistics*>(
stripeLevelStats->getColumnStatistics(1));
EXPECT_EQ(
"Data type: Integer\nValues: 6000\nHas null: no\nMinimum: 1\nMaximum: 6000\nSum: "
"18003000\n",
stripeLevelIntColStats->toString());

const orc::StringColumnStatistics* stripeLevelStringColStats;
stripeLevelStringColStats = reinterpret_cast<const orc::StringColumnStatistics*>(
stripeLevelStats->getColumnStatistics(2));
EXPECT_EQ(
"Data type: String\nValues: 6000\nHas null: no\nMinimum: 1000\nMaximum: 9a\nTotal length: "
"23892\n",
stripeLevelStringColStats->toString());

intColStats =
reinterpret_cast<const orc::IntegerColumnStatistics*>(stripeStats->getColumnStatistics(1));
stringColStats =
reinterpret_cast<const orc::StringColumnStatistics*>(stripeStats->getColumnStatistics(2));

EXPECT_EQ(intColStats->toString(), stripeLevelStringColStats->toString());
EXPECT_EQ(stringColStats->toString(), stripeLevelStringColStats->toString());
}

} // namespace orc

0 comments on commit f31ebb9

Please sign in to comment.