Skip to content

Commit

Permalink
apacheGH-38071: [C++][CI] Fix Overlap column chunk ranges for pre-buf…
Browse files Browse the repository at this point in the history
…fer (apache#38073)

### Rationale for this change

The C++ Parquet Arrow fuzz will generate bad Parquet file with bad row-range, this patch change the `CoalesceReadRanges` to return `Result<>`.

### What changes are included in this PR?

Just a checking, change `CoalesceReadRanges` to return `Result<>`.

### Are these changes tested?

No.

### Are there any user-facing changes?

No.

* Closes: apache#38071

Authored-by: mwish <maplewish117@gmail.com>
Signed-off-by: Sutou Kouhei <kou@clear-code.com>
  • Loading branch information
mapleFU committed Oct 8, 2023
1 parent 60916fe commit 0b9f817
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 12 deletions.
5 changes: 3 additions & 2 deletions cpp/src/arrow/io/caching.cc
Original file line number Diff line number Diff line change
Expand Up @@ -174,8 +174,9 @@ struct ReadRangeCache::Impl {

// Add the given ranges to the cache, coalescing them where possible
virtual Status Cache(std::vector<ReadRange> ranges) {
ranges = internal::CoalesceReadRanges(std::move(ranges), options.hole_size_limit,
options.range_size_limit);
ARROW_ASSIGN_OR_RAISE(
ranges, internal::CoalesceReadRanges(std::move(ranges), options.hole_size_limit,
options.range_size_limit));
std::vector<RangeCacheEntry> new_entries = MakeCacheEntries(ranges);
// Add new entries, themselves ordered by offset
if (entries.size() > 0) {
Expand Down
12 changes: 7 additions & 5 deletions cpp/src/arrow/io/interfaces.cc
Original file line number Diff line number Diff line change
Expand Up @@ -425,7 +425,7 @@ ThreadPool* GetIOThreadPool() {
namespace {

struct ReadRangeCombiner {
std::vector<ReadRange> Coalesce(std::vector<ReadRange> ranges) {
Result<std::vector<ReadRange>> Coalesce(std::vector<ReadRange> ranges) {
if (ranges.empty()) {
return ranges;
}
Expand Down Expand Up @@ -454,7 +454,9 @@ struct ReadRangeCombiner {
const auto& left = ranges[i];
const auto& right = ranges[i + 1];
DCHECK_LE(left.offset, right.offset);
DCHECK_LE(left.offset + left.length, right.offset) << "Some read ranges overlap";
if (left.offset + left.length > right.offset) {
return Status::IOError("Some read ranges overlap");
}
}
#endif

Expand Down Expand Up @@ -509,9 +511,9 @@ struct ReadRangeCombiner {

}; // namespace

std::vector<ReadRange> CoalesceReadRanges(std::vector<ReadRange> ranges,
int64_t hole_size_limit,
int64_t range_size_limit) {
Result<std::vector<ReadRange>> CoalesceReadRanges(std::vector<ReadRange> ranges,
int64_t hole_size_limit,
int64_t range_size_limit) {
DCHECK_GT(range_size_limit, hole_size_limit);

ReadRangeCombiner combiner{hole_size_limit, range_size_limit};
Expand Down
6 changes: 3 additions & 3 deletions cpp/src/arrow/io/util_internal.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,9 +45,9 @@ ARROW_EXPORT Status ValidateWriteRange(int64_t offset, int64_t size, int64_t fil
ARROW_EXPORT Status ValidateRange(int64_t offset, int64_t size);

ARROW_EXPORT
std::vector<ReadRange> CoalesceReadRanges(std::vector<ReadRange> ranges,
int64_t hole_size_limit,
int64_t range_size_limit);
Result<std::vector<ReadRange>> CoalesceReadRanges(std::vector<ReadRange> ranges,
int64_t hole_size_limit,
int64_t range_size_limit);

ARROW_EXPORT
::arrow::internal::ThreadPool* GetIOThreadPool();
Expand Down
6 changes: 4 additions & 2 deletions cpp/src/parquet/file_reader.cc
Original file line number Diff line number Diff line change
Expand Up @@ -139,8 +139,10 @@ const RowGroupMetaData* RowGroupReader::metadata() const { return contents_->met
::arrow::io::ReadRange ComputeColumnChunkRange(FileMetaData* file_metadata,
int64_t source_size, int row_group_index,
int column_index) {
auto row_group_metadata = file_metadata->RowGroup(row_group_index);
auto column_metadata = row_group_metadata->ColumnChunk(column_index);
std::unique_ptr<RowGroupMetaData> row_group_metadata =
file_metadata->RowGroup(row_group_index);
std::unique_ptr<ColumnChunkMetaData> column_metadata =
row_group_metadata->ColumnChunk(column_index);

int64_t col_start = column_metadata->data_page_offset();
if (column_metadata->has_dictionary_page() &&
Expand Down

0 comments on commit 0b9f817

Please sign in to comment.