From a1376879ced6c0bc14dcc1e27c0c23c6ad1554a9 Mon Sep 17 00:00:00 2001 From: Felipe Oliveira Carvalho Date: Wed, 17 Jul 2024 13:39:22 -0300 Subject: [PATCH] GH-43185: [C++] Suggest a cast when Concatenate fails due to offsets overflow (#43190) ## Rationale for this change When arrays using 32-bit offsets into data buffers are concatenated and the data buffers of the results grow beyond 2GB, `Concatenate` returns a bad `Status` with a very simple message: `"offset overflow while concatenating arrays"` The contract that `Concatenate` honors is very simple: arrays of input type T lead to output of the same type T, so we can't, for instance, return a `LARGE_STRING` [1] array when the input is `STRING`. But we can **suggest a cast** to the caller in case an overflow error is detected. Either programatically (by taking an output parameter) or by giving a better error message to users. [1] `LARGE_STRING` can use 64-bit offsets ### What changes are included in this PR? - Suggest casts when concatenation of the values of an FSL fail due to overflow - Suggest casts when concatenation of [LARGE_]LIST_VIEW array fails due to overflow - Suggest casts when concatenation of [LARGE_]LIST array fails due to overflow - Suggest a cast to LARGE_(BINARY|STRING) when offsets overflow ### Are these changes tested? Yes. * GitHub Issue: #43185 Lead-authored-by: Felipe Oliveira Carvalho Co-authored-by: Benjamin Kietzman Signed-off-by: Felipe Oliveira Carvalho --- cpp/src/arrow/array/concatenate.cc | 262 ++++++++++++++++++------ cpp/src/arrow/array/concatenate.h | 16 ++ cpp/src/arrow/array/concatenate_test.cc | 103 +++++++++- 3 files changed, 316 insertions(+), 65 deletions(-) diff --git a/cpp/src/arrow/array/concatenate.cc b/cpp/src/arrow/array/concatenate.cc index 87e55246c78fe..b4638dd6593d8 100644 --- a/cpp/src/arrow/array/concatenate.cc +++ b/cpp/src/arrow/array/concatenate.cc @@ -75,6 +75,31 @@ struct Bitmap { bool AllSet() const { return data == nullptr; } }; +enum class OffsetBufferOpOutcome { + kOk, + kOffsetOverflow, +}; + +Status OffsetOverflowStatus() { + return Status::Invalid("offset overflow while concatenating arrays"); +} + +#define RETURN_IF_NOT_OK_OUTCOME(outcome) \ + switch (outcome) { \ + case OffsetBufferOpOutcome::kOk: \ + break; \ + case OffsetBufferOpOutcome::kOffsetOverflow: \ + return OffsetOverflowStatus(); \ + } + +struct ErrorHints { + /// \brief Suggested cast to avoid overflow during concatenation. + /// + /// If the concatenation of offsets overflows, this field might be set to the + /// a type that uses larger offsets (e.g. large_utf8, large_list). + std::shared_ptr suggested_cast; +}; + // Allocate a buffer and concatenate bitmaps into it. Status ConcatenateBitmaps(const std::vector& bitmaps, MemoryPool* pool, std::shared_ptr* out) { @@ -112,15 +137,16 @@ int64_t SumBufferSizesInBytes(const BufferVector& buffers) { // Write offsets in src into dst, adjusting them such that first_offset // will be the first offset written. template -Status PutOffsets(const Buffer& src, Offset first_offset, Offset* dst, - Range* values_range); +Result PutOffsets(const Buffer& src, Offset first_offset, + Offset* dst, Range* values_range); // Concatenate buffers holding offsets into a single buffer of offsets, // also computing the ranges of values spanned by each buffer of offsets. template -Status ConcatenateOffsets(const BufferVector& buffers, MemoryPool* pool, - std::shared_ptr* out, - std::vector* values_ranges) { +Result ConcatenateOffsets(const BufferVector& buffers, + MemoryPool* pool, + std::shared_ptr* out, + std::vector* values_ranges) { values_ranges->resize(buffers.size()); // allocate output buffer @@ -133,26 +159,30 @@ Status ConcatenateOffsets(const BufferVector& buffers, MemoryPool* pool, for (size_t i = 0; i < buffers.size(); ++i) { // the first offset from buffers[i] will be adjusted to values_length // (the cumulative length of values spanned by offsets in previous buffers) - RETURN_NOT_OK(PutOffsets(*buffers[i], values_length, - out_data + elements_length, &(*values_ranges)[i])); + ARROW_ASSIGN_OR_RAISE(auto outcome, PutOffsets(*buffers[i], values_length, + out_data + elements_length, + &(*values_ranges)[i])); + if (ARROW_PREDICT_FALSE(outcome != OffsetBufferOpOutcome::kOk)) { + return outcome; + } elements_length += buffers[i]->size() / sizeof(Offset); values_length += static_cast((*values_ranges)[i].length); } // the final element in out_data is the length of all values spanned by the offsets out_data[out_size_in_bytes / sizeof(Offset)] = values_length; - return Status::OK(); + return OffsetBufferOpOutcome::kOk; } template -Status PutOffsets(const Buffer& src, Offset first_offset, Offset* dst, - Range* values_range) { +Result PutOffsets(const Buffer& src, Offset first_offset, + Offset* dst, Range* values_range) { if (src.size() == 0) { // It's allowed to have an empty offsets buffer for a 0-length array // (see Array::Validate) values_range->offset = 0; values_range->length = 0; - return Status::OK(); + return OffsetBufferOpOutcome::kOk; } // Get the range of offsets to transfer from src @@ -162,8 +192,9 @@ Status PutOffsets(const Buffer& src, Offset first_offset, Offset* dst, // Compute the range of values which is spanned by this range of offsets values_range->offset = src_begin[0]; values_range->length = *src_end - values_range->offset; - if (first_offset > std::numeric_limits::max() - values_range->length) { - return Status::Invalid("offset overflow while concatenating arrays"); + if (ARROW_PREDICT_FALSE(first_offset > + std::numeric_limits::max() - values_range->length)) { + return OffsetBufferOpOutcome::kOffsetOverflow; } // Write offsets into dst, ensuring that the first offset written is @@ -175,12 +206,14 @@ Status PutOffsets(const Buffer& src, Offset first_offset, Offset* dst, std::transform(src_begin, src_end, dst, [displacement](Offset offset) { return SafeSignedAdd(offset, displacement); }); - return Status::OK(); + return OffsetBufferOpOutcome::kOk; } template -Status PutListViewOffsets(const ArrayData& input, offset_type* sizes, const Buffer& src, - offset_type displacement, offset_type* dst); +Result PutListViewOffsets(const ArrayData& input, + offset_type* sizes, const Buffer& src, + offset_type displacement, + offset_type* dst); // Concatenate buffers holding list-view offsets into a single buffer of offsets // @@ -198,10 +231,10 @@ Status PutListViewOffsets(const ArrayData& input, offset_type* sizes, const Buff // \param[in] in The child arrays // \param[in,out] sizes The concatenated sizes buffer template -Status ConcatenateListViewOffsets(const ArrayDataVector& in, offset_type* sizes, - const BufferVector& offset_buffers, - const std::vector& value_ranges, - MemoryPool* pool, std::shared_ptr* out) { +Result ConcatenateListViewOffsets( + const ArrayDataVector& in, offset_type* sizes, const BufferVector& offset_buffers, + const std::vector& value_ranges, MemoryPool* pool, + std::shared_ptr* out) { DCHECK_EQ(offset_buffers.size(), value_ranges.size()); // Allocate resulting offsets buffer and initialize it with zeros @@ -216,26 +249,32 @@ Status ConcatenateListViewOffsets(const ArrayDataVector& in, offset_type* sizes, for (size_t i = 0; i < offset_buffers.size(); ++i) { const auto displacement = static_cast(num_child_values - value_ranges[i].offset); - RETURN_NOT_OK(PutListViewOffsets(*in[i], /*sizes=*/sizes + elements_length, - /*src=*/*offset_buffers[i], displacement, - /*dst=*/out_offsets + elements_length)); + ARROW_ASSIGN_OR_RAISE(auto outcome, + PutListViewOffsets(*in[i], /*sizes=*/sizes + elements_length, + /*src=*/*offset_buffers[i], displacement, + /*dst=*/out_offsets + elements_length)); + if (ARROW_PREDICT_FALSE(outcome != OffsetBufferOpOutcome::kOk)) { + return outcome; + } elements_length += offset_buffers[i]->size() / sizeof(offset_type); num_child_values += value_ranges[i].length; if (num_child_values > std::numeric_limits::max()) { - return Status::Invalid("offset overflow while concatenating arrays"); + return OffsetBufferOpOutcome::kOffsetOverflow; } } DCHECK_EQ(elements_length, static_cast(out_size_in_bytes / sizeof(offset_type))); - return Status::OK(); + return OffsetBufferOpOutcome::kOk; } template -Status PutListViewOffsets(const ArrayData& input, offset_type* sizes, const Buffer& src, - offset_type displacement, offset_type* dst) { +Result PutListViewOffsets(const ArrayData& input, + offset_type* sizes, const Buffer& src, + offset_type displacement, + offset_type* dst) { if (src.size() == 0) { - return Status::OK(); + return OffsetBufferOpOutcome::kOk; } const auto& validity_buffer = input.buffers[0]; if (validity_buffer) { @@ -291,7 +330,7 @@ Status PutListViewOffsets(const ArrayData& input, offset_type* sizes, const Buff } } } - return Status::OK(); + return OffsetBufferOpOutcome::kOk; } class ConcatenateImpl { @@ -316,11 +355,17 @@ class ConcatenateImpl { } } - Status Concatenate(std::shared_ptr* out) && { + Status Concatenate(std::shared_ptr* out, ErrorHints* out_hints) && { if (out_->null_count != 0 && internal::may_have_validity_bitmap(out_->type->id())) { RETURN_NOT_OK(ConcatenateBitmaps(Bitmaps(0), pool_, &out_->buffers[0])); } - RETURN_NOT_OK(VisitTypeInline(*out_->type, this)); + auto status = VisitTypeInline(*out_->type, this); + if (!status.ok()) { + if (out_hints) { + out_hints->suggested_cast = std::move(suggested_cast_); + } + return status; + } *out = std::move(out_); return Status::OK(); } @@ -337,11 +382,29 @@ class ConcatenateImpl { return ConcatenateBuffers(buffers, pool_).Value(&out_->buffers[1]); } - Status Visit(const BinaryType&) { + Status Visit(const BinaryType& input_type) { std::vector value_ranges; ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, sizeof(int32_t))); - RETURN_NOT_OK(ConcatenateOffsets(index_buffers, pool_, &out_->buffers[1], - &value_ranges)); + ARROW_ASSIGN_OR_RAISE( + auto outcome, ConcatenateOffsets(index_buffers, pool_, &out_->buffers[1], + &value_ranges)); + switch (outcome) { + case OffsetBufferOpOutcome::kOk: + break; + case OffsetBufferOpOutcome::kOffsetOverflow: + switch (input_type.id()) { + case Type::BINARY: + suggested_cast_ = large_binary(); + break; + case Type::STRING: + suggested_cast_ = large_utf8(); + break; + default: + DCHECK(false) << "unexpected type id from BinaryType: " << input_type; + break; + } + return OffsetOverflowStatus(); + } ARROW_ASSIGN_OR_RAISE(auto value_buffers, Buffers(2, value_ranges)); return ConcatenateBuffers(value_buffers, pool_).Value(&out_->buffers[2]); } @@ -349,8 +412,10 @@ class ConcatenateImpl { Status Visit(const LargeBinaryType&) { std::vector value_ranges; ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, sizeof(int64_t))); - RETURN_NOT_OK(ConcatenateOffsets(index_buffers, pool_, &out_->buffers[1], - &value_ranges)); + ARROW_ASSIGN_OR_RAISE( + auto outcome, ConcatenateOffsets(index_buffers, pool_, &out_->buffers[1], + &value_ranges)); + RETURN_IF_NOT_OK_OUTCOME(outcome); ARROW_ASSIGN_OR_RAISE(auto value_buffers, Buffers(2, value_ranges)); return ConcatenateBuffers(value_buffers, pool_).Value(&out_->buffers[2]); } @@ -394,22 +459,44 @@ class ConcatenateImpl { return Status::OK(); } - Status Visit(const ListType&) { + Status Visit(const ListType& input_type) { std::vector value_ranges; ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, sizeof(int32_t))); - RETURN_NOT_OK(ConcatenateOffsets(index_buffers, pool_, &out_->buffers[1], - &value_ranges)); + ARROW_ASSIGN_OR_RAISE(auto offsets_outcome, + ConcatenateOffsets(index_buffers, pool_, + &out_->buffers[1], &value_ranges)); + switch (offsets_outcome) { + case OffsetBufferOpOutcome::kOk: + break; + case OffsetBufferOpOutcome::kOffsetOverflow: + suggested_cast_ = large_list(input_type.value_type()); + return OffsetOverflowStatus(); + } ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(0, value_ranges)); - return ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[0]); + ErrorHints child_error_hints; + auto status = ConcatenateImpl(child_data, pool_) + .Concatenate(&out_->child_data[0], &child_error_hints); + if (!status.ok() && child_error_hints.suggested_cast) { + suggested_cast_ = list(std::move(child_error_hints.suggested_cast)); + } + return status; } Status Visit(const LargeListType&) { std::vector value_ranges; ARROW_ASSIGN_OR_RAISE(auto index_buffers, Buffers(1, sizeof(int64_t))); - RETURN_NOT_OK(ConcatenateOffsets(index_buffers, pool_, &out_->buffers[1], - &value_ranges)); + ARROW_ASSIGN_OR_RAISE( + auto outcome, ConcatenateOffsets(index_buffers, pool_, &out_->buffers[1], + &value_ranges)); + RETURN_IF_NOT_OK_OUTCOME(outcome); ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(0, value_ranges)); - return ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[0]); + ErrorHints child_error_hints; + auto status = ConcatenateImpl(child_data, pool_) + .Concatenate(&out_->child_data[0], &child_error_hints); + if (!status.ok() && child_error_hints.suggested_cast) { + suggested_cast_ = large_list(std::move(child_error_hints.suggested_cast)); + } + return status; } template @@ -430,8 +517,17 @@ class ConcatenateImpl { } // Concatenate the values + ErrorHints child_error_hints; ARROW_ASSIGN_OR_RAISE(ArrayDataVector value_data, ChildData(0, value_ranges)); - RETURN_NOT_OK(ConcatenateImpl(value_data, pool_).Concatenate(&out_->child_data[0])); + auto values_status = ConcatenateImpl(value_data, pool_) + .Concatenate(&out_->child_data[0], &child_error_hints); + if (!values_status.ok()) { + if (child_error_hints.suggested_cast) { + suggested_cast_ = std::make_shared>( + std::move(child_error_hints.suggested_cast)); + } + return values_status; + } out_->child_data[0]->type = type.value_type(); // Concatenate the sizes first @@ -440,22 +536,39 @@ class ConcatenateImpl { // Concatenate the offsets ARROW_ASSIGN_OR_RAISE(auto offset_buffers, Buffers(1, sizeof(offset_type))); - RETURN_NOT_OK(ConcatenateListViewOffsets( - in_, /*sizes=*/out_->buffers[2]->mutable_data_as(), offset_buffers, - value_ranges, pool_, &out_->buffers[1])); - + ARROW_ASSIGN_OR_RAISE( + auto outcome, ConcatenateListViewOffsets( + in_, /*sizes=*/out_->buffers[2]->mutable_data_as(), + offset_buffers, value_ranges, pool_, &out_->buffers[1])); + switch (outcome) { + case OffsetBufferOpOutcome::kOk: + break; + case OffsetBufferOpOutcome::kOffsetOverflow: + if constexpr (T::type_id == Type::LIST_VIEW) { + suggested_cast_ = large_list_view(type.value_type()); + } + return OffsetOverflowStatus(); + } return Status::OK(); } - Status Visit(const FixedSizeListType& fixed_size_list) { - ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(0, fixed_size_list.list_size())); - return ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[0]); + Status Visit(const FixedSizeListType& fsl_type) { + ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(0, fsl_type.list_size())); + ErrorHints hints; + auto status = + ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[0], &hints); + if (!status.ok() && hints.suggested_cast) { + suggested_cast_ = + fixed_size_list(std::move(hints.suggested_cast), fsl_type.list_size()); + } + return status; } Status Visit(const StructType& s) { for (int i = 0; i < s.num_fields(); ++i) { ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(i)); - RETURN_NOT_OK(ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[i])); + RETURN_NOT_OK(ConcatenateImpl(child_data, pool_) + .Concatenate(&out_->child_data[i], /*hints=*/nullptr)); } return Status::OK(); } @@ -570,8 +683,8 @@ class ConcatenateImpl { case UnionMode::SPARSE: { for (int i = 0; i < u.num_fields(); i++) { ARROW_ASSIGN_OR_RAISE(auto child_data, ChildData(i)); - RETURN_NOT_OK( - ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[i])); + RETURN_NOT_OK(ConcatenateImpl(child_data, pool_) + .Concatenate(&out_->child_data[i], /*hints=*/nullptr)); } break; } @@ -581,8 +694,8 @@ class ConcatenateImpl { for (size_t j = 0; j < in_.size(); j++) { child_data[j] = in_[j]->child_data[i]; } - RETURN_NOT_OK( - ConcatenateImpl(child_data, pool_).Concatenate(&out_->child_data[i])); + RETURN_NOT_OK(ConcatenateImpl(child_data, pool_) + .Concatenate(&out_->child_data[i], /*hints=*/nullptr)); } break; } @@ -666,7 +779,8 @@ class ConcatenateImpl { storage_data[i]->type = e.storage_type(); } std::shared_ptr out_storage; - RETURN_NOT_OK(ConcatenateImpl(storage_data, pool_).Concatenate(&out_storage)); + RETURN_NOT_OK(ConcatenateImpl(storage_data, pool_) + .Concatenate(&out_storage, /*hints=*/nullptr)); out_storage->type = in_[0]->type; out_ = std::move(out_storage); return Status::OK(); @@ -797,11 +911,18 @@ class ConcatenateImpl { const ArrayDataVector& in_; MemoryPool* pool_; std::shared_ptr out_; + std::shared_ptr suggested_cast_; }; } // namespace -Result> Concatenate(const ArrayVector& arrays, MemoryPool* pool) { +namespace internal { + +Result> Concatenate( + const ArrayVector& arrays, MemoryPool* pool, + std::shared_ptr* out_suggested_cast) { + DCHECK(out_suggested_cast); + *out_suggested_cast = nullptr; if (arrays.size() == 0) { return Status::Invalid("Must pass at least one array"); } @@ -818,8 +939,31 @@ Result> Concatenate(const ArrayVector& arrays, MemoryPool } std::shared_ptr out_data; - RETURN_NOT_OK(ConcatenateImpl(data, pool).Concatenate(&out_data)); + ErrorHints hints; + auto status = ConcatenateImpl(data, pool).Concatenate(&out_data, &hints); + if (!status.ok()) { + if (hints.suggested_cast) { + DCHECK(status.IsInvalid()); + *out_suggested_cast = std::move(hints.suggested_cast); + } + return status; + } return MakeArray(std::move(out_data)); } +} // namespace internal + +Result> Concatenate(const ArrayVector& arrays, MemoryPool* pool) { + std::shared_ptr suggested_cast; + auto result = internal::Concatenate(arrays, pool, &suggested_cast); + if (!result.ok() && suggested_cast && arrays.size() > 0) { + DCHECK(result.status().IsInvalid()); + return Status::Invalid(result.status().message(), ", consider casting input from `", + *arrays[0]->type(), "` to `", *suggested_cast, "` first."); + } + return result; +} + +#undef RETURN_IF_NOT_OK_OUTCOME + } // namespace arrow diff --git a/cpp/src/arrow/array/concatenate.h b/cpp/src/arrow/array/concatenate.h index e7597aad812c4..aada5624d63a3 100644 --- a/cpp/src/arrow/array/concatenate.h +++ b/cpp/src/arrow/array/concatenate.h @@ -24,6 +24,22 @@ #include "arrow/util/visibility.h" namespace arrow { +namespace internal { + +/// \brief Concatenate arrays +/// +/// \param[in] arrays a vector of arrays to be concatenated +/// \param[in] pool memory to store the result will be allocated from this memory pool +/// \param[out] out_suggested_cast if a non-OK Result is returned, the function might set +/// out_suggested_cast to a cast suggestion that would allow concatenating the arrays +/// without overflow of offsets (e.g. string to large_string) +/// +/// \return the concatenated array +ARROW_EXPORT +Result> Concatenate(const ArrayVector& arrays, MemoryPool* pool, + std::shared_ptr* out_suggested_cast); + +} // namespace internal /// \brief Concatenate arrays /// diff --git a/cpp/src/arrow/array/concatenate_test.cc b/cpp/src/arrow/array/concatenate_test.cc index af595e897f9ee..aea5311575299 100644 --- a/cpp/src/arrow/array/concatenate_test.cc +++ b/cpp/src/arrow/array/concatenate_test.cc @@ -29,6 +29,7 @@ #include #include +#include #include #include "arrow/array.h" @@ -42,6 +43,7 @@ #include "arrow/testing/util.h" #include "arrow/type.h" #include "arrow/util/list_util.h" +#include "arrow/util/unreachable.h" namespace arrow { @@ -661,14 +663,103 @@ TEST_F(ConcatenateTest, ExtensionType) { }); } +std::shared_ptr LargeVersionOfType(const std::shared_ptr& type) { + switch (type->id()) { + case Type::BINARY: + return large_binary(); + case Type::STRING: + return large_utf8(); + case Type::LIST: + return large_list(static_cast(*type).value_type()); + case Type::LIST_VIEW: + return large_list_view(static_cast(*type).value_type()); + case Type::LARGE_BINARY: + case Type::LARGE_STRING: + case Type::LARGE_LIST: + case Type::LARGE_LIST_VIEW: + return type; + default: + Unreachable(); + } +} + +std::shared_ptr fixed_size_list_of_1(std::shared_ptr type) { + return fixed_size_list(std::move(type), 1); +} + TEST_F(ConcatenateTest, OffsetOverflow) { - auto fake_long = ArrayFromJSON(utf8(), "[\"\"]"); - fake_long->data()->GetMutableValues(1)[1] = + using TypeFactory = std::shared_ptr (*)(std::shared_ptr); + static const std::vector kNestedTypeFactories = { + list, large_list, list_view, large_list_view, fixed_size_list_of_1, + }; + + auto* pool = default_memory_pool(); + std::shared_ptr suggested_cast; + for (auto& ty : {binary(), utf8()}) { + auto large_ty = LargeVersionOfType(ty); + + auto fake_long = ArrayFromJSON(ty, "[\"\"]"); + fake_long->data()->GetMutableValues(1)[1] = + std::numeric_limits::max(); + // XXX: since the data fake_long claims to own isn't there, this would + // segfault if Concatenate didn't detect overflow and raise an error. + auto concatenate_status = Concatenate({fake_long, fake_long}); + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, + ::testing::StrEq("Invalid: offset overflow while concatenating arrays, " + "consider casting input from `" + + ty->ToString() + "` to `large_" + ty->ToString() + "` first."), + concatenate_status); + + concatenate_status = + internal::Concatenate({fake_long, fake_long}, pool, &suggested_cast); + // Message is doesn't contain the suggested cast type when the caller + // asks for it by passing the output parameter. + EXPECT_RAISES_WITH_MESSAGE_THAT( + Invalid, ::testing::StrEq("Invalid: offset overflow while concatenating arrays"), + concatenate_status); + ASSERT_TRUE(large_ty->Equals(*suggested_cast)); + + // Check that the suggested cast is correct when concatenation + // fails due to the child array being too large. + for (auto factory : kNestedTypeFactories) { + auto nested_ty = factory(ty); + auto expected_suggestion = factory(large_ty); + auto fake_long_list = ArrayFromJSON(nested_ty, "[[\"\"]]"); + fake_long_list->data()->child_data[0] = fake_long->data(); + + ASSERT_RAISES(Invalid, internal::Concatenate({fake_long_list, fake_long_list}, pool, + &suggested_cast) + .status()); + ASSERT_TRUE(suggested_cast->Equals(*expected_suggestion)); + } + } + + auto list_ty = list(utf8()); + auto fake_long_list = ArrayFromJSON(list_ty, "[[\"Hello\"]]"); + fake_long_list->data()->GetMutableValues(1)[1] = std::numeric_limits::max(); - std::shared_ptr concatenated; - // XX since the data fake_long claims to own isn't there, this will segfault if - // Concatenate doesn't detect overflow and raise an error. - ASSERT_RAISES(Invalid, Concatenate({fake_long, fake_long}).status()); + ASSERT_RAISES(Invalid, internal::Concatenate({fake_long_list, fake_long_list}, pool, + &suggested_cast) + .status()); + ASSERT_TRUE(suggested_cast->Equals(LargeVersionOfType(list_ty))); + + auto list_view_ty = list_view(null()); + auto fake_long_list_view = ArrayFromJSON(list_view_ty, "[[], []]"); + { + constexpr int kInt32Max = std::numeric_limits::max(); + auto* values = fake_long_list_view->data()->child_data[0].get(); + auto* mutable_offsets = fake_long_list_view->data()->GetMutableValues(1); + auto* mutable_sizes = fake_long_list_view->data()->GetMutableValues(2); + values->length = 2 * static_cast(kInt32Max); + mutable_offsets[1] = kInt32Max; + mutable_offsets[0] = kInt32Max; + mutable_sizes[0] = kInt32Max; + } + ASSERT_RAISES(Invalid, internal::Concatenate({fake_long_list_view, fake_long_list_view}, + pool, &suggested_cast) + .status()); + ASSERT_TRUE(suggested_cast->Equals(LargeVersionOfType(list_view_ty))); } TEST_F(ConcatenateTest, DictionaryConcatenateWithEmptyUint16) {