From 255dbf990c3d3e5fb1270a2a11efe0af2be195ab Mon Sep 17 00:00:00 2001 From: Alenka Frim Date: Fri, 31 May 2024 10:09:54 +0200 Subject: [PATCH] GH-41684: [C++][Python] Add optional null_bitmap to MapArray::FromArrays (#41757) ### Rationale for this change When constructing a `MapArray` with `FromArrays` one can not supply a `null_bitmap`. ### What changes are included in this PR? Optional `null_bitmap` argument is added to `MapArray::FromArrays`. ### Are these changes tested? TODO (have them locally, need to clean them up and commit. ### Are there any user-facing changes? No. * GitHub Issue: #41684 Authored-by: AlenkaF Signed-off-by: Joris Van den Bossche --- cpp/src/arrow/array/array_list_test.cc | 17 ++++++++++ cpp/src/arrow/array/array_nested.cc | 45 ++++++++++++++++++-------- cpp/src/arrow/array/array_nested.h | 9 ++++-- python/pyarrow/array.pxi | 11 +++++-- python/pyarrow/includes/libarrow.pxd | 8 +++-- python/pyarrow/tests/test_array.py | 34 +++++++++++++++++++ 6 files changed, 102 insertions(+), 22 deletions(-) diff --git a/cpp/src/arrow/array/array_list_test.cc b/cpp/src/arrow/array/array_list_test.cc index e79ce6fe172b2..55f91dc34167b 100644 --- a/cpp/src/arrow/array/array_list_test.cc +++ b/cpp/src/arrow/array/array_list_test.cc @@ -1368,6 +1368,23 @@ TEST_F(TestMapArray, FromArrays) { ASSERT_EQ(keys_with_null->length(), tmp_items->length()); ASSERT_RAISES(Invalid, MapArray::FromArrays(offsets1, keys_with_null, tmp_items, pool_)); + + // With null_bitmap + ASSERT_OK_AND_ASSIGN(auto map7, MapArray::FromArrays(offsets1, keys, items, pool_, + offsets3->data()->buffers[0])); + ASSERT_OK(map7->Validate()); + MapArray expected7(map_type, length, offsets1->data()->buffers[1], keys, items, + offsets3->data()->buffers[0], 1); + AssertArraysEqual(expected7, *map7); + + // Null bitmap and offset with null + ASSERT_RAISES(Invalid, MapArray::FromArrays(offsets3, keys, items, pool_, + offsets3->data()->buffers[0])); + + // Null bitmap and offset with offset + ASSERT_RAISES(NotImplemented, + MapArray::FromArrays(offsets3->Slice(2), keys, items, pool_, + offsets3->data()->buffers[0])); } TEST_F(TestMapArray, FromArraysEquality) { diff --git a/cpp/src/arrow/array/array_nested.cc b/cpp/src/arrow/array/array_nested.cc index 67a499c2b8277..bb5c6bf018006 100644 --- a/cpp/src/arrow/array/array_nested.cc +++ b/cpp/src/arrow/array/array_nested.cc @@ -807,7 +807,7 @@ MapArray::MapArray(const std::shared_ptr& type, int64_t length, Result> MapArray::FromArraysInternal( std::shared_ptr type, const std::shared_ptr& offsets, const std::shared_ptr& keys, const std::shared_ptr& items, - MemoryPool* pool) { + MemoryPool* pool, const std::shared_ptr& null_bitmap) { using offset_type = typename MapType::offset_type; using OffsetArrowType = typename CTypeTraits::ArrowType; @@ -827,6 +827,15 @@ Result> MapArray::FromArraysInternal( return Status::Invalid("Map key and item arrays must be equal length"); } + if (null_bitmap != nullptr && offsets->null_count() > 0) { + return Status::Invalid( + "Ambiguous to specify both validity map and offsets with nulls"); + } + + if (null_bitmap != nullptr && offsets->offset() != 0) { + return Status::NotImplemented("Null bitmap with offsets slice not supported."); + } + if (offsets->null_count() > 0) { ARROW_ASSIGN_OR_RAISE(auto buffers, CleanListOffsets(NULLPTR, *offsets, pool)); @@ -836,24 +845,32 @@ Result> MapArray::FromArraysInternal( using OffsetArrayType = typename TypeTraits::ArrayType; const auto& typed_offsets = checked_cast(*offsets); - auto buffers = BufferVector({nullptr, typed_offsets.values()}); + + BufferVector buffers; + int64_t null_count; + if (null_bitmap != nullptr) { + buffers = BufferVector({std::move(null_bitmap), typed_offsets.values()}); + null_count = null_bitmap->size(); + } else { + buffers = BufferVector({null_bitmap, typed_offsets.values()}); + null_count = 0; + } return std::make_shared(type, offsets->length() - 1, std::move(buffers), keys, - items, /*null_count=*/0, offsets->offset()); + items, /*null_count=*/null_count, offsets->offset()); } -Result> MapArray::FromArrays(const std::shared_ptr& offsets, - const std::shared_ptr& keys, - const std::shared_ptr& items, - MemoryPool* pool) { +Result> MapArray::FromArrays( + const std::shared_ptr& offsets, const std::shared_ptr& keys, + const std::shared_ptr& items, MemoryPool* pool, + const std::shared_ptr& null_bitmap) { return FromArraysInternal(std::make_shared(keys->type(), items->type()), - offsets, keys, items, pool); + offsets, keys, items, pool, null_bitmap); } -Result> MapArray::FromArrays(std::shared_ptr type, - const std::shared_ptr& offsets, - const std::shared_ptr& keys, - const std::shared_ptr& items, - MemoryPool* pool) { +Result> MapArray::FromArrays( + std::shared_ptr type, const std::shared_ptr& offsets, + const std::shared_ptr& keys, const std::shared_ptr& items, + MemoryPool* pool, const std::shared_ptr& null_bitmap) { if (type->id() != Type::MAP) { return Status::TypeError("Expected map type, got ", type->ToString()); } @@ -864,7 +881,7 @@ Result> MapArray::FromArrays(std::shared_ptr ty if (!map_type.item_type()->Equals(items->type())) { return Status::TypeError("Mismatching map items type"); } - return FromArraysInternal(std::move(type), offsets, keys, items, pool); + return FromArraysInternal(std::move(type), offsets, keys, items, pool, null_bitmap); } Status MapArray::ValidateChildData( diff --git a/cpp/src/arrow/array/array_nested.h b/cpp/src/arrow/array/array_nested.h index 5744f5fcadf05..f96b6bd3b1346 100644 --- a/cpp/src/arrow/array/array_nested.h +++ b/cpp/src/arrow/array/array_nested.h @@ -532,15 +532,18 @@ class ARROW_EXPORT MapArray : public ListArray { /// \param[in] keys Array containing key values /// \param[in] items Array containing item values /// \param[in] pool MemoryPool in case new offsets array needs to be + /// \param[in] null_bitmap Optional validity bitmap /// allocated because of null values static Result> FromArrays( const std::shared_ptr& offsets, const std::shared_ptr& keys, - const std::shared_ptr& items, MemoryPool* pool = default_memory_pool()); + const std::shared_ptr& items, MemoryPool* pool = default_memory_pool(), + const std::shared_ptr& null_bitmap = NULLPTR); static Result> FromArrays( std::shared_ptr type, const std::shared_ptr& offsets, const std::shared_ptr& keys, const std::shared_ptr& items, - MemoryPool* pool = default_memory_pool()); + MemoryPool* pool = default_memory_pool(), + const std::shared_ptr& null_bitmap = NULLPTR); const MapType* map_type() const { return map_type_; } @@ -560,7 +563,7 @@ class ARROW_EXPORT MapArray : public ListArray { static Result> FromArraysInternal( std::shared_ptr type, const std::shared_ptr& offsets, const std::shared_ptr& keys, const std::shared_ptr& items, - MemoryPool* pool); + MemoryPool* pool, const std::shared_ptr& null_bitmap = NULLPTR); private: const MapType* map_type_; diff --git a/python/pyarrow/array.pxi b/python/pyarrow/array.pxi index 406830ad4dd69..3c26e85887466 100644 --- a/python/pyarrow/array.pxi +++ b/python/pyarrow/array.pxi @@ -3060,7 +3060,7 @@ cdef class MapArray(ListArray): """ @staticmethod - def from_arrays(offsets, keys, items, DataType type=None, MemoryPool pool=None): + def from_arrays(offsets, keys, items, DataType type=None, MemoryPool pool=None, mask=None): """ Construct MapArray from arrays of int32 offsets and key, item arrays. @@ -3072,6 +3072,8 @@ cdef class MapArray(ListArray): type : DataType, optional If not specified, a default MapArray with the keys' and items' type is used. pool : MemoryPool + mask : Array (boolean type), optional + Indicate which values are null (True) or not null (False). Returns ------- @@ -3153,24 +3155,27 @@ cdef class MapArray(ListArray): cdef: Array _offsets, _keys, _items shared_ptr[CArray] out + shared_ptr[CBuffer] c_mask cdef CMemoryPool* cpool = maybe_unbox_memory_pool(pool) _offsets = asarray(offsets, type='int32') _keys = asarray(keys) _items = asarray(items) + c_mask = c_mask_inverted_from_obj(mask, pool) + if type is not None: with nogil: out = GetResultValue( CMapArray.FromArraysAndType( type.sp_type, _offsets.sp_array, - _keys.sp_array, _items.sp_array, cpool)) + _keys.sp_array, _items.sp_array, cpool, c_mask)) else: with nogil: out = GetResultValue( CMapArray.FromArrays(_offsets.sp_array, _keys.sp_array, - _items.sp_array, cpool)) + _items.sp_array, cpool, c_mask)) cdef Array result = pyarrow_wrap_array(out) result.validate() return result diff --git a/python/pyarrow/includes/libarrow.pxd b/python/pyarrow/includes/libarrow.pxd index a66f584b83f5b..0d63ec6be38d8 100644 --- a/python/pyarrow/includes/libarrow.pxd +++ b/python/pyarrow/includes/libarrow.pxd @@ -823,7 +823,9 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: const shared_ptr[CArray]& offsets, const shared_ptr[CArray]& keys, const shared_ptr[CArray]& items, - CMemoryPool* pool) + CMemoryPool* pool, + const shared_ptr[CBuffer] null_bitmap, + ) @staticmethod CResult[shared_ptr[CArray]] FromArraysAndType" FromArrays"( @@ -831,7 +833,9 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil: const shared_ptr[CArray]& offsets, const shared_ptr[CArray]& keys, const shared_ptr[CArray]& items, - CMemoryPool* pool) + CMemoryPool* pool, + const shared_ptr[CBuffer] null_bitmap, + ) shared_ptr[CArray] keys() shared_ptr[CArray] items() diff --git a/python/pyarrow/tests/test_array.py b/python/pyarrow/tests/test_array.py index b89e0ace157af..49a00517fca9f 100644 --- a/python/pyarrow/tests/test_array.py +++ b/python/pyarrow/tests/test_array.py @@ -1079,6 +1079,40 @@ def test_map_from_arrays(): pa.int64() )) + # pass in null bitmap with type + result = pa.MapArray.from_arrays([0, 2, 2, 6], keys, items, pa.map_( + keys.type, + items.type), + mask=pa.array([False, True, False], type=pa.bool_()) + ) + assert result.equals(expected) + + # pass in null bitmap without the type + result = pa.MapArray.from_arrays([0, 2, 2, 6], keys, items, + mask=pa.array([False, True, False], + type=pa.bool_()) + ) + assert result.equals(expected) + + # error if null bitmap and offsets with nulls passed + msg1 = 'Ambiguous to specify both validity map and offsets with nulls' + with pytest.raises(pa.ArrowInvalid, match=msg1): + pa.MapArray.from_arrays(offsets, keys, items, pa.map_( + keys.type, + items.type), + mask=pa.array([False, True, False], type=pa.bool_()) + ) + + # error if null bitmap passed to sliced offset + msg2 = 'Null bitmap with offsets slice not supported.' + offsets = pa.array(offsets, pa.int32()) + with pytest.raises(pa.ArrowNotImplementedError, match=msg2): + pa.MapArray.from_arrays(offsets.slice(2), keys, items, pa.map_( + keys.type, + items.type), + mask=pa.array([False, True, False], type=pa.bool_()) + ) + # check invalid usage offsets = [0, 1, 3, 5] keys = np.arange(5)