diff --git a/src/awkward/contents/indexedarray.py b/src/awkward/contents/indexedarray.py index bdb2bb5fb2..6fb4ea3c69 100644 --- a/src/awkward/contents/indexedarray.py +++ b/src/awkward/contents/indexedarray.py @@ -626,6 +626,7 @@ def _mergemany(self, others: Sequence[Content]) -> Content: if isinstance( array, (ak.contents.IndexedOptionArray, ak.contents.IndexedArray) ): + array = array._trim() # see: #3185 and #3119 parameters = parameters_intersect(parameters, array._parameters) contents.append(array.content) @@ -1177,3 +1178,19 @@ def _is_equal_to( other.content, index_dtype, numpyarray, all_parameters ) ) + + def _trim(self) -> Self: + nplike = self._backend.index_nplike + + if not nplike.known_data or self._index.length == 0: + return self + + idx_buf = nplike.asarray(self._index.data, copy=True) + min_idx = nplike.min(idx_buf) + max_idx = nplike.max(idx_buf) + idx_buf -= min_idx + index = Index(idx_buf) + + # left and right trim + content = self._content._getitem_range(min_idx, max_idx + 1) + return IndexedArray(index, content, parameters=self._parameters) diff --git a/src/awkward/contents/indexedoptionarray.py b/src/awkward/contents/indexedoptionarray.py index e296b2fdab..0e68461dc5 100644 --- a/src/awkward/contents/indexedoptionarray.py +++ b/src/awkward/contents/indexedoptionarray.py @@ -761,6 +761,7 @@ def _mergemany(self, others: Sequence[Content]) -> Content: if isinstance( array, (ak.contents.IndexedOptionArray, ak.contents.IndexedArray) ): + array = array._trim() # see: #3185 and #3119 # If we're merging an option, then merge parameters before pulling out `content` parameters = parameters_intersect(parameters, array._parameters) contents.append(array.content) @@ -1767,6 +1768,28 @@ def _is_equal_to( ) ) + def _trim(self) -> Self: + nplike = self._backend.index_nplike + + if not nplike.known_data or self._index.length == 0: + return self + + idx_buf = nplike.asarray(self._index.data, copy=True) + only_positive = idx_buf >= 0 + + # no positive index at all + if not nplike.any(only_positive): + return self + + min_idx = nplike.min(idx_buf[only_positive]) + max_idx = nplike.max(idx_buf[only_positive]) + idx_buf[only_positive] -= min_idx + index = Index(idx_buf) + + # left and right trim + content = self._content._getitem_range(min_idx, max_idx + 1) + return IndexedOptionArray(index, content, parameters=self._parameters) + def create_missing_data(dtype, backend): """Create missing data based on the input dtype diff --git a/src/awkward/contents/unionarray.py b/src/awkward/contents/unionarray.py index 213b5c84b1..bb3b6ca929 100644 --- a/src/awkward/contents/unionarray.py +++ b/src/awkward/contents/unionarray.py @@ -431,7 +431,7 @@ def simplified( ] if len(contents) == 1: - next = contents[0]._carry(index, False) + next = contents[0]._carry(index, True) return next.copy(parameters=parameters_union(next._parameters, parameters)) else: @@ -702,7 +702,7 @@ def project(self, index): nextcarry = ak.index.Index64( tmpcarry.data[: lenout[0]], nplike=self._backend.index_nplike ) - return self._contents[index]._carry(nextcarry, False) + return self._contents[index]._carry(nextcarry, True) @staticmethod def regular_index( diff --git a/tests/test_2713_from_buffers_allow_noncanonical.py b/tests/test_2713_from_buffers_allow_noncanonical.py index 58df61bf8b..bda8e30ed9 100644 --- a/tests/test_2713_from_buffers_allow_noncanonical.py +++ b/tests/test_2713_from_buffers_allow_noncanonical.py @@ -122,8 +122,8 @@ def test_union_simplification(): ) assert projected.layout.form.to_dict(verbose=False) == { - "class": "RecordArray", - "fields": ["x"], - "contents": ["int64"], + "class": "IndexedArray", + "index": "i64", + "content": {"class": "RecordArray", "fields": ["x"], "contents": ["int64"]}, } assert ak.almost_equal(array[["x"]], projected) diff --git a/tests/test_3118_prevent_exponential_memory_growth_in_unionarray.py b/tests/test_3118_prevent_exponential_memory_growth_in_unionarray.py index 38eca227fd..51456355fc 100644 --- a/tests/test_3118_prevent_exponential_memory_growth_in_unionarray.py +++ b/tests/test_3118_prevent_exponential_memory_growth_in_unionarray.py @@ -5,7 +5,17 @@ import awkward as ak -def test(): +def check(layout, assert_length): + if hasattr(layout, "contents"): + for x in layout.contents: + check(x, assert_length) + elif hasattr(layout, "content"): + check(layout.content, assert_length) + else: + assert layout.length <= assert_length + + +def test_2arrays(): one_a = ak.Array([{"x": 1, "y": 2}], with_name="T") one_b = ak.Array([{"x": 1, "y": 2}], with_name="T") two_a = ak.Array([{"x": 1, "z": 3}], with_name="T") @@ -19,16 +29,30 @@ def test(): cat["another"] = three - def check(layout): - if hasattr(layout, "contents"): - for x in layout.contents: - check(x) - elif hasattr(layout, "content"): - check(layout.content) - else: - assert layout.length <= 2 + for _ in range(5): + check(cat.layout, 2) + + cat["another", "w"] = three.x + + +def test_3arrays(): + zero_a = ak.Array([{"x": 1, "y": 1}], with_name="T") + zero_b = ak.Array([{"x": 1, "v": 1}], with_name="T") + one_a = ak.Array([{"x": 1, "y": 2}], with_name="T") + one_b = ak.Array([{"x": 1, "y": 2}], with_name="T") + two_a = ak.Array([{"x": 1, "z": 3}], with_name="T") + two_b = ak.Array([{"x": 1, "z": 3}], with_name="T") + three = ak.Array([{"x": 4}, {"x": 4}, {"x": 4}], with_name="T") + + zeroth = ak.zip({"a": zero_a, "b": zero_b}) + first = ak.zip({"a": one_a, "b": one_b}) + second = ak.zip({"a": two_a, "b": two_b}) + + cat = ak.concatenate([zeroth, first, second], axis=0) + + cat["another"] = three for _ in range(5): - check(cat.layout) + check(cat.layout, 3) cat["another", "w"] = three.x