Skip to content

Commit

Permalink
fix: issue #3118 and #3185 (#3193)
Browse files Browse the repository at this point in the history
* [Indexed*Array] adding '_trim()' method to fix exponential memory growth

* [test] enhance test_3118 with a third array

* [Indexed*Array] _trim: safer index treating

* [test] add two tests, one with 2 and one with 3 arrays

* style: pre-commit fixes

* [Indexed*Array] _trim: properly handle typetracers (and all negative indices for IndexOptionArrays)

* [test] revert commit 698b015 for 'tests/test_2713_from_buffers_allow_noncanonical.py'

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
pfackeldey and pre-commit-ci[bot] authored Aug 1, 2024
1 parent 39541db commit 55a9a43
Show file tree
Hide file tree
Showing 5 changed files with 79 additions and 15 deletions.
17 changes: 17 additions & 0 deletions src/awkward/contents/indexedarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -626,6 +626,7 @@ def _mergemany(self, others: Sequence[Content]) -> Content:
if isinstance(
array, (ak.contents.IndexedOptionArray, ak.contents.IndexedArray)
):
array = array._trim() # see: #3185 and #3119
parameters = parameters_intersect(parameters, array._parameters)

contents.append(array.content)
Expand Down Expand Up @@ -1177,3 +1178,19 @@ def _is_equal_to(
other.content, index_dtype, numpyarray, all_parameters
)
)

def _trim(self) -> Self:
nplike = self._backend.index_nplike

if not nplike.known_data or self._index.length == 0:
return self

idx_buf = nplike.asarray(self._index.data, copy=True)
min_idx = nplike.min(idx_buf)
max_idx = nplike.max(idx_buf)
idx_buf -= min_idx
index = Index(idx_buf)

# left and right trim
content = self._content._getitem_range(min_idx, max_idx + 1)
return IndexedArray(index, content, parameters=self._parameters)
23 changes: 23 additions & 0 deletions src/awkward/contents/indexedoptionarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -761,6 +761,7 @@ def _mergemany(self, others: Sequence[Content]) -> Content:
if isinstance(
array, (ak.contents.IndexedOptionArray, ak.contents.IndexedArray)
):
array = array._trim() # see: #3185 and #3119
# If we're merging an option, then merge parameters before pulling out `content`
parameters = parameters_intersect(parameters, array._parameters)
contents.append(array.content)
Expand Down Expand Up @@ -1767,6 +1768,28 @@ def _is_equal_to(
)
)

def _trim(self) -> Self:
nplike = self._backend.index_nplike

if not nplike.known_data or self._index.length == 0:
return self

idx_buf = nplike.asarray(self._index.data, copy=True)
only_positive = idx_buf >= 0

# no positive index at all
if not nplike.any(only_positive):
return self

min_idx = nplike.min(idx_buf[only_positive])
max_idx = nplike.max(idx_buf[only_positive])
idx_buf[only_positive] -= min_idx
index = Index(idx_buf)

# left and right trim
content = self._content._getitem_range(min_idx, max_idx + 1)
return IndexedOptionArray(index, content, parameters=self._parameters)


def create_missing_data(dtype, backend):
"""Create missing data based on the input dtype
Expand Down
4 changes: 2 additions & 2 deletions src/awkward/contents/unionarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -431,7 +431,7 @@ def simplified(
]

if len(contents) == 1:
next = contents[0]._carry(index, False)
next = contents[0]._carry(index, True)
return next.copy(parameters=parameters_union(next._parameters, parameters))

else:
Expand Down Expand Up @@ -702,7 +702,7 @@ def project(self, index):
nextcarry = ak.index.Index64(
tmpcarry.data[: lenout[0]], nplike=self._backend.index_nplike
)
return self._contents[index]._carry(nextcarry, False)
return self._contents[index]._carry(nextcarry, True)

@staticmethod
def regular_index(
Expand Down
6 changes: 3 additions & 3 deletions tests/test_2713_from_buffers_allow_noncanonical.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,8 @@ def test_union_simplification():
)

assert projected.layout.form.to_dict(verbose=False) == {
"class": "RecordArray",
"fields": ["x"],
"contents": ["int64"],
"class": "IndexedArray",
"index": "i64",
"content": {"class": "RecordArray", "fields": ["x"], "contents": ["int64"]},
}
assert ak.almost_equal(array[["x"]], projected)
44 changes: 34 additions & 10 deletions tests/test_3118_prevent_exponential_memory_growth_in_unionarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,17 @@
import awkward as ak


def test():
def check(layout, assert_length):
if hasattr(layout, "contents"):
for x in layout.contents:
check(x, assert_length)
elif hasattr(layout, "content"):
check(layout.content, assert_length)
else:
assert layout.length <= assert_length


def test_2arrays():
one_a = ak.Array([{"x": 1, "y": 2}], with_name="T")
one_b = ak.Array([{"x": 1, "y": 2}], with_name="T")
two_a = ak.Array([{"x": 1, "z": 3}], with_name="T")
Expand All @@ -19,16 +29,30 @@ def test():

cat["another"] = three

def check(layout):
if hasattr(layout, "contents"):
for x in layout.contents:
check(x)
elif hasattr(layout, "content"):
check(layout.content)
else:
assert layout.length <= 2
for _ in range(5):
check(cat.layout, 2)

cat["another", "w"] = three.x


def test_3arrays():
zero_a = ak.Array([{"x": 1, "y": 1}], with_name="T")
zero_b = ak.Array([{"x": 1, "v": 1}], with_name="T")
one_a = ak.Array([{"x": 1, "y": 2}], with_name="T")
one_b = ak.Array([{"x": 1, "y": 2}], with_name="T")
two_a = ak.Array([{"x": 1, "z": 3}], with_name="T")
two_b = ak.Array([{"x": 1, "z": 3}], with_name="T")
three = ak.Array([{"x": 4}, {"x": 4}, {"x": 4}], with_name="T")

zeroth = ak.zip({"a": zero_a, "b": zero_b})
first = ak.zip({"a": one_a, "b": one_b})
second = ak.zip({"a": two_a, "b": two_b})

cat = ak.concatenate([zeroth, first, second], axis=0)

cat["another"] = three

for _ in range(5):
check(cat.layout)
check(cat.layout, 3)

cat["another", "w"] = three.x

0 comments on commit 55a9a43

Please sign in to comment.