Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions src/datasets/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -2120,6 +2120,15 @@ def embed_array_storage(array: pa.Array, feature: "FeatureType", token_per_repo_

if isinstance(array, pa.ExtensionArray):
array = array.storage

# Force contiguous copy for sliced list arrays to avoid SIGKILL crash.
# When ds.shard() or ds.select() creates a sliced view, array.values returns
# values with internal offset references that can cause PyArrow's C++ layer
# to crash when processing nested types like Sequence(Nifti()).
if pa.types.is_list(array.type) or pa.types.is_large_list(array.type):
if hasattr(array, "offset") and array.offset > 0:
array = pa.concat_arrays([array])

if hasattr(feature, "embed_storage"):
return feature.embed_storage(array, token_per_repo_id=token_per_repo_id)
elif pa.types.is_struct(array.type):
Expand Down
122 changes: 122 additions & 0 deletions tests/features/test_embed_storage_sliced.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
"""Tests for embed_array_storage with sliced/sharded arrays.

Regression tests for SIGKILL crash when processing sliced/sharded Arrow tables
with nested types like Sequence(Nifti()) or Sequence(Image()).
"""

import pyarrow as pa

from datasets.features import Image, List
from datasets.table import embed_array_storage

from ..utils import require_nibabel


class TestEmbedArrayStorageSliced:
    """Regression tests: embed_array_storage must handle sliced/sharded arrays.

    Covers the SIGKILL crash seen when a dataset view produced by ds.shard()
    or ds.select() (i.e. an Arrow array with a non-zero offset) was embedded
    for nested features such as Sequence(Image()) or Sequence(Nifti()).
    """

    @staticmethod
    def _rows(path):
        """Return four list rows (sizes 1, 2, 0, 1) of path-only storage entries."""
        entry = {"bytes": None, "path": path}
        return [[entry], [entry, entry], [], [entry]]

    def test_embed_array_storage_sliced_list_image(self, shared_datadir):
        """Sliced ListArray of Image storage embeds without crashing.

        Regression test for the SIGKILL on sharded datasets with
        Sequence(Image()) and similar nested types.
        """
        img_path = str(shared_datadir / "test_image_rgb.jpg")

        # Build a 4-row ListArray, then take a 2-row view starting at row 1
        # (mirrors what ds.shard() / ds.select() produce internally).
        full = pa.array(self._rows(img_path), type=pa.list_(Image.pa_type))
        view = full.slice(1, 2)

        # The non-zero offset is the problematic precondition being tested.
        assert view.offset == 1, "Expected sliced array to have non-zero offset"

        # Must not crash with SIGKILL.
        result = embed_array_storage(view, List(Image()))

        # The fix materializes a contiguous copy, so the offset resets.
        assert result.offset == 0, "Result should be contiguous after fix"
        assert len(result) == 2
        # view row 0 == original row 1 (two images); view row 1 == original row 2 (empty).
        assert len(result[0].as_py()) == 2
        assert len(result[1].as_py()) == 0

    @require_nibabel
    def test_embed_array_storage_sliced_list_nifti(self, shared_datadir):
        """Sliced ListArray of Nifti storage embeds without crashing.

        This is the exact configuration that crashed during the ARC dataset
        upload (Sequence(Nifti()) over a sharded table).
        """
        from datasets.features.nifti import Nifti

        nifti_file = str(shared_datadir / "test_nifti.nii.gz")

        # 4 rows, including an empty list — the empty row also triggered the crash.
        full = pa.array(self._rows(nifti_file), type=pa.list_(Nifti.pa_type))

        # Emulate ds.shard() by slicing two rows out of the middle.
        view = full.slice(1, 2)
        assert view.offset == 1, "Expected sliced array to have non-zero offset"

        # Must not crash with SIGKILL.
        result = embed_array_storage(view, List(Nifti()))

        assert result.offset == 0, "Result should be contiguous after fix"
        assert len(result) == 2
        # The file contents should now be inlined into the storage.
        assert result[0].as_py()[0]["bytes"] is not None

    def test_embed_array_storage_sliced_large_list(self, shared_datadir):
        """Sliced LargeListArray of Image storage embeds without crashing."""
        from datasets.features import LargeList

        img_path = str(shared_datadir / "test_image_rgb.jpg")

        # Same 4-row fixture, but backed by a large_list layout.
        full = pa.array(self._rows(img_path), type=pa.large_list(Image.pa_type))
        view = full.slice(1, 2)
        assert view.offset == 1, "Expected sliced array to have non-zero offset"

        # Must not crash with SIGKILL.
        result = embed_array_storage(view, LargeList(Image()))

        assert result.offset == 0, "Result should be contiguous after fix"
        assert len(result) == 2
        # view row 0 == original row 1 (two images).
        assert len(result[0].as_py()) == 2
        # The file contents should now be inlined into the storage.
        assert result[0].as_py()[0]["bytes"] is not None