Skip to content

Commit

Permalink
refactor!: validate ArrayBytes::Variable
Browse files Browse the repository at this point in the history
  • Loading branch information
LDeakin committed Jan 22, 2025
1 parent 177f701 commit feb8cce
Show file tree
Hide file tree
Showing 8 changed files with 100 additions and 21 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- **Breaking**: change `ArraySubset::inbounds` to take another subset rather than a shape
- **Breaking**: `CodecError` enum changes:
- Change `CodecError::UnexpectedChunkDecodedSize` to an `InvalidBytesLengthError`
- Add `CodecError::{InvalidArrayShape,InvalidNumberOfElements,SubsetOutOfBounds}`
- Add `CodecError::{InvalidArrayShape,InvalidNumberOfElements,SubsetOutOfBounds,RawBytesOffsetsCreate,RawBytesOffsetsOutOfBounds}`
- **Breaking**: Change output args to `ArrayBytesFixedDisjointView` and make safe the following:
- `Array::[async_]retrieve_chunk[_subset]_into`
- `[Async]ArrayPartialDecoderTraits::partial_decode_into`
- `ArrayToBytesCodecTraits::decode_into`
- `zarrs::array::copy_fill_value_into`
- `zarrs::array::update_array_bytes`
- **Breaking**: change `RawBytesOffsets` into a validated newtype
- **Breaking**: `ArrayBytes::new_vlen()` not returns a `Result` and validates bytes/offsets compatibility

## [0.19.1] - 2025-01-19

Expand Down
2 changes: 1 addition & 1 deletion zarrs/src/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ pub use self::{
array_builder::ArrayBuilder,
array_bytes::{
copy_fill_value_into, update_array_bytes, ArrayBytes, ArrayBytesError, RawBytes,
RawBytesOffsets, RawBytesOffsetsCreateError,
RawBytesOffsets, RawBytesOffsetsCreateError, RawBytesOffsetsOutOfBoundsError,
},
array_bytes_fixed_disjoint_view::{
ArrayBytesFixedDisjointView, ArrayBytesFixedDisjointViewCreateError,
Expand Down
84 changes: 73 additions & 11 deletions zarrs/src/array/array_bytes.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use std::borrow::Cow;

use derive_more::derive::Display;
use itertools::Itertools;
use thiserror::Error;
use unsafe_cell_slice::UnsafeCellSlice;
Expand Down Expand Up @@ -38,9 +39,18 @@ pub enum ArrayBytes<'a> {
/// The bytes and offsets are modeled on the [Apache Arrow Variable-size Binary Layout](https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout).
/// - The offsets buffer contains length + 1 ~~signed integers (either 32-bit or 64-bit, depending on the data type)~~ usize integers.
/// - Offsets must be monotonically increasing, that is `offsets[j+1] >= offsets[j]` for `0 <= j < length`, even for null slots. Thus, the bytes represent C-contiguous elements with padding permitted.
/// - The final offset must be less than or equal to the length of the bytes buffer.
Variable(RawBytes<'a>, RawBytesOffsets<'a>),
}

/// An error raised if variable length array bytes offsets are out of bounds.
#[derive(Debug, Error, Display)]
#[display("Offset {offset} is out of bounds for bytes of length {len}")]
pub struct RawBytesOffsetsOutOfBoundsError {
offset: usize,
len: usize,
}

/// Errors related to [`ArrayBytes<'_>`] and [`ArrayBytes`].
#[derive(Debug, Error)]
pub enum ArrayBytesError {
Expand All @@ -58,8 +68,40 @@ impl<'a> ArrayBytes<'a> {
}

/// Create a new variable length array bytes from `bytes` and `offsets`.
pub fn new_vlen(bytes: impl Into<RawBytes<'a>>, offsets: RawBytesOffsets<'a>) -> Self {
Self::Variable(bytes.into(), offsets)
///
/// # Errors
/// Returns a [`RawBytesOffsetsOutOfBoundsError`] if the last offset is out of bounds of the bytes.
pub fn new_vlen(
bytes: impl Into<RawBytes<'a>>,
offsets: RawBytesOffsets<'a>,
) -> Result<Self, RawBytesOffsetsOutOfBoundsError> {
let bytes = bytes.into();
match offsets.last() {
Some(&last) => {
if last <= bytes.len() {
Ok(Self::Variable(bytes, offsets))
} else {
Err(RawBytesOffsetsOutOfBoundsError {
offset: last,
len: bytes.len(),
})

Check warning on line 87 in zarrs/src/array/array_bytes.rs

View check run for this annotation

Codecov / codecov/patch

zarrs/src/array/array_bytes.rs#L84-L87

Added lines #L84 - L87 were not covered by tests
}
}
None => Err(RawBytesOffsetsOutOfBoundsError { offset: 0, len: 0 }),

Check warning on line 90 in zarrs/src/array/array_bytes.rs

View check run for this annotation

Codecov / codecov/patch

zarrs/src/array/array_bytes.rs#L90

Added line #L90 was not covered by tests
}
}

/// Create a new variable length array bytes from `bytes` and `offsets` without checking the offsets.
///
/// # Safety
/// The last offset must be less than or equal to the length of the bytes.
pub unsafe fn new_vlen_unchecked(
bytes: impl Into<RawBytes<'a>>,
offsets: RawBytesOffsets<'a>,
) -> Self {
let bytes = bytes.into();
debug_assert!(offsets.last().is_some_and(|&last| last <= bytes.len()));
Self::Variable(bytes, offsets)
}

/// Create a new [`ArrayBytes`] with `num_elements` composed entirely of the `fill_value`.
Expand All @@ -78,14 +120,18 @@ impl<'a> ArrayBytes<'a> {
}
ArraySize::Variable { num_elements } => {
let num_elements = usize::try_from(num_elements).unwrap();
Self::new_vlen(fill_value.as_ne_bytes().repeat(num_elements), unsafe {
let offsets = unsafe {
// SAFETY: The offsets are monotonically increasing.
RawBytesOffsets::new_unchecked(
(0..=num_elements)
.map(|i| i * fill_value.size())
.collect::<Vec<_>>(),
)
})
};
unsafe {
// SAFETY: The last offset is equal to the length of the bytes
Self::new_vlen_unchecked(fill_value.as_ne_bytes().repeat(num_elements), offsets)
}
}
}
}
Expand Down Expand Up @@ -135,9 +181,9 @@ impl<'a> ArrayBytes<'a> {
#[must_use]
pub fn into_owned<'b>(self) -> ArrayBytes<'b> {
match self {
Self::Fixed(bytes) => ArrayBytes::<'b>::new_flen(bytes.into_owned()),
Self::Fixed(bytes) => ArrayBytes::<'b>::Fixed(bytes.into_owned().into()),
Self::Variable(bytes, offsets) => {
ArrayBytes::<'b>::new_vlen(bytes.into_owned(), offsets.into_owned())
ArrayBytes::<'b>::Variable(bytes.into_owned().into(), offsets.into_owned())
}
}
}
Expand Down Expand Up @@ -206,7 +252,11 @@ impl<'a> ArrayBytes<'a> {
// SAFETY: The offsets are monotonically increasing.
RawBytesOffsets::new_unchecked(ss_offsets)
};
Ok(ArrayBytes::new_vlen(ss_bytes, ss_offsets))
let array_bytes = unsafe {
// SAFETY: The last offset is equal to the length of the bytes
ArrayBytes::new_vlen_unchecked(ss_bytes, ss_offsets)
};
Ok(array_bytes)
}
ArrayBytes::Fixed(bytes) => {
let byte_ranges =
Expand Down Expand Up @@ -337,8 +387,11 @@ pub(crate) fn update_bytes_vlen<'a>(
// SAFETY: The offsets are monotonically increasing.
RawBytesOffsets::new_unchecked(offsets_new)
};

Ok(ArrayBytes::new_vlen(bytes_new, offsets_new))
let array_bytes = unsafe {
// SAFETY: The last offset is equal to the length of the bytes
ArrayBytes::new_vlen_unchecked(bytes_new, offsets_new)
};
Ok(array_bytes)
}

/// Update a subset of an array.
Expand Down Expand Up @@ -462,7 +515,12 @@ pub(crate) fn merge_chunks_vlen<'a>(
}
}

Ok(ArrayBytes::new_vlen(bytes, offsets))
let array_bytes = unsafe {
// SAFETY: The last offset is equal to the length of the bytes
ArrayBytes::new_vlen_unchecked(bytes, offsets)
};

Ok(array_bytes)
}

pub(crate) fn extract_decoded_regions_vlen<'a>(
Expand Down Expand Up @@ -496,7 +554,11 @@ pub(crate) fn extract_decoded_regions_vlen<'a>(
// SAFETY: The offsets are monotonically increasing.
RawBytesOffsets::new_unchecked(region_offsets)
};
out.push(ArrayBytes::new_vlen(region_bytes, region_offsets));
let array_bytes = unsafe {
// SAFETY: The last offset is equal to the length of the bytes
ArrayBytes::new_vlen_unchecked(region_bytes, region_offsets)
};
out.push(array_bytes);
}
Ok(out)
}
Expand Down
4 changes: 4 additions & 0 deletions zarrs/src/array/codec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ use std::any::Any;
use std::borrow::Cow;
use std::sync::Arc;

use super::RawBytesOffsetsOutOfBoundsError;
use super::{
array_bytes::RawBytesOffsetsCreateError, concurrency::RecommendedConcurrency, ArrayBytes,
ArrayBytesFixedDisjointView, BytesRepresentation, ChunkRepresentation, ChunkShape, DataType,
Expand Down Expand Up @@ -1063,6 +1064,9 @@ pub enum CodecError {
/// Invalid byte offsets for variable length data.
#[error(transparent)]
RawBytesOffsetsCreate(#[from] RawBytesOffsetsCreateError),
/// Variable length array bytes offsets are out of bounds.
#[error(transparent)]
RawBytesOffsetsOutOfBounds(#[from] RawBytesOffsetsOutOfBoundsError),
}

impl From<&str> for CodecError {
Expand Down
7 changes: 5 additions & 2 deletions zarrs/src/array/codec/array_to_array/transpose.rs
Original file line number Diff line number Diff line change
Expand Up @@ -124,8 +124,11 @@ fn transpose_vlen<'a>(
// SAFETY: The offsets are monotonically increasing.
RawBytesOffsets::new_unchecked(offsets_new)
};

ArrayBytes::new_vlen(bytes_new, offsets_new)
let array_bytes = unsafe {
// SAFETY: The last offset is equal to the length of the bytes
ArrayBytes::new_vlen_unchecked(bytes_new, offsets_new)
};
array_bytes
}

#[cfg(test)]
Expand Down
6 changes: 3 additions & 3 deletions zarrs/src/array/codec/array_to_bytes/vlen/vlen_codec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -265,16 +265,16 @@ impl ArrayToBytesCodecTraits for VlenCodec {
}
}
.unwrap();
let (data, offsets) = super::get_vlen_bytes_and_offsets(
let (bytes, offsets) = super::get_vlen_bytes_and_offsets(
&index_chunk_rep,
&bytes,
&self.index_codecs,
&self.data_codecs,
options,
)?;
let offsets = RawBytesOffsets::new(offsets)?;

Ok(ArrayBytes::new_vlen(data, offsets))
let array_bytes = ArrayBytes::new_vlen(bytes, offsets)?;
Ok(array_bytes)
}

fn partial_decoder(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,8 @@ impl ArrayToBytesCodecTraits for VlenV2Codec {
let num_elements = decoded_representation.num_elements_usize();
let (bytes, offsets) = super::get_interleaved_bytes_and_offsets(num_elements, &bytes)?;
let offsets = RawBytesOffsets::new(offsets)?;
Ok(ArrayBytes::new_vlen(bytes, offsets))
let array_bytes = ArrayBytes::new_vlen(bytes, offsets)?;
Ok(array_bytes)
}

fn partial_decoder(
Expand Down
12 changes: 10 additions & 2 deletions zarrs/src/array/element.rs
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,11 @@ macro_rules! impl_element_string {
for element in elements {
bytes.extend_from_slice(element.as_bytes());
}
Ok(ArrayBytes::new_vlen(bytes, offsets))
let array_bytes = unsafe {
// SAFETY: The last offset is the length of the bytes.
ArrayBytes::new_vlen_unchecked(bytes, offsets)
};
Ok(array_bytes)
}
}
};
Expand Down Expand Up @@ -252,7 +256,11 @@ macro_rules! impl_element_binary {
// Concatenate bytes
let bytes = elements.concat();

Ok(ArrayBytes::new_vlen(bytes, offsets))
let array_bytes = unsafe {
// SAFETY: The last offset is the length of the bytes.
ArrayBytes::new_vlen_unchecked(bytes, offsets)
};
Ok(array_bytes)
}
}
};
Expand Down

0 comments on commit feb8cce

Please sign in to comment.