Skip to content

Commit

Permalink
refactor!: validate ArrayBytes::Variable (#138)
Browse files Browse the repository at this point in the history
  • Loading branch information
LDeakin authored Jan 22, 2025
1 parent 177f701 commit 4c2d73c
Show file tree
Hide file tree
Showing 9 changed files with 132 additions and 27 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,14 +16,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- **Breaking**: change `ArraySubset::inbounds` to take another subset rather than a shape
- **Breaking**: `CodecError` enum changes:
- Change `CodecError::UnexpectedChunkDecodedSize` to an `InvalidBytesLengthError`
- Add `CodecError::{InvalidArrayShape,InvalidNumberOfElements,SubsetOutOfBounds}`
- Add `CodecError::{InvalidArrayShape,InvalidNumberOfElements,SubsetOutOfBounds,RawBytesOffsetsCreate,RawBytesOffsetsOutOfBounds}`
- **Breaking**: Change output args to `ArrayBytesFixedDisjointView` and make safe the following:
- `Array::[async_]retrieve_chunk[_subset]_into`
- `[Async]ArrayPartialDecoderTraits::partial_decode_into`
- `ArrayToBytesCodecTraits::decode_into`
- `zarrs::array::copy_fill_value_into`
- `zarrs::array::update_array_bytes`
- **Breaking**: change `RawBytesOffsets` into a validated newtype
- **Breaking**: `ArrayBytes::new_vlen()` not returns a `Result` and validates bytes/offsets compatibility

## [0.19.1] - 2025-01-19

Expand Down
2 changes: 1 addition & 1 deletion zarrs/src/array.rs
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ pub use self::{
array_builder::ArrayBuilder,
array_bytes::{
copy_fill_value_into, update_array_bytes, ArrayBytes, ArrayBytesError, RawBytes,
RawBytesOffsets, RawBytesOffsetsCreateError,
RawBytesOffsets, RawBytesOffsetsCreateError, RawBytesOffsetsOutOfBoundsError,
},
array_bytes_fixed_disjoint_view::{
ArrayBytesFixedDisjointView, ArrayBytesFixedDisjointViewCreateError,
Expand Down
92 changes: 80 additions & 12 deletions zarrs/src/array/array_bytes.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use std::borrow::Cow;

use derive_more::derive::Display;
use itertools::Itertools;
use thiserror::Error;
use unsafe_cell_slice::UnsafeCellSlice;
Expand Down Expand Up @@ -38,9 +39,18 @@ pub enum ArrayBytes<'a> {
/// The bytes and offsets are modeled on the [Apache Arrow Variable-size Binary Layout](https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout).
/// - The offsets buffer contains length + 1 ~~signed integers (either 32-bit or 64-bit, depending on the data type)~~ usize integers.
/// - Offsets must be monotonically increasing, that is `offsets[j+1] >= offsets[j]` for `0 <= j < length`, even for null slots. Thus, the bytes represent C-contiguous elements with padding permitted.
/// - The final offset must be less than or equal to the length of the bytes buffer.
Variable(RawBytes<'a>, RawBytesOffsets<'a>),
}

/// An error raised if variable length array bytes offsets are out of bounds.
#[derive(Debug, Error, Display)]
#[display("Offset {offset} is out of bounds for bytes of length {len}")]
pub struct RawBytesOffsetsOutOfBoundsError {
offset: usize,
len: usize,
}

/// Errors related to [`ArrayBytes<'_>`] and [`ArrayBytes`].
#[derive(Debug, Error)]
pub enum ArrayBytesError {
Expand All @@ -58,8 +68,35 @@ impl<'a> ArrayBytes<'a> {
}

/// Create a new variable length array bytes from `bytes` and `offsets`.
pub fn new_vlen(bytes: impl Into<RawBytes<'a>>, offsets: RawBytesOffsets<'a>) -> Self {
Self::Variable(bytes.into(), offsets)
///
/// # Errors
/// Returns a [`RawBytesOffsetsOutOfBoundsError`] if the last offset is out of bounds of the bytes.
pub fn new_vlen(
bytes: impl Into<RawBytes<'a>>,
offsets: RawBytesOffsets<'a>,
) -> Result<Self, RawBytesOffsetsOutOfBoundsError> {
let bytes = bytes.into();
if offsets.last() <= bytes.len() {
Ok(Self::Variable(bytes, offsets))
} else {
Err(RawBytesOffsetsOutOfBoundsError {
offset: offsets.last(),
len: bytes.len(),
})
}
}

/// Create a new variable length array bytes from `bytes` and `offsets` without checking the offsets.
///
/// # Safety
/// The last offset must be less than or equal to the length of the bytes.
pub unsafe fn new_vlen_unchecked(
bytes: impl Into<RawBytes<'a>>,
offsets: RawBytesOffsets<'a>,
) -> Self {
let bytes = bytes.into();
debug_assert!(offsets.last() <= bytes.len());
Self::Variable(bytes, offsets)
}

/// Create a new [`ArrayBytes`] with `num_elements` composed entirely of the `fill_value`.
Expand All @@ -78,14 +115,18 @@ impl<'a> ArrayBytes<'a> {
}
ArraySize::Variable { num_elements } => {
let num_elements = usize::try_from(num_elements).unwrap();
Self::new_vlen(fill_value.as_ne_bytes().repeat(num_elements), unsafe {
let offsets = unsafe {
// SAFETY: The offsets are monotonically increasing.
RawBytesOffsets::new_unchecked(
(0..=num_elements)
.map(|i| i * fill_value.size())
.collect::<Vec<_>>(),
)
})
};
unsafe {
// SAFETY: The last offset is equal to the length of the bytes
Self::new_vlen_unchecked(fill_value.as_ne_bytes().repeat(num_elements), offsets)
}
}
}
}
Expand Down Expand Up @@ -135,9 +176,9 @@ impl<'a> ArrayBytes<'a> {
#[must_use]
pub fn into_owned<'b>(self) -> ArrayBytes<'b> {
match self {
Self::Fixed(bytes) => ArrayBytes::<'b>::new_flen(bytes.into_owned()),
Self::Fixed(bytes) => ArrayBytes::<'b>::Fixed(bytes.into_owned().into()),
Self::Variable(bytes, offsets) => {
ArrayBytes::<'b>::new_vlen(bytes.into_owned(), offsets.into_owned())
ArrayBytes::<'b>::Variable(bytes.into_owned().into(), offsets.into_owned())
}
}
}
Expand Down Expand Up @@ -206,7 +247,11 @@ impl<'a> ArrayBytes<'a> {
// SAFETY: The offsets are monotonically increasing.
RawBytesOffsets::new_unchecked(ss_offsets)
};
Ok(ArrayBytes::new_vlen(ss_bytes, ss_offsets))
let array_bytes = unsafe {
// SAFETY: The last offset is equal to the length of the bytes
ArrayBytes::new_vlen_unchecked(ss_bytes, ss_offsets)
};
Ok(array_bytes)
}
ArrayBytes::Fixed(bytes) => {
let byte_ranges =
Expand Down Expand Up @@ -337,8 +382,11 @@ pub(crate) fn update_bytes_vlen<'a>(
// SAFETY: The offsets are monotonically increasing.
RawBytesOffsets::new_unchecked(offsets_new)
};

Ok(ArrayBytes::new_vlen(bytes_new, offsets_new))
let array_bytes = unsafe {
// SAFETY: The last offset is equal to the length of the bytes
ArrayBytes::new_vlen_unchecked(bytes_new, offsets_new)
};
Ok(array_bytes)
}

/// Update a subset of an array.
Expand Down Expand Up @@ -448,7 +496,7 @@ pub(crate) fn merge_chunks_vlen<'a>(

// Write bytes
// TODO: Go parallel
let mut bytes = vec![0; *offsets.last().unwrap()];
let mut bytes = vec![0; offsets.last()];
for (chunk_bytes, chunk_subset) in chunk_bytes_and_subsets {
let (chunk_bytes, chunk_offsets) = chunk_bytes.into_variable()?;
let indices = chunk_subset.linearised_indices(array_shape).unwrap();
Expand All @@ -462,7 +510,12 @@ pub(crate) fn merge_chunks_vlen<'a>(
}
}

Ok(ArrayBytes::new_vlen(bytes, offsets))
let array_bytes = unsafe {
// SAFETY: The last offset is equal to the length of the bytes
ArrayBytes::new_vlen_unchecked(bytes, offsets)
};

Ok(array_bytes)
}

pub(crate) fn extract_decoded_regions_vlen<'a>(
Expand Down Expand Up @@ -496,7 +549,11 @@ pub(crate) fn extract_decoded_regions_vlen<'a>(
// SAFETY: The offsets are monotonically increasing.
RawBytesOffsets::new_unchecked(region_offsets)
};
out.push(ArrayBytes::new_vlen(region_bytes, region_offsets));
let array_bytes = unsafe {
// SAFETY: The last offset is equal to the length of the bytes
ArrayBytes::new_vlen_unchecked(region_bytes, region_offsets)
};
out.push(array_bytes);
}
Ok(out)
}
Expand Down Expand Up @@ -599,6 +656,17 @@ mod tests {
Ok(())
}

#[test]
fn array_bytes_vlen() {
let data = [0u8, 1, 2, 3, 4];
assert!(ArrayBytes::new_vlen(&data, vec![0].try_into().unwrap()).is_ok());
assert!(ArrayBytes::new_vlen(&data, vec![0, 5].try_into().unwrap()).is_ok());
assert!(ArrayBytes::new_vlen(&data, vec![0, 5, 5].try_into().unwrap()).is_ok());
assert!(ArrayBytes::new_vlen(&data, vec![0, 5, 6].try_into().unwrap()).is_err());
assert!(ArrayBytes::new_vlen(&data, vec![0, 1, 3, 5].try_into().unwrap()).is_ok());
assert!(ArrayBytes::new_vlen(&data, vec![0, 1, 3, 6].try_into().unwrap()).is_err());
}

#[test]
fn array_bytes_str() -> Result<(), Box<dyn Error>> {
let data = ["a", "bb", "ccc"];
Expand Down
30 changes: 25 additions & 5 deletions zarrs/src/array/array_bytes/raw_bytes_offsets.rs
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,15 @@ impl Deref for RawBytesOffsets<'_> {
}

/// An error creating [`RawBytesOffsets`].
///
/// This error occurs when the offsets are not monotonically increasing.
#[derive(Debug, Error, Display)]
pub struct RawBytesOffsetsCreateError;
pub enum RawBytesOffsetsCreateError {
/// The offsets length must be greater than zero.
#[display("offsets length must be greater than zero")]
ZeroLength,
/// The offsets are not monotonically increasing.
#[display("offsets are not monotonically increasing")]
NotMonotonicallyIncreasing,
}

impl<'a> RawBytesOffsets<'a> {
/// Creates a new `RawBytesOffsets`.
Expand All @@ -30,10 +35,12 @@ impl<'a> RawBytesOffsets<'a> {
/// Returns an error if the offsets are not monotonically increasing.
pub fn new(offsets: impl Into<Cow<'a, [usize]>>) -> Result<Self, RawBytesOffsetsCreateError> {
let offsets = offsets.into();
if offsets.windows(2).all(|w| w[1] >= w[0]) {
if offsets.is_empty() {
Err(RawBytesOffsetsCreateError::ZeroLength)
} else if offsets.windows(2).all(|w| w[1] >= w[0]) {
Ok(Self(offsets))
} else {
Err(RawBytesOffsetsCreateError)
Err(RawBytesOffsetsCreateError::NotMonotonicallyIncreasing)
}
}

Expand All @@ -44,6 +51,7 @@ impl<'a> RawBytesOffsets<'a> {
#[must_use]
pub unsafe fn new_unchecked(offsets: impl Into<Cow<'a, [usize]>>) -> Self {
let offsets = offsets.into();
debug_assert!(!offsets.is_empty());
debug_assert!(offsets.windows(2).all(|w| w[1] >= w[0]));
Self(offsets)
}
Expand All @@ -53,6 +61,15 @@ impl<'a> RawBytesOffsets<'a> {
pub fn into_owned(self) -> RawBytesOffsets<'static> {
RawBytesOffsets(self.0.into_owned().into())
}

/// Returns the last offset.
#[must_use]
pub fn last(&self) -> usize {
unsafe {
// SAFETY: The offsets cannot be empty.
*self.0.last().unwrap_unchecked()
}
}
}

impl<'a> TryFrom<Cow<'a, [usize]>> for RawBytesOffsets<'a> {
Expand Down Expand Up @@ -95,6 +112,9 @@ mod tests {
fn raw_bytes_offsets() {
let offsets = RawBytesOffsets::new(vec![0, 1, 2, 3]).unwrap();
assert_eq!(&*offsets, &[0, 1, 2, 3]);
assert!(RawBytesOffsets::new(vec![]).is_err());
assert!(RawBytesOffsets::new(vec![0]).is_ok());
assert!(RawBytesOffsets::new(vec![10]).is_ok()); // nonsense, but not invalid
assert!(RawBytesOffsets::new(vec![0, 1, 1]).is_ok());
assert!(RawBytesOffsets::new(vec![0, 1, 0]).is_err());
assert!(RawBytesOffsets::try_from(vec![0, 1, 2]).is_ok());
Expand Down
4 changes: 4 additions & 0 deletions zarrs/src/array/codec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -97,6 +97,7 @@ use std::any::Any;
use std::borrow::Cow;
use std::sync::Arc;

use super::RawBytesOffsetsOutOfBoundsError;
use super::{
array_bytes::RawBytesOffsetsCreateError, concurrency::RecommendedConcurrency, ArrayBytes,
ArrayBytesFixedDisjointView, BytesRepresentation, ChunkRepresentation, ChunkShape, DataType,
Expand Down Expand Up @@ -1063,6 +1064,9 @@ pub enum CodecError {
/// Invalid byte offsets for variable length data.
#[error(transparent)]
RawBytesOffsetsCreate(#[from] RawBytesOffsetsCreateError),
/// Variable length array bytes offsets are out of bounds.
#[error(transparent)]
RawBytesOffsetsOutOfBounds(#[from] RawBytesOffsetsOutOfBoundsError),
}

impl From<&str> for CodecError {
Expand Down
7 changes: 5 additions & 2 deletions zarrs/src/array/codec/array_to_array/transpose.rs
Original file line number Diff line number Diff line change
Expand Up @@ -124,8 +124,11 @@ fn transpose_vlen<'a>(
// SAFETY: The offsets are monotonically increasing.
RawBytesOffsets::new_unchecked(offsets_new)
};

ArrayBytes::new_vlen(bytes_new, offsets_new)
let array_bytes = unsafe {
// SAFETY: The last offset is equal to the length of the bytes
ArrayBytes::new_vlen_unchecked(bytes_new, offsets_new)
};
array_bytes
}

#[cfg(test)]
Expand Down
6 changes: 3 additions & 3 deletions zarrs/src/array/codec/array_to_bytes/vlen/vlen_codec.rs
Original file line number Diff line number Diff line change
Expand Up @@ -265,16 +265,16 @@ impl ArrayToBytesCodecTraits for VlenCodec {
}
}
.unwrap();
let (data, offsets) = super::get_vlen_bytes_and_offsets(
let (bytes, offsets) = super::get_vlen_bytes_and_offsets(
&index_chunk_rep,
&bytes,
&self.index_codecs,
&self.data_codecs,
options,
)?;
let offsets = RawBytesOffsets::new(offsets)?;

Ok(ArrayBytes::new_vlen(data, offsets))
let array_bytes = ArrayBytes::new_vlen(bytes, offsets)?;
Ok(array_bytes)
}

fn partial_decoder(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,8 @@ impl ArrayToBytesCodecTraits for VlenV2Codec {
let num_elements = decoded_representation.num_elements_usize();
let (bytes, offsets) = super::get_interleaved_bytes_and_offsets(num_elements, &bytes)?;
let offsets = RawBytesOffsets::new(offsets)?;
Ok(ArrayBytes::new_vlen(bytes, offsets))
let array_bytes = ArrayBytes::new_vlen(bytes, offsets)?;
Ok(array_bytes)
}

fn partial_decoder(
Expand Down
12 changes: 10 additions & 2 deletions zarrs/src/array/element.rs
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,11 @@ macro_rules! impl_element_string {
for element in elements {
bytes.extend_from_slice(element.as_bytes());
}
Ok(ArrayBytes::new_vlen(bytes, offsets))
let array_bytes = unsafe {
// SAFETY: The last offset is the length of the bytes.
ArrayBytes::new_vlen_unchecked(bytes, offsets)
};
Ok(array_bytes)
}
}
};
Expand Down Expand Up @@ -252,7 +256,11 @@ macro_rules! impl_element_binary {
// Concatenate bytes
let bytes = elements.concat();

Ok(ArrayBytes::new_vlen(bytes, offsets))
let array_bytes = unsafe {
// SAFETY: The last offset is the length of the bytes.
ArrayBytes::new_vlen_unchecked(bytes, offsets)
};
Ok(array_bytes)
}
}
};
Expand Down

0 comments on commit 4c2d73c

Please sign in to comment.