diff --git a/arrow-array/src/builder/generic_bytes_builder.rs b/arrow-array/src/builder/generic_bytes_builder.rs
index 913a440ca747..f743b3191607 100644
--- a/arrow-array/src/builder/generic_bytes_builder.rs
+++ b/arrow-array/src/builder/generic_bytes_builder.rs
@@ -348,6 +348,50 @@ impl<O: OffsetSizeTrait> std::fmt::Write for GenericStringBuilder<O> {
     }
 }
 
+/// A byte size value representing the number of bytes to allocate per string in [`GenericStringBuilder`]
+///
+/// To create a [`GenericStringBuilder`] using `.with_capacity` we are required to provide: \
+/// - `item_capacity` - the row count \
+/// - `data_capacity` - total string byte count \
+///
+/// We will use the `AVERAGE_STRING_LENGTH` * row_count for `data_capacity`. \
+///
+/// These capacities are preallocation hints used to improve performance,
+/// but consequences of passing a hint too large or too small should be negligible.
+const AVERAGE_STRING_LENGTH: usize = 16;
+/// Trait for string-like array builders
+///
+/// This trait provides a unified interface for builders that append string-like data
+/// such as [`GenericStringBuilder`] and [`crate::builder::StringViewBuilder`]
+pub trait StringLikeArrayBuilder: ArrayBuilder {
+    /// Returns a human-readable type name for the builder.
+    fn type_name() -> &'static str;
+
+    /// Creates a new builder with the given row capacity.
+    fn with_capacity(capacity: usize) -> Self;
+
+    /// Appends a non-null string value to the builder.
+    fn append_value(&mut self, value: &str);
+
+    /// Appends a null value to the builder.
+    fn append_null(&mut self);
+}
+
+impl<O: OffsetSizeTrait> StringLikeArrayBuilder for GenericStringBuilder<O> {
+    fn type_name() -> &'static str {
+        std::any::type_name::<Self>()
+    }
+    fn with_capacity(capacity: usize) -> Self {
+        Self::with_capacity(capacity, capacity * AVERAGE_STRING_LENGTH)
+    }
+    fn append_value(&mut self, value: &str) {
+        Self::append_value(self, value);
+    }
+    fn append_null(&mut self) {
+        Self::append_null(self);
+    }
+}
+
 /// Array builder for [`GenericBinaryArray`][crate::GenericBinaryArray]
 ///
 /// Values can be appended using [`GenericByteBuilder::append_value`], and nulls with
diff --git a/arrow-array/src/builder/generic_bytes_view_builder.rs b/arrow-array/src/builder/generic_bytes_view_builder.rs
index 5ee257543b60..7e7a561a8c33 100644
--- a/arrow-array/src/builder/generic_bytes_view_builder.rs
+++ b/arrow-array/src/builder/generic_bytes_view_builder.rs
@@ -25,7 +25,7 @@ use arrow_schema::ArrowError;
 use hashbrown::HashTable;
 use hashbrown::hash_table::Entry;
 
-use crate::builder::ArrayBuilder;
+use crate::builder::{ArrayBuilder, StringLikeArrayBuilder};
 use crate::types::bytes::ByteArrayNativeType;
 use crate::types::{BinaryViewType, ByteViewType, StringViewType};
 use crate::{Array, ArrayRef, GenericByteViewArray};
@@ -533,6 +533,21 @@ impl<T: ByteViewType + ?Sized, V: AsRef<T::Native>> Extend<Option<V>>
 /// ```
 pub type StringViewBuilder = GenericByteViewBuilder<StringViewType>;
 
+impl StringLikeArrayBuilder for StringViewBuilder {
+    fn type_name() -> &'static str {
+        std::any::type_name::<Self>()
+    }
+    fn with_capacity(capacity: usize) -> Self {
+        Self::with_capacity(capacity)
+    }
+    fn append_value(&mut self, value: &str) {
+        Self::append_value(self, value);
+    }
+    fn append_null(&mut self) {
+        Self::append_null(self);
+    }
+}
+
 /// Array builder for [`BinaryViewArray`][crate::BinaryViewArray]
 ///
 /// Values can be appended using [`GenericByteViewBuilder::append_value`], and nulls with
diff --git a/parquet-variant-compute/src/variant_array.rs b/parquet-variant-compute/src/variant_array.rs
index 2b38081d07e2..eb51144e3d28 100644
--- a/parquet-variant-compute/src/variant_array.rs
+++ b/parquet-variant-compute/src/variant_array.rs
@@ -947,6 +947,16 @@ fn typed_value_to_variant<'a>(
             let value = array.value(index);
             Variant::from(value)
         }
+        DataType::LargeUtf8 => {
+            let array = typed_value.as_string::<i64>();
+            let value = array.value(index);
+            Variant::from(value)
+        }
+        DataType::Utf8View => {
+            let array = typed_value.as_string_view();
+            let value = array.value(index);
+            Variant::from(value)
+        }
         DataType::Int8 => {
             primitive_conversion_single_value!(Int8Type, typed_value, index)
         }
@@ -1098,14 +1108,14 @@ fn canonicalize_and_verify_data_type(
         // Binary and string are allowed. Force Binary to BinaryView because that's what the parquet
         // reader returns and what the rest of the variant code expects.
         Binary => Cow::Owned(DataType::BinaryView),
-        BinaryView | Utf8 => borrow!(),
+        BinaryView | Utf8 | LargeUtf8 | Utf8View => borrow!(),
 
         // UUID maps to 16-byte fixed-size binary; no other width is allowed
         FixedSizeBinary(16) => borrow!(),
         FixedSizeBinary(_) | FixedSizeList(..) => fail!(),
 
         // We can _possibly_ allow (some of) these some day?
-        LargeBinary | LargeUtf8 | Utf8View | ListView(_) | LargeList(_) | LargeListView(_) => {
+        LargeBinary | ListView(_) | LargeList(_) | LargeListView(_) => {
             fail!()
         }
 
diff --git a/parquet-variant-compute/src/variant_get.rs b/parquet-variant-compute/src/variant_get.rs
index e782b5968af5..0f08496fd767 100644
--- a/parquet-variant-compute/src/variant_get.rs
+++ b/parquet-variant-compute/src/variant_get.rs
@@ -299,7 +299,8 @@ mod test {
     use arrow::array::{
         Array, ArrayRef, AsArray, BinaryViewArray, BooleanArray, Date32Array, Decimal32Array,
         Decimal64Array, Decimal128Array, Decimal256Array, Float32Array, Float64Array, Int8Array,
-        Int16Array, Int32Array, Int64Array, StringArray, StructArray,
+        Int16Array, Int32Array, Int64Array, LargeStringArray, StringArray, StringViewArray,
+        StructArray,
     };
     use arrow::buffer::NullBuffer;
     use arrow::compute::CastOptions;
@@ -766,6 +767,27 @@ mod test {
         BooleanArray::from(vec![Some(true), Some(false), Some(true)])
     );
 
+    perfectly_shredded_to_arrow_primitive_test!(
+        get_variant_perfectly_shredded_utf8_as_utf8,
+        DataType::Utf8,
+        perfectly_shredded_utf8_variant_array,
+        StringArray::from(vec![Some("foo"), Some("bar"), Some("baz")])
+    );
+
+    perfectly_shredded_to_arrow_primitive_test!(
+        get_variant_perfectly_shredded_large_utf8_as_utf8,
+        DataType::Utf8,
+        perfectly_shredded_large_utf8_variant_array,
+        StringArray::from(vec![Some("foo"), Some("bar"), Some("baz")])
+    );
+
+    perfectly_shredded_to_arrow_primitive_test!(
+        get_variant_perfectly_shredded_utf8_view_as_utf8,
+        DataType::Utf8,
+        perfectly_shredded_utf8_view_variant_array,
+        StringArray::from(vec![Some("foo"), Some("bar"), Some("baz")])
+    );
+
     macro_rules! perfectly_shredded_variant_array_fn {
         ($func:ident, $typed_value_gen:expr) => {
             fn $func() -> ArrayRef {
@@ -789,6 +811,18 @@ mod test {
         };
     }
 
+    perfectly_shredded_variant_array_fn!(perfectly_shredded_utf8_variant_array, || {
+        StringArray::from(vec![Some("foo"), Some("bar"), Some("baz")])
+    });
+
+    perfectly_shredded_variant_array_fn!(perfectly_shredded_large_utf8_variant_array, || {
+        LargeStringArray::from(vec![Some("foo"), Some("bar"), Some("baz")])
+    });
+
+    perfectly_shredded_variant_array_fn!(perfectly_shredded_utf8_view_variant_array, || {
+        StringViewArray::from(vec![Some("foo"), Some("bar"), Some("baz")])
+    });
+
     perfectly_shredded_variant_array_fn!(perfectly_shredded_bool_variant_array, || {
         BooleanArray::from(vec![Some(true), Some(false), Some(true)])
     });
diff --git a/parquet-variant-compute/src/variant_to_arrow.rs b/parquet-variant-compute/src/variant_to_arrow.rs
index 9219a34be52f..7065669982a5 100644
--- a/parquet-variant-compute/src/variant_to_arrow.rs
+++ b/parquet-variant-compute/src/variant_to_arrow.rs
@@ -16,7 +16,8 @@
 // under the License.
 
 use arrow::array::{
-    ArrayRef, BinaryViewArray, BooleanBuilder, NullBufferBuilder, PrimitiveBuilder,
+    ArrayRef, BinaryViewArray, LargeStringBuilder, NullBufferBuilder, PrimitiveBuilder,
+    StringBuilder, StringLikeArrayBuilder, StringViewBuilder, builder::BooleanBuilder,
 };
 use arrow::compute::{CastOptions, DecimalCast};
 use arrow::datatypes::{self, DataType, DecimalType};
@@ -58,6 +59,9 @@ pub(crate) enum PrimitiveVariantToArrowRowBuilder<'a> {
     TimestampNano(VariantToTimestampArrowRowBuilder<'a, datatypes::TimestampNanosecondType>),
     TimestampNanoNtz(VariantToTimestampNtzArrowRowBuilder<'a, datatypes::TimestampNanosecondType>),
     Date(VariantToPrimitiveArrowRowBuilder<'a, datatypes::Date32Type>),
+    String(VariantToStringArrowBuilder<'a, StringBuilder>),
+    LargeString(VariantToStringArrowBuilder<'a, LargeStringBuilder>),
+    StringView(VariantToStringArrowBuilder<'a, StringViewBuilder>),
 }
 
 /// Builder for converting variant values into strongly typed Arrow arrays.
@@ -97,6 +101,9 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> {
             TimestampNano(b) => b.append_null(),
             TimestampNanoNtz(b) => b.append_null(),
             Date(b) => b.append_null(),
+            String(b) => b.append_null(),
+            LargeString(b) => b.append_null(),
+            StringView(b) => b.append_null(),
         }
     }
 
@@ -124,6 +131,9 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> {
             TimestampNano(b) => b.append_value(value),
             TimestampNanoNtz(b) => b.append_value(value),
             Date(b) => b.append_value(value),
+            String(b) => b.append_value(value),
+            LargeString(b) => b.append_value(value),
+            StringView(b) => b.append_value(value),
         }
     }
 
@@ -151,6 +161,9 @@ impl<'a> PrimitiveVariantToArrowRowBuilder<'a> {
             TimestampNano(b) => b.finish(),
             TimestampNanoNtz(b) => b.finish(),
             Date(b) => b.finish(),
+            String(b) => b.finish(),
+            LargeString(b) => b.finish(),
+            StringView(b) => b.finish(),
         }
     }
 }
@@ -269,6 +282,13 @@ pub(crate) fn make_primitive_variant_to_arrow_row_builder<'a>(
             cast_options,
             capacity,
         )),
+        DataType::Utf8 => String(VariantToStringArrowBuilder::new(cast_options, capacity)),
+        DataType::LargeUtf8 => {
+            LargeString(VariantToStringArrowBuilder::new(cast_options, capacity))
+        }
+        DataType::Utf8View => {
+            StringView(VariantToStringArrowBuilder::new(cast_options, capacity))
+        }
         _ if data_type.is_primitive() => {
             return Err(ArrowError::NotYetImplemented(format!(
                 "Primitive data_type {data_type:?} not yet implemented"
@@ -413,6 +433,13 @@ macro_rules! define_variant_to_primitive_builder {
     }
 }
 
+define_variant_to_primitive_builder!(
+    struct VariantToStringArrowBuilder<'a, B: StringLikeArrayBuilder>
+    |capacity| -> B { B::with_capacity(capacity) },
+    |value| value.as_string(),
+    type_name: B::type_name()
+);
+
 define_variant_to_primitive_builder!(
     struct VariantToBooleanArrowRowBuilder<'a>
     |capacity| -> BooleanBuilder { BooleanBuilder::with_capacity(capacity) },