From 77ed22165d55008b7994db5896acc51395949db4 Mon Sep 17 00:00:00 2001 From: Tobias Schwarzinger Date: Tue, 7 Oct 2025 11:36:17 +0200 Subject: [PATCH 1/5] Try to improve benchmark for array_from --- arrow/benches/array_from.rs | 40 +++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 17 deletions(-) diff --git a/arrow/benches/array_from.rs b/arrow/benches/array_from.rs index 575a8280f652..481433b186ed 100644 --- a/arrow/benches/array_from.rs +++ b/arrow/benches/array_from.rs @@ -206,13 +206,19 @@ fn array_from_vec_benchmark(c: &mut Criterion) { }); } -fn gen_option_vector(item: TItem, len: usize) -> Vec> { - hint::black_box( - repeat_n(item, len) - .enumerate() - .map(|(idx, item)| if idx % 3 == 0 { None } else { Some(item) }) - .collect(), - ) +fn gen_option_iter( + item: TItem, + len: usize, +) -> Box>> { + hint::black_box(Box::new(repeat_n(item, len).enumerate().map( + |(idx, item)| { + if idx % 3 == 0 { + None + } else { + Some(item) + } + }, + ))) } fn from_iter_benchmark(c: &mut Criterion) { @@ -220,26 +226,26 @@ fn from_iter_benchmark(c: &mut Criterion) { // All ArrowPrimitiveType use the same implementation c.bench_function("Int64Array::from_iter", |b| { - let values = gen_option_vector(1, ITER_LEN); - b.iter(|| hint::black_box(Int64Array::from_iter(values.iter()))); + b.iter(|| hint::black_box(Int64Array::from_iter(gen_option_iter(1, ITER_LEN)))); }); c.bench_function("Int64Array::from_trusted_len_iter", |b| { - let values = gen_option_vector(1, ITER_LEN); b.iter(|| unsafe { - // SAFETY: values.iter() is a TrustedLenIterator - hint::black_box(Int64Array::from_trusted_len_iter(values.iter())) + // SAFETY: gen_option_iter returns a TrustedLen iterator + hint::black_box(Int64Array::from_trusted_len_iter(gen_option_iter( + 1, ITER_LEN, + ))) }); }); c.bench_function("BooleanArray::from_iter", |b| { - let values = gen_option_vector(true, ITER_LEN); - b.iter(|| hint::black_box(BooleanArray::from_iter(values.iter()))); + b.iter(|| hint::black_box(BooleanArray::from_iter(gen_option_iter(true, ITER_LEN)))); }); c.bench_function("BooleanArray::from_trusted_len_iter", |b| { - let values = gen_option_vector(true, ITER_LEN); b.iter(|| unsafe { - // SAFETY: values.iter() is a TrustedLenIterator - hint::black_box(BooleanArray::from_trusted_len_iter(values.iter())) + // SAFETY: gen_option_iter returns a TrustedLen iterator + hint::black_box(BooleanArray::from_trusted_len_iter(gen_option_iter( + true, ITER_LEN, + ))) }); }); } From 716735f088f4e76436afa517d530a5e6d560be27 Mon Sep 17 00:00:00 2001 From: Tobias Schwarzinger Date: Tue, 7 Oct 2025 11:36:47 +0200 Subject: [PATCH 2/5] Use ExactSizeIterator bound in PrimitiveArray::from_trusted_len_iter --- arrow-array/src/array/boolean_array.rs | 2 +- arrow-array/src/array/primitive_array.rs | 11 +++++++++-- arrow-cast/src/cast/string.rs | 8 ++++---- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index 7967084aa7ab..ad34076900cb 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -498,7 +498,7 @@ impl BooleanArray { /// /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html). /// I.e. that `size_hint().1` correctly reports its length. Note that this is a stronger - /// guarantee that `ExactSizeIterator` provides which could still report a wrong length. + /// guarantee than `ExactSizeIterator` provides, which could still report a wrong length. /// /// # Panics /// diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index a5bbd0e664d6..32f5e5ed0b51 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -1458,14 +1458,21 @@ impl>> FromIterator for P impl PrimitiveArray { /// Creates a [`PrimitiveArray`] from an iterator of trusted length. + /// /// # Safety + /// /// The iterator must be [`TrustedLen`](https://doc.rust-lang.org/std/iter/trait.TrustedLen.html). - /// I.e. that `size_hint().1` correctly reports its length. + /// I.e. that `size_hint().1` correctly reports its length. Note that this is a stronger + /// guarantee than `ExactSizeIterator` provides, which could still report a wrong length. + /// + /// # Panics + /// + /// Panics if the iterator does not report an upper bound on `size_hint()`. #[inline] pub unsafe fn from_trusted_len_iter(iter: I) -> Self where P: std::borrow::Borrow::Native>>, - I: IntoIterator, + I: ExactSizeIterator, { let iterator = iter.into_iter(); let (_, upper) = iterator.size_hint(); diff --git a/arrow-cast/src/cast/string.rs b/arrow-cast/src/cast/string.rs index 77696ae0d8cc..12c9289edaff 100644 --- a/arrow-cast/src/cast/string.rs +++ b/arrow-cast/src/cast/string.rs @@ -87,7 +87,7 @@ pub(crate) fn parse_string_view( fn parse_string_iter< 'a, P: Parser, - I: Iterator>, + I: ExactSizeIterator>, F: FnOnce() -> Option, >( iter: I, @@ -156,7 +156,7 @@ pub(crate) fn cast_view_to_timestamp( fn cast_string_to_timestamp_impl< 'a, - I: Iterator>, + I: ExactSizeIterator>, T: ArrowTimestampType, Tz: TimeZone, >( @@ -310,7 +310,7 @@ fn cast_string_to_interval_impl<'a, I, ArrowType, F>( parse_function: F, ) -> Result where - I: Iterator>, + I: ExactSizeIterator>, ArrowType: ArrowPrimitiveType, F: Fn(&str) -> Result + Copy, { @@ -331,7 +331,7 @@ where // 20% performance improvement // Soundness: // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { PrimitiveArray::::from_trusted_len_iter(vec) } + unsafe { PrimitiveArray::::from_trusted_len_iter(vec.into_iter()) } }; Ok(Arc::new(interval_array) as ArrayRef) } From c50a02bd4ccd89a76ac6c030876d6aedc70b53b4 Mon Sep 17 00:00:00 2001 From: Tobias Schwarzinger Date: Wed, 15 Oct 2025 14:44:39 +0200 Subject: [PATCH 3/5] Use IntoIterator for trait bound --- arrow-array/src/array/boolean_array.rs | 3 ++- arrow-array/src/array/primitive_array.rs | 5 ++--- arrow-array/src/builder/boolean_builder.rs | 2 +- arrow-cast/src/cast/string.rs | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index ad34076900cb..abc4fbc90686 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -511,8 +511,9 @@ impl BooleanArray { pub unsafe fn from_trusted_len_iter(iter: I) -> Self where P: Into, - I: ExactSizeIterator, + I: IntoIterator, { + let iter = iter.into_iter(); let data_len = iter.len(); let num_bytes = bit_util::ceil(data_len, 8); diff --git a/arrow-array/src/array/primitive_array.rs b/arrow-array/src/array/primitive_array.rs index 32f5e5ed0b51..62970e2f1fe9 100644 --- a/arrow-array/src/array/primitive_array.rs +++ b/arrow-array/src/array/primitive_array.rs @@ -1472,11 +1472,10 @@ impl PrimitiveArray { pub unsafe fn from_trusted_len_iter(iter: I) -> Self where P: std::borrow::Borrow::Native>>, - I: ExactSizeIterator, + I: IntoIterator, { let iterator = iter.into_iter(); - let (_, upper) = iterator.size_hint(); - let len = upper.expect("trusted_len_unzip requires an upper limit"); + let len = iterator.len(); let (null, buffer) = unsafe { trusted_len_unzip(iterator) }; diff --git a/arrow-array/src/builder/boolean_builder.rs b/arrow-array/src/builder/boolean_builder.rs index 275aa8c9e56a..165fd3b42fc0 100644 --- a/arrow-array/src/builder/boolean_builder.rs +++ b/arrow-array/src/builder/boolean_builder.rs @@ -237,7 +237,7 @@ impl Extend> for BooleanBuilder { let buffered = iter.into_iter().collect::>(); let array = unsafe { // SAFETY: std::vec::IntoIter implements TrustedLen - BooleanArray::from_trusted_len_iter(buffered.into_iter()) + BooleanArray::from_trusted_len_iter(buffered) }; self.append_array(&array) } diff --git a/arrow-cast/src/cast/string.rs b/arrow-cast/src/cast/string.rs index 12c9289edaff..8194a15d34dd 100644 --- a/arrow-cast/src/cast/string.rs +++ b/arrow-cast/src/cast/string.rs @@ -331,7 +331,7 @@ where // 20% performance improvement // Soundness: // The iterator is trustedLen because it comes from an `StringArray`. - unsafe { PrimitiveArray::::from_trusted_len_iter(vec.into_iter()) } + unsafe { PrimitiveArray::::from_trusted_len_iter(vec) } }; Ok(Arc::new(interval_array) as ArrayRef) } From 9f297674010ecb1d0dab28219f11cedd48c3fdd1 Mon Sep 17 00:00:00 2001 From: Tobias Schwarzinger Date: Wed, 15 Oct 2025 14:52:09 +0200 Subject: [PATCH 4/5] Clippy --- arrow-array/src/array/boolean_array.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arrow-array/src/array/boolean_array.rs b/arrow-array/src/array/boolean_array.rs index abc4fbc90686..2d1bc675aaaa 100644 --- a/arrow-array/src/array/boolean_array.rs +++ b/arrow-array/src/array/boolean_array.rs @@ -716,7 +716,7 @@ mod tests { let expected = v.clone().into_iter().collect::(); let actual = unsafe { // SAFETY: `v` has trusted length - BooleanArray::from_trusted_len_iter(v.into_iter()) + BooleanArray::from_trusted_len_iter(v) }; assert_eq!(expected, actual); } From 01d15e43bd34f458c20bfd3c938f317972c96256 Mon Sep 17 00:00:00 2001 From: Tobias Schwarzinger Date: Wed, 15 Oct 2025 15:40:22 +0200 Subject: [PATCH 5/5] Formatting after updating rustfmt.toml --- arrow/benches/array_from.rs | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arrow/benches/array_from.rs b/arrow/benches/array_from.rs index 481433b186ed..b50fa62489ea 100644 --- a/arrow/benches/array_from.rs +++ b/arrow/benches/array_from.rs @@ -212,11 +212,7 @@ fn gen_option_iter( ) -> Box>> { hint::black_box(Box::new(repeat_n(item, len).enumerate().map( |(idx, item)| { - if idx % 3 == 0 { - None - } else { - Some(item) - } + if idx % 3 == 0 { None } else { Some(item) } }, ))) }