Prefer MaybeUninit for write-only slices

xiph · Oct 20, 2023 · 620d541 · 620d541
1 parent ff2df3d
commit 620d541
Show file tree

Hide file tree

Showing 9 changed files with 90 additions and 50 deletions.
diff --git a/benches/transform.rs b/benches/transform.rs
@@ -15,6 +15,7 @@ use rav1e::bench::transform;
 use rav1e::bench::transform::{
   forward_transform, get_valid_txfm_types, TxSize,
 };
+use std::mem::MaybeUninit;
 
 fn init_buffers(size: usize) -> (Vec<i32>, Vec<i32>) {
   let mut ra = ChaChaRng::from_seed([0; 32]);
@@ -96,7 +97,7 @@ pub fn bench_forward_transforms(c: &mut Criterion) {
 
     let input: Vec<i16> =
       (0..area).map(|_| rng.gen_range(-255..256)).collect();
-    let mut output = vec![0i16; area];
+    let mut output = vec![MaybeUninit::new(0i16); area];
 
     for &tx_type in get_valid_txfm_types(tx_size) {
       group.bench_function(

diff --git a/src/asm/shared/transform/inverse.rs b/src/asm/shared/transform/inverse.rs
@@ -9,6 +9,7 @@
 
 use crate::tiling::PlaneRegionMut;
 use crate::util::*;
+use std::mem::MaybeUninit;
 
 // Note: Input coeffs are mutable since the assembly uses them as a scratchpad
 pub type InvTxfmFunc =
@@ -27,13 +28,13 @@ pub fn call_inverse_func<T: Pixel>(
   let input: &[T::Coeff] = &input[..width.min(32) * height.min(32)];
 
   // SAFETY: We write to the array below before reading from it.
-  let mut copied: Aligned<[T::Coeff; 32 * 32]> =
+  let mut copied: Aligned<[MaybeUninit<T::Coeff>; 32 * 32]> =
     unsafe { Aligned::uninitialized() };
 
   // Convert input to 16-bits.
   // TODO: Remove by changing inverse assembly to not overwrite its input
   for (a, b) in copied.data.iter_mut().zip(input) {
-    *a = *b;
+    a.write(*b);
   }
 
   // perform the inverse transform
@@ -57,13 +58,13 @@ pub fn call_inverse_hbd_func<T: Pixel>(
   let input: &[T::Coeff] = &input[..width.min(32) * height.min(32)];
 
   // SAFETY: We write to the array below before reading from it.
-  let mut copied: Aligned<[T::Coeff; 32 * 32]> =
+  let mut copied: Aligned<[MaybeUninit<T::Coeff>; 32 * 32]> =
     unsafe { Aligned::uninitialized() };
 
   // Convert input to 16-bits.
   // TODO: Remove by changing inverse assembly to not overwrite its input
   for (a, b) in copied.data.iter_mut().zip(input) {
-    *a = *b;
+    a.write(*b);
   }
 
   // perform the inverse transform
@@ -89,6 +90,7 @@ pub mod test {
   use crate::transform::TxSize::*;
   use crate::transform::*;
   use rand::{random, thread_rng, Rng};
+  use std::mem::MaybeUninit;
 
   pub fn pick_eob<T: Coefficient>(
     coeffs: &mut [T], tx_size: TxSize, tx_type: TxType, sub_h: usize,
@@ -147,21 +149,22 @@ pub mod test {
       let mut src_storage = [T::zero(); 64 * 64];
       let src = &mut src_storage[..tx_size.area()];
       let mut dst = Plane::from_slice(&[T::zero(); 64 * 64], 64);
-      // SAFETY: We write to the array below before reading from it.
-      let mut res_storage: Aligned<[i16; 64 * 64]> =
+      let mut res_storage: Aligned<[MaybeUninit<i16>; 64 * 64]> =
         unsafe { Aligned::uninitialized() };
       let res = &mut res_storage.data[..tx_size.area()];
       // SAFETY: We write to the array below before reading from it.
-      let mut freq_storage: Aligned<[T::Coeff; 64 * 64]> =
+      let mut freq_storage: Aligned<[MaybeUninit<T::Coeff>; 64 * 64]> =
         unsafe { Aligned::uninitialized() };
       let freq = &mut freq_storage.data[..tx_size.area()];
       for ((r, s), d) in
         res.iter_mut().zip(src.iter_mut()).zip(dst.data.iter_mut())
       {
         *s = T::cast_from(random::<u16>() >> (16 - bit_depth));
         *d = T::cast_from(random::<u16>() >> (16 - bit_depth));
-        *r = i16::cast_from(*s) - i16::cast_from(*d);
+        r.write(i16::cast_from(*s) - i16::cast_from(*d));
       }
+      // SAFETY: The loop just initialized res, and all three slices have the same length
+      let res = unsafe { slice_assume_init_mut(res) };
       forward_transform(
         res,
         freq,
@@ -171,6 +174,8 @@ pub mod test {
         bit_depth,
         CpuFeatureLevel::RUST,
       );
+      // SAFETY: forward_transform initialized freq
+      let freq = unsafe { slice_assume_init_mut(freq) };
 
       let eob: usize = pick_eob(freq, tx_size, tx_type, sub_h);
       let mut rust_dst = dst.clone();

diff --git a/src/asm/x86/quantize.rs b/src/asm/x86/quantize.rs
@@ -17,6 +17,7 @@ use crate::cpu_features::CpuFeatureLevel;
 use crate::quantize::*;
 use crate::transform::TxSize;
 use crate::util::*;
+use std::mem::MaybeUninit;
 
 type DequantizeFn = unsafe fn(
   qindex: u8,
@@ -37,21 +38,21 @@ cpu_function_lookup_table!(
 
 #[inline(always)]
 pub fn dequantize<T: Coefficient>(
-  qindex: u8, coeffs: &[T], eob: usize, rcoeffs: &mut [T], tx_size: TxSize,
-  bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8, cpu: CpuFeatureLevel,
+  qindex: u8, coeffs: &[T], eob: usize, rcoeffs: &mut [MaybeUninit<T>],
+  tx_size: TxSize, bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8,
+  cpu: CpuFeatureLevel,
 ) {
-  let call_rust = |rcoeffs: &mut [T]| {
+  let call_rust = |rcoeffs: &mut [MaybeUninit<T>]| {
     crate::quantize::rust::dequantize(
       qindex, coeffs, eob, rcoeffs, tx_size, bit_depth, dc_delta_q,
       ac_delta_q, cpu,
     );
   };
 
   #[cfg(any(feature = "check_asm", test))]
-  let ref_rcoeffs = {
+  let mut ref_rcoeffs = {
     let area = av1_get_coded_tx_size(tx_size).area();
-    let mut copy = vec![T::cast_from(0); area];
-    copy[..].copy_from_slice(&rcoeffs[..area]);
+    let mut copy = vec![MaybeUninit::new(T::cast_from(0)); area];
     call_rust(&mut copy);
     copy
   };
@@ -82,7 +83,9 @@ pub fn dequantize<T: Coefficient>(
   #[cfg(any(feature = "check_asm", test))]
   {
     let area = av1_get_coded_tx_size(tx_size).area();
-    assert_eq!(&rcoeffs[..area], &ref_rcoeffs[..]);
+    let rcoeffs = unsafe { assume_slice_init_mut(&mut rcoeffs[..area]) };
+    let ref_rcoeffs = unsafe { assume_slice_init_mut(&mut ref_rcoeffs[..]) };
+    assert_eq!(rcoeffs, ref_rcoeffs);
   }
 }
 
@@ -157,6 +160,7 @@ mod test {
   use super::*;
   use rand::distributions::{Distribution, Uniform};
   use rand::{thread_rng, Rng};
+  use std::mem::MaybeUninit;
 
   #[test]
   fn dequantize_test() {
@@ -190,7 +194,7 @@ mod test {
 
       for &eob in &eobs {
         let mut qcoeffs = Aligned::new([0i16; 32 * 32]);
-        let mut rcoeffs = Aligned::new([0i16; 32 * 32]);
+        let mut rcoeffs = Aligned::new([MaybeUninit::new(0i16); 32 * 32]);
 
         // Generate quantized coefficients up to the eob
         let between = Uniform::from(-i16::MAX..=i16::MAX);

diff --git a/src/asm/x86/transform/forward.rs b/src/asm/x86/transform/forward.rs
@@ -335,8 +335,8 @@ fn cast_mut<const N: usize, T>(x: &mut [T]) -> &mut [T; N] {
 #[allow(clippy::identity_op, clippy::erasing_op)]
 #[target_feature(enable = "avx2")]
 unsafe fn forward_transform_avx2<T: Coefficient>(
-  input: &[i16], output: &mut [T], stride: usize, tx_size: TxSize,
-  tx_type: TxType, bd: usize,
+  input: &[i16], output: &mut [MaybeUninit<T>], stride: usize,
+  tx_size: TxSize, tx_type: TxType, bd: usize,
 ) {
   // Note when assigning txfm_size_col, we use the txfm_size from the
   // row configuration and vice versa. This is intentionally done to
@@ -510,8 +510,8 @@ unsafe fn forward_transform_avx2<T: Coefficient>(
 ///
 /// - If called with an invalid combination of `tx_size` and `tx_type`
 pub fn forward_transform<T: Coefficient>(
-  input: &[i16], output: &mut [T], stride: usize, tx_size: TxSize,
-  tx_type: TxType, bd: usize, cpu: CpuFeatureLevel,
+  input: &[i16], output: &mut [MaybeUninit<T>], stride: usize,
+  tx_size: TxSize, tx_type: TxType, bd: usize, cpu: CpuFeatureLevel,
 ) {
   assert!(valid_av1_transform(tx_size, tx_type));
   if cpu >= CpuFeatureLevel::AVX2 {
@@ -528,7 +528,9 @@ pub fn forward_transform<T: Coefficient>(
 mod test {
   use crate::cpu_features::*;
   use crate::transform::{forward_transform, get_valid_txfm_types, TxSize};
+  use crate::util::assume_slice_init_mut;
   use rand::Rng;
+  use std::mem::MaybeUninit;
 
   // Ensure that the simd results match the rust code
   #[test]
@@ -560,8 +562,8 @@ mod test {
         (0..area).map(|_| rng.gen_range(-255..256)).collect();
 
       for &tx_type in get_valid_txfm_types(tx_size) {
-        let mut output_ref = vec![0i16; area];
-        let mut output_simd = vec![0i16; area];
+        let mut output_ref = vec![MaybeUninit::new(0i16); area];
+        let mut output_simd = vec![MaybeUninit::new(0i16); area];
 
         println!("Testing combination {:?}, {:?}", tx_size, tx_type);
         forward_transform(
@@ -573,6 +575,7 @@ mod test {
           8,
           CpuFeatureLevel::RUST,
         );
+        let output_ref = unsafe { assume_slice_init_mut(&mut output_ref[..]) };
         forward_transform(
           &input[..],
           &mut output_simd[..],
@@ -582,6 +585,8 @@ mod test {
           8,
           cpu,
         );
+        let output_simd =
+          unsafe { assume_slice_init_mut(&mut output_simd[..]) };
         assert_eq!(output_ref, output_simd)
       }
     }

diff --git a/src/encoder.rs b/src/encoder.rs
@@ -1355,9 +1355,12 @@ fn write_key_frame_obus<T: Pixel>(
 
 /// Write into `dst` the difference between the blocks at `src1` and `src2`
 fn diff<T: Pixel>(
-  dst: &mut [i16], src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>,
+  dst: &mut [MaybeUninit<i16>], src1: &PlaneRegion<'_, T>,
+  src2: &PlaneRegion<'_, T>,
 ) {
   debug_assert!(dst.len() % src1.rect().width == 0);
+  debug_assert_eq!(src1.rows_iter().count(), src1.rect().height);
+
   let width = src1.rect().width;
   let height = src1.rect().height;
 
@@ -1374,7 +1377,7 @@ fn diff<T: Pixel>(
     dst.chunks_exact_mut(width).zip(src1.rows_iter()).zip(src2.rows_iter())
   {
     for ((r, v1), v2) in l.iter_mut().zip(s1).zip(s2) {
-      *r = i16::cast_from(*v1) - i16::cast_from(*v2);
+      r.write(i16::cast_from(*v1) - i16::cast_from(*v2));
     }
   }
 }
@@ -1504,17 +1507,13 @@ pub fn encode_tx_block<T: Pixel, W: Writer>(
   }
 
   let coded_tx_area = av1_get_coded_tx_size(tx_size).area();
-  // SAFETY: We write to the array below before reading from it.
-  let mut residual_storage: Aligned<[i16; 64 * 64]> =
+  let mut residual_storage: Aligned<[MaybeUninit<i16>; 64 * 64]> =
     unsafe { Aligned::uninitialized() };
-  // SAFETY: We write to the array below before reading from it.
-  let mut coeffs_storage: Aligned<[T::Coeff; 64 * 64]> =
+  let mut coeffs_storage: Aligned<[MaybeUninit<T::Coeff>; 64 * 64]> =
     unsafe { Aligned::uninitialized() };
-  // SAFETY: We write to the array below before reading from it.
   let mut qcoeffs_storage: Aligned<[MaybeUninit<T::Coeff>; 32 * 32]> =
     unsafe { Aligned::uninitialized() };
-  // SAFETY: We write to the array below before reading from it.
-  let mut rcoeffs_storage: Aligned<[T::Coeff; 32 * 32]> =
+  let mut rcoeffs_storage: Aligned<[MaybeUninit<T::Coeff>; 32 * 32]> =
     unsafe { Aligned::uninitialized() };
   let residual = &mut residual_storage.data[..tx_size.area()];
   let coeffs = &mut coeffs_storage.data[..tx_size.area()];
@@ -1539,8 +1538,10 @@ pub fn encode_tx_block<T: Pixel, W: Writer>(
       &rec.subregion(area),
     );
   } else {
-    residual.fill(0);
+    residual.fill(MaybeUninit::new(0));
   }
+  // SAFETY: `diff()` inits `tx_size.area()` elements when it matches size of `subregion(area)`
+  let residual = unsafe { slice_assume_init_mut(residual) };
 
   forward_transform(
     residual,
@@ -1551,6 +1552,8 @@ pub fn encode_tx_block<T: Pixel, W: Writer>(
     fi.sequence.bit_depth,
     fi.cpu_feature_level,
   );
+  // SAFETY: forward_transform initialized coeffs
+  let coeffs = unsafe { slice_assume_init_mut(coeffs) };
 
   let eob = ts.qc.quantize(coeffs, qcoeffs, tx_size, tx_type);
 
@@ -1596,6 +1599,8 @@ pub fn encode_tx_block<T: Pixel, W: Writer>(
     fi.ac_delta_q[p],
     fi.cpu_feature_level,
   );
+  // SAFETY: dequantize initialized rcoeffs
+  let rcoeffs = unsafe { slice_assume_init_mut(rcoeffs) };
 
   if eob == 0 {
     // All zero coefficients is a no-op

diff --git a/src/quantize/mod.rs b/src/quantize/mod.rs
@@ -359,10 +359,12 @@ impl QuantizationContext {
 pub mod rust {
   use super::*;
   use crate::cpu_features::CpuFeatureLevel;
+  use std::mem::MaybeUninit;
 
   pub fn dequantize<T: Coefficient>(
-    qindex: u8, coeffs: &[T], _eob: usize, rcoeffs: &mut [T], tx_size: TxSize,
-    bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8, _cpu: CpuFeatureLevel,
+    qindex: u8, coeffs: &[T], _eob: usize, rcoeffs: &mut [MaybeUninit<T>],
+    tx_size: TxSize, bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8,
+    _cpu: CpuFeatureLevel,
   ) {
     let log_tx_scale = get_log_tx_scale(tx_size) as i32;
     let offset = (1 << log_tx_scale) - 1;
@@ -376,7 +378,9 @@ pub mod rust {
       .enumerate()
     {
       let quant = if i == 0 { dc_quant } else { ac_quant };
-      *r = T::cast_from((c * quant + ((c >> 31) & offset)) >> log_tx_scale);
+      r.write(T::cast_from(
+        (c * quant + ((c >> 31) & offset)) >> log_tx_scale,
+      ));
     }
   }
 }