Skip to content

Commit

Permalink
Prefer MaybeUninit for write-only slices
Browse files Browse the repository at this point in the history
  • Loading branch information
kornelski authored and barrbrain committed Oct 20, 2023
1 parent ff2df3d commit 620d541
Show file tree
Hide file tree
Showing 9 changed files with 90 additions and 50 deletions.
3 changes: 2 additions & 1 deletion benches/transform.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ use rav1e::bench::transform;
use rav1e::bench::transform::{
forward_transform, get_valid_txfm_types, TxSize,
};
use std::mem::MaybeUninit;

fn init_buffers(size: usize) -> (Vec<i32>, Vec<i32>) {
let mut ra = ChaChaRng::from_seed([0; 32]);
Expand Down Expand Up @@ -96,7 +97,7 @@ pub fn bench_forward_transforms(c: &mut Criterion) {

let input: Vec<i16> =
(0..area).map(|_| rng.gen_range(-255..256)).collect();
let mut output = vec![0i16; area];
let mut output = vec![MaybeUninit::new(0i16); area];

for &tx_type in get_valid_txfm_types(tx_size) {
group.bench_function(
Expand Down
21 changes: 13 additions & 8 deletions src/asm/shared/transform/inverse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

use crate::tiling::PlaneRegionMut;
use crate::util::*;
use std::mem::MaybeUninit;

// Note: Input coeffs are mutable since the assembly uses them as a scratchpad
pub type InvTxfmFunc =
Expand All @@ -27,13 +28,13 @@ pub fn call_inverse_func<T: Pixel>(
let input: &[T::Coeff] = &input[..width.min(32) * height.min(32)];

// SAFETY: We write to the array below before reading from it.
let mut copied: Aligned<[T::Coeff; 32 * 32]> =
let mut copied: Aligned<[MaybeUninit<T::Coeff>; 32 * 32]> =
unsafe { Aligned::uninitialized() };

// Convert input to 16-bits.
// TODO: Remove by changing inverse assembly to not overwrite its input
for (a, b) in copied.data.iter_mut().zip(input) {
*a = *b;
a.write(*b);
}

// perform the inverse transform
Expand All @@ -57,13 +58,13 @@ pub fn call_inverse_hbd_func<T: Pixel>(
let input: &[T::Coeff] = &input[..width.min(32) * height.min(32)];

// SAFETY: We write to the array below before reading from it.
let mut copied: Aligned<[T::Coeff; 32 * 32]> =
let mut copied: Aligned<[MaybeUninit<T::Coeff>; 32 * 32]> =
unsafe { Aligned::uninitialized() };

// Convert input to 16-bits.
// TODO: Remove by changing inverse assembly to not overwrite its input
for (a, b) in copied.data.iter_mut().zip(input) {
*a = *b;
a.write(*b);
}

// perform the inverse transform
Expand All @@ -89,6 +90,7 @@ pub mod test {
use crate::transform::TxSize::*;
use crate::transform::*;
use rand::{random, thread_rng, Rng};
use std::mem::MaybeUninit;

pub fn pick_eob<T: Coefficient>(
coeffs: &mut [T], tx_size: TxSize, tx_type: TxType, sub_h: usize,
Expand Down Expand Up @@ -147,21 +149,22 @@ pub mod test {
let mut src_storage = [T::zero(); 64 * 64];
let src = &mut src_storage[..tx_size.area()];
let mut dst = Plane::from_slice(&[T::zero(); 64 * 64], 64);
// SAFETY: We write to the array below before reading from it.
let mut res_storage: Aligned<[i16; 64 * 64]> =
let mut res_storage: Aligned<[MaybeUninit<i16>; 64 * 64]> =
unsafe { Aligned::uninitialized() };
let res = &mut res_storage.data[..tx_size.area()];
// SAFETY: We write to the array below before reading from it.
let mut freq_storage: Aligned<[T::Coeff; 64 * 64]> =
let mut freq_storage: Aligned<[MaybeUninit<T::Coeff>; 64 * 64]> =
unsafe { Aligned::uninitialized() };
let freq = &mut freq_storage.data[..tx_size.area()];
for ((r, s), d) in
res.iter_mut().zip(src.iter_mut()).zip(dst.data.iter_mut())
{
*s = T::cast_from(random::<u16>() >> (16 - bit_depth));
*d = T::cast_from(random::<u16>() >> (16 - bit_depth));
*r = i16::cast_from(*s) - i16::cast_from(*d);
r.write(i16::cast_from(*s) - i16::cast_from(*d));
}
// SAFETY: The loop just initialized res, and all three slices have the same length
let res = unsafe { slice_assume_init_mut(res) };
forward_transform(
res,
freq,
Expand All @@ -171,6 +174,8 @@ pub mod test {
bit_depth,
CpuFeatureLevel::RUST,
);
// SAFETY: forward_transform initialized freq
let freq = unsafe { slice_assume_init_mut(freq) };

let eob: usize = pick_eob(freq, tx_size, tx_type, sub_h);
let mut rust_dst = dst.clone();
Expand Down
20 changes: 12 additions & 8 deletions src/asm/x86/quantize.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ use crate::cpu_features::CpuFeatureLevel;
use crate::quantize::*;
use crate::transform::TxSize;
use crate::util::*;
use std::mem::MaybeUninit;

type DequantizeFn = unsafe fn(
qindex: u8,
Expand All @@ -37,21 +38,21 @@ cpu_function_lookup_table!(

#[inline(always)]
pub fn dequantize<T: Coefficient>(
qindex: u8, coeffs: &[T], eob: usize, rcoeffs: &mut [T], tx_size: TxSize,
bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8, cpu: CpuFeatureLevel,
qindex: u8, coeffs: &[T], eob: usize, rcoeffs: &mut [MaybeUninit<T>],
tx_size: TxSize, bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8,
cpu: CpuFeatureLevel,
) {
let call_rust = |rcoeffs: &mut [T]| {
let call_rust = |rcoeffs: &mut [MaybeUninit<T>]| {
crate::quantize::rust::dequantize(
qindex, coeffs, eob, rcoeffs, tx_size, bit_depth, dc_delta_q,
ac_delta_q, cpu,
);
};

#[cfg(any(feature = "check_asm", test))]
let ref_rcoeffs = {
let mut ref_rcoeffs = {
let area = av1_get_coded_tx_size(tx_size).area();
let mut copy = vec![T::cast_from(0); area];
copy[..].copy_from_slice(&rcoeffs[..area]);
let mut copy = vec![MaybeUninit::new(T::cast_from(0)); area];
call_rust(&mut copy);
copy
};
Expand Down Expand Up @@ -82,7 +83,9 @@ pub fn dequantize<T: Coefficient>(
#[cfg(any(feature = "check_asm", test))]
{
let area = av1_get_coded_tx_size(tx_size).area();
assert_eq!(&rcoeffs[..area], &ref_rcoeffs[..]);
let rcoeffs = unsafe { assume_slice_init_mut(&mut rcoeffs[..area]) };
let ref_rcoeffs = unsafe { assume_slice_init_mut(&mut ref_rcoeffs[..]) };
assert_eq!(rcoeffs, ref_rcoeffs);
}
}

Expand Down Expand Up @@ -157,6 +160,7 @@ mod test {
use super::*;
use rand::distributions::{Distribution, Uniform};
use rand::{thread_rng, Rng};
use std::mem::MaybeUninit;

#[test]
fn dequantize_test() {
Expand Down Expand Up @@ -190,7 +194,7 @@ mod test {

for &eob in &eobs {
let mut qcoeffs = Aligned::new([0i16; 32 * 32]);
let mut rcoeffs = Aligned::new([0i16; 32 * 32]);
let mut rcoeffs = Aligned::new([MaybeUninit::new(0i16); 32 * 32]);

// Generate quantized coefficients up to the eob
let between = Uniform::from(-i16::MAX..=i16::MAX);
Expand Down
17 changes: 11 additions & 6 deletions src/asm/x86/transform/forward.rs
Original file line number Diff line number Diff line change
Expand Up @@ -335,8 +335,8 @@ fn cast_mut<const N: usize, T>(x: &mut [T]) -> &mut [T; N] {
#[allow(clippy::identity_op, clippy::erasing_op)]
#[target_feature(enable = "avx2")]
unsafe fn forward_transform_avx2<T: Coefficient>(
input: &[i16], output: &mut [T], stride: usize, tx_size: TxSize,
tx_type: TxType, bd: usize,
input: &[i16], output: &mut [MaybeUninit<T>], stride: usize,
tx_size: TxSize, tx_type: TxType, bd: usize,
) {
// Note when assigning txfm_size_col, we use the txfm_size from the
// row configuration and vice versa. This is intentionally done to
Expand Down Expand Up @@ -510,8 +510,8 @@ unsafe fn forward_transform_avx2<T: Coefficient>(
///
/// - If called with an invalid combination of `tx_size` and `tx_type`
pub fn forward_transform<T: Coefficient>(
input: &[i16], output: &mut [T], stride: usize, tx_size: TxSize,
tx_type: TxType, bd: usize, cpu: CpuFeatureLevel,
input: &[i16], output: &mut [MaybeUninit<T>], stride: usize,
tx_size: TxSize, tx_type: TxType, bd: usize, cpu: CpuFeatureLevel,
) {
assert!(valid_av1_transform(tx_size, tx_type));
if cpu >= CpuFeatureLevel::AVX2 {
Expand All @@ -528,7 +528,9 @@ pub fn forward_transform<T: Coefficient>(
mod test {
use crate::cpu_features::*;
use crate::transform::{forward_transform, get_valid_txfm_types, TxSize};
use crate::util::assume_slice_init_mut;
use rand::Rng;
use std::mem::MaybeUninit;

// Ensure that the simd results match the rust code
#[test]
Expand Down Expand Up @@ -560,8 +562,8 @@ mod test {
(0..area).map(|_| rng.gen_range(-255..256)).collect();

for &tx_type in get_valid_txfm_types(tx_size) {
let mut output_ref = vec![0i16; area];
let mut output_simd = vec![0i16; area];
let mut output_ref = vec![MaybeUninit::new(0i16); area];
let mut output_simd = vec![MaybeUninit::new(0i16); area];

println!("Testing combination {:?}, {:?}", tx_size, tx_type);
forward_transform(
Expand All @@ -573,6 +575,7 @@ mod test {
8,
CpuFeatureLevel::RUST,
);
let output_ref = unsafe { assume_slice_init_mut(&mut output_ref[..]) };
forward_transform(
&input[..],
&mut output_simd[..],
Expand All @@ -582,6 +585,8 @@ mod test {
8,
cpu,
);
let output_simd =
unsafe { assume_slice_init_mut(&mut output_simd[..]) };
assert_eq!(output_ref, output_simd)
}
}
Expand Down
25 changes: 15 additions & 10 deletions src/encoder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1355,9 +1355,12 @@ fn write_key_frame_obus<T: Pixel>(

/// Write into `dst` the difference between the blocks at `src1` and `src2`
fn diff<T: Pixel>(
dst: &mut [i16], src1: &PlaneRegion<'_, T>, src2: &PlaneRegion<'_, T>,
dst: &mut [MaybeUninit<i16>], src1: &PlaneRegion<'_, T>,
src2: &PlaneRegion<'_, T>,
) {
debug_assert!(dst.len() % src1.rect().width == 0);
debug_assert_eq!(src1.rows_iter().count(), src1.rect().height);

let width = src1.rect().width;
let height = src1.rect().height;

Expand All @@ -1374,7 +1377,7 @@ fn diff<T: Pixel>(
dst.chunks_exact_mut(width).zip(src1.rows_iter()).zip(src2.rows_iter())
{
for ((r, v1), v2) in l.iter_mut().zip(s1).zip(s2) {
*r = i16::cast_from(*v1) - i16::cast_from(*v2);
r.write(i16::cast_from(*v1) - i16::cast_from(*v2));
}
}
}
Expand Down Expand Up @@ -1504,17 +1507,13 @@ pub fn encode_tx_block<T: Pixel, W: Writer>(
}

let coded_tx_area = av1_get_coded_tx_size(tx_size).area();
// SAFETY: We write to the array below before reading from it.
let mut residual_storage: Aligned<[i16; 64 * 64]> =
let mut residual_storage: Aligned<[MaybeUninit<i16>; 64 * 64]> =
unsafe { Aligned::uninitialized() };
// SAFETY: We write to the array below before reading from it.
let mut coeffs_storage: Aligned<[T::Coeff; 64 * 64]> =
let mut coeffs_storage: Aligned<[MaybeUninit<T::Coeff>; 64 * 64]> =
unsafe { Aligned::uninitialized() };
// SAFETY: We write to the array below before reading from it.
let mut qcoeffs_storage: Aligned<[MaybeUninit<T::Coeff>; 32 * 32]> =
unsafe { Aligned::uninitialized() };
// SAFETY: We write to the array below before reading from it.
let mut rcoeffs_storage: Aligned<[T::Coeff; 32 * 32]> =
let mut rcoeffs_storage: Aligned<[MaybeUninit<T::Coeff>; 32 * 32]> =
unsafe { Aligned::uninitialized() };
let residual = &mut residual_storage.data[..tx_size.area()];
let coeffs = &mut coeffs_storage.data[..tx_size.area()];
Expand All @@ -1539,8 +1538,10 @@ pub fn encode_tx_block<T: Pixel, W: Writer>(
&rec.subregion(area),
);
} else {
residual.fill(0);
residual.fill(MaybeUninit::new(0));
}
// SAFETY: `diff()` inits `tx_size.area()` elements when it matches size of `subregion(area)`
let residual = unsafe { slice_assume_init_mut(residual) };

forward_transform(
residual,
Expand All @@ -1551,6 +1552,8 @@ pub fn encode_tx_block<T: Pixel, W: Writer>(
fi.sequence.bit_depth,
fi.cpu_feature_level,
);
// SAFETY: forward_transform initialized coeffs
let coeffs = unsafe { slice_assume_init_mut(coeffs) };

let eob = ts.qc.quantize(coeffs, qcoeffs, tx_size, tx_type);

Expand Down Expand Up @@ -1596,6 +1599,8 @@ pub fn encode_tx_block<T: Pixel, W: Writer>(
fi.ac_delta_q[p],
fi.cpu_feature_level,
);
// SAFETY: dequantize initialized rcoeffs
let rcoeffs = unsafe { slice_assume_init_mut(rcoeffs) };

if eob == 0 {
// All zero coefficients is a no-op
Expand Down
10 changes: 7 additions & 3 deletions src/quantize/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -359,10 +359,12 @@ impl QuantizationContext {
pub mod rust {
use super::*;
use crate::cpu_features::CpuFeatureLevel;
use std::mem::MaybeUninit;

pub fn dequantize<T: Coefficient>(
qindex: u8, coeffs: &[T], _eob: usize, rcoeffs: &mut [T], tx_size: TxSize,
bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8, _cpu: CpuFeatureLevel,
qindex: u8, coeffs: &[T], _eob: usize, rcoeffs: &mut [MaybeUninit<T>],
tx_size: TxSize, bit_depth: usize, dc_delta_q: i8, ac_delta_q: i8,
_cpu: CpuFeatureLevel,
) {
let log_tx_scale = get_log_tx_scale(tx_size) as i32;
let offset = (1 << log_tx_scale) - 1;
Expand All @@ -376,7 +378,9 @@ pub mod rust {
.enumerate()
{
let quant = if i == 0 { dc_quant } else { ac_quant };
*r = T::cast_from((c * quant + ((c >> 31) & offset)) >> log_tx_scale);
r.write(T::cast_from(
(c * quant + ((c >> 31) & offset)) >> log_tx_scale,
));
}
}
}
Loading

0 comments on commit 620d541

Please sign in to comment.