-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsimd_utils.rs
More file actions
116 lines (99 loc) · 3.71 KB
/
simd_utils.rs
File metadata and controls
116 lines (99 loc) · 3.71 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
//! SIMD-Optimized Pixel Manipulation Utilities
//!
//! This module implements in-place pixel format conversion between BGRA8888 and RGBA8888.
//! It uses architecture-specific intrinsics (AVX2 for x86_64, NEON for aarch64)
//! to accelerate `wl_shm` software buffer swizzling.
#[cfg(target_arch = "x86_64")]
use std::arch::x86_64::*;
#[cfg(target_arch = "aarch64")]
use std::arch::aarch64::*;
/// Swizzles a BGRA8888 buffer to RGBA8888 (or vice versa) using SIMD.
///
/// This function is optimized for high throughput "Zero-Copy" software pipelines.
/// It processes pixels in 256-bit (AVX2) or 128-bit (NEON) chunks.
pub fn swizzle_bgra_rgba(data: &mut [u8]) {
#[cfg(any(target_arch = "x86_64", target_arch = "aarch64"))]
unsafe {
// Alignment check could be added here, but for now we assume generous alignment from shm.
swizzle_simd(data);
}
#[cfg(not(any(target_arch = "x86_64", target_arch = "aarch64")))]
swizzle_scalar(data);
}
/// AVX2 implementation of the in-place R/B channel swap.
///
/// Exchanges bytes 0 and 2 of every complete 4-byte pixel in `data`. Whole 32-byte
/// chunks (8 pixels) go through one AVX2 byte shuffle each; the pixels that do not
/// fill a final 32-byte chunk are swapped one at a time, so no complete pixel is
/// skipped. A trailing fragment of 1-3 bytes (not a whole pixel) is left untouched,
/// matching `swizzle_scalar`.
///
/// # Safety
///
/// The caller must ensure the running CPU supports AVX2.
#[cfg(target_arch = "x86_64")]
#[target_feature(enable = "avx2")]
unsafe fn swizzle_simd(data: &mut [u8]) {
    /// Bytes held by one 256-bit AVX2 register.
    const LANE_BYTES: usize = 32;

    // `vpshufb` shuffles independently inside each 128-bit lane and only looks at the
    // low 4 bits of each index, so the same lane-relative pattern is given for both
    // halves: within every 4-byte pixel, bytes 0 and 2 trade places.
    let mask = _mm256_setr_epi8(
        2, 1, 0, 3, 6, 5, 4, 7,
        10, 9, 8, 11, 14, 13, 12, 15,
        2, 1, 0, 3, 6, 5, 4, 7,
        10, 9, 8, 11, 14, 13, 12, 15,
    );

    // Split into a prefix that is a whole number of 32-byte chunks and the remainder.
    let simd_len = data.len() & !(LANE_BYTES - 1);
    let (head, tail) = data.split_at_mut(simd_len);

    for block in head.chunks_exact_mut(LANE_BYTES) {
        let p = block.as_mut_ptr().cast::<__m256i>();
        // SAFETY: `block` is exactly 32 valid, exclusively borrowed bytes, and the
        // unaligned load/store intrinsics carry no alignment requirement.
        let chunk = _mm256_loadu_si256(p as *const __m256i);
        let swizzled = _mm256_shuffle_epi8(chunk, mask);
        _mm256_storeu_si256(p, swizzled);
    }

    // Scalar tail: the pixels (fewer than 8) that did not fill a 32-byte chunk.
    for pixel in tail.chunks_exact_mut(4) {
        pixel.swap(0, 2);
    }
}
/// NEON implementation of the in-place R/B channel swap.
///
/// Exchanges bytes 0 and 2 of every complete 4-byte pixel in `data`. Whole 16-byte
/// chunks (4 pixels) go through one NEON table lookup each; the pixels that do not
/// fill a final 16-byte chunk are swapped one at a time, so no complete pixel is
/// skipped. A trailing fragment of 1-3 bytes (not a whole pixel) is left untouched,
/// matching `swizzle_scalar`.
///
/// # Safety
///
/// The caller must ensure the NEON intrinsics used here are available on the
/// running CPU.
#[cfg(target_arch = "aarch64")]
unsafe fn swizzle_simd(data: &mut [u8]) {
    /// Bytes held by one 128-bit NEON register.
    const LANE_BYTES: usize = 16;

    // Table-lookup indices: within every 4-byte pixel, bytes 0 and 2 trade places.
    let mask_bytes: [u8; 16] = [
        2, 1, 0, 3, 6, 5, 4, 7,
        10, 9, 8, 11, 14, 13, 12, 15,
    ];
    let mask = vld1q_u8(mask_bytes.as_ptr());

    // Split into a prefix that is a whole number of 16-byte chunks and the remainder.
    let simd_len = data.len() & !(LANE_BYTES - 1);
    let (head, tail) = data.split_at_mut(simd_len);

    for block in head.chunks_exact_mut(LANE_BYTES) {
        let p = block.as_mut_ptr();
        // SAFETY: `block` is exactly 16 valid, exclusively borrowed bytes.
        let chunk = vld1q_u8(p);
        // vqtbl1q_u8 picks bytes out of `chunk` using the indices in `mask`.
        let swizzled = vqtbl1q_u8(chunk, mask);
        vst1q_u8(p, swizzled);
    }

    // Scalar tail: the pixels (fewer than 4) that did not fill a 16-byte chunk.
    for pixel in tail.chunks_exact_mut(4) {
        pixel.swap(0, 2);
    }
}
/// Portable fallback: exchanges bytes 0 and 2 of every complete 4-byte pixel in place.
///
/// Trailing bytes that do not form a whole pixel (length not a multiple of 4) are
/// left unchanged.
fn swizzle_scalar(data: &mut [u8]) {
    data.chunks_exact_mut(4)
        .for_each(|pixel| pixel.swap(0, 2));
}
#[cfg(test)]
mod tests {
    use super::*;

    /// A 64-pixel buffer (256 bytes, a whole number of both 32-byte and 16-byte
    /// chunks) must come back with bytes 0 and 2 of every pixel exchanged and
    /// bytes 1 and 3 untouched.
    #[test]
    fn test_swizzle_correctness() {
        const PIXELS: usize = 64;

        // Every input pixel is [1, 2, 3, 4]; after the swap it must read [3, 2, 1, 4].
        let mut data: Vec<u8> = [1u8, 2, 3, 4].repeat(PIXELS);

        swizzle_bgra_rgba(&mut data);

        for (i, pixel) in data.chunks_exact(4).enumerate() {
            assert_eq!(pixel[0], 3, "Red/Blue not swapped at pixel {}", i);
            assert_eq!(pixel[1], 2, "Green touched at pixel {}", i);
            assert_eq!(pixel[2], 1, "Blue/Red not swapped at pixel {}", i);
            assert_eq!(pixel[3], 4, "Alpha touched at pixel {}", i);
        }
    }

    /// A buffer shorter than one SIMD chunk (a single pixel) must still be swizzled.
    #[test]
    fn test_swizzle_alignment_edge_cases() {
        let mut single_pixel = [10u8, 20, 30, 40];

        swizzle_bgra_rgba(&mut single_pixel);

        assert_eq!(single_pixel, [30, 20, 10, 40]);
    }
}