Skip to content

Commit 44e3b94

Browse files
authored
Use SVE widening loads (ARM-software#490)
This change moves from NEON-style post-load widening to SVE-style load-time widening. On Neoverse V1 this improves performance by 1-5% (increasing with block size).
1 parent 298aafd commit 44e3b94

File tree

1 file changed

+3
-9
lines changed

1 file changed

+3
-9
lines changed

Source/astcenc_vecmathlib_sve_8.h

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -146,14 +146,8 @@ struct vint8
146146
*/
147147
ASTCENC_SIMD_INLINE explicit vint8(const uint8_t *p)
148148
{
149-
// Load 8 byte values
150-
svbool_8_t pred = svptrue_pat_b8(SV_VL8);
151-
svuint8_8_t m8 = svld1_u8(pred, p);
152-
153-
// Expand to 32-bits
154-
svuint16_8_t m16 = svunpklo_u16(m8);
155-
svuint32_8_t m32 = svunpklo_u32(m16);
156-
m = svreinterpret_s32_u32(m32);
149+
// Load 8-bit values and expand to 32-bits
150+
m = svld1ub_s32(svptrue_b32(), p);
157151
}
158152

159153
/**
@@ -1037,7 +1031,7 @@ ASTCENC_SIMD_INLINE vint8 interleave_rgba8(vint8 r, vint8 g, vint8 b, vint8 a)
10371031
*/
10381032
ASTCENC_SIMD_INLINE void store_lanes_masked(uint8_t* base, vint8 data, vmask8 mask)
10391033
{
1040-
svst1_u32(mask.m, reinterpret_cast<uint32_t*>(base), data.m);
1034+
svst1_s32(mask.m, reinterpret_cast<int32_t*>(base), data.m);
10411035
}
10421036

10431037
/**

0 commit comments

Comments
 (0)