From 6922e501c7eec48ab7da0df943b73e2dff36a8ee Mon Sep 17 00:00:00 2001 From: Katherine Whitlock Date: Tue, 10 Dec 2024 21:31:46 -0500 Subject: [PATCH] Add temporary multiload/store intrinsics --- src/CMakeLists.txt | 2 +- src/mem_functions.h | 2 +- src/{deluge/dsp => }/memmove.c | 34 ++++++++++++++++++++++++++++++---- 3 files changed, 32 insertions(+), 6 deletions(-) rename src/{deluge/dsp => }/memmove.c (60%) diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 0d165e29ed..02ee576727 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,6 +1,6 @@ cmake_path(SET SHARED_INCLUDE ${CMAKE_CURRENT_LIST_DIR}) -target_sources(deluge PUBLIC main.c resetprg.c c_lib_alternatives.S malloc.c terminate.cpp) +target_sources(deluge PUBLIC main.c resetprg.c c_lib_alternatives.S malloc.c terminate.cpp memmove.c) add_subdirectory(OSLikeStuff) add_subdirectory(deluge) diff --git a/src/mem_functions.h b/src/mem_functions.h index 2ceb8e7395..452279661a 100644 --- a/src/mem_functions.h +++ b/src/mem_functions.h @@ -6,8 +6,8 @@ extern "C" { // shenanigans void* memset(void*, int, size_t); void* memcpy(void* dest, const void* src, size_t n); +void* memmove(void* dest, const void* src, size_t n); int strcmp(const char* str1, const char* str2); -void* memmove(void* dst, const void* src, size_t len); #ifdef __cplusplus } #endif diff --git a/src/deluge/dsp/memmove.c b/src/memmove.c similarity index 60% rename from src/deluge/dsp/memmove.c rename to src/memmove.c index afad545181..f9cd90e368 100644 --- a/src/deluge/dsp/memmove.c +++ b/src/memmove.c @@ -5,7 +5,35 @@ #include #include -void* my_memmove(void* dst, const void* src, size_t len) { +[[gnu::always_inline]] inline uint8x16x2_t vld1q_u8_x2(unsigned char const* ptr) { + uint8x16x2_t output; + output.val[0] = vld1q_u8(ptr); + output.val[1] = vld1q_u8(ptr + 16); + return output; +} + +[[gnu::always_inline]] inline uint8x16x4_t vld1q_u8_x4(unsigned char const* ptr) { + uint8x16x4_t output; + output.val[0] = vld1q_u8(ptr); + output.val[1] = vld1q_u8(ptr + 16); + output.val[2] = vld1q_u8(ptr + 32); + output.val[3] = vld1q_u8(ptr + 48); + return output; +} + +[[gnu::always_inline]] inline void vst1q_u8_x2(unsigned char* ptr, uint8x16x2_t output) { + vst1q_u8(ptr, output.val[0]); + vst1q_u8(ptr + 16, output.val[1]); +} + +[[gnu::always_inline]] inline void vst1q_u8_x4(unsigned char* ptr, uint8x16x4_t output) { + vst1q_u8(ptr, output.val[0]); + vst1q_u8(ptr + 16, output.val[1]); + vst1q_u8(ptr + 32, output.val[2]); + vst1q_u8(ptr + 48, output.val[3]); +} + +void* memmove(void* dst, const void* src, size_t len) { ptrdiff_t result; asm("sub %0, %1, %2" : "=r"(result) : "r"(dst), "r"(src)); if (abs(result) >= (ptrdiff_t)len) { @@ -51,21 +79,19 @@ void* my_memmove(void* dst, const void* src, size_t len) { vst1q_u8(d, vld1q_u8(s)); } - // quadword x2 if (len % 64) { s -= 32; d -= 32; vst1q_u8_x2(d, vld1q_u8_x2(s)); } - // quadword x4 if (len % 128) { s -= 64; d -= 64; vst1q_u8_x4(d, vld1q_u8_x4(s)); } - // quadword x8 + // max number of quadwords is 16 while ((intptr_t)d > (intptr_t)dst) { s -= 64; uint8x16x4_t ld1 = vld1q_u8_x4(s);