add mac NEON SIMD

PokemonAutomation · Nov 23, 2023 · 572623a · 572623a
1 parent cf8955d
commit 572623a
Show file tree

Hide file tree

Showing 15 changed files with 1,475 additions and 4 deletions.
diff --git a/SerialPrograms/CMakeLists.txt b/SerialPrograms/CMakeLists.txt
@@ -644,9 +644,11 @@ file(GLOB MAIN_SOURCES
     Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_Core_64x32_x64_AVX512.cpp
     Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_Core_64x4_Default.cpp
     Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_Core_64x64_x64_AVX512.cpp
+    Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_Core_64x8_arm64_NEON.cpp
     Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_Core_64x8_x64_SSE42.cpp
     Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_Default.h
     Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_Routines.h
+    Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_arm64_NEON.h
     Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_x64_AVX2.h
     Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_x64_AVX512.h
     Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_x64_SSE42.h
@@ -656,19 +658,23 @@ file(GLOB MAIN_SOURCES
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrixTile_64x32_x64_AVX512.h
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrixTile_64x4_Default.h
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrixTile_64x64_x64_AVX512.h
+    Source/Kernels/BinaryMatrix/Kernels_BinaryMatrixTile_64x8_arm64_NEON.h
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrixTile_64x8_x64_SSE42.h
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrixTile_64xH_Default.h
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrixTile_Debugging.h
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Arch_64x16_x64_AVX2.h
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Arch_64x32_x64_AVX512.h
+    Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Arch_64x4_Default.h
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Arch_64x64_x64_AVX512.h
+    Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Arch_64x8_arm64_NEON.h
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Arch_64x8_x64_SSE42.h
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Arch_64xH_Default.h
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Core_64x16_x64_AVX2.cpp
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Core_64x32_x64_AVX512.cpp
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Core_64x64_x64_AVX512.cpp
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Core_64x8_x64_SSE42.cpp
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Core_64xH_Default.cpp
+    Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Core_arm64_NEON.cpp
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_t.h
     Source/Kernels/BinaryMatrix/Kernels_PackedBinaryMatrix.h
     Source/Kernels/BinaryMatrix/Kernels_PackedBinaryMatrixCore.h
@@ -739,6 +745,8 @@ file(GLOB MAIN_SOURCES
     Source/Kernels/Waterfill/Kernels_Waterfill_Core_64x64_x64_AVX512-GF.h
     Source/Kernels/Waterfill/Kernels_Waterfill_Core_64x64_x64_AVX512.cpp
     Source/Kernels/Waterfill/Kernels_Waterfill_Core_64x64_x64_AVX512.h
+    Source/Kernels/Waterfill/Kernels_Waterfill_Core_64x8_arm64_NEON.cpp
+    Source/Kernels/Waterfill/Kernels_Waterfill_Core_64x8_arm64_NEON.h
     Source/Kernels/Waterfill/Kernels_Waterfill_Core_64x8_x64_SSE42.cpp
     Source/Kernels/Waterfill/Kernels_Waterfill_Core_64x8_x64_SSE42.h
     Source/Kernels/Waterfill/Kernels_Waterfill_Core_64xH_Default.cpp

diff --git a/SerialPrograms/SerialPrograms.pro b/SerialPrograms/SerialPrograms.pro
@@ -330,13 +330,15 @@ SOURCES += \
     Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_Core_64x32_x64_AVX512.cpp \
     Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_Core_64x4_Default.cpp \
     Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_Core_64x64_x64_AVX512.cpp \
+    Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_Core_64x8_arm64_NEON.cpp \
     Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_Core_64x8_x64_SSE42.cpp \
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix.cpp \
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Core_64x16_x64_AVX2.cpp \
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Core_64x32_x64_AVX512.cpp \
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Core_64x64_x64_AVX512.cpp \
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Core_64x8_x64_SSE42.cpp \
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Core_64xH_Default.cpp \
+    Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Core_arm64_NEON.cpp \
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Core_x64_AVX2.cpp \
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Core_x64_AVX512.cpp \
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Core_x64_SSE42.cpp \
@@ -378,6 +380,7 @@ SOURCES += \
     Source/Kernels/Waterfill/Kernels_Waterfill_Core_64x32_x64_AVX512.cpp \
     Source/Kernels/Waterfill/Kernels_Waterfill_Core_64x64_x64_AVX512-GF.cpp \
     Source/Kernels/Waterfill/Kernels_Waterfill_Core_64x64_x64_AVX512.cpp \
+    Source/Kernels/Waterfill/Kernels_Waterfill_Core_64x8_arm64_NEON.cpp \
     Source/Kernels/Waterfill/Kernels_Waterfill_Core_64x8_x64_SSE42.cpp \
     Source/Kernels/Waterfill/Kernels_Waterfill_Core_64xH_Default.cpp \
     Source/Kernels/Waterfill/Kernels_Waterfill_Session.cpp \
@@ -1368,6 +1371,7 @@ HEADERS += \
     Source/Kernels/AudioStreamConversion/AudioStreamConversion.h \
     Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters.h \
     Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_Routines.h \
+    Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_arm64_NEON.h \
     Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_x64_AVX2.h \
     Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_x64_AVX512.h \
     Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_x64_SSE42.h \
@@ -1376,12 +1380,14 @@ HEADERS += \
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrixTile_64x32_x64_AVX512.h \
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrixTile_64x4_Default.h \
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrixTile_64x64_x64_AVX512.h \
+    Source/Kernels/BinaryMatrix/Kernels_BinaryMatrixTile_64x8_arm64_NEON.h \
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrixTile_64x8_x64_SSE42.h \
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrixTile_64xH_Default.h \
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrixTile_Debugging.h \
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Arch_64x16_x64_AVX2.h \
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Arch_64x32_x64_AVX512.h \
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Arch_64x64_x64_AVX512.h \
+    Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Arch_64x8_arm64_NEON.h \
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Arch_64x8_x64_SSE42.h \
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Arch_64xH_Default.h \
     Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_t.h \
@@ -1414,6 +1420,7 @@ HEADERS += \
     Source/Kernels/Waterfill/Kernels_Waterfill_Core_64x4_Default.h \
     Source/Kernels/Waterfill/Kernels_Waterfill_Core_64x64_x64_AVX512-GF.h \
     Source/Kernels/Waterfill/Kernels_Waterfill_Core_64x64_x64_AVX512.h \
+    Source/Kernels/Waterfill/Kernels_Waterfill_Core_64x8_arm64_NEON.h \
     Source/Kernels/Waterfill/Kernels_Waterfill_Core_64x8_x64_SSE42.h \
     Source/Kernels/Waterfill/Kernels_Waterfill_Core_64xH_Default.h \
     Source/Kernels/Waterfill/Kernels_Waterfill_Intrinsics_x64_AVX512-GF.h \

diff --git a/SerialPrograms/Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters.cpp b/SerialPrograms/Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters.cpp
@@ -22,6 +22,7 @@ void filter_by_mask_64x8_x64_SSE42    (const PackedBinaryMatrix_IB& matrix, uint
 void filter_by_mask_64x16_x64_AVX2    (const PackedBinaryMatrix_IB& matrix, uint32_t* image, size_t bytes_per_row, uint32_t replace_with, bool replace_if_zero);
 void filter_by_mask_64x32_x64_AVX512  (const PackedBinaryMatrix_IB& matrix, uint32_t* image, size_t bytes_per_row, uint32_t replace_with, bool replace_if_zero);
 void filter_by_mask_64x64_x64_AVX512  (const PackedBinaryMatrix_IB& matrix, uint32_t* image, size_t bytes_per_row, uint32_t replace_with, bool replace_if_zero);
+void filter_by_mask_64x8_arm64_NEON    (const PackedBinaryMatrix_IB& matrix, uint32_t* image, size_t bytes_per_row, uint32_t replace_with, bool replace_if_zero);
 
 void filter_by_mask(
     const PackedBinaryMatrix_IB& matrix,
@@ -47,12 +48,17 @@ void filter_by_mask(
     case BinaryMatrixType::i64x8_x64_SSE42:
         filter_by_mask_64x8_x64_SSE42(matrix, image, bytes_per_row, replace_with, replace_if_zero);
         return;
+#endif
+#ifdef PA_AutoDispatch_arm64_20_M1
+    case BinaryMatrixType::arm64x8_x64_NEON:
+        filter_by_mask_64x8_arm64_NEON(matrix, image, bytes_per_row, replace_with, replace_if_zero);
+        return;
 #endif
     case BinaryMatrixType::i64x4_Default:
         filter_by_mask_64x4_Default(matrix, image, bytes_per_row, replace_with, replace_if_zero);
         return;
     default:
-        throw InternalProgramError(nullptr, PA_CURRENT_FUNCTION, "Unsupported matrix format.");
+        throw InternalProgramError(nullptr, PA_CURRENT_FUNCTION, "Unsupported matrix format in filter_by_mask().");
     }
 }
 
@@ -108,6 +114,15 @@ void compress_rgb32_to_binary_range_64x64_x64_AVX512(
     CompressRgb32ToBinaryRangeFilter* filters, size_t filter_count
 );
 
+void compress_rgb32_to_binary_range_64x8_arm64_NEON(
+    const uint32_t* image, size_t bytes_per_row,
+    PackedBinaryMatrix_IB& matrix0, uint32_t mins0, uint32_t maxs0
+);
+void compress_rgb32_to_binary_range_64x8_arm64_NEON(
+    const uint32_t* image, size_t bytes_per_row,
+    CompressRgb32ToBinaryRangeFilter* filters, size_t filter_count
+);
+
 void compress_rgb32_to_binary_range(
     const uint32_t* image, size_t bytes_per_row,
     PackedBinaryMatrix_IB& matrix,
@@ -131,6 +146,11 @@ void compress_rgb32_to_binary_range(
     case BinaryMatrixType::i64x8_x64_SSE42:
         compress_rgb32_to_binary_range_64x8_x64_SSE42(image, bytes_per_row, matrix, mins, maxs);
         return;
+#endif
+#ifdef PA_AutoDispatch_arm64_20_M1
+    case BinaryMatrixType::arm64x8_x64_NEON:
+        compress_rgb32_to_binary_range_64x8_arm64_NEON(image, bytes_per_row, matrix, mins, maxs);
+        return;
 #endif
     case BinaryMatrixType::i64x4_Default:
         compress_rgb32_to_binary_range_64x4_Default(image, bytes_per_row, matrix, mins, maxs);
@@ -170,6 +190,11 @@ void compress_rgb32_to_binary_range(
     case BinaryMatrixType::i64x8_x64_SSE42:
         compress_rgb32_to_binary_range_64x8_x64_SSE42(image, bytes_per_row, filters, filter_count);
         return;
+#endif
+#ifdef PA_AutoDispatch_arm64_20_M1
+    case BinaryMatrixType::arm64x8_x64_NEON:
+        compress_rgb32_to_binary_range_64x8_arm64_NEON(image, bytes_per_row, filters, filter_count);
+        return;
 #endif
     case BinaryMatrixType::i64x4_Default:
         compress_rgb32_to_binary_range_64x4_Default(image, bytes_per_row, filters, filter_count);
@@ -180,7 +205,6 @@ void compress_rgb32_to_binary_range(
 }
 
 
-
 void compress_rgb32_to_binary_euclidean_64x64_x64_AVX512(
     const uint32_t* image, size_t bytes_per_row,
     PackedBinaryMatrix_IB& matrix,
@@ -201,6 +225,11 @@ void compress_rgb32_to_binary_euclidean_64x8_x64_SSE42(
     PackedBinaryMatrix_IB& matrix,
     uint32_t expected, double max_euclidean_distance
 );
+void compress_rgb32_to_binary_euclidean_64x8_arm64_NEON(
+    const uint32_t* image, size_t bytes_per_row,
+    PackedBinaryMatrix_IB& matrix,
+    uint32_t expected, double max_euclidean_distance
+);
 void compress_rgb32_to_binary_euclidean_64x4_Default(
     const uint32_t* image, size_t bytes_per_row,
     PackedBinaryMatrix_IB& matrix,
@@ -229,6 +258,11 @@ void compress_rgb32_to_binary_euclidean(
     case BinaryMatrixType::i64x8_x64_SSE42:
         compress_rgb32_to_binary_euclidean_64x8_x64_SSE42(image, bytes_per_row, matrix, expected, max_euclidean_distance);
         return;
+#endif
+#ifdef PA_AutoDispatch_arm64_20_M1
+    case BinaryMatrixType::arm64x8_x64_NEON:
+        compress_rgb32_to_binary_euclidean_64x8_arm64_NEON(image, bytes_per_row, matrix, expected, max_euclidean_distance);
+        return;
 #endif
     case BinaryMatrixType::i64x4_Default:
         compress_rgb32_to_binary_euclidean_64x4_Default(image, bytes_per_row, matrix, expected, max_euclidean_distance);

diff --git a/...urce/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_Core_64x8_arm64_NEON.cpp b/...urce/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_Core_64x8_arm64_NEON.cpp
@@ -0,0 +1,64 @@
+/*  Binary Image Basic Filters (arm64 NEON)
+ *
+ *  From: https://github.com/PokemonAutomation/Arduino-Source
+ *
+ */
+
+#ifdef PA_AutoDispatch_arm64_20_M1
+
+#include "Kernels/BinaryMatrix/Kernels_BinaryMatrix_Arch_64x8_arm64_NEON.h"
+#include "Kernels_BinaryImage_BasicFilters_Routines.h"
+#include "Kernels_BinaryImage_BasicFilters_arm64_NEON.h"
+
+namespace PokemonAutomation{
+namespace Kernels{
+
+
+// arm64 NEON (64x8 tile) entry point for mask filtering.
+// `matrix` is downcast to PackedBinaryMatrix_64x8_arm64_NEON, so the caller must pass a matrix of that type.
+void filter_by_mask_64x8_arm64_NEON(
+    const PackedBinaryMatrix_IB& matrix,
+    uint32_t* image, size_t bytes_per_row,
+    uint32_t replace_with, bool replace_if_zero
+){
+    const auto& neon_matrix = static_cast<const PackedBinaryMatrix_64x8_arm64_NEON&>(matrix);
+    FilterByMask_arm64_NEON mask_filter(replace_with, replace_if_zero);
+    filter_by_mask(neon_matrix.get(), image, bytes_per_row, mask_filter);
+}
+
+
+// arm64 NEON (64x8 tile) entry point for compressing an RGB32 image into one binary matrix
+// using a single [mins0, maxs0] color range.
+// `matrix0` is downcast to PackedBinaryMatrix_64x8_arm64_NEON, so the caller must pass a matrix of that type.
+void compress_rgb32_to_binary_range_64x8_arm64_NEON(
+    const uint32_t* image, size_t bytes_per_row,
+    PackedBinaryMatrix_IB& matrix0, uint32_t mins0, uint32_t maxs0
+){
+    auto& neon_matrix = static_cast<PackedBinaryMatrix_64x8_arm64_NEON&>(matrix0);
+    Compressor_RgbRange_arm64_NEON range_compressor(mins0, maxs0);
+    compress_rgb32_to_binary(image, bytes_per_row, neon_matrix.get(), range_compressor);
+}
+// arm64 NEON (64x8 tile) entry point for the multi-filter overload: forwards `filters` / `filter_count`
+// to the generic routine instantiated with the NEON matrix and range-compressor types.
+void compress_rgb32_to_binary_range_64x8_arm64_NEON(
+    const uint32_t* image, size_t bytes_per_row,
+    CompressRgb32ToBinaryRangeFilter* filters, size_t filter_count
+){
+    using NeonMatrix = PackedBinaryMatrix_64x8_arm64_NEON;
+    using NeonRangeCompressor = Compressor_RgbRange_arm64_NEON;
+    compress_rgb32_to_binary<NeonMatrix, NeonRangeCompressor>(image, bytes_per_row, filters, filter_count);
+}
+
+
+void compress_rgb32_to_binary_euclidean_64x8_arm64_NEON(
+    const uint32_t* image, size_t bytes_per_row,
+    PackedBinaryMatrix_IB& matrix,
+    uint32_t expected, double max_euclidean_distance
+){
+    // TODO: Not implemented yet. Until the commented-out code below is enabled, this function is a silent no-op: it ignores all of its arguments and leaves `matrix` unmodified.
+    // Compressor_RgbEuclidean_arm64_NEON compressor(expected, max_euclidean_distance);
+    // compress_rgb32_to_binary(
+    //     image, bytes_per_row,
+    //     static_cast<PackedBinaryMatrix_64x8_arm64_NEON&>(matrix).get(), compressor
+    // );
+}
+
+
+
+}
+}
+#endif
diff --git a/SerialPrograms/Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_Routines.h b/SerialPrograms/Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_Routines.h
@@ -118,6 +118,44 @@ void compress_rgb32_to_binary(
 }
 
 
+// Change pixel (as uint32_t) color of image based on bits in a binary matrix.
+// If `filter` is constructed with `replace_if_zero` being true, image pixels corresponding to 0-bits in `matrix`
+//    are replaced with color `replace_with` which is provided by the filter.
+// If `filter` is constructed with `replace_if_zero` being false, image pixels corresponding to 1-bits in `matrix`
+//    are replaced with color `replace_with`.
+// `BinaryMatrixType` is a PackedBinaryMatrixCore<BinaryMatrixTileType>,
+// where BinaryMatrixTileType is an implementation of a tile, defined for every SIMD architecture.
+// `Filter` must provide `filter64(word, pixels)` for a full 64-pixel word and
+// `filter64(word, pixels, count)` for a trailing partial word that covers only `count` pixels.
+// `image` points at the first pixel of the first row; consecutive rows are `bytes_per_row` bytes apart.
+template <typename BinaryMatrixType, typename Filter>
+void filter_rgb32(
+    const BinaryMatrixType& matrix,
+    uint32_t* image, size_t bytes_per_row,
+    const Filter& filter
+){
+    const size_t bit_width = matrix.width();
+
+    // Each 64-bit word covers 64 horizontally adjacent pixels of one row.
+    const size_t full_words = bit_width / 64;
+    const size_t tail_bits = bit_width % 64;
+
+    // Number of rows to visit. Every iteration below handles exactly one image row.
+    const size_t rows = matrix.word64_height();
+    for (size_t r = 0; r < rows; r++){
+        uint32_t* img = image;
+        for (size_t c = 0; c < full_words; c++){
+            // Modify some pixels in the 64-pixel-long area of the image starting at pointer `img`.
+            filter.filter64(matrix.word64(c, r), img);
+            img += 64;
+        }
+        if (tail_bits > 0){
+            // The last word of the row is only partially used: touch just the remaining pixels.
+            filter.filter64(matrix.word64(full_words, r), img, tail_bits);
+        }
+        // Advance by the row stride in bytes, which need not equal width * sizeof(uint32_t).
+        image = reinterpret_cast<uint32_t*>(reinterpret_cast<char*>(image) + bytes_per_row);
+    }
+}
 
 
 }