Skip to content

Commit

Permalink
add mac NEON SIMD
Browse files Browse the repository at this point in the history
  • Loading branch information
Gin committed Nov 23, 2023
1 parent cf8955d commit 572623a
Show file tree
Hide file tree
Showing 15 changed files with 1,475 additions and 4 deletions.
8 changes: 8 additions & 0 deletions SerialPrograms/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -644,9 +644,11 @@ file(GLOB MAIN_SOURCES
Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_Core_64x32_x64_AVX512.cpp
Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_Core_64x4_Default.cpp
Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_Core_64x64_x64_AVX512.cpp
Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_Core_64x8_arm64_NEON.cpp
Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_Core_64x8_x64_SSE42.cpp
Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_Default.h
Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_Routines.h
Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_arm64_NEON.h
Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_x64_AVX2.h
Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_x64_AVX512.h
Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_x64_SSE42.h
Expand All @@ -656,19 +658,23 @@ file(GLOB MAIN_SOURCES
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrixTile_64x32_x64_AVX512.h
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrixTile_64x4_Default.h
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrixTile_64x64_x64_AVX512.h
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrixTile_64x8_arm64_NEON.h
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrixTile_64x8_x64_SSE42.h
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrixTile_64xH_Default.h
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrixTile_Debugging.h
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Arch_64x16_x64_AVX2.h
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Arch_64x32_x64_AVX512.h
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Arch_64x4_Default.h
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Arch_64x64_x64_AVX512.h
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Arch_64x8_arm64_NEON.h
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Arch_64x8_x64_SSE42.h
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Arch_64xH_Default.h
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Core_64x16_x64_AVX2.cpp
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Core_64x32_x64_AVX512.cpp
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Core_64x64_x64_AVX512.cpp
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Core_64x8_x64_SSE42.cpp
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Core_64xH_Default.cpp
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Core_arm64_NEON.cpp
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_t.h
Source/Kernels/BinaryMatrix/Kernels_PackedBinaryMatrix.h
Source/Kernels/BinaryMatrix/Kernels_PackedBinaryMatrixCore.h
Expand Down Expand Up @@ -739,6 +745,8 @@ file(GLOB MAIN_SOURCES
Source/Kernels/Waterfill/Kernels_Waterfill_Core_64x64_x64_AVX512-GF.h
Source/Kernels/Waterfill/Kernels_Waterfill_Core_64x64_x64_AVX512.cpp
Source/Kernels/Waterfill/Kernels_Waterfill_Core_64x64_x64_AVX512.h
Source/Kernels/Waterfill/Kernels_Waterfill_Core_64x8_arm64_NEON.cpp
Source/Kernels/Waterfill/Kernels_Waterfill_Core_64x8_arm64_NEON.h
Source/Kernels/Waterfill/Kernels_Waterfill_Core_64x8_x64_SSE42.cpp
Source/Kernels/Waterfill/Kernels_Waterfill_Core_64x8_x64_SSE42.h
Source/Kernels/Waterfill/Kernels_Waterfill_Core_64xH_Default.cpp
Expand Down
7 changes: 7 additions & 0 deletions SerialPrograms/SerialPrograms.pro
Original file line number Diff line number Diff line change
Expand Up @@ -330,13 +330,15 @@ SOURCES += \
Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_Core_64x32_x64_AVX512.cpp \
Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_Core_64x4_Default.cpp \
Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_Core_64x64_x64_AVX512.cpp \
Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_Core_64x8_arm64_NEON.cpp \
Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_Core_64x8_x64_SSE42.cpp \
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix.cpp \
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Core_64x16_x64_AVX2.cpp \
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Core_64x32_x64_AVX512.cpp \
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Core_64x64_x64_AVX512.cpp \
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Core_64x8_x64_SSE42.cpp \
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Core_64xH_Default.cpp \
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Core_arm64_NEON.cpp \
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Core_x64_AVX2.cpp \
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Core_x64_AVX512.cpp \
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Core_x64_SSE42.cpp \
Expand Down Expand Up @@ -378,6 +380,7 @@ SOURCES += \
Source/Kernels/Waterfill/Kernels_Waterfill_Core_64x32_x64_AVX512.cpp \
Source/Kernels/Waterfill/Kernels_Waterfill_Core_64x64_x64_AVX512-GF.cpp \
Source/Kernels/Waterfill/Kernels_Waterfill_Core_64x64_x64_AVX512.cpp \
Source/Kernels/Waterfill/Kernels_Waterfill_Core_64x8_arm64_NEON.cpp \
Source/Kernels/Waterfill/Kernels_Waterfill_Core_64x8_x64_SSE42.cpp \
Source/Kernels/Waterfill/Kernels_Waterfill_Core_64xH_Default.cpp \
Source/Kernels/Waterfill/Kernels_Waterfill_Session.cpp \
Expand Down Expand Up @@ -1368,6 +1371,7 @@ HEADERS += \
Source/Kernels/AudioStreamConversion/AudioStreamConversion.h \
Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters.h \
Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_Routines.h \
Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_arm64_NEON.h \
Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_x64_AVX2.h \
Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_x64_AVX512.h \
Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_x64_SSE42.h \
Expand All @@ -1376,12 +1380,14 @@ HEADERS += \
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrixTile_64x32_x64_AVX512.h \
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrixTile_64x4_Default.h \
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrixTile_64x64_x64_AVX512.h \
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrixTile_64x8_arm64_NEON.h \
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrixTile_64x8_x64_SSE42.h \
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrixTile_64xH_Default.h \
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrixTile_Debugging.h \
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Arch_64x16_x64_AVX2.h \
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Arch_64x32_x64_AVX512.h \
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Arch_64x64_x64_AVX512.h \
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Arch_64x8_arm64_NEON.h \
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Arch_64x8_x64_SSE42.h \
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_Arch_64xH_Default.h \
Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix_t.h \
Expand Down Expand Up @@ -1414,6 +1420,7 @@ HEADERS += \
Source/Kernels/Waterfill/Kernels_Waterfill_Core_64x4_Default.h \
Source/Kernels/Waterfill/Kernels_Waterfill_Core_64x64_x64_AVX512-GF.h \
Source/Kernels/Waterfill/Kernels_Waterfill_Core_64x64_x64_AVX512.h \
Source/Kernels/Waterfill/Kernels_Waterfill_Core_64x8_arm64_NEON.h \
Source/Kernels/Waterfill/Kernels_Waterfill_Core_64x8_x64_SSE42.h \
Source/Kernels/Waterfill/Kernels_Waterfill_Core_64xH_Default.h \
Source/Kernels/Waterfill/Kernels_Waterfill_Intrinsics_x64_AVX512-GF.h \
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ void filter_by_mask_64x8_x64_SSE42 (const PackedBinaryMatrix_IB& matrix, uint
void filter_by_mask_64x16_x64_AVX2 (const PackedBinaryMatrix_IB& matrix, uint32_t* image, size_t bytes_per_row, uint32_t replace_with, bool replace_if_zero);
void filter_by_mask_64x32_x64_AVX512 (const PackedBinaryMatrix_IB& matrix, uint32_t* image, size_t bytes_per_row, uint32_t replace_with, bool replace_if_zero);
void filter_by_mask_64x64_x64_AVX512 (const PackedBinaryMatrix_IB& matrix, uint32_t* image, size_t bytes_per_row, uint32_t replace_with, bool replace_if_zero);
void filter_by_mask_64x8_arm64_NEON (const PackedBinaryMatrix_IB& matrix, uint32_t* image, size_t bytes_per_row, uint32_t replace_with, bool replace_if_zero);

void filter_by_mask(
const PackedBinaryMatrix_IB& matrix,
Expand All @@ -47,12 +48,17 @@ void filter_by_mask(
case BinaryMatrixType::i64x8_x64_SSE42:
filter_by_mask_64x8_x64_SSE42(matrix, image, bytes_per_row, replace_with, replace_if_zero);
return;
#endif
#ifdef PA_AutoDispatch_arm64_20_M1
case BinaryMatrixType::arm64x8_x64_NEON:
filter_by_mask_64x8_arm64_NEON(matrix, image, bytes_per_row, replace_with, replace_if_zero);
return;
#endif
case BinaryMatrixType::i64x4_Default:
filter_by_mask_64x4_Default(matrix, image, bytes_per_row, replace_with, replace_if_zero);
return;
default:
throw InternalProgramError(nullptr, PA_CURRENT_FUNCTION, "Unsupported matrix format.");
throw InternalProgramError(nullptr, PA_CURRENT_FUNCTION, "Unsupported matrix format in filter_by_mask().");
}
}

Expand Down Expand Up @@ -108,6 +114,15 @@ void compress_rgb32_to_binary_range_64x64_x64_AVX512(
CompressRgb32ToBinaryRangeFilter* filters, size_t filter_count
);

// arm64 NEON (64x8) variants of the RGB32 range compression.
// compress_rgb32_to_binary_range() dispatches to these for
// BinaryMatrixType::arm64x8_x64_NEON when PA_AutoDispatch_arm64_20_M1 is defined.
//
// Single-range overload: compresses `image` into `matrix0` using the range [mins0, maxs0].
void compress_rgb32_to_binary_range_64x8_arm64_NEON(
const uint32_t* image, size_t bytes_per_row,
PackedBinaryMatrix_IB& matrix0, uint32_t mins0, uint32_t maxs0
);
// Batch overload: processes the `filter_count` entries of `filters` against the same image.
void compress_rgb32_to_binary_range_64x8_arm64_NEON(
const uint32_t* image, size_t bytes_per_row,
CompressRgb32ToBinaryRangeFilter* filters, size_t filter_count
);

void compress_rgb32_to_binary_range(
const uint32_t* image, size_t bytes_per_row,
PackedBinaryMatrix_IB& matrix,
Expand All @@ -131,6 +146,11 @@ void compress_rgb32_to_binary_range(
case BinaryMatrixType::i64x8_x64_SSE42:
compress_rgb32_to_binary_range_64x8_x64_SSE42(image, bytes_per_row, matrix, mins, maxs);
return;
#endif
#ifdef PA_AutoDispatch_arm64_20_M1
case BinaryMatrixType::arm64x8_x64_NEON:
compress_rgb32_to_binary_range_64x8_arm64_NEON(image, bytes_per_row, matrix, mins, maxs);
return;
#endif
case BinaryMatrixType::i64x4_Default:
compress_rgb32_to_binary_range_64x4_Default(image, bytes_per_row, matrix, mins, maxs);
Expand Down Expand Up @@ -170,6 +190,11 @@ void compress_rgb32_to_binary_range(
case BinaryMatrixType::i64x8_x64_SSE42:
compress_rgb32_to_binary_range_64x8_x64_SSE42(image, bytes_per_row, filters, filter_count);
return;
#endif
#ifdef PA_AutoDispatch_arm64_20_M1
case BinaryMatrixType::arm64x8_x64_NEON:
compress_rgb32_to_binary_range_64x8_arm64_NEON(image, bytes_per_row, filters, filter_count);
return;
#endif
case BinaryMatrixType::i64x4_Default:
compress_rgb32_to_binary_range_64x4_Default(image, bytes_per_row, filters, filter_count);
Expand All @@ -180,7 +205,6 @@ void compress_rgb32_to_binary_range(
}



void compress_rgb32_to_binary_euclidean_64x64_x64_AVX512(
const uint32_t* image, size_t bytes_per_row,
PackedBinaryMatrix_IB& matrix,
Expand All @@ -201,6 +225,11 @@ void compress_rgb32_to_binary_euclidean_64x8_x64_SSE42(
PackedBinaryMatrix_IB& matrix,
uint32_t expected, double max_euclidean_distance
);
// arm64 NEON (64x8) variant of the Euclidean-distance compression.
// compress_rgb32_to_binary_euclidean() dispatches here for
// BinaryMatrixType::arm64x8_x64_NEON when PA_AutoDispatch_arm64_20_M1 is defined.
// NOTE(review): the definition added alongside this declaration currently has an
// empty body (TODO), so calling it leaves `matrix` untouched.
void compress_rgb32_to_binary_euclidean_64x8_arm64_NEON(
const uint32_t* image, size_t bytes_per_row,
PackedBinaryMatrix_IB& matrix,
uint32_t expected, double max_euclidean_distance
);
void compress_rgb32_to_binary_euclidean_64x4_Default(
const uint32_t* image, size_t bytes_per_row,
PackedBinaryMatrix_IB& matrix,
Expand Down Expand Up @@ -229,6 +258,11 @@ void compress_rgb32_to_binary_euclidean(
case BinaryMatrixType::i64x8_x64_SSE42:
compress_rgb32_to_binary_euclidean_64x8_x64_SSE42(image, bytes_per_row, matrix, expected, max_euclidean_distance);
return;
#endif
#ifdef PA_AutoDispatch_arm64_20_M1
case BinaryMatrixType::arm64x8_x64_NEON:
compress_rgb32_to_binary_euclidean_64x8_arm64_NEON(image, bytes_per_row, matrix, expected, max_euclidean_distance);
return;
#endif
case BinaryMatrixType::i64x4_Default:
compress_rgb32_to_binary_euclidean_64x4_Default(image, bytes_per_row, matrix, expected, max_euclidean_distance);
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
/* Binary Image Basic Filters (arm64 NEON)
*
* From: https://github.com/PokemonAutomation/Arduino-Source
*
*/

#ifdef PA_AutoDispatch_arm64_20_M1

#include "Kernels/BinaryMatrix/Kernels_BinaryMatrix_Arch_64x8_arm64_NEON.h"
#include "Kernels_BinaryImage_BasicFilters_Routines.h"
#include "Kernels_BinaryImage_BasicFilters_arm64_NEON.h"

namespace PokemonAutomation{
namespace Kernels{


void filter_by_mask_64x8_arm64_NEON(
const PackedBinaryMatrix_IB& matrix,
uint32_t* image, size_t bytes_per_row,
uint32_t replace_with, bool replace_if_zero
){
FilterByMask_arm64_NEON filter(replace_with, replace_if_zero);
filter_by_mask(static_cast<const PackedBinaryMatrix_64x8_arm64_NEON&>(matrix).get(), image, bytes_per_row, filter);
}


void compress_rgb32_to_binary_range_64x8_arm64_NEON(
const uint32_t* image, size_t bytes_per_row,
PackedBinaryMatrix_IB& matrix0, uint32_t mins0, uint32_t maxs0
){
Compressor_RgbRange_arm64_NEON compressor0(mins0, maxs0);
compress_rgb32_to_binary(
image, bytes_per_row,
static_cast<PackedBinaryMatrix_64x8_arm64_NEON&>(matrix0).get(), compressor0
);
}
void compress_rgb32_to_binary_range_64x8_arm64_NEON(
const uint32_t* image, size_t bytes_per_row,
CompressRgb32ToBinaryRangeFilter* filters, size_t filter_count
){
compress_rgb32_to_binary<PackedBinaryMatrix_64x8_arm64_NEON, Compressor_RgbRange_arm64_NEON>(
image, bytes_per_row, filters, filter_count
);
}


// Euclidean-distance compression entry point for the arm64 NEON 64x8 matrix format.
//
// NOTE: not implemented yet. The body contains no statements, so the call returns
// immediately: `image` is not read, `matrix` is not written, and every parameter is
// currently unused. Callers receive no error indication.
void compress_rgb32_to_binary_euclidean_64x8_arm64_NEON(
const uint32_t* image, size_t bytes_per_row,
PackedBinaryMatrix_IB& matrix,
uint32_t expected, double max_euclidean_distance
){
// TODO: implement. Planned body, following the same pattern as the range compressor
// above (presumably blocked on a Compressor_RgbEuclidean_arm64_NEON type -- confirm):
// Compressor_RgbEuclidean_arm64_NEON compressor(expected, max_euclidean_distance);
// compress_rgb32_to_binary(
// image, bytes_per_row,
// static_cast<PackedBinaryMatrix_64x8_arm64_NEON&>(matrix).get(), compressor
// );
}



}
}
#endif
Original file line number Diff line number Diff line change
Expand Up @@ -118,6 +118,44 @@ void compress_rgb32_to_binary(
}


// Recolors pixels of a 32-bit-per-pixel image according to the bits of a binary matrix.
//
// The decision of which pixels to overwrite, and with what, belongs entirely to `filter`:
//  - constructed with `replace_if_zero` == true, it replaces the pixels whose matrix bit
//    is 0 with its `replace_with` color;
//  - constructed with `replace_if_zero` == false, it replaces the pixels whose matrix bit
//    is 1 with `replace_with`.
//
// `BinaryMatrixType` is a PackedBinaryMatrixCore<BinaryMatrixTileType>, where
// BinaryMatrixTileType is the tile implementation of a given SIMD architecture.
// This routine only needs `width()`, `word64_height()` and `word64(column, row)` from it.
//
// `Filter` must provide two const-callable overloads:
//    filter64(uint64_t bits, uint32_t* pixels)                 - full 64-pixel word
//    filter64(uint64_t bits, uint32_t* pixels, size_t count)   - trailing partial word
//
// `bytes_per_row` is the distance in bytes between the starts of consecutive image rows.
template <typename BinaryMatrixType, typename Filter>
void filter_rgb32(
    const BinaryMatrixType& matrix,
    uint32_t* image, size_t bytes_per_row,
    const Filter& filter
){
    // Split the row width into whole 64-bit words plus a possible partial tail.
    const size_t total_bits = matrix.width();
    const size_t full_words = total_bits / 64;
    const size_t tail_bits = total_bits % 64;

    // Number of image rows to visit, as reported by the matrix.
    const size_t row_count = matrix.word64_height();

    for (size_t row = 0; row < row_count; row++){
        uint32_t* pixels = image;

        // Each full word covers 64 consecutive pixels of the current row.
        for (size_t word = 0; word < full_words; word++){
            filter.filter64(matrix.word64(word, row), pixels);
            pixels += 64;
        }

        // The remaining (< 64) pixels use the length-limited overload so that
        // nothing beyond the row's width is touched.
        if (tail_bits != 0){
            filter.filter64(matrix.word64(full_words, row), pixels, tail_bits);
        }

        // Rows are addressed by byte stride, which may differ from width * sizeof(uint32_t).
        image = reinterpret_cast<uint32_t*>(reinterpret_cast<char*>(image) + bytes_per_row);
    }
}


}
Expand Down
Loading

0 comments on commit 572623a

Please sign in to comment.