From 2ebd9d9ec72cfe9d0f698fd4528a513b19e45306 Mon Sep 17 00:00:00 2001
From: Gin <>
Date: Sun, 3 Dec 2023 12:59:51 -0800
Subject: [PATCH] NEON binary image filter

---
 .../Kernels_BinaryImage_BasicFilters.h        |   4 +-
 ...mage_BasicFilters_Core_64x8_arm64_NEON.cpp |  15 +-
 ...nels_BinaryImage_BasicFilters_arm64_NEON.h | 332 +++++++++---------
 .../Kernels_PackedBinaryMatrixCore.h          |   3 +-
 .../Kernels_PartialWordAccess_arm64_NEON.h    |  12 +-
 SerialPrograms/Source/Tests/Kernels_Tests.cpp | 279 ++++++++++++---
 SerialPrograms/Source/Tests/Kernels_Tests.h   |   4 +
 SerialPrograms/Source/Tests/TestMap.cpp       |   2 +
 8 files changed, 422 insertions(+), 229 deletions(-)
diff --git a/SerialPrograms/Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters.h b/SerialPrograms/Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters.h
index 731068b48..69fe01004 100644
--- a/SerialPrograms/Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters.h
+++ b/SerialPrograms/Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters.h
@@ -81,11 +81,11 @@ void compress_rgb32_to_binary_range(
 
 
 //  Compress (image, bytes_per_row) into a binary_image.
-//  For each pixel, set to 1 if distance is within the expected value.
+//  For each pixel, set to 1 if the Euclidean distance of the pixel color to the expected color <= max distance.
 void compress_rgb32_to_binary_euclidean(
     const uint32_t* image, size_t bytes_per_row,
     PackedBinaryMatrix_IB& matrix,
-    uint32_t expected, double max_euclidean_distance
+    uint32_t expected_color, double max_euclidean_distance
 );
 
 
diff --git a/SerialPrograms/Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_Core_64x8_arm64_NEON.cpp b/SerialPrograms/Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_Core_64x8_arm64_NEON.cpp
index 3554902a8..716259ebe 100644
--- a/SerialPrograms/Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_Core_64x8_arm64_NEON.cpp
+++ b/SerialPrograms/Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_Core_64x8_arm64_NEON.cpp
@@ -17,9 +17,9 @@ namespace Kernels{
 void filter_by_mask_64x8_arm64_NEON(
     const PackedBinaryMatrix_IB& matrix,
     uint32_t* image, size_t bytes_per_row,
-    uint32_t replace_with, bool replace_if_zero
+    uint32_t replacement_color, bool replace_zero_bits
 ){
-    FilterByMask_arm64_NEON filter(replace_with, replace_if_zero);
+    FilterByMask_arm64_NEON filter(replacement_color, replace_zero_bits);
     filter_by_mask(static_cast<const PackedBinaryMatrix_64x8_arm64_NEON&>(matrix).get(), image, bytes_per_row, filter);
 }
 
@@ -49,12 +49,11 @@ void compress_rgb32_to_binary_euclidean_64x8_arm64_NEON(
     PackedBinaryMatrix_IB& matrix,
     uint32_t expected, double max_euclidean_distance
 ){
-    // TODO:
-    // Compressor_RgbEuclidean_arm64_NEON compressor(expected, max_euclidean_distance);
-    // compress_rgb32_to_binary(
-    //     image, bytes_per_row,
-    //     static_cast<PackedBinaryMatrix_64x8_arm64_NEON&>(matrix).get(), compressor
-    // );
+    Compressor_RgbEuclidean_arm64_NEON compressor(expected, max_euclidean_distance);
+    compress_rgb32_to_binary(
+        image, bytes_per_row,
+        static_cast<PackedBinaryMatrix_64x8_arm64_NEON&>(matrix).get(), compressor
+    );
 }
 
 
diff --git a/SerialPrograms/Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_arm64_NEON.h b/SerialPrograms/Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_arm64_NEON.h
index 9dc8418a1..a01a4de2f 100644
--- a/SerialPrograms/Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_arm64_NEON.h
+++ b/SerialPrograms/Source/Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters_arm64_NEON.h
@@ -9,215 +9,144 @@
 
 #include "Kernels/PartialWordAccess/Kernels_PartialWordAccess_arm64_NEON.h"
 
+#include <iostream>
+using std::cout;
+using std::endl;
+
 namespace PokemonAutomation{
 namespace Kernels{
 
 // Change color of an array of pixels based on values from a bitmap that corresponds to the pixel array.
-// If `replace_if_zero` is true, change color of pixels that correspond to 0 bits.
+// If `replace_zero_bits` is true, change color of pixels that correspond to 0 bits.
 // Otherwise, chagne color of pixels that correspond to 1 bits.
 class FilterByMask_arm64_NEON{
 public:
-    FilterByMask_arm64_NEON(uint32_t replacement, bool replace_if_zero)
-        : m_replacement_pixel(replacement)
-        , m_replacement(vdupq_n_u32(replacement))
-        , m_replace_if_zero_bool(replace_if_zero)
-        , m_replace_if_zero(vreinterpretq_u32_u8(vdupq_n_u8(replace_if_zero ? 0xff: 0)))
+    FilterByMask_arm64_NEON(uint32_t replacement, bool replace_zero_bits)
+        : m_replacement_u32(vdupq_n_u32(replacement))
+        , m_replace_if_zero(replace_zero_bits)
         , m_zeros(vreinterpretq_u32_u8(vdupq_n_u8(0)))
-        , m_lasts(vdupq_n_u32(1))
     {}
 
     // Given 64 bits stored in `uint64_t`, use it to set colors to 64 pixels in `pixels`.
-    // If filter constructor parameter `replace_if_zero` is true, the pixels corresponding to
+    // If filter constructor parameter `replace_zero_bits` is true, the pixels corresponding to
     // 0-bits are set to color `replacement` (another filter constructor parameter).
     // Otherwise, pixels corresponding to 1-bits are set to the color.
     PA_FORCE_INLINE void filter64(uint64_t bits, uint32_t* pixels) const{
-        for(int i = 0; i < 64; i+=8){
-            filter8((bits >> i) & 0xFF, pixels + i);
+        for(int i = 0; i < 64; i+=16){
+            filter16((bits >> i) & 0xFFFF, pixels + i);
         }
     }
 
-    // partial version of filter64(bits, pixels)
+    // Partial version of filter64(bits, pixels): instead of setting colors to 64 pixels,
+    // only the first `count` (count <= 64) pixels are set.
     PA_FORCE_INLINE void filter64(uint64_t bits, uint32_t* pixels, size_t count) const{
-        size_t lc = count / 4;
-        while (lc--){
-            uint32x4_t pixel = vld1q_u32(pixels);
-            // Change color in the four pixels according to the lowest four bits in `bits`
-            uint32x4_t filtered_pixel = filter4(bits & 0xF, pixel);
-
-            vst1q_u32(pixels, filtered_pixel);
-            pixels += 4;
-            bits >>= 4;
+        const size_t count_round_4 = (count / 4) * 4;
+        for(size_t i = 0; i < count_round_4; i+=4){
+            filter4((bits >> i) & 0xF, pixels + i);
         }
-        count %= 4;
-        if (count){
-            // Load a partial of 4-pixel long data
-            PartialWordAccess_arm64_NEON loader(count * sizeof(uint32_t));
-            uint32x4_t pixel = vreinterpretq_u32_u8(loader.load(pixels));
-            // Change color in the pixels according to the lowest bits in `bits`
-            pixel = filter4(bits & 0xF, pixel);
-            switch (count){
-                case 1:
-                    pixels[0] = vgetq_lane_u32(pixel, 0);
-                    break;
-                case 2:
-                    pixels[0] = vgetq_lane_u32(pixel, 0);
-                    pixels[1] = vgetq_lane_u32(pixel, 1);
-                    break;
-                default:
-                    pixels[0] = vgetq_lane_u32(pixel, 0);
-                    pixels[1] = vgetq_lane_u32(pixel, 1);
-                    pixels[2] = vgetq_lane_u32(pixel, 2);
-            }
+        // filter4() always loads and stores 4 pixels, so the 1 to 3 leftover pixels go through a local buffer.
+        size_t left = count % 4;
+        if (left){
+            uint32_t buffer[4] = {};    // zeroed so that filter4() never loads uninitialized lanes
+            memcpy(buffer, pixels + count_round_4, sizeof(uint32_t) * left);
+            filter4((bits >> count_round_4) & 0xF, buffer);
+            memcpy(pixels + count_round_4, buffer, sizeof(uint32_t) * left);
         }
     }
 
 private:
+    // Change color in the 4 pixels according to the lowest 4 bits in `bits64`
+    PA_FORCE_INLINE void filter4(uint64_t bits64, uint32_t* pixels) const{
+        // Duplicate 4-bit pattern into four uint16_t places in `bits`
+        bits64 *= 0x0001000100010001;
+        // convert each uint16_t to be one bit from the lowest four bits in input `bits`
+        bits64 &= 0x0008000400020001;
 
-    // Change color in the 8 pixels according to the lowest 8 bits in `bits`
-    PA_FORCE_INLINE void filter8(uint32_t bits64, uint32_t* pixels) const{
-        const uint32_t uint32_max = 0xFFFFFFFF;
-        if (m_replace_if_zero_bool){
-            uint32_t mask0 = (bits64 & 0x1) - 1;
-            uint32_t mask1 = ((bits64 >> 1) & 0x1) - 1;
-            uint32_t mask2 = ((bits64 >> 2) & 0x1) - 1;
-            uint32_t mask3 = ((bits64 >> 3) & 0x1) - 1;
-            uint32_t mask4 = ((bits64 >> 4) & 0x1) - 1;
-            uint32_t mask5 = ((bits64 >> 5) & 0x1) - 1;
-            uint32_t mask6 = ((bits64 >> 6) & 0x1) - 1;
-            uint32_t mask7 = ((bits64 >> 7) & 0x1) - 1;
-
-            pixels[0] = (mask0 & m_replacement_pixel) | (pixels[0] & (uint32_max - mask0));
-            pixels[1] = (mask1 & m_replacement_pixel) | (pixels[1] & (uint32_max - mask1));
-            pixels[2] = (mask2 & m_replacement_pixel) | (pixels[2] & (uint32_max - mask2));
-            pixels[3] = (mask3 & m_replacement_pixel) | (pixels[3] & (uint32_max - mask3));
-            pixels[4] = (mask4 & m_replacement_pixel) | (pixels[4] & (uint32_max - mask4));
-            pixels[5] = (mask5 & m_replacement_pixel) | (pixels[5] & (uint32_max - mask5));
-            pixels[6] = (mask6 & m_replacement_pixel) | (pixels[6] & (uint32_max - mask6));
-            pixels[7] = (mask7 & m_replacement_pixel) | (pixels[7] & (uint32_max - mask7));
-        } else{
-            uint32_t mask0 = (bits64 & 0x1) - 1;
-            uint32_t mask1 = ((bits64 >> 1) & 0x1) - 1;
-            uint32_t mask2 = ((bits64 >> 2) & 0x1) - 1;
-            uint32_t mask3 = ((bits64 >> 3) & 0x1) - 1;
-            uint32_t mask4 = ((bits64 >> 4) & 0x1) - 1;
-            uint32_t mask5 = ((bits64 >> 5) & 0x1) - 1;
-            uint32_t mask6 = ((bits64 >> 6) & 0x1) - 1;
-            uint32_t mask7 = ((bits64 >> 7) & 0x1) - 1;
-
-            pixels[0] = (mask0 & pixels[0]) | (m_replacement_pixel & (uint32_max - mask0));
-            pixels[1] = (mask1 & pixels[1]) | (m_replacement_pixel & (uint32_max - mask1));
-            pixels[2] = (mask2 & pixels[2]) | (m_replacement_pixel & (uint32_max - mask2));
-            pixels[3] = (mask3 & pixels[3]) | (m_replacement_pixel & (uint32_max - mask3));
-            pixels[4] = (mask4 & pixels[4]) | (m_replacement_pixel & (uint32_max - mask4));
-            pixels[5] = (mask5 & pixels[5]) | (m_replacement_pixel & (uint32_max - mask5));
-            pixels[6] = (mask6 & pixels[6]) | (m_replacement_pixel & (uint32_max - mask6));
-            pixels[7] = (mask7 & pixels[7]) | (m_replacement_pixel & (uint32_max - mask7));
-        }
-    }
+        uint32x4_t pixels_u32 = vld1q_u32(pixels);
 
-    // Change color in the 16 pixels according to the lowest 16 bits in `bits`
-    PA_FORCE_INLINE void filter16_NEON(uint32_t bits64, uint32_t* pixels) const{
-        // const uint32_t uint32_max = 0xFFFFFFFF;
-        if (m_replace_if_zero_bool){
-
-            uint32x4x4_t pixel_32x4x4 = vld1q_u32_x4(pixels);
-
-            uint32_t bit_rs_00 = (bits64 >> 0);
-            uint32_t bit_rs_01 = (bits64 >> 1);
-            uint32_t bit_rs_02 = (bits64 >> 2);
-            uint32_t bit_rs_03 = (bits64 >> 3);
-            uint32x4_t mask0 = vtstq_u32(uint32x4_t{bit_rs_00, bit_rs_01, bit_rs_02, bit_rs_03}, m_lasts);
-            uint32_t bit_rs_04 = (bits64 >> 4);
-            uint32_t bit_rs_05 = (bits64 >> 5);
-            uint32_t bit_rs_06 = (bits64 >> 6);
-            uint32_t bit_rs_07 = (bits64 >> 7);
-            uint32x4_t mask1 = vtstq_u32(uint32x4_t{bit_rs_04, bit_rs_05, bit_rs_06, bit_rs_07}, m_lasts);
-            uint32_t bit_rs_08 = (bits64 >> 8);
-            uint32_t bit_rs_09 = (bits64 >> 9);
-            uint32_t bit_rs_10 = (bits64 >> 10);
-            uint32_t bit_rs_11 = (bits64 >> 11);
-            uint32x4_t mask2 = vtstq_u32(uint32x4_t{bit_rs_08, bit_rs_09, bit_rs_10, bit_rs_11}, m_lasts);
-            uint32_t bit_rs_12 = (bits64 >> 12);
-            uint32_t bit_rs_13 = (bits64 >> 13);
-            uint32_t bit_rs_14 = (bits64 >> 14);
-            uint32_t bit_rs_15 = (bits64 >> 15);
-            uint32x4_t mask3 = vtstq_u32(uint32x4_t{bit_rs_12, bit_rs_13, bit_rs_14, bit_rs_15}, m_lasts);
-
-            uint32x4x4_t filtered_pixel_32x4x4;
-            
-            filtered_pixel_32x4x4.val[0] = ((m_replace_if_zero - mask0) & m_replacement) + ((mask0) & pixel_32x4x4.val[0]);
-            filtered_pixel_32x4x4.val[1] = ((m_replace_if_zero - mask1) & m_replacement) + ((mask1) & pixel_32x4x4.val[1]);
-            filtered_pixel_32x4x4.val[2] = ((m_replace_if_zero - mask2) & m_replacement) + ((mask2) & pixel_32x4x4.val[2]);
-            filtered_pixel_32x4x4.val[3] = ((m_replace_if_zero - mask3) & m_replacement) + ((mask3) & pixel_32x4x4.val[3]);
+        // Load `bits` into simd 64-bit vector register
+        uint16x4_t mask_u16 = vcreate_u16(bits64);
+        // Expand mask to cover each pixel (uint32_t)
+        uint32x4_t mask_u32 = vmovl_u16(mask_u16);
+        // Expand mask to be all-1 or all-0 mask for each pixel
+        mask_u32 = vcgtq_u32(mask_u32, m_zeros);
 
+        uint32x4_t out_u32;
+        if (m_replace_if_zero){
+            // bit select intrinsic: 
             // vbslq_u32(a, b, c), for 1 bits in a, choose b; for 0 bits in a, choose c
-            // filtered_pixel_32x4x4.val[0] = vbslq_u32(vsubq_u32(mask0, m_lasts), m_replacement, pixel_32x4x4.val[0]);
-            // filtered_pixel_32x4x4.val[1] = vbslq_u32(vsubq_u32(mask1, m_lasts), m_replacement, pixel_32x4x4.val[1]);
-            // filtered_pixel_32x4x4.val[2] = vbslq_u32(vsubq_u32(mask2, m_lasts), m_replacement, pixel_32x4x4.val[2]);
-            // filtered_pixel_32x4x4.val[3] = vbslq_u32(vsubq_u32(mask3, m_lasts), m_replacement, pixel_32x4x4.val[3]);     
-
-            vst1q_u32_x4(pixels, filtered_pixel_32x4x4);
-        } else{
-            // TODO:
+            out_u32 = vbslq_u32(mask_u32, pixels_u32, m_replacement_u32);
+        } else {
+            out_u32 = vbslq_u32(mask_u32, m_replacement_u32, pixels_u32);
         }
+        vst1q_u32(pixels, out_u32);
     }
 
-    // Change color in the four pixels according to the lowest four bits in `bits`
-    PA_FORCE_INLINE void filter4(uint32_t bits64, uint32_t* pixels) const{
-        // uint32x4_t pixel = vld1q_u32(pixels);
-        if (m_replace_if_zero_bool){
-            uint32_t check0 = (bits64 & 0x1);
-            pixels[0] = (1 - check0) * m_replacement_pixel + pixels[0] * check0;
-            uint32_t check1 = ((bits64 >> 1) & 0x1);
-            pixels[1] = (1 - check1) * m_replacement_pixel + pixels[1] * check1;
-            uint32_t check2 = ((bits64 >> 2) & 0x1);
-            pixels[2] = (1 - check2) * m_replacement_pixel + pixels[2] * check2;
-            uint32_t check3 = ((bits64 >> 3) & 0x1);
-            pixels[3] = (1 - check3) * m_replacement_pixel + pixels[3] * check3;
-        }
-        else{
-            if ((bits64 & 0x1) != 0){
-                pixels[0] = m_replacement_pixel;
-            }
-            if ((bits64 & 0x2) != 0){
-                pixels[1] = m_replacement_pixel;
-            }
-            if ((bits64 & 0x4) != 0){
-                pixels[2] = m_replacement_pixel;
-            }
-            if ((bits64 & 0x8) != 0){
-                pixels[3] = m_replacement_pixel;
-            }
-        }
-    }
+    // Change color in the 16 pixels according to `bits64`; only its lowest 16 bits are expected to be set.
+    PA_FORCE_INLINE void filter16(uint64_t bits64, uint32_t* pixels) const{
+        uint64_t bits_0 = bits64 & 0xF;
+        uint64_t bits_1 = (bits64 >> 4) & 0xF;
+        uint64_t bits_2 = (bits64 >> 8) & 0xF;
+        uint64_t bits_3 = bits64 >> 12;
 
-    // Change color in the four pixels according to the lowest four bits in `bits`
-    PA_FORCE_INLINE uint32x4_t filter4(uint64_t bits64, uint32x4_t pixel) const{
         // Duplicate 4-bit pattern into four uint16_t places in `bits`
-        bits64 *= 0x0001000100010001;
+        bits_0 *= 0x0001000100010001;
         // convert each uint16_t to be one bit from the lowest four bits in input `bits`
-        bits64 &= 0x0008000400020001;
+        bits_0 &= 0x0008000400020001;
+        // Same expansion for the other three groups of 4 bits
+        bits_1 *= 0x0001000100010001;
+        bits_1 &= 0x0008000400020001;
+        bits_2 *= 0x0001000100010001;
+        bits_2 &= 0x0008000400020001;
+        bits_3 *= 0x0001000100010001;
+        bits_3 &= 0x0008000400020001;
+
+        // Load the 16 pixels as four vectors of 4 pixels each
+        uint32x4_t pixels_0_u32 = vld1q_u32(pixels);
+        uint32x4_t pixels_1_u32 = vld1q_u32(pixels + 4);
+        uint32x4_t pixels_2_u32 = vld1q_u32(pixels + 8);
+        uint32x4_t pixels_3_u32 = vld1q_u32(pixels + 12);
 
         // Load `bits` into simd 64-bit vector register
-        uint16x4_t mask_u16x4 = vcreate_u16(bits64);
+        uint16x4_t mask_0_u16x4 = vcreate_u16(bits_0);
+        uint16x4_t mask_1_u16x4 = vcreate_u16(bits_1);
+        uint16x4_t mask_2_u16x4 = vcreate_u16(bits_2);
+        uint16x4_t mask_3_u16x4 = vcreate_u16(bits_3);
         // Expand mask to cover each pixel (uint32_t)
-        uint32x4_t mask = vmovl_u16(mask_u16x4);
+        uint32x4_t mask_0_u32 = vmovl_u16(mask_0_u16x4);
+        uint32x4_t mask_1_u32 = vmovl_u16(mask_1_u16x4);
+        uint32x4_t mask_2_u32 = vmovl_u16(mask_2_u16x4);
+        uint32x4_t mask_3_u32 = vmovl_u16(mask_3_u16x4);
         // Expand mask to be all-1 or all-0 mask for each pixel
-        mask = vcgtq_u32(mask, m_zeros);
-        // If m_replace_if_zero is 0xFF...FF, invert the mask
-        mask = veorq_u32(mask, m_replace_if_zero);
-        // bit select intrinsic: 
-        // vbslq_u32(a, b, c), for 1 bits in a, choose b; for 0 bits in a, choose c
-        return vbslq_u32(mask, m_replacement, pixel);
+        mask_0_u32 = vcgtq_u32(mask_0_u32, m_zeros);
+        mask_1_u32 = vcgtq_u32(mask_1_u32, m_zeros);
+        mask_2_u32 = vcgtq_u32(mask_2_u32, m_zeros);
+        mask_3_u32 = vcgtq_u32(mask_3_u32, m_zeros);
+
+        uint32x4_t out_0_u32, out_1_u32, out_2_u32, out_3_u32;
+        if (m_replace_if_zero){
+            // bit select intrinsic: 
+            // vbslq_u32(a, b, c), for 1 bits in a, choose b; for 0 bits in a, choose c
+            out_0_u32 = vbslq_u32(mask_0_u32, pixels_0_u32, m_replacement_u32);
+            out_1_u32 = vbslq_u32(mask_1_u32, pixels_1_u32, m_replacement_u32);
+            out_2_u32 = vbslq_u32(mask_2_u32, pixels_2_u32, m_replacement_u32);
+            out_3_u32 = vbslq_u32(mask_3_u32, pixels_3_u32, m_replacement_u32);
+        } else {
+            out_0_u32 = vbslq_u32(mask_0_u32, m_replacement_u32, pixels_0_u32);
+            out_1_u32 = vbslq_u32(mask_1_u32, m_replacement_u32, pixels_1_u32);
+            out_2_u32 = vbslq_u32(mask_2_u32, m_replacement_u32, pixels_2_u32);
+            out_3_u32 = vbslq_u32(mask_3_u32, m_replacement_u32, pixels_3_u32);
+        }
+        vst1q_u32(pixels, out_0_u32);
+        vst1q_u32(pixels + 4, out_1_u32);
+        vst1q_u32(pixels + 8, out_2_u32);
+        vst1q_u32(pixels + 12, out_3_u32);
     }
 
 private:
-    const uint32_t m_replacement_pixel;
-    const uint32x4_t m_replacement;
-    const bool m_replace_if_zero_bool;
-    const uint32x4_t m_replace_if_zero;
+    const uint32x4_t m_replacement_u32;
+    const bool m_replace_if_zero;
     const uint32x4_t m_zeros;
-    const uint32x4_t m_lasts;
 };
 
 
@@ -323,6 +252,73 @@ class Compressor_RgbRange_arm64_NEON{
 };
 
 
+// Compress RGB32 pixels into bits: a pixel's bit is 1 if the Euclidean distance between its RGB
+// color and the expected color is <= `max_euclidean_distance`. The alpha channel is ignored.
+class Compressor_RgbEuclidean_arm64_NEON{
+public:
+    Compressor_RgbEuclidean_arm64_NEON(uint32_t expected_color, double max_euclidean_distance)
+        : m_expected_color_rgb_u8(vreinterpretq_u8_u32(vdupq_n_u32(expected_color & 0x00ffffff)))
+        , m_distance_squared_u32(vdupq_n_u32((uint32_t)(max_euclidean_distance * max_euclidean_distance)))
+    {}
+
+    // Convert 64 pixels into 64 bits. Bit i of the result corresponds to pixels[i].
+    PA_FORCE_INLINE uint64_t convert64(const uint32_t* pixels) const{
+        uint64_t bits = 0;
+        for(size_t c = 0; c < 64; c += 4){
+            bits |= convert4(pixels + c) << c;
+        }
+        return bits;
+    }
+    // Partial version of convert64(pixels): only the first `count` (count <= 64) pixels are converted.
+    PA_FORCE_INLINE uint64_t convert64(const uint32_t* pixels, size_t count) const{
+        uint64_t bits = 0;
+        const size_t count_round_4 = (count / 4) * 4;
+        for(size_t c = 0; c < count_round_4; c += 4){
+            bits |= convert4(pixels + c) << c;
+        }
+        // Only the 1 to 3 pixels left after the full groups of four go through the partial loader.
+        const size_t left = count % 4;
+        if (left){
+            PartialWordAccess_arm64_NEON loader(left * sizeof(uint32_t));
+            const uint32x4_t pixel = vreinterpretq_u32_u8(loader.load(pixels + count_round_4));
+            const uint64_t mask = ((uint64_t)1 << left) - 1;
+            bits |= (convert4(pixel) & mask) << count_round_4;
+        }
+        return bits;
+    }
+
+private:
+    // Convert the four pixels stored at `pixel` into four bits.
+    PA_FORCE_INLINE uint64_t convert4(const uint32_t* pixel) const{
+        return convert4(vld1q_u32(pixel));
+    }
+    // Convert four pixels into the lowest four bits of the return value. Bit i corresponds to lane i.
+    PA_FORCE_INLINE uint64_t convert4(const uint32x4_t& in_u32) const{
+        // Per-channel absolute difference to the expected values
+        uint32x4_t in_dif_u32 = vreinterpretq_u32_u8(vabdq_u8(vreinterpretq_u8_u32(in_u32), m_expected_color_rgb_u8));
+        // Get green channel
+        uint32x4_t in_g_u32 = vandq_u32(in_dif_u32, vdupq_n_u32(0x0000ff00));
+        // Move green channel to the lower end of the 16-bit regions
+        uint16x8_t in_g_u16 = vshrq_n_u16(vreinterpretq_u16_u32(in_g_u32), 8);
+        // in_rb_u16 contains the red and blue channels. Each channel occupies a 16-bit region
+        uint16x8_t in_rb_u16 = vandq_u16(vreinterpretq_u16_u32(in_dif_u32), vdupq_n_u16(0x00ff));
+        // Square operation
+        uint16x8_t in_g2_u16 = vmulq_u16(in_g_u16, in_g_u16);
+        uint16x8_t in_r2b2_u16 = vmulq_u16(in_rb_u16, in_rb_u16);
+        // Use pairwise addition and accumulate to add r2, g2, and b2 together
+        uint32x4_t sum_sqr_u32 = vpadalq_u16(vreinterpretq_u32_u16(in_g2_u16), in_r2b2_u16);
+
+        // cmp_u32: each lane is all 1 bits if its pixel is within range (sum_sqr <= max_distance_squared), otherwise all 0 bits
+        uint32x4_t cmp_u32 = vcleq_u32(sum_sqr_u32, m_distance_squared_u32);
+        return (cmp_u32[0] & 0x01) | (cmp_u32[1] & 0x02) | (cmp_u32[2] & 0x04) | (cmp_u32[3] & 0x08);
+    }
+
+private:
+    uint8x16_t m_expected_color_rgb_u8;
+    uint32x4_t m_distance_squared_u32;
+};
+
+
 }
 }
 #endif
diff --git a/SerialPrograms/Source/Kernels/BinaryMatrix/Kernels_PackedBinaryMatrixCore.h b/SerialPrograms/Source/Kernels/BinaryMatrix/Kernels_PackedBinaryMatrixCore.h
index da32485b2..ec93953dd 100644
--- a/SerialPrograms/Source/Kernels/BinaryMatrix/Kernels_PackedBinaryMatrixCore.h
+++ b/SerialPrograms/Source/Kernels/BinaryMatrix/Kernels_PackedBinaryMatrixCore.h
@@ -111,8 +111,9 @@ class PackedBinaryMatrixCore{
     //  Word Access. How many words in a column. One word is 8 bytes (aka 64 bits).
     size_t word64_height() const{ return m_logical_height; }
 
-    // Get (x-th, y-th) word. One word is 16 bytes (aka 64 bits).
+    // Get (x-th, y-th) word. One word is 8 bytes (aka 64 bits), one row in a tile.
     uint64_t word64(size_t x, size_t y) const;
+    // Get (x-th, y-th) word. One word is 8 bytes (aka 64 bits), one row in a tile.
     uint64_t& word64(size_t x, size_t y);
 
 private:
diff --git a/SerialPrograms/Source/Kernels/PartialWordAccess/Kernels_PartialWordAccess_arm64_NEON.h b/SerialPrograms/Source/Kernels/PartialWordAccess/Kernels_PartialWordAccess_arm64_NEON.h
index 9c426a878..4af5d0401 100644
--- a/SerialPrograms/Source/Kernels/PartialWordAccess/Kernels_PartialWordAccess_arm64_NEON.h
+++ b/SerialPrograms/Source/Kernels/PartialWordAccess/Kernels_PartialWordAccess_arm64_NEON.h
@@ -33,11 +33,11 @@ class PartialWordAccess_arm64_NEON{
         // If `bytes` is 3, then `m_back_mask` is from low bytes to high bytes: [0xFF, 0xFF, 0xFF, ... 0xFF, 0, 0, 0]
         m_back_mask = vcgtq_u8(vdupq_n_u8((uint8_t)m_shift), seq_u8x16);
 
-        // If `bytes` is 3, then `m_shift_front` is from low bytes to high bytes: [3, 4, 5, 6, 7, ..., 18]
-        m_shift_front = vaddq_u8(vdupq_n_u8(uint8_t(bytes)), seq_u8x16);
+        // If `bytes` is 3, then `m_shift_front` is from low bytes to high bytes: [13, 14, 15, 16, 17, ..., 28]
+        m_shift_front = vaddq_u8(vdupq_n_u8(uint8_t(m_shift)), seq_u8x16);
 
-        // IF `bytes` is 3, then `m_shift_back` is from low bytes to hight bytes: [253, 254, 255, 0, 1, 2, 3]
-        m_shift_back = vsubq_u8(seq_u8x16, vdupq_n_u8((uint8_t)bytes));
+        // If `bytes` is 3, then `m_shift_back` is from low bytes to high bytes: [243, 244, 245, ..., 0, 1, 2]
+        m_shift_back = vsubq_u8(seq_u8x16, vdupq_n_u8((uint8_t)m_shift));
     }
 
     // load() function that does not read past end of buffer
@@ -50,8 +50,8 @@ class PartialWordAccess_arm64_NEON{
         // for each uint8 in the result, ret_u8[i], get the index from `m_shift_front`: m_shift_front[i]
         // use the value of m_shift_front[i] as an index to get a value in x:
         // ret_u8[i] = x[m_shift_front[i]]
-        // since `m_shift_front` stores [`bytes`, `bytes+1`, `bytes+2`, ...]
-        // the resulting operation is to shift the bytes in x to the lower bytes by `bytes` bytes.
+        // since `m_shift_front` stores [`16-bytes`, `16-bytes+1`, `16-bytes+2`, ...]
+        // the resulting operation is to shift the bytes in x to the lower bytes by `16-bytes` bytes.
         // For the index values >= 16 in m_shift_front[i], `vqtbl1q_u8()` returns 0.
         return vqtbl1q_u8(x, m_shift_front);
     }
diff --git a/SerialPrograms/Source/Tests/Kernels_Tests.cpp b/SerialPrograms/Source/Tests/Kernels_Tests.cpp
index 335db779d..ac8d8d8ea 100644
--- a/SerialPrograms/Source/Tests/Kernels_Tests.cpp
+++ b/SerialPrograms/Source/Tests/Kernels_Tests.cpp
@@ -16,6 +16,7 @@
 #include "Kernels/BinaryMatrix/Kernels_BinaryMatrix.h"
 #ifdef PA_AutoDispatch_arm64_20_M1
     #include "Kernels/BinaryMatrix/Kernels_BinaryMatrixTile_64x8_arm64_NEON.h"
+    #include "Kernels/PartialWordAccess/Kernels_PartialWordAccess_arm64_NEON.h"
 #endif
 #include "Kernels/BinaryMatrix/Kernels_BinaryMatrixTile_64xH_Default.h"
 #include "Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters.h"
@@ -71,19 +72,16 @@ int test_kernels_BinaryMatrix(const ImageViewRGB32& image){
         return 1;
     }
 
-    const size_t width = image.width();
-    const size_t height = image.height();
+    const size_t width = image.width(), height = image.height();
 
-    const Color min_color(0, 0, 0);
-    const Color max_color(63, 63, 63);
-    const uint32_t mins = uint32_t(min_color);
-    const uint32_t maxs = uint32_t(max_color);
+    const Color min_color(0, 0, 0), max_color(63, 63, 63);
+    const uint32_t mins = uint32_t(min_color), maxs = uint32_t(max_color);
 
-    auto matrix_default = make_PackedBinaryMatrix(get_BinaryMatrixType(), width, height);
+    auto binary_matrix = make_PackedBinaryMatrix(get_BinaryMatrixType(), width, height);
 
     auto time_start = current_time();
     compress_rgb32_to_binary_range(
-        image.data(), image.bytes_per_row(), *matrix_default, mins, maxs
+        image.data(), image.bytes_per_row(), *binary_matrix, mins, maxs
     );
     auto time_end = current_time();
     auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(time_end - time_start).count();
@@ -91,19 +89,19 @@ int test_kernels_BinaryMatrix(const ImageViewRGB32& image){
     cout << "One binary matrix creation. time: " << ms << " ms" << endl;
     
     size_t error_count = 0;
-    for (size_t r = 0; r < height; r++){
-        for (size_t c = 0; c < width; c++){
-            const Color color(image.pixel(c, r));
+    for (size_t y = 0; y < height; y++){
+        for (size_t x = 0; x < width; x++){
+            const Color color(image.pixel(x, y));
             bool in_range = (min_color.alpha() <= color.alpha() && color.alpha() <= max_color.alpha());
             in_range = in_range && (min_color.red() <= color.red() && color.red() <= max_color.red());
             in_range = in_range && (min_color.green() <= color.green() && color.green() <= max_color.green());
             in_range = in_range && (min_color.blue() <= color.blue() && color.blue() <= max_color.blue());
 
-            const bool v_default = matrix_default->get(c, r);
+            const bool v_default = binary_matrix->get(x, y);
             
             if (error_count < 10){
                 if (v_default != in_range){
-                    cout << "Error: matrix (" << c << ", " << r << ") got "
+                    cout << "Error: matrix (" << x << ", " << y << ") got "
                         << v_default << " but GT is " << in_range << endl;
                     ++error_count;
                 }
@@ -120,21 +118,20 @@ int test_kernels_BinaryMatrix(const ImageViewRGB32& image){
     time_start = current_time();
     for(size_t i = 0; i < num_iters; i++){
         compress_rgb32_to_binary_range(
-            image.data(), image.bytes_per_row(), *matrix_default, mins, maxs
+            image.data(), image.bytes_per_row(), *binary_matrix, mins, maxs
         );
     }
     time_end = current_time();
     ms = std::chrono::duration_cast<Milliseconds>(time_end - time_start).count();
     cout << "Running " << num_iters << " iters, average creation impl. time: " << ms / (double)num_iters << " ms" << endl;
 
-    // cout << matrix_default->dump() << flush;
+    // cout << binary_matrix->dump() << flush;
 
     return 0;
 }
 
 int test_kernels_FilterRGB32Range(const ImageViewRGB32& image){
-    const size_t width = image.width();
-    const size_t height = image.height();
+    const size_t width = image.width(), height = image.height();
     cout << "Testing filter_rgb32_range(), image size " << width << " x " << height << endl;
 
     Color min_color(0, 0, 0);
@@ -169,11 +166,11 @@ int test_kernels_FilterRGB32Range(const ImageViewRGB32& image){
 
     size_t actual_num_pixels_in_range = 0;
     size_t error_count = 0;
-    for (size_t r = 0; r < height; r++){
-        for (size_t c = 0; c < width; c++){
-            const Color color(image.pixel(c, r));
-            const Color new_color(image_out.pixel(c, r));
-            const Color new_color_2(image_out_2.pixel(c, r));
+    for (size_t y = 0; y < height; y++){
+        for (size_t x = 0; x < width; x++){
+            const Color color(image.pixel(x, y));
+            const Color new_color(image_out.pixel(x, y));
+            const Color new_color_2(image_out_2.pixel(x, y));
             bool in_range = (min_color.alpha() <= color.alpha() && color.alpha() <= max_color.alpha());
             in_range = in_range && (min_color.red() <= color.red() && color.red() <= max_color.red());
             in_range = in_range && (min_color.green() <= color.green() && color.green() <= max_color.green());
@@ -183,23 +180,23 @@ int test_kernels_FilterRGB32Range(const ImageViewRGB32& image){
                 // Print first 10 errors:
                 if (in_range && new_color != COLOR_WHITE){
                     cout << "Error: wrong filter result: old color " << color.to_string() << ", (x,y) = "
-                        << c << ", " << r << ", should be in range but not found by the function" << endl;
+                        << x << ", " << y << ", should be in range but not found by the function" << endl;
                     ++error_count;
                 }
                 else if (in_range == false && new_color != color){
                     cout << "Error: wrong filter result: old color " << color.to_string() << ", (x,y) = "
-                        << c << ", " << r << ", should not be changed by the function" << endl;
+                        << x << ", " << y << ", should not be changed by the function" << endl;
                     ++error_count;
                 }
 
                 if (in_range && new_color_2 != color){
                     cout << "Error: wrong inverse filter result: old color " << color.to_string() << ", (x,y) = "
-                        << c << ", " << r << ", should not be changed by the function" << endl;
+                        << x << ", " << y << ", should not be changed by the function" << endl;
                     ++error_count;
                 }
                 else if (in_range == false && new_color_2 != COLOR_WHITE){
                     cout << "Error: wrong inverse filter result: old color " << color.to_string() << ", (x,y) = "
-                        << c << ", " << r << ", should not be in range but not found by the function" << endl;
+                        << x << ", " << y << ", should not be in range but not found by the function" << endl;
                     ++error_count;
                 }
             }
@@ -269,11 +266,11 @@ int test_kernels_FilterRGB32Euclidean(const ImageViewRGB32& image){
 
     size_t actual_num_pixels_in_range = 0;
     size_t error_count = 0;
-    for (size_t r = 0; r < height; r++){
-        for (size_t c = 0; c < width; c++){
-            const Color color(image.pixel(c, r));
-            const Color new_color(image_out.pixel(c, r));
-            const Color new_color_2(image_out_2.pixel(c, r));
+    for (size_t y = 0; y < height; y++){
+        for (size_t x = 0; x < width; x++){
+            const Color color(image.pixel(x, y));
+            const Color new_color(image_out.pixel(x, y));
+            const Color new_color_2(image_out_2.pixel(x, y));
             int red_dif = (color.red() - middle_color.red());
             int green_dif = (color.green() - middle_color.green());
             int blue_dif = (color.blue() - middle_color.blue());
@@ -284,14 +281,14 @@ int test_kernels_FilterRGB32Euclidean(const ImageViewRGB32& image){
                 // Print first 10 errors:
                 if (in_range && new_color != COLOR_WHITE){
                     cout << "Error: wrong filter result: old color " << color.to_string()
-                         << ", (x,y) = (" << c << ", " << r << ")"
+                         << ", (x,y) = (" << x << ", " << y << ")"
                          << ", dist2 " << color_dist2 << ", max dist2 " << max_dist2
                          << ", should be in range but not found by the function" << endl;
                     ++error_count;
                 }
                 else if (in_range == false && new_color != color){
                     cout << "Error: wrong filter result: old color " << color.to_string()
-                         << ", (x,y) = (" << c << ", " << r << ")"
+                         << ", (x,y) = (" << x << ", " << y << ")"
                          << ", dist2 " << color_dist2 << ", max dist2 " << max_dist2
                          << ", should not be changed by the function" << endl;
                     ++error_count;
@@ -299,14 +296,14 @@ int test_kernels_FilterRGB32Euclidean(const ImageViewRGB32& image){
 
                 if (in_range && new_color_2 != color){
                     cout << "Error: wrong inverse filter result: old color " << color.to_string()
-                         << ", (x,y) = (" << c << ", " << r << ")"
+                         << ", (x,y) = (" << x << ", " << y << ")"
                          << ", dist2 " << color_dist2 << ", max dist2 " << max_dist2
                          << ", should not be changed by the function" << endl;
                     ++error_count;
                 }
                 else if (in_range == false && new_color_2 != COLOR_WHITE){
                     cout << "Error: wrong inverse filter result: old color " << color.to_string()
-                         << ", (x,y) = (" << c << ", " << r << ")"
+                         << ", (x,y) = (" << x << ", " << y << ")"
                          << ", dist2 " << color_dist2 << ", max dist2 " << max_dist2
                          << ", should not be in range but not found by the function" << endl;
                     ++error_count;
@@ -378,11 +375,11 @@ int test_kernels_ToBlackWhiteRGB32Range(const ImageViewRGB32& image){
 
     size_t actual_num_pixels_in_range = 0;
     size_t error_count = 0;
-    for (size_t r = 0; r < height; r++){
-        for (size_t c = 0; c < width; c++){
-            const Color color(image.pixel(c, r));
-            const Color new_color(image_out.pixel(c, r));
-            const Color new_color_2(image_out_2.pixel(c, r));
+    for (size_t y = 0; y < height; y++){
+        for (size_t x = 0; x < width; x++){
+            const Color color(image.pixel(x, y));
+            const Color new_color(image_out.pixel(x, y));
+            const Color new_color_2(image_out_2.pixel(x, y));
             bool in_range = (min_color.alpha() <= color.alpha() && color.alpha() <= max_color.alpha());
             in_range = in_range && (min_color.red() <= color.red() && color.red() <= max_color.red());
             in_range = in_range && (min_color.green() <= color.green() && color.green() <= max_color.green());
@@ -392,23 +389,23 @@ int test_kernels_ToBlackWhiteRGB32Range(const ImageViewRGB32& image){
                 // Print first 10 errors:
                 if (in_range && new_color != COLOR_BLACK){
                     cout << "Error: wrong filter result: old color " << color.to_string() << ", (x,y) = "
-                        << c << ", " << r << ", should be black due to in range but not so" << endl;
+                        << x << ", " << y << ", should be black due to in range but not so" << endl;
                     ++error_count;
                 }
                 else if (in_range == false && new_color != COLOR_WHITE){
                     cout << "Error: wrong filter result: old color " << color.to_string() << ", (x,y) = "
-                        << c << ", " << r << ", should be white due to out of range but not so" << endl;
+                        << x << ", " << y << ", should be white due to out of range but not so" << endl;
                     ++error_count;
                 }
 
                 if (in_range && new_color_2 != COLOR_WHITE){
                     cout << "Error: wrong inverse filter result: old color " << color.to_string() << ", (x,y) = "
-                        << c << ", " << r << ", should be white due to in range but not so" << endl;
+                        << x << ", " << y << ", should be white due to in range but not so" << endl;
                     ++error_count;
                 }
                 else if (in_range == false && new_color_2 != COLOR_BLACK){
                     cout << "Error: wrong inverse filter result: old color " << color.to_string() << ", (x,y) = "
-                        << c << ", " << r << ", should be black due to out of range but not so" << endl;
+                        << x << ", " << y << ", should be black due to out of range but not so" << endl;
                     ++error_count;
                 }
             }
@@ -440,6 +437,159 @@ int test_kernels_ToBlackWhiteRGB32Range(const ImageViewRGB32& image){
     return 0;
 }
 
+// Checks filter_by_mask() against a per-pixel reference for both replace modes, then benchmarks it.
+// Returns 0 when no mismatch is found, 1 otherwise.
+int test_kernels_FilterByMask(const ImageViewRGB32& image){
+    const size_t width = image.width(), height = image.height();
+    cout << "Image width " << width << " height " << height << endl;
+
+    const Color min_color(0, 0, 0), max_color(63, 63, 63);
+    const uint32_t mins = uint32_t(min_color), maxs = uint32_t(max_color);
+
+    auto binary_matrix = make_PackedBinaryMatrix(get_BinaryMatrixType(), width, height);
+    compress_rgb32_to_binary_range(image.data(), image.bytes_per_row(), *binary_matrix, mins, maxs);
+    ImageRGB32 new_image = image.copy();
+    ImageRGB32 new_image_2 = image.copy();
+
+    Color replacement_color = COLOR_WHITE;
+    bool replace_zero_bits = true;
+    // Time a single call that replaces the pixels whose mask bit is 0.
+    auto time_start = current_time();
+    filter_by_mask(*binary_matrix, new_image.data(), new_image.bytes_per_row(), uint32_t(replacement_color), replace_zero_bits);
+    auto time_end = current_time();
+    auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(time_end - time_start).count();
+    auto ms = ns / 1000000.;
+    cout << "One Filter by mask call. time: " << ms << " ms" << endl;
+    // Same filter on the second copy, this time replacing the one bits.
+    filter_by_mask(*binary_matrix, new_image_2.data(), new_image_2.bytes_per_row(), uint32_t(replacement_color), !replace_zero_bits);
+
+    size_t error_count = 0;
+
+    for (size_t y = 0; y < height; y++){
+        for (size_t x = 0; x < width; x++){
+            const Color color(image.pixel(x, y));
+            const Color new_color(new_image.pixel(x, y));
+            const Color new_color_2(new_image_2.pixel(x, y));
+            bool in_range = (min_color.alpha() <= color.alpha() && color.alpha() <= max_color.alpha());
+            in_range = in_range && (min_color.red() <= color.red() && color.red() <= max_color.red());
+            in_range = in_range && (min_color.green() <= color.green() && color.green() <= max_color.green());
+            in_range = in_range && (min_color.blue() <= color.blue() && color.blue() <= max_color.blue());
+            // Stop comparing once 10 errors were reported, like the other kernel tests do.
+            if (error_count < 10){
+                if (in_range && new_color != color){
+                    cout << "Error: wrong filter(replace_zero_bits) result: old color " << color.to_string() << ", (x,y) = "
+                            << x << ", " << y << ", should not be changed due to being one bit but not so" << endl;
+                    ++error_count;
+                }
+                else if (!in_range && new_color != replacement_color){
+                    cout << "Error: wrong filter(replace_zero_bits) result: old color " << color.to_string() << ", (x,y) = "
+                            << x << ", " << y << ", should be changed due to being zero bit but not so" << endl;
+                    ++error_count;
+                }
+
+                if (in_range && new_color_2 != replacement_color){
+                    cout << "Error: wrong filter(replace_one_bits) result: old color " << color.to_string() << ", (x,y) = "
+                            << x << ", " << y << ", should be changed due to being one bit but not so" << endl;
+                    ++error_count;
+                }
+                else if (!in_range && new_color_2 != color){
+                    cout << "Error: wrong filter(replace_one_bits) result: old color " << color.to_string() << ", (x,y) = "
+                            << x << ", " << y << ", should not be changed due to being zero bit but not so" << endl;
+                    ++error_count;
+                }
+            }
+        }
+    }
+
+    if (error_count){
+        return 1;
+    }
+
+    // We try to wait for three seconds (at least one iteration; guards ms == 0 and ms > 3000):
+    const size_t num_iters = (ms > 0 && ms < 3000) ? size_t(3000 / ms) : 1;
+    time_start = current_time();
+    for(size_t i = 0; i < num_iters; i++){
+        filter_by_mask(*binary_matrix, new_image.data(), new_image.bytes_per_row(), uint32_t(replacement_color), replace_zero_bits);
+    }
+    time_end = current_time();
+    ms = (double)std::chrono::duration_cast<Milliseconds>(time_end - time_start).count();
+    cout << "Running " << num_iters << " iters, avg filter time: " << ms / num_iters << " ms" << endl;
+
+    return 0;
+}
+
+// Checks compress_rgb32_to_binary_euclidean() against a per-pixel reference, then benchmarks it.
+// Returns 0 when no mismatch is found, 1 otherwise.
+int test_kernels_CompressRGB32ToBinaryEuclidean(const ImageViewRGB32& image){
+    const size_t width = image.width(), height = image.height();
+    cout << "Testing test_kernels_CompressRGB32ToBinaryEuclidean(), image size " << width << " x " << height << endl;
+
+    Color middle_color = Color(image.pixel(width/2, height/2));
+    cout << "Expected color: " << middle_color.to_string() << endl;
+
+    double max_dist = 50.0;
+    size_t max_dist2 = size_t(max_dist * max_dist);
+
+    PackedBinaryMatrix matrix(image.width(), image.height());
+
+    auto time_start = current_time();
+    Kernels::compress_rgb32_to_binary_euclidean(
+        image.data(), image.bytes_per_row(), matrix, uint32_t(middle_color), max_dist
+    );
+    auto time_end = current_time();
+    auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(time_end - time_start).count();
+    auto ms = ns / 1000000.;
+    cout << "One filter time: " << ms << " ms" << endl;
+
+    // Reference check: squared distance over red, green and blue only (alpha is not compared).
+    size_t error_count = 0;
+    for (size_t y = 0; y < height; y++){
+        for (size_t x = 0; x < width; x++){
+            const Color color(image.pixel(x, y));
+            int red_dif = (color.red() - middle_color.red());
+            int green_dif = (color.green() - middle_color.green());
+            int blue_dif = (color.blue() - middle_color.blue());
+            size_t color_dist2 = red_dif * red_dif + green_dif * green_dif + blue_dif * blue_dif;
+            bool in_range = color_dist2 <= max_dist2;
+            if (error_count < 10){
+                // Print first 10 errors:
+                if (in_range && matrix.get(x, y) == false){
+                    cout << "Error: wrong filter result: old color " << color.to_string()
+                         << ", (x,y) = (" << x << ", " << y << ")"
+                         << ", dist2 " << color_dist2 << ", max dist2 " << max_dist2
+                         << ", should be in range but not set on matrix" << endl;
+                    ++error_count;
+                }
+                else if (in_range == false && matrix.get(x, y) == true){
+                    cout << "Error: wrong filter result: old color " << color.to_string()
+                         << ", (x,y) = (" << x << ", " << y << ")"
+                         << ", dist2 " << color_dist2 << ", max dist2 " << max_dist2
+                         << ", should not be in range but set on matrix" << endl;
+                    ++error_count;
+                }
+            }
+        }
+    }
+    if (error_count){
+        return 1;
+    }
+
+    // We try to wait for three seconds (at least one iteration; guards ms == 0 and ms > 3000):
+    const size_t num_iters = (ms > 0 && ms < 3000) ? size_t(3000 / ms) : 1;
+    time_start = current_time();
+    for(size_t i = 0; i < num_iters; i++){
+        Kernels::compress_rgb32_to_binary_euclidean(
+            image.data(), image.bytes_per_row(), matrix, uint32_t(middle_color), max_dist
+        );
+    }
+    time_end = current_time();
+    ms = (double)std::chrono::duration_cast<Milliseconds>(time_end - time_start).count();
+    cout << "Running " << num_iters << " iters, avg filter time: " << ms / num_iters << " ms" << endl;
+
+    return 0;
+}
+
+
 
 
 int test_kernels_Waterfill(const ImageViewRGB32& image){
@@ -724,6 +874,47 @@ int test_binary_matrix_tile(){
     if (test_binary_matrix_tile_t<BinaryTile_64x8_arm64_NEON>() != 0){
         return 1;
     }
+
+    // For every partial length 0..16, check that the partial loads and the partial store agree on the first num_bytes bytes.
+    for(size_t num_bytes = 0; num_bytes <= 16; ++num_bytes){
+        PartialWordAccess_arm64_NEON partial(num_bytes);
+        uint8_t buffer[48];
+        for(int i = 0; i < 48; i++){
+            buffer[i] = 99;
+        }
+        for(uint8_t i = 0; i < 16; i++){
+            buffer[i+16] = i;
+        }
+        // Payload is bytes 16..31 (values 0..15); the 99-filled bytes on both sides are padding.
+        // uint8x16_t x = partial.load(buffer+16);
+        uint8x16_t x = partial.load_int_no_read_past_end(buffer+16);
+        for(size_t i = 0; i < num_bytes; ++i){
+            if (x[i] != buffer[i+16]){
+                cout << "Error: PartialWordAccess_arm64_NEON(" << num_bytes << ")::load_int_no_read_past_end(), i = " << i << " is " << int(x[i])
+                    << ", but should be " << int(buffer[i+16]) << endl;
+                return 1;
+            }
+        }
+        x = partial.load_int_no_read_before_ptr(buffer+16);
+        for(size_t i = 0; i < num_bytes; ++i){
+            if (x[i] != buffer[i+16]){
+                cout << "Error: PartialWordAccess_arm64_NEON(" << num_bytes << ")::load_int_no_read_before_ptr(), i = " << i << " is " << int(x[i])
+                    << ", but should be " << int(buffer[i+16]) << endl;
+                return 1;
+            }
+        }
+        for(int i = 0; i < 48; i++){
+            buffer[i] = 99;
+        }
+        partial.store_int_no_past_end(buffer+16, x);
+        for(size_t i = 0; i < num_bytes; ++i){
+            if (x[i] != buffer[i+16]){
+                cout << "Error: PartialWordAccess_arm64_NEON(" << num_bytes << ")::store_int_no_past_end(), i = " << i << " is " << int(buffer[i+16])
+                    << ", but should be " << int(x[i]) << endl;  // x has only 16 lanes: the expected value is lane i
+                return 1;
+            }
+        }
+    }
 #endif
     return 0;
 }
diff --git a/SerialPrograms/Source/Tests/Kernels_Tests.h b/SerialPrograms/Source/Tests/Kernels_Tests.h
index 6a2138dfe..d09fd9a57 100644
--- a/SerialPrograms/Source/Tests/Kernels_Tests.h
+++ b/SerialPrograms/Source/Tests/Kernels_Tests.h
@@ -25,6 +25,10 @@ int test_kernels_FilterRGB32Euclidean(const ImageViewRGB32& image);
 
 int test_kernels_ToBlackWhiteRGB32Range(const ImageViewRGB32& image);
 
+int test_kernels_FilterByMask(const ImageViewRGB32& image);
+
+int test_kernels_CompressRGB32ToBinaryEuclidean(const ImageViewRGB32& image);
+
 int test_kernels_Waterfill(const ImageViewRGB32& image);
 
 
diff --git a/SerialPrograms/Source/Tests/TestMap.cpp b/SerialPrograms/Source/Tests/TestMap.cpp
index 3ecc221f9..ce0426e9d 100644
--- a/SerialPrograms/Source/Tests/TestMap.cpp
+++ b/SerialPrograms/Source/Tests/TestMap.cpp
@@ -224,6 +224,8 @@ const std::map<std::string, TestFunction> TEST_MAP = {
     {"Kernels_FilterRGB32Range", std::bind(image_void_detector_helper, test_kernels_FilterRGB32Range, _1)},
     {"Kernels_FilterRGB32Euclidean", std::bind(image_void_detector_helper, test_kernels_FilterRGB32Euclidean, _1)},
     {"Kernels_ToBlackWhiteRGB32Range", std::bind(image_void_detector_helper, test_kernels_ToBlackWhiteRGB32Range, _1)},
+    {"Kernels_FilterByMask", std::bind(image_void_detector_helper, test_kernels_FilterByMask, _1)},
+    {"Kernels_CompressRGB32ToBinaryEuclidean", std::bind(image_void_detector_helper, test_kernels_CompressRGB32ToBinaryEuclidean, _1)},
     {"Kernels_Waterfill", std::bind(image_void_detector_helper, test_kernels_Waterfill, _1)},
     {"CommonFramework_BlackBorderDetector", std::bind(image_bool_detector_helper, test_CommonFramework_BlackBorderDetector, _1)},
     {"NintendoSwitch_UpdateMenuDetector", std::bind(image_bool_detector_helper, test_NintendoSwitch_UpdateMenuDetector, _1)},