Skip to content

Commit 129d22a

Browse files
author
Gin
committed
fix NEON waterfill
1 parent af10ce2 commit 129d22a

File tree

4 files changed

+81
-21
lines changed

4 files changed

+81
-21
lines changed

SerialPrograms/Source/Kernels/BinaryMatrix/Kernels_BinaryMatrix.cpp

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -31,10 +31,9 @@ BinaryMatrixType get_BinaryMatrixType(){
3131
return BinaryMatrixType::i64x8_x64_SSE42;
3232
}
3333
#elif PA_ARCH_arm64
34-
// TODO: enable this once binary matrix is ready!
35-
// if (CPU_CAPABILITY_CURRENT.OK_M1){
36-
// return BinaryMatrixType::arm64x8_x64_NEON;
37-
// }
34+
if (CPU_CAPABILITY_CURRENT.OK_M1){
35+
return BinaryMatrixType::arm64x8_x64_NEON;
36+
}
3837
#endif
3938

4039
// return BinaryMatrixType::i64x8_Default;

SerialPrograms/Source/Kernels/Waterfill/Kernels_Waterfill_Core_64x8_arm64_NEON.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66

77
#ifdef PA_AutoDispatch_arm64_20_M1
88

9-
#define USE_CPP_TEMPLATE_IMPL
9+
// #define USE_CPP_TEMPLATE_IMPL
1010

1111
#include "Kernels_Waterfill_Routines.h"
1212
#include "Kernels_Waterfill_Core_64x8_arm64_NEON.h"
@@ -24,15 +24,15 @@ namespace Waterfill{
2424
using Waterfill_64x8_Default = Waterfill_64xH_Default<BinaryTile_64x8_arm64_NEON>;
2525

2626
std::vector<WaterfillObject> find_objects_inplace_64x8_arm64_NEON(PackedBinaryMatrix_IB& matrix, size_t min_area){
27-
return find_objects_inplace<BinaryTile_64x8_arm64_NEON, Waterfill_64x8_arm64_NEON>(
27+
return find_objects_inplace<BinaryTile_64x8_arm64_NEON, Waterfill_64x8_Default>(
2828
static_cast<PackedBinaryMatrix_64x8_arm64_NEON&>(matrix).get(),
2929
min_area
3030
);
3131
}
3232
std::unique_ptr<WaterfillSession> make_WaterfillSession_64x8_arm64_NEON(PackedBinaryMatrix_IB* matrix){
3333
return matrix == nullptr
34-
? std::make_unique<WaterfillSession_t<BinaryTile_64x8_arm64_NEON, Waterfill_64x8_arm64_NEON>>()
35-
: std::make_unique<WaterfillSession_t<BinaryTile_64x8_arm64_NEON, Waterfill_64x8_arm64_NEON>>(
34+
? std::make_unique<WaterfillSession_t<BinaryTile_64x8_arm64_NEON, Waterfill_64x8_Default>>()
35+
: std::make_unique<WaterfillSession_t<BinaryTile_64x8_arm64_NEON, Waterfill_64x8_Default>>(
3636
static_cast<PackedBinaryMatrix_64x8_arm64_NEON*>(matrix)->get()
3737
);
3838
}

SerialPrograms/Source/Kernels/Waterfill/Kernels_Waterfill_Core_64x8_arm64_NEON.h

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -55,14 +55,14 @@ PA_FORCE_INLINE uint64x2_t bit_reverse(uint64x2_t x){
5555

5656

5757

58-
struct Waterfill_64x8_x64_SSE42_ProcessedMask{
58+
struct Waterfill_64x8_arm64_NEON_ProcessedMask{
5959
uint64x2_t m0, m1, m2, m3; // Copy of logical OR of the mask `m` and recorded tile `x` bits
6060
uint64x2_t b0, b1, b2, b3; // Bit-reversed copy of m0, m1, m2, m3.
6161
uint64x2_t t0, t1, t2, t3; // Transposed masks.
6262
uint64x2_t f1, f2, f3; // Forward-carry mask.
6363
uint64x2_t r0, r1, r2; // Reverse-carry mask.
6464

65-
PA_FORCE_INLINE Waterfill_64x8_x64_SSE42_ProcessedMask(
65+
PA_FORCE_INLINE Waterfill_64x8_arm64_NEON_ProcessedMask(
6666
const BinaryTile_64x8_arm64_NEON& m,
6767
uint64x2_t x0, uint64x2_t x1, uint64x2_t x2, uint64x2_t x3
6868
){
@@ -104,7 +104,7 @@ struct Waterfill_64x8_x64_SSE42_ProcessedMask{
104104

105105

106106
PA_FORCE_INLINE bool keep_going(
107-
const Waterfill_64x8_x64_SSE42_ProcessedMask& mask,
107+
const Waterfill_64x8_arm64_NEON_ProcessedMask& mask,
108108
uint64x2_t& m0, uint64x2_t& m1, uint64x2_t& m2, uint64x2_t& m3,
109109
uint64x2_t& x0, uint64x2_t& x1, uint64x2_t& x2, uint64x2_t& x3
110110
){
@@ -145,7 +145,7 @@ PA_FORCE_INLINE bool keep_going(
145145

146146

147147
PA_FORCE_INLINE void expand_forward(
148-
const Waterfill_64x8_x64_SSE42_ProcessedMask& mask,
148+
const Waterfill_64x8_arm64_NEON_ProcessedMask& mask,
149149
uint64x2_t& x0, uint64x2_t& x1, uint64x2_t& x2, uint64x2_t& x3
150150
){
151151
uint64x2_t s0 = vaddq_u64(x0, mask.m0);
@@ -170,7 +170,7 @@ PA_FORCE_INLINE void expand_reverse(uint64x2_t m, uint64x2_t b, uint64x2_t& x){
170170
x = vorrq_u64(x, s);
171171
}
172172
PA_FORCE_INLINE void expand_reverse(
173-
const Waterfill_64x8_x64_SSE42_ProcessedMask& mask,
173+
const Waterfill_64x8_arm64_NEON_ProcessedMask& mask,
174174
uint64x2_t& x0, uint64x2_t& x1, uint64x2_t& x2, uint64x2_t& x3
175175
){
176176
expand_reverse(mask.m0, mask.b0, x0);
@@ -179,7 +179,7 @@ PA_FORCE_INLINE void expand_reverse(
179179
expand_reverse(mask.m3, mask.b3, x3);
180180
}
181181
PA_FORCE_INLINE void expand_vertical(
182-
const Waterfill_64x8_x64_SSE42_ProcessedMask& mask,
182+
const Waterfill_64x8_arm64_NEON_ProcessedMask& mask,
183183
uint64x2_t& x0, uint64x2_t& x1, uint64x2_t& x2, uint64x2_t& x3
184184
){
185185
// Carry across adjacent rows.
@@ -435,7 +435,7 @@ static PA_FORCE_INLINE void waterfill_expand(BinaryTile_64x8_arm64_NEON& m, Bina
435435
uint64x2_t x2 = x.vec.val[2];
436436
uint64x2_t x3 = x.vec.val[3];
437437

438-
Waterfill_64x8_x64_SSE42_ProcessedMask mask(m, x0, x1, x2, x3);
438+
Waterfill_64x8_arm64_NEON_ProcessedMask mask(m, x0, x1, x2, x3);
439439
expand_forward(mask, x0, x1, x2, x3);
440440

441441
uint64x2_t m0, m1, m2, m3;

SerialPrograms/Source/Tests/Kernels_Tests.cpp

Lines changed: 67 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -17,11 +17,18 @@
1717
#ifdef PA_AutoDispatch_arm64_20_M1
1818
#include "Kernels/BinaryMatrix/Kernels_BinaryMatrixTile_64x8_arm64_NEON.h"
1919
#include "Kernels/PartialWordAccess/Kernels_PartialWordAccess_arm64_NEON.h"
20+
#include "Kernels/Waterfill/Kernels_Waterfill_Core_64x8_arm64_NEON.h"
2021
#endif
22+
#include "Kernels/BinaryMatrix/Kernels_BinaryMatrix_Arch_64xH_Default.h"
23+
#include "Kernels/BinaryMatrix/Kernels_BinaryMatrixTile_64x4_Default.h"
2124
#include "Kernels/BinaryMatrix/Kernels_BinaryMatrixTile_64xH_Default.h"
2225
#include "Kernels/BinaryImageFilters/Kernels_BinaryImage_BasicFilters.h"
2326
#include "Kernels/ImageFilters/Kernels_ImageFilter_Basic.h"
2427
#include "Kernels/ImageScaleBrightness/Kernels_ImageScaleBrightness.h"
28+
#include "Kernels/Waterfill/Kernels_Waterfill.h"
29+
#include "Kernels/Waterfill/Kernels_Waterfill_Session.h"
30+
#include "Kernels/Waterfill/Kernels_Waterfill_Core_64xH_Default.h"
31+
#include "Kernels/Waterfill/Kernels_Waterfill_Routines.h"
2532
#include "Kernels_Tests.h"
2633
#include "TestUtils.h"
2734

@@ -593,20 +600,74 @@ int test_kernels_CompressRGB32ToBinaryEuclidean(const ImageViewRGB32& image){
593600

594601

595602
int test_kernels_Waterfill(const ImageViewRGB32& image){
603+
const size_t width = image.width();
604+
const size_t height = image.height();
605+
cout << "Testing test_kernels_Waterfill(), image size " << width << " x " << height << endl;
596606

597-
ImagePixelBox box(0, 0, image.width(), image.height());
598-
ImageViewRGB32 sub_image = extract_box_reference(image, box);
599-
600-
PackedBinaryMatrix matrix(sub_image.width(), sub_image.height());
607+
PackedBinaryMatrix matrix(width, height);
601608
uint32_t mins = combine_rgb(0, 0, 0);
602609
// uint32_t maxs = combine_rgb(255, 255, 255);
603610
uint32_t maxs = combine_rgb(63, 63, 63);
604611
Kernels::compress_rgb32_to_binary_range(
605-
sub_image.data(), sub_image.bytes_per_row(),
612+
image.data(), image.bytes_per_row(),
606613
matrix, mins, maxs
607614
);
608615

609-
cout << matrix.dump() << flush;
616+
PackedBinaryMatrix source_matrix = matrix.copy();
617+
618+
PackedBinaryMatrix gt_matrix = matrix.copy();
619+
Kernels::PackedBinaryMatrix_IB& gt_matrix_ib = gt_matrix;
620+
621+
size_t min_area = 10;
622+
std::vector<Kernels::Waterfill::WaterfillObject> gt_objects;
623+
bool gt_computed = false;
624+
625+
#ifdef PA_AutoDispatch_arm64_20_M1
626+
if (CPU_CAPABILITY_CURRENT.OK_M1){
627+
using Waterfill_64x8_Default = Kernels::Waterfill::Waterfill_64xH_Default<Kernels::BinaryTile_64x8_arm64_NEON>;
628+
gt_objects = Kernels::Waterfill::find_objects_inplace<Kernels::BinaryTile_64x8_arm64_NEON, Waterfill_64x8_Default>(
629+
static_cast<Kernels::PackedBinaryMatrix_64x8_arm64_NEON&>(gt_matrix_ib).get(),
630+
min_area
631+
);
632+
gt_computed = true;
633+
}
634+
#endif
635+
if (gt_computed == false){
636+
using Waterfill_64x4_Default = Kernels::Waterfill::Waterfill_64xH_Default<Kernels::BinaryTile_64x4_Default>;
637+
gt_objects = Kernels::Waterfill::find_objects_inplace<Kernels::BinaryTile_64x4_Default, Waterfill_64x4_Default>(
638+
static_cast<Kernels::PackedBinaryMatrix_64x4_Default&>(gt_matrix_ib).get(),
639+
min_area
640+
);
641+
}
642+
cout << "num objects: " << gt_objects.size() << endl;
643+
644+
auto time_start = current_time();
645+
std::vector<Kernels::Waterfill::WaterfillObject> objects = Kernels::Waterfill::find_objects_inplace(matrix, min_area);
646+
auto time_end = current_time();
647+
auto ns = std::chrono::duration_cast<std::chrono::nanoseconds>(time_end - time_start).count();
648+
auto ms = ns / 1000000.;
649+
cout << "One waterfill time: " << ms << " ms" << endl;
650+
651+
for(size_t i = 0; i < objects.size(); ++i){
652+
TEST_RESULT_COMPONENT_EQUAL(objects[i].area, gt_objects[i].area, "object " + std::to_string(i) + " area");
653+
TEST_RESULT_COMPONENT_EQUAL(objects[i].min_x, gt_objects[i].min_x, "object " + std::to_string(i) + " min_x");
654+
TEST_RESULT_COMPONENT_EQUAL(objects[i].min_y, gt_objects[i].min_y, "object " + std::to_string(i) + " min_y");
655+
TEST_RESULT_COMPONENT_EQUAL(objects[i].max_x, gt_objects[i].max_x, "object " + std::to_string(i) + " max_x");
656+
TEST_RESULT_COMPONENT_EQUAL(objects[i].max_y, gt_objects[i].max_y, "object " + std::to_string(i) + " max_y");
657+
}
658+
659+
// Repeat the waterfill enough times to fill roughly three seconds, then report the average time per run:
660+
const size_t num_iters = size_t(3000 / ms);
661+
time_start = current_time();
662+
for(size_t i = 0; i < num_iters; i++){
663+
matrix = source_matrix.copy();
664+
objects = Kernels::Waterfill::find_objects_inplace(matrix, min_area);
665+
}
666+
time_end = current_time();
667+
ms = (double)std::chrono::duration_cast<Milliseconds>(time_end - time_start).count();
668+
cout << "Running " << num_iters << " iters, avg filter time: " << ms / num_iters << " ms" << endl;
669+
670+
610671

611672
return 0;
612673
}

0 commit comments

Comments
 (0)