Apply optimizations to CPU Kernel reductions to improve performance o…

…n various toolchains. (#93) * Optimize CPU Kernel Reductions for Various compilers.
zpzim · Feb 14, 2022 · 4db3e25 · 4db3e25 · github-actions · Feb 14, 2022
1 parent 3e7d3b8
commit 4db3e25
Showing 1 changed file with 61 additions and 2 deletions.
diff --git a/src/core/cpu_kernels.cpp b/src/core/cpu_kernels.cpp
@@ -93,18 +93,43 @@ FORCE_INLINE inline void update_mp(PROFILE_DATA_TYPE *mp, double corr,
 }
 
 template <typename EIGEN_TYPE, SCAMPProfileType type>
-FORCE_INLINE inline void reduce_row(EIGEN_TYPE &corr, int &index,
-                                    double thresh) {  // NOLINT
+FORCE_INLINE inline void reduce_row_fast(EIGEN_TYPE &corr, int &index,
+                                         double thresh) {  // NOLINT
   switch (type) {
     case PROFILE_TYPE_1NN_INDEX: {
+#if defined(_WIN32) || defined(__INTEL_COMPILER) || \
+    defined(__INTEL_LLVM_COMPILER)
+      // cl, clang-cl, and intel compilers work better using Eigen's reduction.
       corr[0] = corr.maxCoeff(&index);
+#else
+      // gcc and clang work better using a manual reduction loop.
+      Eigen::Array<int, unrollWid, 1> corrIdx;
+      for (int i = 0; i < unrollWid / 2; i++) {
+        corrIdx[i] = corr[i] >= corr[i + unrollWid / 2] ? i : i + unrollWid / 2;
+        corr[i] = corr[i] >= corr[i + unrollWid / 2] ? corr[i]
+                                                     : corr[i + unrollWid / 2];
+      }
+      auto horizontal_reduction = [&corr, &corrIdx](int offset) {
+        for (int i = 0; i < offset; i++) {
+          corrIdx[i] =
+              corr[i] >= corr[i + offset] ? corrIdx[i] : corrIdx[i + offset];
+          corr[i] = corr[i] >= corr[i + offset] ? corr[i] : corr[i + offset];
+        }
+      };
+      for (int i = unrollWid / 4; i > 0; i /= 2) {
+        horizontal_reduction(i);
+      }
+      index = corrIdx[0];
+#endif
       break;
     }
     case PROFILE_TYPE_1NN: {
+      // Eigen's reduction is faster in all cases here.
       corr[0] = corr.maxCoeff();
       break;
     }
     case PROFILE_TYPE_SUM_THRESH: {
+      // TODO(zpzim): Provide a faster reduction implementation.
       corr[0] = (corr > thresh).select(corr, 0).sum();
       break;
     }
@@ -113,6 +138,40 @@ FORCE_INLINE inline void reduce_row(EIGEN_TYPE &corr, int &index,
   }
 }
 
+// These reductions are slightly slower than what we can do with loops.
+// If Eigen performance improves over time, these might end up faster.
+template <typename EIGEN_TYPE, SCAMPProfileType type>
+FORCE_INLINE inline void reduce_row_eigen(EIGEN_TYPE &corr, int &index,
+                                          double thresh) {  // NOLINT
+  switch (type) {
+    case PROFILE_TYPE_1NN_INDEX: {
+      corr[0] = corr.maxCoeff(&index);
+      break;
+    }
+    case PROFILE_TYPE_1NN: {
+      corr[0] = corr.maxCoeff();
+      break;
+    }
+    case PROFILE_TYPE_SUM_THRESH: {
+      corr[0] = (corr > thresh).select(corr, 0).sum();
+      break;
+    }
+    default:
+      break;
+  }
+}
+
+template <typename EIGEN_TYPE, SCAMPProfileType type>
+FORCE_INLINE inline void reduce_row(EIGEN_TYPE &corr, int &index,
+                                    double thresh) {  // NOLINT
+
+  if constexpr (EIGEN_TYPE::RowsAtCompileTime != Eigen::Dynamic) {
+    reduce_row_fast<EIGEN_TYPE, type>(corr, index, thresh);
+  } else {
+    reduce_row_eigen<EIGEN_TYPE, type>(corr, index, thresh);
+  }
+}
+
 template <typename DIST_TYPE, typename PROFILE_DATA_TYPE, typename EIGEN_TYPE,
           SCAMPProfileType PROFILE_TYPE>
 FORCE_INLINE inline void update_columnwise(