Skip to content

Commit

Permalink
Apply optimizations to CPU Kernel reductions to improve performance o…
Browse files Browse the repository at this point in the history
…n various toolchains. (#93)

* Optimize CPU Kernel Reductions for Various compilers.
  • Loading branch information
zpzim authored Feb 14, 2022
1 parent 3e7d3b8 commit 4db3e25
Showing 1 changed file with 61 additions and 2 deletions.
63 changes: 61 additions & 2 deletions src/core/cpu_kernels.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -93,18 +93,43 @@ FORCE_INLINE inline void update_mp(PROFILE_DATA_TYPE *mp, double corr,
}

template <typename EIGEN_TYPE, SCAMPProfileType type>
FORCE_INLINE inline void reduce_row(EIGEN_TYPE &corr, int &index,
double thresh) { // NOLINT
FORCE_INLINE inline void reduce_row_fast(EIGEN_TYPE &corr, int &index,
double thresh) { // NOLINT
switch (type) {
case PROFILE_TYPE_1NN_INDEX: {
#if defined(_WIN32) || defined(__INTEL_COMPILER) || \
defined(__INTEL_LLVM_COMPILER)
// cl, clang-cl, and intel compilers work better using Eigen's reduction.
corr[0] = corr.maxCoeff(&index);
#else
// gcc and clang work better using a manual reduction loop.
Eigen::Array<int, unrollWid, 1> corrIdx;
for (int i = 0; i < unrollWid / 2; i++) {
corrIdx[i] = corr[i] >= corr[i + unrollWid / 2] ? i : i + unrollWid / 2;
corr[i] = corr[i] >= corr[i + unrollWid / 2] ? corr[i]
: corr[i + unrollWid / 2];
}
auto horizontal_reduction = [&corr, &corrIdx](int offset) {
for (int i = 0; i < offset; i++) {
corrIdx[i] =
corr[i] >= corr[i + offset] ? corrIdx[i] : corrIdx[i + offset];
corr[i] = corr[i] >= corr[i + offset] ? corr[i] : corr[i + offset];
}
};
for (int i = unrollWid / 4; i > 0; i /= 2) {
horizontal_reduction(i);
}
index = corrIdx[0];
#endif
break;
}
case PROFILE_TYPE_1NN: {
// Eigen's reduction is faster in all cases here.
corr[0] = corr.maxCoeff();
break;
}
case PROFILE_TYPE_SUM_THRESH: {
// TODO(zpzim): Provide a faster reduction implementation.
corr[0] = (corr > thresh).select(corr, 0).sum();
break;
}
Expand All @@ -113,6 +138,40 @@ FORCE_INLINE inline void reduce_row(EIGEN_TYPE &corr, int &index,
}
}

// These reductions are slightly slower than what we can do with loops.
// If Eigen performance improves over time, these might end up faster.
template <typename EIGEN_TYPE, SCAMPProfileType type>
FORCE_INLINE inline void reduce_row_eigen(EIGEN_TYPE &corr, int &index,
double thresh) { // NOLINT
switch (type) {
case PROFILE_TYPE_1NN_INDEX: {
corr[0] = corr.maxCoeff(&index);
break;
}
case PROFILE_TYPE_1NN: {
corr[0] = corr.maxCoeff();
break;
}
case PROFILE_TYPE_SUM_THRESH: {
corr[0] = (corr > thresh).select(corr, 0).sum();
break;
}
default:
break;
}
}

template <typename EIGEN_TYPE, SCAMPProfileType type>
FORCE_INLINE inline void reduce_row(EIGEN_TYPE &corr, int &index,
double thresh) { // NOLINT

if constexpr (EIGEN_TYPE::RowsAtCompileTime != Eigen::Dynamic) {
reduce_row_fast<EIGEN_TYPE, type>(corr, index, thresh);
} else {
reduce_row_eigen<EIGEN_TYPE, type>(corr, index, thresh);
}
}

template <typename DIST_TYPE, typename PROFILE_DATA_TYPE, typename EIGEN_TYPE,
SCAMPProfileType PROFILE_TYPE>
FORCE_INLINE inline void update_columnwise(
Expand Down

1 comment on commit 4db3e25

@github-actions
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Performance Alert ⚠️

Possible performance regression was detected for benchmark.
Benchmark result of this commit is worse than the previous benchmark result exceeding threshold 1.50.

Benchmark suite Current: 4db3e25 Previous: 3e7d3b8 Ratio
BM_1NN_SELF_JOIN/1/131072 11166307877.999998 ns/iter 5590754157.000049 ns/iter 2.00

This comment was automatically generated by workflow using github-action-benchmark.

CC: @zpzim

Please sign in to comment.