Dramatically improve performance of CPU kernels on less sophisticated…

… compilers (#78) * Force inline CPU kernel methods * Add warnings when compiling cpu kernels and inlining fails * Optimize CPU kernels to improve the probability that loop vectorization occurs * Only check for nan values in the CPU kernels if necessary. * Clang will now output whether or not loops were vectorized during compilation.
zpzim · Jan 19, 2022 · 0f8c54a · 0f8c54a
1 parent 84d53a9
commit 0f8c54a
Show file tree

Hide file tree

Showing 10 changed files with 129 additions and 42 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -126,6 +126,7 @@ CHECK_CXX_COMPILER_FLAG("-O3" COMPILER_OPT_O3_SUPPORTED)
 CHECK_CXX_COMPILER_FLAG("-funroll-loops" COMPILER_OPT_UNROLL_LOOPS_SUPPORTED)
 CHECK_CXX_COMPILER_FLAG("-ffp-contract=fast" COMPILER_OPT_FPCONTRACT_FAST_SUPPORTED)
 CHECK_CXX_COMPILER_FLAG("-Wall" COMPILER_OPT_WARN_ALL_SUPPORTED)
+CHECK_CXX_COMPILER_FLAG("-Wno-sign-compare" COMPILER_OPT_NO_WARN_SIGN_COMPARE_SUPPORTED)
 
 if (COMPILER_OPT_PIC_SUPPORTED)
   set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fPIC")
@@ -153,6 +154,11 @@ if (COMPILER_OPT_WARN_ALL_SUPPORTED)
   set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wall")
 endif()
 
+if (COMPILER_OPT_NO_WARN_SIGN_COMPARE_SUPPORTED)
+  set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Wno-sign-compare")
+  set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wno-sign-compare")
+endif()
+
 CHECK_CXX_COMPILER_FLAG("-fsanitize=address" COMPILER_OPT_SANITIZE_ADDRESS_SUPPORTED)
 CHECK_CXX_COMPILER_FLAG("-fno-omit-frame-pointer" COMPILER_OPT_NO_OMIT_FP_SUPPORTED)
 

diff --git a/src/common/common.cpp b/src/common/common.cpp
@@ -179,6 +179,7 @@ std::string getSCAMPErrorString(SCAMPError_t err) {
     case SCAMP_DIM_INCOMPATIBLE:
       return "SCAMP_DIM_INCOMPATIBLE";
   }
+  return "SCAMP_UNKNOWN_ERROR";
 }
 
 size_t GetProfileTypeSize(SCAMPProfileType t) {
@@ -196,8 +197,7 @@ size_t GetProfileTypeSize(SCAMPProfileType t) {
     case PROFILE_TYPE_FREQUENCY_THRESH:
     case PROFILE_TYPE_INVALID:
     default:
-      throw SCAMPException(
-          "Error: Could not determine size of profile elements");
+      throw SCAMPException("Error: Profile Type Unknown");
   }
 }
 
@@ -222,6 +222,7 @@ std::string GetProfileTypeString(SCAMPProfileType t) {
     case PROFILE_TYPE_MATRIX_SUMMARY:
       return "PROFILE_TYPE_MATRIX_SUMMARY";
   }
+  return "PROFILE_TYPE_UNKNOWN";
 }
 
 std::string GetPrecisionTypeString(SCAMPPrecisionType t) {
@@ -237,6 +238,7 @@ std::string GetPrecisionTypeString(SCAMPPrecisionType t) {
     case PRECISION_ULTRA:
       return "PRECISION_ULTRA";
   }
+  return "PRECISION_UNKNOWN";
 }
 
 }  // namespace SCAMP

diff --git a/src/common/common.h b/src/common/common.h
@@ -185,28 +185,32 @@ struct OpInfo {
 // Struct containing the precomputed statistics for an input time series
 struct PrecomputedInfo {
  private:
-  std::vector<double> _norms;
-  std::vector<double> _df;
-  std::vector<double> _dg;
-  std::vector<double> _means;
+  std::vector<double> norms_;
+  std::vector<double> df_;
+  std::vector<double> dg_;
+  std::vector<double> means_;
+  std::vector<int> nan_idxs_;
 
  public:
   void set(std::vector<double> &means, std::vector<double> &norms,
-           std::vector<double> &df, std::vector<double> &dg) {
-    _norms = std::move(norms);
-    _means = std::move(means);
-    _df = std::move(df);
-    _dg = std::move(dg);
+           std::vector<double> &df, std::vector<double> &dg,
+           std::vector<int> &nan_idxs) {
+    norms_ = std::move(norms);
+    means_ = std::move(means);
+    df_ = std::move(df);
+    dg_ = std::move(dg);
+    nan_idxs_ = std::move(nan_idxs);
   }
 
-  const std::vector<double> &dg() const { return _dg; }
-  const std::vector<double> &df() const { return _df; }
-  const std::vector<double> &norms() const { return _norms; }
-  const std::vector<double> &means() const { return _means; }
-  std::vector<double> &mutable_dg() { return _dg; }
-  std::vector<double> &mutable_df() { return _df; }
-  std::vector<double> &mutable_norms() { return _norms; }
-  std::vector<double> &mutable_means() { return _means; }
+  const std::vector<double> &dg() const { return dg_; }
+  const std::vector<double> &df() const { return df_; }
+  const std::vector<double> &norms() const { return norms_; }
+  const std::vector<double> &means() const { return means_; }
+  const std::vector<int> &nan_idxs() const { return nan_idxs_; }
+  std::vector<double> &mutable_dg() { return dg_; }
+  std::vector<double> &mutable_df() { return df_; }
+  std::vector<double> &mutable_norms() { return norms_; }
+  std::vector<double> &mutable_means() { return means_; }
 };
 
 struct CombinedStats {

diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
@@ -22,6 +22,34 @@ target_link_libraries(cpu_stats common)
 add_library(cpu_kernels ${CMAKE_CURRENT_SOURCE_DIR}/cpu_kernels.cpp)
 target_link_libraries(cpu_kernels kernel_common common)
 
+CHECK_CXX_COMPILER_FLAG("-Winline" COMPILER_OPT_WARN_INLINE_SUPPORTED)
+CHECK_CXX_COMPILER_FLAG("/Qvec-report:2" COMPILER_OPT_QVEC_REPORT_SUPPORTED)
+CHECK_CXX_COMPILER_FLAG("-fopt-info-vec-all" COMPILER_OPT_GCC_VEC_INFO_SUPPORTED)
+CHECK_CXX_COMPILER_FLAG("-Rpass-analysis=loop-vectorize" COMPILER_OPT_LLVM_VEC_MISSED_INFO_SUPPORTED)
+CHECK_CXX_COMPILER_FLAG("-Rpass=loop-vectorize" COMPILER_OPT_LLVM_VEC_LOOPS_INFO_SUPPORTED)
+
+
+if (COMPILER_OPT_QVEC_REPORT_SUPPORTED)
+  target_compile_options(cpu_kernels PRIVATE "/Qvec-report:2")
+endif()
+
+if (COMPILER_OPT_GCC_VEC_INFO_SUPPORTED)
+  target_compile_options(cpu_kernels PRIVATE "-fopt-info-vec-all")
+endif()
+
+if (COMPILER_OPT_LLVM_VEC_MISSED_INFO_SUPPORTED)
+  target_compile_options(cpu_kernels PRIVATE "-Rpass-analysis=loop-vectorize")
+endif()
+
+if (COMPILER_OPT_LLVM_VEC_LOOPS_INFO_SUPPORTED)
+  target_compile_options(cpu_kernels PRIVATE "-Rpass=loop-vectorize")
+endif()
+
+
+if (COMPILER_OPT_WARN_INLINE_SUPPORTED)
+  target_compile_options(cpu_kernels PRIVATE "-Winline")
+endif()
+
 add_library(tile ${CMAKE_CURRENT_SOURCE_DIR}/tile.cpp)
 
 if (CMAKE_CUDA_COMPILER)

diff --git a/src/core/cpu_kernels.cpp b/src/core/cpu_kernels.cpp
@@ -1,4 +1,5 @@
 #include "cpu_kernels.h"
+#include "defines.h"
 #include "kernel_common.h"
 
 #include <array>
@@ -20,7 +21,7 @@ constexpr int simdByteLen{32};
 // Outputs an 'initial' distance value based on the type of profile being
 // computed
 template <typename DISTANCE_TYPE, SCAMPProfileType type>
-inline DISTANCE_TYPE init_dist() {
+FORCE_INLINE inline DISTANCE_TYPE init_dist() {
   switch (type) {
     case PROFILE_TYPE_KNN:
     case PROFILE_TYPE_APPROX_ALL_NEIGHBORS:
@@ -37,9 +38,9 @@ inline DISTANCE_TYPE init_dist() {
 }
 
 template <SCAMPProfileType PROFILE_TYPE>
-inline void update_mp(double *mp, double corr, int row,
-                      int col,  // NOLINT(misc-unused-parameters)
-                      double thresh) {
+FORCE_INLINE inline void update_mp(double *mp, double corr, int row,
+                                   int col,  // NOLINT(misc-unused-parameters)
+                                   double thresh) {
   if (PROFILE_TYPE == PROFILE_TYPE_SUM_THRESH) {
     mp[col] = corr > thresh ? mp[col] + corr : mp[col];
   } else {
@@ -48,8 +49,9 @@ inline void update_mp(double *mp, double corr, int row,
 }
 
 template <SCAMPProfileType PROFILE_TYPE>
-inline void update_mp(mp_entry *mp, double corr, int row, int col,
-                      double thresh) {  // NOLINT(misc-unused-parameters)
+FORCE_INLINE inline void update_mp(
+    mp_entry *mp, double corr, int row, int col,
+    double thresh) {  // NOLINT(misc-unused-parameters)
   if (PROFILE_TYPE == PROFILE_TYPE_1NN_INDEX) {
     if (corr > mp[col].floats[0]) {
       mp[col].floats[0] = corr;
@@ -61,19 +63,20 @@ inline void update_mp(mp_entry *mp, double corr, int row, int col,
 }
 
 template <SCAMPProfileType PROFILE_TYPE>
-inline void update_mp(float *mp, double corr, int row, int col,
-                      double thresh) {  // NOLINT(misc-unused-parameters)
+FORCE_INLINE inline void update_mp(
+    float *mp, double corr, int row, int col,
+    double thresh) {  // NOLINT(misc-unused-parameters)
   if (PROFILE_TYPE == PROFILE_TYPE_1NN) {
-    mp[col] = mp[col] >= corr ? mp[col] : corr;
+    mp[col] = corr > mp[col] ? corr : mp[col];
   } else {
     ASSERT(false, "No Implementation provided for updating MP in CPU KERNEL");
   }
 }
 
 template <typename DATA_TYPE, SCAMPProfileType type>
-inline void reduce_row(std::array<DATA_TYPE, unrollWid> &corr,
-                       std::array<int, unrollWid / 2> &corrIdx,
-                       double thresh) {  // NOLINT
+FORCE_INLINE inline void reduce_row(std::array<DATA_TYPE, unrollWid> &corr,
+                                    std::array<int, unrollWid / 2> &corrIdx,
+                                    double thresh) {  // NOLINT
   switch (type) {
     case PROFILE_TYPE_1NN_INDEX: {
       for (int i = 0; i < unrollWid / 2; i++) {
@@ -147,13 +150,23 @@ void do_tile(const SCAMPKernelInputArgs<double> &args,
       alignas(simdByteLen) std::array<DIST_TYPE, unrollWid>
           corr;  // NOLINT(cppcoreguidelines-pro-type-member-init,
                  // hicpp-member-init)
+      // MSVC and other less sophisticated compilers cannot autovectorize this
+      // loop unless all accesses appear aligned. We can trick these compilers
+      // into vectorizing this loop by avoiding the complicated indexing
+      // within the loop and rather just change the offset of each input array.
+      double *__restrict cov = args.cov + tile_diag;
+      const double *__restrict normsa = args.normsa + tile_diag + row;
+      const double *__restrict normsb = args.normsb;
       for (int local_diag = 0; local_diag < unrollWid; local_diag++) {
-        int curr_diag = tile_diag + local_diag;
-        int col = curr_diag + row;
-        DIST_TYPE correlation =
-            args.cov[curr_diag] * args.normsa[col] * args.normsb[row];
-        corr[local_diag] =
-            std::isfinite(correlation) ? correlation : initializer;
+        corr[local_diag] = cov[local_diag] * normsa[local_diag] * normsb[row];
+      }
+      if (args.has_nan_input) {
+        // Remove any nan values so that they don't pollute the reduction.
+        // This is expensive on some compilers so only do it if we need to.
+        for (int local_diag = 0; local_diag < unrollWid; local_diag++) {
+          corr[local_diag] =
+              std::isfinite(corr[local_diag]) ? corr[local_diag] : initializer;
+        }
       }
       if (computing_cols) {
         for (int local_diag = 0; local_diag < unrollWid; local_diag++) {
@@ -172,11 +185,15 @@ void do_tile(const SCAMPKernelInputArgs<double> &args,
                                 corrIdx[0] + tile_diag + row, row,
                                 args.opt.threshold);
       }
+      // Same as above, avoid complicated indexing within the loop to get
+      // less sophisticated compilers to vectorize it.
+      const double *__restrict dfa = args.dfa + tile_diag + row;
+      const double *__restrict dga = args.dga + tile_diag + row;
+      double dfb = args.dfb[row];
+      double dgb = args.dgb[row];
       for (int local_diag = 0; local_diag < unrollWid; local_diag++) {
-        int curr_diag = tile_diag + local_diag;
-        int col = curr_diag + row;
-        args.cov[curr_diag] += args.dfa[col] * args.dgb[row];
-        args.cov[curr_diag] += args.dfb[row] * args.dga[col];
+        cov[local_diag] += dfa[local_diag] * dgb;
+        cov[local_diag] += dfb * dga[local_diag];
       }
     }
 

diff --git a/src/core/cpu_stats.cpp b/src/core/cpu_stats.cpp
@@ -128,6 +128,7 @@ void compute_statistics_cpu(const std::vector<double> &T,
   int n = T.size() - m + 1;
   std::vector<double> norms, df(n), dg(n);
   std::vector<double> means;
+  std::vector<int> nan_idxs;
 
   if (high_precision_norms) {
     means = brute_force_moving_mean(T, m);
@@ -141,10 +142,12 @@ void compute_statistics_cpu(const std::vector<double> &T,
     // If the subsequence includes a NaN, we define the norm as NaN
     if (nanvalues[i]) {
       norms[i] = std::nan("NaN");
+      nan_idxs.push_back(i);
       // Check if the sum of differences from the mean is too small and this
       // subsequence should be considered FLAT
     } else if (norms[i] <= FLATNESS_EPSILON) {
       norms[i] = std::nan("NaN");
+      nan_idxs.push_back(i);
     } else {
       // Compute the inverse norm from the sum of squared differences
       norms[i] = static_cast<double>(1.0) / std::sqrt(norms[i]);
@@ -156,7 +159,7 @@ void compute_statistics_cpu(const std::vector<double> &T,
     dg[i] = (T[i + m] - means[i + 1]) + (T[i] - means[i]);
   }
 
-  info->set(means, norms, df, dg);
+  info->set(means, norms, df, dg, nan_idxs);
 }
 
 CombinedStats compute_combined_stats_cpu(const std::vector<double> &A,

diff --git a/src/core/kernel_common.cpp b/src/core/kernel_common.cpp
@@ -35,6 +35,7 @@ SCAMPKernelInputArgs<T>::SCAMPKernelInputArgs(Tile *t, bool transpose,
   cols_per_cell = t->info()->cols_per_cell;
   global_start_col = t->get_tile_col();
   global_start_row = t->get_tile_row();
+  has_nan_input = t->has_nan_input();
 }
 
 template <typename T>

diff --git a/src/core/kernel_common.h b/src/core/kernel_common.h
@@ -33,6 +33,7 @@ struct SCAMPKernelInputArgs {
   int32_t cols_per_cell;
   int64_t global_start_col;
   int64_t global_start_row;
+  bool has_nan_input;
 
   OptionalArgs opt;
   void Print();

diff --git a/src/core/tile.cpp b/src/core/tile.cpp
@@ -157,6 +157,8 @@ T *alloc_mem(size_t count, SCAMPArchitecture arch, int deviceid) {
     case CPU_WORKER:
       return new T[count];  // NOLINT
   }
+  ASSERT(false, "Architecture not defined");
+  return nullptr;
 }
 
 // Deleter for tile memory which can reside on the host or cuda devices
@@ -390,6 +392,26 @@ void Tile::InitStats(const PrecomputedInfo &a, const PrecomputedInfo &b,
   size_t bytes_b =
       (current_tile_height_ - info_->mp_window + 1) * sizeof(double);
 
+  // If this tile contains nan inputs we will need to perform potentially more
+  // expensive computation.
+  has_nan_input_ = false;
+  for (const auto &idx : a.nan_idxs()) {
+    if (idx >= current_tile_col_ &&
+        idx < current_tile_col_ + current_tile_width_) {
+      has_nan_input_ = true;
+      break;
+    }
+  }
+  if (!has_nan_input_) {
+    for (const auto &idx : b.nan_idxs()) {
+      if (idx >= current_tile_row_ &&
+          idx < current_tile_row_ + current_tile_height_) {
+        has_nan_input_ = true;
+        break;
+      }
+    }
+  }
+
   // Initialize the tile's local stats based on global statistics "a" and "b"
   Memcopy(norms_A_.get(), a.norms().data() + current_tile_col_, bytes_a, false);
   Memcopy(norms_B_.get(), b.norms().data() + current_tile_row_, bytes_b, false);

diff --git a/src/core/tile.h b/src/core/tile.h
@@ -42,6 +42,8 @@ class Tile {
   size_t current_tile_height_;
   size_t current_tile_col_;
   size_t current_tile_row_;
+  // True if this tile has nan inputs.
+  bool has_nan_input_;
 
   const OpInfo *info_;
   ExecInfo exec_info_;
@@ -90,6 +92,7 @@ class Tile {
   size_t get_tile_height() const { return current_tile_height_; }
   size_t get_tile_row() const { return current_tile_row_; }
   size_t get_tile_col() const { return current_tile_col_; }
+  bool has_nan_input() const { return has_nan_input_; }
   const OpInfo *info() const { return info_; }
   void *profile_a() { return profile_a_tile_dev_.at(info_->profile_type); };
   void *profile_b() { return profile_b_tile_dev_.at(info_->profile_type); };