Skip to content

Commit

Permalink
Dramatically improve performance of CPU kernels on less sophisticated…
Browse files Browse the repository at this point in the history
… compilers (#78)

* Force inline CPU kernel methods

* Add warnings when compiling cpu kernels and inlining fails

* Optimize CPU kernels to improve the probability that loop vectorization occurs

* Only check for nan values in the CPU kernels if necessary.

* Clang will now output whether or not loops were vectorized during compilation.
  • Loading branch information
zpzim authored Jan 19, 2022
1 parent 84d53a9 commit 0f8c54a
Show file tree
Hide file tree
Showing 10 changed files with 129 additions and 42 deletions.
6 changes: 6 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,7 @@ CHECK_CXX_COMPILER_FLAG("-O3" COMPILER_OPT_O3_SUPPORTED)
CHECK_CXX_COMPILER_FLAG("-funroll-loops" COMPILER_OPT_UNROLL_LOOPS_SUPPORTED)
CHECK_CXX_COMPILER_FLAG("-ffp-contract=fast" COMPILER_OPT_FPCONTRACT_FAST_SUPPORTED)
CHECK_CXX_COMPILER_FLAG("-Wall" COMPILER_OPT_WARN_ALL_SUPPORTED)
CHECK_CXX_COMPILER_FLAG("-Wno-sign-compare" COMPILER_OPT_NO_WARN_SIGN_COMPARE_SUPPORTED)

if (COMPILER_OPT_PIC_SUPPORTED)
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fPIC")
Expand Down Expand Up @@ -153,6 +154,11 @@ if (COMPILER_OPT_WARN_ALL_SUPPORTED)
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wall")
endif()

# Append -Wno-sign-compare to both the Release and Debug C++ flags, but only
# when the CHECK_CXX_COMPILER_FLAG probe for that flag succeeded.
if (COMPILER_OPT_NO_WARN_SIGN_COMPARE_SUPPORTED)
set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -Wno-sign-compare")
set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -Wno-sign-compare")
endif()

CHECK_CXX_COMPILER_FLAG("-fsanitize=address" COMPILER_OPT_SANITIZE_ADDRESS_SUPPORTED)
CHECK_CXX_COMPILER_FLAG("-fno-omit-frame-pointer" COMPILER_OPT_NO_OMIT_FP_SUPPORTED)

Expand Down
6 changes: 4 additions & 2 deletions src/common/common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ std::string getSCAMPErrorString(SCAMPError_t err) {
case SCAMP_DIM_INCOMPATIBLE:
return "SCAMP_DIM_INCOMPATIBLE";
}
return "SCAMP_UNKNOWN_ERROR";
}

size_t GetProfileTypeSize(SCAMPProfileType t) {
Expand All @@ -196,8 +197,7 @@ size_t GetProfileTypeSize(SCAMPProfileType t) {
case PROFILE_TYPE_FREQUENCY_THRESH:
case PROFILE_TYPE_INVALID:
default:
throw SCAMPException(
"Error: Could not determine size of profile elements");
throw SCAMPException("Error: Profile Type Unknown");
}
}

Expand All @@ -222,6 +222,7 @@ std::string GetProfileTypeString(SCAMPProfileType t) {
case PROFILE_TYPE_MATRIX_SUMMARY:
return "PROFILE_TYPE_MATRIX_SUMMARY";
}
return "PROFILE_TYPE_UNKNOWN";
}

std::string GetPrecisionTypeString(SCAMPPrecisionType t) {
Expand All @@ -237,6 +238,7 @@ std::string GetPrecisionTypeString(SCAMPPrecisionType t) {
case PRECISION_ULTRA:
return "PRECISION_ULTRA";
}
return "PRECISION_UNKNOWN";
}

} // namespace SCAMP
Expand Down
38 changes: 21 additions & 17 deletions src/common/common.h
Original file line number Diff line number Diff line change
Expand Up @@ -185,28 +185,32 @@ struct OpInfo {
// Struct containing the precomputed statistics for an input time series
struct PrecomputedInfo {
private:
std::vector<double> _norms;
std::vector<double> _df;
std::vector<double> _dg;
std::vector<double> _means;
std::vector<double> norms_;
std::vector<double> df_;
std::vector<double> dg_;
std::vector<double> means_;
std::vector<int> nan_idxs_;

public:
void set(std::vector<double> &means, std::vector<double> &norms,
std::vector<double> &df, std::vector<double> &dg) {
_norms = std::move(norms);
_means = std::move(means);
_df = std::move(df);
_dg = std::move(dg);
std::vector<double> &df, std::vector<double> &dg,
std::vector<int> &nan_idxs) {
norms_ = std::move(norms);
means_ = std::move(means);
df_ = std::move(df);
dg_ = std::move(dg);
nan_idxs_ = std::move(nan_idxs);
}

const std::vector<double> &dg() const { return _dg; }
const std::vector<double> &df() const { return _df; }
const std::vector<double> &norms() const { return _norms; }
const std::vector<double> &means() const { return _means; }
std::vector<double> &mutable_dg() { return _dg; }
std::vector<double> &mutable_df() { return _df; }
std::vector<double> &mutable_norms() { return _norms; }
std::vector<double> &mutable_means() { return _means; }
const std::vector<double> &dg() const { return dg_; }
const std::vector<double> &df() const { return df_; }
const std::vector<double> &norms() const { return norms_; }
const std::vector<double> &means() const { return means_; }
const std::vector<int> &nan_idxs() const { return nan_idxs_; }
std::vector<double> &mutable_dg() { return dg_; }
std::vector<double> &mutable_df() { return df_; }
std::vector<double> &mutable_norms() { return norms_; }
std::vector<double> &mutable_means() { return means_; }
};

struct CombinedStats {
Expand Down
28 changes: 28 additions & 0 deletions src/core/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,34 @@ target_link_libraries(cpu_stats common)
add_library(cpu_kernels ${CMAKE_CURRENT_SOURCE_DIR}/cpu_kernels.cpp)
target_link_libraries(cpu_kernels kernel_common common)

# Probe the active C++ compiler for diagnostic flags that report on inlining
# and loop vectorization. Each probe stores its result in its own variable, and
# a flag is only added below when its probe succeeded.
CHECK_CXX_COMPILER_FLAG("-Winline" COMPILER_OPT_WARN_INLINE_SUPPORTED)
CHECK_CXX_COMPILER_FLAG("/Qvec-report:2" COMPILER_OPT_QVEC_REPORT_SUPPORTED)
CHECK_CXX_COMPILER_FLAG("-fopt-info-vec-all" COMPILER_OPT_GCC_VEC_INFO_SUPPORTED)
CHECK_CXX_COMPILER_FLAG("-Rpass-analysis=loop-vectorize" COMPILER_OPT_LLVM_VEC_MISSED_INFO_SUPPORTED)
CHECK_CXX_COMPILER_FLAG("-Rpass=loop-vectorize" COMPILER_OPT_LLVM_VEC_LOOPS_INFO_SUPPORTED)


# All options below are PRIVATE to the cpu_kernels target, so they affect only
# the compilation of that library's own sources.

# Vectorizer report flag in MSVC-style "/" syntax (presumably for MSVC -- confirm).
if (COMPILER_OPT_QVEC_REPORT_SUPPORTED)
target_compile_options(cpu_kernels PRIVATE "/Qvec-report:2")
endif()

# Vectorization info output for GCC (per the probe variable's name).
if (COMPILER_OPT_GCC_VEC_INFO_SUPPORTED)
target_compile_options(cpu_kernels PRIVATE "-fopt-info-vec-all")
endif()

# LLVM/Clang loop-vectorize analysis remarks (per the probe variable's name,
# intended to surface missed vectorization).
if (COMPILER_OPT_LLVM_VEC_MISSED_INFO_SUPPORTED)
target_compile_options(cpu_kernels PRIVATE "-Rpass-analysis=loop-vectorize")
endif()

# LLVM/Clang remarks for the loop-vectorize pass itself.
if (COMPILER_OPT_LLVM_VEC_LOOPS_INFO_SUPPORTED)
target_compile_options(cpu_kernels PRIVATE "-Rpass=loop-vectorize")
endif()


# Enable inline warnings (-Winline) when compiling the CPU kernels.
if (COMPILER_OPT_WARN_INLINE_SUPPORTED)
target_compile_options(cpu_kernels PRIVATE "-Winline")
endif()

add_library(tile ${CMAKE_CURRENT_SOURCE_DIR}/tile.cpp)

if (CMAKE_CUDA_COMPILER)
Expand Down
61 changes: 39 additions & 22 deletions src/core/cpu_kernels.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#include "cpu_kernels.h"
#include "defines.h"
#include "kernel_common.h"

#include <array>
Expand All @@ -20,7 +21,7 @@ constexpr int simdByteLen{32};
// Outputs an 'initial' distance value based on the type of profile being
// computed
template <typename DISTANCE_TYPE, SCAMPProfileType type>
inline DISTANCE_TYPE init_dist() {
FORCE_INLINE inline DISTANCE_TYPE init_dist() {
switch (type) {
case PROFILE_TYPE_KNN:
case PROFILE_TYPE_APPROX_ALL_NEIGHBORS:
Expand All @@ -37,9 +38,9 @@ inline DISTANCE_TYPE init_dist() {
}

template <SCAMPProfileType PROFILE_TYPE>
inline void update_mp(double *mp, double corr, int row,
int col, // NOLINT(misc-unused-parameters)
double thresh) {
FORCE_INLINE inline void update_mp(double *mp, double corr, int row,
int col, // NOLINT(misc-unused-parameters)
double thresh) {
if (PROFILE_TYPE == PROFILE_TYPE_SUM_THRESH) {
mp[col] = corr > thresh ? mp[col] + corr : mp[col];
} else {
Expand All @@ -48,8 +49,9 @@ inline void update_mp(double *mp, double corr, int row,
}

template <SCAMPProfileType PROFILE_TYPE>
inline void update_mp(mp_entry *mp, double corr, int row, int col,
double thresh) { // NOLINT(misc-unused-parameters)
FORCE_INLINE inline void update_mp(
mp_entry *mp, double corr, int row, int col,
double thresh) { // NOLINT(misc-unused-parameters)
if (PROFILE_TYPE == PROFILE_TYPE_1NN_INDEX) {
if (corr > mp[col].floats[0]) {
mp[col].floats[0] = corr;
Expand All @@ -61,19 +63,20 @@ inline void update_mp(mp_entry *mp, double corr, int row, int col,
}

template <SCAMPProfileType PROFILE_TYPE>
inline void update_mp(float *mp, double corr, int row, int col,
double thresh) { // NOLINT(misc-unused-parameters)
FORCE_INLINE inline void update_mp(
float *mp, double corr, int row, int col,
double thresh) { // NOLINT(misc-unused-parameters)
if (PROFILE_TYPE == PROFILE_TYPE_1NN) {
mp[col] = mp[col] >= corr ? mp[col] : corr;
mp[col] = corr > mp[col] ? corr : mp[col];
} else {
ASSERT(false, "No Implementation provided for updating MP in CPU KERNEL");
}
}

template <typename DATA_TYPE, SCAMPProfileType type>
inline void reduce_row(std::array<DATA_TYPE, unrollWid> &corr,
std::array<int, unrollWid / 2> &corrIdx,
double thresh) { // NOLINT
FORCE_INLINE inline void reduce_row(std::array<DATA_TYPE, unrollWid> &corr,
std::array<int, unrollWid / 2> &corrIdx,
double thresh) { // NOLINT
switch (type) {
case PROFILE_TYPE_1NN_INDEX: {
for (int i = 0; i < unrollWid / 2; i++) {
Expand Down Expand Up @@ -147,13 +150,23 @@ void do_tile(const SCAMPKernelInputArgs<double> &args,
alignas(simdByteLen) std::array<DIST_TYPE, unrollWid>
corr; // NOLINT(cppcoreguidelines-pro-type-member-init,
// hicpp-member-init)
// MSVC and other less sophisticated compilers cannot autovectorize this
// loop unless all accesses appear aligned. We can trick these compilers
// into vectorizing this loop by avoiding the complicated indexing
// within the loop and rather just change the offset of each input array.
double *__restrict cov = args.cov + tile_diag;
const double *__restrict normsa = args.normsa + tile_diag + row;
const double *__restrict normsb = args.normsb;
for (int local_diag = 0; local_diag < unrollWid; local_diag++) {
int curr_diag = tile_diag + local_diag;
int col = curr_diag + row;
DIST_TYPE correlation =
args.cov[curr_diag] * args.normsa[col] * args.normsb[row];
corr[local_diag] =
std::isfinite(correlation) ? correlation : initializer;
corr[local_diag] = cov[local_diag] * normsa[local_diag] * normsb[row];
}
if (args.has_nan_input) {
// Remove any nan values so that they don't pollute the reduction.
// This is expensive on some compilers so only do it if we need to.
for (int local_diag = 0; local_diag < unrollWid; local_diag++) {
corr[local_diag] =
std::isfinite(corr[local_diag]) ? corr[local_diag] : initializer;
}
}
if (computing_cols) {
for (int local_diag = 0; local_diag < unrollWid; local_diag++) {
Expand All @@ -172,11 +185,15 @@ void do_tile(const SCAMPKernelInputArgs<double> &args,
corrIdx[0] + tile_diag + row, row,
args.opt.threshold);
}
// Same as above, avoid complicated indexing within the loop to get
// less sophisticated compilers to vectorize it.
const double *__restrict dfa = args.dfa + tile_diag + row;
const double *__restrict dga = args.dga + tile_diag + row;
double dfb = args.dfb[row];
double dgb = args.dgb[row];
for (int local_diag = 0; local_diag < unrollWid; local_diag++) {
int curr_diag = tile_diag + local_diag;
int col = curr_diag + row;
args.cov[curr_diag] += args.dfa[col] * args.dgb[row];
args.cov[curr_diag] += args.dfb[row] * args.dga[col];
cov[local_diag] += dfa[local_diag] * dgb;
cov[local_diag] += dfb * dga[local_diag];
}
}

Expand Down
5 changes: 4 additions & 1 deletion src/core/cpu_stats.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ void compute_statistics_cpu(const std::vector<double> &T,
int n = T.size() - m + 1;
std::vector<double> norms, df(n), dg(n);
std::vector<double> means;
std::vector<int> nan_idxs;

if (high_precision_norms) {
means = brute_force_moving_mean(T, m);
Expand All @@ -141,10 +142,12 @@ void compute_statistics_cpu(const std::vector<double> &T,
// If the subsequence includes a NaN, we define the norm as NaN
if (nanvalues[i]) {
norms[i] = std::nan("NaN");
nan_idxs.push_back(i);
// Check if the sum of differences from the mean is too small and this
// subsequence should be considered FLAT
} else if (norms[i] <= FLATNESS_EPSILON) {
norms[i] = std::nan("NaN");
nan_idxs.push_back(i);
} else {
// Compute the inverse norm from the sum of squared differences
norms[i] = static_cast<double>(1.0) / std::sqrt(norms[i]);
Expand All @@ -156,7 +159,7 @@ void compute_statistics_cpu(const std::vector<double> &T,
dg[i] = (T[i + m] - means[i + 1]) + (T[i] - means[i]);
}

info->set(means, norms, df, dg);
info->set(means, norms, df, dg, nan_idxs);
}

CombinedStats compute_combined_stats_cpu(const std::vector<double> &A,
Expand Down
1 change: 1 addition & 0 deletions src/core/kernel_common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ SCAMPKernelInputArgs<T>::SCAMPKernelInputArgs(Tile *t, bool transpose,
cols_per_cell = t->info()->cols_per_cell;
global_start_col = t->get_tile_col();
global_start_row = t->get_tile_row();
has_nan_input = t->has_nan_input();
}

template <typename T>
Expand Down
1 change: 1 addition & 0 deletions src/core/kernel_common.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ struct SCAMPKernelInputArgs {
int32_t cols_per_cell;
int64_t global_start_col;
int64_t global_start_row;
bool has_nan_input;

OptionalArgs opt;
void Print();
Expand Down
22 changes: 22 additions & 0 deletions src/core/tile.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,8 @@ T *alloc_mem(size_t count, SCAMPArchitecture arch, int deviceid) {
case CPU_WORKER:
return new T[count]; // NOLINT
}
ASSERT(false, "Architecture not defined");
return nullptr;
}

// Deleter for tile memory which can reside on the host or cuda devices
Expand Down Expand Up @@ -390,6 +392,26 @@ void Tile::InitStats(const PrecomputedInfo &a, const PrecomputedInfo &b,
size_t bytes_b =
(current_tile_height_ - info_->mp_window + 1) * sizeof(double);

// If this tile contains nan inputs we will need to perform potentially more
// expensive computation.
has_nan_input_ = false;
for (const auto &idx : a.nan_idxs()) {
if (idx >= current_tile_col_ &&
idx < current_tile_col_ + current_tile_width_) {
has_nan_input_ = true;
break;
}
}
if (!has_nan_input_) {
for (const auto &idx : b.nan_idxs()) {
if (idx >= current_tile_row_ &&
idx < current_tile_row_ + current_tile_height_) {
has_nan_input_ = true;
break;
}
}
}

// Initialize the tile's local stats based on global statistics "a" and "b"
Memcopy(norms_A_.get(), a.norms().data() + current_tile_col_, bytes_a, false);
Memcopy(norms_B_.get(), b.norms().data() + current_tile_row_, bytes_b, false);
Expand Down
3 changes: 3 additions & 0 deletions src/core/tile.h
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@ class Tile {
size_t current_tile_height_;
size_t current_tile_col_;
size_t current_tile_row_;
// True if this tile has nan inputs.
bool has_nan_input_;

const OpInfo *info_;
ExecInfo exec_info_;
Expand Down Expand Up @@ -90,6 +92,7 @@ class Tile {
size_t get_tile_height() const { return current_tile_height_; }
size_t get_tile_row() const { return current_tile_row_; }
size_t get_tile_col() const { return current_tile_col_; }
bool has_nan_input() const { return has_nan_input_; }
const OpInfo *info() const { return info_; }
void *profile_a() { return profile_a_tile_dev_.at(info_->profile_type); };
void *profile_b() { return profile_b_tile_dev_.at(info_->profile_type); };
Expand Down

0 comments on commit 0f8c54a

Please sign in to comment.