-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.cpp
138 lines (123 loc) · 5.87 KB
/
main.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#include <array>
#include <chrono>
#include <cstddef>
#include <iomanip>
#include <iostream>
#include <random>
#include <sstream>
#include <stdexcept>
#include <utility>

#include <boost/optional/optional.hpp>
#include <cblas.h>
/// Dense, heap-allocated matrix of floats stored in row-major order.
///
/// Element (row, col) lives at data[row * cols() + col]. This is the layout
/// main() hands to cblas_sgemm (CblasRowMajor, leading dimension == cols()).
class Matrix {
    std::size_t _rows;
    std::size_t _cols;
public:
    /// Owning pointer to rows() * cols() contiguous floats. Kept public (and raw)
    /// because callers pass it straight to the C BLAS interface.
    float *data;

    /// Allocates a numRows x numCols matrix.
    ///
    /// @param numRows   number of rows
    /// @param numCols   number of columns
    /// @param initValue if set, every element is initialised to this value;
    ///                  otherwise every element is drawn uniformly from [0, 1000).
    Matrix(const std::size_t numRows, const std::size_t numCols, const boost::optional<float> initValue = boost::none)
        : _rows(numRows), _cols(numCols), data(new float[numRows * numCols]) {
        // Fill the whole buffer linearly: every one of the rows*cols elements must be
        // initialised, not just the first rows+cols-1 of them.
        const std::size_t count = _rows * _cols;
        if (initValue) {
            const float value = *initValue;
            for (std::size_t i = 0; i < count; ++i) {
                data[i] = value;
            }
        } else {
            std::random_device rd;
            std::mt19937 generator(rd());
            std::uniform_real_distribution<float> dist(0, 1000);
            for (std::size_t i = 0; i < count; ++i) {
                data[i] = dist(generator);
            }
        }
    }

    /// Deep copy: the new matrix gets its own buffer, so the two destructors never
    /// free the same allocation.
    Matrix(const Matrix & other)
        : _rows(other._rows), _cols(other._cols), data(new float[other._rows * other._cols]) {
        const std::size_t count = _rows * _cols;
        for (std::size_t i = 0; i < count; ++i) {
            data[i] = other.data[i];
        }
    }

    /// Move: steals the buffer and leaves `other` as an empty 0x0 matrix.
    Matrix(Matrix && other) noexcept
        : _rows(other._rows), _cols(other._cols), data(other.data) {
        other._rows = 0;
        other._cols = 0;
        other.data = nullptr;
    }

    /// Unified copy/move assignment (copy-and-swap). The argument is constructed by
    /// the copy or move constructor; the previous buffer is released when it dies.
    Matrix & operator=(Matrix other) noexcept {
        std::swap(_rows, other._rows);
        std::swap(_cols, other._cols);
        std::swap(data, other.data);
        return *this;
    }

    ~Matrix() {
        delete[] data;
        data = nullptr;
    }

    /// @return number of rows
    decltype(_rows) rows() const {
        return _rows;
    }

    /// @return number of columns
    decltype(_cols) cols() const {
        return _cols;
    }

    /// Reference triple-loop matrix multiply that accumulates into C: C += A * B.
    ///
    /// Accumulating (rather than overwriting) matches the cblas_sgemm calls in
    /// main(), which use alpha = 1 and beta = 1.
    ///
    /// @param A left operand,  shape (m x k)
    /// @param B right operand, shape (k x n)
    /// @param C accumulator,   shape (m x n); updated in place
    /// @throws std::invalid_argument if the shapes are not compatible
    static void gemm_naive(const Matrix & A, const Matrix & B, Matrix & C) {
        if (A.cols() != B.rows() || C.rows() != A.rows() || C.cols() != B.cols()) {
            throw std::invalid_argument("Matrix::gemm_naive: incompatible matrix dimensions");
        }
        const std::size_t m = A.rows();
        const std::size_t n = B.cols();
        const std::size_t k = A.cols();
        for (std::size_t row = 0; row < m; ++row) {
            for (std::size_t col = 0; col < n; ++col) {
                // Start from the existing C(row, col) so the result is C + A*B.
                float acc = C.data[row * n + col];
                for (std::size_t i = 0; i < k; ++i) {
                    acc += A.data[row * k + i] * B.data[i * n + col];
                }
                C.data[row * n + col] = acc;
            }
        }
    }
};
/// Writes the matrix to `stream`, one row per line, elements separated by a space.
///
/// Elements are read in row-major order (element (row, col) at
/// data[row * cols() + col]), the layout main() passes to cblas_sgemm with
/// CblasRowMajor and a leading dimension equal to the column count.
///
/// @param stream destination stream
/// @param m      matrix to print
/// @return `stream`, to allow chaining
std::ostream& operator<<(std::ostream & stream, const Matrix & m) {
    const std::size_t numRows = m.rows();
    const std::size_t numCols = m.cols();
    for (std::size_t row = 0; row < numRows; ++row) {
        // Offset of the first element of this row in the flat buffer.
        const std::size_t rowStart = row * numCols;
        for (std::size_t col = 0; col < numCols; ++col) {
            stream << m.data[rowStart + col] << " ";
        }
        stream << "\n";
    }
    return stream;
}
/// Prints `number` to std::cout right-aligned in a field of `width` characters,
/// in fixed notation with `precision` digits after the decimal point.
///
/// The fixed flag, precision and right adjustment stay set on std::cout after
/// the call; the field width applies to this one number only.
///
/// @return std::cout, so further output can be chained onto the call
std::ostream& aligned(const float number, const int width, const int precision) {
    std::ostream & out = std::cout;
    out.setf(std::ios::fixed);                          // same effect as std::setiosflags(std::ios::fixed)
    out.precision(precision);
    out.setf(std::ios::right, std::ios::adjustfield);   // same effect as std::right
    out.width(width);                                   // consumed by the next insertion
    out << number;
    return out;
}
/// Benchmarks Matrix::gemm_naive against OpenBLAS cblas_sgemm (single-threaded
/// and multi-threaded) on square matrices of dimension stepSize, 2*stepSize, ...,
/// numDims*stepSize, then prints the average time per call and the speed-up ratios.
///
/// All three kernels accumulate into the same C (the BLAS calls use beta = 1),
/// so C is not reset between samples; only the run time is of interest.
int main(int, char *[]) {
    const std::size_t samplesPerDim = 100;
    const std::size_t numDims = 10;
    const std::size_t stepSize = 100;
    const int multiCoreThreads = 4;
    // Divisors converting an average in nanoseconds to milliseconds / seconds.
    const float nsPerMs = 1e6f;
    const float nsPerSec = 1e9f;

    // Nanoseconds elapsed since `start`. long long is at least 64 bits, so summed
    // nanosecond counts cannot overflow the way a 32-bit long could.
    const auto nanosSince = [](const std::chrono::steady_clock::time_point start) -> long long {
        return std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::steady_clock::now() - start).count();
    };

    // results[i] = {naive, cblas single-core, cblas multi-core} average ns per call.
    std::array<std::array<float, 3>, numDims> results{};
    for (std::size_t i = 0; i < numDims; ++i) {
        const std::size_t dim = (i + 1) * stepSize;
        const int intDim = static_cast<int>(dim);   // BLAS takes int dimensions
        long long totalNaiveGemm = 0;
        long long totalCblasGemmSingle = 0;
        long long totalCblasGemmMulti = 0;
        const Matrix A(dim, dim);
        std::cout << " finished building matrix A" << std::endl;
        const Matrix B(dim, dim);
        std::cout << " finished building matrix B" << std::endl;
        Matrix C(dim, dim);
        std::cout << " finished building matrix C" << std::endl;
        for (std::size_t sample = 0; sample < samplesPerDim; ++sample) {
            std::cout << "Dimension " << dim << ", sample " << sample + 1 << std::endl;

            const auto startNaive = std::chrono::steady_clock::now();
            Matrix::gemm_naive(A, B, C);
            totalNaiveGemm += nanosSince(startNaive);
            std::cout << " finished naive gemm" << std::endl;

            openblas_set_num_threads(1);
            const auto startCblas = std::chrono::steady_clock::now();
            cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                        intDim, intDim, intDim,
                        1, A.data, intDim, B.data, intDim, 1, C.data, intDim);
            totalCblasGemmSingle += nanosSince(startCblas);
            std::cout << " finished cblas gemm single-core." << std::endl;

            openblas_set_num_threads(multiCoreThreads);
            const auto startCblasMulti = std::chrono::steady_clock::now();
            cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                        intDim, intDim, intDim,
                        1, A.data, intDim, B.data, intDim, 1, C.data, intDim);
            // Stop the clock BEFORE logging so console I/O is not part of the measurement.
            totalCblasGemmMulti += nanosSince(startCblasMulti);
            std::cout << " finished cblas gemm multi-core (" << multiCoreThreads << " threads)" << std::endl;
        }
        results[i] = {static_cast<float>(totalNaiveGemm) / samplesPerDim,
                      static_cast<float>(totalCblasGemmSingle) / samplesPerDim,
                      static_cast<float>(totalCblasGemmMulti) / samplesPerDim};
        std::cout << "finished dimension " << dim << std::endl << std::endl;
    }

    std::cout << "Dimensions: " << numDims << ", step size: " << stepSize << ", samples per dimension: " << samplesPerDim << std::endl;
    for (std::size_t i = 0; i < numDims; ++i) {
        const float naive = results[i][0];
        const float cblasSingle = results[i][1];
        const float cblasMulti = results[i][2];
        std::cout << "dim "; aligned(static_cast<float>((i + 1) * stepSize), 4, 0) << ":" << std::endl;
        // Each segment below opens with the "s), " that closes the previous timing,
        // so only the last one prints its own closing "s).".
        std::cout << "Naive: "; aligned(naive / nsPerMs, 8, 3) << "ms ("; aligned(naive / nsPerSec, 5, 3);
        std::cout << "s), cblas single-core: "; aligned(cblasSingle / nsPerMs, 8, 3) << "ms ("; aligned(cblasSingle / nsPerSec, 5, 3);
        std::cout << "s), cblas multi-core: "; aligned(cblasMulti / nsPerMs, 8, 3) << "ms ("; aligned(cblasMulti / nsPerSec, 5, 3) << "s)." << std::endl;
        // Guard the divisions: an average of 0 ns would make the ratios meaningless.
        if (cblasSingle > 0) {
            std::cout << "Ratio naive/cblas-single-core: "; aligned(naive / cblasSingle, 5, 2) << std::endl;
            if (cblasMulti > 0) {
                std::cout << "Ratio naive/cblas-multi-core: "; aligned(naive / cblasMulti, 5, 2) << ", cblas-single-core/cblas-multi-core: "; aligned(cblasSingle / cblasMulti, 5, 2) << std::endl;
            }
        }
        std::cout << std::endl;
    }
}