-
Notifications
You must be signed in to change notification settings - Fork 7
BLAS compatibility library #7
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
7cbf4de
2d72035
08461cc
b531adc
a59dd65
d11fc10
0745071
1da49df
dd0db45
6ecea7a
2dd8b21
e4165cb
7da55be
aeb9ce1
4c499ae
bfcacd9
b50c80e
4947c85
d96de48
4eb4881
c49e272
bdd9f35
adaba04
13aa0e9
e2c32d8
41f75a8
4ab5d11
3c37a15
98f4721
ee113ba
1d53578
c6a86a7
4c22885
2aa28f5
47022fd
8c8e2c2
b1449e1
572a4ca
102c818
d22a4ee
b223108
71a4cf8
14274fc
0b435be
1f43348
9856042
1e8cffe
c1fdfa9
0677b71
03859ee
b1a768d
78a4594
119185c
8338d02
1f8cd57
137d11d
1cb5c90
98434fc
4e23918
7f1fa90
6fd095e
cacf283
9d1173c
9dbcb97
d0ab102
155d0b8
55c9fe8
4e5c34d
8b22b71
ad5d63f
40a61a6
66ecc7e
f9c7c3c
1685fd2
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -6,10 +6,15 @@ | |||||
|
||||||
#include "ArithmeticOperations.h" | ||||||
|
||||||
// All memory accesses are column-major! | ||||||
// I.e. a(i,j) = a[i + LDA * j] | ||||||
// (AB)(i,j) = sum_k a(i,k) * b(k,j) = sum_k a[i + LDA * k] * b[k + LDB * j] | ||||||
// LDA (leading dimension of A) = stride | ||||||
|
||||||
// Annoyingly we have to specialize the innermost loop on whether multiple DRAM flits per number are required or not, | ||||||
// because HLS otherwise gets confused by pragmas applied to a loop of size 1 in the latter case. | ||||||
template <int lines_per_number> | ||||||
void ReadAInner(DramLine const *const mem, hlslib::Stream<PackedFloat> &a_to_feeder, const int size_k, const int n0, | ||||||
void ReadAInner(DramLine const *const mem, hlslib::Stream<PackedFloat> &a_to_feeder, const int size_n, const int n0, | ||||||
const int k) { | ||||||
#pragma HLS INLINE | ||||||
DramLine num[kLinesPerNumber]; | ||||||
|
@@ -19,7 +24,7 @@ void ReadAInner(DramLine const *const mem, hlslib::Stream<PackedFloat> &a_to_fee | |||||
for (int i = 0; i < kLinesPerNumber; ++i) { | ||||||
#pragma HLS PIPELINE II = 1 | ||||||
#pragma HLS LOOP_FLATTEN | ||||||
num[i] = mem[((n0 * kTileSizeN + n1) * size_k + k) * kLinesPerNumber + i]; | ||||||
num[i] = mem[((n0 * kTileSizeN + n1) + k * size_n) * kLinesPerNumber + i]; | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
if (i == kLinesPerNumber - 1) { | ||||||
a_to_feeder.Push(PackedFloat(num)); | ||||||
} | ||||||
|
@@ -28,15 +33,15 @@ void ReadAInner(DramLine const *const mem, hlslib::Stream<PackedFloat> &a_to_fee | |||||
} | ||||||
|
||||||
template <> | ||||||
void ReadAInner<1>(DramLine const *const mem, hlslib::Stream<PackedFloat> &a_to_feeder, const int size_k, const int n0, | ||||||
void ReadAInner<1>(DramLine const *const mem, hlslib::Stream<PackedFloat> &a_to_feeder, const int size_n, const int n0, | ||||||
const int k) { | ||||||
#pragma HLS INLINE | ||||||
ReadA_N: | ||||||
for (int n1 = 0; n1 < kTileSizeN; ++n1) { | ||||||
#pragma HLS PIPELINE II = 1 | ||||||
#pragma HLS LOOP_FLATTEN | ||||||
DramLine num[1]; | ||||||
num[0] = mem[(n0 * kTileSizeN + n1) * size_k + k]; | ||||||
num[0] = mem[((n0 * kTileSizeN + n1) + k * size_n) * kLinesPerNumber]; | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
a_to_feeder.Push(PackedFloat(num)); | ||||||
} | ||||||
} | ||||||
|
@@ -51,7 +56,7 @@ void ReadA(DramLine const *const mem, hlslib::Stream<PackedFloat> &a_to_feeder, | |||||
for (int m0 = 0; m0 < tiles_m; ++m0) { | ||||||
ReadA_K: | ||||||
for (int k = 0; k < size_k; ++k) { | ||||||
ReadAInner<kLinesPerNumber>(mem, a_to_feeder, size_k, n0, k); | ||||||
ReadAInner<kLinesPerNumber>(mem, a_to_feeder, size_n, n0, k); | ||||||
} | ||||||
} | ||||||
} | ||||||
|
@@ -90,7 +95,7 @@ void FeedA(hlslib::Stream<PackedFloat> &a_to_feeder, hlslib::Stream<PackedFloat> | |||||
//////////////////////////////////////////////////////////////////////////////// | ||||||
|
||||||
template <int lines_per_number> | ||||||
void ReadBInner(DramLine const *const mem, hlslib::Stream<PackedFloat> &b_to_feeder, const int size_m, const int m0, | ||||||
void ReadBInner(DramLine const *const mem, hlslib::Stream<PackedFloat> &b_to_feeder, const int size_k, const int m0, | ||||||
const int k) { | ||||||
#pragma HLS INLINE | ||||||
DramLine num[kLinesPerNumber]; | ||||||
|
@@ -100,7 +105,7 @@ void ReadBInner(DramLine const *const mem, hlslib::Stream<PackedFloat> &b_to_fee | |||||
for (int i = 0; i < kLinesPerNumber; ++i) { | ||||||
#pragma HLS PIPELINE II = 1 | ||||||
#pragma HLS LOOP_FLATTEN | ||||||
num[i] = mem[(k * size_m + m0 * kTileSizeM + m1) * kLinesPerNumber + i]; | ||||||
num[i] = mem[(k + (m0 * kTileSizeM + m1) * size_k) * kLinesPerNumber + i]; | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
if (i == kLinesPerNumber - 1) { | ||||||
b_to_feeder.Push(PackedFloat(num)); | ||||||
} | ||||||
|
@@ -109,15 +114,15 @@ void ReadBInner(DramLine const *const mem, hlslib::Stream<PackedFloat> &b_to_fee | |||||
} | ||||||
|
||||||
template <> | ||||||
void ReadBInner<1>(DramLine const *const mem, hlslib::Stream<PackedFloat> &b_to_feeder, const int size_m, const int m0, | ||||||
void ReadBInner<1>(DramLine const *const mem, hlslib::Stream<PackedFloat> &b_to_feeder, const int size_k, const int m0, | ||||||
const int k) { | ||||||
#pragma HLS INLINE | ||||||
ReadB_M: | ||||||
for (int m1 = 0; m1 < kTileSizeM; ++m1) { | ||||||
#pragma HLS PIPELINE II = 1 | ||||||
#pragma HLS LOOP_FLATTEN | ||||||
DramLine num[1]; | ||||||
num[0] = mem[k * size_m + m0 * kTileSizeM + m1]; | ||||||
num[0] = mem[(k + (m0 * kTileSizeM + m1) * size_k) * kLinesPerNumber]; | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
b_to_feeder.Push(PackedFloat(num)); | ||||||
} | ||||||
} | ||||||
|
@@ -132,7 +137,7 @@ void ReadB(DramLine const *const mem, hlslib::Stream<PackedFloat> &b_to_feeder, | |||||
for (int m0 = 0; m0 < tiles_m; ++m0) { | ||||||
ReadB_K: | ||||||
for (int k = 0; k < size_k; ++k) { | ||||||
ReadBInner<kLinesPerNumber>(mem, b_to_feeder, size_m, m0, k); | ||||||
ReadBInner<kLinesPerNumber>(mem, b_to_feeder, size_k, m0, k); | ||||||
} | ||||||
} | ||||||
} | ||||||
|
@@ -169,7 +174,7 @@ void FeedB(hlslib::Stream<PackedFloat> &b_to_feeder, hlslib::Stream<PackedFloat> | |||||
//////////////////////////////////////////////////////////////////////////////// | ||||||
|
||||||
template <int lines_per_number> | ||||||
void ReadCInner(DramLine const *const mem, hlslib::Stream<PackedFloat> &c_to_feeder, const int size_m, const int n0, | ||||||
void ReadCInner(DramLine const *const mem, hlslib::Stream<PackedFloat> &c_to_feeder, const int size_n, const int n0, | ||||||
const int m0, const int n1) { | ||||||
#pragma HLS INLINE | ||||||
ReadC_M: | ||||||
|
@@ -179,7 +184,7 @@ void ReadCInner(DramLine const *const mem, hlslib::Stream<PackedFloat> &c_to_fee | |||||
for (int i = 0; i < kLinesPerNumber; ++i) { | ||||||
#pragma HLS PIPELINE II = 1 | ||||||
#pragma HLS LOOP_FLATTEN | ||||||
num[i] = mem[((n0 * kTileSizeN + n1) * size_m + m0 * kTileSizeM + m1) * kLinesPerNumber + i]; | ||||||
num[i] = mem[((n0 * kTileSizeN + n1) + (m0 * kTileSizeM + m1) * size_n) * kLinesPerNumber + i]; | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
if (i == kLinesPerNumber - 1) { | ||||||
c_to_feeder.Push(PackedFloat(num)); | ||||||
} | ||||||
|
@@ -188,15 +193,15 @@ void ReadCInner(DramLine const *const mem, hlslib::Stream<PackedFloat> &c_to_fee | |||||
} | ||||||
|
||||||
template <> | ||||||
void ReadCInner<1>(DramLine const *const mem, hlslib::Stream<PackedFloat> &c_to_feeder, const int size_m, const int n0, | ||||||
void ReadCInner<1>(DramLine const *const mem, hlslib::Stream<PackedFloat> &c_to_feeder, const int size_n, const int n0, | ||||||
const int m0, const int n1) { | ||||||
#pragma HLS INLINE | ||||||
ReadC_M: | ||||||
for (int m1 = 0; m1 < kTileSizeM; ++m1) { | ||||||
#pragma HLS PIPELINE II = 1 | ||||||
#pragma HLS LOOP_FLATTEN | ||||||
DramLine num[1]; | ||||||
num[0] = mem[(n0 * kTileSizeN + n1) * size_m + m0 * kTileSizeM + m1]; | ||||||
num[0] = mem[((n0 * kTileSizeN + n1) + (m0 * kTileSizeM + m1) * size_n) * kLinesPerNumber]; | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
c_to_feeder.Push(PackedFloat(num)); | ||||||
} | ||||||
} | ||||||
|
@@ -210,7 +215,7 @@ void ReadC(DramLine const *const mem, hlslib::Stream<PackedFloat> &c_to_feeder, | |||||
for (int m0 = 0; m0 < tiles_m; ++m0) { | ||||||
ReadC_N: | ||||||
for (int n1 = 0; n1 < kTileSizeN; ++n1) { | ||||||
ReadCInner<kLinesPerNumber>(mem, c_to_feeder, size_m, n0, m0, n1); | ||||||
ReadCInner<kLinesPerNumber>(mem, c_to_feeder, size_n, n0, m0, n1); | ||||||
} | ||||||
} | ||||||
} | ||||||
|
@@ -290,7 +295,7 @@ void WriteCInner(hlslib::Stream<PackedFloat> &from_kernel, DramLine *const mem, | |||||
} | ||||||
const bool in_bounds = (n0 * kTileSizeN + n1 < size_n) && (m0 * kTileSizeM + m1 < size_m); | ||||||
if (in_bounds) { | ||||||
mem[((n0 * kTileSizeN + n1) * size_m + m0 * kTileSizeM + m1) * kLinesPerNumber + i] = num[i]; | ||||||
mem[((n0 * kTileSizeN + n1) + (m0 * kTileSizeM + m1) * size_n) * kLinesPerNumber + i] = num[i]; | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
|
||||||
} | ||||||
} | ||||||
} | ||||||
|
@@ -308,7 +313,7 @@ void WriteCInner<1>(hlslib::Stream<PackedFloat> &from_kernel, DramLine *const me | |||||
from_kernel.Pop().UnpackFlits(num); | ||||||
const bool in_bounds = (n0 * kTileSizeN + n1 < size_n) && (m0 * kTileSizeM + m1 < size_m); | ||||||
if (in_bounds) { | ||||||
mem[(n0 * kTileSizeN + n1) * size_m + m0 * kTileSizeM + m1] = num[0]; | ||||||
mem[((n0 * kTileSizeN + n1) + (m0 * kTileSizeM + m1) * size_n) * kLinesPerNumber] = num[0]; | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Wait, this is not good -- C has now also become strided access, so we access two arrays strided, and only one sequentially. I'm afraid we have to completely change the iteration space of the computation for this, not just the memory indices |
||||||
} | ||||||
} | ||||||
} | ||||||
|
@@ -354,7 +359,7 @@ void Compute(hlslib::Stream<PackedFloat> &a_in, hlslib::Stream<PackedFloat> &b_i | |||||
const PackedFloat c_read = c_in.Pop(); | ||||||
const PackedFloat a = (m1 == 0) ? a_read : a_buffer; | ||||||
const PackedFloat b = (n1 == 0) ? b_read : b_buffer[m1]; | ||||||
const PackedFloat c = (k == 0) ? c_read : c_buffer[n1 * kTileSizeM + m1]; | ||||||
const PackedFloat c = (k == 0) ? c_read : c_buffer[n1 + m1 * kTileSizeN]; | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There's no point in making this strided, it's a memory of size |
||||||
a_buffer = a; | ||||||
b_buffer[m1] = b; | ||||||
// Ignore contributions from out-of-bound indices | ||||||
|
@@ -363,7 +368,7 @@ void Compute(hlslib::Stream<PackedFloat> &a_in, hlslib::Stream<PackedFloat> &b_i | |||||
const auto res = MultiplyAccumulate(in_bounds ? a : PackedFloat::Zero(), | ||||||
in_bounds ? b : PackedFloat::Zero(), c); | ||||||
// Write back to buffer | ||||||
c_buffer[n1 * kTileSizeM + m1] = res; | ||||||
c_buffer[n1 + m1 * kTileSizeN] = res; | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same as above, we shouldn't make these strided |
||||||
c_out.Push(res); | ||||||
} | ||||||
} | ||||||
|
Uh oh!
There was an error while loading. Please reload this page.