#include <array>
#include <iostream>
#include <boost/random/uniform_int_distribution.hpp>
#include <boost/random/mersenne_twister.hpp>
#include <cooperative_groups/memcpy_async.h>
#include <cuda/cmath>
#include <cuda/std/array>
#include <cuda/std/chrono>
#include <cutlass/epilogue/thread/activation.h>
#include <fmt/ranges.h>
#include <cuda.h>
#include <nccl.h>
#include <nvshmemx.h>
#include <nvshmem.h>
#include <host/nvshmemx_api.h> // Makes CLion happy
//#include "torchInclude.h"
#include "processor/gemm.cuh"
#define NANO_TO_MICRO (cuda::std::nano::den / cuda::std::micro::den)
#if !defined(CHECK_ERROR_EXIT)
# define CHECK_ERROR_EXIT(e) \
do { \
cudaError_t code = (e); \
if (code != cudaSuccess) { \
fprintf(stderr, "<%s:%d> %s:\n %s: %s\n", \
__FILE__, __LINE__, #e, \
cudaGetErrorName(code), cudaGetErrorString(code)); \
fflush(stderr); \
exit(1); \
} \
} while (0)
#endif
constexpr unsigned int len = 10000U;
template<unsigned int n>
__device__ __forceinline__
unsigned int blockManipulation(const cuda::std::array<bool, n>& isRemote, const unsigned int& idx) {
unsigned int numPeers = 0U;
cuda::std::array<unsigned int, n> peers{};
#pragma unroll
for(unsigned int i = 0U; i < n; ++i) {
// Predicated select: b is true exactly when peer i belongs to block idx
// (block 0 takes the remote peers; every other block takes the local ones)
const bool b = (idx > 0) * !isRemote[i] + isRemote[i] * (idx == 0);
// Branch-free conditional append: write i only when b is set, then bump the count
peers[numPeers] = !b * peers[numPeers] + i * b;
numPeers += b;
}
return numPeers;
}
template<unsigned int n>
__device__ __forceinline__
unsigned int blockManipulationBranch(const cuda::std::array<bool, n>& isRemote, const unsigned int& idx) {
unsigned int numPeers = 0U;
cuda::std::array<unsigned int, n> peers{};
#pragma unroll
for(unsigned int i = 0U; i < n; ++i) {
if ((isRemote[i] && idx == 0) || (!isRemote[i] && idx > 0)) {
peers[numPeers++] = i;
}
}
return numPeers;
}
template<unsigned int n>
__global__ void benchBranch(const bool* in, __grid_constant__ const unsigned int idx) {
cuda::std::array<bool, n> isRemote{};
size_t start, end;
double duration = 0.0;
#pragma unroll
for (unsigned int i = 0; i < n; ++i) {
isRemote[i] = in[i];
}
constexpr unsigned int runs = 4;
#pragma unroll
for (unsigned int i = 0; i < runs; ++i) {
asm volatile("mov.u64 %0, %%globaltimer;": "=l"(start)::);
blockManipulation<len>(isRemote, idx);
asm volatile("mov.u64 %0, %%globaltimer;": "=l"(end)::);
duration += static_cast<double>(end - start) / static_cast<double>(runs);
}
printf("Branch less is %f, res is %u\n", duration, blockManipulation<len>(isRemote, idx));
duration = 0.0;
#pragma unroll
for (unsigned int i = 0; i < runs; ++i) {
asm volatile("mov.u64 %0, %%globaltimer;": "=l"(start)::);
blockManipulationBranch<len>(isRemote, idx);
asm volatile("mov.u64 %0, %%globaltimer;": "=l"(end)::);
duration += static_cast<double>(end - start) / static_cast<double>(runs);
}
printf("Branch is %f, res is %u\n", duration, blockManipulationBranch<len>(isRemote, idx));
}
__attribute__((always_inline)) inline
void launchBenchBranch() {
boost::random::mt19937 rng(cuda::std::chrono::high_resolution_clock::now().time_since_epoch().count());
const boost::random::uniform_int_distribution<> bits(0,1);
std::array<bool, len> b{};
for (unsigned int i = 0; i < len; ++i) {
b[i] = bits(rng);
}
//fmt::println("{}", b);
bool* bDevice;
constexpr unsigned int idx = 1U;
CHECK_ERROR_EXIT(cudaMalloc(&bDevice, sizeof(bool)*len));
CHECK_ERROR_EXIT(cudaMemcpy(bDevice, b.data(), sizeof(bool)*len, cudaMemcpyHostToDevice));
benchBranch<len><<<1,1>>>(bDevice, idx);
CHECK_ERROR_EXIT(cudaPeekAtLastError());
CHECK_ERROR_EXIT(cudaDeviceSynchronize());
CHECK_ERROR_EXIT(cudaFree(bDevice));
}
struct __align__(16) Args {
double* sHeap;
uint64_t* flags;
double* result;
unsigned int n;
unsigned int rank;
bool remotePresent;
unsigned int processingRate;
Args() = default;
Args(double* _sHeap, uint64_t * _flags,
double* _result, const unsigned int& _n,
const unsigned int& _rank, const bool& _remotePresent, const unsigned int& _processingRate)
: sHeap(_sHeap),
flags(_flags),
result(_result),
n(_n),
rank(_rank),
remotePresent(_remotePresent),
processingRate(_processingRate) {}
};
__constant__ Args b{};
void __global__ testArgs() {
printf("Args has rank %u, results %f\n", b.rank, b.result[0]);
b.sHeap[0] = 45.0;
b.result[0] = 59.0;
printf("Args has rank %u, results %f\n", b.rank, b.result[0]);
}
#define TO_MB(bytes) (static_cast<double>(bytes) / (1024.0 * 1024.0))
void testArgsHost() {
void* p;
CHECK_ERROR_EXIT(cudaMalloc(&p, sizeof(double)*4));
CHECK_ERROR_EXIT(cudaMemset(p, 0, sizeof(double)*4));
const auto a = Args(static_cast<double*>(p),
static_cast<uint64_t *>(p) + 1,
static_cast<double*>(p) + 2,
1, 0, true, 1);
CHECK_ERROR_EXIT(cudaMemcpyToSymbol(b, &a, sizeof(Args)));
testArgs<<<1,1>>>();
CHECK_ERROR_EXIT(cudaPeekAtLastError());
CHECK_ERROR_EXIT(cudaDeviceSynchronize());
CHECK_ERROR_EXIT(cudaFree(p));
std::cout << TO_MB(1024*1024) << " MB" << std::endl;
}
auto constexpr runs = 64U;
constexpr auto nBytes = sizeof(int);
enum signal : unsigned short {
NOOP,
shouldProcess,
processed,
};
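// Signal protocol used by overlapKernel below: a producer sets the consumer's flag to
// shouldProcess once stage-0 data has landed, and to processed once the post-GEMM
// stage-1 data has landed; the consuming block spins on the matching value.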
#define STAGES 2U
#define CELLS 2U
template<unsigned int stage = 0, typename T> // stage first so call sites can write advanceHeap<0>(...)
requires (stage < STAGES && !cuda::std::is_same_v<T, void>) // Pointer arithmetic on void is undefined
CUTE_DEVICE
T* advanceHeap(T* const& __restrict__ buffer, const unsigned int& slotSize) {
return buffer + slotSize * (STAGES + stage);
}
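// advanceHeap<stage> skips the first STAGES slots and returns the landing slot for the
// given pipeline stage, i.e. buffer + slotSize * (STAGES + stage) elements of T
// (bytes, for the cuda::std::byte heap used below).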
template<class GEMM, unsigned short world, unsigned short rounds, bool skip=true>
requires (GEMM::block_dim.x >= STAGES)
__global__ void overlapKernel(const typename GEMM::b_value_type* __restrict__ weights,
typename GEMM::c_value_type* __restrict__ result,
cuda::std::byte* __restrict__ sHeap, cuda::std::byte* __restrict__ staging,
uint64_t* __restrict__ flags,
CUTE_GRID_CONSTANT const int rank) {
// The workflow operates as follows, assuming each PE has a weight matrix and starts with an input matrix.
// 1. At time i, all-to-all to disseminate vector v_i
// 2. GEMM on all received vectors
// 3. All-to-all to reconstitute the original vector v_i
// 4. Process the received vector
// 5. Repeat
static_assert(signal::processed == STAGES);
static_assert(cublasdx::size_of<GEMM>::n == cublasdx::size_of<GEMM>::k);
extern __shared__ __align__(16) char workspace[];
__shared__ unsigned int bid;
// One C tile per slot; the async copies and NVSHMEM puts below take sizes in bytes
constexpr auto cBytes = GEMM::c_size * sizeof(typename GEMM::c_value_type);
// Ensures a 32-bit single register is used
const unsigned int tid = cooperative_groups::this_thread_block().thread_rank();
if (tid == 0) {
// grid::block_rank() == peer rank
bid = cooperative_groups::this_grid().block_rank();
}
__syncthreads();
// Advance every thread's copy of these pointers; doing so only in thread 0 would
// leave the remaining threads addressing peer 0's slots
sHeap += STAGES * cBytes * bid;
staging += cBytes * bid;
// Make global memory tensor
auto tBgB = cublasdx::make_tensor(weights, GEMM::get_layout_gmem_b());
auto tCgC = cublasdx::make_tensor(result, GEMM::get_layout_gmem_c());
auto [sA, sB, sC] = GEMM::slice_shared_memory(workspace);
// Make shared memory tensor
auto tAsA = cublasdx::make_tensor(sA, GEMM::suggest_layout_smem_a());
auto tBsB = cublasdx::make_tensor(sB, GEMM::suggest_layout_smem_b());
auto tCsC = cublasdx::make_tensor(sC, GEMM::suggest_layout_smem_c());
// Load data from global memory tensor to shared memory tensor
// Note each block has identical copy of weights
using alignment = cublasdx::alignment_of<GEMM>;
cublasdx::copy<GEMM, alignment::b>(tBgB, tBsB);
cublasdx::copy<GEMM, alignment::c>(tCgC, tCsC);
cublasdx::copy_wait();
CUTE_UNROLL
for (unsigned short i = 0; i < rounds; ++i) {
// One message per peer per round is the upper bound; stage the local C tile for the put
memcpy_async(cooperative_groups::this_thread_block(), staging,
reinterpret_cast<const cuda::std::byte*>(sC), cBytes);
wait(cooperative_groups::this_thread_block());
// Communicate vector to peer
nvshmemx_putmem_signal_nbi_block(advanceHeap<0>(sHeap, cBytes),
staging, (bid != rank) * cBytes,
flags + bid, shouldProcess, NVSHMEM_SIGNAL_SET, bid);
if (!tid) {
// Await data arrival
nvshmem_signal_wait_until(flags + bid, NVSHMEM_CMP_EQ, shouldProcess);
}
__syncthreads();
/// First stage
// Copy received data to shared memory workspace
cooperative_groups::memcpy_async(cooperative_groups::this_thread_block(),
reinterpret_cast<cuda::std::byte*>(sA), advanceHeap<0>(sHeap, cBytes), cBytes);
wait(cooperative_groups::this_thread_block());
// Execute GEMM
GEMM().execute(GEMM::a_value_type(1.0), tAsA, tBsB, GEMM::c_value_type(0.0), tCsC);
__syncthreads();
memcpy_async(cooperative_groups::this_thread_block(), staging,
reinterpret_cast<const cuda::std::byte*>(sC), cBytes);
wait(cooperative_groups::this_thread_block());
// Eagerly communicate computed vector to peer
nvshmemx_putmem_signal_nbi_block(advanceHeap<1>(sHeap, cBytes),
staging, (bid != rank) * cBytes,
flags + world + bid, processed, NVSHMEM_SIGNAL_SET, bid);
// Second Stage
if (!tid) {
// Await data arrival
nvshmem_signal_wait_until(flags + world + bid, NVSHMEM_CMP_EQ, processed);
}
__syncthreads();
cooperative_groups::memcpy_async(cooperative_groups::this_thread_block(),
reinterpret_cast<cuda::std::byte*>(sA), advanceHeap<1>(sHeap, cBytes), cBytes);
wait(cooperative_groups::this_thread_block());
// Fused GEMM and ReLU
GEMM().execute(GEMM::a_value_type(1.0), tAsA, tBsB, GEMM::c_value_type(0.0), tCsC,
cublasdx::identity{}, cublasdx::identity{}, cublasdx::identity{},
cutlass::epilogue::thread::ReLU<typename GEMM::c_value_type>{});
}
// Store the final result tile from shared back to global memory
cublasdx::copy<GEMM, alignment::c>(tCsC, tCgC);
}
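// A minimal, hedged launch sketch (commented out). `GemmOp`, the buffer pointers, and
// the world/rounds values are assumptions for illustration, not definitions from this file:
//
// using GemmOp = decltype(cublasdx::Size<64, 64, 64>() + cublasdx::Precision<float>()
//     + cublasdx::Type<cublasdx::type::real>() + cublasdx::Function<cublasdx::function::MM>()
//     + cublasdx::SM<800>() + cublasdx::Block());
// constexpr unsigned short world = 2, rounds = 4;
// CHECK_ERROR_EXIT(cudaFuncSetAttribute(overlapKernel<GemmOp, world, rounds>,
//     cudaFuncAttributeMaxDynamicSharedMemorySize, GemmOp::shared_memory_size));
// // One block per PE; weights and result are device buffers, while sHeap, staging,
// // and flags live on the NVSHMEM symmetric heap, sized as described in overlapPrototype.
// overlapKernel<GemmOp, world, rounds><<<world, GemmOp::block_dim, GemmOp::shared_memory_size>>>(
//     weights, result, sHeap, staging, flags, nvshmem_my_pe());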
void overlapPrototype() {
// The number of blocks should equal n (one block per peer)
nvshmem_init();
CUTE_CHECK_ERROR(cudaSetDevice(nvshmem_team_my_pe(NVSHMEMX_TEAM_NODE)));
// Total symmetric-heap memory = GEMM::c_size * n * (STAGES + 1): STAGES landing slots plus one staging slot per peer
auto* p = nvshmem_align(16, 2*sizeof(float)*nvshmem_n_pes());
CUTE_CHECK_ERROR(cudaMemset(p, 0, nBytes));
cudaEvent_t start, stop;
CUTE_CHECK_ERROR(cudaEventCreate(&start));
CUTE_CHECK_ERROR(cudaEventCreate(&stop));
CUTE_CHECK_ERROR(cudaEventRecord(start));
// The overlapKernel launch would go here, between the two event records
CUTE_CHECK_ERROR(cudaEventRecord(stop));
CUTE_CHECK_ERROR(cudaPeekAtLastError());
CUTE_CHECK_LAST();
float duration = 0.0f;
CUTE_CHECK_ERROR(cudaEventSynchronize(stop));
CUTE_CHECK_ERROR(cudaEventElapsedTime(&duration, start, stop));
fmt::println("Elapsed time {} ms", duration);
CUTE_CHECK_ERROR(cudaEventDestroy(start));
CUTE_CHECK_ERROR(cudaEventDestroy(stop));
nvshmem_free(p);
nvshmem_finalize();
}
void testGEMM() {
introduction_example<800>();
}
void testArrangement() {
std::array<int, 4> a{{0, 1, 2, 3}};
const auto t = make_tensor(a.data(), cute::make_shape(2,2));
print_tensor(t);
fmt::println("a[0][1] is {}, a[1][1] is {}", t(0,1), t(1,1));
const auto tT = make_tensor(a.data(), cute::make_shape(2,2), cute::LayoutRight{});
print_tensor(tT);
fmt::println("a[0][1] is {}, a[1][1] is {}", tT(0,1), tT(1,1));
}
template<unsigned int Arch>
__global__ void testArch() {
printf("%u", 5);
}
int main() {
testArch<800><<<1,1>>>();
CHECK_ERROR_EXIT(cudaPeekAtLastError());
// Synchronize so the kernel's printf is flushed before the process exits
CHECK_ERROR_EXIT(cudaDeviceSynchronize());
//testGEMM();
return 0;
}