-
Notifications
You must be signed in to change notification settings - Fork 0
/
matmul.cu
79 lines (64 loc) · 2.18 KB
/
matmul.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
#include <cstdio>
#include <cstdlib>
#include "lib/helper_cuda.cuh"
#define N 10000
// Element-wise sum of two N x N row-major matrices: C = A + B.
//
// Launch layout: a 2D grid of 2D blocks that together provide at least N
// threads along both x and y. Threads that fall outside the N x N matrix
// return without touching memory.
//
// threadIdx.x is mapped to the column, i.e. the fastest-varying index of the
// row-major layout, so threads that are adjacent in x read and write adjacent
// floats and their global-memory accesses can coalesce.
//
// A, B: device pointers to N*N input floats (read-only).
// C:    device pointer to N*N output floats.
__global__ void MatAdd(const float *A, const float *B,
                       float *C)
{
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    if (row < N && col < N)
    {
        // Widen before multiplying so the flat index cannot overflow int
        // if N is ever raised.
        const size_t idx = static_cast<size_t>(row) * N + col;
        C[idx] = A[idx] + B[idx];
    }
}
// Host driver: fills two N x N matrices, adds them on the GPU with MatAdd,
// prints the measured kernel time, and checks the result against a host-side
// sum. Returns EXIT_SUCCESS when every element matches, EXIT_FAILURE when a
// mismatch is found.
//
// NOTE(review): checkCudaErrors / getLastCudaError come from
// lib/helper_cuda.cuh, which is not visible here; presumably they report the
// error and terminate the process - confirm.
int main()
{
    // size_t keeps the element and byte counts from wrapping if N grows.
    const size_t size = static_cast<size_t>(N) * N;
    const size_t mem_size = sizeof(float) * size;

    // Allocate the host matrices with cudaMallocHost.
    float *h_A = NULL, *h_B = NULL, *h_C = NULL;
    checkCudaErrors(cudaMallocHost(&h_A, mem_size));
    checkCudaErrors(cudaMallocHost(&h_B, mem_size));
    checkCudaErrors(cudaMallocHost(&h_C, mem_size));
    if (h_A == NULL || h_B == NULL || h_C == NULL)
    {
        fprintf(stderr, "Failed to allocate host vectors!\n");
        exit(EXIT_FAILURE);
    }

    // Initialize input matrices with A = 0, 2, 4, ... and B = 1, 3, 5, ...
    // The sequence wraps at 2^20 so every value (and every sum) is exactly
    // representable in float; a float counter incremented past 2^24 stops
    // advancing and would fill most of the matrices with one constant.
    const size_t wrap = static_cast<size_t>(1) << 20;
    for (size_t i = 0; i < size; ++i)
    {
        const float base = static_cast<float>((2 * i) % wrap);
        h_A[i] = base;
        h_B[i] = base + 1.0f;
    }

    // Allocate matrices in device memory
    float *d_A = NULL, *d_B = NULL, *d_C = NULL;
    checkCudaErrors(cudaMalloc(&d_A, mem_size));
    checkCudaErrors(cudaMalloc(&d_B, mem_size));
    checkCudaErrors(cudaMalloc(&d_C, mem_size));

    // Allocate CUDA events used to time the kernel
    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    // Copy host matrices to device
    checkCudaErrors(cudaMemcpy(d_A, h_A, mem_size, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_B, h_B, mem_size, cudaMemcpyHostToDevice));

    // Invoke kernel: ceil-divide so the grid covers all N rows and columns.
    const dim3 threadsPerBlock(16, 16);
    const dim3 numBlocks((N + threadsPerBlock.x - 1) / threadsPerBlock.x,
                         (N + threadsPerBlock.y - 1) / threadsPerBlock.y);
    checkCudaErrors(cudaEventRecord(start));
    MatAdd<<<numBlocks, threadsPerBlock>>>(d_A, d_B, d_C);
    getLastCudaError("Kernel execution failed");
    checkCudaErrors(cudaEventRecord(stop));

    // Waiting on the stop event also surfaces errors raised while the kernel
    // was running, not just launch-configuration errors.
    checkCudaErrors(cudaEventSynchronize(stop));
    float kernel_ms = 0.0f;
    checkCudaErrors(cudaEventElapsedTime(&kernel_ms, start, stop));
    printf("MatAdd kernel time: %.3f ms\n", kernel_ms);

    // Copy result from device to host
    checkCudaErrors(cudaMemcpy(h_C, d_C, mem_size, cudaMemcpyDeviceToHost));

    // Verify result. Exact comparison is intentional: each output is a single
    // float addition of the same two operands on host and device.
    bool ok = true;
    for (size_t i = 0; i < size; ++i)
    {
        if (h_A[i] + h_B[i] != h_C[i])
        {
            fprintf(stderr, "Result verification failed at element %zu!\n", i);
            ok = false;
            break;
        }
    }

    // Release events, device buffers and host buffers on both the success
    // and the verification-failure path.
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));
    checkCudaErrors(cudaFree(d_A));
    checkCudaErrors(cudaFree(d_B));
    checkCudaErrors(cudaFree(d_C));
    checkCudaErrors(cudaFreeHost(h_A));
    checkCudaErrors(cudaFreeHost(h_B));
    checkCudaErrors(cudaFreeHost(h_C));

    return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}