-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmatrix-multiplication.cu
147 lines (125 loc) · 4.95 KB
/
matrix-multiplication.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <cuda.h>
#define MATRIX_SIZE 8192
#define CPU_MATRIX_SIZE 1024
__global__ void gpu_matrix_mult(const int *a, const int *b, int *c, const int n)
{
const int row = blockIdx.y * blockDim.y + threadIdx.y;
if(const int col = blockIdx.x * blockDim.x + threadIdx.x; col < n && row < n)
{
const int offset = row * n;
int sum = 0;
for(int i = 0; i < n; ++i)
sum += a[offset + i] * b[i * n + col];
c[offset + col] = sum;
}
}
void cpu_matrix_mult (const int *a, const int *b, int *c, const int n)
{
for (int i = 0; i < n; ++i)
for (int j = 0; j < n; ++j)
{
const int offset = i * n;
int sum_mult = 0;
for (int k = 0; k < n; ++k)
sum_mult += a[offset + k] * b[k * n + j];
c[offset + j] = sum_mult;
}
}
int main(int argc, char const *argv[])
{
int block_size;
/// retrieve some info about the CUDA device
int nDevices;
cudaGetDeviceCount(&nDevices);
for (int i = 0; i < nDevices; i++) {
cudaDeviceProp prop;
cudaGetDeviceProperties(&prop, i);
printf("Device Number: %d\n", i);
printf(" Device name: %s\n", prop.name);
printf(" max Blocks Per MultiProcessor: %d\n", prop.maxBlocksPerMultiProcessor);
printf(" max Threads Per MultiProcessor: %d\n", prop.maxThreadsPerMultiProcessor);
printf(" max Threads Per Block: %d\n", prop.maxThreadsPerBlock);
printf(" num SM: %d\n", prop.multiProcessorCount);
printf(" num bytes sharedMem Per Block: %d\n", prop.sharedMemPerBlock);
printf(" num bytes sharedMem Per Multiprocessor: %d\n", prop.sharedMemPerMultiprocessor);
printf(" Memory Clock Rate (KHz): %d\n",
prop.memoryClockRate);
printf(" Memory Bus Width (bits): %d\n",
prop.memoryBusWidth);
printf(" Peak Memory Bandwidth (GB/s): %f\n\n",
2.0*prop.memoryClockRate*(prop.memoryBusWidth/8)/1.0e6);
}
{
int *a, *b, *c;
a = (int*)malloc(sizeof(int)*CPU_MATRIX_SIZE*CPU_MATRIX_SIZE);
b = (int*)malloc(sizeof(int)*CPU_MATRIX_SIZE*CPU_MATRIX_SIZE);
c = (int*)malloc(sizeof(int)*CPU_MATRIX_SIZE*CPU_MATRIX_SIZE);
// initialize matrix A
for (int i = 0; i < CPU_MATRIX_SIZE; ++i) {
for (int j = 0; j < CPU_MATRIX_SIZE; ++j) {
a[i * CPU_MATRIX_SIZE + j] = 2;
}
}
// initialize matrix B
for (int i = 0; i < CPU_MATRIX_SIZE; ++i) {
for (int j = 0; j < CPU_MATRIX_SIZE; ++j) {
b[i * CPU_MATRIX_SIZE + j] = 3;
}
}
// sequential version of matrix multiplication
clock_t begin = clock();
cpu_matrix_mult(a, b, c, CPU_MATRIX_SIZE);
clock_t end = clock();
double time_spent = ((double)((end - begin)) * 1000) / CLOCKS_PER_SEC;
printf("Time elapsed on naive CPU sequential matrix multiplication of %dx%d . %dx%d: %f ms\n\n", CPU_MATRIX_SIZE, CPU_MATRIX_SIZE, CPU_MATRIX_SIZE, CPU_MATRIX_SIZE, time_spent);
free(a);
free(b);
free(c);
}
for(block_size= 4; block_size <= 32; block_size *= 2)
{
int *a, *b, *c;
cudaMallocManaged((void **) &a, sizeof(int)*MATRIX_SIZE*MATRIX_SIZE);
cudaMallocManaged((void **) &b, sizeof(int)*MATRIX_SIZE*MATRIX_SIZE);
cudaMallocManaged((void **) &c, sizeof(int)*MATRIX_SIZE*MATRIX_SIZE);
// initialize matrix A
for (int i = 0; i < MATRIX_SIZE; ++i) {
for (int j = 0; j < MATRIX_SIZE; ++j) {
a[i * MATRIX_SIZE + j] = 2;
}
}
// initialize matrix B
for (int i = 0; i < MATRIX_SIZE; ++i) {
for (int j = 0; j < MATRIX_SIZE; ++j) {
b[i * MATRIX_SIZE + j] = 3;
}
}
float naive_gpu_elapsed_time_ms;
// some events to count the execution time
//clock_t st, end;
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
unsigned int grid_rows = (MATRIX_SIZE + block_size - 1) / block_size;
unsigned int grid_cols = (MATRIX_SIZE + block_size - 1) / block_size;
dim3 dimGrid(grid_cols, grid_rows);
dim3 dimBlock(block_size, block_size);
cudaEventRecord(start, 0);
gpu_matrix_mult<<<dimGrid, dimBlock>>>(a, b, c, MATRIX_SIZE);
cudaThreadSynchronize();
// time counting terminate
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
// compute time elapsed on GPU computing
cudaEventElapsedTime(&naive_gpu_elapsed_time_ms, start, stop);
printf("Time elapsed on naive GPU matrix multiplication of %dx%d . %dx%d (%d): %f ms.\n\n", MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, MATRIX_SIZE, block_size, naive_gpu_elapsed_time_ms);
// free memory
cudaFree(a);
cudaFree(b);
cudaFree(c);
}
return 0;
}