// Tiled_Matrix_Multiplication.cu
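// Tiled (shared-memory) matrix multiplication for the libwb course framework:
// C = A * B, where each thread block computes one TILE_WIDTH x TILE_WIDTH tile
// of C, staging the matching tiles of A and B through shared memory so each
// global-memory element is loaded only once per tile phase.
//
// Build sketch (paths are assumptions; adjust to your libwb checkout):
//   nvcc -I/path/to/libwb Tiled_Matrix_Multiplication.cu -L/path/to/libwb/lib -lwb -o tiled_mm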
#include <wb.h>
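// Side length of the square tiles cached in shared memory (and of each thread block).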
#define TILE_WIDTH 16
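// Check a CUDA API call and abort with a logged error if it failed.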
#define wbCheck(stmt) \
do { \
cudaError_t err = stmt; \
if (err != cudaSuccess) { \
wbLog(ERROR, "Failed to run stmt ", #stmt); \
wbLog(ERROR, "Got CUDA error ... ", cudaGetErrorString(err)); \
return -1; \
} \
} while (0)
// Compute C = A * B
__global__ void matrixMultiplyShared(float *A, float *B, float *C, int numARows,
int numAColumns, int numBRows,
int numBColumns, int numCRows,
int numCColumns) {
//@@ Insert code to implement matrix multiplication here
//@@ You have to use shared memory for this MP
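// One tile of A and one tile of B are staged in shared memory each iteration.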
__shared__ float ds_A[TILE_WIDTH][TILE_WIDTH];
__shared__ float ds_B[TILE_WIDTH][TILE_WIDTH];
int bx = blockIdx.x;
int by = blockIdx.y;
int tx = threadIdx.x;
int ty = threadIdx.y;
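// Row and Col index the single element of C this thread is responsible for.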
int Row = by * blockDim.y + ty;
int Col = bx * blockDim.x + tx;
float CValue = 0.0f; // running dot product for C[Row][Col]
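// Sweep over all tiles along the shared dimension; (numAColumns - 1)/TILE_WIDTH + 1
// is the ceiling of numAColumns / TILE_WIDTH, so partial tiles are included.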
for (int t = 0; t < (numAColumns - 1)/TILE_WIDTH + 1; t++)
{
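// Load this thread's element of the current A tile, zero-padding out-of-range reads.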
if (t * TILE_WIDTH + tx < numAColumns && Row < numARows)
{
ds_A[ty][tx] = A[Row * numAColumns + t * TILE_WIDTH + tx];
}
else
{
ds_A[ty][tx] = 0.0f;
}
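// Load this thread's element of the current B tile with the same zero-padding.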
if (t * TILE_WIDTH + ty < numBRows && Col < numBColumns)
{
ds_B[ty][tx] = B[(t * TILE_WIDTH + ty) * numBColumns + Col];
}
else
{
ds_B[ty][tx] = 0.0f;
}
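// Wait until every thread has finished loading before the tile is read.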
__syncthreads();
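// Accumulate the partial dot product contributed by this pair of tiles.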
for (int i = 0; i < TILE_WIDTH; i++)
{
CValue += (ds_A[ty][i] * ds_B[i][tx]);
}
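// Wait until every thread is done with the tile before it is overwritten.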
__syncthreads();
}
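// Guard the write: threads that fall outside the bounds of C store nothing.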
if (Row < numCRows && Col < numCColumns)
{
C[Row * numCColumns + Col] = CValue;
}
}
int main(int argc, char **argv) {
wbArg_t args;
float *hostA; // The A matrix
float *hostB; // The B matrix
float *hostC; // The output C matrix
float *deviceA;
float *deviceB;
float *deviceC;
int numARows; // number of rows in the matrix A
int numAColumns; // number of columns in the matrix A
int numBRows; // number of rows in the matrix B
int numBColumns; // number of columns in the matrix B
int numCRows; // number of rows in the matrix C (you have to set this)
int numCColumns; // number of columns in the matrix C (you have to set this)
args = wbArg_read(argc, argv);
wbTime_start(Generic, "Importing data and creating memory on host");
hostA =
( float * )wbImport(wbArg_getInputFile(args, 0), &numARows, &numAColumns);
hostB =
( float * )wbImport(wbArg_getInputFile(args, 1), &numBRows, &numBColumns);
//@@ Set numCRows and numCColumns
numCRows = numARows;
numCColumns = numBColumns;
//@@ Allocate the hostC matrix
size_t sizeA = (size_t)numARows * numAColumns * sizeof(float);
size_t sizeB = (size_t)numBRows * numBColumns * sizeof(float);
size_t sizeC = (size_t)numCRows * numCColumns * sizeof(float);
hostC = (float *)malloc(sizeC);
wbTime_stop(Generic, "Importing data and creating memory on host");
wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns);
wbLog(TRACE, "The dimensions of B are ", numBRows, " x ", numBColumns);
wbTime_start(GPU, "Allocating GPU memory.");
//@@ Allocate GPU memory here
wbCheck(cudaMalloc((void**)&deviceA, sizeA));
wbCheck(cudaMalloc((void**)&deviceB, sizeB));
wbCheck(cudaMalloc((void**)&deviceC, sizeC));
wbTime_stop(GPU, "Allocating GPU memory.");
wbTime_start(GPU, "Copying input memory to the GPU.");
//@@ Copy memory to the GPU here
wbCheck(cudaMemcpy(deviceA, hostA, sizeA, cudaMemcpyHostToDevice));
wbCheck(cudaMemcpy(deviceB, hostB, sizeB, cudaMemcpyHostToDevice));
wbTime_stop(GPU, "Copying input memory to the GPU.");
//@@ Initialize the grid and block dimensions here
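// One thread per element of C: TILE_WIDTH x TILE_WIDTH threads per block,
// and enough blocks (rounding up) to cover all of C.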
dim3 gridDim3((numCColumns - 1)/TILE_WIDTH + 1, (numCRows - 1)/TILE_WIDTH + 1, 1);
dim3 blockDim3(TILE_WIDTH, TILE_WIDTH, 1);
wbTime_start(Compute, "Performing CUDA computation");
//@@ Launch the GPU Kernel here
matrixMultiplyShared <<< gridDim3, blockDim3 >>> (deviceA, deviceB, deviceC,
numARows, numAColumns, numBRows,
numBColumns, numCRows, numCColumns);
wbCheck(cudaDeviceSynchronize()); // block until the kernel finishes and surface any launch errors
wbTime_stop(Compute, "Performing CUDA computation");
wbTime_start(Copy, "Copying output memory to the CPU");
//@@ Copy the GPU memory back to the CPU here
wbCheck(cudaMemcpy(hostC, deviceC, sizeC, cudaMemcpyDeviceToHost));
wbTime_stop(Copy, "Copying output memory to the CPU");
wbTime_start(GPU, "Freeing GPU Memory");
//@@ Free the GPU memory here
wbCheck(cudaFree(deviceA));
wbCheck(cudaFree(deviceB));
wbCheck(cudaFree(deviceC));
wbTime_stop(GPU, "Freeing GPU Memory");
wbSolution(args, hostC, numCRows, numCColumns);
free(hostA);
free(hostB);
free(hostC);
return 0;
}