diff --git a/src/hamc/MatrixAdd.cu b/src/hamc/MatrixAdd.cu index b6dcb7d..d560c69 100644 --- a/src/hamc/MatrixAdd.cu +++ b/src/hamc/MatrixAdd.cu @@ -3,13 +3,16 @@ #define ushort unsigned short __global__ void MatrixAdd(ushort *A, ushort *B, ushort *C, - int height, int width,) { - int ROW = blockIdx.y*blockDim.y + threadIdx.y; + int height, int width,) + int ROW = blockIdx.y*blockDim.y + threadIdx.y; - int COL = blockIdx.x*blockDim.x + threadIdx.x; + int COL = blockIdx.x*blockDim.x + threadIdx.x; - if((ROW < height) && (COL < width)){ - int address = ROW*width+COL; - C[address] = A[i] ^ B[i]; - } + int index = row * N + col; + + if( (ROW < numARows) && (COL < numAColumns) ) + { + C[index] = A[index] ^ B[index]; + + } } diff --git a/src/tests/MatrixAdd/MatrixAdd_cpu.c b/src/tests/MatrixAdd/MatrixAdd_cpu.c index fb34c1a..884ad10 100644 --- a/src/tests/MatrixAdd/MatrixAdd_cpu.c +++ b/src/tests/MatrixAdd/MatrixAdd_cpu.c @@ -7,23 +7,8 @@ #define mat_element(mat, cols, row_idx, col_idx) \ mat[(row_idx * cols) + col_idx] -//initialize the matrix -ushort mat_init(int rows, int cols) -{ - if(rows <= 0 || cols <= 0) - { - return NULL; - } - ushort A; - A = (ushort)safe_malloc(sizeof(struct matrix)); - A->cols = cols; - A->rows = rows; - A->data = (ushort *)safe_malloc(rows*cols*sizeof(ushort)); - return A; -} - //Set the value of matix element at position given by the indices to "val" -void set_matrix_element(ushort A, int row_idx, int col_idx, ushort val) +void set_matrix_element(bin_matrix A, int row_idx, int col_idx, bin_matrix val) { if(row_idx < 0 || row_idx >= A->rows || col_idx < 0 || col_idx >= A->cols) { @@ -33,14 +18,14 @@ void set_matrix_element(ushort A, int row_idx, int col_idx, ushort val) mat_element(A, row_idx, col_idx) = val; } -ushort matrix_add(ushort *A, ushort *B) +bin_matrix MatrixAdd_cpu(bin_matrix *A, bin_matrix *B) { - if(A->rows != B->rows || A->cols != B->cols) - { - printf("Incompatible dimensions for matrix addition. \n"); - exit(0); - } - ushort temp mat_init(A->rows, A->cols); + // if(A->rows != B->rows || A->cols != B->cols) + // { + // printf("Incompatible dimensions for matrix addition. \n"); + // exit(0); + // } + // bin_matrix temp mat_init(A->rows, A->cols); for(int i = 0; i < A->rows; i++) { for(int j = 0; j < A->cols; j++) diff --git a/src/tests/MatrixAdd/MatrixAdd_cpu.h b/src/tests/MatrixAdd/MatrixAdd_cpu.h index c52e819..db3cbf3 100644 --- a/src/tests/MatrixAdd/MatrixAdd_cpu.h +++ b/src/tests/MatrixAdd/MatrixAdd_cpu.h @@ -8,9 +8,8 @@ extern "C" { #define ushort unsigned short //FUNCTIONS go in here -ushort mat_init(int rows, int cols); -void set_matrix_element(ushort A, int row_idx, int col_idx, ushort val); -ushort add_matrix(ushort *A, ushort *B); +void set_matrix_element(bin_matrix A, int row_idx, int col_idx, bin_matrix val); +bin_matrix MatrixAdd_cpu(bin_matrix *A, bin_matrix *B); #ifdef __cplusplus diff --git a/src/tests/MatrixAdd/main.cu b/src/tests/MatrixAdd/main.cu index f2b636f..6ffec5d 100644 --- a/src/tests/MatrixAdd/main.cu +++ b/src/tests/MatrixAdd/main.cu @@ -10,6 +10,7 @@ #include "../../hamc/MatrixAdd.cu" #define TILE_WIDTH 16 +#define ushort unsigned short #define CUDA_CHECK(ans) \ { gpuAssert((ans), __FILE__, __LINE__); } @@ -23,6 +24,28 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, } } +#define mat_element(mat, row_idx, col_idx) \ + mat->data[row_idx * (mat->cols) + col_idx] + +typedef struct matrix +{ + int rows; + int cols; + ushort *data; + +}*bin_matrix; + +void* safe_malloc(size_t n) +{ + void* p = malloc(n); + if (!p) + { + fprintf(stderr, "Out of memory(%lu bytes)\n",(size_t)n); + exit(EXIT_FAILURE); + } + return p; +} + void printHelp() { printf("run this executable with the following flags\n"); @@ -32,30 +55,85 @@ void printHelp() printf("\t-s \n"); } +bin_matrix run_cpu(bin_matrix A, bin_matrix B) +{ + if (A->rows != B->rows || A->cols != B->cols){ + printf("Matrices are incompatible, check dimensions...\n"); + exit(0); + } + + return MatrixAdd_cpu(A, B); +} -void run_cpu(const char *in, const char*sol) +bin_matrix run_kernel(bin_matrix A, bin_matrix B) { - int numARows, numAColumns; - ushort *hostA = (ushort *)wbImport(in, &numARows, &numAColumns); - ushort *hostC = (ushort *)malloc(numARows*numAColumns * sizeof(ushort)); + if (A->rows != B->rows || A->cols != B->cols){ + printf("Matrices are incompatible, check dimensions...\n"); + exit(0); + } + + ushort *deviceA; + ushort *deviceB; + ushort *deviceC; + + /* allocate the memory space on GPU */ + wbTime_start(GPU, "Allocating GPU memory."); + cudaMalloc((void **) &deviceA, A->cols * A->rows * sizeof(ushort)); + cudaMalloc((void **) &deviceB, B->cols * B->rows * sizeof(ushort)); + cudaMalloc((void **) &deviceC, B->cols * A->rows * sizeof(ushort)); + wbTime_stop(GPU, "Allocating GPU memory."); + + + cudaMemcpy(deviceA, A->data, A->cols * A->rows * sizeof(ushort), cudaMemcpyHostToDevice); + cudaMemcpy(deviceB, B->data, B->cols * B->rows * sizeof(ushort), cudaMemcpyHostToDevice); + + dim3 DimBlock(TILE_WIDTH, TILE_WIDTH, 1); + int x_blocks = ((B->cols - 1)/TILE_WIDTH) + 1; + int y_blocks = ((A->rows - 1)/TILE_WIDTH) + 1; + dim3 DimGrid(x_blocks, y_blocks, 1); + + matrixAdd<<>>(deviceA, deviceB, deviceC, A->rows, A->cols); + + cudaDeviceSynchronize(); + + wbTime_start(Copy, "Copying output memory to the CPU"); + cudaMemcpy(C->data, deviceC, B->cols * A->rows * sizeof(ushort), cudaMemcpyDeviceToHost); + wbTime_stop(Copy, "Copying output memory to the CPU"); + + wbTime_start(GPU, "Freeing GPU Memory"); + cudaFree(deviceA); + cudaFree(deviceB); + cudaFree(deviceC); + wbTime_stop(GPU, "Freeing GPU Memory"); - matrix_add(hostA, hostC, numARows, numAColumns); + return C; } int main(int argc, char *argv[]) { printf("MatrixAdd test:\n"); + + // Variable - Matrices (Device) wbArg_t args; - - ushort *hostA; // The A matrix - ushort *hostC; // The output C matrix - ushort *deviceA; // A matrix on device - ushort *deviceB; // B matrix on device (copy of A) - ushort *deviceC; // C matrix on device + bin_matrix A; + bin_matrix B; + bin_matrix C; + + // Variable - Matrices (Host) + ushort *hostA; + ushort *hostB; + ushort *hostC; + + // Variables - Rows & Cols int numARows; // number of rows in the matrix A int numAColumns; // number of columns in the matrix A - + int numBRows; // number of rows in the matrix B + int numBColumns; // number of columns in the matrix B + int numCRows; // number of rows in the matrix C + int numCColumns; // number of columns in the matrix C + + //Inputs char *inputFileName; char *solutionFileName; @@ -97,49 +175,20 @@ int main(int argc, char *argv[]) /* allocate host data for matrix */ wbTime_start(Generic, "Importing data and creating memory on host"); + hostA = (ushort *)wbImport(inputFileName, &numARows, &numAColumns); - int numBRows = numARows; // number of rows in the matrix B - int numBColumns = numAColumns; // number of columns in the matrix B - int numCRows = numARows; // number of rows in the matrix C - int numCColumns = numAColumns; // number of columns in the matrix C + A = mat_init(numARows, numACols); + A->data = hostA; + + hostB = (ushort *)wbImport(inputFileName, &numBRows, &numBColumns); + B = mat_init(numARows, numACols); + B->data = hostB; + hostC = (ushort *)malloc(numCRows*numCColumns * sizeof(ushort)); + wbTime_stop(Generic, "Importing data and creating memory on host"); - wbLog(TRACE, "The dimensions of A are ", numARows, " x ", numAColumns); - - - /* allocate the memory space on GPU */ - wbTime_start(GPU, "Allocating GPU memory."); - cudaMalloc((void**) &deviceA, numARows * numAColumns * sizeof(ushort)); - cudaMalloc((void**) &deviceB, numBRows * numBColumns * sizeof(ushort)); - cudaMalloc((void**) &deviceC, numCRows * numCColumns * sizeof(ushort)); - wbTime_stop(GPU, "Allocating GPU memory."); - - - dim3 dimGrid((numCColumns - 1) / 16 + 1, (numCRows - 1) / TILE_WIDTH + 1, 1); - dim3 dimBlock(TILE_WIDTH, TILE_WIDTH, 1); - - /* call CUDA kernel to perform computations */ - wbTime_start(Compute, "Performing CUDA computation for RREF"); - MatrixAdd_kernel<<>>(deviceA, deviceB, deviceC, numARows); - cudaDeviceSynchronize(); - wbTime_stop(Compute, "Performing CUDA computation"); - - - - wbTime_start(Copy, "Copying output memory to the CPU"); - cudaMemcpy(hostC, deviceC, numCRows * numCColumns * sizeof(ushort), cudaMemcpyDeviceToHost); - wbTime_stop(Copy, "Copying output memory to the CPU"); - - /* Free GPU Memory */ - wbTime_start(GPU, "Freeing GPU Memory"); - cudaFree(deviceA); - cudaFree(deviceC); - wbTime_stop(GPU, "Freeing GPU Memory"); - - wbSolution(args, hostC, numCRows, numCColumns); - free(hostA); free(hostB); free(hostC); diff --git a/src/tests/MatrixAdd/sources.cmake b/src/tests/MatrixAdd/sources.cmake index bd51ac4..918c38b 100644 --- a/src/tests/MatrixAdd/sources.cmake +++ b/src/tests/MatrixAdd/sources.cmake @@ -1,3 +1,3 @@ add_lab("ADD_MATRIX_test") -add_lab_solution("ADD_MATRIX_test", ${CMAKE_CURRENT_LIST_DIR}/main.cu) -add_generator("ADD_MATRIX_test" ${CMAKE_CURRENT_LIST_DIR}/dataset_generator.cpp) +add_lab_solution("ADD_MATRIX_test" ${CMAKE_CURRENT_LIST_DIR}/main.cu) +#add_generator("ADD_MATRIX_test", ${CMAKE_CURRENT_LIST_DIR}/dataset_generator.cpp) diff --git a/src/tests/sources.cmake b/src/tests/sources.cmake index baac856..ccc0763 100644 --- a/src/tests/sources.cmake +++ b/src/tests/sources.cmake @@ -1 +1,3 @@ include(${CMAKE_CURRENT_LIST_DIR}/RREFMatrix/sources.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/MatrixAdd/sources.cmake) +include(${CMAKE_CURRENT_LIST_DIR}/InverseMatrix/sources.cmake)