diff --git a/README.md b/README.md index d2fa33d..adaf2e7 100644 --- a/README.md +++ b/README.md @@ -3,11 +3,27 @@ Project 0 Getting Started **University of Pennsylvania, CIS 5650: GPU Programming and Architecture, Project 0** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Jefferson Koumba Moussadji Lu + * [LinkedIn](https://www.linkedin.com/in/-jeff-koumba-0b356721b/), [personal website](), [twitter](), etc. +* Tested on: Personal Laptop, Windows 11 Home, Intel(R) Core(TM) i9-14900HX @ 2.22GHz @ 24 Cores @ 32GB RAM, Nvidia GeForce RTX 4090 @ 16 GB @ SM 8.9 -### (TODO: Your README) +**CUDA GL Check** +![](images/CUDA_GL_Check.png) -Include screenshots, analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +**NVIDIA Nsight Debugging** +![](images/Nsight_Debugging_Windows_VS.png) + +**NVIDIA Nsight System Analysis** +![](images/NVIDIA_Nsight_Systems_Analysis_Summary.png) + +**NVIDIA Nsight System Timeline** +![](images/NVIDIA_Nsight_Systems_Timeline.png) + +**NVIDIA Nsight Compute** +![](images/NVIDIA_Nsight_Compute.png) + +**WebGL** +![](images/WebGL_Report.png) + +**WebGPU** +![](images/WebGPU.png) \ No newline at end of file diff --git a/cuda-gl-check/src/main.cpp b/cuda-gl-check/src/main.cpp index 886fd4c..f63f2ee 100644 --- a/cuda-gl-check/src/main.cpp +++ b/cuda-gl-check/src/main.cpp @@ -11,7 +11,7 @@ */ int main(int argc, char* argv[]) { // TODO: Change this line to use your name! - m_yourName = "TODO: YOUR NAME HERE"; + m_yourName = "Jefferson Koumba Moussadji Lu"; if (init(argc, argv)) { mainLoop(); diff --git a/cuda-introduction/source/common.cu b/cuda-introduction/source/common.cu index dce8793..a4dcdf7 100644 --- a/cuda-introduction/source/common.cu +++ b/cuda-introduction/source/common.cu @@ -9,7 +9,7 @@ unsigned divup(unsigned size, unsigned div) { // TODO: implement a 1 line function to return the divup operation. // Note: You only need to use addition, subtraction, and division operations. - return 0; + return (size + div - 1) / div; } void clearHostAndDeviceArray(float *res, float *dev_res, unsigned size, const int value) diff --git a/cuda-introduction/source/matmul.cu b/cuda-introduction/source/matmul.cu index 826e535..3f8dfee 100644 --- a/cuda-introduction/source/matmul.cu +++ b/cuda-introduction/source/matmul.cu @@ -12,17 +12,25 @@ __global__ void matrixMultiplicationNaive(float* const matrixP, const float* con { // TODO 10a: Compute the P matrix global index for each thread along x and y dimentions. // Remember that each thread of the kernel computes the result of 1 unique element of P - unsigned px; - unsigned py; + unsigned px = blockIdx.x * blockDim.x + threadIdx.x; + unsigned py = blockIdx.y * blockDim.y + threadIdx.y; // TODO 10b: Check if px or py are out of bounds. If they are, return. + if (px >= sizeMX || py >= sizeNY) + return; // TODO 10c: Compute the dot product for the P element in each thread // This loop will be the same as the host loop float dot = 0.0; + for (int k = 0; k < sizeXY; k++) { + const float m = matrixM[k * sizeMX + px]; + const float n = matrixN[py * sizeXY + k]; + dot += m * n; + } // TODO 10d: Copy dot to P matrix // matrixP[] = dot; + matrixP[py * sizeMX + px] = dot; } int main(int argc, char *argv[]) @@ -31,19 +39,19 @@ int main(int argc, char *argv[]) // Then try large multiple-block square matrix like 64x64 up to 2048x2048. // Then try square, non-power-of-two like 15x15, 33x33, 67x67, 123x123, and 771x771 // Then try rectangles with powers of two and then non-power-of-two. - const unsigned sizeMX = 0; - const unsigned sizeXY = 0; - const unsigned sizeNY = 0; + const unsigned sizeMX = 16*16; + const unsigned sizeXY = 32*32; + const unsigned sizeNY = 64*64; // TODO 2: Allocate host 1D arrays for: // matrixM[sizeMX, sizeXY] // matrixN[sizeXY, sizeNY] // matrixP[sizeMX, sizeNY] // matrixPGold[sizeMX, sizeNY] - float* matrixM; - float* matrixN; - float* matrixP; - float* matrixPGold; + float* matrixM = new float[sizeMX * sizeXY]; + float* matrixN = new float[sizeXY * sizeNY]; + float* matrixP = new float[sizeMX * sizeNY]; + float* matrixPGold = new float[sizeMX * sizeNY]; // LOOK: Setup random number generator and fill host arrays and the scalar a. std::random_device rd; @@ -65,13 +73,29 @@ int main(int argc, char *argv[]) // for k -> 0 to sizeXY // dot = m[k, px] * n[py, k] // matrixPGold[py, px] = dot + for (int py = 0; py < sizeNY; py++) { + for (int px = 0; px < sizeMX; px++) { + float dot = 0; + for (int k = 0; k < sizeXY; k++) { + float m = matrixM[k * sizeMX + px]; + float n = matrixN[py * sizeXY + k]; + dot += m * n; + } + matrixPGold[py * sizeMX + px] = dot; + } + } // Device arrays float *d_matrixM, *d_matrixN, *d_matrixP; // TODO 4: Allocate memory on the device for d_matrixM, d_matrixN, d_matrixP. + CUDA(cudaMalloc((void**)&d_matrixM, sizeMX * sizeXY * sizeof(float))); + CUDA(cudaMalloc((void**)&d_matrixN, sizeXY * sizeNY * sizeof(float))); + CUDA(cudaMalloc((void**)&d_matrixP, sizeMX * sizeNY * sizeof(float))); // TODO 5: Copy array contents of M and N from the host (CPU) to the device (GPU) + CUDA(cudaMemcpy(d_matrixM, matrixM, sizeMX * sizeXY * sizeof(float), cudaMemcpyHostToDevice)); + CUDA(cudaMemcpy(d_matrixN, matrixN, sizeXY * sizeNY * sizeof(float), cudaMemcpyHostToDevice)); CUDA(cudaDeviceSynchronize()); @@ -86,13 +110,17 @@ int main(int argc, char *argv[]) // Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup // HINT: The shape of matrices has no impact on launch configuaration DIMS dims; - dims.dimBlock = dim3(1, 1, 1); - dims.dimGrid = dim3(1, 1, 1); + dims.dimBlock = dim3(16, 16, 1); + dims.dimGrid = dim3(divup(sizeMX, dims.dimBlock.x), + divup(sizeNY, dims.dimBlock.y), + 1); // TODO 7: Launch the matrix transpose kernel // matrixMultiplicationNaive<<<>>>(); + matrixMultiplicationNaive << > > (d_matrixP, d_matrixM, d_matrixN, sizeMX, sizeNY, sizeXY); // TODO 8: copy the answer back to the host (CPU) from the device (GPU) + CUDA(cudaMemcpy(matrixP, d_matrixP, sizeMX * sizeNY * sizeof(float), cudaMemcpyDeviceToHost)); // LOOK: Use compareReferenceAndResult to check the result compareReferenceAndResult(matrixPGold, matrixP, sizeMX * sizeNY, 1e-3); @@ -101,6 +129,9 @@ int main(int argc, char *argv[]) //////////////////////////////////////////////////////////// // TODO 9: free device memory using cudaFree + CUDA(cudaFree(d_matrixM)); + CUDA(cudaFree(d_matrixN)); + CUDA(cudaFree(d_matrixP)); // free host memory delete[] matrixM; diff --git a/cuda-introduction/source/saxpy.cu b/cuda-introduction/source/saxpy.cu index 5ed591f..4a379bd 100644 --- a/cuda-introduction/source/saxpy.cu +++ b/cuda-introduction/source/saxpy.cu @@ -9,20 +9,21 @@ __global__ void saxpy(float* const z, const float* const x, const float* const y, const float a, const unsigned size) { // TODO 9: Compute the global index for each thread. - unsigned idx = 0; + unsigned idx = blockIdx.x * blockDim.x + threadIdx.x; // TODO 10: Check if idx is out of bounds. If yes, return. - if (idx >= 0) + if (idx >= size) return; // TODO 11: Perform the SAXPY operation: z = a * x + y. + z[idx] = a * x[idx] + y[idx]; } int main(int argc, char *argv[]) { // TODO 1: Set the size. Start with something simple like 64. // TODO Optional: Try out these sizes: 256, 1024, 2048, 14, 103, 1025, 3127 - const unsigned size = 0; + const unsigned size = 64; // Host arrays. float* x = new float[size]; @@ -53,9 +54,14 @@ int main(int argc, char *argv[]) // TODO 2: Allocate memory on the device. Fill in the blanks for d_x, then do the same commands for d_y and d_z. // CUDA(cudaMalloc((void **)& pointer, size in bytes))); + CUDA(cudaMalloc((void**)&d_x, size * sizeof(float))); + CUDA(cudaMalloc((void**)&d_y, size * sizeof(float))); + CUDA(cudaMalloc((void**)&d_z, size * sizeof(float))); // TODO 3: Copy array contents of X and Y from the host (CPU) to the device (GPU). Follow what you did for 2, // CUDA(cudaMemcpy(dest ptr, source ptr, size in bytes, direction enum)); + CUDA(cudaMemcpy(d_x, x, size * sizeof(float), cudaMemcpyHostToDevice)); + CUDA(cudaMemcpy(d_y, y, size * sizeof(float), cudaMemcpyHostToDevice)); CUDA(cudaDeviceSynchronize()); @@ -69,16 +75,18 @@ int main(int argc, char *argv[]) // TODO 4: Setup threads and blocks. // Start threadPerBlock as 128, then try out differnt configurations: 32, 64, 256, 512, 1024 // Use divup to get the number of blocks to launch. - const unsigned threadsPerBlock = 0; + const unsigned threadsPerBlock = 128; // TODO 5: Implement the divup function in common.cpp const unsigned blocks = divup(size, threadsPerBlock); // TODO 6: Launch the GPU kernel with blocks and threadPerBlock as launch configuration // saxpy<<< >>> (....); + saxpy << > > (d_z, d_x, d_y, a, size); // TODO 7: Copy the answer back to the host (CPU) from the device (GPU). // Copy what you did in 3, except for d_z -> z. + CUDA(cudaMemcpy(z, d_z, size * sizeof(float), cudaMemcpyDeviceToHost)); // LOOK: Use postprocess to check the result compareReferenceAndResult(z_gold, z, size, 1e-6); @@ -87,6 +95,9 @@ int main(int argc, char *argv[]) // TODO 8: free device memory using cudaFree // CUDA(cudaFree(device pointer)); + CUDA(cudaFree(d_x)); + CUDA(cudaFree(d_y)); + CUDA(cudaFree(d_z)); // free host memory delete[] x; diff --git a/cuda-introduction/source/transpose.cu b/cuda-introduction/source/transpose.cu index 89f6f8f..ce265a3 100644 --- a/cuda-introduction/source/transpose.cu +++ b/cuda-introduction/source/transpose.cu @@ -19,16 +19,21 @@ __global__ void copyKernel(const float* const a, float* const b, const unsigned sizeX, const unsigned sizeY) { // TODO 6a: Compute the global index for each thread along x and y dimentions. - unsigned i = 0; - unsigned j = 0;; + unsigned i = blockIdx.x * blockDim.x + threadIdx.x; + unsigned j = blockIdx.y * blockDim.y + threadIdx.y; // TODO 6b: Check if i or j are out of bounds. If they are, return. + if (i >= sizeX || j >= sizeY) + { + return; + } // TODO 6c: Compute global 1D index from i and j - unsigned index = 0; + unsigned index = j * sizeX + i; // TODO 6d: Copy data from A to B. Note that in copy kernel source and destination indices are the same // b[] = a[]; + b[index] = a[index]; } // TODO 11: Implement the transpose kernel @@ -38,16 +43,21 @@ __global__ void copyKernel(const float* const a, float* const b, const unsigned __global__ void matrixTransposeNaive(const float* const a, float* const b, const unsigned sizeX, const unsigned sizeY) { // TODO 11a: Compute the global index for each thread along x and y dimentions. - unsigned i = 0; - unsigned j = 0; + unsigned i = blockIdx.x * blockDim.x + threadIdx.x; + unsigned j = blockIdx.y * blockDim.y + threadIdx.y; // TODO 11b: Check if i or j are out of bounds. If they are, return. + if (i >= sizeX || j >= sizeY) + { + return; + } // TODO 11c: Compute index_in as (i,j) (same as index in copy kernel) and index_out as (j,i) - unsigned index_in = 0; // Compute input index (i,j) from matrix A - unsigned index_out = 0; // Compute output index (j,i) in matrix B = transpose(A) + unsigned index_in = j * sizeX + i; // Compute input index (i,j) from matrix A + unsigned index_out = i * sizeY + j; // Compute output index (j,i) in matrix B = transpose(A) // TODO 11d: Copy data from A to B using transpose indices + b[index_out] = a[index_in]; } int main(int argc, char *argv[]) @@ -82,8 +92,11 @@ int main(int argc, char *argv[]) float *d_a, *d_b; // TODO 2: Allocate memory on the device for d_a and d_b. + CUDA(cudaMalloc((void**)&d_a, sizeX * sizeY * sizeof(float))); + CUDA(cudaMalloc((void**)&d_b, sizeX * sizeY * sizeof(float))); // TODO 3: Copy array contents of A from the host (CPU) to the device (GPU) + CUDA(cudaMemcpy(d_a, a, sizeX * sizeY * sizeof(float), cudaMemcpyHostToDevice)); CUDA(cudaDeviceSynchronize()); @@ -97,13 +110,16 @@ int main(int argc, char *argv[]) // TODO 4: Assign a 2D distribution of BS_X x BS_Y x 1 CUDA threads within // Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup DIMS dims; - dims.dimBlock = dim3(1, 1, 1); - dims.dimGrid = dim3(1, 1, 1); + dims.dimBlock = dim3(32, 32, 1); + dims.dimGrid = dim3(divup(sizeX, dims.dimBlock.x), + divup(sizeY, dims.dimBlock.y), + 1); // LOOK: Launch the copy kernel copyKernel<<>>(d_a, d_b, sizeX, sizeY); // TODO 5: copy the answer back to the host (CPU) from the device (GPU) + CUDA(cudaMemcpy(b, d_b, sizeY * sizeX * sizeof(float), cudaMemcpyDeviceToHost)); // LOOK: Use compareReferenceAndResult to check the result compareReferenceAndResult(a_gold, b, sizeX * sizeY); @@ -121,13 +137,17 @@ int main(int argc, char *argv[]) // TODO 8: Assign a 2D distribution of BS_X x BS_Y x 1 CUDA threads within // Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup DIMS dims; - dims.dimBlock = dim3(1, 1, 1); - dims.dimGrid = dim3(1, 1, 1); + dims.dimBlock = dim3(32, 32, 1); + dims.dimGrid = dim3(divup(sizeX, dims.dimBlock.x), + divup(sizeY, dims.dimBlock.y), + 1); // TODO 9: Launch the matrix transpose kernel // matrixTransposeNaive<<<>>>(......); + matrixTransposeNaive <<>> (d_a, d_b, sizeX, sizeY); // TODO 10: copy the answer back to the host (CPU) from the device (GPU) + CUDA(cudaMemcpy(b, d_b, sizeY * sizeX * sizeof(float), cudaMemcpyDeviceToHost)); // LOOK: Use compareReferenceAndResult to check the result compareReferenceAndResult(b_gold, b, sizeX * sizeY); @@ -136,6 +156,8 @@ int main(int argc, char *argv[]) //////////////////////////////////////////////////////////// // TODO 7: free device memory using cudaFree + CUDA(cudaFree(d_a)); + CUDA(cudaFree(d_b)); // free host memory delete[] a; diff --git a/images/CUDA_GL_Check.png b/images/CUDA_GL_Check.png new file mode 100644 index 0000000..2bb719c Binary files /dev/null and b/images/CUDA_GL_Check.png differ diff --git a/images/NVIDIA_Nsight_Compute.png b/images/NVIDIA_Nsight_Compute.png new file mode 100644 index 0000000..c425993 Binary files /dev/null and b/images/NVIDIA_Nsight_Compute.png differ diff --git a/images/NVIDIA_Nsight_Systems_Analysis_Summary.png b/images/NVIDIA_Nsight_Systems_Analysis_Summary.png new file mode 100644 index 0000000..5621015 Binary files /dev/null and b/images/NVIDIA_Nsight_Systems_Analysis_Summary.png differ diff --git a/images/NVIDIA_Nsight_Systems_Timeline.png b/images/NVIDIA_Nsight_Systems_Timeline.png new file mode 100644 index 0000000..1002284 Binary files /dev/null and b/images/NVIDIA_Nsight_Systems_Timeline.png differ diff --git a/images/Nsight_Debugging_Windows_VS.png b/images/Nsight_Debugging_Windows_VS.png new file mode 100644 index 0000000..9d64fb2 Binary files /dev/null and b/images/Nsight_Debugging_Windows_VS.png differ diff --git a/images/WebGL_Report.png b/images/WebGL_Report.png new file mode 100644 index 0000000..ffc3205 Binary files /dev/null and b/images/WebGL_Report.png differ diff --git a/images/WebGPU.png b/images/WebGPU.png new file mode 100644 index 0000000..ea36989 Binary files /dev/null and b/images/WebGPU.png differ