diff --git a/README.md b/README.md
index d2fa33d..84929d5 100644
--- a/README.md
+++ b/README.md
@@ -3,11 +3,35 @@ Project 0 Getting Started
 
 **University of Pennsylvania, CIS 5650: GPU Programming and Architecture, Project 0**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Aaron Jiang
+  * [LinkedIn](https://www.linkedin.com/in/aaronpjiang/), [personal website](https://aaron-jiang.com/)
+* Tested on: Windows 11, i5-13420H @ 1.99GHz 15.6GB, RTX 4050 13.8GB (Personal Computer)
 
-### (TODO: Your README)
+### GPU Setup Documentation
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+I confirmed that my personal machine is capable of running CUDA projects. The compute capability of my laptop's GPU is **8.9**.
+
+#### 2.1.2
+My machine successfully runs cuda-gl-check and produces this image output.
+![](images/Screenshot%202025-08-29%20162758.png)
+
+#### 2.1.3
+I was able to use the Nsight debugger to select which thread to inspect in debug mode.
+![](images/Screenshot%202025-08-29%20184157.png)
+
+#### 2.1.4
+I was also able to use Nsight Systems to produce a report on cuda-gl-check's runtime statistics.
+![](images/Screenshot%202025-08-29%20194907.png)
+
+#### 2.1.5
+I was not able to use Nsight Compute; I ran into the unknown-error issue described in the pinned message on Ed, linked [here](https://edstem.org/us/courses/81464/discussion/6880884).
+![](images/Screenshot%202025-08-29%20232639.png)
+
+#### 2.2
+My browser is capable of running both WebGL1 and WebGL2.
+![](images/Screenshot%202025-08-29%20232804.png)
+![](images/Screenshot%202025-08-29%20232927.png)
+
+#### 2.3
+It is also capable of running WebGPU.
+![](images/Screenshot%202025-08-29%20233005.png)
\ No newline at end of file
diff --git a/cuda-gl-check/src/main.cpp b/cuda-gl-check/src/main.cpp
index 886fd4c..113dd9f 100644
--- a/cuda-gl-check/src/main.cpp
+++ b/cuda-gl-check/src/main.cpp
@@ -11,7 +11,7 @@
  */
 int main(int argc, char* argv[]) {
     // TODO: Change this line to use your name!
-    m_yourName = "TODO: YOUR NAME HERE";
+    m_yourName = "Aaron Jiang";
 
     if (init(argc, argv)) {
         mainLoop();
diff --git a/cuda-introduction/source/common.cu b/cuda-introduction/source/common.cu
index dce8793..3446ce1 100644
--- a/cuda-introduction/source/common.cu
+++ b/cuda-introduction/source/common.cu
@@ -9,7 +9,7 @@ unsigned divup(unsigned size, unsigned div)
 {
     // TODO: implement a 1 line function to return the divup operation.
     // Note: You only need to use addition, subtraction, and division operations.
-    return 0;
+    return (size - 1) / div + 1;
 }
 
 void clearHostAndDeviceArray(float *res, float *dev_res, unsigned size, const int value)
diff --git a/cuda-introduction/source/matmul.cu b/cuda-introduction/source/matmul.cu
index 826e535..82c4c66 100644
--- a/cuda-introduction/source/matmul.cu
+++ b/cuda-introduction/source/matmul.cu
@@ -12,17 +12,24 @@ __global__ void matrixMultiplicationNaive(float* const matrixP, const float* con
 {
     // TODO 10a: Compute the P matrix global index for each thread along x and y dimentions.
     // Remember that each thread of the kernel computes the result of 1 unique element of P
-    unsigned px;
-    unsigned py;
+    unsigned px = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned py = blockIdx.y * blockDim.y + threadIdx.y;
 
     // TODO 10b: Check if px or py are out of bounds. If they are, return.
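+    // Edge blocks can include threads past the bounds of P (the grid is rounded up with divup), so exit early for those.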
+    if (px >= sizeMX || py >= sizeNY) {
+        return;
+    }
 
     // TODO 10c: Compute the dot product for the P element in each thread
     // This loop will be the same as the host loop
     float dot = 0.0;
+    for (unsigned k = 0; k < sizeXY; ++k) {
+        dot += matrixM[px * sizeXY + k] * matrixN[k * sizeNY + py];
+    }
+
 
     // TODO 10d: Copy dot to P matrix
-    // matrixP[] = dot;
+    matrixP[px * sizeNY + py] = dot;
 }
 
 int main(int argc, char *argv[])
@@ -31,19 +38,19 @@ int main(int argc, char *argv[])
     // Then try large multiple-block square matrix like 64x64 up to 2048x2048.
     // Then try square, non-power-of-two like 15x15, 33x33, 67x67, 123x123, and 771x771
     // Then try rectangles with powers of two and then non-power-of-two.
-    const unsigned sizeMX = 0;
-    const unsigned sizeXY = 0;
-    const unsigned sizeNY = 0;
+    const unsigned sizeMX = 16;
+    const unsigned sizeXY = 16;
+    const unsigned sizeNY = 16;
 
     // TODO 2: Allocate host 1D arrays for:
     //         matrixM[sizeMX, sizeXY]
     //         matrixN[sizeXY, sizeNY]
     //         matrixP[sizeMX, sizeNY]
     //         matrixPGold[sizeMX, sizeNY]
-    float* matrixM;
-    float* matrixN;
-    float* matrixP;
-    float* matrixPGold;
+    float* matrixM = new float[sizeMX * sizeXY];
+    float* matrixN = new float[sizeXY * sizeNY];
+    float* matrixP = new float[sizeMX * sizeNY];
+    float* matrixPGold = new float[sizeMX * sizeNY];
 
     // LOOK: Setup random number generator and fill host arrays and the scalar a.
     std::random_device rd;
@@ -64,14 +71,29 @@ int main(int argc, char *argv[])
     //   initialize dot product accumulator
     //   for k -> 0 to sizeXY
     //     dot = m[k, px] * n[py, k]
-    //   matrixPGold[py, px] = dot
+    //   matrixPGold[py, px] =
+    for (unsigned py = 0; py < sizeNY; ++py) {
+        for (unsigned px = 0; px < sizeMX; ++px) {
+            float dot_accum = 0.f;
+            for (unsigned k = 0; k < sizeXY; ++k) {
+                // row major
+                dot_accum += matrixM[px * sizeXY + k] * matrixN[k * sizeNY + py];
+            }
+            matrixPGold[px * sizeNY + py] = dot_accum;
+        }
+    }
 
     // Device arrays
     float *d_matrixM, *d_matrixN, *d_matrixP;
 
     // TODO 4: Allocate memory on the device for d_matrixM, d_matrixN, d_matrixP.
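+    // One allocation per matrix; byte counts follow the row-major shapes M[sizeMX x sizeXY], N[sizeXY x sizeNY], P[sizeMX x sizeNY].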
+    CUDA(cudaMalloc((void**)&d_matrixM, sizeMX * sizeXY * sizeof(float)));
+    CUDA(cudaMalloc((void**)&d_matrixN, sizeXY * sizeNY * sizeof(float)));
+    CUDA(cudaMalloc((void**)&d_matrixP, sizeMX * sizeNY * sizeof(float)));
 
     // TODO 5: Copy array contents of M and N from the host (CPU) to the device (GPU)
+    CUDA(cudaMemcpy(d_matrixM, matrixM, sizeMX * sizeXY * sizeof(float), cudaMemcpyHostToDevice));
+    CUDA(cudaMemcpy(d_matrixN, matrixN, sizeXY * sizeNY * sizeof(float), cudaMemcpyHostToDevice));
 
     CUDA(cudaDeviceSynchronize());
 
@@ -85,14 +107,18 @@ int main(int argc, char *argv[])
     // TODO 6: Assign a 2D distribution of BS_X x BS_Y x 1 CUDA threads within
     //         Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup
     //         HINT: The shape of matrices has no impact on launch configuaration
+    unsigned BS_X = 16;
+    unsigned BS_Y = 16;
+
     DIMS dims;
-    dims.dimBlock = dim3(1, 1, 1);
-    dims.dimGrid = dim3(1, 1, 1);
+    dims.dimBlock = dim3(BS_X, BS_Y, 1);
+    dims.dimGrid = dim3(divup(sizeMX, BS_X), divup(sizeNY, BS_Y), 1);
 
     // TODO 7: Launch the matrix transpose kernel
-    // matrixMultiplicationNaive<<<>>>();
+    matrixMultiplicationNaive<<<dims.dimGrid, dims.dimBlock>>>(d_matrixP, d_matrixM, d_matrixN, sizeMX, sizeNY, sizeXY);
 
     // TODO 8: copy the answer back to the host (CPU) from the device (GPU)
+    CUDA(cudaMemcpy(matrixP, d_matrixP, sizeMX * sizeNY * sizeof(float), cudaMemcpyDeviceToHost));
 
     // LOOK: Use compareReferenceAndResult to check the result
     compareReferenceAndResult(matrixPGold, matrixP, sizeMX * sizeNY, 1e-3);
@@ -101,6 +127,9 @@ int main(int argc, char *argv[])
     ////////////////////////////////////////////////////////////
 
     // TODO 9: free device memory using cudaFree
+    CUDA(cudaFree(d_matrixM));
+    CUDA(cudaFree(d_matrixN));
+    CUDA(cudaFree(d_matrixP));
 
     // free host memory
     delete[] matrixM;
diff --git a/cuda-introduction/source/saxpy.cu b/cuda-introduction/source/saxpy.cu
index 5ed591f..577a21b 100644
--- a/cuda-introduction/source/saxpy.cu
+++ b/cuda-introduction/source/saxpy.cu
@@ -9,20 +9,21 @@ __global__ void saxpy(float* const z, const float* const x, const float* const y, const float a, const unsigned size)
 {
     // TODO 9: Compute the global index for each thread.
-    unsigned idx = 0;
+    unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
 
     // TODO 10: Check if idx is out of bounds. If yes, return.
-    if (idx >= 0)
+    if (idx >= size)
         return;
 
     // TODO 11: Perform the SAXPY operation: z = a * x + y.
+    z[idx] = a * x[idx] + y[idx];
 }
 
 int main(int argc, char *argv[])
 {
     // TODO 1: Set the size. Start with something simple like 64.
     // TODO Optional: Try out these sizes: 256, 1024, 2048, 14, 103, 1025, 3127
-    const unsigned size = 0;
+    const unsigned size = 64;
 
     // Host arrays.
     float* x = new float[size];
@@ -53,10 +54,17 @@ int main(int argc, char *argv[])
 
     // TODO 2: Allocate memory on the device. Fill in the blanks for d_x, then do the same commands for d_y and d_z.
     // CUDA(cudaMalloc((void **)& pointer, size in bytes)));
+
+    CUDA(cudaMalloc((void**)&d_x, size * sizeof(float)));
+    CUDA(cudaMalloc((void**)&d_y, size * sizeof(float)));
+    CUDA(cudaMalloc((void**)&d_z, size * sizeof(float)));
 
     // TODO 3: Copy array contents of X and Y from the host (CPU) to the device (GPU). Follow what you did for 2,
     // CUDA(cudaMemcpy(dest ptr, source ptr, size in bytes, direction enum));
+    CUDA(cudaMemcpy(d_x, x, size * sizeof(float), cudaMemcpyHostToDevice));
+    CUDA(cudaMemcpy(d_y, y, size * sizeof(float), cudaMemcpyHostToDevice));
+
 
     CUDA(cudaDeviceSynchronize());
 
     ////////////////////////////////////////////////////////////
@@ -69,16 +77,18 @@ int main(int argc, char *argv[])
     // TODO 4: Setup threads and blocks.
     //         Start threadPerBlock as 128, then try out differnt configurations: 32, 64, 256, 512, 1024
     //         Use divup to get the number of blocks to launch.
-    const unsigned threadsPerBlock = 0;
+    const unsigned threadsPerBlock = 128;
 
     // TODO 5: Implement the divup function in common.cpp
     const unsigned blocks = divup(size, threadsPerBlock);
 
     // TODO 6: Launch the GPU kernel with blocks and threadPerBlock as launch configuration
     // saxpy<<< >>> (....);
+    saxpy<<<blocks, threadsPerBlock>>>(d_z, d_x, d_y, a, size);
 
     // TODO 7: Copy the answer back to the host (CPU) from the device (GPU).
     // Copy what you did in 3, except for d_z -> z.
+    CUDA(cudaMemcpy(z, d_z, size * sizeof(float), cudaMemcpyDeviceToHost));
 
     // LOOK: Use postprocess to check the result
     compareReferenceAndResult(z_gold, z, size, 1e-6);
@@ -87,6 +97,9 @@ int main(int argc, char *argv[])
 
     // TODO 8: free device memory using cudaFree
     // CUDA(cudaFree(device pointer));
+    CUDA(cudaFree(d_x));
+    CUDA(cudaFree(d_y));
+    CUDA(cudaFree(d_z));
 
     // free host memory
     delete[] x;
diff --git a/cuda-introduction/source/transpose.cu b/cuda-introduction/source/transpose.cu
index 89f6f8f..4503d96 100644
--- a/cuda-introduction/source/transpose.cu
+++ b/cuda-introduction/source/transpose.cu
@@ -19,16 +19,20 @@ __global__ void copyKernel(const float* const a, float* const b, const unsigned sizeX, const unsigned sizeY)
 {
     // TODO 6a: Compute the global index for each thread along x and y dimentions.
-    unsigned i = 0;
-    unsigned j = 0;;
+    unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned j = blockIdx.y * blockDim.y + threadIdx.y;
 
     // TODO 6b: Check if i or j are out of bounds. If they are, return.
+    if (i >= sizeX || j >= sizeY) {
+        return;
+    }
 
     // TODO 6c: Compute global 1D index from i and j
-    unsigned index = 0;
+    unsigned index = j * sizeX + i;
 
     // TODO 6d: Copy data from A to B. Note that in copy kernel source and destination indices are the same
     // b[] = a[];
+    b[index] = a[index];
 }
 
 // TODO 11: Implement the transpose kernel
@@ -38,16 +42,20 @@ __global__ void copyKernel(const float* const a, float* const b, const unsigned
 __global__ void matrixTransposeNaive(const float* const a, float* const b, const unsigned sizeX, const unsigned sizeY)
 {
     // TODO 11a: Compute the global index for each thread along x and y dimentions.
-    unsigned i = 0;
-    unsigned j = 0;
+    unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned j = blockIdx.y * blockDim.y + threadIdx.y;
 
     // TODO 11b: Check if i or j are out of bounds. If they are, return.
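+    // Same guard as in copyKernel: threads that fall outside the sizeX x sizeY matrix do no work.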
+    if (i >= sizeX || j >= sizeY) {
+        return;
+    }
 
     // TODO 11c: Compute index_in as (i,j) (same as index in copy kernel) and index_out as (j,i)
-    unsigned index_in = 0;  // Compute input index (i,j) from matrix A
-    unsigned index_out = 0; // Compute output index (j,i) in matrix B = transpose(A)
+    unsigned index_in = j * sizeX + i;  // Compute input index (i,j) from matrix A
+    unsigned index_out = i * sizeY + j; // Compute output index (j,i) in matrix B = transpose(A)
 
     // TODO 11d: Copy data from A to B using transpose indices
+    b[index_out] = a[index_in];
 }
 
 int main(int argc, char *argv[])
@@ -82,8 +90,11 @@ int main(int argc, char *argv[])
     float *d_a, *d_b;
 
     // TODO 2: Allocate memory on the device for d_a and d_b.
+    CUDA(cudaMalloc((void**)&d_a, sizeX * sizeY * sizeof(float)));
+    CUDA(cudaMalloc((void**)&d_b, sizeX * sizeY * sizeof(float)));
 
     // TODO 3: Copy array contents of A from the host (CPU) to the device (GPU)
+    CUDA(cudaMemcpy(d_a, a, sizeX * sizeY * sizeof(float), cudaMemcpyHostToDevice));
 
     CUDA(cudaDeviceSynchronize());
 
@@ -96,14 +107,18 @@ int main(int argc, char *argv[])
 
     // TODO 4: Assign a 2D distribution of BS_X x BS_Y x 1 CUDA threads within
     //         Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup
+    unsigned BS_X = 16;
+    unsigned BS_Y = 16;
+
     DIMS dims;
-    dims.dimBlock = dim3(1, 1, 1);
-    dims.dimGrid = dim3(1, 1, 1);
+    dims.dimBlock = dim3(BS_X, BS_Y, 1);
+    dims.dimGrid = dim3(divup(sizeX, BS_X), divup(sizeY, BS_Y), 1);
 
     // LOOK: Launch the copy kernel
     copyKernel<<<dims.dimGrid, dims.dimBlock>>>(d_a, d_b, sizeX, sizeY);
 
     // TODO 5: copy the answer back to the host (CPU) from the device (GPU)
+    CUDA(cudaMemcpy(b, d_b, sizeX * sizeY * sizeof(float), cudaMemcpyDeviceToHost));
 
     // LOOK: Use compareReferenceAndResult to check the result
     compareReferenceAndResult(a_gold, b, sizeX * sizeY);
@@ -120,14 +135,18 @@ int main(int argc, char *argv[])
 
     // TODO 8: Assign a 2D distribution of BS_X x BS_Y x 1 CUDA threads within
     //         Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup
+    unsigned BS_X = 16;
+    unsigned BS_Y = 16;
+
     DIMS dims;
-    dims.dimBlock = dim3(1, 1, 1);
-    dims.dimGrid = dim3(1, 1, 1);
+    dims.dimBlock = dim3(BS_X, BS_Y, 1);
+    dims.dimGrid = dim3(divup(sizeX, BS_X), divup(sizeY, BS_Y), 1);
 
     // TODO 9: Launch the matrix transpose kernel
-    // matrixTransposeNaive<<<>>>(......);
+    matrixTransposeNaive<<<dims.dimGrid, dims.dimBlock>>>(d_a, d_b, sizeX, sizeY);
 
     // TODO 10: copy the answer back to the host (CPU) from the device (GPU)
+    CUDA(cudaMemcpy(b, d_b, sizeX * sizeY * sizeof(float), cudaMemcpyDeviceToHost));
 
     // LOOK: Use compareReferenceAndResult to check the result
     compareReferenceAndResult(b_gold, b, sizeX * sizeY);
@@ -136,6 +155,8 @@ int main(int argc, char *argv[])
 
     // TODO 7: free device memory using cudaFree
+    CUDA(cudaFree(d_a));
+    CUDA(cudaFree(d_b));
 
     // free host memory
     delete[] a;
diff --git a/images/Screenshot 2025-08-29 162758.png b/images/Screenshot 2025-08-29 162758.png
new file mode 100644
index 0000000..31333b2
Binary files /dev/null and b/images/Screenshot 2025-08-29 162758.png differ
diff --git a/images/Screenshot 2025-08-29 184157.png b/images/Screenshot 2025-08-29 184157.png
new file mode 100644
index 0000000..139ebbe
Binary files /dev/null and b/images/Screenshot 2025-08-29 184157.png differ
diff --git a/images/Screenshot 2025-08-29 194907.png b/images/Screenshot 2025-08-29 194907.png
new file mode 100644
index 0000000..462dada
Binary files /dev/null and b/images/Screenshot 2025-08-29 194907.png differ
diff --git a/images/Screenshot 2025-08-29 232639.png b/images/Screenshot 2025-08-29 232639.png
new file mode 100644
index 0000000..6ee0a3a
Binary files /dev/null and b/images/Screenshot 2025-08-29 232639.png differ
diff --git a/images/Screenshot 2025-08-29 232804.png b/images/Screenshot 2025-08-29 232804.png
new file mode 100644
index 0000000..05d4505
Binary files /dev/null and b/images/Screenshot 2025-08-29 232804.png differ
diff --git a/images/Screenshot 2025-08-29 232927.png b/images/Screenshot 2025-08-29 232927.png
new file mode 100644
index 0000000..441032c
Binary files /dev/null and b/images/Screenshot 2025-08-29 232927.png differ
diff --git a/images/Screenshot 2025-08-29 233005.png b/images/Screenshot 2025-08-29 233005.png
new file mode 100644
index 0000000..9d7c25d
Binary files /dev/null and b/images/Screenshot 2025-08-29 233005.png differ