diff --git a/README.md b/README.md
index d2fa33d..84929d5 100644
--- a/README.md
+++ b/README.md
@@ -3,11 +3,35 @@ Project 0 Getting Started
 
 **University of Pennsylvania, CIS 5650: GPU Programming and Architecture, Project 0**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Aaron Jiang
+  * [LinkedIn](https://www.linkedin.com/in/aaronpjiang/), [personal website](https://aaron-jiang.com/)
+* Tested on: Windows 11, i5-13420H @ 1.99GHz 15.6GB, RTX 4050 13.8GB (Personal Computer)
 
-### (TODO: Your README)
+### GPU Setup Documentation
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+I confirmed that my personal machine is capable of running CUDA projects. The compute capability of my laptop's GPU is **8.9**.
+
+#### 2.1.2
+My machine successfully runs cuda-gl-check and produces this image output.
+![](images/Screenshot%202025-08-29%20162758.png)
+
+#### 2.1.3
+I was able to use the Nsight debugger to select which thread to inspect in debug mode.
+![](images/Screenshot%202025-08-29%20184157.png)
+
+#### 2.1.4
+I was also able to use Nsight Systems to produce a report on cuda-gl-check's runtime statistics.
+![](images/Screenshot%202025-08-29%20194907.png)
+
+#### 2.1.5
+I was not able to use Nsight Compute; I ran into the unknown-error issue described in the pinned message on Ed, linked [here](https://edstem.org/us/courses/81464/discussion/6880884).
+![](images/Screenshot%202025-08-29%20232639.png)
+
+#### 2.2
+My browser is capable of running both WebGL1 and WebGL2.
+![](images/Screenshot%202025-08-29%20232804.png)
+![](images/Screenshot%202025-08-29%20232927.png)
+
+#### 2.3
+It is also capable of running WebGPU.
+![](images/Screenshot%202025-08-29%20233005.png)
\ No newline at end of file
diff --git a/cuda-gl-check/src/main.cpp b/cuda-gl-check/src/main.cpp
index 886fd4c..113dd9f 100644
--- a/cuda-gl-check/src/main.cpp
+++ b/cuda-gl-check/src/main.cpp
@@ -11,7 +11,7 @@
  */
 int main(int argc, char* argv[]) {
     // TODO: Change this line to use your name!
-    m_yourName = "TODO: YOUR NAME HERE";
+    m_yourName = "Aaron Jiang";
 
     if (init(argc, argv)) {
         mainLoop();
diff --git a/cuda-introduction/source/common.cu b/cuda-introduction/source/common.cu
index dce8793..3446ce1 100644
--- a/cuda-introduction/source/common.cu
+++ b/cuda-introduction/source/common.cu
@@ -9,7 +9,7 @@ unsigned divup(unsigned size, unsigned div)
 {
     // TODO: implement a 1 line function to return the divup operation.
     // Note: You only need to use addition, subtraction, and division operations.
-    return 0;
+    return (size - 1) / div + 1;
 }
 
 void clearHostAndDeviceArray(float *res, float *dev_res, unsigned size, const int value)
diff --git a/cuda-introduction/source/matmul.cu b/cuda-introduction/source/matmul.cu
index 826e535..82c4c66 100644
--- a/cuda-introduction/source/matmul.cu
+++ b/cuda-introduction/source/matmul.cu
@@ -12,17 +12,24 @@ __global__ void matrixMultiplicationNaive(float* const matrixP, const float* con
 {
     // TODO 10a: Compute the P matrix global index for each thread along x and y dimentions.
     // Remember that each thread of the kernel computes the result of 1 unique element of P
-    unsigned px;
-    unsigned py;
+    unsigned px = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned py = blockIdx.y * blockDim.y + threadIdx.y;
 
     // TODO 10b: Check if px or py are out of bounds. If they are, return.
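+    // Edge blocks can include threads past the bounds of P (the grid is rounded up with divup), so exit early for those.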
+    if (px >= sizeMX || py >= sizeNY) {
+        return;
+    }
 
     // TODO 10c: Compute the dot product for the P element in each thread
     // This loop will be the same as the host loop
     float dot = 0.0;
+    for (unsigned k = 0; k < sizeXY; ++k) {
+        dot += matrixM[px * sizeXY + k] * matrixN[k * sizeNY + py];
+    }
+
 
     // TODO 10d: Copy dot to P matrix
-    // matrixP[] = dot;
+    matrixP[px * sizeNY + py] = dot;
 }
 
 int main(int argc, char *argv[])
@@ -31,19 +38,19 @@ int main(int argc, char *argv[])
     // Then try large multiple-block square matrix like 64x64 up to 2048x2048.
     // Then try square, non-power-of-two like 15x15, 33x33, 67x67, 123x123, and 771x771
     // Then try rectangles with powers of two and then non-power-of-two.
-    const unsigned sizeMX = 0;
-    const unsigned sizeXY = 0;
-    const unsigned sizeNY = 0;
+    const unsigned sizeMX = 16;
+    const unsigned sizeXY = 16;
+    const unsigned sizeNY = 16;
 
     // TODO 2: Allocate host 1D arrays for:
     //         matrixM[sizeMX, sizeXY]
     //         matrixN[sizeXY, sizeNY]
     //         matrixP[sizeMX, sizeNY]
     //         matrixPGold[sizeMX, sizeNY]
-    float* matrixM;
-    float* matrixN;
-    float* matrixP;
-    float* matrixPGold;
+    float* matrixM = new float[sizeMX * sizeXY];
+    float* matrixN = new float[sizeXY * sizeNY];
+    float* matrixP = new float[sizeMX * sizeNY];
+    float* matrixPGold = new float[sizeMX * sizeNY];
 
     // LOOK: Setup random number generator and fill host arrays and the scalar a.
     std::random_device rd;
@@ -64,14 +71,29 @@ int main(int argc, char *argv[])
     //   initialize dot product accumulator
     //   for k -> 0 to sizeXY
     //     dot = m[k, px] * n[py, k]
-    //   matrixPGold[py, px] = dot
+    //   matrixPGold[py, px] =
+    for (unsigned py = 0; py < sizeNY; ++py) {
+        for (unsigned px = 0; px < sizeMX; ++px) {
+            float dot_accum = 0.f;
+            for (unsigned k = 0; k < sizeXY; ++k) {
+                // row major
+                dot_accum += matrixM[px * sizeXY + k] * matrixN[k * sizeNY + py];
+            }
+            matrixPGold[px * sizeNY + py] = dot_accum;
+        }
+    }
 
     // Device arrays
     float *d_matrixM, *d_matrixN, *d_matrixP;
 
     // TODO 4: Allocate memory on the device for d_matrixM, d_matrixN, d_matrixP.
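+    // One allocation per matrix; byte counts follow the row-major shapes M[sizeMX x sizeXY], N[sizeXY x sizeNY], P[sizeMX x sizeNY].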
+    CUDA(cudaMalloc((void**)&d_matrixM, sizeMX * sizeXY * sizeof(float)));
+    CUDA(cudaMalloc((void**)&d_matrixN, sizeXY * sizeNY * sizeof(float)));
+    CUDA(cudaMalloc((void**)&d_matrixP, sizeMX * sizeNY * sizeof(float)));
 
     // TODO 5: Copy array contents of M and N from the host (CPU) to the device (GPU)
+    CUDA(cudaMemcpy(d_matrixM, matrixM, sizeMX * sizeXY * sizeof(float), cudaMemcpyHostToDevice));
+    CUDA(cudaMemcpy(d_matrixN, matrixN, sizeXY * sizeNY * sizeof(float), cudaMemcpyHostToDevice));
 
     CUDA(cudaDeviceSynchronize());
 
@@ -85,14 +107,18 @@ int main(int argc, char *argv[])
     // TODO 6: Assign a 2D distribution of BS_X x BS_Y x 1 CUDA threads within
     //         Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup
     //         HINT: The shape of matrices has no impact on launch configuaration
+    unsigned BS_X = 16;
+    unsigned BS_Y = 16;
+
     DIMS dims;
-    dims.dimBlock = dim3(1, 1, 1);
-    dims.dimGrid = dim3(1, 1, 1);
+    dims.dimBlock = dim3(BS_X, BS_Y, 1);
+    dims.dimGrid = dim3(divup(sizeMX, BS_X), divup(sizeNY, BS_Y), 1);
 
     // TODO 7: Launch the matrix transpose kernel
-    // matrixMultiplicationNaive<<<>>>();
+    matrixMultiplicationNaive<<<dims.dimGrid, dims.dimBlock>>>(d_matrixP, d_matrixM, d_matrixN, sizeMX, sizeNY, sizeXY);
 
     // TODO 8: copy the answer back to the host (CPU) from the device (GPU)
+    CUDA(cudaMemcpy(matrixP, d_matrixP, sizeMX * sizeNY * sizeof(float), cudaMemcpyDeviceToHost));
 
     // LOOK: Use compareReferenceAndResult to check the result
     compareReferenceAndResult(matrixPGold, matrixP, sizeMX * sizeNY, 1e-3);
@@ -101,6 +127,9 @@ int main(int argc, char *argv[])
     ////////////////////////////////////////////////////////////
 
     // TODO 9: free device memory using cudaFree
+    CUDA(cudaFree(d_matrixM));
+    CUDA(cudaFree(d_matrixN));
+    CUDA(cudaFree(d_matrixP));
 
     // free host memory
     delete[] matrixM;
diff --git a/cuda-introduction/source/saxpy.cu b/cuda-introduction/source/saxpy.cu
index 5ed591f..577a21b 100644
--- a/cuda-introduction/source/saxpy.cu
+++ b/cuda-introduction/source/saxpy.cu
@@ -9,20 +9,21 @@ __global__ void saxpy(float* const z, const float* const x, const float* const y, const float a, const unsigned size)
 {
     // TODO 9: Compute the global index for each thread.
-    unsigned idx = 0;
+    unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
 
     // TODO 10: Check if idx is out of bounds. If yes, return.
-    if (idx >= 0)
+    if (idx >= size)
         return;
 
     // TODO 11: Perform the SAXPY operation: z = a * x + y.
+    z[idx] = a * x[idx] + y[idx];
 }
 
 int main(int argc, char *argv[])
 {
     // TODO 1: Set the size. Start with something simple like 64.
     // TODO Optional: Try out these sizes: 256, 1024, 2048, 14, 103, 1025, 3127
-    const unsigned size = 0;
+    const unsigned size = 64;
 
     // Host arrays.
     float* x = new float[size];
@@ -53,10 +54,17 @@ int main(int argc, char *argv[])
 
     // TODO 2: Allocate memory on the device. Fill in the blanks for d_x, then do the same commands for d_y and d_z.
     // CUDA(cudaMalloc((void **)& pointer, size in bytes)));
+
+    CUDA(cudaMalloc((void**)&d_x, size * sizeof(float)));
+    CUDA(cudaMalloc((void**)&d_y, size * sizeof(float)));
+    CUDA(cudaMalloc((void**)&d_z, size * sizeof(float)));
 
     // TODO 3: Copy array contents of X and Y from the host (CPU) to the device (GPU). Follow what you did for 2,
     // CUDA(cudaMemcpy(dest ptr, source ptr, size in bytes, direction enum));
+    CUDA(cudaMemcpy(d_x, x, size * sizeof(float), cudaMemcpyHostToDevice));
+    CUDA(cudaMemcpy(d_y, y, size * sizeof(float), cudaMemcpyHostToDevice));
+
 
     CUDA(cudaDeviceSynchronize());
 
     ////////////////////////////////////////////////////////////
@@ -69,16 +77,18 @@ int main(int argc, char *argv[])
     // TODO 4: Setup threads and blocks.
     //         Start threadPerBlock as 128, then try out differnt configurations: 32, 64, 256, 512, 1024
     //         Use divup to get the number of blocks to launch.
-    const unsigned threadsPerBlock = 0;
+    const unsigned threadsPerBlock = 128;
 
     // TODO 5: Implement the divup function in common.cpp
     const unsigned blocks = divup(size, threadsPerBlock);
 
     // TODO 6: Launch the GPU kernel with blocks and threadPerBlock as launch configuration
     // saxpy<<< >>> (....);
+    saxpy<<<blocks, threadsPerBlock>>>(d_z, d_x, d_y, a, size);
 
     // TODO 7: Copy the answer back to the host (CPU) from the device (GPU).
     // Copy what you did in 3, except for d_z -> z.
+    CUDA(cudaMemcpy(z, d_z, size * sizeof(float), cudaMemcpyDeviceToHost));
 
     // LOOK: Use postprocess to check the result
     compareReferenceAndResult(z_gold, z, size, 1e-6);
@@ -87,6 +97,9 @@ int main(int argc, char *argv[])
 
     // TODO 8: free device memory using cudaFree
     // CUDA(cudaFree(device pointer));
+    CUDA(cudaFree(d_x));
+    CUDA(cudaFree(d_y));
+    CUDA(cudaFree(d_z));
 
     // free host memory
     delete[] x;
diff --git a/cuda-introduction/source/transpose.cu b/cuda-introduction/source/transpose.cu
index 89f6f8f..4503d96 100644
--- a/cuda-introduction/source/transpose.cu
+++ b/cuda-introduction/source/transpose.cu
@@ -19,16 +19,20 @@ __global__ void copyKernel(const float* const a, float* const b, const unsigned sizeX, const unsigned sizeY)
 {
     // TODO 6a: Compute the global index for each thread along x and y dimentions.
-    unsigned i = 0;
-    unsigned j = 0;;
+    unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned j = blockIdx.y * blockDim.y + threadIdx.y;
 
     // TODO 6b: Check if i or j are out of bounds. If they are, return.
+    if (i >= sizeX || j >= sizeY) {
+        return;
+    }
 
     // TODO 6c: Compute global 1D index from i and j
-    unsigned index = 0;
+    unsigned index = j * sizeX + i;
 
     // TODO 6d: Copy data from A to B. Note that in copy kernel source and destination indices are the same
     // b[] = a[];
+    b[index] = a[index];
 }
 
 // TODO 11: Implement the transpose kernel
@@ -38,16 +42,20 @@ __global__ void copyKernel(const float* const a, float* const b, const unsigned
 __global__ void matrixTransposeNaive(const float* const a, float* const b, const unsigned sizeX, const unsigned sizeY)
 {
     // TODO 11a: Compute the global index for each thread along x and y dimentions.
-    unsigned i = 0;
-    unsigned j = 0;
+    unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned j = blockIdx.y * blockDim.y + threadIdx.y;
 
     // TODO 11b: Check if i or j are out of bounds. If they are, return.
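+    // Same guard as in copyKernel: threads that fall outside the sizeX x sizeY matrix do no work.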
+    if (i >= sizeX || j >= sizeY) {
+        return;
+    }
 
     // TODO 11c: Compute index_in as (i,j) (same as index in copy kernel) and index_out as (j,i)
-    unsigned index_in = 0;  // Compute input index (i,j) from matrix A
-    unsigned index_out = 0; // Compute output index (j,i) in matrix B = transpose(A)
+    unsigned index_in = j * sizeX + i;  // Compute input index (i,j) from matrix A
+    unsigned index_out = i * sizeY + j; // Compute output index (j,i) in matrix B = transpose(A)
 
     // TODO 11d: Copy data from A to B using transpose indices
+    b[index_out] = a[index_in];
 }
 
 int main(int argc, char *argv[])
@@ -82,8 +90,11 @@ int main(int argc, char *argv[])
     float *d_a, *d_b;
 
     // TODO 2: Allocate memory on the device for d_a and d_b.
+    CUDA(cudaMalloc((void**)&d_a, sizeX * sizeY * sizeof(float)));
+    CUDA(cudaMalloc((void**)&d_b, sizeX * sizeY * sizeof(float)));
 
     // TODO 3: Copy array contents of A from the host (CPU) to the device (GPU)
+    CUDA(cudaMemcpy(d_a, a, sizeX * sizeY * sizeof(float), cudaMemcpyHostToDevice));
 
     CUDA(cudaDeviceSynchronize());
 
@@ -96,14 +107,18 @@ int main(int argc, char *argv[])
 
     // TODO 4: Assign a 2D distribution of BS_X x BS_Y x 1 CUDA threads within
     //         Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup
+    unsigned BS_X = 16;
+    unsigned BS_Y = 16;
+
     DIMS dims;
-    dims.dimBlock = dim3(1, 1, 1);
-    dims.dimGrid = dim3(1, 1, 1);
+    dims.dimBlock = dim3(BS_X, BS_Y, 1);
+    dims.dimGrid = dim3(divup(sizeX, BS_X), divup(sizeY, BS_Y), 1);
 
     // LOOK: Launch the copy kernel
     copyKernel<<<dims.dimGrid, dims.dimBlock>>>(d_a, d_b, sizeX, sizeY);
 
     // TODO 5: copy the answer back to the host (CPU) from the device (GPU)
+    CUDA(cudaMemcpy(b, d_b, sizeX * sizeY * sizeof(float), cudaMemcpyDeviceToHost));
 
     // LOOK: Use compareReferenceAndResult to check the result
     compareReferenceAndResult(a_gold, b, sizeX * sizeY);
@@ -120,14 +135,18 @@ int main(int argc, char *argv[])
 
     // TODO 8: Assign a 2D distribution of BS_X x BS_Y x 1 CUDA threads within
     //         Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup
+    unsigned BS_X = 16;
+    unsigned BS_Y = 16;
+
     DIMS dims;
-    dims.dimBlock = dim3(1, 1, 1);
-    dims.dimGrid = dim3(1, 1, 1);
+    dims.dimBlock = dim3(BS_X, BS_Y, 1);
+    dims.dimGrid = dim3(divup(sizeX, BS_X), divup(sizeY, BS_Y), 1);
 
     // TODO 9: Launch the matrix transpose kernel
-    // matrixTransposeNaive<<<>>>(......);
+    matrixTransposeNaive<<<dims.dimGrid, dims.dimBlock>>>(d_a, d_b, sizeX, sizeY);
 
     // TODO 10: copy the answer back to the host (CPU) from the device (GPU)
+    CUDA(cudaMemcpy(b, d_b, sizeX * sizeY * sizeof(float), cudaMemcpyDeviceToHost));
 
     // LOOK: Use compareReferenceAndResult to check the result
     compareReferenceAndResult(b_gold, b, sizeX * sizeY);
@@ -136,6 +155,8 @@ int main(int argc, char *argv[])
 
     // TODO 7: free device memory using cudaFree
+    CUDA(cudaFree(d_a));
+    CUDA(cudaFree(d_b));
 
     // free host memory
     delete[] a;
diff --git a/images/Screenshot 2025-08-29 162758.png b/images/Screenshot 2025-08-29 162758.png
new file mode 100644
index 0000000..31333b2
Binary files /dev/null and b/images/Screenshot 2025-08-29 162758.png differ
diff --git a/images/Screenshot 2025-08-29 184157.png b/images/Screenshot 2025-08-29 184157.png
new file mode 100644
index 0000000..139ebbe
Binary files /dev/null and b/images/Screenshot 2025-08-29 184157.png differ
diff --git a/images/Screenshot 2025-08-29 194907.png b/images/Screenshot 2025-08-29 194907.png
new file mode 100644
index 0000000..462dada
Binary files /dev/null and b/images/Screenshot 2025-08-29 194907.png differ
diff --git a/images/Screenshot 2025-08-29 232639.png b/images/Screenshot 2025-08-29 232639.png
new file mode 100644
index 0000000..6ee0a3a
Binary files /dev/null and b/images/Screenshot 2025-08-29 232639.png differ
diff --git a/images/Screenshot 2025-08-29 232804.png b/images/Screenshot 2025-08-29 232804.png
new file mode 100644
index 0000000..05d4505
Binary files /dev/null and b/images/Screenshot 2025-08-29 232804.png differ
diff --git a/images/Screenshot 2025-08-29 232927.png b/images/Screenshot 2025-08-29 232927.png
new file mode 100644
index 0000000..441032c
Binary files /dev/null and b/images/Screenshot 2025-08-29 232927.png differ
diff --git a/images/Screenshot 2025-08-29 233005.png b/images/Screenshot 2025-08-29 233005.png
new file mode 100644
index 0000000..9d7c25d
Binary files /dev/null and b/images/Screenshot 2025-08-29 233005.png differ