diff --git a/README.md b/README.md
index d2fa33d..adaf2e7 100644
--- a/README.md
+++ b/README.md
@@ -3,11 +3,27 @@ Project 0 Getting Started
 
 **University of Pennsylvania, CIS 5650: GPU Programming and Architecture, Project 0**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Jefferson Koumba Moussadji Lu
+  * [LinkedIn](https://www.linkedin.com/in/-jeff-koumba-0b356721b/), [personal website](), [twitter](), etc.
+* Tested on: Personal Laptop, Windows 11 Home, Intel(R) Core(TM) i9-14900HX @ 2.22GHz @ 24 Cores @ 32GB RAM, Nvidia GeForce RTX 4090 @ 16 GB @  SM 8.9 
 
-### (TODO: Your README)
+**CUDA GL Check**
+![](images/CUDA_GL_Check.png)
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+**NVIDIA Nsight Debugging**
+![](images/Nsight_Debugging_Windows_VS.png)
+
+**NVIDIA Nsight System Analysis**
+![](images/NVIDIA_Nsight_Systems_Analysis_Summary.png)
+
+**NVIDIA Nsight System Timeline**
+![](images/NVIDIA_Nsight_Systems_Timeline.png)
+
+**NVIDIA Nsight Compute**
+![](images/NVIDIA_Nsight_Compute.png)
+
+**WebGL**
+![](images/WebGL_Report.png)
+
+**WebGPU**
+![](images/WebGPU.png)
\ No newline at end of file
diff --git a/cuda-gl-check/src/main.cpp b/cuda-gl-check/src/main.cpp
index 886fd4c..f63f2ee 100644
--- a/cuda-gl-check/src/main.cpp
+++ b/cuda-gl-check/src/main.cpp
@@ -11,7 +11,7 @@
  */
 int main(int argc, char* argv[]) {
     // TODO: Change this line to use your name!
-    m_yourName = "TODO: YOUR NAME HERE";
+    m_yourName = "Jefferson Koumba Moussadji Lu";
 
     if (init(argc, argv)) {
         mainLoop();
diff --git a/cuda-introduction/source/common.cu b/cuda-introduction/source/common.cu
index dce8793..a4dcdf7 100644
--- a/cuda-introduction/source/common.cu
+++ b/cuda-introduction/source/common.cu
@@ -9,7 +9,7 @@ unsigned divup(unsigned size, unsigned div)
 {
     // TODO: implement a 1 line function to return the divup operation.
     // Note: You only need to use addition, subtraction, and division operations.
-    return 0;
+    return (size + div - 1) / div;
 }
 
 void clearHostAndDeviceArray(float *res, float *dev_res, unsigned size, const int value)
diff --git a/cuda-introduction/source/matmul.cu b/cuda-introduction/source/matmul.cu
index 826e535..3f8dfee 100644
--- a/cuda-introduction/source/matmul.cu
+++ b/cuda-introduction/source/matmul.cu
@@ -12,17 +12,25 @@ __global__ void matrixMultiplicationNaive(float* const matrixP, const float* con
 {
     // TODO 10a: Compute the P matrix global index for each thread along x and y dimentions.
     // Remember that each thread of the kernel computes the result of 1 unique element of P
-    unsigned px;
-    unsigned py;
+    unsigned px = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned py = blockIdx.y * blockDim.y + threadIdx.y;
 
     // TODO 10b: Check if px or py are out of bounds. If they are, return.
+    if (px >= sizeMX || py >= sizeNY)
+        return;
 
     // TODO 10c: Compute the dot product for the P element in each thread
     // This loop will be the same as the host loop
     float dot = 0.0;
+    for (int k = 0; k < sizeXY; k++) {
+        const float m = matrixM[k * sizeMX + px];
+        const float n = matrixN[py * sizeXY + k];
+        dot += m * n;
+    }
 
     // TODO 10d: Copy dot to P matrix
     // matrixP[] = dot;
+    matrixP[py * sizeMX + px] = dot;
 }
 
 int main(int argc, char *argv[])
@@ -31,19 +39,19 @@ int main(int argc, char *argv[])
     // Then try large multiple-block square matrix like 64x64 up to 2048x2048.
     // Then try square, non-power-of-two like 15x15, 33x33, 67x67, 123x123, and 771x771
     // Then try rectangles with powers of two and then non-power-of-two.
-    const unsigned sizeMX = 0;
-    const unsigned sizeXY = 0;
-    const unsigned sizeNY = 0;
+    const unsigned sizeMX = 16*16;
+    const unsigned sizeXY = 32*32;
+    const unsigned sizeNY = 64*64;
 
     // TODO 2: Allocate host 1D arrays for:
     // matrixM[sizeMX, sizeXY]
     // matrixN[sizeXY, sizeNY]
     // matrixP[sizeMX, sizeNY]
     // matrixPGold[sizeMX, sizeNY]
-    float* matrixM;
-    float* matrixN;
-    float* matrixP;
-    float* matrixPGold;
+    float* matrixM = new float[sizeMX * sizeXY];
+    float* matrixN = new float[sizeXY * sizeNY];
+    float* matrixP = new float[sizeMX * sizeNY];
+    float* matrixPGold = new float[sizeMX * sizeNY];
 
     // LOOK: Setup random number generator and fill host arrays and the scalar a.
     std::random_device rd;
@@ -65,13 +73,29 @@ int main(int argc, char *argv[])
     //     for k -> 0 to sizeXY
     //       dot = m[k, px] * n[py, k]
     //  matrixPGold[py, px] = dot
+    for (int py = 0; py < sizeNY; py++) {
+        for (int px = 0; px < sizeMX; px++) {
+            float dot = 0;
+            for (int k = 0; k < sizeXY; k++) {
+                float m = matrixM[k * sizeMX + px];
+                float n = matrixN[py * sizeXY + k];
+                dot += m * n;
+            }
+            matrixPGold[py * sizeMX + px] = dot;
+        }
+    }
 
     // Device arrays
     float *d_matrixM, *d_matrixN, *d_matrixP;
 
     // TODO 4: Allocate memory on the device for d_matrixM, d_matrixN, d_matrixP.
+    CUDA(cudaMalloc((void**)&d_matrixM, sizeMX * sizeXY * sizeof(float)));
+    CUDA(cudaMalloc((void**)&d_matrixN, sizeXY * sizeNY * sizeof(float)));
+    CUDA(cudaMalloc((void**)&d_matrixP, sizeMX * sizeNY * sizeof(float)));
 
     // TODO 5: Copy array contents of M and N from the host (CPU) to the device (GPU)
+    CUDA(cudaMemcpy(d_matrixM, matrixM, sizeMX * sizeXY * sizeof(float), cudaMemcpyHostToDevice));
+    CUDA(cudaMemcpy(d_matrixN, matrixN, sizeXY * sizeNY * sizeof(float), cudaMemcpyHostToDevice));
 
     CUDA(cudaDeviceSynchronize());
 
@@ -86,13 +110,17 @@ int main(int argc, char *argv[])
     // Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup
     // HINT: The shape of matrices has no impact on launch configuaration
     DIMS dims;
-    dims.dimBlock = dim3(1, 1, 1);
-    dims.dimGrid  = dim3(1, 1, 1);
+    dims.dimBlock = dim3(16, 16, 1);
+    dims.dimGrid = dim3(divup(sizeMX, dims.dimBlock.x),
+        divup(sizeNY, dims.dimBlock.y),
+        1);
 
     // TODO 7: Launch the matrix transpose kernel
     // matrixMultiplicationNaive<<<>>>();
+    matrixMultiplicationNaive << <dims.dimGrid, dims.dimBlock >> > (d_matrixP, d_matrixM, d_matrixN, sizeMX, sizeNY, sizeXY);
 
     // TODO 8: copy the answer back to the host (CPU) from the device (GPU)
+    CUDA(cudaMemcpy(matrixP, d_matrixP, sizeMX * sizeNY * sizeof(float), cudaMemcpyDeviceToHost));
 
     // LOOK: Use compareReferenceAndResult to check the result
     compareReferenceAndResult(matrixPGold, matrixP, sizeMX * sizeNY, 1e-3);
@@ -101,6 +129,9 @@ int main(int argc, char *argv[])
     ////////////////////////////////////////////////////////////
 
     // TODO 9: free device memory using cudaFree
+    CUDA(cudaFree(d_matrixM));
+    CUDA(cudaFree(d_matrixN));
+    CUDA(cudaFree(d_matrixP));
 
     // free host memory
     delete[] matrixM;
diff --git a/cuda-introduction/source/saxpy.cu b/cuda-introduction/source/saxpy.cu
index 5ed591f..4a379bd 100644
--- a/cuda-introduction/source/saxpy.cu
+++ b/cuda-introduction/source/saxpy.cu
@@ -9,20 +9,21 @@
 __global__ void saxpy(float* const z, const float* const x, const float* const y, const float a, const unsigned size)
 {
     // TODO 9: Compute the global index for each thread.
-    unsigned idx = 0;
+    unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;
 
     // TODO 10: Check if idx is out of bounds. If yes, return.
-    if (idx >= 0)
+    if (idx >= size)
         return;
 
     // TODO 11: Perform the SAXPY operation: z = a * x + y.
+    z[idx] = a * x[idx] + y[idx];
 }
 
 int main(int argc, char *argv[])
 {
     // TODO 1: Set the size. Start with something simple like 64.
     // TODO Optional: Try out these sizes: 256, 1024, 2048, 14, 103, 1025, 3127
-    const unsigned size = 0;
+    const unsigned size = 64;
 
     // Host arrays.
     float* x = new float[size];
@@ -53,9 +54,14 @@ int main(int argc, char *argv[])
 
     // TODO 2: Allocate memory on the device. Fill in the blanks for d_x, then do the same commands for d_y and d_z.
     // CUDA(cudaMalloc((void **)& pointer, size in bytes)));
+    CUDA(cudaMalloc((void**)&d_x, size * sizeof(float)));
+    CUDA(cudaMalloc((void**)&d_y, size * sizeof(float)));
+    CUDA(cudaMalloc((void**)&d_z, size * sizeof(float)));
 
     // TODO 3: Copy array contents of X and Y from the host (CPU) to the device (GPU). Follow what you did for 2,
     // CUDA(cudaMemcpy(dest ptr, source ptr, size in bytes, direction enum));
+    CUDA(cudaMemcpy(d_x, x, size * sizeof(float), cudaMemcpyHostToDevice));
+    CUDA(cudaMemcpy(d_y, y, size * sizeof(float), cudaMemcpyHostToDevice));
 
     CUDA(cudaDeviceSynchronize());
 
@@ -69,16 +75,18 @@ int main(int argc, char *argv[])
     // TODO 4: Setup threads and blocks.
     // Start threadPerBlock as 128, then try out differnt configurations: 32, 64, 256, 512, 1024
     // Use divup to get the number of blocks to launch.
-    const unsigned threadsPerBlock = 0;
+    const unsigned threadsPerBlock = 128;
 
     // TODO 5: Implement the divup function in common.cpp
     const unsigned blocks = divup(size, threadsPerBlock);
 
     // TODO 6: Launch the GPU kernel with blocks and threadPerBlock as launch configuration
     // saxpy<<< >>> (....);
+    saxpy << <blocks, threadsPerBlock >> > (d_z, d_x, d_y, a, size);
 
     // TODO 7: Copy the answer back to the host (CPU) from the device (GPU).
     // Copy what you did in 3, except for d_z -> z.
+    CUDA(cudaMemcpy(z, d_z, size * sizeof(float), cudaMemcpyDeviceToHost));
 
     // LOOK: Use postprocess to check the result
     compareReferenceAndResult(z_gold, z, size, 1e-6);
@@ -87,6 +95,9 @@ int main(int argc, char *argv[])
 
     // TODO 8: free device memory using cudaFree
     // CUDA(cudaFree(device pointer));
+    CUDA(cudaFree(d_x));
+    CUDA(cudaFree(d_y));
+    CUDA(cudaFree(d_z));
 
     // free host memory
     delete[] x;
diff --git a/cuda-introduction/source/transpose.cu b/cuda-introduction/source/transpose.cu
index 89f6f8f..ce265a3 100644
--- a/cuda-introduction/source/transpose.cu
+++ b/cuda-introduction/source/transpose.cu
@@ -19,16 +19,21 @@
 __global__ void copyKernel(const float* const a, float* const b, const unsigned sizeX, const unsigned sizeY)
 {
     // TODO 6a: Compute the global index for each thread along x and y dimentions.
-    unsigned i = 0;
-    unsigned j = 0;;
+    unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned j = blockIdx.y * blockDim.y + threadIdx.y;
 
     // TODO 6b: Check if i or j are out of bounds. If they are, return.
+    if (i >= sizeX || j >= sizeY)
+    {
+        return;
+    }
 
     // TODO 6c: Compute global 1D index from i and j
-    unsigned index = 0;
+    unsigned index = j * sizeX + i;
 
     // TODO 6d: Copy data from A to B. Note that in copy kernel source and destination indices are the same
     // b[] = a[];
+    b[index] = a[index];
 }
 
 // TODO 11: Implement the transpose kernel
@@ -38,16 +43,21 @@ __global__ void copyKernel(const float* const a, float* const b, const unsigned
 __global__ void matrixTransposeNaive(const float* const a, float* const b, const unsigned sizeX, const unsigned sizeY)
 {
     // TODO 11a: Compute the global index for each thread along x and y dimentions.
-    unsigned i = 0;
-    unsigned j = 0;
+    unsigned i = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned j = blockIdx.y * blockDim.y + threadIdx.y;
 
     // TODO 11b: Check if i or j are out of bounds. If they are, return.
+    if (i >= sizeX || j >= sizeY)
+    {
+        return;
+    }
 
     // TODO 11c: Compute index_in as (i,j) (same as index in copy kernel) and index_out as (j,i)
-    unsigned index_in  = 0;  // Compute input index (i,j) from matrix A
-    unsigned index_out = 0;  // Compute output index (j,i) in matrix B = transpose(A)
+    unsigned index_in  = j * sizeX + i;  // Compute input index (i,j) from matrix A
+    unsigned index_out = i * sizeY + j;  // Compute output index (j,i) in matrix B = transpose(A)
 
     // TODO 11d: Copy data from A to B using transpose indices
+    b[index_out] = a[index_in];
 }
 
 int main(int argc, char *argv[])
@@ -82,8 +92,11 @@ int main(int argc, char *argv[])
     float *d_a, *d_b;
 
     // TODO 2: Allocate memory on the device for d_a and d_b.
+    CUDA(cudaMalloc((void**)&d_a, sizeX * sizeY * sizeof(float)));
+    CUDA(cudaMalloc((void**)&d_b, sizeX * sizeY * sizeof(float)));
 
     // TODO 3: Copy array contents of A from the host (CPU) to the device (GPU)
+    CUDA(cudaMemcpy(d_a, a, sizeX * sizeY * sizeof(float), cudaMemcpyHostToDevice));
 
     CUDA(cudaDeviceSynchronize());
 
@@ -97,13 +110,16 @@ int main(int argc, char *argv[])
         // TODO 4: Assign a 2D distribution of BS_X x BS_Y x 1 CUDA threads within
         // Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup
         DIMS dims;
-        dims.dimBlock = dim3(1, 1, 1);
-        dims.dimGrid = dim3(1, 1, 1);
+        dims.dimBlock = dim3(32, 32, 1);
+        dims.dimGrid = dim3(divup(sizeX, dims.dimBlock.x),
+            divup(sizeY, dims.dimBlock.y),
+            1);
 
         // LOOK: Launch the copy kernel
         copyKernel<<<dims.dimGrid, dims.dimBlock>>>(d_a, d_b, sizeX, sizeY);
 
         // TODO 5: copy the answer back to the host (CPU) from the device (GPU)
+        CUDA(cudaMemcpy(b, d_b, sizeY * sizeX * sizeof(float), cudaMemcpyDeviceToHost));
 
         // LOOK: Use compareReferenceAndResult to check the result
         compareReferenceAndResult(a_gold, b, sizeX * sizeY);
@@ -121,13 +137,17 @@ int main(int argc, char *argv[])
         // TODO 8: Assign a 2D distribution of BS_X x BS_Y x 1 CUDA threads within
         // Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup
         DIMS dims;
-        dims.dimBlock = dim3(1, 1, 1);
-        dims.dimGrid = dim3(1, 1, 1);
+        dims.dimBlock = dim3(32, 32, 1);
+        dims.dimGrid = dim3(divup(sizeX, dims.dimBlock.x),
+            divup(sizeY, dims.dimBlock.y),
+            1);
 
         // TODO 9: Launch the matrix transpose kernel
         // matrixTransposeNaive<<<>>>(......);
+        matrixTransposeNaive <<<dims.dimGrid, dims.dimBlock >>> (d_a, d_b, sizeX, sizeY);
 
         // TODO 10: copy the answer back to the host (CPU) from the device (GPU)
+        CUDA(cudaMemcpy(b, d_b, sizeY * sizeX * sizeof(float), cudaMemcpyDeviceToHost));
 
         // LOOK: Use compareReferenceAndResult to check the result
         compareReferenceAndResult(b_gold, b, sizeX * sizeY);
@@ -136,6 +156,8 @@ int main(int argc, char *argv[])
     ////////////////////////////////////////////////////////////
 
     // TODO 7: free device memory using cudaFree
+    CUDA(cudaFree(d_a));
+    CUDA(cudaFree(d_b));
 
     // free host memory
     delete[] a;
diff --git a/images/CUDA_GL_Check.png b/images/CUDA_GL_Check.png
new file mode 100644
index 0000000..2bb719c
Binary files /dev/null and b/images/CUDA_GL_Check.png differ
diff --git a/images/NVIDIA_Nsight_Compute.png b/images/NVIDIA_Nsight_Compute.png
new file mode 100644
index 0000000..c425993
Binary files /dev/null and b/images/NVIDIA_Nsight_Compute.png differ
diff --git a/images/NVIDIA_Nsight_Systems_Analysis_Summary.png b/images/NVIDIA_Nsight_Systems_Analysis_Summary.png
new file mode 100644
index 0000000..5621015
Binary files /dev/null and b/images/NVIDIA_Nsight_Systems_Analysis_Summary.png differ
diff --git a/images/NVIDIA_Nsight_Systems_Timeline.png b/images/NVIDIA_Nsight_Systems_Timeline.png
new file mode 100644
index 0000000..1002284
Binary files /dev/null and b/images/NVIDIA_Nsight_Systems_Timeline.png differ
diff --git a/images/Nsight_Debugging_Windows_VS.png b/images/Nsight_Debugging_Windows_VS.png
new file mode 100644
index 0000000..9d64fb2
Binary files /dev/null and b/images/Nsight_Debugging_Windows_VS.png differ
diff --git a/images/WebGL_Report.png b/images/WebGL_Report.png
new file mode 100644
index 0000000..ffc3205
Binary files /dev/null and b/images/WebGL_Report.png differ
diff --git a/images/WebGPU.png b/images/WebGPU.png
new file mode 100644
index 0000000..ea36989
Binary files /dev/null and b/images/WebGPU.png differ