CIS5650-Fall-2025 · thesquashedman · Aug 27, 2025 · Aug 27, 2025 · Aug 27, 2025 · Aug 27, 2025
diff --git a/README.md b/README.md
@@ -3,11 +3,25 @@ Project 0 Getting Started
 
 **University of Pennsylvania, CIS 5650: GPU Programming and Architecture, Project 0**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Pavel Zdravkov Peev
+  * LinkedIn: https://www.linkedin.com/in/pavel-peev-5568561b9/
+  * Personal Website: https://www.Cartaphil.com
+* Tested on: Windows 11, i7-12700, NVIDIA T1000 (SEAS VLAB 007)
 
-### (TODO: Your README)
+### Analysis and Screenshots
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+I had problems with sections 2.1.4 and 2.1.5 because I do not have administrative access on the virtual lab computers.
+I was expecting a new PC to arrive on 8/28/2025, but it seems to have been delayed; I will redo these sections on the new PC when it arrives.
+
+### 2.1.2
+
+<img width="397" height="418" alt="2-1-2-Screenshot" src="https://github.com/user-attachments/assets/945490a4-9f9c-4ad0-88f1-34a8e8fda4d6" />
+
+### 2.1.3
+<img width="959" height="481" alt="2-1-3-Screenshot" src="https://github.com/user-attachments/assets/27aa63bc-45ee-422f-a135-a8c68b7b852e" />
+
+### 2.2
+<img width="535" height="310" alt="2-2-Screenshot" src="https://github.com/user-attachments/assets/06e95e31-8a96-42c0-adc8-ba341eb53710" />
+
+### 2.3
+<img width="791" height="440" alt="2-3Screenshot" src="https://github.com/user-attachments/assets/527c0c9b-7783-45df-a9d7-8b084a0ba2af" />
diff --git a/cuda-gl-check/src/main.cpp b/cuda-gl-check/src/main.cpp
@@ -10,8 +10,7 @@
  * C main function.
  */
 int main(int argc, char* argv[]) {
-    // TODO: Change this line to use your name!
-    m_yourName = "TODO: YOUR NAME HERE";
+    m_yourName = "Pavel Peev";
 
     if (init(argc, argv)) {
         mainLoop();

diff --git a/cuda-introduction/source/common.cu b/cuda-introduction/source/common.cu
@@ -9,7 +9,7 @@ unsigned divup(unsigned size, unsigned div)
 {
     // TODO: implement a 1 line function to return the divup operation.
     // Note: You only need to use addition, subtraction, and division operations.
-    return 0;
+    return size % div == 0 ? (size / div) : (size / div) + 1;
 }
 
 void clearHostAndDeviceArray(float *res, float *dev_res, unsigned size, const int value)

diff --git a/cuda-introduction/source/saxpy.cu b/cuda-introduction/source/saxpy.cu
@@ -9,20 +9,21 @@
 __global__ void saxpy(float* const z, const float* const x, const float* const y, const float a, const unsigned size)
 {
     // TODO 9: Compute the global index for each thread.
-    unsigned idx = 0;
+    unsigned idx = blockIdx.x * blockDim.x + threadIdx.x; // flat index: this block's offset plus the thread's position inside it (x dimension only)
 
     // TODO 10: Check if idx is out of bounds. If yes, return.
-    if (idx >= 0)
+    if (idx >= size) // idx is not a valid element index for arrays of `size` entries
         return;
 
     // TODO 11: Perform the SAXPY operation: z = a * x + y.
+    z[idx] = a * x[idx] + y[idx]; // each thread that passes the guard writes exactly one element of z
 }
 
 int main(int argc, char *argv[])
 {
     // TODO 1: Set the size. Start with something simple like 64.
     // TODO Optional: Try out these sizes: 256, 1024, 2048, 14, 103, 1025, 3127
-    const unsigned size = 0;
+    const unsigned size = 2024;
 
     // Host arrays.
     float* x = new float[size];
@@ -53,9 +54,15 @@ int main(int argc, char *argv[])
 
     // TODO 2: Allocate memory on the device. Fill in the blanks for d_x, then do the same commands for d_y and d_z.
     // CUDA(cudaMalloc((void **)& pointer, size in bytes)));
-
+    CUDA(cudaMalloc((void**)&d_x, size * sizeof(float)));
+    CUDA(cudaMalloc((void**)&d_y, size * sizeof(float)));
+    CUDA(cudaMalloc((void**)&d_z, size * sizeof(float)));
     // TODO 3: Copy array contents of X and Y from the host (CPU) to the device (GPU). Follow what you did for 2,
     // CUDA(cudaMemcpy(dest ptr, source ptr, size in bytes, direction enum));
+    CUDA(cudaMemcpy(d_x, x, size * sizeof(float), cudaMemcpyHostToDevice));
+    CUDA(cudaMemcpy(d_y, y, size * sizeof(float), cudaMemcpyHostToDevice));
+    CUDA(cudaMemcpy(d_z, z, size * sizeof(float), cudaMemcpyHostToDevice));
+
 
     CUDA(cudaDeviceSynchronize());
 
@@ -69,25 +76,27 @@ int main(int argc, char *argv[])
     // TODO 4: Setup threads and blocks.
     // Start threadPerBlock as 128, then try out differnt configurations: 32, 64, 256, 512, 1024
     // Use divup to get the number of blocks to launch.
-    const unsigned threadsPerBlock = 0;
+    const unsigned threadsPerBlock = 32;
 
     // TODO 5: Implement the divup function in common.cpp
     const unsigned blocks = divup(size, threadsPerBlock);
 
     // TODO 6: Launch the GPU kernel with blocks and threadPerBlock as launch configuration
     // saxpy<<< >>> (....);
-
+    saxpy <<<blocks, threadsPerBlock>>> (d_z, d_x, d_y, a, size);
     // TODO 7: Copy the answer back to the host (CPU) from the device (GPU).
     // Copy what you did in 3, except for d_z -> z.
-
+    CUDA(cudaMemcpy(z, d_z, size * sizeof(float), cudaMemcpyDeviceToHost));
     // LOOK: Use postprocess to check the result
     compareReferenceAndResult(z_gold, z, size, 1e-6);
     std::cout << "****************************************************" << std::endl << std::endl;
     ////////////////////////////////////////////////////////////
 
     // TODO 8: free device memory using cudaFree
     // CUDA(cudaFree(device pointer));
-
+    CUDA(cudaFree(d_x));
+    CUDA(cudaFree(d_y));
+    CUDA(cudaFree(d_z));
     // free host memory
     delete[] x;
     delete[] y;

diff --git a/images/2-1-2-Screenshot.png b/images/2-1-2-Screenshot.png
diff --git a/images/2-1-3-Screenshot.png b/images/2-1-3-Screenshot.png
diff --git a/images/2-2-Screenshot.png b/images/2-2-Screenshot.png
diff --git a/images/2-3Screenshot.png b/images/2-3Screenshot.png