From cdefe817ff6bed879b31a07c23695c8ac59be0cd Mon Sep 17 00:00:00 2001
From: Josef Edwards <joed6834@colorado.edu>
Date: Mon, 11 Nov 2024 20:43:06 -0500
Subject: [PATCH] Update GPUAceel

Signed-off-by: Josef Edwards <joed6834@colorado.edu>
---
 GPUAceel | 36 ++++++++++++++++++++++++++++++++----
 1 file changed, 32 insertions(+), 4 deletions(-)

diff --git a/GPUAceel b/GPUAceel
index 603a2f6..4482a88 100644
--- a/GPUAceel
+++ b/GPUAceel
@@ -1,25 +1,53 @@
 #include <cuda_runtime.h>
 #include <stdio.h>
 
+#define MAX_ITERATIONS 1024  // Define the maximum number of iterations
+
+// Error-checking macro: evaluates a CUDA runtime call once; on failure prints the error string and exits
+#define CUDA_CHECK(call)                                 \
+    do {                                                 \
+        cudaError_t err = (call);                        \
+        if (err != cudaSuccess) {                        \
+            fprintf(stderr, "CUDA Error: %s\n",          \
+                    cudaGetErrorString(err));            \
+            exit(err);                                   \
+        }                                                \
+    } while (0)
+
 __global__ void PMLL_LogicLoop_GPU(int *counter) {
     int tid = blockIdx.x * blockDim.x + threadIdx.x;
     if (tid < MAX_ITERATIONS) {
         printf("Updating memory graph at iteration %d\n", tid);
+        atomicAdd(counter, 1);  // Atomic increment: every in-range thread bumps the same counter
     }
 }
 
 int main() {
     int *d_counter;
-    cudaMalloc((void **)&d_counter, sizeof(int));
-    cudaMemset(d_counter, 0, sizeof(int));
+    int h_counter = 0;  // Host-side copy of the counter; also the initial value uploaded below
+
+    // Allocate device memory for the counter
+    CUDA_CHECK(cudaMalloc((void **)&d_counter, sizeof(int)));
+    // Initialize the device counter from h_counter (0)
+    CUDA_CHECK(cudaMemcpy(d_counter, &h_counter, sizeof(int), cudaMemcpyHostToDevice));
 
     dim3 blockSize(256);
     dim3 gridSize((MAX_ITERATIONS + blockSize.x - 1) / blockSize.x);
 
+    // Launch the kernel (the launch statement itself returns no status)
     PMLL_LogicLoop_GPU<<<gridSize, blockSize>>>(d_counter);
 
-    cudaDeviceSynchronize();
-    cudaFree(d_counter);
+    CUDA_CHECK(cudaGetLastError());  // Report launch errors, which the <<<>>> syntax cannot return
+    CUDA_CHECK(cudaDeviceSynchronize());  // Wait for the kernel to finish before reading the counter
+
+    // Copy the counter value back to the host
+    CUDA_CHECK(cudaMemcpy(&h_counter, d_counter, sizeof(int), cudaMemcpyDeviceToHost));
+
+    // Display the total iterations processed
+    printf("Total iterations processed: %d\n", h_counter);
+
+    // Free device memory
+    CUDA_CHECK(cudaFree(d_counter));
 
     return 0;
 }