// unified_pointer.cu — CUDA unified-memory example (70 lines, 1.75 KB)
#include <iostream>
#include <math.h>
// Bundles the operands of the element-wise add so a single pointer can be
// handed to the kernel. In this example all three allocations live in
// unified (managed) memory, so both host and device may dereference them.
struct Operation {
float *x; // input array of n floats
float *y; // input/output array of n floats; kernel overwrites it with x + y
int n;    // number of elements in x and y
};
// CUDA kernel to add elements of two arrays: y[i] = x[i] + y[i].
// Grid-stride loop, so any <<<grid, block>>> configuration covers all op->n
// elements (including a 1-block debug launch). op and the arrays it points
// to must be device-accessible (unified memory in this example).
__global__ void add(Operation *op) {
  int index = blockIdx.x * blockDim.x + threadIdx.x;
  int stride = blockDim.x * gridDim.x;
  // Debug print from one thread only; %f matches the float argument
  // (floats promote to double in varargs). The original "%x" applied an
  // unsigned-int conversion to a double argument — undefined behavior,
  // printing garbage instead of the value.
  if (index == 0) {
    printf("The X is: %f\n", op->x[0]);
    printf("The Y is: %f\n", op->y[0]);
  }
  for (int i = index; i < op->n; i += stride) {
    op->y[i] = op->x[i] + op->y[i];
  }
}
// Abort with file:line and a readable message if a CUDA API call fails.
// Without this, allocation or launch failures are silently ignored and the
// program reports a bogus "Max error" from uninitialized/unmapped memory.
#define CUDA_CHECK(call)                                                     \
  do {                                                                       \
    cudaError_t err_ = (call);                                               \
    if (err_ != cudaSuccess) {                                               \
      std::cerr << "CUDA error at " << __FILE__ << ":" << __LINE__ << ": "   \
                << cudaGetErrorString(err_) << std::endl;                    \
      std::abort();                                                          \
    }                                                                        \
  } while (0)

// Adds two 100-element arrays on the GPU via unified memory and verifies
// that every element of the result equals 3.0f. Returns 0 on success.
int main(void) {
  Operation *op;
  // Allocate Unified Memory -- accessible from CPU or GPU.
  CUDA_CHECK(cudaMallocManaged(&op, sizeof(Operation)));
  op->n = 100;
  CUDA_CHECK(cudaMallocManaged(&op->x, op->n * sizeof(float)));
  CUDA_CHECK(cudaMallocManaged(&op->y, op->n * sizeof(float)));

  // Initialize x and y arrays on the host.
  for (int i = 0; i < op->n; i++) {
    op->x[i] = 1.0f;
    op->y[i] = 2.0f;
  }

  // Launch enough blocks to cover n elements (ceil-div); the kernel's
  // grid-stride loop makes any configuration correct regardless.
  int blockSize = 256;
  int numBlocks = (op->n + blockSize - 1) / blockSize;
  std::cout << "numBlocks: " << numBlocks << std::endl;
  // Print the managed array's address itself. The original printed &op->x —
  // the address of the pointer *field* inside the struct, not the array.
  std::cout << "X: " << op->x << std::endl;

  add<<<numBlocks, blockSize>>>(op);
  CUDA_CHECK(cudaGetLastError());       // surface launch-configuration errors
  CUDA_CHECK(cudaDeviceSynchronize());  // wait for GPU; surface kernel faults

  // Log results and check them in one pass (all values should be 3.0f).
  std::cout << "Results (y = x + y):" << std::endl;
  float maxError = 0.0f;
  for (int i = 0; i < op->n; i++) {
    std::cout << "y[" << i << "] = " << op->y[i] << " (expected: 3.0)" << std::endl;
    maxError = fmax(maxError, fabs(op->y[i] - 3.0f));
  }
  std::cout << "Max error: " << maxError << std::endl;

  // Free unified memory.
  CUDA_CHECK(cudaFree(op->x));
  CUDA_CHECK(cudaFree(op->y));
  CUDA_CHECK(cudaFree(op));
  return 0;
}