-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDP.cu
More file actions
74 lines (50 loc) · 1.59 KB
/
DP.cu
File metadata and controls
74 lines (50 loc) · 1.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <iostream>// CUDA kernel. Each thread takes care of one element of c
using namespace std;
// Kernel: counts the even indices in [0, n), accumulating the count into *a.
// Launch layout: 1-D grid of 1-D blocks, one index per thread.
// Precondition: *a must be zero-initialized by the caller before launch.
//
// Fixes vs. original: removed the call to the undefined `ChildKernel` (the
// file could not compile), the deprecated `cudaThreadSynchronize()` (device-
// side child-grid sync is removed in CUDA 12), and the misspelled
// `syncthreads()` (no shared memory is used, so no barrier is needed).
__global__ void fun(long long *a, long long n)
{
    // Global thread ID. Cast blockIdx.x before the multiply so the product
    // is computed in 64-bit arithmetic — with ~1e9 blocks of 1024 threads
    // the 32-bit product would overflow.
    long long id = (long long)blockIdx.x * blockDim.x + threadIdx.x;

    // Bounds guard (grid rarely divides n evenly) plus the evenness test.
    // atomicAdd has no signed long long overload, so the counter is
    // reinterpreted as unsigned long long; the count is non-negative, so
    // the bit pattern is identical.
    if (id < n && id % 2 == 0)
        atomicAdd(reinterpret_cast<unsigned long long *>(a), 1ULL);
}
// Abort with a readable message if a CUDA runtime call fails. Kernel
// launches report configuration errors via cudaGetLastError() and
// execution errors at the next synchronizing call, so both are checked.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Counts the even integers in [0, n) on the GPU and prints the result.
// For n = 1e12 the expected output is n / 2 = 500000000000.
int main(int argc, char *argv[])
{
    // Number of indices to classify.
    long long n = 1e12;

    // Host-side counter; a stack variable suffices for a single value
    // (the original heap malloc/free pair was unnecessary).
    long long h_a = 0;

    // Device-side counter, zero-initialized from the host copy.
    long long *d_a;
    CUDA_CHECK(cudaMalloc(&d_a, sizeof(long long)));
    CUDA_CHECK(cudaMemcpy(d_a, &h_a, sizeof(long long),
                          cudaMemcpyHostToDevice));

    // Threads per block (multiple of 32) and ceil-div grid size.
    // Integer arithmetic replaces ceil((float)n / blockSize): a float's
    // 24-bit mantissa cannot represent n/blockSize (~9.77e8) exactly for
    // n = 1e12, which silently mis-sizes the grid. The result (~9.77e8
    // blocks) fits the 2^31-1 x-dimension grid limit.
    long long blockSize = 1024;
    long long gridSize = (n + blockSize - 1) / blockSize;
    cout << "Block Size: " << blockSize << ", Grid Size: " << gridSize << '\n';

    // Launch, then surface both launch-configuration and asynchronous
    // execution errors instead of ignoring them.
    fun<<<gridSize, blockSize>>>(d_a, n);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Blocking copy of the result back to the host.
    CUDA_CHECK(cudaMemcpy(&h_a, d_a, sizeof(long long),
                          cudaMemcpyDeviceToHost));
    cout << "Result: " << h_a << '\n';

    // Release device memory.
    CUDA_CHECK(cudaFree(d_a));
    return 0;
}