-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathDP.cu
More file actions
74 lines (50 loc) · 1.59 KB
/
DP.cu
File metadata and controls
74 lines (50 loc) · 1.59 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <iostream>// CUDA kernel. Each thread takes care of one element of c
using namespace std;
// Kernel: counts the even indices in [0, n), accumulating the count into *a.
// Launch layout: 1-D grid of 1-D blocks, one index per thread.
// Precondition: *a must be zero-initialized by the caller before launch.
//
// Fixes vs. original: removed the call to the undefined `ChildKernel` (the
// file could not compile), the deprecated `cudaThreadSynchronize()` (device-
// side child-grid sync is removed in CUDA 12), and the misspelled
// `syncthreads()` (no shared memory is used, so no barrier is needed).
__global__ void fun(long long *a, long long n)
{
    // Global thread ID. Cast blockIdx.x before the multiply so the product
    // is computed in 64-bit arithmetic — with ~1e9 blocks of 1024 threads
    // the 32-bit product would overflow.
    long long id = (long long)blockIdx.x * blockDim.x + threadIdx.x;

    // Bounds guard (grid rarely divides n evenly) plus the evenness test.
    // atomicAdd has no signed long long overload, so the counter is
    // reinterpreted as unsigned long long; the count is non-negative, so
    // the bit pattern is identical.
    if (id < n && id % 2 == 0)
        atomicAdd(reinterpret_cast<unsigned long long *>(a), 1ULL);
}
// Abort with a readable message if a CUDA runtime call fails. Kernel
// launches report configuration errors via cudaGetLastError() and
// execution errors at the next synchronizing call, so both are checked.
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)

// Counts the even integers in [0, n) on the GPU and prints the result.
// For n = 1e12 the expected output is n / 2 = 500000000000.
int main(int argc, char *argv[])
{
    // Number of indices to classify.
    long long n = 1e12;

    // Host-side counter; a stack variable suffices for a single value
    // (the original heap malloc/free pair was unnecessary).
    long long h_a = 0;

    // Device-side counter, zero-initialized from the host copy.
    long long *d_a;
    CUDA_CHECK(cudaMalloc(&d_a, sizeof(long long)));
    CUDA_CHECK(cudaMemcpy(d_a, &h_a, sizeof(long long),
                          cudaMemcpyHostToDevice));

    // Threads per block (multiple of 32) and ceil-div grid size.
    // Integer arithmetic replaces ceil((float)n / blockSize): a float's
    // 24-bit mantissa cannot represent n/blockSize (~9.77e8) exactly for
    // n = 1e12, which silently mis-sizes the grid. The result (~9.77e8
    // blocks) fits the 2^31-1 x-dimension grid limit.
    long long blockSize = 1024;
    long long gridSize = (n + blockSize - 1) / blockSize;
    cout << "Block Size: " << blockSize << ", Grid Size: " << gridSize << '\n';

    // Launch, then surface both launch-configuration and asynchronous
    // execution errors instead of ignoring them.
    fun<<<gridSize, blockSize>>>(d_a, n);
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Blocking copy of the result back to the host.
    CUDA_CHECK(cudaMemcpy(&h_a, d_a, sizeof(long long),
                          cudaMemcpyDeviceToHost));
    cout << "Result: " << h_a << '\n';

    // Release device memory.
    CUDA_CHECK(cudaFree(d_a));
    return 0;
}