forked from davestampf/GPUTalk2014
-
Notifications
You must be signed in to change notification settings - Fork 0
/
scan-cuda-kernel-small.cu
42 lines (30 loc) · 958 Bytes
/
scan-cuda-kernel-small.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
/*
 * A simple single-block scan: computes the inclusive prefix sums of the
 * first n elements of d_in and stores them in d_out, i.e.
 *     d_out[i] = d_in[0] + d_in[1] + ... + d_in[i]
 *
 * Parameters:
 *   d_out - device array receiving the n partial sums
 *   d_in  - device array holding the n input values (only read)
 *   n     - number of elements to scan
 *
 * Launch requirements:
 *   - Only threadIdx.x is used for indexing, so the kernel is meant to be
 *     launched with exactly one block.
 *   - Each thread handles one element, so blockDim.x must be >= n.
 *     Extra threads are allowed; they only take part in the barriers.
 *   - Dynamic shared memory must hold two buffers of n floats:
 *     2 * n * sizeof(float) bytes.
 */
__global__ void cudaScan(float* d_out, float* d_in, int n) {
    // shared array allocated by the launch of the kernel; it is used as two
    // ping-pong buffers of n floats each: [0, n) and [n, 2n)
    extern __shared__ float temp[];

    const int threadId = threadIdx.x;

    // Threads beyond the end of the data must NOT return early: every thread
    // of the block has to reach each __syncthreads() below, otherwise the
    // barrier is executed in divergent code. They are masked out instead.
    const bool active = threadId < n;

    // The roles are swapped at the top of every pass, so the first pass
    // reads buffer 0 (the local copy) and writes buffer 1.
    int fromBuffer = 1;
    int toBuffer = 0;

    // make a local copy of the data in buffer 0
    if (active) {
        temp[threadId] = d_in[threadId];
    }
    __syncthreads();

    // Each pass adds the element `offset` positions to the left; offset
    // doubles every pass. Once offset >= n no element has such a neighbour,
    // so the loop stops there. The bound is uniform across the block, which
    // keeps the barrier inside the loop non-divergent.
    for (int offset = 1; offset < n; offset *= 2) {
        fromBuffer = 1 - fromBuffer;
        toBuffer = 1 - toBuffer;

        if (active) {
            float sum = temp[fromBuffer * n + threadId];
            if (threadId >= offset) {
                sum += temp[fromBuffer * n + threadId - offset];
            }
            temp[toBuffer * n + threadId] = sum;
        }

        // All writes of this pass must be visible before the next pass
        // reads them (and before its writes overwrite the buffer just read).
        __syncthreads();
    }

    // toBuffer names the buffer written by the last pass (or buffer 0, the
    // plain copy, when no pass was needed).
    if (active) {
        d_out[threadId] = temp[toBuffer * n + threadId];
    }
}