diff --git a/README.md b/README.md index 0e38ddb..d12a56b 100644 --- a/README.md +++ b/README.md @@ -3,12 +3,82 @@ CUDA Stream Compaction **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Janet Wang + * https://xchennnw.github.io/ +* Tested on: Windows 11, i7-12700H @ 2.30GHz 16GB, Nvidia Geforce RTX 3070 Ti 8054MB -### (TODO: Your README) +### TODO implemented +* CPU Scan & Stream Compaction +* Naive GPU Scan Algorithm +* Work-Efficient GPU Scan & Stream Compaction +* Scan using Thrust + +### Project Description +This project is about GPU stream compaction in CUDA, including a few different versions of the Scan (Prefix Sum) algorithm: a CPU version, GPU naive scan, GPU "work-efficient" scan, and GPU scan using thrust. It also includes GPU stream compaction using the above algorithms. -Include analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +### Performance Analysis +![](scan2.png) +![](scan1.png) +* When the array size is greater than 2^8, thrus scan always has the best performance. +* When the array size is under 2^16, CPU scan is faster than both of the GPU naive scan and work-efficient scan. The rational explanation could be the cost of GPU reading data from global memory is a relatively large part of time cost when array size is small. +* The GPU efficient scan perferms better than naive only after the the array size is greater than 2^16. I am actually confused about this point. +* When the array size is greater than 2^16, the rank of the methods becomes relatively stable: thrust > GPU efficient > GPU naive > CPU +![](nsight.PNG) +This is the Nsight timeline for the execution of GPU scan using thrust. It seems like cudaMemcpyAsync() and cudaStreamSynchronize are used here, but to be honest I do not quite understand what happens in these two functions and how they significantly improve the performance. +### Output of the test program +SIZE = 1 << 24 +``` +**************** +** SCAN TESTS ** +**************** + [ 43 2 2 48 43 19 31 48 40 42 13 19 31 ... 6 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 41.2005ms (std::chrono Measured) + [ 0 43 45 47 95 138 157 188 236 276 318 331 350 ... 410823510 410823516 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 43.9688ms (std::chrono Measured) + [ 0 43 45 47 95 138 157 188 236 276 318 331 350 ... 410823427 410823469 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 26.7661ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 27.9169ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 12.5882ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 13.1602ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 1.37734ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 1.35168ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 0 3 2 0 0 0 1 3 1 0 1 3 2 ... 3 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 49.1391ms (std::chrono Measured) + [ 3 2 1 3 1 1 3 2 1 3 3 1 3 ... 1 3 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 51.6044ms (std::chrono Measured) + [ 3 2 1 3 1 1 3 2 1 3 3 1 3 ... 1 1 ] + passed +==== cpu compact with scan ==== + elapsed time: 116.318ms (std::chrono Measured) + [ 3 2 1 3 1 1 3 2 1 3 3 1 3 ... 1 3 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 12.9027ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 13.2913ms (CUDA Measured) + passed +``` diff --git a/nsight.PNG b/nsight.PNG new file mode 100644 index 0000000..c86a890 Binary files /dev/null and b/nsight.PNG differ diff --git a/pj2.txt b/pj2.txt new file mode 100644 index 0000000..91d6f5a --- /dev/null +++ b/pj2.txt @@ -0,0 +1,53 @@ + +**************** +** SCAN TESTS ** +**************** + [ 47 40 21 2 41 34 26 16 47 18 22 27 9 ... 12 0 ] +==== cpu scan, power-of-two ==== + elapsed time: 0.0004ms (std::chrono Measured) + [ 0 47 87 108 110 151 185 211 227 274 292 314 341 ... 6241 6253 ] +==== cpu scan, non-power-of-two ==== + elapsed time: 0.0003ms (std::chrono Measured) + [ 0 47 87 108 110 151 185 211 227 274 292 314 341 ... 6148 6160 ] + passed +==== naive scan, power-of-two ==== + elapsed time: 0.047456ms (CUDA Measured) + passed +==== naive scan, non-power-of-two ==== + elapsed time: 0.046528ms (CUDA Measured) + passed +==== work-efficient scan, power-of-two ==== + elapsed time: 0.051264ms (CUDA Measured) + passed +==== work-efficient scan, non-power-of-two ==== + elapsed time: 0.051392ms (CUDA Measured) + passed +==== thrust scan, power-of-two ==== + elapsed time: 0.037952ms (CUDA Measured) + passed +==== thrust scan, non-power-of-two ==== + elapsed time: 0.03808ms (CUDA Measured) + passed + +***************************** +** STREAM COMPACTION TESTS ** +***************************** + [ 0 1 3 2 1 3 2 1 3 0 0 3 0 ... 0 0 ] +==== cpu compact without scan, power-of-two ==== + elapsed time: 0.0006ms (std::chrono Measured) + [ 1 3 2 1 3 2 1 3 3 3 2 1 3 ... 1 2 ] + passed +==== cpu compact without scan, non-power-of-two ==== + elapsed time: 0.0005ms (std::chrono Measured) + [ 1 3 2 1 3 2 1 3 3 3 2 1 3 ... 1 1 ] + passed +==== cpu compact with scan ==== + elapsed time: 0.0013ms (std::chrono Measured) + [ 1 3 2 1 3 2 1 3 3 3 2 1 3 ... 1 2 ] + passed +==== work-efficient compact, power-of-two ==== + elapsed time: 0.07184ms (CUDA Measured) + passed +==== work-efficient compact, non-power-of-two ==== + elapsed time: 0.07296ms (CUDA Measured) +Press any key to continue . . . \ No newline at end of file diff --git a/scan1.png b/scan1.png new file mode 100644 index 0000000..7d82a77 Binary files /dev/null and b/scan1.png differ diff --git a/scan2.png b/scan2.png new file mode 100644 index 0000000..c4ec97d Binary files /dev/null and b/scan2.png differ diff --git a/src/main.cpp b/src/main.cpp index 896ac2b..2fce82c 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -13,7 +13,7 @@ #include #include "testing_helpers.hpp" -const int SIZE = 1 << 8; // feel free to change the size of array +const int SIZE = 1 << 16; // feel free to change the size of array const int NPOT = SIZE - 3; // Non-Power-Of-Two int *a = new int[SIZE]; int *b = new int[SIZE]; diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu index 2ed6d63..7dbb08f 100644 --- a/stream_compaction/common.cu +++ b/stream_compaction/common.cu @@ -24,6 +24,9 @@ namespace StreamCompaction { */ __global__ void kernMapToBoolean(int n, int *bools, const int *idata) { // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) return; + bools[index] = (idata[index] != 0); } /** @@ -33,6 +36,11 @@ namespace StreamCompaction { __global__ void kernScatter(int n, int *odata, const int *idata, const int *bools, const int *indices) { // TODO + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) return; + if (bools[index] > 0) { + odata[indices[index]] = idata[index]; + } } } diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu index 719fa11..3341a9c 100644 --- a/stream_compaction/cpu.cu +++ b/stream_compaction/cpu.cu @@ -20,6 +20,10 @@ namespace StreamCompaction { void scan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + odata[0] = 0; + for (int i = 1; i < n; i++) { + odata[i] = idata[i - 1] + odata[i - 1]; + } timer().endCpuTimer(); } @@ -31,8 +35,17 @@ namespace StreamCompaction { int compactWithoutScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + int cnt = 0; + odata[0] = 0; + for (int i = 0; i < n; i++) { + if(idata[i] != 0) + { + odata[cnt] = idata[i]; + cnt++; + } + } timer().endCpuTimer(); - return -1; + return cnt; } /** @@ -43,8 +56,33 @@ namespace StreamCompaction { int compactWithScan(int n, int *odata, const int *idata) { timer().startCpuTimer(); // TODO + + // temporary array + int *temp = new int[n]; + for (int i = 0; i < n; i++) { + temp[i] = (idata[i] != 0); + } + int *temp2 = new int[n]; + + // scan + temp2[0] = 0; + for (int i = 1; i < n; i++) { + temp2[i] = temp[i - 1] + temp2[i - 1]; + } + int cnt = temp2[n - 1]; + + // scatter + for (int i = 0; i < n; i++) { + if (temp[i] > 0) + { + odata[temp2[i]] = idata[i]; + } + } + timer().endCpuTimer(); - return -1; + delete[] temp; + delete[] temp2; + return cnt; } } } diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu index 2db346e..f18cfd9 100644 --- a/stream_compaction/efficient.cu +++ b/stream_compaction/efficient.cu @@ -3,6 +3,8 @@ #include "common.h" #include "efficient.h" +#define blockSize 128 + namespace StreamCompaction { namespace Efficient { using StreamCompaction::Common::PerformanceTimer; @@ -12,13 +14,65 @@ namespace StreamCompaction { return timer; } + __global__ void kernUpSweep(int n, int d, int* data) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) return; + + int a = 1 << d; + int b = 1 << (d + 1); + if (index % b == 0) + { + data[index + b - 1] += data[index + a - 1]; + } + } + + __global__ void kernDownSweep(int n, int d, int* data){ + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) return; + + int a = 1 << d; + int b = 1 << (d + 1); + if (index % b == 0) + { + int temp = data[index + b - 1]; + data[index + b - 1] += data[index + a - 1]; + data[index + a - 1] = temp; + } + } + /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + // TODO + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + int size = pow(2, ilog2ceil(n)); + + int* scan_array; + cudaMalloc((void**)&scan_array, size * sizeof(int)); + cudaMemset(scan_array, 0, size * sizeof(int)); + cudaMemcpy(scan_array, idata, size * sizeof(int), cudaMemcpyHostToDevice); + + timer().startGpuTimer(); + // Up sweep + for (int i = 0; i < ilog2ceil(n); i++) + { + kernUpSweep <<>> (size, i, scan_array); + checkCUDAError("kernUpSweep fails."); + } + cudaMemset(scan_array + size - 1, 0, sizeof(int)); + + // Down sweep + for (int i = ilog2ceil(n) - 1; i >= 0; i--) + { + kernDownSweep <<>> (size, i, scan_array); + checkCUDAError("kernDownSweep fails."); + } timer().endGpuTimer(); + + cudaMemcpy(odata, scan_array, n * sizeof(int), cudaMemcpyDeviceToHost); + cudaFree(scan_array); } /** @@ -31,10 +85,59 @@ namespace StreamCompaction { * @returns The number of elements remaining after compaction. */ int compact(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + // TODO + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + int size = pow(2, ilog2ceil(n)); + + int* iarray; + int* bool_array; + int* scan_array; + cudaMalloc((void**)&iarray, size * sizeof(int)); + cudaMalloc((void**)&bool_array, size * sizeof(int)); + cudaMalloc((void**)&scan_array, size * sizeof(int)); + cudaMemset(iarray, 0, n * sizeof(int)); + cudaMemcpy(iarray, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + timer().startGpuTimer(); + + // map to bool + Common::kernMapToBoolean<<>>(size, bool_array, iarray); + cudaMemcpy(scan_array, bool_array, size * sizeof(int), cudaMemcpyDeviceToDevice); + + // up sweep + for (int i = 0; i < ilog2ceil(n); i++) { + kernUpSweep <<>> (size, i, scan_array); + checkCUDAError("kernUpSweep fails."); + } + cudaMemset(scan_array + size - 1, 0, sizeof(int)); + + // down sweep + for (int i = ilog2ceil(n) - 1; i >= 0; i--) { + kernDownSweep <<>> (size, i, scan_array); + checkCUDAError("kernDownSweep fails."); + } + + // scatter + int* oarray; + cudaMalloc((void**)&oarray, size * sizeof(int)); + Common::kernScatter <<>>(size, oarray, iarray, bool_array, scan_array); + timer().endGpuTimer(); - return -1; + + int a = 0; + int b = 0; + cudaMemcpy(&a, scan_array + size - 1, sizeof(int), cudaMemcpyDeviceToHost); + cudaMemcpy(&b, bool_array + size - 1, sizeof(int), cudaMemcpyDeviceToHost); + int cnt = a + b; + cudaMemcpy(odata, oarray, cnt * sizeof(int), cudaMemcpyDeviceToHost); + + cudaFree(iarray); + cudaFree(oarray); + cudaFree(bool_array); + cudaFree(scan_array); + + return cnt; } } } diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu index 4308876..a758b78 100644 --- a/stream_compaction/naive.cu +++ b/stream_compaction/naive.cu @@ -11,15 +11,45 @@ namespace StreamCompaction { static PerformanceTimer timer; return timer; } + // TODO: __global__ + __global__ void kernNaiveScan(int *idata, int *odata, int n, int offset) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= n) return; + if (index >= offset) { + odata[index] = idata[index] + idata[index - offset]; + } + else { + odata[index] = idata[index]; + } + } /** * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); // TODO + int blockSize = 128; + dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize); + + int* iarray; + int* oarray; + cudaMalloc((void**)&iarray, n * sizeof(int)); + cudaMalloc((void**)&oarray, n * sizeof(int)); + cudaMemcpy(iarray, idata, n * sizeof(int), cudaMemcpyHostToDevice); + + timer().startGpuTimer(); + for (int i = 1; i <= ilog2ceil(n); i++) { + int offset = pow(2, i - 1); + kernNaiveScan <<>> (iarray, oarray, n, offset); + cudaMemcpy(iarray, oarray, n * sizeof(int), cudaMemcpyDeviceToDevice); + } timer().endGpuTimer(); + + cudaMemcpy(odata + 1, oarray, n * sizeof(int), cudaMemcpyDeviceToHost); + odata[0] = 0; + cudaFree(iarray); + cudaFree(oarray); } } } diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu index 1def45e..282bbf1 100644 --- a/stream_compaction/thrust.cu +++ b/stream_compaction/thrust.cu @@ -18,11 +18,19 @@ namespace StreamCompaction { * Performs prefix-sum (aka scan) on idata, storing the result into odata. */ void scan(int n, int *odata, const int *idata) { - timer().startGpuTimer(); + // TODO use `thrust::exclusive_scan` // example: for device_vectors dv_in and dv_out: // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); + + thrust::device_vector dv_in(idata, idata+n); + thrust::device_vector dv_out(n); + + timer().startGpuTimer(); + thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin()); timer().endGpuTimer(); + + thrust::copy(dv_out.begin(), dv_out.end(), odata); } } }