diff --git a/README.md b/README.md index 98dd9a8..756fae8 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,72 @@ **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Daniel Krupka +* Tested on: Debian testing (stretch), Intel(R) Core(TM) i7-4710HQ CPU @ 2.50GHz 8GB, GTX 850M -### (TODO: Your README) +# Project 1 - Boids +This project's goal is to implement [Boids](https://en.wikipedia.org/wiki/Boids) using CUDA, +and to explore a few optimizations that can be made to the naive algorithm. To summarize, +the Boids algorithm implements flocking behavior as seen in birds as an emergent behavior +from a few simple rules. The algorithm is embarrassingly parallel, as the behavior of each +Boid depends only on the previous state of the system. Details of the assignment can be found + [here](INSTRUCTION.md). + +![Boids Demo](images/boid_demo.gif "Boids Demo") + +# Optimization + +The naive implementation checks every Boid. However, when the search space is large, +most Boids will not be nearby and have no effect. An effective solution to this is +to divide the search space with a grid whose cells are near the size of the Boid range +of influence, and only search neighboring grid cells. On a GPU, where data is most +effectively represented by arrays of values, this is performed by giving each Boid and grid cell an integer index, +and sorting the Boid indices by their corresponding grid cell index. Then, it is simple to find the slice of the Boid index array corresponding +to a given cell. + +However, position and velocity are *not* sorted, requiring some index lookups to +find the spatial data for each boid. This is solved by shuffling the position and velocity arrays +to the same order as the Boid index array, making the data coherent and saving lookup time. 
+ +# Profiling + +Optimizations were tested by running each implementation, referred to as 'brute force', +'uniform', and 'coherent', on Boid counts ranging from 5,000 to 1,000,000. Implementations +were also tested on CUDA block sizes ranging from 128 threads to 1,024 threads. Single-step +execution time was smoothed by an infinite horizon filter with alpha=0.95. + +## Peak and Steady-State Rates + +Preliminary probe runs showed that, in all implementations, simulation speed peaked early, +then settled into a steady state. This can be explained by noting that, as the simulation +progresses, the Boids 'clump', resulting in more Boids in a given Boid's neighborhood than +there are in the dispersed initial state. + +![Single Time Step Plot](images/single_iter.png "Single Time Step") + +The plot of step time versus Boid count shows a weak influence of Boid count on peak rate, but +a strong influence on the steady-state, supporting the notion that cluster size is responsible +for speed degradation. + +## Block Size + +As the naive implementation failed at fairly low Boid counts, further testing +was performed only on the Uniform and Coherent methods. + +![Block Size Comparison](images/blk_compare.png "Block Size Comparison") + +The plot shows single step execution time of each block size, relative to a base +size of 128 threads. In both cases, larger block sizes were no better, with substantial +degradation in the Uniform case. This is likely due to scattered memory accesses +rendering the GPU's limited caching ineffective. Additionally, both implementations +were substantially slowed by a 768 thread block, possibly due to the non-power-of-two +number of threads not mapping well to available hardware. 
+ +## Coherence + +![Coherence Comparison](images/coherent_uniform_compare.png "Coherence Comparison") + +Direct comparison between the non-coherent and coherent implementations further +supports the importance of memory coherence, with the coherent version showing a +40% improvement in step time at large Boid counts. -Include screenshots, analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) diff --git a/images/blk_compare.png b/images/blk_compare.png new file mode 100644 index 0000000..b5ad61e Binary files /dev/null and b/images/blk_compare.png differ diff --git a/images/boid_demo.gif b/images/boid_demo.gif new file mode 100644 index 0000000..0bcdacb Binary files /dev/null and b/images/boid_demo.gif differ diff --git a/images/coherent_uniform_compare.png b/images/coherent_uniform_compare.png new file mode 100644 index 0000000..3be750d Binary files /dev/null and b/images/coherent_uniform_compare.png differ diff --git a/images/single_iter.png b/images/single_iter.png new file mode 100644 index 0000000..01c3164 Binary files /dev/null and b/images/single_iter.png differ diff --git a/prof/profRun.sh b/prof/profRun.sh new file mode 100755 index 0000000..3205423 --- /dev/null +++ b/prof/profRun.sh @@ -0,0 +1,7 @@ +#!/bin/bash + +echo "$1 = [" +for n in 5000 10000 15000 20000 25000 50000 75000 100000 250000 500000 1000000; do + build/cis565_boids $n 2> /dev/null +done +echo "]" diff --git a/prof/runData.py b/prof/runData.py new file mode 100644 index 0000000..e8b7775 --- /dev/null +++ b/prof/runData.py @@ -0,0 +1,271 @@ +from matplotlib import pyplot as plt +import matplotlib.lines as mlines + +uniform = {} +bruteForce = {} +coherent = {} + +uniform[128] = [ +[5000,383.656957,624.735748], +[10000,419.100000,903.498103], +[15000,452.350000,1254.570835], +[20000,509.000000,1659.217137], +[25000,584.000000,1991.820693], +[50000,853.000000,4176.786314], +[75000,1164.000000,6912.989198], 
+[100000,860.000000,9940.314287], +[250000,2847.000000,30302.529010], +[500000,3981.000000,98495.401126], +[1000000,6780.000000,332754.849440], +] +uniform[256] = [ +[5000,383.335023,632.444092], +[10000,412.500000,915.473426], +[15000,454.950000,1255.105734], +[20000,518.000000,1679.717623], +[25000,581.000000,2039.227647], +[50000,848.000000,4179.722180], +[75000,1176.000000,6934.372327], +[100000,869.000000,9952.324545], +[250000,2816.000000,30327.787594], +[500000,3965.000000,98674.105457], +[1000000,6795.000000,332020.133592], +] +uniform[512] = [ +[5000,385.891676,639.369672], +[10000,417.250000,920.228329], +[15000,456.650000,1266.591706], +[20000,511.000000,1696.459238], +[25000,593.000000,2050.577559], +[50000,843.000000,4250.495830], +[75000,1164.000000,6984.192784], +[100000,865.000000,9969.864876], +[250000,2808.000000,30414.321448], +[500000,3962.000000,99746.753330], +[1000000,6783.000000,331621.236897], +] +uniform[768] = [ +[5000,415.881824,833.147383], +[10000,414.250000,1186.176995], +[15000,455.050000,1612.495266], +[20000,506.000000,2198.307857], +[25000,585.000000,2527.516943], +[50000,835.000000,5176.156233], +[75000,1162.000000,8199.426148], +[100000,903.000000,11660.588738], +[250000,2842.000000,32719.868674], +[500000,3957.000000,114974.955343], +[1000000,6794.000000,374045.896418], +] +uniform[1024] = [ +[5000,383.236620,636.124498], +[10000,416.300000,898.264101], +[15000,453.450000,1268.548150], +[20000,511.000000,1717.535533], +[25000,582.000000,2082.039087], +[50000,841.000000,4266.817845], +[75000,1164.000000,7033.642933], +[100000,1120.000000,10079.411187], +[250000,2853.000000,30541.178775], +[500000,4011.000000,100375.362867], +[1000000,6792.000000,332025.727319], +] +coherent[128] = [ +[5000,386.640039,579.540401], +[10000,432.467500,830.854868], +[15000,457.700000,1096.861468], +[20000,511.650000,1454.444801], +[25000,588.000000,1918.377965], +[50000,848.000000,3487.051416], +[75000,1160.000000,5368.146619], 
+[100000,912.000000,7227.005568], +[250000,2869.000000,18207.942247], +[500000,3996.000000,57475.470061], +[1000000,6860.000000,212690.564596], +] +coherent[256] = [ +[5000,385.174786,581.196897], +[10000,417.550000,831.696793], +[15000,457.850000,1108.747156], +[20000,529.150000,1440.820549], +[25000,588.000000,1908.937970], +[50000,859.000000,3486.965248], +[75000,1160.000000,5394.183790], +[100000,873.000000,7270.980267], +[250000,2857.000000,18222.656499], +[500000,3972.000000,57827.861638], +[1000000,6855.000000,212859.542813], +] +coherent[512] = [ +[5000,393.391149,585.138622], +[10000,419.450000,829.729229], +[15000,458.950000,1098.056921], +[20000,515.250000,1473.669385], +[25000,589.000000,1788.153907], +[50000,918.000000,3516.217751], +[75000,1168.000000,5432.469845], +[100000,868.000000,7296.062158], +[250000,3984.000000,18211.412190], +[500000,4439.000000,58252.685316], +[1000000,7245.000000,213454.071366], +] +coherent[768] = [ +[5000,412.840833,764.167344], +[10000,440.400000,1009.650819], +[15000,459.850000,1288.873454], +[20000,521.000000,1869.933998], +[25000,590.000000,2161.145613], +[50000,847.000000,4139.408600], +[75000,1191.000000,6331.838109], +[100000,879.000000,8479.993987], +[250000,3222.000000,21434.089841], +[500000,4101.000000,69428.011576], +[1000000,6780.000000,249272.889109], +] +coherent[1024] = [ +[5000,387.201838,584.761264], +[10000,422.250000,838.071770], +[15000,454.350000,1114.856206], +[20000,513.000000,1473.060754], +[25000,590.000000,1779.910442], +[50000,851.000000,3549.825770], +[75000,1303.000000,5450.188405], +[100000,869.000000,7247.422401], +[250000,2905.000000,18090.378486], +[500000,3962.000000,58350.514909], +[1000000,6860.000000,213256.116817], +] +bruteForce[128] =[ + [5000, 1000000.0/178, 1000000.0/147], + [10000, 1000000.0/151, 1000000.0/36.2], + [15000, 1000000.0/145, 1000000.0/16.2], + [20000, 1000000.0/138, 1000000.0/9.0], +] + +bruteForce[256] =[ + [5000, 1000000.0/166.8, 1000000.0/149], + [10000, 
1000000.0/151.3, 1000000.0/38.2], + [15000, 1000000.0/143.6, 1000000.0/17.0], + [20000, 1000000.0/139.2, 1000000.0/9.6], +] + +bruteForce[512] =[ + [5000, 1000000.0/168.5, 1000000.0/147.5], + [10000, 1000000.0/149.8, 1000000.0/38.2], + [15000, 1000000.0/145.5, 1000000.0/17.0], + [20000, 1000000.0/140.4, 1000000.0/9.6], +] + +bruteForce[768] =[ + [5000, 1000000.0/160.8, 1000000.0/95.1], + [10000, 1000000.0/150.8, 1000000.0/31.2], + [15000, 1000000.0/142.8, 1000000.0/15.4], + [20000, 1000000.0/133.2, 1000000.0/7.8], +] + +bruteForce[1024] =[ + [5000, 1000000.0/168.3, 1000000.0/147.3], + [10000, 1000000.0/151.4, 1000000.0/37.8], + [15000, 1000000.0/144.1, 1000000.0/17.0], + [20000, 1000000.0/139.7, 1000000.0/9.6], +] + +dsets = [ + (bruteForce, 'bruteForce', 'r'), + (uniform, 'uniform', 'g'), + (coherent, 'coherent', (1,0,1)), +] + +### OVERALL COMPARISON +plt.figure() +legend = [] +for dset,name,c in dsets: + for sz in dset: + x,yPk,ySt = zip(*dset[sz]) + plt.loglog(x,[t/1000 for t in ySt],color=c) + plt.loglog(x,[t/1000 for t in yPk],color=c,ls='dashed') + line = mlines.Line2D([], [], color=c, label=name+' (peak)', ls='dashed') + legend.append(line) + line = mlines.Line2D([], [], color=c, label=name+' (steady)') + legend.append(line) + +plt.legend(handles=legend, loc=4) +plt.xlabel('# of boids') +plt.ylabel('t (ms)') +#plt.axvline(x=5000.,color='k',ls='dashed') +plt.title('Single iteration time') +plt.xlim([5000,1000000]) + +plt.figure() +szList = sorted(uniform.keys()) +sz0 = szList[0] +szList = szList[1:] +x0,yPk0,ySt0 = zip(*dset[sz0]) + +plt.subplot(221) +hPkList = [] +for sz in szList: + x,yPk,ySt = zip(*uniform[sz]) + rPk = [float(a)/a0 for a,a0 in zip(yPk,yPk0)] + hPk, = plt.semilogx(x,rPk,label='blk=%d'%sz) + hPkList.append(hPk) +plt.legend(handles=hPkList, loc=2) +plt.xlim([5000,1000000]) +plt.title("'Uniform' peak performance") +plt.xlabel('# of boids') +plt.ylabel('t(blk)/t(128)') + +plt.subplot(222) +hStList = [] +for sz in szList: + x,yPk,ySt = 
zip(*uniform[sz]) + rSt = [float(a)/a0 for a,a0 in zip(ySt,ySt0)] + hSt, = plt.semilogx(x,rSt,label='blk=%d'%sz) + hStList.append(hSt) +plt.legend(handles=hStList, loc=2) +plt.xlim([5000,1000000]) +plt.title("'Uniform' steady state performance") +plt.xlabel('# of boids') +plt.ylabel('t(blk)/t(128)') + +plt.subplot(223) +hPkList = [] +for sz in szList: + x,yPk,ySt = zip(*coherent[sz]) + rPk = [float(a)/a0 for a,a0 in zip(yPk,yPk0)] + hPk, = plt.semilogx(x,rPk,label='blk=%d'%sz) + hPkList.append(hPk) +plt.legend(handles=hPkList, loc=2) +plt.xlim([5000,1000000]) +plt.title("'Coherent' peak performance") +plt.xlabel('# of boids') +plt.ylabel('t(blk)/t(128)') + +plt.subplot(224) +hStList = [] +for sz in szList: + x,yPk,ySt = zip(*coherent[sz]) + rSt = [float(a)/a0 for a,a0 in zip(ySt,ySt0)] + hSt, = plt.semilogx(x,rSt,label='blk=%d'%sz) + hStList.append(hSt) +plt.legend(handles=hStList, loc=1) +plt.xlim([5000,1000000]) +plt.title("'Coherent' steady state performance") +plt.xlabel('# of boids') +plt.ylabel('t(blk)/t(128)') + +plt.figure() +hList = [] +for sz in szList: + x,yPk1,ySt1 = zip(*uniform[sz]) + x,yPk2,ySt2 = zip(*coherent[sz]) + rPk = [float(a)/a0 for a,a0 in zip(ySt2,ySt1)] + hPk, = plt.semilogx(x,rPk,label='blk=%d'%sz) + hList.append(hPk) +plt.legend(handles=hList, loc=1) +plt.xlim([5000,1000000]) +plt.title("'Coherent' vs 'Uniform' steady state performance") +plt.xlabel('# of boids') +plt.ylabel('t(coherent)/t(uniform)') + +plt.show() diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index fdd636d..dff0113 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -10,5 +10,5 @@ set(SOURCE_FILES cuda_add_library(src ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_50 ) diff --git a/src/glslUtility.cpp b/src/glslUtility.cpp index 1f9c716..ea57da0 100644 --- a/src/glslUtility.cpp +++ b/src/glslUtility.cpp @@ -33,11 +33,11 @@ char* loadFile(const char *fname, GLint &fSize) { file.seekg(0, ios::beg); file.read(memblock, size); file.close(); - 
std::cout << "file " << fname << " loaded" << std::endl; + std::cerr << "file " << fname << " loaded" << std::endl; return memblock; } - std::cout << "Unable to open file " << fname << std::endl; + std::cerr << "Unable to open file " << fname << std::endl; exit(1); } @@ -55,7 +55,7 @@ void printShaderInfoLog(GLint shader) { infoLog = new GLchar[infoLogLen]; // error check for fail to allocate memory omitted glGetShaderInfoLog(shader, infoLogLen, &charsWritten, infoLog); - std::cout << "InfoLog:" << std::endl << infoLog << std::endl; + std::cerr << "InfoLog:" << std::endl << infoLog << std::endl; delete [] infoLog; } } @@ -71,7 +71,7 @@ void printLinkInfoLog(GLint prog) { infoLog = new GLchar[infoLogLen]; // error check for fail to allocate memory omitted glGetProgramInfoLog(prog, infoLogLen, &charsWritten, infoLog); - std::cout << "InfoLog:" << std::endl << infoLog << std::endl; + std::cerr << "InfoLog:" << std::endl << infoLog << std::endl; delete [] infoLog; } } @@ -112,7 +112,7 @@ shaders_t loadShaders(const char * vert_path, const char * geom_path, const char glCompileShader(v); glGetShaderiv(v, GL_COMPILE_STATUS, &compiled); if (!compiled) { - std::cout << "Vertex shader not compiled." << std::endl; + std::cerr << "Vertex shader not compiled." << std::endl; } printShaderInfoLog(v); @@ -120,7 +120,7 @@ shaders_t loadShaders(const char * vert_path, const char * geom_path, const char glCompileShader(g); glGetShaderiv(g, GL_COMPILE_STATUS, &compiled); if (!compiled) { - std::cout << "Geometry shader not compiled." << std::endl; + std::cerr << "Geometry shader not compiled." << std::endl; } printShaderInfoLog(g); } @@ -128,7 +128,7 @@ shaders_t loadShaders(const char * vert_path, const char * geom_path, const char glCompileShader(f); glGetShaderiv(f, GL_COMPILE_STATUS, &compiled); if (!compiled) { - std::cout << "Fragment shader not compiled." << std::endl; + std::cerr << "Fragment shader not compiled." 
<< std::endl; } printShaderInfoLog(f); @@ -157,7 +157,7 @@ void attachAndLinkProgram(GLuint program, shaders_t shaders) { GLint linked; glGetProgramiv(program, GL_LINK_STATUS, &linked); if (!linked) { - std::cout << "Program did not link." << std::endl; + std::cerr << "Program did not link." << std::endl; } printLinkInfoLog(program); } diff --git a/src/kernel.cu b/src/kernel.cu index aaf0fbf..09e04bf 100644 --- a/src/kernel.cu +++ b/src/kernel.cu @@ -37,7 +37,7 @@ void checkCUDAError(const char *msg, int line = -1) { *****************/ /*! Block size used for CUDA kernel launch. */ -#define blockSize 128 +#define blockSize 1024 // LOOK-1.2 Parameters for the boids algorithm. // These worked well in our reference implementation. @@ -66,7 +66,8 @@ dim3 threadsPerBlock(blockSize); // Consider why you would need two velocity buffers in a simulation where each // boid cares about its neighbors' velocities. // These are called ping-pong buffers. -glm::vec3 *dev_pos; +glm::vec3 *dev_pos1; +glm::vec3 *dev_pos2; glm::vec3 *dev_vel1; glm::vec3 *dev_vel2; @@ -74,7 +75,7 @@ glm::vec3 *dev_vel2; // pointers on your own too. // For efficient sorting and the uniform grid. These should always be parallel. -int *dev_particleArrayIndices; // What index in dev_pos and dev_velX represents this particle? +int *dev_particleArrayIndices; // What index in dev_pos1 and dev_velX represents this particle? int *dev_particleGridIndices; // What grid cell is this particle in? // needed for use with thrust thrust::device_ptr dev_thrust_particleArrayIndices; @@ -142,8 +143,11 @@ void Boids::initSimulation(int N) { // LOOK-1.2 - This is basic CUDA memory management and error checking. // Don't forget to cudaFree in Boids::endSimulation. 
- cudaMalloc((void**)&dev_pos, N * sizeof(glm::vec3)); - checkCUDAErrorWithLine("cudaMalloc dev_pos failed!"); + cudaMalloc((void**)&dev_pos1, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_pos1 failed!"); + + cudaMalloc((void**)&dev_pos2, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_pos2 failed!"); cudaMalloc((void**)&dev_vel1, N * sizeof(glm::vec3)); checkCUDAErrorWithLine("cudaMalloc dev_vel1 failed!"); @@ -153,9 +157,14 @@ void Boids::initSimulation(int N) { // LOOK-1.2 - This is a typical CUDA kernel invocation. kernGenerateRandomPosArray<<>>(1, numObjects, - dev_pos, scene_scale); + dev_pos1, scene_scale); checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!"); + glm::vec3 *zero = new glm::vec3[N]; + cudaMemcpy(dev_vel1, zero, N*sizeof(glm::vec3), cudaMemcpyHostToDevice); + cudaMemcpy(dev_vel2, zero, N*sizeof(glm::vec3), cudaMemcpyHostToDevice); + delete zero; + // LOOK-2.1 computing grid params gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance); int halfSideCount = (int)(scene_scale / gridCellWidth) + 1; @@ -169,7 +178,15 @@ void Boids::initSimulation(int N) { gridMinimum.z -= halfGridWidth; // TODO-2.1 TODO-2.3 - Allocate additional buffers here. 
+ cudaMalloc((void**)&dev_particleArrayIndices, N*sizeof(int)); + cudaMalloc((void**)&dev_particleGridIndices, N*sizeof(int)); + cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount*sizeof(int)); + cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount*sizeof(int)); cudaThreadSynchronize(); + + dev_thrust_particleArrayIndices = thrust::device_ptr(dev_particleArrayIndices); + dev_thrust_particleGridIndices = thrust::device_ptr(dev_particleGridIndices); + } @@ -210,7 +227,7 @@ __global__ void kernCopyVelocitiesToVBO(int N, glm::vec3 *vel, float *vbo, float void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) { dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); - kernCopyPositionsToVBO << > >(numObjects, dev_pos, vbodptr_positions, scene_scale); + kernCopyPositionsToVBO << > >(numObjects, dev_pos1, vbodptr_positions, scene_scale); kernCopyVelocitiesToVBO << > >(numObjects, dev_vel1, vbodptr_velocities, scene_scale); checkCUDAErrorWithLine("copyBoidsToVBO failed!"); @@ -233,7 +250,41 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves // Rule 2: boids try to stay a distance d away from each other // Rule 3: boids try to match the speed of surrounding boids - return glm::vec3(0.0f, 0.0f, 0.0f); + + glm::vec3 ctr(0.0); + glm::vec3 v1(0.0), v2(0.0), v3(0.0); + float k1=0.0, k2=0.0, k3=0.0; + for (int i = 0; i < N; i++) { + if (i == iSelf) + continue; + + float dist = glm::length(pos[i] - pos[iSelf]); + + if (dist < rule1Distance) { + ctr += pos[i]; + k1++; + } + + if (dist < rule2Distance) { + v2 += pos[iSelf] - pos[i]; + k2++; + } + + if (dist < rule3Distance) { + v3 += vel[i]; + k3++; + } + } + + glm::vec3 dVel(0.0); + if (k1 > 0) + dVel += rule1Scale * (ctr/k1 - pos[iSelf]); + if (k2 > 0) + dVel += rule2Scale * v2; + if (k3 > 0) + dVel += rule3Scale * (v3/k3 - vel[iSelf]); + + return dVel; } /** 
@@ -245,6 +296,18 @@ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos, // Compute a new velocity based on pos and vel1 // Clamp the speed // Record the new velocity into vel2. Question: why NOT vel1? + + // get the self index + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) + return; + + glm::vec3 dVel = computeVelocityChange(N, index, pos, vel1); + vel2[index] += dVel; + float speed = glm::length(vel2[index]); + if (speed > maxSpeed) + vel2[index] *= maxSpeed / speed; + } /** @@ -285,10 +348,22 @@ __device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) { __global__ void kernComputeIndices(int N, int gridResolution, glm::vec3 gridMin, float inverseCellWidth, glm::vec3 *pos, int *indices, int *gridIndices) { - // TODO-2.1 - // - Label each boid with the index of its grid cell. - // - Set up a parallel array of integer indices as pointers to the actual - // boid data in pos and vel1/vel2 + // TODO-2.1 + // - Label each boid with the index of its grid cell. + // - Set up a parallel array of integer indices as pointers to the actual + // boid data in pos and vel1/vel2 + + // this is run before sorting + + int boidIdx = blockIdx.x * blockDim.x + threadIdx.x; + if (boidIdx >= N) + return; + + glm::ivec3 gridIdx3 = (pos[boidIdx]-gridMin)*inverseCellWidth; + int gridIdx = gridIndex3Dto1D(gridIdx3.x, gridIdx3.y, gridIdx3.z, + gridResolution); + gridIndices[boidIdx] = gridIdx; + indices[boidIdx] = boidIdx; } // LOOK-2.1 Consider how this could be useful for indicating that a cell @@ -306,14 +381,34 @@ __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices, // Identify the start point of each cell in the gridIndices array. // This is basically a parallel unrolling of a loop that goes // "this index doesn't match the one before it, must be a new cell!" 
+ + // this is post-sorting + + int boidIdx = blockIdx.x * blockDim.x + threadIdx.x; + if (boidIdx >= N) + return; + + int boidCell = particleGridIndices[boidIdx]; + + // short circuited, adjacent index checks only happen if not at + // array boundary + + if (boidIdx == 0 || boidCell != particleGridIndices[boidIdx-1]) { + gridCellStartIndices[boidCell] = boidIdx; + } + + if (boidIdx == N-1 || boidCell != particleGridIndices[boidIdx+1]) { + gridCellEndIndices[boidCell] = boidIdx; + } + } __global__ void kernUpdateVelNeighborSearchScattered( - int N, int gridResolution, glm::vec3 gridMin, - float inverseCellWidth, float cellWidth, - int *gridCellStartIndices, int *gridCellEndIndices, - int *particleArrayIndices, - glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { + int N, int gridResolution, glm::vec3 gridMin, + float inverseCellWidth, float cellWidth, + int *gridCellStartIndices, int *gridCellEndIndices, + int *particleArrayIndices, + glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce // the number of boids that need to be checked. // - Identify the grid cell that this particle is in @@ -322,33 +417,202 @@ __global__ void kernUpdateVelNeighborSearchScattered( // - Access each boid in the cell and compute velocity change from // the boids rules, if this boid is within the neighborhood distance. 
// - Clamp the speed change before putting the new speed in vel2 + + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= N) + return; + int boidIdx = particleArrayIndices[idx]; + + // get the grid index and offset + glm::vec3 fGridIdx3; + glm::vec3 relPos = glm::modf((pos[boidIdx]-gridMin) * inverseCellWidth, fGridIdx3); + relPos = cellWidth * relPos + gridMin / float(gridResolution); + glm::ivec3 gridIdx3 = (glm::ivec3)fGridIdx3; + int gridIdx = gridIndex3Dto1D(gridIdx3.x, gridIdx3.y, gridIdx3.z, + gridResolution); + + // find which adjacent cells to search + glm::ivec3 searchIdx(0,0,0); + for (int i = 0; i < 3; i++) { + if (gridIdx3[i] > 0 && relPos[i] < 0) + searchIdx[i] = -1; + if (gridIdx3[i] < gridResolution-1 && relPos[i] > 0) + searchIdx[i] = 1; + } + + + // search all extant adjacent cells + glm::ivec3 sMin = glm::min(gridIdx3 + searchIdx, gridIdx3); + glm::ivec3 sMax = glm::max(gridIdx3 + searchIdx, gridIdx3); + glm::vec3 ctr(0.0); + glm::vec3 v1(0.0), v2(0.0), v3(0.0); + float k1=0.0, k2=0.0, k3=0.0; + for (int k = sMin.z; k <= sMax.z; k++) { + for (int j = sMin.y; j <= sMax.y; j++) { + for (int i = sMin.x; i <= sMax.x; i++) { + gridIdx = gridIndex3Dto1D(i,j,k,gridResolution); + int cellStart = gridCellStartIndices[gridIdx]; + int cellEnd = gridCellEndIndices[gridIdx]; + if (cellStart < 0 || cellEnd < 0) + continue; + + for (int p = cellStart; p <= cellEnd; p++) { + int pBoid = particleArrayIndices[p]; + if (pBoid == boidIdx) + continue; + + float dist = glm::length(pos[pBoid] - pos[boidIdx]); + + if (dist < rule1Distance) { + ctr += pos[pBoid]; + k1++; + } + + if (dist < rule2Distance) { + v2 += pos[boidIdx] - pos[pBoid]; + k2++; + } + + if (dist < rule3Distance) { + v3 += vel1[pBoid]; + k3++; + } + } + }}} + + // total velocity change + glm::vec3 nVel = vel1[boidIdx]; + if (k1 > 0) + nVel += rule1Scale * (ctr/k1 - pos[boidIdx]); + if (k2 > 0) + nVel += rule2Scale * v2; + if (k3 > 0) + nVel += rule3Scale * (v3/k3 - vel1[boidIdx]); + + float 
speed = glm::length(nVel); + if (speed > maxSpeed) + nVel *= maxSpeed / speed; + vel2[boidIdx] = nVel; +} + +// shuffle data1 into data2 according to a given order +__global__ void kernShuffleToOrder(int N, glm::vec3 *data1, glm::vec3 *data2, int *order) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= N) + return; + int src = order[idx]; + + data2[idx] = data1[src]; } __global__ void kernUpdateVelNeighborSearchCoherent( - int N, int gridResolution, glm::vec3 gridMin, - float inverseCellWidth, float cellWidth, - int *gridCellStartIndices, int *gridCellEndIndices, - glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { - // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered, - // except with one less level of indirection. - // This should expect gridCellStartIndices and gridCellEndIndices to refer - // directly to pos and vel1. + int N, int gridResolution, glm::vec3 gridMin, + float inverseCellWidth, float cellWidth, + int *gridCellStartIndices, int *gridCellEndIndices, + glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { + // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce + // the number of boids that need to be checked. // - Identify the grid cell that this particle is in // - Identify which cells may contain neighbors. This isn't always 8. // - For each cell, read the start/end indices in the boid pointer array. - // DIFFERENCE: For best results, consider what order the cells should be - // checked in to maximize the memory benefits of reordering the boids data. // - Access each boid in the cell and compute velocity change from // the boids rules, if this boid is within the neighborhood distance. 
// - Clamp the speed change before putting the new speed in vel2 + + int boidIdx = blockIdx.x * blockDim.x + threadIdx.x; + if (boidIdx >= N) + return; + + // get the grid index and offset + glm::vec3 fGridIdx3; + glm::vec3 relPos = glm::modf((pos[boidIdx]-gridMin) * inverseCellWidth, fGridIdx3); + relPos = cellWidth * relPos + gridMin / float(gridResolution); + glm::ivec3 gridIdx3 = (glm::ivec3)fGridIdx3; + int gridIdx = gridIndex3Dto1D(gridIdx3.x, gridIdx3.y, gridIdx3.z, + gridResolution); + + // find which adjacent cells to search + glm::ivec3 searchIdx(0,0,0); + for (int i = 0; i < 3; i++) { + if (gridIdx3[i] > 0 && relPos[i] < 0) + searchIdx[i] = -1; + if (gridIdx3[i] < gridResolution-1 && relPos[i] > 0) + searchIdx[i] = 1; + } + + + // search all extant adjacent cells + glm::ivec3 sMin = glm::min(gridIdx3 + searchIdx, gridIdx3); + glm::ivec3 sMax = glm::max(gridIdx3 + searchIdx, gridIdx3); + glm::vec3 ctr(0.0); + glm::vec3 v1(0.0), v2(0.0), v3(0.0); + float k1=0.0, k2=0.0, k3=0.0; + for (int k = sMin.z; k <= sMax.z; k++) { + for (int j = sMin.y; j <= sMax.y; j++) { + for (int i = sMin.x; i <= sMax.x; i++) { + gridIdx = gridIndex3Dto1D(i,j,k,gridResolution); + int cellStart = gridCellStartIndices[gridIdx]; + int cellEnd = gridCellEndIndices[gridIdx]; + if (cellStart < 0 || cellEnd < 0) + continue; + + for (int pBoid = cellStart; pBoid <= cellEnd; pBoid++) { + if (pBoid == boidIdx) + continue; + + float dist = glm::length(pos[pBoid] - pos[boidIdx]); + + if (dist < rule1Distance) { + ctr += pos[pBoid]; + k1++; + } + + if (dist < rule2Distance) { + v2 += pos[boidIdx] - pos[pBoid]; + k2++; + } + + if (dist < rule3Distance) { + v3 += vel1[pBoid]; + k3++; + } + } + }}} + + // total velocity change + glm::vec3 nVel = vel1[boidIdx]; + if (k1 > 0) + nVel += rule1Scale * (ctr/k1 - pos[boidIdx]); + if (k2 > 0) + nVel += rule2Scale * v2; + if (k3 > 0) + nVel += rule3Scale * (v3/k3 - vel1[boidIdx]); + + float speed = glm::length(nVel); + if (speed > maxSpeed) + nVel *= 
maxSpeed / speed; + vel2[boidIdx] = nVel; } + /** * Step the entire N-body simulation by `dt` seconds. */ void Boids::stepSimulationNaive(float dt) { - // TODO-1.2 - use the kernels you wrote to step the simulation forward in time. - // TODO-1.2 ping-pong the velocity buffers +// TODO-1.2 - use the kernels you wrote to step the simulation forward in time. +// TODO-1.2 ping-pong the velocity buffers + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + + //std::cout << "naive step" << std::endl; + + kernUpdateVelocityBruteForce<<<fullBlocksPerGrid, blockSize>>>(numObjects, + dev_pos1, dev_vel1, dev_vel2); + checkCUDAErrorWithLine("kernUpdateVelocityBruteForce failed!"); + + cudaMemcpy(dev_vel1, dev_vel2, numObjects * sizeof(glm::vec3), cudaMemcpyDeviceToDevice); + + kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos1, dev_vel1); + checkCUDAErrorWithLine("kernUpdatePos failed!"); } void Boids::stepSimulationScatteredGrid(float dt) { @@ -364,6 +628,49 @@ void Boids::stepSimulationScatteredGrid(float dt) { // - Perform velocity updates using neighbor search // - Update positions // - Ping-pong buffers as needed + + //std::cout << "scatteredGrid step" << std::endl; + + dim3 boid_fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + dim3 cell_fullBlocksPerGrid((gridCellCount + blockSize - 1) / blockSize); + + // initialize indices + kernComputeIndices<<<boid_fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, + gridMinimum, gridInverseCellWidth, dev_pos1, + dev_particleArrayIndices, dev_particleGridIndices); + checkCUDAErrorWithLine("kernComputeIndices failed!"); + + // unstable sort + thrust::sort_by_key(dev_thrust_particleGridIndices, + dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices); + + // initialize all grid cells to 'unoccupied' + kernResetIntBuffer<<<cell_fullBlocksPerGrid, blockSize>>>(gridCellCount, + dev_gridCellStartIndices, -7); + kernResetIntBuffer<<<cell_fullBlocksPerGrid, blockSize>>>(gridCellCount, + dev_gridCellEndIndices, -7); + + // find cell boundaries + kernIdentifyCellStartEnd<<<boid_fullBlocksPerGrid, blockSize>>>(numObjects, + dev_particleGridIndices, 
dev_gridCellStartIndices, dev_gridCellEndIndices); + checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!"); + + // perform flocking rules + kernUpdateVelNeighborSearchScattered<<<boid_fullBlocksPerGrid, blockSize>>>( + numObjects, gridSideCount, gridMinimum, + gridInverseCellWidth, gridCellWidth, + dev_gridCellStartIndices, dev_gridCellEndIndices, + dev_particleArrayIndices, + dev_pos1, dev_vel1, dev_vel2); + checkCUDAErrorWithLine("kernUpdateVelNeighborSearchScattered failed!"); + + // update positions + kernUpdatePos<<<boid_fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos1, + dev_vel2); + checkCUDAErrorWithLine("kernUpdatePos failed!"); + + // ping-pong + cudaMemcpy(dev_vel1, dev_vel2, numObjects*sizeof(glm::vec3), cudaMemcpyDeviceToDevice); } void Boids::stepSimulationCoherentGrid(float dt) { @@ -382,12 +689,72 @@ void Boids::stepSimulationCoherentGrid(float dt) { // - Perform velocity updates using neighbor search // - Update positions // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE. + + dim3 boid_fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + dim3 cell_fullBlocksPerGrid((gridCellCount + blockSize - 1) / blockSize); + + // initialize indices + kernComputeIndices<<<boid_fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, + gridMinimum, gridInverseCellWidth, dev_pos1, + dev_particleArrayIndices, dev_particleGridIndices); + checkCUDAErrorWithLine("kernComputeIndices failed!"); + + // unstable sort + thrust::sort_by_key(dev_thrust_particleGridIndices, + dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices); + + // initialize all grid cells to 'unoccupied' + kernResetIntBuffer<<<cell_fullBlocksPerGrid, blockSize>>>(gridCellCount, + dev_gridCellStartIndices, -7); + kernResetIntBuffer<<<cell_fullBlocksPerGrid, blockSize>>>(gridCellCount, + dev_gridCellEndIndices, -7); + + // find cell boundaries + kernIdentifyCellStartEnd<<<boid_fullBlocksPerGrid, blockSize>>>(numObjects, + dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices); + checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!"); + + // reorder pos and vel + kernShuffleToOrder<<<boid_fullBlocksPerGrid, blockSize>>>(numObjects, + dev_pos1, dev_pos2, 
dev_particleArrayIndices); + checkCUDAErrorWithLine("kernShuffleToOrder failed!"); + kernShuffleToOrder<<<boid_fullBlocksPerGrid, blockSize>>>(numObjects, + dev_vel1, dev_vel2, dev_particleArrayIndices); + checkCUDAErrorWithLine("kernShuffleToOrder failed!"); + /* + thrust::gather(dev_thrust_particleArrayIndices, + dev_thrust_particleArrayIndices+numObjects, thrust_pos1, thrust_pos2); + thrust::gather(dev_thrust_particleArrayIndices, + dev_thrust_particleArrayIndices+numObjects, thrust_vel1, thrust_vel2); + */ + + // perform flocking rules + kernUpdateVelNeighborSearchCoherent<<<boid_fullBlocksPerGrid, blockSize>>>( + numObjects, gridSideCount, gridMinimum, + gridInverseCellWidth, gridCellWidth, + dev_gridCellStartIndices, dev_gridCellEndIndices, + dev_pos2, dev_vel2, dev_vel1); + checkCUDAErrorWithLine("kernUpdateVelNeighborSearchCoherent failed!"); + + // ping-pong + cudaMemcpy(dev_pos1, dev_pos2, numObjects*sizeof(glm::vec3), + cudaMemcpyDeviceToDevice); + + // update positions + kernUpdatePos<<<boid_fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos1, + dev_vel1); + checkCUDAErrorWithLine("kernUpdatePos failed!"); } void Boids::endSimulation() { cudaFree(dev_vel1); cudaFree(dev_vel2); - cudaFree(dev_pos); + cudaFree(dev_pos1); + cudaFree(dev_pos2); + cudaFree(dev_gridCellStartIndices); + cudaFree(dev_gridCellEndIndices); + cudaFree(dev_particleArrayIndices); + cudaFree(dev_particleGridIndices); // TODO-2.1 TODO-2.3 - Free any additional buffers here. 
} diff --git a/src/main.cpp b/src/main.cpp index e416836..424720e 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,11 +14,15 @@ // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID #define VISUALIZE 1 -#define UNIFORM_GRID 0 -#define COHERENT_GRID 0 +#define UNIFORM_GRID 1 +#define COHERENT_GRID 1 + +#define PROFILE_MODE 0 +#define PROFILE_TIME 45 +#define PROFILE_FRAMES 25000 // LOOK-1.2 - change this to adjust particle count in the simulation -const int N_FOR_VIS = 5000; +int N_FOR_VIS = 5000; const float DT = 0.2f; /** @@ -67,6 +71,9 @@ bool init(int argc, char **argv) { ss << projectName << " [SM " << major << "." << minor << " " << deviceProp.name << "]"; deviceName = ss.str(); + if (argc > 1) + N_FOR_VIS = atoi(argv[1]); + // Window setup stuff glfwSetErrorCallback(errorCallback); @@ -184,9 +191,9 @@ void initShaders(GLuint * program) { } //==================================== - // Main loop + // Main loop. Returns simulation time. //==================================== - void runCUDA() { + double runCUDA() { // Map OpenGL buffer object for writing from CUDA on a single GPU // No data is moved (Win & Linux). 
When mapped to CUDA, OpenGL should not // use this buffer @@ -199,6 +206,7 @@ void initShaders(GLuint * program) { cudaGLMapBufferObject((void**)&dptrVertVelocities, boidVBO_velocities); // execute the kernel + double t1 = clock(); #if UNIFORM_GRID && COHERENT_GRID Boids::stepSimulationCoherentGrid(DT); #elif UNIFORM_GRID @@ -206,6 +214,7 @@ void initShaders(GLuint * program) { #else Boids::stepSimulationNaive(DT); #endif + double t2 = clock(); #if VISUALIZE Boids::copyBoidsToVBO(dptrVertPositions, dptrVertVelocities); @@ -213,20 +222,28 @@ void initShaders(GLuint * program) { // unmap buffer object cudaGLUnmapBufferObject(boidVBO_positions); cudaGLUnmapBufferObject(boidVBO_velocities); + + return t2-t1; } void mainLoop() { double fps = 0; double timebase = 0; - int frame = 0; + int frame = 0, totFrame = 0; + double dtMin = DBL_MAX, dtPrev = 0.0; + double t0 = glfwGetTime(); + /* Boids::unitTest(); // LOOK-1.2 We run some basic example code to make sure // your CUDA development setup is ready to go. 
+ */ + double dt = 0.0, avgdt = 0.0; while (!glfwWindowShouldClose(window)) { glfwPollEvents(); frame++; + totFrame++; double time = glfwGetTime(); if (time - timebase > 1.0) { @@ -235,7 +252,15 @@ void initShaders(GLuint * program) { frame = 0; } - runCUDA(); + dt = runCUDA(); + if (totFrame == 1) + avgdt = dt; + else + avgdt = .95*avgdt + .05*dt; + if (avgdt < dtMin) + dtMin = avgdt; + if (PROFILE_MODE && (time-t0 > PROFILE_TIME || totFrame > PROFILE_FRAMES)) + break; std::ostringstream ss; ss << "["; @@ -261,6 +286,8 @@ void initShaders(GLuint * program) { } glfwDestroyWindow(window); glfwTerminate(); + + printf("[%d,%f,%f],\n", N_FOR_VIS, 1.0E6*dtMin/CLOCKS_PER_SEC, 1.0E6*avgdt/CLOCKS_PER_SEC); } diff --git a/src/main.hpp b/src/main.hpp index 6cdaa93..6a6ca5b 100644 --- a/src/main.hpp +++ b/src/main.hpp @@ -70,7 +70,7 @@ void keyCallback(GLFWwindow* window, int key, int scancode, int action, int mods void mouseButtonCallback(GLFWwindow* window, int button, int action, int mods); void mousePositionCallback(GLFWwindow* window, double xpos, double ypos); void updateCamera(); -void runCUDA(); +double runCUDA(); //==================================== // Setup/init Stuff diff --git a/src/utilityCore.hpp b/src/utilityCore.hpp index bf7c05f..5fd5916 100644 --- a/src/utilityCore.hpp +++ b/src/utilityCore.hpp @@ -21,7 +21,7 @@ #define PI 3.1415926535897932384626422832795028841971 #define TWO_PI 6.2831853071795864769252867665590057683943 #define SQRT_OF_ONE_THIRD 0.5773502691896257645091487805019574556476 -#define E 2.7182818284590452353602874713526624977572 +//#define E 2.7182818284590452353602874713526624977572 #define G 6.67384e-11 #define EPSILON .000000001 #define ZERO_ABSORPTION_EPSILON 0.00001