diff --git a/README.md b/README.md index 98dd9a8..ec82289 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,113 @@ **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Rony Edde +* Tested on: Personal Laptop Windows 22, i7-6700k @ 4.00GHz 64GB, GTX 980M 8192MB (home) ### (TODO: Your README) Include screenshots, analysis, etc. (Remember, this is public, so don't put anything here that you don't want to share with the world.) + +**Title changes and performance analysis** + +Boids simulation running 3 implementation versions. + +* The Naive method consists of a neighbor search through all boids + This is slow due to the large lookup. + +* The sparse grid lookup where all boids are indexed using a voxel + grid to identify nearby cells. This is much faster because we + only search boids that are in neighboring cells. + +* The coherent grid where instead of indexing the particle pointers + in order to find their corresponding velocity and positions, we + sort the position and velocity attributes directly. + +* Naive method: +![default5000bruteforce](./images/default_5000_bruteforce.png) + +* Sparse Grid method: +![default100000sparsegrid](images/default_100000_sparsegrid.png) + +* Coherent Grid method: +![default100000coherentgrid](images/default_100000_coherentgrid.png) + + +* Performance: + The naive solution seems fast enough with 5000 points running on + a GTX980M but increasing this to 10000 and we see a drop by half + the framerate and exponentially slower the higher the number. 
+ + * Here are a few CUDA compute average results on 5000 points: + * Naive: + - Average CUDA frame time 3.584352 ms + - Average CUDA frame time 3.646981 ms + - Average CUDA frame time 3.643696 ms + * Sparse Grid: + - Average CUDA frame time 1.676956 ms + - Average CUDA frame time 2.982781 ms + - Average CUDA frame time 3.228212 ms + * Coherent Grid: + - Average CUDA frame time 1.936160 ms + - Average CUDA frame time 1.930368 ms + - Average CUDA frame time 2.040167 ms + + The performance is a bit faster with the sparse and coherent + solutions and it looks like the coherent solution is a little + faster but overall not much difference can be seen with 5000 + points. + + * Pushing the limit further reveals more. Here are the + results when simulating 100,000 points: + * Naive: + - Average CUDA frame time 2190.682983 ms + - Average CUDA frame time 2174.661621 ms + - Average CUDA frame time 2174.265381 ms + * Sparse Grid: + - Average CUDA frame time 16.067091 ms + - Average CUDA frame time 22.827206 ms + - Average CUDA frame time 27.569127 ms + * Coherent Grid: + - Average CUDA frame time 18.013756 ms + - Average CUDA frame time 21.239281 ms + - Average CUDA frame time 21.476675 ms + + * Here's a graph of performance tests running from 5000 to + 50000 incrementing by 5000: +![testAll](images/fig_1.png) +![testGridOnly](images/fig_2.png) + + Here we can clearly see the advantage of using grids. + Especially the coherent solution which reduces index lookup. + + * Increasing the number of points clearly impacts performance + as seen in the difference between the 5000 point benchmark + and this last benchmark. This is obviously due to the + increased number of points to search for. The naive method + takes a bigger hit since the search involves all points + whereas the sparse and coherent grid solutions are less + impacted but still impacted due to the increased concentration + of points in cells. 
+ Here are the results for 100,000 points + + * Now there are cases where the sparse and coherent grid fail. + This is when the search radius becomes so small that the + number of cells exceeds the graphics card memory and will + crash. The solution to this would be to limit the size in + order to prevent such a scenario. This is commented out + in the final code release. + So provided we clamp the minimum cell size, it should be + safe to use the sparse and coherent grid. + +Here are a few performance analyses with NSIGHT: + * Bruteforce: +![default100000bruteforceperformance](images/bruteforce_performance.png) + + * Sparse Grid: +![default100000sparsegridperformance](images/sparsegrid_performance.png) + + * Coherent Grid: +![default100000coherentgridperformance](images/coherentgrid_performance.png) + + diff --git a/images/bruteforce_performance.png b/images/bruteforce_performance.png new file mode 100644 index 0000000..73a49ae Binary files /dev/null and b/images/bruteforce_performance.png differ diff --git a/images/coherentgrid_performance.png b/images/coherentgrid_performance.png new file mode 100644 index 0000000..270d113 Binary files /dev/null and b/images/coherentgrid_performance.png differ diff --git a/images/default_100000_coherentgrid.png b/images/default_100000_coherentgrid.png new file mode 100644 index 0000000..5e8c118 Binary files /dev/null and b/images/default_100000_coherentgrid.png differ diff --git a/images/default_100000_sparsegrid.png b/images/default_100000_sparsegrid.png new file mode 100644 index 0000000..980eb89 Binary files /dev/null and b/images/default_100000_sparsegrid.png differ diff --git a/images/default_5000_bruteforce.png b/images/default_5000_bruteforce.png new file mode 100644 index 0000000..1a16f16 Binary files /dev/null and b/images/default_5000_bruteforce.png differ diff --git a/images/fig_1.png b/images/fig_1.png new file mode 100644 index 0000000..445c28d Binary files /dev/null and b/images/fig_1.png differ diff --git 
a/images/fig_2.png b/images/fig_2.png new file mode 100644 index 0000000..8011c63 Binary files /dev/null and b/images/fig_2.png differ diff --git a/images/sparsegrid_performance.png b/images/sparsegrid_performance.png new file mode 100644 index 0000000..2c97e58 Binary files /dev/null and b/images/sparsegrid_performance.png differ diff --git a/src/kernel.cu b/src/kernel.cu index 30356b9..48b9c3a 100644 --- a/src/kernel.cu +++ b/src/kernel.cu @@ -83,6 +83,8 @@ thrust::device_ptr dev_thrust_particleGridIndices; int *dev_gridCellStartIndices; // What part of dev_particleArrayIndices belongs int *dev_gridCellEndIndices; // to this cell? +int *dev_indexTempStorage; + // TODO-2.3 - consider what additional buffers you might need to reshuffle // the position and velocity data to be coherent within cells. @@ -128,8 +130,8 @@ __global__ void kernGenerateRandomPosArray(int time, int N, glm::vec3 * arr, flo if (index < N) { glm::vec3 rand = generateRandomVec3(time, index); arr[index].x = scale * rand.x; - arr[index].y = scale * rand.y; - arr[index].z = scale * rand.z; + arr[index].y = scale * rand.y;// *0.001; + arr[index].z = scale * rand.z; } } @@ -158,9 +160,12 @@ void Boids::initSimulation(int N) { // LOOK-2.1 computing grid params gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance); + + // limit cell width to prevent crashing + //gridCellWidth = gridCellWidth < 5.0 ? 5.0 : gridCellWidth; + int halfSideCount = (int)(scene_scale / gridCellWidth) + 1; gridSideCount = 2 * halfSideCount; - gridCellCount = gridSideCount * gridSideCount * gridSideCount; gridInverseCellWidth = 1.0f / gridCellWidth; float halfGridWidth = gridCellWidth * halfSideCount; @@ -169,6 +174,24 @@ void Boids::initSimulation(int N) { gridMinimum.z -= halfGridWidth; // TODO-2.1 TODO-2.3 - Allocate additional buffers here. 
+ + printf("\ngridCellCount = %d", gridCellCount); + + cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!"); + + cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!"); + + cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!"); + + cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!"); + + cudaMalloc((void**)&dev_indexTempStorage, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_indexTempStorage failed!"); + cudaThreadSynchronize(); } @@ -236,15 +259,77 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po return glm::vec3(0.0f, 0.0f, 0.0f); } +__global__ void kernSwapVel1Vel2(int N, glm::vec3 *vel1, glm::vec3 *vel2) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + vel1[index] = vel2[index]; +} + /** * TODO-1.2 implement basic flocking * For each of the `N` bodies, update its position based on its current velocity. */ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { - // Compute a new velocity based on pos and vel1 - // Clamp the speed - // Record the new velocity into vel2. Question: why NOT vel1? + // Compute a new velocity based on pos and vel1 + // Clamp the speed + // Record the new velocity into vel2. Question: why NOT vel1? 
+ int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + glm::vec3 center(0.0, 0.0, 0.0); + glm::vec3 prox(0.0, 0.0, 0.0); + glm::vec3 velocity(0.0, 0.0, 0.0); + + int numNeighborsCenter = 0; + //int numNeighborsProx = 0; + int numNeighborsVelocity = 0; + + + for (int i = 0; i < N; i++) + { + if (i != index) + { + float dist = glm::distance(pos[i], pos[index]); + //printf("\ndist: %f", dist); + if (dist < rule1Distance) + { + center = center + pos[i]; + numNeighborsCenter++; + } + if (dist < rule2Distance) + { + prox = prox - (pos[i] - pos[index]); + } + if (dist < rule3Distance) + { + velocity = velocity + vel1[i]; + numNeighborsVelocity++; + } + } + } + + if (numNeighborsCenter != 0) + { + center = center / (float)numNeighborsCenter; + vel2[index] += (center - pos[index]) * rule1Scale; + } + + if (numNeighborsVelocity != 0) + { + //velocity = velocity / (float)numNeighborsVelocity; + vel2[index] += velocity * rule3Scale; + } + + vel2[index] += prox * rule2Scale; + + if (glm::length(vel2[index]) > maxSpeed) + { + vel2[index] = glm::normalize(vel2[index]) * maxSpeed; + } } /** @@ -282,6 +367,7 @@ __device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) { return x + y * gridResolution + z * gridResolution * gridResolution; } + __global__ void kernComputeIndices(int N, int gridResolution, glm::vec3 gridMin, float inverseCellWidth, glm::vec3 *pos, int *indices, int *gridIndices) { @@ -289,6 +375,21 @@ __global__ void kernComputeIndices(int N, int gridResolution, // - Label each boid with the index of its grid cell. 
// - Set up a parallel array of integer indices as pointers to the actual // boid data in pos and vel1/vel2 + + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) { + return; + } + + // find indices from pos + int x = floor((pos[index].x - gridMin.x) * inverseCellWidth); + int y = floor((pos[index].y - gridMin.y) * inverseCellWidth); + int z = floor((pos[index].z - gridMin.z) * inverseCellWidth); + + int gridIndex = gridIndex3Dto1D(x, y, z, gridResolution); + + gridIndices[index] = gridIndex; + indices[index] = index; } // LOOK-2.1 Consider how this could be useful for indicating that a cell @@ -306,6 +407,24 @@ __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices, // Identify the start point of each cell in the gridIndices array. // This is basically a parallel unrolling of a loop that goes // "this index doesn't match the one before it, must be a new cell!" + + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) { + return; + } + + if (index > 0) + { + if (particleGridIndices[index] != particleGridIndices[index - 1]) + { + gridCellStartIndices[particleGridIndices[index]] = index; + gridCellEndIndices[particleGridIndices[index-1]] = index-1; + } + } + else + { + gridCellStartIndices[particleGridIndices[index]] = index; + } } __global__ void kernUpdateVelNeighborSearchScattered( @@ -322,6 +441,121 @@ __global__ void kernUpdateVelNeighborSearchScattered( // - Access each boid in the cell and compute velocity change from // the boids rules, if this boid is within the neighborhood distance. 
// - Clamp the speed change before putting the new speed in vel2 + + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) { + return; + } + + int start = gridCellStartIndices[index]; + int end = gridCellEndIndices[index]; + + if (start == -1) + { + return; + } + + // loop through points in the voxel + for (int p = start; p <= end; p++) + { + glm::vec3 center(0.0, 0.0, 0.0); + glm::vec3 prox(0.0, 0.0, 0.0); + glm::vec3 velocity(0.0, 0.0, 0.0); + + int numNeighborsCenter = 0; + int numNeighborsVelocity = 0; + + // get quadrant and neighboring voxels + // find indices from pos + // unfold particle index + int pIndex1 = particleArrayIndices[p]; + float voxelPosX = (pos[pIndex1].x - gridMin.x) * inverseCellWidth; + float voxelPosY = (pos[pIndex1].y - gridMin.y) * inverseCellWidth; + float voxelPosZ = (pos[pIndex1].z - gridMin.z) * inverseCellWidth; + + int x = floor(voxelPosX); + int y = floor(voxelPosY); + int z = floor(voxelPosZ); + + int startX = ((voxelPosX - x) > (0.5f * inverseCellWidth)) ? x : x < 1 ? 0 : x - 1; + int startY = ((voxelPosY - y) > (0.5f * inverseCellWidth)) ? y : y < 1 ? 0 : y - 1; + int startZ = ((voxelPosZ - z) > (0.5f * inverseCellWidth)) ? z : z < 1 ? 
0 : z - 1; + + // lookup particles in neighboring voxels + for (int i = 0; i < 2; i++) + { + for (int j = 0; j < 2; j++) + { + for (int k = 0; k < 2; k++) + { + int Z = startZ + i; + int Y = startY + j; + int X = startX + k; + + if (Z < 0 || Z >= gridResolution || + Y < 0 || Y >= gridResolution || + X < 0 || X >= gridResolution) + { + continue; + } + else + { + // get the points at the current voxel + int voxelIndex = gridIndex3Dto1D(X, Y, Z, gridResolution); + + int subStart = gridCellStartIndices[voxelIndex]; + int subEnd = gridCellEndIndices[voxelIndex]; + if (subStart != -1) + { + for (int pIdx = subStart; pIdx <= subEnd; pIdx++) + { + // unfold particle index + int pIndex2 = particleArrayIndices[pIdx]; + if (pIndex2 != pIndex1) + { + float dist = glm::distance(pos[pIndex2], pos[pIndex1]); + //printf("\ndist: %f", dist); + if (dist < rule1Distance) + { + center = center + pos[pIndex2]; + numNeighborsCenter++; + } + if (dist < rule2Distance) + { + prox = prox - (pos[pIndex2] - pos[pIndex1]); + } + if (dist < rule3Distance) + { + velocity = velocity + vel1[pIndex2]; + numNeighborsVelocity++; + } + } + } + } + } + } + } + } + + if (numNeighborsCenter != 0) + { + center = center / (float)numNeighborsCenter; + vel2[pIndex1] += (center - pos[pIndex1]) * rule1Scale; + } + + if (numNeighborsVelocity != 0) + { + //velocity = velocity / (float)numNeighborsVelocity; + vel2[pIndex1] += velocity * rule3Scale; + } + + vel2[pIndex1] += prox * rule2Scale; + + if (glm::length(vel2[pIndex1]) > maxSpeed) + { + vel2[pIndex1] = glm::normalize(vel2[pIndex1]) * maxSpeed; + } + } } __global__ void kernUpdateVelNeighborSearchCoherent( @@ -341,6 +575,122 @@ __global__ void kernUpdateVelNeighborSearchCoherent( // - Access each boid in the cell and compute velocity change from // the boids rules, if this boid is within the neighborhood distance. 
// - Clamp the speed change before putting the new speed in vel2 + + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index >= N) { + return; + } + + int start = gridCellStartIndices[index]; + int end = gridCellEndIndices[index]; + + if (start == -1) + { + return; + } + + // loop through points in the voxel + for (int p = start; p <= end; p++) + { + glm::vec3 center(0.0, 0.0, 0.0); + glm::vec3 prox(0.0, 0.0, 0.0); + glm::vec3 velocity(0.0, 0.0, 0.0); + + int numNeighborsCenter = 0; + int numNeighborsVelocity = 0; + + // get quadrant and neighboring voxels + // find indices from pos + // no need to unfold particle index + //int pIndex1 = particleArrayIndices[p]; + float voxelPosX = (pos[p].x - gridMin.x) * inverseCellWidth; + float voxelPosY = (pos[p].y - gridMin.y) * inverseCellWidth; + float voxelPosZ = (pos[p].z - gridMin.z) * inverseCellWidth; + + int x = floor(voxelPosX); + int y = floor(voxelPosY); + int z = floor(voxelPosZ); + + int startX = ((voxelPosX - x) > (0.5f * inverseCellWidth)) ? x : x < 1 ? 0 : x - 1; + int startY = ((voxelPosY - y) > (0.5f * inverseCellWidth)) ? y : y < 1 ? 0 : y - 1; + int startZ = ((voxelPosZ - z) > (0.5f * inverseCellWidth)) ? z : z < 1 ? 
0 : z - 1; + + // lookup particles in neighboring voxels + for (int i = 0; i < 2; i++) + { + for (int j = 0; j < 2; j++) + { + for (int k = 0; k < 2; k++) + { + int Z = startZ + i; + int Y = startY + j; + int X = startX + k; + + // check if out of bounds + if (Z < 0 || Z >= gridResolution || + Y < 0 || Y >= gridResolution || + X < 0 || X >= gridResolution) + { + continue; + } + else + { + // get the points at the current voxel + int voxelIndex = gridIndex3Dto1D(X, Y, Z, gridResolution); + + int subStart = gridCellStartIndices[voxelIndex]; + int subEnd = gridCellEndIndices[voxelIndex]; + if (subStart != -1) + { + for (int pIdx = subStart; pIdx <= subEnd; pIdx++) + { + // no need to unfold particle index + //int pIndex2 = particleArrayIndices[pIdx]; + if (pIdx != p) + { + float dist = glm::distance(pos[pIdx], pos[p]); + //printf("\ndist: %f", dist); + if (dist < rule1Distance) + { + center = center + pos[pIdx]; + numNeighborsCenter++; + } + if (dist < rule2Distance) + { + prox = prox - (pos[pIdx] - pos[p]); + } + if (dist < rule3Distance) + { + velocity = velocity + vel1[pIdx]; + numNeighborsVelocity++; + } + } + } + } + } + } + } + } + + if (numNeighborsCenter != 0) + { + center = center / (float)numNeighborsCenter; + vel2[p] += (center - pos[p]) * rule1Scale; + } + + if (numNeighborsVelocity != 0) + { + //velocity = velocity / (float)numNeighborsVelocity; + vel2[p] += velocity * rule3Scale; + } + + vel2[p] += prox * rule2Scale; + + if (glm::length(vel2[p]) > maxSpeed) + { + vel2[p] = glm::normalize(vel2[p]) * maxSpeed; + } + } } /** @@ -349,6 +699,22 @@ __global__ void kernUpdateVelNeighborSearchCoherent( void Boids::stepSimulationNaive(float dt) { // TODO-1.2 - use the kernels you wrote to step the simulation forward in time. 
// TODO-1.2 ping-pong the velocity buffers + + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + + + // update velocity + kernUpdateVelocityBruteForce << > >(numObjects, + dev_pos, dev_vel1, dev_vel2); + checkCUDAErrorWithLine("kernUpdateVelocityBruteForce failed!"); + + // swap velocity + dev_vel1 = dev_vel2; + + // update position + kernUpdatePos << > >(numObjects, dt, + dev_pos, dev_vel1); + checkCUDAErrorWithLine("kernUpdatePos failed!"); } void Boids::stepSimulationScatteredGrid(float dt) { @@ -364,6 +730,53 @@ void Boids::stepSimulationScatteredGrid(float dt) { // - Perform velocity updates using neighbor search // - Update positions // - Ping-pong buffers as needed + + + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + dim3 fullBlocksPerGridVoxels((gridCellCount + blockSize - 1) / blockSize); + + + kernComputeIndices << > >(numObjects, gridSideCount, + gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices); + checkCUDAErrorWithLine("kernComputeIndices failed!"); + + + // Wrap device vectors in thrust iterators for use with thrust. 
+ thrust::device_ptr dev_thrust_keys(dev_particleGridIndices); + thrust::device_ptr dev_thrust_values(dev_particleArrayIndices); + // LOOK-2.1 Example for using thrust::sort_by_key + thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + numObjects, dev_thrust_values); + + + // reset grid start buffer + kernResetIntBuffer << > >(gridCellCount, dev_gridCellStartIndices, -1); + checkCUDAErrorWithLine("kernResetIntBuffer failed!"); + + // reset grid end buffer + kernResetIntBuffer << > >(gridCellCount, dev_gridCellEndIndices, -1); + checkCUDAErrorWithLine("kernResetIntBuffer failed!"); + + // save start end particle indices in start end buffers + kernIdentifyCellStartEnd << > >(numObjects, dev_particleGridIndices, + dev_gridCellStartIndices, dev_gridCellEndIndices); + checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!"); + + // compute velocities + + kernUpdateVelNeighborSearchScattered << > >( + gridCellCount, gridSideCount, gridMinimum, + gridInverseCellWidth, gridCellWidth, + dev_gridCellStartIndices, dev_gridCellEndIndices, + dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2); + checkCUDAErrorWithLine("kernUpdateVelNeighborSearchScattered failed!"); + + // swap velocity + dev_vel1 = dev_vel2; + + // update position + kernUpdatePos << > >(numObjects, dt, + dev_pos, dev_vel1); + checkCUDAErrorWithLine("kernUpdatePos failed!"); } void Boids::stepSimulationCoherentGrid(float dt) { @@ -382,6 +795,64 @@ void Boids::stepSimulationCoherentGrid(float dt) { // - Perform velocity updates using neighbor search // - Update positions // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE. 
+ + + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + dim3 fullBlocksPerGridVoxels((gridCellCount + blockSize - 1) / blockSize); + + + kernComputeIndices << > >(numObjects, gridSideCount, + gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices); + checkCUDAErrorWithLine("kernComputeIndices failed!"); + + + // store temp + cudaMemcpy(dev_indexTempStorage, dev_particleGridIndices, sizeof(int) * numObjects, cudaMemcpyDeviceToDevice); + + + // Wrap device vectors in thrust iterators for use with thrust. + thrust::device_ptr dev_thrust_keys(dev_particleGridIndices); + thrust::device_ptr dev_thrust_values(dev_particleArrayIndices); + thrust::device_ptr dev_thrust_values_pos(dev_pos); + thrust::device_ptr dev_thrust_values_vel1(dev_vel1); + + // LOOK-2.1 Example for using thrust::sort_by_key + thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + numObjects, dev_thrust_values); + cudaMemcpy(dev_particleGridIndices, dev_indexTempStorage, sizeof(int) * numObjects, cudaMemcpyDeviceToDevice); // reset idx + thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + numObjects, dev_thrust_values_vel1); + cudaMemcpy(dev_particleGridIndices, dev_indexTempStorage, sizeof(int) * numObjects, cudaMemcpyDeviceToDevice); // reset idx + thrust::sort_by_key(dev_thrust_keys, dev_thrust_keys + numObjects, dev_thrust_values_pos); + + + + // reset grid start buffer + kernResetIntBuffer << > >(gridCellCount, dev_gridCellStartIndices, -1); + checkCUDAErrorWithLine("kernResetIntBuffer failed!"); + + // reset grid end buffer + kernResetIntBuffer << > >(gridCellCount, dev_gridCellEndIndices, -1); + checkCUDAErrorWithLine("kernResetIntBuffer failed!"); + + // save start end particle indices in start end buffers + kernIdentifyCellStartEnd << > >(numObjects, dev_particleGridIndices, + dev_gridCellStartIndices, dev_gridCellEndIndices); + checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!"); + + // compute velocities + 
kernUpdateVelNeighborSearchCoherent << > >( + gridCellCount, gridSideCount, gridMinimum, + gridInverseCellWidth, gridCellWidth, + dev_gridCellStartIndices, dev_gridCellEndIndices, + dev_pos, dev_vel1, dev_vel2); + checkCUDAErrorWithLine("kernUpdateVelNeighborSearchCoherent failed!"); + + // swap velocity + dev_vel1 = dev_vel2; + + // update position + kernUpdatePos << > >(numObjects, dt, + dev_pos, dev_vel1); + checkCUDAErrorWithLine("kernUpdatePos failed!"); } void Boids::endSimulation() { @@ -390,6 +861,11 @@ void Boids::endSimulation() { cudaFree(dev_pos); // TODO-2.1 TODO-2.3 - Free any additional buffers here. + cudaFree(dev_particleArrayIndices); + cudaFree(dev_particleGridIndices); + cudaFree(dev_gridCellStartIndices); + cudaFree(dev_gridCellEndIndices); + cudaFree(dev_indexTempStorage); } void Boids::unitTest() { @@ -454,6 +930,7 @@ void Boids::unitTest() { delete(intValues); cudaFree(dev_intKeys); cudaFree(dev_intValues); + checkCUDAErrorWithLine("cudaFree failed!"); return; } diff --git a/src/main.cpp b/src/main.cpp index e416836..c487f9f 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,13 +14,15 @@ // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID #define VISUALIZE 1 -#define UNIFORM_GRID 0 -#define COHERENT_GRID 0 +#define UNIFORM_GRID 1 +#define COHERENT_GRID 1 // LOOK-1.2 - change this to adjust particle count in the simulation const int N_FOR_VIS = 5000; const float DT = 0.2f; +static int TOTAL_CYCLES = 0; +static double TOTAL_MS = 0.0; /** * C main function. 
*/ @@ -235,7 +237,31 @@ void initShaders(GLuint * program) { frame = 0; } - runCUDA(); +#if VISUALIZE + runCUDA(); +#else + // run once per second + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); + cudaEventRecord(start); + + runCUDA(); + + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + TOTAL_MS += milliseconds; + TOTAL_CYCLES++; + + if (glfwGetTime() - timebase < (1.0 / fps)) + { + printf("\nAverage CUDA frame time %f ms", TOTAL_MS / TOTAL_CYCLES); + TOTAL_MS = 0.0; + TOTAL_CYCLES = 0; + } +#endif std::ostringstream ss; ss << "["; @@ -251,7 +277,12 @@ void initShaders(GLuint * program) { glBindVertexArray(boidVAO); glPointSize(pointSize); glDrawElements(GL_POINTS, N_FOR_VIS + 1, GL_UNSIGNED_INT, 0); - glPointSize(1.0f); + + // for analyzing the first point + //glPointSize(10); + //glDrawElements(GL_POINTS, 1, GL_UNSIGNED_INT, 0); + + glPointSize(1.0f); glUseProgram(0); glBindVertexArray(0); @@ -282,8 +313,8 @@ void initShaders(GLuint * program) { void mousePositionCallback(GLFWwindow* window, double xpos, double ypos) { if (leftMousePressed) { // compute new camera parameters - phi += (xpos - lastX) / width; - theta -= (ypos - lastY) / height; + phi += 3.0*(xpos - lastX) / width; + theta -= 3.0*(ypos - lastY) / height; theta = std::fmax(0.01f, std::fmin(theta, 3.14f)); updateCamera(); }