diff --git a/README.md b/README.md index 98dd9a8..fd1f360 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,79 @@ **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Trung Le +* Windows 10 Home, i7-4790 CPU @ 3.60GHz 12GB, GTX 980 Ti (Personal desktop) -### (TODO: Your README) +### Flocking -Include screenshots, analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +#### Description + +This project implemented three different algorithms for flocking behavior in CUDA: brute-force, uniform grid, coherent uniform grid. + +![alt text](https://github.com/trungtle/Project1-CUDA-Flocking/blob/master/images/screenshots/Uniform_5000_boids.gif "Flocking simulation") + +**---- General information for CUDA device ----** +- Device name: GeForce GTX 980 Ti +- Compute capability: 5.2 +- Compute mode: Default +- Clock rate: 1076000 +- Integrated: 0 +- Device copy overlap: Enabled +- Kernel execution timeout: Enabled + +**---- Memory information for CUDA device ----** + +- Total global memory: 6442450944 +- Total constant memory: 65536 +- Multiprocessor count: 22 +- Shared memory per multiprocessor: 98304 +- Registers per multiprocessor: 65536 +- Max threads per multiprocessor: 2048 +- Max grid dimensions: [2147483647, 65535, 65535] +- Max threads per block: 1024 +- Max registers per block: 65536 +- Max thread dimensions: [1024, 1024, 64] +- Threads per block: 512 + + +#### Performance analysis + +To run performance analysis, I used CUDA to wrap around the simulation step and turned off visualization. + +The boid count tested are: 1000, 5000, 50000, 100000, 150000 +The block size tested are: 1, 128, 512 (maxed at 512 threads per block on my machine) + +**Table showing simulation time per frame (in ms) vs. number of boids). 
Block size is 128.** + +| Boid count | Coherent | Uniform | Naive| +| ---------- | -------- | ------- | ---- | +|5000| 0.2| 0.2| 4.3| +|50000| 1.2| 1.5| 166| +|100000| 1.8| 2.6| 654| +|500000| 12.5| 20.6| (crash)| +|1000000| 52| 74| (crash)| + +![alt text](https://github.com/trungtle/Project1-CUDA-Flocking/blob/master/images/charts/performace.png "Naive vs. Coherent vs scattered uniform grid performance") + +For each implementation, as the number of boids increases, there is a big drop in performance. This is due to the fact that for each boid's velocity computation, we need to loop through a list of potential neighbors that can affect the boid. The biggest optimization made in each implementation is an improvement on how efficiently we can iterate through these neighbors by partitioning them into a uniform grid and rearranging their data coherently in memory. + +Changing the block size doesn't seem to increase performance. However, I did find an interesting pattern. At every block size that is a power of 2, the performance is optimal. I tested this with 1,000,000 boids in the coherent grid (~46ms max) and scattered uniform grid (~74ms max). + +**Table showing block size vs. simulation time per frame (in ms)** + +| Block size | Coherent | Uniform | +| ---------- | -------- | ------- | +|32| 45| 74| +|35| 74| 113| +|64| 44| 73| +|100| 54| 87| +|128| 46| 74| +|256| 45| 73| +|400| 54| 85| +|512| 46| 73| +|600| 68| 105| +|1024| 46| 72| + +When comparing between coherent and scattered uniform grids, I did see a great performance improvement with the coherent data implementation as the number of boids increases. From my performance analysis, I observed that at 1000000 boids and kernel block size 128, coherent uniform grid is ~20ms faster than scattered uniform grid. This is a big improvement! This is due to the fact that we take advantage of spatial coherence and can access each neighboring boid's data sequentially in memory. 
+ +**Note**: I also updated the -arch=sm_52 version to make it compatible for my machine diff --git a/images/charts/performace.png b/images/charts/performace.png new file mode 100644 index 0000000..a1e1d23 Binary files /dev/null and b/images/charts/performace.png differ diff --git a/images/screenshots/Uniform_5000_boids.gif b/images/screenshots/Uniform_5000_boids.gif new file mode 100644 index 0000000..ed5de48 Binary files /dev/null and b/images/screenshots/Uniform_5000_boids.gif differ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index fdd636d..eeaabd4 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -10,5 +10,5 @@ set(SOURCE_FILES cuda_add_library(src ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_52 ) diff --git a/src/kernel.cu b/src/kernel.cu index 30356b9..378ae51 100644 --- a/src/kernel.cu +++ b/src/kernel.cu @@ -6,6 +6,8 @@ #include "utilityCore.hpp" #include "kernel.h" +#define INVALID_VALUE (-1) + // LOOK-2.1 potentially useful for doing grid-based neighbor search #ifndef imax #define imax( a, b ) ( ((a) > (b)) ? (a) : (b) ) @@ -83,8 +85,13 @@ thrust::device_ptr dev_thrust_particleGridIndices; int *dev_gridCellStartIndices; // What part of dev_particleArrayIndices belongs int *dev_gridCellEndIndices; // to this cell? + + // TODO-2.3 - consider what additional buffers you might need to reshuffle // the position and velocity data to be coherent within cells. +glm::vec3* dev_sorted_pos; +glm::vec3* dev_sorted_vel1; +glm::vec3* dev_sorted_vel2; // LOOK-2.1 - Grid parameters based on simulation parameters. 
// These are automatically computed for you in Boids::initSimulation @@ -133,6 +140,15 @@ __global__ void kernGenerateRandomPosArray(int time, int N, glm::vec3 * arr, flo } } +__global__ void kernCopyBuffer(int N, glm::vec3* desBuffer, glm::vec3* srcBuffer) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + + desBuffer[index] = srcBuffer[index]; +} + /** * Initialize memory, update some globals */ @@ -151,15 +167,26 @@ void Boids::initSimulation(int N) { cudaMalloc((void**)&dev_vel2, N * sizeof(glm::vec3)); checkCUDAErrorWithLine("cudaMalloc dev_vel2 failed!"); + cudaMalloc((void**)&dev_sorted_pos, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_sorted_pos failed!"); + + cudaMalloc((void**)&dev_sorted_vel1, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_sorted_vel1 failed!"); + cudaMalloc((void**)&dev_sorted_vel2, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_sorted_vel2 failed!"); + // LOOK-1.2 - This is a typical CUDA kernel invocation. kernGenerateRandomPosArray<<>>(1, numObjects, dev_pos, scene_scale); checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!"); + // Initialize the dev_sorted_pos + kernCopyBuffer << > >(numObjects, dev_sorted_pos, dev_pos); + // LOOK-2.1 computing grid params gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance); int halfSideCount = (int)(scene_scale / gridCellWidth) + 1; - gridSideCount = 2 * halfSideCount; + gridSideCount = halfSideCount * 2; gridCellCount = gridSideCount * gridSideCount * gridSideCount; gridInverseCellWidth = 1.0f / gridCellWidth; @@ -169,6 +196,20 @@ void Boids::initSimulation(int N) { gridMinimum.z -= halfGridWidth; // TODO-2.1 TODO-2.3 - Allocate additional buffers here. 
+ cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!"); + cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!"); + + dev_thrust_particleArrayIndices = thrust::device_pointer_cast(dev_particleArrayIndices); + dev_thrust_particleGridIndices = thrust::device_pointer_cast(dev_particleGridIndices); + + cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!"); + cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!"); + + cudaThreadSynchronize(); } @@ -218,7 +259,6 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) cudaThreadSynchronize(); } - /****************** * stepSimulation * ******************/ @@ -230,10 +270,55 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) * in the `pos` and `vel` arrays. 
*/ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) { - // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves - // Rule 2: boids try to stay a distance d away from each other - // Rule 3: boids try to match the speed of surrounding boids - return glm::vec3(0.0f, 0.0f, 0.0f); + + glm::vec3 velCohesion, velSeparation, velAlignment; + glm::vec3 centerOfMass; + unsigned int rule1NeighborCount = 0; + unsigned int rule2NeighborCount = 0; + unsigned int rule3NeighborCount = 0; + + for (auto i = 0; i < N; ++i) { + + // skip itself + if (i == iSelf) continue; + auto distance = glm::distance(pos[iSelf], pos[i]); + + // Rule 1, cohesion: boids fly towards their local perceived center of mass, which excludes themselves + if (distance < rule1Distance) { + centerOfMass += pos[i]; + ++rule1NeighborCount; + } + + // Rule 2: boids try to stay a distance d away from each other + if (distance < rule2Distance) { + velSeparation -= (pos[i] - pos[iSelf]); + ++rule2NeighborCount; + } + + // Rule 3: boids try to match the speed of surrounding boids + if (distance < rule3Distance) { + velAlignment += vel[i]; + ++rule3NeighborCount; + } + } + + if (rule1NeighborCount > 0) { + centerOfMass /= rule1NeighborCount; + velCohesion = (centerOfMass - pos[iSelf]) * rule1Scale; + } + + if (rule2NeighborCount > 0) { + velSeparation *= rule2Scale; + } + + if (rule3NeighborCount > 0) { + velAlignment /= rule3NeighborCount; + velAlignment = velAlignment - vel[iSelf]; + velAlignment *= rule3Scale; + } + + // Return new computed velocity + return velCohesion + velSeparation + velAlignment; } /** @@ -242,9 +327,21 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po */ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { + // Compute a new velocity based on pos and vel1 - // Clamp the speed + int index = threadIdx.x + (blockIdx.x * 
blockDim.x); + if (index >= N) { + return; + } + // Record the new velocity into vel2. Question: why NOT vel1? + // Since the neighbor might be inspecting our current vel1 this frame, we don't want to update vel1 yet + vel2[index] = vel1[index] + computeVelocityChange(N, index, pos, vel1); + + // Clamp the speed + if (glm::length(vel2[index]) >= maxSpeed) { + vel2[index] = glm::normalize(vel2[index]) * maxSpeed; + } } /** @@ -289,6 +386,19 @@ __global__ void kernComputeIndices(int N, int gridResolution, // - Label each boid with the index of its grid cell. // - Set up a parallel array of integer indices as pointers to the actual // boid data in pos and vel1/vel2 + + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < N) { + indices[index] = index; + + // Compute the 3D grid indices for this particle + int gridX = (int)((pos[index].x - gridMin.x) * inverseCellWidth); + int gridY = (int)((pos[index].y - gridMin.y) * inverseCellWidth); + int gridZ = (int)((pos[index].z - gridMin.z) * inverseCellWidth); + + // Convert and store the 1D grid index + gridIndices[index] = gridIndex3Dto1D(gridX, gridY, gridZ, gridResolution); + } } // LOOK-2.1 Consider how this could be useful for indicating that a cell @@ -306,6 +416,28 @@ __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices, // Identify the start point of each cell in the gridIndices array. // This is basically a parallel unrolling of a loop that goes // "this index doesn't match the one before it, must be a new cell!" 
+ + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index > N) { + return; + } + + int cellIndex = particleGridIndices[index]; + if (index == 0) { + gridCellStartIndices[cellIndex] = 0; + return; + } + + if (index == N - 1) { + gridCellEndIndices[cellIndex] = N - 1; + } + + int prevCellIndex = particleGridIndices[index - 1]; + if (cellIndex != prevCellIndex) + { + gridCellStartIndices[cellIndex] = index; + gridCellEndIndices[prevCellIndex] = index - 1; + } } __global__ void kernUpdateVelNeighborSearchScattered( @@ -322,6 +454,126 @@ __global__ void kernUpdateVelNeighborSearchScattered( // - Access each boid in the cell and compute velocity change from // the boids rules, if this boid is within the neighborhood distance. // - Clamp the speed change before putting the new speed in vel2 + + // Compute a new velocity based on pos and vel1 + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + + int boidIndex = particleArrayIndices[index]; + + // Compute the 3D grid indices for this particle + glm::vec3 grid3DIndex = (pos[boidIndex] - gridMin) * inverseCellWidth; + + // Find the other 8 neiboring cells based on which octant the particle is in + glm::vec3 distanceToNeighbor = + pos[boidIndex] - (glm::vec3((int)grid3DIndex.x, (int)grid3DIndex.y, (int)grid3DIndex.z) * cellWidth + gridMin); + float halfCellWidth = cellWidth / 2.0f; + + int neighborMinIndexX, neighborMinIndexY, neighborMinIndexZ, neighborMaxIndexX, neighborMaxIndexY, neighborMaxIndexZ; + + if (distanceToNeighbor.x < halfCellWidth) { + neighborMinIndexX = (int)grid3DIndex.x - 1 >= 0 ? (int)grid3DIndex.x - 1 : (int)grid3DIndex.x; + neighborMaxIndexX = (int)grid3DIndex.x; + } else { + neighborMinIndexX = (int)grid3DIndex.x; + neighborMaxIndexX = (int)grid3DIndex.x + 1 < N ? (int)grid3DIndex.x + 1: (int)grid3DIndex.x; + } + + if (distanceToNeighbor.y < halfCellWidth) { + neighborMinIndexY = (int)grid3DIndex.y - 1 >= 0 ? 
(int)grid3DIndex.y - 1 : (int)grid3DIndex.y; + neighborMaxIndexY = (int)grid3DIndex.y; + } + else { + neighborMinIndexY = (int)grid3DIndex.y; + neighborMaxIndexY = (int)grid3DIndex.y + 1 < N ? (int)grid3DIndex.y + 1 : (int)grid3DIndex.y; + } + + if (distanceToNeighbor.z < halfCellWidth) { + neighborMinIndexZ = (int)grid3DIndex.z - 1 >= 0 ? (int)grid3DIndex.z - 1 : (int)grid3DIndex.z; + neighborMaxIndexZ = (int)grid3DIndex.z; + } + else { + neighborMinIndexZ = (int)grid3DIndex.z; + neighborMaxIndexZ = (int)grid3DIndex.z + 1 < N ? (int)grid3DIndex.z + 1 : (int)grid3DIndex.z; + } + + // These are to keep track of velocity computation + glm::vec3 velCohesion, velSeparation, velAlignment; + glm::vec3 centerOfMass; + unsigned int rule1NeighborCount = 0; + unsigned int rule2NeighborCount = 0; + unsigned int rule3NeighborCount = 0; + + // Loop through neighbor cells and find all boids in them + int neighborIndex, neighborCellStartIndex, neighborCellEndIndex; + for (auto z = neighborMinIndexZ; z <= neighborMaxIndexZ; ++z) { + for (auto y = neighborMinIndexY; y <= neighborMaxIndexY; ++y) { + for (auto x = neighborMinIndexX; x <= neighborMaxIndexX; ++x) { + + // Compute velocity contribution from particles in this neighboring cell + neighborIndex = gridIndex3Dto1D(x, y, z, gridResolution); + neighborCellStartIndex = gridCellStartIndices[neighborIndex]; + neighborCellEndIndex = gridCellEndIndices[neighborIndex]; + + if (neighborCellStartIndex >= 0 && neighborCellStartIndex <= neighborCellEndIndex && + neighborCellEndIndex >= neighborCellStartIndex && neighborCellEndIndex < N) { + + //Compute velocity + for (auto i = neighborCellStartIndex; i < neighborCellEndIndex; ++i) { + + int neighborBoidIndex = particleArrayIndices[i]; + // skip itself + if (neighborBoidIndex == boidIndex) continue; + auto distance = glm::distance(pos[boidIndex], pos[neighborBoidIndex]); + + // Rule 1, cohesion: boids fly towards their local perceived center of mass, which excludes themselves + if 
(distance < rule1Distance) { + centerOfMass += pos[neighborBoidIndex]; + ++rule1NeighborCount; + } + + // Rule 2: boids try to stay a distance d away from each other + if (distance < rule2Distance) { + velSeparation -= (pos[neighborBoidIndex] - pos[boidIndex]); + ++rule2NeighborCount; + } + + // Rule 3: boids try to match the speed of surrounding boids + if (distance < rule3Distance) { + velAlignment += vel1[neighborBoidIndex]; + ++rule3NeighborCount; + } + } + } + } + } + } + + if (rule1NeighborCount > 0) { + centerOfMass /= rule1NeighborCount; + velCohesion = (centerOfMass - pos[boidIndex]) * rule1Scale; + } + + if (rule2NeighborCount > 0) { + velSeparation *= rule2Scale; + } + + if (rule3NeighborCount > 0) { + velAlignment /= rule3NeighborCount; + velAlignment = velAlignment - vel1[boidIndex]; + velAlignment *= rule3Scale; + } + + // Record the new velocity into vel2. Question: why NOT vel1? + // Since the neighbor might be inspecting our current vel1 this frame, we don't want to update vel1 yet + vel2[boidIndex] = vel1[boidIndex] + velCohesion + velSeparation + velAlignment; + + // Clamp the speed + if (glm::length(vel2[boidIndex]) >= maxSpeed) { + vel2[boidIndex] = glm::normalize(vel2[boidIndex]) * maxSpeed; + } } __global__ void kernUpdateVelNeighborSearchCoherent( @@ -341,6 +593,147 @@ __global__ void kernUpdateVelNeighborSearchCoherent( // - Access each boid in the cell and compute velocity change from // the boids rules, if this boid is within the neighborhood distance. 
// - Clamp the speed change before putting the new speed in vel2 + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + + // Compute the 3D grid indices for this particle + glm::vec3 grid3DIndex = (pos[index] - gridMin) * inverseCellWidth; + + // Find the other 8 neiboring cells based on which octant the particle is in + glm::vec3 distanceToNeighbor = + pos[index] - (glm::vec3((int)grid3DIndex.x, (int)grid3DIndex.y, (int)grid3DIndex.z) * cellWidth + gridMin); + float halfCellWidth = cellWidth / 2.0f; + + int neighborMinIndexX, + neighborMinIndexY, + neighborMinIndexZ, + neighborMaxIndexX, + neighborMaxIndexY, + neighborMaxIndexZ; + + if (distanceToNeighbor.x < halfCellWidth) { + neighborMinIndexX = (int)grid3DIndex.x - 1; + neighborMaxIndexX = (int)grid3DIndex.x; + } + else { + neighborMinIndexX = (int)grid3DIndex.x; + neighborMaxIndexX = (int)grid3DIndex.x + 1; + } + + if (distanceToNeighbor.y < halfCellWidth) { + neighborMinIndexY = (int)grid3DIndex.y - 1; + neighborMaxIndexY = (int)grid3DIndex.y; + } + else { + neighborMinIndexY = (int)grid3DIndex.y; + neighborMaxIndexY = (int)grid3DIndex.y + 1; + } + + if (distanceToNeighbor.z < halfCellWidth) { + neighborMinIndexZ = (int)grid3DIndex.z - 1; + neighborMaxIndexZ = (int)grid3DIndex.z; + } + else { + neighborMinIndexZ = (int)grid3DIndex.z; + neighborMaxIndexZ = (int)grid3DIndex.z + 1; + } + + // These are to keep track of velocity computation + glm::vec3 velCohesion, velSeparation, velAlignment; + glm::vec3 centerOfMass; + unsigned int rule1NeighborCount = 0; + unsigned int rule2NeighborCount = 0; + unsigned int rule3NeighborCount = 0; + + // Loop through neighbor cells and find all boids in them + int neighborIndex, neighborCellStartIndex, neighborCellEndIndex; + for (int z = neighborMinIndexZ; z <= neighborMaxIndexZ; ++z) { + for (int y = neighborMinIndexY; y <= neighborMaxIndexY; ++y) { + for (int x = neighborMinIndexX; x <= neighborMaxIndexX; ++x) { + + // Compute velocity 
contribution from particles in this neighboring cell + neighborIndex = gridIndex3Dto1D(x % gridResolution, y % gridResolution, z % gridResolution, gridResolution); + neighborCellStartIndex = gridCellStartIndices[neighborIndex]; + neighborCellEndIndex = gridCellEndIndices[neighborIndex]; + + if (neighborCellStartIndex >= 0 && neighborCellStartIndex <= neighborCellEndIndex && + neighborCellEndIndex >= neighborCellStartIndex && neighborCellEndIndex < N) { + + + //Compute velocity + for (auto neighborBoidIndex = neighborCellStartIndex; neighborBoidIndex < neighborCellEndIndex; ++neighborBoidIndex) { + + // skip itself + if (neighborBoidIndex == index) continue; + auto distance = glm::distance(pos[index], pos[neighborBoidIndex]); + + // Rule 1, cohesion: boids fly towards their local perceived center of mass, which excludes themselves + if (distance < rule1Distance) { + centerOfMass += pos[neighborBoidIndex]; + ++rule1NeighborCount; + } + + // Rule 2: boids try to stay a distance d away from each other + if (distance < rule2Distance) { + velSeparation -= (pos[neighborBoidIndex] - pos[index]); + ++rule2NeighborCount; + } + + // Rule 3: boids try to match the speed of surrounding boids + if (distance < rule3Distance) { + velAlignment += vel1[neighborBoidIndex]; + ++rule3NeighborCount; + } + } + } + } + } + } + + if (rule1NeighborCount > 0) { + centerOfMass /= rule1NeighborCount; + velCohesion = (centerOfMass - pos[index]) * rule1Scale; + } + + if (rule2NeighborCount > 0) { + velSeparation *= rule2Scale; + } + + if (rule3NeighborCount > 0) { + velAlignment /= rule3NeighborCount; + velAlignment = velAlignment - vel1[index]; + velAlignment *= rule3Scale; + } + + // Record the new velocity into vel2. Question: why NOT vel1? 
+ // Since the neighbor might be inspecting our current vel1 this frame, we don't want to update vel1 yet + vel2[index] = vel1[index] + velCohesion + velSeparation + velAlignment; + + // Clamp the speed + if (glm::length(vel2[index]) >= maxSpeed) { + vel2[index] = glm::normalize(vel2[index]) * maxSpeed; + } +} + +__global__ void kernRearrangeBuffer(int N, int* particleGridIndices, glm::vec3* newBuffer, glm::vec3* oldBuffer) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + int oldIdx = particleGridIndices[index]; + newBuffer[index] = oldBuffer[oldIdx]; +} + +__global__ void kernCopyBack(int N, int* particleGridIndices, glm::vec3* newBuffer, glm::vec3* oldBuffer) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + + int newIdx = particleGridIndices[index]; + newBuffer[newIdx] = oldBuffer[index]; } /** @@ -348,7 +741,14 @@ __global__ void kernUpdateVelNeighborSearchCoherent( */ void Boids::stepSimulationNaive(float dt) { // TODO-1.2 - use the kernels you wrote to step the simulation forward in time. + static bool isDevVel1Active = true; + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + + kernUpdateVelocityBruteForce << > >(numObjects, dev_pos, isDevVel1Active ? dev_vel1 : dev_vel2, isDevVel1Active ? dev_vel2 : dev_vel1); + kernUpdatePos << > >(numObjects, dt, dev_pos, isDevVel1Active ? 
dev_vel2 : dev_vel1); + // TODO-1.2 ping-pong the velocity buffers + isDevVel1Active = !isDevVel1Active; } void Boids::stepSimulationScatteredGrid(float dt) { @@ -364,6 +764,71 @@ void Boids::stepSimulationScatteredGrid(float dt) { // - Perform velocity updates using neighbor search // - Update positions // - Ping-pong buffers as needed + + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + + // reseet the grid cell start and end indices to an invalid value + kernResetIntBuffer << > >( + numObjects, + dev_gridCellStartIndices, + INVALID_VALUE + ); + + kernResetIntBuffer << > >( + numObjects, + dev_gridCellEndIndices, + INVALID_VALUE + ); + + // Compute the new index + kernComputeIndices << > >( + numObjects, + gridSideCount, + gridMinimum, + gridInverseCellWidth, + dev_pos, + dev_particleArrayIndices, + dev_particleGridIndices + ); + + thrust::sort_by_key( + dev_thrust_particleGridIndices, + dev_thrust_particleGridIndices + numObjects, + dev_thrust_particleArrayIndices); + + kernIdentifyCellStartEnd << > >( + numObjects, + dev_particleGridIndices, + dev_gridCellStartIndices, + dev_gridCellEndIndices + ); + + // Update velocity + static bool isDevVel1Active = true; + kernUpdateVelNeighborSearchScattered << > >( + numObjects, + gridSideCount, + gridMinimum, + gridInverseCellWidth, + gridCellWidth, + dev_gridCellStartIndices, + dev_gridCellEndIndices, + dev_particleArrayIndices, + dev_pos, + isDevVel1Active ? dev_vel1 : dev_vel2, + isDevVel1Active ? dev_vel2 : dev_vel1 + ); + + // Update position + kernUpdatePos << > >( + numObjects, + dt, + dev_pos, + isDevVel1Active ? dev_vel2 : dev_vel1 + ); + + // Ping pong + isDevVel1Active = !isDevVel1Active; } void Boids::stepSimulationCoherentGrid(float dt) { @@ -382,6 +847,83 @@ void Boids::stepSimulationCoherentGrid(float dt) { // - Perform velocity updates using neighbor search // - Update positions // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE. 
+ + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + + // reseet the grid cell start and end indices to an invalid value + kernResetIntBuffer << > >( + numObjects, + dev_gridCellStartIndices, + INVALID_VALUE + ); + + kernResetIntBuffer << > >( + numObjects, + dev_gridCellEndIndices, + INVALID_VALUE + ); + + // Compute the new index + kernComputeIndices << > >( + numObjects, + gridSideCount, + gridMinimum, + gridInverseCellWidth, + dev_pos, + dev_particleArrayIndices, + dev_particleGridIndices + ); + + thrust::sort_by_key( + dev_thrust_particleGridIndices, + dev_thrust_particleGridIndices + numObjects, + dev_thrust_particleArrayIndices); + + kernIdentifyCellStartEnd << > >( + numObjects, + dev_particleGridIndices, + dev_gridCellStartIndices, + dev_gridCellEndIndices + ); + + kernRearrangeBuffer << > >( + numObjects, + dev_particleArrayIndices, + dev_sorted_pos, + dev_pos + ); + + kernRearrangeBuffer << > >( + numObjects, + dev_particleArrayIndices, + dev_vel1, + dev_vel2 + ); + + // Update velocity + kernUpdateVelNeighborSearchCoherent << > >( + numObjects, + gridSideCount, + gridMinimum, + gridInverseCellWidth, + gridCellWidth, + dev_gridCellStartIndices, + dev_gridCellEndIndices, + dev_sorted_pos, + dev_vel1, + dev_vel2 + ); + + // Update position + kernUpdatePos << > >( + numObjects, + dt, + dev_sorted_pos, + dev_vel2 + ); + + // Ping pong + std::swap(dev_pos, dev_sorted_pos); } void Boids::endSimulation() { @@ -390,6 +932,13 @@ void Boids::endSimulation() { cudaFree(dev_pos); // TODO-2.1 TODO-2.3 - Free any additional buffers here. 
+ cudaFree(dev_particleArrayIndices); + cudaFree(dev_particleGridIndices); + cudaFree(dev_gridCellStartIndices); + cudaFree(dev_gridCellEndIndices); + cudaFree(dev_sorted_pos); + cudaFree(dev_sorted_vel1); + cudaFree(dev_sorted_vel2); } void Boids::unitTest() { @@ -432,6 +981,8 @@ void Boids::unitTest() { cudaMemcpy(dev_intKeys, intKeys, sizeof(int) * N, cudaMemcpyHostToDevice); cudaMemcpy(dev_intValues, intValues, sizeof(int) * N, cudaMemcpyHostToDevice); + // ------------ + // Wrap device vectors in thrust iterators for use with thrust. thrust::device_ptr dev_thrust_keys(dev_intKeys); thrust::device_ptr dev_thrust_values(dev_intValues); @@ -449,11 +1000,135 @@ void Boids::unitTest() { std::cout << " value: " << intValues[i] << std::endl; } + // ------------ + float test_cellWidth = 10.0; + float test_sceneScale = 100.0f; + int test_sideCount = (int)(test_sceneScale / test_cellWidth); + int test_halfSideCount = test_sideCount / 2; + int test_gridCellCount = test_sideCount * test_sideCount * test_sideCount; + float test_inverseCellWidth = 1.0f / test_cellWidth; + float test_halfGridWidth = test_cellWidth * test_halfSideCount; + glm::vec3 test_gridMinimum(-test_halfGridWidth, -test_halfGridWidth, -test_halfGridWidth); + + std::cout << "Test cell width: " << test_cellWidth << std::endl; + std::cout << "Test scene scale: " << test_sceneScale << std::endl; + std::cout << "Test half side count: " << test_halfSideCount << std::endl; + std::cout << "Test side count: " << test_sideCount << std::endl; + std::cout << "Test grid cell count: " << test_gridCellCount << std::endl; + std::cout << "Test inverse cell width: " << test_inverseCellWidth << std::endl; + std::cout << "Test half grid width: " << test_halfGridWidth << std::endl; + std::cout << "Test grid minimum: " << test_gridMinimum.x << ", " << test_gridMinimum.y << ", " << test_gridMinimum.z << std::endl; + + int* test_particleArrayIndices = new int[N]; + int* test_particleGridIndices = new int[N]; + glm::vec3* 
test_pos = new glm::vec3[N]; + + glm::vec3* test_dev_pos; + int* test_dev_particleArrayIndices; + int* test_dev_particleGridIndices; + + cudaMalloc((void**)&test_dev_pos, N * sizeof(glm::vec3)); + for (int i = 0; i < N; ++i) { + test_pos[i] = glm::vec3(test_cellWidth * i) + test_gridMinimum; + + } + cudaMemcpy(test_dev_pos, test_pos, sizeof(glm::vec3) * N, cudaMemcpyHostToDevice); + + cudaMalloc((void**)&test_dev_particleArrayIndices, N * sizeof(int)); + cudaMalloc((void**)&test_dev_particleGridIndices, N * sizeof(int)); + + kernComputeIndices << <1, N >> >( + N, + test_sideCount, + test_gridMinimum, + test_inverseCellWidth, + test_dev_pos, + test_dev_particleArrayIndices, + test_dev_particleGridIndices + ); + + cudaMemcpy(test_pos, test_dev_pos, sizeof(glm::vec3) * N, cudaMemcpyDeviceToHost); + cudaMemcpy(test_particleArrayIndices, test_dev_particleArrayIndices, sizeof(int) * N, cudaMemcpyDeviceToHost); + cudaMemcpy(test_particleGridIndices, test_dev_particleGridIndices, sizeof(int) * N, cudaMemcpyDeviceToHost); + + std::cout << "Test kernComputeIndices : " << std::endl; + for (auto i = 0; i < N; i++) { + std::cout << " pos: " << test_pos[i].x << ", " << test_pos[i].y << ", " << test_pos[i].z; + std::cout << " particle array indices: " << test_particleArrayIndices[i]; + std::cout << " particle grid indices: " << test_particleGridIndices[i] << std::endl; + } + + // ------------ + + int *dev_keyStarts; + int *dev_keyEnds; + cudaMalloc((void**)&dev_keyStarts, N * sizeof(int)); + cudaMalloc((void**)&dev_keyEnds, N * sizeof(int)); + + int *intStarts = new int[N]; + int *intEnds = new int[N]; + + kernResetIntBuffer << <1, N >> >(N, dev_keyStarts, INVALID_VALUE); + kernResetIntBuffer << <1, N >> >(N, dev_keyEnds, INVALID_VALUE); + cudaMemcpy(intStarts, dev_keyStarts, sizeof(int) * N, cudaMemcpyDeviceToHost); + cudaMemcpy(intEnds, dev_keyEnds, sizeof(int) * N, cudaMemcpyDeviceToHost); + + std::cout << "Test kernResetIntBuffer : " << std::endl; + for (auto i = 0; i < N; 
i++) { + std::cout << " start: " << intStarts[i]; + std::cout << " end: " << intEnds[i] << std::endl; + } + + // ------------ + + kernIdentifyCellStartEnd << <1, N >> >(N, dev_intKeys, dev_keyStarts, dev_keyEnds); + + cudaMemcpy(intStarts, dev_keyStarts, sizeof(int) * N, cudaMemcpyDeviceToHost); + cudaMemcpy(intEnds, dev_keyEnds, sizeof(int) * N, cudaMemcpyDeviceToHost); + + std::cout << "Test kernIdentifyCellStartEnd : " << std::endl; + for (int i = 0; i < N; i++) { + std::cout << " start: " << intStarts[i]; + std::cout << " end: " << intEnds[i] << std::endl; + } + + // -------------- + glm::vec3* test_sorted_pos = new glm::vec3[N]; + + glm::vec3* test_dev_sorted_pos; + cudaMalloc((void**)&test_dev_sorted_pos, N * sizeof(glm::vec3)); + + std::cout << "Test kernRearrangeBuffer : " << std::endl; + for (int i = 0; i < N; i++) { + std::cout << " pos: " << test_pos[i].x << ", " << test_pos[i].y << ", " << test_pos[i].z; + std::cout << " index: " << intValues[i] << std::endl; + } + kernRearrangeBuffer << <1, N >> >(N, dev_intValues, test_dev_sorted_pos, test_dev_pos); + cudaMemcpy(test_sorted_pos, test_dev_sorted_pos, sizeof(glm::vec3) * N, cudaMemcpyDeviceToHost); + + for (int i = 0; i < N; i++) { + std::cout << " sorted_pos: " << test_sorted_pos[i].x << ", " << test_sorted_pos[i].y << ", " << test_sorted_pos[i].z; + std::cout << " index: " << intValues[i] << std::endl; + } + + // cleanup - delete(intKeys); - delete(intValues); + delete[] intKeys; + delete[] intValues; + delete[] intStarts; + delete[] intEnds; + delete[] test_particleArrayIndices; + delete[] test_particleGridIndices; + delete[] test_pos; + delete[] test_sorted_pos; cudaFree(dev_intKeys); cudaFree(dev_intValues); + cudaFree(dev_keyStarts); + cudaFree(dev_keyEnds); + cudaFree(test_dev_pos); + cudaFree(test_dev_particleArrayIndices); + cudaFree(test_dev_particleGridIndices); + cudaFree(test_dev_sorted_pos); checkCUDAErrorWithLine("cudaFree failed!"); return; } diff --git a/src/main.cpp b/src/main.cpp 
index e416836..f7f36c7 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,18 +14,18 @@ // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID #define VISUALIZE 1 -#define UNIFORM_GRID 0 -#define COHERENT_GRID 0 +#define UNIFORM_GRID 1 +#define COHERENT_GRID 1 // LOOK-1.2 - change this to adjust particle count in the simulation -const int N_FOR_VIS = 5000; +const int N_FOR_VIS = 100000; const float DT = 0.2f; /** * C main function. */ int main(int argc, char* argv[]) { - projectName = "565 CUDA Intro: Boids"; + projectName = "565 CUDA Intro: Boids - Trung Le"; if (init(argc, argv)) { mainLoop(); @@ -47,6 +47,10 @@ GLFWwindow *window; * Initialization of CUDA and GLFW. */ bool init(int argc, char **argv) { + + // Print out our CUDA-capable devices properties + printCudaDeviceProperties(); + // Set window title to "Student Name: [SM 2.0] GPU Name" cudaDeviceProp deviceProp; int gpuDevice = 0; @@ -183,10 +187,67 @@ void initShaders(GLuint * program) { } } +void printCudaDeviceProperties() +{ + int deviceCount = 0; + cudaGetDeviceCount(&deviceCount); + + cudaDeviceProp deviceProp; + for (auto i = 0; i < deviceCount; ++i) { + cudaGetDeviceProperties(&deviceProp, i); + + std::cout << "---- General information for device " << i << " ----" << std::endl; + std::cout << "Device name: " << deviceProp.name << std::endl; + std::cout << "Compute capability: " << deviceProp.major << "." 
<< deviceProp.minor << std::endl; + std::cout << "Compute mode: "; + switch (deviceProp.computeMode) { + case cudaComputeModeDefault: + std::cout << "Default" << std::endl; + break; + case cudaComputeModeExclusive: + std::cout << "Exclusive" << std::endl; + break; + case cudaComputeModeExclusiveProcess: + std::cout << "Exclusive process" << std::endl; + break; + case cudaComputeModeProhibited: + std::cout << "Prohibited" << std::endl; + break; + } + std::cout << "Clock rate: " << deviceProp.clockRate << std::endl; + std::cout << "Integrated: " << deviceProp.integrated << std::endl; + std::cout << "Device copy overlap: "; + if (deviceProp.deviceOverlap) { + std::cout << "Enabled" << std::endl; + } + else { + std::cout << "Disabled" << std::endl; + } + std::cout << "Kernel execution timeout: "; + if (deviceProp.kernelExecTimeoutEnabled) { + std::cout << "Enabled" << std::endl; + } + else { + std::cout << "Disabled" << std::endl; + } + std::cout << "---- Memory information for device " << i << " ----" << std::endl; + std::cout << "Total global memory: " << deviceProp.totalGlobalMem << std::endl; + std::cout << "Total constant memory: " << deviceProp.totalConstMem << std::endl; + std::cout << "Multiprocessor count: " << deviceProp.multiProcessorCount << std::endl; + std::cout << "Shared memory per multiprocessor: " << deviceProp.sharedMemPerMultiprocessor << std::endl; + std::cout << "Registers per multiprocessor: " << deviceProp.regsPerMultiprocessor << std::endl; + std::cout << "Max threads per multiprocessor: " << deviceProp.maxThreadsPerMultiProcessor << std::endl; + std::cout << "Max grid dimensions: [" << deviceProp.maxGridSize[0] << ", " << deviceProp.maxGridSize[1] << ", " << deviceProp.maxGridSize[2] << "]" << std::endl; + std::cout << "Max threads per block: " << deviceProp.maxThreadsPerBlock << std::endl; + std::cout << "Max registers per block: " << deviceProp.regsPerBlock << std::endl; + std::cout << "Max thread dimensions: [" << 
deviceProp.maxThreadsDim[0] << ", " << deviceProp.maxThreadsDim[1] << ", " << deviceProp.maxThreadsDim[2] << "]" << std::endl; + } +} + //==================================== // Main loop //==================================== - void runCUDA() { + float runCUDA() { // Map OpenGL buffer object for writing from CUDA on a single GPU // No data is moved (Win & Linux). When mapped to CUDA, OpenGL should not // use this buffer @@ -198,7 +259,12 @@ void initShaders(GLuint * program) { cudaGLMapBufferObject((void**)&dptrVertPositions, boidVBO_positions); cudaGLMapBufferObject((void**)&dptrVertVelocities, boidVBO_velocities); + // Record cuda simulation + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); // execute the kernel + cudaEventRecord(start); #if UNIFORM_GRID && COHERENT_GRID Boids::stepSimulationCoherentGrid(DT); #elif UNIFORM_GRID @@ -206,6 +272,11 @@ void initShaders(GLuint * program) { #else Boids::stepSimulationNaive(DT); #endif + cudaEventRecord(stop); + + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); #if VISUALIZE Boids::copyBoidsToVBO(dptrVertPositions, dptrVertVelocities); @@ -213,6 +284,8 @@ void initShaders(GLuint * program) { // unmap buffer object cudaGLUnmapBufferObject(boidVBO_positions); cudaGLUnmapBufferObject(boidVBO_velocities); + + return milliseconds; } void mainLoop() { @@ -235,13 +308,14 @@ void initShaders(GLuint * program) { frame = 0; } - runCUDA(); + float simulationTimeMs = runCUDA(); std::ostringstream ss; ss << "["; ss.precision(1); ss << std::fixed << fps; - ss << " fps] " << deviceName; + ss << " fps] " << "[" << simulationTimeMs << " ms/frame]"; + ss << deviceName; glfwSetWindowTitle(window, ss.str().c_str()); glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); diff --git a/src/main.hpp b/src/main.hpp index 6cdaa93..9c2110b 100644 --- a/src/main.hpp +++ b/src/main.hpp @@ -70,7 +70,7 @@ void keyCallback(GLFWwindow* window, int key, int scancode, 
int action, int mods void mouseButtonCallback(GLFWwindow* window, int button, int action, int mods); void mousePositionCallback(GLFWwindow* window, double xpos, double ypos); void updateCamera(); -void runCUDA(); +float runCUDA(); //==================================== // Setup/init Stuff @@ -78,3 +78,4 @@ void runCUDA(); bool init(int argc, char **argv); void initVAO(); void initShaders(GLuint *program); +void printCudaDeviceProperties();