diff --git a/README.md b/README.md index 98dd9a8..5c7198e 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,46 @@ **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Jian Ru +* Tested on: Windows 10, i7-4850 @ 2.30GHz 16GB, GT 750M 2GB (Personal) -### (TODO: Your README) +--- +### Results -Include screenshots, analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +* Parameters + * Number of particles: 40,000 + * Blocks: 40, 1, 1 + * Threads: 128, 1, 1 + * Rule distances: 5.0, 3.0, 5.0 + * Rule scales: 0.01, 0.1, 0.1 + * Scene scale: 100.0 + * Delta time: 0.2 + ![result](images/demo1.gif) + +--- +### Analysis + +* Simulation Time vs. Number of Particles + * For the brute force version, the simulation time grows polynomially as particle count increases. This is expected because + even though the complexity of each thread is O(n), there are too many particles and hence too many threads. So it is + impossible to parallelize all the threads at once. Therefore, the time complexity should still grow in a polynomial fashion + but less sensitive than a sequential implementation. + * For the scattered and coherent grid versions, they still demonstrate a little polynomial growth but the speed is much + slower and their growth seems almost linear. This is expected because each particle has much fewer neighbours to examine + in each step. Statistically, the number of neighbours grows linearly as particle count increases. But the number of threads + also increases at the same time so the time complexity of the implementation should be a little bit more expensive than O(n). + ![sp](images/st_pc.png) + +* Simulation Time vs. Block Size + * The relationship between simulation time and block size is somewhat random but expected. 
Since it is guaranteed that the GPU + executes each block on a single SM, putting more threads that access the same memory region with similar access pattern + should increase performance due to the increased cache hit-rate. But putting too many threads in a single block may hinder + performance if an SM cannot execute all the threads in a block at once. + ![sb](images/bs_st.png) + +* Coherent Grid vs. Scattered Grid + * From my experimentation, coherent grid performs better than scattered grid. This is expected because even though + reordering position and velocity arrays has a cost, in this case, the gain from increased cache hit-rate outweighs the cost + of copying and additional kernel calls. Since adjacent threads tend to have shared neighbouring cells, they tend to access + the same memory regions when they execute. Even for just one thread, it also enjoys a cache hit-rate increase because after + sorting, the data of particles in the same cell are stored closely in one consecutive memory region. diff --git a/images/bs_st.png b/images/bs_st.png new file mode 100644 index 0000000..622b1b1 Binary files /dev/null and b/images/bs_st.png differ diff --git a/images/demo1.gif b/images/demo1.gif new file mode 100644 index 0000000..2bdd922 Binary files /dev/null and b/images/demo1.gif differ diff --git a/images/st_pc.png b/images/st_pc.png new file mode 100644 index 0000000..0b1070b Binary files /dev/null and b/images/st_pc.png differ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index fdd636d..750f0cb 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -10,5 +10,5 @@ set(SOURCE_FILES cuda_add_library(src ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_30 ) diff --git a/src/kernel.cu b/src/kernel.cu index 30356b9..e64e717 100644 --- a/src/kernel.cu +++ b/src/kernel.cu @@ -76,6 +76,7 @@ glm::vec3 *dev_vel2; // For efficient sorting and the uniform grid. These should always be parallel. 
int *dev_particleArrayIndices; // What index in dev_pos and dev_velX represents this particle? int *dev_particleGridIndices; // What grid cell is this particle in? + // needed for use with thrust thrust::device_ptr dev_thrust_particleArrayIndices; thrust::device_ptr dev_thrust_particleGridIndices; @@ -85,6 +86,7 @@ int *dev_gridCellEndIndices; // to this cell? // TODO-2.3 - consider what additional buffers you might need to reshuffle // the position and velocity data to be coherent within cells. +glm::vec3 *dev_pos2; // LOOK-2.1 - Grid parameters based on simulation parameters. // These are automatically computed for you in Boids::initSimulation @@ -94,6 +96,13 @@ float gridCellWidth; float gridInverseCellWidth; glm::vec3 gridMinimum; +#define NUM_SIM_TIMES 500 +float Boids::averageSimTime; +float accumSimTime; +int simTimeCount; +cudaEvent_t simStart; +cudaEvent_t simEnd; + /****************** * initSimulation * ******************/ @@ -169,7 +178,32 @@ void Boids::initSimulation(int N) { gridMinimum.z -= halfGridWidth; // TODO-2.1 TODO-2.3 - Allocate additional buffers here. 
- cudaThreadSynchronize(); + cudaMalloc((void **)&dev_particleArrayIndices, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!"); + + cudaMalloc((void **)&dev_particleGridIndices, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!"); + + cudaMalloc((void **)&dev_gridCellStartIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!"); + + cudaMalloc((void **)&dev_gridCellEndIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!"); + + cudaMalloc((void **)&dev_pos2, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_pos2 failed!"); + + // needed for use with thrust + dev_thrust_particleArrayIndices = thrust::device_ptr(dev_particleArrayIndices); + dev_thrust_particleGridIndices = thrust::device_ptr(dev_particleGridIndices); + + averageSimTime = 0.f; + accumSimTime = 0.f; + simTimeCount = 0; + cudaEventCreate(&simStart); + cudaEventCreate(&simEnd); + + cudaDeviceSynchronize(); } @@ -215,7 +249,7 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) checkCUDAErrorWithLine("copyBoidsToVBO failed!"); - cudaThreadSynchronize(); + cudaDeviceSynchronize(); } @@ -229,11 +263,64 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) * Compute the new velocity on the body with index `iSelf` due to the `N` boids * in the `pos` and `vel` arrays. 
*/ -__device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) { - // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves - // Rule 2: boids try to stay a distance d away from each other - // Rule 3: boids try to match the speed of surrounding boids - return glm::vec3(0.0f, 0.0f, 0.0f); +__device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) +{ + // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves + // Rule 2: boids try to stay a distance d away from each other + // Rule 3: boids try to match the speed of surrounding boids + glm::vec3 com(0.f); // center of mass + glm::vec3 d(0.f); // displacement from neighbours + glm::vec3 av(0.f); // average velocities of neighbours + uint32_t r1Count = 0; + uint32_t r3Count = 0; + + const glm::vec3 &myPos = pos[iSelf]; + const glm::vec3 &myVel = vel[iSelf]; + + for (int i = 0; i < N; ++i) + { + if (i == iSelf) continue; + + const glm::vec3 &npos = pos[i]; + const glm::vec3 &nvel = vel[i]; + + const float dist = glm::distance(myPos, npos); + + if (dist < rule1Distance) + { + ++r1Count; + com += npos; + } + + if (dist < rule2Distance) + { + d += myPos - npos; + } + + if (dist < rule3Distance) + { + ++r3Count; + av += nvel; + } + } + + if (r1Count) com /= r1Count; else com = myPos; + if (r3Count) av /= r3Count; else av = myVel; + + glm::vec3 v1 = rule1Scale * (com - myPos); + glm::vec3 v2 = rule2Scale * d; + glm::vec3 v3 = rule3Scale * (av - myVel); + + return v1 + v2 + v3; +} + +__device__ void clampVel(glm::vec3 &vel) +{ + float speed = glm::length(vel) + FLT_EPSILON; + float tooFast = static_cast(speed > maxSpeed); + float notTooFast = 1.f - tooFast; + float scale = tooFast * (1.f / speed * maxSpeed) + notTooFast; + vel *= scale; } /** @@ -241,10 +328,19 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po * For each of the 
`N` bodies, update its position based on its current velocity. */ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos, - glm::vec3 *vel1, glm::vec3 *vel2) { - // Compute a new velocity based on pos and vel1 - // Clamp the speed - // Record the new velocity into vel2. Question: why NOT vel1? + glm::vec3 *vel1, glm::vec3 *vel2) +{ + // Compute a new velocity based on pos and vel1 + // Clamp the speed + // Record the new velocity into vel2. Question: why NOT vel1? + int boidIdx = threadIdx.x + (blockIdx.x * blockDim.x); + + glm::vec3 newVel = vel1[boidIdx]; + glm::vec3 dv = computeVelocityChange(N, boidIdx, pos, vel1); + newVel += dv; + + clampVel(newVel); + vel2[boidIdx] = newVel; } /** @@ -282,13 +378,40 @@ __device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) { return x + y * gridResolution + z * gridResolution * gridResolution; } +__device__ void computeCellIdx3D(const glm::vec3 &myPos, const glm::vec3 &gridMin, + float inverseCellWidth, int gridResolution, int &cx, int &cy, int &cz) +{ + glm::vec3 fci = (myPos - gridMin) * inverseCellWidth; + + cx = static_cast(fci.x); + cy = static_cast(fci.y); + cz = static_cast(fci.z); +} + +__device__ int computeCellIdx(const glm::vec3 &myPos, const glm::vec3 &gridMin, + float inverseCellWidth, int gridResolution) +{ + int cx, cy, cz; + computeCellIdx3D(myPos, gridMin, inverseCellWidth, gridResolution, cx, cy, cz); + return gridIndex3Dto1D(cx, cy, cz, gridResolution); +} + __global__ void kernComputeIndices(int N, int gridResolution, - glm::vec3 gridMin, float inverseCellWidth, - glm::vec3 *pos, int *indices, int *gridIndices) { - // TODO-2.1 - // - Label each boid with the index of its grid cell. - // - Set up a parallel array of integer indices as pointers to the actual - // boid data in pos and vel1/vel2 + glm::vec3 gridMin, float inverseCellWidth, + glm::vec3 *pos, int *indices, int *gridIndices) +{ + // TODO-2.1 + // - Label each boid with the index of its grid cell. 
+ // - Set up a parallel array of integer indices as pointers to the actual + // boid data in pos and vel1/vel2 + int boidIdx = blockIdx.x * blockDim.x + threadIdx.x; + if (boidIdx >= N) return; + + const glm::vec3 &myPos = pos[boidIdx]; + int ci = computeCellIdx(myPos, gridMin, inverseCellWidth, gridResolution); + + indices[boidIdx] = boidIdx; + gridIndices[boidIdx] = ci; } // LOOK-2.1 Consider how this could be useful for indicating that a cell @@ -301,87 +424,396 @@ __global__ void kernResetIntBuffer(int N, int *intBuffer, int value) { } __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices, - int *gridCellStartIndices, int *gridCellEndIndices) { - // TODO-2.1 - // Identify the start point of each cell in the gridIndices array. - // This is basically a parallel unrolling of a loop that goes - // "this index doesn't match the one before it, must be a new cell!" + int *gridCellStartIndices, int *gridCellEndIndices) +{ + // TODO-2.1 + // Identify the start point of each cell in the gridIndices array. + // This is basically a parallel unrolling of a loop that goes + // "this index doesn't match the one before it, must be a new cell!" + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= N) return; + + int preIdx = (idx + N - 1) % N; + int nextIdx = (idx + 1) % N; + int ci = particleGridIndices[idx]; + int preCi = particleGridIndices[preIdx]; + int nextCi = particleGridIndices[nextIdx]; + + if (preCi != ci) + { + gridCellStartIndices[ci] = idx; + } + if (nextCi != ci) + { + gridCellEndIndices[ci] = idx; + } } __global__ void kernUpdateVelNeighborSearchScattered( - int N, int gridResolution, glm::vec3 gridMin, - float inverseCellWidth, float cellWidth, - int *gridCellStartIndices, int *gridCellEndIndices, - int *particleArrayIndices, - glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { - // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce - // the number of boids that need to be checked. 
- // - Identify the grid cell that this particle is in - // - Identify which cells may contain neighbors. This isn't always 8. - // - For each cell, read the start/end indices in the boid pointer array. - // - Access each boid in the cell and compute velocity change from - // the boids rules, if this boid is within the neighborhood distance. - // - Clamp the speed change before putting the new speed in vel2 + int N, int gridResolution, glm::vec3 gridMin, + float inverseCellWidth, float cellWidth, + int *gridCellStartIndices, int *gridCellEndIndices, + int *particleArrayIndices, + glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) +{ + // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce + // the number of boids that need to be checked. + // - Identify the grid cell that this particle is in + // - Identify which cells may contain neighbors. This isn't always 8. + // - For each cell, read the start/end indices in the boid pointer array. + // - Access each boid in the cell and compute velocity change from + // the boids rules, if this boid is within the neighborhood distance. + // - Clamp the speed change before putting the new speed in vel2 + int boidIdx = blockIdx.x * blockDim.x + threadIdx.x; + if (boidIdx >= N) return; + + // Adjacent particles in @particleArrayIndices are located in the same or + // adjacent grid cells. This improves cache hit rates A LOT!!! + boidIdx = particleArrayIndices[boidIdx]; + + const glm::vec3 &myPos = pos[boidIdx]; + const glm::vec3 &myVel = vel1[boidIdx]; + int cx, cy, cz; + glm::vec3 fci = (myPos - gridMin) * inverseCellWidth; + cx = int(fci.x); + cy = int(fci.y); + cz = int(fci.z); + glm::vec3 fc = fci - glm::vec3(cx, cy, cz); + + // rule1 + int dx; + int dy; + int dz; + + // This neighour accessing scheme improve the chance that adjacent + // threads access the same grid cell at the same time + dx = (fc.x >= .5f) ? 1 : -1; + dy = (fc.y >= .5f) ? 1 : -1; + dz = (fc.z >= .5f) ? 
1 : -1; + + int cis[8]; + cis[0] = gridIndex3Dto1D(cx , cy , cz , gridResolution); + cis[1] = gridIndex3Dto1D(cx + dx, cy , cz , gridResolution); + cis[2] = gridIndex3Dto1D(cx , cy + dy, cz , gridResolution); + cis[3] = gridIndex3Dto1D(cx + dx, cy + dy, cz , gridResolution); + cis[4] = gridIndex3Dto1D(cx , cy , cz + dz, gridResolution); + cis[5] = gridIndex3Dto1D(cx + dx, cy , cz + dz, gridResolution); + cis[6] = gridIndex3Dto1D(cx , cy + dy, cz + dz, gridResolution); + cis[7] = gridIndex3Dto1D(cx + dx, cy + dy, cz + dz, gridResolution); + + const int gridCellCount = gridResolution * gridResolution * gridResolution; + glm::vec3 com(0.f); + glm::vec3 d(0.f); + glm::vec3 av(0.f); + int r1Count = 0; + int r3Count = 0; + + for (int j = 0; j < 8; ++j) + { + int cellIdx = cis[j]; + if (cellIdx < 0 || cellIdx >= gridCellCount) continue; + + int cellStart = gridCellStartIndices[cellIdx]; + int cellEnd = gridCellEndIndices[cellIdx]; + + for (int i = cellStart; i <= cellEnd; ++i) + { + int nidx = particleArrayIndices[i]; + if (nidx == boidIdx) continue; + + const glm::vec3 &npos = pos[nidx]; + const glm::vec3 &nvel = vel1[nidx]; + const glm::vec3 dpos = myPos - npos; + float dist = glm::length(dpos); + + if (dist < rule1Distance) + { + com += npos; + ++r1Count; + } + if (dist < rule2Distance) + { + d += dpos; + } + if (dist < rule3Distance) + { + av += nvel; + ++r3Count; + } + } + } + + if (r1Count) com /= float(r1Count); else com = myPos; + if (r3Count) av /= float(r3Count); else av = myVel; + glm::vec3 r1Vel = (com - myPos) * rule1Scale; + glm::vec3 r2Vel = d * rule2Scale; + glm::vec3 r3Vel = (av - myVel) * rule3Scale; + + glm::vec3 newVel = myVel + r1Vel + r2Vel + r3Vel; + if (glm::length(newVel) > maxSpeed) + { + newVel = glm::normalize(newVel) * maxSpeed; + } + vel2[boidIdx] = newVel; } __global__ void kernUpdateVelNeighborSearchCoherent( - int N, int gridResolution, glm::vec3 gridMin, - float inverseCellWidth, float cellWidth, - int *gridCellStartIndices, int 
*gridCellEndIndices, - glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { - // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered, - // except with one less level of indirection. - // This should expect gridCellStartIndices and gridCellEndIndices to refer - // directly to pos and vel1. - // - Identify the grid cell that this particle is in - // - Identify which cells may contain neighbors. This isn't always 8. - // - For each cell, read the start/end indices in the boid pointer array. - // DIFFERENCE: For best results, consider what order the cells should be - // checked in to maximize the memory benefits of reordering the boids data. - // - Access each boid in the cell and compute velocity change from - // the boids rules, if this boid is within the neighborhood distance. - // - Clamp the speed change before putting the new speed in vel2 + int N, int gridResolution, glm::vec3 gridMin, + float inverseCellWidth, float cellWidth, + int *gridCellStartIndices, int *gridCellEndIndices, + glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) +{ + // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered, + // except with one less level of indirection. + // This should expect gridCellStartIndices and gridCellEndIndices to refer + // directly to pos and vel1. + // - Identify the grid cell that this particle is in + // - Identify which cells may contain neighbors. This isn't always 8. + // - For each cell, read the start/end indices in the boid pointer array. + // DIFFERENCE: For best results, consider what order the cells should be + // checked in to maximize the memory benefits of reordering the boids data. + // - Access each boid in the cell and compute velocity change from + // the boids rules, if this boid is within the neighborhood distance. 
+ // - Clamp the speed change before putting the new speed in vel2 + int boidIdx = blockIdx.x * blockDim.x + threadIdx.x; + if (boidIdx >= N) return; + + const glm::vec3 &myPos = pos[boidIdx]; + const glm::vec3 &myVel = vel1[boidIdx]; + int cx, cy, cz; + glm::vec3 fci = (myPos - gridMin) * inverseCellWidth; + cx = int(fci.x); + cy = int(fci.y); + cz = int(fci.z); + glm::vec3 fc = fci - glm::vec3(cx, cy, cz); + + // rule1 + int dx; + int dy; + int dz; + + // This neighour accessing scheme improve the chance that adjacent + // threads access the same grid cell at the same time + dx = (fc.x >= .5f) ? 1 : -1; + dy = (fc.y >= .5f) ? 1 : -1; + dz = (fc.z >= .5f) ? 1 : -1; + + int cis[8]; + cis[0] = gridIndex3Dto1D(cx, cy, cz, gridResolution); + cis[1] = gridIndex3Dto1D(cx + dx, cy, cz, gridResolution); + cis[2] = gridIndex3Dto1D(cx, cy + dy, cz, gridResolution); + cis[3] = gridIndex3Dto1D(cx + dx, cy + dy, cz, gridResolution); + cis[4] = gridIndex3Dto1D(cx, cy, cz + dz, gridResolution); + cis[5] = gridIndex3Dto1D(cx + dx, cy, cz + dz, gridResolution); + cis[6] = gridIndex3Dto1D(cx, cy + dy, cz + dz, gridResolution); + cis[7] = gridIndex3Dto1D(cx + dx, cy + dy, cz + dz, gridResolution); + + const int gridCellCount = gridResolution * gridResolution * gridResolution; + glm::vec3 com(0.f); + glm::vec3 d(0.f); + glm::vec3 av(0.f); + int r1Count = 0; + int r3Count = 0; + + for (int j = 0; j < 8; ++j) + { + int cellIdx = cis[j]; + if (cellIdx < 0 || cellIdx >= gridCellCount) continue; + + int cellStart = gridCellStartIndices[cellIdx]; + int cellEnd = gridCellEndIndices[cellIdx]; + + for (int i = cellStart; i <= cellEnd; ++i) + { + if (i == boidIdx) continue; + + const glm::vec3 &npos = pos[i]; + const glm::vec3 &nvel = vel1[i]; + const glm::vec3 dpos = myPos - npos; + float dist = glm::length(dpos); + + if (dist < rule1Distance) + { + com += npos; + ++r1Count; + } + if (dist < rule2Distance) + { + d += dpos; + } + if (dist < rule3Distance) + { + av += nvel; + ++r3Count; + } + } 
+ } + + if (r1Count) com /= float(r1Count); else com = myPos; + if (r3Count) av /= float(r3Count); else av = myVel; + glm::vec3 r1Vel = (com - myPos) * rule1Scale; + glm::vec3 r2Vel = d * rule2Scale; + glm::vec3 r3Vel = (av - myVel) * rule3Scale; + + glm::vec3 newVel = myVel + r1Vel + r2Vel + r3Vel; + if (glm::length(newVel) > maxSpeed) + { + newVel = glm::normalize(newVel) * maxSpeed; + } + vel2[boidIdx] = newVel; } /** * Step the entire N-body simulation by `dt` seconds. */ -void Boids::stepSimulationNaive(float dt) { - // TODO-1.2 - use the kernels you wrote to step the simulation forward in time. - // TODO-1.2 ping-pong the velocity buffers +void Boids::stepSimulationNaive(float dt) +{ + // TODO-1.2 - use the kernels you wrote to step the simulation forward in time. + // TODO-1.2 ping-pong the velocity buffers + cudaEventRecord(simStart); + + dim3 blocksPerGrid((numObjects + blockSize - 1) / blockSize); + kernUpdateVelocityBruteForce<<>>(numObjects, dev_pos, dev_vel1, dev_vel2); + kernUpdatePos<<>>(numObjects, dt, dev_pos, dev_vel2); + + cudaEventRecord(simEnd); + cudaEventSynchronize(simEnd); + + float timeElapsedMilliseconds = 0.f; + cudaEventElapsedTime(&timeElapsedMilliseconds, simStart, simEnd); + accumSimTime += timeElapsedMilliseconds; + ++simTimeCount; + if (simTimeCount >= NUM_SIM_TIMES) + { + averageSimTime = accumSimTime / static_cast(simTimeCount); + accumSimTime = 0.f; + simTimeCount = 0; + } + + glm::vec3 *tmp = dev_vel1; + dev_vel1 = dev_vel2; + dev_vel2 = tmp; +} + +void Boids::stepSimulationScatteredGrid(float dt) +{ + // TODO-2.1 + // Uniform Grid Neighbor search using Thrust sort. + // In Parallel: + // - label each particle with its array index as well as its grid index. + // Use 2x width grids. + // - Unstable key sort using Thrust. A stable sort isn't necessary, but you + // are welcome to do a performance comparison. 
+ // - Naively unroll the loop for finding the start and end indices of each + // cell's data pointers in the array of boid indices + // - Perform velocity updates using neighbor search + // - Update positions + // - Ping-pong buffers as needed + cudaEventRecord(simStart); + + dim3 cellBlocksPerGrid((gridCellCount + blockSize - 1) / blockSize); + kernResetIntBuffer << > >(gridCellCount, dev_gridCellStartIndices, -1); + kernResetIntBuffer << > >(gridCellCount, dev_gridCellEndIndices, -2); + + dim3 blocksPerGrid((numObjects + blockSize - 1) / blockSize); + kernComputeIndices << > >(numObjects, gridSideCount, gridMinimum, + gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices); + + thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices); + + kernIdentifyCellStartEnd << > >(numObjects, dev_particleGridIndices, + dev_gridCellStartIndices, dev_gridCellEndIndices); + + kernUpdateVelNeighborSearchScattered << > >(numObjects, gridSideCount, + gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, + dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2); + kernUpdatePos << > >(numObjects, dt, dev_pos, dev_vel2); + + cudaEventRecord(simEnd); + cudaEventSynchronize(simEnd); + + float timeElapsedMilliseconds = 0.f; + cudaEventElapsedTime(&timeElapsedMilliseconds, simStart, simEnd); + accumSimTime += timeElapsedMilliseconds; + ++simTimeCount; + if (simTimeCount >= NUM_SIM_TIMES) + { + averageSimTime = accumSimTime / static_cast(simTimeCount); + accumSimTime = 0.f; + simTimeCount = 0; + } + + glm::vec3 *tmp = dev_vel1; + dev_vel1 = dev_vel2; + dev_vel2 = tmp; } -void Boids::stepSimulationScatteredGrid(float dt) { - // TODO-2.1 - // Uniform Grid Neighbor search using Thrust sort. - // In Parallel: - // - label each particle with its array index as well as its grid index. - // Use 2x width grids. - // - Unstable key sort using Thrust. 
A stable sort isn't necessary, but you - // are welcome to do a performance comparison. - // - Naively unroll the loop for finding the start and end indices of each - // cell's data pointers in the array of boid indices - // - Perform velocity updates using neighbor search - // - Update positions - // - Ping-pong buffers as needed +__global__ void kernShuffle(int N, const int *srcPositions, const glm::vec3 *src, glm::vec3 *dest) +{ + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= N) return; + int srcIdx = srcPositions[idx]; + dest[idx] = src[srcIdx]; } -void Boids::stepSimulationCoherentGrid(float dt) { - // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid - // Uniform Grid Neighbor search using Thrust sort on cell-coherent data. - // In Parallel: - // - Label each particle with its array index as well as its grid index. - // Use 2x width grids - // - Unstable key sort using Thrust. A stable sort isn't necessary, but you - // are welcome to do a performance comparison. - // - Naively unroll the loop for finding the start and end indices of each - // cell's data pointers in the array of boid indices - // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all - // the particle data in the simulation array. - // CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED - // - Perform velocity updates using neighbor search - // - Update positions - // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE. +void Boids::stepSimulationCoherentGrid(float dt) +{ + // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid + // Uniform Grid Neighbor search using Thrust sort on cell-coherent data. + // In Parallel: + // - Label each particle with its array index as well as its grid index. + // Use 2x width grids + // - Unstable key sort using Thrust. A stable sort isn't necessary, but you + // are welcome to do a performance comparison. 
+ // - Naively unroll the loop for finding the start and end indices of each + // cell's data pointers in the array of boid indices + // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all + // the particle data in the simulation array. + // CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED + // - Perform velocity updates using neighbor search + // - Update positions + // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE. + cudaEventRecord(simStart); + + dim3 cellBlocksPerGrid((gridCellCount + blockSize - 1) / blockSize); + kernResetIntBuffer << > >(gridCellCount, dev_gridCellStartIndices, -1); + kernResetIntBuffer << > >(gridCellCount, dev_gridCellEndIndices, -2); + + dim3 blocksPerGrid((numObjects + blockSize - 1) / blockSize); + kernComputeIndices << > >(numObjects, gridSideCount, gridMinimum, + gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices); + + thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, + dev_thrust_particleArrayIndices); + + kernShuffle << > >(numObjects, dev_particleArrayIndices, dev_pos, dev_pos2); + kernShuffle << > >(numObjects, dev_particleArrayIndices, dev_vel1, dev_vel2); + + kernIdentifyCellStartEnd << > >(numObjects, dev_particleGridIndices, + dev_gridCellStartIndices, dev_gridCellEndIndices); + + kernUpdateVelNeighborSearchCoherent << > >(numObjects, gridSideCount, + gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, + dev_pos2, dev_vel2, dev_vel1); + kernUpdatePos << > >(numObjects, dt, dev_pos2, dev_vel1); + + cudaEventRecord(simEnd); + cudaEventSynchronize(simEnd); + + float timeElapsedMilliseconds = 0.f; + cudaEventElapsedTime(&timeElapsedMilliseconds, simStart, simEnd); + accumSimTime += timeElapsedMilliseconds; + ++simTimeCount; + if (simTimeCount >= NUM_SIM_TIMES) + { + averageSimTime = accumSimTime / static_cast(simTimeCount); + accumSimTime = 0.f; + simTimeCount = 0; + } + + 
glm::vec3 *tmp = dev_pos; + dev_pos = dev_pos2; + dev_pos2 = tmp; } void Boids::endSimulation() { @@ -390,6 +822,14 @@ void Boids::endSimulation() { cudaFree(dev_pos); // TODO-2.1 TODO-2.3 - Free any additional buffers here. + cudaFree(dev_particleArrayIndices); + cudaFree(dev_particleGridIndices); + cudaFree(dev_gridCellStartIndices); + cudaFree(dev_gridCellEndIndices); + cudaFree(dev_pos2); + + cudaEventDestroy(simStart); + cudaEventDestroy(simEnd); } void Boids::unitTest() { diff --git a/src/kernel.h b/src/kernel.h index 3d3da72..8408c9f 100644 --- a/src/kernel.h +++ b/src/kernel.h @@ -18,4 +18,6 @@ namespace Boids { void endSimulation(); void unitTest(); + + extern float averageSimTime; } diff --git a/src/main.cpp b/src/main.cpp index e416836..55ca457 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,11 +14,11 @@ // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID #define VISUALIZE 1 -#define UNIFORM_GRID 0 -#define COHERENT_GRID 0 +#define UNIFORM_GRID 1 +#define COHERENT_GRID 1 // LOOK-1.2 - change this to adjust particle count in the simulation -const int N_FOR_VIS = 5000; +const int N_FOR_VIS = 10000; const float DT = 0.2f; /** @@ -42,6 +42,7 @@ int main(int argc, char* argv[]) { std::string deviceName; GLFWwindow *window; +cudaGraphicsResource *posVboHandle, *velVboHandle; /** * Initialization of CUDA and GLFW. @@ -101,12 +102,8 @@ bool init(int argc, char **argv) { // Initialize drawing state initVAO(); - // Default to device ID 0. If you have more than one GPU and want to test a non-default one, - // change the device ID. 
- cudaGLSetGLDevice(0); - - cudaGLRegisterBufferObject(boidVBO_positions); - cudaGLRegisterBufferObject(boidVBO_velocities); + cudaGraphicsGLRegisterBuffer(&posVboHandle, boidVBO_positions, cudaGraphicsRegisterFlagsNone); + cudaGraphicsGLRegisterBuffer(&velVboHandle, boidVBO_velocities, cudaGraphicsRegisterFlagsNone); // Initialize N-body simulation Boids::initSimulation(N_FOR_VIS); @@ -194,9 +191,13 @@ void initShaders(GLuint * program) { float4 *dptr = NULL; float *dptrVertPositions = NULL; float *dptrVertVelocities = NULL; + size_t devPosSize; + size_t devVelSize; - cudaGLMapBufferObject((void**)&dptrVertPositions, boidVBO_positions); - cudaGLMapBufferObject((void**)&dptrVertVelocities, boidVBO_velocities); + cudaGraphicsMapResources(1, &posVboHandle, 0); + cudaGraphicsMapResources(1, &velVboHandle, 0); + cudaGraphicsResourceGetMappedPointer((void **)&dptrVertPositions, &devPosSize, posVboHandle); + cudaGraphicsResourceGetMappedPointer((void **)&dptrVertVelocities, &devVelSize, velVboHandle); // execute the kernel #if UNIFORM_GRID && COHERENT_GRID @@ -210,9 +211,10 @@ void initShaders(GLuint * program) { #if VISUALIZE Boids::copyBoidsToVBO(dptrVertPositions, dptrVertVelocities); #endif + // unmap buffer object - cudaGLUnmapBufferObject(boidVBO_positions); - cudaGLUnmapBufferObject(boidVBO_velocities); + cudaGraphicsUnmapResources(1, &posVboHandle, 0); + cudaGraphicsUnmapResources(1, &velVboHandle, 0); } void mainLoop() { @@ -239,9 +241,11 @@ void initShaders(GLuint * program) { std::ostringstream ss; ss << "["; - ss.precision(1); + ss.precision(2); ss << std::fixed << fps; - ss << " fps] " << deviceName; + ss << " fps] " + << " [" << Boids::averageSimTime << " ms] " + << deviceName; glfwSetWindowTitle(window, ss.str().c_str()); glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); diff --git a/src/main.hpp b/src/main.hpp index 6cdaa93..821c23a 100644 --- a/src/main.hpp +++ b/src/main.hpp @@ -36,9 +36,9 @@ const float fovy = (float) (PI / 4); const float zNear = 
0.10f; const float zFar = 10.0f; // LOOK-1.2: for high DPI displays, you may want to double these settings. -int width = 1280; -int height = 720; -int pointSize = 2.0f; +int width = 2560; +int height = 1440; +int pointSize = 3.0f; // For camera controls bool leftMousePressed = false;