diff --git a/README.md b/README.md index 98dd9a8..fd1f360 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,79 @@ **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Trung Le +* Windows 10 Home, i7-4790 CPU @ 3.60GHz 12GB, GTX 980 Ti (Personal desktop) -### (TODO: Your README) +### Flocking -Include screenshots, analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +#### Description + +This project implemented three different algorithms for flocking behavior in CUDA: brute-force, uniform grid, coherent uniform grid. + +![alt text](https://github.com/trungtle/Project1-CUDA-Flocking/blob/master/images/screenshots/Uniform_5000_boids.gif "Flocking simulation") + +**---- General information for CUDA device ----** +- Device name: GeForce GTX 980 Ti +- Compute capability: 5.2 +- Compute mode: Default +- Clock rate: 1076000 +- Integrated: 0 +- Device copy overlap: Enabled +- Kernel execution timeout: Enabled + +**---- Memory information for CUDA device ----** + +- Total global memory: 6442450944 +- Total constant memory: 65536 +- Multiprocessor count: 22 +- Shared memory per multiprocessor: 98304 +- Registers per multiprocessor: 65536 +- Max threads per multiprocessor: 2048 +- Max grid dimensions: [2147483647, 65535, 65535] +- Max threads per block: 1024 +- Max registers per block: 65536 +- Max thread dimensions: [1024, 1024, 64] +- Threads per block: 512 + + +#### Performance analysis + +To run performance analysis, I used CUDA to wrap around the simulation step and turned off visualization. + +The boid count tested are: 1000, 5000, 50000, 100000, 150000 +The block size tested are: 1, 128, 512 (maxed at 512 threads per block on my machine) + +**Table showing simulation time per frame (in ms) vs. number of boids). 
Block size is 128.** + +| Boid count | Coherent | Uniform | Naive| +| ---------- | -------- | ------- | ---- | +|5000| 0.2| 0.2| 4.3| +|50000| 1.2| 1.5| 166| +|100000| 1.8| 2.6| 654| +|500000| 12.5| 20.6| (crash)| +|1000000| 52| 74| (crash)| + +![alt text](https://github.com/trungtle/Project1-CUDA-Flocking/blob/master/images/charts/performace.png "Naive vs. Coherent vs scattered uniform grid performance") + +For each implementation, as the number of boids increases, there is a big drop in performance. This is due to the fact that for each boid's velocity computation, we need to loop through a list of potential neighbors that can affect the boid. The biggest optimization made in each implementation is an improvement on how efficiently we can iterate through these neighbors by partitioning them into a uniform grid and rearranging their data coherently in memory. + +Changing the block size doesn't seem to increase performance. However, I did find an interesting pattern. At every block size that is a power of 2, the performance is optimal. I tested this with 1,000,000 boids in the coherent grid (~46ms max) and scattered uniform grid (~74ms max). + +**Table showing block size vs. simulation time per frame (in ms)** + +| Block size | Coherent | Uniform | +| ---------- | -------- | ------- | +|32| 45| 74| +|35| 74| 113| +|64| 44| 73| +|100| 54| 87| +|128| 46| 74| +|256| 45| 73| +|400| 54| 85| +|512| 46| 73| +|600| 68| 105| +|1024| 46| 72| + +When comparing between coherent and scattered uniform grids, I did see a great performance improvement with the coherent data implementation as the number of boids increases. From my performance analysis, I observed that at 1000000 boids and kernel block size 128, coherent uniform grid is ~20ms faster than scattered uniform grid. This is a big improvement! This is due to the fact that we take advantage of spatial coherence and can access each neighboring boid's data sequentially in memory. 
+ +**Note**: I also updated the -arch=sm_52 version to make it compatible for my machine diff --git a/images/charts/performace.png b/images/charts/performace.png new file mode 100644 index 0000000..a1e1d23 Binary files /dev/null and b/images/charts/performace.png differ diff --git a/images/screenshots/Uniform_5000_boids.gif b/images/screenshots/Uniform_5000_boids.gif new file mode 100644 index 0000000..ed5de48 Binary files /dev/null and b/images/screenshots/Uniform_5000_boids.gif differ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index fdd636d..eeaabd4 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -10,5 +10,5 @@ set(SOURCE_FILES cuda_add_library(src ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_52 ) diff --git a/src/kernel.cu b/src/kernel.cu index 30356b9..378ae51 100644 --- a/src/kernel.cu +++ b/src/kernel.cu @@ -6,6 +6,8 @@ #include "utilityCore.hpp" #include "kernel.h" +#define INVALID_VALUE (-1) + // LOOK-2.1 potentially useful for doing grid-based neighbor search #ifndef imax #define imax( a, b ) ( ((a) > (b)) ? (a) : (b) ) @@ -83,8 +85,13 @@ thrust::device_ptr dev_thrust_particleGridIndices; int *dev_gridCellStartIndices; // What part of dev_particleArrayIndices belongs int *dev_gridCellEndIndices; // to this cell? + + // TODO-2.3 - consider what additional buffers you might need to reshuffle // the position and velocity data to be coherent within cells. +glm::vec3* dev_sorted_pos; +glm::vec3* dev_sorted_vel1; +glm::vec3* dev_sorted_vel2; // LOOK-2.1 - Grid parameters based on simulation parameters. 
// These are automatically computed for you in Boids::initSimulation @@ -133,6 +140,15 @@ __global__ void kernGenerateRandomPosArray(int time, int N, glm::vec3 * arr, flo } } +__global__ void kernCopyBuffer(int N, glm::vec3* desBuffer, glm::vec3* srcBuffer) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + + desBuffer[index] = srcBuffer[index]; +} + /** * Initialize memory, update some globals */ @@ -151,15 +167,26 @@ void Boids::initSimulation(int N) { cudaMalloc((void**)&dev_vel2, N * sizeof(glm::vec3)); checkCUDAErrorWithLine("cudaMalloc dev_vel2 failed!"); + cudaMalloc((void**)&dev_sorted_pos, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_sorted_pos failed!"); + + cudaMalloc((void**)&dev_sorted_vel1, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_sorted_vel1 failed!"); + cudaMalloc((void**)&dev_sorted_vel2, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_sorted_vel2 failed!"); + // LOOK-1.2 - This is a typical CUDA kernel invocation. kernGenerateRandomPosArray<<>>(1, numObjects, dev_pos, scene_scale); checkCUDAErrorWithLine("kernGenerateRandomPosArray failed!"); + // Initialize the dev_sorted_pos + kernCopyBuffer << > >(numObjects, dev_sorted_pos, dev_pos); + // LOOK-2.1 computing grid params gridCellWidth = 2.0f * std::max(std::max(rule1Distance, rule2Distance), rule3Distance); int halfSideCount = (int)(scene_scale / gridCellWidth) + 1; - gridSideCount = 2 * halfSideCount; + gridSideCount = halfSideCount * 2; gridCellCount = gridSideCount * gridSideCount * gridSideCount; gridInverseCellWidth = 1.0f / gridCellWidth; @@ -169,6 +196,20 @@ void Boids::initSimulation(int N) { gridMinimum.z -= halfGridWidth; // TODO-2.1 TODO-2.3 - Allocate additional buffers here. 
+ cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!"); + cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!"); + + dev_thrust_particleArrayIndices = thrust::device_pointer_cast(dev_particleArrayIndices); + dev_thrust_particleGridIndices = thrust::device_pointer_cast(dev_particleGridIndices); + + cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!"); + cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!"); + + cudaThreadSynchronize(); } @@ -218,7 +259,6 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) cudaThreadSynchronize(); } - /****************** * stepSimulation * ******************/ @@ -230,10 +270,55 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) * in the `pos` and `vel` arrays. 
*/ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) { - // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves - // Rule 2: boids try to stay a distance d away from each other - // Rule 3: boids try to match the speed of surrounding boids - return glm::vec3(0.0f, 0.0f, 0.0f); + + glm::vec3 velCohesion, velSeparation, velAlignment; + glm::vec3 centerOfMass; + unsigned int rule1NeighborCount = 0; + unsigned int rule2NeighborCount = 0; + unsigned int rule3NeighborCount = 0; + + for (auto i = 0; i < N; ++i) { + + // skip itself + if (i == iSelf) continue; + auto distance = glm::distance(pos[iSelf], pos[i]); + + // Rule 1, cohesion: boids fly towards their local perceived center of mass, which excludes themselves + if (distance < rule1Distance) { + centerOfMass += pos[i]; + ++rule1NeighborCount; + } + + // Rule 2: boids try to stay a distance d away from each other + if (distance < rule2Distance) { + velSeparation -= (pos[i] - pos[iSelf]); + ++rule2NeighborCount; + } + + // Rule 3: boids try to match the speed of surrounding boids + if (distance < rule3Distance) { + velAlignment += vel[i]; + ++rule3NeighborCount; + } + } + + if (rule1NeighborCount > 0) { + centerOfMass /= rule1NeighborCount; + velCohesion = (centerOfMass - pos[iSelf]) * rule1Scale; + } + + if (rule2NeighborCount > 0) { + velSeparation *= rule2Scale; + } + + if (rule3NeighborCount > 0) { + velAlignment /= rule3NeighborCount; + velAlignment = velAlignment - vel[iSelf]; + velAlignment *= rule3Scale; + } + + // Return new computed velocity + return velCohesion + velSeparation + velAlignment; } /** @@ -242,9 +327,21 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po */ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { + // Compute a new velocity based on pos and vel1 - // Clamp the speed + int index = threadIdx.x + (blockIdx.x * 
blockDim.x); + if (index >= N) { + return; + } + // Record the new velocity into vel2. Question: why NOT vel1? + // Since the neighbor might be inspecting our current vel1 this frame, we don't want to update vel1 yet + vel2[index] = vel1[index] + computeVelocityChange(N, index, pos, vel1); + + // Clamp the speed + if (glm::length(vel2[index]) >= maxSpeed) { + vel2[index] = glm::normalize(vel2[index]) * maxSpeed; + } } /** @@ -289,6 +386,19 @@ __global__ void kernComputeIndices(int N, int gridResolution, // - Label each boid with the index of its grid cell. // - Set up a parallel array of integer indices as pointers to the actual // boid data in pos and vel1/vel2 + + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < N) { + indices[index] = index; + + // Compute the 3D grid indices for this particle + int gridX = (int)((pos[index].x - gridMin.x) * inverseCellWidth); + int gridY = (int)((pos[index].y - gridMin.y) * inverseCellWidth); + int gridZ = (int)((pos[index].z - gridMin.z) * inverseCellWidth); + + // Convert and store the 1D grid index + gridIndices[index] = gridIndex3Dto1D(gridX, gridY, gridZ, gridResolution); + } } // LOOK-2.1 Consider how this could be useful for indicating that a cell @@ -306,6 +416,28 @@ __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices, // Identify the start point of each cell in the gridIndices array. // This is basically a parallel unrolling of a loop that goes // "this index doesn't match the one before it, must be a new cell!" 
+ + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index > N) { + return; + } + + int cellIndex = particleGridIndices[index]; + if (index == 0) { + gridCellStartIndices[cellIndex] = 0; + return; + } + + if (index == N - 1) { + gridCellEndIndices[cellIndex] = N - 1; + } + + int prevCellIndex = particleGridIndices[index - 1]; + if (cellIndex != prevCellIndex) + { + gridCellStartIndices[cellIndex] = index; + gridCellEndIndices[prevCellIndex] = index - 1; + } } __global__ void kernUpdateVelNeighborSearchScattered( @@ -322,6 +454,126 @@ __global__ void kernUpdateVelNeighborSearchScattered( // - Access each boid in the cell and compute velocity change from // the boids rules, if this boid is within the neighborhood distance. // - Clamp the speed change before putting the new speed in vel2 + + // Compute a new velocity based on pos and vel1 + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + + int boidIndex = particleArrayIndices[index]; + + // Compute the 3D grid indices for this particle + glm::vec3 grid3DIndex = (pos[boidIndex] - gridMin) * inverseCellWidth; + + // Find the other 8 neiboring cells based on which octant the particle is in + glm::vec3 distanceToNeighbor = + pos[boidIndex] - (glm::vec3((int)grid3DIndex.x, (int)grid3DIndex.y, (int)grid3DIndex.z) * cellWidth + gridMin); + float halfCellWidth = cellWidth / 2.0f; + + int neighborMinIndexX, neighborMinIndexY, neighborMinIndexZ, neighborMaxIndexX, neighborMaxIndexY, neighborMaxIndexZ; + + if (distanceToNeighbor.x < halfCellWidth) { + neighborMinIndexX = (int)grid3DIndex.x - 1 >= 0 ? (int)grid3DIndex.x - 1 : (int)grid3DIndex.x; + neighborMaxIndexX = (int)grid3DIndex.x; + } else { + neighborMinIndexX = (int)grid3DIndex.x; + neighborMaxIndexX = (int)grid3DIndex.x + 1 < N ? (int)grid3DIndex.x + 1: (int)grid3DIndex.x; + } + + if (distanceToNeighbor.y < halfCellWidth) { + neighborMinIndexY = (int)grid3DIndex.y - 1 >= 0 ? 
(int)grid3DIndex.y - 1 : (int)grid3DIndex.y; + neighborMaxIndexY = (int)grid3DIndex.y; + } + else { + neighborMinIndexY = (int)grid3DIndex.y; + neighborMaxIndexY = (int)grid3DIndex.y + 1 < N ? (int)grid3DIndex.y + 1 : (int)grid3DIndex.y; + } + + if (distanceToNeighbor.z < halfCellWidth) { + neighborMinIndexZ = (int)grid3DIndex.z - 1 >= 0 ? (int)grid3DIndex.z - 1 : (int)grid3DIndex.z; + neighborMaxIndexZ = (int)grid3DIndex.z; + } + else { + neighborMinIndexZ = (int)grid3DIndex.z; + neighborMaxIndexZ = (int)grid3DIndex.z + 1 < N ? (int)grid3DIndex.z + 1 : (int)grid3DIndex.z; + } + + // These are to keep track of velocity computation + glm::vec3 velCohesion, velSeparation, velAlignment; + glm::vec3 centerOfMass; + unsigned int rule1NeighborCount = 0; + unsigned int rule2NeighborCount = 0; + unsigned int rule3NeighborCount = 0; + + // Loop through neighbor cells and find all boids in them + int neighborIndex, neighborCellStartIndex, neighborCellEndIndex; + for (auto z = neighborMinIndexZ; z <= neighborMaxIndexZ; ++z) { + for (auto y = neighborMinIndexY; y <= neighborMaxIndexY; ++y) { + for (auto x = neighborMinIndexX; x <= neighborMaxIndexX; ++x) { + + // Compute velocity contribution from particles in this neighboring cell + neighborIndex = gridIndex3Dto1D(x, y, z, gridResolution); + neighborCellStartIndex = gridCellStartIndices[neighborIndex]; + neighborCellEndIndex = gridCellEndIndices[neighborIndex]; + + if (neighborCellStartIndex >= 0 && neighborCellStartIndex <= neighborCellEndIndex && + neighborCellEndIndex >= neighborCellStartIndex && neighborCellEndIndex < N) { + + //Compute velocity + for (auto i = neighborCellStartIndex; i < neighborCellEndIndex; ++i) { + + int neighborBoidIndex = particleArrayIndices[i]; + // skip itself + if (neighborBoidIndex == boidIndex) continue; + auto distance = glm::distance(pos[boidIndex], pos[neighborBoidIndex]); + + // Rule 1, cohesion: boids fly towards their local perceived center of mass, which excludes themselves + if 
(distance < rule1Distance) { + centerOfMass += pos[neighborBoidIndex]; + ++rule1NeighborCount; + } + + // Rule 2: boids try to stay a distance d away from each other + if (distance < rule2Distance) { + velSeparation -= (pos[neighborBoidIndex] - pos[boidIndex]); + ++rule2NeighborCount; + } + + // Rule 3: boids try to match the speed of surrounding boids + if (distance < rule3Distance) { + velAlignment += vel1[neighborBoidIndex]; + ++rule3NeighborCount; + } + } + } + } + } + } + + if (rule1NeighborCount > 0) { + centerOfMass /= rule1NeighborCount; + velCohesion = (centerOfMass - pos[boidIndex]) * rule1Scale; + } + + if (rule2NeighborCount > 0) { + velSeparation *= rule2Scale; + } + + if (rule3NeighborCount > 0) { + velAlignment /= rule3NeighborCount; + velAlignment = velAlignment - vel1[boidIndex]; + velAlignment *= rule3Scale; + } + + // Record the new velocity into vel2. Question: why NOT vel1? + // Since the neighbor might be inspecting our current vel1 this frame, we don't want to update vel1 yet + vel2[boidIndex] = vel1[boidIndex] + velCohesion + velSeparation + velAlignment; + + // Clamp the speed + if (glm::length(vel2[boidIndex]) >= maxSpeed) { + vel2[boidIndex] = glm::normalize(vel2[boidIndex]) * maxSpeed; + } } __global__ void kernUpdateVelNeighborSearchCoherent( @@ -341,6 +593,147 @@ __global__ void kernUpdateVelNeighborSearchCoherent( // - Access each boid in the cell and compute velocity change from // the boids rules, if this boid is within the neighborhood distance. 
// - Clamp the speed change before putting the new speed in vel2 + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + + // Compute the 3D grid indices for this particle + glm::vec3 grid3DIndex = (pos[index] - gridMin) * inverseCellWidth; + + // Find the other 8 neiboring cells based on which octant the particle is in + glm::vec3 distanceToNeighbor = + pos[index] - (glm::vec3((int)grid3DIndex.x, (int)grid3DIndex.y, (int)grid3DIndex.z) * cellWidth + gridMin); + float halfCellWidth = cellWidth / 2.0f; + + int neighborMinIndexX, + neighborMinIndexY, + neighborMinIndexZ, + neighborMaxIndexX, + neighborMaxIndexY, + neighborMaxIndexZ; + + if (distanceToNeighbor.x < halfCellWidth) { + neighborMinIndexX = (int)grid3DIndex.x - 1; + neighborMaxIndexX = (int)grid3DIndex.x; + } + else { + neighborMinIndexX = (int)grid3DIndex.x; + neighborMaxIndexX = (int)grid3DIndex.x + 1; + } + + if (distanceToNeighbor.y < halfCellWidth) { + neighborMinIndexY = (int)grid3DIndex.y - 1; + neighborMaxIndexY = (int)grid3DIndex.y; + } + else { + neighborMinIndexY = (int)grid3DIndex.y; + neighborMaxIndexY = (int)grid3DIndex.y + 1; + } + + if (distanceToNeighbor.z < halfCellWidth) { + neighborMinIndexZ = (int)grid3DIndex.z - 1; + neighborMaxIndexZ = (int)grid3DIndex.z; + } + else { + neighborMinIndexZ = (int)grid3DIndex.z; + neighborMaxIndexZ = (int)grid3DIndex.z + 1; + } + + // These are to keep track of velocity computation + glm::vec3 velCohesion, velSeparation, velAlignment; + glm::vec3 centerOfMass; + unsigned int rule1NeighborCount = 0; + unsigned int rule2NeighborCount = 0; + unsigned int rule3NeighborCount = 0; + + // Loop through neighbor cells and find all boids in them + int neighborIndex, neighborCellStartIndex, neighborCellEndIndex; + for (int z = neighborMinIndexZ; z <= neighborMaxIndexZ; ++z) { + for (int y = neighborMinIndexY; y <= neighborMaxIndexY; ++y) { + for (int x = neighborMinIndexX; x <= neighborMaxIndexX; ++x) { + + // Compute velocity 
contribution from particles in this neighboring cell + neighborIndex = gridIndex3Dto1D(x % gridResolution, y % gridResolution, z % gridResolution, gridResolution); + neighborCellStartIndex = gridCellStartIndices[neighborIndex]; + neighborCellEndIndex = gridCellEndIndices[neighborIndex]; + + if (neighborCellStartIndex >= 0 && neighborCellStartIndex <= neighborCellEndIndex && + neighborCellEndIndex >= neighborCellStartIndex && neighborCellEndIndex < N) { + + + //Compute velocity + for (auto neighborBoidIndex = neighborCellStartIndex; neighborBoidIndex < neighborCellEndIndex; ++neighborBoidIndex) { + + // skip itself + if (neighborBoidIndex == index) continue; + auto distance = glm::distance(pos[index], pos[neighborBoidIndex]); + + // Rule 1, cohesion: boids fly towards their local perceived center of mass, which excludes themselves + if (distance < rule1Distance) { + centerOfMass += pos[neighborBoidIndex]; + ++rule1NeighborCount; + } + + // Rule 2: boids try to stay a distance d away from each other + if (distance < rule2Distance) { + velSeparation -= (pos[neighborBoidIndex] - pos[index]); + ++rule2NeighborCount; + } + + // Rule 3: boids try to match the speed of surrounding boids + if (distance < rule3Distance) { + velAlignment += vel1[neighborBoidIndex]; + ++rule3NeighborCount; + } + } + } + } + } + } + + if (rule1NeighborCount > 0) { + centerOfMass /= rule1NeighborCount; + velCohesion = (centerOfMass - pos[index]) * rule1Scale; + } + + if (rule2NeighborCount > 0) { + velSeparation *= rule2Scale; + } + + if (rule3NeighborCount > 0) { + velAlignment /= rule3NeighborCount; + velAlignment = velAlignment - vel1[index]; + velAlignment *= rule3Scale; + } + + // Record the new velocity into vel2. Question: why NOT vel1? 
+ // Since the neighbor might be inspecting our current vel1 this frame, we don't want to update vel1 yet + vel2[index] = vel1[index] + velCohesion + velSeparation + velAlignment; + + // Clamp the speed + if (glm::length(vel2[index]) >= maxSpeed) { + vel2[index] = glm::normalize(vel2[index]) * maxSpeed; + } +} + +__global__ void kernRearrangeBuffer(int N, int* particleGridIndices, glm::vec3* newBuffer, glm::vec3* oldBuffer) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + int oldIdx = particleGridIndices[index]; + newBuffer[index] = oldBuffer[oldIdx]; +} + +__global__ void kernCopyBack(int N, int* particleGridIndices, glm::vec3* newBuffer, glm::vec3* oldBuffer) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) { + return; + } + + int newIdx = particleGridIndices[index]; + newBuffer[newIdx] = oldBuffer[index]; } /** @@ -348,7 +741,14 @@ __global__ void kernUpdateVelNeighborSearchCoherent( */ void Boids::stepSimulationNaive(float dt) { // TODO-1.2 - use the kernels you wrote to step the simulation forward in time. + static bool isDevVel1Active = true; + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + + kernUpdateVelocityBruteForce << > >(numObjects, dev_pos, isDevVel1Active ? dev_vel1 : dev_vel2, isDevVel1Active ? dev_vel2 : dev_vel1); + kernUpdatePos << > >(numObjects, dt, dev_pos, isDevVel1Active ? 
dev_vel2 : dev_vel1); + // TODO-1.2 ping-pong the velocity buffers + isDevVel1Active = !isDevVel1Active; } void Boids::stepSimulationScatteredGrid(float dt) { @@ -364,6 +764,71 @@ void Boids::stepSimulationScatteredGrid(float dt) { // - Perform velocity updates using neighbor search // - Update positions // - Ping-pong buffers as needed + + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + + // reseet the grid cell start and end indices to an invalid value + kernResetIntBuffer << > >( + numObjects, + dev_gridCellStartIndices, + INVALID_VALUE + ); + + kernResetIntBuffer << > >( + numObjects, + dev_gridCellEndIndices, + INVALID_VALUE + ); + + // Compute the new index + kernComputeIndices << > >( + numObjects, + gridSideCount, + gridMinimum, + gridInverseCellWidth, + dev_pos, + dev_particleArrayIndices, + dev_particleGridIndices + ); + + thrust::sort_by_key( + dev_thrust_particleGridIndices, + dev_thrust_particleGridIndices + numObjects, + dev_thrust_particleArrayIndices); + + kernIdentifyCellStartEnd << > >( + numObjects, + dev_particleGridIndices, + dev_gridCellStartIndices, + dev_gridCellEndIndices + ); + + // Update velocity + static bool isDevVel1Active = true; + kernUpdateVelNeighborSearchScattered << > >( + numObjects, + gridSideCount, + gridMinimum, + gridInverseCellWidth, + gridCellWidth, + dev_gridCellStartIndices, + dev_gridCellEndIndices, + dev_particleArrayIndices, + dev_pos, + isDevVel1Active ? dev_vel1 : dev_vel2, + isDevVel1Active ? dev_vel2 : dev_vel1 + ); + + // Update position + kernUpdatePos << > >( + numObjects, + dt, + dev_pos, + isDevVel1Active ? dev_vel2 : dev_vel1 + ); + + // Ping pong + isDevVel1Active = !isDevVel1Active; } void Boids::stepSimulationCoherentGrid(float dt) { @@ -382,6 +847,83 @@ void Boids::stepSimulationCoherentGrid(float dt) { // - Perform velocity updates using neighbor search // - Update positions // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE. 
+ + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + + // reseet the grid cell start and end indices to an invalid value + kernResetIntBuffer << > >( + numObjects, + dev_gridCellStartIndices, + INVALID_VALUE + ); + + kernResetIntBuffer << > >( + numObjects, + dev_gridCellEndIndices, + INVALID_VALUE + ); + + // Compute the new index + kernComputeIndices << > >( + numObjects, + gridSideCount, + gridMinimum, + gridInverseCellWidth, + dev_pos, + dev_particleArrayIndices, + dev_particleGridIndices + ); + + thrust::sort_by_key( + dev_thrust_particleGridIndices, + dev_thrust_particleGridIndices + numObjects, + dev_thrust_particleArrayIndices); + + kernIdentifyCellStartEnd << > >( + numObjects, + dev_particleGridIndices, + dev_gridCellStartIndices, + dev_gridCellEndIndices + ); + + kernRearrangeBuffer << > >( + numObjects, + dev_particleArrayIndices, + dev_sorted_pos, + dev_pos + ); + + kernRearrangeBuffer << > >( + numObjects, + dev_particleArrayIndices, + dev_vel1, + dev_vel2 + ); + + // Update velocity + kernUpdateVelNeighborSearchCoherent << > >( + numObjects, + gridSideCount, + gridMinimum, + gridInverseCellWidth, + gridCellWidth, + dev_gridCellStartIndices, + dev_gridCellEndIndices, + dev_sorted_pos, + dev_vel1, + dev_vel2 + ); + + // Update position + kernUpdatePos << > >( + numObjects, + dt, + dev_sorted_pos, + dev_vel2 + ); + + // Ping pong + std::swap(dev_pos, dev_sorted_pos); } void Boids::endSimulation() { @@ -390,6 +932,13 @@ void Boids::endSimulation() { cudaFree(dev_pos); // TODO-2.1 TODO-2.3 - Free any additional buffers here. 
+ cudaFree(dev_particleArrayIndices); + cudaFree(dev_particleGridIndices); + cudaFree(dev_gridCellStartIndices); + cudaFree(dev_gridCellEndIndices); + cudaFree(dev_sorted_pos); + cudaFree(dev_sorted_vel1); + cudaFree(dev_sorted_vel2); } void Boids::unitTest() { @@ -432,6 +981,8 @@ void Boids::unitTest() { cudaMemcpy(dev_intKeys, intKeys, sizeof(int) * N, cudaMemcpyHostToDevice); cudaMemcpy(dev_intValues, intValues, sizeof(int) * N, cudaMemcpyHostToDevice); + // ------------ + // Wrap device vectors in thrust iterators for use with thrust. thrust::device_ptr dev_thrust_keys(dev_intKeys); thrust::device_ptr dev_thrust_values(dev_intValues); @@ -449,11 +1000,135 @@ void Boids::unitTest() { std::cout << " value: " << intValues[i] << std::endl; } + // ------------ + float test_cellWidth = 10.0; + float test_sceneScale = 100.0f; + int test_sideCount = (int)(test_sceneScale / test_cellWidth); + int test_halfSideCount = test_sideCount / 2; + int test_gridCellCount = test_sideCount * test_sideCount * test_sideCount; + float test_inverseCellWidth = 1.0f / test_cellWidth; + float test_halfGridWidth = test_cellWidth * test_halfSideCount; + glm::vec3 test_gridMinimum(-test_halfGridWidth, -test_halfGridWidth, -test_halfGridWidth); + + std::cout << "Test cell width: " << test_cellWidth << std::endl; + std::cout << "Test scene scale: " << test_sceneScale << std::endl; + std::cout << "Test half side count: " << test_halfSideCount << std::endl; + std::cout << "Test side count: " << test_sideCount << std::endl; + std::cout << "Test grid cell count: " << test_gridCellCount << std::endl; + std::cout << "Test inverse cell width: " << test_inverseCellWidth << std::endl; + std::cout << "Test half grid width: " << test_halfGridWidth << std::endl; + std::cout << "Test grid minimum: " << test_gridMinimum.x << ", " << test_gridMinimum.y << ", " << test_gridMinimum.z << std::endl; + + int* test_particleArrayIndices = new int[N]; + int* test_particleGridIndices = new int[N]; + glm::vec3* 
test_pos = new glm::vec3[N]; + + glm::vec3* test_dev_pos; + int* test_dev_particleArrayIndices; + int* test_dev_particleGridIndices; + + cudaMalloc((void**)&test_dev_pos, N * sizeof(glm::vec3)); + for (int i = 0; i < N; ++i) { + test_pos[i] = glm::vec3(test_cellWidth * i) + test_gridMinimum; + + } + cudaMemcpy(test_dev_pos, test_pos, sizeof(glm::vec3) * N, cudaMemcpyHostToDevice); + + cudaMalloc((void**)&test_dev_particleArrayIndices, N * sizeof(int)); + cudaMalloc((void**)&test_dev_particleGridIndices, N * sizeof(int)); + + kernComputeIndices << <1, N >> >( + N, + test_sideCount, + test_gridMinimum, + test_inverseCellWidth, + test_dev_pos, + test_dev_particleArrayIndices, + test_dev_particleGridIndices + ); + + cudaMemcpy(test_pos, test_dev_pos, sizeof(glm::vec3) * N, cudaMemcpyDeviceToHost); + cudaMemcpy(test_particleArrayIndices, test_dev_particleArrayIndices, sizeof(int) * N, cudaMemcpyDeviceToHost); + cudaMemcpy(test_particleGridIndices, test_dev_particleGridIndices, sizeof(int) * N, cudaMemcpyDeviceToHost); + + std::cout << "Test kernComputeIndices : " << std::endl; + for (auto i = 0; i < N; i++) { + std::cout << " pos: " << test_pos[i].x << ", " << test_pos[i].y << ", " << test_pos[i].z; + std::cout << " particle array indices: " << test_particleArrayIndices[i]; + std::cout << " particle grid indices: " << test_particleGridIndices[i] << std::endl; + } + + // ------------ + + int *dev_keyStarts; + int *dev_keyEnds; + cudaMalloc((void**)&dev_keyStarts, N * sizeof(int)); + cudaMalloc((void**)&dev_keyEnds, N * sizeof(int)); + + int *intStarts = new int[N]; + int *intEnds = new int[N]; + + kernResetIntBuffer << <1, N >> >(N, dev_keyStarts, INVALID_VALUE); + kernResetIntBuffer << <1, N >> >(N, dev_keyEnds, INVALID_VALUE); + cudaMemcpy(intStarts, dev_keyStarts, sizeof(int) * N, cudaMemcpyDeviceToHost); + cudaMemcpy(intEnds, dev_keyEnds, sizeof(int) * N, cudaMemcpyDeviceToHost); + + std::cout << "Test kernResetIntBuffer : " << std::endl; + for (auto i = 0; i < N; 
i++) { + std::cout << " start: " << intStarts[i]; + std::cout << " end: " << intEnds[i] << std::endl; + } + + // ------------ + + kernIdentifyCellStartEnd << <1, N >> >(N, dev_intKeys, dev_keyStarts, dev_keyEnds); + + cudaMemcpy(intStarts, dev_keyStarts, sizeof(int) * N, cudaMemcpyDeviceToHost); + cudaMemcpy(intEnds, dev_keyEnds, sizeof(int) * N, cudaMemcpyDeviceToHost); + + std::cout << "Test kernIdentifyCellStartEnd : " << std::endl; + for (int i = 0; i < N; i++) { + std::cout << " start: " << intStarts[i]; + std::cout << " end: " << intEnds[i] << std::endl; + } + + // -------------- + glm::vec3* test_sorted_pos = new glm::vec3[N]; + + glm::vec3* test_dev_sorted_pos; + cudaMalloc((void**)&test_dev_sorted_pos, N * sizeof(glm::vec3)); + + std::cout << "Test kernRearrangeBuffer : " << std::endl; + for (int i = 0; i < N; i++) { + std::cout << " pos: " << test_pos[i].x << ", " << test_pos[i].y << ", " << test_pos[i].z; + std::cout << " index: " << intValues[i] << std::endl; + } + kernRearrangeBuffer << <1, N >> >(N, dev_intValues, test_dev_sorted_pos, test_dev_pos); + cudaMemcpy(test_sorted_pos, test_dev_sorted_pos, sizeof(glm::vec3) * N, cudaMemcpyDeviceToHost); + + for (int i = 0; i < N; i++) { + std::cout << " sorted_pos: " << test_sorted_pos[i].x << ", " << test_sorted_pos[i].y << ", " << test_sorted_pos[i].z; + std::cout << " index: " << intValues[i] << std::endl; + } + + // cleanup - delete(intKeys); - delete(intValues); + delete[] intKeys; + delete[] intValues; + delete[] intStarts; + delete[] intEnds; + delete[] test_particleArrayIndices; + delete[] test_particleGridIndices; + delete[] test_pos; + delete[] test_sorted_pos; cudaFree(dev_intKeys); cudaFree(dev_intValues); + cudaFree(dev_keyStarts); + cudaFree(dev_keyEnds); + cudaFree(test_dev_pos); + cudaFree(test_dev_particleArrayIndices); + cudaFree(test_dev_particleGridIndices); + cudaFree(test_dev_sorted_pos); checkCUDAErrorWithLine("cudaFree failed!"); return; } diff --git a/src/main.cpp b/src/main.cpp 
index e416836..f7f36c7 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,18 +14,18 @@ // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID #define VISUALIZE 1 -#define UNIFORM_GRID 0 -#define COHERENT_GRID 0 +#define UNIFORM_GRID 1 +#define COHERENT_GRID 1 // LOOK-1.2 - change this to adjust particle count in the simulation -const int N_FOR_VIS = 5000; +const int N_FOR_VIS = 100000; const float DT = 0.2f; /** * C main function. */ int main(int argc, char* argv[]) { - projectName = "565 CUDA Intro: Boids"; + projectName = "565 CUDA Intro: Boids - Trung Le"; if (init(argc, argv)) { mainLoop(); @@ -47,6 +47,10 @@ GLFWwindow *window; * Initialization of CUDA and GLFW. */ bool init(int argc, char **argv) { + + // Print out our CUDA-capable devices properties + printCudaDeviceProperties(); + // Set window title to "Student Name: [SM 2.0] GPU Name" cudaDeviceProp deviceProp; int gpuDevice = 0; @@ -183,10 +187,67 @@ void initShaders(GLuint * program) { } } +void printCudaDeviceProperties() +{ + int deviceCount = 0; + cudaGetDeviceCount(&deviceCount); + + cudaDeviceProp deviceProp; + for (auto i = 0; i < deviceCount; ++i) { + cudaGetDeviceProperties(&deviceProp, i); + + std::cout << "---- General information for device " << i << " ----" << std::endl; + std::cout << "Device name: " << deviceProp.name << std::endl; + std::cout << "Compute capability: " << deviceProp.major << "." 
<< deviceProp.minor << std::endl; + std::cout << "Compute mode: "; + switch (deviceProp.computeMode) { + case cudaComputeModeDefault: + std::cout << "Default" << std::endl; + break; + case cudaComputeModeExclusive: + std::cout << "Exclusive" << std::endl; + break; + case cudaComputeModeExclusiveProcess: + std::cout << "Exclusive process" << std::endl; + break; + case cudaComputeModeProhibited: + std::cout << "Prohibited" << std::endl; + break; + } + std::cout << "Clock rate: " << deviceProp.clockRate << std::endl; + std::cout << "Integrated: " << deviceProp.integrated << std::endl; + std::cout << "Device copy overlap: "; + if (deviceProp.deviceOverlap) { + std::cout << "Enabled" << std::endl; + } + else { + std::cout << "Disabled" << std::endl; + } + std::cout << "Kernel execution timeout: "; + if (deviceProp.kernelExecTimeoutEnabled) { + std::cout << "Enabled" << std::endl; + } + else { + std::cout << "Disabled" << std::endl; + } + std::cout << "---- Memory information for device " << i << " ----" << std::endl; + std::cout << "Total global memory: " << deviceProp.totalGlobalMem << std::endl; + std::cout << "Total constant memory: " << deviceProp.totalConstMem << std::endl; + std::cout << "Multiprocessor count: " << deviceProp.multiProcessorCount << std::endl; + std::cout << "Shared memory per multiprocessor: " << deviceProp.sharedMemPerMultiprocessor << std::endl; + std::cout << "Registers per multiprocessor: " << deviceProp.regsPerMultiprocessor << std::endl; + std::cout << "Max threads per multiprocessor: " << deviceProp.maxThreadsPerMultiProcessor << std::endl; + std::cout << "Max grid dimensions: [" << deviceProp.maxGridSize[0] << ", " << deviceProp.maxGridSize[1] << ", " << deviceProp.maxGridSize[2] << "]" << std::endl; + std::cout << "Max threads per block: " << deviceProp.maxThreadsPerBlock << std::endl; + std::cout << "Max registers per block: " << deviceProp.regsPerBlock << std::endl; + std::cout << "Max thread dimensions: [" << 
deviceProp.maxThreadsDim[0] << ", " << deviceProp.maxThreadsDim[1] << ", " << deviceProp.maxThreadsDim[2] << "]" << std::endl; + } +} + //==================================== // Main loop //==================================== - void runCUDA() { + float runCUDA() { // Map OpenGL buffer object for writing from CUDA on a single GPU // No data is moved (Win & Linux). When mapped to CUDA, OpenGL should not // use this buffer @@ -198,7 +259,12 @@ void initShaders(GLuint * program) { cudaGLMapBufferObject((void**)&dptrVertPositions, boidVBO_positions); cudaGLMapBufferObject((void**)&dptrVertVelocities, boidVBO_velocities); + // Record cuda simulation + cudaEvent_t start, stop; + cudaEventCreate(&start); + cudaEventCreate(&stop); // execute the kernel + cudaEventRecord(start); #if UNIFORM_GRID && COHERENT_GRID Boids::stepSimulationCoherentGrid(DT); #elif UNIFORM_GRID @@ -206,6 +272,11 @@ void initShaders(GLuint * program) { #else Boids::stepSimulationNaive(DT); #endif + cudaEventRecord(stop); + + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); #if VISUALIZE Boids::copyBoidsToVBO(dptrVertPositions, dptrVertVelocities); @@ -213,6 +284,8 @@ void initShaders(GLuint * program) { // unmap buffer object cudaGLUnmapBufferObject(boidVBO_positions); cudaGLUnmapBufferObject(boidVBO_velocities); + + return milliseconds; } void mainLoop() { @@ -235,13 +308,14 @@ void initShaders(GLuint * program) { frame = 0; } - runCUDA(); + float simulationTimeMs = runCUDA(); std::ostringstream ss; ss << "["; ss.precision(1); ss << std::fixed << fps; - ss << " fps] " << deviceName; + ss << " fps] " << "[" << simulationTimeMs << " ms/frame]"; + ss << deviceName; glfwSetWindowTitle(window, ss.str().c_str()); glClear(GL_COLOR_BUFFER_BIT | GL_DEPTH_BUFFER_BIT); diff --git a/src/main.hpp b/src/main.hpp index 6cdaa93..9c2110b 100644 --- a/src/main.hpp +++ b/src/main.hpp @@ -70,7 +70,7 @@ void keyCallback(GLFWwindow* window, int key, int scancode, 
int action, int mods void mouseButtonCallback(GLFWwindow* window, int button, int action, int mods); void mousePositionCallback(GLFWwindow* window, double xpos, double ypos); void updateCamera(); -void runCUDA(); +float runCUDA(); //==================================== // Setup/init Stuff @@ -78,3 +78,4 @@ void runCUDA(); bool init(int argc, char **argv); void initVAO(); void initShaders(GLuint *program); +void printCudaDeviceProperties();