diff --git a/README.md b/README.md
index 98dd9a8..2bec8a4 100644
--- a/README.md
+++ b/README.md
@@ -1,10 +1,64 @@
-**University of Pennsylvania, CIS 565: GPU Programming and Architecture,
+# **University of Pennsylvania, CIS 565: GPU Programming and Architecture,
 Project 1 - Flocking**
 
-* (TODO) YOUR NAME HERE
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Ruoyu Fan
+* Tested on:  Windows 7, Xeon(R) E5-1630 @ 3.70GHz 32GB, GTX 1070 8192MB (Moore 103 SigLab)
 
-### (TODO: Your README)
+### Description - CUDA Flocking
+
+I implemented flocking simulation (Boids) using three different neighbor searching approaches:
+ * **Brute-force**
+ * **Scattered grid**
+ * **Coherent grid**
+
+ ![simulation preview](/screenshots/flocking.gif)
+
+#### Q&A
+1. For each implementation, how does changing the number of boids affect performance? Why do you think this is?
+
+	**Ans:** _In brute-force method, the framerate drops significantly faster than the other two methods. The framerate drops at square rate when the number of boids (threads) reaches the concurrency limit of GPU._
+
+    _In the other two optimized methods, as the number of boids increase, the framerate drops much slower than the naive method._
+
+	
+2. For each implementation, how does changing the block count and block size affect performance? Why do you think this is?
+
+	**Ans:** _If the block is very small, the performance will drop very fast. In my opinion the first reason is that if the block size is too small (maybe less than warp size), some of the processing units are wasted. Increasing block size can improve performance, but the effect will be less obvious when the block is large enough - and it is limited by hardware capacity. Large block size can also cause some threads to be wasted._
+
+	
+3. For the coherent uniform grid: did you experience any performance improvements with the more coherent uniform grid? Was this the outcome you expected? Why or why not?
+
+	**Ans:** _Yes. It is. because coherent approach follows the principle of "keeping things you will use close" and results in less pointer jumping and less page access. So it is faster when using a uniform grid and access the memory continuously._
+
+
+#### Performance
+
+On grid size of 128 and visual disabled, other settings as source code. 
+
+| Particle Count|Naive FPS|Scattered Grid FPS|Coherent Grid FPS
+| ------------- |--------:|-----------------:|----------------:|
+| 5000|591.5||
+| 10000|242.1||
+| 20000|52.5|1103.8|1069.1
+| 40000|14.5|1072.0|1082.4
+| 80k|4.2|662.1|761.8
+| 160k||344.7|468.9
+| 320k||121.5|191.3
+| 640k||37.2|63.5
+| 1280k||10.9|17.9
+
+
+On 160000 particles and visual disabled, other settings as source code.
+
+| Grid Size|Scattered Grid FPS|Coherent Grid FPS
+| --------------|-----------------:|----------------:|
+| 4|115.5|154.4
+| 8|185.5|245.7
+| 16|268.8|259.9
+| 32|351.2|465.8
+| 64|349.2|463.3
+| 128|347.8|472.3
+| 256|348.1|464.4
+| 512|347.9|465.7
+| 1024|343.5|467.3
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
diff --git a/screenshots/flocking.gif b/screenshots/flocking.gif
new file mode 100644
index 0000000..dec7b37
Binary files /dev/null and b/screenshots/flocking.gif differ
diff --git a/screenshots/performance.xlsx b/screenshots/performance.xlsx
new file mode 100644
index 0000000..323b6b0
Binary files /dev/null and b/screenshots/performance.xlsx differ
diff --git a/src/kernel.cu b/src/kernel.cu
index aaf0fbf..044fcdf 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -3,6 +3,7 @@
 #include <cuda.h>
 #include <cmath>
 #include <glm/glm.hpp>
+
 #include "utilityCore.hpp"
 #include "kernel.h"
 
@@ -83,8 +84,10 @@ thrust::device_ptr<int> dev_thrust_particleGridIndices;
 int *dev_gridCellStartIndices; // What part of dev_particleArrayIndices belongs
 int *dev_gridCellEndIndices;   // to this cell?
 
-// TODO-2.3 - consider what additional buffers you might need to reshuffle
+// DONE-2.3 - consider what additional buffers you might need to reshuffle
 // the position and velocity data to be coherent within cells.
+glm::vec3* dev_pos_shuffler;
+glm::vec3* dev_vel_shuffler;
 
 // LOOK-2.1 - Grid parameters based on simulation parameters.
 // These are automatically computed for you in Boids::initSimulation
@@ -168,7 +171,25 @@ void Boids::initSimulation(int N) {
   gridMinimum.y -= halfGridWidth;
   gridMinimum.z -= halfGridWidth;
 
-  // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
+  // DONE-2.1 DONE-2.3 - Allocate additional buffers here.
+  cudaMalloc((void **)&dev_particleArrayIndices, numObjects * sizeof(*dev_particleArrayIndices));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!");
+  cudaMalloc((void **)&dev_particleGridIndices, numObjects * sizeof(*dev_particleGridIndices));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!");
+
+  dev_thrust_particleArrayIndices = thrust::device_ptr<int>(dev_particleArrayIndices);
+  dev_thrust_particleGridIndices = thrust::device_ptr<int>(dev_particleGridIndices);
+  
+  cudaMalloc((void **)&dev_gridCellStartIndices, gridCellCount * sizeof(*dev_gridCellStartIndices));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!");
+  cudaMalloc((void **)&dev_gridCellEndIndices, gridCellCount * sizeof(*dev_gridCellEndIndices));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!");
+
+  cudaMalloc((void**)&dev_pos_shuffler, numObjects * sizeof(*dev_pos_shuffler));
+  checkCUDAErrorWithLine("cudaMalloc dev_pos_shuffler failed!");
+  cudaMalloc((void**)&dev_vel_shuffler, numObjects * sizeof(*dev_vel_shuffler));
+  checkCUDAErrorWithLine("cudaMalloc dev_vel_shuffler failed!");
+
   cudaThreadSynchronize();
 }
 
@@ -230,21 +251,88 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities)
 * in the `pos` and `vel` arrays.
 */
 __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) {
-  // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
-  // Rule 2: boids try to stay a distance d away from each other
-  // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+    glm::vec3 delta_vel(0.0f);
+
+    glm::vec3 rule1_neighbor_pos_sum(0.0f);
+    float rule1_neighbor_count = 0.0f;
+    glm::vec3 rule2_total_offset(0.0f);
+    glm::vec3 rule3_neighbor_vel_sum(0.0f);
+    float rule3_neighbor_count = 0.0f;
+
+    float current_distance;
+    glm::vec3 current_pos_offset;
+    for (int i = 0; i < N; i++)
+    {
+        if (i == iSelf){ continue; }
+
+        current_pos_offset = pos[i] - pos[iSelf];
+        current_distance = glm::length(current_pos_offset);
+
+        // Rule 1: Get neighbor position sum and neighbor count for rule1
+        if (current_distance < rule1Distance)
+        {
+            rule1_neighbor_pos_sum += pos[i];
+            rule1_neighbor_count += 1.0f;
+        }
+        // Rule 2: Calculate offset for rule 2
+        if (current_distance < rule2Distance)
+        {
+            rule2_total_offset -= current_pos_offset;
+        }
+        // Rule 3: Get velocity sum and neighbor count for rule 3
+        if (current_distance < rule3Distance)
+        {
+            rule3_neighbor_vel_sum += vel[i];
+            rule3_neighbor_count += 1.0f;
+        }
+    }
+
+    // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+    if (rule1_neighbor_count > 0.0f)
+    {
+        delta_vel += rule1Scale * ((rule1_neighbor_pos_sum / rule1_neighbor_count) - pos[iSelf]);
+    }
+
+    // Rule 2: boids try to stay a distance d away from each other
+    delta_vel += rule2Scale *  rule2_total_offset;
+
+    // Rule 3: boids try to match the speed of surrounding boids
+    if (rule3_neighbor_count > 0.0f)
+    {
+        delta_vel += rule3Scale * ((rule3_neighbor_vel_sum / rule3_neighbor_count) - vel[iSelf]);
+    }
+
+    return delta_vel;
 }
 
 /**
-* TODO-1.2 implement basic flocking
+* DONE-1.2 implement basic flocking
 * For each of the `N` bodies, update its position based on its current velocity.
 */
 __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
-  glm::vec3 *vel1, glm::vec3 *vel2) {
-  // Compute a new velocity based on pos and vel1
-  // Clamp the speed
-  // Record the new velocity into vel2. Question: why NOT vel1?
+  glm::vec3 *vel1, glm::vec3 *vel2)
+{
+    // static const float max_sqr_speed = maxSpeed * maxSpeed;
+
+    auto index = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (index >= N)
+    {
+        return;
+    }
+
+    // Compute a new velocity based on pos and vel1
+    glm::vec3 new_vel = vel1[index] + computeVelocityChange(N, index, pos, vel1);
+
+    // Clamp the speed
+    auto speed = glm::length(new_vel);
+    if (speed > maxSpeed)
+    {
+        new_vel = new_vel * (maxSpeed / speed);
+    }
+
+    // Record the new velocity into vel2. Question: why NOT vel1?
+    //   Ans: neighbors need vel from last frame to update their velocity
+    vel2[index] = new_vel;
 }
 
 /**
@@ -278,17 +366,64 @@ __global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) {
 //          for(x)
 //            for(y)
 //             for(z)? Or some other order?
+//   Ans: I think it is
+//          for(z)
+//            for(y)
+//              for(x)
 __device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) {
-  return x + y * gridResolution + z * gridResolution * gridResolution;
+    return x + y * gridResolution + z * gridResolution * gridResolution;
+}
+
+// This function returns -1 if any dimension falls out of boundry
+__device__ int gridIndex3Dto1DWithCheck(int x, int y, int z, int gridResolution)
+{
+    if (x < 0 || x >= gridResolution
+        || y < 0 || y >= gridResolution
+        || z < 0 || z >= gridResolution)
+    {
+        return -1;
+    }
+    else
+    {
+        return gridIndex3Dto1D(x, y, z, gridResolution);
+    }
+}
+
+__device__ glm::vec3 gridIndex3DFromPosition(const glm::vec3& pos
+    , const glm::vec3& grid_min, float inverse_cell_width)
+{
+    return (pos - grid_min) * inverse_cell_width;
+}
+
+__device__ int gridIndexFromPosition(const glm::vec3& pos, const glm::vec3& grid_min
+    , float inverse_cell_width, int grid_resolution)
+{
+    auto gridindex_3d = gridIndex3DFromPosition(pos, grid_min, inverse_cell_width);
+    auto gridindex = gridIndex3Dto1D(static_cast<int>(gridindex_3d.x)
+        , static_cast<int>(gridindex_3d.y)
+        , static_cast<int>(gridindex_3d.z)
+        , grid_resolution);
+	return gridindex;
 }
 
 __global__ void kernComputeIndices(int N, int gridResolution,
   glm::vec3 gridMin, float inverseCellWidth,
-  glm::vec3 *pos, int *indices, int *gridIndices) {
-    // TODO-2.1
+  glm::vec3 *pos, int *indices, int *gridIndices)
+{
+    // DONE-2.1
     // - Label each boid with the index of its grid cell.
     // - Set up a parallel array of integer indices as pointers to the actual
     //   boid data in pos and vel1/vel2
+    auto index = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (index >= N)
+    {
+        return;
+    }
+
+    indices[index] = index;
+    gridIndices[index] = gridIndexFromPosition(pos[index], gridMin
+        , inverseCellWidth, gridResolution);
+
 }
 
 // LOOK-2.1 Consider how this could be useful for indicating that a cell
@@ -302,94 +437,439 @@ __global__ void kernResetIntBuffer(int N, int *intBuffer, int value) {
 
 __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
   int *gridCellStartIndices, int *gridCellEndIndices) {
-  // TODO-2.1
-  // Identify the start point of each cell in the gridIndices array.
-  // This is basically a parallel unrolling of a loop that goes
-  // "this index doesn't match the one before it, must be a new cell!"
+    // DONE-2.1
+    // Identify the start point of each cell in the gridIndices array.
+    // This is basically a parallel unrolling of a loop that goes
+    // "this index doesn't match the one before it, must be a new cell!"
+
+    // index is sorted index, not boid index!
+    auto index = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (index >= N)
+    {
+        return;
+    }
+
+    auto prev_index = (index - 1 + N) % N;  // To minimize branches
+    if (particleGridIndices[index] != particleGridIndices[prev_index])
+    {
+        gridCellStartIndices[particleGridIndices[index]] = index;
+        gridCellEndIndices[particleGridIndices[prev_index]] = prev_index + 1;
+          // intended to make the end side open like most C++ ranges
+    }
+}
+
+// Helper function to determine direction (1 or -1) of the neighbor
+// grid cell in one dimension
+__device__ int directionOfNeighborSingleDimension(float axis_index)
+{
+    if (static_cast<int>(axis_index + 0.5f) > static_cast<int>(axis_index))
+    {
+        return 1;
+    }
+    else
+    {
+        return -1;
+    }
 }
 
 __global__ void kernUpdateVelNeighborSearchScattered(
-  int N, int gridResolution, glm::vec3 gridMin,
-  float inverseCellWidth, float cellWidth,
-  int *gridCellStartIndices, int *gridCellEndIndices,
-  int *particleArrayIndices,
-  glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
-  // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
-  // the number of boids that need to be checked.
-  // - Identify the grid cell that this particle is in
-  // - Identify which cells may contain neighbors. This isn't always 8.
-  // - For each cell, read the start/end indices in the boid pointer array.
-  // - Access each boid in the cell and compute velocity change from
-  //   the boids rules, if this boid is within the neighborhood distance.
-  // - Clamp the speed change before putting the new speed in vel2
+    int N, int gridResolution, glm::vec3 gridMin
+    , float inverseCellWidth, float cellWidth
+    , int *gridCellStartIndices, int *gridCellEndIndices
+    , int *particleArrayIndices, glm::vec3 *pos
+	, glm::vec3 *vel1, glm::vec3 *vel2)
+{
+    // DONE-2.1 - Update a boid's velocity using the uniform grid to reduce
+    // the number of boids that need to be checked.
+    auto index = threadIdx.x + (blockIdx.x * blockDim.x);
+    if (index >= N)
+    {
+        return;
+    }
+
+    // - Identify the grid cell that this particle is in
+    auto boid_index = particleArrayIndices[index];
+
+	auto gridcell_index3d = gridIndex3DFromPosition(pos[boid_index], gridMin
+		 , inverseCellWidth);
+
+    // - Identify which cells may contain neighbors. This isn't always 8.
+    auto sign_x = directionOfNeighborSingleDimension(gridcell_index3d.x);
+    auto sign_y = directionOfNeighborSingleDimension(gridcell_index3d.y);
+    auto sign_z = directionOfNeighborSingleDimension(gridcell_index3d.z);
+    auto index_x = static_cast<int>(gridcell_index3d.x);
+    auto index_y = static_cast<int>(gridcell_index3d.y);
+    auto index_z = static_cast<int>(gridcell_index3d.z);
+
+    int possible_neighbor_indices[8];
+    possible_neighbor_indices[0] = gridIndex3Dto1DWithCheck(index_x, index_y, index_z
+        , gridResolution);
+    possible_neighbor_indices[1] = gridIndex3Dto1DWithCheck(index_x + sign_x, index_y, index_z
+        , gridResolution);
+    possible_neighbor_indices[2] = gridIndex3Dto1DWithCheck(index_x, index_y + sign_y, index_z
+        , gridResolution);
+    possible_neighbor_indices[3] = gridIndex3Dto1DWithCheck(index_x, index_y, index_z + sign_z
+        , gridResolution);
+    possible_neighbor_indices[4] = gridIndex3Dto1DWithCheck(index_x + sign_x, index_y + sign_y, index_z
+        , gridResolution);
+    possible_neighbor_indices[5] = gridIndex3Dto1DWithCheck(index_x + sign_x, index_y, index_z + sign_z
+        , gridResolution);
+    possible_neighbor_indices[6] = gridIndex3Dto1DWithCheck(index_x, index_y + sign_y, index_z + sign_z
+        , gridResolution);
+    possible_neighbor_indices[7] = gridIndex3Dto1DWithCheck(index_x + sign_x, index_y + sign_y, index_z + sign_z
+        , gridResolution);
+
+
+    // - For each cell, read the start/end indices in the boid pointer array.
+    // - Access each boid in the cell and compute velocity change from
+    //   the boids rules, if this boid is within the neighborhood distance.
+
+    glm::vec3 delta_vel(0.0f);
+
+    glm::vec3 rule1_neighbor_pos_sum(0.0f);
+    float rule1_neighbor_count = 0.0f;
+    glm::vec3 rule2_total_offset(0.0f);
+    glm::vec3 rule3_neighbor_vel_sum(0.0f);
+    float rule3_neighbor_count = 0.0f;
+
+    float current_distance;
+    glm::vec3 current_pos_offset;
+    int iother;
+    for (const auto neighbor_cell_index : possible_neighbor_indices)
+    {
+        if (neighbor_cell_index == -1)
+        {
+            continue;
+        }
+
+        for (int i = gridCellStartIndices[neighbor_cell_index]
+            ; i < gridCellEndIndices[neighbor_cell_index]
+            ; i++)
+        {
+            iother = particleArrayIndices[i];
+            if (iother == boid_index){ continue; }
+
+            current_pos_offset = pos[iother] - pos[boid_index];
+            current_distance = glm::length(current_pos_offset);
+
+            // Rule 1: Get neighbor position sum and neighbor count for rule1
+            if (current_distance < rule1Distance)
+            {
+                rule1_neighbor_pos_sum += pos[iother];
+                rule1_neighbor_count += 1.0f;
+            }
+            // Rule 2: Calculate offset for rule 2
+            if (current_distance < rule2Distance)
+            {
+                rule2_total_offset -= current_pos_offset;
+            }
+            // Rule 3: Get velocity sum and neighbor count for rule 3
+            if (current_distance < rule3Distance)
+            {
+                rule3_neighbor_vel_sum += vel1[iother];
+                rule3_neighbor_count += 1.0f;
+            }
+        }
+    }
+
+    // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+    if (rule1_neighbor_count > 0.0f)
+    {
+        delta_vel += rule1Scale * ((rule1_neighbor_pos_sum / rule1_neighbor_count) - pos[boid_index]);
+    }
+
+    // Rule 2: boids try to stay a distance d away from each other
+    delta_vel += rule2Scale *  rule2_total_offset;
+
+    // Rule 3: boids try to match the speed of surrounding boids
+    if (rule3_neighbor_count > 0.0f)
+    {
+        delta_vel += rule3Scale * ((rule3_neighbor_vel_sum / rule3_neighbor_count) - vel1[boid_index]);
+    }
+
+    auto new_vel = vel1[boid_index] + delta_vel;
+
+    // - Clamp the speed change before putting the new speed in vel2
+    auto speed = glm::length(new_vel);
+    if (speed > maxSpeed)
+    {
+        new_vel = new_vel * (maxSpeed / speed);
+    }
+    vel2[boid_index] = new_vel;
+
+}
+
+__global__ void kernReshuffle(int N, glm::vec3* pos
+	, glm::vec3* vel, glm::vec3* pos_shuffler, glm::vec3* vel_shuffler
+	, int* particleArrayIndices)
+{
+	// Reshuffle pos and vel using particleArrayIndices 
+	// , store in pos_shuffler and vel_shuffler
+	auto index = threadIdx.x + (blockIdx.x * blockDim.x);
+	if (index >= N)
+	{
+		return;
+	}
+
+	pos_shuffler[index] = pos[particleArrayIndices[index]];
+	vel_shuffler[index] = vel[particleArrayIndices[index]];
 }
 
 __global__ void kernUpdateVelNeighborSearchCoherent(
-  int N, int gridResolution, glm::vec3 gridMin,
-  float inverseCellWidth, float cellWidth,
-  int *gridCellStartIndices, int *gridCellEndIndices,
-  glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
-  // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered,
-  // except with one less level of indirection.
-  // This should expect gridCellStartIndices and gridCellEndIndices to refer
-  // directly to pos and vel1.
-  // - Identify the grid cell that this particle is in
-  // - Identify which cells may contain neighbors. This isn't always 8.
-  // - For each cell, read the start/end indices in the boid pointer array.
-  //   DIFFERENCE: For best results, consider what order the cells should be
-  //   checked in to maximize the memory benefits of reordering the boids data.
-  // - Access each boid in the cell and compute velocity change from
-  //   the boids rules, if this boid is within the neighborhood distance.
-  // - Clamp the speed change before putting the new speed in vel2
+	int N, int gridResolution, glm::vec3 gridMin,
+	float inverseCellWidth, float cellWidth,
+	int *gridCellStartIndices, int *gridCellEndIndices,
+	glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
+	// DONE-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered,
+	// except with one less level of indirection.
+	// This should expect gridCellStartIndices and gridCellEndIndices to refer
+	// directly to pos and vel1.
+
+	auto index = threadIdx.x + (blockIdx.x * blockDim.x);
+	if (index >= N)
+	{
+		return;
+	}
+
+	// - Identify the grid cell that this particle is in
+	auto gridcell_index3d = gridIndex3DFromPosition(pos[index], gridMin
+		, inverseCellWidth);
+
+	// - Identify which cells may contain neighbors. This isn't always 8.
+	auto sign_x = directionOfNeighborSingleDimension(gridcell_index3d.x);
+	auto sign_y = directionOfNeighborSingleDimension(gridcell_index3d.y);
+	auto sign_z = directionOfNeighborSingleDimension(gridcell_index3d.z);
+	auto index_x = static_cast<int>(gridcell_index3d.x);
+	auto index_y = static_cast<int>(gridcell_index3d.y);
+	auto index_z = static_cast<int>(gridcell_index3d.z);
+
+
+	int possible_neighbor_indices[8]; 
+	// CHANGE: order of this array changed from 2.1 to match the best for(z): for(y): for(x) order
+	int index_z_smaller = imin(index_z, index_z + sign_z);
+	int index_z_bigger = imax(index_z, index_z + sign_z);
+	int index_y_smaller = imin(index_y, index_y + sign_y);
+	int index_y_bigger = imax(index_y, index_y + sign_y);
+	int index_x_smaller = imin(index_x, index_x + sign_x);
+	int index_x_bigger = imax(index_x, index_x + sign_x);
+	possible_neighbor_indices[0] = gridIndex3Dto1DWithCheck(index_x_smaller, index_y_smaller, index_z_smaller
+		, gridResolution);
+	possible_neighbor_indices[1] = gridIndex3Dto1DWithCheck(index_x_bigger, index_y_smaller, index_z_smaller
+		, gridResolution);
+	possible_neighbor_indices[2] = gridIndex3Dto1DWithCheck(index_x_smaller, index_y_bigger, index_z_smaller
+		, gridResolution);
+	possible_neighbor_indices[3] = gridIndex3Dto1DWithCheck(index_x_bigger, index_y_bigger, index_z_smaller
+		, gridResolution);
+	possible_neighbor_indices[4] = gridIndex3Dto1DWithCheck(index_x_smaller, index_y_smaller, index_z_bigger
+		, gridResolution);
+	possible_neighbor_indices[5] = gridIndex3Dto1DWithCheck(index_x_bigger, index_y_smaller, index_z_bigger
+		, gridResolution);
+	possible_neighbor_indices[6] = gridIndex3Dto1DWithCheck(index_x_smaller, index_y_bigger, index_z_bigger
+		, gridResolution);
+	possible_neighbor_indices[7] = gridIndex3Dto1DWithCheck(index_x_bigger, index_y_bigger, index_z_bigger
+		, gridResolution);
+
+	// - For each cell, read the start/end indices in the boid pointer array.
+	//   DIFFERENCE: For best results, consider what order the cells should be
+	//   checked in to maximize the memory benefits of reordering the boids data.
+	//   
+	//   CHANGE: order of the array (possible_neighbor_indices) above already changed 
+	//     so the order of the cells checked become
+	//     for(z): for(y): for(x)
+	glm::vec3 delta_vel(0.0f);
+
+	glm::vec3 rule1_neighbor_pos_sum(0.0f);
+	float rule1_neighbor_count = 0.0f;
+	glm::vec3 rule2_total_offset(0.0f);
+	glm::vec3 rule3_neighbor_vel_sum(0.0f);
+	float rule3_neighbor_count = 0.0f;
+
+	float current_distance;
+	glm::vec3 current_pos_offset;
+	for (const auto neighbor_cell_index : possible_neighbor_indices)
+	{
+		if (neighbor_cell_index == -1)
+		{
+			continue;
+		}
+
+		// - Access each boid in the cell and compute velocity change from
+		//   the boids rules, if this boid is within the neighborhood distance.
+		for (int iother = gridCellStartIndices[neighbor_cell_index]
+			; iother < gridCellEndIndices[neighbor_cell_index]
+			; iother++)
+		{
+			if (iother == index){ continue; }
+
+			current_pos_offset = pos[iother] - pos[index];
+			current_distance = glm::length(current_pos_offset);
+
+			// Rule 1: Get neighbor position sum and neighbor count for rule1
+			if (current_distance < rule1Distance)
+			{
+				rule1_neighbor_pos_sum += pos[iother];
+				rule1_neighbor_count += 1.0f;
+			}
+			// Rule 2: Calculate offset for rule 2
+			if (current_distance < rule2Distance)
+			{
+				rule2_total_offset -= current_pos_offset;
+			}
+			// Rule 3: Get velocity sum and neighbor count for rule 3
+			if (current_distance < rule3Distance)
+			{
+				rule3_neighbor_vel_sum += vel1[iother];
+				rule3_neighbor_count += 1.0f;
+			}
+		}
+	}
+
+	// Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
+	if (rule1_neighbor_count > 0.0f)
+	{
+		delta_vel += rule1Scale * ((rule1_neighbor_pos_sum / rule1_neighbor_count) - pos[index]);
+	}
+
+	// Rule 2: boids try to stay a distance d away from each other
+	delta_vel += rule2Scale *  rule2_total_offset;
+
+	// Rule 3: boids try to match the speed of surrounding boids
+	if (rule3_neighbor_count > 0.0f)
+	{
+		delta_vel += rule3Scale * ((rule3_neighbor_vel_sum / rule3_neighbor_count) - vel1[index]);
+	}
+
+	auto new_vel = vel1[index] + delta_vel;
+
+	// - Clamp the speed change before putting the new speed in vel2
+	auto speed = glm::length(new_vel);
+	if (speed > maxSpeed)
+	{
+		new_vel = new_vel * (maxSpeed / speed);
+	}
+	vel2[index] = new_vel;
 }
 
 /**
 * Step the entire N-body simulation by `dt` seconds.
 */
-void Boids::stepSimulationNaive(float dt) {
-  // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
-  // TODO-1.2 ping-pong the velocity buffers
+void Boids::stepSimulationNaive(float dt)
+{
+
+    // DONE-1.2 - use the kernels you wrote to step the simulation forward in time.
+    dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+    kernUpdateVelocityBruteForce<<<fullBlocksPerGrid, blockSize >>>(numObjects, dev_pos, dev_vel1, dev_vel2);
+    kernUpdatePos <<<fullBlocksPerGrid, blockSize >>>(numObjects, dt, dev_pos, dev_vel1);
+
+    // DONE-1.2 ping-pong the velocity buffers
+    // swap pointers so current velocity become new velocity
+    std::swap(dev_vel1, dev_vel2);
 }
 
-void Boids::stepSimulationScatteredGrid(float dt) {
-  // TODO-2.1
-  // Uniform Grid Neighbor search using Thrust sort.
-  // In Parallel:
-  // - label each particle with its array index as well as its grid index.
-  //   Use 2x width grids.
-  // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
-  //   are welcome to do a performance comparison.
-  // - Naively unroll the loop for finding the start and end indices of each
-  //   cell's data pointers in the array of boid indices
-  // - Perform velocity updates using neighbor search
-  // - Update positions
-  // - Ping-pong buffers as needed
+void Boids::stepSimulationScatteredGrid(float dt)
+{
+    dim3 fullBlocksPerGridForBoids((numObjects + blockSize - 1) / blockSize);
+    dim3 fullBlocksPerGridForCells((gridCellCount + blockSize - 1) / blockSize);
+    // DONE-2.1
+    // Uniform Grid Neighbor search using Thrust sort.
+    // In Parallel:
+
+    // - label each particle with its array index as well as its grid index.
+    //   Use 2x width grids.
+	kernComputeIndices << <fullBlocksPerGridForBoids, blockSize >> >(numObjects, gridSideCount
+        , gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+    
+    // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
+    //   are welcome to do a performance comparison.
+    thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects
+        , dev_thrust_particleArrayIndices);
+
+    //   reset start and end buffer since not all values are set during kernIdentifyCellStartEnd
+    kernResetIntBuffer <<<fullBlocksPerGridForCells, blockSize >>>(gridCellCount
+        , dev_gridCellStartIndices, -1);
+    kernResetIntBuffer <<<fullBlocksPerGridForCells, blockSize >>>(gridCellCount
+        , dev_gridCellEndIndices, -1);
+
+    // - Naively unroll the loop for finding the start and end indices of each
+    //   cell's data pointers in the array of boid indices
+    kernIdentifyCellStartEnd <<<fullBlocksPerGridForBoids, blockSize >>>(numObjects
+        , dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+
+    // - Perform velocity updates using neighbor search
+    kernUpdateVelNeighborSearchScattered <<<fullBlocksPerGridForBoids, blockSize >>>(numObjects
+        , gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices
+        , dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2);
+
+    // - Update positions
+    kernUpdatePos <<<fullBlocksPerGridForBoids, blockSize >>>(numObjects, dt, dev_pos, dev_vel1);
+
+    // - Ping-pong buffers as needed
+    std::swap(dev_vel1, dev_vel2);
 }
 
+
+
 void Boids::stepSimulationCoherentGrid(float dt) {
-  // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid
-  // Uniform Grid Neighbor search using Thrust sort on cell-coherent data.
-  // In Parallel:
-  // - Label each particle with its array index as well as its grid index.
-  //   Use 2x width grids
-  // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
-  //   are welcome to do a performance comparison.
-  // - Naively unroll the loop for finding the start and end indices of each
-  //   cell's data pointers in the array of boid indices
-  // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all
-  //   the particle data in the simulation array.
-  //   CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED
-  // - Perform velocity updates using neighbor search
-  // - Update positions
-  // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+	dim3 fullBlocksPerGridForBoids((numObjects + blockSize - 1) / blockSize);
+	dim3 fullBlocksPerGridForCells((gridCellCount + blockSize - 1) / blockSize);
+	// DONE-2.3 - start by copying Boids::stepSimulationNaiveGrid
+	// Uniform Grid Neighbor search using Thrust sort on cell-coherent data.
+	// In Parallel:
+
+	// - Label each particle with its array index as well as its grid index.
+	//   Use 2x width grids
+	kernComputeIndices <<<fullBlocksPerGridForBoids, blockSize >>>(numObjects, gridSideCount
+		, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+
+	// - Unstable key sort using Thrust. A stable sort isn't necessary, but you
+	//   are welcome to do a performance comparison.
+	thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects
+		, dev_thrust_particleArrayIndices);
+
+	//   reset start and end buffer since not all values are set during kernIdentifyCellStartEnd
+	kernResetIntBuffer <<<fullBlocksPerGridForCells, blockSize >>>(gridCellCount
+		, dev_gridCellStartIndices, -1);
+	kernResetIntBuffer <<<fullBlocksPerGridForCells, blockSize >>>(gridCellCount
+		, dev_gridCellEndIndices, -1);
+
+	// - Naively unroll the loop for finding the start and end indices of each
+	//   cell's data pointers in the array of boid indices
+	kernIdentifyCellStartEnd <<<fullBlocksPerGridForBoids, blockSize >>>(numObjects
+		, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+
+	// - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all
+	//   the particle data in the simulation array.
+	//   CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED
+	kernReshuffle <<<fullBlocksPerGridForBoids, blockSize >>>(numObjects, dev_pos
+		, dev_vel1, dev_pos_shuffler, dev_vel_shuffler, dev_particleArrayIndices);
+	std::swap(dev_pos, dev_pos_shuffler);
+	std::swap(dev_vel1, dev_vel_shuffler);
+
+	// - Perform velocity updates using neighbor search
+	kernUpdateVelNeighborSearchCoherent <<<fullBlocksPerGridForBoids, blockSize >>>(numObjects
+		, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices
+		, dev_gridCellEndIndices, dev_pos, dev_vel1, dev_vel2);
+
+	// - Update positions
+	kernUpdatePos <<<fullBlocksPerGridForBoids, blockSize >>>(numObjects, dt, dev_pos, dev_vel1);
+
+	// - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
+	std::swap(dev_vel1, dev_vel2);
 }
 
-void Boids::endSimulation() {
-  cudaFree(dev_vel1);
-  cudaFree(dev_vel2);
-  cudaFree(dev_pos);
-
-  // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+void Boids::endSimulation() 
+{
+	cudaFree(dev_vel1);
+	cudaFree(dev_vel2);
+	cudaFree(dev_pos);
+
+	// DONE-2.1 DONE-2.3 - Free any additional buffers here.
+	cudaFree(dev_particleArrayIndices);
+	cudaFree(dev_particleGridIndices);
+	cudaFree(dev_gridCellStartIndices);
+	cudaFree(dev_gridCellEndIndices);
+	cudaFree(dev_pos_shuffler);
+	cudaFree(dev_vel_shuffler);
 }
 
 void Boids::unitTest() {
diff --git a/src/main.cpp b/src/main.cpp
index e416836..1f86d5f 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -14,11 +14,11 @@
 
 // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
 #define VISUALIZE 1
-#define UNIFORM_GRID 0
-#define COHERENT_GRID 0
+#define UNIFORM_GRID 1
+#define COHERENT_GRID 1
 
 // LOOK-1.2 - change this to adjust particle count in the simulation
-const int N_FOR_VIS = 5000;
+const int N_FOR_VIS = 10000;
 const float DT = 0.2f;
 
 /**
diff --git a/src/main.hpp b/src/main.hpp
index 6cdaa93..deaf57e 100644
--- a/src/main.hpp
+++ b/src/main.hpp
@@ -36,6 +36,9 @@ const float fovy = (float) (PI / 4);
 const float zNear = 0.10f;
 const float zFar = 10.0f;
 // LOOK-1.2: for high DPI displays, you may want to double these settings.
+//int width = 2560;
+//int height = 1440;
+//int pointSize = 4.0f;
 int width = 1280;
 int height = 720;
 int pointSize = 2.0f;