diff --git a/README.md b/README.md
index d63a6a1..808880c 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,28 @@
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking**
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Xuanzhuo Xu
+  * [LinkedIn](https://www.linkedin.com/in/xuanzhuoxu/), [personal website](https://www.linkedin.com/in/xuanzhuoxu/), [github](https://github.com/PKUxxz), etc.
+* Tested on: Windows Server 2019, E5-2686 @ 2.30GHz 30.5GB, Tesla M60 8GB (AWS g3s.xlarge)

-### (TODO: Your README)
+### GIF Demo of the Coherent Grid Simulation
+
+![](images/res.gif)
+
+### Answers and Results for the Analysis Questions
+
+- For each implementation, how does changing the number of boids affect performance? Why do you think this is?
+  Number of boids vs. FPS:
+  ![](images/NumBoidFPS.png)
+  Increasing the number of boids hurts performance, and enabling visualization hurts it further. More boids mean more neighbor computation per frame, so once the GPU's capacity is saturated, the FPS drops correspondingly.
+- For each implementation, how does changing the block count and block size affect performance? Why do you think this is?
+  ![](images/BlockSizeFPS.png)
+  Block size does not change performance significantly. The total amount of computation is the same regardless of block size, and every tested size keeps the GPU well occupied, so throughput stays roughly constant.
+
+- For the coherent uniform grid: did you experience any performance improvements with the more coherent uniform grid? Was this the outcome you expected? Why or why not?
+
+  Yes, and yes. The coherent grid outperforms the other methods because reshuffling the position and velocity data removes the extra index indirection and makes neighbor reads contiguous in memory. However, as the graph above shows, the improvement is modest when the number of boids is small: the overhead that the coherent layout removes grows with the scale of the problem.
+- Did changing cell width and checking 27 vs 8 neighboring cells affect performance? Why or why not? Be careful: it is insufficient (and possibly incorrect) to say that 27-cell is slower simply because there are more cells to check!
+
+  There is no significant difference between checking 27 cells and checking 8. With half-width cells, the 27-cell search actually covers a smaller volume ((3d)^3 versus (4d)^3 for neighborhood distance d), so it examines no more candidate boids than the 8-cell search; in these tests the measured difference was negligible. Testing at a larger scale might separate the two, but this GPU cannot handle many more boids.

-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
diff --git a/images/BlockSizeFPS.png b/images/BlockSizeFPS.png
new file mode 100644
index 0000000..897b919
Binary files /dev/null and b/images/BlockSizeFPS.png differ
diff --git a/images/NumBoidFPS.png b/images/NumBoidFPS.png
new file mode 100644
index 0000000..fb4046c
Binary files /dev/null and b/images/NumBoidFPS.png differ
diff --git a/images/res.gif b/images/res.gif
new file mode 100644
index 0000000..a7d9c3e
Binary files /dev/null and b/images/res.gif differ
diff --git a/src/kernel.cu b/src/kernel.cu
index 74dffcb..d419faf 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -86,6 +86,10 @@ int *dev_gridCellEndIndices; // to this cell?
 // TODO-2.3 - consider what additional buffers you might need to reshuffle
 // the position and velocity data to be coherent within cells.
+glm::vec3 *dev_pos_coh;
+glm::vec3 *dev_vel1_coh;
+glm::vec3 *dev_vel2_coh;
+
 // LOOK-2.1 - Grid parameters based on simulation parameters.
 // These are automatically computed for you in Boids::initSimulation
 int gridCellCount;
@@ -169,6 +173,31 @@ void Boids::initSimulation(int N) {
   gridMinimum.z -= halfGridWidth;

   // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
+  cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!");
+
+  cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!");
+
+  cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!");
+
+  cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!");
+
+  // Wrap the raw device pointers for use with thrust::sort_by_key.
+  dev_thrust_particleArrayIndices = thrust::device_ptr<int>(dev_particleArrayIndices);
+  dev_thrust_particleGridIndices = thrust::device_ptr<int>(dev_particleGridIndices);
+
+  cudaMalloc((void**)&dev_pos_coh, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_pos_coh failed!");
+
+  cudaMalloc((void**)&dev_vel1_coh, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_vel1_coh failed!");
+
+  cudaMalloc((void**)&dev_vel2_coh, N * sizeof(glm::vec3));
+  checkCUDAErrorWithLine("cudaMalloc dev_vel2_coh failed!");
+
   cudaDeviceSynchronize();
 }

@@ -233,7 +262,46 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po
   // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
   // Rule 2: boids try to stay a distance d away from each other
   // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+  glm::vec3 R1Pos(0.0f); // running sum of neighbor positions (rule 1)
+  glm::vec3 R1V(0.0f);
+  glm::vec3 R2V(0.0f);
+  glm::vec3 R3V(0.0f);
+
+  int R1Cnt = 0;
+  int R3Cnt = 0;
+
+  for (int i = 0; i < N; i++) {
+    if (i == iSelf) {
+      continue;
+    }
+    float curDist = glm::distance(pos[i], pos[iSelf]);
+    if (curDist < rule1Distance) {
+      R1Cnt++;
+      R1Pos += pos[i];
+    }
+    if (curDist < rule2Distance) {
+      R2V -= (pos[i] - pos[iSelf]);
+    }
+    if (curDist < rule3Distance) {
+      R3Cnt++;
+      R3V += vel[i];
+    }
+  }
+
+  // Rule 1: steer toward the perceived center of mass
+  if (R1Cnt > 0) {
+    R1Pos /= R1Cnt;
+    R1V = (R1Pos - pos[iSelf]) * rule1Scale;
+  }
+  // Rule 2: steer away from close neighbors
+  R2V *= rule2Scale;
+  // Rule 3: match the average neighbor velocity
+  if (R3Cnt > 0) {
+    R3V /= R3Cnt;
+    R3V *= rule3Scale;
+  }
+
+  return R1V + R2V + R3V;
 }

 /**
@@ -242,9 +310,19 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po
 */
 __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
-  // Compute a new velocity based on pos and vel1
-  // Clamp the speed
-  // Record the new velocity into vel2. Question: why NOT vel1?
+  // Compute a new velocity based on pos and vel1
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (index >= N) {
+    return;
+  }
+
+  glm::vec3 thisV = vel1[index] + computeVelocityChange(N, index, pos, vel1);
+
+  // Clamp the speed (limit the magnitude, not each component)
+  float speed = glm::length(thisV);
+  if (speed > maxSpeed) {
+    thisV *= maxSpeed / speed;
+  }
+
+  // Record the new velocity into vel2. Question: why NOT vel1?
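+  // Answer: other threads in this launch are still reading vel1 to compute
+  // their own velocity changes, so writing the result in place could race
+  // with those reads; vel2 keeps this step's reads and writes separate.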
+  vel2[index] = thisV;
 }

 /**
@@ -289,6 +367,16 @@ __global__ void kernComputeIndices(int N, int gridResolution,
   // - Label each boid with the index of its grid cell.
   // - Set up a parallel array of integer indices as pointers to the actual
   //   boid data in pos and vel1/vel2
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index < N) {
+    // Quantize the boid's position to 3D cell coordinates, then flatten.
+    glm::vec3 boidPos = pos[index];
+    int x = (int)((boidPos.x - gridMin.x) * inverseCellWidth);
+    int y = (int)((boidPos.y - gridMin.y) * inverseCellWidth);
+    int z = (int)((boidPos.z - gridMin.z) * inverseCellWidth);
+    int gridIdx = gridIndex3Dto1D(x, y, z, gridResolution);
+    gridIndices[index] = gridIdx;
+    indices[index] = index;
+  }
 }

 // LOOK-2.1 Consider how this could be useful for indicating that a cell
@@ -306,6 +394,21 @@ __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
   // Identify the start point of each cell in the gridIndices array.
   // This is basically a parallel unrolling of a loop that goes
   // "this index doesn't match the one before it, must be a new cell!"
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index < N) {
+    if (index == 0) { // the first boid always starts its cell
+      gridCellStartIndices[particleGridIndices[index]] = index;
+    }
+    if (index == N - 1) { // the last boid always ends its cell
+      gridCellEndIndices[particleGridIndices[index]] = index;
+    }
+    // A boundary between two different cell indices ends one cell and starts
+    // the next. (These checks must not be chained as else-ifs: boid 0 can
+    // also sit on a boundary, e.g. when it is alone in its cell.)
+    if (index < N - 1 && particleGridIndices[index] != particleGridIndices[index + 1]) {
+      gridCellStartIndices[particleGridIndices[index + 1]] = index + 1;
+      gridCellEndIndices[particleGridIndices[index]] = index;
+    }
+  }
 }

 __global__ void kernUpdateVelNeighborSearchScattered(
@@ -322,6 +425,67 @@
   // - Access each boid in the cell and compute velocity change from
   //   the boids rules, if this boid is within the neighborhood distance.
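+  //   (Implementation note: the loops below scan the full 3x3x3 block of
+  //   neighboring cells, clamped to the grid bounds, and skip any cell whose
+  //   start index is still -1, i.e. a cell that contains no boids.)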
   // - Clamp the speed change before putting the new speed in vel2
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index < N) {
+    // Identify the grid cell this boid is in.
+    glm::vec3 boidPos = pos[index];
+    int x = (int)((boidPos.x - gridMin.x) * inverseCellWidth);
+    int y = (int)((boidPos.y - gridMin.y) * inverseCellWidth);
+    int z = (int)((boidPos.z - gridMin.z) * inverseCellWidth);
+
+    glm::vec3 R1Pos(0.0f);
+    glm::vec3 R1V(0.0f);
+    glm::vec3 R2V(0.0f);
+    glm::vec3 R3V(0.0f);
+
+    int R1Cnt = 0;
+    int R3Cnt = 0;
+
+    for (int ix = imax(x - 1, 0); ix <= imin(x + 1, gridResolution - 1); ix++) {
+      for (int iy = imax(y - 1, 0); iy <= imin(y + 1, gridResolution - 1); iy++) {
+        for (int iz = imax(z - 1, 0); iz <= imin(z + 1, gridResolution - 1); iz++) {
+          int curIdx = gridIndex3Dto1D(ix, iy, iz, gridResolution);
+          int curStart = gridCellStartIndices[curIdx];
+          if (curStart < 0) { // start index is still -1: the cell is empty
+            continue;
+          }
+          int curEnd = gridCellEndIndices[curIdx];
+          for (int ib = curStart; ib <= curEnd; ib++) {
+            int boidIdx = particleArrayIndices[ib];
+            if (boidIdx != index) {
+              glm::vec3 curBoidPos = pos[boidIdx];
+              float curDist = glm::distance(curBoidPos, boidPos);
+              // Apply the three flocking rules
+              if (curDist < rule1Distance) {
+                R1Cnt++;
+                R1Pos += curBoidPos;
+              }
+              if (curDist < rule2Distance) {
+                R2V -= (curBoidPos - boidPos);
+              }
+              if (curDist < rule3Distance) {
+                R3Cnt++;
+                R3V += vel1[boidIdx];
+              }
+            }
+          }
+        }
+      }
+    }
+    // Rule 1: steer toward the perceived center of mass
+    if (R1Cnt > 0) {
+      R1Pos /= R1Cnt;
+      R1V = (R1Pos - boidPos) * rule1Scale;
+    }
+    // Rule 2: steer away from close neighbors
+    R2V *= rule2Scale;
+    // Rule 3: match the average neighbor velocity
+    if (R3Cnt > 0) {
+      R3V /= R3Cnt;
+      R3V *= rule3Scale;
+    }
+
+    glm::vec3 newV = vel1[index] + R1V + R2V + R3V;
+    float speed = glm::length(newV);
+    if (speed > maxSpeed) { // clamp the magnitude of the velocity
+      newV *= maxSpeed / speed;
+    }
+    vel2[index] = newV;
+  }
 }

 __global__ void kernUpdateVelNeighborSearchCoherent(
@@ -341,6 +505,66 @@
   // - Access each boid in the cell and compute velocity change from
   //   the boids rules, if this boid is within the neighborhood distance.
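+  //   (Implementation note: pos and vel1 here are the reshuffled,
+  //   cell-coherent buffers, so boids are read directly by their sorted
+  //   index; the particleArrayIndices indirection of the scattered version
+  //   is gone.)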
   // - Clamp the speed change before putting the new speed in vel2
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index < N) {
+    // Identify the grid cell this boid is in.
+    glm::vec3 boidPos = pos[index];
+    int x = (int)((boidPos.x - gridMin.x) * inverseCellWidth);
+    int y = (int)((boidPos.y - gridMin.y) * inverseCellWidth);
+    int z = (int)((boidPos.z - gridMin.z) * inverseCellWidth);
+
+    glm::vec3 R1Pos(0.0f);
+    glm::vec3 R1V(0.0f);
+    glm::vec3 R2V(0.0f);
+    glm::vec3 R3V(0.0f);
+
+    int R1Cnt = 0;
+    int R3Cnt = 0;
+
+    for (int ix = imax(x - 1, 0); ix <= imin(x + 1, gridResolution - 1); ix++) {
+      for (int iy = imax(y - 1, 0); iy <= imin(y + 1, gridResolution - 1); iy++) {
+        for (int iz = imax(z - 1, 0); iz <= imin(z + 1, gridResolution - 1); iz++) {
+          int curIdx = gridIndex3Dto1D(ix, iy, iz, gridResolution);
+          int curStart = gridCellStartIndices[curIdx];
+          if (curStart < 0) { // start index is still -1: the cell is empty
+            continue;
+          }
+          int curEnd = gridCellEndIndices[curIdx];
+          for (int boidIdx = curStart; boidIdx <= curEnd; boidIdx++) {
+            if (boidIdx != index) {
+              glm::vec3 curBoidPos = pos[boidIdx];
+              float curDist = glm::distance(curBoidPos, boidPos);
+              // Apply the three flocking rules
+              if (curDist < rule1Distance) {
+                R1Cnt++;
+                R1Pos += curBoidPos;
+              }
+              if (curDist < rule2Distance) {
+                R2V -= (curBoidPos - boidPos);
+              }
+              if (curDist < rule3Distance) {
+                R3Cnt++;
+                R3V += vel1[boidIdx];
+              }
+            }
+          }
+        }
+      }
+    }
+    // Rule 1: steer toward the perceived center of mass
+    if (R1Cnt > 0) {
+      R1Pos /= R1Cnt;
+      R1V = (R1Pos - boidPos) * rule1Scale;
+    }
+    // Rule 2: steer away from close neighbors
+    R2V *= rule2Scale;
+    // Rule 3: match the average neighbor velocity
+    if (R3Cnt > 0) {
+      R3V /= R3Cnt;
+      R3V *= rule3Scale;
+    }
+
+    glm::vec3 newV = vel1[index] + R1V + R2V + R3V;
+    float speed = glm::length(newV);
+    if (speed > maxSpeed) { // clamp the magnitude of the velocity
+      newV *= maxSpeed / speed;
+    }
+    vel2[index] = newV;
+  }
 }

 /**
@@ -348,40 +572,78 @@ __global__ void kernUpdateVelNeighborSearchCoherent(
 */
 void Boids::stepSimulationNaive(float dt) {
   // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+  kernUpdateVelocityBruteForce<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos, dev_vel1, dev_vel2);
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);

   // TODO-1.2 ping-pong the velocity buffers
+  cudaMemcpy(dev_vel1, dev_vel2, numObjects * sizeof(glm::vec3), cudaMemcpyDeviceToDevice);
 }

 void Boids::stepSimulationScatteredGrid(float dt) {
   // TODO-2.1
   // Uniform Grid Neighbor search using Thrust sort.
   // In Parallel:
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
   // - label each particle with its array index as well as its grid index.
   //   Use 2x width grids.
+  kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
   // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
   //   are welcome to do a performance comparison.
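+  // thrust::sort_by_key sorts the grid-cell keys and permutes the boid array
+  // indices along with them, so boids that share a cell become contiguous in
+  // dev_particleArrayIndices.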
+  thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+  dim3 numBlocks((gridCellCount + blockSize - 1) / blockSize);
+
+  kernResetIntBuffer<<<numBlocks, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+  kernResetIntBuffer<<<numBlocks, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
   // - Naively unroll the loop for finding the start and end indices of each
   //   cell's data pointers in the array of boid indices
+  kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
   // - Perform velocity updates using neighbor search
+  kernUpdateVelNeighborSearchScattered<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2);
   // - Update positions
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel2);
   // - Ping-pong buffers as needed
+  cudaMemcpy(dev_vel1, dev_vel2, numObjects * sizeof(glm::vec3), cudaMemcpyDeviceToDevice);
+}
+
+// Reorder pos/vel into cell-coherent copies: element i of each output buffer
+// holds the data of the boid whose sorted index is i.
+__global__ void kernCoherentReshuffleParticle(int N, int *particles, glm::vec3 *pos, glm::vec3 *vel, glm::vec3 *pos_coh, glm::vec3 *vel_coh) {
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index < N) {
+    int index_ = particles[index];
+    pos_coh[index] = pos[index_];
+    vel_coh[index] = vel[index_];
+  }
 }

 void Boids::stepSimulationCoherentGrid(float dt) {
   // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid
   // Uniform Grid Neighbor search using Thrust sort on cell-coherent data.
   // In Parallel:
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+
   // - Label each particle with its array index as well as its grid index.
   //   Use 2x width grids
+  kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
   // - Unstable key sort using Thrust. A stable sort isn't necessary, but you
   //   are welcome to do a performance comparison.
+  thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
+  dim3 numBlocks((gridCellCount + blockSize - 1) / blockSize);
+  kernResetIntBuffer<<<numBlocks, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+  kernResetIntBuffer<<<numBlocks, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
   // - Naively unroll the loop for finding the start and end indices of each
   //   cell's data pointers in the array of boid indices
+  kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
   // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all
   //   the particle data in the simulation array.
   //   CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED
+  kernCoherentReshuffleParticle<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_particleArrayIndices, dev_pos, dev_vel1, dev_pos_coh, dev_vel1_coh);
   // - Perform velocity updates using neighbor search
+  kernUpdateVelNeighborSearchCoherent<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_pos_coh, dev_vel1_coh, dev_vel2_coh);
   // - Update positions
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos_coh, dev_vel2_coh);
   // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE.
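+  // After kernUpdatePos above, the freshly updated data lives in the
+  // coherent buffers, so it is copied back into dev_pos/dev_vel1 for the
+  // next frame. (Swapping pointers instead would avoid these two copies.)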
+  cudaMemcpy(dev_vel1, dev_vel2_coh, numObjects * sizeof(glm::vec3), cudaMemcpyDeviceToDevice);
+  cudaMemcpy(dev_pos, dev_pos_coh, numObjects * sizeof(glm::vec3), cudaMemcpyDeviceToDevice);
 }

 void Boids::endSimulation() {
@@ -390,6 +652,14 @@ void Boids::endSimulation() {
   cudaFree(dev_pos);

   // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+  cudaFree(dev_particleArrayIndices);
+  cudaFree(dev_particleGridIndices);
+  cudaFree(dev_gridCellStartIndices);
+  cudaFree(dev_gridCellEndIndices);
+
+  cudaFree(dev_pos_coh);
+  cudaFree(dev_vel1_coh);
+  cudaFree(dev_vel2_coh);
 }

 void Boids::unitTest() {
diff --git a/src/main.cpp b/src/main.cpp
index b82c8c6..6641a62 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,9 +13,9 @@
 // ================

 // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
-#define VISUALIZE 1
-#define UNIFORM_GRID 0
-#define COHERENT_GRID 0
+#define VISUALIZE 0
+#define UNIFORM_GRID 1
+#define COHERENT_GRID 1

 // LOOK-1.2 - change this to adjust particle count in the simulation
 const int N_FOR_VIS = 5000;