diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json new file mode 100644 index 0000000..4b5a896 --- /dev/null +++ b/.vscode/c_cpp_properties.json @@ -0,0 +1,22 @@ +{ + "configurations": [ + { + "name": "Win32", + "includePath": [ + "${workspaceFolder}/**", + "D:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v12.2\\include" + ], + "defines": [ + "_DEBUG", + "UNICODE", + "_UNICODE" + ], + "windowsSdkVersion": "10.0.22621.0", + "compilerPath": "cl.exe", + "cStandard": "c17", + "cppStandard": "c++17", + "intelliSenseMode": "windows-msvc-x64" + } + ], + "version": 4 +} \ No newline at end of file diff --git a/README.md b/README.md index d63a6a1..2bb832d 100644 --- a/README.md +++ b/README.md @@ -1,11 +1,99 @@ **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking** -* (TODO) YOUR NAME HERE - * (TODO) [LinkedIn](), [personal website](), [twitter](), etc. -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Yian Chen + * [LinkedIn](https://www.linkedin.com/in/yian-chen-33a31a1a8/), [personal website](https://sydianandrewchen.github.io/) etc. +* Tested on: Windows 10, AMD Ryzen 5800 HS with Radeon Graphics CPU @ 3.20GHz 16GB, NVIDIA GeForce RTX3060 Laptop 8GB -### (TODO: Your README) -Include screenshots, analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +### Banner + +* `N_FOR_VIS = 5,000; scene_scale=100.0f` +![](images/banner.gif) + +* `N_FOR_VIS = 500,000; scene_scale=200.0f` +![](images/banner2.gif) + +* `N_FOR_VIS = 2,000,000; scene_scale=600.0f` +![](images/banner3.gif) + + +### Performance Analysis + +#### Results + +- Framerate change with increasing # of boids for naive, scattered uniform grid, and coherent uniform grid (with and without visualization) +![](images/boids_no_visual.png) +![](images/boids_with_visual.png) +- Framerate change with increasing block size +![](images/increasing_gridSize.png) + +#### Analysis Methods + +- `cudaEvent_t` is used to record the performance of each implementation under different cases. + +```cpp +runCuda +{ + + ... // before kernel + + #if CUDA_PROFILING + cudaEventRecord(start); + #endif + + // execute the kernel + #if UNIFORM_GRID && COHERENT_GRID + Boids::stepSimulationCoherentGrid(DT); + #elif UNIFORM_GRID + Boids::stepSimulationScatteredGrid(DT); + #else + Boids::stepSimulationNaive(DT); + #endif + + #if CUDA_PROFILING + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + //std::cout << milliseconds << std::endl; + elapsed_frame++; + elapsed_time += milliseconds; + #endif + ... // after kernel +} + +mainLoop +{ + while (...){ + ... + } // after quitting mainLoop + #if CUDA_PROFILING + float average_frame_time = std::max(elapsed_time / elapsed_frame, FLT_EPSILON); + std::cout + << "[Stat] average_frame_time: " + << average_frame_time << " ms average FPS: " + << 1000. / average_frame_time << "\n"; + cudaEventDestroy(start); + cudaEventDestroy(stop); + #endif +} +``` + +- To enable tight profiling, enable `CUDA_PROFILING` macro. After quitting the simulation, the average FPS should be printed to the standard output. + +![](images/profiling.png) + +#### Questions + +* For each implementation, how does changing the number of boids affect performance? Why do you think this is? + * For all implementation, increasing the number of boids will decrease performance. 
This is because both the amount of computation and the memory traffic grow with the number of boids.
+
+* For each implementation, how does changing the block count and block size affect performance? Why do you think this is?
+  * As the block size increases, performance first improves and then degrades. A larger block exposes more parallelism, but it also raises the register and memory pressure within each block; once the per-thread register budget is exceeded, values spill to local/global memory, and the extra memory traffic slows each thread down.
+
+* For the coherent uniform grid: did you experience any performance improvements with the more coherent uniform grid? Was this the outcome you expected? Why or why not?
+  * The performance improvement does not appear until the number of boids grows past a certain point. This is expected: with only a few boids, the time saved by the coherent memory layout does not outweigh the extra time spent reshuffling the position and velocity arrays.
+* Did changing cell width and checking 27 vs 8 neighboring cells affect performance? Why or why not? Be careful: it is insufficient (and possibly incorrect) to say that 27-cell is slower simply because there are more cells to check!
+  * Yes. According to the measurements, the 27-cell search hurts performance mainly at moderate boid counts. It is slower chiefly because much of the work is wasted on irrelevant cells: at least 19 of the 27 cells are guaranteed to contain no boids within the neighborhood distance.
+  ![](images/27vs8_no_visual.png)
diff --git a/images/27vs8_no_visual.png b/images/27vs8_no_visual.png
new file mode 100644
index 0000000..e73b468
Binary files /dev/null and b/images/27vs8_no_visual.png differ
diff --git a/images/banner.gif b/images/banner.gif
new file mode 100644
index 0000000..3d7e751
Binary files /dev/null and b/images/banner.gif differ
diff --git a/images/banner2.gif b/images/banner2.gif
new file mode 100644
index 0000000..6de3aef
Binary files /dev/null and b/images/banner2.gif differ
diff --git a/images/banner3.gif b/images/banner3.gif
new file mode 100644
index 0000000..754f976
Binary files /dev/null and b/images/banner3.gif differ
diff --git a/images/boids_no_visual.png b/images/boids_no_visual.png
new file mode 100644
index 0000000..4f138e8
Binary files /dev/null and b/images/boids_no_visual.png differ
diff --git a/images/boids_with_visual.png b/images/boids_with_visual.png
new file mode 100644
index 0000000..5684f16
Binary files /dev/null and b/images/boids_with_visual.png differ
diff --git a/images/increasing_gridSize.png b/images/increasing_gridSize.png
new file mode 100644
index 0000000..e61cdea
Binary files /dev/null and b/images/increasing_gridSize.png differ
diff --git a/images/profiling.png b/images/profiling.png
new file mode 100644
index 0000000..d8ef639
Binary files /dev/null and b/images/profiling.png differ
diff --git a/src/kernel.cu b/src/kernel.cu
index 74dffcb..81ddb7f 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -5,6 +5,7 @@
 #include
 #include "utilityCore.hpp"
 #include "kernel.h"
+#include
 // LOOK-2.1 potentially useful for doing grid-based neighbor search
 #ifndef imax
@@ -21,6 +22,7 @@
 * Check for CUDA errors; print and exit if there was a problem.
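+ * NOTE: with this change the check runs only when DEBUG is defined; in other
+ * builds the function body compiles to nothing and CUDA errors go unreported.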
*/ void checkCUDAError(const char *msg, int line = -1) { +#ifdef DEBUG cudaError_t err = cudaGetLastError(); if (cudaSuccess != err) { if (line >= 0) { @@ -29,6 +31,7 @@ void checkCUDAError(const char *msg, int line = -1) { fprintf(stderr, "Cuda error: %s: %s.\n", msg, cudaGetErrorString(err)); exit(EXIT_FAILURE); } +#endif // DEBUG } @@ -37,7 +40,7 @@ void checkCUDAError(const char *msg, int line = -1) { *****************/ /*! Block size used for CUDA kernel launch. */ -#define blockSize 128 +#define blockSize 256 // LOOK-1.2 Parameters for the boids algorithm. // These worked well in our reference implementation. @@ -54,6 +57,9 @@ void checkCUDAError(const char *msg, int line = -1) { /*! Size of the starting area in simulation space. */ #define scene_scale 100.0f +#define USE_27_NEIGHBOR 0 /* Use 27 neighbors or 8 neighbors */ + + /*********************************************** * Kernel state (pointers are device pointers) * ***********************************************/ @@ -85,6 +91,8 @@ int *dev_gridCellEndIndices; // to this cell? // TODO-2.3 - consider what additional buffers you might need to reshuffle // the position and velocity data to be coherent within cells. +glm::vec3* dev_coherent_pos; +glm::vec3* dev_coherent_vel; // LOOK-2.1 - Grid parameters based on simulation parameters. // These are automatically computed for you in Boids::initSimulation @@ -94,6 +102,8 @@ float gridCellWidth; float gridInverseCellWidth; glm::vec3 gridMinimum; + + /****************** * initSimulation * ******************/ @@ -133,6 +143,67 @@ __global__ void kernGenerateRandomPosArray(int time, int N, glm::vec3 * arr, flo } } +// LOOK-2.1 Consider this method of computing a 1D index from a 3D grid index. +// LOOK-2.3 Looking at this method, what would be the most memory efficient +// order for iterating over neighboring grid cells? +// for(x) +// for(y) +// for(z)? Or some other order? +__device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) { + return x + y * gridResolution + z * gridResolution * gridResolution; +} + +__global__ void kernComputeIndices(int N, int gridResolution, + glm::vec3 gridMin, float inverseCellWidth, + glm::vec3* pos, int* indices, int* gridIndices) { + // TODO-2.1 + // - Label each boid with the index of its grid cell. + // - Set up a parallel array of integer indices as pointers to the actual + // boid data in pos and vel1/vel2 + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < N) { + glm::ivec3 gridIndex3D = glm::floor((pos[index] - gridMin) * inverseCellWidth); + gridIndices[index] = gridIndex3Dto1D(gridIndex3D.x, gridIndex3D.y, gridIndex3D.z, gridResolution); + indices[index] = index; + } +} + +// LOOK-2.1 Consider how this could be useful for indicating that a cell +// does not enclose any boids +__global__ void kernResetIntBuffer(int N, int* intBuffer, int value) { + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < N) { + intBuffer[index] = value; + } +} + +__global__ void kernIdentifyCellStartEnd(int N, int* particleGridIndices, + int* gridCellStartIndices, int* gridCellEndIndices) { + // TODO-2.1 + // Identify the start point of each cell in the gridIndices array. + // This is basically a parallel unrolling of a loop that goes + // "this index doesn't match the one before it, must be a new cell!" 
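+  // Worked example of the start/end bookkeeping below: with sorted
+  // particleGridIndices = [2, 2, 5, 5, 5, 9] this kernel should yield
+  //   start[2] = 0, end[2] = 1,
+  //   start[5] = 2, end[5] = 4,
+  //   start[9] = 5, end[9] = 5,
+  // while every other cell keeps the -1 written by kernResetIntBuffer.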
+ int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < N) { + int gridIndex = particleGridIndices[index]; + if (index == 0) { + gridCellStartIndices[gridIndex] = index; + } + else { + if (index == N - 1) gridCellEndIndices[gridIndex] = index; + int previousGridIndex = particleGridIndices[index - 1]; + if (gridIndex != previousGridIndex) { // A new cell! + // Start the grid right in this index; + gridCellStartIndices[gridIndex] = index; + // End the grid right before this index; + gridCellEndIndices[previousGridIndex] = index - 1; + } + } + } + +} + + /** * Initialize memory, update some globals */ @@ -169,6 +240,28 @@ void Boids::initSimulation(int N) { gridMinimum.z -= halfGridWidth; // TODO-2.1 TODO-2.3 - Allocate additional buffers here. + cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!"); + + cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!"); + + cudaMalloc((void**)&dev_coherent_pos, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_coherent_pos failed!"); + + cudaMalloc((void**)&dev_coherent_vel, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_coherent_vel failed!"); + + dev_thrust_particleArrayIndices = thrust::device_pointer_cast(dev_particleArrayIndices); + dev_thrust_particleGridIndices = thrust::device_pointer_cast(dev_particleGridIndices); + + //fullBlocksPerGrid = (gridCellCount + blockSize - 1) / blockSize; + + cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!"); + cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!"); + cudaDeviceSynchronize(); } @@ -230,10 +323,51 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) * in the `pos` and `vel` arrays. 
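+ * Returns only the accumulated velocity change from rules 1-3; the caller adds
+ * it to the boid's current velocity and clamps the result to maxSpeed.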
*/ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) { - // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves - // Rule 2: boids try to stay a distance d away from each other - // Rule 3: boids try to match the speed of surrounding boids - return glm::vec3(0.0f, 0.0f, 0.0f); + const glm::vec3 self_pos = pos[iSelf]; + glm::vec3 perceived_center(0, 0, 0); + glm::vec3 c(0.0, 0.0, 0.0); + glm::vec3 perceived_velocity(0, 0, 0); + int rule1_neighbor_cnt = 0; + int rule3_neighbor_cnt = 0; + glm::vec3 other_pos; + for (int index = 0; index < N; ++index) { + other_pos = pos[index]; + if (index != iSelf) { + const float distance = glm::distance(other_pos, self_pos); + // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves + if (distance < rule1Distance) { + perceived_center += other_pos; + rule1_neighbor_cnt++; + } + + // Rule 2: boids try to stay a distance d away from each other + if (distance < rule2Distance) { + c -= (other_pos - self_pos); + } + + // Rule 3: boids try to match the speed of surrounding boids + if (distance < rule3Distance) { + perceived_velocity += vel[index]; + rule3_neighbor_cnt++; + } + + } + } + glm::vec3 changedVel(0, 0, 0); + if (rule1_neighbor_cnt) { + perceived_center /= rule1_neighbor_cnt; + changedVel += (perceived_center - self_pos) * rule1Scale; + } + changedVel += c * rule2Scale; + if (rule3_neighbor_cnt) { + perceived_velocity /= rule3_neighbor_cnt; + changedVel += perceived_velocity * rule3Scale; + } + return changedVel; +} + +__device__ float deviceClampFloat(float value, float min, float max) { + return value < min ? min : ((value > max ? max : value)); } /** @@ -242,9 +376,18 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po */ __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) return; + // Compute a new velocity based on pos and vel1 - // Clamp the speed // Record the new velocity into vel2. Question: why NOT vel1? + // Because vel1 is the old vels needed to calculate the new vels + // Clamp the speed + glm::vec3 changedVel = vel1[index] + computeVelocityChange(N, index, pos, vel1); + if (glm::length(changedVel) > maxSpeed) { + changedVel = glm::normalize(changedVel) * maxSpeed; + } + vel2[index] = changedVel; } /** @@ -272,42 +415,6 @@ __global__ void kernUpdatePos(int N, float dt, glm::vec3 *pos, glm::vec3 *vel) { pos[index] = thisPos; } -// LOOK-2.1 Consider this method of computing a 1D index from a 3D grid index. -// LOOK-2.3 Looking at this method, what would be the most memory efficient -// order for iterating over neighboring grid cells? -// for(x) -// for(y) -// for(z)? Or some other order? -__device__ int gridIndex3Dto1D(int x, int y, int z, int gridResolution) { - return x + y * gridResolution + z * gridResolution * gridResolution; -} - -__global__ void kernComputeIndices(int N, int gridResolution, - glm::vec3 gridMin, float inverseCellWidth, - glm::vec3 *pos, int *indices, int *gridIndices) { - // TODO-2.1 - // - Label each boid with the index of its grid cell. 
- // - Set up a parallel array of integer indices as pointers to the actual - // boid data in pos and vel1/vel2 -} - -// LOOK-2.1 Consider how this could be useful for indicating that a cell -// does not enclose any boids -__global__ void kernResetIntBuffer(int N, int *intBuffer, int value) { - int index = (blockIdx.x * blockDim.x) + threadIdx.x; - if (index < N) { - intBuffer[index] = value; - } -} - -__global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices, - int *gridCellStartIndices, int *gridCellEndIndices) { - // TODO-2.1 - // Identify the start point of each cell in the gridIndices array. - // This is basically a parallel unrolling of a loop that goes - // "this index doesn't match the one before it, must be a new cell!" -} - __global__ void kernUpdateVelNeighborSearchScattered( int N, int gridResolution, glm::vec3 gridMin, float inverseCellWidth, float cellWidth, @@ -316,12 +423,105 @@ __global__ void kernUpdateVelNeighborSearchScattered( glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce // the number of boids that need to be checked. - // - Identify the grid cell that this particle is in - // - Identify which cells may contain neighbors. This isn't always 8. - // - For each cell, read the start/end indices in the boid pointer array. - // - Access each boid in the cell and compute velocity change from - // the boids rules, if this boid is within the neighborhood distance. - // - Clamp the speed change before putting the new speed in vel2 + int index = (blockIdx.x * blockDim.x) + threadIdx.x; + if (index < N) { + int iSelf = particleArrayIndices[index]; + const glm::vec3 self_pos = pos[iSelf]; + + // - Identify the grid cell that this particle is in + //const float maxDistance = max(rule1Distance, max(rule2Distance, rule3Distance)); + //const glm::vec3 searchRange = ; + glm::vec3 perceived_center(0.0, 0.0, 0.0); + glm::vec3 c(0.0, 0.0, 0.0); + glm::vec3 perceived_velocity(0.0, 0.0, 0.0); + int rule1_neighbor_cnt = 0; + int rule3_neighbor_cnt = 0; + +#if USE_27_NEIGHBOR + const glm::vec3 gridIndex3D = glm::floor((self_pos - gridMin) * inverseCellWidth); + for (int z = gridIndex3D.z - 1; z <= gridIndex3D.z + 1; z++) + { + //if (z < 0 || z >= gridResolution) continue; + for (int y = gridIndex3D.y - 1; y <= gridIndex3D.y + 1; y++) + { + //if (y < 0 || y >= gridResolution) continue; + for (int x = gridIndex3D.x - 1; x <= gridIndex3D.x + 1; x++) + { + //if (x < 0 || x >= gridResolution) continue; +#else + const glm::vec3 maxGridIndex3D(glm::floor((self_pos + glm::vec3(max(rule1Distance, max(rule2Distance, rule3Distance))) - gridMin) * inverseCellWidth)); + const glm::vec3 minGridIndex3D(glm::floor((self_pos - glm::vec3(max(rule1Distance, max(rule2Distance, rule3Distance))) - gridMin) * inverseCellWidth)); + + for (int z = minGridIndex3D.z; z <= maxGridIndex3D.z; z++) + { + //if (z < 0 || z >= gridResolution) continue; + for (int y = minGridIndex3D.y; y <= maxGridIndex3D.y; y++) + { + //if (y < 0 || y >= gridResolution) continue; + for (int x = minGridIndex3D.x; x <= maxGridIndex3D.x; x++) + { + //if (x < 0 || x >= gridResolution) continue +#endif + int cellIndex = gridIndex3Dto1D(x, y, z, gridResolution); + int startIndex = gridCellStartIndices[cellIndex]; + int endIndex = gridCellEndIndices[cellIndex]; + if (startIndex == -1) continue; + // - For each cell, read the start/end indices in the boid pointer array. 
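+          // other_index walks the sorted boid-pointer range for this cell;
+          // particleArrayIndices maps it back to the boid's slot in pos/vel1.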
+ for (int other_index = startIndex; other_index <= endIndex; other_index++) { + // - Access each boid in the cell and compute velocity change from + // the boids rules, if this boid is within the neighborhood distance. + const int iOther = particleArrayIndices[other_index]; + const glm::vec3 other_pos = pos[iOther]; + if (iSelf != iOther) { + const float distance = glm::distance(other_pos, self_pos); + // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves + if (distance < rule1Distance) { + perceived_center += other_pos; + rule1_neighbor_cnt++; + } + + // Rule 2: boids try to stay a distance d away from each other + if (distance < rule2Distance) { + c -= (other_pos - self_pos); + } + + // Rule 3: boids try to match the speed of surrounding boids + if (distance < rule3Distance) { + perceived_velocity += vel1[iOther]; + rule3_neighbor_cnt++; + } + + } + } + } + } + } + glm::vec3 newVelocity = vel1[iSelf]; + // Rule 1 + if (rule1_neighbor_cnt) { + perceived_center /= rule1_neighbor_cnt; + newVelocity += (perceived_center - self_pos) * rule1Scale; + } + // Rule 2 + newVelocity += c * rule2Scale; + if (rule3_neighbor_cnt) { + perceived_velocity /= rule3_neighbor_cnt; + newVelocity += perceived_velocity * rule3Scale; + } + + // - Clamp the speed change before putting the new speed in vel2 + if (glm::length(newVelocity) > maxSpeed) { + newVelocity = glm::normalize(newVelocity) * maxSpeed; + } + vel2[iSelf] = newVelocity; + } +} + +__global__ void kernRearrangeSimulationData(int N, glm::vec3* data, glm::vec3* coherentData, int* particleArrayIndices) { + int index = (blockDim.x * blockIdx.x) + threadIdx.x; + if (index >= N) return; + //int iSelf = particleArrayIndices[index]; + coherentData[index] = data[particleArrayIndices[index]]; } __global__ void kernUpdateVelNeighborSearchCoherent( @@ -329,26 +529,117 @@ __global__ void kernUpdateVelNeighborSearchCoherent( float inverseCellWidth, float cellWidth, int *gridCellStartIndices, int *gridCellEndIndices, glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { - // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered, - // except with one less level of indirection. - // This should expect gridCellStartIndices and gridCellEndIndices to refer - // directly to pos and vel1. - // - Identify the grid cell that this particle is in - // - Identify which cells may contain neighbors. This isn't always 8. - // - For each cell, read the start/end indices in the boid pointer array. - // DIFFERENCE: For best results, consider what order the cells should be - // checked in to maximize the memory benefits of reordering the boids data. - // - Access each boid in the cell and compute velocity change from - // the boids rules, if this boid is within the neighborhood distance. - // - Clamp the speed change before putting the new speed in vel2 + // TODO-2.3 - This should be very similar to kernUpdateVelNeighborSearchScattered, + // except with one less level of indirection. + // This should expect gridCellStartIndices and gridCellEndIndices to refer + // directly to pos and vel1. + int iSelf = (blockIdx.x * blockDim.x) + threadIdx.x; + if (iSelf < N) { + const glm::vec3 self_pos = pos[iSelf]; + + // - Identify the grid cell that this particle is in + const glm::vec3 gridIndex3D = glm::floor((self_pos - gridMin) * inverseCellWidth); + // - Identify which cells may contain neighbors. This isn't always 8. 
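+    // Unlike the scattered kernel there is no particleArrayIndices lookup here:
+    // pos and vel1 were reshuffled by kernRearrangeSimulationData, so boids in
+    // the same cell are already contiguous and iSelf indexes them directly.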
+ + glm::vec3 maxGridIndex3D(glm::floor((self_pos + glm::vec3(max(rule1Distance, max(rule2Distance, rule3Distance))) - gridMin) * inverseCellWidth)); + glm::vec3 minGridIndex3D(glm::floor((self_pos - glm::vec3(max(rule1Distance, max(rule2Distance, rule3Distance))) - gridMin) * inverseCellWidth)); + + glm::vec3 perceived_center(0.0, 0.0, 0.0); + glm::vec3 c(0.0, 0.0, 0.0); + glm::vec3 perceived_velocity(0.0, 0.0, 0.0); + int rule1_neighbor_cnt = 0; + int rule3_neighbor_cnt = 0; + for (int z = minGridIndex3D.z; z <= maxGridIndex3D.z; z++) + { + //if (z < 0 || z >= gridResolution) continue; + for (int y = minGridIndex3D.y; y <= maxGridIndex3D.y; y++) + { + //if (y < 0 || y >= gridResolution) continue; + for (int x = minGridIndex3D.x; x <= maxGridIndex3D.x; x++) + { + //if (x < 0 || x >= gridResolution) continue; + //for (int z = gridIndex3D.z - 1; z <= gridIndex3D.z + 1; z++) + //{ + // //if (z < 0 || z >= gridResolution) continue; + // for (int y = gridIndex3D.y - 1; y <= gridIndex3D.y + 1; y++) + // { + // //if (y < 0 || y >= gridResolution) continue; + // for (int x = gridIndex3D.x - 1; x <= gridIndex3D.x + 1; x++) + // { + //if (x < 0 || x >= gridResolution) continue; + int cellIndex = gridIndex3Dto1D(x, y, z, gridResolution); + int startIndex = gridCellStartIndices[cellIndex]; + int endIndex = gridCellEndIndices[cellIndex]; + if (startIndex == -1) continue; + // - For each cell, read the start/end indices in the boid pointer array. + // DIFFERENCE: For best results, consider what order the cells should be + // checked in to maximize the memory benefits of reordering the boids data. + for (int iOther = startIndex; iOther <= endIndex; iOther++) { + // - Access each boid in the cell and compute velocity change from + // the boids rules, if this boid is within the neighborhood distance. + const glm::vec3 other_pos = pos[iOther]; + if (iSelf != iOther) { + const float distance = glm::distance(other_pos, self_pos); + // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves + if (distance < rule1Distance) { + perceived_center += other_pos; + rule1_neighbor_cnt++; + } + + // Rule 2: boids try to stay a distance d away from each other + if (distance < rule2Distance) { + c -= (other_pos - self_pos); + } + + // Rule 3: boids try to match the speed of surrounding boids + if (distance < rule3Distance) { + perceived_velocity += vel1[iOther]; + rule3_neighbor_cnt++; + } + + } + } + } + } + } + glm::vec3 newVelocity = vel1[iSelf]; + // Rule 1 + if (rule1_neighbor_cnt) { + perceived_center /= rule1_neighbor_cnt; + newVelocity += (perceived_center - self_pos) * rule1Scale; + } + // Rule 2 + newVelocity += c * rule2Scale; + if (rule3_neighbor_cnt) { + perceived_velocity /= rule3_neighbor_cnt; + newVelocity += perceived_velocity * rule3Scale; + } + + // - Clamp the speed change before putting the new speed in vel2 + if (glm::length(newVelocity) > maxSpeed) { + newVelocity = glm::normalize(newVelocity) * maxSpeed; + } + vel2[iSelf] = newVelocity; + } } /** * Step the entire N-body simulation by `dt` seconds. */ void Boids::stepSimulationNaive(float dt) { - // TODO-1.2 - use the kernels you wrote to step the simulation forward in time. - // TODO-1.2 ping-pong the velocity buffers + // TODO-1.2 - use the kernels you wrote to step the simulation forward in time. 
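+  // One thread per boid: ceil(numObjects / blockSize) blocks. New velocities go
+  // into dev_vel2 while dev_vel1 still holds the values being read, then the
+  // two buffers are swapped below.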
+ dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + + kernUpdateVelocityBruteForce << > > (numObjects, dev_pos, dev_vel1, dev_vel2); + checkCUDAErrorWithLine("kernUpdateVelocityBruteForce failed!"); + //cudaDeviceSynchronize(); + + kernUpdatePos << > > (numObjects, dt, dev_pos, dev_vel2); + checkCUDAErrorWithLine("kernUpdatePos failed!"); + cudaDeviceSynchronize(); + + // TODO-1.2 ping-pong the velocity buffers + std::swap(dev_vel1, dev_vel2); } void Boids::stepSimulationScatteredGrid(float dt) { @@ -357,13 +648,38 @@ void Boids::stepSimulationScatteredGrid(float dt) { // In Parallel: // - label each particle with its array index as well as its grid index. // Use 2x width grids. + + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + + kernResetIntBuffer << > > (gridCellCount, dev_gridCellStartIndices, -1); + kernResetIntBuffer << > > (gridCellCount, dev_gridCellEndIndices, -1); + + kernComputeIndices << > > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices); + checkCUDAErrorWithLine("kernComputeIndices failed!"); + // - Unstable key sort using Thrust. A stable sort isn't necessary, but you // are welcome to do a performance comparison. + + thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices); + checkCUDAErrorWithLine("thrust::sort_by_key failed!"); + // - Naively unroll the loop for finding the start and end indices of each // cell's data pointers in the array of boid indices + kernIdentifyCellStartEnd << > > (numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices); + checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!"); + // - Perform velocity updates using neighbor search + kernUpdateVelNeighborSearchScattered << > > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, + dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2); + checkCUDAErrorWithLine("kernUpdateVelNeighborSearchScattered failed!"); + // - Update positions + kernUpdatePos << > > (numObjects, dt, dev_pos, dev_vel2); + checkCUDAErrorWithLine("kernUpdatePos failed!"); + // - Ping-pong buffers as needed + std::swap(dev_vel1, dev_vel2); + } void Boids::stepSimulationCoherentGrid(float dt) { @@ -372,16 +688,41 @@ void Boids::stepSimulationCoherentGrid(float dt) { // In Parallel: // - Label each particle with its array index as well as its grid index. // Use 2x width grids + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + + kernResetIntBuffer << > > (gridCellCount, dev_gridCellStartIndices, -1); + kernResetIntBuffer << > > (gridCellCount, dev_gridCellEndIndices, -1); + + kernComputeIndices << > > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices); + checkCUDAErrorWithLine("kernComputeIndices failed!"); // - Unstable key sort using Thrust. A stable sort isn't necessary, but you // are welcome to do a performance comparison. 
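+  // sort_by_key reorders dev_particleGridIndices in place and applies the same
+  // permutation to dev_particleArrayIndices, so boids that share a grid cell
+  // become contiguous in the sorted order.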
+ thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices); + checkCUDAErrorWithLine("thrust::sort_by_key failed!"); // - Naively unroll the loop for finding the start and end indices of each // cell's data pointers in the array of boid indices + kernIdentifyCellStartEnd << > > (numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices); + checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!"); // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all // the particle data in the simulation array. // CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED + kernRearrangeSimulationData << > > (numObjects, dev_pos, dev_coherent_pos, dev_particleArrayIndices); + checkCUDAErrorWithLine("kernRearrangeSimulationData failed!"); + std::swap(dev_pos, dev_coherent_pos); + + kernRearrangeSimulationData << > > (numObjects, dev_vel1, dev_coherent_vel, dev_particleArrayIndices); + checkCUDAErrorWithLine("kernRearrangeSimulationData failed!"); + + // - Perform velocity updates using neighbor search + kernUpdateVelNeighborSearchCoherent << > > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, + dev_gridCellStartIndices, dev_gridCellEndIndices, dev_pos, dev_coherent_vel, dev_vel2); + checkCUDAErrorWithLine("kernUpdateVelNeighborSearchScattered failed!"); // - Update positions + kernUpdatePos << > > (numObjects, dt, dev_pos, dev_vel2); + checkCUDAErrorWithLine("kernUpdatePos failed!"); // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE. + std::swap(dev_vel1, dev_vel2); } void Boids::endSimulation() { diff --git a/src/kernel.h b/src/kernel.h index 3d3da72..4fa9439 100644 --- a/src/kernel.h +++ b/src/kernel.h @@ -1,5 +1,7 @@ #pragma once +#include + #include #include #include diff --git a/src/main.cpp b/src/main.cpp index b82c8c6..efff2da 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,9 +14,16 @@ // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID #define VISUALIZE 1 -#define UNIFORM_GRID 0 -#define COHERENT_GRID 0 - +#define UNIFORM_GRID 1 +#define COHERENT_GRID 1 +#define CUDA_PROFILING 1 /* No visulization better be guaranteed. 
Else the fps will be limited by the fps of screen */ + +#if CUDA_PROFILING +cudaEvent_t start; +cudaEvent_t stop; +float elapsed_time = 0.0f; +int elapsed_frame = 0; +#endif // LOOK-1.2 - change this to adjust particle count in the simulation const int N_FOR_VIS = 5000; const float DT = 0.2f; @@ -117,6 +124,11 @@ bool init(int argc, char **argv) { glEnable(GL_DEPTH_TEST); +#if CUDA_PROFILING + cudaEventCreate(&start); + cudaEventCreate(&stop); +#endif + return true; } @@ -195,6 +207,10 @@ void initShaders(GLuint * program) { cudaGLMapBufferObject((void**)&dptrVertPositions, boidVBO_positions); cudaGLMapBufferObject((void**)&dptrVertVelocities, boidVBO_velocities); + + #if CUDA_PROFILING + cudaEventRecord(start); + #endif // execute the kernel #if UNIFORM_GRID && COHERENT_GRID Boids::stepSimulationCoherentGrid(DT); @@ -204,6 +220,16 @@ void initShaders(GLuint * program) { Boids::stepSimulationNaive(DT); #endif + #if CUDA_PROFILING + cudaEventRecord(stop); + cudaEventSynchronize(stop); + float milliseconds = 0; + cudaEventElapsedTime(&milliseconds, start, stop); + //std::cout << milliseconds << std::endl; + elapsed_frame++; + elapsed_time += milliseconds; + #endif + #if VISUALIZE Boids::copyBoidsToVBO(dptrVertPositions, dptrVertVelocities); #endif @@ -256,6 +282,14 @@ void initShaders(GLuint * program) { glfwSwapBuffers(window); #endif } + +#if CUDA_PROFILING + float average_frame_time = std::max(elapsed_time / elapsed_frame, FLT_EPSILON); + std::cout << "[Stat] average_frame_time: " << average_frame_time << " ms average FPS: " << 1000. / average_frame_time << "\n"; + cudaEventDestroy(start); + cudaEventDestroy(stop); +#endif + glfwDestroyWindow(window); glfwTerminate(); } diff --git a/tools/data_processing.py b/tools/data_processing.py new file mode 100644 index 0000000..feac5e1 --- /dev/null +++ b/tools/data_processing.py @@ -0,0 +1,66 @@ +import matplotlib.pyplot as plt + +N = [r"$5x10^{}$".format(i) for i in range(2, 7)] + +# Drawing increasing boids, with visuals +fps_brutalForce = [4802.94, 707.513, 42.102, 0.457157, 0.0] +fps_scatteredGrid = [4104.64, 3761.42, 1748.94, 416.962, 18.5537] +fps_coherentGrid = [3738.6, 3349.59, 1713.82, 761.368, 57.0016] + +plt.xlabel("N count") +plt.ylabel("FPS") +plt.plot(N, fps_brutalForce, label="brutalForce", color="r", marker="o") +plt.plot(N, fps_scatteredGrid, label="scatteredGrid", color="g", marker="o") +plt.plot(N, fps_coherentGrid, label="coherentGrid", color="b", marker="o") +plt.legend(labels=["brutalForce", "scatteredGrid", "coherentGrid"]) +plt.title("FPS of kernel under different method (With visulization)") +# plt.show() +plt.savefig("../images/boids_with_visual.png") + +plt.clf() + +# Drawing increasing boids, no visuals +fps_brutalForce = [8297.48, 1227.13, 42.882, 0.46393, 0.0] +fps_scatteredGrid = [7666.14, 7279.71, 4348.93, 380.379, 18.6696] +fps_coherentGrid = [7383.36, 7034.57, 4743.67, 683.372, 57.848] + +plt.xlabel("N count") +plt.ylabel("FPS") +plt.plot(N, fps_brutalForce, label="brutalForce", color="r", marker="o") +plt.plot(N, fps_scatteredGrid, label="scatteredGrid", color="g", marker="o") +plt.plot(N, fps_coherentGrid, label="coherentGrid", color="b", marker="o") +plt.legend(labels=["brutalForce", "scatteredGrid", "coherentGrid"]) +plt.title("FPS of kernel under different method (No visulization)") +# plt.show() +plt.savefig("../images/boids_no_visual.png") + +plt.clf() + +# Drawing 27-cell vs 8-cell +plt.xlabel("N count") +plt.ylabel("FPS") +fps_scatteredGrid27Neighbors = [7285.03, 6472.57, 3955.12, 353.385, 18.667] 
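+# 8-cell baseline: same values as fps_scatteredGrid in the "no visualization" run above.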
+fps_scatteredGrid8Neighbors = [7666.14, 7279.71, 4348.93, 380.379, 18.6696]
+plt.plot(N, fps_scatteredGrid27Neighbors, label="27-cell", marker="o")
+plt.plot(N, fps_scatteredGrid8Neighbors, label="8-cell", marker="o")
+plt.legend(labels=["27-cell", "8-cell"])
+plt.title("FPS of 27-cell vs 8-cell (No visualization)")
+# plt.show()
+plt.savefig("../images/27vs8_no_visual.png")
+
+plt.clf()
+
+
+# Drawing increasing blockSize
+
+blockSize = [r"$2^{}$".format(i) for i in range(2, 11)]
+fps_coherentGridBlockSizeChanged = [11.7512, 18.7417, 28.524, 41.9919, 57.0016, 62.4268, 64.3508, 61.0085, 59.5032]
+plt.xlabel("Block size")
+plt.ylabel("FPS")
+plt.plot(blockSize, fps_coherentGridBlockSizeChanged, marker="o")
+plt.title("FPS of coherent grid over increasing block size (With visualization)")
+# plt.show()
+plt.savefig("../images/increasing_gridSize.png")
+
+if __name__ == "__main__":
+    pass
\ No newline at end of file
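A side note on the 27-vs-8 question answered in the README above: the cell-count bound follows directly from the cell width. Below is a minimal standalone sketch, not part of the patch; `cellsPerAxis` is a hypothetical helper and `maxRuleDistance = 5.0f` is only an illustrative stand-in for `max(rule1Distance, max(rule2Distance, rule3Distance))`. It shows that a cell width of twice the largest rule distance bounds the neighbor search to 2 cells per axis (8 cells total), while a cell width equal to that distance needs 3 per axis (27 total).

```cpp
// Sketch: how many grid cells the min/max-index neighbor search can touch along one axis.
#include <algorithm>
#include <cmath>
#include <cstdio>

// posInCell is the boid's offset inside its own cell along one axis, in [0, cellWidth).
int cellsPerAxis(float cellWidth, float maxRuleDistance, float posInCell) {
  int minIdx = (int)std::floor((posInCell - maxRuleDistance) / cellWidth);
  int maxIdx = (int)std::floor((posInCell + maxRuleDistance) / cellWidth);
  return maxIdx - minIdx + 1;
}

int main() {
  const float maxRuleDistance = 5.0f;  // stand-in for max(rule1Distance, rule2Distance, rule3Distance)
  const float widths[] = { 2.0f * maxRuleDistance, maxRuleDistance };
  for (float w : widths) {
    int worst = 0;
    // Sweep the boid's offset across its cell to find the worst case along one axis.
    for (float p = 0.0f; p < w; p += 0.01f) {
      worst = std::max(worst, cellsPerAxis(w, maxRuleDistance, p));
    }
    std::printf("cellWidth = %4.1f -> up to %d cells per axis (%d cells total)\n",
                w, worst, worst * worst * worst);
  }
  return 0;
}
```

Cubing the per-axis count gives the 8 and 27 figures compared in the plot above; the remaining 19 cells in the 27-cell sweep are the ones the README calls out as guaranteed wasted work.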