diff --git a/README.md b/README.md
index d63a6a1..09773a0 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,30 @@
 **University of Pennsylvania, CIS 565: GPU Programming and Architecture,
 Project 1 - Flocking**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+![Flocking Particles](images/flocking.gif)
 
-### (TODO: Your README)
+* Salaar Kohari
+  * [LinkedIn](https://www.linkedin.com/in/salaarkohari), [personal website](http://salaar.kohari.com)
+* Tested on: Windows 10, Intel Xeon @ 3.7GHz 32GB, GTX 1070 8GB (SIG Lab)
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+### Description
+
+Implemented flocking behavior for thousands of particles on the GPU. The flocking behavior combines cohesion, separation, and alignment rules.
+
+The naive implementation processes each particle in parallel: every thread checks all other particles and applies the rules to any particle within the neighborhood distance. This and the other implementations require device-side storage of a position buffer and ping-pong velocity buffers.
+
+The grid implementation divides the space into a uniform 3D grid and only checks particles in cells that may contain neighbors. The coherent grid implementation does the same, with additional memory alignment for better performance at higher particle counts. Grid implementations require extra data arrays that are stored and sorted to map each particle to its grid cell.
+
+### Details/Analysis
+
+For the naive implementation, FPS goes from 1940 (100 to 1,000 boids) to 565 (5,000 boids), dropping to 53 (20,000 boids). Frame time scales directly with the number of boids, since every thread must check all other boids to compute the flocking behavior.
+
+![Naive FPS Analysis](images/naiveFPS.png)
+
+Grid implementations were attempted, but the program kept crashing. With GPU debugging not working on the lab computer, I was unable to track down the problem.
+
+I expect boid count would not have as drastic an effect on the grid-based implementation: more boids would still need to be checked, but compute time should grow sub-linearly (each thread only searches nearby cells) rather than linearly with boid count.
+
+The coherent uniform grid should also improve performance once the number of boids gets large (in the tens of thousands), because the savings from faster, more contiguous memory accesses add up. Changing the cell width and the number of cells to check would slow performance, since it would take longer to build the grid arrays and to search more cells.
+
+Lastly, as a note, I changed the GPU architecture in CMakeLists.txt to sm_50 to support my GPU.
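A host-side sketch of the grid bookkeeping the README describes: label each boid with its grid cell, sort boid indices by cell (the patch below does this on the GPU with `kernComputeIndices` and `thrust::sort_by_key`), then record where each cell's run starts and ends (as `kernIdentifyCellStartEnd` does). The names here (`GridTables`, `buildGridTables`, `cellOfBoid`) are hypothetical and not part of the project code; the sketch assumes every entry of `cellOfBoid` is in `[0, cellCount)`.

```cpp
// Illustrative CPU reference for the label -> sort -> start/end pipeline.
#include <algorithm>
#include <numeric>
#include <vector>

struct GridTables {
  std::vector<int> boidIndex;  // boid ids, sorted by grid cell
  std::vector<int> cellStart;  // first entry of each cell in boidIndex, -1 if empty
  std::vector<int> cellEnd;    // last entry of each cell (inclusive), -1 if empty
};

GridTables buildGridTables(const std::vector<int>& cellOfBoid, int cellCount) {
  const int n = static_cast<int>(cellOfBoid.size());
  GridTables t;
  t.boidIndex.resize(n);
  std::iota(t.boidIndex.begin(), t.boidIndex.end(), 0);

  // Sort boid ids by their cell index (thrust::sort_by_key plays this role on the GPU).
  std::sort(t.boidIndex.begin(), t.boidIndex.end(),
            [&](int a, int b) { return cellOfBoid[a] < cellOfBoid[b]; });

  t.cellStart.assign(cellCount, -1);
  t.cellEnd.assign(cellCount, -1);
  for (int i = 0; i < n; ++i) {
    int cell = cellOfBoid[t.boidIndex[i]];
    if (t.cellStart[cell] == -1) t.cellStart[cell] = i;  // first time this cell appears
    t.cellEnd[cell] = i;                                  // keeps advancing to the last entry
  }
  return t;
}
```

A neighbor search then only walks `boidIndex[cellStart[c]..cellEnd[c]]` for the handful of cells that can contain neighbors, which is the whole point of the uniform grid described above.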
diff --git a/images/flocking.gif b/images/flocking.gif
new file mode 100644
index 0000000..3494428
Binary files /dev/null and b/images/flocking.gif differ
diff --git a/images/naiveFPS.png b/images/naiveFPS.png
new file mode 100644
index 0000000..24c0f59
Binary files /dev/null and b/images/naiveFPS.png differ
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index fdd636d..dff0113 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -10,5 +10,5 @@ set(SOURCE_FILES
 
 cuda_add_library(src
     ${SOURCE_FILES}
-    OPTIONS -arch=sm_20
+    OPTIONS -arch=sm_50
     )
diff --git a/src/kernel.cu b/src/kernel.cu
index 74dffcb..19647a4 100644
--- a/src/kernel.cu
+++ b/src/kernel.cu
@@ -169,6 +169,21 @@ void Boids::initSimulation(int N) {
   gridMinimum.z -= halfGridWidth;
 
   // TODO-2.1 TODO-2.3 - Allocate additional buffers here.
+  cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!");
+
+  cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!");
+
+  // The start/end tables hold one entry per grid cell, so size them by gridCellCount rather than N.
+  cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!");
+
+  cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int));
+  checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!");
+
+  dev_thrust_particleArrayIndices = thrust::device_ptr<int>(dev_particleArrayIndices);
+  dev_thrust_particleGridIndices = thrust::device_ptr<int>(dev_particleGridIndices);
+
   cudaDeviceSynchronize();
 }
 
@@ -233,18 +248,64 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos
   // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves
   // Rule 2: boids try to stay a distance d away from each other
   // Rule 3: boids try to match the speed of surrounding boids
-  return glm::vec3(0.0f, 0.0f, 0.0f);
+
+  glm::vec3 perceivedCenter(0.0f);
+  glm::vec3 separate(0.0f);
+  glm::vec3 perceivedVelocity(0.0f);
+  int centerNeighbors = 0;
+  int velocityNeighbors = 0;
+
+  for (int i = 0; i < N; ++i) {
+    if (i == iSelf)
+      continue;
+
+    float distance = glm::distance(pos[i], pos[iSelf]);
+
+    if (distance < rule1Distance)
+    {
+      perceivedCenter += pos[i];
+      ++centerNeighbors;
+    }
+
+    if (distance < rule2Distance)
+      separate -= pos[i] - pos[iSelf];
+
+    if (distance < rule3Distance)
+    {
+      perceivedVelocity += vel[i];
+      ++velocityNeighbors;
+    }
+  }
+
+  perceivedCenter /= centerNeighbors ? centerNeighbors : 1;
+  perceivedVelocity /= velocityNeighbors ? velocityNeighbors : 1;
+
+  glm::vec3 velocity = vel[iSelf];
+  velocity += centerNeighbors ? (perceivedCenter - pos[iSelf]) * rule1Scale : glm::vec3(0.0f);
+  velocity += separate * rule2Scale;
+  velocity += perceivedVelocity * rule3Scale;
+
+  return velocity;
 }
 
 /**
 * TODO-1.2 implement basic flocking
 * For each of the `N` bodies, update its position based on its current velocity.
 */
-__global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos,
-  glm::vec3 *vel1, glm::vec3 *vel2) {
-  // Compute a new velocity based on pos and vel1
-  // Clamp the speed
-  // Record the new velocity into vel2. Question: why NOT vel1?
+__global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) {
+  int index = threadIdx.x + (blockIdx.x * blockDim.x);
+  if (index >= N)
+    return;
+
+  // Compute a new velocity based on pos and vel1
+  glm::vec3 velocity = computeVelocityChange(N, index, pos, vel1);
+
+  // Clamp the speed
+  if (glm::length(velocity) > maxSpeed)
+    velocity = maxSpeed * glm::normalize(velocity);
+
+  // Record the new velocity into vel2
+  vel2[index] = velocity;
 }
 
 /**
@@ -289,6 +350,14 @@ __global__ void kernComputeIndices(int N, int gridResolution,
   // - Label each boid with the index of its grid cell.
   // - Set up a parallel array of integer indices as pointers to the actual
   //   boid data in pos and vel1/vel2
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index >= N)
+    return;
+
+  glm::vec3 gridPos = (pos[index] - gridMin) * inverseCellWidth;
+  int gridCell = gridIndex3Dto1D((int)gridPos.x, (int)gridPos.y, (int)gridPos.z, gridResolution);
+  indices[index] = index;
+  gridIndices[index] = gridCell;
 }
 
 // LOOK-2.1 Consider how this could be useful for indicating that a cell
@@ -306,6 +375,23 @@ __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices,
   // Identify the start point of each cell in the gridIndices array.
   // This is basically a parallel unrolling of a loop that goes
   // "this index doesn't match the one before it, must be a new cell!"
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index >= N)
+    return;
+
+  int currIdx = particleGridIndices[index];
+  if (index == 0) {
+    gridCellStartIndices[currIdx] = index;
+    return;
+  }
+  int prevIdx = particleGridIndices[index - 1];
+  if (currIdx != prevIdx) {
+    gridCellStartIndices[currIdx] = index;
+    gridCellEndIndices[prevIdx] = index - 1;
+  }
+  if (index == N - 1) {
+    // The last sorted entry closes its own cell's run.
+    gridCellEndIndices[currIdx] = index;
+  }
 }
 
 __global__ void kernUpdateVelNeighborSearchScattered(
@@ -322,6 +408,85 @@ __global__ void kernUpdateVelNeighborSearchScattered(
   // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce
   // the number of boids that need to be checked.
   // - Identify the grid cell that this particle is in
   // - Identify which cells may contain neighbors. This isn't always 8.
   // - For each cell, read the start/end indices in the boid pointer array.
   // - Access each boid in the cell and compute velocity change from
   //   the boids rules, if this boid is within the neighborhood distance.
   // - Clamp the speed change before putting the new speed in vel2
+  int index = (blockIdx.x * blockDim.x) + threadIdx.x;
+  if (index >= N)
+    return;
+
+  glm::vec3 gridPos = (pos[index] - gridMin) * inverseCellWidth;
+  int gridCell = gridIndex3Dto1D((int)gridPos.x, (int)gridPos.y, (int)gridPos.z, gridResolution);
+
+  int neighborCells[8];
+  int neighborIdx = 0;
+  neighborCells[0] = gridCell;
+
+  int quadx = gridPos.x - (int)gridPos.x >= 0.5f ? 1 : -1;
+  int quady = gridPos.y - (int)gridPos.y >= 0.5f ? 1 : -1;
+  int quadz = gridPos.z - (int)gridPos.z >= 0.5f ? 1 : -1;
+
+  bool xinrange = gridPos.x + quadx >= 0 && gridPos.x + quadx < gridResolution;
+  bool yinrange = gridPos.y + quady >= 0 && gridPos.y + quady < gridResolution;
+  bool zinrange = gridPos.z + quadz >= 0 && gridPos.z + quadz < gridResolution;
+
+  if (xinrange)
+    neighborCells[++neighborIdx] = gridIndex3Dto1D(gridPos.x + quadx, gridPos.y, gridPos.z, gridResolution);
+  if (yinrange)
+    neighborCells[++neighborIdx] = gridIndex3Dto1D(gridPos.x, gridPos.y + quady, gridPos.z, gridResolution);
+  if (zinrange)
+    neighborCells[++neighborIdx] = gridIndex3Dto1D(gridPos.x, gridPos.y, gridPos.z + quadz, gridResolution);
+  if (xinrange && yinrange)
+    neighborCells[++neighborIdx] = gridIndex3Dto1D(gridPos.x + quadx, gridPos.y + quady, gridPos.z, gridResolution);
+  if (xinrange && zinrange)
+    neighborCells[++neighborIdx] = gridIndex3Dto1D(gridPos.x + quadx, gridPos.y, gridPos.z + quadz, gridResolution);
+  if (yinrange && zinrange)
+    neighborCells[++neighborIdx] = gridIndex3Dto1D(gridPos.x, gridPos.y + quady, gridPos.z + quadz, gridResolution);
+  if (xinrange && yinrange && zinrange)
+    neighborCells[++neighborIdx] = gridIndex3Dto1D(gridPos.x + quadx, gridPos.y + quady, gridPos.z + quadz, gridResolution);
+
+  glm::vec3 perceivedCenter(0.0f);
+  glm::vec3 separate(0.0f);
+  glm::vec3 perceivedVelocity(0.0f);
+  int centerNeighbors = 0;
+  int velocityNeighbors = 0;
+  glm::vec3 velocity = vel1[index];
+
+  for (int cellIdx = 0; cellIdx <= neighborIdx; cellIdx++) {
+    int cell = neighborCells[cellIdx];
+    int startIdx = gridCellStartIndices[cell];
+    int endIdx = gridCellEndIndices[cell];
+    if (startIdx == -1 || startIdx >= N || endIdx < startIdx || endIdx >= N)
+      continue;
+
+    for (int i = startIdx; i <= endIdx; ++i) {
+      int neighbor = particleArrayIndices[i];
+      if (neighbor == index)
+        continue;
+
+      float distance = glm::distance(pos[neighbor], pos[index]);
+
+      if (distance < rule1Distance)
+      {
+        perceivedCenter += pos[neighbor];
+        ++centerNeighbors;
+      }
+
+      if (distance < rule2Distance)
+        separate -= pos[neighbor] - pos[index];
+
+      if (distance < rule3Distance)
+      {
+        perceivedVelocity += vel1[neighbor];
+        ++velocityNeighbors;
+      }
+    }
+  }
+  perceivedCenter /= centerNeighbors ? centerNeighbors : 1;
+  perceivedVelocity /= velocityNeighbors ? velocityNeighbors : 1;
+
+  velocity += centerNeighbors ? (perceivedCenter - pos[index]) * rule1Scale : glm::vec3(0.0f);
+  velocity += separate * rule2Scale;
+  velocity += perceivedVelocity * rule3Scale;
+
+  // Clamp the speed before writing the new velocity, matching the brute-force kernel.
+  if (glm::length(velocity) > maxSpeed)
+    velocity = maxSpeed * glm::normalize(velocity);
+
+  vel2[index] = velocity;
 }
 
 __global__ void kernUpdateVelNeighborSearchCoherent(
@@ -349,6 +514,14 @@
 
 void Boids::stepSimulationNaive(float dt) {
   // TODO-1.2 - use the kernels you wrote to step the simulation forward in time.
   // TODO-1.2 ping-pong the velocity buffers
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+  kernUpdateVelocityBruteForce<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos, dev_vel1, dev_vel2);
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel1);
+
+  glm::vec3* tempVel;
+  tempVel = dev_vel1;
+  dev_vel1 = dev_vel2;
+  dev_vel2 = tempVel;
 }
 
 void Boids::stepSimulationScatteredGrid(float dt) {
@@ -364,6 +537,29 @@
   // - Perform velocity updates using neighbor search
   // - Update positions
   // - Ping-pong buffers as needed
+  dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize);
+
+  kernComputeIndices<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth,
+    dev_pos, dev_particleArrayIndices, dev_particleGridIndices);
+
+  thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + numObjects, dev_thrust_particleArrayIndices);
+
+  // Reset one entry per grid cell, so launch over gridCellCount rather than numObjects.
+  dim3 fullBlocksPerCellGrid((gridCellCount + blockSize - 1) / blockSize);
+  kernResetIntBuffer<<<fullBlocksPerCellGrid, blockSize>>>(gridCellCount, dev_gridCellStartIndices, -1);
+  kernResetIntBuffer<<<fullBlocksPerCellGrid, blockSize>>>(gridCellCount, dev_gridCellEndIndices, -1);
+
+  kernIdentifyCellStartEnd<<<fullBlocksPerGrid, blockSize>>>
+    (numObjects, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices);
+
+  //kernUpdateVelocityBruteForce<<<fullBlocksPerGrid, blockSize>>>(numObjects, dev_pos, dev_vel1, dev_vel2);
+
+  kernUpdateVelNeighborSearchScattered<<<fullBlocksPerGrid, blockSize>>>(numObjects, gridSideCount, gridMinimum, gridInverseCellWidth,
+    gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2);
+
+  kernUpdatePos<<<fullBlocksPerGrid, blockSize>>>(numObjects, dt, dev_pos, dev_vel1);
+
+  glm::vec3* tempVel = dev_vel1;
+  dev_vel1 = dev_vel2;
+  dev_vel2 = tempVel;
 }
 
 void Boids::stepSimulationCoherentGrid(float dt) {
@@ -390,6 +586,10 @@ void Boids::endSimulation() {
   cudaFree(dev_pos);
 
   // TODO-2.1 TODO-2.3 - Free any additional buffers here.
+  cudaFree(dev_particleArrayIndices);
+  cudaFree(dev_particleGridIndices);
+  cudaFree(dev_gridCellStartIndices);
+  cudaFree(dev_gridCellEndIndices);
 }
 
 void Boids::unitTest() {
diff --git a/src/main.cpp b/src/main.cpp
index b82c8c6..db9a64e 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,12 +13,12 @@
 // ================
 
 // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID
-#define VISUALIZE 1
+#define VISUALIZE 0
 #define UNIFORM_GRID 0
 #define COHERENT_GRID 0
 
 // LOOK-1.2 - change this to adjust particle count in the simulation
-const int N_FOR_VIS = 5000;
+const int N_FOR_VIS = 20000;
 const float DT = 0.2f;
 
 /**
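A note on the FPS figures quoted in the README: with VISUALIZE set to 0, one way to measure a single simulation step is with CUDA events. The helper below is an assumed, illustrative sketch rather than code from this patch; `timeStepMilliseconds` is a hypothetical name, while `Boids::stepSimulationNaive` and `DT` come from the project.

```cpp
// Illustrative timing sketch (assumed, not part of this patch).
#include <cuda_runtime.h>

// Times one call to a simulation step (e.g. Boids::stepSimulationNaive)
// and returns the elapsed GPU time in milliseconds.
float timeStepMilliseconds(void (*step)(float), float dt) {
  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  cudaEventRecord(start);
  step(dt);                     // launches the kernels for one frame
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);   // wait until the step has finished

  float ms = 0.0f;
  cudaEventElapsedTime(&ms, start, stop);

  cudaEventDestroy(start);
  cudaEventDestroy(stop);
  return ms;                    // FPS for this frame is roughly 1000.0f / ms
}
```

Averaging this over many frames (for example, `timeStepMilliseconds(Boids::stepSimulationNaive, DT)` in the main loop) gives per-step timings that are independent of the rendering path.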