diff --git a/README.md b/README.md index 98dd9a8..37ba7ce 100644 --- a/README.md +++ b/README.md @@ -1,10 +1,44 @@ **University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 1 - Flocking** -* (TODO) YOUR NAME HERE -* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab) +* Akshay Shah +* Tested on: Windows 10, i7-5700HQ @ 2.70GHz 16GB, GTX 970M 6GB (Personal Computer) -### (TODO: Your README) +### Screenshots -Include screenshots, analysis, etc. (Remember, this is public, so don't put -anything here that you don't want to share with the world.) +Here is an implementation of the coherent grid boid simulation with 5000 and 120,000 boids respectively: + +![](images/perf_128_std.gif) +5000 boids + +![](images/perf_256_std.gif) +120,000 boids + +### Analysis + +* For each implementation, how does changing the number of boids affect performance? Why do you think this is? + + A: Increasing the number of boids decreases the FPS, as each boid has more neighbors to examine when computing its velocity change. + Here is an image comparing the different implementations with increasing number of boids: + + ![](images/128vs256boidsvsfpsnaivevscoherentvsuniform.png) + + +* For each implementation, how does changing the block count and block size affect performance? Why do you think this is? + + A: Increasing the block size slightly increases the FPS for the same number of boids. This may be due to how the work is divided into blocks per kernel launch. + ![](images/128vs256boidsvsfpsnaive.png) + +* For the coherent uniform grid: did you experience any performance improvements with the more coherent uniform grid? Was this the outcome you expected? Why or why not? + + A: The average FPS for the coherent grid was lower for smaller numbers of boids, but it stays at a stable rate of 45fps for 80,000 boids, whereas the scattered uniform grid degrades over time to 30fps for the same number of boids.
Look at the following graph for an example: + + ![](images/128vs256boidsvsfpsnaivevscoherentvsuniform.png) + + Following is an analysis of the average time spent in a function: + ![](images/perf_analysis.png) + Notice how the average time spent is 94% in updating velocities. + This is a naive implementation of the simulation. + + ![](images/perf_analysis_20k_std.png) + Notice the average time spent in the update velocity is 20% and is staggered over the update position and compute start and end cell grid indices. diff --git a/images/128vs256boidsvsfpsnaive.png b/images/128vs256boidsvsfpsnaive.png new file mode 100644 index 0000000..5b40de3 Binary files /dev/null and b/images/128vs256boidsvsfpsnaive.png differ diff --git a/images/128vs256boidsvsfpsnaivevscoherentvsuniform.png b/images/128vs256boidsvsfpsnaivevscoherentvsuniform.png new file mode 100644 index 0000000..df6f7f0 Binary files /dev/null and b/images/128vs256boidsvsfpsnaivevscoherentvsuniform.png differ diff --git a/images/perf_128_std.gif b/images/perf_128_std.gif new file mode 100644 index 0000000..d9a9cbc Binary files /dev/null and b/images/perf_128_std.gif differ diff --git a/images/perf_256_std.gif b/images/perf_256_std.gif new file mode 100644 index 0000000..a475614 Binary files /dev/null and b/images/perf_256_std.gif differ diff --git a/images/perf_analysis.png b/images/perf_analysis.png new file mode 100644 index 0000000..27d25cf Binary files /dev/null and b/images/perf_analysis.png differ diff --git a/images/perf_analysis_20k_std.png b/images/perf_analysis_20k_std.png new file mode 100644 index 0000000..c620398 Binary files /dev/null and b/images/perf_analysis_20k_std.png differ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index fdd636d..eeaabd4 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -10,5 +10,5 @@ set(SOURCE_FILES cuda_add_library(src ${SOURCE_FILES} - OPTIONS -arch=sm_20 + OPTIONS -arch=sm_52 ) diff --git a/src/kernel.cu b/src/kernel.cu index 30356b9..081199f 100644 
--- a/src/kernel.cu +++ b/src/kernel.cu @@ -67,6 +67,7 @@ dim3 threadsPerBlock(blockSize); // boid cares about its neighbors' velocities. // These are called ping-pong buffers. glm::vec3 *dev_pos; +glm::vec3 *dev_pos_coherent; glm::vec3 *dev_vel1; glm::vec3 *dev_vel2; @@ -145,6 +146,9 @@ void Boids::initSimulation(int N) { cudaMalloc((void**)&dev_pos, N * sizeof(glm::vec3)); checkCUDAErrorWithLine("cudaMalloc dev_pos failed!"); + cudaMalloc((void**)&dev_pos_coherent, N * sizeof(glm::vec3)); + checkCUDAErrorWithLine("cudaMalloc dev_pos_coherent failed!"); + cudaMalloc((void**)&dev_vel1, N * sizeof(glm::vec3)); checkCUDAErrorWithLine("cudaMalloc dev_vel1 failed!"); @@ -169,6 +173,21 @@ void Boids::initSimulation(int N) { gridMinimum.z -= halfGridWidth; // TODO-2.1 TODO-2.3 - Allocate additional buffers here. + cudaMalloc((void**)&dev_particleArrayIndices, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_particleArrayIndices failed!"); + + cudaMalloc((void**)&dev_particleGridIndices, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_particleGridIndices failed!"); + + dev_thrust_particleGridIndices = thrust::device_ptr(dev_particleGridIndices); + dev_thrust_particleArrayIndices = thrust::device_ptr(dev_particleArrayIndices); + + cudaMalloc((void**)&dev_gridCellStartIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellStartIndices failed!"); + + cudaMalloc((void**)&dev_gridCellEndIndices, gridCellCount * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_gridCellEndIndices failed!"); + cudaThreadSynchronize(); } @@ -230,10 +249,37 @@ void Boids::copyBoidsToVBO(float *vbodptr_positions, float *vbodptr_velocities) * in the `pos` and `vel` arrays. 
*/ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *pos, const glm::vec3 *vel) { - // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves - // Rule 2: boids try to stay a distance d away from each other - // Rule 3: boids try to match the speed of surrounding boids - return glm::vec3(0.0f, 0.0f, 0.0f); + glm::vec3 v1(0.f), v2(0.f), v3(0.f); + int index; + float ctr1 = 0.f, ctr3 = 0.f; + for (index = 0; index < N; ++index) { + if (index != iSelf) { + float dist = glm::distance(pos[iSelf], pos[index]); + // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves + if (dist < rule1Distance) { + v1 += pos[index]; + ctr1 += 1.f; + } + // Rule 2: boids try to stay a distance d away from each other + if (dist < rule2Distance) { + v2 -= (pos[index] - pos[iSelf]); + } + // Rule 3: boids try to match the speed of surrounding boids + if (dist < rule3Distance) { + v3 += vel[index]; + ctr3 += 1.f; + } + } + } + v1 = v1 / ctr1; + v1 = (v1 - pos[iSelf]) * rule1Scale; + + v2 = v2 * rule2Scale; + + v3 = v3 / ctr3; + v3 = (v3 - vel[iSelf]) * rule3Scale; + + return vel[iSelf] + v1 + v2 + v3; } /** @@ -243,8 +289,14 @@ __device__ glm::vec3 computeVelocityChange(int N, int iSelf, const glm::vec3 *po __global__ void kernUpdateVelocityBruteForce(int N, glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { // Compute a new velocity based on pos and vel1 + int idx = threadIdx.x + (blockIdx.x * blockDim.x); + glm::vec3 thisVel(0.f); + if (idx >= N) return; + thisVel = computeVelocityChange(N, idx, pos, vel1); // Clamp the speed + thisVel = glm::length(thisVel) > maxSpeed ? glm::normalize(thisVel) * maxSpeed : thisVel; // Record the new velocity into vel2. Question: why NOT vel1? 
+ vel2[idx] = thisVel; } /** @@ -286,24 +338,52 @@ __global__ void kernComputeIndices(int N, int gridResolution, glm::vec3 gridMin, float inverseCellWidth, glm::vec3 *pos, int *indices, int *gridIndices) { // TODO-2.1 + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) return; // - Label each boid with the index of its grid cell. + glm::ivec3 position = (pos[index] - gridMin) * inverseCellWidth; + gridIndices[index] = gridIndex3Dto1D(position.x, position.y, position.z, gridResolution); // - Set up a parallel array of integer indices as pointers to the actual // boid data in pos and vel1/vel2 + indices[index] = index; } // LOOK-2.1 Consider how this could be useful for indicating that a cell // does not enclose any boids __global__ void kernResetIntBuffer(int N, int *intBuffer, int value) { int index = (blockIdx.x * blockDim.x) + threadIdx.x; - if (index < N) { - intBuffer[index] = value; - } + if (index >= N) return; + intBuffer[index] = value; } __global__ void kernIdentifyCellStartEnd(int N, int *particleGridIndices, int *gridCellStartIndices, int *gridCellEndIndices) { // TODO-2.1 - // Identify the start point of each cell in the gridIndices array. + int index = threadIdx.x + (blockIdx.x * blockDim.x); + // Identify the start point of each cell in the gridIndices array. + if (index >= N) return; + int prev = index - 1; + int next = index + 1; + + int gridCellIdx = particleGridIndices[index]; + + if (prev < 0) { + gridCellStartIndices[gridCellIdx] = index; + } + if (next >= N) { + gridCellEndIndices[gridCellIdx] = index; + } + + int prevGridIdx = prev < 0 ? particleGridIndices[index] : particleGridIndices[prev]; + int nextGridIdx = next >= N ? 
particleGridIndices[index] : particleGridIndices[next]; + if (prevGridIdx != gridCellIdx) { + /*gridCellEndIndices[prevGridIdx] = prev;*/ + gridCellStartIndices[gridCellIdx] = index; + } + if (nextGridIdx != gridCellIdx) { + gridCellEndIndices[gridCellIdx] = index; + /*gridCellStartIndices[gridCellIdx] = index;*/ + } // This is basically a parallel unrolling of a loop that goes // "this index doesn't match the one before it, must be a new cell!" } @@ -316,12 +396,82 @@ __global__ void kernUpdateVelNeighborSearchScattered( glm::vec3 *pos, glm::vec3 *vel1, glm::vec3 *vel2) { // TODO-2.1 - Update a boid's velocity using the uniform grid to reduce // the number of boids that need to be checked. - // - Identify the grid cell that this particle is in + //access particle array + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) return; + // the index to access pos, vel1, vel2 data + int boidIdx = particleArrayIndices[index]; + + glm::vec3 position = (pos[boidIdx] - gridMin) * inverseCellWidth; + // - Identify the grid cell that this particle is in + int posx = static_cast(position.x); + int posy = static_cast(position.y); + int posz = static_cast(position.z); + + //which octant is it in? decide by -1 or 1 + int quadx = ((position.x + 0.5f*cellWidth) - posx) == 0 ? -1 : 1; + int quady = ((position.y + 0.5f*cellWidth) - posy) == 0 ? -1 : 1; + int quadz = ((position.z + 0.5f*cellWidth) - posz) == 0 ? -1 : 1; // - Identify which cells may contain neighbors. This isn't always 8. 
+ int neighborGrids[8]; + neighborGrids[0] = gridIndex3Dto1D(posx, posy, posz, gridResolution); + neighborGrids[1] = gridIndex3Dto1D(posx + quadx, posy, posz, gridResolution); + neighborGrids[2] = gridIndex3Dto1D(posx, posy + quady, posz, gridResolution); + neighborGrids[3] = gridIndex3Dto1D(posx, posy, posz + quadz, gridResolution); + neighborGrids[4] = gridIndex3Dto1D(posx + quadx, posy + quady, posz, gridResolution); + neighborGrids[5] = gridIndex3Dto1D(posx + quadx, posy, posz + quadz, gridResolution); + neighborGrids[6] = gridIndex3Dto1D(posx, posy + quady, posz + quadz, gridResolution); + neighborGrids[7] = gridIndex3Dto1D(posx + quadx, posy + quady, posz + quadz, gridResolution); // - For each cell, read the start/end indices in the boid pointer array. - // - Access each boid in the cell and compute velocity change from - // the boids rules, if this boid is within the neighborhood distance. + int i = 0; + glm::vec3 v1(0.f), v2(0.f), v3(0.f); + float ctr1 = 0.f, ctr3 = 0.f; + for (i = 0; i < 8; ++i) { + int cellIdx = neighborGrids[i]; + int startIdx = gridCellStartIndices[cellIdx]; + int endIdx = gridCellEndIndices[cellIdx]; + int j = 0; + // - Access each boid in the cell and compute velocity change from + // the boids rules, if this boid is within the neighborhood distance. + for (j = startIdx; j < endIdx; ++j) { + int neighborBoidIdx = particleArrayIndices[j]; + if (boidIdx != neighborBoidIdx) { + float dist = glm::distance(pos[neighborBoidIdx], pos[boidIdx]); + // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves + if (dist < rule1Distance) { + v1 += pos[neighborBoidIdx]; + ctr1 += 1.f; + } + // Rule 2: boids try to stay a distance d away from each other + if (dist < rule2Distance) { + v2 -= (pos[neighborBoidIdx] - pos[boidIdx]); + } + // Rule 3: boids try to match the speed of surrounding boids + if (dist < rule3Distance) { + v3 += vel1[neighborBoidIdx]; + ctr3 += 1.f; + } + } + } + } + v1 = ctr1 > 0 ? 
(v1 / ctr1) - pos[boidIdx] : v1; + v1 = v1 * rule1Scale; + + v2 = v2 * rule2Scale; + + v3 = ctr3 > 0 ? (v3 / ctr3): v3; + v3 = v3 * rule3Scale; + + glm::vec3 vel = vel1[boidIdx] + v1 + v2 + v3; // - Clamp the speed change before putting the new speed in vel2 + vel2[boidIdx] = glm::length(vel) > maxSpeed ? glm::normalize(vel) * maxSpeed : vel; +} + +__global__ void kernSwitchBuffers(int N, glm::vec3* oldPos, glm::vec3* oldVel, int* oldIndices, glm::vec3* newPos, glm::vec3* newVel) { + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) return; + newPos[index] = oldPos[oldIndices[index]]; + newVel[index] = oldVel[oldIndices[index]]; } __global__ void kernUpdateVelNeighborSearchCoherent( @@ -333,55 +483,154 @@ __global__ void kernUpdateVelNeighborSearchCoherent( // except with one less level of indirection. // This should expect gridCellStartIndices and gridCellEndIndices to refer // directly to pos and vel1. + int index = threadIdx.x + (blockIdx.x * blockDim.x); + if (index >= N) return; // - Identify the grid cell that this particle is in + glm::vec3 position; + position = (pos[index] - gridMin) * inverseCellWidth; + + int posx = static_cast(position.x); + int posy = static_cast(position.y); + int posz = static_cast(position.z); + + int quadx = ((position.x + 0.5f*cellWidth) - posx) == 0 ? -1 : 1; + int quady = ((position.y + 0.5f*cellWidth) - posy) == 0 ? -1 : 1; + int quadz = ((position.z + 0.5f*cellWidth) - posz) == 0 ? -1 : 1; // - Identify which cells may contain neighbors. This isn't always 8. 
+ int neighborGrids[8]; + neighborGrids[0] = gridIndex3Dto1D(posx, posy, posz, gridResolution); + neighborGrids[1] = gridIndex3Dto1D(posx + quadx, posy, posz, gridResolution); + neighborGrids[2] = gridIndex3Dto1D(posx, posy + quady, posz, gridResolution); + neighborGrids[3] = gridIndex3Dto1D(posx, posy, posz + quadz, gridResolution); + neighborGrids[4] = gridIndex3Dto1D(posx + quadx, posy + quady, posz, gridResolution); + neighborGrids[5] = gridIndex3Dto1D(posx + quadx, posy, posz + quadz, gridResolution); + neighborGrids[6] = gridIndex3Dto1D(posx, posy + quady, posz + quadz, gridResolution); + neighborGrids[7] = gridIndex3Dto1D(posx + quadx, posy + quady, posz + quadz, gridResolution); // - For each cell, read the start/end indices in the boid pointer array. - // DIFFERENCE: For best results, consider what order the cells should be - // checked in to maximize the memory benefits of reordering the boids data. - // - Access each boid in the cell and compute velocity change from - // the boids rules, if this boid is within the neighborhood distance. - // - Clamp the speed change before putting the new speed in vel2 + int i = 0; + glm::vec3 v1(0.f), v2(0.f), v3(0.f); + float ctr1 = 0.f, ctr3 = 0.f; + for (i = 0; i < 8; ++i) { + int cellIdx = neighborGrids[i]; + int startIdx = gridCellStartIndices[cellIdx]; + int endIdx = gridCellEndIndices[cellIdx]; + int j = 0; + // - Access each boid in the cell and compute velocity change from + // the boids rules, if this boid is within the neighborhood distance. 
+ for (j = startIdx; j < endIdx; ++j) { + //don't need this for coherent + //int neighborBoidIdx = particleArrayIndices[j]; + if (index != j) { + float dist = glm::distance(pos[j], pos[index]); + // Rule 1: boids fly towards their local perceived center of mass, which excludes themselves + if (dist < rule1Distance) { + v1 += pos[j]; + ctr1 += 1.f; + } + // Rule 2: boids try to stay a distance d away from each other + if (dist < rule2Distance) { + v2 -= (pos[j] - pos[index]); + } + // Rule 3: boids try to match the speed of surrounding boids + if (dist < rule3Distance) { + v3 += vel1[j]; + ctr3 += 1.f; + } + } + } + } + v1 = ctr1 > 0 ? (v1 / ctr1) - pos[index] : v1; + v1 = v1 * rule1Scale; + + v2 = v2 * rule2Scale; + + v3 = ctr3 > 0 ? (v3 / ctr3) : v3; + v3 = v3 * rule3Scale; + + glm::vec3 vel = vel1[index] + v1 + v2 + v3; + // - Clamp the speed change before putting the new speed in vel2 + vel2[index] = glm::length(vel) > maxSpeed ? glm::normalize(vel) * maxSpeed : vel; } /** * Step the entire N-body simulation by `dt` seconds. */ void Boids::stepSimulationNaive(float dt) { - // TODO-1.2 - use the kernels you wrote to step the simulation forward in time. + dim3 fullBlocksPerGrid((numObjects + blockSize - 1) / blockSize); + // TODO-1.2 - use the kernels you wrote to step the simulation forward in time + kernUpdateVelocityBruteForce << > >(numObjects, dev_pos, dev_vel1, dev_vel2); + checkCUDAErrorWithLine("kernUpdateVelocityBruteForce failed!"); + kernUpdatePos << > > (numObjects, dt, dev_pos, dev_vel2); + checkCUDAErrorWithLine("kernUpdatePos failed!"); // TODO-1.2 ping-pong the velocity buffers + std::swap(dev_vel1, dev_vel2); } void Boids::stepSimulationScatteredGrid(float dt) { // TODO-2.1 // Uniform Grid Neighbor search using Thrust sort. 
// In Parallel: + int N = numObjects; + dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); + dim3 fullBlocksPerGridCell((gridCellCount + blockSize - 1) / blockSize); // - label each particle with its array index as well as its grid index. + kernComputeIndices << > > (N, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices); + checkCUDAErrorWithLine("kernComputeIndices failed!"); // Use 2x width grids. // - Unstable key sort using Thrust. A stable sort isn't necessary, but you // are welcome to do a performance comparison. + thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + N, dev_thrust_particleArrayIndices); // - Naively unroll the loop for finding the start and end indices of each // cell's data pointers in the array of boid indices + kernResetIntBuffer << > > (gridCellCount, dev_gridCellStartIndices, -1); + kernResetIntBuffer << > > (gridCellCount, dev_gridCellEndIndices, -1); + kernIdentifyCellStartEnd << > > (N, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices); + checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!"); // - Perform velocity updates using neighbor search + kernUpdateVelNeighborSearchScattered << > > (N, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_particleArrayIndices, dev_pos, dev_vel1, dev_vel2); + checkCUDAErrorWithLine("kernUpdateVelNeighborSearchScattered failed!"); // - Update positions + kernUpdatePos << > > (N, dt, dev_pos, dev_vel2); + checkCUDAErrorWithLine("kernUpdatePos failed!"); // - Ping-pong buffers as needed + std::swap(dev_vel1, dev_vel2); } void Boids::stepSimulationCoherentGrid(float dt) { // TODO-2.3 - start by copying Boids::stepSimulationNaiveGrid // Uniform Grid Neighbor search using Thrust sort on cell-coherent data. 
// In Parallel: + int N = numObjects; + dim3 fullBlocksPerGrid((N + blockSize - 1) / blockSize); + dim3 fullBlocksPerGridCell((gridCellCount + blockSize - 1) / blockSize); // - Label each particle with its array index as well as its grid index. + kernComputeIndices << > > (numObjects, gridSideCount, gridMinimum, gridInverseCellWidth, dev_pos, dev_particleArrayIndices, dev_particleGridIndices); + checkCUDAErrorWithLine("kernComputeIndices failed!"); // Use 2x width grids // - Unstable key sort using Thrust. A stable sort isn't necessary, but you // are welcome to do a performance comparison. + thrust::sort_by_key(dev_thrust_particleGridIndices, dev_thrust_particleGridIndices + N, dev_thrust_particleArrayIndices); // - Naively unroll the loop for finding the start and end indices of each // cell's data pointers in the array of boid indices + kernResetIntBuffer << > > (gridCellCount, dev_gridCellStartIndices, -1); + checkCUDAErrorWithLine("kernResetIntBuffer failed!"); + kernResetIntBuffer << > > (gridCellCount, dev_gridCellEndIndices, -1); + checkCUDAErrorWithLine("kernResetIntBuffer failed!"); + kernIdentifyCellStartEnd << > > (N, dev_particleGridIndices, dev_gridCellStartIndices, dev_gridCellEndIndices); + checkCUDAErrorWithLine("kernIdentifyCellStartEnd failed!"); // - BIG DIFFERENCE: use the rearranged array index buffer to reshuffle all // the particle data in the simulation array. 
// CONSIDER WHAT ADDITIONAL BUFFERS YOU NEED + kernSwitchBuffers << > > (N, dev_pos, dev_vel1, dev_particleArrayIndices, dev_pos_coherent, dev_vel2); + checkCUDAErrorWithLine("kernSwitchBuffers failed!"); // - Perform velocity updates using neighbor search - // - Update positions + kernUpdateVelNeighborSearchCoherent << > > (N, gridSideCount, gridMinimum, gridInverseCellWidth, gridCellWidth, dev_gridCellStartIndices, dev_gridCellEndIndices, dev_pos_coherent, dev_vel2, dev_vel1); + checkCUDAErrorWithLine("kernUpdateVelNeighborSearchCoherent failed!"); + // - Update positions + kernUpdatePos << > > (N, dt, dev_pos_coherent, dev_vel1); + checkCUDAErrorWithLine("kernUpdatePos failed!"); // - Ping-pong buffers as needed. THIS MAY BE DIFFERENT FROM BEFORE. + //std::swap(dev_pos, dev_pos_coherent); } void Boids::endSimulation() { @@ -390,6 +639,11 @@ void Boids::endSimulation() { cudaFree(dev_pos); // TODO-2.1 TODO-2.3 - Free any additional buffers here. + cudaFree(dev_pos_coherent); + cudaFree(dev_gridCellStartIndices); + cudaFree(dev_gridCellEndIndices); + cudaFree(dev_particleArrayIndices); + cudaFree(dev_particleGridIndices); } void Boids::unitTest() { @@ -403,7 +657,7 @@ void Boids::unitTest() { int *intKeys = new int[N]; int *intValues = new int[N]; - intKeys[0] = 0; intValues[0] = 0; + intKeys[0] = 0; intValues[0] = 5; intKeys[1] = 1; intValues[1] = 1; intKeys[2] = 0; intValues[2] = 2; intKeys[3] = 3; intValues[3] = 3; @@ -449,11 +703,39 @@ void Boids::unitTest() { std::cout << " value: " << intValues[i] << std::endl; } + /***********************************************/ + int *dev_startGridCell; + int *dev_endGridCell; + + int *intStart = new int[N]; + int *intEnd = new int[N]; + cudaMalloc((void**)&dev_startGridCell, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_startGridCell failed!"); + cudaMalloc((void**)&dev_endGridCell, N * sizeof(int)); + checkCUDAErrorWithLine("cudaMalloc dev_endGridCell failed!"); + + kernResetIntBuffer << < 1, N >> >(N, 
dev_startGridCell, -1); + kernResetIntBuffer << < 1, N >> >(N, dev_endGridCell, -1); + kernIdentifyCellStartEnd<<< 1, N >>>(N, dev_intKeys, dev_startGridCell, dev_endGridCell); + // How to copy data back to the CPU side from the GPU + cudaMemcpy(intStart, dev_startGridCell, sizeof(int) * N, cudaMemcpyDeviceToHost); + cudaMemcpy(intEnd, dev_endGridCell, sizeof(int) * N, cudaMemcpyDeviceToHost); + checkCUDAErrorWithLine("memcpy back failed!"); + + for (int i = 0; i < N; ++i) { + std::cout << " Start: " << intStart[i]; + std::cout << " End: " << intEnd[i] << std::endl; + } + /***********************************************/ // cleanup - delete(intKeys); - delete(intValues); + delete [] intKeys; + delete [] intValues; + delete[] intStart; + delete[] intEnd; cudaFree(dev_intKeys); cudaFree(dev_intValues); + cudaFree(dev_startGridCell); + cudaFree(dev_endGridCell); checkCUDAErrorWithLine("cudaFree failed!"); return; } diff --git a/src/main.cpp b/src/main.cpp index e416836..fa1bb04 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -14,11 +14,11 @@ // LOOK-2.1 LOOK-2.3 - toggles for UNIFORM_GRID and COHERENT_GRID #define VISUALIZE 1 -#define UNIFORM_GRID 0 +#define UNIFORM_GRID 1 #define COHERENT_GRID 0 // LOOK-1.2 - change this to adjust particle count in the simulation -const int N_FOR_VIS = 5000; +const int N_FOR_VIS = 20000; const float DT = 0.2f; /** @@ -250,7 +250,10 @@ void initShaders(GLuint * program) { glUseProgram(program[PROG_BOID]); glBindVertexArray(boidVAO); glPointSize(pointSize); - glDrawElements(GL_POINTS, N_FOR_VIS + 1, GL_UNSIGNED_INT, 0); + //glDrawElementsInstanced(GL_POINTS, 1, GL_UNSIGNED_INT, 0, N_FOR_VIS + 1); + glDrawElements(GL_POINTS, N_FOR_VIS + 1, GL_UNSIGNED_INT, 0); + //for (int i = 0; i < N_FOR_VIS / 2; ++i) + //glDrawArraysInstanced(GL_POINTS, 0, 4, N_FOR_VIS + 1); glPointSize(1.0f); glUseProgram(0);