diff --git a/README.md b/README.md
index 0e38ddb..20fa8c5 100644
--- a/README.md
+++ b/README.md
@@ -1,14 +1,215 @@
-CUDA Stream Compaction
+CUDA Scan and Stream Compaction
======================
**University of Pennsylvania, CIS 565: GPU Programming and Architecture, Project 2**
-* (TODO) YOUR NAME HERE
- * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Linda Zhu
+* Tested on: Windows 11, i7-12800H @ 2.40GHz 16GB, NVIDIA GeForce RTX 3070 Ti (Personal Laptop)
-### (TODO: Your README)
+## Overview
+In this project, I implemented several versions of the Scan (Prefix Sum) algorithm in CUDA from scratch (see the list below), and used the CPU scan and GPU work-efficient scan to implement GPU stream compaction that removes 0s from an array of ints. This project lays the foundation for culling early-terminated paths from an array of rays in a path tracer. It also helps reorient your algorithmic thinking to the way the GPU works: on GPUs, many algorithms can benefit from massive parallelism and, in particular, data parallelism -- executing the same code many times simultaneously on different data.
-Include analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+
+Featuring:
+ * CPU scan/compact (for GPU comparison)
+ * GPU naïve parallel scan/compact
+ * GPU work-efficient scan/compact
+ * Thrust scan (for GPU comparison)
+ * GPU work-efficient scan optimization (extra credit)
+## Descriptions
+
+### CPU Scan
+Use a for loop to compute an exclusive prefix sum.
+
+![CPU scan](img/cpu_scan.png)
+
+Number of adds: O(n)
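+
+A minimal sketch of this loop, mirroring the implementation in `stream_compaction/cpu.cu` (the function name here is illustrative; the real one is `StreamCompaction::CPU::scan`):
+
+```
+// Exclusive prefix sum: odata[i] = idata[0] + ... + idata[i-1]
+void cpuScan(int n, int* odata, const int* idata) {
+    int sum = 0;
+    for (int i = 0; i < n; ++i) {
+        odata[i] = sum;     // write the running total *before* adding idata[i]
+        sum += idata[i];
+    }
+}
+```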
+
+### Naive Parallel Scan
+Use double buffering (ping-pong) between two arrays, so each pass reads one buffer and writes the other. The kernel passes produce an inclusive scan; shifting the result right by one and inserting the identity (0) at the front yields the exclusive scan.
+
+![Naive parallel scan](img/naive_gpu_scan.png)
+
+Naïve parallel scan needs O(log2(n)) passes, for O(n log2(n)) adds in total. Each pass is O(n) adds, though in practice fewer, e.g. because of early warp retirement.
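+
+One pass of the naive scan, following the kernel in `stream_compaction/naive.cu`; the host launches it log2(n) times, doubling `offset` and ping-ponging the two buffers between passes:
+
+```
+// One naive-scan pass: every element at index >= offset adds in the value
+// `offset` positions to its left; elements below `offset` are copied unchanged.
+__global__ void kernNaiveScan(int n, int offset, int* odata, const int* idata) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= n) return;
+    odata[index] = (index >= offset) ? idata[index - offset] + idata[index]
+                                     : idata[index];
+}
+```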
+
+### Work-Efficient Parallel Scan
+
+#### Step 1. Up-Sweep (Reduction) Phase:
+We first traverse the tree from the leaves to the root, computing partial sums at the internal nodes of the tree.
+
+![Up-sweep (reduction)](img/upsweep.png)
+
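+A sketch of one up-sweep level, matching `kernUpSweep` in `stream_compaction/efficient.cu`; the host launches it for d = 0 .. log2(n) - 1:
+
+```
+// Level d of the up-sweep: nodes 2^(d+1) apart accumulate their left subtree's sum.
+__global__ void kernUpSweep(int n, int d, int* data) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= n) return;
+    int stride = 1 << (d + 1);          // distance between updated nodes
+    if (index % stride == 0) {
+        data[index + stride - 1] += data[index + (stride >> 1) - 1];
+    }
+}
+```
+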
+#### Step 2. Down-Sweep Phase:
+Then we traverse back down the tree from the root, using the partial sums from the reduce phase to build the scan in place on the array. We start by inserting zero at the root of the tree, and on each step, each node at the current level passes its own value to its left child, and the sum of its value and the former value of its left child to its right child.
+![Down-sweep](img/downsweep.png)
+
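+One down-sweep level, matching `kernDownSweep` in `efficient.cu`; the host first zeroes the root (`data[n - 1] = 0`) and then launches it for d = log2(n) - 1 .. 0:
+
+```
+// Level d of the down-sweep: each node passes its value to its left child and
+// (its value + the old left-child value) to its right child.
+__global__ void kernDownSweep(int n, int d, int* data) {
+    int index = blockIdx.x * blockDim.x + threadIdx.x;
+    if (index >= n) return;
+    int half   = 1 << d;                // offset of the left child
+    int stride = 1 << (d + 1);          // offset of the right child / node
+    if (index % stride == 0) {
+        int left = data[index + half - 1];                  // save left child
+        data[index + half - 1] = data[index + stride - 1];  // left child <- node value
+        data[index + stride - 1] += left;                   // right child <- node + old left
+    }
+}
+```
+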
+#### Step 3. The down-sweep leaves an exclusive scan in the array, which we copy back as the final result.
+
+Up-Sweep: Really n – 1 adds
+
+Down-Sweep: Really n - 1 adds and n - 1 swaps
+
+
+### Thrust's Implementation
+
+We simply wrap a call to the Thrust library function thrust::exclusive_scan(first, last, result).
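+
+Roughly what the wrapper in `stream_compaction/thrust.cu` does (variable names here are illustrative):
+
+```
+#include <thrust/device_vector.h>
+#include <thrust/scan.h>
+#include <thrust/copy.h>
+
+// Copy the input to the device, run the exclusive scan, and copy the result back.
+thrust::device_vector<int> dv_in(idata, idata + n);
+thrust::device_vector<int> dv_out(n);
+thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+thrust::copy(dv_out.begin(), dv_out.end(), odata);
+```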
+
+### Stream Compaction
+
+![Stream compaction](img/stream_compaction.png)
+The goal of stream compaction is, given an array of elements, to create a new array containing only the elements that meet a certain criterion (e.g. non-null), while preserving their order. It's used in path tracing, collision detection, sparse matrix compression, etc. The three steps are listed below, with a serial sketch after the list.
+
+* Step 1: Compute temporary array
+* Step 2: Run exclusive scan on temporary array
+* Step 3: Scatter
+
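+A serial sketch of these three steps; the GPU version does the same work with `kernMapToBoolean`, the work-efficient scan, and `kernScatter`:
+
+```
+#include <vector>
+
+// Returns the number of elements kept; odata receives the compacted elements.
+int compactWithScan(int n, int* odata, const int* idata) {
+    std::vector<int> bools(n), indices(n);
+    for (int i = 0; i < n; ++i) bools[i] = (idata[i] != 0);            // Step 1: map
+    int sum = 0;
+    for (int i = 0; i < n; ++i) { indices[i] = sum; sum += bools[i]; } // Step 2: exclusive scan
+    for (int i = 0; i < n; ++i) {
+        if (bools[i]) odata[indices[i]] = idata[i];                    // Step 3: scatter
+    }
+    return sum;
+}
+```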
+
+## Results
+Test case configuration: block size = 512, array size = 2^25
+
+I added 2 extra test cases for my optimized GPU work-efficient scan to the output, labeled "optimized work-efficient scan ...".
+
+```
+****************
+** SCAN TESTS **
+****************
+ [ 39 25 44 23 36 32 15 35 41 19 6 20 21 ... 8 0 ]
+==== cpu scan, power-of-two ====
+ elapsed time: 17.503ms (std::chrono Measured)
+ [ 0 39 64 108 131 167 199 214 249 290 309 315 335 ... 821771882 821771890 ]
+==== cpu scan, non-power-of-two ====
+ elapsed time: 20.5698ms (std::chrono Measured)
+ [ 0 39 64 108 131 167 199 214 249 290 309 315 335 ... 821771783 821771828 ]
+ passed
+==== naive scan, power-of-two ====
+ elapsed time: 22.0729ms (CUDA Measured)
+ passed
+==== naive scan, non-power-of-two ====
+ elapsed time: 22.8742ms (CUDA Measured)
+ passed
+==== work-efficient scan, power-of-two ====
+ elapsed time: 21.3687ms (CUDA Measured)
+ passed
+==== work-efficient scan, non-power-of-two ====
+ elapsed time: 17.9821ms (CUDA Measured)
+ passed
+==== optimized work-efficient scan, power-of-two ====
+ elapsed time: 13.8136ms (CUDA Measured)
+ passed
+==== optimized work-efficient scan, non-power-of-two ====
+ elapsed time: 13.4661ms (CUDA Measured)
+ passed
+==== thrust scan, power-of-two ====
+ elapsed time: 1.61328ms (CUDA Measured)
+ passed
+==== thrust scan, non-power-of-two ====
+ elapsed time: 1.93814ms (CUDA Measured)
+ passed
+
+
+*****************************
+** STREAM COMPACTION TESTS **
+*****************************
+ [ 0 1 1 0 1 1 0 0 3 1 0 0 0 ... 3 0 ]
+==== cpu compact without scan, power-of-two ====
+ elapsed time: 68.2749ms (std::chrono Measured)
+ [ 1 1 1 1 3 1 2 3 2 3 2 2 1 ... 1 3 ]
+ passed
+==== cpu compact without scan, non-power-of-two ====
+ elapsed time: 68.3939ms (std::chrono Measured)
+ [ 1 1 1 1 3 1 2 3 2 3 2 2 1 ... 3 2 ]
+ passed
+==== cpu compact with scan ====
+ elapsed time: 117.265ms (std::chrono Measured)
+ [ 1 1 1 1 3 1 2 3 2 3 2 2 1 ... 1 3 ]
+ passed
+==== work-efficient compact, power-of-two ====
+ elapsed time: 19.442ms (CUDA Measured)
+ passed
+==== work-efficient compact, non-power-of-two ====
+ elapsed time: 18.6701ms (CUDA Measured)
+ passed
+```
+
+## Performance Analysis
+
+### 1. Preliminary runs of non-optimized CPU vs GPU scan implementations
+Test case configuration: default block size = 128, array sizes ranging from 1024 to 2^28; 3 runs for each array size, plotting their averages.
+
+
+![Runtime vs array size for CPU and GPU efficient scan](img/results/cpu_gpu_efficient_comparison.png)
+*Figure 1: Runtime vs Array Size for CPU and GPU Efficient Scan*
+
+If the GPU scan is implemented just by following the algorithm described above, my "efficient" GPU scan is actually not that efficient -- it is even slower than the CPU approach. From the graph we can see that only when the data size grows beyond roughly 2^22 does the GPU approach start to show a slight advantage over the CPU scan in runtime, and even then the advantage is marginal.
+
+There can be multiple reasons behind this. A major one relates to warp partitioning. In the up-sweep and down-sweep phases of the original GPU work-efficient scan, the way we index the data -- halving/doubling the number of active threads via strided array indexing -- causes warp divergence (Figure 2). Occupancy (active warps / maximum available warps) is high, yet not all threads in each warp are doing useful work. To optimize this, we partition the work over consecutive, increasing `threadIdx` values so that divergent branches are minimized and unused warps retire early. Retired warps can then be scheduled to run something else, resulting in better hardware utilization and latency hiding.
+
+
+![Warp divergence in the original vs reworked reduction](img/results/warp_divergence_diagram.png)
+*Figure 2: Left is the original parallel reduction (up-sweep) algorithm. Right is a reworked algorithm. In the 1st pass, the original method leads to 4 divergent warps while the tweaked implementation has no divergence. Instead, the reworked method retires 2 warps early (warp 2 and warp 3). For easy understanding and illustration purposes, we assume warp size = 2 here.*
+
+Instead of doing the indexing below during up-sweep (similar for down-sweep),
+
+```
+for (int stride = 1; stride < blockDim.x; stride *= 2) {
+    __syncthreads();   // wait for the previous level before reading partial sums
+    if (t % (2 * stride) == 0)
+        partialSum[t] += partialSum[t + stride];
+}
+// stride = 1, 2, 4, ...
+```
+we change the condition that determines which threads do the work, like this:
+```
+for (int stride = blockDim.x / 2; stride > 0; stride /= 2) {
+    __syncthreads();   // wait for the previous level before reading partial sums
+    if (t < stride)
+        partialSum[t] += partialSum[t + stride];
+}
+// stride = ... 4, 2, 1
+```
+
+In addition, we no longer need the modulo operator `%` in the loop condition, an expensive operation that can take up to ~20 instructions (100s of cycles) on the GPU; the less-than comparison is considerably cheaper. **Theoretically speaking**, these index-calculation tweaks should give a significant performance boost over the serial CPU scan. **However, my attempt to optimize the indexing (for the part 5 extra credit) seems unstable: my reworked implementation was at times even slower than the non-optimized one, especially when the data size is below 2^22. Due to time constraints, I couldn't confirm why this happens. Therefore, for the rest of the performance analysis, although I include sample outputs from the optimized GPU efficient scan, I will not put much weight on its numbers because of this unreliability.**
+
+
+### 2. Block size optimization for minimal runtime
+
+Test case configuration: power-of-two array size = **2^25 (33,554,432)**, non-power-of-two array size = (array size - 3); 3 runs for each configuration, plotting their averages.
+
+
+![Runtime vs block size for GPU efficient scan and stream compaction](img/results/gpu_blocksize_comparison.png)
+*Figure 3: Runtime vs Block Size (multiples of 32) for GPU Efficient Scan and Stream Compaction*
+
+For GPU scan, the optimal block size was **128** for the naive, **256** for the unoptimized work-efficient, and **256/512** (512 beats 256 by a tiny margin) for the optimized work-efficient implementation. For GPU stream compaction, the non-optimized work-efficient algorithm also performed best at block size = **256**. It is worth noting that increasing the block size past 256 (512 for the optimized efficient scan) brings no performance gain; in fact, the application starts to slow down considerably. This makes sense: too large a block size lowers the occupancy of the multiprocessor, since increasing the number of threads per block also increases the resources each block consumes, e.g. the number of registers the threads use. These preliminary runs over different block sizes led me to use block size = 256 for the GPU vs. CPU scan comparison below.
+
+
+### 3. All GPU Scan (Naive, Work-Efficient and Thrust) vs serial CPU Scan Comparison
+Test case configuration: block size = 256, array size = [2^17 (131,072), 2^28 (268,435,456)]
+
+
+![Runtime vs array size (power of two) for all scan implementations](img/results/scan_arraysize_comparison.png)
+![Runtime vs array size (non-power of two) for all scan implementations](img/results/scan_arraysize_comparison_nonpow2.png)
+*Figure 4 & 5: Runtime vs Array Size in both power of 2 and non-power of 2 for all the Scan algorithms*
+
+As can be seen in Figures 4 & 5, the runtime of each scan implementation increases smoothly while the array size is smaller than **2^20**, then climbs steeply past **2^25**. The Thrust library version was always the best, outperforming the other implementations by up to **30x**. In all test cases the Thrust version keeps its runtime below 10ms, which is reasonable given that it is a well-established GPU library developed for high-performance applications. For a guess at what might be happening inside the Thrust implementation (e.g. allocation, memory copies), see the Nsight timeline analysis below.
+
+The optimized work-efficient parallel scan is the next best version. It is sometimes slower than the CPU (for small data sizes, as mentioned above) but always faster than the naive parallel version in the long run. My guess for why it can be slower than the CPU version is that it only uses global memory, the type of GPU memory with the highest latency; compared with the CPU version's cache coherency, this is a huge bottleneck. Another problem with both the optimized and the original work-efficient scan is that the number of launched threads stays constant across iterations: during both the up-sweep and down-sweep phases, potentially thousands of blocks are launched while only one or a few threads are actually active. A more optimized version of the algorithm would (1) dynamically shrink/expand the launch size so that idle threads are terminated at each level of the scan (see the sketch below), and (2) switch to shared memory for faster data access.
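+
+For (1), the host loop could size the launch to the number of active nodes at each level instead of always launching `chunk` threads. A sketch for the up-sweep, reusing `kernUpSweepOptimized` from `efficient.cu` (this is not what the submitted code does; it still launches a full-size grid at every level):
+
+```
+// Up-sweep with a shrinking grid: at level d only (chunk >> (d + 1)) nodes are active.
+for (int d = 0, active = chunk >> 1; d < power2; ++d, active >>= 1) {
+    dim3 grid((active + blockSize - 1) / blockSize);
+    kernUpSweepOptimized<<<grid, blockSize>>>(active, d, active, dev_buf);
+}
+```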
+
+The next best version was the CPU scan. While this is initially surprising, the CPU version has a couple of advantages over the GPU versions. First, the logic for computing each output element is very simple, and the serial nature of the algorithm makes it very cache coherent. It also requires no additional memory allocation (CPU or GPU), which all the GPU parallel versions do.
+
+Finally, the naive parallel scan is the worst. For the 33-million-element array, this algorithm is about 50% slower than the optimized efficient parallel scan and roughly 30x slower than the Thrust library implementation.
+
+### 4. A Sneak Peek at the Thrust Implementation
+
+
+![Nsight timeline of the Thrust exclusive scan](img/results/nsight_timeline.png)
+*Figure 6: Nsight timeline of the Thrust exclusive_scan operation*
+
+I checked the Nsight timeline within Nsight Compute 2023.2.2 using the test case with array size 2^25. The thrust::exclusive_scan call takes about half the time of each of my sweep kernel calls. In the Thrust implementation, most of the expense is on device memory allocation and memory copies, judging by the time spans of the `cudaMemcpyAsync` and `cudaMemcpy` executions. My guess is that the fixed cost of memory operations in Thrust is fairly large, but as the array size grows the memory-operation cost does not grow much, perhaps thanks to memory-access optimizations such as contiguous access. As a result, for large arrays the Thrust implementation has the best performance.
+
+
+## References
+* UPenn CIS5650 [Slides on Parallel Algorithms](https://docs.google.com/presentation/d/1ETVONA7QDM-WqsEj4qVOGD6Kura5I6E9yqH-7krnwZ0/edit#slide=id.p126)
+ for Scan, Stream Compaction, and Work-Efficient Parallel Scan.
+* GPU Gems 3, Chapter 39 - [Parallel Prefix Sum (Scan) with CUDA](https://developer.nvidia.com/gpugems/GPUGems3/gpugems3_ch39.html).
+ - This online version contains a few small errors (in superscripting, missing braces, bad indentation, etc.)
+ - We maintain a fix for this at [GPU Gem 3 Ch 39 Patch](https://github.com/CIS565-Fall-2017/Project2-Stream-Compaction/blob/master/INSTRUCTION.md#gpu-gem-3-ch-39-patch). If you find more errors in the chapter, welcome to open new pull requests to contribute.
+* UPenn CIS5650 Project 2 [Recitation slides](https://docs.google.com/presentation/d/1daOnWHOjMp1sIqMdVsNnvEU1UYynKcEMARc_W6bGnqE/edit?usp=sharing)
+* [Nsight Compute Tools Documentation](https://docs.nvidia.com/nsight-compute/NsightCompute/index.html)
diff --git a/img/cpu_scan.png b/img/cpu_scan.png
new file mode 100644
index 0000000..5128ff6
Binary files /dev/null and b/img/cpu_scan.png differ
diff --git a/img/downsweep.png b/img/downsweep.png
new file mode 100644
index 0000000..b683da1
Binary files /dev/null and b/img/downsweep.png differ
diff --git a/img/naive_gpu_scan.png b/img/naive_gpu_scan.png
new file mode 100644
index 0000000..2c0e7f8
Binary files /dev/null and b/img/naive_gpu_scan.png differ
diff --git a/img/results/cpu_gpu_efficient_comparison.png b/img/results/cpu_gpu_efficient_comparison.png
new file mode 100644
index 0000000..d5957c8
Binary files /dev/null and b/img/results/cpu_gpu_efficient_comparison.png differ
diff --git a/img/results/gpu_blocksize_comparison.png b/img/results/gpu_blocksize_comparison.png
new file mode 100644
index 0000000..78f3bcd
Binary files /dev/null and b/img/results/gpu_blocksize_comparison.png differ
diff --git a/img/results/nsight_timeline.png b/img/results/nsight_timeline.png
new file mode 100644
index 0000000..e610756
Binary files /dev/null and b/img/results/nsight_timeline.png differ
diff --git a/img/results/scan_arraysize_comparison.png b/img/results/scan_arraysize_comparison.png
new file mode 100644
index 0000000..f91cb46
Binary files /dev/null and b/img/results/scan_arraysize_comparison.png differ
diff --git a/img/results/scan_arraysize_comparison_nonpow2.png b/img/results/scan_arraysize_comparison_nonpow2.png
new file mode 100644
index 0000000..ddc3fca
Binary files /dev/null and b/img/results/scan_arraysize_comparison_nonpow2.png differ
diff --git a/img/results/warp_divergence_diagram.png b/img/results/warp_divergence_diagram.png
new file mode 100644
index 0000000..c539538
Binary files /dev/null and b/img/results/warp_divergence_diagram.png differ
diff --git a/img/stream_compaction.png b/img/stream_compaction.png
new file mode 100644
index 0000000..0dfa4d2
Binary files /dev/null and b/img/stream_compaction.png differ
diff --git a/img/upsweep.png b/img/upsweep.png
new file mode 100644
index 0000000..845d34c
Binary files /dev/null and b/img/upsweep.png differ
diff --git a/src/main.cpp b/src/main.cpp
index 896ac2b..aa794f5 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -13,7 +13,10 @@
#include
#include "testing_helpers.hpp"
-const int SIZE = 1 << 8; // feel free to change the size of array
+const int SIZE = 1 << 25; // feel free to change the size of array
+                         // 2^28 = 268435456
+//const int SIZE = 1000000;
+
const int NPOT = SIZE - 3; // Non-Power-Of-Two
int *a = new int[SIZE];
int *b = new int[SIZE];
@@ -76,10 +79,26 @@ int main(int argc, char* argv[]) {
zeroArray(SIZE, c);
printDesc("work-efficient scan, non-power-of-two");
- StreamCompaction::Efficient::scan(NPOT, c, a);
+ StreamCompaction::Efficient::scan(SIZE, c, a);
+ printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ //printArray(NPOT, c, true);
+ printCmpResult(NPOT, b, c);
+
+#if OPTIMIZE
+ zeroArray(SIZE, c);
+ printDesc("optimized work-efficient scan, power-of-two");
+ StreamCompaction::Efficient::scanOptimized(SIZE, c, a);
+ printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
+ //printArray(SIZE, c, true);
+ printCmpResult(SIZE, b, c);
+
+ zeroArray(SIZE, c);
+ printDesc("optimized work-efficient scan, non-power-of-two");
+    StreamCompaction::Efficient::scanOptimized(NPOT, c, a);
printElapsedTime(StreamCompaction::Efficient::timer().getGpuElapsedTimeForPreviousOperation(), "(CUDA Measured)");
//printArray(NPOT, c, true);
printCmpResult(NPOT, b, c);
+#endif
zeroArray(SIZE, c);
printDesc("thrust scan, power-of-two");
diff --git a/stream_compaction/common.cu b/stream_compaction/common.cu
index 2ed6d63..25d24a7 100644
--- a/stream_compaction/common.cu
+++ b/stream_compaction/common.cu
@@ -24,6 +24,12 @@ namespace StreamCompaction {
*/
__global__ void kernMapToBoolean(int n, int *bools, const int *idata) {
// TODO
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (index >= n) {
+ return;
+ }
+
+ bools[index] = (int)(idata[index] != 0);
}
/**
@@ -33,6 +39,14 @@ namespace StreamCompaction {
__global__ void kernScatter(int n, int *odata,
const int *idata, const int *bools, const int *indices) {
// TODO
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (index >= n) {
+ return;
+ }
+
+ if (bools[index]) {
+ odata[indices[index]] = idata[index];
+ }
}
}
diff --git a/stream_compaction/common.h b/stream_compaction/common.h
index d2c1fed..a3da057 100644
--- a/stream_compaction/common.h
+++ b/stream_compaction/common.h
@@ -13,6 +13,9 @@
#define FILENAME (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__)
#define checkCUDAError(msg) checkCUDAErrorFn(msg, FILENAME, __LINE__)
+#define blockSize 256
+#define OPTIMIZE 1
+
/**
* Check for CUDA errors; print and exit if there was a problem.
*/
diff --git a/stream_compaction/cpu.cu b/stream_compaction/cpu.cu
index 719fa11..85249a1 100644
--- a/stream_compaction/cpu.cu
+++ b/stream_compaction/cpu.cu
@@ -19,20 +19,33 @@ namespace StreamCompaction {
*/
void scan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
- // TODO
+ int sum = 0;
+ for (int i = 0; i < n; ++i) {
+ odata[i] = sum;
+ int val = idata[i];
+ sum += val;
+ }
timer().endCpuTimer();
}
/**
* CPU stream compaction without using the scan function.
- *
+ * goal: closely and neatly packed the elements != 0
+ * Simply loop through the input array, meanwhile maintain a pointer indicating which address shall we put the next non-zero element
+ *
* @returns the number of elements remaining after compaction.
*/
int compactWithoutScan(int n, int *odata, const int *idata) {
timer().startCpuTimer();
- // TODO
+ int count = 0;
+ for (int i = 0; i < n; ++i) {
+ int val = idata[i];
+ if (val != 0) {
+ odata[count++] = val;
+ }
+ }
timer().endCpuTimer();
- return -1;
+ return count;
}
/**
@@ -41,10 +54,40 @@ namespace StreamCompaction {
* @returns the number of elements remaining after compaction.
*/
int compactWithScan(int n, int *odata, const int *idata) {
+ int* temp = new int[n];
+
timer().startCpuTimer();
- // TODO
+ // Step 1: map
+ // map original data array (integer, Light Ray, etc) to a bool array
+ for (int i = 0; i < n; ++i) {
+ odata[i] = idata[i] ? 1 : 0;
+ }
+ int last = odata[n - 1];
+
+ // Step 2: scan
+ //scan(n, odata, odata);
+ int sum = 0;
+ for (int i = 0; i < n; ++i) {
+ temp[i] = sum;
+ int val = odata[i];
+ sum += val;
+ }
+ int count = last + temp[n - 1];
+
+ // Step 3: scatter
+ // preserve non-zero elements and compact them into a new array
+ // for each element input[i] in original array
+ // if it's non-zero (given by mapped array)
+ // then put it at output[index], where index = scanned[i]
+ for (int i = 0; i < n; ++i) {
+ if (odata[i]) {
+ odata[temp[i]] = idata[i];
+ }
+ }
timer().endCpuTimer();
- return -1;
+
+ delete[] (temp);
+ return count;
}
}
}
diff --git a/stream_compaction/efficient.cu b/stream_compaction/efficient.cu
index 2db346e..b5f2f23 100644
--- a/stream_compaction/efficient.cu
+++ b/stream_compaction/efficient.cu
@@ -12,13 +12,154 @@ namespace StreamCompaction {
return timer;
}
+ // Performs ONE iteration of up sweep
+ __global__ void kernUpSweep(int n, int d, int* data) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (index >= n) {
+ return;
+ }
+
+ int offset1 = 1 << d;
+ int offset2 = 1 << (1 + d);
+ if (index % offset2 == 0) {
+ data[index + offset2 - 1] +=
+ data[index + offset1 - 1];
+ }
+ }
+
+ __global__ void kernUpSweepOptimized(int n, int d, int stride, int* data) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (index >= n) {
+ return;
+ }
+
+ int offset = 1 << d;
+ if (index < stride) {
+ data[(index * 2 + 2) * offset - 1] +=
+ data[(index * 2 + 1) * offset - 1];
+ }
+ }
+
+ // Performs ONE iteration of down sweep
+ __global__ void kernDownSweep(int n, int d, int *data) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (index >= n) {
+ return;
+ }
+
+ int offset1 = 1 << d;
+ int offset2 = 1 << (1 + d);
+ if (index % offset2 == 0) {
+ int left = data[index + offset1 - 1]; // Save left child
+ data[index + offset1 - 1] = data[index + offset2 - 1]; // Set left child to this node’s value
+ data[index + offset2 - 1] += left; // Set right child to old left value +
+ // this node’s value
+ }
+ }
+
+ __global__ void kernDownSweepOptimized(int n, int d, int stride, int* data) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (index >= n) {
+ return;
+ }
+
+ int offset1 = (index * 2 + 1) * stride - 1;
+ int offset2 = (index * 2 + 2) * stride - 1;
+ if (index < d) {
+ int left = data[offset1]; // Save left child
+ data[offset1] = data[offset2]; // Set left child to this node’s value
+ data[offset2] += left; // Set right child to old left value +
+ // this node’s value
+ }
+ }
+
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
+ int* dev_buf;
+
+ int power2 = ilog2ceil(n);
+ int chunk = 1 << power2;
+
+ dim3 blocksPerGrid((chunk + blockSize - 1) / blockSize);
+ size_t arrSize = n * sizeof(int);
+
+ cudaMalloc((void**)&dev_buf, chunk * sizeof(int));
+ //checkCUDAError("cudaMalloc dev_buf failed!");
+
+ cudaMemcpy(dev_buf, idata, arrSize, cudaMemcpyHostToDevice);
+ //checkCUDAError("cudaMemcpy idata to dev_buf failed!");
+ if (chunk > n) {
+ cudaMemset(dev_buf + n, 0, (chunk - n) * sizeof(int));
+ //checkCUDAError("cudaMemset dev_buf[n] failed!");
+ }
+
+ timer().startGpuTimer();
+ // Up Sweep
+ for (int d = 0; d <= power2 - 1; ++d) {
+                kernUpSweep <<<blocksPerGrid, blockSize>>> (chunk, d, dev_buf);
+ //checkCUDAError("kernUpSweep failed!");
+ }
+
+ // Down Sweep
+ cudaDeviceSynchronize();
+ cudaMemset(dev_buf + chunk - 1, 0, sizeof(int)); // set root to zero
+ //checkCUDAError("cudaMemset dev_buf[chunk-1] failed!");
+
+ for (int d = power2-1; d >= 0; --d) {
+                kernDownSweep <<<blocksPerGrid, blockSize>>> (chunk, d, dev_buf);
+ //checkCUDAError("kernDownSweep failed!");
+ }
+ timer().endGpuTimer();
+
+ cudaMemcpy(odata, dev_buf, arrSize, cudaMemcpyDeviceToHost);
+ //checkCUDAError("cudaMemcpy odata failed!");
+ cudaFree(dev_buf);
+ }
+
+ void scanOptimized(int n, int* odata, const int* idata) {
+ int* dev_buf;
+
+ int power2 = ilog2ceil(n);
+ int chunk = 1 << power2;
+
+ dim3 blocksPerGrid((chunk + blockSize - 1) / blockSize);
+ size_t arrSize = n * sizeof(int);
+
+ cudaMalloc((void**)&dev_buf, chunk * sizeof(int));
+ //checkCUDAError("cudaMalloc dev_buf failed!");
+
+ cudaMemcpy(dev_buf, idata, arrSize, cudaMemcpyHostToDevice);
+ //checkCUDAError("cudaMemcpy idata to dev_buf failed!");
+ if (chunk > n) {
+ cudaMemset(dev_buf + n, 0, (chunk - n) * sizeof(int));
+ //checkCUDAError("cudaMemset dev_buf[n] failed!");
+ }
+
timer().startGpuTimer();
- // TODO
+ // Up Sweep
+ int stride = chunk >> 1;
+ for (int d = 0; d <= power2 - 1; ++d) {
+                kernUpSweepOptimized <<<blocksPerGrid, blockSize>>> (chunk, d, stride, dev_buf);
+ stride >>= 1;
+ }
+
+ // Down Sweep
+ cudaDeviceSynchronize();
+ cudaMemset(dev_buf + chunk - 1, 0, sizeof(int)); // set root to zero
+ //checkCUDAError("cudaMemset dev_buf[chunk-1] failed!");
+
+ stride = chunk >> 1;
+ for (int d = 1; d < chunk; d <<= 1) {
+                kernDownSweepOptimized <<<blocksPerGrid, blockSize>>> (chunk, d, stride, dev_buf);
+ stride >>= 1;
+ }
timer().endGpuTimer();
+
+ cudaMemcpy(odata, dev_buf, arrSize, cudaMemcpyDeviceToHost);
+ //checkCUDAError("cudaMemcpy odata failed!");
+ cudaFree(dev_buf);
}
/**
@@ -30,11 +171,69 @@ namespace StreamCompaction {
* @param idata The array of elements to compact.
* @returns The number of elements remaining after compaction.
*/
- int compact(int n, int *odata, const int *idata) {
+ int compact(int n, int *odata, const int *idata) {
+ int* dev_idata;
+ int* dev_odata;
+ int* dev_bools;
+ int* dev_indices;
+ int count = 0;
+
+ int power2 = ilog2ceil(n);
+ int chunk = 1 << power2;
+
+ dim3 chunkBlocksPerGrid((chunk + blockSize - 1) / blockSize);
+ dim3 fullBlocksPerGrid((n + blockSize - 1) / blockSize);
+
+ size_t arrSize = n * sizeof(int);
+ cudaMalloc((void**)&dev_idata, arrSize);
+ cudaMalloc((void**)&dev_odata, arrSize);
+ cudaMalloc((void**)&dev_bools, arrSize);
+ cudaMalloc((void**)&dev_indices, chunk * sizeof(int));
+
+ // copy original data to GPU
+ cudaMemcpy(dev_idata, idata, arrSize, cudaMemcpyHostToDevice);
+
timer().startGpuTimer();
- // TODO
+ // Step 1: Map
+            Common::kernMapToBoolean <<<fullBlocksPerGrid, blockSize>>> (n, dev_bools, dev_idata);
+
+ // Step 2: Scan
+ // copy bool array to index array for in-place operation, still need original bool array later for scatter
+ cudaMemcpy(dev_indices, dev_bools, arrSize, cudaMemcpyDeviceToDevice);
+ if (chunk > n) {
+ cudaMemset(dev_indices + n, 0, (chunk - n) * sizeof(int));
+ }
+
+ // Up Sweep
+ for (int d = 0; d <= power2 - 1; ++d) {
+                kernUpSweep <<<chunkBlocksPerGrid, blockSize>>> (chunk, d, dev_indices);
+ }
+
+ // set root to zero
+ cudaMemset(dev_indices + chunk - 1, 0, sizeof(int));
+
+ // Down Sweep
+ for (int d = power2 - 1; d >= 0; --d) {
+                kernDownSweep <<<chunkBlocksPerGrid, blockSize>>> (chunk, d, dev_indices);
+ }
+
+ // Step 3: Scatter
+            Common::kernScatter <<<fullBlocksPerGrid, blockSize>>> (n,
+ dev_odata, dev_idata, dev_bools, dev_indices);
timer().endGpuTimer();
- return -1;
+
+ // Copy over last elements from GPU to local to return
+ cudaMemcpy(&count, dev_indices + (n - 1), sizeof(int), cudaMemcpyDeviceToHost);
+ count += (int)(idata[n - 1] != 0);
+ cudaMemcpy(odata, dev_odata, arrSize, cudaMemcpyDeviceToHost);
+
+ // free memory
+ cudaFree(dev_idata);
+ cudaFree(dev_odata);
+ cudaFree(dev_bools);
+ cudaFree(dev_indices);
+
+ return count;
}
}
}
diff --git a/stream_compaction/efficient.h b/stream_compaction/efficient.h
index 803cb4f..bc8bb2c 100644
--- a/stream_compaction/efficient.h
+++ b/stream_compaction/efficient.h
@@ -7,6 +7,7 @@ namespace StreamCompaction {
StreamCompaction::Common::PerformanceTimer& timer();
void scan(int n, int *odata, const int *idata);
+ void scanOptimized(int n, int* odata, const int* idata);
int compact(int n, int *odata, const int *idata);
}
diff --git a/stream_compaction/naive.cu b/stream_compaction/naive.cu
index 4308876..32c7c38 100644
--- a/stream_compaction/naive.cu
+++ b/stream_compaction/naive.cu
@@ -11,15 +11,62 @@ namespace StreamCompaction {
static PerformanceTimer timer;
return timer;
}
- // TODO: __global__
+
+ // Performs one iteration of naive scan of N elements.
+ __global__ void kernNaiveScan(int n, int offset, int *odata, const int *idata) {
+ int index = blockIdx.x * blockDim.x + threadIdx.x;
+ if (index >= n) {
+ return;
+ }
+
+ if (index >= offset) {
+ odata[index] = idata[index - offset] + idata[index];
+ }
+ else {
+ odata[index] = idata[index];
+ }
+ }
/**
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
+ dim3 blocksPerGrid((n + blockSize - 1) / blockSize);
+
+ // copy data to the GPU
+ int* dev_buf1;
+ int* dev_buf2;
+
+ size_t arrSize = n * sizeof(int);
+ cudaMalloc((void**)&dev_buf1, arrSize);
+ checkCUDAError("cudaMalloc dev_buf1 failed!");
+ cudaMalloc((void**)&dev_buf2, arrSize);
+ checkCUDAError("cudaMalloc dev_buf2 failed!");
+
+ cudaMemcpy(dev_buf1, idata, arrSize, cudaMemcpyHostToDevice);
+ checkCUDAError("cudaMemcpy idata to dev_buf1 failed!");
+
timer().startGpuTimer();
- // TODO
+ for (int d = 1; d <= ilog2ceil(n); ++d) {
+ int offset = 1 << (d - 1);
+                kernNaiveScan <<<blocksPerGrid, blockSize>>> (n, offset, dev_buf2, dev_buf1);
+ checkCUDAError("kernNaiveScan failed!");
+
+ // ping-pong buffer to avoid race conditions
+ int* tmp = dev_buf1;
+ dev_buf1 = dev_buf2;
+ dev_buf2 = tmp;
+ }
timer().endGpuTimer();
+
+ // shift inclusive to exclusive scan for compaction
+ odata[0] = 0;
+            cudaMemcpy(odata + 1, dev_buf1, (n - 1) * sizeof(int), cudaMemcpyDeviceToHost);  // copy n-1 elements; odata[0] holds the identity
+ checkCUDAError("cudaMemcpy odata failed!");
+
+ // free memory
+ cudaFree(dev_buf2);
+ cudaFree(dev_buf1);
}
}
}
diff --git a/stream_compaction/thrust.cu b/stream_compaction/thrust.cu
index 1def45e..541bccf 100644
--- a/stream_compaction/thrust.cu
+++ b/stream_compaction/thrust.cu
@@ -18,11 +18,15 @@ namespace StreamCompaction {
* Performs prefix-sum (aka scan) on idata, storing the result into odata.
*/
void scan(int n, int *odata, const int *idata) {
+            thrust::device_vector<int> thrust_dv_idata(idata, idata + n);
+            thrust::device_vector<int> thrust_dv_odata(odata, odata + n);
+
timer().startGpuTimer();
- // TODO use `thrust::exclusive_scan`
- // example: for device_vectors dv_in and dv_out:
- // thrust::exclusive_scan(dv_in.begin(), dv_in.end(), dv_out.begin());
+ thrust::exclusive_scan(thrust_dv_idata.begin(), thrust_dv_idata.end(), thrust_dv_odata.begin());
timer().endGpuTimer();
+
+ int* dv_odata_ptr = thrust::raw_pointer_cast(thrust_dv_odata.data());
+ cudaMemcpy(odata, dv_odata_ptr, n * sizeof(int), cudaMemcpyDeviceToHost);
}
}
}