diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..a4d6d9c
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+.vs/
\ No newline at end of file
diff --git a/README.md b/README.md
index d2fa33d..3db0df2 100644
--- a/README.md
+++ b/README.md
@@ -3,11 +3,32 @@ Project 0 Getting Started
 
 **University of Pennsylvania, CIS 5650: GPU Programming and Architecture, Project 0**
 
-* (TODO) YOUR NAME HERE
-  * (TODO) [LinkedIn](), [personal website](), [twitter](), etc.
-* Tested on: (TODO) Windows 22, i7-2222 @ 2.22GHz 22GB, GTX 222 222MB (Moore 2222 Lab)
+* Hongyi Ding
+  * [LinkedIn](https://www.linkedin.com/in/hongyi-ding/), [personal website](https://johnnyding.com/)
+* Tested on: Windows 11, i7-12700 @ 2.10GHz 32GB, NVIDIA T1000 4GB (SEAS Virtual Lab)
 
-### (TODO: Your README)
+### Screenshots
 
-Include screenshots, analysis, etc. (Remember, this is public, so don't put
-anything here that you don't want to share with the world.)
+**Note:** The vLab PCs restrict user permissions for software installation and for some NVIDIA debugging features. CETS kindly added CMake support at my request; however, Nsight Systems and Nsight Compute still do not work.
+
+#### Test program and Nsight debugger
+
+![2.1.2](images/2.1.2.png)
+
+![2.1.3](images/2.1.3.png)
+
+#### Compatibility of Nsight Systems and Nsight Compute
+
+The tool reports that it cannot communicate with localhost because of insufficient permissions.
+
+![2.1.4](images/2.1.4.png)
+
+The tool reports that it lacks permission to access the NVIDIA GPU Performance Counters.
+
+![2.1.5](images/2.1.5.png)
+
+#### Compatibility Check for WebGL and WebGPU
+
+![2.2](images/2.2.png)
+
+![2.3](images/2.3.png)
\ No newline at end of file
diff --git a/cuda-gl-check/src/main.cpp b/cuda-gl-check/src/main.cpp
index 886fd4c..06ed2d0 100644
--- a/cuda-gl-check/src/main.cpp
+++ b/cuda-gl-check/src/main.cpp
@@ -11,7 +11,7 @@
  */
 int main(int argc, char* argv[]) {
     // TODO: Change this line to use your name!
-    m_yourName = "TODO: YOUR NAME HERE";
+    m_yourName = "Hongyi Ding";
 
     if (init(argc, argv)) {
         mainLoop();
diff --git a/cuda-introduction/.gitignore b/cuda-introduction/.gitignore
new file mode 100644
index 0000000..92a50c5
--- /dev/null
+++ b/cuda-introduction/.gitignore
@@ -0,0 +1,559 @@
+*.orig
+*.filters
+*.vcxproj
+*.xcodeproj
+build
+
+# Created by https://www.gitignore.io/api/linux,osx,sublimetext,windows,jetbrains,vim,emacs,cmake,c++,cuda,visualstudio,webstorm,eclipse,xcode
+
+### Linux ###
+*~
+
+# KDE directory preferences
+.directory
+
+# Linux trash folder which might appear on any partition or disk
+.Trash-*
+
+
+### OSX ###
+.DS_Store
+.AppleDouble
+.LSOverride
+
+# Icon must end with two \r
+Icon
+
+
+# Thumbnails
+._*
+
+# Files that might appear in the root of a volume
+.DocumentRevisions-V100
+.fseventsd
+.Spotlight-V100
+.TemporaryItems
+.Trashes
+.VolumeIcon.icns
+
+# Directories potentially created on remote AFP share
+.AppleDB
+.AppleDesktop
+Network Trash Folder
+Temporary Items
+.apdisk
+
+
+### SublimeText ###
+# cache files for sublime text
+*.tmlanguage.cache
+*.tmPreferences.cache
+*.stTheme.cache
+
+# workspace files are user-specific
+*.sublime-workspace
+
+# project files should be checked into the repository, unless a significant
+# proportion of contributors will probably not be using SublimeText
+# *.sublime-project
+
+# sftp configuration file
+sftp-config.json
+
+
+### Windows ###
+# Windows image file caches
+Thumbs.db
+ehthumbs.db
+
+# Folder config file
+Desktop.ini
+
+# Recycle Bin used on file shares
+$RECYCLE.BIN/
+
+# Windows Installer files
+*.cab
+*.msi
+*.msm
+*.msp
+
+# Windows shortcuts
+*.lnk
+
+
+### JetBrains ###
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio
+
+*.iml
+
+## Directory-based project format:
+#.idea/
+# if you remove the above rule, at least ignore the following:
+
+# User-specific stuff:
+.idea/workspace.xml
+.idea/tasks.xml
+.idea/dictionaries
+
+# Sensitive or high-churn files:
+.idea/dataSources.ids
+.idea/dataSources.xml
+.idea/sqlDataSources.xml
+.idea/dynamic.xml
+.idea/uiDesigner.xml
+
+# Gradle:
+.idea/gradle.xml
+.idea/libraries
+
+# Mongo Explorer plugin:
+.idea/mongoSettings.xml
+
+## File-based project format:
+*.ipr
+*.iws
+
+## Plugin-specific files:
+
+# IntelliJ
+/out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+
+
+### Vim ###
+[._]*.s[a-w][a-z]
+[._]s[a-w][a-z]
+*.un~
+Session.vim
+.netrwhist
+*~
+
+
+### Emacs ###
+# -*- mode: gitignore; -*-
+*~
+\#*\#
+/.emacs.desktop
+/.emacs.desktop.lock
+*.elc
+auto-save-list
+tramp
+.\#*
+
+# Org-mode
+.org-id-locations
+*_archive
+
+# flymake-mode
+*_flymake.*
+
+# eshell files
+/eshell/history
+/eshell/lastdir
+
+# elpa packages
+/elpa/
+
+# reftex files
+*.rel
+
+# AUCTeX auto folder
+/auto/
+
+# cask packages
+.cask/
+
+
+### CMake ###
+CMakeCache.txt
+CMakeFiles
+CMakeScripts
+Makefile
+cmake_install.cmake
+install_manifest.txt
+
+
+### C++ ###
+# Compiled Object files
+*.slo
+*.lo
+*.o
+*.obj
+
+# Precompiled Headers
+*.gch
+*.pch
+
+# Compiled Dynamic libraries
+*.so
+*.dylib
+*.dll
+
+# Fortran module files
+*.mod
+
+# Compiled Static libraries
+*.lai
+*.la
+*.a
+*.lib
+
+# Executables
+*.exe
+*.out
+*.app
+
+
+### CUDA ###
+*.i
+*.ii
+*.gpu
+*.ptx
+*.cubin
+*.fatbin
+
+
+### VisualStudio ###
+## Ignore Visual Studio temporary files, build results, and
+## files generated by popular Visual Studio add-ons.
+
+# User-specific files
+*.suo
+*.user
+*.userosscache.docstates
+
+# User-specific files (MonoDevelop/Xamarin Studio)
+*.userprefs
+
+# Build results
+[Dd]ebug/
+[Dd]ebugPublic/
+[Rr]elease/
+[Rr]eleases/
+x64/
+x86/
+build/
+bld/
+[Bb]in/
+[Oo]bj/
+
+# Visual Studio 2015 cache/options directory
+.vs/
+# Uncomment if you have tasks that create the project's static files in wwwroot
+#wwwroot/
+
+# MSTest test Results
+[Tt]est[Rr]esult*/
+[Bb]uild[Ll]og.*
+
+# NUNIT
+*.VisualState.xml
+TestResult.xml
+
+# Build Results of an ATL Project
+[Dd]ebugPS/
+[Rr]eleasePS/
+dlldata.c
+
+# DNX
+project.lock.json
+artifacts/
+
+*_i.c
+*_p.c
+*_i.h
+*.ilk
+*.meta
+*.obj
+*.pch
+*.pdb
+*.pgc
+*.pgd
+*.rsp
+*.sbr
+*.tlb
+*.tli
+*.tlh
+*.tmp
+*.tmp_proj
+*.log
+*.vspscc
+*.vssscc
+.builds
+*.pidb
+*.svclog
+*.scc
+
+# Chutzpah Test files
+_Chutzpah*
+
+# Visual C++ cache files
+ipch/
+*.aps
+*.ncb
+*.opensdf
+*.sdf
+*.cachefile
+
+# Visual Studio profiler
+*.psess
+*.vsp
+*.vspx
+
+# TFS 2012 Local Workspace
+$tf/
+
+# Guidance Automation Toolkit
+*.gpState
+
+# ReSharper is a .NET coding add-in
+_ReSharper*/
+*.[Rr]e[Ss]harper
+*.DotSettings.user
+
+# JustCode is a .NET coding add-in
+.JustCode
+
+# TeamCity is a build add-in
+_TeamCity*
+
+# DotCover is a Code Coverage Tool
+*.dotCover
+
+# NCrunch
+_NCrunch_*
+.*crunch*.local.xml
+nCrunchTemp_*
+
+# MightyMoose
+*.mm.*
+AutoTest.Net/
+
+# Web workbench (sass)
+.sass-cache/
+
+# Installshield output folder
+[Ee]xpress/
+
+# DocProject is a documentation generator add-in
+DocProject/buildhelp/
+DocProject/Help/*.HxT
+DocProject/Help/*.HxC
+DocProject/Help/*.hhc
+DocProject/Help/*.hhk
+DocProject/Help/*.hhp
+DocProject/Help/Html2
+DocProject/Help/html
+
+# Click-Once directory
+publish/
+
+# Publish Web Output
+*.[Pp]ublish.xml
+*.azurePubxml
+# TODO: Comment the next line if you want to checkin your web deploy settings 
+# but database connection strings (with potential passwords) will be unencrypted
+*.pubxml
+*.publishproj
+
+# NuGet Packages
+*.nupkg
+# The packages folder can be ignored because of Package Restore
+**/packages/*
+# except build/, which is used as an MSBuild target.
+!**/packages/build/
+# Uncomment if necessary however generally it will be regenerated when needed
+#!**/packages/repositories.config
+
+# Windows Azure Build Output
+csx/
+*.build.csdef
+
+# Windows Store app package directory
+AppPackages/
+
+# Visual Studio cache files
+# files ending in .cache can be ignored
+*.[Cc]ache
+# but keep track of directories ending in .cache
+!*.[Cc]ache/
+
+# Others
+ClientBin/
+[Ss]tyle[Cc]op.*
+~$*
+*~
+*.dbmdl
+*.dbproj.schemaview
+*.pfx
+*.publishsettings
+node_modules/
+orleans.codegen.cs
+
+# RIA/Silverlight projects
+Generated_Code/
+
+# Backup & report files from converting an old project file
+# to a newer Visual Studio version. Backup files are not needed,
+# because we have git ;-)
+_UpgradeReport_Files/
+Backup*/
+UpgradeLog*.XML
+UpgradeLog*.htm
+
+# SQL Server files
+*.mdf
+*.ldf
+
+# Business Intelligence projects
+*.rdl.data
+*.bim.layout
+*.bim_*.settings
+
+# Microsoft Fakes
+FakesAssemblies/
+
+# Node.js Tools for Visual Studio
+.ntvs_analysis.dat
+
+# Visual Studio 6 build log
+*.plg
+
+# Visual Studio 6 workspace options file
+*.opt
+
+# Visual Studio LightSwitch build output
+**/*.HTMLClient/GeneratedArtifacts
+**/*.DesktopClient/GeneratedArtifacts
+**/*.DesktopClient/ModelManifest.xml
+**/*.Server/GeneratedArtifacts
+**/*.Server/ModelManifest.xml
+_Pvt_Extensions
+
+
+### WebStorm ###
+# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio
+
+*.iml
+
+## Directory-based project format:
+.idea/
+# if you remove the above rule, at least ignore the following:
+
+# User-specific stuff:
+# .idea/workspace.xml
+# .idea/tasks.xml
+# .idea/dictionaries
+
+# Sensitive or high-churn files:
+# .idea/dataSources.ids
+# .idea/dataSources.xml
+# .idea/sqlDataSources.xml
+# .idea/dynamic.xml
+# .idea/uiDesigner.xml
+
+# Gradle:
+# .idea/gradle.xml
+# .idea/libraries
+
+# Mongo Explorer plugin:
+# .idea/mongoSettings.xml
+
+## File-based project format:
+*.ipr
+*.iws
+
+## Plugin-specific files:
+
+# IntelliJ
+/out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+
+
+### Eclipse ###
+*.pydevproject
+.metadata
+.gradle
+bin/
+tmp/
+*.tmp
+*.bak
+*.swp
+*~.nib
+local.properties
+.settings/
+.loadpath
+
+# Eclipse Core
+.project
+
+# External tool builders
+.externalToolBuilders/
+
+# Locally stored "Eclipse launch configurations"
+*.launch
+
+# CDT-specific
+.cproject
+
+# JDT-specific (Eclipse Java Development Tools)
+.classpath
+
+# Java annotation processor (APT)
+.factorypath
+
+# PDT-specific
+.buildpath
+
+# sbteclipse plugin
+.target
+
+# TeXlipse plugin
+.texlipse
+
+
+### Xcode ###
+# Xcode
+#
+# gitignore contributors: remember to update Global/Xcode.gitignore, Objective-C.gitignore & Swift.gitignore
+
+## Build generated
+build/
+DerivedData
+
+## Various settings
+*.pbxuser
+!default.pbxuser
+*.mode1v3
+!default.mode1v3
+*.mode2v3
+!default.mode2v3
+*.perspectivev3
+!default.perspectivev3
+xcuserdata
+
+## Other
+*.xccheckout
+*.moved-aside
+*.xcuserstate
diff --git a/cuda-introduction/source/common.cu b/cuda-introduction/source/common.cu
index dce8793..a4dcdf7 100644
--- a/cuda-introduction/source/common.cu
+++ b/cuda-introduction/source/common.cu
@@ -9,7 +9,7 @@ unsigned divup(unsigned size, unsigned div)
 {
     // TODO: implement a 1 line function to return the divup operation.
     // Note: You only need to use addition, subtraction, and division operations.
-    return 0;
+    return (size + div - 1) / div;
 }
 
 void clearHostAndDeviceArray(float *res, float *dev_res, unsigned size, const int value)
diff --git a/cuda-introduction/source/matmul.cu b/cuda-introduction/source/matmul.cu
index 826e535..d146533 100644
--- a/cuda-introduction/source/matmul.cu
+++ b/cuda-introduction/source/matmul.cu
@@ -12,17 +12,22 @@ __global__ void matrixMultiplicationNaive(float* const matrixP, const float* con
 {
     // TODO 10a: Compute the P matrix global index for each thread along x and y dimentions.
     // Remember that each thread of the kernel computes the result of 1 unique element of P
-    unsigned px;
-    unsigned py;
+    unsigned px = threadIdx.x + blockIdx.x * blockDim.x;
+    unsigned py = threadIdx.y + blockIdx.y * blockDim.y;
 
     // TODO 10b: Check if px or py are out of bounds. If they are, return.
+    if (px < 0 || py < 0 || px >= sizeMX || py >= sizeNY)
+		return;
 
     // TODO 10c: Compute the dot product for the P element in each thread
     // This loop will be the same as the host loop
     float dot = 0.0;
+    for (unsigned k = 0; k < sizeXY; k++) {
+        dot += matrixM[k * sizeMX + px] * matrixN[py * sizeXY + k];
+	}
 
     // TODO 10d: Copy dot to P matrix
-    // matrixP[] = dot;
+    matrixP[py * sizeMX + px] = dot;
 }
 
 int main(int argc, char *argv[])
@@ -31,9 +36,9 @@ int main(int argc, char *argv[])
     // Then try large multiple-block square matrix like 64x64 up to 2048x2048.
     // Then try square, non-power-of-two like 15x15, 33x33, 67x67, 123x123, and 771x771
     // Then try rectangles with powers of two and then non-power-of-two.
-    const unsigned sizeMX = 0;
-    const unsigned sizeXY = 0;
-    const unsigned sizeNY = 0;
+    const unsigned sizeMX = 32;
+    const unsigned sizeXY = 32;
+    const unsigned sizeNY = 32;
 
     // TODO 2: Allocate host 1D arrays for:
     // matrixM[sizeMX, sizeXY]
@@ -44,6 +49,10 @@ int main(int argc, char *argv[])
     float* matrixN;
     float* matrixP;
     float* matrixPGold;
+	matrixM = new float[sizeMX * sizeXY];
+	matrixN = new float[sizeXY * sizeNY];
+	matrixP = new float[sizeMX * sizeNY];
+	matrixPGold = new float[sizeMX * sizeNY];
 
     // LOOK: Setup random number generator and fill host arrays and the scalar a.
     std::random_device rd;
@@ -65,13 +74,27 @@ int main(int argc, char *argv[])
     //     for k -> 0 to sizeXY
     //       dot = m[k, px] * n[py, k]
     //  matrixPGold[py, px] = dot
+    for (unsigned j = 0; j < sizeNY; j++) {
+        for (unsigned i = 0; i < sizeMX; i++) {
+            float sum = 0.f;
+            for (unsigned k = 0; k < sizeXY; k++) {
+                sum += matrixM[k * sizeMX + i] * matrixN[j * sizeXY + k];
+            }
+            matrixPGold[j * sizeMX + i] = sum;
+		}
+    }
 
     // Device arrays
     float *d_matrixM, *d_matrixN, *d_matrixP;
 
     // TODO 4: Allocate memory on the device for d_matrixM, d_matrixN, d_matrixP.
+	CUDA(cudaMalloc((void**)&d_matrixM, sizeMX * sizeXY * sizeof(float)));
+	CUDA(cudaMalloc((void**)&d_matrixN, sizeXY * sizeNY * sizeof(float)));
+	CUDA(cudaMalloc((void**)&d_matrixP, sizeMX * sizeNY * sizeof(float)));
 
     // TODO 5: Copy array contents of M and N from the host (CPU) to the device (GPU)
+	CUDA(cudaMemcpy(d_matrixM, matrixM, sizeMX * sizeXY * sizeof(float), cudaMemcpyHostToDevice));
+	CUDA(cudaMemcpy(d_matrixN, matrixN, sizeXY * sizeNY * sizeof(float), cudaMemcpyHostToDevice));
 
     CUDA(cudaDeviceSynchronize());
 
@@ -86,13 +109,14 @@ int main(int argc, char *argv[])
     // Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup
     // HINT: The shape of matrices has no impact on launch configuaration
     DIMS dims;
-    dims.dimBlock = dim3(1, 1, 1);
-    dims.dimGrid  = dim3(1, 1, 1);
+    dims.dimBlock = dim3(32, 32, 1);
+    dims.dimGrid  = dim3(divup(sizeMX, 32), divup(sizeNY, 32), 1);
 
     // TODO 7: Launch the matrix transpose kernel
-    // matrixMultiplicationNaive<<<>>>();
+	matrixMultiplicationNaive<<<dims.dimGrid, dims.dimBlock>>>(d_matrixP, d_matrixM, d_matrixN, sizeMX, sizeNY, sizeXY);
 
     // TODO 8: copy the answer back to the host (CPU) from the device (GPU)
+	CUDA(cudaMemcpy(matrixP, d_matrixP, sizeMX * sizeNY * sizeof(float), cudaMemcpyDeviceToHost));
 
     // LOOK: Use compareReferenceAndResult to check the result
     compareReferenceAndResult(matrixPGold, matrixP, sizeMX * sizeNY, 1e-3);
@@ -101,6 +125,9 @@ int main(int argc, char *argv[])
     ////////////////////////////////////////////////////////////
 
     // TODO 9: free device memory using cudaFree
+	CUDA(cudaFree(d_matrixM));
+	CUDA(cudaFree(d_matrixN));
+	CUDA(cudaFree(d_matrixP));
 
     // free host memory
     delete[] matrixM;
diff --git a/cuda-introduction/source/saxpy.cu b/cuda-introduction/source/saxpy.cu
index 5ed591f..2777068 100644
--- a/cuda-introduction/source/saxpy.cu
+++ b/cuda-introduction/source/saxpy.cu
@@ -9,20 +9,21 @@
 __global__ void saxpy(float* const z, const float* const x, const float* const y, const float a, const unsigned size)
 {
     // TODO 9: Compute the global index for each thread.
-    unsigned idx = 0;
+    const unsigned idx = blockIdx.x * blockDim.x + threadIdx.x;  // flat 1D index of this thread across the grid
 
     // TODO 10: Check if idx is out of bounds. If yes, return.
-    if (idx >= 0)
+    if (idx >= size)  // idx is unsigned, so only the upper bound can be violated
         return;
 
     // TODO 11: Perform the SAXPY operation: z = a * x + y.
+    z[idx] = a * x[idx] + y[idx];
 }
 
 int main(int argc, char *argv[])
 {
     // TODO 1: Set the size. Start with something simple like 64.
     // TODO Optional: Try out these sizes: 256, 1024, 2048, 14, 103, 1025, 3127
-    const unsigned size = 0;
+    const unsigned size = 1025;
 
     // Host arrays.
     float* x = new float[size];
@@ -52,10 +53,13 @@ int main(int argc, char *argv[])
     float *d_x, *d_y, *d_z;
 
     // TODO 2: Allocate memory on the device. Fill in the blanks for d_x, then do the same commands for d_y and d_z.
-    // CUDA(cudaMalloc((void **)& pointer, size in bytes)));
+    CUDA(cudaMalloc((void**)&d_x, size * sizeof(float)));
+	CUDA(cudaMalloc((void**)&d_y, size * sizeof(float)));
+	CUDA(cudaMalloc((void**)&d_z, size * sizeof(float)));
 
     // TODO 3: Copy array contents of X and Y from the host (CPU) to the device (GPU). Follow what you did for 2,
-    // CUDA(cudaMemcpy(dest ptr, source ptr, size in bytes, direction enum));
+	CUDA(cudaMemcpy(d_x, x, size * sizeof(float), cudaMemcpyHostToDevice));
+	CUDA(cudaMemcpy(d_y, y, size * sizeof(float), cudaMemcpyHostToDevice));
 
     CUDA(cudaDeviceSynchronize());
 
@@ -69,16 +73,17 @@ int main(int argc, char *argv[])
     // TODO 4: Setup threads and blocks.
     // Start threadPerBlock as 128, then try out differnt configurations: 32, 64, 256, 512, 1024
     // Use divup to get the number of blocks to launch.
-    const unsigned threadsPerBlock = 0;
+    const unsigned threadsPerBlock = 128;
 
     // TODO 5: Implement the divup function in common.cpp
     const unsigned blocks = divup(size, threadsPerBlock);
 
     // TODO 6: Launch the GPU kernel with blocks and threadPerBlock as launch configuration
-    // saxpy<<< >>> (....);
+    saxpy<<<blocks, threadsPerBlock>>> (d_z, d_x, d_y, a, size);
 
     // TODO 7: Copy the answer back to the host (CPU) from the device (GPU).
     // Copy what you did in 3, except for d_z -> z.
+	CUDA(cudaMemcpy(z, d_z, size * sizeof(float), cudaMemcpyDeviceToHost));
 
     // LOOK: Use postprocess to check the result
     compareReferenceAndResult(z_gold, z, size, 1e-6);
@@ -86,7 +91,9 @@ int main(int argc, char *argv[])
     ////////////////////////////////////////////////////////////
 
     // TODO 8: free device memory using cudaFree
-    // CUDA(cudaFree(device pointer));
+    CUDA(cudaFree(d_x));
+	CUDA(cudaFree(d_y));
+	CUDA(cudaFree(d_z));
 
     // free host memory
     delete[] x;
diff --git a/cuda-introduction/source/transpose.cu b/cuda-introduction/source/transpose.cu
index 89f6f8f..39ea60a 100644
--- a/cuda-introduction/source/transpose.cu
+++ b/cuda-introduction/source/transpose.cu
@@ -19,16 +19,18 @@
 __global__ void copyKernel(const float* const a, float* const b, const unsigned sizeX, const unsigned sizeY)
 {
     // TODO 6a: Compute the global index for each thread along x and y dimentions.
-    unsigned i = 0;
-    unsigned j = 0;;
+    const unsigned i = blockIdx.x * blockDim.x + threadIdx.x;  // global x index of this thread
+    const unsigned j = blockIdx.y * blockDim.y + threadIdx.y;  // global y index of this thread
 
     // TODO 6b: Check if i or j are out of bounds. If they are, return.
+    if (i >= sizeX || j >= sizeY)  // indices are unsigned, so only upper bounds need checking
+        return;
 
     // TODO 6c: Compute global 1D index from i and j
-    unsigned index = 0;
+    const unsigned index = j * sizeX + i;  // j selects a run of sizeX elements, i the offset within it
 
     // TODO 6d: Copy data from A to B. Note that in copy kernel source and destination indices are the same
-    // b[] = a[];
+    b[index] = a[index];
 }
 
 // TODO 11: Implement the transpose kernel
@@ -38,16 +40,19 @@ __global__ void copyKernel(const float* const a, float* const b, const unsigned
 __global__ void matrixTransposeNaive(const float* const a, float* const b, const unsigned sizeX, const unsigned sizeY)
 {
     // TODO 11a: Compute the global index for each thread along x and y dimentions.
-    unsigned i = 0;
-    unsigned j = 0;
+    const unsigned i = blockIdx.x * blockDim.x + threadIdx.x;  // global x index of this thread
+    const unsigned j = blockIdx.y * blockDim.y + threadIdx.y;  // global y index of this thread
 
     // TODO 11b: Check if i or j are out of bounds. If they are, return.
+    if (i >= sizeX || j >= sizeY)  // indices are unsigned, so only upper bounds need checking
+        return;
 
     // TODO 11c: Compute index_in as (i,j) (same as index in copy kernel) and index_out as (j,i)
-    unsigned index_in  = 0;  // Compute input index (i,j) from matrix A
-    unsigned index_out = 0;  // Compute output index (j,i) in matrix B = transpose(A)
+    const unsigned index_in  = j * sizeX + i;  // Input index (i,j) in matrix A, stride sizeX
+    const unsigned index_out = i * sizeY + j;  // Output index (j,i) in matrix B = transpose(A), stride sizeY
 
     // TODO 11d: Copy data from A to B using transpose indices
+    b[index_out] = a[index_in];
 }
 
 int main(int argc, char *argv[])
@@ -82,8 +87,11 @@ int main(int argc, char *argv[])
     float *d_a, *d_b;
 
     // TODO 2: Allocate memory on the device for d_a and d_b.
+	CUDA(cudaMalloc((void**)&d_a, sizeX * sizeY * sizeof(float)));
+	CUDA(cudaMalloc((void**)&d_b, sizeX * sizeY * sizeof(float)));
 
     // TODO 3: Copy array contents of A from the host (CPU) to the device (GPU)
+	CUDA(cudaMemcpy(d_a, a, sizeX * sizeY * sizeof(float), cudaMemcpyHostToDevice));
 
     CUDA(cudaDeviceSynchronize());
 
@@ -97,13 +105,14 @@ int main(int argc, char *argv[])
         // TODO 4: Assign a 2D distribution of BS_X x BS_Y x 1 CUDA threads within
         // Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup
         DIMS dims;
-        dims.dimBlock = dim3(1, 1, 1);
-        dims.dimGrid = dim3(1, 1, 1);
+        dims.dimBlock = dim3(32, 32, 1);
+        dims.dimGrid = dim3(divup(sizeX, 32), divup(sizeY, 32), 1);
 
         // LOOK: Launch the copy kernel
         copyKernel<<<dims.dimGrid, dims.dimBlock>>>(d_a, d_b, sizeX, sizeY);
 
         // TODO 5: copy the answer back to the host (CPU) from the device (GPU)
+		CUDA(cudaMemcpy(b, d_b, sizeX * sizeY * sizeof(float), cudaMemcpyDeviceToHost));
 
         // LOOK: Use compareReferenceAndResult to check the result
         compareReferenceAndResult(a_gold, b, sizeX * sizeY);
@@ -121,13 +130,14 @@ int main(int argc, char *argv[])
         // TODO 8: Assign a 2D distribution of BS_X x BS_Y x 1 CUDA threads within
         // Calculate number of blocks along X and Y in a 2D CUDA "grid" using divup
         DIMS dims;
-        dims.dimBlock = dim3(1, 1, 1);
-        dims.dimGrid = dim3(1, 1, 1);
+        dims.dimBlock = dim3(32, 32, 1);
+        dims.dimGrid = dim3(divup(sizeX, 32), divup(sizeY, 32), 1);
 
         // TODO 9: Launch the matrix transpose kernel
-        // matrixTransposeNaive<<<>>>(......);
+        matrixTransposeNaive<<<dims.dimGrid, dims.dimBlock>>>(d_a, d_b, sizeX, sizeY);
 
         // TODO 10: copy the answer back to the host (CPU) from the device (GPU)
+		CUDA(cudaMemcpy(b, d_b, sizeX * sizeY * sizeof(float), cudaMemcpyDeviceToHost));
 
         // LOOK: Use compareReferenceAndResult to check the result
         compareReferenceAndResult(b_gold, b, sizeX * sizeY);
@@ -136,6 +146,8 @@ int main(int argc, char *argv[])
     ////////////////////////////////////////////////////////////
 
     // TODO 7: free device memory using cudaFree
+	CUDA(cudaFree(d_a));
+	CUDA(cudaFree(d_b));
 
     // free host memory
     delete[] a;
diff --git a/images/2.1.2.png b/images/2.1.2.png
new file mode 100644
index 0000000..f16da20
Binary files /dev/null and b/images/2.1.2.png differ
diff --git a/images/2.1.3.png b/images/2.1.3.png
new file mode 100644
index 0000000..6bf930c
Binary files /dev/null and b/images/2.1.3.png differ
diff --git a/images/2.1.4.png b/images/2.1.4.png
new file mode 100644
index 0000000..9ebee9f
Binary files /dev/null and b/images/2.1.4.png differ
diff --git a/images/2.1.5.png b/images/2.1.5.png
new file mode 100644
index 0000000..c23b01b
Binary files /dev/null and b/images/2.1.5.png differ
diff --git a/images/2.2.png b/images/2.2.png
new file mode 100644
index 0000000..6210cdd
Binary files /dev/null and b/images/2.2.png differ
diff --git a/images/2.3.png b/images/2.3.png
new file mode 100644
index 0000000..57c227f
Binary files /dev/null and b/images/2.3.png differ