diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml deleted file mode 100644 index c61f731855..0000000000 --- a/.github/workflows/pylint.yml +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -name: PyLint -on: - pull_request: - paths: - - '**.py' - -permissions: - contents: read - -jobs: - build: - name: PyLint - runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v4 - - name: Get file changes - id: get_file_changes - uses: trilom/file-changes-action@v1.2.4 - with: - output: ' ' - - name: Report list of changed files - run: | - echo Changed files: ${{ steps.get_file_changes.outputs.files }} - - name: Set up Python 3.10 - uses: actions/setup-python@v5 - with: - python-version: "3.10" - - name: Install Python dependencies - run: | - python -m pip install --upgrade pip - pip install pylint==3.0.2 numpy wheel - pip install -r ci/requirements.txt - - name: Run PyLint on changed files - run: | - echo "${{ steps.get_file_changes.outputs.files}}" | tr " " "\n" | grep ".py$" | xargs pylint --rcfile=ci/pylintrc diff --git a/.gitignore b/.gitignore index 06c3c0710a..96b03f7bf9 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ /build /builddir .cache/ +.idea/ # jni build files iniparser/ diff --git a/Applications/Android/PicoGPTJNI/.gitignore b/Applications/Android/PicoGPTJNI/.gitignore new file mode 100644 index 0000000000..54ed6ea235 --- /dev/null +++ b/Applications/Android/PicoGPTJNI/.gitignore @@ -0,0 +1,19 @@ +*.iml +.gradle +/.vscode +/.idea +/local.properties +/.idea/caches +/.idea/libraries +/.idea/modules.xml +/.idea/workspace.xml +/.idea/navEditor.xml +/.idea/assetWizardSettings.xml +.DS_Store +/build +/captures +.externalNativeBuild +.cxx +local.properties +/app/src/main/jniLibs +/app/src/main/obj diff --git a/Applications/Android/PicoGPTJNI/.gradle/7.5/checksums/checksums.lock b/Applications/Android/PicoGPTJNI/.gradle/7.5/checksums/checksums.lock deleted file mode 100644 index 62d1fcfe2b..0000000000 Binary files a/Applications/Android/PicoGPTJNI/.gradle/7.5/checksums/checksums.lock and /dev/null differ diff --git a/Applications/Android/PicoGPTJNI/.gradle/7.5/dependencies-accessors/dependencies-accessors.lock b/Applications/Android/PicoGPTJNI/.gradle/7.5/dependencies-accessors/dependencies-accessors.lock deleted file mode 100644 index 4f1595be70..0000000000 Binary files a/Applications/Android/PicoGPTJNI/.gradle/7.5/dependencies-accessors/dependencies-accessors.lock and /dev/null differ diff --git a/Applications/Android/PicoGPTJNI/.gradle/7.5/dependencies-accessors/gc.properties b/Applications/Android/PicoGPTJNI/.gradle/7.5/dependencies-accessors/gc.properties deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/Applications/Android/PicoGPTJNI/.gradle/7.5/executionHistory/executionHistory.lock 
b/Applications/Android/PicoGPTJNI/.gradle/7.5/executionHistory/executionHistory.lock deleted file mode 100644 index 506dd636a9..0000000000 Binary files a/Applications/Android/PicoGPTJNI/.gradle/7.5/executionHistory/executionHistory.lock and /dev/null differ diff --git a/Applications/Android/PicoGPTJNI/.gradle/7.5/fileHashes/fileHashes.lock b/Applications/Android/PicoGPTJNI/.gradle/7.5/fileHashes/fileHashes.lock deleted file mode 100644 index 096927b1af..0000000000 Binary files a/Applications/Android/PicoGPTJNI/.gradle/7.5/fileHashes/fileHashes.lock and /dev/null differ diff --git a/Applications/Android/PicoGPTJNI/.gradle/7.5/gc.properties b/Applications/Android/PicoGPTJNI/.gradle/7.5/gc.properties deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/Applications/Android/PicoGPTJNI/.gradle/buildOutputCleanup/buildOutputCleanup.lock b/Applications/Android/PicoGPTJNI/.gradle/buildOutputCleanup/buildOutputCleanup.lock deleted file mode 100644 index 2ab7eb0273..0000000000 Binary files a/Applications/Android/PicoGPTJNI/.gradle/buildOutputCleanup/buildOutputCleanup.lock and /dev/null differ diff --git a/Applications/Android/PicoGPTJNI/.gradle/buildOutputCleanup/cache.properties b/Applications/Android/PicoGPTJNI/.gradle/buildOutputCleanup/cache.properties deleted file mode 100644 index f11a0f4e85..0000000000 --- a/Applications/Android/PicoGPTJNI/.gradle/buildOutputCleanup/cache.properties +++ /dev/null @@ -1,2 +0,0 @@ -#Tue Feb 14 16:37:06 KST 2023 -gradle.version=7.5 diff --git a/Applications/Android/PicoGPTJNI/.gradle/checksums/checksums.lock b/Applications/Android/PicoGPTJNI/.gradle/checksums/checksums.lock deleted file mode 100644 index 287309dd96..0000000000 Binary files a/Applications/Android/PicoGPTJNI/.gradle/checksums/checksums.lock and /dev/null differ diff --git a/Applications/Android/PicoGPTJNI/.gradle/file-system.probe b/Applications/Android/PicoGPTJNI/.gradle/file-system.probe deleted file mode 100644 index 71fa644c1c..0000000000 Binary files a/Applications/Android/PicoGPTJNI/.gradle/file-system.probe and /dev/null differ diff --git a/Applications/Android/PicoGPTJNI/.gradle/vcs-1/gc.properties b/Applications/Android/PicoGPTJNI/.gradle/vcs-1/gc.properties deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/Applications/Android/PicoGPTJNI/.idea/compiler.xml b/Applications/Android/PicoGPTJNI/.idea/compiler.xml deleted file mode 100644 index 5421743a9c..0000000000 --- a/Applications/Android/PicoGPTJNI/.idea/compiler.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - diff --git a/Applications/Android/PicoGPTJNI/.idea/gradle.xml b/Applications/Android/PicoGPTJNI/.idea/gradle.xml deleted file mode 100644 index b795db1fe1..0000000000 --- a/Applications/Android/PicoGPTJNI/.idea/gradle.xml +++ /dev/null @@ -1,20 +0,0 @@ - - - - - - - diff --git a/Applications/Android/PicoGPTJNI/.idea/misc.xml b/Applications/Android/PicoGPTJNI/.idea/misc.xml deleted file mode 100644 index 0f31685c15..0000000000 --- a/Applications/Android/PicoGPTJNI/.idea/misc.xml +++ /dev/null @@ -1,12 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/Applications/Android/PicoGPTJNI/.idea/vcs.xml b/Applications/Android/PicoGPTJNI/.idea/vcs.xml deleted file mode 100644 index c2365ab11f..0000000000 --- a/Applications/Android/PicoGPTJNI/.idea/vcs.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/Applications/Android/PicoGPTJNI/.idea/workspace.xml b/Applications/Android/PicoGPTJNI/.idea/workspace.xml deleted file mode 100644 index 039da86b98..0000000000 --- 
a/Applications/Android/PicoGPTJNI/.idea/workspace.xml +++ /dev/null @@ -1,147 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 1676357527812 - - - - diff --git a/Applications/KNN/jni/meson.build b/Applications/KNN/jni/meson.build index bc50dc0214..58ca099d75 100644 --- a/Applications/KNN/jni/meson.build +++ b/Applications/KNN/jni/meson.build @@ -15,4 +15,4 @@ e = executable('knn_sample', install_dir: application_install_dir ) -test('app_knn', e, args: [nntr_app_resdir / 'KNN']) +test('app_knn', e, args: [nntr_app_resdir / 'KNN/']) diff --git a/Applications/LLaMA/jni/main.cpp b/Applications/LLaMA/jni/main.cpp index 96be8671dc..985d82a79e 100644 --- a/Applications/LLaMA/jni/main.cpp +++ b/Applications/LLaMA/jni/main.cpp @@ -56,7 +56,7 @@ int const NUM_VOCAB = 96000; int MAX_SEQ_LEN = 1024; int NUM_TO_GENERATE = 100; -constexpr unsigned int INIT_SEQ_LEN = 30; +constexpr unsigned int INIT_SEQ_LEN = 28; unsigned int batch_size = 1; unsigned int epoch = 1; @@ -596,7 +596,7 @@ void run(std::string text, bool apply_temperature) { float init_input[INIT_SEQ_LEN] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200, 300, 400, 500, 600, 700, 800, 900}; - ((uint *)(input_sample))[0] = init_input[0]; + memcpy(input_sample, init_input, sizeof(float) * INIT_SEQ_LEN); input.push_back(input_sample); init_len = 18; #endif diff --git a/Applications/Resnet/README.md b/Applications/Resnet/README.md index f76d5b25de..f195a8c764 100644 --- a/Applications/Resnet/README.md +++ b/Applications/Resnet/README.md @@ -14,7 +14,7 @@ Please file an issue if you have a problem running the example. ```bash $ meson ${build_dir} -Denable-test=true -Denable-long-test=true -$ meson test app_resnet18 -v -c ${build_dir} +$ meson test app_resnet18 -v -C ${build_dir} ``` ### To run with a real data. 
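The LLaMA hunk above replaces a single-element cast-assignment with a full-buffer copy, so every value of `init_input` reaches `input_sample`, and shrinks `INIT_SEQ_LEN` from 30 to 28 to match the 28-entry initializer list. A minimal sketch of the difference, assuming `input_sample` is a `float *` buffer of at least `INIT_SEQ_LEN` elements (the wrapper function and buffer handling here are illustrative, not part of the patch):

```cpp
#include <cstring> // memcpy

constexpr unsigned int INIT_SEQ_LEN = 28;

// Hypothetical helper: fills the model input buffer with the fixed prompt ids.
void fill_init_input(float *input_sample) {
  const float init_input[INIT_SEQ_LEN] = {
    0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
    10,  20,  30,  40,  50,  60,  70,  80,  90,  100,
    200, 300, 400, 500, 600, 700, 800, 900};

  // Old form copied only the first element, and through an int-typed alias:
  //   ((uint *)(input_sample))[0] = init_input[0];

  // New form copies all 28 floats of the initial sequence into the buffer.
  memcpy(input_sample, init_input, sizeof(float) * INIT_SEQ_LEN);
}
```

The same hunk group also fixes the Resnet README's `meson test` invocation: `-C ${build_dir}` (uppercase) is the flag that selects the build directory, which is why the lowercase `-c` is replaced.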
diff --git a/Applications/YOLO/PyTorch/main.py b/Applications/YOLO/PyTorch/main.py deleted file mode 100644 index b831e1ebb1..0000000000 --- a/Applications/YOLO/PyTorch/main.py +++ /dev/null @@ -1,171 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# Copyright (C) 2023 Seungbaek Hong -# -# @file main.py -# @date 8 March 2023 -# @brief Implement training for yolo -# -# @author Seungbaek Hong - -import numpy as np -import torch -import torch.nn as nn -import torch.optim as optim -import torch.nn.functional as F -from torch.utils.data import DataLoader - -from yolo import YoloV2 -from yolo_loss import YoloV2_LOSS -from dataset import YOLODataset, collate_db - -import sys -import os - -# get pyutils path using relative path -def get_util_path(): - current_path = os.path.abspath(os.path.dirname(__file__)) - parent_path = os.path.abspath(os.path.dirname(current_path)) - target_path = os.path.abspath(os.path.dirname(parent_path)) - return os.path.dirname(target_path) + '/tools/pyutils/' - -# add pyutils path to sys.path -sys.path.append(get_util_path()) -from torchconverter import save_bin - -# set config -out_size = 13 -num_classes = 4 -num_anchors = 5 - -epochs = 3 -batch_size = 4 - -train_img_dir = '/home/user/TRAIN_DIR/images/*' -train_ann_dir = '/home/user/TRAIN_DIR/annotations/*' -valid_img_dir = '/home/user/VALID_DIR/images/*' -valid_ann_dir = '/home/user/VALID_DIR/annotations/*' - -# load data -train_dataset = YOLODataset(train_img_dir, train_ann_dir) -train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_db, shuffle=True, drop_last=True) -valid_dataset = YOLODataset(valid_img_dir, valid_ann_dir) -valid_loader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_db, shuffle=False, drop_last=True) - -# set model, loss and optimizer -model = YoloV2(num_classes=num_classes) -criterion = YoloV2_LOSS(num_classes=num_classes) -optimizer = optim.Adam(model.parameters(), lr=1e-3) -# scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10, eta_min=0) - -# save init model -save_bin(model, 'init_model') -torch.save(model.state_dict(), './init_model.pt') - -# train model -best_loss = 1e+10 -for epoch in range(epochs): - epoch_train_loss = 0 - epoch_valid_loss = 0 - for idx, (img, bbox, cls) in enumerate(train_loader): - model.train() - optimizer.zero_grad() - # model prediction - hypothesis = model(img).permute((0, 2, 3, 1)) - hypothesis = hypothesis.reshape((batch_size, out_size**2, num_anchors, 5+num_classes)) - # split each prediction(bbox, iou, class prob) - bbox_pred_xy = torch.sigmoid(hypothesis[..., :2]) - bbox_pred_wh = torch.exp(hypothesis[..., 2:4]) - bbox_pred = torch.cat((bbox_pred_xy, bbox_pred_wh), 3) - iou_pred = torch.sigmoid(hypothesis[..., 4:5]) - score_pred = hypothesis[..., 5:].contiguous() - prob_pred = torch.softmax(score_pred.view(-1, num_classes), dim=1).view(score_pred.shape) - # calc loss - loss = criterion(torch.FloatTensor(bbox_pred), - torch.FloatTensor(iou_pred), - torch.FloatTensor(prob_pred), - bbox, - cls) - # back prop - loss.backward() - optimizer.step() - # scheduler.step() - epoch_train_loss += loss.item() - - for idx, (img, bbox, cls) in enumerate(valid_loader): - model.eval() - with torch.no_grad(): - # model prediction - hypothesis = model(img).permute((0, 2, 3, 1)) - hypothesis = hypothesis.reshape((hypothesis.shape[0], out_size**2, num_anchors, 5+num_classes)) - # split each prediction(bbox, iou, class prob) - bbox_pred_xy = torch.sigmoid(hypothesis[..., :2]) - bbox_pred_wh = 
torch.exp(hypothesis[..., 2:4]) - bbox_pred = torch.cat((bbox_pred_xy, bbox_pred_wh), 3) - iou_pred = torch.sigmoid(hypothesis[..., 4:5]) - score_pred = hypothesis[..., 5:].contiguous() - prob_pred = torch.softmax(score_pred.view(-1, num_classes), dim=1).view(score_pred.shape) - # calc loss - loss = criterion(torch.FloatTensor(bbox_pred), - torch.FloatTensor(iou_pred), - torch.FloatTensor(prob_pred), - bbox, - cls) - epoch_valid_loss += loss.item() - - if epoch_valid_loss < best_loss: - best_loss = epoch_valid_loss - torch.save(model.state_dict(), './best_model.pt') - save_bin(model, 'best_model') - - print("{}epoch, train loss: {:.4f}, valid loss: {:.4f}".format( - epoch, epoch_train_loss / len(train_loader), epoch_valid_loss / len(valid_loader))) - -## -# @brief bbox post process function for inference -def post_process_for_bbox(bbox_pred): - """ - @param bbox_pred shape(batch_size, cell_h x cell_w, num_anchors, 4) - @return bbox_pred shape(batch_size, cell_h x cell_w, num_anchors, 4) - """ - anchors = torch.FloatTensor( - [(1.3221, 1.73145), - (3.19275, 4.00944), - (5.05587, 8.09892), - (9.47112, 4.84053), - (11.2364, 10.0071)] - ) - - outsize = (13, 13) - width, height = outsize - - # restore cell pos to x, y - for w in range(width): - for h in range(height): - bbox_pred[:, height*h + w, :, 0] += w - bbox_pred[:, height*h + w, :, 1] += h - bbox_pred[:, :, :, :2] /= 13 - - # apply anchors to w, h - anchor_w = anchors[:, 0].contiguous().view(-1, 1) - anchor_h = anchors[:, 1].contiguous().view(-1, 1) - bbox_pred[:, :, :, 2:3] *= anchor_w - bbox_pred[:, :, :, 3:4] *= anchor_h - - return bbox_pred - -# inference example using trained model -hypothesis = model(img).permute((0, 2, 3, 1)) -hypothesis = hypothesis[0].reshape((1, out_size**2, num_anchors, 5+num_classes)) - -# transform output -bbox_pred_xy = torch.sigmoid(hypothesis[..., :2]) -bbox_pred_wh = torch.exp(hypothesis[..., 2:4]) -bbox_pred = torch.cat((bbox_pred_xy, bbox_pred_wh), 3) -bbox_pred = post_process_for_bbox(bbox_pred) -iou_pred = torch.sigmoid(hypothesis[..., 4:5]) -score_pred = hypothesis[..., 5:].contiguous() -prob_pred = torch.softmax(score_pred.view(-1, num_classes), dim=1).view(score_pred.shape) - -# result of inference (data range 0~1) -iou_mask = (iou_pred > 0.5) -print(bbox_pred * iou_mask, iou_pred * iou_mask, prob_pred * iou_mask) diff --git a/Applications/YOLO/PyTorch/yolo.py b/Applications/YOLO/PyTorch/yolo.py deleted file mode 100644 index 53763f1be7..0000000000 --- a/Applications/YOLO/PyTorch/yolo.py +++ /dev/null @@ -1,105 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# Copyright (C) 2023 Seungbaek Hong -# -# @file yolo.py -# @date 8 March 2023 -# @brief Define simple yolo model, but not original darknet. 
-# -# @author Seungbaek Hong - -import torch -import torch.nn as nn - -## -# @brief define yolo model (except for re-organization module) -class YoloV2(nn.Module): - def __init__(self, num_classes, num_anchors=5): - - super(YoloV2, self).__init__() - self.num_classes = num_classes - self.num_anchors = num_anchors - self.conv1 = nn.Sequential(nn.Conv2d(3, 32, 3, 1, 1), nn.BatchNorm2d(32, eps=1e-3), - nn.LeakyReLU(), nn.MaxPool2d(2, 2)) - self.conv2 = nn.Sequential(nn.Conv2d(32, 64, 3, 1, 1), nn.BatchNorm2d(64, eps=1e-3), - nn.LeakyReLU(), nn.MaxPool2d(2, 2)) - self.conv3 = nn.Sequential(nn.Conv2d(64, 128, 3, 1, 1), nn.BatchNorm2d(128, eps=1e-3), - nn.LeakyReLU()) - self.conv4 = nn.Sequential(nn.Conv2d(128, 64, 1, 1, 0), nn.BatchNorm2d(64, eps=1e-3), - nn.LeakyReLU()) - self.conv5 = nn.Sequential(nn.Conv2d(64, 128, 3, 1, 1), nn.BatchNorm2d(128, eps=1e-3), - nn.LeakyReLU(), nn.MaxPool2d(2, 2)) - self.conv6 = nn.Sequential(nn.Conv2d(128, 256, 3, 1, 1), nn.BatchNorm2d(256, eps=1e-3), - nn.LeakyReLU()) - self.conv7 = nn.Sequential(nn.Conv2d(256, 128, 1, 1, 0), nn.BatchNorm2d(128, eps=1e-3), - nn.LeakyReLU()) - self.conv8 = nn.Sequential(nn.Conv2d(128, 256, 3, 1, 1), nn.BatchNorm2d(256, eps=1e-3), - nn.LeakyReLU(), nn.MaxPool2d(2, 2)) - self.conv9 = nn.Sequential(nn.Conv2d(256, 512, 3, 1, 1), nn.BatchNorm2d(512, eps=1e-3), - nn.LeakyReLU()) - self.conv10 = nn.Sequential(nn.Conv2d(512, 256, 1, 1, 0), nn.BatchNorm2d(256, eps=1e-3), - nn.LeakyReLU()) - self.conv11 = nn.Sequential(nn.Conv2d(256, 512, 3, 1, 1), nn.BatchNorm2d(512, eps=1e-3), - nn.LeakyReLU()) - self.conv12 = nn.Sequential(nn.Conv2d(512, 256, 1, 1, 0), nn.BatchNorm2d(256, eps=1e-3), - nn.LeakyReLU()) - self.conv13 = nn.Sequential(nn.Conv2d(256, 512, 3, 1, 1), nn.BatchNorm2d(512, eps=1e-3), - nn.LeakyReLU()) - - self.conv_b = nn.Sequential(nn.Conv2d(512, 64, 1, 1, 0), nn.BatchNorm2d(64, eps=1e-3), - nn.LeakyReLU()) - - self.maxpool_a = nn.MaxPool2d(2, 2) - self.conv_a1 = nn.Sequential(nn.Conv2d(512, 1024, 3, 1, 1), nn.BatchNorm2d(1024, eps=1e-3), - nn.LeakyReLU()) - self.conv_a2 = nn.Sequential(nn.Conv2d(1024, 512, 1, 1, 0), nn.BatchNorm2d(512, eps=1e-3), - nn.LeakyReLU()) - self.conv_a3 = nn.Sequential(nn.Conv2d(512, 1024, 3, 1, 1), nn.BatchNorm2d(1024, eps=1e-3), - nn.LeakyReLU()) - self.conv_a4 = nn.Sequential(nn.Conv2d(1024, 512, 1, 1, 0), nn.BatchNorm2d(512, eps=1e-3), - nn.LeakyReLU()) - self.conv_a5 = nn.Sequential(nn.Conv2d(512, 1024, 3, 1, 1), nn.BatchNorm2d(1024, eps=1e-3), - nn.LeakyReLU()) - self.conv_a6 = nn.Sequential(nn.Conv2d(1024, 1024, 3, 1, 1), nn.BatchNorm2d(1024, eps=1e-3), - nn.LeakyReLU()) - self.conv_a7 = nn.Sequential(nn.Conv2d(1024, 1024, 3, 1, 1), nn.BatchNorm2d(1024, eps=1e-3), - nn.LeakyReLU()) - - self.conv_out1 = nn.Sequential(nn.Conv2d(1280, 1024, 3, 1, 1), nn.BatchNorm2d(1024, eps=1e-3), - nn.LeakyReLU()) - - self.conv_out2 = nn.Conv2d(1024, self.num_anchors * (5 + num_classes), 1, 1, 0) - - def forward(self, input): - output = self.conv1(input) - output = self.conv2(output) - output = self.conv3(output) - output = self.conv4(output) - output = self.conv5(output) - output = self.conv6(output) - output = self.conv7(output) - output = self.conv8(output) - output = self.conv9(output) - output = self.conv10(output) - output = self.conv11(output) - output = self.conv12(output) - output = self.conv13(output) - - output_a = self.maxpool_a(output) - output_a = self.conv_a1(output_a) - output_a = self.conv_a2(output_a) - output_a = self.conv_a3(output_a) - output_a = self.conv_a4(output_a) - output_a = 
self.conv_a5(output_a) - output_a = self.conv_a6(output_a) - output_a = self.conv_a7(output_a) - - output_b = self.conv_b(output) - b, c, h, w = output_b.size() - output_b = output_b.view(b, int(c / 4), h, 2, w, 2).contiguous() - output_b = output_b.permute(0, 3, 5, 1, 2, 4).contiguous() - output_b = output_b.view(b, -1, int(h / 2), int(w / 2)) - - output = torch.cat((output_a, output_b), 1) - output = self.conv_out1(output) - output = self.conv_out2(output) - return output diff --git a/Applications/YOLO/PyTorch/dataset.py b/Applications/YOLOv2/PyTorch/dataset.py similarity index 58% rename from Applications/YOLO/PyTorch/dataset.py rename to Applications/YOLOv2/PyTorch/dataset.py index a02971ae87..d939e0f8a9 100644 --- a/Applications/YOLO/PyTorch/dataset.py +++ b/Applications/YOLOv2/PyTorch/dataset.py @@ -8,50 +8,68 @@ # @author Seungbaek Hong import glob +import re import numpy as np import torch from torch.utils.data import Dataset from torch.utils.data.dataloader import default_collate from PIL import Image + ## # @brief dataset class for yolo -# @note Need annotation text files corresponding to the name of the images. +# @note Need annotation text files corresponding to the name of the images. class YOLODataset(Dataset): def __init__(self, img_dir, ann_dir): super().__init__() - img_list = glob.glob(img_dir) - ann_list = glob.glob(ann_dir) - img_list.sort(), ann_list.sort() + self.img_dir = img_dir + pattern = re.compile("\/(\d+)\.") + img_list = glob.glob(img_dir + "*") + ann_list = glob.glob(ann_dir + "*") + + img_ids = list(map(lambda x: pattern.search(x).group(1), img_list)) + ann_ids = list(map(lambda x: pattern.search(x).group(1), ann_list)) + ids_list = list(set(img_ids) & set(ann_ids)) - self.length = len(img_list) - self.input_images = [] + self.ids_list = [] self.bbox_gt = [] self.cls_gt = [] - for i in range(len(img_list)): - img = np.array(Image.open(img_list[i]).resize((416, 416))) / 255 + for ids in ids_list: label_bbox = [] label_cls = [] - with open(ann_list[i], 'rt') as f: + with open(ann_dir + ids + ".txt", "rt", encoding="utf-8") as f: for line in f.readlines(): line = [float(i) for i in line.split()] label_bbox.append(np.array(line[1:], dtype=np.float32) / 416) label_cls.append(int(line[0])) - self.input_images.append(img) + if len(label_cls) == 0: + continue + + self.ids_list.append(ids) self.bbox_gt.append(label_bbox) self.cls_gt.append(label_cls) - self.input_images = np.array(self.input_images) - self.input_images = torch.FloatTensor(self.input_images).permute((0, 3, 1, 2)) + self.length = len(self.ids_list) def __len__(self): return self.length - + def __getitem__(self, idx): - return self.input_images[idx], self.bbox_gt[idx], self.cls_gt[idx] - + img = ( + torch.FloatTensor( + np.array( + Image.open(self.img_dir + self.ids_list[idx] + ".jpg").resize( + (416, 416) + ) + ) + ).permute((2, 0, 1)) + / 255 + ) + return img, self.bbox_gt[idx], self.cls_gt[idx] + + ## # @brief collate db function for yolo def collate_db(batch): diff --git a/Applications/YOLOv2/PyTorch/main.py b/Applications/YOLOv2/PyTorch/main.py new file mode 100644 index 0000000000..6e42fa1c6b --- /dev/null +++ b/Applications/YOLOv2/PyTorch/main.py @@ -0,0 +1,222 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (C) 2023 Seungbaek Hong +# +# @file main.py +# @date 8 March 2023 +# @brief Implement training for yolo +# +# @author Seungbaek Hong + +import sys +import os + +from PIL import Image, ImageDraw +from matplotlib import pyplot as plt +from torch import optim +from torch.utils.data import 
DataLoader +import torch +import numpy as np + +from yolo import YoloV2 +from yolo_loss import YoloV2_LOSS +from dataset import YOLODataset, collate_db +from torchconverter import save_bin + +device = "cuda" if torch.cuda.is_available() else "cpu" + + +# get pyutils path using relative path +def get_util_path(): + current_path = os.path.abspath(os.path.dirname(__file__)) + parent_path = os.path.abspath(os.path.dirname(current_path)) + target_path = os.path.abspath(os.path.dirname(parent_path)) + return os.path.dirname(target_path) + "/tools/pyutils/" + + +# add pyutils path to sys.path +sys.path.append(get_util_path()) + +# set config +out_size = 13 +num_classes = 4 +num_anchors = 5 + +epochs = 3 +batch_size = 4 + +train_img_dir = "/home/user/TRAIN_DIR/images/" +train_ann_dir = "/home/user/TRAIN_DIR/annotations/" +valid_img_dir = "/home/user/VALID_DIR/images/" +valid_ann_dir = "/home/user/VALID_DIR/annotations/" + +# load data +train_dataset = YOLODataset(train_img_dir, train_ann_dir) +train_loader = DataLoader( + train_dataset, + batch_size=batch_size, + collate_fn=collate_db, + shuffle=True, + drop_last=True, +) +valid_dataset = YOLODataset(valid_img_dir, valid_ann_dir) +valid_loader = DataLoader( + valid_dataset, + batch_size=batch_size, + collate_fn=collate_db, + shuffle=False, + drop_last=True, +) + +# set model, loss and optimizer +model = YoloV2(num_classes=num_classes).to(device) +criterion = YoloV2_LOSS( + num_classes=num_classes, img_shape=(416, 416), device=device +).to(device) +optimizer = optim.Adam(model.parameters(), lr=1e-5) +scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10, eta_min=0) + +# save init model +save_bin(model, "init_model") +torch.save(model.state_dict(), "./init_model.pt") + +# train model +best_loss = 1e10 +for epoch in range(epochs): + epoch_train_loss = 0 + epoch_valid_loss = 0 + model.train() + for idx, (img, bbox, cls) in enumerate(train_loader): + optimizer.zero_grad() + # model prediction + hypothesis = model(img.to(device)).permute((0, 2, 3, 1)) + hypothesis = hypothesis.reshape( + (batch_size, out_size**2, num_anchors, 5 + num_classes) + ) + # split each prediction(bbox, iou, class prob) + bbox_pred_xy = torch.sigmoid(hypothesis[..., :2]) + bbox_pred_wh = torch.exp(hypothesis[..., 2:4]) + bbox_pred = torch.cat((bbox_pred_xy, bbox_pred_wh), 3) + iou_pred = torch.sigmoid(hypothesis[..., 4:5]) + score_pred = hypothesis[..., 5:].contiguous() + prob_pred = torch.softmax(score_pred.view(-1, num_classes), dim=1).view( + score_pred.shape + ) + # calc loss + loss = criterion(bbox_pred, iou_pred, prob_pred, bbox, cls) + # back prop + loss.backward() + optimizer.step() + scheduler.step() + epoch_train_loss += loss.item() + + model.eval() + for idx, (img, bbox, cls) in enumerate(valid_loader): + with torch.no_grad(): + # model prediction + hypothesis = model(img.to(device)).permute((0, 2, 3, 1)) + hypothesis = hypothesis.reshape( + (hypothesis.shape[0], out_size**2, num_anchors, 5 + num_classes) + ) + # split each prediction(bbox, iou, class prob) + bbox_pred_xy = torch.sigmoid(hypothesis[..., :2]) + bbox_pred_wh = torch.exp(hypothesis[..., 2:4]) + bbox_pred = torch.cat((bbox_pred_xy, bbox_pred_wh), 3) + iou_pred = torch.sigmoid(hypothesis[..., 4:5]) + score_pred = hypothesis[..., 5:].contiguous() + prob_pred = torch.softmax(score_pred.view(-1, num_classes), dim=1).view( + score_pred.shape + ) + # calc loss + loss = criterion(bbox_pred, iou_pred, prob_pred, bbox, cls) + epoch_valid_loss += loss.item() + + if epoch_valid_loss < best_loss: 
+ best_loss = epoch_valid_loss + torch.save(model.state_dict(), "./best_model.pt") + save_bin(model, "best_model") + + print( + f"{epoch}epoch, train loss: {epoch_train_loss / len(train_loader):.4f},\ + valid loss: {epoch_valid_loss / len(valid_loader):.4f}" + ) + + +## +# @brief bbox post process function for inference +def post_process_for_bbox(bbox_p): + """ + @param bbox_p shape(batch_size, cell_h x cell_w, num_anchors, 4) + @return bbox_p shape(batch_size, cell_h x cell_w, num_anchors, 4) + """ + anchors = torch.FloatTensor( + [ + (1.3221, 1.73145), + (3.19275, 4.00944), + (5.05587, 8.09892), + (9.47112, 4.84053), + (11.2364, 10.0071), + ] + ) + + outsize = (13, 13) + width, height = outsize + + # restore cell pos to x, y + for w in range(width): + for h in range(height): + bbox_p[:, height * h + w, :, 0] += w + bbox_p[:, height * h + w, :, 1] += h + bbox_p[:, :, :, :2] /= 13 + + # apply anchors to w, h + anchor_w = anchors[:, 0].contiguous().view(-1, 1).to(device) + anchor_h = anchors[:, 1].contiguous().view(-1, 1).to(device) + bbox_p[:, :, :, 2:3] *= anchor_w + bbox_p[:, :, :, 3:4] *= anchor_h + + return bbox_p + + +def visualize_bbox(img_pred, bbox_preds): + img_array = (img_pred.to("cpu") * 255).permute((1, 2, 0)).numpy().astype(np.uint8) + img = Image.fromarray(img_array) + + for bbox_pred in bbox_preds: + bbox_pred = [int(x * 416) for x in bbox_pred] + + if sum(bbox_pred) == 0: + continue + + x_lefttop = bbox_pred[0] + y_lefttop = bbox_pred[1] + width = bbox_pred[2] + height = bbox_pred[3] + + draw = ImageDraw.Draw(img) + draw.rectangle( + [(x_lefttop, y_lefttop), (x_lefttop + width, y_lefttop + height)] + ) + + plt.imshow(img) + plt.show() + + +# inference example using trained model +hypothesis = model(img.to(device)).permute((0, 2, 3, 1)) +hypothesis = hypothesis[0].reshape((1, out_size**2, num_anchors, 5 + num_classes)) + +# transform output +bbox_pred_xy = torch.sigmoid(hypothesis[..., :2]) +bbox_pred_wh = torch.exp(hypothesis[..., 2:4]) +bbox_pred = torch.cat((bbox_pred_xy, bbox_pred_wh), 3) +bbox_pred = post_process_for_bbox(bbox_pred) +iou_pred = torch.sigmoid(hypothesis[..., 4:5]) +score_pred = hypothesis[..., 5:].contiguous() +prob_pred = torch.softmax(score_pred.view(-1, num_classes), dim=1).view( + score_pred.shape +) + +# result of inference (data range 0~1) +iou_mask = iou_pred > 0.5 +bbox_pred = bbox_pred * iou_mask +visualize_bbox(img, bbox_pred.reshape(-1, 4)) diff --git a/Applications/YOLOv2/PyTorch/yolo.py b/Applications/YOLOv2/PyTorch/yolo.py new file mode 100644 index 0000000000..390cbd5ada --- /dev/null +++ b/Applications/YOLOv2/PyTorch/yolo.py @@ -0,0 +1,176 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (C) 2023 Seungbaek Hong +# +# @file yolo.py +# @date 8 March 2023 +# @brief Define simple yolo model, but not original darknet. 
+# +# @author Seungbaek Hong + +import torch +from torch import nn + + +## +# @brief define yolo model (except for re-organization module) +class YoloV2(nn.Module): + def __init__(self, num_classes, num_anchors=5): + + super().__init__() + self.num_classes = num_classes + self.num_anchors = num_anchors + self.conv1 = nn.Sequential( + nn.Conv2d(3, 32, 3, 1, 1, bias=False), + nn.BatchNorm2d(32), + nn.LeakyReLU(0.1), + nn.MaxPool2d(2, 2), + ) + self.conv2 = nn.Sequential( + nn.Conv2d(32, 64, 3, 1, 1, bias=False), + nn.BatchNorm2d(64), + nn.LeakyReLU(0.1), + nn.MaxPool2d(2, 2), + ) + self.conv3 = nn.Sequential( + nn.Conv2d(64, 128, 3, 1, 1, bias=False), + nn.BatchNorm2d(128), + nn.LeakyReLU(0.1), + ) + self.conv4 = nn.Sequential( + nn.Conv2d(128, 64, 1, 1, 0, bias=False), + nn.BatchNorm2d(64), + nn.LeakyReLU(0.1), + ) + self.conv5 = nn.Sequential( + nn.Conv2d(64, 128, 3, 1, 1, bias=False), + nn.BatchNorm2d(128), + nn.LeakyReLU(0.1), + nn.MaxPool2d(2, 2), + ) + self.conv6 = nn.Sequential( + nn.Conv2d(128, 256, 3, 1, 1, bias=False), + nn.BatchNorm2d(256), + nn.LeakyReLU(0.1), + ) + self.conv7 = nn.Sequential( + nn.Conv2d(256, 128, 1, 1, 0, bias=False), + nn.BatchNorm2d(128), + nn.LeakyReLU(0.1), + ) + self.conv8 = nn.Sequential( + nn.Conv2d(128, 256, 3, 1, 1, bias=False), + nn.BatchNorm2d(256), + nn.LeakyReLU(0.1), + nn.MaxPool2d(2, 2), + ) + self.conv9 = nn.Sequential( + nn.Conv2d(256, 512, 3, 1, 1, bias=False), + nn.BatchNorm2d(512), + nn.LeakyReLU(0.1), + ) + self.conv10 = nn.Sequential( + nn.Conv2d(512, 256, 1, 1, 0, bias=False), + nn.BatchNorm2d(256), + nn.LeakyReLU(0.1), + ) + self.conv11 = nn.Sequential( + nn.Conv2d(256, 512, 3, 1, 1, bias=False), + nn.BatchNorm2d(512), + nn.LeakyReLU(0.1), + ) + self.conv12 = nn.Sequential( + nn.Conv2d(512, 256, 1, 1, 0, bias=False), + nn.BatchNorm2d(256), + nn.LeakyReLU(0.1), + ) + self.conv13 = nn.Sequential( + nn.Conv2d(256, 512, 3, 1, 1, bias=False), + nn.BatchNorm2d(512), + nn.LeakyReLU(0.1), + ) + + self.conv_b = nn.Sequential( + nn.Conv2d(512, 64, 1, 1, 0, bias=False), + nn.BatchNorm2d(64), + nn.LeakyReLU(0.1), + ) + + self.maxpool_a = nn.MaxPool2d(2, 2) + self.conv_a1 = nn.Sequential( + nn.Conv2d(512, 1024, 3, 1, 1, bias=False), + nn.BatchNorm2d(1024), + nn.LeakyReLU(0.1), + ) + self.conv_a2 = nn.Sequential( + nn.Conv2d(1024, 512, 1, 1, 0, bias=False), + nn.BatchNorm2d(512), + nn.LeakyReLU(0.1), + ) + self.conv_a3 = nn.Sequential( + nn.Conv2d(512, 1024, 3, 1, 1, bias=False), + nn.BatchNorm2d(1024), + nn.LeakyReLU(0.1), + ) + self.conv_a4 = nn.Sequential( + nn.Conv2d(1024, 512, 1, 1, 0, bias=False), + nn.BatchNorm2d(512), + nn.LeakyReLU(0.1), + ) + self.conv_a5 = nn.Sequential( + nn.Conv2d(512, 1024, 3, 1, 1, bias=False), + nn.BatchNorm2d(1024), + nn.LeakyReLU(0.1), + ) + self.conv_a6 = nn.Sequential( + nn.Conv2d(1024, 1024, 3, 1, 1, bias=False), + nn.BatchNorm2d(1024), + nn.LeakyReLU(0.1), + ) + self.conv_a7 = nn.Sequential( + nn.Conv2d(1024, 1024, 3, 1, 1, bias=False), + nn.BatchNorm2d(1024), + nn.LeakyReLU(0.1), + ) + + self.conv_out1 = nn.Sequential( + nn.Conv2d(1280, 1024, 3, 1, 1, bias=False), + nn.BatchNorm2d(1024), + nn.LeakyReLU(0.1), + ) + + self.conv_out2 = nn.Conv2d(1024, self.num_anchors * (5 + num_classes), 1, 1, 0) + + def forward(self, x): + output = self.conv1(x) + output = self.conv2(output) + output = self.conv3(output) + output = self.conv4(output) + output = self.conv5(output) + output = self.conv6(output) + output = self.conv7(output) + output = self.conv8(output) + output = self.conv9(output) + output = self.conv10(output) + 
output = self.conv11(output) + output = self.conv12(output) + output = self.conv13(output) + + output_a = self.maxpool_a(output) + output_a = self.conv_a1(output_a) + output_a = self.conv_a2(output_a) + output_a = self.conv_a3(output_a) + output_a = self.conv_a4(output_a) + output_a = self.conv_a5(output_a) + output_a = self.conv_a6(output_a) + output_a = self.conv_a7(output_a) + + output_b = self.conv_b(output) + b, c, h, w = output_b.size() + output_b = output_b.view(b, int(c / 4), h, 2, w, 2).contiguous() + output_b = output_b.permute(0, 3, 5, 1, 2, 4).contiguous() + output_b = output_b.view(b, -1, int(h / 2), int(w / 2)) + + output = torch.cat((output_a, output_b), 1) + output = self.conv_out1(output) + output = self.conv_out2(output) + return output diff --git a/Applications/YOLO/PyTorch/yolo_loss.py b/Applications/YOLOv2/PyTorch/yolo_loss.py similarity index 72% rename from Applications/YOLO/PyTorch/yolo_loss.py rename to Applications/YOLOv2/PyTorch/yolo_loss.py index 12f95572a4..c444821236 100644 --- a/Applications/YOLO/PyTorch/yolo_loss.py +++ b/Applications/YOLOv2/PyTorch/yolo_loss.py @@ -8,10 +8,10 @@ # @author Seungbaek Hong import torch -import torch.nn as nn -import torch.functional as F +from torch import nn import numpy as np + ## # @brief calculate iou between two boxes list def calculate_iou(bbox1, bbox2): @@ -25,27 +25,28 @@ def calculate_iou(bbox1, bbox2): b1x2, b1y2 = (bbox1[:, :2] + (bbox1[:, 2:4])).split(1, 1) b2x1, b2y1 = (bbox2[:, :2]).split(1, 1) b2x2, b2y2 = (bbox2[:, :2] + (bbox2[:, 2:4])).split(1, 1) - + # box areas areas1 = (b1x2 - b1x1) * (b1y2 - b1y1) areas2 = (b2x2 - b2x1) * (b2y2 - b2y1) - + # intersections min_x_of_max_x, max_x_of_min_x = torch.min(b1x2, b2x2), torch.max(b1x1, b2x1) min_y_of_max_y, max_y_of_min_y = torch.min(b1y2, b2y2), torch.max(b1y1, b2y1) intersection_width = (min_x_of_max_x - max_x_of_min_x).clamp(min=0) intersection_height = (min_y_of_max_y - max_y_of_min_y).clamp(min=0) intersections = intersection_width * intersection_height - - # unions + + # unions unions = (areas1 + areas2) - intersections - - result = intersections / unions + + result = intersections / unions return result + ## # @brief find best iou and its index -def find_best_ratio(anchors, bbox): +def find_best_ratio(anchors, bbox): """ @param anchors shape(numb_of_anchors, 2), it contains w, h @param bbox shape(numb_of_bbox, 2), it contains w, h @@ -57,52 +58,59 @@ def find_best_ratio(anchors, bbox): best_match = np.argmin(similarities, axis=0) return best_match + ## # @brief loss class for yolo class YoloV2_LOSS(nn.Module): """Yolo v2 loss""" - def __init__(self, num_classes, img_shape = (416, 416), outsize = (13, 13)): + + def __init__(self, num_classes, img_shape, device="cpu", outsize=(13, 13)): super().__init__() + self.device = device self.num_classes = num_classes self.img_shape = img_shape self.outsize = outsize - self.hook = dict() - + self.hook = {} + self.anchors = torch.FloatTensor( - [(1.3221, 1.73145), - (3.19275, 4.00944), - (5.05587, 8.09892), - (9.47112, 4.84053), - (11.2364, 10.0071)] + [ + (1.3221, 1.73145), + (3.19275, 4.00944), + (5.05587, 8.09892), + (9.47112, 4.84053), + (11.2364, 10.0071), + ] ) - + self.mse = nn.MSELoss() self.bbox_loss, self.iou_loss, self.cls_loss = None, None, None - + ## - # @brief function to track gradients of non-leaf varibles. + # @brief function to track gradients of non-leaf varibles. def hook_variable(self, name, var): - """ Do not use this function when training. It is for debugging. 
""" + """Do not use this function when training. It is for debugging.""" self.hook[name] = var self.hook[name].requires_grad_().retain_grad() ## # @brief function to print gradients of non-leaf varibles. def print_hook_variables(self): - """ Do not use this function when training. It is for debugging. """ + """Do not use this function when training. It is for debugging.""" for k, var in self.hook.items(): - print("gradients of variable {}:".format(k)) + print(f"gradients of variable {k}:") batch, channel, height, width = var.grad.shape for b in range(batch): for c in range(channel): for h in range(height): for w in range(width): if torch.abs(var.grad[b, c, h, w]).item() >= 1e-3: - print("(b: {}, c: {}, h: {}, w: {}) = {}"\ - .format(b, c, h, w, var.grad[b, c, h, w])) + print( + f"(b: {b}, c: {c}, h: {h}, w: {w}) =\ + {var.grad[b, c, h, w]}" + ) print("=" * 20) - - def forward(self, bbox_pred, iou_pred, prob_pred, bbox_gt, cls_gt): + + def forward(self, bbox_pred, iou_pred, prob_pred, bbox_gt, cls_gt): """ @param bbox_pred shape(batch_size, cell_h x cell_w, num_anchors, 4) @param iou_pred shape(batch_size, cell_h x cell_w, 1) @@ -114,52 +122,50 @@ def forward(self, bbox_pred, iou_pred, prob_pred, bbox_gt, cls_gt): self.hook_variable("bbox_pred", bbox_pred) bbox_pred = self.apply_anchors_to_bbox(bbox_pred) - bbox_built, iou_built, cls_built, bbox_mask, iou_mask, cls_mask =\ + bbox_built, iou_built, cls_built, bbox_mask, iou_mask, cls_mask = ( self._build_target(bbox_pred, bbox_gt, cls_gt) - - self.bbox_loss = self.mse(bbox_pred * bbox_mask, - bbox_built * bbox_mask) - self.iou_loss = self.mse(iou_pred * iou_mask, - iou_built * iou_mask) - self.cls_loss = self.mse(prob_pred * cls_mask, - cls_built * cls_mask) - + ) + + self.bbox_loss = self.mse(bbox_pred * bbox_mask, bbox_built * bbox_mask) + self.iou_loss = self.mse(iou_pred * iou_mask, iou_built * iou_mask) + self.cls_loss = self.mse(prob_pred * cls_mask, cls_built * cls_mask) + return self.bbox_loss * 5 + self.iou_loss + self.cls_loss - + def apply_anchors_to_bbox(self, bbox_pred): """ @param bbox_pred shape(batch_size, cell_h x cell_w, num_anchors, 4) - @return bbox_pred shape(batch_size, cell_h x cell_w, num_anchors, 4) + @return bbox_pred shape(batch_size, cell_h x cell_w, num_anchors, 4) """ - anchor_w = self.anchors[:, 0].contiguous().view(-1, 1) - anchor_h = self.anchors[:, 1].contiguous().view(-1, 1) + anchor_w = self.anchors[:, 0].contiguous().view(-1, 1).to(self.device) + anchor_h = self.anchors[:, 1].contiguous().view(-1, 1).to(self.device) bbox_pred_tmp = bbox_pred.clone() bbox_pred_tmp[:, :, :, 2:3] = torch.sqrt(bbox_pred[:, :, :, 2:3] * anchor_w) bbox_pred_tmp[:, :, :, 3:4] = torch.sqrt(bbox_pred[:, :, :, 3:4] * anchor_h) return bbox_pred_tmp - + def _build_target(self, bbox_pred, bbox_gt, cls_gt): """ @param bbox_pred shape(batch_size, cell_h x cell_w, num_anchors, 4) @param bbox_gt shape(batch_size, num_bbox, 4) @param cls_gt shape(batch_size, num_bbox, 1) @return tuple of (bbox_built, iou_built, cls_built, bbox_mask, iou_mask, cls_mask) - """ + """ bbox_built, bbox_mask = [], [] iou_built, iou_mask = [], [] cls_built, cls_mask = [], [] - + batch_size = bbox_pred.shape[0] - + for i in range(batch_size): - _bbox_built, _iou_built, _cls_built,\ - _bbox_mask, _iou_mask, _cls_mask =\ - self._make_target_per_sample( - torch.FloatTensor(bbox_pred[i]), - torch.FloatTensor(np.array(bbox_gt[i])), - torch.LongTensor(cls_gt[i]) - ) - + _bbox_built, _iou_built, _cls_built, _bbox_mask, _iou_mask, _cls_mask = ( + 
self._make_target_per_sample( + bbox_pred[i], + torch.FloatTensor(np.array(bbox_gt[i])), + torch.LongTensor(cls_gt[i]), + ) + ) + bbox_built.append(_bbox_built) bbox_mask.append(_bbox_mask) iou_built.append(_iou_built) @@ -173,9 +179,16 @@ def _build_target(self, bbox_pred, bbox_gt, cls_gt): iou_mask = torch.stack(iou_mask) cls_built = torch.stack(cls_built) cls_mask = torch.stack(cls_mask) - - return bbox_built, iou_built, cls_built, bbox_mask, iou_mask, cls_mask - + + return ( + bbox_built.to(self.device), + iou_built.to(self.device), + cls_built.to(self.device), + bbox_mask.to(self.device), + iou_mask.to(self.device), + cls_mask.to(self.device), + ) + def _make_target_per_sample(self, _bbox_pred, _bbox_gt, _cls_gt): """ @param _bbox_pred shape(cell_h x cell_w, num_anchors, 4) @@ -183,22 +196,22 @@ def _make_target_per_sample(self, _bbox_pred, _bbox_gt, _cls_gt): @param _cls_gt shape(num_bbox,) @return tuple of (_bbox_built, _iou_built, _cls_built, _bbox_mask, _iou_mask, _cls_mask) """ - hw, num_anchors, _ = _bbox_pred.shape - + hw, num_anchors, _ = _bbox_pred.shape + # set result template _bbox_built = torch.zeros((hw, num_anchors, 4)) _bbox_mask = torch.zeros((hw, num_anchors, 1)) - + _iou_built = torch.zeros((hw, num_anchors, 1)) _iou_mask = torch.ones((hw, num_anchors, 1)) * 0.5 - + _cls_built = torch.zeros((hw, num_anchors, self.num_classes)) _cls_mask = torch.zeros((hw, num_anchors, 1)) - + # find best anchors - _bbox_gt_wh = _bbox_gt.clone()[:, 2:] + _bbox_gt_wh = _bbox_gt.clone()[:, 2:] best_anchors = find_best_ratio(self.anchors, _bbox_gt_wh) - + # normalize x, y pos based on cell coornindates cx = _bbox_gt[:, 0] * self.outsize[0] cy = _bbox_gt[:, 1] * self.outsize[1] @@ -207,22 +220,23 @@ def _make_target_per_sample(self, _bbox_pred, _bbox_gt, _cls_gt): cell_idx = np.array(cell_idx, dtype=np.int16) cx -= np.floor(cx) cy -= np.floor(cy) - + # set bbox of gt - _bbox_built[cell_idx, best_anchors, 0] = cx + _bbox_built[cell_idx, best_anchors, 0] = cx _bbox_built[cell_idx, best_anchors, 1] = cy - _bbox_built[cell_idx, best_anchors, 2] = torch.sqrt(_bbox_gt[:, 2]) - _bbox_built[cell_idx, best_anchors, 3] = torch.sqrt(_bbox_gt[:, 3]) + _bbox_built[cell_idx, best_anchors, 2] = torch.sqrt(_bbox_gt[:, 2]) + _bbox_built[cell_idx, best_anchors, 3] = torch.sqrt(_bbox_gt[:, 3]) _bbox_mask[cell_idx, best_anchors, :] = 1 - - # set cls of gt + + # set cls of gt _cls_built[cell_idx, best_anchors, _cls_gt] = 1 _cls_mask[cell_idx, best_anchors, :] = 1 - + # set confidence score of gt - _iou_built = calculate_iou(_bbox_pred.reshape(-1, 4), _bbox_built.view(-1, 4)).detach() + _iou_built = calculate_iou( + _bbox_pred.reshape(-1, 4), _bbox_built.view(-1, 4).to(self.device) + ).detach() _iou_built = _iou_built.view(hw, num_anchors, 1) _iou_mask[cell_idx, best_anchors, :] = 1 - - return _bbox_built, _iou_built, _cls_built,\ - _bbox_mask, _iou_mask, _cls_mask + + return _bbox_built, _iou_built, _cls_built, _bbox_mask, _iou_mask, _cls_mask diff --git a/Applications/YOLO/jni/Android.mk b/Applications/YOLOv2/jni/Android.mk similarity index 100% rename from Applications/YOLO/jni/Android.mk rename to Applications/YOLOv2/jni/Android.mk diff --git a/Applications/YOLO/jni/Application.mk b/Applications/YOLOv2/jni/Application.mk similarity index 100% rename from Applications/YOLO/jni/Application.mk rename to Applications/YOLOv2/jni/Application.mk diff --git a/Applications/YOLO/jni/det_dataloader.cpp b/Applications/YOLOv2/jni/det_dataloader.cpp similarity index 100% rename from 
Applications/YOLO/jni/det_dataloader.cpp rename to Applications/YOLOv2/jni/det_dataloader.cpp diff --git a/Applications/YOLO/jni/det_dataloader.h b/Applications/YOLOv2/jni/det_dataloader.h similarity index 100% rename from Applications/YOLO/jni/det_dataloader.h rename to Applications/YOLOv2/jni/det_dataloader.h diff --git a/Applications/YOLO/jni/main.cpp b/Applications/YOLOv2/jni/main.cpp similarity index 97% rename from Applications/YOLO/jni/main.cpp rename to Applications/YOLOv2/jni/main.cpp index bc3985adbd..018602e408 100644 --- a/Applications/YOLO/jni/main.cpp +++ b/Applications/YOLOv2/jni/main.cpp @@ -139,6 +139,7 @@ std::vector yoloBlock(const std::string &block_name, withKey("filters", filters), withKey("kernel_size", {kernel_size, kernel_size}), withKey("padding", padding), + withKey("disable_bias", "true"), withKey("input_layers", input_layer)}; return createLayer("conv2d", props); @@ -150,6 +151,7 @@ std::vector yoloBlock(const std::string &block_name, if (downsample) { LayerHandle a2 = createLayer("batch_normalization", {with_name("a2"), withKey("momentum", "0.9"), + withKey("epsilon", 0.00001), withKey("activation", "leaky_relu")}); LayerHandle a3 = createLayer( @@ -158,10 +160,10 @@ std::vector yoloBlock(const std::string &block_name, return {a1, a2, a3}; } else { - LayerHandle a2 = - createLayer("batch_normalization", - {withKey("name", block_name), withKey("momentum", "0.9"), - withKey("activation", "leaky_relu")}); + LayerHandle a2 = createLayer( + "batch_normalization", + {withKey("name", block_name), withKey("momentum", "0.9"), + withKey("epsilon", 0.00001), withKey("activation", "leaky_relu")}); return {a1, a2}; } diff --git a/Applications/YOLO/jni/meson.build b/Applications/YOLOv2/jni/meson.build similarity index 100% rename from Applications/YOLO/jni/meson.build rename to Applications/YOLOv2/jni/meson.build diff --git a/Applications/YOLO/jni/reorg_layer.cpp b/Applications/YOLOv2/jni/reorg_layer.cpp similarity index 100% rename from Applications/YOLO/jni/reorg_layer.cpp rename to Applications/YOLOv2/jni/reorg_layer.cpp diff --git a/Applications/YOLO/jni/reorg_layer.h b/Applications/YOLOv2/jni/reorg_layer.h similarity index 100% rename from Applications/YOLO/jni/reorg_layer.h rename to Applications/YOLOv2/jni/reorg_layer.h diff --git a/Applications/YOLO/jni/yolo_v2_loss.cpp b/Applications/YOLOv2/jni/yolo_v2_loss.cpp similarity index 100% rename from Applications/YOLO/jni/yolo_v2_loss.cpp rename to Applications/YOLOv2/jni/yolo_v2_loss.cpp diff --git a/Applications/YOLO/jni/yolo_v2_loss.h b/Applications/YOLOv2/jni/yolo_v2_loss.h similarity index 100% rename from Applications/YOLO/jni/yolo_v2_loss.h rename to Applications/YOLOv2/jni/yolo_v2_loss.h diff --git a/Applications/meson.build b/Applications/meson.build index 2e3f59fdf2..7c8ef63cd4 100644 --- a/Applications/meson.build +++ b/Applications/meson.build @@ -9,7 +9,7 @@ if enable_ccapi endif subdir('VGG/jni') subdir('Resnet/jni') -subdir('YOLO/jni') +subdir('YOLOv2/jni') subdir('YOLOv3/jni') subdir('LLaMA/jni') subdir('Multi_input/jni') diff --git a/ci/pylintrc b/ci/pylintrc deleted file mode 100644 index aa38200415..0000000000 --- a/ci/pylintrc +++ /dev/null @@ -1,36 +0,0 @@ -[MASTER] - -[MESSAGESCONTROL] -disable= - too-many-instance-attributes, - len-as-condition, - too-few-public-methods, - anomalous-backslash-in-string, - no-else-return, - simplifiable-if-statement, - too-many-arguments, - duplicate-code, - no-name-in-module, - no-member, - raw-checker-failed, - bad-inline-option, - locally-disabled, - 
file-ignored, - suppressed-message, - useless-suppression, - deprecated-pragma, - import-error, - missing-docstring, - invalid-name, - consider-using-enumerate - -[SIMILARITIES] - -# Ignore comments when computing similarities. -ignore-comments=yes - -# Ignore docstrings when computing similarities. -ignore-docstrings=yes - -# Ignore imports when computing similarities. -ignore-imports=no diff --git a/ci/requirements.txt b/ci/requirements.txt deleted file mode 100644 index 0be69076fc..0000000000 --- a/ci/requirements.txt +++ /dev/null @@ -1,81 +0,0 @@ -absl-py==2.1.0 -astroid==3.0.2 -astunparse==1.6.3 -cachetools==5.3.2 -certifi==2023.11.17 -charset-normalizer==3.3.2 -contourpy==1.2.0 -cycler==0.12.1 -dill==0.3.8 -filelock==3.13.1 -flatbuffers==23.5.26 -fonttools==4.47.2 -fsspec==2023.12.2 -gast==0.5.4 -google-auth==2.27.0 -google-auth-oauthlib==1.2.0 -google-pasta==0.2.0 -grpcio==1.60.0 -h5py==3.10.0 -huggingface-hub==0.20.3 -idna==3.6 -importlib-metadata==7.0.1 -importlib-resources==6.1.1 -isort==5.13.2 -Jinja2==3.1.3 -joblib==1.3.2 -keras==2.15.0 -kiwisolver==1.4.5 -libclang==16.0.6 -Markdown==3.5.2 -MarkupSafe==2.1.4 -matplotlib==3.8.2 -mccabe==0.7.0 -ml-dtypes==0.2.0 -mpmath==1.3.0 -networkx==3.2.1 -numpy==1.26.3 -oauthlib==3.2.2 -opt-einsum==3.3.0 -packaging==23.2 -pandas==2.2.0 -pillow==10.2.0 -platformdirs==4.2.0 -protobuf==4.23.4 -pyasn1==0.5.1 -pyasn1-modules==0.3.0 -pylint==3.0.2 -pyparsing==3.1.1 -python-dateutil==2.8.2 -pytz==2023.4 -PyYAML==6.0.1 -regex==2023.12.25 -requests==2.31.0 -requests-oauthlib==1.3.1 -rsa==4.9 -safetensors==0.4.2 -scikit-learn==1.4.0 -scipy==1.12.0 -six==1.16.0 -sympy==1.12 -tensorboard==2.15.1 -tensorboard-data-server==0.7.2 -tensorflow==2.15.0.post1 -tensorflow-estimator==2.15.0 -tensorflow-io-gcs-filesystem==0.35.0 -termcolor==2.4.0 -threadpoolctl==3.2.0 -tokenizers==0.15.1 -tomli==2.0.1 -tomlkit==0.12.3 -torch==2.2.0 -torchvision==0.17.0 -tqdm==4.66.1 -transformers==4.37.2 -triton==2.2.0 -typing_extensions==4.9.0 -tzdata==2023.4 -urllib3==2.2.0 -Werkzeug==3.0.1 -wrapt==1.14.1 -zipp==3.17.0 diff --git a/debian/nntrainer-dev.install b/debian/nntrainer-dev.install index 4fd55b3774..11b41f990b 100644 --- a/debian/nntrainer-dev.install +++ b/debian/nntrainer-dev.install @@ -16,6 +16,7 @@ /usr/include/nntrainer/blas_interface.h /usr/include/nntrainer/var_grad.h /usr/include/nntrainer/weight.h +/usr/include/nntrainer/blas_avx.h # todo: update dataset headers /usr/include/nntrainer/databuffer.h /usr/include/nntrainer/databuffer_factory.h diff --git a/meson.build b/meson.build index d4aea330a4..7ae692e6d9 100644 --- a/meson.build +++ b/meson.build @@ -64,9 +64,19 @@ warning_c_flags = [ '-Wno-error=varargs' ] +arch = host_machine.cpu_family() + +if get_option('enable-avx') + extra_defines += '-DUSE_AVX=1' + if get_option('platform') == 'tizen' + add_project_arguments(['-mavx2'], language: ['c','cpp']) + else + add_project_arguments(['-march=native'], language: ['c','cpp']) + endif + message('-march=native added for AVX hardware acceleration.') +endif if get_option('enable-fp16') - arch = host_machine.cpu_family() if get_option('platform') == 'android' add_project_arguments('-mfp16-format=ieee', language: ['c', 'cpp']) extra_defines += '-DENABLE_FP16=1' @@ -105,11 +115,6 @@ if get_option('enable-fp16') if cc.version().version_compare('>=12.1.0') message ('Float16 for x86_64 enabled. 
Modern gcc-x64 generally supports float16 with _Float16.') extra_defines += '-DENABLE_FP16=1' - if get_option('enable-avx') - extra_defines += '-DUSE_AVX=1' - add_project_arguments(['-march=native'], language: ['c','cpp']) - message('-march=native added for AVX hardware acceleration.') - endif else warning ('Float16 for x86_64 enabled. However, software emulation is applied for fp16, making it slower and inconsistent. Use GCC 12+ for FP16 support. This build will probably fail unless you bring a compiler that supports fp16 for x64.') endif diff --git a/meson_options.txt b/meson_options.txt index de2578cb47..59accc1c1a 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -40,7 +40,7 @@ option('enable-fp16', type: 'boolean', value: false) option('enable-cublas', type: 'boolean', value: false) option('enable-openmp', type: 'boolean', value: true) option('enable-neon', type: 'boolean', value: false) -option('enable-avx', type: 'boolean', value: false) +option('enable-avx', type: 'boolean', value: true) option('enable-opencl', type: 'boolean', value: false) # ml-api dependency (to enable, install capi-inference from github.com/nnstreamer/api ) diff --git a/nnstreamer/tensor_trainer/tensor_trainer_nntrainer.cc b/nnstreamer/tensor_trainer/tensor_trainer_nntrainer.cc index 57d84f99d1..c18630efb9 100644 --- a/nnstreamer/tensor_trainer/tensor_trainer_nntrainer.cc +++ b/nnstreamer/tensor_trainer/tensor_trainer_nntrainer.cc @@ -555,7 +555,8 @@ void NNTrainer::NNTrainerImpl::trainModel() { ml_logd("pid[%d], tid[%d]", pid, tid); try { - model->setProperty({"epochs=" + std::to_string(num_epochs)}); + model->setProperty( + {"epochs=" + std::to_string(num_epochs), "save_path=" + model_save_path}); } catch (const std::exception &e) { ml_loge("Error %s, %s", typeid(e).name(), e.what()); return; @@ -574,14 +575,6 @@ void NNTrainer::NNTrainerImpl::trainModel() { return; } - try { - ml_logd("Save_model: %s", model_save_path.c_str()); - model->save(model_save_path, ml::train::ModelFormat::MODEL_FORMAT_BIN); - - } catch (const std::exception &e) { - ml_loge("Error %s, %s", typeid(e).name(), e.what()); - return; - } /* send event */ nnstreamer_trainer_notify_event(this->notifier, TRAINER_EVENT_TRAINING_COMPLETION, NULL); diff --git a/nntrainer/cl_context.cpp b/nntrainer/cl_context.cpp index 1ed31490be..be7345eed0 100644 --- a/nntrainer/cl_context.cpp +++ b/nntrainer/cl_context.cpp @@ -13,7 +13,7 @@ */ #include -#include +#include namespace nntrainer { @@ -23,8 +23,9 @@ std::once_flag global_cl_context_init_flag; static void add_default_object(ClContext &cc) { - cc.registerFactory(nntrainer::createLayer, - FullyConnectedLayer::type, ml::train::LayerType::LAYER_FC); + cc.registerFactory(nntrainer::createLayer, + FullyConnectedLayerCl::type, + ml::train::LayerType::LAYER_FC); } static void registerer(ClContext &cc) noexcept { diff --git a/nntrainer/graph/graph_core.cpp b/nntrainer/graph/graph_core.cpp index b624e066e4..3eafbb9261 100644 --- a/nntrainer/graph/graph_core.cpp +++ b/nntrainer/graph/graph_core.cpp @@ -35,6 +35,10 @@ GraphCore::getSortedNode(unsigned int ith) const { return Sorted.at(ith); } +const unsigned int GraphCore::getSortedNodeIdx(const std::string &name) const { + return sorted_node_map.at(name); +} + void GraphCore::makeAdjacencyList( std::vector>> &adj) { /** initialize the adj list */ @@ -93,6 +97,11 @@ void GraphCore::topologicalSort() { if (Sorted.size() != node_list.size()) throw std::runtime_error("Internal error in topologicalSort"); + unsigned int idx = 0; + for (auto n : Sorted) { + 
sorted_node_map[n->getName()] = idx; + idx++; + } } const std::shared_ptr & diff --git a/nntrainer/graph/graph_core.h b/nntrainer/graph/graph_core.h index 83d3ce7c39..77aa63666a 100644 --- a/nntrainer/graph/graph_core.h +++ b/nntrainer/graph/graph_core.h @@ -91,6 +91,13 @@ class GraphCore { */ const std::shared_ptr &getSortedNode(unsigned int ith) const; + /** + * @brief getter of Sorted GraphNode index with name + * @param[in] layer name + * @ret index + */ + const unsigned int getSortedNodeIdx(const std::string &name) const; + /** * @brief getter of GraphNode with node name * @param[in] node name @@ -252,6 +259,7 @@ class GraphCore { std::vector> node_list; /**< Unordered Node List */ std::unordered_map node_map; /**< Unordered Node map */ + std::unordered_map sorted_node_map; /**< Unordered Node map */ std::vector> Sorted; /**< Ordered Node List */ bool sorted; /** if the node_list is sorted */ diff --git a/nntrainer/graph/network_graph.cpp b/nntrainer/graph/network_graph.cpp index 2d4cfdc769..ec69ebd69f 100644 --- a/nntrainer/graph/network_graph.cpp +++ b/nntrainer/graph/network_graph.cpp @@ -337,7 +337,7 @@ void NetworkGraph::applyGradients( continue; } - if (rc.isGradientClipByGlobalNorm(i)) { + if (rc.isGradientClipByGlobalNorm(i) || rc.isMixedPrecision(i)) { /** * @note the weights whose gradient are to be clipped by global norm will * be clipped at once at the end of iteration and applied then. @@ -393,56 +393,113 @@ sharedConstTensors NetworkGraph::incremental_forwarding( return out; } -void NetworkGraph::backwarding( +bool NetworkGraph::backwarding( int iteration, - std::function, int)> &backwarding_op, - std::function &apply_grad_clip_op, - std::function stop_cb, void *userdata) const { + std::function, bool)> &forwarding_op, + std::function, int)> &backwarding_op, + std::function &lazy_apply_grad_op, + std::function stop_cb, void *userdata) { /** * last layer backwarding is run out of this loop */ auto iter_begin = getBackwardingBeginIter(); auto iter_end = getBackwardingEndIter(); + bool is_valid = true; /// there is no layer to train, so backwarding is essentially noop if (iter_begin == iter_end) { - return; + return true; } auto const &lptr_begin = (*iter_begin); + // graph_const_reverse_iterator + auto iter_ = iter_begin; if (lptr_begin->requireLabel() == false) throw std::runtime_error( "Error: last layer does not accept label, we can't train"); - for (auto iter = iter_begin; iter != iter_end && !stop_cb(userdata); iter++) { - auto &ln = *iter; + for (iter_ = iter_begin; iter_ != iter_end && !stop_cb(userdata); iter_++) { + auto &ln = *iter_; PROFILE_TIME_START(profile_keys.at(ln->getType())); - backwarding_op(ln, iteration); + is_valid = backwarding_op(ln, iteration); PROFILE_TIME_END(profile_keys.at(ln->getType())); + + if (!is_valid) { + std::cout << ln->getName() << " : Gradient has NaN --> " + << ln->getRunContext().getLossScale() << std::endl; + break; + } } - /** perform clipping of the gradients by global norm if any */ - if (clip_weights.empty()) - return; + if (!is_valid) { + /** if has NaN + * 1. reset the loss scale. : @todo Backoff_factor : default --> 0.5 + * 2. run forwarding from cur_iter to cend() && !stop_cb(userdata); + * 3. return false --> run backwarding again; + */ + float scale = (*iter_)->getRunContext().getLossScale(); + + NNTR_THROW_IF(scale == 1.0f, std::invalid_argument) + << "Loss Scale Factor is 1.0f"; + + float s = scale > 1.5f ? 
scale * 0.5f : 1.0f; - /** calculate the global norm */ - Tensor global_norm_t( - TensorDim({1u, 1u, 1u, (unsigned int)clip_weights.size()})); - float *global_norm_data = global_norm_t.getData(); - for (unsigned int idx = 0; idx < clip_weights.size(); idx++) { - auto const &w = clip_weights[idx]; - global_norm_data[idx] = w->getGradientNorm(); + resetLossScale(s); + + auto f_iter = cbegin() + graph.getSortedNodeIdx((*iter_)->getName()); + + for (auto iter = f_iter; iter != cend() && !stop_cb(userdata); iter++) { + auto &ln = *iter; + ln->needsOutputSetZero(true); + } + + for (auto iter = f_iter; iter != cend() && !stop_cb(userdata); iter++) { + auto &ln = *iter; + PROFILE_TIME_START(profile_keys.at(ln->getType())); + forwarding_op(*iter, true); + PROFILE_TIME_END(profile_keys.at(ln->getType())); + } + + return false; } - float global_norm = global_norm_t.l2norm(); - /** apply the gradient with the above global norm */ - for (auto w : clip_weights) { - w->clipGradientByGlobalNorm(global_norm); + + /** perform clipping of the gradients by global norm if any */ + if (lazy_weights.empty()) + return true; + + if (is_clip_grad) { + /** calculate the global norm */ + Tensor global_norm_t( + TensorDim({1u, 1u, 1u, (unsigned int)lazy_weights.size()})); + float *global_norm_data = global_norm_t.getData(); + for (unsigned int idx = 0; idx < lazy_weights.size(); idx++) { + auto const &w = lazy_weights[idx]; + global_norm_data[idx] = w->getGradientNorm(); + } + float global_norm = global_norm_t.l2norm(); + /** apply the gradient with the above global norm */ + for (auto w : lazy_weights) { + w->clipGradientByGlobalNorm(global_norm); + } } /** apply the gradient with the above global norm */ - for (auto w : clip_weights) { - apply_grad_clip_op(*w, iteration); + for (auto w : lazy_weights) { + lazy_apply_grad_op(*w, iteration); } + nan_count++; + + /** @todo : handle as property : growth_interval : default --> 2000 */ + + if (nan_count > 2000) { + float scale = (*iter_)->getRunContext().getLossScale(); + /** @todo growth_factor : default --> 2.0 */ + float s = scale * 2.0f; + resetLossScale(s); + nan_count = 0; + } + + return true; } LayerNode *NetworkGraph::computeBackwardEnd() { @@ -580,8 +637,15 @@ void NetworkGraph::addLayer(std::shared_ptr layer) { InPlace NetworkGraph::canExecuteInPlace(const std::shared_ptr &lnode) { - if (!lnode->supportInPlace()) + + if (!lnode->supportInPlace()) { return InPlace::NONE; + } + + if (lnode->getType() == InputLayer::type && + !istrequal(getTensorType()[2], "FP32")) { + return InPlace::NONE; + } /** layers which behave as a no-op - flatten */ auto no_op = [](const std::shared_ptr &lnode) { @@ -768,9 +832,10 @@ NetworkGraph::finalizeContext(const std::shared_ptr &lnode, * node is going to be used with in-place optimizations. 
*/ auto out_specs = init_context.getOutSpecs(); + /// @note try move inplace control to finalize bool shared_var = false, shared_grad = false; - if (lnode->executeInPlace() != InPlace::NONE) { + if (lnode->executeInPlace() != InPlace::NONE && lnode->supportInPlace()) { setInplaceSharedMemoryConfigByLayer(lnode, shared_var, shared_grad); for (unsigned int i = 0; i < out_specs.size(); ++i) { auto &s = out_specs.at(i); @@ -879,7 +944,8 @@ NetworkGraph::finalizeContext(const std::shared_ptr &lnode, lnode->getTrainable(), shared_weight_names), inputs, outputs, tensor_manager->requestTensors(gnode, init_context.getTensorsSpec(), - lnode->getTrainable(), shared_tensor_names)); + lnode->getTrainable(), shared_tensor_names), + init_context.getLossScale()); return outputs; } @@ -1027,7 +1093,8 @@ NetworkGraph::refinalizeContext(const std::shared_ptr &lnode, // TODO: update weights spec for trainable based on layer trainable prop weights, inputs, outputs, tensor_manager->requestTensors(gnode, init_context.getTensorsSpec(), - lnode->getTrainable(), shared_tensor_names)); + lnode->getTrainable(), shared_tensor_names), + init_context.getLossScale()); return outputs; } @@ -1197,7 +1264,7 @@ int NetworkGraph::initialize(ExecutionMode mode, */ if (tensor_manager->isLastAccess(rc.getWeightGrad(i).getName(), last_grad_access) || - (rc.isGradientClipByGlobalNorm(i) && + ((rc.isGradientClipByGlobalNorm(i) || rc.isMixedPrecision(i)) && tensor_manager->isSecondLastAccess(rc.getWeightGrad(i).getName(), last_grad_access))) { rc.getWeightObject(i).setAsGradientLastAccess(); @@ -1287,11 +1354,19 @@ int NetworkGraph::initialize(ExecutionMode mode, /** select weights which would require clipping of the gradients by global * norm if any */ - clip_weights = tensor_manager->getWeights([](const Weight *w) { + lazy_weights = tensor_manager->getWeights([](const Weight *w) { return w->hasGradient() && w->isGradientLastAccess() && - w->isGradientClipByGlobalNorm(); + (w->isGradientClipByGlobalNorm() || w->isMixedPrecision()); }); + is_clip_grad = false; + for (auto w : lazy_weights) { + if (w->isGradientClipByGlobalNorm()) { + is_clip_grad = true; + break; + } + } + return ML_ERROR_NONE; } @@ -1556,10 +1631,18 @@ void NetworkGraph::requestOptimizerVariable( const TensorDim &dim = w->getDim(); std::vector dims = cb(dim); w->setOptimizerVariables(tensor_manager->requestWeightOptimizerVariables( - dims, w->getName(), TensorLifespan::MAX_LIFESPAN, - w->isGradientClipByGlobalNorm(), Tensor::Initializer::ZEROS)); + dims, w->getName(), ":opt", TensorLifespan::MAX_LIFESPAN, + w->isGradientClipByGlobalNorm(), w->isMixedPrecision(), + Tensor::Initializer::ZEROS)); } } } +void NetworkGraph::resetLossScale(float scale) { + for (auto iter = cbegin(); iter != cend(); iter++) { + auto &ln = *iter; + ln->getRunContext().setLossScale(scale); + } +} + } /* namespace nntrainer */ diff --git a/nntrainer/graph/network_graph.h b/nntrainer/graph/network_graph.h index 5c9adf0363..22f14e1b73 100644 --- a/nntrainer/graph/network_graph.h +++ b/nntrainer/graph/network_graph.h @@ -51,7 +51,9 @@ class NetworkGraph { optimize_memory(true), exec_mode(ExecutionMode::TRAIN), tensor_format("NCHW"), - tensor_dtype(split("FP32-FP32", getRegex("\\-"))) {} + tensor_dtype(split("FP32-FP32", getRegex("\\-"))) { + nan_count = 0; + } /** * @brief Constructor of NeuralNetwork Graph Class @@ -73,7 +75,9 @@ class NetworkGraph { optimize_memory(true), exec_mode(ExecutionMode::TRAIN), tensor_format(tensor_format_), - tensor_dtype(split(tensor_dtype_, getRegex("\\-"))) {} + 
tensor_dtype(split(tensor_dtype_, getRegex("\\-"))) { + nan_count = 0; + } /** * @brief Destructor of the NeuralNetwork Graph class @@ -206,13 +210,14 @@ class NetworkGraph { * @param[in] backwarding_op operation for the backwarding * @param[in] apply_grad_clip_op operation for applying the clip gradients */ - void backwarding( + bool backwarding( int iteration, - std::function, int)> &backwarding_op, - std::function &apply_grad_clip_op, + std::function, bool)> &forwarding_op, + std::function, int)> &backwarding_op, + std::function &lazy_apply_grad_op, std::function stop_cb = [](void *user_data) { return false; }, - void *user_data = nullptr) const; + void *user_data = nullptr); /** * @brief get begin iterator for the graph @@ -444,6 +449,12 @@ class NetworkGraph { getLayerExecutionOrders(const std::shared_ptr &lnode); #endif // ENABLE_TEST + /** + * @brief reset the loss scale + * @param[in] scale + */ + void resetLossScale(float scale); + private: std::map sub_in_out; /** This is map to identify input and output layer name of subgraph */ @@ -480,7 +491,10 @@ class NetworkGraph { std::unordered_map profile_keys; /**< profile keys based on the layer type */ std::vector - clip_weights; /**< weights with global norm based clipping enabled */ + lazy_weights; /**< weights with global norm based clipping enabled */ + bool is_clip_grad; + + unsigned int nan_count; /** * @brief topological sort diff --git a/nntrainer/layers/bn_layer.cpp b/nntrainer/layers/bn_layer.cpp index 1723ac677f..e978b1ef59 100644 --- a/nntrainer/layers/bn_layer.cpp +++ b/nntrainer/layers/bn_layer.cpp @@ -111,6 +111,12 @@ void BatchNormalizationLayer::finalize(InitLayerContext &context) { context.requestWeight(dim, bnparams_beta, WeightRegularizer::NONE, 1.0f, bias_decay, "beta", true); + /** + * @note declare weigth dimention with activation datatype + */ + TensorDim w_dim = dim; + w_dim.setDataType(in_dim.getDataType()); + /** * caches the deviation -> input - avg(input) * @todo check if avoiding this storage and adding dependency on input (no @@ -121,7 +127,7 @@ void BatchNormalizationLayer::finalize(InitLayerContext &context) { TensorLifespan::ITERATION_LIFESPAN); /** caches the inverse standard deviation */ wt_idx[BNParams::invstd] = - context.requestTensor(dim, "invstd", Tensor::Initializer::NONE, false, + context.requestTensor(w_dim, "invstd", Tensor::Initializer::NONE, false, TensorLifespan::ITERATION_LIFESPAN); /** * Temporary tensor to store the full sized tensors in order to allow batch @@ -136,13 +142,13 @@ void BatchNormalizationLayer::finalize(InitLayerContext &context) { * caches variance + epsilon as well. */ wt_idx[BNParams::cvar] = - context.requestTensor(dim, "cvar", Tensor::Initializer::NONE, false, + context.requestTensor(w_dim, "cvar", Tensor::Initializer::NONE, false, TensorLifespan::ITERATION_LIFESPAN); /** * Temporary tensor to store the reduced tensors along the axes_to_reduce. 
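Editor's note: for reference, the global-norm clipping performed for the lazy_weights / is_clip_grad path introduced above can be summarized in a standalone sketch. Plain vectors stand in for Tensor/Weight, and the threshold handling follows the usual clip-by-global-norm rule rather than the internals of clipGradientByGlobalNorm:

// Editor's sketch, not the Weight API: compute the L2 norm over all gradients
// and rescale every gradient uniformly when it exceeds the allowed norm.
#include <cmath>
#include <vector>

inline void clipByGlobalNorm(std::vector<std::vector<float>> &grads,
                             float max_norm) {
  float sq_sum = 0.0f;
  for (const auto &g : grads)
    for (float v : g)
      sq_sum += v * v;
  const float global_norm = std::sqrt(sq_sum);
  if (global_norm <= max_norm || global_norm == 0.0f)
    return;
  const float factor = max_norm / global_norm; // one factor for all gradients
  for (auto &g : grads)
    for (float &v : g)
      v *= factor;
}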
*/ wt_idx[BNParams::t_reduced] = - context.requestTensor(dim, "tensor_reduced", Tensor::Initializer::NONE, + context.requestTensor(w_dim, "tensor_reduced", Tensor::Initializer::NONE, false, TensorLifespan::FORWARD_DERIV_LIFESPAN); } diff --git a/nntrainer/layers/cl_layers/blas_kernels.cpp b/nntrainer/layers/cl_layers/blas_kernels.cpp new file mode 100644 index 0000000000..c190688c66 --- /dev/null +++ b/nntrainer/layers/cl_layers/blas_kernels.cpp @@ -0,0 +1,301 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Debadri Samaddar + * + * @file blas_kernels.cpp + * @date 14 May 2024 + * @brief Common blas OpenCL kernels + * @see https://github.com/nnstreamer/nntrainer + * @author Debadri Samaddar + * @bug No known bugs except for NYI items + * + */ + +#include + +namespace nntrainer { + +std::string sgemv_cl_kernel_ = + R"(__kernel void sgemv_cl(const __global float* A, const __global float* X, + __global float* Y, unsigned int M, unsigned int N) { + unsigned int i; + i = get_global_id(0); + float y0 = 0.0f; + for (unsigned int j = 0; j < M; j++) + y0 += A[i + j * N] * X[j]; + Y[i] = y0; + + })"; + +std::string dot_cl_kernel_ = + R"(__kernel void dot_cl(const __global float* A, const __global float* X, unsigned int K, __global float* res) { + *res = 0; + for (unsigned int i = 0; i < K; i++){ + *res += A[i] * X[i]; + } + })"; + +std::string sgemm_cl_kernel_ = + R"(__kernel void sgemm_cl(const __global float* A, const __global float* B, + __global float* C, unsigned int K, unsigned int lda, unsigned int ldb, unsigned int ldc) { + + unsigned int m = get_global_id(0); + unsigned int n = get_global_id(1); + float c = 0.0f; + for (unsigned int k = 0; k < K; ++k) { + float a, b; + a = A[m * lda + k]; + b = B[k * ldb + n]; + c += a * b; + } + C[m * ldc + n] = c; + })"; + +/** + * @brief declaring global kernel objects + */ +opencl::Kernel kernel_sgemv; +opencl::Kernel kernel_sgemm; +opencl::Kernel kernel_dot; + +void sgemv_cl(const float *matAdata, const float *vecXdata, float *vecYdata, + unsigned int dim1, unsigned int dim2, unsigned int lda, + RunLayerContext &context) { + + bool result = false; + + do { + result = context.clCreateKernel(sgemv_cl_kernel_, + context.LayerKernel::SGEMV, kernel_sgemv); + if (!result) { + break; + } + + size_t dim1_size = sizeof(float) * dim1; + size_t dim2_size = sizeof(float) * dim2; + opencl::Buffer inputA(context.context_inst_, dim1 * dim2 * sizeof(float), + true, nullptr); + + opencl::Buffer inputX(context.context_inst_, dim1_size, true, nullptr); + + opencl::Buffer inOutY(context.context_inst_, dim2_size, true, nullptr); + + result = inputA.WriteData(context.command_queue_inst_, matAdata); + if (!result) { + break; + } + + result = inputX.WriteData(context.command_queue_inst_, vecXdata); + if (!result) { + break; + } + + result = inOutY.WriteData(context.command_queue_inst_, vecYdata); + if (!result) { + break; + } + + result = kernel_sgemv.SetKernelArguments(0, &inputA, sizeof(cl_mem)); + if (!result) { + break; + } + + result = kernel_sgemv.SetKernelArguments(1, &inputX, sizeof(cl_mem)); + if (!result) { + break; + } + + result = kernel_sgemv.SetKernelArguments(2, &inOutY, sizeof(cl_mem)); + if (!result) { + break; + } + + result = kernel_sgemv.SetKernelArguments(3, &dim1, sizeof(int)); + if (!result) { + break; + } + + result = kernel_sgemv.SetKernelArguments(4, &dim2, sizeof(int)); + if (!result) { + break; + } + + const int work_groups_count[3] = {(int)dim2, 1, 1}; + const int work_group_size[3] = {32, 32, 1}; // test-value + + result = 
context.command_queue_inst_.DispatchCommand( + kernel_sgemv, work_groups_count, work_group_size); + if (!result) { + break; + } + + result = inOutY.ReadData(context.command_queue_inst_, vecYdata); + if (!result) { + break; + } + + } while (false); +} + +float dot_cl(const float *vecAdata, const float *vecXdata, unsigned int dim1, + RunLayerContext &context) { + + bool result = false; + + float cl_ret = 0; + + do { + result = context.clCreateKernel(dot_cl_kernel_, context.LayerKernel::DOT, + kernel_dot); + if (!result) { + break; + } + + size_t dim1_size = sizeof(float) * dim1; + + opencl::Buffer inputA(context.context_inst_, dim1_size, true, nullptr); + + opencl::Buffer inputX(context.context_inst_, dim1_size, true, nullptr); + + opencl::Buffer dotResult(context.context_inst_, sizeof(float), true, + &cl_ret); + + result = inputA.WriteData(context.command_queue_inst_, vecAdata); + if (!result) { + break; + } + + result = inputX.WriteData(context.command_queue_inst_, vecXdata); + if (!result) { + break; + } + + result = kernel_dot.SetKernelArguments(0, &inputA, sizeof(cl_mem)); + if (!result) { + break; + } + + result = kernel_dot.SetKernelArguments(1, &inputX, sizeof(cl_mem)); + if (!result) { + break; + } + + result = kernel_dot.SetKernelArguments(2, &dim1, sizeof(int)); + if (!result) { + break; + } + + result = kernel_dot.SetKernelArguments(3, &dotResult, sizeof(cl_mem)); + if (!result) { + break; + } + + const int work_groups_count[3] = {(int)dim1, 1, 1}; + const int work_group_size[3] = {32, 32, 1}; // test-value + + result = context.command_queue_inst_.DispatchCommand( + kernel_dot, work_groups_count, work_group_size); + if (!result) { + break; + } + + result = dotResult.ReadData(context.command_queue_inst_, &cl_ret); + if (!result) { + break; + } + + } while (false); + + return cl_ret; +} + +void sgemm_cl(const float *A, const float *B, float *C, unsigned int M, + unsigned int N, unsigned int K, unsigned int lda, + unsigned int ldb, unsigned int ldc, RunLayerContext &context) { + + bool result = false; + + do { + result = context.clCreateKernel(sgemm_cl_kernel_, + context.LayerKernel::SGEMM, kernel_sgemm); + if (!result) { + break; + } + + size_t m_k_size = M * K * sizeof(float); + size_t k_n_size = K * N * sizeof(float); + size_t m_n_size = M * N * sizeof(float); + + opencl::Buffer inputA(context.context_inst_, m_k_size, true, nullptr); + + opencl::Buffer inputB(context.context_inst_, k_n_size, true, nullptr); + + opencl::Buffer inOutC(context.context_inst_, m_n_size, true, nullptr); + + result = inputA.WriteData(context.command_queue_inst_, A); + if (!result) { + break; + } + + result = inputB.WriteData(context.command_queue_inst_, B); + if (!result) { + break; + } + + result = inOutC.WriteData(context.command_queue_inst_, C); + if (!result) { + break; + } + + result = kernel_sgemm.SetKernelArguments(0, &inputA, sizeof(cl_mem)); + if (!result) { + break; + } + + result = kernel_sgemm.SetKernelArguments(1, &inputB, sizeof(cl_mem)); + if (!result) { + break; + } + + result = kernel_sgemm.SetKernelArguments(2, &inOutC, sizeof(cl_mem)); + if (!result) { + break; + } + + result = kernel_sgemm.SetKernelArguments(3, &K, sizeof(int)); + if (!result) { + break; + } + + result = kernel_sgemm.SetKernelArguments(4, &lda, sizeof(int)); + if (!result) { + break; + } + + result = kernel_sgemm.SetKernelArguments(5, &ldb, sizeof(int)); + if (!result) { + break; + } + + result = kernel_sgemm.SetKernelArguments(6, &ldc, sizeof(int)); + if (!result) { + break; + } + + const int work_groups_count[3] = 
{(int)M, (int)N, 1}; + const int work_group_size[3] = {32, 32, 1}; // test-value + + result = context.command_queue_inst_.DispatchCommand( + kernel_sgemm, work_groups_count, work_group_size); + if (!result) { + break; + } + + result = inOutC.ReadData(context.command_queue_inst_, C); + if (!result) { + break; + } + + } while (false); +} +} // namespace nntrainer diff --git a/nntrainer/layers/cl_layers/blas_kernels.h b/nntrainer/layers/cl_layers/blas_kernels.h new file mode 100644 index 0000000000..ad59b8bbd1 --- /dev/null +++ b/nntrainer/layers/cl_layers/blas_kernels.h @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Debadri Samaddar + * + * @file blas_kernels.h + * @date 14 May 2024 + * @brief Common blas OpenCL kernels + * @see https://github.com/nnstreamer/nntrainer + * @author Debadri Samaddar + * @bug No known bugs except for NYI items + * + */ + +#ifndef __BLAS_KERNELS_H__ +#define __BLAS_KERNELS_H__ + +#include +#include +#include +#include + +namespace nntrainer { + +/** + * @brief declaring global kernel objects + */ +extern opencl::Kernel kernel_sgemv; +extern opencl::Kernel kernel_sgemm; +extern opencl::Kernel kernel_dot; + +/** + * @brief sgemv computation : Y = A*X + Y + * @param[in] matAdata float * for Matrix A + * @param[in] vecXdata float * for Vector X + * @param[in] vecYdata float * for Vector Y + * @param[in] dim1 number of A's columns + * @param[in] dim2 number of A's rows + * @param[in] lda number of X's columns + * @param[in] context RunLayerContext reference + */ +void sgemv_cl(const float *matAdata, const float *vecXdata, float *vecYdata, + unsigned int dim1, unsigned int dim2, unsigned int lda, + RunLayerContext &context); + +/** + * @brief dot computation : sum of all X * Y + * @param[in] vecAdata float * for Vector A + * @param[in] vecXdata float * for Vector X + * @param[in] dim1 number of elements in both input vectors + * @param[in] context RunLayerContext reference + */ +float dot_cl(const float *vecAdata, const float *vecXdata, unsigned int dim1, + RunLayerContext &context); + +/** + * @brief sgemm computation : Y = op(A)*op(B) + C, + * where op(X) is one of X or X**T + * @param[in] A float * for Matrix A + * @param[in] B float * for Matrix B + * @param[in] C float * for Matrix C + * @param[in] M number of op(A)'s and C's row + * @param[in] N number of op(B)'s and C's columns + * @param[in] K number of op(A)'s and columns and op(B)'s rows + * @param[in] lda number of A's columns + * @param[in] ldb number of B's columns + * @param[in] ldc number of C's columns + * @param[in] context RunLayerContext reference + */ +void sgemm_cl(const float *A, const float *B, float *C, unsigned int M, + unsigned int N, unsigned int K, unsigned int lda, + unsigned int ldb, unsigned int ldc, RunLayerContext &context); + +} // namespace nntrainer +#endif /* __BLAS_KERNELS_H__ */ diff --git a/nntrainer/layers/cl_layers/fc_layer_cl.cpp b/nntrainer/layers/cl_layers/fc_layer_cl.cpp new file mode 100644 index 0000000000..b0a41c4e5f --- /dev/null +++ b/nntrainer/layers/cl_layers/fc_layer_cl.cpp @@ -0,0 +1,288 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Debadri Samaddar + * + * @file fc_layer_cl.cpp + * @date 7 May 2024 + * @brief This is Fully Connected Layer Class for Neural Network with OpenCl + * implementation + * @see https://github.com/nnstreamer/nntrainer + * @author Debadri Samaddar + * @bug No known bugs except for NYI items + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include 
+#include + +namespace nntrainer { + +static constexpr size_t SINGLE_INOUT_IDX = 0; + +enum FCParams { weight, bias }; + +FullyConnectedLayerCl::FullyConnectedLayerCl() : + LayerImpl(), fc_props(props::Unit()) { + weight_idx.fill(std::numeric_limits::max()); +} + +void FullyConnectedLayerCl::finalize(InitLayerContext &context) { + auto &weight_regularizer = + std::get(*layer_impl_props); + auto &weight_regularizer_constant = + std::get(*layer_impl_props); + auto &weight_initializer = + std::get(*layer_impl_props); + auto &weight_decay = std::get(*layer_impl_props); + auto &bias_decay = std::get(*layer_impl_props); + auto &bias_initializer = std::get(*layer_impl_props); + auto &disable_bias = std::get(*layer_impl_props); + + auto unit = std::get(fc_props).get(); + + NNTR_THROW_IF(context.getNumInputs() != 1, std::invalid_argument) + << "Fully connected layer takes only one input"; + + std::vector output_dims(1); + + /// @todo fc actaully supports multidimensions. EffDimFlag shouldn't be fixed + /// like this. + context.setEffDimFlagInputDimension(0, 0b1001); + context.setDynDimFlagInputDimension(0, 0b1000); + + bool is_nchw = (context.getFormat() == Tformat::NCHW); + /** set output dimensions */ + auto const &in_dim = context.getInputDimensions()[0]; + output_dims[0] = in_dim; + is_nchw ? output_dims[0].width(unit) : output_dims[0].channel(unit); + + output_dims[0].setTensorType( + {context.getFormat(), context.getActivationDataType()}); + + context.setOutputDimensions(output_dims); + + /** set weight specifications */ + // @todo : This NCHW format setting is just temporal, it needs to be set by + // global configuration + TensorDim bias_dim( + 1, is_nchw ? 1 : unit, 1, is_nchw ? unit : 1, + TensorDim::TensorType(context.getFormat(), context.getWeightDataType()), + is_nchw ? 0b0001 : 0b0100); + + TensorDim weight_dim( + 1, is_nchw ? 1 : unit, is_nchw ? in_dim.width() : 1, + is_nchw ? unit : in_dim.channel(), + TensorDim::TensorType(context.getFormat(), context.getWeightDataType()), + is_nchw ? 
0b0011 : 0b0101); + + weight_idx[FCParams::weight] = context.requestWeight( + weight_dim, weight_initializer, weight_regularizer, + weight_regularizer_constant, weight_decay, "weight", true); + + if (disable_bias.empty() || disable_bias.get() == false) { + weight_idx[FCParams::bias] = + context.requestWeight(bias_dim, bias_initializer, WeightRegularizer::NONE, + 1.0f, bias_decay, "bias", true); + } +} + +void FullyConnectedLayerCl::exportTo( + Exporter &exporter, const ml::train::ExportMethods &method) const { + LayerImpl::exportTo(exporter, method); + exporter.saveResult(fc_props, method, this); +} + +void FullyConnectedLayerCl::setProperty( + const std::vector &values) { + auto remain_props = loadProperties(values, fc_props); + LayerImpl::setProperty(remain_props); +} + +void FullyConnectedLayerCl::forwarding(RunLayerContext &context, + bool training) { + + Tensor &weight = context.getWeight(weight_idx[FCParams::weight]); + Tensor &hidden_ = context.getOutput(SINGLE_INOUT_IDX); + Tensor &input_ = context.getInput(SINGLE_INOUT_IDX); + + if (weight.getDataType() == nntrainer::Tdatatype::QINT4 || + weight.getDataType() == nntrainer::Tdatatype::QINT8) { + Tdatatype dtype = input_.getDataType(); + + Tensor weight_( + {{weight.batch(), weight.channel(), weight.height(), weight.width()}, + {weight.getFormat(), dtype}}, + true); + + unsigned int axis = + context.getWeightObject(weight_idx[FCParams::weight]).getOutputAxis(); + + weight.dequantize(weight_, axis); + + fcDotProcess(input_, weight_, hidden_, context); + } else { + fcDotProcess(input_, weight, hidden_, context); + } + + if (auto &disable_bias = std::get(*layer_impl_props); + disable_bias.empty() || disable_bias.get() == false) { + Tensor &bias = context.getWeight(weight_idx[FCParams::bias]); + hidden_.add_i(bias); + } +} + +void FullyConnectedLayerCl::fcDotProcess(Tensor const &input, + Tensor const &weight, Tensor &result, + RunLayerContext &context) { + // to do: + // NNTR_THROW_IF(!contiguous, std::invalid_argument) + // << getName() << " is not contiguous. Cannot dot product."; + + unsigned int dim1, dim2, mdim1, mdim2; + if (input.getFormat() == Tformat::NHWC) { + dim1 = input.batch() * input.height() * input.width(); + dim2 = input.channel(); + mdim1 = weight.batch() * weight.height() * weight.width(); + mdim2 = weight.channel(); + } else { + dim1 = input.batch() * input.channel() * input.height(); + dim2 = input.width(); + mdim1 = weight.batch() * weight.channel() * weight.height(); + mdim2 = weight.width(); + } + + unsigned int M, N, K, lda, ldb, ldc; + if (dim2 != mdim1) + throw std::runtime_error("Error: incompatible dimensions for dot product"); + K = mdim1; /** == dim2 */ + N = mdim2; + M = dim1; + if (input.getFormat() == Tformat::NHWC) { + CREATE_IF_EMPTY_DIMS(result, input.batch(), N, input.height(), + input.width(), + input.getTensorType()); // NHWC Result Tensor + } else { + CREATE_IF_EMPTY_DIMS(result, input.batch(), input.channel(), input.height(), + N, input.getTensorType()); + } + + lda = dim2; + ldb = mdim2; + ldc = + (input.getFormat() == Tformat::NHWC) ? result.channel() : result.width(); + + if (input.getDataType() == ml::train::TensorDim::DataType::FP32) { + const float *data = input.getData(); + const float *mdata = weight.getData(); + float *rdata = result.getData(); + + /// shortcut handling in case of vector + /// for vector, (1 * K) == (K * 1) in current memory layout... + /// and plaese note that N, K, M is a fixed place holder after considering + /// transpose. 
+ /// For example, there is no case like (1 * K) X (1 * K) while + /// (1 * K) X (1 * M) can be a case + /// case1: (1 * K) X (K * 1) + if (M == 1 && N == 1) { + *rdata = dot_cl(data, mdata, K, context) + (*rdata); + } + /// case2: (M * K) X (K * 1) + else if (N == 1) { + sgemv_cl(data, mdata, rdata, dim1, dim2, lda, context); + } + /// case3: (1 * K) X (K * N) = 1 * N = R + /// = R^T = (K * N) ^T * (1 * K) ^T = (N * K) * (K * 1) = (N * K) * (1 * K) + /// Effectively a translation of sgemv + else if (M == 1) { + sgemv_cl(mdata, data, rdata, mdim1, mdim2, ldb, context); + } + /// case others: use gemm + else { + sgemm_cl(data, mdata, rdata, M, N, K, lda, ldb, ldc, context); + } + } else + throw std::invalid_argument("Error: OpenCL fp16 is not supported yet."); +} + +void FullyConnectedLayerCl::incremental_forwarding(RunLayerContext &context, + unsigned int from, + unsigned int to, + bool training) { + Tensor w; + Tensor &weight = w; + context.getWeight(weight, weight_idx[FCParams::weight]); + + Tensor &input_ = context.getInput(SINGLE_INOUT_IDX); + Tensor &hidden_ = context.getOutput(SINGLE_INOUT_IDX); + + TensorDim input_dim = input_.getDim(); + TensorDim hidden_dim = hidden_.getDim(); + + TensorDim input_step_dim = input_dim; + TensorDim hidden_step_dim = hidden_dim; + + if (from) { + NNTR_THROW_IF(to - from != 1, std::invalid_argument) + << "incremental step size is not 1"; + from = 0; + to = 1; + } + + input_step_dim.height(to - from); + hidden_step_dim.height(to - from); + + // @todo: set reset stride as false. This implementation only works when batch + // size is 1 + Tensor input_step = input_.getSharedDataTensor(input_step_dim, 0, true); + Tensor hidden_step = hidden_.getSharedDataTensor(hidden_step_dim, 0, true); + + fcDotProcess(input_step, weight, hidden_step, context); + + if (auto &disable_bias = std::get(*layer_impl_props); + disable_bias.empty() || disable_bias.get() == false) { + Tensor &bias = context.getWeight(weight_idx[FCParams::bias]); + hidden_step.add_i(bias); + } +} + +void FullyConnectedLayerCl::calcDerivative(RunLayerContext &context) { + Tensor &weight = context.getWeight(weight_idx[FCParams::weight]); + + const Tensor &derivative_ = context.getIncomingDerivative(SINGLE_INOUT_IDX); + Tensor &ret_ = context.getOutgoingDerivative(SINGLE_INOUT_IDX); + + ret_.dot_deriv_wrt_1(weight, derivative_, false, false); +} + +void FullyConnectedLayerCl::calcGradient(RunLayerContext &context) { + Tensor &djdw = context.getWeightGrad(weight_idx[FCParams::weight]); + + const Tensor &derivative_ = context.getIncomingDerivative(SINGLE_INOUT_IDX); + Tensor &input_ = context.getInput(SINGLE_INOUT_IDX); + + if (auto &disable_bias = std::get(*layer_impl_props); + disable_bias.empty() || disable_bias.get() == false) { + Tensor &djdb = context.getWeightGrad(weight_idx[FCParams::bias]); + + if (context.isGradientFirstAccess(weight_idx[FCParams::bias])) { + derivative_.sum({0, 1, 2}, djdb); + } else { + /// @todo optimize below by adding beta to Tensor::sum + Tensor t = derivative_.sum({0, 1, 2}); + djdb.add_i(t); + } + } + + input_.dot_deriv_wrt_2( + djdw, derivative_, false, false, + !context.isGradientFirstAccess(weight_idx[FCParams::weight])); +} + +} /* namespace nntrainer */ diff --git a/nntrainer/layers/cl_layers/fc_layer_cl.h b/nntrainer/layers/cl_layers/fc_layer_cl.h new file mode 100644 index 0000000000..c94ecb22d7 --- /dev/null +++ b/nntrainer/layers/cl_layers/fc_layer_cl.h @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Debadri Samaddar + * 
+ * @file fc_layer_cl.h + * @date 7 May 2024 + * @brief This is Fully Connected Layer Class of Neural Network with OpenCl + * implementation + * @see https://github.com/nnstreamer/nntrainer + * @author Debadri Samaddar + * @bug No known bugs except for NYI items + * + */ + +#ifndef __FC_LAYER_CL_H__ +#define __FC_LAYER_CL_H__ +#ifdef __cplusplus + +#include +#include + +#define CREATE_IF_EMPTY_DIMS(tensor, ...) \ + do { \ + if (tensor.empty()) \ + tensor = Tensor(__VA_ARGS__); \ + } while (0); + +namespace nntrainer { + +/** + * @class FullyConnecedLayer + * @brief fully connected layer + */ +class FullyConnectedLayerCl : public LayerImpl { +public: + /** + * @brief Constructor of Fully Connected Layer + */ + FullyConnectedLayerCl(); + + /** + * @brief Destructor of Fully Connected Layer + */ + ~FullyConnectedLayerCl() = default; + + /** + * @brief Move constructor. + * @param[in] FullyConnected && + */ + FullyConnectedLayerCl(FullyConnectedLayerCl &&rhs) noexcept = default; + + /** + * @brief Move assignment operator. + * @parma[in] rhs FullyConnectedLayer to be moved. + */ + FullyConnectedLayerCl &operator=(FullyConnectedLayerCl &&rhs) = default; + + /** + * @copydoc Layer::finalize(InitLayerContext &context) + */ + void finalize(InitLayerContext &context) override; + + /** + * @copydoc Layer::forwarding(RunLayerContext &context, bool training) + */ + void forwarding(RunLayerContext &context, bool training) override; + + /** + * @copydoc Layer::incremental_forwarding(RunLayerContext &context, unsigned + * int from, unsigned int to, bool training) + */ + void incremental_forwarding(RunLayerContext &context, unsigned int from, + unsigned int to, bool training) override; + + /** + * @copydoc Layer::calcDerivative(RunLayerContext &context) + */ + void calcDerivative(RunLayerContext &context) override; + + /** + * @copydoc Layer::calcGradient(RunLayerContext &context) + */ + void calcGradient(RunLayerContext &context) override; + + /** + * @copydoc Layer::exportTo(Exporter &exporter, ml::train::ExportMethods + * method) + */ + void exportTo(Exporter &exporter, + const ml::train::ExportMethods &method) const override; + + /** + * @copydoc Layer::getType() + */ + const std::string getType() const override { + return FullyConnectedLayerCl::type; + }; + + /** + * @brief Process data and dimensions for dot operation used in fc_layer + * @param[in] input Tensor + * @param[in] weight Tensor + * @param[in] result Tensor + * @param[in] RunLayerContext reference + */ + void fcDotProcess(Tensor const &input, Tensor const &weight, Tensor &result, + RunLayerContext &context); + + /** + * @copydoc Layer::supportBackwarding() + */ + bool supportBackwarding() const override { return true; } + + /** + * @copydoc Layer::setProperty(const PropertyType type, const std::string + * &value) + */ + void setProperty(const std::vector &values) override; + + inline static const std::string type = "fully_connected"; + +private: + std::tuple + fc_props; /**< fc layer properties : unit - number of output neurons */ + std::array weight_idx; /**< indices of the weights */ +}; +} // namespace nntrainer + +#endif /* __cplusplus */ +#endif /* __FC_LAYER_CL__ */ diff --git a/nntrainer/layers/cl_layers/meson.build b/nntrainer/layers/cl_layers/meson.build new file mode 100644 index 0000000000..2f1ba7fc03 --- /dev/null +++ b/nntrainer/layers/cl_layers/meson.build @@ -0,0 +1,8 @@ +cl_layer_sources = [ + 'fc_layer_cl.cpp', + 'blas_kernels.cpp' +] + +foreach s : cl_layer_sources + nntrainer_sources += meson.current_source_dir() / s 
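Editor's note: the case analysis in fcDotProcess() above reduces to a small dispatch rule over the operand shapes; a standalone sketch of it (the enum and helper below are illustrative, not part of the patch):

// Editor's sketch of how the OpenCL path picks a kernel: a pure dot product
// when both sides are vectors, a matrix-vector product when either side is a
// vector, and a general GEMM otherwise.
enum class BlasCall { Dot, Gemv, Gemm };

inline BlasCall pickBlasCall(unsigned int M, unsigned int N) {
  if (M == 1 && N == 1)  // (1 x K) . (K x 1): dot_cl
    return BlasCall::Dot;
  if (N == 1 || M == 1)  // one operand is a vector: sgemv_cl (possibly transposed)
    return BlasCall::Gemv;
  return BlasCall::Gemm; // general case: sgemm_cl
}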
+endforeach diff --git a/nntrainer/layers/conv2d_layer.cpp b/nntrainer/layers/conv2d_layer.cpp index c059ae9caf..5d9dbc1e19 100644 --- a/nntrainer/layers/conv2d_layer.cpp +++ b/nntrainer/layers/conv2d_layer.cpp @@ -38,7 +38,8 @@ namespace { static TensorDim calcCol2ImOutputDim(const TensorDim &out, const TensorDim &kdim) { - return TensorDim({kdim.getFeatureLen(), out.width() * out.height()}); + return TensorDim({kdim.getFeatureLen(), out.width() * out.height()}, + out.getTensorType()); } /** @@ -56,7 +57,10 @@ static void col2im(const Tensor &col_matrix, const TensorDim &kdim, const std::array &mstride, const std::array &dilation, Tensor &image) { - auto [pt, pb, pl, pr] = padding; + auto pt = padding[0]; + auto pb = padding[1]; + auto pl = padding[2]; + auto pr = padding[3]; unsigned k_height = kdim.height(); unsigned k_width = kdim.width(); @@ -84,32 +88,48 @@ static void col2im(const Tensor &col_matrix, const TensorDim &kdim, int h_stride_end = im_eff_height - eff_k_height - pt; int w_stride_end = im_eff_width - eff_k_width - pl; - unsigned col_w = 0; - for (int hs = -pt; hs <= h_stride_end; hs += hstride) { - for (int ws = -pl; ws <= w_stride_end; ws += wstride) { - unsigned col_h = 0; - int patch_height_end = hs + eff_k_height; - int patch_width_end = ws + eff_k_width; - for (unsigned c = 0; c < im_channel; c++) { - for (int h = hs; h < patch_height_end; h += hdilation) { - if (h < 0 || im_height <= h) { - col_h += k_width; - continue; - } - for (int w = ws; w < patch_width_end; w += wdilation) { - if (w < 0 || im_width <= w) { - col_h++; + auto apply_data = [&](T *val) { + unsigned col_w = 0; + for (int hs = -pt; hs <= h_stride_end; hs += hstride) { + for (int ws = -pl; ws <= w_stride_end; ws += wstride) { + unsigned col_h = 0; + int patch_height_end = hs + eff_k_height; + int patch_width_end = ws + eff_k_width; + for (unsigned c = 0; c < im_channel; c++) { + for (int h = hs; h < patch_height_end; h += hdilation) { + if (h < 0 || im_height <= h) { + col_h += k_width; continue; } - - float *val = image.getAddress(0, c, h, w); - *val += col_matrix.getValue(0, 0, col_h, col_w); - col_h++; + for (int w = ws; w < patch_width_end; w += wdilation) { + if (w < 0 || im_width <= w) { + col_h++; + continue; + } + + val = image.getAddress(0, c, h, w); + *val += col_matrix.getValue(0, 0, col_h, col_w); + col_h++; + } } } + col_w++; } - col_w++; } + }; + + if (image.getDataType() == nntrainer::Tdatatype::FP32) { + float val; + apply_data(&val); + } +#ifdef ENABLE_FP16 + else if (image.getDataType() == nntrainer::Tdatatype::FP16) { + _FP16 val; + apply_data(&val); + } +#endif + else { + throw std::runtime_error("Not supported datatype"); } } @@ -179,7 +199,10 @@ static void im2col(const Tensor &in, const TensorDim &kdim, // } */ - auto [pt, pb, pl, pr] = padding; + auto pt = padding[0]; + auto pb = padding[1]; + auto pl = padding[2]; + auto pr = padding[3]; unsigned int channel = in.channel(); int in_height = in.height(); @@ -198,46 +221,62 @@ static void im2col(const Tensor &in, const TensorDim &kdim, unsigned int out_width = (width - eff_k_width) / mstride[1] + 1; out.reshape( - TensorDim({out_height * out_width, in.channel() * k_height * k_width})); - float *out_data = out.getData(); - - int h_stride_end = height - eff_k_height - pt; - int w_stride_end = width - eff_k_width - pl; - - /// get a patch, size of kernel - /// hs is height_strided, ws is width_strided - unsigned int owidth = out.width(); - unsigned int base_im_w = 0; - for (int hs = -pt; hs <= h_stride_end; hs += mstride[0]) { - 
unsigned int base_im_h = 0; - int patch_height_end = eff_k_height + hs; - /// map the patch to a single line looping through channel - for (unsigned int c = 0; c < channel; ++c) { - for (int h = hs; h < patch_height_end; h += dilation[0]) { - if (h < 0 || in_height <= h) { - base_im_h += k_width; - continue; - } - - unsigned int im_w = base_im_w; - for (int ws = -pl; ws <= w_stride_end; ws += mstride[1]) { - unsigned int im_h = base_im_h; - int patch_width_end = eff_k_width + ws; + TensorDim({out_height * out_width, in.channel() * k_height * k_width}, + in.getTensorType())); + + auto apply_data = [&](T *out_data) { + int h_stride_end = height - eff_k_height - pt; + int w_stride_end = width - eff_k_width - pl; + + /// get a patch, size of kernel + /// hs is height_strided, ws is width_strided + unsigned int owidth = out.width(); + unsigned int base_im_w = 0; + for (int hs = -pt; hs <= h_stride_end; hs += mstride[0]) { + unsigned int base_im_h = 0; + int patch_height_end = eff_k_height + hs; + /// map the patch to a single line looping through channel + for (unsigned int c = 0; c < channel; ++c) { + for (int h = hs; h < patch_height_end; h += dilation[0]) { + if (h < 0 || in_height <= h) { + base_im_h += k_width; + continue; + } - for (int w = ws; w < patch_width_end; w += dilation[1]) { - if (w < 0 || in_width <= w) { + unsigned int im_w = base_im_w; + for (int ws = -pl; ws <= w_stride_end; ws += mstride[1]) { + unsigned int im_h = base_im_h; + int patch_width_end = eff_k_width + ws; + + for (int w = ws; w < patch_width_end; w += dilation[1]) { + if (w < 0 || in_width <= w) { + im_h++; + continue; + } + out_data[im_w * owidth + im_h] = in.getValue(0, c, h, w); im_h++; - continue; } - out_data[im_w * owidth + im_h] = in.getValue(0, c, h, w); - im_h++; + im_w++; } - im_w++; + base_im_h += k_width; } - base_im_h += k_width; } + base_im_w += out_width; } - base_im_w += out_width; + }; + + if (out.getDataType() == nntrainer::Tdatatype::FP32) { + float *out_data = out.getData(); + apply_data(out_data); + } +#ifdef ENABLE_FP16 + else if (out.getDataType() == nntrainer::Tdatatype::FP16) { + _FP16 *out_data = out.getData<_FP16>(); + apply_data(out_data); + } +#endif + else { + throw std::runtime_error("Not supported datatype"); } } @@ -279,9 +318,11 @@ void Conv2DLayer::finalize(InitLayerContext &context) { auto &dilation = std::get>(conv_props); - TensorDim kernel_dim = - TensorDim(filter_size, in_dim.channel(), kernel_size[0], kernel_size[1]); - TensorDim bias_dim = TensorDim(1, filter_size, 1, 1); + auto in_t_type = in_dim.getTensorType(); + in_t_type.data_type = context.getWeightDataType(); + TensorDim kernel_dim = TensorDim(filter_size, in_dim.channel(), + kernel_size[0], kernel_size[1], in_t_type); + TensorDim bias_dim = TensorDim(1, filter_size, 1, 1, in_t_type); padding = std::get(conv_props) .compute(in_dim, kernel_dim, {stride[0], stride[1]}, @@ -309,6 +350,7 @@ void Conv2DLayer::finalize(InitLayerContext &context) { out_dim.channel(filter_size); out_dim.height((eff_in_height - eff_k_height) / stride[0] + 1); out_dim.width((eff_in_width - eff_k_width) / stride[1] + 1); + out_dim.setTensorType(in_dim.getTensorType()); context.setOutputDimensions({out_dim}); NNTR_THROW_IF(eff_in_height < kernel_size[0] || eff_in_width < kernel_size[1], diff --git a/nntrainer/layers/fc_layer.cpp b/nntrainer/layers/fc_layer.cpp index de34f5f921..436a936439 100644 --- a/nntrainer/layers/fc_layer.cpp +++ b/nntrainer/layers/fc_layer.cpp @@ -40,8 +40,11 @@ enum FCParams { weight, bias }; enum LORAParams { loraA, 
loraB, loraTmp, loraOut }; FullyConnectedLayer::FullyConnectedLayer() : - LayerImpl(), fc_props(props::Unit(), props::LoraRank(), props::LoraAlpha()) { + LayerImpl(), + lora_scaling(1.0f), + fc_props(props::Unit(), props::LoraRank(), props::LoraAlpha()) { weight_idx.fill(std::numeric_limits::max()); + lora_idx.fill(std::numeric_limits::max()); } void FullyConnectedLayer::finalize(InitLayerContext &context) { diff --git a/nntrainer/layers/fc_layer.h b/nntrainer/layers/fc_layer.h index cb3726b020..44ef99d912 100644 --- a/nntrainer/layers/fc_layer.h +++ b/nntrainer/layers/fc_layer.h @@ -114,7 +114,7 @@ class FullyConnectedLayer : public LayerImpl { lora_scaling - scaling factor of LoRA apply, i.e., lora_scaling = alpha / lora_rank */ std::array weight_idx; /**< indices of the weights */ - std::array lora_idx; /**< indices of the lora weights */ + std::array lora_idx; /**< indices of the lora weights */ }; } // namespace nntrainer diff --git a/nntrainer/layers/input_layer.cpp b/nntrainer/layers/input_layer.cpp index eabd40b297..a67701da2c 100644 --- a/nntrainer/layers/input_layer.cpp +++ b/nntrainer/layers/input_layer.cpp @@ -34,7 +34,8 @@ static constexpr size_t SINGLE_INOUT_IDX = 0; InputLayer::InputLayer() : Layer(), - input_props(props::Normalization(), props::Standardization()) {} + input_props(props::Normalization(), props::Standardization()), + is_inplace(true) {} void InputLayer::setProperty(const std::vector &values) { auto remain_props = loadProperties(values, input_props); @@ -47,7 +48,7 @@ void InputLayer::forwarding(RunLayerContext &context, bool training) { Tensor &hidden_ = context.getOutput(SINGLE_INOUT_IDX); if (!context.executeInPlace()) { Tensor &input_ = context.getInput(SINGLE_INOUT_IDX); - hidden_.copy(input_); + hidden_.copyData(input_); } if (std::get(input_props)) @@ -70,7 +71,22 @@ void InputLayer::finalize(InitLayerContext &context) { std::vector output_dims = context.getInputDimensions(); + for (auto &d : output_dims) { + d.setDataType(context.getActivationDataType()); + } + context.setOutputDimensions(output_dims); + + is_inplace = true; + + /** + * @note Input Layer assuems that the FP32 IN Tensor always. Therefore, if the + * activation data type is not fp32, then it does not support in-place + * operation. 
+ */ + if (context.getActivationDataType() != ml::train::TensorDim::DataType::FP32) { + is_inplace = false; + } } } /* namespace nntrainer */ diff --git a/nntrainer/layers/input_layer.h b/nntrainer/layers/input_layer.h index f6728d676b..e9183e23d1 100644 --- a/nntrainer/layers/input_layer.h +++ b/nntrainer/layers/input_layer.h @@ -82,7 +82,7 @@ class InputLayer : public Layer { /** * @copydoc Layer::supportInPlace() */ - bool supportInPlace() const override { return true; } + bool supportInPlace() const override { return is_inplace; } /** * @copydoc Layer::exportTo(Exporter &exporter, ml::train::ExportMethods @@ -105,6 +105,7 @@ class InputLayer : public Layer { private: std::tuple input_props; + bool is_inplace; }; } // namespace nntrainer diff --git a/nntrainer/layers/layer_context.cpp b/nntrainer/layers/layer_context.cpp index fff2eb15ec..add78c09cb 100644 --- a/nntrainer/layers/layer_context.cpp +++ b/nntrainer/layers/layer_context.cpp @@ -126,13 +126,14 @@ const std::vector &InitLayerContext::getOutSpecs() const { } RunLayerContext::RunLayerContext(const std::string &name, bool trainable, - float l, bool in_place_, + float l, bool in_place_, float loss_scale_, const std::vector &w, const std::vector &in, const std::vector &out, const std::vector &t) : loss(l), in_place(in_place_), + loss_scale(loss_scale_), weights(w), inputs(in), outputs(out), @@ -169,6 +170,19 @@ Tensor &RunLayerContext::getWeightGrad(unsigned int idx) const { return weights[idx]->getGradientRef(); } +/** + * @brief Get the Weight Gradient tensor object + * + * @param idx Identifier of the weight + * @return Tensor& Reference to the weight grad tensor + */ +Tensor &RunLayerContext::getWeightFP32(unsigned int idx) const { + if (!weights[idx]->hasGradient()) + throw std::invalid_argument( + "Requesting gradient for a non-trainable weight."); + return weights[idx]->getVariableFP32Ref(); +} + /** * @brief Get the Weight Optimizer Variable tensor object * @@ -402,6 +416,17 @@ bool RunLayerContext::isGradientClipByGlobalNorm(unsigned int idx) const { return weights[idx]->isGradientClipByGlobalNorm(); } +bool RunLayerContext::isMixedPrecision(unsigned int idx) const { + return weights[idx]->isMixedPrecision(); +} + +bool RunLayerContext::isMixedPrecision() const { + for (auto w : weights) + if (w->isMixedPrecision()) + return true; + return false; +} + /** * @brief Get the tensor name * @@ -650,10 +675,12 @@ bool RunLayerContext::clCreateKernel(std::string kernel_string, */ std::string RunLayerContext::getKernelName(LayerKernel layerKernel) { switch (layerKernel) { - case LayerKernel::KERNEL_NAME1: - return "kernel_name1"; - case LayerKernel::KERNEL_NAME2: - return "kernel_name2"; + case LayerKernel::SGEMV: + return "sgemv_cl"; + case LayerKernel::DOT: + return "dot_cl"; + case LayerKernel::SGEMM: + return "sgemm_cl"; default: return ""; } diff --git a/nntrainer/layers/layer_context.h b/nntrainer/layers/layer_context.h index e5c6759638..2a32ba7287 100644 --- a/nntrainer/layers/layer_context.h +++ b/nntrainer/layers/layer_context.h @@ -63,7 +63,7 @@ class InitLayerContext { const float max_norm = 0.0, std::array tensor_type_ = {"NCHW", "FP32", "FP32"}, - const float loss_scale = 0.0); + const float loss_scale = 1.0); /** * @brief get Tensor Format of Layer * @@ -348,6 +348,14 @@ class InitLayerContext { */ bool executeInPlace() const { return in_place; } + /** + * @brief get Initial value of Loss_Scale. 
This is set to RunLayerContext + * and updated + * + * @return loss_scale + */ + float getLossScale() const { return loss_scale; } + private: std::vector input_dim; /**< Input dimensions for the layer */ bool in_place; /**< if the layer is expected to run in-place */ @@ -385,7 +393,7 @@ class RunLayerContext { * @brief Construct a new Run Layer Context object * */ - RunLayerContext() : loss(0.0), in_place(false) {} + RunLayerContext() : loss(0.0), in_place(false), loss_scale(1.0) {} /** * @brief Construct a new Run Layer Context object @@ -396,6 +404,17 @@ class RunLayerContext { std::get(props).set(name); } + /** + * @brief Construct a new Run Layer Context object + * + */ + RunLayerContext(const std::string &name, bool in_place_, float loss_scale_) : + RunLayerContext() { + in_place = in_place_; + std::get(props).set(name); + loss_scale = loss_scale_; + } + /** * @brief Construct a new Run Layer Context object * @@ -403,13 +422,15 @@ class RunLayerContext { * @param trainable if the layer is trainable * @param l loss of the layer * @param in_place_ execution in-place of the layer + * @param loss_scale loss_scale of the layer * @param w weights of the layer * @param in inputs of the layer * @param out outputs of the layer * @param t extra tensors of the layer */ RunLayerContext(const std::string &name, bool trainable, float l, - bool in_place_, const std::vector &w, + bool in_place_, float loss_scale_, + const std::vector &w, const std::vector &in, const std::vector &out, const std::vector &t); @@ -463,6 +484,15 @@ class RunLayerContext { Tensor &getWeightGrad(unsigned int idx) const; /** + * @brief Get the Weight Gradient tensor object + * + * @param idx Identifier of the weight + * @return Tensor& Reference to the weight grad tensor + */ + Tensor &getWeightFP32(unsigned int idx) const; + + /** + * @brief Get the Weight Optimizer Variable tensor object * * @param idx Identifier of the weight @@ -659,6 +689,20 @@ class RunLayerContext { */ bool isGradientClipByGlobalNorm(unsigned int idx) const; + /** + * @brief check if the weight is mixed precsion + * + * @param idx index + * @return bool true if it is mixed precision + */ + bool isMixedPrecision(unsigned int idx) const; + + /** + * @brief check if the weight is mixed precsion + * @return bool true if it is mixed precision + */ + bool isMixedPrecision() const; + /** * @brief Get the tensor name * @@ -830,8 +874,9 @@ class RunLayerContext { * getKernelName function. */ enum LayerKernel { - KERNEL_NAME1 = 1, /**< placeholder for kernel name */ - KERNEL_NAME2 = 2 /**< placeholder for kernel name */ + SGEMV = 1, /**< placeholder for kernel name */ + DOT = 2, /**< placeholder for kernel name */ + SGEMM = 4 /**< placeholder for kernel name */ }; /** @@ -874,10 +919,29 @@ class RunLayerContext { */ ml::train::LayerComputeEngine getComputeEngine() { return compute_engine; } + /** + * @brief get loss scale + * @return loss scale + */ + float getLossScale() { return loss_scale; } + + /** + * @brief set Loss_Scale. 
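Editor's note: the practical effect of the loss scale plumbed through RunLayerContext is easiest to see on the derivative leaving the loss layer. A minimal standalone sketch, with a plain vector in place of Tensor and an illustrative name:

// Editor's sketch: amplify the loss derivative by the current scale so small
// low-precision gradients do not flush to zero during backpropagation; the
// scale is divided back out before the optimizer applies the gradient.
#include <vector>

inline void scaleLossDerivative(std::vector<float> &dloss, float loss_scale) {
  if (loss_scale == 1.0f)
    return;              // default scale, nothing to do
  for (float &v : dloss)
    v *= loss_scale;
}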
+ * + * @return loss_scale + */ + void setLossScale(float scale) { + loss_scale = scale; + for (auto w : weights) { + w->setLossScale(scale); + } + } + private: std::tuple props; /**< props of the layer */ float loss; /**< loss of the layer */ - bool in_place; /**< if the layer is expected to run in-place */ + bool in_place; /**< if the layer is expected to run in-place */ + float loss_scale; /**< loss_scale of the layer */ std::vector weights; /**< weights of the layer */ std::vector inputs; /**< inputs of the layer */ diff --git a/nntrainer/layers/layer_devel.h b/nntrainer/layers/layer_devel.h index 54ce1a0ee9..44a87cc7e9 100644 --- a/nntrainer/layers/layer_devel.h +++ b/nntrainer/layers/layer_devel.h @@ -259,6 +259,11 @@ class Layer { * @return true if supports backwarding, else false */ virtual bool supportBackwarding() const = 0; + + /** + * @brief Set loss scale factor + */ + virtual void setLossScale(float scale) {} }; /// @todo Decide where to put and how to implement(#986) diff --git a/nntrainer/layers/layer_node.cpp b/nntrainer/layers/layer_node.cpp index 8b18d80762..114555fee4 100644 --- a/nntrainer/layers/layer_node.cpp +++ b/nntrainer/layers/layer_node.cpp @@ -180,6 +180,7 @@ LayerNode::LayerNode(std::unique_ptr &&l) : inplace(InPlace::NONE), needs_calc_derivative(false), needs_calc_gradient(false), + output_connections(), run_context(nullptr), layer_node_props( @@ -190,7 +191,8 @@ LayerNode::LayerNode(std::unique_ptr &&l) : new RealizationPropsType(props::Flatten(), props::Activation())), loss(new props::Loss()), regularization_loss(0.0f), - exec_order({0, 0, 0, 0}) { + exec_order({0, 0, 0, 0}), + needs_output_set_zero(false) { if (layer && layer->getType() == TimeDistLayer::type) { std::get(*layer_node_props).set(true); } @@ -475,6 +477,9 @@ void LayerNode::read(std::ifstream &file, bool opt_var) { /// @note shared weights are only be read at the first acecss if (run_context->isGradientLastAccess(i)) { run_context->getWeight(i).read(file); + if (run_context->isMixedPrecision(i) && getTrainable()) { + run_context->getWeightFP32(i).copyData(run_context->getWeight(i)); + } } } } @@ -599,7 +604,7 @@ InitLayerContext LayerNode::finalize(const std::vector &input_dims, const auto &scope = getSharedFrom().empty() ? 
getName() : getSharedFrom(); float max_norm = 0.0; - float loss_scale = 0.0; + float loss_scale = 1.0; if (!std::get(*layer_node_props).empty()) max_norm = std::get(*layer_node_props).get(); @@ -748,8 +753,21 @@ LayerNode::refinalize(const std::vector &input_dims) { */ void LayerNode::forwarding(bool training) { loss->set(run_context->getRegularizationLoss()); + PROFILE_TIME_START(forward_event_key); + if (needsOutputSetZero()) { + for (unsigned int i = 0; i < run_context->getNumOutputs(); ++i) { + run_context->getOutput(i).setValue(0); + run_context->getOutgoingDerivative(i).setValue(0); + } + + for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) { + run_context->getWeightGrad(i).setValue(0); + } + } + layer->forwarding(*run_context, training); + needsOutputSetZero(false); PROFILE_TIME_END(forward_event_key); TRACE_MEMORY() << getName() + ": F"; TRACE_TIME() << getName() + ": F"; @@ -864,10 +882,11 @@ float LayerNode::getLoss() const { return *loss; } void LayerNode::configureRunContext(const std::vector &weights, const std::vector &inputs, const std::vector &outputs, - const std::vector &tensors) { + const std::vector &tensors, + float loss_scale) { run_context = std::make_unique( - getName(), getTrainable(), 0.0f, executeInPlace() != InPlace::NONE, weights, - inputs, outputs, tensors); + getName(), getTrainable(), 0.0f, executeInPlace() != InPlace::NONE, + loss_scale, weights, inputs, outputs, tensors); } /** diff --git a/nntrainer/layers/layer_node.h b/nntrainer/layers/layer_node.h index 93e7ac7069..c2202f20aa 100644 --- a/nntrainer/layers/layer_node.h +++ b/nntrainer/layers/layer_node.h @@ -487,6 +487,7 @@ class LayerNode final : public ml::train::Layer, public GraphNode { const std::vector getOutputDimensions() const; /** * @brief Get the Weight object + * currently, only unittest uses this func. 
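Editor's note: together, the FP32 master weight exposed by getWeightFP32() and the loss scale support the usual mixed-precision update. A hedged standalone sketch of that step, where vectors stand in for the low-precision compute weight and its FP32 master, and plain SGD is used only for illustration:

// Editor's sketch, not the Weight/optimizer API: the scaled gradient computed
// on the low-precision side is unscaled, applied to the FP32 master copy, and
// the compute copy is refreshed from the master for the next pass.
#include <cstddef>
#include <vector>

inline void mixedPrecisionSgdStep(std::vector<float> &master_fp32,
                                  std::vector<float> &compute_lowp, // _FP16 in practice
                                  const std::vector<float> &scaled_grad,
                                  float lr, float loss_scale) {
  for (std::size_t i = 0; i < master_fp32.size(); ++i) {
    master_fp32[i] -= lr * (scaled_grad[i] / loss_scale); // unscale, then step
    compute_lowp[i] = master_fp32[i];                      // cast back
  }
}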
* * @param idx Identifier of the weight * @return Weight& Reference to the weight @@ -495,11 +496,11 @@ class LayerNode final : public ml::train::Layer, public GraphNode { NNTR_THROW_IF(!run_context, std::runtime_error) << __func__ << " layer needs to be finalized first!"; if (run_context->weightHasGradient(idx)) { - return Weight(run_context->getWeight(idx), - run_context->getWeightGrad(idx), - run_context->getWeightName(idx)); + return Weight( + run_context->getWeight(idx), run_context->getWeightGrad(idx), + run_context->getWeightFP32(idx), run_context->getWeightName(idx)); } else { - return Weight(run_context->getWeight(idx), Tensor(), + return Weight(run_context->getWeight(idx), Tensor(), Tensor(), run_context->getWeightName(idx)); } } @@ -819,7 +820,8 @@ class LayerNode final : public ml::train::Layer, public GraphNode { void configureRunContext(const std::vector &weights, const std::vector &inputs, const std::vector &outputs, - const std::vector &tensors); + const std::vector &tensors, + float loss_scale); /** * @brief Preset modes for printing summary for the layer @@ -877,6 +879,13 @@ class LayerNode final : public ml::train::Layer, public GraphNode { needs_calc_derivative = nb; } + /** + * @brief Set if the layer output needs reinitialization @mixed precsion + * + * @param nb true if the layer needs to do reinitialization, eles false + */ + void needsOutputSetZero(bool nb) { needs_output_set_zero = nb; } + /** * @brief Set if the layer needs to do calculation of gradients * @@ -898,6 +907,13 @@ class LayerNode final : public ml::train::Layer, public GraphNode { */ bool needsCalcGradient() { return needs_calc_gradient; } + /** + * @brief Set if the layer needs to reinitialization @mixed precsion + * + * @param nb true if the layer needs reinitialization, eles false + */ + bool needsOutputSetZero() { return needs_output_set_zero; } + private: /** * @brief Get the Input Layers object @@ -964,6 +980,9 @@ properties in the context/graph unless intended. 
*/ ExecutionOrder exec_order; /**< order/location of execution for this node in forward and backwarding operations */ + bool needs_output_set_zero; /**< cache if this layer needs reinitialization + output */ + /** * @brief Get the effective layer managed by this layer node * diff --git a/nntrainer/layers/loss/loss_layer.cpp b/nntrainer/layers/loss/loss_layer.cpp index 40f74717f8..8d18878f49 100644 --- a/nntrainer/layers/loss/loss_layer.cpp +++ b/nntrainer/layers/loss/loss_layer.cpp @@ -22,8 +22,12 @@ void LossLayer::finalize(InitLayerContext &context) { d.setDataType( str_converter::from_string("FP32")); - + context.setOutputDimensions(output_dim); + + is_inplace = true; + if (context.getActivationDataType() != ml::train::TensorDim::DataType::FP32) + is_inplace = false; } void LossLayer::updateLoss(RunLayerContext &context, const Tensor &l) { @@ -36,6 +40,13 @@ void LossLayer::updateLoss(RunLayerContext &context, const Tensor &l) { context.setLoss(loss_sum / (float)l.batch()); } +void LossLayer::applyLossScale(RunLayerContext &context, Tensor &ret_deriv) { + + float loss_scale = context.getLossScale(); + if (loss_scale != 1.0) + ret_deriv.multiply_i(loss_scale); +} + /** * @copydoc Layer::setProperty(const std::vector &values) */ diff --git a/nntrainer/layers/loss/loss_layer.h b/nntrainer/layers/loss/loss_layer.h index 00b520f6e6..418777606c 100644 --- a/nntrainer/layers/loss/loss_layer.h +++ b/nntrainer/layers/loss/loss_layer.h @@ -47,6 +47,8 @@ class LossLayer : public Layer { */ virtual bool supportBackwarding() const override { return true; } + bool supportInPlace() const override {return is_inplace;} + /** * @copydoc Layer::requireLabel() */ @@ -60,8 +62,17 @@ class LossLayer : public Layer { */ void updateLoss(RunLayerContext &context, const Tensor &l); + /** + * @brief update return derivative with loss scale + * @param context Run context to update + * @param return_dev Tensor data to calculate + */ + void applyLossScale(RunLayerContext &context, Tensor &l); + Tensor l; /**< loss tensor to store intermediate value to calculate loss value */ + + bool is_inplace; }; } // namespace nntrainer diff --git a/nntrainer/layers/loss/mse_loss_layer.cpp b/nntrainer/layers/loss/mse_loss_layer.cpp index 7f7bd1626f..356acae6f5 100644 --- a/nntrainer/layers/loss/mse_loss_layer.cpp +++ b/nntrainer/layers/loss/mse_loss_layer.cpp @@ -20,7 +20,16 @@ static constexpr size_t SINGLE_INOUT_IDX = 0; void MSELossLayer::forwarding(RunLayerContext &context, bool training) { Tensor &hidden_ = context.getOutput(SINGLE_INOUT_IDX); - Tensor &y = context.getInput(SINGLE_INOUT_IDX); + + Tensor empty_tensor; + Tensor &y = context.getInput(SINGLE_INOUT_IDX).getDataType() == + ml::train::TensorDim::DataType::FP32 + ? context.getInput(SINGLE_INOUT_IDX) + : empty_tensor; + + if (y.empty()) + y = context.getInput(SINGLE_INOUT_IDX) + .clone(ml::train::TensorDim::DataType::FP32); // hidden_ <- y2 - y; if (context.isLabelAvailable(SINGLE_INOUT_IDX)) { @@ -41,9 +50,28 @@ void MSELossLayer::forwarding(RunLayerContext &context, bool training) { } void MSELossLayer::calcDerivative(RunLayerContext &context) { - Tensor &ret_derivative = context.getOutgoingDerivative(SINGLE_INOUT_IDX); + Tensor empty_tensor; + + Tensor &ret_derivative = + context.getOutgoingDerivative(SINGLE_INOUT_IDX).getDataType() == + ml::train::TensorDim::DataType::FP32 + ? 
context.getOutgoingDerivative(SINGLE_INOUT_IDX) + : empty_tensor; + + if (ret_derivative.empty()) + ret_derivative = context.getOutgoingDerivative(SINGLE_INOUT_IDX) + .clone(ml::train::TensorDim::DataType::FP32); + Tensor empty_tensor1; + Tensor &y = context.getInput(SINGLE_INOUT_IDX).getDataType() == + ml::train::TensorDim::DataType::FP32 + ? context.getInput(SINGLE_INOUT_IDX) + : empty_tensor1; + + if (y.empty()) + y = context.getInput(SINGLE_INOUT_IDX) + .clone(ml::train::TensorDim::DataType::FP32); + const Tensor &y2 = context.getIncomingDerivative(SINGLE_INOUT_IDX); - Tensor &y = context.getInput(SINGLE_INOUT_IDX); y.subtract(y2, ret_derivative); float divider = ((float)y.size()) / 2; @@ -51,6 +79,16 @@ void MSELossLayer::calcDerivative(RunLayerContext &context) { throw std::runtime_error( "[MSELossLayer::calcDerivative] Error when calculating loss"); } + + // Loss Scale needs Full precsiion of ret_derivative. Therefore, + // ret_derivateive should be FP32 when applying scale, and after applying it + // need to convert original type for backpropagating. + + LossLayer::applyLossScale(context, ret_derivative); + + if (context.getOutgoingDerivative(SINGLE_INOUT_IDX).getDataType() != + ml::train::TensorDim::DataType::FP32) + context.getOutgoingDerivative(SINGLE_INOUT_IDX).copyData(ret_derivative); } } // namespace nntrainer diff --git a/nntrainer/layers/loss/mse_loss_layer.h b/nntrainer/layers/loss/mse_loss_layer.h index 387e92b3b5..829b921668 100644 --- a/nntrainer/layers/loss/mse_loss_layer.h +++ b/nntrainer/layers/loss/mse_loss_layer.h @@ -51,6 +51,7 @@ class MSELossLayer : public LossLayer { const std::string getType() const override { return MSELossLayer::type; }; inline static const std::string type = "mse"; + }; } // namespace nntrainer diff --git a/nntrainer/layers/lstm.cpp b/nntrainer/layers/lstm.cpp index d5f13a1fc5..be313a0aca 100644 --- a/nntrainer/layers/lstm.cpp +++ b/nntrainer/layers/lstm.cpp @@ -509,21 +509,27 @@ void LSTMLayer::finalize(InitLayerContext &context) { } // hidden_state_dim : [ batch_size, 1, max_timestep, unit ] - const TensorDim hidden_state_dim(batch_size, 1, max_timestep, unit, - weight_tensor_type); + TensorDim hidden_state_dim(batch_size, 1, max_timestep, unit, + weight_tensor_type); + hidden_state_dim.setDataType(context.getActivationDataType()); + wt_idx[LSTMParams::hidden_state] = context.requestTensor( hidden_state_dim, "hidden_state", Tensor::Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); // cell_state_dim : [ batch_size, 1, max_timestep, unit ] - const TensorDim cell_state_dim(batch_size, 1, max_timestep, unit, - weight_tensor_type); + TensorDim cell_state_dim(batch_size, 1, max_timestep, unit, + weight_tensor_type); + cell_state_dim.setDataType(context.getActivationDataType()); + wt_idx[LSTMParams::cell_state] = context.requestTensor( cell_state_dim, "cell_state", Tensor::Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); // ifgo_dim : [ batch_size, 1, max_timestep, NUM_GATE * unit ] - const TensorDim ifgo_dim(batch_size, 1, max_timestep, NUM_GATE * unit, - weight_tensor_type); + TensorDim ifgo_dim(batch_size, 1, max_timestep, NUM_GATE * unit, + weight_tensor_type); + ifgo_dim.setDataType(context.getActivationDataType()); + wt_idx[LSTMParams::ifgo] = context.requestTensor(ifgo_dim, "ifgo", Tensor::Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); @@ -576,21 +582,27 @@ void LSTMLayer::finalize(InitLayerContext &context) { } // reverse_hidden_state_dim : [ batch_size, 1, max_timestep, unit ] - const TensorDim 
reverse_hidden_state_dim(batch_size, 1, max_timestep, unit, - weight_tensor_type); + TensorDim reverse_hidden_state_dim(batch_size, 1, max_timestep, unit, + weight_tensor_type); + reverse_hidden_state_dim.setDataType(context.getActivationDataType()); + wt_idx[LSTMParams::reverse_hidden_state] = context.requestTensor( reverse_hidden_state_dim, "reverse_hidden_state", Tensor::Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); // reverse_cell_state_dim : [ batch_size, 1, max_timestep, unit ] - const TensorDim reverse_cell_state_dim(batch_size, 1, max_timestep, unit, - weight_tensor_type); + TensorDim reverse_cell_state_dim(batch_size, 1, max_timestep, unit, + weight_tensor_type); + reverse_cell_state_dim.setDataType(context.getActivationDataType()); + wt_idx[LSTMParams::reverse_cell_state] = context.requestTensor( reverse_cell_state_dim, "reverse_cell_state", Tensor::Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); // reverse_ifgo_dim : [ batch_size, 1, max_timestep, NUM_GATE * unit ] - const TensorDim reverse_ifgo_dim(batch_size, 1, max_timestep, - NUM_GATE * unit, weight_tensor_type); + TensorDim reverse_ifgo_dim(batch_size, 1, max_timestep, NUM_GATE * unit, + weight_tensor_type); + reverse_ifgo_dim.setDataType(context.getActivationDataType()); + wt_idx[LSTMParams::reverse_ifgo] = context.requestTensor( reverse_ifgo_dim, "reverse_ifgo", Tensor::Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); @@ -598,8 +610,10 @@ void LSTMLayer::finalize(InitLayerContext &context) { if (dropout_rate > epsilon) { // dropout_mask_dim = [ batch, 1, time_iteration, unit ] - const TensorDim dropout_mask_dim(batch_size, 1, max_timestep, unit, - weight_tensor_type); + TensorDim dropout_mask_dim(batch_size, 1, max_timestep, unit, + weight_tensor_type); + dropout_mask_dim.setDataType(context.getActivationDataType()); + wt_idx[LSTMParams::dropout_mask] = context.requestTensor( dropout_mask_dim, "dropout_mask", Tensor::Initializer::NONE, false, TensorLifespan::ITERATION_LIFESPAN); diff --git a/nntrainer/layers/lstm.h b/nntrainer/layers/lstm.h index f35fdf8815..a9b2cac7d7 100644 --- a/nntrainer/layers/lstm.h +++ b/nntrainer/layers/lstm.h @@ -99,7 +99,6 @@ class LSTMLayer : public LSTMCore { inline static const std::string type = "lstm"; -private: static constexpr unsigned int NUM_GATE = 4; /** common properties like Unit, IntegrateBias, HiddenStateActivation and diff --git a/nntrainer/layers/pooling2d_layer.cpp b/nntrainer/layers/pooling2d_layer.cpp index a68e42e8d0..b53ca354f2 100644 --- a/nntrainer/layers/pooling2d_layer.cpp +++ b/nntrainer/layers/pooling2d_layer.cpp @@ -6,6 +6,7 @@ * @date 12 June 2020 * @see https://github.com/nnstreamer/nntrainer * @author Jijoong Moon + * @author Jiho Chu * @bug No known bugs except for NYI items * @brief This is 2 Dimensional Pooling Layer Class for Neural Network * @@ -26,6 +27,13 @@ namespace nntrainer { static constexpr size_t SINGLE_INOUT_IDX = 0; +/** + * @brief help function for Pooling handler + */ +template struct PoolFunc { + typedef std::function Type; +}; + Pooling2DLayer::Pooling2DLayer( const std::array &padding_) : Layer(), @@ -96,6 +104,7 @@ void Pooling2DLayer::finalize(InitLayerContext &context) { out_dim.channel(in_dim.channel()); out_dim.height((eff_in_height - pool_size[0]) / stride[0] + 1); out_dim.width((eff_in_width - pool_size[1]) / stride[1] + 1); + out_dim.setDataType(in_dim.getDataType()); context.setOutputDimensions({out_dim}); /** @@ -111,13 +120,17 @@ void Pooling2DLayer::finalize(InitLayerContext &context) { * 
// clang-format on */ if (pooling_type == props::PoolingTypeInfo::Enum::global_max) { + auto helper_dim = in_dim; + helper_dim.setDataType(ml::train::TensorDim::DataType::FP32); pool_helper_idx = - context.requestTensor(in_dim, "helper_idx", Tensor::Initializer::NONE, + context.requestTensor(helper_dim, "helper_idx", Tensor::Initializer::NONE, false, TensorLifespan::ITERATION_LIFESPAN); - pool_helper_size.resize(in_dim.batch() * in_dim.channel()); + pool_helper_size.resize(helper_dim.batch() * helper_dim.channel()); } else { + auto helper_dim = out_dim; + helper_dim.setDataType(ml::train::TensorDim::DataType::FP32); pool_helper_idx = - context.requestTensor(out_dim, "helper_idx", Tensor::Initializer::NONE, + context.requestTensor(helper_dim, "helper_idx", Tensor::Initializer::NONE, false, TensorLifespan::ITERATION_LIFESPAN); } } @@ -172,15 +185,13 @@ void Pooling2DLayer::calcDerivative(RunLayerContext &context) { unsigned int J, K; result.setZero(); - float *result_data = result.getData(); unsigned int out_map_size = deriv.height() * deriv.width(); unsigned int in_map_size = height * width; - switch (pooling_type) { - case props::PoolingTypeInfo::Enum::max: { + auto apply_max = [&](T *result_data) { const int *iter = pool_helper.getData(); - const float *deriv_data = deriv.getData(); + const T *deriv_data = deriv.getData(); for (unsigned int b = 0; b < batch; ++b) { for (unsigned int c = 0; c < channel; ++c) { for (unsigned int i = 0; i < out_map_size; ++i) { @@ -195,9 +206,9 @@ void Pooling2DLayer::calcDerivative(RunLayerContext &context) { result_data += in_map_size; } } - } break; - case props::PoolingTypeInfo::Enum::global_average: - case props::PoolingTypeInfo::Enum::average: { + }; + + auto apply_average = [&](T *result_data) { int height_stride_end = height - p_height + pt; int width_stride_end = width - p_width + pl; const int *iter = pool_helper.getData(); @@ -207,7 +218,7 @@ void Pooling2DLayer::calcDerivative(RunLayerContext &context) { for (int j = -pt; j <= height_stride_end; j += stride[0]) { K = 0; for (int k = -pl; k <= width_stride_end; k += stride[1]) { - float del = deriv.getValue(b, i, J, K) / *iter; + T del = deriv.getValue(b, i, J, K) / *iter; int patch_height_end = std::min(static_cast(j + p_height), height); int patch_width_end = @@ -217,7 +228,7 @@ void Pooling2DLayer::calcDerivative(RunLayerContext &context) { for (int h = start_h; h < patch_height_end; ++h) { for (int w = start_w; w < patch_width_end; ++w) { result.setValue(b, i, h, w, - result.getValue(b, i, h, w) + del); + result.getValue(b, i, h, w) + del); } } iter++; @@ -227,15 +238,16 @@ void Pooling2DLayer::calcDerivative(RunLayerContext &context) { } } } - } break; - case props::PoolingTypeInfo::Enum::global_max: { - const float *deriv_data = deriv.getData(); + }; + + auto apply_global_max = [&](T *result_data) { + const T *deriv_data = deriv.getData(); for (unsigned int b = 0; b < batch; b++) { for (unsigned int c = 0; c < channel; c++) { const int *iter = pool_helper.getData() + pool_helper.getIndex(b, c, 0, 0); unsigned int helper_size = pool_helper_size[b * channel + c]; - float der = *deriv_data / helper_size; + T der = *deriv_data / static_cast(helper_size); for (unsigned int idx = 0; idx < helper_size; idx++) result_data[iter[idx]] += der; @@ -244,7 +256,40 @@ void Pooling2DLayer::calcDerivative(RunLayerContext &context) { result_data += in_map_size; } } - } break; + }; + + switch (pooling_type) { + case props::PoolingTypeInfo::Enum::max: + if (in_dim.getDataType() == 
ml::train::TensorDim::DataType::FP32) + apply_max(result.getData()); +#ifdef ENABLE_FP16 + else if (in_dim.getDataType() == ml::train::TensorDim::DataType::FP16) + apply_max(result.getData<_FP16>()); +#endif + else + throw std::runtime_error("Not supported datatype"); + break; + case props::PoolingTypeInfo::Enum::global_average: + case props::PoolingTypeInfo::Enum::average: + if (in_dim.getDataType() == ml::train::TensorDim::DataType::FP32) + apply_average(result.getData()); +#ifdef ENABLE_FP16 + else if (in_dim.getDataType() == ml::train::TensorDim::DataType::FP16) + apply_average(result.getData<_FP16>()); +#endif + else + throw std::runtime_error("Not supported datatype"); + break; + case props::PoolingTypeInfo::Enum::global_max: + if (in_dim.getDataType() == ml::train::TensorDim::DataType::FP32) + apply_global_max(result.getData()); +#ifdef ENABLE_FP16 + else if (in_dim.getDataType() == ml::train::TensorDim::DataType::FP16) + apply_global_max(result.getData<_FP16>()); +#endif + else + throw std::runtime_error("Not supported datatype"); + break; default: throw std::runtime_error("Error: Unknown Pooling Type"); } @@ -290,124 +335,167 @@ void Pooling2DLayer::pooling2d(Tensor &in, bool training, Tensor &output, * @param start_w (width index pointing the start of the patch) * @return result value of pooling */ - std::function pool_fn; + PoolFunc::Type pool_fn_fp32; +#ifdef ENABLE_FP16 + PoolFunc<_FP16>::Type pool_fn_fp16; +#endif unsigned int max_idx_count = 0; - switch (pooling_type) { - case props::PoolingTypeInfo::Enum::max: { - pool_fn = [&](const float *in_data, int channel_idx, int start_h, - int start_w) { - int end_h = start_h + patch_height; - int end_w = start_w + patch_width; - - float max_val = std::numeric_limits::lowest(); - - int cur_max_idx = -1; - int eff_end_h = std::min(end_h, in_height); - int eff_end_w = std::min(end_w, in_width); - start_w = std::max(0, start_w); - for (int h = std::max(0, start_h); h < eff_end_h; ++h) { - for (int w = start_w; w < eff_end_w; ++w) { - int cur_idx = h * in_width + w; - float val = in_data[cur_idx]; - if (max_val < val) { - max_val = val; - if (training) { - cur_max_idx = cur_idx; - } + + auto pool_fn_max = [&](const T *in_data, int channel_idx, + int start_h, int start_w) { + int end_h = start_h + patch_height; + int end_w = start_w + patch_width; + + T max_val = std::numeric_limits::lowest(); + + int cur_max_idx = -1; + int eff_end_h = std::min(end_h, in_height); + int eff_end_w = std::min(end_w, in_width); + start_w = std::max(0, start_w); + for (int h = std::max(0, start_h); h < eff_end_h; ++h) { + for (int w = start_w; w < eff_end_w; ++w) { + int cur_idx = h * in_width + w; + T val = in_data[cur_idx]; + if (max_val < val) { + max_val = val; + if (training) { + cur_max_idx = cur_idx; } } } + } - if (training) { - pool_helper.setValueInt(max_idx_count++, cur_max_idx); - } + if (training) { + pool_helper.setValueInt(max_idx_count++, cur_max_idx); + } - return max_val; - }; - break; - } - case props::PoolingTypeInfo::Enum::global_max: { - pool_fn = [&, this](const float *in_data, int channel_idx, int start_h, - int start_w) { - int end_h = start_h + patch_height; - int end_w = start_w + patch_width; - - float max_val = std::numeric_limits::lowest(); - int *helper_data = pool_helper.getData(); - helper_data += channel_idx * in_height * in_width; - - for (int h = start_h; h < end_h; ++h) { - for (int w = start_w; w < end_w; ++w) { - int cur_idx = h * in_width + w; - float val = in_data[cur_idx]; - if (max_val < val) { - max_val = val; - 
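
The pooling derivative above is made data-type agnostic by writing the element-wise work once over a template parameter and choosing the instantiation at run time from the tensor's data type. The self-contained sketch below shows the same dispatch shape with a deliberately trivial kernel; scale_buffer, scale_any and the Dtype enum are placeholders, while the FP32/FP16 branching and the ENABLE_FP16 guard mirror the hunk.

#include <cstddef>
#include <stdexcept>

enum class Dtype { FP32, FP16 };   // stand-in for ml::train::TensorDim::DataType

template <typename T>
static void scale_buffer(T *data, size_t len, float s) {
  for (size_t i = 0; i < len; ++i)
    data[i] = static_cast<T>(static_cast<float>(data[i]) * s);
}

static void scale_any(void *buf, size_t len, Dtype dt, float s) {
  if (dt == Dtype::FP32)
    scale_buffer(static_cast<float *>(buf), len, s);
#ifdef ENABLE_FP16
  else if (dt == Dtype::FP16)
    scale_buffer(static_cast<_FP16 *>(buf), len, s);
#endif
  else
    throw std::runtime_error("Not supported datatype");
}
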
max_idx_count = 0; - } + return max_val; + }; - if (training && max_val == val) { - *(helper_data + max_idx_count++) = cur_idx; - } + auto pool_fn_global_max = [&, this](const T *in_data, + int channel_idx, int start_h, + int start_w) { + int end_h = start_h + patch_height; + int end_w = start_w + patch_width; + + T max_val = std::numeric_limits::lowest(); + int *helper_data = pool_helper.getData(); + helper_data += channel_idx * in_height * in_width; + + for (int h = start_h; h < end_h; ++h) { + for (int w = start_w; w < end_w; ++w) { + int cur_idx = h * in_width + w; + T val = in_data[cur_idx]; + if (max_val < val) { + max_val = val; + max_idx_count = 0; } - } - pool_helper_size[batch_idx * in.channel() + channel_idx] = max_idx_count; - return max_val; - }; - break; - } - case props::PoolingTypeInfo::Enum::global_average: - case props::PoolingTypeInfo::Enum::average: { - pool_fn = [&](const float *in_data, int channel_idx, int start_h, - int start_w) { - int end_h = start_h + patch_height; - int end_w = start_w + patch_width; - float total = 0.0f; - - int eff_end_h = std::min(end_h, in_height); - int eff_end_w = std::min(end_w, in_width); - int eff_start_h = std::max(0, start_h); - int eff_start_w = std::max(0, start_w); - - int cnt = (eff_end_h - eff_start_h) * (eff_end_w - eff_start_w); - for (int h = eff_start_h; h < eff_end_h; ++h) { - for (int w = eff_start_w; w < eff_end_w; ++w) { - float val = in_data[h * in_width + w]; - total += val; + if (training && max_val == val) { + *(helper_data + max_idx_count++) = cur_idx; } } + } - if (training) { - pool_helper.setValueInt(max_idx_count++, cnt); + pool_helper_size[batch_idx * in.channel() + channel_idx] = max_idx_count; + return max_val; + }; + + auto pool_fn_average = [&](const T *in_data, int channel_idx, + int start_h, int start_w) { + int end_h = start_h + patch_height; + int end_w = start_w + patch_width; + T total = static_cast(0.0f); + + int eff_end_h = std::min(end_h, in_height); + int eff_end_w = std::min(end_w, in_width); + int eff_start_h = std::max(0, start_h); + int eff_start_w = std::max(0, start_w); + + int cnt = (eff_end_h - eff_start_h) * (eff_end_w - eff_start_w); + for (int h = eff_start_h; h < eff_end_h; ++h) { + for (int w = eff_start_w; w < eff_end_w; ++w) { + T val = in_data[h * in_width + w]; + total += val; } - return total / cnt; - }; + } + + if (training) { + pool_helper.setValueInt(max_idx_count++, cnt); + } + return total / cnt; + }; + + switch (pooling_type) { + case props::PoolingTypeInfo::Enum::max: + pool_fn_fp32 = pool_fn_max; +#ifdef ENABLE_FP16 + pool_fn_fp16 = pool_fn_max; +#endif + break; + case props::PoolingTypeInfo::Enum::global_max: + pool_fn_fp32 = pool_fn_global_max; +#ifdef ENABLE_FP16 + pool_fn_fp16 = pool_fn_global_max; +#endif + break; + case props::PoolingTypeInfo::Enum::global_average: + case props::PoolingTypeInfo::Enum::average: + pool_fn_fp32 = pool_fn_average; +#ifdef ENABLE_FP16 + pool_fn_fp16 = pool_fn_average; +#endif break; - } case props::PoolingTypeInfo::Enum::unknown: default: throw std::invalid_argument("unknown pooling type given"); break; } - const float *in_data = in.getData(); - float *out_data = output.getData(); - - unsigned int map_size = in_height * in_width; - - int height_stride_end = height - patch_height - pt; - int width_stride_end = width - patch_width - pl; - for (unsigned int i = 0; i < channel; ++i) { - const float *in_data_channel_sliced = in_data + i * map_size; - for (int j = -pt; j <= height_stride_end; j += stride[0]) { - for (int k = -pl; k <= 
width_stride_end; k += stride[1]) { - float pool_value = pool_fn(in_data_channel_sliced, i, j, k); - *out_data = pool_value; - out_data++; + if (in.getDataType() == ml::train::TensorDim::DataType::FP32) { + const float *in_data = in.getData(); + float *out_data = output.getData(); + + unsigned int map_size = in_height * in_width; + + int height_stride_end = height - patch_height - pt; + int width_stride_end = width - patch_width - pl; + for (unsigned int i = 0; i < channel; ++i) { + const float *in_data_channel_sliced = in_data + i * map_size; + for (int j = -pt; j <= height_stride_end; j += stride[0]) { + for (int k = -pl; k <= width_stride_end; k += stride[1]) { + float pool_value = pool_fn_fp32(in_data_channel_sliced, i, j, k); + *out_data = pool_value; + out_data++; + } + } + } + } +#ifdef ENABLE_FP16 + else if (in.getDataType() == ml::train::TensorDim::DataType::FP16) { + const _FP16 *in_data = in.getData<_FP16>(); + _FP16 *out_data = output.getData<_FP16>(); + + unsigned int map_size = in_height * in_width; + + int height_stride_end = height - patch_height - pt; + int width_stride_end = width - patch_width - pl; + for (unsigned int i = 0; i < channel; ++i) { + const _FP16 *in_data_channel_sliced = in_data + i * map_size; + for (int j = -pt; j <= height_stride_end; j += stride[0]) { + for (int k = -pl; k <= width_stride_end; k += stride[1]) { + _FP16 pool_value = pool_fn_fp16(in_data_channel_sliced, i, j, k); + *out_data = pool_value; + out_data++; + } } } } +#endif + else { + throw std::runtime_error("Not supported datatype"); + } } void Pooling2DLayer::setBatch(RunLayerContext &context, unsigned int batch) { diff --git a/nntrainer/layers/reshape_layer.cpp b/nntrainer/layers/reshape_layer.cpp index 0f82d84f3a..07564b3970 100644 --- a/nntrainer/layers/reshape_layer.cpp +++ b/nntrainer/layers/reshape_layer.cpp @@ -42,6 +42,7 @@ void ReshapeLayer::finalize(InitLayerContext &context) { } out_dim.batch(in_dim.batch()); + out_dim.setDataType(in_dim.getDataType()); context.setOutputDimensions({out_dim}); } diff --git a/nntrainer/layers/time_dist.cpp b/nntrainer/layers/time_dist.cpp index 80451416df..779010065a 100644 --- a/nntrainer/layers/time_dist.cpp +++ b/nntrainer/layers/time_dist.cpp @@ -256,8 +256,8 @@ void TimeDistLayer::forwarding(RunLayerContext &context, bool training) { RunLayerContext dist_context(context.getName(), context.getTrainable(), context.getLoss(), context.executeInPlace(), - getWeightsForContext(), {&in_var}, {&out_var}, - getTensorsForContext()); + context.getLossScale(), getWeightsForContext(), + {&in_var}, {&out_var}, getTensorsForContext()); dist_layer->forwarding(dist_context, training); } @@ -303,8 +303,8 @@ void TimeDistLayer::calcDerivative(RunLayerContext &context) { RunLayerContext dist_context(context.getName(), context.getTrainable(), context.getLoss(), context.executeInPlace(), - getWeightsForContext(), {&in_var}, {&out_var}, - getTensorsForContext()); + context.getLossScale(), getWeightsForContext(), + {&in_var}, {&out_var}, getTensorsForContext()); dist_layer->calcDerivative(dist_context); } @@ -354,8 +354,8 @@ void TimeDistLayer::calcGradient(RunLayerContext &context) { RunLayerContext dist_context(context.getName(), context.getTrainable(), context.getLoss(), context.executeInPlace(), - getWeightsForContext(), {&in_var}, {&out_var}, - getTensorsForContext()); + context.getLossScale(), getWeightsForContext(), + {&in_var}, {&out_var}, getTensorsForContext()); dist_layer->calcGradient(dist_context); } @@ -396,8 +396,8 @@ void 
TimeDistLayer::setBatch(RunLayerContext &context, unsigned int batch) { RunLayerContext dist_context(context.getName(), context.getTrainable(), context.getLoss(), context.executeInPlace(), - getWeightsForContext(), {&in_var}, {&out_var}, - getTensorsForContext()); + context.getLossScale(), getWeightsForContext(), + {&in_var}, {&out_var}, getTensorsForContext()); dist_layer->setBatch(dist_context, batch); diff --git a/nntrainer/meson.build b/nntrainer/meson.build index 02df7744b6..5c7a14d4a5 100644 --- a/nntrainer/meson.build +++ b/nntrainer/meson.build @@ -47,6 +47,7 @@ nntrainer_elements = [ if get_option('enable-opencl') nntrainer_elements += 'opencl' + nntrainer_elements += 'layers/cl_layers' endif foreach elem : nntrainer_elements diff --git a/nntrainer/models/model_common_properties.h b/nntrainer/models/model_common_properties.h index 3776afefca..3435d18e96 100644 --- a/nntrainer/models/model_common_properties.h +++ b/nntrainer/models/model_common_properties.h @@ -217,7 +217,7 @@ class ModelTensorDataType final : public EnumProperty { */ class LossScale : public Property { public: - LossScale(float value = 0.0f); + LossScale(float value = 1.0f); static constexpr const char *key = "loss_scale"; /**< unique key to access */ using prop_tag = float_prop_tag; /**< property type */ }; diff --git a/nntrainer/models/neuralnet.cpp b/nntrainer/models/neuralnet.cpp index d0e542825f..afc560603e 100644 --- a/nntrainer/models/neuralnet.cpp +++ b/nntrainer/models/neuralnet.cpp @@ -412,9 +412,21 @@ void NeuralNetwork::backwarding(int iteration, NNTR_THROW_IF(!opt, std::invalid_argument) << "optimizer is null!"; #endif - std::function, int)> backwarding_op = + std::function, bool)> forwarding_op = + [this, stop_cb, userdata](std::shared_ptr node, + bool training) -> void { + (void)this; + PROFILE_MEM_ANNOTATE("Forwarding for layer: " + node->getName()); + + auto f = std::get<0>(node->getExecutionOrder()); + model_graph.flushCacheExcept(f); + + node->forwarding(training); + }; + + std::function, int)> backwarding_op = [this, stop_cb, userdata](std::shared_ptr node, - int iteration) -> void { + int iteration) -> bool { /** * Do not change this order: * 1. 
calcGradient @@ -448,19 +460,29 @@ void NeuralNetwork::backwarding(int iteration, /** If gradient must be applied and its not gradient mode, calculate * gradient */ - if (!dynamic_training_opt.isGradientMode() && apply_gradient) + if (!dynamic_training_opt.isGradientMode() && apply_gradient) { node->calcGradient(); + + RunLayerContext &rc = node->getRunContext(); + if (rc.isMixedPrecision()) { + for (auto w : rc.getWeights()) { + if (!w->getGradientRef().isValid()) + return false; + } + } + } } model_graph.flushCacheExcept(std::get<2>(node->getExecutionOrder())); PROFILE_MEM_ANNOTATE("CalcDerivative: " + node->getName()); if (stop_cb(userdata)) { - return; + return true; } - if (node->needsCalcDerivative()) + if (node->needsCalcDerivative()) { node->calcDerivative(); + } model_graph.flushCacheExcept(std::get<3>(node->getExecutionOrder())); PROFILE_MEM_ANNOTATE("ApplyGradient: " + node->getName()); @@ -476,9 +498,10 @@ void NeuralNetwork::backwarding(int iteration, opt_->applyGradient(opt_context); }); } + return true; }; - std::function apply_grad_clip_op = + std::function lazy_apply_grad_op = [opt_ = opt.get()](Weight &w, int iteration) -> void { w.calcRegularizationGradient(); w.calcWeightDecayGradient(); @@ -487,8 +510,13 @@ void NeuralNetwork::backwarding(int iteration, opt_->applyGradient(opt_context); }; - model_graph.backwarding(iteration, backwarding_op, apply_grad_clip_op, - stop_cb, userdata); + // return false if the gradient is not valid + bool ret = false; + + while (!ret) { + ret = model_graph.backwarding(iteration, forwarding_op, backwarding_op, + lazy_apply_grad_op, stop_cb, userdata); + } } void NeuralNetwork::save(const std::string &file_path, diff --git a/nntrainer/optimizers/adam.cpp b/nntrainer/optimizers/adam.cpp index 18c0a0fcc1..f7189dda7e 100644 --- a/nntrainer/optimizers/adam.cpp +++ b/nntrainer/optimizers/adam.cpp @@ -36,7 +36,15 @@ Adam::~Adam() {} enum AdamParams { wm, wv }; std::vector Adam::getOptimizerVariableDim(const TensorDim &dim) { - return {dim, dim}; + /** + * @note We assume the optimizer parameters should be full precsion to + * maintain the accuracy even in mixed precision training. + */ + TensorDim wm_dim(dim); + TensorDim wv_dim(dim); + wm_dim.setDataType(ml::train::TensorDim::DataType::FP32); + wv_dim.setDataType(ml::train::TensorDim::DataType::FP32); + return {wm_dim, wv_dim}; } void Adam::exportTo(Exporter &exporter, @@ -64,7 +72,17 @@ double Adam::getUpdatedLearningRate(unsigned int iteration, double ll) const { } void Adam::applyGradient(RunOptimizerContext &context) { - Tensor &x_grad = context.getGradient(); + Tensor empty_tensor; + + Tensor &x_grad = + context.getGradient().getDataType() == ml::train::TensorDim::DataType::FP32 + ? 
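
With the change above, the per-node backwarding callback becomes a predicate: it returns false as soon as a mixed-precision weight gradient turns out to be non-finite, and NeuralNetwork::backwarding simply reruns the pass for that iteration until it succeeds. The schematic below captures only that control flow; Node and its members are hypothetical stand-ins for LayerNode and RunLayerContext, and the real loop goes through model_graph.backwarding() with the forwarding, backwarding and lazy-apply callbacks shown in the hunk.

#include <vector>

struct Node {
  // calcGradient() plus the validity check on the weight gradients,
  // collapsed into a single call for the sketch.
  bool calcGradientFinite() { return true; }
  void calcDerivative() {}
  void applyGradient(int /*iteration*/) {}
};

static bool backward_pass(std::vector<Node> &nodes, int iteration) {
  for (auto &n : nodes) {
    if (!n.calcGradientFinite())
      return false;              // NaN/Inf gradient: abandon this pass
    n.calcDerivative();
    n.applyGradient(iteration);
  }
  return true;
}

void backwarding(std::vector<Node> &nodes, int iteration) {
  bool ok = false;
  while (!ok)                    // re-run the pass until every gradient is finite
    ok = backward_pass(nodes, iteration);
}

In a full dynamic loss scaling scheme this retry is normally paired with lowering the scale before the next attempt; the hunk above only shows the retry itself.
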
context.getGradient() + : empty_tensor; + + if (x_grad.empty()) { + x_grad = context.getGradient().clone(ml::train::TensorDim::DataType::FP32); + context.applyLossScale(x_grad); + } auto &beta1 = std::get(adam_props).get(); auto &beta2 = std::get(adam_props).get(); @@ -91,7 +109,7 @@ void Adam::applyGradient(RunOptimizerContext &context) { denom.add_i(epsilon); wm.divide(denom, x_grad); - context.applyGradient(context.getLearningRate() / biasCorrection1); + context.applyGradient(context.getLearningRate() / biasCorrection1, x_grad); } else { std::function sqrtEps = [epsilon](double f) { @@ -100,8 +118,9 @@ void Adam::applyGradient(RunOptimizerContext &context) { x_grad = wv.apply(sqrtEps, x_grad); x_grad.multiply_i(wm); - context.applyGradient(getUpdatedLearningRate(context.getIteration(), - context.getLearningRate())); + context.applyGradient( + getUpdatedLearningRate(context.getIteration(), context.getLearningRate()), + x_grad); } } diff --git a/nntrainer/optimizers/optimizer_context.cpp b/nntrainer/optimizers/optimizer_context.cpp index da4cd1f7e9..8380ad6613 100644 --- a/nntrainer/optimizers/optimizer_context.cpp +++ b/nntrainer/optimizers/optimizer_context.cpp @@ -42,4 +42,24 @@ Tensor &RunOptimizerContext::getOptimizerVariable(unsigned int idx) const { void RunOptimizerContext::applyGradient(double lr) const { weight->applyGradient(lr); } + +/** + * @brief Apply the gradient with the given learning rate and gradient + */ +void RunOptimizerContext::applyGradient(double lr, Tensor &updated_grad) const { + weight->applyGradient(lr, updated_grad); +} + +/** + * @brief Apply loss scale to gradient (full precision) + */ +void RunOptimizerContext::applyLossScale(Tensor &fp32_grad) { + if (!weight->isMixedPrecision()) + return; + if (fp32_grad.getDataType() != ml::train::TensorDim::DataType::FP32) + throw std::invalid_argument( + "gradient should be fullprecsion to maintain accuracy"); + float loss_scale = weight->getLossScale(); + fp32_grad.divide_i(loss_scale); +} } // namespace nntrainer diff --git a/nntrainer/optimizers/optimizer_context.h b/nntrainer/optimizers/optimizer_context.h index 62f9e0945d..27f028fc52 100644 --- a/nntrainer/optimizers/optimizer_context.h +++ b/nntrainer/optimizers/optimizer_context.h @@ -35,9 +35,7 @@ class RunOptimizerContext { * */ RunOptimizerContext(Weight *w = nullptr, size_t iter = 0, double lr = 0.0) : - weight(w), - iteration(iter), - learning_rate(lr) {} + weight(w), iteration(iter), learning_rate(lr) {} /** * @brief Get the Weight tensor object @@ -75,6 +73,16 @@ class RunOptimizerContext { */ void applyGradient(double lr) const; + /** + * @brief Apply the gradient with the given learning rate and updated + * gradient + * + * @param lr learning rate + * @param updated_grad gradient tensor which is updated. 
(usually it could be + * fp32) + */ + void applyGradient(double lr, Tensor &updated_grad) const; + /** * @brief Get the current iteration value * @@ -89,6 +97,11 @@ class RunOptimizerContext { */ double getLearningRate() const { return learning_rate; } + /** + * @brief Apply loss scale to gradient (full precision) + */ + void applyLossScale(Tensor &fp32_grad); + private: Weight *weight; /**< weights for the optimizer */ size_t iteration; /**< iteration number */ diff --git a/nntrainer/optimizers/sgd.cpp b/nntrainer/optimizers/sgd.cpp index 8b0078e9e6..e4b2209a57 100644 --- a/nntrainer/optimizers/sgd.cpp +++ b/nntrainer/optimizers/sgd.cpp @@ -16,7 +16,20 @@ namespace nntrainer { void SGD::applyGradient(RunOptimizerContext &context) { - context.applyGradient(context.getLearningRate()); + // @todo This could go inside the context. + Tensor empty_tensor; + + Tensor &x_grad = + context.getGradient().getDataType() == ml::train::TensorDim::DataType::FP32 + ? context.getGradient() + : empty_tensor; + + if (x_grad.empty()) { + x_grad = context.getGradient().clone(ml::train::TensorDim::DataType::FP32); + context.applyLossScale(x_grad); + } + + context.applyGradient(context.getLearningRate(), x_grad); } } // namespace nntrainer diff --git a/nntrainer/tensor/blas_avx.cpp b/nntrainer/tensor/blas_avx.cpp index ce59583d6f..411dbcbb5d 100644 --- a/nntrainer/tensor/blas_avx.cpp +++ b/nntrainer/tensor/blas_avx.cpp @@ -20,6 +20,7 @@ namespace nntrainer::avx { +#ifdef ENABLE_FP16 void vcvt_f16_f32(size_t N, const void *input, float *output) { assert(N != 0); assert(input != NULL); @@ -114,4 +115,163 @@ void vcvt_f32_f16(size_t N, const float *input, void *output) { } } +bool isValid(const size_t N, const _Float16 *input) { + assert(N != 0); + assert(input != NULL); + + int temp = 0; + size_t idx = 0; + + const __m256 SIGN_MASK = _mm256_set1_ps(-0.0); + const __m256 INF = _mm256_set1_ps(std::numeric_limits::infinity()); + + // 16 single-precision check : ( X != X ) + for (; N - idx >= 16; idx += 16) { + __m256 vec0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input)); + __m256 vec1 = + _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(input + 8))); + + input += 16; + + // check NaN in vec0 + __m256 res = _mm256_cmp_ps(vec0, vec0, _CMP_NEQ_UQ); + temp = temp | _mm256_movemask_ps(res); + if (temp) + return false; + + // check infinity in vec0 + vec0 = _mm256_andnot_ps(SIGN_MASK, vec0); + vec0 = _mm256_cmp_ps(vec0, INF, _CMP_EQ_OQ); + + temp = temp | _mm256_movemask_ps(vec0); + if (temp) + return false; + + // check NaN in vec1 + __m256 res1 = _mm256_cmp_ps(vec1, vec1, _CMP_NEQ_UQ); + temp = temp | _mm256_movemask_ps(res1); + + if (temp) + return false; + + // check infinity in vec1 + vec1 = _mm256_andnot_ps(SIGN_MASK, vec1); + vec1 = _mm256_cmp_ps(vec1, INF, _CMP_EQ_OQ); + + temp = temp | _mm256_movemask_ps(vec1); + + if (temp) + return false; + } + + // 8 single-precision check : ( X != X ) + for (; N - idx >= 8; idx += 8) { + __m256 vec = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input)); + input += 8; + __m256 res = _mm256_cmp_ps(vec, vec, _CMP_NEQ_UQ); + temp = temp | _mm256_movemask_ps(res); + + if (temp) + return false; + + // check infinity in vec1 + vec = _mm256_andnot_ps(SIGN_MASK, vec); + vec = _mm256_cmp_ps(vec, INF, _CMP_EQ_OQ); + + temp = temp | _mm256_movemask_ps(vec); + + if (temp) + return false; + } + + // remain check : ( X != X || X == Inf ) + while (idx < N) { + if (*input != *input || *input == std::numeric_limits::infinity()) { + return false; + } + ++input; + ++idx; + } + + 
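
On the optimizer side the loss-layer pattern is mirrored: Adam keeps its moment tensors in FP32 regardless of the weight type, and both Adam and SGD clone a non-FP32 gradient to FP32, divide the loss scale back out, and hand only the FP32 copy to applyGradient. A fragment-level sketch, with the same caveats as before (DataType abbreviated; grad, loss_scale, lr and apply_update are illustrative names):

Tensor fp32_buf;
Tensor &g = (grad.getDataType() == DataType::FP32)
              ? grad
              : (fp32_buf = grad.clone(DataType::FP32));
if (loss_scale != 1.0f)
  g.divide_i(loss_scale);        // undo the scaling applied at the loss layer
apply_update(lr, g);             // e.g. weight->applyGradient(lr, g) in the hunk
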
return true; +} +#endif + +bool isValid(const size_t N, const float *input) { + assert(N != 0); + assert(input != NULL); + + int temp = 0; + size_t idx = 0; + + const __m256 SIGN_MASK = _mm256_set1_ps(-0.0); + const __m256 INF = _mm256_set1_ps(std::numeric_limits::infinity()); + + // 16 single-precision check : ( X != X ) + for (; N - idx >= 16; idx += 16) { + __m256 vec0 = _mm256_loadu_ps(input); + __m256 vec1 = _mm256_loadu_ps(input + 8); + input += 16; + __m256 res = _mm256_cmp_ps(vec0, vec0, _CMP_NEQ_UQ); + temp = temp | _mm256_movemask_ps(res); + + if (temp) + return false; + + // check infinity in vec0 + vec0 = _mm256_andnot_ps(SIGN_MASK, vec0); + vec0 = _mm256_cmp_ps(vec0, INF, _CMP_EQ_OQ); + + temp = temp | _mm256_movemask_ps(vec0); + if (temp) + return false; + + __m256 res1 = _mm256_cmp_ps(vec1, vec1, _CMP_NEQ_UQ); + temp = temp | _mm256_movemask_ps(res1); + + if (temp) + return false; + + // check infinity in vec1 + vec1 = _mm256_andnot_ps(SIGN_MASK, vec1); + vec1 = _mm256_cmp_ps(vec1, INF, _CMP_EQ_OQ); + + temp = temp | _mm256_movemask_ps(vec1); + + if (temp) + return false; + } + + // 8 single-precision check : ( X != X ) + for (; N - idx >= 8; idx += 8) { + __m256 vec = _mm256_loadu_ps(input); + input += 8; + __m256 res = _mm256_cmp_ps(vec, vec, _CMP_NEQ_UQ); + temp = temp | _mm256_movemask_ps(res); + + if (temp) + return false; + + // check infinity in vec + vec = _mm256_andnot_ps(SIGN_MASK, vec); + vec = _mm256_cmp_ps(vec, INF, _CMP_EQ_OQ); + + temp = temp | _mm256_movemask_ps(vec); + + if (temp) + return false; + } + + // remain check : ( X != X ) + while (idx < N) { + if (*input != *input || *input == std::numeric_limits::infinity()) { + return false; + } + ++input; + ++idx; + } + + return true; +} + } // namespace nntrainer::avx diff --git a/nntrainer/tensor/blas_avx.h b/nntrainer/tensor/blas_avx.h index ab1270a208..5eabcbdb2c 100644 --- a/nntrainer/tensor/blas_avx.h +++ b/nntrainer/tensor/blas_avx.h @@ -20,6 +20,7 @@ namespace nntrainer::avx { +#ifdef ENABLE_FP16 /** * @brief Converts half-precision floating point values to single-precision * floating point values. @@ -40,6 +41,25 @@ void vcvt_f16_f32(size_t N, const void *input, float *output); */ void vcvt_f32_f16(size_t N, const float *input, void *output); +/** + * @brief check if the X has NaN value + * @note it compare (x!=x || x == inf) + * @param[in] N length of the vector + * @param[in] X half-precision * for Vector X + * @param[out] false if it has NaN or inf + */ +bool isValid(const size_t N, const _Float16 *X); +#endif + +/** + * @brief check if the X has NaN value + * @note it compare (x!=x || x == inf) + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[out] false if it has NaN or inf + */ +bool isValid(const size_t N, const float *X); + } // namespace nntrainer::avx #endif /* __cplusplus */ diff --git a/nntrainer/tensor/blas_interface.cpp b/nntrainer/tensor/blas_interface.cpp index 9be6fb9911..e8fb78d734 100644 --- a/nntrainer/tensor/blas_interface.cpp +++ b/nntrainer/tensor/blas_interface.cpp @@ -842,7 +842,10 @@ void scopy(const unsigned int N, const float *X, const int incX, float *Y, #ifdef BLAS_NUM_THREADS openblas_set_num_threads(BLAS_NUM_THREADS); #endif - cblas_scopy(N, X, incX, Y, incY); + // cblas_scopy(N, (float*)(X), incX, (float*)(Y), incY); + // replace cblas scopy with raw temporary. 
+ for (unsigned int i = 0; i < N; ++i) + Y[i * incY] = X[i * incX]; #else scopy_raw(N, X, incX, Y, incY); #endif @@ -1038,6 +1041,16 @@ static void ele_div_fallback(const unsigned int N, const float *X, } } +static bool is_valid_fallback(const size_t N, const float *X) { + for (size_t i = 0; i < N; ++i) { + if (*X != *X || *X == std::numeric_limits::infinity()) + return false; + ++X; + } + + return true; +} + void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, float alpha, float beta, unsigned int i_stride, unsigned int o_stride) { @@ -1090,4 +1103,30 @@ void ele_div(const unsigned int N, const float *X, const float *Y, float *Z, ele_div_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); } +bool is_valid(const size_t N, ml::train::TensorDim::DataType d_type, + const void *X) { + if (d_type == ml::train::TensorDim::DataType::FP16) { +#ifdef ENABLE_FP16 + const _FP16 *vec = (const _FP16 *)X; +#ifdef USE_NEON + return nntrainer::neon::isValid(N, vec); +#elif defined(USE_AVX) + return nntrainer::avx::isValid(N, vec); +#else + throw std::invalid_argument("Error: enable-fp16 is not enabled"); +#endif +#endif + } else if (d_type == ml::train::TensorDim::DataType::FP32) { + const float *vec = (const float *)X; +#ifdef USE_NEON + return nntrainer::neon::isValid(N, vec); +#elif defined(USE_AVX) + return nntrainer::avx::isValid(N, vec); +#endif + + return is_valid_fallback(N, vec); + } + return false; +} + } // namespace nntrainer diff --git a/nntrainer/tensor/blas_interface.h b/nntrainer/tensor/blas_interface.h index 04a8a23018..2b5ef72922 100644 --- a/nntrainer/tensor/blas_interface.h +++ b/nntrainer/tensor/blas_interface.h @@ -478,6 +478,16 @@ void ele_sub(const unsigned N, const float *X, const float *Y, float *Z, void ele_div(const unsigned N, const float *X, const float *Y, float *Z, float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1, unsigned int o_stride = 1); + +/** + * @brief check if X array has NaN or inf + * @param[in] N length of the vector + * @param[in] X float/fp16 * for Vector X + * @param[out] bool false if not valide else true + */ +bool is_valid(const size_t N, ml::train::TensorDim::DataType d_type, + const void *X); + } /* namespace nntrainer */ #endif /* __cplusplus */ #endif /* __BLAS_INTERFACE_H__ */ diff --git a/nntrainer/tensor/blas_neon.cpp b/nntrainer/tensor/blas_neon.cpp index 3609b6b8b5..20f4d102ec 100644 --- a/nntrainer/tensor/blas_neon.cpp +++ b/nntrainer/tensor/blas_neon.cpp @@ -546,6 +546,36 @@ void ele_div(const unsigned N, const float *X, const float *Y, float *Z, } } +bool isValid(const size_t N, const float *X) { + size_t i = 0; + float inf_s = std::numeric_limits::infinity(); + float32x4_t inf = vdupq_n_f32(inf_s); + uint16x8_t zero = vdupq_n_f32(0); + + for (; N - i >= 4; i += 4) { + float32x4_t vec = vld1q_f32(&X[i]); + uint32x4_t vcmp = vceqq_f32(vec, vec); + + vcmp = vceqq_f32(vcmp, zero); + + if (vaddvq_u32(vcmp)) + return false; + + vcmp = vceqq_f32(vec, inf); + + if (vaddvq_u16(vcmp)) + return false; + } + + while (i < N) { + if (X[i] != X[i] || X[i] == std::numeric_limits::infinity()) + return false; + ++i; + } + + return true; +} + #ifdef ENABLE_FP16 void hgemv(const __fp16 *A, const __fp16 *X, __fp16 *Y, uint32_t M, uint32_t N, @@ -1192,51 +1222,29 @@ void haxpy(const unsigned int N, const float alpha, const __fp16 *X, } __fp16 hdot(const unsigned int N, const __fp16 *X, const __fp16 *Y) { - - float16x8_t accX8 = vmovq_n_f16(0); - float16x4_t accX4 = vmov_n_f16(0); + float32x4_t accX0_3 = vmovq_n_f32(0.F); + 
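
The new is_valid entry point declared in blas_interface.h above dispatches on the data type to the NEON, AVX, or scalar fallback implementations. A small usage sketch, assuming the header that provides nntrainer::is_valid is on the include path:

#include <limits>
#include <vector>
// #include <blas_interface.h>   // assumed include for nntrainer::is_valid

int main() {
  std::vector<float> grad = {0.1f, -0.2f,
                             std::numeric_limits<float>::infinity()};
  bool ok = nntrainer::is_valid(grad.size(),
                                ml::train::TensorDim::DataType::FP32,
                                grad.data());
  // ok == false here: the buffer holds an Inf, so the iteration would be retried.
  return ok ? 0 : 1;
}
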
float32x4_t accX4_7 = vmovq_n_f32(0.F); unsigned int idx = 0; - __fp16 ret = 0; + unsigned int N8 = (N >> 3) << 3; + float ret = 0; - // processing batch of 8 - for (; (N - idx) >= 8; idx += 8) { + // Adaptive loop for batch size of 8 + for (; idx < N8; idx += 8) { float16x8_t x = vld1q_f16(&X[idx]); float16x8_t y = vld1q_f16(&Y[idx]); - // x*y + accX8 -> accX8 - accX8 = vfmaq_f16(accX8, x, y); - } - - // check at least one batch of 8 is processed - if (N - 8 >= 0) { - __fp16 result[8]; - vst1q_f16(result, accX8); - for (unsigned int i = 0; i < 8; i++) - ret += result[i]; - } - - // processing remaining batch of 4 - for (; (N - idx) >= 4; idx += 4) { - float16x4_t x = vld1_f16(&X[idx]); - float16x4_t y = vld1_f16(&Y[idx]); - - // x*y + accX4 -> accX4 - accX4 = vfma_f16(accX4, x, y); - } - - // check at least one batch of 4 is processed - if (N % 8 >= 4) { - __fp16 result[4]; - vst1_f16(result, accX4); - ret += result[0] + result[1] + result[2] + result[3]; + x = vmulq_f16(x, y); + accX0_3 = vaddq_f32(accX0_3, vcvt_f32_f16(vget_low_f16(x))); + accX4_7 = vaddq_f32(accX4_7, vcvt_f32_f16(vget_high_f16(x))); } + ret += vaddvq_f32(accX0_3) + vaddvq_f32(accX4_7); - // pocessing remaining values + // Loop for remaining indices for (; idx < N; idx++) ret += X[idx] * Y[idx]; - return ret; + return static_cast<__fp16>(ret); } __fp16 hnrm2(const unsigned int N, const __fp16 *X) { @@ -1994,5 +2002,40 @@ void inv_sqrt_inplace(const unsigned int N, __fp16 *X) { } } +bool isValid(const size_t N, const __fp16 *input) { + bool temp = 0; + size_t i = 0; + __fp16 inf_s = std::numeric_limits::infinity(); + float16x8_t inf = vdupq_n_f16(inf_s); + uint16x8_t zero = vdupq_n_f16(0); + + for (; N - i >= 8; i += 8) { + float16x8_t vec = vld1q_f16(&input[i]); + + uint16x8_t vcmp = vceqq_f16(vec, vec); + + vcmp = vceqq_f16(vcmp, zero); + + if (vaddvq_u16(vcmp)) { + return false; + } + + vcmp = vceqq_f16(vec, inf); + + if (vaddvq_u16(vcmp)) { + return false; + } + } + + while (i < N) { + if (input[i] != input[i] || + input[i] == std::numeric_limits::infinity()) { + return false; + } + ++i; + } + return true; +} + #endif } // namespace nntrainer::neon diff --git a/nntrainer/tensor/blas_neon.h b/nntrainer/tensor/blas_neon.h index db1b6a5ccc..978d3428f7 100644 --- a/nntrainer/tensor/blas_neon.h +++ b/nntrainer/tensor/blas_neon.h @@ -148,6 +148,15 @@ void ele_sub(const unsigned N, const float *X, const float *Y, float *Z, void ele_div(const unsigned N, const float *X, const float *Y, float *Z, float alpha = 1.f, float beta = 0.f); +/** + * @brief check if the X has NaN value or Inf + * @note it compare (x!=x || x == inf) + * @param[in] N length of the vector + * @param[in] input float * for Vector X + * @param[out] false if it has NaN or Inf + */ +bool isValid(const size_t N, const float *input); + #ifdef ENABLE_FP16 /** * @brief hgemv computation with neon : Y = alpha*A*X + beta*Y @@ -380,6 +389,15 @@ void hgemm_transAB(const __fp16 *A, const __fp16 *B, float *C, uint32_t M, * @param X __fp16 * for Vector X */ void inv_sqrt_inplace(const unsigned int N, __fp16 *X); + +/** + * @brief check if the X is valid: Check NaN or Inf + * @note it compare (x!=x || x == inf) + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[out] false if it has NaN or Inf + */ +bool isValid(const size_t N, const __fp16 *X); #endif } // namespace nntrainer::neon diff --git a/nntrainer/tensor/hgemm/hgemm.cpp b/nntrainer/tensor/hgemm/hgemm.cpp index a41a5ba6dc..4aaadf331c 100644 --- a/nntrainer/tensor/hgemm/hgemm.cpp 
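
The hdot rewrite above multiplies in FP16 but widens each half of the product vector to FP32 before accumulating, which avoids the overflow and rounding loss of the old FP16 accumulators. A scalar reference with the same numerics, assuming the __fp16 extension already used by the NEON code:

#include <cstddef>

// Multiply in half precision, accumulate in single precision, cast back once,
// matching the vectorized hdot's accumulate-in-FP32 strategy.
__fp16 hdot_ref(size_t n, const __fp16 *x, const __fp16 *y) {
  float acc = 0.0f;
  for (size_t i = 0; i < n; ++i)
    acc += static_cast<float>(x[i]) * static_cast<float>(y[i]);
  return static_cast<__fp16>(acc);
}
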
+++ b/nntrainer/tensor/hgemm/hgemm.cpp @@ -32,15 +32,17 @@ void hgemm_noTrans(const __fp16 *A, const __fp16 *B, float *C32, unsigned int M, unsigned int N, unsigned int K, float alpha, float beta) { if (alpha == 1.F && beta == 0.F) { - if (M % 8 == 0 && N % 16 == 0 && K % 8 == 0) { + // used bitwise operator instead of modulo for performance + // e.g (M % 8) is same as (M & 0x7) which will extract last 3 bits of M + if ((M & 0x7) == 0 && (N & 0xF) == 0 && (K & 0x7) == 0) { hgemm_noTrans_8x16(M, N, K, A, K, B, N, C32, N, alpha, beta); - } else if (M % 8 == 0 && N % 8 == 0 && K % 8 == 0) { + } else if ((M & 0x7) == 0 && (N & 0x7) == 0 && (K & 0x7) == 0) { hgemm_noTrans_8x8(M, N, K, A, K, B, N, C32, N, alpha, beta); - } else if (M % 4 == 0 && N % 8 == 0 && K % 4 == 0) { + } else if ((M & 0x3) == 0 && (N & 0x7) == 0 && (K & 0x3) == 0) { hgemm_noTrans_4x8(M, N, K, A, K, B, N, C32, N, alpha, beta); - } else if (N % 8 == 0) { + } else if ((K & 0x7) == 0 && (N & 0x7) == 0) { hgemm_noTrans_1x8(M, N, K, A, K, B, N, C32, N, alpha, beta); - } else if (N % 4 == 0) { + } else if ((K & 0x7) == 0 && (N & 0x3) == 0) { hgemm_noTrans_1x4(M, N, K, A, K, B, N, C32, N, alpha, beta); } else { hgemm_noTrans_fallback(M, N, K, A, K, B, N, C32, N, alpha, beta); @@ -52,17 +54,19 @@ void hgemm_noTrans(const __fp16 *A, const __fp16 *B, float *C32, unsigned int M, void hgemm_noTrans(const __fp16 *A, const __fp16 *B, __fp16 *C, unsigned int M, unsigned int N, unsigned int K, float alpha, float beta) { if (alpha == 1.F && beta == 0.F) { - if (M % 8 == 0 && N % 16 == 0 && K % 8 == 0) { + // used bitwise operator instead of modulo for performance + // e.g (M % 8) is same as (M & 0x7) which will extract last 3 bits of M + if ((M & 0x7) == 0 && (N & 0xF) == 0 && (K & 0x7) == 0) { hgemm_noTrans_8x16(M, N, K, A, K, B, N, C, N, alpha, beta); - } else if (M % 8 == 0 && N % 8 == 0 && K % 8 == 0) { + } else if ((M & 0x7) == 0 && (N & 0x7) == 0 && (K & 0x7) == 0) { hgemm_noTrans_8x8(M, N, K, A, K, B, N, C, N, alpha, beta); - } else if (M % 4 == 0 && N % 8 == 0 && K % 4 == 0) { + } else if ((M & 0x3) == 0 && (N & 0x7) == 0 && (K & 0x3) == 0) { hgemm_noTrans_4x8(M, N, K, A, K, B, N, C, N, alpha, beta); - } else if (N % 8 == 0) { - hgemm_noTrans_1x8(M, N, K, A, K, B, N, C, N, alpha, beta); - } else if (M % 4 == 0 && N % 4 == 0 && K % 4 == 0) { + } else if ((M & 0x3) == 0 && (N & 0x3) == 0 && (K & 0x3) == 0) { hgemm_noTrans_4x4(M, N, K, A, K, B, N, C, N, alpha, beta); - } else if (N % 4 == 0) { + } else if ((N & 0x7) == 0 && (K & 0x7) == 0) { + hgemm_noTrans_1x8(M, N, K, A, K, B, N, C, N, alpha, beta); + } else if ((N & 0x3) == 0 && (K & 0x7) == 0) { hgemm_noTrans_1x4(M, N, K, A, K, B, N, C, N, alpha, beta); } } @@ -408,6 +412,72 @@ void hgemm_noTrans_1x8(unsigned int M, unsigned int N, unsigned int K, free(sb); } +void hgemm_noTrans_4x4(unsigned int M, unsigned int N, unsigned int K, + const __fp16 *A, unsigned int lda, const __fp16 *B, + unsigned int ldb, float *C, unsigned int ldc, + float alpha, float beta) { + __fp16 *sa = alignedMalloc(M * K); + __fp16 *sb = alignedMalloc(K * N); + + unsigned int ms, mms, ns, ks; + unsigned int m_min, m2_min, n_min, k_min; + for (ms = 0; ms < M; ms += M_BLOCKING) { + m_min = M - ms; + if (m_min > M_BLOCKING) { + m_min = M_BLOCKING; + } + + for (ks = 0; ks < K; ks += k_min) { + k_min = K - ks; + if (k_min >= (K_BLOCKING << 1)) { + k_min = K_BLOCKING; + } else if (k_min > K_BLOCKING) { + k_min = (k_min / 2 + GEMM_UNROLLING_4 - 1) & ~(GEMM_UNROLLING_4 - 1); + } + + n_min = N; + if (N >= N_BLOCKING * 
2) { + n_min = N_BLOCKING; + } else if (N > N_BLOCKING) { + n_min = (n_min / 2 + GEMM_UNROLLING_4 - 1) & ~(GEMM_UNROLLING_4 - 1); + } + packing_B4(k_min, n_min, B + ks * ldb, ldb, sb); + + for (mms = ms; mms < ms + m_min; mms += m2_min) { + m2_min = (ms + m_min) - mms; + if (m2_min >= 3 * GEMM_UNROLLING_4) { + m2_min = 3 * GEMM_UNROLLING_4; + } else if (m2_min >= 2 * GEMM_UNROLLING_4) { + m2_min = 2 * GEMM_UNROLLING_4; + } else if (m2_min > GEMM_UNROLLING_4) { + m2_min = GEMM_UNROLLING_4; + } + + packing_A4(m2_min, k_min, A + mms * lda + ks, lda, + sa + k_min * (mms - ms)); + + HGEMM_KERNEL_4x4(m2_min, n_min, k_min, sa + k_min * (mms - ms), sb, + C + mms * ldc, ldc); + } + + for (ns = n_min; ns < N; ns += n_min) { + n_min = N - ns; + if (n_min >= N_BLOCKING * 2) { + n_min = N_BLOCKING; + } else if (n_min > N_BLOCKING) { + n_min = (n_min / 2 + GEMM_UNROLLING_4 - 1) & ~(GEMM_UNROLLING_4 - 1); + } + + packing_B4(k_min, n_min, B + ns + ldb * ks, ldb, sb); + HGEMM_KERNEL_4x4(m_min, n_min, k_min, sa, sb, C + ms * ldc + ns, ldc); + } + } + } + + free(sa); + free(sb); +} + void hgemm_noTrans_4x8(unsigned int M, unsigned int N, unsigned int K, const __fp16 *A, unsigned int lda, const __fp16 *B, unsigned int ldb, __fp16 *C, unsigned int ldc, diff --git a/nntrainer/tensor/hgemm/hgemm.h b/nntrainer/tensor/hgemm/hgemm.h index b05d89cb01..7c8194edf2 100644 --- a/nntrainer/tensor/hgemm/hgemm.h +++ b/nntrainer/tensor/hgemm/hgemm.h @@ -181,6 +181,26 @@ void hgemm_noTrans_8x8(unsigned int M, unsigned int N, unsigned int K, unsigned int ldb, __fp16 *C, unsigned int ldc, float alpha = 1.F, float beta = 0.F); +/** + * @brief hgemm noTrans computation with 4x4 kernel : C = A*B, + * + * @param M length of the row of matrix A + * @param N length of the col of matrix B + * @param K length of the col of matrix A + * @param A input matrix A + * @param lda length of the col of matrix C + * @param B input matrix B + * @param ldb length of the col of matrix C + * @param C output matrix C + * @param ldc length of the col of matrix C + * @param[in] alpha float number + * @param[in] beta float number + */ +void hgemm_noTrans_4x4(unsigned int M, unsigned int N, unsigned int K, + const __fp16 *A, unsigned int lda, const __fp16 *B, + unsigned int ldb, float *C, unsigned int ldc, + float alpha = 1.F, float beta = 0.F); + /** * @brief hgemm noTrans computation with 8x8 kernel : C = A*B, * diff --git a/nntrainer/tensor/hgemm/hgemm_kernel_4x4.h b/nntrainer/tensor/hgemm/hgemm_kernel_4x4.h index 6166b9407d..7bf75b13b7 100644 --- a/nntrainer/tensor/hgemm/hgemm_kernel_4x4.h +++ b/nntrainer/tensor/hgemm/hgemm_kernel_4x4.h @@ -14,6 +14,193 @@ #include #include +#define INIT_KERNEL_4x4() \ + v24 = vdup_n_f16(0.F); \ + v25 = vdup_n_f16(0.F); \ + v26 = vdup_n_f16(0.F); \ + v27 = vdup_n_f16(0.F); + +// 1. 
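
The hgemm_noTrans dispatch above swaps checks of the form M % 8 == 0 for (M & 0x7) == 0; for a power-of-two divisor the remainder is exactly the low bits, so the two forms are equivalent, and compilers typically perform the same strength reduction for unsigned operands anyway. Two compile-time checks of the identity:

static_assert((24u & 0x7u) == 0 && 24u % 8u == 0, "divisible by 8");
static_assert((25u & 0x7u) == (25u % 8u), "low three bits are the remainder");
// General form, for unsigned x and power-of-two divisor (1u << k):
//   x % (1u << k) == (x & ((1u << k) - 1u))
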
Partial sum 256 digits +#define KERNEL_4x4_ACC16() \ + dv0 = vld1_f16(a); \ + vb0 = vld1_f16(b); \ + v24 = vfma_lane_f16(v24, vb0, dv0, 0); \ + v25 = vfma_lane_f16(v25, vb0, dv0, 1); \ + v26 = vfma_lane_f16(v26, vb0, dv0, 2); \ + v27 = vfma_lane_f16(v27, vb0, dv0, 3); \ + dv1 = vld1_f16(a + 4); \ + vb1 = vld1_f16(b + 4); \ + v24 = vfma_lane_f16(v24, vb1, dv1, 0); \ + v25 = vfma_lane_f16(v25, vb1, dv1, 1); \ + v26 = vfma_lane_f16(v26, vb1, dv1, 2); \ + v27 = vfma_lane_f16(v27, vb1, dv1, 3); \ + dv2 = vld1_f16(a + 4 * 2); \ + vb2 = vld1_f16(b + 4 * 2); \ + v24 = vfma_lane_f16(v24, vb2, dv2, 0); \ + v25 = vfma_lane_f16(v25, vb2, dv2, 1); \ + v26 = vfma_lane_f16(v26, vb2, dv2, 2); \ + v27 = vfma_lane_f16(v27, vb2, dv2, 3); \ + dv3 = vld1_f16(a + 4 * 3); \ + vb3 = vld1_f16(b + 4 * 3); \ + v24 = vfma_lane_f16(v24, vb3, dv3, 0); \ + v25 = vfma_lane_f16(v25, vb3, dv3, 1); \ + v26 = vfma_lane_f16(v26, vb3, dv3, 2); \ + v27 = vfma_lane_f16(v27, vb3, dv3, 3); \ + dv4 = vld1_f16(a + 4 * 4); \ + vb4 = vld1_f16(b + 4 * 4); \ + v24 = vfma_lane_f16(v24, vb4, dv4, 0); \ + v25 = vfma_lane_f16(v25, vb4, dv4, 1); \ + v26 = vfma_lane_f16(v26, vb4, dv4, 2); \ + v27 = vfma_lane_f16(v27, vb4, dv4, 3); \ + dv5 = vld1_f16(a + 4 * 5); \ + vb5 = vld1_f16(b + 4 * 5); \ + v24 = vfma_lane_f16(v24, vb5, dv5, 0); \ + v25 = vfma_lane_f16(v25, vb5, dv5, 1); \ + v26 = vfma_lane_f16(v26, vb5, dv5, 2); \ + v27 = vfma_lane_f16(v27, vb5, dv5, 3); \ + dv6 = vld1_f16(a + 4 * 6); \ + vb6 = vld1_f16(b + 4 * 6); \ + v24 = vfma_lane_f16(v24, vb6, dv6, 0); \ + v25 = vfma_lane_f16(v25, vb6, dv6, 1); \ + v26 = vfma_lane_f16(v26, vb6, dv6, 2); \ + v27 = vfma_lane_f16(v27, vb6, dv6, 3); \ + dv7 = vld1_f16(a + 4 * 7); \ + vb7 = vld1_f16(b + 4 * 7); \ + v24 = vfma_lane_f16(v24, vb7, dv7, 0); \ + v25 = vfma_lane_f16(v25, vb7, dv7, 1); \ + v26 = vfma_lane_f16(v26, vb7, dv7, 2); \ + v27 = vfma_lane_f16(v27, vb7, dv7, 3); \ + dv7 = vld1_f16(a + 4 * 8); \ + vb7 = vld1_f16(b + 4 * 8); \ + v24 = vfma_lane_f16(v24, vb7, dv7, 0); \ + v25 = vfma_lane_f16(v25, vb7, dv7, 1); \ + v26 = vfma_lane_f16(v26, vb7, dv7, 2); \ + v27 = vfma_lane_f16(v27, vb7, dv7, 3); \ + dv7 = vld1_f16(a + 4 * 9); \ + vb7 = vld1_f16(b + 4 * 9); \ + v24 = vfma_lane_f16(v24, vb7, dv7, 0); \ + v25 = vfma_lane_f16(v25, vb7, dv7, 1); \ + v26 = vfma_lane_f16(v26, vb7, dv7, 2); \ + v27 = vfma_lane_f16(v27, vb7, dv7, 3); \ + dv7 = vld1_f16(a + 4 * 10); \ + vb7 = vld1_f16(b + 4 * 10); \ + v24 = vfma_lane_f16(v24, vb7, dv7, 0); \ + v25 = vfma_lane_f16(v25, vb7, dv7, 1); \ + v26 = vfma_lane_f16(v26, vb7, dv7, 2); \ + v27 = vfma_lane_f16(v27, vb7, dv7, 3); \ + dv7 = vld1_f16(a + 4 * 11); \ + vb7 = vld1_f16(b + 4 * 11); \ + v24 = vfma_lane_f16(v24, vb7, dv7, 0); \ + v25 = vfma_lane_f16(v25, vb7, dv7, 1); \ + v26 = vfma_lane_f16(v26, vb7, dv7, 2); \ + v27 = vfma_lane_f16(v27, vb7, dv7, 3); \ + dv7 = vld1_f16(a + 4 * 12); \ + vb7 = vld1_f16(b + 4 * 12); \ + v24 = vfma_lane_f16(v24, vb7, dv7, 0); \ + v25 = vfma_lane_f16(v25, vb7, dv7, 1); \ + v26 = vfma_lane_f16(v26, vb7, dv7, 2); \ + v27 = vfma_lane_f16(v27, vb7, dv7, 3); \ + dv7 = vld1_f16(a + 4 * 13); \ + vb7 = vld1_f16(b + 4 * 13); \ + v24 = vfma_lane_f16(v24, vb7, dv7, 0); \ + v25 = vfma_lane_f16(v25, vb7, dv7, 1); \ + v26 = vfma_lane_f16(v26, vb7, dv7, 2); \ + v27 = vfma_lane_f16(v27, vb7, dv7, 3); \ + dv7 = vld1_f16(a + 4 * 14); \ + vb7 = vld1_f16(b + 4 * 14); \ + v24 = vfma_lane_f16(v24, vb7, dv7, 0); \ + v25 = vfma_lane_f16(v25, vb7, dv7, 1); \ + v26 = vfma_lane_f16(v26, vb7, dv7, 2); \ + v27 = vfma_lane_f16(v27, vb7, dv7, 3); \ + dv7 
= vld1_f16(a + 4 * 15); \ + vb7 = vld1_f16(b + 4 * 15); \ + v24 = vfma_lane_f16(v24, vb7, dv7, 0); \ + v25 = vfma_lane_f16(v25, vb7, dv7, 1); \ + v26 = vfma_lane_f16(v26, vb7, dv7, 2); \ + v27 = vfma_lane_f16(v27, vb7, dv7, 3); \ + l += 16; \ + __builtin_prefetch(b + 64, 0, 3); \ + __builtin_prefetch(a + 64, 0, 3); \ + b += 4 * 16; \ + a += 4 * 16; + +// 2. Partial sum 128 digits +#define KERNEL_4x4_ACC8() \ + dv0 = vld1_f16(a); \ + vb0 = vld1_f16(b); \ + v24 = vfma_lane_f16(v24, vb0, dv0, 0); \ + v25 = vfma_lane_f16(v25, vb0, dv0, 1); \ + v26 = vfma_lane_f16(v26, vb0, dv0, 2); \ + v27 = vfma_lane_f16(v27, vb0, dv0, 3); \ + dv1 = vld1_f16(a + 4); \ + vb1 = vld1_f16(b + 4); \ + v24 = vfma_lane_f16(v24, vb1, dv1, 0); \ + v25 = vfma_lane_f16(v25, vb1, dv1, 1); \ + v26 = vfma_lane_f16(v26, vb1, dv1, 2); \ + v27 = vfma_lane_f16(v27, vb1, dv1, 3); \ + dv2 = vld1_f16(a + 8); \ + vb2 = vld1_f16(b + 8); \ + v24 = vfma_lane_f16(v24, vb2, dv2, 0); \ + v25 = vfma_lane_f16(v25, vb2, dv2, 1); \ + v26 = vfma_lane_f16(v26, vb2, dv2, 2); \ + v27 = vfma_lane_f16(v27, vb2, dv2, 3); \ + dv3 = vld1_f16(a + 12); \ + vb3 = vld1_f16(b + 12); \ + v24 = vfma_lane_f16(v24, vb3, dv3, 0); \ + v25 = vfma_lane_f16(v25, vb3, dv3, 1); \ + v26 = vfma_lane_f16(v26, vb3, dv3, 2); \ + v27 = vfma_lane_f16(v27, vb3, dv3, 3); \ + dv4 = vld1_f16(a + 16); \ + vb4 = vld1_f16(b + 16); \ + v24 = vfma_lane_f16(v24, vb4, dv4, 0); \ + v25 = vfma_lane_f16(v25, vb4, dv4, 1); \ + v26 = vfma_lane_f16(v26, vb4, dv4, 2); \ + v27 = vfma_lane_f16(v27, vb4, dv4, 3); \ + dv5 = vld1_f16(a + 20); \ + vb5 = vld1_f16(b + 20); \ + v24 = vfma_lane_f16(v24, vb5, dv5, 0); \ + v25 = vfma_lane_f16(v25, vb5, dv5, 1); \ + v26 = vfma_lane_f16(v26, vb5, dv5, 2); \ + v27 = vfma_lane_f16(v27, vb5, dv5, 3); \ + dv6 = vld1_f16(a + 24); \ + vb6 = vld1_f16(b + 24); \ + v24 = vfma_lane_f16(v24, vb6, dv6, 0); \ + v25 = vfma_lane_f16(v25, vb6, dv6, 1); \ + v26 = vfma_lane_f16(v26, vb6, dv6, 2); \ + v27 = vfma_lane_f16(v27, vb6, dv6, 3); \ + dv7 = vld1_f16(a + 28); \ + vb7 = vld1_f16(b + 28); \ + v24 = vfma_lane_f16(v24, vb7, dv7, 0); \ + v25 = vfma_lane_f16(v25, vb7, dv7, 1); \ + v26 = vfma_lane_f16(v26, vb7, dv7, 2); \ + v27 = vfma_lane_f16(v27, vb7, dv7, 3); \ + l += 8; \ + __builtin_prefetch(b + 32, 0, 3); \ + __builtin_prefetch(a + 32, 0, 3); \ + b += 4 * 8; \ + a += 4 * 8; + +// 2. 
Partial sum 16 digits +#define KERNEL_4x4_ACC1() \ + dv0 = vld1_f16(a); \ + vb0 = vld1_f16(b); \ + v24 = vfma_lane_f16(v24, vb0, dv0, 0); \ + v25 = vfma_lane_f16(v25, vb0, dv0, 1); \ + v26 = vfma_lane_f16(v26, vb0, dv0, 2); \ + v27 = vfma_lane_f16(v27, vb0, dv0, 3); \ + l += 1; \ + __builtin_prefetch(b + 4, 0, 3); \ + __builtin_prefetch(a + 4, 0, 3); \ + b += 4 * 1; \ + a += 4 * 1; + +#define SAVE_KERNEL_4X4_F16_F32() \ + vst1q_f32(c, vaddq_f32(vld1q_f32(c), vcvt_f32_f16(v24))); \ + vst1q_f32(c + ldc, vaddq_f32(vld1q_f32(c + ldc), vcvt_f32_f16(v25))); \ + vst1q_f32(c + 2 * ldc, \ + vaddq_f32(vld1q_f32(c + 2 * ldc), vcvt_f32_f16(v26))); \ + vst1q_f32(c + 3 * ldc, vaddq_f32(vld1q_f32(c + 3 * ldc), vcvt_f32_f16(v27))); + /** * @brief hgemm 4x4 kernel sc = sa * sb * @@ -37,10 +224,11 @@ void hgemm_kernel_4x4(unsigned int M, unsigned int N, unsigned int K, __builtin_prefetch(b, 0, 3); __builtin_prefetch(a, 0, 3); - float16x4_t v24 = {0}; - float16x4_t v25 = {0}; - float16x4_t v26 = {0}; - float16x4_t v27 = {0}; + float16x4_t v24; + float16x4_t v25; + float16x4_t v26; + float16x4_t v27; + INIT_KERNEL_4x4(); for (l = 0; l < K; l += VL_FP16_HALF) { float16x4_t v0 = vld1_f16(b); @@ -101,3 +289,59 @@ void hgemm_kernel_4x4(unsigned int M, unsigned int N, unsigned int K, b = sb; } } + +/** + * @brief hgemm 4x4 kernel sc = sa * sb + * + * @param m length of the row of matrix A + * @param n length of the col of matrix B + * @param k length of the col of matrix A + * @param sa sub-matrix of input matrix A + * @param sb sub-matrix of input matrix B + * @param sc sub-matrix of output matrix C + * @param ldc leading dimension of matrix C + */ +void hgemm_kernel_4x4(unsigned int M, unsigned int N, unsigned int K, + __fp16 *sa, __fp16 *sb, float *sc, unsigned int ldc) { + assert(M > 0 && N > 0 && K > 0); + assert(M % 4 == 0 && N % 4 == 0 && K % 4 == 0); + + __fp16 *a = sa, *b = sb; + float *c = sc; + unsigned int i, j, l; + unsigned int K16 = (K >> 4) << 4; + unsigned int K8 = (K >> 3) << 3; + for (i = 0; i < M; i += VL_FP16_HALF) { + for (j = 0; j < N; j += VL_FP16_HALF) { + __builtin_prefetch(b, 0, 3); + __builtin_prefetch(a, 0, 3); + + float16x4_t v24, v25, v26, v27; + float16x4_t dv0, dv1, dv2, dv3, dv4, dv5, dv6, dv7; + float16x4_t vb0, vb1, vb2, vb3, vb4, vb5, vb6, vb7; + l = 0; + for (; l < K16;) { + INIT_KERNEL_4x4(); + KERNEL_4x4_ACC16(); + SAVE_KERNEL_4X4_F16_F32(); + } + for (; l < K8;) { + INIT_KERNEL_4x4(); + KERNEL_4x4_ACC8(); + SAVE_KERNEL_4X4_F16_F32(); + } + for (; l < K;) { + INIT_KERNEL_4x4(); + KERNEL_4x4_ACC1(); + SAVE_KERNEL_4X4_F16_F32(); + } + + c += 4; + a -= 4 * K; + } + sc += ldc * 4; + c = sc; + a += 4 * K; + b = sb; + } +} diff --git a/nntrainer/tensor/hgemm/hgemm_kernel_4x8.h b/nntrainer/tensor/hgemm/hgemm_kernel_4x8.h index dce6659934..01204457e9 100644 --- a/nntrainer/tensor/hgemm/hgemm_kernel_4x8.h +++ b/nntrainer/tensor/hgemm/hgemm_kernel_4x8.h @@ -14,15 +14,118 @@ #include #include -/// @note Following KERNELs are the combinations of accuracy-latency -/// tradeoff. User can select which kernel to use by replacing them. +#define INIT_KERNEL_4X8() \ + v0 = vdupq_n_f16(0.F); \ + v3 = vdupq_n_f16(0.F); \ + v6 = vdupq_n_f16(0.F); \ + v9 = vdupq_n_f16(0.F); -// 1. Partial sum 256 digits : worst accuracy, best latency +// 1. 
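
The new 4x4 path accumulates at most 16, 8, or 1 rank-one updates in FP16 registers (the ACC16/ACC8/ACC1 macros) and then SAVE_KERNEL_4X4_F16_F32 widens the partial sums and adds them into the FP32 output, so a half-precision partial sum never spans more than a small block of K. The scalar sketch below mirrors that blocking idea for a single dot product; blocked_dot and block are illustrative names, and the __fp16 extension is assumed as above.

#include <algorithm>
#include <cstddef>

float blocked_dot(size_t n, const __fp16 *a, const __fp16 *b, size_t block) {
  float c = 0.0f;                               // FP32 destination, like C32
  for (size_t i = 0; i < n;) {
    __fp16 partial = static_cast<__fp16>(0.0f); // one "ACC" block kept in FP16
    size_t end = std::min(n, i + block);
    for (; i < end; ++i)
      partial += a[i] * b[i];
    c += static_cast<float>(partial);           // SAVE: widen and add into FP32
  }
  return c;
}
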
Partial sum 256 digits +#define KERNEL_4x8_ACC16() \ + dv0 = vld1_f16(a); \ + v24 = vld1q_f16(b); \ + v0 = vfmaq_lane_f16(v0, v24, dv0, 0); \ + v3 = vfmaq_lane_f16(v3, v24, dv0, 1); \ + v6 = vfmaq_lane_f16(v6, v24, dv0, 2); \ + v9 = vfmaq_lane_f16(v9, v24, dv0, 3); \ + dv1 = vld1_f16(a + 4); \ + v25 = vld1q_f16(b + 8); \ + v0 = vfmaq_lane_f16(v0, v25, dv1, 0); \ + v3 = vfmaq_lane_f16(v3, v25, dv1, 1); \ + v6 = vfmaq_lane_f16(v6, v25, dv1, 2); \ + v9 = vfmaq_lane_f16(v9, v25, dv1, 3); \ + dv2 = vld1_f16(a + 4 * 2); \ + v26 = vld1q_f16(b + 8 * 2); \ + v0 = vfmaq_lane_f16(v0, v26, dv2, 0); \ + v3 = vfmaq_lane_f16(v3, v26, dv2, 1); \ + v6 = vfmaq_lane_f16(v6, v26, dv2, 2); \ + v9 = vfmaq_lane_f16(v9, v26, dv2, 3); \ + dv3 = vld1_f16(a + 4 * 3); \ + v27 = vld1q_f16(b + 8 * 3); \ + v0 = vfmaq_lane_f16(v0, v27, dv3, 0); \ + v3 = vfmaq_lane_f16(v3, v27, dv3, 1); \ + v6 = vfmaq_lane_f16(v6, v27, dv3, 2); \ + v9 = vfmaq_lane_f16(v9, v27, dv3, 3); \ + dv4 = vld1_f16(a + 4 * 4); \ + v28 = vld1q_f16(b + 8 * 4); \ + v0 = vfmaq_lane_f16(v0, v28, dv4, 0); \ + v3 = vfmaq_lane_f16(v3, v28, dv4, 1); \ + v6 = vfmaq_lane_f16(v6, v28, dv4, 2); \ + v9 = vfmaq_lane_f16(v9, v28, dv4, 3); \ + dv5 = vld1_f16(a + 4 * 5); \ + v29 = vld1q_f16(b + 8 * 5); \ + v0 = vfmaq_lane_f16(v0, v29, dv5, 0); \ + v3 = vfmaq_lane_f16(v3, v29, dv5, 1); \ + v6 = vfmaq_lane_f16(v6, v29, dv5, 2); \ + v9 = vfmaq_lane_f16(v9, v29, dv5, 3); \ + dv6 = vld1_f16(a + 4 * 6); \ + v30 = vld1q_f16(b + 8 * 6); \ + v0 = vfmaq_lane_f16(v0, v30, dv6, 0); \ + v3 = vfmaq_lane_f16(v3, v30, dv6, 1); \ + v6 = vfmaq_lane_f16(v6, v30, dv6, 2); \ + v9 = vfmaq_lane_f16(v9, v30, dv6, 3); \ + dv7 = vld1_f16(a + 4 * 7); \ + v31 = vld1q_f16(b + 8 * 7); \ + v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \ + v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \ + v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \ + v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \ + dv7 = vld1_f16(a + 4 * 8); \ + v31 = vld1q_f16(b + 8 * 8); \ + v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \ + v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \ + v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \ + v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \ + dv7 = vld1_f16(a + 4 * 9); \ + v31 = vld1q_f16(b + 8 * 9); \ + v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \ + v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \ + v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \ + v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \ + dv7 = vld1_f16(a + 4 * 10); \ + v31 = vld1q_f16(b + 8 * 10); \ + v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \ + v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \ + v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \ + v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \ + dv7 = vld1_f16(a + 4 * 11); \ + v31 = vld1q_f16(b + 8 * 11); \ + v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \ + v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \ + v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \ + v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \ + dv7 = vld1_f16(a + 4 * 12); \ + v31 = vld1q_f16(b + 8 * 12); \ + v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \ + v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \ + v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \ + v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \ + dv7 = vld1_f16(a + 4 * 13); \ + v31 = vld1q_f16(b + 8 * 13); \ + v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \ + v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \ + v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \ + v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \ + dv7 = vld1_f16(a + 4 * 14); \ + v31 = vld1q_f16(b + 8 * 14); \ + v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \ + v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \ + v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \ + v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \ + dv7 = vld1_f16(a + 4 * 15); \ + v31 = 
vld1q_f16(b + 8 * 15); \ + v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \ + v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \ + v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \ + v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \ + l += 16; \ + __builtin_prefetch(b + 128, 0, 3); \ + __builtin_prefetch(a + 64, 0, 3); \ + b += 8 * 16; \ + a += 4 * 16; + +// 1. Partial sum 256 digits #define KERNEL_4x8_ACC8() \ - v0 = vdupq_n_f16(0.F); \ - v3 = vdupq_n_f16(0.F); \ - v6 = vdupq_n_f16(0.F); \ - v9 = vdupq_n_f16(0.F); \ dv0 = vld1_f16(a); \ v24 = vld1q_f16(b); \ v0 = vfmaq_lane_f16(v0, v24, dv0, 0); \ @@ -77,12 +180,8 @@ b += 8 * 8; \ a += 4 * 8; -// 2. Partial sum 128 digits : medium accuracy, medium latency +// 2. Partial sum 128 digits #define KERNEL_4x8_ACC4() \ - v0 = vdupq_n_f16(0.F); \ - v3 = vdupq_n_f16(0.F); \ - v6 = vdupq_n_f16(0.F); \ - v9 = vdupq_n_f16(0.F); \ dv0 = vld1_f16(a); \ v24 = vld1q_f16(b); \ v0 = vfmaq_lane_f16(v0, v24, dv0, 0); \ @@ -113,12 +212,8 @@ b += 8 * 4; \ a += 4 * 4; -// 3. Partial sum 32 digits : Best accuracy, worst latency +// 3. Partial sum 32 digits #define KERNEL_4x8_ACC1() \ - v0 = vdupq_n_f16(0.F); \ - v3 = vdupq_n_f16(0.F); \ - v6 = vdupq_n_f16(0.F); \ - v9 = vdupq_n_f16(0.F); \ dv0 = vld1_f16(a); \ v24 = vld1q_f16(b); \ v0 = vfmaq_lane_f16(v0, v24, dv0, 0); \ @@ -131,6 +226,24 @@ b += 8 * 1; \ a += 4 * 1; +#define SAVE_KERNEL_4X8_F16_F32() \ + vst1q_f32(c, vaddq_f32(vld1q_f32(c), vcvt_f32_f16(vget_low_f16(v0)))); \ + vst1q_f32(c + ldc, \ + vaddq_f32(vld1q_f32(c + ldc), vcvt_f32_f16(vget_low_f16(v3)))); \ + vst1q_f32(c + 2 * ldc, vaddq_f32(vld1q_f32(c + 2 * ldc), \ + vcvt_f32_f16(vget_low_f16(v6)))); \ + vst1q_f32(c + 3 * ldc, vaddq_f32(vld1q_f32(c + 3 * ldc), \ + vcvt_f32_f16(vget_low_f16(v9)))); \ + \ + vst1q_f32(c + 4, \ + vaddq_f32(vld1q_f32(c + 4), vcvt_f32_f16(vget_high_f16(v0)))); \ + vst1q_f32(c + 4 + ldc, vaddq_f32(vld1q_f32(c + 4 + ldc), \ + vcvt_f32_f16(vget_high_f16(v3)))); \ + vst1q_f32(c + 4 + 2 * ldc, vaddq_f32(vld1q_f32(c + 4 + 2 * ldc), \ + vcvt_f32_f16(vget_high_f16(v6)))); \ + vst1q_f32(c + 4 + 3 * ldc, vaddq_f32(vld1q_f32(c + 4 + 3 * ldc), \ + vcvt_f32_f16(vget_high_f16(v9)))); + /** * @brief hgemm 4x8 kernel sc = sa * sb * @@ -148,7 +261,7 @@ void hgemm_kernel_4x8(unsigned int M, unsigned int N, unsigned int K, assert(M % 4 == 0 && N % 8 == 0); __fp16 *a = sa, *b = sb, *c = sc; - unsigned int k8 = (K >> 3) << 3; + unsigned int K8 = (K >> 3) << 3; unsigned int i, j, l; for (i = 0; i < M; i += 4) { for (j = 0; j < N; j += 8) { @@ -157,23 +270,18 @@ void hgemm_kernel_4x8(unsigned int M, unsigned int N, unsigned int K, float16x8_t v0, v3, v6, v9; float16x8_t v24, v25, v26, v27, v28, v29, v30, v31; float16x4_t dv0, dv1, dv2, dv3, dv4, dv5, dv6, dv7; + INIT_KERNEL_4X8(); l = 0; - for (; l < k8;) { + for (; l < K8;) { KERNEL_4x8_ACC8(); - - vst1q_f16(c, vaddq_f16(vld1q_f16(c), v0)); - vst1q_f16(c + ldc, vaddq_f16(vld1q_f16(c + ldc), v3)); - vst1q_f16(c + 2 * ldc, vaddq_f16(vld1q_f16(c + 2 * ldc), v6)); - vst1q_f16(c + 3 * ldc, vaddq_f16(vld1q_f16(c + 3 * ldc), v9)); } for (; l < K;) { KERNEL_4x8_ACC1(); - - vst1q_f16(c, vaddq_f16(vld1q_f16(c), v0)); - vst1q_f16(c + ldc, vaddq_f16(vld1q_f16(c + ldc), v3)); - vst1q_f16(c + 2 * ldc, vaddq_f16(vld1q_f16(c + 2 * ldc), v6)); - vst1q_f16(c + 3 * ldc, vaddq_f16(vld1q_f16(c + 3 * ldc), v9)); } + vst1q_f16(c, vaddq_f16(vld1q_f16(c), v0)); + vst1q_f16(c + ldc, vaddq_f16(vld1q_f16(c + ldc), v3)); + vst1q_f16(c + 2 * ldc, vaddq_f16(vld1q_f16(c + 2 * ldc), v6)); + vst1q_f16(c + 3 * ldc, vaddq_f16(vld1q_f16(c + 3 * ldc), v9)); c += 8; a -= 4 
* K; } @@ -202,7 +310,9 @@ void hgemm_kernel_4x8(unsigned int M, unsigned int N, unsigned int K, __fp16 *a = sa, *b = sb; float *c = sc; - unsigned int k8 = (K >> 3) << 3; + unsigned int K16 = (K >> 4) << 4; + unsigned int K8 = (K >> 3) << 3; + unsigned int K4 = (K >> 2) << 2; unsigned int i, j, l; for (i = 0; i < M; i += 4) { for (j = 0; j < N; j += 8) { @@ -212,45 +322,25 @@ void hgemm_kernel_4x8(unsigned int M, unsigned int N, unsigned int K, float16x8_t v24, v25, v26, v27, v28, v29, v30, v31; float16x4_t dv0, dv1, dv2, dv3, dv4, dv5, dv6, dv7; l = 0; - for (; l < k8;) { + for (; l < K16;) { + INIT_KERNEL_4X8(); + KERNEL_4x8_ACC16(); + SAVE_KERNEL_4X8_F16_F32(); + } + for (; l < K8;) { + INIT_KERNEL_4X8(); KERNEL_4x8_ACC8(); - - vst1q_f32(c, vaddq_f32(vld1q_f32(c), vcvt_f32_f16(vget_low_f16(v0)))); - vst1q_f32(c + ldc, vaddq_f32(vld1q_f32(c + ldc), - vcvt_f32_f16(vget_low_f16(v3)))); - vst1q_f32(c + 2 * ldc, vaddq_f32(vld1q_f32(c + 2 * ldc), - vcvt_f32_f16(vget_low_f16(v6)))); - vst1q_f32(c + 3 * ldc, vaddq_f32(vld1q_f32(c + 3 * ldc), - vcvt_f32_f16(vget_low_f16(v9)))); - - vst1q_f32(c + 4, - vaddq_f32(vld1q_f32(c + 4), vcvt_f32_f16(vget_high_f16(v0)))); - vst1q_f32(c + 4 + ldc, vaddq_f32(vld1q_f32(c + 4 + ldc), - vcvt_f32_f16(vget_high_f16(v3)))); - vst1q_f32(c + 4 + 2 * ldc, vaddq_f32(vld1q_f32(c + 4 + 2 * ldc), - vcvt_f32_f16(vget_high_f16(v6)))); - vst1q_f32(c + 4 + 3 * ldc, vaddq_f32(vld1q_f32(c + 4 + 3 * ldc), - vcvt_f32_f16(vget_high_f16(v9)))); + SAVE_KERNEL_4X8_F16_F32(); + } + for (; l < K4;) { + INIT_KERNEL_4X8(); + KERNEL_4x8_ACC4(); + SAVE_KERNEL_4X8_F16_F32(); } for (; l < K;) { + INIT_KERNEL_4X8(); KERNEL_4x8_ACC1(); - - vst1q_f32(c, vaddq_f32(vld1q_f32(c), vcvt_f32_f16(vget_low_f16(v0)))); - vst1q_f32(c + ldc, vaddq_f32(vld1q_f32(c + ldc), - vcvt_f32_f16(vget_low_f16(v3)))); - vst1q_f32(c + 2 * ldc, vaddq_f32(vld1q_f32(c + 2 * ldc), - vcvt_f32_f16(vget_low_f16(v6)))); - vst1q_f32(c + 3 * ldc, vaddq_f32(vld1q_f32(c + 3 * ldc), - vcvt_f32_f16(vget_low_f16(v9)))); - - vst1q_f32(c + 4, - vaddq_f32(vld1q_f32(c + 4), vcvt_f32_f16(vget_high_f16(v0)))); - vst1q_f32(c + 4 + ldc, vaddq_f32(vld1q_f32(c + 4 + ldc), - vcvt_f32_f16(vget_high_f16(v3)))); - vst1q_f32(c + 4 + 2 * ldc, vaddq_f32(vld1q_f32(c + 4 + 2 * ldc), - vcvt_f32_f16(vget_high_f16(v6)))); - vst1q_f32(c + 4 + 3 * ldc, vaddq_f32(vld1q_f32(c + 4 + 3 * ldc), - vcvt_f32_f16(vget_high_f16(v9)))); + SAVE_KERNEL_4X8_F16_F32(); } c += 8; a -= 4 * K; diff --git a/nntrainer/tensor/hgemm/hgemm_kernel_8x16.h b/nntrainer/tensor/hgemm/hgemm_kernel_8x16.h index 7cac545809..a89a6b5421 100644 --- a/nntrainer/tensor/hgemm/hgemm_kernel_8x16.h +++ b/nntrainer/tensor/hgemm/hgemm_kernel_8x16.h @@ -14,27 +14,338 @@ #include #include -/// @note Following KERNELs are the combinations of accuracy-latency -/// tradeoff. User can select which kernel to use by replacing them. +#define INIT_KERNEL_8X16() \ + v0_7 = vdupq_n_f16(0.F); \ + v8_15 = vdupq_n_f16(0.F); \ + v16_23 = vdupq_n_f16(0.F); \ + v24_31 = vdupq_n_f16(0.F); \ + v32_39 = vdupq_n_f16(0.F); \ + v40_47 = vdupq_n_f16(0.F); \ + v48_55 = vdupq_n_f16(0.F); \ + v56_63 = vdupq_n_f16(0.F); \ + v64_71 = vdupq_n_f16(0.F); \ + v72_79 = vdupq_n_f16(0.F); \ + v80_87 = vdupq_n_f16(0.F); \ + v88_95 = vdupq_n_f16(0.F); \ + v96_103 = vdupq_n_f16(0.F); \ + v104_111 = vdupq_n_f16(0.F); \ + v112_119 = vdupq_n_f16(0.F); \ + v120_127 = vdupq_n_f16(0.F); -// 1. Partial sum 1024 digits : Worst accuracy, best latency +// 1. 
Partial sum 2048 digits +#define KERNEL_8x16_ACC16() \ + va0 = vld1q_f16(a); \ + v24 = vld1q_f16(b); \ + v25 = vld1q_f16(b + 8); \ + v0_7 = vfmaq_laneq_f16(v0_7, v24, va0, 0); \ + v8_15 = vfmaq_laneq_f16(v8_15, v24, va0, 1); \ + v16_23 = vfmaq_laneq_f16(v16_23, v24, va0, 2); \ + v24_31 = vfmaq_laneq_f16(v24_31, v24, va0, 3); \ + v32_39 = vfmaq_laneq_f16(v32_39, v24, va0, 4); \ + v40_47 = vfmaq_laneq_f16(v40_47, v24, va0, 5); \ + v48_55 = vfmaq_laneq_f16(v48_55, v24, va0, 6); \ + v56_63 = vfmaq_laneq_f16(v56_63, v24, va0, 7); \ + v64_71 = vfmaq_laneq_f16(v64_71, v25, va0, 0); \ + v72_79 = vfmaq_laneq_f16(v72_79, v25, va0, 1); \ + v80_87 = vfmaq_laneq_f16(v80_87, v25, va0, 2); \ + v88_95 = vfmaq_laneq_f16(v88_95, v25, va0, 3); \ + v96_103 = vfmaq_laneq_f16(v96_103, v25, va0, 4); \ + v104_111 = vfmaq_laneq_f16(v104_111, v25, va0, 5); \ + v112_119 = vfmaq_laneq_f16(v112_119, v25, va0, 6); \ + v120_127 = vfmaq_laneq_f16(v120_127, v25, va0, 7); \ + va1 = vld1q_f16(a + 8); \ + v26 = vld1q_f16(b + 8 * 2); \ + v27 = vld1q_f16(b + 8 * 3); \ + v0_7 = vfmaq_laneq_f16(v0_7, v26, va1, 0); \ + v8_15 = vfmaq_laneq_f16(v8_15, v26, va1, 1); \ + v16_23 = vfmaq_laneq_f16(v16_23, v26, va1, 2); \ + v24_31 = vfmaq_laneq_f16(v24_31, v26, va1, 3); \ + v32_39 = vfmaq_laneq_f16(v32_39, v26, va1, 4); \ + v40_47 = vfmaq_laneq_f16(v40_47, v26, va1, 5); \ + v48_55 = vfmaq_laneq_f16(v48_55, v26, va1, 6); \ + v56_63 = vfmaq_laneq_f16(v56_63, v26, va1, 7); \ + v64_71 = vfmaq_laneq_f16(v64_71, v27, va1, 0); \ + v72_79 = vfmaq_laneq_f16(v72_79, v27, va1, 1); \ + v80_87 = vfmaq_laneq_f16(v80_87, v27, va1, 2); \ + v88_95 = vfmaq_laneq_f16(v88_95, v27, va1, 3); \ + v96_103 = vfmaq_laneq_f16(v96_103, v27, va1, 4); \ + v104_111 = vfmaq_laneq_f16(v104_111, v27, va1, 5); \ + v112_119 = vfmaq_laneq_f16(v112_119, v27, va1, 6); \ + v120_127 = vfmaq_laneq_f16(v120_127, v27, va1, 7); \ + va2 = vld1q_f16(a + 8 * 2); \ + v28 = vld1q_f16(b + 8 * 4); \ + v29 = vld1q_f16(b + 8 * 5); \ + v0_7 = vfmaq_laneq_f16(v0_7, v28, va2, 0); \ + v8_15 = vfmaq_laneq_f16(v8_15, v28, va2, 1); \ + v16_23 = vfmaq_laneq_f16(v16_23, v28, va2, 2); \ + v24_31 = vfmaq_laneq_f16(v24_31, v28, va2, 3); \ + v32_39 = vfmaq_laneq_f16(v32_39, v28, va2, 4); \ + v40_47 = vfmaq_laneq_f16(v40_47, v28, va2, 5); \ + v48_55 = vfmaq_laneq_f16(v48_55, v28, va2, 6); \ + v56_63 = vfmaq_laneq_f16(v56_63, v28, va2, 7); \ + v64_71 = vfmaq_laneq_f16(v64_71, v29, va2, 0); \ + v72_79 = vfmaq_laneq_f16(v72_79, v29, va2, 1); \ + v80_87 = vfmaq_laneq_f16(v80_87, v29, va2, 2); \ + v88_95 = vfmaq_laneq_f16(v88_95, v29, va2, 3); \ + v96_103 = vfmaq_laneq_f16(v96_103, v29, va2, 4); \ + v104_111 = vfmaq_laneq_f16(v104_111, v29, va2, 5); \ + v112_119 = vfmaq_laneq_f16(v112_119, v29, va2, 6); \ + v120_127 = vfmaq_laneq_f16(v120_127, v29, va2, 7); \ + va3 = vld1q_f16(a + 8 * 3); \ + v30 = vld1q_f16(b + 8 * 6); \ + v31 = vld1q_f16(b + 8 * 7); \ + v0_7 = vfmaq_laneq_f16(v0_7, v30, va3, 0); \ + v8_15 = vfmaq_laneq_f16(v8_15, v30, va3, 1); \ + v16_23 = vfmaq_laneq_f16(v16_23, v30, va3, 2); \ + v24_31 = vfmaq_laneq_f16(v24_31, v30, va3, 3); \ + v32_39 = vfmaq_laneq_f16(v32_39, v30, va3, 4); \ + v40_47 = vfmaq_laneq_f16(v40_47, v30, va3, 5); \ + v48_55 = vfmaq_laneq_f16(v48_55, v30, va3, 6); \ + v56_63 = vfmaq_laneq_f16(v56_63, v30, va3, 7); \ + v64_71 = vfmaq_laneq_f16(v64_71, v31, va3, 0); \ + v72_79 = vfmaq_laneq_f16(v72_79, v31, va3, 1); \ + v80_87 = vfmaq_laneq_f16(v80_87, v31, va3, 2); \ + v88_95 = vfmaq_laneq_f16(v88_95, v31, va3, 3); \ + v96_103 = vfmaq_laneq_f16(v96_103, v31, va3, 4); \ + 
v104_111 = vfmaq_laneq_f16(v104_111, v31, va3, 5); \ + v112_119 = vfmaq_laneq_f16(v112_119, v31, va3, 6); \ + v120_127 = vfmaq_laneq_f16(v120_127, v31, va3, 7); \ + va4 = vld1q_f16(a + 8 * 4); \ + v24 = vld1q_f16(b + 8 * 8); \ + v25 = vld1q_f16(b + 8 * 9); \ + v0_7 = vfmaq_laneq_f16(v0_7, v24, va4, 0); \ + v8_15 = vfmaq_laneq_f16(v8_15, v24, va4, 1); \ + v16_23 = vfmaq_laneq_f16(v16_23, v24, va4, 2); \ + v24_31 = vfmaq_laneq_f16(v24_31, v24, va4, 3); \ + v32_39 = vfmaq_laneq_f16(v32_39, v24, va4, 4); \ + v40_47 = vfmaq_laneq_f16(v40_47, v24, va4, 5); \ + v48_55 = vfmaq_laneq_f16(v48_55, v24, va4, 6); \ + v56_63 = vfmaq_laneq_f16(v56_63, v24, va4, 7); \ + v64_71 = vfmaq_laneq_f16(v64_71, v25, va4, 0); \ + v72_79 = vfmaq_laneq_f16(v72_79, v25, va4, 1); \ + v80_87 = vfmaq_laneq_f16(v80_87, v25, va4, 2); \ + v88_95 = vfmaq_laneq_f16(v88_95, v25, va4, 3); \ + v96_103 = vfmaq_laneq_f16(v96_103, v25, va4, 4); \ + v104_111 = vfmaq_laneq_f16(v104_111, v25, va4, 5); \ + v112_119 = vfmaq_laneq_f16(v112_119, v25, va4, 6); \ + v120_127 = vfmaq_laneq_f16(v120_127, v25, va4, 7); \ + va5 = vld1q_f16(a + 8 * 5); \ + v26 = vld1q_f16(b + 8 * 10); \ + v27 = vld1q_f16(b + 8 * 11); \ + v0_7 = vfmaq_laneq_f16(v0_7, v26, va5, 0); \ + v8_15 = vfmaq_laneq_f16(v8_15, v26, va5, 1); \ + v16_23 = vfmaq_laneq_f16(v16_23, v26, va5, 2); \ + v24_31 = vfmaq_laneq_f16(v24_31, v26, va5, 3); \ + v32_39 = vfmaq_laneq_f16(v32_39, v26, va5, 4); \ + v40_47 = vfmaq_laneq_f16(v40_47, v26, va5, 5); \ + v48_55 = vfmaq_laneq_f16(v48_55, v26, va5, 6); \ + v56_63 = vfmaq_laneq_f16(v56_63, v26, va5, 7); \ + v64_71 = vfmaq_laneq_f16(v64_71, v27, va5, 0); \ + v72_79 = vfmaq_laneq_f16(v72_79, v27, va5, 1); \ + v80_87 = vfmaq_laneq_f16(v80_87, v27, va5, 2); \ + v88_95 = vfmaq_laneq_f16(v88_95, v27, va5, 3); \ + v96_103 = vfmaq_laneq_f16(v96_103, v27, va5, 4); \ + v104_111 = vfmaq_laneq_f16(v104_111, v27, va5, 5); \ + v112_119 = vfmaq_laneq_f16(v112_119, v27, va5, 6); \ + v120_127 = vfmaq_laneq_f16(v120_127, v27, va5, 7); \ + va6 = vld1q_f16(a + 8 * 6); \ + v28 = vld1q_f16(b + 8 * 12); \ + v29 = vld1q_f16(b + 8 * 13); \ + v0_7 = vfmaq_laneq_f16(v0_7, v28, va6, 0); \ + v8_15 = vfmaq_laneq_f16(v8_15, v28, va6, 1); \ + v16_23 = vfmaq_laneq_f16(v16_23, v28, va6, 2); \ + v24_31 = vfmaq_laneq_f16(v24_31, v28, va6, 3); \ + v32_39 = vfmaq_laneq_f16(v32_39, v28, va6, 4); \ + v40_47 = vfmaq_laneq_f16(v40_47, v28, va6, 5); \ + v48_55 = vfmaq_laneq_f16(v48_55, v28, va6, 6); \ + v56_63 = vfmaq_laneq_f16(v56_63, v28, va6, 7); \ + v64_71 = vfmaq_laneq_f16(v64_71, v29, va6, 0); \ + v72_79 = vfmaq_laneq_f16(v72_79, v29, va6, 1); \ + v80_87 = vfmaq_laneq_f16(v80_87, v29, va6, 2); \ + v88_95 = vfmaq_laneq_f16(v88_95, v29, va6, 3); \ + v96_103 = vfmaq_laneq_f16(v96_103, v29, va6, 4); \ + v104_111 = vfmaq_laneq_f16(v104_111, v29, va6, 5); \ + v112_119 = vfmaq_laneq_f16(v112_119, v29, va6, 6); \ + v120_127 = vfmaq_laneq_f16(v120_127, v29, va6, 7); \ + va7 = vld1q_f16(a + 8 * 7); \ + v30 = vld1q_f16(b + 8 * 14); \ + v31 = vld1q_f16(b + 8 * 15); \ + v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \ + v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \ + v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \ + v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \ + v32_39 = vfmaq_laneq_f16(v32_39, v30, va7, 4); \ + v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \ + v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \ + v56_63 = vfmaq_laneq_f16(v56_63, v30, va7, 7); \ + v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \ + v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \ + v80_87 = 
vfmaq_laneq_f16(v80_87, v31, va7, 2); \ + v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \ + v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \ + v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \ + v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \ + v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \ + va7 = vld1q_f16(a + 8 * 8); \ + v30 = vld1q_f16(b + 8 * 16); \ + v31 = vld1q_f16(b + 8 * 17); \ + v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \ + v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \ + v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \ + v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \ + v32_39 = vfmaq_laneq_f16(v32_39, v30, va7, 4); \ + v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \ + v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \ + v56_63 = vfmaq_laneq_f16(v56_63, v30, va7, 7); \ + v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \ + v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \ + v80_87 = vfmaq_laneq_f16(v80_87, v31, va7, 2); \ + v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \ + v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \ + v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \ + v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \ + v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \ + va7 = vld1q_f16(a + 8 * 9); \ + v30 = vld1q_f16(b + 8 * 18); \ + v31 = vld1q_f16(b + 8 * 19); \ + v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \ + v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \ + v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \ + v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \ + v32_39 = vfmaq_laneq_f16(v32_39, v30, va7, 4); \ + v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \ + v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \ + v56_63 = vfmaq_laneq_f16(v56_63, v30, va7, 7); \ + v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \ + v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \ + v80_87 = vfmaq_laneq_f16(v80_87, v31, va7, 2); \ + v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \ + v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \ + v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \ + v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \ + v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \ + va7 = vld1q_f16(a + 8 * 10); \ + v30 = vld1q_f16(b + 8 * 20); \ + v31 = vld1q_f16(b + 8 * 21); \ + v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \ + v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \ + v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \ + v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \ + v32_39 = vfmaq_laneq_f16(v32_39, v30, va7, 4); \ + v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \ + v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \ + v56_63 = vfmaq_laneq_f16(v56_63, v30, va7, 7); \ + v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \ + v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \ + v80_87 = vfmaq_laneq_f16(v80_87, v31, va7, 2); \ + v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \ + v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \ + v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \ + v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \ + v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \ + va7 = vld1q_f16(a + 8 * 11); \ + v30 = vld1q_f16(b + 8 * 22); \ + v31 = vld1q_f16(b + 8 * 23); \ + v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \ + v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \ + v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \ + v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \ + v32_39 = vfmaq_laneq_f16(v32_39, v30, va7, 4); \ + v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \ + v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \ + v56_63 = 
vfmaq_laneq_f16(v56_63, v30, va7, 7); \ + v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \ + v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \ + v80_87 = vfmaq_laneq_f16(v80_87, v31, va7, 2); \ + v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \ + v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \ + v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \ + v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \ + v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \ + va7 = vld1q_f16(a + 8 * 12); \ + v30 = vld1q_f16(b + 8 * 24); \ + v31 = vld1q_f16(b + 8 * 25); \ + v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \ + v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \ + v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \ + v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \ + v32_39 = vfmaq_laneq_f16(v32_39, v30, va7, 4); \ + v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \ + v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \ + v56_63 = vfmaq_laneq_f16(v56_63, v30, va7, 7); \ + v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \ + v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \ + v80_87 = vfmaq_laneq_f16(v80_87, v31, va7, 2); \ + v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \ + v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \ + v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \ + v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \ + v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \ + va7 = vld1q_f16(a + 8 * 13); \ + v30 = vld1q_f16(b + 8 * 26); \ + v31 = vld1q_f16(b + 8 * 27); \ + v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \ + v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \ + v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \ + v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \ + v32_39 = vfmaq_laneq_f16(v32_39, v30, va7, 4); \ + v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \ + v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \ + v56_63 = vfmaq_laneq_f16(v56_63, v30, va7, 7); \ + v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \ + v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \ + v80_87 = vfmaq_laneq_f16(v80_87, v31, va7, 2); \ + v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \ + v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \ + v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \ + v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \ + v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \ + va7 = vld1q_f16(a + 8 * 14); \ + v30 = vld1q_f16(b + 8 * 28); \ + v31 = vld1q_f16(b + 8 * 29); \ + v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \ + v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \ + v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \ + v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \ + v32_39 = vfmaq_laneq_f16(v32_39, v30, va7, 4); \ + v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \ + v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \ + v56_63 = vfmaq_laneq_f16(v56_63, v30, va7, 7); \ + v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \ + v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \ + v80_87 = vfmaq_laneq_f16(v80_87, v31, va7, 2); \ + v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \ + v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \ + v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \ + v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \ + v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \ + va7 = vld1q_f16(a + 8 * 15); \ + v30 = vld1q_f16(b + 8 * 30); \ + v31 = vld1q_f16(b + 8 * 31); \ + v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \ + v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \ + v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \ + v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \ + v32_39 = 
vfmaq_laneq_f16(v32_39, v30, va7, 4); \ + v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \ + v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \ + v56_63 = vfmaq_laneq_f16(v56_63, v30, va7, 7); \ + v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \ + v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \ + v80_87 = vfmaq_laneq_f16(v80_87, v31, va7, 2); \ + v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \ + v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \ + v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \ + v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \ + v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \ + l += 16; \ + __builtin_prefetch(b + 256, 0, 3); \ + __builtin_prefetch(a + 128, 0, 3); \ + b += 16 * 16; \ + a += 8 * 16; + +// 2. Partial sum 1024 digits #define KERNEL_8x16_ACC8() \ - v0_7 = vdupq_n_f16(0.F); \ - v8_15 = vdupq_n_f16(0.F); \ - v16_23 = vdupq_n_f16(0.F); \ - v24_31 = vdupq_n_f16(0.F); \ - v32_39 = vdupq_n_f16(0.F); \ - v40_47 = vdupq_n_f16(0.F); \ - v48_55 = vdupq_n_f16(0.F); \ - v56_63 = vdupq_n_f16(0.F); \ - v64_71 = vdupq_n_f16(0.F); \ - v72_79 = vdupq_n_f16(0.F); \ - v80_87 = vdupq_n_f16(0.F); \ - v88_95 = vdupq_n_f16(0.F); \ - v96_103 = vdupq_n_f16(0.F); \ - v104_111 = vdupq_n_f16(0.F); \ - v112_119 = vdupq_n_f16(0.F); \ - v120_127 = vdupq_n_f16(0.F); \ va0 = vld1q_f16(a); \ v24 = vld1q_f16(b); \ v25 = vld1q_f16(b + 8); \ @@ -193,24 +504,8 @@ b += 16 * 8; \ a += 8 * 8; -// 2. Partial sum 512 digits : Medium accuracy, medium latency +// 3. Partial sum 512 digits #define KERNEL_8x16_ACC4() \ - v0_7 = vdupq_n_f16(0.F); \ - v8_15 = vdupq_n_f16(0.F); \ - v16_23 = vdupq_n_f16(0.F); \ - v24_31 = vdupq_n_f16(0.F); \ - v32_39 = vdupq_n_f16(0.F); \ - v40_47 = vdupq_n_f16(0.F); \ - v48_55 = vdupq_n_f16(0.F); \ - v56_63 = vdupq_n_f16(0.F); \ - v64_71 = vdupq_n_f16(0.F); \ - v72_79 = vdupq_n_f16(0.F); \ - v80_87 = vdupq_n_f16(0.F); \ - v88_95 = vdupq_n_f16(0.F); \ - v96_103 = vdupq_n_f16(0.F); \ - v104_111 = vdupq_n_f16(0.F); \ - v112_119 = vdupq_n_f16(0.F); \ - v120_127 = vdupq_n_f16(0.F); \ va0 = vld1q_f16(a); \ v24 = vld1q_f16(b); \ v25 = vld1q_f16(b + 8); \ @@ -293,24 +588,8 @@ b += 16 * 4; \ a += 8 * 4; -// 3. Partial sum 128 digits : Best accuracy, worst latency +// 3. 
Partial sum 128 digits #define KERNEL_8x16_ACC1() \ - v0_7 = vdupq_n_f16(0.F); \ - v8_15 = vdupq_n_f16(0.F); \ - v16_23 = vdupq_n_f16(0.F); \ - v24_31 = vdupq_n_f16(0.F); \ - v32_39 = vdupq_n_f16(0.F); \ - v40_47 = vdupq_n_f16(0.F); \ - v48_55 = vdupq_n_f16(0.F); \ - v56_63 = vdupq_n_f16(0.F); \ - v64_71 = vdupq_n_f16(0.F); \ - v72_79 = vdupq_n_f16(0.F); \ - v80_87 = vdupq_n_f16(0.F); \ - v88_95 = vdupq_n_f16(0.F); \ - v96_103 = vdupq_n_f16(0.F); \ - v104_111 = vdupq_n_f16(0.F); \ - v112_119 = vdupq_n_f16(0.F); \ - v120_127 = vdupq_n_f16(0.F); \ va0 = vld1q_f16(a); \ v24 = vld1q_f16(b); \ v25 = vld1q_f16(b + 8); \ @@ -336,6 +615,91 @@ b += 16 * 1; \ a += 8 * 1; +#define SAVE_KERNEL_8X16_F16_F32() \ + vst1q_f32(c, vaddq_f32(vld1q_f32(c), vcvt_f32_f16(vget_low_f16(v0_7)))); \ + vst1q_f32(c + 4, \ + vaddq_f32(vld1q_f32(c + 4), vcvt_f32_f16(vget_high_f16(v0_7)))); \ + \ + vst1q_f32(c + 8, \ + vaddq_f32(vld1q_f32(c + 8), vcvt_f32_f16(vget_low_f16(v64_71)))); \ + vst1q_f32(c + 8 + 4, vaddq_f32(vld1q_f32(c + 8 + 4), \ + vcvt_f32_f16(vget_high_f16(v64_71)))); \ + \ + vst1q_f32(c + ldc, \ + vaddq_f32(vld1q_f32(c + ldc), vcvt_f32_f16(vget_low_f16(v8_15)))); \ + vst1q_f32(c + ldc + 4, vaddq_f32(vld1q_f32(c + ldc + 4), \ + vcvt_f32_f16(vget_high_f16(v8_15)))); \ + \ + vst1q_f32(c + ldc + 8, vaddq_f32(vld1q_f32(c + ldc + 8), \ + vcvt_f32_f16(vget_low_f16(v72_79)))); \ + vst1q_f32(c + ldc + 8 + 4, vaddq_f32(vld1q_f32(c + ldc + 8 + 4), \ + vcvt_f32_f16(vget_high_f16(v72_79)))); \ + \ + vst1q_f32(c + 2 * ldc, vaddq_f32(vld1q_f32(c + 2 * ldc), \ + vcvt_f32_f16(vget_low_f16(v16_23)))); \ + vst1q_f32(c + 2 * ldc + 4, vaddq_f32(vld1q_f32(c + 2 * ldc + 4), \ + vcvt_f32_f16(vget_high_f16(v16_23)))); \ + \ + vst1q_f32(c + 2 * ldc + 8, vaddq_f32(vld1q_f32(c + 2 * ldc + 8), \ + vcvt_f32_f16(vget_low_f16(v80_87)))); \ + vst1q_f32(c + 2 * ldc + 8 + 4, \ + vaddq_f32(vld1q_f32(c + 2 * ldc + 8 + 4), \ + vcvt_f32_f16(vget_high_f16(v80_87)))); \ + \ + vst1q_f32(c + 3 * ldc, vaddq_f32(vld1q_f32(c + 3 * ldc), \ + vcvt_f32_f16(vget_low_f16(v24_31)))); \ + vst1q_f32(c + 3 * ldc + 4, vaddq_f32(vld1q_f32(c + 3 * ldc + 4), \ + vcvt_f32_f16(vget_high_f16(v24_31)))); \ + \ + vst1q_f32(c + 3 * ldc + 8, vaddq_f32(vld1q_f32(c + 3 * ldc + 8), \ + vcvt_f32_f16(vget_low_f16(v88_95)))); \ + vst1q_f32(c + 3 * ldc + 8 + 4, \ + vaddq_f32(vld1q_f32(c + 3 * ldc + 8 + 4), \ + vcvt_f32_f16(vget_high_f16(v88_95)))); \ + \ + vst1q_f32(c + 4 * ldc, vaddq_f32(vld1q_f32(c + 4 * ldc), \ + vcvt_f32_f16(vget_low_f16(v32_39)))); \ + vst1q_f32(c + 4 * ldc + 4, vaddq_f32(vld1q_f32(c + 4 * ldc + 4), \ + vcvt_f32_f16(vget_high_f16(v32_39)))); \ + \ + vst1q_f32(c + 4 * ldc + 8, vaddq_f32(vld1q_f32(c + 4 * ldc + 8), \ + vcvt_f32_f16(vget_low_f16(v96_103)))); \ + vst1q_f32(c + 4 * ldc + 8 + 4, \ + vaddq_f32(vld1q_f32(c + 4 * ldc + 8 + 4), \ + vcvt_f32_f16(vget_high_f16(v96_103)))); \ + \ + vst1q_f32(c + 5 * ldc, vaddq_f32(vld1q_f32(c + 5 * ldc), \ + vcvt_f32_f16(vget_low_f16(v40_47)))); \ + vst1q_f32(c + 5 * ldc + 4, vaddq_f32(vld1q_f32(c + 5 * ldc + 4), \ + vcvt_f32_f16(vget_high_f16(v40_47)))); \ + vst1q_f32(c + 5 * ldc + 8, vaddq_f32(vld1q_f32(c + 5 * ldc + 8), \ + vcvt_f32_f16(vget_low_f16(v104_111)))); \ + vst1q_f32(c + 5 * ldc + 8 + 4, \ + vaddq_f32(vld1q_f32(c + 5 * ldc + 8 + 4), \ + vcvt_f32_f16(vget_high_f16(v104_111)))); \ + \ + vst1q_f32(c + 6 * ldc, vaddq_f32(vld1q_f32(c + 6 * ldc), \ + vcvt_f32_f16(vget_low_f16(v48_55)))); \ + vst1q_f32(c + 6 * ldc + 4, vaddq_f32(vld1q_f32(c + 6 * ldc + 4), \ + vcvt_f32_f16(vget_high_f16(v48_55)))); \ + \ + 
vst1q_f32(c + 6 * ldc + 8, vaddq_f32(vld1q_f32(c + 6 * ldc + 8), \ + vcvt_f32_f16(vget_low_f16(v112_119)))); \ + vst1q_f32(c + 6 * ldc + 8 + 4, \ + vaddq_f32(vld1q_f32(c + 6 * ldc + 8 + 4), \ + vcvt_f32_f16(vget_high_f16(v112_119)))); \ + \ + vst1q_f32(c + 7 * ldc, vaddq_f32(vld1q_f32(c + 7 * ldc), \ + vcvt_f32_f16(vget_low_f16(v56_63)))); \ + vst1q_f32(c + 7 * ldc + 4, vaddq_f32(vld1q_f32(c + 7 * ldc + 4), \ + vcvt_f32_f16(vget_high_f16(v56_63)))); \ + \ + vst1q_f32(c + 7 * ldc + 8, vaddq_f32(vld1q_f32(c + 7 * ldc + 8), \ + vcvt_f32_f16(vget_low_f16(v120_127)))); \ + vst1q_f32(c + 7 * ldc + 8 + 4, \ + vaddq_f32(vld1q_f32(c + 7 * ldc + 8 + 4), \ + vcvt_f32_f16(vget_high_f16(v120_127)))); + /** * @brief hgemm 8x16 kernel sc = sa * sb * @@ -370,32 +734,32 @@ void hgemm_kernel_8x16(unsigned int M, unsigned int N, unsigned int K, float16x8_t v24, v25, v26, v27, v28, v29, v30, v31; float16x8_t va0, va1, va2, va3; + + INIT_KERNEL_8X16(); l = 0; for (; l < K;) { - KERNEL_8x16_ACC4(); - vst1q_f16(c, vaddq_f16(vld1q_f16(c), v0_7)); - vst1q_f16(c + 8, vaddq_f16(vld1q_f16(c + 8), v64_71)); - vst1q_f16(c + ldc, vaddq_f16(vld1q_f16(c + ldc), v8_15)); - vst1q_f16(c + ldc + 8, vaddq_f16(vld1q_f16(c + ldc + 8), v72_79)); - vst1q_f16(c + 2 * ldc, vaddq_f16(vld1q_f16(c + 2 * ldc), v16_23)); - vst1q_f16(c + 2 * ldc + 8, - vaddq_f16(vld1q_f16(c + 2 * ldc + 8), v80_87)); - vst1q_f16(c + 3 * ldc, vaddq_f16(vld1q_f16(c + 3 * ldc), v24_31)); - vst1q_f16(c + 3 * ldc + 8, - vaddq_f16(vld1q_f16(c + 3 * ldc + 8), v88_95)); - vst1q_f16(c + 4 * ldc, vaddq_f16(vld1q_f16(c + 4 * ldc), v32_39)); - vst1q_f16(c + 4 * ldc + 8, - vaddq_f16(vld1q_f16(c + 4 * ldc + 8), v96_103)); - vst1q_f16(c + 5 * ldc, vaddq_f16(vld1q_f16(c + 5 * ldc), v40_47)); - vst1q_f16(c + 5 * ldc + 8, - vaddq_f16(vld1q_f16(c + 5 * ldc + 8), v104_111)); - vst1q_f16(c + 6 * ldc, vaddq_f16(vld1q_f16(c + 6 * ldc), v48_55)); - vst1q_f16(c + 6 * ldc + 8, - vaddq_f16(vld1q_f16(c + 6 * ldc + 8), v112_119)); - vst1q_f16(c + 7 * ldc, vaddq_f16(vld1q_f16(c + 7 * ldc), v56_63)); - vst1q_f16(c + 7 * ldc + 8, - vaddq_f16(vld1q_f16(c + 7 * ldc + 8), v120_127)); + KERNEL_8x16_ACC1(); } + vst1q_f16(c, vaddq_f16(vld1q_f16(c), v0_7)); + vst1q_f16(c + 8, vaddq_f16(vld1q_f16(c + 8), v64_71)); + vst1q_f16(c + ldc, vaddq_f16(vld1q_f16(c + ldc), v8_15)); + vst1q_f16(c + ldc + 8, vaddq_f16(vld1q_f16(c + ldc + 8), v72_79)); + vst1q_f16(c + 2 * ldc, vaddq_f16(vld1q_f16(c + 2 * ldc), v16_23)); + vst1q_f16(c + 2 * ldc + 8, vaddq_f16(vld1q_f16(c + 2 * ldc + 8), v80_87)); + vst1q_f16(c + 3 * ldc, vaddq_f16(vld1q_f16(c + 3 * ldc), v24_31)); + vst1q_f16(c + 3 * ldc + 8, vaddq_f16(vld1q_f16(c + 3 * ldc + 8), v88_95)); + vst1q_f16(c + 4 * ldc, vaddq_f16(vld1q_f16(c + 4 * ldc), v32_39)); + vst1q_f16(c + 4 * ldc + 8, + vaddq_f16(vld1q_f16(c + 4 * ldc + 8), v96_103)); + vst1q_f16(c + 5 * ldc, vaddq_f16(vld1q_f16(c + 5 * ldc), v40_47)); + vst1q_f16(c + 5 * ldc + 8, + vaddq_f16(vld1q_f16(c + 5 * ldc + 8), v104_111)); + vst1q_f16(c + 6 * ldc, vaddq_f16(vld1q_f16(c + 6 * ldc), v48_55)); + vst1q_f16(c + 6 * ldc + 8, + vaddq_f16(vld1q_f16(c + 6 * ldc + 8), v112_119)); + vst1q_f16(c + 7 * ldc, vaddq_f16(vld1q_f16(c + 7 * ldc), v56_63)); + vst1q_f16(c + 7 * ldc + 8, + vaddq_f16(vld1q_f16(c + 7 * ldc + 8), v120_127)); c += 16; a -= 8 * K; } @@ -425,6 +789,9 @@ void hgemm_kernel_8x16(unsigned int M, unsigned int N, unsigned int K, __fp16 *a = sa, *b = sb; float *c = sc; unsigned int i, j, l; + unsigned int K4 = (K >> 2) << 2; + unsigned int K8 = (K >> 3) << 3; + unsigned int K16 = (K >> 4) << 4; for 
(i = 0; i < M; i += 8) { for (j = 0; j < N; j += 16) { __builtin_prefetch(b, 0, 3); @@ -440,106 +807,25 @@ void hgemm_kernel_8x16(unsigned int M, unsigned int N, unsigned int K, float16x8_t v24, v25, v26, v27, v28, v29, v30, v31; float16x8_t va0, va1, va2, va3, va4, va5, va6, va7; l = 0; - for (; l < K;) { + for (; l < K16;) { + INIT_KERNEL_8X16(); + KERNEL_8x16_ACC16(); + SAVE_KERNEL_8X16_F16_F32(); + } + for (; l < K8;) { + INIT_KERNEL_8X16(); KERNEL_8x16_ACC8(); - - vst1q_f32(c, vaddq_f32(vld1q_f32(c), vcvt_f32_f16(vget_low_f16(v0_7)))); - vst1q_f32(c + 4, vaddq_f32(vld1q_f32(c + 4), - vcvt_f32_f16(vget_high_f16(v0_7)))); - - vst1q_f32(c + 8, vaddq_f32(vld1q_f32(c + 8), - vcvt_f32_f16(vget_low_f16(v64_71)))); - vst1q_f32(c + 8 + 4, vaddq_f32(vld1q_f32(c + 8 + 4), - vcvt_f32_f16(vget_high_f16(v64_71)))); - - vst1q_f32(c + ldc, vaddq_f32(vld1q_f32(c + ldc), - vcvt_f32_f16(vget_low_f16(v8_15)))); - vst1q_f32(c + ldc + 4, vaddq_f32(vld1q_f32(c + ldc + 4), - vcvt_f32_f16(vget_high_f16(v8_15)))); - - vst1q_f32(c + ldc + 8, vaddq_f32(vld1q_f32(c + ldc + 8), - vcvt_f32_f16(vget_low_f16(v72_79)))); - vst1q_f32(c + ldc + 8 + 4, - vaddq_f32(vld1q_f32(c + ldc + 8 + 4), - vcvt_f32_f16(vget_high_f16(v72_79)))); - - vst1q_f32(c + 2 * ldc, vaddq_f32(vld1q_f32(c + 2 * ldc), - vcvt_f32_f16(vget_low_f16(v16_23)))); - vst1q_f32(c + 2 * ldc + 4, - vaddq_f32(vld1q_f32(c + 2 * ldc + 4), - vcvt_f32_f16(vget_high_f16(v16_23)))); - - vst1q_f32(c + 2 * ldc + 8, - vaddq_f32(vld1q_f32(c + 2 * ldc + 8), - vcvt_f32_f16(vget_low_f16(v80_87)))); - vst1q_f32(c + 2 * ldc + 8 + 4, - vaddq_f32(vld1q_f32(c + 2 * ldc + 8 + 4), - vcvt_f32_f16(vget_high_f16(v80_87)))); - - vst1q_f32(c + 3 * ldc, vaddq_f32(vld1q_f32(c + 3 * ldc), - vcvt_f32_f16(vget_low_f16(v24_31)))); - vst1q_f32(c + 3 * ldc + 4, - vaddq_f32(vld1q_f32(c + 3 * ldc + 4), - vcvt_f32_f16(vget_high_f16(v24_31)))); - - vst1q_f32(c + 3 * ldc + 8, - vaddq_f32(vld1q_f32(c + 3 * ldc + 8), - vcvt_f32_f16(vget_low_f16(v88_95)))); - vst1q_f32(c + 3 * ldc + 8 + 4, - vaddq_f32(vld1q_f32(c + 3 * ldc + 8 + 4), - vcvt_f32_f16(vget_high_f16(v88_95)))); - - vst1q_f32(c + 4 * ldc, vaddq_f32(vld1q_f32(c + 4 * ldc), - vcvt_f32_f16(vget_low_f16(v32_39)))); - vst1q_f32(c + 4 * ldc + 4, - vaddq_f32(vld1q_f32(c + 4 * ldc + 4), - vcvt_f32_f16(vget_high_f16(v32_39)))); - - vst1q_f32(c + 4 * ldc + 8, - vaddq_f32(vld1q_f32(c + 4 * ldc + 8), - vcvt_f32_f16(vget_low_f16(v96_103)))); - vst1q_f32(c + 4 * ldc + 8 + 4, - vaddq_f32(vld1q_f32(c + 4 * ldc + 8 + 4), - vcvt_f32_f16(vget_high_f16(v96_103)))); - - vst1q_f32(c + 5 * ldc, vaddq_f32(vld1q_f32(c + 5 * ldc), - vcvt_f32_f16(vget_low_f16(v40_47)))); - vst1q_f32(c + 5 * ldc + 4, - vaddq_f32(vld1q_f32(c + 5 * ldc + 4), - vcvt_f32_f16(vget_high_f16(v40_47)))); - - vst1q_f32(c + 5 * ldc + 8, - vaddq_f32(vld1q_f32(c + 5 * ldc + 8), - vcvt_f32_f16(vget_low_f16(v104_111)))); - vst1q_f32(c + 5 * ldc + 8 + 4, - vaddq_f32(vld1q_f32(c + 5 * ldc + 8 + 4), - vcvt_f32_f16(vget_high_f16(v104_111)))); - - vst1q_f32(c + 6 * ldc, vaddq_f32(vld1q_f32(c + 6 * ldc), - vcvt_f32_f16(vget_low_f16(v48_55)))); - vst1q_f32(c + 6 * ldc + 4, - vaddq_f32(vld1q_f32(c + 6 * ldc + 4), - vcvt_f32_f16(vget_high_f16(v48_55)))); - - vst1q_f32(c + 6 * ldc + 8, - vaddq_f32(vld1q_f32(c + 6 * ldc + 8), - vcvt_f32_f16(vget_low_f16(v112_119)))); - vst1q_f32(c + 6 * ldc + 8 + 4, - vaddq_f32(vld1q_f32(c + 6 * ldc + 8 + 4), - vcvt_f32_f16(vget_high_f16(v112_119)))); - - vst1q_f32(c + 7 * ldc, vaddq_f32(vld1q_f32(c + 7 * ldc), - vcvt_f32_f16(vget_low_f16(v56_63)))); - vst1q_f32(c + 7 * ldc 
+ 4, - vaddq_f32(vld1q_f32(c + 7 * ldc + 4), - vcvt_f32_f16(vget_high_f16(v56_63)))); - - vst1q_f32(c + 7 * ldc + 8, - vaddq_f32(vld1q_f32(c + 7 * ldc + 8), - vcvt_f32_f16(vget_low_f16(v120_127)))); - vst1q_f32(c + 7 * ldc + 8 + 4, - vaddq_f32(vld1q_f32(c + 7 * ldc + 8 + 4), - vcvt_f32_f16(vget_high_f16(v120_127)))); + SAVE_KERNEL_8X16_F16_F32(); + } + for (; l < K4;) { + INIT_KERNEL_8X16(); + KERNEL_8x16_ACC4(); + SAVE_KERNEL_8X16_F16_F32(); + } + for (; l < K;) { + INIT_KERNEL_8X16(); + KERNEL_8x16_ACC1(); + SAVE_KERNEL_8X16_F16_F32(); } c += 16; a -= 8 * K; diff --git a/nntrainer/tensor/hgemm/hgemm_kernel_8x8.h b/nntrainer/tensor/hgemm/hgemm_kernel_8x8.h index e67ef462b4..4901c3f518 100644 --- a/nntrainer/tensor/hgemm/hgemm_kernel_8x8.h +++ b/nntrainer/tensor/hgemm/hgemm_kernel_8x8.h @@ -14,19 +14,186 @@ #include #include -/// @note Following KERNELs are the combinations of accuracy-latency -/// tradeoff. User can select which kernel to use by replacing them. +#define INIT_KERNEL_8x8() \ + v24 = vdupq_n_f16(0.F); \ + v25 = vdupq_n_f16(0.F); \ + v26 = vdupq_n_f16(0.F); \ + v27 = vdupq_n_f16(0.F); \ + v28 = vdupq_n_f16(0.F); \ + v29 = vdupq_n_f16(0.F); \ + v30 = vdupq_n_f16(0.F); \ + v31 = vdupq_n_f16(0.F); -// 1. Partial sum 512 digits : Worst accuracy, best latency +// 1. Partial sum 1024 digits +#define KERNEL_8x8_ACC16() \ + va0 = vld1q_f16(a); \ + v16 = vld1q_f16(b); \ + v24 = vfmaq_laneq_f16(v24, v16, va0, 0); \ + v25 = vfmaq_laneq_f16(v25, v16, va0, 1); \ + v26 = vfmaq_laneq_f16(v26, v16, va0, 2); \ + v27 = vfmaq_laneq_f16(v27, v16, va0, 3); \ + v28 = vfmaq_laneq_f16(v28, v16, va0, 4); \ + v29 = vfmaq_laneq_f16(v29, v16, va0, 5); \ + v30 = vfmaq_laneq_f16(v30, v16, va0, 6); \ + v31 = vfmaq_laneq_f16(v31, v16, va0, 7); \ + va1 = vld1q_f16(a + 8); \ + v17 = vld1q_f16(b + 8); \ + v24 = vfmaq_laneq_f16(v24, v17, va1, 0); \ + v25 = vfmaq_laneq_f16(v25, v17, va1, 1); \ + v26 = vfmaq_laneq_f16(v26, v17, va1, 2); \ + v27 = vfmaq_laneq_f16(v27, v17, va1, 3); \ + v28 = vfmaq_laneq_f16(v28, v17, va1, 4); \ + v29 = vfmaq_laneq_f16(v29, v17, va1, 5); \ + v30 = vfmaq_laneq_f16(v30, v17, va1, 6); \ + v31 = vfmaq_laneq_f16(v31, v17, va1, 7); \ + va2 = vld1q_f16(a + 8 * 2); \ + v18 = vld1q_f16(b + 8 * 2); \ + v24 = vfmaq_laneq_f16(v24, v18, va2, 0); \ + v25 = vfmaq_laneq_f16(v25, v18, va2, 1); \ + v26 = vfmaq_laneq_f16(v26, v18, va2, 2); \ + v27 = vfmaq_laneq_f16(v27, v18, va2, 3); \ + v28 = vfmaq_laneq_f16(v28, v18, va2, 4); \ + v29 = vfmaq_laneq_f16(v29, v18, va2, 5); \ + v30 = vfmaq_laneq_f16(v30, v18, va2, 6); \ + v31 = vfmaq_laneq_f16(v31, v18, va2, 7); \ + va3 = vld1q_f16(a + 8 * 3); \ + v19 = vld1q_f16(b + 8 * 3); \ + v24 = vfmaq_laneq_f16(v24, v19, va3, 0); \ + v25 = vfmaq_laneq_f16(v25, v19, va3, 1); \ + v26 = vfmaq_laneq_f16(v26, v19, va3, 2); \ + v27 = vfmaq_laneq_f16(v27, v19, va3, 3); \ + v28 = vfmaq_laneq_f16(v28, v19, va3, 4); \ + v29 = vfmaq_laneq_f16(v29, v19, va3, 5); \ + v30 = vfmaq_laneq_f16(v30, v19, va3, 6); \ + v31 = vfmaq_laneq_f16(v31, v19, va3, 7); \ + va4 = vld1q_f16(a + 8 * 4); \ + v20 = vld1q_f16(b + 8 * 4); \ + v24 = vfmaq_laneq_f16(v24, v20, va4, 0); \ + v25 = vfmaq_laneq_f16(v25, v20, va4, 1); \ + v26 = vfmaq_laneq_f16(v26, v20, va4, 2); \ + v27 = vfmaq_laneq_f16(v27, v20, va4, 3); \ + v28 = vfmaq_laneq_f16(v28, v20, va4, 4); \ + v29 = vfmaq_laneq_f16(v29, v20, va4, 5); \ + v30 = vfmaq_laneq_f16(v30, v20, va4, 6); \ + v31 = vfmaq_laneq_f16(v31, v20, va4, 7); \ + va5 = vld1q_f16(a + 8 * 5); \ + v21 = vld1q_f16(b + 8 * 5); \ + v24 = vfmaq_laneq_f16(v24, v21, va5, 0); \ 
+ v25 = vfmaq_laneq_f16(v25, v21, va5, 1); \ + v26 = vfmaq_laneq_f16(v26, v21, va5, 2); \ + v27 = vfmaq_laneq_f16(v27, v21, va5, 3); \ + v28 = vfmaq_laneq_f16(v28, v21, va5, 4); \ + v29 = vfmaq_laneq_f16(v29, v21, va5, 5); \ + v30 = vfmaq_laneq_f16(v30, v21, va5, 6); \ + v31 = vfmaq_laneq_f16(v31, v21, va5, 7); \ + va6 = vld1q_f16(a + 8 * 6); \ + v22 = vld1q_f16(b + 8 * 6); \ + v24 = vfmaq_laneq_f16(v24, v22, va6, 0); \ + v25 = vfmaq_laneq_f16(v25, v22, va6, 1); \ + v26 = vfmaq_laneq_f16(v26, v22, va6, 2); \ + v27 = vfmaq_laneq_f16(v27, v22, va6, 3); \ + v28 = vfmaq_laneq_f16(v28, v22, va6, 4); \ + v29 = vfmaq_laneq_f16(v29, v22, va6, 5); \ + v30 = vfmaq_laneq_f16(v30, v22, va6, 6); \ + v31 = vfmaq_laneq_f16(v31, v22, va6, 7); \ + va7 = vld1q_f16(a + 8 * 7); \ + v23 = vld1q_f16(b + 8 * 7); \ + v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \ + v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \ + v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \ + v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \ + v28 = vfmaq_laneq_f16(v28, v23, va7, 4); \ + v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \ + v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \ + v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \ + va7 = vld1q_f16(a + 8 * 8); \ + v23 = vld1q_f16(b + 8 * 8); \ + v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \ + v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \ + v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \ + v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \ + v28 = vfmaq_laneq_f16(v28, v23, va7, 4); \ + v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \ + v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \ + v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \ + va7 = vld1q_f16(a + 8 * 9); \ + v23 = vld1q_f16(b + 8 * 9); \ + v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \ + v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \ + v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \ + v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \ + v28 = vfmaq_laneq_f16(v28, v23, va7, 4); \ + v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \ + v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \ + v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \ + va7 = vld1q_f16(a + 8 * 10); \ + v23 = vld1q_f16(b + 8 * 10); \ + v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \ + v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \ + v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \ + v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \ + v28 = vfmaq_laneq_f16(v28, v23, va7, 4); \ + v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \ + v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \ + v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \ + va7 = vld1q_f16(a + 8 * 11); \ + v23 = vld1q_f16(b + 8 * 11); \ + v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \ + v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \ + v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \ + v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \ + v28 = vfmaq_laneq_f16(v28, v23, va7, 4); \ + v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \ + v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \ + v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \ + va7 = vld1q_f16(a + 8 * 12); \ + v23 = vld1q_f16(b + 8 * 12); \ + v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \ + v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \ + v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \ + v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \ + v28 = vfmaq_laneq_f16(v28, v23, va7, 4); \ + v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \ + v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \ + v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \ + va7 = vld1q_f16(a + 8 * 13); \ + v23 = vld1q_f16(b + 8 * 13); \ + v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \ + v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \ + v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \ + v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \ + v28 = 
vfmaq_laneq_f16(v28, v23, va7, 4); \ + v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \ + v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \ + v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \ + va7 = vld1q_f16(a + 8 * 14); \ + v23 = vld1q_f16(b + 8 * 14); \ + v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \ + v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \ + v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \ + v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \ + v28 = vfmaq_laneq_f16(v28, v23, va7, 4); \ + v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \ + v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \ + v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \ + va7 = vld1q_f16(a + 8 * 15); \ + v23 = vld1q_f16(b + 8 * 15); \ + v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \ + v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \ + v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \ + v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \ + v28 = vfmaq_laneq_f16(v28, v23, va7, 4); \ + v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \ + v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \ + v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \ + __builtin_prefetch(b + 128, 0, 3); \ + __builtin_prefetch(a + 128, 0, 3); \ + l += 16; \ + b += 8 * 16; \ + a += 8 * 16; + +// 2. Partial sum 512 digits #define KERNEL_8x8_ACC8() \ - v24 = vdupq_n_f16(0.F); \ - v25 = vdupq_n_f16(0.F); \ - v26 = vdupq_n_f16(0.F); \ - v27 = vdupq_n_f16(0.F); \ - v28 = vdupq_n_f16(0.F); \ - v29 = vdupq_n_f16(0.F); \ - v30 = vdupq_n_f16(0.F); \ - v31 = vdupq_n_f16(0.F); \ va0 = vld1q_f16(a); \ v16 = vld1q_f16(b); \ v24 = vfmaq_laneq_f16(v24, v16, va0, 0); \ @@ -113,16 +280,8 @@ b += 8 * 8; \ a += 8 * 8; -// 2. Partial sum 256 digits : Medium accuracy, medium latency +// 3. Partial sum 256 digits #define KERNEL_8x8_ACC4() \ - v24 = vdupq_n_f16(0.F); \ - v25 = vdupq_n_f16(0.F); \ - v26 = vdupq_n_f16(0.F); \ - v27 = vdupq_n_f16(0.F); \ - v28 = vdupq_n_f16(0.F); \ - v29 = vdupq_n_f16(0.F); \ - v30 = vdupq_n_f16(0.F); \ - v31 = vdupq_n_f16(0.F); \ va0 = vld1q_f16(a); \ v16 = vld1q_f16(b); \ v24 = vfmaq_laneq_f16(v24, v16, va0, 0); \ @@ -169,16 +328,8 @@ b += 8 * 4; \ a += 8 * 4; -// 3. Partial sum 64 digits : Best accuracy, worst latency +// 4. 
Partial sum 64 digits #define KERNEL_8x8_ACC1() \ - v24 = vdupq_n_f16(0.F); \ - v25 = vdupq_n_f16(0.F); \ - v26 = vdupq_n_f16(0.F); \ - v27 = vdupq_n_f16(0.F); \ - v28 = vdupq_n_f16(0.F); \ - v29 = vdupq_n_f16(0.F); \ - v30 = vdupq_n_f16(0.F); \ - v31 = vdupq_n_f16(0.F); \ va0 = vld1q_f16(a); \ v16 = vld1q_f16(b); \ v24 = vfmaq_laneq_f16(v24, v16, va0, 0); \ @@ -195,6 +346,46 @@ b += 8 * 1; \ a += 8 * 1; +#define SAVE_KERNEL_8X8_F16_f32() \ + vst1q_f32(c, vaddq_f32(vld1q_f32(c), vcvt_f32_f16(vget_low_f16(v24)))); \ + vst1q_f32(c + 4, \ + vaddq_f32(vld1q_f32(c + 4), vcvt_f32_f16(vget_high_f16(v24)))); \ + \ + vst1q_f32(c + ldc, \ + vaddq_f32(vld1q_f32(c + ldc), vcvt_f32_f16(vget_low_f16(v25)))); \ + vst1q_f32(c + 4 + ldc, vaddq_f32(vld1q_f32(c + 4 + ldc), \ + vcvt_f32_f16(vget_high_f16(v25)))); \ + \ + vst1q_f32(c + 2 * ldc, vaddq_f32(vld1q_f32(c + 2 * ldc), \ + vcvt_f32_f16(vget_low_f16(v26)))); \ + vst1q_f32(c + 4 + 2 * ldc, vaddq_f32(vld1q_f32(c + 4 + 2 * ldc), \ + vcvt_f32_f16(vget_high_f16(v26)))); \ + \ + vst1q_f32(c + 3 * ldc, vaddq_f32(vld1q_f32(c + 3 * ldc), \ + vcvt_f32_f16(vget_low_f16(v27)))); \ + vst1q_f32(c + 4 + 3 * ldc, vaddq_f32(vld1q_f32(c + 4 + 3 * ldc), \ + vcvt_f32_f16(vget_high_f16(v27)))); \ + \ + vst1q_f32(c + 4 * ldc, vaddq_f32(vld1q_f32(c + 4 * ldc), \ + vcvt_f32_f16(vget_low_f16(v28)))); \ + vst1q_f32(c + 4 + 4 * ldc, vaddq_f32(vld1q_f32(c + 4 + 4 * ldc), \ + vcvt_f32_f16(vget_high_f16(v28)))); \ + \ + vst1q_f32(c + 5 * ldc, vaddq_f32(vld1q_f32(c + 5 * ldc), \ + vcvt_f32_f16(vget_low_f16(v29)))); \ + vst1q_f32(c + 4 + 5 * ldc, vaddq_f32(vld1q_f32(c + 4 + 5 * ldc), \ + vcvt_f32_f16(vget_high_f16(v29)))); \ + \ + vst1q_f32(c + 6 * ldc, vaddq_f32(vld1q_f32(c + 6 * ldc), \ + vcvt_f32_f16(vget_low_f16(v30)))); \ + vst1q_f32(c + 4 + 6 * ldc, vaddq_f32(vld1q_f32(c + 4 + 6 * ldc), \ + vcvt_f32_f16(vget_high_f16(v30)))); \ + \ + vst1q_f32(c + 7 * ldc, vaddq_f32(vld1q_f32(c + 7 * ldc), \ + vcvt_f32_f16(vget_low_f16(v31)))); \ + vst1q_f32(c + 4 + 7 * ldc, vaddq_f32(vld1q_f32(c + 4 + 7 * ldc), \ + vcvt_f32_f16(vget_high_f16(v31)))); + /** * @brief hgemm 8x8 kernel sc = sa * sb * @@ -221,19 +412,19 @@ void hgemm_kernel_8x8(unsigned int M, unsigned int N, unsigned int K, float16x8_t v16, v17, v18, v19, v20, v21, v22, v23; float16x8_t v24, v25, v26, v27, v28, v29, v30, v31; float16x8_t va0, va1, va2, va3, va4, va5, va6, va7; + INIT_KERNEL_8x8(); l = 0; for (; l < K;) { - KERNEL_8x8_ACC8(); - - vst1q_f16(c, vaddq_f16(vld1q_f16(c), v24)); - vst1q_f16(c + ldc, vaddq_f16(vld1q_f16(c + ldc), v25)); - vst1q_f16(c + 2 * ldc, vaddq_f16(vld1q_f16(c + 2 * ldc), v26)); - vst1q_f16(c + 3 * ldc, vaddq_f16(vld1q_f16(c + 3 * ldc), v27)); - vst1q_f16(c + 4 * ldc, vaddq_f16(vld1q_f16(c + 4 * ldc), v28)); - vst1q_f16(c + 5 * ldc, vaddq_f16(vld1q_f16(c + 5 * ldc), v29)); - vst1q_f16(c + 6 * ldc, vaddq_f16(vld1q_f16(c + 6 * ldc), v30)); - vst1q_f16(c + 7 * ldc, vaddq_f16(vld1q_f16(c + 7 * ldc), v31)); + KERNEL_8x8_ACC1(); } + vst1q_f16(c, vaddq_f16(vld1q_f16(c), v24)); + vst1q_f16(c + ldc, vaddq_f16(vld1q_f16(c + ldc), v25)); + vst1q_f16(c + 2 * ldc, vaddq_f16(vld1q_f16(c + 2 * ldc), v26)); + vst1q_f16(c + 3 * ldc, vaddq_f16(vld1q_f16(c + 3 * ldc), v27)); + vst1q_f16(c + 4 * ldc, vaddq_f16(vld1q_f16(c + 4 * ldc), v28)); + vst1q_f16(c + 5 * ldc, vaddq_f16(vld1q_f16(c + 5 * ldc), v29)); + vst1q_f16(c + 6 * ldc, vaddq_f16(vld1q_f16(c + 6 * ldc), v30)); + vst1q_f16(c + 7 * ldc, vaddq_f16(vld1q_f16(c + 7 * ldc), v31)); c += 8; a -= 8 * K; } @@ -263,6 +454,9 @@ void hgemm_kernel_8x8(unsigned int M, 
unsigned int N, unsigned int K, __fp16 *a = sa, *b = sb; float *c = sc; unsigned int i, j, l; + unsigned int K4 = (K >> 2) << 2; + unsigned int K8 = (K >> 3) << 3; + unsigned int K16 = (K >> 4) << 4; for (i = 0; i < M; i += VL_FP16) { for (j = 0; j < N; j += VL_FP16) { __builtin_prefetch(b, 0, 3); @@ -272,48 +466,25 @@ void hgemm_kernel_8x8(unsigned int M, unsigned int N, unsigned int K, float16x8_t v24, v25, v26, v27, v28, v29, v30, v31; float16x8_t va0, va1, va2, va3, va4, va5, va6, va7; l = 0; - - for (; l < K;) { + for (; l < K16;) { + INIT_KERNEL_8x8(); + KERNEL_8x8_ACC16(); + SAVE_KERNEL_8X8_F16_f32(); + } + for (; l < K8;) { + INIT_KERNEL_8x8(); KERNEL_8x8_ACC8(); - - vst1q_f32(c, vaddq_f32(vld1q_f32(c), vcvt_f32_f16(vget_low_f16(v24)))); - vst1q_f32( - c + 4, vaddq_f32(vld1q_f32(c + 4), vcvt_f32_f16(vget_high_f16(v24)))); - - vst1q_f32(c + ldc, vaddq_f32(vld1q_f32(c + ldc), - vcvt_f32_f16(vget_low_f16(v25)))); - vst1q_f32(c + 4 + ldc, vaddq_f32(vld1q_f32(c + 4 + ldc), - vcvt_f32_f16(vget_high_f16(v25)))); - - vst1q_f32(c + 2 * ldc, vaddq_f32(vld1q_f32(c + 2 * ldc), - vcvt_f32_f16(vget_low_f16(v26)))); - vst1q_f32(c + 4 + 2 * ldc, vaddq_f32(vld1q_f32(c + 4 + 2 * ldc), - vcvt_f32_f16(vget_high_f16(v26)))); - - vst1q_f32(c + 3 * ldc, vaddq_f32(vld1q_f32(c + 3 * ldc), - vcvt_f32_f16(vget_low_f16(v27)))); - vst1q_f32(c + 4 + 3 * ldc, vaddq_f32(vld1q_f32(c + 4 + 3 * ldc), - vcvt_f32_f16(vget_high_f16(v27)))); - - vst1q_f32(c + 4 * ldc, vaddq_f32(vld1q_f32(c + 4 * ldc), - vcvt_f32_f16(vget_low_f16(v28)))); - vst1q_f32(c + 4 + 4 * ldc, vaddq_f32(vld1q_f32(c + 4 + 4 * ldc), - vcvt_f32_f16(vget_high_f16(v28)))); - - vst1q_f32(c + 5 * ldc, vaddq_f32(vld1q_f32(c + 5 * ldc), - vcvt_f32_f16(vget_low_f16(v29)))); - vst1q_f32(c + 4 + 5 * ldc, vaddq_f32(vld1q_f32(c + 4 + 5 * ldc), - vcvt_f32_f16(vget_high_f16(v29)))); - - vst1q_f32(c + 6 * ldc, vaddq_f32(vld1q_f32(c + 6 * ldc), - vcvt_f32_f16(vget_low_f16(v30)))); - vst1q_f32(c + 4 + 6 * ldc, vaddq_f32(vld1q_f32(c + 4 + 6 * ldc), - vcvt_f32_f16(vget_high_f16(v30)))); - - vst1q_f32(c + 7 * ldc, vaddq_f32(vld1q_f32(c + 7 * ldc), - vcvt_f32_f16(vget_low_f16(v31)))); - vst1q_f32(c + 4 + 7 * ldc, vaddq_f32(vld1q_f32(c + 4 + 7 * ldc), - vcvt_f32_f16(vget_high_f16(v31)))); + SAVE_KERNEL_8X8_F16_f32(); + } + for (; l < K4;) { + INIT_KERNEL_8x8(); + KERNEL_8x8_ACC4(); + SAVE_KERNEL_8X8_F16_f32(); + } + for (; l < K;) { + INIT_KERNEL_8x8(); + KERNEL_8x8_ACC1(); + SAVE_KERNEL_8X8_F16_f32(); } c += 8; diff --git a/nntrainer/tensor/manager.cpp b/nntrainer/tensor/manager.cpp index 9a0d235ba9..4a2838d05e 100644 --- a/nntrainer/tensor/manager.cpp +++ b/nntrainer/tensor/manager.cpp @@ -407,14 +407,15 @@ std::vector Manager::requestWeights( * order with the max exec order where it will be used for clipping and then * applied to the weight. 
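Stepping back to the hgemm kernel refactor above: every kernel now follows the same three-macro pattern. INIT_KERNEL_* zeroes the FP16 accumulators, a KERNEL_*_ACCn macro accumulates n steps of the K loop in FP16, and SAVE_KERNEL_*_F16_F32 flushes the partial sums into the FP32 output. A larger n (ACC16) means fewer flushes and lower latency but more FP16 rounding error; ACC1 is the opposite extreme. The scalar sketch below only illustrates that trade-off; the function name and loop structure are hypothetical and not part of the hgemm headers.

#include <stddef.h>

/* Scalar analogue of the blocked FP16 accumulation used by the NEON kernels.
 * `acc` plays the role of the ACCn block size: partial sums are kept in FP16
 * for `acc` steps, then added into the FP32 result. */
static void hdot_blocked_ref(const __fp16 *a, const __fp16 *b, float *c,
                             size_t K, size_t acc) {
  size_t l = 0;
  while (l < K) {
    __fp16 partial = (__fp16)0.f;             /* INIT_KERNEL_*          */
    size_t end = (l + acc < K) ? l + acc : K;
    for (; l < end; ++l)
      partial += a[l] * b[l];                 /* KERNEL_*_ACCn          */
    *c += (float)partial;                     /* SAVE_KERNEL_*_F16_F32  */
  }
}

Larger `acc` values amortize the FP32 conversion over more fused multiply-accumulates, which is why the refactored kernels try ACC16 first, then ACC8/ACC4, and finally ACC1 for the K remainder.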
*/ - if (Weight::isGradientClipByGlobalNorm(clip_by_global_norm)) { + if (Weight::isGradientClipByGlobalNorm(clip_by_global_norm) || + isMixedPrecision()) { grad_exec_order.push_back(TensorPool::PERSIST_END_ORDER); // TODO: We need double check if it is OK not to add PERSIST_END_ORDER // here or add other conditions // var_exec_order.push_back(TensorPool::PERSIST_END_ORDER); } - Tensor *var = nullptr, *grad = nullptr; + Tensor *var = nullptr, *grad = nullptr, *var32 = nullptr; bool is_dependent = !shared_names.empty(); if (is_dependent) { /// shared_name is used and the orignal name is discarded @@ -431,6 +432,17 @@ std::vector Manager::requestWeights( grad = tensor_pool.requestOrExtend(shared_name + Var_Grad::grad_suffix, dim_g, grad_exec_order, grad_ls, Tensor::Initializer::ZEROS); + + if (var->getDataType() != ml::train::TensorDim::DataType::FP32) { + TensorDim var32_dim(dim_v); + var32_dim.setDataType(ml::train::TensorDim::DataType::FP32); + std::vector var32_exec_order; + var32_exec_order.push_back(TensorPool::PERSIST_END_ORDER); + + var32 = weight_pool.requestOrExtend(shared_name + ":var32", var32_dim, + var32_exec_order, var_ls, + Tensor::Initializer::ZEROS); + } } } else { /** case requesting fresh weights */ @@ -448,11 +460,21 @@ std::vector Manager::requestWeights( grad = tensor_pool.request(name + Var_Grad::grad_suffix, dim_g, grad_exec_order, grad_ls, Tensor::Initializer::ZEROS, is_wgrad); + if (var->getDataType() != ml::train::TensorDim::DataType::FP32) { + TensorDim var32_dim(dim_v); + var32_dim.setDataType(ml::train::TensorDim::DataType::FP32); + std::vector var32_exec_order; + var32_exec_order.push_back(TensorPool::PERSIST_END_ORDER); + var32 = + weight_pool.request(name + ":var32", var32_dim, var32_exec_order, + var_ls, Tensor::Initializer::ZEROS); + } } } weights_v2.emplace_back(std::make_unique( - var, grad, w_reg, w_reg_const, decay, is_dependent, clip_by_global_norm)); + var, grad, var32, w_reg, w_reg_const, decay, is_dependent, + clip_by_global_norm, axis, loss_scale)); } std::transform(weights_v2.begin() + current_size, weights_v2.end(), @@ -668,15 +690,15 @@ bool Manager::isSecondLastAccess(const std::string &name, */ std::vector Manager::requestWeightOptimizerVariables( const std::vector &dims, const std::string &name, - const TensorLifespan &lifespan, bool is_grad_clip, - Tensor::Initializer initializer) { + const std::string &suffix, const TensorLifespan &lifespan, bool is_grad_clip, + bool is_mixed_precision, Tensor::Initializer initializer) { std::vector ret; ret.reserve(dims.size()); std::vector exec; exec.reserve(1); - if (is_grad_clip) { + if (is_grad_clip || is_mixed_precision) { exec.emplace_back(TensorPool::PERSIST_END_ORDER); } else { exec.emplace_back(getMinMaxTensorExecutionOrder(name, true).second); @@ -685,7 +707,7 @@ std::vector Manager::requestWeightOptimizerVariables( /// @note this is assuming weight optimizer variables is treated as weight, if /// not, there is room to optimize below behavior for (unsigned int idx = 0; idx < dims.size(); idx++) - ret.push_back(weight_pool.request(name + ":opt" + std::to_string(idx), + ret.push_back(weight_pool.request(name + suffix + std::to_string(idx), dims[idx], exec, lifespan, initializer)); return ret; diff --git a/nntrainer/tensor/manager.h b/nntrainer/tensor/manager.h index ab1c018153..d561770206 100644 --- a/nntrainer/tensor/manager.h +++ b/nntrainer/tensor/manager.h @@ -224,7 +224,8 @@ class Manager { */ std::vector requestWeightOptimizerVariables( const std::vector &dims, const std::string &name, - 
const TensorLifespan &lifespan, bool is_grad_clip, + const std::string &suffix, const TensorLifespan &lifespan, + bool is_grad_clip, bool is_mixed_type, Tensor::Initializer initializer = Tensor::Initializer::NONE); /** @@ -494,6 +495,11 @@ class Manager { exec_mode = mode; }; + /** + * @brief return if it is mixed precsion + */ + bool isMixedPrecision() { return !istrequal(tensor_dtype[0], "FP32"); } + private: /** @todo: merge this list to one */ std::vector> weights_v2; /**< weights for the layers diff --git a/nntrainer/tensor/meson.build b/nntrainer/tensor/meson.build index 0884dbd3b4..b14fa0ee85 100644 --- a/nntrainer/tensor/meson.build +++ b/nntrainer/tensor/meson.build @@ -44,6 +44,12 @@ cl_headers = [ arch = host_machine.cpu_family() + +if get_option('enable-avx') + tensor_sources += 'blas_avx.cpp' + tensor_headers += 'blas_avx.h' +endif + if get_option('enable-fp16') if arch == 'arm' error ('FP16/ARM code (blas_neon.cpp) uses armv8.2 instructions. armv7 is not supported.') @@ -55,9 +61,6 @@ if get_option('enable-fp16') nntrainer_inc += include_directories('hgemm') nntrainer_inc_abs += meson.current_source_dir() / 'hgemm' endif - elif get_option('enable-avx') - tensor_sources += 'blas_avx.cpp' - tensor_headers += 'blas_avx.h' endif endif diff --git a/nntrainer/tensor/tensor.cpp b/nntrainer/tensor/tensor.cpp index 4f1e8e0721..827ba7e979 100644 --- a/nntrainer/tensor/tensor.cpp +++ b/nntrainer/tensor/tensor.cpp @@ -3065,6 +3065,18 @@ Tensor Tensor::clone() const { return t; } +Tensor Tensor::clone(ml::train::TensorDim::DataType type) const { + if (getDataType() == type) + return clone(); + + TensorDim dim = getDim(); + dim.setDataType(type); + Tensor t(dim, true); + t.copyData(*this); + t.name = name; + return t; +} + void Tensor::reshape(const TensorDim &d) { NNTR_THROW_IF(!contiguous, std::invalid_argument) @@ -3808,6 +3820,18 @@ void Tensor::dequantize(Tensor &output, unsigned int axis) const { return; } +bool Tensor::isValid() const { + if (getDataType() == Tdatatype::FP16) { +#ifdef ENABLE_FP16 + return is_valid(dim.getDataLen(), Tdatatype::FP16, getData<_FP16>()); +#else + throw std::invalid_argument("enble-fp16 is not set"); +#endif + } else { + return is_valid(dim.getDataLen(), Tdatatype::FP32, getData()); + } +} + // namespace nntrainer } /* namespace nntrainer */ diff --git a/nntrainer/tensor/tensor.h b/nntrainer/tensor/tensor.h index 211334da40..ad3781526f 100644 --- a/nntrainer/tensor/tensor.h +++ b/nntrainer/tensor/tensor.h @@ -1680,6 +1680,13 @@ class Tensor { */ Tensor clone() const; + /** + * @brief Convient wrapper for inplace copy of @a this. 
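For context on the clone overload declared just below: it copies into a tensor of a different data type, which the mixed-precision code uses to materialize FP32 copies of FP16 weights. A minimal usage sketch follows (hypothetical helper, assuming copyData performs the FP16 to FP32 conversion as the tensor.cpp change above suggests):

#include <tensor.h> // nntrainer

// Return an FP32 copy of a (possibly FP16) weight tensor via the new overload.
nntrainer::Tensor toFP32(const nntrainer::Tensor &w) {
  return w.clone(ml::train::TensorDim::DataType::FP32);
}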
+ * @param[in] type output tensor data type + * @retval Copied version of this + */ + Tensor clone(ml::train::TensorDim::DataType type) const; + /** * @brief Save the Tensor into file * @param[in] file output file stream @@ -2031,6 +2038,12 @@ class Tensor { static constexpr float epsilon = 1e-5; + /** + * @brief check if there is NaN or Inf element + * @param[out] bool false if there is NaN or Inf else false + */ + bool isValid() const; + private: /**< handle the data as a std::shared_ptr type */ TensorDim dim; diff --git a/nntrainer/tensor/weight.cpp b/nntrainer/tensor/weight.cpp index f98c8c8356..ea8c65a7cb 100644 --- a/nntrainer/tensor/weight.cpp +++ b/nntrainer/tensor/weight.cpp @@ -34,6 +34,28 @@ Weight::Weight(const TensorDim &dim, const Tensor::Initializer init, throw std::invalid_argument("Weight initializer cannot be none"); if (regularizer == WeightRegularizer::UNKNOWN) throw std::invalid_argument("Weight regularizer unknown"); + + std::string var32_suffix = ":fp32"; + std::string var32_name = name + var32_suffix; + + /** + * @note We assume if the Weight Data Type is not FP32, then FP32 Weight is + * necessary to maintain the accuracy. + * We could think it can be other data type and if there is the case to + * support other data type, then the code below needs to be udpated. + * + * Also, the loss_scale is not used in Weight but leave as it is for later + * usage. + */ + + if (train && dim.getDataType() != ml::train::TensorDim::DataType::FP32) { + TensorDim var32_dim(dim); + var32_dim.setDataType(ml::train::TensorDim::DataType::FP32); + + var32 = std::make_shared(var32_dim, alloc_now_, init, var32_name); + } else { + var32 = std::make_shared(var32_name); + } } Weight::Weight(const TensorDim &dim_v, const TensorDim &dim_g, @@ -52,6 +74,93 @@ Weight::Weight(const TensorDim &dim_v, const TensorDim &dim_g, throw std::invalid_argument("Weight initializer cannot be none"); if (regularizer == WeightRegularizer::UNKNOWN) throw std::invalid_argument("Weight regularizer unknown"); + + std::string var32_suffix = ":fp32"; + std::string var32_name = name + var32_suffix; + + if (train && dim_v.getDataType() != ml::train::TensorDim::DataType::FP32) { + TensorDim var32_dim(dim_v); + var32_dim.setDataType(ml::train::TensorDim::DataType::FP32); + std::string var32_suffix = ":fp32"; + std::string var32_name = name + var32_suffix; + + var32 = std::make_shared(var32_dim, alloc_now_, init, var32_name); + } else { + var32 = std::make_shared(var32_name); + } +} + +Weight::Weight(const Tensor &v, const Tensor &g, const Tensor &v32, + const std::string &n, bool is_dependent, + unsigned int output_axis_) : + Var_Grad(v, g, n, is_dependent), + regularizer(WeightRegularizer::NONE), + regularizer_constant(1.0f), + decay(0.0f), + clip_by_global_norm(0.0f), + output_axis(output_axis_), + loss_scale(1.0), + var32(std::make_shared(n + ":fp32")) { + + if (!g.empty() && isMixedPrecision()) { + TensorDim var32_dim(v.getDim()); + var32_dim.setDataType(ml::train::TensorDim::DataType::FP32); + if (!v32.empty()) + var32 = std::make_shared( + v32.getSharedDataTensor(var32_dim, 0, false, n + ":fp32")); + } +} + +Weight::Weight(Tensor *v, Tensor *g, Tensor *v32, const WeightRegularizer reg, + const float reg_const, const float decay, bool is_dependent, + const float max_norm, unsigned int output_axis_, + float loss_scale_) : + Var_Grad(v, g, is_dependent), + regularizer(reg), + regularizer_constant(reg_const), + decay(decay), + clip_by_global_norm(max_norm), + output_axis(output_axis_), + loss_scale(loss_scale_), + 
var32(std::shared_ptr(v32, [](void *) {})) { + if (!v32) + var32 = std::make_shared(); +} + +void Weight::applyGradient(double lr, Tensor &updated_grad) { + if (isMixedPrecision() && + updated_grad.getDataType() == ml::train::TensorDim::DataType::FP32) { + var32->add_i(updated_grad, -lr); + quantizeWeight(); + return; + } + + return applyGradient(lr); +} + +void Weight::quantizeWeight() { + if (!isMixedPrecision()) + return; + + Tensor &var = getVariableRef(); + ml::train::TensorDim::DataType type = var.getDataType(); + switch (type) { + case ml::train::TensorDim::DataType::QINT4: + // NYI + break; + case ml::train::TensorDim::DataType::QINT8: + // NYI + break; + case ml::train::TensorDim::DataType::FP16: + getVariableRef().copyData(getVariableFP32Ref()); + break; + case ml::train::TensorDim::DataType::FP32: + break; + default: + break; + } + + return; } } // namespace nntrainer diff --git a/nntrainer/tensor/weight.h b/nntrainer/tensor/weight.h index 552f6d5739..ef65ca9318 100644 --- a/nntrainer/tensor/weight.h +++ b/nntrainer/tensor/weight.h @@ -46,7 +46,7 @@ class Weight : public Var_Grad { decay(0.0f), clip_by_global_norm(0.0f), output_axis(3), - loss_scale(0.0) {} + loss_scale(1.0) {} /** * @brief Construct a new Weight object @@ -66,7 +66,7 @@ class Weight : public Var_Grad { const float reg_const = 1.0f, const float decay = 0.0f, const float clip_by_global_norm = 0.0f, bool ng = true, bool alloc_now = false, std::string name = "", unsigned int axis = 3, - float loss_scale_ = 0.0); + float loss_scale_ = 1.0); /** * @brief Construct a new Weight object @@ -87,7 +87,7 @@ class Weight : public Var_Grad { const float reg_const = 1.0f, const float decay = 0.0f, const float clip_by_global_norm = 0.0f, bool ng = true, bool alloc_now = false, std::string name = "", unsigned int axis = 3, - float loss_scale_ = 0.0); + float loss_scale_ = 1.0); /** * @brief Construct a new Weight object @@ -114,6 +114,7 @@ class Weight : public Var_Grad { * * @param v Already created variable object * @param g Already created gradient object + * @param v32 Already created gradient object * @param n Name for this Weight * * @note This is primarily used to created wrapper of variable extracted from @@ -123,35 +124,24 @@ class Weight : public Var_Grad { * uses only, as Weight does not own the tensors v and g, and can go invalid * if the owner of these tensors free the tensors. 
*/ - explicit Weight(const Tensor &v, const Tensor &g, const std::string &n = "", - bool is_dependent = false, unsigned int output_axis_ = 3) : - Var_Grad(v, g, n, is_dependent), - regularizer(WeightRegularizer::NONE), - regularizer_constant(1.0f), - decay(0.0f), - clip_by_global_norm(0.0f), - output_axis(output_axis_), - loss_scale(0.0) {} + explicit Weight(const Tensor &v, const Tensor &g, const Tensor &v32, + const std::string &n = "", bool is_dependent = false, + unsigned int output_axis_ = 3); /** * @brief Construct a new Weight object * * @param v ptr to already created variable tensor * @param g ptr to already created gradient tensor + * @param v32 ptr to already created variable32 tensor * @param reg Regularizer for the weight * @param reg_const Constant multiplier for regularizer */ - explicit Weight(Tensor *v, Tensor *g, const WeightRegularizer reg, - const float reg_const, const float decay, - bool is_dependent = false, const float max_norm = 0.0f, - unsigned int output_axis_ = 3, float loss_scale_ = 0.0f) : - Var_Grad(v, g, is_dependent), - regularizer(reg), - regularizer_constant(reg_const), - decay(decay), - clip_by_global_norm(max_norm), - output_axis(output_axis_), - loss_scale(loss_scale_) {} + explicit Weight(Tensor *v, Tensor *g, Tensor *v32, + const WeightRegularizer reg, const float reg_const, + const float decay, bool is_dependent = false, + const float max_norm = 0.0f, unsigned int output_axis_ = 3, + float loss_scale_ = 1.0f); /** * @brief Swap for weight @@ -170,6 +160,7 @@ class Weight : public Var_Grad { swap(lhs.output_axis, rhs.output_axis); swap(lhs.opt_vars, rhs.opt_vars); swap(lhs.loss_scale, rhs.loss_scale); + swap(lhs.var32, rhs.var32); } /** @@ -213,6 +204,8 @@ class Weight : public Var_Grad { w.var = std::make_shared(this->var->clone()); if (!this->grad->empty()) w.grad = std::make_shared(this->grad->clone()); + if (!this->var32->empty()) + w.var32 = std::make_shared(this->var32->clone()); return w; } @@ -294,6 +287,13 @@ class Weight : public Var_Grad { */ void applyGradient(double lr) { var->add_i(*grad.get(), -lr); } + /** + * @brief Apply the gradient to the weight with updated gradient + * @param[in] updated_grad gradient tensor which is updated in optimizer + * it might be different data type with gradient in weight. 
e.g., FP32 + */ + void applyGradient(double lr, Tensor &updated_grad); + /** * @brief Check if the gradient is supposed to be clipped by global norm with * the given max_norm value @@ -316,6 +316,16 @@ class Weight : public Var_Grad { return clip_by_global_norm > epsilon; } + /** + * @brief Check if the variable type is not full precision + * + * @return true if it is not full precision + * @return false otherwise + */ + bool isMixedPrecision() const { + return ((var->getDataType() != ml::train::TensorDim::DataType::FP32)); + } + /** * @brief clip the gradient value based on the given global norm * @@ -326,6 +336,32 @@ class Weight : public Var_Grad { grad->multiply_i(clip_by_global_norm / (global_norm + epsilon)); } + /** + * @brief Get the variable FP32 tensor (by reference) + * + * @return Tensor Variable FP32 tensor + */ + Tensor &getVariableFP32Ref() { return *var32.get(); } + + /** + * @brief Quantize var32 to var + * + */ + void quantizeWeight(); + + /** + * @brief set loss scale + * @param[in] scale loss scale value + * + */ + void setLossScale(float scale) { loss_scale = scale; }; + + /** + * @brief get loss scale + * + */ + const float getLossScale() { return loss_scale; }; + private: static constexpr float epsilon = 1e-6; /**< epsilon for zero comparison */ static constexpr float epsilon_decay = @@ -337,7 +373,8 @@ class Weight : public Var_Grad { float clip_by_global_norm; /**< constant factor to clip gradient by L2 norm */ unsigned int output_axis; float loss_scale; - std::vector opt_vars; /**< optimizer variables */ + std::vector + opt_vars; /**< optimizer variables : We assume it is always full-precision */ std::shared_ptr var32; /** diff --git a/packaging/nntrainer.spec b/packaging/nntrainer.spec index 36ba371d22..2f1dc57f68 100644 --- a/packaging/nntrainer.spec +++ b/packaging/nntrainer.spec @@ -65,6 +65,13 @@ %define neon_support -Denable-neon=false %endif # arch aarch64 +%ifarch x86_64 +%define enable_avx 1 +%define avx_support -Denable-avx=true +%else +%define avx_support -Denable-avx=false +%endif # arch x86_64 + Name: nntrainer Summary: Software framework for training neural networks @@ -410,7 +417,7 @@ meson --buildtype=plain --prefix=%{_prefix} --sysconfdir=%{_sysconfdir} \ %{enable_reduce_tolerance} %{configure_subplugin_install_path} %{enable_debug} \ -Dml-api-support=enabled -Denable-nnstreamer-tensor-filter=enabled \ -Denable-nnstreamer-tensor-trainer=enabled -Denable-capi=enabled \ - %{fp16_support} %{neon_support} build + %{fp16_support} %{neon_support} %{avx_support} build ninja -C build %{?_smp_mflags} @@ -563,6 +570,10 @@ cp -r result %{buildroot}%{_datadir}/nntrainer/unittest/ %{_includedir}/nntrainer/util_simd_neon.h %endif +%if 0%{?enable_avx} +%{_includedir}/nntrainer/blas_avx.h +%endif + %files devel-static %{_libdir}/libnntrainer*.a %exclude %{_libdir}/libcapi*.a diff --git a/packaging/unittest_layers.tar.gz b/packaging/unittest_layers.tar.gz index 7a435aadf4..3bd488a0a2 100644 Binary files a/packaging/unittest_layers.tar.gz and b/packaging/unittest_layers.tar.gz differ diff --git a/packaging/unittest_models_v3.tar.gz b/packaging/unittest_models_v3.tar.gz index abc7ead4a4..49a1f1b2ad 100644 Binary files a/packaging/unittest_models_v3.tar.gz and b/packaging/unittest_models_v3.tar.gz differ diff --git a/test/include/nntrainer_test_util.h b/test/include/nntrainer_test_util.h index 74eef4abaa..8e16b6a9f4 100644 --- a/test/include/nntrainer_test_util.h +++ b/test/include/nntrainer_test_util.h @@ -347,6 +347,29 @@ float mse(Ta *A, Tb *B, uint32_t size) { return mse; } +/** + *
@brief calculate mean squared errer + * + * @param A const prediction data + * @param B const reference data + * @param size data size + * @return mean squared errer value + */ +template +float mse(const Ta *A, const Tb *B, uint32_t size) { + float pred; + float ref; + float mse_error = 0; + for (uint32_t i = 0; i < size; i++) { + pred = A[i]; + ref = B[i]; + float diff = pred - ref; + mse_error += pow(diff, 2); + } + float mse = mse_error / size; + return mse; +} + /** * @brief A helper struct for performing static_cast operations on types. * diff --git a/test/input_gen/genModelTests_v2.py b/test/input_gen/genModelTests_v2.py index a56f437785..422c737487 100644 --- a/test/input_gen/genModelTests_v2.py +++ b/test/input_gen/genModelTests_v2.py @@ -11,6 +11,7 @@ import math from recorder_v2 import record_v2, inspect_file, _rand_like import torch +from torch import autocast class ReduceMeanLast(torch.nn.Module): def __init__(self): @@ -307,6 +308,40 @@ def forward(self, inputs, labels): loss = self.loss(out, labels[0]) return out, loss +class LinearMixedPrecision(torch.nn.Module): + def __init__(self): + super().__init__() + self.fc = torch.nn.Linear(3, 10) + self.loss = torch.nn.MSELoss() + + def forward(self, inputs, labels): + with autocast(device_type='cuda', dtype=torch.float16): + input=inputs[0].to('cuda') + label=labels[0].to('cuda') + out = self.fc(input) + return out + + def getOptimizer(self): + return torch.optim.Adam(self.parameters(), lr=0.1) + +class LinearMixedPrecisionNaNSGD(torch.nn.Module): + def __init__(self): + super().__init__() + self.fc0 = torch.nn.Linear(1, 1) + self.fc1 = torch.nn.Linear(1, 1) + self.loss = torch.nn.MSELoss() + + def forward(self, inputs, labels): + with autocast(device_type='cuda', dtype=torch.float16): + input=inputs[0].to('cuda') + label=labels[0].to('cuda') + out = self.fc0(input) + out = self.fc1(out) + return out + + def getOptimizer(self): + return torch.optim.SGD(self.parameters(), lr=0.1) + if __name__ == "__main__": record_v2( ReduceMeanLast(), @@ -537,5 +572,28 @@ def forward(self, inputs, labels): name="non_trainable_fc_idx3" ) - # Function to check the created golden test file + fc_mixed_training = LinearMixedPrecision() + record_v2( + fc_mixed_training, + iteration=3, + input_dims=[(1,3)], + input_dtype=[float], + label_dims=[(1,10)], + name="fc_mixed_training", + optimizer=fc_mixed_training.getOptimizer() + ) + + fc_mixed_training_nan_sgd = LinearMixedPrecisionNaNSGD() + record_v2( + fc_mixed_training_nan_sgd, + iteration=5, + input_dims=[(1,1)], + input_dtype=[float], + label_dims=[(1,1)], + name="fc_mixed_training_nan_sgd", + optimizer=fc_mixed_training_nan_sgd.getOptimizer() + ) + +# Function to check the created golden test file inspect_file("non_trainable_fc_idx3.nnmodelgolden") + diff --git a/test/input_gen/gen_layer_tests.py b/test/input_gen/gen_layer_tests.py index 48e68acaf1..7a1ed18ec6 100644 --- a/test/input_gen/gen_layer_tests.py +++ b/test/input_gen/gen_layer_tests.py @@ -17,6 +17,7 @@ @author Jihoon Lee @author Sungsik Kong +@author Debadri Samaddar """ import warnings @@ -866,3 +867,19 @@ def call(self, inputs): added = K.layers.Add() record_single_fp16(added, [(2, 3, 3, 3), (2, 3, 3, 3)], "added_w16a16") + + def swiglu(inputs): + [x, y] = inputs + # swish(x) = x * sigmoid(x) + swishTensor = x * K.activations.sigmoid(x) + + return K.layers.Multiply()([swishTensor, y]) + + swiglu_layer = K.layers.Lambda(swiglu) + + record_single( + swiglu_layer, + [(2, 3, 3, 3), (2, 3, 3, 3)], + "swiglu", + input_type="float", + ) diff 
--git a/test/input_gen/recorder_v2.py b/test/input_gen/recorder_v2.py index 9bc219c767..6b8f42ff88 100644 --- a/test/input_gen/recorder_v2.py +++ b/test/input_gen/recorder_v2.py @@ -12,6 +12,8 @@ import random import torch # torch used here is torch==1.9.1 import numpy as np +import torch.cuda.amp as amp +from torch import autocast from transLayer_v2 import params_translated @@ -29,13 +31,31 @@ def _get_writer(file): - def write_fn(items): + def write_fn(items, type = 'float32'): if not isinstance(items, (list, tuple)): items = [items] for item in items: - np.array([item.numel()], dtype="int32").tofile(file) - item.detach().cpu().numpy().tofile(file) + print(item.numel(), " -0-----") + print(item) + np.array([item.numel()], dtype='int32').tofile(file) + a=np.array(item.detach().cpu(), dtype=type) + a.tofile(file) + print(a.dtype) + + return items + + return write_fn + +def _get_writer_mixed(file): + def write_fn(items, num_type = 'int32', type = 'float32'): + if not isinstance(items, (list, tuple)): + items = [items] + + for item in items: + np.array([item.numel()], dtype=num_type).tofile(file) + a=np.array(item.detach().cpu(), dtype=type) + a.tofile(file) return items @@ -96,14 +116,65 @@ def record_iteration(write_fn): norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 0.0001) optimizer.step() + def record_iteration_with_amp(write_fn, inputs, labels, is_nan, scaler): + model_= model.cuda() + + print(inputs[0], " inputs inside") + output = model_(inputs[0], labels[0]) + + print("model output type: ",output.dtype) + + with autocast(device_type='cuda', dtype=torch.float16): + l=model_.loss(output, labels[0].to('cuda')) + + optimizer.zero_grad() + + scaler.scale(l).backward() + print("Gradient ---------------") + for param in model_.parameters(): + print (param.grad) + mask = torch.isnan(param.grad) or torch.isinf(param.grad) + check_nan = mask.int() + if check_nan.sum().item(): + is_nan = True + else: + is_nan = False + + + if not is_nan: + print("------------------------------- not nan") + write_fn(output,'int32','float32') + return output, is_nan + with open(file_name, "wb") as f: # write number of iterations + print("iteration : ", iteration) np.array([iteration], dtype="int32").tofile(f) - write_fn = _get_writer(f) - for _ in range(iteration): - record_iteration(write_fn) - + write_fn = _get_writer_mixed(f) + for i in range(iteration): + if input_label_reader != None: + inputs, labels = input_label_reader(input_dims, label_dims, input_dtype) + else: + inputs = _rand_like(input_dims, dtype=input_dtype if input_dtype is not None else float) + labels = _rand_like(label_dims, dtype=float) + print("inputs ==============") + write_fn(inputs,'int32', 'float32') + print("labels ==============") + write_fn(labels, 'int32', 'float32') + is_nan = True; + print("=========================== ", i) + scaler = amp.GradScaler() + print("weights ==============") + write_fn(list(t for _, t in params_translated(model)),'int16','float16') + print("\n\n") + while(is_nan): + print( "before is_nan_", is_nan) + output,is_nan_ = record_iteration_with_amp(write_fn, inputs, labels, is_nan, scaler) + is_nan = is_nan_ + print( "after is_nan_", is_nan) + scaler.step(optimizer) + scaler.update() ## # @brief inpsect if file is created correctly diff --git a/test/jni/Android.mk b/test/jni/Android.mk index a9033b65cc..978e98bd67 100644 --- a/test/jni/Android.mk +++ b/test/jni/Android.mk @@ -16,6 +16,7 @@ NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/nntrainer \ $(NNTRAINER_ROOT)/nntrainer/dataset \ 
$(NNTRAINER_ROOT)/nntrainer/models \ $(NNTRAINER_ROOT)/nntrainer/layers \ + $(NNTRAINER_ROOT)/nntrainer/layers/cl_layers \ $(NNTRAINER_ROOT)/nntrainer/compiler \ $(NNTRAINER_ROOT)/nntrainer/graph \ $(NNTRAINER_ROOT)/nntrainer/opencl \ @@ -442,6 +443,7 @@ LOCAL_SRC_FILES := \ ../unittest/layers/unittest_layers_impl.cpp \ ../unittest/layers/unittest_layers_input.cpp \ ../unittest/layers/unittest_layers_loss.cpp \ + ../unittest/layers/unittest_layers_fully_connected_cl.cpp \ ../unittest/layers/unittest_layers_fully_connected.cpp \ ../unittest/layers/unittest_layers_batch_normalization.cpp \ ../unittest/layers/unittest_layers_layer_normalization.cpp \ diff --git a/test/nntrainer_test_util.cpp b/test/nntrainer_test_util.cpp index bcc33e40c8..5777bb75b2 100644 --- a/test/nntrainer_test_util.cpp +++ b/test/nntrainer_test_util.cpp @@ -332,6 +332,7 @@ void sizeCheckedReadTensor(nntrainer::Tensor &t, std::ifstream &file, nntrainer::checkedRead(file, (char *)&sz, sizeof(unsigned)); } else if (t.getDataType() == ml::train::TensorDim::DataType::FP16) { #ifdef ENABLE_FP16 + // This needs to be fixed. sz is always unsinged int type. nntrainer::checkedRead(file, (char *)&sz, sizeof(_FP16)); #else throw std::invalid_argument("Error: enable-fp16 is not enabled"); diff --git a/test/unittest/layers/layers_common_tests.h b/test/unittest/layers/layers_common_tests.h index 57f693c0a2..d63357c805 100644 --- a/test/unittest/layers/layers_common_tests.h +++ b/test/unittest/layers/layers_common_tests.h @@ -93,6 +93,7 @@ class LayerPropertySemantics : public LayerSemantics {}; typedef enum { SKIP_CALC_GRAD = 1 << 0, /**< skip calculating gradient and compare */ SKIP_CALC_DERIV = 1 << 1, /**< skip calculating derivative and compare */ + USE_INC_FORWARD = 1 << 2, /**< use incremental forwarding and compare */ FORWARD_MODE_INFERENCE = 1 << 2, /**< set if layer should be forwarded with inference mode */ @@ -172,6 +173,14 @@ class LayerGoldenTest */ bool shouldSkipCalcGrad(); + /** + * @brief check if given test suite should use incremental forwarding instead + * of normal forwarding + * + * @return bool true if should use incremental forwarding + */ + bool shouldUseIncForward(); + /** * @brief check if given test suite should skip cosine similarity check * diff --git a/test/unittest/layers/layers_golden_tests.cpp b/test/unittest/layers/layers_golden_tests.cpp index 64400e6ecd..73f3954052 100644 --- a/test/unittest/layers/layers_golden_tests.cpp +++ b/test/unittest/layers/layers_golden_tests.cpp @@ -156,7 +156,7 @@ static RunLayerContext prepareRunContext(const TensorPacks &packs) { }; auto rc = - RunLayerContext("golden", true, 0.0f, false, create_view(weights), + RunLayerContext("golden", true, 0.0f, false, 1.0, create_view(weights), create_view(ins), create_view(outs), create_view(tensors)); auto num_outputs = rc.getNumOutputs(); @@ -364,6 +364,11 @@ bool LayerGoldenTest::shouldSkipCalcGrad() { LayerGoldenTestParamOptions::SKIP_CALC_GRAD; } +bool LayerGoldenTest::shouldUseIncForward() { + return std::get(GetParam()) & + LayerGoldenTestParamOptions::USE_INC_FORWARD; +} + bool LayerGoldenTest::shouldSkipCosineSimilarity() { return std::get(GetParam()) & LayerGoldenTestParamOptions::SKIP_COSINE_SIMILARITY; @@ -387,15 +392,31 @@ TEST_P(LayerGoldenTest, run) { bool skip_calc_grad = shouldSkipCalcGrad(); bool skip_calc_deriv = shouldSkipCalcDeriv(); + bool use_inc_forward = shouldUseIncForward(); bool dropout_compare_60_percent = shouldMatchDropout60Percent(); bool skip_cos_sim = shouldSkipCosineSimilarity(); + Tensor &input 
= rc.getInput(0); + TensorDim input_dim = input.getDim(); + size_t inputHeight = input_dim.height(); + for (int i = 0; i < 4; ++i) { /// warm layer multiple times + if (use_inc_forward) { + layer->incremental_forwarding(rc, 0, inputHeight, + !shouldForwardWithInferenceMode()); + } else { + layer->forwarding(rc, !shouldForwardWithInferenceMode()); + } + } + + if (use_inc_forward) { + layer->incremental_forwarding(rc, 0, inputHeight, + !shouldForwardWithInferenceMode()); + } else { layer->forwarding(rc, !shouldForwardWithInferenceMode()); } - layer->forwarding(rc, !shouldForwardWithInferenceMode()); if (!skip_calc_grad) { layer->calcGradient(rc); } diff --git a/test/unittest/layers/unittest_layer_node.cpp b/test/unittest/layers/unittest_layer_node.cpp index 3b41f02f30..37287f7ce5 100644 --- a/test/unittest/layers/unittest_layer_node.cpp +++ b/test/unittest/layers/unittest_layer_node.cpp @@ -131,7 +131,7 @@ TEST(nntrainer_LayerNode, finalize_05_n) { nntrainer::createLayerNode(nntrainer::IdentityLayer::type)); EXPECT_NO_THROW(lnode->setProperty({"input_shape=1:1:1", "name=abc"})); EXPECT_NO_THROW(lnode->finalize()); - EXPECT_NO_THROW(lnode->configureRunContext({}, {&input}, {}, {})); + EXPECT_NO_THROW(lnode->configureRunContext({}, {&input}, {}, {}, 1.0)); EXPECT_THROW(lnode->finalize(), std::runtime_error); } @@ -298,7 +298,7 @@ TEST(nntrainer_LayerNode, setWeights_02_n) { EXPECT_NO_THROW(lnode = nntrainer::createLayerNode(nntrainer::IdentityLayer::type)); EXPECT_NO_THROW(lnode->setProperty({"input_shape=1:1:1", "name=abc"})); - EXPECT_NO_THROW(lnode->configureRunContext({&weight}, {&input}, {}, {})); + EXPECT_NO_THROW(lnode->configureRunContext({&weight}, {&input}, {}, {}, 1.0)); EXPECT_THROW(lnode->setWeights(new_weights), std::runtime_error); } diff --git a/test/unittest/layers/unittest_layers_convolution2d.cpp b/test/unittest/layers/unittest_layers_convolution2d.cpp index 724c79079b..92d9c593e7 100644 --- a/test/unittest/layers/unittest_layers_convolution2d.cpp +++ b/test/unittest/layers/unittest_layers_convolution2d.cpp @@ -198,3 +198,185 @@ GTEST_PARAMETER_TEST( conv2d_mb_valid_drop_last, conv2d_sb_no_overlap, conv2d_mb_no_overlap, conv2d_sb_1x1_kernel, conv2d_mb_1x1_kernel, conv2d_sb_dilation, conv2d_mb_dilation, conv2d_sb_same_dilation, conv2d_mb_same_dilation)); + +#ifdef ENABLE_FP16 +auto conv2d_sb_minimum_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + {"filters=3", "kernel_size=2,2"}, "1:1:4:4", + "conv2d_sb_minimum_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_mb_minimum_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + {"filters=3", "kernel_size=2,2"}, "3:1:4:4", + "conv2d_mb_minimum_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_sb_same_remain_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + {"filters=2", "kernel_size=3,3", "padding=same"}, "1:1:4:4", + "conv2d_sb_same_remain_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_mb_same_remain_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + {"filters=2", "kernel_size=3,3", "padding=same"}, "3:1:4:4", + "conv2d_mb_same_remain_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_sb_same_uneven_remain_1_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "stride=2,2", + "padding=same", + }, + "1:3:4:4", 
"conv2d_sb_same_uneven_remain_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_sb_same_uneven_remain_2_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "stride=2,2", + "padding=0,1,0,1", + }, + "1:3:4:4", "conv2d_sb_same_uneven_remain_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_mb_same_uneven_remain_1_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "stride=2,2", + "padding=same", + }, + "3:3:4:4", "conv2d_mb_same_uneven_remain_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_mb_same_uneven_remain_2_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "stride=2,2", + "padding=0,1,0,1", + }, + "3:3:4:4", "conv2d_mb_same_uneven_remain_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_sb_valid_drop_last_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "stride=2,2", + "padding=valid", + }, + "1:3:7:7", "conv2d_sb_valid_drop_last_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_mb_valid_drop_last_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "stride=2,2", + "padding=valid", + }, + "3:3:7:7", "conv2d_mb_valid_drop_last_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_sb_no_overlap_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + {"filters=3", "kernel_size=2,2", "stride=3,3"}, "1:2:5:5", + "conv2d_sb_no_overlap_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_mb_no_overlap_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=3", + "kernel_size=2,2", + "stride=3,3", + }, + "3:2:5:5", "conv2d_mb_no_overlap_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_sb_1x1_kernel_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + {"filters=3", "kernel_size=1,1", "stride=2,2"}, "1:2:5:5", + "conv2d_sb_1x1_kernel_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_mb_1x1_kernel_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=3", + "kernel_size=1,1", + "stride=2,2", + }, + "3:2:5:5", "conv2d_mb_1x1_kernel_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_sb_dilation_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "dilation=2,2", + }, + "1:3:11:11", "conv2d_sb_dilation_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_mb_dilation_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "dilation=2,2", + }, + "3:3:11:11", "conv2d_mb_dilation_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_sb_same_dilation_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "padding=same", + "dilation=2,2", + }, + "1:3:11:11", "conv2d_sb_same_dilation_w16a16.nnlayergolden", + 
LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_mb_same_dilation_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "padding=same", + "dilation=2,2", + }, + "3:3:11:11", "conv2d_mb_same_dilation_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +GTEST_PARAMETER_TEST( + Convolution2D16, LayerGoldenTest, + ::testing::Values(conv2d_sb_minimum_w16a16, conv2d_mb_minimum_w16a16, + conv2d_sb_same_remain_w16a16, conv2d_mb_same_remain_w16a16, + conv2d_sb_same_uneven_remain_1_w16a16, + conv2d_sb_same_uneven_remain_2_w16a16, + conv2d_mb_same_uneven_remain_1_w16a16, + conv2d_mb_same_uneven_remain_2_w16a16, + conv2d_sb_valid_drop_last_w16a16, + conv2d_mb_valid_drop_last_w16a16, + conv2d_sb_no_overlap_w16a16, conv2d_mb_no_overlap_w16a16, + conv2d_sb_1x1_kernel_w16a16, conv2d_mb_1x1_kernel_w16a16, + conv2d_sb_dilation_w16a16, conv2d_mb_dilation_w16a16, + conv2d_sb_same_dilation_w16a16, + conv2d_mb_same_dilation_w16a16)); +#endif diff --git a/test/unittest/layers/unittest_layers_fully_connected_cl.cpp b/test/unittest/layers/unittest_layers_fully_connected_cl.cpp new file mode 100644 index 0000000000..07bb138272 --- /dev/null +++ b/test/unittest/layers/unittest_layers_fully_connected_cl.cpp @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Debadri Samaddar + * + * @file unittest_layers_fully_connected_cl.cpp + * @date 7 June 2024 + * @brief Fully Connected Layer Test + * @see https://github.com/nnstreamer/nntrainer + * @author Debadri Samaddar + * @bug No known bugs except for NYI items + */ +#include + +#include + +#include +#include + +auto semantic_fc_gpu = LayerSemanticsParamType( + nntrainer::createLayer, + nntrainer::FullyConnectedLayerCl::type, {"unit=1"}, + LayerCreateSetPropertyOptions::AVAILABLE_FROM_APP_CONTEXT, false, 1); + +GTEST_PARAMETER_TEST(FullyConnectedGPU, LayerSemantics, + ::testing::Values(semantic_fc_gpu)); + +auto fc_gpu_plain = LayerGoldenTestParamType( + nntrainer::createLayer, {"unit=5"}, + "3:1:1:10", "fc_plain.nnlayergolden", LayerGoldenTestParamOptions::DEFAULT, + "nchw", "fp32", "fp32"); +auto fc_gpu_single_batch = LayerGoldenTestParamType( + nntrainer::createLayer, {"unit=4"}, + "1:1:1:10", "fc_single_batch.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp32", "fp32"); +auto fc_gpu_no_decay = LayerGoldenTestParamType( + nntrainer::createLayer, + {"unit=5", "weight_decay=0.0", "bias_decay=0.0"}, "3:1:1:10", + "fc_plain.nnlayergolden", LayerGoldenTestParamOptions::DEFAULT, "nchw", + "fp32", "fp32"); + +auto fc_gpu_plain_nhwc = LayerGoldenTestParamType( + nntrainer::createLayer, {"unit=5"}, + "3:10:1:1", "fc_plain.nnlayergolden", + LayerGoldenTestParamOptions::SKIP_CALC_DERIV | + LayerGoldenTestParamOptions::SKIP_CALC_GRAD | + LayerGoldenTestParamOptions::USE_INC_FORWARD, + "nhwc", "fp32", "fp32"); + +auto fc_gpu_single_batch_nhwc = LayerGoldenTestParamType( + nntrainer::createLayer, {"unit=4"}, + "1:10:1:1", "fc_single_batch.nnlayergolden", + LayerGoldenTestParamOptions::SKIP_CALC_DERIV | + LayerGoldenTestParamOptions::SKIP_CALC_GRAD, + "nhwc", "fp32", "fp32"); + +auto fc_gpu_no_decay_nhwc = LayerGoldenTestParamType( + nntrainer::createLayer, + {"unit=5", "weight_decay=0.0", "bias_decay=0.0"}, "3:10:1:1", + "fc_plain.nnlayergolden", + LayerGoldenTestParamOptions::SKIP_CALC_DERIV | + LayerGoldenTestParamOptions::SKIP_CALC_GRAD, + "nhwc", "fp32", "fp32"); + +GTEST_PARAMETER_TEST(FullyConnectedGPU, 
LayerGoldenTest, + ::testing::Values(fc_gpu_plain, fc_gpu_single_batch, + fc_gpu_no_decay, fc_gpu_plain_nhwc, + fc_gpu_single_batch_nhwc, + fc_gpu_no_decay_nhwc)); diff --git a/test/unittest/models/meson.build b/test/unittest/models/meson.build index 7166fc41ff..3f17369f94 100644 --- a/test/unittest/models/meson.build +++ b/test/unittest/models/meson.build @@ -1,4 +1,5 @@ test_name = 'unittest_models' +mixed_test_name = 'unittest_mixed_models' test_target = [] @@ -11,6 +12,30 @@ models_targets = [ # disable temperally ] +mixed_test_targets = [ + 'models_test_utils.cpp', + 'models_golden_test.cpp', + 'unittest_models_mixed_precision.cpp', +] + +if get_option('enable-fp16') + mixed_exe = executable( + mixed_test_name, + mixed_test_targets, + include_directories: include_directories('.'), + dependencies: [ + nntrainer_test_main_deps, nntrainer_ccapi_dep + ], + install: get_option('enable-test'), + install_dir: application_install_dir + ) + + test(mixed_test_name, mixed_exe, + args: '--gtest_output=xml:@0@/@1@.xml'.format(meson.build_root(), mixed_test_name), + timeout: test_timeout + ) +endif + test_target += models_targets exe = executable( test_name, diff --git a/test/unittest/models/models_test_utils.cpp b/test/unittest/models/models_test_utils.cpp index 741e008994..ac956d479b 100644 --- a/test/unittest/models/models_test_utils.cpp +++ b/test/unittest/models/models_test_utils.cpp @@ -50,8 +50,41 @@ static sharedConstTensors toSharedTensors(const std::vector &ts) { static void verify(const nntrainer::Tensor &actual, const nntrainer::Tensor &expected, const std::string &error_msg) { + bool equal = false; + + if (actual.getDataType() == ml::train::TensorDim::DataType::FP32 && + expected.getDataType() == ml::train::TensorDim::DataType::FP32) { + equal = (actual == expected); + if (!equal) { + float mseError = mse(actual.getData(), + expected.getData(), actual.size()); + if (mseError > 1e-4) { + equal = false; + } else { + equal = true; + } + } + } + +#ifdef ENABLE_FP16 + if (!equal) { + if (actual.getDataType() == ml::train::TensorDim::DataType::FP16 && + expected.getDataType() == ml::train::TensorDim::DataType::FP16) { + float mseError = mse<_FP16>(actual.getData<_FP16>(), + expected.getData<_FP16>(), actual.size()); + if (mseError > 1e-2) { + equal = false; + } else { + equal = true; + } + } + } +#endif + + if (!equal) { + nntrainer::Tensor diff = actual.subtract(expected); + const float *diff_data = diff.getData(); - if (actual != expected) { std::cout << "============================================================\n"; std::cout << "\033[1;33m" << error_msg << "\033[0m\n"; @@ -60,8 +93,6 @@ static void verify(const nntrainer::Tensor &actual, << " - " << expected; if (actual.getDim() == expected.getDim()) { - nntrainer::Tensor diff = actual.subtract(expected); - const float *diff_data = diff.getData(); std::cout << "\033[1;33mdifference\033[0m " << diff; std::cout << "number of data: " << diff.size() << std::endl; std::cout << "\033[4;33mMAX DIFF: " @@ -119,6 +150,12 @@ class IterationForGolden { } Tensor &t = rc.getWeight(i); + + if (t.getDataType() != ml::train::TensorDim::DataType::FP32) { + Tensor &t32 = rc.getWeightFP32(i); + weights32.push_back(t32); + } + weights.push_back(t); expected_weights.push_back(t.clone()); } @@ -158,6 +195,10 @@ class IterationForGolden { } else { for (unsigned int i = 0; i < weights.size(); ++i) { weights.at(i).fill(expected_weights.at(i)); + if (iteration == 0 && + weights.at(i).getDataType() != ml::train::TensorDim::DataType::FP32) +
weights32.at(i).fill( + weights.at(i).clone(ml::train::TensorDim::DataType::FP32)); } } @@ -174,6 +215,7 @@ class IterationForGolden { std::vector inputs; std::vector labels; std::vector weights; + std::vector weights32; std::vector expected_weights; std::vector expected_outputs; }; diff --git a/test/unittest/models/unittest_models_mixed_precision.cpp b/test/unittest/models/unittest_models_mixed_precision.cpp new file mode 100644 index 0000000000..04c1495491 --- /dev/null +++ b/test/unittest/models/unittest_models_mixed_precision.cpp @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Jijoong Moon + * + * @file unittest_models_mixed_precision.cpp + * @date 3 May 2024 + * @brief unittest models to cover mixed precision + * @see https://github.com/nnstreamer/nntrainer + * @author Jijoong Moon + * @bug No known bugs except for NYI items + */ + +#include + +#include + +#include +#include +#include + +#include + +using namespace nntrainer; + +static std::unique_ptr fc_mixed_training() { + std::unique_ptr nn(new NeuralNetwork()); + nn->setProperty( + {"batch_size=1", "model_tensor_type=FP16-FP16", "loss_scale=65536"}); + + auto graph = makeGraph({ + {"input", {"name=in", "input_shape=1:1:3"}}, + {"Fully_connected", {"name=fc", "input_layers=in", "unit=10"}}, + {"mse", {"name=loss", "input_layers=fc"}}, + }); + for (auto &node : graph) { + nn->addLayer(node); + } + + nn->setOptimizer(ml::train::createOptimizer( + "adam", {"learning_rate = 0.1", "torch_ref=true"})); + + return nn; +} + +static std::unique_ptr fc_mixed_training_nan_sgd() { + std::unique_ptr nn(new NeuralNetwork()); + nn->setProperty( + {"batch_size=1", "model_tensor_type=FP16-FP16", "loss_scale=65536"}); + + auto graph = makeGraph({ + {"input", {"name=in", "input_shape=1:1:1"}}, + {"Fully_connected", {"name=fc0", "input_layers=in", "unit=1"}}, + {"Fully_connected", {"name=fc1", "input_layers=fc0", "unit=1"}}, + {"mse", {"name=loss", "input_layers=fc1"}}, + }); + for (auto &node : graph) { + nn->addLayer(node); + } + + nn->setOptimizer(ml::train::createOptimizer("sgd", {"learning_rate = 0.1"})); + + return nn; +} + +GTEST_PARAMETER_TEST( + MixedPrecision, nntrainerModelTest, + ::testing::ValuesIn({ + mkModelTc_V2(fc_mixed_training, "fc_mixed_training", + ModelTestOption::ALL_V2), + mkModelTc_V2(fc_mixed_training_nan_sgd, "fc_mixed_training_nan_sgd", + ModelTestOption::ALL_V2), + }), + [](const testing::TestParamInfo &info) + -> const auto & { return std::get<1>(info.param); }); diff --git a/test/unittest/unittest_nntrainer_tensor.cpp b/test/unittest/unittest_nntrainer_tensor.cpp index 94aa01836d..d5b6a028f9 100644 --- a/test/unittest/unittest_nntrainer_tensor.cpp +++ b/test/unittest/unittest_nntrainer_tensor.cpp @@ -4704,6 +4704,30 @@ TEST(nntrainer_Tensor, inv_sqrt_i_uncontiguous_p) { } } +/** + * @brief fp16 tensor has NaN + */ +TEST(nntrainer_Tensor, is_valid_01) { + size_t batch = 1; + size_t channel = 3; + size_t height = 4; + size_t width = 5; + + nntrainer::Tensor input( + {batch, + channel, + height, + width, + {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP32}}, + true, nntrainer::Tensor::Initializer::ZEROS); + + EXPECT_EQ(input.isValid(), true); + + input.setValue(0, 0, 0, 0, std::nan("1")); + + EXPECT_EQ(input.isValid(), false); +} + int main(int argc, char **argv) { int result = -1; diff --git a/test/unittest/unittest_nntrainer_tensor_fp16.cpp b/test/unittest/unittest_nntrainer_tensor_fp16.cpp index 2b0d9c040d..58455757c5 100644 --- a/test/unittest/unittest_nntrainer_tensor_fp16.cpp 
+++ b/test/unittest/unittest_nntrainer_tensor_fp16.cpp @@ -6196,6 +6196,34 @@ TEST(nntrainer_Tensor, dequantize_06_p) { EXPECT_EQ(output, answer3); } +/** + * @brief fp16 tensor has NaN + */ +TEST(nntrainer_Tensor, is_valid_01) { + size_t batch = 1; + size_t channel = 3; + size_t height = 4; + size_t width = 5; + + nntrainer::Tensor input( + {batch, + channel, + height, + width, + {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}}, + true, nntrainer::Tensor::Initializer::ZEROS); + + EXPECT_EQ(input.isValid(), true); + + input.setValue(0, 0, 0, 0, std::nan("1")); + + EXPECT_EQ(input.isValid(), false); + + input.setValue(0, 0, 0, 0, std::numeric_limits::infinity()); + + EXPECT_EQ(input.isValid(), false); +} + GTEST_API_ int main(int argc, char **argv) { int result = -1; diff --git a/test/unittest/unittest_nntrainer_tensor_neon_fp16.cpp b/test/unittest/unittest_nntrainer_tensor_neon_fp16.cpp index e02eac1786..799a910273 100644 --- a/test/unittest/unittest_nntrainer_tensor_neon_fp16.cpp +++ b/test/unittest/unittest_nntrainer_tensor_neon_fp16.cpp @@ -120,6 +120,70 @@ TEST(nntrainer_Tensor, dot) { EXPECT_IN_RANGE((float)cosSimNeon, 0.99, 1); } +TEST(nntrainer_Tensor, hdot_768) { + + nntrainer::TensorDim::TensorType t_type_nchw_fp16 = { + nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}; + + nntrainer::TensorDim::TensorType t_type_nchw_fp32 = { + nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP32}; + + // conditions for fp16 hdot call: + // this->(batch * channel * height) = arg->(width) = 1; + size_t batch = 1; + size_t channel = 1; + size_t height = 1; + size_t width = 768; + + nntrainer::Tensor input( + nntrainer::TensorDim(1, 1, 1, width, t_type_nchw_fp16)); + + nntrainer::Tensor input_2( + nntrainer::TensorDim(1, 1, width, 1, t_type_nchw_fp16)); + + nntrainer::Tensor input_fp32( + nntrainer::TensorDim(1, 1, 1, width, t_type_nchw_fp32)); + + nntrainer::Tensor input_fp32_2( + nntrainer::TensorDim(1, 1, width, 1, t_type_nchw_fp32)); + + const float alpha = 1e-1; + const int MOD = 10; + + GEN_TEST_INPUT(input, ((i * j * (batch * height * channel) + + j * (batch * height) + k * (width) + l + 1) % + MOD) * + alpha); + GEN_TEST_INPUT(input_fp32, ((i * j * (batch * height * channel) + + j * (batch * height) + k * (width) + l + 1) % + MOD) * + alpha); + GEN_TEST_INPUT(input_2, ((i * k * (batch * height * channel) + + j * (batch * height) + k * (width) + l + 1) % + MOD) * + alpha); + GEN_TEST_INPUT(input_fp32_2, ((i * k * (batch * height * channel) + + j * (batch * height) + k * (width) + l + 1) % + MOD) * + alpha); + + nntrainer::Tensor result_neon = input.dot(input_2, false, false); + nntrainer::Tensor result_fp32 = input_fp32.dot(input_fp32_2, false, false); + + float mseErrorNeon = + mse<__fp16>(result_neon.getData<__fp16>(), result_fp32.getData(), + result_neon.size()); + + double cosSimNeon = + cosine_similarity<__fp16>(result_neon.getData<__fp16>(), + result_fp32.getData(), result_neon.size()); + + const float epsilon = 1e-3; + + EXPECT_IN_RANGE(mseErrorNeon, 0, epsilon); + EXPECT_IN_RANGE((float)cosSimNeon, 0.99, 1); +} + TEST(nntrainer_Tensor, l2norm) { nntrainer::TensorDim::TensorType t_type_nchw_fp16 = { @@ -701,6 +765,128 @@ TEST(nntrainer_Tensor, dot_gemm_50_768_20000) { EXPECT_IN_RANGE((float)cosSimNeon, 0.99, 1); } +TEST(nntrainer_Tensor, dot_gemm_512_520_1032) { + /// @note GEMM : A X B = C + int batch = 1; + int channel = 1; + int height = 512; + int width = 520; + + int height_b = 520; + int width_b = 1032; + + bool transA = false; + bool transB = false; + + 
nntrainer::TensorDim::TensorType t_type_nchw_fp16 = { + nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}; + + nntrainer::TensorDim::TensorType t_type_nchw_fp32 = { + nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP32}; + + nntrainer::Tensor A(batch, channel, height, width, t_type_nchw_fp16); + nntrainer::Tensor B(batch, channel, height_b, width_b, t_type_nchw_fp16); + + nntrainer::Tensor A_fp32(batch, channel, height, width, t_type_nchw_fp32); + nntrainer::Tensor B_fp32(batch, channel, height_b, width_b, t_type_nchw_fp32); + + const float alpha = 1e-1; + const int MOD = 10; + + GEN_TEST_INPUT(A, ((i * (batch * height * channel) + j * (batch * height) + + k * (width) + l + 1) % + MOD) * + alpha); + GEN_TEST_INPUT_B(B, ((i * (batch * height_b * channel) + + j * (batch * height_b) + k * (width_b) + l + 1) % + MOD) * + alpha); + GEN_TEST_INPUT(A_fp32, ((i * (batch * height * channel) + + j * (batch * height) + k * (width) + l + 1) % + MOD) * + alpha); + GEN_TEST_INPUT_B(B_fp32, ((i * (batch * height_b * channel) + + j * (batch * height_b) + k * (width_b) + l + 1) % + MOD) * + alpha); + + nntrainer::Tensor C = A.dot(B, transA, transB); + + nntrainer::Tensor C_fp32 = A_fp32.dot(B_fp32, transA, transB); + + float mseErrorNeon = + mse<__fp16>(C.getData<__fp16>(), C_fp32.getData(), C.size()); + + double cosSimNeon = cosine_similarity<__fp16>( + C.getData<__fp16>(), C_fp32.getData(), C.size()); + + const float epsilon = 1e-3 * width; + + EXPECT_IN_RANGE(mseErrorNeon, 0, epsilon); + EXPECT_IN_RANGE((float)cosSimNeon, 0.99, 1); +} + +TEST(nntrainer_Tensor, dot_gemm_1001_1024_20000) { + /// @note GEMM : A X B = C + int batch = 1; + int channel = 1; + int height = 1001; + int width = 1024; + + int height_b = 1024; + int width_b = 20000; + + bool transA = false; + bool transB = false; + + nntrainer::TensorDim::TensorType t_type_nchw_fp16 = { + nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}; + + nntrainer::TensorDim::TensorType t_type_nchw_fp32 = { + nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP32}; + + nntrainer::Tensor A(batch, channel, height, width, t_type_nchw_fp16); + nntrainer::Tensor B(batch, channel, height_b, width_b, t_type_nchw_fp16); + + nntrainer::Tensor A_fp32(batch, channel, height, width, t_type_nchw_fp32); + nntrainer::Tensor B_fp32(batch, channel, height_b, width_b, t_type_nchw_fp32); + + const float alpha = 1e-1; + const int MOD = 10; + + GEN_TEST_INPUT(A, ((i * (batch * height * channel) + j * (batch * height) + + k * (width) + l + 1) % + MOD) * + alpha); + GEN_TEST_INPUT_B(B, ((i * (batch * height_b * channel) + + j * (batch * height_b) + k * (width_b) + l + 1) % + MOD) * + alpha); + GEN_TEST_INPUT(A_fp32, ((i * (batch * height * channel) + + j * (batch * height) + k * (width) + l + 1) % + MOD) * + alpha); + GEN_TEST_INPUT_B(B_fp32, ((i * (batch * height_b * channel) + + j * (batch * height_b) + k * (width_b) + l + 1) % + MOD) * + alpha); + + nntrainer::Tensor C = A.dot(B, transA, transB); + + nntrainer::Tensor C_fp32 = A_fp32.dot(B_fp32, transA, transB); + + float mseErrorNeon = + mse<__fp16>(C.getData<__fp16>(), C_fp32.getData(), C.size()); + + double cosSimNeon = cosine_similarity<__fp16>( + C.getData<__fp16>(), C_fp32.getData(), C.size()); + + const float epsilon = 1e-3 * width; + + EXPECT_IN_RANGE(mseErrorNeon, 0, epsilon); + EXPECT_IN_RANGE((float)cosSimNeon, 0.99, 1); +} + TEST(nntrainer_Tensor, dot_gemm_50_768_516) { /// @note GEMM : A X B = C int batch = 1; @@ -994,6 +1180,38 @@ TEST(nntrainer_Tensor, inv_sqrt_i_p) { EXPECT_EQ(flag, true); } +/** + * 
@brief fp16 tensor has NaN + */ +TEST(nntrainer_Tensor, is_valid_01) { + size_t batch = 1; + size_t channel = 3; + size_t height = 4; + size_t width = 5; + + nntrainer::Tensor input( + {batch, + channel, + height, + width, + {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}}, + true, nntrainer::Tensor::Initializer::ZEROS); + + EXPECT_EQ(input.isValid(), true); + + input.setValue(0, 0, 0, 0, std::nan("1")); + + EXPECT_EQ(input.isValid(), false); + + input.setValue(0, 0, 0, 0, std::numeric_limits::infinity()); + + EXPECT_EQ(input.isValid(), false); + + input.setValue(0, 0, 0, 0, 1); + + EXPECT_EQ(input.isValid(), true); +} + GTEST_API_ int main(int argc, char **argv) { int result = -1; diff --git a/tools/package_android.sh b/tools/package_android.sh index 6e02cc23d2..5fc7ba8754 100755 --- a/tools/package_android.sh +++ b/tools/package_android.sh @@ -17,14 +17,14 @@ if [ ! -d builddir ]; then #default value of openblas num threads is 1 for android #enable-tflite-interpreter=false is just temporally until ci system is stabel #enable-opencl=true will compile OpenCL related changes or remove this option to exclude OpenCL compilations. - meson builddir -Dplatform=android -Dopenblas-num-threads=1 -Denable-tflite-interpreter=false -Denable-tflite-backbone=false -Denable-fp16=true -Denable-neon=true -Domp-num-threads=1 -Denable-opencl=true + meson builddir -Dplatform=android -Dopenblas-num-threads=1 -Denable-tflite-interpreter=false -Denable-tflite-backbone=false -Denable-fp16=true -Denable-neon=true -Domp-num-threads=1 -Denable-opencl=true -Denable-avx=false else echo "warning: $TARGET/builddir has already been taken, this script tries to reconfigure and try building" pushd builddir #default value of openblas num threads is 1 for android #enable-tflite-interpreter=false is just temporally until ci system is stabel #enable-opencl=true will compile OpenCL related changes or remove this option to exclude OpenCL compilations. - meson configure -Dplatform=android -Dopenblas-num-threads=1 -Denable-tflite-interpreter=false -Denable-tflite-backbone=false -Denable-fp16=true -Denable-neon=true -Domp-num-threads=1 -Denable-opencl=true + meson configure -Dplatform=android -Dopenblas-num-threads=1 -Denable-tflite-interpreter=false -Denable-tflite-backbone=false -Denable-fp16=true -Denable-neon=true -Domp-num-threads=1 -Denable-opencl=true -Denable-avx=false meson --wipe popd fi
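Note on the mixed-precision update path introduced in weight.cpp above: Weight::applyGradient(lr, updated_grad) applies the optimizer step to the FP32 master copy (var32) and quantizeWeight() then refreshes the low-precision working copy (var) from it. The snippet below is only an illustration of that idea with made-up names (half_t, MixedWeight, apply_gradient); it is not nntrainer code, and half_t stands in for the real FP16 type used when enable-fp16 is set.

#include <cstddef>
#include <vector>

using half_t = float; // placeholder for an actual half-precision type

struct MixedWeight {
  std::vector<half_t> var;  // low-precision copy used by forward/backward
  std::vector<float> var32; // FP32 master copy that accumulates updates
};

void apply_gradient(MixedWeight &w, const std::vector<float> &grad32, float lr) {
  for (size_t i = 0; i < w.var32.size(); ++i)
    w.var32[i] -= lr * grad32[i]; // step on the FP32 master copy
  for (size_t i = 0; i < w.var.size(); ++i)
    w.var[i] = static_cast<half_t>(w.var32[i]); // quantize back to working precision
}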
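The loss_scale property and the fc_mixed_training_nan_sgd golden model exercise the usual loss-scaling recipe: gradients computed from a loss multiplied by loss_scale are divided by the same factor before the update, and the step is skipped when a non-finite value appears (the condition Tensor::isValid() is meant to detect). A minimal sketch of that check, assuming a helper that unscales in place and reports whether the step is safe; this helper is illustrative and not part of the patch.

#include <cmath>
#include <vector>

// Divide gradients by the loss scale; return false if any value is NaN or Inf,
// in which case the caller should skip the update and lower the scale.
bool unscale_and_check(std::vector<float> &grad, float loss_scale) {
  for (float &g : grad) {
    g /= loss_scale;
    if (!std::isfinite(g))
      return false;
  }
  return true;
}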
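The relaxed verify() in models_test_utils.cpp falls back to a mean-squared-error comparison when exact equality fails, with a looser bound for FP16 golden data. A self-contained sketch of that comparison, assuming the intended tolerances are 1e-4 for FP32 and 1e-2 for FP16; the function names here are local stand-ins for the test utility mse<>().

#include <cstddef>

float mean_squared_error(const float *a, const float *b, size_t n) {
  float acc = 0.0f;
  for (size_t i = 0; i < n; ++i) {
    float d = a[i] - b[i];
    acc += d * d; // accumulate squared difference
  }
  return n ? acc / n : 0.0f;
}

bool close_enough(const float *actual, const float *golden, size_t n, float tol) {
  return mean_squared_error(actual, golden, n) <= tol; // tol: 1e-4 (FP32) or 1e-2 (FP16)
}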