diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml
deleted file mode 100644
index c61f731855..0000000000
--- a/.github/workflows/pylint.yml
+++ /dev/null
@@ -1,51 +0,0 @@
-# Copyright 2021 The TensorFlow Authors. All Rights Reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-# ==============================================================================
-
-name: PyLint
-on:
- pull_request:
- paths:
- - '**.py'
-
-permissions:
- contents: read
-
-jobs:
- build:
- name: PyLint
- runs-on: ubuntu-latest
- steps:
- - name: Checkout code
- uses: actions/checkout@v4
- - name: Get file changes
- id: get_file_changes
- uses: trilom/file-changes-action@v1.2.4
- with:
- output: ' '
- - name: Report list of changed files
- run: |
- echo Changed files: ${{ steps.get_file_changes.outputs.files }}
- - name: Set up Python 3.10
- uses: actions/setup-python@v5
- with:
- python-version: "3.10"
- - name: Install Python dependencies
- run: |
- python -m pip install --upgrade pip
- pip install pylint==3.0.2 numpy wheel
- pip install -r ci/requirements.txt
- - name: Run PyLint on changed files
- run: |
- echo "${{ steps.get_file_changes.outputs.files}}" | tr " " "\n" | grep ".py$" | xargs pylint --rcfile=ci/pylintrc
diff --git a/.gitignore b/.gitignore
index 06c3c0710a..96b03f7bf9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,7 @@
/build
/builddir
.cache/
+.idea/
# jni build files
iniparser/
diff --git a/Applications/Android/PicoGPTJNI/.gitignore b/Applications/Android/PicoGPTJNI/.gitignore
new file mode 100644
index 0000000000..54ed6ea235
--- /dev/null
+++ b/Applications/Android/PicoGPTJNI/.gitignore
@@ -0,0 +1,19 @@
+*.iml
+.gradle
+/.vscode
+/.idea
+/local.properties
+/.idea/caches
+/.idea/libraries
+/.idea/modules.xml
+/.idea/workspace.xml
+/.idea/navEditor.xml
+/.idea/assetWizardSettings.xml
+.DS_Store
+/build
+/captures
+.externalNativeBuild
+.cxx
+local.properties
+/app/src/main/jniLibs
+/app/src/main/obj
diff --git a/Applications/Android/PicoGPTJNI/.gradle/7.5/checksums/checksums.lock b/Applications/Android/PicoGPTJNI/.gradle/7.5/checksums/checksums.lock
deleted file mode 100644
index 62d1fcfe2b..0000000000
Binary files a/Applications/Android/PicoGPTJNI/.gradle/7.5/checksums/checksums.lock and /dev/null differ
diff --git a/Applications/Android/PicoGPTJNI/.gradle/7.5/dependencies-accessors/dependencies-accessors.lock b/Applications/Android/PicoGPTJNI/.gradle/7.5/dependencies-accessors/dependencies-accessors.lock
deleted file mode 100644
index 4f1595be70..0000000000
Binary files a/Applications/Android/PicoGPTJNI/.gradle/7.5/dependencies-accessors/dependencies-accessors.lock and /dev/null differ
diff --git a/Applications/Android/PicoGPTJNI/.gradle/7.5/dependencies-accessors/gc.properties b/Applications/Android/PicoGPTJNI/.gradle/7.5/dependencies-accessors/gc.properties
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/Applications/Android/PicoGPTJNI/.gradle/7.5/executionHistory/executionHistory.lock b/Applications/Android/PicoGPTJNI/.gradle/7.5/executionHistory/executionHistory.lock
deleted file mode 100644
index 506dd636a9..0000000000
Binary files a/Applications/Android/PicoGPTJNI/.gradle/7.5/executionHistory/executionHistory.lock and /dev/null differ
diff --git a/Applications/Android/PicoGPTJNI/.gradle/7.5/fileHashes/fileHashes.lock b/Applications/Android/PicoGPTJNI/.gradle/7.5/fileHashes/fileHashes.lock
deleted file mode 100644
index 096927b1af..0000000000
Binary files a/Applications/Android/PicoGPTJNI/.gradle/7.5/fileHashes/fileHashes.lock and /dev/null differ
diff --git a/Applications/Android/PicoGPTJNI/.gradle/7.5/gc.properties b/Applications/Android/PicoGPTJNI/.gradle/7.5/gc.properties
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/Applications/Android/PicoGPTJNI/.gradle/buildOutputCleanup/buildOutputCleanup.lock b/Applications/Android/PicoGPTJNI/.gradle/buildOutputCleanup/buildOutputCleanup.lock
deleted file mode 100644
index 2ab7eb0273..0000000000
Binary files a/Applications/Android/PicoGPTJNI/.gradle/buildOutputCleanup/buildOutputCleanup.lock and /dev/null differ
diff --git a/Applications/Android/PicoGPTJNI/.gradle/buildOutputCleanup/cache.properties b/Applications/Android/PicoGPTJNI/.gradle/buildOutputCleanup/cache.properties
deleted file mode 100644
index f11a0f4e85..0000000000
--- a/Applications/Android/PicoGPTJNI/.gradle/buildOutputCleanup/cache.properties
+++ /dev/null
@@ -1,2 +0,0 @@
-#Tue Feb 14 16:37:06 KST 2023
-gradle.version=7.5
diff --git a/Applications/Android/PicoGPTJNI/.gradle/checksums/checksums.lock b/Applications/Android/PicoGPTJNI/.gradle/checksums/checksums.lock
deleted file mode 100644
index 287309dd96..0000000000
Binary files a/Applications/Android/PicoGPTJNI/.gradle/checksums/checksums.lock and /dev/null differ
diff --git a/Applications/Android/PicoGPTJNI/.gradle/file-system.probe b/Applications/Android/PicoGPTJNI/.gradle/file-system.probe
deleted file mode 100644
index 71fa644c1c..0000000000
Binary files a/Applications/Android/PicoGPTJNI/.gradle/file-system.probe and /dev/null differ
diff --git a/Applications/Android/PicoGPTJNI/.gradle/vcs-1/gc.properties b/Applications/Android/PicoGPTJNI/.gradle/vcs-1/gc.properties
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/Applications/Android/PicoGPTJNI/.idea/compiler.xml b/Applications/Android/PicoGPTJNI/.idea/compiler.xml
deleted file mode 100644
index 5421743a9c..0000000000
--- a/Applications/Android/PicoGPTJNI/.idea/compiler.xml
+++ /dev/null
@@ -1,6 +0,0 @@
-  [XML body of the deleted .idea/compiler.xml is not preserved in this excerpt]
diff --git a/Applications/Android/PicoGPTJNI/.idea/gradle.xml b/Applications/Android/PicoGPTJNI/.idea/gradle.xml
deleted file mode 100644
index b795db1fe1..0000000000
--- a/Applications/Android/PicoGPTJNI/.idea/gradle.xml
+++ /dev/null
@@ -1,20 +0,0 @@
-  [XML body of the deleted .idea/gradle.xml is not preserved in this excerpt]
diff --git a/Applications/Android/PicoGPTJNI/.idea/misc.xml b/Applications/Android/PicoGPTJNI/.idea/misc.xml
deleted file mode 100644
index 0f31685c15..0000000000
--- a/Applications/Android/PicoGPTJNI/.idea/misc.xml
+++ /dev/null
@@ -1,12 +0,0 @@
-  [XML body of the deleted .idea/misc.xml is not preserved in this excerpt]
diff --git a/Applications/Android/PicoGPTJNI/.idea/vcs.xml b/Applications/Android/PicoGPTJNI/.idea/vcs.xml
deleted file mode 100644
index c2365ab11f..0000000000
--- a/Applications/Android/PicoGPTJNI/.idea/vcs.xml
+++ /dev/null
@@ -1,6 +0,0 @@
-  [XML body of the deleted .idea/vcs.xml is not preserved in this excerpt]
diff --git a/Applications/Android/PicoGPTJNI/.idea/workspace.xml b/Applications/Android/PicoGPTJNI/.idea/workspace.xml
deleted file mode 100644
index 039da86b98..0000000000
--- a/Applications/Android/PicoGPTJNI/.idea/workspace.xml
+++ /dev/null
@@ -1,147 +0,0 @@
-  [XML body of the deleted .idea/workspace.xml is not preserved in this excerpt; only two changelist timestamps (1676357527812) remain]
diff --git a/Applications/Android/ResnetJNI/.gitignore b/Applications/Android/ResnetJNI/.gitignore
new file mode 100644
index 0000000000..54ed6ea235
--- /dev/null
+++ b/Applications/Android/ResnetJNI/.gitignore
@@ -0,0 +1,19 @@
+*.iml
+.gradle
+/.vscode
+/.idea
+/local.properties
+/.idea/caches
+/.idea/libraries
+/.idea/modules.xml
+/.idea/workspace.xml
+/.idea/navEditor.xml
+/.idea/assetWizardSettings.xml
+.DS_Store
+/build
+/captures
+.externalNativeBuild
+.cxx
+local.properties
+/app/src/main/jniLibs
+/app/src/main/obj
diff --git a/Applications/Android/ResnetJNI/.gradle/7.5/checksums/checksums.lock b/Applications/Android/ResnetJNI/.gradle/7.5/checksums/checksums.lock
deleted file mode 100644
index dcf4a0cfa3..0000000000
Binary files a/Applications/Android/ResnetJNI/.gradle/7.5/checksums/checksums.lock and /dev/null differ
diff --git a/Applications/Android/ResnetJNI/.gradle/7.5/dependencies-accessors/dependencies-accessors.lock b/Applications/Android/ResnetJNI/.gradle/7.5/dependencies-accessors/dependencies-accessors.lock
deleted file mode 100644
index 4f1595be70..0000000000
Binary files a/Applications/Android/ResnetJNI/.gradle/7.5/dependencies-accessors/dependencies-accessors.lock and /dev/null differ
diff --git a/Applications/Android/ResnetJNI/.gradle/7.5/dependencies-accessors/gc.properties b/Applications/Android/ResnetJNI/.gradle/7.5/dependencies-accessors/gc.properties
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/Applications/Android/ResnetJNI/.gradle/7.5/executionHistory/executionHistory.lock b/Applications/Android/ResnetJNI/.gradle/7.5/executionHistory/executionHistory.lock
deleted file mode 100644
index 41a6551295..0000000000
Binary files a/Applications/Android/ResnetJNI/.gradle/7.5/executionHistory/executionHistory.lock and /dev/null differ
diff --git a/Applications/Android/ResnetJNI/.gradle/7.5/fileHashes/fileHashes.lock b/Applications/Android/ResnetJNI/.gradle/7.5/fileHashes/fileHashes.lock
deleted file mode 100644
index 6c027ab889..0000000000
Binary files a/Applications/Android/ResnetJNI/.gradle/7.5/fileHashes/fileHashes.lock and /dev/null differ
diff --git a/Applications/Android/ResnetJNI/.gradle/7.5/gc.properties b/Applications/Android/ResnetJNI/.gradle/7.5/gc.properties
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/Applications/Android/ResnetJNI/.gradle/buildOutputCleanup/buildOutputCleanup.lock b/Applications/Android/ResnetJNI/.gradle/buildOutputCleanup/buildOutputCleanup.lock
deleted file mode 100644
index 737b37946f..0000000000
Binary files a/Applications/Android/ResnetJNI/.gradle/buildOutputCleanup/buildOutputCleanup.lock and /dev/null differ
diff --git a/Applications/Android/ResnetJNI/.gradle/buildOutputCleanup/cache.properties b/Applications/Android/ResnetJNI/.gradle/buildOutputCleanup/cache.properties
deleted file mode 100644
index f11a0f4e85..0000000000
--- a/Applications/Android/ResnetJNI/.gradle/buildOutputCleanup/cache.properties
+++ /dev/null
@@ -1,2 +0,0 @@
-#Tue Feb 14 16:37:06 KST 2023
-gradle.version=7.5
diff --git a/Applications/Android/ResnetJNI/.gradle/checksums/checksums.lock b/Applications/Android/ResnetJNI/.gradle/checksums/checksums.lock
deleted file mode 100644
index 287309dd96..0000000000
Binary files a/Applications/Android/ResnetJNI/.gradle/checksums/checksums.lock and /dev/null differ
diff --git a/Applications/Android/ResnetJNI/.gradle/file-system.probe b/Applications/Android/ResnetJNI/.gradle/file-system.probe
deleted file mode 100644
index d43b228f76..0000000000
Binary files a/Applications/Android/ResnetJNI/.gradle/file-system.probe and /dev/null differ
diff --git a/Applications/Android/ResnetJNI/.gradle/vcs-1/gc.properties b/Applications/Android/ResnetJNI/.gradle/vcs-1/gc.properties
deleted file mode 100644
index e69de29bb2..0000000000
diff --git a/Applications/Android/ResnetJNI/.idea/compiler.xml b/Applications/Android/ResnetJNI/.idea/compiler.xml
deleted file mode 100644
index 5421743a9c..0000000000
--- a/Applications/Android/ResnetJNI/.idea/compiler.xml
+++ /dev/null
@@ -1,6 +0,0 @@
-  [XML body of the deleted .idea/compiler.xml is not preserved in this excerpt]
diff --git a/Applications/Android/ResnetJNI/.idea/gradle.xml b/Applications/Android/ResnetJNI/.idea/gradle.xml
deleted file mode 100644
index b795db1fe1..0000000000
--- a/Applications/Android/ResnetJNI/.idea/gradle.xml
+++ /dev/null
@@ -1,20 +0,0 @@
-  [XML body of the deleted .idea/gradle.xml is not preserved in this excerpt]
diff --git a/Applications/Android/ResnetJNI/.idea/misc.xml b/Applications/Android/ResnetJNI/.idea/misc.xml
deleted file mode 100644
index cd890b0b4e..0000000000
--- a/Applications/Android/ResnetJNI/.idea/misc.xml
+++ /dev/null
@@ -1,5 +0,0 @@
-  [XML body of the deleted .idea/misc.xml is not preserved in this excerpt]
diff --git a/Applications/Android/ResnetJNI/.idea/workspace.xml b/Applications/Android/ResnetJNI/.idea/workspace.xml
deleted file mode 100644
index 03a1e61514..0000000000
--- a/Applications/Android/ResnetJNI/.idea/workspace.xml
+++ /dev/null
@@ -1,137 +0,0 @@
-  [XML body of the deleted .idea/workspace.xml is not preserved in this excerpt; only two changelist timestamps (1676357527812) remain]
diff --git a/Applications/KNN/jni/meson.build b/Applications/KNN/jni/meson.build
index bc50dc0214..58ca099d75 100644
--- a/Applications/KNN/jni/meson.build
+++ b/Applications/KNN/jni/meson.build
@@ -15,4 +15,4 @@ e = executable('knn_sample',
install_dir: application_install_dir
)
-test('app_knn', e, args: [nntr_app_resdir / 'KNN'])
+test('app_knn', e, args: [nntr_app_resdir / 'KNN/'])
diff --git a/Applications/LLaMA/jni/main.cpp b/Applications/LLaMA/jni/main.cpp
index 96be8671dc..985d82a79e 100644
--- a/Applications/LLaMA/jni/main.cpp
+++ b/Applications/LLaMA/jni/main.cpp
@@ -56,7 +56,7 @@ int const NUM_VOCAB = 96000;
int MAX_SEQ_LEN = 1024;
int NUM_TO_GENERATE = 100;
-constexpr unsigned int INIT_SEQ_LEN = 30;
+constexpr unsigned int INIT_SEQ_LEN = 28;
unsigned int batch_size = 1;
unsigned int epoch = 1;
@@ -596,7 +596,7 @@ void run(std::string text, bool apply_temperature) {
float init_input[INIT_SEQ_LEN] = {
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40,
50, 60, 70, 80, 90, 100, 200, 300, 400, 500, 600, 700, 800, 900};
- ((uint *)(input_sample))[0] = init_input[0];
+ memcpy(input_sample, init_input, sizeof(float) * INIT_SEQ_LEN);
input.push_back(input_sample);
init_len = 18;
#endif
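Reviewer note on the hunk above: the previous code cast `input_sample` to `uint *` and wrote only the first prompt token, leaving the remaining positions uninitialized, while the new `memcpy` copies the entire hard-coded prompt; `INIT_SEQ_LEN` also drops from 30 to 28 to match the 28 values in the initializer. A rough NumPy sketch of the difference (the standalone setting and variable reuse are illustrative, not part of the C++ file):

```python
import numpy as np

INIT_SEQ_LEN = 28  # matches the updated constexpr

# the 28 hard-coded prompt values from the hunk above
init_input = np.array(
    [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40,
     50, 60, 70, 80, 90, 100, 200, 300, 400, 500, 600, 700, 800, 900],
    dtype=np.float32,
)

input_sample = np.empty(INIT_SEQ_LEN, dtype=np.float32)

# old behaviour: only position 0 was written; positions 1..27 kept stale memory
input_sample[0] = init_input[0]

# new behaviour: the whole prompt is copied, equivalent to
# memcpy(input_sample, init_input, sizeof(float) * INIT_SEQ_LEN)
input_sample[:] = init_input
```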
diff --git a/Applications/Resnet/README.md b/Applications/Resnet/README.md
index f76d5b25de..f195a8c764 100644
--- a/Applications/Resnet/README.md
+++ b/Applications/Resnet/README.md
@@ -14,7 +14,7 @@ Please file an issue if you have a problem running the example.
```bash
$ meson ${build_dir} -Denable-test=true -Denable-long-test=true
-$ meson test app_resnet18 -v -c ${build_dir}
+$ meson test app_resnet18 -v -C ${build_dir}
```
### To run with a real data.
diff --git a/Applications/YOLO/PyTorch/main.py b/Applications/YOLO/PyTorch/main.py
deleted file mode 100644
index b831e1ebb1..0000000000
--- a/Applications/YOLO/PyTorch/main.py
+++ /dev/null
@@ -1,171 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# Copyright (C) 2023 Seungbaek Hong
-#
-# @file main.py
-# @date 8 March 2023
-# @brief Implement training for yolo
-#
-# @author Seungbaek Hong
-
-import numpy as np
-import torch
-import torch.nn as nn
-import torch.optim as optim
-import torch.nn.functional as F
-from torch.utils.data import DataLoader
-
-from yolo import YoloV2
-from yolo_loss import YoloV2_LOSS
-from dataset import YOLODataset, collate_db
-
-import sys
-import os
-
-# get pyutils path using relative path
-def get_util_path():
- current_path = os.path.abspath(os.path.dirname(__file__))
- parent_path = os.path.abspath(os.path.dirname(current_path))
- target_path = os.path.abspath(os.path.dirname(parent_path))
- return os.path.dirname(target_path) + '/tools/pyutils/'
-
-# add pyutils path to sys.path
-sys.path.append(get_util_path())
-from torchconverter import save_bin
-
-# set config
-out_size = 13
-num_classes = 4
-num_anchors = 5
-
-epochs = 3
-batch_size = 4
-
-train_img_dir = '/home/user/TRAIN_DIR/images/*'
-train_ann_dir = '/home/user/TRAIN_DIR/annotations/*'
-valid_img_dir = '/home/user/VALID_DIR/images/*'
-valid_ann_dir = '/home/user/VALID_DIR/annotations/*'
-
-# load data
-train_dataset = YOLODataset(train_img_dir, train_ann_dir)
-train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_db, shuffle=True, drop_last=True)
-valid_dataset = YOLODataset(valid_img_dir, valid_ann_dir)
-valid_loader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_db, shuffle=False, drop_last=True)
-
-# set model, loss and optimizer
-model = YoloV2(num_classes=num_classes)
-criterion = YoloV2_LOSS(num_classes=num_classes)
-optimizer = optim.Adam(model.parameters(), lr=1e-3)
-# scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10, eta_min=0)
-
-# save init model
-save_bin(model, 'init_model')
-torch.save(model.state_dict(), './init_model.pt')
-
-# train model
-best_loss = 1e+10
-for epoch in range(epochs):
- epoch_train_loss = 0
- epoch_valid_loss = 0
- for idx, (img, bbox, cls) in enumerate(train_loader):
- model.train()
- optimizer.zero_grad()
- # model prediction
- hypothesis = model(img).permute((0, 2, 3, 1))
- hypothesis = hypothesis.reshape((batch_size, out_size**2, num_anchors, 5+num_classes))
- # split each prediction(bbox, iou, class prob)
- bbox_pred_xy = torch.sigmoid(hypothesis[..., :2])
- bbox_pred_wh = torch.exp(hypothesis[..., 2:4])
- bbox_pred = torch.cat((bbox_pred_xy, bbox_pred_wh), 3)
- iou_pred = torch.sigmoid(hypothesis[..., 4:5])
- score_pred = hypothesis[..., 5:].contiguous()
- prob_pred = torch.softmax(score_pred.view(-1, num_classes), dim=1).view(score_pred.shape)
- # calc loss
- loss = criterion(torch.FloatTensor(bbox_pred),
- torch.FloatTensor(iou_pred),
- torch.FloatTensor(prob_pred),
- bbox,
- cls)
- # back prop
- loss.backward()
- optimizer.step()
- # scheduler.step()
- epoch_train_loss += loss.item()
-
- for idx, (img, bbox, cls) in enumerate(valid_loader):
- model.eval()
- with torch.no_grad():
- # model prediction
- hypothesis = model(img).permute((0, 2, 3, 1))
- hypothesis = hypothesis.reshape((hypothesis.shape[0], out_size**2, num_anchors, 5+num_classes))
- # split each prediction(bbox, iou, class prob)
- bbox_pred_xy = torch.sigmoid(hypothesis[..., :2])
- bbox_pred_wh = torch.exp(hypothesis[..., 2:4])
- bbox_pred = torch.cat((bbox_pred_xy, bbox_pred_wh), 3)
- iou_pred = torch.sigmoid(hypothesis[..., 4:5])
- score_pred = hypothesis[..., 5:].contiguous()
- prob_pred = torch.softmax(score_pred.view(-1, num_classes), dim=1).view(score_pred.shape)
- # calc loss
- loss = criterion(torch.FloatTensor(bbox_pred),
- torch.FloatTensor(iou_pred),
- torch.FloatTensor(prob_pred),
- bbox,
- cls)
- epoch_valid_loss += loss.item()
-
- if epoch_valid_loss < best_loss:
- best_loss = epoch_valid_loss
- torch.save(model.state_dict(), './best_model.pt')
- save_bin(model, 'best_model')
-
- print("{}epoch, train loss: {:.4f}, valid loss: {:.4f}".format(
- epoch, epoch_train_loss / len(train_loader), epoch_valid_loss / len(valid_loader)))
-
-##
-# @brief bbox post process function for inference
-def post_process_for_bbox(bbox_pred):
- """
- @param bbox_pred shape(batch_size, cell_h x cell_w, num_anchors, 4)
- @return bbox_pred shape(batch_size, cell_h x cell_w, num_anchors, 4)
- """
- anchors = torch.FloatTensor(
- [(1.3221, 1.73145),
- (3.19275, 4.00944),
- (5.05587, 8.09892),
- (9.47112, 4.84053),
- (11.2364, 10.0071)]
- )
-
- outsize = (13, 13)
- width, height = outsize
-
- # restore cell pos to x, y
- for w in range(width):
- for h in range(height):
- bbox_pred[:, height*h + w, :, 0] += w
- bbox_pred[:, height*h + w, :, 1] += h
- bbox_pred[:, :, :, :2] /= 13
-
- # apply anchors to w, h
- anchor_w = anchors[:, 0].contiguous().view(-1, 1)
- anchor_h = anchors[:, 1].contiguous().view(-1, 1)
- bbox_pred[:, :, :, 2:3] *= anchor_w
- bbox_pred[:, :, :, 3:4] *= anchor_h
-
- return bbox_pred
-
-# inference example using trained model
-hypothesis = model(img).permute((0, 2, 3, 1))
-hypothesis = hypothesis[0].reshape((1, out_size**2, num_anchors, 5+num_classes))
-
-# transform output
-bbox_pred_xy = torch.sigmoid(hypothesis[..., :2])
-bbox_pred_wh = torch.exp(hypothesis[..., 2:4])
-bbox_pred = torch.cat((bbox_pred_xy, bbox_pred_wh), 3)
-bbox_pred = post_process_for_bbox(bbox_pred)
-iou_pred = torch.sigmoid(hypothesis[..., 4:5])
-score_pred = hypothesis[..., 5:].contiguous()
-prob_pred = torch.softmax(score_pred.view(-1, num_classes), dim=1).view(score_pred.shape)
-
-# result of inference (data range 0~1)
-iou_mask = (iou_pred > 0.5)
-print(bbox_pred * iou_mask, iou_pred * iou_mask, prob_pred * iou_mask)
diff --git a/Applications/YOLO/PyTorch/yolo.py b/Applications/YOLO/PyTorch/yolo.py
deleted file mode 100644
index 53763f1be7..0000000000
--- a/Applications/YOLO/PyTorch/yolo.py
+++ /dev/null
@@ -1,105 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-# Copyright (C) 2023 Seungbaek Hong
-#
-# @file yolo.py
-# @date 8 March 2023
-# @brief Define simple yolo model, but not original darknet.
-#
-# @author Seungbaek Hong
-
-import torch
-import torch.nn as nn
-
-##
-# @brief define yolo model (except for re-organization module)
-class YoloV2(nn.Module):
- def __init__(self, num_classes, num_anchors=5):
-
- super(YoloV2, self).__init__()
- self.num_classes = num_classes
- self.num_anchors = num_anchors
- self.conv1 = nn.Sequential(nn.Conv2d(3, 32, 3, 1, 1), nn.BatchNorm2d(32, eps=1e-3),
- nn.LeakyReLU(), nn.MaxPool2d(2, 2))
- self.conv2 = nn.Sequential(nn.Conv2d(32, 64, 3, 1, 1), nn.BatchNorm2d(64, eps=1e-3),
- nn.LeakyReLU(), nn.MaxPool2d(2, 2))
- self.conv3 = nn.Sequential(nn.Conv2d(64, 128, 3, 1, 1), nn.BatchNorm2d(128, eps=1e-3),
- nn.LeakyReLU())
- self.conv4 = nn.Sequential(nn.Conv2d(128, 64, 1, 1, 0), nn.BatchNorm2d(64, eps=1e-3),
- nn.LeakyReLU())
- self.conv5 = nn.Sequential(nn.Conv2d(64, 128, 3, 1, 1), nn.BatchNorm2d(128, eps=1e-3),
- nn.LeakyReLU(), nn.MaxPool2d(2, 2))
- self.conv6 = nn.Sequential(nn.Conv2d(128, 256, 3, 1, 1), nn.BatchNorm2d(256, eps=1e-3),
- nn.LeakyReLU())
- self.conv7 = nn.Sequential(nn.Conv2d(256, 128, 1, 1, 0), nn.BatchNorm2d(128, eps=1e-3),
- nn.LeakyReLU())
- self.conv8 = nn.Sequential(nn.Conv2d(128, 256, 3, 1, 1), nn.BatchNorm2d(256, eps=1e-3),
- nn.LeakyReLU(), nn.MaxPool2d(2, 2))
- self.conv9 = nn.Sequential(nn.Conv2d(256, 512, 3, 1, 1), nn.BatchNorm2d(512, eps=1e-3),
- nn.LeakyReLU())
- self.conv10 = nn.Sequential(nn.Conv2d(512, 256, 1, 1, 0), nn.BatchNorm2d(256, eps=1e-3),
- nn.LeakyReLU())
- self.conv11 = nn.Sequential(nn.Conv2d(256, 512, 3, 1, 1), nn.BatchNorm2d(512, eps=1e-3),
- nn.LeakyReLU())
- self.conv12 = nn.Sequential(nn.Conv2d(512, 256, 1, 1, 0), nn.BatchNorm2d(256, eps=1e-3),
- nn.LeakyReLU())
- self.conv13 = nn.Sequential(nn.Conv2d(256, 512, 3, 1, 1), nn.BatchNorm2d(512, eps=1e-3),
- nn.LeakyReLU())
-
- self.conv_b = nn.Sequential(nn.Conv2d(512, 64, 1, 1, 0), nn.BatchNorm2d(64, eps=1e-3),
- nn.LeakyReLU())
-
- self.maxpool_a = nn.MaxPool2d(2, 2)
- self.conv_a1 = nn.Sequential(nn.Conv2d(512, 1024, 3, 1, 1), nn.BatchNorm2d(1024, eps=1e-3),
- nn.LeakyReLU())
- self.conv_a2 = nn.Sequential(nn.Conv2d(1024, 512, 1, 1, 0), nn.BatchNorm2d(512, eps=1e-3),
- nn.LeakyReLU())
- self.conv_a3 = nn.Sequential(nn.Conv2d(512, 1024, 3, 1, 1), nn.BatchNorm2d(1024, eps=1e-3),
- nn.LeakyReLU())
- self.conv_a4 = nn.Sequential(nn.Conv2d(1024, 512, 1, 1, 0), nn.BatchNorm2d(512, eps=1e-3),
- nn.LeakyReLU())
- self.conv_a5 = nn.Sequential(nn.Conv2d(512, 1024, 3, 1, 1), nn.BatchNorm2d(1024, eps=1e-3),
- nn.LeakyReLU())
- self.conv_a6 = nn.Sequential(nn.Conv2d(1024, 1024, 3, 1, 1), nn.BatchNorm2d(1024, eps=1e-3),
- nn.LeakyReLU())
- self.conv_a7 = nn.Sequential(nn.Conv2d(1024, 1024, 3, 1, 1), nn.BatchNorm2d(1024, eps=1e-3),
- nn.LeakyReLU())
-
- self.conv_out1 = nn.Sequential(nn.Conv2d(1280, 1024, 3, 1, 1), nn.BatchNorm2d(1024, eps=1e-3),
- nn.LeakyReLU())
-
- self.conv_out2 = nn.Conv2d(1024, self.num_anchors * (5 + num_classes), 1, 1, 0)
-
- def forward(self, input):
- output = self.conv1(input)
- output = self.conv2(output)
- output = self.conv3(output)
- output = self.conv4(output)
- output = self.conv5(output)
- output = self.conv6(output)
- output = self.conv7(output)
- output = self.conv8(output)
- output = self.conv9(output)
- output = self.conv10(output)
- output = self.conv11(output)
- output = self.conv12(output)
- output = self.conv13(output)
-
- output_a = self.maxpool_a(output)
- output_a = self.conv_a1(output_a)
- output_a = self.conv_a2(output_a)
- output_a = self.conv_a3(output_a)
- output_a = self.conv_a4(output_a)
- output_a = self.conv_a5(output_a)
- output_a = self.conv_a6(output_a)
- output_a = self.conv_a7(output_a)
-
- output_b = self.conv_b(output)
- b, c, h, w = output_b.size()
- output_b = output_b.view(b, int(c / 4), h, 2, w, 2).contiguous()
- output_b = output_b.permute(0, 3, 5, 1, 2, 4).contiguous()
- output_b = output_b.view(b, -1, int(h / 2), int(w / 2))
-
- output = torch.cat((output_a, output_b), 1)
- output = self.conv_out1(output)
- output = self.conv_out2(output)
- return output
diff --git a/Applications/YOLO/PyTorch/dataset.py b/Applications/YOLOv2/PyTorch/dataset.py
similarity index 58%
rename from Applications/YOLO/PyTorch/dataset.py
rename to Applications/YOLOv2/PyTorch/dataset.py
index a02971ae87..d939e0f8a9 100644
--- a/Applications/YOLO/PyTorch/dataset.py
+++ b/Applications/YOLOv2/PyTorch/dataset.py
@@ -8,50 +8,68 @@
# @author Seungbaek Hong
import glob
+import re
import numpy as np
import torch
from torch.utils.data import Dataset
from torch.utils.data.dataloader import default_collate
from PIL import Image
+
##
# @brief dataset class for yolo
-# @note Need annotation text files corresponding to the name of the images.
+# @note Need annotation text files corresponding to the name of the images.
class YOLODataset(Dataset):
def __init__(self, img_dir, ann_dir):
super().__init__()
- img_list = glob.glob(img_dir)
- ann_list = glob.glob(ann_dir)
- img_list.sort(), ann_list.sort()
+ self.img_dir = img_dir
+ pattern = re.compile("\/(\d+)\.")
+ img_list = glob.glob(img_dir + "*")
+ ann_list = glob.glob(ann_dir + "*")
+
+ img_ids = list(map(lambda x: pattern.search(x).group(1), img_list))
+ ann_ids = list(map(lambda x: pattern.search(x).group(1), ann_list))
+ ids_list = list(set(img_ids) & set(ann_ids))
- self.length = len(img_list)
- self.input_images = []
+ self.ids_list = []
self.bbox_gt = []
self.cls_gt = []
- for i in range(len(img_list)):
- img = np.array(Image.open(img_list[i]).resize((416, 416))) / 255
+ for ids in ids_list:
label_bbox = []
label_cls = []
- with open(ann_list[i], 'rt') as f:
+ with open(ann_dir + ids + ".txt", "rt", encoding="utf-8") as f:
for line in f.readlines():
line = [float(i) for i in line.split()]
label_bbox.append(np.array(line[1:], dtype=np.float32) / 416)
label_cls.append(int(line[0]))
- self.input_images.append(img)
+ if len(label_cls) == 0:
+ continue
+
+ self.ids_list.append(ids)
self.bbox_gt.append(label_bbox)
self.cls_gt.append(label_cls)
- self.input_images = np.array(self.input_images)
- self.input_images = torch.FloatTensor(self.input_images).permute((0, 3, 1, 2))
+ self.length = len(self.ids_list)
def __len__(self):
return self.length
-
+
def __getitem__(self, idx):
- return self.input_images[idx], self.bbox_gt[idx], self.cls_gt[idx]
-
+ img = (
+ torch.FloatTensor(
+ np.array(
+ Image.open(self.img_dir + self.ids_list[idx] + ".jpg").resize(
+ (416, 416)
+ )
+ )
+ ).permute((2, 0, 1))
+ / 255
+ )
+ return img, self.bbox_gt[idx], self.cls_gt[idx]
+
+
##
# @brief collate db function for yolo
def collate_db(batch):
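Reviewer note on the rewritten `YOLODataset`: images are no longer loaded eagerly and paired with annotations by sorted order; instead, image and annotation files are matched by the numeric id a regex extracts from each path, samples with no labels are skipped, and each image is decoded lazily in `__getitem__`. A small sketch of the id-pairing step with hypothetical paths (a raw string such as `r"/(\d+)\."` would also avoid the invalid-escape warning the quoted pattern can raise on newer Python):

```python
import re

# hypothetical listings; the real code builds them with glob.glob(img_dir + "*")
img_list = ["/data/images/7.jpg", "/data/images/9.jpg", "/data/images/12.jpg"]
ann_list = ["/data/annotations/7.txt", "/data/annotations/12.txt"]

pattern = re.compile(r"/(\d+)\.")   # raw-string form of the pattern in the diff

img_ids = {pattern.search(p).group(1) for p in img_list}
ann_ids = {pattern.search(p).group(1) for p in ann_list}

# only ids that have both an image and an annotation file survive
ids_list = sorted(img_ids & ann_ids)
print(ids_list)   # ['12', '7'] -- image 9 is dropped because it has no label file
```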
diff --git a/Applications/YOLOv2/PyTorch/main.py b/Applications/YOLOv2/PyTorch/main.py
new file mode 100644
index 0000000000..6e42fa1c6b
--- /dev/null
+++ b/Applications/YOLOv2/PyTorch/main.py
@@ -0,0 +1,222 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (C) 2023 Seungbaek Hong
+#
+# @file main.py
+# @date 8 March 2023
+# @brief Implement training for yolo
+#
+# @author Seungbaek Hong
+
+import sys
+import os
+
+from PIL import Image, ImageDraw
+from matplotlib import pyplot as plt
+from torch import optim
+from torch.utils.data import DataLoader
+import torch
+import numpy as np
+
+from yolo import YoloV2
+from yolo_loss import YoloV2_LOSS
+from dataset import YOLODataset, collate_db
+from torchconverter import save_bin
+
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
+
+# get pyutils path using relative path
+def get_util_path():
+ current_path = os.path.abspath(os.path.dirname(__file__))
+ parent_path = os.path.abspath(os.path.dirname(current_path))
+ target_path = os.path.abspath(os.path.dirname(parent_path))
+ return os.path.dirname(target_path) + "/tools/pyutils/"
+
+
+# add pyutils path to sys.path
+sys.path.append(get_util_path())
+
+# set config
+out_size = 13
+num_classes = 4
+num_anchors = 5
+
+epochs = 3
+batch_size = 4
+
+train_img_dir = "/home/user/TRAIN_DIR/images/"
+train_ann_dir = "/home/user/TRAIN_DIR/annotations/"
+valid_img_dir = "/home/user/VALID_DIR/images/"
+valid_ann_dir = "/home/user/VALID_DIR/annotations/"
+
+# load data
+train_dataset = YOLODataset(train_img_dir, train_ann_dir)
+train_loader = DataLoader(
+ train_dataset,
+ batch_size=batch_size,
+ collate_fn=collate_db,
+ shuffle=True,
+ drop_last=True,
+)
+valid_dataset = YOLODataset(valid_img_dir, valid_ann_dir)
+valid_loader = DataLoader(
+ valid_dataset,
+ batch_size=batch_size,
+ collate_fn=collate_db,
+ shuffle=False,
+ drop_last=True,
+)
+
+# set model, loss and optimizer
+model = YoloV2(num_classes=num_classes).to(device)
+criterion = YoloV2_LOSS(
+ num_classes=num_classes, img_shape=(416, 416), device=device
+).to(device)
+optimizer = optim.Adam(model.parameters(), lr=1e-5)
+scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10, eta_min=0)
+
+# save init model
+save_bin(model, "init_model")
+torch.save(model.state_dict(), "./init_model.pt")
+
+# train model
+best_loss = 1e10
+for epoch in range(epochs):
+ epoch_train_loss = 0
+ epoch_valid_loss = 0
+ model.train()
+ for idx, (img, bbox, cls) in enumerate(train_loader):
+ optimizer.zero_grad()
+ # model prediction
+ hypothesis = model(img.to(device)).permute((0, 2, 3, 1))
+ hypothesis = hypothesis.reshape(
+ (batch_size, out_size**2, num_anchors, 5 + num_classes)
+ )
+ # split each prediction(bbox, iou, class prob)
+ bbox_pred_xy = torch.sigmoid(hypothesis[..., :2])
+ bbox_pred_wh = torch.exp(hypothesis[..., 2:4])
+ bbox_pred = torch.cat((bbox_pred_xy, bbox_pred_wh), 3)
+ iou_pred = torch.sigmoid(hypothesis[..., 4:5])
+ score_pred = hypothesis[..., 5:].contiguous()
+ prob_pred = torch.softmax(score_pred.view(-1, num_classes), dim=1).view(
+ score_pred.shape
+ )
+ # calc loss
+ loss = criterion(bbox_pred, iou_pred, prob_pred, bbox, cls)
+ # back prop
+ loss.backward()
+ optimizer.step()
+ scheduler.step()
+ epoch_train_loss += loss.item()
+
+ model.eval()
+ for idx, (img, bbox, cls) in enumerate(valid_loader):
+ with torch.no_grad():
+ # model prediction
+ hypothesis = model(img.to(device)).permute((0, 2, 3, 1))
+ hypothesis = hypothesis.reshape(
+ (hypothesis.shape[0], out_size**2, num_anchors, 5 + num_classes)
+ )
+ # split each prediction(bbox, iou, class prob)
+ bbox_pred_xy = torch.sigmoid(hypothesis[..., :2])
+ bbox_pred_wh = torch.exp(hypothesis[..., 2:4])
+ bbox_pred = torch.cat((bbox_pred_xy, bbox_pred_wh), 3)
+ iou_pred = torch.sigmoid(hypothesis[..., 4:5])
+ score_pred = hypothesis[..., 5:].contiguous()
+ prob_pred = torch.softmax(score_pred.view(-1, num_classes), dim=1).view(
+ score_pred.shape
+ )
+ # calc loss
+ loss = criterion(bbox_pred, iou_pred, prob_pred, bbox, cls)
+ epoch_valid_loss += loss.item()
+
+ if epoch_valid_loss < best_loss:
+ best_loss = epoch_valid_loss
+ torch.save(model.state_dict(), "./best_model.pt")
+ save_bin(model, "best_model")
+
+ print(
+ f"{epoch}epoch, train loss: {epoch_train_loss / len(train_loader):.4f},\
+ valid loss: {epoch_valid_loss / len(valid_loader):.4f}"
+ )
+
+
+##
+# @brief bbox post process function for inference
+def post_process_for_bbox(bbox_p):
+ """
+ @param bbox_p shape(batch_size, cell_h x cell_w, num_anchors, 4)
+ @return bbox_p shape(batch_size, cell_h x cell_w, num_anchors, 4)
+ """
+ anchors = torch.FloatTensor(
+ [
+ (1.3221, 1.73145),
+ (3.19275, 4.00944),
+ (5.05587, 8.09892),
+ (9.47112, 4.84053),
+ (11.2364, 10.0071),
+ ]
+ )
+
+ outsize = (13, 13)
+ width, height = outsize
+
+ # restore cell pos to x, y
+ for w in range(width):
+ for h in range(height):
+ bbox_p[:, height * h + w, :, 0] += w
+ bbox_p[:, height * h + w, :, 1] += h
+ bbox_p[:, :, :, :2] /= 13
+
+ # apply anchors to w, h
+ anchor_w = anchors[:, 0].contiguous().view(-1, 1).to(device)
+ anchor_h = anchors[:, 1].contiguous().view(-1, 1).to(device)
+ bbox_p[:, :, :, 2:3] *= anchor_w
+ bbox_p[:, :, :, 3:4] *= anchor_h
+
+ return bbox_p
+
+
+def visualize_bbox(img_pred, bbox_preds):
+ img_array = (img_pred.to("cpu") * 255).permute((1, 2, 0)).numpy().astype(np.uint8)
+ img = Image.fromarray(img_array)
+
+ for bbox_pred in bbox_preds:
+ bbox_pred = [int(x * 416) for x in bbox_pred]
+
+ if sum(bbox_pred) == 0:
+ continue
+
+ x_lefttop = bbox_pred[0]
+ y_lefttop = bbox_pred[1]
+ width = bbox_pred[2]
+ height = bbox_pred[3]
+
+ draw = ImageDraw.Draw(img)
+ draw.rectangle(
+ [(x_lefttop, y_lefttop), (x_lefttop + width, y_lefttop + height)]
+ )
+
+ plt.imshow(img)
+ plt.show()
+
+
+# inference example using trained model
+hypothesis = model(img.to(device)).permute((0, 2, 3, 1))
+hypothesis = hypothesis[0].reshape((1, out_size**2, num_anchors, 5 + num_classes))
+
+# transform output
+bbox_pred_xy = torch.sigmoid(hypothesis[..., :2])
+bbox_pred_wh = torch.exp(hypothesis[..., 2:4])
+bbox_pred = torch.cat((bbox_pred_xy, bbox_pred_wh), 3)
+bbox_pred = post_process_for_bbox(bbox_pred)
+iou_pred = torch.sigmoid(hypothesis[..., 4:5])
+score_pred = hypothesis[..., 5:].contiguous()
+prob_pred = torch.softmax(score_pred.view(-1, num_classes), dim=1).view(
+ score_pred.shape
+)
+
+# result of inference (data range 0~1)
+iou_mask = iou_pred > 0.5
+bbox_pred = bbox_pred * iou_mask
+visualize_bbox(img, bbox_pred.reshape(-1, 4))
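Reviewer note on the new inference block: instead of printing masked tensors, the script zeroes out every box whose predicted objectness is at or below 0.5 and hands the result to `visualize_bbox`, which skips all-zero rows before drawing. A toy illustration of that masking (the numbers are made up, not taken from a training run):

```python
import torch

# three candidate boxes and their objectness scores
bbox_pred = torch.tensor([[0.10, 0.20, 0.30, 0.40],
                          [0.50, 0.50, 0.20, 0.20],
                          [0.70, 0.10, 0.15, 0.25]])
iou_pred = torch.tensor([[0.80], [0.30], [0.65]])

iou_mask = iou_pred > 0.5      # same threshold as the inference block above
kept = bbox_pred * iou_mask    # low-confidence boxes become all-zero rows
print(kept)
# visualize_bbox() later skips rows whose scaled coordinates sum to zero,
# which is how those zeroed boxes are dropped before drawing
```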
diff --git a/Applications/YOLOv2/PyTorch/yolo.py b/Applications/YOLOv2/PyTorch/yolo.py
new file mode 100644
index 0000000000..390cbd5ada
--- /dev/null
+++ b/Applications/YOLOv2/PyTorch/yolo.py
@@ -0,0 +1,176 @@
+# SPDX-License-Identifier: Apache-2.0
+# Copyright (C) 2023 Seungbaek Hong
+#
+# @file yolo.py
+# @date 8 March 2023
+# @brief Define simple yolo model, but not original darknet.
+#
+# @author Seungbaek Hong
+
+import torch
+from torch import nn
+
+
+##
+# @brief define yolo model (except for re-organization module)
+class YoloV2(nn.Module):
+ def __init__(self, num_classes, num_anchors=5):
+
+ super().__init__()
+ self.num_classes = num_classes
+ self.num_anchors = num_anchors
+ self.conv1 = nn.Sequential(
+ nn.Conv2d(3, 32, 3, 1, 1, bias=False),
+ nn.BatchNorm2d(32),
+ nn.LeakyReLU(0.1),
+ nn.MaxPool2d(2, 2),
+ )
+ self.conv2 = nn.Sequential(
+ nn.Conv2d(32, 64, 3, 1, 1, bias=False),
+ nn.BatchNorm2d(64),
+ nn.LeakyReLU(0.1),
+ nn.MaxPool2d(2, 2),
+ )
+ self.conv3 = nn.Sequential(
+ nn.Conv2d(64, 128, 3, 1, 1, bias=False),
+ nn.BatchNorm2d(128),
+ nn.LeakyReLU(0.1),
+ )
+ self.conv4 = nn.Sequential(
+ nn.Conv2d(128, 64, 1, 1, 0, bias=False),
+ nn.BatchNorm2d(64),
+ nn.LeakyReLU(0.1),
+ )
+ self.conv5 = nn.Sequential(
+ nn.Conv2d(64, 128, 3, 1, 1, bias=False),
+ nn.BatchNorm2d(128),
+ nn.LeakyReLU(0.1),
+ nn.MaxPool2d(2, 2),
+ )
+ self.conv6 = nn.Sequential(
+ nn.Conv2d(128, 256, 3, 1, 1, bias=False),
+ nn.BatchNorm2d(256),
+ nn.LeakyReLU(0.1),
+ )
+ self.conv7 = nn.Sequential(
+ nn.Conv2d(256, 128, 1, 1, 0, bias=False),
+ nn.BatchNorm2d(128),
+ nn.LeakyReLU(0.1),
+ )
+ self.conv8 = nn.Sequential(
+ nn.Conv2d(128, 256, 3, 1, 1, bias=False),
+ nn.BatchNorm2d(256),
+ nn.LeakyReLU(0.1),
+ nn.MaxPool2d(2, 2),
+ )
+ self.conv9 = nn.Sequential(
+ nn.Conv2d(256, 512, 3, 1, 1, bias=False),
+ nn.BatchNorm2d(512),
+ nn.LeakyReLU(0.1),
+ )
+ self.conv10 = nn.Sequential(
+ nn.Conv2d(512, 256, 1, 1, 0, bias=False),
+ nn.BatchNorm2d(256),
+ nn.LeakyReLU(0.1),
+ )
+ self.conv11 = nn.Sequential(
+ nn.Conv2d(256, 512, 3, 1, 1, bias=False),
+ nn.BatchNorm2d(512),
+ nn.LeakyReLU(0.1),
+ )
+ self.conv12 = nn.Sequential(
+ nn.Conv2d(512, 256, 1, 1, 0, bias=False),
+ nn.BatchNorm2d(256),
+ nn.LeakyReLU(0.1),
+ )
+ self.conv13 = nn.Sequential(
+ nn.Conv2d(256, 512, 3, 1, 1, bias=False),
+ nn.BatchNorm2d(512),
+ nn.LeakyReLU(0.1),
+ )
+
+ self.conv_b = nn.Sequential(
+ nn.Conv2d(512, 64, 1, 1, 0, bias=False),
+ nn.BatchNorm2d(64),
+ nn.LeakyReLU(0.1),
+ )
+
+ self.maxpool_a = nn.MaxPool2d(2, 2)
+ self.conv_a1 = nn.Sequential(
+ nn.Conv2d(512, 1024, 3, 1, 1, bias=False),
+ nn.BatchNorm2d(1024),
+ nn.LeakyReLU(0.1),
+ )
+ self.conv_a2 = nn.Sequential(
+ nn.Conv2d(1024, 512, 1, 1, 0, bias=False),
+ nn.BatchNorm2d(512),
+ nn.LeakyReLU(0.1),
+ )
+ self.conv_a3 = nn.Sequential(
+ nn.Conv2d(512, 1024, 3, 1, 1, bias=False),
+ nn.BatchNorm2d(1024),
+ nn.LeakyReLU(0.1),
+ )
+ self.conv_a4 = nn.Sequential(
+ nn.Conv2d(1024, 512, 1, 1, 0, bias=False),
+ nn.BatchNorm2d(512),
+ nn.LeakyReLU(0.1),
+ )
+ self.conv_a5 = nn.Sequential(
+ nn.Conv2d(512, 1024, 3, 1, 1, bias=False),
+ nn.BatchNorm2d(1024),
+ nn.LeakyReLU(0.1),
+ )
+ self.conv_a6 = nn.Sequential(
+ nn.Conv2d(1024, 1024, 3, 1, 1, bias=False),
+ nn.BatchNorm2d(1024),
+ nn.LeakyReLU(0.1),
+ )
+ self.conv_a7 = nn.Sequential(
+ nn.Conv2d(1024, 1024, 3, 1, 1, bias=False),
+ nn.BatchNorm2d(1024),
+ nn.LeakyReLU(0.1),
+ )
+
+ self.conv_out1 = nn.Sequential(
+ nn.Conv2d(1280, 1024, 3, 1, 1, bias=False),
+ nn.BatchNorm2d(1024),
+ nn.LeakyReLU(0.1),
+ )
+
+ self.conv_out2 = nn.Conv2d(1024, self.num_anchors * (5 + num_classes), 1, 1, 0)
+
+ def forward(self, x):
+ output = self.conv1(x)
+ output = self.conv2(output)
+ output = self.conv3(output)
+ output = self.conv4(output)
+ output = self.conv5(output)
+ output = self.conv6(output)
+ output = self.conv7(output)
+ output = self.conv8(output)
+ output = self.conv9(output)
+ output = self.conv10(output)
+ output = self.conv11(output)
+ output = self.conv12(output)
+ output = self.conv13(output)
+
+ output_a = self.maxpool_a(output)
+ output_a = self.conv_a1(output_a)
+ output_a = self.conv_a2(output_a)
+ output_a = self.conv_a3(output_a)
+ output_a = self.conv_a4(output_a)
+ output_a = self.conv_a5(output_a)
+ output_a = self.conv_a6(output_a)
+ output_a = self.conv_a7(output_a)
+
+ output_b = self.conv_b(output)
+ b, c, h, w = output_b.size()
+ output_b = output_b.view(b, int(c / 4), h, 2, w, 2).contiguous()
+ output_b = output_b.permute(0, 3, 5, 1, 2, 4).contiguous()
+ output_b = output_b.view(b, -1, int(h / 2), int(w / 2))
+
+ output = torch.cat((output_a, output_b), 1)
+ output = self.conv_out1(output)
+ output = self.conv_out2(output)
+ return output
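Reviewer note on the rewritten blocks: every convolution that feeds a `BatchNorm2d` now uses `bias=False` (the BN shift parameter makes a conv bias redundant), the custom `eps=1e-3` is dropped in favour of PyTorch's default `1e-5`, and the LeakyReLU slope is pinned to 0.1. A minimal standalone sketch of one such block, using the conv1 sizes from the diff:

```python
import torch
from torch import nn

block = nn.Sequential(
    nn.Conv2d(3, 32, 3, 1, 1, bias=False),  # bias is redundant in front of BatchNorm
    nn.BatchNorm2d(32),                      # default eps=1e-5 (the old code forced 1e-3)
    nn.LeakyReLU(0.1),                       # explicit negative slope
    nn.MaxPool2d(2, 2),
)

x = torch.randn(1, 3, 416, 416)
print(block(x).shape)  # torch.Size([1, 32, 208, 208])
```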
diff --git a/Applications/YOLO/PyTorch/yolo_loss.py b/Applications/YOLOv2/PyTorch/yolo_loss.py
similarity index 72%
rename from Applications/YOLO/PyTorch/yolo_loss.py
rename to Applications/YOLOv2/PyTorch/yolo_loss.py
index 12f95572a4..c444821236 100644
--- a/Applications/YOLO/PyTorch/yolo_loss.py
+++ b/Applications/YOLOv2/PyTorch/yolo_loss.py
@@ -8,10 +8,10 @@
# @author Seungbaek Hong
import torch
-import torch.nn as nn
-import torch.functional as F
+from torch import nn
import numpy as np
+
##
# @brief calculate iou between two boxes list
def calculate_iou(bbox1, bbox2):
@@ -25,27 +25,28 @@ def calculate_iou(bbox1, bbox2):
b1x2, b1y2 = (bbox1[:, :2] + (bbox1[:, 2:4])).split(1, 1)
b2x1, b2y1 = (bbox2[:, :2]).split(1, 1)
b2x2, b2y2 = (bbox2[:, :2] + (bbox2[:, 2:4])).split(1, 1)
-
+
# box areas
areas1 = (b1x2 - b1x1) * (b1y2 - b1y1)
areas2 = (b2x2 - b2x1) * (b2y2 - b2y1)
-
+
# intersections
min_x_of_max_x, max_x_of_min_x = torch.min(b1x2, b2x2), torch.max(b1x1, b2x1)
min_y_of_max_y, max_y_of_min_y = torch.min(b1y2, b2y2), torch.max(b1y1, b2y1)
intersection_width = (min_x_of_max_x - max_x_of_min_x).clamp(min=0)
intersection_height = (min_y_of_max_y - max_y_of_min_y).clamp(min=0)
intersections = intersection_width * intersection_height
-
- # unions
+
+ # unions
unions = (areas1 + areas2) - intersections
-
- result = intersections / unions
+
+ result = intersections / unions
return result
+
##
# @brief find best iou and its index
-def find_best_ratio(anchors, bbox):
+def find_best_ratio(anchors, bbox):
"""
@param anchors shape(numb_of_anchors, 2), it contains w, h
@param bbox shape(numb_of_bbox, 2), it contains w, h
@@ -57,52 +58,59 @@ def find_best_ratio(anchors, bbox):
best_match = np.argmin(similarities, axis=0)
return best_match
+
##
# @brief loss class for yolo
class YoloV2_LOSS(nn.Module):
"""Yolo v2 loss"""
- def __init__(self, num_classes, img_shape = (416, 416), outsize = (13, 13)):
+
+ def __init__(self, num_classes, img_shape, device="cpu", outsize=(13, 13)):
super().__init__()
+ self.device = device
self.num_classes = num_classes
self.img_shape = img_shape
self.outsize = outsize
- self.hook = dict()
-
+ self.hook = {}
+
self.anchors = torch.FloatTensor(
- [(1.3221, 1.73145),
- (3.19275, 4.00944),
- (5.05587, 8.09892),
- (9.47112, 4.84053),
- (11.2364, 10.0071)]
+ [
+ (1.3221, 1.73145),
+ (3.19275, 4.00944),
+ (5.05587, 8.09892),
+ (9.47112, 4.84053),
+ (11.2364, 10.0071),
+ ]
)
-
+
self.mse = nn.MSELoss()
self.bbox_loss, self.iou_loss, self.cls_loss = None, None, None
-
+
##
- # @brief function to track gradients of non-leaf varibles.
+ # @brief function to track gradients of non-leaf varibles.
def hook_variable(self, name, var):
- """ Do not use this function when training. It is for debugging. """
+ """Do not use this function when training. It is for debugging."""
self.hook[name] = var
self.hook[name].requires_grad_().retain_grad()
##
# @brief function to print gradients of non-leaf varibles.
def print_hook_variables(self):
- """ Do not use this function when training. It is for debugging. """
+ """Do not use this function when training. It is for debugging."""
for k, var in self.hook.items():
- print("gradients of variable {}:".format(k))
+ print(f"gradients of variable {k}:")
batch, channel, height, width = var.grad.shape
for b in range(batch):
for c in range(channel):
for h in range(height):
for w in range(width):
if torch.abs(var.grad[b, c, h, w]).item() >= 1e-3:
- print("(b: {}, c: {}, h: {}, w: {}) = {}"\
- .format(b, c, h, w, var.grad[b, c, h, w]))
+ print(
+ f"(b: {b}, c: {c}, h: {h}, w: {w}) =\
+ {var.grad[b, c, h, w]}"
+ )
print("=" * 20)
-
- def forward(self, bbox_pred, iou_pred, prob_pred, bbox_gt, cls_gt):
+
+ def forward(self, bbox_pred, iou_pred, prob_pred, bbox_gt, cls_gt):
"""
@param bbox_pred shape(batch_size, cell_h x cell_w, num_anchors, 4)
@param iou_pred shape(batch_size, cell_h x cell_w, 1)
@@ -114,52 +122,50 @@ def forward(self, bbox_pred, iou_pred, prob_pred, bbox_gt, cls_gt):
self.hook_variable("bbox_pred", bbox_pred)
bbox_pred = self.apply_anchors_to_bbox(bbox_pred)
- bbox_built, iou_built, cls_built, bbox_mask, iou_mask, cls_mask =\
+ bbox_built, iou_built, cls_built, bbox_mask, iou_mask, cls_mask = (
self._build_target(bbox_pred, bbox_gt, cls_gt)
-
- self.bbox_loss = self.mse(bbox_pred * bbox_mask,
- bbox_built * bbox_mask)
- self.iou_loss = self.mse(iou_pred * iou_mask,
- iou_built * iou_mask)
- self.cls_loss = self.mse(prob_pred * cls_mask,
- cls_built * cls_mask)
-
+ )
+
+ self.bbox_loss = self.mse(bbox_pred * bbox_mask, bbox_built * bbox_mask)
+ self.iou_loss = self.mse(iou_pred * iou_mask, iou_built * iou_mask)
+ self.cls_loss = self.mse(prob_pred * cls_mask, cls_built * cls_mask)
+
return self.bbox_loss * 5 + self.iou_loss + self.cls_loss
-
+
def apply_anchors_to_bbox(self, bbox_pred):
"""
@param bbox_pred shape(batch_size, cell_h x cell_w, num_anchors, 4)
- @return bbox_pred shape(batch_size, cell_h x cell_w, num_anchors, 4)
+ @return bbox_pred shape(batch_size, cell_h x cell_w, num_anchors, 4)
"""
- anchor_w = self.anchors[:, 0].contiguous().view(-1, 1)
- anchor_h = self.anchors[:, 1].contiguous().view(-1, 1)
+ anchor_w = self.anchors[:, 0].contiguous().view(-1, 1).to(self.device)
+ anchor_h = self.anchors[:, 1].contiguous().view(-1, 1).to(self.device)
bbox_pred_tmp = bbox_pred.clone()
bbox_pred_tmp[:, :, :, 2:3] = torch.sqrt(bbox_pred[:, :, :, 2:3] * anchor_w)
bbox_pred_tmp[:, :, :, 3:4] = torch.sqrt(bbox_pred[:, :, :, 3:4] * anchor_h)
return bbox_pred_tmp
-
+
def _build_target(self, bbox_pred, bbox_gt, cls_gt):
"""
@param bbox_pred shape(batch_size, cell_h x cell_w, num_anchors, 4)
@param bbox_gt shape(batch_size, num_bbox, 4)
@param cls_gt shape(batch_size, num_bbox, 1)
@return tuple of (bbox_built, iou_built, cls_built, bbox_mask, iou_mask, cls_mask)
- """
+ """
bbox_built, bbox_mask = [], []
iou_built, iou_mask = [], []
cls_built, cls_mask = [], []
-
+
batch_size = bbox_pred.shape[0]
-
+
for i in range(batch_size):
- _bbox_built, _iou_built, _cls_built,\
- _bbox_mask, _iou_mask, _cls_mask =\
- self._make_target_per_sample(
- torch.FloatTensor(bbox_pred[i]),
- torch.FloatTensor(np.array(bbox_gt[i])),
- torch.LongTensor(cls_gt[i])
- )
-
+ _bbox_built, _iou_built, _cls_built, _bbox_mask, _iou_mask, _cls_mask = (
+ self._make_target_per_sample(
+ bbox_pred[i],
+ torch.FloatTensor(np.array(bbox_gt[i])),
+ torch.LongTensor(cls_gt[i]),
+ )
+ )
+
bbox_built.append(_bbox_built)
bbox_mask.append(_bbox_mask)
iou_built.append(_iou_built)
@@ -173,9 +179,16 @@ def _build_target(self, bbox_pred, bbox_gt, cls_gt):
iou_mask = torch.stack(iou_mask)
cls_built = torch.stack(cls_built)
cls_mask = torch.stack(cls_mask)
-
- return bbox_built, iou_built, cls_built, bbox_mask, iou_mask, cls_mask
-
+
+ return (
+ bbox_built.to(self.device),
+ iou_built.to(self.device),
+ cls_built.to(self.device),
+ bbox_mask.to(self.device),
+ iou_mask.to(self.device),
+ cls_mask.to(self.device),
+ )
+
def _make_target_per_sample(self, _bbox_pred, _bbox_gt, _cls_gt):
"""
@param _bbox_pred shape(cell_h x cell_w, num_anchors, 4)
@@ -183,22 +196,22 @@ def _make_target_per_sample(self, _bbox_pred, _bbox_gt, _cls_gt):
@param _cls_gt shape(num_bbox,)
@return tuple of (_bbox_built, _iou_built, _cls_built, _bbox_mask, _iou_mask, _cls_mask)
"""
- hw, num_anchors, _ = _bbox_pred.shape
-
+ hw, num_anchors, _ = _bbox_pred.shape
+
# set result template
_bbox_built = torch.zeros((hw, num_anchors, 4))
_bbox_mask = torch.zeros((hw, num_anchors, 1))
-
+
_iou_built = torch.zeros((hw, num_anchors, 1))
_iou_mask = torch.ones((hw, num_anchors, 1)) * 0.5
-
+
_cls_built = torch.zeros((hw, num_anchors, self.num_classes))
_cls_mask = torch.zeros((hw, num_anchors, 1))
-
+
# find best anchors
- _bbox_gt_wh = _bbox_gt.clone()[:, 2:]
+ _bbox_gt_wh = _bbox_gt.clone()[:, 2:]
best_anchors = find_best_ratio(self.anchors, _bbox_gt_wh)
-
+
# normalize x, y pos based on cell coornindates
cx = _bbox_gt[:, 0] * self.outsize[0]
cy = _bbox_gt[:, 1] * self.outsize[1]
@@ -207,22 +220,23 @@ def _make_target_per_sample(self, _bbox_pred, _bbox_gt, _cls_gt):
cell_idx = np.array(cell_idx, dtype=np.int16)
cx -= np.floor(cx)
cy -= np.floor(cy)
-
+
# set bbox of gt
- _bbox_built[cell_idx, best_anchors, 0] = cx
+ _bbox_built[cell_idx, best_anchors, 0] = cx
_bbox_built[cell_idx, best_anchors, 1] = cy
- _bbox_built[cell_idx, best_anchors, 2] = torch.sqrt(_bbox_gt[:, 2])
- _bbox_built[cell_idx, best_anchors, 3] = torch.sqrt(_bbox_gt[:, 3])
+ _bbox_built[cell_idx, best_anchors, 2] = torch.sqrt(_bbox_gt[:, 2])
+ _bbox_built[cell_idx, best_anchors, 3] = torch.sqrt(_bbox_gt[:, 3])
_bbox_mask[cell_idx, best_anchors, :] = 1
-
- # set cls of gt
+
+ # set cls of gt
_cls_built[cell_idx, best_anchors, _cls_gt] = 1
_cls_mask[cell_idx, best_anchors, :] = 1
-
+
# set confidence score of gt
- _iou_built = calculate_iou(_bbox_pred.reshape(-1, 4), _bbox_built.view(-1, 4)).detach()
+ _iou_built = calculate_iou(
+ _bbox_pred.reshape(-1, 4), _bbox_built.view(-1, 4).to(self.device)
+ ).detach()
_iou_built = _iou_built.view(hw, num_anchors, 1)
_iou_mask[cell_idx, best_anchors, :] = 1
-
- return _bbox_built, _iou_built, _cls_built,\
- _bbox_mask, _iou_mask, _cls_mask
+
+ return _bbox_built, _iou_built, _cls_built, _bbox_mask, _iou_mask, _cls_mask
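Reviewer note on the loss changes: `YoloV2_LOSS` now takes a `device` argument and moves the anchors and the built targets onto it, so predictions coming from a CUDA model no longer get mixed with CPU-side target tensors. A stripped-down sketch of the pattern (not the actual class; shapes follow the 13x13 grid with 5 anchors):

```python
import torch
import torch.nn.functional as F

device = "cuda" if torch.cuda.is_available() else "cpu"

# predictions usually already live on `device` because the model was moved there
bbox_pred = torch.rand(13 * 13, 5, 4, device=device)

# targets are assembled on the CPU and must follow the predictions to `device`
bbox_built = torch.zeros(13 * 13, 5, 4)
bbox_mask = torch.ones(13 * 13, 5, 1)

loss = F.mse_loss(bbox_pred * bbox_mask.to(device),
                  bbox_built.to(device) * bbox_mask.to(device))
print(loss.item())
```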
diff --git a/Applications/YOLO/jni/Android.mk b/Applications/YOLOv2/jni/Android.mk
similarity index 100%
rename from Applications/YOLO/jni/Android.mk
rename to Applications/YOLOv2/jni/Android.mk
diff --git a/Applications/YOLO/jni/Application.mk b/Applications/YOLOv2/jni/Application.mk
similarity index 100%
rename from Applications/YOLO/jni/Application.mk
rename to Applications/YOLOv2/jni/Application.mk
diff --git a/Applications/YOLO/jni/det_dataloader.cpp b/Applications/YOLOv2/jni/det_dataloader.cpp
similarity index 100%
rename from Applications/YOLO/jni/det_dataloader.cpp
rename to Applications/YOLOv2/jni/det_dataloader.cpp
diff --git a/Applications/YOLO/jni/det_dataloader.h b/Applications/YOLOv2/jni/det_dataloader.h
similarity index 100%
rename from Applications/YOLO/jni/det_dataloader.h
rename to Applications/YOLOv2/jni/det_dataloader.h
diff --git a/Applications/YOLO/jni/main.cpp b/Applications/YOLOv2/jni/main.cpp
similarity index 97%
rename from Applications/YOLO/jni/main.cpp
rename to Applications/YOLOv2/jni/main.cpp
index bc3985adbd..018602e408 100644
--- a/Applications/YOLO/jni/main.cpp
+++ b/Applications/YOLOv2/jni/main.cpp
@@ -139,6 +139,7 @@ std::vector yoloBlock(const std::string &block_name,
withKey("filters", filters),
withKey("kernel_size", {kernel_size, kernel_size}),
withKey("padding", padding),
+ withKey("disable_bias", "true"),
withKey("input_layers", input_layer)};
return createLayer("conv2d", props);
@@ -150,6 +151,7 @@ std::vector yoloBlock(const std::string &block_name,
if (downsample) {
LayerHandle a2 = createLayer("batch_normalization",
{with_name("a2"), withKey("momentum", "0.9"),
+ withKey("epsilon", 0.00001),
withKey("activation", "leaky_relu")});
LayerHandle a3 = createLayer(
@@ -158,10 +160,10 @@ std::vector yoloBlock(const std::string &block_name,
return {a1, a2, a3};
} else {
- LayerHandle a2 =
- createLayer("batch_normalization",
- {withKey("name", block_name), withKey("momentum", "0.9"),
- withKey("activation", "leaky_relu")});
+ LayerHandle a2 = createLayer(
+ "batch_normalization",
+ {withKey("name", block_name), withKey("momentum", "0.9"),
+ withKey("epsilon", 0.00001), withKey("activation", "leaky_relu")});
return {a1, a2};
}
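Reviewer note: `disable_bias=true` and `epsilon=0.00001` keep the nntrainer graph numerically aligned with the PyTorch reference above, whose convolutions now use `bias=False` and whose `BatchNorm2d` layers rely on the default epsilon. A quick check of the PyTorch defaults assumed here:

```python
from torch import nn

bn = nn.BatchNorm2d(32)
print(bn.eps)     # 1e-05 -- the value the C++ block now sets via withKey("epsilon", 0.00001)

conv = nn.Conv2d(3, 32, 3, 1, 1, bias=False)
print(conv.bias)  # None -- the counterpart of withKey("disable_bias", "true")
```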
diff --git a/Applications/YOLO/jni/meson.build b/Applications/YOLOv2/jni/meson.build
similarity index 100%
rename from Applications/YOLO/jni/meson.build
rename to Applications/YOLOv2/jni/meson.build
diff --git a/Applications/YOLO/jni/reorg_layer.cpp b/Applications/YOLOv2/jni/reorg_layer.cpp
similarity index 100%
rename from Applications/YOLO/jni/reorg_layer.cpp
rename to Applications/YOLOv2/jni/reorg_layer.cpp
diff --git a/Applications/YOLO/jni/reorg_layer.h b/Applications/YOLOv2/jni/reorg_layer.h
similarity index 100%
rename from Applications/YOLO/jni/reorg_layer.h
rename to Applications/YOLOv2/jni/reorg_layer.h
diff --git a/Applications/YOLO/jni/yolo_v2_loss.cpp b/Applications/YOLOv2/jni/yolo_v2_loss.cpp
similarity index 100%
rename from Applications/YOLO/jni/yolo_v2_loss.cpp
rename to Applications/YOLOv2/jni/yolo_v2_loss.cpp
diff --git a/Applications/YOLO/jni/yolo_v2_loss.h b/Applications/YOLOv2/jni/yolo_v2_loss.h
similarity index 100%
rename from Applications/YOLO/jni/yolo_v2_loss.h
rename to Applications/YOLOv2/jni/yolo_v2_loss.h
diff --git a/Applications/meson.build b/Applications/meson.build
index 2e3f59fdf2..7c8ef63cd4 100644
--- a/Applications/meson.build
+++ b/Applications/meson.build
@@ -9,7 +9,7 @@ if enable_ccapi
endif
subdir('VGG/jni')
subdir('Resnet/jni')
-subdir('YOLO/jni')
+subdir('YOLOv2/jni')
subdir('YOLOv3/jni')
subdir('LLaMA/jni')
subdir('Multi_input/jni')
diff --git a/ci/pylintrc b/ci/pylintrc
deleted file mode 100644
index aa38200415..0000000000
--- a/ci/pylintrc
+++ /dev/null
@@ -1,36 +0,0 @@
-[MASTER]
-
-[MESSAGESCONTROL]
-disable=
- too-many-instance-attributes,
- len-as-condition,
- too-few-public-methods,
- anomalous-backslash-in-string,
- no-else-return,
- simplifiable-if-statement,
- too-many-arguments,
- duplicate-code,
- no-name-in-module,
- no-member,
- raw-checker-failed,
- bad-inline-option,
- locally-disabled,
- file-ignored,
- suppressed-message,
- useless-suppression,
- deprecated-pragma,
- import-error,
- missing-docstring,
- invalid-name,
- consider-using-enumerate
-
-[SIMILARITIES]
-
-# Ignore comments when computing similarities.
-ignore-comments=yes
-
-# Ignore docstrings when computing similarities.
-ignore-docstrings=yes
-
-# Ignore imports when computing similarities.
-ignore-imports=no
diff --git a/ci/requirements.txt b/ci/requirements.txt
deleted file mode 100644
index 0be69076fc..0000000000
--- a/ci/requirements.txt
+++ /dev/null
@@ -1,81 +0,0 @@
-absl-py==2.1.0
-astroid==3.0.2
-astunparse==1.6.3
-cachetools==5.3.2
-certifi==2023.11.17
-charset-normalizer==3.3.2
-contourpy==1.2.0
-cycler==0.12.1
-dill==0.3.8
-filelock==3.13.1
-flatbuffers==23.5.26
-fonttools==4.47.2
-fsspec==2023.12.2
-gast==0.5.4
-google-auth==2.27.0
-google-auth-oauthlib==1.2.0
-google-pasta==0.2.0
-grpcio==1.60.0
-h5py==3.10.0
-huggingface-hub==0.20.3
-idna==3.6
-importlib-metadata==7.0.1
-importlib-resources==6.1.1
-isort==5.13.2
-Jinja2==3.1.3
-joblib==1.3.2
-keras==2.15.0
-kiwisolver==1.4.5
-libclang==16.0.6
-Markdown==3.5.2
-MarkupSafe==2.1.4
-matplotlib==3.8.2
-mccabe==0.7.0
-ml-dtypes==0.2.0
-mpmath==1.3.0
-networkx==3.2.1
-numpy==1.26.3
-oauthlib==3.2.2
-opt-einsum==3.3.0
-packaging==23.2
-pandas==2.2.0
-pillow==10.2.0
-platformdirs==4.2.0
-protobuf==4.23.4
-pyasn1==0.5.1
-pyasn1-modules==0.3.0
-pylint==3.0.2
-pyparsing==3.1.1
-python-dateutil==2.8.2
-pytz==2023.4
-PyYAML==6.0.1
-regex==2023.12.25
-requests==2.31.0
-requests-oauthlib==1.3.1
-rsa==4.9
-safetensors==0.4.2
-scikit-learn==1.4.0
-scipy==1.12.0
-six==1.16.0
-sympy==1.12
-tensorboard==2.15.1
-tensorboard-data-server==0.7.2
-tensorflow==2.15.0.post1
-tensorflow-estimator==2.15.0
-tensorflow-io-gcs-filesystem==0.35.0
-termcolor==2.4.0
-threadpoolctl==3.2.0
-tokenizers==0.15.1
-tomli==2.0.1
-tomlkit==0.12.3
-torch==2.2.0
-torchvision==0.17.0
-tqdm==4.66.1
-transformers==4.37.2
-triton==2.2.0
-typing_extensions==4.9.0
-tzdata==2023.4
-urllib3==2.2.0
-Werkzeug==3.0.1
-wrapt==1.14.1
-zipp==3.17.0
diff --git a/debian/nntrainer-dev.install b/debian/nntrainer-dev.install
index 4fd55b3774..11b41f990b 100644
--- a/debian/nntrainer-dev.install
+++ b/debian/nntrainer-dev.install
@@ -16,6 +16,7 @@
/usr/include/nntrainer/blas_interface.h
/usr/include/nntrainer/var_grad.h
/usr/include/nntrainer/weight.h
+/usr/include/nntrainer/blas_avx.h
# todo: update dataset headers
/usr/include/nntrainer/databuffer.h
/usr/include/nntrainer/databuffer_factory.h
diff --git a/meson.build b/meson.build
index d4aea330a4..7ae692e6d9 100644
--- a/meson.build
+++ b/meson.build
@@ -64,9 +64,19 @@ warning_c_flags = [
'-Wno-error=varargs'
]
+arch = host_machine.cpu_family()
+
+if get_option('enable-avx')
+ extra_defines += '-DUSE_AVX=1'
+ if get_option('platform') == 'tizen'
+ add_project_arguments(['-mavx2'], language: ['c','cpp'])
+ else
+ add_project_arguments(['-march=native'], language: ['c','cpp'])
+ endif
+ message('-march=native added for AVX hardware acceleration.')
+endif
if get_option('enable-fp16')
- arch = host_machine.cpu_family()
if get_option('platform') == 'android'
add_project_arguments('-mfp16-format=ieee', language: ['c', 'cpp'])
extra_defines += '-DENABLE_FP16=1'
@@ -105,11 +115,6 @@ if get_option('enable-fp16')
if cc.version().version_compare('>=12.1.0')
message ('Float16 for x86_64 enabled. Modern gcc-x64 generally supports float16 with _Float16.')
extra_defines += '-DENABLE_FP16=1'
- if get_option('enable-avx')
- extra_defines += '-DUSE_AVX=1'
- add_project_arguments(['-march=native'], language: ['c','cpp'])
- message('-march=native added for AVX hardware acceleration.')
- endif
else
warning ('Float16 for x86_64 enabled. However, software emulation is applied for fp16, making it slower and inconsistent. Use GCC 12+ for FP16 support. This build will probably fail unless you bring a compiler that supports fp16 for x64.')
endif
diff --git a/meson_options.txt b/meson_options.txt
index de2578cb47..59accc1c1a 100644
--- a/meson_options.txt
+++ b/meson_options.txt
@@ -40,7 +40,7 @@ option('enable-fp16', type: 'boolean', value: false)
option('enable-cublas', type: 'boolean', value: false)
option('enable-openmp', type: 'boolean', value: true)
option('enable-neon', type: 'boolean', value: false)
-option('enable-avx', type: 'boolean', value: false)
+option('enable-avx', type: 'boolean', value: true)
option('enable-opencl', type: 'boolean', value: false)
# ml-api dependency (to enable, install capi-inference from github.com/nnstreamer/api )
diff --git a/nnstreamer/tensor_trainer/tensor_trainer_nntrainer.cc b/nnstreamer/tensor_trainer/tensor_trainer_nntrainer.cc
index 57d84f99d1..c18630efb9 100644
--- a/nnstreamer/tensor_trainer/tensor_trainer_nntrainer.cc
+++ b/nnstreamer/tensor_trainer/tensor_trainer_nntrainer.cc
@@ -555,7 +555,8 @@ void NNTrainer::NNTrainerImpl::trainModel() {
ml_logd("pid[%d], tid[%d]", pid, tid);
try {
- model->setProperty({"epochs=" + std::to_string(num_epochs)});
+ model->setProperty(
+ {"epochs=" + std::to_string(num_epochs), "save_path=" + model_save_path});
} catch (const std::exception &e) {
ml_loge("Error %s, %s", typeid(e).name(), e.what());
return;
@@ -574,14 +575,6 @@ void NNTrainer::NNTrainerImpl::trainModel() {
return;
}
- try {
- ml_logd("Save_model: %s", model_save_path.c_str());
- model->save(model_save_path, ml::train::ModelFormat::MODEL_FORMAT_BIN);
-
- } catch (const std::exception &e) {
- ml_loge("Error %s, %s", typeid(e).name(), e.what());
- return;
- }
/* send event */
nnstreamer_trainer_notify_event(this->notifier,
TRAINER_EVENT_TRAINING_COMPLETION, NULL);
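With this change the trainer element forwards the save path as a model property instead of calling save() explicitly, so the model persists itself when training completes. A minimal standalone sketch of the equivalent C++ API usage follows; the path and epoch count are placeholders, and the save-on-completion behaviour is assumed here from the new "save_path" property rather than stated by the patch.

#include <model.h>
#include <string>

int train_and_autosave(ml::train::Model &model, unsigned int num_epochs,
                       const std::string &model_save_path) {
  // Both values travel as "key=value" properties; no explicit save() call needed.
  model.setProperty({"epochs=" + std::to_string(num_epochs),
                     "save_path=" + model_save_path});
  return model.train(); // expected to write model_save_path on completion
}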
diff --git a/nntrainer/cl_context.cpp b/nntrainer/cl_context.cpp
index 1ed31490be..be7345eed0 100644
--- a/nntrainer/cl_context.cpp
+++ b/nntrainer/cl_context.cpp
@@ -13,7 +13,7 @@
*/
#include
-#include
+#include
namespace nntrainer {
@@ -23,8 +23,9 @@ std::once_flag global_cl_context_init_flag;
static void add_default_object(ClContext &cc) {
-  cc.registerFactory(nntrainer::createLayer<FullyConnectedLayer>,
-                     FullyConnectedLayer::type, ml::train::LayerType::LAYER_FC);
+  cc.registerFactory(nntrainer::createLayer<FullyConnectedLayerCl>,
+                     FullyConnectedLayerCl::type,
+                     ml::train::LayerType::LAYER_FC);
}
static void registerer(ClContext &cc) noexcept {
diff --git a/nntrainer/graph/graph_core.cpp b/nntrainer/graph/graph_core.cpp
index b624e066e4..3eafbb9261 100644
--- a/nntrainer/graph/graph_core.cpp
+++ b/nntrainer/graph/graph_core.cpp
@@ -35,6 +35,10 @@ GraphCore::getSortedNode(unsigned int ith) const {
return Sorted.at(ith);
}
+const unsigned int GraphCore::getSortedNodeIdx(const std::string &name) const {
+ return sorted_node_map.at(name);
+}
+
void GraphCore::makeAdjacencyList(
std::vector>> &adj) {
/** initialize the adj list */
@@ -93,6 +97,11 @@ void GraphCore::topologicalSort() {
if (Sorted.size() != node_list.size())
throw std::runtime_error("Internal error in topologicalSort");
+ unsigned int idx = 0;
+ for (auto n : Sorted) {
+ sorted_node_map[n->getName()] = idx;
+ idx++;
+ }
}
const std::shared_ptr &
diff --git a/nntrainer/graph/graph_core.h b/nntrainer/graph/graph_core.h
index 83d3ce7c39..77aa63666a 100644
--- a/nntrainer/graph/graph_core.h
+++ b/nntrainer/graph/graph_core.h
@@ -91,6 +91,13 @@ class GraphCore {
*/
const std::shared_ptr &getSortedNode(unsigned int ith) const;
+ /**
+   * @brief getter of the sorted GraphNode index with the node name
+   * @param[in] name layer name
+   * @return index of the node in the sorted node list
+ */
+ const unsigned int getSortedNodeIdx(const std::string &name) const;
+
/**
* @brief getter of GraphNode with node name
* @param[in] node name
@@ -252,6 +259,7 @@ class GraphCore {
std::vector>
node_list; /**< Unordered Node List */
std::unordered_map node_map; /**< Unordered Node map */
+  std::unordered_map<std::string, unsigned int> sorted_node_map; /**< node name to sorted index */
std::vector> Sorted; /**< Ordered Node List */
bool sorted; /** if the node_list is sorted */
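The new sorted_node_map gives O(1) lookup from a layer name to its position in the topologically sorted list, which the mixed-precision backwarding later in this patch uses to restart forwarding from the first layer that produced a NaN gradient. A small self-contained sketch of the idea (the names here are illustrative, not the GraphCore API):

#include <string>
#include <unordered_map>
#include <vector>

struct SortedIndex {
  std::vector<std::string> sorted; // topologically sorted node names
  std::unordered_map<std::string, unsigned int> index;

  void build() {
    index.clear();
    for (unsigned int i = 0; i < sorted.size(); ++i)
      index[sorted[i]] = i; // mirrors the loop added to topologicalSort()
  }

  // visit every node from `name` (inclusive) to the end of the sorted order
  template <typename Fn>
  void forEachFrom(const std::string &name, Fn &&fn) const {
    for (unsigned int i = index.at(name); i < sorted.size(); ++i)
      fn(sorted[i]);
  }
};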
diff --git a/nntrainer/graph/network_graph.cpp b/nntrainer/graph/network_graph.cpp
index 2d4cfdc769..ec69ebd69f 100644
--- a/nntrainer/graph/network_graph.cpp
+++ b/nntrainer/graph/network_graph.cpp
@@ -337,7 +337,7 @@ void NetworkGraph::applyGradients(
continue;
}
- if (rc.isGradientClipByGlobalNorm(i)) {
+ if (rc.isGradientClipByGlobalNorm(i) || rc.isMixedPrecision(i)) {
/**
* @note the weights whose gradient are to be clipped by global norm will
* be clipped at once at the end of iteration and applied then.
@@ -393,56 +393,113 @@ sharedConstTensors NetworkGraph::incremental_forwarding(
return out;
}
-void NetworkGraph::backwarding(
+bool NetworkGraph::backwarding(
int iteration,
- std::function, int)> &backwarding_op,
- std::function &apply_grad_clip_op,
- std::function stop_cb, void *userdata) const {
+ std::function, bool)> &forwarding_op,
+ std::function, int)> &backwarding_op,
+ std::function &lazy_apply_grad_op,
+ std::function stop_cb, void *userdata) {
/**
* last layer backwarding is run out of this loop
*/
auto iter_begin = getBackwardingBeginIter();
auto iter_end = getBackwardingEndIter();
+ bool is_valid = true;
/// there is no layer to train, so backwarding is essentially noop
if (iter_begin == iter_end) {
- return;
+ return true;
}
auto const &lptr_begin = (*iter_begin);
+ // graph_const_reverse_iterator
+ auto iter_ = iter_begin;
if (lptr_begin->requireLabel() == false)
throw std::runtime_error(
"Error: last layer does not accept label, we can't train");
- for (auto iter = iter_begin; iter != iter_end && !stop_cb(userdata); iter++) {
- auto &ln = *iter;
+ for (iter_ = iter_begin; iter_ != iter_end && !stop_cb(userdata); iter_++) {
+ auto &ln = *iter_;
PROFILE_TIME_START(profile_keys.at(ln->getType()));
- backwarding_op(ln, iteration);
+ is_valid = backwarding_op(ln, iteration);
PROFILE_TIME_END(profile_keys.at(ln->getType()));
+
+ if (!is_valid) {
+ std::cout << ln->getName() << " : Gradient has NaN --> "
+ << ln->getRunContext().getLossScale() << std::endl;
+ break;
+ }
}
- /** perform clipping of the gradients by global norm if any */
- if (clip_weights.empty())
- return;
+ if (!is_valid) {
+    /** if a NaN gradient was found:
+     * 1. back off the loss scale. @todo backoff_factor : default --> 0.5
+     * 2. re-run forwarding from the failing node to cend() (or until stop_cb)
+     * 3. return false --> the caller runs backwarding again
+     */
+ float scale = (*iter_)->getRunContext().getLossScale();
+
+ NNTR_THROW_IF(scale == 1.0f, std::invalid_argument)
+ << "Loss Scale Factor is 1.0f";
+
+ float s = scale > 1.5f ? scale * 0.5f : 1.0f;
- /** calculate the global norm */
- Tensor global_norm_t(
- TensorDim({1u, 1u, 1u, (unsigned int)clip_weights.size()}));
- float *global_norm_data = global_norm_t.getData();
- for (unsigned int idx = 0; idx < clip_weights.size(); idx++) {
- auto const &w = clip_weights[idx];
- global_norm_data[idx] = w->getGradientNorm();
+ resetLossScale(s);
+
+ auto f_iter = cbegin() + graph.getSortedNodeIdx((*iter_)->getName());
+
+ for (auto iter = f_iter; iter != cend() && !stop_cb(userdata); iter++) {
+ auto &ln = *iter;
+ ln->needsOutputSetZero(true);
+ }
+
+ for (auto iter = f_iter; iter != cend() && !stop_cb(userdata); iter++) {
+ auto &ln = *iter;
+ PROFILE_TIME_START(profile_keys.at(ln->getType()));
+ forwarding_op(*iter, true);
+ PROFILE_TIME_END(profile_keys.at(ln->getType()));
+ }
+
+ return false;
}
- float global_norm = global_norm_t.l2norm();
- /** apply the gradient with the above global norm */
- for (auto w : clip_weights) {
- w->clipGradientByGlobalNorm(global_norm);
+
+ /** perform clipping of the gradients by global norm if any */
+ if (lazy_weights.empty())
+ return true;
+
+ if (is_clip_grad) {
+ /** calculate the global norm */
+ Tensor global_norm_t(
+ TensorDim({1u, 1u, 1u, (unsigned int)lazy_weights.size()}));
+ float *global_norm_data = global_norm_t.getData();
+ for (unsigned int idx = 0; idx < lazy_weights.size(); idx++) {
+ auto const &w = lazy_weights[idx];
+ global_norm_data[idx] = w->getGradientNorm();
+ }
+ float global_norm = global_norm_t.l2norm();
+ /** apply the gradient with the above global norm */
+ for (auto w : lazy_weights) {
+ w->clipGradientByGlobalNorm(global_norm);
+ }
}
/** apply the gradient with the above global norm */
- for (auto w : clip_weights) {
- apply_grad_clip_op(*w, iteration);
+ for (auto w : lazy_weights) {
+ lazy_apply_grad_op(*w, iteration);
}
+ nan_count++;
+
+ /** @todo : handle as property : growth_interval : default --> 2000 */
+
+ if (nan_count > 2000) {
+ float scale = (*iter_)->getRunContext().getLossScale();
+ /** @todo growth_factor : default --> 2.0 */
+ float s = scale * 2.0f;
+ resetLossScale(s);
+ nan_count = 0;
+ }
+
+ return true;
}
LayerNode *NetworkGraph::computeBackwardEnd() {
@@ -580,8 +637,15 @@ void NetworkGraph::addLayer(std::shared_ptr layer) {
InPlace
NetworkGraph::canExecuteInPlace(const std::shared_ptr &lnode) {
- if (!lnode->supportInPlace())
+
+ if (!lnode->supportInPlace()) {
return InPlace::NONE;
+ }
+
+ if (lnode->getType() == InputLayer::type &&
+ !istrequal(getTensorType()[2], "FP32")) {
+ return InPlace::NONE;
+ }
/** layers which behave as a no-op - flatten */
auto no_op = [](const std::shared_ptr &lnode) {
@@ -768,9 +832,10 @@ NetworkGraph::finalizeContext(const std::shared_ptr &lnode,
* node is going to be used with in-place optimizations.
*/
auto out_specs = init_context.getOutSpecs();
+
/// @note try move inplace control to finalize
bool shared_var = false, shared_grad = false;
- if (lnode->executeInPlace() != InPlace::NONE) {
+ if (lnode->executeInPlace() != InPlace::NONE && lnode->supportInPlace()) {
setInplaceSharedMemoryConfigByLayer(lnode, shared_var, shared_grad);
for (unsigned int i = 0; i < out_specs.size(); ++i) {
auto &s = out_specs.at(i);
@@ -879,7 +944,8 @@ NetworkGraph::finalizeContext(const std::shared_ptr &lnode,
lnode->getTrainable(), shared_weight_names),
inputs, outputs,
tensor_manager->requestTensors(gnode, init_context.getTensorsSpec(),
- lnode->getTrainable(), shared_tensor_names));
+ lnode->getTrainable(), shared_tensor_names),
+ init_context.getLossScale());
return outputs;
}
@@ -1027,7 +1093,8 @@ NetworkGraph::refinalizeContext(const std::shared_ptr &lnode,
// TODO: update weights spec for trainable based on layer trainable prop
weights, inputs, outputs,
tensor_manager->requestTensors(gnode, init_context.getTensorsSpec(),
- lnode->getTrainable(), shared_tensor_names));
+ lnode->getTrainable(), shared_tensor_names),
+ init_context.getLossScale());
return outputs;
}
@@ -1197,7 +1264,7 @@ int NetworkGraph::initialize(ExecutionMode mode,
*/
if (tensor_manager->isLastAccess(rc.getWeightGrad(i).getName(),
last_grad_access) ||
- (rc.isGradientClipByGlobalNorm(i) &&
+ ((rc.isGradientClipByGlobalNorm(i) || rc.isMixedPrecision(i)) &&
tensor_manager->isSecondLastAccess(rc.getWeightGrad(i).getName(),
last_grad_access))) {
rc.getWeightObject(i).setAsGradientLastAccess();
@@ -1287,11 +1354,19 @@ int NetworkGraph::initialize(ExecutionMode mode,
/** select weights which would require clipping of the gradients by global
* norm if any */
- clip_weights = tensor_manager->getWeights([](const Weight *w) {
+ lazy_weights = tensor_manager->getWeights([](const Weight *w) {
return w->hasGradient() && w->isGradientLastAccess() &&
- w->isGradientClipByGlobalNorm();
+ (w->isGradientClipByGlobalNorm() || w->isMixedPrecision());
});
+ is_clip_grad = false;
+ for (auto w : lazy_weights) {
+ if (w->isGradientClipByGlobalNorm()) {
+ is_clip_grad = true;
+ break;
+ }
+ }
+
return ML_ERROR_NONE;
}
@@ -1556,10 +1631,18 @@ void NetworkGraph::requestOptimizerVariable(
const TensorDim &dim = w->getDim();
std::vector dims = cb(dim);
w->setOptimizerVariables(tensor_manager->requestWeightOptimizerVariables(
- dims, w->getName(), TensorLifespan::MAX_LIFESPAN,
- w->isGradientClipByGlobalNorm(), Tensor::Initializer::ZEROS));
+ dims, w->getName(), ":opt", TensorLifespan::MAX_LIFESPAN,
+ w->isGradientClipByGlobalNorm(), w->isMixedPrecision(),
+ Tensor::Initializer::ZEROS));
}
}
}
+void NetworkGraph::resetLossScale(float scale) {
+ for (auto iter = cbegin(); iter != cend(); iter++) {
+ auto &ln = *iter;
+ ln->getRunContext().setLossScale(scale);
+ }
+}
+
} /* namespace nntrainer */
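Taken together, the backwarding changes implement dynamic loss scaling: a NaN gradient halves the scale (floored at 1.0), the affected part of the graph is re-forwarded with zeroed outputs, and the caller presumably retries the backward pass; after a long clean streak the scale is doubled, per the @todo defaults (backoff 0.5, growth factor 2.0, growth interval 2000). Below is a minimal sketch of that policy in isolation, not the NetworkGraph code itself; the initial value is a conventional default, not taken from the patch.

class LossScaler {
public:
  explicit LossScaler(float initial = 65536.0f) : scale_(initial) {}

  // A NaN/Inf gradient was seen: back off, matching `scale > 1.5f ? scale * 0.5f : 1.0f`.
  void onOverflow() {
    scale_ = scale_ > 1.5f ? scale_ * 0.5f : 1.0f;
    clean_iters_ = 0;
  }

  // A clean iteration finished: grow the scale after 2000 consecutive successes.
  void onSuccess() {
    if (++clean_iters_ > 2000) {
      scale_ *= 2.0f;
      clean_iters_ = 0;
    }
  }

  float scale() const { return scale_; }

private:
  float scale_;
  unsigned int clean_iters_ = 0;
};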
diff --git a/nntrainer/graph/network_graph.h b/nntrainer/graph/network_graph.h
index 5c9adf0363..22f14e1b73 100644
--- a/nntrainer/graph/network_graph.h
+++ b/nntrainer/graph/network_graph.h
@@ -51,7 +51,9 @@ class NetworkGraph {
optimize_memory(true),
exec_mode(ExecutionMode::TRAIN),
tensor_format("NCHW"),
- tensor_dtype(split("FP32-FP32", getRegex("\\-"))) {}
+ tensor_dtype(split("FP32-FP32", getRegex("\\-"))) {
+ nan_count = 0;
+ }
/**
* @brief Constructor of NeuralNetwork Graph Class
@@ -73,7 +75,9 @@ class NetworkGraph {
optimize_memory(true),
exec_mode(ExecutionMode::TRAIN),
tensor_format(tensor_format_),
- tensor_dtype(split(tensor_dtype_, getRegex("\\-"))) {}
+ tensor_dtype(split(tensor_dtype_, getRegex("\\-"))) {
+ nan_count = 0;
+ }
/**
* @brief Destructor of the NeuralNetwork Graph class
@@ -206,13 +210,14 @@ class NetworkGraph {
* @param[in] backwarding_op operation for the backwarding
* @param[in] apply_grad_clip_op operation for applying the clip gradients
*/
- void backwarding(
+ bool backwarding(
int iteration,
- std::function, int)> &backwarding_op,
- std::function &apply_grad_clip_op,
+ std::function, bool)> &forwarding_op,
+ std::function, int)> &backwarding_op,
+ std::function &lazy_apply_grad_op,
std::function stop_cb =
[](void *user_data) { return false; },
- void *user_data = nullptr) const;
+ void *user_data = nullptr);
/**
* @brief get begin iterator for the graph
@@ -444,6 +449,12 @@ class NetworkGraph {
getLayerExecutionOrders(const std::shared_ptr &lnode);
#endif // ENABLE_TEST
+ /**
+ * @brief reset the loss scale
+ * @param[in] scale
+ */
+ void resetLossScale(float scale);
+
private:
std::map sub_in_out; /** This is map to identify
input and output layer name of subgraph */
@@ -480,7 +491,10 @@ class NetworkGraph {
std::unordered_map
profile_keys; /**< profile keys based on the layer type */
std::vector
- clip_weights; /**< weights with global norm based clipping enabled */
+ lazy_weights; /**< weights with global norm based clipping enabled */
+ bool is_clip_grad;
+
+ unsigned int nan_count;
/**
* @brief topological sort
diff --git a/nntrainer/layers/bn_layer.cpp b/nntrainer/layers/bn_layer.cpp
index 1723ac677f..e978b1ef59 100644
--- a/nntrainer/layers/bn_layer.cpp
+++ b/nntrainer/layers/bn_layer.cpp
@@ -111,6 +111,12 @@ void BatchNormalizationLayer::finalize(InitLayerContext &context) {
context.requestWeight(dim, bnparams_beta, WeightRegularizer::NONE, 1.0f,
bias_decay, "beta", true);
+ /**
+   * @note declare weight dimension with activation datatype
+ */
+ TensorDim w_dim = dim;
+ w_dim.setDataType(in_dim.getDataType());
+
/**
* caches the deviation -> input - avg(input)
* @todo check if avoiding this storage and adding dependency on input (no
@@ -121,7 +127,7 @@ void BatchNormalizationLayer::finalize(InitLayerContext &context) {
TensorLifespan::ITERATION_LIFESPAN);
/** caches the inverse standard deviation */
wt_idx[BNParams::invstd] =
- context.requestTensor(dim, "invstd", Tensor::Initializer::NONE, false,
+ context.requestTensor(w_dim, "invstd", Tensor::Initializer::NONE, false,
TensorLifespan::ITERATION_LIFESPAN);
/**
* Temporary tensor to store the full sized tensors in order to allow batch
@@ -136,13 +142,13 @@ void BatchNormalizationLayer::finalize(InitLayerContext &context) {
* caches variance + epsilon as well.
*/
wt_idx[BNParams::cvar] =
- context.requestTensor(dim, "cvar", Tensor::Initializer::NONE, false,
+ context.requestTensor(w_dim, "cvar", Tensor::Initializer::NONE, false,
TensorLifespan::ITERATION_LIFESPAN);
/**
* Temporary tensor to store the reduced tensors along the axes_to_reduce.
*/
wt_idx[BNParams::t_reduced] =
- context.requestTensor(dim, "tensor_reduced", Tensor::Initializer::NONE,
+ context.requestTensor(w_dim, "tensor_reduced", Tensor::Initializer::NONE,
false, TensorLifespan::FORWARD_DERIV_LIFESPAN);
}
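The intent of the hunk above: the cached tensors (invstd, cvar, tensor_reduced) are now requested with the input's activation data type rather than the parameter dimension's type, so an FP16 activation path keeps FP16 intermediates. A compact sketch of the pattern; the header name is an assumption for illustration.

#include <tensor_dim.h>

nntrainer::TensorDim makeCacheDim(const nntrainer::TensorDim &param_dim,
                                  const nntrainer::TensorDim &in_dim) {
  nntrainer::TensorDim w_dim = param_dim;   // same shape as the per-channel params
  w_dim.setDataType(in_dim.getDataType());  // but stored in the activation dtype
  return w_dim;                             // used for invstd / cvar / tensor_reduced
}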
diff --git a/nntrainer/layers/cl_layers/blas_kernels.cpp b/nntrainer/layers/cl_layers/blas_kernels.cpp
new file mode 100644
index 0000000000..c190688c66
--- /dev/null
+++ b/nntrainer/layers/cl_layers/blas_kernels.cpp
@@ -0,0 +1,301 @@
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2024 Debadri Samaddar
+ *
+ * @file blas_kernels.cpp
+ * @date 14 May 2024
+ * @brief Common blas OpenCL kernels
+ * @see https://github.com/nnstreamer/nntrainer
+ * @author Debadri Samaddar
+ * @bug No known bugs except for NYI items
+ *
+ */
+
+#include
+
+namespace nntrainer {
+
+std::string sgemv_cl_kernel_ =
+ R"(__kernel void sgemv_cl(const __global float* A, const __global float* X,
+ __global float* Y, unsigned int M, unsigned int N) {
+ unsigned int i;
+ i = get_global_id(0);
+ float y0 = 0.0f;
+ for (unsigned int j = 0; j < M; j++)
+ y0 += A[i + j * N] * X[j];
+ Y[i] = y0;
+
+ })";
+
+std::string dot_cl_kernel_ =
+ R"(__kernel void dot_cl(const __global float* A, const __global float* X, unsigned int K, __global float* res) {
+ *res = 0;
+ for (unsigned int i = 0; i < K; i++){
+ *res += A[i] * X[i];
+ }
+ })";
+
+std::string sgemm_cl_kernel_ =
+ R"(__kernel void sgemm_cl(const __global float* A, const __global float* B,
+ __global float* C, unsigned int K, unsigned int lda, unsigned int ldb, unsigned int ldc) {
+
+ unsigned int m = get_global_id(0);
+ unsigned int n = get_global_id(1);
+ float c = 0.0f;
+ for (unsigned int k = 0; k < K; ++k) {
+ float a, b;
+ a = A[m * lda + k];
+ b = B[k * ldb + n];
+ c += a * b;
+ }
+ C[m * ldc + n] = c;
+ })";
+
+/**
+ * @brief declaring global kernel objects
+ */
+opencl::Kernel kernel_sgemv;
+opencl::Kernel kernel_sgemm;
+opencl::Kernel kernel_dot;
+
+void sgemv_cl(const float *matAdata, const float *vecXdata, float *vecYdata,
+ unsigned int dim1, unsigned int dim2, unsigned int lda,
+ RunLayerContext &context) {
+
+ bool result = false;
+
+ do {
+ result = context.clCreateKernel(sgemv_cl_kernel_,
+ context.LayerKernel::SGEMV, kernel_sgemv);
+ if (!result) {
+ break;
+ }
+
+ size_t dim1_size = sizeof(float) * dim1;
+ size_t dim2_size = sizeof(float) * dim2;
+ opencl::Buffer inputA(context.context_inst_, dim1 * dim2 * sizeof(float),
+ true, nullptr);
+
+ opencl::Buffer inputX(context.context_inst_, dim1_size, true, nullptr);
+
+ opencl::Buffer inOutY(context.context_inst_, dim2_size, true, nullptr);
+
+ result = inputA.WriteData(context.command_queue_inst_, matAdata);
+ if (!result) {
+ break;
+ }
+
+ result = inputX.WriteData(context.command_queue_inst_, vecXdata);
+ if (!result) {
+ break;
+ }
+
+ result = inOutY.WriteData(context.command_queue_inst_, vecYdata);
+ if (!result) {
+ break;
+ }
+
+ result = kernel_sgemv.SetKernelArguments(0, &inputA, sizeof(cl_mem));
+ if (!result) {
+ break;
+ }
+
+ result = kernel_sgemv.SetKernelArguments(1, &inputX, sizeof(cl_mem));
+ if (!result) {
+ break;
+ }
+
+ result = kernel_sgemv.SetKernelArguments(2, &inOutY, sizeof(cl_mem));
+ if (!result) {
+ break;
+ }
+
+ result = kernel_sgemv.SetKernelArguments(3, &dim1, sizeof(int));
+ if (!result) {
+ break;
+ }
+
+ result = kernel_sgemv.SetKernelArguments(4, &dim2, sizeof(int));
+ if (!result) {
+ break;
+ }
+
+ const int work_groups_count[3] = {(int)dim2, 1, 1};
+ const int work_group_size[3] = {32, 32, 1}; // test-value
+
+ result = context.command_queue_inst_.DispatchCommand(
+ kernel_sgemv, work_groups_count, work_group_size);
+ if (!result) {
+ break;
+ }
+
+ result = inOutY.ReadData(context.command_queue_inst_, vecYdata);
+ if (!result) {
+ break;
+ }
+
+ } while (false);
+}
+
+float dot_cl(const float *vecAdata, const float *vecXdata, unsigned int dim1,
+ RunLayerContext &context) {
+
+ bool result = false;
+
+ float cl_ret = 0;
+
+ do {
+ result = context.clCreateKernel(dot_cl_kernel_, context.LayerKernel::DOT,
+ kernel_dot);
+ if (!result) {
+ break;
+ }
+
+ size_t dim1_size = sizeof(float) * dim1;
+
+ opencl::Buffer inputA(context.context_inst_, dim1_size, true, nullptr);
+
+ opencl::Buffer inputX(context.context_inst_, dim1_size, true, nullptr);
+
+ opencl::Buffer dotResult(context.context_inst_, sizeof(float), true,
+ &cl_ret);
+
+ result = inputA.WriteData(context.command_queue_inst_, vecAdata);
+ if (!result) {
+ break;
+ }
+
+ result = inputX.WriteData(context.command_queue_inst_, vecXdata);
+ if (!result) {
+ break;
+ }
+
+ result = kernel_dot.SetKernelArguments(0, &inputA, sizeof(cl_mem));
+ if (!result) {
+ break;
+ }
+
+ result = kernel_dot.SetKernelArguments(1, &inputX, sizeof(cl_mem));
+ if (!result) {
+ break;
+ }
+
+ result = kernel_dot.SetKernelArguments(2, &dim1, sizeof(int));
+ if (!result) {
+ break;
+ }
+
+ result = kernel_dot.SetKernelArguments(3, &dotResult, sizeof(cl_mem));
+ if (!result) {
+ break;
+ }
+
+ const int work_groups_count[3] = {(int)dim1, 1, 1};
+ const int work_group_size[3] = {32, 32, 1}; // test-value
+
+ result = context.command_queue_inst_.DispatchCommand(
+ kernel_dot, work_groups_count, work_group_size);
+ if (!result) {
+ break;
+ }
+
+ result = dotResult.ReadData(context.command_queue_inst_, &cl_ret);
+ if (!result) {
+ break;
+ }
+
+ } while (false);
+
+ return cl_ret;
+}
+
+void sgemm_cl(const float *A, const float *B, float *C, unsigned int M,
+ unsigned int N, unsigned int K, unsigned int lda,
+ unsigned int ldb, unsigned int ldc, RunLayerContext &context) {
+
+ bool result = false;
+
+ do {
+ result = context.clCreateKernel(sgemm_cl_kernel_,
+ context.LayerKernel::SGEMM, kernel_sgemm);
+ if (!result) {
+ break;
+ }
+
+ size_t m_k_size = M * K * sizeof(float);
+ size_t k_n_size = K * N * sizeof(float);
+ size_t m_n_size = M * N * sizeof(float);
+
+ opencl::Buffer inputA(context.context_inst_, m_k_size, true, nullptr);
+
+ opencl::Buffer inputB(context.context_inst_, k_n_size, true, nullptr);
+
+ opencl::Buffer inOutC(context.context_inst_, m_n_size, true, nullptr);
+
+ result = inputA.WriteData(context.command_queue_inst_, A);
+ if (!result) {
+ break;
+ }
+
+ result = inputB.WriteData(context.command_queue_inst_, B);
+ if (!result) {
+ break;
+ }
+
+ result = inOutC.WriteData(context.command_queue_inst_, C);
+ if (!result) {
+ break;
+ }
+
+ result = kernel_sgemm.SetKernelArguments(0, &inputA, sizeof(cl_mem));
+ if (!result) {
+ break;
+ }
+
+ result = kernel_sgemm.SetKernelArguments(1, &inputB, sizeof(cl_mem));
+ if (!result) {
+ break;
+ }
+
+ result = kernel_sgemm.SetKernelArguments(2, &inOutC, sizeof(cl_mem));
+ if (!result) {
+ break;
+ }
+
+ result = kernel_sgemm.SetKernelArguments(3, &K, sizeof(int));
+ if (!result) {
+ break;
+ }
+
+ result = kernel_sgemm.SetKernelArguments(4, &lda, sizeof(int));
+ if (!result) {
+ break;
+ }
+
+ result = kernel_sgemm.SetKernelArguments(5, &ldb, sizeof(int));
+ if (!result) {
+ break;
+ }
+
+ result = kernel_sgemm.SetKernelArguments(6, &ldc, sizeof(int));
+ if (!result) {
+ break;
+ }
+
+ const int work_groups_count[3] = {(int)M, (int)N, 1};
+ const int work_group_size[3] = {32, 32, 1}; // test-value
+
+ result = context.command_queue_inst_.DispatchCommand(
+ kernel_sgemm, work_groups_count, work_group_size);
+ if (!result) {
+ break;
+ }
+
+ result = inOutC.ReadData(context.command_queue_inst_, C);
+ if (!result) {
+ break;
+ }
+
+ } while (false);
+}
+} // namespace nntrainer
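For readers (and as a possible unit-test oracle), the following plain C++ reference mirrors what the kernel strings above compute; it is an illustration, not part of the patch. Note that the sgemv kernel shown above never receives lda and indexes A with dim2 directly.

// Y[i] = sum_j A[i + j*dim2] * X[j]  -- X has dim1 elements, Y has dim2 elements.
void sgemv_reference(const float *A, const float *X, float *Y,
                     unsigned int dim1, unsigned int dim2) {
  for (unsigned int i = 0; i < dim2; ++i) {
    float y = 0.0f;
    for (unsigned int j = 0; j < dim1; ++j)
      y += A[i + j * dim2] * X[j];
    Y[i] = y;
  }
}

// C[m,n] = sum_k A[m,k] * B[k,n], row-major, no transpose, C fully overwritten.
void sgemm_reference(const float *A, const float *B, float *C, unsigned int M,
                     unsigned int N, unsigned int K, unsigned int lda,
                     unsigned int ldb, unsigned int ldc) {
  for (unsigned int m = 0; m < M; ++m) {   // one GPU work-item per (m, n)
    for (unsigned int n = 0; n < N; ++n) {
      float c = 0.0f;
      for (unsigned int k = 0; k < K; ++k)
        c += A[m * lda + k] * B[k * ldb + n];
      C[m * ldc + n] = c;
    }
  }
}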
diff --git a/nntrainer/layers/cl_layers/blas_kernels.h b/nntrainer/layers/cl_layers/blas_kernels.h
new file mode 100644
index 0000000000..ad59b8bbd1
--- /dev/null
+++ b/nntrainer/layers/cl_layers/blas_kernels.h
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2024 Debadri Samaddar
+ *
+ * @file blas_kernels.h
+ * @date 14 May 2024
+ * @brief Common blas OpenCL kernels
+ * @see https://github.com/nnstreamer/nntrainer
+ * @author Debadri Samaddar
+ * @bug No known bugs except for NYI items
+ *
+ */
+
+#ifndef __BLAS_KERNELS_H__
+#define __BLAS_KERNELS_H__
+
+#include
+#include
+#include
+#include
+
+namespace nntrainer {
+
+/**
+ * @brief declaring global kernel objects
+ */
+extern opencl::Kernel kernel_sgemv;
+extern opencl::Kernel kernel_sgemm;
+extern opencl::Kernel kernel_dot;
+
+/**
+ * @brief sgemv computation : Y = A*X + Y
+ * @param[in] matAdata float * for Matrix A
+ * @param[in] vecXdata float * for Vector X
+ * @param[in] vecYdata float * for Vector Y
+ * @param[in] dim1 length of vector X
+ * @param[in] dim2 length of vector Y
+ * @param[in] lda leading dimension of A (not used by the current kernel)
+ * @param[in] context RunLayerContext reference
+ */
+void sgemv_cl(const float *matAdata, const float *vecXdata, float *vecYdata,
+ unsigned int dim1, unsigned int dim2, unsigned int lda,
+ RunLayerContext &context);
+
+/**
+ * @brief dot computation : sum of all A[i] * X[i]
+ * @param[in] vecAdata float * for Vector A
+ * @param[in] vecXdata float * for Vector X
+ * @param[in] dim1 number of elements in both input vectors
+ * @param[in] context RunLayerContext reference
+ */
+float dot_cl(const float *vecAdata, const float *vecXdata, unsigned int dim1,
+ RunLayerContext &context);
+
+/**
+ * @brief sgemm computation : C = op(A)*op(B),
+ * where op(X) is one of X or X**T
+ * @param[in] A float * for Matrix A
+ * @param[in] B float * for Matrix B
+ * @param[in] C float * for Matrix C
+ * @param[in] M number of op(A)'s and C's rows
+ * @param[in] N number of op(B)'s and C's columns
+ * @param[in] K number of op(A)'s columns and op(B)'s rows
+ * @param[in] lda number of A's columns
+ * @param[in] ldb number of B's columns
+ * @param[in] ldc number of C's columns
+ * @param[in] context RunLayerContext reference
+ */
+void sgemm_cl(const float *A, const float *B, float *C, unsigned int M,
+ unsigned int N, unsigned int K, unsigned int lda,
+ unsigned int ldb, unsigned int ldc, RunLayerContext &context);
+
+} // namespace nntrainer
+#endif /* __BLAS_KERNELS_H__ */
diff --git a/nntrainer/layers/cl_layers/fc_layer_cl.cpp b/nntrainer/layers/cl_layers/fc_layer_cl.cpp
new file mode 100644
index 0000000000..b0a41c4e5f
--- /dev/null
+++ b/nntrainer/layers/cl_layers/fc_layer_cl.cpp
@@ -0,0 +1,288 @@
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2024 Debadri Samaddar
+ *
+ * @file fc_layer_cl.cpp
+ * @date 7 May 2024
+ * @brief This is Fully Connected Layer Class for Neural Network with OpenCl
+ * implementation
+ * @see https://github.com/nnstreamer/nntrainer
+ * @author Debadri Samaddar
+ * @bug No known bugs except for NYI items
+ *
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+namespace nntrainer {
+
+static constexpr size_t SINGLE_INOUT_IDX = 0;
+
+enum FCParams { weight, bias };
+
+FullyConnectedLayerCl::FullyConnectedLayerCl() :
+ LayerImpl(), fc_props(props::Unit()) {
+ weight_idx.fill(std::numeric_limits::max());
+}
+
+void FullyConnectedLayerCl::finalize(InitLayerContext &context) {
+ auto &weight_regularizer =
+ std::get(*layer_impl_props);
+ auto &weight_regularizer_constant =
+ std::get(*layer_impl_props);
+ auto &weight_initializer =
+ std::get(*layer_impl_props);
+ auto &weight_decay = std::get(*layer_impl_props);
+ auto &bias_decay = std::get(*layer_impl_props);
+ auto &bias_initializer = std::get(*layer_impl_props);
+ auto &disable_bias = std::get(*layer_impl_props);
+
+ auto unit = std::get(fc_props).get();
+
+ NNTR_THROW_IF(context.getNumInputs() != 1, std::invalid_argument)
+ << "Fully connected layer takes only one input";
+
+ std::vector output_dims(1);
+
+  /// @todo fc actually supports multidimensions. EffDimFlag shouldn't be fixed
+ /// like this.
+ context.setEffDimFlagInputDimension(0, 0b1001);
+ context.setDynDimFlagInputDimension(0, 0b1000);
+
+ bool is_nchw = (context.getFormat() == Tformat::NCHW);
+ /** set output dimensions */
+ auto const &in_dim = context.getInputDimensions()[0];
+ output_dims[0] = in_dim;
+ is_nchw ? output_dims[0].width(unit) : output_dims[0].channel(unit);
+
+ output_dims[0].setTensorType(
+ {context.getFormat(), context.getActivationDataType()});
+
+ context.setOutputDimensions(output_dims);
+
+ /** set weight specifications */
+  // @todo : This NCHW format setting is just temporary; it needs to be set by
+ // global configuration
+ TensorDim bias_dim(
+ 1, is_nchw ? 1 : unit, 1, is_nchw ? unit : 1,
+ TensorDim::TensorType(context.getFormat(), context.getWeightDataType()),
+ is_nchw ? 0b0001 : 0b0100);
+
+ TensorDim weight_dim(
+ 1, is_nchw ? 1 : unit, is_nchw ? in_dim.width() : 1,
+ is_nchw ? unit : in_dim.channel(),
+ TensorDim::TensorType(context.getFormat(), context.getWeightDataType()),
+ is_nchw ? 0b0011 : 0b0101);
+
+ weight_idx[FCParams::weight] = context.requestWeight(
+ weight_dim, weight_initializer, weight_regularizer,
+ weight_regularizer_constant, weight_decay, "weight", true);
+
+ if (disable_bias.empty() || disable_bias.get() == false) {
+ weight_idx[FCParams::bias] =
+ context.requestWeight(bias_dim, bias_initializer, WeightRegularizer::NONE,
+ 1.0f, bias_decay, "bias", true);
+ }
+}
+
+void FullyConnectedLayerCl::exportTo(
+ Exporter &exporter, const ml::train::ExportMethods &method) const {
+ LayerImpl::exportTo(exporter, method);
+ exporter.saveResult(fc_props, method, this);
+}
+
+void FullyConnectedLayerCl::setProperty(
+ const std::vector &values) {
+ auto remain_props = loadProperties(values, fc_props);
+ LayerImpl::setProperty(remain_props);
+}
+
+void FullyConnectedLayerCl::forwarding(RunLayerContext &context,
+ bool training) {
+
+ Tensor &weight = context.getWeight(weight_idx[FCParams::weight]);
+ Tensor &hidden_ = context.getOutput(SINGLE_INOUT_IDX);
+ Tensor &input_ = context.getInput(SINGLE_INOUT_IDX);
+
+ if (weight.getDataType() == nntrainer::Tdatatype::QINT4 ||
+ weight.getDataType() == nntrainer::Tdatatype::QINT8) {
+ Tdatatype dtype = input_.getDataType();
+
+ Tensor weight_(
+ {{weight.batch(), weight.channel(), weight.height(), weight.width()},
+ {weight.getFormat(), dtype}},
+ true);
+
+ unsigned int axis =
+ context.getWeightObject(weight_idx[FCParams::weight]).getOutputAxis();
+
+ weight.dequantize(weight_, axis);
+
+ fcDotProcess(input_, weight_, hidden_, context);
+ } else {
+ fcDotProcess(input_, weight, hidden_, context);
+ }
+
+ if (auto &disable_bias = std::get(*layer_impl_props);
+ disable_bias.empty() || disable_bias.get() == false) {
+ Tensor &bias = context.getWeight(weight_idx[FCParams::bias]);
+ hidden_.add_i(bias);
+ }
+}
+
+void FullyConnectedLayerCl::fcDotProcess(Tensor const &input,
+ Tensor const &weight, Tensor &result,
+ RunLayerContext &context) {
+ // to do:
+ // NNTR_THROW_IF(!contiguous, std::invalid_argument)
+ // << getName() << " is not contiguous. Cannot dot product.";
+
+ unsigned int dim1, dim2, mdim1, mdim2;
+ if (input.getFormat() == Tformat::NHWC) {
+ dim1 = input.batch() * input.height() * input.width();
+ dim2 = input.channel();
+ mdim1 = weight.batch() * weight.height() * weight.width();
+ mdim2 = weight.channel();
+ } else {
+ dim1 = input.batch() * input.channel() * input.height();
+ dim2 = input.width();
+ mdim1 = weight.batch() * weight.channel() * weight.height();
+ mdim2 = weight.width();
+ }
+
+ unsigned int M, N, K, lda, ldb, ldc;
+ if (dim2 != mdim1)
+ throw std::runtime_error("Error: incompatible dimensions for dot product");
+ K = mdim1; /** == dim2 */
+ N = mdim2;
+ M = dim1;
+ if (input.getFormat() == Tformat::NHWC) {
+ CREATE_IF_EMPTY_DIMS(result, input.batch(), N, input.height(),
+ input.width(),
+ input.getTensorType()); // NHWC Result Tensor
+ } else {
+ CREATE_IF_EMPTY_DIMS(result, input.batch(), input.channel(), input.height(),
+ N, input.getTensorType());
+ }
+
+ lda = dim2;
+ ldb = mdim2;
+ ldc =
+ (input.getFormat() == Tformat::NHWC) ? result.channel() : result.width();
+
+ if (input.getDataType() == ml::train::TensorDim::DataType::FP32) {
+ const float *data = input.getData();
+ const float *mdata = weight.getData();
+ float *rdata = result.getData();
+
+ /// shortcut handling in case of vector
+ /// for vector, (1 * K) == (K * 1) in current memory layout...
+    /// and please note that N, K, M are fixed placeholders after considering
+ /// transpose.
+ /// For example, there is no case like (1 * K) X (1 * K) while
+ /// (1 * K) X (1 * M) can be a case
+ /// case1: (1 * K) X (K * 1)
+ if (M == 1 && N == 1) {
+ *rdata = dot_cl(data, mdata, K, context) + (*rdata);
+ }
+ /// case2: (M * K) X (K * 1)
+ else if (N == 1) {
+ sgemv_cl(data, mdata, rdata, dim1, dim2, lda, context);
+ }
+ /// case3: (1 * K) X (K * N) = 1 * N = R
+ /// = R^T = (K * N) ^T * (1 * K) ^T = (N * K) * (K * 1) = (N * K) * (1 * K)
+ /// Effectively a translation of sgemv
+ else if (M == 1) {
+ sgemv_cl(mdata, data, rdata, mdim1, mdim2, ldb, context);
+ }
+ /// case others: use gemm
+ else {
+ sgemm_cl(data, mdata, rdata, M, N, K, lda, ldb, ldc, context);
+ }
+ } else
+ throw std::invalid_argument("Error: OpenCL fp16 is not supported yet.");
+}
+
+void FullyConnectedLayerCl::incremental_forwarding(RunLayerContext &context,
+ unsigned int from,
+ unsigned int to,
+ bool training) {
+ Tensor w;
+ Tensor &weight = w;
+ context.getWeight(weight, weight_idx[FCParams::weight]);
+
+ Tensor &input_ = context.getInput(SINGLE_INOUT_IDX);
+ Tensor &hidden_ = context.getOutput(SINGLE_INOUT_IDX);
+
+ TensorDim input_dim = input_.getDim();
+ TensorDim hidden_dim = hidden_.getDim();
+
+ TensorDim input_step_dim = input_dim;
+ TensorDim hidden_step_dim = hidden_dim;
+
+ if (from) {
+ NNTR_THROW_IF(to - from != 1, std::invalid_argument)
+ << "incremental step size is not 1";
+ from = 0;
+ to = 1;
+ }
+
+ input_step_dim.height(to - from);
+ hidden_step_dim.height(to - from);
+
+ // @todo: set reset stride as false. This implementation only works when batch
+ // size is 1
+ Tensor input_step = input_.getSharedDataTensor(input_step_dim, 0, true);
+ Tensor hidden_step = hidden_.getSharedDataTensor(hidden_step_dim, 0, true);
+
+ fcDotProcess(input_step, weight, hidden_step, context);
+
+ if (auto &disable_bias = std::get(*layer_impl_props);
+ disable_bias.empty() || disable_bias.get() == false) {
+ Tensor &bias = context.getWeight(weight_idx[FCParams::bias]);
+ hidden_step.add_i(bias);
+ }
+}
+
+void FullyConnectedLayerCl::calcDerivative(RunLayerContext &context) {
+ Tensor &weight = context.getWeight(weight_idx[FCParams::weight]);
+
+ const Tensor &derivative_ = context.getIncomingDerivative(SINGLE_INOUT_IDX);
+ Tensor &ret_ = context.getOutgoingDerivative(SINGLE_INOUT_IDX);
+
+ ret_.dot_deriv_wrt_1(weight, derivative_, false, false);
+}
+
+void FullyConnectedLayerCl::calcGradient(RunLayerContext &context) {
+ Tensor &djdw = context.getWeightGrad(weight_idx[FCParams::weight]);
+
+ const Tensor &derivative_ = context.getIncomingDerivative(SINGLE_INOUT_IDX);
+ Tensor &input_ = context.getInput(SINGLE_INOUT_IDX);
+
+ if (auto &disable_bias = std::get(*layer_impl_props);
+ disable_bias.empty() || disable_bias.get() == false) {
+ Tensor &djdb = context.getWeightGrad(weight_idx[FCParams::bias]);
+
+ if (context.isGradientFirstAccess(weight_idx[FCParams::bias])) {
+ derivative_.sum({0, 1, 2}, djdb);
+ } else {
+ /// @todo optimize below by adding beta to Tensor::sum
+ Tensor t = derivative_.sum({0, 1, 2});
+ djdb.add_i(t);
+ }
+ }
+
+ input_.dot_deriv_wrt_2(
+ djdw, derivative_, false, false,
+ !context.isGradientFirstAccess(weight_idx[FCParams::weight]));
+}
+
+} /* namespace nntrainer */
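As a usage note, fcDotProcess() above routes the FP32 path to the OpenCL wrappers by shape: a pure inner product (M == 1 and N == 1) goes to dot_cl, a matrix-vector shape (either side collapsing to 1) goes to sgemv_cl, and everything else falls back to sgemm_cl, while non-FP32 inputs currently throw. This mirrors the comments in the code and is stated here only as a reading aid.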
diff --git a/nntrainer/layers/cl_layers/fc_layer_cl.h b/nntrainer/layers/cl_layers/fc_layer_cl.h
new file mode 100644
index 0000000000..c94ecb22d7
--- /dev/null
+++ b/nntrainer/layers/cl_layers/fc_layer_cl.h
@@ -0,0 +1,130 @@
+// SPDX-License-Identifier: Apache-2.0
+/**
+ * Copyright (C) 2024 Debadri Samaddar
+ *
+ * @file fc_layer_cl.h
+ * @date 7 May 2024
+ * @brief This is Fully Connected Layer Class of Neural Network with OpenCl
+ * implementation
+ * @see https://github.com/nnstreamer/nntrainer
+ * @author Debadri Samaddar
+ * @bug No known bugs except for NYI items
+ *
+ */
+
+#ifndef __FC_LAYER_CL_H__
+#define __FC_LAYER_CL_H__
+#ifdef __cplusplus
+
+#include
+#include
+
+#define CREATE_IF_EMPTY_DIMS(tensor, ...) \
+ do { \
+ if (tensor.empty()) \
+ tensor = Tensor(__VA_ARGS__); \
+ } while (0);
+
+namespace nntrainer {
+
+/**
+ * @class FullyConnectedLayerCl
+ * @brief fully connected layer (OpenCL implementation)
+ */
+class FullyConnectedLayerCl : public LayerImpl {
+public:
+ /**
+ * @brief Constructor of Fully Connected Layer
+ */
+ FullyConnectedLayerCl();
+
+ /**
+ * @brief Destructor of Fully Connected Layer
+ */
+ ~FullyConnectedLayerCl() = default;
+
+ /**
+ * @brief Move constructor.
+   * @param[in] rhs FullyConnectedLayerCl to be moved.
+ */
+ FullyConnectedLayerCl(FullyConnectedLayerCl &&rhs) noexcept = default;
+
+ /**
+ * @brief Move assignment operator.
+   * @param[in] rhs FullyConnectedLayerCl to be moved.
+ */
+ FullyConnectedLayerCl &operator=(FullyConnectedLayerCl &&rhs) = default;
+
+ /**
+ * @copydoc Layer::finalize(InitLayerContext &context)
+ */
+ void finalize(InitLayerContext &context) override;
+
+ /**
+ * @copydoc Layer::forwarding(RunLayerContext &context, bool training)
+ */
+ void forwarding(RunLayerContext &context, bool training) override;
+
+ /**
+ * @copydoc Layer::incremental_forwarding(RunLayerContext &context, unsigned
+ * int from, unsigned int to, bool training)
+ */
+ void incremental_forwarding(RunLayerContext &context, unsigned int from,
+ unsigned int to, bool training) override;
+
+ /**
+ * @copydoc Layer::calcDerivative(RunLayerContext &context)
+ */
+ void calcDerivative(RunLayerContext &context) override;
+
+ /**
+ * @copydoc Layer::calcGradient(RunLayerContext &context)
+ */
+ void calcGradient(RunLayerContext &context) override;
+
+ /**
+ * @copydoc Layer::exportTo(Exporter &exporter, ml::train::ExportMethods
+ * method)
+ */
+ void exportTo(Exporter &exporter,
+ const ml::train::ExportMethods &method) const override;
+
+ /**
+ * @copydoc Layer::getType()
+ */
+ const std::string getType() const override {
+ return FullyConnectedLayerCl::type;
+ };
+
+ /**
+ * @brief Process data and dimensions for dot operation used in fc_layer
+ * @param[in] input Tensor
+ * @param[in] weight Tensor
+   * @param[in,out] result output Tensor
+ * @param[in] RunLayerContext reference
+ */
+ void fcDotProcess(Tensor const &input, Tensor const &weight, Tensor &result,
+ RunLayerContext &context);
+
+ /**
+ * @copydoc Layer::supportBackwarding()
+ */
+ bool supportBackwarding() const override { return true; }
+
+ /**
+ * @copydoc Layer::setProperty(const PropertyType type, const std::string
+ * &value)
+ */
+ void setProperty(const std::vector &values) override;
+
+ inline static const std::string type = "fully_connected";
+
+private:
+ std::tuple
+ fc_props; /**< fc layer properties : unit - number of output neurons */
+ std::array weight_idx; /**< indices of the weights */
+};
+} // namespace nntrainer
+
+#endif /* __cplusplus */
+#endif /* __FC_LAYER_CL_H__ */
diff --git a/nntrainer/layers/cl_layers/meson.build b/nntrainer/layers/cl_layers/meson.build
new file mode 100644
index 0000000000..2f1ba7fc03
--- /dev/null
+++ b/nntrainer/layers/cl_layers/meson.build
@@ -0,0 +1,8 @@
+cl_layer_sources = [
+ 'fc_layer_cl.cpp',
+ 'blas_kernels.cpp'
+]
+
+foreach s : cl_layer_sources
+ nntrainer_sources += meson.current_source_dir() / s
+endforeach
diff --git a/nntrainer/layers/conv2d_layer.cpp b/nntrainer/layers/conv2d_layer.cpp
index c059ae9caf..5d9dbc1e19 100644
--- a/nntrainer/layers/conv2d_layer.cpp
+++ b/nntrainer/layers/conv2d_layer.cpp
@@ -38,7 +38,8 @@ namespace {
static TensorDim calcCol2ImOutputDim(const TensorDim &out,
const TensorDim &kdim) {
- return TensorDim({kdim.getFeatureLen(), out.width() * out.height()});
+ return TensorDim({kdim.getFeatureLen(), out.width() * out.height()},
+ out.getTensorType());
}
/**
@@ -56,7 +57,10 @@ static void col2im(const Tensor &col_matrix, const TensorDim &kdim,
const std::array &mstride,
const std::array &dilation,
Tensor &image) {
- auto [pt, pb, pl, pr] = padding;
+ auto pt = padding[0];
+ auto pb = padding[1];
+ auto pl = padding[2];
+ auto pr = padding[3];
unsigned k_height = kdim.height();
unsigned k_width = kdim.width();
@@ -84,32 +88,48 @@ static void col2im(const Tensor &col_matrix, const TensorDim &kdim,
int h_stride_end = im_eff_height - eff_k_height - pt;
int w_stride_end = im_eff_width - eff_k_width - pl;
- unsigned col_w = 0;
- for (int hs = -pt; hs <= h_stride_end; hs += hstride) {
- for (int ws = -pl; ws <= w_stride_end; ws += wstride) {
- unsigned col_h = 0;
- int patch_height_end = hs + eff_k_height;
- int patch_width_end = ws + eff_k_width;
- for (unsigned c = 0; c < im_channel; c++) {
- for (int h = hs; h < patch_height_end; h += hdilation) {
- if (h < 0 || im_height <= h) {
- col_h += k_width;
- continue;
- }
- for (int w = ws; w < patch_width_end; w += wdilation) {
- if (w < 0 || im_width <= w) {
- col_h++;
+ auto apply_data = [&](T *val) {
+ unsigned col_w = 0;
+ for (int hs = -pt; hs <= h_stride_end; hs += hstride) {
+ for (int ws = -pl; ws <= w_stride_end; ws += wstride) {
+ unsigned col_h = 0;
+ int patch_height_end = hs + eff_k_height;
+ int patch_width_end = ws + eff_k_width;
+ for (unsigned c = 0; c < im_channel; c++) {
+ for (int h = hs; h < patch_height_end; h += hdilation) {
+ if (h < 0 || im_height <= h) {
+ col_h += k_width;
continue;
}
-
- float *val = image.getAddress(0, c, h, w);
- *val += col_matrix.getValue(0, 0, col_h, col_w);
- col_h++;
+ for (int w = ws; w < patch_width_end; w += wdilation) {
+ if (w < 0 || im_width <= w) {
+ col_h++;
+ continue;
+ }
+
+ val = image.getAddress(0, c, h, w);
+ *val += col_matrix.getValue(0, 0, col_h, col_w);
+ col_h++;
+ }
}
}
+ col_w++;
}
- col_w++;
}
+ };
+
+ if (image.getDataType() == nntrainer::Tdatatype::FP32) {
+ float val;
+ apply_data(&val);
+ }
+#ifdef ENABLE_FP16
+ else if (image.getDataType() == nntrainer::Tdatatype::FP16) {
+ _FP16 val;
+ apply_data(&val);
+ }
+#endif
+ else {
+ throw std::runtime_error("Not supported datatype");
}
}
@@ -179,7 +199,10 @@ static void im2col(const Tensor &in, const TensorDim &kdim,
// }
*/
- auto [pt, pb, pl, pr] = padding;
+ auto pt = padding[0];
+ auto pb = padding[1];
+ auto pl = padding[2];
+ auto pr = padding[3];
unsigned int channel = in.channel();
int in_height = in.height();
@@ -198,46 +221,62 @@ static void im2col(const Tensor &in, const TensorDim &kdim,
unsigned int out_width = (width - eff_k_width) / mstride[1] + 1;
out.reshape(
- TensorDim({out_height * out_width, in.channel() * k_height * k_width}));
- float *out_data = out.getData();
-
- int h_stride_end = height - eff_k_height - pt;
- int w_stride_end = width - eff_k_width - pl;
-
- /// get a patch, size of kernel
- /// hs is height_strided, ws is width_strided
- unsigned int owidth = out.width();
- unsigned int base_im_w = 0;
- for (int hs = -pt; hs <= h_stride_end; hs += mstride[0]) {
- unsigned int base_im_h = 0;
- int patch_height_end = eff_k_height + hs;
- /// map the patch to a single line looping through channel
- for (unsigned int c = 0; c < channel; ++c) {
- for (int h = hs; h < patch_height_end; h += dilation[0]) {
- if (h < 0 || in_height <= h) {
- base_im_h += k_width;
- continue;
- }
-
- unsigned int im_w = base_im_w;
- for (int ws = -pl; ws <= w_stride_end; ws += mstride[1]) {
- unsigned int im_h = base_im_h;
- int patch_width_end = eff_k_width + ws;
+ TensorDim({out_height * out_width, in.channel() * k_height * k_width},
+ in.getTensorType()));
+
+ auto apply_data = [&](T *out_data) {
+ int h_stride_end = height - eff_k_height - pt;
+ int w_stride_end = width - eff_k_width - pl;
+
+ /// get a patch, size of kernel
+ /// hs is height_strided, ws is width_strided
+ unsigned int owidth = out.width();
+ unsigned int base_im_w = 0;
+ for (int hs = -pt; hs <= h_stride_end; hs += mstride[0]) {
+ unsigned int base_im_h = 0;
+ int patch_height_end = eff_k_height + hs;
+ /// map the patch to a single line looping through channel
+ for (unsigned int c = 0; c < channel; ++c) {
+ for (int h = hs; h < patch_height_end; h += dilation[0]) {
+ if (h < 0 || in_height <= h) {
+ base_im_h += k_width;
+ continue;
+ }
- for (int w = ws; w < patch_width_end; w += dilation[1]) {
- if (w < 0 || in_width <= w) {
+ unsigned int im_w = base_im_w;
+ for (int ws = -pl; ws <= w_stride_end; ws += mstride[1]) {
+ unsigned int im_h = base_im_h;
+ int patch_width_end = eff_k_width + ws;
+
+ for (int w = ws; w < patch_width_end; w += dilation[1]) {
+ if (w < 0 || in_width <= w) {
+ im_h++;
+ continue;
+ }
+ out_data[im_w * owidth + im_h] = in.getValue(0, c, h, w);
im_h++;
- continue;
}
- out_data[im_w * owidth + im_h] = in.getValue(0, c, h, w);
- im_h++;
+ im_w++;
}
- im_w++;
+ base_im_h += k_width;
}
- base_im_h += k_width;
}
+ base_im_w += out_width;
}
- base_im_w += out_width;
+ };
+
+ if (out.getDataType() == nntrainer::Tdatatype::FP32) {
+ float *out_data = out.getData();
+ apply_data(out_data);
+ }
+#ifdef ENABLE_FP16
+ else if (out.getDataType() == nntrainer::Tdatatype::FP16) {
+ _FP16 *out_data = out.getData<_FP16>();
+ apply_data(out_data);
+ }
+#endif
+ else {
+ throw std::runtime_error("Not supported datatype");
}
}
@@ -279,9 +318,11 @@ void Conv2DLayer::finalize(InitLayerContext &context) {
auto &dilation =
std::get>(conv_props);
- TensorDim kernel_dim =
- TensorDim(filter_size, in_dim.channel(), kernel_size[0], kernel_size[1]);
- TensorDim bias_dim = TensorDim(1, filter_size, 1, 1);
+ auto in_t_type = in_dim.getTensorType();
+ in_t_type.data_type = context.getWeightDataType();
+ TensorDim kernel_dim = TensorDim(filter_size, in_dim.channel(),
+ kernel_size[0], kernel_size[1], in_t_type);
+ TensorDim bias_dim = TensorDim(1, filter_size, 1, 1, in_t_type);
padding = std::get(conv_props)
.compute(in_dim, kernel_dim, {stride[0], stride[1]},
@@ -309,6 +350,7 @@ void Conv2DLayer::finalize(InitLayerContext &context) {
out_dim.channel(filter_size);
out_dim.height((eff_in_height - eff_k_height) / stride[0] + 1);
out_dim.width((eff_in_width - eff_k_width) / stride[1] + 1);
+ out_dim.setTensorType(in_dim.getTensorType());
context.setOutputDimensions({out_dim});
NNTR_THROW_IF(eff_in_height < kernel_size[0] || eff_in_width < kernel_size[1],
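The im2col/col2im rewrite above wraps the former loop bodies in a lambda and instantiates it once per tensor data type, so FP16 support only costs one extra branch. A minimal sketch of that dispatch pattern, with assumed nntrainer-style names (the ENABLE_FP16/_FP16 guards follow the project's existing convention):

#include <stdexcept>

template <typename TensorT, typename Fn>
void dispatchByDtype(TensorT &t, Fn &&body) {
  if (t.getDataType() == nntrainer::Tdatatype::FP32) {
    body(t.template getData<float>()); // run the shared loop body on float data
  }
#ifdef ENABLE_FP16
  else if (t.getDataType() == nntrainer::Tdatatype::FP16) {
    body(t.template getData<_FP16>()); // same body, half-precision pointer
  }
#endif
  else {
    throw std::runtime_error("Not supported datatype");
  }
}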
diff --git a/nntrainer/layers/fc_layer.cpp b/nntrainer/layers/fc_layer.cpp
index de34f5f921..436a936439 100644
--- a/nntrainer/layers/fc_layer.cpp
+++ b/nntrainer/layers/fc_layer.cpp
@@ -40,8 +40,11 @@ enum FCParams { weight, bias };
enum LORAParams { loraA, loraB, loraTmp, loraOut };
FullyConnectedLayer::FullyConnectedLayer() :
- LayerImpl(), fc_props(props::Unit(), props::LoraRank(), props::LoraAlpha()) {
+ LayerImpl(),
+ lora_scaling(1.0f),
+ fc_props(props::Unit(), props::LoraRank(), props::LoraAlpha()) {
weight_idx.fill(std::numeric_limits::max());
+ lora_idx.fill(std::numeric_limits::max());
}
void FullyConnectedLayer::finalize(InitLayerContext &context) {
diff --git a/nntrainer/layers/fc_layer.h b/nntrainer/layers/fc_layer.h
index cb3726b020..44ef99d912 100644
--- a/nntrainer/layers/fc_layer.h
+++ b/nntrainer/layers/fc_layer.h
@@ -114,7 +114,7 @@ class FullyConnectedLayer : public LayerImpl {
lora_scaling - scaling factor of LoRA apply, i.e.,
lora_scaling = alpha / lora_rank */
std::array weight_idx; /**< indices of the weights */
- std::array lora_idx; /**< indices of the lora weights */
+ std::array lora_idx; /**< indices of the lora weights */
};
} // namespace nntrainer
diff --git a/nntrainer/layers/input_layer.cpp b/nntrainer/layers/input_layer.cpp
index eabd40b297..a67701da2c 100644
--- a/nntrainer/layers/input_layer.cpp
+++ b/nntrainer/layers/input_layer.cpp
@@ -34,7 +34,8 @@ static constexpr size_t SINGLE_INOUT_IDX = 0;
InputLayer::InputLayer() :
Layer(),
- input_props(props::Normalization(), props::Standardization()) {}
+ input_props(props::Normalization(), props::Standardization()),
+ is_inplace(true) {}
void InputLayer::setProperty(const std::vector &values) {
auto remain_props = loadProperties(values, input_props);
@@ -47,7 +48,7 @@ void InputLayer::forwarding(RunLayerContext &context, bool training) {
Tensor &hidden_ = context.getOutput(SINGLE_INOUT_IDX);
if (!context.executeInPlace()) {
Tensor &input_ = context.getInput(SINGLE_INOUT_IDX);
- hidden_.copy(input_);
+ hidden_.copyData(input_);
}
if (std::get(input_props))
@@ -70,7 +71,22 @@ void InputLayer::finalize(InitLayerContext &context) {
std::vector output_dims = context.getInputDimensions();
+ for (auto &d : output_dims) {
+ d.setDataType(context.getActivationDataType());
+ }
+
context.setOutputDimensions(output_dims);
+
+ is_inplace = true;
+
+ /**
+   * @note The input layer assumes that the incoming tensor is always FP32.
+   * Therefore, if the activation data type is not FP32, it does not support
+   * in-place operation.
+ */
+ if (context.getActivationDataType() != ml::train::TensorDim::DataType::FP32) {
+ is_inplace = false;
+ }
}
} /* namespace nntrainer */
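Two related changes in this file: the output dimensions now take the activation data type, and copyData() replaces copy(), presumably because copyData() converts values into the output's existing data type while copy() would also clone the source's layout and type; in-place execution is disabled whenever such a conversion is needed. A tiny sketch of the gating rule, assuming the FP32-input convention stated in the note above:

#include <tensor_dim.h>

// The input buffer is assumed to arrive as FP32; aliasing it is only safe when
// the activation dtype matches, otherwise a converting copy is required.
bool inputLayerCanRunInPlace(ml::train::TensorDim::DataType activation_dtype) {
  return activation_dtype == ml::train::TensorDim::DataType::FP32;
}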
diff --git a/nntrainer/layers/input_layer.h b/nntrainer/layers/input_layer.h
index f6728d676b..e9183e23d1 100644
--- a/nntrainer/layers/input_layer.h
+++ b/nntrainer/layers/input_layer.h
@@ -82,7 +82,7 @@ class InputLayer : public Layer {
/**
* @copydoc Layer::supportInPlace()
*/
- bool supportInPlace() const override { return true; }
+ bool supportInPlace() const override { return is_inplace; }
/**
* @copydoc Layer::exportTo(Exporter &exporter, ml::train::ExportMethods
@@ -105,6 +105,7 @@ class InputLayer : public Layer {
private:
std::tuple input_props;
+ bool is_inplace;
};
} // namespace nntrainer
diff --git a/nntrainer/layers/layer_context.cpp b/nntrainer/layers/layer_context.cpp
index fff2eb15ec..add78c09cb 100644
--- a/nntrainer/layers/layer_context.cpp
+++ b/nntrainer/layers/layer_context.cpp
@@ -126,13 +126,14 @@ const std::vector &InitLayerContext::getOutSpecs() const {
}
RunLayerContext::RunLayerContext(const std::string &name, bool trainable,
- float l, bool in_place_,
+ float l, bool in_place_, float loss_scale_,
const std::vector &w,
const std::vector &in,
const std::vector &out,
const std::vector &t) :
loss(l),
in_place(in_place_),
+ loss_scale(loss_scale_),
weights(w),
inputs(in),
outputs(out),
@@ -169,6 +170,19 @@ Tensor &RunLayerContext::getWeightGrad(unsigned int idx) const {
return weights[idx]->getGradientRef();
}
+/**
+ * @brief Get the FP32 master copy of the weight variable
+ *
+ * @param idx Identifier of the weight
+ * @return Tensor& Reference to the FP32 weight tensor
+ */
+Tensor &RunLayerContext::getWeightFP32(unsigned int idx) const {
+ if (!weights[idx]->hasGradient())
+ throw std::invalid_argument(
+ "Requesting gradient for a non-trainable weight.");
+ return weights[idx]->getVariableFP32Ref();
+}
+
/**
* @brief Get the Weight Optimizer Variable tensor object
*
@@ -402,6 +416,17 @@ bool RunLayerContext::isGradientClipByGlobalNorm(unsigned int idx) const {
return weights[idx]->isGradientClipByGlobalNorm();
}
+bool RunLayerContext::isMixedPrecision(unsigned int idx) const {
+ return weights[idx]->isMixedPrecision();
+}
+
+bool RunLayerContext::isMixedPrecision() const {
+ for (auto w : weights)
+ if (w->isMixedPrecision())
+ return true;
+ return false;
+}
+
/**
* @brief Get the tensor name
*
@@ -650,10 +675,12 @@ bool RunLayerContext::clCreateKernel(std::string kernel_string,
*/
std::string RunLayerContext::getKernelName(LayerKernel layerKernel) {
switch (layerKernel) {
- case LayerKernel::KERNEL_NAME1:
- return "kernel_name1";
- case LayerKernel::KERNEL_NAME2:
- return "kernel_name2";
+ case LayerKernel::SGEMV:
+ return "sgemv_cl";
+ case LayerKernel::DOT:
+ return "dot_cl";
+ case LayerKernel::SGEMM:
+ return "sgemm_cl";
default:
return "";
}
diff --git a/nntrainer/layers/layer_context.h b/nntrainer/layers/layer_context.h
index e5c6759638..2a32ba7287 100644
--- a/nntrainer/layers/layer_context.h
+++ b/nntrainer/layers/layer_context.h
@@ -63,7 +63,7 @@ class InitLayerContext {
const float max_norm = 0.0,
std::array tensor_type_ = {"NCHW", "FP32",
"FP32"},
- const float loss_scale = 0.0);
+ const float loss_scale = 1.0);
/**
* @brief get Tensor Format of Layer
*
@@ -348,6 +348,14 @@ class InitLayerContext {
*/
bool executeInPlace() const { return in_place; }
+ /**
+   * @brief get the initial loss scale value. It is handed to the
+   * RunLayerContext and updated there during training
+   *
+ * @return loss_scale
+ */
+ float getLossScale() const { return loss_scale; }
+
private:
std::vector input_dim; /**< Input dimensions for the layer */
bool in_place; /**< if the layer is expected to run in-place */
@@ -385,7 +393,7 @@ class RunLayerContext {
* @brief Construct a new Run Layer Context object
*
*/
- RunLayerContext() : loss(0.0), in_place(false) {}
+ RunLayerContext() : loss(0.0), in_place(false), loss_scale(1.0) {}
/**
* @brief Construct a new Run Layer Context object
@@ -396,6 +404,17 @@ class RunLayerContext {
std::get(props).set(name);
}
+ /**
+ * @brief Construct a new Run Layer Context object
+ *
+ */
+ RunLayerContext(const std::string &name, bool in_place_, float loss_scale_) :
+ RunLayerContext() {
+ in_place = in_place_;
+ std::get(props).set(name);
+ loss_scale = loss_scale_;
+ }
+
/**
* @brief Construct a new Run Layer Context object
*
@@ -403,13 +422,15 @@ class RunLayerContext {
* @param trainable if the layer is trainable
* @param l loss of the layer
* @param in_place_ execution in-place of the layer
+ * @param loss_scale loss_scale of the layer
* @param w weights of the layer
* @param in inputs of the layer
* @param out outputs of the layer
* @param t extra tensors of the layer
*/
RunLayerContext(const std::string &name, bool trainable, float l,
- bool in_place_, const std::vector &w,
+ bool in_place_, float loss_scale_,
+ const std::vector &w,
const std::vector &in,
const std::vector &out,
const std::vector &t);
@@ -463,6 +484,15 @@ class RunLayerContext {
Tensor &getWeightGrad(unsigned int idx) const;
/**
+   * @brief Get the FP32 master copy of the weight variable
+   *
+   * @param idx Identifier of the weight
+   * @return Tensor& Reference to the FP32 weight tensor
+ */
+ Tensor &getWeightFP32(unsigned int idx) const;
+
+ /**
+
* @brief Get the Weight Optimizer Variable tensor object
*
* @param idx Identifier of the weight
@@ -659,6 +689,20 @@ class RunLayerContext {
*/
bool isGradientClipByGlobalNorm(unsigned int idx) const;
+ /**
+   * @brief check if the weight is mixed precision
+ *
+ * @param idx index
+ * @return bool true if it is mixed precision
+ */
+ bool isMixedPrecision(unsigned int idx) const;
+
+ /**
+   * @brief check if any weight of the layer is mixed precision
+ * @return bool true if it is mixed precision
+ */
+ bool isMixedPrecision() const;
+
/**
* @brief Get the tensor name
*
@@ -830,8 +874,9 @@ class RunLayerContext {
* getKernelName function.
*/
enum LayerKernel {
- KERNEL_NAME1 = 1, /**< placeholder for kernel name */
- KERNEL_NAME2 = 2 /**< placeholder for kernel name */
+ SGEMV = 1, /**< placeholder for kernel name */
+ DOT = 2, /**< placeholder for kernel name */
+ SGEMM = 4 /**< placeholder for kernel name */
};
/**
@@ -874,10 +919,29 @@ class RunLayerContext {
*/
ml::train::LayerComputeEngine getComputeEngine() { return compute_engine; }
+ /**
+ * @brief get loss scale
+ * @return loss scale
+ */
+ float getLossScale() { return loss_scale; }
+
+ /**
+   * @brief set the loss scale and propagate it to every weight
+   *
+   * @param scale new loss scale value
+ */
+ void setLossScale(float scale) {
+ loss_scale = scale;
+ for (auto w : weights) {
+ w->setLossScale(scale);
+ }
+ }
+
private:
std::tuple props; /**< props of the layer */
float loss; /**< loss of the layer */
- bool in_place; /**< if the layer is expected to run in-place */
+ bool in_place; /**< if the layer is expected to run in-place */
+ float loss_scale; /**< loss_scale of the layer */
std::vector weights; /**< weights of the layer */
std::vector inputs; /**< inputs of the layer */
diff --git a/nntrainer/layers/layer_devel.h b/nntrainer/layers/layer_devel.h
index 54ce1a0ee9..44a87cc7e9 100644
--- a/nntrainer/layers/layer_devel.h
+++ b/nntrainer/layers/layer_devel.h
@@ -259,6 +259,11 @@ class Layer {
* @return true if supports backwarding, else false
*/
virtual bool supportBackwarding() const = 0;
+
+ /**
+ * @brief Set loss scale factor
+ */
+ virtual void setLossScale(float scale) {}
};
/// @todo Decide where to put and how to implement(#986)
diff --git a/nntrainer/layers/layer_node.cpp b/nntrainer/layers/layer_node.cpp
index 8b18d80762..114555fee4 100644
--- a/nntrainer/layers/layer_node.cpp
+++ b/nntrainer/layers/layer_node.cpp
@@ -180,6 +180,7 @@ LayerNode::LayerNode(std::unique_ptr &&l) :
inplace(InPlace::NONE),
needs_calc_derivative(false),
needs_calc_gradient(false),
+
output_connections(),
run_context(nullptr),
layer_node_props(
@@ -190,7 +191,8 @@ LayerNode::LayerNode(std::unique_ptr &&l) :
new RealizationPropsType(props::Flatten(), props::Activation())),
loss(new props::Loss()),
regularization_loss(0.0f),
- exec_order({0, 0, 0, 0}) {
+ exec_order({0, 0, 0, 0}),
+ needs_output_set_zero(false) {
if (layer && layer->getType() == TimeDistLayer::type) {
std::get(*layer_node_props).set(true);
}
@@ -475,6 +477,9 @@ void LayerNode::read(std::ifstream &file, bool opt_var) {
/// @note shared weights are only be read at the first acecss
if (run_context->isGradientLastAccess(i)) {
run_context->getWeight(i).read(file);
+ if (run_context->isMixedPrecision(i) && getTrainable()) {
+ run_context->getWeightFP32(i).copyData(run_context->getWeight(i));
+ }
}
}
}
@@ -599,7 +604,7 @@ InitLayerContext LayerNode::finalize(const std::vector &input_dims,
const auto &scope = getSharedFrom().empty() ? getName() : getSharedFrom();
float max_norm = 0.0;
- float loss_scale = 0.0;
+ float loss_scale = 1.0;
if (!std::get(*layer_node_props).empty())
max_norm = std::get(*layer_node_props).get();
@@ -748,8 +753,21 @@ LayerNode::refinalize(const std::vector &input_dims) {
*/
void LayerNode::forwarding(bool training) {
loss->set(run_context->getRegularizationLoss());
+
PROFILE_TIME_START(forward_event_key);
+ if (needsOutputSetZero()) {
+ for (unsigned int i = 0; i < run_context->getNumOutputs(); ++i) {
+ run_context->getOutput(i).setValue(0);
+ run_context->getOutgoingDerivative(i).setValue(0);
+ }
+
+ for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) {
+ run_context->getWeightGrad(i).setValue(0);
+ }
+ }
+
layer->forwarding(*run_context, training);
+ needsOutputSetZero(false);
PROFILE_TIME_END(forward_event_key);
TRACE_MEMORY() << getName() + ": F";
TRACE_TIME() << getName() + ": F";
@@ -864,10 +882,11 @@ float LayerNode::getLoss() const { return *loss; }
void LayerNode::configureRunContext(const std::vector &weights,
const std::vector &inputs,
const std::vector &outputs,
- const std::vector &tensors) {
+ const std::vector &tensors,
+ float loss_scale) {
run_context = std::make_unique(
- getName(), getTrainable(), 0.0f, executeInPlace() != InPlace::NONE, weights,
- inputs, outputs, tensors);
+ getName(), getTrainable(), 0.0f, executeInPlace() != InPlace::NONE,
+ loss_scale, weights, inputs, outputs, tensors);
}
/**
diff --git a/nntrainer/layers/layer_node.h b/nntrainer/layers/layer_node.h
index 93e7ac7069..c2202f20aa 100644
--- a/nntrainer/layers/layer_node.h
+++ b/nntrainer/layers/layer_node.h
@@ -487,6 +487,7 @@ class LayerNode final : public ml::train::Layer, public GraphNode {
const std::vector getOutputDimensions() const;
/**
* @brief Get the Weight object
+ * Currently, only unit tests use this function.
*
* @param idx Identifier of the weight
* @return Weight& Reference to the weight
@@ -495,11 +496,11 @@ class LayerNode final : public ml::train::Layer, public GraphNode {
NNTR_THROW_IF(!run_context, std::runtime_error)
<< __func__ << " layer needs to be finalized first!";
if (run_context->weightHasGradient(idx)) {
- return Weight(run_context->getWeight(idx),
- run_context->getWeightGrad(idx),
- run_context->getWeightName(idx));
+ return Weight(
+ run_context->getWeight(idx), run_context->getWeightGrad(idx),
+ run_context->getWeightFP32(idx), run_context->getWeightName(idx));
} else {
- return Weight(run_context->getWeight(idx), Tensor(),
+ return Weight(run_context->getWeight(idx), Tensor(), Tensor(),
run_context->getWeightName(idx));
}
}
@@ -819,7 +820,8 @@ class LayerNode final : public ml::train::Layer, public GraphNode {
void configureRunContext(const std::vector &weights,
const std::vector &inputs,
const std::vector &outputs,
- const std::vector &tensors);
+ const std::vector &tensors,
+ float loss_scale);
/**
* @brief Preset modes for printing summary for the layer
@@ -877,6 +879,13 @@ class LayerNode final : public ml::train::Layer, public GraphNode {
needs_calc_derivative = nb;
}
+ /**
+ * @brief Set whether the layer output needs to be reset to zero (mixed precision)
+ *
+ * @param nb true if the layer output needs to be reset to zero, else false
+ */
+ void needsOutputSetZero(bool nb) { needs_output_set_zero = nb; }
+
/**
* @brief Set if the layer needs to do calculation of gradients
*
@@ -898,6 +907,13 @@ class LayerNode final : public ml::train::Layer, public GraphNode {
*/
bool needsCalcGradient() { return needs_calc_gradient; }
+ /**
+ * @brief Check whether the layer output needs to be reset to zero (mixed precision)
+ *
+ * @return true if the layer output needs to be reset to zero, else false
+ */
+ bool needsOutputSetZero() { return needs_output_set_zero; }
+
private:
/**
* @brief Get the Input Layers object
@@ -964,6 +980,9 @@ properties in the context/graph unless intended. */
ExecutionOrder exec_order; /**< order/location of execution for this node
in forward and backwarding operations */
+ bool needs_output_set_zero; /**< cache whether this layer's outputs need to
+ be reset to zero (mixed precision) */
+
/**
* @brief Get the effective layer managed by this layer node
*
diff --git a/nntrainer/layers/loss/loss_layer.cpp b/nntrainer/layers/loss/loss_layer.cpp
index 40f74717f8..8d18878f49 100644
--- a/nntrainer/layers/loss/loss_layer.cpp
+++ b/nntrainer/layers/loss/loss_layer.cpp
@@ -22,8 +22,12 @@ void LossLayer::finalize(InitLayerContext &context) {
d.setDataType(
str_converter::from_string("FP32"));
-
+
context.setOutputDimensions(output_dim);
+
+ is_inplace = true;
+ if (context.getActivationDataType() != ml::train::TensorDim::DataType::FP32)
+ is_inplace = false;
}
void LossLayer::updateLoss(RunLayerContext &context, const Tensor &l) {
@@ -36,6 +40,13 @@ void LossLayer::updateLoss(RunLayerContext &context, const Tensor &l) {
context.setLoss(loss_sum / (float)l.batch());
}
+void LossLayer::applyLossScale(RunLayerContext &context, Tensor &ret_deriv) {
+
+ float loss_scale = context.getLossScale();
+ if (loss_scale != 1.0)
+ ret_deriv.multiply_i(loss_scale);
+}
+
/**
* @copydoc Layer::setProperty(const std::vector &values)
*/
diff --git a/nntrainer/layers/loss/loss_layer.h b/nntrainer/layers/loss/loss_layer.h
index 00b520f6e6..418777606c 100644
--- a/nntrainer/layers/loss/loss_layer.h
+++ b/nntrainer/layers/loss/loss_layer.h
@@ -47,6 +47,8 @@ class LossLayer : public Layer {
*/
virtual bool supportBackwarding() const override { return true; }
+ bool supportInPlace() const override { return is_inplace; }
+
/**
* @copydoc Layer::requireLabel()
*/
@@ -60,8 +62,17 @@ class LossLayer : public Layer {
*/
void updateLoss(RunLayerContext &context, const Tensor &l);
+ /**
+ * @brief apply loss scale to the returned derivative
+ * @param context run context holding the loss scale
+ * @param l derivative tensor to scale
+ */
+ void applyLossScale(RunLayerContext &context, Tensor &l);
+
Tensor
l; /**< loss tensor to store intermediate value to calculate loss value */
+
+ bool is_inplace; /**< whether this loss layer runs in-place */
};
} // namespace nntrainer
diff --git a/nntrainer/layers/loss/mse_loss_layer.cpp b/nntrainer/layers/loss/mse_loss_layer.cpp
index 7f7bd1626f..356acae6f5 100644
--- a/nntrainer/layers/loss/mse_loss_layer.cpp
+++ b/nntrainer/layers/loss/mse_loss_layer.cpp
@@ -20,7 +20,16 @@ static constexpr size_t SINGLE_INOUT_IDX = 0;
void MSELossLayer::forwarding(RunLayerContext &context, bool training) {
Tensor &hidden_ = context.getOutput(SINGLE_INOUT_IDX);
- Tensor &y = context.getInput(SINGLE_INOUT_IDX);
+
+ Tensor empty_tensor;
+ Tensor &y = context.getInput(SINGLE_INOUT_IDX).getDataType() ==
+ ml::train::TensorDim::DataType::FP32
+ ? context.getInput(SINGLE_INOUT_IDX)
+ : empty_tensor;
+
+ if (y.empty())
+ y = context.getInput(SINGLE_INOUT_IDX)
+ .clone(ml::train::TensorDim::DataType::FP32);
// hidden_ <- y2 - y;
if (context.isLabelAvailable(SINGLE_INOUT_IDX)) {
@@ -41,9 +50,28 @@ void MSELossLayer::forwarding(RunLayerContext &context, bool training) {
}
void MSELossLayer::calcDerivative(RunLayerContext &context) {
- Tensor &ret_derivative = context.getOutgoingDerivative(SINGLE_INOUT_IDX);
+ Tensor empty_tensor;
+
+ Tensor &ret_derivative =
+ context.getOutgoingDerivative(SINGLE_INOUT_IDX).getDataType() ==
+ ml::train::TensorDim::DataType::FP32
+ ? context.getOutgoingDerivative(SINGLE_INOUT_IDX)
+ : empty_tensor;
+
+ if (ret_derivative.empty())
+ ret_derivative = context.getOutgoingDerivative(SINGLE_INOUT_IDX)
+ .clone(ml::train::TensorDim::DataType::FP32);
+ Tensor empty_tensor1;
+ Tensor &y = context.getInput(SINGLE_INOUT_IDX).getDataType() ==
+ ml::train::TensorDim::DataType::FP32
+ ? context.getInput(SINGLE_INOUT_IDX)
+ : empty_tensor1;
+
+ if (y.empty())
+ y = context.getInput(SINGLE_INOUT_IDX)
+ .clone(ml::train::TensorDim::DataType::FP32);
+
const Tensor &y2 = context.getIncomingDerivative(SINGLE_INOUT_IDX);
- Tensor &y = context.getInput(SINGLE_INOUT_IDX);
y.subtract(y2, ret_derivative);
float divider = ((float)y.size()) / 2;
@@ -51,6 +79,16 @@ void MSELossLayer::calcDerivative(RunLayerContext &context) {
throw std::runtime_error(
"[MSELossLayer::calcDerivative] Error when calculating loss");
}
+
+ // Loss scaling needs full precision for ret_derivative. Therefore,
+ // ret_derivative should be FP32 while the scale is applied, and afterwards
+ // it must be converted back to the original type for backpropagation.
+
+ LossLayer::applyLossScale(context, ret_derivative);
+
+ if (context.getOutgoingDerivative(SINGLE_INOUT_IDX).getDataType() !=
+ ml::train::TensorDim::DataType::FP32)
+ context.getOutgoingDerivative(SINGLE_INOUT_IDX).copyData(ret_derivative);
}
} // namespace nntrainer
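In the MSE hunks above, the outgoing derivative is promoted to FP32 whenever the activation type is half precision, scaled by the loss scale, and only then copied back into the lower-precision tensor. A reduced sketch of that promote, scale, copy-back flow using plain vectors instead of Tensor (names are illustrative, not nntrainer API; half_like stands in for a real FP16 type):

#include <cstddef>
#include <vector>

// Stand-in for a half-precision buffer; a real implementation would use
// __fp16 / _Float16, omitted here to keep the sketch portable.
using half_like = float;

void mse_backward(const std::vector<half_like> &y,
                  const std::vector<half_like> &label,
                  std::vector<half_like> &out_deriv, float loss_scale) {
  // 1. Compute the derivative in full precision: 2 * (y - label) / size.
  std::vector<float> deriv_fp32(y.size());
  for (std::size_t i = 0; i < y.size(); ++i)
    deriv_fp32[i] = (y[i] - label[i]) * 2.0f / static_cast<float>(y.size());

  // 2. Apply the loss scale while still in FP32.
  if (loss_scale != 1.0f)
    for (auto &d : deriv_fp32)
      d *= loss_scale;

  // 3. Copy back to the (possibly lower precision) outgoing derivative.
  for (std::size_t i = 0; i < y.size(); ++i)
    out_deriv[i] = static_cast<half_like>(deriv_fp32[i]);
}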
diff --git a/nntrainer/layers/loss/mse_loss_layer.h b/nntrainer/layers/loss/mse_loss_layer.h
index 387e92b3b5..829b921668 100644
--- a/nntrainer/layers/loss/mse_loss_layer.h
+++ b/nntrainer/layers/loss/mse_loss_layer.h
@@ -51,6 +51,7 @@ class MSELossLayer : public LossLayer {
const std::string getType() const override { return MSELossLayer::type; };
inline static const std::string type = "mse";
+
};
} // namespace nntrainer
diff --git a/nntrainer/layers/lstm.cpp b/nntrainer/layers/lstm.cpp
index d5f13a1fc5..be313a0aca 100644
--- a/nntrainer/layers/lstm.cpp
+++ b/nntrainer/layers/lstm.cpp
@@ -509,21 +509,27 @@ void LSTMLayer::finalize(InitLayerContext &context) {
}
// hidden_state_dim : [ batch_size, 1, max_timestep, unit ]
- const TensorDim hidden_state_dim(batch_size, 1, max_timestep, unit,
- weight_tensor_type);
+ TensorDim hidden_state_dim(batch_size, 1, max_timestep, unit,
+ weight_tensor_type);
+ hidden_state_dim.setDataType(context.getActivationDataType());
+
wt_idx[LSTMParams::hidden_state] = context.requestTensor(
hidden_state_dim, "hidden_state", Tensor::Initializer::NONE, true,
TensorLifespan::ITERATION_LIFESPAN);
// cell_state_dim : [ batch_size, 1, max_timestep, unit ]
- const TensorDim cell_state_dim(batch_size, 1, max_timestep, unit,
- weight_tensor_type);
+ TensorDim cell_state_dim(batch_size, 1, max_timestep, unit,
+ weight_tensor_type);
+ cell_state_dim.setDataType(context.getActivationDataType());
+
wt_idx[LSTMParams::cell_state] = context.requestTensor(
cell_state_dim, "cell_state", Tensor::Initializer::NONE, true,
TensorLifespan::ITERATION_LIFESPAN);
// ifgo_dim : [ batch_size, 1, max_timestep, NUM_GATE * unit ]
- const TensorDim ifgo_dim(batch_size, 1, max_timestep, NUM_GATE * unit,
- weight_tensor_type);
+ TensorDim ifgo_dim(batch_size, 1, max_timestep, NUM_GATE * unit,
+ weight_tensor_type);
+ ifgo_dim.setDataType(context.getActivationDataType());
+
wt_idx[LSTMParams::ifgo] =
context.requestTensor(ifgo_dim, "ifgo", Tensor::Initializer::NONE, true,
TensorLifespan::ITERATION_LIFESPAN);
@@ -576,21 +582,27 @@ void LSTMLayer::finalize(InitLayerContext &context) {
}
// reverse_hidden_state_dim : [ batch_size, 1, max_timestep, unit ]
- const TensorDim reverse_hidden_state_dim(batch_size, 1, max_timestep, unit,
- weight_tensor_type);
+ TensorDim reverse_hidden_state_dim(batch_size, 1, max_timestep, unit,
+ weight_tensor_type);
+ reverse_hidden_state_dim.setDataType(context.getActivationDataType());
+
wt_idx[LSTMParams::reverse_hidden_state] = context.requestTensor(
reverse_hidden_state_dim, "reverse_hidden_state",
Tensor::Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN);
// reverse_cell_state_dim : [ batch_size, 1, max_timestep, unit ]
- const TensorDim reverse_cell_state_dim(batch_size, 1, max_timestep, unit,
- weight_tensor_type);
+ TensorDim reverse_cell_state_dim(batch_size, 1, max_timestep, unit,
+ weight_tensor_type);
+ reverse_cell_state_dim.setDataType(context.getActivationDataType());
+
wt_idx[LSTMParams::reverse_cell_state] = context.requestTensor(
reverse_cell_state_dim, "reverse_cell_state", Tensor::Initializer::NONE,
true, TensorLifespan::ITERATION_LIFESPAN);
// reverse_ifgo_dim : [ batch_size, 1, max_timestep, NUM_GATE * unit ]
- const TensorDim reverse_ifgo_dim(batch_size, 1, max_timestep,
- NUM_GATE * unit, weight_tensor_type);
+ TensorDim reverse_ifgo_dim(batch_size, 1, max_timestep, NUM_GATE * unit,
+ weight_tensor_type);
+ reverse_ifgo_dim.setDataType(context.getActivationDataType());
+
wt_idx[LSTMParams::reverse_ifgo] = context.requestTensor(
reverse_ifgo_dim, "reverse_ifgo", Tensor::Initializer::NONE, true,
TensorLifespan::ITERATION_LIFESPAN);
@@ -598,8 +610,10 @@ void LSTMLayer::finalize(InitLayerContext &context) {
if (dropout_rate > epsilon) {
// dropout_mask_dim = [ batch, 1, time_iteration, unit ]
- const TensorDim dropout_mask_dim(batch_size, 1, max_timestep, unit,
- weight_tensor_type);
+ TensorDim dropout_mask_dim(batch_size, 1, max_timestep, unit,
+ weight_tensor_type);
+ dropout_mask_dim.setDataType(context.getActivationDataType());
+
wt_idx[LSTMParams::dropout_mask] = context.requestTensor(
dropout_mask_dim, "dropout_mask", Tensor::Initializer::NONE, false,
TensorLifespan::ITERATION_LIFESPAN);
diff --git a/nntrainer/layers/lstm.h b/nntrainer/layers/lstm.h
index f35fdf8815..a9b2cac7d7 100644
--- a/nntrainer/layers/lstm.h
+++ b/nntrainer/layers/lstm.h
@@ -99,7 +99,6 @@ class LSTMLayer : public LSTMCore {
inline static const std::string type = "lstm";
-private:
static constexpr unsigned int NUM_GATE = 4;
/** common properties like Unit, IntegrateBias, HiddenStateActivation and
diff --git a/nntrainer/layers/pooling2d_layer.cpp b/nntrainer/layers/pooling2d_layer.cpp
index a68e42e8d0..b53ca354f2 100644
--- a/nntrainer/layers/pooling2d_layer.cpp
+++ b/nntrainer/layers/pooling2d_layer.cpp
@@ -6,6 +6,7 @@
* @date 12 June 2020
* @see https://github.com/nnstreamer/nntrainer
* @author Jijoong Moon
+ * @author Jiho Chu
* @bug No known bugs except for NYI items
* @brief This is 2 Dimensional Pooling Layer Class for Neural Network
*
@@ -26,6 +27,13 @@ namespace nntrainer {
static constexpr size_t SINGLE_INOUT_IDX = 0;
+/**
+ * @brief helper functor type for the pooling handlers
+ */
+template <typename T> struct PoolFunc {
+ typedef std::function<T(const T *, int, int, int)> Type;
+};
+
Pooling2DLayer::Pooling2DLayer(
const std::array &padding_) :
Layer(),
@@ -96,6 +104,7 @@ void Pooling2DLayer::finalize(InitLayerContext &context) {
out_dim.channel(in_dim.channel());
out_dim.height((eff_in_height - pool_size[0]) / stride[0] + 1);
out_dim.width((eff_in_width - pool_size[1]) / stride[1] + 1);
+ out_dim.setDataType(in_dim.getDataType());
context.setOutputDimensions({out_dim});
/**
@@ -111,13 +120,17 @@ void Pooling2DLayer::finalize(InitLayerContext &context) {
* // clang-format on
*/
if (pooling_type == props::PoolingTypeInfo::Enum::global_max) {
+ auto helper_dim = in_dim;
+ helper_dim.setDataType(ml::train::TensorDim::DataType::FP32);
pool_helper_idx =
- context.requestTensor(in_dim, "helper_idx", Tensor::Initializer::NONE,
+ context.requestTensor(helper_dim, "helper_idx", Tensor::Initializer::NONE,
false, TensorLifespan::ITERATION_LIFESPAN);
- pool_helper_size.resize(in_dim.batch() * in_dim.channel());
+ pool_helper_size.resize(helper_dim.batch() * helper_dim.channel());
} else {
+ auto helper_dim = out_dim;
+ helper_dim.setDataType(ml::train::TensorDim::DataType::FP32);
pool_helper_idx =
- context.requestTensor(out_dim, "helper_idx", Tensor::Initializer::NONE,
+ context.requestTensor(helper_dim, "helper_idx", Tensor::Initializer::NONE,
false, TensorLifespan::ITERATION_LIFESPAN);
}
}
@@ -172,15 +185,13 @@ void Pooling2DLayer::calcDerivative(RunLayerContext &context) {
unsigned int J, K;
result.setZero();
- float *result_data = result.getData();
unsigned int out_map_size = deriv.height() * deriv.width();
unsigned int in_map_size = height * width;
- switch (pooling_type) {
- case props::PoolingTypeInfo::Enum::max: {
+ auto apply_max = [&](T *result_data) {
const int *iter = pool_helper.getData();
- const float *deriv_data = deriv.getData();
+ const T *deriv_data = deriv.getData();
for (unsigned int b = 0; b < batch; ++b) {
for (unsigned int c = 0; c < channel; ++c) {
for (unsigned int i = 0; i < out_map_size; ++i) {
@@ -195,9 +206,9 @@ void Pooling2DLayer::calcDerivative(RunLayerContext &context) {
result_data += in_map_size;
}
}
- } break;
- case props::PoolingTypeInfo::Enum::global_average:
- case props::PoolingTypeInfo::Enum::average: {
+ };
+
+ auto apply_average = [&](T *result_data) {
int height_stride_end = height - p_height + pt;
int width_stride_end = width - p_width + pl;
const int *iter = pool_helper.getData();
@@ -207,7 +218,7 @@ void Pooling2DLayer::calcDerivative(RunLayerContext &context) {
for (int j = -pt; j <= height_stride_end; j += stride[0]) {
K = 0;
for (int k = -pl; k <= width_stride_end; k += stride[1]) {
- float del = deriv.getValue(b, i, J, K) / *iter;
+ T del = deriv.getValue(b, i, J, K) / *iter;
int patch_height_end =
std::min(static_cast(j + p_height), height);
int patch_width_end =
@@ -217,7 +228,7 @@ void Pooling2DLayer::calcDerivative(RunLayerContext &context) {
for (int h = start_h; h < patch_height_end; ++h) {
for (int w = start_w; w < patch_width_end; ++w) {
result.setValue(b, i, h, w,
- result.getValue(b, i, h, w) + del);
+ result.getValue(b, i, h, w) + del);
}
}
iter++;
@@ -227,15 +238,16 @@ void Pooling2DLayer::calcDerivative(RunLayerContext &context) {
}
}
}
- } break;
- case props::PoolingTypeInfo::Enum::global_max: {
- const float *deriv_data = deriv.getData();
+ };
+
+ auto apply_global_max = [&](T *result_data) {
+ const T *deriv_data = deriv.getData();
for (unsigned int b = 0; b < batch; b++) {
for (unsigned int c = 0; c < channel; c++) {
const int *iter =
pool_helper.getData() + pool_helper.getIndex(b, c, 0, 0);
unsigned int helper_size = pool_helper_size[b * channel + c];
- float der = *deriv_data / helper_size;
+ T der = *deriv_data / static_cast(helper_size);
for (unsigned int idx = 0; idx < helper_size; idx++)
result_data[iter[idx]] += der;
@@ -244,7 +256,40 @@ void Pooling2DLayer::calcDerivative(RunLayerContext &context) {
result_data += in_map_size;
}
}
- } break;
+ };
+
+ switch (pooling_type) {
+ case props::PoolingTypeInfo::Enum::max:
+ if (in_dim.getDataType() == ml::train::TensorDim::DataType::FP32)
+ apply_max(result.getData());
+#ifdef ENABLE_FP16
+ else if (in_dim.getDataType() == ml::train::TensorDim::DataType::FP16)
+ apply_max(result.getData<_FP16>());
+#endif
+ else
+ throw std::runtime_error("Not supported datatype");
+ break;
+ case props::PoolingTypeInfo::Enum::global_average:
+ case props::PoolingTypeInfo::Enum::average:
+ if (in_dim.getDataType() == ml::train::TensorDim::DataType::FP32)
+ apply_average(result.getData());
+#ifdef ENABLE_FP16
+ else if (in_dim.getDataType() == ml::train::TensorDim::DataType::FP16)
+ apply_average(result.getData<_FP16>());
+#endif
+ else
+ throw std::runtime_error("Not supported datatype");
+ break;
+ case props::PoolingTypeInfo::Enum::global_max:
+ if (in_dim.getDataType() == ml::train::TensorDim::DataType::FP32)
+ apply_global_max(result.getData());
+#ifdef ENABLE_FP16
+ else if (in_dim.getDataType() == ml::train::TensorDim::DataType::FP16)
+ apply_global_max(result.getData<_FP16>());
+#endif
+ else
+ throw std::runtime_error("Not supported datatype");
+ break;
default:
throw std::runtime_error("Error: Unknown Pooling Type");
}
@@ -290,124 +335,167 @@ void Pooling2DLayer::pooling2d(Tensor &in, bool training, Tensor &output,
* @param start_w (width index pointing the start of the patch)
* @return result value of pooling
*/
- std::function pool_fn;
+ PoolFunc<float>::Type pool_fn_fp32;
+#ifdef ENABLE_FP16
+ PoolFunc<_FP16>::Type pool_fn_fp16;
+#endif
unsigned int max_idx_count = 0;
- switch (pooling_type) {
- case props::PoolingTypeInfo::Enum::max: {
- pool_fn = [&](const float *in_data, int channel_idx, int start_h,
- int start_w) {
- int end_h = start_h + patch_height;
- int end_w = start_w + patch_width;
-
- float max_val = std::numeric_limits::lowest();
-
- int cur_max_idx = -1;
- int eff_end_h = std::min(end_h, in_height);
- int eff_end_w = std::min(end_w, in_width);
- start_w = std::max(0, start_w);
- for (int h = std::max(0, start_h); h < eff_end_h; ++h) {
- for (int w = start_w; w < eff_end_w; ++w) {
- int cur_idx = h * in_width + w;
- float val = in_data[cur_idx];
- if (max_val < val) {
- max_val = val;
- if (training) {
- cur_max_idx = cur_idx;
- }
+
+ auto pool_fn_max = [&](const T *in_data, int channel_idx,
+ int start_h, int start_w) {
+ int end_h = start_h + patch_height;
+ int end_w = start_w + patch_width;
+
+ T max_val = std::numeric_limits::lowest();
+
+ int cur_max_idx = -1;
+ int eff_end_h = std::min(end_h, in_height);
+ int eff_end_w = std::min(end_w, in_width);
+ start_w = std::max(0, start_w);
+ for (int h = std::max(0, start_h); h < eff_end_h; ++h) {
+ for (int w = start_w; w < eff_end_w; ++w) {
+ int cur_idx = h * in_width + w;
+ T val = in_data[cur_idx];
+ if (max_val < val) {
+ max_val = val;
+ if (training) {
+ cur_max_idx = cur_idx;
}
}
}
+ }
- if (training) {
- pool_helper.setValueInt(max_idx_count++, cur_max_idx);
- }
+ if (training) {
+ pool_helper.setValueInt(max_idx_count++, cur_max_idx);
+ }
- return max_val;
- };
- break;
- }
- case props::PoolingTypeInfo::Enum::global_max: {
- pool_fn = [&, this](const float *in_data, int channel_idx, int start_h,
- int start_w) {
- int end_h = start_h + patch_height;
- int end_w = start_w + patch_width;
-
- float max_val = std::numeric_limits::lowest();
- int *helper_data = pool_helper.getData();
- helper_data += channel_idx * in_height * in_width;
-
- for (int h = start_h; h < end_h; ++h) {
- for (int w = start_w; w < end_w; ++w) {
- int cur_idx = h * in_width + w;
- float val = in_data[cur_idx];
- if (max_val < val) {
- max_val = val;
- max_idx_count = 0;
- }
+ return max_val;
+ };
- if (training && max_val == val) {
- *(helper_data + max_idx_count++) = cur_idx;
- }
+ auto pool_fn_global_max = [&, this](const T *in_data,
+ int channel_idx, int start_h,
+ int start_w) {
+ int end_h = start_h + patch_height;
+ int end_w = start_w + patch_width;
+
+ T max_val = std::numeric_limits::lowest();
+ int *helper_data = pool_helper.getData();
+ helper_data += channel_idx * in_height * in_width;
+
+ for (int h = start_h; h < end_h; ++h) {
+ for (int w = start_w; w < end_w; ++w) {
+ int cur_idx = h * in_width + w;
+ T val = in_data[cur_idx];
+ if (max_val < val) {
+ max_val = val;
+ max_idx_count = 0;
}
- }
- pool_helper_size[batch_idx * in.channel() + channel_idx] = max_idx_count;
- return max_val;
- };
- break;
- }
- case props::PoolingTypeInfo::Enum::global_average:
- case props::PoolingTypeInfo::Enum::average: {
- pool_fn = [&](const float *in_data, int channel_idx, int start_h,
- int start_w) {
- int end_h = start_h + patch_height;
- int end_w = start_w + patch_width;
- float total = 0.0f;
-
- int eff_end_h = std::min(end_h, in_height);
- int eff_end_w = std::min(end_w, in_width);
- int eff_start_h = std::max(0, start_h);
- int eff_start_w = std::max(0, start_w);
-
- int cnt = (eff_end_h - eff_start_h) * (eff_end_w - eff_start_w);
- for (int h = eff_start_h; h < eff_end_h; ++h) {
- for (int w = eff_start_w; w < eff_end_w; ++w) {
- float val = in_data[h * in_width + w];
- total += val;
+ if (training && max_val == val) {
+ *(helper_data + max_idx_count++) = cur_idx;
}
}
+ }
- if (training) {
- pool_helper.setValueInt(max_idx_count++, cnt);
+ pool_helper_size[batch_idx * in.channel() + channel_idx] = max_idx_count;
+ return max_val;
+ };
+
+ auto pool_fn_average = [&](const T *in_data, int channel_idx,
+ int start_h, int start_w) {
+ int end_h = start_h + patch_height;
+ int end_w = start_w + patch_width;
+ T total = static_cast(0.0f);
+
+ int eff_end_h = std::min(end_h, in_height);
+ int eff_end_w = std::min(end_w, in_width);
+ int eff_start_h = std::max(0, start_h);
+ int eff_start_w = std::max(0, start_w);
+
+ int cnt = (eff_end_h - eff_start_h) * (eff_end_w - eff_start_w);
+ for (int h = eff_start_h; h < eff_end_h; ++h) {
+ for (int w = eff_start_w; w < eff_end_w; ++w) {
+ T val = in_data[h * in_width + w];
+ total += val;
}
- return total / cnt;
- };
+ }
+
+ if (training) {
+ pool_helper.setValueInt(max_idx_count++, cnt);
+ }
+ return total / cnt;
+ };
+
+ switch (pooling_type) {
+ case props::PoolingTypeInfo::Enum::max:
+ pool_fn_fp32 = pool_fn_max;
+#ifdef ENABLE_FP16
+ pool_fn_fp16 = pool_fn_max;
+#endif
+ break;
+ case props::PoolingTypeInfo::Enum::global_max:
+ pool_fn_fp32 = pool_fn_global_max;
+#ifdef ENABLE_FP16
+ pool_fn_fp16 = pool_fn_global_max;
+#endif
+ break;
+ case props::PoolingTypeInfo::Enum::global_average:
+ case props::PoolingTypeInfo::Enum::average:
+ pool_fn_fp32 = pool_fn_average;
+#ifdef ENABLE_FP16
+ pool_fn_fp16 = pool_fn_average;
+#endif
break;
- }
case props::PoolingTypeInfo::Enum::unknown:
default:
throw std::invalid_argument("unknown pooling type given");
break;
}
- const float *in_data = in.getData();
- float *out_data = output.getData();
-
- unsigned int map_size = in_height * in_width;
-
- int height_stride_end = height - patch_height - pt;
- int width_stride_end = width - patch_width - pl;
- for (unsigned int i = 0; i < channel; ++i) {
- const float *in_data_channel_sliced = in_data + i * map_size;
- for (int j = -pt; j <= height_stride_end; j += stride[0]) {
- for (int k = -pl; k <= width_stride_end; k += stride[1]) {
- float pool_value = pool_fn(in_data_channel_sliced, i, j, k);
- *out_data = pool_value;
- out_data++;
+ if (in.getDataType() == ml::train::TensorDim::DataType::FP32) {
+ const float *in_data = in.getData();
+ float *out_data = output.getData();
+
+ unsigned int map_size = in_height * in_width;
+
+ int height_stride_end = height - patch_height - pt;
+ int width_stride_end = width - patch_width - pl;
+ for (unsigned int i = 0; i < channel; ++i) {
+ const float *in_data_channel_sliced = in_data + i * map_size;
+ for (int j = -pt; j <= height_stride_end; j += stride[0]) {
+ for (int k = -pl; k <= width_stride_end; k += stride[1]) {
+ float pool_value = pool_fn_fp32(in_data_channel_sliced, i, j, k);
+ *out_data = pool_value;
+ out_data++;
+ }
+ }
+ }
+ }
+#ifdef ENABLE_FP16
+ else if (in.getDataType() == ml::train::TensorDim::DataType::FP16) {
+ const _FP16 *in_data = in.getData<_FP16>();
+ _FP16 *out_data = output.getData<_FP16>();
+
+ unsigned int map_size = in_height * in_width;
+
+ int height_stride_end = height - patch_height - pt;
+ int width_stride_end = width - patch_width - pl;
+ for (unsigned int i = 0; i < channel; ++i) {
+ const _FP16 *in_data_channel_sliced = in_data + i * map_size;
+ for (int j = -pt; j <= height_stride_end; j += stride[0]) {
+ for (int k = -pl; k <= width_stride_end; k += stride[1]) {
+ _FP16 pool_value = pool_fn_fp16(in_data_channel_sliced, i, j, k);
+ *out_data = pool_value;
+ out_data++;
+ }
}
}
}
+#endif
+ else {
+ throw std::runtime_error("Not supported datatype");
+ }
}
void Pooling2DLayer::setBatch(RunLayerContext &context, unsigned int batch) {
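The pooling changes above rewrite the per-element work as type-generic lambdas and then dispatch once on the tensor data type (FP32 vs FP16). A compact sketch of that dispatch style over plain buffers (the accessors and names below are illustrative, not the real Tensor API):

#include <cstddef>
#include <iostream>
#include <vector>

enum class DType { FP32, FP16 };

// Stand-in for FP16 so the sketch stays compilable everywhere.
using fp16_like = float;

// The same generic body serves both precisions, as in the pooling lambdas.
template <typename T> double sum_buffer(const T *data, std::size_t n) {
  double acc = 0.0;
  for (std::size_t i = 0; i < n; ++i)
    acc += static_cast<double>(data[i]);
  return acc;
}

int main() {
  std::vector<float> fp32_buf{1.f, 2.f, 3.f};
  std::vector<fp16_like> fp16_buf{4.f, 5.f};
  DType dtype = DType::FP32;

  // Single switch on the data type, as in Pooling2DLayer::calcDerivative().
  double s = (dtype == DType::FP32)
               ? sum_buffer(fp32_buf.data(), fp32_buf.size())
               : sum_buffer(fp16_buf.data(), fp16_buf.size());
  std::cout << s << '\n';
}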
diff --git a/nntrainer/layers/reshape_layer.cpp b/nntrainer/layers/reshape_layer.cpp
index 0f82d84f3a..07564b3970 100644
--- a/nntrainer/layers/reshape_layer.cpp
+++ b/nntrainer/layers/reshape_layer.cpp
@@ -42,6 +42,7 @@ void ReshapeLayer::finalize(InitLayerContext &context) {
}
out_dim.batch(in_dim.batch());
+ out_dim.setDataType(in_dim.getDataType());
context.setOutputDimensions({out_dim});
}
diff --git a/nntrainer/layers/time_dist.cpp b/nntrainer/layers/time_dist.cpp
index 80451416df..779010065a 100644
--- a/nntrainer/layers/time_dist.cpp
+++ b/nntrainer/layers/time_dist.cpp
@@ -256,8 +256,8 @@ void TimeDistLayer::forwarding(RunLayerContext &context, bool training) {
RunLayerContext dist_context(context.getName(), context.getTrainable(),
context.getLoss(), context.executeInPlace(),
- getWeightsForContext(), {&in_var}, {&out_var},
- getTensorsForContext());
+ context.getLossScale(), getWeightsForContext(),
+ {&in_var}, {&out_var}, getTensorsForContext());
dist_layer->forwarding(dist_context, training);
}
@@ -303,8 +303,8 @@ void TimeDistLayer::calcDerivative(RunLayerContext &context) {
RunLayerContext dist_context(context.getName(), context.getTrainable(),
context.getLoss(), context.executeInPlace(),
- getWeightsForContext(), {&in_var}, {&out_var},
- getTensorsForContext());
+ context.getLossScale(), getWeightsForContext(),
+ {&in_var}, {&out_var}, getTensorsForContext());
dist_layer->calcDerivative(dist_context);
}
@@ -354,8 +354,8 @@ void TimeDistLayer::calcGradient(RunLayerContext &context) {
RunLayerContext dist_context(context.getName(), context.getTrainable(),
context.getLoss(), context.executeInPlace(),
- getWeightsForContext(), {&in_var}, {&out_var},
- getTensorsForContext());
+ context.getLossScale(), getWeightsForContext(),
+ {&in_var}, {&out_var}, getTensorsForContext());
dist_layer->calcGradient(dist_context);
}
@@ -396,8 +396,8 @@ void TimeDistLayer::setBatch(RunLayerContext &context, unsigned int batch) {
RunLayerContext dist_context(context.getName(), context.getTrainable(),
context.getLoss(), context.executeInPlace(),
- getWeightsForContext(), {&in_var}, {&out_var},
- getTensorsForContext());
+ context.getLossScale(), getWeightsForContext(),
+ {&in_var}, {&out_var}, getTensorsForContext());
dist_layer->setBatch(dist_context, batch);
diff --git a/nntrainer/meson.build b/nntrainer/meson.build
index 02df7744b6..5c7a14d4a5 100644
--- a/nntrainer/meson.build
+++ b/nntrainer/meson.build
@@ -47,6 +47,7 @@ nntrainer_elements = [
if get_option('enable-opencl')
nntrainer_elements += 'opencl'
+ nntrainer_elements += 'layers/cl_layers'
endif
foreach elem : nntrainer_elements
diff --git a/nntrainer/models/model_common_properties.h b/nntrainer/models/model_common_properties.h
index 3776afefca..3435d18e96 100644
--- a/nntrainer/models/model_common_properties.h
+++ b/nntrainer/models/model_common_properties.h
@@ -217,7 +217,7 @@ class ModelTensorDataType final : public EnumProperty {
*/
class LossScale : public Property {
public:
- LossScale(float value = 0.0f);
+ LossScale(float value = 1.0f);
static constexpr const char *key = "loss_scale"; /**< unique key to access */
using prop_tag = float_prop_tag; /**< property type */
};
diff --git a/nntrainer/models/neuralnet.cpp b/nntrainer/models/neuralnet.cpp
index d0e542825f..afc560603e 100644
--- a/nntrainer/models/neuralnet.cpp
+++ b/nntrainer/models/neuralnet.cpp
@@ -412,9 +412,21 @@ void NeuralNetwork::backwarding(int iteration,
NNTR_THROW_IF(!opt, std::invalid_argument) << "optimizer is null!";
#endif
- std::function, int)> backwarding_op =
+ std::function, bool)> forwarding_op =
+ [this, stop_cb, userdata](std::shared_ptr node,
+ bool training) -> void {
+ (void)this;
+ PROFILE_MEM_ANNOTATE("Forwarding for layer: " + node->getName());
+
+ auto f = std::get<0>(node->getExecutionOrder());
+ model_graph.flushCacheExcept(f);
+
+ node->forwarding(training);
+ };
+
+ std::function, int)> backwarding_op =
[this, stop_cb, userdata](std::shared_ptr node,
- int iteration) -> void {
+ int iteration) -> bool {
/**
* Do not change this order:
* 1. calcGradient
@@ -448,19 +460,29 @@ void NeuralNetwork::backwarding(int iteration,
/** If gradient must be applied and its not gradient mode, calculate
* gradient
*/
- if (!dynamic_training_opt.isGradientMode() && apply_gradient)
+ if (!dynamic_training_opt.isGradientMode() && apply_gradient) {
node->calcGradient();
+
+ RunLayerContext &rc = node->getRunContext();
+ if (rc.isMixedPrecision()) {
+ for (auto w : rc.getWeights()) {
+ if (!w->getGradientRef().isValid())
+ return false;
+ }
+ }
+ }
}
model_graph.flushCacheExcept(std::get<2>(node->getExecutionOrder()));
PROFILE_MEM_ANNOTATE("CalcDerivative: " + node->getName());
if (stop_cb(userdata)) {
- return;
+ return true;
}
- if (node->needsCalcDerivative())
+ if (node->needsCalcDerivative()) {
node->calcDerivative();
+ }
model_graph.flushCacheExcept(std::get<3>(node->getExecutionOrder()));
PROFILE_MEM_ANNOTATE("ApplyGradient: " + node->getName());
@@ -476,9 +498,10 @@ void NeuralNetwork::backwarding(int iteration,
opt_->applyGradient(opt_context);
});
}
+ return true;
};
- std::function apply_grad_clip_op =
+ std::function lazy_apply_grad_op =
[opt_ = opt.get()](Weight &w, int iteration) -> void {
w.calcRegularizationGradient();
w.calcWeightDecayGradient();
@@ -487,8 +510,13 @@ void NeuralNetwork::backwarding(int iteration,
opt_->applyGradient(opt_context);
};
- model_graph.backwarding(iteration, backwarding_op, apply_grad_clip_op,
- stop_cb, userdata);
+ // model_graph.backwarding() returns false if any gradient turned out to be
+ // invalid (NaN/Inf); in that case the whole iteration is replayed.
+ bool ret = false;
+
+ while (!ret) {
+ ret = model_graph.backwarding(iteration, forwarding_op, backwarding_op,
+ lazy_apply_grad_op, stop_cb, userdata);
+ }
}
void NeuralNetwork::save(const std::string &file_path,
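The backwarding() rewrite above wraps the per-iteration work in a retry loop: backwarding_op reports a non-finite mixed-precision gradient, and the whole iteration is replayed. A standard dynamic loss-scaling driver built on the same idea is sketched below; the halve-on-overflow / slowly-grow policy is the conventional one and is not taken from this patch (train_step and its constants are illustrative):

#include <cmath>
#include <cstddef>
#include <vector>

// Returns false if any gradient is NaN/Inf, mimicking the validity check
// performed after calcGradient() in the patch.
bool grads_are_finite(const std::vector<float> &grads) {
  for (float g : grads)
    if (!std::isfinite(g))
      return false;
  return true;
}

// One training step with dynamic loss scaling (conventional policy).
void train_step(std::vector<float> &params, float &loss_scale) {
  while (true) {
    std::vector<float> grads = /* backward pass with loss * loss_scale */
        std::vector<float>(params.size(), 0.01f * loss_scale);
    if (!grads_are_finite(grads)) {
      loss_scale *= 0.5f; // overflow: shrink the scale and replay
      continue;
    }
    for (std::size_t i = 0; i < params.size(); ++i)
      params[i] -= 1e-3f * (grads[i] / loss_scale); // unscale, then update
    loss_scale *= 1.001f; // slowly grow the scale again
    return;
  }
}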
diff --git a/nntrainer/optimizers/adam.cpp b/nntrainer/optimizers/adam.cpp
index 18c0a0fcc1..f7189dda7e 100644
--- a/nntrainer/optimizers/adam.cpp
+++ b/nntrainer/optimizers/adam.cpp
@@ -36,7 +36,15 @@ Adam::~Adam() {}
enum AdamParams { wm, wv };
std::vector Adam::getOptimizerVariableDim(const TensorDim &dim) {
- return {dim, dim};
+ /**
+ * @note We assume the optimizer parameters should be full precision to
+ * maintain accuracy even in mixed precision training.
+ */
+ TensorDim wm_dim(dim);
+ TensorDim wv_dim(dim);
+ wm_dim.setDataType(ml::train::TensorDim::DataType::FP32);
+ wv_dim.setDataType(ml::train::TensorDim::DataType::FP32);
+ return {wm_dim, wv_dim};
}
void Adam::exportTo(Exporter &exporter,
@@ -64,7 +72,17 @@ double Adam::getUpdatedLearningRate(unsigned int iteration, double ll) const {
}
void Adam::applyGradient(RunOptimizerContext &context) {
- Tensor &x_grad = context.getGradient();
+ Tensor empty_tensor;
+
+ Tensor &x_grad =
+ context.getGradient().getDataType() == ml::train::TensorDim::DataType::FP32
+ ? context.getGradient()
+ : empty_tensor;
+
+ if (x_grad.empty()) {
+ x_grad = context.getGradient().clone(ml::train::TensorDim::DataType::FP32);
+ context.applyLossScale(x_grad);
+ }
auto &beta1 = std::get(adam_props).get();
auto &beta2 = std::get(adam_props).get();
@@ -91,7 +109,7 @@ void Adam::applyGradient(RunOptimizerContext &context) {
denom.add_i(epsilon);
wm.divide(denom, x_grad);
- context.applyGradient(context.getLearningRate() / biasCorrection1);
+ context.applyGradient(context.getLearningRate() / biasCorrection1, x_grad);
} else {
std::function sqrtEps = [epsilon](double f) {
@@ -100,8 +118,9 @@ void Adam::applyGradient(RunOptimizerContext &context) {
x_grad = wv.apply(sqrtEps, x_grad);
x_grad.multiply_i(wm);
- context.applyGradient(getUpdatedLearningRate(context.getIteration(),
- context.getLearningRate()));
+ context.applyGradient(
+ getUpdatedLearningRate(context.getIteration(), context.getLearningRate()),
+ x_grad);
}
}
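The Adam changes above keep the first/second moment tensors in FP32 and, when the incoming gradient is half precision, clone it to FP32 and unscale it before the update. A minimal FP32 Adam step over a plain array, in the textbook bias-corrected form (illustrative only, not the nntrainer optimizer API):

#include <cmath>
#include <cstddef>
#include <vector>

struct AdamState {
  std::vector<float> m, v; // FP32 moments, regardless of weight precision
};

void adam_step(std::vector<float> &w, const std::vector<float> &grad_fp32,
               AdamState &st, int t, float lr, float b1 = 0.9f,
               float b2 = 0.999f, float eps = 1e-7f) {
  const float bc1 = 1.0f - std::pow(b1, static_cast<float>(t));
  const float bc2 = 1.0f - std::pow(b2, static_cast<float>(t));
  for (std::size_t i = 0; i < w.size(); ++i) {
    st.m[i] = b1 * st.m[i] + (1.0f - b1) * grad_fp32[i];
    st.v[i] = b2 * st.v[i] + (1.0f - b2) * grad_fp32[i] * grad_fp32[i];
    const float m_hat = st.m[i] / bc1;
    const float v_hat = st.v[i] / bc2;
    w[i] -= lr * m_hat / (std::sqrt(v_hat) + eps);
  }
}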
diff --git a/nntrainer/optimizers/optimizer_context.cpp b/nntrainer/optimizers/optimizer_context.cpp
index da4cd1f7e9..8380ad6613 100644
--- a/nntrainer/optimizers/optimizer_context.cpp
+++ b/nntrainer/optimizers/optimizer_context.cpp
@@ -42,4 +42,24 @@ Tensor &RunOptimizerContext::getOptimizerVariable(unsigned int idx) const {
void RunOptimizerContext::applyGradient(double lr) const {
weight->applyGradient(lr);
}
+
+/**
+ * @brief Apply the gradient with the given learning rate and gradient
+ */
+void RunOptimizerContext::applyGradient(double lr, Tensor &updated_grad) const {
+ weight->applyGradient(lr, updated_grad);
+}
+
+/**
+ * @brief Apply loss scale to gradient (full precision)
+ */
+void RunOptimizerContext::applyLossScale(Tensor &fp32_grad) {
+ if (!weight->isMixedPrecision())
+ return;
+ if (fp32_grad.getDataType() != ml::train::TensorDim::DataType::FP32)
+ throw std::invalid_argument(
+ "gradient should be fullprecsion to maintain accuracy");
+ float loss_scale = weight->getLossScale();
+ fp32_grad.divide_i(loss_scale);
+}
} // namespace nntrainer
diff --git a/nntrainer/optimizers/optimizer_context.h b/nntrainer/optimizers/optimizer_context.h
index 62f9e0945d..27f028fc52 100644
--- a/nntrainer/optimizers/optimizer_context.h
+++ b/nntrainer/optimizers/optimizer_context.h
@@ -35,9 +35,7 @@ class RunOptimizerContext {
*
*/
RunOptimizerContext(Weight *w = nullptr, size_t iter = 0, double lr = 0.0) :
- weight(w),
- iteration(iter),
- learning_rate(lr) {}
+ weight(w), iteration(iter), learning_rate(lr) {}
/**
* @brief Get the Weight tensor object
@@ -75,6 +73,16 @@ class RunOptimizerContext {
*/
void applyGradient(double lr) const;
+ /**
+ * @brief Apply the gradient with the given learning rate and updated
+ * gradient
+ *
+ * @param lr learning rate
+ * @param updated_grad gradient tensor which is updated. (usually it could be
+ * fp32)
+ */
+ void applyGradient(double lr, Tensor &updated_grad) const;
+
/**
* @brief Get the current iteration value
*
@@ -89,6 +97,11 @@ class RunOptimizerContext {
*/
double getLearningRate() const { return learning_rate; }
+ /**
+ * @brief Apply loss scale to gradient (full precision)
+ */
+ void applyLossScale(Tensor &fp32_grad);
+
private:
Weight *weight; /**< weights for the optimizer */
size_t iteration; /**< iteration number */
diff --git a/nntrainer/optimizers/sgd.cpp b/nntrainer/optimizers/sgd.cpp
index 8b0078e9e6..e4b2209a57 100644
--- a/nntrainer/optimizers/sgd.cpp
+++ b/nntrainer/optimizers/sgd.cpp
@@ -16,7 +16,20 @@
namespace nntrainer {
void SGD::applyGradient(RunOptimizerContext &context) {
- context.applyGradient(context.getLearningRate());
+ // @todo This could go inside the context.
+ Tensor empty_tensor;
+
+ Tensor &x_grad =
+ context.getGradient().getDataType() == ml::train::TensorDim::DataType::FP32
+ ? context.getGradient()
+ : empty_tensor;
+
+ if (x_grad.empty()) {
+ x_grad = context.getGradient().clone(ml::train::TensorDim::DataType::FP32);
+ context.applyLossScale(x_grad);
+ }
+
+ context.applyGradient(context.getLearningRate(), x_grad);
}
} // namespace nntrainer
diff --git a/nntrainer/tensor/blas_avx.cpp b/nntrainer/tensor/blas_avx.cpp
index ce59583d6f..411dbcbb5d 100644
--- a/nntrainer/tensor/blas_avx.cpp
+++ b/nntrainer/tensor/blas_avx.cpp
@@ -20,6 +20,7 @@
namespace nntrainer::avx {
+#ifdef ENABLE_FP16
void vcvt_f16_f32(size_t N, const void *input, float *output) {
assert(N != 0);
assert(input != NULL);
@@ -114,4 +115,163 @@ void vcvt_f32_f16(size_t N, const float *input, void *output) {
}
}
+bool isValid(const size_t N, const _Float16 *input) {
+ assert(N != 0);
+ assert(input != NULL);
+
+ int temp = 0;
+ size_t idx = 0;
+
+ const __m256 SIGN_MASK = _mm256_set1_ps(-0.0);
+ const __m256 INF = _mm256_set1_ps(std::numeric_limits::infinity());
+
+ // 16 single-precision check : ( X != X )
+ for (; N - idx >= 16; idx += 16) {
+ __m256 vec0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input));
+ __m256 vec1 =
+ _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(input + 8)));
+
+ input += 16;
+
+ // check NaN in vec0
+ __m256 res = _mm256_cmp_ps(vec0, vec0, _CMP_NEQ_UQ);
+ temp = temp | _mm256_movemask_ps(res);
+ if (temp)
+ return false;
+
+ // check infinity in vec0
+ vec0 = _mm256_andnot_ps(SIGN_MASK, vec0);
+ vec0 = _mm256_cmp_ps(vec0, INF, _CMP_EQ_OQ);
+
+ temp = temp | _mm256_movemask_ps(vec0);
+ if (temp)
+ return false;
+
+ // check NaN in vec1
+ __m256 res1 = _mm256_cmp_ps(vec1, vec1, _CMP_NEQ_UQ);
+ temp = temp | _mm256_movemask_ps(res1);
+
+ if (temp)
+ return false;
+
+ // check infinity in vec1
+ vec1 = _mm256_andnot_ps(SIGN_MASK, vec1);
+ vec1 = _mm256_cmp_ps(vec1, INF, _CMP_EQ_OQ);
+
+ temp = temp | _mm256_movemask_ps(vec1);
+
+ if (temp)
+ return false;
+ }
+
+ // 8 single-precision check : ( X != X )
+ for (; N - idx >= 8; idx += 8) {
+ __m256 vec = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input));
+ input += 8;
+ __m256 res = _mm256_cmp_ps(vec, vec, _CMP_NEQ_UQ);
+ temp = temp | _mm256_movemask_ps(res);
+
+ if (temp)
+ return false;
+
+ // check infinity in vec1
+ vec = _mm256_andnot_ps(SIGN_MASK, vec);
+ vec = _mm256_cmp_ps(vec, INF, _CMP_EQ_OQ);
+
+ temp = temp | _mm256_movemask_ps(vec);
+
+ if (temp)
+ return false;
+ }
+
+ // remain check : ( X != X || X == Inf )
+ while (idx < N) {
+ if (*input != *input || *input == std::numeric_limits::infinity()) {
+ return false;
+ }
+ ++input;
+ ++idx;
+ }
+
+ return true;
+}
+#endif
+
+bool isValid(const size_t N, const float *input) {
+ assert(N != 0);
+ assert(input != NULL);
+
+ int temp = 0;
+ size_t idx = 0;
+
+ const __m256 SIGN_MASK = _mm256_set1_ps(-0.0);
+ const __m256 INF = _mm256_set1_ps(std::numeric_limits::infinity());
+
+ // 16 single-precision check : ( X != X )
+ for (; N - idx >= 16; idx += 16) {
+ __m256 vec0 = _mm256_loadu_ps(input);
+ __m256 vec1 = _mm256_loadu_ps(input + 8);
+ input += 16;
+ __m256 res = _mm256_cmp_ps(vec0, vec0, _CMP_NEQ_UQ);
+ temp = temp | _mm256_movemask_ps(res);
+
+ if (temp)
+ return false;
+
+ // check infinity in vec0
+ vec0 = _mm256_andnot_ps(SIGN_MASK, vec0);
+ vec0 = _mm256_cmp_ps(vec0, INF, _CMP_EQ_OQ);
+
+ temp = temp | _mm256_movemask_ps(vec0);
+ if (temp)
+ return false;
+
+ __m256 res1 = _mm256_cmp_ps(vec1, vec1, _CMP_NEQ_UQ);
+ temp = temp | _mm256_movemask_ps(res1);
+
+ if (temp)
+ return false;
+
+ // check infinity in vec1
+ vec1 = _mm256_andnot_ps(SIGN_MASK, vec1);
+ vec1 = _mm256_cmp_ps(vec1, INF, _CMP_EQ_OQ);
+
+ temp = temp | _mm256_movemask_ps(vec1);
+
+ if (temp)
+ return false;
+ }
+
+ // 8 single-precision check : ( X != X )
+ for (; N - idx >= 8; idx += 8) {
+ __m256 vec = _mm256_loadu_ps(input);
+ input += 8;
+ __m256 res = _mm256_cmp_ps(vec, vec, _CMP_NEQ_UQ);
+ temp = temp | _mm256_movemask_ps(res);
+
+ if (temp)
+ return false;
+
+ // check infinity in vec
+ vec = _mm256_andnot_ps(SIGN_MASK, vec);
+ vec = _mm256_cmp_ps(vec, INF, _CMP_EQ_OQ);
+
+ temp = temp | _mm256_movemask_ps(vec);
+
+ if (temp)
+ return false;
+ }
+
+ // remain check : ( X != X || X == Inf )
+ while (idx < N) {
+ if (*input != *input || *input == std::numeric_limits::infinity()) {
+ return false;
+ }
+ ++input;
+ ++idx;
+ }
+
+ return true;
+}
+
} // namespace nntrainer::avx
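The AVX routines above vectorize the check (x != x || x == inf) over 16- and 8-wide blocks; the scalar tail at the end of each function is the reference behaviour. An equivalent plain-C++ check, handy for validating the SIMD paths in a unit test, could look like this (illustrative helper, not part of the patch):

#include <cmath>
#include <cstddef>

// True when no element is NaN or +/-Inf, matching the intent of isValid().
template <typename T> bool is_valid_scalar(std::size_t n, const T *x) {
  for (std::size_t i = 0; i < n; ++i)
    if (std::isnan(static_cast<double>(x[i])) ||
        std::isinf(static_cast<double>(x[i])))
      return false;
  return true;
}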
diff --git a/nntrainer/tensor/blas_avx.h b/nntrainer/tensor/blas_avx.h
index ab1270a208..5eabcbdb2c 100644
--- a/nntrainer/tensor/blas_avx.h
+++ b/nntrainer/tensor/blas_avx.h
@@ -20,6 +20,7 @@
namespace nntrainer::avx {
+#ifdef ENABLE_FP16
/**
* @brief Converts half-precision floating point values to single-precision
* floating point values.
@@ -40,6 +41,25 @@ void vcvt_f16_f32(size_t N, const void *input, float *output);
*/
void vcvt_f32_f16(size_t N, const float *input, void *output);
+/**
+ * @brief check if the vector X has a NaN or Inf value
+ * @note it compares (x != x || x == inf)
+ * @param[in] N length of the vector
+ * @param[in] X half-precision * for Vector X
+ * @retval false if it has NaN or Inf, true otherwise
+ */
+bool isValid(const size_t N, const _Float16 *X);
+#endif
+
+/**
+ * @brief check if the vector X has a NaN or Inf value
+ * @note it compares (x != x || x == inf)
+ * @param[in] N length of the vector
+ * @param[in] X float * for Vector X
+ * @retval false if it has NaN or Inf, true otherwise
+ */
+bool isValid(const size_t N, const float *X);
+
} // namespace nntrainer::avx
#endif /* __cplusplus */
diff --git a/nntrainer/tensor/blas_interface.cpp b/nntrainer/tensor/blas_interface.cpp
index 9be6fb9911..e8fb78d734 100644
--- a/nntrainer/tensor/blas_interface.cpp
+++ b/nntrainer/tensor/blas_interface.cpp
@@ -842,7 +842,10 @@ void scopy(const unsigned int N, const float *X, const int incX, float *Y,
#ifdef BLAS_NUM_THREADS
openblas_set_num_threads(BLAS_NUM_THREADS);
#endif
- cblas_scopy(N, X, incX, Y, incY);
+ // cblas_scopy(N, (float*)(X), incX, (float*)(Y), incY);
+ // temporarily replace cblas_scopy with a raw element-wise copy loop.
+ for (unsigned int i = 0; i < N; ++i)
+ Y[i * incY] = X[i * incX];
#else
scopy_raw(N, X, incX, Y, incY);
#endif
@@ -1038,6 +1041,16 @@ static void ele_div_fallback(const unsigned int N, const float *X,
}
}
+static bool is_valid_fallback(const size_t N, const float *X) {
+ for (size_t i = 0; i < N; ++i) {
+ if (*X != *X || *X == std::numeric_limits::infinity())
+ return false;
+ ++X;
+ }
+
+ return true;
+}
+
void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z,
float alpha, float beta, unsigned int i_stride,
unsigned int o_stride) {
@@ -1090,4 +1103,30 @@ void ele_div(const unsigned int N, const float *X, const float *Y, float *Z,
ele_div_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride);
}
+bool is_valid(const size_t N, ml::train::TensorDim::DataType d_type,
+ const void *X) {
+ if (d_type == ml::train::TensorDim::DataType::FP16) {
+#ifdef ENABLE_FP16
+ const _FP16 *vec = (const _FP16 *)X;
+#ifdef USE_NEON
+ return nntrainer::neon::isValid(N, vec);
+#elif defined(USE_AVX)
+ return nntrainer::avx::isValid(N, vec);
+#else
+ throw std::invalid_argument("Error: enable-fp16 is not enabled");
+#endif
+#endif
+ } else if (d_type == ml::train::TensorDim::DataType::FP32) {
+ const float *vec = (const float *)X;
+#ifdef USE_NEON
+ return nntrainer::neon::isValid(N, vec);
+#elif defined(USE_AVX)
+ return nntrainer::avx::isValid(N, vec);
+#endif
+
+ return is_valid_fallback(N, vec);
+ }
+ return false;
+}
+
} // namespace nntrainer
diff --git a/nntrainer/tensor/blas_interface.h b/nntrainer/tensor/blas_interface.h
index 04a8a23018..2b5ef72922 100644
--- a/nntrainer/tensor/blas_interface.h
+++ b/nntrainer/tensor/blas_interface.h
@@ -478,6 +478,16 @@ void ele_sub(const unsigned N, const float *X, const float *Y, float *Z,
void ele_div(const unsigned N, const float *X, const float *Y, float *Z,
float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1,
unsigned int o_stride = 1);
+
+/**
+ * @brief check if X array has NaN or Inf
+ * @param[in] N length of the vector
+ * @param[in] X float/fp16 * for Vector X
+ * @retval false if X is not valid (contains NaN or Inf), true otherwise
+ */
+bool is_valid(const size_t N, ml::train::TensorDim::DataType d_type,
+ const void *X);
+
} /* namespace nntrainer */
#endif /* __cplusplus */
#endif /* __BLAS_INTERFACE_H__ */
diff --git a/nntrainer/tensor/blas_neon.cpp b/nntrainer/tensor/blas_neon.cpp
index 3609b6b8b5..20f4d102ec 100644
--- a/nntrainer/tensor/blas_neon.cpp
+++ b/nntrainer/tensor/blas_neon.cpp
@@ -546,6 +546,36 @@ void ele_div(const unsigned N, const float *X, const float *Y, float *Z,
}
}
+bool isValid(const size_t N, const float *X) {
+ size_t i = 0;
+ float inf_s = std::numeric_limits::infinity();
+ float32x4_t inf = vdupq_n_f32(inf_s);
+ uint32x4_t zero = vdupq_n_u32(0);
+
+ for (; N - i >= 4; i += 4) {
+ float32x4_t vec = vld1q_f32(&X[i]);
+ uint32x4_t vcmp = vceqq_f32(vec, vec);
+
+ vcmp = vceqq_u32(vcmp, zero);
+
+ if (vaddvq_u32(vcmp))
+ return false;
+
+ vcmp = vceqq_f32(vec, inf);
+
+ if (vaddvq_u32(vcmp))
+ return false;
+ }
+
+ while (i < N) {
+ if (X[i] != X[i] || X[i] == std::numeric_limits::infinity())
+ return false;
+ ++i;
+ }
+
+ return true;
+}
+
#ifdef ENABLE_FP16
void hgemv(const __fp16 *A, const __fp16 *X, __fp16 *Y, uint32_t M, uint32_t N,
@@ -1192,51 +1222,29 @@ void haxpy(const unsigned int N, const float alpha, const __fp16 *X,
}
__fp16 hdot(const unsigned int N, const __fp16 *X, const __fp16 *Y) {
-
- float16x8_t accX8 = vmovq_n_f16(0);
- float16x4_t accX4 = vmov_n_f16(0);
+ float32x4_t accX0_3 = vmovq_n_f32(0.F);
+ float32x4_t accX4_7 = vmovq_n_f32(0.F);
unsigned int idx = 0;
- __fp16 ret = 0;
+ unsigned int N8 = (N >> 3) << 3;
+ float ret = 0;
- // processing batch of 8
- for (; (N - idx) >= 8; idx += 8) {
+ // Adaptive loop for batch size of 8
+ for (; idx < N8; idx += 8) {
float16x8_t x = vld1q_f16(&X[idx]);
float16x8_t y = vld1q_f16(&Y[idx]);
- // x*y + accX8 -> accX8
- accX8 = vfmaq_f16(accX8, x, y);
- }
-
- // check at least one batch of 8 is processed
- if (N - 8 >= 0) {
- __fp16 result[8];
- vst1q_f16(result, accX8);
- for (unsigned int i = 0; i < 8; i++)
- ret += result[i];
- }
-
- // processing remaining batch of 4
- for (; (N - idx) >= 4; idx += 4) {
- float16x4_t x = vld1_f16(&X[idx]);
- float16x4_t y = vld1_f16(&Y[idx]);
-
- // x*y + accX4 -> accX4
- accX4 = vfma_f16(accX4, x, y);
- }
-
- // check at least one batch of 4 is processed
- if (N % 8 >= 4) {
- __fp16 result[4];
- vst1_f16(result, accX4);
- ret += result[0] + result[1] + result[2] + result[3];
+ x = vmulq_f16(x, y);
+ accX0_3 = vaddq_f32(accX0_3, vcvt_f32_f16(vget_low_f16(x)));
+ accX4_7 = vaddq_f32(accX4_7, vcvt_f32_f16(vget_high_f16(x)));
}
+ ret += vaddvq_f32(accX0_3) + vaddvq_f32(accX4_7);
- // pocessing remaining values
+ // Loop for remaining indices
for (; idx < N; idx++)
ret += X[idx] * Y[idx];
- return ret;
+ return static_cast<__fp16>(ret);
}
__fp16 hnrm2(const unsigned int N, const __fp16 *X) {
@@ -1994,5 +2002,40 @@ void inv_sqrt_inplace(const unsigned int N, __fp16 *X) {
}
}
+bool isValid(const size_t N, const __fp16 *input) {
+ bool temp = 0;
+ size_t i = 0;
+ __fp16 inf_s = std::numeric_limits::infinity();
+ float16x8_t inf = vdupq_n_f16(inf_s);
+ uint16x8_t zero = vdupq_n_u16(0);
+
+ for (; N - i >= 8; i += 8) {
+ float16x8_t vec = vld1q_f16(&input[i]);
+
+ uint16x8_t vcmp = vceqq_f16(vec, vec);
+
+ vcmp = vceqq_u16(vcmp, zero);
+
+ if (vaddvq_u16(vcmp)) {
+ return false;
+ }
+
+ vcmp = vceqq_f16(vec, inf);
+
+ if (vaddvq_u16(vcmp)) {
+ return false;
+ }
+ }
+
+ while (i < N) {
+ if (input[i] != input[i] ||
+ input[i] == std::numeric_limits::infinity()) {
+ return false;
+ }
+ ++i;
+ }
+ return true;
+}
+
#endif
} // namespace nntrainer::neon
diff --git a/nntrainer/tensor/blas_neon.h b/nntrainer/tensor/blas_neon.h
index db1b6a5ccc..978d3428f7 100644
--- a/nntrainer/tensor/blas_neon.h
+++ b/nntrainer/tensor/blas_neon.h
@@ -148,6 +148,15 @@ void ele_sub(const unsigned N, const float *X, const float *Y, float *Z,
void ele_div(const unsigned N, const float *X, const float *Y, float *Z,
float alpha = 1.f, float beta = 0.f);
+/**
+ * @brief check if the vector X has a NaN or Inf value
+ * @note it compares (x != x || x == inf)
+ * @param[in] N length of the vector
+ * @param[in] input float * for Vector X
+ * @retval false if it has NaN or Inf, true otherwise
+ */
+bool isValid(const size_t N, const float *input);
+
#ifdef ENABLE_FP16
/**
* @brief hgemv computation with neon : Y = alpha*A*X + beta*Y
@@ -380,6 +389,15 @@ void hgemm_transAB(const __fp16 *A, const __fp16 *B, float *C, uint32_t M,
* @param X __fp16 * for Vector X
*/
void inv_sqrt_inplace(const unsigned int N, __fp16 *X);
+
+/**
+ * @brief check if the vector X is valid: no NaN or Inf
+ * @note it compares (x != x || x == inf)
+ * @param[in] N length of the vector
+ * @param[in] X __fp16 * for Vector X
+ * @retval false if it has NaN or Inf, true otherwise
+ */
+bool isValid(const size_t N, const __fp16 *X);
#endif
} // namespace nntrainer::neon
diff --git a/nntrainer/tensor/hgemm/hgemm.cpp b/nntrainer/tensor/hgemm/hgemm.cpp
index a41a5ba6dc..4aaadf331c 100644
--- a/nntrainer/tensor/hgemm/hgemm.cpp
+++ b/nntrainer/tensor/hgemm/hgemm.cpp
@@ -32,15 +32,17 @@
void hgemm_noTrans(const __fp16 *A, const __fp16 *B, float *C32, unsigned int M,
unsigned int N, unsigned int K, float alpha, float beta) {
if (alpha == 1.F && beta == 0.F) {
- if (M % 8 == 0 && N % 16 == 0 && K % 8 == 0) {
+ // use a bitwise AND instead of modulo for performance
+ // e.g. (M % 8 == 0) is the same as ((M & 0x7) == 0); & 0x7 extracts the last 3 bits of M
+ if ((M & 0x7) == 0 && (N & 0xF) == 0 && (K & 0x7) == 0) {
hgemm_noTrans_8x16(M, N, K, A, K, B, N, C32, N, alpha, beta);
- } else if (M % 8 == 0 && N % 8 == 0 && K % 8 == 0) {
+ } else if ((M & 0x7) == 0 && (N & 0x7) == 0 && (K & 0x7) == 0) {
hgemm_noTrans_8x8(M, N, K, A, K, B, N, C32, N, alpha, beta);
- } else if (M % 4 == 0 && N % 8 == 0 && K % 4 == 0) {
+ } else if ((M & 0x3) == 0 && (N & 0x7) == 0 && (K & 0x3) == 0) {
hgemm_noTrans_4x8(M, N, K, A, K, B, N, C32, N, alpha, beta);
- } else if (N % 8 == 0) {
+ } else if ((K & 0x7) == 0 && (N & 0x7) == 0) {
hgemm_noTrans_1x8(M, N, K, A, K, B, N, C32, N, alpha, beta);
- } else if (N % 4 == 0) {
+ } else if ((K & 0x7) == 0 && (N & 0x3) == 0) {
hgemm_noTrans_1x4(M, N, K, A, K, B, N, C32, N, alpha, beta);
} else {
hgemm_noTrans_fallback(M, N, K, A, K, B, N, C32, N, alpha, beta);
@@ -52,17 +54,19 @@ void hgemm_noTrans(const __fp16 *A, const __fp16 *B, float *C32, unsigned int M,
void hgemm_noTrans(const __fp16 *A, const __fp16 *B, __fp16 *C, unsigned int M,
unsigned int N, unsigned int K, float alpha, float beta) {
if (alpha == 1.F && beta == 0.F) {
- if (M % 8 == 0 && N % 16 == 0 && K % 8 == 0) {
+ // use a bitwise AND instead of modulo for performance
+ // e.g. (M % 8 == 0) is the same as ((M & 0x7) == 0); & 0x7 extracts the last 3 bits of M
+ if ((M & 0x7) == 0 && (N & 0xF) == 0 && (K & 0x7) == 0) {
hgemm_noTrans_8x16(M, N, K, A, K, B, N, C, N, alpha, beta);
- } else if (M % 8 == 0 && N % 8 == 0 && K % 8 == 0) {
+ } else if ((M & 0x7) == 0 && (N & 0x7) == 0 && (K & 0x7) == 0) {
hgemm_noTrans_8x8(M, N, K, A, K, B, N, C, N, alpha, beta);
- } else if (M % 4 == 0 && N % 8 == 0 && K % 4 == 0) {
+ } else if ((M & 0x3) == 0 && (N & 0x7) == 0 && (K & 0x3) == 0) {
hgemm_noTrans_4x8(M, N, K, A, K, B, N, C, N, alpha, beta);
- } else if (N % 8 == 0) {
- hgemm_noTrans_1x8(M, N, K, A, K, B, N, C, N, alpha, beta);
- } else if (M % 4 == 0 && N % 4 == 0 && K % 4 == 0) {
+ } else if ((M & 0x3) == 0 && (N & 0x3) == 0 && (K & 0x3) == 0) {
hgemm_noTrans_4x4(M, N, K, A, K, B, N, C, N, alpha, beta);
- } else if (N % 4 == 0) {
+ } else if ((N & 0x7) == 0 && (K & 0x7) == 0) {
+ hgemm_noTrans_1x8(M, N, K, A, K, B, N, C, N, alpha, beta);
+ } else if ((N & 0x3) == 0 && (K & 0x7) == 0) {
hgemm_noTrans_1x4(M, N, K, A, K, B, N, C, N, alpha, beta);
}
}
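The kernel-selection comments above rely on the identity that, for a power of two P, M % P == 0 exactly when the low log2(P) bits of M are all zero, i.e. (M & (P - 1)) == 0. A tiny compile-time check of that equivalence (illustrative, not part of the patch):

// For powers of two, divisibility reduces to masking the low bits.
static_assert((24u % 8u == 0) == ((24u & 0x7u) == 0), "M % 8 vs M & 0x7");
static_assert((20u % 16u == 0) == ((20u & 0xFu) == 0), "M % 16 vs M & 0xF");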
@@ -408,6 +412,72 @@ void hgemm_noTrans_1x8(unsigned int M, unsigned int N, unsigned int K,
free(sb);
}
+void hgemm_noTrans_4x4(unsigned int M, unsigned int N, unsigned int K,
+ const __fp16 *A, unsigned int lda, const __fp16 *B,
+ unsigned int ldb, float *C, unsigned int ldc,
+ float alpha, float beta) {
+ __fp16 *sa = alignedMalloc(M * K);
+ __fp16 *sb = alignedMalloc(K * N);
+
+ unsigned int ms, mms, ns, ks;
+ unsigned int m_min, m2_min, n_min, k_min;
+ for (ms = 0; ms < M; ms += M_BLOCKING) {
+ m_min = M - ms;
+ if (m_min > M_BLOCKING) {
+ m_min = M_BLOCKING;
+ }
+
+ for (ks = 0; ks < K; ks += k_min) {
+ k_min = K - ks;
+ if (k_min >= (K_BLOCKING << 1)) {
+ k_min = K_BLOCKING;
+ } else if (k_min > K_BLOCKING) {
+ k_min = (k_min / 2 + GEMM_UNROLLING_4 - 1) & ~(GEMM_UNROLLING_4 - 1);
+ }
+
+ n_min = N;
+ if (N >= N_BLOCKING * 2) {
+ n_min = N_BLOCKING;
+ } else if (N > N_BLOCKING) {
+ n_min = (n_min / 2 + GEMM_UNROLLING_4 - 1) & ~(GEMM_UNROLLING_4 - 1);
+ }
+ packing_B4(k_min, n_min, B + ks * ldb, ldb, sb);
+
+ for (mms = ms; mms < ms + m_min; mms += m2_min) {
+ m2_min = (ms + m_min) - mms;
+ if (m2_min >= 3 * GEMM_UNROLLING_4) {
+ m2_min = 3 * GEMM_UNROLLING_4;
+ } else if (m2_min >= 2 * GEMM_UNROLLING_4) {
+ m2_min = 2 * GEMM_UNROLLING_4;
+ } else if (m2_min > GEMM_UNROLLING_4) {
+ m2_min = GEMM_UNROLLING_4;
+ }
+
+ packing_A4(m2_min, k_min, A + mms * lda + ks, lda,
+ sa + k_min * (mms - ms));
+
+ HGEMM_KERNEL_4x4(m2_min, n_min, k_min, sa + k_min * (mms - ms), sb,
+ C + mms * ldc, ldc);
+ }
+
+ for (ns = n_min; ns < N; ns += n_min) {
+ n_min = N - ns;
+ if (n_min >= N_BLOCKING * 2) {
+ n_min = N_BLOCKING;
+ } else if (n_min > N_BLOCKING) {
+ n_min = (n_min / 2 + GEMM_UNROLLING_4 - 1) & ~(GEMM_UNROLLING_4 - 1);
+ }
+
+ packing_B4(k_min, n_min, B + ns + ldb * ks, ldb, sb);
+ HGEMM_KERNEL_4x4(m_min, n_min, k_min, sa, sb, C + ms * ldc + ns, ldc);
+ }
+ }
+ }
+
+ free(sa);
+ free(sb);
+}
+
void hgemm_noTrans_4x8(unsigned int M, unsigned int N, unsigned int K,
const __fp16 *A, unsigned int lda, const __fp16 *B,
unsigned int ldb, __fp16 *C, unsigned int ldc,
diff --git a/nntrainer/tensor/hgemm/hgemm.h b/nntrainer/tensor/hgemm/hgemm.h
index b05d89cb01..7c8194edf2 100644
--- a/nntrainer/tensor/hgemm/hgemm.h
+++ b/nntrainer/tensor/hgemm/hgemm.h
@@ -181,6 +181,26 @@ void hgemm_noTrans_8x8(unsigned int M, unsigned int N, unsigned int K,
unsigned int ldb, __fp16 *C, unsigned int ldc,
float alpha = 1.F, float beta = 0.F);
+/**
+ * @brief hgemm noTrans computation with 4x4 kernel : C = A*B,
+ *
+ * @param M length of the row of matrix A
+ * @param N length of the col of matrix B
+ * @param K length of the col of matrix A
+ * @param A input matrix A
+ * @param lda length of the col of matrix C
+ * @param B input matrix B
+ * @param ldb length of the col of matrix C
+ * @param C output matrix C
+ * @param ldc length of the col of matrix C
+ * @param[in] alpha float number
+ * @param[in] beta float number
+ */
+void hgemm_noTrans_4x4(unsigned int M, unsigned int N, unsigned int K,
+ const __fp16 *A, unsigned int lda, const __fp16 *B,
+ unsigned int ldb, float *C, unsigned int ldc,
+ float alpha = 1.F, float beta = 0.F);
+
/**
* @brief hgemm noTrans computation with 8x8 kernel : C = A*B,
*
diff --git a/nntrainer/tensor/hgemm/hgemm_kernel_4x4.h b/nntrainer/tensor/hgemm/hgemm_kernel_4x4.h
index 6166b9407d..7bf75b13b7 100644
--- a/nntrainer/tensor/hgemm/hgemm_kernel_4x4.h
+++ b/nntrainer/tensor/hgemm/hgemm_kernel_4x4.h
@@ -14,6 +14,193 @@
#include
#include
+#define INIT_KERNEL_4x4() \
+ v24 = vdup_n_f16(0.F); \
+ v25 = vdup_n_f16(0.F); \
+ v26 = vdup_n_f16(0.F); \
+ v27 = vdup_n_f16(0.F);
+
+// 1. Partial sum 256 digits
+#define KERNEL_4x4_ACC16() \
+ dv0 = vld1_f16(a); \
+ vb0 = vld1_f16(b); \
+ v24 = vfma_lane_f16(v24, vb0, dv0, 0); \
+ v25 = vfma_lane_f16(v25, vb0, dv0, 1); \
+ v26 = vfma_lane_f16(v26, vb0, dv0, 2); \
+ v27 = vfma_lane_f16(v27, vb0, dv0, 3); \
+ dv1 = vld1_f16(a + 4); \
+ vb1 = vld1_f16(b + 4); \
+ v24 = vfma_lane_f16(v24, vb1, dv1, 0); \
+ v25 = vfma_lane_f16(v25, vb1, dv1, 1); \
+ v26 = vfma_lane_f16(v26, vb1, dv1, 2); \
+ v27 = vfma_lane_f16(v27, vb1, dv1, 3); \
+ dv2 = vld1_f16(a + 4 * 2); \
+ vb2 = vld1_f16(b + 4 * 2); \
+ v24 = vfma_lane_f16(v24, vb2, dv2, 0); \
+ v25 = vfma_lane_f16(v25, vb2, dv2, 1); \
+ v26 = vfma_lane_f16(v26, vb2, dv2, 2); \
+ v27 = vfma_lane_f16(v27, vb2, dv2, 3); \
+ dv3 = vld1_f16(a + 4 * 3); \
+ vb3 = vld1_f16(b + 4 * 3); \
+ v24 = vfma_lane_f16(v24, vb3, dv3, 0); \
+ v25 = vfma_lane_f16(v25, vb3, dv3, 1); \
+ v26 = vfma_lane_f16(v26, vb3, dv3, 2); \
+ v27 = vfma_lane_f16(v27, vb3, dv3, 3); \
+ dv4 = vld1_f16(a + 4 * 4); \
+ vb4 = vld1_f16(b + 4 * 4); \
+ v24 = vfma_lane_f16(v24, vb4, dv4, 0); \
+ v25 = vfma_lane_f16(v25, vb4, dv4, 1); \
+ v26 = vfma_lane_f16(v26, vb4, dv4, 2); \
+ v27 = vfma_lane_f16(v27, vb4, dv4, 3); \
+ dv5 = vld1_f16(a + 4 * 5); \
+ vb5 = vld1_f16(b + 4 * 5); \
+ v24 = vfma_lane_f16(v24, vb5, dv5, 0); \
+ v25 = vfma_lane_f16(v25, vb5, dv5, 1); \
+ v26 = vfma_lane_f16(v26, vb5, dv5, 2); \
+ v27 = vfma_lane_f16(v27, vb5, dv5, 3); \
+ dv6 = vld1_f16(a + 4 * 6); \
+ vb6 = vld1_f16(b + 4 * 6); \
+ v24 = vfma_lane_f16(v24, vb6, dv6, 0); \
+ v25 = vfma_lane_f16(v25, vb6, dv6, 1); \
+ v26 = vfma_lane_f16(v26, vb6, dv6, 2); \
+ v27 = vfma_lane_f16(v27, vb6, dv6, 3); \
+ dv7 = vld1_f16(a + 4 * 7); \
+ vb7 = vld1_f16(b + 4 * 7); \
+ v24 = vfma_lane_f16(v24, vb7, dv7, 0); \
+ v25 = vfma_lane_f16(v25, vb7, dv7, 1); \
+ v26 = vfma_lane_f16(v26, vb7, dv7, 2); \
+ v27 = vfma_lane_f16(v27, vb7, dv7, 3); \
+ dv7 = vld1_f16(a + 4 * 8); \
+ vb7 = vld1_f16(b + 4 * 8); \
+ v24 = vfma_lane_f16(v24, vb7, dv7, 0); \
+ v25 = vfma_lane_f16(v25, vb7, dv7, 1); \
+ v26 = vfma_lane_f16(v26, vb7, dv7, 2); \
+ v27 = vfma_lane_f16(v27, vb7, dv7, 3); \
+ dv7 = vld1_f16(a + 4 * 9); \
+ vb7 = vld1_f16(b + 4 * 9); \
+ v24 = vfma_lane_f16(v24, vb7, dv7, 0); \
+ v25 = vfma_lane_f16(v25, vb7, dv7, 1); \
+ v26 = vfma_lane_f16(v26, vb7, dv7, 2); \
+ v27 = vfma_lane_f16(v27, vb7, dv7, 3); \
+ dv7 = vld1_f16(a + 4 * 10); \
+ vb7 = vld1_f16(b + 4 * 10); \
+ v24 = vfma_lane_f16(v24, vb7, dv7, 0); \
+ v25 = vfma_lane_f16(v25, vb7, dv7, 1); \
+ v26 = vfma_lane_f16(v26, vb7, dv7, 2); \
+ v27 = vfma_lane_f16(v27, vb7, dv7, 3); \
+ dv7 = vld1_f16(a + 4 * 11); \
+ vb7 = vld1_f16(b + 4 * 11); \
+ v24 = vfma_lane_f16(v24, vb7, dv7, 0); \
+ v25 = vfma_lane_f16(v25, vb7, dv7, 1); \
+ v26 = vfma_lane_f16(v26, vb7, dv7, 2); \
+ v27 = vfma_lane_f16(v27, vb7, dv7, 3); \
+ dv7 = vld1_f16(a + 4 * 12); \
+ vb7 = vld1_f16(b + 4 * 12); \
+ v24 = vfma_lane_f16(v24, vb7, dv7, 0); \
+ v25 = vfma_lane_f16(v25, vb7, dv7, 1); \
+ v26 = vfma_lane_f16(v26, vb7, dv7, 2); \
+ v27 = vfma_lane_f16(v27, vb7, dv7, 3); \
+ dv7 = vld1_f16(a + 4 * 13); \
+ vb7 = vld1_f16(b + 4 * 13); \
+ v24 = vfma_lane_f16(v24, vb7, dv7, 0); \
+ v25 = vfma_lane_f16(v25, vb7, dv7, 1); \
+ v26 = vfma_lane_f16(v26, vb7, dv7, 2); \
+ v27 = vfma_lane_f16(v27, vb7, dv7, 3); \
+ dv7 = vld1_f16(a + 4 * 14); \
+ vb7 = vld1_f16(b + 4 * 14); \
+ v24 = vfma_lane_f16(v24, vb7, dv7, 0); \
+ v25 = vfma_lane_f16(v25, vb7, dv7, 1); \
+ v26 = vfma_lane_f16(v26, vb7, dv7, 2); \
+ v27 = vfma_lane_f16(v27, vb7, dv7, 3); \
+ dv7 = vld1_f16(a + 4 * 15); \
+ vb7 = vld1_f16(b + 4 * 15); \
+ v24 = vfma_lane_f16(v24, vb7, dv7, 0); \
+ v25 = vfma_lane_f16(v25, vb7, dv7, 1); \
+ v26 = vfma_lane_f16(v26, vb7, dv7, 2); \
+ v27 = vfma_lane_f16(v27, vb7, dv7, 3); \
+ l += 16; \
+ __builtin_prefetch(b + 64, 0, 3); \
+ __builtin_prefetch(a + 64, 0, 3); \
+ b += 4 * 16; \
+ a += 4 * 16;
+
+// 2. Partial sum 128 digits
+#define KERNEL_4x4_ACC8() \
+ dv0 = vld1_f16(a); \
+ vb0 = vld1_f16(b); \
+ v24 = vfma_lane_f16(v24, vb0, dv0, 0); \
+ v25 = vfma_lane_f16(v25, vb0, dv0, 1); \
+ v26 = vfma_lane_f16(v26, vb0, dv0, 2); \
+ v27 = vfma_lane_f16(v27, vb0, dv0, 3); \
+ dv1 = vld1_f16(a + 4); \
+ vb1 = vld1_f16(b + 4); \
+ v24 = vfma_lane_f16(v24, vb1, dv1, 0); \
+ v25 = vfma_lane_f16(v25, vb1, dv1, 1); \
+ v26 = vfma_lane_f16(v26, vb1, dv1, 2); \
+ v27 = vfma_lane_f16(v27, vb1, dv1, 3); \
+ dv2 = vld1_f16(a + 8); \
+ vb2 = vld1_f16(b + 8); \
+ v24 = vfma_lane_f16(v24, vb2, dv2, 0); \
+ v25 = vfma_lane_f16(v25, vb2, dv2, 1); \
+ v26 = vfma_lane_f16(v26, vb2, dv2, 2); \
+ v27 = vfma_lane_f16(v27, vb2, dv2, 3); \
+ dv3 = vld1_f16(a + 12); \
+ vb3 = vld1_f16(b + 12); \
+ v24 = vfma_lane_f16(v24, vb3, dv3, 0); \
+ v25 = vfma_lane_f16(v25, vb3, dv3, 1); \
+ v26 = vfma_lane_f16(v26, vb3, dv3, 2); \
+ v27 = vfma_lane_f16(v27, vb3, dv3, 3); \
+ dv4 = vld1_f16(a + 16); \
+ vb4 = vld1_f16(b + 16); \
+ v24 = vfma_lane_f16(v24, vb4, dv4, 0); \
+ v25 = vfma_lane_f16(v25, vb4, dv4, 1); \
+ v26 = vfma_lane_f16(v26, vb4, dv4, 2); \
+ v27 = vfma_lane_f16(v27, vb4, dv4, 3); \
+ dv5 = vld1_f16(a + 20); \
+ vb5 = vld1_f16(b + 20); \
+ v24 = vfma_lane_f16(v24, vb5, dv5, 0); \
+ v25 = vfma_lane_f16(v25, vb5, dv5, 1); \
+ v26 = vfma_lane_f16(v26, vb5, dv5, 2); \
+ v27 = vfma_lane_f16(v27, vb5, dv5, 3); \
+ dv6 = vld1_f16(a + 24); \
+ vb6 = vld1_f16(b + 24); \
+ v24 = vfma_lane_f16(v24, vb6, dv6, 0); \
+ v25 = vfma_lane_f16(v25, vb6, dv6, 1); \
+ v26 = vfma_lane_f16(v26, vb6, dv6, 2); \
+ v27 = vfma_lane_f16(v27, vb6, dv6, 3); \
+ dv7 = vld1_f16(a + 28); \
+ vb7 = vld1_f16(b + 28); \
+ v24 = vfma_lane_f16(v24, vb7, dv7, 0); \
+ v25 = vfma_lane_f16(v25, vb7, dv7, 1); \
+ v26 = vfma_lane_f16(v26, vb7, dv7, 2); \
+ v27 = vfma_lane_f16(v27, vb7, dv7, 3); \
+ l += 8; \
+ __builtin_prefetch(b + 32, 0, 3); \
+ __builtin_prefetch(a + 32, 0, 3); \
+ b += 4 * 8; \
+ a += 4 * 8;
+
+// 3. Partial sum 16 digits
+#define KERNEL_4x4_ACC1() \
+ dv0 = vld1_f16(a); \
+ vb0 = vld1_f16(b); \
+ v24 = vfma_lane_f16(v24, vb0, dv0, 0); \
+ v25 = vfma_lane_f16(v25, vb0, dv0, 1); \
+ v26 = vfma_lane_f16(v26, vb0, dv0, 2); \
+ v27 = vfma_lane_f16(v27, vb0, dv0, 3); \
+ l += 1; \
+ __builtin_prefetch(b + 4, 0, 3); \
+ __builtin_prefetch(a + 4, 0, 3); \
+ b += 4 * 1; \
+ a += 4 * 1;
+
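+// Widen the 4x4 f16 accumulator tile to f32 and accumulate it into four rows of C.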
+#define SAVE_KERNEL_4X4_F16_F32() \
+ vst1q_f32(c, vaddq_f32(vld1q_f32(c), vcvt_f32_f16(v24))); \
+ vst1q_f32(c + ldc, vaddq_f32(vld1q_f32(c + ldc), vcvt_f32_f16(v25))); \
+ vst1q_f32(c + 2 * ldc, \
+ vaddq_f32(vld1q_f32(c + 2 * ldc), vcvt_f32_f16(v26))); \
+ vst1q_f32(c + 3 * ldc, vaddq_f32(vld1q_f32(c + 3 * ldc), vcvt_f32_f16(v27)));
+
/**
* @brief hgemm 4x4 kernel sc = sa * sb
*
@@ -37,10 +224,11 @@ void hgemm_kernel_4x4(unsigned int M, unsigned int N, unsigned int K,
__builtin_prefetch(b, 0, 3);
__builtin_prefetch(a, 0, 3);
- float16x4_t v24 = {0};
- float16x4_t v25 = {0};
- float16x4_t v26 = {0};
- float16x4_t v27 = {0};
+ float16x4_t v24;
+ float16x4_t v25;
+ float16x4_t v26;
+ float16x4_t v27;
+ INIT_KERNEL_4x4();
for (l = 0; l < K; l += VL_FP16_HALF) {
float16x4_t v0 = vld1_f16(b);
@@ -101,3 +289,59 @@ void hgemm_kernel_4x4(unsigned int M, unsigned int N, unsigned int K,
b = sb;
}
}
+
+/**
+ * @brief hgemm 4x4 kernel sc = sa * sb
+ *
+ * @param m length of the row of matrix A
+ * @param n length of the col of matrix B
+ * @param k length of the col of matrix A
+ * @param sa sub-matrix of input matrix A
+ * @param sb sub-matrix of input matrix B
+ * @param sc sub-matrix of output matrix C
+ * @param ldc leading dimension of matrix C
+ */
+void hgemm_kernel_4x4(unsigned int M, unsigned int N, unsigned int K,
+ __fp16 *sa, __fp16 *sb, float *sc, unsigned int ldc) {
+ assert(M > 0 && N > 0 && K > 0);
+ assert(M % 4 == 0 && N % 4 == 0 && K % 4 == 0);
+
+ __fp16 *a = sa, *b = sb;
+ float *c = sc;
+ unsigned int i, j, l;
+ unsigned int K16 = (K >> 4) << 4;
+ unsigned int K8 = (K >> 3) << 3;
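+ // K16 / K8 round K down to multiples of 16 / 8. Each chunk below re-initializes
+ // the f16 accumulators and flushes them into the f32 output, bounding how many
+ // partial sums are kept in half precision.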
+ for (i = 0; i < M; i += VL_FP16_HALF) {
+ for (j = 0; j < N; j += VL_FP16_HALF) {
+ __builtin_prefetch(b, 0, 3);
+ __builtin_prefetch(a, 0, 3);
+
+ float16x4_t v24, v25, v26, v27;
+ float16x4_t dv0, dv1, dv2, dv3, dv4, dv5, dv6, dv7;
+ float16x4_t vb0, vb1, vb2, vb3, vb4, vb5, vb6, vb7;
+ l = 0;
+ for (; l < K16;) {
+ INIT_KERNEL_4x4();
+ KERNEL_4x4_ACC16();
+ SAVE_KERNEL_4X4_F16_F32();
+ }
+ for (; l < K8;) {
+ INIT_KERNEL_4x4();
+ KERNEL_4x4_ACC8();
+ SAVE_KERNEL_4X4_F16_F32();
+ }
+ for (; l < K;) {
+ INIT_KERNEL_4x4();
+ KERNEL_4x4_ACC1();
+ SAVE_KERNEL_4X4_F16_F32();
+ }
+
+ c += 4;
+ a -= 4 * K;
+ }
+ sc += ldc * 4;
+ c = sc;
+ a += 4 * K;
+ b = sb;
+ }
+}
diff --git a/nntrainer/tensor/hgemm/hgemm_kernel_4x8.h b/nntrainer/tensor/hgemm/hgemm_kernel_4x8.h
index dce6659934..01204457e9 100644
--- a/nntrainer/tensor/hgemm/hgemm_kernel_4x8.h
+++ b/nntrainer/tensor/hgemm/hgemm_kernel_4x8.h
@@ -14,15 +14,118 @@
#include
#include
-/// @note Following KERNELs are the combinations of accuracy-latency
-/// tradeoff. User can select which kernel to use by replacing them.
+#define INIT_KERNEL_4X8() \
+ v0 = vdupq_n_f16(0.F); \
+ v3 = vdupq_n_f16(0.F); \
+ v6 = vdupq_n_f16(0.F); \
+ v9 = vdupq_n_f16(0.F);
-// 1. Partial sum 256 digits : worst accuracy, best latency
+// 1. Partial sum 512 digits
+#define KERNEL_4x8_ACC16() \
+ dv0 = vld1_f16(a); \
+ v24 = vld1q_f16(b); \
+ v0 = vfmaq_lane_f16(v0, v24, dv0, 0); \
+ v3 = vfmaq_lane_f16(v3, v24, dv0, 1); \
+ v6 = vfmaq_lane_f16(v6, v24, dv0, 2); \
+ v9 = vfmaq_lane_f16(v9, v24, dv0, 3); \
+ dv1 = vld1_f16(a + 4); \
+ v25 = vld1q_f16(b + 8); \
+ v0 = vfmaq_lane_f16(v0, v25, dv1, 0); \
+ v3 = vfmaq_lane_f16(v3, v25, dv1, 1); \
+ v6 = vfmaq_lane_f16(v6, v25, dv1, 2); \
+ v9 = vfmaq_lane_f16(v9, v25, dv1, 3); \
+ dv2 = vld1_f16(a + 4 * 2); \
+ v26 = vld1q_f16(b + 8 * 2); \
+ v0 = vfmaq_lane_f16(v0, v26, dv2, 0); \
+ v3 = vfmaq_lane_f16(v3, v26, dv2, 1); \
+ v6 = vfmaq_lane_f16(v6, v26, dv2, 2); \
+ v9 = vfmaq_lane_f16(v9, v26, dv2, 3); \
+ dv3 = vld1_f16(a + 4 * 3); \
+ v27 = vld1q_f16(b + 8 * 3); \
+ v0 = vfmaq_lane_f16(v0, v27, dv3, 0); \
+ v3 = vfmaq_lane_f16(v3, v27, dv3, 1); \
+ v6 = vfmaq_lane_f16(v6, v27, dv3, 2); \
+ v9 = vfmaq_lane_f16(v9, v27, dv3, 3); \
+ dv4 = vld1_f16(a + 4 * 4); \
+ v28 = vld1q_f16(b + 8 * 4); \
+ v0 = vfmaq_lane_f16(v0, v28, dv4, 0); \
+ v3 = vfmaq_lane_f16(v3, v28, dv4, 1); \
+ v6 = vfmaq_lane_f16(v6, v28, dv4, 2); \
+ v9 = vfmaq_lane_f16(v9, v28, dv4, 3); \
+ dv5 = vld1_f16(a + 4 * 5); \
+ v29 = vld1q_f16(b + 8 * 5); \
+ v0 = vfmaq_lane_f16(v0, v29, dv5, 0); \
+ v3 = vfmaq_lane_f16(v3, v29, dv5, 1); \
+ v6 = vfmaq_lane_f16(v6, v29, dv5, 2); \
+ v9 = vfmaq_lane_f16(v9, v29, dv5, 3); \
+ dv6 = vld1_f16(a + 4 * 6); \
+ v30 = vld1q_f16(b + 8 * 6); \
+ v0 = vfmaq_lane_f16(v0, v30, dv6, 0); \
+ v3 = vfmaq_lane_f16(v3, v30, dv6, 1); \
+ v6 = vfmaq_lane_f16(v6, v30, dv6, 2); \
+ v9 = vfmaq_lane_f16(v9, v30, dv6, 3); \
+ dv7 = vld1_f16(a + 4 * 7); \
+ v31 = vld1q_f16(b + 8 * 7); \
+ v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \
+ v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \
+ v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \
+ v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \
+ dv7 = vld1_f16(a + 4 * 8); \
+ v31 = vld1q_f16(b + 8 * 8); \
+ v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \
+ v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \
+ v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \
+ v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \
+ dv7 = vld1_f16(a + 4 * 9); \
+ v31 = vld1q_f16(b + 8 * 9); \
+ v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \
+ v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \
+ v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \
+ v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \
+ dv7 = vld1_f16(a + 4 * 10); \
+ v31 = vld1q_f16(b + 8 * 10); \
+ v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \
+ v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \
+ v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \
+ v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \
+ dv7 = vld1_f16(a + 4 * 11); \
+ v31 = vld1q_f16(b + 8 * 11); \
+ v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \
+ v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \
+ v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \
+ v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \
+ dv7 = vld1_f16(a + 4 * 12); \
+ v31 = vld1q_f16(b + 8 * 12); \
+ v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \
+ v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \
+ v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \
+ v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \
+ dv7 = vld1_f16(a + 4 * 13); \
+ v31 = vld1q_f16(b + 8 * 13); \
+ v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \
+ v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \
+ v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \
+ v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \
+ dv7 = vld1_f16(a + 4 * 14); \
+ v31 = vld1q_f16(b + 8 * 14); \
+ v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \
+ v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \
+ v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \
+ v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \
+ dv7 = vld1_f16(a + 4 * 15); \
+ v31 = vld1q_f16(b + 8 * 15); \
+ v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \
+ v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \
+ v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \
+ v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \
+ l += 16; \
+ __builtin_prefetch(b + 128, 0, 3); \
+ __builtin_prefetch(a + 64, 0, 3); \
+ b += 8 * 16; \
+ a += 4 * 16;
+
+// 2. Partial sum 256 digits
#define KERNEL_4x8_ACC8() \
- v0 = vdupq_n_f16(0.F); \
- v3 = vdupq_n_f16(0.F); \
- v6 = vdupq_n_f16(0.F); \
- v9 = vdupq_n_f16(0.F); \
dv0 = vld1_f16(a); \
v24 = vld1q_f16(b); \
v0 = vfmaq_lane_f16(v0, v24, dv0, 0); \
@@ -77,12 +180,8 @@
b += 8 * 8; \
a += 4 * 8;
-// 2. Partial sum 128 digits : medium accuracy, medium latency
+// 3. Partial sum 128 digits
#define KERNEL_4x8_ACC4() \
- v0 = vdupq_n_f16(0.F); \
- v3 = vdupq_n_f16(0.F); \
- v6 = vdupq_n_f16(0.F); \
- v9 = vdupq_n_f16(0.F); \
dv0 = vld1_f16(a); \
v24 = vld1q_f16(b); \
v0 = vfmaq_lane_f16(v0, v24, dv0, 0); \
@@ -113,12 +212,8 @@
b += 8 * 4; \
a += 4 * 4;
-// 3. Partial sum 32 digits : Best accuracy, worst latency
+// 4. Partial sum 32 digits
#define KERNEL_4x8_ACC1() \
- v0 = vdupq_n_f16(0.F); \
- v3 = vdupq_n_f16(0.F); \
- v6 = vdupq_n_f16(0.F); \
- v9 = vdupq_n_f16(0.F); \
dv0 = vld1_f16(a); \
v24 = vld1q_f16(b); \
v0 = vfmaq_lane_f16(v0, v24, dv0, 0); \
@@ -131,6 +226,24 @@
b += 8 * 1; \
a += 4 * 1;
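+// Widen each 4x8 f16 accumulator row (low / high halves) to f32 and accumulate it
+// into the corresponding row of C.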
+#define SAVE_KERNEL_4X8_F16_F32() \
+ vst1q_f32(c, vaddq_f32(vld1q_f32(c), vcvt_f32_f16(vget_low_f16(v0)))); \
+ vst1q_f32(c + ldc, \
+ vaddq_f32(vld1q_f32(c + ldc), vcvt_f32_f16(vget_low_f16(v3)))); \
+ vst1q_f32(c + 2 * ldc, vaddq_f32(vld1q_f32(c + 2 * ldc), \
+ vcvt_f32_f16(vget_low_f16(v6)))); \
+ vst1q_f32(c + 3 * ldc, vaddq_f32(vld1q_f32(c + 3 * ldc), \
+ vcvt_f32_f16(vget_low_f16(v9)))); \
+ \
+ vst1q_f32(c + 4, \
+ vaddq_f32(vld1q_f32(c + 4), vcvt_f32_f16(vget_high_f16(v0)))); \
+ vst1q_f32(c + 4 + ldc, vaddq_f32(vld1q_f32(c + 4 + ldc), \
+ vcvt_f32_f16(vget_high_f16(v3)))); \
+ vst1q_f32(c + 4 + 2 * ldc, vaddq_f32(vld1q_f32(c + 4 + 2 * ldc), \
+ vcvt_f32_f16(vget_high_f16(v6)))); \
+ vst1q_f32(c + 4 + 3 * ldc, vaddq_f32(vld1q_f32(c + 4 + 3 * ldc), \
+ vcvt_f32_f16(vget_high_f16(v9))));
+
/**
* @brief hgemm 4x8 kernel sc = sa * sb
*
@@ -148,7 +261,7 @@ void hgemm_kernel_4x8(unsigned int M, unsigned int N, unsigned int K,
assert(M % 4 == 0 && N % 8 == 0);
__fp16 *a = sa, *b = sb, *c = sc;
- unsigned int k8 = (K >> 3) << 3;
+ unsigned int K8 = (K >> 3) << 3;
unsigned int i, j, l;
for (i = 0; i < M; i += 4) {
for (j = 0; j < N; j += 8) {
@@ -157,23 +270,18 @@ void hgemm_kernel_4x8(unsigned int M, unsigned int N, unsigned int K,
float16x8_t v0, v3, v6, v9;
float16x8_t v24, v25, v26, v27, v28, v29, v30, v31;
float16x4_t dv0, dv1, dv2, dv3, dv4, dv5, dv6, dv7;
+ INIT_KERNEL_4X8();
l = 0;
- for (; l < k8;) {
+ for (; l < K8;) {
KERNEL_4x8_ACC8();
-
- vst1q_f16(c, vaddq_f16(vld1q_f16(c), v0));
- vst1q_f16(c + ldc, vaddq_f16(vld1q_f16(c + ldc), v3));
- vst1q_f16(c + 2 * ldc, vaddq_f16(vld1q_f16(c + 2 * ldc), v6));
- vst1q_f16(c + 3 * ldc, vaddq_f16(vld1q_f16(c + 3 * ldc), v9));
}
for (; l < K;) {
KERNEL_4x8_ACC1();
-
- vst1q_f16(c, vaddq_f16(vld1q_f16(c), v0));
- vst1q_f16(c + ldc, vaddq_f16(vld1q_f16(c + ldc), v3));
- vst1q_f16(c + 2 * ldc, vaddq_f16(vld1q_f16(c + 2 * ldc), v6));
- vst1q_f16(c + 3 * ldc, vaddq_f16(vld1q_f16(c + 3 * ldc), v9));
}
+ vst1q_f16(c, vaddq_f16(vld1q_f16(c), v0));
+ vst1q_f16(c + ldc, vaddq_f16(vld1q_f16(c + ldc), v3));
+ vst1q_f16(c + 2 * ldc, vaddq_f16(vld1q_f16(c + 2 * ldc), v6));
+ vst1q_f16(c + 3 * ldc, vaddq_f16(vld1q_f16(c + 3 * ldc), v9));
c += 8;
a -= 4 * K;
}
@@ -202,7 +310,9 @@ void hgemm_kernel_4x8(unsigned int M, unsigned int N, unsigned int K,
__fp16 *a = sa, *b = sb;
float *c = sc;
- unsigned int k8 = (K >> 3) << 3;
+ unsigned int K16 = (K >> 4) << 4;
+ unsigned int K8 = (K >> 3) << 3;
+ unsigned int K4 = (K >> 2) << 2;
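+ // Consume K in chunks of 16, 8, 4, then 1, flushing the f16 accumulators into the
+ // f32 output after every chunk to limit half-precision accumulation error.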
unsigned int i, j, l;
for (i = 0; i < M; i += 4) {
for (j = 0; j < N; j += 8) {
@@ -212,45 +322,25 @@ void hgemm_kernel_4x8(unsigned int M, unsigned int N, unsigned int K,
float16x8_t v24, v25, v26, v27, v28, v29, v30, v31;
float16x4_t dv0, dv1, dv2, dv3, dv4, dv5, dv6, dv7;
l = 0;
- for (; l < k8;) {
+ for (; l < K16;) {
+ INIT_KERNEL_4X8();
+ KERNEL_4x8_ACC16();
+ SAVE_KERNEL_4X8_F16_F32();
+ }
+ for (; l < K8;) {
+ INIT_KERNEL_4X8();
KERNEL_4x8_ACC8();
-
- vst1q_f32(c, vaddq_f32(vld1q_f32(c), vcvt_f32_f16(vget_low_f16(v0))));
- vst1q_f32(c + ldc, vaddq_f32(vld1q_f32(c + ldc),
- vcvt_f32_f16(vget_low_f16(v3))));
- vst1q_f32(c + 2 * ldc, vaddq_f32(vld1q_f32(c + 2 * ldc),
- vcvt_f32_f16(vget_low_f16(v6))));
- vst1q_f32(c + 3 * ldc, vaddq_f32(vld1q_f32(c + 3 * ldc),
- vcvt_f32_f16(vget_low_f16(v9))));
-
- vst1q_f32(c + 4,
- vaddq_f32(vld1q_f32(c + 4), vcvt_f32_f16(vget_high_f16(v0))));
- vst1q_f32(c + 4 + ldc, vaddq_f32(vld1q_f32(c + 4 + ldc),
- vcvt_f32_f16(vget_high_f16(v3))));
- vst1q_f32(c + 4 + 2 * ldc, vaddq_f32(vld1q_f32(c + 4 + 2 * ldc),
- vcvt_f32_f16(vget_high_f16(v6))));
- vst1q_f32(c + 4 + 3 * ldc, vaddq_f32(vld1q_f32(c + 4 + 3 * ldc),
- vcvt_f32_f16(vget_high_f16(v9))));
+ SAVE_KERNEL_4X8_F16_F32();
+ }
+ for (; l < K4;) {
+ INIT_KERNEL_4X8();
+ KERNEL_4x8_ACC4();
+ SAVE_KERNEL_4X8_F16_F32();
}
for (; l < K;) {
+ INIT_KERNEL_4X8();
KERNEL_4x8_ACC1();
-
- vst1q_f32(c, vaddq_f32(vld1q_f32(c), vcvt_f32_f16(vget_low_f16(v0))));
- vst1q_f32(c + ldc, vaddq_f32(vld1q_f32(c + ldc),
- vcvt_f32_f16(vget_low_f16(v3))));
- vst1q_f32(c + 2 * ldc, vaddq_f32(vld1q_f32(c + 2 * ldc),
- vcvt_f32_f16(vget_low_f16(v6))));
- vst1q_f32(c + 3 * ldc, vaddq_f32(vld1q_f32(c + 3 * ldc),
- vcvt_f32_f16(vget_low_f16(v9))));
-
- vst1q_f32(c + 4,
- vaddq_f32(vld1q_f32(c + 4), vcvt_f32_f16(vget_high_f16(v0))));
- vst1q_f32(c + 4 + ldc, vaddq_f32(vld1q_f32(c + 4 + ldc),
- vcvt_f32_f16(vget_high_f16(v3))));
- vst1q_f32(c + 4 + 2 * ldc, vaddq_f32(vld1q_f32(c + 4 + 2 * ldc),
- vcvt_f32_f16(vget_high_f16(v6))));
- vst1q_f32(c + 4 + 3 * ldc, vaddq_f32(vld1q_f32(c + 4 + 3 * ldc),
- vcvt_f32_f16(vget_high_f16(v9))));
+ SAVE_KERNEL_4X8_F16_F32();
}
c += 8;
a -= 4 * K;
diff --git a/nntrainer/tensor/hgemm/hgemm_kernel_8x16.h b/nntrainer/tensor/hgemm/hgemm_kernel_8x16.h
index 7cac545809..a89a6b5421 100644
--- a/nntrainer/tensor/hgemm/hgemm_kernel_8x16.h
+++ b/nntrainer/tensor/hgemm/hgemm_kernel_8x16.h
@@ -14,27 +14,338 @@
#include
#include
-/// @note Following KERNELs are the combinations of accuracy-latency
-/// tradeoff. User can select which kernel to use by replacing them.
+#define INIT_KERNEL_8X16() \
+ v0_7 = vdupq_n_f16(0.F); \
+ v8_15 = vdupq_n_f16(0.F); \
+ v16_23 = vdupq_n_f16(0.F); \
+ v24_31 = vdupq_n_f16(0.F); \
+ v32_39 = vdupq_n_f16(0.F); \
+ v40_47 = vdupq_n_f16(0.F); \
+ v48_55 = vdupq_n_f16(0.F); \
+ v56_63 = vdupq_n_f16(0.F); \
+ v64_71 = vdupq_n_f16(0.F); \
+ v72_79 = vdupq_n_f16(0.F); \
+ v80_87 = vdupq_n_f16(0.F); \
+ v88_95 = vdupq_n_f16(0.F); \
+ v96_103 = vdupq_n_f16(0.F); \
+ v104_111 = vdupq_n_f16(0.F); \
+ v112_119 = vdupq_n_f16(0.F); \
+ v120_127 = vdupq_n_f16(0.F);
-// 1. Partial sum 1024 digits : Worst accuracy, best latency
+// 1. Partial sum 2048 digits
+#define KERNEL_8x16_ACC16() \
+ va0 = vld1q_f16(a); \
+ v24 = vld1q_f16(b); \
+ v25 = vld1q_f16(b + 8); \
+ v0_7 = vfmaq_laneq_f16(v0_7, v24, va0, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, v24, va0, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, v24, va0, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, v24, va0, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, v24, va0, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, v24, va0, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, v24, va0, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, v24, va0, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, v25, va0, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, v25, va0, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, v25, va0, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, v25, va0, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, v25, va0, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, v25, va0, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, v25, va0, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, v25, va0, 7); \
+ va1 = vld1q_f16(a + 8); \
+ v26 = vld1q_f16(b + 8 * 2); \
+ v27 = vld1q_f16(b + 8 * 3); \
+ v0_7 = vfmaq_laneq_f16(v0_7, v26, va1, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, v26, va1, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, v26, va1, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, v26, va1, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, v26, va1, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, v26, va1, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, v26, va1, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, v26, va1, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, v27, va1, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, v27, va1, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, v27, va1, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, v27, va1, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, v27, va1, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, v27, va1, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, v27, va1, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, v27, va1, 7); \
+ va2 = vld1q_f16(a + 8 * 2); \
+ v28 = vld1q_f16(b + 8 * 4); \
+ v29 = vld1q_f16(b + 8 * 5); \
+ v0_7 = vfmaq_laneq_f16(v0_7, v28, va2, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, v28, va2, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, v28, va2, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, v28, va2, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, v28, va2, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, v28, va2, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, v28, va2, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, v28, va2, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, v29, va2, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, v29, va2, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, v29, va2, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, v29, va2, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, v29, va2, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, v29, va2, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, v29, va2, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, v29, va2, 7); \
+ va3 = vld1q_f16(a + 8 * 3); \
+ v30 = vld1q_f16(b + 8 * 6); \
+ v31 = vld1q_f16(b + 8 * 7); \
+ v0_7 = vfmaq_laneq_f16(v0_7, v30, va3, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, v30, va3, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, v30, va3, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, v30, va3, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, v30, va3, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, v30, va3, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, v30, va3, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, v30, va3, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, v31, va3, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, v31, va3, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, v31, va3, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, v31, va3, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, v31, va3, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, v31, va3, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, v31, va3, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, v31, va3, 7); \
+ va4 = vld1q_f16(a + 8 * 4); \
+ v24 = vld1q_f16(b + 8 * 8); \
+ v25 = vld1q_f16(b + 8 * 9); \
+ v0_7 = vfmaq_laneq_f16(v0_7, v24, va4, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, v24, va4, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, v24, va4, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, v24, va4, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, v24, va4, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, v24, va4, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, v24, va4, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, v24, va4, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, v25, va4, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, v25, va4, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, v25, va4, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, v25, va4, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, v25, va4, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, v25, va4, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, v25, va4, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, v25, va4, 7); \
+ va5 = vld1q_f16(a + 8 * 5); \
+ v26 = vld1q_f16(b + 8 * 10); \
+ v27 = vld1q_f16(b + 8 * 11); \
+ v0_7 = vfmaq_laneq_f16(v0_7, v26, va5, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, v26, va5, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, v26, va5, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, v26, va5, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, v26, va5, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, v26, va5, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, v26, va5, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, v26, va5, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, v27, va5, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, v27, va5, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, v27, va5, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, v27, va5, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, v27, va5, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, v27, va5, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, v27, va5, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, v27, va5, 7); \
+ va6 = vld1q_f16(a + 8 * 6); \
+ v28 = vld1q_f16(b + 8 * 12); \
+ v29 = vld1q_f16(b + 8 * 13); \
+ v0_7 = vfmaq_laneq_f16(v0_7, v28, va6, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, v28, va6, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, v28, va6, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, v28, va6, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, v28, va6, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, v28, va6, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, v28, va6, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, v28, va6, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, v29, va6, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, v29, va6, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, v29, va6, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, v29, va6, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, v29, va6, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, v29, va6, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, v29, va6, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, v29, va6, 7); \
+ va7 = vld1q_f16(a + 8 * 7); \
+ v30 = vld1q_f16(b + 8 * 14); \
+ v31 = vld1q_f16(b + 8 * 15); \
+ v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, v30, va7, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, v30, va7, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, v31, va7, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \
+ va7 = vld1q_f16(a + 8 * 8); \
+ v30 = vld1q_f16(b + 8 * 16); \
+ v31 = vld1q_f16(b + 8 * 17); \
+ v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, v30, va7, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, v30, va7, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, v31, va7, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \
+ va7 = vld1q_f16(a + 8 * 9); \
+ v30 = vld1q_f16(b + 8 * 18); \
+ v31 = vld1q_f16(b + 8 * 19); \
+ v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, v30, va7, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, v30, va7, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, v31, va7, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \
+ va7 = vld1q_f16(a + 8 * 10); \
+ v30 = vld1q_f16(b + 8 * 20); \
+ v31 = vld1q_f16(b + 8 * 21); \
+ v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, v30, va7, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, v30, va7, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, v31, va7, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \
+ va7 = vld1q_f16(a + 8 * 11); \
+ v30 = vld1q_f16(b + 8 * 22); \
+ v31 = vld1q_f16(b + 8 * 23); \
+ v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, v30, va7, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, v30, va7, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, v31, va7, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \
+ va7 = vld1q_f16(a + 8 * 12); \
+ v30 = vld1q_f16(b + 8 * 24); \
+ v31 = vld1q_f16(b + 8 * 25); \
+ v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, v30, va7, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, v30, va7, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, v31, va7, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \
+ va7 = vld1q_f16(a + 8 * 13); \
+ v30 = vld1q_f16(b + 8 * 26); \
+ v31 = vld1q_f16(b + 8 * 27); \
+ v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, v30, va7, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, v30, va7, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, v31, va7, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \
+ va7 = vld1q_f16(a + 8 * 14); \
+ v30 = vld1q_f16(b + 8 * 28); \
+ v31 = vld1q_f16(b + 8 * 29); \
+ v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, v30, va7, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, v30, va7, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, v31, va7, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \
+ va7 = vld1q_f16(a + 8 * 15); \
+ v30 = vld1q_f16(b + 8 * 30); \
+ v31 = vld1q_f16(b + 8 * 31); \
+ v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \
+ v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \
+ v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \
+ v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \
+ v32_39 = vfmaq_laneq_f16(v32_39, v30, va7, 4); \
+ v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \
+ v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \
+ v56_63 = vfmaq_laneq_f16(v56_63, v30, va7, 7); \
+ v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \
+ v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \
+ v80_87 = vfmaq_laneq_f16(v80_87, v31, va7, 2); \
+ v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \
+ v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \
+ v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \
+ v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \
+ v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \
+ l += 16; \
+ __builtin_prefetch(b + 256, 0, 3); \
+ __builtin_prefetch(a + 128, 0, 3); \
+ b += 16 * 16; \
+ a += 8 * 16;
+
+// 2. Partial sum 1024 digits
#define KERNEL_8x16_ACC8() \
- v0_7 = vdupq_n_f16(0.F); \
- v8_15 = vdupq_n_f16(0.F); \
- v16_23 = vdupq_n_f16(0.F); \
- v24_31 = vdupq_n_f16(0.F); \
- v32_39 = vdupq_n_f16(0.F); \
- v40_47 = vdupq_n_f16(0.F); \
- v48_55 = vdupq_n_f16(0.F); \
- v56_63 = vdupq_n_f16(0.F); \
- v64_71 = vdupq_n_f16(0.F); \
- v72_79 = vdupq_n_f16(0.F); \
- v80_87 = vdupq_n_f16(0.F); \
- v88_95 = vdupq_n_f16(0.F); \
- v96_103 = vdupq_n_f16(0.F); \
- v104_111 = vdupq_n_f16(0.F); \
- v112_119 = vdupq_n_f16(0.F); \
- v120_127 = vdupq_n_f16(0.F); \
va0 = vld1q_f16(a); \
v24 = vld1q_f16(b); \
v25 = vld1q_f16(b + 8); \
@@ -193,24 +504,8 @@
b += 16 * 8; \
a += 8 * 8;
-// 2. Partial sum 512 digits : Medium accuracy, medium latency
+// 3. Partial sum 512 digits
#define KERNEL_8x16_ACC4() \
- v0_7 = vdupq_n_f16(0.F); \
- v8_15 = vdupq_n_f16(0.F); \
- v16_23 = vdupq_n_f16(0.F); \
- v24_31 = vdupq_n_f16(0.F); \
- v32_39 = vdupq_n_f16(0.F); \
- v40_47 = vdupq_n_f16(0.F); \
- v48_55 = vdupq_n_f16(0.F); \
- v56_63 = vdupq_n_f16(0.F); \
- v64_71 = vdupq_n_f16(0.F); \
- v72_79 = vdupq_n_f16(0.F); \
- v80_87 = vdupq_n_f16(0.F); \
- v88_95 = vdupq_n_f16(0.F); \
- v96_103 = vdupq_n_f16(0.F); \
- v104_111 = vdupq_n_f16(0.F); \
- v112_119 = vdupq_n_f16(0.F); \
- v120_127 = vdupq_n_f16(0.F); \
va0 = vld1q_f16(a); \
v24 = vld1q_f16(b); \
v25 = vld1q_f16(b + 8); \
@@ -293,24 +588,8 @@
b += 16 * 4; \
a += 8 * 4;
-// 3. Partial sum 128 digits : Best accuracy, worst latency
+// 4. Partial sum 128 digits
#define KERNEL_8x16_ACC1() \
- v0_7 = vdupq_n_f16(0.F); \
- v8_15 = vdupq_n_f16(0.F); \
- v16_23 = vdupq_n_f16(0.F); \
- v24_31 = vdupq_n_f16(0.F); \
- v32_39 = vdupq_n_f16(0.F); \
- v40_47 = vdupq_n_f16(0.F); \
- v48_55 = vdupq_n_f16(0.F); \
- v56_63 = vdupq_n_f16(0.F); \
- v64_71 = vdupq_n_f16(0.F); \
- v72_79 = vdupq_n_f16(0.F); \
- v80_87 = vdupq_n_f16(0.F); \
- v88_95 = vdupq_n_f16(0.F); \
- v96_103 = vdupq_n_f16(0.F); \
- v104_111 = vdupq_n_f16(0.F); \
- v112_119 = vdupq_n_f16(0.F); \
- v120_127 = vdupq_n_f16(0.F); \
va0 = vld1q_f16(a); \
v24 = vld1q_f16(b); \
v25 = vld1q_f16(b + 8); \
@@ -336,6 +615,91 @@
b += 16 * 1; \
a += 8 * 1;
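+// Widen the 8x16 f16 accumulator tile to f32 (low / high half of each register) and
+// accumulate it into eight rows of C, 16 columns per row.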
+#define SAVE_KERNEL_8X16_F16_F32() \
+ vst1q_f32(c, vaddq_f32(vld1q_f32(c), vcvt_f32_f16(vget_low_f16(v0_7)))); \
+ vst1q_f32(c + 4, \
+ vaddq_f32(vld1q_f32(c + 4), vcvt_f32_f16(vget_high_f16(v0_7)))); \
+ \
+ vst1q_f32(c + 8, \
+ vaddq_f32(vld1q_f32(c + 8), vcvt_f32_f16(vget_low_f16(v64_71)))); \
+ vst1q_f32(c + 8 + 4, vaddq_f32(vld1q_f32(c + 8 + 4), \
+ vcvt_f32_f16(vget_high_f16(v64_71)))); \
+ \
+ vst1q_f32(c + ldc, \
+ vaddq_f32(vld1q_f32(c + ldc), vcvt_f32_f16(vget_low_f16(v8_15)))); \
+ vst1q_f32(c + ldc + 4, vaddq_f32(vld1q_f32(c + ldc + 4), \
+ vcvt_f32_f16(vget_high_f16(v8_15)))); \
+ \
+ vst1q_f32(c + ldc + 8, vaddq_f32(vld1q_f32(c + ldc + 8), \
+ vcvt_f32_f16(vget_low_f16(v72_79)))); \
+ vst1q_f32(c + ldc + 8 + 4, vaddq_f32(vld1q_f32(c + ldc + 8 + 4), \
+ vcvt_f32_f16(vget_high_f16(v72_79)))); \
+ \
+ vst1q_f32(c + 2 * ldc, vaddq_f32(vld1q_f32(c + 2 * ldc), \
+ vcvt_f32_f16(vget_low_f16(v16_23)))); \
+ vst1q_f32(c + 2 * ldc + 4, vaddq_f32(vld1q_f32(c + 2 * ldc + 4), \
+ vcvt_f32_f16(vget_high_f16(v16_23)))); \
+ \
+ vst1q_f32(c + 2 * ldc + 8, vaddq_f32(vld1q_f32(c + 2 * ldc + 8), \
+ vcvt_f32_f16(vget_low_f16(v80_87)))); \
+ vst1q_f32(c + 2 * ldc + 8 + 4, \
+ vaddq_f32(vld1q_f32(c + 2 * ldc + 8 + 4), \
+ vcvt_f32_f16(vget_high_f16(v80_87)))); \
+ \
+ vst1q_f32(c + 3 * ldc, vaddq_f32(vld1q_f32(c + 3 * ldc), \
+ vcvt_f32_f16(vget_low_f16(v24_31)))); \
+ vst1q_f32(c + 3 * ldc + 4, vaddq_f32(vld1q_f32(c + 3 * ldc + 4), \
+ vcvt_f32_f16(vget_high_f16(v24_31)))); \
+ \
+ vst1q_f32(c + 3 * ldc + 8, vaddq_f32(vld1q_f32(c + 3 * ldc + 8), \
+ vcvt_f32_f16(vget_low_f16(v88_95)))); \
+ vst1q_f32(c + 3 * ldc + 8 + 4, \
+ vaddq_f32(vld1q_f32(c + 3 * ldc + 8 + 4), \
+ vcvt_f32_f16(vget_high_f16(v88_95)))); \
+ \
+ vst1q_f32(c + 4 * ldc, vaddq_f32(vld1q_f32(c + 4 * ldc), \
+ vcvt_f32_f16(vget_low_f16(v32_39)))); \
+ vst1q_f32(c + 4 * ldc + 4, vaddq_f32(vld1q_f32(c + 4 * ldc + 4), \
+ vcvt_f32_f16(vget_high_f16(v32_39)))); \
+ \
+ vst1q_f32(c + 4 * ldc + 8, vaddq_f32(vld1q_f32(c + 4 * ldc + 8), \
+ vcvt_f32_f16(vget_low_f16(v96_103)))); \
+ vst1q_f32(c + 4 * ldc + 8 + 4, \
+ vaddq_f32(vld1q_f32(c + 4 * ldc + 8 + 4), \
+ vcvt_f32_f16(vget_high_f16(v96_103)))); \
+ \
+ vst1q_f32(c + 5 * ldc, vaddq_f32(vld1q_f32(c + 5 * ldc), \
+ vcvt_f32_f16(vget_low_f16(v40_47)))); \
+ vst1q_f32(c + 5 * ldc + 4, vaddq_f32(vld1q_f32(c + 5 * ldc + 4), \
+ vcvt_f32_f16(vget_high_f16(v40_47)))); \
+ vst1q_f32(c + 5 * ldc + 8, vaddq_f32(vld1q_f32(c + 5 * ldc + 8), \
+ vcvt_f32_f16(vget_low_f16(v104_111)))); \
+ vst1q_f32(c + 5 * ldc + 8 + 4, \
+ vaddq_f32(vld1q_f32(c + 5 * ldc + 8 + 4), \
+ vcvt_f32_f16(vget_high_f16(v104_111)))); \
+ \
+ vst1q_f32(c + 6 * ldc, vaddq_f32(vld1q_f32(c + 6 * ldc), \
+ vcvt_f32_f16(vget_low_f16(v48_55)))); \
+ vst1q_f32(c + 6 * ldc + 4, vaddq_f32(vld1q_f32(c + 6 * ldc + 4), \
+ vcvt_f32_f16(vget_high_f16(v48_55)))); \
+ \
+ vst1q_f32(c + 6 * ldc + 8, vaddq_f32(vld1q_f32(c + 6 * ldc + 8), \
+ vcvt_f32_f16(vget_low_f16(v112_119)))); \
+ vst1q_f32(c + 6 * ldc + 8 + 4, \
+ vaddq_f32(vld1q_f32(c + 6 * ldc + 8 + 4), \
+ vcvt_f32_f16(vget_high_f16(v112_119)))); \
+ \
+ vst1q_f32(c + 7 * ldc, vaddq_f32(vld1q_f32(c + 7 * ldc), \
+ vcvt_f32_f16(vget_low_f16(v56_63)))); \
+ vst1q_f32(c + 7 * ldc + 4, vaddq_f32(vld1q_f32(c + 7 * ldc + 4), \
+ vcvt_f32_f16(vget_high_f16(v56_63)))); \
+ \
+ vst1q_f32(c + 7 * ldc + 8, vaddq_f32(vld1q_f32(c + 7 * ldc + 8), \
+ vcvt_f32_f16(vget_low_f16(v120_127)))); \
+ vst1q_f32(c + 7 * ldc + 8 + 4, \
+ vaddq_f32(vld1q_f32(c + 7 * ldc + 8 + 4), \
+ vcvt_f32_f16(vget_high_f16(v120_127))));
+
/**
* @brief hgemm 8x16 kernel sc = sa * sb
*
@@ -370,32 +734,32 @@ void hgemm_kernel_8x16(unsigned int M, unsigned int N, unsigned int K,
float16x8_t v24, v25, v26, v27, v28, v29, v30, v31;
float16x8_t va0, va1, va2, va3;
+
+ INIT_KERNEL_8X16();
l = 0;
for (; l < K;) {
- KERNEL_8x16_ACC4();
- vst1q_f16(c, vaddq_f16(vld1q_f16(c), v0_7));
- vst1q_f16(c + 8, vaddq_f16(vld1q_f16(c + 8), v64_71));
- vst1q_f16(c + ldc, vaddq_f16(vld1q_f16(c + ldc), v8_15));
- vst1q_f16(c + ldc + 8, vaddq_f16(vld1q_f16(c + ldc + 8), v72_79));
- vst1q_f16(c + 2 * ldc, vaddq_f16(vld1q_f16(c + 2 * ldc), v16_23));
- vst1q_f16(c + 2 * ldc + 8,
- vaddq_f16(vld1q_f16(c + 2 * ldc + 8), v80_87));
- vst1q_f16(c + 3 * ldc, vaddq_f16(vld1q_f16(c + 3 * ldc), v24_31));
- vst1q_f16(c + 3 * ldc + 8,
- vaddq_f16(vld1q_f16(c + 3 * ldc + 8), v88_95));
- vst1q_f16(c + 4 * ldc, vaddq_f16(vld1q_f16(c + 4 * ldc), v32_39));
- vst1q_f16(c + 4 * ldc + 8,
- vaddq_f16(vld1q_f16(c + 4 * ldc + 8), v96_103));
- vst1q_f16(c + 5 * ldc, vaddq_f16(vld1q_f16(c + 5 * ldc), v40_47));
- vst1q_f16(c + 5 * ldc + 8,
- vaddq_f16(vld1q_f16(c + 5 * ldc + 8), v104_111));
- vst1q_f16(c + 6 * ldc, vaddq_f16(vld1q_f16(c + 6 * ldc), v48_55));
- vst1q_f16(c + 6 * ldc + 8,
- vaddq_f16(vld1q_f16(c + 6 * ldc + 8), v112_119));
- vst1q_f16(c + 7 * ldc, vaddq_f16(vld1q_f16(c + 7 * ldc), v56_63));
- vst1q_f16(c + 7 * ldc + 8,
- vaddq_f16(vld1q_f16(c + 7 * ldc + 8), v120_127));
+ KERNEL_8x16_ACC1();
}
+ vst1q_f16(c, vaddq_f16(vld1q_f16(c), v0_7));
+ vst1q_f16(c + 8, vaddq_f16(vld1q_f16(c + 8), v64_71));
+ vst1q_f16(c + ldc, vaddq_f16(vld1q_f16(c + ldc), v8_15));
+ vst1q_f16(c + ldc + 8, vaddq_f16(vld1q_f16(c + ldc + 8), v72_79));
+ vst1q_f16(c + 2 * ldc, vaddq_f16(vld1q_f16(c + 2 * ldc), v16_23));
+ vst1q_f16(c + 2 * ldc + 8, vaddq_f16(vld1q_f16(c + 2 * ldc + 8), v80_87));
+ vst1q_f16(c + 3 * ldc, vaddq_f16(vld1q_f16(c + 3 * ldc), v24_31));
+ vst1q_f16(c + 3 * ldc + 8, vaddq_f16(vld1q_f16(c + 3 * ldc + 8), v88_95));
+ vst1q_f16(c + 4 * ldc, vaddq_f16(vld1q_f16(c + 4 * ldc), v32_39));
+ vst1q_f16(c + 4 * ldc + 8,
+ vaddq_f16(vld1q_f16(c + 4 * ldc + 8), v96_103));
+ vst1q_f16(c + 5 * ldc, vaddq_f16(vld1q_f16(c + 5 * ldc), v40_47));
+ vst1q_f16(c + 5 * ldc + 8,
+ vaddq_f16(vld1q_f16(c + 5 * ldc + 8), v104_111));
+ vst1q_f16(c + 6 * ldc, vaddq_f16(vld1q_f16(c + 6 * ldc), v48_55));
+ vst1q_f16(c + 6 * ldc + 8,
+ vaddq_f16(vld1q_f16(c + 6 * ldc + 8), v112_119));
+ vst1q_f16(c + 7 * ldc, vaddq_f16(vld1q_f16(c + 7 * ldc), v56_63));
+ vst1q_f16(c + 7 * ldc + 8,
+ vaddq_f16(vld1q_f16(c + 7 * ldc + 8), v120_127));
c += 16;
a -= 8 * K;
}
@@ -425,6 +789,9 @@ void hgemm_kernel_8x16(unsigned int M, unsigned int N, unsigned int K,
__fp16 *a = sa, *b = sb;
float *c = sc;
unsigned int i, j, l;
+ unsigned int K4 = (K >> 2) << 2;
+ unsigned int K8 = (K >> 3) << 3;
+ unsigned int K16 = (K >> 4) << 4;
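+ // Same chunked accumulate-and-flush scheme as the 4x4 / 4x8 kernels: chunks of
+ // 16, 8, 4, then 1 along K, each flushed into the f32 output.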
for (i = 0; i < M; i += 8) {
for (j = 0; j < N; j += 16) {
__builtin_prefetch(b, 0, 3);
@@ -440,106 +807,25 @@ void hgemm_kernel_8x16(unsigned int M, unsigned int N, unsigned int K,
float16x8_t v24, v25, v26, v27, v28, v29, v30, v31;
float16x8_t va0, va1, va2, va3, va4, va5, va6, va7;
l = 0;
- for (; l < K;) {
+ for (; l < K16;) {
+ INIT_KERNEL_8X16();
+ KERNEL_8x16_ACC16();
+ SAVE_KERNEL_8X16_F16_F32();
+ }
+ for (; l < K8;) {
+ INIT_KERNEL_8X16();
KERNEL_8x16_ACC8();
-
- vst1q_f32(c, vaddq_f32(vld1q_f32(c), vcvt_f32_f16(vget_low_f16(v0_7))));
- vst1q_f32(c + 4, vaddq_f32(vld1q_f32(c + 4),
- vcvt_f32_f16(vget_high_f16(v0_7))));
-
- vst1q_f32(c + 8, vaddq_f32(vld1q_f32(c + 8),
- vcvt_f32_f16(vget_low_f16(v64_71))));
- vst1q_f32(c + 8 + 4, vaddq_f32(vld1q_f32(c + 8 + 4),
- vcvt_f32_f16(vget_high_f16(v64_71))));
-
- vst1q_f32(c + ldc, vaddq_f32(vld1q_f32(c + ldc),
- vcvt_f32_f16(vget_low_f16(v8_15))));
- vst1q_f32(c + ldc + 4, vaddq_f32(vld1q_f32(c + ldc + 4),
- vcvt_f32_f16(vget_high_f16(v8_15))));
-
- vst1q_f32(c + ldc + 8, vaddq_f32(vld1q_f32(c + ldc + 8),
- vcvt_f32_f16(vget_low_f16(v72_79))));
- vst1q_f32(c + ldc + 8 + 4,
- vaddq_f32(vld1q_f32(c + ldc + 8 + 4),
- vcvt_f32_f16(vget_high_f16(v72_79))));
-
- vst1q_f32(c + 2 * ldc, vaddq_f32(vld1q_f32(c + 2 * ldc),
- vcvt_f32_f16(vget_low_f16(v16_23))));
- vst1q_f32(c + 2 * ldc + 4,
- vaddq_f32(vld1q_f32(c + 2 * ldc + 4),
- vcvt_f32_f16(vget_high_f16(v16_23))));
-
- vst1q_f32(c + 2 * ldc + 8,
- vaddq_f32(vld1q_f32(c + 2 * ldc + 8),
- vcvt_f32_f16(vget_low_f16(v80_87))));
- vst1q_f32(c + 2 * ldc + 8 + 4,
- vaddq_f32(vld1q_f32(c + 2 * ldc + 8 + 4),
- vcvt_f32_f16(vget_high_f16(v80_87))));
-
- vst1q_f32(c + 3 * ldc, vaddq_f32(vld1q_f32(c + 3 * ldc),
- vcvt_f32_f16(vget_low_f16(v24_31))));
- vst1q_f32(c + 3 * ldc + 4,
- vaddq_f32(vld1q_f32(c + 3 * ldc + 4),
- vcvt_f32_f16(vget_high_f16(v24_31))));
-
- vst1q_f32(c + 3 * ldc + 8,
- vaddq_f32(vld1q_f32(c + 3 * ldc + 8),
- vcvt_f32_f16(vget_low_f16(v88_95))));
- vst1q_f32(c + 3 * ldc + 8 + 4,
- vaddq_f32(vld1q_f32(c + 3 * ldc + 8 + 4),
- vcvt_f32_f16(vget_high_f16(v88_95))));
-
- vst1q_f32(c + 4 * ldc, vaddq_f32(vld1q_f32(c + 4 * ldc),
- vcvt_f32_f16(vget_low_f16(v32_39))));
- vst1q_f32(c + 4 * ldc + 4,
- vaddq_f32(vld1q_f32(c + 4 * ldc + 4),
- vcvt_f32_f16(vget_high_f16(v32_39))));
-
- vst1q_f32(c + 4 * ldc + 8,
- vaddq_f32(vld1q_f32(c + 4 * ldc + 8),
- vcvt_f32_f16(vget_low_f16(v96_103))));
- vst1q_f32(c + 4 * ldc + 8 + 4,
- vaddq_f32(vld1q_f32(c + 4 * ldc + 8 + 4),
- vcvt_f32_f16(vget_high_f16(v96_103))));
-
- vst1q_f32(c + 5 * ldc, vaddq_f32(vld1q_f32(c + 5 * ldc),
- vcvt_f32_f16(vget_low_f16(v40_47))));
- vst1q_f32(c + 5 * ldc + 4,
- vaddq_f32(vld1q_f32(c + 5 * ldc + 4),
- vcvt_f32_f16(vget_high_f16(v40_47))));
-
- vst1q_f32(c + 5 * ldc + 8,
- vaddq_f32(vld1q_f32(c + 5 * ldc + 8),
- vcvt_f32_f16(vget_low_f16(v104_111))));
- vst1q_f32(c + 5 * ldc + 8 + 4,
- vaddq_f32(vld1q_f32(c + 5 * ldc + 8 + 4),
- vcvt_f32_f16(vget_high_f16(v104_111))));
-
- vst1q_f32(c + 6 * ldc, vaddq_f32(vld1q_f32(c + 6 * ldc),
- vcvt_f32_f16(vget_low_f16(v48_55))));
- vst1q_f32(c + 6 * ldc + 4,
- vaddq_f32(vld1q_f32(c + 6 * ldc + 4),
- vcvt_f32_f16(vget_high_f16(v48_55))));
-
- vst1q_f32(c + 6 * ldc + 8,
- vaddq_f32(vld1q_f32(c + 6 * ldc + 8),
- vcvt_f32_f16(vget_low_f16(v112_119))));
- vst1q_f32(c + 6 * ldc + 8 + 4,
- vaddq_f32(vld1q_f32(c + 6 * ldc + 8 + 4),
- vcvt_f32_f16(vget_high_f16(v112_119))));
-
- vst1q_f32(c + 7 * ldc, vaddq_f32(vld1q_f32(c + 7 * ldc),
- vcvt_f32_f16(vget_low_f16(v56_63))));
- vst1q_f32(c + 7 * ldc + 4,
- vaddq_f32(vld1q_f32(c + 7 * ldc + 4),
- vcvt_f32_f16(vget_high_f16(v56_63))));
-
- vst1q_f32(c + 7 * ldc + 8,
- vaddq_f32(vld1q_f32(c + 7 * ldc + 8),
- vcvt_f32_f16(vget_low_f16(v120_127))));
- vst1q_f32(c + 7 * ldc + 8 + 4,
- vaddq_f32(vld1q_f32(c + 7 * ldc + 8 + 4),
- vcvt_f32_f16(vget_high_f16(v120_127))));
+ SAVE_KERNEL_8X16_F16_F32();
+ }
+ for (; l < K4;) {
+ INIT_KERNEL_8X16();
+ KERNEL_8x16_ACC4();
+ SAVE_KERNEL_8X16_F16_F32();
+ }
+ for (; l < K;) {
+ INIT_KERNEL_8X16();
+ KERNEL_8x16_ACC1();
+ SAVE_KERNEL_8X16_F16_F32();
}
c += 16;
a -= 8 * K;
diff --git a/nntrainer/tensor/hgemm/hgemm_kernel_8x8.h b/nntrainer/tensor/hgemm/hgemm_kernel_8x8.h
index e67ef462b4..4901c3f518 100644
--- a/nntrainer/tensor/hgemm/hgemm_kernel_8x8.h
+++ b/nntrainer/tensor/hgemm/hgemm_kernel_8x8.h
@@ -14,19 +14,186 @@
#include
#include
-/// @note Following KERNELs are the combinations of accuracy-latency
-/// tradeoff. User can select which kernel to use by replacing them.
+#define INIT_KERNEL_8x8() \
+ v24 = vdupq_n_f16(0.F); \
+ v25 = vdupq_n_f16(0.F); \
+ v26 = vdupq_n_f16(0.F); \
+ v27 = vdupq_n_f16(0.F); \
+ v28 = vdupq_n_f16(0.F); \
+ v29 = vdupq_n_f16(0.F); \
+ v30 = vdupq_n_f16(0.F); \
+ v31 = vdupq_n_f16(0.F);
-// 1. Partial sum 512 digits : Worst accuracy, best latency
+// 1. Partial sum 1024 digits
+#define KERNEL_8x8_ACC16() \
+ va0 = vld1q_f16(a); \
+ v16 = vld1q_f16(b); \
+ v24 = vfmaq_laneq_f16(v24, v16, va0, 0); \
+ v25 = vfmaq_laneq_f16(v25, v16, va0, 1); \
+ v26 = vfmaq_laneq_f16(v26, v16, va0, 2); \
+ v27 = vfmaq_laneq_f16(v27, v16, va0, 3); \
+ v28 = vfmaq_laneq_f16(v28, v16, va0, 4); \
+ v29 = vfmaq_laneq_f16(v29, v16, va0, 5); \
+ v30 = vfmaq_laneq_f16(v30, v16, va0, 6); \
+ v31 = vfmaq_laneq_f16(v31, v16, va0, 7); \
+ va1 = vld1q_f16(a + 8); \
+ v17 = vld1q_f16(b + 8); \
+ v24 = vfmaq_laneq_f16(v24, v17, va1, 0); \
+ v25 = vfmaq_laneq_f16(v25, v17, va1, 1); \
+ v26 = vfmaq_laneq_f16(v26, v17, va1, 2); \
+ v27 = vfmaq_laneq_f16(v27, v17, va1, 3); \
+ v28 = vfmaq_laneq_f16(v28, v17, va1, 4); \
+ v29 = vfmaq_laneq_f16(v29, v17, va1, 5); \
+ v30 = vfmaq_laneq_f16(v30, v17, va1, 6); \
+ v31 = vfmaq_laneq_f16(v31, v17, va1, 7); \
+ va2 = vld1q_f16(a + 8 * 2); \
+ v18 = vld1q_f16(b + 8 * 2); \
+ v24 = vfmaq_laneq_f16(v24, v18, va2, 0); \
+ v25 = vfmaq_laneq_f16(v25, v18, va2, 1); \
+ v26 = vfmaq_laneq_f16(v26, v18, va2, 2); \
+ v27 = vfmaq_laneq_f16(v27, v18, va2, 3); \
+ v28 = vfmaq_laneq_f16(v28, v18, va2, 4); \
+ v29 = vfmaq_laneq_f16(v29, v18, va2, 5); \
+ v30 = vfmaq_laneq_f16(v30, v18, va2, 6); \
+ v31 = vfmaq_laneq_f16(v31, v18, va2, 7); \
+ va3 = vld1q_f16(a + 8 * 3); \
+ v19 = vld1q_f16(b + 8 * 3); \
+ v24 = vfmaq_laneq_f16(v24, v19, va3, 0); \
+ v25 = vfmaq_laneq_f16(v25, v19, va3, 1); \
+ v26 = vfmaq_laneq_f16(v26, v19, va3, 2); \
+ v27 = vfmaq_laneq_f16(v27, v19, va3, 3); \
+ v28 = vfmaq_laneq_f16(v28, v19, va3, 4); \
+ v29 = vfmaq_laneq_f16(v29, v19, va3, 5); \
+ v30 = vfmaq_laneq_f16(v30, v19, va3, 6); \
+ v31 = vfmaq_laneq_f16(v31, v19, va3, 7); \
+ va4 = vld1q_f16(a + 8 * 4); \
+ v20 = vld1q_f16(b + 8 * 4); \
+ v24 = vfmaq_laneq_f16(v24, v20, va4, 0); \
+ v25 = vfmaq_laneq_f16(v25, v20, va4, 1); \
+ v26 = vfmaq_laneq_f16(v26, v20, va4, 2); \
+ v27 = vfmaq_laneq_f16(v27, v20, va4, 3); \
+ v28 = vfmaq_laneq_f16(v28, v20, va4, 4); \
+ v29 = vfmaq_laneq_f16(v29, v20, va4, 5); \
+ v30 = vfmaq_laneq_f16(v30, v20, va4, 6); \
+ v31 = vfmaq_laneq_f16(v31, v20, va4, 7); \
+ va5 = vld1q_f16(a + 8 * 5); \
+ v21 = vld1q_f16(b + 8 * 5); \
+ v24 = vfmaq_laneq_f16(v24, v21, va5, 0); \
+ v25 = vfmaq_laneq_f16(v25, v21, va5, 1); \
+ v26 = vfmaq_laneq_f16(v26, v21, va5, 2); \
+ v27 = vfmaq_laneq_f16(v27, v21, va5, 3); \
+ v28 = vfmaq_laneq_f16(v28, v21, va5, 4); \
+ v29 = vfmaq_laneq_f16(v29, v21, va5, 5); \
+ v30 = vfmaq_laneq_f16(v30, v21, va5, 6); \
+ v31 = vfmaq_laneq_f16(v31, v21, va5, 7); \
+ va6 = vld1q_f16(a + 8 * 6); \
+ v22 = vld1q_f16(b + 8 * 6); \
+ v24 = vfmaq_laneq_f16(v24, v22, va6, 0); \
+ v25 = vfmaq_laneq_f16(v25, v22, va6, 1); \
+ v26 = vfmaq_laneq_f16(v26, v22, va6, 2); \
+ v27 = vfmaq_laneq_f16(v27, v22, va6, 3); \
+ v28 = vfmaq_laneq_f16(v28, v22, va6, 4); \
+ v29 = vfmaq_laneq_f16(v29, v22, va6, 5); \
+ v30 = vfmaq_laneq_f16(v30, v22, va6, 6); \
+ v31 = vfmaq_laneq_f16(v31, v22, va6, 7); \
+ va7 = vld1q_f16(a + 8 * 7); \
+ v23 = vld1q_f16(b + 8 * 7); \
+ v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \
+ v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \
+ v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \
+ v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \
+ v28 = vfmaq_laneq_f16(v28, v23, va7, 4); \
+ v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \
+ v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \
+ v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \
+ va7 = vld1q_f16(a + 8 * 8); \
+ v23 = vld1q_f16(b + 8 * 8); \
+ v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \
+ v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \
+ v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \
+ v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \
+ v28 = vfmaq_laneq_f16(v28, v23, va7, 4); \
+ v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \
+ v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \
+ v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \
+ va7 = vld1q_f16(a + 8 * 9); \
+ v23 = vld1q_f16(b + 8 * 9); \
+ v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \
+ v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \
+ v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \
+ v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \
+ v28 = vfmaq_laneq_f16(v28, v23, va7, 4); \
+ v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \
+ v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \
+ v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \
+ va7 = vld1q_f16(a + 8 * 10); \
+ v23 = vld1q_f16(b + 8 * 10); \
+ v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \
+ v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \
+ v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \
+ v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \
+ v28 = vfmaq_laneq_f16(v28, v23, va7, 4); \
+ v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \
+ v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \
+ v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \
+ va7 = vld1q_f16(a + 8 * 11); \
+ v23 = vld1q_f16(b + 8 * 11); \
+ v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \
+ v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \
+ v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \
+ v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \
+ v28 = vfmaq_laneq_f16(v28, v23, va7, 4); \
+ v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \
+ v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \
+ v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \
+ va7 = vld1q_f16(a + 8 * 12); \
+ v23 = vld1q_f16(b + 8 * 12); \
+ v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \
+ v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \
+ v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \
+ v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \
+ v28 = vfmaq_laneq_f16(v28, v23, va7, 4); \
+ v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \
+ v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \
+ v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \
+ va7 = vld1q_f16(a + 8 * 13); \
+ v23 = vld1q_f16(b + 8 * 13); \
+ v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \
+ v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \
+ v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \
+ v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \
+ v28 = vfmaq_laneq_f16(v28, v23, va7, 4); \
+ v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \
+ v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \
+ v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \
+ va7 = vld1q_f16(a + 8 * 14); \
+ v23 = vld1q_f16(b + 8 * 14); \
+ v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \
+ v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \
+ v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \
+ v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \
+ v28 = vfmaq_laneq_f16(v28, v23, va7, 4); \
+ v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \
+ v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \
+ v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \
+ va7 = vld1q_f16(a + 8 * 15); \
+ v23 = vld1q_f16(b + 8 * 15); \
+ v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \
+ v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \
+ v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \
+ v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \
+ v28 = vfmaq_laneq_f16(v28, v23, va7, 4); \
+ v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \
+ v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \
+ v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \
+ __builtin_prefetch(b + 128, 0, 3); \
+ __builtin_prefetch(a + 128, 0, 3); \
+ l += 16; \
+ b += 8 * 16; \
+ a += 8 * 16;
+
+// 2. Partial sum 512 digits
#define KERNEL_8x8_ACC8() \
- v24 = vdupq_n_f16(0.F); \
- v25 = vdupq_n_f16(0.F); \
- v26 = vdupq_n_f16(0.F); \
- v27 = vdupq_n_f16(0.F); \
- v28 = vdupq_n_f16(0.F); \
- v29 = vdupq_n_f16(0.F); \
- v30 = vdupq_n_f16(0.F); \
- v31 = vdupq_n_f16(0.F); \
va0 = vld1q_f16(a); \
v16 = vld1q_f16(b); \
v24 = vfmaq_laneq_f16(v24, v16, va0, 0); \
@@ -113,16 +280,8 @@
b += 8 * 8; \
a += 8 * 8;
-// 2. Partial sum 256 digits : Medium accuracy, medium latency
+// 3. Partial sum 256 digits
#define KERNEL_8x8_ACC4() \
- v24 = vdupq_n_f16(0.F); \
- v25 = vdupq_n_f16(0.F); \
- v26 = vdupq_n_f16(0.F); \
- v27 = vdupq_n_f16(0.F); \
- v28 = vdupq_n_f16(0.F); \
- v29 = vdupq_n_f16(0.F); \
- v30 = vdupq_n_f16(0.F); \
- v31 = vdupq_n_f16(0.F); \
va0 = vld1q_f16(a); \
v16 = vld1q_f16(b); \
v24 = vfmaq_laneq_f16(v24, v16, va0, 0); \
@@ -169,16 +328,8 @@
b += 8 * 4; \
a += 8 * 4;
-// 3. Partial sum 64 digits : Best accuracy, worst latency
+// 4. Partial sum 64 digits
#define KERNEL_8x8_ACC1() \
- v24 = vdupq_n_f16(0.F); \
- v25 = vdupq_n_f16(0.F); \
- v26 = vdupq_n_f16(0.F); \
- v27 = vdupq_n_f16(0.F); \
- v28 = vdupq_n_f16(0.F); \
- v29 = vdupq_n_f16(0.F); \
- v30 = vdupq_n_f16(0.F); \
- v31 = vdupq_n_f16(0.F); \
va0 = vld1q_f16(a); \
v16 = vld1q_f16(b); \
v24 = vfmaq_laneq_f16(v24, v16, va0, 0); \
@@ -195,6 +346,46 @@
b += 8 * 1; \
a += 8 * 1;
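+// Widen the 8x8 f16 accumulator tile to f32 (low / high half of each register) and
+// accumulate it into eight rows of C.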
+#define SAVE_KERNEL_8X8_F16_f32() \
+ vst1q_f32(c, vaddq_f32(vld1q_f32(c), vcvt_f32_f16(vget_low_f16(v24)))); \
+ vst1q_f32(c + 4, \
+ vaddq_f32(vld1q_f32(c + 4), vcvt_f32_f16(vget_high_f16(v24)))); \
+ \
+ vst1q_f32(c + ldc, \
+ vaddq_f32(vld1q_f32(c + ldc), vcvt_f32_f16(vget_low_f16(v25)))); \
+ vst1q_f32(c + 4 + ldc, vaddq_f32(vld1q_f32(c + 4 + ldc), \
+ vcvt_f32_f16(vget_high_f16(v25)))); \
+ \
+ vst1q_f32(c + 2 * ldc, vaddq_f32(vld1q_f32(c + 2 * ldc), \
+ vcvt_f32_f16(vget_low_f16(v26)))); \
+ vst1q_f32(c + 4 + 2 * ldc, vaddq_f32(vld1q_f32(c + 4 + 2 * ldc), \
+ vcvt_f32_f16(vget_high_f16(v26)))); \
+ \
+ vst1q_f32(c + 3 * ldc, vaddq_f32(vld1q_f32(c + 3 * ldc), \
+ vcvt_f32_f16(vget_low_f16(v27)))); \
+ vst1q_f32(c + 4 + 3 * ldc, vaddq_f32(vld1q_f32(c + 4 + 3 * ldc), \
+ vcvt_f32_f16(vget_high_f16(v27)))); \
+ \
+ vst1q_f32(c + 4 * ldc, vaddq_f32(vld1q_f32(c + 4 * ldc), \
+ vcvt_f32_f16(vget_low_f16(v28)))); \
+ vst1q_f32(c + 4 + 4 * ldc, vaddq_f32(vld1q_f32(c + 4 + 4 * ldc), \
+ vcvt_f32_f16(vget_high_f16(v28)))); \
+ \
+ vst1q_f32(c + 5 * ldc, vaddq_f32(vld1q_f32(c + 5 * ldc), \
+ vcvt_f32_f16(vget_low_f16(v29)))); \
+ vst1q_f32(c + 4 + 5 * ldc, vaddq_f32(vld1q_f32(c + 4 + 5 * ldc), \
+ vcvt_f32_f16(vget_high_f16(v29)))); \
+ \
+ vst1q_f32(c + 6 * ldc, vaddq_f32(vld1q_f32(c + 6 * ldc), \
+ vcvt_f32_f16(vget_low_f16(v30)))); \
+ vst1q_f32(c + 4 + 6 * ldc, vaddq_f32(vld1q_f32(c + 4 + 6 * ldc), \
+ vcvt_f32_f16(vget_high_f16(v30)))); \
+ \
+ vst1q_f32(c + 7 * ldc, vaddq_f32(vld1q_f32(c + 7 * ldc), \
+ vcvt_f32_f16(vget_low_f16(v31)))); \
+ vst1q_f32(c + 4 + 7 * ldc, vaddq_f32(vld1q_f32(c + 4 + 7 * ldc), \
+ vcvt_f32_f16(vget_high_f16(v31))));
+
/**
* @brief hgemm 8x8 kernel sc = sa * sb
*
@@ -221,19 +412,19 @@ void hgemm_kernel_8x8(unsigned int M, unsigned int N, unsigned int K,
float16x8_t v16, v17, v18, v19, v20, v21, v22, v23;
float16x8_t v24, v25, v26, v27, v28, v29, v30, v31;
float16x8_t va0, va1, va2, va3, va4, va5, va6, va7;
+ INIT_KERNEL_8x8();
l = 0;
for (; l < K;) {
- KERNEL_8x8_ACC8();
-
- vst1q_f16(c, vaddq_f16(vld1q_f16(c), v24));
- vst1q_f16(c + ldc, vaddq_f16(vld1q_f16(c + ldc), v25));
- vst1q_f16(c + 2 * ldc, vaddq_f16(vld1q_f16(c + 2 * ldc), v26));
- vst1q_f16(c + 3 * ldc, vaddq_f16(vld1q_f16(c + 3 * ldc), v27));
- vst1q_f16(c + 4 * ldc, vaddq_f16(vld1q_f16(c + 4 * ldc), v28));
- vst1q_f16(c + 5 * ldc, vaddq_f16(vld1q_f16(c + 5 * ldc), v29));
- vst1q_f16(c + 6 * ldc, vaddq_f16(vld1q_f16(c + 6 * ldc), v30));
- vst1q_f16(c + 7 * ldc, vaddq_f16(vld1q_f16(c + 7 * ldc), v31));
+ KERNEL_8x8_ACC1();
}
+ vst1q_f16(c, vaddq_f16(vld1q_f16(c), v24));
+ vst1q_f16(c + ldc, vaddq_f16(vld1q_f16(c + ldc), v25));
+ vst1q_f16(c + 2 * ldc, vaddq_f16(vld1q_f16(c + 2 * ldc), v26));
+ vst1q_f16(c + 3 * ldc, vaddq_f16(vld1q_f16(c + 3 * ldc), v27));
+ vst1q_f16(c + 4 * ldc, vaddq_f16(vld1q_f16(c + 4 * ldc), v28));
+ vst1q_f16(c + 5 * ldc, vaddq_f16(vld1q_f16(c + 5 * ldc), v29));
+ vst1q_f16(c + 6 * ldc, vaddq_f16(vld1q_f16(c + 6 * ldc), v30));
+ vst1q_f16(c + 7 * ldc, vaddq_f16(vld1q_f16(c + 7 * ldc), v31));
c += 8;
a -= 8 * K;
}
@@ -263,6 +454,9 @@ void hgemm_kernel_8x8(unsigned int M, unsigned int N, unsigned int K,
__fp16 *a = sa, *b = sb;
float *c = sc;
unsigned int i, j, l;
+ unsigned int K4 = (K >> 2) << 2;
+ unsigned int K8 = (K >> 3) << 3;
+ unsigned int K16 = (K >> 4) << 4;
for (i = 0; i < M; i += VL_FP16) {
for (j = 0; j < N; j += VL_FP16) {
__builtin_prefetch(b, 0, 3);
@@ -272,48 +466,25 @@ void hgemm_kernel_8x8(unsigned int M, unsigned int N, unsigned int K,
float16x8_t v24, v25, v26, v27, v28, v29, v30, v31;
float16x8_t va0, va1, va2, va3, va4, va5, va6, va7;
l = 0;
-
- for (; l < K;) {
+ for (; l < K16;) {
+ INIT_KERNEL_8x8();
+ KERNEL_8x8_ACC16();
+ SAVE_KERNEL_8X8_F16_f32();
+ }
+ for (; l < K8;) {
+ INIT_KERNEL_8x8();
KERNEL_8x8_ACC8();
-
- vst1q_f32(c, vaddq_f32(vld1q_f32(c), vcvt_f32_f16(vget_low_f16(v24))));
- vst1q_f32(
- c + 4, vaddq_f32(vld1q_f32(c + 4), vcvt_f32_f16(vget_high_f16(v24))));
-
- vst1q_f32(c + ldc, vaddq_f32(vld1q_f32(c + ldc),
- vcvt_f32_f16(vget_low_f16(v25))));
- vst1q_f32(c + 4 + ldc, vaddq_f32(vld1q_f32(c + 4 + ldc),
- vcvt_f32_f16(vget_high_f16(v25))));
-
- vst1q_f32(c + 2 * ldc, vaddq_f32(vld1q_f32(c + 2 * ldc),
- vcvt_f32_f16(vget_low_f16(v26))));
- vst1q_f32(c + 4 + 2 * ldc, vaddq_f32(vld1q_f32(c + 4 + 2 * ldc),
- vcvt_f32_f16(vget_high_f16(v26))));
-
- vst1q_f32(c + 3 * ldc, vaddq_f32(vld1q_f32(c + 3 * ldc),
- vcvt_f32_f16(vget_low_f16(v27))));
- vst1q_f32(c + 4 + 3 * ldc, vaddq_f32(vld1q_f32(c + 4 + 3 * ldc),
- vcvt_f32_f16(vget_high_f16(v27))));
-
- vst1q_f32(c + 4 * ldc, vaddq_f32(vld1q_f32(c + 4 * ldc),
- vcvt_f32_f16(vget_low_f16(v28))));
- vst1q_f32(c + 4 + 4 * ldc, vaddq_f32(vld1q_f32(c + 4 + 4 * ldc),
- vcvt_f32_f16(vget_high_f16(v28))));
-
- vst1q_f32(c + 5 * ldc, vaddq_f32(vld1q_f32(c + 5 * ldc),
- vcvt_f32_f16(vget_low_f16(v29))));
- vst1q_f32(c + 4 + 5 * ldc, vaddq_f32(vld1q_f32(c + 4 + 5 * ldc),
- vcvt_f32_f16(vget_high_f16(v29))));
-
- vst1q_f32(c + 6 * ldc, vaddq_f32(vld1q_f32(c + 6 * ldc),
- vcvt_f32_f16(vget_low_f16(v30))));
- vst1q_f32(c + 4 + 6 * ldc, vaddq_f32(vld1q_f32(c + 4 + 6 * ldc),
- vcvt_f32_f16(vget_high_f16(v30))));
-
- vst1q_f32(c + 7 * ldc, vaddq_f32(vld1q_f32(c + 7 * ldc),
- vcvt_f32_f16(vget_low_f16(v31))));
- vst1q_f32(c + 4 + 7 * ldc, vaddq_f32(vld1q_f32(c + 4 + 7 * ldc),
- vcvt_f32_f16(vget_high_f16(v31))));
+ SAVE_KERNEL_8X8_F16_f32();
+ }
+ for (; l < K4;) {
+ INIT_KERNEL_8x8();
+ KERNEL_8x8_ACC4();
+ SAVE_KERNEL_8X8_F16_f32();
+ }
+ for (; l < K;) {
+ INIT_KERNEL_8x8();
+ KERNEL_8x8_ACC1();
+ SAVE_KERNEL_8X8_F16_f32();
}
c += 8;
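
For reference, the restructuring above splits the K loop into the largest multiples of 16, 8, 4 and finally single steps, and flushes the fp16 accumulators into the fp32 output after every block, so the number of consecutive fp16 additions stays bounded. A minimal scalar sketch of the same control flow (an illustration only, reduced to one output element and assuming an ARM toolchain with __fp16 support; the real kernel keeps an 8x8 tile in NEON registers):

static void matmul_tail_blocked(unsigned int K, const __fp16 *a,
                                const __fp16 *b, float *c) {
  unsigned int K16 = (K >> 4) << 4; // largest multiple of 16 <= K
  unsigned int K8 = (K >> 3) << 3;
  unsigned int K4 = (K >> 2) << 2;
  unsigned int l = 0;
  auto run_block = [&](unsigned int step) {
    __fp16 acc = 0;            // INIT_KERNEL_8x8(): clear the fp16 accumulators
    for (unsigned int s = 0; s < step; ++s, ++l)
      acc = acc + a[l] * b[l]; // KERNEL_8x8_ACCx(): fp16 multiply-accumulate
    *c += (float)acc;          // SAVE_KERNEL_8X8_F16_f32(): widen, add into fp32 C
  };
  for (; l < K16;) run_block(16);
  for (; l < K8;) run_block(8);
  for (; l < K4;) run_block(4);
  for (; l < K;) run_block(1);
}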
diff --git a/nntrainer/tensor/manager.cpp b/nntrainer/tensor/manager.cpp
index 9a0d235ba9..4a2838d05e 100644
--- a/nntrainer/tensor/manager.cpp
+++ b/nntrainer/tensor/manager.cpp
@@ -407,14 +407,15 @@ std::vector Manager::requestWeights(
* order with the max exec order where it will be used for clipping and then
* applied to the weight.
*/
- if (Weight::isGradientClipByGlobalNorm(clip_by_global_norm)) {
+ if (Weight::isGradientClipByGlobalNorm(clip_by_global_norm) ||
+ isMixedPrecision()) {
grad_exec_order.push_back(TensorPool::PERSIST_END_ORDER);
// TODO: We need double check if it is OK not to add PERSIST_END_ORDER
// here or add other conditions
// var_exec_order.push_back(TensorPool::PERSIST_END_ORDER);
}
- Tensor *var = nullptr, *grad = nullptr;
+ Tensor *var = nullptr, *grad = nullptr, *var32 = nullptr;
bool is_dependent = !shared_names.empty();
if (is_dependent) {
/// shared_name is used and the original name is discarded
@@ -431,6 +432,17 @@ std::vector Manager::requestWeights(
grad = tensor_pool.requestOrExtend(shared_name + Var_Grad::grad_suffix,
dim_g, grad_exec_order, grad_ls,
Tensor::Initializer::ZEROS);
+
+ if (var->getDataType() != ml::train::TensorDim::DataType::FP32) {
+ TensorDim var32_dim(dim_v);
+ var32_dim.setDataType(ml::train::TensorDim::DataType::FP32);
+ std::vector<unsigned int> var32_exec_order;
+ var32_exec_order.push_back(TensorPool::PERSIST_END_ORDER);
+
+ var32 = weight_pool.requestOrExtend(shared_name + ":var32", var32_dim,
+ var32_exec_order, var_ls,
+ Tensor::Initializer::ZEROS);
+ }
}
} else {
/** case requesting fresh weights */
@@ -448,11 +460,21 @@ std::vector Manager::requestWeights(
grad = tensor_pool.request(name + Var_Grad::grad_suffix, dim_g,
grad_exec_order, grad_ls,
Tensor::Initializer::ZEROS, is_wgrad);
+ if (var->getDataType() != ml::train::TensorDim::DataType::FP32) {
+ TensorDim var32_dim(dim_v);
+ var32_dim.setDataType(ml::train::TensorDim::DataType::FP32);
+ std::vector<unsigned int> var32_exec_order;
+ var32_exec_order.push_back(TensorPool::PERSIST_END_ORDER);
+ var32 =
+ weight_pool.request(name + ":var32", var32_dim, var32_exec_order,
+ var_ls, Tensor::Initializer::ZEROS);
+ }
}
}
weights_v2.emplace_back(std::make_unique<Weight>(
- var, grad, w_reg, w_reg_const, decay, is_dependent, clip_by_global_norm));
+ var, grad, var32, w_reg, w_reg_const, decay, is_dependent,
+ clip_by_global_norm, axis, loss_scale));
}
std::transform(weights_v2.begin() + current_size, weights_v2.end(),
@@ -668,15 +690,15 @@ bool Manager::isSecondLastAccess(const std::string &name,
*/
std::vector<Tensor *> Manager::requestWeightOptimizerVariables(
const std::vector<TensorDim> &dims, const std::string &name,
- const TensorLifespan &lifespan, bool is_grad_clip,
- Tensor::Initializer initializer) {
+ const std::string &suffix, const TensorLifespan &lifespan, bool is_grad_clip,
+ bool is_mixed_precision, Tensor::Initializer initializer) {
std::vector<Tensor *> ret;
ret.reserve(dims.size());
std::vector<unsigned int> exec;
exec.reserve(1);
- if (is_grad_clip) {
+ if (is_grad_clip || is_mixed_precision) {
exec.emplace_back(TensorPool::PERSIST_END_ORDER);
} else {
exec.emplace_back(getMinMaxTensorExecutionOrder(name, true).second);
@@ -685,7 +707,7 @@ std::vector Manager::requestWeightOptimizerVariables(
/// @note this is assuming weight optimizer variables is treated as weight, if
/// not, there is room to optimize below behavior
for (unsigned int idx = 0; idx < dims.size(); idx++)
- ret.push_back(weight_pool.request(name + ":opt" + std::to_string(idx),
+ ret.push_back(weight_pool.request(name + suffix + std::to_string(idx),
dims[idx], exec, lifespan, initializer));
return ret;
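
A hypothetical call-site sketch (the caller names and include paths are assumptions, not the call site changed by this patch) showing how the extended signature is meant to be used: optimizer variables are requested under an explicit suffix and are forced to persist until PERSIST_END_ORDER when either gradient clipping or mixed precision is active.

#include <manager.h> // include style as used inside nntrainer sources (assumed)
#include <weight.h>
#include <vector>

std::vector<nntrainer::Tensor *>
requestAdamMoments(nntrainer::Manager &manager, nntrainer::Weight &w,
                   const std::vector<ml::train::TensorDim> &dims) {
  // ":opt" reproduces the suffix previously hard-coded in the pool request.
  return manager.requestWeightOptimizerVariables(
    dims, w.getName(), ":opt", nntrainer::TensorLifespan::MAX_LIFESPAN,
    w.isGradientClipByGlobalNorm(), w.isMixedPrecision(),
    nntrainer::Tensor::Initializer::ZEROS);
}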
diff --git a/nntrainer/tensor/manager.h b/nntrainer/tensor/manager.h
index ab1c018153..d561770206 100644
--- a/nntrainer/tensor/manager.h
+++ b/nntrainer/tensor/manager.h
@@ -224,7 +224,8 @@ class Manager {
*/
std::vector<Tensor *> requestWeightOptimizerVariables(
const std::vector<TensorDim> &dims, const std::string &name,
- const TensorLifespan &lifespan, bool is_grad_clip,
+ const std::string &suffix, const TensorLifespan &lifespan,
+ bool is_grad_clip, bool is_mixed_type,
Tensor::Initializer initializer = Tensor::Initializer::NONE);
/**
@@ -494,6 +495,11 @@ class Manager {
exec_mode = mode;
};
+ /**
+ * @brief return if it is mixed precision
+ */
+ bool isMixedPrecision() { return !istrequal(tensor_dtype[0], "FP32"); }
+
private:
/** @todo: merge this list to one */
std::vector<std::unique_ptr<Weight>> weights_v2; /**< weights for the layers
diff --git a/nntrainer/tensor/meson.build b/nntrainer/tensor/meson.build
index 0884dbd3b4..b14fa0ee85 100644
--- a/nntrainer/tensor/meson.build
+++ b/nntrainer/tensor/meson.build
@@ -44,6 +44,12 @@ cl_headers = [
arch = host_machine.cpu_family()
+
+if get_option('enable-avx')
+ tensor_sources += 'blas_avx.cpp'
+ tensor_headers += 'blas_avx.h'
+endif
+
if get_option('enable-fp16')
if arch == 'arm'
error ('FP16/ARM code (blas_neon.cpp) uses armv8.2 instructions. armv7 is not supported.')
@@ -55,9 +61,6 @@ if get_option('enable-fp16')
nntrainer_inc += include_directories('hgemm')
nntrainer_inc_abs += meson.current_source_dir() / 'hgemm'
endif
- elif get_option('enable-avx')
- tensor_sources += 'blas_avx.cpp'
- tensor_headers += 'blas_avx.h'
endif
endif
diff --git a/nntrainer/tensor/tensor.cpp b/nntrainer/tensor/tensor.cpp
index 4f1e8e0721..827ba7e979 100644
--- a/nntrainer/tensor/tensor.cpp
+++ b/nntrainer/tensor/tensor.cpp
@@ -3065,6 +3065,18 @@ Tensor Tensor::clone() const {
return t;
}
+Tensor Tensor::clone(ml::train::TensorDim::DataType type) const {
+ if (getDataType() == type)
+ return clone();
+
+ TensorDim dim = getDim();
+ dim.setDataType(type);
+ Tensor t(dim, true);
+ t.copyData(*this);
+ t.name = name;
+ return t;
+}
+
void Tensor::reshape(const TensorDim &d) {
NNTR_THROW_IF(!contiguous, std::invalid_argument)
@@ -3808,6 +3820,18 @@ void Tensor::dequantize(Tensor &output, unsigned int axis) const {
return;
}
+bool Tensor::isValid() const {
+ if (getDataType() == Tdatatype::FP16) {
+#ifdef ENABLE_FP16
+ return is_valid(dim.getDataLen(), Tdatatype::FP16, getData<_FP16>());
+#else
+ throw std::invalid_argument("enable-fp16 is not set");
+#endif
+ } else {
+ return is_valid(dim.getDataLen(), Tdatatype::FP32, getData());
+ }
+}
+
// namespace nntrainer
} /* namespace nntrainer */
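
A usage sketch for the two helpers added above (the caller is assumed, not part of this patch): widen an FP16 gradient to FP32 before the update and use isValid() to detect NaN/Inf produced by loss scaling.

#include <tensor.h> // include style as used inside nntrainer sources (assumed)

bool applyIfFinite(nntrainer::Tensor &master_fp32,
                   const nntrainer::Tensor &grad_fp16, float lr) {
  // clone(FP32) copies the data into a new FP32 tensor; isValid() scans it for NaN/Inf.
  nntrainer::Tensor grad32 =
    grad_fp16.clone(ml::train::TensorDim::DataType::FP32);
  if (!grad32.isValid())
    return false; // overflow: caller should lower the loss scale and retry
  master_fp32.add_i(grad32, -lr);
  return true;
}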
diff --git a/nntrainer/tensor/tensor.h b/nntrainer/tensor/tensor.h
index 211334da40..ad3781526f 100644
--- a/nntrainer/tensor/tensor.h
+++ b/nntrainer/tensor/tensor.h
@@ -1680,6 +1680,13 @@ class Tensor {
*/
Tensor clone() const;
+ /**
+ * @brief Convenient wrapper to clone @a this with the given data type.
+ * @param[in] type output tensor data type
+ * @retval Copied version of this
+ */
+ Tensor clone(ml::train::TensorDim::DataType type) const;
+
/**
* @brief Save the Tensor into file
* @param[in] file output file stream
@@ -2031,6 +2038,12 @@ class Tensor {
static constexpr float epsilon = 1e-5;
+ /**
+ * @brief check if there is NaN or Inf element
+ * @return false if there is a NaN or Inf element, true otherwise
+ */
+ bool isValid() const;
+
private:
/**< handle the data as a std::shared_ptr type */
TensorDim dim;
diff --git a/nntrainer/tensor/weight.cpp b/nntrainer/tensor/weight.cpp
index f98c8c8356..ea8c65a7cb 100644
--- a/nntrainer/tensor/weight.cpp
+++ b/nntrainer/tensor/weight.cpp
@@ -34,6 +34,28 @@ Weight::Weight(const TensorDim &dim, const Tensor::Initializer init,
throw std::invalid_argument("Weight initializer cannot be none");
if (regularizer == WeightRegularizer::UNKNOWN)
throw std::invalid_argument("Weight regularizer unknown");
+
+ std::string var32_suffix = ":fp32";
+ std::string var32_name = name + var32_suffix;
+
+ /**
+ * @note We assume if the Weight Data Type is not FP32, then FP32 Weight is
+ * necessary to maintain the accuracy.
+ * It could be another data type; if such a case needs to be supported,
+ * the code below needs to be updated.
+ *
+ * Also, the loss_scale is not used in Weight yet but is kept for later
+ * usage.
+ */
+
+ if (train && dim.getDataType() != ml::train::TensorDim::DataType::FP32) {
+ TensorDim var32_dim(dim);
+ var32_dim.setDataType(ml::train::TensorDim::DataType::FP32);
+
+ var32 = std::make_shared<Tensor>(var32_dim, alloc_now_, init, var32_name);
+ } else {
+ var32 = std::make_shared<Tensor>(var32_name);
+ }
}
Weight::Weight(const TensorDim &dim_v, const TensorDim &dim_g,
@@ -52,6 +74,93 @@ Weight::Weight(const TensorDim &dim_v, const TensorDim &dim_g,
throw std::invalid_argument("Weight initializer cannot be none");
if (regularizer == WeightRegularizer::UNKNOWN)
throw std::invalid_argument("Weight regularizer unknown");
+
+ std::string var32_suffix = ":fp32";
+ std::string var32_name = name + var32_suffix;
+
+ if (train && dim_v.getDataType() != ml::train::TensorDim::DataType::FP32) {
+ TensorDim var32_dim(dim_v);
+ var32_dim.setDataType(ml::train::TensorDim::DataType::FP32);
+ std::string var32_suffix = ":fp32";
+ std::string var32_name = name + var32_suffix;
+
+ var32 = std::make_shared<Tensor>(var32_dim, alloc_now_, init, var32_name);
+ } else {
+ var32 = std::make_shared<Tensor>(var32_name);
+ }
+}
+
+Weight::Weight(const Tensor &v, const Tensor &g, const Tensor &v32,
+ const std::string &n, bool is_dependent,
+ unsigned int output_axis_) :
+ Var_Grad(v, g, n, is_dependent),
+ regularizer(WeightRegularizer::NONE),
+ regularizer_constant(1.0f),
+ decay(0.0f),
+ clip_by_global_norm(0.0f),
+ output_axis(output_axis_),
+ loss_scale(1.0),
+ var32(std::make_shared<Tensor>(n + ":fp32")) {
+
+ if (!g.empty() && isMixedPrecision()) {
+ TensorDim var32_dim(v.getDim());
+ var32_dim.setDataType(ml::train::TensorDim::DataType::FP32);
+ if (!v32.empty())
+ var32 = std::make_shared<Tensor>(
+ v32.getSharedDataTensor(var32_dim, 0, false, n + ":fp32"));
+ }
+}
+
+Weight::Weight(Tensor *v, Tensor *g, Tensor *v32, const WeightRegularizer reg,
+ const float reg_const, const float decay, bool is_dependent,
+ const float max_norm, unsigned int output_axis_,
+ float loss_scale_) :
+ Var_Grad(v, g, is_dependent),
+ regularizer(reg),
+ regularizer_constant(reg_const),
+ decay(decay),
+ clip_by_global_norm(max_norm),
+ output_axis(output_axis_),
+ loss_scale(loss_scale_),
+ var32(std::shared_ptr<Tensor>(v32, [](void *) {})) {
+ if (!v32)
+ var32 = std::make_shared<Tensor>();
+}
+
+void Weight::applyGradient(double lr, Tensor &updated_grad) {
+ if (isMixedPrecision() &&
+ updated_grad.getDataType() == ml::train::TensorDim::DataType::FP32) {
+ var32->add_i(updated_grad, -lr);
+ quantizeWeight();
+ return;
+ }
+
+ return applyGradient(lr);
+}
+
+void Weight::quantizeWeight() {
+ if (!isMixedPrecision())
+ return;
+
+ Tensor &var = getVariableRef();
+ ml::train::TensorDim::DataType type = var.getDataType();
+ switch (type) {
+ case ml::train::TensorDim::DataType::QINT4:
+ // NYI
+ break;
+ case ml::train::TensorDim::DataType::QINT8:
+ // NYI
+ break;
+ case ml::train::TensorDim::DataType::FP16:
+ getVariableRef().copyData(getVariableFP32Ref());
+ break;
+ case ml::train::TensorDim::DataType::FP32:
+ break;
+ default:
+ break;
+ }
+
+ return;
}
} // namespace nntrainer
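
Putting the pieces above together, a hypothetical driver (not code from this patch; the unscaling step is an assumption about how loss_scale is intended to be used) for one mixed-precision update step: widen and unscale the gradient, apply it to the FP32 master copy, and let quantizeWeight() copy the result back into the FP16 variable.

#include <weight.h> // include style as used inside nntrainer sources (assumed)

void mixedPrecisionStep(nntrainer::Weight &w, float learning_rate) {
  // Widen the (possibly FP16) gradient and undo the loss scaling.
  nntrainer::Tensor grad32 =
    w.getGradientRef().clone(ml::train::TensorDim::DataType::FP32);
  grad32.multiply_i(1.0f / w.getLossScale());

  // applyGradient(lr, grad32) updates var32 and calls quantizeWeight(), which
  // copies the FP32 master weight back into the FP16 variable via copyData().
  w.applyGradient(learning_rate, grad32);
}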
diff --git a/nntrainer/tensor/weight.h b/nntrainer/tensor/weight.h
index 552f6d5739..ef65ca9318 100644
--- a/nntrainer/tensor/weight.h
+++ b/nntrainer/tensor/weight.h
@@ -46,7 +46,7 @@ class Weight : public Var_Grad {
decay(0.0f),
clip_by_global_norm(0.0f),
output_axis(3),
- loss_scale(0.0) {}
+ loss_scale(1.0) {}
/**
* @brief Construct a new Weight object
@@ -66,7 +66,7 @@ class Weight : public Var_Grad {
const float reg_const = 1.0f, const float decay = 0.0f,
const float clip_by_global_norm = 0.0f, bool ng = true,
bool alloc_now = false, std::string name = "", unsigned int axis = 3,
- float loss_scale_ = 0.0);
+ float loss_scale_ = 1.0);
/**
* @brief Construct a new Weight object
@@ -87,7 +87,7 @@ class Weight : public Var_Grad {
const float reg_const = 1.0f, const float decay = 0.0f,
const float clip_by_global_norm = 0.0f, bool ng = true,
bool alloc_now = false, std::string name = "", unsigned int axis = 3,
- float loss_scale_ = 0.0);
+ float loss_scale_ = 1.0);
/**
* @brief Construct a new Weight object
@@ -114,6 +114,7 @@ class Weight : public Var_Grad {
*
* @param v Already created variable object
* @param g Already created gradient object
+ * @param v32 Already created FP32 variable (master copy) object
* @param n Name for this Weight
*
* @note This is primarily used to create a wrapper of variable extracted from
@@ -123,35 +124,24 @@ class Weight : public Var_Grad {
* uses only, as Weight does not own the tensors v and g, and can go invalid
* if the owner of these tensors free the tensors.
*/
- explicit Weight(const Tensor &v, const Tensor &g, const std::string &n = "",
- bool is_dependent = false, unsigned int output_axis_ = 3) :
- Var_Grad(v, g, n, is_dependent),
- regularizer(WeightRegularizer::NONE),
- regularizer_constant(1.0f),
- decay(0.0f),
- clip_by_global_norm(0.0f),
- output_axis(output_axis_),
- loss_scale(0.0) {}
+ explicit Weight(const Tensor &v, const Tensor &g, const Tensor &v32,
+ const std::string &n = "", bool is_dependent = false,
+ unsigned int output_axis_ = 3);
/**
* @brief Construct a new Weight object
*
* @param v ptr to already created variable tensor
* @param g ptr to already created gradient tensor
+ * @param v32 ptr to already created variable32 tensor
* @param reg Regularizer for the weight
* @param reg_const Constant multiplier for regularizer
*/
- explicit Weight(Tensor *v, Tensor *g, const WeightRegularizer reg,
- const float reg_const, const float decay,
- bool is_dependent = false, const float max_norm = 0.0f,
- unsigned int output_axis_ = 3, float loss_scale_ = 0.0f) :
- Var_Grad(v, g, is_dependent),
- regularizer(reg),
- regularizer_constant(reg_const),
- decay(decay),
- clip_by_global_norm(max_norm),
- output_axis(output_axis_),
- loss_scale(loss_scale_) {}
+ explicit Weight(Tensor *v, Tensor *g, Tensor *v32,
+ const WeightRegularizer reg, const float reg_const,
+ const float decay, bool is_dependent = false,
+ const float max_norm = 0.0f, unsigned int output_axis_ = 3,
+ float loss_scale_ = 1.0f);
/**
* @brief Swap for weight
@@ -170,6 +160,7 @@ class Weight : public Var_Grad {
swap(lhs.output_axis, rhs.output_axis);
swap(lhs.opt_vars, rhs.opt_vars);
swap(lhs.loss_scale, rhs.loss_scale);
+ swap(lhs.var32, rhs.var32);
}
/**
@@ -213,6 +204,8 @@ class Weight : public Var_Grad {
w.var = std::make_shared<Tensor>(this->var->clone());
if (!this->grad->empty())
w.grad = std::make_shared<Tensor>(this->grad->clone());
+ if (!this->var32->empty())
+ w.var32 = std::make_shared<Tensor>(this->var32->clone());
return w;
}
@@ -294,6 +287,13 @@ class Weight : public Var_Grad {
*/
void applyGradient(double lr) { var->add_i(*grad.get(), -lr); }
+ /**
+ * @brief Apply the gradient to the weight with updated gradient
+ * @param[in] updated_grad gradient tensor which is updated in optimizer
+ * it may have a different data type from the weight's gradient, e.g. FP32
+ */
+ void applyGradient(double lr, Tensor &updated_grad);
+
/**
* @brief Check if the gradient is supposed to be clipped by global norm with
* the given max_norm value
@@ -316,6 +316,16 @@ class Weight : public Var_Grad {
return clip_by_global_norm > epsilon;
}
+ /**
+ * @brief Check if the variable type is not full precision
+ *
+ * @return true if it is not full precision
+ * @return false otherwise
+ */
+ bool isMixedPrecision() const {
+ return ((var->getDataType() != ml::train::TensorDim::DataType::FP32));
+ }
+
/**
* @brief clip the gradient value based on the given global norm
*
@@ -326,6 +336,32 @@ class Weight : public Var_Grad {
grad->multiply_i(clip_by_global_norm / (global_norm + epsilon));
}
+ /**
+ * @brief Get the variable FP32 tensor (by reference)
+ *
+ * @return Tensor Variable FP32 tensor
+ */
+ Tensor &getVariableFP32Ref() { return *var32.get(); }
+
+ /**
+ * @brief Quantize var32 to var
+ *
+ */
+ void quantizeWeight();
+
+ /**
+ * @brief set loss scale
+ * @param[in] scale loss scale value
+ *
+ */
+ void setLossScale(float scale) { loss_scale = scale; };
+
+ /**
+ * @brief get loss scale
+ *
+ */
+ const float getLossScale() { return loss_scale; };
+
private:
static constexpr float epsilon = 1e-6; /**< epsilon for zero comparison */
static constexpr float epsilon_decay =
@@ -337,7 +373,8 @@ class Weight : public Var_Grad {
float clip_by_global_norm; /**< constant factor to clip gradient by L2 norm */
unsigned int output_axis;
float loss_scale;
- std::vector<Tensor *> opt_vars; /**< optimizer variables */
+ std::vector<Tensor *>
+ opt_vars; /**< optimizer variables : We assume it is always full-precision*/
std::shared_ptr<Tensor> var32;
/**
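
The loss_scale member now defaults to 1.0, but the adjustment policy itself lives outside Weight. For context, a sketch of the standard dynamic loss-scaling rule (an assumption about typical usage, mirroring the defaults of torch.cuda.amp.GradScaler used by the test generator; not code from this patch):

#include <algorithm>

// Halve the scale whenever scaled gradients overflow to NaN/Inf; grow it back
// after a window of stable steps.
float adjustLossScale(float scale, bool grads_are_finite,
                      unsigned int &good_steps,
                      unsigned int growth_interval = 2000) {
  if (!grads_are_finite) {
    good_steps = 0;
    return std::max(scale * 0.5f, 1.0f);
  }
  if (++good_steps >= growth_interval) {
    good_steps = 0;
    return scale * 2.0f;
  }
  return scale;
}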
diff --git a/packaging/nntrainer.spec b/packaging/nntrainer.spec
index 36ba371d22..2f1dc57f68 100644
--- a/packaging/nntrainer.spec
+++ b/packaging/nntrainer.spec
@@ -65,6 +65,13 @@
%define neon_support -Denable-neon=false
%endif # arch aarch64
+%ifarch x86_64
+%define enable_avx 1
+%define avx_support -Denable-avx=true
+%else
+%define avx_support -Denable-avx=false
+%endif # arch x86_64
+
Name: nntrainer
Summary: Software framework for training neural networks
@@ -410,7 +417,7 @@ meson --buildtype=plain --prefix=%{_prefix} --sysconfdir=%{_sysconfdir} \
%{enable_reduce_tolerance} %{configure_subplugin_install_path} %{enable_debug} \
-Dml-api-support=enabled -Denable-nnstreamer-tensor-filter=enabled \
-Denable-nnstreamer-tensor-trainer=enabled -Denable-capi=enabled \
- %{fp16_support} %{neon_support} build
+ %{fp16_support} %{neon_support} %{avx_support} build
ninja -C build %{?_smp_mflags}
@@ -563,6 +570,10 @@ cp -r result %{buildroot}%{_datadir}/nntrainer/unittest/
%{_includedir}/nntrainer/util_simd_neon.h
%endif
+%if 0%{?enable_avx}
+%{_includedir}/nntrainer/blas_avx.h
+%endif
+
%files devel-static
%{_libdir}/libnntrainer*.a
%exclude %{_libdir}/libcapi*.a
diff --git a/packaging/unittest_layers.tar.gz b/packaging/unittest_layers.tar.gz
index 7a435aadf4..3bd488a0a2 100644
Binary files a/packaging/unittest_layers.tar.gz and b/packaging/unittest_layers.tar.gz differ
diff --git a/packaging/unittest_models_v3.tar.gz b/packaging/unittest_models_v3.tar.gz
index abc7ead4a4..49a1f1b2ad 100644
Binary files a/packaging/unittest_models_v3.tar.gz and b/packaging/unittest_models_v3.tar.gz differ
diff --git a/test/include/nntrainer_test_util.h b/test/include/nntrainer_test_util.h
index 74eef4abaa..8e16b6a9f4 100644
--- a/test/include/nntrainer_test_util.h
+++ b/test/include/nntrainer_test_util.h
@@ -347,6 +347,29 @@ float mse(Ta *A, Tb *B, uint32_t size) {
return mse;
}
+/**
+ * @brief calculate mean squared error
+ *
+ * @param A const prediction data
+ * @param B const reference data
+ * @param size data size
+ * @return mean squared error value
+ */
+template <typename Ta, typename Tb>
+float mse(const Ta *A, const Tb *B, uint32_t size) {
+ float pred;
+ float ref;
+ float mse_error = 0;
+ for (uint32_t i = 0; i < size; i++) {
+ pred = A[i];
+ ref = B[i];
+ float diff = pred - ref;
+ mse_error += pow(diff, 2);
+ }
+ float mse = mse_error / size;
+ return mse;
+}
+
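
A small usage sketch of the const-pointer overload (the values are made up):

static float exampleMse() {
  const float pred[4] = {0.10f, 0.20f, 0.30f, 0.40f};
  const float gold[4] = {0.10f, 0.25f, 0.30f, 0.35f};
  // (0^2 + 0.05^2 + 0^2 + 0.05^2) / 4 = 0.00125
  return mse(pred, gold, 4);
}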
/**
* @brief A helper struct for performing static_cast operations on types.
*
diff --git a/test/input_gen/genModelTests_v2.py b/test/input_gen/genModelTests_v2.py
index a56f437785..422c737487 100644
--- a/test/input_gen/genModelTests_v2.py
+++ b/test/input_gen/genModelTests_v2.py
@@ -11,6 +11,7 @@
import math
from recorder_v2 import record_v2, inspect_file, _rand_like
import torch
+from torch import autocast
class ReduceMeanLast(torch.nn.Module):
def __init__(self):
@@ -307,6 +308,40 @@ def forward(self, inputs, labels):
loss = self.loss(out, labels[0])
return out, loss
+class LinearMixedPrecision(torch.nn.Module):
+ def __init__(self):
+ super().__init__()
+ self.fc = torch.nn.Linear(3, 10)
+ self.loss = torch.nn.MSELoss()
+
+ def forward(self, inputs, labels):
+ with autocast(device_type='cuda', dtype=torch.float16):
+ input=inputs[0].to('cuda')
+ label=labels[0].to('cuda')
+ out = self.fc(input)
+ return out
+
+ def getOptimizer(self):
+ return torch.optim.Adam(self.parameters(), lr=0.1)
+
+class LinearMixedPrecisionNaNSGD(torch.nn.Module):
+ def __init__(self):
+ super().__init__()
+ self.fc0 = torch.nn.Linear(1, 1)
+ self.fc1 = torch.nn.Linear(1, 1)
+ self.loss = torch.nn.MSELoss()
+
+ def forward(self, inputs, labels):
+ with autocast(device_type='cuda', dtype=torch.float16):
+ input=inputs[0].to('cuda')
+ label=labels[0].to('cuda')
+ out = self.fc0(input)
+ out = self.fc1(out)
+ return out
+
+ def getOptimizer(self):
+ return torch.optim.SGD(self.parameters(), lr=0.1)
+
if __name__ == "__main__":
record_v2(
ReduceMeanLast(),
@@ -537,5 +572,28 @@ def forward(self, inputs, labels):
name="non_trainable_fc_idx3"
)
- # Function to check the created golden test file
+ fc_mixed_training = LinearMixedPrecision()
+ record_v2(
+ fc_mixed_training,
+ iteration=3,
+ input_dims=[(1,3)],
+ input_dtype=[float],
+ label_dims=[(1,10)],
+ name="fc_mixed_training",
+ optimizer=fc_mixed_training.getOptimizer()
+ )
+
+ fc_mixed_training_nan_sgd = LinearMixedPrecisionNaNSGD()
+ record_v2(
+ fc_mixed_training_nan_sgd,
+ iteration=5,
+ input_dims=[(1,1)],
+ input_dtype=[float],
+ label_dims=[(1,1)],
+ name="fc_mixed_training_nan_sgd",
+ optimizer=fc_mixed_training_nan_sgd.getOptimizer()
+ )
+
+# Function to check the created golden test file
inspect_file("non_trainable_fc_idx3.nnmodelgolden")
+
diff --git a/test/input_gen/gen_layer_tests.py b/test/input_gen/gen_layer_tests.py
index 48e68acaf1..7a1ed18ec6 100644
--- a/test/input_gen/gen_layer_tests.py
+++ b/test/input_gen/gen_layer_tests.py
@@ -17,6 +17,7 @@
@author Jihoon Lee
@author Sungsik Kong
+@author Debadri Samaddar
"""
import warnings
@@ -866,3 +867,19 @@ def call(self, inputs):
added = K.layers.Add()
record_single_fp16(added, [(2, 3, 3, 3), (2, 3, 3, 3)], "added_w16a16")
+
+ def swiglu(inputs):
+ [x, y] = inputs
+ # swish(x) = x * sigmoid(x)
+ swishTensor = x * K.activations.sigmoid(x)
+
+ return K.layers.Multiply()([swishTensor, y])
+
+ swiglu_layer = K.layers.Lambda(swiglu)
+
+ record_single(
+ swiglu_layer,
+ [(2, 3, 3, 3), (2, 3, 3, 3)],
+ "swiglu",
+ input_type="float",
+ )
diff --git a/test/input_gen/recorder_v2.py b/test/input_gen/recorder_v2.py
index 9bc219c767..6b8f42ff88 100644
--- a/test/input_gen/recorder_v2.py
+++ b/test/input_gen/recorder_v2.py
@@ -12,6 +12,8 @@
import random
import torch # torch used here is torch==1.9.1
import numpy as np
+import torch.cuda.amp as amp
+from torch import autocast
from transLayer_v2 import params_translated
@@ -29,13 +31,31 @@
def _get_writer(file):
- def write_fn(items):
+ def write_fn(items, type = 'float32'):
if not isinstance(items, (list, tuple)):
items = [items]
for item in items:
- np.array([item.numel()], dtype="int32").tofile(file)
- item.detach().cpu().numpy().tofile(file)
+ print(item.numel(), " -0-----")
+ print(item)
+ np.array([item.numel()], dtype='int32').tofile(file)
+ a=np.array(item.detach().cpu(), dtype=type)
+ a.tofile(file)
+ print(a.dtype)
+
+ return items
+
+ return write_fn
+
+def _get_writer_mixed(file):
+ def write_fn(items, num_type = 'int32', type = 'float32'):
+ if not isinstance(items, (list, tuple)):
+ items = [items]
+
+ for item in items:
+ np.array([item.numel()], dtype=num_type).tofile(file)
+ a=np.array(item.detach().cpu(), dtype=type)
+ a.tofile(file)
return items
@@ -96,14 +116,65 @@ def record_iteration(write_fn):
norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 0.0001)
optimizer.step()
+ def record_iteration_with_amp(write_fn, inputs, labels, is_nan, scaler):
+ model_= model.cuda()
+
+ print(inputs[0], " inputs inside")
+ output = model_(inputs[0], labels[0])
+
+ print("model output type: ",output.dtype)
+
+ with autocast(device_type='cuda', dtype=torch.float16):
+ l=model_.loss(output, labels[0].to('cuda'))
+
+ optimizer.zero_grad()
+
+ scaler.scale(l).backward()
+ print("Gradient ---------------")
+ for param in model_.parameters():
+ print (param.grad)
+ mask = torch.logical_or(torch.isnan(param.grad), torch.isinf(param.grad))
+ check_nan = mask.int()
+ if check_nan.sum().item():
+ is_nan = True
+ else:
+ is_nan = False
+
+
+ if not is_nan:
+ print("------------------------------- not nan")
+ write_fn(output,'int32','float32')
+ return output, is_nan
+
with open(file_name, "wb") as f:
# write number of iterations
+ print("iteration : ", iteration)
np.array([iteration], dtype="int32").tofile(f)
- write_fn = _get_writer(f)
- for _ in range(iteration):
- record_iteration(write_fn)
-
+ write_fn = _get_writer_mixed(f)
+ for i in range(iteration):
+ if input_label_reader != None:
+ inputs, labels = input_label_reader(input_dims, label_dims, input_dtype)
+ else:
+ inputs = _rand_like(input_dims, dtype=input_dtype if input_dtype is not None else float)
+ labels = _rand_like(label_dims, dtype=float)
+ print("inputs ==============")
+ write_fn(inputs,'int32', 'float32')
+ print("labels ==============")
+ write_fn(labels, 'int32', 'float32')
+ is_nan = True
+ print("=========================== ", i)
+ scaler = amp.GradScaler()
+ print("weights ==============")
+ write_fn(list(t for _, t in params_translated(model)),'int16','float16')
+ print("\n\n")
+ while(is_nan):
+ print( "before is_nan_", is_nan)
+ output,is_nan_ = record_iteration_with_amp(write_fn, inputs, labels, is_nan, scaler)
+ is_nan = is_nan_
+ print( "after is_nan_", is_nan)
+ scaler.step(optimizer)
+ scaler.update()
##
# @brief inspect if file is created correctly
diff --git a/test/jni/Android.mk b/test/jni/Android.mk
index a9033b65cc..978e98bd67 100644
--- a/test/jni/Android.mk
+++ b/test/jni/Android.mk
@@ -16,6 +16,7 @@ NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/nntrainer \
$(NNTRAINER_ROOT)/nntrainer/dataset \
$(NNTRAINER_ROOT)/nntrainer/models \
$(NNTRAINER_ROOT)/nntrainer/layers \
+ $(NNTRAINER_ROOT)/nntrainer/layers/cl_layers \
$(NNTRAINER_ROOT)/nntrainer/compiler \
$(NNTRAINER_ROOT)/nntrainer/graph \
$(NNTRAINER_ROOT)/nntrainer/opencl \
@@ -442,6 +443,7 @@ LOCAL_SRC_FILES := \
../unittest/layers/unittest_layers_impl.cpp \
../unittest/layers/unittest_layers_input.cpp \
../unittest/layers/unittest_layers_loss.cpp \
+ ../unittest/layers/unittest_layers_fully_connected_cl.cpp \
../unittest/layers/unittest_layers_fully_connected.cpp \
../unittest/layers/unittest_layers_batch_normalization.cpp \
../unittest/layers/unittest_layers_layer_normalization.cpp \
diff --git a/test/nntrainer_test_util.cpp b/test/nntrainer_test_util.cpp
index bcc33e40c8..5777bb75b2 100644
--- a/test/nntrainer_test_util.cpp
+++ b/test/nntrainer_test_util.cpp
@@ -332,6 +332,7 @@ void sizeCheckedReadTensor(nntrainer::Tensor &t, std::ifstream &file,
nntrainer::checkedRead(file, (char *)&sz, sizeof(unsigned));
} else if (t.getDataType() == ml::train::TensorDim::DataType::FP16) {
#ifdef ENABLE_FP16
+ // This needs to be fixed. sz is always unsigned int type.
nntrainer::checkedRead(file, (char *)&sz, sizeof(_FP16));
#else
throw std::invalid_argument("Error: enable-fp16 is not enabled");
diff --git a/test/unittest/layers/layers_common_tests.h b/test/unittest/layers/layers_common_tests.h
index 57f693c0a2..d63357c805 100644
--- a/test/unittest/layers/layers_common_tests.h
+++ b/test/unittest/layers/layers_common_tests.h
@@ -93,6 +93,7 @@ class LayerPropertySemantics : public LayerSemantics {};
typedef enum {
SKIP_CALC_GRAD = 1 << 0, /**< skip calculating gradient and compare */
SKIP_CALC_DERIV = 1 << 1, /**< skip calculating derivative and compare */
+ USE_INC_FORWARD = 1 << 2, /**< use incremental forwarding and compare */
FORWARD_MODE_INFERENCE =
1 << 2, /**< set if layer should be forwarded with inference mode */
@@ -172,6 +173,14 @@ class LayerGoldenTest
*/
bool shouldSkipCalcGrad();
+ /**
+ * @brief check if given test suite should use incremental forwarding instead
+ * of normal forwarding
+ *
+ * @return bool true if should use incremental forwarding
+ */
+ bool shouldUseIncForward();
+
/**
* @brief check if given test suite should skip cosine similarity check
*
diff --git a/test/unittest/layers/layers_golden_tests.cpp b/test/unittest/layers/layers_golden_tests.cpp
index 64400e6ecd..73f3954052 100644
--- a/test/unittest/layers/layers_golden_tests.cpp
+++ b/test/unittest/layers/layers_golden_tests.cpp
@@ -156,7 +156,7 @@ static RunLayerContext prepareRunContext(const TensorPacks &packs) {
};
auto rc =
- RunLayerContext("golden", true, 0.0f, false, create_view(weights),
+ RunLayerContext("golden", true, 0.0f, false, 1.0, create_view(weights),
create_view(ins), create_view(outs), create_view(tensors));
auto num_outputs = rc.getNumOutputs();
@@ -364,6 +364,11 @@ bool LayerGoldenTest::shouldSkipCalcGrad() {
LayerGoldenTestParamOptions::SKIP_CALC_GRAD;
}
+bool LayerGoldenTest::shouldUseIncForward() {
+ return std::get(GetParam()) &
+ LayerGoldenTestParamOptions::USE_INC_FORWARD;
+}
+
bool LayerGoldenTest::shouldSkipCosineSimilarity() {
return std::get(GetParam()) &
LayerGoldenTestParamOptions::SKIP_COSINE_SIMILARITY;
@@ -387,15 +392,31 @@ TEST_P(LayerGoldenTest, run) {
bool skip_calc_grad = shouldSkipCalcGrad();
bool skip_calc_deriv = shouldSkipCalcDeriv();
+ bool use_inc_forward = shouldUseIncForward();
bool dropout_compare_60_percent = shouldMatchDropout60Percent();
bool skip_cos_sim = shouldSkipCosineSimilarity();
+ Tensor &input = rc.getInput(0);
+ TensorDim input_dim = input.getDim();
+ size_t inputHeight = input_dim.height();
+
for (int i = 0; i < 4; ++i) {
/// warm layer multiple times
+ if (use_inc_forward) {
+ layer->incremental_forwarding(rc, 0, inputHeight,
+ !shouldForwardWithInferenceMode());
+ } else {
+ layer->forwarding(rc, !shouldForwardWithInferenceMode());
+ }
+ }
+
+ if (use_inc_forward) {
+ layer->incremental_forwarding(rc, 0, inputHeight,
+ !shouldForwardWithInferenceMode());
+ } else {
layer->forwarding(rc, !shouldForwardWithInferenceMode());
}
- layer->forwarding(rc, !shouldForwardWithInferenceMode());
if (!skip_calc_grad) {
layer->calcGradient(rc);
}
diff --git a/test/unittest/layers/unittest_layer_node.cpp b/test/unittest/layers/unittest_layer_node.cpp
index 3b41f02f30..37287f7ce5 100644
--- a/test/unittest/layers/unittest_layer_node.cpp
+++ b/test/unittest/layers/unittest_layer_node.cpp
@@ -131,7 +131,7 @@ TEST(nntrainer_LayerNode, finalize_05_n) {
nntrainer::createLayerNode(nntrainer::IdentityLayer::type));
EXPECT_NO_THROW(lnode->setProperty({"input_shape=1:1:1", "name=abc"}));
EXPECT_NO_THROW(lnode->finalize());
- EXPECT_NO_THROW(lnode->configureRunContext({}, {&input}, {}, {}));
+ EXPECT_NO_THROW(lnode->configureRunContext({}, {&input}, {}, {}, 1.0));
EXPECT_THROW(lnode->finalize(), std::runtime_error);
}
@@ -298,7 +298,7 @@ TEST(nntrainer_LayerNode, setWeights_02_n) {
EXPECT_NO_THROW(lnode =
nntrainer::createLayerNode(nntrainer::IdentityLayer::type));
EXPECT_NO_THROW(lnode->setProperty({"input_shape=1:1:1", "name=abc"}));
- EXPECT_NO_THROW(lnode->configureRunContext({&weight}, {&input}, {}, {}));
+ EXPECT_NO_THROW(lnode->configureRunContext({&weight}, {&input}, {}, {}, 1.0));
EXPECT_THROW(lnode->setWeights(new_weights), std::runtime_error);
}
diff --git a/test/unittest/layers/unittest_layers_convolution2d.cpp b/test/unittest/layers/unittest_layers_convolution2d.cpp
index 724c79079b..92d9c593e7 100644
--- a/test/unittest/layers/unittest_layers_convolution2d.cpp
+++ b/test/unittest/layers/unittest_layers_convolution2d.cpp
@@ -198,3 +198,185 @@ GTEST_PARAMETER_TEST(
conv2d_mb_valid_drop_last, conv2d_sb_no_overlap, conv2d_mb_no_overlap,
conv2d_sb_1x1_kernel, conv2d_mb_1x1_kernel, conv2d_sb_dilation,
conv2d_mb_dilation, conv2d_sb_same_dilation, conv2d_mb_same_dilation));
+
+#ifdef ENABLE_FP16
+auto conv2d_sb_minimum_w16a16 = LayerGoldenTestParamType(
+ nntrainer::createLayer<nntrainer::Conv2DLayer>,
+ {"filters=3", "kernel_size=2,2"}, "1:1:4:4",
+ "conv2d_sb_minimum_w16a16.nnlayergolden",
+ LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16");
+
+auto conv2d_mb_minimum_w16a16 = LayerGoldenTestParamType(
+ nntrainer::createLayer