diff --git a/.github/workflows/pylint.yml b/.github/workflows/pylint.yml deleted file mode 100644 index c61f731855..0000000000 --- a/.github/workflows/pylint.yml +++ /dev/null @@ -1,51 +0,0 @@ -# Copyright 2021 The TensorFlow Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# ============================================================================== - -name: PyLint -on: - pull_request: - paths: - - '**.py' - -permissions: - contents: read - -jobs: - build: - name: PyLint - runs-on: ubuntu-latest - steps: - - name: Checkout code - uses: actions/checkout@v4 - - name: Get file changes - id: get_file_changes - uses: trilom/file-changes-action@v1.2.4 - with: - output: ' ' - - name: Report list of changed files - run: | - echo Changed files: ${{ steps.get_file_changes.outputs.files }} - - name: Set up Python 3.10 - uses: actions/setup-python@v5 - with: - python-version: "3.10" - - name: Install Python dependencies - run: | - python -m pip install --upgrade pip - pip install pylint==3.0.2 numpy wheel - pip install -r ci/requirements.txt - - name: Run PyLint on changed files - run: | - echo "${{ steps.get_file_changes.outputs.files}}" | tr " " "\n" | grep ".py$" | xargs pylint --rcfile=ci/pylintrc diff --git a/.gitignore b/.gitignore index 06c3c0710a..96b03f7bf9 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,7 @@ /build /builddir .cache/ +.idea/ # jni build files iniparser/ diff --git a/Applications/Android/PicoGPTJNI/.gitignore b/Applications/Android/PicoGPTJNI/.gitignore new file mode 100644 index 0000000000..54ed6ea235 --- /dev/null +++ b/Applications/Android/PicoGPTJNI/.gitignore @@ -0,0 +1,19 @@ +*.iml +.gradle +/.vscode +/.idea +/local.properties +/.idea/caches +/.idea/libraries +/.idea/modules.xml +/.idea/workspace.xml +/.idea/navEditor.xml +/.idea/assetWizardSettings.xml +.DS_Store +/build +/captures +.externalNativeBuild +.cxx +local.properties +/app/src/main/jniLibs +/app/src/main/obj diff --git a/Applications/Android/PicoGPTJNI/.gradle/7.5/checksums/checksums.lock b/Applications/Android/PicoGPTJNI/.gradle/7.5/checksums/checksums.lock deleted file mode 100644 index 62d1fcfe2b..0000000000 Binary files a/Applications/Android/PicoGPTJNI/.gradle/7.5/checksums/checksums.lock and /dev/null differ diff --git a/Applications/Android/PicoGPTJNI/.gradle/7.5/dependencies-accessors/dependencies-accessors.lock b/Applications/Android/PicoGPTJNI/.gradle/7.5/dependencies-accessors/dependencies-accessors.lock deleted file mode 100644 index 4f1595be70..0000000000 Binary files a/Applications/Android/PicoGPTJNI/.gradle/7.5/dependencies-accessors/dependencies-accessors.lock and /dev/null differ diff --git a/Applications/Android/PicoGPTJNI/.gradle/7.5/dependencies-accessors/gc.properties b/Applications/Android/PicoGPTJNI/.gradle/7.5/dependencies-accessors/gc.properties deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/Applications/Android/PicoGPTJNI/.gradle/7.5/executionHistory/executionHistory.lock 
b/Applications/Android/PicoGPTJNI/.gradle/7.5/executionHistory/executionHistory.lock deleted file mode 100644 index 506dd636a9..0000000000 Binary files a/Applications/Android/PicoGPTJNI/.gradle/7.5/executionHistory/executionHistory.lock and /dev/null differ diff --git a/Applications/Android/PicoGPTJNI/.gradle/7.5/fileHashes/fileHashes.lock b/Applications/Android/PicoGPTJNI/.gradle/7.5/fileHashes/fileHashes.lock deleted file mode 100644 index 096927b1af..0000000000 Binary files a/Applications/Android/PicoGPTJNI/.gradle/7.5/fileHashes/fileHashes.lock and /dev/null differ diff --git a/Applications/Android/PicoGPTJNI/.gradle/7.5/gc.properties b/Applications/Android/PicoGPTJNI/.gradle/7.5/gc.properties deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/Applications/Android/PicoGPTJNI/.gradle/buildOutputCleanup/buildOutputCleanup.lock b/Applications/Android/PicoGPTJNI/.gradle/buildOutputCleanup/buildOutputCleanup.lock deleted file mode 100644 index 2ab7eb0273..0000000000 Binary files a/Applications/Android/PicoGPTJNI/.gradle/buildOutputCleanup/buildOutputCleanup.lock and /dev/null differ diff --git a/Applications/Android/PicoGPTJNI/.gradle/buildOutputCleanup/cache.properties b/Applications/Android/PicoGPTJNI/.gradle/buildOutputCleanup/cache.properties deleted file mode 100644 index f11a0f4e85..0000000000 --- a/Applications/Android/PicoGPTJNI/.gradle/buildOutputCleanup/cache.properties +++ /dev/null @@ -1,2 +0,0 @@ -#Tue Feb 14 16:37:06 KST 2023 -gradle.version=7.5 diff --git a/Applications/Android/PicoGPTJNI/.gradle/checksums/checksums.lock b/Applications/Android/PicoGPTJNI/.gradle/checksums/checksums.lock deleted file mode 100644 index 287309dd96..0000000000 Binary files a/Applications/Android/PicoGPTJNI/.gradle/checksums/checksums.lock and /dev/null differ diff --git a/Applications/Android/PicoGPTJNI/.gradle/file-system.probe b/Applications/Android/PicoGPTJNI/.gradle/file-system.probe deleted file mode 100644 index 71fa644c1c..0000000000 Binary files a/Applications/Android/PicoGPTJNI/.gradle/file-system.probe and /dev/null differ diff --git a/Applications/Android/PicoGPTJNI/.gradle/vcs-1/gc.properties b/Applications/Android/PicoGPTJNI/.gradle/vcs-1/gc.properties deleted file mode 100644 index e69de29bb2..0000000000 diff --git a/Applications/Android/PicoGPTJNI/.idea/compiler.xml b/Applications/Android/PicoGPTJNI/.idea/compiler.xml deleted file mode 100644 index 5421743a9c..0000000000 --- a/Applications/Android/PicoGPTJNI/.idea/compiler.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - diff --git a/Applications/Android/PicoGPTJNI/.idea/gradle.xml b/Applications/Android/PicoGPTJNI/.idea/gradle.xml deleted file mode 100644 index b795db1fe1..0000000000 --- a/Applications/Android/PicoGPTJNI/.idea/gradle.xml +++ /dev/null @@ -1,20 +0,0 @@ - - - - - - - diff --git a/Applications/Android/PicoGPTJNI/.idea/misc.xml b/Applications/Android/PicoGPTJNI/.idea/misc.xml deleted file mode 100644 index 0f31685c15..0000000000 --- a/Applications/Android/PicoGPTJNI/.idea/misc.xml +++ /dev/null @@ -1,12 +0,0 @@ - - - - - - - - \ No newline at end of file diff --git a/Applications/Android/PicoGPTJNI/.idea/vcs.xml b/Applications/Android/PicoGPTJNI/.idea/vcs.xml deleted file mode 100644 index c2365ab11f..0000000000 --- a/Applications/Android/PicoGPTJNI/.idea/vcs.xml +++ /dev/null @@ -1,6 +0,0 @@ - - - - - - \ No newline at end of file diff --git a/Applications/Android/PicoGPTJNI/.idea/workspace.xml b/Applications/Android/PicoGPTJNI/.idea/workspace.xml deleted file mode 100644 index 039da86b98..0000000000 --- 
a/Applications/Android/PicoGPTJNI/.idea/workspace.xml +++ /dev/null @@ -1,147 +0,0 @@ - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 1676357527812 - - - - diff --git a/Applications/KNN/jni/meson.build b/Applications/KNN/jni/meson.build index bc50dc0214..58ca099d75 100644 --- a/Applications/KNN/jni/meson.build +++ b/Applications/KNN/jni/meson.build @@ -15,4 +15,4 @@ e = executable('knn_sample', install_dir: application_install_dir ) -test('app_knn', e, args: [nntr_app_resdir / 'KNN']) +test('app_knn', e, args: [nntr_app_resdir / 'KNN/']) diff --git a/Applications/LLaMA/jni/main.cpp b/Applications/LLaMA/jni/main.cpp index 96be8671dc..985d82a79e 100644 --- a/Applications/LLaMA/jni/main.cpp +++ b/Applications/LLaMA/jni/main.cpp @@ -56,7 +56,7 @@ int const NUM_VOCAB = 96000; int MAX_SEQ_LEN = 1024; int NUM_TO_GENERATE = 100; -constexpr unsigned int INIT_SEQ_LEN = 30; +constexpr unsigned int INIT_SEQ_LEN = 28; unsigned int batch_size = 1; unsigned int epoch = 1; @@ -596,7 +596,7 @@ void run(std::string text, bool apply_temperature) { float init_input[INIT_SEQ_LEN] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 200, 300, 400, 500, 600, 700, 800, 900}; - ((uint *)(input_sample))[0] = init_input[0]; + memcpy(input_sample, init_input, sizeof(float) * INIT_SEQ_LEN); input.push_back(input_sample); init_len = 18; #endif diff --git a/Applications/Resnet/README.md b/Applications/Resnet/README.md index f76d5b25de..f195a8c764 100644 --- a/Applications/Resnet/README.md +++ b/Applications/Resnet/README.md @@ -14,7 +14,7 @@ Please file an issue if you have a problem running the example. ```bash $ meson ${build_dir} -Denable-test=true -Denable-long-test=true -$ meson test app_resnet18 -v -c ${build_dir} +$ meson test app_resnet18 -v -C ${build_dir} ``` ### To run with a real data. 
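The LLaMA hunk above replaces a single-element cast-assignment with a full-buffer copy, so every value of `init_input` reaches `input_sample`, and shrinks `INIT_SEQ_LEN` from 30 to 28 to match the 28-entry initializer list. A minimal sketch of the difference, assuming `input_sample` is a `float *` buffer of at least `INIT_SEQ_LEN` elements (the wrapper function and buffer handling here are illustrative, not part of the patch):

```cpp
#include <cstring> // memcpy

constexpr unsigned int INIT_SEQ_LEN = 28;

// Hypothetical helper: fills the model input buffer with the fixed prompt ids.
void fill_init_input(float *input_sample) {
  const float init_input[INIT_SEQ_LEN] = {
    0,   1,   2,   3,   4,   5,   6,   7,   8,   9,
    10,  20,  30,  40,  50,  60,  70,  80,  90,  100,
    200, 300, 400, 500, 600, 700, 800, 900};

  // Old form copied only the first element, and through an int-typed alias:
  //   ((uint *)(input_sample))[0] = init_input[0];

  // New form copies all 28 floats of the initial sequence into the buffer.
  memcpy(input_sample, init_input, sizeof(float) * INIT_SEQ_LEN);
}
```

The same hunk group also fixes the Resnet README's `meson test` invocation: `-C ${build_dir}` (uppercase) is the flag that selects the build directory, which is why the lowercase `-c` is replaced.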
diff --git a/Applications/YOLO/PyTorch/main.py b/Applications/YOLO/PyTorch/main.py deleted file mode 100644 index b831e1ebb1..0000000000 --- a/Applications/YOLO/PyTorch/main.py +++ /dev/null @@ -1,171 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# Copyright (C) 2023 Seungbaek Hong -# -# @file main.py -# @date 8 March 2023 -# @brief Implement training for yolo -# -# @author Seungbaek Hong - -import numpy as np -import torch -import torch.nn as nn -import torch.optim as optim -import torch.nn.functional as F -from torch.utils.data import DataLoader - -from yolo import YoloV2 -from yolo_loss import YoloV2_LOSS -from dataset import YOLODataset, collate_db - -import sys -import os - -# get pyutils path using relative path -def get_util_path(): - current_path = os.path.abspath(os.path.dirname(__file__)) - parent_path = os.path.abspath(os.path.dirname(current_path)) - target_path = os.path.abspath(os.path.dirname(parent_path)) - return os.path.dirname(target_path) + '/tools/pyutils/' - -# add pyutils path to sys.path -sys.path.append(get_util_path()) -from torchconverter import save_bin - -# set config -out_size = 13 -num_classes = 4 -num_anchors = 5 - -epochs = 3 -batch_size = 4 - -train_img_dir = '/home/user/TRAIN_DIR/images/*' -train_ann_dir = '/home/user/TRAIN_DIR/annotations/*' -valid_img_dir = '/home/user/VALID_DIR/images/*' -valid_ann_dir = '/home/user/VALID_DIR/annotations/*' - -# load data -train_dataset = YOLODataset(train_img_dir, train_ann_dir) -train_loader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_db, shuffle=True, drop_last=True) -valid_dataset = YOLODataset(valid_img_dir, valid_ann_dir) -valid_loader = DataLoader(valid_dataset, batch_size=batch_size, collate_fn=collate_db, shuffle=False, drop_last=True) - -# set model, loss and optimizer -model = YoloV2(num_classes=num_classes) -criterion = YoloV2_LOSS(num_classes=num_classes) -optimizer = optim.Adam(model.parameters(), lr=1e-3) -# scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10, eta_min=0) - -# save init model -save_bin(model, 'init_model') -torch.save(model.state_dict(), './init_model.pt') - -# train model -best_loss = 1e+10 -for epoch in range(epochs): - epoch_train_loss = 0 - epoch_valid_loss = 0 - for idx, (img, bbox, cls) in enumerate(train_loader): - model.train() - optimizer.zero_grad() - # model prediction - hypothesis = model(img).permute((0, 2, 3, 1)) - hypothesis = hypothesis.reshape((batch_size, out_size**2, num_anchors, 5+num_classes)) - # split each prediction(bbox, iou, class prob) - bbox_pred_xy = torch.sigmoid(hypothesis[..., :2]) - bbox_pred_wh = torch.exp(hypothesis[..., 2:4]) - bbox_pred = torch.cat((bbox_pred_xy, bbox_pred_wh), 3) - iou_pred = torch.sigmoid(hypothesis[..., 4:5]) - score_pred = hypothesis[..., 5:].contiguous() - prob_pred = torch.softmax(score_pred.view(-1, num_classes), dim=1).view(score_pred.shape) - # calc loss - loss = criterion(torch.FloatTensor(bbox_pred), - torch.FloatTensor(iou_pred), - torch.FloatTensor(prob_pred), - bbox, - cls) - # back prop - loss.backward() - optimizer.step() - # scheduler.step() - epoch_train_loss += loss.item() - - for idx, (img, bbox, cls) in enumerate(valid_loader): - model.eval() - with torch.no_grad(): - # model prediction - hypothesis = model(img).permute((0, 2, 3, 1)) - hypothesis = hypothesis.reshape((hypothesis.shape[0], out_size**2, num_anchors, 5+num_classes)) - # split each prediction(bbox, iou, class prob) - bbox_pred_xy = torch.sigmoid(hypothesis[..., :2]) - bbox_pred_wh = 
torch.exp(hypothesis[..., 2:4]) - bbox_pred = torch.cat((bbox_pred_xy, bbox_pred_wh), 3) - iou_pred = torch.sigmoid(hypothesis[..., 4:5]) - score_pred = hypothesis[..., 5:].contiguous() - prob_pred = torch.softmax(score_pred.view(-1, num_classes), dim=1).view(score_pred.shape) - # calc loss - loss = criterion(torch.FloatTensor(bbox_pred), - torch.FloatTensor(iou_pred), - torch.FloatTensor(prob_pred), - bbox, - cls) - epoch_valid_loss += loss.item() - - if epoch_valid_loss < best_loss: - best_loss = epoch_valid_loss - torch.save(model.state_dict(), './best_model.pt') - save_bin(model, 'best_model') - - print("{}epoch, train loss: {:.4f}, valid loss: {:.4f}".format( - epoch, epoch_train_loss / len(train_loader), epoch_valid_loss / len(valid_loader))) - -## -# @brief bbox post process function for inference -def post_process_for_bbox(bbox_pred): - """ - @param bbox_pred shape(batch_size, cell_h x cell_w, num_anchors, 4) - @return bbox_pred shape(batch_size, cell_h x cell_w, num_anchors, 4) - """ - anchors = torch.FloatTensor( - [(1.3221, 1.73145), - (3.19275, 4.00944), - (5.05587, 8.09892), - (9.47112, 4.84053), - (11.2364, 10.0071)] - ) - - outsize = (13, 13) - width, height = outsize - - # restore cell pos to x, y - for w in range(width): - for h in range(height): - bbox_pred[:, height*h + w, :, 0] += w - bbox_pred[:, height*h + w, :, 1] += h - bbox_pred[:, :, :, :2] /= 13 - - # apply anchors to w, h - anchor_w = anchors[:, 0].contiguous().view(-1, 1) - anchor_h = anchors[:, 1].contiguous().view(-1, 1) - bbox_pred[:, :, :, 2:3] *= anchor_w - bbox_pred[:, :, :, 3:4] *= anchor_h - - return bbox_pred - -# inference example using trained model -hypothesis = model(img).permute((0, 2, 3, 1)) -hypothesis = hypothesis[0].reshape((1, out_size**2, num_anchors, 5+num_classes)) - -# transform output -bbox_pred_xy = torch.sigmoid(hypothesis[..., :2]) -bbox_pred_wh = torch.exp(hypothesis[..., 2:4]) -bbox_pred = torch.cat((bbox_pred_xy, bbox_pred_wh), 3) -bbox_pred = post_process_for_bbox(bbox_pred) -iou_pred = torch.sigmoid(hypothesis[..., 4:5]) -score_pred = hypothesis[..., 5:].contiguous() -prob_pred = torch.softmax(score_pred.view(-1, num_classes), dim=1).view(score_pred.shape) - -# result of inference (data range 0~1) -iou_mask = (iou_pred > 0.5) -print(bbox_pred * iou_mask, iou_pred * iou_mask, prob_pred * iou_mask) diff --git a/Applications/YOLO/PyTorch/yolo.py b/Applications/YOLO/PyTorch/yolo.py deleted file mode 100644 index 53763f1be7..0000000000 --- a/Applications/YOLO/PyTorch/yolo.py +++ /dev/null @@ -1,105 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# Copyright (C) 2023 Seungbaek Hong -# -# @file yolo.py -# @date 8 March 2023 -# @brief Define simple yolo model, but not original darknet. 
-# -# @author Seungbaek Hong - -import torch -import torch.nn as nn - -## -# @brief define yolo model (except for re-organization module) -class YoloV2(nn.Module): - def __init__(self, num_classes, num_anchors=5): - - super(YoloV2, self).__init__() - self.num_classes = num_classes - self.num_anchors = num_anchors - self.conv1 = nn.Sequential(nn.Conv2d(3, 32, 3, 1, 1), nn.BatchNorm2d(32, eps=1e-3), - nn.LeakyReLU(), nn.MaxPool2d(2, 2)) - self.conv2 = nn.Sequential(nn.Conv2d(32, 64, 3, 1, 1), nn.BatchNorm2d(64, eps=1e-3), - nn.LeakyReLU(), nn.MaxPool2d(2, 2)) - self.conv3 = nn.Sequential(nn.Conv2d(64, 128, 3, 1, 1), nn.BatchNorm2d(128, eps=1e-3), - nn.LeakyReLU()) - self.conv4 = nn.Sequential(nn.Conv2d(128, 64, 1, 1, 0), nn.BatchNorm2d(64, eps=1e-3), - nn.LeakyReLU()) - self.conv5 = nn.Sequential(nn.Conv2d(64, 128, 3, 1, 1), nn.BatchNorm2d(128, eps=1e-3), - nn.LeakyReLU(), nn.MaxPool2d(2, 2)) - self.conv6 = nn.Sequential(nn.Conv2d(128, 256, 3, 1, 1), nn.BatchNorm2d(256, eps=1e-3), - nn.LeakyReLU()) - self.conv7 = nn.Sequential(nn.Conv2d(256, 128, 1, 1, 0), nn.BatchNorm2d(128, eps=1e-3), - nn.LeakyReLU()) - self.conv8 = nn.Sequential(nn.Conv2d(128, 256, 3, 1, 1), nn.BatchNorm2d(256, eps=1e-3), - nn.LeakyReLU(), nn.MaxPool2d(2, 2)) - self.conv9 = nn.Sequential(nn.Conv2d(256, 512, 3, 1, 1), nn.BatchNorm2d(512, eps=1e-3), - nn.LeakyReLU()) - self.conv10 = nn.Sequential(nn.Conv2d(512, 256, 1, 1, 0), nn.BatchNorm2d(256, eps=1e-3), - nn.LeakyReLU()) - self.conv11 = nn.Sequential(nn.Conv2d(256, 512, 3, 1, 1), nn.BatchNorm2d(512, eps=1e-3), - nn.LeakyReLU()) - self.conv12 = nn.Sequential(nn.Conv2d(512, 256, 1, 1, 0), nn.BatchNorm2d(256, eps=1e-3), - nn.LeakyReLU()) - self.conv13 = nn.Sequential(nn.Conv2d(256, 512, 3, 1, 1), nn.BatchNorm2d(512, eps=1e-3), - nn.LeakyReLU()) - - self.conv_b = nn.Sequential(nn.Conv2d(512, 64, 1, 1, 0), nn.BatchNorm2d(64, eps=1e-3), - nn.LeakyReLU()) - - self.maxpool_a = nn.MaxPool2d(2, 2) - self.conv_a1 = nn.Sequential(nn.Conv2d(512, 1024, 3, 1, 1), nn.BatchNorm2d(1024, eps=1e-3), - nn.LeakyReLU()) - self.conv_a2 = nn.Sequential(nn.Conv2d(1024, 512, 1, 1, 0), nn.BatchNorm2d(512, eps=1e-3), - nn.LeakyReLU()) - self.conv_a3 = nn.Sequential(nn.Conv2d(512, 1024, 3, 1, 1), nn.BatchNorm2d(1024, eps=1e-3), - nn.LeakyReLU()) - self.conv_a4 = nn.Sequential(nn.Conv2d(1024, 512, 1, 1, 0), nn.BatchNorm2d(512, eps=1e-3), - nn.LeakyReLU()) - self.conv_a5 = nn.Sequential(nn.Conv2d(512, 1024, 3, 1, 1), nn.BatchNorm2d(1024, eps=1e-3), - nn.LeakyReLU()) - self.conv_a6 = nn.Sequential(nn.Conv2d(1024, 1024, 3, 1, 1), nn.BatchNorm2d(1024, eps=1e-3), - nn.LeakyReLU()) - self.conv_a7 = nn.Sequential(nn.Conv2d(1024, 1024, 3, 1, 1), nn.BatchNorm2d(1024, eps=1e-3), - nn.LeakyReLU()) - - self.conv_out1 = nn.Sequential(nn.Conv2d(1280, 1024, 3, 1, 1), nn.BatchNorm2d(1024, eps=1e-3), - nn.LeakyReLU()) - - self.conv_out2 = nn.Conv2d(1024, self.num_anchors * (5 + num_classes), 1, 1, 0) - - def forward(self, input): - output = self.conv1(input) - output = self.conv2(output) - output = self.conv3(output) - output = self.conv4(output) - output = self.conv5(output) - output = self.conv6(output) - output = self.conv7(output) - output = self.conv8(output) - output = self.conv9(output) - output = self.conv10(output) - output = self.conv11(output) - output = self.conv12(output) - output = self.conv13(output) - - output_a = self.maxpool_a(output) - output_a = self.conv_a1(output_a) - output_a = self.conv_a2(output_a) - output_a = self.conv_a3(output_a) - output_a = self.conv_a4(output_a) - output_a = 
self.conv_a5(output_a) - output_a = self.conv_a6(output_a) - output_a = self.conv_a7(output_a) - - output_b = self.conv_b(output) - b, c, h, w = output_b.size() - output_b = output_b.view(b, int(c / 4), h, 2, w, 2).contiguous() - output_b = output_b.permute(0, 3, 5, 1, 2, 4).contiguous() - output_b = output_b.view(b, -1, int(h / 2), int(w / 2)) - - output = torch.cat((output_a, output_b), 1) - output = self.conv_out1(output) - output = self.conv_out2(output) - return output diff --git a/Applications/YOLO/PyTorch/dataset.py b/Applications/YOLOv2/PyTorch/dataset.py similarity index 58% rename from Applications/YOLO/PyTorch/dataset.py rename to Applications/YOLOv2/PyTorch/dataset.py index a02971ae87..d939e0f8a9 100644 --- a/Applications/YOLO/PyTorch/dataset.py +++ b/Applications/YOLOv2/PyTorch/dataset.py @@ -8,50 +8,68 @@ # @author Seungbaek Hong import glob +import re import numpy as np import torch from torch.utils.data import Dataset from torch.utils.data.dataloader import default_collate from PIL import Image + ## # @brief dataset class for yolo -# @note Need annotation text files corresponding to the name of the images. +# @note Need annotation text files corresponding to the name of the images. class YOLODataset(Dataset): def __init__(self, img_dir, ann_dir): super().__init__() - img_list = glob.glob(img_dir) - ann_list = glob.glob(ann_dir) - img_list.sort(), ann_list.sort() + self.img_dir = img_dir + pattern = re.compile("\/(\d+)\.") + img_list = glob.glob(img_dir + "*") + ann_list = glob.glob(ann_dir + "*") + + img_ids = list(map(lambda x: pattern.search(x).group(1), img_list)) + ann_ids = list(map(lambda x: pattern.search(x).group(1), ann_list)) + ids_list = list(set(img_ids) & set(ann_ids)) - self.length = len(img_list) - self.input_images = [] + self.ids_list = [] self.bbox_gt = [] self.cls_gt = [] - for i in range(len(img_list)): - img = np.array(Image.open(img_list[i]).resize((416, 416))) / 255 + for ids in ids_list: label_bbox = [] label_cls = [] - with open(ann_list[i], 'rt') as f: + with open(ann_dir + ids + ".txt", "rt", encoding="utf-8") as f: for line in f.readlines(): line = [float(i) for i in line.split()] label_bbox.append(np.array(line[1:], dtype=np.float32) / 416) label_cls.append(int(line[0])) - self.input_images.append(img) + if len(label_cls) == 0: + continue + + self.ids_list.append(ids) self.bbox_gt.append(label_bbox) self.cls_gt.append(label_cls) - self.input_images = np.array(self.input_images) - self.input_images = torch.FloatTensor(self.input_images).permute((0, 3, 1, 2)) + self.length = len(self.ids_list) def __len__(self): return self.length - + def __getitem__(self, idx): - return self.input_images[idx], self.bbox_gt[idx], self.cls_gt[idx] - + img = ( + torch.FloatTensor( + np.array( + Image.open(self.img_dir + self.ids_list[idx] + ".jpg").resize( + (416, 416) + ) + ) + ).permute((2, 0, 1)) + / 255 + ) + return img, self.bbox_gt[idx], self.cls_gt[idx] + + ## # @brief collate db function for yolo def collate_db(batch): diff --git a/Applications/YOLOv2/PyTorch/main.py b/Applications/YOLOv2/PyTorch/main.py new file mode 100644 index 0000000000..6e42fa1c6b --- /dev/null +++ b/Applications/YOLOv2/PyTorch/main.py @@ -0,0 +1,222 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (C) 2023 Seungbaek Hong +# +# @file main.py +# @date 8 March 2023 +# @brief Implement training for yolo +# +# @author Seungbaek Hong + +import sys +import os + +from PIL import Image, ImageDraw +from matplotlib import pyplot as plt +from torch import optim +from torch.utils.data import 
DataLoader +import torch +import numpy as np + +from yolo import YoloV2 +from yolo_loss import YoloV2_LOSS +from dataset import YOLODataset, collate_db +from torchconverter import save_bin + +device = "cuda" if torch.cuda.is_available() else "cpu" + + +# get pyutils path using relative path +def get_util_path(): + current_path = os.path.abspath(os.path.dirname(__file__)) + parent_path = os.path.abspath(os.path.dirname(current_path)) + target_path = os.path.abspath(os.path.dirname(parent_path)) + return os.path.dirname(target_path) + "/tools/pyutils/" + + +# add pyutils path to sys.path +sys.path.append(get_util_path()) + +# set config +out_size = 13 +num_classes = 4 +num_anchors = 5 + +epochs = 3 +batch_size = 4 + +train_img_dir = "/home/user/TRAIN_DIR/images/" +train_ann_dir = "/home/user/TRAIN_DIR/annotations/" +valid_img_dir = "/home/user/VALID_DIR/images/" +valid_ann_dir = "/home/user/VALID_DIR/annotations/" + +# load data +train_dataset = YOLODataset(train_img_dir, train_ann_dir) +train_loader = DataLoader( + train_dataset, + batch_size=batch_size, + collate_fn=collate_db, + shuffle=True, + drop_last=True, +) +valid_dataset = YOLODataset(valid_img_dir, valid_ann_dir) +valid_loader = DataLoader( + valid_dataset, + batch_size=batch_size, + collate_fn=collate_db, + shuffle=False, + drop_last=True, +) + +# set model, loss and optimizer +model = YoloV2(num_classes=num_classes).to(device) +criterion = YoloV2_LOSS( + num_classes=num_classes, img_shape=(416, 416), device=device +).to(device) +optimizer = optim.Adam(model.parameters(), lr=1e-5) +scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10, eta_min=0) + +# save init model +save_bin(model, "init_model") +torch.save(model.state_dict(), "./init_model.pt") + +# train model +best_loss = 1e10 +for epoch in range(epochs): + epoch_train_loss = 0 + epoch_valid_loss = 0 + model.train() + for idx, (img, bbox, cls) in enumerate(train_loader): + optimizer.zero_grad() + # model prediction + hypothesis = model(img.to(device)).permute((0, 2, 3, 1)) + hypothesis = hypothesis.reshape( + (batch_size, out_size**2, num_anchors, 5 + num_classes) + ) + # split each prediction(bbox, iou, class prob) + bbox_pred_xy = torch.sigmoid(hypothesis[..., :2]) + bbox_pred_wh = torch.exp(hypothesis[..., 2:4]) + bbox_pred = torch.cat((bbox_pred_xy, bbox_pred_wh), 3) + iou_pred = torch.sigmoid(hypothesis[..., 4:5]) + score_pred = hypothesis[..., 5:].contiguous() + prob_pred = torch.softmax(score_pred.view(-1, num_classes), dim=1).view( + score_pred.shape + ) + # calc loss + loss = criterion(bbox_pred, iou_pred, prob_pred, bbox, cls) + # back prop + loss.backward() + optimizer.step() + scheduler.step() + epoch_train_loss += loss.item() + + model.eval() + for idx, (img, bbox, cls) in enumerate(valid_loader): + with torch.no_grad(): + # model prediction + hypothesis = model(img.to(device)).permute((0, 2, 3, 1)) + hypothesis = hypothesis.reshape( + (hypothesis.shape[0], out_size**2, num_anchors, 5 + num_classes) + ) + # split each prediction(bbox, iou, class prob) + bbox_pred_xy = torch.sigmoid(hypothesis[..., :2]) + bbox_pred_wh = torch.exp(hypothesis[..., 2:4]) + bbox_pred = torch.cat((bbox_pred_xy, bbox_pred_wh), 3) + iou_pred = torch.sigmoid(hypothesis[..., 4:5]) + score_pred = hypothesis[..., 5:].contiguous() + prob_pred = torch.softmax(score_pred.view(-1, num_classes), dim=1).view( + score_pred.shape + ) + # calc loss + loss = criterion(bbox_pred, iou_pred, prob_pred, bbox, cls) + epoch_valid_loss += loss.item() + + if epoch_valid_loss < best_loss: 
+ best_loss = epoch_valid_loss + torch.save(model.state_dict(), "./best_model.pt") + save_bin(model, "best_model") + + print( + f"{epoch}epoch, train loss: {epoch_train_loss / len(train_loader):.4f},\ + valid loss: {epoch_valid_loss / len(valid_loader):.4f}" + ) + + +## +# @brief bbox post process function for inference +def post_process_for_bbox(bbox_p): + """ + @param bbox_p shape(batch_size, cell_h x cell_w, num_anchors, 4) + @return bbox_p shape(batch_size, cell_h x cell_w, num_anchors, 4) + """ + anchors = torch.FloatTensor( + [ + (1.3221, 1.73145), + (3.19275, 4.00944), + (5.05587, 8.09892), + (9.47112, 4.84053), + (11.2364, 10.0071), + ] + ) + + outsize = (13, 13) + width, height = outsize + + # restore cell pos to x, y + for w in range(width): + for h in range(height): + bbox_p[:, height * h + w, :, 0] += w + bbox_p[:, height * h + w, :, 1] += h + bbox_p[:, :, :, :2] /= 13 + + # apply anchors to w, h + anchor_w = anchors[:, 0].contiguous().view(-1, 1).to(device) + anchor_h = anchors[:, 1].contiguous().view(-1, 1).to(device) + bbox_p[:, :, :, 2:3] *= anchor_w + bbox_p[:, :, :, 3:4] *= anchor_h + + return bbox_p + + +def visualize_bbox(img_pred, bbox_preds): + img_array = (img_pred.to("cpu") * 255).permute((1, 2, 0)).numpy().astype(np.uint8) + img = Image.fromarray(img_array) + + for bbox_pred in bbox_preds: + bbox_pred = [int(x * 416) for x in bbox_pred] + + if sum(bbox_pred) == 0: + continue + + x_lefttop = bbox_pred[0] + y_lefttop = bbox_pred[1] + width = bbox_pred[2] + height = bbox_pred[3] + + draw = ImageDraw.Draw(img) + draw.rectangle( + [(x_lefttop, y_lefttop), (x_lefttop + width, y_lefttop + height)] + ) + + plt.imshow(img) + plt.show() + + +# inference example using trained model +hypothesis = model(img.to(device)).permute((0, 2, 3, 1)) +hypothesis = hypothesis[0].reshape((1, out_size**2, num_anchors, 5 + num_classes)) + +# transform output +bbox_pred_xy = torch.sigmoid(hypothesis[..., :2]) +bbox_pred_wh = torch.exp(hypothesis[..., 2:4]) +bbox_pred = torch.cat((bbox_pred_xy, bbox_pred_wh), 3) +bbox_pred = post_process_for_bbox(bbox_pred) +iou_pred = torch.sigmoid(hypothesis[..., 4:5]) +score_pred = hypothesis[..., 5:].contiguous() +prob_pred = torch.softmax(score_pred.view(-1, num_classes), dim=1).view( + score_pred.shape +) + +# result of inference (data range 0~1) +iou_mask = iou_pred > 0.5 +bbox_pred = bbox_pred * iou_mask +visualize_bbox(img, bbox_pred.reshape(-1, 4)) diff --git a/Applications/YOLOv2/PyTorch/yolo.py b/Applications/YOLOv2/PyTorch/yolo.py new file mode 100644 index 0000000000..390cbd5ada --- /dev/null +++ b/Applications/YOLOv2/PyTorch/yolo.py @@ -0,0 +1,176 @@ +# SPDX-License-Identifier: Apache-2.0 +# Copyright (C) 2023 Seungbaek Hong +# +# @file yolo.py +# @date 8 March 2023 +# @brief Define simple yolo model, but not original darknet. 
+# +# @author Seungbaek Hong + +import torch +from torch import nn + + +## +# @brief define yolo model (except for re-organization module) +class YoloV2(nn.Module): + def __init__(self, num_classes, num_anchors=5): + + super().__init__() + self.num_classes = num_classes + self.num_anchors = num_anchors + self.conv1 = nn.Sequential( + nn.Conv2d(3, 32, 3, 1, 1, bias=False), + nn.BatchNorm2d(32), + nn.LeakyReLU(0.1), + nn.MaxPool2d(2, 2), + ) + self.conv2 = nn.Sequential( + nn.Conv2d(32, 64, 3, 1, 1, bias=False), + nn.BatchNorm2d(64), + nn.LeakyReLU(0.1), + nn.MaxPool2d(2, 2), + ) + self.conv3 = nn.Sequential( + nn.Conv2d(64, 128, 3, 1, 1, bias=False), + nn.BatchNorm2d(128), + nn.LeakyReLU(0.1), + ) + self.conv4 = nn.Sequential( + nn.Conv2d(128, 64, 1, 1, 0, bias=False), + nn.BatchNorm2d(64), + nn.LeakyReLU(0.1), + ) + self.conv5 = nn.Sequential( + nn.Conv2d(64, 128, 3, 1, 1, bias=False), + nn.BatchNorm2d(128), + nn.LeakyReLU(0.1), + nn.MaxPool2d(2, 2), + ) + self.conv6 = nn.Sequential( + nn.Conv2d(128, 256, 3, 1, 1, bias=False), + nn.BatchNorm2d(256), + nn.LeakyReLU(0.1), + ) + self.conv7 = nn.Sequential( + nn.Conv2d(256, 128, 1, 1, 0, bias=False), + nn.BatchNorm2d(128), + nn.LeakyReLU(0.1), + ) + self.conv8 = nn.Sequential( + nn.Conv2d(128, 256, 3, 1, 1, bias=False), + nn.BatchNorm2d(256), + nn.LeakyReLU(0.1), + nn.MaxPool2d(2, 2), + ) + self.conv9 = nn.Sequential( + nn.Conv2d(256, 512, 3, 1, 1, bias=False), + nn.BatchNorm2d(512), + nn.LeakyReLU(0.1), + ) + self.conv10 = nn.Sequential( + nn.Conv2d(512, 256, 1, 1, 0, bias=False), + nn.BatchNorm2d(256), + nn.LeakyReLU(0.1), + ) + self.conv11 = nn.Sequential( + nn.Conv2d(256, 512, 3, 1, 1, bias=False), + nn.BatchNorm2d(512), + nn.LeakyReLU(0.1), + ) + self.conv12 = nn.Sequential( + nn.Conv2d(512, 256, 1, 1, 0, bias=False), + nn.BatchNorm2d(256), + nn.LeakyReLU(0.1), + ) + self.conv13 = nn.Sequential( + nn.Conv2d(256, 512, 3, 1, 1, bias=False), + nn.BatchNorm2d(512), + nn.LeakyReLU(0.1), + ) + + self.conv_b = nn.Sequential( + nn.Conv2d(512, 64, 1, 1, 0, bias=False), + nn.BatchNorm2d(64), + nn.LeakyReLU(0.1), + ) + + self.maxpool_a = nn.MaxPool2d(2, 2) + self.conv_a1 = nn.Sequential( + nn.Conv2d(512, 1024, 3, 1, 1, bias=False), + nn.BatchNorm2d(1024), + nn.LeakyReLU(0.1), + ) + self.conv_a2 = nn.Sequential( + nn.Conv2d(1024, 512, 1, 1, 0, bias=False), + nn.BatchNorm2d(512), + nn.LeakyReLU(0.1), + ) + self.conv_a3 = nn.Sequential( + nn.Conv2d(512, 1024, 3, 1, 1, bias=False), + nn.BatchNorm2d(1024), + nn.LeakyReLU(0.1), + ) + self.conv_a4 = nn.Sequential( + nn.Conv2d(1024, 512, 1, 1, 0, bias=False), + nn.BatchNorm2d(512), + nn.LeakyReLU(0.1), + ) + self.conv_a5 = nn.Sequential( + nn.Conv2d(512, 1024, 3, 1, 1, bias=False), + nn.BatchNorm2d(1024), + nn.LeakyReLU(0.1), + ) + self.conv_a6 = nn.Sequential( + nn.Conv2d(1024, 1024, 3, 1, 1, bias=False), + nn.BatchNorm2d(1024), + nn.LeakyReLU(0.1), + ) + self.conv_a7 = nn.Sequential( + nn.Conv2d(1024, 1024, 3, 1, 1, bias=False), + nn.BatchNorm2d(1024), + nn.LeakyReLU(0.1), + ) + + self.conv_out1 = nn.Sequential( + nn.Conv2d(1280, 1024, 3, 1, 1, bias=False), + nn.BatchNorm2d(1024), + nn.LeakyReLU(0.1), + ) + + self.conv_out2 = nn.Conv2d(1024, self.num_anchors * (5 + num_classes), 1, 1, 0) + + def forward(self, x): + output = self.conv1(x) + output = self.conv2(output) + output = self.conv3(output) + output = self.conv4(output) + output = self.conv5(output) + output = self.conv6(output) + output = self.conv7(output) + output = self.conv8(output) + output = self.conv9(output) + output = self.conv10(output) + 
output = self.conv11(output) + output = self.conv12(output) + output = self.conv13(output) + + output_a = self.maxpool_a(output) + output_a = self.conv_a1(output_a) + output_a = self.conv_a2(output_a) + output_a = self.conv_a3(output_a) + output_a = self.conv_a4(output_a) + output_a = self.conv_a5(output_a) + output_a = self.conv_a6(output_a) + output_a = self.conv_a7(output_a) + + output_b = self.conv_b(output) + b, c, h, w = output_b.size() + output_b = output_b.view(b, int(c / 4), h, 2, w, 2).contiguous() + output_b = output_b.permute(0, 3, 5, 1, 2, 4).contiguous() + output_b = output_b.view(b, -1, int(h / 2), int(w / 2)) + + output = torch.cat((output_a, output_b), 1) + output = self.conv_out1(output) + output = self.conv_out2(output) + return output diff --git a/Applications/YOLO/PyTorch/yolo_loss.py b/Applications/YOLOv2/PyTorch/yolo_loss.py similarity index 72% rename from Applications/YOLO/PyTorch/yolo_loss.py rename to Applications/YOLOv2/PyTorch/yolo_loss.py index 12f95572a4..c444821236 100644 --- a/Applications/YOLO/PyTorch/yolo_loss.py +++ b/Applications/YOLOv2/PyTorch/yolo_loss.py @@ -8,10 +8,10 @@ # @author Seungbaek Hong import torch -import torch.nn as nn -import torch.functional as F +from torch import nn import numpy as np + ## # @brief calculate iou between two boxes list def calculate_iou(bbox1, bbox2): @@ -25,27 +25,28 @@ def calculate_iou(bbox1, bbox2): b1x2, b1y2 = (bbox1[:, :2] + (bbox1[:, 2:4])).split(1, 1) b2x1, b2y1 = (bbox2[:, :2]).split(1, 1) b2x2, b2y2 = (bbox2[:, :2] + (bbox2[:, 2:4])).split(1, 1) - + # box areas areas1 = (b1x2 - b1x1) * (b1y2 - b1y1) areas2 = (b2x2 - b2x1) * (b2y2 - b2y1) - + # intersections min_x_of_max_x, max_x_of_min_x = torch.min(b1x2, b2x2), torch.max(b1x1, b2x1) min_y_of_max_y, max_y_of_min_y = torch.min(b1y2, b2y2), torch.max(b1y1, b2y1) intersection_width = (min_x_of_max_x - max_x_of_min_x).clamp(min=0) intersection_height = (min_y_of_max_y - max_y_of_min_y).clamp(min=0) intersections = intersection_width * intersection_height - - # unions + + # unions unions = (areas1 + areas2) - intersections - - result = intersections / unions + + result = intersections / unions return result + ## # @brief find best iou and its index -def find_best_ratio(anchors, bbox): +def find_best_ratio(anchors, bbox): """ @param anchors shape(numb_of_anchors, 2), it contains w, h @param bbox shape(numb_of_bbox, 2), it contains w, h @@ -57,52 +58,59 @@ def find_best_ratio(anchors, bbox): best_match = np.argmin(similarities, axis=0) return best_match + ## # @brief loss class for yolo class YoloV2_LOSS(nn.Module): """Yolo v2 loss""" - def __init__(self, num_classes, img_shape = (416, 416), outsize = (13, 13)): + + def __init__(self, num_classes, img_shape, device="cpu", outsize=(13, 13)): super().__init__() + self.device = device self.num_classes = num_classes self.img_shape = img_shape self.outsize = outsize - self.hook = dict() - + self.hook = {} + self.anchors = torch.FloatTensor( - [(1.3221, 1.73145), - (3.19275, 4.00944), - (5.05587, 8.09892), - (9.47112, 4.84053), - (11.2364, 10.0071)] + [ + (1.3221, 1.73145), + (3.19275, 4.00944), + (5.05587, 8.09892), + (9.47112, 4.84053), + (11.2364, 10.0071), + ] ) - + self.mse = nn.MSELoss() self.bbox_loss, self.iou_loss, self.cls_loss = None, None, None - + ## - # @brief function to track gradients of non-leaf varibles. + # @brief function to track gradients of non-leaf varibles. def hook_variable(self, name, var): - """ Do not use this function when training. It is for debugging. 
""" + """Do not use this function when training. It is for debugging.""" self.hook[name] = var self.hook[name].requires_grad_().retain_grad() ## # @brief function to print gradients of non-leaf varibles. def print_hook_variables(self): - """ Do not use this function when training. It is for debugging. """ + """Do not use this function when training. It is for debugging.""" for k, var in self.hook.items(): - print("gradients of variable {}:".format(k)) + print(f"gradients of variable {k}:") batch, channel, height, width = var.grad.shape for b in range(batch): for c in range(channel): for h in range(height): for w in range(width): if torch.abs(var.grad[b, c, h, w]).item() >= 1e-3: - print("(b: {}, c: {}, h: {}, w: {}) = {}"\ - .format(b, c, h, w, var.grad[b, c, h, w])) + print( + f"(b: {b}, c: {c}, h: {h}, w: {w}) =\ + {var.grad[b, c, h, w]}" + ) print("=" * 20) - - def forward(self, bbox_pred, iou_pred, prob_pred, bbox_gt, cls_gt): + + def forward(self, bbox_pred, iou_pred, prob_pred, bbox_gt, cls_gt): """ @param bbox_pred shape(batch_size, cell_h x cell_w, num_anchors, 4) @param iou_pred shape(batch_size, cell_h x cell_w, 1) @@ -114,52 +122,50 @@ def forward(self, bbox_pred, iou_pred, prob_pred, bbox_gt, cls_gt): self.hook_variable("bbox_pred", bbox_pred) bbox_pred = self.apply_anchors_to_bbox(bbox_pred) - bbox_built, iou_built, cls_built, bbox_mask, iou_mask, cls_mask =\ + bbox_built, iou_built, cls_built, bbox_mask, iou_mask, cls_mask = ( self._build_target(bbox_pred, bbox_gt, cls_gt) - - self.bbox_loss = self.mse(bbox_pred * bbox_mask, - bbox_built * bbox_mask) - self.iou_loss = self.mse(iou_pred * iou_mask, - iou_built * iou_mask) - self.cls_loss = self.mse(prob_pred * cls_mask, - cls_built * cls_mask) - + ) + + self.bbox_loss = self.mse(bbox_pred * bbox_mask, bbox_built * bbox_mask) + self.iou_loss = self.mse(iou_pred * iou_mask, iou_built * iou_mask) + self.cls_loss = self.mse(prob_pred * cls_mask, cls_built * cls_mask) + return self.bbox_loss * 5 + self.iou_loss + self.cls_loss - + def apply_anchors_to_bbox(self, bbox_pred): """ @param bbox_pred shape(batch_size, cell_h x cell_w, num_anchors, 4) - @return bbox_pred shape(batch_size, cell_h x cell_w, num_anchors, 4) + @return bbox_pred shape(batch_size, cell_h x cell_w, num_anchors, 4) """ - anchor_w = self.anchors[:, 0].contiguous().view(-1, 1) - anchor_h = self.anchors[:, 1].contiguous().view(-1, 1) + anchor_w = self.anchors[:, 0].contiguous().view(-1, 1).to(self.device) + anchor_h = self.anchors[:, 1].contiguous().view(-1, 1).to(self.device) bbox_pred_tmp = bbox_pred.clone() bbox_pred_tmp[:, :, :, 2:3] = torch.sqrt(bbox_pred[:, :, :, 2:3] * anchor_w) bbox_pred_tmp[:, :, :, 3:4] = torch.sqrt(bbox_pred[:, :, :, 3:4] * anchor_h) return bbox_pred_tmp - + def _build_target(self, bbox_pred, bbox_gt, cls_gt): """ @param bbox_pred shape(batch_size, cell_h x cell_w, num_anchors, 4) @param bbox_gt shape(batch_size, num_bbox, 4) @param cls_gt shape(batch_size, num_bbox, 1) @return tuple of (bbox_built, iou_built, cls_built, bbox_mask, iou_mask, cls_mask) - """ + """ bbox_built, bbox_mask = [], [] iou_built, iou_mask = [], [] cls_built, cls_mask = [], [] - + batch_size = bbox_pred.shape[0] - + for i in range(batch_size): - _bbox_built, _iou_built, _cls_built,\ - _bbox_mask, _iou_mask, _cls_mask =\ - self._make_target_per_sample( - torch.FloatTensor(bbox_pred[i]), - torch.FloatTensor(np.array(bbox_gt[i])), - torch.LongTensor(cls_gt[i]) - ) - + _bbox_built, _iou_built, _cls_built, _bbox_mask, _iou_mask, _cls_mask = ( + 
self._make_target_per_sample( + bbox_pred[i], + torch.FloatTensor(np.array(bbox_gt[i])), + torch.LongTensor(cls_gt[i]), + ) + ) + bbox_built.append(_bbox_built) bbox_mask.append(_bbox_mask) iou_built.append(_iou_built) @@ -173,9 +179,16 @@ def _build_target(self, bbox_pred, bbox_gt, cls_gt): iou_mask = torch.stack(iou_mask) cls_built = torch.stack(cls_built) cls_mask = torch.stack(cls_mask) - - return bbox_built, iou_built, cls_built, bbox_mask, iou_mask, cls_mask - + + return ( + bbox_built.to(self.device), + iou_built.to(self.device), + cls_built.to(self.device), + bbox_mask.to(self.device), + iou_mask.to(self.device), + cls_mask.to(self.device), + ) + def _make_target_per_sample(self, _bbox_pred, _bbox_gt, _cls_gt): """ @param _bbox_pred shape(cell_h x cell_w, num_anchors, 4) @@ -183,22 +196,22 @@ def _make_target_per_sample(self, _bbox_pred, _bbox_gt, _cls_gt): @param _cls_gt shape(num_bbox,) @return tuple of (_bbox_built, _iou_built, _cls_built, _bbox_mask, _iou_mask, _cls_mask) """ - hw, num_anchors, _ = _bbox_pred.shape - + hw, num_anchors, _ = _bbox_pred.shape + # set result template _bbox_built = torch.zeros((hw, num_anchors, 4)) _bbox_mask = torch.zeros((hw, num_anchors, 1)) - + _iou_built = torch.zeros((hw, num_anchors, 1)) _iou_mask = torch.ones((hw, num_anchors, 1)) * 0.5 - + _cls_built = torch.zeros((hw, num_anchors, self.num_classes)) _cls_mask = torch.zeros((hw, num_anchors, 1)) - + # find best anchors - _bbox_gt_wh = _bbox_gt.clone()[:, 2:] + _bbox_gt_wh = _bbox_gt.clone()[:, 2:] best_anchors = find_best_ratio(self.anchors, _bbox_gt_wh) - + # normalize x, y pos based on cell coornindates cx = _bbox_gt[:, 0] * self.outsize[0] cy = _bbox_gt[:, 1] * self.outsize[1] @@ -207,22 +220,23 @@ def _make_target_per_sample(self, _bbox_pred, _bbox_gt, _cls_gt): cell_idx = np.array(cell_idx, dtype=np.int16) cx -= np.floor(cx) cy -= np.floor(cy) - + # set bbox of gt - _bbox_built[cell_idx, best_anchors, 0] = cx + _bbox_built[cell_idx, best_anchors, 0] = cx _bbox_built[cell_idx, best_anchors, 1] = cy - _bbox_built[cell_idx, best_anchors, 2] = torch.sqrt(_bbox_gt[:, 2]) - _bbox_built[cell_idx, best_anchors, 3] = torch.sqrt(_bbox_gt[:, 3]) + _bbox_built[cell_idx, best_anchors, 2] = torch.sqrt(_bbox_gt[:, 2]) + _bbox_built[cell_idx, best_anchors, 3] = torch.sqrt(_bbox_gt[:, 3]) _bbox_mask[cell_idx, best_anchors, :] = 1 - - # set cls of gt + + # set cls of gt _cls_built[cell_idx, best_anchors, _cls_gt] = 1 _cls_mask[cell_idx, best_anchors, :] = 1 - + # set confidence score of gt - _iou_built = calculate_iou(_bbox_pred.reshape(-1, 4), _bbox_built.view(-1, 4)).detach() + _iou_built = calculate_iou( + _bbox_pred.reshape(-1, 4), _bbox_built.view(-1, 4).to(self.device) + ).detach() _iou_built = _iou_built.view(hw, num_anchors, 1) _iou_mask[cell_idx, best_anchors, :] = 1 - - return _bbox_built, _iou_built, _cls_built,\ - _bbox_mask, _iou_mask, _cls_mask + + return _bbox_built, _iou_built, _cls_built, _bbox_mask, _iou_mask, _cls_mask diff --git a/Applications/YOLO/jni/Android.mk b/Applications/YOLOv2/jni/Android.mk similarity index 100% rename from Applications/YOLO/jni/Android.mk rename to Applications/YOLOv2/jni/Android.mk diff --git a/Applications/YOLO/jni/Application.mk b/Applications/YOLOv2/jni/Application.mk similarity index 100% rename from Applications/YOLO/jni/Application.mk rename to Applications/YOLOv2/jni/Application.mk diff --git a/Applications/YOLO/jni/det_dataloader.cpp b/Applications/YOLOv2/jni/det_dataloader.cpp similarity index 100% rename from 
Applications/YOLO/jni/det_dataloader.cpp rename to Applications/YOLOv2/jni/det_dataloader.cpp diff --git a/Applications/YOLO/jni/det_dataloader.h b/Applications/YOLOv2/jni/det_dataloader.h similarity index 100% rename from Applications/YOLO/jni/det_dataloader.h rename to Applications/YOLOv2/jni/det_dataloader.h diff --git a/Applications/YOLO/jni/main.cpp b/Applications/YOLOv2/jni/main.cpp similarity index 97% rename from Applications/YOLO/jni/main.cpp rename to Applications/YOLOv2/jni/main.cpp index bc3985adbd..018602e408 100644 --- a/Applications/YOLO/jni/main.cpp +++ b/Applications/YOLOv2/jni/main.cpp @@ -139,6 +139,7 @@ std::vector yoloBlock(const std::string &block_name, withKey("filters", filters), withKey("kernel_size", {kernel_size, kernel_size}), withKey("padding", padding), + withKey("disable_bias", "true"), withKey("input_layers", input_layer)}; return createLayer("conv2d", props); @@ -150,6 +151,7 @@ std::vector yoloBlock(const std::string &block_name, if (downsample) { LayerHandle a2 = createLayer("batch_normalization", {with_name("a2"), withKey("momentum", "0.9"), + withKey("epsilon", 0.00001), withKey("activation", "leaky_relu")}); LayerHandle a3 = createLayer( @@ -158,10 +160,10 @@ std::vector yoloBlock(const std::string &block_name, return {a1, a2, a3}; } else { - LayerHandle a2 = - createLayer("batch_normalization", - {withKey("name", block_name), withKey("momentum", "0.9"), - withKey("activation", "leaky_relu")}); + LayerHandle a2 = createLayer( + "batch_normalization", + {withKey("name", block_name), withKey("momentum", "0.9"), + withKey("epsilon", 0.00001), withKey("activation", "leaky_relu")}); return {a1, a2}; } diff --git a/Applications/YOLO/jni/meson.build b/Applications/YOLOv2/jni/meson.build similarity index 100% rename from Applications/YOLO/jni/meson.build rename to Applications/YOLOv2/jni/meson.build diff --git a/Applications/YOLO/jni/reorg_layer.cpp b/Applications/YOLOv2/jni/reorg_layer.cpp similarity index 100% rename from Applications/YOLO/jni/reorg_layer.cpp rename to Applications/YOLOv2/jni/reorg_layer.cpp diff --git a/Applications/YOLO/jni/reorg_layer.h b/Applications/YOLOv2/jni/reorg_layer.h similarity index 100% rename from Applications/YOLO/jni/reorg_layer.h rename to Applications/YOLOv2/jni/reorg_layer.h diff --git a/Applications/YOLO/jni/yolo_v2_loss.cpp b/Applications/YOLOv2/jni/yolo_v2_loss.cpp similarity index 100% rename from Applications/YOLO/jni/yolo_v2_loss.cpp rename to Applications/YOLOv2/jni/yolo_v2_loss.cpp diff --git a/Applications/YOLO/jni/yolo_v2_loss.h b/Applications/YOLOv2/jni/yolo_v2_loss.h similarity index 100% rename from Applications/YOLO/jni/yolo_v2_loss.h rename to Applications/YOLOv2/jni/yolo_v2_loss.h diff --git a/Applications/meson.build b/Applications/meson.build index 2e3f59fdf2..7c8ef63cd4 100644 --- a/Applications/meson.build +++ b/Applications/meson.build @@ -9,7 +9,7 @@ if enable_ccapi endif subdir('VGG/jni') subdir('Resnet/jni') -subdir('YOLO/jni') +subdir('YOLOv2/jni') subdir('YOLOv3/jni') subdir('LLaMA/jni') subdir('Multi_input/jni') diff --git a/ci/pylintrc b/ci/pylintrc deleted file mode 100644 index aa38200415..0000000000 --- a/ci/pylintrc +++ /dev/null @@ -1,36 +0,0 @@ -[MASTER] - -[MESSAGESCONTROL] -disable= - too-many-instance-attributes, - len-as-condition, - too-few-public-methods, - anomalous-backslash-in-string, - no-else-return, - simplifiable-if-statement, - too-many-arguments, - duplicate-code, - no-name-in-module, - no-member, - raw-checker-failed, - bad-inline-option, - locally-disabled, - 
file-ignored, - suppressed-message, - useless-suppression, - deprecated-pragma, - import-error, - missing-docstring, - invalid-name, - consider-using-enumerate - -[SIMILARITIES] - -# Ignore comments when computing similarities. -ignore-comments=yes - -# Ignore docstrings when computing similarities. -ignore-docstrings=yes - -# Ignore imports when computing similarities. -ignore-imports=no diff --git a/ci/requirements.txt b/ci/requirements.txt deleted file mode 100644 index 0be69076fc..0000000000 --- a/ci/requirements.txt +++ /dev/null @@ -1,81 +0,0 @@ -absl-py==2.1.0 -astroid==3.0.2 -astunparse==1.6.3 -cachetools==5.3.2 -certifi==2023.11.17 -charset-normalizer==3.3.2 -contourpy==1.2.0 -cycler==0.12.1 -dill==0.3.8 -filelock==3.13.1 -flatbuffers==23.5.26 -fonttools==4.47.2 -fsspec==2023.12.2 -gast==0.5.4 -google-auth==2.27.0 -google-auth-oauthlib==1.2.0 -google-pasta==0.2.0 -grpcio==1.60.0 -h5py==3.10.0 -huggingface-hub==0.20.3 -idna==3.6 -importlib-metadata==7.0.1 -importlib-resources==6.1.1 -isort==5.13.2 -Jinja2==3.1.3 -joblib==1.3.2 -keras==2.15.0 -kiwisolver==1.4.5 -libclang==16.0.6 -Markdown==3.5.2 -MarkupSafe==2.1.4 -matplotlib==3.8.2 -mccabe==0.7.0 -ml-dtypes==0.2.0 -mpmath==1.3.0 -networkx==3.2.1 -numpy==1.26.3 -oauthlib==3.2.2 -opt-einsum==3.3.0 -packaging==23.2 -pandas==2.2.0 -pillow==10.2.0 -platformdirs==4.2.0 -protobuf==4.23.4 -pyasn1==0.5.1 -pyasn1-modules==0.3.0 -pylint==3.0.2 -pyparsing==3.1.1 -python-dateutil==2.8.2 -pytz==2023.4 -PyYAML==6.0.1 -regex==2023.12.25 -requests==2.31.0 -requests-oauthlib==1.3.1 -rsa==4.9 -safetensors==0.4.2 -scikit-learn==1.4.0 -scipy==1.12.0 -six==1.16.0 -sympy==1.12 -tensorboard==2.15.1 -tensorboard-data-server==0.7.2 -tensorflow==2.15.0.post1 -tensorflow-estimator==2.15.0 -tensorflow-io-gcs-filesystem==0.35.0 -termcolor==2.4.0 -threadpoolctl==3.2.0 -tokenizers==0.15.1 -tomli==2.0.1 -tomlkit==0.12.3 -torch==2.2.0 -torchvision==0.17.0 -tqdm==4.66.1 -transformers==4.37.2 -triton==2.2.0 -typing_extensions==4.9.0 -tzdata==2023.4 -urllib3==2.2.0 -Werkzeug==3.0.1 -wrapt==1.14.1 -zipp==3.17.0 diff --git a/debian/nntrainer-dev.install b/debian/nntrainer-dev.install index 4fd55b3774..11b41f990b 100644 --- a/debian/nntrainer-dev.install +++ b/debian/nntrainer-dev.install @@ -16,6 +16,7 @@ /usr/include/nntrainer/blas_interface.h /usr/include/nntrainer/var_grad.h /usr/include/nntrainer/weight.h +/usr/include/nntrainer/blas_avx.h # todo: update dataset headers /usr/include/nntrainer/databuffer.h /usr/include/nntrainer/databuffer_factory.h diff --git a/meson.build b/meson.build index d4aea330a4..7ae692e6d9 100644 --- a/meson.build +++ b/meson.build @@ -64,9 +64,19 @@ warning_c_flags = [ '-Wno-error=varargs' ] +arch = host_machine.cpu_family() + +if get_option('enable-avx') + extra_defines += '-DUSE_AVX=1' + if get_option('platform') == 'tizen' + add_project_arguments(['-mavx2'], language: ['c','cpp']) + else + add_project_arguments(['-march=native'], language: ['c','cpp']) + endif + message('-march=native added for AVX hardware acceleration.') +endif if get_option('enable-fp16') - arch = host_machine.cpu_family() if get_option('platform') == 'android' add_project_arguments('-mfp16-format=ieee', language: ['c', 'cpp']) extra_defines += '-DENABLE_FP16=1' @@ -105,11 +115,6 @@ if get_option('enable-fp16') if cc.version().version_compare('>=12.1.0') message ('Float16 for x86_64 enabled. 
Modern gcc-x64 generally supports float16 with _Float16.') extra_defines += '-DENABLE_FP16=1' - if get_option('enable-avx') - extra_defines += '-DUSE_AVX=1' - add_project_arguments(['-march=native'], language: ['c','cpp']) - message('-march=native added for AVX hardware acceleration.') - endif else warning ('Float16 for x86_64 enabled. However, software emulation is applied for fp16, making it slower and inconsistent. Use GCC 12+ for FP16 support. This build will probably fail unless you bring a compiler that supports fp16 for x64.') endif diff --git a/meson_options.txt b/meson_options.txt index de2578cb47..59accc1c1a 100644 --- a/meson_options.txt +++ b/meson_options.txt @@ -40,7 +40,7 @@ option('enable-fp16', type: 'boolean', value: false) option('enable-cublas', type: 'boolean', value: false) option('enable-openmp', type: 'boolean', value: true) option('enable-neon', type: 'boolean', value: false) -option('enable-avx', type: 'boolean', value: false) +option('enable-avx', type: 'boolean', value: true) option('enable-opencl', type: 'boolean', value: false) # ml-api dependency (to enable, install capi-inference from github.com/nnstreamer/api ) diff --git a/nnstreamer/tensor_trainer/tensor_trainer_nntrainer.cc b/nnstreamer/tensor_trainer/tensor_trainer_nntrainer.cc index 57d84f99d1..c18630efb9 100644 --- a/nnstreamer/tensor_trainer/tensor_trainer_nntrainer.cc +++ b/nnstreamer/tensor_trainer/tensor_trainer_nntrainer.cc @@ -555,7 +555,8 @@ void NNTrainer::NNTrainerImpl::trainModel() { ml_logd("pid[%d], tid[%d]", pid, tid); try { - model->setProperty({"epochs=" + std::to_string(num_epochs)}); + model->setProperty( + {"epochs=" + std::to_string(num_epochs), "save_path=" + model_save_path}); } catch (const std::exception &e) { ml_loge("Error %s, %s", typeid(e).name(), e.what()); return; @@ -574,14 +575,6 @@ void NNTrainer::NNTrainerImpl::trainModel() { return; } - try { - ml_logd("Save_model: %s", model_save_path.c_str()); - model->save(model_save_path, ml::train::ModelFormat::MODEL_FORMAT_BIN); - - } catch (const std::exception &e) { - ml_loge("Error %s, %s", typeid(e).name(), e.what()); - return; - } /* send event */ nnstreamer_trainer_notify_event(this->notifier, TRAINER_EVENT_TRAINING_COMPLETION, NULL); diff --git a/nntrainer/cl_context.cpp b/nntrainer/cl_context.cpp index 1ed31490be..be7345eed0 100644 --- a/nntrainer/cl_context.cpp +++ b/nntrainer/cl_context.cpp @@ -13,7 +13,7 @@ */ #include -#include +#include namespace nntrainer { @@ -23,8 +23,9 @@ std::once_flag global_cl_context_init_flag; static void add_default_object(ClContext &cc) { - cc.registerFactory(nntrainer::createLayer, - FullyConnectedLayer::type, ml::train::LayerType::LAYER_FC); + cc.registerFactory(nntrainer::createLayer, + FullyConnectedLayerCl::type, + ml::train::LayerType::LAYER_FC); } static void registerer(ClContext &cc) noexcept { diff --git a/nntrainer/graph/graph_core.cpp b/nntrainer/graph/graph_core.cpp index b624e066e4..3eafbb9261 100644 --- a/nntrainer/graph/graph_core.cpp +++ b/nntrainer/graph/graph_core.cpp @@ -35,6 +35,10 @@ GraphCore::getSortedNode(unsigned int ith) const { return Sorted.at(ith); } +const unsigned int GraphCore::getSortedNodeIdx(const std::string &name) const { + return sorted_node_map.at(name); +} + void GraphCore::makeAdjacencyList( std::vector>> &adj) { /** initialize the adj list */ @@ -93,6 +97,11 @@ void GraphCore::topologicalSort() { if (Sorted.size() != node_list.size()) throw std::runtime_error("Internal error in topologicalSort"); + unsigned int idx = 0; + for (auto n : Sorted) { + 
sorted_node_map[n->getName()] = idx; + idx++; + } } const std::shared_ptr & diff --git a/nntrainer/graph/graph_core.h b/nntrainer/graph/graph_core.h index 83d3ce7c39..77aa63666a 100644 --- a/nntrainer/graph/graph_core.h +++ b/nntrainer/graph/graph_core.h @@ -91,6 +91,13 @@ class GraphCore { */ const std::shared_ptr &getSortedNode(unsigned int ith) const; + /** + * @brief getter of Sorted GraphNode index with name + * @param[in] layer name + * @ret index + */ + const unsigned int getSortedNodeIdx(const std::string &name) const; + /** * @brief getter of GraphNode with node name * @param[in] node name @@ -252,6 +259,7 @@ class GraphCore { std::vector> node_list; /**< Unordered Node List */ std::unordered_map node_map; /**< Unordered Node map */ + std::unordered_map sorted_node_map; /**< Unordered Node map */ std::vector> Sorted; /**< Ordered Node List */ bool sorted; /** if the node_list is sorted */ diff --git a/nntrainer/graph/network_graph.cpp b/nntrainer/graph/network_graph.cpp index 2d4cfdc769..ec69ebd69f 100644 --- a/nntrainer/graph/network_graph.cpp +++ b/nntrainer/graph/network_graph.cpp @@ -337,7 +337,7 @@ void NetworkGraph::applyGradients( continue; } - if (rc.isGradientClipByGlobalNorm(i)) { + if (rc.isGradientClipByGlobalNorm(i) || rc.isMixedPrecision(i)) { /** * @note the weights whose gradient are to be clipped by global norm will * be clipped at once at the end of iteration and applied then. @@ -393,56 +393,113 @@ sharedConstTensors NetworkGraph::incremental_forwarding( return out; } -void NetworkGraph::backwarding( +bool NetworkGraph::backwarding( int iteration, - std::function, int)> &backwarding_op, - std::function &apply_grad_clip_op, - std::function stop_cb, void *userdata) const { + std::function, bool)> &forwarding_op, + std::function, int)> &backwarding_op, + std::function &lazy_apply_grad_op, + std::function stop_cb, void *userdata) { /** * last layer backwarding is run out of this loop */ auto iter_begin = getBackwardingBeginIter(); auto iter_end = getBackwardingEndIter(); + bool is_valid = true; /// there is no layer to train, so backwarding is essentially noop if (iter_begin == iter_end) { - return; + return true; } auto const &lptr_begin = (*iter_begin); + // graph_const_reverse_iterator + auto iter_ = iter_begin; if (lptr_begin->requireLabel() == false) throw std::runtime_error( "Error: last layer does not accept label, we can't train"); - for (auto iter = iter_begin; iter != iter_end && !stop_cb(userdata); iter++) { - auto &ln = *iter; + for (iter_ = iter_begin; iter_ != iter_end && !stop_cb(userdata); iter_++) { + auto &ln = *iter_; PROFILE_TIME_START(profile_keys.at(ln->getType())); - backwarding_op(ln, iteration); + is_valid = backwarding_op(ln, iteration); PROFILE_TIME_END(profile_keys.at(ln->getType())); + + if (!is_valid) { + std::cout << ln->getName() << " : Gradient has NaN --> " + << ln->getRunContext().getLossScale() << std::endl; + break; + } } - /** perform clipping of the gradients by global norm if any */ - if (clip_weights.empty()) - return; + if (!is_valid) { + /** if has NaN + * 1. reset the loss scale. : @todo Backoff_factor : default --> 0.5 + * 2. run forwarding from cur_iter to cend() && !stop_cb(userdata); + * 3. return false --> run backwarding again; + */ + float scale = (*iter_)->getRunContext().getLossScale(); + + NNTR_THROW_IF(scale == 1.0f, std::invalid_argument) + << "Loss Scale Factor is 1.0f"; + + float s = scale > 1.5f ? 
scale * 0.5f : 1.0f; - /** calculate the global norm */ - Tensor global_norm_t( - TensorDim({1u, 1u, 1u, (unsigned int)clip_weights.size()})); - float *global_norm_data = global_norm_t.getData(); - for (unsigned int idx = 0; idx < clip_weights.size(); idx++) { - auto const &w = clip_weights[idx]; - global_norm_data[idx] = w->getGradientNorm(); + resetLossScale(s); + + auto f_iter = cbegin() + graph.getSortedNodeIdx((*iter_)->getName()); + + for (auto iter = f_iter; iter != cend() && !stop_cb(userdata); iter++) { + auto &ln = *iter; + ln->needsOutputSetZero(true); + } + + for (auto iter = f_iter; iter != cend() && !stop_cb(userdata); iter++) { + auto &ln = *iter; + PROFILE_TIME_START(profile_keys.at(ln->getType())); + forwarding_op(*iter, true); + PROFILE_TIME_END(profile_keys.at(ln->getType())); + } + + return false; } - float global_norm = global_norm_t.l2norm(); - /** apply the gradient with the above global norm */ - for (auto w : clip_weights) { - w->clipGradientByGlobalNorm(global_norm); + + /** perform clipping of the gradients by global norm if any */ + if (lazy_weights.empty()) + return true; + + if (is_clip_grad) { + /** calculate the global norm */ + Tensor global_norm_t( + TensorDim({1u, 1u, 1u, (unsigned int)lazy_weights.size()})); + float *global_norm_data = global_norm_t.getData(); + for (unsigned int idx = 0; idx < lazy_weights.size(); idx++) { + auto const &w = lazy_weights[idx]; + global_norm_data[idx] = w->getGradientNorm(); + } + float global_norm = global_norm_t.l2norm(); + /** apply the gradient with the above global norm */ + for (auto w : lazy_weights) { + w->clipGradientByGlobalNorm(global_norm); + } } /** apply the gradient with the above global norm */ - for (auto w : clip_weights) { - apply_grad_clip_op(*w, iteration); + for (auto w : lazy_weights) { + lazy_apply_grad_op(*w, iteration); } + nan_count++; + + /** @todo : handle as property : growth_interval : default --> 2000 */ + + if (nan_count > 2000) { + float scale = (*iter_)->getRunContext().getLossScale(); + /** @todo growth_factor : default --> 2.0 */ + float s = scale * 2.0f; + resetLossScale(s); + nan_count = 0; + } + + return true; } LayerNode *NetworkGraph::computeBackwardEnd() { @@ -580,8 +637,15 @@ void NetworkGraph::addLayer(std::shared_ptr layer) { InPlace NetworkGraph::canExecuteInPlace(const std::shared_ptr &lnode) { - if (!lnode->supportInPlace()) + + if (!lnode->supportInPlace()) { return InPlace::NONE; + } + + if (lnode->getType() == InputLayer::type && + !istrequal(getTensorType()[2], "FP32")) { + return InPlace::NONE; + } /** layers which behave as a no-op - flatten */ auto no_op = [](const std::shared_ptr &lnode) { @@ -768,9 +832,10 @@ NetworkGraph::finalizeContext(const std::shared_ptr &lnode, * node is going to be used with in-place optimizations. 
*/ auto out_specs = init_context.getOutSpecs(); + /// @note try move inplace control to finalize bool shared_var = false, shared_grad = false; - if (lnode->executeInPlace() != InPlace::NONE) { + if (lnode->executeInPlace() != InPlace::NONE && lnode->supportInPlace()) { setInplaceSharedMemoryConfigByLayer(lnode, shared_var, shared_grad); for (unsigned int i = 0; i < out_specs.size(); ++i) { auto &s = out_specs.at(i); @@ -879,7 +944,8 @@ NetworkGraph::finalizeContext(const std::shared_ptr &lnode, lnode->getTrainable(), shared_weight_names), inputs, outputs, tensor_manager->requestTensors(gnode, init_context.getTensorsSpec(), - lnode->getTrainable(), shared_tensor_names)); + lnode->getTrainable(), shared_tensor_names), + init_context.getLossScale()); return outputs; } @@ -1027,7 +1093,8 @@ NetworkGraph::refinalizeContext(const std::shared_ptr &lnode, // TODO: update weights spec for trainable based on layer trainable prop weights, inputs, outputs, tensor_manager->requestTensors(gnode, init_context.getTensorsSpec(), - lnode->getTrainable(), shared_tensor_names)); + lnode->getTrainable(), shared_tensor_names), + init_context.getLossScale()); return outputs; } @@ -1197,7 +1264,7 @@ int NetworkGraph::initialize(ExecutionMode mode, */ if (tensor_manager->isLastAccess(rc.getWeightGrad(i).getName(), last_grad_access) || - (rc.isGradientClipByGlobalNorm(i) && + ((rc.isGradientClipByGlobalNorm(i) || rc.isMixedPrecision(i)) && tensor_manager->isSecondLastAccess(rc.getWeightGrad(i).getName(), last_grad_access))) { rc.getWeightObject(i).setAsGradientLastAccess(); @@ -1287,11 +1354,19 @@ int NetworkGraph::initialize(ExecutionMode mode, /** select weights which would require clipping of the gradients by global * norm if any */ - clip_weights = tensor_manager->getWeights([](const Weight *w) { + lazy_weights = tensor_manager->getWeights([](const Weight *w) { return w->hasGradient() && w->isGradientLastAccess() && - w->isGradientClipByGlobalNorm(); + (w->isGradientClipByGlobalNorm() || w->isMixedPrecision()); }); + is_clip_grad = false; + for (auto w : lazy_weights) { + if (w->isGradientClipByGlobalNorm()) { + is_clip_grad = true; + break; + } + } + return ML_ERROR_NONE; } @@ -1556,10 +1631,18 @@ void NetworkGraph::requestOptimizerVariable( const TensorDim &dim = w->getDim(); std::vector dims = cb(dim); w->setOptimizerVariables(tensor_manager->requestWeightOptimizerVariables( - dims, w->getName(), TensorLifespan::MAX_LIFESPAN, - w->isGradientClipByGlobalNorm(), Tensor::Initializer::ZEROS)); + dims, w->getName(), ":opt", TensorLifespan::MAX_LIFESPAN, + w->isGradientClipByGlobalNorm(), w->isMixedPrecision(), + Tensor::Initializer::ZEROS)); } } } +void NetworkGraph::resetLossScale(float scale) { + for (auto iter = cbegin(); iter != cend(); iter++) { + auto &ln = *iter; + ln->getRunContext().setLossScale(scale); + } +} + } /* namespace nntrainer */ diff --git a/nntrainer/graph/network_graph.h b/nntrainer/graph/network_graph.h index 5c9adf0363..22f14e1b73 100644 --- a/nntrainer/graph/network_graph.h +++ b/nntrainer/graph/network_graph.h @@ -51,7 +51,9 @@ class NetworkGraph { optimize_memory(true), exec_mode(ExecutionMode::TRAIN), tensor_format("NCHW"), - tensor_dtype(split("FP32-FP32", getRegex("\\-"))) {} + tensor_dtype(split("FP32-FP32", getRegex("\\-"))) { + nan_count = 0; + } /** * @brief Constructor of NeuralNetwork Graph Class @@ -73,7 +75,9 @@ class NetworkGraph { optimize_memory(true), exec_mode(ExecutionMode::TRAIN), tensor_format(tensor_format_), - tensor_dtype(split(tensor_dtype_, getRegex("\\-"))) {} + 
tensor_dtype(split(tensor_dtype_, getRegex("\\-"))) { + nan_count = 0; + } /** * @brief Destructor of the NeuralNetwork Graph class @@ -206,13 +210,14 @@ class NetworkGraph { * @param[in] backwarding_op operation for the backwarding * @param[in] apply_grad_clip_op operation for applying the clip gradients */ - void backwarding( + bool backwarding( int iteration, - std::function, int)> &backwarding_op, - std::function &apply_grad_clip_op, + std::function, bool)> &forwarding_op, + std::function, int)> &backwarding_op, + std::function &lazy_apply_grad_op, std::function stop_cb = [](void *user_data) { return false; }, - void *user_data = nullptr) const; + void *user_data = nullptr); /** * @brief get begin iterator for the graph @@ -444,6 +449,12 @@ class NetworkGraph { getLayerExecutionOrders(const std::shared_ptr &lnode); #endif // ENABLE_TEST + /** + * @brief reset the loss scale + * @param[in] scale + */ + void resetLossScale(float scale); + private: std::map sub_in_out; /** This is map to identify input and output layer name of subgraph */ @@ -480,7 +491,10 @@ class NetworkGraph { std::unordered_map profile_keys; /**< profile keys based on the layer type */ std::vector - clip_weights; /**< weights with global norm based clipping enabled */ + lazy_weights; /**< weights with global norm based clipping enabled */ + bool is_clip_grad; + + unsigned int nan_count; /** * @brief topological sort diff --git a/nntrainer/layers/bn_layer.cpp b/nntrainer/layers/bn_layer.cpp index 1723ac677f..e978b1ef59 100644 --- a/nntrainer/layers/bn_layer.cpp +++ b/nntrainer/layers/bn_layer.cpp @@ -111,6 +111,12 @@ void BatchNormalizationLayer::finalize(InitLayerContext &context) { context.requestWeight(dim, bnparams_beta, WeightRegularizer::NONE, 1.0f, bias_decay, "beta", true); + /** + * @note declare weigth dimention with activation datatype + */ + TensorDim w_dim = dim; + w_dim.setDataType(in_dim.getDataType()); + /** * caches the deviation -> input - avg(input) * @todo check if avoiding this storage and adding dependency on input (no @@ -121,7 +127,7 @@ void BatchNormalizationLayer::finalize(InitLayerContext &context) { TensorLifespan::ITERATION_LIFESPAN); /** caches the inverse standard deviation */ wt_idx[BNParams::invstd] = - context.requestTensor(dim, "invstd", Tensor::Initializer::NONE, false, + context.requestTensor(w_dim, "invstd", Tensor::Initializer::NONE, false, TensorLifespan::ITERATION_LIFESPAN); /** * Temporary tensor to store the full sized tensors in order to allow batch @@ -136,13 +142,13 @@ void BatchNormalizationLayer::finalize(InitLayerContext &context) { * caches variance + epsilon as well. */ wt_idx[BNParams::cvar] = - context.requestTensor(dim, "cvar", Tensor::Initializer::NONE, false, + context.requestTensor(w_dim, "cvar", Tensor::Initializer::NONE, false, TensorLifespan::ITERATION_LIFESPAN); /** * Temporary tensor to store the reduced tensors along the axes_to_reduce. 
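Editor's note: for reference, the global-norm clipping performed for the lazy_weights / is_clip_grad path introduced above can be summarized in a standalone sketch. Plain vectors stand in for Tensor/Weight, and the threshold handling follows the usual clip-by-global-norm rule rather than the internals of clipGradientByGlobalNorm:

// Editor's sketch, not the Weight API: compute the L2 norm over all gradients
// and rescale every gradient uniformly when it exceeds the allowed norm.
#include <cmath>
#include <vector>

inline void clipByGlobalNorm(std::vector<std::vector<float>> &grads,
                             float max_norm) {
  float sq_sum = 0.0f;
  for (const auto &g : grads)
    for (float v : g)
      sq_sum += v * v;
  const float global_norm = std::sqrt(sq_sum);
  if (global_norm <= max_norm || global_norm == 0.0f)
    return;
  const float factor = max_norm / global_norm; // one factor for all gradients
  for (auto &g : grads)
    for (float &v : g)
      v *= factor;
}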
*/ wt_idx[BNParams::t_reduced] = - context.requestTensor(dim, "tensor_reduced", Tensor::Initializer::NONE, + context.requestTensor(w_dim, "tensor_reduced", Tensor::Initializer::NONE, false, TensorLifespan::FORWARD_DERIV_LIFESPAN); } diff --git a/nntrainer/layers/cl_layers/blas_kernels.cpp b/nntrainer/layers/cl_layers/blas_kernels.cpp new file mode 100644 index 0000000000..c190688c66 --- /dev/null +++ b/nntrainer/layers/cl_layers/blas_kernels.cpp @@ -0,0 +1,301 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Debadri Samaddar + * + * @file blas_kernels.cpp + * @date 14 May 2024 + * @brief Common blas OpenCL kernels + * @see https://github.com/nnstreamer/nntrainer + * @author Debadri Samaddar + * @bug No known bugs except for NYI items + * + */ + +#include + +namespace nntrainer { + +std::string sgemv_cl_kernel_ = + R"(__kernel void sgemv_cl(const __global float* A, const __global float* X, + __global float* Y, unsigned int M, unsigned int N) { + unsigned int i; + i = get_global_id(0); + float y0 = 0.0f; + for (unsigned int j = 0; j < M; j++) + y0 += A[i + j * N] * X[j]; + Y[i] = y0; + + })"; + +std::string dot_cl_kernel_ = + R"(__kernel void dot_cl(const __global float* A, const __global float* X, unsigned int K, __global float* res) { + *res = 0; + for (unsigned int i = 0; i < K; i++){ + *res += A[i] * X[i]; + } + })"; + +std::string sgemm_cl_kernel_ = + R"(__kernel void sgemm_cl(const __global float* A, const __global float* B, + __global float* C, unsigned int K, unsigned int lda, unsigned int ldb, unsigned int ldc) { + + unsigned int m = get_global_id(0); + unsigned int n = get_global_id(1); + float c = 0.0f; + for (unsigned int k = 0; k < K; ++k) { + float a, b; + a = A[m * lda + k]; + b = B[k * ldb + n]; + c += a * b; + } + C[m * ldc + n] = c; + })"; + +/** + * @brief declaring global kernel objects + */ +opencl::Kernel kernel_sgemv; +opencl::Kernel kernel_sgemm; +opencl::Kernel kernel_dot; + +void sgemv_cl(const float *matAdata, const float *vecXdata, float *vecYdata, + unsigned int dim1, unsigned int dim2, unsigned int lda, + RunLayerContext &context) { + + bool result = false; + + do { + result = context.clCreateKernel(sgemv_cl_kernel_, + context.LayerKernel::SGEMV, kernel_sgemv); + if (!result) { + break; + } + + size_t dim1_size = sizeof(float) * dim1; + size_t dim2_size = sizeof(float) * dim2; + opencl::Buffer inputA(context.context_inst_, dim1 * dim2 * sizeof(float), + true, nullptr); + + opencl::Buffer inputX(context.context_inst_, dim1_size, true, nullptr); + + opencl::Buffer inOutY(context.context_inst_, dim2_size, true, nullptr); + + result = inputA.WriteData(context.command_queue_inst_, matAdata); + if (!result) { + break; + } + + result = inputX.WriteData(context.command_queue_inst_, vecXdata); + if (!result) { + break; + } + + result = inOutY.WriteData(context.command_queue_inst_, vecYdata); + if (!result) { + break; + } + + result = kernel_sgemv.SetKernelArguments(0, &inputA, sizeof(cl_mem)); + if (!result) { + break; + } + + result = kernel_sgemv.SetKernelArguments(1, &inputX, sizeof(cl_mem)); + if (!result) { + break; + } + + result = kernel_sgemv.SetKernelArguments(2, &inOutY, sizeof(cl_mem)); + if (!result) { + break; + } + + result = kernel_sgemv.SetKernelArguments(3, &dim1, sizeof(int)); + if (!result) { + break; + } + + result = kernel_sgemv.SetKernelArguments(4, &dim2, sizeof(int)); + if (!result) { + break; + } + + const int work_groups_count[3] = {(int)dim2, 1, 1}; + const int work_group_size[3] = {32, 32, 1}; // test-value + + result = 
context.command_queue_inst_.DispatchCommand( + kernel_sgemv, work_groups_count, work_group_size); + if (!result) { + break; + } + + result = inOutY.ReadData(context.command_queue_inst_, vecYdata); + if (!result) { + break; + } + + } while (false); +} + +float dot_cl(const float *vecAdata, const float *vecXdata, unsigned int dim1, + RunLayerContext &context) { + + bool result = false; + + float cl_ret = 0; + + do { + result = context.clCreateKernel(dot_cl_kernel_, context.LayerKernel::DOT, + kernel_dot); + if (!result) { + break; + } + + size_t dim1_size = sizeof(float) * dim1; + + opencl::Buffer inputA(context.context_inst_, dim1_size, true, nullptr); + + opencl::Buffer inputX(context.context_inst_, dim1_size, true, nullptr); + + opencl::Buffer dotResult(context.context_inst_, sizeof(float), true, + &cl_ret); + + result = inputA.WriteData(context.command_queue_inst_, vecAdata); + if (!result) { + break; + } + + result = inputX.WriteData(context.command_queue_inst_, vecXdata); + if (!result) { + break; + } + + result = kernel_dot.SetKernelArguments(0, &inputA, sizeof(cl_mem)); + if (!result) { + break; + } + + result = kernel_dot.SetKernelArguments(1, &inputX, sizeof(cl_mem)); + if (!result) { + break; + } + + result = kernel_dot.SetKernelArguments(2, &dim1, sizeof(int)); + if (!result) { + break; + } + + result = kernel_dot.SetKernelArguments(3, &dotResult, sizeof(cl_mem)); + if (!result) { + break; + } + + const int work_groups_count[3] = {(int)dim1, 1, 1}; + const int work_group_size[3] = {32, 32, 1}; // test-value + + result = context.command_queue_inst_.DispatchCommand( + kernel_dot, work_groups_count, work_group_size); + if (!result) { + break; + } + + result = dotResult.ReadData(context.command_queue_inst_, &cl_ret); + if (!result) { + break; + } + + } while (false); + + return cl_ret; +} + +void sgemm_cl(const float *A, const float *B, float *C, unsigned int M, + unsigned int N, unsigned int K, unsigned int lda, + unsigned int ldb, unsigned int ldc, RunLayerContext &context) { + + bool result = false; + + do { + result = context.clCreateKernel(sgemm_cl_kernel_, + context.LayerKernel::SGEMM, kernel_sgemm); + if (!result) { + break; + } + + size_t m_k_size = M * K * sizeof(float); + size_t k_n_size = K * N * sizeof(float); + size_t m_n_size = M * N * sizeof(float); + + opencl::Buffer inputA(context.context_inst_, m_k_size, true, nullptr); + + opencl::Buffer inputB(context.context_inst_, k_n_size, true, nullptr); + + opencl::Buffer inOutC(context.context_inst_, m_n_size, true, nullptr); + + result = inputA.WriteData(context.command_queue_inst_, A); + if (!result) { + break; + } + + result = inputB.WriteData(context.command_queue_inst_, B); + if (!result) { + break; + } + + result = inOutC.WriteData(context.command_queue_inst_, C); + if (!result) { + break; + } + + result = kernel_sgemm.SetKernelArguments(0, &inputA, sizeof(cl_mem)); + if (!result) { + break; + } + + result = kernel_sgemm.SetKernelArguments(1, &inputB, sizeof(cl_mem)); + if (!result) { + break; + } + + result = kernel_sgemm.SetKernelArguments(2, &inOutC, sizeof(cl_mem)); + if (!result) { + break; + } + + result = kernel_sgemm.SetKernelArguments(3, &K, sizeof(int)); + if (!result) { + break; + } + + result = kernel_sgemm.SetKernelArguments(4, &lda, sizeof(int)); + if (!result) { + break; + } + + result = kernel_sgemm.SetKernelArguments(5, &ldb, sizeof(int)); + if (!result) { + break; + } + + result = kernel_sgemm.SetKernelArguments(6, &ldc, sizeof(int)); + if (!result) { + break; + } + + const int work_groups_count[3] = 
{(int)M, (int)N, 1}; + const int work_group_size[3] = {32, 32, 1}; // test-value + + result = context.command_queue_inst_.DispatchCommand( + kernel_sgemm, work_groups_count, work_group_size); + if (!result) { + break; + } + + result = inOutC.ReadData(context.command_queue_inst_, C); + if (!result) { + break; + } + + } while (false); +} +} // namespace nntrainer diff --git a/nntrainer/layers/cl_layers/blas_kernels.h b/nntrainer/layers/cl_layers/blas_kernels.h new file mode 100644 index 0000000000..ad59b8bbd1 --- /dev/null +++ b/nntrainer/layers/cl_layers/blas_kernels.h @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Debadri Samaddar + * + * @file blas_kernels.h + * @date 14 May 2024 + * @brief Common blas OpenCL kernels + * @see https://github.com/nnstreamer/nntrainer + * @author Debadri Samaddar + * @bug No known bugs except for NYI items + * + */ + +#ifndef __BLAS_KERNELS_H__ +#define __BLAS_KERNELS_H__ + +#include +#include +#include +#include + +namespace nntrainer { + +/** + * @brief declaring global kernel objects + */ +extern opencl::Kernel kernel_sgemv; +extern opencl::Kernel kernel_sgemm; +extern opencl::Kernel kernel_dot; + +/** + * @brief sgemv computation : Y = A*X + Y + * @param[in] matAdata float * for Matrix A + * @param[in] vecXdata float * for Vector X + * @param[in] vecYdata float * for Vector Y + * @param[in] dim1 number of A's columns + * @param[in] dim2 number of A's rows + * @param[in] lda number of X's columns + * @param[in] context RunLayerContext reference + */ +void sgemv_cl(const float *matAdata, const float *vecXdata, float *vecYdata, + unsigned int dim1, unsigned int dim2, unsigned int lda, + RunLayerContext &context); + +/** + * @brief dot computation : sum of all X * Y + * @param[in] vecAdata float * for Vector A + * @param[in] vecXdata float * for Vector X + * @param[in] dim1 number of elements in both input vectors + * @param[in] context RunLayerContext reference + */ +float dot_cl(const float *vecAdata, const float *vecXdata, unsigned int dim1, + RunLayerContext &context); + +/** + * @brief sgemm computation : Y = op(A)*op(B) + C, + * where op(X) is one of X or X**T + * @param[in] A float * for Matrix A + * @param[in] B float * for Matrix B + * @param[in] C float * for Matrix C + * @param[in] M number of op(A)'s and C's row + * @param[in] N number of op(B)'s and C's columns + * @param[in] K number of op(A)'s and columns and op(B)'s rows + * @param[in] lda number of A's columns + * @param[in] ldb number of B's columns + * @param[in] ldc number of C's columns + * @param[in] context RunLayerContext reference + */ +void sgemm_cl(const float *A, const float *B, float *C, unsigned int M, + unsigned int N, unsigned int K, unsigned int lda, + unsigned int ldb, unsigned int ldc, RunLayerContext &context); + +} // namespace nntrainer +#endif /* __BLAS_KERNELS_H__ */ diff --git a/nntrainer/layers/cl_layers/fc_layer_cl.cpp b/nntrainer/layers/cl_layers/fc_layer_cl.cpp new file mode 100644 index 0000000000..b0a41c4e5f --- /dev/null +++ b/nntrainer/layers/cl_layers/fc_layer_cl.cpp @@ -0,0 +1,288 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Debadri Samaddar + * + * @file fc_layer_cl.cpp + * @date 7 May 2024 + * @brief This is Fully Connected Layer Class for Neural Network with OpenCl + * implementation + * @see https://github.com/nnstreamer/nntrainer + * @author Debadri Samaddar + * @bug No known bugs except for NYI items + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include 
+#include + +namespace nntrainer { + +static constexpr size_t SINGLE_INOUT_IDX = 0; + +enum FCParams { weight, bias }; + +FullyConnectedLayerCl::FullyConnectedLayerCl() : + LayerImpl(), fc_props(props::Unit()) { + weight_idx.fill(std::numeric_limits::max()); +} + +void FullyConnectedLayerCl::finalize(InitLayerContext &context) { + auto &weight_regularizer = + std::get(*layer_impl_props); + auto &weight_regularizer_constant = + std::get(*layer_impl_props); + auto &weight_initializer = + std::get(*layer_impl_props); + auto &weight_decay = std::get(*layer_impl_props); + auto &bias_decay = std::get(*layer_impl_props); + auto &bias_initializer = std::get(*layer_impl_props); + auto &disable_bias = std::get(*layer_impl_props); + + auto unit = std::get(fc_props).get(); + + NNTR_THROW_IF(context.getNumInputs() != 1, std::invalid_argument) + << "Fully connected layer takes only one input"; + + std::vector output_dims(1); + + /// @todo fc actaully supports multidimensions. EffDimFlag shouldn't be fixed + /// like this. + context.setEffDimFlagInputDimension(0, 0b1001); + context.setDynDimFlagInputDimension(0, 0b1000); + + bool is_nchw = (context.getFormat() == Tformat::NCHW); + /** set output dimensions */ + auto const &in_dim = context.getInputDimensions()[0]; + output_dims[0] = in_dim; + is_nchw ? output_dims[0].width(unit) : output_dims[0].channel(unit); + + output_dims[0].setTensorType( + {context.getFormat(), context.getActivationDataType()}); + + context.setOutputDimensions(output_dims); + + /** set weight specifications */ + // @todo : This NCHW format setting is just temporal, it needs to be set by + // global configuration + TensorDim bias_dim( + 1, is_nchw ? 1 : unit, 1, is_nchw ? unit : 1, + TensorDim::TensorType(context.getFormat(), context.getWeightDataType()), + is_nchw ? 0b0001 : 0b0100); + + TensorDim weight_dim( + 1, is_nchw ? 1 : unit, is_nchw ? in_dim.width() : 1, + is_nchw ? unit : in_dim.channel(), + TensorDim::TensorType(context.getFormat(), context.getWeightDataType()), + is_nchw ? 
0b0011 : 0b0101); + + weight_idx[FCParams::weight] = context.requestWeight( + weight_dim, weight_initializer, weight_regularizer, + weight_regularizer_constant, weight_decay, "weight", true); + + if (disable_bias.empty() || disable_bias.get() == false) { + weight_idx[FCParams::bias] = + context.requestWeight(bias_dim, bias_initializer, WeightRegularizer::NONE, + 1.0f, bias_decay, "bias", true); + } +} + +void FullyConnectedLayerCl::exportTo( + Exporter &exporter, const ml::train::ExportMethods &method) const { + LayerImpl::exportTo(exporter, method); + exporter.saveResult(fc_props, method, this); +} + +void FullyConnectedLayerCl::setProperty( + const std::vector &values) { + auto remain_props = loadProperties(values, fc_props); + LayerImpl::setProperty(remain_props); +} + +void FullyConnectedLayerCl::forwarding(RunLayerContext &context, + bool training) { + + Tensor &weight = context.getWeight(weight_idx[FCParams::weight]); + Tensor &hidden_ = context.getOutput(SINGLE_INOUT_IDX); + Tensor &input_ = context.getInput(SINGLE_INOUT_IDX); + + if (weight.getDataType() == nntrainer::Tdatatype::QINT4 || + weight.getDataType() == nntrainer::Tdatatype::QINT8) { + Tdatatype dtype = input_.getDataType(); + + Tensor weight_( + {{weight.batch(), weight.channel(), weight.height(), weight.width()}, + {weight.getFormat(), dtype}}, + true); + + unsigned int axis = + context.getWeightObject(weight_idx[FCParams::weight]).getOutputAxis(); + + weight.dequantize(weight_, axis); + + fcDotProcess(input_, weight_, hidden_, context); + } else { + fcDotProcess(input_, weight, hidden_, context); + } + + if (auto &disable_bias = std::get(*layer_impl_props); + disable_bias.empty() || disable_bias.get() == false) { + Tensor &bias = context.getWeight(weight_idx[FCParams::bias]); + hidden_.add_i(bias); + } +} + +void FullyConnectedLayerCl::fcDotProcess(Tensor const &input, + Tensor const &weight, Tensor &result, + RunLayerContext &context) { + // to do: + // NNTR_THROW_IF(!contiguous, std::invalid_argument) + // << getName() << " is not contiguous. Cannot dot product."; + + unsigned int dim1, dim2, mdim1, mdim2; + if (input.getFormat() == Tformat::NHWC) { + dim1 = input.batch() * input.height() * input.width(); + dim2 = input.channel(); + mdim1 = weight.batch() * weight.height() * weight.width(); + mdim2 = weight.channel(); + } else { + dim1 = input.batch() * input.channel() * input.height(); + dim2 = input.width(); + mdim1 = weight.batch() * weight.channel() * weight.height(); + mdim2 = weight.width(); + } + + unsigned int M, N, K, lda, ldb, ldc; + if (dim2 != mdim1) + throw std::runtime_error("Error: incompatible dimensions for dot product"); + K = mdim1; /** == dim2 */ + N = mdim2; + M = dim1; + if (input.getFormat() == Tformat::NHWC) { + CREATE_IF_EMPTY_DIMS(result, input.batch(), N, input.height(), + input.width(), + input.getTensorType()); // NHWC Result Tensor + } else { + CREATE_IF_EMPTY_DIMS(result, input.batch(), input.channel(), input.height(), + N, input.getTensorType()); + } + + lda = dim2; + ldb = mdim2; + ldc = + (input.getFormat() == Tformat::NHWC) ? result.channel() : result.width(); + + if (input.getDataType() == ml::train::TensorDim::DataType::FP32) { + const float *data = input.getData(); + const float *mdata = weight.getData(); + float *rdata = result.getData(); + + /// shortcut handling in case of vector + /// for vector, (1 * K) == (K * 1) in current memory layout... + /// and plaese note that N, K, M is a fixed place holder after considering + /// transpose. 
+ /// For example, there is no case like (1 * K) X (1 * K) while + /// (1 * K) X (1 * M) can be a case + /// case1: (1 * K) X (K * 1) + if (M == 1 && N == 1) { + *rdata = dot_cl(data, mdata, K, context) + (*rdata); + } + /// case2: (M * K) X (K * 1) + else if (N == 1) { + sgemv_cl(data, mdata, rdata, dim1, dim2, lda, context); + } + /// case3: (1 * K) X (K * N) = 1 * N = R + /// = R^T = (K * N) ^T * (1 * K) ^T = (N * K) * (K * 1) = (N * K) * (1 * K) + /// Effectively a translation of sgemv + else if (M == 1) { + sgemv_cl(mdata, data, rdata, mdim1, mdim2, ldb, context); + } + /// case others: use gemm + else { + sgemm_cl(data, mdata, rdata, M, N, K, lda, ldb, ldc, context); + } + } else + throw std::invalid_argument("Error: OpenCL fp16 is not supported yet."); +} + +void FullyConnectedLayerCl::incremental_forwarding(RunLayerContext &context, + unsigned int from, + unsigned int to, + bool training) { + Tensor w; + Tensor &weight = w; + context.getWeight(weight, weight_idx[FCParams::weight]); + + Tensor &input_ = context.getInput(SINGLE_INOUT_IDX); + Tensor &hidden_ = context.getOutput(SINGLE_INOUT_IDX); + + TensorDim input_dim = input_.getDim(); + TensorDim hidden_dim = hidden_.getDim(); + + TensorDim input_step_dim = input_dim; + TensorDim hidden_step_dim = hidden_dim; + + if (from) { + NNTR_THROW_IF(to - from != 1, std::invalid_argument) + << "incremental step size is not 1"; + from = 0; + to = 1; + } + + input_step_dim.height(to - from); + hidden_step_dim.height(to - from); + + // @todo: set reset stride as false. This implementation only works when batch + // size is 1 + Tensor input_step = input_.getSharedDataTensor(input_step_dim, 0, true); + Tensor hidden_step = hidden_.getSharedDataTensor(hidden_step_dim, 0, true); + + fcDotProcess(input_step, weight, hidden_step, context); + + if (auto &disable_bias = std::get(*layer_impl_props); + disable_bias.empty() || disable_bias.get() == false) { + Tensor &bias = context.getWeight(weight_idx[FCParams::bias]); + hidden_step.add_i(bias); + } +} + +void FullyConnectedLayerCl::calcDerivative(RunLayerContext &context) { + Tensor &weight = context.getWeight(weight_idx[FCParams::weight]); + + const Tensor &derivative_ = context.getIncomingDerivative(SINGLE_INOUT_IDX); + Tensor &ret_ = context.getOutgoingDerivative(SINGLE_INOUT_IDX); + + ret_.dot_deriv_wrt_1(weight, derivative_, false, false); +} + +void FullyConnectedLayerCl::calcGradient(RunLayerContext &context) { + Tensor &djdw = context.getWeightGrad(weight_idx[FCParams::weight]); + + const Tensor &derivative_ = context.getIncomingDerivative(SINGLE_INOUT_IDX); + Tensor &input_ = context.getInput(SINGLE_INOUT_IDX); + + if (auto &disable_bias = std::get(*layer_impl_props); + disable_bias.empty() || disable_bias.get() == false) { + Tensor &djdb = context.getWeightGrad(weight_idx[FCParams::bias]); + + if (context.isGradientFirstAccess(weight_idx[FCParams::bias])) { + derivative_.sum({0, 1, 2}, djdb); + } else { + /// @todo optimize below by adding beta to Tensor::sum + Tensor t = derivative_.sum({0, 1, 2}); + djdb.add_i(t); + } + } + + input_.dot_deriv_wrt_2( + djdw, derivative_, false, false, + !context.isGradientFirstAccess(weight_idx[FCParams::weight])); +} + +} /* namespace nntrainer */ diff --git a/nntrainer/layers/cl_layers/fc_layer_cl.h b/nntrainer/layers/cl_layers/fc_layer_cl.h new file mode 100644 index 0000000000..c94ecb22d7 --- /dev/null +++ b/nntrainer/layers/cl_layers/fc_layer_cl.h @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Debadri Samaddar + * 
+ * @file fc_layer_cl.h + * @date 7 May 2024 + * @brief This is Fully Connected Layer Class of Neural Network with OpenCl + * implementation + * @see https://github.com/nnstreamer/nntrainer + * @author Debadri Samaddar + * @bug No known bugs except for NYI items + * + */ + +#ifndef __FC_LAYER_CL_H__ +#define __FC_LAYER_CL_H__ +#ifdef __cplusplus + +#include +#include + +#define CREATE_IF_EMPTY_DIMS(tensor, ...) \ + do { \ + if (tensor.empty()) \ + tensor = Tensor(__VA_ARGS__); \ + } while (0); + +namespace nntrainer { + +/** + * @class FullyConnecedLayer + * @brief fully connected layer + */ +class FullyConnectedLayerCl : public LayerImpl { +public: + /** + * @brief Constructor of Fully Connected Layer + */ + FullyConnectedLayerCl(); + + /** + * @brief Destructor of Fully Connected Layer + */ + ~FullyConnectedLayerCl() = default; + + /** + * @brief Move constructor. + * @param[in] FullyConnected && + */ + FullyConnectedLayerCl(FullyConnectedLayerCl &&rhs) noexcept = default; + + /** + * @brief Move assignment operator. + * @parma[in] rhs FullyConnectedLayer to be moved. + */ + FullyConnectedLayerCl &operator=(FullyConnectedLayerCl &&rhs) = default; + + /** + * @copydoc Layer::finalize(InitLayerContext &context) + */ + void finalize(InitLayerContext &context) override; + + /** + * @copydoc Layer::forwarding(RunLayerContext &context, bool training) + */ + void forwarding(RunLayerContext &context, bool training) override; + + /** + * @copydoc Layer::incremental_forwarding(RunLayerContext &context, unsigned + * int from, unsigned int to, bool training) + */ + void incremental_forwarding(RunLayerContext &context, unsigned int from, + unsigned int to, bool training) override; + + /** + * @copydoc Layer::calcDerivative(RunLayerContext &context) + */ + void calcDerivative(RunLayerContext &context) override; + + /** + * @copydoc Layer::calcGradient(RunLayerContext &context) + */ + void calcGradient(RunLayerContext &context) override; + + /** + * @copydoc Layer::exportTo(Exporter &exporter, ml::train::ExportMethods + * method) + */ + void exportTo(Exporter &exporter, + const ml::train::ExportMethods &method) const override; + + /** + * @copydoc Layer::getType() + */ + const std::string getType() const override { + return FullyConnectedLayerCl::type; + }; + + /** + * @brief Process data and dimensions for dot operation used in fc_layer + * @param[in] input Tensor + * @param[in] weight Tensor + * @param[in] result Tensor + * @param[in] RunLayerContext reference + */ + void fcDotProcess(Tensor const &input, Tensor const &weight, Tensor &result, + RunLayerContext &context); + + /** + * @copydoc Layer::supportBackwarding() + */ + bool supportBackwarding() const override { return true; } + + /** + * @copydoc Layer::setProperty(const PropertyType type, const std::string + * &value) + */ + void setProperty(const std::vector &values) override; + + inline static const std::string type = "fully_connected"; + +private: + std::tuple + fc_props; /**< fc layer properties : unit - number of output neurons */ + std::array weight_idx; /**< indices of the weights */ +}; +} // namespace nntrainer + +#endif /* __cplusplus */ +#endif /* __FC_LAYER_CL__ */ diff --git a/nntrainer/layers/cl_layers/meson.build b/nntrainer/layers/cl_layers/meson.build new file mode 100644 index 0000000000..2f1ba7fc03 --- /dev/null +++ b/nntrainer/layers/cl_layers/meson.build @@ -0,0 +1,8 @@ +cl_layer_sources = [ + 'fc_layer_cl.cpp', + 'blas_kernels.cpp' +] + +foreach s : cl_layer_sources + nntrainer_sources += meson.current_source_dir() / s 
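Editor's note: the case analysis in fcDotProcess() above reduces to a small dispatch rule over the operand shapes; a standalone sketch of it (the enum and helper below are illustrative, not part of the patch):

// Editor's sketch of how the OpenCL path picks a kernel: a pure dot product
// when both sides are vectors, a matrix-vector product when either side is a
// vector, and a general GEMM otherwise.
enum class BlasCall { Dot, Gemv, Gemm };

inline BlasCall pickBlasCall(unsigned int M, unsigned int N) {
  if (M == 1 && N == 1)  // (1 x K) . (K x 1): dot_cl
    return BlasCall::Dot;
  if (N == 1 || M == 1)  // one operand is a vector: sgemv_cl (possibly transposed)
    return BlasCall::Gemv;
  return BlasCall::Gemm; // general case: sgemm_cl
}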
+endforeach diff --git a/nntrainer/layers/conv2d_layer.cpp b/nntrainer/layers/conv2d_layer.cpp index c059ae9caf..5d9dbc1e19 100644 --- a/nntrainer/layers/conv2d_layer.cpp +++ b/nntrainer/layers/conv2d_layer.cpp @@ -38,7 +38,8 @@ namespace { static TensorDim calcCol2ImOutputDim(const TensorDim &out, const TensorDim &kdim) { - return TensorDim({kdim.getFeatureLen(), out.width() * out.height()}); + return TensorDim({kdim.getFeatureLen(), out.width() * out.height()}, + out.getTensorType()); } /** @@ -56,7 +57,10 @@ static void col2im(const Tensor &col_matrix, const TensorDim &kdim, const std::array &mstride, const std::array &dilation, Tensor &image) { - auto [pt, pb, pl, pr] = padding; + auto pt = padding[0]; + auto pb = padding[1]; + auto pl = padding[2]; + auto pr = padding[3]; unsigned k_height = kdim.height(); unsigned k_width = kdim.width(); @@ -84,32 +88,48 @@ static void col2im(const Tensor &col_matrix, const TensorDim &kdim, int h_stride_end = im_eff_height - eff_k_height - pt; int w_stride_end = im_eff_width - eff_k_width - pl; - unsigned col_w = 0; - for (int hs = -pt; hs <= h_stride_end; hs += hstride) { - for (int ws = -pl; ws <= w_stride_end; ws += wstride) { - unsigned col_h = 0; - int patch_height_end = hs + eff_k_height; - int patch_width_end = ws + eff_k_width; - for (unsigned c = 0; c < im_channel; c++) { - for (int h = hs; h < patch_height_end; h += hdilation) { - if (h < 0 || im_height <= h) { - col_h += k_width; - continue; - } - for (int w = ws; w < patch_width_end; w += wdilation) { - if (w < 0 || im_width <= w) { - col_h++; + auto apply_data = [&](T *val) { + unsigned col_w = 0; + for (int hs = -pt; hs <= h_stride_end; hs += hstride) { + for (int ws = -pl; ws <= w_stride_end; ws += wstride) { + unsigned col_h = 0; + int patch_height_end = hs + eff_k_height; + int patch_width_end = ws + eff_k_width; + for (unsigned c = 0; c < im_channel; c++) { + for (int h = hs; h < patch_height_end; h += hdilation) { + if (h < 0 || im_height <= h) { + col_h += k_width; continue; } - - float *val = image.getAddress(0, c, h, w); - *val += col_matrix.getValue(0, 0, col_h, col_w); - col_h++; + for (int w = ws; w < patch_width_end; w += wdilation) { + if (w < 0 || im_width <= w) { + col_h++; + continue; + } + + val = image.getAddress(0, c, h, w); + *val += col_matrix.getValue(0, 0, col_h, col_w); + col_h++; + } } } + col_w++; } - col_w++; } + }; + + if (image.getDataType() == nntrainer::Tdatatype::FP32) { + float val; + apply_data(&val); + } +#ifdef ENABLE_FP16 + else if (image.getDataType() == nntrainer::Tdatatype::FP16) { + _FP16 val; + apply_data(&val); + } +#endif + else { + throw std::runtime_error("Not supported datatype"); } } @@ -179,7 +199,10 @@ static void im2col(const Tensor &in, const TensorDim &kdim, // } */ - auto [pt, pb, pl, pr] = padding; + auto pt = padding[0]; + auto pb = padding[1]; + auto pl = padding[2]; + auto pr = padding[3]; unsigned int channel = in.channel(); int in_height = in.height(); @@ -198,46 +221,62 @@ static void im2col(const Tensor &in, const TensorDim &kdim, unsigned int out_width = (width - eff_k_width) / mstride[1] + 1; out.reshape( - TensorDim({out_height * out_width, in.channel() * k_height * k_width})); - float *out_data = out.getData(); - - int h_stride_end = height - eff_k_height - pt; - int w_stride_end = width - eff_k_width - pl; - - /// get a patch, size of kernel - /// hs is height_strided, ws is width_strided - unsigned int owidth = out.width(); - unsigned int base_im_w = 0; - for (int hs = -pt; hs <= h_stride_end; hs += mstride[0]) { - 
unsigned int base_im_h = 0; - int patch_height_end = eff_k_height + hs; - /// map the patch to a single line looping through channel - for (unsigned int c = 0; c < channel; ++c) { - for (int h = hs; h < patch_height_end; h += dilation[0]) { - if (h < 0 || in_height <= h) { - base_im_h += k_width; - continue; - } - - unsigned int im_w = base_im_w; - for (int ws = -pl; ws <= w_stride_end; ws += mstride[1]) { - unsigned int im_h = base_im_h; - int patch_width_end = eff_k_width + ws; + TensorDim({out_height * out_width, in.channel() * k_height * k_width}, + in.getTensorType())); + + auto apply_data = [&](T *out_data) { + int h_stride_end = height - eff_k_height - pt; + int w_stride_end = width - eff_k_width - pl; + + /// get a patch, size of kernel + /// hs is height_strided, ws is width_strided + unsigned int owidth = out.width(); + unsigned int base_im_w = 0; + for (int hs = -pt; hs <= h_stride_end; hs += mstride[0]) { + unsigned int base_im_h = 0; + int patch_height_end = eff_k_height + hs; + /// map the patch to a single line looping through channel + for (unsigned int c = 0; c < channel; ++c) { + for (int h = hs; h < patch_height_end; h += dilation[0]) { + if (h < 0 || in_height <= h) { + base_im_h += k_width; + continue; + } - for (int w = ws; w < patch_width_end; w += dilation[1]) { - if (w < 0 || in_width <= w) { + unsigned int im_w = base_im_w; + for (int ws = -pl; ws <= w_stride_end; ws += mstride[1]) { + unsigned int im_h = base_im_h; + int patch_width_end = eff_k_width + ws; + + for (int w = ws; w < patch_width_end; w += dilation[1]) { + if (w < 0 || in_width <= w) { + im_h++; + continue; + } + out_data[im_w * owidth + im_h] = in.getValue(0, c, h, w); im_h++; - continue; } - out_data[im_w * owidth + im_h] = in.getValue(0, c, h, w); - im_h++; + im_w++; } - im_w++; + base_im_h += k_width; } - base_im_h += k_width; } + base_im_w += out_width; } - base_im_w += out_width; + }; + + if (out.getDataType() == nntrainer::Tdatatype::FP32) { + float *out_data = out.getData(); + apply_data(out_data); + } +#ifdef ENABLE_FP16 + else if (out.getDataType() == nntrainer::Tdatatype::FP16) { + _FP16 *out_data = out.getData<_FP16>(); + apply_data(out_data); + } +#endif + else { + throw std::runtime_error("Not supported datatype"); } } @@ -279,9 +318,11 @@ void Conv2DLayer::finalize(InitLayerContext &context) { auto &dilation = std::get>(conv_props); - TensorDim kernel_dim = - TensorDim(filter_size, in_dim.channel(), kernel_size[0], kernel_size[1]); - TensorDim bias_dim = TensorDim(1, filter_size, 1, 1); + auto in_t_type = in_dim.getTensorType(); + in_t_type.data_type = context.getWeightDataType(); + TensorDim kernel_dim = TensorDim(filter_size, in_dim.channel(), + kernel_size[0], kernel_size[1], in_t_type); + TensorDim bias_dim = TensorDim(1, filter_size, 1, 1, in_t_type); padding = std::get(conv_props) .compute(in_dim, kernel_dim, {stride[0], stride[1]}, @@ -309,6 +350,7 @@ void Conv2DLayer::finalize(InitLayerContext &context) { out_dim.channel(filter_size); out_dim.height((eff_in_height - eff_k_height) / stride[0] + 1); out_dim.width((eff_in_width - eff_k_width) / stride[1] + 1); + out_dim.setTensorType(in_dim.getTensorType()); context.setOutputDimensions({out_dim}); NNTR_THROW_IF(eff_in_height < kernel_size[0] || eff_in_width < kernel_size[1], diff --git a/nntrainer/layers/fc_layer.cpp b/nntrainer/layers/fc_layer.cpp index de34f5f921..436a936439 100644 --- a/nntrainer/layers/fc_layer.cpp +++ b/nntrainer/layers/fc_layer.cpp @@ -40,8 +40,11 @@ enum FCParams { weight, bias }; enum LORAParams { loraA, 
loraB, loraTmp, loraOut }; FullyConnectedLayer::FullyConnectedLayer() : - LayerImpl(), fc_props(props::Unit(), props::LoraRank(), props::LoraAlpha()) { + LayerImpl(), + lora_scaling(1.0f), + fc_props(props::Unit(), props::LoraRank(), props::LoraAlpha()) { weight_idx.fill(std::numeric_limits::max()); + lora_idx.fill(std::numeric_limits::max()); } void FullyConnectedLayer::finalize(InitLayerContext &context) { diff --git a/nntrainer/layers/fc_layer.h b/nntrainer/layers/fc_layer.h index cb3726b020..44ef99d912 100644 --- a/nntrainer/layers/fc_layer.h +++ b/nntrainer/layers/fc_layer.h @@ -114,7 +114,7 @@ class FullyConnectedLayer : public LayerImpl { lora_scaling - scaling factor of LoRA apply, i.e., lora_scaling = alpha / lora_rank */ std::array weight_idx; /**< indices of the weights */ - std::array lora_idx; /**< indices of the lora weights */ + std::array lora_idx; /**< indices of the lora weights */ }; } // namespace nntrainer diff --git a/nntrainer/layers/input_layer.cpp b/nntrainer/layers/input_layer.cpp index eabd40b297..a67701da2c 100644 --- a/nntrainer/layers/input_layer.cpp +++ b/nntrainer/layers/input_layer.cpp @@ -34,7 +34,8 @@ static constexpr size_t SINGLE_INOUT_IDX = 0; InputLayer::InputLayer() : Layer(), - input_props(props::Normalization(), props::Standardization()) {} + input_props(props::Normalization(), props::Standardization()), + is_inplace(true) {} void InputLayer::setProperty(const std::vector &values) { auto remain_props = loadProperties(values, input_props); @@ -47,7 +48,7 @@ void InputLayer::forwarding(RunLayerContext &context, bool training) { Tensor &hidden_ = context.getOutput(SINGLE_INOUT_IDX); if (!context.executeInPlace()) { Tensor &input_ = context.getInput(SINGLE_INOUT_IDX); - hidden_.copy(input_); + hidden_.copyData(input_); } if (std::get(input_props)) @@ -70,7 +71,22 @@ void InputLayer::finalize(InitLayerContext &context) { std::vector output_dims = context.getInputDimensions(); + for (auto &d : output_dims) { + d.setDataType(context.getActivationDataType()); + } + context.setOutputDimensions(output_dims); + + is_inplace = true; + + /** + * @note Input Layer assuems that the FP32 IN Tensor always. Therefore, if the + * activation data type is not fp32, then it does not support in-place + * operation. 
+ */ + if (context.getActivationDataType() != ml::train::TensorDim::DataType::FP32) { + is_inplace = false; + } } } /* namespace nntrainer */ diff --git a/nntrainer/layers/input_layer.h b/nntrainer/layers/input_layer.h index f6728d676b..e9183e23d1 100644 --- a/nntrainer/layers/input_layer.h +++ b/nntrainer/layers/input_layer.h @@ -82,7 +82,7 @@ class InputLayer : public Layer { /** * @copydoc Layer::supportInPlace() */ - bool supportInPlace() const override { return true; } + bool supportInPlace() const override { return is_inplace; } /** * @copydoc Layer::exportTo(Exporter &exporter, ml::train::ExportMethods @@ -105,6 +105,7 @@ class InputLayer : public Layer { private: std::tuple input_props; + bool is_inplace; }; } // namespace nntrainer diff --git a/nntrainer/layers/layer_context.cpp b/nntrainer/layers/layer_context.cpp index fff2eb15ec..add78c09cb 100644 --- a/nntrainer/layers/layer_context.cpp +++ b/nntrainer/layers/layer_context.cpp @@ -126,13 +126,14 @@ const std::vector &InitLayerContext::getOutSpecs() const { } RunLayerContext::RunLayerContext(const std::string &name, bool trainable, - float l, bool in_place_, + float l, bool in_place_, float loss_scale_, const std::vector &w, const std::vector &in, const std::vector &out, const std::vector &t) : loss(l), in_place(in_place_), + loss_scale(loss_scale_), weights(w), inputs(in), outputs(out), @@ -169,6 +170,19 @@ Tensor &RunLayerContext::getWeightGrad(unsigned int idx) const { return weights[idx]->getGradientRef(); } +/** + * @brief Get the Weight Gradient tensor object + * + * @param idx Identifier of the weight + * @return Tensor& Reference to the weight grad tensor + */ +Tensor &RunLayerContext::getWeightFP32(unsigned int idx) const { + if (!weights[idx]->hasGradient()) + throw std::invalid_argument( + "Requesting gradient for a non-trainable weight."); + return weights[idx]->getVariableFP32Ref(); +} + /** * @brief Get the Weight Optimizer Variable tensor object * @@ -402,6 +416,17 @@ bool RunLayerContext::isGradientClipByGlobalNorm(unsigned int idx) const { return weights[idx]->isGradientClipByGlobalNorm(); } +bool RunLayerContext::isMixedPrecision(unsigned int idx) const { + return weights[idx]->isMixedPrecision(); +} + +bool RunLayerContext::isMixedPrecision() const { + for (auto w : weights) + if (w->isMixedPrecision()) + return true; + return false; +} + /** * @brief Get the tensor name * @@ -650,10 +675,12 @@ bool RunLayerContext::clCreateKernel(std::string kernel_string, */ std::string RunLayerContext::getKernelName(LayerKernel layerKernel) { switch (layerKernel) { - case LayerKernel::KERNEL_NAME1: - return "kernel_name1"; - case LayerKernel::KERNEL_NAME2: - return "kernel_name2"; + case LayerKernel::SGEMV: + return "sgemv_cl"; + case LayerKernel::DOT: + return "dot_cl"; + case LayerKernel::SGEMM: + return "sgemm_cl"; default: return ""; } diff --git a/nntrainer/layers/layer_context.h b/nntrainer/layers/layer_context.h index e5c6759638..2a32ba7287 100644 --- a/nntrainer/layers/layer_context.h +++ b/nntrainer/layers/layer_context.h @@ -63,7 +63,7 @@ class InitLayerContext { const float max_norm = 0.0, std::array tensor_type_ = {"NCHW", "FP32", "FP32"}, - const float loss_scale = 0.0); + const float loss_scale = 1.0); /** * @brief get Tensor Format of Layer * @@ -348,6 +348,14 @@ class InitLayerContext { */ bool executeInPlace() const { return in_place; } + /** + * @brief get Initial value of Loss_Scale. 
This is set to RunLayerContext + * and updated + * + * @return loss_scale + */ + float getLossScale() const { return loss_scale; } + private: std::vector input_dim; /**< Input dimensions for the layer */ bool in_place; /**< if the layer is expected to run in-place */ @@ -385,7 +393,7 @@ class RunLayerContext { * @brief Construct a new Run Layer Context object * */ - RunLayerContext() : loss(0.0), in_place(false) {} + RunLayerContext() : loss(0.0), in_place(false), loss_scale(1.0) {} /** * @brief Construct a new Run Layer Context object @@ -396,6 +404,17 @@ class RunLayerContext { std::get(props).set(name); } + /** + * @brief Construct a new Run Layer Context object + * + */ + RunLayerContext(const std::string &name, bool in_place_, float loss_scale_) : + RunLayerContext() { + in_place = in_place_; + std::get(props).set(name); + loss_scale = loss_scale_; + } + /** * @brief Construct a new Run Layer Context object * @@ -403,13 +422,15 @@ class RunLayerContext { * @param trainable if the layer is trainable * @param l loss of the layer * @param in_place_ execution in-place of the layer + * @param loss_scale loss_scale of the layer * @param w weights of the layer * @param in inputs of the layer * @param out outputs of the layer * @param t extra tensors of the layer */ RunLayerContext(const std::string &name, bool trainable, float l, - bool in_place_, const std::vector &w, + bool in_place_, float loss_scale_, + const std::vector &w, const std::vector &in, const std::vector &out, const std::vector &t); @@ -463,6 +484,15 @@ class RunLayerContext { Tensor &getWeightGrad(unsigned int idx) const; /** + * @brief Get the Weight Gradient tensor object + * + * @param idx Identifier of the weight + * @return Tensor& Reference to the weight grad tensor + */ + Tensor &getWeightFP32(unsigned int idx) const; + + /** + * @brief Get the Weight Optimizer Variable tensor object * * @param idx Identifier of the weight @@ -659,6 +689,20 @@ class RunLayerContext { */ bool isGradientClipByGlobalNorm(unsigned int idx) const; + /** + * @brief check if the weight is mixed precsion + * + * @param idx index + * @return bool true if it is mixed precision + */ + bool isMixedPrecision(unsigned int idx) const; + + /** + * @brief check if the weight is mixed precsion + * @return bool true if it is mixed precision + */ + bool isMixedPrecision() const; + /** * @brief Get the tensor name * @@ -830,8 +874,9 @@ class RunLayerContext { * getKernelName function. */ enum LayerKernel { - KERNEL_NAME1 = 1, /**< placeholder for kernel name */ - KERNEL_NAME2 = 2 /**< placeholder for kernel name */ + SGEMV = 1, /**< placeholder for kernel name */ + DOT = 2, /**< placeholder for kernel name */ + SGEMM = 4 /**< placeholder for kernel name */ }; /** @@ -874,10 +919,29 @@ class RunLayerContext { */ ml::train::LayerComputeEngine getComputeEngine() { return compute_engine; } + /** + * @brief get loss scale + * @return loss scale + */ + float getLossScale() { return loss_scale; } + + /** + * @brief set Loss_Scale. 
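Editor's note: the practical effect of the loss scale plumbed through RunLayerContext is easiest to see on the derivative leaving the loss layer. A minimal standalone sketch, with a plain vector in place of Tensor and an illustrative name:

// Editor's sketch: amplify the loss derivative by the current scale so small
// low-precision gradients do not flush to zero during backpropagation; the
// scale is divided back out before the optimizer applies the gradient.
#include <vector>

inline void scaleLossDerivative(std::vector<float> &dloss, float loss_scale) {
  if (loss_scale == 1.0f)
    return;              // default scale, nothing to do
  for (float &v : dloss)
    v *= loss_scale;
}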
+ * + * @return loss_scale + */ + void setLossScale(float scale) { + loss_scale = scale; + for (auto w : weights) { + w->setLossScale(scale); + } + } + private: std::tuple props; /**< props of the layer */ float loss; /**< loss of the layer */ - bool in_place; /**< if the layer is expected to run in-place */ + bool in_place; /**< if the layer is expected to run in-place */ + float loss_scale; /**< loss_scale of the layer */ std::vector weights; /**< weights of the layer */ std::vector inputs; /**< inputs of the layer */ diff --git a/nntrainer/layers/layer_devel.h b/nntrainer/layers/layer_devel.h index 54ce1a0ee9..44a87cc7e9 100644 --- a/nntrainer/layers/layer_devel.h +++ b/nntrainer/layers/layer_devel.h @@ -259,6 +259,11 @@ class Layer { * @return true if supports backwarding, else false */ virtual bool supportBackwarding() const = 0; + + /** + * @brief Set loss scale factor + */ + virtual void setLossScale(float scale) {} }; /// @todo Decide where to put and how to implement(#986) diff --git a/nntrainer/layers/layer_node.cpp b/nntrainer/layers/layer_node.cpp index 8b18d80762..114555fee4 100644 --- a/nntrainer/layers/layer_node.cpp +++ b/nntrainer/layers/layer_node.cpp @@ -180,6 +180,7 @@ LayerNode::LayerNode(std::unique_ptr &&l) : inplace(InPlace::NONE), needs_calc_derivative(false), needs_calc_gradient(false), + output_connections(), run_context(nullptr), layer_node_props( @@ -190,7 +191,8 @@ LayerNode::LayerNode(std::unique_ptr &&l) : new RealizationPropsType(props::Flatten(), props::Activation())), loss(new props::Loss()), regularization_loss(0.0f), - exec_order({0, 0, 0, 0}) { + exec_order({0, 0, 0, 0}), + needs_output_set_zero(false) { if (layer && layer->getType() == TimeDistLayer::type) { std::get(*layer_node_props).set(true); } @@ -475,6 +477,9 @@ void LayerNode::read(std::ifstream &file, bool opt_var) { /// @note shared weights are only be read at the first acecss if (run_context->isGradientLastAccess(i)) { run_context->getWeight(i).read(file); + if (run_context->isMixedPrecision(i) && getTrainable()) { + run_context->getWeightFP32(i).copyData(run_context->getWeight(i)); + } } } } @@ -599,7 +604,7 @@ InitLayerContext LayerNode::finalize(const std::vector &input_dims, const auto &scope = getSharedFrom().empty() ? 
getName() : getSharedFrom(); float max_norm = 0.0; - float loss_scale = 0.0; + float loss_scale = 1.0; if (!std::get(*layer_node_props).empty()) max_norm = std::get(*layer_node_props).get(); @@ -748,8 +753,21 @@ LayerNode::refinalize(const std::vector &input_dims) { */ void LayerNode::forwarding(bool training) { loss->set(run_context->getRegularizationLoss()); + PROFILE_TIME_START(forward_event_key); + if (needsOutputSetZero()) { + for (unsigned int i = 0; i < run_context->getNumOutputs(); ++i) { + run_context->getOutput(i).setValue(0); + run_context->getOutgoingDerivative(i).setValue(0); + } + + for (unsigned int i = 0; i < run_context->getNumWeights(); ++i) { + run_context->getWeightGrad(i).setValue(0); + } + } + layer->forwarding(*run_context, training); + needsOutputSetZero(false); PROFILE_TIME_END(forward_event_key); TRACE_MEMORY() << getName() + ": F"; TRACE_TIME() << getName() + ": F"; @@ -864,10 +882,11 @@ float LayerNode::getLoss() const { return *loss; } void LayerNode::configureRunContext(const std::vector &weights, const std::vector &inputs, const std::vector &outputs, - const std::vector &tensors) { + const std::vector &tensors, + float loss_scale) { run_context = std::make_unique( - getName(), getTrainable(), 0.0f, executeInPlace() != InPlace::NONE, weights, - inputs, outputs, tensors); + getName(), getTrainable(), 0.0f, executeInPlace() != InPlace::NONE, + loss_scale, weights, inputs, outputs, tensors); } /** diff --git a/nntrainer/layers/layer_node.h b/nntrainer/layers/layer_node.h index 93e7ac7069..c2202f20aa 100644 --- a/nntrainer/layers/layer_node.h +++ b/nntrainer/layers/layer_node.h @@ -487,6 +487,7 @@ class LayerNode final : public ml::train::Layer, public GraphNode { const std::vector getOutputDimensions() const; /** * @brief Get the Weight object + * currently, only unittest uses this func. 
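Editor's note: together, the FP32 master weight exposed by getWeightFP32() and the loss scale support the usual mixed-precision update. A hedged standalone sketch of that step, where vectors stand in for the low-precision compute weight and its FP32 master, and plain SGD is used only for illustration:

// Editor's sketch, not the Weight/optimizer API: the scaled gradient computed
// on the low-precision side is unscaled, applied to the FP32 master copy, and
// the compute copy is refreshed from the master for the next pass.
#include <cstddef>
#include <vector>

inline void mixedPrecisionSgdStep(std::vector<float> &master_fp32,
                                  std::vector<float> &compute_lowp, // _FP16 in practice
                                  const std::vector<float> &scaled_grad,
                                  float lr, float loss_scale) {
  for (std::size_t i = 0; i < master_fp32.size(); ++i) {
    master_fp32[i] -= lr * (scaled_grad[i] / loss_scale); // unscale, then step
    compute_lowp[i] = master_fp32[i];                      // cast back
  }
}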
* * @param idx Identifier of the weight * @return Weight& Reference to the weight @@ -495,11 +496,11 @@ class LayerNode final : public ml::train::Layer, public GraphNode { NNTR_THROW_IF(!run_context, std::runtime_error) << __func__ << " layer needs to be finalized first!"; if (run_context->weightHasGradient(idx)) { - return Weight(run_context->getWeight(idx), - run_context->getWeightGrad(idx), - run_context->getWeightName(idx)); + return Weight( + run_context->getWeight(idx), run_context->getWeightGrad(idx), + run_context->getWeightFP32(idx), run_context->getWeightName(idx)); } else { - return Weight(run_context->getWeight(idx), Tensor(), + return Weight(run_context->getWeight(idx), Tensor(), Tensor(), run_context->getWeightName(idx)); } } @@ -819,7 +820,8 @@ class LayerNode final : public ml::train::Layer, public GraphNode { void configureRunContext(const std::vector &weights, const std::vector &inputs, const std::vector &outputs, - const std::vector &tensors); + const std::vector &tensors, + float loss_scale); /** * @brief Preset modes for printing summary for the layer @@ -877,6 +879,13 @@ class LayerNode final : public ml::train::Layer, public GraphNode { needs_calc_derivative = nb; } + /** + * @brief Set if the layer output needs reinitialization @mixed precsion + * + * @param nb true if the layer needs to do reinitialization, eles false + */ + void needsOutputSetZero(bool nb) { needs_output_set_zero = nb; } + /** * @brief Set if the layer needs to do calculation of gradients * @@ -898,6 +907,13 @@ class LayerNode final : public ml::train::Layer, public GraphNode { */ bool needsCalcGradient() { return needs_calc_gradient; } + /** + * @brief Set if the layer needs to reinitialization @mixed precsion + * + * @param nb true if the layer needs reinitialization, eles false + */ + bool needsOutputSetZero() { return needs_output_set_zero; } + private: /** * @brief Get the Input Layers object @@ -964,6 +980,9 @@ properties in the context/graph unless intended. 
*/ ExecutionOrder exec_order; /**< order/location of execution for this node in forward and backwarding operations */ + bool needs_output_set_zero; /**< cache if this layer needs reinitialization + output */ + /** * @brief Get the effective layer managed by this layer node * diff --git a/nntrainer/layers/loss/loss_layer.cpp b/nntrainer/layers/loss/loss_layer.cpp index 40f74717f8..8d18878f49 100644 --- a/nntrainer/layers/loss/loss_layer.cpp +++ b/nntrainer/layers/loss/loss_layer.cpp @@ -22,8 +22,12 @@ void LossLayer::finalize(InitLayerContext &context) { d.setDataType( str_converter::from_string("FP32")); - + context.setOutputDimensions(output_dim); + + is_inplace = true; + if (context.getActivationDataType() != ml::train::TensorDim::DataType::FP32) + is_inplace = false; } void LossLayer::updateLoss(RunLayerContext &context, const Tensor &l) { @@ -36,6 +40,13 @@ void LossLayer::updateLoss(RunLayerContext &context, const Tensor &l) { context.setLoss(loss_sum / (float)l.batch()); } +void LossLayer::applyLossScale(RunLayerContext &context, Tensor &ret_deriv) { + + float loss_scale = context.getLossScale(); + if (loss_scale != 1.0) + ret_deriv.multiply_i(loss_scale); +} + /** * @copydoc Layer::setProperty(const std::vector &values) */ diff --git a/nntrainer/layers/loss/loss_layer.h b/nntrainer/layers/loss/loss_layer.h index 00b520f6e6..418777606c 100644 --- a/nntrainer/layers/loss/loss_layer.h +++ b/nntrainer/layers/loss/loss_layer.h @@ -47,6 +47,8 @@ class LossLayer : public Layer { */ virtual bool supportBackwarding() const override { return true; } + bool supportInPlace() const override {return is_inplace;} + /** * @copydoc Layer::requireLabel() */ @@ -60,8 +62,17 @@ class LossLayer : public Layer { */ void updateLoss(RunLayerContext &context, const Tensor &l); + /** + * @brief update return derivative with loss scale + * @param context Run context to update + * @param return_dev Tensor data to calculate + */ + void applyLossScale(RunLayerContext &context, Tensor &l); + Tensor l; /**< loss tensor to store intermediate value to calculate loss value */ + + bool is_inplace; }; } // namespace nntrainer diff --git a/nntrainer/layers/loss/mse_loss_layer.cpp b/nntrainer/layers/loss/mse_loss_layer.cpp index 7f7bd1626f..356acae6f5 100644 --- a/nntrainer/layers/loss/mse_loss_layer.cpp +++ b/nntrainer/layers/loss/mse_loss_layer.cpp @@ -20,7 +20,16 @@ static constexpr size_t SINGLE_INOUT_IDX = 0; void MSELossLayer::forwarding(RunLayerContext &context, bool training) { Tensor &hidden_ = context.getOutput(SINGLE_INOUT_IDX); - Tensor &y = context.getInput(SINGLE_INOUT_IDX); + + Tensor empty_tensor; + Tensor &y = context.getInput(SINGLE_INOUT_IDX).getDataType() == + ml::train::TensorDim::DataType::FP32 + ? context.getInput(SINGLE_INOUT_IDX) + : empty_tensor; + + if (y.empty()) + y = context.getInput(SINGLE_INOUT_IDX) + .clone(ml::train::TensorDim::DataType::FP32); // hidden_ <- y2 - y; if (context.isLabelAvailable(SINGLE_INOUT_IDX)) { @@ -41,9 +50,28 @@ void MSELossLayer::forwarding(RunLayerContext &context, bool training) { } void MSELossLayer::calcDerivative(RunLayerContext &context) { - Tensor &ret_derivative = context.getOutgoingDerivative(SINGLE_INOUT_IDX); + Tensor empty_tensor; + + Tensor &ret_derivative = + context.getOutgoingDerivative(SINGLE_INOUT_IDX).getDataType() == + ml::train::TensorDim::DataType::FP32 + ? 
context.getOutgoingDerivative(SINGLE_INOUT_IDX) + : empty_tensor; + + if (ret_derivative.empty()) + ret_derivative = context.getOutgoingDerivative(SINGLE_INOUT_IDX) + .clone(ml::train::TensorDim::DataType::FP32); + Tensor empty_tensor1; + Tensor &y = context.getInput(SINGLE_INOUT_IDX).getDataType() == + ml::train::TensorDim::DataType::FP32 + ? context.getInput(SINGLE_INOUT_IDX) + : empty_tensor1; + + if (y.empty()) + y = context.getInput(SINGLE_INOUT_IDX) + .clone(ml::train::TensorDim::DataType::FP32); + const Tensor &y2 = context.getIncomingDerivative(SINGLE_INOUT_IDX); - Tensor &y = context.getInput(SINGLE_INOUT_IDX); y.subtract(y2, ret_derivative); float divider = ((float)y.size()) / 2; @@ -51,6 +79,16 @@ void MSELossLayer::calcDerivative(RunLayerContext &context) { throw std::runtime_error( "[MSELossLayer::calcDerivative] Error when calculating loss"); } + + // Loss Scale needs Full precsiion of ret_derivative. Therefore, + // ret_derivateive should be FP32 when applying scale, and after applying it + // need to convert original type for backpropagating. + + LossLayer::applyLossScale(context, ret_derivative); + + if (context.getOutgoingDerivative(SINGLE_INOUT_IDX).getDataType() != + ml::train::TensorDim::DataType::FP32) + context.getOutgoingDerivative(SINGLE_INOUT_IDX).copyData(ret_derivative); } } // namespace nntrainer diff --git a/nntrainer/layers/loss/mse_loss_layer.h b/nntrainer/layers/loss/mse_loss_layer.h index 387e92b3b5..829b921668 100644 --- a/nntrainer/layers/loss/mse_loss_layer.h +++ b/nntrainer/layers/loss/mse_loss_layer.h @@ -51,6 +51,7 @@ class MSELossLayer : public LossLayer { const std::string getType() const override { return MSELossLayer::type; }; inline static const std::string type = "mse"; + }; } // namespace nntrainer diff --git a/nntrainer/layers/lstm.cpp b/nntrainer/layers/lstm.cpp index d5f13a1fc5..be313a0aca 100644 --- a/nntrainer/layers/lstm.cpp +++ b/nntrainer/layers/lstm.cpp @@ -509,21 +509,27 @@ void LSTMLayer::finalize(InitLayerContext &context) { } // hidden_state_dim : [ batch_size, 1, max_timestep, unit ] - const TensorDim hidden_state_dim(batch_size, 1, max_timestep, unit, - weight_tensor_type); + TensorDim hidden_state_dim(batch_size, 1, max_timestep, unit, + weight_tensor_type); + hidden_state_dim.setDataType(context.getActivationDataType()); + wt_idx[LSTMParams::hidden_state] = context.requestTensor( hidden_state_dim, "hidden_state", Tensor::Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); // cell_state_dim : [ batch_size, 1, max_timestep, unit ] - const TensorDim cell_state_dim(batch_size, 1, max_timestep, unit, - weight_tensor_type); + TensorDim cell_state_dim(batch_size, 1, max_timestep, unit, + weight_tensor_type); + cell_state_dim.setDataType(context.getActivationDataType()); + wt_idx[LSTMParams::cell_state] = context.requestTensor( cell_state_dim, "cell_state", Tensor::Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); // ifgo_dim : [ batch_size, 1, max_timestep, NUM_GATE * unit ] - const TensorDim ifgo_dim(batch_size, 1, max_timestep, NUM_GATE * unit, - weight_tensor_type); + TensorDim ifgo_dim(batch_size, 1, max_timestep, NUM_GATE * unit, + weight_tensor_type); + ifgo_dim.setDataType(context.getActivationDataType()); + wt_idx[LSTMParams::ifgo] = context.requestTensor(ifgo_dim, "ifgo", Tensor::Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); @@ -576,21 +582,27 @@ void LSTMLayer::finalize(InitLayerContext &context) { } // reverse_hidden_state_dim : [ batch_size, 1, max_timestep, unit ] - const TensorDim 
reverse_hidden_state_dim(batch_size, 1, max_timestep, unit, - weight_tensor_type); + TensorDim reverse_hidden_state_dim(batch_size, 1, max_timestep, unit, + weight_tensor_type); + reverse_hidden_state_dim.setDataType(context.getActivationDataType()); + wt_idx[LSTMParams::reverse_hidden_state] = context.requestTensor( reverse_hidden_state_dim, "reverse_hidden_state", Tensor::Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); // reverse_cell_state_dim : [ batch_size, 1, max_timestep, unit ] - const TensorDim reverse_cell_state_dim(batch_size, 1, max_timestep, unit, - weight_tensor_type); + TensorDim reverse_cell_state_dim(batch_size, 1, max_timestep, unit, + weight_tensor_type); + reverse_cell_state_dim.setDataType(context.getActivationDataType()); + wt_idx[LSTMParams::reverse_cell_state] = context.requestTensor( reverse_cell_state_dim, "reverse_cell_state", Tensor::Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); // reverse_ifgo_dim : [ batch_size, 1, max_timestep, NUM_GATE * unit ] - const TensorDim reverse_ifgo_dim(batch_size, 1, max_timestep, - NUM_GATE * unit, weight_tensor_type); + TensorDim reverse_ifgo_dim(batch_size, 1, max_timestep, NUM_GATE * unit, + weight_tensor_type); + reverse_ifgo_dim.setDataType(context.getActivationDataType()); + wt_idx[LSTMParams::reverse_ifgo] = context.requestTensor( reverse_ifgo_dim, "reverse_ifgo", Tensor::Initializer::NONE, true, TensorLifespan::ITERATION_LIFESPAN); @@ -598,8 +610,10 @@ void LSTMLayer::finalize(InitLayerContext &context) { if (dropout_rate > epsilon) { // dropout_mask_dim = [ batch, 1, time_iteration, unit ] - const TensorDim dropout_mask_dim(batch_size, 1, max_timestep, unit, - weight_tensor_type); + TensorDim dropout_mask_dim(batch_size, 1, max_timestep, unit, + weight_tensor_type); + dropout_mask_dim.setDataType(context.getActivationDataType()); + wt_idx[LSTMParams::dropout_mask] = context.requestTensor( dropout_mask_dim, "dropout_mask", Tensor::Initializer::NONE, false, TensorLifespan::ITERATION_LIFESPAN); diff --git a/nntrainer/layers/lstm.h b/nntrainer/layers/lstm.h index f35fdf8815..a9b2cac7d7 100644 --- a/nntrainer/layers/lstm.h +++ b/nntrainer/layers/lstm.h @@ -99,7 +99,6 @@ class LSTMLayer : public LSTMCore { inline static const std::string type = "lstm"; -private: static constexpr unsigned int NUM_GATE = 4; /** common properties like Unit, IntegrateBias, HiddenStateActivation and diff --git a/nntrainer/layers/pooling2d_layer.cpp b/nntrainer/layers/pooling2d_layer.cpp index a68e42e8d0..b53ca354f2 100644 --- a/nntrainer/layers/pooling2d_layer.cpp +++ b/nntrainer/layers/pooling2d_layer.cpp @@ -6,6 +6,7 @@ * @date 12 June 2020 * @see https://github.com/nnstreamer/nntrainer * @author Jijoong Moon + * @author Jiho Chu * @bug No known bugs except for NYI items * @brief This is 2 Dimensional Pooling Layer Class for Neural Network * @@ -26,6 +27,13 @@ namespace nntrainer { static constexpr size_t SINGLE_INOUT_IDX = 0; +/** + * @brief help function for Pooling handler + */ +template struct PoolFunc { + typedef std::function Type; +}; + Pooling2DLayer::Pooling2DLayer( const std::array &padding_) : Layer(), @@ -96,6 +104,7 @@ void Pooling2DLayer::finalize(InitLayerContext &context) { out_dim.channel(in_dim.channel()); out_dim.height((eff_in_height - pool_size[0]) / stride[0] + 1); out_dim.width((eff_in_width - pool_size[1]) / stride[1] + 1); + out_dim.setDataType(in_dim.getDataType()); context.setOutputDimensions({out_dim}); /** @@ -111,13 +120,17 @@ void Pooling2DLayer::finalize(InitLayerContext &context) { * 
// clang-format on */ if (pooling_type == props::PoolingTypeInfo::Enum::global_max) { + auto helper_dim = in_dim; + helper_dim.setDataType(ml::train::TensorDim::DataType::FP32); pool_helper_idx = - context.requestTensor(in_dim, "helper_idx", Tensor::Initializer::NONE, + context.requestTensor(helper_dim, "helper_idx", Tensor::Initializer::NONE, false, TensorLifespan::ITERATION_LIFESPAN); - pool_helper_size.resize(in_dim.batch() * in_dim.channel()); + pool_helper_size.resize(helper_dim.batch() * helper_dim.channel()); } else { + auto helper_dim = out_dim; + helper_dim.setDataType(ml::train::TensorDim::DataType::FP32); pool_helper_idx = - context.requestTensor(out_dim, "helper_idx", Tensor::Initializer::NONE, + context.requestTensor(helper_dim, "helper_idx", Tensor::Initializer::NONE, false, TensorLifespan::ITERATION_LIFESPAN); } } @@ -172,15 +185,13 @@ void Pooling2DLayer::calcDerivative(RunLayerContext &context) { unsigned int J, K; result.setZero(); - float *result_data = result.getData(); unsigned int out_map_size = deriv.height() * deriv.width(); unsigned int in_map_size = height * width; - switch (pooling_type) { - case props::PoolingTypeInfo::Enum::max: { + auto apply_max = [&](T *result_data) { const int *iter = pool_helper.getData(); - const float *deriv_data = deriv.getData(); + const T *deriv_data = deriv.getData(); for (unsigned int b = 0; b < batch; ++b) { for (unsigned int c = 0; c < channel; ++c) { for (unsigned int i = 0; i < out_map_size; ++i) { @@ -195,9 +206,9 @@ void Pooling2DLayer::calcDerivative(RunLayerContext &context) { result_data += in_map_size; } } - } break; - case props::PoolingTypeInfo::Enum::global_average: - case props::PoolingTypeInfo::Enum::average: { + }; + + auto apply_average = [&](T *result_data) { int height_stride_end = height - p_height + pt; int width_stride_end = width - p_width + pl; const int *iter = pool_helper.getData(); @@ -207,7 +218,7 @@ void Pooling2DLayer::calcDerivative(RunLayerContext &context) { for (int j = -pt; j <= height_stride_end; j += stride[0]) { K = 0; for (int k = -pl; k <= width_stride_end; k += stride[1]) { - float del = deriv.getValue(b, i, J, K) / *iter; + T del = deriv.getValue(b, i, J, K) / *iter; int patch_height_end = std::min(static_cast(j + p_height), height); int patch_width_end = @@ -217,7 +228,7 @@ void Pooling2DLayer::calcDerivative(RunLayerContext &context) { for (int h = start_h; h < patch_height_end; ++h) { for (int w = start_w; w < patch_width_end; ++w) { result.setValue(b, i, h, w, - result.getValue(b, i, h, w) + del); + result.getValue(b, i, h, w) + del); } } iter++; @@ -227,15 +238,16 @@ void Pooling2DLayer::calcDerivative(RunLayerContext &context) { } } } - } break; - case props::PoolingTypeInfo::Enum::global_max: { - const float *deriv_data = deriv.getData(); + }; + + auto apply_global_max = [&](T *result_data) { + const T *deriv_data = deriv.getData(); for (unsigned int b = 0; b < batch; b++) { for (unsigned int c = 0; c < channel; c++) { const int *iter = pool_helper.getData() + pool_helper.getIndex(b, c, 0, 0); unsigned int helper_size = pool_helper_size[b * channel + c]; - float der = *deriv_data / helper_size; + T der = *deriv_data / static_cast(helper_size); for (unsigned int idx = 0; idx < helper_size; idx++) result_data[iter[idx]] += der; @@ -244,7 +256,40 @@ void Pooling2DLayer::calcDerivative(RunLayerContext &context) { result_data += in_map_size; } } - } break; + }; + + switch (pooling_type) { + case props::PoolingTypeInfo::Enum::max: + if (in_dim.getDataType() == 
ml::train::TensorDim::DataType::FP32) + apply_max(result.getData()); +#ifdef ENABLE_FP16 + else if (in_dim.getDataType() == ml::train::TensorDim::DataType::FP16) + apply_max(result.getData<_FP16>()); +#endif + else + throw std::runtime_error("Not supported datatype"); + break; + case props::PoolingTypeInfo::Enum::global_average: + case props::PoolingTypeInfo::Enum::average: + if (in_dim.getDataType() == ml::train::TensorDim::DataType::FP32) + apply_average(result.getData()); +#ifdef ENABLE_FP16 + else if (in_dim.getDataType() == ml::train::TensorDim::DataType::FP16) + apply_average(result.getData<_FP16>()); +#endif + else + throw std::runtime_error("Not supported datatype"); + break; + case props::PoolingTypeInfo::Enum::global_max: + if (in_dim.getDataType() == ml::train::TensorDim::DataType::FP32) + apply_global_max(result.getData()); +#ifdef ENABLE_FP16 + else if (in_dim.getDataType() == ml::train::TensorDim::DataType::FP16) + apply_global_max(result.getData<_FP16>()); +#endif + else + throw std::runtime_error("Not supported datatype"); + break; default: throw std::runtime_error("Error: Unknown Pooling Type"); } @@ -290,124 +335,167 @@ void Pooling2DLayer::pooling2d(Tensor &in, bool training, Tensor &output, * @param start_w (width index pointing the start of the patch) * @return result value of pooling */ - std::function pool_fn; + PoolFunc::Type pool_fn_fp32; +#ifdef ENABLE_FP16 + PoolFunc<_FP16>::Type pool_fn_fp16; +#endif unsigned int max_idx_count = 0; - switch (pooling_type) { - case props::PoolingTypeInfo::Enum::max: { - pool_fn = [&](const float *in_data, int channel_idx, int start_h, - int start_w) { - int end_h = start_h + patch_height; - int end_w = start_w + patch_width; - - float max_val = std::numeric_limits::lowest(); - - int cur_max_idx = -1; - int eff_end_h = std::min(end_h, in_height); - int eff_end_w = std::min(end_w, in_width); - start_w = std::max(0, start_w); - for (int h = std::max(0, start_h); h < eff_end_h; ++h) { - for (int w = start_w; w < eff_end_w; ++w) { - int cur_idx = h * in_width + w; - float val = in_data[cur_idx]; - if (max_val < val) { - max_val = val; - if (training) { - cur_max_idx = cur_idx; - } + + auto pool_fn_max = [&](const T *in_data, int channel_idx, + int start_h, int start_w) { + int end_h = start_h + patch_height; + int end_w = start_w + patch_width; + + T max_val = std::numeric_limits::lowest(); + + int cur_max_idx = -1; + int eff_end_h = std::min(end_h, in_height); + int eff_end_w = std::min(end_w, in_width); + start_w = std::max(0, start_w); + for (int h = std::max(0, start_h); h < eff_end_h; ++h) { + for (int w = start_w; w < eff_end_w; ++w) { + int cur_idx = h * in_width + w; + T val = in_data[cur_idx]; + if (max_val < val) { + max_val = val; + if (training) { + cur_max_idx = cur_idx; } } } + } - if (training) { - pool_helper.setValueInt(max_idx_count++, cur_max_idx); - } + if (training) { + pool_helper.setValueInt(max_idx_count++, cur_max_idx); + } - return max_val; - }; - break; - } - case props::PoolingTypeInfo::Enum::global_max: { - pool_fn = [&, this](const float *in_data, int channel_idx, int start_h, - int start_w) { - int end_h = start_h + patch_height; - int end_w = start_w + patch_width; - - float max_val = std::numeric_limits::lowest(); - int *helper_data = pool_helper.getData(); - helper_data += channel_idx * in_height * in_width; - - for (int h = start_h; h < end_h; ++h) { - for (int w = start_w; w < end_w; ++w) { - int cur_idx = h * in_width + w; - float val = in_data[cur_idx]; - if (max_val < val) { - max_val = val; - 
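
The pooling derivative above is made data-type agnostic by writing the element-wise work once over a template parameter and choosing the instantiation at run time from the tensor's data type. The self-contained sketch below shows the same dispatch shape with a deliberately trivial kernel; scale_buffer, scale_any and the Dtype enum are placeholders, while the FP32/FP16 branching and the ENABLE_FP16 guard mirror the hunk.

#include <cstddef>
#include <stdexcept>

enum class Dtype { FP32, FP16 };   // stand-in for ml::train::TensorDim::DataType

template <typename T>
static void scale_buffer(T *data, size_t len, float s) {
  for (size_t i = 0; i < len; ++i)
    data[i] = static_cast<T>(static_cast<float>(data[i]) * s);
}

static void scale_any(void *buf, size_t len, Dtype dt, float s) {
  if (dt == Dtype::FP32)
    scale_buffer(static_cast<float *>(buf), len, s);
#ifdef ENABLE_FP16
  else if (dt == Dtype::FP16)
    scale_buffer(static_cast<_FP16 *>(buf), len, s);
#endif
  else
    throw std::runtime_error("Not supported datatype");
}
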
max_idx_count = 0; - } + return max_val; + }; - if (training && max_val == val) { - *(helper_data + max_idx_count++) = cur_idx; - } + auto pool_fn_global_max = [&, this](const T *in_data, + int channel_idx, int start_h, + int start_w) { + int end_h = start_h + patch_height; + int end_w = start_w + patch_width; + + T max_val = std::numeric_limits::lowest(); + int *helper_data = pool_helper.getData(); + helper_data += channel_idx * in_height * in_width; + + for (int h = start_h; h < end_h; ++h) { + for (int w = start_w; w < end_w; ++w) { + int cur_idx = h * in_width + w; + T val = in_data[cur_idx]; + if (max_val < val) { + max_val = val; + max_idx_count = 0; } - } - pool_helper_size[batch_idx * in.channel() + channel_idx] = max_idx_count; - return max_val; - }; - break; - } - case props::PoolingTypeInfo::Enum::global_average: - case props::PoolingTypeInfo::Enum::average: { - pool_fn = [&](const float *in_data, int channel_idx, int start_h, - int start_w) { - int end_h = start_h + patch_height; - int end_w = start_w + patch_width; - float total = 0.0f; - - int eff_end_h = std::min(end_h, in_height); - int eff_end_w = std::min(end_w, in_width); - int eff_start_h = std::max(0, start_h); - int eff_start_w = std::max(0, start_w); - - int cnt = (eff_end_h - eff_start_h) * (eff_end_w - eff_start_w); - for (int h = eff_start_h; h < eff_end_h; ++h) { - for (int w = eff_start_w; w < eff_end_w; ++w) { - float val = in_data[h * in_width + w]; - total += val; + if (training && max_val == val) { + *(helper_data + max_idx_count++) = cur_idx; } } + } - if (training) { - pool_helper.setValueInt(max_idx_count++, cnt); + pool_helper_size[batch_idx * in.channel() + channel_idx] = max_idx_count; + return max_val; + }; + + auto pool_fn_average = [&](const T *in_data, int channel_idx, + int start_h, int start_w) { + int end_h = start_h + patch_height; + int end_w = start_w + patch_width; + T total = static_cast(0.0f); + + int eff_end_h = std::min(end_h, in_height); + int eff_end_w = std::min(end_w, in_width); + int eff_start_h = std::max(0, start_h); + int eff_start_w = std::max(0, start_w); + + int cnt = (eff_end_h - eff_start_h) * (eff_end_w - eff_start_w); + for (int h = eff_start_h; h < eff_end_h; ++h) { + for (int w = eff_start_w; w < eff_end_w; ++w) { + T val = in_data[h * in_width + w]; + total += val; } - return total / cnt; - }; + } + + if (training) { + pool_helper.setValueInt(max_idx_count++, cnt); + } + return total / cnt; + }; + + switch (pooling_type) { + case props::PoolingTypeInfo::Enum::max: + pool_fn_fp32 = pool_fn_max; +#ifdef ENABLE_FP16 + pool_fn_fp16 = pool_fn_max; +#endif + break; + case props::PoolingTypeInfo::Enum::global_max: + pool_fn_fp32 = pool_fn_global_max; +#ifdef ENABLE_FP16 + pool_fn_fp16 = pool_fn_global_max; +#endif + break; + case props::PoolingTypeInfo::Enum::global_average: + case props::PoolingTypeInfo::Enum::average: + pool_fn_fp32 = pool_fn_average; +#ifdef ENABLE_FP16 + pool_fn_fp16 = pool_fn_average; +#endif break; - } case props::PoolingTypeInfo::Enum::unknown: default: throw std::invalid_argument("unknown pooling type given"); break; } - const float *in_data = in.getData(); - float *out_data = output.getData(); - - unsigned int map_size = in_height * in_width; - - int height_stride_end = height - patch_height - pt; - int width_stride_end = width - patch_width - pl; - for (unsigned int i = 0; i < channel; ++i) { - const float *in_data_channel_sliced = in_data + i * map_size; - for (int j = -pt; j <= height_stride_end; j += stride[0]) { - for (int k = -pl; k <= 
width_stride_end; k += stride[1]) { - float pool_value = pool_fn(in_data_channel_sliced, i, j, k); - *out_data = pool_value; - out_data++; + if (in.getDataType() == ml::train::TensorDim::DataType::FP32) { + const float *in_data = in.getData(); + float *out_data = output.getData(); + + unsigned int map_size = in_height * in_width; + + int height_stride_end = height - patch_height - pt; + int width_stride_end = width - patch_width - pl; + for (unsigned int i = 0; i < channel; ++i) { + const float *in_data_channel_sliced = in_data + i * map_size; + for (int j = -pt; j <= height_stride_end; j += stride[0]) { + for (int k = -pl; k <= width_stride_end; k += stride[1]) { + float pool_value = pool_fn_fp32(in_data_channel_sliced, i, j, k); + *out_data = pool_value; + out_data++; + } + } + } + } +#ifdef ENABLE_FP16 + else if (in.getDataType() == ml::train::TensorDim::DataType::FP16) { + const _FP16 *in_data = in.getData<_FP16>(); + _FP16 *out_data = output.getData<_FP16>(); + + unsigned int map_size = in_height * in_width; + + int height_stride_end = height - patch_height - pt; + int width_stride_end = width - patch_width - pl; + for (unsigned int i = 0; i < channel; ++i) { + const _FP16 *in_data_channel_sliced = in_data + i * map_size; + for (int j = -pt; j <= height_stride_end; j += stride[0]) { + for (int k = -pl; k <= width_stride_end; k += stride[1]) { + _FP16 pool_value = pool_fn_fp16(in_data_channel_sliced, i, j, k); + *out_data = pool_value; + out_data++; + } } } } +#endif + else { + throw std::runtime_error("Not supported datatype"); + } } void Pooling2DLayer::setBatch(RunLayerContext &context, unsigned int batch) { diff --git a/nntrainer/layers/reshape_layer.cpp b/nntrainer/layers/reshape_layer.cpp index 0f82d84f3a..07564b3970 100644 --- a/nntrainer/layers/reshape_layer.cpp +++ b/nntrainer/layers/reshape_layer.cpp @@ -42,6 +42,7 @@ void ReshapeLayer::finalize(InitLayerContext &context) { } out_dim.batch(in_dim.batch()); + out_dim.setDataType(in_dim.getDataType()); context.setOutputDimensions({out_dim}); } diff --git a/nntrainer/layers/time_dist.cpp b/nntrainer/layers/time_dist.cpp index 80451416df..779010065a 100644 --- a/nntrainer/layers/time_dist.cpp +++ b/nntrainer/layers/time_dist.cpp @@ -256,8 +256,8 @@ void TimeDistLayer::forwarding(RunLayerContext &context, bool training) { RunLayerContext dist_context(context.getName(), context.getTrainable(), context.getLoss(), context.executeInPlace(), - getWeightsForContext(), {&in_var}, {&out_var}, - getTensorsForContext()); + context.getLossScale(), getWeightsForContext(), + {&in_var}, {&out_var}, getTensorsForContext()); dist_layer->forwarding(dist_context, training); } @@ -303,8 +303,8 @@ void TimeDistLayer::calcDerivative(RunLayerContext &context) { RunLayerContext dist_context(context.getName(), context.getTrainable(), context.getLoss(), context.executeInPlace(), - getWeightsForContext(), {&in_var}, {&out_var}, - getTensorsForContext()); + context.getLossScale(), getWeightsForContext(), + {&in_var}, {&out_var}, getTensorsForContext()); dist_layer->calcDerivative(dist_context); } @@ -354,8 +354,8 @@ void TimeDistLayer::calcGradient(RunLayerContext &context) { RunLayerContext dist_context(context.getName(), context.getTrainable(), context.getLoss(), context.executeInPlace(), - getWeightsForContext(), {&in_var}, {&out_var}, - getTensorsForContext()); + context.getLossScale(), getWeightsForContext(), + {&in_var}, {&out_var}, getTensorsForContext()); dist_layer->calcGradient(dist_context); } @@ -396,8 +396,8 @@ void 
TimeDistLayer::setBatch(RunLayerContext &context, unsigned int batch) { RunLayerContext dist_context(context.getName(), context.getTrainable(), context.getLoss(), context.executeInPlace(), - getWeightsForContext(), {&in_var}, {&out_var}, - getTensorsForContext()); + context.getLossScale(), getWeightsForContext(), + {&in_var}, {&out_var}, getTensorsForContext()); dist_layer->setBatch(dist_context, batch); diff --git a/nntrainer/meson.build b/nntrainer/meson.build index 02df7744b6..5c7a14d4a5 100644 --- a/nntrainer/meson.build +++ b/nntrainer/meson.build @@ -47,6 +47,7 @@ nntrainer_elements = [ if get_option('enable-opencl') nntrainer_elements += 'opencl' + nntrainer_elements += 'layers/cl_layers' endif foreach elem : nntrainer_elements diff --git a/nntrainer/models/model_common_properties.h b/nntrainer/models/model_common_properties.h index 3776afefca..3435d18e96 100644 --- a/nntrainer/models/model_common_properties.h +++ b/nntrainer/models/model_common_properties.h @@ -217,7 +217,7 @@ class ModelTensorDataType final : public EnumProperty { */ class LossScale : public Property { public: - LossScale(float value = 0.0f); + LossScale(float value = 1.0f); static constexpr const char *key = "loss_scale"; /**< unique key to access */ using prop_tag = float_prop_tag; /**< property type */ }; diff --git a/nntrainer/models/neuralnet.cpp b/nntrainer/models/neuralnet.cpp index d0e542825f..afc560603e 100644 --- a/nntrainer/models/neuralnet.cpp +++ b/nntrainer/models/neuralnet.cpp @@ -412,9 +412,21 @@ void NeuralNetwork::backwarding(int iteration, NNTR_THROW_IF(!opt, std::invalid_argument) << "optimizer is null!"; #endif - std::function, int)> backwarding_op = + std::function, bool)> forwarding_op = + [this, stop_cb, userdata](std::shared_ptr node, + bool training) -> void { + (void)this; + PROFILE_MEM_ANNOTATE("Forwarding for layer: " + node->getName()); + + auto f = std::get<0>(node->getExecutionOrder()); + model_graph.flushCacheExcept(f); + + node->forwarding(training); + }; + + std::function, int)> backwarding_op = [this, stop_cb, userdata](std::shared_ptr node, - int iteration) -> void { + int iteration) -> bool { /** * Do not change this order: * 1. 
calcGradient @@ -448,19 +460,29 @@ void NeuralNetwork::backwarding(int iteration, /** If gradient must be applied and its not gradient mode, calculate * gradient */ - if (!dynamic_training_opt.isGradientMode() && apply_gradient) + if (!dynamic_training_opt.isGradientMode() && apply_gradient) { node->calcGradient(); + + RunLayerContext &rc = node->getRunContext(); + if (rc.isMixedPrecision()) { + for (auto w : rc.getWeights()) { + if (!w->getGradientRef().isValid()) + return false; + } + } + } } model_graph.flushCacheExcept(std::get<2>(node->getExecutionOrder())); PROFILE_MEM_ANNOTATE("CalcDerivative: " + node->getName()); if (stop_cb(userdata)) { - return; + return true; } - if (node->needsCalcDerivative()) + if (node->needsCalcDerivative()) { node->calcDerivative(); + } model_graph.flushCacheExcept(std::get<3>(node->getExecutionOrder())); PROFILE_MEM_ANNOTATE("ApplyGradient: " + node->getName()); @@ -476,9 +498,10 @@ void NeuralNetwork::backwarding(int iteration, opt_->applyGradient(opt_context); }); } + return true; }; - std::function apply_grad_clip_op = + std::function lazy_apply_grad_op = [opt_ = opt.get()](Weight &w, int iteration) -> void { w.calcRegularizationGradient(); w.calcWeightDecayGradient(); @@ -487,8 +510,13 @@ void NeuralNetwork::backwarding(int iteration, opt_->applyGradient(opt_context); }; - model_graph.backwarding(iteration, backwarding_op, apply_grad_clip_op, - stop_cb, userdata); + // return false if the gradient is not valid + bool ret = false; + + while (!ret) { + ret = model_graph.backwarding(iteration, forwarding_op, backwarding_op, + lazy_apply_grad_op, stop_cb, userdata); + } } void NeuralNetwork::save(const std::string &file_path, diff --git a/nntrainer/optimizers/adam.cpp b/nntrainer/optimizers/adam.cpp index 18c0a0fcc1..f7189dda7e 100644 --- a/nntrainer/optimizers/adam.cpp +++ b/nntrainer/optimizers/adam.cpp @@ -36,7 +36,15 @@ Adam::~Adam() {} enum AdamParams { wm, wv }; std::vector Adam::getOptimizerVariableDim(const TensorDim &dim) { - return {dim, dim}; + /** + * @note We assume the optimizer parameters should be full precsion to + * maintain the accuracy even in mixed precision training. + */ + TensorDim wm_dim(dim); + TensorDim wv_dim(dim); + wm_dim.setDataType(ml::train::TensorDim::DataType::FP32); + wv_dim.setDataType(ml::train::TensorDim::DataType::FP32); + return {wm_dim, wv_dim}; } void Adam::exportTo(Exporter &exporter, @@ -64,7 +72,17 @@ double Adam::getUpdatedLearningRate(unsigned int iteration, double ll) const { } void Adam::applyGradient(RunOptimizerContext &context) { - Tensor &x_grad = context.getGradient(); + Tensor empty_tensor; + + Tensor &x_grad = + context.getGradient().getDataType() == ml::train::TensorDim::DataType::FP32 + ? 
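
With the change above, the per-node backwarding callback becomes a predicate: it returns false as soon as a mixed-precision weight gradient turns out to be non-finite, and NeuralNetwork::backwarding simply reruns the pass for that iteration until it succeeds. The schematic below captures only that control flow; Node and its members are hypothetical stand-ins for LayerNode and RunLayerContext, and the real loop goes through model_graph.backwarding() with the forwarding, backwarding and lazy-apply callbacks shown in the hunk.

#include <vector>

struct Node {
  // calcGradient() plus the validity check on the weight gradients,
  // collapsed into a single call for the sketch.
  bool calcGradientFinite() { return true; }
  void calcDerivative() {}
  void applyGradient(int /*iteration*/) {}
};

static bool backward_pass(std::vector<Node> &nodes, int iteration) {
  for (auto &n : nodes) {
    if (!n.calcGradientFinite())
      return false;              // NaN/Inf gradient: abandon this pass
    n.calcDerivative();
    n.applyGradient(iteration);
  }
  return true;
}

void backwarding(std::vector<Node> &nodes, int iteration) {
  bool ok = false;
  while (!ok)                    // re-run the pass until every gradient is finite
    ok = backward_pass(nodes, iteration);
}

In a full dynamic loss scaling scheme this retry is normally paired with lowering the scale before the next attempt; the hunk above only shows the retry itself.
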
context.getGradient() + : empty_tensor; + + if (x_grad.empty()) { + x_grad = context.getGradient().clone(ml::train::TensorDim::DataType::FP32); + context.applyLossScale(x_grad); + } auto &beta1 = std::get(adam_props).get(); auto &beta2 = std::get(adam_props).get(); @@ -91,7 +109,7 @@ void Adam::applyGradient(RunOptimizerContext &context) { denom.add_i(epsilon); wm.divide(denom, x_grad); - context.applyGradient(context.getLearningRate() / biasCorrection1); + context.applyGradient(context.getLearningRate() / biasCorrection1, x_grad); } else { std::function sqrtEps = [epsilon](double f) { @@ -100,8 +118,9 @@ void Adam::applyGradient(RunOptimizerContext &context) { x_grad = wv.apply(sqrtEps, x_grad); x_grad.multiply_i(wm); - context.applyGradient(getUpdatedLearningRate(context.getIteration(), - context.getLearningRate())); + context.applyGradient( + getUpdatedLearningRate(context.getIteration(), context.getLearningRate()), + x_grad); } } diff --git a/nntrainer/optimizers/optimizer_context.cpp b/nntrainer/optimizers/optimizer_context.cpp index da4cd1f7e9..8380ad6613 100644 --- a/nntrainer/optimizers/optimizer_context.cpp +++ b/nntrainer/optimizers/optimizer_context.cpp @@ -42,4 +42,24 @@ Tensor &RunOptimizerContext::getOptimizerVariable(unsigned int idx) const { void RunOptimizerContext::applyGradient(double lr) const { weight->applyGradient(lr); } + +/** + * @brief Apply the gradient with the given learning rate and gradient + */ +void RunOptimizerContext::applyGradient(double lr, Tensor &updated_grad) const { + weight->applyGradient(lr, updated_grad); +} + +/** + * @brief Apply loss scale to gradient (full precision) + */ +void RunOptimizerContext::applyLossScale(Tensor &fp32_grad) { + if (!weight->isMixedPrecision()) + return; + if (fp32_grad.getDataType() != ml::train::TensorDim::DataType::FP32) + throw std::invalid_argument( + "gradient should be fullprecsion to maintain accuracy"); + float loss_scale = weight->getLossScale(); + fp32_grad.divide_i(loss_scale); +} } // namespace nntrainer diff --git a/nntrainer/optimizers/optimizer_context.h b/nntrainer/optimizers/optimizer_context.h index 62f9e0945d..27f028fc52 100644 --- a/nntrainer/optimizers/optimizer_context.h +++ b/nntrainer/optimizers/optimizer_context.h @@ -35,9 +35,7 @@ class RunOptimizerContext { * */ RunOptimizerContext(Weight *w = nullptr, size_t iter = 0, double lr = 0.0) : - weight(w), - iteration(iter), - learning_rate(lr) {} + weight(w), iteration(iter), learning_rate(lr) {} /** * @brief Get the Weight tensor object @@ -75,6 +73,16 @@ class RunOptimizerContext { */ void applyGradient(double lr) const; + /** + * @brief Apply the gradient with the given learning rate and updated + * gradient + * + * @param lr learning rate + * @param updated_grad gradient tensor which is updated. 
(usually it could be + * fp32) + */ + void applyGradient(double lr, Tensor &updated_grad) const; + /** * @brief Get the current iteration value * @@ -89,6 +97,11 @@ class RunOptimizerContext { */ double getLearningRate() const { return learning_rate; } + /** + * @brief Apply loss scale to gradient (full precision) + */ + void applyLossScale(Tensor &fp32_grad); + private: Weight *weight; /**< weights for the optimizer */ size_t iteration; /**< iteration number */ diff --git a/nntrainer/optimizers/sgd.cpp b/nntrainer/optimizers/sgd.cpp index 8b0078e9e6..e4b2209a57 100644 --- a/nntrainer/optimizers/sgd.cpp +++ b/nntrainer/optimizers/sgd.cpp @@ -16,7 +16,20 @@ namespace nntrainer { void SGD::applyGradient(RunOptimizerContext &context) { - context.applyGradient(context.getLearningRate()); + // @todo This could go inside the context. + Tensor empty_tensor; + + Tensor &x_grad = + context.getGradient().getDataType() == ml::train::TensorDim::DataType::FP32 + ? context.getGradient() + : empty_tensor; + + if (x_grad.empty()) { + x_grad = context.getGradient().clone(ml::train::TensorDim::DataType::FP32); + context.applyLossScale(x_grad); + } + + context.applyGradient(context.getLearningRate(), x_grad); } } // namespace nntrainer diff --git a/nntrainer/tensor/blas_avx.cpp b/nntrainer/tensor/blas_avx.cpp index ce59583d6f..411dbcbb5d 100644 --- a/nntrainer/tensor/blas_avx.cpp +++ b/nntrainer/tensor/blas_avx.cpp @@ -20,6 +20,7 @@ namespace nntrainer::avx { +#ifdef ENABLE_FP16 void vcvt_f16_f32(size_t N, const void *input, float *output) { assert(N != 0); assert(input != NULL); @@ -114,4 +115,163 @@ void vcvt_f32_f16(size_t N, const float *input, void *output) { } } +bool isValid(const size_t N, const _Float16 *input) { + assert(N != 0); + assert(input != NULL); + + int temp = 0; + size_t idx = 0; + + const __m256 SIGN_MASK = _mm256_set1_ps(-0.0); + const __m256 INF = _mm256_set1_ps(std::numeric_limits::infinity()); + + // 16 single-precision check : ( X != X ) + for (; N - idx >= 16; idx += 16) { + __m256 vec0 = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input)); + __m256 vec1 = + _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(input + 8))); + + input += 16; + + // check NaN in vec0 + __m256 res = _mm256_cmp_ps(vec0, vec0, _CMP_NEQ_UQ); + temp = temp | _mm256_movemask_ps(res); + if (temp) + return false; + + // check infinity in vec0 + vec0 = _mm256_andnot_ps(SIGN_MASK, vec0); + vec0 = _mm256_cmp_ps(vec0, INF, _CMP_EQ_OQ); + + temp = temp | _mm256_movemask_ps(vec0); + if (temp) + return false; + + // check NaN in vec1 + __m256 res1 = _mm256_cmp_ps(vec1, vec1, _CMP_NEQ_UQ); + temp = temp | _mm256_movemask_ps(res1); + + if (temp) + return false; + + // check infinity in vec1 + vec1 = _mm256_andnot_ps(SIGN_MASK, vec1); + vec1 = _mm256_cmp_ps(vec1, INF, _CMP_EQ_OQ); + + temp = temp | _mm256_movemask_ps(vec1); + + if (temp) + return false; + } + + // 8 single-precision check : ( X != X ) + for (; N - idx >= 8; idx += 8) { + __m256 vec = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)input)); + input += 8; + __m256 res = _mm256_cmp_ps(vec, vec, _CMP_NEQ_UQ); + temp = temp | _mm256_movemask_ps(res); + + if (temp) + return false; + + // check infinity in vec1 + vec = _mm256_andnot_ps(SIGN_MASK, vec); + vec = _mm256_cmp_ps(vec, INF, _CMP_EQ_OQ); + + temp = temp | _mm256_movemask_ps(vec); + + if (temp) + return false; + } + + // remain check : ( X != X || X == Inf ) + while (idx < N) { + if (*input != *input || *input == std::numeric_limits::infinity()) { + return false; + } + ++input; + ++idx; + } + + 
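
On the optimizer side the loss-layer pattern is mirrored: Adam keeps its moment tensors in FP32 regardless of the weight type, and both Adam and SGD clone a non-FP32 gradient to FP32, divide the loss scale back out, and hand only the FP32 copy to applyGradient. A fragment-level sketch, with the same caveats as before (DataType abbreviated; grad, loss_scale, lr and apply_update are illustrative names):

Tensor fp32_buf;
Tensor &g = (grad.getDataType() == DataType::FP32)
              ? grad
              : (fp32_buf = grad.clone(DataType::FP32));
if (loss_scale != 1.0f)
  g.divide_i(loss_scale);        // undo the scaling applied at the loss layer
apply_update(lr, g);             // e.g. weight->applyGradient(lr, g) in the hunk
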
return true; +} +#endif + +bool isValid(const size_t N, const float *input) { + assert(N != 0); + assert(input != NULL); + + int temp = 0; + size_t idx = 0; + + const __m256 SIGN_MASK = _mm256_set1_ps(-0.0); + const __m256 INF = _mm256_set1_ps(std::numeric_limits::infinity()); + + // 16 single-precision check : ( X != X ) + for (; N - idx >= 16; idx += 16) { + __m256 vec0 = _mm256_loadu_ps(input); + __m256 vec1 = _mm256_loadu_ps(input + 8); + input += 16; + __m256 res = _mm256_cmp_ps(vec0, vec0, _CMP_NEQ_UQ); + temp = temp | _mm256_movemask_ps(res); + + if (temp) + return false; + + // check infinity in vec0 + vec0 = _mm256_andnot_ps(SIGN_MASK, vec0); + vec0 = _mm256_cmp_ps(vec0, INF, _CMP_EQ_OQ); + + temp = temp | _mm256_movemask_ps(vec0); + if (temp) + return false; + + __m256 res1 = _mm256_cmp_ps(vec1, vec1, _CMP_NEQ_UQ); + temp = temp | _mm256_movemask_ps(res1); + + if (temp) + return false; + + // check infinity in vec1 + vec1 = _mm256_andnot_ps(SIGN_MASK, vec1); + vec1 = _mm256_cmp_ps(vec1, INF, _CMP_EQ_OQ); + + temp = temp | _mm256_movemask_ps(vec1); + + if (temp) + return false; + } + + // 8 single-precision check : ( X != X ) + for (; N - idx >= 8; idx += 8) { + __m256 vec = _mm256_loadu_ps(input); + input += 8; + __m256 res = _mm256_cmp_ps(vec, vec, _CMP_NEQ_UQ); + temp = temp | _mm256_movemask_ps(res); + + if (temp) + return false; + + // check infinity in vec + vec = _mm256_andnot_ps(SIGN_MASK, vec); + vec = _mm256_cmp_ps(vec, INF, _CMP_EQ_OQ); + + temp = temp | _mm256_movemask_ps(vec); + + if (temp) + return false; + } + + // remain check : ( X != X ) + while (idx < N) { + if (*input != *input || *input == std::numeric_limits::infinity()) { + return false; + } + ++input; + ++idx; + } + + return true; +} + } // namespace nntrainer::avx diff --git a/nntrainer/tensor/blas_avx.h b/nntrainer/tensor/blas_avx.h index ab1270a208..5eabcbdb2c 100644 --- a/nntrainer/tensor/blas_avx.h +++ b/nntrainer/tensor/blas_avx.h @@ -20,6 +20,7 @@ namespace nntrainer::avx { +#ifdef ENABLE_FP16 /** * @brief Converts half-precision floating point values to single-precision * floating point values. @@ -40,6 +41,25 @@ void vcvt_f16_f32(size_t N, const void *input, float *output); */ void vcvt_f32_f16(size_t N, const float *input, void *output); +/** + * @brief check if the X has NaN value + * @note it compare (x!=x || x == inf) + * @param[in] N length of the vector + * @param[in] X half-precision * for Vector X + * @param[out] false if it has NaN or inf + */ +bool isValid(const size_t N, const _Float16 *X); +#endif + +/** + * @brief check if the X has NaN value + * @note it compare (x!=x || x == inf) + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[out] false if it has NaN or inf + */ +bool isValid(const size_t N, const float *X); + } // namespace nntrainer::avx #endif /* __cplusplus */ diff --git a/nntrainer/tensor/blas_interface.cpp b/nntrainer/tensor/blas_interface.cpp index 9be6fb9911..e8fb78d734 100644 --- a/nntrainer/tensor/blas_interface.cpp +++ b/nntrainer/tensor/blas_interface.cpp @@ -842,7 +842,10 @@ void scopy(const unsigned int N, const float *X, const int incX, float *Y, #ifdef BLAS_NUM_THREADS openblas_set_num_threads(BLAS_NUM_THREADS); #endif - cblas_scopy(N, X, incX, Y, incY); + // cblas_scopy(N, (float*)(X), incX, (float*)(Y), incY); + // replace cblas scopy with raw temporary. 
+ for (unsigned int i = 0; i < N; ++i) + Y[i * incY] = X[i * incX]; #else scopy_raw(N, X, incX, Y, incY); #endif @@ -1038,6 +1041,16 @@ static void ele_div_fallback(const unsigned int N, const float *X, } } +static bool is_valid_fallback(const size_t N, const float *X) { + for (size_t i = 0; i < N; ++i) { + if (*X != *X || *X == std::numeric_limits::infinity()) + return false; + ++X; + } + + return true; +} + void ele_mul(const unsigned int N, const float *X, const float *Y, float *Z, float alpha, float beta, unsigned int i_stride, unsigned int o_stride) { @@ -1090,4 +1103,30 @@ void ele_div(const unsigned int N, const float *X, const float *Y, float *Z, ele_div_fallback(N, X, Y, Z, alpha, beta, i_stride, o_stride); } +bool is_valid(const size_t N, ml::train::TensorDim::DataType d_type, + const void *X) { + if (d_type == ml::train::TensorDim::DataType::FP16) { +#ifdef ENABLE_FP16 + const _FP16 *vec = (const _FP16 *)X; +#ifdef USE_NEON + return nntrainer::neon::isValid(N, vec); +#elif defined(USE_AVX) + return nntrainer::avx::isValid(N, vec); +#else + throw std::invalid_argument("Error: enable-fp16 is not enabled"); +#endif +#endif + } else if (d_type == ml::train::TensorDim::DataType::FP32) { + const float *vec = (const float *)X; +#ifdef USE_NEON + return nntrainer::neon::isValid(N, vec); +#elif defined(USE_AVX) + return nntrainer::avx::isValid(N, vec); +#endif + + return is_valid_fallback(N, vec); + } + return false; +} + } // namespace nntrainer diff --git a/nntrainer/tensor/blas_interface.h b/nntrainer/tensor/blas_interface.h index 04a8a23018..2b5ef72922 100644 --- a/nntrainer/tensor/blas_interface.h +++ b/nntrainer/tensor/blas_interface.h @@ -478,6 +478,16 @@ void ele_sub(const unsigned N, const float *X, const float *Y, float *Z, void ele_div(const unsigned N, const float *X, const float *Y, float *Z, float alpha = 1.f, float beta = 0.f, unsigned int i_stride = 1, unsigned int o_stride = 1); + +/** + * @brief check if X array has NaN or inf + * @param[in] N length of the vector + * @param[in] X float/fp16 * for Vector X + * @param[out] bool false if not valide else true + */ +bool is_valid(const size_t N, ml::train::TensorDim::DataType d_type, + const void *X); + } /* namespace nntrainer */ #endif /* __cplusplus */ #endif /* __BLAS_INTERFACE_H__ */ diff --git a/nntrainer/tensor/blas_neon.cpp b/nntrainer/tensor/blas_neon.cpp index 3609b6b8b5..20f4d102ec 100644 --- a/nntrainer/tensor/blas_neon.cpp +++ b/nntrainer/tensor/blas_neon.cpp @@ -546,6 +546,36 @@ void ele_div(const unsigned N, const float *X, const float *Y, float *Z, } } +bool isValid(const size_t N, const float *X) { + size_t i = 0; + float inf_s = std::numeric_limits::infinity(); + float32x4_t inf = vdupq_n_f32(inf_s); + uint16x8_t zero = vdupq_n_f32(0); + + for (; N - i >= 4; i += 4) { + float32x4_t vec = vld1q_f32(&X[i]); + uint32x4_t vcmp = vceqq_f32(vec, vec); + + vcmp = vceqq_f32(vcmp, zero); + + if (vaddvq_u32(vcmp)) + return false; + + vcmp = vceqq_f32(vec, inf); + + if (vaddvq_u16(vcmp)) + return false; + } + + while (i < N) { + if (X[i] != X[i] || X[i] == std::numeric_limits::infinity()) + return false; + ++i; + } + + return true; +} + #ifdef ENABLE_FP16 void hgemv(const __fp16 *A, const __fp16 *X, __fp16 *Y, uint32_t M, uint32_t N, @@ -1192,51 +1222,29 @@ void haxpy(const unsigned int N, const float alpha, const __fp16 *X, } __fp16 hdot(const unsigned int N, const __fp16 *X, const __fp16 *Y) { - - float16x8_t accX8 = vmovq_n_f16(0); - float16x4_t accX4 = vmov_n_f16(0); + float32x4_t accX0_3 = vmovq_n_f32(0.F); + 
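
The new is_valid entry point declared in blas_interface.h above dispatches on the data type to the NEON, AVX, or scalar fallback implementations. A small usage sketch, assuming the header that provides nntrainer::is_valid is on the include path:

#include <limits>
#include <vector>
// #include <blas_interface.h>   // assumed include for nntrainer::is_valid

int main() {
  std::vector<float> grad = {0.1f, -0.2f,
                             std::numeric_limits<float>::infinity()};
  bool ok = nntrainer::is_valid(grad.size(),
                                ml::train::TensorDim::DataType::FP32,
                                grad.data());
  // ok == false here: the buffer holds an Inf, so the iteration would be retried.
  return ok ? 0 : 1;
}
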
float32x4_t accX4_7 = vmovq_n_f32(0.F); unsigned int idx = 0; - __fp16 ret = 0; + unsigned int N8 = (N >> 3) << 3; + float ret = 0; - // processing batch of 8 - for (; (N - idx) >= 8; idx += 8) { + // Adaptive loop for batch size of 8 + for (; idx < N8; idx += 8) { float16x8_t x = vld1q_f16(&X[idx]); float16x8_t y = vld1q_f16(&Y[idx]); - // x*y + accX8 -> accX8 - accX8 = vfmaq_f16(accX8, x, y); - } - - // check at least one batch of 8 is processed - if (N - 8 >= 0) { - __fp16 result[8]; - vst1q_f16(result, accX8); - for (unsigned int i = 0; i < 8; i++) - ret += result[i]; - } - - // processing remaining batch of 4 - for (; (N - idx) >= 4; idx += 4) { - float16x4_t x = vld1_f16(&X[idx]); - float16x4_t y = vld1_f16(&Y[idx]); - - // x*y + accX4 -> accX4 - accX4 = vfma_f16(accX4, x, y); - } - - // check at least one batch of 4 is processed - if (N % 8 >= 4) { - __fp16 result[4]; - vst1_f16(result, accX4); - ret += result[0] + result[1] + result[2] + result[3]; + x = vmulq_f16(x, y); + accX0_3 = vaddq_f32(accX0_3, vcvt_f32_f16(vget_low_f16(x))); + accX4_7 = vaddq_f32(accX4_7, vcvt_f32_f16(vget_high_f16(x))); } + ret += vaddvq_f32(accX0_3) + vaddvq_f32(accX4_7); - // pocessing remaining values + // Loop for remaining indices for (; idx < N; idx++) ret += X[idx] * Y[idx]; - return ret; + return static_cast<__fp16>(ret); } __fp16 hnrm2(const unsigned int N, const __fp16 *X) { @@ -1994,5 +2002,40 @@ void inv_sqrt_inplace(const unsigned int N, __fp16 *X) { } } +bool isValid(const size_t N, const __fp16 *input) { + bool temp = 0; + size_t i = 0; + __fp16 inf_s = std::numeric_limits::infinity(); + float16x8_t inf = vdupq_n_f16(inf_s); + uint16x8_t zero = vdupq_n_f16(0); + + for (; N - i >= 8; i += 8) { + float16x8_t vec = vld1q_f16(&input[i]); + + uint16x8_t vcmp = vceqq_f16(vec, vec); + + vcmp = vceqq_f16(vcmp, zero); + + if (vaddvq_u16(vcmp)) { + return false; + } + + vcmp = vceqq_f16(vec, inf); + + if (vaddvq_u16(vcmp)) { + return false; + } + } + + while (i < N) { + if (input[i] != input[i] || + input[i] == std::numeric_limits::infinity()) { + return false; + } + ++i; + } + return true; +} + #endif } // namespace nntrainer::neon diff --git a/nntrainer/tensor/blas_neon.h b/nntrainer/tensor/blas_neon.h index db1b6a5ccc..978d3428f7 100644 --- a/nntrainer/tensor/blas_neon.h +++ b/nntrainer/tensor/blas_neon.h @@ -148,6 +148,15 @@ void ele_sub(const unsigned N, const float *X, const float *Y, float *Z, void ele_div(const unsigned N, const float *X, const float *Y, float *Z, float alpha = 1.f, float beta = 0.f); +/** + * @brief check if the X has NaN value or Inf + * @note it compare (x!=x || x == inf) + * @param[in] N length of the vector + * @param[in] input float * for Vector X + * @param[out] false if it has NaN or Inf + */ +bool isValid(const size_t N, const float *input); + #ifdef ENABLE_FP16 /** * @brief hgemv computation with neon : Y = alpha*A*X + beta*Y @@ -380,6 +389,15 @@ void hgemm_transAB(const __fp16 *A, const __fp16 *B, float *C, uint32_t M, * @param X __fp16 * for Vector X */ void inv_sqrt_inplace(const unsigned int N, __fp16 *X); + +/** + * @brief check if the X is valid: Check NaN or Inf + * @note it compare (x!=x || x == inf) + * @param[in] N length of the vector + * @param[in] X float * for Vector X + * @param[out] false if it has NaN or Inf + */ +bool isValid(const size_t N, const __fp16 *X); #endif } // namespace nntrainer::neon diff --git a/nntrainer/tensor/hgemm/hgemm.cpp b/nntrainer/tensor/hgemm/hgemm.cpp index a41a5ba6dc..4aaadf331c 100644 --- a/nntrainer/tensor/hgemm/hgemm.cpp 
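
The hdot rewrite above multiplies in FP16 but widens each half of the product vector to FP32 before accumulating, which avoids the overflow and rounding loss of the old FP16 accumulators. A scalar reference with the same numerics, assuming the __fp16 extension already used by the NEON code:

#include <cstddef>

// Multiply in half precision, accumulate in single precision, cast back once,
// matching the vectorized hdot's accumulate-in-FP32 strategy.
__fp16 hdot_ref(size_t n, const __fp16 *x, const __fp16 *y) {
  float acc = 0.0f;
  for (size_t i = 0; i < n; ++i)
    acc += static_cast<float>(x[i]) * static_cast<float>(y[i]);
  return static_cast<__fp16>(acc);
}
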
+++ b/nntrainer/tensor/hgemm/hgemm.cpp @@ -32,15 +32,17 @@ void hgemm_noTrans(const __fp16 *A, const __fp16 *B, float *C32, unsigned int M, unsigned int N, unsigned int K, float alpha, float beta) { if (alpha == 1.F && beta == 0.F) { - if (M % 8 == 0 && N % 16 == 0 && K % 8 == 0) { + // used bitwise operator instead of modulo for performance + // e.g (M % 8) is same as (M & 0x7) which will extract last 3 bits of M + if ((M & 0x7) == 0 && (N & 0xF) == 0 && (K & 0x7) == 0) { hgemm_noTrans_8x16(M, N, K, A, K, B, N, C32, N, alpha, beta); - } else if (M % 8 == 0 && N % 8 == 0 && K % 8 == 0) { + } else if ((M & 0x7) == 0 && (N & 0x7) == 0 && (K & 0x7) == 0) { hgemm_noTrans_8x8(M, N, K, A, K, B, N, C32, N, alpha, beta); - } else if (M % 4 == 0 && N % 8 == 0 && K % 4 == 0) { + } else if ((M & 0x3) == 0 && (N & 0x7) == 0 && (K & 0x3) == 0) { hgemm_noTrans_4x8(M, N, K, A, K, B, N, C32, N, alpha, beta); - } else if (N % 8 == 0) { + } else if ((K & 0x7) == 0 && (N & 0x7) == 0) { hgemm_noTrans_1x8(M, N, K, A, K, B, N, C32, N, alpha, beta); - } else if (N % 4 == 0) { + } else if ((K & 0x7) == 0 && (N & 0x3) == 0) { hgemm_noTrans_1x4(M, N, K, A, K, B, N, C32, N, alpha, beta); } else { hgemm_noTrans_fallback(M, N, K, A, K, B, N, C32, N, alpha, beta); @@ -52,17 +54,19 @@ void hgemm_noTrans(const __fp16 *A, const __fp16 *B, float *C32, unsigned int M, void hgemm_noTrans(const __fp16 *A, const __fp16 *B, __fp16 *C, unsigned int M, unsigned int N, unsigned int K, float alpha, float beta) { if (alpha == 1.F && beta == 0.F) { - if (M % 8 == 0 && N % 16 == 0 && K % 8 == 0) { + // used bitwise operator instead of modulo for performance + // e.g (M % 8) is same as (M & 0x7) which will extract last 3 bits of M + if ((M & 0x7) == 0 && (N & 0xF) == 0 && (K & 0x7) == 0) { hgemm_noTrans_8x16(M, N, K, A, K, B, N, C, N, alpha, beta); - } else if (M % 8 == 0 && N % 8 == 0 && K % 8 == 0) { + } else if ((M & 0x7) == 0 && (N & 0x7) == 0 && (K & 0x7) == 0) { hgemm_noTrans_8x8(M, N, K, A, K, B, N, C, N, alpha, beta); - } else if (M % 4 == 0 && N % 8 == 0 && K % 4 == 0) { + } else if ((M & 0x3) == 0 && (N & 0x7) == 0 && (K & 0x3) == 0) { hgemm_noTrans_4x8(M, N, K, A, K, B, N, C, N, alpha, beta); - } else if (N % 8 == 0) { - hgemm_noTrans_1x8(M, N, K, A, K, B, N, C, N, alpha, beta); - } else if (M % 4 == 0 && N % 4 == 0 && K % 4 == 0) { + } else if ((M & 0x3) == 0 && (N & 0x3) == 0 && (K & 0x3) == 0) { hgemm_noTrans_4x4(M, N, K, A, K, B, N, C, N, alpha, beta); - } else if (N % 4 == 0) { + } else if ((N & 0x7) == 0 && (K & 0x7) == 0) { + hgemm_noTrans_1x8(M, N, K, A, K, B, N, C, N, alpha, beta); + } else if ((N & 0x3) == 0 && (K & 0x7) == 0) { hgemm_noTrans_1x4(M, N, K, A, K, B, N, C, N, alpha, beta); } } @@ -408,6 +412,72 @@ void hgemm_noTrans_1x8(unsigned int M, unsigned int N, unsigned int K, free(sb); } +void hgemm_noTrans_4x4(unsigned int M, unsigned int N, unsigned int K, + const __fp16 *A, unsigned int lda, const __fp16 *B, + unsigned int ldb, float *C, unsigned int ldc, + float alpha, float beta) { + __fp16 *sa = alignedMalloc(M * K); + __fp16 *sb = alignedMalloc(K * N); + + unsigned int ms, mms, ns, ks; + unsigned int m_min, m2_min, n_min, k_min; + for (ms = 0; ms < M; ms += M_BLOCKING) { + m_min = M - ms; + if (m_min > M_BLOCKING) { + m_min = M_BLOCKING; + } + + for (ks = 0; ks < K; ks += k_min) { + k_min = K - ks; + if (k_min >= (K_BLOCKING << 1)) { + k_min = K_BLOCKING; + } else if (k_min > K_BLOCKING) { + k_min = (k_min / 2 + GEMM_UNROLLING_4 - 1) & ~(GEMM_UNROLLING_4 - 1); + } + + n_min = N; + if (N >= N_BLOCKING * 
2) { + n_min = N_BLOCKING; + } else if (N > N_BLOCKING) { + n_min = (n_min / 2 + GEMM_UNROLLING_4 - 1) & ~(GEMM_UNROLLING_4 - 1); + } + packing_B4(k_min, n_min, B + ks * ldb, ldb, sb); + + for (mms = ms; mms < ms + m_min; mms += m2_min) { + m2_min = (ms + m_min) - mms; + if (m2_min >= 3 * GEMM_UNROLLING_4) { + m2_min = 3 * GEMM_UNROLLING_4; + } else if (m2_min >= 2 * GEMM_UNROLLING_4) { + m2_min = 2 * GEMM_UNROLLING_4; + } else if (m2_min > GEMM_UNROLLING_4) { + m2_min = GEMM_UNROLLING_4; + } + + packing_A4(m2_min, k_min, A + mms * lda + ks, lda, + sa + k_min * (mms - ms)); + + HGEMM_KERNEL_4x4(m2_min, n_min, k_min, sa + k_min * (mms - ms), sb, + C + mms * ldc, ldc); + } + + for (ns = n_min; ns < N; ns += n_min) { + n_min = N - ns; + if (n_min >= N_BLOCKING * 2) { + n_min = N_BLOCKING; + } else if (n_min > N_BLOCKING) { + n_min = (n_min / 2 + GEMM_UNROLLING_4 - 1) & ~(GEMM_UNROLLING_4 - 1); + } + + packing_B4(k_min, n_min, B + ns + ldb * ks, ldb, sb); + HGEMM_KERNEL_4x4(m_min, n_min, k_min, sa, sb, C + ms * ldc + ns, ldc); + } + } + } + + free(sa); + free(sb); +} + void hgemm_noTrans_4x8(unsigned int M, unsigned int N, unsigned int K, const __fp16 *A, unsigned int lda, const __fp16 *B, unsigned int ldb, __fp16 *C, unsigned int ldc, diff --git a/nntrainer/tensor/hgemm/hgemm.h b/nntrainer/tensor/hgemm/hgemm.h index b05d89cb01..7c8194edf2 100644 --- a/nntrainer/tensor/hgemm/hgemm.h +++ b/nntrainer/tensor/hgemm/hgemm.h @@ -181,6 +181,26 @@ void hgemm_noTrans_8x8(unsigned int M, unsigned int N, unsigned int K, unsigned int ldb, __fp16 *C, unsigned int ldc, float alpha = 1.F, float beta = 0.F); +/** + * @brief hgemm noTrans computation with 4x4 kernel : C = A*B, + * + * @param M length of the row of matrix A + * @param N length of the col of matrix B + * @param K length of the col of matrix A + * @param A input matrix A + * @param lda length of the col of matrix C + * @param B input matrix B + * @param ldb length of the col of matrix C + * @param C output matrix C + * @param ldc length of the col of matrix C + * @param[in] alpha float number + * @param[in] beta float number + */ +void hgemm_noTrans_4x4(unsigned int M, unsigned int N, unsigned int K, + const __fp16 *A, unsigned int lda, const __fp16 *B, + unsigned int ldb, float *C, unsigned int ldc, + float alpha = 1.F, float beta = 0.F); + /** * @brief hgemm noTrans computation with 8x8 kernel : C = A*B, * diff --git a/nntrainer/tensor/hgemm/hgemm_kernel_4x4.h b/nntrainer/tensor/hgemm/hgemm_kernel_4x4.h index 6166b9407d..7bf75b13b7 100644 --- a/nntrainer/tensor/hgemm/hgemm_kernel_4x4.h +++ b/nntrainer/tensor/hgemm/hgemm_kernel_4x4.h @@ -14,6 +14,193 @@ #include #include +#define INIT_KERNEL_4x4() \ + v24 = vdup_n_f16(0.F); \ + v25 = vdup_n_f16(0.F); \ + v26 = vdup_n_f16(0.F); \ + v27 = vdup_n_f16(0.F); + +// 1. 
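
The hgemm_noTrans dispatch above swaps checks of the form M % 8 == 0 for (M & 0x7) == 0; for a power-of-two divisor the remainder is exactly the low bits, so the two forms are equivalent, and compilers typically perform the same strength reduction for unsigned operands anyway. Two compile-time checks of the identity:

static_assert((24u & 0x7u) == 0 && 24u % 8u == 0, "divisible by 8");
static_assert((25u & 0x7u) == (25u % 8u), "low three bits are the remainder");
// General form, for unsigned x and power-of-two divisor (1u << k):
//   x % (1u << k) == (x & ((1u << k) - 1u))
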
Partial sum 256 digits +#define KERNEL_4x4_ACC16() \ + dv0 = vld1_f16(a); \ + vb0 = vld1_f16(b); \ + v24 = vfma_lane_f16(v24, vb0, dv0, 0); \ + v25 = vfma_lane_f16(v25, vb0, dv0, 1); \ + v26 = vfma_lane_f16(v26, vb0, dv0, 2); \ + v27 = vfma_lane_f16(v27, vb0, dv0, 3); \ + dv1 = vld1_f16(a + 4); \ + vb1 = vld1_f16(b + 4); \ + v24 = vfma_lane_f16(v24, vb1, dv1, 0); \ + v25 = vfma_lane_f16(v25, vb1, dv1, 1); \ + v26 = vfma_lane_f16(v26, vb1, dv1, 2); \ + v27 = vfma_lane_f16(v27, vb1, dv1, 3); \ + dv2 = vld1_f16(a + 4 * 2); \ + vb2 = vld1_f16(b + 4 * 2); \ + v24 = vfma_lane_f16(v24, vb2, dv2, 0); \ + v25 = vfma_lane_f16(v25, vb2, dv2, 1); \ + v26 = vfma_lane_f16(v26, vb2, dv2, 2); \ + v27 = vfma_lane_f16(v27, vb2, dv2, 3); \ + dv3 = vld1_f16(a + 4 * 3); \ + vb3 = vld1_f16(b + 4 * 3); \ + v24 = vfma_lane_f16(v24, vb3, dv3, 0); \ + v25 = vfma_lane_f16(v25, vb3, dv3, 1); \ + v26 = vfma_lane_f16(v26, vb3, dv3, 2); \ + v27 = vfma_lane_f16(v27, vb3, dv3, 3); \ + dv4 = vld1_f16(a + 4 * 4); \ + vb4 = vld1_f16(b + 4 * 4); \ + v24 = vfma_lane_f16(v24, vb4, dv4, 0); \ + v25 = vfma_lane_f16(v25, vb4, dv4, 1); \ + v26 = vfma_lane_f16(v26, vb4, dv4, 2); \ + v27 = vfma_lane_f16(v27, vb4, dv4, 3); \ + dv5 = vld1_f16(a + 4 * 5); \ + vb5 = vld1_f16(b + 4 * 5); \ + v24 = vfma_lane_f16(v24, vb5, dv5, 0); \ + v25 = vfma_lane_f16(v25, vb5, dv5, 1); \ + v26 = vfma_lane_f16(v26, vb5, dv5, 2); \ + v27 = vfma_lane_f16(v27, vb5, dv5, 3); \ + dv6 = vld1_f16(a + 4 * 6); \ + vb6 = vld1_f16(b + 4 * 6); \ + v24 = vfma_lane_f16(v24, vb6, dv6, 0); \ + v25 = vfma_lane_f16(v25, vb6, dv6, 1); \ + v26 = vfma_lane_f16(v26, vb6, dv6, 2); \ + v27 = vfma_lane_f16(v27, vb6, dv6, 3); \ + dv7 = vld1_f16(a + 4 * 7); \ + vb7 = vld1_f16(b + 4 * 7); \ + v24 = vfma_lane_f16(v24, vb7, dv7, 0); \ + v25 = vfma_lane_f16(v25, vb7, dv7, 1); \ + v26 = vfma_lane_f16(v26, vb7, dv7, 2); \ + v27 = vfma_lane_f16(v27, vb7, dv7, 3); \ + dv7 = vld1_f16(a + 4 * 8); \ + vb7 = vld1_f16(b + 4 * 8); \ + v24 = vfma_lane_f16(v24, vb7, dv7, 0); \ + v25 = vfma_lane_f16(v25, vb7, dv7, 1); \ + v26 = vfma_lane_f16(v26, vb7, dv7, 2); \ + v27 = vfma_lane_f16(v27, vb7, dv7, 3); \ + dv7 = vld1_f16(a + 4 * 9); \ + vb7 = vld1_f16(b + 4 * 9); \ + v24 = vfma_lane_f16(v24, vb7, dv7, 0); \ + v25 = vfma_lane_f16(v25, vb7, dv7, 1); \ + v26 = vfma_lane_f16(v26, vb7, dv7, 2); \ + v27 = vfma_lane_f16(v27, vb7, dv7, 3); \ + dv7 = vld1_f16(a + 4 * 10); \ + vb7 = vld1_f16(b + 4 * 10); \ + v24 = vfma_lane_f16(v24, vb7, dv7, 0); \ + v25 = vfma_lane_f16(v25, vb7, dv7, 1); \ + v26 = vfma_lane_f16(v26, vb7, dv7, 2); \ + v27 = vfma_lane_f16(v27, vb7, dv7, 3); \ + dv7 = vld1_f16(a + 4 * 11); \ + vb7 = vld1_f16(b + 4 * 11); \ + v24 = vfma_lane_f16(v24, vb7, dv7, 0); \ + v25 = vfma_lane_f16(v25, vb7, dv7, 1); \ + v26 = vfma_lane_f16(v26, vb7, dv7, 2); \ + v27 = vfma_lane_f16(v27, vb7, dv7, 3); \ + dv7 = vld1_f16(a + 4 * 12); \ + vb7 = vld1_f16(b + 4 * 12); \ + v24 = vfma_lane_f16(v24, vb7, dv7, 0); \ + v25 = vfma_lane_f16(v25, vb7, dv7, 1); \ + v26 = vfma_lane_f16(v26, vb7, dv7, 2); \ + v27 = vfma_lane_f16(v27, vb7, dv7, 3); \ + dv7 = vld1_f16(a + 4 * 13); \ + vb7 = vld1_f16(b + 4 * 13); \ + v24 = vfma_lane_f16(v24, vb7, dv7, 0); \ + v25 = vfma_lane_f16(v25, vb7, dv7, 1); \ + v26 = vfma_lane_f16(v26, vb7, dv7, 2); \ + v27 = vfma_lane_f16(v27, vb7, dv7, 3); \ + dv7 = vld1_f16(a + 4 * 14); \ + vb7 = vld1_f16(b + 4 * 14); \ + v24 = vfma_lane_f16(v24, vb7, dv7, 0); \ + v25 = vfma_lane_f16(v25, vb7, dv7, 1); \ + v26 = vfma_lane_f16(v26, vb7, dv7, 2); \ + v27 = vfma_lane_f16(v27, vb7, dv7, 3); \ + dv7 
= vld1_f16(a + 4 * 15); \ + vb7 = vld1_f16(b + 4 * 15); \ + v24 = vfma_lane_f16(v24, vb7, dv7, 0); \ + v25 = vfma_lane_f16(v25, vb7, dv7, 1); \ + v26 = vfma_lane_f16(v26, vb7, dv7, 2); \ + v27 = vfma_lane_f16(v27, vb7, dv7, 3); \ + l += 16; \ + __builtin_prefetch(b + 64, 0, 3); \ + __builtin_prefetch(a + 64, 0, 3); \ + b += 4 * 16; \ + a += 4 * 16; + +// 2. Partial sum 128 digits +#define KERNEL_4x4_ACC8() \ + dv0 = vld1_f16(a); \ + vb0 = vld1_f16(b); \ + v24 = vfma_lane_f16(v24, vb0, dv0, 0); \ + v25 = vfma_lane_f16(v25, vb0, dv0, 1); \ + v26 = vfma_lane_f16(v26, vb0, dv0, 2); \ + v27 = vfma_lane_f16(v27, vb0, dv0, 3); \ + dv1 = vld1_f16(a + 4); \ + vb1 = vld1_f16(b + 4); \ + v24 = vfma_lane_f16(v24, vb1, dv1, 0); \ + v25 = vfma_lane_f16(v25, vb1, dv1, 1); \ + v26 = vfma_lane_f16(v26, vb1, dv1, 2); \ + v27 = vfma_lane_f16(v27, vb1, dv1, 3); \ + dv2 = vld1_f16(a + 8); \ + vb2 = vld1_f16(b + 8); \ + v24 = vfma_lane_f16(v24, vb2, dv2, 0); \ + v25 = vfma_lane_f16(v25, vb2, dv2, 1); \ + v26 = vfma_lane_f16(v26, vb2, dv2, 2); \ + v27 = vfma_lane_f16(v27, vb2, dv2, 3); \ + dv3 = vld1_f16(a + 12); \ + vb3 = vld1_f16(b + 12); \ + v24 = vfma_lane_f16(v24, vb3, dv3, 0); \ + v25 = vfma_lane_f16(v25, vb3, dv3, 1); \ + v26 = vfma_lane_f16(v26, vb3, dv3, 2); \ + v27 = vfma_lane_f16(v27, vb3, dv3, 3); \ + dv4 = vld1_f16(a + 16); \ + vb4 = vld1_f16(b + 16); \ + v24 = vfma_lane_f16(v24, vb4, dv4, 0); \ + v25 = vfma_lane_f16(v25, vb4, dv4, 1); \ + v26 = vfma_lane_f16(v26, vb4, dv4, 2); \ + v27 = vfma_lane_f16(v27, vb4, dv4, 3); \ + dv5 = vld1_f16(a + 20); \ + vb5 = vld1_f16(b + 20); \ + v24 = vfma_lane_f16(v24, vb5, dv5, 0); \ + v25 = vfma_lane_f16(v25, vb5, dv5, 1); \ + v26 = vfma_lane_f16(v26, vb5, dv5, 2); \ + v27 = vfma_lane_f16(v27, vb5, dv5, 3); \ + dv6 = vld1_f16(a + 24); \ + vb6 = vld1_f16(b + 24); \ + v24 = vfma_lane_f16(v24, vb6, dv6, 0); \ + v25 = vfma_lane_f16(v25, vb6, dv6, 1); \ + v26 = vfma_lane_f16(v26, vb6, dv6, 2); \ + v27 = vfma_lane_f16(v27, vb6, dv6, 3); \ + dv7 = vld1_f16(a + 28); \ + vb7 = vld1_f16(b + 28); \ + v24 = vfma_lane_f16(v24, vb7, dv7, 0); \ + v25 = vfma_lane_f16(v25, vb7, dv7, 1); \ + v26 = vfma_lane_f16(v26, vb7, dv7, 2); \ + v27 = vfma_lane_f16(v27, vb7, dv7, 3); \ + l += 8; \ + __builtin_prefetch(b + 32, 0, 3); \ + __builtin_prefetch(a + 32, 0, 3); \ + b += 4 * 8; \ + a += 4 * 8; + +// 2. 
Partial sum 16 digits +#define KERNEL_4x4_ACC1() \ + dv0 = vld1_f16(a); \ + vb0 = vld1_f16(b); \ + v24 = vfma_lane_f16(v24, vb0, dv0, 0); \ + v25 = vfma_lane_f16(v25, vb0, dv0, 1); \ + v26 = vfma_lane_f16(v26, vb0, dv0, 2); \ + v27 = vfma_lane_f16(v27, vb0, dv0, 3); \ + l += 1; \ + __builtin_prefetch(b + 4, 0, 3); \ + __builtin_prefetch(a + 4, 0, 3); \ + b += 4 * 1; \ + a += 4 * 1; + +#define SAVE_KERNEL_4X4_F16_F32() \ + vst1q_f32(c, vaddq_f32(vld1q_f32(c), vcvt_f32_f16(v24))); \ + vst1q_f32(c + ldc, vaddq_f32(vld1q_f32(c + ldc), vcvt_f32_f16(v25))); \ + vst1q_f32(c + 2 * ldc, \ + vaddq_f32(vld1q_f32(c + 2 * ldc), vcvt_f32_f16(v26))); \ + vst1q_f32(c + 3 * ldc, vaddq_f32(vld1q_f32(c + 3 * ldc), vcvt_f32_f16(v27))); + /** * @brief hgemm 4x4 kernel sc = sa * sb * @@ -37,10 +224,11 @@ void hgemm_kernel_4x4(unsigned int M, unsigned int N, unsigned int K, __builtin_prefetch(b, 0, 3); __builtin_prefetch(a, 0, 3); - float16x4_t v24 = {0}; - float16x4_t v25 = {0}; - float16x4_t v26 = {0}; - float16x4_t v27 = {0}; + float16x4_t v24; + float16x4_t v25; + float16x4_t v26; + float16x4_t v27; + INIT_KERNEL_4x4(); for (l = 0; l < K; l += VL_FP16_HALF) { float16x4_t v0 = vld1_f16(b); @@ -101,3 +289,59 @@ void hgemm_kernel_4x4(unsigned int M, unsigned int N, unsigned int K, b = sb; } } + +/** + * @brief hgemm 4x4 kernel sc = sa * sb + * + * @param m length of the row of matrix A + * @param n length of the col of matrix B + * @param k length of the col of matrix A + * @param sa sub-matrix of input matrix A + * @param sb sub-matrix of input matrix B + * @param sc sub-matrix of output matrix C + * @param ldc leading dimension of matrix C + */ +void hgemm_kernel_4x4(unsigned int M, unsigned int N, unsigned int K, + __fp16 *sa, __fp16 *sb, float *sc, unsigned int ldc) { + assert(M > 0 && N > 0 && K > 0); + assert(M % 4 == 0 && N % 4 == 0 && K % 4 == 0); + + __fp16 *a = sa, *b = sb; + float *c = sc; + unsigned int i, j, l; + unsigned int K16 = (K >> 4) << 4; + unsigned int K8 = (K >> 3) << 3; + for (i = 0; i < M; i += VL_FP16_HALF) { + for (j = 0; j < N; j += VL_FP16_HALF) { + __builtin_prefetch(b, 0, 3); + __builtin_prefetch(a, 0, 3); + + float16x4_t v24, v25, v26, v27; + float16x4_t dv0, dv1, dv2, dv3, dv4, dv5, dv6, dv7; + float16x4_t vb0, vb1, vb2, vb3, vb4, vb5, vb6, vb7; + l = 0; + for (; l < K16;) { + INIT_KERNEL_4x4(); + KERNEL_4x4_ACC16(); + SAVE_KERNEL_4X4_F16_F32(); + } + for (; l < K8;) { + INIT_KERNEL_4x4(); + KERNEL_4x4_ACC8(); + SAVE_KERNEL_4X4_F16_F32(); + } + for (; l < K;) { + INIT_KERNEL_4x4(); + KERNEL_4x4_ACC1(); + SAVE_KERNEL_4X4_F16_F32(); + } + + c += 4; + a -= 4 * K; + } + sc += ldc * 4; + c = sc; + a += 4 * K; + b = sb; + } +} diff --git a/nntrainer/tensor/hgemm/hgemm_kernel_4x8.h b/nntrainer/tensor/hgemm/hgemm_kernel_4x8.h index dce6659934..01204457e9 100644 --- a/nntrainer/tensor/hgemm/hgemm_kernel_4x8.h +++ b/nntrainer/tensor/hgemm/hgemm_kernel_4x8.h @@ -14,15 +14,118 @@ #include #include -/// @note Following KERNELs are the combinations of accuracy-latency -/// tradeoff. User can select which kernel to use by replacing them. +#define INIT_KERNEL_4X8() \ + v0 = vdupq_n_f16(0.F); \ + v3 = vdupq_n_f16(0.F); \ + v6 = vdupq_n_f16(0.F); \ + v9 = vdupq_n_f16(0.F); -// 1. Partial sum 256 digits : worst accuracy, best latency +// 1. 
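
The new 4x4 path accumulates at most 16, 8, or 1 rank-one updates in FP16 registers (the ACC16/ACC8/ACC1 macros) and then SAVE_KERNEL_4X4_F16_F32 widens the partial sums and adds them into the FP32 output, so a half-precision partial sum never spans more than a small block of K. The scalar sketch below mirrors that blocking idea for a single dot product; blocked_dot and block are illustrative names, and the __fp16 extension is assumed as above.

#include <algorithm>
#include <cstddef>

float blocked_dot(size_t n, const __fp16 *a, const __fp16 *b, size_t block) {
  float c = 0.0f;                               // FP32 destination, like C32
  for (size_t i = 0; i < n;) {
    __fp16 partial = static_cast<__fp16>(0.0f); // one "ACC" block kept in FP16
    size_t end = std::min(n, i + block);
    for (; i < end; ++i)
      partial += a[i] * b[i];
    c += static_cast<float>(partial);           // SAVE: widen and add into FP32
  }
  return c;
}
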
Partial sum 256 digits +#define KERNEL_4x8_ACC16() \ + dv0 = vld1_f16(a); \ + v24 = vld1q_f16(b); \ + v0 = vfmaq_lane_f16(v0, v24, dv0, 0); \ + v3 = vfmaq_lane_f16(v3, v24, dv0, 1); \ + v6 = vfmaq_lane_f16(v6, v24, dv0, 2); \ + v9 = vfmaq_lane_f16(v9, v24, dv0, 3); \ + dv1 = vld1_f16(a + 4); \ + v25 = vld1q_f16(b + 8); \ + v0 = vfmaq_lane_f16(v0, v25, dv1, 0); \ + v3 = vfmaq_lane_f16(v3, v25, dv1, 1); \ + v6 = vfmaq_lane_f16(v6, v25, dv1, 2); \ + v9 = vfmaq_lane_f16(v9, v25, dv1, 3); \ + dv2 = vld1_f16(a + 4 * 2); \ + v26 = vld1q_f16(b + 8 * 2); \ + v0 = vfmaq_lane_f16(v0, v26, dv2, 0); \ + v3 = vfmaq_lane_f16(v3, v26, dv2, 1); \ + v6 = vfmaq_lane_f16(v6, v26, dv2, 2); \ + v9 = vfmaq_lane_f16(v9, v26, dv2, 3); \ + dv3 = vld1_f16(a + 4 * 3); \ + v27 = vld1q_f16(b + 8 * 3); \ + v0 = vfmaq_lane_f16(v0, v27, dv3, 0); \ + v3 = vfmaq_lane_f16(v3, v27, dv3, 1); \ + v6 = vfmaq_lane_f16(v6, v27, dv3, 2); \ + v9 = vfmaq_lane_f16(v9, v27, dv3, 3); \ + dv4 = vld1_f16(a + 4 * 4); \ + v28 = vld1q_f16(b + 8 * 4); \ + v0 = vfmaq_lane_f16(v0, v28, dv4, 0); \ + v3 = vfmaq_lane_f16(v3, v28, dv4, 1); \ + v6 = vfmaq_lane_f16(v6, v28, dv4, 2); \ + v9 = vfmaq_lane_f16(v9, v28, dv4, 3); \ + dv5 = vld1_f16(a + 4 * 5); \ + v29 = vld1q_f16(b + 8 * 5); \ + v0 = vfmaq_lane_f16(v0, v29, dv5, 0); \ + v3 = vfmaq_lane_f16(v3, v29, dv5, 1); \ + v6 = vfmaq_lane_f16(v6, v29, dv5, 2); \ + v9 = vfmaq_lane_f16(v9, v29, dv5, 3); \ + dv6 = vld1_f16(a + 4 * 6); \ + v30 = vld1q_f16(b + 8 * 6); \ + v0 = vfmaq_lane_f16(v0, v30, dv6, 0); \ + v3 = vfmaq_lane_f16(v3, v30, dv6, 1); \ + v6 = vfmaq_lane_f16(v6, v30, dv6, 2); \ + v9 = vfmaq_lane_f16(v9, v30, dv6, 3); \ + dv7 = vld1_f16(a + 4 * 7); \ + v31 = vld1q_f16(b + 8 * 7); \ + v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \ + v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \ + v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \ + v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \ + dv7 = vld1_f16(a + 4 * 8); \ + v31 = vld1q_f16(b + 8 * 8); \ + v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \ + v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \ + v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \ + v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \ + dv7 = vld1_f16(a + 4 * 9); \ + v31 = vld1q_f16(b + 8 * 9); \ + v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \ + v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \ + v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \ + v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \ + dv7 = vld1_f16(a + 4 * 10); \ + v31 = vld1q_f16(b + 8 * 10); \ + v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \ + v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \ + v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \ + v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \ + dv7 = vld1_f16(a + 4 * 11); \ + v31 = vld1q_f16(b + 8 * 11); \ + v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \ + v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \ + v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \ + v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \ + dv7 = vld1_f16(a + 4 * 12); \ + v31 = vld1q_f16(b + 8 * 12); \ + v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \ + v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \ + v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \ + v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \ + dv7 = vld1_f16(a + 4 * 13); \ + v31 = vld1q_f16(b + 8 * 13); \ + v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \ + v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \ + v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \ + v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \ + dv7 = vld1_f16(a + 4 * 14); \ + v31 = vld1q_f16(b + 8 * 14); \ + v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \ + v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \ + v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \ + v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \ + dv7 = vld1_f16(a + 4 * 15); \ + v31 = 
vld1q_f16(b + 8 * 15); \ + v0 = vfmaq_lane_f16(v0, v31, dv7, 0); \ + v3 = vfmaq_lane_f16(v3, v31, dv7, 1); \ + v6 = vfmaq_lane_f16(v6, v31, dv7, 2); \ + v9 = vfmaq_lane_f16(v9, v31, dv7, 3); \ + l += 16; \ + __builtin_prefetch(b + 128, 0, 3); \ + __builtin_prefetch(a + 64, 0, 3); \ + b += 8 * 16; \ + a += 4 * 16; + +// 1. Partial sum 256 digits #define KERNEL_4x8_ACC8() \ - v0 = vdupq_n_f16(0.F); \ - v3 = vdupq_n_f16(0.F); \ - v6 = vdupq_n_f16(0.F); \ - v9 = vdupq_n_f16(0.F); \ dv0 = vld1_f16(a); \ v24 = vld1q_f16(b); \ v0 = vfmaq_lane_f16(v0, v24, dv0, 0); \ @@ -77,12 +180,8 @@ b += 8 * 8; \ a += 4 * 8; -// 2. Partial sum 128 digits : medium accuracy, medium latency +// 2. Partial sum 128 digits #define KERNEL_4x8_ACC4() \ - v0 = vdupq_n_f16(0.F); \ - v3 = vdupq_n_f16(0.F); \ - v6 = vdupq_n_f16(0.F); \ - v9 = vdupq_n_f16(0.F); \ dv0 = vld1_f16(a); \ v24 = vld1q_f16(b); \ v0 = vfmaq_lane_f16(v0, v24, dv0, 0); \ @@ -113,12 +212,8 @@ b += 8 * 4; \ a += 4 * 4; -// 3. Partial sum 32 digits : Best accuracy, worst latency +// 3. Partial sum 32 digits #define KERNEL_4x8_ACC1() \ - v0 = vdupq_n_f16(0.F); \ - v3 = vdupq_n_f16(0.F); \ - v6 = vdupq_n_f16(0.F); \ - v9 = vdupq_n_f16(0.F); \ dv0 = vld1_f16(a); \ v24 = vld1q_f16(b); \ v0 = vfmaq_lane_f16(v0, v24, dv0, 0); \ @@ -131,6 +226,24 @@ b += 8 * 1; \ a += 4 * 1; +#define SAVE_KERNEL_4X8_F16_F32() \ + vst1q_f32(c, vaddq_f32(vld1q_f32(c), vcvt_f32_f16(vget_low_f16(v0)))); \ + vst1q_f32(c + ldc, \ + vaddq_f32(vld1q_f32(c + ldc), vcvt_f32_f16(vget_low_f16(v3)))); \ + vst1q_f32(c + 2 * ldc, vaddq_f32(vld1q_f32(c + 2 * ldc), \ + vcvt_f32_f16(vget_low_f16(v6)))); \ + vst1q_f32(c + 3 * ldc, vaddq_f32(vld1q_f32(c + 3 * ldc), \ + vcvt_f32_f16(vget_low_f16(v9)))); \ + \ + vst1q_f32(c + 4, \ + vaddq_f32(vld1q_f32(c + 4), vcvt_f32_f16(vget_high_f16(v0)))); \ + vst1q_f32(c + 4 + ldc, vaddq_f32(vld1q_f32(c + 4 + ldc), \ + vcvt_f32_f16(vget_high_f16(v3)))); \ + vst1q_f32(c + 4 + 2 * ldc, vaddq_f32(vld1q_f32(c + 4 + 2 * ldc), \ + vcvt_f32_f16(vget_high_f16(v6)))); \ + vst1q_f32(c + 4 + 3 * ldc, vaddq_f32(vld1q_f32(c + 4 + 3 * ldc), \ + vcvt_f32_f16(vget_high_f16(v9)))); + /** * @brief hgemm 4x8 kernel sc = sa * sb * @@ -148,7 +261,7 @@ void hgemm_kernel_4x8(unsigned int M, unsigned int N, unsigned int K, assert(M % 4 == 0 && N % 8 == 0); __fp16 *a = sa, *b = sb, *c = sc; - unsigned int k8 = (K >> 3) << 3; + unsigned int K8 = (K >> 3) << 3; unsigned int i, j, l; for (i = 0; i < M; i += 4) { for (j = 0; j < N; j += 8) { @@ -157,23 +270,18 @@ void hgemm_kernel_4x8(unsigned int M, unsigned int N, unsigned int K, float16x8_t v0, v3, v6, v9; float16x8_t v24, v25, v26, v27, v28, v29, v30, v31; float16x4_t dv0, dv1, dv2, dv3, dv4, dv5, dv6, dv7; + INIT_KERNEL_4X8(); l = 0; - for (; l < k8;) { + for (; l < K8;) { KERNEL_4x8_ACC8(); - - vst1q_f16(c, vaddq_f16(vld1q_f16(c), v0)); - vst1q_f16(c + ldc, vaddq_f16(vld1q_f16(c + ldc), v3)); - vst1q_f16(c + 2 * ldc, vaddq_f16(vld1q_f16(c + 2 * ldc), v6)); - vst1q_f16(c + 3 * ldc, vaddq_f16(vld1q_f16(c + 3 * ldc), v9)); } for (; l < K;) { KERNEL_4x8_ACC1(); - - vst1q_f16(c, vaddq_f16(vld1q_f16(c), v0)); - vst1q_f16(c + ldc, vaddq_f16(vld1q_f16(c + ldc), v3)); - vst1q_f16(c + 2 * ldc, vaddq_f16(vld1q_f16(c + 2 * ldc), v6)); - vst1q_f16(c + 3 * ldc, vaddq_f16(vld1q_f16(c + 3 * ldc), v9)); } + vst1q_f16(c, vaddq_f16(vld1q_f16(c), v0)); + vst1q_f16(c + ldc, vaddq_f16(vld1q_f16(c + ldc), v3)); + vst1q_f16(c + 2 * ldc, vaddq_f16(vld1q_f16(c + 2 * ldc), v6)); + vst1q_f16(c + 3 * ldc, vaddq_f16(vld1q_f16(c + 3 * ldc), v9)); c += 8; a -= 4 
* K; } @@ -202,7 +310,9 @@ void hgemm_kernel_4x8(unsigned int M, unsigned int N, unsigned int K, __fp16 *a = sa, *b = sb; float *c = sc; - unsigned int k8 = (K >> 3) << 3; + unsigned int K16 = (K >> 4) << 4; + unsigned int K8 = (K >> 3) << 3; + unsigned int K4 = (K >> 2) << 2; unsigned int i, j, l; for (i = 0; i < M; i += 4) { for (j = 0; j < N; j += 8) { @@ -212,45 +322,25 @@ void hgemm_kernel_4x8(unsigned int M, unsigned int N, unsigned int K, float16x8_t v24, v25, v26, v27, v28, v29, v30, v31; float16x4_t dv0, dv1, dv2, dv3, dv4, dv5, dv6, dv7; l = 0; - for (; l < k8;) { + for (; l < K16;) { + INIT_KERNEL_4X8(); + KERNEL_4x8_ACC16(); + SAVE_KERNEL_4X8_F16_F32(); + } + for (; l < K8;) { + INIT_KERNEL_4X8(); KERNEL_4x8_ACC8(); - - vst1q_f32(c, vaddq_f32(vld1q_f32(c), vcvt_f32_f16(vget_low_f16(v0)))); - vst1q_f32(c + ldc, vaddq_f32(vld1q_f32(c + ldc), - vcvt_f32_f16(vget_low_f16(v3)))); - vst1q_f32(c + 2 * ldc, vaddq_f32(vld1q_f32(c + 2 * ldc), - vcvt_f32_f16(vget_low_f16(v6)))); - vst1q_f32(c + 3 * ldc, vaddq_f32(vld1q_f32(c + 3 * ldc), - vcvt_f32_f16(vget_low_f16(v9)))); - - vst1q_f32(c + 4, - vaddq_f32(vld1q_f32(c + 4), vcvt_f32_f16(vget_high_f16(v0)))); - vst1q_f32(c + 4 + ldc, vaddq_f32(vld1q_f32(c + 4 + ldc), - vcvt_f32_f16(vget_high_f16(v3)))); - vst1q_f32(c + 4 + 2 * ldc, vaddq_f32(vld1q_f32(c + 4 + 2 * ldc), - vcvt_f32_f16(vget_high_f16(v6)))); - vst1q_f32(c + 4 + 3 * ldc, vaddq_f32(vld1q_f32(c + 4 + 3 * ldc), - vcvt_f32_f16(vget_high_f16(v9)))); + SAVE_KERNEL_4X8_F16_F32(); + } + for (; l < K4;) { + INIT_KERNEL_4X8(); + KERNEL_4x8_ACC4(); + SAVE_KERNEL_4X8_F16_F32(); } for (; l < K;) { + INIT_KERNEL_4X8(); KERNEL_4x8_ACC1(); - - vst1q_f32(c, vaddq_f32(vld1q_f32(c), vcvt_f32_f16(vget_low_f16(v0)))); - vst1q_f32(c + ldc, vaddq_f32(vld1q_f32(c + ldc), - vcvt_f32_f16(vget_low_f16(v3)))); - vst1q_f32(c + 2 * ldc, vaddq_f32(vld1q_f32(c + 2 * ldc), - vcvt_f32_f16(vget_low_f16(v6)))); - vst1q_f32(c + 3 * ldc, vaddq_f32(vld1q_f32(c + 3 * ldc), - vcvt_f32_f16(vget_low_f16(v9)))); - - vst1q_f32(c + 4, - vaddq_f32(vld1q_f32(c + 4), vcvt_f32_f16(vget_high_f16(v0)))); - vst1q_f32(c + 4 + ldc, vaddq_f32(vld1q_f32(c + 4 + ldc), - vcvt_f32_f16(vget_high_f16(v3)))); - vst1q_f32(c + 4 + 2 * ldc, vaddq_f32(vld1q_f32(c + 4 + 2 * ldc), - vcvt_f32_f16(vget_high_f16(v6)))); - vst1q_f32(c + 4 + 3 * ldc, vaddq_f32(vld1q_f32(c + 4 + 3 * ldc), - vcvt_f32_f16(vget_high_f16(v9)))); + SAVE_KERNEL_4X8_F16_F32(); } c += 8; a -= 4 * K; diff --git a/nntrainer/tensor/hgemm/hgemm_kernel_8x16.h b/nntrainer/tensor/hgemm/hgemm_kernel_8x16.h index 7cac545809..a89a6b5421 100644 --- a/nntrainer/tensor/hgemm/hgemm_kernel_8x16.h +++ b/nntrainer/tensor/hgemm/hgemm_kernel_8x16.h @@ -14,27 +14,338 @@ #include #include -/// @note Following KERNELs are the combinations of accuracy-latency -/// tradeoff. User can select which kernel to use by replacing them. +#define INIT_KERNEL_8X16() \ + v0_7 = vdupq_n_f16(0.F); \ + v8_15 = vdupq_n_f16(0.F); \ + v16_23 = vdupq_n_f16(0.F); \ + v24_31 = vdupq_n_f16(0.F); \ + v32_39 = vdupq_n_f16(0.F); \ + v40_47 = vdupq_n_f16(0.F); \ + v48_55 = vdupq_n_f16(0.F); \ + v56_63 = vdupq_n_f16(0.F); \ + v64_71 = vdupq_n_f16(0.F); \ + v72_79 = vdupq_n_f16(0.F); \ + v80_87 = vdupq_n_f16(0.F); \ + v88_95 = vdupq_n_f16(0.F); \ + v96_103 = vdupq_n_f16(0.F); \ + v104_111 = vdupq_n_f16(0.F); \ + v112_119 = vdupq_n_f16(0.F); \ + v120_127 = vdupq_n_f16(0.F); -// 1. Partial sum 1024 digits : Worst accuracy, best latency +// 1. 
Partial sum 2048 digits +#define KERNEL_8x16_ACC16() \ + va0 = vld1q_f16(a); \ + v24 = vld1q_f16(b); \ + v25 = vld1q_f16(b + 8); \ + v0_7 = vfmaq_laneq_f16(v0_7, v24, va0, 0); \ + v8_15 = vfmaq_laneq_f16(v8_15, v24, va0, 1); \ + v16_23 = vfmaq_laneq_f16(v16_23, v24, va0, 2); \ + v24_31 = vfmaq_laneq_f16(v24_31, v24, va0, 3); \ + v32_39 = vfmaq_laneq_f16(v32_39, v24, va0, 4); \ + v40_47 = vfmaq_laneq_f16(v40_47, v24, va0, 5); \ + v48_55 = vfmaq_laneq_f16(v48_55, v24, va0, 6); \ + v56_63 = vfmaq_laneq_f16(v56_63, v24, va0, 7); \ + v64_71 = vfmaq_laneq_f16(v64_71, v25, va0, 0); \ + v72_79 = vfmaq_laneq_f16(v72_79, v25, va0, 1); \ + v80_87 = vfmaq_laneq_f16(v80_87, v25, va0, 2); \ + v88_95 = vfmaq_laneq_f16(v88_95, v25, va0, 3); \ + v96_103 = vfmaq_laneq_f16(v96_103, v25, va0, 4); \ + v104_111 = vfmaq_laneq_f16(v104_111, v25, va0, 5); \ + v112_119 = vfmaq_laneq_f16(v112_119, v25, va0, 6); \ + v120_127 = vfmaq_laneq_f16(v120_127, v25, va0, 7); \ + va1 = vld1q_f16(a + 8); \ + v26 = vld1q_f16(b + 8 * 2); \ + v27 = vld1q_f16(b + 8 * 3); \ + v0_7 = vfmaq_laneq_f16(v0_7, v26, va1, 0); \ + v8_15 = vfmaq_laneq_f16(v8_15, v26, va1, 1); \ + v16_23 = vfmaq_laneq_f16(v16_23, v26, va1, 2); \ + v24_31 = vfmaq_laneq_f16(v24_31, v26, va1, 3); \ + v32_39 = vfmaq_laneq_f16(v32_39, v26, va1, 4); \ + v40_47 = vfmaq_laneq_f16(v40_47, v26, va1, 5); \ + v48_55 = vfmaq_laneq_f16(v48_55, v26, va1, 6); \ + v56_63 = vfmaq_laneq_f16(v56_63, v26, va1, 7); \ + v64_71 = vfmaq_laneq_f16(v64_71, v27, va1, 0); \ + v72_79 = vfmaq_laneq_f16(v72_79, v27, va1, 1); \ + v80_87 = vfmaq_laneq_f16(v80_87, v27, va1, 2); \ + v88_95 = vfmaq_laneq_f16(v88_95, v27, va1, 3); \ + v96_103 = vfmaq_laneq_f16(v96_103, v27, va1, 4); \ + v104_111 = vfmaq_laneq_f16(v104_111, v27, va1, 5); \ + v112_119 = vfmaq_laneq_f16(v112_119, v27, va1, 6); \ + v120_127 = vfmaq_laneq_f16(v120_127, v27, va1, 7); \ + va2 = vld1q_f16(a + 8 * 2); \ + v28 = vld1q_f16(b + 8 * 4); \ + v29 = vld1q_f16(b + 8 * 5); \ + v0_7 = vfmaq_laneq_f16(v0_7, v28, va2, 0); \ + v8_15 = vfmaq_laneq_f16(v8_15, v28, va2, 1); \ + v16_23 = vfmaq_laneq_f16(v16_23, v28, va2, 2); \ + v24_31 = vfmaq_laneq_f16(v24_31, v28, va2, 3); \ + v32_39 = vfmaq_laneq_f16(v32_39, v28, va2, 4); \ + v40_47 = vfmaq_laneq_f16(v40_47, v28, va2, 5); \ + v48_55 = vfmaq_laneq_f16(v48_55, v28, va2, 6); \ + v56_63 = vfmaq_laneq_f16(v56_63, v28, va2, 7); \ + v64_71 = vfmaq_laneq_f16(v64_71, v29, va2, 0); \ + v72_79 = vfmaq_laneq_f16(v72_79, v29, va2, 1); \ + v80_87 = vfmaq_laneq_f16(v80_87, v29, va2, 2); \ + v88_95 = vfmaq_laneq_f16(v88_95, v29, va2, 3); \ + v96_103 = vfmaq_laneq_f16(v96_103, v29, va2, 4); \ + v104_111 = vfmaq_laneq_f16(v104_111, v29, va2, 5); \ + v112_119 = vfmaq_laneq_f16(v112_119, v29, va2, 6); \ + v120_127 = vfmaq_laneq_f16(v120_127, v29, va2, 7); \ + va3 = vld1q_f16(a + 8 * 3); \ + v30 = vld1q_f16(b + 8 * 6); \ + v31 = vld1q_f16(b + 8 * 7); \ + v0_7 = vfmaq_laneq_f16(v0_7, v30, va3, 0); \ + v8_15 = vfmaq_laneq_f16(v8_15, v30, va3, 1); \ + v16_23 = vfmaq_laneq_f16(v16_23, v30, va3, 2); \ + v24_31 = vfmaq_laneq_f16(v24_31, v30, va3, 3); \ + v32_39 = vfmaq_laneq_f16(v32_39, v30, va3, 4); \ + v40_47 = vfmaq_laneq_f16(v40_47, v30, va3, 5); \ + v48_55 = vfmaq_laneq_f16(v48_55, v30, va3, 6); \ + v56_63 = vfmaq_laneq_f16(v56_63, v30, va3, 7); \ + v64_71 = vfmaq_laneq_f16(v64_71, v31, va3, 0); \ + v72_79 = vfmaq_laneq_f16(v72_79, v31, va3, 1); \ + v80_87 = vfmaq_laneq_f16(v80_87, v31, va3, 2); \ + v88_95 = vfmaq_laneq_f16(v88_95, v31, va3, 3); \ + v96_103 = vfmaq_laneq_f16(v96_103, v31, va3, 4); \ + 
v104_111 = vfmaq_laneq_f16(v104_111, v31, va3, 5); \ + v112_119 = vfmaq_laneq_f16(v112_119, v31, va3, 6); \ + v120_127 = vfmaq_laneq_f16(v120_127, v31, va3, 7); \ + va4 = vld1q_f16(a + 8 * 4); \ + v24 = vld1q_f16(b + 8 * 8); \ + v25 = vld1q_f16(b + 8 * 9); \ + v0_7 = vfmaq_laneq_f16(v0_7, v24, va4, 0); \ + v8_15 = vfmaq_laneq_f16(v8_15, v24, va4, 1); \ + v16_23 = vfmaq_laneq_f16(v16_23, v24, va4, 2); \ + v24_31 = vfmaq_laneq_f16(v24_31, v24, va4, 3); \ + v32_39 = vfmaq_laneq_f16(v32_39, v24, va4, 4); \ + v40_47 = vfmaq_laneq_f16(v40_47, v24, va4, 5); \ + v48_55 = vfmaq_laneq_f16(v48_55, v24, va4, 6); \ + v56_63 = vfmaq_laneq_f16(v56_63, v24, va4, 7); \ + v64_71 = vfmaq_laneq_f16(v64_71, v25, va4, 0); \ + v72_79 = vfmaq_laneq_f16(v72_79, v25, va4, 1); \ + v80_87 = vfmaq_laneq_f16(v80_87, v25, va4, 2); \ + v88_95 = vfmaq_laneq_f16(v88_95, v25, va4, 3); \ + v96_103 = vfmaq_laneq_f16(v96_103, v25, va4, 4); \ + v104_111 = vfmaq_laneq_f16(v104_111, v25, va4, 5); \ + v112_119 = vfmaq_laneq_f16(v112_119, v25, va4, 6); \ + v120_127 = vfmaq_laneq_f16(v120_127, v25, va4, 7); \ + va5 = vld1q_f16(a + 8 * 5); \ + v26 = vld1q_f16(b + 8 * 10); \ + v27 = vld1q_f16(b + 8 * 11); \ + v0_7 = vfmaq_laneq_f16(v0_7, v26, va5, 0); \ + v8_15 = vfmaq_laneq_f16(v8_15, v26, va5, 1); \ + v16_23 = vfmaq_laneq_f16(v16_23, v26, va5, 2); \ + v24_31 = vfmaq_laneq_f16(v24_31, v26, va5, 3); \ + v32_39 = vfmaq_laneq_f16(v32_39, v26, va5, 4); \ + v40_47 = vfmaq_laneq_f16(v40_47, v26, va5, 5); \ + v48_55 = vfmaq_laneq_f16(v48_55, v26, va5, 6); \ + v56_63 = vfmaq_laneq_f16(v56_63, v26, va5, 7); \ + v64_71 = vfmaq_laneq_f16(v64_71, v27, va5, 0); \ + v72_79 = vfmaq_laneq_f16(v72_79, v27, va5, 1); \ + v80_87 = vfmaq_laneq_f16(v80_87, v27, va5, 2); \ + v88_95 = vfmaq_laneq_f16(v88_95, v27, va5, 3); \ + v96_103 = vfmaq_laneq_f16(v96_103, v27, va5, 4); \ + v104_111 = vfmaq_laneq_f16(v104_111, v27, va5, 5); \ + v112_119 = vfmaq_laneq_f16(v112_119, v27, va5, 6); \ + v120_127 = vfmaq_laneq_f16(v120_127, v27, va5, 7); \ + va6 = vld1q_f16(a + 8 * 6); \ + v28 = vld1q_f16(b + 8 * 12); \ + v29 = vld1q_f16(b + 8 * 13); \ + v0_7 = vfmaq_laneq_f16(v0_7, v28, va6, 0); \ + v8_15 = vfmaq_laneq_f16(v8_15, v28, va6, 1); \ + v16_23 = vfmaq_laneq_f16(v16_23, v28, va6, 2); \ + v24_31 = vfmaq_laneq_f16(v24_31, v28, va6, 3); \ + v32_39 = vfmaq_laneq_f16(v32_39, v28, va6, 4); \ + v40_47 = vfmaq_laneq_f16(v40_47, v28, va6, 5); \ + v48_55 = vfmaq_laneq_f16(v48_55, v28, va6, 6); \ + v56_63 = vfmaq_laneq_f16(v56_63, v28, va6, 7); \ + v64_71 = vfmaq_laneq_f16(v64_71, v29, va6, 0); \ + v72_79 = vfmaq_laneq_f16(v72_79, v29, va6, 1); \ + v80_87 = vfmaq_laneq_f16(v80_87, v29, va6, 2); \ + v88_95 = vfmaq_laneq_f16(v88_95, v29, va6, 3); \ + v96_103 = vfmaq_laneq_f16(v96_103, v29, va6, 4); \ + v104_111 = vfmaq_laneq_f16(v104_111, v29, va6, 5); \ + v112_119 = vfmaq_laneq_f16(v112_119, v29, va6, 6); \ + v120_127 = vfmaq_laneq_f16(v120_127, v29, va6, 7); \ + va7 = vld1q_f16(a + 8 * 7); \ + v30 = vld1q_f16(b + 8 * 14); \ + v31 = vld1q_f16(b + 8 * 15); \ + v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \ + v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \ + v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \ + v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \ + v32_39 = vfmaq_laneq_f16(v32_39, v30, va7, 4); \ + v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \ + v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \ + v56_63 = vfmaq_laneq_f16(v56_63, v30, va7, 7); \ + v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \ + v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \ + v80_87 = 
vfmaq_laneq_f16(v80_87, v31, va7, 2); \ + v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \ + v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \ + v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \ + v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \ + v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \ + va7 = vld1q_f16(a + 8 * 8); \ + v30 = vld1q_f16(b + 8 * 16); \ + v31 = vld1q_f16(b + 8 * 17); \ + v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \ + v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \ + v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \ + v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \ + v32_39 = vfmaq_laneq_f16(v32_39, v30, va7, 4); \ + v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \ + v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \ + v56_63 = vfmaq_laneq_f16(v56_63, v30, va7, 7); \ + v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \ + v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \ + v80_87 = vfmaq_laneq_f16(v80_87, v31, va7, 2); \ + v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \ + v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \ + v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \ + v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \ + v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \ + va7 = vld1q_f16(a + 8 * 9); \ + v30 = vld1q_f16(b + 8 * 18); \ + v31 = vld1q_f16(b + 8 * 19); \ + v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \ + v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \ + v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \ + v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \ + v32_39 = vfmaq_laneq_f16(v32_39, v30, va7, 4); \ + v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \ + v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \ + v56_63 = vfmaq_laneq_f16(v56_63, v30, va7, 7); \ + v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \ + v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \ + v80_87 = vfmaq_laneq_f16(v80_87, v31, va7, 2); \ + v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \ + v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \ + v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \ + v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \ + v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \ + va7 = vld1q_f16(a + 8 * 10); \ + v30 = vld1q_f16(b + 8 * 20); \ + v31 = vld1q_f16(b + 8 * 21); \ + v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \ + v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \ + v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \ + v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \ + v32_39 = vfmaq_laneq_f16(v32_39, v30, va7, 4); \ + v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \ + v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \ + v56_63 = vfmaq_laneq_f16(v56_63, v30, va7, 7); \ + v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \ + v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \ + v80_87 = vfmaq_laneq_f16(v80_87, v31, va7, 2); \ + v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \ + v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \ + v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \ + v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \ + v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \ + va7 = vld1q_f16(a + 8 * 11); \ + v30 = vld1q_f16(b + 8 * 22); \ + v31 = vld1q_f16(b + 8 * 23); \ + v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \ + v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \ + v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \ + v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \ + v32_39 = vfmaq_laneq_f16(v32_39, v30, va7, 4); \ + v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \ + v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \ + v56_63 = 
vfmaq_laneq_f16(v56_63, v30, va7, 7); \ + v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \ + v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \ + v80_87 = vfmaq_laneq_f16(v80_87, v31, va7, 2); \ + v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \ + v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \ + v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \ + v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \ + v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \ + va7 = vld1q_f16(a + 8 * 12); \ + v30 = vld1q_f16(b + 8 * 24); \ + v31 = vld1q_f16(b + 8 * 25); \ + v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \ + v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \ + v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \ + v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \ + v32_39 = vfmaq_laneq_f16(v32_39, v30, va7, 4); \ + v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \ + v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \ + v56_63 = vfmaq_laneq_f16(v56_63, v30, va7, 7); \ + v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \ + v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \ + v80_87 = vfmaq_laneq_f16(v80_87, v31, va7, 2); \ + v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \ + v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \ + v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \ + v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \ + v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \ + va7 = vld1q_f16(a + 8 * 13); \ + v30 = vld1q_f16(b + 8 * 26); \ + v31 = vld1q_f16(b + 8 * 27); \ + v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \ + v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \ + v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \ + v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \ + v32_39 = vfmaq_laneq_f16(v32_39, v30, va7, 4); \ + v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \ + v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \ + v56_63 = vfmaq_laneq_f16(v56_63, v30, va7, 7); \ + v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \ + v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \ + v80_87 = vfmaq_laneq_f16(v80_87, v31, va7, 2); \ + v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \ + v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \ + v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \ + v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \ + v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \ + va7 = vld1q_f16(a + 8 * 14); \ + v30 = vld1q_f16(b + 8 * 28); \ + v31 = vld1q_f16(b + 8 * 29); \ + v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \ + v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \ + v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \ + v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \ + v32_39 = vfmaq_laneq_f16(v32_39, v30, va7, 4); \ + v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \ + v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \ + v56_63 = vfmaq_laneq_f16(v56_63, v30, va7, 7); \ + v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \ + v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \ + v80_87 = vfmaq_laneq_f16(v80_87, v31, va7, 2); \ + v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \ + v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \ + v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \ + v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \ + v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \ + va7 = vld1q_f16(a + 8 * 15); \ + v30 = vld1q_f16(b + 8 * 30); \ + v31 = vld1q_f16(b + 8 * 31); \ + v0_7 = vfmaq_laneq_f16(v0_7, v30, va7, 0); \ + v8_15 = vfmaq_laneq_f16(v8_15, v30, va7, 1); \ + v16_23 = vfmaq_laneq_f16(v16_23, v30, va7, 2); \ + v24_31 = vfmaq_laneq_f16(v24_31, v30, va7, 3); \ + v32_39 = 
vfmaq_laneq_f16(v32_39, v30, va7, 4); \ + v40_47 = vfmaq_laneq_f16(v40_47, v30, va7, 5); \ + v48_55 = vfmaq_laneq_f16(v48_55, v30, va7, 6); \ + v56_63 = vfmaq_laneq_f16(v56_63, v30, va7, 7); \ + v64_71 = vfmaq_laneq_f16(v64_71, v31, va7, 0); \ + v72_79 = vfmaq_laneq_f16(v72_79, v31, va7, 1); \ + v80_87 = vfmaq_laneq_f16(v80_87, v31, va7, 2); \ + v88_95 = vfmaq_laneq_f16(v88_95, v31, va7, 3); \ + v96_103 = vfmaq_laneq_f16(v96_103, v31, va7, 4); \ + v104_111 = vfmaq_laneq_f16(v104_111, v31, va7, 5); \ + v112_119 = vfmaq_laneq_f16(v112_119, v31, va7, 6); \ + v120_127 = vfmaq_laneq_f16(v120_127, v31, va7, 7); \ + l += 16; \ + __builtin_prefetch(b + 256, 0, 3); \ + __builtin_prefetch(a + 128, 0, 3); \ + b += 16 * 16; \ + a += 8 * 16; + +// 2. Partial sum 1024 digits #define KERNEL_8x16_ACC8() \ - v0_7 = vdupq_n_f16(0.F); \ - v8_15 = vdupq_n_f16(0.F); \ - v16_23 = vdupq_n_f16(0.F); \ - v24_31 = vdupq_n_f16(0.F); \ - v32_39 = vdupq_n_f16(0.F); \ - v40_47 = vdupq_n_f16(0.F); \ - v48_55 = vdupq_n_f16(0.F); \ - v56_63 = vdupq_n_f16(0.F); \ - v64_71 = vdupq_n_f16(0.F); \ - v72_79 = vdupq_n_f16(0.F); \ - v80_87 = vdupq_n_f16(0.F); \ - v88_95 = vdupq_n_f16(0.F); \ - v96_103 = vdupq_n_f16(0.F); \ - v104_111 = vdupq_n_f16(0.F); \ - v112_119 = vdupq_n_f16(0.F); \ - v120_127 = vdupq_n_f16(0.F); \ va0 = vld1q_f16(a); \ v24 = vld1q_f16(b); \ v25 = vld1q_f16(b + 8); \ @@ -193,24 +504,8 @@ b += 16 * 8; \ a += 8 * 8; -// 2. Partial sum 512 digits : Medium accuracy, medium latency +// 3. Partial sum 512 digits #define KERNEL_8x16_ACC4() \ - v0_7 = vdupq_n_f16(0.F); \ - v8_15 = vdupq_n_f16(0.F); \ - v16_23 = vdupq_n_f16(0.F); \ - v24_31 = vdupq_n_f16(0.F); \ - v32_39 = vdupq_n_f16(0.F); \ - v40_47 = vdupq_n_f16(0.F); \ - v48_55 = vdupq_n_f16(0.F); \ - v56_63 = vdupq_n_f16(0.F); \ - v64_71 = vdupq_n_f16(0.F); \ - v72_79 = vdupq_n_f16(0.F); \ - v80_87 = vdupq_n_f16(0.F); \ - v88_95 = vdupq_n_f16(0.F); \ - v96_103 = vdupq_n_f16(0.F); \ - v104_111 = vdupq_n_f16(0.F); \ - v112_119 = vdupq_n_f16(0.F); \ - v120_127 = vdupq_n_f16(0.F); \ va0 = vld1q_f16(a); \ v24 = vld1q_f16(b); \ v25 = vld1q_f16(b + 8); \ @@ -293,24 +588,8 @@ b += 16 * 4; \ a += 8 * 4; -// 3. Partial sum 128 digits : Best accuracy, worst latency +// 3. 
Partial sum 128 digits #define KERNEL_8x16_ACC1() \ - v0_7 = vdupq_n_f16(0.F); \ - v8_15 = vdupq_n_f16(0.F); \ - v16_23 = vdupq_n_f16(0.F); \ - v24_31 = vdupq_n_f16(0.F); \ - v32_39 = vdupq_n_f16(0.F); \ - v40_47 = vdupq_n_f16(0.F); \ - v48_55 = vdupq_n_f16(0.F); \ - v56_63 = vdupq_n_f16(0.F); \ - v64_71 = vdupq_n_f16(0.F); \ - v72_79 = vdupq_n_f16(0.F); \ - v80_87 = vdupq_n_f16(0.F); \ - v88_95 = vdupq_n_f16(0.F); \ - v96_103 = vdupq_n_f16(0.F); \ - v104_111 = vdupq_n_f16(0.F); \ - v112_119 = vdupq_n_f16(0.F); \ - v120_127 = vdupq_n_f16(0.F); \ va0 = vld1q_f16(a); \ v24 = vld1q_f16(b); \ v25 = vld1q_f16(b + 8); \ @@ -336,6 +615,91 @@ b += 16 * 1; \ a += 8 * 1; +#define SAVE_KERNEL_8X16_F16_F32() \ + vst1q_f32(c, vaddq_f32(vld1q_f32(c), vcvt_f32_f16(vget_low_f16(v0_7)))); \ + vst1q_f32(c + 4, \ + vaddq_f32(vld1q_f32(c + 4), vcvt_f32_f16(vget_high_f16(v0_7)))); \ + \ + vst1q_f32(c + 8, \ + vaddq_f32(vld1q_f32(c + 8), vcvt_f32_f16(vget_low_f16(v64_71)))); \ + vst1q_f32(c + 8 + 4, vaddq_f32(vld1q_f32(c + 8 + 4), \ + vcvt_f32_f16(vget_high_f16(v64_71)))); \ + \ + vst1q_f32(c + ldc, \ + vaddq_f32(vld1q_f32(c + ldc), vcvt_f32_f16(vget_low_f16(v8_15)))); \ + vst1q_f32(c + ldc + 4, vaddq_f32(vld1q_f32(c + ldc + 4), \ + vcvt_f32_f16(vget_high_f16(v8_15)))); \ + \ + vst1q_f32(c + ldc + 8, vaddq_f32(vld1q_f32(c + ldc + 8), \ + vcvt_f32_f16(vget_low_f16(v72_79)))); \ + vst1q_f32(c + ldc + 8 + 4, vaddq_f32(vld1q_f32(c + ldc + 8 + 4), \ + vcvt_f32_f16(vget_high_f16(v72_79)))); \ + \ + vst1q_f32(c + 2 * ldc, vaddq_f32(vld1q_f32(c + 2 * ldc), \ + vcvt_f32_f16(vget_low_f16(v16_23)))); \ + vst1q_f32(c + 2 * ldc + 4, vaddq_f32(vld1q_f32(c + 2 * ldc + 4), \ + vcvt_f32_f16(vget_high_f16(v16_23)))); \ + \ + vst1q_f32(c + 2 * ldc + 8, vaddq_f32(vld1q_f32(c + 2 * ldc + 8), \ + vcvt_f32_f16(vget_low_f16(v80_87)))); \ + vst1q_f32(c + 2 * ldc + 8 + 4, \ + vaddq_f32(vld1q_f32(c + 2 * ldc + 8 + 4), \ + vcvt_f32_f16(vget_high_f16(v80_87)))); \ + \ + vst1q_f32(c + 3 * ldc, vaddq_f32(vld1q_f32(c + 3 * ldc), \ + vcvt_f32_f16(vget_low_f16(v24_31)))); \ + vst1q_f32(c + 3 * ldc + 4, vaddq_f32(vld1q_f32(c + 3 * ldc + 4), \ + vcvt_f32_f16(vget_high_f16(v24_31)))); \ + \ + vst1q_f32(c + 3 * ldc + 8, vaddq_f32(vld1q_f32(c + 3 * ldc + 8), \ + vcvt_f32_f16(vget_low_f16(v88_95)))); \ + vst1q_f32(c + 3 * ldc + 8 + 4, \ + vaddq_f32(vld1q_f32(c + 3 * ldc + 8 + 4), \ + vcvt_f32_f16(vget_high_f16(v88_95)))); \ + \ + vst1q_f32(c + 4 * ldc, vaddq_f32(vld1q_f32(c + 4 * ldc), \ + vcvt_f32_f16(vget_low_f16(v32_39)))); \ + vst1q_f32(c + 4 * ldc + 4, vaddq_f32(vld1q_f32(c + 4 * ldc + 4), \ + vcvt_f32_f16(vget_high_f16(v32_39)))); \ + \ + vst1q_f32(c + 4 * ldc + 8, vaddq_f32(vld1q_f32(c + 4 * ldc + 8), \ + vcvt_f32_f16(vget_low_f16(v96_103)))); \ + vst1q_f32(c + 4 * ldc + 8 + 4, \ + vaddq_f32(vld1q_f32(c + 4 * ldc + 8 + 4), \ + vcvt_f32_f16(vget_high_f16(v96_103)))); \ + \ + vst1q_f32(c + 5 * ldc, vaddq_f32(vld1q_f32(c + 5 * ldc), \ + vcvt_f32_f16(vget_low_f16(v40_47)))); \ + vst1q_f32(c + 5 * ldc + 4, vaddq_f32(vld1q_f32(c + 5 * ldc + 4), \ + vcvt_f32_f16(vget_high_f16(v40_47)))); \ + vst1q_f32(c + 5 * ldc + 8, vaddq_f32(vld1q_f32(c + 5 * ldc + 8), \ + vcvt_f32_f16(vget_low_f16(v104_111)))); \ + vst1q_f32(c + 5 * ldc + 8 + 4, \ + vaddq_f32(vld1q_f32(c + 5 * ldc + 8 + 4), \ + vcvt_f32_f16(vget_high_f16(v104_111)))); \ + \ + vst1q_f32(c + 6 * ldc, vaddq_f32(vld1q_f32(c + 6 * ldc), \ + vcvt_f32_f16(vget_low_f16(v48_55)))); \ + vst1q_f32(c + 6 * ldc + 4, vaddq_f32(vld1q_f32(c + 6 * ldc + 4), \ + vcvt_f32_f16(vget_high_f16(v48_55)))); \ + \ + 
vst1q_f32(c + 6 * ldc + 8, vaddq_f32(vld1q_f32(c + 6 * ldc + 8), \ + vcvt_f32_f16(vget_low_f16(v112_119)))); \ + vst1q_f32(c + 6 * ldc + 8 + 4, \ + vaddq_f32(vld1q_f32(c + 6 * ldc + 8 + 4), \ + vcvt_f32_f16(vget_high_f16(v112_119)))); \ + \ + vst1q_f32(c + 7 * ldc, vaddq_f32(vld1q_f32(c + 7 * ldc), \ + vcvt_f32_f16(vget_low_f16(v56_63)))); \ + vst1q_f32(c + 7 * ldc + 4, vaddq_f32(vld1q_f32(c + 7 * ldc + 4), \ + vcvt_f32_f16(vget_high_f16(v56_63)))); \ + \ + vst1q_f32(c + 7 * ldc + 8, vaddq_f32(vld1q_f32(c + 7 * ldc + 8), \ + vcvt_f32_f16(vget_low_f16(v120_127)))); \ + vst1q_f32(c + 7 * ldc + 8 + 4, \ + vaddq_f32(vld1q_f32(c + 7 * ldc + 8 + 4), \ + vcvt_f32_f16(vget_high_f16(v120_127)))); + /** * @brief hgemm 8x16 kernel sc = sa * sb * @@ -370,32 +734,32 @@ void hgemm_kernel_8x16(unsigned int M, unsigned int N, unsigned int K, float16x8_t v24, v25, v26, v27, v28, v29, v30, v31; float16x8_t va0, va1, va2, va3; + + INIT_KERNEL_8X16(); l = 0; for (; l < K;) { - KERNEL_8x16_ACC4(); - vst1q_f16(c, vaddq_f16(vld1q_f16(c), v0_7)); - vst1q_f16(c + 8, vaddq_f16(vld1q_f16(c + 8), v64_71)); - vst1q_f16(c + ldc, vaddq_f16(vld1q_f16(c + ldc), v8_15)); - vst1q_f16(c + ldc + 8, vaddq_f16(vld1q_f16(c + ldc + 8), v72_79)); - vst1q_f16(c + 2 * ldc, vaddq_f16(vld1q_f16(c + 2 * ldc), v16_23)); - vst1q_f16(c + 2 * ldc + 8, - vaddq_f16(vld1q_f16(c + 2 * ldc + 8), v80_87)); - vst1q_f16(c + 3 * ldc, vaddq_f16(vld1q_f16(c + 3 * ldc), v24_31)); - vst1q_f16(c + 3 * ldc + 8, - vaddq_f16(vld1q_f16(c + 3 * ldc + 8), v88_95)); - vst1q_f16(c + 4 * ldc, vaddq_f16(vld1q_f16(c + 4 * ldc), v32_39)); - vst1q_f16(c + 4 * ldc + 8, - vaddq_f16(vld1q_f16(c + 4 * ldc + 8), v96_103)); - vst1q_f16(c + 5 * ldc, vaddq_f16(vld1q_f16(c + 5 * ldc), v40_47)); - vst1q_f16(c + 5 * ldc + 8, - vaddq_f16(vld1q_f16(c + 5 * ldc + 8), v104_111)); - vst1q_f16(c + 6 * ldc, vaddq_f16(vld1q_f16(c + 6 * ldc), v48_55)); - vst1q_f16(c + 6 * ldc + 8, - vaddq_f16(vld1q_f16(c + 6 * ldc + 8), v112_119)); - vst1q_f16(c + 7 * ldc, vaddq_f16(vld1q_f16(c + 7 * ldc), v56_63)); - vst1q_f16(c + 7 * ldc + 8, - vaddq_f16(vld1q_f16(c + 7 * ldc + 8), v120_127)); + KERNEL_8x16_ACC1(); } + vst1q_f16(c, vaddq_f16(vld1q_f16(c), v0_7)); + vst1q_f16(c + 8, vaddq_f16(vld1q_f16(c + 8), v64_71)); + vst1q_f16(c + ldc, vaddq_f16(vld1q_f16(c + ldc), v8_15)); + vst1q_f16(c + ldc + 8, vaddq_f16(vld1q_f16(c + ldc + 8), v72_79)); + vst1q_f16(c + 2 * ldc, vaddq_f16(vld1q_f16(c + 2 * ldc), v16_23)); + vst1q_f16(c + 2 * ldc + 8, vaddq_f16(vld1q_f16(c + 2 * ldc + 8), v80_87)); + vst1q_f16(c + 3 * ldc, vaddq_f16(vld1q_f16(c + 3 * ldc), v24_31)); + vst1q_f16(c + 3 * ldc + 8, vaddq_f16(vld1q_f16(c + 3 * ldc + 8), v88_95)); + vst1q_f16(c + 4 * ldc, vaddq_f16(vld1q_f16(c + 4 * ldc), v32_39)); + vst1q_f16(c + 4 * ldc + 8, + vaddq_f16(vld1q_f16(c + 4 * ldc + 8), v96_103)); + vst1q_f16(c + 5 * ldc, vaddq_f16(vld1q_f16(c + 5 * ldc), v40_47)); + vst1q_f16(c + 5 * ldc + 8, + vaddq_f16(vld1q_f16(c + 5 * ldc + 8), v104_111)); + vst1q_f16(c + 6 * ldc, vaddq_f16(vld1q_f16(c + 6 * ldc), v48_55)); + vst1q_f16(c + 6 * ldc + 8, + vaddq_f16(vld1q_f16(c + 6 * ldc + 8), v112_119)); + vst1q_f16(c + 7 * ldc, vaddq_f16(vld1q_f16(c + 7 * ldc), v56_63)); + vst1q_f16(c + 7 * ldc + 8, + vaddq_f16(vld1q_f16(c + 7 * ldc + 8), v120_127)); c += 16; a -= 8 * K; } @@ -425,6 +789,9 @@ void hgemm_kernel_8x16(unsigned int M, unsigned int N, unsigned int K, __fp16 *a = sa, *b = sb; float *c = sc; unsigned int i, j, l; + unsigned int K4 = (K >> 2) << 2; + unsigned int K8 = (K >> 3) << 3; + unsigned int K16 = (K >> 4) << 4; for 
(i = 0; i < M; i += 8) { for (j = 0; j < N; j += 16) { __builtin_prefetch(b, 0, 3); @@ -440,106 +807,25 @@ void hgemm_kernel_8x16(unsigned int M, unsigned int N, unsigned int K, float16x8_t v24, v25, v26, v27, v28, v29, v30, v31; float16x8_t va0, va1, va2, va3, va4, va5, va6, va7; l = 0; - for (; l < K;) { + for (; l < K16;) { + INIT_KERNEL_8X16(); + KERNEL_8x16_ACC16(); + SAVE_KERNEL_8X16_F16_F32(); + } + for (; l < K8;) { + INIT_KERNEL_8X16(); KERNEL_8x16_ACC8(); - - vst1q_f32(c, vaddq_f32(vld1q_f32(c), vcvt_f32_f16(vget_low_f16(v0_7)))); - vst1q_f32(c + 4, vaddq_f32(vld1q_f32(c + 4), - vcvt_f32_f16(vget_high_f16(v0_7)))); - - vst1q_f32(c + 8, vaddq_f32(vld1q_f32(c + 8), - vcvt_f32_f16(vget_low_f16(v64_71)))); - vst1q_f32(c + 8 + 4, vaddq_f32(vld1q_f32(c + 8 + 4), - vcvt_f32_f16(vget_high_f16(v64_71)))); - - vst1q_f32(c + ldc, vaddq_f32(vld1q_f32(c + ldc), - vcvt_f32_f16(vget_low_f16(v8_15)))); - vst1q_f32(c + ldc + 4, vaddq_f32(vld1q_f32(c + ldc + 4), - vcvt_f32_f16(vget_high_f16(v8_15)))); - - vst1q_f32(c + ldc + 8, vaddq_f32(vld1q_f32(c + ldc + 8), - vcvt_f32_f16(vget_low_f16(v72_79)))); - vst1q_f32(c + ldc + 8 + 4, - vaddq_f32(vld1q_f32(c + ldc + 8 + 4), - vcvt_f32_f16(vget_high_f16(v72_79)))); - - vst1q_f32(c + 2 * ldc, vaddq_f32(vld1q_f32(c + 2 * ldc), - vcvt_f32_f16(vget_low_f16(v16_23)))); - vst1q_f32(c + 2 * ldc + 4, - vaddq_f32(vld1q_f32(c + 2 * ldc + 4), - vcvt_f32_f16(vget_high_f16(v16_23)))); - - vst1q_f32(c + 2 * ldc + 8, - vaddq_f32(vld1q_f32(c + 2 * ldc + 8), - vcvt_f32_f16(vget_low_f16(v80_87)))); - vst1q_f32(c + 2 * ldc + 8 + 4, - vaddq_f32(vld1q_f32(c + 2 * ldc + 8 + 4), - vcvt_f32_f16(vget_high_f16(v80_87)))); - - vst1q_f32(c + 3 * ldc, vaddq_f32(vld1q_f32(c + 3 * ldc), - vcvt_f32_f16(vget_low_f16(v24_31)))); - vst1q_f32(c + 3 * ldc + 4, - vaddq_f32(vld1q_f32(c + 3 * ldc + 4), - vcvt_f32_f16(vget_high_f16(v24_31)))); - - vst1q_f32(c + 3 * ldc + 8, - vaddq_f32(vld1q_f32(c + 3 * ldc + 8), - vcvt_f32_f16(vget_low_f16(v88_95)))); - vst1q_f32(c + 3 * ldc + 8 + 4, - vaddq_f32(vld1q_f32(c + 3 * ldc + 8 + 4), - vcvt_f32_f16(vget_high_f16(v88_95)))); - - vst1q_f32(c + 4 * ldc, vaddq_f32(vld1q_f32(c + 4 * ldc), - vcvt_f32_f16(vget_low_f16(v32_39)))); - vst1q_f32(c + 4 * ldc + 4, - vaddq_f32(vld1q_f32(c + 4 * ldc + 4), - vcvt_f32_f16(vget_high_f16(v32_39)))); - - vst1q_f32(c + 4 * ldc + 8, - vaddq_f32(vld1q_f32(c + 4 * ldc + 8), - vcvt_f32_f16(vget_low_f16(v96_103)))); - vst1q_f32(c + 4 * ldc + 8 + 4, - vaddq_f32(vld1q_f32(c + 4 * ldc + 8 + 4), - vcvt_f32_f16(vget_high_f16(v96_103)))); - - vst1q_f32(c + 5 * ldc, vaddq_f32(vld1q_f32(c + 5 * ldc), - vcvt_f32_f16(vget_low_f16(v40_47)))); - vst1q_f32(c + 5 * ldc + 4, - vaddq_f32(vld1q_f32(c + 5 * ldc + 4), - vcvt_f32_f16(vget_high_f16(v40_47)))); - - vst1q_f32(c + 5 * ldc + 8, - vaddq_f32(vld1q_f32(c + 5 * ldc + 8), - vcvt_f32_f16(vget_low_f16(v104_111)))); - vst1q_f32(c + 5 * ldc + 8 + 4, - vaddq_f32(vld1q_f32(c + 5 * ldc + 8 + 4), - vcvt_f32_f16(vget_high_f16(v104_111)))); - - vst1q_f32(c + 6 * ldc, vaddq_f32(vld1q_f32(c + 6 * ldc), - vcvt_f32_f16(vget_low_f16(v48_55)))); - vst1q_f32(c + 6 * ldc + 4, - vaddq_f32(vld1q_f32(c + 6 * ldc + 4), - vcvt_f32_f16(vget_high_f16(v48_55)))); - - vst1q_f32(c + 6 * ldc + 8, - vaddq_f32(vld1q_f32(c + 6 * ldc + 8), - vcvt_f32_f16(vget_low_f16(v112_119)))); - vst1q_f32(c + 6 * ldc + 8 + 4, - vaddq_f32(vld1q_f32(c + 6 * ldc + 8 + 4), - vcvt_f32_f16(vget_high_f16(v112_119)))); - - vst1q_f32(c + 7 * ldc, vaddq_f32(vld1q_f32(c + 7 * ldc), - vcvt_f32_f16(vget_low_f16(v56_63)))); - vst1q_f32(c + 7 * ldc 
+ 4, - vaddq_f32(vld1q_f32(c + 7 * ldc + 4), - vcvt_f32_f16(vget_high_f16(v56_63)))); - - vst1q_f32(c + 7 * ldc + 8, - vaddq_f32(vld1q_f32(c + 7 * ldc + 8), - vcvt_f32_f16(vget_low_f16(v120_127)))); - vst1q_f32(c + 7 * ldc + 8 + 4, - vaddq_f32(vld1q_f32(c + 7 * ldc + 8 + 4), - vcvt_f32_f16(vget_high_f16(v120_127)))); + SAVE_KERNEL_8X16_F16_F32(); + } + for (; l < K4;) { + INIT_KERNEL_8X16(); + KERNEL_8x16_ACC4(); + SAVE_KERNEL_8X16_F16_F32(); + } + for (; l < K;) { + INIT_KERNEL_8X16(); + KERNEL_8x16_ACC1(); + SAVE_KERNEL_8X16_F16_F32(); } c += 16; a -= 8 * K; diff --git a/nntrainer/tensor/hgemm/hgemm_kernel_8x8.h b/nntrainer/tensor/hgemm/hgemm_kernel_8x8.h index e67ef462b4..4901c3f518 100644 --- a/nntrainer/tensor/hgemm/hgemm_kernel_8x8.h +++ b/nntrainer/tensor/hgemm/hgemm_kernel_8x8.h @@ -14,19 +14,186 @@ #include #include -/// @note Following KERNELs are the combinations of accuracy-latency -/// tradeoff. User can select which kernel to use by replacing them. +#define INIT_KERNEL_8x8() \ + v24 = vdupq_n_f16(0.F); \ + v25 = vdupq_n_f16(0.F); \ + v26 = vdupq_n_f16(0.F); \ + v27 = vdupq_n_f16(0.F); \ + v28 = vdupq_n_f16(0.F); \ + v29 = vdupq_n_f16(0.F); \ + v30 = vdupq_n_f16(0.F); \ + v31 = vdupq_n_f16(0.F); -// 1. Partial sum 512 digits : Worst accuracy, best latency +// 1. Partial sum 1024 digits +#define KERNEL_8x8_ACC16() \ + va0 = vld1q_f16(a); \ + v16 = vld1q_f16(b); \ + v24 = vfmaq_laneq_f16(v24, v16, va0, 0); \ + v25 = vfmaq_laneq_f16(v25, v16, va0, 1); \ + v26 = vfmaq_laneq_f16(v26, v16, va0, 2); \ + v27 = vfmaq_laneq_f16(v27, v16, va0, 3); \ + v28 = vfmaq_laneq_f16(v28, v16, va0, 4); \ + v29 = vfmaq_laneq_f16(v29, v16, va0, 5); \ + v30 = vfmaq_laneq_f16(v30, v16, va0, 6); \ + v31 = vfmaq_laneq_f16(v31, v16, va0, 7); \ + va1 = vld1q_f16(a + 8); \ + v17 = vld1q_f16(b + 8); \ + v24 = vfmaq_laneq_f16(v24, v17, va1, 0); \ + v25 = vfmaq_laneq_f16(v25, v17, va1, 1); \ + v26 = vfmaq_laneq_f16(v26, v17, va1, 2); \ + v27 = vfmaq_laneq_f16(v27, v17, va1, 3); \ + v28 = vfmaq_laneq_f16(v28, v17, va1, 4); \ + v29 = vfmaq_laneq_f16(v29, v17, va1, 5); \ + v30 = vfmaq_laneq_f16(v30, v17, va1, 6); \ + v31 = vfmaq_laneq_f16(v31, v17, va1, 7); \ + va2 = vld1q_f16(a + 8 * 2); \ + v18 = vld1q_f16(b + 8 * 2); \ + v24 = vfmaq_laneq_f16(v24, v18, va2, 0); \ + v25 = vfmaq_laneq_f16(v25, v18, va2, 1); \ + v26 = vfmaq_laneq_f16(v26, v18, va2, 2); \ + v27 = vfmaq_laneq_f16(v27, v18, va2, 3); \ + v28 = vfmaq_laneq_f16(v28, v18, va2, 4); \ + v29 = vfmaq_laneq_f16(v29, v18, va2, 5); \ + v30 = vfmaq_laneq_f16(v30, v18, va2, 6); \ + v31 = vfmaq_laneq_f16(v31, v18, va2, 7); \ + va3 = vld1q_f16(a + 8 * 3); \ + v19 = vld1q_f16(b + 8 * 3); \ + v24 = vfmaq_laneq_f16(v24, v19, va3, 0); \ + v25 = vfmaq_laneq_f16(v25, v19, va3, 1); \ + v26 = vfmaq_laneq_f16(v26, v19, va3, 2); \ + v27 = vfmaq_laneq_f16(v27, v19, va3, 3); \ + v28 = vfmaq_laneq_f16(v28, v19, va3, 4); \ + v29 = vfmaq_laneq_f16(v29, v19, va3, 5); \ + v30 = vfmaq_laneq_f16(v30, v19, va3, 6); \ + v31 = vfmaq_laneq_f16(v31, v19, va3, 7); \ + va4 = vld1q_f16(a + 8 * 4); \ + v20 = vld1q_f16(b + 8 * 4); \ + v24 = vfmaq_laneq_f16(v24, v20, va4, 0); \ + v25 = vfmaq_laneq_f16(v25, v20, va4, 1); \ + v26 = vfmaq_laneq_f16(v26, v20, va4, 2); \ + v27 = vfmaq_laneq_f16(v27, v20, va4, 3); \ + v28 = vfmaq_laneq_f16(v28, v20, va4, 4); \ + v29 = vfmaq_laneq_f16(v29, v20, va4, 5); \ + v30 = vfmaq_laneq_f16(v30, v20, va4, 6); \ + v31 = vfmaq_laneq_f16(v31, v20, va4, 7); \ + va5 = vld1q_f16(a + 8 * 5); \ + v21 = vld1q_f16(b + 8 * 5); \ + v24 = vfmaq_laneq_f16(v24, v21, va5, 0); \ 
+ v25 = vfmaq_laneq_f16(v25, v21, va5, 1); \ + v26 = vfmaq_laneq_f16(v26, v21, va5, 2); \ + v27 = vfmaq_laneq_f16(v27, v21, va5, 3); \ + v28 = vfmaq_laneq_f16(v28, v21, va5, 4); \ + v29 = vfmaq_laneq_f16(v29, v21, va5, 5); \ + v30 = vfmaq_laneq_f16(v30, v21, va5, 6); \ + v31 = vfmaq_laneq_f16(v31, v21, va5, 7); \ + va6 = vld1q_f16(a + 8 * 6); \ + v22 = vld1q_f16(b + 8 * 6); \ + v24 = vfmaq_laneq_f16(v24, v22, va6, 0); \ + v25 = vfmaq_laneq_f16(v25, v22, va6, 1); \ + v26 = vfmaq_laneq_f16(v26, v22, va6, 2); \ + v27 = vfmaq_laneq_f16(v27, v22, va6, 3); \ + v28 = vfmaq_laneq_f16(v28, v22, va6, 4); \ + v29 = vfmaq_laneq_f16(v29, v22, va6, 5); \ + v30 = vfmaq_laneq_f16(v30, v22, va6, 6); \ + v31 = vfmaq_laneq_f16(v31, v22, va6, 7); \ + va7 = vld1q_f16(a + 8 * 7); \ + v23 = vld1q_f16(b + 8 * 7); \ + v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \ + v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \ + v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \ + v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \ + v28 = vfmaq_laneq_f16(v28, v23, va7, 4); \ + v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \ + v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \ + v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \ + va7 = vld1q_f16(a + 8 * 8); \ + v23 = vld1q_f16(b + 8 * 8); \ + v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \ + v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \ + v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \ + v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \ + v28 = vfmaq_laneq_f16(v28, v23, va7, 4); \ + v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \ + v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \ + v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \ + va7 = vld1q_f16(a + 8 * 9); \ + v23 = vld1q_f16(b + 8 * 9); \ + v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \ + v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \ + v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \ + v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \ + v28 = vfmaq_laneq_f16(v28, v23, va7, 4); \ + v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \ + v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \ + v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \ + va7 = vld1q_f16(a + 8 * 10); \ + v23 = vld1q_f16(b + 8 * 10); \ + v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \ + v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \ + v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \ + v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \ + v28 = vfmaq_laneq_f16(v28, v23, va7, 4); \ + v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \ + v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \ + v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \ + va7 = vld1q_f16(a + 8 * 11); \ + v23 = vld1q_f16(b + 8 * 11); \ + v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \ + v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \ + v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \ + v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \ + v28 = vfmaq_laneq_f16(v28, v23, va7, 4); \ + v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \ + v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \ + v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \ + va7 = vld1q_f16(a + 8 * 12); \ + v23 = vld1q_f16(b + 8 * 12); \ + v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \ + v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \ + v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \ + v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \ + v28 = vfmaq_laneq_f16(v28, v23, va7, 4); \ + v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \ + v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \ + v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \ + va7 = vld1q_f16(a + 8 * 13); \ + v23 = vld1q_f16(b + 8 * 13); \ + v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \ + v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \ + v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \ + v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \ + v28 = 
vfmaq_laneq_f16(v28, v23, va7, 4); \ + v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \ + v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \ + v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \ + va7 = vld1q_f16(a + 8 * 14); \ + v23 = vld1q_f16(b + 8 * 14); \ + v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \ + v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \ + v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \ + v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \ + v28 = vfmaq_laneq_f16(v28, v23, va7, 4); \ + v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \ + v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \ + v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \ + va7 = vld1q_f16(a + 8 * 15); \ + v23 = vld1q_f16(b + 8 * 15); \ + v24 = vfmaq_laneq_f16(v24, v23, va7, 0); \ + v25 = vfmaq_laneq_f16(v25, v23, va7, 1); \ + v26 = vfmaq_laneq_f16(v26, v23, va7, 2); \ + v27 = vfmaq_laneq_f16(v27, v23, va7, 3); \ + v28 = vfmaq_laneq_f16(v28, v23, va7, 4); \ + v29 = vfmaq_laneq_f16(v29, v23, va7, 5); \ + v30 = vfmaq_laneq_f16(v30, v23, va7, 6); \ + v31 = vfmaq_laneq_f16(v31, v23, va7, 7); \ + __builtin_prefetch(b + 128, 0, 3); \ + __builtin_prefetch(a + 128, 0, 3); \ + l += 16; \ + b += 8 * 16; \ + a += 8 * 16; + +// 2. Partial sum 512 digits #define KERNEL_8x8_ACC8() \ - v24 = vdupq_n_f16(0.F); \ - v25 = vdupq_n_f16(0.F); \ - v26 = vdupq_n_f16(0.F); \ - v27 = vdupq_n_f16(0.F); \ - v28 = vdupq_n_f16(0.F); \ - v29 = vdupq_n_f16(0.F); \ - v30 = vdupq_n_f16(0.F); \ - v31 = vdupq_n_f16(0.F); \ va0 = vld1q_f16(a); \ v16 = vld1q_f16(b); \ v24 = vfmaq_laneq_f16(v24, v16, va0, 0); \ @@ -113,16 +280,8 @@ b += 8 * 8; \ a += 8 * 8; -// 2. Partial sum 256 digits : Medium accuracy, medium latency +// 3. Partial sum 256 digits #define KERNEL_8x8_ACC4() \ - v24 = vdupq_n_f16(0.F); \ - v25 = vdupq_n_f16(0.F); \ - v26 = vdupq_n_f16(0.F); \ - v27 = vdupq_n_f16(0.F); \ - v28 = vdupq_n_f16(0.F); \ - v29 = vdupq_n_f16(0.F); \ - v30 = vdupq_n_f16(0.F); \ - v31 = vdupq_n_f16(0.F); \ va0 = vld1q_f16(a); \ v16 = vld1q_f16(b); \ v24 = vfmaq_laneq_f16(v24, v16, va0, 0); \ @@ -169,16 +328,8 @@ b += 8 * 4; \ a += 8 * 4; -// 3. Partial sum 64 digits : Best accuracy, worst latency +// 4. 
Partial sum 64 digits #define KERNEL_8x8_ACC1() \ - v24 = vdupq_n_f16(0.F); \ - v25 = vdupq_n_f16(0.F); \ - v26 = vdupq_n_f16(0.F); \ - v27 = vdupq_n_f16(0.F); \ - v28 = vdupq_n_f16(0.F); \ - v29 = vdupq_n_f16(0.F); \ - v30 = vdupq_n_f16(0.F); \ - v31 = vdupq_n_f16(0.F); \ va0 = vld1q_f16(a); \ v16 = vld1q_f16(b); \ v24 = vfmaq_laneq_f16(v24, v16, va0, 0); \ @@ -195,6 +346,46 @@ b += 8 * 1; \ a += 8 * 1; +#define SAVE_KERNEL_8X8_F16_f32() \ + vst1q_f32(c, vaddq_f32(vld1q_f32(c), vcvt_f32_f16(vget_low_f16(v24)))); \ + vst1q_f32(c + 4, \ + vaddq_f32(vld1q_f32(c + 4), vcvt_f32_f16(vget_high_f16(v24)))); \ + \ + vst1q_f32(c + ldc, \ + vaddq_f32(vld1q_f32(c + ldc), vcvt_f32_f16(vget_low_f16(v25)))); \ + vst1q_f32(c + 4 + ldc, vaddq_f32(vld1q_f32(c + 4 + ldc), \ + vcvt_f32_f16(vget_high_f16(v25)))); \ + \ + vst1q_f32(c + 2 * ldc, vaddq_f32(vld1q_f32(c + 2 * ldc), \ + vcvt_f32_f16(vget_low_f16(v26)))); \ + vst1q_f32(c + 4 + 2 * ldc, vaddq_f32(vld1q_f32(c + 4 + 2 * ldc), \ + vcvt_f32_f16(vget_high_f16(v26)))); \ + \ + vst1q_f32(c + 3 * ldc, vaddq_f32(vld1q_f32(c + 3 * ldc), \ + vcvt_f32_f16(vget_low_f16(v27)))); \ + vst1q_f32(c + 4 + 3 * ldc, vaddq_f32(vld1q_f32(c + 4 + 3 * ldc), \ + vcvt_f32_f16(vget_high_f16(v27)))); \ + \ + vst1q_f32(c + 4 * ldc, vaddq_f32(vld1q_f32(c + 4 * ldc), \ + vcvt_f32_f16(vget_low_f16(v28)))); \ + vst1q_f32(c + 4 + 4 * ldc, vaddq_f32(vld1q_f32(c + 4 + 4 * ldc), \ + vcvt_f32_f16(vget_high_f16(v28)))); \ + \ + vst1q_f32(c + 5 * ldc, vaddq_f32(vld1q_f32(c + 5 * ldc), \ + vcvt_f32_f16(vget_low_f16(v29)))); \ + vst1q_f32(c + 4 + 5 * ldc, vaddq_f32(vld1q_f32(c + 4 + 5 * ldc), \ + vcvt_f32_f16(vget_high_f16(v29)))); \ + \ + vst1q_f32(c + 6 * ldc, vaddq_f32(vld1q_f32(c + 6 * ldc), \ + vcvt_f32_f16(vget_low_f16(v30)))); \ + vst1q_f32(c + 4 + 6 * ldc, vaddq_f32(vld1q_f32(c + 4 + 6 * ldc), \ + vcvt_f32_f16(vget_high_f16(v30)))); \ + \ + vst1q_f32(c + 7 * ldc, vaddq_f32(vld1q_f32(c + 7 * ldc), \ + vcvt_f32_f16(vget_low_f16(v31)))); \ + vst1q_f32(c + 4 + 7 * ldc, vaddq_f32(vld1q_f32(c + 4 + 7 * ldc), \ + vcvt_f32_f16(vget_high_f16(v31)))); + /** * @brief hgemm 8x8 kernel sc = sa * sb * @@ -221,19 +412,19 @@ void hgemm_kernel_8x8(unsigned int M, unsigned int N, unsigned int K, float16x8_t v16, v17, v18, v19, v20, v21, v22, v23; float16x8_t v24, v25, v26, v27, v28, v29, v30, v31; float16x8_t va0, va1, va2, va3, va4, va5, va6, va7; + INIT_KERNEL_8x8(); l = 0; for (; l < K;) { - KERNEL_8x8_ACC8(); - - vst1q_f16(c, vaddq_f16(vld1q_f16(c), v24)); - vst1q_f16(c + ldc, vaddq_f16(vld1q_f16(c + ldc), v25)); - vst1q_f16(c + 2 * ldc, vaddq_f16(vld1q_f16(c + 2 * ldc), v26)); - vst1q_f16(c + 3 * ldc, vaddq_f16(vld1q_f16(c + 3 * ldc), v27)); - vst1q_f16(c + 4 * ldc, vaddq_f16(vld1q_f16(c + 4 * ldc), v28)); - vst1q_f16(c + 5 * ldc, vaddq_f16(vld1q_f16(c + 5 * ldc), v29)); - vst1q_f16(c + 6 * ldc, vaddq_f16(vld1q_f16(c + 6 * ldc), v30)); - vst1q_f16(c + 7 * ldc, vaddq_f16(vld1q_f16(c + 7 * ldc), v31)); + KERNEL_8x8_ACC1(); } + vst1q_f16(c, vaddq_f16(vld1q_f16(c), v24)); + vst1q_f16(c + ldc, vaddq_f16(vld1q_f16(c + ldc), v25)); + vst1q_f16(c + 2 * ldc, vaddq_f16(vld1q_f16(c + 2 * ldc), v26)); + vst1q_f16(c + 3 * ldc, vaddq_f16(vld1q_f16(c + 3 * ldc), v27)); + vst1q_f16(c + 4 * ldc, vaddq_f16(vld1q_f16(c + 4 * ldc), v28)); + vst1q_f16(c + 5 * ldc, vaddq_f16(vld1q_f16(c + 5 * ldc), v29)); + vst1q_f16(c + 6 * ldc, vaddq_f16(vld1q_f16(c + 6 * ldc), v30)); + vst1q_f16(c + 7 * ldc, vaddq_f16(vld1q_f16(c + 7 * ldc), v31)); c += 8; a -= 8 * K; } @@ -263,6 +454,9 @@ void hgemm_kernel_8x8(unsigned int M, 
unsigned int N, unsigned int K, __fp16 *a = sa, *b = sb; float *c = sc; unsigned int i, j, l; + unsigned int K4 = (K >> 2) << 2; + unsigned int K8 = (K >> 3) << 3; + unsigned int K16 = (K >> 4) << 4; for (i = 0; i < M; i += VL_FP16) { for (j = 0; j < N; j += VL_FP16) { __builtin_prefetch(b, 0, 3); @@ -272,48 +466,25 @@ void hgemm_kernel_8x8(unsigned int M, unsigned int N, unsigned int K, float16x8_t v24, v25, v26, v27, v28, v29, v30, v31; float16x8_t va0, va1, va2, va3, va4, va5, va6, va7; l = 0; - - for (; l < K;) { + for (; l < K16;) { + INIT_KERNEL_8x8(); + KERNEL_8x8_ACC16(); + SAVE_KERNEL_8X8_F16_f32(); + } + for (; l < K8;) { + INIT_KERNEL_8x8(); KERNEL_8x8_ACC8(); - - vst1q_f32(c, vaddq_f32(vld1q_f32(c), vcvt_f32_f16(vget_low_f16(v24)))); - vst1q_f32( - c + 4, vaddq_f32(vld1q_f32(c + 4), vcvt_f32_f16(vget_high_f16(v24)))); - - vst1q_f32(c + ldc, vaddq_f32(vld1q_f32(c + ldc), - vcvt_f32_f16(vget_low_f16(v25)))); - vst1q_f32(c + 4 + ldc, vaddq_f32(vld1q_f32(c + 4 + ldc), - vcvt_f32_f16(vget_high_f16(v25)))); - - vst1q_f32(c + 2 * ldc, vaddq_f32(vld1q_f32(c + 2 * ldc), - vcvt_f32_f16(vget_low_f16(v26)))); - vst1q_f32(c + 4 + 2 * ldc, vaddq_f32(vld1q_f32(c + 4 + 2 * ldc), - vcvt_f32_f16(vget_high_f16(v26)))); - - vst1q_f32(c + 3 * ldc, vaddq_f32(vld1q_f32(c + 3 * ldc), - vcvt_f32_f16(vget_low_f16(v27)))); - vst1q_f32(c + 4 + 3 * ldc, vaddq_f32(vld1q_f32(c + 4 + 3 * ldc), - vcvt_f32_f16(vget_high_f16(v27)))); - - vst1q_f32(c + 4 * ldc, vaddq_f32(vld1q_f32(c + 4 * ldc), - vcvt_f32_f16(vget_low_f16(v28)))); - vst1q_f32(c + 4 + 4 * ldc, vaddq_f32(vld1q_f32(c + 4 + 4 * ldc), - vcvt_f32_f16(vget_high_f16(v28)))); - - vst1q_f32(c + 5 * ldc, vaddq_f32(vld1q_f32(c + 5 * ldc), - vcvt_f32_f16(vget_low_f16(v29)))); - vst1q_f32(c + 4 + 5 * ldc, vaddq_f32(vld1q_f32(c + 4 + 5 * ldc), - vcvt_f32_f16(vget_high_f16(v29)))); - - vst1q_f32(c + 6 * ldc, vaddq_f32(vld1q_f32(c + 6 * ldc), - vcvt_f32_f16(vget_low_f16(v30)))); - vst1q_f32(c + 4 + 6 * ldc, vaddq_f32(vld1q_f32(c + 4 + 6 * ldc), - vcvt_f32_f16(vget_high_f16(v30)))); - - vst1q_f32(c + 7 * ldc, vaddq_f32(vld1q_f32(c + 7 * ldc), - vcvt_f32_f16(vget_low_f16(v31)))); - vst1q_f32(c + 4 + 7 * ldc, vaddq_f32(vld1q_f32(c + 4 + 7 * ldc), - vcvt_f32_f16(vget_high_f16(v31)))); + SAVE_KERNEL_8X8_F16_f32(); + } + for (; l < K4;) { + INIT_KERNEL_8x8(); + KERNEL_8x8_ACC4(); + SAVE_KERNEL_8X8_F16_f32(); + } + for (; l < K;) { + INIT_KERNEL_8x8(); + KERNEL_8x8_ACC1(); + SAVE_KERNEL_8X8_F16_f32(); } c += 8; diff --git a/nntrainer/tensor/manager.cpp b/nntrainer/tensor/manager.cpp index 9a0d235ba9..4a2838d05e 100644 --- a/nntrainer/tensor/manager.cpp +++ b/nntrainer/tensor/manager.cpp @@ -407,14 +407,15 @@ std::vector Manager::requestWeights( * order with the max exec order where it will be used for clipping and then * applied to the weight. 
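Stepping back to the hgemm kernel refactor above: every kernel now follows the same three-macro pattern. INIT_KERNEL_* zeroes the FP16 accumulators, a KERNEL_*_ACCn macro accumulates n steps of the K loop in FP16, and SAVE_KERNEL_*_F16_F32 flushes the partial sums into the FP32 output. A larger n (ACC16) means fewer flushes and lower latency but more FP16 rounding error; ACC1 is the opposite extreme. The scalar sketch below only illustrates that trade-off; the function name and loop structure are hypothetical and not part of the hgemm headers.

#include <stddef.h>

/* Scalar analogue of the blocked FP16 accumulation used by the NEON kernels.
 * `acc` plays the role of the ACCn block size: partial sums are kept in FP16
 * for `acc` steps, then added into the FP32 result. */
static void hdot_blocked_ref(const __fp16 *a, const __fp16 *b, float *c,
                             size_t K, size_t acc) {
  size_t l = 0;
  while (l < K) {
    __fp16 partial = (__fp16)0.f;             /* INIT_KERNEL_*          */
    size_t end = (l + acc < K) ? l + acc : K;
    for (; l < end; ++l)
      partial += a[l] * b[l];                 /* KERNEL_*_ACCn          */
    *c += (float)partial;                     /* SAVE_KERNEL_*_F16_F32  */
  }
}

Larger `acc` values amortize the FP32 conversion over more fused multiply-accumulates, which is why the refactored kernels try ACC16 first, then ACC8/ACC4, and finally ACC1 for the K remainder.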
*/ - if (Weight::isGradientClipByGlobalNorm(clip_by_global_norm)) { + if (Weight::isGradientClipByGlobalNorm(clip_by_global_norm) || + isMixedPrecision()) { grad_exec_order.push_back(TensorPool::PERSIST_END_ORDER); // TODO: We need double check if it is OK not to add PERSIST_END_ORDER // here or add other conditions // var_exec_order.push_back(TensorPool::PERSIST_END_ORDER); } - Tensor *var = nullptr, *grad = nullptr; + Tensor *var = nullptr, *grad = nullptr, *var32 = nullptr; bool is_dependent = !shared_names.empty(); if (is_dependent) { /// shared_name is used and the orignal name is discarded @@ -431,6 +432,17 @@ std::vector Manager::requestWeights( grad = tensor_pool.requestOrExtend(shared_name + Var_Grad::grad_suffix, dim_g, grad_exec_order, grad_ls, Tensor::Initializer::ZEROS); + + if (var->getDataType() != ml::train::TensorDim::DataType::FP32) { + TensorDim var32_dim(dim_v); + var32_dim.setDataType(ml::train::TensorDim::DataType::FP32); + std::vector var32_exec_order; + var32_exec_order.push_back(TensorPool::PERSIST_END_ORDER); + + var32 = weight_pool.requestOrExtend(shared_name + ":var32", var32_dim, + var32_exec_order, var_ls, + Tensor::Initializer::ZEROS); + } } } else { /** case requesting fresh weights */ @@ -448,11 +460,21 @@ std::vector Manager::requestWeights( grad = tensor_pool.request(name + Var_Grad::grad_suffix, dim_g, grad_exec_order, grad_ls, Tensor::Initializer::ZEROS, is_wgrad); + if (var->getDataType() != ml::train::TensorDim::DataType::FP32) { + TensorDim var32_dim(dim_v); + var32_dim.setDataType(ml::train::TensorDim::DataType::FP32); + std::vector var32_exec_order; + var32_exec_order.push_back(TensorPool::PERSIST_END_ORDER); + var32 = + weight_pool.request(name + ":var32", var32_dim, var32_exec_order, + var_ls, Tensor::Initializer::ZEROS); + } } } weights_v2.emplace_back(std::make_unique( - var, grad, w_reg, w_reg_const, decay, is_dependent, clip_by_global_norm)); + var, grad, var32, w_reg, w_reg_const, decay, is_dependent, + clip_by_global_norm, axis, loss_scale)); } std::transform(weights_v2.begin() + current_size, weights_v2.end(), @@ -668,15 +690,15 @@ bool Manager::isSecondLastAccess(const std::string &name, */ std::vector Manager::requestWeightOptimizerVariables( const std::vector &dims, const std::string &name, - const TensorLifespan &lifespan, bool is_grad_clip, - Tensor::Initializer initializer) { + const std::string &suffix, const TensorLifespan &lifespan, bool is_grad_clip, + bool is_mixed_precision, Tensor::Initializer initializer) { std::vector ret; ret.reserve(dims.size()); std::vector exec; exec.reserve(1); - if (is_grad_clip) { + if (is_grad_clip || is_mixed_precision) { exec.emplace_back(TensorPool::PERSIST_END_ORDER); } else { exec.emplace_back(getMinMaxTensorExecutionOrder(name, true).second); @@ -685,7 +707,7 @@ std::vector Manager::requestWeightOptimizerVariables( /// @note this is assuming weight optimizer variables is treated as weight, if /// not, there is room to optimize below behavior for (unsigned int idx = 0; idx < dims.size(); idx++) - ret.push_back(weight_pool.request(name + ":opt" + std::to_string(idx), + ret.push_back(weight_pool.request(name + suffix + std::to_string(idx), dims[idx], exec, lifespan, initializer)); return ret; diff --git a/nntrainer/tensor/manager.h b/nntrainer/tensor/manager.h index ab1c018153..d561770206 100644 --- a/nntrainer/tensor/manager.h +++ b/nntrainer/tensor/manager.h @@ -224,7 +224,8 @@ class Manager { */ std::vector requestWeightOptimizerVariables( const std::vector &dims, const std::string &name, - 
const TensorLifespan &lifespan, bool is_grad_clip, + const std::string &suffix, const TensorLifespan &lifespan, + bool is_grad_clip, bool is_mixed_type, Tensor::Initializer initializer = Tensor::Initializer::NONE); /** @@ -494,6 +495,11 @@ class Manager { exec_mode = mode; }; + /** + * @brief return if it is mixed precsion + */ + bool isMixedPrecision() { return !istrequal(tensor_dtype[0], "FP32"); } + private: /** @todo: merge this list to one */ std::vector> weights_v2; /**< weights for the layers diff --git a/nntrainer/tensor/meson.build b/nntrainer/tensor/meson.build index 0884dbd3b4..b14fa0ee85 100644 --- a/nntrainer/tensor/meson.build +++ b/nntrainer/tensor/meson.build @@ -44,6 +44,12 @@ cl_headers = [ arch = host_machine.cpu_family() + +if get_option('enable-avx') + tensor_sources += 'blas_avx.cpp' + tensor_headers += 'blas_avx.h' +endif + if get_option('enable-fp16') if arch == 'arm' error ('FP16/ARM code (blas_neon.cpp) uses armv8.2 instructions. armv7 is not supported.') @@ -55,9 +61,6 @@ if get_option('enable-fp16') nntrainer_inc += include_directories('hgemm') nntrainer_inc_abs += meson.current_source_dir() / 'hgemm' endif - elif get_option('enable-avx') - tensor_sources += 'blas_avx.cpp' - tensor_headers += 'blas_avx.h' endif endif diff --git a/nntrainer/tensor/tensor.cpp b/nntrainer/tensor/tensor.cpp index 4f1e8e0721..827ba7e979 100644 --- a/nntrainer/tensor/tensor.cpp +++ b/nntrainer/tensor/tensor.cpp @@ -3065,6 +3065,18 @@ Tensor Tensor::clone() const { return t; } +Tensor Tensor::clone(ml::train::TensorDim::DataType type) const { + if (getDataType() == type) + return clone(); + + TensorDim dim = getDim(); + dim.setDataType(type); + Tensor t(dim, true); + t.copyData(*this); + t.name = name; + return t; +} + void Tensor::reshape(const TensorDim &d) { NNTR_THROW_IF(!contiguous, std::invalid_argument) @@ -3808,6 +3820,18 @@ void Tensor::dequantize(Tensor &output, unsigned int axis) const { return; } +bool Tensor::isValid() const { + if (getDataType() == Tdatatype::FP16) { +#ifdef ENABLE_FP16 + return is_valid(dim.getDataLen(), Tdatatype::FP16, getData<_FP16>()); +#else + throw std::invalid_argument("enble-fp16 is not set"); +#endif + } else { + return is_valid(dim.getDataLen(), Tdatatype::FP32, getData()); + } +} + // namespace nntrainer } /* namespace nntrainer */ diff --git a/nntrainer/tensor/tensor.h b/nntrainer/tensor/tensor.h index 211334da40..ad3781526f 100644 --- a/nntrainer/tensor/tensor.h +++ b/nntrainer/tensor/tensor.h @@ -1680,6 +1680,13 @@ class Tensor { */ Tensor clone() const; + /** + * @brief Convient wrapper for inplace copy of @a this. 
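For context on the clone overload declared just below: it copies into a tensor of a different data type, which the mixed-precision code uses to materialize FP32 copies of FP16 weights. A minimal usage sketch follows (hypothetical helper, assuming copyData performs the FP16 to FP32 conversion as the tensor.cpp change above suggests):

#include <tensor.h> // nntrainer

// Return an FP32 copy of a (possibly FP16) weight tensor via the new overload.
nntrainer::Tensor toFP32(const nntrainer::Tensor &w) {
  return w.clone(ml::train::TensorDim::DataType::FP32);
}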
+ * @param[in] type output tensor data type + * @retval Copied version of this + */ + Tensor clone(ml::train::TensorDim::DataType type) const; + /** * @brief Save the Tensor into file * @param[in] file output file stream @@ -2031,6 +2038,12 @@ class Tensor { static constexpr float epsilon = 1e-5; + /** + * @brief check if there is NaN or Inf element + * @param[out] bool false if there is NaN or Inf else false + */ + bool isValid() const; + private: /**< handle the data as a std::shared_ptr type */ TensorDim dim; diff --git a/nntrainer/tensor/weight.cpp b/nntrainer/tensor/weight.cpp index f98c8c8356..ea8c65a7cb 100644 --- a/nntrainer/tensor/weight.cpp +++ b/nntrainer/tensor/weight.cpp @@ -34,6 +34,28 @@ Weight::Weight(const TensorDim &dim, const Tensor::Initializer init, throw std::invalid_argument("Weight initializer cannot be none"); if (regularizer == WeightRegularizer::UNKNOWN) throw std::invalid_argument("Weight regularizer unknown"); + + std::string var32_suffix = ":fp32"; + std::string var32_name = name + var32_suffix; + + /** + * @note We assume if the Weight Data Type is not FP32, then FP32 Weight is + * necessary to maintain the accuracy. + * We could think it can be other data type and if there is the case to + * support other data type, then the code below needs to be udpated. + * + * Also, the loss_scale is not used in Weight but leave as it is for later + * usage. + */ + + if (train && dim.getDataType() != ml::train::TensorDim::DataType::FP32) { + TensorDim var32_dim(dim); + var32_dim.setDataType(ml::train::TensorDim::DataType::FP32); + + var32 = std::make_shared(var32_dim, alloc_now_, init, var32_name); + } else { + var32 = std::make_shared(var32_name); + } } Weight::Weight(const TensorDim &dim_v, const TensorDim &dim_g, @@ -52,6 +74,93 @@ Weight::Weight(const TensorDim &dim_v, const TensorDim &dim_g, throw std::invalid_argument("Weight initializer cannot be none"); if (regularizer == WeightRegularizer::UNKNOWN) throw std::invalid_argument("Weight regularizer unknown"); + + std::string var32_suffix = ":fp32"; + std::string var32_name = name + var32_suffix; + + if (train && dim_v.getDataType() != ml::train::TensorDim::DataType::FP32) { + TensorDim var32_dim(dim_v); + var32_dim.setDataType(ml::train::TensorDim::DataType::FP32); + std::string var32_suffix = ":fp32"; + std::string var32_name = name + var32_suffix; + + var32 = std::make_shared(var32_dim, alloc_now_, init, var32_name); + } else { + var32 = std::make_shared(var32_name); + } +} + +Weight::Weight(const Tensor &v, const Tensor &g, const Tensor &v32, + const std::string &n, bool is_dependent, + unsigned int output_axis_) : + Var_Grad(v, g, n, is_dependent), + regularizer(WeightRegularizer::NONE), + regularizer_constant(1.0f), + decay(0.0f), + clip_by_global_norm(0.0f), + output_axis(output_axis_), + loss_scale(1.0), + var32(std::make_shared(n + ":fp32")) { + + if (!g.empty() && isMixedPrecision()) { + TensorDim var32_dim(v.getDim()); + var32_dim.setDataType(ml::train::TensorDim::DataType::FP32); + if (!v32.empty()) + var32 = std::make_shared( + v32.getSharedDataTensor(var32_dim, 0, false, n + ":fp32")); + } +} + +Weight::Weight(Tensor *v, Tensor *g, Tensor *v32, const WeightRegularizer reg, + const float reg_const, const float decay, bool is_dependent, + const float max_norm, unsigned int output_axis_, + float loss_scale_) : + Var_Grad(v, g, is_dependent), + regularizer(reg), + regularizer_constant(reg_const), + decay(decay), + clip_by_global_norm(max_norm), + output_axis(output_axis_), + loss_scale(loss_scale_), + 
var32(std::shared_ptr(v32, [](void *) {})) { + if (!v32) + var32 = std::make_shared(); +} + +void Weight::applyGradient(double lr, Tensor &updated_grad) { + if (isMixedPrecision() && + updated_grad.getDataType() == ml::train::TensorDim::DataType::FP32) { + var32->add_i(updated_grad, -lr); + quantizeWeight(); + return; + } + + return applyGradient(lr); +} + +void Weight::quantizeWeight() { + if (!isMixedPrecision()) + return; + + Tensor &var = getVariableRef(); + ml::train::TensorDim::DataType type = var.getDataType(); + switch (type) { + case ml::train::TensorDim::DataType::QINT4: + // NYI + break; + case ml::train::TensorDim::DataType::QINT8: + // NYI + break; + case ml::train::TensorDim::DataType::FP16: + getVariableRef().copyData(getVariableFP32Ref()); + break; + case ml::train::TensorDim::DataType::FP32: + break; + default: + break; + } + + return; } } // namespace nntrainer diff --git a/nntrainer/tensor/weight.h b/nntrainer/tensor/weight.h index 552f6d5739..ef65ca9318 100644 --- a/nntrainer/tensor/weight.h +++ b/nntrainer/tensor/weight.h @@ -46,7 +46,7 @@ class Weight : public Var_Grad { decay(0.0f), clip_by_global_norm(0.0f), output_axis(3), - loss_scale(0.0) {} + loss_scale(1.0) {} /** * @brief Construct a new Weight object @@ -66,7 +66,7 @@ class Weight : public Var_Grad { const float reg_const = 1.0f, const float decay = 0.0f, const float clip_by_global_norm = 0.0f, bool ng = true, bool alloc_now = false, std::string name = "", unsigned int axis = 3, - float loss_scale_ = 0.0); + float loss_scale_ = 1.0); /** * @brief Construct a new Weight object @@ -87,7 +87,7 @@ class Weight : public Var_Grad { const float reg_const = 1.0f, const float decay = 0.0f, const float clip_by_global_norm = 0.0f, bool ng = true, bool alloc_now = false, std::string name = "", unsigned int axis = 3, - float loss_scale_ = 0.0); + float loss_scale_ = 1.0); /** * @brief Construct a new Weight object @@ -114,6 +114,7 @@ class Weight : public Var_Grad { * * @param v Already created variable object * @param g Already created gradient object + * @param v32 Already created gradient object * @param n Name for this Weight * * @note This is primarily used to created wrapper of variable extracted from @@ -123,35 +124,24 @@ class Weight : public Var_Grad { * uses only, as Weight does not own the tensors v and g, and can go invalid * if the owner of these tensors free the tensors. 
*/ - explicit Weight(const Tensor &v, const Tensor &g, const std::string &n = "", - bool is_dependent = false, unsigned int output_axis_ = 3) : - Var_Grad(v, g, n, is_dependent), - regularizer(WeightRegularizer::NONE), - regularizer_constant(1.0f), - decay(0.0f), - clip_by_global_norm(0.0f), - output_axis(output_axis_), - loss_scale(0.0) {} + explicit Weight(const Tensor &v, const Tensor &g, const Tensor &v32, + const std::string &n = "", bool is_dependent = false, + unsigned int output_axis_ = 3); /** * @brief Construct a new Weight object * * @param v ptr to already created variable tensor * @param g ptr to already created gradient tensor + * @param v32 ptr to already created variable32 tensor * @param reg Regularizer for the weight * @param reg_const Constant multiplier for regularizer */ - explicit Weight(Tensor *v, Tensor *g, const WeightRegularizer reg, - const float reg_const, const float decay, - bool is_dependent = false, const float max_norm = 0.0f, - unsigned int output_axis_ = 3, float loss_scale_ = 0.0f) : - Var_Grad(v, g, is_dependent), - regularizer(reg), - regularizer_constant(reg_const), - decay(decay), - clip_by_global_norm(max_norm), - output_axis(output_axis_), - loss_scale(loss_scale_) {} + explicit Weight(Tensor *v, Tensor *g, Tensor *v32, + const WeightRegularizer reg, const float reg_const, + const float decay, bool is_dependent = false, + const float max_norm = 0.0f, unsigned int output_axis_ = 3, + float loss_scale_ = 1.0f); /** * @brief Swap for weight @@ -170,6 +160,7 @@ class Weight : public Var_Grad { swap(lhs.output_axis, rhs.output_axis); swap(lhs.opt_vars, rhs.opt_vars); swap(lhs.loss_scale, rhs.loss_scale); + swap(lhs.var32, rhs.var32); } /** @@ -213,6 +204,8 @@ class Weight : public Var_Grad { w.var = std::make_shared(this->var->clone()); if (!this->grad->empty()) w.grad = std::make_shared(this->grad->clone()); + if (!this->var32->empty()) + w.var32 = std::make_shared(this->var32->clone()); return w; } @@ -294,6 +287,13 @@ class Weight : public Var_Grad { */ void applyGradient(double lr) { var->add_i(*grad.get(), -lr); } + /** + * @brief Apply the gradient to the weight with updated gradient + * @param[in] updated_grad gradient tensor which is updated in optimizer + * it might be different data type with gradient in weight. 
e.g., FP32 + */ + void applyGradient(double lr, Tensor &updated_grad); + /** * @brief Check if the gradient is supposed to be clipped by global norm with * the given max_norm value @@ -316,6 +316,16 @@ class Weight : public Var_Grad { return clip_by_global_norm > epsilon; } + /** + * @brief Check if the variable type is not full precision + * + * @return true if it is not full precision + * @return false otherwise + */ + bool isMixedPrecision() const { + return ((var->getDataType() != ml::train::TensorDim::DataType::FP32)); + } + /** * @brief clip the gradient value based on the given global norm * @@ -326,6 +336,32 @@ class Weight : public Var_Grad { grad->multiply_i(clip_by_global_norm / (global_norm + epsilon)); } + /** + * @brief Get the variable FP32 tensor (by reference) + * + * @return Tensor Variable FP32 tensor + */ + Tensor &getVariableFP32Ref() { return *var32.get(); } + + /** + * @brief Quantize var32 to var + * + */ + void quantizeWeight(); + + /** + * @brief set loss scale + * @param[in] scale loss scale value + * + */ + void setLossScale(float scale) { loss_scale = scale; }; + + /** + * @brief get loss scale + * + */ + const float getLossScale() { return loss_scale; }; + private: static constexpr float epsilon = 1e-6; /**< epsilon for zero comparison */ static constexpr float epsilon_decay = @@ -337,7 +373,8 @@ class Weight : public Var_Grad { float clip_by_global_norm; /**< constant factor to clip gradient by L2 norm */ unsigned int output_axis; float loss_scale; - std::vector opt_vars; /**< optimizer variables */ + std::vector + opt_vars; /**< optimizer variables : We assume it is always full-precision */ std::shared_ptr var32; /** diff --git a/packaging/nntrainer.spec b/packaging/nntrainer.spec index 36ba371d22..2f1dc57f68 100644 --- a/packaging/nntrainer.spec +++ b/packaging/nntrainer.spec @@ -65,6 +65,13 @@ %define neon_support -Denable-neon=false %endif # arch aarch64 +%ifarch x86_64 +%define enable_avx 1 +%define avx_support -Denable-avx=true +%else +%define avx_support -Denable-avx=false +%endif # arch x86_64 + Name: nntrainer Summary: Software framework for training neural networks @@ -410,7 +417,7 @@ meson --buildtype=plain --prefix=%{_prefix} --sysconfdir=%{_sysconfdir} \ %{enable_reduce_tolerance} %{configure_subplugin_install_path} %{enable_debug} \ -Dml-api-support=enabled -Denable-nnstreamer-tensor-filter=enabled \ -Denable-nnstreamer-tensor-trainer=enabled -Denable-capi=enabled \ - %{fp16_support} %{neon_support} build + %{fp16_support} %{neon_support} %{avx_support} build ninja -C build %{?_smp_mflags} @@ -563,6 +570,10 @@ cp -r result %{buildroot}%{_datadir}/nntrainer/unittest/ %{_includedir}/nntrainer/util_simd_neon.h %endif +%if 0%{?enable_avx} +%{_includedir}/nntrainer/blas_avx.h +%endif + %files devel-static %{_libdir}/libnntrainer*.a %exclude %{_libdir}/libcapi*.a diff --git a/packaging/unittest_layers.tar.gz b/packaging/unittest_layers.tar.gz index 7a435aadf4..3bd488a0a2 100644 Binary files a/packaging/unittest_layers.tar.gz and b/packaging/unittest_layers.tar.gz differ diff --git a/packaging/unittest_models_v3.tar.gz b/packaging/unittest_models_v3.tar.gz index abc7ead4a4..49a1f1b2ad 100644 Binary files a/packaging/unittest_models_v3.tar.gz and b/packaging/unittest_models_v3.tar.gz differ diff --git a/test/include/nntrainer_test_util.h b/test/include/nntrainer_test_util.h index 74eef4abaa..8e16b6a9f4 100644 --- a/test/include/nntrainer_test_util.h +++ b/test/include/nntrainer_test_util.h @@ -347,6 +347,29 @@ float mse(Ta *A, Tb *B, uint32_t size) { return mse; } +/** + *
@brief calculate mean squared errer + * + * @param A const prediction data + * @param B const reference data + * @param size data size + * @return mean squared errer value + */ +template +float mse(const Ta *A, const Tb *B, uint32_t size) { + float pred; + float ref; + float mse_error = 0; + for (uint32_t i = 0; i < size; i++) { + pred = A[i]; + ref = B[i]; + float diff = pred - ref; + mse_error += pow(diff, 2); + } + float mse = mse_error / size; + return mse; +} + /** * @brief A helper struct for performing static_cast operations on types. * diff --git a/test/input_gen/genModelTests_v2.py b/test/input_gen/genModelTests_v2.py index a56f437785..422c737487 100644 --- a/test/input_gen/genModelTests_v2.py +++ b/test/input_gen/genModelTests_v2.py @@ -11,6 +11,7 @@ import math from recorder_v2 import record_v2, inspect_file, _rand_like import torch +from torch import autocast class ReduceMeanLast(torch.nn.Module): def __init__(self): @@ -307,6 +308,40 @@ def forward(self, inputs, labels): loss = self.loss(out, labels[0]) return out, loss +class LinearMixedPrecision(torch.nn.Module): + def __init__(self): + super().__init__() + self.fc = torch.nn.Linear(3, 10) + self.loss = torch.nn.MSELoss() + + def forward(self, inputs, labels): + with autocast(device_type='cuda', dtype=torch.float16): + input=inputs[0].to('cuda') + label=labels[0].to('cuda') + out = self.fc(input) + return out + + def getOptimizer(self): + return torch.optim.Adam(self.parameters(), lr=0.1) + +class LinearMixedPrecisionNaNSGD(torch.nn.Module): + def __init__(self): + super().__init__() + self.fc0 = torch.nn.Linear(1, 1) + self.fc1 = torch.nn.Linear(1, 1) + self.loss = torch.nn.MSELoss() + + def forward(self, inputs, labels): + with autocast(device_type='cuda', dtype=torch.float16): + input=inputs[0].to('cuda') + label=labels[0].to('cuda') + out = self.fc0(input) + out = self.fc1(out) + return out + + def getOptimizer(self): + return torch.optim.SGD(self.parameters(), lr=0.1) + if __name__ == "__main__": record_v2( ReduceMeanLast(), @@ -537,5 +572,28 @@ def forward(self, inputs, labels): name="non_trainable_fc_idx3" ) - # Function to check the created golden test file + fc_mixed_training = LinearMixedPrecision() + record_v2( + fc_mixed_training, + iteration=3, + input_dims=[(1,3)], + input_dtype=[float], + label_dims=[(1,10)], + name="fc_mixed_training", + optimizer=fc_mixed_training.getOptimizer() + ) + + fc_mixed_training_nan_sgd = LinearMixedPrecisionNaNSGD() + record_v2( + fc_mixed_training_nan_sgd, + iteration=5, + input_dims=[(1,1)], + input_dtype=[float], + label_dims=[(1,1)], + name="fc_mixed_training_nan_sgd", + optimizer=fc_mixed_training_nan_sgd.getOptimizer() + ) + +# Function to check the created golden test file inspect_file("non_trainable_fc_idx3.nnmodelgolden") + diff --git a/test/input_gen/gen_layer_tests.py b/test/input_gen/gen_layer_tests.py index 48e68acaf1..7a1ed18ec6 100644 --- a/test/input_gen/gen_layer_tests.py +++ b/test/input_gen/gen_layer_tests.py @@ -17,6 +17,7 @@ @author Jihoon Lee @author Sungsik Kong +@author Debadri Samaddar """ import warnings @@ -866,3 +867,19 @@ def call(self, inputs): added = K.layers.Add() record_single_fp16(added, [(2, 3, 3, 3), (2, 3, 3, 3)], "added_w16a16") + + def swiglu(inputs): + [x, y] = inputs + # swish(x) = x * sigmoid(x) + swishTensor = x * K.activations.sigmoid(x) + + return K.layers.Multiply()([swishTensor, y]) + + swiglu_layer = K.layers.Lambda(swiglu) + + record_single( + swiglu_layer, + [(2, 3, 3, 3), (2, 3, 3, 3)], + "swiglu", + input_type="float", + ) diff 
--git a/test/input_gen/recorder_v2.py b/test/input_gen/recorder_v2.py index 9bc219c767..6b8f42ff88 100644 --- a/test/input_gen/recorder_v2.py +++ b/test/input_gen/recorder_v2.py @@ -12,6 +12,8 @@ import random import torch # torch used here is torch==1.9.1 import numpy as np +import torch.cuda.amp as amp +from torch import autocast from transLayer_v2 import params_translated @@ -29,13 +31,31 @@ def _get_writer(file): - def write_fn(items): + def write_fn(items, type = 'float32'): if not isinstance(items, (list, tuple)): items = [items] for item in items: - np.array([item.numel()], dtype="int32").tofile(file) - item.detach().cpu().numpy().tofile(file) + print(item.numel(), " -0-----") + print(item) + np.array([item.numel()], dtype='int32').tofile(file) + a=np.array(item.detach().cpu(), dtype=type) + a.tofile(file) + print(a.dtype) + + return items + + return write_fn + +def _get_writer_mixed(file): + def write_fn(items, num_type = 'int32', type = 'float32'): + if not isinstance(items, (list, tuple)): + items = [items] + + for item in items: + np.array([item.numel()], dtype=num_type).tofile(file) + a=np.array(item.detach().cpu(), dtype=type) + a.tofile(file) return items @@ -96,14 +116,65 @@ def record_iteration(write_fn): norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 0.0001) optimizer.step() + def record_iteration_with_amp(write_fn, inputs, labels, is_nan, scaler): + model_= model.cuda() + + print(inputs[0], " inputs inside") + output = model_(inputs[0], labels[0]) + + print("model output type: ",output.dtype) + + with autocast(device_type='cuda', dtype=torch.float16): + l=model_.loss(output, labels[0].to('cuda')) + + optimizer.zero_grad() + + scaler.scale(l).backward() + print("Gradient ---------------") + for param in model_.parameters(): + print (param.grad) + mask = torch.isnan(param.grad) or torch.isinf(param.grad) + check_nan = mask.int() + if check_nan.sum().item(): + is_nan = True + else: + is_nan = False + + + if not is_nan: + print("------------------------------- not nan") + write_fn(output,'int32','float32') + return output, is_nan + with open(file_name, "wb") as f: # write number of iterations + print("iteration : ", iteration) np.array([iteration], dtype="int32").tofile(f) - write_fn = _get_writer(f) - for _ in range(iteration): - record_iteration(write_fn) - + write_fn = _get_writer_mixed(f) + for i in range(iteration): + if input_label_reader != None: + inputs, labels = input_label_reader(input_dims, label_dims, input_dtype) + else: + inputs = _rand_like(input_dims, dtype=input_dtype if input_dtype is not None else float) + labels = _rand_like(label_dims, dtype=float) + print("inputs ==============") + write_fn(inputs,'int32', 'float32') + print("labels ==============") + write_fn(labels, 'int32', 'float32') + is_nan = True; + print("=========================== ", i) + scaler = amp.GradScaler() + print("weights ==============") + write_fn(list(t for _, t in params_translated(model)),'int16','float16') + print("\n\n") + while(is_nan): + print( "before is_nan_", is_nan) + output,is_nan_ = record_iteration_with_amp(write_fn, inputs, labels, is_nan, scaler) + is_nan = is_nan_ + print( "after is_nan_", is_nan) + scaler.step(optimizer) + scaler.update() ## # @brief inpsect if file is created correctly diff --git a/test/jni/Android.mk b/test/jni/Android.mk index a9033b65cc..978e98bd67 100644 --- a/test/jni/Android.mk +++ b/test/jni/Android.mk @@ -16,6 +16,7 @@ NNTRAINER_INCLUDES := $(NNTRAINER_ROOT)/nntrainer \ $(NNTRAINER_ROOT)/nntrainer/dataset \ 
$(NNTRAINER_ROOT)/nntrainer/models \ $(NNTRAINER_ROOT)/nntrainer/layers \ + $(NNTRAINER_ROOT)/nntrainer/layers/cl_layers \ $(NNTRAINER_ROOT)/nntrainer/compiler \ $(NNTRAINER_ROOT)/nntrainer/graph \ $(NNTRAINER_ROOT)/nntrainer/opencl \ @@ -442,6 +443,7 @@ LOCAL_SRC_FILES := \ ../unittest/layers/unittest_layers_impl.cpp \ ../unittest/layers/unittest_layers_input.cpp \ ../unittest/layers/unittest_layers_loss.cpp \ + ../unittest/layers/unittest_layers_fully_connected_cl.cpp \ ../unittest/layers/unittest_layers_fully_connected.cpp \ ../unittest/layers/unittest_layers_batch_normalization.cpp \ ../unittest/layers/unittest_layers_layer_normalization.cpp \ diff --git a/test/nntrainer_test_util.cpp b/test/nntrainer_test_util.cpp index bcc33e40c8..5777bb75b2 100644 --- a/test/nntrainer_test_util.cpp +++ b/test/nntrainer_test_util.cpp @@ -332,6 +332,7 @@ void sizeCheckedReadTensor(nntrainer::Tensor &t, std::ifstream &file, nntrainer::checkedRead(file, (char *)&sz, sizeof(unsigned)); } else if (t.getDataType() == ml::train::TensorDim::DataType::FP16) { #ifdef ENABLE_FP16 + // This needs to be fixed. sz is always unsinged int type. nntrainer::checkedRead(file, (char *)&sz, sizeof(_FP16)); #else throw std::invalid_argument("Error: enable-fp16 is not enabled"); diff --git a/test/unittest/layers/layers_common_tests.h b/test/unittest/layers/layers_common_tests.h index 57f693c0a2..d63357c805 100644 --- a/test/unittest/layers/layers_common_tests.h +++ b/test/unittest/layers/layers_common_tests.h @@ -93,6 +93,7 @@ class LayerPropertySemantics : public LayerSemantics {}; typedef enum { SKIP_CALC_GRAD = 1 << 0, /**< skip calculating gradient and compare */ SKIP_CALC_DERIV = 1 << 1, /**< skip calculating derivative and compare */ + USE_INC_FORWARD = 1 << 2, /**< use incremental forwarding and compare */ FORWARD_MODE_INFERENCE = 1 << 2, /**< set if layer should be forwarded with inference mode */ @@ -172,6 +173,14 @@ class LayerGoldenTest */ bool shouldSkipCalcGrad(); + /** + * @brief check if given test suite should use incremental forwarding instead + * of normal forwarding + * + * @return bool true if should use incremental forwarding + */ + bool shouldUseIncForward(); + /** * @brief check if given test suite should skip cosine similarity check * diff --git a/test/unittest/layers/layers_golden_tests.cpp b/test/unittest/layers/layers_golden_tests.cpp index 64400e6ecd..73f3954052 100644 --- a/test/unittest/layers/layers_golden_tests.cpp +++ b/test/unittest/layers/layers_golden_tests.cpp @@ -156,7 +156,7 @@ static RunLayerContext prepareRunContext(const TensorPacks &packs) { }; auto rc = - RunLayerContext("golden", true, 0.0f, false, create_view(weights), + RunLayerContext("golden", true, 0.0f, false, 1.0, create_view(weights), create_view(ins), create_view(outs), create_view(tensors)); auto num_outputs = rc.getNumOutputs(); @@ -364,6 +364,11 @@ bool LayerGoldenTest::shouldSkipCalcGrad() { LayerGoldenTestParamOptions::SKIP_CALC_GRAD; } +bool LayerGoldenTest::shouldUseIncForward() { + return std::get(GetParam()) & + LayerGoldenTestParamOptions::USE_INC_FORWARD; +} + bool LayerGoldenTest::shouldSkipCosineSimilarity() { return std::get(GetParam()) & LayerGoldenTestParamOptions::SKIP_COSINE_SIMILARITY; @@ -387,15 +392,31 @@ TEST_P(LayerGoldenTest, run) { bool skip_calc_grad = shouldSkipCalcGrad(); bool skip_calc_deriv = shouldSkipCalcDeriv(); + bool use_inc_forward = shouldUseIncForward(); bool dropout_compare_60_percent = shouldMatchDropout60Percent(); bool skip_cos_sim = shouldSkipCosineSimilarity(); + Tensor &input 
= rc.getInput(0); + TensorDim input_dim = input.getDim(); + size_t inputHeight = input_dim.height(); + for (int i = 0; i < 4; ++i) { /// warm layer multiple times + if (use_inc_forward) { + layer->incremental_forwarding(rc, 0, inputHeight, + !shouldForwardWithInferenceMode()); + } else { + layer->forwarding(rc, !shouldForwardWithInferenceMode()); + } + } + + if (use_inc_forward) { + layer->incremental_forwarding(rc, 0, inputHeight, + !shouldForwardWithInferenceMode()); + } else { layer->forwarding(rc, !shouldForwardWithInferenceMode()); } - layer->forwarding(rc, !shouldForwardWithInferenceMode()); if (!skip_calc_grad) { layer->calcGradient(rc); } diff --git a/test/unittest/layers/unittest_layer_node.cpp b/test/unittest/layers/unittest_layer_node.cpp index 3b41f02f30..37287f7ce5 100644 --- a/test/unittest/layers/unittest_layer_node.cpp +++ b/test/unittest/layers/unittest_layer_node.cpp @@ -131,7 +131,7 @@ TEST(nntrainer_LayerNode, finalize_05_n) { nntrainer::createLayerNode(nntrainer::IdentityLayer::type)); EXPECT_NO_THROW(lnode->setProperty({"input_shape=1:1:1", "name=abc"})); EXPECT_NO_THROW(lnode->finalize()); - EXPECT_NO_THROW(lnode->configureRunContext({}, {&input}, {}, {})); + EXPECT_NO_THROW(lnode->configureRunContext({}, {&input}, {}, {}, 1.0)); EXPECT_THROW(lnode->finalize(), std::runtime_error); } @@ -298,7 +298,7 @@ TEST(nntrainer_LayerNode, setWeights_02_n) { EXPECT_NO_THROW(lnode = nntrainer::createLayerNode(nntrainer::IdentityLayer::type)); EXPECT_NO_THROW(lnode->setProperty({"input_shape=1:1:1", "name=abc"})); - EXPECT_NO_THROW(lnode->configureRunContext({&weight}, {&input}, {}, {})); + EXPECT_NO_THROW(lnode->configureRunContext({&weight}, {&input}, {}, {}, 1.0)); EXPECT_THROW(lnode->setWeights(new_weights), std::runtime_error); } diff --git a/test/unittest/layers/unittest_layers_convolution2d.cpp b/test/unittest/layers/unittest_layers_convolution2d.cpp index 724c79079b..92d9c593e7 100644 --- a/test/unittest/layers/unittest_layers_convolution2d.cpp +++ b/test/unittest/layers/unittest_layers_convolution2d.cpp @@ -198,3 +198,185 @@ GTEST_PARAMETER_TEST( conv2d_mb_valid_drop_last, conv2d_sb_no_overlap, conv2d_mb_no_overlap, conv2d_sb_1x1_kernel, conv2d_mb_1x1_kernel, conv2d_sb_dilation, conv2d_mb_dilation, conv2d_sb_same_dilation, conv2d_mb_same_dilation)); + +#ifdef ENABLE_FP16 +auto conv2d_sb_minimum_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + {"filters=3", "kernel_size=2,2"}, "1:1:4:4", + "conv2d_sb_minimum_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_mb_minimum_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + {"filters=3", "kernel_size=2,2"}, "3:1:4:4", + "conv2d_mb_minimum_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_sb_same_remain_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + {"filters=2", "kernel_size=3,3", "padding=same"}, "1:1:4:4", + "conv2d_sb_same_remain_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_mb_same_remain_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + {"filters=2", "kernel_size=3,3", "padding=same"}, "3:1:4:4", + "conv2d_mb_same_remain_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_sb_same_uneven_remain_1_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "stride=2,2", + "padding=same", + }, + "1:3:4:4", 
"conv2d_sb_same_uneven_remain_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_sb_same_uneven_remain_2_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "stride=2,2", + "padding=0,1,0,1", + }, + "1:3:4:4", "conv2d_sb_same_uneven_remain_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_mb_same_uneven_remain_1_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "stride=2,2", + "padding=same", + }, + "3:3:4:4", "conv2d_mb_same_uneven_remain_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_mb_same_uneven_remain_2_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "stride=2,2", + "padding=0,1,0,1", + }, + "3:3:4:4", "conv2d_mb_same_uneven_remain_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_sb_valid_drop_last_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "stride=2,2", + "padding=valid", + }, + "1:3:7:7", "conv2d_sb_valid_drop_last_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_mb_valid_drop_last_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "stride=2,2", + "padding=valid", + }, + "3:3:7:7", "conv2d_mb_valid_drop_last_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_sb_no_overlap_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + {"filters=3", "kernel_size=2,2", "stride=3,3"}, "1:2:5:5", + "conv2d_sb_no_overlap_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_mb_no_overlap_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=3", + "kernel_size=2,2", + "stride=3,3", + }, + "3:2:5:5", "conv2d_mb_no_overlap_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_sb_1x1_kernel_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + {"filters=3", "kernel_size=1,1", "stride=2,2"}, "1:2:5:5", + "conv2d_sb_1x1_kernel_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_mb_1x1_kernel_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=3", + "kernel_size=1,1", + "stride=2,2", + }, + "3:2:5:5", "conv2d_mb_1x1_kernel_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_sb_dilation_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "dilation=2,2", + }, + "1:3:11:11", "conv2d_sb_dilation_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_mb_dilation_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "dilation=2,2", + }, + "3:3:11:11", "conv2d_mb_dilation_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_sb_same_dilation_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "padding=same", + "dilation=2,2", + }, + "1:3:11:11", "conv2d_sb_same_dilation_w16a16.nnlayergolden", + 
LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +auto conv2d_mb_same_dilation_w16a16 = LayerGoldenTestParamType( + nntrainer::createLayer, + { + "filters=2", + "kernel_size=3,3", + "padding=same", + "dilation=2,2", + }, + "3:3:11:11", "conv2d_mb_same_dilation_w16a16.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp16", "fp16"); + +GTEST_PARAMETER_TEST( + Convolution2D16, LayerGoldenTest, + ::testing::Values(conv2d_sb_minimum_w16a16, conv2d_mb_minimum_w16a16, + conv2d_sb_same_remain_w16a16, conv2d_mb_same_remain_w16a16, + conv2d_sb_same_uneven_remain_1_w16a16, + conv2d_sb_same_uneven_remain_2_w16a16, + conv2d_mb_same_uneven_remain_1_w16a16, + conv2d_mb_same_uneven_remain_2_w16a16, + conv2d_sb_valid_drop_last_w16a16, + conv2d_mb_valid_drop_last_w16a16, + conv2d_sb_no_overlap_w16a16, conv2d_mb_no_overlap_w16a16, + conv2d_sb_1x1_kernel_w16a16, conv2d_mb_1x1_kernel_w16a16, + conv2d_sb_dilation_w16a16, conv2d_mb_dilation_w16a16, + conv2d_sb_same_dilation_w16a16, + conv2d_mb_same_dilation_w16a16)); +#endif diff --git a/test/unittest/layers/unittest_layers_fully_connected_cl.cpp b/test/unittest/layers/unittest_layers_fully_connected_cl.cpp new file mode 100644 index 0000000000..07bb138272 --- /dev/null +++ b/test/unittest/layers/unittest_layers_fully_connected_cl.cpp @@ -0,0 +1,68 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Debadri Samaddar + * + * @file unittest_layers_fully_connected_cl.cpp + * @date 7 June 2024 + * @brief Fully Connected Layer Test + * @see https://github.com/nnstreamer/nntrainer + * @author Debadri Samaddar + * @bug No known bugs except for NYI items + */ +#include + +#include + +#include +#include + +auto semantic_fc_gpu = LayerSemanticsParamType( + nntrainer::createLayer, + nntrainer::FullyConnectedLayerCl::type, {"unit=1"}, + LayerCreateSetPropertyOptions::AVAILABLE_FROM_APP_CONTEXT, false, 1); + +GTEST_PARAMETER_TEST(FullyConnectedGPU, LayerSemantics, + ::testing::Values(semantic_fc_gpu)); + +auto fc_gpu_plain = LayerGoldenTestParamType( + nntrainer::createLayer, {"unit=5"}, + "3:1:1:10", "fc_plain.nnlayergolden", LayerGoldenTestParamOptions::DEFAULT, + "nchw", "fp32", "fp32"); +auto fc_gpu_single_batch = LayerGoldenTestParamType( + nntrainer::createLayer, {"unit=4"}, + "1:1:1:10", "fc_single_batch.nnlayergolden", + LayerGoldenTestParamOptions::DEFAULT, "nchw", "fp32", "fp32"); +auto fc_gpu_no_decay = LayerGoldenTestParamType( + nntrainer::createLayer, + {"unit=5", "weight_decay=0.0", "bias_decay=0.0"}, "3:1:1:10", + "fc_plain.nnlayergolden", LayerGoldenTestParamOptions::DEFAULT, "nchw", + "fp32", "fp32"); + +auto fc_gpu_plain_nhwc = LayerGoldenTestParamType( + nntrainer::createLayer, {"unit=5"}, + "3:10:1:1", "fc_plain.nnlayergolden", + LayerGoldenTestParamOptions::SKIP_CALC_DERIV | + LayerGoldenTestParamOptions::SKIP_CALC_GRAD | + LayerGoldenTestParamOptions::USE_INC_FORWARD, + "nhwc", "fp32", "fp32"); + +auto fc_gpu_single_batch_nhwc = LayerGoldenTestParamType( + nntrainer::createLayer, {"unit=4"}, + "1:10:1:1", "fc_single_batch.nnlayergolden", + LayerGoldenTestParamOptions::SKIP_CALC_DERIV | + LayerGoldenTestParamOptions::SKIP_CALC_GRAD, + "nhwc", "fp32", "fp32"); + +auto fc_gpu_no_decay_nhwc = LayerGoldenTestParamType( + nntrainer::createLayer, + {"unit=5", "weight_decay=0.0", "bias_decay=0.0"}, "3:10:1:1", + "fc_plain.nnlayergolden", + LayerGoldenTestParamOptions::SKIP_CALC_DERIV | + LayerGoldenTestParamOptions::SKIP_CALC_GRAD, + "nhwc", "fp32", "fp32"); + +GTEST_PARAMETER_TEST(FullyConnectedGPU, 
LayerGoldenTest, + ::testing::Values(fc_gpu_plain, fc_gpu_single_batch, + fc_gpu_no_decay, fc_gpu_plain_nhwc, + fc_gpu_single_batch_nhwc, + fc_gpu_no_decay_nhwc)); diff --git a/test/unittest/models/meson.build b/test/unittest/models/meson.build index 7166fc41ff..3f17369f94 100644 --- a/test/unittest/models/meson.build +++ b/test/unittest/models/meson.build @@ -1,4 +1,5 @@ test_name = 'unittest_models' +mixed_test_name = 'unittest_mixed_models' test_target = [] @@ -11,6 +12,30 @@ models_targets = [ # disable temperally ] +mixed_test_targets = [ + 'models_test_utils.cpp', + 'models_golden_test.cpp', + 'unittest_models_mixed_precision.cpp', +] + +if get_option('enable-fp16') + mixed_exe = executable( + mixed_test_name, + mixed_test_targets, + include_directories: include_directories('.'), + dependencies: [ + nntrainer_test_main_deps, nntrainer_ccapi_dep + ], + install: get_option('enable-test'), + install_dir: application_install_dir + ) + + test(mixed_test_name, mixed_exe, + args: '--gtest_output=xml:@0@/@1@.xml'.format(meson.build_root(), mixed_test_name), + timeout: test_timeout + ) +endif + test_target += models_targets exe = executable( test_name, diff --git a/test/unittest/models/models_test_utils.cpp b/test/unittest/models/models_test_utils.cpp index 741e008994..ac956d479b 100644 --- a/test/unittest/models/models_test_utils.cpp +++ b/test/unittest/models/models_test_utils.cpp @@ -50,8 +50,41 @@ static sharedConstTensors toSharedTensors(const std::vector &ts) { static void verify(const nntrainer::Tensor &actual, const nntrainer::Tensor &expected, const std::string &error_msg) { + bool equal = false; + + if (actual.getDataType() == ml::train::TensorDim::DataType::FP32 && + expected.getDataType() == ml::train::TensorDim::DataType::FP32) { + equal = (actual == expected); + if (!equal) { + float mseError = mse(actual.getData(), + expected.getData(), actual.size()); + if (mseError > 1e-4) { + equal = false; + } else { + equal = true; + } + } + } + +#ifdef ENABLE_FP16 + if (!equal) { + if (actual.getDataType() == ml::train::TensorDim::DataType::FP16 && + expected.getDataType() == ml::train::TensorDim::DataType::FP16) { + float mseError = mse<_FP16>(actual.getData<_FP16>(), + expected.getData<_FP16>(), actual.size()); + if (mseError > 1e-2) { + equal = false; + } else { + equal = true; + } + } + } +#endif + + if (!equal) { + nntrainer::Tensor diff = actual.subtract(expected); + const float *diff_data = diff.getData(); - if (actual != expected) { std::cout << "============================================================\n"; std::cout << "\033[1;33m" << error_msg << "\033[0m\n"; @@ -60,8 +93,6 @@ static void verify(const nntrainer::Tensor &actual, << " - " << expected; if (actual.getDim() == expected.getDim()) { - nntrainer::Tensor diff = actual.subtract(expected); - const float *diff_data = diff.getData(); std::cout << "\033[1;33mdifference\033[0m " << diff; std::cout << "number of data: " << diff.size() << std::endl; std::cout << "\033[4;33mMAX DIFF: " @@ -119,6 +150,12 @@ class IterationForGolden { } Tensor &t = rc.getWeight(i); + + if (t.getDataType() != ml::train::TensorDim::DataType::FP32) { + Tensor &t32 = rc.getWeightFP32(i); + weights32.push_back(t32); + } + weights.push_back(t); expected_weights.push_back(t.clone()); } @@ -158,6 +195,10 @@ class IterationForGolden { } else { for (unsigned int i = 0; i < weights.size(); ++i) { weights.at(i).fill(expected_weights.at(i)); + if (iteration == 0 && + weights.at(i).getDataType() != ml::train::TensorDim::DataType::FP32) +
weights32.at(i).fill( + weights.at(i).clone(ml::train::TensorDim::DataType::FP32)); } } @@ -174,6 +215,7 @@ class IterationForGolden { std::vector inputs; std::vector labels; std::vector weights; + std::vector weights32; std::vector expected_weights; std::vector expected_outputs; }; diff --git a/test/unittest/models/unittest_models_mixed_precision.cpp b/test/unittest/models/unittest_models_mixed_precision.cpp new file mode 100644 index 0000000000..04c1495491 --- /dev/null +++ b/test/unittest/models/unittest_models_mixed_precision.cpp @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: Apache-2.0 +/** + * Copyright (C) 2024 Jijoong Moon + * + * @file unittest_models_mixed_precision.cpp + * @date 3 May 2024 + * @brief unittest models to cover mixed precision + * @see https://github.com/nnstreamer/nntrainer + * @author Jijoong Moon + * @bug No known bugs except for NYI items + */ + +#include + +#include + +#include +#include +#include + +#include + +using namespace nntrainer; + +static std::unique_ptr fc_mixed_training() { + std::unique_ptr nn(new NeuralNetwork()); + nn->setProperty( + {"batch_size=1", "model_tensor_type=FP16-FP16", "loss_scale=65536"}); + + auto graph = makeGraph({ + {"input", {"name=in", "input_shape=1:1:3"}}, + {"Fully_connected", {"name=fc", "input_layers=in", "unit=10"}}, + {"mse", {"name=loss", "input_layers=fc"}}, + }); + for (auto &node : graph) { + nn->addLayer(node); + } + + nn->setOptimizer(ml::train::createOptimizer( + "adam", {"learning_rate = 0.1", "torch_ref=true"})); + + return nn; +} + +static std::unique_ptr fc_mixed_training_nan_sgd() { + std::unique_ptr nn(new NeuralNetwork()); + nn->setProperty( + {"batch_size=1", "model_tensor_type=FP16-FP16", "loss_scale=65536"}); + + auto graph = makeGraph({ + {"input", {"name=in", "input_shape=1:1:1"}}, + {"Fully_connected", {"name=fc0", "input_layers=in", "unit=1"}}, + {"Fully_connected", {"name=fc1", "input_layers=fc0", "unit=1"}}, + {"mse", {"name=loss", "input_layers=fc1"}}, + }); + for (auto &node : graph) { + nn->addLayer(node); + } + + nn->setOptimizer(ml::train::createOptimizer("sgd", {"learning_rate = 0.1"})); + + return nn; +} + +GTEST_PARAMETER_TEST( + MixedPrecision, nntrainerModelTest, + ::testing::ValuesIn({ + mkModelTc_V2(fc_mixed_training, "fc_mixed_training", + ModelTestOption::ALL_V2), + mkModelTc_V2(fc_mixed_training_nan_sgd, "fc_mixed_training_nan_sgd", + ModelTestOption::ALL_V2), + }), + [](const testing::TestParamInfo &info) + -> const auto & { return std::get<1>(info.param); }); diff --git a/test/unittest/unittest_nntrainer_tensor.cpp b/test/unittest/unittest_nntrainer_tensor.cpp index 94aa01836d..d5b6a028f9 100644 --- a/test/unittest/unittest_nntrainer_tensor.cpp +++ b/test/unittest/unittest_nntrainer_tensor.cpp @@ -4704,6 +4704,30 @@ TEST(nntrainer_Tensor, inv_sqrt_i_uncontiguous_p) { } } +/** + * @brief fp16 tensor has NaN + */ +TEST(nntrainer_Tensor, is_valid_01) { + size_t batch = 1; + size_t channel = 3; + size_t height = 4; + size_t width = 5; + + nntrainer::Tensor input( + {batch, + channel, + height, + width, + {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP32}}, + true, nntrainer::Tensor::Initializer::ZEROS); + + EXPECT_EQ(input.isValid(), true); + + input.setValue(0, 0, 0, 0, std::nan("1")); + + EXPECT_EQ(input.isValid(), false); +} + int main(int argc, char **argv) { int result = -1; diff --git a/test/unittest/unittest_nntrainer_tensor_fp16.cpp b/test/unittest/unittest_nntrainer_tensor_fp16.cpp index 2b0d9c040d..58455757c5 100644 --- a/test/unittest/unittest_nntrainer_tensor_fp16.cpp 
+++ b/test/unittest/unittest_nntrainer_tensor_fp16.cpp @@ -6196,6 +6196,34 @@ TEST(nntrainer_Tensor, dequantize_06_p) { EXPECT_EQ(output, answer3); } +/** + * @brief fp16 tensor has NaN + */ +TEST(nntrainer_Tensor, is_valid_01) { + size_t batch = 1; + size_t channel = 3; + size_t height = 4; + size_t width = 5; + + nntrainer::Tensor input( + {batch, + channel, + height, + width, + {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}}, + true, nntrainer::Tensor::Initializer::ZEROS); + + EXPECT_EQ(input.isValid(), true); + + input.setValue(0, 0, 0, 0, std::nan("1")); + + EXPECT_EQ(input.isValid(), false); + + input.setValue(0, 0, 0, 0, std::numeric_limits::infinity()); + + EXPECT_EQ(input.isValid(), false); +} + GTEST_API_ int main(int argc, char **argv) { int result = -1; diff --git a/test/unittest/unittest_nntrainer_tensor_neon_fp16.cpp b/test/unittest/unittest_nntrainer_tensor_neon_fp16.cpp index e02eac1786..799a910273 100644 --- a/test/unittest/unittest_nntrainer_tensor_neon_fp16.cpp +++ b/test/unittest/unittest_nntrainer_tensor_neon_fp16.cpp @@ -120,6 +120,70 @@ TEST(nntrainer_Tensor, dot) { EXPECT_IN_RANGE((float)cosSimNeon, 0.99, 1); } +TEST(nntrainer_Tensor, hdot_768) { + + nntrainer::TensorDim::TensorType t_type_nchw_fp16 = { + nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}; + + nntrainer::TensorDim::TensorType t_type_nchw_fp32 = { + nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP32}; + + // conditions for fp16 hdot call: + // this->(batch * channel * height) = arg->(width) = 1; + size_t batch = 1; + size_t channel = 1; + size_t height = 1; + size_t width = 768; + + nntrainer::Tensor input( + nntrainer::TensorDim(1, 1, 1, width, t_type_nchw_fp16)); + + nntrainer::Tensor input_2( + nntrainer::TensorDim(1, 1, width, 1, t_type_nchw_fp16)); + + nntrainer::Tensor input_fp32( + nntrainer::TensorDim(1, 1, 1, width, t_type_nchw_fp32)); + + nntrainer::Tensor input_fp32_2( + nntrainer::TensorDim(1, 1, width, 1, t_type_nchw_fp32)); + + const float alpha = 1e-1; + const int MOD = 10; + + GEN_TEST_INPUT(input, ((i * j * (batch * height * channel) + + j * (batch * height) + k * (width) + l + 1) % + MOD) * + alpha); + GEN_TEST_INPUT(input_fp32, ((i * j * (batch * height * channel) + + j * (batch * height) + k * (width) + l + 1) % + MOD) * + alpha); + GEN_TEST_INPUT(input_2, ((i * k * (batch * height * channel) + + j * (batch * height) + k * (width) + l + 1) % + MOD) * + alpha); + GEN_TEST_INPUT(input_fp32_2, ((i * k * (batch * height * channel) + + j * (batch * height) + k * (width) + l + 1) % + MOD) * + alpha); + + nntrainer::Tensor result_neon = input.dot(input_2, false, false); + nntrainer::Tensor result_fp32 = input_fp32.dot(input_fp32_2, false, false); + + float mseErrorNeon = + mse<__fp16>(result_neon.getData<__fp16>(), result_fp32.getData(), + result_neon.size()); + + double cosSimNeon = + cosine_similarity<__fp16>(result_neon.getData<__fp16>(), + result_fp32.getData(), result_neon.size()); + + const float epsilon = 1e-3; + + EXPECT_IN_RANGE(mseErrorNeon, 0, epsilon); + EXPECT_IN_RANGE((float)cosSimNeon, 0.99, 1); +} + TEST(nntrainer_Tensor, l2norm) { nntrainer::TensorDim::TensorType t_type_nchw_fp16 = { @@ -701,6 +765,128 @@ TEST(nntrainer_Tensor, dot_gemm_50_768_20000) { EXPECT_IN_RANGE((float)cosSimNeon, 0.99, 1); } +TEST(nntrainer_Tensor, dot_gemm_512_520_1032) { + /// @note GEMM : A X B = C + int batch = 1; + int channel = 1; + int height = 512; + int width = 520; + + int height_b = 520; + int width_b = 1032; + + bool transA = false; + bool transB = false; + + 
nntrainer::TensorDim::TensorType t_type_nchw_fp16 = { + nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}; + + nntrainer::TensorDim::TensorType t_type_nchw_fp32 = { + nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP32}; + + nntrainer::Tensor A(batch, channel, height, width, t_type_nchw_fp16); + nntrainer::Tensor B(batch, channel, height_b, width_b, t_type_nchw_fp16); + + nntrainer::Tensor A_fp32(batch, channel, height, width, t_type_nchw_fp32); + nntrainer::Tensor B_fp32(batch, channel, height_b, width_b, t_type_nchw_fp32); + + const float alpha = 1e-1; + const int MOD = 10; + + GEN_TEST_INPUT(A, ((i * (batch * height * channel) + j * (batch * height) + + k * (width) + l + 1) % + MOD) * + alpha); + GEN_TEST_INPUT_B(B, ((i * (batch * height_b * channel) + + j * (batch * height_b) + k * (width_b) + l + 1) % + MOD) * + alpha); + GEN_TEST_INPUT(A_fp32, ((i * (batch * height * channel) + + j * (batch * height) + k * (width) + l + 1) % + MOD) * + alpha); + GEN_TEST_INPUT_B(B_fp32, ((i * (batch * height_b * channel) + + j * (batch * height_b) + k * (width_b) + l + 1) % + MOD) * + alpha); + + nntrainer::Tensor C = A.dot(B, transA, transB); + + nntrainer::Tensor C_fp32 = A_fp32.dot(B_fp32, transA, transB); + + float mseErrorNeon = + mse<__fp16>(C.getData<__fp16>(), C_fp32.getData(), C.size()); + + double cosSimNeon = cosine_similarity<__fp16>( + C.getData<__fp16>(), C_fp32.getData(), C.size()); + + const float epsilon = 1e-3 * width; + + EXPECT_IN_RANGE(mseErrorNeon, 0, epsilon); + EXPECT_IN_RANGE((float)cosSimNeon, 0.99, 1); +} + +TEST(nntrainer_Tensor, dot_gemm_1001_1024_20000) { + /// @note GEMM : A X B = C + int batch = 1; + int channel = 1; + int height = 1001; + int width = 1024; + + int height_b = 1024; + int width_b = 20000; + + bool transA = false; + bool transB = false; + + nntrainer::TensorDim::TensorType t_type_nchw_fp16 = { + nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}; + + nntrainer::TensorDim::TensorType t_type_nchw_fp32 = { + nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP32}; + + nntrainer::Tensor A(batch, channel, height, width, t_type_nchw_fp16); + nntrainer::Tensor B(batch, channel, height_b, width_b, t_type_nchw_fp16); + + nntrainer::Tensor A_fp32(batch, channel, height, width, t_type_nchw_fp32); + nntrainer::Tensor B_fp32(batch, channel, height_b, width_b, t_type_nchw_fp32); + + const float alpha = 1e-1; + const int MOD = 10; + + GEN_TEST_INPUT(A, ((i * (batch * height * channel) + j * (batch * height) + + k * (width) + l + 1) % + MOD) * + alpha); + GEN_TEST_INPUT_B(B, ((i * (batch * height_b * channel) + + j * (batch * height_b) + k * (width_b) + l + 1) % + MOD) * + alpha); + GEN_TEST_INPUT(A_fp32, ((i * (batch * height * channel) + + j * (batch * height) + k * (width) + l + 1) % + MOD) * + alpha); + GEN_TEST_INPUT_B(B_fp32, ((i * (batch * height_b * channel) + + j * (batch * height_b) + k * (width_b) + l + 1) % + MOD) * + alpha); + + nntrainer::Tensor C = A.dot(B, transA, transB); + + nntrainer::Tensor C_fp32 = A_fp32.dot(B_fp32, transA, transB); + + float mseErrorNeon = + mse<__fp16>(C.getData<__fp16>(), C_fp32.getData(), C.size()); + + double cosSimNeon = cosine_similarity<__fp16>( + C.getData<__fp16>(), C_fp32.getData(), C.size()); + + const float epsilon = 1e-3 * width; + + EXPECT_IN_RANGE(mseErrorNeon, 0, epsilon); + EXPECT_IN_RANGE((float)cosSimNeon, 0.99, 1); +} + TEST(nntrainer_Tensor, dot_gemm_50_768_516) { /// @note GEMM : A X B = C int batch = 1; @@ -994,6 +1180,38 @@ TEST(nntrainer_Tensor, inv_sqrt_i_p) { EXPECT_EQ(flag, true); } +/** + * 
@brief fp16 tensor has NaN + */ +TEST(nntrainer_Tensor, is_valid_01) { + size_t batch = 1; + size_t channel = 3; + size_t height = 4; + size_t width = 5; + + nntrainer::Tensor input( + {batch, + channel, + height, + width, + {nntrainer::Tformat::NCHW, nntrainer::Tdatatype::FP16}}, + true, nntrainer::Tensor::Initializer::ZEROS); + + EXPECT_EQ(input.isValid(), true); + + input.setValue(0, 0, 0, 0, std::nan("1")); + + EXPECT_EQ(input.isValid(), false); + + input.setValue(0, 0, 0, 0, std::numeric_limits::infinity()); + + EXPECT_EQ(input.isValid(), false); + + input.setValue(0, 0, 0, 0, 1); + + EXPECT_EQ(input.isValid(), true); +} + GTEST_API_ int main(int argc, char **argv) { int result = -1; diff --git a/tools/package_android.sh b/tools/package_android.sh index 6e02cc23d2..5fc7ba8754 100755 --- a/tools/package_android.sh +++ b/tools/package_android.sh @@ -17,14 +17,14 @@ if [ ! -d builddir ]; then #default value of openblas num threads is 1 for android #enable-tflite-interpreter=false is just temporally until ci system is stabel #enable-opencl=true will compile OpenCL related changes or remove this option to exclude OpenCL compilations. - meson builddir -Dplatform=android -Dopenblas-num-threads=1 -Denable-tflite-interpreter=false -Denable-tflite-backbone=false -Denable-fp16=true -Denable-neon=true -Domp-num-threads=1 -Denable-opencl=true + meson builddir -Dplatform=android -Dopenblas-num-threads=1 -Denable-tflite-interpreter=false -Denable-tflite-backbone=false -Denable-fp16=true -Denable-neon=true -Domp-num-threads=1 -Denable-opencl=true -Denable-avx=false else echo "warning: $TARGET/builddir has already been taken, this script tries to reconfigure and try building" pushd builddir #default value of openblas num threads is 1 for android #enable-tflite-interpreter=false is just temporally until ci system is stabel #enable-opencl=true will compile OpenCL related changes or remove this option to exclude OpenCL compilations. - meson configure -Dplatform=android -Dopenblas-num-threads=1 -Denable-tflite-interpreter=false -Denable-tflite-backbone=false -Denable-fp16=true -Denable-neon=true -Domp-num-threads=1 -Denable-opencl=true + meson configure -Dplatform=android -Dopenblas-num-threads=1 -Denable-tflite-interpreter=false -Denable-tflite-backbone=false -Denable-fp16=true -Denable-neon=true -Domp-num-threads=1 -Denable-opencl=true -Denable-avx=false meson --wipe popd fi
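Note on the mixed-precision update path introduced in weight.cpp above: Weight::applyGradient(lr, updated_grad) applies the optimizer step to the FP32 master copy (var32) and quantizeWeight() then refreshes the low-precision working copy (var) from it. The snippet below is only an illustration of that idea with made-up names (half_t, MixedWeight, apply_gradient); it is not nntrainer code, and half_t stands in for the real FP16 type used when enable-fp16 is set.

#include <cstddef>
#include <vector>

using half_t = float; // placeholder for an actual half-precision type

struct MixedWeight {
  std::vector<half_t> var;  // low-precision copy used by forward/backward
  std::vector<float> var32; // FP32 master copy that accumulates updates
};

void apply_gradient(MixedWeight &w, const std::vector<float> &grad32, float lr) {
  for (size_t i = 0; i < w.var32.size(); ++i)
    w.var32[i] -= lr * grad32[i]; // step on the FP32 master copy
  for (size_t i = 0; i < w.var.size(); ++i)
    w.var[i] = static_cast<half_t>(w.var32[i]); // quantize back to working precision
}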
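The loss_scale property and the fc_mixed_training_nan_sgd golden model exercise the usual loss-scaling recipe: gradients computed from a loss multiplied by loss_scale are divided by the same factor before the update, and the step is skipped when a non-finite value appears (the condition Tensor::isValid() is meant to detect). A minimal sketch of that check, assuming a helper that unscales in place and reports whether the step is safe; this helper is illustrative and not part of the patch.

#include <cmath>
#include <vector>

// Divide gradients by the loss scale; return false if any value is NaN or Inf,
// in which case the caller should skip the update and lower the scale.
bool unscale_and_check(std::vector<float> &grad, float loss_scale) {
  for (float &g : grad) {
    g /= loss_scale;
    if (!std::isfinite(g))
      return false;
  }
  return true;
}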
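The relaxed verify() in models_test_utils.cpp falls back to a mean-squared-error comparison when exact equality fails, with a looser bound for FP16 golden data. A self-contained sketch of that comparison, assuming the intended tolerances are 1e-4 for FP32 and 1e-2 for FP16; the function names here are local stand-ins for the test utility mse<>().

#include <cstddef>

float mean_squared_error(const float *a, const float *b, size_t n) {
  float acc = 0.0f;
  for (size_t i = 0; i < n; ++i) {
    float d = a[i] - b[i];
    acc += d * d; // accumulate squared difference
  }
  return n ? acc / n : 0.0f;
}

bool close_enough(const float *actual, const float *golden, size_t n, float tol) {
  return mean_squared_error(actual, golden, n) <= tol; // tol: 1e-4 (FP32) or 1e-2 (FP16)
}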