[ matrix_transpose ] Divide f16 transpose and f32 transpose with NEON
1. No NEON -> matrix_transpose_fallback
2. NEON, but without f16 -> matrix_transpose_neon
3. NEON, with f16 -> 2 + matrix_transpose_neon_f16 (dispatch sketched below)
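
A minimal sketch of the dispatch these three cases set up, assuming a row-major caller such as blas_interface.cpp; the wrapper name `transpose` and the inlined scalar fallback are illustrative stand-ins, not the literal nntrainer code:

// Sketch only: shows how cases 1-3 above select an implementation.
#ifdef USE_NEON
#include "matrix_transpose_neon/matrix_transpose_neon.h" // case 2 (f32 NEON)
#ifdef USE__FP16
#include <blas_neon.h> // case 3: f16 NEON kernels on top of case 2
#endif
#endif

template <typename T>
void transpose(unsigned int M, unsigned int N, const T *src,
               unsigned int ld_src, T *dst, unsigned int ld_dst) {
#ifdef USE_NEON
  // Cases 2 and 3: resolves to the f32 specialization, or to the f16
  // specialization when USE__FP16 compiles it in.
  transpose_neon<T>(M, N, src, ld_src, dst, ld_dst);
#else
  // Case 1: scalar fallback; dst becomes the N x M transpose of the M x N src.
  for (unsigned int i = 0; i < M; ++i)
    for (unsigned int j = 0; j < N; ++j)
      dst[j * ld_dst + i] = src[i * ld_src + j];
#endif
}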

**Self evaluation:**
1. Build test:     [X]Passed [ ]Failed [ ]Skipped
2. Run test:     [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: skykongkong8 <ss.kong@samsung.com>
skykongkong8 authored and jijoongmoon committed Feb 10, 2025
1 parent 5828113 commit 74c7b9e
Showing 6 changed files with 146 additions and 116 deletions.
6 changes: 4 additions & 2 deletions nntrainer/tensor/blas_interface.cpp
@@ -15,9 +15,11 @@
#include "blas_interface.h"
#include <nntrainer_error.h>

-#if (defined USE__FP16 && defined USE_NEON)
-#include "blas_neon.h"
+#ifdef USE_NEON
+#include "matrix_transpose_neon/matrix_transpose_neon.h"
+#ifdef USE__FP16
+#include <blas_neon.h>
#endif
#endif

#if USE_AVX
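
Read as a truth table, the reworked guard nesting above gives (an illustrative sketch, not part of the commit):

// Effective header selection in blas_interface.cpp per build configuration.
#if defined(USE_NEON) && defined(USE__FP16)
// matrix_transpose_neon.h plus blas_neon.h: f32 and f16 NEON transpose
#elif defined(USE_NEON)
// matrix_transpose_neon.h only: f32 NEON transpose, f16 takes the fallback
#else
// neither header: every type goes through the scalar fallback
#endif
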
108 changes: 0 additions & 108 deletions nntrainer/tensor/matrix_transpose_neon/matrix_transpose_neon.cpp
@@ -12,116 +12,8 @@
*/

#include <arm_neon.h>
#include <matrix_transpose_kernels_neon.h>
#include <matrix_transpose_neon.h>

template <>
void transpose_neon(unsigned int M, unsigned int N, const __fp16 *src,
                    unsigned int ld_src, __fp16 *dst, unsigned int ld_dst) {
  unsigned int ib = 0, jb = 0;
  if (N % 8 > 0 && N % 8 < 4) {
    for (ib = 0; ib + 8 <= M; ib += 8) {
      for (jb = 0; jb + 8 <= N; jb += 8) {
        transpose_kernel_8x8_neon(&src[ib * ld_src + jb], ld_src,
                                  &dst[ib + jb * ld_dst], ld_dst);
      }
      for (unsigned int i = ib; i < ib + 8; i += 4) {
        transpose_kernel_mxn_neon_128<4>(N - jb, &src[i * ld_src + jb], ld_src,
                                         &dst[i + jb * ld_dst], ld_dst);
      }
    }
  } else if (N % 8 == 4) {
    for (ib = 0; ib + 8 <= M; ib += 8) {
      for (jb = 0; jb + 8 <= N; jb += 8) {
        transpose_kernel_8x8_neon(&src[ib * ld_src + jb], ld_src,
                                  &dst[ib + jb * ld_dst], ld_dst);
      }
      for (unsigned int i = ib; i < ib + 8; i += 4) {
        transpose_kernel_4x4_neon(&src[i * ld_src + jb], ld_src,
                                  &dst[i + jb * ld_dst], ld_dst);
      }
    }
  } else {
    for (ib = 0; ib + 8 <= M; ib += 8) {
      for (jb = 0; jb + 8 <= N; jb += 8) {
        transpose_kernel_8x8_neon(&src[ib * ld_src + jb], ld_src,
                                  &dst[ib + jb * ld_dst], ld_dst);
      }
      if (jb < N) {
        transpose_kernel_mxn_neon_256<8>(N - jb, &src[ib * ld_src + jb], ld_src,
                                         &dst[ib + jb * ld_dst], ld_dst);
      }
    }
  }
  switch (M - ib) {
  case 1:
    for (unsigned int j = 0; j < N; ++j) {
      dst[ib + j * ld_dst] = src[ib * ld_src + j];
    }
    break;
  case 2:
    for (jb = 0; jb + 4 <= N; jb += 4) {
      transpose_kernel_mxn_neon_128<2>(4, &src[ib * ld_src + jb], ld_src,
                                       &dst[ib + jb * ld_dst], ld_dst);
    }
    if (jb < N) {
      transpose_kernel_mxn_neon_128<2>(N - jb, &src[ib * ld_src + jb], ld_src,
                                       &dst[ib + jb * ld_dst], ld_dst);
    }
    break;
  case 3:
    for (jb = 0; jb + 4 <= N; jb += 4) {
      transpose_kernel_mxn_neon_128<3>(4, &src[ib * ld_src + jb], ld_src,
                                       &dst[ib + jb * ld_dst], ld_dst);
    }
    if (jb < N) {
      transpose_kernel_mxn_neon_128<3>(N - jb, &src[ib * ld_src + jb], ld_src,
                                       &dst[ib + jb * ld_dst], ld_dst);
    }
    break;
  case 4:
    for (jb = 0; jb + 4 <= N; jb += 4) {
      transpose_kernel_4x4_neon(&src[ib * ld_src + jb], ld_src,
                                &dst[ib + jb * ld_dst], ld_dst);
    }
    if (jb < N) {
      transpose_kernel_mxn_neon_128<4>(N - jb, &src[ib * ld_src + jb], ld_src,
                                       &dst[ib + jb * ld_dst], ld_dst);
    }
    break;
  case 5:
    for (jb = 0; jb + 8 <= N; jb += 8) {
      transpose_kernel_mxn_neon_256<5>(8, &src[ib * ld_src + jb], ld_src,
                                       &dst[ib + jb * ld_dst], ld_dst);
    }
    if (jb < N) {
      transpose_kernel_mxn_neon_256<5>(N - jb, &src[ib * ld_src + jb], ld_src,
                                       &dst[ib + jb * ld_dst], ld_dst);
    }
    break;
  case 6:
    for (jb = 0; jb + 8 <= N; jb += 8) {
      transpose_kernel_mxn_neon_256<6>(8, &src[ib * ld_src + jb], ld_src,
                                       &dst[ib + jb * ld_dst], ld_dst);
    }
    if (jb < N) {
      transpose_kernel_mxn_neon_256<6>(N - jb, &src[ib * ld_src + jb], ld_src,
                                       &dst[ib + jb * ld_dst], ld_dst);
    }
    break;
  case 7:
    for (jb = 0; jb + 8 <= N; jb += 8) {
      transpose_kernel_mxn_neon_256<7>(8, &src[ib * ld_src + jb], ld_src,
                                       &dst[ib + jb * ld_dst], ld_dst);
    }
    if (jb < N) {
      transpose_kernel_mxn_neon_256<7>(N - jb, &src[ib * ld_src + jb], ld_src,
                                       &dst[ib + jb * ld_dst], ld_dst);
    }
    break;
  }
}

template <>
void transpose_neon(unsigned int M, unsigned int N, const float *src,
unsigned int ld_src, float *dst, unsigned int ld_dst) {
122 changes: 122 additions & 0 deletions nntrainer/tensor/matrix_transpose_neon/matrix_transpose_neon_f16.cpp
@@ -0,0 +1,122 @@
// SPDX-License-Identifier: Apache-2.0
/**
* Copyright (C) 2025 Sungsik Kong <ss.kong@samsung.com>
*
* @file matrix_transpose_neon_f16.cpp
* @date 23 Jan 2025
* @see https://github.com/nnstreamer/nntrainer
* @author Sungsik Kong <ss.kong@samsung.com>
* @bug No known bugs except for NYI items
* @brief This is source file for matrix transpose using NEON
*
*/

#include <matrix_transpose_kernels_neon.h>
#include <matrix_transpose_neon.h>

template <>
void transpose_neon(unsigned int M, unsigned int N, const __fp16 *src,
                    unsigned int ld_src, __fp16 *dst, unsigned int ld_dst) {
  unsigned int ib = 0, jb = 0;
  if (N % 8 > 0 && N % 8 < 4) {
    for (ib = 0; ib + 8 <= M; ib += 8) {
      for (jb = 0; jb + 8 <= N; jb += 8) {
        transpose_kernel_8x8_neon(&src[ib * ld_src + jb], ld_src,
                                  &dst[ib + jb * ld_dst], ld_dst);
      }
      for (unsigned int i = ib; i < ib + 8; i += 4) {
        transpose_kernel_mxn_neon_128<4>(N - jb, &src[i * ld_src + jb], ld_src,
                                         &dst[i + jb * ld_dst], ld_dst);
      }
    }
  } else if (N % 8 == 4) {
    for (ib = 0; ib + 8 <= M; ib += 8) {
      for (jb = 0; jb + 8 <= N; jb += 8) {
        transpose_kernel_8x8_neon(&src[ib * ld_src + jb], ld_src,
                                  &dst[ib + jb * ld_dst], ld_dst);
      }
      for (unsigned int i = ib; i < ib + 8; i += 4) {
        transpose_kernel_4x4_neon(&src[i * ld_src + jb], ld_src,
                                  &dst[i + jb * ld_dst], ld_dst);
      }
    }
  } else {
    for (ib = 0; ib + 8 <= M; ib += 8) {
      for (jb = 0; jb + 8 <= N; jb += 8) {
        transpose_kernel_8x8_neon(&src[ib * ld_src + jb], ld_src,
                                  &dst[ib + jb * ld_dst], ld_dst);
      }
      if (jb < N) {
        transpose_kernel_mxn_neon_256<8>(N - jb, &src[ib * ld_src + jb], ld_src,
                                         &dst[ib + jb * ld_dst], ld_dst);
      }
    }
  }
  switch (M - ib) {
  case 1:
    for (unsigned int j = 0; j < N; ++j) {
      dst[ib + j * ld_dst] = src[ib * ld_src + j];
    }
    break;
  case 2:
    for (jb = 0; jb + 4 <= N; jb += 4) {
      transpose_kernel_mxn_neon_128<2>(4, &src[ib * ld_src + jb], ld_src,
                                       &dst[ib + jb * ld_dst], ld_dst);
    }
    if (jb < N) {
      transpose_kernel_mxn_neon_128<2>(N - jb, &src[ib * ld_src + jb], ld_src,
                                       &dst[ib + jb * ld_dst], ld_dst);
    }
    break;
  case 3:
    for (jb = 0; jb + 4 <= N; jb += 4) {
      transpose_kernel_mxn_neon_128<3>(4, &src[ib * ld_src + jb], ld_src,
                                       &dst[ib + jb * ld_dst], ld_dst);
    }
    if (jb < N) {
      transpose_kernel_mxn_neon_128<3>(N - jb, &src[ib * ld_src + jb], ld_src,
                                       &dst[ib + jb * ld_dst], ld_dst);
    }
    break;
  case 4:
    for (jb = 0; jb + 4 <= N; jb += 4) {
      transpose_kernel_4x4_neon(&src[ib * ld_src + jb], ld_src,
                                &dst[ib + jb * ld_dst], ld_dst);
    }
    if (jb < N) {
      transpose_kernel_mxn_neon_128<4>(N - jb, &src[ib * ld_src + jb], ld_src,
                                       &dst[ib + jb * ld_dst], ld_dst);
    }
    break;
  case 5:
    for (jb = 0; jb + 8 <= N; jb += 8) {
      transpose_kernel_mxn_neon_256<5>(8, &src[ib * ld_src + jb], ld_src,
                                       &dst[ib + jb * ld_dst], ld_dst);
    }
    if (jb < N) {
      transpose_kernel_mxn_neon_256<5>(N - jb, &src[ib * ld_src + jb], ld_src,
                                       &dst[ib + jb * ld_dst], ld_dst);
    }
    break;
  case 6:
    for (jb = 0; jb + 8 <= N; jb += 8) {
      transpose_kernel_mxn_neon_256<6>(8, &src[ib * ld_src + jb], ld_src,
                                       &dst[ib + jb * ld_dst], ld_dst);
    }
    if (jb < N) {
      transpose_kernel_mxn_neon_256<6>(N - jb, &src[ib * ld_src + jb], ld_src,
                                       &dst[ib + jb * ld_dst], ld_dst);
    }
    break;
  case 7:
    for (jb = 0; jb + 8 <= N; jb += 8) {
      transpose_kernel_mxn_neon_256<7>(8, &src[ib * ld_src + jb], ld_src,
                                       &dst[ib + jb * ld_dst], ld_dst);
    }
    if (jb < N) {
      transpose_kernel_mxn_neon_256<7>(N - jb, &src[ib * ld_src + jb], ld_src,
                                       &dst[ib + jb * ld_dst], ld_dst);
    }
    break;
  }
}
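
The specialization above walks the matrix in 8x8 tiles with transpose_kernel_8x8_neon, picks a 128-bit or 256-bit column-remainder kernel based on N % 8, and finishes the last M % 8 rows in the switch. A minimal round-trip check, as a sketch: it assumes an aarch64 build with USE__FP16 enabled, and check_transpose is a hypothetical test helper, not part of this commit.

#include <vector>

bool check_transpose(unsigned int M, unsigned int N) {
  std::vector<__fp16> src(M * N), dst(N * M);
  for (unsigned int i = 0; i < M * N; ++i)
    src[i] = (__fp16)(i % 251); // integers this small are exact in fp16
  // ld_src and ld_dst are the row strides of the row-major source and
  // destination; for dense M x N -> N x M they are N and M.
  transpose_neon<__fp16>(M, N, src.data(), N, dst.data(), M);
  for (unsigned int i = 0; i < M; ++i)
    for (unsigned int j = 0; j < N; ++j)
      if ((float)dst[j * M + i] != (float)src[i * N + j])
        return false;
  return true;
}
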
14 changes: 12 additions & 2 deletions nntrainer/tensor/matrix_transpose_neon/meson.build
@@ -3,11 +3,21 @@ matrix_transpose_neon_sources = [
]

matrix_transpose_neon_headers = [
-'mask_neon.h',
-'matrix_transpose_kernels_neon.h',
'matrix_transpose_neon.h',
]

+if get_option('enable-fp16')
+  if arch == 'arm'
+    error ('FP16/ARM code (blas_neon.cpp) uses armv8.2 instructions. armv7 is not supported.')
+  elif arch == 'aarch64' or get_option('platform') == 'android'
+    if get_option('enable-neon')
+      matrix_transpose_neon_sources += 'matrix_transpose_neon_f16.cpp'
+      matrix_transpose_neon_headers += 'mask_neon.h'
+      matrix_transpose_neon_headers += 'matrix_transpose_kernels_neon.h'
+    endif
+  endif
+endif

foreach s : matrix_transpose_neon_sources
nntrainer_sources += meson.current_source_dir() / s
endforeach
10 changes: 6 additions & 4 deletions nntrainer/tensor/meson.build
@@ -68,14 +68,16 @@ if get_option('enable-fp16')
subdir('hgemm')
nntrainer_inc += include_directories('hgemm')
nntrainer_inc_abs += meson.current_source_dir() / 'hgemm'

-subdir('matrix_transpose_neon')
-nntrainer_inc += include_directories('matrix_transpose_neon')
-nntrainer_inc_abs += meson.current_source_dir() / 'matrix_transpose_neon'
endif
endif
endif

+if get_option('enable-neon')
+  subdir('matrix_transpose_neon')
+  nntrainer_inc += include_directories('matrix_transpose_neon')
+  nntrainer_inc_abs += meson.current_source_dir() / 'matrix_transpose_neon'
+endif

if get_option('enable-fp16')
tensor_headers += 'half_tensor.h'
tensor_sources += 'half_tensor.cpp'
2 changes: 2 additions & 0 deletions packaging/nntrainer.spec
@@ -582,11 +582,13 @@ cp -r result %{buildroot}%{_datadir}/nntrainer/unittest/
%{_includedir}/nntrainer/dynamic_library_loader.h
%{_includedir}/nntrainer/loss_layer.h
%ifarch aarch64
+%{_includedir}/nntrainer/matrix_transpose_neon.h
%if 0%{?enable_fp16}
%{_includedir}/nntrainer/util_simd_neon.h
%{_includedir}/nntrainer/blas_neon.h
%{_includedir}/nntrainer/hgemm.h
%{_includedir}/nntrainer/hgemm_util.h
+%{_includedir}/nntrainer/matrix_transpose_kernels_neon.h
%endif
%endif
%{_includedir}/nntrainer/acti_func.h
