forked from nnstreamer/nntrainer
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
[ matrix_transpose ] Divide f16 transpose and f32 transpose with NEON
Dispatch rules:
1. no NEON -> matrix_transpose_fallback
2. NEON, but without f16 -> matrix_transpose_neon
3. NEON, with f16 -> same as 2 + matrix_transpose_neon_f16

**Self evaluation:**
1. Build test: [X]Passed [ ]Failed [ ]Skipped
2. Run test: [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: skykongkong8 <ss.kong@samsung.com>
- Loading branch information
1 parent
5828113
commit 74c7b9e
Showing
6 changed files
with
146 additions
and
116 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
122 changes: 122 additions & 0 deletions
122
nntrainer/tensor/matrix_transpose_neon/matrix_transpose_neon_f16.cpp
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,122 @@ | ||
// SPDX-License-Identifier: Apache-2.0 | ||
/** | ||
* Copyright (C) 2025 Sungsik Kong <ss.kong@samsung.com> | ||
* | ||
* @file matrix_transpose_neon_f16.cpp | ||
* @date 23 Jan 2025 | ||
* @see https://github.com/nnstreamer/nntrainer | ||
* @author Sungsik Kong <ss.kong@samsung.com> | ||
* @bug No known bugs except for NYI items | ||
* @brief This is the source file for the half-precision (fp16) matrix transpose using NEON | ||
* | ||
*/ | ||
|
||
#include <matrix_transpose_kernels_neon.h> | ||
#include <matrix_transpose_neon.h> | ||
|
||
template <> | ||
void transpose_neon(unsigned int M, unsigned int N, const __fp16 *src, | ||
unsigned int ld_src, __fp16 *dst, unsigned int ld_dst) { | ||
unsigned int ib = 0, jb = 0; | ||
if (N % 8 > 0 && N % 8 < 4) { | ||
for (ib = 0; ib + 8 <= M; ib += 8) { | ||
for (jb = 0; jb + 8 <= N; jb += 8) { | ||
transpose_kernel_8x8_neon(&src[ib * ld_src + jb], ld_src, | ||
&dst[ib + jb * ld_dst], ld_dst); | ||
} | ||
for (unsigned int i = ib; i < ib + 8; i += 4) { | ||
transpose_kernel_mxn_neon_128<4>(N - jb, &src[i * ld_src + jb], ld_src, | ||
&dst[i + jb * ld_dst], ld_dst); | ||
} | ||
} | ||
} else if (N % 8 == 4) { | ||
for (ib = 0; ib + 8 <= M; ib += 8) { | ||
for (jb = 0; jb + 8 <= N; jb += 8) { | ||
transpose_kernel_8x8_neon(&src[ib * ld_src + jb], ld_src, | ||
&dst[ib + jb * ld_dst], ld_dst); | ||
} | ||
for (unsigned int i = ib; i < ib + 8; i += 4) { | ||
transpose_kernel_4x4_neon(&src[i * ld_src + jb], ld_src, | ||
&dst[i + jb * ld_dst], ld_dst); | ||
} | ||
} | ||
} else { | ||
for (ib = 0; ib + 8 <= M; ib += 8) { | ||
for (jb = 0; jb + 8 <= N; jb += 8) { | ||
transpose_kernel_8x8_neon(&src[ib * ld_src + jb], ld_src, | ||
&dst[ib + jb * ld_dst], ld_dst); | ||
} | ||
if (jb < N) { | ||
transpose_kernel_mxn_neon_256<8>(N - jb, &src[ib * ld_src + jb], ld_src, | ||
&dst[ib + jb * ld_dst], ld_dst); | ||
} | ||
} | ||
} | ||
switch (M - ib) { | ||
case 1: | ||
for (unsigned int j = 0; j < N; ++j) { | ||
dst[ib + j * ld_dst] = src[ib * ld_src + j]; | ||
} | ||
break; | ||
case 2: | ||
for (jb = 0; jb + 4 <= N; jb += 4) { | ||
transpose_kernel_mxn_neon_128<2>(4, &src[ib * ld_src + jb], ld_src, | ||
&dst[ib + jb * ld_dst], ld_dst); | ||
} | ||
if (jb < N) { | ||
transpose_kernel_mxn_neon_128<2>(N - jb, &src[ib * ld_src + jb], ld_src, | ||
&dst[ib + jb * ld_dst], ld_dst); | ||
} | ||
break; | ||
case 3: | ||
for (jb = 0; jb + 4 <= N; jb += 4) { | ||
transpose_kernel_mxn_neon_128<3>(4, &src[ib * ld_src + jb], ld_src, | ||
&dst[ib + jb * ld_dst], ld_dst); | ||
} | ||
if (jb < N) { | ||
transpose_kernel_mxn_neon_128<3>(N - jb, &src[ib * ld_src + jb], ld_src, | ||
&dst[ib + jb * ld_dst], ld_dst); | ||
} | ||
break; | ||
case 4: | ||
for (jb = 0; jb + 4 <= N; jb += 4) { | ||
transpose_kernel_4x4_neon(&src[ib * ld_src + jb], ld_src, | ||
&dst[ib + jb * ld_dst], ld_dst); | ||
} | ||
if (jb < N) { | ||
transpose_kernel_mxn_neon_128<4>(N - jb, &src[ib * ld_src + jb], ld_src, | ||
&dst[ib + jb * ld_dst], ld_dst); | ||
} | ||
break; | ||
case 5: | ||
for (jb = 0; jb + 8 <= N; jb += 8) { | ||
transpose_kernel_mxn_neon_256<5>(8, &src[ib * ld_src + jb], ld_src, | ||
&dst[ib + jb * ld_dst], ld_dst); | ||
} | ||
if (jb < N) { | ||
transpose_kernel_mxn_neon_256<5>(N - jb, &src[ib * ld_src + jb], ld_src, | ||
&dst[ib + jb * ld_dst], ld_dst); | ||
} | ||
break; | ||
case 6: | ||
for (jb = 0; jb + 8 <= N; jb += 8) { | ||
transpose_kernel_mxn_neon_256<6>(8, &src[ib * ld_src + jb], ld_src, | ||
&dst[ib + jb * ld_dst], ld_dst); | ||
} | ||
if (jb < N) { | ||
transpose_kernel_mxn_neon_256<6>(N - jb, &src[ib * ld_src + jb], ld_src, | ||
&dst[ib + jb * ld_dst], ld_dst); | ||
} | ||
break; | ||
case 7: | ||
for (jb = 0; jb + 8 <= N; jb += 8) { | ||
transpose_kernel_mxn_neon_256<7>(8, &src[ib * ld_src + jb], ld_src, | ||
&dst[ib + jb * ld_dst], ld_dst); | ||
} | ||
if (jb < N) { | ||
transpose_kernel_mxn_neon_256<7>(N - jb, &src[ib * ld_src + jb], ld_src, | ||
&dst[ib + jb * ld_dst], ld_dst); | ||
} | ||
break; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters