Skip to content

Commit

Permalink
[ hgemm ] Use aligned memory allocation in transpose / padding gemm
Browse files Browse the repository at this point in the history
- Using unaligned memory with the SIMD kernels may trigger a SIGSEGV, so allocate these temporary buffers with aligned allocation instead

**Self evaluation:**
1. Build test:     [X]Passed [ ]Failed [ ]Skipped
2. Run test:     [X]Passed [ ]Failed [ ]Skipped

Signed-off-by: skykongkong8 <ss.kong@samsung.com>
  • Loading branch information
skykongkong8 committed Jun 24, 2024
1 parent c39328d commit cc3bacf
Showing 1 changed file with 6 additions and 6 deletions.
12 changes: 6 additions & 6 deletions nntrainer/tensor/hgemm/hgemm.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,8 @@ void hgemm_noTrans_padding_wrt_K(const __fp16 *A, const __fp16 *B, float *C,
const unsigned int lda = K;
const unsigned int ldb = N;

__fp16 *A8 = new __fp16[M * K8_high];
__fp16 *B8 = new __fp16[K8_high * N];
__fp16 *A8 = alignedMalloc(M * K8_high);
__fp16 *B8 = alignedMalloc(K8_high * N);

float16x8_t ZEROS = vmovq_n_f16(0.F);

Expand Down Expand Up @@ -1257,7 +1257,7 @@ void hgemm_noTrans_fallback(unsigned int M, unsigned int N, unsigned int K,

void hgemm_transB(const __fp16 *A, const __fp16 *B, float *C, unsigned int M,
unsigned int N, unsigned int K, float alpha, float beta) {
__fp16 *B_T = new __fp16[K * N];
__fp16 *B_T = alignedMalloc(K * N);

transpose_neon<__fp16>(N, K, B, K, B_T, N);

Expand All @@ -1268,7 +1268,7 @@ void hgemm_transB(const __fp16 *A, const __fp16 *B, float *C, unsigned int M,

void hgemm_transA(const __fp16 *A, const __fp16 *B, float *C, unsigned int M,
unsigned int N, unsigned int K, float alpha, float beta) {
__fp16 *A_T = new __fp16[M * K];
__fp16 *A_T =alignedMalloc(M * K);

transpose_neon<__fp16>(K, M, A, M, A_T, K);

Expand All @@ -1279,8 +1279,8 @@ void hgemm_transA(const __fp16 *A, const __fp16 *B, float *C, unsigned int M,

void hgemm_transAB(const __fp16 *A, const __fp16 *B, float *C, unsigned int M,
unsigned int N, unsigned int K, float alpha, float beta) {
__fp16 *A_T = new __fp16[M * K];
__fp16 *B_T = new __fp16[K * N];
__fp16 *A_T = alignedMalloc(M * K);
__fp16 *B_T = alignedMalloc(K * N);

transpose_neon<__fp16>(K, M, A, M, A_T, K);
transpose_neon<__fp16>(N, K, B, K, B_T, N);
Expand Down

0 comments on commit cc3bacf

Please sign in to comment.