diff --git a/nntrainer/tensor/blas_interface.cpp b/nntrainer/tensor/blas_interface.cpp
index 0847890b49..60b47d6fe9 100644
--- a/nntrainer/tensor/blas_interface.cpp
+++ b/nntrainer/tensor/blas_interface.cpp
@@ -263,21 +263,34 @@ static void copy_int4_to_fp16(const unsigned int N, const uint8_t *X,
 
 static void copy_int8_to_fp16(const unsigned int N, const uint8_t *X,
                               const int incX, _FP16 *Y, const int incY) {
-  unsigned int incy = abs(incY);
-  unsigned int incx = abs(incX);
+  unsigned int inc_x = abs(incX);
+  unsigned int inc_y = abs(incY);
 
 #if (defined USE__FP16 && USE_NEON)
   if (incX == 1 && incY == 1) {
     nntrainer::neon::copy_int8_to_fp16(N, X, Y);
-  } else {
-    throw std::invalid_argument(
-      "Error: incX == 1 && incY == 1 is supported only");
+    return;
   }
-#else
+#endif
   for (unsigned int idx = 0; idx < N; idx++) {
-    Y[idx] = X[idx];
+    Y[idx * inc_y] = X[idx * inc_x];
+  }
+}
+
+static void copy_int8_to_fp16(const unsigned int N, const int8_t *X,
+                              const int incX, _FP16 *Y, const int incY) {
+  unsigned int inc_x = abs(incX);
+  unsigned int inc_y = abs(incY);
+
+#if (defined USE__FP16 && USE_NEON)
+  if (incX == 1 && incY == 1) {
+    nntrainer::neon::copy_int8_to_fp16(N, X, Y);
+    return;
   }
 #endif
+  for (unsigned int idx = 0; idx < N; idx++) {
+    Y[idx * inc_y] = X[idx * inc_x];
+  }
 }
 
 void sscal(const unsigned int N, const float alpha, _FP16 *X, const int incX) {
@@ -423,6 +436,11 @@ void scopy_int8_to_float16(const unsigned int N, const uint8_t *X,
   copy_int8_to_fp16(N, X, incX, Y, incY);
 }
 
+void scopy_int8_to_float16(const unsigned int N, const int8_t *X,
+                           const int incX, _FP16 *Y, const int incY) {
+  copy_int8_to_fp16(N, X, incX, Y, incY);
+}
+
 static void ele_mul_fallback(const unsigned int N, const _FP16 *X,
                              const _FP16 *Y, _FP16 *Z, float alpha, float beta,
                              unsigned int i_stride, unsigned int o_stride) {
@@ -901,6 +919,22 @@ void scopy(const unsigned int N, const uint8_t *X, const int incX, uint8_t *Y,
 #endif
 }
 
+void scopy(const unsigned int N, const int8_t *X, const int incX, int8_t *Y,
+           const int incY) {
+  unsigned int inc_x = abs(incX);
+  unsigned int inc_y = abs(incY);
+
+#ifdef USE_NEON
+  if (incX == 1 && incY == 1) {
+    nntrainer::neon::copy_int8(N, X, Y);
+    return;
+  }
+#endif
+  for (unsigned int idx = 0; idx < N; idx++) {
+    Y[idx * inc_y] = X[idx * inc_x];
+  }
+}
+
 void scopy_int4_to_float32(const unsigned int N, const uint8_t *X,
                            const int incX, float *Y, const int incY) {
 #ifdef USE_NEON
@@ -915,13 +949,34 @@ void scopy_int4_to_float32(const unsigned int N, const uint8_t *X,
 
 void scopy_int8_to_float32(const unsigned int N, const uint8_t *X,
                            const int incX, float *Y, const int incY) {
+  unsigned int inc_x = abs(incX);
+  unsigned int inc_y = abs(incY);
+
 #ifdef USE_NEON
-  nntrainer::neon::copy_int8_to_fp32(N, X, Y);
-#else
+  if (incX == 1 && incY == 1) {
+    nntrainer::neon::copy_int8_to_fp32(N, X, Y);
+    return;
+  }
+#endif
   for (unsigned int idx = 0; idx < N; idx++) {
-    Y[idx] = X[idx];
+    Y[idx * inc_y] = X[idx * inc_x];
+  }
+}
+
+void scopy_int8_to_float32(const unsigned int N, const int8_t *X,
+                           const int incX, float *Y, const int incY) {
+  unsigned int inc_x = abs(incX);
+  unsigned int inc_y = abs(incY);
+
+#ifdef USE_NEON
+  if (incX == 1 && incY == 1) {
+    nntrainer::neon::copy_int8_to_fp32(N, X, Y);
+    return;
   }
 #endif
+  for (unsigned int idx = 0; idx < N; idx++) {
+    Y[idx * inc_y] = X[idx * inc_x];
+  }
 }
 
 float snrm2(const int N, const float *X, const int incX) {
diff --git a/nntrainer/tensor/blas_interface.h b/nntrainer/tensor/blas_interface.h
index e99a4cedbe..792f1d441f 100644
--- a/nntrainer/tensor/blas_interface.h
+++ b/nntrainer/tensor/blas_interface.h
@@ -86,6 +86,15 @@ void scopy_int4_to_float16(const unsigned int N, const uint8_t *X,
 void scopy_int8_to_float16(const unsigned int N, const uint8_t *X,
                            const int incX, _FP16 *Y, const int incY);
 
+/**
+ * @brief copy function : Y = X
+ * @param[in] N number of elements in X
+ * @param[in] X int8_t * for Vector X
+ * @param[in] Y __fp16 * for Vector Y
+ */
+void scopy_int8_to_float16(const unsigned int N, const int8_t *X,
+                           const int incX, _FP16 *Y, const int incY);
+
 /**
  * @brief sdot computation : sum of all X * Y
  * @param[in] N number of elements in Y
@@ -274,6 +283,16 @@ void scopy(const unsigned int N, const float *X, const int incX, float *Y,
  */
 void scopy(const unsigned int N, const uint8_t *X, const int incX, uint8_t *Y,
            const int intY);
+
+/**
+ * @brief copy function : Y = X
+ * @param[in] N number of elements in X
+ * @param[in] X int8_t * for Vector X
+ * @param[in] Y int8_t * for Vector Y
+ */
+void scopy(const unsigned int N, const int8_t *X, const int incX, int8_t *Y,
+           const int intY);
+
 /**
  * @brief copy function : Y = X
  * @param[in] N number of elements in X
@@ -292,6 +311,15 @@ void scopy_int4_to_float32(const unsigned int N, const uint8_t *X,
 void scopy_int8_to_float32(const unsigned int N, const uint8_t *X,
                            const int incX, float *Y, const int intY);
 
+/**
+ * @brief copy function : Y = X
+ * @param[in] N number of elements in X
+ * @param[in] X int8_t * for Vector X
+ * @param[in] Y float * for Vector Y
+ */
+void scopy_int8_to_float32(const unsigned int N, const int8_t *X,
+                           const int incX, float *Y, const int intY);
+
 /**
  * @brief sdot computation : sum of all X * Y
  * @param[in] N number of elements in Y
diff --git a/nntrainer/tensor/blas_neon.cpp b/nntrainer/tensor/blas_neon.cpp
index 0494e96d6b..cd0638a7ec 100644
--- a/nntrainer/tensor/blas_neon.cpp
+++ b/nntrainer/tensor/blas_neon.cpp
@@ -404,6 +404,22 @@ void copy_int8_or_int4(const unsigned int N, const uint8_t *X, uint8_t *Y) {
   }
 }
 
+void copy_int8(const unsigned int N, const int8_t *X, int8_t *Y) {
+  ///@note int8 Tensor and int4 Tensor share the same memory offset
+  unsigned int idx = 0;
+  for (; N - idx >= 16; idx += 16) {
+    int8x16_t batch = vld1q_s8(&X[idx]);
+    vst1q_s8(&Y[idx], batch);
+  }
+  for (; N - idx >= 8; idx += 8) {
+    int8x8_t batch = vld1_s8(&X[idx]);
+    vst1_s8(&Y[idx], batch);
+  }
+  for (; N - idx >= 1; ++idx) {
+    Y[idx] = X[idx];
+  }
+}
+
 void sine(const unsigned int N, float *X, float *Y, float alpha) {
   unsigned int i = 0;
   for (; N - i >= 4; i += 4) {
@@ -1469,6 +1485,34 @@ void copy_int8_to_fp16(const unsigned int N, const uint8_t *X, __fp16 *Y) {
   }
 }
 
+void copy_int8_to_fp16(const unsigned int N, const int8_t *X, __fp16 *Y) {
+  unsigned int idx = 0;
+  for (; (N - idx) >= 16; idx += 16) {
+    int8x16_t batch = vld1q_s8(&X[idx]);
+    int8x8_t low = vget_low_s8(batch);
+    int8x8_t high = vget_high_s8(batch);
+
+    // convert to s16
+    int16x8_t batch_low_s16 = vmovl_s8(low);
+    int16x8_t batch_high_s16 = vmovl_s8(high);
+
+    // todo : experiment with vcvt_f32_s32_ bitwise operation w.r.t.
+    // time/accuracy
+    vst1q_f16(&Y[idx], vcvtq_f16_s16(batch_low_s16));
+    vst1q_f16(&Y[idx + 8], vcvtq_f16_s16(batch_high_s16));
+  }
+  for (; (N - idx) >= 8; idx += 8) {
+    int8x8_t batch = vld1_s8(&X[idx]);
+
+    // convert to s16
+    int16x8_t batch_s16 = vmovl_s8(batch);
+    vst1q_f16(&Y[idx], vcvtq_f16_s16(batch_s16));
+  }
+  for (; (N - idx) >= 1; ++idx) {
+    Y[idx] = X[idx];
+  }
+}
+
 void copy_int8_to_fp32(const unsigned int N, const uint8_t *X, float *Y) {
   unsigned int idx = 0;
   for (; (N - idx) >= 16; idx += 16) {
@@ -1511,6 +1555,48 @@ void copy_int8_to_fp32(const unsigned int N, const uint8_t *X, float *Y) {
   }
 }
 
+void copy_int8_to_fp32(const unsigned int N, const int8_t *X, float *Y) {
+  unsigned int idx = 0;
+  for (; (N - idx) >= 16; idx += 16) {
+    int8x16_t batch = vld1q_s8(&X[idx]);
+    int8x8_t low = vget_low_s8(batch);
+    int8x8_t high = vget_high_s8(batch);
+
+    // convert to s16
+    int16x8_t batch_low_s16 = vmovl_s8(low);
+    int16x8_t batch_high_s16 = vmovl_s8(high);
+
+    // convert to s32
+    int32x4_t batch_low_s32_low = vmovl_s16(vget_low_s16(batch_low_s16));
+    int32x4_t batch_low_s32_high = vmovl_s16(vget_high_s16(batch_low_s16));
+    int32x4_t batch_high_s32_low = vmovl_s16(vget_low_s16(batch_high_s16));
+    int32x4_t batch_high_s32_high = vmovl_s16(vget_high_s16(batch_high_s16));
+
+    // todo : experiment with vcvt_f32_s32_ bitwise operation w.r.t.
+    // time/accuracy
+    vst1q_f32(&Y[idx], vcvtq_f32_s32(batch_low_s32_low));
+    vst1q_f32(&Y[idx + 4], vcvtq_f32_s32(batch_low_s32_high));
+    vst1q_f32(&Y[idx + 8], vcvtq_f32_s32(batch_high_s32_low));
+    vst1q_f32(&Y[idx + 12], vcvtq_f32_s32(batch_high_s32_high));
+  }
+  for (; (N - idx) >= 8; idx += 8) {
+    int8x8_t batch = vld1_s8(&X[idx]);
+
+    // convert to s16
+    int16x8_t batch_s16 = vmovl_s8(batch);
+
+    // convert to s32
+    int32x4_t batch_s32_low = vmovl_s16(vget_low_s16(batch_s16));
+    int32x4_t batch_s32_high = vmovl_s16(vget_high_s16(batch_s16));
+
+    vst1q_f32(&Y[idx], vcvtq_f32_s32(batch_s32_low));
+    vst1q_f32(&Y[idx + 4], vcvtq_f32_s32(batch_s32_high));
+  }
+  for (; (N - idx) >= 1; ++idx) {
+    Y[idx] = X[idx];
+  }
+}
+
 void copy_fp16_to_fp32(const unsigned int N, const __fp16 *X, float *Y) {
   unsigned int idx = 0;
 
diff --git a/nntrainer/tensor/blas_neon.h b/nntrainer/tensor/blas_neon.h
index 32bc075e6c..1cfd157126 100644
--- a/nntrainer/tensor/blas_neon.h
+++ b/nntrainer/tensor/blas_neon.h
@@ -65,6 +65,14 @@ void copy_int4_to_fp32(const unsigned int N, const uint8_t *X, float *Y);
  */
 void copy_int8_to_fp32(const unsigned int N, const uint8_t *X, float *Y);
 
+/**
+ * @brief copy function with neon: Y = X
+ * @param[in] N number of elements in X
+ * @param[in] X int8_t * for Vector X
+ * @param[in] Y float * for Vector Y
+ */
+void copy_int8_to_fp32(const unsigned int N, const int8_t *X, float *Y);
+
 /**
  * @brief copy function with neon: Y = X
  * @param[in] N number of elements in X
@@ -72,6 +80,15 @@ void copy_int8_to_fp32(const unsigned int N, const uint8_t *X, float *Y);
  * @param[in] Y uint8_t * for Vector Y
  */
 void copy_int8_or_int4(const unsigned int N, const uint8_t *X, uint8_t *Y);
+
+/**
+ * @brief copy function with neon: Y = X
+ * @param[in] N number of elements in X
+ * @param[in] X int8_t * for Vector X
+ * @param[in] Y int8_t * for Vector Y
+ */
+void copy_int8(const unsigned int N, const int8_t *X, int8_t *Y);
+
 /**
  * @brief sine with neon: Y = sin(alpha * X)
  * @param[in] N number of elements in X
@@ -311,6 +328,14 @@ void copy_int4_to_fp16(const unsigned int N, const uint8_t *X, __fp16 *Y);
  */
 void copy_int8_to_fp16(const unsigned int N, const uint8_t *X, __fp16 *Y);
 
+/**
+ * @brief copy function with neon: Y = X
+ * @param[in] N number of elements in X
+ * @param[in] X int8_t * for Vector X
+ * @param[in] Y __fp16 * for Vector Y
+ */
+void copy_int8_to_fp16(const unsigned int N, const int8_t *X, __fp16 *Y);
+
 /**
  * @brief copy function with neon: Y = X
  * @param[in] N number of elements in X
diff --git a/nntrainer/tensor/char_tensor.cpp b/nntrainer/tensor/char_tensor.cpp
index ede1802be0..0d42ccded5 100644
--- a/nntrainer/tensor/char_tensor.cpp
+++ b/nntrainer/tensor/char_tensor.cpp
@@ -361,9 +361,7 @@ void CharTensor::copy(const void *buf) {
   }
 
   /// @todo need to optimize
-  for (unsigned int i = 0; i < size(); ++i) {
-    ((int8_t *)getData())[i] = ((int8_t *)buf)[i];
-  }
+  scopy(size(), (int8_t *)buf, 1, (int8_t *)getData(), 1);
 }
 
 void CharTensor::save_quantization_info(std::ostream &file) {
diff --git a/nntrainer/tensor/float_tensor.cpp b/nntrainer/tensor/float_tensor.cpp
index 22600f32cb..9c31c40f2c 100644
--- a/nntrainer/tensor/float_tensor.cpp
+++ b/nntrainer/tensor/float_tensor.cpp
@@ -764,7 +764,7 @@ void FloatTensor::copyData(const Tensor &from) {
 #endif
     break;
   case ml::train::TensorDim::DataType::QINT8:
-    scopy_int8_to_float32(from.size(), from.getData<uint8_t>(), 1,
+    scopy_int8_to_float32(from.size(), from.getData<int8_t>(), 1,
                           (float *)getData(), 1);
     break;
   default:
diff --git a/nntrainer/tensor/half_tensor.cpp b/nntrainer/tensor/half_tensor.cpp
index 55e072ed74..bdc5090410 100644
--- a/nntrainer/tensor/half_tensor.cpp
+++ b/nntrainer/tensor/half_tensor.cpp
@@ -983,7 +983,7 @@ void HalfTensor::copyData(const Tensor &from) {
     copy(from.getData<_FP16>());
     break;
   case ml::train::TensorDim::DataType::QINT8:
-    scopy_int8_to_float16(from.size(), from.getData<uint8_t>(), 1,
+    scopy_int8_to_float16(from.size(), from.getData<int8_t>(), 1,
                           (_FP16 *)getData(), 1);
     break;
   default: