diff --git a/nntrainer/tensor/blas_interface.cpp b/nntrainer/tensor/blas_interface.cpp
index 0847890b49..60b47d6fe9 100644
--- a/nntrainer/tensor/blas_interface.cpp
+++ b/nntrainer/tensor/blas_interface.cpp
@@ -263,21 +263,34 @@ static void copy_int4_to_fp16(const unsigned int N, const uint8_t *X,
 
 static void copy_int8_to_fp16(const unsigned int N, const uint8_t *X,
                               const int incX, _FP16 *Y, const int incY) {
-  unsigned int incy = abs(incY);
-  unsigned int incx = abs(incX);
+  unsigned int inc_x = abs(incX);
+  unsigned int inc_y = abs(incY);
 
 #if (defined USE__FP16 && USE_NEON)
   if (incX == 1 && incY == 1) {
     nntrainer::neon::copy_int8_to_fp16(N, X, Y);
-  } else {
-    throw std::invalid_argument(
-      "Error: incX == 1 && incY == 1 is supported only");
+    return;
   }
-#else
+#endif
   for (unsigned int idx = 0; idx < N; idx++) {
-    Y[idx] = X[idx];
+    Y[idx * inc_y] = X[idx * inc_x];
+  }
+}
+
+static void copy_int8_to_fp16(const unsigned int N, const int8_t *X,
+                              const int incX, _FP16 *Y, const int incY) {
+  unsigned int inc_x = abs(incX);
+  unsigned int inc_y = abs(incY);
+
+#if (defined USE__FP16 && USE_NEON)
+  if (incX == 1 && incY == 1) {
+    nntrainer::neon::copy_int8_to_fp16(N, X, Y);
+    return;
   }
 #endif
+  for (unsigned int idx = 0; idx < N; idx++) {
+    Y[idx * inc_y] = X[idx * inc_x];
+  }
 }
 
 void sscal(const unsigned int N, const float alpha, _FP16 *X, const int incX) {
@@ -423,6 +436,11 @@ void scopy_int8_to_float16(const unsigned int N, const uint8_t *X,
   copy_int8_to_fp16(N, X, incX, Y, incY);
 }
 
+void scopy_int8_to_float16(const unsigned int N, const int8_t *X,
+                           const int incX, _FP16 *Y, const int incY) {
+  copy_int8_to_fp16(N, X, incX, Y, incY);
+}
+
 static void ele_mul_fallback(const unsigned int N, const _FP16 *X,
                              const _FP16 *Y, _FP16 *Z, float alpha, float beta,
                              unsigned int i_stride, unsigned int o_stride) {
@@ -901,6 +919,22 @@ void scopy(const unsigned int N, const uint8_t *X, const int incX, uint8_t *Y,
 #endif
 }
 
+void scopy(const unsigned int N, const int8_t *X, const int incX, int8_t *Y,
+           const int incY) {
+  unsigned int inc_x = abs(incX);
+  unsigned int inc_y = abs(incY);
+
+#ifdef USE_NEON
+  if (incX == 1 && incY == 1) {
+    nntrainer::neon::copy_int8(N, X, Y);
+    return;
+  }
+#endif
+  for (unsigned int idx = 0; idx < N; idx++) {
+    Y[idx * inc_y] = X[idx * inc_x];
+  }
+}
+
 void scopy_int4_to_float32(const unsigned int N, const uint8_t *X,
                            const int incX, float *Y, const int incY) {
 #ifdef USE_NEON
@@ -915,13 +949,34 @@ void scopy_int4_to_float32(const unsigned int N, const uint8_t *X,
 
 void scopy_int8_to_float32(const unsigned int N, const uint8_t *X,
                            const int incX, float *Y, const int incY) {
+  unsigned int inc_x = abs(incX);
+  unsigned int inc_y = abs(incY);
+
 #ifdef USE_NEON
-  nntrainer::neon::copy_int8_to_fp32(N, X, Y);
-#else
+  if (incX == 1 && incY == 1) {
+    nntrainer::neon::copy_int8_to_fp32(N, X, Y);
+    return;
+  }
+#endif
   for (unsigned int idx = 0; idx < N; idx++) {
-    Y[idx] = X[idx];
+    Y[idx * inc_y] = X[idx * inc_x];
+  }
+}
+
+void scopy_int8_to_float32(const unsigned int N, const int8_t *X,
+                           const int incX, float *Y, const int incY) {
+  unsigned int inc_x = abs(incX);
+  unsigned int inc_y = abs(incY);
+
+#ifdef USE_NEON
+  if (incX == 1 && incY == 1) {
+    nntrainer::neon::copy_int8_to_fp32(N, X, Y);
+    return;
   }
 #endif
+  for (unsigned int idx = 0; idx < N; idx++) {
+    Y[idx * inc_y] = X[idx * inc_x];
+  }
 }
 
 float snrm2(const int N, const float *X, const int incX) {
diff --git a/nntrainer/tensor/blas_interface.h b/nntrainer/tensor/blas_interface.h
index e99a4cedbe..792f1d441f 100644
--- a/nntrainer/tensor/blas_interface.h
+++ b/nntrainer/tensor/blas_interface.h
@@ -86,6 +86,15 @@ void scopy_int4_to_float16(const unsigned int N, const uint8_t *X,
 void scopy_int8_to_float16(const unsigned int N, const uint8_t *X,
                            const int incX, _FP16 *Y, const int incY);
 
+/**
+ * @brief copy function : Y = X
+ * @param[in] N number of elements in X
+ * @param[in] X int8_t * for Vector X
+ * @param[in] Y __fp16 * for Vector Y
+ */
+void scopy_int8_to_float16(const unsigned int N, const int8_t *X,
+                           const int incX, _FP16 *Y, const int incY);
+
 /**
  * @brief sdot computation : sum of all X * Y
  * @param[in] N number of elements in Y
@@ -274,6 +283,16 @@ void scopy(const unsigned int N, const float *X, const int incX, float *Y,
  */
 void scopy(const unsigned int N, const uint8_t *X, const int incX, uint8_t *Y,
            const int intY);
+
+/**
+ * @brief copy function : Y = X
+ * @param[in] N number of elements in X
+ * @param[in] X int8_t * for Vector X
+ * @param[in] Y int8_t * for Vector Y
+ */
+void scopy(const unsigned int N, const int8_t *X, const int incX, int8_t *Y,
+           const int intY);
+
 /**
  * @brief copy function : Y = X
  * @param[in] N number of elements in X
@@ -292,6 +311,15 @@ void scopy_int4_to_float32(const unsigned int N, const uint8_t *X,
 void scopy_int8_to_float32(const unsigned int N, const uint8_t *X,
                            const int incX, float *Y, const int intY);
 
+/**
+ * @brief copy function : Y = X
+ * @param[in] N number of elements in X
+ * @param[in] X int8_t * for Vector X
+ * @param[in] Y float * for Vector Y
+ */
+void scopy_int8_to_float32(const unsigned int N, const int8_t *X,
+                           const int incX, float *Y, const int intY);
+
 /**
  * @brief sdot computation : sum of all X * Y
  * @param[in] N number of elements in Y
diff --git a/nntrainer/tensor/blas_neon.cpp b/nntrainer/tensor/blas_neon.cpp
index 0494e96d6b..cd0638a7ec 100644
--- a/nntrainer/tensor/blas_neon.cpp
+++ b/nntrainer/tensor/blas_neon.cpp
@@ -404,6 +404,22 @@ void copy_int8_or_int4(const unsigned int N, const uint8_t *X, uint8_t *Y) {
   }
 }
 
+void copy_int8(const unsigned int N, const int8_t *X, int8_t *Y) {
+  ///@note int8 Tensor and int4 Tensor share the same memory offset
+  unsigned int idx = 0;
+  for (; N - idx >= 16; idx += 16) {
+    int8x16_t batch = vld1q_s8(&X[idx]);
+    vst1q_s8(&Y[idx], batch);
+  }
+  for (; N - idx >= 8; idx += 8) {
+    int8x8_t batch = vld1_s8(&X[idx]);
+    vst1_s8(&Y[idx], batch);
+  }
+  for (; N - idx >= 1; ++idx) {
+    Y[idx] = X[idx];
+  }
+}
+
 void sine(const unsigned int N, float *X, float *Y, float alpha) {
   unsigned int i = 0;
   for (; N - i >= 4; i += 4) {
@@ -1469,6 +1485,34 @@ void copy_int8_to_fp16(const unsigned int N, const uint8_t *X, __fp16 *Y) {
   }
 }
 
+void copy_int8_to_fp16(const unsigned int N, const int8_t *X, __fp16 *Y) {
+  unsigned int idx = 0;
+  for (; (N - idx) >= 16; idx += 16) {
+    int8x16_t batch = vld1q_s8(&X[idx]);
+    int8x8_t low = vget_low_s8(batch);
+    int8x8_t high = vget_high_s8(batch);
+
+    // convert to s16
+    int16x8_t batch_low_s16 = vmovl_s8(low);
+    int16x8_t batch_high_s16 = vmovl_s8(high);
+
+    // todo : experiment with vcvt_f32_s32_ bitwise operation w.r.t.
+    // time/accuracy
+    vst1q_f16(&Y[idx], vcvtq_f16_s16(batch_low_s16));
+    vst1q_f16(&Y[idx + 8], vcvtq_f16_s16(batch_high_s16));
+  }
+  for (; (N - idx) >= 8; idx += 8) {
+    int8x8_t batch = vld1_s8(&X[idx]);
+
+    // convert to s16
+    int16x8_t batch_s16 = vmovl_s8(batch);
+    vst1q_f16(&Y[idx], vcvtq_f16_s16(batch_s16));
+  }
+  for (; (N - idx) >= 1; ++idx) {
+    Y[idx] = X[idx];
+  }
+}
+
 void copy_int8_to_fp32(const unsigned int N, const uint8_t *X, float *Y) {
   unsigned int idx = 0;
   for (; (N - idx) >= 16; idx += 16) {
@@ -1511,6 +1555,48 @@ void copy_int8_to_fp32(const unsigned int N, const uint8_t *X, float *Y) {
   }
 }
 
+void copy_int8_to_fp32(const unsigned int N, const int8_t *X, float *Y) {
+  unsigned int idx = 0;
+  for (; (N - idx) >= 16; idx += 16) {
+    int8x16_t batch = vld1q_s8(&X[idx]);
+    int8x8_t low = vget_low_s8(batch);
+    int8x8_t high = vget_high_s8(batch);
+
+    // convert to s16
+    int16x8_t batch_low_s16 = vmovl_s8(low);
+    int16x8_t batch_high_s16 = vmovl_s8(high);
+
+    // convert to s32
+    int32x4_t batch_low_s32_low = vmovl_s16(vget_low_s16(batch_low_s16));
+    int32x4_t batch_low_s32_high = vmovl_s16(vget_high_s16(batch_low_s16));
+    int32x4_t batch_high_s32_low = vmovl_s16(vget_low_s16(batch_high_s16));
+    int32x4_t batch_high_s32_high = vmovl_s16(vget_high_s16(batch_high_s16));
+
+    // todo : experiment with vcvt_f32_s32_ bitwise operation w.r.t.
+    // time/accuracy
+    vst1q_f32(&Y[idx], vcvtq_f32_s32(batch_low_s32_low));
+    vst1q_f32(&Y[idx + 4], vcvtq_f32_s32(batch_low_s32_high));
+    vst1q_f32(&Y[idx + 8], vcvtq_f32_s32(batch_high_s32_low));
+    vst1q_f32(&Y[idx + 12], vcvtq_f32_s32(batch_high_s32_high));
+  }
+  for (; (N - idx) >= 8; idx += 8) {
+    int8x8_t batch = vld1_s8(&X[idx]);
+
+    // convert to s16
+    int16x8_t batch_s16 = vmovl_s8(batch);
+
+    // convert to s32
+    int32x4_t batch_s32_low = vmovl_s16(vget_low_s16(batch_s16));
+    int32x4_t batch_s32_high = vmovl_s16(vget_high_s16(batch_s16));
+
+    vst1q_f32(&Y[idx], vcvtq_f32_s32(batch_s32_low));
+    vst1q_f32(&Y[idx + 4], vcvtq_f32_s32(batch_s32_high));
+  }
+  for (; (N - idx) >= 1; ++idx) {
+    Y[idx] = X[idx];
+  }
+}
+
 void copy_fp16_to_fp32(const unsigned int N, const __fp16 *X, float *Y) {
   unsigned int idx = 0;
 
diff --git a/nntrainer/tensor/blas_neon.h b/nntrainer/tensor/blas_neon.h
index 32bc075e6c..1cfd157126 100644
--- a/nntrainer/tensor/blas_neon.h
+++ b/nntrainer/tensor/blas_neon.h
@@ -65,6 +65,14 @@ void copy_int4_to_fp32(const unsigned int N, const uint8_t *X, float *Y);
  */
 void copy_int8_to_fp32(const unsigned int N, const uint8_t *X, float *Y);
 
+/**
+ * @brief copy function with neon: Y = X
+ * @param[in] N number of elements in X
+ * @param[in] X int8_t * for Vector X
+ * @param[in] Y float * for Vector Y
+ */
+void copy_int8_to_fp32(const unsigned int N, const int8_t *X, float *Y);
+
 /**
  * @brief copy function with neon: Y = X
  * @param[in] N number of elements in X
@@ -72,6 +80,15 @@ void copy_int8_to_fp32(const unsigned int N, const uint8_t *X, float *Y);
  * @param[in] Y uint8_t * for Vector Y
  */
 void copy_int8_or_int4(const unsigned int N, const uint8_t *X, uint8_t *Y);
+
+/**
+ * @brief copy function with neon: Y = X
+ * @param[in] N number of elements in X
+ * @param[in] X int8_t * for Vector X
+ * @param[in] Y int8_t * for Vector Y
+ */
+void copy_int8(const unsigned int N, const int8_t *X, int8_t *Y);
+
 /**
  * @brief sine with neon: Y = sin(alpha * X)
  * @param[in] N number of elements in X
@@ -311,6 +328,14 @@ void copy_int4_to_fp16(const unsigned int N, const uint8_t *X, __fp16 *Y);
  */
 void copy_int8_to_fp16(const unsigned int N, const uint8_t *X, __fp16 *Y);
 
+/**
+ * @brief copy function with neon: Y = X
+ * @param[in] N number of elements in X
+ * @param[in] X int8_t * for Vector X
+ * @param[in] Y __fp16 * for Vector Y
+ */
+void copy_int8_to_fp16(const unsigned int N, const int8_t *X, __fp16 *Y);
+
 /**
  * @brief copy function with neon: Y = X
  * @param[in] N number of elements in X
diff --git a/nntrainer/tensor/char_tensor.cpp b/nntrainer/tensor/char_tensor.cpp
index ede1802be0..0d42ccded5 100644
--- a/nntrainer/tensor/char_tensor.cpp
+++ b/nntrainer/tensor/char_tensor.cpp
@@ -361,9 +361,7 @@ void CharTensor::copy(const void *buf) {
   }
 
   /// @todo need to optimize
-  for (unsigned int i = 0; i < size(); ++i) {
-    ((int8_t *)getData())[i] = ((int8_t *)buf)[i];
-  }
+  scopy(size(), (int8_t *)buf, 1, (int8_t *)getData(), 1);
 }
 
 void CharTensor::save_quantization_info(std::ostream &file) {
diff --git a/nntrainer/tensor/float_tensor.cpp b/nntrainer/tensor/float_tensor.cpp
index 22600f32cb..9c31c40f2c 100644
--- a/nntrainer/tensor/float_tensor.cpp
+++ b/nntrainer/tensor/float_tensor.cpp
@@ -764,7 +764,7 @@ void FloatTensor::copyData(const Tensor &from) {
 #endif
     break;
   case ml::train::TensorDim::DataType::QINT8:
-    scopy_int8_to_float32(from.size(), from.getData<uint8_t>(), 1,
+    scopy_int8_to_float32(from.size(), from.getData<int8_t>(), 1,
                           (float *)getData(), 1);
     break;
   default:
diff --git a/nntrainer/tensor/half_tensor.cpp b/nntrainer/tensor/half_tensor.cpp
index 55e072ed74..bdc5090410 100644
--- a/nntrainer/tensor/half_tensor.cpp
+++ b/nntrainer/tensor/half_tensor.cpp
@@ -983,7 +983,7 @@ void HalfTensor::copyData(const Tensor &from) {
     copy(from.getData<_FP16>());
     break;
   case ml::train::TensorDim::DataType::QINT8:
-    scopy_int8_to_float16(from.size(), from.getData<uint8_t>(), 1,
+    scopy_int8_to_float16(from.size(), from.getData<int8_t>(), 1,
                           (_FP16 *)getData(), 1);
     break;
   default: