diff --git a/lib_xcore_math/api/xmath/_support/dct_impl.h b/lib_xcore_math/api/xmath/_support/dct_impl.h
index c1947d37..ed23df05 100644
--- a/lib_xcore_math/api/xmath/_support/dct_impl.h
+++ b/lib_xcore_math/api/xmath/_support/dct_impl.h
@@ -184,3 +184,14 @@ headroom_t dct8x8_stageB(
     const int16_t x[8][8],
     const int16_t matrix[8][16],
     const right_shift_t sat);
+    
+    
+
+/**
+ * @brief Final stage of a 2D 8-by-8 DCT: 8-point DCT (per `matrix`) on each row of 16-bit `x`, stored transposed in 16-bit `y`.
+*/
+C_API
+void dct8x8_stageB_16bit(
+    int16_t y[8][8],
+    const int16_t x[8][8],
+    const int16_t matrix[8][16]);
diff --git a/lib_xcore_math/api/xmath/dct.h b/lib_xcore_math/api/xmath/dct.h
index ace528cb..0b74663c 100644
--- a/lib_xcore_math/api/xmath/dct.h
+++ b/lib_xcore_math/api/xmath/dct.h
@@ -683,6 +683,17 @@ headroom_t dct8x8_inverse(
     const int8_t x[8][8],
     const right_shift_t sat);
 
+
+
+
+/**
+ * @brief Forward 2D 8-by-8 DCT of the 8-bit tensor `x[][]`, written to the 16-bit tensor `y[][]`.
+ */
+C_API
+void dct8x8_forward_16bit(
+    int16_t y[8][8],
+    const int8_t x[8][8]);
+
     
 #ifdef __XC__
 } // extern "C"
diff --git a/lib_xcore_math/src/arch/xs3/dct/s8/dct8x8_stageB.S b/lib_xcore_math/src/arch/xs3/dct/s8/dct8x8_stageB.S
index a6fcc330..d3a81b14 100644
--- a/lib_xcore_math/src/arch/xs3/dct/s8/dct8x8_stageB.S
+++ b/lib_xcore_math/src/arch/xs3/dct/s8/dct8x8_stageB.S
@@ -21,7 +21,7 @@ Whether the forward or inverse DCT is performed depends on whether the
 matrix[][] argument points to dct8_matrix_16bit[][] or 
 idct8_matrix_16bit[][].
 
-headroom_t dct8_inversex8_stageB(
+headroom_t dct8x8_stageB(
     int8_t y[8][8],
     const int16_t x[8][8],
     const int16_t matrix[8][16],
diff --git a/lib_xcore_math/src/arch/xs3/dct/s8/dct8x8_stageB_16bit.S b/lib_xcore_math/src/arch/xs3/dct/s8/dct8x8_stageB_16bit.S
new file mode 100644
index 00000000..16c7532a
--- /dev/null
+++ b/lib_xcore_math/src/arch/xs3/dct/s8/dct8x8_stageB_16bit.S
@@ -0,0 +1,120 @@
+// Copyright 2020-2022 XMOS LIMITED.
+// This Software is subject to the terms of the XMOS Public Licence: Version 1.
+#if defined(__XS3A__)
+
+
+/*  
+
+Perform the final step of a 2D 8-by-8 forward or inverse DCT on 8-bit data, producing 16-bit output.
+
+The first step takes an 8-bit tensor x[8][8] as input and populates a 16-bit
+tensor y[8][8] as output. The first step is implemented as dct8x8_stageA().
+
+The final step takes a 16-bit tensor x[8][8] as input and populates a 16-bit
+tensor y[8][8] as output.
+
+The operation is to perform an 8-point DCT on each row of x[][] to get
+an intermediate tensor tmp[][], and then populate y[][] with the TRANSPOSE of
+tmp[][].
+
+Whether the forward or inverse DCT is performed depends on whether the
+matrix[][] argument points to dct8_matrix_16bit[][] or 
+idct8_matrix_16bit[][].
+
+void dct8x8_stageB_16bit(
+    int16_t y[8][8],
+    const int16_t x[8][8],
+    const int16_t matrix[8][16]);
+
+*/
+
+#define FUNCTION_NAME   dct8x8_stageB_16bit
+#define NSTACKWORDS 40
+
+.text
+.issue_mode dual
+.global FUNCTION_NAME
+.type FUNCTION_NAME,@function
+.align 16
+.cc_top FUNCTION_NAME.function,FUNCTION_NAME
+// NOTE(review): STK_BUFF and the buff alias are defined but not referenced by the code below.
+#define STK_BUFF      (NSTACKWORDS - 32)
+// Register aliases; y, x and mat name the three arguments in prototype order (r0, r1, r2).
+#define y       r0
+#define x       r1
+#define mat     r2
+#define buff    r3
+#define count   r4
+#define _32     r5
+#define _16     r6
+#define sat     r7
+// Shift vector consumed by vlsat. NOTE(review): the label says 0x0010 but every lane holds decimal 20 -- confirm which is intended.
+.L_vpu_vec_0x0010:
+.short 20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20
+// Entry point: void dct8x8_stageB_16bit(int16_t y[8][8], const int16_t x[8][8], const int16_t matrix[8][16])
+FUNCTION_NAME:
+  dualentsp NSTACKWORDS  // reserve the stack frame
+  std r4, r5, sp[0]  // save r4-r7; restored by the ldd pair before retsp
+  std r6, r7, sp[1]
+  
+  ldc r11, 0x100 // 16-bit mode
+{ ldc _16, 16                 ; vsetc r11                   }  // _16 = byte size of one x row (8 x int16)
+  ldap r11, .L_vpu_vec_0x0010
+{ mov sat, r11                ;                             }  // sat = address of the shift vector above
+
+////// Perform eight 8-point, 16-bit DCTs
+
+{ ldc count, 4                ; ldc r11, 28                 }  // 4 loop passes; 28 words = 112 bytes = 7 rows of x
+// We need to traverse the rows of x[] backwards to get elements
+// in the right output order.
+  ldaw x, x[r11]  // x now addresses row 7 (x + 28 words)
+{ ldc _32, 32                 ;                             }  // _32 = byte size of one matrix row (16 x int16)
+
+// Each loop handles 2 rows of the matrix, in the order: (1,0),(3,2),(5,4),(7,6)
+// so that later matrix rows end up in higher accumulator indices, and thus higher memory
+// when stored. 
+.L_loop_top:
+  { add mat, mat, _32           ; vclrdr                      }  // mat -> odd row of the pair; clear the accumulators
+  { mov r11, x                  ; vldc mat[0]                 }  // load the matrix row; r11 restarts at x row 7
+  { sub r11, r11, _16           ; vlmaccr r11[0]              }  // 8 x vlmaccr: one x row per step, r11 walks back one row each time
+  { sub r11, r11, _16           ; vlmaccr r11[0]              }
+  { sub r11, r11, _16           ; vlmaccr r11[0]              }
+  { sub r11, r11, _16           ; vlmaccr r11[0]              }
+  { sub r11, r11, _16           ; vlmaccr r11[0]              }
+  { sub r11, r11, _16           ; vlmaccr r11[0]              }
+  { sub r11, r11, _16           ; vlmaccr r11[0]              }
+  { sub mat, mat, _32           ; vlmaccr r11[0]              }  // last x row for the odd matrix row; mat steps back to the even row
+  { mov r11, x                  ; vldc mat[0]                 }  // load the even matrix row; r11 restarts at x row 7
+  { sub r11, r11, _16           ; vlmaccr r11[0]              }  // same 8-step walk for the even matrix row
+  { sub r11, r11, _16           ; vlmaccr r11[0]              }
+  { sub r11, r11, _16           ; vlmaccr r11[0]              }
+  { sub r11, r11, _16           ; vlmaccr r11[0]              }
+  { sub r11, r11, _16           ; vlmaccr r11[0]              }
+  { sub r11, r11, _16           ; vlmaccr r11[0]              }
+  { sub r11, r11, _16           ; vlmaccr r11[0]              }
+  { add mat, mat, _32           ; vlmaccr r11[0]              }
+  { add mat, mat, _32           ; vlsat sat[0]                }  // net mat advance per pass is 64 bytes (two matrix rows); apply the shift vector
+  { sub count, count, 1         ; vstr y[0]                   }  // store this pass of results at y
+  { add y, y, _32               ; bt count, .L_loop_top       }  // y advances 32 bytes = two output rows of int16[8]
+.L_loop_bot:
+
+  ldd r4, r5, sp[0]  // restore r4-r7
+  ldd r6, r7, sp[1]
+
+  retsp NSTACKWORDS
+
+	
+.cc_bottom FUNCTION_NAME.function
+.set	FUNCTION_NAME.nstackwords,NSTACKWORDS
+.globl	FUNCTION_NAME.nstackwords
+.set	FUNCTION_NAME.maxcores,1
+.globl	FUNCTION_NAME.maxcores
+.set	FUNCTION_NAME.maxtimers,0
+.globl	FUNCTION_NAME.maxtimers
+.set	FUNCTION_NAME.maxchanends,0
+.globl	FUNCTION_NAME.maxchanends
+.Ltmp0:
+	.size	FUNCTION_NAME, .Ltmp0-FUNCTION_NAME    
+
+
+#endif //defined(__XS3A__)
\ No newline at end of file
diff --git a/lib_xcore_math/src/dct/dct8x8.c b/lib_xcore_math/src/dct/dct8x8.c
index e5c3d39d..4db4ae6a 100644
--- a/lib_xcore_math/src/dct/dct8x8.c
+++ b/lib_xcore_math/src/dct/dct8x8.c
@@ -11,28 +11,41 @@
 
 
 int16_t dct8_matrix_16bit[8][16] = {
-{ 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0,0,0,0,0,0,0,0 },
-{ 0x3EC5, 0x3537, 0x238E, 0xC7C, 0xF384, 0xDC72, 0xCAC9, 0xC13B, 0,0,0,0,0,0,0,0 },
-{ 0x3B21, 0x187E, 0xE782, 0xC4DF, 0xC4DF, 0xE782, 0x187E, 0x3B21, 0,0,0,0,0,0,0,0 },
-{ 0x3537, 0xF384, 0xC13B, 0xDC72, 0x238E, 0x3EC5, 0xC7C, 0xCAC9, 0,0,0,0,0,0,0,0 },
-{ 0x2D41, 0xD2BF, 0xD2BF, 0x2D41, 0x2D41, 0xD2BF, 0xD2BF, 0x2D41, 0,0,0,0,0,0,0,0 },
-{ 0x238E, 0xC13B, 0xC7C, 0x3537, 0xCAC9, 0xF384, 0x3EC5, 0xDC72, 0,0,0,0,0,0,0,0 },
-{ 0x187E, 0xC4DF, 0x3B21, 0xE782, 0xE782, 0x3B21, 0xC4DF, 0x187E, 0,0,0,0,0,0,0,0 },
-{ 0xC7C, 0xDC72, 0x3537, 0xC13B, 0x3EC5, 0xCAC9, 0x238E, 0xF384, 0,0,0,0,0,0,0,0 },
+  { 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0,0,0,0,0,0,0,0 },  // each row k: cos((2n+1)*k*pi/16) for n = 0..7 with 0x4000 = 1.0, then 8 zero pads
+  { 0x3EC5, 0x3537, 0x238E, 0xC7C, 0xF384, 0xDC72, 0xCAC9, 0xC13B, 0,0,0,0,0,0,0,0 },
+  { 0x3B21, 0x187E, 0xE782, 0xC4DF, 0xC4DF, 0xE782, 0x187E, 0x3B21, 0,0,0,0,0,0,0,0 },
+  { 0x3537, 0xF384, 0xC13B, 0xDC72, 0x238E, 0x3EC5, 0xC7C, 0xCAC9, 0,0,0,0,0,0,0,0 },
+  { 0x2D41, 0xD2BF, 0xD2BF, 0x2D41, 0x2D41, 0xD2BF, 0xD2BF, 0x2D41, 0,0,0,0,0,0,0,0 },
+  { 0x238E, 0xC13B, 0xC7C, 0x3537, 0xCAC9, 0xF384, 0x3EC5, 0xDC72, 0,0,0,0,0,0,0,0 },
+  { 0x187E, 0xC4DF, 0x3B21, 0xE782, 0xE782, 0x3B21, 0xC4DF, 0x187E, 0,0,0,0,0,0,0,0 },
+  { 0xC7C, 0xDC72, 0x3537, 0xC13B, 0x3EC5, 0xCAC9, 0x238E, 0xF384, 0,0,0,0,0,0,0,0 },
 };
 
 
 int16_t idct8_matrix_16bit[8][16] = {
-{ 0x2000, 0x3EC5, 0x3B21, 0x3537, 0x2D41, 0x238E, 0x187E, 0xC7C, 0,0,0,0,0,0,0,0 },
-{ 0x2000, 0x3537, 0x187E, 0xF384, 0xD2BF, 0xC13B, 0xC4DF, 0xDC72, 0,0,0,0,0,0,0,0 },
-{ 0x2000, 0x238E, 0xE782, 0xC13B, 0xD2BF, 0xC7C, 0x3B21, 0x3537, 0,0,0,0,0,0,0,0 },
-{ 0x2000, 0xC7C, 0xC4DF, 0xDC72, 0x2D41, 0x3537, 0xE782, 0xC13B, 0,0,0,0,0,0,0,0 },
-{ 0x2000, 0xF384, 0xC4DF, 0x238E, 0x2D41, 0xCAC9, 0xE782, 0x3EC5, 0,0,0,0,0,0,0,0 },
-{ 0x2000, 0xDC72, 0xE782, 0x3EC5, 0xD2BF, 0xF384, 0x3B21, 0xCAC9, 0,0,0,0,0,0,0,0 },
-{ 0x2000, 0xCAC9, 0x187E, 0xC7C, 0xD2BF, 0x3EC5, 0xC4DF, 0x238E, 0,0,0,0,0,0,0,0 },
-{ 0x2000, 0xC13B, 0x3B21, 0xCAC9, 0x2D41, 0xDC72, 0x187E, 0xF384, 0,0,0,0,0,0,0,0 },
+  { 0x2000, 0x3EC5, 0x3B21, 0x3537, 0x2D41, 0x238E, 0x187E, 0xC7C, 0,0,0,0,0,0,0,0 },  // each row n: 0x2000 (0.5) for k = 0, then cos((2n+1)*k*pi/16) for k = 1..7 with 0x4000 = 1.0, then 8 zero pads
+  { 0x2000, 0x3537, 0x187E, 0xF384, 0xD2BF, 0xC13B, 0xC4DF, 0xDC72, 0,0,0,0,0,0,0,0 },
+  { 0x2000, 0x238E, 0xE782, 0xC13B, 0xD2BF, 0xC7C, 0x3B21, 0x3537, 0,0,0,0,0,0,0,0 },
+  { 0x2000, 0xC7C, 0xC4DF, 0xDC72, 0x2D41, 0x3537, 0xE782, 0xC13B, 0,0,0,0,0,0,0,0 },
+  { 0x2000, 0xF384, 0xC4DF, 0x238E, 0x2D41, 0xCAC9, 0xE782, 0x3EC5, 0,0,0,0,0,0,0,0 },
+  { 0x2000, 0xDC72, 0xE782, 0x3EC5, 0xD2BF, 0xF384, 0x3B21, 0xCAC9, 0,0,0,0,0,0,0,0 },
+  { 0x2000, 0xCAC9, 0x187E, 0xC7C, 0xD2BF, 0x3EC5, 0xC4DF, 0x238E, 0,0,0,0,0,0,0,0 },
+  { 0x2000, 0xC13B, 0x3B21, 0xCAC9, 0x2D41, 0xDC72, 0x187E, 0xF384, 0,0,0,0,0,0,0,0 },
 };
 
+int16_t dct8_matrix_16bit_ortho[8][16] = {  // forward DCT table with row 0 at 0x2D41 (0x4000 / sqrt(2)) instead of 0x4000
+  { 0x2D41, 0x2D41, 0x2D41, 0x2D41, 0x2D41, 0x2D41, 0x2D41, 0x2D41, 0,0,0,0,0,0,0,0 },
+  { 0x3EC5, 0x3537, 0x238E, 0x0C7C, 0xF384, 0xDC72, 0xCAC9, 0xC13B, 0,0,0,0,0,0,0,0 },
+  { 0x3B21, 0x187E, 0xE782, 0xC4DF, 0xC4DF, 0xE782, 0x187E, 0x3B21, 0,0,0,0,0,0,0,0 },
+  { 0x3537, 0xF384, 0xC13B, 0xDC72, 0x238E, 0x3EC5, 0x0C7C, 0xCAC9, 0,0,0,0,0,0,0,0 },
+  { 0x2D41, 0xD2BF, 0xD2BF, 0x2D41, 0x2D41, 0xD2BF, 0xD2BF, 0x2D41, 0,0,0,0,0,0,0,0 },
+  { 0x238E, 0xC13B, 0x0C7C, 0x3537, 0xCAC9, 0xF384, 0x3EC5, 0xDC72, 0,0,0,0,0,0,0,0 },
+  { 0x187E, 0xC4DF, 0x3B21, 0xE782, 0xE782, 0x3B21, 0xC4DF, 0x187E, 0,0,0,0,0,0,0,0 },
+  { 0x0C7C, 0xDC72, 0x3537, 0xC13B, 0x3EC5, 0xCAC9, 0x238E, 0xF384, 0,0,0,0,0,0,0,0 },
+};
+
+
+
 
 headroom_t dct8x8_forward(
     int8_t y[8][8],
@@ -54,4 +67,16 @@ headroom_t dct8x8_inverse(
   int16_t DWORD_ALIGNED buff[8][8];
   dct8x8_stageA(buff, x, idct8_matrix_16bit);
   return dct8x8_stageB(y, buff, idct8_matrix_16bit, sat);
+}
+
+
+
+
+// Forward 2D 8x8 DCT of 8-bit x into 16-bit y, using the dct8_matrix_16bit_ortho coefficients.
+void dct8x8_forward_16bit(int16_t y[8][8], const int8_t x[8][8])
+{
+  // Output of stage A, consumed as the input of stage B.
+  int16_t DWORD_ALIGNED stage_a_out[8][8];
+  dct8x8_stageA(stage_a_out, x, dct8_matrix_16bit_ortho);
+  dct8x8_stageB_16bit(y, stage_a_out, dct8_matrix_16bit_ortho);
 }
\ No newline at end of file
diff --git a/test/dct_tests/src/test_dct8x8.c b/test/dct_tests/src/test_dct8x8.c
index a247d112..37ed2d69 100644
--- a/test/dct_tests/src/test_dct8x8.c
+++ b/test/dct_tests/src/test_dct8x8.c
@@ -16,6 +16,7 @@ TEST_GROUP_RUNNER(dct8x8) {
   RUN_TEST_CASE(dct8x8, dct8x8_stageB);
   RUN_TEST_CASE(dct8x8, dct8x8_forward);
   RUN_TEST_CASE(dct8x8, dct8x8_inverse);
+  RUN_TEST_CASE(dct8x8, dct8x8_forward_16bit);
 }
 
 TEST_GROUP(dct8x8);
@@ -551,3 +552,82 @@ TEST(dct8x8, dct8x8_inverse)
 }
 
 
+
+
+TEST(dct8x8, dct8x8_forward_16bit)
+{
+#define FUNC_NAME "dct8x8_forward_16bit"
+
+#if PRINT_FUNC_NAMES
+  printf("\n%s..\n", FUNC_NAME);
+#endif
+
+  // Single fixed input block. A double-precision reference is computed from the
+  // same values and compared element-by-element with the 16-bit output.
+  DWORD_ALIGNED int8_t x[8][8] = {
+    {-2, 13, 13, 13, 13, 13, 13, 13, },
+    {14, 22, 22, 22, 22, 22, 22, 22, },
+    {14, 22, 22, 22, 22, 22, 22, 22, },
+    {14, 22, 22, 22, 22, 22, 22, 22, },
+    {14, 22, 22, 22, 22, 22, 22, 22, },
+    {14, 22, 22, 22, 22, 22, 22, 22, },
+    {14, 22, 22, 22, 22, 22, 22, 22, },
+    {14, 22, 22, 22, 22, 22, 22, 22, },
+  };
+  // Zero-initialised so a missing write shows up as a deterministic mismatch.
+  DWORD_ALIGNED int16_t y[8][8] = {{0}};
+
+  double ref_in[8][8];
+  double ref_out[8][8];
+
+  // Widen the 8-bit input to double for the reference implementation.
+  for(int row = 0; row < 8; row++){
+    for(int col = 0; col < 8; col++){
+      ref_in[row][col] = x[row][col];
+    }
+  }
+
+  // Compute the reference
+  dbl_dct8x8(ref_out, ref_in, -8);
+
+  // Run the function under test; y must be populated before the comparison below.
+  dct8x8_forward_16bit(y, x);
+
+  // Debug dumps (numpy-pasteable); uncomment when investigating a failure.
+  // printf("x = np.array([ \n");
+  // for(int row = 0; row < 8; row++){
+  //   printf("  [ ");
+  //   for(int col = 0; col < 8; col++)  printf("%d, ", x[row][col]);
+  //   printf(" ],\n");
+  // }
+  // printf("])\n");
+
+  // printf("y = np.array([ \n");
+  // for(int row = 0; row < 8; row++){
+  //   printf("  [ ");
+  //   for(int col = 0; col < 8; col++) printf("%d, ", y[row][col]);
+  //   printf(" ],\n");
+  // }
+  // printf("])\n");
+
+  // printf("ref = np.array([ \n");
+  // for(int row = 0; row < 8; row++){
+  //   printf("  [ ");
+  //   for(int col = 0; col < 8; col++) printf("%0.00f, ", ref_out[row][col]);
+  //   printf(" ],\n");
+  // }
+  // printf("])\n");
+
+  // NOTE(review): zero tolerance and the -8 reference exponent are kept as submitted -- confirm they match the output scaling.
+  float max_allowed_diff = 0;
+  for(int row = 0; row < 8; row++){
+    for(int col = 0; col < 8; col++){
+      float ref_val = ref_out[row][col];
+      float act_val = y[row][col];
+
+      TEST_ASSERT_FLOAT_WITHIN(max_allowed_diff, ref_val, act_val);
+    }
+  }
+
+#undef FUNC_NAME
+}
\ No newline at end of file