diff --git a/lib_xcore_math/api/xmath/_support/dct_impl.h b/lib_xcore_math/api/xmath/_support/dct_impl.h index c1947d37..ed23df05 100644 --- a/lib_xcore_math/api/xmath/_support/dct_impl.h +++ b/lib_xcore_math/api/xmath/_support/dct_impl.h @@ -184,3 +184,14 @@ headroom_t dct8x8_stageB( const int16_t x[8][8], const int16_t matrix[8][16], const right_shift_t sat); + + + +/** + * Final stage of the 2D 8x8 DCT with 16-bit output: performs an 8-point DCT (selected by matrix[][]) on each row of x[][] and stores the transposed result in y[][]. +*/ +C_API +void dct8x8_stageB_16bit( + int16_t y[8][8], + const int16_t x[8][8], + const int16_t matrix[8][16]); diff --git a/lib_xcore_math/api/xmath/dct.h b/lib_xcore_math/api/xmath/dct.h index ace528cb..0b74663c 100644 --- a/lib_xcore_math/api/xmath/dct.h +++ b/lib_xcore_math/api/xmath/dct.h @@ -683,6 +683,17 @@ headroom_t dct8x8_inverse( const int8_t x[8][8], const right_shift_t sat); + + + +/** + * 2D 8x8 forward DCT of the 8-bit input x[][] into the 16-bit output y[][]; runs dct8x8_stageA() then dct8x8_stageB_16bit() with dct8_matrix_16bit_ortho. + */ +C_API +void dct8x8_forward_16bit( + int16_t y[8][8], + const int8_t x[8][8]); + #ifdef __XC__ } // extern "C" diff --git a/lib_xcore_math/src/arch/xs3/dct/s8/dct8x8_stageB.S b/lib_xcore_math/src/arch/xs3/dct/s8/dct8x8_stageB.S index a6fcc330..d3a81b14 100644 --- a/lib_xcore_math/src/arch/xs3/dct/s8/dct8x8_stageB.S +++ b/lib_xcore_math/src/arch/xs3/dct/s8/dct8x8_stageB.S @@ -21,7 +21,7 @@ Whether the forward or inverse DCT is performed depends on whether the matrix[][] argument points to dct8_matrix_16bit[][] or idct8_matrix_16bit[][]. -headroom_t dct8_inversex8_stageB( +headroom_t dct8x8_stageB( int8_t y[8][8], const int16_t x[8][8], const int16_t matrix[8][16], diff --git a/lib_xcore_math/src/arch/xs3/dct/s8/dct8x8_stageB_16bit.S b/lib_xcore_math/src/arch/xs3/dct/s8/dct8x8_stageB_16bit.S new file mode 100644 index 00000000..16c7532a --- /dev/null +++ b/lib_xcore_math/src/arch/xs3/dct/s8/dct8x8_stageB_16bit.S @@ -0,0 +1,120 @@ +// Copyright 2020-2022 XMOS LIMITED. +// This Software is subject to the terms of the XMOS Public Licence: Version 1. +#if defined(__XS3A__) + + +/* + +Perform the final step of a 2D 8-by-8 forward or inverse DCT on 8-bit data. 
+ +The first step takes an 8-bit tensor x[8][8] as input and populates a 16-bit +tensor y[8][8] as output. The first step is implemented as dct8x8_stageA(). + +The final step takes a 16-bit tensor x[8][8] as input and populates a 16-bit +tensor y[8][8] as output. + +The operation is to perform an 8-point DCT on each row of x[][] to get +an intermediate tensor tmp[][], and then populate y[][] with the TRANSPOSE of +tmp[][]. + +Whether the forward or inverse DCT is performed depends on whether the +matrix[][] argument points to dct8_matrix_16bit[][] or +idct8_matrix_16bit[][]. + +void dct8x8_stageB_16bit( + int16_t y[8][8], + const int16_t x[8][8], + const int16_t matrix[8][16]); + +*/ + +#define FUNCTION_NAME dct8x8_stageB_16bit +#define NSTACKWORDS 40 + +.text +.issue_mode dual +.global FUNCTION_NAME +.type FUNCTION_NAME,@function +.align 16 +.cc_top FUNCTION_NAME.function,FUNCTION_NAME + +#define STK_BUFF (NSTACKWORDS - 32) + +#define y r0 +#define x r1 +#define mat r2 +#define buff r3 +#define count r4 +#define _32 r5 +#define _16 r6 +#define sat r7 + +.L_vpu_vec_0x0010: +.short 20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20 + +FUNCTION_NAME: + dualentsp NSTACKWORDS + std r4, r5, sp[0] + std r6, r7, sp[1] + + ldc r11, 0x100 // 16-bit mode +{ ldc _16, 16 ; vsetc r11 } + ldap r11, .L_vpu_vec_0x0010 +{ mov sat, r11 ; } + +////// Perform eight 8-point, 16-bit DCTs + +{ ldc count, 4 ; ldc r11, 28 } +// We need to traverse the rows of x[] backwards to get elements +// in the right output order. + ldaw x, x[r11] +{ ldc _32, 32 ; } + +// Each loop iteration handles 2 rows of the matrix, in the order: (1,0),(3,2),(5,4),(7,6) +// so that later matrix rows end up in higher accumulator indices, and thus higher memory +// when stored. 
+.L_loop_top: + { add mat, mat, _32 ; vclrdr } + { mov r11, x ; vldc mat[0] } + { sub r11, r11, _16 ; vlmaccr r11[0] } + { sub r11, r11, _16 ; vlmaccr r11[0] } + { sub r11, r11, _16 ; vlmaccr r11[0] } + { sub r11, r11, _16 ; vlmaccr r11[0] } + { sub r11, r11, _16 ; vlmaccr r11[0] } + { sub r11, r11, _16 ; vlmaccr r11[0] } + { sub r11, r11, _16 ; vlmaccr r11[0] } + { sub mat, mat, _32 ; vlmaccr r11[0] } + { mov r11, x ; vldc mat[0] } + { sub r11, r11, _16 ; vlmaccr r11[0] } + { sub r11, r11, _16 ; vlmaccr r11[0] } + { sub r11, r11, _16 ; vlmaccr r11[0] } + { sub r11, r11, _16 ; vlmaccr r11[0] } + { sub r11, r11, _16 ; vlmaccr r11[0] } + { sub r11, r11, _16 ; vlmaccr r11[0] } + { sub r11, r11, _16 ; vlmaccr r11[0] } + { add mat, mat, _32 ; vlmaccr r11[0] } + { add mat, mat, _32 ; vlsat sat[0] } + { sub count, count, 1 ; vstr y[0] } + { add y, y, _32 ; bt count, .L_loop_top } +.L_loop_bot: + + ldd r4, r5, sp[0] + ldd r6, r7, sp[1] + + retsp NSTACKWORDS + + +.cc_bottom FUNCTION_NAME.function +.set FUNCTION_NAME.nstackwords,NSTACKWORDS +.globl FUNCTION_NAME.nstackwords +.set FUNCTION_NAME.maxcores,1 +.globl FUNCTION_NAME.maxcores +.set FUNCTION_NAME.maxtimers,0 +.globl FUNCTION_NAME.maxtimers +.set FUNCTION_NAME.maxchanends,0 +.globl FUNCTION_NAME.maxchanends +.Ltmp0: + .size FUNCTION_NAME, .Ltmp0-FUNCTION_NAME + + +#endif //defined(__XS3A__) \ No newline at end of file diff --git a/lib_xcore_math/src/dct/dct8x8.c b/lib_xcore_math/src/dct/dct8x8.c index e5c3d39d..4db4ae6a 100644 --- a/lib_xcore_math/src/dct/dct8x8.c +++ b/lib_xcore_math/src/dct/dct8x8.c @@ -11,28 +11,41 @@ int16_t dct8_matrix_16bit[8][16] = { -{ 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0,0,0,0,0,0,0,0 }, -{ 0x3EC5, 0x3537, 0x238E, 0xC7C, 0xF384, 0xDC72, 0xCAC9, 0xC13B, 0,0,0,0,0,0,0,0 }, -{ 0x3B21, 0x187E, 0xE782, 0xC4DF, 0xC4DF, 0xE782, 0x187E, 0x3B21, 0,0,0,0,0,0,0,0 }, -{ 0x3537, 0xF384, 0xC13B, 0xDC72, 0x238E, 0x3EC5, 0xC7C, 0xCAC9, 0,0,0,0,0,0,0,0 }, -{ 0x2D41, 0xD2BF, 
0xD2BF, 0x2D41, 0x2D41, 0xD2BF, 0xD2BF, 0x2D41, 0,0,0,0,0,0,0,0 }, -{ 0x238E, 0xC13B, 0xC7C, 0x3537, 0xCAC9, 0xF384, 0x3EC5, 0xDC72, 0,0,0,0,0,0,0,0 }, -{ 0x187E, 0xC4DF, 0x3B21, 0xE782, 0xE782, 0x3B21, 0xC4DF, 0x187E, 0,0,0,0,0,0,0,0 }, -{ 0xC7C, 0xDC72, 0x3537, 0xC13B, 0x3EC5, 0xCAC9, 0x238E, 0xF384, 0,0,0,0,0,0,0,0 }, + { 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0,0,0,0,0,0,0,0 }, + { 0x3EC5, 0x3537, 0x238E, 0xC7C, 0xF384, 0xDC72, 0xCAC9, 0xC13B, 0,0,0,0,0,0,0,0 }, + { 0x3B21, 0x187E, 0xE782, 0xC4DF, 0xC4DF, 0xE782, 0x187E, 0x3B21, 0,0,0,0,0,0,0,0 }, + { 0x3537, 0xF384, 0xC13B, 0xDC72, 0x238E, 0x3EC5, 0xC7C, 0xCAC9, 0,0,0,0,0,0,0,0 }, + { 0x2D41, 0xD2BF, 0xD2BF, 0x2D41, 0x2D41, 0xD2BF, 0xD2BF, 0x2D41, 0,0,0,0,0,0,0,0 }, + { 0x238E, 0xC13B, 0xC7C, 0x3537, 0xCAC9, 0xF384, 0x3EC5, 0xDC72, 0,0,0,0,0,0,0,0 }, + { 0x187E, 0xC4DF, 0x3B21, 0xE782, 0xE782, 0x3B21, 0xC4DF, 0x187E, 0,0,0,0,0,0,0,0 }, + { 0xC7C, 0xDC72, 0x3537, 0xC13B, 0x3EC5, 0xCAC9, 0x238E, 0xF384, 0,0,0,0,0,0,0,0 }, }; int16_t idct8_matrix_16bit[8][16] = { -{ 0x2000, 0x3EC5, 0x3B21, 0x3537, 0x2D41, 0x238E, 0x187E, 0xC7C, 0,0,0,0,0,0,0,0 }, -{ 0x2000, 0x3537, 0x187E, 0xF384, 0xD2BF, 0xC13B, 0xC4DF, 0xDC72, 0,0,0,0,0,0,0,0 }, -{ 0x2000, 0x238E, 0xE782, 0xC13B, 0xD2BF, 0xC7C, 0x3B21, 0x3537, 0,0,0,0,0,0,0,0 }, -{ 0x2000, 0xC7C, 0xC4DF, 0xDC72, 0x2D41, 0x3537, 0xE782, 0xC13B, 0,0,0,0,0,0,0,0 }, -{ 0x2000, 0xF384, 0xC4DF, 0x238E, 0x2D41, 0xCAC9, 0xE782, 0x3EC5, 0,0,0,0,0,0,0,0 }, -{ 0x2000, 0xDC72, 0xE782, 0x3EC5, 0xD2BF, 0xF384, 0x3B21, 0xCAC9, 0,0,0,0,0,0,0,0 }, -{ 0x2000, 0xCAC9, 0x187E, 0xC7C, 0xD2BF, 0x3EC5, 0xC4DF, 0x238E, 0,0,0,0,0,0,0,0 }, -{ 0x2000, 0xC13B, 0x3B21, 0xCAC9, 0x2D41, 0xDC72, 0x187E, 0xF384, 0,0,0,0,0,0,0,0 }, + { 0x2000, 0x3EC5, 0x3B21, 0x3537, 0x2D41, 0x238E, 0x187E, 0xC7C, 0,0,0,0,0,0,0,0 }, + { 0x2000, 0x3537, 0x187E, 0xF384, 0xD2BF, 0xC13B, 0xC4DF, 0xDC72, 0,0,0,0,0,0,0,0 }, + { 0x2000, 0x238E, 0xE782, 0xC13B, 0xD2BF, 0xC7C, 0x3B21, 0x3537, 0,0,0,0,0,0,0,0 }, 
+ { 0x2000, 0xC7C, 0xC4DF, 0xDC72, 0x2D41, 0x3537, 0xE782, 0xC13B, 0,0,0,0,0,0,0,0 }, + { 0x2000, 0xF384, 0xC4DF, 0x238E, 0x2D41, 0xCAC9, 0xE782, 0x3EC5, 0,0,0,0,0,0,0,0 }, + { 0x2000, 0xDC72, 0xE782, 0x3EC5, 0xD2BF, 0xF384, 0x3B21, 0xCAC9, 0,0,0,0,0,0,0,0 }, + { 0x2000, 0xCAC9, 0x187E, 0xC7C, 0xD2BF, 0x3EC5, 0xC4DF, 0x238E, 0,0,0,0,0,0,0,0 }, + { 0x2000, 0xC13B, 0x3B21, 0xCAC9, 0x2D41, 0xDC72, 0x187E, 0xF384, 0,0,0,0,0,0,0,0 }, }; +int16_t dct8_matrix_16bit_ortho[8][16] = { + { 0x2D41, 0x2D41, 0x2D41, 0x2D41, 0x2D41, 0x2D41, 0x2D41, 0x2D41 }, + { 0x3EC5, 0x3537, 0x238E, 0x0C7C, 0xF384, 0xDC72, 0xCAC9, 0xC13B }, + { 0x3B21, 0x187E, 0xE782, 0xC4DF, 0xC4DF, 0xE782, 0x187E, 0x3B21 }, + { 0x3537, 0xF384, 0xC13B, 0xDC72, 0x238E, 0x3EC5, 0x0C7C, 0xCAC9 }, + { 0x2D41, 0xD2BF, 0xD2BF, 0x2D41, 0x2D41, 0xD2BF, 0xD2BF, 0x2D41 }, + { 0x238E, 0xC13B, 0x0C7C, 0x3537, 0xCAC9, 0xF384, 0x3EC5, 0xDC72 }, + { 0x187E, 0xC4DF, 0x3B21, 0xE782, 0xE782, 0x3B21, 0xC4DF, 0x187E }, + { 0x0C7C, 0xDC72, 0x3537, 0xC13B, 0x3EC5, 0xCAC9, 0x238E, 0xF384 }, +}; + + + headroom_t dct8x8_forward( int8_t y[8][8], @@ -54,4 +67,16 @@ headroom_t dct8x8_inverse( int16_t DWORD_ALIGNED buff[8][8]; dct8x8_stageA(buff, x, idct8_matrix_16bit); return dct8x8_stageB(y, buff, idct8_matrix_16bit, sat); +} + + + + +void dct8x8_forward_16bit( + int16_t y[8][8], + const int8_t x[8][8]) +{ + int16_t DWORD_ALIGNED buff[8][8]; + dct8x8_stageA(buff, x, dct8_matrix_16bit_ortho); + dct8x8_stageB_16bit(y, buff, dct8_matrix_16bit_ortho); } \ No newline at end of file diff --git a/test/dct_tests/src/test_dct8x8.c b/test/dct_tests/src/test_dct8x8.c index a247d112..37ed2d69 100644 --- a/test/dct_tests/src/test_dct8x8.c +++ b/test/dct_tests/src/test_dct8x8.c @@ -16,6 +16,7 @@ TEST_GROUP_RUNNER(dct8x8) { RUN_TEST_CASE(dct8x8, dct8x8_stageB); RUN_TEST_CASE(dct8x8, dct8x8_forward); RUN_TEST_CASE(dct8x8, dct8x8_inverse); + RUN_TEST_CASE(dct8x8, dct8x8_forward_16bit); } TEST_GROUP(dct8x8); @@ -551,3 +552,82 @@ TEST(dct8x8, 
dct8x8_inverse) } + +// Checks dct8x8_forward_16bit() against the double-precision reference dbl_dct8x8(). +TEST(dct8x8, dct8x8_forward_16bit) +{ +#define FUNC_NAME "dct8x8_forward_16bit" + +#if PRINT_FUNC_NAMES + printf("\n%s..\n", FUNC_NAME); +#endif + + unsigned r = 1; + + float worst_timing = 0.0f; + + DWORD_ALIGNED int8_t x[8][8] = { + {-2, 13, 13, 13, 13, 13, 13, 13, }, + {14, 22, 22, 22, 22, 22, 22, 22, }, + {14, 22, 22, 22, 22, 22, 22, 22, }, + {14, 22, 22, 22, 22, 22, 22, 22, }, + {14, 22, 22, 22, 22, 22, 22, 22, }, + {14, 22, 22, 22, 22, 22, 22, 22, }, + {14, 22, 22, 22, 22, 22, 22, 22, }, + {14, 22, 22, 22, 22, 22, 22, 22, }, + }; + DWORD_ALIGNED int16_t y[8][8]; + + double ref_in[8][8]; + double ref_out[8][8]; + + int32_t ref_out_s32[8][8]; + + for(int row = 0; row < 8; row++){ + for(unsigned col = 0; col < 8; col++){ + ref_in[row][col] = x[row][col]; + } + } + + // Compute the reference + dbl_dct8x8(ref_out, ref_in, -8); + + // printf("x = np.array([ \n"); + // for(int row = 0; row < 8; row++){ + // printf(" [ "); + // for(int col = 0; col < 8; col++) printf("%d, ", x[row][col]); + // printf(" ],\n"); + // } + // printf("])\n"); + // Run the function under test so that y[][] holds its output before the comparison below. + dct8x8_forward_16bit(y, x); + + // printf("y = np.array([ \n"); + // for(int row = 0; row < 8; row++){ + // printf(" [ "); + // for(int col = 0; col < 8; col++) printf("%d, ", y[row][col]); + // printf(" ],\n"); + // } + // printf("])\n"); + + // printf("ref = np.array([ \n"); + // for(int row = 0; row < 8; row++){ + // printf(" [ "); + // for(int col = 0; col < 8; col++) printf("%0.00f, ", ref_out[row][col]); + // printf(" ],\n"); + // } + // printf("])\n"); + + float max_allowed_diff = 0; // NOTE(review): demands an exact match with the reference; confirm zero tolerance is intended + + for(int row = 0; row < 8; row++){ + for(unsigned col = 0; col < 8; col++){ + float ref_val = ref_out[row][col]; + float act_val = y[row][col]; + + TEST_ASSERT_FLOAT_WITHIN(max_allowed_diff, ref_val, act_val); + } + } + +#undef FUNC_NAME +} \ No newline at end of file