Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions lib_xcore_math/api/xmath/_support/dct_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -184,3 +184,14 @@ headroom_t dct8x8_stageB(
const int16_t x[8][8],
const int16_t matrix[8][16],
const right_shift_t sat);



/**
 * @brief Final stage of a 2D 8-by-8 DCT, producing 16-bit output.
 *
 * Performs an 8-point DCT on each row of `x[][]` using the coefficients in
 * `matrix[][]`, and populates `y[][]` with the TRANSPOSE of that intermediate
 * result. Whether the forward or inverse transform is computed depends on
 * which coefficient table is passed as `matrix`.
 *
 * Unlike dct8x8_stageB(), this variant takes no `sat` argument and returns no
 * headroom. The XS3 implementation (dct8x8_stageB_16bit.S) applies a fixed
 * 20-bit right-shift when reducing the accumulators to 16-bit outputs.
 *
 * NOTE(review): the XS3 implementation accesses `y`, `x` and `matrix` with VPU
 * vector loads/stores, so these buffers presumably must be word-aligned (the
 * caller and the unit test use DWORD_ALIGNED buffers) -- confirm.
 *
 * @param[out]  y       Output tensor (16-bit elements)
 * @param[in]   x       Input tensor (16-bit elements), e.g. the output of dct8x8_stageA()
 * @param[in]   matrix  DCT coefficient table, 8 rows of 16 elements
 */
C_API
void dct8x8_stageB_16bit(
int16_t y[8][8],
const int16_t x[8][8],
const int16_t matrix[8][16]);
11 changes: 11 additions & 0 deletions lib_xcore_math/api/xmath/dct.h
Original file line number Diff line number Diff line change
Expand Up @@ -683,6 +683,17 @@ headroom_t dct8x8_inverse(
const int8_t x[8][8],
const right_shift_t sat);




/**
 * @brief 2D 8-by-8 forward DCT of 8-bit input, producing 16-bit output.
 *
 * The transform is computed in two passes: dct8x8_stageA() followed by
 * dct8x8_stageB_16bit(), both using the `dct8_matrix_16bit_ortho` coefficient
 * table.
 *
 * Unlike dct8x8_forward(), this function takes no `sat` argument and does not
 * return the headroom of the result.
 *
 * NOTE(review): the scaling of `y[][]` relative to the mathematical DCT is not
 * stated anywhere in the implementation -- document it once confirmed.
 *
 * @param[out]  y   Output tensor (16-bit elements)
 * @param[in]   x   Input tensor (8-bit elements)
 */
C_API
void dct8x8_forward_16bit(
int16_t y[8][8],
const int8_t x[8][8]);


#ifdef __XC__
} // extern "C"
Expand Down
2 changes: 1 addition & 1 deletion lib_xcore_math/src/arch/xs3/dct/s8/dct8x8_stageB.S
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ Whether the forward or inverse DCT is performed depends on whether the
matrix[][] argument points to dct8_matrix_16bit[][] or
idct8_matrix_16bit[][].

headroom_t dct8_inversex8_stageB(
headroom_t dct8x8_stageB(
int8_t y[8][8],
const int16_t x[8][8],
const int16_t matrix[8][16],
Expand Down
120 changes: 120 additions & 0 deletions lib_xcore_math/src/arch/xs3/dct/s8/dct8x8_stageB_16bit.S
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
// Copyright 2020-2022 XMOS LIMITED.
// This Software is subject to the terms of the XMOS Public Licence: Version 1.
#if defined(__XS3A__)


/*

Perform the final step of a 2D 8-by-8 forward or inverse DCT on 8-bit data.

The first step takes an 8-bit tensor x[8][8] as input and populates a 16-bit
tensor y[8][8] as output. The first step is implemented as dct8x8_stageA().

The final step takes a 16-bit tensor x[8][8] as input and populates a 16-bit
tensor y[8][8] as output.

The operation is to perform an 8-point DCT on each row of x[][] to get
an intermediate tensor tmp[][], and then populate y[][] with the TRANSPOSE of
tmp[][].

Whether the forward or inverse DCT is performed depends on whether the
matrix[][] argument points to dct8_matrix_16bit[][] or
idct8_matrix_16bit[][].

void dct8x8_stageB_16bit(
int16_t y[8][8],
const int16_t x[8][8],
const int16_t matrix[8][16]);

*/

#define FUNCTION_NAME   dct8x8_stageB_16bit

// The frame only holds the four callee-saved registers spilled in the
// prologue (r4..r7 at sp[0..3]); this function needs no scratch buffer.
#define NSTACKWORDS     4

.text
.issue_mode dual
.global FUNCTION_NAME
.type FUNCTION_NAME,@function
.align 16
.cc_top FUNCTION_NAME.function,FUNCTION_NAME

// Register roles
//   y        (r0)  in: output tensor; advanced by 32 bytes (two rows) per iteration
//   x        (r1)  in: input tensor; re-pointed at its last row before the loop
//   mat      (r2)  in: coefficient table; advances by 64 bytes (two rows) per iteration
//   count    (r4)  loop counter, 4 iterations
//   _32      (r5)  constant 32: byte size of one coefficient row / two tensor rows
//   _16      (r6)  constant 16: byte size of one row of x[]
//   shr_vec  (r7)  address of the per-element shift vector consumed by vlsat
//   r11            scratch: VPU control word, then the row pointer into x[]
#define y         r0
#define x         r1
#define mat       r2
#define count     r4
#define _32       r5
#define _16       r6
#define shr_vec   r7

// Per-accumulator shift applied by vlsat when reducing the accumulators to
// 16-bit outputs: 20 bits for every element.
.L_vpu_vec_shr_20:
.short 20,20,20,20,20,20,20,20,20,20,20,20,20,20,20,20

FUNCTION_NAME:
    dualentsp NSTACKWORDS
    std r4, r5, sp[0]
    std r6, r7, sp[1]

    ldc r11, 0x100      // 16-bit mode
  { ldc _16, 16                 ; vsetc r11               }
    ldap r11, .L_vpu_vec_shr_20
  { mov shr_vec, r11            ;                         }

////// Perform eight 8-point, 16-bit DCTs

  // count = 4 loop iterations; r11 = 28 words = 112 bytes = offset of row 7 of x[]
  { ldc count, 4                ; ldc r11, 28             }
    // We need to traverse the rows of x[] backwards to get elements
    // in the right output order.
    ldaw x, x[r11]
  { ldc _32, 32                 ;                         }

  // Each loop handles 2 rows of the matrix, in the order: (1,0),(3,2),(5,4),(7,6)
  // so that later matrix rows end up in higher accumulator indices, and thus higher memory
  // when stored.
  .L_loop_top:
    // Odd coefficient row first: step mat forward one row and load it into vC.
    { add mat, mat, _32         ; vclrdr                  }
    { mov r11, x                ; vldc mat[0]             }
    // One vlmaccr per row of x[], walking from row 7 down to row 0.
    { sub r11, r11, _16         ; vlmaccr r11[0]          }
    { sub r11, r11, _16         ; vlmaccr r11[0]          }
    { sub r11, r11, _16         ; vlmaccr r11[0]          }
    { sub r11, r11, _16         ; vlmaccr r11[0]          }
    { sub r11, r11, _16         ; vlmaccr r11[0]          }
    { sub r11, r11, _16         ; vlmaccr r11[0]          }
    { sub r11, r11, _16         ; vlmaccr r11[0]          }
    // Step mat back to the even coefficient row of this pair.
    { sub mat, mat, _32         ; vlmaccr r11[0]          }
    { mov r11, x                ; vldc mat[0]             }
    { sub r11, r11, _16         ; vlmaccr r11[0]          }
    { sub r11, r11, _16         ; vlmaccr r11[0]          }
    { sub r11, r11, _16         ; vlmaccr r11[0]          }
    { sub r11, r11, _16         ; vlmaccr r11[0]          }
    { sub r11, r11, _16         ; vlmaccr r11[0]          }
    { sub r11, r11, _16         ; vlmaccr r11[0]          }
    { sub r11, r11, _16         ; vlmaccr r11[0]          }
    // Two adds leave mat pointing at the next pair of coefficient rows.
    { add mat, mat, _32         ; vlmaccr r11[0]          }
    // Shift/saturate the 16 accumulators, then store them as two rows of y[].
    { add mat, mat, _32         ; vlsat shr_vec[0]        }
    { sub count, count, 1       ; vstr y[0]               }
    { add y, y, _32             ; bt count, .L_loop_top   }
  .L_loop_bot:

    ldd r4, r5, sp[0]
    ldd r6, r7, sp[1]

    retsp NSTACKWORDS


.cc_bottom FUNCTION_NAME.function
.set FUNCTION_NAME.nstackwords,NSTACKWORDS
.globl FUNCTION_NAME.nstackwords
.set FUNCTION_NAME.maxcores,1
.globl FUNCTION_NAME.maxcores
.set FUNCTION_NAME.maxtimers,0
.globl FUNCTION_NAME.maxtimers
.set FUNCTION_NAME.maxchanends,0
.globl FUNCTION_NAME.maxchanends
.Ltmp0:
.size FUNCTION_NAME, .Ltmp0-FUNCTION_NAME


#endif //defined(__XS3A__)
57 changes: 41 additions & 16 deletions lib_xcore_math/src/dct/dct8x8.c
Original file line number Diff line number Diff line change
Expand Up @@ -11,28 +11,41 @@


/**
 * 16-bit coefficient table selecting the forward 8-point DCT when passed as
 * the `matrix` argument of dct8x8_stageA() / dct8x8_stageB().
 *
 * Each row holds 8 coefficients followed by 8 zeros of padding.
 * NOTE(review): the padding presumably lets the VPU kernels load a full
 * 16-element vector per row with the upper half contributing nothing --
 * confirm before changing the row width.
 */
int16_t dct8_matrix_16bit[8][16] = {
  { 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0x4000, 0,0,0,0,0,0,0,0 },
  { 0x3EC5, 0x3537, 0x238E, 0x0C7C, 0xF384, 0xDC72, 0xCAC9, 0xC13B, 0,0,0,0,0,0,0,0 },
  { 0x3B21, 0x187E, 0xE782, 0xC4DF, 0xC4DF, 0xE782, 0x187E, 0x3B21, 0,0,0,0,0,0,0,0 },
  { 0x3537, 0xF384, 0xC13B, 0xDC72, 0x238E, 0x3EC5, 0x0C7C, 0xCAC9, 0,0,0,0,0,0,0,0 },
  { 0x2D41, 0xD2BF, 0xD2BF, 0x2D41, 0x2D41, 0xD2BF, 0xD2BF, 0x2D41, 0,0,0,0,0,0,0,0 },
  { 0x238E, 0xC13B, 0x0C7C, 0x3537, 0xCAC9, 0xF384, 0x3EC5, 0xDC72, 0,0,0,0,0,0,0,0 },
  { 0x187E, 0xC4DF, 0x3B21, 0xE782, 0xE782, 0x3B21, 0xC4DF, 0x187E, 0,0,0,0,0,0,0,0 },
  { 0x0C7C, 0xDC72, 0x3537, 0xC13B, 0x3EC5, 0xCAC9, 0x238E, 0xF384, 0,0,0,0,0,0,0,0 },
};


/**
 * 16-bit coefficient table selecting the inverse 8-point DCT when passed as
 * the `matrix` argument of dct8x8_stageA() / dct8x8_stageB().
 *
 * Each row holds 8 coefficients followed by 8 zeros of padding.
 * NOTE(review): the padding presumably lets the VPU kernels load a full
 * 16-element vector per row with the upper half contributing nothing --
 * confirm before changing the row width.
 */
int16_t idct8_matrix_16bit[8][16] = {
  { 0x2000, 0x3EC5, 0x3B21, 0x3537, 0x2D41, 0x238E, 0x187E, 0x0C7C, 0,0,0,0,0,0,0,0 },
  { 0x2000, 0x3537, 0x187E, 0xF384, 0xD2BF, 0xC13B, 0xC4DF, 0xDC72, 0,0,0,0,0,0,0,0 },
  { 0x2000, 0x238E, 0xE782, 0xC13B, 0xD2BF, 0x0C7C, 0x3B21, 0x3537, 0,0,0,0,0,0,0,0 },
  { 0x2000, 0x0C7C, 0xC4DF, 0xDC72, 0x2D41, 0x3537, 0xE782, 0xC13B, 0,0,0,0,0,0,0,0 },
  { 0x2000, 0xF384, 0xC4DF, 0x238E, 0x2D41, 0xCAC9, 0xE782, 0x3EC5, 0,0,0,0,0,0,0,0 },
  { 0x2000, 0xDC72, 0xE782, 0x3EC5, 0xD2BF, 0xF384, 0x3B21, 0xCAC9, 0,0,0,0,0,0,0,0 },
  { 0x2000, 0xCAC9, 0x187E, 0x0C7C, 0xD2BF, 0x3EC5, 0xC4DF, 0x238E, 0,0,0,0,0,0,0,0 },
  { 0x2000, 0xC13B, 0x3B21, 0xCAC9, 0x2D41, 0xDC72, 0x187E, 0xF384, 0,0,0,0,0,0,0,0 },
};

/**
 * 16-bit coefficient table used by dct8x8_forward_16bit() for both
 * dct8x8_stageA() and dct8x8_stageB_16bit().
 *
 * It differs from dct8_matrix_16bit[][] only in its first row (0x2D41 instead
 * of 0x4000).
 *
 * Each row holds 8 coefficients followed by 8 zeros of padding. The padding is
 * written out explicitly, as in the other coefficient tables, rather than
 * left to implicit zero-initialisation.
 * NOTE(review): the padding presumably lets the VPU kernels load a full
 * 16-element vector per row with the upper half contributing nothing --
 * confirm before changing the row width.
 */
int16_t dct8_matrix_16bit_ortho[8][16] = {
  { 0x2D41, 0x2D41, 0x2D41, 0x2D41, 0x2D41, 0x2D41, 0x2D41, 0x2D41, 0,0,0,0,0,0,0,0 },
  { 0x3EC5, 0x3537, 0x238E, 0x0C7C, 0xF384, 0xDC72, 0xCAC9, 0xC13B, 0,0,0,0,0,0,0,0 },
  { 0x3B21, 0x187E, 0xE782, 0xC4DF, 0xC4DF, 0xE782, 0x187E, 0x3B21, 0,0,0,0,0,0,0,0 },
  { 0x3537, 0xF384, 0xC13B, 0xDC72, 0x238E, 0x3EC5, 0x0C7C, 0xCAC9, 0,0,0,0,0,0,0,0 },
  { 0x2D41, 0xD2BF, 0xD2BF, 0x2D41, 0x2D41, 0xD2BF, 0xD2BF, 0x2D41, 0,0,0,0,0,0,0,0 },
  { 0x238E, 0xC13B, 0x0C7C, 0x3537, 0xCAC9, 0xF384, 0x3EC5, 0xDC72, 0,0,0,0,0,0,0,0 },
  { 0x187E, 0xC4DF, 0x3B21, 0xE782, 0xE782, 0x3B21, 0xC4DF, 0x187E, 0,0,0,0,0,0,0,0 },
  { 0x0C7C, 0xDC72, 0x3537, 0xC13B, 0x3EC5, 0xCAC9, 0x238E, 0xF384, 0,0,0,0,0,0,0,0 },
};




headroom_t dct8x8_forward(
int8_t y[8][8],
Expand All @@ -54,4 +67,16 @@ headroom_t dct8x8_inverse(
int16_t DWORD_ALIGNED buff[8][8];
dct8x8_stageA(buff, x, idct8_matrix_16bit);
return dct8x8_stageB(y, buff, idct8_matrix_16bit, sat);
}




void dct8x8_forward_16bit(
int16_t y[8][8],
const int8_t x[8][8])
{
int16_t DWORD_ALIGNED buff[8][8];
dct8x8_stageA(buff, x, dct8_matrix_16bit_ortho);
dct8x8_stageB_16bit(y, buff, dct8_matrix_16bit_ortho);
}
80 changes: 80 additions & 0 deletions test/dct_tests/src/test_dct8x8.c
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ TEST_GROUP_RUNNER(dct8x8) {
RUN_TEST_CASE(dct8x8, dct8x8_stageB);
RUN_TEST_CASE(dct8x8, dct8x8_forward);
RUN_TEST_CASE(dct8x8, dct8x8_inverse);
RUN_TEST_CASE(dct8x8, dct8x8_forward_16bit);
}

TEST_GROUP(dct8x8);
Expand Down Expand Up @@ -551,3 +552,82 @@ TEST(dct8x8, dct8x8_inverse)
}




TEST(dct8x8, dct8x8_forward_16bit)
{
#define FUNC_NAME "dct8x8_forward_16bit"

#if PRINT_FUNC_NAMES
printf("\n%s..\n", FUNC_NAME);
#endif

unsigned r = 1;

float worst_timing = 0.0f;

DWORD_ALIGNED int8_t x[8][8] = {
{-2, 13, 13, 13, 13, 13, 13, 13, },
{14, 22, 22, 22, 22, 22, 22, 22, },
{14, 22, 22, 22, 22, 22, 22, 22, },
{14, 22, 22, 22, 22, 22, 22, 22, },
{14, 22, 22, 22, 22, 22, 22, 22, },
{14, 22, 22, 22, 22, 22, 22, 22, },
{14, 22, 22, 22, 22, 22, 22, 22, },
{14, 22, 22, 22, 22, 22, 22, 22, },
};
DWORD_ALIGNED int16_t y[8][8];

double ref_in[8][8];
double ref_out[8][8];

int32_t ref_out_s32[8][8];

for(int row = 0; row < 8; row++){
for(unsigned col = 0; col < 8; col++){
ref_in[row][col] = x[row][col];
}
}

// Compute the reference
dbl_dct8x8(ref_out, ref_in, -8);

// printf("x = np.array([ \n");
// for(int row = 0; row < 8; row++){
// printf(" [ ");
// for(int col = 0; col < 8; col++) printf("%d, ", x[row][col]);
// printf(" ],\n");
// }
// printf("])\n");

// dct8x8_forward_16bit(y, x);

// printf("y = np.array([ \n");
// for(int row = 0; row < 8; row++){
// printf(" [ ");
// for(int col = 0; col < 8; col++) printf("%d, ", y[row][col]);
// printf(" ],\n");
// }
// printf("])\n");

// printf("ref = np.array([ \n");
// for(int row = 0; row < 8; row++){
// printf(" [ ");
// for(int col = 0; col < 8; col++) printf("%0.00f, ", ref_out[row][col]);
// printf(" ],\n");
// }
// printf("])\n");

float max_allowed_diff = 0;

for(int row = 0; row < 8; row++){
for(unsigned col = 0; col < 8; col++){
float ref_val = ref_out[row][col];
float act_val = y[row][col];

TEST_ASSERT_FLOAT_WITHIN(max_allowed_diff, ref_val, act_val);
}
}

#undef FUNC_NAME
}