Skip to content

Commit

Permalink
Reduce memory usage for instruction block
Browse files Browse the repository at this point in the history
  • Loading branch information
RinHizakura committed Sep 30, 2023
1 parent 806fc6e commit 90fa28f
Show file tree
Hide file tree
Showing 5 changed files with 117 additions and 60 deletions.
2 changes: 2 additions & 0 deletions src/decode.h
Original file line number Diff line number Diff line change
Expand Up @@ -298,6 +298,8 @@ typedef struct rv_insn {
* specific IR array without the need for additional copying.
*/
struct rv_insn *branch_taken, *branch_untaken;

struct rv_insn *next;
} rv_insn_t;

/* decode the RISC-V instruction */
Expand Down
128 changes: 79 additions & 49 deletions src/emulate.c
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ extern struct target_ops gdbstub_ops;
#include "riscv_private.h"
#include "state.h"
#include "utils.h"
#include "mpool.h"

/* RISC-V exception code list */
#define RV_EXCEPTION_LIST \
Expand Down Expand Up @@ -277,16 +278,18 @@ static inline uint32_t hash(size_t k)
return k;
}

static void block_translate(riscv_t *rv, block_map_t *map, block_t *block);
/* allocate a basic block */
static block_t *block_alloc(const uint8_t bits)
static block_t *block_alloc(riscv_t *rv, block_map_t *map)
{
block_t *block = malloc(sizeof(struct block));
block_t *block = mpool_alloc(map->block_mp);
assert(block);
block->insn_capacity = 1 << bits;
block->n_insn = 0;
block->predict = NULL;
block->ir = malloc(block->insn_capacity * sizeof(rv_insn_t));
assert(block->ir);

/* Translate the basic block; block->first_ir and block->last_ir will
* be initialized there. */
block_translate(rv, map, block);
return block;
}

Expand Down Expand Up @@ -366,7 +369,7 @@ static uint32_t last_pc = 0;
rv->PC += ir->insn_len; \
if (unlikely(RVOP_NO_NEXT(ir))) \
return true; \
const rv_insn_t *next = ir + 1; \
const rv_insn_t *next = ir->next; \
MUST_TAIL return next->impl(rv, next); \
}

Expand Down Expand Up @@ -395,36 +398,47 @@ enum {
#undef _
};

/* Return the instruction that lies n links after ir in the block's linked
 * IR list, following the 'next' pointers (n == 0 yields ir itself).
 *
 * Both arguments are evaluated exactly once, so expressions with side
 * effects are safe to pass. No NULL check is performed while walking: the
 * caller must guarantee that at least n successors exist.
 *
 * FIXME: This finds the n-th instruction by walking the linked list
 * linearly, which costs O(n) per lookup; we may want a better approach.
 */
#define GET_NEXT_N_INSN(ir, n)                                         \
    ({                                                                 \
        rv_insn_t *next_n_cur_ = (ir);                                 \
        const __typeof__(n) next_n_cnt_ = (n);                         \
        for (__typeof__(n) next_n_i_ = 0; next_n_i_ < next_n_cnt_;     \
             next_n_i_++)                                              \
            next_n_cur_ = next_n_cur_->next;                           \
        next_n_cur_;                                                   \
    })

/* multiple lui */
static bool do_fuse1(riscv_t *rv, const rv_insn_t *ir)
static bool do_fuse1(riscv_t *rv, rv_insn_t *ir)
{
rv->csr_cycle += ir->imm2;
for (int i = 0; i < ir->imm2; i++) {
const rv_insn_t *cur_ir = ir + i;
int i;
rv_insn_t *cur_ir;
for (i = 0, cur_ir = ir; i < ir->imm2; i++, cur_ir = cur_ir->next) {
rv->X[cur_ir->rd] = cur_ir->imm;
}
rv->PC += ir->imm2 * ir->insn_len;
if (unlikely(RVOP_NO_NEXT(ir)))
return true;
const rv_insn_t *next = ir + ir->imm2;
const rv_insn_t *next = GET_NEXT_N_INSN(ir, ir->imm2);
MUST_TAIL return next->impl(rv, next);
}

/* LUI + ADD */
static bool do_fuse2(riscv_t *rv, const rv_insn_t *ir)
static bool do_fuse2(riscv_t *rv, rv_insn_t *ir)
{
rv->csr_cycle += 2;
rv->X[ir->rd] = ir->imm;
rv->X[ir->rs2] = rv->X[ir->rd] + rv->X[ir->rs1];
rv->PC += 2 * ir->insn_len;
if (unlikely(RVOP_NO_NEXT(ir)))
return true;
const rv_insn_t *next = ir + 2;
const rv_insn_t *next = GET_NEXT_N_INSN(ir, 2);
MUST_TAIL return next->impl(rv, next);
}

/* multiple SW */
static bool do_fuse3(riscv_t *rv, const rv_insn_t *ir)
static bool do_fuse3(riscv_t *rv, rv_insn_t *ir)
{
rv->csr_cycle += ir->imm2;
opcode_fuse_t *fuse = ir->fuse;
Expand All @@ -442,12 +456,12 @@ static bool do_fuse3(riscv_t *rv, const rv_insn_t *ir)
rv->PC += ir->imm2 * ir->insn_len;
if (unlikely(RVOP_NO_NEXT(ir)))
return true;
const rv_insn_t *next = ir + ir->imm2;
const rv_insn_t *next = GET_NEXT_N_INSN(ir, ir->imm2);
MUST_TAIL return next->impl(rv, next);
}

/* multiple LW */
static bool do_fuse4(riscv_t *rv, const rv_insn_t *ir)
static bool do_fuse4(riscv_t *rv, rv_insn_t *ir)
{
rv->csr_cycle += ir->imm2;
opcode_fuse_t *fuse = ir->fuse;
Expand All @@ -465,7 +479,7 @@ static bool do_fuse4(riscv_t *rv, const rv_insn_t *ir)
rv->PC += ir->imm2 * ir->insn_len;
if (unlikely(RVOP_NO_NEXT(ir)))
return true;
const rv_insn_t *next = ir + ir->imm2;
const rv_insn_t *next = GET_NEXT_N_INSN(ir, ir->imm2);
MUST_TAIL return next->impl(rv, next);
}

Expand All @@ -479,7 +493,7 @@ static bool do_fuse5(riscv_t *rv, const rv_insn_t *ir)
rv->PC = rv->X[rv_reg_ra] & ~1U;
if (unlikely(RVOP_NO_NEXT(ir)))
return true;
const rv_insn_t *next = ir + 1;
const rv_insn_t *next = ir->next;
MUST_TAIL return next->impl(rv, next);
}

Expand All @@ -493,7 +507,7 @@ static bool do_fuse6(riscv_t *rv, const rv_insn_t *ir)
rv->PC = rv->X[rv_reg_ra] & ~1U;
if (unlikely(RVOP_NO_NEXT(ir)))
return true;
const rv_insn_t *next = ir + 1;
const rv_insn_t *next = ir->next;
MUST_TAIL return next->impl(rv, next);
}

Expand Down Expand Up @@ -541,15 +555,21 @@ static inline bool insn_is_unconditional_branch(uint8_t opcode)
return false;
}

static void block_translate(riscv_t *rv, block_t *block)
static void block_translate(riscv_t *rv, block_map_t *map, block_t *block)
{
block->pc_start = block->pc_end = rv->PC;

rv_insn_t *prev_ir = NULL;
rv_insn_t *ir = mpool_alloc(map->block_ir_mp);
block->first_ir = ir;

/* translate the basic block */
while (block->n_insn < block->insn_capacity) {
rv_insn_t *ir = block->ir + block->n_insn;
while (true) {
memset(ir, 0, sizeof(rv_insn_t));

if (prev_ir)
prev_ir->next = ir;

/* fetch the next instruction */
const uint32_t insn = rv->io.mem_ifetch(block->pc_end);

Expand All @@ -564,21 +584,29 @@ static void block_translate(riscv_t *rv, block_t *block)
/* compute the end of pc */
block->pc_end += ir->insn_len;
block->n_insn++;
prev_ir = ir;
/* stop on branch */
if (insn_is_branch(ir->opcode))
break;

ir = mpool_alloc(map->block_ir_mp);
}
block->ir[block->n_insn - 1].tailcall = true;

assert(prev_ir);
block->last_ir = prev_ir;
block->last_ir->tailcall = true;
}

#define COMBINE_MEM_OPS(RW) \
count = 1; \
next_ir = ir + 1; \
next_ir = ir->next; \
tmp_ir = next_ir; \
if (next_ir->opcode != IIF(RW)(rv_insn_lw, rv_insn_sw)) \
break; \
sign = (ir->imm - next_ir->imm) >> 31 ? -1 : 1; \
for (uint32_t j = 1; j < block->n_insn - 1 - i; j++) { \
next_ir = ir + j; \
next_ir = tmp_ir; \
for (uint32_t j = 1; j < block->n_insn - 1 - i; \
j++, next_ir = next_ir->next) { \
if (next_ir->opcode != IIF(RW)(rv_insn_lw, rv_insn_sw) || \
ir->rs1 != next_ir->rs1 || ir->imm - next_ir->imm != 4 * sign) \
break; \
Expand All @@ -590,8 +618,8 @@ static void block_translate(riscv_t *rv, block_t *block)
ir->imm2 = count; \
memcpy(ir->fuse, ir, sizeof(opcode_fuse_t)); \
ir->impl = dispatch_table[ir->opcode]; \
for (int j = 1; j < count; j++) { \
next_ir = ir + j; \
next_ir = tmp_ir; \
for (int j = 1; j < count; j++, next_ir = next_ir->next) { \
memcpy(ir->fuse + j, next_ir, sizeof(opcode_fuse_t)); \
} \
ir->tailcall = next_ir->tailcall; \
Expand Down Expand Up @@ -825,7 +853,7 @@ static bool detect_memcpy(riscv_t *rv, int lib)

static bool libc_substitute(riscv_t *rv, block_t *block)
{
rv_insn_t *ir = block->ir, *next_ir = NULL;
rv_insn_t *ir = block->first_ir, *next_ir = NULL;
switch (ir->opcode) {
case rv_insn_addi:
/* Compare the target block with the first basic block of
Expand All @@ -835,10 +863,10 @@ static bool libc_substitute(riscv_t *rv, block_t *block)
* instruction sequence.
*/
if (ir->imm == 15 && ir->rd == rv_reg_t1 && ir->rs1 == rv_reg_zero) {
next_ir = ir + 1;
next_ir = ir->next;
if (next_ir->opcode == rv_insn_addi && next_ir->rd == rv_reg_a4 &&
next_ir->rs1 == rv_reg_a0 && next_ir->rs2 == rv_reg_zero) {
next_ir = next_ir + 1;
next_ir = next_ir->next;
if (next_ir->opcode == rv_insn_bgeu && next_ir->imm == 60 &&
next_ir->rs1 == rv_reg_t1 && next_ir->rs2 == rv_reg_a2) {
if (detect_memset(rv, 1)) {
Expand All @@ -851,7 +879,7 @@ static bool libc_substitute(riscv_t *rv, block_t *block)
}
} else if (ir->imm == 0 && ir->rd == rv_reg_t1 &&
ir->rs1 == rv_reg_a0) {
next_ir = ir + 1;
next_ir = ir->next;
if (next_ir->opcode == rv_insn_beq && next_ir->rs1 == rv_reg_a2 &&
next_ir->rs2 == rv_reg_zero) {
if (next_ir->imm == 20 && detect_memset(rv, 2)) {
Expand All @@ -876,14 +904,14 @@ static bool libc_substitute(riscv_t *rv, block_t *block)
*/
if (ir->rd == rv_reg_a5 && ir->rs1 == rv_reg_a0 &&
ir->rs2 == rv_reg_a1) {
next_ir = ir + 1;
next_ir = ir->next;
if (next_ir->opcode == rv_insn_andi && next_ir->imm == 3 &&
next_ir->rd == rv_reg_a5 && next_ir->rs1 == rv_reg_a5) {
next_ir = next_ir + 1;
next_ir = next_ir->next;
if (next_ir->opcode == rv_insn_add &&
next_ir->rd == rv_reg_a7 && next_ir->rs1 == rv_reg_a0 &&
next_ir->rs2 == rv_reg_a2) {
next_ir = next_ir + 1;
next_ir = next_ir->next;
if (next_ir->opcode == rv_insn_bne && next_ir->imm == 104 &&
next_ir->rs1 == rv_reg_a5 &&
next_ir->rs2 == rv_reg_zero) {
Expand Down Expand Up @@ -912,12 +940,15 @@ static bool libc_substitute(riscv_t *rv, block_t *block)
*/
static void match_pattern(block_t *block)
{
for (uint32_t i = 0; i < block->n_insn - 1; i++) {
rv_insn_t *ir = block->ir + i, *next_ir = NULL;
uint32_t i;
rv_insn_t *ir;
for (i = 0, ir = block->first_ir; i < block->n_insn - 1;
i++, ir = ir->next) {
rv_insn_t *next_ir = NULL, *tmp_ir = NULL;
int32_t count = 0, sign = 1;
switch (ir->opcode) {
case rv_insn_lui:
next_ir = ir + 1;
next_ir = ir->next;
switch (next_ir->opcode) {
case rv_insn_add:
if (ir->rd == next_ir->rs2 || ir->rd == next_ir->rs1) {
Expand All @@ -940,7 +971,7 @@ static void match_pattern(block_t *block)
count++;
if (next_ir->tailcall)
break;
next_ir++;
next_ir = next_ir->next;
}
ir->imm2 = count;
ir->opcode = rv_insn_fuse1;
Expand Down Expand Up @@ -994,8 +1025,10 @@ static void optimize_constant(riscv_t *rv, block_t *block)
constopt_info_t constopt_info = {0};
constopt_info.is_constant[0] = true;
assert(rv->X[0] == 0);
for (uint32_t i = 0; i < block->n_insn; i++) {
rv_insn_t *ir = block->ir + i;

uint32_t i;
rv_insn_t *ir;
for (i = 0, ir = block->first_ir; i < block->n_insn; i++, ir = ir->next) {
((constopt_func_t) constopt_table[ir->opcode])(ir, &constopt_info);
}
}
Expand All @@ -1014,10 +1047,7 @@ static block_t *block_find_or_translate(riscv_t *rv)
}

/* allocate a new block */
next = block_alloc(10);

/* translate the basic block */
block_translate(rv, next);
next = block_alloc(rv, map);

if (!libc_substitute(rv, next)) {
optimize_constant(rv, next);
Expand Down Expand Up @@ -1075,27 +1105,27 @@ void rv_step(riscv_t *rv, int32_t cycles)
if (prev->pc_start != last_pc)
prev = block_find(&rv->block_map, last_pc);

rv_insn_t *last_ir = prev->ir + prev->n_insn - 1;
rv_insn_t *last_ir = prev->last_ir;
/* chain block */
if (!insn_is_unconditional_branch(last_ir->opcode)) {
if (branch_taken && !last_ir->branch_taken)
last_ir->branch_taken = block->ir;
last_ir->branch_taken = block->first_ir;
else if (!last_ir->branch_untaken)
last_ir->branch_untaken = block->ir;
last_ir->branch_untaken = block->first_ir;
} else if (last_ir->opcode == rv_insn_jal
#if RV32_HAS(EXT_C)
|| last_ir->opcode == rv_insn_cj ||
last_ir->opcode == rv_insn_cjal
#endif
) {
if (!last_ir->branch_taken)
last_ir->branch_taken = block->ir;
last_ir->branch_taken = block->first_ir;
}
}
last_pc = rv->PC;

/* execute the block */
const rv_insn_t *ir = block->ir;
const rv_insn_t *ir = block->first_ir;
if (unlikely(!ir->impl(rv, ir)))
break;

Expand Down
Loading

1 comment on commit 90fa28f

@jserv
Copy link
Contributor

@jserv jserv commented on 90fa28f Sep 30, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Benchmarks

Benchmark suite Current: 90fa28f Previous: 3affb02 Ratio
Dhrystone 1256 Average DMIPS over 10 runs 1132 Average DMIPS over 10 runs 0.90
Coremark 1202.143 Average iterations/sec over 10 runs 1082.705 Average iterations/sec over 10 runs 0.90

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.