From 54f986f6cd9797b127f7eb4ebc23b656e567317a Mon Sep 17 00:00:00 2001 From: RinHizakura Date: Fri, 22 Sep 2023 19:01:57 +0800 Subject: [PATCH] Reduce memory usage for instruction block --- src/decode.h | 2 + src/emulate.c | 127 +++++++++++++++++++++++++++----------------- src/riscv.c | 55 +++++++++++++++++-- src/riscv_private.h | 12 ++++- src/rv32_template.c | 6 +-- 5 files changed, 143 insertions(+), 59 deletions(-) diff --git a/src/decode.h b/src/decode.h index 40be29406..e5036a939 100644 --- a/src/decode.h +++ b/src/decode.h @@ -298,6 +298,8 @@ typedef struct rv_insn { * specific IR array without the need for additional copying. */ struct rv_insn *branch_taken, *branch_untaken; + + struct rv_insn *next; } rv_insn_t; /* decode the RISC-V instruction */ diff --git a/src/emulate.c b/src/emulate.c index cd1a0bd25..4906d9e53 100644 --- a/src/emulate.c +++ b/src/emulate.c @@ -277,16 +277,18 @@ static inline uint32_t hash(size_t k) return k; } +static void block_translate(riscv_t *rv, block_map_t *map, block_t *block); /* allocate a basic block */ -static block_t *block_alloc(const uint8_t bits) +static block_t *block_alloc(riscv_t *rv, block_map_t *map) { block_t *block = malloc(sizeof(struct block)); assert(block); - block->insn_capacity = 1 << bits; block->n_insn = 0; block->predict = NULL; - block->ir = malloc(block->insn_capacity * sizeof(rv_insn_t)); - assert(block->ir); + + /* Translate the basic block, block->first_ir and block->last_ir will + * be initialized there. */ + block_translate(rv, map, block); return block; } @@ -366,7 +368,7 @@ static uint32_t last_pc = 0; rv->PC += ir->insn_len; \ if (unlikely(RVOP_NO_NEXT(ir))) \ return true; \ - const rv_insn_t *next = ir + 1; \ + const rv_insn_t *next = ir->next; \ MUST_TAIL return next->impl(rv, next); \ } @@ -395,23 +397,34 @@ enum { #undef _ }; +/* FIXME: This will simply find the n-th instruction by iterating + * the linked list linearly; we may want to find a better approach. 
*/ +#define GET_NEXT_N_INSN(ir, n) \ + ({ \ + rv_insn_t *__tmp = ir; \ + for (typeof(n) iter = 0; iter < n; iter++) \ + __tmp = __tmp->next; \ + __tmp; \ + }) + /* multiple lui */ -static bool do_fuse1(riscv_t *rv, const rv_insn_t *ir) +static bool do_fuse1(riscv_t *rv, rv_insn_t *ir) { rv->csr_cycle += ir->imm2; - for (int i = 0; i < ir->imm2; i++) { - const rv_insn_t *cur_ir = ir + i; + int i; + rv_insn_t *cur_ir; + for (i = 0, cur_ir = ir; i < ir->imm2; i++, cur_ir = cur_ir->next) { rv->X[cur_ir->rd] = cur_ir->imm; } rv->PC += ir->imm2 * ir->insn_len; if (unlikely(RVOP_NO_NEXT(ir))) return true; - const rv_insn_t *next = ir + ir->imm2; + const rv_insn_t *next = GET_NEXT_N_INSN(ir, ir->imm2); MUST_TAIL return next->impl(rv, next); } /* LUI + ADD */ -static bool do_fuse2(riscv_t *rv, const rv_insn_t *ir) +static bool do_fuse2(riscv_t *rv, rv_insn_t *ir) { rv->csr_cycle += 2; rv->X[ir->rd] = ir->imm; @@ -419,12 +432,12 @@ static bool do_fuse2(riscv_t *rv, const rv_insn_t *ir) rv->PC += 2 * ir->insn_len; if (unlikely(RVOP_NO_NEXT(ir))) return true; - const rv_insn_t *next = ir + 2; + const rv_insn_t *next = GET_NEXT_N_INSN(ir, 2); MUST_TAIL return next->impl(rv, next); } /* multiple SW */ -static bool do_fuse3(riscv_t *rv, const rv_insn_t *ir) +static bool do_fuse3(riscv_t *rv, rv_insn_t *ir) { rv->csr_cycle += ir->imm2; opcode_fuse_t *fuse = ir->fuse; @@ -442,12 +455,12 @@ static bool do_fuse3(riscv_t *rv, const rv_insn_t *ir) rv->PC += ir->imm2 * ir->insn_len; if (unlikely(RVOP_NO_NEXT(ir))) return true; - const rv_insn_t *next = ir + ir->imm2; + const rv_insn_t *next = GET_NEXT_N_INSN(ir, ir->imm2); MUST_TAIL return next->impl(rv, next); } /* multiple LW */ -static bool do_fuse4(riscv_t *rv, const rv_insn_t *ir) +static bool do_fuse4(riscv_t *rv, rv_insn_t *ir) { rv->csr_cycle += ir->imm2; opcode_fuse_t *fuse = ir->fuse; @@ -465,7 +478,7 @@ static bool do_fuse4(riscv_t *rv, const rv_insn_t *ir) rv->PC += ir->imm2 * ir->insn_len; if (unlikely(RVOP_NO_NEXT(ir))) 
return true; - const rv_insn_t *next = ir + ir->imm2; + const rv_insn_t *next = GET_NEXT_N_INSN(ir, ir->imm2); MUST_TAIL return next->impl(rv, next); } @@ -479,7 +492,7 @@ static bool do_fuse5(riscv_t *rv, const rv_insn_t *ir) rv->PC = rv->X[rv_reg_ra] & ~1U; if (unlikely(RVOP_NO_NEXT(ir))) return true; - const rv_insn_t *next = ir + 1; + const rv_insn_t *next = ir->next; MUST_TAIL return next->impl(rv, next); } @@ -493,7 +506,7 @@ static bool do_fuse6(riscv_t *rv, const rv_insn_t *ir) rv->PC = rv->X[rv_reg_ra] & ~1U; if (unlikely(RVOP_NO_NEXT(ir))) return true; - const rv_insn_t *next = ir + 1; + const rv_insn_t *next = ir->next; MUST_TAIL return next->impl(rv, next); } @@ -541,15 +554,21 @@ static inline bool insn_is_unconditional_branch(uint8_t opcode) return false; } -static void block_translate(riscv_t *rv, block_t *block) +static void block_translate(riscv_t *rv, block_map_t *map, block_t *block) { block->pc_start = block->pc_end = rv->PC; + rv_insn_t *prev_ir = NULL; + rv_insn_t *ir = block_map_next_pool_ir(map); + block->first_ir = ir; + /* translate the basic block */ - while (block->n_insn < block->insn_capacity) { - rv_insn_t *ir = block->ir + block->n_insn; + while (true) { memset(ir, 0, sizeof(rv_insn_t)); + if (prev_ir) + prev_ir->next = ir; + /* fetch the next instruction */ const uint32_t insn = rv->io.mem_ifetch(block->pc_end); @@ -564,21 +583,29 @@ static void block_translate(riscv_t *rv, block_t *block) /* compute the end of pc */ block->pc_end += ir->insn_len; block->n_insn++; + prev_ir = ir; /* stop on branch */ if (insn_is_branch(ir->opcode)) break; + + ir = block_map_next_pool_ir(map); } - block->ir[block->n_insn - 1].tailcall = true; + + assert(prev_ir); + block->last_ir = prev_ir; + block->last_ir->tailcall = true; } #define COMBINE_MEM_OPS(RW) \ count = 1; \ - next_ir = ir + 1; \ + next_ir = ir->next; \ + tmp_ir = next_ir; \ if (next_ir->opcode != IIF(RW)(rv_insn_lw, rv_insn_sw)) \ break; \ sign = (ir->imm - next_ir->imm) >> 31 ? 
-1 : 1; \ - for (uint32_t j = 1; j < block->n_insn - 1 - i; j++) { \ - next_ir = ir + j; \ + next_ir = tmp_ir; \ + for (uint32_t j = 1; j < block->n_insn - 1 - i; \ + j++, next_ir = next_ir->next) { \ if (next_ir->opcode != IIF(RW)(rv_insn_lw, rv_insn_sw) || \ ir->rs1 != next_ir->rs1 || ir->imm - next_ir->imm != 4 * sign) \ break; \ @@ -590,8 +617,8 @@ static void block_translate(riscv_t *rv, block_t *block) ir->imm2 = count; \ memcpy(ir->fuse, ir, sizeof(opcode_fuse_t)); \ ir->impl = dispatch_table[ir->opcode]; \ - for (int j = 1; j < count; j++) { \ - next_ir = ir + j; \ + next_ir = tmp_ir; \ + for (int j = 1; j < count; j++, next_ir = next_ir->next) { \ memcpy(ir->fuse + j, next_ir, sizeof(opcode_fuse_t)); \ } \ ir->tailcall = next_ir->tailcall; \ @@ -825,7 +852,7 @@ static bool detect_memcpy(riscv_t *rv, int lib) static bool libc_substitute(riscv_t *rv, block_t *block) { - rv_insn_t *ir = block->ir, *next_ir = NULL; + rv_insn_t *ir = block->first_ir, *next_ir = NULL; switch (ir->opcode) { case rv_insn_addi: /* Compare the target block with the first basic block of @@ -835,10 +862,10 @@ static bool libc_substitute(riscv_t *rv, block_t *block) * instruction sequence. 
*/ if (ir->imm == 15 && ir->rd == rv_reg_t1 && ir->rs1 == rv_reg_zero) { - next_ir = ir + 1; + next_ir = ir->next; if (next_ir->opcode == rv_insn_addi && next_ir->rd == rv_reg_a4 && next_ir->rs1 == rv_reg_a0 && next_ir->rs2 == rv_reg_zero) { - next_ir = next_ir + 1; + next_ir = next_ir->next; if (next_ir->opcode == rv_insn_bgeu && next_ir->imm == 60 && next_ir->rs1 == rv_reg_t1 && next_ir->rs2 == rv_reg_a2) { if (detect_memset(rv, 1)) { @@ -851,7 +878,7 @@ static bool libc_substitute(riscv_t *rv, block_t *block) } } else if (ir->imm == 0 && ir->rd == rv_reg_t1 && ir->rs1 == rv_reg_a0) { - next_ir = ir + 1; + next_ir = ir->next; if (next_ir->opcode == rv_insn_beq && next_ir->rs1 == rv_reg_a2 && next_ir->rs2 == rv_reg_zero) { if (next_ir->imm == 20 && detect_memset(rv, 2)) { @@ -876,14 +903,14 @@ static bool libc_substitute(riscv_t *rv, block_t *block) */ if (ir->rd == rv_reg_a5 && ir->rs1 == rv_reg_a0 && ir->rs2 == rv_reg_a1) { - next_ir = ir + 1; + next_ir = ir->next; if (next_ir->opcode == rv_insn_andi && next_ir->imm == 3 && next_ir->rd == rv_reg_a5 && next_ir->rs1 == rv_reg_a5) { - next_ir = next_ir + 1; + next_ir = next_ir->next; if (next_ir->opcode == rv_insn_add && next_ir->rd == rv_reg_a7 && next_ir->rs1 == rv_reg_a0 && next_ir->rs2 == rv_reg_a2) { - next_ir = next_ir + 1; + next_ir = next_ir->next; if (next_ir->opcode == rv_insn_bne && next_ir->imm == 104 && next_ir->rs1 == rv_reg_a5 && next_ir->rs2 == rv_reg_zero) { @@ -912,12 +939,15 @@ static bool libc_substitute(riscv_t *rv, block_t *block) */ static void match_pattern(block_t *block) { - for (uint32_t i = 0; i < block->n_insn - 1; i++) { - rv_insn_t *ir = block->ir + i, *next_ir = NULL; + uint32_t i; + rv_insn_t *ir; + for (i = 0, ir = block->first_ir; i < block->n_insn - 1; + i++, ir = ir->next) { + rv_insn_t *next_ir = NULL, *tmp_ir = NULL; int32_t count = 0, sign = 1; switch (ir->opcode) { case rv_insn_lui: - next_ir = ir + 1; + next_ir = ir->next; if (next_ir->opcode == rv_insn_add && ir->rd == 
next_ir->rs2) { /* The destination register of the LUI instruction is the * same as the source register 2 of the next instruction ADD. @@ -937,13 +967,13 @@ static void match_pattern(block_t *block) ir->impl = dispatch_table[ir->opcode]; } else { count = 1; - next_ir = ir + 1; + next_ir = ir->next; while (1) { if (next_ir->opcode != rv_insn_lui) break; next_ir->opcode = rv_insn_nop; count++; - next_ir += 1; + next_ir = next_ir->next; } if (count > 1) { ir->imm2 = count; @@ -998,8 +1028,10 @@ static void optimize_constant(riscv_t *rv, block_t *block) constopt_info_t constopt_info = {0}; constopt_info.is_constant[0] = true; assert(rv->X[0] == 0); - for (uint32_t i = 0; i < block->n_insn; i++) { - rv_insn_t *ir = block->ir + i; + + uint32_t i; + rv_insn_t *ir; + for (i = 0, ir = block->first_ir; i < block->n_insn; i++, ir = ir->next) { ((constopt_func_t) constopt_table[ir->opcode])(ir, &constopt_info); } } @@ -1018,10 +1050,7 @@ static block_t *block_find_or_translate(riscv_t *rv) } /* allocate a new block */ - next = block_alloc(10); - - /* translate the basic block */ - block_translate(rv, next); + next = block_alloc(rv, map); if (!libc_substitute(rv, next)) { optimize_constant(rv, next); @@ -1079,13 +1108,13 @@ void rv_step(riscv_t *rv, int32_t cycles) if (prev->pc_start != last_pc) prev = block_find(&rv->block_map, last_pc); - rv_insn_t *last_ir = prev->ir + prev->n_insn - 1; + rv_insn_t *last_ir = prev->last_ir; /* chain block */ if (!insn_is_unconditional_branch(last_ir->opcode)) { if (branch_taken && !last_ir->branch_taken) - last_ir->branch_taken = block->ir; + last_ir->branch_taken = block->first_ir; else if (!last_ir->branch_untaken) - last_ir->branch_untaken = block->ir; + last_ir->branch_untaken = block->first_ir; } else if (last_ir->opcode == rv_insn_jal #if RV32_HAS(EXT_C) || last_ir->opcode == rv_insn_cj || @@ -1093,13 +1122,13 @@ void rv_step(riscv_t *rv, int32_t cycles) #endif ) { if (!last_ir->branch_taken) - last_ir->branch_taken = block->ir; + 
last_ir->branch_taken = block->first_ir; } } last_pc = rv->PC; /* execute the block */ - const rv_insn_t *ir = block->ir; + const rv_insn_t *ir = block->first_ir; if (unlikely(!ir->impl(rv, ir))) break; diff --git a/src/riscv.c b/src/riscv.c index d5822d5e9..a9969266a 100644 --- a/src/riscv.c +++ b/src/riscv.c @@ -10,12 +10,44 @@ #include "riscv_private.h" #include "state.h" +#define BLOCK_POOL_SIZE (1 << 10) + /* initialize the block map */ static void block_map_init(block_map_t *map, const uint8_t bits) { map->block_capacity = 1 << bits; map->size = 0; map->map = calloc(map->block_capacity, sizeof(struct block *)); + + map->cur_insn_idx = 0; + map->cur_pool_idx = 0; + map->pool_cap = 1 << 2; + map->ir_pool = malloc(map->pool_cap * sizeof(rv_insn_t *)); + assert(map->ir_pool); + map->ir_pool[map->cur_pool_idx] = + calloc(BLOCK_POOL_SIZE, sizeof(rv_insn_t)); + assert(map->ir_pool[map->cur_pool_idx]); +} + +rv_insn_t *block_map_next_pool_ir(block_map_t *map) +{ + if (map->cur_insn_idx >= BLOCK_POOL_SIZE) { + map->cur_pool_idx++; + if (map->cur_pool_idx >= map->pool_cap) { + map->pool_cap <<= 1; + map->ir_pool = + realloc(map->ir_pool, map->pool_cap * sizeof(rv_insn_t *)); + assert(map->ir_pool); + } + map->cur_insn_idx = 0; + map->ir_pool[map->cur_pool_idx] = + calloc(BLOCK_POOL_SIZE, sizeof(rv_insn_t)); + assert(map->ir_pool[map->cur_pool_idx]); + } + + rv_insn_t *ir = &map->ir_pool[map->cur_pool_idx][map->cur_insn_idx]; + map->cur_insn_idx++; + return ir; } /* clear all block in the block map */ @@ -26,15 +58,29 @@ void block_map_clear(block_map_t *map) block_t *block = map->map[i]; if (!block) continue; - for (uint32_t i = 0; i < block->n_insn; i++) - free(block->ir[i].fuse); - free(block->ir); + uint32_t idx; + rv_insn_t *ir; + for (idx = 0, ir = block->first_ir; idx < block->n_insn; + idx++, ir = ir->next) { + free(ir->fuse); + } free(block); map->map[i] = NULL; } map->size = 0; } +static void block_map_destroy(block_map_t *map) +{ + block_map_clear(map); + 
free(map->map); + + for (uint32_t idx = 0; idx <= map->cur_pool_idx; idx++) { + free(map->ir_pool[idx]); + } + free(map->ir_pool); +} + riscv_user_t rv_userdata(riscv_t *rv) { assert(rv); @@ -122,8 +168,7 @@ bool rv_enables_to_output_exit_code(riscv_t *rv) void rv_delete(riscv_t *rv) { assert(rv); - block_map_clear(&rv->block_map); - free(rv->block_map.map); + block_map_destroy(&rv->block_map); free(rv); } diff --git a/src/riscv_private.h b/src/riscv_private.h index d608b338f..8b76bf450 100644 --- a/src/riscv_private.h +++ b/src/riscv_private.h @@ -56,17 +56,25 @@ enum { typedef struct block { uint32_t n_insn; /**< number of instructions encompased */ uint32_t pc_start, pc_end; /**< address range of the basic block */ - uint32_t insn_capacity; /**< maximum of instructions encompased */ struct block *predict; /**< block prediction */ - rv_insn_t *ir; /**< IR as memory blocks */ + + rv_insn_t *first_ir, *last_ir; /**< the first and last ir for this block */ } block_t; typedef struct { uint32_t block_capacity; /**< max number of entries in the block map */ uint32_t size; /**< number of entries currently in the map */ block_t **map; /**< block map */ + + rv_insn_t * + *ir_pool; /**< instruction pools from which the IR of each block is allocated */ + uint32_t pool_cap; /**< number of pool slots allocated in ir_pool */ + uint32_t cur_pool_idx; /**< index of the pool currently in use */ + uint32_t cur_insn_idx; /**< index of the next unused instruction in the current + pool */ } block_map_t; +rv_insn_t *block_map_next_pool_ir(block_map_t *map); /* clear all block in the block map */ void block_map_clear(block_map_t *map); diff --git a/src/rv32_template.c b/src/rv32_template.c index bcba15f10..3e0c03c5c 100644 --- a/src/rv32_template.c +++ b/src/rv32_template.c @@ -53,7 +53,7 @@ RVOP(jalr, { RV_EXC_MISALIGN_HANDLER(pc, insn, false, 0); block_t *block = block_find(&rv->block_map, rv->PC); if (block) - return block->ir->impl(rv, block->ir); + return block->first_ir->impl(rv, block->first_ir); return 
true; }) @@ -909,7 +909,7 @@ RVOP(cjr, { rv->PC = rv->X[ir->rs1]; block_t *block = block_find(&rv->block_map, rv->PC); if (block) - return block->ir->impl(rv, block->ir); + return block->first_ir->impl(rv, block->first_ir); return true; }) @@ -932,7 +932,7 @@ RVOP(cjalr, { RV_EXC_MISALIGN_HANDLER(rv->PC, insn, true, 0); block_t *block = block_find(&rv->block_map, rv->PC); if (block) - return block->ir->impl(rv, block->ir); + return block->first_ir->impl(rv, block->first_ir); return true; })