From 00312241db570add3b9948a023e7ebbe2041919f Mon Sep 17 00:00:00 2001 From: RinHizakura Date: Mon, 2 Oct 2023 19:50:34 +0800 Subject: [PATCH] Reduce memory usage for instruction block (#232) The original memory allocation strategy for instruction blocks was inefficient and led to excessive memory usage: a fixed amount of memory was allocated for every block, and most of it was wasted. To address this issue, we have implemented a more efficient allocation scheme. Instead of allocating a fixed-size instruction array for each block, we now maintain a pool of rv_insn_t objects and take an instruction from the pool only when one is needed. This approach reduces both the number of heap allocations and the overall memory usage. We have introduced a parameter, BLOCK_POOL_SIZE, which controls the trade-off between the number of calloc calls and memory consumption; a new heap allocation occurs only when the pool is depleted. As a result of these changes, heap memory usage has been reduced significantly. For example, when running puzzle.elf, we observed a reduction in heap memory allocation from 20,306,989 bytes to just 313,461 bytes. With this design, consecutive instructions of a block are no longer guaranteed to be contiguous in memory, so they are chained as a linked list instead of being indexed as an array. The impact on random access is minimal, as random access is primarily required for certain fuse operations; where it is needed, we can employ a linear search over the list. The potential cache locality issues resulting from the non-contiguous layout can also be mitigated by adjusting the BLOCK_POOL_SIZE parameter for better performance. 
--- src/decode.h | 2 + src/emulate.c | 128 +++++++++++++++++++++++++++----------------- src/riscv.c | 34 +++++++++--- src/riscv_private.h | 6 ++- src/rv32_template.c | 6 +-- 5 files changed, 116 insertions(+), 60 deletions(-) diff --git a/src/decode.h b/src/decode.h index 40be2940..e5036a93 100644 --- a/src/decode.h +++ b/src/decode.h @@ -298,6 +298,8 @@ typedef struct rv_insn { * specific IR array without the need for additional copying. */ struct rv_insn *branch_taken, *branch_untaken; + + struct rv_insn *next; } rv_insn_t; /* decode the RISC-V instruction */ diff --git a/src/emulate.c b/src/emulate.c index 28cf7cc1..2573e820 100644 --- a/src/emulate.c +++ b/src/emulate.c @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -30,6 +31,7 @@ extern struct target_ops gdbstub_ops; #endif #include "decode.h" +#include "mpool.h" #include "riscv.h" #include "riscv_private.h" #include "state.h" @@ -277,16 +279,17 @@ static inline uint32_t hash(size_t k) return k; } +static void block_translate(riscv_t *rv, block_map_t *map, block_t *block); /* allocate a basic block */ -static block_t *block_alloc(const uint8_t bits) +static block_t *block_alloc(riscv_t *rv, block_map_t *map) { - block_t *block = malloc(sizeof(struct block)); + block_t *block = mpool_alloc(map->block_mp); assert(block); - block->insn_capacity = 1 << bits; block->n_insn = 0; block->predict = NULL; - block->ir = malloc(block->insn_capacity * sizeof(rv_insn_t)); - assert(block->ir); + + /* Initialize remaining part of block_t */ + block_translate(rv, map, block); return block; } @@ -366,7 +369,7 @@ static uint32_t last_pc = 0; rv->PC += ir->insn_len; \ if (unlikely(RVOP_NO_NEXT(ir))) \ return true; \ - const rv_insn_t *next = ir + 1; \ + const rv_insn_t *next = ir->next; \ MUST_TAIL return next->impl(rv, next); \ } @@ -395,23 +398,34 @@ enum { #undef _ }; +/* FIXME: This will simply find the n-th instruction by iterating + * the linked list linearly, we may want to find better approach. 
*/ +FORCE_INLINE rv_insn_t *next_nth_insn(rv_insn_t *ir, int32_t n) +{ + rv_insn_t *tmp = ir; + for (int32_t iter = 0; iter < n; iter++) + tmp = tmp->next; + return tmp; +} + /* multiple lui */ -static bool do_fuse1(riscv_t *rv, const rv_insn_t *ir) +static bool do_fuse1(riscv_t *rv, rv_insn_t *ir) { rv->csr_cycle += ir->imm2; - for (int i = 0; i < ir->imm2; i++) { - const rv_insn_t *cur_ir = ir + i; + int i; + rv_insn_t *cur_ir; + for (i = 0, cur_ir = ir; i < ir->imm2; i++, cur_ir = cur_ir->next) { rv->X[cur_ir->rd] = cur_ir->imm; } rv->PC += ir->imm2 * ir->insn_len; if (unlikely(RVOP_NO_NEXT(ir))) return true; - const rv_insn_t *next = ir + ir->imm2; + const rv_insn_t *next = next_nth_insn(ir, ir->imm2); MUST_TAIL return next->impl(rv, next); } /* LUI + ADD */ -static bool do_fuse2(riscv_t *rv, const rv_insn_t *ir) +static bool do_fuse2(riscv_t *rv, rv_insn_t *ir) { rv->csr_cycle += 2; rv->X[ir->rd] = ir->imm; @@ -419,12 +433,12 @@ static bool do_fuse2(riscv_t *rv, const rv_insn_t *ir) rv->PC += 2 * ir->insn_len; if (unlikely(RVOP_NO_NEXT(ir))) return true; - const rv_insn_t *next = ir + 2; + const rv_insn_t *next = next_nth_insn(ir, 2); MUST_TAIL return next->impl(rv, next); } /* multiple SW */ -static bool do_fuse3(riscv_t *rv, const rv_insn_t *ir) +static bool do_fuse3(riscv_t *rv, rv_insn_t *ir) { rv->csr_cycle += ir->imm2; opcode_fuse_t *fuse = ir->fuse; @@ -442,12 +456,12 @@ static bool do_fuse3(riscv_t *rv, const rv_insn_t *ir) rv->PC += ir->imm2 * ir->insn_len; if (unlikely(RVOP_NO_NEXT(ir))) return true; - const rv_insn_t *next = ir + ir->imm2; + const rv_insn_t *next = next_nth_insn(ir, ir->imm2); MUST_TAIL return next->impl(rv, next); } /* multiple LW */ -static bool do_fuse4(riscv_t *rv, const rv_insn_t *ir) +static bool do_fuse4(riscv_t *rv, rv_insn_t *ir) { rv->csr_cycle += ir->imm2; opcode_fuse_t *fuse = ir->fuse; @@ -465,7 +479,7 @@ static bool do_fuse4(riscv_t *rv, const rv_insn_t *ir) rv->PC += ir->imm2 * ir->insn_len; if 
(unlikely(RVOP_NO_NEXT(ir))) return true; - const rv_insn_t *next = ir + ir->imm2; + const rv_insn_t *next = next_nth_insn(ir, ir->imm2); MUST_TAIL return next->impl(rv, next); } @@ -479,7 +493,7 @@ static bool do_fuse5(riscv_t *rv, const rv_insn_t *ir) rv->PC = rv->X[rv_reg_ra] & ~1U; if (unlikely(RVOP_NO_NEXT(ir))) return true; - const rv_insn_t *next = ir + 1; + const rv_insn_t *next = ir->next; MUST_TAIL return next->impl(rv, next); } @@ -493,7 +507,7 @@ static bool do_fuse6(riscv_t *rv, const rv_insn_t *ir) rv->PC = rv->X[rv_reg_ra] & ~1U; if (unlikely(RVOP_NO_NEXT(ir))) return true; - const rv_insn_t *next = ir + 1; + const rv_insn_t *next = ir->next; MUST_TAIL return next->impl(rv, next); } @@ -541,15 +555,21 @@ FORCE_INLINE bool insn_is_unconditional_branch(uint8_t opcode) return false; } -static void block_translate(riscv_t *rv, block_t *block) +static void block_translate(riscv_t *rv, block_map_t *map, block_t *block) { block->pc_start = block->pc_end = rv->PC; + rv_insn_t *prev_ir = NULL; + rv_insn_t *ir = mpool_alloc(map->block_ir_mp); + block->ir_head = ir; + /* translate the basic block */ - while (block->n_insn < block->insn_capacity) { - rv_insn_t *ir = block->ir + block->n_insn; + while (true) { memset(ir, 0, sizeof(rv_insn_t)); + if (prev_ir) + prev_ir->next = ir; + /* fetch the next instruction */ const uint32_t insn = rv->io.mem_ifetch(block->pc_end); @@ -564,21 +584,29 @@ static void block_translate(riscv_t *rv, block_t *block) /* compute the end of pc */ block->pc_end += ir->insn_len; block->n_insn++; + prev_ir = ir; /* stop on branch */ if (insn_is_branch(ir->opcode)) break; + + ir = mpool_alloc(map->block_ir_mp); } - block->ir[block->n_insn - 1].tailcall = true; + + assert(prev_ir); + block->ir_tail = prev_ir; + block->ir_tail->tailcall = true; } #define COMBINE_MEM_OPS(RW) \ count = 1; \ - next_ir = ir + 1; \ + next_ir = ir->next; \ + tmp_ir = next_ir; \ if (next_ir->opcode != IIF(RW)(rv_insn_lw, rv_insn_sw)) \ break; \ sign = (ir->imm - 
next_ir->imm) >> 31 ? -1 : 1; \ - for (uint32_t j = 1; j < block->n_insn - 1 - i; j++) { \ - next_ir = ir + j; \ + next_ir = tmp_ir; \ + for (uint32_t j = 1; j < block->n_insn - 1 - i; \ + j++, next_ir = next_ir->next) { \ if (next_ir->opcode != IIF(RW)(rv_insn_lw, rv_insn_sw) || \ ir->rs1 != next_ir->rs1 || ir->imm - next_ir->imm != 4 * sign) \ break; \ @@ -590,8 +618,8 @@ static void block_translate(riscv_t *rv, block_t *block) ir->imm2 = count; \ memcpy(ir->fuse, ir, sizeof(opcode_fuse_t)); \ ir->impl = dispatch_table[ir->opcode]; \ - for (int j = 1; j < count; j++) { \ - next_ir = ir + j; \ + next_ir = tmp_ir; \ + for (int j = 1; j < count; j++, next_ir = next_ir->next) { \ memcpy(ir->fuse + j, next_ir, sizeof(opcode_fuse_t)); \ } \ ir->tailcall = next_ir->tailcall; \ @@ -825,7 +853,7 @@ static bool detect_memcpy(riscv_t *rv, int lib) static bool libc_substitute(riscv_t *rv, block_t *block) { - rv_insn_t *ir = block->ir, *next_ir = NULL; + rv_insn_t *ir = block->ir_head, *next_ir = NULL; switch (ir->opcode) { case rv_insn_addi: /* Compare the target block with the first basic block of @@ -835,10 +863,10 @@ static bool libc_substitute(riscv_t *rv, block_t *block) * instruction sequence. 
*/ if (ir->imm == 15 && ir->rd == rv_reg_t1 && ir->rs1 == rv_reg_zero) { - next_ir = ir + 1; + next_ir = ir->next; if (next_ir->opcode == rv_insn_addi && next_ir->rd == rv_reg_a4 && next_ir->rs1 == rv_reg_a0 && next_ir->rs2 == rv_reg_zero) { - next_ir = next_ir + 1; + next_ir = next_ir->next; if (next_ir->opcode == rv_insn_bgeu && next_ir->imm == 60 && next_ir->rs1 == rv_reg_t1 && next_ir->rs2 == rv_reg_a2) { if (detect_memset(rv, 1)) { @@ -851,7 +879,7 @@ static bool libc_substitute(riscv_t *rv, block_t *block) } } else if (ir->imm == 0 && ir->rd == rv_reg_t1 && ir->rs1 == rv_reg_a0) { - next_ir = ir + 1; + next_ir = ir->next; if (next_ir->opcode == rv_insn_beq && next_ir->rs1 == rv_reg_a2 && next_ir->rs2 == rv_reg_zero) { if (next_ir->imm == 20 && detect_memset(rv, 2)) { @@ -876,14 +904,14 @@ static bool libc_substitute(riscv_t *rv, block_t *block) */ if (ir->rd == rv_reg_a5 && ir->rs1 == rv_reg_a0 && ir->rs2 == rv_reg_a1) { - next_ir = ir + 1; + next_ir = ir->next; if (next_ir->opcode == rv_insn_andi && next_ir->imm == 3 && next_ir->rd == rv_reg_a5 && next_ir->rs1 == rv_reg_a5) { - next_ir = next_ir + 1; + next_ir = next_ir->next; if (next_ir->opcode == rv_insn_add && next_ir->rd == rv_reg_a7 && next_ir->rs1 == rv_reg_a0 && next_ir->rs2 == rv_reg_a2) { - next_ir = next_ir + 1; + next_ir = next_ir->next; if (next_ir->opcode == rv_insn_bne && next_ir->imm == 104 && next_ir->rs1 == rv_reg_a5 && next_ir->rs2 == rv_reg_zero) { @@ -912,12 +940,15 @@ static bool libc_substitute(riscv_t *rv, block_t *block) */ static void match_pattern(block_t *block) { - for (uint32_t i = 0; i < block->n_insn - 1; i++) { - rv_insn_t *ir = block->ir + i, *next_ir = NULL; + uint32_t i; + rv_insn_t *ir; + for (i = 0, ir = block->ir_head; i < block->n_insn - 1; + i++, ir = ir->next) { + rv_insn_t *next_ir = NULL, *tmp_ir = NULL; int32_t count = 0, sign = 1; switch (ir->opcode) { case rv_insn_lui: - next_ir = ir + 1; + next_ir = ir->next; switch (next_ir->opcode) { case rv_insn_add: if 
(ir->rd == next_ir->rs2 || ir->rd == next_ir->rs1) { @@ -940,7 +971,7 @@ static void match_pattern(block_t *block) count++; if (next_ir->tailcall) break; - next_ir++; + next_ir = next_ir->next; } ir->imm2 = count; ir->opcode = rv_insn_fuse1; @@ -994,8 +1025,10 @@ static void optimize_constant(riscv_t *rv, block_t *block) constopt_info_t constopt_info = {0}; constopt_info.is_constant[0] = true; assert(rv->X[0] == 0); - for (uint32_t i = 0; i < block->n_insn; i++) { - rv_insn_t *ir = block->ir + i; + + uint32_t i; + rv_insn_t *ir; + for (i = 0, ir = block->ir_head; i < block->n_insn; i++, ir = ir->next) { ((constopt_func_t) constopt_table[ir->opcode])(ir, &constopt_info); } } @@ -1014,10 +1047,7 @@ static block_t *block_find_or_translate(riscv_t *rv) } /* allocate a new block */ - next = block_alloc(10); - - /* translate the basic block */ - block_translate(rv, next); + next = block_alloc(rv, map); if (!libc_substitute(rv, next)) { optimize_constant(rv, next); @@ -1075,13 +1105,13 @@ void rv_step(riscv_t *rv, int32_t cycles) if (prev->pc_start != last_pc) prev = block_find(&rv->block_map, last_pc); - rv_insn_t *last_ir = prev->ir + prev->n_insn - 1; + rv_insn_t *last_ir = prev->ir_tail; /* chain block */ if (!insn_is_unconditional_branch(last_ir->opcode)) { if (branch_taken && !last_ir->branch_taken) - last_ir->branch_taken = block->ir; + last_ir->branch_taken = block->ir_head; else if (!last_ir->branch_untaken) - last_ir->branch_untaken = block->ir; + last_ir->branch_untaken = block->ir_head; } else if (last_ir->opcode == rv_insn_jal #if RV32_HAS(EXT_C) || last_ir->opcode == rv_insn_cj || @@ -1089,13 +1119,13 @@ void rv_step(riscv_t *rv, int32_t cycles) #endif ) { if (!last_ir->branch_taken) - last_ir->branch_taken = block->ir; + last_ir->branch_taken = block->ir_head; } } last_pc = rv->PC; /* execute the block */ - const rv_insn_t *ir = block->ir; + const rv_insn_t *ir = block->ir_head; if (unlikely(!ir->impl(rv, ir))) break; diff --git a/src/riscv.c b/src/riscv.c 
index d5822d5e..de361934 100644 --- a/src/riscv.c +++ b/src/riscv.c @@ -7,15 +7,24 @@ #include #include +#include "mpool.h" #include "riscv_private.h" #include "state.h" +#define BLOCK_MAP_CAPACITY_BITS 10 +#define BLOCK_IR_MAP_CAPACITY_BITS 10 + /* initialize the block map */ static void block_map_init(block_map_t *map, const uint8_t bits) { map->block_capacity = 1 << bits; map->size = 0; map->map = calloc(map->block_capacity, sizeof(struct block *)); + + map->block_mp = mpool_create(sizeof(block_t) << BLOCK_MAP_CAPACITY_BITS, + sizeof(block_t)); + map->block_ir_mp = mpool_create( + sizeof(rv_insn_t) << BLOCK_IR_MAP_CAPACITY_BITS, sizeof(rv_insn_t)); } /* clear all block in the block map */ @@ -26,15 +35,29 @@ void block_map_clear(block_map_t *map) block_t *block = map->map[i]; if (!block) continue; - for (uint32_t i = 0; i < block->n_insn; i++) - free(block->ir[i].fuse); - free(block->ir); - free(block); + uint32_t idx; + rv_insn_t *ir, *next; + for (idx = 0, ir = block->ir_head; idx < block->n_insn; + idx++, ir = next) { + free(ir->fuse); + next = ir->next; + mpool_free(map->block_ir_mp, ir); + } + mpool_free(map->block_mp, block); map->map[i] = NULL; } map->size = 0; } +static void block_map_destroy(block_map_t *map) +{ + block_map_clear(map); + free(map->map); + + mpool_destroy(map->block_mp); + mpool_destroy(map->block_ir_mp); +} + riscv_user_t rv_userdata(riscv_t *rv) { assert(rv); @@ -122,8 +145,7 @@ bool rv_enables_to_output_exit_code(riscv_t *rv) void rv_delete(riscv_t *rv) { assert(rv); - block_map_clear(&rv->block_map); - free(rv->block_map.map); + block_map_destroy(&rv->block_map); free(rv); } diff --git a/src/riscv_private.h b/src/riscv_private.h index 1b8437ee..149986c3 100644 --- a/src/riscv_private.h +++ b/src/riscv_private.h @@ -56,15 +56,17 @@ enum { typedef struct block { uint32_t n_insn; /**< number of instructions encompased */ uint32_t pc_start, pc_end; /**< address range of the basic block */ - uint32_t insn_capacity; /**< maximum of 
instructions encompased */ struct block *predict; /**< block prediction */ - rv_insn_t *ir; /**< IR as memory blocks */ + + rv_insn_t *ir_head, *ir_tail; /**< the first and last ir for this block */ } block_t; typedef struct { uint32_t block_capacity; /**< max number of entries in the block map */ uint32_t size; /**< number of entries currently in the map */ block_t **map; /**< block map */ + + struct mpool *block_mp, *block_ir_mp; } block_map_t; /* clear all block in the block map */ diff --git a/src/rv32_template.c b/src/rv32_template.c index f9e86932..4432417b 100644 --- a/src/rv32_template.c +++ b/src/rv32_template.c @@ -53,7 +53,7 @@ RVOP(jalr, { RV_EXC_MISALIGN_HANDLER(pc, insn, false, 0); block_t *block = block_find(&rv->block_map, rv->PC); if (block) - return block->ir->impl(rv, block->ir); + return block->ir_head->impl(rv, block->ir_head); return true; }) @@ -924,7 +924,7 @@ RVOP(cjr, { rv->PC = rv->X[ir->rs1]; block_t *block = block_find(&rv->block_map, rv->PC); if (block) - return block->ir->impl(rv, block->ir); + return block->ir_head->impl(rv, block->ir_head); return true; }) @@ -947,7 +947,7 @@ RVOP(cjalr, { RV_EXC_MISALIGN_HANDLER(rv->PC, insn, true, 0); block_t *block = block_find(&rv->block_map, rv->PC); if (block) - return block->ir->impl(rv, block->ir); + return block->ir_head->impl(rv, block->ir_head); return true; })