Skip to content

Commit 0031224

Browse files
authored Oct 2, 2023
Reduce memory usage for instruction block (#232)
The original memory allocation strategy for instruction blocks was found to be inefficient, leading to excessive memory usage. In the previous approach, a fixed amount of memory was allocated for each block, and much of it was wasted. To address this issue, we have implemented a more efficient memory allocation scheme. Instead of allocating a fixed size for each block, we now maintain a pool of rv_insn_t and allocate memory only when needed. This new approach minimizes heap allocations and optimizes memory usage. We have introduced a parameter, BLOCK_POOL_SIZE, which controls the trade-off between the number of calloc calls and memory consumption, so that heap allocation occurs only when the pool is depleted. As a result of these changes, heap memory allocation has improved significantly. For example, for puzzle.elf, we observed a reduction in heap memory allocation from 20,306,989 bytes to just 313,461 bytes. Although this design means that consecutive instructions may no longer be contiguous in memory, the impact on random access is minimal, because random access is required only by certain fuse operations. Where random access is needed, we can use a linear search. The potential cache-locality issues caused by the non-contiguous memory can also be mitigated by tuning the BLOCK_POOL_SIZE parameter.
1 parent a207574 commit 0031224

File tree

5 files changed

+116
-60
lines changed

5 files changed

+116
-60
lines changed
 

‎src/decode.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -298,6 +298,8 @@ typedef struct rv_insn {
298298
* specific IR array without the need for additional copying.
299299
*/
300300
struct rv_insn *branch_taken, *branch_untaken;
301+
302+
struct rv_insn *next;
301303
} rv_insn_t;
302304

303305
/* decode the RISC-V instruction */

‎src/emulate.c

Lines changed: 79 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55

66
#include <assert.h>
77
#include <stdbool.h>
8+
#include <stdint.h>
89
#include <stdio.h>
910
#include <stdlib.h>
1011
#include <string.h>
@@ -30,6 +31,7 @@ extern struct target_ops gdbstub_ops;
3031
#endif
3132

3233
#include "decode.h"
34+
#include "mpool.h"
3335
#include "riscv.h"
3436
#include "riscv_private.h"
3537
#include "state.h"
@@ -277,16 +279,17 @@ static inline uint32_t hash(size_t k)
277279
return k;
278280
}
279281

282+
static void block_translate(riscv_t *rv, block_map_t *map, block_t *block);
280283
/* allocate a basic block */
281-
static block_t *block_alloc(const uint8_t bits)
284+
static block_t *block_alloc(riscv_t *rv, block_map_t *map)
282285
{
283-
block_t *block = malloc(sizeof(struct block));
286+
block_t *block = mpool_alloc(map->block_mp);
284287
assert(block);
285-
block->insn_capacity = 1 << bits;
286288
block->n_insn = 0;
287289
block->predict = NULL;
288-
block->ir = malloc(block->insn_capacity * sizeof(rv_insn_t));
289-
assert(block->ir);
290+
291+
/* Initialize remaining part of block_t */
292+
block_translate(rv, map, block);
290293
return block;
291294
}
292295

@@ -366,7 +369,7 @@ static uint32_t last_pc = 0;
366369
rv->PC += ir->insn_len; \
367370
if (unlikely(RVOP_NO_NEXT(ir))) \
368371
return true; \
369-
const rv_insn_t *next = ir + 1; \
372+
const rv_insn_t *next = ir->next; \
370373
MUST_TAIL return next->impl(rv, next); \
371374
}
372375

@@ -395,36 +398,47 @@ enum {
395398
#undef _
396399
};
397400

401+
/* FIXME: This simply finds the n-th instruction by iterating
402+
* the linked list linearly; we may want to find a better approach.
*
* Starting from ir, follow the 'next' link n times and return the node
* reached. When n <= 0 the loop body never runs and ir itself is returned.
* No link is checked for NULL, so the caller must ensure that at least n
* successors exist. */
403+
FORCE_INLINE rv_insn_t *next_nth_insn(rv_insn_t *ir, int32_t n)
404+
{
405+
rv_insn_t *tmp = ir;
406+
for (int32_t iter = 0; iter < n; iter++)
407+
tmp = tmp->next;
408+
return tmp;
409+
}
410+
398411
/* multiple lui */
399-
static bool do_fuse1(riscv_t *rv, const rv_insn_t *ir)
412+
static bool do_fuse1(riscv_t *rv, rv_insn_t *ir)
400413
{
401414
rv->csr_cycle += ir->imm2;
402-
for (int i = 0; i < ir->imm2; i++) {
403-
const rv_insn_t *cur_ir = ir + i;
415+
int i;
416+
rv_insn_t *cur_ir;
417+
for (i = 0, cur_ir = ir; i < ir->imm2; i++, cur_ir = cur_ir->next) {
404418
rv->X[cur_ir->rd] = cur_ir->imm;
405419
}
406420
rv->PC += ir->imm2 * ir->insn_len;
407421
if (unlikely(RVOP_NO_NEXT(ir)))
408422
return true;
409-
const rv_insn_t *next = ir + ir->imm2;
423+
const rv_insn_t *next = next_nth_insn(ir, ir->imm2);
410424
MUST_TAIL return next->impl(rv, next);
411425
}
412426

413427
/* LUI + ADD */
414-
static bool do_fuse2(riscv_t *rv, const rv_insn_t *ir)
428+
static bool do_fuse2(riscv_t *rv, rv_insn_t *ir)
415429
{
416430
rv->csr_cycle += 2;
417431
rv->X[ir->rd] = ir->imm;
418432
rv->X[ir->rs2] = rv->X[ir->rd] + rv->X[ir->rs1];
419433
rv->PC += 2 * ir->insn_len;
420434
if (unlikely(RVOP_NO_NEXT(ir)))
421435
return true;
422-
const rv_insn_t *next = ir + 2;
436+
const rv_insn_t *next = next_nth_insn(ir, 2);
423437
MUST_TAIL return next->impl(rv, next);
424438
}
425439

426440
/* multiple SW */
427-
static bool do_fuse3(riscv_t *rv, const rv_insn_t *ir)
441+
static bool do_fuse3(riscv_t *rv, rv_insn_t *ir)
428442
{
429443
rv->csr_cycle += ir->imm2;
430444
opcode_fuse_t *fuse = ir->fuse;
@@ -442,12 +456,12 @@ static bool do_fuse3(riscv_t *rv, const rv_insn_t *ir)
442456
rv->PC += ir->imm2 * ir->insn_len;
443457
if (unlikely(RVOP_NO_NEXT(ir)))
444458
return true;
445-
const rv_insn_t *next = ir + ir->imm2;
459+
const rv_insn_t *next = next_nth_insn(ir, ir->imm2);
446460
MUST_TAIL return next->impl(rv, next);
447461
}
448462

449463
/* multiple LW */
450-
static bool do_fuse4(riscv_t *rv, const rv_insn_t *ir)
464+
static bool do_fuse4(riscv_t *rv, rv_insn_t *ir)
451465
{
452466
rv->csr_cycle += ir->imm2;
453467
opcode_fuse_t *fuse = ir->fuse;
@@ -465,7 +479,7 @@ static bool do_fuse4(riscv_t *rv, const rv_insn_t *ir)
465479
rv->PC += ir->imm2 * ir->insn_len;
466480
if (unlikely(RVOP_NO_NEXT(ir)))
467481
return true;
468-
const rv_insn_t *next = ir + ir->imm2;
482+
const rv_insn_t *next = next_nth_insn(ir, ir->imm2);
469483
MUST_TAIL return next->impl(rv, next);
470484
}
471485

@@ -479,7 +493,7 @@ static bool do_fuse5(riscv_t *rv, const rv_insn_t *ir)
479493
rv->PC = rv->X[rv_reg_ra] & ~1U;
480494
if (unlikely(RVOP_NO_NEXT(ir)))
481495
return true;
482-
const rv_insn_t *next = ir + 1;
496+
const rv_insn_t *next = ir->next;
483497
MUST_TAIL return next->impl(rv, next);
484498
}
485499

@@ -493,7 +507,7 @@ static bool do_fuse6(riscv_t *rv, const rv_insn_t *ir)
493507
rv->PC = rv->X[rv_reg_ra] & ~1U;
494508
if (unlikely(RVOP_NO_NEXT(ir)))
495509
return true;
496-
const rv_insn_t *next = ir + 1;
510+
const rv_insn_t *next = ir->next;
497511
MUST_TAIL return next->impl(rv, next);
498512
}
499513

@@ -541,15 +555,21 @@ FORCE_INLINE bool insn_is_unconditional_branch(uint8_t opcode)
541555
return false;
542556
}
543557

544-
static void block_translate(riscv_t *rv, block_t *block)
558+
static void block_translate(riscv_t *rv, block_map_t *map, block_t *block)
545559
{
546560
block->pc_start = block->pc_end = rv->PC;
547561

562+
rv_insn_t *prev_ir = NULL;
563+
rv_insn_t *ir = mpool_alloc(map->block_ir_mp);
564+
block->ir_head = ir;
565+
548566
/* translate the basic block */
549-
while (block->n_insn < block->insn_capacity) {
550-
rv_insn_t *ir = block->ir + block->n_insn;
567+
while (true) {
551568
memset(ir, 0, sizeof(rv_insn_t));
552569

570+
if (prev_ir)
571+
prev_ir->next = ir;
572+
553573
/* fetch the next instruction */
554574
const uint32_t insn = rv->io.mem_ifetch(block->pc_end);
555575

@@ -564,21 +584,29 @@ static void block_translate(riscv_t *rv, block_t *block)
564584
/* compute the end of pc */
565585
block->pc_end += ir->insn_len;
566586
block->n_insn++;
587+
prev_ir = ir;
567588
/* stop on branch */
568589
if (insn_is_branch(ir->opcode))
569590
break;
591+
592+
ir = mpool_alloc(map->block_ir_mp);
570593
}
571-
block->ir[block->n_insn - 1].tailcall = true;
594+
595+
assert(prev_ir);
596+
block->ir_tail = prev_ir;
597+
block->ir_tail->tailcall = true;
572598
}
573599

574600
#define COMBINE_MEM_OPS(RW) \
575601
count = 1; \
576-
next_ir = ir + 1; \
602+
next_ir = ir->next; \
603+
tmp_ir = next_ir; \
577604
if (next_ir->opcode != IIF(RW)(rv_insn_lw, rv_insn_sw)) \
578605
break; \
579606
sign = (ir->imm - next_ir->imm) >> 31 ? -1 : 1; \
580-
for (uint32_t j = 1; j < block->n_insn - 1 - i; j++) { \
581-
next_ir = ir + j; \
607+
next_ir = tmp_ir; \
608+
for (uint32_t j = 1; j < block->n_insn - 1 - i; \
609+
j++, next_ir = next_ir->next) { \
582610
if (next_ir->opcode != IIF(RW)(rv_insn_lw, rv_insn_sw) || \
583611
ir->rs1 != next_ir->rs1 || ir->imm - next_ir->imm != 4 * sign) \
584612
break; \
@@ -590,8 +618,8 @@ static void block_translate(riscv_t *rv, block_t *block)
590618
ir->imm2 = count; \
591619
memcpy(ir->fuse, ir, sizeof(opcode_fuse_t)); \
592620
ir->impl = dispatch_table[ir->opcode]; \
593-
for (int j = 1; j < count; j++) { \
594-
next_ir = ir + j; \
621+
next_ir = tmp_ir; \
622+
for (int j = 1; j < count; j++, next_ir = next_ir->next) { \
595623
memcpy(ir->fuse + j, next_ir, sizeof(opcode_fuse_t)); \
596624
} \
597625
ir->tailcall = next_ir->tailcall; \
@@ -825,7 +853,7 @@ static bool detect_memcpy(riscv_t *rv, int lib)
825853

826854
static bool libc_substitute(riscv_t *rv, block_t *block)
827855
{
828-
rv_insn_t *ir = block->ir, *next_ir = NULL;
856+
rv_insn_t *ir = block->ir_head, *next_ir = NULL;
829857
switch (ir->opcode) {
830858
case rv_insn_addi:
831859
/* Compare the target block with the first basic block of
@@ -835,10 +863,10 @@ static bool libc_substitute(riscv_t *rv, block_t *block)
835863
* instruction sequence.
836864
*/
837865
if (ir->imm == 15 && ir->rd == rv_reg_t1 && ir->rs1 == rv_reg_zero) {
838-
next_ir = ir + 1;
866+
next_ir = ir->next;
839867
if (next_ir->opcode == rv_insn_addi && next_ir->rd == rv_reg_a4 &&
840868
next_ir->rs1 == rv_reg_a0 && next_ir->rs2 == rv_reg_zero) {
841-
next_ir = next_ir + 1;
869+
next_ir = next_ir->next;
842870
if (next_ir->opcode == rv_insn_bgeu && next_ir->imm == 60 &&
843871
next_ir->rs1 == rv_reg_t1 && next_ir->rs2 == rv_reg_a2) {
844872
if (detect_memset(rv, 1)) {
@@ -851,7 +879,7 @@ static bool libc_substitute(riscv_t *rv, block_t *block)
851879
}
852880
} else if (ir->imm == 0 && ir->rd == rv_reg_t1 &&
853881
ir->rs1 == rv_reg_a0) {
854-
next_ir = ir + 1;
882+
next_ir = ir->next;
855883
if (next_ir->opcode == rv_insn_beq && next_ir->rs1 == rv_reg_a2 &&
856884
next_ir->rs2 == rv_reg_zero) {
857885
if (next_ir->imm == 20 && detect_memset(rv, 2)) {
@@ -876,14 +904,14 @@ static bool libc_substitute(riscv_t *rv, block_t *block)
876904
*/
877905
if (ir->rd == rv_reg_a5 && ir->rs1 == rv_reg_a0 &&
878906
ir->rs2 == rv_reg_a1) {
879-
next_ir = ir + 1;
907+
next_ir = ir->next;
880908
if (next_ir->opcode == rv_insn_andi && next_ir->imm == 3 &&
881909
next_ir->rd == rv_reg_a5 && next_ir->rs1 == rv_reg_a5) {
882-
next_ir = next_ir + 1;
910+
next_ir = next_ir->next;
883911
if (next_ir->opcode == rv_insn_add &&
884912
next_ir->rd == rv_reg_a7 && next_ir->rs1 == rv_reg_a0 &&
885913
next_ir->rs2 == rv_reg_a2) {
886-
next_ir = next_ir + 1;
914+
next_ir = next_ir->next;
887915
if (next_ir->opcode == rv_insn_bne && next_ir->imm == 104 &&
888916
next_ir->rs1 == rv_reg_a5 &&
889917
next_ir->rs2 == rv_reg_zero) {
@@ -912,12 +940,15 @@ static bool libc_substitute(riscv_t *rv, block_t *block)
912940
*/
913941
static void match_pattern(block_t *block)
914942
{
915-
for (uint32_t i = 0; i < block->n_insn - 1; i++) {
916-
rv_insn_t *ir = block->ir + i, *next_ir = NULL;
943+
uint32_t i;
944+
rv_insn_t *ir;
945+
for (i = 0, ir = block->ir_head; i < block->n_insn - 1;
946+
i++, ir = ir->next) {
947+
rv_insn_t *next_ir = NULL, *tmp_ir = NULL;
917948
int32_t count = 0, sign = 1;
918949
switch (ir->opcode) {
919950
case rv_insn_lui:
920-
next_ir = ir + 1;
951+
next_ir = ir->next;
921952
switch (next_ir->opcode) {
922953
case rv_insn_add:
923954
if (ir->rd == next_ir->rs2 || ir->rd == next_ir->rs1) {
@@ -940,7 +971,7 @@ static void match_pattern(block_t *block)
940971
count++;
941972
if (next_ir->tailcall)
942973
break;
943-
next_ir++;
974+
next_ir = next_ir->next;
944975
}
945976
ir->imm2 = count;
946977
ir->opcode = rv_insn_fuse1;
@@ -994,8 +1025,10 @@ static void optimize_constant(riscv_t *rv, block_t *block)
9941025
constopt_info_t constopt_info = {0};
9951026
constopt_info.is_constant[0] = true;
9961027
assert(rv->X[0] == 0);
997-
for (uint32_t i = 0; i < block->n_insn; i++) {
998-
rv_insn_t *ir = block->ir + i;
1028+
1029+
uint32_t i;
1030+
rv_insn_t *ir;
1031+
for (i = 0, ir = block->ir_head; i < block->n_insn; i++, ir = ir->next) {
9991032
((constopt_func_t) constopt_table[ir->opcode])(ir, &constopt_info);
10001033
}
10011034
}
@@ -1014,10 +1047,7 @@ static block_t *block_find_or_translate(riscv_t *rv)
10141047
}
10151048

10161049
/* allocate a new block */
1017-
next = block_alloc(10);
1018-
1019-
/* translate the basic block */
1020-
block_translate(rv, next);
1050+
next = block_alloc(rv, map);
10211051

10221052
if (!libc_substitute(rv, next)) {
10231053
optimize_constant(rv, next);
@@ -1075,27 +1105,27 @@ void rv_step(riscv_t *rv, int32_t cycles)
10751105
if (prev->pc_start != last_pc)
10761106
prev = block_find(&rv->block_map, last_pc);
10771107

1078-
rv_insn_t *last_ir = prev->ir + prev->n_insn - 1;
1108+
rv_insn_t *last_ir = prev->ir_tail;
10791109
/* chain block */
10801110
if (!insn_is_unconditional_branch(last_ir->opcode)) {
10811111
if (branch_taken && !last_ir->branch_taken)
1082-
last_ir->branch_taken = block->ir;
1112+
last_ir->branch_taken = block->ir_head;
10831113
else if (!last_ir->branch_untaken)
1084-
last_ir->branch_untaken = block->ir;
1114+
last_ir->branch_untaken = block->ir_head;
10851115
} else if (last_ir->opcode == rv_insn_jal
10861116
#if RV32_HAS(EXT_C)
10871117
|| last_ir->opcode == rv_insn_cj ||
10881118
last_ir->opcode == rv_insn_cjal
10891119
#endif
10901120
) {
10911121
if (!last_ir->branch_taken)
1092-
last_ir->branch_taken = block->ir;
1122+
last_ir->branch_taken = block->ir_head;
10931123
}
10941124
}
10951125
last_pc = rv->PC;
10961126

10971127
/* execute the block */
1098-
const rv_insn_t *ir = block->ir;
1128+
const rv_insn_t *ir = block->ir_head;
10991129
if (unlikely(!ir->impl(rv, ir)))
11001130
break;
11011131

‎src/riscv.c

Lines changed: 28 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -7,15 +7,24 @@
77
#include <stdlib.h>
88
#include <string.h>
99

10+
#include "mpool.h"
1011
#include "riscv_private.h"
1112
#include "state.h"
1213

14+
#define BLOCK_MAP_CAPACITY_BITS 10
15+
#define BLOCK_IR_MAP_CAPACITY_BITS 10
16+
1317
/* initialize the block map
 *
 * Sets the capacity to 2^bits entries, resets the entry count, allocates a
 * zeroed table of block pointers, and creates one memory pool for block_t
 * objects and one for rv_insn_t objects.
 */
1418
static void block_map_init(block_map_t *map, const uint8_t bits)
1519
{
1620
map->block_capacity = 1 << bits;
1721
map->size = 0;
1822
/* NOTE(review): the calloc result is not checked here. */
map->map = calloc(map->block_capacity, sizeof(struct block *));
23+
24+
/* NOTE(review): presumably mpool_create takes (pool size in bytes, size of
 * one element), which would give room for 2^BLOCK_MAP_CAPACITY_BITS blocks
 * and 2^BLOCK_IR_MAP_CAPACITY_BITS instructions -- confirm against mpool.h.
 */
map->block_mp = mpool_create(sizeof(block_t) << BLOCK_MAP_CAPACITY_BITS,
25+
sizeof(block_t));
26+
map->block_ir_mp = mpool_create(
27+
sizeof(rv_insn_t) << BLOCK_IR_MAP_CAPACITY_BITS, sizeof(rv_insn_t));
1928
}
2029

2130
/* clear all block in the block map */
@@ -26,15 +35,29 @@ void block_map_clear(block_map_t *map)
2635
block_t *block = map->map[i];
2736
if (!block)
2837
continue;
29-
for (uint32_t i = 0; i < block->n_insn; i++)
30-
free(block->ir[i].fuse);
31-
free(block->ir);
32-
free(block);
38+
uint32_t idx;
39+
rv_insn_t *ir, *next;
40+
for (idx = 0, ir = block->ir_head; idx < block->n_insn;
41+
idx++, ir = next) {
42+
free(ir->fuse);
43+
next = ir->next;
44+
mpool_free(map->block_ir_mp, ir);
45+
}
46+
mpool_free(map->block_mp, block);
3347
map->map[i] = NULL;
3448
}
3549
map->size = 0;
3650
}
3751

52+
/* Tear down a block map: clear every block it holds, free the table of
 * block pointers, then destroy the block pool and the instruction pool.
 */
static void block_map_destroy(block_map_t *map)
53+
{
54+
/* block_map_clear hands blocks and their instructions back to the pools,
 * so it runs before the pools themselves are destroyed. */
block_map_clear(map);
55+
free(map->map);
56+
57+
mpool_destroy(map->block_mp);
58+
mpool_destroy(map->block_ir_mp);
59+
}
60+
3861
riscv_user_t rv_userdata(riscv_t *rv)
3962
{
4063
assert(rv);
@@ -122,8 +145,7 @@ bool rv_enables_to_output_exit_code(riscv_t *rv)
122145
void rv_delete(riscv_t *rv)
123146
{
124147
assert(rv);
125-
block_map_clear(&rv->block_map);
126-
free(rv->block_map.map);
148+
block_map_destroy(&rv->block_map);
127149
free(rv);
128150
}
129151

‎src/riscv_private.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -56,15 +56,17 @@ enum {
5656
typedef struct block {
5757
uint32_t n_insn; /**< number of instructions encompased */
5858
uint32_t pc_start, pc_end; /**< address range of the basic block */
59-
uint32_t insn_capacity; /**< maximum of instructions encompased */
6059
struct block *predict; /**< block prediction */
61-
rv_insn_t *ir; /**< IR as memory blocks */
60+
61+
rv_insn_t *ir_head, *ir_tail; /**< the first and last ir for this block */
6262
} block_t;
6363

6464
/* Table of translated basic blocks together with the memory pools from
 * which the blocks and their instructions are allocated. */
typedef struct {
6565
uint32_t block_capacity; /**< max number of entries in the block map */
6666
uint32_t size; /**< number of entries currently in the map */
6767
block_t **map; /**< block map */
68+
69+
/* block_mp supplies block_t objects; block_ir_mp supplies rv_insn_t objects */
struct mpool *block_mp, *block_ir_mp;
6870
} block_map_t;
6971

7072
/* clear all block in the block map */

‎src/rv32_template.c

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ RVOP(jalr, {
5353
RV_EXC_MISALIGN_HANDLER(pc, insn, false, 0);
5454
block_t *block = block_find(&rv->block_map, rv->PC);
5555
if (block)
56-
return block->ir->impl(rv, block->ir);
56+
return block->ir_head->impl(rv, block->ir_head);
5757
return true;
5858
})
5959

@@ -924,7 +924,7 @@ RVOP(cjr, {
924924
rv->PC = rv->X[ir->rs1];
925925
block_t *block = block_find(&rv->block_map, rv->PC);
926926
if (block)
927-
return block->ir->impl(rv, block->ir);
927+
return block->ir_head->impl(rv, block->ir_head);
928928
return true;
929929
})
930930

@@ -947,7 +947,7 @@ RVOP(cjalr, {
947947
RV_EXC_MISALIGN_HANDLER(rv->PC, insn, true, 0);
948948
block_t *block = block_find(&rv->block_map, rv->PC);
949949
if (block)
950-
return block->ir->impl(rv, block->ir);
950+
return block->ir_head->impl(rv, block->ir_head);
951951
return true;
952952
})
953953

1 commit comments

Comments
 (1)

jserv commented on Oct 2, 2023

@jserv
Contributor

Benchmarks

Benchmark suite Current: 0031224 Previous: 012f117 Ratio
Dhrystone 1130 Average DMIPS over 10 runs 1046.6 Average DMIPS over 10 runs 0.93
Coremark 966.211 Average iterations/sec over 10 runs 987.419 Average iterations/sec over 10 runs 1.02

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.