Skip to content

Commit

Permalink
Apply branch prediction for indirect jump
Browse files Browse the repository at this point in the history
    Previously, it was necessary to perform a block cache lookup at the end
    of an indirect jump emulation; however, the associated overhead of this
    operation proved to be substantial. To mitigate this overhead, we have
    introduced a branch history table that captures the historical data of
    indirect jump targets. Given the limited number of entries in the
    branch history table, the lookup overhead is significantly reduced.

    As shown in the performance analysis provided below, the branch history
    table has demonstrably enhanced the overall performance.

    |  Metric   |   original   |   proposed   |
    |-----------+--------------+--------------|
    | Dhrystone | 2932.3 DMIPS | 2985.2 DMIPS |
    | CoreMark  | 2231 iter/s  | 2236 iter/s  |
    | Stream    | 76.04 sec    | 75.299 sec   |
    | Nqueens   | 4.069 sec    | 3.933 sec    |
  • Loading branch information
qwe661234 committed Nov 20, 2023
1 parent 17c399a commit fa3514c
Show file tree
Hide file tree
Showing 3 changed files with 71 additions and 20 deletions.
9 changes: 8 additions & 1 deletion src/decode.h
Original file line number Diff line number Diff line change
Expand Up @@ -252,6 +252,12 @@ typedef struct {
uint8_t opcode;
} opcode_fuse_t;

/* Number of entries allocated for the branch history table attached to each
 * indirect-jump instruction; also the modulus used when advancing the
 * table's write index.
 */
#define HISTORY_SIZE 16
/* One record of the branch history table of an indirect jump. It pairs a
 * previously seen jump-target address with the IR that was found for it, so
 * a later jump to the same address can dispatch directly instead of going
 * through the block lookup again.
 */
typedef struct {
/* jump-target address this entry was recorded for */
uint32_t PC;
/* head IR of the block that was found for PC; its impl handler is invoked
 * on a table hit
 */
struct rv_insn *branch_target;
} branch_history_entry_t;

typedef struct rv_insn {
union {
int32_t imm;
Expand Down Expand Up @@ -294,7 +300,8 @@ typedef struct rv_insn {
* specific IR array without the need for additional copying.
*/
struct rv_insn *branch_taken, *branch_untaken;

uint8_t branch_table_count;
branch_history_entry_t *branch_table;
} rv_insn_t;

/* decode the RISC-V instruction */
Expand Down
40 changes: 24 additions & 16 deletions src/emulate.c
Original file line number Diff line number Diff line change
Expand Up @@ -371,21 +371,21 @@ static bool is_branch_taken = false;
static uint32_t last_pc = 0;

/* Interpreter-based execution path */
#define RVOP(inst, code) \
static bool do_##inst(riscv_t *rv, const rv_insn_t *ir, uint64_t cycle, \
uint32_t PC) \
{ \
cycle++; \
code; \
nextop: \
PC += __rv_insn_##inst##_len; \
if (unlikely(RVOP_NO_NEXT(ir))) { \
rv->csr_cycle = cycle; \
rv->PC = PC; \
return true; \
} \
const rv_insn_t *next = ir->next; \
MUST_TAIL return next->impl(rv, next, cycle, PC); \
/* Generate the interpreter handler do_<inst> for one instruction.
 *
 * The handler increments the cycle counter, runs the instruction-specific
 * 'code', then advances PC by the instruction length. If RVOP_NO_NEXT(ir)
 * holds, the cycle count and PC are written back to 'rv' and the handler
 * returns true; otherwise it tail-calls the handler of the following IR.
 * 'code' may jump to the 'nextop' label to skip straight to the PC advance.
 *
 * 'ir' is not const-qualified, so 'code' may write to the IR itself (the
 * indirect-jump handlers update ir->branch_table and
 * ir->branch_table_count).
 *
 * NOTE(review): 'next' is still declared const while it is passed to
 * next->impl; if impl's parameter type now matches this non-const signature
 * the call discards the qualifier -- confirm against the impl declaration.
 */
#define RVOP(inst, code) \
static bool do_##inst(riscv_t *rv, rv_insn_t *ir, uint64_t cycle, \
uint32_t PC) \
{ \
cycle++; \
code; \
nextop: \
PC += __rv_insn_##inst##_len; \
if (unlikely(RVOP_NO_NEXT(ir))) { \
rv->csr_cycle = cycle; \
rv->PC = PC; \
return true; \
} \
const rv_insn_t *next = ir->next; \
MUST_TAIL return next->impl(rv, next, cycle, PC); \
}

#include "rv32_template.c"
Expand Down Expand Up @@ -633,8 +633,16 @@ static void block_translate(riscv_t *rv, block_t *block)
block->n_insn++;
prev_ir = ir;
/* stop on branch */
if (insn_is_branch(ir->opcode))
if (insn_is_branch(ir->opcode)) {
if (ir->opcode == rv_insn_jalr
#if RV32_HAS(EXT_C)
|| ir->opcode == rv_insn_cjalr || ir->opcode == rv_insn_cjr
#endif
)
ir->branch_table =
calloc(1, HISTORY_SIZE * sizeof(branch_history_entry_t));
break;
}

ir = mpool_alloc(rv->block_ir_mp);
}
Expand Down
42 changes: 39 additions & 3 deletions src/rv32_template.c
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,21 @@ RVOP(jalr, {
rv->X[ir->rd] = pc + 4;
/* check instruction misaligned */
RV_EXC_MISALIGN_HANDLER(pc, insn, false, 0);
/* lookup branch table */
for (int i = 0; i < ir->branch_table_count; i++) {
if (ir->branch_table[i].PC == PC) {
MUST_TAIL return ir->branch_table[i].branch_target->impl(
rv, ir->branch_table[i].branch_target, cycle, PC);
}
}
block_t *block = block_find(&rv->block_map, PC);
if (block)
if (block) {
/* update branch table */
ir->branch_table_count = (ir->branch_table_count + 1) % HISTORY_SIZE;
ir->branch_table[ir->branch_table_count].PC = PC;
ir->branch_table[ir->branch_table_count].branch_target = block->ir_head;
MUST_TAIL return block->ir_head->impl(rv, block->ir_head, cycle, PC);
}
rv->csr_cycle = cycle;
rv->PC = PC;
return true;
Expand Down Expand Up @@ -1016,9 +1028,21 @@ RVOP(clwsp, {
/* C.JR */
RVOP(cjr, {
PC = rv->X[ir->rs1];
/* lookup branch table */
for (int i = 0; i < ir->branch_table_count; i++) {
if (ir->branch_table[i].PC == PC) {
MUST_TAIL return ir->branch_table[i].branch_target->impl(
rv, ir->branch_table[i].branch_target, cycle, PC);
}
}
block_t *block = block_find(&rv->block_map, PC);
if (block)
if (block) {
/* update branch table */
ir->branch_table_count = (ir->branch_table_count + 1) % HISTORY_SIZE;
ir->branch_table[ir->branch_table_count].PC = PC;
ir->branch_table[ir->branch_table_count].branch_target = block->ir_head;
MUST_TAIL return block->ir_head->impl(rv, block->ir_head, cycle, PC);
}
rv->csr_cycle = cycle;
rv->PC = PC;
return true;
Expand All @@ -1043,9 +1067,21 @@ RVOP(cjalr, {
rv->X[rv_reg_ra] = PC + 2;
PC = jump_to;
RV_EXC_MISALIGN_HANDLER(PC, insn, true, 0);
/* lookup branch table */
for (int i = 0; i < ir->branch_table_count; i++) {
if (ir->branch_table[i].PC == PC) {
MUST_TAIL return ir->branch_table[i].branch_target->impl(
rv, ir->branch_table[i].branch_target, cycle, PC);
}
}
block_t *block = block_find(&rv->block_map, PC);
if (block)
if (block) {
/* update branch table */
ir->branch_table_count = (ir->branch_table_count + 1) % HISTORY_SIZE;
ir->branch_table[ir->branch_table_count].PC = PC;
ir->branch_table[ir->branch_table_count].branch_target = block->ir_head;
MUST_TAIL return block->ir_head->impl(rv, block->ir_head, cycle, PC);
}
rv->csr_cycle = cycle;
rv->PC = PC;
return true;
Expand Down

1 comment on commit fa3514c

@jserv
Copy link
Contributor

@jserv jserv commented on fa3514c Nov 20, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Benchmarks

Benchmark suite Current: fa3514c Previous: 3d2bcff Ratio
Dhrystone 1669.77 Average DMIPS over 10 runs 1668.75 Average DMIPS over 10 runs 1.00
Coremark 1434.4 Average iterations/sec over 10 runs 1436.513 Average iterations/sec over 10 runs 1.00

This comment was automatically generated by workflow using github-action-benchmark.

Please sign in to comment.