From 44f7395e053a259eead03e65691b98c9ff753fa8 Mon Sep 17 00:00:00 2001 From: Yen-Fu Chen Date: Mon, 25 Dec 2023 13:04:10 +0800 Subject: [PATCH] Introduce a tier-1 JIT compiler based on aarch64 architecture We follow the template and API of X64 to implement A64 tier-1 JIT compiler. * Performance | Metric | rv32emu-T1C | qemu | |----------+-------------+-------| |aes | 0.034| 0.045| |puzzle | 0.0115| 0.0169| |pi | 0.035| 0.032| |dhrystone | 1.914| 2.005| |Nqueens | 3.87| 2.898| |qsort-O2 | 7.819| 11.614| |miniz-O2 | 7.604| 3.803| |primes-O2 | 10.551| 5.986| |sha512-O2 | 6.497| 2.853| |stream | 52.25| 45.776| As demonstrated in the memory usage analysis below, the tier-1 JIT compiler utilizes less memory than QEMU across all benchmarks. * Memory usage | Metric | rv32emu-T1C | qemu | |----------+-------------+---------| |aes | 183,212|1,265,962| |puzzle | 145,239| 891,357| |pi | 144,739| 872,525| |dhrystone | 146,282| 853,256| |Nqueens | 146,696| 854,174| |qsort-O2 | 146,907| 856,721| |miniz-O2 | 157,475| 999,897| |primes-O2 | 142,356| 851,661| |sha512-O2 | 145,369| 901,136| |stream | 157,975| 955,809| Related: #238 Close: #296 --- .github/workflows/main.yml | 3 + .gitignore | 2 +- Makefile | 12 +- src/emulate.c | 4 +- src/jit.c | 1557 ++++++++++++++++++++++++++++++++++++ src/jit.h | 39 + src/jit_x64.c | 577 ------------- src/jit_x64.h | 407 ---------- src/riscv.c | 6 +- src/rv32_template.c | 1072 +++++++++++++------------ src/utils.h | 9 + tools/gen-jit-template.py | 16 +- 12 files changed, 2181 insertions(+), 1523 deletions(-) create mode 100644 src/jit.c create mode 100644 src/jit.h delete mode 100644 src/jit_x64.c delete mode 100644 src/jit_x64.h diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 4f22cece6..cae46d01f 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -58,6 +58,9 @@ jobs: run: | make make check + - name: JIT test + run: | + make ENABLE_JIT=1 distclean check coding-style: runs-on: ubuntu-22.04 diff
--git a/.gitignore b/.gitignore index bf2724afa..86dd56475 100644 --- a/.gitignore +++ b/.gitignore @@ -21,4 +21,4 @@ build/path/ tests/**/*.elf tests/arch-test-target/config.ini __pycache__/ -src/rv32_jit_template.c +src/rv32_jit.c diff --git a/Makefile b/Makefile index 8e3d375cd..cda3c5027 100644 --- a/Makefile +++ b/Makefile @@ -121,15 +121,15 @@ endif ENABLE_JIT ?= 0 $(call set-feature, JIT) ifeq ($(call has, JIT), 1) -OBJS_EXT += jit_x64.o -ifneq ($(processor), x86_64) -$(error JIT mode only supports for x64 target currently.) +OBJS_EXT += jit.o +ifneq ($(processor),$(filter $(processor),x86_64 aarch64 arm64)) +$(error JIT mode only supports for x64 and arm64 target currently.) endif -src/rv32_jit_template.c: +src/rv32_jit.c: $(Q)tools/gen-jit-template.py $(CFLAGS) > $@ -$(OUT)/jit_x64.o: src/jit_x64.c src/rv32_jit_template.c +$(OUT)/jit.o: src/jit.c src/rv32_jit.c $(VECHO) " CC\t$@\n" $(Q)$(CC) -o $@ $(CFLAGS) -c -MMD -MF $@.d $< endif @@ -235,7 +235,7 @@ endif endif clean: - $(RM) $(BIN) $(OBJS) $(HIST_BIN) $(HIST_OBJS) $(deps) $(CACHE_OUT) src/rv32_jit_template.c + $(RM) $(BIN) $(OBJS) $(HIST_BIN) $(HIST_OBJS) $(deps) $(CACHE_OUT) src/rv32_jit.c distclean: clean -$(RM) $(DOOM_DATA) $(QUAKE_DATA) $(RM) -r $(OUT)/id1 diff --git a/src/emulate.c b/src/emulate.c index 552e29425..9caba2f00 100644 --- a/src/emulate.c +++ b/src/emulate.c @@ -27,7 +27,7 @@ extern struct target_ops gdbstub_ops; #if RV32_HAS(JIT) #include "cache.h" -#include "jit_x64.h" +#include "jit.h" #endif /* Shortcuts for comparing each field of specified RISC-V instruction */ @@ -1067,7 +1067,7 @@ void rv_step(riscv_t *rv, int32_t cycles) cache_freq(rv->block_cache, block->pc_start) >= 1024) || cache_hot(rv->block_cache, block->pc_start))) { block->hot = true; - block->offset = translate_x64(rv, block); + block->offset = jit_translate(rv, block); ((exec_block_func_t) state->buf)( rv, (uintptr_t) (state->buf + block->offset)); prev = NULL; diff --git a/src/jit.c b/src/jit.c new file mode 100644 
index 000000000..6fcbbe5d6 --- /dev/null +++ b/src/jit.c @@ -0,0 +1,1557 @@ +/* + * rv32emu is freely redistributable under the MIT License. See the file + * "LICENSE" for information on usage and redistribution of this file. + */ + +/* This JIT implementation has undergone extensive modifications, heavily + * relying on the ubpf_jit_[x86_64|arm64].[ch] from ubpf. The original + * ubpf_jit_[x86_64|arm64].[ch] file served as the foundation and source of + * inspiration for adapting and tailoring it specifically for this JIT + * implementation. Therefore, credit and sincere thanks are extended to ubpf for + * their invaluable work. + * + * Reference: + * https://github.com/iovisor/ubpf/blob/main/vm/ubpf_jit_x86_64.c + * https://github.com/iovisor/ubpf/blob/main/vm/ubpf_jit_arm64.c + */ + +#if !defined(__x86_64__) && !defined(__aarch64__) +#error "This implementation is dedicated to x64 and arm64." +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#if defined(__APPLE__) +#include +#if defined(__aarch64__) +#include +#endif +#endif + +#include "cache.h" +#include "decode.h" +#include "io.h" +#include "jit.h" +#include "riscv.h" +#include "utils.h" + +#define JIT_CLS_MASK 0x07 +#define JIT_ALU_OP_MASK 0xf0 +#define JIT_CLS_ALU 0x04 +#define JIT_CLS_ALU64 0x07 +#define JIT_SRC_IMM 0x00 +#define JIT_SRC_REG 0x08 +#define JIT_OP_MUL_IMM (JIT_CLS_ALU | JIT_SRC_IMM | 0x20) +#define JIT_OP_MUL_REG (JIT_CLS_ALU | JIT_SRC_REG | 0x20) +#define JIT_OP_DIV_IMM (JIT_CLS_ALU | JIT_SRC_IMM | 0x30) +#define JIT_OP_DIV_REG (JIT_CLS_ALU | JIT_SRC_REG | 0x30) +#define JIT_OP_MOD_IMM (JIT_CLS_ALU | JIT_SRC_IMM | 0x90) +#define JIT_OP_MOD_REG (JIT_CLS_ALU | JIT_SRC_REG | 0x90) + +#define STACK_SIZE 512 +#define MAX_INSNS 1024 +#if defined(__x86_64__) +#define JUMP_LOC jump_loc + 2 +/* Special values for target_pc in struct jump */ +#define TARGET_PC_EXIT -1U +#define TARGET_PC_RETPOLINE -3U +enum x64_reg { + RAX, + RCX, + RDX, + RBX, + RSP, + RBP, + 
RIP = 5, + RSI, + RDI, + R8, + R9, + R10, + R11, + R12, + R13, + R14, + R15, +}; + +#elif defined(__aarch64__) +#define JUMP_LOC jump_loc +/* Special values for target_pc in struct jump */ +#define TARGET_PC_EXIT ~UINT32_C(0) +#define TARGET_PC_ENTER (~UINT32_C(0) & 0x0101) +/* This is guaranteed to be an illegal A64 instruction. */ +#define BAD_OPCODE ~UINT32_C(0) + +enum a64_reg { + R0, + R1, + R2, + R3, + R4, + R5, + R6, + R7, + R8, + R9, + R10, + R11, + R12, + R13, + R14, + R15, + R16, + R17, + R18, + R19, + R20, + R21, + R22, + R23, + R24, + R25, + R26, + R27, + R28, + R29, + R30, + SP, + RZ = 31 +}; + +enum arm_opcode { + /* AddSubOpcode */ + AS_ADD = 0, + AS_SUB = 2, + AS_SUBS = 3, + /* LogicalOpcode */ + LOG_AND = 0x00000000U, // 0000_0000_0000_0000_0000_0000_0000_0000 + LOG_ORR = 0x20000000U, // 0010_0000_0000_0000_0000_0000_0000_0000 + LOG_ORN = 0x20200000U, // 0010_0000_0010_0000_0000_0000_0000_0000 + LOG_EOR = 0x40000000U, // 0100_0000_0000_0000_0000_0000_0000_0000 + /* LoadStoreOpcode */ + LS_STRB = 0x00000000U, // 0000_0000_0000_0000_0000_0000_0000_0000 + LS_LDRB = 0x00400000U, // 0000_0000_0100_0000_0000_0000_0000_0000 + LS_LDRSBW = 0x00c00000U, // 0000_0000_1100_0000_0000_0000_0000_0000 + LS_STRH = 0x40000000U, // 0100_0000_0000_0000_0000_0000_0000_0000 + LS_LDRH = 0x40400000U, // 0100_0000_0100_0000_0000_0000_0000_0000 + LS_LDRSHW = 0x40c00000U, // 0100_0000_1100_0000_0000_0000_0000_0000 + LS_STRW = 0x80000000U, // 1000_0000_0000_0000_0000_0000_0000_0000 + LS_LDRW = 0x80400000U, // 1000_0000_0100_0000_0000_0000_0000_0000 + LS_STRX = 0xc0000000U, // 1100_0000_0000_0000_0000_0000_0000_0000 + LS_LDRX = 0xc0400000U, // 1100_0000_0100_0000_0000_0000_0000_0000 + /* LoadStorePairOpcode */ + LSP_STPX = 0xa9000000U, // 1010_1001_0000_0000_0000_0000_0000_0000 + LSP_LDPX = 0xa9400000U, // 1010_1001_0100_0000_0000_0000_0000_0000 + /* UnconditionalBranchOpcode */ + BR_BR = 0xd61f0000U, // 1101_0110_0001_1111_0000_0000_0000_0000 + BR_BLR = 0xd63f0000U, // 
1101_0110_0011_1111_0000_0000_0000_0000 + BR_RET = 0xd65f0000U, // 1101_0110_0101_1111_0000_0000_0000_0000 + /* UnconditionalBranchImmediateOpcode */ + UBR_B = 0x14000000U, // 0001_0100_0000_0000_0000_0000_0000_0000 + /* ConditionalBranchImmediateOpcode */ + BR_Bcond = 0x54000000U, + /* DP2Opcode */ + DP2_UDIV = 0x1ac00800U, // 0001_1010_1100_0000_0000_1000_0000_0000 + DP2_LSLV = 0x1ac02000U, // 0001_1010_1100_0000_0010_0000_0000_0000 + DP2_LSRV = 0x1ac02400U, // 0001_1010_1100_0000_0010_0100_0000_0000 + DP2_ASRV = 0x1ac02800U, // 0001_1010_1100_0000_0010_1000_0000_0000 + /* DP3Opcode */ + DP3_MADD = 0x1b000000U, // 0001_1011_0000_0000_0000_0000_0000_0000 + DP3_MSUB = 0x1b008000U, // 0001_1011_0000_0000_1000_0000_0000_0000 + /* MoveWideOpcode */ + MW_MOVN = 0x12800000U, // 0001_0010_1000_0000_0000_0000_0000_0000 + MW_MOVZ = 0x52800000U, // 0101_0010_1000_0000_0000_0000_0000_0000 + MW_MOVK = 0x72800000U, // 0111_0010_1000_0000_0000_0000_0000_0000 +}; + +enum condition { + COND_EQ, + COND_NE, + COND_HS, + COND_LO, + COND_GE = 10, + COND_LT = 11, +}; +#endif + +enum vm_reg { + VM_REG_0 = 0, + VM_REG_1, + VM_REG_2, + VM_REG_3, + VM_REG_4, + VM_REG_5, + VM_REG_6, + VM_REG_7, + VM_REG_8, + VM_REG_9, + VM_REG_10, + N_VM_REGS, +}; + +enum operand_size { + S8, + S16, + S32, +}; + +#if defined(__x86_64__) +/* There are two common x86-64 calling conventions, discussed at: + * https://en.wikipedia.org/wiki/X64_calling_conventions#x86-64_calling_conventions + * + * Please note: R12 is an exception and is *not* being used. Consequently, it + * is omitted from the list of non-volatile registers for both platforms, + * despite being non-volatile. 
+ */ +#if defined(_WIN32) +static const int nonvolatile_reg[] = {RBP, RBX, RDI, RSI, R13, R14, R15}; +static const int parameter_reg[] = {RCX, RDX, R8, R9}; +#define RCX_ALT R10 +static const int register_map[] = { + RAX, R10, RDX, R8, R9, R14, R15, RDI, RSI, RBX, RBP, +}; +#else +#define RCX_ALT R9 +static const int nonvolatile_reg[] = {RBP, RBX, R13, R14, R15}; +static const int parameter_reg[] = {RDI, RSI, RDX, RCX, R8, R9}; +static const int temp_reg[] = {RAX, RBX, RCX}; +static const int register_map[] = { + RAX, RDI, RSI, RDX, R9, R8, RBX, R13, R14, R15, RBP, +}; +#endif +#elif defined(__aarch64__) +/* callee_reg - this must be a multiple of two because of how we save the stack + * later on. */ +static const int callee_reg[] = {R19, R20, R21, R22, R23, R24, R25, R26}; +/* parameter_reg (Caller saved registers) */ +static const int parameter_reg[] = {R0, R1, R2, R3, R4}; +static const int temp_reg[] = {R6, R7, R8}; +/* Temp register for immediate generation */ +static const int temp_imm_reg = R24; +/* Temp register for division results */ +static const int temp_div_reg = R25; + +/* Register assignments: + * Arm64 Usage + * r0 - r4 Function parameters, caller-saved + * r6 - r8 Temp - used for storing calculated value during execution + * r19 - r23 Callee-saved registers + * r24 Temp - used for generating 32-bit immediates + * r25 Temp - used for modulus calculations + */ + +static const int register_map[] = { + R5, /* result */ + R0, R1, R2, R3, R4, /* parameters */ + R19, R20, R21, R22, R23, /* callee-saved */ +}; +static inline void emit_load_imm(struct jit_state *state, int dst, int64_t imm); +#endif + +/* Return the register for the given JIT register */ +static int map_register(int r) +{ + assert(r < N_VM_REGS); + return register_map[r % N_VM_REGS]; +} + +static inline void offset_map_insert(struct jit_state *state, int32_t target_pc) +{ + struct offset_map *map_entry = &state->offset_map[state->n_insn++]; + map_entry->pc = target_pc; + map_entry->offset
= state->offset; +} + +#if !defined(__APPLE__) +#define sys_icache_invalidate(addr, size) \ + __builtin___clear_cache((char *) (addr), (char *) (addr) + (size)); +#endif + +static void emit_bytes(struct jit_state *state, void *data, uint32_t len) +{ + assert(state->offset <= state->size - len); + if (unlikely((state->offset + len) > state->size)) { + state->offset = state->size; + return; + } +#if defined(__APPLE__) && defined(__aarch64__) + pthread_jit_write_protect_np(false); +#endif + memcpy(state->buf + state->offset, data, len); + sys_icache_invalidate(state->buf + state->offset, len); +#if defined(__APPLE__) && defined(__aarch64__) + pthread_jit_write_protect_np(true); +#endif + state->offset += len; +} + +#if defined(__x86_64__) +static inline void emit1(struct jit_state *state, uint8_t x) +{ + emit_bytes(state, &x, sizeof(x)); +} + +static inline void emit2(struct jit_state *state, uint16_t x) +{ + emit_bytes(state, &x, sizeof(x)); +} + +static inline void emit4(struct jit_state *state, uint32_t x) +{ + emit_bytes(state, &x, sizeof(x)); +} + +static inline void emit8(struct jit_state *state, uint64_t x) +{ + emit_bytes(state, &x, sizeof(x)); +} + +static inline void emit_modrm(struct jit_state *state, int mod, int r, int m) +{ + assert(!(mod & ~0xc0)); + emit1(state, (mod & 0xc0) | ((r & 7) << 3) | (m & 7)); +} + +static inline void emit_modrm_reg2reg(struct jit_state *state, int r, int m) +{ + emit_modrm(state, 0xc0, r, m); +} + +static inline void emit_modrm_and_displacement(struct jit_state *state, + int r, + int m, + int32_t d) +{ + if (d == 0 && (m & 7) != RBP) { + emit_modrm(state, 0x00, r, m); + } else if (d >= -128 && d <= 127) { + emit_modrm(state, 0x40, r, m); + emit1(state, d); + } else { + emit_modrm(state, 0x80, r, m); + emit4(state, d); + } +} + +static inline void emit_rex(struct jit_state *state, int w, int r, int x, int b) +{ + assert(!(w & ~1)); + assert(!(r & ~1)); + assert(!(x & ~1)); + assert(!(b & ~1)); + emit1(state, 0x40 | (w << 3) | 
(r << 2) | (x << 1) | b); +} + +/* Emit a REX prefix incorporating the top bit of both src and dst. This step is + * skipped if no bits are set. + */ +static inline void emit_basic_rex(struct jit_state *state, + int w, + int src, + int dst) +{ + if (w || (src & 8) || (dst & 8)) + emit_rex(state, w, !!(src & 8), 0, !!(dst & 8)); +} + +static inline void emit_push(struct jit_state *state, int r) +{ + emit_basic_rex(state, 0, 0, r); + emit1(state, 0x50 | (r & 7)); +} + +static inline void emit_pop(struct jit_state *state, int r) +{ + emit_basic_rex(state, 0, 0, r); + emit1(state, 0x58 | (r & 7)); +} + +static inline void emit_jump_target_address(struct jit_state *state, + int32_t target_pc) +{ + struct jump *jump = &state->jumps[state->n_jumps++]; + jump->offset_loc = state->offset; + jump->target_pc = target_pc; + emit4(state, 0); +} +#elif defined(__aarch64__) +static void emit_a64(struct jit_state *state, uint32_t insn) +{ + assert(insn != BAD_OPCODE); + emit_bytes(state, &insn, 4); +} + +/* Get the value of the size bit in most instruction encodings (bit 31). */ +static uint32_t sz(bool is64) +{ + return (is64 ? UINT32_C(1) : UINT32_C(0)) << 31; +} + + +/* For details on Arm instructions, users can refer to + * https://developer.arm.com/documentation/ddi0487/ha (Arm Architecture + * Reference Manual for A-profile architecture). + */ + +/* [ARM-A]: C4.1.64: Add/subtract (immediate). */ +static void emit_addsub_immediate(struct jit_state *state, + bool is64, + enum arm_opcode op, + int rd, + int rn, + uint32_t imm12) +{ + const uint32_t imm_op_base = 0x11000000; + emit_a64(state, sz(is64) | (op << 29) | imm_op_base | (0 << 22) | + (imm12 << 10) | (rn << 5) | rd); +} + +/* [ARM-A]: C4.1.67: Logical (shifted register). 
*/ +static void emit_logical_register(struct jit_state *state, + bool is64, + enum arm_opcode op, + int rd, + int rn, + int rm) +{ + emit_a64(state, sz(is64) | op | (1 << 27) | (1 << 25) | (rm << 16) | + (rn << 5) | rd); +} + +/* [ARM-A]: C4.1.67: Add/subtract (shifted register). */ +static inline void emit_addsub_register(struct jit_state *state, + bool is64, + enum arm_opcode op, + int rd, + int rn, + int rm) +{ + const uint32_t reg_op_base = 0x0b000000; + emit_a64(state, + sz(is64) | (op << 29) | reg_op_base | (rm << 16) | (rn << 5) | rd); +} + +/* [ARM-A]: C4.1.64: Move wide (Immediate). */ +static inline void emit_movewide_immediate(struct jit_state *state, + bool is64, + int rd, + uint64_t imm) +{ + /* Emit a MOVZ or MOVN followed by a sequence of MOVKs to generate the + * 64-bit constant in imm. See whether the 0x0000 or 0xffff pattern is more + * common in the immediate. This ensures we produce the fewest number of + * immediates. + */ + unsigned count0000 = is64 ? 0 : 2; + unsigned countffff = 0; + for (unsigned i = 0; i < (is64 ? 64 : 32); i += 16) { + uint64_t block = (imm >> i) & 0xffff; + if (block == 0xffff) { + ++countffff; + } else if (block == 0) { + ++count0000; + } + } + + /* Iterate over 16-bit elements of imm, outputting an appropriate move + * instruction. */ + bool invert = (count0000 < countffff); + enum arm_opcode op = invert ? MW_MOVN : MW_MOVZ; + uint64_t skip_pattern = invert ? 0xffff : 0; + for (unsigned i = 0; i < (is64 ? 4 : 2); ++i) { + uint64_t imm16 = (imm >> (i * 16)) & 0xffff; + if (imm16 != skip_pattern) { + if (invert) { + imm16 = ~imm16; + imm16 &= 0xffff; + } + emit_a64(state, sz(is64) | op | (i << 21) | (imm16 << 5) | rd); + op = MW_MOVK; + invert = false; + } + } + + /* Tidy up for the case imm = 0 or imm == -1. */ + if (op != MW_MOVK) { + emit_a64(state, sz(is64) | op | (0 << 21) | (0 << 5) | rd); + } +} + +/* [ARM-A]: C4.1.66: Load/store register (unscaled immediate). 
*/ +static void emit_loadstore_imm(struct jit_state *state, + enum arm_opcode op, + int rt, + int rn, + int16_t imm9) +{ + const uint32_t imm_op_base = 0x38000000U; + assert(imm9 >= -256 && imm9 < 256); + imm9 &= 0x1ff; + emit_a64(state, imm_op_base | op | (imm9 << 12) | (rn << 5) | rt); +} + +/* [ARM-A]: C4.1.66: Load/store register pair (offset). */ +static void emit_loadstorepair_immediate(struct jit_state *state, + enum arm_opcode op, + int rt, + int rt2, + int rn, + int32_t imm7) +{ + int32_t imm_div = ((op == LSP_STPX) || (op == LSP_LDPX)) ? 8 : 4; + assert(imm7 % imm_div == 0); + imm7 /= imm_div; + emit_a64(state, op | (imm7 << 15) | (rt2 << 10) | (rn << 5) | rt); +} + +/* [ARM-A]: C4.1.65: Unconditional branch (register). */ +static void emit_uncond_branch_reg(struct jit_state *state, + enum arm_opcode op, + int rn) +{ + emit_a64(state, op | (rn << 5)); +} + +/* [ARM-A]: C4.1.67: Data-processing (2 source). */ +static void emit_dataproc_2source(struct jit_state *state, + bool is64, + enum arm_opcode op, + int rd, + int rn, + int rm) +{ + emit_a64(state, sz(is64) | op | (rm << 16) | (rn << 5) | rd); +} + + +/* [ARM-A]: C4.1.67: Data-processing (3 source). */ +static void emit_dataproc_3source(struct jit_state *state, + bool is64, + enum arm_opcode op, + int rd, + int rn, + int rm, + int ra) +{ + emit_a64(state, sz(is64) | op | (rm << 16) | (ra << 10) | (rn << 5) | rd); +} + +static void update_branch_immediate(struct jit_state *state, + uint32_t offset, + int32_t imm) +{ + assert((imm & 3) == 0); + uint32_t insn; + imm >>= 2; + memcpy(&insn, state->buf + offset, sizeof(uint32_t)); + if ((insn & 0xfe000000U) == 0x54000000U /* Conditional branch immediate. */ + || (insn & 0x7e000000U) == + 0x34000000U) { /* Compare and branch immediate. */ + assert((imm >> 19) == INT64_C(-1) || (imm >> 19) == 0); + insn |= (imm & 0x7ffff) << 5; + } else if ((insn & 0x7c000000U) == 0x14000000U) { + /* Unconditional branch immediate. 
*/ + assert((imm >> 26) == INT64_C(-1) || (imm >> 26) == 0); + insn |= (imm & 0x03ffffffU) << 0; + } else { + assert(false); + insn = BAD_OPCODE; + } +#if defined(__APPLE__) && defined(__aarch64__) + pthread_jit_write_protect_np(false); +#endif + memcpy(state->buf + offset, &insn, sizeof(uint32_t)); +#if defined(__APPLE__) && defined(__aarch64__) + pthread_jit_write_protect_np(true); +#endif +} +#endif + +static inline void emit_jump_target_offset(struct jit_state *state, + uint32_t jump_loc, + uint32_t jump_state_offset) +{ + struct jump *jump = &state->jumps[state->n_jumps++]; + jump->offset_loc = jump_loc; + jump->target_offset = jump_state_offset; +} + +/* The REX prefix and ModRM byte are emitted. + * The MR encoding is utilized when a choice is available. The 'src' is often + * used as an opcode extension. + */ +static inline void emit_alu32(struct jit_state *state, int op, int src, int dst) +{ +#if defined(__x86_64__) + emit_basic_rex(state, 0, src, dst); + emit1(state, op); + emit_modrm_reg2reg(state, src, dst); +#elif defined(__aarch64__) + switch (op) { + case 1: /* ADD */ + emit_addsub_register(state, false, AS_ADD, dst, dst, src); + break; + case 0x29: /* SUB */ + emit_addsub_register(state, false, AS_SUB, dst, dst, src); + break; + case 0x31: /* XOR */ + emit_logical_register(state, false, LOG_EOR, dst, dst, src); + break; + case 9: /* OR */ + emit_logical_register(state, false, LOG_ORR, dst, dst, src); + break; + case 0x21: /* AND */ + emit_logical_register(state, false, LOG_AND, dst, dst, src); + break; + case 0xd3: + if (src == 4) /* SLL */ + emit_dataproc_2source(state, false, DP2_LSLV, dst, dst, R8); + else if (src == 5) /* SRL */ + emit_dataproc_2source(state, false, DP2_LSRV, dst, dst, R8); + else if (src == 7) /* SRA */ + emit_dataproc_2source(state, false, DP2_ASRV, dst, dst, R8); + break; + default: + __UNREACHABLE; + break; + } +#endif +} + +/* REX prefix, ModRM byte, and 32-bit immediate */ +static inline void emit_alu32_imm32(struct
jit_state *state, + int op UNUSED, + int src, + int dst, + int32_t imm) +{ +#if defined(__x86_64__) + emit_alu32(state, op, src, dst); + emit4(state, imm); +#elif defined(__aarch64__) + switch (src) { /* 'src' selects the operation here; it is not a register */ + case 0: /* ADD */ + emit_load_imm(state, R10, imm); + emit_addsub_register(state, false, AS_ADD, dst, dst, R10); + break; + case 1: /* OR */ + emit_load_imm(state, R10, imm); + emit_logical_register(state, false, LOG_ORR, dst, dst, R10); + break; + case 4: /* AND */ + emit_load_imm(state, R10, imm); + emit_logical_register(state, false, LOG_AND, dst, dst, R10); + break; + case 6: /* XOR: operate on dst, never on register number 'src' */ + emit_load_imm(state, R10, imm); + emit_logical_register(state, false, LOG_EOR, dst, dst, R10); + break; + default: + __UNREACHABLE; + break; + } +#endif +} + +/* REX prefix, ModRM byte, and 8-bit immediate */ +static inline void emit_alu32_imm8(struct jit_state *state, + int op UNUSED, + int src, + int dst, + int8_t imm) +{ +#if defined(__x86_64__) + emit_alu32(state, op, src, dst); + emit1(state, imm); +#elif defined(__aarch64__) + switch (src) { + case 4: + emit_load_imm(state, R10, imm); + emit_dataproc_2source(state, false, DP2_LSLV, dst, dst, R10); + break; + case 5: + emit_load_imm(state, R10, imm); + emit_dataproc_2source(state, false, DP2_LSRV, dst, dst, R10); + break; + case 7: + emit_load_imm(state, R10, imm); + emit_dataproc_2source(state, false, DP2_ASRV, dst, dst, R10); + break; + default: + __UNREACHABLE; + break; + } +#endif +} + +/* The REX.W prefix and ModRM byte are emitted. + * The MR encoding is used when there is a choice. 'src' is often used as + * an opcode extension.
+ */ +static inline void emit_alu64(struct jit_state *state, int op, int src, int dst) +{ +#if defined(__x86_64__) + emit_basic_rex(state, 1, src, dst); + emit1(state, op); + emit_modrm_reg2reg(state, src, dst); +#elif defined(__aarch64__) + if (op == 0x01) + emit_addsub_register(state, true, AS_ADD, dst, dst, src); +#endif +} + +/* REX.W prefix, ModRM byte, and 8-bit immediate */ +static inline void emit_alu64_imm8(struct jit_state *state, + int op, + int src UNUSED, + int dst, + int8_t imm) +{ +#if defined(__x86_64__) + emit_alu64(state, op, src, dst); + emit1(state, imm); +#elif defined(__aarch64__) + if (op == 0xc1) { + emit_load_imm(state, R10, imm); + emit_dataproc_2source(state, true, DP2_LSRV, dst, dst, R10); + } +#endif +} + +#if defined(__x86_64__) +/* Register to register mov */ +static inline void emit_mov(struct jit_state *state, int src, int dst) +{ + emit_alu64(state, 0x89, src, dst); +} + +/* REX.W prefix, ModRM byte, and 32-bit immediate */ +static inline void emit_alu64_imm32(struct jit_state *state, + int op, + int src, + int dst, + int32_t imm) +{ + emit_alu64(state, op, src, dst); + emit4(state, imm); +} +#elif defined(__aarch64__) +static void divmod(struct jit_state *state, + uint8_t opcode, + int rd, + int rn, + int rm) +{ + bool mod = (opcode & JIT_ALU_OP_MASK) == (JIT_OP_MOD_IMM & JIT_ALU_OP_MASK); + bool is64 = (opcode & JIT_CLS_MASK) == JIT_CLS_ALU64; + int div_dest = mod ? temp_div_reg : rd; + + /* Do not need to treat divide by zero as special because the UDIV + * instruction already returns 0 when dividing by zero.
+ */ + emit_dataproc_2source(state, is64, DP2_UDIV, div_dest, rn, rm); + if (mod) { + emit_dataproc_3source(state, is64, DP3_MSUB, rd, rm, div_dest, rn); + } +} +#endif + +static inline void emit_cmp_imm32(struct jit_state *state, int dst, int32_t imm) +{ +#if defined(__x86_64__) + emit_alu64_imm32(state, 0x81, 7, dst, imm); +#elif defined(__aarch64__) + emit_load_imm(state, R10, imm); + emit_addsub_register(state, false, AS_SUBS, RZ, dst, R10); +#endif +} + +static inline void emit_cmp32(struct jit_state *state, int src, int dst) +{ +#if defined(__x86_64__) + emit_alu32(state, 0x39, src, dst); +#elif defined(__aarch64__) + emit_addsub_register(state, false, AS_SUBS, RZ, dst, src); +#endif +} + +static inline void emit_jcc_offset(struct jit_state *state, int code) +{ +#if defined(__x86_64__) + emit1(state, 0x0f); + emit1(state, code); + emit4(state, 0); +#elif defined(__aarch64__) + switch (code) { + case 0x84: /* BEQ */ + code = COND_EQ; + break; + case 0x85: /* BNE */ + code = COND_NE; + break; + case 0x8c: /* BLT */ + code = COND_LT; + break; + case 0x8d: /* BGE */ + code = COND_GE; + break; + case 0x82: /* BLTU */ + code = COND_LO; + break; + case 0x83: /* BGEU */ + code = COND_HS; + break; + default: + __UNREACHABLE; + break; + } + emit_a64(state, BR_Bcond | (0 << 5) | code); +#endif +} + +/* Load [src + offset] into dst */ +static inline void emit_load(struct jit_state *state, + enum operand_size size, + int src, + int dst, + int32_t offset) +{ +#if defined(__x86_64__) + if (size == S8 || size == S16) { + /* movzx */ + emit1(state, 0x0f); + emit1(state, size == S8 ? 
0xb6 : 0xb7); + } else if (size == S32) { + /* mov */ + emit1(state, 0x8b); + } + + emit_modrm_and_displacement(state, dst, src, offset); +#elif defined(__aarch64__) + switch (size) { + case S8: + emit_loadstore_imm(state, LS_LDRB, dst, src, offset); + break; + case S16: + emit_loadstore_imm(state, LS_LDRH, dst, src, offset); + break; + case S32: + emit_loadstore_imm(state, LS_LDRW, dst, src, offset); + break; + default: + __UNREACHABLE; + break; + } +#endif +} + +static inline void emit_load_sext(struct jit_state *state, + enum operand_size size, + int src, + int dst, + int32_t offset) +{ +#if defined(__x86_64__) + if (size == S8 || size == S16) { + /* movsx */ + emit1(state, 0x0f); + emit1(state, size == S8 ? 0xbe : 0xbf); + } else if (size == S32) { + emit_basic_rex(state, 1, dst, src); + emit1(state, 0x63); + } + + emit_modrm_and_displacement(state, dst, src, offset); +#elif defined(__aarch64__) + switch (size) { + case S8: + emit_loadstore_imm(state, LS_LDRSBW, dst, src, offset); + break; + case S16: + emit_loadstore_imm(state, LS_LDRSHW, dst, src, offset); + break; + default: + __UNREACHABLE; + break; + } +#endif +} + +/* Load sign-extended immediate into register */ +static inline void emit_load_imm(struct jit_state *state, int dst, int64_t imm) +{ +#if defined(__x86_64__) + if (imm >= INT32_MIN && imm <= INT32_MAX) + emit_alu64_imm32(state, 0xc7, 0, dst, imm); + else { + /* movabs $imm, dst */ + emit_basic_rex(state, 1, 0, dst); + emit1(state, 0xb8 | (dst & 7)); + emit8(state, imm); + } +#elif defined(__aarch64__) + if (imm >= INT32_MIN && imm <= INT32_MAX) + emit_movewide_immediate(state, false, dst, imm); + else + emit_movewide_immediate(state, true, dst, imm); +#endif +} + +/* Store register src to [dst + offset] */ +static inline void emit_store(struct jit_state *state, + enum operand_size size, + int src, + int dst, + int32_t offset) +{ +#if defined(__x86_64__) + if (size == S16) + emit1(state, 0x66); /* 16-bit override */ + emit1(state, size == S8 ? 
0x88 : 0x89); + emit_modrm_and_displacement(state, src, dst, offset); +#elif defined(__aarch64__) + switch (size) { + case S8: + emit_loadstore_imm(state, LS_STRB, src, dst, offset); + break; + case S16: + emit_loadstore_imm(state, LS_STRH, src, dst, offset); + break; + case S32: + emit_loadstore_imm(state, LS_STRW, src, dst, offset); + break; + default: + __UNREACHABLE; + break; + } +#endif +} + +/* Store immediate to [dst + offset] */ +static inline void emit_store_imm32(struct jit_state *state, + enum operand_size size, + int dst, + int32_t offset, + int32_t imm) +{ +#if defined(__x86_64__) + if (size == S16) + emit1(state, 0x66); /* 16-bit override */ + emit1(state, size == S8 ? 0xc6 : 0xc7); + emit_modrm_and_displacement(state, 0, dst, offset); + switch (size) { + case S32: + emit4(state, imm); + break; + case S16: + emit2(state, imm); + break; + case S8: + emit1(state, imm); + break; + default: + __UNREACHABLE; + break; + } +#elif defined(__aarch64__) + emit_load_imm(state, R10, imm); + emit_store(state, size, R10, dst, offset); +#endif +} + +static inline void emit_jmp(struct jit_state *state, uint32_t target_pc) +{ +#if defined(__x86_64__) + emit1(state, 0xe9); + emit_jump_target_address(state, target_pc); +#elif defined(__aarch64__) + struct jump *jump = &state->jumps[state->n_jumps++]; + jump->offset_loc = state->offset; + jump->target_pc = target_pc; + emit_a64(state, UBR_B); +#endif +} + +static inline void emit_call(struct jit_state *state, intptr_t target) +{ +#if defined(__x86_64__) + emit_load_imm(state, RAX, target); + /* callq *%rax */ + emit1(state, 0xff); + /* ModR/M byte: b11010000b = xd0, rax is register 0 */ + emit1(state, 0xd0); +#elif defined(__aarch64__) + uint32_t stack_movement = align_up(8, 16); + emit_addsub_immediate(state, true, AS_SUB, SP, SP, stack_movement); + emit_loadstore_imm(state, LS_STRX, R30, SP, 0); + + emit_movewide_immediate(state, true, temp_imm_reg, target); + emit_uncond_branch_reg(state, BR_BLR, temp_imm_reg); + + 
int dest = map_register(0); + if (dest != R0) { + emit_logical_register(state, true, LOG_ORR, dest, RZ, R0); + } + + emit_loadstore_imm(state, LS_LDRX, R30, SP, 0); + emit_addsub_immediate(state, true, AS_ADD, SP, SP, stack_movement); +#endif +} + +static inline void emit_exit(struct jit_state *state) +{ +#if defined(__x86_64__) + emit1(state, 0xe9); + emit_jump_target_offset(state, state->offset, state->exit_loc); + emit4(state, 0); +#elif defined(__aarch64__) + emit_jmp(state, TARGET_PC_EXIT); +#endif +} + +/* TODO: muldivmod is incomplete, it does not handle imm or overflow now */ +#if RV32_HAS(EXT_M) +static void muldivmod(struct jit_state *state, + uint8_t opcode, + int src, + int dst, + int32_t imm UNUSED) +{ +#if defined(__x86_64__) + bool mul = (opcode & JIT_ALU_OP_MASK) == (JIT_OP_MUL_IMM & JIT_ALU_OP_MASK); + bool div = (opcode & JIT_ALU_OP_MASK) == (JIT_OP_DIV_IMM & JIT_ALU_OP_MASK); + bool mod = (opcode & JIT_ALU_OP_MASK) == (JIT_OP_MOD_IMM & JIT_ALU_OP_MASK); + bool is64 = (opcode & JIT_CLS_MASK) == JIT_CLS_ALU64; + bool reg = (opcode & JIT_SRC_REG) == JIT_SRC_REG; + + /* Short circuit for imm == 0 */ + if (!reg && imm == 0) { + assert(NULL); + if (div || mul) { + /* For division and multiplication, set result to zero. */ + emit_alu32(state, 0x31, dst, dst); + } else { + /* For modulo, set result to dividend. */ + emit_mov(state, dst, dst); + } + return; + } + + if (dst != RAX) + emit_push(state, RAX); + + if (dst != RDX) + emit_push(state, RDX); + + /* Load the divisor into RCX */ + if (imm) + emit_load_imm(state, RCX, imm); + else + emit_mov(state, src, RCX); + + /* Load the dividend into RAX */ + emit_mov(state, dst, RAX); + + /* The JIT employs two different semantics for division and modulus + * operations. In the case of division, if the divisor is zero, the result + * is set to zero. For modulus operations, if the divisor is zero, the + * result becomes the dividend. To manage this, we first set the divisor to + * 1 if it is initially zero. 
Then, we adjust the result accordingly: for + * division, we set it to zero if the original divisor was zero; for + * modulus, we set it to the dividend under the same condition. + */ + + if (div || mod) { + /* Check if divisor is zero */ + if (is64) + emit_alu64(state, 0x85, RCX, RCX); + else + emit_alu32(state, 0x85, RCX, RCX); + + /* Save the dividend for the modulo case */ + if (mod) + emit_push(state, RAX); /* Save dividend */ + + /* Save the result of the test */ + emit1(state, 0x9c); /* pushfq */ + + /* Set the divisor to 1 if it is zero */ + emit_load_imm(state, RDX, 1); + emit1(state, 0x48); + emit1(state, 0x0f); + emit1(state, 0x44); + emit1(state, 0xca); /* cmove rcx, rdx */ + + /* xor %edx,%edx */ + emit_alu32(state, 0x31, RDX, RDX); + } + + if (is64) + emit_rex(state, 1, 0, 0, 0); + + /* Multiply or divide */ + emit_alu32(state, 0xf7, mul ? 4 : 6, RCX); + + /* The division operation stores the remainder in RDX and the quotient + * in RAX. + */ + if (div || mod) { + /* Restore the result of the test */ + emit1(state, 0x9d); /* popfq */ + + /* If zero flag is set, then the divisor was zero. */ + + if (div) { + /* Set the dividend to zero if the divisor was zero. */ + emit_load_imm(state, RCX, 0); + + /* Store 0 in RAX if the divisor was zero. */ + /* Use conditional move to avoid a branch. */ + emit1(state, 0x48); + emit1(state, 0x0f); + emit1(state, 0x44); + emit1(state, 0xc1); /* cmove rax, rcx */ + } else { + /* Restore dividend to RCX */ + emit_pop(state, RCX); + + /* Store the dividend in RAX if the divisor was zero. */ + /* Use conditional move to avoid a branch. 
*/ + emit1(state, 0x48); + emit1(state, 0x0f); + emit1(state, 0x44); + emit1(state, 0xd1); /* cmove rdx, rcx */ + } + } + + if (dst != RDX) { + if (mod) + emit_mov(state, RDX, dst); + emit_pop(state, RDX); + } + if (dst != RAX) { + if (div || mul) + emit_mov(state, RAX, dst); + emit_pop(state, RAX); + } +#elif defined(__aarch64__) + switch (opcode) { + case 0x28: + emit_dataproc_3source(state, false, DP3_MADD, dst, dst, src, RZ); + break; + case 0x2f: + emit_dataproc_3source(state, true, DP3_MADD, dst, dst, src, RZ); + break; + case 0x38: + divmod(state, JIT_OP_DIV_REG, dst, dst, src); + break; + case 0x98: + divmod(state, JIT_OP_MOD_REG, dst, dst, src); + break; + default: + __UNREACHABLE; + break; + } +#endif +} +#endif + +#define SET_SIZE_BITS 10 +#define SET_SIZE (1 << SET_SIZE_BITS) +#define SET_SLOTS_SIZE 32 +HASH_FUNC_IMPL(set_hash, SET_SIZE_BITS, 1 << SET_SIZE_BITS); + +/* The set consists of SET_SIZE buckets, with each bucket containing + * SET_SLOTS_SIZE slots. + */ +typedef struct { + uint32_t table[SET_SIZE][SET_SLOTS_SIZE]; +} set_t; + +/** + * set_reset - clear a set + * @set: a pointer points to target set + */ +static inline void set_reset(set_t *set) +{ + memset(set, 0, sizeof(set_t)); +} + +/** + * set_add - insert a new element into the set + * @set: a pointer points to target set + * @key: the key of the inserted entry + */ +static bool set_add(set_t *set, uint32_t key) +{ + const uint32_t index = set_hash(key); + uint8_t count = 0; + while (set->table[index][count]) { + if (set->table[index][count++] == key) + return false; + } + + set->table[index][count] = key; + return true; +} + +/** + * set_has - check whether the element exist in the set or not + * @set: a pointer points to target set + * @key: the key of the inserted entry + */ +static bool set_has(set_t *set, uint32_t key) +{ + const uint32_t index = set_hash(key); + for (uint8_t count = 0; set->table[index][count]; count++) { + if (set->table[index][count] == key) + return true; + } + 
return false; +} + +static void prepare_translate(struct jit_state *state) +{ +#if defined(__x86_64__) + /* Save platform non-volatile registers */ + for (uint32_t i = 0; i < ARRAYS_SIZE(nonvolatile_reg); i++) + emit_push(state, nonvolatile_reg[i]); + + /* Assuming that the stack is 16-byte aligned just before the call + * instruction that brought us to this code, we need to restore 16-byte + * alignment upon starting execution of the JIT'd code. STACK_SIZE is + * guaranteed to be divisible by 16. However, if an even number of + * registers were pushed onto the stack during state saving (see above), + * an additional 8 bytes must be added to regain 16-byte alignment. + */ + if (!(ARRAYS_SIZE(nonvolatile_reg) % 2)) + emit_alu64_imm32(state, 0x81, 5, RSP, 0x8); + + /* Set JIT R10 (the way to access the frame in JIT) to match RSP. */ + emit_mov(state, RSP, map_register(VM_REG_10)); + + /* Allocate stack space */ + emit_alu64_imm32(state, 0x81, 5, RSP, STACK_SIZE); + +#if defined(_WIN32) + /* Windows x64 ABI requires home register space. */ + /* Allocate home register space - 4 registers */ + emit_alu64_imm32(state, 0x81, 5, RSP, 4 * sizeof(uint64_t)); +#endif + + /* Jump to the entry point, which is stored in the second parameter. */ + emit1(state, 0xff); + emit1(state, 0xe6); + + /* Epilogue */ + state->exit_loc = state->offset; + + /* Move register 0 into rax */ + if (map_register(VM_REG_0) != RAX) + emit_mov(state, map_register(VM_REG_0), RAX); + + /* Deallocate stack space by restoring RSP from JIT R10. 
*/ + emit_mov(state, map_register(VM_REG_10), RSP); + + if (!(ARRAYS_SIZE(nonvolatile_reg) % 2)) + emit_alu64_imm32(state, 0x81, 0, RSP, 0x8); + + /* Restore platform non-volatile registers */ + for (uint32_t i = 0; i < ARRAYS_SIZE(nonvolatile_reg); i++) + emit_pop(state, nonvolatile_reg[ARRAYS_SIZE(nonvolatile_reg) - i - 1]); + + /* Return */ + emit1(state, 0xc3); +#elif defined(__aarch64__) + uint32_t register_space = ARRAYS_SIZE(callee_reg) * 8 + 2 * 8; + state->stack_size = align_up(STACK_SIZE + register_space, 16); + emit_addsub_immediate(state, true, AS_SUB, SP, SP, state->stack_size); + + /* Set up frame */ + emit_loadstorepair_immediate(state, LSP_STPX, R29, R30, SP, 0); + /* In ARM64 calling convention, R29 is the frame pointer. */ + emit_addsub_immediate(state, true, AS_ADD, R29, SP, 0); + + /* Save callee saved registers */ + for (size_t i = 0; i < ARRAYS_SIZE(callee_reg); i += 2) { + emit_loadstorepair_immediate(state, LSP_STPX, callee_reg[i], + callee_reg[i + 1], SP, (i + 2) * 8); + } + + emit_uncond_branch_reg(state, BR_BR, R1); + /* Epilogue */ + state->exit_loc = state->offset; + + /* Move register 0 into R0 */ + if (map_register(0) != R0) { + emit_logical_register(state, true, LOG_ORR, R0, RZ, map_register(0)); + } + + /* Restore callee-saved registers). 
*/ + for (size_t i = 0; i < ARRAYS_SIZE(callee_reg); i += 2) { + emit_loadstorepair_immediate(state, LSP_LDPX, callee_reg[i], + callee_reg[i + 1], SP, (i + 2) * 8); + } + emit_loadstorepair_immediate(state, LSP_LDPX, R29, R30, SP, 0); + emit_addsub_immediate(state, true, AS_ADD, SP, SP, state->stack_size); + emit_uncond_branch_reg(state, BR_RET, R30); +#endif +} + +#define GEN(inst, code) \ + static void do_##inst(struct jit_state *state UNUSED, riscv_t *rv UNUSED, \ + rv_insn_t *ir UNUSED) \ + { \ + code; \ + } +#include "rv32_jit.c" +#undef GEN + +static void do_fuse1(struct jit_state *state, riscv_t *rv UNUSED, rv_insn_t *ir) +{ + opcode_fuse_t *fuse = ir->fuse; + for (int i = 0; i < ir->imm2; i++) { + emit_load_imm(state, temp_reg[0], fuse[i].imm); + emit_store(state, S32, temp_reg[0], parameter_reg[0], + offsetof(struct riscv_internal, X) + 4 * fuse[i].rd); + } +} + +static void do_fuse2(struct jit_state *state, riscv_t *rv UNUSED, rv_insn_t *ir) +{ + emit_load_imm(state, temp_reg[0], ir->imm); + emit_store(state, S32, temp_reg[0], parameter_reg[0], + offsetof(struct riscv_internal, X) + 4 * ir->rd); + emit_load(state, S32, parameter_reg[0], temp_reg[1], + offsetof(struct riscv_internal, X) + 4 * ir->rs1); + emit_alu32(state, 0x01, temp_reg[1], temp_reg[0]); + emit_store(state, S32, temp_reg[0], parameter_reg[0], + offsetof(struct riscv_internal, X) + 4 * ir->rs2); +} + +static void do_fuse3(struct jit_state *state, riscv_t *rv, rv_insn_t *ir) +{ + memory_t *m = ((state_t *) rv->userdata)->mem; + opcode_fuse_t *fuse = ir->fuse; + for (int i = 0; i < ir->imm2; i++) { + emit_load(state, S32, parameter_reg[0], temp_reg[0], + offsetof(struct riscv_internal, X) + 4 * fuse[i].rs1); + emit_load_imm(state, temp_reg[1], + (intptr_t) (m->mem_base + fuse[i].imm)); + emit_alu64(state, 0x01, temp_reg[1], temp_reg[0]); + emit_load(state, S32, parameter_reg[0], temp_reg[1], + offsetof(struct riscv_internal, X) + 4 * fuse[i].rs2); + emit_store(state, S32, temp_reg[1], 
temp_reg[0], 0); + } +} + +static void do_fuse4(struct jit_state *state, riscv_t *rv, rv_insn_t *ir) +{ + memory_t *m = ((state_t *) rv->userdata)->mem; + opcode_fuse_t *fuse = ir->fuse; + for (int i = 0; i < ir->imm2; i++) { + emit_load(state, S32, parameter_reg[0], temp_reg[0], + offsetof(struct riscv_internal, X) + 4 * fuse[i].rs1); + emit_load_imm(state, temp_reg[1], + (intptr_t) (m->mem_base + fuse[i].imm)); + emit_alu64(state, 0x01, temp_reg[1], temp_reg[0]); + emit_load(state, S32, temp_reg[0], temp_reg[1], 0); + emit_store(state, S32, temp_reg[1], parameter_reg[0], + offsetof(struct riscv_internal, X) + 4 * fuse[i].rd); + } +} + +static void do_fuse5(struct jit_state *state, riscv_t *rv UNUSED, rv_insn_t *ir) +{ + emit_load_imm(state, temp_reg[0], ir->pc + 4); + emit_store(state, S32, temp_reg[0], parameter_reg[0], + offsetof(struct riscv_internal, PC)); + emit_call(state, (intptr_t) rv->io.on_memset); + emit_exit(&(*state)); +} + +static void do_fuse6(struct jit_state *state, riscv_t *rv UNUSED, rv_insn_t *ir) +{ + emit_load_imm(state, temp_reg[0], ir->pc + 4); + emit_store(state, S32, temp_reg[0], parameter_reg[0], + offsetof(struct riscv_internal, PC)); + emit_call(state, (intptr_t) rv->io.on_memcpy); + emit_exit(&(*state)); +} + +static void do_fuse7(struct jit_state *state, riscv_t *rv UNUSED, rv_insn_t *ir) +{ + opcode_fuse_t *fuse = ir->fuse; + for (int i = 0; i < ir->imm2; i++) { + switch (fuse[i].opcode) { + case rv_insn_slli: + emit_load(state, S32, parameter_reg[0], temp_reg[0], + offsetof(struct riscv_internal, X) + 4 * fuse[i].rs1); + emit_alu32_imm8(state, 0xc1, 4, temp_reg[0], fuse[i].imm & 0x1f); + emit_store(state, S32, temp_reg[0], parameter_reg[0], + offsetof(struct riscv_internal, X) + 4 * fuse[i].rd); + break; + case rv_insn_srli: + emit_load(state, S32, parameter_reg[0], temp_reg[0], + offsetof(struct riscv_internal, X) + 4 * fuse[i].rs1); + emit_alu32_imm8(state, 0xc1, 5, temp_reg[0], fuse[i].imm & 0x1f); + emit_store(state, S32, 
temp_reg[0], parameter_reg[0], + offsetof(struct riscv_internal, X) + 4 * fuse[i].rd); + break; + case rv_insn_srai: + emit_load(state, S32, parameter_reg[0], temp_reg[0], + offsetof(struct riscv_internal, X) + 4 * fuse[i].rs1); + emit_alu32_imm8(state, 0xc1, 7, temp_reg[0], fuse[i].imm & 0x1f); + emit_store(state, S32, temp_reg[0], parameter_reg[0], + offsetof(struct riscv_internal, X) + 4 * fuse[i].rd); + break; + default: + __UNREACHABLE; + break; + } + } +} + +/* clang-format off */ +static const void *dispatch_table[] = { + /* RV32 instructions */ +#define _(inst, can_branch, insn_len, translatable, reg_mask) [rv_insn_##inst] = do_##inst, + RV_INSN_LIST +#undef _ + /* Macro operation fusion instructions */ +#define _(inst) [rv_insn_##inst] = do_##inst, + FUSE_INSN_LIST +#undef _ +}; +/* clang-format on */ + +typedef void (*codegen_block_func_t)(struct jit_state *, + riscv_t *, + rv_insn_t *); + +static void translate(struct jit_state *state, riscv_t *rv, block_t *block) +{ + uint32_t idx; + rv_insn_t *ir, *next; + for (idx = 0, ir = block->ir_head; idx < block->n_insn; idx++, ir = next) { + next = ir->next; + ((codegen_block_func_t) dispatch_table[ir->opcode])(state, rv, ir); + } +} + +static void resolve_jumps(struct jit_state *state) +{ + for (int i = 0; i < state->n_jumps; i++) { + struct jump jump = state->jumps[i]; + int target_loc; + if (jump.target_offset != 0) + target_loc = jump.target_offset; + else if (jump.target_pc == TARGET_PC_EXIT) + target_loc = state->exit_loc; +#if defined(__x86_64__) + else if (jump.target_pc == TARGET_PC_RETPOLINE) + target_loc = state->retpoline_loc; +#elif defined(__aarch64__) + else if (jump.target_pc == TARGET_PC_ENTER) + target_loc = state->entry_loc; +#endif + else { + target_loc = jump.offset_loc + sizeof(uint32_t); + for (int i = 0; i < state->n_insn; i++) { + if (jump.target_pc == state->offset_map[i].pc) { + target_loc = state->offset_map[i].offset; + break; + } + } + } +#if defined(__x86_64__) + /* Assumes jump 
offset is at end of instruction */ + uint32_t rel = target_loc - (jump.offset_loc + sizeof(uint32_t)); + + uint8_t *offset_ptr = &state->buf[jump.offset_loc]; + memcpy(offset_ptr, &rel, sizeof(uint32_t)); +#elif defined(__aarch64__) + int32_t rel = target_loc - jump.offset_loc; + update_branch_immediate(state, jump.offset_loc, rel); +#endif + } +} + +static void translate_chained_block(struct jit_state *state, + riscv_t *rv, + block_t *block, + set_t *set) +{ + if (set_has(set, block->pc_start)) + return; + + set_add(set, block->pc_start); + offset_map_insert(state, block->pc_start); + translate(state, rv, block); + rv_insn_t *ir = block->ir_tail; + if (ir->branch_untaken && !set_has(set, ir->pc + 4)) { + block_t *block1 = cache_get(rv->block_cache, ir->pc + 4); + if (block1 && block1->translatable) + translate_chained_block(state, rv, block1, set); + } + if (ir->branch_taken && !set_has(set, ir->pc + ir->imm)) { + block_t *block1 = cache_get(rv->block_cache, ir->pc + ir->imm); + if (block1 && block1->translatable) + translate_chained_block(state, rv, block1, set); + } +} + +uint32_t jit_translate(riscv_t *rv, block_t *block) +{ + struct jit_state *state = rv->jit_state; + memset(state->offset_map, 0, MAX_INSNS * sizeof(struct offset_map)); + memset(state->jumps, 0, MAX_INSNS * sizeof(struct jump)); + state->n_insn = 0; + state->n_jumps = 0; + uint32_t entry_loc = state->offset; + set_t set; + set_reset(&set); + translate_chained_block(&(*state), rv, block, &set); + + if (state->offset == state->size) { + printf("Target buffer too small\n"); + goto out; + } + resolve_jumps(&(*state)); +out: + return entry_loc; +} + +struct jit_state *jit_state_init(size_t size) +{ + struct jit_state *state = malloc(sizeof(struct jit_state)); + state->offset = 0; + state->size = size; + state->buf = mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS +#if defined(__APPLE__) + | MAP_JIT +#endif + , + -1, 0); + assert(state->buf != MAP_FAILED); + 
prepare_translate(state); + state->offset_map = calloc(MAX_INSNS, sizeof(struct offset_map)); + state->jumps = calloc(MAX_INSNS, sizeof(struct jump)); + return state; +} + +void jit_state_exit(struct jit_state *state) +{ + munmap(state->buf, state->size); + free(state->offset_map); + free(state->jumps); + free(state); +} diff --git a/src/jit.h b/src/jit.h new file mode 100644 index 000000000..2e816c3a2 --- /dev/null +++ b/src/jit.h @@ -0,0 +1,39 @@ +/* + * rv32emu is freely redistributable under the MIT License. See the file + * "LICENSE" for information on usage and redistribution of this file. + */ + +#pragma once + +#include + +#include "riscv_private.h" + +struct jump { + uint32_t offset_loc; + uint32_t target_pc; + uint32_t target_offset; +}; + +struct offset_map { + uint32_t pc; + uint32_t offset; +}; + +struct jit_state { + uint8_t *buf; + uint32_t offset; + uint32_t stack_size; + uint32_t size; + uint32_t entry_loc; + uint32_t exit_loc; + uint32_t retpoline_loc; + struct offset_map *offset_map; + int n_insn; + struct jump *jumps; + int n_jumps; +}; + +struct jit_state *jit_state_init(size_t size); +void jit_state_exit(struct jit_state *state); +uint32_t jit_translate(riscv_t *rv, block_t *block); diff --git a/src/jit_x64.c b/src/jit_x64.c deleted file mode 100644 index 0dda8ceb5..000000000 --- a/src/jit_x64.c +++ /dev/null @@ -1,577 +0,0 @@ -/* - * rv32emu is freely redistributable under the MIT License. See the file - * "LICENSE" for information on usage and redistribution of this file. - */ - -/* This JIT implementation has undergone extensive modifications, heavily - * relying on the ubpf_jit_x86_64.[ch] from ubpf. The original - * ubpf_jit_x86_64.[ch] file served as the foundation and source of inspiration - * for adapting and tailoring it specifically for this JIT implementation. - * Therefore, credit and sincere thanks are extended to ubpf for their - * invaluable work. 
- * - * Reference: - * https://github.com/iovisor/ubpf/blob/main/vm/ubpf_jit_x86_64.c - */ - -#if !defined(__x86_64__) -#error "This implementation is dedicated to x86-64." -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "cache.h" -#include "decode.h" -#include "io.h" -#include "jit_x64.h" -#include "riscv.h" -#include "utils.h" - -enum VM_REG { - VM_REG_0 = 0, - VM_REG_1, - VM_REG_2, - VM_REG_3, - VM_REG_4, - VM_REG_5, - VM_REG_6, - VM_REG_7, - VM_REG_8, - VM_REG_9, - VM_REG_10, - N_VM_REGS, -}; - -#define X64_CLS_MASK 0x07 -#define X64_ALU_OP_MASK 0xf0 -#define X64_CLS_ALU 0x04 -#define X64_CLS_ALU64 0x07 -#define X64_SRC_IMM 0x00 -#define X64_SRC_REG 0x08 -#define X64_OP_MUL_IMM (X64_CLS_ALU | X64_SRC_IMM | 0x20) -#define X64_OP_MUL_REG (X64_CLS_ALU | X64_SRC_REG | 0x20) -#define X64_OP_DIV_IMM (X64_CLS_ALU | X64_SRC_IMM | 0x30) -#define X64_OP_DIV_REG (X64_CLS_ALU | X64_SRC_REG | 0x30) -#define X64_OP_MOD_IMM (X64_CLS_ALU | X64_SRC_IMM | 0x90) -#define X64_OP_MOD_REG (X64_CLS_ALU | X64_SRC_REG | 0x90) - -#define STACK_SIZE 512 -#define MAX_INSNS 1024 - -#if RV32_HAS(EXT_M) -static void muldivmod(struct jit_state *state, - uint8_t opcode, - int src, - int dst, - int32_t imm) -{ - bool mul = (opcode & X64_ALU_OP_MASK) == (X64_OP_MUL_IMM & X64_ALU_OP_MASK); - bool div = (opcode & X64_ALU_OP_MASK) == (X64_OP_DIV_IMM & X64_ALU_OP_MASK); - bool mod = (opcode & X64_ALU_OP_MASK) == (X64_OP_MOD_IMM & X64_ALU_OP_MASK); - bool is64 = (opcode & X64_CLS_MASK) == X64_CLS_ALU64; - bool reg = (opcode & X64_SRC_REG) == X64_SRC_REG; - - /* Short circuit for imm == 0 */ - if (!reg && imm == 0) { - assert(NULL); - if (div || mul) { - /* For division and multiplication, set result to zero. */ - emit_alu32(state, 0x31, dst, dst); - } else { - /* For modulo, set result to dividend. 
*/ - emit_mov(state, dst, dst); - } - return; - } - - if (dst != RAX) - emit_push(state, RAX); - - if (dst != RDX) - emit_push(state, RDX); - - /* Load the divisor into RCX */ - if (imm) - emit_load_imm(state, RCX, imm); - else - emit_mov(state, src, RCX); - - /* Load the dividend into RAX */ - emit_mov(state, dst, RAX); - - /* The JIT employs two different semantics for division and modulus - * operations. In the case of division, if the divisor is zero, the result - * is set to zero. For modulus operations, if the divisor is zero, the - * result becomes the dividend. To manage this, we first set the divisor to - * 1 if it is initially zero. Then, we adjust the result accordingly: for - * division, we set it to zero if the original divisor was zero; for - * modulus, we set it to the dividend under the same condition. - */ - - if (div || mod) { - /* Check if divisor is zero */ - if (is64) - emit_alu64(state, 0x85, RCX, RCX); - else - emit_alu32(state, 0x85, RCX, RCX); - - /* Save the dividend for the modulo case */ - if (mod) - emit_push(state, RAX); /* Save dividend */ - - /* Save the result of the test */ - emit1(state, 0x9c); /* pushfq */ - - /* Set the divisor to 1 if it is zero */ - emit_load_imm(state, RDX, 1); - emit1(state, 0x48); - emit1(state, 0x0f); - emit1(state, 0x44); - emit1(state, 0xca); /* cmove rcx, rdx */ - - /* xor %edx,%edx */ - emit_alu32(state, 0x31, RDX, RDX); - } - - if (is64) - emit_rex(state, 1, 0, 0, 0); - - /* Multiply or divide */ - emit_alu32(state, 0xf7, mul ? 4 : 6, RCX); - - /* The division operation stores the remainder in RDX and the quotient - * in RAX. - */ - if (div || mod) { - /* Restore the result of the test */ - emit1(state, 0x9d); /* popfq */ - - /* If zero flag is set, then the divisor was zero. */ - - if (div) { - /* Set the dividend to zero if the divisor was zero. */ - emit_load_imm(state, RCX, 0); - - /* Store 0 in RAX if the divisor was zero. */ - /* Use conditional move to avoid a branch. 
*/ - emit1(state, 0x48); - emit1(state, 0x0f); - emit1(state, 0x44); - emit1(state, 0xc1); /* cmove rax, rcx */ - } else { - /* Restore dividend to RCX */ - emit_pop(state, RCX); - - /* Store the dividend in RAX if the divisor was zero. */ - /* Use conditional move to avoid a branch. */ - emit1(state, 0x48); - emit1(state, 0x0f); - emit1(state, 0x44); - emit1(state, 0xd1); /* cmove rdx, rcx */ - } - } - - if (dst != RDX) { - if (mod) - emit_mov(state, RDX, dst); - emit_pop(state, RDX); - } - if (dst != RAX) { - if (div || mul) - emit_mov(state, RAX, dst); - emit_pop(state, RAX); - } -} -#endif - -#define REGISTER_MAP_SIZE 11 - -/* There are two common x86-64 calling conventions, discussed at: - * https://en.wikipedia.org/wiki/X64_calling_conventions#x86-64_calling_conventions - * - * Please note: R12 is an exception and is *not* being used. Consequently, it - * is omitted from the list of non-volatile registers for both platforms, - * despite being non-volatile. - */ -#if defined(_WIN32) -static int nonvolatile_reg[] = {RBP, RBX, RDI, RSI, R13, R14, R15}; -static int parameter_reg[] = {RCX, RDX, R8, R9}; -#define RCX_ALT R10 -static int register_map[REGISTER_MAP_SIZE] = { - RAX, R10, RDX, R8, R9, R14, R15, RDI, RSI, RBX, RBP, -}; -#else -#define RCX_ALT R9 -static const int nonvolatile_reg[] = {RBP, RBX, R13, R14, R15}; -static const int parameter_reg[] = {RDI, RSI, RDX, RCX, R8, R9}; -static const int register_map[REGISTER_MAP_SIZE] = { - RAX, RDI, RSI, RDX, R9, R8, RBX, R13, R14, R15, RBP, -}; -#endif - -/* Return the x86 register for the given JIT register */ -static int map_register(int r) -{ - assert(r < N_VM_REGS); - return register_map[r % N_VM_REGS]; -} - -#define SET_SIZE_BITS 10 -#define SET_SIZE (1 << SET_SIZE_BITS) -#define SET_SLOTS_SIZE 32 -HASH_FUNC_IMPL(set_hash, SET_SIZE_BITS, 1 << SET_SIZE_BITS); - -/* The set consists of SET_SIZE buckets, with each bucket containing - * SET_SLOTS_SIZE slots. 
- */ -typedef struct { - uint32_t table[SET_SIZE][SET_SLOTS_SIZE]; -} set_t; - -/** - * set_reset - clear a set - * @set: a pointer points to target set - */ -static inline void set_reset(set_t *set) -{ - memset(set, 0, sizeof(set_t)); -} - -/** - * set_add - insert a new element into the set - * @set: a pointer points to target set - * @key: the key of the inserted entry - */ -static bool set_add(set_t *set, uint32_t key) -{ - const uint32_t index = set_hash(key); - uint8_t count = 0; - while (set->table[index][count]) { - if (set->table[index][count++] == key) - return false; - } - - set->table[index][count] = key; - return true; -} - -/** - * set_has - check whether the element exist in the set or not - * @set: a pointer points to target set - * @key: the key of the inserted entry - */ -static bool set_has(set_t *set, uint32_t key) -{ - const uint32_t index = set_hash(key); - for (uint8_t count = 0; set->table[index][count]; count++) { - if (set->table[index][count] == key) - return true; - } - return false; -} - -#define UPDATE_PC(pc) \ - emit_load_imm(state, RAX, (pc)); \ - emit_store(state, S32, RAX, parameter_reg[0], \ - offsetof(struct riscv_internal, PC)); - -static void prepare_translate(struct jit_state *state) -{ - /* Save platform non-volatile registers */ - for (uint32_t i = 0; i < ARRAYS_SIZE(nonvolatile_reg); i++) - emit_push(state, nonvolatile_reg[i]); - - /* Assuming that the stack is 16-byte aligned just before the call - * instruction that brought us to this code, we need to restore 16-byte - * alignment upon starting execution of the JIT'd code. STACK_SIZE is - * guaranteed to be divisible by 16. However, if an even number of - * registers were pushed onto the stack during state saving (see above), - * an additional 8 bytes must be added to regain 16-byte alignment. - */ - if (!(ARRAYS_SIZE(nonvolatile_reg) % 2)) - emit_alu64_imm32(state, 0x81, 5, RSP, 0x8); - - /* Set JIT R10 (the way to access the frame in JIT) to match RSP. 
*/ - emit_mov(state, RSP, map_register(VM_REG_10)); - - /* Allocate stack space */ - emit_alu64_imm32(state, 0x81, 5, RSP, STACK_SIZE); - -#if defined(_WIN32) - /* Windows x64 ABI requires home register space. */ - /* Allocate home register space - 4 registers */ - emit_alu64_imm32(state, 0x81, 5, RSP, 4 * sizeof(uint64_t)); -#endif - - /* Jump to the entry point, which is stored in the second parameter. */ - emit1(state, 0xff); - emit1(state, 0xe6); - - /* Epilogue */ - state->exit_loc = state->offset; - - /* Move register 0 into rax */ - if (map_register(VM_REG_0) != RAX) - emit_mov(state, map_register(VM_REG_0), RAX); - - /* Deallocate stack space by restoring RSP from JIT R10. */ - emit_mov(state, map_register(VM_REG_10), RSP); - - if (!(ARRAYS_SIZE(nonvolatile_reg) % 2)) - emit_alu64_imm32(state, 0x81, 0, RSP, 0x8); - - /* Restore platform non-volatile registers */ - for (uint32_t i = 0; i < ARRAYS_SIZE(nonvolatile_reg); i++) - emit_pop(state, nonvolatile_reg[ARRAYS_SIZE(nonvolatile_reg) - i - 1]); - - /* Return */ - emit1(state, 0xc3); -} - -#define X64(inst, code) \ - static void do_##inst(struct jit_state *state UNUSED, riscv_t *rv UNUSED, \ - rv_insn_t *ir UNUSED) \ - { \ - code; \ - } -#include "rv32_jit_template.c" -#undef X64 - -static void do_fuse1(struct jit_state *state, riscv_t *rv UNUSED, rv_insn_t *ir) -{ - opcode_fuse_t *fuse = ir->fuse; - for (int i = 0; i < ir->imm2; i++) { - emit_load_imm(state, RAX, fuse[i].imm); - emit_store(state, S32, RAX, parameter_reg[0], - offsetof(struct riscv_internal, X) + 4 * fuse[i].rd); - } -} - -static void do_fuse2(struct jit_state *state, riscv_t *rv UNUSED, rv_insn_t *ir) -{ - emit_load_imm(state, RAX, ir->imm); - emit_store(state, S32, RAX, parameter_reg[0], - offsetof(struct riscv_internal, X) + 4 * ir->rd); - emit_load(state, S32, parameter_reg[0], RBX, - offsetof(struct riscv_internal, X) + 4 * ir->rs1); - emit_alu32(state, 0x01, RBX, RAX); - emit_store(state, S32, RAX, parameter_reg[0], - offsetof(struct 
riscv_internal, X) + 4 * ir->rs2); -} - -static void do_fuse3(struct jit_state *state, riscv_t *rv, rv_insn_t *ir) -{ - memory_t *m = ((state_t *) rv->userdata)->mem; - opcode_fuse_t *fuse = ir->fuse; - for (int i = 0; i < ir->imm2; i++) { - emit_load(state, S32, parameter_reg[0], RAX, - offsetof(struct riscv_internal, X) + 4 * fuse[i].rs1); - emit_load_imm(state, RBX, (intptr_t) (m->mem_base + fuse[i].imm)); - emit_alu64(state, 0x01, RBX, RAX); - emit_load(state, S32, parameter_reg[0], RBX, - offsetof(struct riscv_internal, X) + 4 * fuse[i].rs2); - emit_store(state, S32, RBX, RAX, 0); - } -} - -static void do_fuse4(struct jit_state *state, riscv_t *rv, rv_insn_t *ir) -{ - memory_t *m = ((state_t *) rv->userdata)->mem; - opcode_fuse_t *fuse = ir->fuse; - for (int i = 0; i < ir->imm2; i++) { - emit_load(state, S32, parameter_reg[0], RAX, - offsetof(struct riscv_internal, X) + 4 * fuse[i].rs1); - emit_load_imm(state, RBX, (intptr_t) (m->mem_base + fuse[i].imm)); - emit_alu64(state, 0x01, RBX, RAX); - emit_load(state, S32, RAX, RBX, 0); - emit_store(state, S32, RBX, parameter_reg[0], - offsetof(struct riscv_internal, X) + 4 * fuse[i].rd); - } -} - -static void do_fuse5(struct jit_state *state, riscv_t *rv UNUSED, rv_insn_t *ir) -{ - emit_load_imm(state, RAX, ir->pc + 4); - emit_store(state, S32, RAX, parameter_reg[0], - offsetof(struct riscv_internal, PC)); - emit_call(state, (intptr_t) rv->io.on_memset); - emit_exit(&(*state)); -} - -static void do_fuse6(struct jit_state *state, riscv_t *rv UNUSED, rv_insn_t *ir) -{ - emit_load_imm(state, RAX, ir->pc + 4); - emit_store(state, S32, RAX, parameter_reg[0], - offsetof(struct riscv_internal, PC)); - emit_call(state, (intptr_t) rv->io.on_memcpy); - emit_exit(&(*state)); -} - -static void do_fuse7(struct jit_state *state, riscv_t *rv UNUSED, rv_insn_t *ir) -{ - opcode_fuse_t *fuse = ir->fuse; - for (int i = 0; i < ir->imm2; i++) { - switch (fuse[i].opcode) { - case rv_insn_slli: - emit_load(state, S32, parameter_reg[0], 
RAX, - offsetof(struct riscv_internal, X) + 4 * fuse[i].rs1); - emit_alu32_imm8(state, 0xc1, 4, RAX, fuse[i].imm & 0x1f); - emit_store(state, S32, RAX, parameter_reg[0], - offsetof(struct riscv_internal, X) + 4 * fuse[i].rd); - break; - case rv_insn_srli: - emit_load(state, S32, parameter_reg[0], RAX, - offsetof(struct riscv_internal, X) + 4 * fuse[i].rs1); - emit_alu32_imm8(state, 0xc1, 5, RAX, fuse[i].imm & 0x1f); - emit_store(state, S32, RAX, parameter_reg[0], - offsetof(struct riscv_internal, X) + 4 * fuse[i].rd); - break; - case rv_insn_srai: - emit_load(state, S32, parameter_reg[0], RAX, - offsetof(struct riscv_internal, X) + 4 * fuse[i].rs1); - emit_alu32_imm8(state, 0xc1, 7, RAX, fuse[i].imm & 0x1f); - emit_store(state, S32, RAX, parameter_reg[0], - offsetof(struct riscv_internal, X) + 4 * fuse[i].rd); - break; - default: - __UNREACHABLE; - break; - } - } -} - -/* clang-format off */ -static const void *dispatch_table[] = { - /* RV32 instructions */ -#define _(inst, can_branch, insn_len, translatable, reg_mask) [rv_insn_##inst] = do_##inst, - RV_INSN_LIST -#undef _ - /* Macro operation fusion instructions */ -#define _(inst) [rv_insn_##inst] = do_##inst, - FUSE_INSN_LIST -#undef _ -}; -/* clang-format on */ - -typedef void (*codegen_block_func_t)(struct jit_state *, - riscv_t *, - rv_insn_t *); - -static void translate(struct jit_state *state, riscv_t *rv, block_t *block) -{ - uint32_t idx; - rv_insn_t *ir, *next; - for (idx = 0, ir = block->ir_head; idx < block->n_insn; idx++, ir = next) { - next = ir->next; - ((codegen_block_func_t) dispatch_table[ir->opcode])(state, rv, ir); - } -} - -static void resolve_jumps(struct jit_state *state) -{ - for (int i = 0; i < state->num_jumps; i++) { - struct jump jump = state->jumps[i]; - int target_loc; - if (jump.target_offset != 0) - target_loc = jump.target_offset; - else if (jump.target_pc == TARGET_PC_EXIT) - target_loc = state->exit_loc; - else if (jump.target_pc == TARGET_PC_RETPOLINE) - target_loc = 
state->retpoline_loc; - else { - target_loc = jump.offset_loc + sizeof(uint32_t); - for (int i = 0; i < state->num_insn; i++) { - if (jump.target_pc == state->offset_map[i].PC) { - target_loc = state->offset_map[i].offset; - break; - } - } - } - /* Assumes jump offset is at end of instruction */ - uint32_t rel = target_loc - (jump.offset_loc + sizeof(uint32_t)); - - uint8_t *offset_ptr = &state->buf[jump.offset_loc]; - memcpy(offset_ptr, &rel, sizeof(uint32_t)); - } -} - -static void translate_chained_block(struct jit_state *state, - riscv_t *rv, - block_t *block, - set_t *set) -{ - if (set_has(set, block->pc_start)) - return; - - set_add(set, block->pc_start); - offset_map_insert(state, block->pc_start); - translate(state, rv, block); - rv_insn_t *ir = block->ir_tail; - if (ir->branch_untaken && !set_has(set, ir->pc + 4)) { - block_t *block1 = cache_get(rv->block_cache, ir->pc + 4); - if (block1 && block1->translatable) - translate_chained_block(state, rv, block1, set); - } - if (ir->branch_taken && !set_has(set, ir->pc + ir->imm)) { - block_t *block1 = cache_get(rv->block_cache, ir->pc + ir->imm); - if (block1 && block1->translatable) - translate_chained_block(state, rv, block1, set); - } -} - -uint32_t translate_x64(riscv_t *rv, block_t *block) -{ - struct jit_state *state = rv->jit_state; - memset(state->offset_map, 0, MAX_INSNS * sizeof(struct offset_map)); - memset(state->jumps, 0, MAX_INSNS * sizeof(struct jump)); - state->num_insn = 0; - state->num_jumps = 0; - uint32_t entry_loc = state->offset; - set_t set; - set_reset(&set); - translate_chained_block(&(*state), rv, block, &set); - - if (state->offset == state->size) { - printf("Target buffer too small\n"); - goto out; - } - resolve_jumps(&(*state)); -out: - return entry_loc; -} - -struct jit_state *init_state(size_t size) -{ - struct jit_state *state = malloc(sizeof(struct jit_state)); - state->offset = 0; - state->size = size; - state->buf = mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, - 
MAP_PRIVATE | MAP_ANONYMOUS -#if defined(__APPLE__) - | MAP_JIT -#endif - , - -1, 0); - assert(state->buf != MAP_FAILED); - prepare_translate(state); - state->offset_map = calloc(MAX_INSNS, sizeof(struct offset_map)); - state->jumps = calloc(MAX_INSNS, sizeof(struct jump)); - return state; -} - -void destroy_state(struct jit_state *state) -{ - munmap(state->buf, state->size); - free(state->offset_map); - free(state->jumps); - free(state); -} diff --git a/src/jit_x64.h b/src/jit_x64.h deleted file mode 100644 index 3d3799e14..000000000 --- a/src/jit_x64.h +++ /dev/null @@ -1,407 +0,0 @@ -/* - * rv32emu is freely redistributable under the MIT License. See the file - * "LICENSE" for information on usage and redistribution of this file. - */ - -#pragma once - -#include -#include -#include -#include -#include - -#include "riscv_private.h" - -enum X64_REG { - RAX, - RCX, - RDX, - RBX, - RSP, - RBP, - RIP = 5, - RSI, - RDI, - R8, - R9, - R10, - R11, - R12, - R13, - R14, - R15, -}; - -enum operand_size { - S8, - S16, - S32, -}; - -struct jump { - uint32_t offset_loc; - uint32_t target_pc; - uint32_t target_offset; -}; - -/* Special values for target_pc in struct jump */ -#define TARGET_PC_EXIT -1U -#define TARGET_PC_RETPOLINE -3U - -struct offset_map { - uint32_t PC; - uint32_t offset; -}; - -struct jit_state { - uint8_t *buf; - uint32_t offset; - uint32_t size; - uint32_t exit_loc; - uint32_t retpoline_loc; - struct offset_map *offset_map; - int num_insn; - struct jump *jumps; - int num_jumps; -}; - -struct jit_state *init_state(size_t size); -void destroy_state(struct jit_state *state); -uint32_t translate_x64(riscv_t *rv, block_t *block); - -static inline void offset_map_insert(struct jit_state *state, int32_t target_pc) -{ - struct offset_map *map_entry = &state->offset_map[state->num_insn++]; - map_entry->PC = target_pc; - map_entry->offset = state->offset; -} - -static inline void emit_bytes(struct jit_state *state, void *data, uint32_t len) -{ - assert(state->offset 
<= state->size - len); - if ((state->offset + len) > state->size) { - state->offset = state->size; - return; - } - memcpy(state->buf + state->offset, data, len); - state->offset += len; -} - -static inline void emit1(struct jit_state *state, uint8_t x) -{ - emit_bytes(state, &x, sizeof(x)); -} - -static inline void emit2(struct jit_state *state, uint16_t x) -{ - emit_bytes(state, &x, sizeof(x)); -} - -static inline void emit4(struct jit_state *state, uint32_t x) -{ - emit_bytes(state, &x, sizeof(x)); -} - -static inline void emit8(struct jit_state *state, uint64_t x) -{ - emit_bytes(state, &x, sizeof(x)); -} - -static inline void emit_jump_target_address(struct jit_state *state, - int32_t target_pc) -{ - struct jump *jump = &state->jumps[state->num_jumps++]; - jump->offset_loc = state->offset; - jump->target_pc = target_pc; - emit4(state, 0); -} - -static inline void emit_jump_target_offset(struct jit_state *state, - uint32_t jump_loc, - uint32_t jump_state_offset) -{ - struct jump *jump = &state->jumps[state->num_jumps++]; - jump->offset_loc = jump_loc; - jump->target_offset = jump_state_offset; -} - -static inline void emit_modrm(struct jit_state *state, int mod, int r, int m) -{ - assert(!(mod & ~0xc0)); - emit1(state, (mod & 0xc0) | ((r & 7) << 3) | (m & 7)); -} - -static inline void emit_modrm_reg2reg(struct jit_state *state, int r, int m) -{ - emit_modrm(state, 0xc0, r, m); -} - -static inline void emit_modrm_and_displacement(struct jit_state *state, - int r, - int m, - int32_t d) -{ - if (d == 0 && (m & 7) != RBP) { - emit_modrm(state, 0x00, r, m); - } else if (d >= -128 && d <= 127) { - emit_modrm(state, 0x40, r, m); - emit1(state, d); - } else { - emit_modrm(state, 0x80, r, m); - emit4(state, d); - } -} - -static inline void emit_rex(struct jit_state *state, int w, int r, int x, int b) -{ - assert(!(w & ~1)); - assert(!(r & ~1)); - assert(!(x & ~1)); - assert(!(b & ~1)); - emit1(state, 0x40 | (w << 3) | (r << 2) | (x << 1) | b); -} - -/* Emit a REX prefix 
incorporating the top bit of both src and dst. This step is - * skipped if no bits are set. - */ -static inline void emit_basic_rex(struct jit_state *state, - int w, - int src, - int dst) -{ - if (w || (src & 8) || (dst & 8)) - emit_rex(state, w, !!(src & 8), 0, !!(dst & 8)); -} - -static inline void emit_push(struct jit_state *state, int r) -{ - emit_basic_rex(state, 0, 0, r); - emit1(state, 0x50 | (r & 7)); -} - -static inline void emit_pop(struct jit_state *state, int r) -{ - emit_basic_rex(state, 0, 0, r); - emit1(state, 0x58 | (r & 7)); -} - -/* The REX prefix and ModRM byte are emitted. - * The MR encoding is utilized when a choice is available. The 'src' is often - * used as an opcode extension. - */ -static inline void emit_alu32(struct jit_state *state, int op, int src, int dst) -{ - emit_basic_rex(state, 0, src, dst); - emit1(state, op); - emit_modrm_reg2reg(state, src, dst); -} - -/* REX prefix, ModRM byte, and 32-bit immediate */ -static inline void emit_alu32_imm32(struct jit_state *state, - int op, - int src, - int dst, - int32_t imm) -{ - emit_alu32(state, op, src, dst); - emit4(state, imm); -} - -/* REX prefix, ModRM byte, and 8-bit immediate */ -static inline void emit_alu32_imm8(struct jit_state *state, - int op, - int src, - int dst, - int8_t imm) -{ - emit_alu32(state, op, src, dst); - emit1(state, imm); -} - -/* The REX.W prefix and ModRM byte are emitted. - * The MR encoding is used when there is a choice. 'src' is often used as - * an opcode extension. 
- */ -static inline void emit_alu64(struct jit_state *state, int op, int src, int dst) -{ - emit_basic_rex(state, 1, src, dst); - emit1(state, op); - emit_modrm_reg2reg(state, src, dst); -} - -/* REX.W prefix, ModRM byte, and 32-bit immediate */ -static inline void emit_alu64_imm32(struct jit_state *state, - int op, - int src, - int dst, - int32_t imm) -{ - emit_alu64(state, op, src, dst); - emit4(state, imm); -} - -/* REX.W prefix, ModRM byte, and 8-bit immediate */ -static inline void emit_alu64_imm8(struct jit_state *state, - int op, - int src, - int dst, - int8_t imm) -{ - emit_alu64(state, op, src, dst); - emit1(state, imm); -} - -/* Register to register mov */ -static inline void emit_mov(struct jit_state *state, int src, int dst) -{ - emit_alu64(state, 0x89, src, dst); -} - -static inline void emit_cmp_imm32(struct jit_state *state, int dst, int32_t imm) -{ - emit_alu64_imm32(state, 0x81, 7, dst, imm); -} - -static inline void emit_cmp32_imm32(struct jit_state *state, - int dst, - int32_t imm) -{ - emit_alu32_imm32(state, 0x81, 7, dst, imm); -} - -static inline void emit_cmp(struct jit_state *state, int src, int dst) -{ - emit_alu64(state, 0x39, src, dst); -} - -static inline void emit_cmp32(struct jit_state *state, int src, int dst) -{ - emit_alu32(state, 0x39, src, dst); -} - -static inline void emit_jcc(struct jit_state *state, - int code, - int32_t target_pc) -{ - emit1(state, 0x0f); - emit1(state, code); - emit_jump_target_address(state, target_pc); -} - -static inline void emit_jcc_offset(struct jit_state *state, int code) -{ - emit1(state, 0x0f); - emit1(state, code); - emit4(state, 0); -} - -/* Load [src + offset] into dst */ -static inline void emit_load(struct jit_state *state, - enum operand_size size, - int src, - int dst, - int32_t offset) -{ - if (size == S8 || size == S16) { - /* movzx */ - emit1(state, 0x0f); - emit1(state, size == S8 ? 
0xb6 : 0xb7); - } else if (size == S32) { - /* mov */ - emit1(state, 0x8b); - } - - emit_modrm_and_displacement(state, dst, src, offset); -} - -static inline void emit_load_sext(struct jit_state *state, - enum operand_size size, - int src, - int dst, - int32_t offset) -{ - if (size == S8 || size == S16) { - /* movsx */ - emit1(state, 0x0f); - emit1(state, size == S8 ? 0xbe : 0xbf); - } else if (size == S32) { - emit_basic_rex(state, 1, dst, src); - emit1(state, 0x63); - } - - emit_modrm_and_displacement(state, dst, src, offset); -} - -/* Load sign-extended immediate into register */ -static inline void emit_load_imm(struct jit_state *state, int dst, int64_t imm) -{ - if (imm >= INT32_MIN && imm <= INT32_MAX) { - emit_alu64_imm32(state, 0xc7, 0, dst, imm); - } else { - /* movabs $imm, dst */ - emit_basic_rex(state, 1, 0, dst); - emit1(state, 0xb8 | (dst & 7)); - emit8(state, imm); - } -} - -/* Store register src to [dst + offset] */ -static inline void emit_store(struct jit_state *state, - enum operand_size size, - int src, - int dst, - int32_t offset) -{ - if (size == S16) - emit1(state, 0x66); /* 16-bit override */ - emit1(state, size == S8 ? 0x88 : 0x89); - emit_modrm_and_displacement(state, src, dst, offset); -} - -/* Store immediate to [dst + offset] */ -static inline void emit_store_imm32(struct jit_state *state, - enum operand_size size, - int dst, - int32_t offset, - int32_t imm) -{ - if (size == S16) - emit1(state, 0x66); /* 16-bit override */ - emit1(state, size == S8 ? 
0xc6 : 0xc7); - emit_modrm_and_displacement(state, 0, dst, offset); - if (size == S32) { - emit4(state, imm); - } else if (size == S16) { - emit2(state, imm); - } else if (size == S8) { - emit1(state, imm); - } -} - -static inline void emit_ret(struct jit_state *state) -{ - emit1(state, 0xc3); -} - -static inline void emit_jmp(struct jit_state *state, uint32_t target_pc) -{ - emit1(state, 0xe9); - emit_jump_target_address(state, target_pc); -} - -static inline void emit_call(struct jit_state *state, intptr_t target) -{ - emit_load_imm(state, RAX, (intptr_t) target); - /* callq *%rax */ - emit1(state, 0xff); - /* ModR/M byte: b11010000b = xd0, rax is register 0 */ - emit1(state, 0xd0); -} - -static inline void emit_exit(struct jit_state *state) -{ - emit1(state, 0xe9); - emit_jump_target_offset(state, state->offset, state->exit_loc); - emit4(state, 0); -} diff --git a/src/riscv.c b/src/riscv.c index 268d8e3fe..c5c14e4b8 100644 --- a/src/riscv.c +++ b/src/riscv.c @@ -14,7 +14,7 @@ #include "utils.h" #if RV32_HAS(JIT) #include "cache.h" -#include "jit_x64.h" +#include "jit.h" #define CODE_CACHE_SIZE (1024 * 1024) #endif @@ -132,7 +132,7 @@ riscv_t *rv_create(const riscv_io_t *io, /* initialize the block map */ block_map_init(&rv->block_map, BLOCK_MAP_CAPACITY_BITS); #else - rv->jit_state = init_state(CODE_CACHE_SIZE); + rv->jit_state = jit_state_init(CODE_CACHE_SIZE); rv->block_cache = cache_create(BLOCK_MAP_CAPACITY_BITS); #endif /* reset */ @@ -162,7 +162,7 @@ void rv_delete(riscv_t *rv) #if !RV32_HAS(JIT) block_map_destroy(rv); #else - destroy_state(rv->jit_state); + jit_state_exit(rv->jit_state); cache_free(rv->block_cache); #endif free(rv); diff --git a/src/rv32_template.c b/src/rv32_template.c index 46c0ff40a..cf139838d 100644 --- a/src/rv32_template.c +++ b/src/rv32_template.c @@ -14,22 +14,22 @@ * * addi, * { rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) + ir->imm; }, - * x64({ - * ld, S32, RAX, X, rs1; - * alu32_imm, 32, 0x81, 0, RAX, imm; - * st, S32, RAX, X, 
rd; + * GEN({ + * ld, S32, TMP0, X, rs1; + * alu32_imm, 32, 0x81, 0, TMP0, imm; + * st, S32, TMP0, X, rd; * }) * - * The block defined as 'X64' is mapped to the generic C code used in the + * The block defined as 'GEN' is mapped to the generic C code used in the * interpreter. The following instructions will be generated by JIT compiler: - * - Load X->rs1 (target field) from the rv data structure to RAX (destination + * - Load X->rs1 (target field) from the rv data structure to TMP0 (destination * register). * - Do ALU operation on 0 (source register) and imm and store the result into - * RAX (destination register). - * - Store RAX (source register) value to the X->rd (target field) of the rv + * TMP0 (destination register). + * - Store TMP0 (source register) value to the X->rd (target field) of the rv * data structure. * - * The parameter of x64 instruction API + * The parameter of x64 or arm64 instruction API * - size: size of data * - op: opcode * - src: source register @@ -81,7 +81,7 @@ RVOP( nop, { rv->X[rv_reg_zero] = 0; }, - X64({})) + GEN({/* no operation */})) /* LUI is used to build 32-bit constants and uses the U-type format. LUI * places the U-immediate value in the top 20 bits of the destination @@ -91,9 +91,9 @@ RVOP( RVOP( lui, { rv->X[ir->rd] = ir->imm; }, - X64({ - ld_imm, RAX, imm; - st, S32, RAX, X, rd; + GEN({ + ld_imm, TMP0, imm; + st, S32, TMP0, X, rd; })) /* AUIPC is used to build pc-relative addresses and uses the U-type format. 
@@ -104,9 +104,9 @@ RVOP( RVOP( auipc, { rv->X[ir->rd] = ir->imm + PC; }, - X64({ - ld_imm, RAX, pc, imm; - st, S32, RAX, X, rd; + GEN({ + ld_imm, TMP0, pc, imm; + st, S32, TMP0, X, rd; })) /* JAL: Jump and Link @@ -143,13 +143,13 @@ RVOP( rv->PC = PC; return true; }, - X64({ + GEN({ cond, rd; - ld_imm, RAX, pc, 4; - st, S32, RAX, X, rd; + ld_imm, TMP0, pc, 4; + st, S32, TMP0, X, rd; end; - ld_imm, RAX, pc, imm; - st, S32, RAX, PC; + ld_imm, TMP0, pc, imm; + st, S32, TMP0, PC; jmp, pc, imm; exit; })) @@ -202,15 +202,15 @@ RVOP( rv->PC = PC; return true; }, - X64({ + GEN({ cond, rd; - ld_imm, RAX, pc, 4; - st, S32, RAX, X, rd; + ld_imm, TMP0, pc, 4; + st, S32, TMP0, X, rd; end; - ld, S32, RAX, X, rs1; - alu32_imm, 32, 0x81, 0, RAX, imm; - alu32_imm, 32, 0x81, 4, RAX, ~1U; - st, S32, RAX, PC; + ld, S32, TMP0, X, rs1; + alu32_imm, 32, 0x81, 0, TMP0, imm; + alu32_imm, 32, 0x81, 4, TMP0, ~1U; + st, S32, TMP0, PC; exit; })) @@ -286,24 +286,24 @@ RVOP( RVOP( beq, { BRANCH_FUNC(uint32_t, !=); }, - X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - cmp, RBX, RAX; + GEN({ + ld, S32, TMP0, X, rs1; + ld, S32, TMP1, X, rs2; + cmp, TMP1, TMP0; set_jmp_off; jcc, 0x84; cond, branch_untaken; jmp, pc, 4; end; - ld_imm, RAX, pc, 4; - st, S32, RAX, PC; + ld_imm, TMP0, pc, 4; + st, S32, TMP0, PC; exit; jmp_off; cond, branch_taken; jmp, pc, imm; end; - ld_imm, RAX, pc, imm; - st, S32, RAX, PC; + ld_imm, TMP0, pc, imm; + st, S32, TMP0, PC; exit; })) @@ -311,24 +311,24 @@ RVOP( RVOP( bne, { BRANCH_FUNC(uint32_t, ==); }, - X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - cmp, RBX, RAX; + GEN({ + ld, S32, TMP0, X, rs1; + ld, S32, TMP1, X, rs2; + cmp, TMP1, TMP0; set_jmp_off; jcc, 0x85; cond, branch_untaken; jmp, pc, 4; end; - ld_imm, RAX, pc, 4; - st, S32, RAX, PC; + ld_imm, TMP0, pc, 4; + st, S32, TMP0, PC; exit; jmp_off; cond, branch_taken; jmp, pc, imm; end; - ld_imm, RAX, pc, imm; - st, S32, RAX, PC; + ld_imm, TMP0, pc, imm; + st, S32, TMP0, PC; exit; })) @@ -336,24 +336,24 
@@ RVOP( RVOP( blt, { BRANCH_FUNC(int32_t, >=); }, - X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - cmp, RBX, RAX; + GEN({ + ld, S32, TMP0, X, rs1; + ld, S32, TMP1, X, rs2; + cmp, TMP1, TMP0; set_jmp_off; jcc, 0x8c; cond, branch_untaken; jmp, pc, 4; end; - ld_imm, RAX, pc, 4; - st, S32, RAX, PC; + ld_imm, TMP0, pc, 4; + st, S32, TMP0, PC; exit; jmp_off; cond, branch_taken; jmp, pc, imm; end; - ld_imm, RAX, pc, imm; - st, S32, RAX, PC; + ld_imm, TMP0, pc, imm; + st, S32, TMP0, PC; exit; })) @@ -361,24 +361,24 @@ RVOP( RVOP( bge, { BRANCH_FUNC(int32_t, <); }, - X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - cmp, RBX, RAX; + GEN({ + ld, S32, TMP0, X, rs1; + ld, S32, TMP1, X, rs2; + cmp, TMP1, TMP0; set_jmp_off; jcc, 0x8d; cond, branch_untaken; jmp, pc, 4; end; - ld_imm, RAX, pc, 4; - st, S32, RAX, PC; + ld_imm, TMP0, pc, 4; + st, S32, TMP0, PC; exit; jmp_off; cond, branch_taken; jmp, pc, imm; end; - ld_imm, RAX, pc, imm; - st, S32, RAX, PC; + ld_imm, TMP0, pc, imm; + st, S32, TMP0, PC; exit; })) @@ -386,24 +386,24 @@ RVOP( RVOP( bltu, { BRANCH_FUNC(uint32_t, >=); }, - X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - cmp, RBX, RAX; + GEN({ + ld, S32, TMP0, X, rs1; + ld, S32, TMP1, X, rs2; + cmp, TMP1, TMP0; set_jmp_off; jcc, 0x82; cond, branch_untaken; jmp, pc, 4; end; - ld_imm, RAX, pc, 4; - st, S32, RAX, PC; + ld_imm, TMP0, pc, 4; + st, S32, TMP0, PC; exit; jmp_off; cond, branch_taken; jmp, pc, imm; end; - ld_imm, RAX, pc, imm; - st, S32, RAX, PC; + ld_imm, TMP0, pc, imm; + st, S32, TMP0, PC; exit; })) @@ -411,24 +411,24 @@ RVOP( RVOP( bgeu, { BRANCH_FUNC(uint32_t, <); }, - X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - cmp, RBX, RAX; + GEN({ + ld, S32, TMP0, X, rs1; + ld, S32, TMP1, X, rs2; + cmp, TMP1, TMP0; set_jmp_off; jcc, 0x83; cond, branch_untaken; jmp, pc, 4; end; - ld_imm, RAX, pc, 4; - st, S32, RAX, PC; + ld_imm, TMP0, pc, 4; + st, S32, TMP0, PC; exit; jmp_off; cond, branch_taken; jmp, pc, imm; end; - ld_imm, RAX, pc, imm; 
- st, S32, RAX, PC; + ld_imm, TMP0, pc, imm; + st, S32, TMP0, PC; exit; })) @@ -446,13 +446,13 @@ RVOP( rv->X[ir->rd] = sign_extend_b(rv->io.mem_read_b(rv->X[ir->rs1] + ir->imm)); }, - X64({ + GEN({ mem; - ld, S32, RAX, X, rs1; - ld_imm, RBX, mem; - alu64, 0x01, RBX, RAX; - ld_sext, S8, RAX, RBX, 0; - st, S32, RBX, X, rd; + ld, S32, TMP0, X, rs1; + ld_imm, TMP1, mem; + alu64, 0x01, TMP1, TMP0; + ld_sext, S8, TMP0, TMP1, 0; + st, S32, TMP1, X, rd; })) /* LH: Load Halfword */ @@ -463,13 +463,13 @@ RVOP( RV_EXC_MISALIGN_HANDLER(1, load, false, 1); rv->X[ir->rd] = sign_extend_h(rv->io.mem_read_s(addr)); }, - X64({ + GEN({ mem; - ld, S32, RAX, X, rs1; - ld_imm, RBX, mem; - alu64, 0x01, RBX, RAX; - ld_sext, S16, RAX, RBX, 0; - st, S32, RBX, X, rd; + ld, S32, TMP0, X, rs1; + ld_imm, TMP1, mem; + alu64, 0x01, TMP1, TMP0; + ld_sext, S16, TMP0, TMP1, 0; + st, S32, TMP1, X, rd; })) /* LW: Load Word */ @@ -480,26 +480,26 @@ RVOP( RV_EXC_MISALIGN_HANDLER(3, load, false, 1); rv->X[ir->rd] = rv->io.mem_read_w(addr); }, - X64({ + GEN({ mem; - ld, S32, RAX, X, rs1; - ld_imm, RBX, mem; - alu64, 0x01, RBX, RAX; - ld, S32, RAX, RBX, 0; - st, S32, RBX, X, rd; + ld, S32, TMP0, X, rs1; + ld_imm, TMP1, mem; + alu64, 0x01, TMP1, TMP0; + ld, S32, TMP0, TMP1, 0; + st, S32, TMP1, X, rd; })) /* LBU: Load Byte Unsigned */ RVOP( lbu, { rv->X[ir->rd] = rv->io.mem_read_b(rv->X[ir->rs1] + ir->imm); }, - X64({ + GEN({ mem; - ld, S32, RAX, X, rs1; - ld_imm, RBX, mem; - alu64, 0x01, RBX, RAX; - ld, S8, RAX, RBX, 0; - st, S32, RBX, X, rd; + ld, S32, TMP0, X, rs1; + ld_imm, TMP1, mem; + alu64, 0x01, TMP1, TMP0; + ld, S8, TMP0, TMP1, 0; + st, S32, TMP1, X, rd; })) /* LHU: Load Halfword Unsigned */ @@ -510,13 +510,13 @@ RVOP( RV_EXC_MISALIGN_HANDLER(1, load, false, 1); rv->X[ir->rd] = rv->io.mem_read_s(addr); }, - X64({ + GEN({ mem; - ld, S32, RAX, X, rs1; - ld_imm, RBX, mem; - alu64, 0x01, RBX, RAX; - ld, S16, RAX, RBX, 0; - st, S32, RBX, X, rd; + ld, S32, TMP0, X, rs1; + ld_imm, TMP1, mem; + alu64, 
0x01, TMP1, TMP0; + ld, S16, TMP0, TMP1, 0; + st, S32, TMP1, X, rd; })) /* There are 3 types of stores: byte, halfword, and word-sized. Unlike loads, @@ -529,13 +529,13 @@ RVOP( RVOP( sb, { rv->io.mem_write_b(rv->X[ir->rs1] + ir->imm, rv->X[ir->rs2]); }, - X64({ + GEN({ mem; - ld, S32, RAX, X, rs1; - ld_imm, RBX, mem; - alu64, 0x01, RBX, RAX; - ld, S8, RBX, X, rs2; - st, S8, RBX, RAX, 0; + ld, S32, TMP0, X, rs1; + ld_imm, TMP1, mem; + alu64, 0x01, TMP1, TMP0; + ld, S8, TMP1, X, rs2; + st, S8, TMP1, TMP0, 0; })) /* SH: Store Halfword */ @@ -546,13 +546,13 @@ RVOP( RV_EXC_MISALIGN_HANDLER(1, store, false, 1); rv->io.mem_write_s(addr, rv->X[ir->rs2]); }, - X64({ + GEN({ mem; - ld, S32, RAX, X, rs1; - ld_imm, RBX, mem; - alu64, 0x01, RBX, RAX; - ld, S16, RBX, X, rs2; - st, S16, RBX, RAX, 0; + ld, S32, TMP0, X, rs1; + ld_imm, TMP1, mem; + alu64, 0x01, TMP1, TMP0; + ld, S16, TMP1, X, rs2; + st, S16, TMP1, TMP0, 0; })) /* SW: Store Word */ @@ -563,13 +563,13 @@ RVOP( RV_EXC_MISALIGN_HANDLER(3, store, false, 1); rv->io.mem_write_w(addr, rv->X[ir->rs2]); }, - X64({ + GEN({ mem; - ld, S32, RAX, X, rs1; - ld_imm, RBX, mem; - alu64, 0x01, RBX, RAX; - ld, S32, RBX, X, rs2; - st, S32, RBX, RAX, 0; + ld, S32, TMP0, X, rs1; + ld_imm, TMP1, mem; + alu64, 0x01, TMP1, TMP0; + ld, S32, TMP1, X, rs2; + st, S32, TMP1, TMP0, 0; })) /* ADDI adds the sign-extended 12-bit immediate to register rs1. Arithmetic @@ -580,10 +580,10 @@ RVOP( RVOP( addi, { rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) + ir->imm; }, - X64({ - ld, S32, RAX, X, rs1; - alu32_imm, 32, 0x81, 0, RAX, imm; - st, S32, RAX, X, rd; + GEN({ + ld, S32, TMP0, X, rs1; + alu32_imm, 32, 0x81, 0, TMP0, imm; + st, S32, TMP0, X, rd; })) /* SLTI place the value 1 in register rd if register rs1 is less than the @@ -593,9 +593,9 @@ RVOP( RVOP( slti, { rv->X[ir->rd] = ((int32_t) (rv->X[ir->rs1]) < ir->imm) ? 
1 : 0; }, - X64({ - ld, S32, RAX, X, rs1; - cmp_imm, RAX, imm; + GEN({ + ld, S32, TMP0, X, rs1; + cmp_imm, TMP0, imm; st_imm, S32, rd, 1; set_jmp_off; jcc, 0x82; @@ -609,9 +609,9 @@ RVOP( RVOP( sltiu, { rv->X[ir->rd] = (rv->X[ir->rs1] < (uint32_t) ir->imm) ? 1 : 0; }, - X64({ - ld, S32, RAX, X, rs1; - cmp_imm, RAX, imm; + GEN({ + ld, S32, TMP0, X, rs1; + cmp_imm, TMP0, imm; st_imm, S32, rd, 1; set_jmp_off; jcc, 0x82; @@ -623,20 +623,20 @@ RVOP( RVOP( xori, { rv->X[ir->rd] = rv->X[ir->rs1] ^ ir->imm; }, - X64({ - ld, S32, RAX, X, rs1; - alu32_imm, 32, 0x81, 6, RAX, imm; - st, S32, RAX, X, rd; + GEN({ + ld, S32, TMP0, X, rs1; + alu32_imm, 32, 0x81, 6, TMP0, imm; + st, S32, TMP0, X, rd; })) /* ORI: OR Immediate */ RVOP( ori, { rv->X[ir->rd] = rv->X[ir->rs1] | ir->imm; }, - X64({ - ld, S32, RAX, X, rs1; - alu32_imm, 32, 0x81, 1, RAX, imm; - st, S32, RAX, X, rd; + GEN({ + ld, S32, TMP0, X, rs1; + alu32_imm, 32, 0x81, 1, TMP0, imm; + st, S32, TMP0, X, rd; })) /* ANDI performs bitwise AND on register rs1 and the sign-extended 12-bit @@ -645,10 +645,10 @@ RVOP( RVOP( andi, { rv->X[ir->rd] = rv->X[ir->rs1] & ir->imm; }, - X64({ - ld, S32, RAX, X, rs1; - alu32_imm, 32, 0x81, 4, RAX, imm; - st, S32, RAX, X, rd; + GEN({ + ld, S32, TMP0, X, rs1; + alu32_imm, 32, 0x81, 4, TMP0, imm; + st, S32, TMP0, X, rd; })) FORCE_INLINE void shift_func(riscv_t *rv, const rv_insn_t *ir) @@ -675,10 +675,10 @@ FORCE_INLINE void shift_func(riscv_t *rv, const rv_insn_t *ir) RVOP( slli, { shift_func(rv, ir); }, - X64({ - ld, S32, RAX, X, rs1; - alu32_imm, 8, 0xc1, 4, RAX, imm, 0x1f; - st, S32, RAX, X, rd; + GEN({ + ld, S32, TMP0, X, rs1; + alu32_imm, 8, 0xc1, 4, TMP0, imm, 0x1f; + st, S32, TMP0, X, rd; })) /* SRLI performs logical right shift on the value in register rs1 by the shift @@ -687,10 +687,10 @@ RVOP( RVOP( srli, { shift_func(rv, ir); }, - X64({ - ld, S32, RAX, X, rs1; - alu32_imm, 8, 0xc1, 5, RAX, imm, 0x1f; - st, S32, RAX, X, rd; + GEN({ + ld, S32, TMP0, X, rs1; + alu32_imm, 8, 0xc1, 5, 
TMP0, imm, 0x1f; + st, S32, TMP0, X, rd; })) /* SRAI performs arithmetic right shift on the value in register rs1 by the @@ -699,10 +699,10 @@ RVOP( RVOP( srai, { shift_func(rv, ir); }, - X64({ - ld, S32, RAX, X, rs1; - alu32_imm, 8, 0xc1, 7, RAX, imm, 0x1f; - st, S32, RAX, X, rd; + GEN({ + ld, S32, TMP0, X, rs1; + alu32_imm, 8, 0xc1, 7, TMP0, imm, 0x1f; + st, S32, TMP0, X, rd; })) /* ADD */ @@ -711,11 +711,11 @@ RVOP( { rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) + (int32_t) (rv->X[ir->rs2]); }, - X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - alu32, 0x01, RBX, RAX; - st, S32, RAX, X, rd; + GEN({ + ld, S32, TMP0, X, rs1; + ld, S32, TMP1, X, rs2; + alu32, 0x01, TMP1, TMP0; + st, S32, TMP0, X, rd; })) /* SUB: Substract */ @@ -724,23 +724,23 @@ RVOP( { rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) - (int32_t) (rv->X[ir->rs2]); }, - X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - alu32, 0x29, RBX, RAX; - st, S32, RAX, X, rd; + GEN({ + ld, S32, TMP0, X, rs1; + ld, S32, TMP1, X, rs2; + alu32, 0x29, TMP1, TMP0; + st, S32, TMP0, X, rd; })) /* SLL: Shift Left Logical */ RVOP( sll, { rv->X[ir->rd] = rv->X[ir->rs1] << (rv->X[ir->rs2] & 0x1f); }, - X64({ - ld, S32, RAX, X, rs1; - ld, S32, RCX, X, rs2; - alu32_imm, 32, 0x81, 4, RCX, 0x1f; - alu32, 0xd3, 4, RAX; - st, S32, RAX, X, rd; + GEN({ + ld, S32, TMP0, X, rs1; + ld, S32, TMP2, X, rs2; + alu32_imm, 32, 0x81, 4, TMP2, 0x1f; + alu32, 0xd3, 4, TMP0; + st, S32, TMP0, X, rd; })) /* SLT: Set on Less Than */ @@ -750,10 +750,10 @@ RVOP( rv->X[ir->rd] = ((int32_t) (rv->X[ir->rs1]) < (int32_t) (rv->X[ir->rs2])) ? 1 : 0; }, - X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - cmp, RBX, RAX; + GEN({ + ld, S32, TMP0, X, rs1; + ld, S32, TMP1, X, rs2; + cmp, TMP1, TMP0; st_imm, S32, rd, 1; set_jmp_off; jcc, 0x82; @@ -765,10 +765,10 @@ RVOP( RVOP( sltu, { rv->X[ir->rd] = (rv->X[ir->rs1] < rv->X[ir->rs2]) ? 
1 : 0; }, - X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - cmp, RBX, RAX; + GEN({ + ld, S32, TMP0, X, rs1; + ld, S32, TMP1, X, rs2; + cmp, TMP1, TMP0; st_imm, S32, rd, 1; set_jmp_off; jcc, 0x82; @@ -782,35 +782,35 @@ RVOP( { rv->X[ir->rd] = rv->X[ir->rs1] ^ rv->X[ir->rs2]; }, - X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - alu32, 0x31, RBX, RAX; - st, S32, RAX, X, rd; + GEN({ + ld, S32, TMP0, X, rs1; + ld, S32, TMP1, X, rs2; + alu32, 0x31, TMP1, TMP0; + st, S32, TMP0, X, rd; })) /* SRL: Shift Right Logical */ RVOP( srl, { rv->X[ir->rd] = rv->X[ir->rs1] >> (rv->X[ir->rs2] & 0x1f); }, - X64({ - ld, S32, RAX, X, rs1; - ld, S32, RCX, X, rs2; - alu32_imm, 32, 0x81, 4, RCX, 0x1f; - alu32, 0xd3, 5, RAX; - st, S32, RAX, X, rd; + GEN({ + ld, S32, TMP0, X, rs1; + ld, S32, TMP2, X, rs2; + alu32_imm, 32, 0x81, 4, TMP2, 0x1f; + alu32, 0xd3, 5, TMP0; + st, S32, TMP0, X, rd; })) /* SRA: Shift Right Arithmetic */ RVOP( sra, { rv->X[ir->rd] = ((int32_t) rv->X[ir->rs1]) >> (rv->X[ir->rs2] & 0x1f); }, - X64({ - ld, S32, RAX, X, rs1; - ld, S32, RCX, X, rs2; - alu32_imm, 32, 0x81, 4, RCX, 0x1f; - alu32, 0xd3, 7, RAX; - st, S32, RAX, X, rd; + GEN({ + ld, S32, TMP0, X, rs1; + ld, S32, TMP2, X, rs2; + alu32_imm, 32, 0x81, 4, TMP2, 0x1f; + alu32, 0xd3, 7, TMP0; + st, S32, TMP0, X, rd; })) /* OR */ @@ -818,11 +818,11 @@ RVOP( or , { rv->X[ir->rd] = rv->X[ir->rs1] | rv->X[ir->rs2]; }, - X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - alu32, 0x09, RBX, RAX; - st, S32, RAX, X, rd; + GEN({ + ld, S32, TMP0, X, rs1; + ld, S32, TMP1, X, rs2; + alu32, 0x09, TMP1, TMP0; + st, S32, TMP0, X, rd; })) /* AND */ @@ -830,11 +830,11 @@ RVOP( RVOP( and, { rv->X[ir->rd] = rv->X[ir->rs1] & rv->X[ir->rs2]; }, - X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - alu32, 0x21, RBX, RAX; - st, S32, RAX, X, rd; + GEN({ + ld, S32, TMP0, X, rs1; + ld, S32, TMP1, X, rs2; + alu32, 0x21, TMP1, TMP0; + st, S32, TMP0, X, rd; })) /* clang-format on */ @@ -848,9 +848,9 @@ RVOP( 
rv->io.on_ecall(rv); return true; }, - X64({ - ld_imm, RAX, pc; - st, S32, RAX, PC; + GEN({ + ld_imm, TMP0, pc; + st, S32, TMP0, PC; call, ecall; exit; })) @@ -865,9 +865,9 @@ RVOP( rv->io.on_ebreak(rv); return true; }, - X64({ - ld_imm, RAX, pc; - st, S32, RAX, PC; + GEN({ + ld_imm, TMP0, pc; + st, S32, TMP0, PC; call, ebreak; exit; })) @@ -879,7 +879,7 @@ RVOP( /* FIXME: Implement */ return false; }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -890,7 +890,7 @@ RVOP( /* FIXME: Implement */ return false; }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -901,7 +901,7 @@ RVOP( /* FIXME: Implement */ return false; }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -912,7 +912,7 @@ RVOP( /* FIXME: Implement */ return false; }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -924,7 +924,7 @@ RVOP( rv->PC = rv->csr_mepc; return true; }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -938,7 +938,7 @@ RVOP( rv->PC = PC; return true; }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) #endif @@ -951,7 +951,7 @@ RVOP( uint32_t tmp = csr_csrrw(rv, ir->imm, rv->X[ir->rs1]); rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -971,7 +971,7 @@ RVOP( rv, ir->imm, (ir->rs1 == rv_reg_zero) ? 0U : rv->X[ir->rs1]); rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -983,7 +983,7 @@ RVOP( rv, ir->imm, (ir->rs1 == rv_reg_zero) ? ~0U : rv->X[ir->rs1]); rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -994,7 +994,7 @@ RVOP( uint32_t tmp = csr_csrrw(rv, ir->imm, ir->rs1); rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -1005,7 +1005,7 @@ RVOP( uint32_t tmp = csr_csrrs(rv, ir->imm, ir->rs1); rv->X[ir->rd] = ir->rd ? 
tmp : rv->X[ir->rd]; }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -1016,7 +1016,7 @@ RVOP( uint32_t tmp = csr_csrrc(rv, ir->imm, ir->rs1); rv->X[ir->rd] = ir->rd ? tmp : rv->X[ir->rd]; }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) #endif @@ -1028,11 +1028,11 @@ RVOP( RVOP( mul, { rv->X[ir->rd] = (int32_t) rv->X[ir->rs1] * (int32_t) rv->X[ir->rs2]; }, - X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - mul, 0x28, RBX, RAX, 0; - st, S32, RAX, X, rd; + GEN({ + ld, S32, TMP0, X, rs1; + ld, S32, TMP1, X, rs2; + mul, 0x28, TMP1, TMP0, 0; + st, S32, TMP0, X, rd; })) /* MULH: Multiply High Signed Signed */ @@ -1046,12 +1046,12 @@ RVOP( const int64_t multiplier = (int32_t) rv->X[ir->rs2]; rv->X[ir->rd] = ((uint64_t) (multiplicand * multiplier)) >> 32; }, - X64({ - ld_sext, S32, RAX, X, rs1; - ld_sext, S32, RBX, X, rs2; - mul, 0x2f, RBX, RAX, 0; - alu64_imm, 8, 0xc1, 5, RAX, 32; - st, S32, RAX, X, rd; + GEN({ + ld_sext, S32, TMP0, X, rs1; + ld_sext, S32, TMP1, X, rs2; + mul, 0x2f, TMP1, TMP0, 0; + alu64_imm, 8, 0xc1, 5, TMP0, 32; + st, S32, TMP0, X, rd; })) /* MULHSU: Multiply High Signed Unsigned */ @@ -1066,12 +1066,12 @@ RVOP( const uint64_t umultiplier = rv->X[ir->rs2]; rv->X[ir->rd] = ((uint64_t) (multiplicand * umultiplier)) >> 32; }, - X64({ - ld_sext, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - mul, 0x2f, RBX, RAX, 0; - alu64_imm, 8, 0xc1, 5, RAX, 32; - st, S32, RAX, X, rd; + GEN({ + ld_sext, S32, TMP0, X, rs1; + ld, S32, TMP1, X, rs2; + mul, 0x2f, TMP1, TMP0, 0; + alu64_imm, 8, 0xc1, 5, TMP0, 32; + st, S32, TMP0, X, rd; })) /* MULHU: Multiply High Unsigned Unsigned */ @@ -1081,12 +1081,12 @@ RVOP( rv->X[ir->rd] = ((uint64_t) rv->X[ir->rs1] * (uint64_t) rv->X[ir->rs2]) >> 32; }, - X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - mul, 0x2f, RBX, RAX, 0; - alu64_imm, 8, 0xc1, 5, RAX, 32; - st, S32, RAX, X, rd; + GEN({ + ld, S32, TMP0, X, rs1; + ld, S32, TMP1, X, rs2; + mul, 0x2f, TMP1, TMP0, 0; + alu64_imm, 8, 0xc1, 5, TMP0, 32; + st, 
S32, TMP0, X, rd; })) /* DIV: Divide Signed */ @@ -1107,16 +1107,16 @@ RVOP( ? rv->X[ir->rs1] /* overflow */ : (unsigned int) (dividend / divisor); }, - X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - div, 0x38, RBX, RAX, 0; - cmp_imm, RBX, 0; + GEN({ + ld, S32, TMP0, X, rs1; + ld, S32, TMP1, X, rs2; + div, 0x38, TMP1, TMP0, 0; + cmp_imm, TMP1, 0; set_jmp_off; jcc, 0x85; - ld_imm, RAX, -1; + ld_imm, TMP0, -1; jmp_off; - st, S32, RAX, X, rd; + st, S32, TMP0, X, rd; /* FIXME: handle overflow */ })) @@ -1134,16 +1134,16 @@ RVOP( const uint32_t udivisor = rv->X[ir->rs2]; rv->X[ir->rd] = !udivisor ? ~0U : udividend / udivisor; }, - X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - div, 0x38, RBX, RAX, 0; - cmp_imm, RBX, 0; + GEN({ + ld, S32, TMP0, X, rs1; + ld, S32, TMP1, X, rs2; + div, 0x38, TMP1, TMP0, 0; + cmp_imm, TMP1, 0; set_jmp_off; jcc, 0x85; - ld_imm, RAX, ~0U; + ld_imm, TMP0, ~0U; jmp_off; - st, S32, RAX, X, rd; + st, S32, TMP0, X, rd; })) /* clang-format off */ @@ -1163,11 +1163,11 @@ RVOP(rem, { ? 0 : (dividend % divisor); }, -X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - mod, 0x98, RBX, RAX, 0; - st, S32, RAX, X, rd; +GEN({ + ld, S32, TMP0, X, rs1; + ld, S32, TMP1, X, rs2; + mod, 0x98, TMP1, TMP0, 0; + st, S32, TMP0, X, rd; /* FIXME: handle overflow */ })) @@ -1184,11 +1184,11 @@ RVOP(remu, { rv->X[ir->rd] = !udivisor ? 
udividend : udividend % udivisor; }, -X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - mod, 0x98, RBX, RAX, 0; - st, S32, RAX, X, rd; +GEN({ + ld, S32, TMP0, X, rs1; + ld, S32, TMP1, X, rs2; + mod, 0x98, TMP1, TMP0, 0; + st, S32, TMP0, X, rd; })) /* clang-format on */ #endif @@ -1226,7 +1226,7 @@ RVOP( * FIXME: uimplemented */ }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -1240,7 +1240,7 @@ RVOP( rv->io.mem_write_w(rv->X[ir->rs1], rv->X[ir->rs2]); rv->X[ir->rd] = 0; }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -1251,7 +1251,7 @@ RVOP( rv->X[ir->rd] = rv->io.mem_read_w(ir->rs1); rv->io.mem_write_s(ir->rs1, rv->X[ir->rs2]); }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -1263,7 +1263,7 @@ RVOP( const int32_t res = (int32_t) rv->X[ir->rd] + (int32_t) rv->X[ir->rs2]; rv->io.mem_write_s(ir->rs1, res); }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -1275,7 +1275,7 @@ RVOP( const int32_t res = rv->X[ir->rd] ^ rv->X[ir->rs2]; rv->io.mem_write_s(ir->rs1, res); }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -1287,7 +1287,7 @@ RVOP( const int32_t res = rv->X[ir->rd] & rv->X[ir->rs2]; rv->io.mem_write_s(ir->rs1, res); }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -1299,7 +1299,7 @@ RVOP( const int32_t res = rv->X[ir->rd] | rv->X[ir->rs2]; rv->io.mem_write_s(ir->rs1, res); }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -1312,7 +1312,7 @@ RVOP( rv->X[ir->rd] < rv->X[ir->rs2] ? rv->X[ir->rd] : rv->X[ir->rs2]; rv->io.mem_write_s(ir->rs1, res); }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -1325,7 +1325,7 @@ RVOP( rv->X[ir->rd] > rv->X[ir->rs2] ? rv->X[ir->rd] : rv->X[ir->rs2]; rv->io.mem_write_s(ir->rs1, res); }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -1338,7 +1338,7 @@ RVOP( rv->X[ir->rd] < rv->X[ir->rs2] ? 
rv->X[ir->rd] : rv->X[ir->rs2]; rv->io.mem_write_s(ir->rs1, ures); }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -1351,7 +1351,7 @@ RVOP( rv->X[ir->rd] > rv->X[ir->rs2] ? rv->X[ir->rd] : rv->X[ir->rs2]; rv->io.mem_write_s(ir->rs1, ures); }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) #endif /* RV32_HAS(EXT_A) */ @@ -1367,7 +1367,7 @@ RVOP( const uint32_t data = rv->io.mem_read_w(rv->X[ir->rs1] + ir->imm); rv->F[ir->rd].v = data; }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -1379,7 +1379,7 @@ RVOP( uint32_t data = rv->F[ir->rs2].v; rv->io.mem_write_w(rv->X[ir->rs1] + ir->imm, data); }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -1392,7 +1392,7 @@ RVOP( f32_mulAdd(rv->F[ir->rs1], rv->F[ir->rs2], rv->F[ir->rs3]); set_fflag(rv); }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -1406,7 +1406,7 @@ RVOP( rv->F[ir->rd] = f32_mulAdd(rv->F[ir->rs1], rv->F[ir->rs2], tmp); set_fflag(rv); }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -1420,7 +1420,7 @@ RVOP( rv->F[ir->rd] = f32_mulAdd(tmp, rv->F[ir->rs2], rv->F[ir->rs3]); set_fflag(rv); }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -1436,7 +1436,7 @@ RVOP( rv->F[ir->rd] = f32_mulAdd(tmp1, rv->F[ir->rs2], tmp2); set_fflag(rv); }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -1448,7 +1448,7 @@ RVOP( rv->F[ir->rd] = f32_add(rv->F[ir->rs1], rv->F[ir->rs2]); set_fflag(rv); }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -1460,7 +1460,7 @@ RVOP( rv->F[ir->rd] = f32_sub(rv->F[ir->rs1], rv->F[ir->rs2]); set_fflag(rv); }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -1472,7 +1472,7 @@ RVOP( rv->F[ir->rd] = f32_mul(rv->F[ir->rs1], rv->F[ir->rs2]); set_fflag(rv); }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -1484,7 +1484,7 @@ RVOP( rv->F[ir->rd] = f32_div(rv->F[ir->rs1], rv->F[ir->rs2]); set_fflag(rv); }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -1496,7 +1496,7 @@ RVOP( rv->F[ir->rd] = f32_sqrt(rv->F[ir->rs1]); 
set_fflag(rv); }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -1507,7 +1507,7 @@ RVOP( rv->F[ir->rd].v = (rv->F[ir->rs1].v & ~FMASK_SIGN) | (rv->F[ir->rs2].v & FMASK_SIGN); }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -1518,7 +1518,7 @@ RVOP( rv->F[ir->rd].v = (rv->F[ir->rs1].v & ~FMASK_SIGN) | (~rv->F[ir->rs2].v & FMASK_SIGN); }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -1526,7 +1526,7 @@ RVOP( RVOP( fsgnjxs, { rv->F[ir->rd].v = rv->F[ir->rs1].v ^ (rv->F[ir->rs2].v & FMASK_SIGN); }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -1552,7 +1552,7 @@ RVOP( rv->F[ir->rd] = (less || is_nan(rv->F[ir->rs2].v) ? rv->F[ir->rs1] : rv->F[ir->rs2]); }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -1573,7 +1573,7 @@ RVOP( (greater || is_nan(rv->F[ir->rs2].v) ? rv->F[ir->rs1] : rv->F[ir->rs2]); }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -1591,7 +1591,7 @@ RVOP( rv->X[ir->rd] = ret; set_fflag(rv); }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -1606,7 +1606,7 @@ RVOP( rv->X[ir->rd] = ret; set_fflag(rv); }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -1617,7 +1617,7 @@ RVOP( if (ir->rd) rv->X[ir->rd] = rv->F[ir->rs1].v; }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -1633,7 +1633,7 @@ RVOP( rv->X[ir->rd] = ret; set_fflag(rv); }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -1650,7 +1650,7 @@ RVOP( rv->X[ir->rd] = ret; set_fflag(rv); }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -1663,7 +1663,7 @@ RVOP( rv->X[ir->rd] = ret; set_fflag(rv); }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -1674,7 +1674,7 @@ RVOP( if (ir->rd) rv->X[ir->rd] = calc_fclass(rv->F[ir->rs1].v); }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -1686,7 +1686,7 @@ RVOP( rv->F[ir->rd] = i32_to_f32(rv->X[ir->rs1]); set_fflag(rv); }, - X64({ + GEN({ assert; /* FIXME: Implement */ })) @@ -1698,22 +1698,20 @@ RVOP( rv->F[ir->rd] = ui32_to_f32(rv->X[ir->rs1]); set_fflag(rv); }, - X64({ 
+ GEN({ assert; /* FIXME: Implement */ })) /* FMV.W.X */ -RVOP(fmvwx, - { - rv->F[ir->rd].v = rv->X[ir->rs1]; }, - { - - X64({ +RVOP( + fmvwx, + { rv->F[ir->rd].v = rv->X[ir->rs1]; }, + GEN({ assert; /* FIXME: Implement */ - })) + })) #endif - /* RV32C Standard Extension */ +/* RV32C Standard Extension */ #if RV32_HAS(EXT_C) /* C.ADDI4SPN is a CIW-format instruction that adds a zero-extended non-zero @@ -1722,56 +1720,58 @@ RVOP(fmvwx, * This instruction is used to generate pointers to stack-allocated variables, * and expands to addi rd', x2, nzuimm[9:2]. */ -RVOP(caddi4spn, - { - rv->X[ir->rd] = rv->X[rv_reg_sp] + (uint16_t) ir->imm; }, - X64({ - ld, S32, RAX, X, rv_reg_sp; - alu32_imm, 32, 0x81, 0, RAX, uint, 16, imm; - st, S32, RAX, X, rd; - })) +RVOP( + caddi4spn, + { rv->X[ir->rd] = rv->X[rv_reg_sp] + (uint16_t) ir->imm; }, + GEN({ + ld, S32, TMP0, X, rv_reg_sp; + alu32_imm, 32, 0x81, 0, TMP0, uint, 16, imm; + st, S32, TMP0, X, rd; + })) /* C.LW loads a 32-bit value from memory into register rd'. It computes an * effective address by adding the zero-extended offset, scaled by 4, to the * base address in register rs1'. It expands to lw rd', offset[6:2](rs1'). */ -RVOP(clw, - { +RVOP( + clw, + { const uint32_t addr = rv->X[ir->rs1] + (uint32_t) ir->imm; RV_EXC_MISALIGN_HANDLER(3, load, true, 1); rv->X[ir->rd] = rv->io.mem_read_w(addr); - }, - X64({ + }, + GEN({ mem; - ld, S32, RAX, X, rs1; - ld_imm, RBX, mem; - alu64, 0x01, RBX, RAX; - ld, S32, RAX, RBX, 0; - st, S32, RBX, X, rd; - })) + ld, S32, TMP0, X, rs1; + ld_imm, TMP1, mem; + alu64, 0x01, TMP1, TMP0; + ld, S32, TMP0, TMP1, 0; + st, S32, TMP1, X, rd; + })) /* C.SW stores a 32-bit value in register rs2' to memory. It computes an * effective address by adding the zero-extended offset, scaled by 4, to the * base address in register rs1'. * It expands to sw rs2', offset[6:2](rs1'). 
*/ -RVOP(csw, - { +RVOP( + csw, + { const uint32_t addr = rv->X[ir->rs1] + (uint32_t) ir->imm; RV_EXC_MISALIGN_HANDLER(3, store, true, 1); rv->io.mem_write_w(addr, rv->X[ir->rs2]); - }, - X64({ + }, + GEN({ mem; - ld, S32, RAX, X, rs1; - ld_imm, RBX, mem; - alu64, 0x01, RBX, RAX; - ld, S32, RBX, X, rs2; - st, S32, RBX, RAX, 0; - })) + ld, S32, TMP0, X, rs1; + ld_imm, TMP1, mem; + alu64, 0x01, TMP1, TMP0; + ld, S32, TMP1, X, rs2; + st, S32, TMP1, TMP0, 0; + })) /* C.NOP */ -RVOP(cnop, {/* no operation */}, X64({/* no operation */})) +RVOP(cnop, {/* no operation */}, GEN({/* no operation */})) /* C.ADDI adds the non-zero sign-extended 6-bit immediate to the value in * register rd then writes the result to rd. C.ADDI expands into @@ -1779,16 +1779,19 @@ RVOP(cnop, {/* no operation */}, X64({/* no operation */})) * with both rd=x0 and nzimm=0 encodes the C.NOP instruction; the remaining * code points with either rd=x0 or nzimm=0 encode HINTs. */ -RVOP(caddi, { - rv->X[ir->rd] += (int16_t) ir->imm; }, X64({ - ld, S32, RAX, X, rd; - alu32_imm, 32, 0x81, 0, RAX, int, 16, imm; - st, S32, RAX, X, rd; - })) +RVOP( + caddi, + { rv->X[ir->rd] += (int16_t) ir->imm; }, + GEN({ + ld, S32, TMP0, X, rd; + alu32_imm, 32, 0x81, 0, TMP0, int, 16, imm; + st, S32, TMP0, X, rd; + })) /* C.JAL */ -RVOP(cjal, - { +RVOP( + cjal, + { rv->X[rv_reg_ra] = PC + 2; PC += ir->imm; RV_EXC_MISALIGN_HANDLER(PC, insn, true, 0); @@ -1810,37 +1813,41 @@ RVOP(cjal, rv->csr_cycle = cycle; rv->PC = PC; return true; - }, - X64({ - ld_imm, RAX, pc, 2; - st, S32, RAX, X, rv_reg_ra; - ld_imm, RAX, pc, imm; - st, S32, RAX, PC; + }, + GEN({ + ld_imm, TMP0, pc, 2; + st, S32, TMP0, X, rv_reg_ra; + ld_imm, TMP0, pc, imm; + st, S32, TMP0, PC; jmp, pc, imm; exit; - })) + })) /* C.LI loads the sign-extended 6-bit immediate, imm, into register rd. * C.LI expands into addi rd, x0, imm[5:0]. * C.LI is only valid when rd=x0; the code points with rd=x0 encode HINTs. 
*/ -RVOP(cli, { - rv->X[ir->rd] = ir->imm; }, X64({ - ld_imm, RAX, imm; - st, S32, RAX, X, rd; - })) +RVOP( + cli, + { rv->X[ir->rd] = ir->imm; }, + GEN({ + ld_imm, TMP0, imm; + st, S32, TMP0, X, rd; + })) /* C.ADDI16SP is used to adjust the stack pointer in procedure prologues * and epilogues. It expands into addi x2, x2, nzimm[9:4]. * C.ADDI16SP is only valid when nzimm'=0; the code point with nzimm=0 is * reserved. */ -RVOP(caddi16sp, { - rv->X[ir->rd] += ir->imm; }, X64({ - ld, S32, RAX, X, rd; - alu32_imm, 32, 0x81, 0, RAX, imm; - st, S32, RAX, X, rd; - })) +RVOP( + caddi16sp, + { rv->X[ir->rd] += ir->imm; }, + GEN({ + ld, S32, TMP0, X, rd; + alu32_imm, 32, 0x81, 0, TMP0, imm; + st, S32, TMP0, X, rd; + })) /* C.LUI loads the non-zero 6-bit immediate field into bits 17–12 of the * destination register, clears the bottom 12 bits, and sign-extends bit @@ -1849,93 +1856,109 @@ RVOP(caddi16sp, { * C.LUI is only valid when rd'={x0, x2}, and when the immediate is not equal * to zero. */ -RVOP(clui, { - rv->X[ir->rd] = ir->imm; }, X64({ - ld_imm, RAX, imm; - st, S32, RAX, X, rd; - })) +RVOP( + clui, + { rv->X[ir->rd] = ir->imm; }, + GEN({ + ld_imm, TMP0, imm; + st, S32, TMP0, X, rd; + })) /* C.SRLI is a CB-format instruction that performs a logical right shift * of the value in register rd' then writes the result to rd'. The shift * amount is encoded in the shamt field. C.SRLI expands into srli rd', * rd', shamt[5:0]. */ -RVOP(csrli, { - rv->X[ir->rs1] >>= ir->shamt; }, X64({ - ld, S32, RAX, X, rs1; - alu32_imm, 8, 0xc1, 5, RAX, shamt; - st, S32, RAX, X, rs1; - })) +RVOP( + csrli, + { rv->X[ir->rs1] >>= ir->shamt; }, + GEN({ + ld, S32, TMP0, X, rs1; + alu32_imm, 8, 0xc1, 5, TMP0, shamt; + st, S32, TMP0, X, rs1; + })) /* C.SRAI is defined analogously to C.SRLI, but instead performs an * arithmetic right shift. C.SRAI expands to srai rd', rd', shamt[5:0]. 
*/ -RVOP(csrai, - { +RVOP( + csrai, + { const uint32_t mask = 0x80000000 & rv->X[ir->rs1]; rv->X[ir->rs1] >>= ir->shamt; for (unsigned int i = 0; i < ir->shamt; ++i) rv->X[ir->rs1] |= mask >> i; - }, - X64({ - ld, S32, RAX, X, rs1; - alu32_imm, 8, 0xc1, 7, RAX, shamt; - st, S32, RAX, X, rs1; + }, + GEN({ + ld, S32, TMP0, X, rs1; + alu32_imm, 8, 0xc1, 7, TMP0, shamt; + st, S32, TMP0, X, rs1; /* FIXME: Incomplete */ - })) + })) /* C.ANDI is a CB-format instruction that computes the bitwise AND of the * value in register rd' and the sign-extended 6-bit immediate, then writes * the result to rd'. C.ANDI expands to andi rd', rd', imm[5:0]. */ -RVOP(candi, { - rv->X[ir->rs1] &= ir->imm; }, X64({ - ld, S32, RAX, X, rs1; - alu32_imm, 32, 0x81, 4, RAX, imm; - st, S32, RAX, X, rs1; - })) +RVOP( + candi, + { rv->X[ir->rs1] &= ir->imm; }, + GEN({ + ld, S32, TMP0, X, rs1; + alu32_imm, 32, 0x81, 4, TMP0, imm; + st, S32, TMP0, X, rs1; + })) /* C.SUB */ -RVOP(csub, { - rv->X[ir->rd] = rv->X[ir->rs1] - rv->X[ir->rs2]; }, X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - alu32, 0x29, RBX, RAX; - st, S32, RAX, X, rd; - })) +RVOP( + csub, + { rv->X[ir->rd] = rv->X[ir->rs1] - rv->X[ir->rs2]; }, + GEN({ + ld, S32, TMP0, X, rs1; + ld, S32, TMP1, X, rs2; + alu32, 0x29, TMP1, TMP0; + st, S32, TMP0, X, rd; + })) /* C.XOR */ -RVOP(cxor, { - rv->X[ir->rd] = rv->X[ir->rs1] ^ rv->X[ir->rs2]; }, X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - alu32, 0x31, RBX, RAX; - st, S32, RAX, X, rd; - })) +RVOP( + cxor, + { rv->X[ir->rd] = rv->X[ir->rs1] ^ rv->X[ir->rs2]; }, + GEN({ + ld, S32, TMP0, X, rs1; + ld, S32, TMP1, X, rs2; + alu32, 0x31, TMP1, TMP0; + st, S32, TMP0, X, rd; + })) -RVOP(cor, { - rv->X[ir->rd] = rv->X[ir->rs1] | rv->X[ir->rs2]; }, X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - alu32, 0x09, RBX, RAX; - st, S32, RAX, X, rd; - })) +RVOP( + cor, + { rv->X[ir->rd] = rv->X[ir->rs1] | rv->X[ir->rs2]; }, + GEN({ + ld, S32, TMP0, X, rs1; + ld, S32, TMP1, X, rs2; + alu32, 
0x09, TMP1, TMP0; + st, S32, TMP0, X, rd; + })) -RVOP(cand, { - rv->X[ir->rd] = rv->X[ir->rs1] & rv->X[ir->rs2]; }, X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - alu32, 0x21, RBX, RAX; - st, S32, RAX, X, rd; - })) +RVOP( + cand, + { rv->X[ir->rd] = rv->X[ir->rs1] & rv->X[ir->rs2]; }, + GEN({ + ld, S32, TMP0, X, rs1; + ld, S32, TMP1, X, rs2; + alu32, 0x21, TMP1, TMP0; + st, S32, TMP0, X, rd; + })) /* C.J performs an unconditional control transfer. The offset is sign-extended * and added to the pc to form the jump target address. * C.J can therefore target a ±2 KiB range. * C.J expands to jal x0, offset[11:1]. */ -RVOP(cj, - { +RVOP( + cj, + { PC += ir->imm; RV_EXC_MISALIGN_HANDLER(PC, insn, true, 0); struct rv_insn *taken = ir->branch_taken; @@ -1956,21 +1979,22 @@ RVOP(cj, rv->csr_cycle = cycle; rv->PC = PC; return true; - }, - X64({ - ld_imm, RAX, pc, imm; - st, S32, RAX, PC; + }, + GEN({ + ld_imm, TMP0, pc, imm; + st, S32, TMP0, PC; jmp, pc, imm; exit; - })) + })) /* C.BEQZ performs conditional control transfers. The offset is sign-extended * and added to the pc to form the branch target address. * It can therefore target a ±256 B range. C.BEQZ takes the branch if the * value in register rs1' is zero. It expands to beq rs1', x0, offset[8:1]. 
*/ -RVOP(cbeqz, - { +RVOP( + cbeqz, + { if (rv->X[ir->rs1]) { is_branch_taken = false; struct rv_insn *untaken = ir->branch_untaken; @@ -2010,30 +2034,31 @@ RVOP(cbeqz, rv->csr_cycle = cycle; rv->PC = PC; return true; - }, - X64({ - ld, S32, RAX, X, rs1; - cmp_imm, RAX, 0; + }, + GEN({ + ld, S32, TMP0, X, rs1; + cmp_imm, TMP0, 0; set_jmp_off; jcc, 0x84; cond, branch_untaken; jmp, pc, 2; end; - ld_imm, RAX, pc, 2; - st, S32, RAX, PC; + ld_imm, TMP0, pc, 2; + st, S32, TMP0, PC; exit; jmp_off; cond, branch_taken; jmp, pc, imm; end; - ld_imm, RAX, pc, imm; - st, S32, RAX, PC; + ld_imm, TMP0, pc, imm; + st, S32, TMP0, PC; exit; - })) + })) /* C.BEQZ */ -RVOP(cbnez, - { +RVOP( + cbnez, + { if (!rv->X[ir->rs1]) { is_branch_taken = false; struct rv_insn *untaken = ir->branch_untaken; @@ -2073,57 +2098,61 @@ RVOP(cbnez, rv->csr_cycle = cycle; rv->PC = PC; return true; - }, - X64({ - ld, S32, RAX, X, rs1; - cmp_imm, RAX, 0; + }, + GEN({ + ld, S32, TMP0, X, rs1; + cmp_imm, TMP0, 0; set_jmp_off; jcc, 0x85; cond, branch_untaken; jmp, pc, 2; end; - ld_imm, RAX, pc, 2; - st, S32, RAX, PC; + ld_imm, TMP0, pc, 2; + st, S32, TMP0, PC; exit; jmp_off; cond, branch_taken; jmp, pc, imm; end; - ld_imm, RAX, pc, imm; - st, S32, RAX, PC; + ld_imm, TMP0, pc, imm; + st, S32, TMP0, PC; exit; - })) + })) /* C.SLLI is a CI-format instruction that performs a logical left shift of * the value in register rd then writes the result to rd. The shift amount * is encoded in the shamt field. C.SLLI expands into slli rd, rd, shamt[5:0]. 
*/ -RVOP(cslli, { - rv->X[ir->rd] <<= (uint8_t) ir->imm; }, X64({ - ld, S32, RAX, X, rd; - alu32_imm, 8, 0xc1, 4, RAX, uint, 8, imm; - st, S32, RAX, X, rd; - })) +RVOP( + cslli, + { rv->X[ir->rd] <<= (uint8_t) ir->imm; }, + GEN({ + ld, S32, TMP0, X, rd; + alu32_imm, 8, 0xc1, 4, TMP0, uint, 8, imm; + st, S32, TMP0, X, rd; + })) /* C.LWSP */ -RVOP(clwsp, - { +RVOP( + clwsp, + { const uint32_t addr = rv->X[rv_reg_sp] + ir->imm; RV_EXC_MISALIGN_HANDLER(3, load, true, 1); rv->X[ir->rd] = rv->io.mem_read_w(addr); - }, - X64({ + }, + GEN({ mem; - ld, S32, RAX, X, rv_reg_sp; - ld_imm, RBX, mem; - alu64, 0x01, RBX, RAX; - ld, S32, RAX, RBX, 0; - st, S32, RBX, X, rd; - })) + ld, S32, TMP0, X, rv_reg_sp; + ld_imm, TMP1, mem; + alu64, 0x01, TMP1, TMP0; + ld, S32, TMP0, TMP1, 0; + st, S32, TMP1, X, rd; + })) /* C.JR */ -RVOP(cjr, - { +RVOP( + cjr, + { PC = rv->X[ir->rs1]; #if !RV32_HAS(JIT) LOOKUP_OR_UPDATE_BRANCH_HISTORY_TABLE(); @@ -2131,41 +2160,45 @@ RVOP(cjr, rv->csr_cycle = cycle; rv->PC = PC; return true; - }, - X64({ - ld, S32, RAX, X, rs1; - st, S32, RAX, PC; + }, + GEN({ + ld, S32, TMP0, X, rs1; + st, S32, TMP0, PC; exit; - })) + })) /* C.MV */ -RVOP(cmv, { - rv->X[ir->rd] = rv->X[ir->rs2]; }, X64({ - ld, S32, RAX, X, rs2; - st, S32, RAX, X, rd; - })) +RVOP( + cmv, + { rv->X[ir->rd] = rv->X[ir->rs2]; }, + GEN({ + ld, S32, TMP0, X, rs2; + st, S32, TMP0, X, rd; + })) /* C.EBREAK */ -RVOP(cebreak, - { +RVOP( + cebreak, + { rv->compressed = true; rv->csr_cycle = cycle; rv->PC = PC; rv->io.on_ebreak(rv); return true; - }, - X64({ - ld_imm, RAX, pc; - st, S32, RAX, PC; - ld_imm, RAX, 1; - st, S32, RAX, compressed; + }, + GEN({ + ld_imm, TMP0, pc; + st, S32, TMP0, PC; + ld_imm, TMP0, 1; + st, S32, TMP0, compressed; call, ebreak; exit; - })) + })) /* C.JALR */ -RVOP(cjalr, - { +RVOP( + cjalr, + { /* Unconditional jump and store PC+2 to ra */ const int32_t jump_to = rv->X[ir->rs1]; rv->X[rv_reg_ra] = PC + 2; @@ -2177,14 +2210,14 @@ RVOP(cjalr, rv->csr_cycle = cycle; rv->PC = 
PC; return true; - }, - X64({ - ld_imm, RAX, pc, 2; - st, S32, RAX, X, rv_reg_ra; - ld, S32, RAX, X, rs1; - st, S32, RAX, PC; + }, + GEN({ + ld_imm, TMP0, pc, 2; + st, S32, TMP0, X, rv_reg_ra; + ld, S32, TMP0, X, rs1; + st, S32, TMP0, PC; exit; - })) + })) /* C.ADD adds the values in registers rd and rs2 and writes the result to * register rd. @@ -2193,27 +2226,30 @@ RVOP(cjalr, * the C.JALR and C.EBREAK instructions. The code points with rs2=x0 and rd=x0 * are HINTs. */ -RVOP(cadd, { - rv->X[ir->rd] = rv->X[ir->rs1] + rv->X[ir->rs2]; }, X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - alu32, 0x01, RBX, RAX; - st, S32, RAX, X, rd; - })) +RVOP( + cadd, + { rv->X[ir->rd] = rv->X[ir->rs1] + rv->X[ir->rs2]; }, + GEN({ + ld, S32, TMP0, X, rs1; + ld, S32, TMP1, X, rs2; + alu32, 0x01, TMP1, TMP0; + st, S32, TMP0, X, rd; + })) /* C.SWSP */ -RVOP(cswsp, - { +RVOP( + cswsp, + { const uint32_t addr = rv->X[rv_reg_sp] + ir->imm; RV_EXC_MISALIGN_HANDLER(3, store, true, 1); rv->io.mem_write_w(addr, rv->X[ir->rs2]); - }, - X64({ + }, + GEN({ mem; - ld, S32, RAX, X, rv_reg_sp; - ld_imm, RBX, mem; - alu64, 0x01, RBX, RAX; - ld, S32, RBX, X, rs2; - st, S32, RBX, RAX, 0; - })) + ld, S32, TMP0, X, rv_reg_sp; + ld_imm, TMP1, mem; + alu64, 0x01, TMP1, TMP0; + ld, S32, TMP1, X, rs2; + st, S32, TMP1, TMP0, 0; + })) #endif diff --git a/src/utils.h b/src/utils.h index 7e04cd0a8..fefcb74ef 100644 --- a/src/utils.h +++ b/src/utils.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include @@ -47,3 +48,11 @@ void rv_clock_gettime(struct timespec *tp); * https://9p.io/sys/doc/lexnames.html */ char *sanitize_path(const char *input); + +static inline uintptr_t align_up(uintptr_t sz, size_t alignment) +{ + uintptr_t mask = alignment - 1; + if ((alignment & mask) == 0) + return ((sz + mask) & ~mask); + return (((sz + mask) / alignment) * alignment); +} diff --git a/tools/gen-jit-template.py b/tools/gen-jit-template.py index 83b522815..0e87e45da 100755 --- a/tools/gen-jit-template.py +++ 
b/tools/gen-jit-template.py @@ -110,23 +110,18 @@ def parse_argv(EXT_LIST, SKIP_LIST): for ext in EXT_LIST: SKIP_LIST += INSN[ext] - -def remove_comment(str): - str = re.sub(r'//[\s|\S]+?\n', "", str) - return re.sub(r'/\*[\s|\S]+?\*/\n', "", str) - - # parse_argv(EXT_LIST, SKIP_LIST) # prepare PROLOGUE output = "" f = open('src/rv32_template.c', 'r') lines = f.read() +# remove_comment +lines = re.sub(r'/\*[\s|\S]+?\*/', "", lines) # remove exception handler lines = re.sub(r'RV_EXC[\S]+?\([\S|\s]+?\);\s', "", lines) # collect functions emulate_funcs = re.findall(r'RVOP\([\s|\S]+?}\)', lines) -codegen_funcs = re.findall(r'X64\([\s|\S]+?}\)', lines) - +codegen_funcs = re.findall(r'GEN\([\s|\S]+?}\)', lines) op = [] impl = [] for i in range(len(emulate_funcs)): @@ -136,6 +131,7 @@ def remove_comment(str): f.close() fields = {"imm", "pc", "rs1", "rs2", "rd", "shamt", "branch_taken", "branch_untaken"} +temp_regs = {"TMP0", "TMP1", "TMP2"} # generate jit template for i in range(len(op)): if (not SKIP_LIST.count(op[i])): @@ -149,6 +145,8 @@ def remove_comment(str): for i in range(len(items)): if items[i] in fields: items[i] = "ir->" + items[i] + if items[i] in temp_regs: + items[i] = "temp_reg[" + items[i][-1] + "]" if items[0] == "alu32_imm": if len(items) == 8: asm = "emit_alu32_imm{}(state, {}, {}, {}, ({}{}_t) {});".format( @@ -219,7 +217,7 @@ def remove_comment(str): elif items[0] == "set_jmp_off": asm = "uint32_t jump_loc = state->offset;" elif items[0] == "jmp_off": - asm = "emit_jump_target_offset(state, jump_loc + 2, state->offset);" + asm = "emit_jump_target_offset(state, JUMP_LOC, state->offset);" elif items[0] == "mem": asm = "memory_t *m = ((state_t *) rv->userdata)->mem;" elif items[0] == "call":