From daa7750787d76381faad8c77618f6bee1c274a7f Mon Sep 17 00:00:00 2001 From: Yen-Fu Chen Date: Fri, 22 Dec 2023 22:48:39 +0800 Subject: [PATCH] Introduce a tier-1 JIT compiler based on aarch64 architecture We follow the template and API of X64 to implement A64 tier-1 JIT compiler. * Performance | Metric | rv32emu-T1C | qemu | |----------+-------------+-------| |aes | 0.034| 0.045| |puzzle | 0.0115| 0.0169| |pi | 0.035| 0.032| |dhrystone | 1.914| 2.005| |Nqueens | 3.87| 2.898| |qsort-O2 | 7.819| 11.614| |miniz-O2 | 7.604| 3.803| |primes-O2 | 10.551| 5.986| |sha512-O2 | 6.497| 2.853| |stream | 52.25| 45.776| As demonstrated in the memory usage analysis below, the tier-1 JIT compiler utilizes less memory than QEMU across all benchmarks. * Memory usage | Metric | rv32emu-T1C | qemu | |----------+-------------+---------| |aes | 183,212|1,265,962| |puzzle | 145,239| 891,357| |pi | 144,739| 872,525| |dhrystone | 146,282| 853,256| |Nqueens | 146,696| 854,174| |qsort-O2 | 146,907| 856,721| |miniz-O2 | 157,475| 999,897| |primes-O2 | 142,356| 851,661| |sha512-O2 | 145,369| 901,136| |stream | 157,975| 955,809| Related: #238 --- Makefile | 8 +- src/emulate.c | 2 +- src/jit.c | 1528 +++++++++++++++++++++++++++++++++++++ src/jit.h | 43 ++ src/jit_x64.c | 577 -------------- src/jit_x64.h | 407 ---------- src/riscv.c | 2 +- src/rv32_template.c | 612 +++++++-------- tools/gen-jit-template.py | 5 +- 9 files changed, 1888 insertions(+), 1296 deletions(-) create mode 100644 src/jit.c create mode 100644 src/jit.h delete mode 100644 src/jit_x64.c delete mode 100644 src/jit_x64.h diff --git a/Makefile b/Makefile index 5002c79ed..f8fe8e779 100644 --- a/Makefile +++ b/Makefile @@ -121,15 +121,17 @@ endif ENABLE_JIT ?= 0 $(call set-feature, JIT) ifeq ($(call has, JIT), 1) -OBJS_EXT += jit_x64.o +OBJS_EXT += jit.o ifneq ($(processor), x86_64) -$(error JIT mode only supports for x64 target currently.) 
+ifneq ($(processor), aarch64) +$(error JIT mode only supports for x64 and arm64 target currently.) +endif endif src/rv32_jit_template.c: $(Q)tools/gen-jit-template.py $(CFLAGS) > $@ -$(OUT)/jit_x64.o: src/jit_x64.c src/rv32_jit_template.c +$(OUT)/jit.o: src/jit.c src/rv32_jit_template.c $(VECHO) " CC\t$@\n" $(Q)$(CC) -o $@ $(CFLAGS) -c -MMD -MF $@.d $< endif diff --git a/src/emulate.c b/src/emulate.c index 8652f7688..f8b47334c 100644 --- a/src/emulate.c +++ b/src/emulate.c @@ -28,7 +28,7 @@ extern struct target_ops gdbstub_ops; #if RV32_HAS(JIT) #include "cache.h" -#include "jit_x64.h" +#include "jit.h" #endif /* Shortcuts for comparing each field of specified RISC-V instruction */ diff --git a/src/jit.c b/src/jit.c new file mode 100644 index 000000000..403f87475 --- /dev/null +++ b/src/jit.c @@ -0,0 +1,1528 @@ +/* + * rv32emu is freely redistributable under the MIT License. See the file + * "LICENSE" for information on usage and redistribution of this file. + */ + +/* This JIT implementation has undergone extensive modifications, heavily + * relying on the ubpf_jit_x86_64.[ch] from ubpf. The original + * ubpf_jit_x86_64.[ch] file served as the foundation and source of inspiration + * for adapting and tailoring it specifically for this JIT implementation. + * Therefore, credit and sincere thanks are extended to ubpf for their + * invaluable work. + * + * Reference: + * https://github.com/iovisor/ubpf/blob/main/vm/ubpf_jit_x86_64.c + */ + +#if !defined(__x86_64__) && !defined(__aarch64__) +#error "This implementation is dedicated to x64 and arm64." 
+#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "cache.h" +#include "decode.h" +#include "io.h" +#include "jit.h" +#include "state.h" +#include "utils.h" + + +#define JIT_CLS_MASK 0x07 +#define JIT_ALU_OP_MASK 0xf0 +#define JIT_CLS_ALU 0x04 +#define JIT_CLS_ALU64 0x07 +#define JIT_SRC_IMM 0x00 +#define JIT_SRC_REG 0x08 +#define JIT_OP_MUL_IMM (JIT_CLS_ALU | JIT_SRC_IMM | 0x20) +#define JIT_OP_MUL_REG (JIT_CLS_ALU | JIT_SRC_REG | 0x20) +#define JIT_OP_DIV_IMM (JIT_CLS_ALU | JIT_SRC_IMM | 0x30) +#define JIT_OP_DIV_REG (JIT_CLS_ALU | JIT_SRC_REG | 0x30) +#define JIT_OP_MOD_IMM (JIT_CLS_ALU | JIT_SRC_IMM | 0x90) +#define JIT_OP_MOD_REG (JIT_CLS_ALU | JIT_SRC_REG | 0x90) + +#define STACK_SIZE 512 +#define MAX_INSNS 1024 +#if defined(__x86_64__) +#define JUMP_LOC jump_loc + 2 +/* Special values for target_pc in struct jump */ +#define TARGET_PC_EXIT -1U +#define TARGET_PC_RETPOLINE -3U +enum X64_REG { + RAX, + RCX, + RDX, + RBX, + RSP, + RBP, + RIP = 5, + RSI, + RDI, + R8, + R9, + R10, + R11, + R12, + R13, + R14, + R15, +}; + +#elif defined(__aarch64__) +#define JUMP_LOC jump_loc +/* Special values for target_pc in struct jump */ +#define TARGET_PC_EXIT ~UINT32_C(0) +#define TARGET_PC_ENTER (~UINT32_C(0) & 0x0101) +// This is guaranteed to be an illegal A64 instruction. 
+#define BAD_OPCODE ~UINT32_C(0) + +enum A64_REG { + R0, + R1, + R2, + R3, + R4, + R5, + R6, + R7, + R8, + R9, + R10, + R11, + R12, + R13, + R14, + R15, + R16, + R17, + R18, + R19, + R20, + R21, + R22, + R23, + R24, + R25, + R26, + R27, + R28, + R29, + R30, + SP, + RZ = 31 +}; + +enum Opcode { + /* AddSubOpcode */ + AS_ADD = 0, + AS_SUB = 2, + AS_SUBS = 3, + /* LogicalOpcode */ + LOG_AND = 0x00000000U, // 0000_0000_0000_0000_0000_0000_0000_0000 + LOG_ORR = 0x20000000U, // 0010_0000_0000_0000_0000_0000_0000_0000 + LOG_ORN = 0x20200000U, // 0010_0000_0010_0000_0000_0000_0000_0000 + LOG_EOR = 0x40000000U, // 0100_0000_0000_0000_0000_0000_0000_0000 + /* LoadStoreOpcode */ + LS_STRB = 0x00000000U, // 0000_0000_0000_0000_0000_0000_0000_0000 + LS_LDRB = 0x00400000U, // 0000_0000_0100_0000_0000_0000_0000_0000 + LS_LDRSBW = 0x00c00000U, // 0000_0000_1100_0000_0000_0000_0000_0000 + LS_STRH = 0x40000000U, // 0100_0000_0000_0000_0000_0000_0000_0000 + LS_LDRH = 0x40400000U, // 0100_0000_0100_0000_0000_0000_0000_0000 + LS_LDRSHW = 0x40c00000U, // 0100_0000_1100_0000_0000_0000_0000_0000 + LS_STRW = 0x80000000U, // 1000_0000_0000_0000_0000_0000_0000_0000 + LS_LDRW = 0x80400000U, // 1000_0000_0100_0000_0000_0000_0000_0000 + LS_STRX = 0xc0000000U, // 1100_0000_0000_0000_0000_0000_0000_0000 + LS_LDRX = 0xc0400000U, // 1100_0000_0100_0000_0000_0000_0000_0000 + /* LoadStorePairOpcode */ + LSP_STPX = 0xa9000000U, // 1010_1001_0000_0000_0000_0000_0000_0000 + LSP_LDPX = 0xa9400000U, // 1010_1001_0100_0000_0000_0000_0000_0000 + /* UnconditionalBranchOpcode */ + BR_BR = 0xd61f0000U, // 1101_0110_0001_1111_0000_0000_0000_0000 + BR_BLR = 0xd63f0000U, // 1101_0110_0011_1111_0000_0000_0000_0000 + BR_RET = 0xd65f0000U, // 1101_0110_0101_1111_0000_0000_0000_0000 + /* UnconditionalBranchImmediateOpcode */ + UBR_B = 0x14000000U, // 0001_0100_0000_0000_0000_0000_0000_0000 + /* ConditionalBranchImmediateOpcode */ + BR_Bcond = 0x54000000U, + /* DP2Opcode */ + DP2_UDIV = 0x1ac00800U, // 
0001_1010_1100_0000_0000_1000_0000_0000 + DP2_LSLV = 0x1ac02000U, // 0001_1010_1100_0000_0010_0000_0000_0000 + DP2_LSRV = 0x1ac02400U, // 0001_1010_1100_0000_0010_0100_0000_0000 + DP2_ASRV = 0x1ac02800U, // 0001_1010_1100_0000_0010_1000_0000_0000 + /* DP3Opcode */ + DP3_MADD = 0x1b000000U, // 0001_1011_0000_0000_0000_0000_0000_0000 + DP3_MSUB = 0x1b008000U, // 0001_1011_0000_0000_1000_0000_0000_0000 + /* MoveWideOpcode */ + MW_MOVN = 0x12800000U, // 0001_0010_1000_0000_0000_0000_0000_0000 + MW_MOVZ = 0x52800000U, // 0101_0010_1000_0000_0000_0000_0000_0000 + MW_MOVK = 0x72800000U, // 0111_0010_1000_0000_0000_0000_0000_0000 +}; + +enum Condition { + COND_EQ, + COND_NE, + COND_HS, + COND_LO, + COND_GE = 10, + COND_LT = 11, +}; +#endif + +enum VM_REG { + VM_REG_0 = 0, + VM_REG_1, + VM_REG_2, + VM_REG_3, + VM_REG_4, + VM_REG_5, + VM_REG_6, + VM_REG_7, + VM_REG_8, + VM_REG_9, + VM_REG_10, + N_VM_REGS, +}; + +enum operand_size { + S8, + S16, + S32, +}; + +#define REGISTER_MAP_SIZE 11 + +#if defined(__x86_64__) +/* There are two common x86-64 calling conventions, discussed at: + * https://en.wikipedia.org/wiki/X64_calling_conventions#x86-64_calling_conventions + * + * Please note: R12 is an exception and is *not* being used. Consequently, it + * is omitted from the list of non-volatile registers for both platforms, + * despite being non-volatile. 
+ */ +#if defined(_WIN32) +static const int nonvolatile_reg[] = {RBP, RBX, RDI, RSI, R13, R14, R15}; +static const int parameter_reg[] = {RCX, RDX, R8, R9}; +#define RCX_ALT R10 +static const int register_map[REGISTER_MAP_SIZE] = { + RAX, R10, RDX, R8, R9, R14, R15, RDI, RSI, RBX, RBP, +}; +#else +#define RCX_ALT R9 +static const int nonvolatile_reg[] = {RBP, RBX, R13, R14, R15}; +static const int parameter_reg[] = {RDI, RSI, RDX, RCX, R8, R9}; +static const int temp_reg[] = {RAX, RBX, RCX}; +static const int register_map[REGISTER_MAP_SIZE] = { + RAX, RDI, RSI, RDX, R9, R8, RBX, R13, R14, R15, RBP, +}; +#endif +#elif defined(__aarch64__) +/* + * AArch64 uses a single calling convention, the Procedure Call Standard for + * the Arm 64-bit Architecture (AAPCS64). + * + * Please Note: unlike in the x86-64 section above, R12 needs no special + * treatment here. On AArch64 it is a caller-saved scratch register that the + * register tables below simply leave unused. + * + * BPF R0-R4 are "volatile" + * BPF R5-R10 are "non-volatile" + * In general, we attempt to map BPF volatile registers to AArch64 volatile and + * BPF non-volatile to AArch64 non-volatile. + */ + +// Callee saved registers - this must be a multiple of two because of how we +// save the stack later on. 
+static const int callee_reg[] = {R19, R20, R21, R22, R23, R24, R25, R26}; +// Caller saved registers (and parameter registers) +static const int parameter_reg[] = {R0, R1, R2, R3, R4}; +static const int temp_reg[] = {R6, R7, R8}; +// Temp register for immediate generation +static const int temp_imm_reg = R24; +// Temp register for division results +static const int temp_div_reg = R25; + +// Register assignments: +// BPF Arm64 Usage +// r0 r5 Return value from calls (see note) +// r1 - r5 r0 - r4 Function parameters, caller-saved +// r6 - r10 r19 - r23 Callee-saved registers +// r24 Temp - used for generating 32-bit immediates +// r25 Temp - used for modulus calculations +// r26 Temp - used for large load/store offsets +// +// Note that the AArch64 ABI uses r0 both for function parameters and result. We +// use r5 to hold the result during the function and do an extra final move at +// the end of the function to copy the result to the correct place. +static const int register_map[REGISTER_MAP_SIZE] = { + R5, // result + R0, R1, R2, R3, + R4, // parameters + R19, R20, R21, R22, + R23, // callee-saved +}; +static inline void emit_load_imm(struct jit_state *state, int dst, int64_t imm); +#endif + +/* Return the host register for the given JIT register */ +static int map_register(int r) +{ + assert(r < N_VM_REGS); + return register_map[r % N_VM_REGS]; +} + +static inline void offset_map_insert(struct jit_state *state, int32_t target_pc) +{ + struct offset_map *map_entry = &state->offset_map[state->num_insn++]; + map_entry->PC = target_pc; + map_entry->offset = state->offset; +} + + +#define sys_icache_invalidate(addr, size) \ + __builtin___clear_cache((char *) (addr), (char *) (addr) + (size)); + +static inline void emit_bytes(struct jit_state *state, void *data, uint32_t len) +{ + assert(state->offset <= state->size - len); + if ((state->offset + len) > state->size) { + state->offset = state->size; + return; + } + memcpy(state->buf + state->offset, data, len); + 
sys_icache_invalidate(state->buf + state->offset, len); + state->offset += len; +} + +#if defined(__x86_64__) +static inline void emit1(struct jit_state *state, uint8_t x) +{ + emit_bytes(state, &x, sizeof(x)); +} + +static inline void emit2(struct jit_state *state, uint16_t x) +{ + emit_bytes(state, &x, sizeof(x)); +} + +static inline void emit4(struct jit_state *state, uint32_t x) +{ + emit_bytes(state, &x, sizeof(x)); +} + +static inline void emit8(struct jit_state *state, uint64_t x) +{ + emit_bytes(state, &x, sizeof(x)); +} + +static inline void emit_modrm(struct jit_state *state, int mod, int r, int m) +{ + assert(!(mod & ~0xc0)); + emit1(state, (mod & 0xc0) | ((r & 7) << 3) | (m & 7)); +} + +static inline void emit_modrm_reg2reg(struct jit_state *state, int r, int m) +{ + emit_modrm(state, 0xc0, r, m); +} + +static inline void emit_modrm_and_displacement(struct jit_state *state, + int r, + int m, + int32_t d) +{ + if (d == 0 && (m & 7) != RBP) { + emit_modrm(state, 0x00, r, m); + } else if (d >= -128 && d <= 127) { + emit_modrm(state, 0x40, r, m); + emit1(state, d); + } else { + emit_modrm(state, 0x80, r, m); + emit4(state, d); + } +} + +static inline void emit_rex(struct jit_state *state, int w, int r, int x, int b) +{ + assert(!(w & ~1)); + assert(!(r & ~1)); + assert(!(x & ~1)); + assert(!(b & ~1)); + emit1(state, 0x40 | (w << 3) | (r << 2) | (x << 1) | b); +} + +/* Emit a REX prefix incorporating the top bit of both src and dst. This step is + * skipped if no bits are set. 
+ */ +static inline void emit_basic_rex(struct jit_state *state, + int w, + int src, + int dst) +{ + if (w || (src & 8) || (dst & 8)) + emit_rex(state, w, !!(src & 8), 0, !!(dst & 8)); +} + +static inline void emit_push(struct jit_state *state, int r) +{ + emit_basic_rex(state, 0, 0, r); + emit1(state, 0x50 | (r & 7)); +} + +static inline void emit_pop(struct jit_state *state, int r) +{ + emit_basic_rex(state, 0, 0, r); + emit1(state, 0x58 | (r & 7)); +} + +static inline void emit_jump_target_address(struct jit_state *state, + int32_t target_pc) +{ + struct jump *jump = &state->jumps[state->num_jumps++]; + jump->offset_loc = state->offset; + jump->target_pc = target_pc; + emit4(state, 0); +} +#elif defined(__aarch64__) +static inline uint32_t align_to(uint32_t amount, uint64_t boundary) +{ + return (amount + (boundary - 1)) & ~(boundary - 1); +} + +static void emit_instruction(struct jit_state *state, uint32_t instr) +{ + assert(instr != BAD_OPCODE); + emit_bytes(state, &instr, 4); +} + +/* Get the value of the size bit in most instruction encodings (bit 31). */ +static uint32_t sz(bool sixty_four) +{ + return (sixty_four ? UINT32_C(1) : UINT32_C(0)) << 31; +} + +/* [ArmARM-A H.a]: C4.1.64: Add/subtract (immediate). */ +static void emit_addsub_immediate(struct jit_state *state, + bool sixty_four, + enum Opcode op, + int rd, + int rn, + uint32_t imm12) +{ + const uint32_t imm_op_base = 0x11000000; + emit_instruction(state, sz(sixty_four) | (op << 29) | imm_op_base | + (0 << 22) | (imm12 << 10) | (rn << 5) | rd); +} + +/* [ArmARM-A H.a]: C4.1.67: Logical (shifted register). */ +static void emit_logical_register(struct jit_state *state, + bool sixty_four, + enum Opcode op, + int rd, + int rn, + int rm) +{ + emit_instruction(state, sz(sixty_four) | op | (1 << 27) | (1 << 25) | + (rm << 16) | (rn << 5) | rd); +} + +/* [ArmARM-A H.a]: C4.1.67: Add/subtract (shifted register). 
*/ +static inline void emit_addsub_register(struct jit_state *state, + bool sixty_four, + enum Opcode op, + int rd, + int rn, + int rm) +{ + const uint32_t reg_op_base = 0x0b000000; + emit_instruction(state, sz(sixty_four) | (op << 29) | reg_op_base | + (rm << 16) | (rn << 5) | rd); +} + +/* [ArmARM-A H.a]: C4.1.64: Move wide (Immediate). */ +static inline void emit_movewide_immediate(struct jit_state *state, + bool sixty_four, + int rd, + uint64_t imm) +{ + /* Emit a MOVZ or MOVN followed by a sequence of MOVKs to generate the + * 64-bit constant in imm. See whether the 0x0000 or 0xffff pattern is more + * common in the immediate. This ensures we produce the fewest number of + * immediates. + */ + unsigned count0000 = sixty_four ? 0 : 2; + unsigned countffff = 0; + for (unsigned i = 0; i < (sixty_four ? 64 : 32); i += 16) { + uint64_t block = (imm >> i) & 0xffff; + if (block == 0xffff) { + ++countffff; + } else if (block == 0) { + ++count0000; + } + } + + /* Iterate over 16-bit elements of imm, outputting an appropriate move + * instruction. */ + bool invert = (count0000 < countffff); + enum Opcode op = invert ? MW_MOVN : MW_MOVZ; + uint64_t skip_pattern = invert ? 0xffff : 0; + for (unsigned i = 0; i < (sixty_four ? 4 : 2); ++i) { + uint64_t imm16 = (imm >> (i * 16)) & 0xffff; + if (imm16 != skip_pattern) { + if (invert) { + imm16 = ~imm16; + imm16 &= 0xffff; + } + emit_instruction( + state, sz(sixty_four) | op | (i << 21) | (imm16 << 5) | rd); + op = MW_MOVK; + invert = false; + } + } + + /* Tidy up for the case imm = 0 or imm == -1. */ + if (op != MW_MOVK) { + emit_instruction(state, + sz(sixty_four) | op | (0 << 21) | (0 << 5) | rd); + } +} + +/* [ArmARM-A H.a]: C4.1.66: Load/store register (unscaled immediate). 
*/ +static void emit_loadstore_immediate(struct jit_state *state, + enum Opcode op, + int rt, + int rn, + int16_t imm9) +{ + const uint32_t imm_op_base = 0x38000000U; + assert(imm9 >= -256 && imm9 < 256); + imm9 &= 0x1ff; + emit_instruction(state, imm_op_base | op | (imm9 << 12) | (rn << 5) | rt); +} + +/* [ArmARM-A H.a]: C4.1.66: Load/store register pair (offset). */ +static void emit_loadstorepair_immediate(struct jit_state *state, + enum Opcode op, + int rt, + int rt2, + int rn, + int32_t imm7) +{ + int32_t imm_div = ((op == LSP_STPX) || (op == LSP_LDPX)) ? 8 : 4; + assert(imm7 % imm_div == 0); + imm7 /= imm_div; + emit_instruction(state, op | (imm7 << 15) | (rt2 << 10) | (rn << 5) | rt); +} + +/* [ArmARM-A H.a]: C4.1.65: Unconditional branch (register). */ +static void emit_unconditionalbranch_register(struct jit_state *state, + enum Opcode op, + int rn) +{ + emit_instruction(state, op | (rn << 5)); +} + +/* [ArmARM-A H.a]: C4.1.67: Data-processing (2 source). */ +static void emit_dataprocessing_twosource(struct jit_state *state, + bool sixty_four, + enum Opcode op, + int rd, + int rn, + int rm) +{ + emit_instruction(state, sz(sixty_four) | op | (rm << 16) | (rn << 5) | rd); +} + + +/* [ArmARM-A H.a]: C4.1.67: Data-processing (3 source). */ +static void emit_dataprocessing_threesource(struct jit_state *state, + bool sixty_four, + enum Opcode op, + int rd, + int rn, + int rm, + int ra) +{ + emit_instruction( + state, sz(sixty_four) | op | (rm << 16) | (ra << 10) | (rn << 5) | rd); +} + +static void update_branch_immediate(struct jit_state *state, + uint32_t offset, + int32_t imm) +{ + assert((imm & 3) == 0); + uint32_t instr; + imm >>= 2; + memcpy(&instr, state->buf + offset, sizeof(uint32_t)); + if ((instr & 0xfe000000U) == 0x54000000U /* Conditional branch immediate. */ + || (instr & 0x7e000000U) == + 0x34000000U) { /* Compare and branch immediate. 
*/ + assert((imm >> 19) == INT64_C(-1) || (imm >> 19) == 0); + instr |= (imm & 0x7ffff) << 5; + } else if ((instr & 0x7c000000U) == 0x14000000U) { + /* Unconditional branch immediate. */ + assert((imm >> 26) == INT64_C(-1) || (imm >> 26) == 0); + instr |= (imm & 0x03ffffffU) << 0; + } else { + assert(false); + instr = BAD_OPCODE; + } + memcpy(state->buf + offset, &instr, sizeof(uint32_t)); +} +#endif + +static inline void emit_jump_target_offset(struct jit_state *state, + uint32_t jump_loc, + uint32_t jump_state_offset) +{ + struct jump *jump = &state->jumps[state->num_jumps++]; + jump->offset_loc = jump_loc; + jump->target_offset = jump_state_offset; +} + +/* The REX prefix and ModRM byte are emitted. + * The MR encoding is utilized when a choice is available. The 'src' is often + * used as an opcode extension. + */ +static inline void emit_alu32(struct jit_state *state, int op, int src, int dst) +{ +#if defined(__x86_64__) + emit_basic_rex(state, 0, src, dst); + emit1(state, op); + emit_modrm_reg2reg(state, src, dst); +#elif defined(__aarch64__) + switch (op) { + case 1: /* ADD */ + emit_addsub_register(state, false, AS_ADD, dst, dst, src); + break; + case 0x29: /* SUB */ + emit_addsub_register(state, false, AS_SUB, dst, dst, src); + break; + case 0x31: /* XOR */ + emit_logical_register(state, false, LOG_EOR, dst, dst, src); + break; + case 9: /* OR */ + emit_logical_register(state, false, LOG_ORR, dst, dst, src); + break; + case 0x21: /* AND */ + emit_logical_register(state, false, LOG_AND, dst, dst, src); + break; + case 0xd3: + if (src == 4) /* SLL */ + emit_dataprocessing_twosource(state, false, DP2_LSLV, dst, dst, R8); + else if (src == 5) /* SRL */ + emit_dataprocessing_twosource(state, false, DP2_LSRV, dst, dst, R8); + else if (src == 7) /* SRA */ + emit_dataprocessing_twosource(state, false, DP2_ASRV, dst, dst, R8); + break; + } +#endif +} + +/* REX prefix, ModRM byte, and 32-bit immediate */ +static inline void emit_alu32_imm32(struct jit_state *state, + 
int op UNUSED, + int src, + int dst, + int32_t imm) +{ +#if defined(__x86_64__) + emit_alu32(state, op, src, dst); + emit4(state, imm); +#elif defined(__aarch64__) + switch (src) { + case 0: + emit_load_imm(state, R10, imm); + emit_addsub_register(state, false, AS_ADD, dst, dst, R10); + break; + case 1: + emit_load_imm(state, R10, imm); + emit_logical_register(state, false, LOG_ORR, dst, dst, R10); + break; + case 4: + emit_load_imm(state, R10, imm); + emit_logical_register(state, false, LOG_AND, dst, dst, R10); + break; + case 6: + emit_load_imm(state, R10, imm); + emit_logical_register(state, false, LOG_EOR, dst, src, R10); + break; + } +#endif +} + +/* REX prefix, ModRM byte, and 8-bit immediate */ +static inline void emit_alu32_imm8(struct jit_state *state, + int op UNUSED, + int src, + int dst, + int8_t imm) +{ +#if defined(__x86_64__) + emit_alu32(state, op, src, dst); + emit1(state, imm); +#elif defined(__aarch64__) + switch (src) { + case 4: + emit_load_imm(state, R10, imm); + emit_dataprocessing_twosource(state, false, DP2_LSLV, dst, dst, R10); + break; + case 5: + emit_load_imm(state, R10, imm); + emit_dataprocessing_twosource(state, false, DP2_LSRV, dst, dst, R10); + break; + case 7: + emit_load_imm(state, R10, imm); + emit_dataprocessing_twosource(state, false, DP2_ASRV, dst, dst, R10); + break; + } +#endif +} + +/* The REX.W prefix and ModRM byte are emitted. + * The MR encoding is used when there is a choice. 'src' is often used as + * an opcode extension. 
+ */ +static inline void emit_alu64(struct jit_state *state, int op, int src, int dst) +{ +#if defined(__x86_64__) + emit_basic_rex(state, 1, src, dst); + emit1(state, op); + emit_modrm_reg2reg(state, src, dst); +#elif defined(__aarch64__) + if (op == 0x01) + emit_addsub_register(state, true, AS_ADD, dst, dst, src); +#endif +} + +/* REX.W prefix, ModRM byte, and 8-bit immediate */ +static inline void emit_alu64_imm8(struct jit_state *state, + int op, + int src UNUSED, + int dst, + int8_t imm) +{ +#if defined(__x86_64__) + + emit_alu64(state, op, src, dst); + emit1(state, imm); +#elif defined(__aarch64__) + if (op == 0xc1) { + emit_load_imm(state, R10, imm); + emit_dataprocessing_twosource(state, true, DP2_LSRV, dst, dst, R10); + } +#endif +} + +#if defined(__x86_64__) +/* Register to register mov */ +static inline void emit_mov(struct jit_state *state, int src, int dst) +{ + emit_alu64(state, 0x89, src, dst); +} + +/* REX.W prefix, ModRM byte, and 32-bit immediate */ +static inline void emit_alu64_imm32(struct jit_state *state, + int op, + int src, + int dst, + int32_t imm) +{ + emit_alu64(state, op, src, dst); + emit4(state, imm); +} +#elif defined(__aarch64__) +static void divmod(struct jit_state *state, + uint8_t opcode, + int rd, + int rn, + int rm) +{ + bool mod = (opcode & JIT_ALU_OP_MASK) == (JIT_OP_MOD_IMM & JIT_ALU_OP_MASK); + bool sixty_four = (opcode & JIT_CLS_MASK) == JIT_CLS_ALU64; + int div_dest = mod ? temp_div_reg : rd; + + /* Do not need to treat divide by zero as special because the UDIV + * instruction already returns 0 when dividing by zero. 
+ */ + emit_dataprocessing_twosource(state, sixty_four, DP2_UDIV, div_dest, rn, + rm); + if (mod) { + emit_dataprocessing_threesource(state, sixty_four, DP3_MSUB, rd, rm, + div_dest, rn); + } +} +#endif + +static inline void emit_cmp_imm32(struct jit_state *state, int dst, int32_t imm) +{ +#if defined(__x86_64__) + emit_alu64_imm32(state, 0x81, 7, dst, imm); +#elif defined(__aarch64__) + emit_load_imm(state, R10, imm); + emit_addsub_register(state, false, AS_SUBS, RZ, dst, R10); +#endif +} + +static inline void emit_cmp32(struct jit_state *state, int src, int dst) +{ +#if defined(__x86_64__) + emit_alu32(state, 0x39, src, dst); +#elif defined(__aarch64__) + emit_addsub_register(state, false, AS_SUBS, RZ, dst, src); +#endif +} + +static inline void emit_jcc_offset(struct jit_state *state, int code) +{ +#if defined(__x86_64__) + emit1(state, 0x0f); + emit1(state, code); + emit4(state, 0); +#elif defined(__aarch64__) + switch (code) { + case 0x84: /* BEQ */ + code = COND_EQ; + break; + case 0x85: /* BNE */ + code = COND_NE; + break; + case 0x8c: /* BLT */ + code = COND_LT; + break; + case 0x8d: /* BGE */ + code = COND_GE; + break; + case 0x82: /* BLTU */ + code = COND_LO; + break; + case 0x83: /* BGEU */ + code = COND_HS; + break; + } + emit_instruction(state, BR_Bcond | (0 << 5) | code); +#endif +} + +/* Load [src + offset] into dst */ +static inline void emit_load(struct jit_state *state, + enum operand_size size, + int src, + int dst, + int32_t offset) +{ +#if defined(__x86_64__) + if (size == S8 || size == S16) { + /* movzx */ + emit1(state, 0x0f); + emit1(state, size == S8 ? 
0xb6 : 0xb7); + } else if (size == S32) { + /* mov */ + emit1(state, 0x8b); + } + + emit_modrm_and_displacement(state, dst, src, offset); +#elif defined(__aarch64__) + if (size == S8) + emit_loadstore_immediate(state, LS_LDRB, dst, src, offset); + else if (size == S16) + emit_loadstore_immediate(state, LS_LDRH, dst, src, offset); + else if (size == S32) + emit_loadstore_immediate(state, LS_LDRW, dst, src, offset); +#endif +} + +static inline void emit_load_sext(struct jit_state *state, + enum operand_size size, + int src, + int dst, + int32_t offset) +{ +#if defined(__x86_64__) + if (size == S8 || size == S16) { + /* movsx */ + emit1(state, 0x0f); + emit1(state, size == S8 ? 0xbe : 0xbf); + } else if (size == S32) { + emit_basic_rex(state, 1, dst, src); + emit1(state, 0x63); + } + + emit_modrm_and_displacement(state, dst, src, offset); +#elif defined(__aarch64__) + if (size == S8) + emit_loadstore_immediate(state, LS_LDRSBW, dst, src, offset); + else if (size == S16) + emit_loadstore_immediate(state, LS_LDRSHW, dst, src, offset); +#endif +} + +/* Load sign-extended immediate into register */ +static inline void emit_load_imm(struct jit_state *state, int dst, int64_t imm) +{ +#if defined(__x86_64__) + if (imm >= INT32_MIN && imm <= INT32_MAX) + emit_alu64_imm32(state, 0xc7, 0, dst, imm); + else { + /* movabs $imm, dst */ + emit_basic_rex(state, 1, 0, dst); + emit1(state, 0xb8 | (dst & 7)); + emit8(state, imm); + } +#elif defined(__aarch64__) + if (imm >= INT32_MIN && imm <= INT32_MAX) + emit_movewide_immediate(state, false, dst, imm); + else + emit_movewide_immediate(state, true, dst, imm); +#endif +} + +/* Store register src to [dst + offset] */ +static inline void emit_store(struct jit_state *state, + enum operand_size size, + int src, + int dst, + int32_t offset) +{ +#if defined(__x86_64__) + if (size == S16) + emit1(state, 0x66); /* 16-bit override */ + emit1(state, size == S8 ? 
0x88 : 0x89); + emit_modrm_and_displacement(state, src, dst, offset); +#elif defined(__aarch64__) + if (size == S8) + emit_loadstore_immediate(state, LS_STRB, src, dst, offset); + else if (size == S16) + emit_loadstore_immediate(state, LS_STRH, src, dst, offset); + else if (size == S32) + emit_loadstore_immediate(state, LS_STRW, src, dst, offset); +#endif +} + +/* Store immediate to [dst + offset] */ +static inline void emit_store_imm32(struct jit_state *state, + enum operand_size size, + int dst, + int32_t offset, + int32_t imm) +{ +#if defined(__x86_64__) + if (size == S16) + emit1(state, 0x66); /* 16-bit override */ + emit1(state, size == S8 ? 0xc6 : 0xc7); + emit_modrm_and_displacement(state, 0, dst, offset); + if (size == S32) + emit4(state, imm); + else if (size == S16) + emit2(state, imm); + else if (size == S8) + emit1(state, imm); +#elif defined(__aarch64__) + emit_load_imm(state, R10, imm); + emit_store(state, size, R10, dst, offset); +#endif +} + +static inline void emit_jmp(struct jit_state *state, uint32_t target_pc) +{ +#if defined(__x86_64__) + emit1(state, 0xe9); + emit_jump_target_address(state, target_pc); +#elif defined(__aarch64__) + struct jump *jump = &state->jumps[state->num_jumps++]; + jump->offset_loc = state->offset; + jump->target_pc = target_pc; + emit_instruction(state, UBR_B); +#endif +} + +static inline void emit_call(struct jit_state *state, intptr_t target) +{ +#if defined(__x86_64__) + emit_load_imm(state, RAX, target); + /* callq *%rax */ + emit1(state, 0xff); + /* ModR/M byte: b11010000b = xd0, rax is register 0 */ + emit1(state, 0xd0); +#elif defined(__aarch64__) + uint32_t stack_movement = align_to(8, 16); + emit_addsub_immediate(state, true, AS_SUB, SP, SP, stack_movement); + emit_loadstore_immediate(state, LS_STRX, R30, SP, 0); + + emit_movewide_immediate(state, true, temp_imm_reg, target); + emit_unconditionalbranch_register(state, BR_BLR, temp_imm_reg); + + /* On exit need to move result from r0 to whichever register we've 
mapped + * EBPF r0 to. */ + int dest = map_register(0); + if (dest != R0) { + emit_logical_register(state, true, LOG_ORR, dest, RZ, R0); + } + + emit_loadstore_immediate(state, LS_LDRX, R30, SP, 0); + emit_addsub_immediate(state, true, AS_ADD, SP, SP, stack_movement); +#endif +} + +static inline void emit_exit(struct jit_state *state) +{ +#if defined(__x86_64__) + emit1(state, 0xe9); + emit_jump_target_offset(state, state->offset, state->exit_loc); + emit4(state, 0); +#elif defined(__aarch64__) + emit_jmp(state, TARGET_PC_EXIT); +#endif +} + +#if RV32_HAS(EXT_M) +static void muldivmod(struct jit_state *state, + uint8_t opcode, + int src, + int dst, + int32_t imm UNUSED) +{ +#if defined(__x86_64__) + bool mul = (opcode & JIT_ALU_OP_MASK) == (JIT_OP_MUL_IMM & JIT_ALU_OP_MASK); + bool div = (opcode & JIT_ALU_OP_MASK) == (JIT_OP_DIV_IMM & JIT_ALU_OP_MASK); + bool mod = (opcode & JIT_ALU_OP_MASK) == (JIT_OP_MOD_IMM & JIT_ALU_OP_MASK); + bool is64 = (opcode & JIT_CLS_MASK) == JIT_CLS_ALU64; + bool reg = (opcode & JIT_SRC_REG) == JIT_SRC_REG; + + /* Short circuit for imm == 0 */ + if (!reg && imm == 0) { + assert(NULL); + if (div || mul) { + /* For division and multiplication, set result to zero. */ + emit_alu32(state, 0x31, dst, dst); + } else { + /* For modulo, set result to dividend. */ + emit_mov(state, dst, dst); + } + return; + } + + if (dst != RAX) + emit_push(state, RAX); + + if (dst != RDX) + emit_push(state, RDX); + + /* Load the divisor into RCX */ + if (imm) + emit_load_imm(state, RCX, imm); + else + emit_mov(state, src, RCX); + + /* Load the dividend into RAX */ + emit_mov(state, dst, RAX); + + /* The JIT employs two different semantics for division and modulus + * operations. In the case of division, if the divisor is zero, the result + * is set to zero. For modulus operations, if the divisor is zero, the + * result becomes the dividend. To manage this, we first set the divisor to + * 1 if it is initially zero. 
Then, we adjust the result accordingly: for + * division, we set it to zero if the original divisor was zero; for + * modulus, we set it to the dividend under the same condition. + */ + + if (div || mod) { + /* Check if divisor is zero */ + if (is64) + emit_alu64(state, 0x85, RCX, RCX); + else + emit_alu32(state, 0x85, RCX, RCX); + + /* Save the dividend for the modulo case */ + if (mod) + emit_push(state, RAX); /* Save dividend */ + + /* Save the result of the test */ + emit1(state, 0x9c); /* pushfq */ + + /* Set the divisor to 1 if it is zero */ + emit_load_imm(state, RDX, 1); + emit1(state, 0x48); + emit1(state, 0x0f); + emit1(state, 0x44); + emit1(state, 0xca); /* cmove rcx, rdx */ + + /* xor %edx,%edx */ + emit_alu32(state, 0x31, RDX, RDX); + } + + if (is64) + emit_rex(state, 1, 0, 0, 0); + + /* Multiply or divide */ + emit_alu32(state, 0xf7, mul ? 4 : 6, RCX); + + /* The division operation stores the remainder in RDX and the quotient + * in RAX. + */ + if (div || mod) { + /* Restore the result of the test */ + emit1(state, 0x9d); /* popfq */ + + /* If zero flag is set, then the divisor was zero. */ + + if (div) { + /* Set the dividend to zero if the divisor was zero. */ + emit_load_imm(state, RCX, 0); + + /* Store 0 in RAX if the divisor was zero. */ + /* Use conditional move to avoid a branch. */ + emit1(state, 0x48); + emit1(state, 0x0f); + emit1(state, 0x44); + emit1(state, 0xc1); /* cmove rax, rcx */ + } else { + /* Restore dividend to RCX */ + emit_pop(state, RCX); + + /* Store the dividend in RAX if the divisor was zero. */ + /* Use conditional move to avoid a branch. 
*/ + emit1(state, 0x48); + emit1(state, 0x0f); + emit1(state, 0x44); + emit1(state, 0xd1); /* cmove rdx, rcx */ + } + } + + if (dst != RDX) { + if (mod) + emit_mov(state, RDX, dst); + emit_pop(state, RDX); + } + if (dst != RAX) { + if (div || mul) + emit_mov(state, RAX, dst); + emit_pop(state, RAX); + } +#elif defined(__aarch64__) + switch (opcode) { + case 0x28: + emit_dataprocessing_threesource(state, false, DP3_MADD, dst, dst, src, + RZ); + break; + case 0x2f: + emit_dataprocessing_threesource(state, true, DP3_MADD, dst, dst, src, + RZ); + break; + case 0x38: + divmod(state, JIT_OP_DIV_REG, dst, dst, src); + break; + case 0x98: + divmod(state, JIT_OP_MOD_REG, dst, dst, src); + break; + } +#endif +} +#endif + +#define SET_SIZE_BITS 10 +#define SET_SIZE (1 << SET_SIZE_BITS) +#define SET_SLOTS_SIZE 32 +HASH_FUNC_IMPL(set_hash, SET_SIZE_BITS, 1 << SET_SIZE_BITS); + +/* The set consists of SET_SIZE buckets, with each bucket containing + * SET_SLOTS_SIZE slots. + */ +typedef struct { + uint32_t table[SET_SIZE][SET_SLOTS_SIZE]; +} set_t; + +/** + * set_reset - clear a set + * @set: a pointer points to target set + */ +static inline void set_reset(set_t *set) +{ + memset(set, 0, sizeof(set_t)); +} + +/** + * set_add - insert a new element into the set + * @set: a pointer points to target set + * @key: the key of the inserted entry + */ +static bool set_add(set_t *set, uint32_t key) +{ + const uint32_t index = set_hash(key); + uint8_t count = 0; + while (set->table[index][count]) { + if (set->table[index][count++] == key) + return false; + } + + set->table[index][count] = key; + return true; +} + +/** + * set_has - check whether the element exist in the set or not + * @set: a pointer points to target set + * @key: the key of the inserted entry + */ +static bool set_has(set_t *set, uint32_t key) +{ + const uint32_t index = set_hash(key); + for (uint8_t count = 0; set->table[index][count]; count++) { + if (set->table[index][count] == key) + return true; + } + return false; 
+}
+/* UPDATE_PC(pc): emit code that loads the constant @pc into RAX and stores it
+ * as a 32-bit value at offsetof(struct riscv_internal, PC) from the base
+ * register parameter_reg[0]. It relies on a variable named `state` being in
+ * scope at the point of use.
+ * NOTE(review): the expansion is two bare statements ending in ';', not a
+ * do { } while (0) block, so it must not form the body of an unbraced if/else.
+ */
+#define UPDATE_PC(pc)                             \
+    emit_load_imm(state, RAX, (pc));              \
+    emit_store(state, S32, RAX, parameter_reg[0], \
+               offsetof(struct riscv_internal, PC));
+/**
+ * prepare_translate - emit the entry prologue and the exit epilogue
+ * @state: JIT state; the code is appended at state->offset
+ *
+ * The prologue saves registers, reserves stack space and jumps to the address
+ * held in the second argument register. The epilogue begins at the offset
+ * recorded in state->exit_loc; it undoes the prologue and returns.
+ */
+static void prepare_translate(struct jit_state *state)
+{
+#if defined(__x86_64__)
+    /* Save platform non-volatile registers */
+    for (uint32_t i = 0; i < ARRAYS_SIZE(nonvolatile_reg); i++)
+        emit_push(state, nonvolatile_reg[i]);
+
+    /* Assuming that the stack is 16-byte aligned just before the call
+     * instruction that brought us to this code, we need to restore 16-byte
+     * alignment upon starting execution of the JIT'd code. STACK_SIZE is
+     * guaranteed to be divisible by 16. However, if an even number of
+     * registers were pushed onto the stack during state saving (see above),
+     * an additional 8 bytes must be added to regain 16-byte alignment.
+     */
+    if (!(ARRAYS_SIZE(nonvolatile_reg) % 2))
+        emit_alu64_imm32(state, 0x81, 5, RSP, 0x8); /* sub rsp, 8 */
+
+    /* Set JIT R10 (the way to access the frame in JIT) to match RSP. */
+    emit_mov(state, RSP, map_register(VM_REG_10));
+
+    /* Allocate stack space */
+    emit_alu64_imm32(state, 0x81, 5, RSP, STACK_SIZE);
+
+#if defined(_WIN32)
+    /* Windows x64 ABI requires home register space. */
+    /* Allocate home register space - 4 registers */
+    emit_alu64_imm32(state, 0x81, 5, RSP, 4 * sizeof(uint64_t));
+#endif
+
+    /* Jump to the entry point, which is stored in the second parameter.
+     * The two bytes ff e6 encode `jmp rsi`.
+     * NOTE(review): RSI is the second argument register in the System V ABI
+     * only - confirm what the _WIN32 build is expected to do here.
+     */
+    emit1(state, 0xff);
+    emit1(state, 0xe6);
+
+    /* Epilogue: emit_exit() jumps to the offset recorded here */
+    state->exit_loc = state->offset;
+
+    /* Move register 0 into rax */
+    if (map_register(VM_REG_0) != RAX)
+        emit_mov(state, map_register(VM_REG_0), RAX);
+
+    /* Deallocate stack space by restoring RSP from JIT R10. 
*/
+    emit_mov(state, map_register(VM_REG_10), RSP);
+
+    if (!(ARRAYS_SIZE(nonvolatile_reg) % 2))
+        emit_alu64_imm32(state, 0x81, 0, RSP, 0x8); /* add rsp, 8 */
+
+    /* Restore platform non-volatile registers, in the reverse order of the
+     * pushes emitted by the prologue
+     */
+    for (uint32_t i = 0; i < ARRAYS_SIZE(nonvolatile_reg); i++)
+        emit_pop(state, nonvolatile_reg[ARRAYS_SIZE(nonvolatile_reg) - i - 1]);
+
+    /* Return */
+    emit1(state, 0xc3);
+#elif defined(__aarch64__)
+    uint32_t register_space = ARRAYS_SIZE(callee_reg) * 8 + 2 * 8;
+    state->stack_size = align_to(STACK_SIZE + register_space, 16);
+    emit_addsub_immediate(state, true, AS_SUB, SP, SP, state->stack_size);
+
+    /* Set up frame: R29 and R30 go into the first two 8-byte slots at SP,
+     * which is why register_space reserves 2 * 8 extra bytes.
+     */
+    emit_loadstorepair_immediate(state, LSP_STPX, R29, R30, SP, 0);
+    /* In ARM64 calling convention, R29 is the frame pointer. */
+    emit_addsub_immediate(state, true, AS_ADD, R29, SP, 0);
+
+    /* Save callee saved registers in pairs, after the R29/R30 slots (hence
+     * the (i + 2) * 8 offset).
+     * NOTE(review): the loop reads callee_reg[i + 1], so it assumes
+     * ARRAYS_SIZE(callee_reg) is even - confirm against its definition.
+     */
+    for (size_t i = 0; i < ARRAYS_SIZE(callee_reg); i += 2) {
+        emit_loadstorepair_immediate(state, LSP_STPX, callee_reg[i],
+                                     callee_reg[i + 1], SP, (i + 2) * 8);
+    }
+    /* Branch to the address held in R1; presumably the entry point passed as
+     * the second argument, mirroring the x86-64 path - confirm.
+     */
+    emit_unconditionalbranch_register(state, BR_BR, R1);
+    /* Epilogue: emit_exit() jumps to the offset recorded here */
+    state->exit_loc = state->offset;
+
+    /* Move register 0 into R0 */
+    if (map_register(0) != R0) {
+        emit_logical_register(state, true, LOG_ORR, R0, RZ, map_register(0));
+    }
+
+    /* Restore callee-saved registers). 
*/
+    for (size_t i = 0; i < ARRAYS_SIZE(callee_reg); i += 2) {
+        emit_loadstorepair_immediate(state, LSP_LDPX, callee_reg[i],
+                                     callee_reg[i + 1], SP, (i + 2) * 8);
+    }
+    emit_loadstorepair_immediate(state, LSP_LDPX, R29, R30, SP, 0);
+    emit_addsub_immediate(state, true, AS_ADD, SP, SP, state->stack_size);
+    emit_unconditionalbranch_register(state, BR_RET, R30);
+#endif
+}
+/* X64(inst, code): wrap one code-generation snippet from the included
+ * template in a function named do_<inst>, taking the same three parameters
+ * as the hand-written do_fuse* generators below. All three are marked UNUSED
+ * because a given snippet need not reference every one of them.
+ */
+#define X64(inst, code)                                                       \
+    static void do_##inst(struct jit_state *state UNUSED, riscv_t *rv UNUSED, \
+                          rv_insn_t *ir UNUSED)                               \
+    {                                                                         \
+        code;                                                                 \
+    }
+#include "rv32_jit_template.c"
+#undef X64
+/**
+ * do_fuse1 - emit code for a fused run of constant loads
+ * @state: JIT state receiving the code
+ * @rv: unused
+ * @ir: fused instruction; ir->fuse holds ir->imm2 entries
+ *
+ * For every entry the emitted code stores the constant fuse[i].imm into
+ * register slot X[fuse[i].rd] of the structure addressed by parameter_reg[0].
+ */
+static void do_fuse1(struct jit_state *state, riscv_t *rv UNUSED, rv_insn_t *ir)
+{
+    opcode_fuse_t *fuse = ir->fuse;
+    for (int i = 0; i < ir->imm2; i++) {
+        emit_load_imm(state, temp_reg[0], fuse[i].imm);
+        emit_store(state, S32, temp_reg[0], parameter_reg[0],
+                   offsetof(struct riscv_internal, X) + 4 * fuse[i].rd);
+    }
+}
+/**
+ * do_fuse2 - emit code for a fused constant load followed by an add
+ * @state: JIT state receiving the code
+ * @rv: unused
+ * @ir: fused instruction supplying imm, rd, rs1 and rs2
+ *
+ * The emitted code performs X[rd] = imm, then X[rs2] = X[rs1] + imm, using
+ * temp_reg[0] and temp_reg[1] as scratch (0x01 is the x86 `add` opcode).
+ */
+static void do_fuse2(struct jit_state *state, riscv_t *rv UNUSED, rv_insn_t *ir)
+{
+    emit_load_imm(state, temp_reg[0], ir->imm);
+    emit_store(state, S32, temp_reg[0], parameter_reg[0],
+               offsetof(struct riscv_internal, X) + 4 * ir->rd);
+    emit_load(state, S32, parameter_reg[0], temp_reg[1],
+              offsetof(struct riscv_internal, X) + 4 * ir->rs1);
+    emit_alu32(state, 0x01, temp_reg[1], temp_reg[0]);
+    emit_store(state, S32, temp_reg[0], parameter_reg[0],
+               offsetof(struct riscv_internal, X) + 4 * ir->rs2);
+}
+/**
+ * do_fuse3 - emit code for a fused run of 32-bit stores to guest memory
+ * @state: JIT state receiving the code
+ * @rv: emulator instance; its userdata supplies the guest memory base
+ * @ir: fused instruction; ir->fuse holds ir->imm2 entries
+ *
+ * For every entry the host address is X[rs1] + (mem_base + imm), formed with
+ * a 64-bit add, and the 32-bit value of X[rs2] is stored there.
+ */
+static void do_fuse3(struct jit_state *state, riscv_t *rv, rv_insn_t *ir)
+{
+    memory_t *m = ((state_t *) rv->userdata)->mem;
+    opcode_fuse_t *fuse = ir->fuse;
+    for (int i = 0; i < ir->imm2; i++) {
+        emit_load(state, S32, parameter_reg[0], temp_reg[0],
+                  offsetof(struct riscv_internal, X) + 4 * fuse[i].rs1);
+        emit_load_imm(state, temp_reg[1],
+                      (intptr_t) (m->mem_base + fuse[i].imm));
+        emit_alu64(state, 0x01, temp_reg[1], temp_reg[0]);
+        emit_load(state, S32, parameter_reg[0], temp_reg[1],
+                  offsetof(struct riscv_internal, X) + 4 * fuse[i].rs2);
+        emit_store(state, S32,
temp_reg[1], temp_reg[0], 0);
+    }
+}
+/**
+ * do_fuse4 - emit code for a fused run of 32-bit loads from guest memory
+ * @state: JIT state receiving the code
+ * @rv: emulator instance; its userdata supplies the guest memory base
+ * @ir: fused instruction; ir->fuse holds ir->imm2 entries
+ *
+ * For every entry the host address is X[rs1] + (mem_base + imm), formed with
+ * a 64-bit add; the 32-bit value read from it is stored into X[rd].
+ */
+static void do_fuse4(struct jit_state *state, riscv_t *rv, rv_insn_t *ir)
+{
+    memory_t *m = ((state_t *) rv->userdata)->mem;
+    opcode_fuse_t *fuse = ir->fuse;
+    for (int i = 0; i < ir->imm2; i++) {
+        emit_load(state, S32, parameter_reg[0], temp_reg[0],
+                  offsetof(struct riscv_internal, X) + 4 * fuse[i].rs1);
+        emit_load_imm(state, temp_reg[1],
+                      (intptr_t) (m->mem_base + fuse[i].imm));
+        emit_alu64(state, 0x01, temp_reg[1], temp_reg[0]);
+        emit_load(state, S32, temp_reg[0], temp_reg[1], 0);
+        emit_store(state, S32, temp_reg[1], parameter_reg[0],
+                   offsetof(struct riscv_internal, X) + 4 * fuse[i].rd);
+    }
+}
+/**
+ * do_fuse5 - emit code that hands a fused sequence to the memset handler
+ * @state: JIT state receiving the code
+ * @rv: emulator instance; rv->io.on_memset is the call target
+ * @ir: fused instruction; only ir->pc is read
+ *
+ * The emitted code stores ir->pc + 4 into the PC field, calls the handler and
+ * then jumps to the epilogue.
+ * NOTE(review): @rv is annotated UNUSED although it is read here.
+ */
+static void do_fuse5(struct jit_state *state, riscv_t *rv UNUSED, rv_insn_t *ir)
+{
+    emit_load_imm(state, temp_reg[0], ir->pc + 4);
+    emit_store(state, S32, temp_reg[0], parameter_reg[0],
+               offsetof(struct riscv_internal, PC));
+    emit_call(state, (intptr_t) rv->io.on_memset);
+    emit_exit(&(*state));
+}
+/**
+ * do_fuse6 - emit code that hands a fused sequence to the memcpy handler
+ * @state: JIT state receiving the code
+ * @rv: emulator instance; rv->io.on_memcpy is the call target
+ * @ir: fused instruction; only ir->pc is read
+ *
+ * Same shape as do_fuse5(): store ir->pc + 4 into the PC field, call the
+ * handler, jump to the epilogue.
+ * NOTE(review): @rv is annotated UNUSED although it is read here.
+ */
+static void do_fuse6(struct jit_state *state, riscv_t *rv UNUSED, rv_insn_t *ir)
+{
+    emit_load_imm(state, temp_reg[0], ir->pc + 4);
+    emit_store(state, S32, temp_reg[0], parameter_reg[0],
+               offsetof(struct riscv_internal, PC));
+    emit_call(state, (intptr_t) rv->io.on_memcpy);
+    emit_exit(&(*state));
+}
+/**
+ * do_fuse7 - emit code for a fused run of shift-by-immediate instructions
+ * @state: JIT state receiving the code
+ * @rv: unused
+ * @ir: fused instruction; ir->fuse holds ir->imm2 entries
+ *
+ * Every entry loads X[rs1], shifts it by the low five bits of imm and stores
+ * the result into X[rd]. With opcode 0xc1 the third argument of
+ * emit_alu32_imm8() picks the shift in the x86 encoding: 4 is shl, 5 is shr
+ * and 7 is sar.
+ */
+static void do_fuse7(struct jit_state *state, riscv_t *rv UNUSED, rv_insn_t *ir)
+{
+    opcode_fuse_t *fuse = ir->fuse;
+    for (int i = 0; i < ir->imm2; i++) {
+        switch (fuse[i].opcode) {
+        case rv_insn_slli:
+            emit_load(state, S32, parameter_reg[0], temp_reg[0],
+                      offsetof(struct riscv_internal, X) + 4 * fuse[i].rs1);
+            emit_alu32_imm8(state, 0xc1, 4, temp_reg[0], fuse[i].imm & 0x1f);
+            emit_store(state, S32, temp_reg[0], parameter_reg[0],
+                       offsetof(struct riscv_internal, X) + 4 * fuse[i].rd);
+            break;
+        case rv_insn_srli:
+            emit_load(state, S32, parameter_reg[0], temp_reg[0],
+                      offsetof(struct riscv_internal, X) + 4 * fuse[i].rs1);
+            emit_alu32_imm8(state, 0xc1, 5, temp_reg[0], fuse[i].imm & 0x1f);
+
emit_store(state, S32, temp_reg[0], parameter_reg[0], + offsetof(struct riscv_internal, X) + 4 * fuse[i].rd); + break; + case rv_insn_srai: + emit_load(state, S32, parameter_reg[0], temp_reg[0], + offsetof(struct riscv_internal, X) + 4 * fuse[i].rs1); + emit_alu32_imm8(state, 0xc1, 7, temp_reg[0], fuse[i].imm & 0x1f); + emit_store(state, S32, temp_reg[0], parameter_reg[0], + offsetof(struct riscv_internal, X) + 4 * fuse[i].rd); + break; + default: + __UNREACHABLE; + break; + } + } +} + +/* clang-format off */ +static const void *dispatch_table[] = { + /* RV32 instructions */ +#define _(inst, can_branch, insn_len, translatable, reg_mask) [rv_insn_##inst] = do_##inst, + RV_INSN_LIST +#undef _ + /* Macro operation fusion instructions */ +#define _(inst) [rv_insn_##inst] = do_##inst, + FUSE_INSN_LIST +#undef _ +}; +/* clang-format on */ + +typedef void (*codegen_block_func_t)(struct jit_state *, + riscv_t *, + rv_insn_t *); + +static void translate(struct jit_state *state, riscv_t *rv, block_t *block) +{ + uint32_t idx; + rv_insn_t *ir, *next; + for (idx = 0, ir = block->ir_head; idx < block->n_insn; idx++, ir = next) { + next = ir->next; + ((codegen_block_func_t) dispatch_table[ir->opcode])(state, rv, ir); + } +} + +static void resolve_jumps(struct jit_state *state) +{ + for (int i = 0; i < state->num_jumps; i++) { + struct jump jump = state->jumps[i]; + int target_loc; + if (jump.target_offset != 0) + target_loc = jump.target_offset; + else if (jump.target_pc == TARGET_PC_EXIT) + target_loc = state->exit_loc; +#if defined(__x86_64__) + else if (jump.target_pc == TARGET_PC_RETPOLINE) + target_loc = state->retpoline_loc; +#elif defined(__aarch64__) + else if (jump.target_pc == TARGET_PC_ENTER) + target_loc = state->entry_loc; +#endif + else { + target_loc = jump.offset_loc + sizeof(uint32_t); + for (int i = 0; i < state->num_insn; i++) { + if (jump.target_pc == state->offset_map[i].PC) { + target_loc = state->offset_map[i].offset; + break; + } + } + } +#if 
defined(__x86_64__) + /* Assumes jump offset is at end of instruction */ + uint32_t rel = target_loc - (jump.offset_loc + sizeof(uint32_t)); + + uint8_t *offset_ptr = &state->buf[jump.offset_loc]; + memcpy(offset_ptr, &rel, sizeof(uint32_t)); +#elif defined(__aarch64__) + int32_t rel = target_loc - jump.offset_loc; + update_branch_immediate(state, jump.offset_loc, rel); +#endif + } +} + +static void translate_chained_block(struct jit_state *state, + riscv_t *rv, + block_t *block, + set_t *set) +{ + if (set_has(set, block->pc_start)) + return; + + set_add(set, block->pc_start); + offset_map_insert(state, block->pc_start); + translate(state, rv, block); + rv_insn_t *ir = block->ir_tail; + if (ir->branch_untaken && !set_has(set, ir->pc + 4)) { + block_t *block1 = cache_get(rv->block_cache, ir->pc + 4); + if (block1 && block1->translatable) + translate_chained_block(state, rv, block1, set); + } + if (ir->branch_taken && !set_has(set, ir->pc + ir->imm)) { + block_t *block1 = cache_get(rv->block_cache, ir->pc + ir->imm); + if (block1 && block1->translatable) + translate_chained_block(state, rv, block1, set); + } +} + +uint32_t translate_x64(riscv_t *rv, block_t *block) +{ + struct jit_state *state = rv->jit_state; + memset(state->offset_map, 0, MAX_INSNS * sizeof(struct offset_map)); + memset(state->jumps, 0, MAX_INSNS * sizeof(struct jump)); + state->num_insn = 0; + state->num_jumps = 0; + uint32_t entry_loc = state->offset; + set_t set; + set_reset(&set); + translate_chained_block(&(*state), rv, block, &set); + + if (state->offset == state->size) { + printf("Target buffer too small\n"); + goto out; + } + resolve_jumps(&(*state)); +out: + return entry_loc; +} + +struct jit_state *init_state(size_t size) +{ + struct jit_state *state = malloc(sizeof(struct jit_state)); + state->offset = 0; + state->size = size; + state->buf = mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, + MAP_PRIVATE | MAP_ANONYMOUS +#if defined(__APPLE__) + | MAP_JIT +#endif + , + -1, 0); + 
assert(state->buf != MAP_FAILED); /* NOTE: compiled out when NDEBUG is set */
+    prepare_translate(state); /* emit the shared prologue and epilogue */
+    state->offset_map = calloc(MAX_INSNS, sizeof(struct offset_map));
+    state->jumps = calloc(MAX_INSNS, sizeof(struct jump));
+    return state;
+}
+/**
+ * destroy_state - release everything owned by a JIT state
+ * @state: state returned by init_state(); it is dereferenced without a NULL
+ *         check
+ *
+ * Unmaps the code buffer, then frees the offset map, the jump list and the
+ * state structure itself.
+ */
+void destroy_state(struct jit_state *state)
+{
+    munmap(state->buf, state->size);
+    free(state->offset_map);
+    free(state->jumps);
+    free(state);
+}
diff --git a/src/jit.h b/src/jit.h
new file mode 100644
index 000000000..efa787a50
--- /dev/null
+++ b/src/jit.h
@@ -0,0 +1,43 @@
+/*
+ * rv32emu is freely redistributable under the MIT License. See the file
+ * "LICENSE" for information on usage and redistribution of this file.
+ */
+
+#pragma once
+
+#include
+#include
+#include
+#include
+#include
+
+#include "riscv_private.h"
+/* A jump whose displacement resolve_jumps() patches after translation. */
+struct jump {
+    uint32_t offset_loc;    /* buffer offset of the field to patch */
+    uint32_t target_pc;     /* guest PC or a TARGET_PC_* marker */
+    uint32_t target_offset; /* if non-zero, buffer offset to jump to */
+};
+/* Maps a guest PC to the code-buffer offset of its translation. */
+struct offset_map {
+    uint32_t PC;
+    uint32_t offset;
+};
+/* Code buffer plus the bookkeeping used while translating. */
+struct jit_state {
+    uint8_t *buf;        /* code buffer mapped by init_state() */
+    uint32_t offset;     /* current write position in buf */
+    uint32_t stack_size; /* frame size set by the aarch64 prologue */
+    uint32_t size;       /* size of buf in bytes */
+    uint32_t entry_loc;  /* target of TARGET_PC_ENTER jumps (aarch64) */
+    uint32_t exit_loc;   /* buffer offset of the epilogue */
+    uint32_t retpoline_loc; /* target of TARGET_PC_RETPOLINE jumps (x86-64) */
+    struct offset_map *offset_map; /* MAX_INSNS entries */
+    int num_insn;        /* entries of offset_map scanned by resolve_jumps() */
+    struct jump *jumps;  /* MAX_INSNS entries */
+    int num_jumps;       /* entries of jumps processed by resolve_jumps() */
+};
+/* Public interface of the JIT. */
+struct jit_state *init_state(size_t size);
+void destroy_state(struct jit_state *state);
+uint32_t translate_x64(riscv_t *rv, block_t *block);
\ No newline at end of file
diff --git a/src/jit_x64.c b/src/jit_x64.c
deleted file mode 100644
index 622abc71d..000000000
--- a/src/jit_x64.c
+++ /dev/null
@@ -1,577 +0,0 @@
-/*
- * rv32emu is freely redistributable under the MIT License. See the file
- * "LICENSE" for information on usage and redistribution of this file.
- */
-
-/* This JIT implementation has undergone extensive modifications, heavily
- * relying on the ubpf_jit_x86_64.[ch] from ubpf. The original
- * ubpf_jit_x86_64.[ch] file served as the foundation and source of inspiration
- * for adapting and tailoring it specifically for this JIT implementation.
- * Therefore, credit and sincere thanks are extended to ubpf for their - * invaluable work. - * - * Reference: - * https://github.com/iovisor/ubpf/blob/main/vm/ubpf_jit_x86_64.c - */ - -#if !defined(__x86_64__) -#error "This implementation is dedicated to x86-64." -#endif - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "cache.h" -#include "decode.h" -#include "io.h" -#include "jit_x64.h" -#include "state.h" -#include "utils.h" - -enum VM_REG { - VM_REG_0 = 0, - VM_REG_1, - VM_REG_2, - VM_REG_3, - VM_REG_4, - VM_REG_5, - VM_REG_6, - VM_REG_7, - VM_REG_8, - VM_REG_9, - VM_REG_10, - N_VM_REGS, -}; - -#define X64_CLS_MASK 0x07 -#define X64_ALU_OP_MASK 0xf0 -#define X64_CLS_ALU 0x04 -#define X64_CLS_ALU64 0x07 -#define X64_SRC_IMM 0x00 -#define X64_SRC_REG 0x08 -#define X64_OP_MUL_IMM (X64_CLS_ALU | X64_SRC_IMM | 0x20) -#define X64_OP_MUL_REG (X64_CLS_ALU | X64_SRC_REG | 0x20) -#define X64_OP_DIV_IMM (X64_CLS_ALU | X64_SRC_IMM | 0x30) -#define X64_OP_DIV_REG (X64_CLS_ALU | X64_SRC_REG | 0x30) -#define X64_OP_MOD_IMM (X64_CLS_ALU | X64_SRC_IMM | 0x90) -#define X64_OP_MOD_REG (X64_CLS_ALU | X64_SRC_REG | 0x90) - -#define STACK_SIZE 512 -#define MAX_INSNS 1024 - -#if RV32_HAS(EXT_M) -static void muldivmod(struct jit_state *state, - uint8_t opcode, - int src, - int dst, - int32_t imm) -{ - bool mul = (opcode & X64_ALU_OP_MASK) == (X64_OP_MUL_IMM & X64_ALU_OP_MASK); - bool div = (opcode & X64_ALU_OP_MASK) == (X64_OP_DIV_IMM & X64_ALU_OP_MASK); - bool mod = (opcode & X64_ALU_OP_MASK) == (X64_OP_MOD_IMM & X64_ALU_OP_MASK); - bool is64 = (opcode & X64_CLS_MASK) == X64_CLS_ALU64; - bool reg = (opcode & X64_SRC_REG) == X64_SRC_REG; - - /* Short circuit for imm == 0 */ - if (!reg && imm == 0) { - assert(NULL); - if (div || mul) { - /* For division and multiplication, set result to zero. */ - emit_alu32(state, 0x31, dst, dst); - } else { - /* For modulo, set result to dividend. 
*/ - emit_mov(state, dst, dst); - } - return; - } - - if (dst != RAX) - emit_push(state, RAX); - - if (dst != RDX) - emit_push(state, RDX); - - /* Load the divisor into RCX */ - if (imm) - emit_load_imm(state, RCX, imm); - else - emit_mov(state, src, RCX); - - /* Load the dividend into RAX */ - emit_mov(state, dst, RAX); - - /* The JIT employs two different semantics for division and modulus - * operations. In the case of division, if the divisor is zero, the result - * is set to zero. For modulus operations, if the divisor is zero, the - * result becomes the dividend. To manage this, we first set the divisor to - * 1 if it is initially zero. Then, we adjust the result accordingly: for - * division, we set it to zero if the original divisor was zero; for - * modulus, we set it to the dividend under the same condition. - */ - - if (div || mod) { - /* Check if divisor is zero */ - if (is64) - emit_alu64(state, 0x85, RCX, RCX); - else - emit_alu32(state, 0x85, RCX, RCX); - - /* Save the dividend for the modulo case */ - if (mod) - emit_push(state, RAX); /* Save dividend */ - - /* Save the result of the test */ - emit1(state, 0x9c); /* pushfq */ - - /* Set the divisor to 1 if it is zero */ - emit_load_imm(state, RDX, 1); - emit1(state, 0x48); - emit1(state, 0x0f); - emit1(state, 0x44); - emit1(state, 0xca); /* cmove rcx, rdx */ - - /* xor %edx,%edx */ - emit_alu32(state, 0x31, RDX, RDX); - } - - if (is64) - emit_rex(state, 1, 0, 0, 0); - - /* Multiply or divide */ - emit_alu32(state, 0xf7, mul ? 4 : 6, RCX); - - /* The division operation stores the remainder in RDX and the quotient - * in RAX. - */ - if (div || mod) { - /* Restore the result of the test */ - emit1(state, 0x9d); /* popfq */ - - /* If zero flag is set, then the divisor was zero. */ - - if (div) { - /* Set the dividend to zero if the divisor was zero. */ - emit_load_imm(state, RCX, 0); - - /* Store 0 in RAX if the divisor was zero. */ - /* Use conditional move to avoid a branch. 
*/ - emit1(state, 0x48); - emit1(state, 0x0f); - emit1(state, 0x44); - emit1(state, 0xc1); /* cmove rax, rcx */ - } else { - /* Restore dividend to RCX */ - emit_pop(state, RCX); - - /* Store the dividend in RAX if the divisor was zero. */ - /* Use conditional move to avoid a branch. */ - emit1(state, 0x48); - emit1(state, 0x0f); - emit1(state, 0x44); - emit1(state, 0xd1); /* cmove rdx, rcx */ - } - } - - if (dst != RDX) { - if (mod) - emit_mov(state, RDX, dst); - emit_pop(state, RDX); - } - if (dst != RAX) { - if (div || mul) - emit_mov(state, RAX, dst); - emit_pop(state, RAX); - } -} -#endif - -#define REGISTER_MAP_SIZE 11 - -/* There are two common x86-64 calling conventions, discussed at: - * https://en.wikipedia.org/wiki/X64_calling_conventions#x86-64_calling_conventions - * - * Please note: R12 is an exception and is *not* being used. Consequently, it - * is omitted from the list of non-volatile registers for both platforms, - * despite being non-volatile. - */ -#if defined(_WIN32) -static int nonvolatile_reg[] = {RBP, RBX, RDI, RSI, R13, R14, R15}; -static int parameter_reg[] = {RCX, RDX, R8, R9}; -#define RCX_ALT R10 -static int register_map[REGISTER_MAP_SIZE] = { - RAX, R10, RDX, R8, R9, R14, R15, RDI, RSI, RBX, RBP, -}; -#else -#define RCX_ALT R9 -static const int nonvolatile_reg[] = {RBP, RBX, R13, R14, R15}; -static const int parameter_reg[] = {RDI, RSI, RDX, RCX, R8, R9}; -static const int register_map[REGISTER_MAP_SIZE] = { - RAX, RDI, RSI, RDX, R9, R8, RBX, R13, R14, R15, RBP, -}; -#endif - -/* Return the x86 register for the given JIT register */ -static int map_register(int r) -{ - assert(r < N_VM_REGS); - return register_map[r % N_VM_REGS]; -} - -#define SET_SIZE_BITS 10 -#define SET_SIZE (1 << SET_SIZE_BITS) -#define SET_SLOTS_SIZE 32 -HASH_FUNC_IMPL(set_hash, SET_SIZE_BITS, 1 << SET_SIZE_BITS); - -/* The set consists of SET_SIZE buckets, with each bucket containing - * SET_SLOTS_SIZE slots. 
- */ -typedef struct { - uint32_t table[SET_SIZE][SET_SLOTS_SIZE]; -} set_t; - -/** - * set_reset - clear a set - * @set: a pointer points to target set - */ -static inline void set_reset(set_t *set) -{ - memset(set, 0, sizeof(set_t)); -} - -/** - * set_add - insert a new element into the set - * @set: a pointer points to target set - * @key: the key of the inserted entry - */ -static bool set_add(set_t *set, uint32_t key) -{ - const uint32_t index = set_hash(key); - uint8_t count = 0; - while (set->table[index][count]) { - if (set->table[index][count++] == key) - return false; - } - - set->table[index][count] = key; - return true; -} - -/** - * set_has - check whether the element exist in the set or not - * @set: a pointer points to target set - * @key: the key of the inserted entry - */ -static bool set_has(set_t *set, uint32_t key) -{ - const uint32_t index = set_hash(key); - for (uint8_t count = 0; set->table[index][count]; count++) { - if (set->table[index][count] == key) - return true; - } - return false; -} - -#define UPDATE_PC(pc) \ - emit_load_imm(state, RAX, (pc)); \ - emit_store(state, S32, RAX, parameter_reg[0], \ - offsetof(struct riscv_internal, PC)); - -static void prepare_translate(struct jit_state *state) -{ - /* Save platform non-volatile registers */ - for (uint32_t i = 0; i < ARRAYS_SIZE(nonvolatile_reg); i++) - emit_push(state, nonvolatile_reg[i]); - - /* Assuming that the stack is 16-byte aligned just before the call - * instruction that brought us to this code, we need to restore 16-byte - * alignment upon starting execution of the JIT'd code. STACK_SIZE is - * guaranteed to be divisible by 16. However, if an even number of - * registers were pushed onto the stack during state saving (see above), - * an additional 8 bytes must be added to regain 16-byte alignment. - */ - if (!(ARRAYS_SIZE(nonvolatile_reg) % 2)) - emit_alu64_imm32(state, 0x81, 5, RSP, 0x8); - - /* Set JIT R10 (the way to access the frame in JIT) to match RSP. 
*/ - emit_mov(state, RSP, map_register(VM_REG_10)); - - /* Allocate stack space */ - emit_alu64_imm32(state, 0x81, 5, RSP, STACK_SIZE); - -#if defined(_WIN32) - /* Windows x64 ABI requires home register space. */ - /* Allocate home register space - 4 registers */ - emit_alu64_imm32(state, 0x81, 5, RSP, 4 * sizeof(uint64_t)); -#endif - - /* Jump to the entry point, which is stored in the second parameter. */ - emit1(state, 0xff); - emit1(state, 0xe6); - - /* Epilogue */ - state->exit_loc = state->offset; - - /* Move register 0 into rax */ - if (map_register(VM_REG_0) != RAX) - emit_mov(state, map_register(VM_REG_0), RAX); - - /* Deallocate stack space by restoring RSP from JIT R10. */ - emit_mov(state, map_register(VM_REG_10), RSP); - - if (!(ARRAYS_SIZE(nonvolatile_reg) % 2)) - emit_alu64_imm32(state, 0x81, 0, RSP, 0x8); - - /* Restore platform non-volatile registers */ - for (uint32_t i = 0; i < ARRAYS_SIZE(nonvolatile_reg); i++) - emit_pop(state, nonvolatile_reg[ARRAYS_SIZE(nonvolatile_reg) - i - 1]); - - /* Return */ - emit1(state, 0xc3); -} - -#define X64(inst, code) \ - static void do_##inst(struct jit_state *state UNUSED, riscv_t *rv UNUSED, \ - rv_insn_t *ir UNUSED) \ - { \ - code; \ - } -#include "rv32_jit_template.c" -#undef X64 - -static void do_fuse1(struct jit_state *state, riscv_t *rv UNUSED, rv_insn_t *ir) -{ - opcode_fuse_t *fuse = ir->fuse; - for (int i = 0; i < ir->imm2; i++) { - emit_load_imm(state, RAX, fuse[i].imm); - emit_store(state, S32, RAX, parameter_reg[0], - offsetof(struct riscv_internal, X) + 4 * fuse[i].rd); - } -} - -static void do_fuse2(struct jit_state *state, riscv_t *rv UNUSED, rv_insn_t *ir) -{ - emit_load_imm(state, RAX, ir->imm); - emit_store(state, S32, RAX, parameter_reg[0], - offsetof(struct riscv_internal, X) + 4 * ir->rd); - emit_load(state, S32, parameter_reg[0], RBX, - offsetof(struct riscv_internal, X) + 4 * ir->rs1); - emit_alu32(state, 0x01, RBX, RAX); - emit_store(state, S32, RAX, parameter_reg[0], - offsetof(struct 
riscv_internal, X) + 4 * ir->rs2); -} - -static void do_fuse3(struct jit_state *state, riscv_t *rv, rv_insn_t *ir) -{ - memory_t *m = ((state_t *) rv->userdata)->mem; - opcode_fuse_t *fuse = ir->fuse; - for (int i = 0; i < ir->imm2; i++) { - emit_load(state, S32, parameter_reg[0], RAX, - offsetof(struct riscv_internal, X) + 4 * fuse[i].rs1); - emit_load_imm(state, RBX, (intptr_t) (m->mem_base + fuse[i].imm)); - emit_alu64(state, 0x01, RBX, RAX); - emit_load(state, S32, parameter_reg[0], RBX, - offsetof(struct riscv_internal, X) + 4 * fuse[i].rs2); - emit_store(state, S32, RBX, RAX, 0); - } -} - -static void do_fuse4(struct jit_state *state, riscv_t *rv, rv_insn_t *ir) -{ - memory_t *m = ((state_t *) rv->userdata)->mem; - opcode_fuse_t *fuse = ir->fuse; - for (int i = 0; i < ir->imm2; i++) { - emit_load(state, S32, parameter_reg[0], RAX, - offsetof(struct riscv_internal, X) + 4 * fuse[i].rs1); - emit_load_imm(state, RBX, (intptr_t) (m->mem_base + fuse[i].imm)); - emit_alu64(state, 0x01, RBX, RAX); - emit_load(state, S32, RAX, RBX, 0); - emit_store(state, S32, RBX, parameter_reg[0], - offsetof(struct riscv_internal, X) + 4 * fuse[i].rd); - } -} - -static void do_fuse5(struct jit_state *state, riscv_t *rv UNUSED, rv_insn_t *ir) -{ - emit_load_imm(state, RAX, ir->pc + 4); - emit_store(state, S32, RAX, parameter_reg[0], - offsetof(struct riscv_internal, PC)); - emit_call(state, (intptr_t) rv->io.on_memset); - emit_exit(&(*state)); -} - -static void do_fuse6(struct jit_state *state, riscv_t *rv UNUSED, rv_insn_t *ir) -{ - emit_load_imm(state, RAX, ir->pc + 4); - emit_store(state, S32, RAX, parameter_reg[0], - offsetof(struct riscv_internal, PC)); - emit_call(state, (intptr_t) rv->io.on_memcpy); - emit_exit(&(*state)); -} - -static void do_fuse7(struct jit_state *state, riscv_t *rv UNUSED, rv_insn_t *ir) -{ - opcode_fuse_t *fuse = ir->fuse; - for (int i = 0; i < ir->imm2; i++) { - switch (fuse[i].opcode) { - case rv_insn_slli: - emit_load(state, S32, parameter_reg[0], 
RAX, - offsetof(struct riscv_internal, X) + 4 * fuse[i].rs1); - emit_alu32_imm8(state, 0xc1, 4, RAX, fuse[i].imm & 0x1f); - emit_store(state, S32, RAX, parameter_reg[0], - offsetof(struct riscv_internal, X) + 4 * fuse[i].rd); - break; - case rv_insn_srli: - emit_load(state, S32, parameter_reg[0], RAX, - offsetof(struct riscv_internal, X) + 4 * fuse[i].rs1); - emit_alu32_imm8(state, 0xc1, 5, RAX, fuse[i].imm & 0x1f); - emit_store(state, S32, RAX, parameter_reg[0], - offsetof(struct riscv_internal, X) + 4 * fuse[i].rd); - break; - case rv_insn_srai: - emit_load(state, S32, parameter_reg[0], RAX, - offsetof(struct riscv_internal, X) + 4 * fuse[i].rs1); - emit_alu32_imm8(state, 0xc1, 7, RAX, fuse[i].imm & 0x1f); - emit_store(state, S32, RAX, parameter_reg[0], - offsetof(struct riscv_internal, X) + 4 * fuse[i].rd); - break; - default: - __UNREACHABLE; - break; - } - } -} - -/* clang-format off */ -static const void *dispatch_table[] = { - /* RV32 instructions */ -#define _(inst, can_branch, insn_len, translatable, reg_mask) [rv_insn_##inst] = do_##inst, - RV_INSN_LIST -#undef _ - /* Macro operation fusion instructions */ -#define _(inst) [rv_insn_##inst] = do_##inst, - FUSE_INSN_LIST -#undef _ -}; -/* clang-format on */ - -typedef void (*codegen_block_func_t)(struct jit_state *, - riscv_t *, - rv_insn_t *); - -static void translate(struct jit_state *state, riscv_t *rv, block_t *block) -{ - uint32_t idx; - rv_insn_t *ir, *next; - for (idx = 0, ir = block->ir_head; idx < block->n_insn; idx++, ir = next) { - next = ir->next; - ((codegen_block_func_t) dispatch_table[ir->opcode])(state, rv, ir); - } -} - -static void resolve_jumps(struct jit_state *state) -{ - for (int i = 0; i < state->num_jumps; i++) { - struct jump jump = state->jumps[i]; - int target_loc; - if (jump.target_offset != 0) - target_loc = jump.target_offset; - else if (jump.target_pc == TARGET_PC_EXIT) - target_loc = state->exit_loc; - else if (jump.target_pc == TARGET_PC_RETPOLINE) - target_loc = 
state->retpoline_loc; - else { - target_loc = jump.offset_loc + sizeof(uint32_t); - for (int i = 0; i < state->num_insn; i++) { - if (jump.target_pc == state->offset_map[i].PC) { - target_loc = state->offset_map[i].offset; - break; - } - } - } - /* Assumes jump offset is at end of instruction */ - uint32_t rel = target_loc - (jump.offset_loc + sizeof(uint32_t)); - - uint8_t *offset_ptr = &state->buf[jump.offset_loc]; - memcpy(offset_ptr, &rel, sizeof(uint32_t)); - } -} - -static void translate_chained_block(struct jit_state *state, - riscv_t *rv, - block_t *block, - set_t *set) -{ - if (set_has(set, block->pc_start)) - return; - - set_add(set, block->pc_start); - offset_map_insert(state, block->pc_start); - translate(state, rv, block); - rv_insn_t *ir = block->ir_tail; - if (ir->branch_untaken && !set_has(set, ir->pc + 4)) { - block_t *block1 = cache_get(rv->block_cache, ir->pc + 4); - if (block1 && block1->translatable) - translate_chained_block(state, rv, block1, set); - } - if (ir->branch_taken && !set_has(set, ir->pc + ir->imm)) { - block_t *block1 = cache_get(rv->block_cache, ir->pc + ir->imm); - if (block1 && block1->translatable) - translate_chained_block(state, rv, block1, set); - } -} - -uint32_t translate_x64(riscv_t *rv, block_t *block) -{ - struct jit_state *state = rv->jit_state; - memset(state->offset_map, 0, MAX_INSNS * sizeof(struct offset_map)); - memset(state->jumps, 0, MAX_INSNS * sizeof(struct jump)); - state->num_insn = 0; - state->num_jumps = 0; - uint32_t entry_loc = state->offset; - set_t set; - set_reset(&set); - translate_chained_block(&(*state), rv, block, &set); - - if (state->offset == state->size) { - printf("Target buffer too small\n"); - goto out; - } - resolve_jumps(&(*state)); -out: - return entry_loc; -} - -struct jit_state *init_state(size_t size) -{ - struct jit_state *state = malloc(sizeof(struct jit_state)); - state->offset = 0; - state->size = size; - state->buf = mmap(0, size, PROT_READ | PROT_WRITE | PROT_EXEC, - 
MAP_PRIVATE | MAP_ANONYMOUS -#if defined(__APPLE__) - | MAP_JIT -#endif - , - -1, 0); - assert(state->buf != MAP_FAILED); - prepare_translate(state); - state->offset_map = calloc(MAX_INSNS, sizeof(struct offset_map)); - state->jumps = calloc(MAX_INSNS, sizeof(struct jump)); - return state; -} - -void destroy_state(struct jit_state *state) -{ - munmap(state->buf, state->size); - free(state->offset_map); - free(state->jumps); - free(state); -} diff --git a/src/jit_x64.h b/src/jit_x64.h deleted file mode 100644 index 3d3799e14..000000000 --- a/src/jit_x64.h +++ /dev/null @@ -1,407 +0,0 @@ -/* - * rv32emu is freely redistributable under the MIT License. See the file - * "LICENSE" for information on usage and redistribution of this file. - */ - -#pragma once - -#include -#include -#include -#include -#include - -#include "riscv_private.h" - -enum X64_REG { - RAX, - RCX, - RDX, - RBX, - RSP, - RBP, - RIP = 5, - RSI, - RDI, - R8, - R9, - R10, - R11, - R12, - R13, - R14, - R15, -}; - -enum operand_size { - S8, - S16, - S32, -}; - -struct jump { - uint32_t offset_loc; - uint32_t target_pc; - uint32_t target_offset; -}; - -/* Special values for target_pc in struct jump */ -#define TARGET_PC_EXIT -1U -#define TARGET_PC_RETPOLINE -3U - -struct offset_map { - uint32_t PC; - uint32_t offset; -}; - -struct jit_state { - uint8_t *buf; - uint32_t offset; - uint32_t size; - uint32_t exit_loc; - uint32_t retpoline_loc; - struct offset_map *offset_map; - int num_insn; - struct jump *jumps; - int num_jumps; -}; - -struct jit_state *init_state(size_t size); -void destroy_state(struct jit_state *state); -uint32_t translate_x64(riscv_t *rv, block_t *block); - -static inline void offset_map_insert(struct jit_state *state, int32_t target_pc) -{ - struct offset_map *map_entry = &state->offset_map[state->num_insn++]; - map_entry->PC = target_pc; - map_entry->offset = state->offset; -} - -static inline void emit_bytes(struct jit_state *state, void *data, uint32_t len) -{ - assert(state->offset 
<= state->size - len); - if ((state->offset + len) > state->size) { - state->offset = state->size; - return; - } - memcpy(state->buf + state->offset, data, len); - state->offset += len; -} - -static inline void emit1(struct jit_state *state, uint8_t x) -{ - emit_bytes(state, &x, sizeof(x)); -} - -static inline void emit2(struct jit_state *state, uint16_t x) -{ - emit_bytes(state, &x, sizeof(x)); -} - -static inline void emit4(struct jit_state *state, uint32_t x) -{ - emit_bytes(state, &x, sizeof(x)); -} - -static inline void emit8(struct jit_state *state, uint64_t x) -{ - emit_bytes(state, &x, sizeof(x)); -} - -static inline void emit_jump_target_address(struct jit_state *state, - int32_t target_pc) -{ - struct jump *jump = &state->jumps[state->num_jumps++]; - jump->offset_loc = state->offset; - jump->target_pc = target_pc; - emit4(state, 0); -} - -static inline void emit_jump_target_offset(struct jit_state *state, - uint32_t jump_loc, - uint32_t jump_state_offset) -{ - struct jump *jump = &state->jumps[state->num_jumps++]; - jump->offset_loc = jump_loc; - jump->target_offset = jump_state_offset; -} - -static inline void emit_modrm(struct jit_state *state, int mod, int r, int m) -{ - assert(!(mod & ~0xc0)); - emit1(state, (mod & 0xc0) | ((r & 7) << 3) | (m & 7)); -} - -static inline void emit_modrm_reg2reg(struct jit_state *state, int r, int m) -{ - emit_modrm(state, 0xc0, r, m); -} - -static inline void emit_modrm_and_displacement(struct jit_state *state, - int r, - int m, - int32_t d) -{ - if (d == 0 && (m & 7) != RBP) { - emit_modrm(state, 0x00, r, m); - } else if (d >= -128 && d <= 127) { - emit_modrm(state, 0x40, r, m); - emit1(state, d); - } else { - emit_modrm(state, 0x80, r, m); - emit4(state, d); - } -} - -static inline void emit_rex(struct jit_state *state, int w, int r, int x, int b) -{ - assert(!(w & ~1)); - assert(!(r & ~1)); - assert(!(x & ~1)); - assert(!(b & ~1)); - emit1(state, 0x40 | (w << 3) | (r << 2) | (x << 1) | b); -} - -/* Emit a REX prefix 
incorporating the top bit of both src and dst. This step is - * skipped if no bits are set. - */ -static inline void emit_basic_rex(struct jit_state *state, - int w, - int src, - int dst) -{ - if (w || (src & 8) || (dst & 8)) - emit_rex(state, w, !!(src & 8), 0, !!(dst & 8)); -} - -static inline void emit_push(struct jit_state *state, int r) -{ - emit_basic_rex(state, 0, 0, r); - emit1(state, 0x50 | (r & 7)); -} - -static inline void emit_pop(struct jit_state *state, int r) -{ - emit_basic_rex(state, 0, 0, r); - emit1(state, 0x58 | (r & 7)); -} - -/* The REX prefix and ModRM byte are emitted. - * The MR encoding is utilized when a choice is available. The 'src' is often - * used as an opcode extension. - */ -static inline void emit_alu32(struct jit_state *state, int op, int src, int dst) -{ - emit_basic_rex(state, 0, src, dst); - emit1(state, op); - emit_modrm_reg2reg(state, src, dst); -} - -/* REX prefix, ModRM byte, and 32-bit immediate */ -static inline void emit_alu32_imm32(struct jit_state *state, - int op, - int src, - int dst, - int32_t imm) -{ - emit_alu32(state, op, src, dst); - emit4(state, imm); -} - -/* REX prefix, ModRM byte, and 8-bit immediate */ -static inline void emit_alu32_imm8(struct jit_state *state, - int op, - int src, - int dst, - int8_t imm) -{ - emit_alu32(state, op, src, dst); - emit1(state, imm); -} - -/* The REX.W prefix and ModRM byte are emitted. - * The MR encoding is used when there is a choice. 'src' is often used as - * an opcode extension. 
- */ -static inline void emit_alu64(struct jit_state *state, int op, int src, int dst) -{ - emit_basic_rex(state, 1, src, dst); - emit1(state, op); - emit_modrm_reg2reg(state, src, dst); -} - -/* REX.W prefix, ModRM byte, and 32-bit immediate */ -static inline void emit_alu64_imm32(struct jit_state *state, - int op, - int src, - int dst, - int32_t imm) -{ - emit_alu64(state, op, src, dst); - emit4(state, imm); -} - -/* REX.W prefix, ModRM byte, and 8-bit immediate */ -static inline void emit_alu64_imm8(struct jit_state *state, - int op, - int src, - int dst, - int8_t imm) -{ - emit_alu64(state, op, src, dst); - emit1(state, imm); -} - -/* Register to register mov */ -static inline void emit_mov(struct jit_state *state, int src, int dst) -{ - emit_alu64(state, 0x89, src, dst); -} - -static inline void emit_cmp_imm32(struct jit_state *state, int dst, int32_t imm) -{ - emit_alu64_imm32(state, 0x81, 7, dst, imm); -} - -static inline void emit_cmp32_imm32(struct jit_state *state, - int dst, - int32_t imm) -{ - emit_alu32_imm32(state, 0x81, 7, dst, imm); -} - -static inline void emit_cmp(struct jit_state *state, int src, int dst) -{ - emit_alu64(state, 0x39, src, dst); -} - -static inline void emit_cmp32(struct jit_state *state, int src, int dst) -{ - emit_alu32(state, 0x39, src, dst); -} - -static inline void emit_jcc(struct jit_state *state, - int code, - int32_t target_pc) -{ - emit1(state, 0x0f); - emit1(state, code); - emit_jump_target_address(state, target_pc); -} - -static inline void emit_jcc_offset(struct jit_state *state, int code) -{ - emit1(state, 0x0f); - emit1(state, code); - emit4(state, 0); -} - -/* Load [src + offset] into dst */ -static inline void emit_load(struct jit_state *state, - enum operand_size size, - int src, - int dst, - int32_t offset) -{ - if (size == S8 || size == S16) { - /* movzx */ - emit1(state, 0x0f); - emit1(state, size == S8 ? 
0xb6 : 0xb7); - } else if (size == S32) { - /* mov */ - emit1(state, 0x8b); - } - - emit_modrm_and_displacement(state, dst, src, offset); -} - -static inline void emit_load_sext(struct jit_state *state, - enum operand_size size, - int src, - int dst, - int32_t offset) -{ - if (size == S8 || size == S16) { - /* movsx */ - emit1(state, 0x0f); - emit1(state, size == S8 ? 0xbe : 0xbf); - } else if (size == S32) { - emit_basic_rex(state, 1, dst, src); - emit1(state, 0x63); - } - - emit_modrm_and_displacement(state, dst, src, offset); -} - -/* Load sign-extended immediate into register */ -static inline void emit_load_imm(struct jit_state *state, int dst, int64_t imm) -{ - if (imm >= INT32_MIN && imm <= INT32_MAX) { - emit_alu64_imm32(state, 0xc7, 0, dst, imm); - } else { - /* movabs $imm, dst */ - emit_basic_rex(state, 1, 0, dst); - emit1(state, 0xb8 | (dst & 7)); - emit8(state, imm); - } -} - -/* Store register src to [dst + offset] */ -static inline void emit_store(struct jit_state *state, - enum operand_size size, - int src, - int dst, - int32_t offset) -{ - if (size == S16) - emit1(state, 0x66); /* 16-bit override */ - emit1(state, size == S8 ? 0x88 : 0x89); - emit_modrm_and_displacement(state, src, dst, offset); -} - -/* Store immediate to [dst + offset] */ -static inline void emit_store_imm32(struct jit_state *state, - enum operand_size size, - int dst, - int32_t offset, - int32_t imm) -{ - if (size == S16) - emit1(state, 0x66); /* 16-bit override */ - emit1(state, size == S8 ? 
0xc6 : 0xc7); - emit_modrm_and_displacement(state, 0, dst, offset); - if (size == S32) { - emit4(state, imm); - } else if (size == S16) { - emit2(state, imm); - } else if (size == S8) { - emit1(state, imm); - } -} - -static inline void emit_ret(struct jit_state *state) -{ - emit1(state, 0xc3); -} - -static inline void emit_jmp(struct jit_state *state, uint32_t target_pc) -{ - emit1(state, 0xe9); - emit_jump_target_address(state, target_pc); -} - -static inline void emit_call(struct jit_state *state, intptr_t target) -{ - emit_load_imm(state, RAX, (intptr_t) target); - /* callq *%rax */ - emit1(state, 0xff); - /* ModR/M byte: b11010000b = xd0, rax is register 0 */ - emit1(state, 0xd0); -} - -static inline void emit_exit(struct jit_state *state) -{ - emit1(state, 0xe9); - emit_jump_target_offset(state, state->offset, state->exit_loc); - emit4(state, 0); -} diff --git a/src/riscv.c b/src/riscv.c index afec53507..28e20b6ba 100644 --- a/src/riscv.c +++ b/src/riscv.c @@ -13,7 +13,7 @@ #include "utils.h" #if RV32_HAS(JIT) #include "cache.h" -#include "jit_x64.h" +#include "jit.h" #define CODE_CACHE_SIZE (1024 * 1024) #endif diff --git a/src/rv32_template.c b/src/rv32_template.c index 46c0ff40a..35ce2ebb1 100644 --- a/src/rv32_template.c +++ b/src/rv32_template.c @@ -15,18 +15,18 @@ * addi, * { rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) + ir->imm; }, * x64({ - * ld, S32, RAX, X, rs1; - * alu32_imm, 32, 0x81, 0, RAX, imm; - * st, S32, RAX, X, rd; + * ld, S32, TEMP0, X, rs1; + * alu32_imm, 32, 0x81, 0, TEMP0, imm; + * st, S32, TEMP0, X, rd; * }) * * The block defined as 'X64' is mapped to the generic C code used in the * interpreter. The following instructions will be generated by JIT compiler: - * - Load X->rs1 (target field) from the rv data structure to RAX (destination + * - Load X->rs1 (target field) from the rv data structure to TEMP0 (destination * register). * - Do ALU operation on 0 (source register) and imm and store the result into - * RAX (destination register). 
- * - Store RAX (source register) value to the X->rd (target field) of the rv + * TEMP0 (destination register). + * - Store TEMP0 (source register) value to the X->rd (target field) of the rv * data structure. * * The parameter of x64 instruction API @@ -92,8 +92,8 @@ RVOP( lui, { rv->X[ir->rd] = ir->imm; }, X64({ - ld_imm, RAX, imm; - st, S32, RAX, X, rd; + ld_imm, TEMP0, imm; + st, S32, TEMP0, X, rd; })) /* AUIPC is used to build pc-relative addresses and uses the U-type format. @@ -105,8 +105,8 @@ RVOP( auipc, { rv->X[ir->rd] = ir->imm + PC; }, X64({ - ld_imm, RAX, pc, imm; - st, S32, RAX, X, rd; + ld_imm, TEMP0, pc, imm; + st, S32, TEMP0, X, rd; })) /* JAL: Jump and Link @@ -145,11 +145,11 @@ RVOP( }, X64({ cond, rd; - ld_imm, RAX, pc, 4; - st, S32, RAX, X, rd; + ld_imm, TEMP0, pc, 4; + st, S32, TEMP0, X, rd; end; - ld_imm, RAX, pc, imm; - st, S32, RAX, PC; + ld_imm, TEMP0, pc, imm; + st, S32, TEMP0, PC; jmp, pc, imm; exit; })) @@ -204,13 +204,13 @@ RVOP( }, X64({ cond, rd; - ld_imm, RAX, pc, 4; - st, S32, RAX, X, rd; + ld_imm, TEMP0, pc, 4; + st, S32, TEMP0, X, rd; end; - ld, S32, RAX, X, rs1; - alu32_imm, 32, 0x81, 0, RAX, imm; - alu32_imm, 32, 0x81, 4, RAX, ~1U; - st, S32, RAX, PC; + ld, S32, TEMP0, X, rs1; + alu32_imm, 32, 0x81, 0, TEMP0, imm; + alu32_imm, 32, 0x81, 4, TEMP0, ~1U; + st, S32, TEMP0, PC; exit; })) @@ -287,23 +287,23 @@ RVOP( beq, { BRANCH_FUNC(uint32_t, !=); }, X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - cmp, RBX, RAX; + ld, S32, TEMP0, X, rs1; + ld, S32, TEMP1, X, rs2; + cmp, TEMP1, TEMP0; set_jmp_off; jcc, 0x84; cond, branch_untaken; jmp, pc, 4; end; - ld_imm, RAX, pc, 4; - st, S32, RAX, PC; + ld_imm, TEMP0, pc, 4; + st, S32, TEMP0, PC; exit; jmp_off; cond, branch_taken; jmp, pc, imm; end; - ld_imm, RAX, pc, imm; - st, S32, RAX, PC; + ld_imm, TEMP0, pc, imm; + st, S32, TEMP0, PC; exit; })) @@ -312,23 +312,23 @@ RVOP( bne, { BRANCH_FUNC(uint32_t, ==); }, X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - cmp, RBX, RAX; + ld, 
S32, TEMP0, X, rs1; + ld, S32, TEMP1, X, rs2; + cmp, TEMP1, TEMP0; set_jmp_off; jcc, 0x85; cond, branch_untaken; jmp, pc, 4; end; - ld_imm, RAX, pc, 4; - st, S32, RAX, PC; + ld_imm, TEMP0, pc, 4; + st, S32, TEMP0, PC; exit; jmp_off; cond, branch_taken; jmp, pc, imm; end; - ld_imm, RAX, pc, imm; - st, S32, RAX, PC; + ld_imm, TEMP0, pc, imm; + st, S32, TEMP0, PC; exit; })) @@ -337,23 +337,23 @@ RVOP( blt, { BRANCH_FUNC(int32_t, >=); }, X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - cmp, RBX, RAX; + ld, S32, TEMP0, X, rs1; + ld, S32, TEMP1, X, rs2; + cmp, TEMP1, TEMP0; set_jmp_off; jcc, 0x8c; cond, branch_untaken; jmp, pc, 4; end; - ld_imm, RAX, pc, 4; - st, S32, RAX, PC; + ld_imm, TEMP0, pc, 4; + st, S32, TEMP0, PC; exit; jmp_off; cond, branch_taken; jmp, pc, imm; end; - ld_imm, RAX, pc, imm; - st, S32, RAX, PC; + ld_imm, TEMP0, pc, imm; + st, S32, TEMP0, PC; exit; })) @@ -362,23 +362,23 @@ RVOP( bge, { BRANCH_FUNC(int32_t, <); }, X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - cmp, RBX, RAX; + ld, S32, TEMP0, X, rs1; + ld, S32, TEMP1, X, rs2; + cmp, TEMP1, TEMP0; set_jmp_off; jcc, 0x8d; cond, branch_untaken; jmp, pc, 4; end; - ld_imm, RAX, pc, 4; - st, S32, RAX, PC; + ld_imm, TEMP0, pc, 4; + st, S32, TEMP0, PC; exit; jmp_off; cond, branch_taken; jmp, pc, imm; end; - ld_imm, RAX, pc, imm; - st, S32, RAX, PC; + ld_imm, TEMP0, pc, imm; + st, S32, TEMP0, PC; exit; })) @@ -387,23 +387,23 @@ RVOP( bltu, { BRANCH_FUNC(uint32_t, >=); }, X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - cmp, RBX, RAX; + ld, S32, TEMP0, X, rs1; + ld, S32, TEMP1, X, rs2; + cmp, TEMP1, TEMP0; set_jmp_off; jcc, 0x82; cond, branch_untaken; jmp, pc, 4; end; - ld_imm, RAX, pc, 4; - st, S32, RAX, PC; + ld_imm, TEMP0, pc, 4; + st, S32, TEMP0, PC; exit; jmp_off; cond, branch_taken; jmp, pc, imm; end; - ld_imm, RAX, pc, imm; - st, S32, RAX, PC; + ld_imm, TEMP0, pc, imm; + st, S32, TEMP0, PC; exit; })) @@ -412,23 +412,23 @@ RVOP( bgeu, { BRANCH_FUNC(uint32_t, <); }, X64({ - ld, 
S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - cmp, RBX, RAX; + ld, S32, TEMP0, X, rs1; + ld, S32, TEMP1, X, rs2; + cmp, TEMP1, TEMP0; set_jmp_off; jcc, 0x83; cond, branch_untaken; jmp, pc, 4; end; - ld_imm, RAX, pc, 4; - st, S32, RAX, PC; + ld_imm, TEMP0, pc, 4; + st, S32, TEMP0, PC; exit; jmp_off; cond, branch_taken; jmp, pc, imm; end; - ld_imm, RAX, pc, imm; - st, S32, RAX, PC; + ld_imm, TEMP0, pc, imm; + st, S32, TEMP0, PC; exit; })) @@ -448,11 +448,11 @@ RVOP( }, X64({ mem; - ld, S32, RAX, X, rs1; - ld_imm, RBX, mem; - alu64, 0x01, RBX, RAX; - ld_sext, S8, RAX, RBX, 0; - st, S32, RBX, X, rd; + ld, S32, TEMP0, X, rs1; + ld_imm, TEMP1, mem; + alu64, 0x01, TEMP1, TEMP0; + ld_sext, S8, TEMP0, TEMP1, 0; + st, S32, TEMP1, X, rd; })) /* LH: Load Halfword */ @@ -465,11 +465,11 @@ RVOP( }, X64({ mem; - ld, S32, RAX, X, rs1; - ld_imm, RBX, mem; - alu64, 0x01, RBX, RAX; - ld_sext, S16, RAX, RBX, 0; - st, S32, RBX, X, rd; + ld, S32, TEMP0, X, rs1; + ld_imm, TEMP1, mem; + alu64, 0x01, TEMP1, TEMP0; + ld_sext, S16, TEMP0, TEMP1, 0; + st, S32, TEMP1, X, rd; })) /* LW: Load Word */ @@ -482,11 +482,11 @@ RVOP( }, X64({ mem; - ld, S32, RAX, X, rs1; - ld_imm, RBX, mem; - alu64, 0x01, RBX, RAX; - ld, S32, RAX, RBX, 0; - st, S32, RBX, X, rd; + ld, S32, TEMP0, X, rs1; + ld_imm, TEMP1, mem; + alu64, 0x01, TEMP1, TEMP0; + ld, S32, TEMP0, TEMP1, 0; + st, S32, TEMP1, X, rd; })) /* LBU: Load Byte Unsigned */ @@ -495,11 +495,11 @@ RVOP( { rv->X[ir->rd] = rv->io.mem_read_b(rv->X[ir->rs1] + ir->imm); }, X64({ mem; - ld, S32, RAX, X, rs1; - ld_imm, RBX, mem; - alu64, 0x01, RBX, RAX; - ld, S8, RAX, RBX, 0; - st, S32, RBX, X, rd; + ld, S32, TEMP0, X, rs1; + ld_imm, TEMP1, mem; + alu64, 0x01, TEMP1, TEMP0; + ld, S8, TEMP0, TEMP1, 0; + st, S32, TEMP1, X, rd; })) /* LHU: Load Halfword Unsigned */ @@ -512,11 +512,11 @@ RVOP( }, X64({ mem; - ld, S32, RAX, X, rs1; - ld_imm, RBX, mem; - alu64, 0x01, RBX, RAX; - ld, S16, RAX, RBX, 0; - st, S32, RBX, X, rd; + ld, S32, TEMP0, X, rs1; + ld_imm, TEMP1, mem; 
+ alu64, 0x01, TEMP1, TEMP0; + ld, S16, TEMP0, TEMP1, 0; + st, S32, TEMP1, X, rd; })) /* There are 3 types of stores: byte, halfword, and word-sized. Unlike loads, @@ -531,11 +531,11 @@ RVOP( { rv->io.mem_write_b(rv->X[ir->rs1] + ir->imm, rv->X[ir->rs2]); }, X64({ mem; - ld, S32, RAX, X, rs1; - ld_imm, RBX, mem; - alu64, 0x01, RBX, RAX; - ld, S8, RBX, X, rs2; - st, S8, RBX, RAX, 0; + ld, S32, TEMP0, X, rs1; + ld_imm, TEMP1, mem; + alu64, 0x01, TEMP1, TEMP0; + ld, S8, TEMP1, X, rs2; + st, S8, TEMP1, TEMP0, 0; })) /* SH: Store Halfword */ @@ -548,11 +548,11 @@ RVOP( }, X64({ mem; - ld, S32, RAX, X, rs1; - ld_imm, RBX, mem; - alu64, 0x01, RBX, RAX; - ld, S16, RBX, X, rs2; - st, S16, RBX, RAX, 0; + ld, S32, TEMP0, X, rs1; + ld_imm, TEMP1, mem; + alu64, 0x01, TEMP1, TEMP0; + ld, S16, TEMP1, X, rs2; + st, S16, TEMP1, TEMP0, 0; })) /* SW: Store Word */ @@ -565,11 +565,11 @@ RVOP( }, X64({ mem; - ld, S32, RAX, X, rs1; - ld_imm, RBX, mem; - alu64, 0x01, RBX, RAX; - ld, S32, RBX, X, rs2; - st, S32, RBX, RAX, 0; + ld, S32, TEMP0, X, rs1; + ld_imm, TEMP1, mem; + alu64, 0x01, TEMP1, TEMP0; + ld, S32, TEMP1, X, rs2; + st, S32, TEMP1, TEMP0, 0; })) /* ADDI adds the sign-extended 12-bit immediate to register rs1. Arithmetic @@ -581,9 +581,9 @@ RVOP( addi, { rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) + ir->imm; }, X64({ - ld, S32, RAX, X, rs1; - alu32_imm, 32, 0x81, 0, RAX, imm; - st, S32, RAX, X, rd; + ld, S32, TEMP0, X, rs1; + alu32_imm, 32, 0x81, 0, TEMP0, imm; + st, S32, TEMP0, X, rd; })) /* SLTI place the value 1 in register rd if register rs1 is less than the @@ -594,8 +594,8 @@ RVOP( slti, { rv->X[ir->rd] = ((int32_t) (rv->X[ir->rs1]) < ir->imm) ? 1 : 0; }, X64({ - ld, S32, RAX, X, rs1; - cmp_imm, RAX, imm; + ld, S32, TEMP0, X, rs1; + cmp_imm, TEMP0, imm; st_imm, S32, rd, 1; set_jmp_off; jcc, 0x82; @@ -610,8 +610,8 @@ RVOP( sltiu, { rv->X[ir->rd] = (rv->X[ir->rs1] < (uint32_t) ir->imm) ? 
1 : 0; }, X64({ - ld, S32, RAX, X, rs1; - cmp_imm, RAX, imm; + ld, S32, TEMP0, X, rs1; + cmp_imm, TEMP0, imm; st_imm, S32, rd, 1; set_jmp_off; jcc, 0x82; @@ -624,9 +624,9 @@ RVOP( xori, { rv->X[ir->rd] = rv->X[ir->rs1] ^ ir->imm; }, X64({ - ld, S32, RAX, X, rs1; - alu32_imm, 32, 0x81, 6, RAX, imm; - st, S32, RAX, X, rd; + ld, S32, TEMP0, X, rs1; + alu32_imm, 32, 0x81, 6, TEMP0, imm; + st, S32, TEMP0, X, rd; })) /* ORI: OR Immediate */ @@ -634,9 +634,9 @@ RVOP( ori, { rv->X[ir->rd] = rv->X[ir->rs1] | ir->imm; }, X64({ - ld, S32, RAX, X, rs1; - alu32_imm, 32, 0x81, 1, RAX, imm; - st, S32, RAX, X, rd; + ld, S32, TEMP0, X, rs1; + alu32_imm, 32, 0x81, 1, TEMP0, imm; + st, S32, TEMP0, X, rd; })) /* ANDI performs bitwise AND on register rs1 and the sign-extended 12-bit @@ -646,9 +646,9 @@ RVOP( andi, { rv->X[ir->rd] = rv->X[ir->rs1] & ir->imm; }, X64({ - ld, S32, RAX, X, rs1; - alu32_imm, 32, 0x81, 4, RAX, imm; - st, S32, RAX, X, rd; + ld, S32, TEMP0, X, rs1; + alu32_imm, 32, 0x81, 4, TEMP0, imm; + st, S32, TEMP0, X, rd; })) FORCE_INLINE void shift_func(riscv_t *rv, const rv_insn_t *ir) @@ -676,9 +676,9 @@ RVOP( slli, { shift_func(rv, ir); }, X64({ - ld, S32, RAX, X, rs1; - alu32_imm, 8, 0xc1, 4, RAX, imm, 0x1f; - st, S32, RAX, X, rd; + ld, S32, TEMP0, X, rs1; + alu32_imm, 8, 0xc1, 4, TEMP0, imm, 0x1f; + st, S32, TEMP0, X, rd; })) /* SRLI performs logical right shift on the value in register rs1 by the shift @@ -688,9 +688,9 @@ RVOP( srli, { shift_func(rv, ir); }, X64({ - ld, S32, RAX, X, rs1; - alu32_imm, 8, 0xc1, 5, RAX, imm, 0x1f; - st, S32, RAX, X, rd; + ld, S32, TEMP0, X, rs1; + alu32_imm, 8, 0xc1, 5, TEMP0, imm, 0x1f; + st, S32, TEMP0, X, rd; })) /* SRAI performs arithmetic right shift on the value in register rs1 by the @@ -700,9 +700,9 @@ RVOP( srai, { shift_func(rv, ir); }, X64({ - ld, S32, RAX, X, rs1; - alu32_imm, 8, 0xc1, 7, RAX, imm, 0x1f; - st, S32, RAX, X, rd; + ld, S32, TEMP0, X, rs1; + alu32_imm, 8, 0xc1, 7, TEMP0, imm, 0x1f; + st, S32, TEMP0, X, rd; })) 
/* ADD */ @@ -712,10 +712,10 @@ RVOP( rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) + (int32_t) (rv->X[ir->rs2]); }, X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - alu32, 0x01, RBX, RAX; - st, S32, RAX, X, rd; + ld, S32, TEMP0, X, rs1; + ld, S32, TEMP1, X, rs2; + alu32, 0x01, TEMP1, TEMP0; + st, S32, TEMP0, X, rd; })) /* SUB: Substract */ @@ -725,10 +725,10 @@ RVOP( rv->X[ir->rd] = (int32_t) (rv->X[ir->rs1]) - (int32_t) (rv->X[ir->rs2]); }, X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - alu32, 0x29, RBX, RAX; - st, S32, RAX, X, rd; + ld, S32, TEMP0, X, rs1; + ld, S32, TEMP1, X, rs2; + alu32, 0x29, TEMP1, TEMP0; + st, S32, TEMP0, X, rd; })) /* SLL: Shift Left Logical */ @@ -736,11 +736,11 @@ RVOP( sll, { rv->X[ir->rd] = rv->X[ir->rs1] << (rv->X[ir->rs2] & 0x1f); }, X64({ - ld, S32, RAX, X, rs1; - ld, S32, RCX, X, rs2; - alu32_imm, 32, 0x81, 4, RCX, 0x1f; - alu32, 0xd3, 4, RAX; - st, S32, RAX, X, rd; + ld, S32, TEMP0, X, rs1; + ld, S32, TEMP2, X, rs2; + alu32_imm, 32, 0x81, 4, TEMP2, 0x1f; + alu32, 0xd3, 4, TEMP0; + st, S32, TEMP0, X, rd; })) /* SLT: Set on Less Than */ @@ -751,9 +751,9 @@ RVOP( ((int32_t) (rv->X[ir->rs1]) < (int32_t) (rv->X[ir->rs2])) ? 1 : 0; }, X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - cmp, RBX, RAX; + ld, S32, TEMP0, X, rs1; + ld, S32, TEMP1, X, rs2; + cmp, TEMP1, TEMP0; st_imm, S32, rd, 1; set_jmp_off; jcc, 0x82; @@ -766,9 +766,9 @@ RVOP( sltu, { rv->X[ir->rd] = (rv->X[ir->rs1] < rv->X[ir->rs2]) ? 
1 : 0; }, X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - cmp, RBX, RAX; + ld, S32, TEMP0, X, rs1; + ld, S32, TEMP1, X, rs2; + cmp, TEMP1, TEMP0; st_imm, S32, rd, 1; set_jmp_off; jcc, 0x82; @@ -783,10 +783,10 @@ RVOP( rv->X[ir->rd] = rv->X[ir->rs1] ^ rv->X[ir->rs2]; }, X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - alu32, 0x31, RBX, RAX; - st, S32, RAX, X, rd; + ld, S32, TEMP0, X, rs1; + ld, S32, TEMP1, X, rs2; + alu32, 0x31, TEMP1, TEMP0; + st, S32, TEMP0, X, rd; })) /* SRL: Shift Right Logical */ @@ -794,11 +794,11 @@ RVOP( srl, { rv->X[ir->rd] = rv->X[ir->rs1] >> (rv->X[ir->rs2] & 0x1f); }, X64({ - ld, S32, RAX, X, rs1; - ld, S32, RCX, X, rs2; - alu32_imm, 32, 0x81, 4, RCX, 0x1f; - alu32, 0xd3, 5, RAX; - st, S32, RAX, X, rd; + ld, S32, TEMP0, X, rs1; + ld, S32, TEMP2, X, rs2; + alu32_imm, 32, 0x81, 4, TEMP2, 0x1f; + alu32, 0xd3, 5, TEMP0; + st, S32, TEMP0, X, rd; })) /* SRA: Shift Right Arithmetic */ @@ -806,11 +806,11 @@ RVOP( sra, { rv->X[ir->rd] = ((int32_t) rv->X[ir->rs1]) >> (rv->X[ir->rs2] & 0x1f); }, X64({ - ld, S32, RAX, X, rs1; - ld, S32, RCX, X, rs2; - alu32_imm, 32, 0x81, 4, RCX, 0x1f; - alu32, 0xd3, 7, RAX; - st, S32, RAX, X, rd; + ld, S32, TEMP0, X, rs1; + ld, S32, TEMP2, X, rs2; + alu32_imm, 32, 0x81, 4, TEMP2, 0x1f; + alu32, 0xd3, 7, TEMP0; + st, S32, TEMP0, X, rd; })) /* OR */ @@ -819,10 +819,10 @@ RVOP( , { rv->X[ir->rd] = rv->X[ir->rs1] | rv->X[ir->rs2]; }, X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - alu32, 0x09, RBX, RAX; - st, S32, RAX, X, rd; + ld, S32, TEMP0, X, rs1; + ld, S32, TEMP1, X, rs2; + alu32, 0x09, TEMP1, TEMP0; + st, S32, TEMP0, X, rd; })) /* AND */ @@ -831,10 +831,10 @@ RVOP( and, { rv->X[ir->rd] = rv->X[ir->rs1] & rv->X[ir->rs2]; }, X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - alu32, 0x21, RBX, RAX; - st, S32, RAX, X, rd; + ld, S32, TEMP0, X, rs1; + ld, S32, TEMP1, X, rs2; + alu32, 0x21, TEMP1, TEMP0; + st, S32, TEMP0, X, rd; })) /* clang-format on */ @@ -849,8 +849,8 @@ RVOP( return true; 
}, X64({ - ld_imm, RAX, pc; - st, S32, RAX, PC; + ld_imm, TEMP0, pc; + st, S32, TEMP0, PC; call, ecall; exit; })) @@ -866,8 +866,8 @@ RVOP( return true; }, X64({ - ld_imm, RAX, pc; - st, S32, RAX, PC; + ld_imm, TEMP0, pc; + st, S32, TEMP0, PC; call, ebreak; exit; })) @@ -1029,10 +1029,10 @@ RVOP( mul, { rv->X[ir->rd] = (int32_t) rv->X[ir->rs1] * (int32_t) rv->X[ir->rs2]; }, X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - mul, 0x28, RBX, RAX, 0; - st, S32, RAX, X, rd; + ld, S32, TEMP0, X, rs1; + ld, S32, TEMP1, X, rs2; + mul, 0x28, TEMP1, TEMP0, 0; + st, S32, TEMP0, X, rd; })) /* MULH: Multiply High Signed Signed */ @@ -1047,11 +1047,11 @@ RVOP( rv->X[ir->rd] = ((uint64_t) (multiplicand * multiplier)) >> 32; }, X64({ - ld_sext, S32, RAX, X, rs1; - ld_sext, S32, RBX, X, rs2; - mul, 0x2f, RBX, RAX, 0; - alu64_imm, 8, 0xc1, 5, RAX, 32; - st, S32, RAX, X, rd; + ld_sext, S32, TEMP0, X, rs1; + ld_sext, S32, TEMP1, X, rs2; + mul, 0x2f, TEMP1, TEMP0, 0; + alu64_imm, 8, 0xc1, 5, TEMP0, 32; + st, S32, TEMP0, X, rd; })) /* MULHSU: Multiply High Signed Unsigned */ @@ -1067,11 +1067,11 @@ RVOP( rv->X[ir->rd] = ((uint64_t) (multiplicand * umultiplier)) >> 32; }, X64({ - ld_sext, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - mul, 0x2f, RBX, RAX, 0; - alu64_imm, 8, 0xc1, 5, RAX, 32; - st, S32, RAX, X, rd; + ld_sext, S32, TEMP0, X, rs1; + ld, S32, TEMP1, X, rs2; + mul, 0x2f, TEMP1, TEMP0, 0; + alu64_imm, 8, 0xc1, 5, TEMP0, 32; + st, S32, TEMP0, X, rd; })) /* MULHU: Multiply High Unsigned Unsigned */ @@ -1082,11 +1082,11 @@ RVOP( ((uint64_t) rv->X[ir->rs1] * (uint64_t) rv->X[ir->rs2]) >> 32; }, X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - mul, 0x2f, RBX, RAX, 0; - alu64_imm, 8, 0xc1, 5, RAX, 32; - st, S32, RAX, X, rd; + ld, S32, TEMP0, X, rs1; + ld, S32, TEMP1, X, rs2; + mul, 0x2f, TEMP1, TEMP0, 0; + alu64_imm, 8, 0xc1, 5, TEMP0, 32; + st, S32, TEMP0, X, rd; })) /* DIV: Divide Signed */ @@ -1108,15 +1108,15 @@ RVOP( : (unsigned int) (dividend / divisor); }, X64({ - 
ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - div, 0x38, RBX, RAX, 0; - cmp_imm, RBX, 0; + ld, S32, TEMP0, X, rs1; + ld, S32, TEMP1, X, rs2; + div, 0x38, TEMP1, TEMP0, 0; + cmp_imm, TEMP1, 0; set_jmp_off; jcc, 0x85; - ld_imm, RAX, -1; + ld_imm, TEMP0, -1; jmp_off; - st, S32, RAX, X, rd; + st, S32, TEMP0, X, rd; /* FIXME: handle overflow */ })) @@ -1135,15 +1135,15 @@ RVOP( rv->X[ir->rd] = !udivisor ? ~0U : udividend / udivisor; }, X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - div, 0x38, RBX, RAX, 0; - cmp_imm, RBX, 0; + ld, S32, TEMP0, X, rs1; + ld, S32, TEMP1, X, rs2; + div, 0x38, TEMP1, TEMP0, 0; + cmp_imm, TEMP1, 0; set_jmp_off; jcc, 0x85; - ld_imm, RAX, ~0U; + ld_imm, TEMP0, ~0U; jmp_off; - st, S32, RAX, X, rd; + st, S32, TEMP0, X, rd; })) /* clang-format off */ @@ -1164,10 +1164,10 @@ RVOP(rem, { % divisor); }, X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - mod, 0x98, RBX, RAX, 0; - st, S32, RAX, X, rd; + ld, S32, TEMP0, X, rs1; + ld, S32, TEMP1, X, rs2; + mod, 0x98, TEMP1, TEMP0, 0; + st, S32, TEMP0, X, rd; /* FIXME: handle overflow */ })) @@ -1185,10 +1185,10 @@ RVOP(remu, { % udivisor; }, X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - mod, 0x98, RBX, RAX, 0; - st, S32, RAX, X, rd; + ld, S32, TEMP0, X, rs1; + ld, S32, TEMP1, X, rs2; + mod, 0x98, TEMP1, TEMP0, 0; + st, S32, TEMP0, X, rd; })) /* clang-format on */ #endif @@ -1726,9 +1726,9 @@ RVOP(caddi4spn, { rv->X[ir->rd] = rv->X[rv_reg_sp] + (uint16_t) ir->imm; }, X64({ - ld, S32, RAX, X, rv_reg_sp; - alu32_imm, 32, 0x81, 0, RAX, uint, 16, imm; - st, S32, RAX, X, rd; + ld, S32, TEMP0, X, rv_reg_sp; + alu32_imm, 32, 0x81, 0, TEMP0, uint, 16, imm; + st, S32, TEMP0, X, rd; })) /* C.LW loads a 32-bit value from memory into register rd'. 
It computes an @@ -1743,11 +1743,11 @@ RVOP(clw, }, X64({ mem; - ld, S32, RAX, X, rs1; - ld_imm, RBX, mem; - alu64, 0x01, RBX, RAX; - ld, S32, RAX, RBX, 0; - st, S32, RBX, X, rd; + ld, S32, TEMP0, X, rs1; + ld_imm, TEMP1, mem; + alu64, 0x01, TEMP1, TEMP0; + ld, S32, TEMP0, TEMP1, 0; + st, S32, TEMP1, X, rd; })) /* C.SW stores a 32-bit value in register rs2' to memory. It computes an @@ -1763,11 +1763,11 @@ RVOP(csw, }, X64({ mem; - ld, S32, RAX, X, rs1; - ld_imm, RBX, mem; - alu64, 0x01, RBX, RAX; - ld, S32, RBX, X, rs2; - st, S32, RBX, RAX, 0; + ld, S32, TEMP0, X, rs1; + ld_imm, TEMP1, mem; + alu64, 0x01, TEMP1, TEMP0; + ld, S32, TEMP1, X, rs2; + st, S32, TEMP1, TEMP0, 0; })) /* C.NOP */ @@ -1781,9 +1781,9 @@ RVOP(cnop, {/* no operation */}, X64({/* no operation */})) */ RVOP(caddi, { rv->X[ir->rd] += (int16_t) ir->imm; }, X64({ - ld, S32, RAX, X, rd; - alu32_imm, 32, 0x81, 0, RAX, int, 16, imm; - st, S32, RAX, X, rd; + ld, S32, TEMP0, X, rd; + alu32_imm, 32, 0x81, 0, TEMP0, int, 16, imm; + st, S32, TEMP0, X, rd; })) /* C.JAL */ @@ -1812,10 +1812,10 @@ RVOP(cjal, return true; }, X64({ - ld_imm, RAX, pc, 2; - st, S32, RAX, X, rv_reg_ra; - ld_imm, RAX, pc, imm; - st, S32, RAX, PC; + ld_imm, TEMP0, pc, 2; + st, S32, TEMP0, X, rv_reg_ra; + ld_imm, TEMP0, pc, imm; + st, S32, TEMP0, PC; jmp, pc, imm; exit; })) @@ -1826,8 +1826,8 @@ RVOP(cjal, */ RVOP(cli, { rv->X[ir->rd] = ir->imm; }, X64({ - ld_imm, RAX, imm; - st, S32, RAX, X, rd; + ld_imm, TEMP0, imm; + st, S32, TEMP0, X, rd; })) /* C.ADDI16SP is used to adjust the stack pointer in procedure prologues @@ -1837,9 +1837,9 @@ RVOP(cli, { */ RVOP(caddi16sp, { rv->X[ir->rd] += ir->imm; }, X64({ - ld, S32, RAX, X, rd; - alu32_imm, 32, 0x81, 0, RAX, imm; - st, S32, RAX, X, rd; + ld, S32, TEMP0, X, rd; + alu32_imm, 32, 0x81, 0, TEMP0, imm; + st, S32, TEMP0, X, rd; })) /* C.LUI loads the non-zero 6-bit immediate field into bits 17–12 of the @@ -1851,8 +1851,8 @@ RVOP(caddi16sp, { */ RVOP(clui, { rv->X[ir->rd] = ir->imm; }, 
X64({ - ld_imm, RAX, imm; - st, S32, RAX, X, rd; + ld_imm, TEMP0, imm; + st, S32, TEMP0, X, rd; })) /* C.SRLI is a CB-format instruction that performs a logical right shift @@ -1862,9 +1862,9 @@ RVOP(clui, { */ RVOP(csrli, { rv->X[ir->rs1] >>= ir->shamt; }, X64({ - ld, S32, RAX, X, rs1; - alu32_imm, 8, 0xc1, 5, RAX, shamt; - st, S32, RAX, X, rs1; + ld, S32, TEMP0, X, rs1; + alu32_imm, 8, 0xc1, 5, TEMP0, shamt; + st, S32, TEMP0, X, rs1; })) /* C.SRAI is defined analogously to C.SRLI, but instead performs an @@ -1878,9 +1878,9 @@ RVOP(csrai, rv->X[ir->rs1] |= mask >> i; }, X64({ - ld, S32, RAX, X, rs1; - alu32_imm, 8, 0xc1, 7, RAX, shamt; - st, S32, RAX, X, rs1; + ld, S32, TEMP0, X, rs1; + alu32_imm, 8, 0xc1, 7, TEMP0, shamt; + st, S32, TEMP0, X, rs1; /* FIXME: Incomplete */ })) @@ -1890,43 +1890,43 @@ RVOP(csrai, */ RVOP(candi, { rv->X[ir->rs1] &= ir->imm; }, X64({ - ld, S32, RAX, X, rs1; - alu32_imm, 32, 0x81, 4, RAX, imm; - st, S32, RAX, X, rs1; + ld, S32, TEMP0, X, rs1; + alu32_imm, 32, 0x81, 4, TEMP0, imm; + st, S32, TEMP0, X, rs1; })) /* C.SUB */ RVOP(csub, { rv->X[ir->rd] = rv->X[ir->rs1] - rv->X[ir->rs2]; }, X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - alu32, 0x29, RBX, RAX; - st, S32, RAX, X, rd; + ld, S32, TEMP0, X, rs1; + ld, S32, TEMP1, X, rs2; + alu32, 0x29, TEMP1, TEMP0; + st, S32, TEMP0, X, rd; })) /* C.XOR */ RVOP(cxor, { rv->X[ir->rd] = rv->X[ir->rs1] ^ rv->X[ir->rs2]; }, X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - alu32, 0x31, RBX, RAX; - st, S32, RAX, X, rd; + ld, S32, TEMP0, X, rs1; + ld, S32, TEMP1, X, rs2; + alu32, 0x31, TEMP1, TEMP0; + st, S32, TEMP0, X, rd; })) RVOP(cor, { rv->X[ir->rd] = rv->X[ir->rs1] | rv->X[ir->rs2]; }, X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - alu32, 0x09, RBX, RAX; - st, S32, RAX, X, rd; + ld, S32, TEMP0, X, rs1; + ld, S32, TEMP1, X, rs2; + alu32, 0x09, TEMP1, TEMP0; + st, S32, TEMP0, X, rd; })) RVOP(cand, { rv->X[ir->rd] = rv->X[ir->rs1] & rv->X[ir->rs2]; }, X64({ - ld, S32, RAX, X, 
rs1; - ld, S32, RBX, X, rs2; - alu32, 0x21, RBX, RAX; - st, S32, RAX, X, rd; + ld, S32, TEMP0, X, rs1; + ld, S32, TEMP1, X, rs2; + alu32, 0x21, TEMP1, TEMP0; + st, S32, TEMP0, X, rd; })) /* C.J performs an unconditional control transfer. The offset is sign-extended @@ -1958,8 +1958,8 @@ RVOP(cj, return true; }, X64({ - ld_imm, RAX, pc, imm; - st, S32, RAX, PC; + ld_imm, TEMP0, pc, imm; + st, S32, TEMP0, PC; jmp, pc, imm; exit; })) @@ -2012,22 +2012,22 @@ RVOP(cbeqz, return true; }, X64({ - ld, S32, RAX, X, rs1; - cmp_imm, RAX, 0; + ld, S32, TEMP0, X, rs1; + cmp_imm, TEMP0, 0; set_jmp_off; jcc, 0x84; cond, branch_untaken; jmp, pc, 2; end; - ld_imm, RAX, pc, 2; - st, S32, RAX, PC; + ld_imm, TEMP0, pc, 2; + st, S32, TEMP0, PC; exit; jmp_off; cond, branch_taken; jmp, pc, imm; end; - ld_imm, RAX, pc, imm; - st, S32, RAX, PC; + ld_imm, TEMP0, pc, imm; + st, S32, TEMP0, PC; exit; })) @@ -2075,22 +2075,22 @@ RVOP(cbnez, return true; }, X64({ - ld, S32, RAX, X, rs1; - cmp_imm, RAX, 0; + ld, S32, TEMP0, X, rs1; + cmp_imm, TEMP0, 0; set_jmp_off; jcc, 0x85; cond, branch_untaken; jmp, pc, 2; end; - ld_imm, RAX, pc, 2; - st, S32, RAX, PC; + ld_imm, TEMP0, pc, 2; + st, S32, TEMP0, PC; exit; jmp_off; cond, branch_taken; jmp, pc, imm; end; - ld_imm, RAX, pc, imm; - st, S32, RAX, PC; + ld_imm, TEMP0, pc, imm; + st, S32, TEMP0, PC; exit; })) @@ -2100,9 +2100,9 @@ RVOP(cbnez, */ RVOP(cslli, { rv->X[ir->rd] <<= (uint8_t) ir->imm; }, X64({ - ld, S32, RAX, X, rd; - alu32_imm, 8, 0xc1, 4, RAX, uint, 8, imm; - st, S32, RAX, X, rd; + ld, S32, TEMP0, X, rd; + alu32_imm, 8, 0xc1, 4, TEMP0, uint, 8, imm; + st, S32, TEMP0, X, rd; })) /* C.LWSP */ @@ -2114,11 +2114,11 @@ RVOP(clwsp, }, X64({ mem; - ld, S32, RAX, X, rv_reg_sp; - ld_imm, RBX, mem; - alu64, 0x01, RBX, RAX; - ld, S32, RAX, RBX, 0; - st, S32, RBX, X, rd; + ld, S32, TEMP0, X, rv_reg_sp; + ld_imm, TEMP1, mem; + alu64, 0x01, TEMP1, TEMP0; + ld, S32, TEMP0, TEMP1, 0; + st, S32, TEMP1, X, rd; })) /* C.JR */ @@ -2133,16 +2133,16 @@ 
RVOP(cjr, return true; }, X64({ - ld, S32, RAX, X, rs1; - st, S32, RAX, PC; + ld, S32, TEMP0, X, rs1; + st, S32, TEMP0, PC; exit; })) /* C.MV */ RVOP(cmv, { rv->X[ir->rd] = rv->X[ir->rs2]; }, X64({ - ld, S32, RAX, X, rs2; - st, S32, RAX, X, rd; + ld, S32, TEMP0, X, rs2; + st, S32, TEMP0, X, rd; })) /* C.EBREAK */ @@ -2155,10 +2155,10 @@ RVOP(cebreak, return true; }, X64({ - ld_imm, RAX, pc; - st, S32, RAX, PC; - ld_imm, RAX, 1; - st, S32, RAX, compressed; + ld_imm, TEMP0, pc; + st, S32, TEMP0, PC; + ld_imm, TEMP0, 1; + st, S32, TEMP0, compressed; call, ebreak; exit; })) @@ -2179,10 +2179,10 @@ RVOP(cjalr, return true; }, X64({ - ld_imm, RAX, pc, 2; - st, S32, RAX, X, rv_reg_ra; - ld, S32, RAX, X, rs1; - st, S32, RAX, PC; + ld_imm, TEMP0, pc, 2; + st, S32, TEMP0, X, rv_reg_ra; + ld, S32, TEMP0, X, rs1; + st, S32, TEMP0, PC; exit; })) @@ -2195,10 +2195,10 @@ RVOP(cjalr, */ RVOP(cadd, { rv->X[ir->rd] = rv->X[ir->rs1] + rv->X[ir->rs2]; }, X64({ - ld, S32, RAX, X, rs1; - ld, S32, RBX, X, rs2; - alu32, 0x01, RBX, RAX; - st, S32, RAX, X, rd; + ld, S32, TEMP0, X, rs1; + ld, S32, TEMP1, X, rs2; + alu32, 0x01, TEMP1, TEMP0; + st, S32, TEMP0, X, rd; })) /* C.SWSP */ @@ -2210,10 +2210,10 @@ RVOP(cswsp, }, X64({ mem; - ld, S32, RAX, X, rv_reg_sp; - ld_imm, RBX, mem; - alu64, 0x01, RBX, RAX; - ld, S32, RBX, X, rs2; - st, S32, RBX, RAX, 0; + ld, S32, TEMP0, X, rv_reg_sp; + ld_imm, TEMP1, mem; + alu64, 0x01, TEMP1, TEMP0; + ld, S32, TEMP1, X, rs2; + st, S32, TEMP1, TEMP0, 0; })) #endif diff --git a/tools/gen-jit-template.py b/tools/gen-jit-template.py index 83b522815..744ea12f3 100755 --- a/tools/gen-jit-template.py +++ b/tools/gen-jit-template.py @@ -136,6 +136,7 @@ def remove_comment(str): f.close() fields = {"imm", "pc", "rs1", "rs2", "rd", "shamt", "branch_taken", "branch_untaken"} +temp_regs = {"TEMP0", "TEMP1", "TEMP2"} # generate jit template for i in range(len(op)): if (not SKIP_LIST.count(op[i])): @@ -149,6 +150,8 @@ def remove_comment(str): for i in range(len(items)): if 
items[i] in fields: items[i] = "ir->" + items[i] + if items[i] in temp_regs: + items[i] = "temp_reg[" + items[i][-1] + "]" if items[0] == "alu32_imm": if len(items) == 8: asm = "emit_alu32_imm{}(state, {}, {}, {}, ({}{}_t) {});".format( @@ -219,7 +222,7 @@ def remove_comment(str): elif items[0] == "set_jmp_off": asm = "uint32_t jump_loc = state->offset;" elif items[0] == "jmp_off": - asm = "emit_jump_target_offset(state, jump_loc + 2, state->offset);" + asm = "emit_jump_target_offset(state, JUMP_LOC, state->offset);" elif items[0] == "mem": asm = "memory_t *m = ((state_t *) rv->userdata)->mem;" elif items[0] == "call":