0048-RISCV-Peephole-optimisation-for-load-store-of-global.patch

From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
From: Alex Bradbury <asb@lowrisc.org>
Subject: [RISCV] Peephole optimisation for load/store of global value or
 constant addresses

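Fold an ADDI that produces the base of a zero-offset load/store into the
memory operation's immediate, when the ADDI's immediate operand is a
constant or a global address:

(load (add base, off), 0) -> (load base, off)
(store val, (add base, off), 0) -> (store val, base, off)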
---
 lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 91 ++++++++++++++++++++++++++++++++++
 test/CodeGen/RISCV/blockaddress.ll     | 11 ++--
 test/CodeGen/RISCV/byval.ll            | 12 ++---
 test/CodeGen/RISCV/fp128.ll            | 48 ++++++------------
 test/CodeGen/RISCV/inline-asm.ll       |  3 +-
 test/CodeGen/RISCV/mem.ll              | 21 +++-----
 test/CodeGen/RISCV/mem64.ll            | 12 ++---
 test/CodeGen/RISCV/wide-mem.ll         |  6 +--
 8 files changed, 132 insertions(+), 72 deletions(-)

diff --git a/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index d07301df718..2dbbd704f26 100644
--- a/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -42,6 +42,8 @@ public:
     return SelectionDAGISel::runOnMachineFunction(MF);
   }
 
+  void PostprocessISelDAG() override;
+
   void Select(SDNode *Node) override;
 
   bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
@@ -51,9 +53,14 @@ public:
 
 // Include the pieces autogenerated from the target description.
 #include "RISCVGenDAGISel.inc"
+
+private:
+  void doPeepholeLoadStoreADDI();
 };
 }
 
+void RISCVDAGToDAGISel::PostprocessISelDAG() { doPeepholeLoadStoreADDI(); }
+
 void RISCVDAGToDAGISel::Select(SDNode *Node) {
   unsigned Opcode = Node->getOpcode();
   MVT XLenVT = Subtarget->getXLenVT();
@@ -117,6 +124,90 @@ bool RISCVDAGToDAGISel::SelectAddrFI(SDValue Addr, SDValue &Base) {
   return false;
 }
 
+void RISCVDAGToDAGISel::doPeepholeLoadStoreADDI() {
+  SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
+  ++Position; // One past the root; the loop below pre-decrements.
+  // Scan from the root backwards, folding ADDI bases into zero-offset ld/st.
+  while (Position != CurDAG->allnodes_begin()) {
+    SDNode *N = &*--Position;
+    // Skip nodes that have no uses and nodes that are not machine opcodes.
+    if (N->use_empty() || !N->isMachineOpcode())
+      continue;
+    // Operand positions of the base address and the offset for this opcode.
+    int OffsetOpIdx;
+    int BaseOpIdx;
+
+    // Only attempt this optimisation for I-type loads and S-type stores.
+    switch (N->getMachineOpcode()) {
+    default:
+      continue;
+    case RISCV::LB:
+    case RISCV::LH:
+    case RISCV::LW:
+    case RISCV::LBU:
+    case RISCV::LHU:
+    case RISCV::LWU:
+    case RISCV::LD:
+    case RISCV::FLW:
+    case RISCV::FLD:
+      BaseOpIdx = 0; // Loads take (base, offset, ...).
+      OffsetOpIdx = 1;
+      break;
+    case RISCV::SB:
+    case RISCV::SH:
+    case RISCV::SW:
+    case RISCV::SD:
+    case RISCV::FSW:
+    case RISCV::FSD:
+      BaseOpIdx = 1; // Stores take (value, base, offset, ...).
+      OffsetOpIdx = 2;
+      break;
+    }
+
+    // Currently, the load/store offset must be 0 to be considered for this
+    // peephole optimisation.
+    if (!isa<ConstantSDNode>(N->getOperand(OffsetOpIdx)) || N->getConstantOperandVal(OffsetOpIdx) != 0)
+      continue;
+
+    SDValue Base = N->getOperand(BaseOpIdx);
+
+    // Only a base produced by an ADDI can be merged in to the load/store.
+    if (!Base.isMachineOpcode() || Base.getMachineOpcode() != RISCV::ADDI)
+      continue;
+    // Operand 1 of the ADDI is its immediate; operand 0 is the ADDI's own base.
+    SDValue ImmOperand = Base.getOperand(1);
+    // Re-create the immediate as a target constant or target global address.
+    if (auto Const = dyn_cast<ConstantSDNode>(ImmOperand)) {
+      ImmOperand = CurDAG->getTargetConstant(
+          Const->getSExtValue(), SDLoc(ImmOperand), ImmOperand.getValueType());
+    } else if (auto GA = dyn_cast<GlobalAddressSDNode>(ImmOperand)) {
+      ImmOperand = CurDAG->getTargetGlobalAddress(
+          GA->getGlobal(), SDLoc(ImmOperand), ImmOperand.getValueType(),
+          GA->getOffset(), GA->getTargetFlags());
+    } else {
+      continue; // Any other kind of ADDI immediate is left untouched.
+    }
+
+    DEBUG(dbgs() << "Folding add-immediate into mem-op:\nBase:    ");
+    DEBUG(Base->dump(CurDAG));
+    DEBUG(dbgs() << "\nN: ");
+    DEBUG(N->dump(CurDAG));
+    DEBUG(dbgs() << "\n");
+
+    // Use the ADDI's base as the new base and its immediate as the new offset;
+    if (BaseOpIdx == 0) // Load
+      CurDAG->UpdateNodeOperands(N, Base.getOperand(0), ImmOperand,
+                                 N->getOperand(2));
+    else // Store
+      CurDAG->UpdateNodeOperands(N, N->getOperand(0), Base.getOperand(0),
+                                 ImmOperand, N->getOperand(3));
+    // the remaining operands of N are passed through unchanged.
+    // The add-immediate may now be dead, in which case remove it.
+    if (Base.getNode()->use_empty())
+      CurDAG->RemoveDeadNode(Base.getNode());
+  }
+}
+
 // This pass converts a legalized DAG into a RISCV-specific DAG, ready
 // for instruction scheduling.
 FunctionPass *llvm::createRISCVISelDag(RISCVTargetMachine &TM) {
diff --git a/test/CodeGen/RISCV/blockaddress.ll b/test/CodeGen/RISCV/blockaddress.ll
index 266bf4f2b27..8bf50bdb544 100644
--- a/test/CodeGen/RISCV/blockaddress.ll
+++ b/test/CodeGen/RISCV/blockaddress.ll
@@ -9,12 +9,11 @@ define void @test_blockaddress() nounwind {
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    addi sp, sp, -16
 ; RV32I-NEXT:    sw ra, 12(sp)
-; RV32I-NEXT:    lui a0, %hi(addr)
-; RV32I-NEXT:    addi a0, a0, %lo(addr)
-; RV32I-NEXT:    lui a1, %hi(.Ltmp0)
-; RV32I-NEXT:    addi a1, a1, %lo(.Ltmp0)
-; RV32I-NEXT:    sw a1, 0(a0)
-; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lui a0, %hi(.Ltmp0)
+; RV32I-NEXT:    addi a0, a0, %lo(.Ltmp0)
+; RV32I-NEXT:    lui a1, %hi(addr)
+; RV32I-NEXT:    sw a0, %lo(addr)(a1)
+; RV32I-NEXT:    lw a0, %lo(addr)(a1)
 ; RV32I-NEXT:    jalr zero, a0, 0
 ; RV32I-NEXT:  .Ltmp0: # Block address taken
 ; RV32I-NEXT:  .LBB0_1: # %block
diff --git a/test/CodeGen/RISCV/byval.ll b/test/CodeGen/RISCV/byval.ll
index 30ba8e6562e..5e2783b6ce3 100644
--- a/test/CodeGen/RISCV/byval.ll
+++ b/test/CodeGen/RISCV/byval.ll
@@ -23,20 +23,16 @@ define void @caller() nounwind {
 ; RV32I-NEXT:    addi sp, sp, -32
 ; RV32I-NEXT:    sw ra, 28(sp)
 ; RV32I-NEXT:    lui a0, %hi(foo+12)
-; RV32I-NEXT:    addi a0, a0, %lo(foo+12)
-; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a0, %lo(foo+12)(a0)
 ; RV32I-NEXT:    sw a0, 24(sp)
 ; RV32I-NEXT:    lui a0, %hi(foo+8)
-; RV32I-NEXT:    addi a0, a0, %lo(foo+8)
-; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a0, %lo(foo+8)(a0)
 ; RV32I-NEXT:    sw a0, 20(sp)
 ; RV32I-NEXT:    lui a0, %hi(foo+4)
-; RV32I-NEXT:    addi a0, a0, %lo(foo+4)
-; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a0, %lo(foo+4)(a0)
 ; RV32I-NEXT:    sw a0, 16(sp)
 ; RV32I-NEXT:    lui a0, %hi(foo)
-; RV32I-NEXT:    addi a0, a0, %lo(foo)
-; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a0, %lo(foo)(a0)
 ; RV32I-NEXT:    sw a0, 12(sp)
 ; RV32I-NEXT:    lui a0, %hi(callee)
 ; RV32I-NEXT:    addi a1, a0, %lo(callee)
diff --git a/test/CodeGen/RISCV/fp128.ll b/test/CodeGen/RISCV/fp128.ll
index 0e3b6debd9e..8041efb1699 100644
--- a/test/CodeGen/RISCV/fp128.ll
+++ b/test/CodeGen/RISCV/fp128.ll
@@ -14,36 +14,28 @@ define i32 @test_load_and_cmp() nounwind {
 ; RV32I-NEXT:    addi sp, sp, -48
 ; RV32I-NEXT:    sw ra, 44(sp)
 ; RV32I-NEXT:    lui a0, %hi(y+12)
-; RV32I-NEXT:    addi a0, a0, %lo(y+12)
-; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a0, %lo(y+12)(a0)
 ; RV32I-NEXT:    sw a0, 20(sp)
 ; RV32I-NEXT:    lui a0, %hi(y+8)
-; RV32I-NEXT:    addi a0, a0, %lo(y+8)
-; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a0, %lo(y+8)(a0)
 ; RV32I-NEXT:    sw a0, 16(sp)
 ; RV32I-NEXT:    lui a0, %hi(y+4)
-; RV32I-NEXT:    addi a0, a0, %lo(y+4)
-; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a0, %lo(y+4)(a0)
 ; RV32I-NEXT:    sw a0, 12(sp)
 ; RV32I-NEXT:    lui a0, %hi(y)
-; RV32I-NEXT:    addi a0, a0, %lo(y)
-; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a0, %lo(y)(a0)
 ; RV32I-NEXT:    sw a0, 8(sp)
 ; RV32I-NEXT:    lui a0, %hi(x+12)
-; RV32I-NEXT:    addi a0, a0, %lo(x+12)
-; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a0, %lo(x+12)(a0)
 ; RV32I-NEXT:    sw a0, 36(sp)
 ; RV32I-NEXT:    lui a0, %hi(x+8)
-; RV32I-NEXT:    addi a0, a0, %lo(x+8)
-; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a0, %lo(x+8)(a0)
 ; RV32I-NEXT:    sw a0, 32(sp)
 ; RV32I-NEXT:    lui a0, %hi(x+4)
-; RV32I-NEXT:    addi a0, a0, %lo(x+4)
-; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a0, %lo(x+4)(a0)
 ; RV32I-NEXT:    sw a0, 28(sp)
 ; RV32I-NEXT:    lui a0, %hi(x)
-; RV32I-NEXT:    addi a0, a0, %lo(x)
-; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a0, %lo(x)(a0)
 ; RV32I-NEXT:    sw a0, 24(sp)
 ; RV32I-NEXT:    lui a0, %hi(__netf2)
 ; RV32I-NEXT:    addi a2, a0, %lo(__netf2)
@@ -68,36 +60,28 @@ define i32 @test_add_and_fptosi() nounwind {
 ; RV32I-NEXT:    addi sp, sp, -80
 ; RV32I-NEXT:    sw ra, 76(sp)
 ; RV32I-NEXT:    lui a0, %hi(y+12)
-; RV32I-NEXT:    addi a0, a0, %lo(y+12)
-; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a0, %lo(y+12)(a0)
 ; RV32I-NEXT:    sw a0, 36(sp)
 ; RV32I-NEXT:    lui a0, %hi(y+8)
-; RV32I-NEXT:    addi a0, a0, %lo(y+8)
-; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a0, %lo(y+8)(a0)
 ; RV32I-NEXT:    sw a0, 32(sp)
 ; RV32I-NEXT:    lui a0, %hi(y+4)
-; RV32I-NEXT:    addi a0, a0, %lo(y+4)
-; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a0, %lo(y+4)(a0)
 ; RV32I-NEXT:    sw a0, 28(sp)
 ; RV32I-NEXT:    lui a0, %hi(y)
-; RV32I-NEXT:    addi a0, a0, %lo(y)
-; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a0, %lo(y)(a0)
 ; RV32I-NEXT:    sw a0, 24(sp)
 ; RV32I-NEXT:    lui a0, %hi(x+12)
-; RV32I-NEXT:    addi a0, a0, %lo(x+12)
-; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a0, %lo(x+12)(a0)
 ; RV32I-NEXT:    sw a0, 52(sp)
 ; RV32I-NEXT:    lui a0, %hi(x+8)
-; RV32I-NEXT:    addi a0, a0, %lo(x+8)
-; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a0, %lo(x+8)(a0)
 ; RV32I-NEXT:    sw a0, 48(sp)
 ; RV32I-NEXT:    lui a0, %hi(x+4)
-; RV32I-NEXT:    addi a0, a0, %lo(x+4)
-; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a0, %lo(x+4)(a0)
 ; RV32I-NEXT:    sw a0, 44(sp)
 ; RV32I-NEXT:    lui a0, %hi(x)
-; RV32I-NEXT:    addi a0, a0, %lo(x)
-; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a0, %lo(x)(a0)
 ; RV32I-NEXT:    sw a0, 40(sp)
 ; RV32I-NEXT:    lui a0, %hi(__addtf3)
 ; RV32I-NEXT:    addi a3, a0, %lo(__addtf3)
diff --git a/test/CodeGen/RISCV/inline-asm.ll b/test/CodeGen/RISCV/inline-asm.ll
index 05bafb93c5b..8deb19065cf 100644
--- a/test/CodeGen/RISCV/inline-asm.ll
+++ b/test/CodeGen/RISCV/inline-asm.ll
@@ -8,8 +8,7 @@ define i32 @constraint_r(i32 %a) {
 ; RV32I-LABEL: constraint_r:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a1, %hi(gi)
-; RV32I-NEXT:    addi a1, a1, %lo(gi)
-; RV32I-NEXT:    lw a1, 0(a1)
+; RV32I-NEXT:    lw a1, %lo(gi)(a1)
 ; RV32I-NEXT:    #APP
 ; RV32I-NEXT:    add a0, a0, a1
 ; RV32I-NEXT:    #NO_APP
diff --git a/test/CodeGen/RISCV/mem.ll b/test/CodeGen/RISCV/mem.ll
index 6446034e542..ee340600784 100644
--- a/test/CodeGen/RISCV/mem.ll
+++ b/test/CodeGen/RISCV/mem.ll
@@ -163,17 +163,14 @@ define i16 @load_sext_zext_anyext_i1_i16(i1 *%a) nounwind {
 @G = global i32 0
 
 define i32 @lw_sw_global(i32 %a) nounwind {
-; TODO: the addi should be folded in to the lw/sw operations
 ; RV32I-LABEL: lw_sw_global:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lui a1, %hi(G)
-; RV32I-NEXT:    addi a2, a1, %lo(G)
-; RV32I-NEXT:    lw a1, 0(a2)
-; RV32I-NEXT:    sw a0, 0(a2)
+; RV32I-NEXT:    lui a2, %hi(G)
+; RV32I-NEXT:    lw a1, %lo(G)(a2)
+; RV32I-NEXT:    sw a0, %lo(G)(a2)
 ; RV32I-NEXT:    lui a2, %hi(G+36)
-; RV32I-NEXT:    addi a2, a2, %lo(G+36)
-; RV32I-NEXT:    lw a3, 0(a2)
-; RV32I-NEXT:    sw a0, 0(a2)
+; RV32I-NEXT:    lw a3, %lo(G+36)(a2)
+; RV32I-NEXT:    sw a0, %lo(G+36)(a2)
 ; RV32I-NEXT:    addi a0, a1, 0
 ; RV32I-NEXT:    jalr zero, ra, 0
   %1 = load volatile i32, i32* @G
@@ -186,13 +183,11 @@ define i32 @lw_sw_global(i32 %a) nounwind {
 
 ; Ensure that 1 is added to the high 20 bits if bit 11 of the low part is 1
 define i32 @lw_sw_constant(i32 %a) nounwind {
-; TODO: the addi should be folded in to the lw/sw
 ; RV32I-LABEL: lw_sw_constant:
 ; RV32I:       # %bb.0:
-; RV32I-NEXT:    lui a1, 912092
-; RV32I-NEXT:    addi a2, a1, -273
-; RV32I-NEXT:    lw a1, 0(a2)
-; RV32I-NEXT:    sw a0, 0(a2)
+; RV32I-NEXT:    lui a2, 912092
+; RV32I-NEXT:    lw a1, -273(a2)
+; RV32I-NEXT:    sw a0, -273(a2)
 ; RV32I-NEXT:    addi a0, a1, 0
 ; RV32I-NEXT:    jalr zero, ra, 0
   %1 = inttoptr i32 3735928559 to i32*
diff --git a/test/CodeGen/RISCV/mem64.ll b/test/CodeGen/RISCV/mem64.ll
index 9aaf3afc0dc..96ac2fe82de 100644
--- a/test/CodeGen/RISCV/mem64.ll
+++ b/test/CodeGen/RISCV/mem64.ll
@@ -210,14 +210,11 @@ define i64 @ld_sd_global(i64 %a) nounwind {
-; TODO: the addi should be folded in to the ld/sd operations
 ; RV64I-LABEL: ld_sd_global:
 ; RV64I:       # %bb.0:
-; RV64I-NEXT:    lui a1, %hi(G)
-; RV64I-NEXT:    addi a2, a1, %lo(G)
-; RV64I-NEXT:    ld a1, 0(a2)
-; RV64I-NEXT:    sd a0, 0(a2)
+; RV64I-NEXT:    lui a2, %hi(G)
+; RV64I-NEXT:    ld a1, %lo(G)(a2)
+; RV64I-NEXT:    sd a0, %lo(G)(a2)
 ; RV64I-NEXT:    lui a2, %hi(G+72)
-; RV64I-NEXT:    addi a2, a2, %lo(G+72)
-; RV64I-NEXT:    ld a3, 0(a2)
-; RV64I-NEXT:    sd a0, 0(a2)
+; RV64I-NEXT:    ld a3, %lo(G+72)(a2)
+; RV64I-NEXT:    sd a0, %lo(G+72)(a2)
 ; RV64I-NEXT:    addi a0, a1, 0
 ; RV64I-NEXT:    jalr zero, ra, 0
   %1 = load volatile i64, i64* @G
diff --git a/test/CodeGen/RISCV/wide-mem.ll b/test/CodeGen/RISCV/wide-mem.ll
index cbb89f631a5..33a51efe818 100644
--- a/test/CodeGen/RISCV/wide-mem.ll
+++ b/test/CodeGen/RISCV/wide-mem.ll
@@ -23,11 +23,9 @@ define i64 @load_i64_global() nounwind {
 ; RV32I-LABEL: load_i64_global:
 ; RV32I:       # %bb.0:
 ; RV32I-NEXT:    lui a0, %hi(val64)
-; RV32I-NEXT:    addi a0, a0, %lo(val64)
-; RV32I-NEXT:    lw a0, 0(a0)
+; RV32I-NEXT:    lw a0, %lo(val64)(a0)
 ; RV32I-NEXT:    lui a1, %hi(val64+4)
-; RV32I-NEXT:    addi a1, a1, %lo(val64+4)
-; RV32I-NEXT:    lw a1, 0(a1)
+; RV32I-NEXT:    lw a1, %lo(val64+4)(a1)
 ; RV32I-NEXT:    jalr zero, ra, 0
   %1 = load i64, i64* @val64
   ret i64 %1
-- 
2.16.2