[RISCV] Add combines to form binop from tail insert idioms (llvm#72675)
This patch contains two related combines:
1) If we have a scalar insert into the result of a concat_vectors, sink
   the insert into the relevant operand of the concat.
2) If we have an insert of a scalar binop into a vector binop of the
   same opcode, where the RHS of each is constant, perform the insert
   first and then the binop.

The common theme of both is pushing inserts closer to the sources of
the computation graph. The goal is to enable forming vector binops from
idioms that insert scalar binop results at the tail of another vector.
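
As a minimal sketch of the tail-insert idiom the second combine targets
(the function and value names here are hypothetical, loosely modeled on
the fixed-vectors-buildvec-of-binop.ll test changed below):

define <4 x i32> @tail_insert_binop(<4 x i32> %vin, i32 %a) {
  ; vector add with a constant RHS, followed by an insert of a scalar
  ; add that also has a constant RHS
  %vadd = add <4 x i32> %vin, <i32 1, i32 2, i32 3, i32 4>
  %e = add i32 %a, 23
  %res = insertelement <4 x i32> %vadd, i32 %e, i32 3
  ret <4 x i32> %res
}

After the combine, the DAG instead inserts %a into %vin and 23 into the
constant vector, then performs a single vector add.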

For RISCV specifically, the concat_vectors transform pushes inserts
into smaller vectors. This reduces LMUL for the vslides and usually
doesn't require an additional vsetvli, since the source vectors are
already operating at the narrower VL. I tried this one as a
target-independent combine first, and it doesn't appear profitable on
all targets.
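
For the concat case, a rough sketch of the input pattern (hypothetical
names, loosely mirroring concat-vector-insert-elt.ll below); the concat
typically reaches the DAG as a concat_vectors node, so the insert at
index 2 is rewritten as an insert at index 0 of the second <2 x i8>
half, keeping the insert/slide work at the narrower type:

define void @concat_then_insert(ptr %a, ptr %b, i8 %x) {
  %v1 = load <2 x i8>, ptr %a
  %v2 = load <2 x i8>, ptr %b
  ; concat of the two halves, then a scalar insert into the wide result
  %cat = shufflevector <2 x i8> %v1, <2 x i8> %v2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %ins = insertelement <4 x i8> %cat, i8 %x, i32 2
  store <4 x i8> %ins, ptr %a
  ret void
}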

This is only one approach to the problem. Another idea would be to
aggressively form build_vectors and subvector inserts from the
individual scalar inserts, and then have a transform that sinks a
subvector_insert down through the concat. The advantage of that
alternative is that it exposes parallelism in the insert sequence even
if the source vector isn't a concat_vectors. If reviewers are okay with
it, I'd like to start with this approach and then explore that
direction in a follow-up patch.
preames authored Nov 30, 2023
1 parent 0ef013c commit ff5e536
Showing 3 changed files with 130 additions and 75 deletions.
76 changes: 75 additions & 1 deletion llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -1393,7 +1393,8 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
                         ISD::VP_GATHER, ISD::VP_SCATTER, ISD::SRA, ISD::SRL,
                         ISD::SHL, ISD::STORE, ISD::SPLAT_VECTOR,
                         ISD::BUILD_VECTOR, ISD::CONCAT_VECTORS,
                         ISD::EXPERIMENTAL_VP_REVERSE, ISD::MUL});
                         ISD::EXPERIMENTAL_VP_REVERSE, ISD::MUL,
                         ISD::INSERT_VECTOR_ELT});
  if (Subtarget.hasVendorXTHeadMemPair())
    setTargetDAGCombine({ISD::LOAD, ISD::STORE});
  if (Subtarget.useRVVForFixedLengthVectors())
@@ -14342,6 +14343,75 @@ static SDValue performBUILD_VECTORCombine(SDNode *N, SelectionDAG &DAG,
                     DAG.getBuildVector(VT, DL, RHSOps));
}

static SDValue performINSERT_VECTOR_ELTCombine(SDNode *N, SelectionDAG &DAG,
                                               const RISCVSubtarget &Subtarget,
                                               const RISCVTargetLowering &TLI) {
  SDValue InVec = N->getOperand(0);
  SDValue InVal = N->getOperand(1);
  SDValue EltNo = N->getOperand(2);
  SDLoc DL(N);

  EVT VT = InVec.getValueType();
  if (VT.isScalableVector())
    return SDValue();

  if (!InVec.hasOneUse())
    return SDValue();

  // Given insert_vector_elt (binop a, VecC), (same_binop b, C2), Elt
  // move the insert_vector_elts into the arms of the binop. Note that
  // the new RHS must be a constant.
  const unsigned InVecOpcode = InVec->getOpcode();
  if (InVecOpcode == InVal->getOpcode() && TLI.isBinOp(InVecOpcode) &&
      InVal.hasOneUse()) {
    SDValue InVecLHS = InVec->getOperand(0);
    SDValue InVecRHS = InVec->getOperand(1);
    SDValue InValLHS = InVal->getOperand(0);
    SDValue InValRHS = InVal->getOperand(1);

    if (!ISD::isBuildVectorOfConstantSDNodes(InVecRHS.getNode()))
      return SDValue();
    if (!isa<ConstantSDNode>(InValRHS) && !isa<ConstantFPSDNode>(InValRHS))
      return SDValue();
    // FIXME: Return failure if the RHS type doesn't match the LHS. Shifts may
    // have different LHS and RHS types.
    if (InVec.getOperand(0).getValueType() != InVec.getOperand(1).getValueType())
      return SDValue();
    SDValue LHS = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT,
                              InVecLHS, InValLHS, EltNo);
    SDValue RHS = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, VT,
                              InVecRHS, InValRHS, EltNo);
    return DAG.getNode(InVecOpcode, DL, VT, LHS, RHS);
  }

  // Given insert_vector_elt (concat_vectors ...), InVal, Elt
  // move the insert_vector_elt to the source operand of the concat_vector.
  if (InVec.getOpcode() != ISD::CONCAT_VECTORS)
    return SDValue();

  auto *IndexC = dyn_cast<ConstantSDNode>(EltNo);
  if (!IndexC)
    return SDValue();
  unsigned Elt = IndexC->getZExtValue();

  EVT ConcatVT = InVec.getOperand(0).getValueType();
  if (ConcatVT.getVectorElementType() != InVal.getValueType())
    return SDValue();
  unsigned ConcatNumElts = ConcatVT.getVectorNumElements();
  SDValue NewIdx = DAG.getConstant(Elt % ConcatNumElts, DL,
                                   EltNo.getValueType());

  unsigned ConcatOpIdx = Elt / ConcatNumElts;
  SDValue ConcatOp = InVec.getOperand(ConcatOpIdx);
  ConcatOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ConcatVT,
                         ConcatOp, InVal, NewIdx);

  SmallVector<SDValue> ConcatOps;
  ConcatOps.append(InVec->op_begin(), InVec->op_end());
  ConcatOps[ConcatOpIdx] = ConcatOp;
  return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, ConcatOps);
}

// If we're concatenating a series of vector loads like
// concat_vectors (load v4i8, p+0), (load v4i8, p+n), (load v4i8, p+n*2) ...
// Then we can turn this into a strided load by widening the vector elements
@@ -15407,6 +15477,10 @@ SDValue RISCVTargetLowering::PerformDAGCombine(SDNode *N,
    if (SDValue V = performCONCAT_VECTORSCombine(N, DAG, Subtarget, *this))
      return V;
    break;
  case ISD::INSERT_VECTOR_ELT:
    if (SDValue V = performINSERT_VECTOR_ELTCombine(N, DAG, Subtarget, *this))
      return V;
    break;
  case RISCVISD::VFMV_V_F_VL: {
    const MVT VT = N->getSimpleValueType(0);
    SDValue Passthru = N->getOperand(0);
113 changes: 50 additions & 63 deletions llvm/test/CodeGen/RISCV/rvv/concat-vector-insert-elt.ll
@@ -10,10 +10,10 @@ define void @v4xi8_concat_vector_insert_idx0(ptr %a, ptr %b, i8 %x) {
; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: vle8.v v9, (a1)
; CHECK-NEXT: vsetvli zero, zero, e8, mf8, tu, ma
; CHECK-NEXT: vmv.s.x v8, a2
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 2
; CHECK-NEXT: vsetvli zero, zero, e8, mf4, tu, ma
; CHECK-NEXT: vmv.s.x v8, a2
; CHECK-NEXT: vse8.v v8, (a0)
; CHECK-NEXT: ret
%v1 = load <2 x i8>, ptr %a
@@ -30,12 +30,10 @@ define void @v4xi8_concat_vector_insert_idx1(ptr %a, ptr %b, i8 %x) {
; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: vle8.v v9, (a1)
; CHECK-NEXT: vmv.s.x v10, a2
; CHECK-NEXT: vslideup.vi v8, v10, 1
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 2
; CHECK-NEXT: vmv.s.x v9, a2
; CHECK-NEXT: vsetivli zero, 2, e8, mf4, tu, ma
; CHECK-NEXT: vslideup.vi v8, v9, 1
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT: vse8.v v8, (a0)
; CHECK-NEXT: ret
%v1 = load <2 x i8>, ptr %a
@@ -50,15 +48,13 @@ define void @v4xi8_concat_vector_insert_idx2(ptr %a, ptr %b, i8 %x) {
; CHECK-LABEL: v4xi8_concat_vector_insert_idx2:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: vle8.v v9, (a1)
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 2
; CHECK-NEXT: vmv.s.x v9, a2
; CHECK-NEXT: vsetivli zero, 3, e8, mf4, tu, ma
; CHECK-NEXT: vslideup.vi v8, v9, 2
; CHECK-NEXT: vle8.v v8, (a1)
; CHECK-NEXT: vle8.v v9, (a0)
; CHECK-NEXT: vsetvli zero, zero, e8, mf8, tu, ma
; CHECK-NEXT: vmv.s.x v8, a2
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT: vse8.v v8, (a0)
; CHECK-NEXT: vslideup.vi v9, v8, 2
; CHECK-NEXT: vse8.v v9, (a0)
; CHECK-NEXT: ret
%v1 = load <2 x i8>, ptr %a
%v2 = load <2 x i8>, ptr %b
@@ -72,13 +68,13 @@ define void @v4xi8_concat_vector_insert_idx3(ptr %a, ptr %b, i8 %x) {
; CHECK-LABEL: v4xi8_concat_vector_insert_idx3:
; CHECK: # %bb.0:
; CHECK-NEXT: vsetivli zero, 2, e8, mf8, ta, ma
; CHECK-NEXT: vle8.v v8, (a0)
; CHECK-NEXT: vle8.v v9, (a1)
; CHECK-NEXT: vle8.v v8, (a1)
; CHECK-NEXT: vle8.v v9, (a0)
; CHECK-NEXT: vmv.s.x v10, a2
; CHECK-NEXT: vslideup.vi v8, v10, 1
; CHECK-NEXT: vsetivli zero, 4, e8, mf4, ta, ma
; CHECK-NEXT: vslideup.vi v8, v9, 2
; CHECK-NEXT: vmv.s.x v9, a2
; CHECK-NEXT: vslideup.vi v8, v9, 3
; CHECK-NEXT: vse8.v v8, (a0)
; CHECK-NEXT: vslideup.vi v9, v8, 2
; CHECK-NEXT: vse8.v v9, (a0)
; CHECK-NEXT: ret
%v1 = load <2 x i8>, ptr %a
%v2 = load <2 x i8>, ptr %b
@@ -94,12 +90,11 @@ define void @v4xi64_concat_vector_insert_idx0(ptr %a, ptr %b, i64 %x) {
; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT: vle64.v v8, (a0)
; RV32-NEXT: vle64.v v10, (a1)
; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; RV32-NEXT: vslideup.vi v8, v10, 2
; RV32-NEXT: vsetivli zero, 2, e32, m1, tu, ma
; RV32-NEXT: vslide1down.vx v8, v8, a2
; RV32-NEXT: vslide1down.vx v8, v8, a3
; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; RV32-NEXT: vslideup.vi v8, v10, 2
; RV32-NEXT: vse64.v v8, (a0)
; RV32-NEXT: ret
;
@@ -108,10 +103,10 @@ define void @v4xi64_concat_vector_insert_idx0(ptr %a, ptr %b, i64 %x) {
; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV64-NEXT: vle64.v v8, (a0)
; RV64-NEXT: vle64.v v10, (a1)
; RV64-NEXT: vsetvli zero, zero, e64, m1, tu, ma
; RV64-NEXT: vmv.s.x v8, a2
; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; RV64-NEXT: vslideup.vi v8, v10, 2
; RV64-NEXT: vsetvli zero, zero, e64, m2, tu, ma
; RV64-NEXT: vmv.s.x v8, a2
; RV64-NEXT: vse64.v v8, (a0)
; RV64-NEXT: ret
%v1 = load <2 x i64>, ptr %a
@@ -128,14 +123,13 @@ define void @v4xi64_concat_vector_insert_idx1(ptr %a, ptr %b, i64 %x) {
; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT: vle64.v v8, (a0)
; RV32-NEXT: vle64.v v10, (a1)
; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; RV32-NEXT: vslideup.vi v8, v10, 2
; RV32-NEXT: vsetivli zero, 2, e32, m1, ta, ma
; RV32-NEXT: vslide1down.vx v10, v8, a2
; RV32-NEXT: vslide1down.vx v10, v10, a3
; RV32-NEXT: vsetivli zero, 2, e64, m1, tu, ma
; RV32-NEXT: vslideup.vi v8, v10, 1
; RV32-NEXT: vslide1down.vx v9, v8, a2
; RV32-NEXT: vslide1down.vx v9, v9, a3
; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT: vslideup.vi v8, v9, 1
; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; RV32-NEXT: vslideup.vi v8, v10, 2
; RV32-NEXT: vse64.v v8, (a0)
; RV32-NEXT: ret
;
@@ -144,12 +138,10 @@ define void @v4xi64_concat_vector_insert_idx1(ptr %a, ptr %b, i64 %x) {
; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV64-NEXT: vle64.v v8, (a0)
; RV64-NEXT: vle64.v v10, (a1)
; RV64-NEXT: vmv.s.x v9, a2
; RV64-NEXT: vslideup.vi v8, v9, 1
; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; RV64-NEXT: vslideup.vi v8, v10, 2
; RV64-NEXT: vmv.s.x v10, a2
; RV64-NEXT: vsetivli zero, 2, e64, m1, tu, ma
; RV64-NEXT: vslideup.vi v8, v10, 1
; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; RV64-NEXT: vse64.v v8, (a0)
; RV64-NEXT: ret
%v1 = load <2 x i64>, ptr %a
@@ -164,31 +156,26 @@ define void @v4xi64_concat_vector_insert_idx2(ptr %a, ptr %b, i64 %x) {
; RV32-LABEL: v4xi64_concat_vector_insert_idx2:
; RV32: # %bb.0:
; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT: vle64.v v8, (a0)
; RV32-NEXT: vle64.v v10, (a1)
; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; RV32-NEXT: vslideup.vi v8, v10, 2
; RV32-NEXT: vsetivli zero, 2, e32, m2, ta, ma
; RV32-NEXT: vslide1down.vx v10, v8, a2
; RV32-NEXT: vslide1down.vx v10, v10, a3
; RV32-NEXT: vsetivli zero, 3, e64, m2, tu, ma
; RV32-NEXT: vslideup.vi v8, v10, 2
; RV32-NEXT: vle64.v v8, (a1)
; RV32-NEXT: vle64.v v10, (a0)
; RV32-NEXT: vsetivli zero, 2, e32, m1, tu, ma
; RV32-NEXT: vslide1down.vx v8, v8, a2
; RV32-NEXT: vslide1down.vx v8, v8, a3
; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; RV32-NEXT: vse64.v v8, (a0)
; RV32-NEXT: vslideup.vi v10, v8, 2
; RV32-NEXT: vse64.v v10, (a0)
; RV32-NEXT: ret
;
; RV64-LABEL: v4xi64_concat_vector_insert_idx2:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV64-NEXT: vle64.v v8, (a0)
; RV64-NEXT: vle64.v v10, (a1)
; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; RV64-NEXT: vslideup.vi v8, v10, 2
; RV64-NEXT: vmv.s.x v10, a2
; RV64-NEXT: vsetivli zero, 3, e64, m2, tu, ma
; RV64-NEXT: vslideup.vi v8, v10, 2
; RV64-NEXT: vle64.v v8, (a1)
; RV64-NEXT: vle64.v v10, (a0)
; RV64-NEXT: vsetvli zero, zero, e64, m1, tu, ma
; RV64-NEXT: vmv.s.x v8, a2
; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; RV64-NEXT: vse64.v v8, (a0)
; RV64-NEXT: vslideup.vi v10, v8, 2
; RV64-NEXT: vse64.v v10, (a0)
; RV64-NEXT: ret
%v1 = load <2 x i64>, ptr %a
%v2 = load <2 x i64>, ptr %b
@@ -204,26 +191,26 @@ define void @v4xi64_concat_vector_insert_idx3(ptr %a, ptr %b, i64 %x) {
; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT: vle64.v v8, (a0)
; RV32-NEXT: vle64.v v10, (a1)
; RV32-NEXT: vsetivli zero, 2, e32, m1, ta, ma
; RV32-NEXT: vslide1down.vx v9, v8, a2
; RV32-NEXT: vslide1down.vx v9, v9, a3
; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV32-NEXT: vslideup.vi v10, v9, 1
; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; RV32-NEXT: vslideup.vi v8, v10, 2
; RV32-NEXT: vsetivli zero, 2, e32, m2, ta, ma
; RV32-NEXT: vslide1down.vx v10, v8, a2
; RV32-NEXT: vslide1down.vx v10, v10, a3
; RV32-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; RV32-NEXT: vslideup.vi v8, v10, 3
; RV32-NEXT: vse64.v v8, (a0)
; RV32-NEXT: ret
;
; RV64-LABEL: v4xi64_concat_vector_insert_idx3:
; RV64: # %bb.0:
; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
; RV64-NEXT: vle64.v v8, (a0)
; RV64-NEXT: vle64.v v10, (a1)
; RV64-NEXT: vle64.v v8, (a1)
; RV64-NEXT: vle64.v v10, (a0)
; RV64-NEXT: vmv.s.x v9, a2
; RV64-NEXT: vslideup.vi v8, v9, 1
; RV64-NEXT: vsetivli zero, 4, e64, m2, ta, ma
; RV64-NEXT: vslideup.vi v8, v10, 2
; RV64-NEXT: vmv.s.x v10, a2
; RV64-NEXT: vslideup.vi v8, v10, 3
; RV64-NEXT: vse64.v v8, (a0)
; RV64-NEXT: vslideup.vi v10, v8, 2
; RV64-NEXT: vse64.v v10, (a0)
; RV64-NEXT: ret
%v1 = load <2 x i64>, ptr %a
%v2 = load <2 x i64>, ptr %b
16 changes: 5 additions & 11 deletions llvm/test/CodeGen/RISCV/rvv/fixed-vectors-buildvec-of-binop.ll
@@ -560,18 +560,8 @@ define <8 x i32> @add_constant_rhs_8xi32_vector_in3(<8 x i32> %vin, i32 %a, i32
define <8 x i32> @add_constant_rhs_8xi32_partial(<8 x i32> %vin, i32 %a, i32 %b, i32 %c, i32 %d) {
; CHECK-LABEL: add_constant_rhs_8xi32_partial:
; CHECK: # %bb.0:
; CHECK-NEXT: lui a4, %hi(.LCPI19_0)
; CHECK-NEXT: addi a4, a4, %lo(.LCPI19_0)
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT: vle32.v v10, (a4)
; CHECK-NEXT: vadd.vv v8, v8, v10
; CHECK-NEXT: addi a0, a0, 23
; CHECK-NEXT: addi a1, a1, 25
; CHECK-NEXT: addi a2, a2, 1
; CHECK-NEXT: addi a3, a3, 2047
; CHECK-NEXT: addi a3, a3, 308
; CHECK-NEXT: vmv.s.x v10, a0
; CHECK-NEXT: vsetivli zero, 5, e32, m2, tu, ma
; CHECK-NEXT: vmv.s.x v10, a0
; CHECK-NEXT: vslideup.vi v8, v10, 4
; CHECK-NEXT: vmv.s.x v10, a1
; CHECK-NEXT: vsetivli zero, 6, e32, m2, tu, ma
@@ -581,7 +571,11 @@ define <8 x i32> @add_constant_rhs_8xi32_partial(<8 x i32> %vin, i32 %a, i32 %b,
; CHECK-NEXT: vslideup.vi v8, v10, 6
; CHECK-NEXT: vmv.s.x v10, a3
; CHECK-NEXT: vsetivli zero, 8, e32, m2, ta, ma
; CHECK-NEXT: lui a0, %hi(.LCPI19_0)
; CHECK-NEXT: addi a0, a0, %lo(.LCPI19_0)
; CHECK-NEXT: vle32.v v12, (a0)
; CHECK-NEXT: vslideup.vi v8, v10, 7
; CHECK-NEXT: vadd.vv v8, v8, v12
; CHECK-NEXT: ret
%vadd = add <8 x i32> %vin, <i32 1, i32 2, i32 3, i32 5, i32 undef, i32 undef, i32 undef, i32 undef>
%e0 = add i32 %a, 23
