From c5672e21ca2a16ff18cdaa83db11d2edb84c5e14 Mon Sep 17 00:00:00 2001
From: Sushant Gokhale
Date: Tue, 24 Sep 2024 14:35:01 +0530
Subject: [PATCH 01/22] [AArch64][CostModel] Reduce the cost of fadd reduction
 with fast flag (#108791)

An fadd reduction lowers to a series of faddp instructions when
1. the fast flag is set, and
2. the number of elements in the input vector is a power of 2.

faddp has latency/throughput identical to fadd, so we set relative
cost = 1 for faddp as well. For example, a fast <8 x float> reduction
legalizes to two v4f32 registers and costs (2 - 1) + log2(4) = 3.

The change showed no regressions with SPEC17-FP (C/C++) and
llvm-test-suite on Neoverse-V2.
---
 .../AArch64/AArch64TargetTransformInfo.cpp    |  20 ++++
 .../Analysis/CostModel/AArch64/reduce-fadd.ll | 104 ++++++++--------
 .../SLPVectorizer/AArch64/reduce-fadd.ll      | 113 ++++++------------
 3 files changed, 107 insertions(+), 130 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 11a4aa4d01e123..da0798ebf79578 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4159,6 +4159,26 @@ AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
   switch (ISD) {
   default:
     break;
+  case ISD::FADD:
+    if (Type *EltTy = ValTy->getScalarType();
+        // FIXME: For half types without fullfp16 support, this could extend and
+        // use a fp32 faddp reduction but current codegen unrolls.
+        MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
+                           (EltTy->isHalfTy() && ST->hasFullFP16()))) {
+      const unsigned NElts = MTy.getVectorNumElements();
+      if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
+          isPowerOf2_32(NElts))
+        // A reduction over a series of fadd instructions is lowered to a
+        // series of faddp instructions. faddp has latency/throughput that
+        // matches the fadd instruction, and hence every faddp instruction
+        // can be considered to have a relative cost = 1 with
+        // CostKind = TCK_RecipThroughput.
+        // An faddp will pairwise add vector elements, so the size of the
+        // input vector reduces by half every time, requiring
+        // #(faddp instructions) = Log2_32(NElts).
+ return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(NElts); + } + break; case ISD::ADD: if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy)) return (LT.first - 1) + Entry->Cost; diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll index 58cb8c2c6a8d81..a95542f6901733 100644 --- a/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll +++ b/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll @@ -76,49 +76,49 @@ define void @fast_fp_reductions() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %fadd_v16f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f32 = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v9f64_reassoc = call reassoc double 
@llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f32 = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %fadd_v4f8 = call reassoc bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR8000, <4 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; FP16-LABEL: 'fast_fp_reductions' -; FP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v2f16_fast = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v2f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f16_fast = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x 
half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f16 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %fadd_v16f16 = call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %fadd_v16f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f32 = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef) -; 
FP16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f16_fast = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f16_fast = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v8f16 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v8f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v16f16 = call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v16f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f32 = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef) ; FP16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %fadd_v4f8 = call reassoc bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR8000, <4 x bfloat> undef) ; FP16-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) ; FP16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void @@ -134,20 +134,20 @@ define void @fast_fp_reductions() { ; BF16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %fadd_v16f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) ; BF16-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef) ; BF16-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f32 = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64_reassoc = 
call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f32 = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef) ; BF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f8 = call reassoc bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR8000, <4 x bfloat> undef) ; BF16-NEXT: Cost Model: Found an estimated cost of 12 for 
instruction: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) ; BF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll index edc0381aa3fcc2..6dceabe1d3243b 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll @@ -1,8 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux \ -; RUN: -mattr=-fullfp16 | FileCheck %s --check-prefixes=CHECK,NOFP16 -; RUN: opt < %s -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux \ -; RUN: -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,FP16 +; RUN: opt < %s -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux -mattr=-fullfp16 | FileCheck %s --check-prefixes=CHECK,NOFP16 +; RUN: opt < %s -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,FULLFP16 define half @reduce_fast_half2(<2 x half> %vec2) { ; CHECK-LABEL: define half @reduce_fast_half2( @@ -79,20 +77,26 @@ entry: } define half @reduce_fast_half8(<8 x half> %vec8) { -; CHECK-LABEL: define half @reduce_fast_half8( -; CHECK-SAME: <8 x half> [[VEC8:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[ELT4:%.*]] = extractelement <8 x half> [[VEC8]], i64 4 -; CHECK-NEXT: [[ELT5:%.*]] = extractelement <8 x half> [[VEC8]], i64 5 -; CHECK-NEXT: [[ELT6:%.*]] = extractelement <8 x half> [[VEC8]], i64 6 -; CHECK-NEXT: [[ELT7:%.*]] = extractelement <8 x half> [[VEC8]], i64 7 -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x half> [[VEC8]], <8 x half> poison, <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[TMP0]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast half [[TMP1]], [[ELT4]] -; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast half [[ELT5]], [[ELT6]] -; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast half [[OP_RDX]], [[OP_RDX1]] -; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast half [[OP_RDX2]], [[ELT7]] -; CHECK-NEXT: ret half [[OP_RDX3]] +; NOFP16-LABEL: define half @reduce_fast_half8( +; NOFP16-SAME: <8 x half> [[VEC8:%.*]]) #[[ATTR0]] { +; NOFP16-NEXT: [[ENTRY:.*:]] +; NOFP16-NEXT: [[ELT4:%.*]] = extractelement <8 x half> [[VEC8]], i64 4 +; NOFP16-NEXT: [[ELT5:%.*]] = extractelement <8 x half> [[VEC8]], i64 5 +; NOFP16-NEXT: [[ELT6:%.*]] = extractelement <8 x half> [[VEC8]], i64 6 +; NOFP16-NEXT: [[ELT7:%.*]] = extractelement <8 x half> [[VEC8]], i64 7 +; NOFP16-NEXT: [[TMP0:%.*]] = shufflevector <8 x half> [[VEC8]], <8 x half> poison, <4 x i32> +; NOFP16-NEXT: [[TMP1:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[TMP0]]) +; NOFP16-NEXT: [[OP_RDX:%.*]] = fadd fast half [[TMP1]], [[ELT4]] +; NOFP16-NEXT: [[OP_RDX1:%.*]] = fadd fast half [[ELT5]], [[ELT6]] +; NOFP16-NEXT: [[OP_RDX2:%.*]] = fadd fast half [[OP_RDX]], [[OP_RDX1]] +; NOFP16-NEXT: [[OP_RDX3:%.*]] = fadd fast half [[OP_RDX2]], [[ELT7]] +; NOFP16-NEXT: ret half [[OP_RDX3]] +; +; FULLFP16-LABEL: define half @reduce_fast_half8( +; FULLFP16-SAME: <8 x half> [[VEC8:%.*]]) #[[ATTR0]] { +; FULLFP16-NEXT: [[ENTRY:.*:]] +; FULLFP16-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> [[VEC8]]) +; FULLFP16-NEXT: ret half [[TMP0]] ; entry: %elt0 = extractelement 
<8 x half> %vec8, i64 0 @@ -154,37 +158,11 @@ entry: } define half @reduce_fast_half16(<16 x half> %vec16) { -; NOFP16-LABEL: define half @reduce_fast_half16( -; NOFP16-SAME: <16 x half> [[VEC16:%.*]]) #[[ATTR0]] { -; NOFP16-NEXT: [[ENTRY:.*:]] -; NOFP16-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> [[VEC16]]) -; NOFP16-NEXT: ret half [[TMP0]] -; -; FP16-LABEL: define half @reduce_fast_half16( -; FP16-SAME: <16 x half> [[VEC16:%.*]]) #[[ATTR0]] { -; FP16-NEXT: [[ENTRY:.*:]] -; FP16-NEXT: [[ELT4:%.*]] = extractelement <16 x half> [[VEC16]], i64 4 -; FP16-NEXT: [[ELT5:%.*]] = extractelement <16 x half> [[VEC16]], i64 5 -; FP16-NEXT: [[ELT6:%.*]] = extractelement <16 x half> [[VEC16]], i64 6 -; FP16-NEXT: [[ELT7:%.*]] = extractelement <16 x half> [[VEC16]], i64 7 -; FP16-NEXT: [[ELT12:%.*]] = extractelement <16 x half> [[VEC16]], i64 12 -; FP16-NEXT: [[ELT13:%.*]] = extractelement <16 x half> [[VEC16]], i64 13 -; FP16-NEXT: [[ELT14:%.*]] = extractelement <16 x half> [[VEC16]], i64 14 -; FP16-NEXT: [[ELT15:%.*]] = extractelement <16 x half> [[VEC16]], i64 15 -; FP16-NEXT: [[TMP0:%.*]] = shufflevector <16 x half> [[VEC16]], <16 x half> poison, <4 x i32> -; FP16-NEXT: [[TMP1:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[TMP0]]) -; FP16-NEXT: [[TMP2:%.*]] = shufflevector <16 x half> [[VEC16]], <16 x half> poison, <4 x i32> -; FP16-NEXT: [[TMP3:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[TMP2]]) -; FP16-NEXT: [[OP_RDX:%.*]] = fadd fast half [[TMP1]], [[TMP3]] -; FP16-NEXT: [[OP_RDX1:%.*]] = fadd fast half [[OP_RDX]], [[ELT4]] -; FP16-NEXT: [[OP_RDX2:%.*]] = fadd fast half [[ELT5]], [[ELT6]] -; FP16-NEXT: [[OP_RDX3:%.*]] = fadd fast half [[ELT7]], [[ELT12]] -; FP16-NEXT: [[OP_RDX4:%.*]] = fadd fast half [[ELT13]], [[ELT14]] -; FP16-NEXT: [[OP_RDX5:%.*]] = fadd fast half [[OP_RDX1]], [[OP_RDX2]] -; FP16-NEXT: [[OP_RDX6:%.*]] = fadd fast half [[OP_RDX3]], [[OP_RDX4]] -; FP16-NEXT: [[OP_RDX7:%.*]] = fadd fast half [[OP_RDX5]], [[OP_RDX6]] -; FP16-NEXT: [[OP_RDX8:%.*]] = fadd fast half [[OP_RDX7]], [[ELT15]] -; FP16-NEXT: ret half [[OP_RDX8]] +; CHECK-LABEL: define half @reduce_fast_half16( +; CHECK-SAME: <16 x half> [[VEC16:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> [[VEC16]]) +; CHECK-NEXT: ret half [[TMP0]] ; entry: %elt0 = extractelement <16 x half> %vec16, i64 0 @@ -512,19 +490,11 @@ define float @reduce_fast_float_case1(ptr %a) { ; CHECK-LABEL: define float @reduce_fast_float_case1( ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[LOAD:%.*]] = load float, ptr [[A]], align 4 -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 4 -; CHECK-NEXT: [[LOAD1:%.*]] = load float, ptr [[GEP]], align 4 -; CHECK-NEXT: [[ADD1:%.*]] = fadd fast float [[LOAD1]], [[LOAD]] -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 8 -; CHECK-NEXT: [[LOAD2:%.*]] = load float, ptr [[GEP2]], align 4 -; CHECK-NEXT: [[ADD2:%.*]] = fadd fast float [[LOAD2]], [[ADD1]] -; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 12 -; CHECK-NEXT: [[LOAD3:%.*]] = load float, ptr [[GEP3]], align 4 -; CHECK-NEXT: [[ADD3:%.*]] = fadd fast float [[LOAD3]], [[ADD2]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 4 ; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 16 ; CHECK-NEXT: 
[[LOAD4:%.*]] = load float, ptr [[GEP4]], align 4
-; CHECK-NEXT:    [[ADD4:%.*]] = fadd fast float [[LOAD4]], [[ADD3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP0]])
+; CHECK-NEXT:    [[ADD4:%.*]] = fadd fast float [[TMP1]], [[LOAD4]]
 ; CHECK-NEXT:    ret float [[ADD4]]
 ;
 entry:
@@ -586,24 +556,11 @@ define float @reduce_fast_float_case2(ptr %a, ptr %b) {
 ; CHECK-LABEL: define float @reduce_fast_float_case2(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[GEPA2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2
-; CHECK-NEXT:    [[GEPA3:%.*]] = getelementptr inbounds float, ptr [[A]], i32 3
-; CHECK-NEXT:    [[GEPB2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2
-; CHECK-NEXT:    [[GEPB3:%.*]] = getelementptr inbounds float, ptr [[B]], i32 3
-; CHECK-NEXT:    [[LOADA2:%.*]] = load float, ptr [[GEPA2]], align 4
-; CHECK-NEXT:    [[LOADA3:%.*]] = load float, ptr [[GEPA3]], align 4
-; CHECK-NEXT:    [[LOADB2:%.*]] = load float, ptr [[GEPB2]], align 4
-; CHECK-NEXT:    [[LOADB3:%.*]] = load float, ptr [[GEPB3]], align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[B]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <2 x float> [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    [[ADD2:%.*]] = fadd fast float [[LOADA3]], [[LOADB2]]
-; CHECK-NEXT:    [[ADD3:%.*]] = fadd fast float [[LOADA2]], [[LOADB3]]
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
-; CHECK-NEXT:    [[RED1:%.*]] = fadd fast float [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[RED2:%.*]] = fadd fast float [[ADD2]], [[RED1]]
-; CHECK-NEXT:    [[RED3:%.*]] = fadd fast float [[ADD3]], [[RED2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> poison, <4 x float> [[TMP1]], i64 0)
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP2]], <4 x float> [[TMP0]], i64 4)
+; CHECK-NEXT:    [[RED3:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP3]])
 ; CHECK-NEXT:    ret float [[RED3]]
 ;
 entry:

From 3659aa8079e00d7bd4f2d9c68c404a93ec297200 Mon Sep 17 00:00:00 2001
From: Pravin Jagtap
Date: Tue, 24 Sep 2024 14:41:45 +0530
Subject: [PATCH 02/22] [AMDGPU] Fix handling of DBG_VALUE_LIST while fixing
 the dead frame indices. (#109685)

Both the SGPR->VGPR and VGPR->AGPR spilling code apply a fixup to the
spill frame indices referenced in debug instructions so that those
instructions can be removed entirely. The stack argument is at operand
index 0 in DBG_VALUE and at operand index 2 in DBG_VALUE_LIST.
Fixes: SWDEV-484156 --- llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 13 +++-- llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 13 +++-- ...ip-processing-stack-arg-dbg-value-list.mir | 53 +++++++++++++++++++ ...ip-processing-stack-arg-dbg-value-list.mir | 52 ++++++++++++++++++ 4 files changed, 123 insertions(+), 8 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value-list.mir create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-spill-fi-skip-processing-stack-arg-dbg-value-list.mir diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 2c67c4aedfe475..50a6f028f66de6 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -1418,10 +1418,15 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( // the debug value instructions. We should instead, update it with the // correct register value. But not sure the register value alone is for (MachineInstr &MI : MBB) { - if (MI.isDebugValue() && MI.getOperand(0).isFI() && - !MFI.isFixedObjectIndex(MI.getOperand(0).getIndex()) && - SpillFIs[MI.getOperand(0).getIndex()]) { - MI.getOperand(0).ChangeToRegister(Register(), false /*isDef*/); + if (MI.isDebugValue()) { + uint32_t StackOperandIdx = MI.isDebugValueList() ? 2 : 0; + if (MI.getOperand(StackOperandIdx).isFI() && + !MFI.isFixedObjectIndex( + MI.getOperand(StackOperandIdx).getIndex()) && + SpillFIs[MI.getOperand(StackOperandIdx).getIndex()]) { + MI.getOperand(StackOperandIdx) + .ChangeToRegister(Register(), false /*isDef*/); + } } } } diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index 28bba8cfd73528..35e5bea9ae16e2 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -418,10 +418,15 @@ bool SILowerSGPRSpills::run(MachineFunction &MF) { // correct register value. But not sure the register value alone is // adequate to lower the DIExpression. It should be worked out later. for (MachineInstr &MI : MBB) { - if (MI.isDebugValue() && MI.getOperand(0).isFI() && - !MFI.isFixedObjectIndex(MI.getOperand(0).getIndex()) && - SpillFIs[MI.getOperand(0).getIndex()]) { - MI.getOperand(0).ChangeToRegister(Register(), false /*isDef*/); + if (MI.isDebugValue()) { + uint32_t StackOperandIdx = MI.isDebugValueList() ? 
2 : 0; + if (MI.getOperand(StackOperandIdx).isFI() && + !MFI.isFixedObjectIndex( + MI.getOperand(StackOperandIdx).getIndex()) && + SpillFIs[MI.getOperand(StackOperandIdx).getIndex()]) { + MI.getOperand(StackOperandIdx) + .ChangeToRegister(Register(), false /*isDef*/); + } } } } diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value-list.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value-list.mir new file mode 100644 index 00000000000000..cdf2b41c1e5b45 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value-list.mir @@ -0,0 +1,53 @@ +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -run-pass=si-lower-sgpr-spills -o - %s | FileCheck %s + +--- | + define amdgpu_kernel void @test() { ret void } + + !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !4, producer: "llvm", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !4) + !1 = !DILocalVariable(name: "a", scope: !2, file: !4, line: 126, type: !6) + !2 = distinct !DISubprogram(name: "test", scope: !4, file: !4, line: 1, type: !3, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !5) + !3 = !DISubroutineType(types: !4) + !4 = !DIFile(filename: "dummy", directory: "/") + !5 = !{!1} + !6 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !7, size: 64, align: 32) + !7 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) + !8 = !DIExpression() + !9 = !DILocation(line: 10, column: 9, scope: !2) + +... +--- +name: test +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 +fixedStack: + - { id: 0, type: default, offset: 4, size: 4, alignment: 4, stack-id: default } +stack: + - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } +machineFunctionInfo: + maxKernArgAlign: 4 + isEntryFunction: true + waveLimiter: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' + hasSpilledSGPRs: true + argumentInfo: + privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } + dispatchPtr: { reg: '$sgpr4_sgpr5' } + kernargSegmentPtr: { reg: '$sgpr6_sgpr7' } + workGroupIDX: { reg: '$sgpr8' } + privateSegmentWaveByteOffset: { reg: '$sgpr9' } +body: | + ; CHECK-LABEL: name: test + ; CHECK: bb.0: + ; CHECK: DBG_VALUE_LIST <{{.*}}>, !DIExpression(), $noreg, 0, debug-location !DILocation(line: 10, column: 9, scope: <{{.*}}>) + + bb.0: + renamable $sgpr10 = IMPLICIT_DEF + SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 + DBG_VALUE_LIST !1, !8, %stack.0, 0, debug-location !9 + + bb.1: + renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 + S_ENDPGM 0 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-fi-skip-processing-stack-arg-dbg-value-list.mir b/llvm/test/CodeGen/AMDGPU/vgpr-spill-fi-skip-processing-stack-arg-dbg-value-list.mir new file mode 100644 index 00000000000000..53629cdfb932b2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-fi-skip-processing-stack-arg-dbg-value-list.mir @@ -0,0 +1,52 @@ +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-vgpr-to-agpr=true -run-pass=prologepilog -o - %s | FileCheck %s + +--- | + define amdgpu_kernel void @test() { ret void } + + !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !4, producer: "llvm", isOptimized: 
true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !4) + !1 = !DILocalVariable(name: "a", scope: !2, file: !4, line: 126, type: !6) + !2 = distinct !DISubprogram(name: "test", scope: !4, file: !4, line: 1, type: !3, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !5) + !3 = !DISubroutineType(types: !4) + !4 = !DIFile(filename: "dummy", directory: "/") + !5 = !{!1} + !6 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !7, size: 64, align: 32) + !7 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) + !8 = !DIExpression() + !9 = !DILocation(line: 10, column: 9, scope: !2) + +... +--- +name: test +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 +fixedStack: + - { id: 0, type: default, offset: 4, size: 4, alignment: 4, stack-id: default } +stack: + - { id: 0, type: spill-slot, size: 4, alignment: 4 } +machineFunctionInfo: + maxKernArgAlign: 4 + isEntryFunction: true + waveLimiter: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' + hasSpilledVGPRs: true + argumentInfo: + privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } + dispatchPtr: { reg: '$sgpr4_sgpr5' } + kernargSegmentPtr: { reg: '$sgpr6_sgpr7' } + workGroupIDX: { reg: '$sgpr8' } + privateSegmentWaveByteOffset: { reg: '$sgpr9' } +body: | + ; CHECK-LABEL: name: test + ; CHECK: bb.0: + ; CHECK: DBG_VALUE_LIST <{{.*}}>, !DIExpression(), $noreg, 0, debug-location !DILocation(line: 10, column: 9, scope: <{{.*}}>) + bb.0: + $vgpr2 = IMPLICIT_DEF + SI_SPILL_V32_SAVE $vgpr2, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, align 4, addrspace 5) + DBG_VALUE_LIST !1, !8, %stack.0, 0, debug-location !9 + + bb.1: + renamable $vgpr2 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, align 4, addrspace 5) + S_ENDPGM 0 From 30dbbdd2ea25e3ab5596e1fb0474696b242a760c Mon Sep 17 00:00:00 2001 From: Dmitry Chernenkov Date: Tue, 24 Sep 2024 09:17:24 +0000 Subject: [PATCH 03/22] [Bazel] Fix for 127349fcba81646389e4b8202b35405a5fdbef47 --- .../libc/test/src/math/libc_math_test_rules.bzl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl b/utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl index d788705fc3e604..16845ab66dfd45 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl +++ b/utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl @@ -28,7 +28,9 @@ def math_test(name, hdrs = [], deps = [], **kwargs): "//libc:__support_cpp_algorithm", "//libc:__support_cpp_bit", "//libc:__support_cpp_limits", + "//libc:__support_cpp_type_traits", "//libc:__support_fputil_basic_operations", + "//libc:__support_fputil_cast", "//libc:__support_fputil_fenv_impl", "//libc:__support_fputil_fp_bits", "//libc:__support_fputil_manipulation_functions", @@ -36,6 +38,7 @@ def math_test(name, hdrs = [], deps = [], **kwargs): "//libc:__support_fputil_normal_float", "//libc:__support_macros_properties_architectures", "//libc:__support_macros_properties_os", + "//libc:__support_macros_properties_types", "//libc:__support_math_extras", "//libc:__support_uint128", "//libc:hdr_errno_macros", From cc7b24a4d125e9a81480aaaa961a2b963bbb2ea2 Mon Sep 17 00:00:00 2001 From: Piotr Fusik Date: Tue, 24 Sep 2024 11:19:56 +0200 Subject: [PATCH 04/22] [NFC] Fix typos in comments (#109765) 
--- llvm/include/llvm/CodeGen/ISDOpcodes.h | 2 +- llvm/lib/Analysis/VectorUtils.cpp | 2 +- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 2 +- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 +- llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index 18ed60ebb124dc..da43f5be10ff3b 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -825,7 +825,7 @@ enum NodeType { /// be saturated against signed values, resulting in `S`, which will combine /// to `TRUNCATE_SSAT_S`. If the value of C ranges from `0 to 255`, it will /// be saturated against unsigned values, resulting in `U`, which will - /// combine to `TRUNATE_SSAT_U`. Similarly, in `truncate(umin(x, C))`, if + /// combine to `TRUNCATE_SSAT_U`. Similarly, in `truncate(umin(x, C))`, if /// value of C ranges from `0 to 255`, it becomes `U` because it is saturated /// for unsigned values. As a result, it combines to `TRUNCATE_USAT_U`. TRUNCATE_SSAT_S, // saturate signed input to signed result - diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index d45d3bbefe4fd3..dbffbb8a5f81d9 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -1454,7 +1454,7 @@ void InterleavedAccessInfo::analyzeInterleaving( // that all the pointers in the group don't wrap. // So we check only group member 0 (which is always guaranteed to exist), // and group member Factor - 1; If the latter doesn't exist we rely on - // peeling (if it is a non-reversed accsess -- see Case 3). + // peeling (if it is a non-reversed access -- see Case 3). if (InvalidateGroupIfMemberMayWrap(Group, 0, std::string("first"))) continue; if (Group->getMember(Group->getFactor() - 1)) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index fcd46b5921c4de..05ba18bf8ebd88 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -2725,7 +2725,7 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base, } /// Similar to SelectAddrRegImm, except that the least significant 5 bits of -/// Offset shoule be all zeros. +/// Offset should be all zeros. 
bool RISCVDAGToDAGISel::SelectAddrRegImmLsb00000(SDValue Addr, SDValue &Base, SDValue &Offset) { if (SelectAddrFrameIndex(Addr, Base, Offset)) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 2aa89aca4c808d..b998a1eb11c300 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -4540,7 +4540,7 @@ static SDValue lowerScalarInsert(SDValue Scalar, SDValue VL, MVT VT, // t33: v8i8 = extract_subvector t11, Constant:i64<8> // a) t35: v8i8 = vector_shuffle<0,2,4,6,8,10,12,14> t34, t33 // b) t35: v8i8 = vector_shuffle<1,3,5,7,9,11,13,15> t34, t33 -// Returns {Src Vector, Even Elements} om success +// Returns {Src Vector, Even Elements} on success static bool isDeinterleaveShuffle(MVT VT, MVT ContainerVT, SDValue V1, SDValue V2, ArrayRef Mask, const RISCVSubtarget &Subtarget) { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index fe7de9d7bc79aa..68182d238e7847 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -232,7 +232,7 @@ class octuple_to_str { def VLOpFrag : PatFrag<(ops), (XLenVT (VLOp (XLenVT AVL:$vl)))>; // Output pattern for X0 used to represent VLMAX in the pseudo instructions. -// We can't use X0 register becuase the AVL operands use GPRNoX0. +// We can't use X0 register because the AVL operands use GPRNoX0. // This must be kept in sync with RISCV::VLMaxSentinel. def VLMax : OutPatFrag<(ops), (XLenVT -1)>; From bfd8f7ee4a85ae8873db14fa6e7e31223a1df169 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 24 Sep 2024 10:37:02 +0100 Subject: [PATCH 05/22] [X86] SimplifyDemandedVectorEltsForTargetNode - reduce vector width of FRSQRT/FRCP ymm nodes. If we only demand the lower subvector of a FRSQRT/FRCP node, then reduce the width of the instruction. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 2 ++ llvm/test/CodeGen/X86/extractelement-fp.ll | 10 ++++------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index de8cfe31a5529f..d7a26dc4caec6c 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -43109,6 +43109,8 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( case X86ISD::FMIN: case X86ISD::FMAXC: case X86ISD::FMINC: + case X86ISD::FRSQRT: + case X86ISD::FRCP: // Horizontal Ops. 
case X86ISD::HADD: case X86ISD::HSUB: diff --git a/llvm/test/CodeGen/X86/extractelement-fp.ll b/llvm/test/CodeGen/X86/extractelement-fp.ll index 38162f676e7ee3..944f6bbfd0bfbe 100644 --- a/llvm/test/CodeGen/X86/extractelement-fp.ll +++ b/llvm/test/CodeGen/X86/extractelement-fp.ll @@ -1310,15 +1310,14 @@ define float @rcp_v4f32(<4 x float> %x) nounwind { define float @rcp_v8f32(<8 x float> %x) nounwind { ; X64-LABEL: rcp_v8f32: ; X64: # %bb.0: -; X64-NEXT: vrcpps %ymm0, %ymm0 -; X64-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; X64-NEXT: vrcpss %xmm0, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq ; ; X86-LABEL: rcp_v8f32: ; X86: # %bb.0: ; X86-NEXT: pushl %eax -; X86-NEXT: vrcpps %ymm0, %ymm0 +; X86-NEXT: vrcpss %xmm0, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) ; X86-NEXT: popl %eax @@ -1351,15 +1350,14 @@ define float @rsqrt_v4f32(<4 x float> %x) nounwind { define float @rsqrt_v8f32(<8 x float> %x) nounwind { ; X64-LABEL: rsqrt_v8f32: ; X64: # %bb.0: -; X64-NEXT: vrsqrtps %ymm0, %ymm0 -; X64-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; X64-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq ; ; X86-LABEL: rsqrt_v8f32: ; X86: # %bb.0: ; X86-NEXT: pushl %eax -; X86-NEXT: vrsqrtps %ymm0, %ymm0 +; X86-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) ; X86-NEXT: popl %eax From 396f6775143ffa80b9f0e72e7250613092d88124 Mon Sep 17 00:00:00 2001 From: Scott Egerton <9487234+ScottEgerton@users.noreply.github.com> Date: Tue, 24 Sep 2024 10:58:00 +0100 Subject: [PATCH 06/22] [AMDGPU] Remove unused VGPRSingleUseHintInsts feature (#109769) --- llvm/docs/AMDGPUUsage.rst | 4 +- llvm/lib/Target/AMDGPU/AMDGPU.h | 3 - llvm/lib/Target/AMDGPU/AMDGPU.td | 13 - .../AMDGPU/AMDGPUInsertSingleUseVDST.cpp | 245 --- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 10 - llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 - llvm/lib/Target/AMDGPU/GCNSubtarget.h | 3 - llvm/lib/Target/AMDGPU/SIInstrInfo.td | 2 - llvm/lib/Target/AMDGPU/SOPInstructions.td | 11 - .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 18 - llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 2 + llvm/lib/Target/AMDGPU/VOP1Instructions.td | 18 +- llvm/lib/Target/AMDGPU/VOP2Instructions.td | 6 +- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 35 +- llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 12 +- llvm/lib/Target/AMDGPU/VOPCInstructions.td | 13 +- llvm/lib/Target/AMDGPU/VOPInstructions.td | 20 +- .../CodeGen/AMDGPU/insert-singleuse-vdst.mir | 1420 ----------------- llvm/test/MC/AMDGPU/gfx1150_asm_sopp.s | 10 - llvm/test/MC/AMDGPU/gfx11_unsupported.s | 3 - llvm/test/MC/AMDGPU/gfx12_asm_sopp.s | 9 - .../MC/Disassembler/AMDGPU/decode-err.txt | 5 - .../Disassembler/AMDGPU/gfx1150_dasm_sopp.txt | 10 - .../Disassembler/AMDGPU/gfx12_dasm_sopp.txt | 8 - .../secondary/llvm/lib/Target/AMDGPU/BUILD.gn | 1 - 25 files changed, 34 insertions(+), 1848 deletions(-) delete mode 100644 llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp delete mode 100644 llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir delete mode 100644 llvm/test/MC/AMDGPU/gfx1150_asm_sopp.s delete mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx1150_dasm_sopp.txt diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 4b48b54b18bb99..9e11b13c101d47 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -611,9 +611,7 @@ Generic processor code objects are versioned. 
See :ref:`amdgpu-generic-processor - ``gfx1152`` SALU floating point instructions - and single-use VGPR hint - instructions are not available - on: + are not available on: - ``gfx1150`` - ``gfx1151`` diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index b2dd354e496a2e..4abb5a63ab6d2c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -405,9 +405,6 @@ extern char &SIModeRegisterID; void initializeAMDGPUInsertDelayAluPass(PassRegistry &); extern char &AMDGPUInsertDelayAluID; -void initializeAMDGPUInsertSingleUseVDSTPass(PassRegistry &); -extern char &AMDGPUInsertSingleUseVDSTID; - void initializeSIInsertHardClausesPass(PassRegistry &); extern char &SIInsertHardClausesID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 919e698e76b33b..3626fd8bc78c15 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -929,12 +929,6 @@ def FeatureSALUFloatInsts : SubtargetFeature<"salu-float", "Has SALU floating point instructions" >; -def FeatureVGPRSingleUseHintInsts : SubtargetFeature<"vgpr-singleuse-hint", - "HasVGPRSingleUseHintInsts", - "true", - "Has single-use VGPR hint instructions" ->; - def FeaturePseudoScalarTrans : SubtargetFeature<"pseudo-scalar-trans", "HasPseudoScalarTrans", "true", @@ -1615,14 +1609,12 @@ def FeatureISAVersion11_5_0 : FeatureSet< !listconcat(FeatureISAVersion11_Common.Features, [FeatureSALUFloatInsts, FeatureDPPSrc1SGPR, - FeatureVGPRSingleUseHintInsts, FeatureRequiredExportPriority])>; def FeatureISAVersion11_5_1 : FeatureSet< !listconcat(FeatureISAVersion11_Common.Features, [FeatureSALUFloatInsts, FeatureDPPSrc1SGPR, - FeatureVGPRSingleUseHintInsts, Feature1_5xVGPRs, FeatureRequiredExportPriority])>; @@ -1630,7 +1622,6 @@ def FeatureISAVersion11_5_2 : FeatureSet< !listconcat(FeatureISAVersion11_Common.Features, [FeatureSALUFloatInsts, FeatureDPPSrc1SGPR, - FeatureVGPRSingleUseHintInsts, FeatureRequiredExportPriority])>; def FeatureISAVersion12 : FeatureSet< @@ -1663,7 +1654,6 @@ def FeatureISAVersion12 : FeatureSet< FeatureSALUFloatInsts, FeaturePseudoScalarTrans, FeatureHasRestrictedSOffset, - FeatureVGPRSingleUseHintInsts, FeatureScalarDwordx3Loads, FeatureDPPSrc1SGPR, FeatureMaxHardClauseLength32, @@ -2271,9 +2261,6 @@ def HasNotMADIntraFwdBug : Predicate<"!Subtarget->hasMADIntraFwdBug()">; def HasSALUFloatInsts : Predicate<"Subtarget->hasSALUFloatInsts()">, AssemblerPredicate<(all_of FeatureSALUFloatInsts)>; -def HasVGPRSingleUseHintInsts : Predicate<"Subtarget->hasVGPRSingleUseHintInsts()">, - AssemblerPredicate<(all_of FeatureVGPRSingleUseHintInsts)>; - def HasPseudoScalarTrans : Predicate<"Subtarget->hasPseudoScalarTrans()">, AssemblerPredicate<(all_of FeaturePseudoScalarTrans)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp deleted file mode 100644 index 43b3bf43fe56db..00000000000000 --- a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp +++ /dev/null @@ -1,245 +0,0 @@ -//===- AMDGPUInsertSingleUseVDST.cpp - Insert s_singleuse_vdst instructions ==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file -/// Insert s_singleuse_vdst instructions on GFX11.5+ to mark regions of VALU -/// instructions that produce single-use VGPR values. If the value is forwarded -/// to the consumer instruction prior to VGPR writeback, the hardware can -/// then skip (kill) the VGPR write. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUGenSearchableTables.inc" -#include "GCNSubtarget.h" -#include "SIInstrInfo.h" -#include "SIRegisterInfo.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineOperand.h" -#include "llvm/CodeGen/Register.h" -#include "llvm/IR/DebugLoc.h" -#include "llvm/MC/MCRegister.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/Pass.h" -#include - -using namespace llvm; - -#define DEBUG_TYPE "amdgpu-insert-single-use-vdst" - -namespace { -class AMDGPUInsertSingleUseVDST : public MachineFunctionPass { -private: - const SIInstrInfo *SII; - class SingleUseInstruction { - private: - static const unsigned MaxSkipRange = 0b111; - static const unsigned MaxNumberOfSkipRegions = 2; - - unsigned LastEncodedPositionEnd; - MachineInstr *ProducerInstr; - - std::array SingleUseRegions; - SmallVector SkipRegions; - - // Adds a skip region into the instruction. - void skip(const unsigned ProducerPosition) { - while (LastEncodedPositionEnd + MaxSkipRange < ProducerPosition) { - SkipRegions.push_back(MaxSkipRange); - LastEncodedPositionEnd += MaxSkipRange; - } - SkipRegions.push_back(ProducerPosition - LastEncodedPositionEnd); - LastEncodedPositionEnd = ProducerPosition; - } - - bool currentRegionHasSpace() { - const auto Region = SkipRegions.size(); - // The first region has an extra bit of encoding space. - return SingleUseRegions[Region] < - ((Region == MaxNumberOfSkipRegions) ? 0b1111U : 0b111U); - } - - unsigned encodeImm() { - // Handle the first Single Use Region separately as it has an extra bit - // of encoding space. - unsigned Imm = SingleUseRegions[SkipRegions.size()]; - unsigned ShiftAmount = 4; - for (unsigned i = SkipRegions.size(); i > 0; i--) { - Imm |= SkipRegions[i - 1] << ShiftAmount; - ShiftAmount += 3; - Imm |= SingleUseRegions[i - 1] << ShiftAmount; - ShiftAmount += 3; - } - return Imm; - } - - public: - SingleUseInstruction(const unsigned ProducerPosition, - MachineInstr *Producer) - : LastEncodedPositionEnd(ProducerPosition + 1), ProducerInstr(Producer), - SingleUseRegions({1, 0, 0}) {} - - // Returns false if adding a new single use producer failed. This happens - // because it could not be encoded, either because there is no room to - // encode another single use producer region or that this single use - // producer is too far away to encode the amount of instructions to skip. - bool tryAddProducer(const unsigned ProducerPosition, MachineInstr *MI) { - // Producer is too far away to encode into this instruction or another - // skip region is needed and SkipRegions.size() = 2 so there's no room for - // another skip region, therefore a new instruction is needed. 
-      if (LastEncodedPositionEnd +
-              (MaxSkipRange * (MaxNumberOfSkipRegions - SkipRegions.size())) <
-          ProducerPosition)
-        return false;
-
-      // If a skip region is needed.
-      if (LastEncodedPositionEnd != ProducerPosition ||
-          !currentRegionHasSpace()) {
-        // If the current region is out of space therefore a skip region would
-        // be needed, but there is no room for another skip region.
-        if (SkipRegions.size() == MaxNumberOfSkipRegions)
-          return false;
-        skip(ProducerPosition);
-      }
-
-      SingleUseRegions[SkipRegions.size()]++;
-      LastEncodedPositionEnd = ProducerPosition + 1;
-      ProducerInstr = MI;
-      return true;
-    }
-
-    auto emit(const SIInstrInfo *SII) {
-      return BuildMI(*ProducerInstr->getParent(), ProducerInstr, DebugLoc(),
-                     SII->get(AMDGPU::S_SINGLEUSE_VDST))
-          .addImm(encodeImm());
-    }
-  };
-
-public:
-  static char ID;
-
-  AMDGPUInsertSingleUseVDST() : MachineFunctionPass(ID) {}
-
-  void insertSingleUseInstructions(
-      ArrayRef<std::pair<unsigned, MachineInstr *>> SingleUseProducers) const {
-    SmallVector<SingleUseInstruction> Instructions;
-
-    for (auto &[Position, MI] : SingleUseProducers) {
-      // Encode this position into the last single use instruction if possible.
-      if (Instructions.empty() ||
-          !Instructions.back().tryAddProducer(Position, MI)) {
-        // If not, add a new instruction.
-        Instructions.push_back(SingleUseInstruction(Position, MI));
-      }
-    }
-
-    for (auto &Instruction : Instructions)
-      Instruction.emit(SII);
-  }
-
-  bool runOnMachineFunction(MachineFunction &MF) override {
-    const auto &ST = MF.getSubtarget<GCNSubtarget>();
-    if (!ST.hasVGPRSingleUseHintInsts())
-      return false;
-
-    SII = ST.getInstrInfo();
-    const auto *TRI = &SII->getRegisterInfo();
-    bool InstructionEmitted = false;
-
-    for (MachineBasicBlock &MBB : MF) {
-      DenseMap<MCRegUnit, unsigned> RegisterUseCount;
-
-      // Handle boundaries at the end of basic block separately to avoid
-      // false positives. If they are live at the end of a basic block then
-      // assume it has more uses later on.
-      for (const auto &Liveout : MBB.liveouts()) {
-        for (MCRegUnitMaskIterator Units(Liveout.PhysReg, TRI); Units.isValid();
-             ++Units) {
-          const auto [Unit, Mask] = *Units;
-          if ((Mask & Liveout.LaneMask).any())
-            RegisterUseCount[Unit] = 2;
-        }
-      }
-
-      SmallVector<std::pair<unsigned, MachineInstr *>>
-          SingleUseProducerPositions;
-
-      unsigned VALUInstrCount = 0;
-      for (MachineInstr &MI : reverse(MBB.instrs())) {
-        // All registers in all operands need to be single use for an
-        // instruction to be marked as a single use producer.
-        bool AllProducerOperandsAreSingleUse = true;
-
-        // Gather a list of Registers used before updating use counts to avoid
-        // double counting registers that appear multiple times in a single
-        // MachineInstr.
-        SmallVector<MCRegUnit> RegistersUsed;
-
-        for (const auto &Operand : MI.all_defs()) {
-          const auto Reg = Operand.getReg();
-
-          const auto RegUnits = TRI->regunits(Reg);
-          if (any_of(RegUnits, [&RegisterUseCount](const MCRegUnit Unit) {
-                return RegisterUseCount[Unit] > 1;
-              }))
-            AllProducerOperandsAreSingleUse = false;
-
-          // Reset uses count when a register is no longer live.
-          for (const MCRegUnit Unit : RegUnits)
-            RegisterUseCount.erase(Unit);
-        }
-
-        for (const auto &Operand : MI.all_uses()) {
-          const auto Reg = Operand.getReg();
-
-          // Count the number of times each register is read.
-          for (const MCRegUnit Unit : TRI->regunits(Reg)) {
-            if (!is_contained(RegistersUsed, Unit))
-              RegistersUsed.push_back(Unit);
-          }
-        }
-        for (const MCRegUnit Unit : RegistersUsed)
-          RegisterUseCount[Unit]++;
-
-        // Do not attempt to optimise across exec mask changes.
-        if (MI.modifiesRegister(AMDGPU::EXEC, TRI) ||
-            AMDGPU::isInvalidSingleUseConsumerInst(MI.getOpcode())) {
-          for (auto &UsedReg : RegisterUseCount)
-            UsedReg.second = 2;
-        }
-
-        if (!SIInstrInfo::isVALU(MI) ||
-            AMDGPU::isInvalidSingleUseProducerInst(MI.getOpcode()))
-          continue;
-        if (AllProducerOperandsAreSingleUse) {
-          SingleUseProducerPositions.push_back({VALUInstrCount, &MI});
-          InstructionEmitted = true;
-        }
-        VALUInstrCount++;
-      }
-      insertSingleUseInstructions(SingleUseProducerPositions);
-    }
-    return InstructionEmitted;
-  }
-};
-} // namespace
-
-char AMDGPUInsertSingleUseVDST::ID = 0;
-
-char &llvm::AMDGPUInsertSingleUseVDSTID = AMDGPUInsertSingleUseVDST::ID;
-
-INITIALIZE_PASS(AMDGPUInsertSingleUseVDST, DEBUG_TYPE,
-                "AMDGPU Insert SingleUseVDST", false, false)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 04fdee0819b502..abd50748f2cc05 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -311,12 +311,6 @@ static cl::opt<bool> EnableSIModeRegisterPass(
   cl::init(true),
   cl::Hidden);
 
-// Enable GFX11.5+ s_singleuse_vdst insertion
-static cl::opt<bool>
-    EnableInsertSingleUseVDST("amdgpu-enable-single-use-vdst",
-                              cl::desc("Enable s_singleuse_vdst insertion"),
-                              cl::init(false), cl::Hidden);
-
 // Enable GFX11+ s_delay_alu insertion
 static cl::opt<bool>
     EnableInsertDelayAlu("amdgpu-enable-delay-alu",
@@ -450,7 +444,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
   initializeAMDGPUUnifyMetadataPass(*PR);
   initializeSIAnnotateControlFlowLegacyPass(*PR);
-  initializeAMDGPUInsertSingleUseVDSTPass(*PR);
   initializeAMDGPUInsertDelayAluPass(*PR);
   initializeSIInsertHardClausesPass(*PR);
   initializeSIInsertWaitcntsPass(*PR);
@@ -1518,9 +1511,6 @@ void GCNPassConfig::addPreEmitPass() {
   // cases.
addPass(&PostRAHazardRecognizerID); - if (isPassEnabled(EnableInsertSingleUseVDST, CodeGenOptLevel::Less)) - addPass(&AMDGPUInsertSingleUseVDSTID); - if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less)) addPass(&AMDGPUInsertDelayAluID); diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index e813653158e5d9..7c883cc2017ddd 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -81,7 +81,6 @@ add_llvm_target(AMDGPUCodeGen AMDGPUMCInstLower.cpp AMDGPUMemoryUtils.cpp AMDGPUIGroupLP.cpp - AMDGPUInsertSingleUseVDST.cpp AMDGPUMarkLastScratchLoad.cpp AMDGPUMIRFormatter.cpp AMDGPUOpenCLEnqueuedBlockLowering.cpp diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index a4ae8a1be32258..e6b7342d5fffcf 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -215,7 +215,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasPackedTID = false; bool ScalarizeGlobal = false; bool HasSALUFloatInsts = false; - bool HasVGPRSingleUseHintInsts = false; bool HasPseudoScalarTrans = false; bool HasRestrictedSOffset = false; @@ -1280,8 +1279,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasSALUFloatInsts() const { return HasSALUFloatInsts; } - bool hasVGPRSingleUseHintInsts() const { return HasVGPRSingleUseHintInsts; } - bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; } bool hasRestrictedSOffset() const { return HasRestrictedSOffset; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index c016be2fc6c0fb..087ca1f954464d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2409,8 +2409,6 @@ class VOPProfile _ArgVT, bit _EnableClamp = 0> { field bit EnableClamp = _EnableClamp; field bit IsTrue16 = 0; field bit IsRealTrue16 = 0; - field bit IsInvalidSingleUseConsumer = 0; - field bit IsInvalidSingleUseProducer = 0; field ValueType DstVT = ArgVT[0]; field ValueType Src0VT = ArgVT[1]; diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 2e73a1a15f6b32..9da27a7c7ee7d6 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1752,11 +1752,6 @@ let OtherPredicates = [HasExportInsts] in "$simm16">; } // End SubtargetPredicate = isGFX11Plus -let SubtargetPredicate = HasVGPRSingleUseHintInsts in { - def S_SINGLEUSE_VDST : - SOPP_Pseudo<"s_singleuse_vdst", (ins s16imm:$simm16), "$simm16">; -} // End SubtargetPredicate = HasVGPRSingeUseHintInsts - let SubtargetPredicate = isGFX12Plus, hasSideEffects = 1 in { def S_WAIT_LOADCNT : SOPP_Pseudo<"s_wait_loadcnt", (ins s16imm:$simm16), "$simm16", @@ -2676,12 +2671,6 @@ defm S_ICACHE_INV : SOPP_Real_32_gfx11_gfx12<0x03c>; defm S_BARRIER : SOPP_Real_32_gfx11<0x03d>; -//===----------------------------------------------------------------------===// -// SOPP - GFX1150, GFX12. 
-//===----------------------------------------------------------------------===// - -defm S_SINGLEUSE_VDST : SOPP_Real_32_gfx11_gfx12<0x013>; - //===----------------------------------------------------------------------===// // SOPP - GFX6, GFX7, GFX8, GFX9, GFX10 //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 8b5ec8793d84a2..f32c82f1e4ba4c 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -379,12 +379,6 @@ struct VOPTrue16Info { bool IsTrue16; }; -struct SingleUseExceptionInfo { - uint16_t Opcode; - bool IsInvalidSingleUseConsumer; - bool IsInvalidSingleUseProducer; -}; - struct FP8DstByteSelInfo { uint16_t Opcode; bool HasFP8DstByteSel; @@ -396,8 +390,6 @@ struct FP8DstByteSelInfo { #define GET_MTBUFInfoTable_IMPL #define GET_MUBUFInfoTable_DECL #define GET_MUBUFInfoTable_IMPL -#define GET_SingleUseExceptionTable_DECL -#define GET_SingleUseExceptionTable_IMPL #define GET_SMInfoTable_DECL #define GET_SMInfoTable_IMPL #define GET_VOP1InfoTable_DECL @@ -626,16 +618,6 @@ bool isTrue16Inst(unsigned Opc) { return Info ? Info->IsTrue16 : false; } -bool isInvalidSingleUseConsumerInst(unsigned Opc) { - const SingleUseExceptionInfo *Info = getSingleUseExceptionHelper(Opc); - return Info && Info->IsInvalidSingleUseConsumer; -} - -bool isInvalidSingleUseProducerInst(unsigned Opc) { - const SingleUseExceptionInfo *Info = getSingleUseExceptionHelper(Opc); - return Info && Info->IsInvalidSingleUseProducer; -} - bool isFP8DstSelInst(unsigned Opc) { const FP8DstByteSelInfo *Info = getFP8DstByteSelHelper(Opc); return Info ? Info->HasFP8DstByteSel : false; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 35c080d8e0bebc..da37534f2fa4ff 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -870,6 +870,8 @@ bool isInvalidSingleUseConsumerInst(unsigned Opc); LLVM_READONLY bool isInvalidSingleUseProducerInst(unsigned Opc); +bool isDPMACCInstruction(unsigned Opc); + LLVM_READONLY unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc); diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 33f2f9f1f5c5b9..bd805059705783 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -252,7 +252,6 @@ def VOP_READFIRSTLANE : VOPProfile <[i32, i32, untyped, untyped]> { def V_READFIRSTLANE_B32 : VOP1_Pseudo <"v_readfirstlane_b32", VOP_READFIRSTLANE, [], 1> { let isConvergent = 1; - let IsInvalidSingleUseConsumer = 1; } foreach vt = Reg32Types.types in { @@ -375,7 +374,6 @@ defm V_CLREXCP : VOP1Inst <"v_clrexcp", VOP_NO_EXT>; def VOP_MOVRELS : VOPProfile<[i32, i32, untyped, untyped]> { let Src0RC32 = VRegSrc_32; let Src0RC64 = VRegSrc_32; - let IsInvalidSingleUseConsumer = 1; } // Special case because there are no true output operands. 
Hack vdst @@ -419,12 +417,8 @@ class VOP_MOVREL : VOPProfile<[untyped, i32, untyped, un let EmitDst = 1; // force vdst emission } -let IsInvalidSingleUseProducer = 1 in { - def VOP_MOVRELD : VOP_MOVREL; - def VOP_MOVRELSD : VOP_MOVREL { - let IsInvalidSingleUseConsumer = 1; - } -} +def VOP_MOVRELD : VOP_MOVREL; +def VOP_MOVRELSD : VOP_MOVREL; let SubtargetPredicate = HasMovrel, Uses = [M0, EXEC] in { // v_movreld_b32 is a special case because the destination output @@ -541,7 +535,6 @@ let SubtargetPredicate = isGFX9Plus in { let Constraints = "$vdst = $src1, $vdst1 = $src0"; let DisableEncoding = "$vdst1,$src1"; let SchedRW = [Write64Bit, Write64Bit]; - let IsInvalidSingleUseConsumer = 1; } let isReMaterializable = 1 in @@ -708,8 +701,6 @@ let SubtargetPredicate = isGFX10Plus in { let Constraints = "$vdst = $src1, $vdst1 = $src0"; let DisableEncoding = "$vdst1,$src1"; let SchedRW = [Write64Bit, Write64Bit]; - let IsInvalidSingleUseConsumer = 1; - let IsInvalidSingleUseProducer = 1; } } // End Uses = [M0] } // End SubtargetPredicate = isGFX10Plus @@ -743,10 +734,7 @@ let SubtargetPredicate = isGFX11Plus in { } // Restrict src0 to be VGPR def V_PERMLANE64_B32 : VOP1_Pseudo<"v_permlane64_b32", VOP_MOVRELS, - [], /*VOP1Only=*/ 1> { - let IsInvalidSingleUseConsumer = 1; - let IsInvalidSingleUseProducer = 1; - } + [], /*VOP1Only=*/ 1>; defm V_MOV_B16 : VOP1Inst_t16<"v_mov_b16", VOP_I16_I16>; defm V_NOT_B16 : VOP1Inst_t16<"v_not_b16", VOP_I16_I16>; defm V_CVT_I32_I16 : VOP1Inst_t16<"v_cvt_i32_i16", VOP_I32_I16>; diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index dd48607402eb0b..52f7be3b4577df 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -788,12 +788,10 @@ defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32_ARITH, null_frag, } // End isCommutable = 1 // These are special and do not read the exec mask. 
-let isConvergent = 1, Uses = [], IsInvalidSingleUseConsumer = 1 in { +let isConvergent = 1, Uses = [] in { def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE, []>; let IsNeverUniform = 1, Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in { -def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, []> { - let IsInvalidSingleUseProducer = 1; - } +def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, []>; } // End IsNeverUniform, $vdst = $vdst_in, DisableEncoding $vdst_in } // End isConvergent = 1 diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 466114b95f9f90..20beb41b7b58bb 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -157,12 +157,12 @@ defm V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile, fmaxnum_l } // End SubtargetPredicate = isNotGFX12Plus } // End SchedRW = [WriteDoubleAdd] -let SchedRW = [WriteIntMul], IsInvalidSingleUseConsumer = 1 in { +let SchedRW = [WriteIntMul] in { defm V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", V_MUL_PROF, DivergentBinFrag>; defm V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", V_MUL_PROF, mulhu>; defm V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", V_MUL_PROF>; defm V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", V_MUL_PROF, mulhs>; -} // End SchedRW = [WriteIntMul], IsInvalidSingleUseConsumer = 1 +} // End SchedRW = [WriteIntMul] let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in { defm V_MINIMUM_F32 : VOP3Inst <"v_minimum_f32", VOP3_Profile, DivergentBinFrag>; @@ -260,9 +260,9 @@ let mayRaiseFPException = 0 in { // Seems suspicious but manual doesn't say it d let isReMaterializable = 1 in defm V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile>; -let Constraints = "@earlyclobber $vdst", IsInvalidSingleUseConsumer = 1 in { +let Constraints = "@earlyclobber $vdst" in { defm V_MQSAD_PK_U16_U8 : VOP3Inst <"v_mqsad_pk_u16_u8", VOP3_Profile>; -} // End Constraints = "@earlyclobber $vdst", IsInvalidSingleUseConsumer = 1 +} // End Constraints = "@earlyclobber $vdst" let isReMaterializable = 1 in { @@ -277,16 +277,14 @@ let SchedRW = [Write64Bit] in { defm V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile, csra_64>; } // End SubtargetPredicate = isGFX6GFX7 - let IsInvalidSingleUseConsumer = 1 in { let SubtargetPredicate = isGFX8Plus in { defm V_LSHRREV_B64 : VOP3Inst <"v_lshrrev_b64", VOP3_Profile, clshr_rev_64>; defm V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile, cashr_rev_64>; - } // End SubtargetPredicate = isGFX8Plus, , IsInvalidSingleUseConsumer = 1 + } // End SubtargetPredicate = isGFX8Plus let SubtargetPredicate = isGFX8GFX9GFX10GFX11 in { defm V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile, clshl_rev_64>; } // End SubtargetPredicate = isGFX8GFX9GFX10GFX11 - } // End IsInvalidSingleUseConsumer = 1 } // End SchedRW = [Write64Bit] } // End isReMaterializable = 1 @@ -311,14 +309,14 @@ def VOPProfileMQSAD : VOP3_Profile { let HasModifiers = 0; } -let SubtargetPredicate = isGFX7Plus, IsInvalidSingleUseConsumer = 1 in { +let SubtargetPredicate = isGFX7Plus in { let Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] in { defm V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile>; defm V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOPProfileMQSAD>; } // End Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] -} // End SubtargetPredicate = isGFX7Plus, IsInvalidSingleUseConsumer = 1 +} // End SubtargetPredicate = isGFX7Plus -let isCommutable = 1, SchedRW = 
[WriteIntMul, WriteSALU], IsInvalidSingleUseConsumer = 1 in { +let isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU] in { let SubtargetPredicate = isGFX7Plus, OtherPredicates = [HasNotMADIntraFwdBug] in { defm V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>; defm V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>; @@ -328,7 +326,7 @@ let isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU], IsInvalidSingleUseCons defm V_MAD_U64_U32_gfx11 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>; defm V_MAD_I64_I32_gfx11 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>; } -} // End isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU], IsInvalidSingleUseConsumer = 1 +} // End isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU] let FPDPRounding = 1 in { @@ -865,10 +863,10 @@ let SubtargetPredicate = isGFX10Plus in { } // End isCommutable = 1, isReMaterializable = 1 def : ThreeOp_i32_Pats; - let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in", IsInvalidSingleUseConsumer = 1, IsInvalidSingleUseProducer = 1 in { + let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in { defm V_PERMLANE16_B32 : VOP3Inst<"v_permlane16_b32", VOP3_PERMLANE_Profile>; defm V_PERMLANEX16_B32 : VOP3Inst<"v_permlanex16_b32", VOP3_PERMLANE_Profile>; - } // End $vdst = $vdst_in, DisableEncoding $vdst_in, IsInvalidSingleUseConsumer = 1, IsInvalidSingleUseProducer = 1 + } // End $vdst = $vdst_in, DisableEncoding $vdst_in foreach vt = Reg32Types.types in { def : PermlanePat; @@ -1286,12 +1284,11 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { } } // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" -let IsInvalidSingleUseConsumer = 1 in { - defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx10<0x360>; - let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in), IsInvalidSingleUseProducer = 1 in { - defm V_WRITELANE_B32 : VOP3_Real_No_Suffix_gfx10<0x361>; - } // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32: $src1, VGPR_32:$vdst_in), IsInvalidSingleUseProducer = 1 -} // End IsInvalidSingleUseConsumer = 1 +defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx10<0x360>; + +let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in { + defm V_WRITELANE_B32 : VOP3_Real_No_Suffix_gfx10<0x361>; +} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) let SubtargetPredicate = isGFX10Before1030 in { defm V_MUL_LO_I32 : VOP3_Real_gfx10<0x16b>; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index f4d2c29158f49f..5eee71887964ad 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -382,19 +382,15 @@ defm V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", AMDGPUfdot2, 1/*ExplicitClamp*/>; let OtherPredicates = [HasDot7Insts] in { -let IsInvalidSingleUseConsumer = 1 in { - defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", - VOP3P_Profile, int_amdgcn_udot4, 1>; -} +defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", + VOP3P_Profile, int_amdgcn_udot4, 1>; defm V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", VOP3P_Profile, int_amdgcn_udot8, 1>; } // End OtherPredicates = [HasDot7Insts] let OtherPredicates = [HasDot1Insts] in { -let IsInvalidSingleUseConsumer = 1 in { - defm V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", - VOP3P_Profile, int_amdgcn_sdot4, 1>; -} +defm V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", + VOP3P_Profile, int_amdgcn_sdot4, 1>; defm V_DOT8_I32_I4 : 
VOP3PInst<"v_dot8_i32_i4", VOP3P_Profile, int_amdgcn_sdot8, 1>; } // End OtherPredicates = [HasDot1Insts] diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index be862b44917e15..d6e08dce130ced 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -464,10 +464,9 @@ multiclass VOPC_I16 : VOPC_Pseudos ; -let IsInvalidSingleUseConsumer = 1 in { - multiclass VOPC_I64 : - VOPC_Pseudos ; -} +multiclass VOPC_I64 : + VOPC_Pseudos ; + multiclass VOPCX_F16 { let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in { @@ -502,10 +501,8 @@ multiclass VOPCX_I16 { multiclass VOPCX_I32 : VOPCX_Pseudos ; -let IsInvalidSingleUseConsumer = 1 in { - multiclass VOPCX_I64 : - VOPCX_Pseudos ; -} +multiclass VOPCX_I64 : + VOPCX_Pseudos ; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 5a460ef0d42320..05a7d907d237ae 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -17,8 +17,6 @@ class LetDummies { bit isReMaterializable; bit isAsCheapAsAMove; bit FPDPRounding; - bit IsInvalidSingleUseConsumer; - bit IsInvalidSingleUseProducer; Predicate SubtargetPredicate; string Constraints; string DisableEncoding; @@ -67,8 +65,6 @@ class VOP_Pseudo (NAME); bit IsTrue16 = P.IsTrue16; - bit IsInvalidSingleUseConsumer = P.IsInvalidSingleUseConsumer; - bit IsInvalidSingleUseProducer = P.IsInvalidSingleUseProducer; VOPProfile Pfl = P; string AsmOperands; @@ -165,8 +161,6 @@ class VOP3P_Pseudo pattern = []> : class VOP_Real { Instruction Opcode = !cast(NAME); bit IsSingle = ps.Pfl.IsSingle; - bit IsInvalidSingleUseConsumer = ps.Pfl.IsInvalidSingleUseConsumer; - bit IsInvalidSingleUseProducer = ps.Pfl.IsInvalidSingleUseProducer; } class VOP3_Real : @@ -844,9 +838,6 @@ class VOP_DPP_Pseudo pattern=[], let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", ""); let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, ""); let DecoderNamespace = "GFX8"; - - let IsInvalidSingleUseConsumer = !not(VINTERP); - let IsInvalidSingleUseProducer = !not(VINTERP); } class VOP3_DPP_Pseudo : @@ -1714,13 +1705,4 @@ def VOPTrue16Table : GenericTable { let PrimaryKey = ["Opcode"]; let PrimaryKeyName = "getTrue16OpcodeHelper"; -} - -def SingleUseExceptionTable : GenericTable { - let FilterClass = "VOP_Pseudo"; - let CppTypeName = "SingleUseExceptionInfo"; - let Fields = ["Opcode", "IsInvalidSingleUseConsumer", "IsInvalidSingleUseProducer"]; - - let PrimaryKey = ["Opcode"]; - let PrimaryKeyName = "getSingleUseExceptionHelper"; -} +} \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir b/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir deleted file mode 100644 index 9e65ce329df431..00000000000000 --- a/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir +++ /dev/null @@ -1,1420 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 -# RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs -run-pass=amdgpu-insert-single-use-vdst %s -o - | FileCheck %s - -# One single-use producer. 
---- -name: one_producer -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: one_producer - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr0, $vgpr2 - bb.0: - liveins: $vgpr0 - $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec - $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - bb.1: - liveins: $vgpr0, $vgpr2 -... - -# One single-use producer of a 64-bit value. ---- -name: one_producer_64bit -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: one_producer_64bit - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0_vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr2_vgpr3 = V_LSHLREV_B64_e64 0, $vgpr0_vgpr1, implicit $exec - ; CHECK-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e64 $vgpr2_vgpr3, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr4_vgpr5 - bb.0: - liveins: $vgpr0_vgpr1 - $vgpr2_vgpr3 = V_LSHLREV_B64_e64 0, $vgpr0_vgpr1, implicit $exec - $vgpr4_vgpr5 = V_MOV_B64_e64 $vgpr2_vgpr3, implicit $exec - bb.1: - liveins: $vgpr4_vgpr5 -... - -# Two consecutive single-use producers. ---- -name: two_producers -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: two_producers - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 2 - ; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr0, $vgpr3 - bb.0: - liveins: $vgpr0 - $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec - $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec - bb.1: - liveins: $vgpr0, $vgpr3 -... - -# Redefinitions of v0. ---- -name: redefinitions -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: redefinitions - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 4 - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - bb.0: - liveins: $vgpr0 - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: -... - -# One producer with no consumers. ---- -name: no_consumer -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: no_consumer - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - bb.0: - liveins: $vgpr0 - $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec - bb.1: -... 
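
The immediates in the CHECK lines so far (1, 2 and 4) follow the packed layout
built by encodeImm() in the deleted pass: bits [3:0] count producers in the
region that starts at the hint, and each earlier-encoded region is prefixed by
a 3-bit skip field. A minimal standalone C++ sketch of that encoding, checked
against the tests above; the free-function signature, the std::vector for the
skip list and the main() driver are illustrative, only the bit manipulation
mirrors the pass:

// Sketch of the s_singleuse_vdst immediate encoding (illustrative only).
#include <array>
#include <cassert>
#include <cstdio>
#include <vector>

static unsigned encodeImm(const std::array<unsigned, 3> &SingleUseRegions,
                          const std::vector<unsigned> &SkipRegions) {
  // Low 4 bits: the region at the insertion point; then alternating 3-bit
  // skip / 3-bit region fields for the regions encoded before it.
  unsigned Imm = SingleUseRegions[SkipRegions.size()];
  unsigned ShiftAmount = 4;
  for (unsigned I = SkipRegions.size(); I > 0; I--) {
    Imm |= SkipRegions[I - 1] << ShiftAmount;
    ShiftAmount += 3;
    Imm |= SingleUseRegions[I - 1] << ShiftAmount;
    ShiftAmount += 3;
  }
  return Imm;
}

int main() {
  assert(encodeImm({1, 0, 0}, {}) == 1); // one_producer, no_consumer
  assert(encodeImm({2, 0, 0}, {}) == 2); // two_producers
  assert(encodeImm({4, 0, 0}, {}) == 4); // redefinitions
  std::printf("immediates match the CHECK lines\n");
}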
- -# One consumer with two uses of the same value. ---- -name: one_consumer_two_uses -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: one_consumer_two_uses - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr0, $vgpr2 - bb.0: - liveins: $vgpr0 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec - bb.1: - liveins: $vgpr0, $vgpr2 -... - -# A longer example. ---- -name: longer_example -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: longer_example - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr3, $vgpr5, $sgpr0, $sgpr2, $sgpr4, $sgpr5, $sgpr16, $sgpr17, $sgpr18, $sgpr19 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 274 - ; CHECK-NEXT: $vgpr14 = V_MUL_F32_e32 $sgpr4, $vgpr3, implicit $exec, implicit $mode - ; CHECK-NEXT: $sgpr3 = S_MUL_F16 $sgpr0, $sgpr2, implicit $mode - ; CHECK-NEXT: $vgpr15 = V_MUL_F32_e32 $sgpr5, $vgpr3, implicit $exec, implicit $mode - ; CHECK-NEXT: $vgpr17 = V_FMA_F32_e64 0, $sgpr16, 0, $vgpr5, 0, $vgpr14, 0, 0, implicit $exec, implicit $mode - ; CHECK-NEXT: $sgpr1 = S_ADD_F16 $sgpr0, 15360, implicit $mode - ; CHECK-NEXT: $vgpr15 = V_FMA_F32_e64 0, $sgpr17, 0, $vgpr5, 0, $vgpr15, 0, 0, implicit $exec, implicit $mode - ; CHECK-NEXT: $vgpr14 = V_FMA_F32_e64 0, $sgpr18, 0, $vgpr15, 0, $vgpr17, 0, 0, implicit $exec, implicit $mode - ; CHECK-NEXT: $vgpr15 = V_FMA_F32_e64 0, $sgpr19, 0, $vgpr14, 0, $vgpr17, 0, 0, implicit $exec, implicit $mode - ; CHECK-NEXT: $vgpr16 = V_LOG_F32_e32 $vgpr15, implicit $exec, implicit $mode - ; CHECK-NEXT: $vgpr18 = V_EXP_F32_e32 $vgpr15, implicit $exec, implicit $mode - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr16, $vgpr18 - bb.0: - liveins: $vgpr3, $vgpr5, $sgpr0, $sgpr2, $sgpr4, $sgpr5, $sgpr16, $sgpr17, $sgpr18, $sgpr19 - $vgpr14 = V_MUL_F32_e32 $sgpr4, $vgpr3, implicit $exec, implicit $mode - $sgpr3 = S_MUL_F16 $sgpr0, $sgpr2, implicit $mode - $vgpr15 = V_MUL_F32_e32 $sgpr5, $vgpr3, implicit $exec, implicit $mode - $vgpr17 = V_FMA_F32_e64 0, $sgpr16, 0, $vgpr5, 0, $vgpr14, 0, 0, implicit $exec, implicit $mode - $sgpr1 = S_ADD_F16 $sgpr0, 15360, implicit $mode - $vgpr15 = V_FMA_F32_e64 0, $sgpr17, 0, $vgpr5, 0, $vgpr15, 0, 0, implicit $exec, implicit $mode - $vgpr14 = V_FMA_F32_e64 0, $sgpr18, 0, $vgpr15, 0, $vgpr17, 0, 0, implicit $exec, implicit $mode - $vgpr15 = V_FMA_F32_e64 0, $sgpr19, 0, $vgpr14, 0, $vgpr17, 0, 0, implicit $exec, implicit $mode - $vgpr16 = V_LOG_F32_e32 $vgpr15, implicit $exec, implicit $mode - $vgpr18 = V_EXP_F32_e32 $vgpr15, implicit $exec, implicit $mode - bb.1: - liveins: $vgpr16, $vgpr18 -... - -# Multiple uses of v0. 
---- -name: multiple_uses_1 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: multiple_uses_1 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1, $vgpr2 - bb.0: - liveins: $vgpr0 - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr1, $vgpr2 -... - -# Multiple uses of v0 and redefinitions of v1 and v2. ---- -name: multiple_uses_2 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: multiple_uses_2 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: S_SINGLEUSE_VDST 2 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1, $vgpr2 - bb.0: - liveins: $vgpr0 - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr1, $vgpr2 -... - -# Multiple uses of all but v1. ---- -name: multiple_uses_3 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: multiple_uses_3 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr2, $vgpr3 - bb.0: - liveins: $vgpr0 - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec - bb.1: - liveins: $vgpr2, $vgpr3 -... - -# Second use is an instruction that reads and writes v1. ---- -name: multiple_uses_4 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: multiple_uses_4 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - bb.0: - liveins: $vgpr0 - $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec - $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - bb.1: - liveins: $vgpr0, $vgpr1, $vgpr2 -... 
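
The multiple_uses tests pin down the liveness rule: a def is only marked as a
single-use producer if its register is read at most once before being killed,
and anything live-out of the block is pre-seeded with a use count of 2 so it
can never qualify. A toy model of that reverse walk, reduced to one def per
instruction and string-named values (an illustrative sketch under those
simplifications, not the pass itself):

// Toy model of the reverse use-count walk over a basic block.
#include <cstdio>
#include <map>
#include <string>
#include <vector>

struct Inst {
  std::string Def;
  std::vector<std::string> Uses;
};

int main() {
  // Mirrors multiple_uses_3 above: v0 feeds v1 and v2, v1 feeds v3, and
  // v2/v3 are live-out, so only the def of v1 is a single-use producer.
  const std::vector<Inst> Block = {{"v0", {"v0"}},
                                   {"v1", {"v0"}},
                                   {"v2", {"v0"}},
                                   {"v3", {"v1"}}};
  std::map<std::string, unsigned> UseCount;
  UseCount["v2"] = UseCount["v3"] = 2; // live-outs can never be single-use
  for (auto It = Block.rbegin(); It != Block.rend(); ++It) {
    // At most one later read; zero reads also qualify, as in no_consumer.
    const bool SingleUse = UseCount[It->Def] <= 1;
    UseCount.erase(It->Def); // the def ends the backward live range
    for (const std::string &U : It->Uses)
      ++UseCount[U];
    if (SingleUse)
      std::printf("single-use producer: %s\n", It->Def.c_str());
  }
}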
- -# Results are live-in to another basic block. ---- -name: basic_block_1 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: basic_block_1 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: successors: %bb.2(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: liveins: $vgpr1, $vgpr2 - bb.0: - liveins: $vgpr0 - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr0, $vgpr1, $vgpr2 - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.2: - liveins: $vgpr1, $vgpr2 -... - -# Result v2 has multiple uses in another basic block. ---- -name: basic_block_2 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: basic_block_2 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr1, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: successors: %bb.2(0x80000000) - ; CHECK-NEXT: liveins: $vgpr2 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr2, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr2, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: liveins: $vgpr3 - bb.0: - liveins: $vgpr0, $vgpr1 - $vgpr2 = V_MOV_B32_e32 $vgpr1, implicit $exec - bb.1: - liveins: $vgpr2 - $vgpr3 = V_MOV_B32_e32 $vgpr2, implicit $exec - $vgpr3 = V_MOV_B32_e32 $vgpr2, implicit $exec - bb.2: - liveins: $vgpr3 -... - -# Results are redefined in another basic block. 
---- -name: basic_block_3 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: basic_block_3 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: successors: %bb.2(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - bb.0: - liveins: $vgpr0 - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr0, $vgpr1 - $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - bb.2: - liveins: $vgpr0, $vgpr1, $vgpr2 -... - -# Exec modified between producer and consumer. ---- -name: exec_mask -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: exec_mask - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: $exec = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr0 - bb.0: - liveins: $sgpr0_sgpr1 - $vgpr0 = V_MOV_B32_e32 0, implicit $exec - $exec = COPY $sgpr0_sgpr1 - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr0 -... - -# Exec_lo modified between producer and consumer. ---- -name: exec_mask_lo -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: exec_mask_lo - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $sgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: $exec_lo = COPY $sgpr0 - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr0 - bb.0: - liveins: $sgpr0 - $vgpr0 = V_MOV_B32_e32 0, implicit $exec - $exec_lo = COPY $sgpr0 - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr0 -... - -# Exec_hi modified between producer and consumer. ---- -name: exec_mask_hi -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: exec_mask_hi - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $sgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: $exec_hi = COPY $sgpr0 - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr0 - bb.0: - liveins: $sgpr0 - $vgpr0 = V_MOV_B32_e32 0, implicit $exec - $exec_hi = COPY $sgpr0 - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr0 -... - -# Write 32-bit vgpr and then read from low 16 bits. 
---- -name: write_full_read_lo -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: write_full_read_lo - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: $vgpr1_lo16 = V_MOV_B16_t16_e32 $vgpr0_lo16, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1_lo16 - bb.0: - $vgpr0 = V_MOV_B32_e32 0, implicit $exec - $vgpr1_lo16 = V_MOV_B16_t16_e32 $vgpr0_lo16, implicit $exec - bb.1: - liveins: $vgpr1_lo16 -... - -# Write 32-bit vgpr and then read from high 16 bits. ---- -name: write_full_read_hi -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: write_full_read_hi - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: $vgpr1_hi16 = V_MOV_B16_t16_e32 $vgpr0_hi16, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1_hi16 - bb.0: - $vgpr0 = V_MOV_B32_e32 0, implicit $exec - $vgpr1_hi16 = V_MOV_B16_t16_e32 $vgpr0_hi16, implicit $exec - bb.1: - liveins: $vgpr1_hi16 -... - -# Write 32-bit vgpr and then read from both halves. ---- -name: write_full_read_both -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: write_full_read_both - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: $vgpr1_lo16 = V_MOV_B16_t16_e32 $vgpr0_lo16, implicit $exec - ; CHECK-NEXT: $vgpr1_hi16 = V_MOV_B16_t16_e32 $vgpr0_hi16, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1 - bb.0: - $vgpr0 = V_MOV_B32_e32 0, implicit $exec - $vgpr1_lo16 = V_MOV_B16_t16_e32 $vgpr0_lo16, implicit $exec - $vgpr1_hi16 = V_MOV_B16_t16_e32 $vgpr0_hi16, implicit $exec - bb.1: - liveins: $vgpr1 -... - -# Write 32-bit vgpr and then read from both halves in the same instruction. ---- -name: write_full_read_both_same_instruction -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: write_full_read_both_same_instruction - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: $vgpr1_lo16 = V_ADD_F16_t16_e32 $vgpr0_lo16, $vgpr0_hi16, implicit $mode, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1_lo16 - bb.0: - $vgpr0 = V_MOV_B32_e32 0, implicit $exec - $vgpr1_lo16 = V_ADD_F16_t16_e32 $vgpr0_lo16, $vgpr0_hi16, implicit $mode, implicit $exec - bb.1: - liveins: $vgpr1_lo16 -... - -# Write low 16-bits and then read 32-bit vgpr. ---- -name: write_lo_read_full -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: write_lo_read_full - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1 - bb.0: - liveins: $vgpr0 - $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr1 -... - -# Write low 16-bits and then read 32-bit vgpr twice. 
---- -name: write_lo_read_full_twice -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: write_lo_read_full_twice - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1, $vgpr2 - bb.0: - liveins: $vgpr0 - $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr1, $vgpr2 -... - -# Write high 16-bits and then read 32-bit vgpr. ---- -name: write_hi_read_full -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: write_hi_read_full - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1 - bb.0: - liveins: $vgpr0 - $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr1 -... - -# Write high 16-bits and then read 32-bit vgpr twice. ---- -name: write_hi_read_full_twice -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: write_hi_read_full_twice - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1, $vgpr2 - bb.0: - liveins: $vgpr0 - $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr1, $vgpr2 -... - -# Write low 16-bits and then write high 16-bits and then read 32-bit vgpr. ---- -name: write_both_read_full -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: write_both_read_full - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 2 - ; CHECK-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec - ; CHECK-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1 - bb.0: - $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec - $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr1 -... - -# Write low 16-bits and then write high 16-bits and then read 32-bit vgpr twice. 
---- -name: write_both_read_full_twice -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: write_both_read_full_twice - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec - ; CHECK-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1, $vgpr2 - bb.0: - $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec - $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr1, $vgpr2 -... - -# Three single use producer instructions with non single use producer -# instructions in between. ---- -name: three_producers_with_two_skips -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: three_producers_with_two_skips - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 9361 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr2, $vgpr4 - bb.0: - liveins: $vgpr0 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr2, $vgpr4 -... - -# Six single use producer instructions with non single use producer -# instructions in between. 
---- -name: six_producers_with_four_skips -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: six_producers_with_four_skips - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 145 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: S_SINGLEUSE_VDST 9362 - ; CHECK-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr2, $vgpr4, $vgpr7, $vgpr9 - bb.0: - liveins: $vgpr0 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr2, $vgpr4, $vgpr7, $vgpr9 -... - -# Five single use producer instructions, followed by -# four non single use producers, followed by -# three single use producer instructions, followed by -# two non single use producers, followed by -# one single use producer instructions. 
---- -name: immediate_order -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: immediate_order - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 10693 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr13, $vgpr14 - bb.0: - liveins: $vgpr0 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr13, $vgpr14 -... - -# Maximum number of single use producers that can be encoded in a single -# instruction. 
---- -name: maximum_producers_single_instruction -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: maximum_producers_single_instruction - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 58255 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr17 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr18 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr19 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr20 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr21 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr22 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr23 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr24 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr25 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr26 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr27 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr28 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr29 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - bb.0: - liveins: $vgpr0 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr17 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr18 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr19 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr20 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr21 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr22 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr23 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr24 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr25 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr26 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr27 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr28 = V_MOV_B32_e32 $vgpr0, implicit 
$exec - $vgpr29 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: -... - -# One more than the maximum number of single use producers that can be encoded -# in a single instruction. ---- -name: too_many_producers_single_instruction -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: too_many_producers_single_instruction - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: S_SINGLEUSE_VDST 58255 - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr17 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr18 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr19 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr20 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr21 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr22 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr23 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr24 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr25 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr26 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr27 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr28 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr29 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr30 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - - - - bb.0: - liveins: $vgpr0 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr17 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr18 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr19 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr20 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr21 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr22 = V_MOV_B32_e32 $vgpr0, implicit $exec - - 
$vgpr23 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr24 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr25 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr26 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr27 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr28 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr29 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr30 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: -... - -# Maximum distance between single use producers that can be encoded in a single -# instruction. ---- -name: maximum_skips_single_instruction -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: maximum_skips_single_instruction - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 15473 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15 - bb.0: - liveins: $vgpr0 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15 -... - -# One more than the maximum distance between single use producers that can be -# encoded in a single instruction. 
---- -name: too_many_skips_single_instruction -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: too_many_skips_single_instruction - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr17 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16 - bb.0: - liveins: $vgpr0 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr17 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16 -... 
- - -# Maximum possible encoding value with all bits of the immediate set ---- -name: all_immediate_bits_set -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: all_immediate_bits_set - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 65535 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr17 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr18 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr19 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr20 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr21 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr22 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr23 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr24 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr25 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr26 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr27 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr28 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr29 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr30 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr31 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr32 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr33 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr34 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr35 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr36 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr37 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr38 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr39 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr40 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr41 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr42 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr43 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr30, $vgpr31, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36 - bb.0: - liveins: $vgpr0 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr6 = V_MOV_B32_e32 $vgpr0, 
implicit $exec - $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr17 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr18 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr19 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr20 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr21 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr22 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr23 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr24 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr25 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr26 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr27 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr28 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr29 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr30 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr31 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr32 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr33 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr34 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr35 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr36 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr37 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr38 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr39 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr40 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr41 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr42 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr43 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr30, $vgpr31, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36 - -... - -# Tests for multi-cycle instructions that are explicitly excluded. - -# Valid producers but invalid consumer opcodes. ---- -name: v_mul_hi_u32_e64 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: v_mul_hi_u32_e64 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr2 = V_MUL_HI_U32_e64 $vgpr0, $vgpr1, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr2, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr0, $vgpr3 - bb.0: - liveins: $vgpr0 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MUL_HI_U32_e64 $vgpr0, $vgpr1, implicit $exec - $vgpr3 = V_MOV_B32_e32 $vgpr2, implicit $exec - bb.1: - liveins: $vgpr0, $vgpr3 -... 
- ---- -name: v_cmpx_t_u64_e64 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: v_cmpx_t_u64_e64 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $sgpr0 = V_CMPX_EQ_U64_e64 $vgpr0_vgpr1, $vgpr2_vgpr3, implicit-def $exec, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr0 - bb.0: - liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $sgpr0 = V_CMPX_EQ_U64_e64 $vgpr0_vgpr1, $vgpr2_vgpr3, implicit-def $exec, implicit $exec - S_BRANCH %bb.1 - bb.1: - liveins: $vgpr0 -... - ---- -name: v_lshlrev_b64_e64 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: v_lshlrev_b64_e64 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0_vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e64 $vgpr0_vgpr1, implicit $exec - ; CHECK-NEXT: $vgpr4_vgpr5 = V_LSHLREV_B64_e64 0, $vgpr2_vgpr3, implicit $exec - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr6_vgpr7 = V_LSHLREV_B64_e64 0, $vgpr4_vgpr5, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr4_vgpr5 - bb.0: - liveins: $vgpr0_vgpr1 - $vgpr2_vgpr3 = V_MOV_B64_e64 $vgpr0_vgpr1, implicit $exec - $vgpr4_vgpr5 = V_LSHLREV_B64_e64 0, $vgpr2_vgpr3, implicit $exec - $vgpr6_vgpr7 = V_LSHLREV_B64_e64 0, $vgpr4_vgpr5, implicit $exec - bb.1: - liveins: $vgpr4_vgpr5 -... - -# Invalid producers but valid consumer opcodes. ---- -name: v_movereld_b32_e32 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: v_movereld_b32_e32 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0, $vgpr2 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $m0 = S_MOV_B32 0 - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: V_MOVRELD_B32_e32 $vgpr2, $vgpr1, implicit $m0, implicit $exec, implicit-def $vgpr1_vgpr2, implicit undef $vgpr1_vgpr2(tied-def 4) - ; CHECK-NEXT: $vgpr3 = V_ADD_U32_e32 $vgpr2, $vgpr1, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr3 - bb.0: - liveins: $vgpr0, $vgpr2 - $m0 = S_MOV_B32 0 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - V_MOVRELD_B32_e32 $vgpr2, $vgpr1, implicit $m0, implicit $exec, implicit-def $vgpr1_vgpr2, implicit undef $vgpr1_vgpr2(tied-def 4) - $vgpr3 = V_ADD_U32_e32 $vgpr2, $vgpr1, implicit $exec - bb.1: - liveins: $vgpr3 -... - -# Invalid producers and invalid consumer opcodes. ---- -name: v_writelane_b32 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: v_writelane_b32 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0, $sgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr0, 0, $vgpr1 - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr1, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr0 - bb.0: - liveins: $vgpr0, $sgpr0 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr1 = V_WRITELANE_B32 $sgpr0, 0, $vgpr1 - $vgpr2 = V_MOV_B32_e32 $vgpr1, implicit $exec - bb.1: - liveins: $vgpr0 -... 
- -# DPP instructions cannot be single use producers or consumers ---- -name: V_ADD_NC_U32_dpp -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: V_ADD_NC_U32_dpp - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0, $vcc - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr0, $vgpr0, 1, 15, 15, 1, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec - ; CHECK-NEXT: $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr0, $vgpr0, 1, 15, 15, 1, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec - ; CHECK-NEXT: $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr0, $vgpr0, 1, 15, 15, 1, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr0 - bb.0: - liveins: $vgpr0, $vcc - $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr0, $vgpr0, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec - $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr0, $vgpr0, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec - $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr0, $vgpr0, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec - bb.1: - liveins: $vgpr0 -... - -# Exception to the rule that dpp instructions -# cannot be single use producers or consumers ---- -name: V_INTERP_MOV_F32 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: V_INTERP_MOV_F32 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr0 = V_INTERP_MOV_F32 0, 0, 0, implicit $mode, implicit $m0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1 - bb.0: - $vgpr0 = V_INTERP_MOV_F32 0, 0, 0, implicit $mode, implicit $m0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr1 -... 
- diff --git a/llvm/test/MC/AMDGPU/gfx1150_asm_sopp.s b/llvm/test/MC/AMDGPU/gfx1150_asm_sopp.s deleted file mode 100644 index 044ce48c267846..00000000000000 --- a/llvm/test/MC/AMDGPU/gfx1150_asm_sopp.s +++ /dev/null @@ -1,10 +0,0 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1150 -show-encoding %s | FileCheck --check-prefixes=GFX1150 %s - -s_singleuse_vdst 0x0000 -// GFX1150: encoding: [0x00,0x00,0x93,0xbf] - -s_singleuse_vdst 0xffff -// GFX1150: encoding: [0xff,0xff,0x93,0xbf] - -s_singleuse_vdst 0x1234 -// GFX1150: encoding: [0x34,0x12,0x93,0xbf] diff --git a/llvm/test/MC/AMDGPU/gfx11_unsupported.s b/llvm/test/MC/AMDGPU/gfx11_unsupported.s index c9756a068890e7..c565801d275bb8 100644 --- a/llvm/test/MC/AMDGPU/gfx11_unsupported.s +++ b/llvm/test/MC/AMDGPU/gfx11_unsupported.s @@ -2014,9 +2014,6 @@ s_cmp_neq_f16 s1, s2 s_cmp_nlt_f16 s1, s2 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU -s_singleuse_vdst 0x1234 -// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU - buffer_atomic_sub_clamp_u32 v5, off, s[8:11], s3 offset:0 glc // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s b/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s index e98659208d5a9c..fdcabc4352c69b 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s @@ -69,15 +69,6 @@ s_wait_alu depctr_va_sdst(3) s_wait_alu depctr_va_vdst(14) depctr_va_sdst(6) depctr_vm_vsrc(6) // GFX12: encoding: [0x9b,0xed,0x88,0xbf] -s_singleuse_vdst 0x0000 -// GFX12: encoding: [0x00,0x00,0x93,0xbf] - -s_singleuse_vdst 0xffff -// GFX12: encoding: [0xff,0xff,0x93,0xbf] - -s_singleuse_vdst 0x1234 -// GFX12: encoding: [0x34,0x12,0x93,0xbf] - s_barrier_wait 0xffff // GFX12: encoding: [0xff,0xff,0x94,0xbf] diff --git a/llvm/test/MC/Disassembler/AMDGPU/decode-err.txt b/llvm/test/MC/Disassembler/AMDGPU/decode-err.txt index d6e8b7ee2f01f0..f819a61949b577 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/decode-err.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/decode-err.txt @@ -1,16 +1,11 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx900 -disassemble -show-encoding -filetype=null < %s 2>&1 | FileCheck -check-prefix=GCN-ERR %s # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s 2>&1 | FileCheck -check-prefixes=W32 %s # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s 2>&1 | FileCheck -check-prefixes=W64 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding -filetype=null < %s 2>&1 | FileCheck -check-prefix=GFX11-ERR %s # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding -filetype=null < %s 2>&1 | FileCheck -check-prefix=GFX12-ERR %s # GCN-ERR: [[@LINE+1]]:1: warning: invalid instruction encoding 0xdf,0x00,0x00,0x02 -# this is s_singleuse_vdst 0x1234, which is only valid on gfx1150 -# GFX11-ERR: [[@LINE+1]]:1: warning: invalid instruction encoding -0x34,0x12,0x93,0xbf - # this is s_waitcnt_vscnt exec_hi, 0x1234, which is valid on gfx11, but not on gfx12 # GFX12-ERR: [[@LINE+1]]:1: warning: invalid instruction encoding 0x34,0x12,0x7f,0xbc diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1150_dasm_sopp.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1150_dasm_sopp.txt deleted file mode 100644 index 8fa266a73ff87f..00000000000000 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1150_dasm_sopp.txt +++ /dev/null @@ -1,10 +0,0 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1150 -disassemble -show-encoding < %s | FileCheck 
-check-prefixes=GFX1150 %s - -# GFX1150: s_singleuse_vdst 0x0 ; encoding: [0x00,0x00,0x93,0xbf] -0x00,0x00,0x93,0xbf - -# GFX1150: s_singleuse_vdst 0xffff ; encoding: [0xff,0xff,0x93,0xbf] -0xff,0xff,0x93,0xbf - -# GFX1150: s_singleuse_vdst 0x1234 ; encoding: [0x34,0x12,0x93,0xbf] -0x34,0x12,0x93,0xbf diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt index d42f920aa61dd7..d69801512c0786 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt @@ -60,14 +60,6 @@ # GFX12: s_wait_storecnt_dscnt 0xc1d1 ; encoding: [0xd1,0xc1,0xc9,0xbf] 0xd1,0xc1,0xc9,0xbf -# GFX12: s_singleuse_vdst 0x0 ; encoding: [0x00,0x00,0x93,0xbf] -0x00,0x00,0x93,0xbf - -# GFX12: s_singleuse_vdst 0xffff ; encoding: [0xff,0xff,0x93,0xbf] -0xff,0xff,0x93,0xbf - -# GFX12: s_singleuse_vdst 0x1234 ; encoding: [0x34,0x12,0x93,0xbf] -0x34,0x12,0x93,0xbf # GFX12: s_barrier_wait 0xffff ; encoding: [0xff,0xff,0x94,0xbf] 0xff,0xff,0x94,0xbf diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn index dd4af4e98832f7..f83efbd3558025 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn @@ -152,7 +152,6 @@ static_library("LLVMAMDGPUCodeGen") { "AMDGPUISelLowering.cpp", "AMDGPUImageIntrinsicOptimizer.cpp", "AMDGPUInsertDelayAlu.cpp", - "AMDGPUInsertSingleUseVDST.cpp", "AMDGPUInstCombineIntrinsic.cpp", "AMDGPUInstrInfo.cpp", "AMDGPUInstructionSelector.cpp", From fc661df41a206779a9323fb9dd49038c44084d5e Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Tue, 24 Sep 2024 12:04:04 +0200 Subject: [PATCH 07/22] [LLD][COFF][NFC] Use dyn_cast on section chunks (#109701) Use dyn_cast instead of dyn_cast_or_null; chunk pointers are never null. --- lld/COFF/Writer.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp index c2765453aa964e..7cf723a8cf103f 100644 --- a/lld/COFF/Writer.cpp +++ b/lld/COFF/Writer.cpp @@ -472,7 +472,7 @@ bool Writer::createThunks(OutputSection *os, int margin) { // Recheck Chunks.size() each iteration, since we can insert more // elements into it. for (size_t i = 0; i != os->chunks.size(); ++i) { - SectionChunk *sc = dyn_cast_or_null<SectionChunk>(os->chunks[i]); + SectionChunk *sc = dyn_cast<SectionChunk>(os->chunks[i]); if (!sc) continue; MachineTypes machine = sc->getMachine(); @@ -606,7 +606,7 @@ void Writer::createECCodeMap() { // Verify that all relocations are in range, with no extra margin requirements. bool Writer::verifyRanges(const std::vector<Chunk *> chunks) { for (Chunk *c : chunks) { - SectionChunk *sc = dyn_cast_or_null<SectionChunk>(c); + SectionChunk *sc = dyn_cast<SectionChunk>(c); if (!sc) continue; MachineTypes machine = sc->getMachine(); @@ -872,8 +872,8 @@ bool Writer::fixGnuImportChunks() { if (!pSec->chunks.empty()) hasIdata = true; llvm::stable_sort(pSec->chunks, [&](Chunk *s, Chunk *t) { - SectionChunk *sc1 = dyn_cast_or_null<SectionChunk>(s); - SectionChunk *sc2 = dyn_cast_or_null<SectionChunk>(t); + SectionChunk *sc1 = dyn_cast<SectionChunk>(s); + SectionChunk *sc2 = dyn_cast<SectionChunk>(t); if (!sc1 || !sc2) { // if SC1, order them ascending. If SC2 or both null, // S is not less than T. From f664d313cd63893d7a4a496fdf0de988323b6b09 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Tue, 24 Sep 2024 11:18:37 +0100 Subject: [PATCH 08/22] MemCpyOpt: replace an AA query with an MSSA query (NFC) (#108535) Fix a long-standing TODO.
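Note for reviewers: the replacement has roughly the following shape (a simplified sketch using the names from the patch below, not a verbatim excerpt):

    // Rather than scanning every instruction between the load and the store
    // and asking alias analysis whether it may write the loaded memory, ask
    // MemorySSA's walker for the clobber of the store's defining access at
    // the load's location.
    MemoryUseOrDef *LoadAccess = MSSA->getMemoryAccess(LI);
    MemoryUseOrDef *StoreAccess = MSSA->getMemoryAccess(SI);
    MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
        StoreAccess->getDefiningAccess(), MemoryLocation::get(LI), BAA);
    // Promote at the clobbering instruction only if the load dominates it;
    // otherwise keep promoting at the store position.
    Instruction *P = MSSA->dominates(LoadAccess, Clobber)
                         ? cast<MemoryUseOrDef>(Clobber)->getMemoryInst()
                         : SI;

This turns a linear sequence of getModRefInfo queries into a single (cached) MemorySSA walk.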
--- .../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 26 +++++++++---------- llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll | 15 +++++++++++ 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 1d67773585d593..2f88b19a8d3902 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -638,6 +638,7 @@ bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI, if (!LI->isSimple() || !LI->hasOneUse() || LI->getParent() != SI->getParent()) return false; + BatchAAResults BAA(*AA); auto *T = LI->getType(); // Don't introduce calls to memcpy/memmove intrinsics out of thin air if // the corresponding libcalls are not available. @@ -647,19 +648,17 @@ bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI, (EnableMemCpyOptWithoutLibcalls || (TLI->has(LibFunc_memcpy) && TLI->has(LibFunc_memmove)))) { MemoryLocation LoadLoc = MemoryLocation::get(LI); - - // We use alias analysis to check if an instruction may store to - // the memory we load from in between the load and the store. If - // such an instruction is found, we try to promote there instead - // of at the store position. - // TODO: Can use MSSA for this. - Instruction *P = SI; - for (auto &I : make_range(++LI->getIterator(), SI->getIterator())) { - if (isModSet(AA->getModRefInfo(&I, LoadLoc))) { - P = &I; - break; - } - } + MemoryUseOrDef *LoadAccess = MSSA->getMemoryAccess(LI), + *StoreAccess = MSSA->getMemoryAccess(SI); + + // We use MSSA to check if an instruction may store to the memory we load + // from in between the load and the store. If such an instruction is found, + // we try to promote there instead of at the store position. + auto *Clobber = MSSA->getWalker()->getClobberingMemoryAccess( + StoreAccess->getDefiningAccess(), LoadLoc, BAA); + Instruction *P = MSSA->dominates(LoadAccess, Clobber) + ? cast<MemoryUseOrDef>(Clobber)->getMemoryInst() + : SI; // If we found an instruction that may write to the loaded memory, // we can try to promote at this position instead of the store @@ -707,7 +706,6 @@ bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI, // Detect cases where we're performing call slot forwarding, but // happen to be using a load-store pair to implement it, rather than // a memcpy. - BatchAAResults BAA(*AA); auto GetCall = [&]() -> CallInst * { // We defer this expensive clobber walk until the cheap checks // have been done on the source inside performCallSlotOptzn. diff --git a/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll b/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll index 51fad820509393..61e349e01ed91d 100644 --- a/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll +++ b/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll @@ -141,4 +141,19 @@ define void @throwing_call(ptr noalias %src, ptr %dst) { ret void } +define void @loop_memoryphi(ptr %a, ptr %b) { +; CHECK-LABEL: @loop_memoryphi( +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 8 [[B:%.*]], ptr align 8 [[A:%.*]], i64 16, i1 false) +; CHECK-NEXT: br label [[LOOP]] +; + br label %loop + +loop: + %v = load { i64, i64 }, ptr %a + store { i64, i64 } %v, ptr %b + br label %loop +} + declare void @call() From 040bb37195d93f75cc7ce6b83254ab5db959a085 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 24 Sep 2024 11:18:49 +0100 Subject: [PATCH 09/22] [VPlan] Fix incorrect argument for CreateBinOp after 06c3a7d2d764.
06c3a7d2d764 incorrectly updated CreateBinOp to pass the debug location, which gets interpreted as an FPMath node. Remove the argument. --- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 4 +- .../LoopVectorize/float-induction.ll | 109 ++++++++++++++++++ 2 files changed, 111 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 3f7ab416e877bc..318d6a8c5b8c34 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1504,8 +1504,8 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { VecInd->setDebugLoc(EntryVal->getDebugLoc()); State.set(this, VecInd); - Instruction *LastInduction = cast<Instruction>(Builder.CreateBinOp( - AddOp, VecInd, SplatVF, "vec.ind.next", EntryVal->getDebugLoc())); + Instruction *LastInduction = cast<Instruction>( + Builder.CreateBinOp(AddOp, VecInd, SplatVF, "vec.ind.next")); if (isa<TruncInst>(EntryVal)) State.addMetadata(LastInduction, EntryVal); LastInduction->setDebugLoc(EntryVal->getDebugLoc()); diff --git a/llvm/test/Transforms/LoopVectorize/float-induction.ll b/llvm/test/Transforms/LoopVectorize/float-induction.ll index 9091b2c80fb97c..cedaf019a958bd 100644 --- a/llvm/test/Transforms/LoopVectorize/float-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/float-induction.ll @@ -1640,3 +1640,112 @@ for.inc: for.end: ret void } + +define i32 @float_induction_with_dbg_on_fadd(ptr %dst) { +; VEC4_INTERL1-LABEL: @float_induction_with_dbg_on_fadd( +; VEC4_INTERL1-NEXT: entry: +; VEC4_INTERL1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; VEC4_INTERL1: vector.ph: +; VEC4_INTERL1-NEXT: br label [[VECTOR_BODY:%.*]] +; VEC4_INTERL1: vector.body: +; VEC4_INTERL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; VEC4_INTERL1-NEXT: [[TMP0:%.*]] = getelementptr float, ptr null, i64 [[INDEX]] +; VEC4_INTERL1-NEXT: store <4 x float> poison, ptr [[TMP0]], align 8 +; VEC4_INTERL1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VEC4_INTERL1-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 +; VEC4_INTERL1-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; VEC4_INTERL1: middle.block: +; VEC4_INTERL1-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; VEC4_INTERL1: scalar.ph: +; VEC4_INTERL1-NEXT: br label [[LOOP:%.*]] +; VEC4_INTERL1: loop: +; VEC4_INTERL1-NEXT: br i1 poison, label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP16:![0-9]+]] +; VEC4_INTERL1: exit: +; VEC4_INTERL1-NEXT: ret i32 0 +; +; VEC4_INTERL2-LABEL: @float_induction_with_dbg_on_fadd( +; VEC4_INTERL2-NEXT: entry: +; VEC4_INTERL2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; VEC4_INTERL2: vector.ph: +; VEC4_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] +; VEC4_INTERL2: vector.body: +; VEC4_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; VEC4_INTERL2-NEXT: [[TMP0:%.*]] = getelementptr float, ptr null, i64 [[INDEX]] +; VEC4_INTERL2-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i64 16 +; VEC4_INTERL2-NEXT: store <4 x float> poison, ptr [[TMP0]], align 8 +; VEC4_INTERL2-NEXT: store <4 x float> zeroinitializer, ptr [[TMP1]], align 8 +; VEC4_INTERL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 +; VEC4_INTERL2-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop
[[LOOP15:![0-9]+]] +; VEC4_INTERL2: middle.block: +; VEC4_INTERL2-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; VEC4_INTERL2: scalar.ph: +; VEC4_INTERL2-NEXT: br label [[LOOP:%.*]] +; VEC4_INTERL2: loop: +; VEC4_INTERL2-NEXT: br i1 poison, label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP16:![0-9]+]] +; VEC4_INTERL2: exit: +; VEC4_INTERL2-NEXT: ret i32 0 +; +; VEC1_INTERL2-LABEL: @float_induction_with_dbg_on_fadd( +; VEC1_INTERL2-NEXT: entry: +; VEC1_INTERL2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; VEC1_INTERL2: vector.ph: +; VEC1_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] +; VEC1_INTERL2: vector.body: +; VEC1_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; VEC1_INTERL2-NEXT: [[TMP0:%.*]] = or disjoint i64 [[INDEX]], 1 +; VEC1_INTERL2-NEXT: [[TMP1:%.*]] = getelementptr float, ptr null, i64 [[INDEX]] +; VEC1_INTERL2-NEXT: [[TMP2:%.*]] = getelementptr float, ptr null, i64 [[TMP0]] +; VEC1_INTERL2-NEXT: store float poison, ptr [[TMP1]], align 8 +; VEC1_INTERL2-NEXT: store float poison, ptr [[TMP2]], align 8 +; VEC1_INTERL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VEC1_INTERL2-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 +; VEC1_INTERL2-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; VEC1_INTERL2: middle.block: +; VEC1_INTERL2-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; VEC1_INTERL2: scalar.ph: +; VEC1_INTERL2-NEXT: br label [[LOOP:%.*]] +; VEC1_INTERL2: loop: +; VEC1_INTERL2-NEXT: br i1 poison, label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP16:![0-9]+]] +; VEC1_INTERL2: exit: +; VEC1_INTERL2-NEXT: ret i32 0 +; +; VEC2_INTERL1_PRED_STORE-LABEL: @float_induction_with_dbg_on_fadd( +; VEC2_INTERL1_PRED_STORE-NEXT: entry: +; VEC2_INTERL1_PRED_STORE-NEXT: br label [[VECTOR_BODY:%.*]] +; VEC2_INTERL1_PRED_STORE: vector.body: +; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP0:%.*]] = getelementptr float, ptr null, i64 [[INDEX]] +; VEC2_INTERL1_PRED_STORE-NEXT: store <2 x float> poison, ptr [[TMP0]], align 8 +; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 +; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[TMP1]], label [[EXIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; VEC2_INTERL1_PRED_STORE: exit: +; VEC2_INTERL1_PRED_STORE-NEXT: ret i32 0 +; +entry: + br label %loop + +loop: + %fp.iv = phi float [ 0.000000e+00, %entry ], [ %fp.iv.next, %loop ], !dbg !4 + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %fp.iv.next = fadd reassoc float %fp.iv, 0.000000e+00 + %gep = getelementptr float, ptr null, i64 %iv + store float %fp.iv.next, ptr %gep, align 8 + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 200 + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret i32 0 +} + +!llvm.module.flags = !{!3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1) +!1 = !DIFile(filename: "bbi-99425.c", directory: "/tmp") +!2 = !{} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !DILocation(line: 5, column: 12, scope: !8) +!8 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 3, type: !9, unit: !0, retainedNodes: !2) +!9 = !DISubroutineType(types: !2) From a3cf01d58587d81b184d40091a86d6b8bf92d240 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Tue, 
24 Sep 2024 11:21:24 +0100 Subject: [PATCH 10/22] [lldb][docs] Resurrect the information on adding a new language (#109427) This got deleted in e078c9507c3abb4d9bb2265c366b26557880a3e3, I presume accidentally, because it didn't have a corresponding rst file for it. So I've brought it back and converted it into Markdown. The content remains accurate, from what I know at least. It's a bit "now draw the rest of the owl" but if nothing else, it gives you a bunch of important classes to go and research as a starting point. You can see the original content here: https://github.com/llvm/llvm-project/blob/5d71fc5d7b5ffe2323418a09db6eddaf84d6c662/lldb/www/adding-language-support.html --- lldb/docs/index.rst | 1 + lldb/docs/resources/addinglanguagesupport.md | 95 ++++++++++++++++++++ 2 files changed, 96 insertions(+) create mode 100644 lldb/docs/resources/addinglanguagesupport.md diff --git a/lldb/docs/index.rst b/lldb/docs/index.rst index d9b8e589eb2ac0..dd44a8430add80 100644 --- a/lldb/docs/index.rst +++ b/lldb/docs/index.rst @@ -163,6 +163,7 @@ interesting areas to contribute to lldb. resources/caveats resources/projects resources/lldbdap + resources/addinglanguagesupport Public C++ API Private C++ API diff --git a/lldb/docs/resources/addinglanguagesupport.md b/lldb/docs/resources/addinglanguagesupport.md new file mode 100644 index 00000000000000..28789048643d77 --- /dev/null +++ b/lldb/docs/resources/addinglanguagesupport.md @@ -0,0 +1,95 @@ +# Adding Programming Language Support + +LLDB has been architected to make it straightforward to add support for a +programming language. Only a small enum in core LLDB needs to be modified to +make LLDB aware of a new programming language. Everything else can be supplied +in derived classes that need not even be present in the core LLDB repository. +This makes it convenient for developers adding language support in downstream +repositories since it practically eliminates the potential for merge conflicts. + +The basic steps are: +* Add the language to the `LanguageType` enum. +* Add a `TypeSystem` for the language. +* Add expression evaluation support. + +Additionally, you may want to create a `Language` and `LanguageRuntime` plugin +for your language, which enables support for advanced features like dynamic +typing and data formatting. + +## Add the Language to the LanguageType enum + +The `LanguageType` enum +(see [lldb-enumerations.h](https://github.com/llvm/llvm-project/blob/main/lldb/include/lldb/lldb-enumerations.h)) +contains a list of every language known to LLDB. It is the one place where +support for a language must live that will need to merge cleanly with upstream +LLDB if you are developing your language support in a separate branch. When +adding support for a language previously unknown to LLDB, start by adding an +enumeration entry to `LanguageType`. + +## Add a TypeSystem for the Language + +Both [Module](https://github.com/llvm/llvm-project/blob/main/lldb/include/lldb/Core/Module.h) +and [Target](https://github.com/llvm/llvm-project/blob/main/lldb/include/lldb/Target/Target.h) +support the retrieval of a `TypeSystem` instance via `GetTypeSystemForLanguage()`. +For `Module`, this method is directly on the `Module` instance. For `Target`, +this is retrieved indirectly via the `TypeSystemMap` for the `Target` instance. + +The `TypeSystem` instance returned by the `Target` is expected to be capable of +evaluating expressions, while the `TypeSystem` instance returned by the `Module` +is not. 
If you want to support expression evaluation for your language, you could +consider one of the following approaches: +* Implement a single `TypeSystem` class that supports evaluation when given an + optional `Target`, implementing all the expression evaluation methods on the + `TypeSystem`. +* Create multiple `TypeSystem` classes, one for evaluation and one for static + `Module` usage. + +For clang and Swift, the latter approach was chosen, primarily to make it +clearer that evaluation with the static `Module`-returned `TypeSystem` instances +makes no sense, and to have them error out on those calls. But either approach is +fine. + +## Creating Types + +Your `TypeSystem` will need an approach for creating types based on a set of +`Module`s. If your type info is going to come from DWARF info, you will want to +subclass [DWARFASTParser](https://github.com/llvm/llvm-project/blob/main/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.h). + + +## Add Expression Evaluation Support + +Expression evaluation support is enabled by implementing the relevant methods on +a `TypeSystem`-derived class. Search for `Expression` in the +[TypeSystem header](https://github.com/llvm/llvm-project/blob/main/lldb/include/lldb/Symbol/TypeSystem.h) +to find the methods to implement. + +## Type Completion + +There are three levels of type completion, each requiring more type information: +1. Pointer size: When you have a forward decl or a reference, and that's all you + need. At this stage, the pointer size is all you need. +2. Layout info: You need the size of an instance of the type, but you still don't + need to know all the guts of the type. +3. Full type info: Here you need everything, because you're playing with + internals of it, such as modifying a member variable. + +Ensure you never complete more of a type than is needed for a given situation. +This will keep your type system from doing more work than necessary. + +## Language and LanguageRuntime Plugins + +If you followed the steps outlined above, you have already taught LLDB a great +deal about your language. If your language's runtime model and fundamental data +types don't differ much from the C model, you are pretty much done. + +However, it is likely that your language offers its own data types for things +like strings and arrays, and probably has a notion of dynamic types, where the +effective type of a variable can only be known at runtime. + +These tasks are covered by two plugins: +* a `LanguageRuntime` plugin, which provides LLDB with a dynamic view of your + language; this plugin answers questions that require a live process to acquire + information (for example dynamic type resolution). +* a `Language` plugin, which provides LLDB with a static view of your language; + questions that are statically knowable and do not require a process are + answered by this plugin (for example data formatters). \ No newline at end of file From d4f38f43f5402041dd36977baa459830011d6ac6 Mon Sep 17 00:00:00 2001 From: Nashe Mncube Date: Tue, 24 Sep 2024 11:26:06 +0100 Subject: [PATCH 11/22] [LLVM][ARM][CodeGen] Define branch instruction alignment for m85 and m7 (#109647) Branch instruction alignments were not defined for cortex-m85 and cortex-m7, which missed an optimisation opportunity. With this patch we see performance improvements as high as 5% on some benchmarks with most around 1%.
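Note: the effect of the new subtarget feature can be pictured with this small sketch (illustrative values only; the actual alignment is applied by the generic loop-alignment machinery, not by code like this):

    // The subtarget stores the preferred loop alignment as a log2 value, so
    // "loop-align-64" (PrefLoopLogAlignment = 3) requests loop headers on
    // 1 << 3 == 8-byte (64-bit) boundaries.
    unsigned PrefLoopLogAlignment = 3; // set by FeaturePrefLoopAlign64
    llvm::Align PrefLoopAlignment(1ULL << PrefLoopLogAlignment); // Align(8)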
--- llvm/lib/Target/ARM/ARMFeatures.td | 3 +++ llvm/lib/Target/ARM/ARMProcessors.td | 2 ++ llvm/test/CodeGen/ARM/preferred-function-alignment.ll | 5 +++-- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMFeatures.td b/llvm/lib/Target/ARM/ARMFeatures.td index 8b0ade54b46d3c..dc0e86c696f63a 100644 --- a/llvm/lib/Target/ARM/ARMFeatures.td +++ b/llvm/lib/Target/ARM/ARMFeatures.td @@ -375,6 +375,9 @@ def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Prefers32BitThumb", "true def FeaturePrefLoopAlign32 : SubtargetFeature<"loop-align", "PrefLoopLogAlignment","2", "Prefer 32-bit alignment for loops">; +def FeaturePrefLoopAlign64 : SubtargetFeature<"loop-align-64", "PrefLoopLogAlignment","3", + "Prefer 64-bit alignment for loops">; + def FeatureMVEVectorCostFactor1 : SubtargetFeature<"mve1beat", "MVEVectorCostFactor", "4", "Model MVE instructions as a 1 beat per tick architecture">; diff --git a/llvm/lib/Target/ARM/ARMProcessors.td b/llvm/lib/Target/ARM/ARMProcessors.td index e4e122a0d1339b..a66a2c0b1981d8 100644 --- a/llvm/lib/Target/ARM/ARMProcessors.td +++ b/llvm/lib/Target/ARM/ARMProcessors.td @@ -344,6 +344,7 @@ def : ProcessorModel<"cortex-m4", CortexM4Model, [ARMv7em, def : ProcessorModel<"cortex-m7", CortexM7Model, [ARMv7em, ProcM7, FeatureFPARMv8_D16, + FeaturePrefLoopAlign64, FeatureUseMIPipeliner, FeatureUseMISched]>; @@ -385,6 +386,7 @@ def : ProcessorModel<"cortex-m85", CortexM85Model, [ARMv81mMainline, FeatureDSP, FeatureFPARMv8_D16, FeaturePACBTI, + FeaturePrefLoopAlign64, FeatureUseMISched, HasMVEFloatOps]>; diff --git a/llvm/test/CodeGen/ARM/preferred-function-alignment.ll b/llvm/test/CodeGen/ARM/preferred-function-alignment.ll index afe64a22c5e808..f3a227c4765eb8 100644 --- a/llvm/test/CodeGen/ARM/preferred-function-alignment.ll +++ b/llvm/test/CodeGen/ARM/preferred-function-alignment.ll @@ -1,14 +1,15 @@ -; RUN: llc -mtriple=arm-none-eabi -mcpu=cortex-m85 < %s | FileCheck --check-prefixes=CHECK,ALIGN-16,ALIGN-CS-16 %s +; RUN: llc -mtriple=arm-none-eabi -mcpu=cortex-m85 < %s | FileCheck --check-prefixes=CHECK,ALIGN-64,ALIGN-CS-16 %s ; RUN: llc -mtriple=arm-none-eabi -mcpu=cortex-m23 < %s | FileCheck --check-prefixes=CHECK,ALIGN-16,ALIGN-CS-16 %s ; RUN: llc -mtriple=arm-none-eabi -mcpu=cortex-a5 < %s | FileCheck --check-prefixes=CHECK,ALIGN-32,ALIGN-CS-32 %s ; RUN: llc -mtriple=arm-none-eabi -mcpu=cortex-m33 < %s | FileCheck --check-prefixes=CHECK,ALIGN-32,ALIGN-CS-16 %s ; RUN: llc -mtriple=arm-none-eabi -mcpu=cortex-m55 < %s | FileCheck --check-prefixes=CHECK,ALIGN-32,ALIGN-CS-16 %s - +; RUN: llc -mtriple=arm-none-eabi -mcpu=cortex-m7 < %s | FileCheck --check-prefixes=CHECK,ALIGN-64,ALIGN-CS-16 %s ; CHECK-LABEL: test ; ALIGN-16: .p2align 1 ; ALIGN-32: .p2align 2 +; ALIGN-64: .p2align 3 define void @test() { ret void From ea902d1b36e4e3a7d7bdd0f7bce3c460b6dd6e80 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Tue, 24 Sep 2024 12:08:35 +0200 Subject: [PATCH 12/22] [IR] Remove deprecated opaque pointer migration methods Remove the following methods: * Type::getNonOpaquePointerElementType() * Type::isOpaquePointerTy() * LLVMContext::supportsTypedPointers() * LLVMContext::setOpaquePointers() These were used temporarily during the opaque pointers migration, and are no longer needed. 
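Note: downstream code still using the removed methods can migrate with a change of roughly this shape (an illustrative sketch; the "before" behavior matches the deprecated definitions visible in the diff below):

    // Before:
    //   Ctx.setOpaquePointers(true);                      // was already a no-op
    //   if (Ty->isOpaquePointerTy()) ...                  // same as isPointerTy()
    //   Type *Elt = Ty->getNonOpaquePointerElementType(); // was unreachable
    // After: all pointers are opaque, so only pointer-ness can be queried;
    // element types must be tracked separately where they are still needed.
    if (Ty->isPointerTy()) {
      // No element type to ask for here.
    }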
--- llvm/include/llvm/IR/LLVMContext.h | 11 ----------- llvm/include/llvm/IR/Type.h | 12 ------------ llvm/lib/IR/LLVMContext.cpp | 8 -------- 3 files changed, 31 deletions(-) diff --git a/llvm/include/llvm/IR/LLVMContext.h b/llvm/include/llvm/IR/LLVMContext.h index 6ffa2bdaa319a7..558816e146587a 100644 --- a/llvm/include/llvm/IR/LLVMContext.h +++ b/llvm/include/llvm/IR/LLVMContext.h @@ -316,17 +316,6 @@ class LLVMContext { /// LLVMContext is used by compilation. void setOptPassGate(OptPassGate&); - /// Set whether opaque pointers are enabled. The method may be called multiple - /// times, but only with the same value. Note that creating a pointer type or - /// otherwise querying the opaque pointer mode performs an implicit set to - /// the default value. - [[deprecated("Opaque pointers are always enabled")]] - void setOpaquePointers(bool Enable) const; - - /// Whether typed pointers are supported. If false, all pointers are opaque. - [[deprecated("Always returns false")]] - bool supportsTypedPointers() const; - /// Get or set the current "default" target CPU (target-cpu function /// attribute). The intent is that compiler frontends will set this to a value /// that reflects the attribute that a function would get "by default" without diff --git a/llvm/include/llvm/IR/Type.h b/llvm/include/llvm/IR/Type.h index b88c8aece8126c..2f53197df19998 100644 --- a/llvm/include/llvm/IR/Type.h +++ b/llvm/include/llvm/IR/Type.h @@ -250,10 +250,6 @@ class Type { /// True if this is an instance of PointerType. bool isPointerTy() const { return getTypeID() == PointerTyID; } - /// True if this is an instance of an opaque PointerType. - LLVM_DEPRECATED("Use isPointerTy() instead", "isPointerTy") - bool isOpaquePointerTy() const { return isPointerTy(); }; - /// Return true if this is a pointer type or a vector of pointer types. bool isPtrOrPtrVectorTy() const { return getScalarType()->isPointerTy(); } @@ -406,14 +402,6 @@ class Type { inline StringRef getTargetExtName() const; - /// Only use this method in code that is not reachable with opaque pointers, - /// or part of deprecated methods that will be removed as part of the opaque - /// pointers transition. - [[deprecated("Pointers no longer have element types")]] - Type *getNonOpaquePointerElementType() const { - llvm_unreachable("Pointers no longer have element types"); - } - /// Given vector type, change the element type, /// whilst keeping the old number of elements. /// For non-vectors simply returns \p EltTy. 
diff --git a/llvm/lib/IR/LLVMContext.cpp b/llvm/lib/IR/LLVMContext.cpp index c0fee93a233808..22e60772def43f 100644 --- a/llvm/lib/IR/LLVMContext.cpp +++ b/llvm/lib/IR/LLVMContext.cpp @@ -377,14 +377,6 @@ std::unique_ptr<DiagnosticHandler> LLVMContext::getDiagnosticHandler() { return std::move(pImpl->DiagHandler); } -void LLVMContext::setOpaquePointers(bool Enable) const { - assert(Enable && "Cannot disable opaque pointers"); -} - -bool LLVMContext::supportsTypedPointers() const { - return false; -} - StringRef LLVMContext::getDefaultTargetCPU() { return pImpl->DefaultTargetCPU; } From 3d34053af61ff45e05d230d2678eb8e95322eb14 Mon Sep 17 00:00:00 2001 From: Shengchen Kan Date: Tue, 24 Sep 2024 18:39:48 +0800 Subject: [PATCH 13/22] [X86,MC] Add relocation R_X86_64_REX2_GOTPCRELX (#106681) For mov name@GOTPCREL(%rip), %reg test %reg, name@GOTPCREL(%rip) binop name@GOTPCREL(%rip), %reg where binop is one of adc, add, and, cmp, or, sbb, sub, xor instructions, add `R_X86_64_REX2_GOTPCRELX`/`R_X86_64_CODE_4_GOTPCRELX` = 43 if the instruction starts at 4 bytes before the relocation offset. It is similar to R_X86_64_GOTPCRELX. The linker can treat `R_X86_64_REX2_GOTPCRELX`/`R_X86_64_CODE_4_GOTPCRELX` as `R_X86_64_GOTPCREL` or convert the above instructions to lea name(%rip), %reg mov $name, %reg test $name, %reg binop $name, %reg if the first byte of the instruction at the relocation `offset - 4` is `0xd5` (namely, encoded w/ REX2 prefix) when possible. Binutils patch: https://github.com/bminor/binutils-gdb/commit/3d5a60de52556f6a53d71d7e607c6696450ae3e4 Binutils mailthread: https://sourceware.org/pipermail/binutils/2023-December/131462.html ABI discussion: https://groups.google.com/g/x86-64-abi/c/KbzaNHRB6QU Blog: https://kanrobert.github.io/rfc/All-about-APX-relocation --- clang/test/Driver/relax.s | 2 ++ .../llvm/BinaryFormat/ELFRelocs/x86_64.def | 1 + llvm/lib/MC/MCTargetOptionsCommandFlags.cpp | 4 +-- .../Target/X86/MCTargetDesc/X86AsmBackend.cpp | 6 ++++ .../X86/MCTargetDesc/X86ELFObjectWriter.cpp | 7 +++- .../Target/X86/MCTargetDesc/X86FixupKinds.h | 4 +++ .../X86/MCTargetDesc/X86MCCodeEmitter.cpp | 19 +++++------ .../X86/MCTargetDesc/X86MachObjectWriter.cpp | 6 +++- .../MCTargetDesc/X86WinCOFFObjectWriter.cpp | 2 ++ llvm/test/MC/ELF/relocation-alias.s | 3 ++ llvm/test/MC/X86/gotpcrelx.s | 34 +++++++++++++++++++ llvm/test/MC/X86/reloc-directive-elf-64.s | 3 ++ 12 files changed, 77 insertions(+), 14 deletions(-) diff --git a/clang/test/Driver/relax.s b/clang/test/Driver/relax.s index 154d4db0a31385..b4a696a328eb56 100644 --- a/clang/test/Driver/relax.s +++ b/clang/test/Driver/relax.s @@ -8,5 +8,7 @@ // RUN: llvm-readobj -r %t | FileCheck --check-prefix=REL %s // REL: R_X86_64_REX_GOTPCRELX foo +// REL: R_X86_64_REX2_GOTPCRELX foo movq foo@GOTPCREL(%rip), %rax + movq foo@GOTPCREL(%rip), %r16 diff --git a/llvm/include/llvm/BinaryFormat/ELFRelocs/x86_64.def b/llvm/include/llvm/BinaryFormat/ELFRelocs/x86_64.def index 18fdcf9472dc48..161b1969abfeb4 100644 --- a/llvm/include/llvm/BinaryFormat/ELFRelocs/x86_64.def +++ b/llvm/include/llvm/BinaryFormat/ELFRelocs/x86_64.def @@ -43,3 +43,4 @@ ELF_RELOC(R_X86_64_TLSDESC, 36) ELF_RELOC(R_X86_64_IRELATIVE, 37) ELF_RELOC(R_X86_64_GOTPCRELX, 41) ELF_RELOC(R_X86_64_REX_GOTPCRELX, 42) +ELF_RELOC(R_X86_64_REX2_GOTPCRELX, 43) diff --git a/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp b/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp index 1a4f7e93eeb74a..92618bdabbe519 100644 --- a/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp +++ b/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp @@ -145,8 +145,8 @@
llvm::mc::RegisterMCTargetOptionsFlags::RegisterMCTargetOptionsFlags() { static cl::opt<bool> X86RelaxRelocations( "x86-relax-relocations", - cl::desc( - "Emit GOTPCRELX/REX_GOTPCRELX instead of GOTPCREL on x86-64 ELF"), + cl::desc("Emit GOTPCRELX/REX_GOTPCRELX/REX2_GOTPCRELX instead of " + "GOTPCREL on x86-64 ELF"), cl::init(true)); MCBINDOPT(X86RelaxRelocations); diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 1d08853faf582e..2f6b55b0d6023e 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -629,15 +629,19 @@ std::optional<MCFixupKind> X86AsmBackend::getFixupKind(StringRef Name) const { const MCFixupKindInfo &X86AsmBackend::getFixupKindInfo(MCFixupKind Kind) const { const static MCFixupKindInfo Infos[X86::NumTargetFixupKinds] = { + // clang-format off {"reloc_riprel_4byte", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, {"reloc_riprel_4byte_movq_load", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, + {"reloc_riprel_4byte_movq_load_rex2", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, {"reloc_riprel_4byte_relax", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, {"reloc_riprel_4byte_relax_rex", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, + {"reloc_riprel_4byte_relax_rex2", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, {"reloc_signed_4byte", 0, 32, 0}, {"reloc_signed_4byte_relax", 0, 32, 0}, {"reloc_global_offset_table", 0, 32, 0}, {"reloc_global_offset_table8", 0, 64, 0}, {"reloc_branch_4byte_pcrel", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, + // clang-format on }; // Fixup kinds from .reloc directive are like R_386_NONE/R_X86_64_NONE. They @@ -678,7 +682,9 @@ static unsigned getFixupKindSize(unsigned Kind) { case X86::reloc_riprel_4byte: case X86::reloc_riprel_4byte_relax: case X86::reloc_riprel_4byte_relax_rex: + case X86::reloc_riprel_4byte_relax_rex2: case X86::reloc_riprel_4byte_movq_load: + case X86::reloc_riprel_4byte_movq_load_rex2: case X86::reloc_signed_4byte: case X86::reloc_signed_4byte_relax: case X86::reloc_global_offset_table: diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index 0b2efdfc16cc5d..90222278d1ad6f 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -74,7 +74,9 @@ static X86_64RelType getType64(MCFixupKind Kind, case X86::reloc_riprel_4byte: case X86::reloc_riprel_4byte_relax: case X86::reloc_riprel_4byte_relax_rex: + case X86::reloc_riprel_4byte_relax_rex2: case X86::reloc_riprel_4byte_movq_load: + case X86::reloc_riprel_4byte_movq_load_rex2: return RT64_32; case X86::reloc_branch_4byte_pcrel: Modifier = MCSymbolRefExpr::VK_PLT; @@ -205,7 +207,7 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc, case MCSymbolRefExpr::VK_GOTPCREL: checkIs32(Ctx, Loc, Type); // Older versions of ld.bfd/ld.gold/lld - // do not support GOTPCRELX/REX_GOTPCRELX, + // do not support GOTPCRELX/REX_GOTPCRELX/REX2_GOTPCRELX, // and we want to keep back-compatibility.
if (!Ctx.getTargetOptions()->X86RelaxRelocations) return ELF::R_X86_64_GOTPCREL; @@ -217,6 +219,9 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc, case X86::reloc_riprel_4byte_relax_rex: case X86::reloc_riprel_4byte_movq_load: return ELF::R_X86_64_REX_GOTPCRELX; + case X86::reloc_riprel_4byte_relax_rex2: + case X86::reloc_riprel_4byte_movq_load_rex2: + return ELF::R_X86_64_REX2_GOTPCRELX; } llvm_unreachable("unexpected relocation type!"); case MCSymbolRefExpr::VK_GOTPCREL_NORELAX: diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h b/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h index 2d5217115d07cb..29bb7eebae3f22 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h @@ -16,10 +16,14 @@ namespace X86 { enum Fixups { reloc_riprel_4byte = FirstTargetFixupKind, // 32-bit rip-relative reloc_riprel_4byte_movq_load, // 32-bit rip-relative in movq + reloc_riprel_4byte_movq_load_rex2, // 32-bit rip-relative in movq + // with rex2 prefix reloc_riprel_4byte_relax, // 32-bit rip-relative in relaxable // instruction reloc_riprel_4byte_relax_rex, // 32-bit rip-relative in relaxable // instruction with rex prefix + reloc_riprel_4byte_relax_rex2, // 32-bit rip-relative in relaxable + // instruction with rex2 prefix reloc_signed_4byte, // 32-bit signed. Unlike FK_Data_4 // this will be sign extended at // runtime. diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 71d42863fd5857..206436191c2584 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -568,8 +568,10 @@ void X86MCCodeEmitter::emitImmediate(const MCOperand &DispOp, SMLoc Loc, if (FixupKind == FK_PCRel_4 || FixupKind == MCFixupKind(X86::reloc_riprel_4byte) || FixupKind == MCFixupKind(X86::reloc_riprel_4byte_movq_load) || + FixupKind == MCFixupKind(X86::reloc_riprel_4byte_movq_load_rex2) || FixupKind == MCFixupKind(X86::reloc_riprel_4byte_relax) || FixupKind == MCFixupKind(X86::reloc_riprel_4byte_relax_rex) || + FixupKind == MCFixupKind(X86::reloc_riprel_4byte_relax_rex2) || FixupKind == MCFixupKind(X86::reloc_branch_4byte_pcrel)) { ImmOffset -= 4; // If this is a pc-relative load off _GLOBAL_OFFSET_TABLE_: @@ -637,12 +639,11 @@ void X86MCCodeEmitter::emitMemModRMByte( default: return X86::reloc_riprel_4byte; case X86::MOV64rm: - // movq loads is a subset of reloc_riprel_4byte_relax_rex. It is a + // movq loads is a subset of reloc_riprel_4byte_relax_rex/rex2. It is a // special case because COFF and Mach-O don't support ELF's more - // flexible R_X86_64_REX_GOTPCRELX relaxation. - // TODO: Support new relocation for REX2. - assert(Kind == REX || Kind == REX2); - return X86::reloc_riprel_4byte_movq_load; + // flexible R_X86_64_REX_GOTPCRELX/R_X86_64_REX2_GOTPCRELX relaxation. + return Kind == REX2 ? X86::reloc_riprel_4byte_movq_load_rex2 + : X86::reloc_riprel_4byte_movq_load; case X86::ADC32rm: case X86::ADD32rm: case X86::AND32rm: @@ -665,11 +666,9 @@ void X86MCCodeEmitter::emitMemModRMByte( case X86::SBB64rm: case X86::SUB64rm: case X86::XOR64rm: - // We haven't support relocation for REX2 prefix, so temporarily use REX - // relocation. - // TODO: Support new relocation for REX2. - return (Kind == REX || Kind == REX2) ? X86::reloc_riprel_4byte_relax_rex - : X86::reloc_riprel_4byte_relax; + return Kind == REX2 ? X86::reloc_riprel_4byte_relax_rex2 + : Kind == REX ? 
X86::reloc_riprel_4byte_relax_rex + : X86::reloc_riprel_4byte_relax; } }(); diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index ec95b1ffec387d..41ce5c9fcb82ad 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -66,8 +66,10 @@ class X86MachObjectWriter : public MCMachObjectTargetWriter { static bool isFixupKindRIPRel(unsigned Kind) { return Kind == X86::reloc_riprel_4byte || Kind == X86::reloc_riprel_4byte_movq_load || + Kind == X86::reloc_riprel_4byte_movq_load_rex2 || Kind == X86::reloc_riprel_4byte_relax || - Kind == X86::reloc_riprel_4byte_relax_rex; + Kind == X86::reloc_riprel_4byte_relax_rex || + Kind == X86::reloc_riprel_4byte_relax_rex2; } static unsigned getFixupKindLog2Size(unsigned Kind) { @@ -83,7 +85,9 @@ static unsigned getFixupKindLog2Size(unsigned Kind) { case X86::reloc_riprel_4byte: case X86::reloc_riprel_4byte_relax: case X86::reloc_riprel_4byte_relax_rex: + case X86::reloc_riprel_4byte_relax_rex2: case X86::reloc_riprel_4byte_movq_load: + case X86::reloc_riprel_4byte_movq_load_rex2: case X86::reloc_signed_4byte: case X86::reloc_signed_4byte_relax: case X86::reloc_branch_4byte_pcrel: diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp index 10fc176b59d8ab..7740500fb41830 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp @@ -66,8 +66,10 @@ unsigned X86WinCOFFObjectWriter::getRelocType(MCContext &Ctx, case FK_PCRel_4: case X86::reloc_riprel_4byte: case X86::reloc_riprel_4byte_movq_load: + case X86::reloc_riprel_4byte_movq_load_rex2: case X86::reloc_riprel_4byte_relax: case X86::reloc_riprel_4byte_relax_rex: + case X86::reloc_riprel_4byte_relax_rex2: case X86::reloc_branch_4byte_pcrel: return COFF::IMAGE_REL_AMD64_REL32; case FK_Data_4: diff --git a/llvm/test/MC/ELF/relocation-alias.s b/llvm/test/MC/ELF/relocation-alias.s index 51fb0c37052fe7..66bf2ceea508ba 100644 --- a/llvm/test/MC/ELF/relocation-alias.s +++ b/llvm/test/MC/ELF/relocation-alias.s @@ -16,7 +16,10 @@ movabsq $memcpy+2, %rax # CHECK: movq (%rip), %rax # CHECK-NEXT: R_X86_64_REX_GOTPCRELX abs-0x4 +# CHECK: movq (%rip), %r16 +# CHECK-NEXT: R_X86_64_REX2_GOTPCRELX abs-0x4 movq abs@GOTPCREL(%rip), %rax +movq abs@GOTPCREL(%rip), %r16 abs = 42 # CHECK: movabsq $0, %rbx diff --git a/llvm/test/MC/X86/gotpcrelx.s b/llvm/test/MC/X86/gotpcrelx.s index e63e3e9a946fd1..5a8ba454bc904c 100644 --- a/llvm/test/MC/X86/gotpcrelx.s +++ b/llvm/test/MC/X86/gotpcrelx.s @@ -37,6 +37,16 @@ # CHECK-NEXT: R_X86_64_REX_GOTPCRELX sbb # CHECK-NEXT: R_X86_64_REX_GOTPCRELX sub # CHECK-NEXT: R_X86_64_REX_GOTPCRELX xor +# CHECK-NEXT: R_X86_64_REX2_GOTPCRELX mov +# CHECK-NEXT: R_X86_64_REX2_GOTPCRELX test +# CHECK-NEXT: R_X86_64_REX2_GOTPCRELX adc +# CHECK-NEXT: R_X86_64_REX2_GOTPCRELX add +# CHECK-NEXT: R_X86_64_REX2_GOTPCRELX and +# CHECK-NEXT: R_X86_64_REX2_GOTPCRELX cmp +# CHECK-NEXT: R_X86_64_REX2_GOTPCRELX or +# CHECK-NEXT: R_X86_64_REX2_GOTPCRELX sbb +# CHECK-NEXT: R_X86_64_REX2_GOTPCRELX sub +# CHECK-NEXT: R_X86_64_REX2_GOTPCRELX xor # CHECK-NEXT: } # NORELAX-NEXT: R_X86_64_GOTPCREL mov @@ -71,6 +81,16 @@ # NORELAX-NEXT: R_X86_64_GOTPCREL sbb # NORELAX-NEXT: R_X86_64_GOTPCREL sub # NORELAX-NEXT: R_X86_64_GOTPCREL xor +# NORELAX-NEXT: R_X86_64_GOTPCREL mov +# NORELAX-NEXT: R_X86_64_GOTPCREL test +# 
NORELAX-NEXT: R_X86_64_GOTPCREL adc +# NORELAX-NEXT: R_X86_64_GOTPCREL add +# NORELAX-NEXT: R_X86_64_GOTPCREL and +# NORELAX-NEXT: R_X86_64_GOTPCREL cmp +# NORELAX-NEXT: R_X86_64_GOTPCREL or +# NORELAX-NEXT: R_X86_64_GOTPCREL sbb +# NORELAX-NEXT: R_X86_64_GOTPCREL sub +# NORELAX-NEXT: R_X86_64_GOTPCREL xor +# NORELAX-NEXT: } movl mov@GOTPCREL(%rip), %eax @@ -108,10 +128,22 @@ sbb sbb@GOTPCREL(%rip), %rax sub sub@GOTPCREL(%rip), %rax xor xor@GOTPCREL(%rip), %rax +movq mov@GOTPCREL(%rip), %r16 +test %r16, test@GOTPCREL(%rip) +adc adc@GOTPCREL(%rip), %r16 +add add@GOTPCREL(%rip), %r16 +and and@GOTPCREL(%rip), %r16 +cmp cmp@GOTPCREL(%rip), %r16 +or or@GOTPCREL(%rip), %r16 +sbb sbb@GOTPCREL(%rip), %r16 +sub sub@GOTPCREL(%rip), %r16 +xor xor@GOTPCREL(%rip), %r16 + # COMMON-NEXT: Section ({{.*}}) .rela.norelax { # COMMON-NEXT: R_X86_64_GOTPCREL mov 0x0 # COMMON-NEXT: R_X86_64_GOTPCREL mov 0xFFFFFFFFFFFFFFFD # COMMON-NEXT: R_X86_64_GOTPCREL mov 0xFFFFFFFFFFFFFFFC +# COMMON-NEXT: R_X86_64_GOTPCREL mov 0xFFFFFFFFFFFFFFFD # COMMON-NEXT: } # COMMON-NEXT: ] @@ -123,3 +155,5 @@ movl mov@GOTPCREL+4(%rip), %eax movq mov@GOTPCREL+1(%rip), %rax ## We could emit R_X86_64_GOTPCRELX, but it is probably unnecessary. movl mov@GOTPCREL+0(%rip), %eax +## Don't emit R_X86_64_GOTPCRELX. +movq mov@GOTPCREL+1(%rip), %r16 diff --git a/llvm/test/MC/X86/reloc-directive-elf-64.s b/llvm/test/MC/X86/reloc-directive-elf-64.s index 8f5d8c895e7d76..323603efc70618 100644 --- a/llvm/test/MC/X86/reloc-directive-elf-64.s +++ b/llvm/test/MC/X86/reloc-directive-elf-64.s @@ -9,6 +9,7 @@ # PRINT-NEXT: .reloc 0, R_X86_64_64, .data+2 # PRINT-NEXT: .reloc 0, R_X86_64_GOTPCRELX, foo+3 # PRINT-NEXT: .reloc 0, R_X86_64_REX_GOTPCRELX, 5 +# PRINT-NEXT: .reloc 0, R_X86_64_REX2_GOTPCRELX, 7 # PRINT: .reloc 0, BFD_RELOC_NONE, 9 # PRINT-NEXT: .reloc 0, BFD_RELOC_8, 9 # PRINT-NEXT: .reloc 0, BFD_RELOC_16, 9 @@ -21,6 +22,7 @@ # CHECK-NEXT: 0x0 R_X86_64_64 .data 0x2 # CHECK-NEXT: 0x0 R_X86_64_GOTPCRELX foo 0x3 # CHECK-NEXT: 0x0 R_X86_64_REX_GOTPCRELX - 0x5 +# CHECK-NEXT: 0x0 R_X86_64_REX2_GOTPCRELX - 0x7 # CHECK-NEXT: 0x0 R_X86_64_NONE - 0x9 # CHECK-NEXT: 0x0 R_X86_64_8 - 0x9 # CHECK-NEXT: 0x0 R_X86_64_16 - 0x9 @@ -37,6 +39,7 @@ .reloc 0, R_X86_64_64, .data+2 .reloc 0, R_X86_64_GOTPCRELX, foo+3 .reloc 0, R_X86_64_REX_GOTPCRELX, 5 + .reloc 0, R_X86_64_REX2_GOTPCRELX, 7 .reloc 0, BFD_RELOC_NONE, 9 .reloc 0, BFD_RELOC_8, 9 From 6cfe6a6b3e9578be80120add7fbe19506f747196 Mon Sep 17 00:00:00 2001 From: Georgi Mirazchiyski Date: Tue, 24 Sep 2024 11:47:57 +0100 Subject: [PATCH 14/22] [NFC][AMDGPU] Assert no bad shift operations will happen (#108416) The assumption in the asserts is based on the fact that no SGPR/VGPR register Arg mask in the ISelLowering and Legalizer can equal zero. They are implicitly set to ~0 by default (meaning non-masked) or explicitly to a non-zero value. The `optimizeCompareInstr` case differs from the cases described above: it requires the mask to be a power of two because it is a special-case optimization, hence in this case we still cannot have an invalid shift. This commit also silences static analysis tools with respect to potential bad shifts that could result from the output of `countr_zero(Mask)`.
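To make the guarded pattern concrete outside LLVM, here is a minimal standalone sketch using the C++20 <bit> equivalents of `llvm::has_single_bit` and `llvm::countr_zero` (illustrative only, not code from this patch):

#include <bit>
#include <cassert>
#include <cstdint>

uint64_t isolateMaskBit(uint64_t Mask) {
  // Non-zero and power-of-two, exactly what the new assert enforces.
  assert(std::has_single_bit(Mask) && "Invalid mask.");
  // countr_zero(0) would return 64, and shifting a 64-bit value by 64 is
  // undefined behavior; the assert above rules that case out.
  unsigned BitNo = std::countr_zero(Mask); // 0..63 when the assert holds
  return uint64_t(1) << BitNo;             // isolateMaskBit(0x10) == 0x10
}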
--- llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h | 2 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 3 +++ 2 files changed, 5 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h index 2e02bb4271adc7..06b2f181c276cd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h @@ -77,6 +77,8 @@ struct ArgDescriptor { } unsigned getMask() const { + // None of the target SGPRs or VGPRs are expected to have a 'zero' mask. + assert(Mask && "Invalid mask."); return Mask; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index c787edf7cfd11b..f5f367b2a4a7c6 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -9791,6 +9791,9 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, else return false; + // A valid Mask is required to have a single bit set, hence a non-zero and + // power-of-two value. This verifies that we will not do 64-bit shift below. + assert(llvm::has_single_bit(Mask) && "Invalid mask."); unsigned BitNo = llvm::countr_zero((uint64_t)Mask); if (IsSigned && BitNo == SrcSize - 1) return false; From 5dc15ddf575978e0115b1a6edacb59f056792a80 Mon Sep 17 00:00:00 2001 From: Georgi Mirazchiyski Date: Tue, 24 Sep 2024 11:48:21 +0100 Subject: [PATCH 15/22] [AMDGPU] Default-initialize uninitialized class member variables (#108428) --- llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 8 ++++---- llvm/lib/Target/AMDGPU/SIMachineScheduler.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 434336ef137ff5..46f5097c679fb3 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -92,12 +92,12 @@ class V2SCopyInfo { SetVector SChain; // Number of SGPR to VGPR copies that are used to put the SALU computation // results back to VALU. - unsigned NumSVCopies; + unsigned NumSVCopies = 0; - unsigned Score; + unsigned Score = 0; // Actual count of v_readfirstlane_b32 // which need to be inserted to keep SChain SALU - unsigned NumReadfirstlanes; + unsigned NumReadfirstlanes = 0; // Current score state. To speedup selection V2SCopyInfos for processing bool NeedToBeConvertedToVALU = false; // Unique ID. Used as a key for mapping to keep permanent order. 
@@ -109,7 +109,7 @@ class V2SCopyInfo { SetVector Siblings; V2SCopyInfo() : Copy(nullptr), ID(0){}; V2SCopyInfo(unsigned Id, MachineInstr *C, unsigned Width) - : Copy(C), NumSVCopies(0), NumReadfirstlanes(Width / 32), ID(Id){}; + : Copy(C), NumReadfirstlanes(Width / 32), ID(Id){}; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void dump() { dbgs() << ID << " : " << *Copy << "\n\tS:" << SChain.size() diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.h b/llvm/lib/Target/AMDGPU/SIMachineScheduler.h index ac34a748edbc1e..f8f4b5aae338eb 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.h +++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.h @@ -120,8 +120,8 @@ class SIScheduleBlock { ArrayRef> getSuccs() const { return Succs; } - unsigned Height; // Maximum topdown path length to block without outputs - unsigned Depth; // Maximum bottomup path length to block without inputs + unsigned Height = 0; // Maximum topdown path length to block without outputs + unsigned Depth = 0; // Maximum bottomup path length to block without inputs unsigned getNumHighLatencySuccessors() const { return NumHighLatencySuccessors; From 4f8e76684f4c1e67726222c35f173ef722464a7e Mon Sep 17 00:00:00 2001 From: Vyacheslav Levytskyy Date: Tue, 24 Sep 2024 13:33:31 +0200 Subject: [PATCH 16/22] [AsmPrinter] Do not emit label instructions after the function body if the target is SPIR-V (#107013) AsmPrinter always creates a symbol for the end of function if valid debug info is present. However, this breaks SPIR-V target's output, because SPIR-V specification allows label instructions only inside a block, not after the function body (see https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#OpLabel). This PR proposes to disable emission of label instructions after the function body if the target is SPIR-V. This PR is a fix of the https://github.com/llvm/llvm-project/issues/102732 issue. --- llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 5 ++++- llvm/test/CodeGen/SPIRV/debug-info/debug-compilation-unit.ll | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index db7adfd3b21e5f..d17800d375b7f2 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -1971,7 +1971,10 @@ void AsmPrinter::emitFunctionBody() { // are automatically sized. bool EmitFunctionSize = MAI->hasDotTypeDotSizeDirective() && !TT.isWasm(); - if (EmitFunctionSize || needFuncLabels(*MF, *this)) { + // SPIR-V supports label instructions only inside a block, not after the + // function body. + if (TT.getObjectFormat() != Triple::SPIRV && + (EmitFunctionSize || needFuncLabels(*MF, *this))) { // Create a symbol for the end of function. 
CurrentFnEnd = createTempSymbol("func_end"); OutStreamer->emitLabel(CurrentFnEnd); diff --git a/llvm/test/CodeGen/SPIRV/debug-info/debug-compilation-unit.ll b/llvm/test/CodeGen/SPIRV/debug-info/debug-compilation-unit.ll index bff4660559ab82..794dcd6d9f3fb4 100644 --- a/llvm/test/CodeGen/SPIRV/debug-info/debug-compilation-unit.ll +++ b/llvm/test/CodeGen/SPIRV/debug-info/debug-compilation-unit.ll @@ -29,11 +29,13 @@ define spir_func void @foo() { entry: ret void } +; CHECK-SPIRV-NOT: Lfunc_end0: define spir_func void @bar() { entry: ret void } +; CHECK-SPIRV-NOT: Lfunc_end1: !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!2, !3, !4, !5} From 497759e872a53964a54db941f3a1ed74446c5ed4 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Tue, 24 Sep 2024 12:40:42 +0100 Subject: [PATCH 17/22] [lldb][AArch64] Create Neon subregs when XML only includes SVE (#108365) Fixes #107864 QEMU decided that when SVE is enabled it will only tell us about SVE registers in the XML, and not include Neon registers. On the grounds that the Neon V registers can be read from the bottom 128 bits of a SVE Z register (SVE's vector length is always >= 128 bits). To support this we create sub-registers just as we do for S and D registers of the V registers. Except this time we use part of the Z registers. This change also updates our fallback for registers with unknown types that are > 128 bit. This is detailed in https://github.com/llvm/llvm-project/issues/87471, though that covers more than this change fixes. We'll now treat any register of unknown type that is >= 128 bit as a vector of bytes. So that the user gets to see something even if the order might be wrong. And until lldb supports vector and union types for registers, this is also the only way we can get a value to apply the sub-reg to, to make the V registers. --- .../source/Plugins/ABI/AArch64/ABIAArch64.cpp | 40 +++++- .../Process/gdb-remote/ProcessGDBRemote.cpp | 9 +- .../TestAArch64XMLRegistersSVEOnly.py | 121 ++++++++++++++++++ 3 files changed, 163 insertions(+), 7 deletions(-) create mode 100644 lldb/test/API/functionalities/gdb_remote_client/TestAArch64XMLRegistersSVEOnly.py diff --git a/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp b/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp index 256c1f828feb38..7d8d0a4d3d6711 100644 --- a/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp +++ b/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp @@ -136,6 +136,8 @@ void ABIAArch64::AugmentRegisterInfo( std::array, 32> x_regs; std::array, 32> v_regs; + std::array, 32> z_regs; + std::optional z_byte_size; for (auto it : llvm::enumerate(regs)) { lldb_private::DynamicRegisterInfo::Register &info = it.value(); @@ -157,16 +159,44 @@ void ABIAArch64::AugmentRegisterInfo( x_regs[reg_num] = it.index(); else if (get_reg("v")) v_regs[reg_num] = it.index(); + else if (get_reg("z")) { + z_regs[reg_num] = it.index(); + if (!z_byte_size) + z_byte_size = info.byte_size; + } // if we have at least one subregister, abort else if (get_reg("w") || get_reg("s") || get_reg("d")) return; } - // Create aliases for partial registers: wN for xN, and sN/dN for vN. + // Create aliases for partial registers. + + // Wn for Xn. 
addPartialRegisters(regs, x_regs, 8, "w{0}", 4, lldb::eEncodingUint, lldb::eFormatHex); - addPartialRegisters(regs, v_regs, 16, "s{0}", 4, lldb::eEncodingIEEE754, - lldb::eFormatFloat); - addPartialRegisters(regs, v_regs, 16, "d{0}", 8, lldb::eEncodingIEEE754, - lldb::eFormatFloat); + + auto bool_predicate = [](const auto ®_num) { return bool(reg_num); }; + bool saw_v_regs = std::any_of(v_regs.begin(), v_regs.end(), bool_predicate); + bool saw_z_regs = std::any_of(z_regs.begin(), z_regs.end(), bool_predicate); + + // Sn/Dn for Vn. + if (saw_v_regs) { + addPartialRegisters(regs, v_regs, 16, "s{0}", 4, lldb::eEncodingIEEE754, + lldb::eFormatFloat); + addPartialRegisters(regs, v_regs, 16, "d{0}", 8, lldb::eEncodingIEEE754, + lldb::eFormatFloat); + } else if (saw_z_regs && z_byte_size) { + // When SVE is enabled, some debug stubs will not describe the Neon V + // registers because they can be read from the bottom 128 bits of the SVE + // registers. + + // The size used here is the one sent by the debug server. This only needs + // to be correct right now. Later we will rely on the value of vg instead. + addPartialRegisters(regs, z_regs, *z_byte_size, "v{0}", 16, + lldb::eEncodingVector, lldb::eFormatVectorOfUInt8); + addPartialRegisters(regs, z_regs, *z_byte_size, "s{0}", 4, + lldb::eEncodingIEEE754, lldb::eFormatFloat); + addPartialRegisters(regs, z_regs, *z_byte_size, "d{0}", 8, + lldb::eEncodingIEEE754, lldb::eFormatFloat); + } } diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp index 9e8c6046179631..3e09c316d74f44 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp @@ -4716,9 +4716,14 @@ bool ParseRegisters( reg_info.encoding = eEncodingIEEE754; } else if (gdb_type == "aarch64v" || llvm::StringRef(gdb_type).starts_with("vec") || - gdb_type == "i387_ext" || gdb_type == "uint128") { + gdb_type == "i387_ext" || gdb_type == "uint128" || + reg_info.byte_size > 16) { // lldb doesn't handle 128-bit uints correctly (for ymm*h), so - // treat them as vector (similarly to xmm/ymm) + // treat them as vector (similarly to xmm/ymm). + // We can fall back to handling anything else <= 128 bit as an + // unsigned integer, more than that, call it a vector of bytes. + // This can happen if we don't recognise the type for AArc64 SVE + // registers. reg_info.format = eFormatVectorOfUInt8; reg_info.encoding = eEncodingVector; } else { diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestAArch64XMLRegistersSVEOnly.py b/lldb/test/API/functionalities/gdb_remote_client/TestAArch64XMLRegistersSVEOnly.py new file mode 100644 index 00000000000000..e36013a11491b3 --- /dev/null +++ b/lldb/test/API/functionalities/gdb_remote_client/TestAArch64XMLRegistersSVEOnly.py @@ -0,0 +1,121 @@ +""" Check that when a debug server provides XML that only defines SVE Z registers, + and does not include Neon V registers, lldb creates sub-registers to represent + the V registers as the bottom 128 bits of the Z registers. + + qemu-aarch64 is one such debug server. + + This also doubles as a test that lldb has a fallback path for registers of + unknown type that are > 128 bits, as the SVE registers are here. 
+""" + +from textwrap import dedent +import lldb +from lldbsuite.test.lldbtest import * +from lldbsuite.test.decorators import * +from lldbsuite.test.gdbclientutils import * +from lldbsuite.test.lldbgdbclient import GDBRemoteTestBase + + +class Responder(MockGDBServerResponder): + def __init__(self): + super().__init__() + self.vg = 4 + self.pc = 0xA0A0A0A0A0A0A0A0 + + def qXferRead(self, obj, annex, offset, length): + if annex == "target.xml": + # Note that QEMU sends the current SVE size in XML and the debugger + # then reads vg to know the latest size. + return ( + dedent( + """\ + + + aarch64 + + + + + + """ + ), + False, + ) + + return (None,) + + def readRegister(self, regnum): + return "E01" + + def readRegisters(self): + return "".join( + [ + # 64 bit PC. + f"{self.pc:x}", + # 64 bit vg + f"0{self.vg}00000000000000", + # Enough data for 256 and 512 bit SVE. + "".join([f"{n:02x}" * 4 for n in range(1, 17)]), + ] + ) + + def cont(self): + # vg is expedited so that lldb can resize the SVE registers. + return f"T02thread:1ff0d;threads:1ff0d;thread-pcs:{self.pc};01:0{self.vg}00000000000000;" + + def writeRegisters(self, registers_hex): + # We get a block of data containing values in regnum order. + self.vg = int(registers_hex[16:18]) + return "OK" + + +class TestXMLRegisterFlags(GDBRemoteTestBase): + def check_regs(self, vg): + # Each 32 bit chunk repeats n. + z0_value = " ".join( + [" ".join([f"0x{n:02x}"] * 4) for n in range(1, (vg * 2) + 1)] + ) + + self.expect( + "register read vg z0 v0 s0 d0", + substrs=[ + f" vg = 0x000000000000000{vg}\n" + " z0 = {" + z0_value + "}\n" + " v0 = {0x01 0x01 0x01 0x01 0x02 0x02 0x02 0x02 0x03 0x03 0x03 0x03 0x04 0x04 0x04 0x04}\n" + " s0 = 2.36942783E-38\n" + " d0 = 5.3779407333977203E-299\n" + ], + ) + + self.expect("register read s0 --format uint32", substrs=["s0 = {0x01010101}"]) + self.expect( + "register read d0 --format uint64", + substrs=["d0 = {0x0202020201010101}"], + ) + + @skipIfXmlSupportMissing + @skipIfRemote + @skipIfLLVMTargetMissing("AArch64") + def test_v_sub_registers(self): + self.server.responder = Responder() + target = self.dbg.CreateTarget("") + + if self.TraceOn(): + self.runCmd("log enable gdb-remote packets") + self.addTearDownHook(lambda: self.runCmd("log disable gdb-remote packets")) + + process = self.connect(target) + lldbutil.expect_state_changes( + self, self.dbg.GetListener(), process, [lldb.eStateStopped] + ) + + self.check_regs(4) + + # Now increase the SVE length and continue. The mock will respond with a new + # vg and lldb will reconfigure the register defs. This should not break the + # sub-registers. + + self.runCmd("register write vg 8") + self.expect("continue", substrs=["stop reason = signal SIGINT"]) + + self.check_regs(8) From c30fa3cde755e7519f0962f581868a09da1ea130 Mon Sep 17 00:00:00 2001 From: Georgi Mirazchiyski Date: Tue, 24 Sep 2024 12:54:05 +0100 Subject: [PATCH 18/22] [AMDGPU] Fix has_single_bit assertion for Mask in SIInstrInfo (#109785) Convert the `int64_t` Mask to `uint64_t` for `llvm::has_single_bit` to compile. 
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index f5f367b2a4a7c6..9ad0b4c65e1d90 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -9793,7 +9793,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, // A valid Mask is required to have a single bit set, hence a non-zero and // power-of-two value. This verifies that we will not do 64-bit shift below. - assert(llvm::has_single_bit(Mask) && "Invalid mask."); + assert(llvm::has_single_bit(static_cast<uint64_t>(Mask)) && "Invalid mask."); unsigned BitNo = llvm::countr_zero((uint64_t)Mask); if (IsSigned && BitNo == SrcSize - 1) return false; From 029b9b611d8becf04f4c525ab2b70e956b4b186d Mon Sep 17 00:00:00 2001 From: David Spickett Date: Tue, 24 Sep 2024 13:02:52 +0100 Subject: [PATCH 19/22] [llvm][docs] Improve formatting of ENABLE_PROJECTS/RUNTIMES description * Add line breaks so it's clear what should be passed to CMake. * Make the note into an RST note block. * Fix a couple of markdown style plain text markers. --- llvm/docs/CMake.rst | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/llvm/docs/CMake.rst b/llvm/docs/CMake.rst index 838447f483e510..b5adb22d8f33b1 100644 --- a/llvm/docs/CMake.rst +++ b/llvm/docs/CMake.rst @@ -571,10 +571,12 @@ enabled sub-projects. Nearly all of these variable names begin with Semicolon-separated list of projects to build, or *all* for building all (clang, lldb, lld, polly, etc) projects. This flag assumes that projects are checked out side-by-side and not nested, i.e. clang needs to be in - parallel of llvm instead of nested in `llvm/tools`. This feature allows + parallel of llvm instead of nested in ``llvm/tools``. This feature allows to have one build for only LLVM and another for clang+llvm using the same source checkout. + The full list is: + ``clang;clang-tools-extra;cross-project-tests;libc;libclc;lld;lldb;openmp;polly;pstl`` **LLVM_ENABLE_RTTI**:BOOL @@ -586,10 +588,16 @@ enabled sub-projects. Nearly all of these variable names begin with It will build the builtins separately from the other runtimes to preserve correct dependency ordering. If you want to build the runtimes using a system compiler, see the `libc++ documentation `_. - Note: the list should not have duplicates with `LLVM_ENABLE_PROJECTS`. + + .. note:: + The list should not have duplicates with ``LLVM_ENABLE_PROJECTS``. + The full list is: + ``compiler-rt;libc;libcxx;libcxxabi;libunwind;openmp`` + To enable all of them, use: + ``LLVM_ENABLE_RUNTIMES=all`` **LLVM_ENABLE_SPHINX**:BOOL From c1826aeef353bf4bd8b181b47a0dbb1f1af93836 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= Date: Tue, 24 Sep 2024 13:14:49 +0100 Subject: [PATCH 20/22] [mlir][tensor] Add new helper hooks for RelayoutOp (#109642) Implements two helper hooks for PackOp and UnPackOp, `getAllOuterDims` and `getTiledOuterDims`, and adds them to RelayoutOp (which both PackOp and UnPackOp inherit from). This improves code reuse and also clarifies the meaning of "outer dims" and "tiled outer dims".
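To make the two hooks concrete, their logic can be restated outside MLIR with a worked example (a sketch with hypothetical helper names; the real methods are the ones added to TensorOps.cpp below):

#include <cstdint>
#include <vector>

// Leading dest dims up to the source rank: tiled and non-tiled outer dims.
std::vector<int64_t> allOuterDims(const std::vector<int64_t> &packedShape,
                                  size_t srcRank) {
  return {packedShape.begin(), packedShape.begin() + srcRank};
}

// Only the outer dims that were tiled, in inner_dims_pos order.
std::vector<int64_t> tiledOuterDims(const std::vector<int64_t> &packedShape,
                                    const std::vector<int64_t> &innerDimsPos) {
  std::vector<int64_t> res;
  for (int64_t idx : innerDimsPos)
    res.push_back(packedShape[idx]);
  return res;
}

// For a pack of tensor<5x32xf32> with inner_dims_pos = [1] and
// inner_tiles = [16] into tensor<5x2x16xf32>:
//   allOuterDims({5, 2, 16}, 2)      -> {5, 2}
//   tiledOuterDims({5, 2, 16}, {1})  -> {2}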
--- .../mlir/Dialect/Tensor/IR/TensorOps.td | 18 +++++++++- .../Dialect/Linalg/Transforms/Transforms.cpp | 23 ++++++------- mlir/lib/Dialect/Tensor/IR/TensorOps.cpp | 34 +++++++++++++++++++ 3 files changed, 61 insertions(+), 14 deletions(-) diff --git a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td index cafc3d91fd1e9d..3170115883e2be 100644 --- a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td +++ b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td @@ -1814,7 +1814,7 @@ def Tensor_SplatOp : Tensor_Op<"splat", [ } //===----------------------------------------------------------------------===// -// PackOp +// RelayoutOp //===----------------------------------------------------------------------===// class Tensor_RelayoutOp traits = []> : @@ -1851,11 +1851,27 @@ class Tensor_RelayoutOp traits = []> : /// a sentinel `kDynamic` is introduced at that position in /// the returned vector. SmallVector getStaticTiles(); + + /// Retrieve all outer dims for this Pack/UnPack Op, i.e. all the leading + /// dims excluding the trailing dims corresponding to `innerTiles`. Note + /// that this will include both tiled and non-tiled dimensions. The order + /// of the output dimensions is consistent with the shape of the packed + /// tensor. + ArrayRef getAllOuterDims(); + + /// Similar to `getAllOuterDims`, but only retrieve the outer dims that + /// have been tiled. Also, the order of the output dimensions is consistent + /// with `inner_dims_pos` rather than the packed tensor. + SmallVector getTiledOuterDims(); }]; let hasVerifier = 1; } +//===----------------------------------------------------------------------===// +// PackOp +//===----------------------------------------------------------------------===// + def Tensor_PackOp : Tensor_RelayoutOp<"pack", [ AttrSizedOperandSegments]> { let summary = "tensor pack operation"; diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp index 77f0ea9d2236ea..e0dea8e78d55c1 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp @@ -1030,11 +1030,13 @@ static Value getPackOpSourceOrPaddedSource(OpBuilder &builder, return input; } + assert(llvm::all_of(packOp.getAllOuterDims(), + [](int64_t val) { return val == 1; }) && + "some outer dims are != 1"); + Location loc = packOp.getLoc(); ShapedType inputType = packOp.getSourceType(); int64_t inputRank = inputType.getRank(); - assert(llvm::all_of(packOp.getDestType().getShape().take_front(inputRank), - [](int64_t val) { return val == 1; })); SmallVector paddedShape; DenseMap tileAndPosMapping = @@ -1126,12 +1128,8 @@ LogicalResult GeneralizeOuterUnitDimsPackOpPattern::matchAndRewrite( // TODO: support the case that outer dimensions are not all 1s. A // tensor.expand_shape will be generated in this case. 
- auto innerDimsPos = packOp.getInnerDimsPos(); - int64_t srcRank = packOp.getSourceRank(); - auto destShape = packOp.getDestType().getShape(); - if (llvm::any_of(innerDimsPos, [destShape](int64_t index) { - return destShape[index] != 1; - })) { + if (llvm::any_of(packOp.getTiledOuterDims(), + [](int64_t dim) { return dim != 1; })) { return rewriter.notifyMatchFailure( packOp, "require the tiled outer dimensions of the result are all 1s"); } @@ -1145,6 +1143,7 @@ LogicalResult GeneralizeOuterUnitDimsPackOpPattern::matchAndRewrite( packOp.getDimAndTileMapping(); Attribute zeroIdxAttr = rewriter.getIndexAttr(0); Attribute oneIdxAttr = rewriter.getIndexAttr(1); + int64_t srcRank = packOp.getSourceRank(); SmallVector readOffsets(srcRank, zeroIdxAttr); SmallVector readStrides(srcRank, oneIdxAttr); SmallVector readSizes; @@ -1173,9 +1172,8 @@ LogicalResult GeneralizeOuterUnitDimsPackOpPattern::matchAndRewrite( loc, readType, input, readOffsets, readSizes, readStrides); // 2. Transpose the tile to match the inner tile order. - SmallVector perm = getPackUnpackRankReducedPerm( - inputShape, innerDimsPos, packOp.getOuterDimsPerm()); + inputShape, packOp.getInnerDimsPos(), packOp.getOuterDimsPerm()); LLVM_DEBUG(DBGS() << "Pack permutation: " << packOp << "\n"; llvm::interleaveComma(perm, DBGS() << "perm: "); DBGSNL();); @@ -1208,9 +1206,8 @@ LogicalResult GeneralizeOuterUnitDimsUnPackOpPattern::matchAndRewrite( int64_t destRank = unpackOp.getDestRank(); ArrayRef srcShape = unpackOp.getSourceType().getShape(); ArrayRef innerDimsPos = unpackOp.getInnerDimsPos(); - if (llvm::any_of(innerDimsPos, [srcShape](int64_t index) { - return srcShape[index] != 1; - })) { + if (llvm::any_of(unpackOp.getTiledOuterDims(), + [](int64_t dim) { return dim != 1; })) { return rewriter.notifyMatchFailure( unpackOp, "require the tiled outer dimensions of the result are all 1s"); diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp index 47f540e092e990..1ac96756e22b5e 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp @@ -3987,6 +3987,23 @@ SmallVector PackOp::getStaticTiles() { return getStaticTilesImpl(*this); } +ArrayRef PackOp::getAllOuterDims() { + ShapedType inputType = getSourceType(); + int64_t inputRank = inputType.getRank(); + return getDestType().getShape().take_front(inputRank); +} + +SmallVector PackOp::getTiledOuterDims() { + auto innerDimsPos = getInnerDimsPos(); + auto packedShape = getDestType().getShape(); + SmallVector res; + + for (auto index : innerDimsPos) + res.push_back(packedShape[index]); + + return res; +} + bool PackOp::requirePaddingValue(ArrayRef inputShape, ArrayRef innerDimsPos, ArrayRef outputShape, @@ -4411,6 +4428,23 @@ SmallVector UnPackOp::getStaticTiles() { return getStaticTilesImpl(*this); } +ArrayRef UnPackOp::getAllOuterDims() { + ShapedType destType = getDestType(); + int64_t destRank = destType.getRank(); + return getSourceType().getShape().take_front(destRank); +} + +SmallVector UnPackOp::getTiledOuterDims() { + auto innerDimsPos = getInnerDimsPos(); + auto packedShape = getSourceType().getShape(); + SmallVector res; + + for (auto index : innerDimsPos) + res.push_back(packedShape[index]); + + return res; +} + LogicalResult UnPackOp::verify() { return commonVerifierPackAndUnPackOp(*this); } From 3e3780ef6ab5902cd1763e28bb143e47091bd23a Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Tue, 24 Sep 2024 13:15:26 +0100 Subject: [PATCH 21/22] [LLVM][CodeGen][SVE] Implement nxvf32 fpround to 
nxvbf16. (#107420) --- .../Target/AArch64/AArch64ISelLowering.cpp | 50 ++++++- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 2 +- llvm/lib/Target/AArch64/SVEInstrFormats.td | 6 +- .../test/CodeGen/AArch64/sve-bf16-converts.ll | 129 +++++++++++++++++- 4 files changed, 180 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index b11ac81069f660..4166d9bd22bc01 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1664,6 +1664,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::BITCAST, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::FP_EXTEND, VT, Custom); + setOperationAction(ISD::FP_ROUND, VT, Custom); setOperationAction(ISD::MLOAD, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::SPLAT_VECTOR, VT, Legal); @@ -4334,14 +4335,57 @@ SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); - if (VT.isScalableVector()) - return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU); - bool IsStrict = Op->isStrictFPOpcode(); SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); EVT SrcVT = SrcVal.getValueType(); bool Trunc = Op.getConstantOperandVal(IsStrict ? 2 : 1) == 1; + if (VT.isScalableVector()) { + if (VT.getScalarType() != MVT::bf16) + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU); + + SDLoc DL(Op); + constexpr EVT I32 = MVT::nxv4i32; + auto ImmV = [&](int I) -> SDValue { return DAG.getConstant(I, DL, I32); }; + + SDValue NaN; + SDValue Narrow; + + if (SrcVT == MVT::nxv2f32 || SrcVT == MVT::nxv4f32) { + if (Subtarget->hasBF16()) + return LowerToPredicatedOp(Op, DAG, + AArch64ISD::FP_ROUND_MERGE_PASSTHRU); + + Narrow = getSVESafeBitCast(I32, SrcVal, DAG); + + // Set the quiet bit. + if (!DAG.isKnownNeverSNaN(SrcVal)) + NaN = DAG.getNode(ISD::OR, DL, I32, Narrow, ImmV(0x400000)); + } else + return SDValue(); + + if (!Trunc) { + SDValue Lsb = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16)); + Lsb = DAG.getNode(ISD::AND, DL, I32, Lsb, ImmV(1)); + SDValue RoundingBias = DAG.getNode(ISD::ADD, DL, I32, Lsb, ImmV(0x7fff)); + Narrow = DAG.getNode(ISD::ADD, DL, I32, Narrow, RoundingBias); + } + + // Don't round if we had a NaN, we don't want to turn 0x7fffffff into + // 0x80000000. + if (NaN) { + EVT I1 = I32.changeElementType(MVT::i1); + EVT CondVT = VT.changeElementType(MVT::i1); + SDValue IsNaN = DAG.getSetCC(DL, CondVT, SrcVal, SrcVal, ISD::SETUO); + IsNaN = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, I1, IsNaN); + Narrow = DAG.getSelect(DL, I32, IsNaN, NaN, Narrow); + } + + // Now that we have rounded, shift the bits into position. 
+ Narrow = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16)); + return getSVESafeBitCast(VT, Narrow, DAG); + } + if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable())) return LowerFixedLengthFPRoundToSVE(Op, DAG); diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 1f3d63a216c6dd..7240f6a22a87bd 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -2425,7 +2425,7 @@ let Predicates = [HasBF16, HasSVEorSME] in { defm BFMLALT_ZZZ : sve2_fp_mla_long<0b101, "bfmlalt", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalt>; defm BFMLALB_ZZZI : sve2_fp_mla_long_by_indexed_elem<0b100, "bfmlalb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalb_lane_v2>; defm BFMLALT_ZZZI : sve2_fp_mla_long_by_indexed_elem<0b101, "bfmlalt", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalt_lane_v2>; - defm BFCVT_ZPmZ : sve_bfloat_convert<0b1, "bfcvt", int_aarch64_sve_fcvt_bf16f32>; + defm BFCVT_ZPmZ : sve_bfloat_convert<0b1, "bfcvt", int_aarch64_sve_fcvt_bf16f32, AArch64fcvtr_mt>; defm BFCVTNT_ZPmZ : sve_bfloat_convert<0b0, "bfcvtnt", int_aarch64_sve_fcvtnt_bf16f32>; } // End HasBF16, HasSVEorSME diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 8119198a48aa59..0bfac6465a1f30 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -8807,9 +8807,13 @@ class sve_bfloat_convert let mayRaiseFPException = 1; } -multiclass sve_bfloat_convert { +multiclass sve_bfloat_convert { def NAME : sve_bfloat_convert; + def : SVE_3_Op_Pat(NAME)>; + def : SVE_1_Op_Passthru_Round_Pat(NAME)>; + def : SVE_1_Op_Passthru_Round_Pat(NAME)>; } //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-converts.ll b/llvm/test/CodeGen/AArch64/sve-bf16-converts.ll index d72f92c1dac1ff..d63f7e6f3242e0 100644 --- a/llvm/test/CodeGen/AArch64/sve-bf16-converts.ll +++ b/llvm/test/CodeGen/AArch64/sve-bf16-converts.ll @@ -1,9 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mattr=+sve < %s | FileCheck %s -; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s +; RUN: llc -mattr=+sve < %s | FileCheck %s --check-prefixes=CHECK,NOBF16 +; RUN: llc -mattr=+sve --enable-no-nans-fp-math < %s | FileCheck %s --check-prefixes=CHECK,NOBF16NNAN +; RUN: llc -mattr=+sve,+bf16 < %s | FileCheck %s --check-prefixes=CHECK,BF16 +; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,BF16 target triple = "aarch64-unknown-linux-gnu" +; NOTE: "fptrunc <# x double> to <# x bfloat>" is not supported because SVE +; lacks a down convert that rounds to odd. Such IR will trigger the usual +; failure (crash) when attempting to unroll a scalable vector. 
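+; For illustration only, such an unsupported case would look like the
+; following (hypothetical IR, deliberately not exercised by this test):
+;   %t = fptrunc <vscale x 2 x double> %x to <vscale x 2 x bfloat>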
+ define @fpext_nxv2bf16_to_nxv2f32( %a) { ; CHECK-LABEL: fpext_nxv2bf16_to_nxv2f32: ; CHECK: // %bb.0: @@ -87,3 +93,122 @@ define @fpext_nxv8bf16_to_nxv8f64( %a %res = fpext %a to ret %res } + +define @fptrunc_nxv2f32_to_nxv2bf16( %a) { +; NOBF16-LABEL: fptrunc_nxv2f32_to_nxv2bf16: +; NOBF16: // %bb.0: +; NOBF16-NEXT: mov z1.s, #32767 // =0x7fff +; NOBF16-NEXT: lsr z2.s, z0.s, #16 +; NOBF16-NEXT: ptrue p0.d +; NOBF16-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s +; NOBF16-NEXT: and z2.s, z2.s, #0x1 +; NOBF16-NEXT: add z1.s, z0.s, z1.s +; NOBF16-NEXT: orr z0.s, z0.s, #0x400000 +; NOBF16-NEXT: add z1.s, z2.s, z1.s +; NOBF16-NEXT: sel z0.s, p0, z0.s, z1.s +; NOBF16-NEXT: lsr z0.s, z0.s, #16 +; NOBF16-NEXT: ret +; +; NOBF16NNAN-LABEL: fptrunc_nxv2f32_to_nxv2bf16: +; NOBF16NNAN: // %bb.0: +; NOBF16NNAN-NEXT: mov z1.s, #32767 // =0x7fff +; NOBF16NNAN-NEXT: lsr z2.s, z0.s, #16 +; NOBF16NNAN-NEXT: and z2.s, z2.s, #0x1 +; NOBF16NNAN-NEXT: add z0.s, z0.s, z1.s +; NOBF16NNAN-NEXT: add z0.s, z2.s, z0.s +; NOBF16NNAN-NEXT: lsr z0.s, z0.s, #16 +; NOBF16NNAN-NEXT: ret +; +; BF16-LABEL: fptrunc_nxv2f32_to_nxv2bf16: +; BF16: // %bb.0: +; BF16-NEXT: ptrue p0.d +; BF16-NEXT: bfcvt z0.h, p0/m, z0.s +; BF16-NEXT: ret + %res = fptrunc %a to + ret %res +} + +define @fptrunc_nxv4f32_to_nxv4bf16( %a) { +; NOBF16-LABEL: fptrunc_nxv4f32_to_nxv4bf16: +; NOBF16: // %bb.0: +; NOBF16-NEXT: mov z1.s, #32767 // =0x7fff +; NOBF16-NEXT: lsr z2.s, z0.s, #16 +; NOBF16-NEXT: ptrue p0.s +; NOBF16-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s +; NOBF16-NEXT: and z2.s, z2.s, #0x1 +; NOBF16-NEXT: add z1.s, z0.s, z1.s +; NOBF16-NEXT: orr z0.s, z0.s, #0x400000 +; NOBF16-NEXT: add z1.s, z2.s, z1.s +; NOBF16-NEXT: sel z0.s, p0, z0.s, z1.s +; NOBF16-NEXT: lsr z0.s, z0.s, #16 +; NOBF16-NEXT: ret +; +; NOBF16NNAN-LABEL: fptrunc_nxv4f32_to_nxv4bf16: +; NOBF16NNAN: // %bb.0: +; NOBF16NNAN-NEXT: mov z1.s, #32767 // =0x7fff +; NOBF16NNAN-NEXT: lsr z2.s, z0.s, #16 +; NOBF16NNAN-NEXT: and z2.s, z2.s, #0x1 +; NOBF16NNAN-NEXT: add z0.s, z0.s, z1.s +; NOBF16NNAN-NEXT: add z0.s, z2.s, z0.s +; NOBF16NNAN-NEXT: lsr z0.s, z0.s, #16 +; NOBF16NNAN-NEXT: ret +; +; BF16-LABEL: fptrunc_nxv4f32_to_nxv4bf16: +; BF16: // %bb.0: +; BF16-NEXT: ptrue p0.s +; BF16-NEXT: bfcvt z0.h, p0/m, z0.s +; BF16-NEXT: ret + %res = fptrunc %a to + ret %res +} + +define @fptrunc_nxv8f32_to_nxv8bf16( %a) { +; NOBF16-LABEL: fptrunc_nxv8f32_to_nxv8bf16: +; NOBF16: // %bb.0: +; NOBF16-NEXT: mov z2.s, #32767 // =0x7fff +; NOBF16-NEXT: lsr z3.s, z1.s, #16 +; NOBF16-NEXT: lsr z4.s, z0.s, #16 +; NOBF16-NEXT: ptrue p0.s +; NOBF16-NEXT: and z3.s, z3.s, #0x1 +; NOBF16-NEXT: and z4.s, z4.s, #0x1 +; NOBF16-NEXT: fcmuo p1.s, p0/z, z1.s, z1.s +; NOBF16-NEXT: add z5.s, z1.s, z2.s +; NOBF16-NEXT: add z2.s, z0.s, z2.s +; NOBF16-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s +; NOBF16-NEXT: orr z1.s, z1.s, #0x400000 +; NOBF16-NEXT: orr z0.s, z0.s, #0x400000 +; NOBF16-NEXT: add z3.s, z3.s, z5.s +; NOBF16-NEXT: add z2.s, z4.s, z2.s +; NOBF16-NEXT: sel z1.s, p1, z1.s, z3.s +; NOBF16-NEXT: sel z0.s, p0, z0.s, z2.s +; NOBF16-NEXT: lsr z1.s, z1.s, #16 +; NOBF16-NEXT: lsr z0.s, z0.s, #16 +; NOBF16-NEXT: uzp1 z0.h, z0.h, z1.h +; NOBF16-NEXT: ret +; +; NOBF16NNAN-LABEL: fptrunc_nxv8f32_to_nxv8bf16: +; NOBF16NNAN: // %bb.0: +; NOBF16NNAN-NEXT: mov z2.s, #32767 // =0x7fff +; NOBF16NNAN-NEXT: lsr z3.s, z1.s, #16 +; NOBF16NNAN-NEXT: lsr z4.s, z0.s, #16 +; NOBF16NNAN-NEXT: and z3.s, z3.s, #0x1 +; NOBF16NNAN-NEXT: and z4.s, z4.s, #0x1 +; NOBF16NNAN-NEXT: add z1.s, z1.s, z2.s +; NOBF16NNAN-NEXT: add z0.s, z0.s, z2.s +; NOBF16NNAN-NEXT: add 
z1.s, z3.s, z1.s +; NOBF16NNAN-NEXT: add z0.s, z4.s, z0.s +; NOBF16NNAN-NEXT: lsr z1.s, z1.s, #16 +; NOBF16NNAN-NEXT: lsr z0.s, z0.s, #16 +; NOBF16NNAN-NEXT: uzp1 z0.h, z0.h, z1.h +; NOBF16NNAN-NEXT: ret +; +; BF16-LABEL: fptrunc_nxv8f32_to_nxv8bf16: +; BF16: // %bb.0: +; BF16-NEXT: ptrue p0.s +; BF16-NEXT: bfcvt z1.h, p0/m, z1.s +; BF16-NEXT: bfcvt z0.h, p0/m, z0.s +; BF16-NEXT: uzp1 z0.h, z0.h, z1.h +; BF16-NEXT: ret + %res = fptrunc %a to + ret %res +} From 8ba334bc4ad1e20c8201b85ed0a3e3b87bc47fe1 Mon Sep 17 00:00:00 2001 From: Dominik Montada Date: Tue, 24 Sep 2024 14:21:45 +0200 Subject: [PATCH 22/22] [MIR] Allow overriding isSSA, noPhis, noVRegs in MIR input (#108546) Allow setting the computed properties IsSSA, NoPHIs, NoVRegs for MIR functions in MIR input. The default value is still the computed value. If the property is set to false, the computed result is ignored. Conflicting values (e.g. setting IsSSA where the input MIR is clearly not SSA) lead to an error. Closes #37787 --- llvm/include/llvm/CodeGen/MIRYamlMapping.h | 11 ++++ llvm/lib/CodeGen/MIRParser/MIRParser.cpp | 52 +++++++++++---- llvm/lib/CodeGen/MIRPrinter.cpp | 7 ++ .../AArch64/mlicm-stack-write-check.mir | 4 +- .../Hexagon/expand-condsets-impuse2.mir | 2 +- .../Hexagon/expand-condsets-phys-reg.mir | 2 +- .../Hexagon/expand-condsets-rm-reg.mir | 2 +- ...ptionally-computed-properties-conflict.mir | 35 ++++++++++ ...unction-optionally-computed-properties.mir | 64 +++++++++++++++++++ .../X86/sjlj-shadow-stack-liveness.mir | 3 +- .../llvm-reduce/mir/preserve-func-info.mir | 6 ++ 11 files changed, 170 insertions(+), 18 deletions(-) create mode 100644 llvm/test/CodeGen/MIR/Generic/machine-function-optionally-computed-properties-conflict.mir create mode 100644 llvm/test/CodeGen/MIR/Generic/machine-function-optionally-computed-properties.mir diff --git a/llvm/include/llvm/CodeGen/MIRYamlMapping.h b/llvm/include/llvm/CodeGen/MIRYamlMapping.h index 304db57eca4994..ab8dc442e04b7b 100644 --- a/llvm/include/llvm/CodeGen/MIRYamlMapping.h +++ b/llvm/include/llvm/CodeGen/MIRYamlMapping.h @@ -730,6 +730,11 @@ struct MachineFunction { bool TracksRegLiveness = false; bool HasWinCFI = false; + // Computed properties that should be overridable + std::optional NoPHIs; + std::optional IsSSA; + std::optional NoVRegs; + bool CallsEHReturn = false; bool CallsUnwindInit = false; bool HasEHCatchret = false; @@ -770,6 +775,12 @@ template <> struct MappingTraits { YamlIO.mapOptional("tracksRegLiveness", MF.TracksRegLiveness, false); YamlIO.mapOptional("hasWinCFI", MF.HasWinCFI, false); + // PHIs must be not be capitalized, since it will clash with the MIR opcode + // leading to false-positive FileCheck hits with CHECK-NOT + YamlIO.mapOptional("noPhis", MF.NoPHIs, std::optional()); + YamlIO.mapOptional("isSSA", MF.IsSSA, std::optional()); + YamlIO.mapOptional("noVRegs", MF.NoVRegs, std::optional()); + YamlIO.mapOptional("callsEHReturn", MF.CallsEHReturn, false); YamlIO.mapOptional("callsUnwindInit", MF.CallsUnwindInit, false); YamlIO.mapOptional("hasEHCatchret", MF.HasEHCatchret, false); diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp index d506cd1879648f..8d6d800d761474 100644 --- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp @@ -178,7 +178,8 @@ class MIRParserImpl { SMDiagnostic diagFromBlockStringDiag(const SMDiagnostic &Error, SMRange SourceRange); - void computeFunctionProperties(MachineFunction &MF); + bool computeFunctionProperties(MachineFunction 
&MF, + const yaml::MachineFunction &YamlMF); void setupDebugValueTracking(MachineFunction &MF, PerFunctionMIParsingState &PFS, const yaml::MachineFunction &YamlMF); @@ -373,7 +374,8 @@ static bool isSSA(const MachineFunction &MF) { return true; } -void MIRParserImpl::computeFunctionProperties(MachineFunction &MF) { +bool MIRParserImpl::computeFunctionProperties( + MachineFunction &MF, const yaml::MachineFunction &YamlMF) { MachineFunctionProperties &Properties = MF.getProperties(); bool HasPHI = false; @@ -398,21 +400,48 @@ void MIRParserImpl::computeFunctionProperties(MachineFunction &MF) { } } } - if (!HasPHI) - Properties.set(MachineFunctionProperties::Property::NoPHIs); + + // Helper function to sanity-check and set properties that are computed, but + // may be explicitly set from the input MIR + auto ComputedPropertyHelper = + [&Properties](std::optional ExplicitProp, bool ComputedProp, + MachineFunctionProperties::Property P) -> bool { + // Prefer explicitly given values over the computed properties + if (ExplicitProp.value_or(ComputedProp)) + Properties.set(P); + else + Properties.reset(P); + + // Check for conflict between the explicit values and the computed ones + return ExplicitProp && *ExplicitProp && !ComputedProp; + }; + + if (ComputedPropertyHelper(YamlMF.NoPHIs, !HasPHI, + MachineFunctionProperties::Property::NoPHIs)) { + return error(MF.getName() + + " has explicit property NoPhi, but contains at least one PHI"); + } + MF.setHasInlineAsm(HasInlineAsm); if (HasTiedOps && AllTiedOpsRewritten) Properties.set(MachineFunctionProperties::Property::TiedOpsRewritten); - if (isSSA(MF)) - Properties.set(MachineFunctionProperties::Property::IsSSA); - else - Properties.reset(MachineFunctionProperties::Property::IsSSA); + if (ComputedPropertyHelper(YamlMF.IsSSA, isSSA(MF), + MachineFunctionProperties::Property::IsSSA)) { + return error(MF.getName() + + " has explicit property IsSSA, but is not valid SSA"); + } const MachineRegisterInfo &MRI = MF.getRegInfo(); - if (MRI.getNumVirtRegs() == 0) - Properties.set(MachineFunctionProperties::Property::NoVRegs); + if (ComputedPropertyHelper(YamlMF.NoVRegs, MRI.getNumVirtRegs() == 0, + MachineFunctionProperties::Property::NoVRegs)) { + return error( + MF.getName() + + " has explicit property NoVRegs, but contains virtual registers"); + } + + return false; } bool MIRParserImpl::initializeCallSiteInfo( @@ -595,7 +624,8 @@ MIRParserImpl::initializeMachineFunction(const yaml::MachineFunction &YamlMF, MachineRegisterInfo &MRI = MF.getRegInfo(); MRI.freezeReservedRegs(); - computeFunctionProperties(MF); + if (computeFunctionProperties(MF, YamlMF)) + return false; if (initializeCallSiteInfo(PFS, YamlMF)) return false; diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index 7de68b12045f14..cf6122bce22364 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -223,6 +223,13 @@ void MIRPrinter::print(const MachineFunction &MF) { YamlMF.TracksDebugUserValues = MF.getProperties().hasProperty( MachineFunctionProperties::Property::TracksDebugUserValues); + YamlMF.NoPHIs = MF.getProperties().hasProperty( + MachineFunctionProperties::Property::NoPHIs); + YamlMF.IsSSA = MF.getProperties().hasProperty( + MachineFunctionProperties::Property::IsSSA); + YamlMF.NoVRegs = MF.getProperties().hasProperty( + MachineFunctionProperties::Property::NoVRegs); + convert(YamlMF, MF.getRegInfo(), MF.getSubtarget().getRegisterInfo()); MachineModuleSlotTracker MST(MMI, &MF); MST.incorporateFunction(MF.getFunction()); 
diff --git a/llvm/test/CodeGen/AArch64/mlicm-stack-write-check.mir b/llvm/test/CodeGen/AArch64/mlicm-stack-write-check.mir index 51bc77d405b94b..406025c4fde302 100644 --- a/llvm/test/CodeGen/AArch64/mlicm-stack-write-check.mir +++ b/llvm/test/CodeGen/AArch64/mlicm-stack-write-check.mir @@ -3,6 +3,7 @@ --- name: test tracksRegLiveness: true +isSSA: false registers: - { id: 0, class: gpr64 } stack: @@ -30,11 +31,11 @@ body: | bb.2: liveins: $x0 %0 = COPY $x0 - %0 = COPY $x0 ; Force isSSA = false. ... --- name: test2 tracksRegLiveness: true +isSSA: false registers: - { id: 0, class: gpr64 } stack: @@ -62,5 +63,4 @@ body: | bb.2: liveins: $x0 %0 = COPY $x0 - %0 = COPY $x0 ; Force isSSA = false. ... diff --git a/llvm/test/CodeGen/Hexagon/expand-condsets-impuse2.mir b/llvm/test/CodeGen/Hexagon/expand-condsets-impuse2.mir index ae3f4ba78cd1ff..ebb361ab433cb7 100644 --- a/llvm/test/CodeGen/Hexagon/expand-condsets-impuse2.mir +++ b/llvm/test/CodeGen/Hexagon/expand-condsets-impuse2.mir @@ -6,12 +6,12 @@ name: f0 tracksRegLiveness: true +isSSA: false body: | bb.0: successors: %bb.1 liveins: $r0, $r1 %0:intregs = COPY $r0 - %0:intregs = COPY $r0 ; defeat IsSSA detection %1:intregs = COPY $r1 %2:intregs = COPY $r0 %3:intregs = M2_mpyi %2, %1 diff --git a/llvm/test/CodeGen/Hexagon/expand-condsets-phys-reg.mir b/llvm/test/CodeGen/Hexagon/expand-condsets-phys-reg.mir index e62cd1cc73609b..d252ec5fee4019 100644 --- a/llvm/test/CodeGen/Hexagon/expand-condsets-phys-reg.mir +++ b/llvm/test/CodeGen/Hexagon/expand-condsets-phys-reg.mir @@ -9,12 +9,12 @@ name: fred tracksRegLiveness: true +isSSA: false body: | bb.0: successors: %bb.1, %bb.2 liveins: $r0 - %0:intregs = A2_tfrsi 0 ;; Multiple defs to ensure IsSSA = false %0:intregs = L2_loadri_io $r0, 0 %1:predregs = C2_cmpgti %0, 10 %2:intregs = C2_mux %1, $r31, %0 diff --git a/llvm/test/CodeGen/Hexagon/expand-condsets-rm-reg.mir b/llvm/test/CodeGen/Hexagon/expand-condsets-rm-reg.mir index 6d7b6cd72a3099..463aa9a8e7f9b1 100644 --- a/llvm/test/CodeGen/Hexagon/expand-condsets-rm-reg.mir +++ b/llvm/test/CodeGen/Hexagon/expand-condsets-rm-reg.mir @@ -20,6 +20,7 @@ name: fred tracksRegLiveness: true +isSSA: false registers: - { id: 0, class: intregs } - { id: 1, class: intregs } @@ -35,7 +36,6 @@ body: | bb.0: liveins: $r0, $r1, $p0 %0 = COPY $r0 - %0 = COPY $r0 ; Force isSSA = false. %1 = COPY $r1 %2 = COPY $p0 ; Check that %3 was coalesced into %4. diff --git a/llvm/test/CodeGen/MIR/Generic/machine-function-optionally-computed-properties-conflict.mir b/llvm/test/CodeGen/MIR/Generic/machine-function-optionally-computed-properties-conflict.mir new file mode 100644 index 00000000000000..d8d178d90ae0af --- /dev/null +++ b/llvm/test/CodeGen/MIR/Generic/machine-function-optionally-computed-properties-conflict.mir @@ -0,0 +1,35 @@ +# RUN: not llc -run-pass none -o /dev/null %s 2>&1 | FileCheck %s + +# Test that computed properties are not conflicting with explicitly set +# properties + +--- +# CHECK: error: {{.*}}: TestNoPhisOverrideConflict has explicit property NoPhi, but contains at least one PHI +name: TestNoPhisOverrideConflict +noPhis: true +tracksRegLiveness: true +body: | + bb.0: + %0:_(s32) = G_IMPLICIT_DEF + + bb.1: + %1:_(s32) = PHI %0, %bb.0, %1, %bb.1 + G_BR %bb.1 +... +--- +# CHECK: error: {{.*}}: TestIsSSAOverrideConflict has explicit property IsSSA, but is not valid SSA +name: TestIsSSAOverrideConflict +isSSA: true +body: | + bb.0: + %0:_(s32) = G_IMPLICIT_DEF + %0:_(s32) = G_IMPLICIT_DEF +... 
+--- +# CHECK: error: {{.*}}: TestNoVRegsOverrideConflict has explicit property NoVRegs, but contains virtual registers +name: TestNoVRegsOverrideConflict +noVRegs: true +body: | + bb.0: + %0:_(s32) = G_IMPLICIT_DEF +... diff --git a/llvm/test/CodeGen/MIR/Generic/machine-function-optionally-computed-properties.mir b/llvm/test/CodeGen/MIR/Generic/machine-function-optionally-computed-properties.mir new file mode 100644 index 00000000000000..858bbc8394bb34 --- /dev/null +++ b/llvm/test/CodeGen/MIR/Generic/machine-function-optionally-computed-properties.mir @@ -0,0 +1,64 @@ +# RUN: llc -run-pass none -o - %s | FileCheck %s + +# Test that we can disable certain properties that are normally computed + +--- +# CHECK-LABEL: name: TestNoPhis +# CHECK: noPhis: true +# CHECK: ... +name: TestNoPhis +... +--- +# CHECK-LABEL: name: TestNoPhisOverride +# CHECK: noPhis: false +# CHECK: ... +name: TestNoPhisOverride +noPhis: false +... +--- +# CHECK-LABEL: name: TestNoPhisOverrideTrue +# CHECK: noPhis: true +# CHECK: ... +name: TestNoPhisOverrideTrue +noPhis: true +... +--- +# CHECK-LABEL: name: TestIsSSA +# CHECK: isSSA: true +# CHECK: ... +name: TestIsSSA +... +--- +# CHECK-LABEL: name: TestIsSSAOverride +# CHECK: isSSA: false +# CHECK: ... +name: TestIsSSAOverride +isSSA: false +... +--- +# CHECK-LABEL: name: TestIsSSAOverrideTrue +# CHECK: isSSA: true +# CHECK: ... +name: TestIsSSAOverrideTrue +isSSA: true +... +--- +# CHECK-LABEL: name: TestNoVRegs +# CHECK: noVRegs: true +# CHECK: ... +name: TestNoVRegs +... +--- +# CHECK-LABEL: name: TestNoVRegsOverride +# CHECK: noVRegs: false +# CHECK: ... +name: TestNoVRegsOverride +noVRegs: false +... +--- +# CHECK-LABEL: name: TestNoVRegsOverrideTrue +# CHECK: noVRegs: true +# CHECK: ... +name: TestNoVRegsOverrideTrue +noVRegs: true +... diff --git a/llvm/test/CodeGen/X86/sjlj-shadow-stack-liveness.mir b/llvm/test/CodeGen/X86/sjlj-shadow-stack-liveness.mir index 3def36f9d8ba91..83bc8ec510f646 100644 --- a/llvm/test/CodeGen/X86/sjlj-shadow-stack-liveness.mir +++ b/llvm/test/CodeGen/X86/sjlj-shadow-stack-liveness.mir @@ -14,6 +14,7 @@ name: bar # CHECK-LABEL: name: bar alignment: 16 tracksRegLiveness: true +noPhis: false body: | bb.0: %0:gr64 = IMPLICIT_DEF @@ -29,8 +30,6 @@ body: | ; CHECK-NOT: MOV64rm killed %0 ; CHECK-NEXT: MOV64rm killed %0 - ; FIXME: Dummy PHI to set the property NoPHIs to false. PR38439. bb.2: - %1:gr64 = PHI undef %1, %bb.2, undef %1, %bb.2 JMP_1 %bb.2 ... diff --git a/llvm/test/tools/llvm-reduce/mir/preserve-func-info.mir b/llvm/test/tools/llvm-reduce/mir/preserve-func-info.mir index 5f11cea89d7e7b..f735dfd5cbbf01 100644 --- a/llvm/test/tools/llvm-reduce/mir/preserve-func-info.mir +++ b/llvm/test/tools/llvm-reduce/mir/preserve-func-info.mir @@ -14,6 +14,9 @@ # RESULT-NEXT: failedISel: true # RESULT-NEXT: tracksRegLiveness: true # RESULT-NEXT: hasWinCFI: true +# RESULT-NEXT: noPhis: false +# RESULT-NEXT: isSSA: false +# RESULT-NEXT: noVRegs: false # RESULT-NEXT: callsEHReturn: true # RESULT-NEXT: callsUnwindInit: true # RESULT-NEXT: hasEHCatchret: true @@ -41,6 +44,9 @@ selected: true failedISel: true tracksRegLiveness: true hasWinCFI: true +noPhis: false +isSSA: false +noVRegs: false failsVerification: true tracksDebugUserValues: true callsEHReturn: true