From c5672e21ca2a16ff18cdaa83db11d2edb84c5e14 Mon Sep 17 00:00:00 2001
From: Sushant Gokhale
Date: Tue, 24 Sep 2024 14:35:01 +0530
Subject: [PATCH 01/22] [AArch64][CostModel] Reduce the cost of fadd reduction
 with fast flag (#108791)

An fadd reduction lowers to a series of faddp instructions when
1. the fast flag is set, and
2. the number of elements in the input vector is a power of 2.

faddp has latency/throughput identical to fadd, so we set relative
cost = 1 for faddp as well. For example, a fast <8 x float> reduction
legalizes to two v4f32 registers and costs (2 - 1) + log2(4) = 3.

The change showed no regressions with SPEC17-FP (C/C++) and
llvm-test-suite on Neoverse-V2.
---
 .../AArch64/AArch64TargetTransformInfo.cpp    |  20 ++++
 .../Analysis/CostModel/AArch64/reduce-fadd.ll | 104 ++++++++--------
 .../SLPVectorizer/AArch64/reduce-fadd.ll      | 113 ++++++------------
 3 files changed, 107 insertions(+), 130 deletions(-)

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 11a4aa4d01e123..da0798ebf79578 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -4159,6 +4159,26 @@ AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
   switch (ISD) {
   default:
     break;
+  case ISD::FADD:
+    if (Type *EltTy = ValTy->getScalarType();
+        // FIXME: For half types without fullfp16 support, this could extend and
+        // use a fp32 faddp reduction but current codegen unrolls.
+        MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() ||
+                           (EltTy->isHalfTy() && ST->hasFullFP16()))) {
+      const unsigned NElts = MTy.getVectorNumElements();
+      if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 &&
+          isPowerOf2_32(NElts))
+        // A reduction over a series of fadd instructions is lowered to a
+        // series of faddp instructions. faddp has latency/throughput that
+        // matches the fadd instruction, and hence every faddp instruction
+        // can be considered to have a relative cost = 1 with
+        // CostKind = TCK_RecipThroughput.
+        // An faddp will pairwise add vector elements, so the size of the
+        // input vector reduces by half every time, requiring
+        // #(faddp instructions) = Log2_32(NElts).
+ return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(NElts); + } + break; case ISD::ADD: if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy)) return (LT.first - 1) + Entry->Cost; diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll index 58cb8c2c6a8d81..a95542f6901733 100644 --- a/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll +++ b/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll @@ -76,49 +76,49 @@ define void @fast_fp_reductions() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %fadd_v16f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f32 = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v9f64_reassoc = call reassoc double 
@llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f32 = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %fadd_v4f8 = call reassoc bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR8000, <4 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; FP16-LABEL: 'fast_fp_reductions' -; FP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v2f16_fast = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v2f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f16_fast = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x 
half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f16 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %fadd_v16f16 = call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %fadd_v16f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f32 = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef) -; 
FP16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f16_fast = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f16_fast = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v8f16 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v8f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v16f16 = call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v16f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f32 = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 1 
for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef) ; FP16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %fadd_v4f8 = call reassoc bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR8000, <4 x bfloat> undef) ; FP16-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) ; FP16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void @@ -134,20 +134,20 @@ define void @fast_fp_reductions() { ; BF16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %fadd_v16f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) ; BF16-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef) ; BF16-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f32 = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64_reassoc = 
call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f32 = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef) ; BF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f8 = call reassoc bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR8000, <4 x bfloat> undef) ; BF16-NEXT: Cost Model: Found an estimated cost of 12 for 
instruction: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) ; BF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll index edc0381aa3fcc2..6dceabe1d3243b 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll @@ -1,8 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux \ -; RUN: -mattr=-fullfp16 | FileCheck %s --check-prefixes=CHECK,NOFP16 -; RUN: opt < %s -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux \ -; RUN: -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,FP16 +; RUN: opt < %s -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux -mattr=-fullfp16 | FileCheck %s --check-prefixes=CHECK,NOFP16 +; RUN: opt < %s -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,FULLFP16 define half @reduce_fast_half2(<2 x half> %vec2) { ; CHECK-LABEL: define half @reduce_fast_half2( @@ -79,20 +77,26 @@ entry: } define half @reduce_fast_half8(<8 x half> %vec8) { -; CHECK-LABEL: define half @reduce_fast_half8( -; CHECK-SAME: <8 x half> [[VEC8:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[ELT4:%.*]] = extractelement <8 x half> [[VEC8]], i64 4 -; CHECK-NEXT: [[ELT5:%.*]] = extractelement <8 x half> [[VEC8]], i64 5 -; CHECK-NEXT: [[ELT6:%.*]] = extractelement <8 x half> [[VEC8]], i64 6 -; CHECK-NEXT: [[ELT7:%.*]] = extractelement <8 x half> [[VEC8]], i64 7 -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x half> [[VEC8]], <8 x half> poison, <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[TMP0]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast half [[TMP1]], [[ELT4]] -; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast half [[ELT5]], [[ELT6]] -; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast half [[OP_RDX]], [[OP_RDX1]] -; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast half [[OP_RDX2]], [[ELT7]] -; CHECK-NEXT: ret half [[OP_RDX3]] +; NOFP16-LABEL: define half @reduce_fast_half8( +; NOFP16-SAME: <8 x half> [[VEC8:%.*]]) #[[ATTR0]] { +; NOFP16-NEXT: [[ENTRY:.*:]] +; NOFP16-NEXT: [[ELT4:%.*]] = extractelement <8 x half> [[VEC8]], i64 4 +; NOFP16-NEXT: [[ELT5:%.*]] = extractelement <8 x half> [[VEC8]], i64 5 +; NOFP16-NEXT: [[ELT6:%.*]] = extractelement <8 x half> [[VEC8]], i64 6 +; NOFP16-NEXT: [[ELT7:%.*]] = extractelement <8 x half> [[VEC8]], i64 7 +; NOFP16-NEXT: [[TMP0:%.*]] = shufflevector <8 x half> [[VEC8]], <8 x half> poison, <4 x i32> +; NOFP16-NEXT: [[TMP1:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[TMP0]]) +; NOFP16-NEXT: [[OP_RDX:%.*]] = fadd fast half [[TMP1]], [[ELT4]] +; NOFP16-NEXT: [[OP_RDX1:%.*]] = fadd fast half [[ELT5]], [[ELT6]] +; NOFP16-NEXT: [[OP_RDX2:%.*]] = fadd fast half [[OP_RDX]], [[OP_RDX1]] +; NOFP16-NEXT: [[OP_RDX3:%.*]] = fadd fast half [[OP_RDX2]], [[ELT7]] +; NOFP16-NEXT: ret half [[OP_RDX3]] +; +; FULLFP16-LABEL: define half @reduce_fast_half8( +; FULLFP16-SAME: <8 x half> [[VEC8:%.*]]) #[[ATTR0]] { +; FULLFP16-NEXT: [[ENTRY:.*:]] +; FULLFP16-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> [[VEC8]]) +; FULLFP16-NEXT: ret half [[TMP0]] ; entry: %elt0 = extractelement 
<8 x half> %vec8, i64 0 @@ -154,37 +158,11 @@ entry: } define half @reduce_fast_half16(<16 x half> %vec16) { -; NOFP16-LABEL: define half @reduce_fast_half16( -; NOFP16-SAME: <16 x half> [[VEC16:%.*]]) #[[ATTR0]] { -; NOFP16-NEXT: [[ENTRY:.*:]] -; NOFP16-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> [[VEC16]]) -; NOFP16-NEXT: ret half [[TMP0]] -; -; FP16-LABEL: define half @reduce_fast_half16( -; FP16-SAME: <16 x half> [[VEC16:%.*]]) #[[ATTR0]] { -; FP16-NEXT: [[ENTRY:.*:]] -; FP16-NEXT: [[ELT4:%.*]] = extractelement <16 x half> [[VEC16]], i64 4 -; FP16-NEXT: [[ELT5:%.*]] = extractelement <16 x half> [[VEC16]], i64 5 -; FP16-NEXT: [[ELT6:%.*]] = extractelement <16 x half> [[VEC16]], i64 6 -; FP16-NEXT: [[ELT7:%.*]] = extractelement <16 x half> [[VEC16]], i64 7 -; FP16-NEXT: [[ELT12:%.*]] = extractelement <16 x half> [[VEC16]], i64 12 -; FP16-NEXT: [[ELT13:%.*]] = extractelement <16 x half> [[VEC16]], i64 13 -; FP16-NEXT: [[ELT14:%.*]] = extractelement <16 x half> [[VEC16]], i64 14 -; FP16-NEXT: [[ELT15:%.*]] = extractelement <16 x half> [[VEC16]], i64 15 -; FP16-NEXT: [[TMP0:%.*]] = shufflevector <16 x half> [[VEC16]], <16 x half> poison, <4 x i32> -; FP16-NEXT: [[TMP1:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[TMP0]]) -; FP16-NEXT: [[TMP2:%.*]] = shufflevector <16 x half> [[VEC16]], <16 x half> poison, <4 x i32> -; FP16-NEXT: [[TMP3:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[TMP2]]) -; FP16-NEXT: [[OP_RDX:%.*]] = fadd fast half [[TMP1]], [[TMP3]] -; FP16-NEXT: [[OP_RDX1:%.*]] = fadd fast half [[OP_RDX]], [[ELT4]] -; FP16-NEXT: [[OP_RDX2:%.*]] = fadd fast half [[ELT5]], [[ELT6]] -; FP16-NEXT: [[OP_RDX3:%.*]] = fadd fast half [[ELT7]], [[ELT12]] -; FP16-NEXT: [[OP_RDX4:%.*]] = fadd fast half [[ELT13]], [[ELT14]] -; FP16-NEXT: [[OP_RDX5:%.*]] = fadd fast half [[OP_RDX1]], [[OP_RDX2]] -; FP16-NEXT: [[OP_RDX6:%.*]] = fadd fast half [[OP_RDX3]], [[OP_RDX4]] -; FP16-NEXT: [[OP_RDX7:%.*]] = fadd fast half [[OP_RDX5]], [[OP_RDX6]] -; FP16-NEXT: [[OP_RDX8:%.*]] = fadd fast half [[OP_RDX7]], [[ELT15]] -; FP16-NEXT: ret half [[OP_RDX8]] +; CHECK-LABEL: define half @reduce_fast_half16( +; CHECK-SAME: <16 x half> [[VEC16:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> [[VEC16]]) +; CHECK-NEXT: ret half [[TMP0]] ; entry: %elt0 = extractelement <16 x half> %vec16, i64 0 @@ -512,19 +490,11 @@ define float @reduce_fast_float_case1(ptr %a) { ; CHECK-LABEL: define float @reduce_fast_float_case1( ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[LOAD:%.*]] = load float, ptr [[A]], align 4 -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 4 -; CHECK-NEXT: [[LOAD1:%.*]] = load float, ptr [[GEP]], align 4 -; CHECK-NEXT: [[ADD1:%.*]] = fadd fast float [[LOAD1]], [[LOAD]] -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 8 -; CHECK-NEXT: [[LOAD2:%.*]] = load float, ptr [[GEP2]], align 4 -; CHECK-NEXT: [[ADD2:%.*]] = fadd fast float [[LOAD2]], [[ADD1]] -; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 12 -; CHECK-NEXT: [[LOAD3:%.*]] = load float, ptr [[GEP3]], align 4 -; CHECK-NEXT: [[ADD3:%.*]] = fadd fast float [[LOAD3]], [[ADD2]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 4 ; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 16 ; CHECK-NEXT: 
[[LOAD4:%.*]] = load float, ptr [[GEP4]], align 4
-; CHECK-NEXT:    [[ADD4:%.*]] = fadd fast float [[LOAD4]], [[ADD3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP0]])
+; CHECK-NEXT:    [[ADD4:%.*]] = fadd fast float [[TMP1]], [[LOAD4]]
 ; CHECK-NEXT:    ret float [[ADD4]]
 ;
 entry:
@@ -586,24 +556,11 @@ define float @reduce_fast_float_case2(ptr %a, ptr %b) {
 ; CHECK-LABEL: define float @reduce_fast_float_case2(
 ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    [[GEPA2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2
-; CHECK-NEXT:    [[GEPA3:%.*]] = getelementptr inbounds float, ptr [[A]], i32 3
-; CHECK-NEXT:    [[GEPB2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2
-; CHECK-NEXT:    [[GEPB3:%.*]] = getelementptr inbounds float, ptr [[B]], i32 3
-; CHECK-NEXT:    [[LOADA2:%.*]] = load float, ptr [[GEPA2]], align 4
-; CHECK-NEXT:    [[LOADA3:%.*]] = load float, ptr [[GEPA3]], align 4
-; CHECK-NEXT:    [[LOADB2:%.*]] = load float, ptr [[GEPB2]], align 4
-; CHECK-NEXT:    [[LOADB3:%.*]] = load float, ptr [[GEPB3]], align 4
-; CHECK-NEXT:    [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 4
-; CHECK-NEXT:    [[TMP1:%.*]] = load <2 x float>, ptr [[B]], align 4
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd fast <2 x float> [[TMP0]], [[TMP1]]
-; CHECK-NEXT:    [[ADD2:%.*]] = fadd fast float [[LOADA3]], [[LOADB2]]
-; CHECK-NEXT:    [[ADD3:%.*]] = fadd fast float [[LOADA2]], [[LOADB3]]
-; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0
-; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1
-; CHECK-NEXT:    [[RED1:%.*]] = fadd fast float [[TMP3]], [[TMP4]]
-; CHECK-NEXT:    [[RED2:%.*]] = fadd fast float [[ADD2]], [[RED1]]
-; CHECK-NEXT:    [[RED3:%.*]] = fadd fast float [[ADD3]], [[RED2]]
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, ptr [[B]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> poison, <4 x float> [[TMP1]], i64 0)
+; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP2]], <4 x float> [[TMP0]], i64 4)
+; CHECK-NEXT:    [[RED3:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP3]])
 ; CHECK-NEXT:    ret float [[RED3]]
 ;
 entry:

From 3659aa8079e00d7bd4f2d9c68c404a93ec297200 Mon Sep 17 00:00:00 2001
From: Pravin Jagtap
Date: Tue, 24 Sep 2024 14:41:45 +0530
Subject: [PATCH 02/22] [AMDGPU] Fix handling of DBG_VALUE_LIST while fixing
 the dead frame indices. (#109685)

Both the SGPR->VGPR and VGPR->AGPR spilling code apply a fixup to the
spill frame indices referenced in debug instructions so that those
instructions can be removed entirely. The stack argument is at operand
index 0 in DBG_VALUE and at operand index 2 in DBG_VALUE_LIST.
Fixes: SWDEV-484156 --- llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 13 +++-- llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp | 13 +++-- ...ip-processing-stack-arg-dbg-value-list.mir | 53 +++++++++++++++++++ ...ip-processing-stack-arg-dbg-value-list.mir | 52 ++++++++++++++++++ 4 files changed, 123 insertions(+), 8 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value-list.mir create mode 100644 llvm/test/CodeGen/AMDGPU/vgpr-spill-fi-skip-processing-stack-arg-dbg-value-list.mir diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 2c67c4aedfe475..50a6f028f66de6 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -1418,10 +1418,15 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( // the debug value instructions. We should instead, update it with the // correct register value. But not sure the register value alone is for (MachineInstr &MI : MBB) { - if (MI.isDebugValue() && MI.getOperand(0).isFI() && - !MFI.isFixedObjectIndex(MI.getOperand(0).getIndex()) && - SpillFIs[MI.getOperand(0).getIndex()]) { - MI.getOperand(0).ChangeToRegister(Register(), false /*isDef*/); + if (MI.isDebugValue()) { + uint32_t StackOperandIdx = MI.isDebugValueList() ? 2 : 0; + if (MI.getOperand(StackOperandIdx).isFI() && + !MFI.isFixedObjectIndex( + MI.getOperand(StackOperandIdx).getIndex()) && + SpillFIs[MI.getOperand(StackOperandIdx).getIndex()]) { + MI.getOperand(StackOperandIdx) + .ChangeToRegister(Register(), false /*isDef*/); + } } } } diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index 28bba8cfd73528..35e5bea9ae16e2 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -418,10 +418,15 @@ bool SILowerSGPRSpills::run(MachineFunction &MF) { // correct register value. But not sure the register value alone is // adequate to lower the DIExpression. It should be worked out later. for (MachineInstr &MI : MBB) { - if (MI.isDebugValue() && MI.getOperand(0).isFI() && - !MFI.isFixedObjectIndex(MI.getOperand(0).getIndex()) && - SpillFIs[MI.getOperand(0).getIndex()]) { - MI.getOperand(0).ChangeToRegister(Register(), false /*isDef*/); + if (MI.isDebugValue()) { + uint32_t StackOperandIdx = MI.isDebugValueList() ? 
2 : 0; + if (MI.getOperand(StackOperandIdx).isFI() && + !MFI.isFixedObjectIndex( + MI.getOperand(StackOperandIdx).getIndex()) && + SpillFIs[MI.getOperand(StackOperandIdx).getIndex()]) { + MI.getOperand(StackOperandIdx) + .ChangeToRegister(Register(), false /*isDef*/); + } } } } diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value-list.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value-list.mir new file mode 100644 index 00000000000000..cdf2b41c1e5b45 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value-list.mir @@ -0,0 +1,53 @@ +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -run-pass=si-lower-sgpr-spills -o - %s | FileCheck %s + +--- | + define amdgpu_kernel void @test() { ret void } + + !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !4, producer: "llvm", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !4) + !1 = !DILocalVariable(name: "a", scope: !2, file: !4, line: 126, type: !6) + !2 = distinct !DISubprogram(name: "test", scope: !4, file: !4, line: 1, type: !3, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !5) + !3 = !DISubroutineType(types: !4) + !4 = !DIFile(filename: "dummy", directory: "/") + !5 = !{!1} + !6 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !7, size: 64, align: 32) + !7 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) + !8 = !DIExpression() + !9 = !DILocation(line: 10, column: 9, scope: !2) + +... +--- +name: test +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 +fixedStack: + - { id: 0, type: default, offset: 4, size: 4, alignment: 4, stack-id: default } +stack: + - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } +machineFunctionInfo: + maxKernArgAlign: 4 + isEntryFunction: true + waveLimiter: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' + hasSpilledSGPRs: true + argumentInfo: + privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } + dispatchPtr: { reg: '$sgpr4_sgpr5' } + kernargSegmentPtr: { reg: '$sgpr6_sgpr7' } + workGroupIDX: { reg: '$sgpr8' } + privateSegmentWaveByteOffset: { reg: '$sgpr9' } +body: | + ; CHECK-LABEL: name: test + ; CHECK: bb.0: + ; CHECK: DBG_VALUE_LIST <{{.*}}>, !DIExpression(), $noreg, 0, debug-location !DILocation(line: 10, column: 9, scope: <{{.*}}>) + + bb.0: + renamable $sgpr10 = IMPLICIT_DEF + SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 + DBG_VALUE_LIST !1, !8, %stack.0, 0, debug-location !9 + + bb.1: + renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 + S_ENDPGM 0 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-fi-skip-processing-stack-arg-dbg-value-list.mir b/llvm/test/CodeGen/AMDGPU/vgpr-spill-fi-skip-processing-stack-arg-dbg-value-list.mir new file mode 100644 index 00000000000000..53629cdfb932b2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-fi-skip-processing-stack-arg-dbg-value-list.mir @@ -0,0 +1,52 @@ +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-vgpr-to-agpr=true -run-pass=prologepilog -o - %s | FileCheck %s + +--- | + define amdgpu_kernel void @test() { ret void } + + !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !4, producer: "llvm", isOptimized: 
true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !4) + !1 = !DILocalVariable(name: "a", scope: !2, file: !4, line: 126, type: !6) + !2 = distinct !DISubprogram(name: "test", scope: !4, file: !4, line: 1, type: !3, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !5) + !3 = !DISubroutineType(types: !4) + !4 = !DIFile(filename: "dummy", directory: "/") + !5 = !{!1} + !6 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !7, size: 64, align: 32) + !7 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) + !8 = !DIExpression() + !9 = !DILocation(line: 10, column: 9, scope: !2) + +... +--- +name: test +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 +fixedStack: + - { id: 0, type: default, offset: 4, size: 4, alignment: 4, stack-id: default } +stack: + - { id: 0, type: spill-slot, size: 4, alignment: 4 } +machineFunctionInfo: + maxKernArgAlign: 4 + isEntryFunction: true + waveLimiter: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' + hasSpilledVGPRs: true + argumentInfo: + privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } + dispatchPtr: { reg: '$sgpr4_sgpr5' } + kernargSegmentPtr: { reg: '$sgpr6_sgpr7' } + workGroupIDX: { reg: '$sgpr8' } + privateSegmentWaveByteOffset: { reg: '$sgpr9' } +body: | + ; CHECK-LABEL: name: test + ; CHECK: bb.0: + ; CHECK: DBG_VALUE_LIST <{{.*}}>, !DIExpression(), $noreg, 0, debug-location !DILocation(line: 10, column: 9, scope: <{{.*}}>) + bb.0: + $vgpr2 = IMPLICIT_DEF + SI_SPILL_V32_SAVE $vgpr2, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, align 4, addrspace 5) + DBG_VALUE_LIST !1, !8, %stack.0, 0, debug-location !9 + + bb.1: + renamable $vgpr2 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, align 4, addrspace 5) + S_ENDPGM 0 From 30dbbdd2ea25e3ab5596e1fb0474696b242a760c Mon Sep 17 00:00:00 2001 From: Dmitry Chernenkov Date: Tue, 24 Sep 2024 09:17:24 +0000 Subject: [PATCH 03/22] [Bazel] Fix for 127349fcba81646389e4b8202b35405a5fdbef47 --- .../libc/test/src/math/libc_math_test_rules.bzl | 3 +++ 1 file changed, 3 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl b/utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl index d788705fc3e604..16845ab66dfd45 100644 --- a/utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl +++ b/utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl @@ -28,7 +28,9 @@ def math_test(name, hdrs = [], deps = [], **kwargs): "//libc:__support_cpp_algorithm", "//libc:__support_cpp_bit", "//libc:__support_cpp_limits", + "//libc:__support_cpp_type_traits", "//libc:__support_fputil_basic_operations", + "//libc:__support_fputil_cast", "//libc:__support_fputil_fenv_impl", "//libc:__support_fputil_fp_bits", "//libc:__support_fputil_manipulation_functions", @@ -36,6 +38,7 @@ def math_test(name, hdrs = [], deps = [], **kwargs): "//libc:__support_fputil_normal_float", "//libc:__support_macros_properties_architectures", "//libc:__support_macros_properties_os", + "//libc:__support_macros_properties_types", "//libc:__support_math_extras", "//libc:__support_uint128", "//libc:hdr_errno_macros", From cc7b24a4d125e9a81480aaaa961a2b963bbb2ea2 Mon Sep 17 00:00:00 2001 From: Piotr Fusik Date: Tue, 24 Sep 2024 11:19:56 +0200 Subject: [PATCH 04/22] [NFC] Fix typos in comments (#109765) 
--- llvm/include/llvm/CodeGen/ISDOpcodes.h | 2 +- llvm/lib/Analysis/VectorUtils.cpp | 2 +- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 2 +- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 +- llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index 18ed60ebb124dc..da43f5be10ff3b 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -825,7 +825,7 @@ enum NodeType { /// be saturated against signed values, resulting in `S`, which will combine /// to `TRUNCATE_SSAT_S`. If the value of C ranges from `0 to 255`, it will /// be saturated against unsigned values, resulting in `U`, which will - /// combine to `TRUNATE_SSAT_U`. Similarly, in `truncate(umin(x, C))`, if + /// combine to `TRUNCATE_SSAT_U`. Similarly, in `truncate(umin(x, C))`, if /// value of C ranges from `0 to 255`, it becomes `U` because it is saturated /// for unsigned values. As a result, it combines to `TRUNCATE_USAT_U`. TRUNCATE_SSAT_S, // saturate signed input to signed result - diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index d45d3bbefe4fd3..dbffbb8a5f81d9 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -1454,7 +1454,7 @@ void InterleavedAccessInfo::analyzeInterleaving( // that all the pointers in the group don't wrap. // So we check only group member 0 (which is always guaranteed to exist), // and group member Factor - 1; If the latter doesn't exist we rely on - // peeling (if it is a non-reversed accsess -- see Case 3). + // peeling (if it is a non-reversed access -- see Case 3). if (InvalidateGroupIfMemberMayWrap(Group, 0, std::string("first"))) continue; if (Group->getMember(Group->getFactor() - 1)) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index fcd46b5921c4de..05ba18bf8ebd88 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -2725,7 +2725,7 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base, } /// Similar to SelectAddrRegImm, except that the least significant 5 bits of -/// Offset shoule be all zeros. +/// Offset should be all zeros. 
bool RISCVDAGToDAGISel::SelectAddrRegImmLsb00000(SDValue Addr, SDValue &Base, SDValue &Offset) { if (SelectAddrFrameIndex(Addr, Base, Offset)) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 2aa89aca4c808d..b998a1eb11c300 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -4540,7 +4540,7 @@ static SDValue lowerScalarInsert(SDValue Scalar, SDValue VL, MVT VT, // t33: v8i8 = extract_subvector t11, Constant:i64<8> // a) t35: v8i8 = vector_shuffle<0,2,4,6,8,10,12,14> t34, t33 // b) t35: v8i8 = vector_shuffle<1,3,5,7,9,11,13,15> t34, t33 -// Returns {Src Vector, Even Elements} om success +// Returns {Src Vector, Even Elements} on success static bool isDeinterleaveShuffle(MVT VT, MVT ContainerVT, SDValue V1, SDValue V2, ArrayRef Mask, const RISCVSubtarget &Subtarget) { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index fe7de9d7bc79aa..68182d238e7847 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -232,7 +232,7 @@ class octuple_to_str { def VLOpFrag : PatFrag<(ops), (XLenVT (VLOp (XLenVT AVL:$vl)))>; // Output pattern for X0 used to represent VLMAX in the pseudo instructions. -// We can't use X0 register becuase the AVL operands use GPRNoX0. +// We can't use X0 register because the AVL operands use GPRNoX0. // This must be kept in sync with RISCV::VLMaxSentinel. def VLMax : OutPatFrag<(ops), (XLenVT -1)>; From bfd8f7ee4a85ae8873db14fa6e7e31223a1df169 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 24 Sep 2024 10:37:02 +0100 Subject: [PATCH 05/22] [X86] SimplifyDemandedVectorEltsForTargetNode - reduce vector width of FRSQRT/FRCP ymm nodes. If we only demand the lower subvector of a FRSQRT/FRCP node, then reduce the width of the instruction. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 2 ++ llvm/test/CodeGen/X86/extractelement-fp.ll | 10 ++++------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index de8cfe31a5529f..d7a26dc4caec6c 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -43109,6 +43109,8 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( case X86ISD::FMIN: case X86ISD::FMAXC: case X86ISD::FMINC: + case X86ISD::FRSQRT: + case X86ISD::FRCP: // Horizontal Ops. 
case X86ISD::HADD: case X86ISD::HSUB: diff --git a/llvm/test/CodeGen/X86/extractelement-fp.ll b/llvm/test/CodeGen/X86/extractelement-fp.ll index 38162f676e7ee3..944f6bbfd0bfbe 100644 --- a/llvm/test/CodeGen/X86/extractelement-fp.ll +++ b/llvm/test/CodeGen/X86/extractelement-fp.ll @@ -1310,15 +1310,14 @@ define float @rcp_v4f32(<4 x float> %x) nounwind { define float @rcp_v8f32(<8 x float> %x) nounwind { ; X64-LABEL: rcp_v8f32: ; X64: # %bb.0: -; X64-NEXT: vrcpps %ymm0, %ymm0 -; X64-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; X64-NEXT: vrcpss %xmm0, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq ; ; X86-LABEL: rcp_v8f32: ; X86: # %bb.0: ; X86-NEXT: pushl %eax -; X86-NEXT: vrcpps %ymm0, %ymm0 +; X86-NEXT: vrcpss %xmm0, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) ; X86-NEXT: popl %eax @@ -1351,15 +1350,14 @@ define float @rsqrt_v4f32(<4 x float> %x) nounwind { define float @rsqrt_v8f32(<8 x float> %x) nounwind { ; X64-LABEL: rsqrt_v8f32: ; X64: # %bb.0: -; X64-NEXT: vrsqrtps %ymm0, %ymm0 -; X64-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; X64-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq ; ; X86-LABEL: rsqrt_v8f32: ; X86: # %bb.0: ; X86-NEXT: pushl %eax -; X86-NEXT: vrsqrtps %ymm0, %ymm0 +; X86-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) ; X86-NEXT: popl %eax From 396f6775143ffa80b9f0e72e7250613092d88124 Mon Sep 17 00:00:00 2001 From: Scott Egerton <9487234+ScottEgerton@users.noreply.github.com> Date: Tue, 24 Sep 2024 10:58:00 +0100 Subject: [PATCH 06/22] [AMDGPU] Remove unused VGPRSingleUseHintInsts feature (#109769) --- llvm/docs/AMDGPUUsage.rst | 4 +- llvm/lib/Target/AMDGPU/AMDGPU.h | 3 - llvm/lib/Target/AMDGPU/AMDGPU.td | 13 - .../AMDGPU/AMDGPUInsertSingleUseVDST.cpp | 245 --- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 10 - llvm/lib/Target/AMDGPU/CMakeLists.txt | 1 - llvm/lib/Target/AMDGPU/GCNSubtarget.h | 3 - llvm/lib/Target/AMDGPU/SIInstrInfo.td | 2 - llvm/lib/Target/AMDGPU/SOPInstructions.td | 11 - .../Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp | 18 - llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h | 2 + llvm/lib/Target/AMDGPU/VOP1Instructions.td | 18 +- llvm/lib/Target/AMDGPU/VOP2Instructions.td | 6 +- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 35 +- llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 12 +- llvm/lib/Target/AMDGPU/VOPCInstructions.td | 13 +- llvm/lib/Target/AMDGPU/VOPInstructions.td | 20 +- .../CodeGen/AMDGPU/insert-singleuse-vdst.mir | 1420 ----------------- llvm/test/MC/AMDGPU/gfx1150_asm_sopp.s | 10 - llvm/test/MC/AMDGPU/gfx11_unsupported.s | 3 - llvm/test/MC/AMDGPU/gfx12_asm_sopp.s | 9 - .../MC/Disassembler/AMDGPU/decode-err.txt | 5 - .../Disassembler/AMDGPU/gfx1150_dasm_sopp.txt | 10 - .../Disassembler/AMDGPU/gfx12_dasm_sopp.txt | 8 - .../secondary/llvm/lib/Target/AMDGPU/BUILD.gn | 1 - 25 files changed, 34 insertions(+), 1848 deletions(-) delete mode 100644 llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp delete mode 100644 llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir delete mode 100644 llvm/test/MC/AMDGPU/gfx1150_asm_sopp.s delete mode 100644 llvm/test/MC/Disassembler/AMDGPU/gfx1150_dasm_sopp.txt diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 4b48b54b18bb99..9e11b13c101d47 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -611,9 +611,7 @@ Generic processor code objects are versioned. 
See :ref:`amdgpu-generic-processor - ``gfx1152`` SALU floating point instructions - and single-use VGPR hint - instructions are not available - on: + are not available on: - ``gfx1150`` - ``gfx1151`` diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index b2dd354e496a2e..4abb5a63ab6d2c 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -405,9 +405,6 @@ extern char &SIModeRegisterID; void initializeAMDGPUInsertDelayAluPass(PassRegistry &); extern char &AMDGPUInsertDelayAluID; -void initializeAMDGPUInsertSingleUseVDSTPass(PassRegistry &); -extern char &AMDGPUInsertSingleUseVDSTID; - void initializeSIInsertHardClausesPass(PassRegistry &); extern char &SIInsertHardClausesID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 919e698e76b33b..3626fd8bc78c15 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -929,12 +929,6 @@ def FeatureSALUFloatInsts : SubtargetFeature<"salu-float", "Has SALU floating point instructions" >; -def FeatureVGPRSingleUseHintInsts : SubtargetFeature<"vgpr-singleuse-hint", - "HasVGPRSingleUseHintInsts", - "true", - "Has single-use VGPR hint instructions" ->; - def FeaturePseudoScalarTrans : SubtargetFeature<"pseudo-scalar-trans", "HasPseudoScalarTrans", "true", @@ -1615,14 +1609,12 @@ def FeatureISAVersion11_5_0 : FeatureSet< !listconcat(FeatureISAVersion11_Common.Features, [FeatureSALUFloatInsts, FeatureDPPSrc1SGPR, - FeatureVGPRSingleUseHintInsts, FeatureRequiredExportPriority])>; def FeatureISAVersion11_5_1 : FeatureSet< !listconcat(FeatureISAVersion11_Common.Features, [FeatureSALUFloatInsts, FeatureDPPSrc1SGPR, - FeatureVGPRSingleUseHintInsts, Feature1_5xVGPRs, FeatureRequiredExportPriority])>; @@ -1630,7 +1622,6 @@ def FeatureISAVersion11_5_2 : FeatureSet< !listconcat(FeatureISAVersion11_Common.Features, [FeatureSALUFloatInsts, FeatureDPPSrc1SGPR, - FeatureVGPRSingleUseHintInsts, FeatureRequiredExportPriority])>; def FeatureISAVersion12 : FeatureSet< @@ -1663,7 +1654,6 @@ def FeatureISAVersion12 : FeatureSet< FeatureSALUFloatInsts, FeaturePseudoScalarTrans, FeatureHasRestrictedSOffset, - FeatureVGPRSingleUseHintInsts, FeatureScalarDwordx3Loads, FeatureDPPSrc1SGPR, FeatureMaxHardClauseLength32, @@ -2271,9 +2261,6 @@ def HasNotMADIntraFwdBug : Predicate<"!Subtarget->hasMADIntraFwdBug()">; def HasSALUFloatInsts : Predicate<"Subtarget->hasSALUFloatInsts()">, AssemblerPredicate<(all_of FeatureSALUFloatInsts)>; -def HasVGPRSingleUseHintInsts : Predicate<"Subtarget->hasVGPRSingleUseHintInsts()">, - AssemblerPredicate<(all_of FeatureVGPRSingleUseHintInsts)>; - def HasPseudoScalarTrans : Predicate<"Subtarget->hasPseudoScalarTrans()">, AssemblerPredicate<(all_of FeaturePseudoScalarTrans)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp deleted file mode 100644 index 43b3bf43fe56db..00000000000000 --- a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp +++ /dev/null @@ -1,245 +0,0 @@ -//===- AMDGPUInsertSingleUseVDST.cpp - Insert s_singleuse_vdst instructions ==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file -/// Insert s_singleuse_vdst instructions on GFX11.5+ to mark regions of VALU -/// instructions that produce single-use VGPR values. If the value is forwarded -/// to the consumer instruction prior to VGPR writeback, the hardware can -/// then skip (kill) the VGPR write. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUGenSearchableTables.inc" -#include "GCNSubtarget.h" -#include "SIInstrInfo.h" -#include "SIRegisterInfo.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineOperand.h" -#include "llvm/CodeGen/Register.h" -#include "llvm/IR/DebugLoc.h" -#include "llvm/MC/MCRegister.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/Pass.h" -#include - -using namespace llvm; - -#define DEBUG_TYPE "amdgpu-insert-single-use-vdst" - -namespace { -class AMDGPUInsertSingleUseVDST : public MachineFunctionPass { -private: - const SIInstrInfo *SII; - class SingleUseInstruction { - private: - static const unsigned MaxSkipRange = 0b111; - static const unsigned MaxNumberOfSkipRegions = 2; - - unsigned LastEncodedPositionEnd; - MachineInstr *ProducerInstr; - - std::array SingleUseRegions; - SmallVector SkipRegions; - - // Adds a skip region into the instruction. - void skip(const unsigned ProducerPosition) { - while (LastEncodedPositionEnd + MaxSkipRange < ProducerPosition) { - SkipRegions.push_back(MaxSkipRange); - LastEncodedPositionEnd += MaxSkipRange; - } - SkipRegions.push_back(ProducerPosition - LastEncodedPositionEnd); - LastEncodedPositionEnd = ProducerPosition; - } - - bool currentRegionHasSpace() { - const auto Region = SkipRegions.size(); - // The first region has an extra bit of encoding space. - return SingleUseRegions[Region] < - ((Region == MaxNumberOfSkipRegions) ? 0b1111U : 0b111U); - } - - unsigned encodeImm() { - // Handle the first Single Use Region separately as it has an extra bit - // of encoding space. - unsigned Imm = SingleUseRegions[SkipRegions.size()]; - unsigned ShiftAmount = 4; - for (unsigned i = SkipRegions.size(); i > 0; i--) { - Imm |= SkipRegions[i - 1] << ShiftAmount; - ShiftAmount += 3; - Imm |= SingleUseRegions[i - 1] << ShiftAmount; - ShiftAmount += 3; - } - return Imm; - } - - public: - SingleUseInstruction(const unsigned ProducerPosition, - MachineInstr *Producer) - : LastEncodedPositionEnd(ProducerPosition + 1), ProducerInstr(Producer), - SingleUseRegions({1, 0, 0}) {} - - // Returns false if adding a new single use producer failed. This happens - // because it could not be encoded, either because there is no room to - // encode another single use producer region or that this single use - // producer is too far away to encode the amount of instructions to skip. - bool tryAddProducer(const unsigned ProducerPosition, MachineInstr *MI) { - // Producer is too far away to encode into this instruction or another - // skip region is needed and SkipRegions.size() = 2 so there's no room for - // another skip region, therefore a new instruction is needed. 
-      if (LastEncodedPositionEnd +
-              (MaxSkipRange * (MaxNumberOfSkipRegions - SkipRegions.size())) <
-          ProducerPosition)
-        return false;
-
-      // If a skip region is needed.
-      if (LastEncodedPositionEnd != ProducerPosition ||
-          !currentRegionHasSpace()) {
-        // If the current region is out of space therefore a skip region would
-        // be needed, but there is no room for another skip region.
-        if (SkipRegions.size() == MaxNumberOfSkipRegions)
-          return false;
-        skip(ProducerPosition);
-      }
-
-      SingleUseRegions[SkipRegions.size()]++;
-      LastEncodedPositionEnd = ProducerPosition + 1;
-      ProducerInstr = MI;
-      return true;
-    }
-
-    auto emit(const SIInstrInfo *SII) {
-      return BuildMI(*ProducerInstr->getParent(), ProducerInstr, DebugLoc(),
-                     SII->get(AMDGPU::S_SINGLEUSE_VDST))
-          .addImm(encodeImm());
-    }
-  };
-
-public:
-  static char ID;
-
-  AMDGPUInsertSingleUseVDST() : MachineFunctionPass(ID) {}
-
-  void insertSingleUseInstructions(
-      ArrayRef<std::pair<unsigned, MachineInstr *>> SingleUseProducers) const {
-    SmallVector<SingleUseInstruction> Instructions;
-
-    for (auto &[Position, MI] : SingleUseProducers) {
-      // Encode this position into the last single use instruction if possible.
-      if (Instructions.empty() ||
-          !Instructions.back().tryAddProducer(Position, MI)) {
-        // If not, add a new instruction.
-        Instructions.push_back(SingleUseInstruction(Position, MI));
-      }
-    }
-
-    for (auto &Instruction : Instructions)
-      Instruction.emit(SII);
-  }
-
-  bool runOnMachineFunction(MachineFunction &MF) override {
-    const auto &ST = MF.getSubtarget<GCNSubtarget>();
-    if (!ST.hasVGPRSingleUseHintInsts())
-      return false;
-
-    SII = ST.getInstrInfo();
-    const auto *TRI = &SII->getRegisterInfo();
-    bool InstructionEmitted = false;
-
-    for (MachineBasicBlock &MBB : MF) {
-      DenseMap<MCRegUnit, unsigned> RegisterUseCount;
-
-      // Handle boundaries at the end of basic block separately to avoid
-      // false positives. If they are live at the end of a basic block then
-      // assume it has more uses later on.
-      for (const auto &Liveout : MBB.liveouts()) {
-        for (MCRegUnitMaskIterator Units(Liveout.PhysReg, TRI); Units.isValid();
-             ++Units) {
-          const auto [Unit, Mask] = *Units;
-          if ((Mask & Liveout.LaneMask).any())
-            RegisterUseCount[Unit] = 2;
-        }
-      }
-
-      SmallVector<std::pair<unsigned, MachineInstr *>>
-          SingleUseProducerPositions;
-
-      unsigned VALUInstrCount = 0;
-      for (MachineInstr &MI : reverse(MBB.instrs())) {
-        // All registers in all operands need to be single use for an
-        // instruction to be marked as a single use producer.
-        bool AllProducerOperandsAreSingleUse = true;
-
-        // Gather a list of Registers used before updating use counts to avoid
-        // double counting registers that appear multiple times in a single
-        // MachineInstr.
-        SmallVector<MCRegUnit> RegistersUsed;
-
-        for (const auto &Operand : MI.all_defs()) {
-          const auto Reg = Operand.getReg();
-
-          const auto RegUnits = TRI->regunits(Reg);
-          if (any_of(RegUnits, [&RegisterUseCount](const MCRegUnit Unit) {
-                return RegisterUseCount[Unit] > 1;
-              }))
-            AllProducerOperandsAreSingleUse = false;
-
-          // Reset uses count when a register is no longer live.
-          for (const MCRegUnit Unit : RegUnits)
-            RegisterUseCount.erase(Unit);
-        }
-
-        for (const auto &Operand : MI.all_uses()) {
-          const auto Reg = Operand.getReg();
-
-          // Count the number of times each register is read.
-          for (const MCRegUnit Unit : TRI->regunits(Reg)) {
-            if (!is_contained(RegistersUsed, Unit))
-              RegistersUsed.push_back(Unit);
-          }
-        }
-        for (const MCRegUnit Unit : RegistersUsed)
-          RegisterUseCount[Unit]++;
-
-        // Do not attempt to optimise across exec mask changes.
-        if (MI.modifiesRegister(AMDGPU::EXEC, TRI) ||
-            AMDGPU::isInvalidSingleUseConsumerInst(MI.getOpcode())) {
-          for (auto &UsedReg : RegisterUseCount)
-            UsedReg.second = 2;
-        }
-
-        if (!SIInstrInfo::isVALU(MI) ||
-            AMDGPU::isInvalidSingleUseProducerInst(MI.getOpcode()))
-          continue;
-        if (AllProducerOperandsAreSingleUse) {
-          SingleUseProducerPositions.push_back({VALUInstrCount, &MI});
-          InstructionEmitted = true;
-        }
-        VALUInstrCount++;
-      }
-      insertSingleUseInstructions(SingleUseProducerPositions);
-    }
-    return InstructionEmitted;
-  }
-};
-} // namespace
-
-char AMDGPUInsertSingleUseVDST::ID = 0;
-
-char &llvm::AMDGPUInsertSingleUseVDSTID = AMDGPUInsertSingleUseVDST::ID;
-
-INITIALIZE_PASS(AMDGPUInsertSingleUseVDST, DEBUG_TYPE,
-                "AMDGPU Insert SingleUseVDST", false, false)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 04fdee0819b502..abd50748f2cc05 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -311,12 +311,6 @@ static cl::opt<bool> EnableSIModeRegisterPass(
   cl::init(true),
   cl::Hidden);
 
-// Enable GFX11.5+ s_singleuse_vdst insertion
-static cl::opt<bool>
-    EnableInsertSingleUseVDST("amdgpu-enable-single-use-vdst",
-                              cl::desc("Enable s_singleuse_vdst insertion"),
-                              cl::init(false), cl::Hidden);
-
 // Enable GFX11+ s_delay_alu insertion
 static cl::opt<bool>
     EnableInsertDelayAlu("amdgpu-enable-delay-alu",
@@ -450,7 +444,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
   initializeAMDGPURewriteUndefForPHILegacyPass(*PR);
   initializeAMDGPUUnifyMetadataPass(*PR);
   initializeSIAnnotateControlFlowLegacyPass(*PR);
-  initializeAMDGPUInsertSingleUseVDSTPass(*PR);
   initializeAMDGPUInsertDelayAluPass(*PR);
   initializeSIInsertHardClausesPass(*PR);
   initializeSIInsertWaitcntsPass(*PR);
@@ -1518,9 +1511,6 @@ void GCNPassConfig::addPreEmitPass() {
   // cases.
addPass(&PostRAHazardRecognizerID); - if (isPassEnabled(EnableInsertSingleUseVDST, CodeGenOptLevel::Less)) - addPass(&AMDGPUInsertSingleUseVDSTID); - if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less)) addPass(&AMDGPUInsertDelayAluID); diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index e813653158e5d9..7c883cc2017ddd 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -81,7 +81,6 @@ add_llvm_target(AMDGPUCodeGen AMDGPUMCInstLower.cpp AMDGPUMemoryUtils.cpp AMDGPUIGroupLP.cpp - AMDGPUInsertSingleUseVDST.cpp AMDGPUMarkLastScratchLoad.cpp AMDGPUMIRFormatter.cpp AMDGPUOpenCLEnqueuedBlockLowering.cpp diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index a4ae8a1be32258..e6b7342d5fffcf 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -215,7 +215,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasPackedTID = false; bool ScalarizeGlobal = false; bool HasSALUFloatInsts = false; - bool HasVGPRSingleUseHintInsts = false; bool HasPseudoScalarTrans = false; bool HasRestrictedSOffset = false; @@ -1280,8 +1279,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasSALUFloatInsts() const { return HasSALUFloatInsts; } - bool hasVGPRSingleUseHintInsts() const { return HasVGPRSingleUseHintInsts; } - bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; } bool hasRestrictedSOffset() const { return HasRestrictedSOffset; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index c016be2fc6c0fb..087ca1f954464d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2409,8 +2409,6 @@ class VOPProfile _ArgVT, bit _EnableClamp = 0> { field bit EnableClamp = _EnableClamp; field bit IsTrue16 = 0; field bit IsRealTrue16 = 0; - field bit IsInvalidSingleUseConsumer = 0; - field bit IsInvalidSingleUseProducer = 0; field ValueType DstVT = ArgVT[0]; field ValueType Src0VT = ArgVT[1]; diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 2e73a1a15f6b32..9da27a7c7ee7d6 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1752,11 +1752,6 @@ let OtherPredicates = [HasExportInsts] in "$simm16">; } // End SubtargetPredicate = isGFX11Plus -let SubtargetPredicate = HasVGPRSingleUseHintInsts in { - def S_SINGLEUSE_VDST : - SOPP_Pseudo<"s_singleuse_vdst", (ins s16imm:$simm16), "$simm16">; -} // End SubtargetPredicate = HasVGPRSingeUseHintInsts - let SubtargetPredicate = isGFX12Plus, hasSideEffects = 1 in { def S_WAIT_LOADCNT : SOPP_Pseudo<"s_wait_loadcnt", (ins s16imm:$simm16), "$simm16", @@ -2676,12 +2671,6 @@ defm S_ICACHE_INV : SOPP_Real_32_gfx11_gfx12<0x03c>; defm S_BARRIER : SOPP_Real_32_gfx11<0x03d>; -//===----------------------------------------------------------------------===// -// SOPP - GFX1150, GFX12. 
-//===----------------------------------------------------------------------===// - -defm S_SINGLEUSE_VDST : SOPP_Real_32_gfx11_gfx12<0x013>; - //===----------------------------------------------------------------------===// // SOPP - GFX6, GFX7, GFX8, GFX9, GFX10 //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 8b5ec8793d84a2..f32c82f1e4ba4c 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -379,12 +379,6 @@ struct VOPTrue16Info { bool IsTrue16; }; -struct SingleUseExceptionInfo { - uint16_t Opcode; - bool IsInvalidSingleUseConsumer; - bool IsInvalidSingleUseProducer; -}; - struct FP8DstByteSelInfo { uint16_t Opcode; bool HasFP8DstByteSel; @@ -396,8 +390,6 @@ struct FP8DstByteSelInfo { #define GET_MTBUFInfoTable_IMPL #define GET_MUBUFInfoTable_DECL #define GET_MUBUFInfoTable_IMPL -#define GET_SingleUseExceptionTable_DECL -#define GET_SingleUseExceptionTable_IMPL #define GET_SMInfoTable_DECL #define GET_SMInfoTable_IMPL #define GET_VOP1InfoTable_DECL @@ -626,16 +618,6 @@ bool isTrue16Inst(unsigned Opc) { return Info ? Info->IsTrue16 : false; } -bool isInvalidSingleUseConsumerInst(unsigned Opc) { - const SingleUseExceptionInfo *Info = getSingleUseExceptionHelper(Opc); - return Info && Info->IsInvalidSingleUseConsumer; -} - -bool isInvalidSingleUseProducerInst(unsigned Opc) { - const SingleUseExceptionInfo *Info = getSingleUseExceptionHelper(Opc); - return Info && Info->IsInvalidSingleUseProducer; -} - bool isFP8DstSelInst(unsigned Opc) { const FP8DstByteSelInfo *Info = getFP8DstByteSelHelper(Opc); return Info ? Info->HasFP8DstByteSel : false; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 35c080d8e0bebc..da37534f2fa4ff 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -870,6 +870,8 @@ bool isInvalidSingleUseConsumerInst(unsigned Opc); LLVM_READONLY bool isInvalidSingleUseProducerInst(unsigned Opc); +bool isDPMACCInstruction(unsigned Opc); + LLVM_READONLY unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc); diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 33f2f9f1f5c5b9..bd805059705783 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -252,7 +252,6 @@ def VOP_READFIRSTLANE : VOPProfile <[i32, i32, untyped, untyped]> { def V_READFIRSTLANE_B32 : VOP1_Pseudo <"v_readfirstlane_b32", VOP_READFIRSTLANE, [], 1> { let isConvergent = 1; - let IsInvalidSingleUseConsumer = 1; } foreach vt = Reg32Types.types in { @@ -375,7 +374,6 @@ defm V_CLREXCP : VOP1Inst <"v_clrexcp", VOP_NO_EXT>; def VOP_MOVRELS : VOPProfile<[i32, i32, untyped, untyped]> { let Src0RC32 = VRegSrc_32; let Src0RC64 = VRegSrc_32; - let IsInvalidSingleUseConsumer = 1; } // Special case because there are no true output operands. 
Hack vdst @@ -419,12 +417,8 @@ class VOP_MOVREL : VOPProfile<[untyped, i32, untyped, un let EmitDst = 1; // force vdst emission } -let IsInvalidSingleUseProducer = 1 in { - def VOP_MOVRELD : VOP_MOVREL; - def VOP_MOVRELSD : VOP_MOVREL { - let IsInvalidSingleUseConsumer = 1; - } -} +def VOP_MOVRELD : VOP_MOVREL; +def VOP_MOVRELSD : VOP_MOVREL; let SubtargetPredicate = HasMovrel, Uses = [M0, EXEC] in { // v_movreld_b32 is a special case because the destination output @@ -541,7 +535,6 @@ let SubtargetPredicate = isGFX9Plus in { let Constraints = "$vdst = $src1, $vdst1 = $src0"; let DisableEncoding = "$vdst1,$src1"; let SchedRW = [Write64Bit, Write64Bit]; - let IsInvalidSingleUseConsumer = 1; } let isReMaterializable = 1 in @@ -708,8 +701,6 @@ let SubtargetPredicate = isGFX10Plus in { let Constraints = "$vdst = $src1, $vdst1 = $src0"; let DisableEncoding = "$vdst1,$src1"; let SchedRW = [Write64Bit, Write64Bit]; - let IsInvalidSingleUseConsumer = 1; - let IsInvalidSingleUseProducer = 1; } } // End Uses = [M0] } // End SubtargetPredicate = isGFX10Plus @@ -743,10 +734,7 @@ let SubtargetPredicate = isGFX11Plus in { } // Restrict src0 to be VGPR def V_PERMLANE64_B32 : VOP1_Pseudo<"v_permlane64_b32", VOP_MOVRELS, - [], /*VOP1Only=*/ 1> { - let IsInvalidSingleUseConsumer = 1; - let IsInvalidSingleUseProducer = 1; - } + [], /*VOP1Only=*/ 1>; defm V_MOV_B16 : VOP1Inst_t16<"v_mov_b16", VOP_I16_I16>; defm V_NOT_B16 : VOP1Inst_t16<"v_not_b16", VOP_I16_I16>; defm V_CVT_I32_I16 : VOP1Inst_t16<"v_cvt_i32_i16", VOP_I32_I16>; diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index dd48607402eb0b..52f7be3b4577df 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -788,12 +788,10 @@ defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32_ARITH, null_frag, } // End isCommutable = 1 // These are special and do not read the exec mask. 
-let isConvergent = 1, Uses = [], IsInvalidSingleUseConsumer = 1 in { +let isConvergent = 1, Uses = [] in { def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE, []>; let IsNeverUniform = 1, Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in { -def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, []> { - let IsInvalidSingleUseProducer = 1; - } +def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, []>; } // End IsNeverUniform, $vdst = $vdst_in, DisableEncoding $vdst_in } // End isConvergent = 1 diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 466114b95f9f90..20beb41b7b58bb 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -157,12 +157,12 @@ defm V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile, fmaxnum_l } // End SubtargetPredicate = isNotGFX12Plus } // End SchedRW = [WriteDoubleAdd] -let SchedRW = [WriteIntMul], IsInvalidSingleUseConsumer = 1 in { +let SchedRW = [WriteIntMul] in { defm V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", V_MUL_PROF, DivergentBinFrag>; defm V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", V_MUL_PROF, mulhu>; defm V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", V_MUL_PROF>; defm V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", V_MUL_PROF, mulhs>; -} // End SchedRW = [WriteIntMul], IsInvalidSingleUseConsumer = 1 +} // End SchedRW = [WriteIntMul] let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in { defm V_MINIMUM_F32 : VOP3Inst <"v_minimum_f32", VOP3_Profile, DivergentBinFrag>; @@ -260,9 +260,9 @@ let mayRaiseFPException = 0 in { // Seems suspicious but manual doesn't say it d let isReMaterializable = 1 in defm V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile>; -let Constraints = "@earlyclobber $vdst", IsInvalidSingleUseConsumer = 1 in { +let Constraints = "@earlyclobber $vdst" in { defm V_MQSAD_PK_U16_U8 : VOP3Inst <"v_mqsad_pk_u16_u8", VOP3_Profile>; -} // End Constraints = "@earlyclobber $vdst", IsInvalidSingleUseConsumer = 1 +} // End Constraints = "@earlyclobber $vdst" let isReMaterializable = 1 in { @@ -277,16 +277,14 @@ let SchedRW = [Write64Bit] in { defm V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile, csra_64>; } // End SubtargetPredicate = isGFX6GFX7 - let IsInvalidSingleUseConsumer = 1 in { let SubtargetPredicate = isGFX8Plus in { defm V_LSHRREV_B64 : VOP3Inst <"v_lshrrev_b64", VOP3_Profile, clshr_rev_64>; defm V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile, cashr_rev_64>; - } // End SubtargetPredicate = isGFX8Plus, , IsInvalidSingleUseConsumer = 1 + } // End SubtargetPredicate = isGFX8Plus let SubtargetPredicate = isGFX8GFX9GFX10GFX11 in { defm V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile, clshl_rev_64>; } // End SubtargetPredicate = isGFX8GFX9GFX10GFX11 - } // End IsInvalidSingleUseConsumer = 1 } // End SchedRW = [Write64Bit] } // End isReMaterializable = 1 @@ -311,14 +309,14 @@ def VOPProfileMQSAD : VOP3_Profile { let HasModifiers = 0; } -let SubtargetPredicate = isGFX7Plus, IsInvalidSingleUseConsumer = 1 in { +let SubtargetPredicate = isGFX7Plus in { let Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] in { defm V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile>; defm V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOPProfileMQSAD>; } // End Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] -} // End SubtargetPredicate = isGFX7Plus, IsInvalidSingleUseConsumer = 1 +} // End SubtargetPredicate = isGFX7Plus -let isCommutable = 1, SchedRW = 
[WriteIntMul, WriteSALU], IsInvalidSingleUseConsumer = 1 in { +let isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU] in { let SubtargetPredicate = isGFX7Plus, OtherPredicates = [HasNotMADIntraFwdBug] in { defm V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>; defm V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>; @@ -328,7 +326,7 @@ let isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU], IsInvalidSingleUseCons defm V_MAD_U64_U32_gfx11 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>; defm V_MAD_I64_I32_gfx11 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>; } -} // End isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU], IsInvalidSingleUseConsumer = 1 +} // End isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU] let FPDPRounding = 1 in { @@ -865,10 +863,10 @@ let SubtargetPredicate = isGFX10Plus in { } // End isCommutable = 1, isReMaterializable = 1 def : ThreeOp_i32_Pats; - let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in", IsInvalidSingleUseConsumer = 1, IsInvalidSingleUseProducer = 1 in { + let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in { defm V_PERMLANE16_B32 : VOP3Inst<"v_permlane16_b32", VOP3_PERMLANE_Profile>; defm V_PERMLANEX16_B32 : VOP3Inst<"v_permlanex16_b32", VOP3_PERMLANE_Profile>; - } // End $vdst = $vdst_in, DisableEncoding $vdst_in, IsInvalidSingleUseConsumer = 1, IsInvalidSingleUseProducer = 1 + } // End $vdst = $vdst_in, DisableEncoding $vdst_in foreach vt = Reg32Types.types in { def : PermlanePat; @@ -1286,12 +1284,11 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { } } // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" -let IsInvalidSingleUseConsumer = 1 in { - defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx10<0x360>; - let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in), IsInvalidSingleUseProducer = 1 in { - defm V_WRITELANE_B32 : VOP3_Real_No_Suffix_gfx10<0x361>; - } // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32: $src1, VGPR_32:$vdst_in), IsInvalidSingleUseProducer = 1 -} // End IsInvalidSingleUseConsumer = 1 +defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx10<0x360>; + +let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in { + defm V_WRITELANE_B32 : VOP3_Real_No_Suffix_gfx10<0x361>; +} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) let SubtargetPredicate = isGFX10Before1030 in { defm V_MUL_LO_I32 : VOP3_Real_gfx10<0x16b>; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index f4d2c29158f49f..5eee71887964ad 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -382,19 +382,15 @@ defm V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", AMDGPUfdot2, 1/*ExplicitClamp*/>; let OtherPredicates = [HasDot7Insts] in { -let IsInvalidSingleUseConsumer = 1 in { - defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", - VOP3P_Profile, int_amdgcn_udot4, 1>; -} +defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", + VOP3P_Profile, int_amdgcn_udot4, 1>; defm V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", VOP3P_Profile, int_amdgcn_udot8, 1>; } // End OtherPredicates = [HasDot7Insts] let OtherPredicates = [HasDot1Insts] in { -let IsInvalidSingleUseConsumer = 1 in { - defm V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", - VOP3P_Profile, int_amdgcn_sdot4, 1>; -} +defm V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", + VOP3P_Profile, int_amdgcn_sdot4, 1>; defm V_DOT8_I32_I4 : 
VOP3PInst<"v_dot8_i32_i4", VOP3P_Profile, int_amdgcn_sdot8, 1>; } // End OtherPredicates = [HasDot1Insts] diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index be862b44917e15..d6e08dce130ced 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -464,10 +464,9 @@ multiclass VOPC_I16 : VOPC_Pseudos ; -let IsInvalidSingleUseConsumer = 1 in { - multiclass VOPC_I64 : - VOPC_Pseudos ; -} +multiclass VOPC_I64 : + VOPC_Pseudos ; + multiclass VOPCX_F16 { let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in { @@ -502,10 +501,8 @@ multiclass VOPCX_I16 { multiclass VOPCX_I32 : VOPCX_Pseudos ; -let IsInvalidSingleUseConsumer = 1 in { - multiclass VOPCX_I64 : - VOPCX_Pseudos ; -} +multiclass VOPCX_I64 : + VOPCX_Pseudos ; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 5a460ef0d42320..05a7d907d237ae 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -17,8 +17,6 @@ class LetDummies { bit isReMaterializable; bit isAsCheapAsAMove; bit FPDPRounding; - bit IsInvalidSingleUseConsumer; - bit IsInvalidSingleUseProducer; Predicate SubtargetPredicate; string Constraints; string DisableEncoding; @@ -67,8 +65,6 @@ class VOP_Pseudo (NAME); bit IsTrue16 = P.IsTrue16; - bit IsInvalidSingleUseConsumer = P.IsInvalidSingleUseConsumer; - bit IsInvalidSingleUseProducer = P.IsInvalidSingleUseProducer; VOPProfile Pfl = P; string AsmOperands; @@ -165,8 +161,6 @@ class VOP3P_Pseudo pattern = []> : class VOP_Real { Instruction Opcode = !cast(NAME); bit IsSingle = ps.Pfl.IsSingle; - bit IsInvalidSingleUseConsumer = ps.Pfl.IsInvalidSingleUseConsumer; - bit IsInvalidSingleUseProducer = ps.Pfl.IsInvalidSingleUseProducer; } class VOP3_Real : @@ -844,9 +838,6 @@ class VOP_DPP_Pseudo pattern=[], let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", ""); let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, ""); let DecoderNamespace = "GFX8"; - - let IsInvalidSingleUseConsumer = !not(VINTERP); - let IsInvalidSingleUseProducer = !not(VINTERP); } class VOP3_DPP_Pseudo : @@ -1714,13 +1705,4 @@ def VOPTrue16Table : GenericTable { let PrimaryKey = ["Opcode"]; let PrimaryKeyName = "getTrue16OpcodeHelper"; -} - -def SingleUseExceptionTable : GenericTable { - let FilterClass = "VOP_Pseudo"; - let CppTypeName = "SingleUseExceptionInfo"; - let Fields = ["Opcode", "IsInvalidSingleUseConsumer", "IsInvalidSingleUseProducer"]; - - let PrimaryKey = ["Opcode"]; - let PrimaryKeyName = "getSingleUseExceptionHelper"; -} +} \ No newline at end of file diff --git a/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir b/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir deleted file mode 100644 index 9e65ce329df431..00000000000000 --- a/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir +++ /dev/null @@ -1,1420 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 -# RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs -run-pass=amdgpu-insert-single-use-vdst %s -o - | FileCheck %s - -# One single-use producer. 
---- -name: one_producer -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: one_producer - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr0, $vgpr2 - bb.0: - liveins: $vgpr0 - $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec - $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - bb.1: - liveins: $vgpr0, $vgpr2 -... - -# One single-use producer of a 64-bit value. ---- -name: one_producer_64bit -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: one_producer_64bit - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0_vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr2_vgpr3 = V_LSHLREV_B64_e64 0, $vgpr0_vgpr1, implicit $exec - ; CHECK-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e64 $vgpr2_vgpr3, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr4_vgpr5 - bb.0: - liveins: $vgpr0_vgpr1 - $vgpr2_vgpr3 = V_LSHLREV_B64_e64 0, $vgpr0_vgpr1, implicit $exec - $vgpr4_vgpr5 = V_MOV_B64_e64 $vgpr2_vgpr3, implicit $exec - bb.1: - liveins: $vgpr4_vgpr5 -... - -# Two consecutive single-use producers. ---- -name: two_producers -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: two_producers - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 2 - ; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr0, $vgpr3 - bb.0: - liveins: $vgpr0 - $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec - $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec - bb.1: - liveins: $vgpr0, $vgpr3 -... - -# Redefinitions of v0. ---- -name: redefinitions -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: redefinitions - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 4 - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - bb.0: - liveins: $vgpr0 - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: -... - -# One producer with no consumers. ---- -name: no_consumer -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: no_consumer - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - bb.0: - liveins: $vgpr0 - $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec - bb.1: -... 
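
The immediates in the CHECK lines so far (1, 2 and 4) follow the packed layout
built by encodeImm() in the deleted pass: bits [3:0] count producers in the
region that starts at the hint, and each earlier-encoded region is prefixed by
a 3-bit skip field. A minimal standalone C++ sketch of that encoding, checked
against the tests above; the free-function signature, the std::vector for the
skip list and the main() driver are illustrative, only the bit manipulation
mirrors the pass:

// Sketch of the s_singleuse_vdst immediate encoding (illustrative only).
#include <array>
#include <cassert>
#include <cstdio>
#include <vector>

static unsigned encodeImm(const std::array<unsigned, 3> &SingleUseRegions,
                          const std::vector<unsigned> &SkipRegions) {
  // Low 4 bits: the region at the insertion point; then alternating 3-bit
  // skip / 3-bit region fields for the regions encoded before it.
  unsigned Imm = SingleUseRegions[SkipRegions.size()];
  unsigned ShiftAmount = 4;
  for (unsigned I = SkipRegions.size(); I > 0; I--) {
    Imm |= SkipRegions[I - 1] << ShiftAmount;
    ShiftAmount += 3;
    Imm |= SingleUseRegions[I - 1] << ShiftAmount;
    ShiftAmount += 3;
  }
  return Imm;
}

int main() {
  assert(encodeImm({1, 0, 0}, {}) == 1); // one_producer, no_consumer
  assert(encodeImm({2, 0, 0}, {}) == 2); // two_producers
  assert(encodeImm({4, 0, 0}, {}) == 4); // redefinitions
  std::printf("immediates match the CHECK lines\n");
}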
- -# One consumer with two uses of the same value. ---- -name: one_consumer_two_uses -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: one_consumer_two_uses - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr0, $vgpr2 - bb.0: - liveins: $vgpr0 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec - bb.1: - liveins: $vgpr0, $vgpr2 -... - -# A longer example. ---- -name: longer_example -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: longer_example - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr3, $vgpr5, $sgpr0, $sgpr2, $sgpr4, $sgpr5, $sgpr16, $sgpr17, $sgpr18, $sgpr19 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 274 - ; CHECK-NEXT: $vgpr14 = V_MUL_F32_e32 $sgpr4, $vgpr3, implicit $exec, implicit $mode - ; CHECK-NEXT: $sgpr3 = S_MUL_F16 $sgpr0, $sgpr2, implicit $mode - ; CHECK-NEXT: $vgpr15 = V_MUL_F32_e32 $sgpr5, $vgpr3, implicit $exec, implicit $mode - ; CHECK-NEXT: $vgpr17 = V_FMA_F32_e64 0, $sgpr16, 0, $vgpr5, 0, $vgpr14, 0, 0, implicit $exec, implicit $mode - ; CHECK-NEXT: $sgpr1 = S_ADD_F16 $sgpr0, 15360, implicit $mode - ; CHECK-NEXT: $vgpr15 = V_FMA_F32_e64 0, $sgpr17, 0, $vgpr5, 0, $vgpr15, 0, 0, implicit $exec, implicit $mode - ; CHECK-NEXT: $vgpr14 = V_FMA_F32_e64 0, $sgpr18, 0, $vgpr15, 0, $vgpr17, 0, 0, implicit $exec, implicit $mode - ; CHECK-NEXT: $vgpr15 = V_FMA_F32_e64 0, $sgpr19, 0, $vgpr14, 0, $vgpr17, 0, 0, implicit $exec, implicit $mode - ; CHECK-NEXT: $vgpr16 = V_LOG_F32_e32 $vgpr15, implicit $exec, implicit $mode - ; CHECK-NEXT: $vgpr18 = V_EXP_F32_e32 $vgpr15, implicit $exec, implicit $mode - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr16, $vgpr18 - bb.0: - liveins: $vgpr3, $vgpr5, $sgpr0, $sgpr2, $sgpr4, $sgpr5, $sgpr16, $sgpr17, $sgpr18, $sgpr19 - $vgpr14 = V_MUL_F32_e32 $sgpr4, $vgpr3, implicit $exec, implicit $mode - $sgpr3 = S_MUL_F16 $sgpr0, $sgpr2, implicit $mode - $vgpr15 = V_MUL_F32_e32 $sgpr5, $vgpr3, implicit $exec, implicit $mode - $vgpr17 = V_FMA_F32_e64 0, $sgpr16, 0, $vgpr5, 0, $vgpr14, 0, 0, implicit $exec, implicit $mode - $sgpr1 = S_ADD_F16 $sgpr0, 15360, implicit $mode - $vgpr15 = V_FMA_F32_e64 0, $sgpr17, 0, $vgpr5, 0, $vgpr15, 0, 0, implicit $exec, implicit $mode - $vgpr14 = V_FMA_F32_e64 0, $sgpr18, 0, $vgpr15, 0, $vgpr17, 0, 0, implicit $exec, implicit $mode - $vgpr15 = V_FMA_F32_e64 0, $sgpr19, 0, $vgpr14, 0, $vgpr17, 0, 0, implicit $exec, implicit $mode - $vgpr16 = V_LOG_F32_e32 $vgpr15, implicit $exec, implicit $mode - $vgpr18 = V_EXP_F32_e32 $vgpr15, implicit $exec, implicit $mode - bb.1: - liveins: $vgpr16, $vgpr18 -... - -# Multiple uses of v0. 
---- -name: multiple_uses_1 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: multiple_uses_1 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1, $vgpr2 - bb.0: - liveins: $vgpr0 - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr1, $vgpr2 -... - -# Multiple uses of v0 and redefinitions of v1 and v2. ---- -name: multiple_uses_2 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: multiple_uses_2 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: S_SINGLEUSE_VDST 2 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1, $vgpr2 - bb.0: - liveins: $vgpr0 - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr1, $vgpr2 -... - -# Multiple uses of all but v1. ---- -name: multiple_uses_3 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: multiple_uses_3 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr2, $vgpr3 - bb.0: - liveins: $vgpr0 - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec - bb.1: - liveins: $vgpr2, $vgpr3 -... - -# Second use is an instruction that reads and writes v1. ---- -name: multiple_uses_4 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: multiple_uses_4 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - bb.0: - liveins: $vgpr0 - $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec - $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - bb.1: - liveins: $vgpr0, $vgpr1, $vgpr2 -... 
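
The multiple_uses tests pin down the liveness rule: a def is only marked as a
single-use producer if its register is read at most once before being killed,
and anything live-out of the block is pre-seeded with a use count of 2 so it
can never qualify. A toy model of that reverse walk, reduced to one def per
instruction and string-named values (an illustrative sketch under those
simplifications, not the pass itself):

// Toy model of the reverse use-count walk over a basic block.
#include <cstdio>
#include <map>
#include <string>
#include <vector>

struct Inst {
  std::string Def;
  std::vector<std::string> Uses;
};

int main() {
  // Mirrors multiple_uses_3 above: v0 feeds v1 and v2, v1 feeds v3, and
  // v2/v3 are live-out, so only the def of v1 is a single-use producer.
  const std::vector<Inst> Block = {{"v0", {"v0"}},
                                   {"v1", {"v0"}},
                                   {"v2", {"v0"}},
                                   {"v3", {"v1"}}};
  std::map<std::string, unsigned> UseCount;
  UseCount["v2"] = UseCount["v3"] = 2; // live-outs can never be single-use
  for (auto It = Block.rbegin(); It != Block.rend(); ++It) {
    // At most one later read; zero reads also qualify, as in no_consumer.
    const bool SingleUse = UseCount[It->Def] <= 1;
    UseCount.erase(It->Def); // the def ends the backward live range
    for (const std::string &U : It->Uses)
      ++UseCount[U];
    if (SingleUse)
      std::printf("single-use producer: %s\n", It->Def.c_str());
  }
}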
- -# Results are live-in to another basic block. ---- -name: basic_block_1 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: basic_block_1 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: successors: %bb.2(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: liveins: $vgpr1, $vgpr2 - bb.0: - liveins: $vgpr0 - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr0, $vgpr1, $vgpr2 - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.2: - liveins: $vgpr1, $vgpr2 -... - -# Result v2 has multiple uses in another basic block. ---- -name: basic_block_2 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: basic_block_2 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr1, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: successors: %bb.2(0x80000000) - ; CHECK-NEXT: liveins: $vgpr2 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr2, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr2, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: liveins: $vgpr3 - bb.0: - liveins: $vgpr0, $vgpr1 - $vgpr2 = V_MOV_B32_e32 $vgpr1, implicit $exec - bb.1: - liveins: $vgpr2 - $vgpr3 = V_MOV_B32_e32 $vgpr2, implicit $exec - $vgpr3 = V_MOV_B32_e32 $vgpr2, implicit $exec - bb.2: - liveins: $vgpr3 -... - -# Results are redefined in another basic block. 
---- -name: basic_block_3 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: basic_block_3 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: successors: %bb.2(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - bb.0: - liveins: $vgpr0 - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr0, $vgpr1 - $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - bb.2: - liveins: $vgpr0, $vgpr1, $vgpr2 -... - -# Exec modified between producer and consumer. ---- -name: exec_mask -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: exec_mask - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: $exec = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr0 - bb.0: - liveins: $sgpr0_sgpr1 - $vgpr0 = V_MOV_B32_e32 0, implicit $exec - $exec = COPY $sgpr0_sgpr1 - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr0 -... - -# Exec_lo modified between producer and consumer. ---- -name: exec_mask_lo -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: exec_mask_lo - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $sgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: $exec_lo = COPY $sgpr0 - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr0 - bb.0: - liveins: $sgpr0 - $vgpr0 = V_MOV_B32_e32 0, implicit $exec - $exec_lo = COPY $sgpr0 - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr0 -... - -# Exec_hi modified between producer and consumer. ---- -name: exec_mask_hi -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: exec_mask_hi - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $sgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: $exec_hi = COPY $sgpr0 - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr0 - bb.0: - liveins: $sgpr0 - $vgpr0 = V_MOV_B32_e32 0, implicit $exec - $exec_hi = COPY $sgpr0 - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr0 -... - -# Write 32-bit vgpr and then read from low 16 bits. 
---- -name: write_full_read_lo -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: write_full_read_lo - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: $vgpr1_lo16 = V_MOV_B16_t16_e32 $vgpr0_lo16, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1_lo16 - bb.0: - $vgpr0 = V_MOV_B32_e32 0, implicit $exec - $vgpr1_lo16 = V_MOV_B16_t16_e32 $vgpr0_lo16, implicit $exec - bb.1: - liveins: $vgpr1_lo16 -... - -# Write 32-bit vgpr and then read from high 16 bits. ---- -name: write_full_read_hi -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: write_full_read_hi - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: $vgpr1_hi16 = V_MOV_B16_t16_e32 $vgpr0_hi16, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1_hi16 - bb.0: - $vgpr0 = V_MOV_B32_e32 0, implicit $exec - $vgpr1_hi16 = V_MOV_B16_t16_e32 $vgpr0_hi16, implicit $exec - bb.1: - liveins: $vgpr1_hi16 -... - -# Write 32-bit vgpr and then read from both halves. ---- -name: write_full_read_both -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: write_full_read_both - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: $vgpr1_lo16 = V_MOV_B16_t16_e32 $vgpr0_lo16, implicit $exec - ; CHECK-NEXT: $vgpr1_hi16 = V_MOV_B16_t16_e32 $vgpr0_hi16, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1 - bb.0: - $vgpr0 = V_MOV_B32_e32 0, implicit $exec - $vgpr1_lo16 = V_MOV_B16_t16_e32 $vgpr0_lo16, implicit $exec - $vgpr1_hi16 = V_MOV_B16_t16_e32 $vgpr0_hi16, implicit $exec - bb.1: - liveins: $vgpr1 -... - -# Write 32-bit vgpr and then read from both halves in the same instruction. ---- -name: write_full_read_both_same_instruction -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: write_full_read_both_same_instruction - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: $vgpr1_lo16 = V_ADD_F16_t16_e32 $vgpr0_lo16, $vgpr0_hi16, implicit $mode, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1_lo16 - bb.0: - $vgpr0 = V_MOV_B32_e32 0, implicit $exec - $vgpr1_lo16 = V_ADD_F16_t16_e32 $vgpr0_lo16, $vgpr0_hi16, implicit $mode, implicit $exec - bb.1: - liveins: $vgpr1_lo16 -... - -# Write low 16-bits and then read 32-bit vgpr. ---- -name: write_lo_read_full -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: write_lo_read_full - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1 - bb.0: - liveins: $vgpr0 - $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr1 -... - -# Write low 16-bits and then read 32-bit vgpr twice. 
---- -name: write_lo_read_full_twice -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: write_lo_read_full_twice - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1, $vgpr2 - bb.0: - liveins: $vgpr0 - $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr1, $vgpr2 -... - -# Write high 16-bits and then read 32-bit vgpr. ---- -name: write_hi_read_full -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: write_hi_read_full - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1 - bb.0: - liveins: $vgpr0 - $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr1 -... - -# Write high 16-bits and then read 32-bit vgpr twice. ---- -name: write_hi_read_full_twice -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: write_hi_read_full_twice - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1, $vgpr2 - bb.0: - liveins: $vgpr0 - $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr1, $vgpr2 -... - -# Write low 16-bits and then write high 16-bits and then read 32-bit vgpr. ---- -name: write_both_read_full -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: write_both_read_full - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 2 - ; CHECK-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec - ; CHECK-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1 - bb.0: - $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec - $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr1 -... - -# Write low 16-bits and then write high 16-bits and then read 32-bit vgpr twice. 
---- -name: write_both_read_full_twice -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: write_both_read_full_twice - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec - ; CHECK-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1, $vgpr2 - bb.0: - $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec - $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr1, $vgpr2 -... - -# Three single use producer instructions with non single use producer -# instructions in between. ---- -name: three_producers_with_two_skips -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: three_producers_with_two_skips - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 9361 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr2, $vgpr4 - bb.0: - liveins: $vgpr0 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr2, $vgpr4 -... - -# Six single use producer instructions with non single use producer -# instructions in between. 
---- -name: six_producers_with_four_skips -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: six_producers_with_four_skips - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 145 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: S_SINGLEUSE_VDST 9362 - ; CHECK-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr2, $vgpr4, $vgpr7, $vgpr9 - bb.0: - liveins: $vgpr0 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr2, $vgpr4, $vgpr7, $vgpr9 -... - -# Five single use producer instructions, followed by -# four non single use producers, followed by -# three single use producer instructions, followed by -# two non single use producers, followed by -# one single use producer instructions. 
---- -name: immediate_order -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: immediate_order - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 10693 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr13, $vgpr14 - bb.0: - liveins: $vgpr0 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr13, $vgpr14 -... - -# Maximum number of single use producers that can be encoded in a single -# instruction. 
---- -name: maximum_producers_single_instruction -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: maximum_producers_single_instruction - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 58255 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr17 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr18 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr19 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr20 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr21 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr22 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr23 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr24 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr25 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr26 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr27 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr28 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr29 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - bb.0: - liveins: $vgpr0 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr17 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr18 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr19 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr20 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr21 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr22 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr23 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr24 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr25 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr26 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr27 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr28 = V_MOV_B32_e32 $vgpr0, implicit 
$exec - $vgpr29 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: -... - -# One more than the maximum number of single use producers that can be encoded -# in a single instruction. ---- -name: too_many_producers_single_instruction -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: too_many_producers_single_instruction - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: S_SINGLEUSE_VDST 58255 - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr17 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr18 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr19 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr20 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr21 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr22 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr23 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr24 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr25 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr26 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr27 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr28 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr29 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr30 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - - - - bb.0: - liveins: $vgpr0 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr17 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr18 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr19 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr20 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr21 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr22 = V_MOV_B32_e32 $vgpr0, implicit $exec - - 
$vgpr23 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr24 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr25 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr26 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr27 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr28 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr29 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr30 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: -... - -# Maximum distance between single use producers that can be encoded in a single -# instruction. ---- -name: maximum_skips_single_instruction -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: maximum_skips_single_instruction - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 15473 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15 - bb.0: - liveins: $vgpr0 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15 -... - -# One more than the maximum distance between single use producers that can be -# encoded in a single instruction. 
---- -name: too_many_skips_single_instruction -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: too_many_skips_single_instruction - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr17 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16 - bb.0: - liveins: $vgpr0 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr17 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16 -... 
- - -# Maximum possible encoding value with all bits of the immediate set ---- -name: all_immediate_bits_set -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: all_immediate_bits_set - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 65535 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr17 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr18 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr19 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr20 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr21 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr22 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr23 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr24 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr25 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr26 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr27 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr28 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr29 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr30 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr31 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr32 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr33 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr34 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr35 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr36 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr37 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr38 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr39 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr40 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr41 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr42 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr43 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr30, $vgpr31, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36 - bb.0: - liveins: $vgpr0 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr6 = V_MOV_B32_e32 $vgpr0, 
implicit $exec - $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr17 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr18 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr19 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr20 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr21 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr22 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr23 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr24 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr25 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr26 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr27 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr28 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr29 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr30 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr31 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr32 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr33 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr34 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr35 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr36 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr37 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr38 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr39 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr40 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr41 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr42 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr43 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr30, $vgpr31, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36 - -... - -# Tests for multi-cycle instructions that are explicitly excluded. - -# Valid producers but invalid consumer opcodes. ---- -name: v_mul_hi_u32_e64 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: v_mul_hi_u32_e64 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr2 = V_MUL_HI_U32_e64 $vgpr0, $vgpr1, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr2, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr0, $vgpr3 - bb.0: - liveins: $vgpr0 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MUL_HI_U32_e64 $vgpr0, $vgpr1, implicit $exec - $vgpr3 = V_MOV_B32_e32 $vgpr2, implicit $exec - bb.1: - liveins: $vgpr0, $vgpr3 -... 
- ---- -name: v_cmpx_t_u64_e64 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: v_cmpx_t_u64_e64 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $sgpr0 = V_CMPX_EQ_U64_e64 $vgpr0_vgpr1, $vgpr2_vgpr3, implicit-def $exec, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr0 - bb.0: - liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $sgpr0 = V_CMPX_EQ_U64_e64 $vgpr0_vgpr1, $vgpr2_vgpr3, implicit-def $exec, implicit $exec - S_BRANCH %bb.1 - bb.1: - liveins: $vgpr0 -... - ---- -name: v_lshlrev_b64_e64 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: v_lshlrev_b64_e64 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0_vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e64 $vgpr0_vgpr1, implicit $exec - ; CHECK-NEXT: $vgpr4_vgpr5 = V_LSHLREV_B64_e64 0, $vgpr2_vgpr3, implicit $exec - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr6_vgpr7 = V_LSHLREV_B64_e64 0, $vgpr4_vgpr5, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr4_vgpr5 - bb.0: - liveins: $vgpr0_vgpr1 - $vgpr2_vgpr3 = V_MOV_B64_e64 $vgpr0_vgpr1, implicit $exec - $vgpr4_vgpr5 = V_LSHLREV_B64_e64 0, $vgpr2_vgpr3, implicit $exec - $vgpr6_vgpr7 = V_LSHLREV_B64_e64 0, $vgpr4_vgpr5, implicit $exec - bb.1: - liveins: $vgpr4_vgpr5 -... - -# Invalid producers but valid consumer opcodes. ---- -name: v_movereld_b32_e32 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: v_movereld_b32_e32 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0, $vgpr2 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $m0 = S_MOV_B32 0 - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: V_MOVRELD_B32_e32 $vgpr2, $vgpr1, implicit $m0, implicit $exec, implicit-def $vgpr1_vgpr2, implicit undef $vgpr1_vgpr2(tied-def 4) - ; CHECK-NEXT: $vgpr3 = V_ADD_U32_e32 $vgpr2, $vgpr1, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr3 - bb.0: - liveins: $vgpr0, $vgpr2 - $m0 = S_MOV_B32 0 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - V_MOVRELD_B32_e32 $vgpr2, $vgpr1, implicit $m0, implicit $exec, implicit-def $vgpr1_vgpr2, implicit undef $vgpr1_vgpr2(tied-def 4) - $vgpr3 = V_ADD_U32_e32 $vgpr2, $vgpr1, implicit $exec - bb.1: - liveins: $vgpr3 -... - -# Invalid producers and invalid consumer opcodes. ---- -name: v_writelane_b32 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: v_writelane_b32 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0, $sgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr0, 0, $vgpr1 - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr1, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr0 - bb.0: - liveins: $vgpr0, $sgpr0 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr1 = V_WRITELANE_B32 $sgpr0, 0, $vgpr1 - $vgpr2 = V_MOV_B32_e32 $vgpr1, implicit $exec - bb.1: - liveins: $vgpr0 -... 
- -# DPP instructions cannot be single use producers or consumers ---- -name: V_ADD_NC_U32_dpp -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: V_ADD_NC_U32_dpp - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0, $vcc - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr0, $vgpr0, 1, 15, 15, 1, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec - ; CHECK-NEXT: $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr0, $vgpr0, 1, 15, 15, 1, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec - ; CHECK-NEXT: $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr0, $vgpr0, 1, 15, 15, 1, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr0 - bb.0: - liveins: $vgpr0, $vcc - $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr0, $vgpr0, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec - $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr0, $vgpr0, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec - $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr0, $vgpr0, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec - bb.1: - liveins: $vgpr0 -... - -# Exception to the rule that dpp instructions -# cannot be single use producers or consumers ---- -name: V_INTERP_MOV_F32 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: V_INTERP_MOV_F32 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr0 = V_INTERP_MOV_F32 0, 0, 0, implicit $mode, implicit $m0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1 - bb.0: - $vgpr0 = V_INTERP_MOV_F32 0, 0, 0, implicit $mode, implicit $m0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr1 -... 
- diff --git a/llvm/test/MC/AMDGPU/gfx1150_asm_sopp.s b/llvm/test/MC/AMDGPU/gfx1150_asm_sopp.s deleted file mode 100644 index 044ce48c267846..00000000000000 --- a/llvm/test/MC/AMDGPU/gfx1150_asm_sopp.s +++ /dev/null @@ -1,10 +0,0 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1150 -show-encoding %s | FileCheck --check-prefixes=GFX1150 %s - -s_singleuse_vdst 0x0000 -// GFX1150: encoding: [0x00,0x00,0x93,0xbf] - -s_singleuse_vdst 0xffff -// GFX1150: encoding: [0xff,0xff,0x93,0xbf] - -s_singleuse_vdst 0x1234 -// GFX1150: encoding: [0x34,0x12,0x93,0xbf] diff --git a/llvm/test/MC/AMDGPU/gfx11_unsupported.s b/llvm/test/MC/AMDGPU/gfx11_unsupported.s index c9756a068890e7..c565801d275bb8 100644 --- a/llvm/test/MC/AMDGPU/gfx11_unsupported.s +++ b/llvm/test/MC/AMDGPU/gfx11_unsupported.s @@ -2014,9 +2014,6 @@ s_cmp_neq_f16 s1, s2 s_cmp_nlt_f16 s1, s2 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU -s_singleuse_vdst 0x1234 -// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU - buffer_atomic_sub_clamp_u32 v5, off, s[8:11], s3 offset:0 glc // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s b/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s index e98659208d5a9c..fdcabc4352c69b 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s @@ -69,15 +69,6 @@ s_wait_alu depctr_va_sdst(3) s_wait_alu depctr_va_vdst(14) depctr_va_sdst(6) depctr_vm_vsrc(6) // GFX12: encoding: [0x9b,0xed,0x88,0xbf] -s_singleuse_vdst 0x0000 -// GFX12: encoding: [0x00,0x00,0x93,0xbf] - -s_singleuse_vdst 0xffff -// GFX12: encoding: [0xff,0xff,0x93,0xbf] - -s_singleuse_vdst 0x1234 -// GFX12: encoding: [0x34,0x12,0x93,0xbf] - s_barrier_wait 0xffff // GFX12: encoding: [0xff,0xff,0x94,0xbf] diff --git a/llvm/test/MC/Disassembler/AMDGPU/decode-err.txt b/llvm/test/MC/Disassembler/AMDGPU/decode-err.txt index d6e8b7ee2f01f0..f819a61949b577 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/decode-err.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/decode-err.txt @@ -1,16 +1,11 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx900 -disassemble -show-encoding -filetype=null < %s 2>&1 | FileCheck -check-prefix=GCN-ERR %s # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s 2>&1 | FileCheck -check-prefixes=W32 %s # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s 2>&1 | FileCheck -check-prefixes=W64 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding -filetype=null < %s 2>&1 | FileCheck -check-prefix=GFX11-ERR %s # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding -filetype=null < %s 2>&1 | FileCheck -check-prefix=GFX12-ERR %s # GCN-ERR: [[@LINE+1]]:1: warning: invalid instruction encoding 0xdf,0x00,0x00,0x02 -# this is s_singleuse_vdst 0x1234, which is only valid on gfx1150 -# GFX11-ERR: [[@LINE+1]]:1: warning: invalid instruction encoding -0x34,0x12,0x93,0xbf - # this is s_waitcnt_vscnt exec_hi, 0x1234, which is valid on gfx11, but not on gfx12 # GFX12-ERR: [[@LINE+1]]:1: warning: invalid instruction encoding 0x34,0x12,0x7f,0xbc diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1150_dasm_sopp.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1150_dasm_sopp.txt deleted file mode 100644 index 8fa266a73ff87f..00000000000000 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1150_dasm_sopp.txt +++ /dev/null @@ -1,10 +0,0 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1150 -disassemble -show-encoding < %s | FileCheck 
-check-prefixes=GFX1150 %s - -# GFX1150: s_singleuse_vdst 0x0 ; encoding: [0x00,0x00,0x93,0xbf] -0x00,0x00,0x93,0xbf - -# GFX1150: s_singleuse_vdst 0xffff ; encoding: [0xff,0xff,0x93,0xbf] -0xff,0xff,0x93,0xbf - -# GFX1150: s_singleuse_vdst 0x1234 ; encoding: [0x34,0x12,0x93,0xbf] -0x34,0x12,0x93,0xbf diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt index d42f920aa61dd7..d69801512c0786 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt @@ -60,14 +60,6 @@ # GFX12: s_wait_storecnt_dscnt 0xc1d1 ; encoding: [0xd1,0xc1,0xc9,0xbf] 0xd1,0xc1,0xc9,0xbf -# GFX12: s_singleuse_vdst 0x0 ; encoding: [0x00,0x00,0x93,0xbf] -0x00,0x00,0x93,0xbf - -# GFX12: s_singleuse_vdst 0xffff ; encoding: [0xff,0xff,0x93,0xbf] -0xff,0xff,0x93,0xbf - -# GFX12: s_singleuse_vdst 0x1234 ; encoding: [0x34,0x12,0x93,0xbf] -0x34,0x12,0x93,0xbf # GFX12: s_barrier_wait 0xffff ; encoding: [0xff,0xff,0x94,0xbf] 0xff,0xff,0x94,0xbf diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn index dd4af4e98832f7..f83efbd3558025 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn @@ -152,7 +152,6 @@ static_library("LLVMAMDGPUCodeGen") { "AMDGPUISelLowering.cpp", "AMDGPUImageIntrinsicOptimizer.cpp", "AMDGPUInsertDelayAlu.cpp", - "AMDGPUInsertSingleUseVDST.cpp", "AMDGPUInstCombineIntrinsic.cpp", "AMDGPUInstrInfo.cpp", "AMDGPUInstructionSelector.cpp", From fc661df41a206779a9323fb9dd49038c44084d5e Mon Sep 17 00:00:00 2001 From: Jacek Caban Date: Tue, 24 Sep 2024 12:04:04 +0200 Subject: [PATCH 07/22] [LLD][COFF][NFC] Use dyn_cast on section chunks (#109701) Use dyn_cast instead of dyn_cast_or_null; chunk pointers are never null. --- lld/COFF/Writer.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp index c2765453aa964e..7cf723a8cf103f 100644 --- a/lld/COFF/Writer.cpp +++ b/lld/COFF/Writer.cpp @@ -472,7 +472,7 @@ bool Writer::createThunks(OutputSection *os, int margin) { // Recheck Chunks.size() each iteration, since we can insert more // elements into it. for (size_t i = 0; i != os->chunks.size(); ++i) { - SectionChunk *sc = dyn_cast_or_null<SectionChunk>(os->chunks[i]); + SectionChunk *sc = dyn_cast<SectionChunk>(os->chunks[i]); if (!sc) continue; MachineTypes machine = sc->getMachine(); @@ -606,7 +606,7 @@ void Writer::createECCodeMap() { // Verify that all relocations are in range, with no extra margin requirements. bool Writer::verifyRanges(const std::vector<Chunk *> chunks) { for (Chunk *c : chunks) { - SectionChunk *sc = dyn_cast_or_null<SectionChunk>(c); + SectionChunk *sc = dyn_cast<SectionChunk>(c); if (!sc) continue; MachineTypes machine = sc->getMachine(); @@ -872,8 +872,8 @@ bool Writer::fixGnuImportChunks() { if (!pSec->chunks.empty()) hasIdata = true; llvm::stable_sort(pSec->chunks, [&](Chunk *s, Chunk *t) { - SectionChunk *sc1 = dyn_cast_or_null<SectionChunk>(s); - SectionChunk *sc2 = dyn_cast_or_null<SectionChunk>(t); + SectionChunk *sc1 = dyn_cast<SectionChunk>(s); + SectionChunk *sc2 = dyn_cast<SectionChunk>(t); if (!sc1 || !sc2) { // if SC1, order them ascending. If SC2 or both null, // S is not less than T. From f664d313cd63893d7a4a496fdf0de988323b6b09 Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Tue, 24 Sep 2024 11:18:37 +0100 Subject: [PATCH 08/22] MemCpyOpt: replace an AA query with an MSSA query (NFC) (#108535) Fix a long-standing TODO.
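Note for reviewers: the replacement has roughly the following shape (a simplified sketch using the names from the patch below, not a verbatim excerpt):

    // Rather than scanning every instruction between the load and the store
    // and asking alias analysis whether it may write the loaded memory, ask
    // MemorySSA's walker for the clobber of the store's defining access at
    // the load's location.
    MemoryUseOrDef *LoadAccess = MSSA->getMemoryAccess(LI);
    MemoryUseOrDef *StoreAccess = MSSA->getMemoryAccess(SI);
    MemoryAccess *Clobber = MSSA->getWalker()->getClobberingMemoryAccess(
        StoreAccess->getDefiningAccess(), MemoryLocation::get(LI), BAA);
    // Promote at the clobbering instruction only if the load dominates it;
    // otherwise keep promoting at the store position.
    Instruction *P = MSSA->dominates(LoadAccess, Clobber)
                         ? cast<MemoryUseOrDef>(Clobber)->getMemoryInst()
                         : SI;

This turns a linear sequence of getModRefInfo queries into a single (cached) MemorySSA walk.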
--- .../lib/Transforms/Scalar/MemCpyOptimizer.cpp | 26 +++++++++---------- llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll | 15 +++++++++++ 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 1d67773585d593..2f88b19a8d3902 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -638,6 +638,7 @@ bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI, if (!LI->isSimple() || !LI->hasOneUse() || LI->getParent() != SI->getParent()) return false; + BatchAAResults BAA(*AA); auto *T = LI->getType(); // Don't introduce calls to memcpy/memmove intrinsics out of thin air if // the corresponding libcalls are not available. @@ -647,19 +648,17 @@ bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI, (EnableMemCpyOptWithoutLibcalls || (TLI->has(LibFunc_memcpy) && TLI->has(LibFunc_memmove)))) { MemoryLocation LoadLoc = MemoryLocation::get(LI); - - // We use alias analysis to check if an instruction may store to - // the memory we load from in between the load and the store. If - // such an instruction is found, we try to promote there instead - // of at the store position. - // TODO: Can use MSSA for this. - Instruction *P = SI; - for (auto &I : make_range(++LI->getIterator(), SI->getIterator())) { - if (isModSet(AA->getModRefInfo(&I, LoadLoc))) { - P = &I; - break; - } - } + MemoryUseOrDef *LoadAccess = MSSA->getMemoryAccess(LI), + *StoreAccess = MSSA->getMemoryAccess(SI); + + // We use MSSA to check if an instruction may store to the memory we load + // from in between the load and the store. If such an instruction is found, + // we try to promote there instead of at the store position. + auto *Clobber = MSSA->getWalker()->getClobberingMemoryAccess( + StoreAccess->getDefiningAccess(), LoadLoc, BAA); + Instruction *P = MSSA->dominates(LoadAccess, Clobber) + ? cast<MemoryUseOrDef>(Clobber)->getMemoryInst() + : SI; // If we found an instruction that may write to the loaded memory, // we can try to promote at this position instead of the store @@ -707,7 +706,6 @@ bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI, // Detect cases where we're performing call slot forwarding, but // happen to be using a load-store pair to implement it, rather than // a memcpy. - BatchAAResults BAA(*AA); auto GetCall = [&]() -> CallInst * { // We defer this expensive clobber walk until the cheap checks // have been done on the source inside performCallSlotOptzn. diff --git a/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll b/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll index 51fad820509393..61e349e01ed91d 100644 --- a/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll +++ b/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll @@ -141,4 +141,19 @@ define void @throwing_call(ptr noalias %src, ptr %dst) { ret void } +define void @loop_memoryphi(ptr %a, ptr %b) { +; CHECK-LABEL: @loop_memoryphi( +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 8 [[B:%.*]], ptr align 8 [[A:%.*]], i64 16, i1 false) +; CHECK-NEXT: br label [[LOOP]] +; + br label %loop + +loop: + %v = load { i64, i64 }, ptr %a + store { i64, i64 } %v, ptr %b + br label %loop +} + declare void @call() From 040bb37195d93f75cc7ce6b83254ab5db959a085 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 24 Sep 2024 11:18:49 +0100 Subject: [PATCH 09/22] [VPlan] Fix incorrect argument for CreateBinOp after 06c3a7d2d764.
06c3a7d2d764 incorrectly updated CreateBinOp to pass the debug location, which gets interpreted as an FPMath node. Remove the argument. --- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 4 +- .../LoopVectorize/float-induction.ll | 109 ++++++++++++++++++ 2 files changed, 111 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 3f7ab416e877bc..318d6a8c5b8c34 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1504,8 +1504,8 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { VecInd->setDebugLoc(EntryVal->getDebugLoc()); State.set(this, VecInd); - Instruction *LastInduction = cast<Instruction>(Builder.CreateBinOp( - AddOp, VecInd, SplatVF, "vec.ind.next", EntryVal->getDebugLoc())); + Instruction *LastInduction = cast<Instruction>( + Builder.CreateBinOp(AddOp, VecInd, SplatVF, "vec.ind.next")); if (isa<TruncInst>(EntryVal)) State.addMetadata(LastInduction, EntryVal); LastInduction->setDebugLoc(EntryVal->getDebugLoc()); diff --git a/llvm/test/Transforms/LoopVectorize/float-induction.ll b/llvm/test/Transforms/LoopVectorize/float-induction.ll index 9091b2c80fb97c..cedaf019a958bd 100644 --- a/llvm/test/Transforms/LoopVectorize/float-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/float-induction.ll @@ -1640,3 +1640,112 @@ for.inc: for.end: ret void } + +define i32 @float_induction_with_dbg_on_fadd(ptr %dst) { +; VEC4_INTERL1-LABEL: @float_induction_with_dbg_on_fadd( +; VEC4_INTERL1-NEXT: entry: +; VEC4_INTERL1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; VEC4_INTERL1: vector.ph: +; VEC4_INTERL1-NEXT: br label [[VECTOR_BODY:%.*]] +; VEC4_INTERL1: vector.body: +; VEC4_INTERL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; VEC4_INTERL1-NEXT: [[TMP0:%.*]] = getelementptr float, ptr null, i64 [[INDEX]] +; VEC4_INTERL1-NEXT: store <4 x float> poison, ptr [[TMP0]], align 8 +; VEC4_INTERL1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VEC4_INTERL1-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 +; VEC4_INTERL1-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; VEC4_INTERL1: middle.block: +; VEC4_INTERL1-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; VEC4_INTERL1: scalar.ph: +; VEC4_INTERL1-NEXT: br label [[LOOP:%.*]] +; VEC4_INTERL1: loop: +; VEC4_INTERL1-NEXT: br i1 poison, label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP16:![0-9]+]] +; VEC4_INTERL1: exit: +; VEC4_INTERL1-NEXT: ret i32 0 +; +; VEC4_INTERL2-LABEL: @float_induction_with_dbg_on_fadd( +; VEC4_INTERL2-NEXT: entry: +; VEC4_INTERL2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; VEC4_INTERL2: vector.ph: +; VEC4_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] +; VEC4_INTERL2: vector.body: +; VEC4_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; VEC4_INTERL2-NEXT: [[TMP0:%.*]] = getelementptr float, ptr null, i64 [[INDEX]] +; VEC4_INTERL2-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i64 16 +; VEC4_INTERL2-NEXT: store <4 x float> poison, ptr [[TMP0]], align 8 +; VEC4_INTERL2-NEXT: store <4 x float> zeroinitializer, ptr [[TMP1]], align 8 +; VEC4_INTERL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 +; VEC4_INTERL2-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop
[[LOOP15:![0-9]+]] +; VEC4_INTERL2: middle.block: +; VEC4_INTERL2-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; VEC4_INTERL2: scalar.ph: +; VEC4_INTERL2-NEXT: br label [[LOOP:%.*]] +; VEC4_INTERL2: loop: +; VEC4_INTERL2-NEXT: br i1 poison, label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP16:![0-9]+]] +; VEC4_INTERL2: exit: +; VEC4_INTERL2-NEXT: ret i32 0 +; +; VEC1_INTERL2-LABEL: @float_induction_with_dbg_on_fadd( +; VEC1_INTERL2-NEXT: entry: +; VEC1_INTERL2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; VEC1_INTERL2: vector.ph: +; VEC1_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] +; VEC1_INTERL2: vector.body: +; VEC1_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; VEC1_INTERL2-NEXT: [[TMP0:%.*]] = or disjoint i64 [[INDEX]], 1 +; VEC1_INTERL2-NEXT: [[TMP1:%.*]] = getelementptr float, ptr null, i64 [[INDEX]] +; VEC1_INTERL2-NEXT: [[TMP2:%.*]] = getelementptr float, ptr null, i64 [[TMP0]] +; VEC1_INTERL2-NEXT: store float poison, ptr [[TMP1]], align 8 +; VEC1_INTERL2-NEXT: store float poison, ptr [[TMP2]], align 8 +; VEC1_INTERL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VEC1_INTERL2-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 +; VEC1_INTERL2-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; VEC1_INTERL2: middle.block: +; VEC1_INTERL2-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; VEC1_INTERL2: scalar.ph: +; VEC1_INTERL2-NEXT: br label [[LOOP:%.*]] +; VEC1_INTERL2: loop: +; VEC1_INTERL2-NEXT: br i1 poison, label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP16:![0-9]+]] +; VEC1_INTERL2: exit: +; VEC1_INTERL2-NEXT: ret i32 0 +; +; VEC2_INTERL1_PRED_STORE-LABEL: @float_induction_with_dbg_on_fadd( +; VEC2_INTERL1_PRED_STORE-NEXT: entry: +; VEC2_INTERL1_PRED_STORE-NEXT: br label [[VECTOR_BODY:%.*]] +; VEC2_INTERL1_PRED_STORE: vector.body: +; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP0:%.*]] = getelementptr float, ptr null, i64 [[INDEX]] +; VEC2_INTERL1_PRED_STORE-NEXT: store <2 x float> poison, ptr [[TMP0]], align 8 +; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 +; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[TMP1]], label [[EXIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; VEC2_INTERL1_PRED_STORE: exit: +; VEC2_INTERL1_PRED_STORE-NEXT: ret i32 0 +; +entry: + br label %loop + +loop: + %fp.iv = phi float [ 0.000000e+00, %entry ], [ %fp.iv.next, %loop ], !dbg !4 + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %fp.iv.next = fadd reassoc float %fp.iv, 0.000000e+00 + %gep = getelementptr float, ptr null, i64 %iv + store float %fp.iv.next, ptr %gep, align 8 + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 200 + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret i32 0 +} + +!llvm.module.flags = !{!3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1) +!1 = !DIFile(filename: "bbi-99425.c", directory: "/tmp") +!2 = !{} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !DILocation(line: 5, column: 12, scope: !8) +!8 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 3, type: !9, unit: !0, retainedNodes: !2) +!9 = !DISubroutineType(types: !2) From a3cf01d58587d81b184d40091a86d6b8bf92d240 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Tue, 
24 Sep 2024 11:21:24 +0100 Subject: [PATCH 10/22] [lldb][docs] Resurrect the information on adding a new language (#109427) This got deleted in e078c9507c3abb4d9bb2265c366b26557880a3e3, I presume accidentally, because it didn't have a corresponding rst file for it. So I've brought it back and converted it into Markdown. The content remains accurate, from what I know at least. It's a bit "now draw the rest of the owl" but if nothing else, it gives you a bunch of important classes to go and research as a starting point. You can see the original content here: https://github.com/llvm/llvm-project/blob/5d71fc5d7b5ffe2323418a09db6eddaf84d6c662/lldb/www/adding-language-support.html --- lldb/docs/index.rst | 1 + lldb/docs/resources/addinglanguagesupport.md | 95 ++++++++++++++++++++ 2 files changed, 96 insertions(+) create mode 100644 lldb/docs/resources/addinglanguagesupport.md diff --git a/lldb/docs/index.rst b/lldb/docs/index.rst index d9b8e589eb2ac0..dd44a8430add80 100644 --- a/lldb/docs/index.rst +++ b/lldb/docs/index.rst @@ -163,6 +163,7 @@ interesting areas to contribute to lldb. resources/caveats resources/projects resources/lldbdap + resources/addinglanguagesupport Public C++ API Private C++ API diff --git a/lldb/docs/resources/addinglanguagesupport.md b/lldb/docs/resources/addinglanguagesupport.md new file mode 100644 index 00000000000000..28789048643d77 --- /dev/null +++ b/lldb/docs/resources/addinglanguagesupport.md @@ -0,0 +1,95 @@ +# Adding Programming Language Support + +LLDB has been architected to make it straightforward to add support for a +programming language. Only a small enum in core LLDB needs to be modified to +make LLDB aware of a new programming language. Everything else can be supplied +in derived classes that need not even be present in the core LLDB repository. +This makes it convenient for developers adding language support in downstream +repositories since it practically eliminates the potential for merge conflicts. + +The basic steps are: +* Add the language to the `LanguageType` enum. +* Add a `TypeSystem` for the language. +* Add expression evaluation support. + +Additionally, you may want to create a `Language` and `LanguageRuntime` plugin +for your language, which enables support for advanced features like dynamic +typing and data formatting. + +## Add the Language to the LanguageType enum + +The `LanguageType` enum +(see [lldb-enumerations.h](https://github.com/llvm/llvm-project/blob/main/lldb/include/lldb/lldb-enumerations.h)) +contains a list of every language known to LLDB. It is the one place where +support for a language must live that will need to merge cleanly with upstream +LLDB if you are developing your language support in a separate branch. When +adding support for a language previously unknown to LLDB, start by adding an +enumeration entry to `LanguageType`. + +## Add a TypeSystem for the Language + +Both [Module](https://github.com/llvm/llvm-project/blob/main/lldb/include/lldb/Core/Module.h) +and [Target](https://github.com/llvm/llvm-project/blob/main/lldb/include/lldb/Target/Target.h) +support the retrieval of a `TypeSystem` instance via `GetTypeSystemForLanguage()`. +For `Module`, this method is directly on the `Module` instance. For `Target`, +this is retrieved indirectly via the `TypeSystemMap` for the `Target` instance. + +The `TypeSystem` instance returned by the `Target` is expected to be capable of +evaluating expressions, while the `TypeSystem` instance returned by the `Module` +is not. 
If you want to support expression evaluation for your language, you could +consider one of the following approaches: +* Implement a single `TypeSystem` class that supports evaluation when given an + optional `Target`, implementing all the expression evaluation methods on the + `TypeSystem`. +* Create multiple `TypeSystem` classes, one for evaluation and one for static + `Module` usage. + +For clang and Swift, the latter approach was chosen, primarily to make it +clearer that evaluation with the static `Module`-returned `TypeSystem` instances +makes no sense, and to have them error out on those calls. But either approach is +fine. + +## Creating Types + +Your `TypeSystem` will need an approach for creating types based on a set of +`Module`s. If your type info is going to come from DWARF info, you will want to +subclass [DWARFASTParser](https://github.com/llvm/llvm-project/blob/main/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.h). + + +## Add Expression Evaluation Support + +Expression evaluation support is enabled by implementing the relevant methods on +a `TypeSystem`-derived class. Search for `Expression` in the +[TypeSystem header](https://github.com/llvm/llvm-project/blob/main/lldb/include/lldb/Symbol/TypeSystem.h) +to find the methods to implement. + +## Type Completion + +There are three levels of type completion, each requiring more type information: +1. Pointer size: When you have a forward decl or a reference, and that's all you + need. At this stage, the pointer size is all you need. +2. Layout info: You need the size of an instance of the type, but you still don't + need to know all the guts of the type. +3. Full type info: Here you need everything, because you're playing with + internals of it, such as modifying a member variable. + +Ensure you never complete more of a type than is needed for a given situation. +This will keep your type system from doing more work than necessary. + +## Language and LanguageRuntime Plugins + +If you followed the steps outlined above, you have already taught LLDB a great +deal about your language. If your language's runtime model and fundamental data +types don't differ much from the C model, you are pretty much done. + +However, it is likely that your language offers its own data types for things +like strings and arrays, and probably has a notion of dynamic types, where the +effective type of a variable can only be known at runtime. + +These tasks are covered by two plugins: +* a `LanguageRuntime` plugin, which provides LLDB with a dynamic view of your + language; this plugin answers questions that require a live process to acquire + information (for example dynamic type resolution). +* a `Language` plugin, which provides LLDB with a static view of your language; + questions that are statically knowable and do not require a process are + answered by this plugin (for example data formatters). \ No newline at end of file From d4f38f43f5402041dd36977baa459830011d6ac6 Mon Sep 17 00:00:00 2001 From: Nashe Mncube Date: Tue, 24 Sep 2024 11:26:06 +0100 Subject: [PATCH 11/22] [LLVM][ARM][CodeGen] Define branch instruction alignment for m85 and m7 (#109647) Branch instruction alignments were not defined for cortex-m85 and cortex-m7, which missed an optimisation opportunity. With this patch we see performance improvements as high as 5% on some benchmarks with most around 1%.
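Note: the effect of the new subtarget feature can be pictured with this small sketch (illustrative values only; the actual alignment is applied by the generic loop-alignment machinery, not by code like this):

    // The subtarget stores the preferred loop alignment as a log2 value, so
    // "loop-align-64" (PrefLoopLogAlignment = 3) requests loop headers on
    // 1 << 3 == 8-byte (64-bit) boundaries.
    unsigned PrefLoopLogAlignment = 3; // set by FeaturePrefLoopAlign64
    llvm::Align PrefLoopAlignment(1ULL << PrefLoopLogAlignment); // Align(8)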
--- llvm/lib/Target/ARM/ARMFeatures.td | 3 +++ llvm/lib/Target/ARM/ARMProcessors.td | 2 ++ llvm/test/CodeGen/ARM/preferred-function-alignment.ll | 5 +++-- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMFeatures.td b/llvm/lib/Target/ARM/ARMFeatures.td index 8b0ade54b46d3c..dc0e86c696f63a 100644 --- a/llvm/lib/Target/ARM/ARMFeatures.td +++ b/llvm/lib/Target/ARM/ARMFeatures.td @@ -375,6 +375,9 @@ def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Prefers32BitThumb", "true def FeaturePrefLoopAlign32 : SubtargetFeature<"loop-align", "PrefLoopLogAlignment","2", "Prefer 32-bit alignment for loops">; +def FeaturePrefLoopAlign64 : SubtargetFeature<"loop-align-64", "PrefLoopLogAlignment","3", + "Prefer 64-bit alignment for loops">; + def FeatureMVEVectorCostFactor1 : SubtargetFeature<"mve1beat", "MVEVectorCostFactor", "4", "Model MVE instructions as a 1 beat per tick architecture">; diff --git a/llvm/lib/Target/ARM/ARMProcessors.td b/llvm/lib/Target/ARM/ARMProcessors.td index e4e122a0d1339b..a66a2c0b1981d8 100644 --- a/llvm/lib/Target/ARM/ARMProcessors.td +++ b/llvm/lib/Target/ARM/ARMProcessors.td @@ -344,6 +344,7 @@ def : ProcessorModel<"cortex-m4", CortexM4Model, [ARMv7em, def : ProcessorModel<"cortex-m7", CortexM7Model, [ARMv7em, ProcM7, FeatureFPARMv8_D16, + FeaturePrefLoopAlign64, FeatureUseMIPipeliner, FeatureUseMISched]>; @@ -385,6 +386,7 @@ def : ProcessorModel<"cortex-m85", CortexM85Model, [ARMv81mMainline, FeatureDSP, FeatureFPARMv8_D16, FeaturePACBTI, + FeaturePrefLoopAlign64, FeatureUseMISched, HasMVEFloatOps]>; diff --git a/llvm/test/CodeGen/ARM/preferred-function-alignment.ll b/llvm/test/CodeGen/ARM/preferred-function-alignment.ll index afe64a22c5e808..f3a227c4765eb8 100644 --- a/llvm/test/CodeGen/ARM/preferred-function-alignment.ll +++ b/llvm/test/CodeGen/ARM/preferred-function-alignment.ll @@ -1,14 +1,15 @@ -; RUN: llc -mtriple=arm-none-eabi -mcpu=cortex-m85 < %s | FileCheck --check-prefixes=CHECK,ALIGN-16,ALIGN-CS-16 %s +; RUN: llc -mtriple=arm-none-eabi -mcpu=cortex-m85 < %s | FileCheck --check-prefixes=CHECK,ALIGN-64,ALIGN-CS-16 %s ; RUN: llc -mtriple=arm-none-eabi -mcpu=cortex-m23 < %s | FileCheck --check-prefixes=CHECK,ALIGN-16,ALIGN-CS-16 %s ; RUN: llc -mtriple=arm-none-eabi -mcpu=cortex-a5 < %s | FileCheck --check-prefixes=CHECK,ALIGN-32,ALIGN-CS-32 %s ; RUN: llc -mtriple=arm-none-eabi -mcpu=cortex-m33 < %s | FileCheck --check-prefixes=CHECK,ALIGN-32,ALIGN-CS-16 %s ; RUN: llc -mtriple=arm-none-eabi -mcpu=cortex-m55 < %s | FileCheck --check-prefixes=CHECK,ALIGN-32,ALIGN-CS-16 %s - +; RUN: llc -mtriple=arm-none-eabi -mcpu=cortex-m7 < %s | FileCheck --check-prefixes=CHECK,ALIGN-64,ALIGN-CS-16 %s ; CHECK-LABEL: test ; ALIGN-16: .p2align 1 ; ALIGN-32: .p2align 2 +; ALIGN-64: .p2align 3 define void @test() { ret void From ea902d1b36e4e3a7d7bdd0f7bce3c460b6dd6e80 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Tue, 24 Sep 2024 12:08:35 +0200 Subject: [PATCH 12/22] [IR] Remove deprecated opaque pointer migration methods Remove the following methods: * Type::getNonOpaquePointerElementType() * Type::isOpaquePointerTy() * LLVMContext::supportsTypedPointers() * LLVMContext::setOpaquePointers() These were used temporarily during the opaque pointers migration, and are no longer needed. 
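Note: downstream code still using the removed methods can migrate with a change of roughly this shape (an illustrative sketch; the "before" behavior matches the deprecated definitions visible in the diff below):

    // Before:
    //   Ctx.setOpaquePointers(true);                      // was already a no-op
    //   if (Ty->isOpaquePointerTy()) ...                  // same as isPointerTy()
    //   Type *Elt = Ty->getNonOpaquePointerElementType(); // was unreachable
    // After: all pointers are opaque, so only pointer-ness can be queried;
    // element types must be tracked separately where they are still needed.
    if (Ty->isPointerTy()) {
      // No element type to ask for here.
    }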
--- llvm/include/llvm/IR/LLVMContext.h | 11 ----------- llvm/include/llvm/IR/Type.h | 12 ------------ llvm/lib/IR/LLVMContext.cpp | 8 -------- 3 files changed, 31 deletions(-) diff --git a/llvm/include/llvm/IR/LLVMContext.h b/llvm/include/llvm/IR/LLVMContext.h index 6ffa2bdaa319a7..558816e146587a 100644 --- a/llvm/include/llvm/IR/LLVMContext.h +++ b/llvm/include/llvm/IR/LLVMContext.h @@ -316,17 +316,6 @@ class LLVMContext { /// LLVMContext is used by compilation. void setOptPassGate(OptPassGate&); - /// Set whether opaque pointers are enabled. The method may be called multiple - /// times, but only with the same value. Note that creating a pointer type or - /// otherwise querying the opaque pointer mode performs an implicit set to - /// the default value. - [[deprecated("Opaque pointers are always enabled")]] - void setOpaquePointers(bool Enable) const; - - /// Whether typed pointers are supported. If false, all pointers are opaque. - [[deprecated("Always returns false")]] - bool supportsTypedPointers() const; - /// Get or set the current "default" target CPU (target-cpu function /// attribute). The intent is that compiler frontends will set this to a value /// that reflects the attribute that a function would get "by default" without diff --git a/llvm/include/llvm/IR/Type.h b/llvm/include/llvm/IR/Type.h index b88c8aece8126c..2f53197df19998 100644 --- a/llvm/include/llvm/IR/Type.h +++ b/llvm/include/llvm/IR/Type.h @@ -250,10 +250,6 @@ class Type { /// True if this is an instance of PointerType. bool isPointerTy() const { return getTypeID() == PointerTyID; } - /// True if this is an instance of an opaque PointerType. - LLVM_DEPRECATED("Use isPointerTy() instead", "isPointerTy") - bool isOpaquePointerTy() const { return isPointerTy(); }; - /// Return true if this is a pointer type or a vector of pointer types. bool isPtrOrPtrVectorTy() const { return getScalarType()->isPointerTy(); } @@ -406,14 +402,6 @@ class Type { inline StringRef getTargetExtName() const; - /// Only use this method in code that is not reachable with opaque pointers, - /// or part of deprecated methods that will be removed as part of the opaque - /// pointers transition. - [[deprecated("Pointers no longer have element types")]] - Type *getNonOpaquePointerElementType() const { - llvm_unreachable("Pointers no longer have element types"); - } - /// Given vector type, change the element type, /// whilst keeping the old number of elements. /// For non-vectors simply returns \p EltTy. 
diff --git a/llvm/lib/IR/LLVMContext.cpp b/llvm/lib/IR/LLVMContext.cpp index c0fee93a233808..22e60772def43f 100644 --- a/llvm/lib/IR/LLVMContext.cpp +++ b/llvm/lib/IR/LLVMContext.cpp @@ -377,14 +377,6 @@ std::unique_ptr<DiagnosticHandler> LLVMContext::getDiagnosticHandler() { return std::move(pImpl->DiagHandler); } -void LLVMContext::setOpaquePointers(bool Enable) const { - assert(Enable && "Cannot disable opaque pointers"); -} - -bool LLVMContext::supportsTypedPointers() const { - return false; -} - StringRef LLVMContext::getDefaultTargetCPU() { return pImpl->DefaultTargetCPU; } From 3d34053af61ff45e05d230d2678eb8e95322eb14 Mon Sep 17 00:00:00 2001 From: Shengchen Kan Date: Tue, 24 Sep 2024 18:39:48 +0800 Subject: [PATCH 13/22] [X86,MC] Add relocation R_X86_64_REX2_GOTPCRELX (#106681) For mov name@GOTPCREL(%rip), %reg test %reg, name@GOTPCREL(%rip) binop name@GOTPCREL(%rip), %reg where binop is one of adc, add, and, cmp, or, sbb, sub, xor instructions, add `R_X86_64_REX2_GOTPCRELX`/`R_X86_64_CODE_4_GOTPCRELX` = 43 if the instruction starts at 4 bytes before the relocation offset. It is similar to R_X86_64_GOTPCRELX. The linker can treat `R_X86_64_REX2_GOTPCRELX`/`R_X86_64_CODE_4_GOTPCRELX` as `R_X86_64_GOTPCREL` or convert the above instructions to lea name(%rip), %reg mov $name, %reg test $name, %reg binop $name, %reg if the first byte of the instruction at the relocation `offset - 4` is `0xd5` (namely, encoded w/ REX2 prefix) when possible. Binutils patch: https://github.com/bminor/binutils-gdb/commit/3d5a60de52556f6a53d71d7e607c6696450ae3e4 Binutils mailthread: https://sourceware.org/pipermail/binutils/2023-December/131462.html ABI discussion: https://groups.google.com/g/x86-64-abi/c/KbzaNHRB6QU Blog: https://kanrobert.github.io/rfc/All-about-APX-relocation --- clang/test/Driver/relax.s | 2 ++ .../llvm/BinaryFormat/ELFRelocs/x86_64.def | 1 + llvm/lib/MC/MCTargetOptionsCommandFlags.cpp | 4 +-- .../Target/X86/MCTargetDesc/X86AsmBackend.cpp | 6 ++++ .../X86/MCTargetDesc/X86ELFObjectWriter.cpp | 7 +++- .../Target/X86/MCTargetDesc/X86FixupKinds.h | 4 +++ .../X86/MCTargetDesc/X86MCCodeEmitter.cpp | 19 +++++------ .../X86/MCTargetDesc/X86MachObjectWriter.cpp | 6 +++- .../MCTargetDesc/X86WinCOFFObjectWriter.cpp | 2 ++ llvm/test/MC/ELF/relocation-alias.s | 3 ++ llvm/test/MC/X86/gotpcrelx.s | 34 +++++++++++++++++++ llvm/test/MC/X86/reloc-directive-elf-64.s | 3 ++ 12 files changed, 77 insertions(+), 14 deletions(-) diff --git a/clang/test/Driver/relax.s b/clang/test/Driver/relax.s index 154d4db0a31385..b4a696a328eb56 100644 --- a/clang/test/Driver/relax.s +++ b/clang/test/Driver/relax.s @@ -8,5 +8,7 @@ // RUN: llvm-readobj -r %t | FileCheck --check-prefix=REL %s // REL: R_X86_64_REX_GOTPCRELX foo +// REL: R_X86_64_REX2_GOTPCRELX foo movq foo@GOTPCREL(%rip), %rax + movq foo@GOTPCREL(%rip), %r16 diff --git a/llvm/include/llvm/BinaryFormat/ELFRelocs/x86_64.def b/llvm/include/llvm/BinaryFormat/ELFRelocs/x86_64.def index 18fdcf9472dc48..161b1969abfeb4 100644 --- a/llvm/include/llvm/BinaryFormat/ELFRelocs/x86_64.def +++ b/llvm/include/llvm/BinaryFormat/ELFRelocs/x86_64.def @@ -43,3 +43,4 @@ ELF_RELOC(R_X86_64_TLSDESC, 36) ELF_RELOC(R_X86_64_IRELATIVE, 37) ELF_RELOC(R_X86_64_GOTPCRELX, 41) ELF_RELOC(R_X86_64_REX_GOTPCRELX, 42) +ELF_RELOC(R_X86_64_REX2_GOTPCRELX, 43) diff --git a/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp b/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp index 1a4f7e93eeb74a..92618bdabbe519 100644 --- a/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp +++ b/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp @@ -145,8 +145,8 @@
llvm::mc::RegisterMCTargetOptionsFlags::RegisterMCTargetOptionsFlags() { static cl::opt<bool> X86RelaxRelocations( "x86-relax-relocations", - cl::desc( - "Emit GOTPCRELX/REX_GOTPCRELX instead of GOTPCREL on x86-64 ELF"), + cl::desc("Emit GOTPCRELX/REX_GOTPCRELX/REX2_GOTPCRELX instead of " + "GOTPCREL on x86-64 ELF"), cl::init(true)); MCBINDOPT(X86RelaxRelocations); diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 1d08853faf582e..2f6b55b0d6023e 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -629,15 +629,19 @@ std::optional<MCFixupKind> X86AsmBackend::getFixupKind(StringRef Name) const { const MCFixupKindInfo &X86AsmBackend::getFixupKindInfo(MCFixupKind Kind) const { const static MCFixupKindInfo Infos[X86::NumTargetFixupKinds] = { + // clang-format off {"reloc_riprel_4byte", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, {"reloc_riprel_4byte_movq_load", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, + {"reloc_riprel_4byte_movq_load_rex2", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, {"reloc_riprel_4byte_relax", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, {"reloc_riprel_4byte_relax_rex", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, + {"reloc_riprel_4byte_relax_rex2", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, {"reloc_signed_4byte", 0, 32, 0}, {"reloc_signed_4byte_relax", 0, 32, 0}, {"reloc_global_offset_table", 0, 32, 0}, {"reloc_global_offset_table8", 0, 64, 0}, {"reloc_branch_4byte_pcrel", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, + // clang-format on }; // Fixup kinds from .reloc directive are like R_386_NONE/R_X86_64_NONE. They @@ -678,7 +682,9 @@ static unsigned getFixupKindSize(unsigned Kind) { case X86::reloc_riprel_4byte: case X86::reloc_riprel_4byte_relax: case X86::reloc_riprel_4byte_relax_rex: + case X86::reloc_riprel_4byte_relax_rex2: case X86::reloc_riprel_4byte_movq_load: + case X86::reloc_riprel_4byte_movq_load_rex2: case X86::reloc_signed_4byte: case X86::reloc_signed_4byte_relax: case X86::reloc_global_offset_table: diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index 0b2efdfc16cc5d..90222278d1ad6f 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -74,7 +74,9 @@ static X86_64RelType getType64(MCFixupKind Kind, case X86::reloc_riprel_4byte: case X86::reloc_riprel_4byte_relax: case X86::reloc_riprel_4byte_relax_rex: + case X86::reloc_riprel_4byte_relax_rex2: case X86::reloc_riprel_4byte_movq_load: + case X86::reloc_riprel_4byte_movq_load_rex2: return RT64_32; case X86::reloc_branch_4byte_pcrel: Modifier = MCSymbolRefExpr::VK_PLT; @@ -205,7 +207,7 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc, case MCSymbolRefExpr::VK_GOTPCREL: checkIs32(Ctx, Loc, Type); // Older versions of ld.bfd/ld.gold/lld - // do not support GOTPCRELX/REX_GOTPCRELX, + // do not support GOTPCRELX/REX_GOTPCRELX/REX2_GOTPCRELX, // and we want to keep back-compatibility.
if (!Ctx.getTargetOptions()->X86RelaxRelocations) return ELF::R_X86_64_GOTPCREL; @@ -217,6 +219,9 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc, case X86::reloc_riprel_4byte_relax_rex: case X86::reloc_riprel_4byte_movq_load: return ELF::R_X86_64_REX_GOTPCRELX; + case X86::reloc_riprel_4byte_relax_rex2: + case X86::reloc_riprel_4byte_movq_load_rex2: + return ELF::R_X86_64_REX2_GOTPCRELX; } llvm_unreachable("unexpected relocation type!"); case MCSymbolRefExpr::VK_GOTPCREL_NORELAX: diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h b/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h index 2d5217115d07cb..29bb7eebae3f22 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h @@ -16,10 +16,14 @@ namespace X86 { enum Fixups { reloc_riprel_4byte = FirstTargetFixupKind, // 32-bit rip-relative reloc_riprel_4byte_movq_load, // 32-bit rip-relative in movq + reloc_riprel_4byte_movq_load_rex2, // 32-bit rip-relative in movq + // with rex2 prefix reloc_riprel_4byte_relax, // 32-bit rip-relative in relaxable // instruction reloc_riprel_4byte_relax_rex, // 32-bit rip-relative in relaxable // instruction with rex prefix + reloc_riprel_4byte_relax_rex2, // 32-bit rip-relative in relaxable + // instruction with rex2 prefix reloc_signed_4byte, // 32-bit signed. Unlike FK_Data_4 // this will be sign extended at // runtime. diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 71d42863fd5857..206436191c2584 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -568,8 +568,10 @@ void X86MCCodeEmitter::emitImmediate(const MCOperand &DispOp, SMLoc Loc, if (FixupKind == FK_PCRel_4 || FixupKind == MCFixupKind(X86::reloc_riprel_4byte) || FixupKind == MCFixupKind(X86::reloc_riprel_4byte_movq_load) || + FixupKind == MCFixupKind(X86::reloc_riprel_4byte_movq_load_rex2) || FixupKind == MCFixupKind(X86::reloc_riprel_4byte_relax) || FixupKind == MCFixupKind(X86::reloc_riprel_4byte_relax_rex) || + FixupKind == MCFixupKind(X86::reloc_riprel_4byte_relax_rex2) || FixupKind == MCFixupKind(X86::reloc_branch_4byte_pcrel)) { ImmOffset -= 4; // If this is a pc-relative load off _GLOBAL_OFFSET_TABLE_: @@ -637,12 +639,11 @@ void X86MCCodeEmitter::emitMemModRMByte( default: return X86::reloc_riprel_4byte; case X86::MOV64rm: - // movq loads is a subset of reloc_riprel_4byte_relax_rex. It is a + // movq loads is a subset of reloc_riprel_4byte_relax_rex/rex2. It is a // special case because COFF and Mach-O don't support ELF's more - // flexible R_X86_64_REX_GOTPCRELX relaxation. - // TODO: Support new relocation for REX2. - assert(Kind == REX || Kind == REX2); - return X86::reloc_riprel_4byte_movq_load; + // flexible R_X86_64_REX_GOTPCRELX/R_X86_64_REX2_GOTPCRELX relaxation. + return Kind == REX2 ? X86::reloc_riprel_4byte_movq_load_rex2 + : X86::reloc_riprel_4byte_movq_load; case X86::ADC32rm: case X86::ADD32rm: case X86::AND32rm: @@ -665,11 +666,9 @@ void X86MCCodeEmitter::emitMemModRMByte( case X86::SBB64rm: case X86::SUB64rm: case X86::XOR64rm: - // We haven't support relocation for REX2 prefix, so temporarily use REX - // relocation. - // TODO: Support new relocation for REX2. - return (Kind == REX || Kind == REX2) ? X86::reloc_riprel_4byte_relax_rex - : X86::reloc_riprel_4byte_relax; + return Kind == REX2 ? X86::reloc_riprel_4byte_relax_rex2 + : Kind == REX ? 
X86::reloc_riprel_4byte_relax_rex + : X86::reloc_riprel_4byte_relax; } }(); diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index ec95b1ffec387d..41ce5c9fcb82ad 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -66,8 +66,10 @@ class X86MachObjectWriter : public MCMachObjectTargetWriter { static bool isFixupKindRIPRel(unsigned Kind) { return Kind == X86::reloc_riprel_4byte || Kind == X86::reloc_riprel_4byte_movq_load || + Kind == X86::reloc_riprel_4byte_movq_load_rex2 || Kind == X86::reloc_riprel_4byte_relax || - Kind == X86::reloc_riprel_4byte_relax_rex; + Kind == X86::reloc_riprel_4byte_relax_rex || + Kind == X86::reloc_riprel_4byte_relax_rex2; } static unsigned getFixupKindLog2Size(unsigned Kind) { @@ -83,7 +85,9 @@ static unsigned getFixupKindLog2Size(unsigned Kind) { case X86::reloc_riprel_4byte: case X86::reloc_riprel_4byte_relax: case X86::reloc_riprel_4byte_relax_rex: + case X86::reloc_riprel_4byte_relax_rex2: case X86::reloc_riprel_4byte_movq_load: + case X86::reloc_riprel_4byte_movq_load_rex2: case X86::reloc_signed_4byte: case X86::reloc_signed_4byte_relax: case X86::reloc_branch_4byte_pcrel: diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp index 10fc176b59d8ab..7740500fb41830 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp @@ -66,8 +66,10 @@ unsigned X86WinCOFFObjectWriter::getRelocType(MCContext &Ctx, case FK_PCRel_4: case X86::reloc_riprel_4byte: case X86::reloc_riprel_4byte_movq_load: + case X86::reloc_riprel_4byte_movq_load_rex2: case X86::reloc_riprel_4byte_relax: case X86::reloc_riprel_4byte_relax_rex: + case X86::reloc_riprel_4byte_relax_rex2: case X86::reloc_branch_4byte_pcrel: return COFF::IMAGE_REL_AMD64_REL32; case FK_Data_4: diff --git a/llvm/test/MC/ELF/relocation-alias.s b/llvm/test/MC/ELF/relocation-alias.s index 51fb0c37052fe7..66bf2ceea508ba 100644 --- a/llvm/test/MC/ELF/relocation-alias.s +++ b/llvm/test/MC/ELF/relocation-alias.s @@ -16,7 +16,10 @@ movabsq $memcpy+2, %rax # CHECK: movq (%rip), %rax # CHECK-NEXT: R_X86_64_REX_GOTPCRELX abs-0x4 +# CHECK: movq (%rip), %r16 +# CHECK-NEXT: R_X86_64_REX2_GOTPCRELX abs-0x4 movq abs@GOTPCREL(%rip), %rax +movq abs@GOTPCREL(%rip), %r16 abs = 42 # CHECK: movabsq $0, %rbx diff --git a/llvm/test/MC/X86/gotpcrelx.s b/llvm/test/MC/X86/gotpcrelx.s index e63e3e9a946fd1..5a8ba454bc904c 100644 --- a/llvm/test/MC/X86/gotpcrelx.s +++ b/llvm/test/MC/X86/gotpcrelx.s @@ -37,6 +37,16 @@ # CHECK-NEXT: R_X86_64_REX_GOTPCRELX sbb # CHECK-NEXT: R_X86_64_REX_GOTPCRELX sub # CHECK-NEXT: R_X86_64_REX_GOTPCRELX xor +# CHECK-NEXT: R_X86_64_REX2_GOTPCRELX mov +# CHECK-NEXT: R_X86_64_REX2_GOTPCRELX test +# CHECK-NEXT: R_X86_64_REX2_GOTPCRELX adc +# CHECK-NEXT: R_X86_64_REX2_GOTPCRELX add +# CHECK-NEXT: R_X86_64_REX2_GOTPCRELX and +# CHECK-NEXT: R_X86_64_REX2_GOTPCRELX cmp +# CHECK-NEXT: R_X86_64_REX2_GOTPCRELX or +# CHECK-NEXT: R_X86_64_REX2_GOTPCRELX sbb +# CHECK-NEXT: R_X86_64_REX2_GOTPCRELX sub +# CHECK-NEXT: R_X86_64_REX2_GOTPCRELX xor # CHECK-NEXT: } # NORELAX-NEXT: R_X86_64_GOTPCREL mov @@ -71,6 +81,16 @@ # NORELAX-NEXT: R_X86_64_GOTPCREL sbb # NORELAX-NEXT: R_X86_64_GOTPCREL sub # NORELAX-NEXT: R_X86_64_GOTPCREL xor +# NORELAX-NEXT: R_X86_64_GOTPCREL mov +# NORELAX-NEXT: R_X86_64_GOTPCREL test +# 
NORELAX-NEXT: R_X86_64_GOTPCREL adc +# NORELAX-NEXT: R_X86_64_GOTPCREL add +# NORELAX-NEXT: R_X86_64_GOTPCREL and +# NORELAX-NEXT: R_X86_64_GOTPCREL cmp +# NORELAX-NEXT: R_X86_64_GOTPCREL or +# NORELAX-NEXT: R_X86_64_GOTPCREL sbb +# NORELAX-NEXT: R_X86_64_GOTPCREL sub +# NORELAX-NEXT: R_X86_64_GOTPCREL xor +# NORELAX-NEXT: } movl mov@GOTPCREL(%rip), %eax @@ -108,10 +128,22 @@ sbb sbb@GOTPCREL(%rip), %rax sub sub@GOTPCREL(%rip), %rax xor xor@GOTPCREL(%rip), %rax +movq mov@GOTPCREL(%rip), %r16 +test %r16, test@GOTPCREL(%rip) +adc adc@GOTPCREL(%rip), %r16 +add add@GOTPCREL(%rip), %r16 +and and@GOTPCREL(%rip), %r16 +cmp cmp@GOTPCREL(%rip), %r16 +or or@GOTPCREL(%rip), %r16 +sbb sbb@GOTPCREL(%rip), %r16 +sub sub@GOTPCREL(%rip), %r16 +xor xor@GOTPCREL(%rip), %r16 + # COMMON-NEXT: Section ({{.*}}) .rela.norelax { # COMMON-NEXT: R_X86_64_GOTPCREL mov 0x0 # COMMON-NEXT: R_X86_64_GOTPCREL mov 0xFFFFFFFFFFFFFFFD # COMMON-NEXT: R_X86_64_GOTPCREL mov 0xFFFFFFFFFFFFFFFC +# COMMON-NEXT: R_X86_64_GOTPCREL mov 0xFFFFFFFFFFFFFFFD # COMMON-NEXT: } # COMMON-NEXT: ] @@ -123,3 +155,5 @@ movl mov@GOTPCREL+4(%rip), %eax movq mov@GOTPCREL+1(%rip), %rax ## We could emit R_X86_64_GOTPCRELX, but it is probably unnecessary. movl mov@GOTPCREL+0(%rip), %eax +## Don't emit R_X86_64_GOTPCRELX. +movq mov@GOTPCREL+1(%rip), %r16 diff --git a/llvm/test/MC/X86/reloc-directive-elf-64.s b/llvm/test/MC/X86/reloc-directive-elf-64.s index 8f5d8c895e7d76..323603efc70618 100644 --- a/llvm/test/MC/X86/reloc-directive-elf-64.s +++ b/llvm/test/MC/X86/reloc-directive-elf-64.s @@ -9,6 +9,7 @@ # PRINT-NEXT: .reloc 0, R_X86_64_64, .data+2 # PRINT-NEXT: .reloc 0, R_X86_64_GOTPCRELX, foo+3 # PRINT-NEXT: .reloc 0, R_X86_64_REX_GOTPCRELX, 5 +# PRINT-NEXT: .reloc 0, R_X86_64_REX2_GOTPCRELX, 7 # PRINT: .reloc 0, BFD_RELOC_NONE, 9 # PRINT-NEXT: .reloc 0, BFD_RELOC_8, 9 # PRINT-NEXT: .reloc 0, BFD_RELOC_16, 9 @@ -21,6 +22,7 @@ # CHECK-NEXT: 0x0 R_X86_64_64 .data 0x2 # CHECK-NEXT: 0x0 R_X86_64_GOTPCRELX foo 0x3 # CHECK-NEXT: 0x0 R_X86_64_REX_GOTPCRELX - 0x5 +# CHECK-NEXT: 0x0 R_X86_64_REX2_GOTPCRELX - 0x7 # CHECK-NEXT: 0x0 R_X86_64_NONE - 0x9 # CHECK-NEXT: 0x0 R_X86_64_8 - 0x9 # CHECK-NEXT: 0x0 R_X86_64_16 - 0x9 @@ -37,6 +39,7 @@ .reloc 0, R_X86_64_64, .data+2 .reloc 0, R_X86_64_GOTPCRELX, foo+3 .reloc 0, R_X86_64_REX_GOTPCRELX, 5 + .reloc 0, R_X86_64_REX2_GOTPCRELX, 7 .reloc 0, BFD_RELOC_NONE, 9 .reloc 0, BFD_RELOC_8, 9 From 6cfe6a6b3e9578be80120add7fbe19506f747196 Mon Sep 17 00:00:00 2001 From: Georgi Mirazchiyski Date: Tue, 24 Sep 2024 11:47:57 +0100 Subject: [PATCH 14/22] [NFC][AMDGPU] Assert no bad shift operations will happen (#108416) The assumption in the asserts is based on the fact that no SGPR/VGPR register Arg mask in the ISelLowering and Legalizer can equal zero. They are implicitly set to ~0 by default (meaning non-masked) or explicitly to a non-zero value. The `optimizeCompareInstr` case differs from the cases described above: it requires the mask to be a power of two because it is a special-case optimization, hence in this case we still cannot have an invalid shift. This commit also silences static analysis tools with respect to potential bad shifts that could result from the output of `countr_zero(Mask)`.
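To make the guarded pattern concrete outside LLVM, here is a minimal standalone sketch using the C++20 <bit> equivalents of `llvm::has_single_bit` and `llvm::countr_zero` (illustrative only, not code from this patch):

#include <bit>
#include <cassert>
#include <cstdint>

uint64_t isolateMaskBit(uint64_t Mask) {
  // Non-zero and power-of-two, exactly what the new assert enforces.
  assert(std::has_single_bit(Mask) && "Invalid mask.");
  // countr_zero(0) would return 64, and shifting a 64-bit value by 64 is
  // undefined behavior; the assert above rules that case out.
  unsigned BitNo = std::countr_zero(Mask); // 0..63 when the assert holds
  return uint64_t(1) << BitNo;             // isolateMaskBit(0x10) == 0x10
}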
--- llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h | 2 ++ llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 3 +++ 2 files changed, 5 insertions(+) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h index 2e02bb4271adc7..06b2f181c276cd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h @@ -77,6 +77,8 @@ struct ArgDescriptor { } unsigned getMask() const { + // None of the target SGPRs or VGPRs are expected to have a 'zero' mask. + assert(Mask && "Invalid mask."); return Mask; } diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index c787edf7cfd11b..f5f367b2a4a7c6 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -9791,6 +9791,9 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, else return false; + // A valid Mask is required to have a single bit set, hence a non-zero and + // power-of-two value. This verifies that we will not do 64-bit shift below. + assert(llvm::has_single_bit(Mask) && "Invalid mask."); unsigned BitNo = llvm::countr_zero((uint64_t)Mask); if (IsSigned && BitNo == SrcSize - 1) return false; From 5dc15ddf575978e0115b1a6edacb59f056792a80 Mon Sep 17 00:00:00 2001 From: Georgi Mirazchiyski Date: Tue, 24 Sep 2024 11:48:21 +0100 Subject: [PATCH 15/22] [AMDGPU] Default-initialize uninitialized class member variables (#108428) --- llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp | 8 ++++---- llvm/lib/Target/AMDGPU/SIMachineScheduler.h | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 434336ef137ff5..46f5097c679fb3 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -92,12 +92,12 @@ class V2SCopyInfo { SetVector SChain; // Number of SGPR to VGPR copies that are used to put the SALU computation // results back to VALU. - unsigned NumSVCopies; + unsigned NumSVCopies = 0; - unsigned Score; + unsigned Score = 0; // Actual count of v_readfirstlane_b32 // which need to be inserted to keep SChain SALU - unsigned NumReadfirstlanes; + unsigned NumReadfirstlanes = 0; // Current score state. To speedup selection V2SCopyInfos for processing bool NeedToBeConvertedToVALU = false; // Unique ID. Used as a key for mapping to keep permanent order. 
@@ -109,7 +109,7 @@ class V2SCopyInfo { SetVector Siblings; V2SCopyInfo() : Copy(nullptr), ID(0){}; V2SCopyInfo(unsigned Id, MachineInstr *C, unsigned Width) - : Copy(C), NumSVCopies(0), NumReadfirstlanes(Width / 32), ID(Id){}; + : Copy(C), NumReadfirstlanes(Width / 32), ID(Id){}; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void dump() { dbgs() << ID << " : " << *Copy << "\n\tS:" << SChain.size() diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.h b/llvm/lib/Target/AMDGPU/SIMachineScheduler.h index ac34a748edbc1e..f8f4b5aae338eb 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.h +++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.h @@ -120,8 +120,8 @@ class SIScheduleBlock { ArrayRef> getSuccs() const { return Succs; } - unsigned Height; // Maximum topdown path length to block without outputs - unsigned Depth; // Maximum bottomup path length to block without inputs + unsigned Height = 0; // Maximum topdown path length to block without outputs + unsigned Depth = 0; // Maximum bottomup path length to block without inputs unsigned getNumHighLatencySuccessors() const { return NumHighLatencySuccessors; From 4f8e76684f4c1e67726222c35f173ef722464a7e Mon Sep 17 00:00:00 2001 From: Vyacheslav Levytskyy Date: Tue, 24 Sep 2024 13:33:31 +0200 Subject: [PATCH 16/22] [AsmPrinter] Do not emit label instructions after the function body if the target is SPIR-V (#107013) AsmPrinter always creates a symbol for the end of function if valid debug info is present. However, this breaks SPIR-V target's output, because SPIR-V specification allows label instructions only inside a block, not after the function body (see https://registry.khronos.org/SPIR-V/specs/unified1/SPIRV.html#OpLabel). This PR proposes to disable emission of label instructions after the function body if the target is SPIR-V. This PR is a fix of the https://github.com/llvm/llvm-project/issues/102732 issue. --- llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp | 5 ++++- llvm/test/CodeGen/SPIRV/debug-info/debug-compilation-unit.ll | 2 ++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index db7adfd3b21e5f..d17800d375b7f2 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -1971,7 +1971,10 @@ void AsmPrinter::emitFunctionBody() { // are automatically sized. bool EmitFunctionSize = MAI->hasDotTypeDotSizeDirective() && !TT.isWasm(); - if (EmitFunctionSize || needFuncLabels(*MF, *this)) { + // SPIR-V supports label instructions only inside a block, not after the + // function body. + if (TT.getObjectFormat() != Triple::SPIRV && + (EmitFunctionSize || needFuncLabels(*MF, *this))) { // Create a symbol for the end of function. 
CurrentFnEnd = createTempSymbol("func_end"); OutStreamer->emitLabel(CurrentFnEnd); diff --git a/llvm/test/CodeGen/SPIRV/debug-info/debug-compilation-unit.ll b/llvm/test/CodeGen/SPIRV/debug-info/debug-compilation-unit.ll index bff4660559ab82..794dcd6d9f3fb4 100644 --- a/llvm/test/CodeGen/SPIRV/debug-info/debug-compilation-unit.ll +++ b/llvm/test/CodeGen/SPIRV/debug-info/debug-compilation-unit.ll @@ -29,11 +29,13 @@ define spir_func void @foo() { entry: ret void } +; CHECK-SPIRV-NOT: Lfunc_end0: define spir_func void @bar() { entry: ret void } +; CHECK-SPIRV-NOT: Lfunc_end1: !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!2, !3, !4, !5} From 497759e872a53964a54db941f3a1ed74446c5ed4 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Tue, 24 Sep 2024 12:40:42 +0100 Subject: [PATCH 17/22] [lldb][AArch64] Create Neon subregs when XML only includes SVE (#108365) Fixes #107864 QEMU decided that when SVE is enabled it will only tell us about SVE registers in the XML, and not include Neon registers. On the grounds that the Neon V registers can be read from the bottom 128 bits of a SVE Z register (SVE's vector length is always >= 128 bits). To support this we create sub-registers just as we do for S and D registers of the V registers. Except this time we use part of the Z registers. This change also updates our fallback for registers with unknown types that are > 128 bit. This is detailed in https://github.com/llvm/llvm-project/issues/87471, though that covers more than this change fixes. We'll now treat any register of unknown type that is >= 128 bit as a vector of bytes. So that the user gets to see something even if the order might be wrong. And until lldb supports vector and union types for registers, this is also the only way we can get a value to apply the sub-reg to, to make the V registers. --- .../source/Plugins/ABI/AArch64/ABIAArch64.cpp | 40 +++++- .../Process/gdb-remote/ProcessGDBRemote.cpp | 9 +- .../TestAArch64XMLRegistersSVEOnly.py | 121 ++++++++++++++++++ 3 files changed, 163 insertions(+), 7 deletions(-) create mode 100644 lldb/test/API/functionalities/gdb_remote_client/TestAArch64XMLRegistersSVEOnly.py diff --git a/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp b/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp index 256c1f828feb38..7d8d0a4d3d6711 100644 --- a/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp +++ b/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp @@ -136,6 +136,8 @@ void ABIAArch64::AugmentRegisterInfo( std::array, 32> x_regs; std::array, 32> v_regs; + std::array, 32> z_regs; + std::optional z_byte_size; for (auto it : llvm::enumerate(regs)) { lldb_private::DynamicRegisterInfo::Register &info = it.value(); @@ -157,16 +159,44 @@ void ABIAArch64::AugmentRegisterInfo( x_regs[reg_num] = it.index(); else if (get_reg("v")) v_regs[reg_num] = it.index(); + else if (get_reg("z")) { + z_regs[reg_num] = it.index(); + if (!z_byte_size) + z_byte_size = info.byte_size; + } // if we have at least one subregister, abort else if (get_reg("w") || get_reg("s") || get_reg("d")) return; } - // Create aliases for partial registers: wN for xN, and sN/dN for vN. + // Create aliases for partial registers. + + // Wn for Xn. 
addPartialRegisters(regs, x_regs, 8, "w{0}", 4, lldb::eEncodingUint, lldb::eFormatHex); - addPartialRegisters(regs, v_regs, 16, "s{0}", 4, lldb::eEncodingIEEE754, - lldb::eFormatFloat); - addPartialRegisters(regs, v_regs, 16, "d{0}", 8, lldb::eEncodingIEEE754, - lldb::eFormatFloat); + + auto bool_predicate = [](const auto ®_num) { return bool(reg_num); }; + bool saw_v_regs = std::any_of(v_regs.begin(), v_regs.end(), bool_predicate); + bool saw_z_regs = std::any_of(z_regs.begin(), z_regs.end(), bool_predicate); + + // Sn/Dn for Vn. + if (saw_v_regs) { + addPartialRegisters(regs, v_regs, 16, "s{0}", 4, lldb::eEncodingIEEE754, + lldb::eFormatFloat); + addPartialRegisters(regs, v_regs, 16, "d{0}", 8, lldb::eEncodingIEEE754, + lldb::eFormatFloat); + } else if (saw_z_regs && z_byte_size) { + // When SVE is enabled, some debug stubs will not describe the Neon V + // registers because they can be read from the bottom 128 bits of the SVE + // registers. + + // The size used here is the one sent by the debug server. This only needs + // to be correct right now. Later we will rely on the value of vg instead. + addPartialRegisters(regs, z_regs, *z_byte_size, "v{0}", 16, + lldb::eEncodingVector, lldb::eFormatVectorOfUInt8); + addPartialRegisters(regs, z_regs, *z_byte_size, "s{0}", 4, + lldb::eEncodingIEEE754, lldb::eFormatFloat); + addPartialRegisters(regs, z_regs, *z_byte_size, "d{0}", 8, + lldb::eEncodingIEEE754, lldb::eFormatFloat); + } } diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp index 9e8c6046179631..3e09c316d74f44 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp @@ -4716,9 +4716,14 @@ bool ParseRegisters( reg_info.encoding = eEncodingIEEE754; } else if (gdb_type == "aarch64v" || llvm::StringRef(gdb_type).starts_with("vec") || - gdb_type == "i387_ext" || gdb_type == "uint128") { + gdb_type == "i387_ext" || gdb_type == "uint128" || + reg_info.byte_size > 16) { // lldb doesn't handle 128-bit uints correctly (for ymm*h), so - // treat them as vector (similarly to xmm/ymm) + // treat them as vector (similarly to xmm/ymm). + // We can fall back to handling anything else <= 128 bit as an + // unsigned integer, more than that, call it a vector of bytes. + // This can happen if we don't recognise the type for AArc64 SVE + // registers. reg_info.format = eFormatVectorOfUInt8; reg_info.encoding = eEncodingVector; } else { diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestAArch64XMLRegistersSVEOnly.py b/lldb/test/API/functionalities/gdb_remote_client/TestAArch64XMLRegistersSVEOnly.py new file mode 100644 index 00000000000000..e36013a11491b3 --- /dev/null +++ b/lldb/test/API/functionalities/gdb_remote_client/TestAArch64XMLRegistersSVEOnly.py @@ -0,0 +1,121 @@ +""" Check that when a debug server provides XML that only defines SVE Z registers, + and does not include Neon V registers, lldb creates sub-registers to represent + the V registers as the bottom 128 bits of the Z registers. + + qemu-aarch64 is one such debug server. + + This also doubles as a test that lldb has a fallback path for registers of + unknown type that are > 128 bits, as the SVE registers are here. 
+""" + +from textwrap import dedent +import lldb +from lldbsuite.test.lldbtest import * +from lldbsuite.test.decorators import * +from lldbsuite.test.gdbclientutils import * +from lldbsuite.test.lldbgdbclient import GDBRemoteTestBase + + +class Responder(MockGDBServerResponder): + def __init__(self): + super().__init__() + self.vg = 4 + self.pc = 0xA0A0A0A0A0A0A0A0 + + def qXferRead(self, obj, annex, offset, length): + if annex == "target.xml": + # Note that QEMU sends the current SVE size in XML and the debugger + # then reads vg to know the latest size. + return ( + dedent( + """\ + + + aarch64 + + + + + + """ + ), + False, + ) + + return (None,) + + def readRegister(self, regnum): + return "E01" + + def readRegisters(self): + return "".join( + [ + # 64 bit PC. + f"{self.pc:x}", + # 64 bit vg + f"0{self.vg}00000000000000", + # Enough data for 256 and 512 bit SVE. + "".join([f"{n:02x}" * 4 for n in range(1, 17)]), + ] + ) + + def cont(self): + # vg is expedited so that lldb can resize the SVE registers. + return f"T02thread:1ff0d;threads:1ff0d;thread-pcs:{self.pc};01:0{self.vg}00000000000000;" + + def writeRegisters(self, registers_hex): + # We get a block of data containing values in regnum order. + self.vg = int(registers_hex[16:18]) + return "OK" + + +class TestXMLRegisterFlags(GDBRemoteTestBase): + def check_regs(self, vg): + # Each 32 bit chunk repeats n. + z0_value = " ".join( + [" ".join([f"0x{n:02x}"] * 4) for n in range(1, (vg * 2) + 1)] + ) + + self.expect( + "register read vg z0 v0 s0 d0", + substrs=[ + f" vg = 0x000000000000000{vg}\n" + " z0 = {" + z0_value + "}\n" + " v0 = {0x01 0x01 0x01 0x01 0x02 0x02 0x02 0x02 0x03 0x03 0x03 0x03 0x04 0x04 0x04 0x04}\n" + " s0 = 2.36942783E-38\n" + " d0 = 5.3779407333977203E-299\n" + ], + ) + + self.expect("register read s0 --format uint32", substrs=["s0 = {0x01010101}"]) + self.expect( + "register read d0 --format uint64", + substrs=["d0 = {0x0202020201010101}"], + ) + + @skipIfXmlSupportMissing + @skipIfRemote + @skipIfLLVMTargetMissing("AArch64") + def test_v_sub_registers(self): + self.server.responder = Responder() + target = self.dbg.CreateTarget("") + + if self.TraceOn(): + self.runCmd("log enable gdb-remote packets") + self.addTearDownHook(lambda: self.runCmd("log disable gdb-remote packets")) + + process = self.connect(target) + lldbutil.expect_state_changes( + self, self.dbg.GetListener(), process, [lldb.eStateStopped] + ) + + self.check_regs(4) + + # Now increase the SVE length and continue. The mock will respond with a new + # vg and lldb will reconfigure the register defs. This should not break the + # sub-registers. + + self.runCmd("register write vg 8") + self.expect("continue", substrs=["stop reason = signal SIGINT"]) + + self.check_regs(8) From c30fa3cde755e7519f0962f581868a09da1ea130 Mon Sep 17 00:00:00 2001 From: Georgi Mirazchiyski Date: Tue, 24 Sep 2024 12:54:05 +0100 Subject: [PATCH 18/22] [AMDGPU] Fix has_single_bit assertion for Mask in SIInstrInfo (#109785) Convert the `int64_t` Mask to `uint64_t` for `llvm::has_single_bit` to compile. 
--- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index f5f367b2a4a7c6..9ad0b4c65e1d90 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -9793,7 +9793,7 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, // A valid Mask is required to have a single bit set, hence a non-zero and // power-of-two value. This verifies that we will not do 64-bit shift below. - assert(llvm::has_single_bit(Mask) && "Invalid mask."); + assert(llvm::has_single_bit(static_cast<uint64_t>(Mask)) && "Invalid mask."); unsigned BitNo = llvm::countr_zero((uint64_t)Mask); if (IsSigned && BitNo == SrcSize - 1) return false; From 029b9b611d8becf04f4c525ab2b70e956b4b186d Mon Sep 17 00:00:00 2001 From: David Spickett Date: Tue, 24 Sep 2024 13:02:52 +0100 Subject: [PATCH 19/22] [llvm][docs] Improve formatting of ENABLE_PROJECTS/RUNTIMES description * Add line breaks so it's clear what should be passed to CMake. * Make the note into an RST note block. * Fix a couple of markdown style plain text markers. --- llvm/docs/CMake.rst | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/llvm/docs/CMake.rst b/llvm/docs/CMake.rst index 838447f483e510..b5adb22d8f33b1 100644 --- a/llvm/docs/CMake.rst +++ b/llvm/docs/CMake.rst @@ -571,10 +571,12 @@ enabled sub-projects. Nearly all of these variable names begin with Semicolon-separated list of projects to build, or *all* for building all (clang, lldb, lld, polly, etc) projects. This flag assumes that projects are checked out side-by-side and not nested, i.e. clang needs to be in - parallel of llvm instead of nested in `llvm/tools`. This feature allows + parallel of llvm instead of nested in ``llvm/tools``. This feature allows to have one build for only LLVM and another for clang+llvm using the same source checkout. + The full list is: + ``clang;clang-tools-extra;cross-project-tests;libc;libclc;lld;lldb;openmp;polly;pstl`` **LLVM_ENABLE_RTTI**:BOOL @@ -586,10 +588,16 @@ enabled sub-projects. Nearly all of these variable names begin with It will build the builtins separately from the other runtimes to preserve correct dependency ordering. If you want to build the runtimes using a system compiler, see the `libc++ documentation `_. - Note: the list should not have duplicates with `LLVM_ENABLE_PROJECTS`. + + .. note:: + The list should not have duplicates with ``LLVM_ENABLE_PROJECTS``. + The full list is: + ``compiler-rt;libc;libcxx;libcxxabi;libunwind;openmp`` + To enable all of them, use: + ``LLVM_ENABLE_RUNTIMES=all`` **LLVM_ENABLE_SPHINX**:BOOL From c1826aeef353bf4bd8b181b47a0dbb1f1af93836 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= Date: Tue, 24 Sep 2024 13:14:49 +0100 Subject: [PATCH 20/22] [mlir][tensor] Add new helper hooks for RelayoutOp (#109642) Implements two helper hooks for PackOp and UnPackOp, `getAllOuterDims` and `getTiledOuterDims`, and adds them to RelayoutOp (which both PackOp and UnPackOp inherit from). This improves code reuse and also clarifies the meaning of "outer dims" and "tiled outer dims".
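To make the two hooks concrete, their logic can be restated outside MLIR with a worked example (a sketch with hypothetical helper names; the real methods are the ones added to TensorOps.cpp below):

#include <cstdint>
#include <vector>

// Leading dest dims up to the source rank: tiled and non-tiled outer dims.
std::vector<int64_t> allOuterDims(const std::vector<int64_t> &packedShape,
                                  size_t srcRank) {
  return {packedShape.begin(), packedShape.begin() + srcRank};
}

// Only the outer dims that were tiled, in inner_dims_pos order.
std::vector<int64_t> tiledOuterDims(const std::vector<int64_t> &packedShape,
                                    const std::vector<int64_t> &innerDimsPos) {
  std::vector<int64_t> res;
  for (int64_t idx : innerDimsPos)
    res.push_back(packedShape[idx]);
  return res;
}

// For a pack of tensor<5x32xf32> with inner_dims_pos = [1] and
// inner_tiles = [16] into tensor<5x2x16xf32>:
//   allOuterDims({5, 2, 16}, 2)      -> {5, 2}
//   tiledOuterDims({5, 2, 16}, {1})  -> {2}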
--- .../mlir/Dialect/Tensor/IR/TensorOps.td | 18 +++++++++- .../Dialect/Linalg/Transforms/Transforms.cpp | 23 ++++++------- mlir/lib/Dialect/Tensor/IR/TensorOps.cpp | 34 +++++++++++++++++++ 3 files changed, 61 insertions(+), 14 deletions(-) diff --git a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td index cafc3d91fd1e9d..3170115883e2be 100644 --- a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td +++ b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td @@ -1814,7 +1814,7 @@ def Tensor_SplatOp : Tensor_Op<"splat", [ } //===----------------------------------------------------------------------===// -// PackOp +// RelayoutOp //===----------------------------------------------------------------------===// class Tensor_RelayoutOp traits = []> : @@ -1851,11 +1851,27 @@ class Tensor_RelayoutOp traits = []> : /// a sentinel `kDynamic` is introduced at that position in /// the returned vector. SmallVector getStaticTiles(); + + /// Retrieve all outer dims for this Pack/UnPack Op, i.e. all the leading + /// dims excluding the trailing dims corresponding to `innerTiles`. Note + /// that this will include both tiled and non-tiled dimensions. The order + /// of the output dimensions is consistent with the shape of the packed + /// tensor. + ArrayRef getAllOuterDims(); + + /// Similar to `getAllOuterDims`, but only retrieve the outer dims that + /// have been tiled. Also, the order of the output dimensions is consistent + /// with `inner_dims_pos` rather than the packed tensor. + SmallVector getTiledOuterDims(); }]; let hasVerifier = 1; } +//===----------------------------------------------------------------------===// +// PackOp +//===----------------------------------------------------------------------===// + def Tensor_PackOp : Tensor_RelayoutOp<"pack", [ AttrSizedOperandSegments]> { let summary = "tensor pack operation"; diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp index 77f0ea9d2236ea..e0dea8e78d55c1 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp @@ -1030,11 +1030,13 @@ static Value getPackOpSourceOrPaddedSource(OpBuilder &builder, return input; } + assert(llvm::all_of(packOp.getAllOuterDims(), + [](int64_t val) { return val == 1; }) && + "some outer dims are != 1"); + Location loc = packOp.getLoc(); ShapedType inputType = packOp.getSourceType(); int64_t inputRank = inputType.getRank(); - assert(llvm::all_of(packOp.getDestType().getShape().take_front(inputRank), - [](int64_t val) { return val == 1; })); SmallVector paddedShape; DenseMap tileAndPosMapping = @@ -1126,12 +1128,8 @@ LogicalResult GeneralizeOuterUnitDimsPackOpPattern::matchAndRewrite( // TODO: support the case that outer dimensions are not all 1s. A // tensor.expand_shape will be generated in this case. 
- auto innerDimsPos = packOp.getInnerDimsPos(); - int64_t srcRank = packOp.getSourceRank(); - auto destShape = packOp.getDestType().getShape(); - if (llvm::any_of(innerDimsPos, [destShape](int64_t index) { - return destShape[index] != 1; - })) { + if (llvm::any_of(packOp.getTiledOuterDims(), + [](int64_t dim) { return dim != 1; })) { return rewriter.notifyMatchFailure( packOp, "require the tiled outer dimensions of the result are all 1s"); } @@ -1145,6 +1143,7 @@ LogicalResult GeneralizeOuterUnitDimsPackOpPattern::matchAndRewrite( packOp.getDimAndTileMapping(); Attribute zeroIdxAttr = rewriter.getIndexAttr(0); Attribute oneIdxAttr = rewriter.getIndexAttr(1); + int64_t srcRank = packOp.getSourceRank(); SmallVector readOffsets(srcRank, zeroIdxAttr); SmallVector readStrides(srcRank, oneIdxAttr); SmallVector readSizes; @@ -1173,9 +1172,8 @@ LogicalResult GeneralizeOuterUnitDimsPackOpPattern::matchAndRewrite( loc, readType, input, readOffsets, readSizes, readStrides); // 2. Transpose the tile to match the inner tile order. - SmallVector perm = getPackUnpackRankReducedPerm( - inputShape, innerDimsPos, packOp.getOuterDimsPerm()); + inputShape, packOp.getInnerDimsPos(), packOp.getOuterDimsPerm()); LLVM_DEBUG(DBGS() << "Pack permutation: " << packOp << "\n"; llvm::interleaveComma(perm, DBGS() << "perm: "); DBGSNL();); @@ -1208,9 +1206,8 @@ LogicalResult GeneralizeOuterUnitDimsUnPackOpPattern::matchAndRewrite( int64_t destRank = unpackOp.getDestRank(); ArrayRef srcShape = unpackOp.getSourceType().getShape(); ArrayRef innerDimsPos = unpackOp.getInnerDimsPos(); - if (llvm::any_of(innerDimsPos, [srcShape](int64_t index) { - return srcShape[index] != 1; - })) { + if (llvm::any_of(unpackOp.getTiledOuterDims(), + [](int64_t dim) { return dim != 1; })) { return rewriter.notifyMatchFailure( unpackOp, "require the tiled outer dimensions of the result are all 1s"); diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp index 47f540e092e990..1ac96756e22b5e 100644 --- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp +++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp @@ -3987,6 +3987,23 @@ SmallVector PackOp::getStaticTiles() { return getStaticTilesImpl(*this); } +ArrayRef PackOp::getAllOuterDims() { + ShapedType inputType = getSourceType(); + int64_t inputRank = inputType.getRank(); + return getDestType().getShape().take_front(inputRank); +} + +SmallVector PackOp::getTiledOuterDims() { + auto innerDimsPos = getInnerDimsPos(); + auto packedShape = getDestType().getShape(); + SmallVector res; + + for (auto index : innerDimsPos) + res.push_back(packedShape[index]); + + return res; +} + bool PackOp::requirePaddingValue(ArrayRef inputShape, ArrayRef innerDimsPos, ArrayRef outputShape, @@ -4411,6 +4428,23 @@ SmallVector UnPackOp::getStaticTiles() { return getStaticTilesImpl(*this); } +ArrayRef UnPackOp::getAllOuterDims() { + ShapedType destType = getDestType(); + int64_t destRank = destType.getRank(); + return getSourceType().getShape().take_front(destRank); +} + +SmallVector UnPackOp::getTiledOuterDims() { + auto innerDimsPos = getInnerDimsPos(); + auto packedShape = getSourceType().getShape(); + SmallVector res; + + for (auto index : innerDimsPos) + res.push_back(packedShape[index]); + + return res; +} + LogicalResult UnPackOp::verify() { return commonVerifierPackAndUnPackOp(*this); } From 3e3780ef6ab5902cd1763e28bb143e47091bd23a Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Tue, 24 Sep 2024 13:15:26 +0100 Subject: [PATCH 21/22] [LLVM][CodeGen][SVE] Implement nxvf32 fpround to 
nxvbf16. (#107420) --- .../Target/AArch64/AArch64ISelLowering.cpp | 50 ++++++- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 2 +- llvm/lib/Target/AArch64/SVEInstrFormats.td | 6 +- .../test/CodeGen/AArch64/sve-bf16-converts.ll | 129 +++++++++++++++++- 4 files changed, 180 insertions(+), 7 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index b11ac81069f660..4166d9bd22bc01 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1664,6 +1664,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::BITCAST, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::FP_EXTEND, VT, Custom); + setOperationAction(ISD::FP_ROUND, VT, Custom); setOperationAction(ISD::MLOAD, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::SPLAT_VECTOR, VT, Legal); @@ -4334,14 +4335,57 @@ SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); - if (VT.isScalableVector()) - return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU); - bool IsStrict = Op->isStrictFPOpcode(); SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); EVT SrcVT = SrcVal.getValueType(); bool Trunc = Op.getConstantOperandVal(IsStrict ? 2 : 1) == 1; + if (VT.isScalableVector()) { + if (VT.getScalarType() != MVT::bf16) + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU); + + SDLoc DL(Op); + constexpr EVT I32 = MVT::nxv4i32; + auto ImmV = [&](int I) -> SDValue { return DAG.getConstant(I, DL, I32); }; + + SDValue NaN; + SDValue Narrow; + + if (SrcVT == MVT::nxv2f32 || SrcVT == MVT::nxv4f32) { + if (Subtarget->hasBF16()) + return LowerToPredicatedOp(Op, DAG, + AArch64ISD::FP_ROUND_MERGE_PASSTHRU); + + Narrow = getSVESafeBitCast(I32, SrcVal, DAG); + + // Set the quiet bit. + if (!DAG.isKnownNeverSNaN(SrcVal)) + NaN = DAG.getNode(ISD::OR, DL, I32, Narrow, ImmV(0x400000)); + } else + return SDValue(); + + if (!Trunc) { + SDValue Lsb = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16)); + Lsb = DAG.getNode(ISD::AND, DL, I32, Lsb, ImmV(1)); + SDValue RoundingBias = DAG.getNode(ISD::ADD, DL, I32, Lsb, ImmV(0x7fff)); + Narrow = DAG.getNode(ISD::ADD, DL, I32, Narrow, RoundingBias); + } + + // Don't round if we had a NaN, we don't want to turn 0x7fffffff into + // 0x80000000. + if (NaN) { + EVT I1 = I32.changeElementType(MVT::i1); + EVT CondVT = VT.changeElementType(MVT::i1); + SDValue IsNaN = DAG.getSetCC(DL, CondVT, SrcVal, SrcVal, ISD::SETUO); + IsNaN = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, I1, IsNaN); + Narrow = DAG.getSelect(DL, I32, IsNaN, NaN, Narrow); + } + + // Now that we have rounded, shift the bits into position. 
+ Narrow = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16)); + return getSVESafeBitCast(VT, Narrow, DAG); + } + if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable())) return LowerFixedLengthFPRoundToSVE(Op, DAG); diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 1f3d63a216c6dd..7240f6a22a87bd 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -2425,7 +2425,7 @@ let Predicates = [HasBF16, HasSVEorSME] in { defm BFMLALT_ZZZ : sve2_fp_mla_long<0b101, "bfmlalt", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalt>; defm BFMLALB_ZZZI : sve2_fp_mla_long_by_indexed_elem<0b100, "bfmlalb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalb_lane_v2>; defm BFMLALT_ZZZI : sve2_fp_mla_long_by_indexed_elem<0b101, "bfmlalt", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalt_lane_v2>; - defm BFCVT_ZPmZ : sve_bfloat_convert<0b1, "bfcvt", int_aarch64_sve_fcvt_bf16f32>; + defm BFCVT_ZPmZ : sve_bfloat_convert<0b1, "bfcvt", int_aarch64_sve_fcvt_bf16f32, AArch64fcvtr_mt>; defm BFCVTNT_ZPmZ : sve_bfloat_convert<0b0, "bfcvtnt", int_aarch64_sve_fcvtnt_bf16f32>; } // End HasBF16, HasSVEorSME diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 8119198a48aa59..0bfac6465a1f30 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -8807,9 +8807,13 @@ class sve_bfloat_convert let mayRaiseFPException = 1; } -multiclass sve_bfloat_convert { +multiclass sve_bfloat_convert { def NAME : sve_bfloat_convert; + def : SVE_3_Op_Pat(NAME)>; + def : SVE_1_Op_Passthru_Round_Pat(NAME)>; + def : SVE_1_Op_Passthru_Round_Pat(NAME)>; } //===----------------------------------------------------------------------===// diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-converts.ll b/llvm/test/CodeGen/AArch64/sve-bf16-converts.ll index d72f92c1dac1ff..d63f7e6f3242e0 100644 --- a/llvm/test/CodeGen/AArch64/sve-bf16-converts.ll +++ b/llvm/test/CodeGen/AArch64/sve-bf16-converts.ll @@ -1,9 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mattr=+sve < %s | FileCheck %s -; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s +; RUN: llc -mattr=+sve < %s | FileCheck %s --check-prefixes=CHECK,NOBF16 +; RUN: llc -mattr=+sve --enable-no-nans-fp-math < %s | FileCheck %s --check-prefixes=CHECK,NOBF16NNAN +; RUN: llc -mattr=+sve,+bf16 < %s | FileCheck %s --check-prefixes=CHECK,BF16 +; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,BF16 target triple = "aarch64-unknown-linux-gnu" +; NOTE: "fptrunc <# x double> to <# x bfloat>" is not supported because SVE +; lacks a down convert that rounds to odd. Such IR will trigger the usual +; failure (crash) when attempting to unroll a scalable vector. 
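+; For illustration only, such an unsupported case would look like the
+; following (hypothetical IR, deliberately not exercised by this test):
+;   %t = fptrunc <vscale x 2 x double> %x to <vscale x 2 x bfloat>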
+ define @fpext_nxv2bf16_to_nxv2f32( %a) { ; CHECK-LABEL: fpext_nxv2bf16_to_nxv2f32: ; CHECK: // %bb.0: @@ -87,3 +93,122 @@ define @fpext_nxv8bf16_to_nxv8f64( %a %res = fpext %a to ret %res } + +define @fptrunc_nxv2f32_to_nxv2bf16( %a) { +; NOBF16-LABEL: fptrunc_nxv2f32_to_nxv2bf16: +; NOBF16: // %bb.0: +; NOBF16-NEXT: mov z1.s, #32767 // =0x7fff +; NOBF16-NEXT: lsr z2.s, z0.s, #16 +; NOBF16-NEXT: ptrue p0.d +; NOBF16-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s +; NOBF16-NEXT: and z2.s, z2.s, #0x1 +; NOBF16-NEXT: add z1.s, z0.s, z1.s +; NOBF16-NEXT: orr z0.s, z0.s, #0x400000 +; NOBF16-NEXT: add z1.s, z2.s, z1.s +; NOBF16-NEXT: sel z0.s, p0, z0.s, z1.s +; NOBF16-NEXT: lsr z0.s, z0.s, #16 +; NOBF16-NEXT: ret +; +; NOBF16NNAN-LABEL: fptrunc_nxv2f32_to_nxv2bf16: +; NOBF16NNAN: // %bb.0: +; NOBF16NNAN-NEXT: mov z1.s, #32767 // =0x7fff +; NOBF16NNAN-NEXT: lsr z2.s, z0.s, #16 +; NOBF16NNAN-NEXT: and z2.s, z2.s, #0x1 +; NOBF16NNAN-NEXT: add z0.s, z0.s, z1.s +; NOBF16NNAN-NEXT: add z0.s, z2.s, z0.s +; NOBF16NNAN-NEXT: lsr z0.s, z0.s, #16 +; NOBF16NNAN-NEXT: ret +; +; BF16-LABEL: fptrunc_nxv2f32_to_nxv2bf16: +; BF16: // %bb.0: +; BF16-NEXT: ptrue p0.d +; BF16-NEXT: bfcvt z0.h, p0/m, z0.s +; BF16-NEXT: ret + %res = fptrunc %a to + ret %res +} + +define @fptrunc_nxv4f32_to_nxv4bf16( %a) { +; NOBF16-LABEL: fptrunc_nxv4f32_to_nxv4bf16: +; NOBF16: // %bb.0: +; NOBF16-NEXT: mov z1.s, #32767 // =0x7fff +; NOBF16-NEXT: lsr z2.s, z0.s, #16 +; NOBF16-NEXT: ptrue p0.s +; NOBF16-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s +; NOBF16-NEXT: and z2.s, z2.s, #0x1 +; NOBF16-NEXT: add z1.s, z0.s, z1.s +; NOBF16-NEXT: orr z0.s, z0.s, #0x400000 +; NOBF16-NEXT: add z1.s, z2.s, z1.s +; NOBF16-NEXT: sel z0.s, p0, z0.s, z1.s +; NOBF16-NEXT: lsr z0.s, z0.s, #16 +; NOBF16-NEXT: ret +; +; NOBF16NNAN-LABEL: fptrunc_nxv4f32_to_nxv4bf16: +; NOBF16NNAN: // %bb.0: +; NOBF16NNAN-NEXT: mov z1.s, #32767 // =0x7fff +; NOBF16NNAN-NEXT: lsr z2.s, z0.s, #16 +; NOBF16NNAN-NEXT: and z2.s, z2.s, #0x1 +; NOBF16NNAN-NEXT: add z0.s, z0.s, z1.s +; NOBF16NNAN-NEXT: add z0.s, z2.s, z0.s +; NOBF16NNAN-NEXT: lsr z0.s, z0.s, #16 +; NOBF16NNAN-NEXT: ret +; +; BF16-LABEL: fptrunc_nxv4f32_to_nxv4bf16: +; BF16: // %bb.0: +; BF16-NEXT: ptrue p0.s +; BF16-NEXT: bfcvt z0.h, p0/m, z0.s +; BF16-NEXT: ret + %res = fptrunc %a to + ret %res +} + +define @fptrunc_nxv8f32_to_nxv8bf16( %a) { +; NOBF16-LABEL: fptrunc_nxv8f32_to_nxv8bf16: +; NOBF16: // %bb.0: +; NOBF16-NEXT: mov z2.s, #32767 // =0x7fff +; NOBF16-NEXT: lsr z3.s, z1.s, #16 +; NOBF16-NEXT: lsr z4.s, z0.s, #16 +; NOBF16-NEXT: ptrue p0.s +; NOBF16-NEXT: and z3.s, z3.s, #0x1 +; NOBF16-NEXT: and z4.s, z4.s, #0x1 +; NOBF16-NEXT: fcmuo p1.s, p0/z, z1.s, z1.s +; NOBF16-NEXT: add z5.s, z1.s, z2.s +; NOBF16-NEXT: add z2.s, z0.s, z2.s +; NOBF16-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s +; NOBF16-NEXT: orr z1.s, z1.s, #0x400000 +; NOBF16-NEXT: orr z0.s, z0.s, #0x400000 +; NOBF16-NEXT: add z3.s, z3.s, z5.s +; NOBF16-NEXT: add z2.s, z4.s, z2.s +; NOBF16-NEXT: sel z1.s, p1, z1.s, z3.s +; NOBF16-NEXT: sel z0.s, p0, z0.s, z2.s +; NOBF16-NEXT: lsr z1.s, z1.s, #16 +; NOBF16-NEXT: lsr z0.s, z0.s, #16 +; NOBF16-NEXT: uzp1 z0.h, z0.h, z1.h +; NOBF16-NEXT: ret +; +; NOBF16NNAN-LABEL: fptrunc_nxv8f32_to_nxv8bf16: +; NOBF16NNAN: // %bb.0: +; NOBF16NNAN-NEXT: mov z2.s, #32767 // =0x7fff +; NOBF16NNAN-NEXT: lsr z3.s, z1.s, #16 +; NOBF16NNAN-NEXT: lsr z4.s, z0.s, #16 +; NOBF16NNAN-NEXT: and z3.s, z3.s, #0x1 +; NOBF16NNAN-NEXT: and z4.s, z4.s, #0x1 +; NOBF16NNAN-NEXT: add z1.s, z1.s, z2.s +; NOBF16NNAN-NEXT: add z0.s, z0.s, z2.s +; NOBF16NNAN-NEXT: add 
z1.s, z3.s, z1.s +; NOBF16NNAN-NEXT: add z0.s, z4.s, z0.s +; NOBF16NNAN-NEXT: lsr z1.s, z1.s, #16 +; NOBF16NNAN-NEXT: lsr z0.s, z0.s, #16 +; NOBF16NNAN-NEXT: uzp1 z0.h, z0.h, z1.h +; NOBF16NNAN-NEXT: ret +; +; BF16-LABEL: fptrunc_nxv8f32_to_nxv8bf16: +; BF16: // %bb.0: +; BF16-NEXT: ptrue p0.s +; BF16-NEXT: bfcvt z1.h, p0/m, z1.s +; BF16-NEXT: bfcvt z0.h, p0/m, z0.s +; BF16-NEXT: uzp1 z0.h, z0.h, z1.h +; BF16-NEXT: ret + %res = fptrunc %a to + ret %res +} From 8ba334bc4ad1e20c8201b85ed0a3e3b87bc47fe1 Mon Sep 17 00:00:00 2001 From: Dominik Montada Date: Tue, 24 Sep 2024 14:21:45 +0200 Subject: [PATCH 22/22] [MIR] Allow overriding isSSA, noPhis, noVRegs in MIR input (#108546) Allow setting the computed properties IsSSA, NoPHIs, NoVRegs for MIR functions in MIR input. The default value is still the computed value. If the property is set to false, the computed result is ignored. Conflicting values (e.g. setting IsSSA where the input MIR is clearly not SSA) lead to an error. Closes #37787 --- llvm/include/llvm/CodeGen/MIRYamlMapping.h | 11 ++++ llvm/lib/CodeGen/MIRParser/MIRParser.cpp | 52 +++++++++++---- llvm/lib/CodeGen/MIRPrinter.cpp | 7 ++ .../AArch64/mlicm-stack-write-check.mir | 4 +- .../Hexagon/expand-condsets-impuse2.mir | 2 +- .../Hexagon/expand-condsets-phys-reg.mir | 2 +- .../Hexagon/expand-condsets-rm-reg.mir | 2 +- ...ptionally-computed-properties-conflict.mir | 35 ++++++++++ ...unction-optionally-computed-properties.mir | 64 +++++++++++++++++++ .../X86/sjlj-shadow-stack-liveness.mir | 3 +- .../llvm-reduce/mir/preserve-func-info.mir | 6 ++ 11 files changed, 170 insertions(+), 18 deletions(-) create mode 100644 llvm/test/CodeGen/MIR/Generic/machine-function-optionally-computed-properties-conflict.mir create mode 100644 llvm/test/CodeGen/MIR/Generic/machine-function-optionally-computed-properties.mir diff --git a/llvm/include/llvm/CodeGen/MIRYamlMapping.h b/llvm/include/llvm/CodeGen/MIRYamlMapping.h index 304db57eca4994..ab8dc442e04b7b 100644 --- a/llvm/include/llvm/CodeGen/MIRYamlMapping.h +++ b/llvm/include/llvm/CodeGen/MIRYamlMapping.h @@ -730,6 +730,11 @@ struct MachineFunction { bool TracksRegLiveness = false; bool HasWinCFI = false; + // Computed properties that should be overridable + std::optional NoPHIs; + std::optional IsSSA; + std::optional NoVRegs; + bool CallsEHReturn = false; bool CallsUnwindInit = false; bool HasEHCatchret = false; @@ -770,6 +775,12 @@ template <> struct MappingTraits { YamlIO.mapOptional("tracksRegLiveness", MF.TracksRegLiveness, false); YamlIO.mapOptional("hasWinCFI", MF.HasWinCFI, false); + // PHIs must be not be capitalized, since it will clash with the MIR opcode + // leading to false-positive FileCheck hits with CHECK-NOT + YamlIO.mapOptional("noPhis", MF.NoPHIs, std::optional()); + YamlIO.mapOptional("isSSA", MF.IsSSA, std::optional()); + YamlIO.mapOptional("noVRegs", MF.NoVRegs, std::optional()); + YamlIO.mapOptional("callsEHReturn", MF.CallsEHReturn, false); YamlIO.mapOptional("callsUnwindInit", MF.CallsUnwindInit, false); YamlIO.mapOptional("hasEHCatchret", MF.HasEHCatchret, false); diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp index d506cd1879648f..8d6d800d761474 100644 --- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp @@ -178,7 +178,8 @@ class MIRParserImpl { SMDiagnostic diagFromBlockStringDiag(const SMDiagnostic &Error, SMRange SourceRange); - void computeFunctionProperties(MachineFunction &MF); + bool computeFunctionProperties(MachineFunction 
&MF, + const yaml::MachineFunction &YamlMF); void setupDebugValueTracking(MachineFunction &MF, PerFunctionMIParsingState &PFS, const yaml::MachineFunction &YamlMF); @@ -373,7 +374,8 @@ static bool isSSA(const MachineFunction &MF) { return true; } -void MIRParserImpl::computeFunctionProperties(MachineFunction &MF) { +bool MIRParserImpl::computeFunctionProperties( + MachineFunction &MF, const yaml::MachineFunction &YamlMF) { MachineFunctionProperties &Properties = MF.getProperties(); bool HasPHI = false; @@ -398,21 +400,48 @@ void MIRParserImpl::computeFunctionProperties(MachineFunction &MF) { } } } - if (!HasPHI) - Properties.set(MachineFunctionProperties::Property::NoPHIs); + + // Helper function to sanity-check and set properties that are computed, but + // may be explicitly set from the input MIR + auto ComputedPropertyHelper = + [&Properties](std::optional ExplicitProp, bool ComputedProp, + MachineFunctionProperties::Property P) -> bool { + // Prefer explicitly given values over the computed properties + if (ExplicitProp.value_or(ComputedProp)) + Properties.set(P); + else + Properties.reset(P); + + // Check for conflict between the explicit values and the computed ones + return ExplicitProp && *ExplicitProp && !ComputedProp; + }; + + if (ComputedPropertyHelper(YamlMF.NoPHIs, !HasPHI, + MachineFunctionProperties::Property::NoPHIs)) { + return error(MF.getName() + + " has explicit property NoPhi, but contains at least one PHI"); + } + MF.setHasInlineAsm(HasInlineAsm); if (HasTiedOps && AllTiedOpsRewritten) Properties.set(MachineFunctionProperties::Property::TiedOpsRewritten); - if (isSSA(MF)) - Properties.set(MachineFunctionProperties::Property::IsSSA); - else - Properties.reset(MachineFunctionProperties::Property::IsSSA); + if (ComputedPropertyHelper(YamlMF.IsSSA, isSSA(MF), + MachineFunctionProperties::Property::IsSSA)) { + return error(MF.getName() + + " has explicit property IsSSA, but is not valid SSA"); + } const MachineRegisterInfo &MRI = MF.getRegInfo(); - if (MRI.getNumVirtRegs() == 0) - Properties.set(MachineFunctionProperties::Property::NoVRegs); + if (ComputedPropertyHelper(YamlMF.NoVRegs, MRI.getNumVirtRegs() == 0, + MachineFunctionProperties::Property::NoVRegs)) { + return error( + MF.getName() + + " has explicit property NoVRegs, but contains virtual registers"); + } + + return false; } bool MIRParserImpl::initializeCallSiteInfo( @@ -595,7 +624,8 @@ MIRParserImpl::initializeMachineFunction(const yaml::MachineFunction &YamlMF, MachineRegisterInfo &MRI = MF.getRegInfo(); MRI.freezeReservedRegs(); - computeFunctionProperties(MF); + if (computeFunctionProperties(MF, YamlMF)) + return false; if (initializeCallSiteInfo(PFS, YamlMF)) return false; diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index 7de68b12045f14..cf6122bce22364 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -223,6 +223,13 @@ void MIRPrinter::print(const MachineFunction &MF) { YamlMF.TracksDebugUserValues = MF.getProperties().hasProperty( MachineFunctionProperties::Property::TracksDebugUserValues); + YamlMF.NoPHIs = MF.getProperties().hasProperty( + MachineFunctionProperties::Property::NoPHIs); + YamlMF.IsSSA = MF.getProperties().hasProperty( + MachineFunctionProperties::Property::IsSSA); + YamlMF.NoVRegs = MF.getProperties().hasProperty( + MachineFunctionProperties::Property::NoVRegs); + convert(YamlMF, MF.getRegInfo(), MF.getSubtarget().getRegisterInfo()); MachineModuleSlotTracker MST(MMI, &MF); MST.incorporateFunction(MF.getFunction()); 
diff --git a/llvm/test/CodeGen/AArch64/mlicm-stack-write-check.mir b/llvm/test/CodeGen/AArch64/mlicm-stack-write-check.mir index 51bc77d405b94b..406025c4fde302 100644 --- a/llvm/test/CodeGen/AArch64/mlicm-stack-write-check.mir +++ b/llvm/test/CodeGen/AArch64/mlicm-stack-write-check.mir @@ -3,6 +3,7 @@ --- name: test tracksRegLiveness: true +isSSA: false registers: - { id: 0, class: gpr64 } stack: @@ -30,11 +31,11 @@ body: | bb.2: liveins: $x0 %0 = COPY $x0 - %0 = COPY $x0 ; Force isSSA = false. ... --- name: test2 tracksRegLiveness: true +isSSA: false registers: - { id: 0, class: gpr64 } stack: @@ -62,5 +63,4 @@ body: | bb.2: liveins: $x0 %0 = COPY $x0 - %0 = COPY $x0 ; Force isSSA = false. ... diff --git a/llvm/test/CodeGen/Hexagon/expand-condsets-impuse2.mir b/llvm/test/CodeGen/Hexagon/expand-condsets-impuse2.mir index ae3f4ba78cd1ff..ebb361ab433cb7 100644 --- a/llvm/test/CodeGen/Hexagon/expand-condsets-impuse2.mir +++ b/llvm/test/CodeGen/Hexagon/expand-condsets-impuse2.mir @@ -6,12 +6,12 @@ name: f0 tracksRegLiveness: true +isSSA: false body: | bb.0: successors: %bb.1 liveins: $r0, $r1 %0:intregs = COPY $r0 - %0:intregs = COPY $r0 ; defeat IsSSA detection %1:intregs = COPY $r1 %2:intregs = COPY $r0 %3:intregs = M2_mpyi %2, %1 diff --git a/llvm/test/CodeGen/Hexagon/expand-condsets-phys-reg.mir b/llvm/test/CodeGen/Hexagon/expand-condsets-phys-reg.mir index e62cd1cc73609b..d252ec5fee4019 100644 --- a/llvm/test/CodeGen/Hexagon/expand-condsets-phys-reg.mir +++ b/llvm/test/CodeGen/Hexagon/expand-condsets-phys-reg.mir @@ -9,12 +9,12 @@ name: fred tracksRegLiveness: true +isSSA: false body: | bb.0: successors: %bb.1, %bb.2 liveins: $r0 - %0:intregs = A2_tfrsi 0 ;; Multiple defs to ensure IsSSA = false %0:intregs = L2_loadri_io $r0, 0 %1:predregs = C2_cmpgti %0, 10 %2:intregs = C2_mux %1, $r31, %0 diff --git a/llvm/test/CodeGen/Hexagon/expand-condsets-rm-reg.mir b/llvm/test/CodeGen/Hexagon/expand-condsets-rm-reg.mir index 6d7b6cd72a3099..463aa9a8e7f9b1 100644 --- a/llvm/test/CodeGen/Hexagon/expand-condsets-rm-reg.mir +++ b/llvm/test/CodeGen/Hexagon/expand-condsets-rm-reg.mir @@ -20,6 +20,7 @@ name: fred tracksRegLiveness: true +isSSA: false registers: - { id: 0, class: intregs } - { id: 1, class: intregs } @@ -35,7 +36,6 @@ body: | bb.0: liveins: $r0, $r1, $p0 %0 = COPY $r0 - %0 = COPY $r0 ; Force isSSA = false. %1 = COPY $r1 %2 = COPY $p0 ; Check that %3 was coalesced into %4. diff --git a/llvm/test/CodeGen/MIR/Generic/machine-function-optionally-computed-properties-conflict.mir b/llvm/test/CodeGen/MIR/Generic/machine-function-optionally-computed-properties-conflict.mir new file mode 100644 index 00000000000000..d8d178d90ae0af --- /dev/null +++ b/llvm/test/CodeGen/MIR/Generic/machine-function-optionally-computed-properties-conflict.mir @@ -0,0 +1,35 @@ +# RUN: not llc -run-pass none -o /dev/null %s 2>&1 | FileCheck %s + +# Test that computed properties are not conflicting with explicitly set +# properties + +--- +# CHECK: error: {{.*}}: TestNoPhisOverrideConflict has explicit property NoPhi, but contains at least one PHI +name: TestNoPhisOverrideConflict +noPhis: true +tracksRegLiveness: true +body: | + bb.0: + %0:_(s32) = G_IMPLICIT_DEF + + bb.1: + %1:_(s32) = PHI %0, %bb.0, %1, %bb.1 + G_BR %bb.1 +... +--- +# CHECK: error: {{.*}}: TestIsSSAOverrideConflict has explicit property IsSSA, but is not valid SSA +name: TestIsSSAOverrideConflict +isSSA: true +body: | + bb.0: + %0:_(s32) = G_IMPLICIT_DEF + %0:_(s32) = G_IMPLICIT_DEF +... 
+--- +# CHECK: error: {{.*}}: TestNoVRegsOverrideConflict has explicit property NoVRegs, but contains virtual registers +name: TestNoVRegsOverrideConflict +noVRegs: true +body: | + bb.0: + %0:_(s32) = G_IMPLICIT_DEF +... diff --git a/llvm/test/CodeGen/MIR/Generic/machine-function-optionally-computed-properties.mir b/llvm/test/CodeGen/MIR/Generic/machine-function-optionally-computed-properties.mir new file mode 100644 index 00000000000000..858bbc8394bb34 --- /dev/null +++ b/llvm/test/CodeGen/MIR/Generic/machine-function-optionally-computed-properties.mir @@ -0,0 +1,64 @@ +# RUN: llc -run-pass none -o - %s | FileCheck %s + +# Test that we can disable certain properties that are normally computed + +--- +# CHECK-LABEL: name: TestNoPhis +# CHECK: noPhis: true +# CHECK: ... +name: TestNoPhis +... +--- +# CHECK-LABEL: name: TestNoPhisOverride +# CHECK: noPhis: false +# CHECK: ... +name: TestNoPhisOverride +noPhis: false +... +--- +# CHECK-LABEL: name: TestNoPhisOverrideTrue +# CHECK: noPhis: true +# CHECK: ... +name: TestNoPhisOverrideTrue +noPhis: true +... +--- +# CHECK-LABEL: name: TestIsSSA +# CHECK: isSSA: true +# CHECK: ... +name: TestIsSSA +... +--- +# CHECK-LABEL: name: TestIsSSAOverride +# CHECK: isSSA: false +# CHECK: ... +name: TestIsSSAOverride +isSSA: false +... +--- +# CHECK-LABEL: name: TestIsSSAOverrideTrue +# CHECK: isSSA: true +# CHECK: ... +name: TestIsSSAOverrideTrue +isSSA: true +... +--- +# CHECK-LABEL: name: TestNoVRegs +# CHECK: noVRegs: true +# CHECK: ... +name: TestNoVRegs +... +--- +# CHECK-LABEL: name: TestNoVRegsOverride +# CHECK: noVRegs: false +# CHECK: ... +name: TestNoVRegsOverride +noVRegs: false +... +--- +# CHECK-LABEL: name: TestNoVRegsOverrideTrue +# CHECK: noVRegs: true +# CHECK: ... +name: TestNoVRegsOverrideTrue +noVRegs: true +... diff --git a/llvm/test/CodeGen/X86/sjlj-shadow-stack-liveness.mir b/llvm/test/CodeGen/X86/sjlj-shadow-stack-liveness.mir index 3def36f9d8ba91..83bc8ec510f646 100644 --- a/llvm/test/CodeGen/X86/sjlj-shadow-stack-liveness.mir +++ b/llvm/test/CodeGen/X86/sjlj-shadow-stack-liveness.mir @@ -14,6 +14,7 @@ name: bar # CHECK-LABEL: name: bar alignment: 16 tracksRegLiveness: true +noPhis: false body: | bb.0: %0:gr64 = IMPLICIT_DEF @@ -29,8 +30,6 @@ body: | ; CHECK-NOT: MOV64rm killed %0 ; CHECK-NEXT: MOV64rm killed %0 - ; FIXME: Dummy PHI to set the property NoPHIs to false. PR38439. bb.2: - %1:gr64 = PHI undef %1, %bb.2, undef %1, %bb.2 JMP_1 %bb.2 ... diff --git a/llvm/test/tools/llvm-reduce/mir/preserve-func-info.mir b/llvm/test/tools/llvm-reduce/mir/preserve-func-info.mir index 5f11cea89d7e7b..f735dfd5cbbf01 100644 --- a/llvm/test/tools/llvm-reduce/mir/preserve-func-info.mir +++ b/llvm/test/tools/llvm-reduce/mir/preserve-func-info.mir @@ -14,6 +14,9 @@ # RESULT-NEXT: failedISel: true # RESULT-NEXT: tracksRegLiveness: true # RESULT-NEXT: hasWinCFI: true +# RESULT-NEXT: noPhis: false +# RESULT-NEXT: isSSA: false +# RESULT-NEXT: noVRegs: false # RESULT-NEXT: callsEHReturn: true # RESULT-NEXT: callsUnwindInit: true # RESULT-NEXT: hasEHCatchret: true @@ -41,6 +44,9 @@ selected: true failedISel: true tracksRegLiveness: true hasWinCFI: true +noPhis: false +isSSA: false +noVRegs: false failsVerification: true tracksDebugUserValues: true callsEHReturn: true