From db054a197002c4d6b2c568d7c36d86f5fccade2d Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Tue, 24 Sep 2024 13:29:30 +0100 Subject: [PATCH 01/49] [AArch64][SME] Fix ADDVL addressing to scavenged stackslot. (#109674) In https://reviews.llvm.org/D159196 we avoided stackslot scavenging when there was no FP available. But in the case where FP is available we need to actually prefer using the FP over the BP. This change affects more than just SME, but it should be a general improvement, since any slot above the (address pointed to by) FP is always closer to FP than BP, so it makes sense to always favour using the FP to address it when the FP is available. This also fixes the issue for SME where this is not just preferred but required. --- .../Target/AArch64/AArch64FrameLowering.cpp | 11 +++-- ...nging-call-disable-stackslot-scavenging.ll | 45 +++++++++++++++++++ 2 files changed, 50 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 7e041b086599b9..fde07d84e97f58 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -2757,7 +2757,11 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( bool FPOffsetFits = !ForSimm || FPOffset >= -256; PreferFP |= Offset > -FPOffset && !SVEStackSize; - if (MFI.hasVarSizedObjects()) { + if (FPOffset >= 0) { + // If the FPOffset is positive, that'll always be best, as the SP/BP + // will be even further away. + UseFP = true; + } else if (MFI.hasVarSizedObjects()) { // If we have variable sized objects, we can use either FP or BP, as the // SP offset is unknown. We can use the base pointer if we have one and // FP is not preferred. If not, we're stuck with using FP. @@ -2769,11 +2773,6 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference( // else we can use BP and FP, but the offset from FP won't fit. // That will make us scavenge registers which we can probably avoid by // using BP. If it won't fit for BP either, we'll scavenge anyway. - } else if (FPOffset >= 0) { - // Use SP or FP, whichever gives us the best chance of the offset - // being in range for direct access. If the FPOffset is positive, - // that'll always be best, as the SP will be even further away. - UseFP = true; } else if (MF.hasEHFunclets() && !RegInfo->hasBasePointer(MF)) { // Funclets access the locals contained in the parent's stack frame // via the frame pointer, so we have to use the FP in the parent diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll index ac19bd59babe46..803bb9fda458b9 100644 --- a/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll +++ b/llvm/test/CodeGen/AArch64/sme-streaming-mode-changing-call-disable-stackslot-scavenging.ll @@ -45,6 +45,51 @@ define void @test_no_stackslot_scavenging(float %f) #0 { ret void } +define void @test_no_stackslot_scavenging_with_fp(float %f, i64 %n) #0 "frame-pointer"="all" { +; CHECK-LABEL: test_no_stackslot_scavenging_with_fp: +; CHECK: // %bb.0: +; CHECK-NEXT: stp d15, d14, [sp, #-128]! 
// 16-byte Folded Spill +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp d13, d12, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: stp d11, d10, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: stp d9, d8, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill +; CHECK-NEXT: add x29, sp, #64 +; CHECK-NEXT: str x9, [sp, #80] // 8-byte Folded Spill +; CHECK-NEXT: stp x28, x25, [sp, #96] // 16-byte Folded Spill +; CHECK-NEXT: stp x24, x19, [sp, #112] // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: lsl x9, x0, #3 +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: mov x19, sp +; CHECK-NEXT: str s0, [x29, #28] // 4-byte Folded Spill +; CHECK-NEXT: add x9, x9, #15 +; CHECK-NEXT: and x9, x9, #0xfffffffffffffff0 +; CHECK-NEXT: sub x8, x8, x9 +; CHECK-NEXT: mov sp, x8 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: smstop sm +; CHECK-NEXT: ldr s0, [x29, #28] // 4-byte Folded Reload +; CHECK-NEXT: bl use_f +; CHECK-NEXT: smstart sm +; CHECK-NEXT: sub sp, x29, #64 +; CHECK-NEXT: ldp x24, x19, [sp, #112] // 16-byte Folded Reload +; CHECK-NEXT: ldp x28, x25, [sp, #96] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload +; CHECK-NEXT: ldp d9, d8, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp d11, d10, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp d13, d12, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldp d15, d14, [sp], #128 // 16-byte Folded Reload +; CHECK-NEXT: ret + %ptr2 = alloca i64, i64 %n, align 8 + %ptr = alloca + call void asm sideeffect "", "~{x24},~{x25}"() nounwind + call void @use_f(float %f) + ret void +} + declare void @use_f(float) +declare void @use_f_and_ptr(float, ptr) attributes #0 = { nounwind "target-features"="+sve,+sme" "aarch64_pstate_sm_enabled" } From 3073c3c2290a6d9b12fbaefa40dd22eef6312895 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Tue, 24 Sep 2024 13:36:21 +0100 Subject: [PATCH 02/49] [SDAG] Avoid creating redundant stack slots when lowering FSINCOS (#108401) When lowering `FSINCOS` to a library call (that takes output pointers) we can avoid creating new stack allocations if the results of the `FSINCOS` are being stored. Instead, we can take the destination pointers from the stores and pass those to the library call. --- Note: As a NFC this also adds (and uses) `RTLIB::getFSINCOS()`. --- .../include/llvm/CodeGen/RuntimeLibcallUtil.h | 4 + llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 106 ++++---- llvm/lib/CodeGen/TargetLoweringBase.cpp | 5 + .../CodeGen/AArch64/sincos-stack-slots.ll | 255 ++++++++++++++++++ 4 files changed, 315 insertions(+), 55 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/sincos-stack-slots.ll diff --git a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h index 7a131645893921..045ec7d3653119 100644 --- a/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h +++ b/llvm/include/llvm/CodeGen/RuntimeLibcallUtil.h @@ -62,6 +62,10 @@ Libcall getLDEXP(EVT RetVT); /// UNKNOWN_LIBCALL if there is none. Libcall getFREXP(EVT RetVT); +/// getFSINCOS - Return the FSINCOS_* value for the given types, or +/// UNKNOWN_LIBCALL if there is none. +Libcall getFSINCOS(EVT RetVT); + /// Return the SYNC_FETCH_AND_* value for the given opcode and type, or /// UNKNOWN_LIBCALL if there is none. 
Libcall getSYNC(unsigned Opc, MVT VT); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index f5fbc01cd95e96..3c087727a80126 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -2326,15 +2326,7 @@ SelectionDAGLegalize::ExpandDivRemLibCall(SDNode *Node, /// Return true if sincos libcall is available. static bool isSinCosLibcallAvailable(SDNode *Node, const TargetLowering &TLI) { - RTLIB::Libcall LC; - switch (Node->getSimpleValueType(0).SimpleTy) { - default: llvm_unreachable("Unexpected request for libcall!"); - case MVT::f32: LC = RTLIB::SINCOS_F32; break; - case MVT::f64: LC = RTLIB::SINCOS_F64; break; - case MVT::f80: LC = RTLIB::SINCOS_F80; break; - case MVT::f128: LC = RTLIB::SINCOS_F128; break; - case MVT::ppcf128: LC = RTLIB::SINCOS_PPCF128; break; - } + RTLIB::Libcall LC = RTLIB::getFSINCOS(Node->getSimpleValueType(0).SimpleTy); return TLI.getLibcallName(LC) != nullptr; } @@ -2355,68 +2347,72 @@ static bool useSinCos(SDNode *Node) { } /// Issue libcalls to sincos to compute sin / cos pairs. -void -SelectionDAGLegalize::ExpandSinCosLibCall(SDNode *Node, - SmallVectorImpl &Results) { - RTLIB::Libcall LC; - switch (Node->getSimpleValueType(0).SimpleTy) { - default: llvm_unreachable("Unexpected request for libcall!"); - case MVT::f32: LC = RTLIB::SINCOS_F32; break; - case MVT::f64: LC = RTLIB::SINCOS_F64; break; - case MVT::f80: LC = RTLIB::SINCOS_F80; break; - case MVT::f128: LC = RTLIB::SINCOS_F128; break; - case MVT::ppcf128: LC = RTLIB::SINCOS_PPCF128; break; +void SelectionDAGLegalize::ExpandSinCosLibCall( + SDNode *Node, SmallVectorImpl &Results) { + EVT VT = Node->getValueType(0); + Type *Ty = VT.getTypeForEVT(*DAG.getContext()); + RTLIB::Libcall LC = RTLIB::getFSINCOS(VT); + + // Find users of the node that store the results (and share input chains). The + // destination pointers can be used instead of creating stack allocations. + SDValue StoresInChain{}; + std::array ResultStores = {nullptr}; + for (SDNode *User : Node->uses()) { + if (!ISD::isNormalStore(User)) + continue; + auto *ST = cast(User); + if (!ST->isSimple() || ST->getAddressSpace() != 0 || + ST->getAlign() < DAG.getDataLayout().getABITypeAlign(Ty) || + (StoresInChain && ST->getChain() != StoresInChain) || + Node->isPredecessorOf(ST->getChain().getNode())) + continue; + ResultStores[ST->getValue().getResNo()] = ST; + StoresInChain = ST->getChain(); } - // The input chain to this libcall is the entry node of the function. - // Legalizing the call will automatically add the previous call to the - // dependence. - SDValue InChain = DAG.getEntryNode(); - - EVT RetVT = Node->getValueType(0); - Type *RetTy = RetVT.getTypeForEVT(*DAG.getContext()); - TargetLowering::ArgListTy Args; - TargetLowering::ArgListEntry Entry; + TargetLowering::ArgListEntry Entry{}; // Pass the argument. Entry.Node = Node->getOperand(0); - Entry.Ty = RetTy; - Entry.IsSExt = false; - Entry.IsZExt = false; - Args.push_back(Entry); - - // Pass the return address of sin. - SDValue SinPtr = DAG.CreateStackTemporary(RetVT); - Entry.Node = SinPtr; - Entry.Ty = PointerType::getUnqual(RetTy->getContext()); - Entry.IsSExt = false; - Entry.IsZExt = false; + Entry.Ty = Ty; Args.push_back(Entry); - // Also pass the return address of the cos. 
- SDValue CosPtr = DAG.CreateStackTemporary(RetVT); - Entry.Node = CosPtr; - Entry.Ty = PointerType::getUnqual(RetTy->getContext()); - Entry.IsSExt = false; - Entry.IsZExt = false; - Args.push_back(Entry); + // Pass the output pointers for sin and cos. + SmallVector ResultPtrs{}; + for (StoreSDNode *ST : ResultStores) { + SDValue ResultPtr = ST ? ST->getBasePtr() : DAG.CreateStackTemporary(VT); + Entry.Node = ResultPtr; + Entry.Ty = PointerType::getUnqual(Ty->getContext()); + Args.push_back(Entry); + ResultPtrs.push_back(ResultPtr); + } + SDLoc DL(Node); + SDValue InChain = StoresInChain ? StoresInChain : DAG.getEntryNode(); SDValue Callee = DAG.getExternalSymbol(TLI.getLibcallName(LC), TLI.getPointerTy(DAG.getDataLayout())); - - SDLoc dl(Node); TargetLowering::CallLoweringInfo CLI(DAG); - CLI.setDebugLoc(dl).setChain(InChain).setLibCallee( + CLI.setDebugLoc(DL).setChain(InChain).setLibCallee( TLI.getLibcallCallingConv(LC), Type::getVoidTy(*DAG.getContext()), Callee, std::move(Args)); - std::pair CallInfo = TLI.LowerCallTo(CLI); + auto [Call, OutChain] = TLI.LowerCallTo(CLI); - Results.push_back( - DAG.getLoad(RetVT, dl, CallInfo.second, SinPtr, MachinePointerInfo())); - Results.push_back( - DAG.getLoad(RetVT, dl, CallInfo.second, CosPtr, MachinePointerInfo())); + for (auto [ResNo, ResultPtr] : llvm::enumerate(ResultPtrs)) { + MachinePointerInfo PtrInfo; + if (StoreSDNode *ST = ResultStores[ResNo]) { + // Replace store with the library call. + DAG.ReplaceAllUsesOfValueWith(SDValue(ST, 0), OutChain); + PtrInfo = ST->getPointerInfo(); + } else { + PtrInfo = MachinePointerInfo::getFixedStack( + DAG.getMachineFunction(), + cast(ResultPtr)->getIndex()); + } + SDValue LoadResult = DAG.getLoad(VT, DL, OutChain, ResultPtr, PtrInfo); + Results.push_back(LoadResult); + } } SDValue SelectionDAGLegalize::expandLdexp(SDNode *Node) const { diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 9fdde454559171..1f49d60c970593 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -400,6 +400,11 @@ RTLIB::Libcall RTLIB::getFREXP(EVT RetVT) { FREXP_PPCF128); } +RTLIB::Libcall RTLIB::getFSINCOS(EVT RetVT) { + return getFPLibCall(RetVT, SINCOS_F32, SINCOS_F64, SINCOS_F80, SINCOS_F128, + SINCOS_PPCF128); +} + RTLIB::Libcall RTLIB::getOutlineAtomicHelper(const Libcall (&LC)[5][4], AtomicOrdering Order, uint64_t MemSize) { diff --git a/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll new file mode 100644 index 00000000000000..8ef8b5d13b62d4 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sincos-stack-slots.ll @@ -0,0 +1,255 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-linux-gnu -o - %s | FileCheck %s + +; This file tests eliding stack slots when lowering the FSINCOS ISD node. + +define { float, float } @sincos_f32_value_return(float %x) { +; CHECK-LABEL: sincos_f32_value_return: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: add x0, sp, #12 +; CHECK-NEXT: add x1, sp, #8 +; CHECK-NEXT: bl sincosf +; CHECK-NEXT: ldp s1, s0, [sp, #8] +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %sin = tail call float @llvm.sin.f32(float %x) + %cos = tail call float @llvm.cos.f32(float %x) + %ret_0 = insertvalue { float, float } poison, float %sin, 0 + %ret_1 = insertvalue { float, float } %ret_0, float %cos, 1 + ret { float, float } %ret_1 +} + +define void @sincos_f32_ptr_return(float %x, ptr noalias %out_sin, ptr noalias %out_cos) { +; CHECK-LABEL: sincos_f32_ptr_return: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: bl sincosf +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %sin = tail call float @llvm.sin.f32(float %x) + %cos = tail call float @llvm.cos.f32(float %x) + store float %sin, ptr %out_sin, align 4 + store float %cos, ptr %out_cos, align 4 + ret void +} + +define float @sincos_f32_mixed_return(float %x, ptr %out_sin) { +; CHECK-LABEL: sincos_f32_mixed_return: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: add x1, sp, #12 +; CHECK-NEXT: bl sincosf +; CHECK-NEXT: ldr s0, [sp, #12] +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %sin = tail call float @llvm.sin.f32(float %x) + %cos = tail call float @llvm.cos.f32(float %x) + store float %sin, ptr %out_sin, align 4 + ret float %cos +} + +define { double, double } @sincos_f64_value_return(double %x) { +; CHECK-LABEL: sincos_f64_value_return: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: add x0, sp, #24 +; CHECK-NEXT: add x1, sp, #8 +; CHECK-NEXT: bl sincos +; CHECK-NEXT: ldr d0, [sp, #24] +; CHECK-NEXT: ldr d1, [sp, #8] +; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ret +entry: + %sin = tail call double @llvm.sin.f64(double %x) + %cos = tail call double @llvm.cos.f64(double %x) + %ret_0 = insertvalue { double, double } poison, double %sin, 0 + %ret_1 = insertvalue { double, double } %ret_0, double %cos, 1 + ret { double, double } %ret_1 +} + +define void @sincos_f64_ptr_return(double %x, ptr noalias %out_sin, ptr noalias %out_cos) { +; CHECK-LABEL: sincos_f64_ptr_return: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: bl sincos +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %sin = tail call double @llvm.sin.f64(double %x) + %cos = tail call double @llvm.cos.f64(double %x) + store double %sin, ptr %out_sin, align 8 + store double %cos, ptr %out_cos, align 8 + ret void +} + +define double @sincos_f64_mixed_return(double %x, ptr %out_sin) { +; CHECK-LABEL: sincos_f64_mixed_return: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: add x1, sp, #8 +; CHECK-NEXT: bl sincos +; CHECK-NEXT: ldr d0, [sp, #8] +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %sin = tail call double @llvm.sin.f64(double %x) + %cos = tail call double @llvm.cos.f64(double %x) + store double %sin, ptr %out_sin, align 8 + ret double %cos +} + +; Here %out_sin and %out_cos may alias so we can't replace both stores with the +; call to sincosf (as the order of stores in sincosf is not defined). +define void @sincos_may_alias(float %x, ptr %out_sin, ptr %out_cos) { +; CHECK-LABEL: sincos_may_alias: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #32 +; CHECK-NEXT: stp x30, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: add x1, sp, #12 +; CHECK-NEXT: bl sincosf +; CHECK-NEXT: ldr s0, [sp, #12] +; CHECK-NEXT: str s0, [x19] +; CHECK-NEXT: ldp x30, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #32 +; CHECK-NEXT: ret +entry: + %sin = tail call float @llvm.sin.f32(float %x) + %cos = tail call float @llvm.cos.f32(float %x) + store float %sin, ptr %out_sin, align 4 + store float %cos, ptr %out_cos, align 4 + ret void +} + +; Here %out is used for both sin and cos (with the final value stored being cos). +define float @sincos_multiple_uses(float %x, ptr %out) { +; CHECK-LABEL: sincos_multiple_uses: +; CHECK: // %bb.0: +; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w30, -16 +; CHECK-NEXT: mov x1, x0 +; CHECK-NEXT: add x0, sp, #12 +; CHECK-NEXT: bl sincosf +; CHECK-NEXT: ldr s0, [sp, #12] +; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %sin = call float @llvm.sin.f32(float %x) + store float %sin, ptr %out, align 4 + %reload = load float, ptr %out, align 4 + %cos = call float @llvm.cos.f32(float %x) + store float %cos, ptr %out, align 4 + ret float %reload +} + +; Negative test. We can't fold volatile stores into the library call. +define void @sincos_volatile_result_stores(float %x, ptr noalias %out_sin, ptr noalias %out_cos) { +; CHECK-LABEL: sincos_volatile_result_stores: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -32 +; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: mov x20, x0 +; CHECK-NEXT: add x0, sp, #12 +; CHECK-NEXT: add x1, sp, #8 +; CHECK-NEXT: bl sincosf +; CHECK-NEXT: ldp s1, s0, [sp, #8] +; CHECK-NEXT: str s0, [x20] +; CHECK-NEXT: str s1, [x19] +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %sin = tail call float @llvm.sin.f32(float %x) + %cos = tail call float @llvm.cos.f32(float %x) + store volatile float %sin, ptr %out_sin, align 4 + store volatile float %cos, ptr %out_cos, align 4 + ret void +} + +; Negative test. We can't fold atomic stores into the library call. +define void @sincos_atomic_result_stores(float %x, ptr noalias %out_sin, ptr noalias %out_cos) { +; CHECK-LABEL: sincos_atomic_result_stores: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: str x30, [sp, #-32]! 
// 8-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -32 +; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: mov x20, x0 +; CHECK-NEXT: add x0, sp, #12 +; CHECK-NEXT: add x1, sp, #8 +; CHECK-NEXT: bl sincosf +; CHECK-NEXT: ldr w8, [sp, #12] +; CHECK-NEXT: str w8, [x20] +; CHECK-NEXT: ldr w8, [sp, #8] +; CHECK-NEXT: str w8, [x19] +; CHECK-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload +; CHECK-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload +; CHECK-NEXT: ret +entry: + %sin = tail call float @llvm.sin.f32(float %x) + %cos = tail call float @llvm.cos.f32(float %x) + store atomic float %sin, ptr %out_sin unordered, align 4 + store atomic float %cos, ptr %out_cos unordered, align 4 + ret void +} + +; Negative test. We can't fold misaligned stores into the library call. +define void @sincos_misaligned_result_stores(double %x, ptr noalias %out_sin, ptr noalias %out_cos) { +; CHECK-LABEL: sincos_misaligned_result_stores: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #48 +; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w20, -16 +; CHECK-NEXT: .cfi_offset w30, -32 +; CHECK-NEXT: mov x19, x1 +; CHECK-NEXT: mov x20, x0 +; CHECK-NEXT: add x0, sp, #24 +; CHECK-NEXT: add x1, sp, #8 +; CHECK-NEXT: bl sincos +; CHECK-NEXT: ldr d0, [sp, #24] +; CHECK-NEXT: ldr d1, [sp, #8] +; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: str d0, [x20] +; CHECK-NEXT: str d1, [x19] +; CHECK-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #48 +; CHECK-NEXT: ret +entry: + %sin = tail call double @llvm.sin.f64(double %x) + %cos = tail call double @llvm.cos.f64(double %x) + store double %sin, ptr %out_sin, align 4 + store double %cos, ptr %out_cos, align 4 + ret void +} From 3ec5e74c0dc99095f5961a31421c7adaf2860ec8 Mon Sep 17 00:00:00 2001 From: Dmitry Chernenkov Date: Tue, 24 Sep 2024 13:00:12 +0000 Subject: [PATCH 03/49] [Bazel] Fix layering for 127349fcba81646389e4b8202b35405a5fdbef47 --- utils/bazel/llvm-project-overlay/libc/BUILD.bazel | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index a269cf861a5b74..dbc8c045eea996 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -725,6 +725,7 @@ libc_support_library( deps = [ ":__support_common", ":__support_cpp_type_traits", + ":__support_fputil_cast", ":__support_fputil_dyadic_float", ":__support_fputil_fenv_impl", ":__support_fputil_fp_bits", @@ -929,6 +930,7 @@ libc_support_library( ":__support_cpp_bit", ":__support_cpp_limits", ":__support_cpp_type_traits", + ":__support_fputil_cast", ":__support_fputil_dyadic_float", ":__support_fputil_fp_bits", ":__support_fputil_nearest_integer_operations", @@ -986,6 +988,7 @@ libc_support_library( ":__support_common", ":__support_cpp_bit", ":__support_cpp_type_traits", + ":__support_fputil_cast", ":__support_fputil_dyadic_float", ":__support_fputil_fenv_impl", ":__support_fputil_fp_bits", @@ -1091,6 +1094,7 @@ libc_support_library( ":__support_fputil_fenv_impl", ":__support_fputil_fp_bits", ":__support_fputil_multiply_add", + ":__support_fputil_rounding_mode", 
":__support_macros_optimization", ], ) From 66c8dce82e60ebd22bbe6ce7c1550ae6de057625 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Tue, 24 Sep 2024 06:01:34 -0700 Subject: [PATCH 04/49] [TableGen] Add a !listflatten operator to TableGen (#109346) Add a !listflatten operator that will transform an input list of type `list>` to `list` by concatenating elements of the constituent lists of the input argument. --- llvm/docs/TableGen/ProgRef.rst | 18 +++++++++----- llvm/include/llvm/TableGen/Record.h | 3 ++- llvm/lib/TableGen/Record.cpp | 29 ++++++++++++++++++++++ llvm/lib/TableGen/TGLexer.cpp | 1 + llvm/lib/TableGen/TGLexer.h | 1 + llvm/lib/TableGen/TGParser.cpp | 32 +++++++++++++++++++++---- llvm/test/TableGen/listflatten-error.td | 6 +++++ llvm/test/TableGen/listflatten.td | 32 +++++++++++++++++++++++++ 8 files changed, 110 insertions(+), 12 deletions(-) create mode 100644 llvm/test/TableGen/listflatten-error.td create mode 100644 llvm/test/TableGen/listflatten.td diff --git a/llvm/docs/TableGen/ProgRef.rst b/llvm/docs/TableGen/ProgRef.rst index dcea3b721dae27..5cf48d6ed29786 100644 --- a/llvm/docs/TableGen/ProgRef.rst +++ b/llvm/docs/TableGen/ProgRef.rst @@ -223,12 +223,12 @@ TableGen provides "bang operators" that have a wide variety of uses: : !div !empty !eq !exists !filter : !find !foldl !foreach !ge !getdagarg : !getdagname !getdagop !gt !head !if - : !interleave !isa !le !listconcat !listremove - : !listsplat !logtwo !lt !mul !ne - : !not !or !range !repr !setdagarg - : !setdagname !setdagop !shl !size !sra - : !srl !strconcat !sub !subst !substr - : !tail !tolower !toupper !xor + : !interleave !isa !le !listconcat !listflatten + : !listremove !listsplat !logtwo !lt !mul + : !ne !not !or !range !repr + : !setdagarg !setdagname !setdagop !shl !size + : !sra !srl !strconcat !sub !subst + : !substr !tail !tolower !toupper !xor The ``!cond`` operator has a slightly different syntax compared to other bang operators, so it is defined separately: @@ -1832,6 +1832,12 @@ and non-0 as true. This operator concatenates the list arguments *list1*, *list2*, etc., and produces the resulting list. The lists must have the same element type. +``!listflatten(``\ *list*\ ``)`` + This operator flattens a list of lists *list* and produces a list with all + elements of the constituent lists concatenated. If *list* is of type + ``list>`` the resulting list is of type ``list``. If *list*'s + element type is not a list, the result is *list* itself. + ``!listremove(``\ *list1*\ ``,`` *list2*\ ``)`` This operator returns a copy of *list1* removing all elements that also occur in *list2*. The lists must have the same element type. diff --git a/llvm/include/llvm/TableGen/Record.h b/llvm/include/llvm/TableGen/Record.h index 5348c1177f63ed..4cd73c3f675527 100644 --- a/llvm/include/llvm/TableGen/Record.h +++ b/llvm/include/llvm/TableGen/Record.h @@ -847,7 +847,8 @@ class UnOpInit : public OpInit, public FoldingSetNode { EMPTY, GETDAGOP, LOG2, - REPR + REPR, + LISTFLATTEN, }; private: diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp index a72a0799a08025..0f99b4a13d2f95 100644 --- a/llvm/lib/TableGen/Record.cpp +++ b/llvm/lib/TableGen/Record.cpp @@ -986,6 +986,32 @@ Init *UnOpInit::Fold(Record *CurRec, bool IsFinal) const { } } break; + + case LISTFLATTEN: + if (ListInit *LHSList = dyn_cast(LHS)) { + ListRecTy *InnerListTy = dyn_cast(LHSList->getElementType()); + // list of non-lists, !listflatten() is a NOP. 
+ if (!InnerListTy) + return LHS; + + auto Flatten = [](ListInit *List) -> std::optional> { + std::vector Flattened; + // Concatenate elements of all the inner lists. + for (Init *InnerInit : List->getValues()) { + ListInit *InnerList = dyn_cast(InnerInit); + if (!InnerList) + return std::nullopt; + for (Init *InnerElem : InnerList->getValues()) + Flattened.push_back(InnerElem); + }; + return Flattened; + }; + + auto Flattened = Flatten(LHSList); + if (Flattened) + return ListInit::get(*Flattened, InnerListTy->getElementType()); + } + break; } return const_cast(this); } @@ -1010,6 +1036,9 @@ std::string UnOpInit::getAsString() const { case EMPTY: Result = "!empty"; break; case GETDAGOP: Result = "!getdagop"; break; case LOG2 : Result = "!logtwo"; break; + case LISTFLATTEN: + Result = "!listflatten"; + break; case REPR: Result = "!repr"; break; diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp index 62a884e01a5306..8fe7f69ecf8e59 100644 --- a/llvm/lib/TableGen/TGLexer.cpp +++ b/llvm/lib/TableGen/TGLexer.cpp @@ -628,6 +628,7 @@ tgtok::TokKind TGLexer::LexExclaim() { .Case("foreach", tgtok::XForEach) .Case("filter", tgtok::XFilter) .Case("listconcat", tgtok::XListConcat) + .Case("listflatten", tgtok::XListFlatten) .Case("listsplat", tgtok::XListSplat) .Case("listremove", tgtok::XListRemove) .Case("range", tgtok::XRange) diff --git a/llvm/lib/TableGen/TGLexer.h b/llvm/lib/TableGen/TGLexer.h index 9adc03ccc72b85..4fa4d84d0535d3 100644 --- a/llvm/lib/TableGen/TGLexer.h +++ b/llvm/lib/TableGen/TGLexer.h @@ -122,6 +122,7 @@ enum TokKind { XSRL, XSHL, XListConcat, + XListFlatten, XListSplat, XStrConcat, XInterleave, diff --git a/llvm/lib/TableGen/TGParser.cpp b/llvm/lib/TableGen/TGParser.cpp index 1a60c2a567a297..54c9a902ec27a1 100644 --- a/llvm/lib/TableGen/TGParser.cpp +++ b/llvm/lib/TableGen/TGParser.cpp @@ -1190,6 +1190,7 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) { case tgtok::XNOT: case tgtok::XToLower: case tgtok::XToUpper: + case tgtok::XListFlatten: case tgtok::XLOG2: case tgtok::XHead: case tgtok::XTail: @@ -1235,6 +1236,11 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) { Code = UnOpInit::NOT; Type = IntRecTy::get(Records); break; + case tgtok::XListFlatten: + Lex.Lex(); // eat the operation. + Code = UnOpInit::LISTFLATTEN; + Type = IntRecTy::get(Records); // Bogus type used here. + break; case tgtok::XLOG2: Lex.Lex(); // eat the operation Code = UnOpInit::LOG2; @@ -1309,7 +1315,8 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) { } } - if (Code == UnOpInit::HEAD || Code == UnOpInit::TAIL) { + if (Code == UnOpInit::HEAD || Code == UnOpInit::TAIL || + Code == UnOpInit::LISTFLATTEN) { ListInit *LHSl = dyn_cast(LHS); TypedInit *LHSt = dyn_cast(LHS); if (!LHSl && !LHSt) { @@ -1328,6 +1335,8 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) { TokError("empty list argument in unary operator"); return nullptr; } + bool UseElementType = + Code == UnOpInit::HEAD || Code == UnOpInit::LISTFLATTEN; if (LHSl) { Init *Item = LHSl->getElement(0); TypedInit *Itemt = dyn_cast(Item); @@ -1335,12 +1344,25 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) { TokError("untyped list element in unary operator"); return nullptr; } - Type = (Code == UnOpInit::HEAD) ? Itemt->getType() - : ListRecTy::get(Itemt->getType()); + Type = UseElementType ? 
Itemt->getType() + : ListRecTy::get(Itemt->getType()); } else { assert(LHSt && "expected list type argument in unary operator"); ListRecTy *LType = dyn_cast(LHSt->getType()); - Type = (Code == UnOpInit::HEAD) ? LType->getElementType() : LType; + Type = UseElementType ? LType->getElementType() : LType; + } + + // for !listflatten, we expect a list of lists, but also support a list of + // non-lists, where !listflatten will be a NOP. + if (Code == UnOpInit::LISTFLATTEN) { + ListRecTy *InnerListTy = dyn_cast(Type); + if (InnerListTy) { + // listflatten will convert list> to list. + Type = ListRecTy::get(InnerListTy->getElementType()); + } else { + // If its a list of non-lists, !listflatten will be a NOP. + Type = ListRecTy::get(Type); + } } } @@ -1378,7 +1400,7 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) { case tgtok::XExists: { // Value ::= !exists '<' Type '>' '(' Value ')' - Lex.Lex(); // eat the operation + Lex.Lex(); // eat the operation. RecTy *Type = ParseOperatorType(); if (!Type) diff --git a/llvm/test/TableGen/listflatten-error.td b/llvm/test/TableGen/listflatten-error.td new file mode 100644 index 00000000000000..56062420982a11 --- /dev/null +++ b/llvm/test/TableGen/listflatten-error.td @@ -0,0 +1,6 @@ +// RUN: not llvm-tblgen %s 2>&1 | FileCheck %s -DFILE=%s + +// CHECK: [[FILE]]:[[@LINE+2]]:33: error: expected list type argument in unary operator +class Flatten { + list F = !listflatten(A); +} diff --git a/llvm/test/TableGen/listflatten.td b/llvm/test/TableGen/listflatten.td new file mode 100644 index 00000000000000..bc9b1c71ea88d7 --- /dev/null +++ b/llvm/test/TableGen/listflatten.td @@ -0,0 +1,32 @@ +// RUN: llvm-tblgen %s | FileCheck %s + +class Flatten A, list B> { + list Flat1 = !listflatten([A, B, [6], [7, 8]]); + + list> X = [A, B]; + list Flat2 = !listflatten(!listconcat(X, [[7]])); + + // Generate a nested list of integers. + list Y0 = [1, 2, 3, 4]; + list> Y1 = !foreach(elem, Y0, [elem]); + list>> Y2 = !foreach(elem, Y1, [elem]); + list>>> Y3 = !foreach(elem, Y2, [elem]); + + // Flatten it completely. + list Flat3=!listflatten(!listflatten(!listflatten(Y3))); + + // Flatten it partially. + list>> Flat4 = !listflatten(Y3); + list> Flat5 = !listflatten(!listflatten(Y3)); + + // Test NOP flattening. + list Flat6 = !listflatten(["a", "b"]); +} + +// CHECK: list Flat1 = [1, 2, 3, 4, 5, 6, 7, 8]; +// CHECK: list Flat2 = [1, 2, 3, 4, 5, 7]; +// CHECK: list Flat3 = [1, 2, 3, 4]; +// CHECK{LITERAL}: list>> Flat4 = [[[1]], [[2]], [[3]], [[4]]]; +// CHECK: list Flat6 = ["a", "b"]; +def F : Flatten<[1,2], [3,4,5]>; + From 12033e550b186f3b3e4d2ca3ce9cfc3d3a3fa6e1 Mon Sep 17 00:00:00 2001 From: Bevin Hansson <59652494+bevin-hansson@users.noreply.github.com> Date: Tue, 24 Sep 2024 15:02:04 +0200 Subject: [PATCH 05/49] [ISelDAG] Salvage debug info at isel by referring to frame indices. (#109126) We can refer to frame index locations when salvaging debug info for certain nodes, which prevents the compiler from optimizing out the location. 
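
For example, in the `llvm/test/CodeGen/X86/pr57673.ll` test updated below, a location that
was previously dropped entirely:

```
DBG_VALUE $noreg, $noreg, !3, !DIExpression(), debug-location !8
```

can now be salvaged by referring to the frame index of the stack slot:

```
DBG_VALUE_LIST !3, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_plus_uconst, 40, DW_OP_stack_value), %stack.1.i, %stack.1.i, debug-location !8
```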
--- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 16 ++++- llvm/test/CodeGen/SPARC/salvage-debug-isel.ll | 69 +++++++++++++++++++ llvm/test/CodeGen/X86/pr57673.ll | 4 +- 3 files changed, 84 insertions(+), 5 deletions(-) create mode 100644 llvm/test/CodeGen/SPARC/salvage-debug-isel.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 9b96dbb666198a..64414e2d44e65a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -11237,6 +11237,12 @@ void SelectionDAG::salvageDebugInfo(SDNode &N) { if (!N.getHasDebugValue()) return; + auto GetLocationOperand = [](SDNode *Node, unsigned ResNo) { + if (auto *FISDN = dyn_cast(Node)) + return SDDbgOperand::fromFrameIdx(FISDN->getIndex()); + return SDDbgOperand::fromNode(Node, ResNo); + }; + SmallVector ClonedDVs; for (auto *DV : GetDbgValues(&N)) { if (DV->isInvalidated()) @@ -11272,7 +11278,7 @@ void SelectionDAG::salvageDebugInfo(SDNode &N) { if (NewLocOps[i].getKind() != SDDbgOperand::SDNODE || NewLocOps[i].getSDNode() != &N) continue; - NewLocOps[i] = SDDbgOperand::fromNode(N0.getNode(), N0.getResNo()); + NewLocOps[i] = GetLocationOperand(N0.getNode(), N0.getResNo()); if (RHSConstant) { SmallVector ExprOps; DIExpression::appendOffset(ExprOps, Offset); @@ -11327,7 +11333,7 @@ void SelectionDAG::salvageDebugInfo(SDNode &N) { NewLocOps[i].getSDNode() != &N) continue; - NewLocOps[i] = SDDbgOperand::fromNode(N0.getNode(), N0.getResNo()); + NewLocOps[i] = GetLocationOperand(N0.getNode(), N0.getResNo()); DbgExpression = DIExpression::appendOpsToArg(DbgExpression, ExtOps, i); Changed = true; } @@ -11350,7 +11356,11 @@ void SelectionDAG::salvageDebugInfo(SDNode &N) { } for (SDDbgValue *Dbg : ClonedDVs) { - assert(!Dbg->getSDNodes().empty() && + assert((!Dbg->getSDNodes().empty() || + llvm::any_of(Dbg->getLocationOps(), + [&](const SDDbgOperand &Op) { + return Op.getKind() == SDDbgOperand::FRAMEIX; + })) && "Salvaged DbgValue should depend on a new SDNode"); AddDbgValue(Dbg, false); } diff --git a/llvm/test/CodeGen/SPARC/salvage-debug-isel.ll b/llvm/test/CodeGen/SPARC/salvage-debug-isel.ll new file mode 100644 index 00000000000000..ce44d3ab7fd082 --- /dev/null +++ b/llvm/test/CodeGen/SPARC/salvage-debug-isel.ll @@ -0,0 +1,69 @@ +; RUN: llc -march=sparc -O1 %s -o - -stop-after=finalize-isel | FileCheck %s + +; Debug info salvaging in isel means we should see a location for this variable. 
+ +; CHECK-LABEL: name: a +; CHECK: DBG_VALUE %stack.0.b, $noreg, ![[#]], !DIExpression(DW_OP_plus_uconst, 3, DW_OP_stack_value) + +define dso_local zeroext i16 @a() local_unnamed_addr #0 !dbg !7 { +entry: + %b = alloca [6 x i8], align 1 + %arrayidx = getelementptr inbounds [6 x i8], ptr %b, i32 0, i32 undef, !dbg !27 + store i8 4, ptr %arrayidx, align 1, !dbg !28 + %arrayidx1 = getelementptr inbounds i8, ptr %b, i32 3, !dbg !32 + #dbg_value(ptr %arrayidx1, !22, !DIExpression(), !25) + %0 = load i8, ptr %arrayidx1, align 1, !dbg !33 + %tobool.not = icmp eq i8 %0, 0, !dbg !35 + br i1 %tobool.not, label %if.end, label %for.cond, !dbg !36 + +for.cond: ; preds = %entry, %for.cond + br label %for.cond, !dbg !37, !llvm.loop !40 + +if.end: ; preds = %entry + ret i16 undef, !dbg !44 +} + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5} +!llvm.ident = !{!6} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 20.0.0git.prerel", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: None) +!1 = !DIFile(filename: "file.c", directory: "/path", checksumkind: CSK_MD5, checksum: "aa7b5139660a2329a6409414c44cc1f6") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 7, !"debug-info-assignment-tracking", i1 true} +!6 = !{!"clang version 20.0.0git.prerel"} +!7 = distinct !DISubprogram(name: "a", scope: !1, file: !1, line: 2, type: !8, scopeLine: 2, flags: DIFlagAllCallsDescribed, spFlags: DISPFlagDefinition | DISPFlagOptimized, unit: !0, retainedNodes: !13) +!8 = !DISubroutineType(types: !9) +!9 = !{!10} +!10 = !DIDerivedType(tag: DW_TAG_typedef, name: "uint16_t", file: !11, line: 277, baseType: !12) +!11 = !DIFile(filename: "stdint.h", directory: "", checksumkind: CSK_MD5, checksum: "d9e8f73f3756bbd642f1729623e09484") +!12 = !DIBasicType(name: "unsigned short", size: 16, encoding: DW_ATE_unsigned) +!13 = !{!14, !20, !22} +!14 = !DILocalVariable(name: "b", scope: !7, file: !1, line: 3, type: !15) +!15 = !DICompositeType(tag: DW_TAG_array_type, baseType: !16, size: 48, elements: !18) +!16 = !DIDerivedType(tag: DW_TAG_typedef, name: "int8_t", file: !11, line: 298, baseType: !17) +!17 = !DIBasicType(name: "signed char", size: 8, encoding: DW_ATE_signed_char) +!18 = !{!19} +!19 = !DISubrange(count: 6) +!20 = !DILocalVariable(name: "c", scope: !7, file: !1, line: 4, type: !21) +!21 = !DIBasicType(name: "int", size: 32, encoding: DW_ATE_signed) +!22 = !DILocalVariable(name: "d", scope: !7, file: !1, line: 6, type: !23) +!23 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !16, size: 32) +!25 = !DILocation(line: 0, scope: !7) +!27 = !DILocation(line: 5, column: 3, scope: !7) +!28 = !DILocation(line: 5, column: 8, scope: !7) +!32 = !DILocation(line: 6, column: 16, scope: !7) +!33 = !DILocation(line: 7, column: 33, scope: !34) +!34 = distinct !DILexicalBlock(scope: !7, file: !1, line: 7, column: 7) +!35 = !DILocation(line: 7, column: 7, scope: !34) +!36 = !DILocation(line: 7, column: 7, scope: !7) +!37 = !DILocation(line: 8, column: 5, scope: !38) +!38 = distinct !DILexicalBlock(scope: !39, file: !1, line: 8, column: 5) +!39 = distinct !DILexicalBlock(scope: !34, file: !1, line: 8, column: 5) +!40 = distinct !{!40, !41, !42, !43} +!41 = !DILocation(line: 8, column: 5, scope: !39) +!42 = !DILocation(line: 9, column: 7, scope: !39) +!43 = !{!"llvm.loop.unroll.disable"} +!44 = !DILocation(line: 10, column: 1, scope: !7) diff --git 
a/llvm/test/CodeGen/X86/pr57673.ll b/llvm/test/CodeGen/X86/pr57673.ll index cf7717f420480b..4ca8ae91f9e6fc 100644 --- a/llvm/test/CodeGen/X86/pr57673.ll +++ b/llvm/test/CodeGen/X86/pr57673.ll @@ -37,7 +37,7 @@ define void @foo() { ; NORMAL-NEXT: {{ $}} ; NORMAL-NEXT: [[MOVUPSrm:%[0-9]+]]:vr128 = MOVUPSrm %stack.1.i, 1, $noreg, 40, $noreg :: (load (s128) from %ir.i4, align 8) ; NORMAL-NEXT: MOVUPSmr $noreg, 1, $noreg, 0, $noreg, killed [[MOVUPSrm]] :: (store (s128) into `ptr null`, align 8) - ; NORMAL-NEXT: DBG_VALUE $noreg, $noreg, !3, !DIExpression(), debug-location !8 + ; NORMAL-NEXT: DBG_VALUE_LIST !3, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_plus_uconst, 40, DW_OP_stack_value), %stack.1.i, %stack.1.i, debug-location !8 ; NORMAL-NEXT: [[MOVUPSrm1:%[0-9]+]]:vr128 = MOVUPSrm %stack.1.i, 1, $noreg, 40, $noreg :: (load (s128) from %ir.i6, align 8) ; NORMAL-NEXT: MOVUPSmr $noreg, 1, $noreg, 0, $noreg, killed [[MOVUPSrm1]] :: (store (s128) into `ptr null`, align 8) ; NORMAL-NEXT: {{ $}} @@ -76,7 +76,7 @@ define void @foo() { ; INSTRREF-NEXT: {{ $}} ; INSTRREF-NEXT: [[MOVUPSrm:%[0-9]+]]:vr128 = MOVUPSrm %stack.1.i, 1, $noreg, 40, $noreg :: (load (s128) from %ir.i4, align 8) ; INSTRREF-NEXT: MOVUPSmr $noreg, 1, $noreg, 0, $noreg, killed [[MOVUPSrm]] :: (store (s128) into `ptr null`, align 8) - ; INSTRREF-NEXT: DBG_VALUE $noreg, $noreg, !3, !DIExpression(), debug-location !8 + ; INSTRREF-NEXT: DBG_VALUE_LIST !3, !DIExpression(DW_OP_LLVM_arg, 0, DW_OP_plus_uconst, 40, DW_OP_stack_value), %stack.1.i, %stack.1.i, debug-location !8 ; INSTRREF-NEXT: [[MOVUPSrm1:%[0-9]+]]:vr128 = MOVUPSrm %stack.1.i, 1, $noreg, 40, $noreg :: (load (s128) from %ir.i6, align 8) ; INSTRREF-NEXT: MOVUPSmr $noreg, 1, $noreg, 0, $noreg, killed [[MOVUPSrm1]] :: (store (s128) into `ptr null`, align 8) ; INSTRREF-NEXT: {{ $}} From b47d1787b51f55d69ef1b4f88e72cd54af451649 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?= Date: Tue, 24 Sep 2024 14:03:30 +0100 Subject: [PATCH 06/49] [mlir][vector] Refine vectorisation of tensor.extract (#109580) This PR fixes a bug in `isLoopInvariantIdx`. It makes sure that the following case is vectorised as `vector.gather` (as opposed to attempting a contiguous load): ```mlir func.func @index_from_output_column_vector_gather_load(%src: tensor<8x128xf32>) -> tensor<8x1xf32> { %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<8x1xf32> %res = linalg.generic { indexing_maps = [#map], iterator_types = ["parallel", "parallel"] } outs(%0 : tensor<8x1xf32>) { ^bb0(%arg1: f32): %1 = linalg.index 0 : index %extracted = tensor.extract %src[%1, %c0] : tensor<8x128xf32> linalg.yield %extracted : f32 } -> tensor<8x1xf32> return %res : tensor<8x1xf32> } ``` Specifically, when looking for loop-invariant indices in `tensor.extract` Ops, any `linalg.index` Op that's used in address colcluation should only access loop dims that are == 1. In the example above, the following does not meet that criteria: ```mlir %1 = linalg.index 0 : index ``` Note that this PR also effectively addresses the issue fixed in #107922, i.e. exercised by: * `@vectorize_nd_tensor_extract_load_1d_column_vector_using_gather_load` `getNonUnitLoopDim` introduced in #107922 is still valid though. 
In fact, it is required to identify that the following case is a contiguous load: ```mlir func.func @index_from_output_column_vector_contiguous_load(%src: tensor<8x128xf32>) -> tensor<8x1xf32> { %c0 = arith.constant 0 : index %0 = tensor.empty() : tensor<8x1xf32> %res = linalg.generic { indexing_maps = [#map], iterator_types = ["parallel", "parallel"] } outs(%0 : tensor<8x1xf32>) { ^bb0(%arg1: f32): %1 = linalg.index 0 : index %extracted = tensor.extract %src[%c0, %1] : tensor<8x128xf32> linalg.yield %extracted : f32 } -> tensor<8x1xf32> return %res : tensor<8x1xf32> } ``` Some logic is still missing to lower the above to `vector.transfer_read`, so it is conservatively lowered to `vector.gather` instead (see TODO in `getTensorExtractMemoryAccessPattern`). There's a few additional changes: * `getNonUnitLoopDim` is simplified and renamed as `getTrailingNonUnitLoopDimIdx`, additional comments are added (note that the functionality didn't change); * extra comments in a few places, variable names in comments update to use Markdown (which is the preferred approach in MLIR). This is a follow-on for: * https://github.com/llvm/llvm-project/pull/107922 * https://github.com/llvm/llvm-project/pull/102321 --- .../Linalg/Transforms/Vectorization.cpp | 85 ++++++++++-------- .../Linalg/vectorize-tensor-extract.mlir | 90 +++++++++++++++++++ 2 files changed, 140 insertions(+), 35 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index 6800a0fec278c6..c332307da4d333 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -810,27 +810,35 @@ static Value calculateGatherOffset(RewriterBase &rewriter, enum VectorMemoryAccessKind { ScalarBroadcast, Contiguous, Gather }; -/// Find the non-unit dim in a linalgOp. -/// When executing this hook, it is expected that only one dim will be non-unit. -/// Other cases (i.e. reading n-D vectors) should've been labelled as gather -/// loads before calling this method. This is used for finding contiguous loads -/// (represented as `tensor.extract`) within `linalg.generic` Ops. Note that -/// this condition is expected to hold for statically shaped Linalg Ops only. -static uint64_t getNonUnitLoopDim(LinalgOp linalgOp) { - uint64_t nonUnitDim = 0; - uint64_t countNonUnitDim = 0; - for (auto tripCount : llvm::enumerate(linalgOp.getStaticLoopRanges())) { - if (tripCount.value() != 1) { - nonUnitDim = tripCount.index(); - countNonUnitDim++; - } - } - +/// Find the index of the trailing non-unit dim in linalgOp. This hook is used +/// when checking whether `tensor.extract` Op (within a `linalg.generic` Op) +/// represents a contiguous load operation. +/// +/// Note that when calling this hook, it is assumed that the output vector is +/// effectively 1D. Other cases (i.e. reading n-D vectors) should've been +/// labelled as a gather load before entering this method. +/// +/// Following on from the above, it is assumed that: +/// * for statically shaped loops, when no masks are used, only one dim is != +/// 1 (that's what the shape of the output vector is based on). +/// * for dynamically shaped loops, there might be more non-unit dims +/// as the output vector type is user-specified. 
+/// +/// TODO: Statically shaped loops + vector masking +static uint64_t getTrailingNonUnitLoopDimIdx(LinalgOp linalgOp) { + SmallVector loopRanges = linalgOp.getStaticLoopRanges(); assert(linalgOp.hasDynamicShape() || - countNonUnitDim == 1 && "For statically shaped Linalg Ops, only one " - "non-unit loop dim is expected"); - (void)countNonUnitDim; - return nonUnitDim; + llvm::count_if(loopRanges, [](int64_t dim) { return dim != 1; }) == + 1 && + "For statically shaped Linalg Ops, only one " + "non-unit loop dim is expected"); + + size_t idx = loopRanges.size() - 1; + for (; idx >= 0; idx--) + if (loopRanges[idx] != 1) + break; + + return idx; } /// Checks whether `val` can be used for calculating a loop invariant index. @@ -854,11 +862,11 @@ static bool isLoopInvariantIdx(LinalgOp &linalgOp, Value &val, assert(defOp && "This is neither a block argument nor an operation result"); // IndexOp is loop invariant as long as its result remains constant across - // iterations. Given the assumptions on the loop ranges above, only the - // trailing loop dim ever changes. - auto trailingLoopDim = linalgOp.getStaticLoopRanges().size() - 1; - if (auto indexOp = dyn_cast(defOp)) - return (indexOp.getDim() != trailingLoopDim); + // iterations. Note that for dynamic shapes, the corresponding dim will also + // be conservatively treated as != 1. + if (auto indexOp = dyn_cast(defOp)) { + return linalgOp.getStaticLoopRanges()[indexOp.getDim()] == 1; + } auto *ancestor = block->findAncestorOpInBlock(*defOp); @@ -877,7 +885,7 @@ static bool isLoopInvariantIdx(LinalgOp &linalgOp, Value &val, return result; } -/// Check whether \p val could be used for calculating the trailing index for a +/// Check whether `val` could be used for calculating the trailing index for a /// contiguous load operation. /// /// There are currently 3 types of values that are allowed here: @@ -886,13 +894,14 @@ static bool isLoopInvariantIdx(LinalgOp &linalgOp, Value &val, /// 3. results of basic arithmetic operations (linear and continuous) /// involving 1., 2. and 3. /// This method returns True if indeed only such values are used in calculating -/// \p val. +/// `val.` /// /// Additionally, the trailing index for a contiguous load operation should /// increment by 1 with every loop iteration, i.e. be based on: /// * `linalg.index ` , -/// where is the trailing dim of the iteration space. \p foundIndexOp is -/// updated to `true` when such an op is found. +/// where is the trailing non-unit dim of the iteration space (this way, +/// `linalg.index ` increments by 1 with every loop iteration). +/// `foundIndexOp` is updated to `true` when such Op is found. static bool isContiguousLoadIdx(LinalgOp &linalgOp, Value &val, bool &foundIndexOp, VectorType resType) { @@ -912,12 +921,10 @@ static bool isContiguousLoadIdx(LinalgOp &linalgOp, Value &val, Operation *defOp = val.getDefiningOp(); assert(defOp && "This is neither a block argument nor an operation result"); - // Given the assumption on the loop ranges above, we expect only 1 non-unit - // loop dim. 
- auto nonUnitLoopDim = getNonUnitLoopDim(linalgOp); - if (auto indexOp = dyn_cast(defOp)) { - foundIndexOp = (indexOp.getDim() == nonUnitLoopDim); + auto loopDimThatIncrementsByOne = getTrailingNonUnitLoopDimIdx(linalgOp); + + foundIndexOp = (indexOp.getDim() == loopDimThatIncrementsByOne); return true; } @@ -1012,7 +1019,10 @@ getTensorExtractMemoryAccessPattern(tensor::ExtractOp extractOp, bool foundIndexOp = false; bool isContiguousLoad = isContiguousLoadIdx(linalgOp, extractOpTrailingIdx, foundIndexOp, resType); - isContiguousLoad &= foundIndexOp; + // TODO: Support generating contiguous loads for column vectors - that will + // require adding a permutation map to tranfer_read Ops. + bool isRowVector = resType.getShape().back() != 1; + isContiguousLoad &= (foundIndexOp && isRowVector); if (isContiguousLoad) { LDBG("Found contigous load: " << extractOp); @@ -1073,6 +1083,11 @@ vectorizeTensorExtract(RewriterBase &rewriter, VectorizationState &state, // b. contiguous loads. // Both cases use vector.transfer_read. + assert(llvm::count_if(resultType.getShape(), + [](uint64_t dim) { return dim != 1; }) && + "Contiguous loads and scalar loads + broadcast only support 1-D " + "vectors ATM!"); + // Collect indices for `vector.transfer_read`. At this point, the indices will // either be scalars or would have been broadcast to vectors matching the // result type. For indices that are vectors, there are two options: diff --git a/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir b/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir index ad3a8d9f926082..2c56b7139fec49 100644 --- a/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir +++ b/mlir/test/Dialect/Linalg/vectorize-tensor-extract.mlir @@ -307,6 +307,96 @@ module attributes {transform.with_named_sequence} { // ----- +// Reading a 1D column vector (hence a candidate for a contiguous load), but given +// %1, it's a gather load. 
+ +#map = affine_map<(d0, d1) -> (d0, d1)> +func.func @index_from_output_column_vector_gather_load(%src: tensor<8x128xf32>) -> tensor<8x1xf32> { + %c0 = arith.constant 0 : index + %0 = tensor.empty() : tensor<8x1xf32> + %res = linalg.generic { + indexing_maps = [#map], + iterator_types = ["parallel", "parallel"] + } outs(%0 : tensor<8x1xf32>) { + ^bb0(%arg1: f32): + %1 = linalg.index 0 : index + %extracted = tensor.extract %src[%1, %c0] : tensor<8x128xf32> + linalg.yield %extracted : f32 + } -> tensor<8x1xf32> + return %res : tensor<8x1xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg2: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.generic"]} in %arg2 : (!transform.any_op) -> !transform.any_op + %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op + %2 = transform.structured.vectorize_children_and_apply_patterns %1 {vectorize_nd_extract} : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// CHECK-LABEL: func.func @index_from_output_column_vector_gather_load( +// CHECK-SAME: %[[SRC:.*]]: tensor<8x128xf32>) -> tensor<8x1xf32> { +// CHECK: %[[C128:.*]] = arith.constant dense<128> : vector<1x8xindex> +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[PASS_THRU:.*]] = arith.constant dense<0.000000e+00> : vector<8x1xf32> +// CHECK: %[[MASK:.*]] = arith.constant dense : vector<8x1xi1> +// CHECK: %[[IDX_VEC:.*]] = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7]> : vector<8xindex> +// CHECK: %[[OUT:.*]] = tensor.empty() : tensor<8x1xf32> +// CHECK: %[[B:.*]] = vector.broadcast %[[IDX_VEC]] : vector<8xindex> to vector<1x8xindex> +// CHECK: %[[MUL:.*]] = arith.muli %[[B]], %[[C128]] : vector<1x8xindex> +// CHECK: %[[TR:.*]] = vector.transpose %[[MUL]], [1, 0] : vector<1x8xindex> to vector<8x1xindex> +// CHECK: %[[GATHER:.*]] = vector.gather %[[SRC]]{{\[}}%[[C0]], %[[C0]]] {{\[}}%[[TR]]], %[[MASK]], %[[PASS_THRU]] : tensor<8x128xf32>, vector<8x1xindex>, vector<8x1xi1>, vector<8x1xf32> into vector<8x1xf32> +// CHECK: %[[RES:.*]] = vector.transfer_write %[[GATHER]], %[[OUT]]{{\[}}%[[C0]], %[[C0]]] {in_bounds = [true, true]} : vector<8x1xf32>, tensor<8x1xf32> +// CHECK: return %[[RES]] : tensor<8x1xf32> + +// ----- + +// Same as above, but the access indices have been swapped and hence this _is_ +// a contiguous load. Currently not supported and lowered as vector.gather +// instead. +// TODO: Make sure that this is lowered as a contiguous load. 
+ +#map = affine_map<(d0, d1) -> (d0, d1)> +func.func @index_from_output_column_vector_contiguous_load(%src: tensor<8x128xf32>) -> tensor<8x1xf32> { + %c0 = arith.constant 0 : index + %0 = tensor.empty() : tensor<8x1xf32> + %res = linalg.generic { + indexing_maps = [#map], + iterator_types = ["parallel", "parallel"] + } outs(%0 : tensor<8x1xf32>) { + ^bb0(%arg1: f32): + %1 = linalg.index 0 : index + %extracted = tensor.extract %src[%c0, %1] : tensor<8x128xf32> + linalg.yield %extracted : f32 + } -> tensor<8x1xf32> + return %res : tensor<8x1xf32> +} + +module attributes {transform.with_named_sequence} { + transform.named_sequence @__transform_main(%arg2: !transform.any_op {transform.readonly}) { + %0 = transform.structured.match ops{["linalg.generic"]} in %arg2 : (!transform.any_op) -> !transform.any_op + %1 = transform.get_parent_op %0 {isolated_from_above} : (!transform.any_op) -> !transform.any_op + %2 = transform.structured.vectorize_children_and_apply_patterns %1 {vectorize_nd_extract} : (!transform.any_op) -> !transform.any_op + transform.yield + } +} + +// CHECK-LABEL: func.func @index_from_output_column_vector_contiguous_load( +// CHECK-SAME: %[[SRC:.*]]: tensor<8x128xf32>) -> tensor<8x1xf32> { +// CHECK: %[[C0:.*]] = arith.constant 0 : index +// CHECK: %[[PASS_THRU:.*]] = arith.constant dense<0.000000e+00> : vector<8x1xf32> +// CHECK: %[[MASK:.*]] = arith.constant dense : vector<8x1xi1> +// CHECK: %[[IDX_VEC:.*]] = arith.constant dense<[0, 1, 2, 3, 4, 5, 6, 7]> : vector<8xindex> +// CHECK: %[[OUT:.*]] = tensor.empty() : tensor<8x1xf32> +// CHECK: %[[B:.*]] = vector.broadcast %[[IDX_VEC]] : vector<8xindex> to vector<1x8xindex> +// CHECK: %[[TR:.*]] = vector.transpose %[[B]], [1, 0] : vector<1x8xindex> to vector<8x1xindex> +// CHECK: %[[GATHER:.*]] = vector.gather %[[SRC]]{{\[}}%[[C0]], %[[C0]]] {{\[}}%[[TR]]], %[[MASK]], %[[PASS_THRU]] : tensor<8x128xf32>, vector<8x1xindex>, vector<8x1xi1>, vector<8x1xf32> into vector<8x1xf32> +// CHECK: %[[RES:.*]] = vector.transfer_write %[[GATHER]], %[[OUT]]{{\[}}%[[C0]], %[[C0]]] {in_bounds = [true, true]} : vector<8x1xf32>, tensor<8x1xf32> +// CHECK: return %[[RES]] : tensor<8x1xf32> + +// ----- + #map = affine_map<(d0) -> (d0)> func.func @vectorize_nd_tensor_extract_contiguous_and_gather(%arg0: tensor<6xf32>, %arg1: tensor<5xi32>) -> tensor<5xf32> { %c5 = arith.constant 5 : index From 106e4506ce6084e10353073d8880e2f736b74981 Mon Sep 17 00:00:00 2001 From: David Pagan Date: Tue, 24 Sep 2024 06:16:03 -0700 Subject: [PATCH 07/49] [OpenMP][Docs] Update OpenMP release notes with 'omp scope' (#109752) Release notes: added 'omp scope' directive to "OpenMP Support" section of "What's New in Clang" --- clang/docs/ReleaseNotes.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index b47e06cb0c5d68..e511614fcf2451 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -637,6 +637,7 @@ Python Binding Changes OpenMP Support -------------- - Added support for 'omp assume' directive. +- Added support for 'omp scope' directive. 
Improvements ^^^^^^^^^^^^ From ebbf664aa31ac51c43a345a8a3d0734c54be7c4b Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 24 Sep 2024 14:06:57 +0100 Subject: [PATCH 08/49] [AMDGPU] Use shared prefix in GFX12 barrier test --- .../AMDGPU/llvm.amdgcn.s.barrier.wait.ll | 2140 ++++++++--------- 1 file changed, 1061 insertions(+), 1079 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll index 6e029f7c0a95e5..a4be9ed8c2b4af 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll @@ -1,43 +1,43 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN %s -; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GLOBAL-ISEL %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-SDAG %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX12,GFX12-GISEL %s define amdgpu_kernel void @test1_s_barrier_signal(ptr addrspace(1) %out) #0 { -; GCN-LABEL: test1_s_barrier_signal: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_store_b32 v3, v2, s[0:1] -; GCN-NEXT: s_wait_storecnt 0x0 -; GCN-NEXT: s_barrier_signal -1 -; GCN-NEXT: s_barrier_wait -1 -; GCN-NEXT: global_store_b32 v3, v0, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test1_s_barrier_signal: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_barrier_signal -1 +; GFX12-SDAG-NEXT: s_barrier_wait -1 +; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test1_s_barrier_signal: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 -; GLOBAL-ISEL-NEXT: s_barrier_signal -1 -; GLOBAL-ISEL-NEXT: s_barrier_wait -1 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; 
GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test1_s_barrier_signal: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_barrier_signal -1 +; GFX12-GISEL-NEXT: s_barrier_wait -1 +; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -51,41 +51,41 @@ entry: } define amdgpu_kernel void @test2_s_barrier_signal(ptr addrspace(1) %out) #0 { -; GCN-LABEL: test2_s_barrier_signal: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_store_b32 v3, v2, s[0:1] -; GCN-NEXT: s_wait_storecnt 0x0 -; GCN-NEXT: s_barrier_signal 1 -; GCN-NEXT: s_barrier_wait 1 -; GCN-NEXT: global_store_b32 v3, v0, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test2_s_barrier_signal: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_barrier_signal 1 +; GFX12-SDAG-NEXT: s_barrier_wait 1 +; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test2_s_barrier_signal: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 -; GLOBAL-ISEL-NEXT: s_barrier_signal 1 -; GLOBAL-ISEL-NEXT: s_barrier_wait 1 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test2_s_barrier_signal: +; GFX12-GISEL: ; 
%bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_barrier_signal 1 +; GFX12-GISEL-NEXT: s_barrier_wait 1 +; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -99,41 +99,41 @@ entry: } define amdgpu_kernel void @test3_s_barrier_signal(ptr addrspace(1) %out) #0 { -; GCN-LABEL: test3_s_barrier_signal: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_store_b32 v3, v2, s[0:1] -; GCN-NEXT: s_wait_storecnt 0x0 -; GCN-NEXT: s_barrier_signal 0 -; GCN-NEXT: s_barrier_wait 0 -; GCN-NEXT: global_store_b32 v3, v0, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test3_s_barrier_signal: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_barrier_signal 0 +; GFX12-SDAG-NEXT: s_barrier_wait 0 +; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test3_s_barrier_signal: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 -; GLOBAL-ISEL-NEXT: s_barrier_signal 0 -; GLOBAL-ISEL-NEXT: s_barrier_wait 0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test3_s_barrier_signal: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | 
instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_barrier_signal 0 +; GFX12-GISEL-NEXT: s_barrier_wait 0 +; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -147,44 +147,44 @@ entry: } define amdgpu_kernel void @test1_s_barrier_signal_var(ptr addrspace(1) %out) #0 { -; GCN-LABEL: test1_s_barrier_signal_var: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_mov_b32 m0, 1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GCN-NEXT: v_mul_u32_u24_e32 v2, v0, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 -; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_store_b32 v3, v1, s[0:1] -; GCN-NEXT: s_wait_storecnt 0x0 -; GCN-NEXT: s_barrier_signal m0 -; GCN-NEXT: s_barrier_wait 1 -; GCN-NEXT: global_store_b32 v3, v0, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test1_s_barrier_signal_var: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_mov_b32 m0, 1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v2, v0, v0 +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[0:1] +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_barrier_signal m0 +; GFX12-SDAG-NEXT: s_barrier_wait 1 +; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test1_s_barrier_signal_var: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v2, 0 -; GLOBAL-ISEL-NEXT: s_mov_b32 m0, 1 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 -; GLOBAL-ISEL-NEXT: s_barrier_signal m0 -; GLOBAL-ISEL-NEXT: s_barrier_wait 1 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test1_s_barrier_signal_var: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v2, 0 +; GFX12-GISEL-NEXT: s_mov_b32 m0, 1 +; 
GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_barrier_signal m0 +; GFX12-GISEL-NEXT: s_barrier_wait 1 +; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -198,83 +198,83 @@ entry: } define void @test2_s_barrier_signal_var(i32 %arg) { -; GCN-LABEL: test2_s_barrier_signal_var: -; GCN: ; %bb.0: -; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 -; GCN-NEXT: s_wait_expcnt 0x0 -; GCN-NEXT: s_wait_samplecnt 0x0 -; GCN-NEXT: s_wait_bvhcnt 0x0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: s_mov_b32 m0, s0 -; GCN-NEXT: s_wait_storecnt 0x0 -; GCN-NEXT: s_barrier_signal m0 -; GCN-NEXT: s_wait_alu 0xfffe -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX12-SDAG-LABEL: test2_s_barrier_signal_var: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_mov_b32 m0, s0 +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_barrier_signal m0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GLOBAL-ISEL-LABEL: test2_s_barrier_signal_var: -; GLOBAL-ISEL: ; %bb.0: -; GLOBAL-ISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_expcnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_samplecnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_bvhcnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v0 -; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 -; GLOBAL-ISEL-NEXT: s_barrier_signal m0 -; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31] +; GFX12-GISEL-LABEL: test2_s_barrier_signal_var: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 m0, v0 +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_barrier_signal m0 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] call void @llvm.amdgcn.s.barrier.signal.var(i32 %arg) ret void } define amdgpu_kernel void @test1_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { -; GCN-LABEL: test1_s_barrier_signal_isfirst: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_store_b32 v0, v1, s[6:7] -; GCN-NEXT: s_wait_storecnt 0x0 -; GCN-NEXT: s_barrier_signal_isfirst -1 -; GCN-NEXT: s_cselect_b32 s3, s3, s5 -; GCN-NEXT: s_cselect_b32 s2, s2, s4 -; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: global_load_b32 v2, v1, s[0:1] -; GCN-NEXT: 
global_load_b32 v1, v1, s[2:3] -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: v_mul_lo_u32 v1, v1, v2 -; GCN-NEXT: global_store_b32 v0, v1, s[6:7] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test1_s_barrier_signal_isfirst: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_barrier_signal_isfirst -1 +; GFX12-SDAG-NEXT: s_cselect_b32 s3, s3, s5 +; GFX12-SDAG-NEXT: s_cselect_b32 s2, s2, s4 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: global_load_b32 v2, v1, s[0:1] +; GFX12-SDAG-NEXT: global_load_b32 v1, v1, s[2:3] +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test1_s_barrier_signal_isfirst: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] -; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 -; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst -1 -; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1 -; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0 -; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] -; GLOBAL-ISEL-NEXT: s_clause 0x1 -; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1] -; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3] -; GLOBAL-ISEL-NEXT: s_wait_loadcnt 0x0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test1_s_barrier_signal_isfirst: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_barrier_signal_isfirst -1 +; GFX12-GISEL-NEXT: s_cselect_b32 s8, 1, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_and_b32 s8, s8, 1 +; GFX12-GISEL-NEXT: s_cmp_lg_u32 s8, 0 +; GFX12-GISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: global_load_b32 v2, v1, s[0:1] +; GFX12-GISEL-NEXT: global_load_b32 v1, v1, s[2:3] +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: 
%tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -289,52 +289,52 @@ entry: } define amdgpu_kernel void @test2_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { -; GCN-LABEL: test2_s_barrier_signal_isfirst: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_store_b32 v0, v1, s[6:7] -; GCN-NEXT: s_wait_storecnt 0x0 -; GCN-NEXT: s_barrier_signal_isfirst 1 -; GCN-NEXT: s_cselect_b32 s3, s3, s5 -; GCN-NEXT: s_cselect_b32 s2, s2, s4 -; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: global_load_b32 v2, v1, s[0:1] -; GCN-NEXT: global_load_b32 v1, v1, s[2:3] -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: v_mul_lo_u32 v1, v1, v2 -; GCN-NEXT: global_store_b32 v0, v1, s[6:7] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test2_s_barrier_signal_isfirst: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_barrier_signal_isfirst 1 +; GFX12-SDAG-NEXT: s_cselect_b32 s3, s3, s5 +; GFX12-SDAG-NEXT: s_cselect_b32 s2, s2, s4 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: global_load_b32 v2, v1, s[0:1] +; GFX12-SDAG-NEXT: global_load_b32 v1, v1, s[2:3] +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test2_s_barrier_signal_isfirst: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] -; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 -; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst 1 -; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1 -; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0 -; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] -; GLOBAL-ISEL-NEXT: s_clause 0x1 -; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1] -; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3] -; GLOBAL-ISEL-NEXT: s_wait_loadcnt 0x0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test2_s_barrier_signal_isfirst: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; 
GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_barrier_signal_isfirst 1 +; GFX12-GISEL-NEXT: s_cselect_b32 s8, 1, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_and_b32 s8, s8, 1 +; GFX12-GISEL-NEXT: s_cmp_lg_u32 s8, 0 +; GFX12-GISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: global_load_b32 v2, v1, s[0:1] +; GFX12-GISEL-NEXT: global_load_b32 v1, v1, s[2:3] +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -349,52 +349,52 @@ entry: } define amdgpu_kernel void @test3_s_barrier_signal_isfirst(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { -; GCN-LABEL: test3_s_barrier_signal_isfirst: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_store_b32 v0, v1, s[6:7] -; GCN-NEXT: s_wait_storecnt 0x0 -; GCN-NEXT: s_barrier_signal_isfirst 1 -; GCN-NEXT: s_cselect_b32 s3, s3, s5 -; GCN-NEXT: s_cselect_b32 s2, s2, s4 -; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: global_load_b32 v2, v1, s[0:1] -; GCN-NEXT: global_load_b32 v1, v1, s[2:3] -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: v_mul_lo_u32 v1, v1, v2 -; GCN-NEXT: global_store_b32 v0, v1, s[6:7] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test3_s_barrier_signal_isfirst: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_barrier_signal_isfirst 1 +; GFX12-SDAG-NEXT: s_cselect_b32 s3, s3, s5 +; GFX12-SDAG-NEXT: s_cselect_b32 s2, s2, s4 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: global_load_b32 v2, v1, s[0:1] +; GFX12-SDAG-NEXT: global_load_b32 v1, v1, s[2:3] +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test3_s_barrier_signal_isfirst: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] -; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 -; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst 1 -; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | 
instid1(SALU_CYCLE_1) -; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1 -; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0 -; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] -; GLOBAL-ISEL-NEXT: s_clause 0x1 -; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1] -; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3] -; GLOBAL-ISEL-NEXT: s_wait_loadcnt 0x0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test3_s_barrier_signal_isfirst: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_barrier_signal_isfirst 1 +; GFX12-GISEL-NEXT: s_cselect_b32 s8, 1, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_and_b32 s8, s8, 1 +; GFX12-GISEL-NEXT: s_cmp_lg_u32 s8, 0 +; GFX12-GISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: global_load_b32 v2, v1, s[0:1] +; GFX12-GISEL-NEXT: global_load_b32 v1, v1, s[2:3] +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -409,54 +409,54 @@ entry: } define amdgpu_kernel void @test1_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { -; GCN-LABEL: test1_s_barrier_signal_isfirst_var: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_mov_b32 m0, 1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_store_b32 v0, v1, s[6:7] -; GCN-NEXT: s_wait_storecnt 0x0 -; GCN-NEXT: s_barrier_signal_isfirst m0 -; GCN-NEXT: s_cselect_b32 s3, s3, s5 -; GCN-NEXT: s_cselect_b32 s2, s2, s4 -; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: global_load_b32 v2, v1, s[0:1] -; GCN-NEXT: global_load_b32 v1, v1, s[2:3] -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: v_mul_lo_u32 v1, v1, v2 -; GCN-NEXT: global_store_b32 v0, v1, s[6:7] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test1_s_barrier_signal_isfirst_var: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_mov_b32 m0, 1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_barrier_signal_isfirst m0 +; GFX12-SDAG-NEXT: s_cselect_b32 s3, s3, s5 +; GFX12-SDAG-NEXT: s_cselect_b32 s2, s2, s4 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: 
global_load_b32 v2, v1, s[0:1] +; GFX12-SDAG-NEXT: global_load_b32 v1, v1, s[2:3] +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test1_s_barrier_signal_isfirst_var: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_mov_b32 m0, 1 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] -; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 -; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst m0 -; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1 -; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0 -; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] -; GLOBAL-ISEL-NEXT: s_clause 0x1 -; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1] -; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3] -; GLOBAL-ISEL-NEXT: s_wait_loadcnt 0x0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test1_s_barrier_signal_isfirst_var: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_mov_b32 m0, 1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_barrier_signal_isfirst m0 +; GFX12-GISEL-NEXT: s_cselect_b32 s8, 1, 0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_and_b32 s8, s8, 1 +; GFX12-GISEL-NEXT: s_cmp_lg_u32 s8, 0 +; GFX12-GISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: global_load_b32 v2, v1, s[0:1] +; GFX12-GISEL-NEXT: global_load_b32 v1, v1, s[2:3] +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -471,65 +471,65 @@ entry: } define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, i32 %arg, ptr addrspace(1) %out) { -; GCN-LABEL: test2_s_barrier_signal_isfirst_var: -; GCN: ; %bb.0: -; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 -; GCN-NEXT: s_wait_expcnt 0x0 -; GCN-NEXT: s_wait_samplecnt 0x0 -; GCN-NEXT: s_wait_bvhcnt 0x0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_and_b32 v9, 0x3ff, v31 -; GCN-NEXT: v_readfirstlane_b32 s0, v6 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GCN-NEXT: v_lshlrev_b32_e32 v9, 2, v9 -; GCN-NEXT: s_mov_b32 m0, s0 -; GCN-NEXT: 
s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_add_co_u32 v7, vcc_lo, v7, v9 -; GCN-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v8, vcc_lo -; GCN-NEXT: global_store_b32 v[7:8], v10, off -; GCN-NEXT: s_wait_storecnt 0x0 -; GCN-NEXT: s_barrier_signal_isfirst m0 -; GCN-NEXT: s_cselect_b32 vcc_lo, -1, 0 -; GCN-NEXT: s_wait_alu 0xfffe -; GCN-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v3, v5, v3 -; GCN-NEXT: global_load_b32 v0, v[0:1], off -; GCN-NEXT: global_load_b32 v1, v[2:3], off -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: v_mul_lo_u32 v0, v1, v0 -; GCN-NEXT: global_store_b32 v[7:8], v0, off -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX12-SDAG-LABEL: test2_s_barrier_signal_isfirst_var: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v10, 0 :: v_dual_and_b32 v9, 0x3ff, v31 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v6 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v9, 2, v9 +; GFX12-SDAG-NEXT: s_mov_b32 m0, s0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_add_co_u32 v7, vcc_lo, v7, v9 +; GFX12-SDAG-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v8, vcc_lo +; GFX12-SDAG-NEXT: global_store_b32 v[7:8], v10, off +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_barrier_signal_isfirst m0 +; GFX12-SDAG-NEXT: s_cselect_b32 vcc_lo, -1, 0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX12-SDAG-NEXT: global_load_b32 v0, v[0:1], off +; GFX12-SDAG-NEXT: global_load_b32 v1, v[2:3], off +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_mul_lo_u32 v0, v1, v0 +; GFX12-SDAG-NEXT: global_store_b32 v[7:8], v0, off +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GLOBAL-ISEL-LABEL: test2_s_barrier_signal_isfirst_var: -; GLOBAL-ISEL: ; %bb.0: -; GLOBAL-ISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_expcnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_samplecnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_bvhcnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v9, 0x3ff, v31 -; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v6 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v9, 2, v9 -; GLOBAL-ISEL-NEXT: v_add_co_u32 v7, vcc_lo, v7, v9 -; GLOBAL-ISEL-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v8, vcc_lo -; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v9, 0 -; GLOBAL-ISEL-NEXT: global_store_b32 v[7:8], v9, off -; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 -; GLOBAL-ISEL-NEXT: s_barrier_signal_isfirst m0 -; GLOBAL-ISEL-NEXT: s_cselect_b32 s0, 1, 0 -; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe -; GLOBAL-ISEL-NEXT: s_and_b32 s0, 1, s0 -; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe -; GLOBAL-ISEL-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GLOBAL-ISEL-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v3, v5, v3 -; GLOBAL-ISEL-NEXT: global_load_b32 v0, v[0:1], off -; GLOBAL-ISEL-NEXT: global_load_b32 v1, v[2:3], off -; GLOBAL-ISEL-NEXT: s_wait_loadcnt 0x0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: global_store_b32 v[7:8], v0, off -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31] +; GFX12-GISEL-LABEL: 
test2_s_barrier_signal_isfirst_var: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_and_b32_e32 v9, 0x3ff, v31 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 m0, v6 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v9, 2, v9 +; GFX12-GISEL-NEXT: v_add_co_u32 v7, vcc_lo, v7, v9 +; GFX12-GISEL-NEXT: v_add_co_ci_u32_e32 v8, vcc_lo, 0, v8, vcc_lo +; GFX12-GISEL-NEXT: v_mov_b32_e32 v9, 0 +; GFX12-GISEL-NEXT: global_store_b32 v[7:8], v9, off +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_barrier_signal_isfirst m0 +; GFX12-GISEL-NEXT: s_cselect_b32 s0, 1, 0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_and_b32 s0, 1, s0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX12-GISEL-NEXT: v_dual_cndmask_b32 v2, v4, v2 :: v_dual_cndmask_b32 v3, v5, v3 +; GFX12-GISEL-NEXT: global_load_b32 v0, v[0:1], off +; GFX12-GISEL-NEXT: global_load_b32 v1, v[2:3], off +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v0, v1, v0 +; GFX12-GISEL-NEXT: global_store_b32 v[7:8], v0, off +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp store i32 0, ptr addrspace(1) %tmp1 @@ -543,40 +543,40 @@ define void @test2_s_barrier_signal_isfirst_var(ptr addrspace(1) %a, ptr addrspa } define amdgpu_kernel void @test1_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 { -; GCN-LABEL: test1_s_barrier_init: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_lshl_b32 s2, s2, 16 -; GCN-NEXT: global_store_b32 v3, v2, s[0:1] -; GCN-NEXT: s_mov_b32 m0, s2 -; GCN-NEXT: s_barrier_init -1 -; GCN-NEXT: global_store_b32 v3, v0, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test1_s_barrier_init: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_lshl_b32 s2, s2, 16 +; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-SDAG-NEXT: s_mov_b32 m0, s2 +; GFX12-SDAG-NEXT: s_barrier_init -1 +; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test1_s_barrier_init: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; 
GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: s_lshl_b32 m0, 16, s2 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GLOBAL-ISEL-NEXT: s_barrier_init -1 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test1_s_barrier_init: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_lshl_b32 m0, 16, s2 +; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-GISEL-NEXT: s_barrier_init -1 +; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -589,40 +589,40 @@ entry: } define amdgpu_kernel void @test2_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 { -; GCN-LABEL: test2_s_barrier_init: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_lshl_b32 s2, s2, 16 -; GCN-NEXT: global_store_b32 v3, v2, s[0:1] -; GCN-NEXT: s_mov_b32 m0, s2 -; GCN-NEXT: s_barrier_init 1 -; GCN-NEXT: global_store_b32 v3, v0, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test2_s_barrier_init: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_lshl_b32 s2, s2, 16 +; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-SDAG-NEXT: s_mov_b32 m0, s2 +; GFX12-SDAG-NEXT: s_barrier_init 1 +; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test2_s_barrier_init: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: s_lshl_b32 m0, 16, s2 -; 
GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GLOBAL-ISEL-NEXT: s_barrier_init 1 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test2_s_barrier_init: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_lshl_b32 m0, 16, s2 +; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-GISEL-NEXT: s_barrier_init 1 +; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -635,40 +635,40 @@ entry: } define amdgpu_kernel void @test3_s_barrier_init(ptr addrspace(1) %out, i32 %mbrCnt) #0 { -; GCN-LABEL: test3_s_barrier_init: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_lshl_b32 s2, s2, 16 -; GCN-NEXT: global_store_b32 v3, v2, s[0:1] -; GCN-NEXT: s_mov_b32 m0, s2 -; GCN-NEXT: s_barrier_init 0 -; GCN-NEXT: global_store_b32 v3, v0, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test3_s_barrier_init: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_lshl_b32 s2, s2, 16 +; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-SDAG-NEXT: s_mov_b32 m0, s2 +; GFX12-SDAG-NEXT: s_barrier_init 0 +; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test3_s_barrier_init: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: s_lshl_b32 m0, 16, s2 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GLOBAL-ISEL-NEXT: s_barrier_init 0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: 
s_endpgm +; GFX12-GISEL-LABEL: test3_s_barrier_init: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_lshl_b32 m0, 16, s2 +; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-GISEL-NEXT: s_barrier_init 0 +; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -681,43 +681,43 @@ entry: } define amdgpu_kernel void @test4_s_barrier_init(ptr addrspace(1) %out, i32 %bar, i32 %mbrCnt) #0 { -; GCN-LABEL: test4_s_barrier_init: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_lshl_b32 s3, s3, 16 -; GCN-NEXT: global_store_b32 v3, v2, s[0:1] -; GCN-NEXT: s_or_b32 s2, s2, s3 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GCN-NEXT: s_mov_b32 m0, s2 -; GCN-NEXT: s_barrier_init m0 -; GCN-NEXT: global_store_b32 v3, v0, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test4_s_barrier_init: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_lshl_b32 s3, s3, 16 +; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-SDAG-NEXT: s_or_b32 s2, s2, s3 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_mov_b32 m0, s2 +; GFX12-SDAG-NEXT: s_barrier_init m0 +; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test4_s_barrier_init: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: s_lshl_b32 s3, 16, s3 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GLOBAL-ISEL-NEXT: s_or_b32 m0, s2, s3 -; GLOBAL-ISEL-NEXT: s_barrier_init m0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; 
GFX12-GISEL-LABEL: test4_s_barrier_init: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_lshl_b32 s3, 16, s3 +; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-GISEL-NEXT: s_or_b32 m0, s2, s3 +; GFX12-GISEL-NEXT: s_barrier_init m0 +; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -730,74 +730,74 @@ entry: } define void @test5_s_barrier_init_m0(i32 %arg1 ,i32 %arg2) { -; GCN-LABEL: test5_s_barrier_init_m0: -; GCN: ; %bb.0: -; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 -; GCN-NEXT: s_wait_expcnt 0x0 -; GCN-NEXT: s_wait_samplecnt 0x0 -; GCN-NEXT: s_wait_bvhcnt 0x0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GCN-NEXT: v_or_b32_e32 v0, v0, v1 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: s_mov_b32 m0, s0 -; GCN-NEXT: s_barrier_init m0 -; GCN-NEXT: s_wait_alu 0xfffe -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX12-SDAG-LABEL: test5_s_barrier_init_m0: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_or_b32_e32 v0, v0, v1 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_mov_b32 m0, s0 +; GFX12-SDAG-NEXT: s_barrier_init m0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GLOBAL-ISEL-LABEL: test5_s_barrier_init_m0: -; GLOBAL-ISEL: ; %bb.0: -; GLOBAL-ISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_expcnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_samplecnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_bvhcnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 s0, v1 -; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 s1, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: s_lshl_b32 s0, 16, s0 -; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe -; GLOBAL-ISEL-NEXT: s_or_b32 m0, s1, s0 -; GLOBAL-ISEL-NEXT: s_barrier_init m0 -; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe -; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31] +; GFX12-GISEL-LABEL: test5_s_barrier_init_m0: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s0, v1 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 s1, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: s_lshl_b32 s0, 16, s0 +; 
GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_or_b32 m0, s1, s0 +; GFX12-GISEL-NEXT: s_barrier_init m0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] call void @llvm.amdgcn.s.barrier.init(i32 %arg1, i32 %arg2) ret void } define amdgpu_kernel void @test1_s_barrier_join(ptr addrspace(1) %out) #0 { -; GCN-LABEL: test1_s_barrier_join: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_barrier_join -1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_store_b32 v2, v0, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test1_s_barrier_join: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_barrier_join -1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test1_s_barrier_join: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GLOBAL-ISEL-NEXT: s_barrier_join -1 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test1_s_barrier_join: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-GISEL-NEXT: s_barrier_join -1 +; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -810,36 +810,36 @@ entry: } define amdgpu_kernel void @test2_s_barrier_join(ptr addrspace(1) %out) #0 { -; GCN-LABEL: test2_s_barrier_join: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_barrier_join 1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GCN-NEXT: v_mul_u32_u24_e32 
v1, v0, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_store_b32 v2, v0, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test2_s_barrier_join: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_barrier_join 1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test2_s_barrier_join: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GLOBAL-ISEL-NEXT: s_barrier_join 1 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test2_s_barrier_join: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-GISEL-NEXT: s_barrier_join 1 +; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -852,36 +852,36 @@ entry: } define amdgpu_kernel void @test3_s_barrier_join(ptr addrspace(1) %out) #0 { -; GCN-LABEL: test3_s_barrier_join: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_barrier_join 0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_store_b32 v2, v0, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test3_s_barrier_join: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_barrier_join 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GFX12-SDAG-NEXT: 
v_lshlrev_b32_e32 v2, 2, v0 +; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test3_s_barrier_join: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GLOBAL-ISEL-NEXT: s_barrier_join 0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test3_s_barrier_join: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-GISEL-NEXT: s_barrier_join 0 +; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -894,39 +894,39 @@ entry: } define amdgpu_kernel void @test4_s_barrier_join_m0(ptr addrspace(1) %out, i32 %bar) #0 { -; GCN-LABEL: test4_s_barrier_join_m0: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GCN-NEXT: v_mul_u32_u24_e32 v2, v0, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 -; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_mov_b32 m0, s2 -; GCN-NEXT: global_store_b32 v3, v1, s[0:1] -; GCN-NEXT: s_barrier_join m0 -; GCN-NEXT: global_store_b32 v3, v0, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test4_s_barrier_join_m0: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v2, v0, v0 +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_mov_b32 m0, s2 +; GFX12-SDAG-NEXT: global_store_b32 v3, v1, s[0:1] +; GFX12-SDAG-NEXT: s_barrier_join m0 +; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test4_s_barrier_join_m0: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], 
s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s2 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GLOBAL-ISEL-NEXT: s_barrier_join m0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test4_s_barrier_join_m0: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_mov_b32 m0, s2 +; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-GISEL-NEXT: s_barrier_join m0 +; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -939,79 +939,79 @@ entry: } define void @test5_s_barrier_join_m0(i32 %arg) { -; GCN-LABEL: test5_s_barrier_join_m0: -; GCN: ; %bb.0: -; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 -; GCN-NEXT: s_wait_expcnt 0x0 -; GCN-NEXT: s_wait_samplecnt 0x0 -; GCN-NEXT: s_wait_bvhcnt 0x0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: s_mov_b32 m0, s0 -; GCN-NEXT: s_barrier_join m0 -; GCN-NEXT: s_wait_alu 0xfffe -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX12-SDAG-LABEL: test5_s_barrier_join_m0: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_mov_b32 m0, s0 +; GFX12-SDAG-NEXT: s_barrier_join m0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GLOBAL-ISEL-LABEL: test5_s_barrier_join_m0: -; GLOBAL-ISEL: ; %bb.0: -; GLOBAL-ISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_expcnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_samplecnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_bvhcnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v0 -; GLOBAL-ISEL-NEXT: s_barrier_join m0 -; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31] +; GFX12-GISEL-LABEL: test5_s_barrier_join_m0: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 m0, v0 +; GFX12-GISEL-NEXT: s_barrier_join m0 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] call void @llvm.amdgcn.s.barrier.join(i32 %arg) ret void } define amdgpu_kernel void @test1_s_barrier_leave(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr 
addrspace(1) %c, ptr addrspace(1) %out) #0 { -; GCN-LABEL: test1_s_barrier_leave: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_store_b32 v0, v1, s[6:7] -; GCN-NEXT: s_barrier_leave -; GCN-NEXT: s_cselect_b32 s3, s3, s5 -; GCN-NEXT: s_cselect_b32 s2, s2, s4 -; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: global_load_b32 v2, v1, s[0:1] -; GCN-NEXT: global_load_b32 v1, v1, s[2:3] -; GCN-NEXT: s_wait_loadcnt 0x0 -; GCN-NEXT: v_mul_lo_u32 v1, v1, v2 -; GCN-NEXT: global_store_b32 v0, v1, s[6:7] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test1_s_barrier_leave: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-SDAG-NEXT: s_barrier_leave +; GFX12-SDAG-NEXT: s_cselect_b32 s3, s3, s5 +; GFX12-SDAG-NEXT: s_cselect_b32 s2, s2, s4 +; GFX12-SDAG-NEXT: s_clause 0x1 +; GFX12-SDAG-NEXT: global_load_b32 v2, v1, s[0:1] +; GFX12-SDAG-NEXT: global_load_b32 v1, v1, s[2:3] +; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test1_s_barrier_leave: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] -; GLOBAL-ISEL-NEXT: s_barrier_leave -; GLOBAL-ISEL-NEXT: s_cselect_b32 s8, 1, 0 -; GLOBAL-ISEL-NEXT: s_and_b32 s8, s8, 1 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GLOBAL-ISEL-NEXT: s_cmp_lg_u32 s8, 0 -; GLOBAL-ISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] -; GLOBAL-ISEL-NEXT: s_clause 0x1 -; GLOBAL-ISEL-NEXT: global_load_b32 v2, v1, s[0:1] -; GLOBAL-ISEL-NEXT: global_load_b32 v1, v1, s[2:3] -; GLOBAL-ISEL-NEXT: s_wait_loadcnt 0x0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v1, v2 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[6:7] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test1_s_barrier_leave: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-GISEL-NEXT: s_barrier_leave +; GFX12-GISEL-NEXT: s_cselect_b32 s8, 1, 0 +; GFX12-GISEL-NEXT: s_and_b32 s8, s8, 1 +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: s_cmp_lg_u32 s8, 0 +; GFX12-GISEL-NEXT: s_cselect_b64 s[2:3], s[2:3], s[4:5] +; 
GFX12-GISEL-NEXT: s_clause 0x1 +; GFX12-GISEL-NEXT: global_load_b32 v2, v1, s[0:1] +; GFX12-GISEL-NEXT: global_load_b32 v1, v1, s[2:3] +; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v1, v2 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[6:7] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -1026,36 +1026,36 @@ entry: } define amdgpu_kernel void @test1_s_wakeup_barrier(ptr addrspace(1) %out) #0 { -; GCN-LABEL: test1_s_wakeup_barrier: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_wakeup_barrier -1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_store_b32 v2, v0, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test1_s_wakeup_barrier: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_wakeup_barrier -1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test1_s_wakeup_barrier: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GLOBAL-ISEL-NEXT: s_wakeup_barrier -1 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test1_s_wakeup_barrier: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-GISEL-NEXT: s_wakeup_barrier -1 +; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -1068,36 +1068,36 @@ entry: } define amdgpu_kernel void @test2_s_wakeup_barrier(ptr addrspace(1) %out) #0 { -; 
GCN-LABEL: test2_s_wakeup_barrier: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_wakeup_barrier 1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_store_b32 v2, v0, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test2_s_wakeup_barrier: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_wakeup_barrier 1 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test2_s_wakeup_barrier: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GLOBAL-ISEL-NEXT: s_wakeup_barrier 1 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test2_s_wakeup_barrier: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-GISEL-NEXT: s_wakeup_barrier 1 +; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -1110,36 +1110,36 @@ entry: } define amdgpu_kernel void @test3_s_wakeup_barrier(ptr addrspace(1) %out) #0 { -; GCN-LABEL: test3_s_wakeup_barrier: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_wakeup_barrier 0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v2, 2, v0 -; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_store_b32 v2, v0, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test3_s_wakeup_barrier: +; GFX12-SDAG: ; 
%bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_wakeup_barrier 0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test3_s_wakeup_barrier: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GLOBAL-ISEL-NEXT: s_wakeup_barrier 0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test3_s_wakeup_barrier: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-GISEL-NEXT: s_wakeup_barrier 0 +; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -1152,39 +1152,39 @@ entry: } define amdgpu_kernel void @test4_s_wakeup_barrier_m0(ptr addrspace(1) %out, i32 %bar) #0 { -; GCN-LABEL: test4_s_wakeup_barrier_m0: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) -; GCN-NEXT: v_mul_u32_u24_e32 v2, v0, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 2, v0 -; GCN-NEXT: v_sub_nc_u32_e32 v0, v2, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_mov_b32 m0, s2 -; GCN-NEXT: global_store_b32 v3, v1, s[0:1] -; GCN-NEXT: s_wakeup_barrier m0 -; GCN-NEXT: global_store_b32 v3, v0, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test4_s_wakeup_barrier_m0: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v2, v0, v0 +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v3, 2, v0 +; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v2, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_mov_b32 m0, s2 +; GFX12-SDAG-NEXT: global_store_b32 v3, v1, 
s[0:1] +; GFX12-SDAG-NEXT: s_wakeup_barrier m0 +; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test4_s_wakeup_barrier_m0: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s2 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GLOBAL-ISEL-NEXT: s_wakeup_barrier m0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test4_s_wakeup_barrier_m0: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_mov_b32 m0, s2 +; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-GISEL-NEXT: s_wakeup_barrier m0 +; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -1197,63 +1197,63 @@ entry: } define void @test5_s_wakeup_barrier_m0(i32 %arg) { -; GCN-LABEL: test5_s_wakeup_barrier_m0: -; GCN: ; %bb.0: -; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 -; GCN-NEXT: s_wait_expcnt 0x0 -; GCN-NEXT: s_wait_samplecnt 0x0 -; GCN-NEXT: s_wait_bvhcnt 0x0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: s_mov_b32 m0, s0 -; GCN-NEXT: s_wakeup_barrier m0 -; GCN-NEXT: s_wait_alu 0xfffe -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX12-SDAG-LABEL: test5_s_wakeup_barrier_m0: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-SDAG-NEXT: s_mov_b32 m0, s0 +; GFX12-SDAG-NEXT: s_wakeup_barrier m0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GLOBAL-ISEL-LABEL: test5_s_wakeup_barrier_m0: -; GLOBAL-ISEL: ; %bb.0: -; GLOBAL-ISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_expcnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_samplecnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_bvhcnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v0 -; GLOBAL-ISEL-NEXT: s_wakeup_barrier m0 -; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31] +; GFX12-GISEL-LABEL: test5_s_wakeup_barrier_m0: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 
0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 m0, v0 +; GFX12-GISEL-NEXT: s_wakeup_barrier m0 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] call void @llvm.amdgcn.s.wakeup.barrier(i32 %arg) ret void } define amdgpu_kernel void @test1_s_get_barrier_state(ptr addrspace(1) %out) #0 { -; GCN-LABEL: test1_s_get_barrier_state: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_get_barrier_state s4, -1 -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: global_store_b32 v0, v1, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test1_s_get_barrier_state: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_get_barrier_state s4, -1 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test1_s_get_barrier_state: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, -1 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test1_s_get_barrier_state: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-GISEL-NEXT: s_get_barrier_state s2, -1 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -1264,34 +1264,34 @@ entry: } define amdgpu_kernel void @test2_s_get_barrier_state(ptr addrspace(1) %out) #0 { -; GCN-LABEL: test2_s_get_barrier_state: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_get_barrier_state s4, 1 -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, 
v0 -; GCN-NEXT: global_store_b32 v0, v1, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test2_s_get_barrier_state: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_get_barrier_state s4, 1 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test2_s_get_barrier_state: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, 1 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test2_s_get_barrier_state: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-GISEL-NEXT: s_get_barrier_state s2, 1 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -1302,34 +1302,34 @@ entry: } define amdgpu_kernel void @test3_s_get_barrier_state(ptr addrspace(1) %out) #0 { -; GCN-LABEL: test3_s_get_barrier_state: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_get_barrier_state s4, 0 -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: global_store_b32 v0, v1, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test3_s_get_barrier_state: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_get_barrier_state s4, 0 +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: 
s_endpgm ; -; GLOBAL-ISEL-LABEL: test3_s_get_barrier_state: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, 0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test3_s_get_barrier_state: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) +; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-GISEL-NEXT: s_get_barrier_state s2, 0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -1340,41 +1340,23 @@ entry: } define amdgpu_kernel void @test4_s_get_barrier_state_m0(ptr addrspace(1) %out, i32 %bar) #0 { -; GCN-LABEL: test4_s_get_barrier_state_m0: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GCN-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GCN-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_mov_b32 m0, s2 -; GCN-NEXT: global_store_b32 v0, v1, s[0:1] -; GCN-NEXT: s_get_barrier_state s2, m0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_delay_alu instid0(SALU_CYCLE_2) -; GCN-NEXT: v_mov_b32_e32 v1, s2 -; GCN-NEXT: global_store_b32 v0, v1, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm -; -; GLOBAL-ISEL-LABEL: test4_s_get_barrier_state_m0: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: s_mov_b32 m0, s2 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GLOBAL-ISEL-NEXT: s_get_barrier_state s2, m0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_2) -; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v1, s2 -; GLOBAL-ISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-LABEL: test4_s_get_barrier_state_m0: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b96 s[0:2], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_mov_b32 m0, s2 +; GFX12-NEXT: 
global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_get_barrier_state s2, m0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_2) +; GFX12-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -1385,76 +1367,76 @@ entry: } define i32 @test5_s_get_barrier_state_m0(i32 %arg) { -; GCN-LABEL: test5_s_get_barrier_state_m0: -; GCN: ; %bb.0: -; GCN-NEXT: s_wait_loadcnt_dscnt 0x0 -; GCN-NEXT: s_wait_expcnt 0x0 -; GCN-NEXT: s_wait_samplecnt 0x0 -; GCN-NEXT: s_wait_bvhcnt 0x0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: v_readfirstlane_b32 s0, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) -; GCN-NEXT: s_mov_b32 m0, s0 -; GCN-NEXT: s_get_barrier_state s0, m0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: s_wait_alu 0xfffe -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: s_setpc_b64 s[30:31] +; GFX12-SDAG-LABEL: test5_s_get_barrier_state_m0: +; GFX12-SDAG: ; %bb.0: +; GFX12-SDAG-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-SDAG-NEXT: s_wait_expcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_samplecnt 0x0 +; GFX12-SDAG-NEXT: s_wait_bvhcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: v_readfirstlane_b32 s0, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(SALU_CYCLE_1) +; GFX12-SDAG-NEXT: s_mov_b32 m0, s0 +; GFX12-SDAG-NEXT: s_get_barrier_state s0, m0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: s_wait_alu 0xfffe +; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-SDAG-NEXT: s_setpc_b64 s[30:31] ; -; GLOBAL-ISEL-LABEL: test5_s_get_barrier_state_m0: -; GLOBAL-ISEL: ; %bb.0: -; GLOBAL-ISEL-NEXT: s_wait_loadcnt_dscnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_expcnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_samplecnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_bvhcnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: v_readfirstlane_b32 m0, v0 -; GLOBAL-ISEL-NEXT: s_get_barrier_state s0, m0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: s_wait_alu 0xfffe -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GLOBAL-ISEL-NEXT: v_mov_b32_e32 v0, s0 -; GLOBAL-ISEL-NEXT: s_setpc_b64 s[30:31] +; GFX12-GISEL-LABEL: test5_s_get_barrier_state_m0: +; GFX12-GISEL: ; %bb.0: +; GFX12-GISEL-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-GISEL-NEXT: s_wait_expcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_samplecnt 0x0 +; GFX12-GISEL-NEXT: s_wait_bvhcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: v_readfirstlane_b32 m0, v0 +; GFX12-GISEL-NEXT: s_get_barrier_state s0, m0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: s_wait_alu 0xfffe +; GFX12-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-GISEL-NEXT: s_setpc_b64 s[30:31] %state = call i32 @llvm.amdgcn.s.get.barrier.state(i32 %arg) ret i32 %state } define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 { -; GCN-LABEL: test_barrier_convert: -; GCN: ; %bb.0: ; %entry -; GCN-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GCN-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GCN-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GCN-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GCN-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GCN-NEXT: s_wait_kmcnt 0x0 -; GCN-NEXT: global_store_b32 v3, v2, s[0:1] -; 
GCN-NEXT: s_wait_storecnt 0x0 -; GCN-NEXT: s_barrier_signal -1 -; GCN-NEXT: s_barrier_wait -1 -; GCN-NEXT: global_store_b32 v3, v0, s[0:1] -; GCN-NEXT: s_nop 0 -; GCN-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GCN-NEXT: s_endpgm +; GFX12-SDAG-LABEL: test_barrier_convert: +; GFX12-SDAG: ; %bb.0: ; %entry +; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 +; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 +; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-SDAG-NEXT: s_wait_storecnt 0x0 +; GFX12-SDAG-NEXT: s_barrier_signal -1 +; GFX12-SDAG-NEXT: s_barrier_wait -1 +; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-SDAG-NEXT: s_nop 0 +; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-SDAG-NEXT: s_endpgm ; -; GLOBAL-ISEL-LABEL: test_barrier_convert: -; GLOBAL-ISEL: ; %bb.0: ; %entry -; GLOBAL-ISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GLOBAL-ISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GLOBAL-ISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GLOBAL-ISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 -; GLOBAL-ISEL-NEXT: v_mul_lo_u32 v1, v0, v0 -; GLOBAL-ISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 -; GLOBAL-ISEL-NEXT: s_wait_kmcnt 0x0 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v2, s[0:1] -; GLOBAL-ISEL-NEXT: s_wait_storecnt 0x0 -; GLOBAL-ISEL-NEXT: s_barrier_signal -1 -; GLOBAL-ISEL-NEXT: s_barrier_wait -1 -; GLOBAL-ISEL-NEXT: global_store_b32 v3, v0, s[0:1] -; GLOBAL-ISEL-NEXT: s_nop 0 -; GLOBAL-ISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GLOBAL-ISEL-NEXT: s_endpgm +; GFX12-GISEL-LABEL: test_barrier_convert: +; GFX12-GISEL: ; %bb.0: ; %entry +; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-GISEL-NEXT: v_and_b32_e32 v0, 0x3ff, v0 +; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 +; GFX12-GISEL-NEXT: v_mul_lo_u32 v1, v0, v0 +; GFX12-GISEL-NEXT: v_sub_nc_u32_e32 v0, v1, v0 +; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 +; GFX12-GISEL-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-GISEL-NEXT: s_wait_storecnt 0x0 +; GFX12-GISEL-NEXT: s_barrier_signal -1 +; GFX12-GISEL-NEXT: s_barrier_wait -1 +; GFX12-GISEL-NEXT: global_store_b32 v3, v0, s[0:1] +; GFX12-GISEL-NEXT: s_nop 0 +; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-GISEL-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp From 57bee1e4322d34f9760563081f9c10b6850bc2bf Mon Sep 17 00:00:00 2001 From: Xing Guo Date: Tue, 24 Sep 2024 21:21:42 +0800 Subject: [PATCH 09/49] [JITLink] Add support for R_X86_64_PC16 relocation type. (#109630) This patch adds support for R_X86_64_PC16 relocation type and x86_64::Delta16 edge kind. This patch also adds missing test cases for R_X86_64_PC32, R_X86_64_PC64 relocation types. 
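As a rough illustration (not taken from the patch; the addresses here are invented), the new edge kind records a PC-relative delta that must fit in a signed 16-bit value: with the fixup location at 0x401010, `main` resolving to 0x401000, and addend 0, the stored value is 0x401000 - 0x401010 = -16 (0xFFF0 as an int16); any delta outside [-32768, 32767] is reported through makeTargetOutOfRangeError, matching the existing Delta8/Delta32 handling. In an assembly input, a directive such as

  .short main-.   # emits R_X86_64_PC16, handled as x86_64::Delta16

is enough to exercise the relocation, as the updated test below also shows.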
--- .../llvm/ExecutionEngine/JITLink/x86_64.h | 22 +++++++++++++++++++ .../ExecutionEngine/JITLink/ELF_x86_64.cpp | 3 +++ llvm/lib/ExecutionEngine/JITLink/x86_64.cpp | 2 ++ .../{ELF_R_X86_64_PC8.s => ELF_R_X86_64_PC.s} | 5 ++++- 4 files changed, 31 insertions(+), 1 deletion(-) rename llvm/test/ExecutionEngine/JITLink/x86-64/{ELF_R_X86_64_PC8.s => ELF_R_X86_64_PC.s} (61%) diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h b/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h index 2f475ed884a7e2..24cf982fc3ab0f 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h @@ -95,6 +95,19 @@ enum EdgeKind_x86_64 : Edge::Kind { /// Delta32, + /// A 16-bit delta. + /// + /// Delta from the fixup to the target. + /// + /// Fixup expression: + /// Fixup <- Target - Fixup + Addend : int16 + /// + /// Errors: + /// - The result of the fixup expression must fit into an int16, otherwise + /// an out-of-range error will be returned. + /// + Delta16, + /// An 8-bit delta. /// /// Delta from the fixup to the target. @@ -486,6 +499,15 @@ inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E, break; } + case Delta16: { + int64_t Value = E.getTarget().getAddress() - FixupAddress + E.getAddend(); + if (LLVM_LIKELY(isInt<16>(Value))) + *(little16_t *)FixupPtr = Value; + else + return makeTargetOutOfRangeError(G, B, E); + break; + } + case Delta8: { int64_t Value = E.getTarget().getAddress() - FixupAddress + E.getAddend(); if (LLVM_LIKELY(isInt<8>(Value))) diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp index b27a1a19acae0a..6a32ccc3776510 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp @@ -156,6 +156,9 @@ class ELFLinkGraphBuilder_x86_64 : public ELFLinkGraphBuilder { case ELF::R_X86_64_PC8: Kind = x86_64::Delta8; break; + case ELF::R_X86_64_PC16: + Kind = x86_64::Delta16; + break; case ELF::R_X86_64_PC32: case ELF::R_X86_64_GOTPC32: Kind = x86_64::Delta32; diff --git a/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp index 9f7ece8ffbbb3f..cca4358a377660 100644 --- a/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp @@ -34,6 +34,8 @@ const char *getEdgeKindName(Edge::Kind K) { return "Delta64"; case Delta32: return "Delta32"; + case Delta16: + return "Delta16"; case Delta8: return "Delta8"; case NegDelta64: diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_R_X86_64_PC8.s b/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_R_X86_64_PC.s similarity index 61% rename from llvm/test/ExecutionEngine/JITLink/x86-64/ELF_R_X86_64_PC8.s rename to llvm/test/ExecutionEngine/JITLink/x86-64/ELF_R_X86_64_PC.s index 46b851a836abb8..d88875e3855114 100644 --- a/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_R_X86_64_PC8.s +++ b/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_R_X86_64_PC.s @@ -2,7 +2,7 @@ # RUN: -filetype=obj -o %t.o %s # RUN: llvm-jitlink -noexec %t.o # -# Check R_X86_64_PC8 handling. +# Check R_X86_64_PC* handling. .text .globl main @@ -14,3 +14,6 @@ main: .rodata .byte main-. # Generate R_X86_64_PC8 relocation. + .short main-. # Generate R_X86_64_PC16 relocation. + .long main-. # Generate R_X86_64_PC32 relocation. + .quad main-. # Generate R_X86_64_PC64 relocation. 
From 02a334de6690202154ef09456c581618ff290f9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Nathan=20Gau=C3=ABr?= Date: Tue, 24 Sep 2024 15:24:21 +0200 Subject: [PATCH 10/49] [SPIR-V] Fix bad insertion for type/id MIR (#109686) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Those instructions were inserted either after the instruction using it, or in the middle of the module. The first directly causes an issue. The second causes a more subtle issue: the first time the type is inserted, the emission is fine, but the second time, the first instruction is reused, without checking its position in the function. This can lead to the second usage dominating the definition. In SPIR-V, types are usually in the header, above all code definitions, but at this stage I don't think we can do that, so what I do instead is to emit it in the first basic block. This commit reduces the failed tests with expensive checks from 107 to 71. Signed-off-by: Nathan Gauër --- llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp | 128 ++++++++++++------ llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h | 11 ++ llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp | 4 +- 3 files changed, 98 insertions(+), 45 deletions(-) diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp index ca3e47a4b78f23..3e1873e899680c 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.cpp @@ -25,6 +25,7 @@ #include "llvm/IR/Type.h" #include "llvm/Support/Casting.h" #include +#include using namespace llvm; SPIRVGlobalRegistry::SPIRVGlobalRegistry(unsigned PointerSize) @@ -83,8 +84,11 @@ inline Register createTypeVReg(MachineIRBuilder &MIRBuilder) { } SPIRVType *SPIRVGlobalRegistry::getOpTypeBool(MachineIRBuilder &MIRBuilder) { - return MIRBuilder.buildInstr(SPIRV::OpTypeBool) - .addDef(createTypeVReg(MIRBuilder)); + + return createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) { + return MIRBuilder.buildInstr(SPIRV::OpTypeBool) + .addDef(createTypeVReg(MIRBuilder)); + }); } unsigned SPIRVGlobalRegistry::adjustOpTypeIntWidth(unsigned Width) const { @@ -118,24 +122,53 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeInt(unsigned Width, MIRBuilder.buildInstr(SPIRV::OpCapability) .addImm(SPIRV::Capability::ArbitraryPrecisionIntegersINTEL); } - auto MIB = MIRBuilder.buildInstr(SPIRV::OpTypeInt) - .addDef(createTypeVReg(MIRBuilder)) - .addImm(Width) - .addImm(IsSigned ? 1 : 0); - return MIB; + return createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) { + return MIRBuilder.buildInstr(SPIRV::OpTypeInt) + .addDef(createTypeVReg(MIRBuilder)) + .addImm(Width) + .addImm(IsSigned ?
1 : 0); + }); } SPIRVType *SPIRVGlobalRegistry::getOpTypeFloat(uint32_t Width, MachineIRBuilder &MIRBuilder) { - auto MIB = MIRBuilder.buildInstr(SPIRV::OpTypeFloat) - .addDef(createTypeVReg(MIRBuilder)) - .addImm(Width); - return MIB; + return createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) { + return MIRBuilder.buildInstr(SPIRV::OpTypeFloat) + .addDef(createTypeVReg(MIRBuilder)) + .addImm(Width); + }); } SPIRVType *SPIRVGlobalRegistry::getOpTypeVoid(MachineIRBuilder &MIRBuilder) { - return MIRBuilder.buildInstr(SPIRV::OpTypeVoid) - .addDef(createTypeVReg(MIRBuilder)); + return createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) { + return MIRBuilder.buildInstr(SPIRV::OpTypeVoid) + .addDef(createTypeVReg(MIRBuilder)); + }); +} + +SPIRVType *SPIRVGlobalRegistry::createOpType( + MachineIRBuilder &MIRBuilder, + std::function Op) { + auto oldInsertPoint = MIRBuilder.getInsertPt(); + MachineBasicBlock *OldMBB = &MIRBuilder.getMBB(); + + auto LastInsertedType = LastInsertedTypeMap.find(CurMF); + if (LastInsertedType != LastInsertedTypeMap.end()) { + MIRBuilder.setInsertPt(*MIRBuilder.getMF().begin(), + LastInsertedType->second->getIterator()); + } else { + MIRBuilder.setInsertPt(*MIRBuilder.getMF().begin(), + MIRBuilder.getMF().begin()->begin()); + auto Result = LastInsertedTypeMap.try_emplace(CurMF, nullptr); + assert(Result.second); + LastInsertedType = Result.first; + } + + MachineInstr *Type = Op(MIRBuilder); + LastInsertedType->second = Type; + + MIRBuilder.setInsertPt(*OldMBB, oldInsertPoint); + return Type; } SPIRVType *SPIRVGlobalRegistry::getOpTypeVector(uint32_t NumElems, @@ -147,11 +180,12 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeVector(uint32_t NumElems, EleOpc == SPIRV::OpTypeBool) && "Invalid vector element type"); - auto MIB = MIRBuilder.buildInstr(SPIRV::OpTypeVector) - .addDef(createTypeVReg(MIRBuilder)) - .addUse(getSPIRVTypeID(ElemType)) - .addImm(NumElems); - return MIB; + return createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) { + return MIRBuilder.buildInstr(SPIRV::OpTypeVector) + .addDef(createTypeVReg(MIRBuilder)) + .addUse(getSPIRVTypeID(ElemType)) + .addImm(NumElems); + }); } std::tuple @@ -688,11 +722,12 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeArray(uint32_t NumElems, SPIRVType *SpvTypeInt32 = getOrCreateSPIRVIntegerType(32, MIRBuilder); Register NumElementsVReg = buildConstantInt(NumElems, MIRBuilder, SpvTypeInt32, EmitIR); - auto MIB = MIRBuilder.buildInstr(SPIRV::OpTypeArray) - .addDef(createTypeVReg(MIRBuilder)) - .addUse(getSPIRVTypeID(ElemType)) - .addUse(NumElementsVReg); - return MIB; + return createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) { + return MIRBuilder.buildInstr(SPIRV::OpTypeArray) + .addDef(createTypeVReg(MIRBuilder)) + .addUse(getSPIRVTypeID(ElemType)) + .addUse(NumElementsVReg); + }); } SPIRVType *SPIRVGlobalRegistry::getOpTypeOpaque(const StructType *Ty, @@ -700,10 +735,12 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeOpaque(const StructType *Ty, assert(Ty->hasName()); const StringRef Name = Ty->hasName() ? 
Ty->getName() : ""; Register ResVReg = createTypeVReg(MIRBuilder); - auto MIB = MIRBuilder.buildInstr(SPIRV::OpTypeOpaque).addDef(ResVReg); - addStringImm(Name, MIB); - buildOpName(ResVReg, Name, MIRBuilder); - return MIB; + return createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) { + auto MIB = MIRBuilder.buildInstr(SPIRV::OpTypeOpaque).addDef(ResVReg); + addStringImm(Name, MIB); + buildOpName(ResVReg, Name, MIRBuilder); + return MIB; + }); } SPIRVType *SPIRVGlobalRegistry::getOpTypeStruct(const StructType *Ty, @@ -717,14 +754,16 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypeStruct(const StructType *Ty, FieldTypes.push_back(getSPIRVTypeID(ElemTy)); } Register ResVReg = createTypeVReg(MIRBuilder); - auto MIB = MIRBuilder.buildInstr(SPIRV::OpTypeStruct).addDef(ResVReg); - for (const auto &Ty : FieldTypes) - MIB.addUse(Ty); - if (Ty->hasName()) - buildOpName(ResVReg, Ty->getName(), MIRBuilder); - if (Ty->isPacked()) - buildOpDecorate(ResVReg, MIRBuilder, SPIRV::Decoration::CPacked, {}); - return MIB; + return createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) { + auto MIB = MIRBuilder.buildInstr(SPIRV::OpTypeStruct).addDef(ResVReg); + for (const auto &Ty : FieldTypes) + MIB.addUse(Ty); + if (Ty->hasName()) + buildOpName(ResVReg, Ty->getName(), MIRBuilder); + if (Ty->isPacked()) + buildOpDecorate(ResVReg, MIRBuilder, SPIRV::Decoration::CPacked, {}); + return MIB; + }); } SPIRVType *SPIRVGlobalRegistry::getOrCreateSpecialType( @@ -739,17 +778,22 @@ SPIRVType *SPIRVGlobalRegistry::getOpTypePointer( MachineIRBuilder &MIRBuilder, Register Reg) { if (!Reg.isValid()) Reg = createTypeVReg(MIRBuilder); - return MIRBuilder.buildInstr(SPIRV::OpTypePointer) - .addDef(Reg) - .addImm(static_cast(SC)) - .addUse(getSPIRVTypeID(ElemType)); + + return createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) { + return MIRBuilder.buildInstr(SPIRV::OpTypePointer) + .addDef(Reg) + .addImm(static_cast(SC)) + .addUse(getSPIRVTypeID(ElemType)); + }); } SPIRVType *SPIRVGlobalRegistry::getOpTypeForwardPointer( SPIRV::StorageClass::StorageClass SC, MachineIRBuilder &MIRBuilder) { - return MIRBuilder.buildInstr(SPIRV::OpTypeForwardPointer) - .addUse(createTypeVReg(MIRBuilder)) - .addImm(static_cast(SC)); + return createOpType(MIRBuilder, [&](MachineIRBuilder &MIRBuilder) { + return MIRBuilder.buildInstr(SPIRV::OpTypeForwardPointer) + .addUse(createTypeVReg(MIRBuilder)) + .addImm(static_cast(SC)); + }); } SPIRVType *SPIRVGlobalRegistry::getOpTypeFunction( diff --git a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h index ed9cfc07132430..cad2bf96adf33e 100644 --- a/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h +++ b/llvm/lib/Target/SPIRV/SPIRVGlobalRegistry.h @@ -64,6 +64,10 @@ class SPIRVGlobalRegistry { SmallPtrSet TypesInProcessing; DenseMap ForwardPointerTypes; + // Stores for each function the last inserted SPIR-V Type. + // See: SPIRVGlobalRegistry::createOpType. + DenseMap LastInsertedTypeMap; + // if a function returns a pointer, this is to map it into TypedPointerType DenseMap FunResPointerTypes; @@ -97,6 +101,13 @@ class SPIRVGlobalRegistry { SPIRV::AccessQualifier::AccessQualifier AccessQual, bool EmitIR); + // Internal function creating the an OpType at the correct position in the + // function by tweaking the passed "MIRBuilder" insertion point and restoring + // it to the correct position. "Op" should be the function creating the + // specific OpType you need, and should return the newly created instruction. 
+ SPIRVType *createOpType(MachineIRBuilder &MIRBuilder, + std::function Op); + public: SPIRVGlobalRegistry(unsigned PointerSize); diff --git a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp index f1b10e264781f2..cd0aff1a518439 100644 --- a/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVPreLegalizer.cpp @@ -389,9 +389,7 @@ void processInstr(MachineInstr &MI, MachineIRBuilder &MIB, createNewIdReg(nullptr, MI.getOperand(0).getReg(), MRI, *GR).first; AssignTypeInst.getOperand(1).setReg(NewReg); MI.getOperand(0).setReg(NewReg); - MIB.setInsertPt(*MI.getParent(), - (MI.getNextNode() ? MI.getNextNode()->getIterator() - : MI.getParent()->end())); + MIB.setInsertPt(*MI.getParent(), MI.getIterator()); for (auto &Op : MI.operands()) { if (!Op.isReg() || Op.isDef()) continue; From fe7bc872aad83fc0d5cf998230df752e73bb696d Mon Sep 17 00:00:00 2001 From: Zibi Sarbinowski Date: Tue, 24 Sep 2024 09:42:23 -0400 Subject: [PATCH 11/49] [DebugInfo][z/OS] XFAIL debug-ranges-duplication.ll on z/OS (#109681) Same fix was provided for AIX in commit 704da919bafa5b088223f9d77424f24ae754539e. The issue is an unsupported DWARF 5 section with the following assertion: `Assertion failed: Section && "Cannot switch to a null section!", file: llvm/lib/MC/MCStreamer.cpp, line: 1266 ` --- llvm/test/DebugInfo/Generic/debug-ranges-duplication.ll | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/test/DebugInfo/Generic/debug-ranges-duplication.ll b/llvm/test/DebugInfo/Generic/debug-ranges-duplication.ll index e9c23100eeda52..b31469e899d650 100644 --- a/llvm/test/DebugInfo/Generic/debug-ranges-duplication.ll +++ b/llvm/test/DebugInfo/Generic/debug-ranges-duplication.ll @@ -1,5 +1,5 @@ ; AIX doesn't currently support DWARF 5 section .debug_rnglists -; XFAIL: target={{.*}}-aix{{.*}} +; XFAIL: target={{.*}}-zos{{.*}}, target={{.*}}-aix{{.*}} ; RUN: %llc_dwarf -O0 -filetype=obj < %s | llvm-dwarfdump -debug-info - | FileCheck %s ; From 0de1e3e787c6d78b87ccb865ba2f2daf8d8e4ecc Mon Sep 17 00:00:00 2001 From: Zibi Sarbinowski Date: Tue, 24 Sep 2024 09:43:02 -0400 Subject: [PATCH 12/49] [SystemZ][z/OS] Fix llvm-ctxprof to open input files in text mode (#109691) Reading text files on z/OS relies on auto conversion to handle ASCII/EBCDIC correctly. For this to work, files need to be opened in text mode if that is the type of the file. This PR fixes the `llvm-ctxprof` utility in this regard, which in turn fixes the following LIT failure on z/OS: `FAIL: LLVM :: Analysis/CtxProfAnalysis/flatten-zero-path.ll` --- llvm/tools/llvm-ctxprof-util/llvm-ctxprof-util.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/llvm/tools/llvm-ctxprof-util/llvm-ctxprof-util.cpp b/llvm/tools/llvm-ctxprof-util/llvm-ctxprof-util.cpp index 0fad4ee4360ddf..485f6c7d33d902 100644 --- a/llvm/tools/llvm-ctxprof-util/llvm-ctxprof-util.cpp +++ b/llvm/tools/llvm-ctxprof-util/llvm-ctxprof-util.cpp @@ -48,7 +48,8 @@ static cl::opt OutputFilename("output", cl::value_desc("output"), // Save the bitstream profile from the JSON representation.
Error convertFromJSON() { - auto BufOrError = MemoryBuffer::getFileOrSTDIN(InputFilename); + auto BufOrError = + MemoryBuffer::getFileOrSTDIN(InputFilename, /*IsText=*/true); if (!BufOrError) return createFileError(InputFilename, BufOrError.getError()); From 622ae7ffa432ca5985cf1655cf499e04b289bbf2 Mon Sep 17 00:00:00 2001 From: Paul Walker Date: Tue, 24 Sep 2024 15:11:36 +0100 Subject: [PATCH 13/49] [LLVM][InstCombine][AArch64] sve.insr(splat(x), x) ==> splat(x) (#109445) Fixes https://github.com/llvm/llvm-project/issues/100497 --- .../AArch64/AArch64TargetTransformInfo.cpp | 12 +++ .../InstCombine/AArch64/sve-intrinsic-insr.ll | 73 +++++++++++++++++++ 2 files changed, 85 insertions(+) create mode 100644 llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-insr.ll diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index da0798ebf79578..ac05a44abc2dd9 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -2140,6 +2140,16 @@ static std::optional instCombineSVESrshl(InstCombiner &IC, return IC.replaceInstUsesWith(II, LSL); } +static std::optional instCombineSVEInsr(InstCombiner &IC, + IntrinsicInst &II) { + Value *Vec = II.getOperand(0); + + if (getSplatValue(Vec) == II.getOperand(1)) + return IC.replaceInstUsesWith(II, Vec); + + return std::nullopt; +} + std::optional AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { @@ -2460,6 +2470,8 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, return instCombineSVESrshl(IC, II); case Intrinsic::aarch64_sve_dupq_lane: return instCombineSVEDupqLane(IC, II); + case Intrinsic::aarch64_sve_insr: + return instCombineSVEInsr(IC, II); } return std::nullopt; diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-insr.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-insr.ll new file mode 100644 index 00000000000000..e8489c5be85c41 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-insr.ll @@ -0,0 +1,73 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -S -passes=instcombine < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +define @insr_val_into_splatted_val_int(i8 %a) #0 { +; CHECK-LABEL: @insr_val_into_splatted_val_int( +; CHECK-NEXT: [[T0:%.*]] = insertelement poison, i8 [[A:%.*]], i64 0 +; CHECK-NEXT: [[T1:%.*]] = shufflevector [[T0]], poison, zeroinitializer +; CHECK-NEXT: ret [[T1]] +; + %t0 = insertelement poison, i8 %a, i64 0 + %t1 = shufflevector %t0, poison, zeroinitializer + %t2 = tail call @llvm.aarch64.sve.insr.nxv16i8( %t1, i8 %a) + ret %t2 +} + +define @insr_five_into_fives() #0 { +; CHECK-LABEL: @insr_five_into_fives( +; CHECK-NEXT: ret shufflevector ( insertelement ( poison, i16 5, i64 0), poison, zeroinitializer) +; + %t1 = tail call @llvm.aarch64.sve.insr.nxv8i16( splat (i16 5), i16 5) + ret %t1 +} + +define @insr_val_into_splatted_val_fp(float %a) #0 { +; CHECK-LABEL: @insr_val_into_splatted_val_fp( +; CHECK-NEXT: [[T0:%.*]] = insertelement poison, float [[A:%.*]], i64 0 +; CHECK-NEXT: [[T1:%.*]] = shufflevector [[T0]], poison, zeroinitializer +; CHECK-NEXT: ret [[T1]] +; + %t0 = insertelement poison, float %a, i64 0 + %t1 = shufflevector %t0, poison, zeroinitializer + %t2 = tail call @llvm.aarch64.sve.insr.nxv4f32( %t1, float %a) + ret %t2 +} + +define @insr_zero_into_zero() #0 { +; CHECK-LABEL: @insr_zero_into_zero( +; CHECK-NEXT: ret 
zeroinitializer +; + %t1 = tail call @llvm.aarch64.sve.insr.nxv2f64( zeroinitializer, double zeroinitializer) + ret %t1 +} + +define @insr_val_into_splatted_other(i8 %a, i8 %b) #0 { +; CHECK-LABEL: @insr_val_into_splatted_other( +; CHECK-NEXT: [[T0:%.*]] = insertelement poison, i8 [[B:%.*]], i64 0 +; CHECK-NEXT: [[T1:%.*]] = shufflevector [[T0]], poison, zeroinitializer +; CHECK-NEXT: [[T2:%.*]] = tail call @llvm.aarch64.sve.insr.nxv16i8( [[T1]], i8 [[A:%.*]]) +; CHECK-NEXT: ret [[T2]] +; + %t0 = insertelement poison, i8 %b, i64 0 + %t1 = shufflevector %t0, poison, zeroinitializer + %t2 = tail call @llvm.aarch64.sve.insr.nxv16i8( %t1, i8 %a) + ret %t2 +} + +define @insr_three_into_fives() #0 { +; CHECK-LABEL: @insr_three_into_fives( +; CHECK-NEXT: [[T1:%.*]] = tail call @llvm.aarch64.sve.insr.nxv8i16( shufflevector ( insertelement ( poison, i16 5, i64 0), poison, zeroinitializer), i16 3) +; CHECK-NEXT: ret [[T1]] +; + %t1 = tail call @llvm.aarch64.sve.insr.nxv8i16( splat (i16 5), i16 3) + ret %t1 +} + +declare @llvm.aarch64.sve.insr.nxv16i8(, i8) +declare @llvm.aarch64.sve.insr.nxv8i16(, i16) +declare @llvm.aarch64.sve.insr.nxv4f32(, float) +declare @llvm.aarch64.sve.insr.nxv2f64(, double) + +attributes #0 = { "target-features"="+sve" } From 216e1b90c4a988880b772b8d152769f7d0cac0c6 Mon Sep 17 00:00:00 2001 From: davidtrevelyan Date: Tue, 24 Sep 2024 15:36:07 +0100 Subject: [PATCH 14/49] [rtsan] Remove std::variant from rtsan diagnostics (#109786) Following issue #109529 and PR #109715, this PR removes the `std::variant` in rtsan's diagnostics code, in favour of a solution by `enum` without the C++ runtime. --- compiler-rt/lib/rtsan/rtsan.cpp | 6 ++- compiler-rt/lib/rtsan/rtsan_diagnostics.cpp | 58 ++++++++++----------- compiler-rt/lib/rtsan/rtsan_diagnostics.h | 19 ++----- 3 files changed, 37 insertions(+), 46 deletions(-) diff --git a/compiler-rt/lib/rtsan/rtsan.cpp b/compiler-rt/lib/rtsan/rtsan.cpp index 2afdf3c76696e7..b288da64ffbe25 100644 --- a/compiler-rt/lib/rtsan/rtsan.cpp +++ b/compiler-rt/lib/rtsan/rtsan.cpp @@ -87,7 +87,8 @@ __rtsan_notify_intercepted_call(const char *func_name) { GET_CALLER_PC_BP; ExpectNotRealtime( GetContextForThisThread(), - PrintDiagnosticsAndDieAction({InterceptedCallInfo{func_name}, pc, bp})); + PrintDiagnosticsAndDieAction( + {DiagnosticsInfoType::InterceptedCall, func_name, pc, bp})); } SANITIZER_INTERFACE_ATTRIBUTE void @@ -96,7 +97,8 @@ __rtsan_notify_blocking_call(const char *func_name) { GET_CALLER_PC_BP; ExpectNotRealtime( GetContextForThisThread(), - PrintDiagnosticsAndDieAction({BlockingCallInfo{func_name}, pc, bp})); + PrintDiagnosticsAndDieAction( + {DiagnosticsInfoType::BlockingCall, func_name, pc, bp})); } } // extern "C" diff --git a/compiler-rt/lib/rtsan/rtsan_diagnostics.cpp b/compiler-rt/lib/rtsan/rtsan_diagnostics.cpp index ac13b0743be069..f82001f5b2057c 100644 --- a/compiler-rt/lib/rtsan/rtsan_diagnostics.cpp +++ b/compiler-rt/lib/rtsan/rtsan_diagnostics.cpp @@ -37,12 +37,6 @@ class Decorator : public __sanitizer::SanitizerCommonDecorator { const char *FunctionName() const { return Green(); } const char *Reason() const { return Blue(); } }; - -template struct Overloaded : Ts... { - using Ts::operator()...; -}; -// TODO: Remove below when c++20 -template Overloaded(Ts...) 
-> Overloaded; } // namespace static void PrintStackTrace(uptr pc, uptr bp) { @@ -53,35 +47,39 @@ static void PrintStackTrace(uptr pc, uptr bp) { } static void PrintError(const Decorator &decorator, - const DiagnosticsCallerInfo &info) { - const char *violation_type = std::visit( - Overloaded{ - [](const InterceptedCallInfo &) { return "unsafe-library-call"; }, - [](const BlockingCallInfo &) { return "blocking-call"; }}, - info); + const DiagnosticsInfo &info) { + const auto ErrorTypeStr = [&info]() -> const char * { + switch (info.type) { + case DiagnosticsInfoType::InterceptedCall: + return "unsafe-library-call"; + case DiagnosticsInfoType::BlockingCall: + return "blocking-call"; + } + return "(unknown error)"; + }; Printf("%s", decorator.Error()); - Report("ERROR: RealtimeSanitizer: %s\n", violation_type); + Report("ERROR: RealtimeSanitizer: %s\n", ErrorTypeStr()); } static void PrintReason(const Decorator &decorator, - const DiagnosticsCallerInfo &info) { + const DiagnosticsInfo &info) { Printf("%s", decorator.Reason()); - std::visit( - Overloaded{[decorator](const InterceptedCallInfo &call) { - Printf("Intercepted call to real-time unsafe function " - "`%s%s%s` in real-time context!", - decorator.FunctionName(), - call.intercepted_function_name_, decorator.Reason()); - }, - [decorator](const BlockingCallInfo &arg) { - Printf("Call to blocking function " - "`%s%s%s` in real-time context!", - decorator.FunctionName(), arg.blocking_function_name_, - decorator.Reason()); - }}, - info); + switch (info.type) { + case DiagnosticsInfoType::InterceptedCall: { + Printf("Intercepted call to real-time unsafe function " + "`%s%s%s` in real-time context!", + decorator.FunctionName(), info.func_name, decorator.Reason()); + break; + } + case DiagnosticsInfoType::BlockingCall: { + Printf("Call to blocking function " + "`%s%s%s` in real-time context!", + decorator.FunctionName(), info.func_name, decorator.Reason()); + break; + } + } Printf("\n"); } @@ -90,8 +88,8 @@ void __rtsan::PrintDiagnostics(const DiagnosticsInfo &info) { ScopedErrorReportLock l; Decorator d; - PrintError(d, info.call_info); - PrintReason(d, info.call_info); + PrintError(d, info); + PrintReason(d, info); Printf("%s", d.Default()); PrintStackTrace(info.pc, info.bp); } diff --git a/compiler-rt/lib/rtsan/rtsan_diagnostics.h b/compiler-rt/lib/rtsan/rtsan_diagnostics.h index 8aec512584b309..f8a6b8a954a24a 100644 --- a/compiler-rt/lib/rtsan/rtsan_diagnostics.h +++ b/compiler-rt/lib/rtsan/rtsan_diagnostics.h @@ -15,25 +15,16 @@ #include "sanitizer_common/sanitizer_common.h" #include "sanitizer_common/sanitizer_internal_defs.h" -#include - namespace __rtsan { -struct InterceptedCallInfo { - const char *intercepted_function_name_; -}; - -struct BlockingCallInfo { -public: - const char *blocking_function_name_; +enum class DiagnosticsInfoType { + InterceptedCall, + BlockingCall, }; -using DiagnosticsCallerInfo = - std::variant; - struct DiagnosticsInfo { - DiagnosticsCallerInfo call_info; - + DiagnosticsInfoType type; + const char *func_name; __sanitizer::uptr pc; __sanitizer::uptr bp; }; From b1e4656e8ee3289dc5f3139fc8eb33152f96bfe6 Mon Sep 17 00:00:00 2001 From: Pavel Skripkin Date: Tue, 24 Sep 2024 17:48:29 +0300 Subject: [PATCH 15/49] [NFC][analyzer] Make `invalidateRegions` accept `Stmt` instead of `Expr` (#109792) As was reported [here](https://github.com/llvm/llvm-project/pull/103714#pullrequestreview-2238037812), `invalidateRegions` should accept `Stmt` instead of `Expr`. 
This conversion is possible, since `Expr` was anyway converted back to `Stmt` later. This refactoring is needed to fix another FP related to use of inline assembly. The fix would be to change `State->bindLoc` to `state->invalidateRegions` inside inline assembly visitor, since `bindLoc` only binds to offset 0, which is not really correct semantics in case of inline assembly. --- .../Core/PathSensitive/ProgramState.h | 4 +- .../Core/PathSensitive/SValBuilder.h | 6 +- .../StaticAnalyzer/Core/PathSensitive/Store.h | 4 +- .../lib/StaticAnalyzer/Core/ProgramState.cpp | 30 +++---- clang/lib/StaticAnalyzer/Core/RegionStore.cpp | 90 ++++++++----------- clang/lib/StaticAnalyzer/Core/SValBuilder.cpp | 4 +- 6 files changed, 57 insertions(+), 81 deletions(-) diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h index 2f6cd481fd6362..eef7a54f03bf11 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ProgramState.h @@ -326,14 +326,14 @@ class ProgramState : public llvm::FoldingSetNode { /// \param ITraits information about special handling for particular regions /// or symbols. [[nodiscard]] ProgramStateRef - invalidateRegions(ArrayRef Regions, const Expr *E, + invalidateRegions(ArrayRef Regions, const Stmt *S, unsigned BlockCount, const LocationContext *LCtx, bool CausesPointerEscape, InvalidatedSymbols *IS = nullptr, const CallEvent *Call = nullptr, RegionAndSymbolInvalidationTraits *ITraits = nullptr) const; [[nodiscard]] ProgramStateRef - invalidateRegions(ArrayRef Values, const Expr *E, unsigned BlockCount, + invalidateRegions(ArrayRef Values, const Stmt *S, unsigned BlockCount, const LocationContext *LCtx, bool CausesPointerEscape, InvalidatedSymbols *IS = nullptr, const CallEvent *Call = nullptr, diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SValBuilder.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SValBuilder.h index 6eedaf0544559b..ec2b2b24569480 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SValBuilder.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/SValBuilder.h @@ -202,11 +202,9 @@ class SValBuilder { const Expr *expr, const LocationContext *LCtx, unsigned count); - DefinedOrUnknownSVal conjureSymbolVal(const void *symbolTag, - const Expr *expr, + DefinedOrUnknownSVal conjureSymbolVal(const void *symbolTag, const Stmt *S, const LocationContext *LCtx, - QualType type, - unsigned count); + QualType type, unsigned count); DefinedOrUnknownSVal conjureSymbolVal(const Stmt *stmt, const LocationContext *LCtx, QualType type, diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/Store.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/Store.h index e08d5e104e9c0a..332855a3c9c45e 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/Store.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/Store.h @@ -215,7 +215,7 @@ class StoreManager { /// /// \param[in] store The initial store. /// \param[in] Values The values to invalidate. - /// \param[in] E The current statement being evaluated. Used to conjure + /// \param[in] S The current statement being evaluated. Used to conjure /// symbols to mark the values of invalidated regions. /// \param[in] Count The current block count. Used to conjure /// symbols to mark the values of invalidated regions. 
@@ -233,7 +233,7 @@ class StoreManager { /// even if they do not currently have bindings. Pass \c NULL if this /// information will not be used. virtual StoreRef invalidateRegions( - Store store, ArrayRef Values, const Expr *Ex, unsigned Count, + Store store, ArrayRef Values, const Stmt *S, unsigned Count, const LocationContext *LCtx, const CallEvent *Call, InvalidatedSymbols &IS, RegionAndSymbolInvalidationTraits &ITraits, InvalidatedRegions *TopLevelRegions, InvalidatedRegions *Invalidated) = 0; diff --git a/clang/lib/StaticAnalyzer/Core/ProgramState.cpp b/clang/lib/StaticAnalyzer/Core/ProgramState.cpp index e6d3399a219424..0be2709f0907d8 100644 --- a/clang/lib/StaticAnalyzer/Core/ProgramState.cpp +++ b/clang/lib/StaticAnalyzer/Core/ProgramState.cpp @@ -147,30 +147,24 @@ ProgramState::bindDefaultZero(SVal loc, const LocationContext *LCtx) const { typedef ArrayRef RegionList; typedef ArrayRef ValueList; -ProgramStateRef -ProgramState::invalidateRegions(RegionList Regions, - const Expr *E, unsigned Count, - const LocationContext *LCtx, - bool CausedByPointerEscape, - InvalidatedSymbols *IS, - const CallEvent *Call, - RegionAndSymbolInvalidationTraits *ITraits) const { +ProgramStateRef ProgramState::invalidateRegions( + RegionList Regions, const Stmt *S, unsigned Count, + const LocationContext *LCtx, bool CausedByPointerEscape, + InvalidatedSymbols *IS, const CallEvent *Call, + RegionAndSymbolInvalidationTraits *ITraits) const { SmallVector Values; for (const MemRegion *Reg : Regions) Values.push_back(loc::MemRegionVal(Reg)); - return invalidateRegions(Values, E, Count, LCtx, CausedByPointerEscape, IS, + return invalidateRegions(Values, S, Count, LCtx, CausedByPointerEscape, IS, Call, ITraits); } -ProgramStateRef -ProgramState::invalidateRegions(ValueList Values, - const Expr *E, unsigned Count, - const LocationContext *LCtx, - bool CausedByPointerEscape, - InvalidatedSymbols *IS, - const CallEvent *Call, - RegionAndSymbolInvalidationTraits *ITraits) const { +ProgramStateRef ProgramState::invalidateRegions( + ValueList Values, const Stmt *S, unsigned Count, + const LocationContext *LCtx, bool CausedByPointerEscape, + InvalidatedSymbols *IS, const CallEvent *Call, + RegionAndSymbolInvalidationTraits *ITraits) const { ProgramStateManager &Mgr = getStateManager(); ExprEngine &Eng = Mgr.getOwningEngine(); @@ -186,7 +180,7 @@ ProgramState::invalidateRegions(ValueList Values, StoreManager::InvalidatedRegions TopLevelInvalidated; StoreManager::InvalidatedRegions Invalidated; const StoreRef &NewStore = Mgr.StoreMgr->invalidateRegions( - getStore(), Values, E, Count, LCtx, Call, *IS, *ITraits, + getStore(), Values, S, Count, LCtx, Call, *IS, *ITraits, &TopLevelInvalidated, &Invalidated); ProgramStateRef NewState = makeWithStore(NewStore); diff --git a/clang/lib/StaticAnalyzer/Core/RegionStore.cpp b/clang/lib/StaticAnalyzer/Core/RegionStore.cpp index c257a87dff385b..674099dd7e1f0f 100644 --- a/clang/lib/StaticAnalyzer/Core/RegionStore.cpp +++ b/clang/lib/StaticAnalyzer/Core/RegionStore.cpp @@ -405,19 +405,15 @@ class RegionStoreManager : public StoreManager { //===-------------------------------------------------------------------===// // Binding values to regions. 
//===-------------------------------------------------------------------===// - RegionBindingsRef invalidateGlobalRegion(MemRegion::Kind K, - const Expr *Ex, + RegionBindingsRef invalidateGlobalRegion(MemRegion::Kind K, const Stmt *S, unsigned Count, const LocationContext *LCtx, RegionBindingsRef B, InvalidatedRegions *Invalidated); - StoreRef invalidateRegions(Store store, - ArrayRef Values, - const Expr *E, unsigned Count, - const LocationContext *LCtx, - const CallEvent *Call, - InvalidatedSymbols &IS, + StoreRef invalidateRegions(Store store, ArrayRef Values, const Stmt *S, + unsigned Count, const LocationContext *LCtx, + const CallEvent *Call, InvalidatedSymbols &IS, RegionAndSymbolInvalidationTraits &ITraits, InvalidatedRegions *Invalidated, InvalidatedRegions *InvalidatedTopLevel) override; @@ -975,7 +971,7 @@ RegionStoreManager::removeSubRegionBindings(RegionBindingsConstRef B, namespace { class InvalidateRegionsWorker : public ClusterAnalysis { - const Expr *Ex; + const Stmt *S; unsigned Count; const LocationContext *LCtx; InvalidatedSymbols &IS; @@ -983,18 +979,15 @@ class InvalidateRegionsWorker : public ClusterAnalysis StoreManager::InvalidatedRegions *Regions; GlobalsFilterKind GlobalsFilter; public: - InvalidateRegionsWorker(RegionStoreManager &rm, - ProgramStateManager &stateMgr, - RegionBindingsRef b, - const Expr *ex, unsigned count, - const LocationContext *lctx, - InvalidatedSymbols &is, + InvalidateRegionsWorker(RegionStoreManager &rm, ProgramStateManager &stateMgr, + RegionBindingsRef b, const Stmt *S, unsigned count, + const LocationContext *lctx, InvalidatedSymbols &is, RegionAndSymbolInvalidationTraits &ITraitsIn, StoreManager::InvalidatedRegions *r, GlobalsFilterKind GFK) - : ClusterAnalysis(rm, stateMgr, b), - Ex(ex), Count(count), LCtx(lctx), IS(is), ITraits(ITraitsIn), Regions(r), - GlobalsFilter(GFK) {} + : ClusterAnalysis(rm, stateMgr, b), S(S), + Count(count), LCtx(lctx), IS(is), ITraits(ITraitsIn), Regions(r), + GlobalsFilter(GFK) {} void VisitCluster(const MemRegion *baseR, const ClusterBindings *C); void VisitBinding(SVal V); @@ -1127,7 +1120,7 @@ void InvalidateRegionsWorker::VisitCluster(const MemRegion *baseR, // Invalidate the region by setting its default value to // conjured symbol. The type of the symbol is irrelevant. DefinedOrUnknownSVal V = - svalBuilder.conjureSymbolVal(baseR, Ex, LCtx, Ctx.IntTy, Count); + svalBuilder.conjureSymbolVal(baseR, S, LCtx, Ctx.IntTy, Count); B = B.addBinding(baseR, BindingKey::Default, V); return; } @@ -1148,8 +1141,8 @@ void InvalidateRegionsWorker::VisitCluster(const MemRegion *baseR, if (T->isRecordType()) { // Invalidate the region by setting its default value to // conjured symbol. The type of the symbol is irrelevant. - DefinedOrUnknownSVal V = svalBuilder.conjureSymbolVal(baseR, Ex, LCtx, - Ctx.IntTy, Count); + DefinedOrUnknownSVal V = + svalBuilder.conjureSymbolVal(baseR, S, LCtx, Ctx.IntTy, Count); B = B.addBinding(baseR, BindingKey::Default, V); return; } @@ -1216,15 +1209,14 @@ void InvalidateRegionsWorker::VisitCluster(const MemRegion *baseR, } conjure_default: // Set the default value of the array to conjured symbol. 
- DefinedOrUnknownSVal V = - svalBuilder.conjureSymbolVal(baseR, Ex, LCtx, - AT->getElementType(), Count); - B = B.addBinding(baseR, BindingKey::Default, V); - return; + DefinedOrUnknownSVal V = svalBuilder.conjureSymbolVal( + baseR, S, LCtx, AT->getElementType(), Count); + B = B.addBinding(baseR, BindingKey::Default, V); + return; } - DefinedOrUnknownSVal V = svalBuilder.conjureSymbolVal(baseR, Ex, LCtx, - T,Count); + DefinedOrUnknownSVal V = + svalBuilder.conjureSymbolVal(baseR, S, LCtx, T, Count); assert(SymbolManager::canSymbolicate(T) || V.isUnknown()); B = B.addBinding(baseR, BindingKey::Direct, V); } @@ -1252,19 +1244,16 @@ bool InvalidateRegionsWorker::includeEntireMemorySpace(const MemRegion *Base) { RegionAndSymbolInvalidationTraits::TK_EntireMemSpace); } -RegionBindingsRef -RegionStoreManager::invalidateGlobalRegion(MemRegion::Kind K, - const Expr *Ex, - unsigned Count, - const LocationContext *LCtx, - RegionBindingsRef B, - InvalidatedRegions *Invalidated) { +RegionBindingsRef RegionStoreManager::invalidateGlobalRegion( + MemRegion::Kind K, const Stmt *S, unsigned Count, + const LocationContext *LCtx, RegionBindingsRef B, + InvalidatedRegions *Invalidated) { // Bind the globals memory space to a new symbol that we will use to derive // the bindings for all globals. const GlobalsSpaceRegion *GS = MRMgr.getGlobalsRegion(K); - SVal V = svalBuilder.conjureSymbolVal(/* symbolTag = */ (const void*) GS, Ex, LCtx, - /* type does not matter */ Ctx.IntTy, - Count); + SVal V = + svalBuilder.conjureSymbolVal(/* symbolTag = */ (const void *)GS, S, LCtx, + /* type does not matter */ Ctx.IntTy, Count); B = B.removeBinding(GS) .addBinding(BindingKey::Make(GS, BindingKey::Default), V); @@ -1298,16 +1287,11 @@ void RegionStoreManager::populateWorkList(InvalidateRegionsWorker &W, } } -StoreRef -RegionStoreManager::invalidateRegions(Store store, - ArrayRef Values, - const Expr *Ex, unsigned Count, - const LocationContext *LCtx, - const CallEvent *Call, - InvalidatedSymbols &IS, - RegionAndSymbolInvalidationTraits &ITraits, - InvalidatedRegions *TopLevelRegions, - InvalidatedRegions *Invalidated) { +StoreRef RegionStoreManager::invalidateRegions( + Store store, ArrayRef Values, const Stmt *S, unsigned Count, + const LocationContext *LCtx, const CallEvent *Call, InvalidatedSymbols &IS, + RegionAndSymbolInvalidationTraits &ITraits, + InvalidatedRegions *TopLevelRegions, InvalidatedRegions *Invalidated) { GlobalsFilterKind GlobalsFilter; if (Call) { if (Call->isInSystemHeader()) @@ -1319,7 +1303,7 @@ RegionStoreManager::invalidateRegions(Store store, } RegionBindingsRef B = getRegionBindings(store); - InvalidateRegionsWorker W(*this, StateMgr, B, Ex, Count, LCtx, IS, ITraits, + InvalidateRegionsWorker W(*this, StateMgr, B, S, Count, LCtx, IS, ITraits, Invalidated, GlobalsFilter); // Scan the bindings and generate the clusters. @@ -1339,12 +1323,12 @@ RegionStoreManager::invalidateRegions(Store store, // TODO: This could possibly be more precise with modules. 
switch (GlobalsFilter) { case GFK_All: - B = invalidateGlobalRegion(MemRegion::GlobalInternalSpaceRegionKind, - Ex, Count, LCtx, B, Invalidated); + B = invalidateGlobalRegion(MemRegion::GlobalInternalSpaceRegionKind, S, + Count, LCtx, B, Invalidated); [[fallthrough]]; case GFK_SystemOnly: - B = invalidateGlobalRegion(MemRegion::GlobalSystemSpaceRegionKind, - Ex, Count, LCtx, B, Invalidated); + B = invalidateGlobalRegion(MemRegion::GlobalSystemSpaceRegionKind, S, Count, + LCtx, B, Invalidated); [[fallthrough]]; case GFK_None: break; diff --git a/clang/lib/StaticAnalyzer/Core/SValBuilder.cpp b/clang/lib/StaticAnalyzer/Core/SValBuilder.cpp index 7eca0579143f44..cb5fcbade2cfc2 100644 --- a/clang/lib/StaticAnalyzer/Core/SValBuilder.cpp +++ b/clang/lib/StaticAnalyzer/Core/SValBuilder.cpp @@ -174,7 +174,7 @@ DefinedOrUnknownSVal SValBuilder::conjureSymbolVal(const void *SymbolTag, } DefinedOrUnknownSVal SValBuilder::conjureSymbolVal(const void *symbolTag, - const Expr *expr, + const Stmt *St, const LocationContext *LCtx, QualType type, unsigned count) { @@ -184,7 +184,7 @@ DefinedOrUnknownSVal SValBuilder::conjureSymbolVal(const void *symbolTag, if (!SymbolManager::canSymbolicate(type)) return UnknownVal(); - SymbolRef sym = SymMgr.conjureSymbol(expr, LCtx, type, count, symbolTag); + SymbolRef sym = SymMgr.conjureSymbol(St, LCtx, type, count, symbolTag); if (Loc::isLocType(type)) return loc::MemRegionVal(MemMgr.getSymbolicRegion(sym)); From 3fbf6f8bb183ad8b9157e50c442479f4ca7a9b8d Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 24 Sep 2024 15:50:12 +0100 Subject: [PATCH 16/49] [LV] Remove more references of unrolled parts after 57f5d8f2fe. Continue to clean up some now stale references of unroll parts and related terminology as pointed out post-commit for 06c3a7d. --- .../Transforms/Vectorize/LoopVectorize.cpp | 6 ----- llvm/lib/Transforms/Vectorize/VPlan.cpp | 26 +++++++++---------- llvm/lib/Transforms/Vectorize/VPlan.h | 6 ++--- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 4 +-- 4 files changed, 16 insertions(+), 26 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 0566d80c1cc001..d296511b977992 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -538,12 +538,6 @@ class InnerLoopVectorizer { /// A small list of PHINodes. using PhiVector = SmallVector; - /// A type for scalarized values in the new loop. Each value from the - /// original loop, when scalarized, is represented by UF x VF scalar values - /// in the new unrolled loop, where UF is the unroll factor and VF is the - /// vectorization factor. - using ScalarParts = SmallVector, 2>; - /// Set up the values of the IVs correctly when exiting the vector loop. void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, Value *VectorTripCount, Value *EndValue, diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp index ce15b2783cc457..5e4d487261c6f0 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp @@ -333,10 +333,10 @@ Value *VPTransformState::get(VPValue *Def, bool NeedsScalar) { // However, if we are vectorizing, we need to construct the vector values. // If the value is known to be uniform after vectorization, we can just - // broadcast the scalar value corresponding to lane zero for each unroll - // iteration. Otherwise, we construct the vector values using - // insertelement instructions. 
Since the resulting vectors are stored in - // State, we will only generate the insertelements once. + // broadcast the scalar value corresponding to lane zero. Otherwise, we + // construct the vector values using insertelement instructions. Since the + // resulting vectors are stored in State, we will only generate the + // insertelements once. Value *VectorValue = nullptr; if (IsUniform) { VectorValue = GetBroadcastInstrs(ScalarValue); @@ -769,15 +769,15 @@ void VPRegionBlock::execute(VPTransformState *State) { // Enter replicating mode. State->Instance = VPIteration(0, 0); - assert(!State->VF.isScalable() && "VF is assumed to be non scalable."); - for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF; - ++Lane) { - State->Instance->Lane = VPLane(Lane, VPLane::Kind::First); - // Visit the VPBlocks connected to \p this, starting from it. - for (VPBlockBase *Block : RPOT) { - LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n'); - Block->execute(State); - } + assert(!State->VF.isScalable() && "VF is assumed to be non scalable."); + for (unsigned Lane = 0, VF = State->VF.getKnownMinValue(); Lane < VF; + ++Lane) { + State->Instance->Lane = VPLane(Lane, VPLane::Kind::First); + // Visit the VPBlocks connected to \p this, starting from it. + for (VPBlockBase *Block : RPOT) { + LLVM_DEBUG(dbgs() << "LV: VPBlock in RPO " << Block->getName() << '\n'); + Block->execute(State); + } } // Exit replicating mode. diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 0632495bc511cd..c886a39aec76e5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -254,7 +254,7 @@ struct VPTransformState { DominatorTree *DT, IRBuilderBase &Builder, InnerLoopVectorizer *ILV, VPlan *Plan); - /// The chosen Vectorization and Unroll Factors of the loop being vectorized. + /// The chosen Vectorization Factor of the loop being vectorized. ElementCount VF; /// Hold the indices to generate specific scalar instructions. Null indicates @@ -1253,9 +1253,7 @@ class VPInstruction : public VPRecipeWithIRFlags, ComputeReductionResult, // Takes the VPValue to extract from as first operand and the lane or part // to extract as second operand, counting from the end starting with 1 for - // last. The second operand must be a positive constant and <= VF when - // extracting from a vector or <= UF when extracting from an unrolled - // scalar. + // last. The second operand must be a positive constant and <= VF. ExtractFromEnd, LogicalAnd, // Non-poison propagating logical And. // Add an offset in bytes (second operand) to a base pointer (first diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 318d6a8c5b8c34..f33293e65010f9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2490,9 +2490,7 @@ void VPInterleaveRecipe::execute(VPTransformState &State) { // If the group is reverse, adjust the index to refer to the last vector lane // instead of the first. We adjust the index from the first vector lane, // rather than directly getting the pointer for lane VF - 1, because the - // pointer operand of the interleaved access is supposed to be uniform. For - // uniform instructions, we're only required to generate a value for the - // first vector lane in each unroll iteration. + // pointer operand of the interleaved access is supposed to be uniform. 
if (Group->isReverse()) { Value *RuntimeVF = getRuntimeVF(State.Builder, State.Builder.getInt32Ty(), State.VF); From 8b5e841487976ecc4233227fdd069f5a5f4443f0 Mon Sep 17 00:00:00 2001 From: Chao Chen Date: Tue, 24 Sep 2024 10:14:13 -0500 Subject: [PATCH 17/49] [MLIR][XeGPU] Updates XeGPU TensorDescAttr and Refine Gather/Scatter definition (#109675) Bring back #109144 with fixes to VectorToXeGPU --- .../mlir/Dialect/XeGPU/IR/XeGPUAttrs.td | 64 +++++++--- .../include/mlir/Dialect/XeGPU/IR/XeGPUOps.td | 86 ++++++++++---- .../mlir/Dialect/XeGPU/IR/XeGPUTypes.td | 73 +++++++----- .../VectorToXeGPU/VectorToXeGPU.cpp | 13 +-- mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp | 46 +++++--- mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp | 109 +++++++++++++----- mlir/test/Dialect/XeGPU/XeGPUOps.mlir | 74 +++++++----- mlir/test/Dialect/XeGPU/invalid.mlir | 75 +++++++----- 8 files changed, 365 insertions(+), 175 deletions(-) diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td index f3ca09a6a68ea8..26eec0d4f2082a 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUAttrs.td @@ -19,12 +19,18 @@ class XeGPUAttr traits = [], let mnemonic = attrMnemonic; } -def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> { +class XeGPU_TensorDescAttr traits = [], + string baseCppClass = "::mlir::Attribute"> + : XeGPUAttr { + let assemblyFormat = "`<` struct(params) `>`"; +} + +def XeGPU_BlockTensorDescAttr: XeGPU_TensorDescAttr<"BlockTensorDesc", "block_tdesc_attr"> { let summary = [{a composite attribute for `TensorDescType`}]; - let description = [{`TensorDescAttr` (or `tdesc_attr`) is a composite + let description = [{`BlockTensorDesc` (or `block_tdesc_attr`) is a composite attribute defined for `TensorDescType` for describing following properties of a `TensorDesc`. - 1. `memory_scope`: It describes where the data block described by the + 1. `memory_space`: It describes where the data block described by the TensorDesc is located, `Global` device memory or `Shared` local memory. It is default to `Global`. 2. `array_length`: It describes how many horizontally consecutive blocks @@ -33,43 +39,63 @@ def XeGPU_TensorDescAttr: XeGPUAttr<"TensorDesc", "tdesc_attr"> { 8x32. Its default value is 1. 3. `boundary_check`: It is used to indicates the hardware whether to do out-of-boundary check. The default value is true. - 4. `scattered`: It is used to differenciate TensorDescs created from - `create_nd_tdesc` vs from `create_tdesc`. 
}]; let parameters = (ins - OptionalParameter<"MemoryScopeAttr">: $memory_scope, + OptionalParameter<"MemorySpaceAttr">: $memory_space, OptionalParameter<"IntegerAttr", "1">: $array_length, - OptionalParameter<"BoolAttr", "true">: $boundary_check, - OptionalParameter<"BoolAttr", "false">: $scattered + OptionalParameter<"BoolAttr", "true">: $boundary_check ); let builders = [ AttrBuilder<(ins - CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope, + CArg<"xegpu::MemorySpace", "xegpu::MemorySpace::Global">:$memory_space, CArg<"int", "1">:$array_length, - CArg<"bool", "true">: $boundary_check, - CArg<"bool", "false">: $scattered + CArg<"bool", "true">: $boundary_check )> ]; - let assemblyFormat = "`<` struct(params) `>`"; } +def XeGPU_ScatterTensorDescAttr: XeGPU_TensorDescAttr<"ScatterTensorDesc", "scatter_tdesc_attr"> { + let summary = [{a composite attribute for `TensorDescType`}]; + let description = [{`ScatterTensorDesc` (or `scatter_tdesc_attr`) is a composite + attribute defined for `TensorDescType` for describing following + properties of a `TensorDesc`. + 1. `memory_space`: It describes where the data block described by the + TensorDesc is located, `Global` device memory or `Shared` local memory. + It is default to `Global`. + 2. `chunk_size`: indicates number of continious elements accessed for each + offset, default is 1. It is used with `scattered` attr only. + }]; + + let parameters = (ins + OptionalParameter<"MemorySpaceAttr">: $memory_space, + OptionalParameter<"IntegerAttr", "1">: $chunk_size + ); + + let builders = [ + AttrBuilder<(ins + CArg<"xegpu::MemorySpace", "xegpu::MemorySpace::Global">:$memory_space, + CArg<"int", "1">: $chunk_size + )> + ]; + } + //===----------------------------------------------------------------------===// // XeGPU Memory Scope Enums. 
//===----------------------------------------------------------------------===// -def XeGPU_MemoryScopeGlobal: I32EnumAttrCase<"Global", 0, "global">; -def XeGPU_MemoryScopeShared: I32EnumAttrCase<"SLM", 1, "slm">; -def XeGPU_MemoryScope: I32EnumAttr<"MemoryScope", +def XeGPU_MemorySpaceGlobal: I32EnumAttrCase<"Global", 0, "global">; +def XeGPU_MemorySpaceShared: I32EnumAttrCase<"SLM", 3, "slm">; +def XeGPU_MemorySpace: I32EnumAttr<"MemorySpace", "The address space of the memory the tensor descritor is created for", - [XeGPU_MemoryScopeGlobal, XeGPU_MemoryScopeShared]> { + [XeGPU_MemorySpaceGlobal, XeGPU_MemorySpaceShared]> { let genSpecializedAttr = 0; let cppNamespace = "::mlir::xegpu"; } -def XeGPU_MemoryScopeAttr: - EnumAttr { +def XeGPU_MemorySpaceAttr: + EnumAttr { let summary = [{Describe the location of data described by a `TensorDesc`: Global device memory (`Global`) or Shared local memory (`SLM`).}]; let assemblyFormat = "$value"; @@ -116,4 +142,4 @@ def XeGPU_FenceScopeAttr: let assemblyFormat = "$value"; } -#endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD \ No newline at end of file +#endif // MLIR_DIALECT_XEGPU_IR_XEGPUATTRS_TD diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td index c32c7541c39791..e24a056de2caf3 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUOps.td @@ -218,6 +218,23 @@ def XeGPU_CreateNdDescOp: XeGPU_Op<"create_nd_tdesc", [Pure, ViewLikeOpInterface static unsigned getOffsetSizeAndStrideStartOperandIndex() { return 1; } mlir::Value getViewSource() { return getSource(); } + + unsigned getSourceMemorySpace() { + auto srcTy = getSourceType(); + if (auto memrefTy = llvm::dyn_cast(srcTy)) { + auto attr = memrefTy.getMemorySpace(); + if (attr) { + if (auto intAttr = llvm::dyn_cast(attr)) { + return static_cast(intAttr.getInt()); + } + if (auto memSpaceAttr = llvm::dyn_cast(attr)) + return static_cast(memSpaceAttr.getValue()); + } + } + // take global as default memory scope. + return static_cast(MemorySpace::Global); + } + }]; } @@ -411,8 +428,10 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> { is fixed to the hardware supportted subgroup size, e.g., 16 on PVC, implying each element in the array corresponds to a work-item (SIMT lane) in the subgroup. - * chunk_size: [optional attribute] indicates number of continious - elements accessed for each offset, default is 1. + + The first dimension of the result TensorDesc corresponds to work-items, so it should + match the dimension of offsets. It may also has a second dimension corresponding to + the chunk_size if the chunk size is larger than 1. Example 1. It assumes subgroup size is 4, and accesses a[0], a[16], a[32], a[64] ```mlir @@ -424,29 +443,22 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> { It will access totally 32 data elements: a[0:7], a[16:23], a[32:39], a[64:71] ```mlir %0 = memref.alloc() : memref<1024xf32> - %1 = xegpu.create_tdesc %0[0, 16, 32, 64] {chunk_size = 8}: memref<1024xf32> -> TensorDesc<4x8xf32> + %1 = xegpu.create_tdesc %0[0, 16, 32, 64] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size = 8> ``` Example 3. It is similar to Example 2, but there is some overlaps among workitems. 
It accesses: a[0:7], a[4:11], a[8:15], a[12:19] ```mlir %0 = memref.alloc() : memref<1024xf32> - %1 = xegpu.create_tdesc %0[0, 4, 8, 12] {chunk_size = 8}: memref<1024xf32> -> TensorDesc<4x8xf32> + %1 = xegpu.create_tdesc %0[0, 4, 8, 12] : memref<1024xf32> -> TensorDesc<4x8xf32, chunk_size = 8>> ``` }]; let arguments = (ins XeGPU_BaseAddrType: $source, Variadic: $offsets, - DenseI64ArrayAttr: $const_offsets, - DefaultValuedAttr: $chunk_size); + DenseI64ArrayAttr: $const_offsets); let results = (outs XeGPU_TensorDesc:$TensorDesc); - let builders = [ - OpBuilder<(ins "xegpu::TensorDescType": $TensorDesc, "Value": $source, - "llvm::ArrayRef": $offsets, - CArg<"uint32_t", "1"> : $chunk_size)>, - ]; - let assemblyFormat = [{ $source custom($offsets, $const_offsets) @@ -473,6 +485,22 @@ def XeGPU_CreateDescOp: XeGPU_Op<"create_tdesc", [Pure, ViewLikeOpInterface]> { assert(idx < getNumOffsets() && "Invalid out of bound access."); return getMixedOffsets()[idx]; } + + unsigned getSourceMemorySpace() { + auto srcTy = getSource().getType(); + if (auto memrefTy = llvm::dyn_cast(srcTy)) { + auto attr = memrefTy.getMemorySpace(); + if (attr) { + if (auto intAttr = llvm::dyn_cast(attr)) + return static_cast(intAttr.getInt()); + if (auto memSpaceAttr = llvm::dyn_cast(attr)) + return static_cast(memSpaceAttr.getValue()); + } + } + // take global as default memory scope. + return static_cast(MemorySpace::Global); + } + }]; let hasVerifier = 1; @@ -520,28 +548,31 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [AllRanksMatch<["value", "TensorDesc"] let description = [{ It (aka. load) load data per each work-item. The output describes the data being loaded at the subgroup level, so its size is - consistent with the number of work-items in a subgroup. When `chunk_size_per_lane` - attribute is larger than 1 in TensorDesc, the output vector will be 2D vector, - with dim-1 correspoding to the chunk size. + consistent with the number of work-items in a subgroup. When the chunk size + is larger than 2, the output vector is a 2D vector, with dim-1 correspoding + to work-items, and dim-0 corresponding to the chunk_size loaded by each work-item. + Specially, there is a transpose effect on the result (as compared to the TensorDesc) + due to the hardware implementation. Therefore, a transpose attribute is introduced + on purpose, making sure users are aware of this implicit transformation. The mask operand masks out memory access so that it is safe to pass out-of-boundary addresses/offsets as long as they are masked. It applies to slots of SIMD lanes. 
Example: ```mlir - %2 = xegpu.load %1, %0 {transpose = [1, 0], + %2 = xegpu.load %1, %0 {transpose, l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, l3_hint = #xegpu.cache_hint} - : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1> - -> vector<16xf32> + : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr>, + vector<16xi1> -> vector<16xf32> ``` }]; let arguments = (ins XeGPU_TensorDesc: $TensorDesc, XeGPU_MaskType: $mask, - OptionalAttr: $transpose, + OptionalAttr: $transpose, OptionalAttr: $l1_hint, OptionalAttr: $l2_hint, OptionalAttr: $l3_hint); @@ -573,11 +604,15 @@ def XeGPU_LoadGatherOp : XeGPU_Op<"load", [AllRanksMatch<["value", "TensorDesc"] let hasVerifier = 1; } -def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllShapesMatch<["value", "TensorDesc"]>, - AllElementTypesMatch<["value", "TensorDesc"]>]> { +def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllElementCountsMatch<["value", "TensorDesc"]>, + AllElementTypesMatch<["value", "TensorDesc"]>]> { let summary = "store data to scattered memory locations."; - let description = [{ It (aka. store) stores data to scattered memory locations. - It has similar semantic to `load_gather`. + let description = [{ It (aka. store) stores data to scattered memory locations. The value is + typically a 1D vector. But when the chunk size of the TensorDesc is larger than 1, it will be + a 2D vector instead. For the later case, dim-1 of the value correspods to the simd lanes + and the dim-0 of the value corresponds to the chunk_size stored per lane. So `store_scatter` + has transpose effect, which is similar to `load_gather`. Therefore, a transpose attribute is + introduced on purpose, making sure users are aware of this implicit transformation. Example: ```mlir @@ -592,6 +627,7 @@ def XeGPU_StoreScatterOp : XeGPU_Op<"store", [AllShapesMatch<["value", "TensorDe XeGPU_ValueType: $value, XeGPU_TensorDesc: $TensorDesc, XeGPU_MaskType: $mask, + OptionalAttr: $transpose, OptionalAttr: $l1_hint, OptionalAttr: $l2_hint, OptionalAttr: $l3_hint); @@ -723,7 +759,7 @@ def XeGPU_DpasOp : XeGPU_Op<"dpas", [Pure, AllElementTypesMatch<["lhs", "rhs"]>] def XeGPU_AtomicRMWOp: XeGPU_Op<"atomic_rmw", [Pure, AllElementTypesMatch<["tensorDesc", "value", "result"]>, - AllShapesMatch<["tensorDesc", "mask", "value", "result"]>]> { + AllShapesMatch<["tensorDesc", "value", "result"]>]> { let summary = "Atomic ready-modify-write operation on the TensorDesc. "; let description = [{ @@ -808,7 +844,7 @@ def XeGPU_FenceOp: XeGPU_Op<"fence", []> { 2. `Fence_scope` describes the scope of fence. "Workgroup" means that the scope would be within each workgroup. "GPU" means the scope would be across workgroups within the GPU. 
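    Example (an illustrative sketch; the assembly spellings of the fence scope
    values are assumed here to be the lower-case `workgroup`/`gpu`):
    ```mlir
      xegpu.fence memory_kind = global, fence_scope = workgroup
    ```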
}]; - let arguments = (ins XeGPU_MemoryScopeAttr: $memory_kind, + let arguments = (ins XeGPU_MemorySpaceAttr: $memory_kind, XeGPU_FenceScopeAttr: $fence_scope); let assemblyFormat = [{`memory_kind` `=` `` $memory_kind `,` `fence_scope` `=` `` $fence_scope attr-dict}]; let extraClassDeclaration = extraBaseClassDeclaration; diff --git a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td index 9f101a71697b56..0ce1211664b5ba 100644 --- a/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td +++ b/mlir/include/mlir/Dialect/XeGPU/IR/XeGPUTypes.td @@ -48,7 +48,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", Similar to the builtin tensor, it also provides an optinal attribute to encoding the following information via the TensorDescAttr object: - * memory_scope (xegpu::MemoryScope): [optional] where the data is located, + * memory_space (xegpu::MemorySpace): [optional] where the data is located, global memory or shared memory. It is default to Global. * array_length (int): [optional] The number of contiguous blocks with size as `shape`, that will be loaded by block load at a time. It is default to 1. @@ -63,7 +63,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", element-type ::= float-type | integer-type | index-type dim-list := (static-dim-list `x`)? static-dim-list ::= decimal-literal `x` decimal-literal - attr-list = (, memory_scope = value)? (, arr_len = value)? (, boundary_check = value)? (, scattered = value)? + attr-list = (, memory_space = value)? (, arr_len = value)? (, boundary_check = value)? (, scattered = value)? ``` Examples: @@ -76,7 +76,7 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", xegpu.tensor_desc<8x16xf32> // A TensorDesc with 8x16 f32 elements for a memory region in shared memory space. 
- xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> + xegpu.tensor_desc<8x16xf32, #xegpu.tdesc_attr> ``` }]; @@ -88,11 +88,14 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", TypeBuilderWithInferredContext<(ins "llvm::ArrayRef": $shape, "mlir::Type": $elementType, - CArg<"bool", "false">: $scattered, CArg<"int", "1">: $array_length, - CArg<"xegpu::MemoryScope", "xegpu::MemoryScope::Global">:$memory_scope, - CArg<"bool", "true">: $boundary_check - )> + CArg<"bool", "true">: $boundary_check, + CArg<"xegpu::MemorySpace", "xegpu::MemorySpace::Global">:$memory_space)>, + TypeBuilderWithInferredContext<(ins + "llvm::ArrayRef": $shape, + "mlir::Type": $elementType, + CArg<"int", "1">: $chunk_size, + CArg<"xegpu::MemorySpace", "xegpu::MemorySpace::Global">:$memory_space)> ]; let extraClassDeclaration = [{ @@ -110,40 +113,58 @@ def XeGPU_TensorDesc: XeGPUTypeDef<"TensorDesc", "tensor_desc", return llvm::cast(cloneWith(getShape(), elementType)); } - TensorDescAttr getEncodingAsTensorDescAttr() const { - return llvm::dyn_cast_if_present(getEncoding()); + BlockTensorDescAttr getEncodingAsBlockTensorDescAttr() const { + return llvm::dyn_cast_if_present(getEncoding()); } - xegpu::MemoryScope getMemoryScope() const { - auto attr = getEncodingAsTensorDescAttr(); - if (attr && attr.getMemoryScope()) - return attr.getMemoryScope().getValue(); + ScatterTensorDescAttr getEncodingAsScatterTensorDescAttr() const { + return llvm::dyn_cast_if_present(getEncoding()); + } + + xegpu::MemorySpace getMemorySpace() const { + auto block_attr = getEncodingAsBlockTensorDescAttr(); + if (block_attr && block_attr.getMemorySpace()) + return block_attr.getMemorySpace().getValue(); + + auto scatter_attr = getEncodingAsScatterTensorDescAttr(); + if (scatter_attr && scatter_attr.getMemorySpace()) + return scatter_attr.getMemorySpace().getValue(); + // return default value - return MemoryScope::Global; + return MemorySpace::Global; } int getArrayLength() { - auto attr = getEncodingAsTensorDescAttr(); - if (attr && attr.getArrayLength()) - return attr.getArrayLength().getInt(); + auto attr = getEncoding(); + auto block_attr = mlir::dyn_cast_if_present(attr); + assert((!attr || block_attr) && "invalid on non BlockTensorDescAttr."); + if (block_attr && block_attr.getArrayLength()) + return block_attr.getArrayLength().getInt(); // return default value return 1; } bool getBoundaryCheck() { - auto attr = getEncodingAsTensorDescAttr(); - if (attr && attr.getBoundaryCheck()) - return attr.getBoundaryCheck().getValue(); + auto attr = getEncoding(); + auto block_attr = mlir::dyn_cast_if_present(attr); + assert((!attr || block_attr) && "invalid on non BlockTensorDescAttr."); + if (block_attr && block_attr.getBoundaryCheck()) + return block_attr.getBoundaryCheck().getValue(); // return default value return true; } - bool getScattered() { - auto attr = getEncodingAsTensorDescAttr(); - if (attr && attr.getScattered()) - return attr.getScattered().getValue(); - // return default value - return false; + bool isScattered() { + return bool(getEncodingAsScatterTensorDescAttr()); + } + + int getChunkSize() { + auto attr = getEncoding(); + auto scatter_attr = mlir::dyn_cast_if_present(attr); + assert((!attr || scatter_attr) && "invalid on non ScatterTensorDescAttr."); + if (scatter_attr && scatter_attr.getChunkSize()) + return scatter_attr.getChunkSize().getInt(); + return 1; } }]; diff --git a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp index be1581d619a8b1..fa034427655394 
100644 --- a/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp +++ b/mlir/lib/Conversion/VectorToXeGPU/VectorToXeGPU.cpp @@ -168,9 +168,8 @@ struct TransferReadLowering : public OpRewritePattern { if (isTransposeLoad) std::reverse(descShape.begin(), descShape.end()); auto descType = xegpu::TensorDescType::get( - descShape, elementType, /*scattered=*/false, /*array_length=*/1, - xegpu::MemoryScope::Global, - /*boundary_check=*/isOutOfBounds); + descShape, elementType, /*array_length=*/1, + /*boundary_check=*/isOutOfBounds, xegpu::MemorySpace::Global); xegpu::CreateNdDescOp ndDesc = createNdDescriptor(rewriter, loc, descType, @@ -212,10 +211,10 @@ struct TransferWriteLowering return rewriter.notifyMatchFailure(writeOp, "Expects identity map"); VectorType vecTy = writeOp.getVectorType(); - auto descType = xegpu::TensorDescType::get( - vecTy.getShape(), vecTy.getElementType(), - /*scattered=*/false, /*array_length=*/1, xegpu::MemoryScope::Global, - /*boundary_check=*/false); + auto descType = + xegpu::TensorDescType::get(vecTy.getShape(), vecTy.getElementType(), + /*array_length=*/1, /*boundary_check=*/false, + xegpu::MemorySpace::Global); xegpu::CreateNdDescOp ndDesc = createNdDescriptor( rewriter, loc, descType, dyn_cast>(writeOp.getSource()), diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp index 24719fe748fe4f..1dfbaed454c193 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUDialect.cpp @@ -30,23 +30,35 @@ void XeGPUDialect::initialize() { } //===----------------------------------------------------------------------===// -// XeGPU_TensorDescAttr +// XeGPU_BlockTensorDescAttr //===----------------------------------------------------------------------===// -TensorDescAttr TensorDescAttr::get(mlir::MLIRContext *context, - xegpu::MemoryScope memory_scope, - int array_length, bool boundary_check, - bool scattered) { - auto scopeAttr = MemoryScopeAttr::get(context, memory_scope); +BlockTensorDescAttr BlockTensorDescAttr::get(mlir::MLIRContext *context, + xegpu::MemorySpace memory_space, + int array_length, + bool boundary_check) { + auto scopeAttr = MemorySpaceAttr::get(context, memory_space); auto lengthAttr = IntegerAttr::get(IntegerType::get(context, 64), array_length); auto boundaryAttr = BoolAttr::get(context, boundary_check); - auto scatteredAttr = BoolAttr::get(context, scattered); - return Base::get(context, scopeAttr, lengthAttr, boundaryAttr, scatteredAttr); + return Base::get(context, scopeAttr, lengthAttr, boundaryAttr); +} + +//===----------------------------------------------------------------------===// +// XeGPU_ScatterTensorDescAttr +//===----------------------------------------------------------------------===// +ScatterTensorDescAttr +ScatterTensorDescAttr::get(mlir::MLIRContext *context, + xegpu::MemorySpace memory_space, int chunk_size) { + auto scopeAttr = MemorySpaceAttr::get(context, memory_space); + auto chunkSizeAttr = + IntegerAttr::get(IntegerType::get(context, 64), chunk_size); + return Base::get(context, scopeAttr, chunkSizeAttr); } //===----------------------------------------------------------------------===// // XeGPU_TensorDescType //===----------------------------------------------------------------------===// + mlir::Type TensorDescType::parse(::mlir::AsmParser &parser) { llvm::SmallVector shape; mlir::Type elementType; @@ -108,12 +120,20 @@ void TensorDescType::print(::mlir::AsmPrinter &printer) const { } TensorDescType TensorDescType::get(llvm::ArrayRef shape, - 
mlir::Type elementType, bool scattered, - int array_length, MemoryScope memory_scope, - bool boundary_check) { + mlir::Type elementType, int array_length, + bool boundary_check, + MemorySpace memory_space) { + auto context = elementType.getContext(); + auto attr = BlockTensorDescAttr::get(context, memory_space, array_length, + boundary_check); + return Base::get(context, shape, elementType, attr); +} + +TensorDescType TensorDescType::get(llvm::ArrayRef shape, + mlir::Type elementType, int chunk_size, + MemorySpace memory_space) { auto context = elementType.getContext(); - auto attr = TensorDescAttr::get(context, memory_scope, array_length, - boundary_check, scattered); + auto attr = ScatterTensorDescAttr::get(context, memory_space, chunk_size); return Base::get(context, shape, elementType, attr); } diff --git a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp index 9c517337a3aa57..1a7a6b34784099 100644 --- a/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp +++ b/mlir/lib/Dialect/XeGPU/IR/XeGPUOps.cpp @@ -124,6 +124,17 @@ LogicalResult CreateNdDescOp::verify() { bool invalidRank = false; bool invalidElemTy = false; + // Memory space of created TensorDesc should match with the source. + // Both source and TensorDesc are considered for global memory by default, + // if the memory scope attr is not specified. If source is an integer, + // it is considered as ptr to global memory. + auto srcMemorySpace = getSourceMemorySpace(); + auto tdescMemorySpace = static_cast(getType().getMemorySpace()); + if (srcMemorySpace != tdescMemorySpace) + return emitOpError("Memory space mismatch.") + << " Source: " << srcMemorySpace + << ", TensorDesc: " << tdescMemorySpace; + // check source type matches the rank if it is a memref. // It also should have the same ElementType as TensorDesc. auto memrefTy = dyn_cast(getSourceType()); @@ -152,9 +163,13 @@ LogicalResult CreateNdDescOp::verify() { return emitOpError("TensorDesc should have the same element " "type with the source if it is a memref.\n"); - if (getType().getScattered()) + if (getType().isScattered()) return emitOpError("Expects a non-scattered TensorDesc.\n"); + if (getType().getRank() == 2 && + tdescMemorySpace == static_cast(MemorySpace::SLM)) + return emitOpError("SLM is not supported for 2D Block TensorDesc.\n"); + return success(); } @@ -163,7 +178,7 @@ LogicalResult CreateNdDescOp::verify() { //===----------------------------------------------------------------------===// LogicalResult PrefetchNdOp::verify() { auto tdescTy = getTensorDescType(); - if (tdescTy.getScattered()) + if (tdescTy.isScattered()) return emitOpError("Expects a non-scattered TensorDesc.\n"); if (!isReadHintOrNone(getL1HintAttr())) @@ -188,7 +203,7 @@ LogicalResult LoadNdOp::verify() { if (tdescTy.getRank() > 2) return emitOpError("Expecting a 1D/2D TensorDesc.\n"); - if (tdescTy.getScattered()) + if (tdescTy.isScattered()) return emitOpError("Expects a non-scattered TensorDesc.\n"); if (!valueTy) @@ -228,8 +243,8 @@ LogicalResult LoadNdOp::verify() { tdescShape[axis] /= vnni_factor; tdescShape.push_back(vnni_factor); } else { - return emitWarning("Invalid Packed Attr. It is ignored (available for 2D " - "TensorDesc only)."); + emitWarning("Invalid Packed Attr. 
It is ignored (available for 2D " + "TensorDesc only)."); } } @@ -256,7 +271,7 @@ LogicalResult StoreNdOp::verify() { if (dstTy.getRank() > 2) return emitOpError("Expecting a 1D/2D TensorDesc.\n"); - if (dstTy.getScattered()) + if (dstTy.isScattered()) return emitOpError("Expects a non-scattered TensorDesc.\n"); if (!valTy) @@ -279,7 +294,7 @@ LogicalResult StoreNdOp::verify() { //===----------------------------------------------------------------------===// LogicalResult UpdateNdOffsetOp::verify() { auto ty = getTensorDescType(); - if (ty.getScattered()) + if (ty.isScattered()) return emitOpError("Expects a non-scattered TensorDesc.\n"); // number of offsets specified must match the rank of the tensor descriptor @@ -292,28 +307,55 @@ LogicalResult UpdateNdOffsetOp::verify() { //===----------------------------------------------------------------------===// // XeGPU_CreateDescOp //===----------------------------------------------------------------------===// -void CreateDescOp::build(OpBuilder &builder, OperationState &state, - TensorDescType TensorDesc, Value source, - llvm::ArrayRef offsets, - uint32_t chunk_size) { - llvm::SmallVector staticOffsets; - llvm::SmallVector dynamicOffsets; - dispatchIndexOpFoldResults(offsets, dynamicOffsets, staticOffsets); - build(builder, state, TensorDesc, source, dynamicOffsets, staticOffsets, - chunk_size); -} LogicalResult CreateDescOp::verify() { auto tdescTy = getTensorDescType(); - auto chunkSize = getChunkSize(); if (getRankOf(getSource()) > 1) return emitOpError( "Expecting the source is a 1D memref or pointer (uint64_t)."); - if (!tdescTy.getScattered()) + if (!tdescTy.isScattered()) return emitOpError("Expects a scattered TensorDesc.\n"); + // Memory space of created TensorDesc should match with the source. + // Both source and TensorDesc are considered for global memory by default, + // if the memory scope attr is not specified. If source is an integer, + // it is considered as ptr to global memory. + auto srcMemorySpace = getSourceMemorySpace(); + auto tdescMemorySpace = static_cast(tdescTy.getMemorySpace()); + if (srcMemorySpace != tdescMemorySpace) + return emitOpError("Memory space mismatch.") + << " Source: " << srcMemorySpace + << ", TensorDesc: " << tdescMemorySpace; + + auto chunkSize = tdescTy.getChunkSize(); + + // check chunk_size + llvm::SmallVector supportedChunkSizes = {1, 2, 3, 4, 8, + 16, 32, 64, 128, 256}; + if (!llvm::is_contained(supportedChunkSizes, chunkSize)) + return emitOpError("Invalid chunk_size. Supported values are 1, 2, 3, 4, " + "8, 16, 32, 64, 128, or 256."); + + // check total size + auto elemBits = tdescTy.getElementType().getIntOrFloatBitWidth(); + auto bitsPerLane = elemBits * chunkSize; + if (chunkSize > 1 && bitsPerLane % 32) { + // For 8-bit and 16-bit data, the hardware only supports chunk size of 1. + // For 32-bit data, the hardware can support larger larger chunk size. So + // we can bitcast 8-bit/16-bit data to 32-bit data for better performance. + // But this requires the total size is 32 bit aligned to make the + // optimization work. + return emitOpError( + "access size (chunk_size * sizeof(elemTy)) should be 32-bit aligned."); + } + + auto lscConstraints = 512 * 8; // each access is upto 512 bytes. 
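+  // Note: lscConstraints is expressed in bits (512 bytes * 8); it is compared
+  // against elemBits * getNumElements(), the total access size in bits.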
+ if (elemBits * tdescTy.getNumElements() > lscConstraints) + return emitOpError("total access size (simd_lanes * chunk_size * " + "sizeof(elemTy)) is upto 512 bytes."); + SmallVector shape({(int64_t)getNumOffsets()}); if (chunkSize != 1) shape.push_back(chunkSize); @@ -331,7 +373,7 @@ LogicalResult CreateDescOp::verify() { //===----------------------------------------------------------------------===// LogicalResult PrefetchOp::verify() { auto tdescTy = getTensorDescType(); - if (!tdescTy.getScattered()) + if (!tdescTy.isScattered()) return emitOpError("Expects a scattered TensorDesc.\n"); if (!isReadHintOrNone(getL1HintAttr())) @@ -354,7 +396,7 @@ LogicalResult LoadGatherOp::verify() { auto maskTy = getMaskType(); auto valueTy = getValueType(); - if (!tdescTy.getScattered()) + if (!tdescTy.isScattered()) return emitOpError("Expects a scattered TensorDesc.\n"); if (!isReadHintOrNone(getL1HintAttr())) @@ -379,12 +421,10 @@ LogicalResult LoadGatherOp::verify() { if (tdescShape[0] != maskShape[0]) return emitOpError("dim-0 of the Mask and TensorDesc should be the same."); - if (getTransposeAttr()) { - auto trans = getTranspose().value(); - if (tdescShape.size() < trans.size()) - emitWarning("Invalid transpose attr. It is ignored."); - else - transpose(trans, tdescShape); + if (tdescTy.getRank() == 2) { + if (!getTransposeAttr()) + return emitOpError("load_gather has to be transposed."); + transpose({1, 0}, tdescShape); } if (valueShape != tdescShape) @@ -400,7 +440,7 @@ LogicalResult LoadGatherOp::verify() { //===----------------------------------------------------------------------===// LogicalResult StoreScatterOp::verify() { auto tdescTy = getTensorDescType(); - if (!tdescTy.getScattered()) + if (!tdescTy.isScattered()) return emitOpError("Expects a scattered TensorDesc.\n"); if (!isWriteHintOrNone(getL1HintAttr())) @@ -413,11 +453,24 @@ LogicalResult StoreScatterOp::verify() { return emitOpError("invlid l3_hint: ") << getL3HintAttr(); auto maskTy = getMaskType(); + auto valueTy = getValueType(); auto maskShape = getShapeOf(maskTy); auto tdescShape = getShapeOf(tdescTy); + auto valueShape = getShapeOf(valueTy); if (tdescShape[0] != maskShape[0]) return emitOpError("dim-0 of the Mask and TensorDesc should be the same."); + if (tdescTy.getRank() == 2) { + if (!getTransposeAttr()) + return emitOpError("load_gather has to be transposed."); + transpose({1, 0}, tdescShape); + } + + if (valueShape != tdescShape) + return emitOpError("Unexpected value shape") + << "(Expected shape: " << makeString(tdescShape) + << ", Given shape: " << makeString(valueShape) << ").\n"; + return success(); } //===----------------------------------------------------------------------===// diff --git a/mlir/test/Dialect/XeGPU/XeGPUOps.mlir b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir index 35d44cf56a239b..c1126efb6046dc 100644 --- a/mlir/test/Dialect/XeGPU/XeGPUOps.mlir +++ b/mlir/test/Dialect/XeGPU/XeGPUOps.mlir @@ -24,8 +24,8 @@ gpu.func @test_create_nd_tdesc_vc_2(%src: ui64, %w : index, %h : index, %x : ind // CHECK: gpu.func @test_create_nd_tdesc_vc_3(%[[arg0:.*]]: memref<24x32xf32>) { gpu.func @test_create_nd_tdesc_vc_3(%src: memref<24x32xf32>) { - // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.tdesc_attr - %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.tdesc_attr> + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, 
#xegpu.block_tdesc_attr + %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf32> -> !xegpu.tensor_desc<24x16xf32, #xegpu.block_tdesc_attr> gpu.return } @@ -36,6 +36,13 @@ gpu.func @test_create_nd_tdesc_vc_4(%src: memref<2x24x32xf32>) { gpu.return } +// CHECK: gpu.func @test_create_nd_tdesc_vc_5(%[[arg0:.*]]: memref<2x24x32xf32, 3>) { +gpu.func @test_create_nd_tdesc_vc_5(%src: memref<2x24x32xf32, 3>) { + // CHECK: %[[REG:.*]] = xegpu.create_nd_tdesc %arg0[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr> + %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32, #xegpu.block_tdesc_attr> + gpu.return +} + // CHECK: gpu.func @test_prefetch_nd_vc(%[[arg0:.*]]: memref<24x32xf16>) { gpu.func @test_prefetch_nd_vc(%src: memref<24x32xf16>) { // CHECK: %[[R0:.*]] = xegpu.create_nd_tdesc %[[arg0]][0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> @@ -97,17 +104,24 @@ gpu.func @test_create_update_nd_tdesc_vc(%src: memref<24x32xf32>) { // CHECK: gpu.func @test_create_tdesc_vc(%[[arg0:.*]]: ui64) { gpu.func @test_create_tdesc_vc(%src: ui64) { - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] {chunk_size = 2 : i64} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> - %1 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> + %1 = xegpu.create_tdesc %src[0, 8, 16, 24] : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> + gpu.return +} + +// CHECK: gpu.func @test_create_tdesc_vc_1(%[[arg0:.*]]: memref) { +gpu.func @test_create_tdesc_vc_1(%src: memref) { + //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] : memref -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> + %1 = xegpu.create_tdesc %src[0, 8, 16, 24] : memref -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> gpu.return } // CHECK: gpu.func @test_prefetch_vc(%[[arg0:.*]]: ui64) { gpu.func @test_prefetch_vc(%src: ui64) { - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] {chunk_size = 2 : i64} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> - %1 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> - // CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> - xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> + %1 = xegpu.create_tdesc %src[0, 8, 16, 24] : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> + // CHECK: xegpu.prefetch %[[R0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> + xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> gpu.return } @@ -115,12 +129,12 @@ gpu.func @test_prefetch_vc(%src: ui64) { gpu.func @test_load_gather_vc(%src: ui64) { //CHECK: %[[cst:.*]] = arith.constant dense : vector<4xi1> %0 = arith.constant dense<1>: vector<4xi1> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] {chunk_size = 2 : i64} : ui64 -> !xegpu.tensor_desc<4x2xf32, 
#xegpu.tdesc_attr> - %1 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> - //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> - //CHECK-SAME: !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr>, vector<4xi1> -> vector<4x2xf32> - %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> - : !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr>, vector<4xi1> -> vector<4x2xf32> + //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> + %1 = xegpu.create_tdesc %src[0, 8, 16, 24] : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> + //CHECK: %[[R1:.*]] = xegpu.load %[[R0]], %[[cst]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose}> + //CHECK-SAME: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xi1> -> vector<2x4xf32> + %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose}> + : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xi1> -> vector<2x4xf32> gpu.return } @@ -128,23 +142,23 @@ gpu.func @test_load_gather_vc(%src: ui64) { gpu.func @test_store_scatter_vc(%src: ui64) { //CHECK: %[[c0:.*]] = arith.constant dense : vector<4xi1> %0 = arith.constant dense<1>: vector<4xi1> - //CHECK: %[[c1:.*]] = arith.constant dense<2.900000e+00> : vector<4x2xf32> - %1 = arith.constant dense<2.9>: vector<4x2xf32> - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] {chunk_size = 2 : i64} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> - %2 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> - //CHECK: xegpu.store %[[c1]], %[[R0]], %[[c0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> - //CHECK-SAME: vector<4x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr>, vector<4xi1> - xegpu.store %1, %2, %0 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint}> - : vector<4x2xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr>, vector<4xi1> + //CHECK: %[[c1:.*]] = arith.constant dense<2.900000e+00> : vector<2x4xf32> + %1 = arith.constant dense<2.9>: vector<2x4xf32> + //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> + %2 = xegpu.create_tdesc %src[0, 8, 16, 24] : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> + //CHECK: xegpu.store %[[c1]], %[[R0]], %[[c0]] <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose}> + //CHECK-SAME: vector<2x4xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xi1> + xegpu.store %1, %2, %0 <{l1_hint = #xegpu.cache_hint, l2_hint = #xegpu.cache_hint, transpose}> + : vector<2x4xf32>, !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xi1> gpu.return } // CHECK: gpu.func @test_create_update_tdesc_vc(%[[arg0:.*]]: ui64) { gpu.func @test_create_update_tdesc_vc(%src: ui64) { - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] {chunk_size = 2 : i64} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> - %1 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> - //CHECK: %[[R1:.*]] = xegpu.update_offset %[[R0]], [32, 32, 32, 32] : !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> - %2 = xegpu.update_offset %1, [32, 32, 32, 32] : !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + 
//CHECK: %[[R0:.*]] = xegpu.create_tdesc %arg0 [0, 8, 16, 24] : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> + %1 = xegpu.create_tdesc %src[0, 8, 16, 24]: ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> + //CHECK: %[[R1:.*]] = xegpu.update_offset %[[R0]], [32, 32, 32, 32] : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> + %2 = xegpu.update_offset %1, [32, 32, 32, 32] : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> gpu.return } @@ -165,10 +179,10 @@ gpu.func @test_dpas_vc_with_packed_b(%a : vector<8x16xf16>, %b: vector<8x16x2xf1 // CHECK: gpu.func @test_atomic_rmw(%[[arg0:.*]]: ui64, %[[arg1:.*]]: vector<16xf32>, %[[arg2:.*]]: vector<16xi1>) gpu.func @test_atomic_rmw(%src: ui64, %value : vector<16xf32>, %mask : vector<16xi1>) { - //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : ui64 -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - %1 = xegpu.create_tdesc %src[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]: ui64 -> !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr> - //CHECK: %[[R1:.*]] = xegpu.atomic_rmw addf %[[R0]], %[[arg2]], %[[arg1]] : !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1>, vector<16xf32> -> vector<16xf32> - xegpu.atomic_rmw addf %1, %mask, %value: !xegpu.tensor_desc<16xf32, #xegpu.tdesc_attr>, vector<16xi1>, vector<16xf32> -> vector<16xf32> + //CHECK: %[[R0:.*]] = xegpu.create_tdesc %[[arg0]] [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : ui64 -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + %1 = xegpu.create_tdesc %src[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]: ui64 -> !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>> + //CHECK: %[[R1:.*]] = xegpu.atomic_rmw addf %[[R0]], %[[arg2]], %[[arg1]] : !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>, vector<16xf32> -> vector<16xf32> + xegpu.atomic_rmw addf %1, %mask, %value: !xegpu.tensor_desc<16xf32, #xegpu.scatter_tdesc_attr<>>, vector<16xi1>, vector<16xf32> -> vector<16xf32> gpu.return } diff --git a/mlir/test/Dialect/XeGPU/invalid.mlir b/mlir/test/Dialect/XeGPU/invalid.mlir index 7ef50bb2b5fadf..193dae352e3707 100644 --- a/mlir/test/Dialect/XeGPU/invalid.mlir +++ b/mlir/test/Dialect/XeGPU/invalid.mlir @@ -15,6 +15,20 @@ func.func @test_create_nd_tdesc_vc_2(%src: memref<24x32xf32>) { return } +// ----- +func.func @test_create_nd_tdesc_vc_3(%src: memref<2x24x32xf32, 3>) { + // expected-error@+1 {{SLM is not supported for 2D Block TensorDesc}} + %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<8x16xf32, #xegpu.block_tdesc_attr> + return +} + +// ----- +func.func @test_create_nd_tdesc_vc_4(%src: memref<2x24x32xf32, 3>) { + // expected-error@+1 {{Memory space mismatch}} + %1 = xegpu.create_nd_tdesc %src[0, 0, 0] : memref<2x24x32xf32, 3> -> !xegpu.tensor_desc<16xf32> + return +} + // ----- func.func @test_prefetch_nd_vc_1(%src: memref<24x32xf16>) { %1 = xegpu.create_nd_tdesc %src[0, 0] : memref<24x32xf16> -> !xegpu.tensor_desc<8x16xf16> @@ -26,10 +40,10 @@ func.func @test_prefetch_nd_vc_1(%src: memref<24x32xf16>) { // ----- func.func @test_prefetch_nd_vc_2(%src: memref<24xf16>) { %1 = xegpu.create_tdesc %src[0, 1, 2, 3, 4, 5, 6, 7] - : memref<24xf16> -> !xegpu.tensor_desc<8xf16, #xegpu.tdesc_attr> + : memref<24xf16> -> !xegpu.tensor_desc<8xf16, #xegpu.scatter_tdesc_attr<>> // expected-error@+1 {{Expects a non-scattered TensorDesc}} xegpu.prefetch_nd %1 <{l1_hint = #xegpu.cache_hint}> - : 
!xegpu.tensor_desc<8xf16, #xegpu.tdesc_attr> + : !xegpu.tensor_desc<8xf16, #xegpu.scatter_tdesc_attr<>> return } @@ -44,11 +58,11 @@ func.func @test_load_nd_vc_1(%src: memref<8x16xf16>) { // ----- func.func @test_load_nd_vc_2(%src: memref<16xf16>) { - %1 = xegpu.create_tdesc %src[0, 2, 4, 6, 8, 10, 12, 14] {chunk_size = 2} - : memref<16xf16> -> !xegpu.tensor_desc<8x2xf16, #xegpu.tdesc_attr> + %1 = xegpu.create_tdesc %src[0, 2, 4, 6, 8, 10, 12, 14] + : memref<16xf16> -> !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr> // expected-error@+1 {{Expects a non-scattered TensorDesc.}} %2 = xegpu.load_nd %1 <{l1_hint = #xegpu.cache_hint}> - : !xegpu.tensor_desc<8x2xf16, #xegpu.tdesc_attr> -> vector<8x2xf16> + : !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr> -> vector<8x2xf16> return } @@ -73,28 +87,28 @@ func.func @test_store_nd_vc_1(%dst: memref<24x32xf16>) { // ----- func.func @test_store_nd_vc_2(%dst: memref<16xf16>) { %1 = arith.constant dense<1.0>: vector<8x2xf16> - %2 = xegpu.create_tdesc %dst[0, 2, 4, 6, 8, 10, 12, 14] {chunk_size = 2} - : memref<16xf16> -> !xegpu.tensor_desc<8x2xf16, #xegpu.tdesc_attr> + %2 = xegpu.create_tdesc %dst[0, 2, 4, 6, 8, 10, 12, 14] + : memref<16xf16> -> !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr> // expected-error@+1 {{Expects a non-scattered TensorDesc}} xegpu.store_nd %1, %2 <{l1_hint = #xegpu.cache_hint}> - : vector<8x2xf16>, !xegpu.tensor_desc<8x2xf16, #xegpu.tdesc_attr> + : vector<8x2xf16>, !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr> return } // ----- func.func @test_update_nd_offset_1(%dst: memref<16xf16>) { - %1 = xegpu.create_tdesc %dst[0, 2, 4, 6, 8, 10, 12, 14] {chunk_size = 2} - : memref<16xf16> -> !xegpu.tensor_desc<8x2xf16, #xegpu.tdesc_attr> + %1 = xegpu.create_tdesc %dst[0, 2, 4, 6, 8, 10, 12, 14] + : memref<16xf16> -> !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr> // expected-error@+1 {{Expects a non-scattered TensorDesc}} - xegpu.update_nd_offset %1, [0, 2] : !xegpu.tensor_desc<8x2xf16, #xegpu.tdesc_attr> + xegpu.update_nd_offset %1, [0, 2] : !xegpu.tensor_desc<8x2xf16, #xegpu.scatter_tdesc_attr> return } // ----- func.func @test_create_tdesc_vc_1(%src: ui64) { // expected-error@+1 {{Expects a scattered TensorDesc}} - %1 = xegpu.create_tdesc %src[0, 2, 4, 6, 8, 10, 12, 14] {chunk_size = 2} - : ui64 -> !xegpu.tensor_desc<8x2xf16> + %1 = xegpu.create_tdesc %src[0, 2, 4, 6, 8, 10, 12, 14] + : ui64 -> !xegpu.tensor_desc<8xf16> return } @@ -102,7 +116,14 @@ func.func @test_create_tdesc_vc_1(%src: ui64) { func.func @test_create_tdesc_vc_2(%src: ui64) { // expected-error@+1 {{Incorrect TensorDesc shape}} %1 = xegpu.create_tdesc %src[0, 2, 4, 6, 8, 10, 12, 14] {chunk_size = 2} - : ui64 -> !xegpu.tensor_desc<8x4xf16, #xegpu.tdesc_attr> + : ui64 -> !xegpu.tensor_desc<8x4xf16, #xegpu.scatter_tdesc_attr<>> + return +} + +// ----- +func.func @test_create_tdesc_vc_1(%src: memref) { + // expected-error@+1 {{Memory space mismatch}} + %1 = xegpu.create_tdesc %src[0, 8, 16, 24] : memref -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> return } @@ -116,9 +137,9 @@ func.func @test_prefetch_vc_1(%src: memref<24x32xf16>) { // ----- func.func @test_prefetch_vc_2(%src: ui64) { - %1 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + %1 = xegpu.create_tdesc %src[0, 8, 16, 24] : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> // expected-error@+1 {{invlid l1_hint: #xegpu.cache_hint}} - xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint}>: 
!xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + xegpu.prefetch %1 <{l1_hint = #xegpu.cache_hint}>: !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> return } @@ -135,11 +156,11 @@ func.func @test_load_gather_vc_1(%src: memref<24x32xf16>) { // ----- func.func @test_load_gather_vc_2(%src: ui64) { %0 = arith.constant dense<1>: vector<4xi1> - %1 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} : ui64 - -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + %1 = xegpu.create_tdesc %src[0, 8, 16, 24] : ui64 + -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> // expected-error@+1 {{invlid l1_hint: #xegpu.cache_hint}} %2 = xegpu.load %1, %0 <{l1_hint = #xegpu.cache_hint}> - : !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr>, vector<4xi1> + : !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xi1> -> vector<4x2xf32> return } @@ -159,11 +180,11 @@ func.func @test_store_scatter_vc_1(%src: memref<24x32xf32>) { func.func @test_store_scatter_vc_2(%src: ui64) { %0 = arith.constant dense<1>: vector<4xi1> %1 = arith.constant dense<2.9>: vector<4x2xf32> - %2 = xegpu.create_tdesc %src[0, 8, 16, 24] {chunk_size = 2} - : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr> + %2 = xegpu.create_tdesc %src[0, 8, 16, 24] + : ui64 -> !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr> // expected-error@+1 {{invlid l1_hint: #xegpu.cache_hint}} xegpu.store %1, %2, %0 <{l1_hint = #xegpu.cache_hint}> : vector<4x2xf32>, - !xegpu.tensor_desc<4x2xf32, #xegpu.tdesc_attr>, vector<4xi1> + !xegpu.tensor_desc<4x2xf32, #xegpu.scatter_tdesc_attr>, vector<4xi1> return } @@ -182,9 +203,9 @@ func.func @test_dpas_vc_2(%a : vector<8x8x2xf16>, %b: vector<8x16x2xf16>) { } // ----- -func.func @test_atomic_rmw(%src: ui64, %value : vector<16x8xf32>, %mask : vector<16xi1>) { - %1 = xegpu.create_tdesc %src[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] {chunk_size = 8}: ui64 -> !xegpu.tensor_desc<16x8xf32, #xegpu.tdesc_attr> - // expected-error@+1 {{failed to verify that all of {tensorDesc, mask, value, result} have same shape}} - xegpu.atomic_rmw addf %1, %mask, %value: !xegpu.tensor_desc<16x8xf32, #xegpu.tdesc_attr>, vector<16xi1>, vector<16x8xf32> -> vector<16x8xf32> - gpu.return +func.func @test_atomic_rmw(%src: ui64, %value : vector<16x4xf32>, %mask : vector<16xi1>) { + %1 = xegpu.create_tdesc %src[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15] : ui64 -> !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr> + // expected-error@+1 {{failed to verify that all of {tensorDesc, value, result} have same shape}} + xegpu.atomic_rmw addf %1, %mask, %value: !xegpu.tensor_desc<16x8xf32, #xegpu.scatter_tdesc_attr>, vector<16xi1>, vector<16x4xf32> -> vector<16x8xf32> + return } \ No newline at end of file From 36757613b73908f055674a8df0b51cc00aa04373 Mon Sep 17 00:00:00 2001 From: Alex MacLean Date: Tue, 24 Sep 2024 08:15:14 -0700 Subject: [PATCH 18/49] [NVVM] Upgrade nvvm.ptr.* intrinics to addrspace cast (#109710) Remove the following intrinsics which can be trivially replaced with an `addrspacecast` * llvm.nvvm.ptr.gen.to.global * llvm.nvvm.ptr.gen.to.shared * llvm.nvvm.ptr.gen.to.constant * llvm.nvvm.ptr.gen.to.local * llvm.nvvm.ptr.global.to.gen * llvm.nvvm.ptr.shared.to.gen * llvm.nvvm.ptr.constant.to.gen * llvm.nvvm.ptr.local.to.gen Also, cleanup the NVPTX lowering of `addrspacecast` making it more concise. 
--- llvm/docs/NVPTXUsage.rst | 63 ------------- llvm/docs/ReleaseNotes.rst | 12 +++ llvm/include/llvm/IR/IntrinsicsNVVM.td | 50 +++------- llvm/lib/IR/AutoUpgrade.cpp | 19 ++++ llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp | 58 ++++++------ llvm/lib/Target/NVPTX/NVPTXInstrInfo.td | 4 - llvm/lib/Target/NVPTX/NVPTXIntrinsics.td | 92 ++++++------------- .../Assembler/auto_upgrade_nvvm_intrinsics.ll | 35 +++++++ llvm/test/CodeGen/NVPTX/intrin-nocapture.ll | 21 ----- llvm/test/DebugInfo/NVPTX/debug-info.ll | 20 ++-- 10 files changed, 146 insertions(+), 228 deletions(-) delete mode 100644 llvm/test/CodeGen/NVPTX/intrin-nocapture.ll diff --git a/llvm/docs/NVPTXUsage.rst b/llvm/docs/NVPTXUsage.rst index 3a566bbac36233..8b0b05c0ea424e 100644 --- a/llvm/docs/NVPTXUsage.rst +++ b/llvm/docs/NVPTXUsage.rst @@ -127,69 +127,6 @@ Example: 64-bit PTX for CUDA Driver API: ``nvptx64-nvidia-cuda`` NVPTX Intrinsics ================ -Address Space Conversion ------------------------- - -'``llvm.nvvm.ptr.*.to.gen``' Intrinsics -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Syntax: -""""""" - -These are overloaded intrinsics. You can use these on any pointer types. - -.. code-block:: llvm - - declare ptr @llvm.nvvm.ptr.global.to.gen.p0.p1(ptr addrspace(1)) - declare ptr @llvm.nvvm.ptr.shared.to.gen.p0.p3(ptr addrspace(3)) - declare ptr @llvm.nvvm.ptr.constant.to.gen.p0.p4(ptr addrspace(4)) - declare ptr @llvm.nvvm.ptr.local.to.gen.p0.p5(ptr addrspace(5)) - -Overview: -""""""""" - -The '``llvm.nvvm.ptr.*.to.gen``' intrinsics convert a pointer in a non-generic -address space to a generic address space pointer. - -Semantics: -"""""""""" - -These intrinsics modify the pointer value to be a valid generic address space -pointer. - - -'``llvm.nvvm.ptr.gen.to.*``' Intrinsics -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Syntax: -""""""" - -These are overloaded intrinsics. You can use these on any pointer types. - -.. code-block:: llvm - - declare ptr addrspace(1) @llvm.nvvm.ptr.gen.to.global.p1.p0(ptr) - declare ptr addrspace(3) @llvm.nvvm.ptr.gen.to.shared.p3.p0(ptr) - declare ptr addrspace(4) @llvm.nvvm.ptr.gen.to.constant.p4.p0(ptr) - declare ptr addrspace(5) @llvm.nvvm.ptr.gen.to.local.p5.p0(ptr) - -Overview: -""""""""" - -The '``llvm.nvvm.ptr.gen.to.*``' intrinsics convert a pointer in the generic -address space to a pointer in the target address space. Note that these -intrinsics are only useful if the address space of the target address space of -the pointer is known. It is not legal to use address space conversion -intrinsics to convert a pointer from one non-generic address space to another -non-generic address space. - -Semantics: -"""""""""" - -These intrinsics modify the pointer value to be a valid pointer in the target -non-generic address space. 
- - Reading PTX Special Registers ----------------------------- diff --git a/llvm/docs/ReleaseNotes.rst b/llvm/docs/ReleaseNotes.rst index 27d6bc158b3c37..c85ea28ad9f8c7 100644 --- a/llvm/docs/ReleaseNotes.rst +++ b/llvm/docs/ReleaseNotes.rst @@ -69,6 +69,18 @@ Changes to the LLVM IR * ``llvm.nvvm.rotate.right.b64`` * ``llvm.nvvm.rotate.b64`` +* Remove the following intrinsics which can be replaced with an + ``addrspacecast``: + + * ``llvm.nvvm.ptr.gen.to.global`` + * ``llvm.nvvm.ptr.gen.to.shared`` + * ``llvm.nvvm.ptr.gen.to.constant`` + * ``llvm.nvvm.ptr.gen.to.local`` + * ``llvm.nvvm.ptr.global.to.gen`` + * ``llvm.nvvm.ptr.shared.to.gen`` + * ``llvm.nvvm.ptr.constant.to.gen`` + * ``llvm.nvvm.ptr.local.to.gen`` + Changes to LLVM infrastructure ------------------------------ diff --git a/llvm/include/llvm/IR/IntrinsicsNVVM.td b/llvm/include/llvm/IR/IntrinsicsNVVM.td index aa5294f5f9c909..7b8ffe417fccdb 100644 --- a/llvm/include/llvm/IR/IntrinsicsNVVM.td +++ b/llvm/include/llvm/IR/IntrinsicsNVVM.td @@ -30,10 +30,18 @@ // * llvm.nvvm.max.ui --> select(x ule y, x, y) // * llvm.nvvm.max.ull --> ibid. // * llvm.nvvm.h2f --> llvm.convert.to.fp16.f32 -// * llvm.nvvm.bitcast.f2i --> bitcast -// * llvm.nvvm.bitcast.i2f --> ibid. -// * llvm.nvvm.bitcast.d2ll --> ibid. -// * llvm.nvvm.bitcast.ll2d --> ibid. +// * llvm.nvvm.bitcast.f2i --> bitcast +// * llvm.nvvm.bitcast.i2f --> ibid. +// * llvm.nvvm.bitcast.d2ll --> ibid. +// * llvm.nvvm.bitcast.ll2d --> ibid. +// * llvm.nvvm.ptr.gen.to.global --> addrspacecast +// * llvm.nvvm.ptr.gen.to.shared --> ibid. +// * llvm.nvvm.ptr.gen.to.constant --> ibid. +// * llvm.nvvm.ptr.gen.to.local --> ibid. +// * llvm.nvvm.ptr.global.to.gen --> ibid. +// * llvm.nvvm.ptr.shared.to.gen --> ibid. +// * llvm.nvvm.ptr.constant.to.gen --> ibid. +// * llvm.nvvm.ptr.local.to.gen --> ibid. def llvm_global_ptr_ty : LLVMQualPointerType<1>; // (global)ptr def llvm_shared_ptr_ty : LLVMQualPointerType<3>; // (shared)ptr @@ -1602,40 +1610,6 @@ def int_nvvm_ldg_global_p : Intrinsic<[llvm_anyptr_ty], [IntrReadMem, IntrArgMemOnly, IntrNoCallback, IntrWillReturn, NoCapture>], "llvm.nvvm.ldg.global.p">; -// Use for generic pointers -// - These intrinsics are used to convert address spaces. -// - The input pointer and output pointer must have the same type, except for -// the address-space. (This restriction is not enforced here as there is -// currently no way to describe it). -// - This complements the llvm bitcast, which can be used to cast one type -// of pointer to another type of pointer, while the address space remains -// the same. 
-def int_nvvm_ptr_local_to_gen: DefaultAttrsIntrinsic<[llvm_anyptr_ty], - [llvm_anyptr_ty], [IntrNoMem, IntrSpeculatable], - "llvm.nvvm.ptr.local.to.gen">; -def int_nvvm_ptr_shared_to_gen: DefaultAttrsIntrinsic<[llvm_anyptr_ty], - [llvm_anyptr_ty], [IntrNoMem, IntrSpeculatable], - "llvm.nvvm.ptr.shared.to.gen">; -def int_nvvm_ptr_global_to_gen: DefaultAttrsIntrinsic<[llvm_anyptr_ty], - [llvm_anyptr_ty], [IntrNoMem, IntrSpeculatable], - "llvm.nvvm.ptr.global.to.gen">; -def int_nvvm_ptr_constant_to_gen: DefaultAttrsIntrinsic<[llvm_anyptr_ty], - [llvm_anyptr_ty], [IntrNoMem, IntrSpeculatable], - "llvm.nvvm.ptr.constant.to.gen">; - -def int_nvvm_ptr_gen_to_global: DefaultAttrsIntrinsic<[llvm_anyptr_ty], - [llvm_anyptr_ty], [IntrNoMem, IntrSpeculatable], - "llvm.nvvm.ptr.gen.to.global">; -def int_nvvm_ptr_gen_to_shared: DefaultAttrsIntrinsic<[llvm_anyptr_ty], - [llvm_anyptr_ty], [IntrNoMem, IntrSpeculatable], - "llvm.nvvm.ptr.gen.to.shared">; -def int_nvvm_ptr_gen_to_local: DefaultAttrsIntrinsic<[llvm_anyptr_ty], - [llvm_anyptr_ty], [IntrNoMem, IntrSpeculatable], - "llvm.nvvm.ptr.gen.to.local">; -def int_nvvm_ptr_gen_to_constant: DefaultAttrsIntrinsic<[llvm_anyptr_ty], - [llvm_anyptr_ty], [IntrNoMem, IntrSpeculatable], - "llvm.nvvm.ptr.gen.to.constant">; - // Used in nvvm internally to help address space opt and ptx code generation // This is for params that are passed to kernel functions by pointer by-val. def int_nvvm_ptr_gen_to_param: Intrinsic<[llvm_anyptr_ty], diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 3390d651d6c693..b84258398c1932 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -1275,6 +1275,16 @@ static bool upgradeIntrinsicFunction1(Function *F, Function *&NewFn, else if (Name.consume_front("rotate.")) // nvvm.rotate.{b32,b64,right.b64} Expand = Name == "b32" || Name == "b64" || Name == "right.b64"; + else if (Name.consume_front("ptr.gen.to.")) + // nvvm.ptr.gen.to.{local,shared,global,constant} + Expand = Name.starts_with("local") || Name.starts_with("shared") || + Name.starts_with("global") || Name.starts_with("constant"); + else if (Name.consume_front("ptr.")) + // nvvm.ptr.{local,shared,global,constant}.to.gen + Expand = + (Name.consume_front("local") || Name.consume_front("shared") || + Name.consume_front("global") || Name.consume_front("constant")) && + Name.starts_with(".to.gen"); else Expand = false; @@ -2338,6 +2348,15 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI, Value *ZExtShiftAmt = Builder.CreateZExt(CI->getOperand(1), Int64Ty); Rep = Builder.CreateIntrinsic(Int64Ty, Intrinsic::fshr, {Arg, Arg, ZExtShiftAmt}); + } else if ((Name.consume_front("ptr.gen.to.") && + (Name.starts_with("local") || Name.starts_with("shared") || + Name.starts_with("global") || Name.starts_with("constant"))) || + (Name.consume_front("ptr.") && + (Name.consume_front("local") || Name.consume_front("shared") || + Name.consume_front("global") || + Name.consume_front("constant")) && + Name.starts_with(".to.gen"))) { + Rep = Builder.CreateAddrSpaceCast(CI->getArgOperand(0), CI->getType()); } else { Intrinsic::ID IID = shouldUpgradeNVPTXBF16Intrinsic(Name); if (IID != Intrinsic::not_intrinsic && diff --git a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp index 56c96ea943b89d..7f942de74bdcc9 100644 --- a/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXISelDAGToDAG.cpp @@ -1109,11 +1109,21 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { 
AddrSpaceCastSDNode *CastN = cast(N); unsigned SrcAddrSpace = CastN->getSrcAddressSpace(); unsigned DstAddrSpace = CastN->getDestAddressSpace(); + SDLoc DL(N); assert(SrcAddrSpace != DstAddrSpace && "addrspacecast must be between different address spaces"); if (DstAddrSpace == ADDRESS_SPACE_GENERIC) { // Specific to generic + + if (TM.is64Bit() && TM.getPointerSizeInBits(SrcAddrSpace) == 32) { + SDValue CvtNone = + CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32); + SDNode *Cvt = CurDAG->getMachineNode(NVPTX::CVT_u64_u32, DL, MVT::i64, + Src, CvtNone); + Src = SDValue(Cvt, 0); + } + unsigned Opc; switch (SrcAddrSpace) { default: report_fatal_error("Bad address space in addrspacecast"); @@ -1121,26 +1131,16 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { Opc = TM.is64Bit() ? NVPTX::cvta_global_64 : NVPTX::cvta_global; break; case ADDRESS_SPACE_SHARED: - Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(SrcAddrSpace) == 32 - ? NVPTX::cvta_shared_6432 - : NVPTX::cvta_shared_64) - : NVPTX::cvta_shared; + Opc = TM.is64Bit() ? NVPTX::cvta_shared_64 : NVPTX::cvta_shared; break; case ADDRESS_SPACE_CONST: - Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(SrcAddrSpace) == 32 - ? NVPTX::cvta_const_6432 - : NVPTX::cvta_const_64) - : NVPTX::cvta_const; + Opc = TM.is64Bit() ? NVPTX::cvta_const_64 : NVPTX::cvta_const; break; case ADDRESS_SPACE_LOCAL: - Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(SrcAddrSpace) == 32 - ? NVPTX::cvta_local_6432 - : NVPTX::cvta_local_64) - : NVPTX::cvta_local; + Opc = TM.is64Bit() ? NVPTX::cvta_local_64 : NVPTX::cvta_local; break; } - ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0), - Src)); + ReplaceNode(N, CurDAG->getMachineNode(Opc, DL, N->getValueType(0), Src)); return; } else { // Generic to specific @@ -1153,30 +1153,28 @@ void NVPTXDAGToDAGISel::SelectAddrSpaceCast(SDNode *N) { Opc = TM.is64Bit() ? NVPTX::cvta_to_global_64 : NVPTX::cvta_to_global; break; case ADDRESS_SPACE_SHARED: - Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(DstAddrSpace) == 32 - ? NVPTX::cvta_to_shared_3264 - : NVPTX::cvta_to_shared_64) - : NVPTX::cvta_to_shared; + Opc = TM.is64Bit() ? NVPTX::cvta_to_shared_64 : NVPTX::cvta_to_shared; break; case ADDRESS_SPACE_CONST: - Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(DstAddrSpace) == 32 - ? NVPTX::cvta_to_const_3264 - : NVPTX::cvta_to_const_64) - : NVPTX::cvta_to_const; + Opc = TM.is64Bit() ? NVPTX::cvta_to_const_64 : NVPTX::cvta_to_const; break; case ADDRESS_SPACE_LOCAL: - Opc = TM.is64Bit() ? (TM.getPointerSizeInBits(DstAddrSpace) == 32 - ? NVPTX::cvta_to_local_3264 - : NVPTX::cvta_to_local_64) - : NVPTX::cvta_to_local; + Opc = TM.is64Bit() ? NVPTX::cvta_to_local_64 : NVPTX::cvta_to_local; break; case ADDRESS_SPACE_PARAM: - Opc = TM.is64Bit() ? NVPTX::nvvm_ptr_gen_to_param_64 - : NVPTX::nvvm_ptr_gen_to_param; + Opc = TM.is64Bit() ? 
NVPTX::IMOV64rr : NVPTX::IMOV32rr; break; } - ReplaceNode(N, CurDAG->getMachineNode(Opc, SDLoc(N), N->getValueType(0), - Src)); + + SDNode *CVTA = CurDAG->getMachineNode(Opc, DL, N->getValueType(0), Src); + if (TM.is64Bit() && TM.getPointerSizeInBits(DstAddrSpace) == 32) { + SDValue CvtNone = + CurDAG->getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32); + CVTA = CurDAG->getMachineNode(NVPTX::CVT_u32_u64, DL, MVT::i32, + SDValue(CVTA, 0), CvtNone); + } + + ReplaceNode(N, CVTA); return; } } diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td index f6bbf4c2ffc02f..c3a8a774673f24 100644 --- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -174,10 +174,6 @@ def hasSM90a : Predicate<"Subtarget->getFullSmVersion() == 901">; def hasSHFL : Predicate<"!(Subtarget->getSmVersion() >= 70" "&& Subtarget->getPTXVersion() >= 64)">; -def useShortPtrLocal : Predicate<"TM.is64Bit() && TM.getPointerSizeInBits(ADDRESS_SPACE_LOCAL) == 32">; -def useShortPtrShared : Predicate<"TM.is64Bit() && TM.getPointerSizeInBits(ADDRESS_SPACE_SHARED) == 32">; -def useShortPtrConst : Predicate<"TM.is64Bit() && TM.getPointerSizeInBits(ADDRESS_SPACE_CONST) == 32">; - def useFP16Math: Predicate<"Subtarget->allowFP16Math()">; def hasBF16Math: Predicate<"Subtarget->hasBF16Math()">; diff --git a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td index 2688cfbe5e824f..042b0965ea33ff 100644 --- a/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td +++ b/llvm/lib/Target/NVPTX/NVPTXIntrinsics.td @@ -2537,59 +2537,45 @@ defm INT_PTX_LDG_G_v4f32_ELE : VLDG_G_ELE_V4<"v4.f32 \t{{$dst1, $dst2, $dst3, $dst4}}, [$src];", Float32Regs>; -multiclass NG_TO_G { +multiclass NG_TO_G { def "" : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src), - !strconcat("cvta.", Str, ".u32 \t$result, $src;"), - [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>; + "cvta." # Str # ".u32 \t$result, $src;", []>; def _64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src), - !strconcat("cvta.", Str, ".u64 \t$result, $src;"), - [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>; - def _6432 : NVPTXInst<(outs Int64Regs:$result), (ins Int32Regs:$src), - "{{ .reg .b64 %tmp;\n\t" - #" cvt.u64.u32 \t%tmp, $src;\n\t" - #" cvta." # Str # ".u64 \t$result, %tmp; }}", - [(set Int64Regs:$result, (Intrin Int32Regs:$src))]>, - Requires<[ShortPtr]>; + "cvta." # Str # ".u64 \t$result, $src;", []>; } -multiclass G_TO_NG { +multiclass G_TO_NG { def "" : NVPTXInst<(outs Int32Regs:$result), (ins Int32Regs:$src), - !strconcat("cvta.to.", Str, ".u32 \t$result, $src;"), - [(set Int32Regs:$result, (Intrin Int32Regs:$src))]>; + "cvta.to." # Str # ".u32 \t$result, $src;", []>; def _64 : NVPTXInst<(outs Int64Regs:$result), (ins Int64Regs:$src), - !strconcat("cvta.to.", Str, ".u64 \t$result, $src;"), - [(set Int64Regs:$result, (Intrin Int64Regs:$src))]>; - def _3264 : NVPTXInst<(outs Int32Regs:$result), (ins Int64Regs:$src), - "{{ .reg .b64 %tmp;\n\t" - #" cvta.to." 
# Str # ".u64 \t%tmp, $src;\n\t" - #" cvt.u32.u64 \t$result, %tmp; }}", - [(set Int32Regs:$result, (Intrin Int64Regs:$src))]>, - Requires<[ShortPtr]>; -} - -defm cvta_local : NG_TO_G<"local", int_nvvm_ptr_local_to_gen, useShortPtrLocal>; -defm cvta_shared : NG_TO_G<"shared", int_nvvm_ptr_shared_to_gen, useShortPtrShared>; -defm cvta_global : NG_TO_G<"global", int_nvvm_ptr_global_to_gen, False>; -defm cvta_const : NG_TO_G<"const", int_nvvm_ptr_constant_to_gen, useShortPtrConst>; -defm cvta_param : NG_TO_G<"param", int_nvvm_ptr_param_to_gen, False>; - -defm cvta_to_local : G_TO_NG<"local", int_nvvm_ptr_gen_to_local, useShortPtrLocal>; -defm cvta_to_shared : G_TO_NG<"shared", int_nvvm_ptr_gen_to_shared, useShortPtrShared>; -defm cvta_to_global : G_TO_NG<"global", int_nvvm_ptr_gen_to_global, False>; -defm cvta_to_const : G_TO_NG<"const", int_nvvm_ptr_gen_to_constant, useShortPtrConst>; + "cvta.to." # Str # ".u64 \t$result, $src;", []>; +} + +defm cvta_local : NG_TO_G<"local">; +defm cvta_shared : NG_TO_G<"shared">; +defm cvta_global : NG_TO_G<"global">; +defm cvta_const : NG_TO_G<"const">; + +defm cvta_to_local : G_TO_NG<"local">; +defm cvta_to_shared : G_TO_NG<"shared">; +defm cvta_to_global : G_TO_NG<"global">; +defm cvta_to_const : G_TO_NG<"const">; + +// nvvm.ptr.param.to.gen +defm cvta_param : NG_TO_G<"param">; + +def : Pat<(int_nvvm_ptr_param_to_gen Int32Regs:$src), + (cvta_param Int32Regs:$src)>; + +def : Pat<(int_nvvm_ptr_param_to_gen Int64Regs:$src), + (cvta_param_64 Int64Regs:$src)>; // nvvm.ptr.gen.to.param -def nvvm_ptr_gen_to_param : NVPTXInst<(outs Int32Regs:$result), - (ins Int32Regs:$src), - "mov.u32 \t$result, $src;", - [(set Int32Regs:$result, - (int_nvvm_ptr_gen_to_param Int32Regs:$src))]>; -def nvvm_ptr_gen_to_param_64 : NVPTXInst<(outs Int64Regs:$result), - (ins Int64Regs:$src), - "mov.u64 \t$result, $src;", - [(set Int64Regs:$result, - (int_nvvm_ptr_gen_to_param Int64Regs:$src))]>; +def : Pat<(int_nvvm_ptr_gen_to_param Int32Regs:$src), + (IMOV32rr Int32Regs:$src)>; +def : Pat<(int_nvvm_ptr_gen_to_param Int64Regs:$src), + (IMOV64rr Int64Regs:$src)>; // nvvm.move intrinsicc def nvvm_move_i16 : NVPTXInst<(outs Int16Regs:$r), (ins Int16Regs:$s), @@ -2632,24 +2618,6 @@ def nvvm_move_sym64 : NVPTXInst<(outs Int64Regs:$r), (ins imem:$s), [(set Int64Regs:$r, (int_nvvm_move_ptr texternalsym:$s))]>;*/ - -// MoveParam %r1, param -// ptr_local_to_gen %r2, %r1 -// ptr_gen_to_local %r3, %r2 -// -> -// mov %r1, param - -// @TODO: Revisit this. There is a type -// contradiction between iPTRAny and iPTR for the addr defs, so the move_sym -// instructions are not currently defined. However, we can use the ptr -// variants and the asm printer will do the right thing. 
-def : Pat<(i64 (int_nvvm_ptr_gen_to_local (int_nvvm_ptr_local_to_gen - (MoveParam texternalsym:$src)))), - (nvvm_move_ptr64 texternalsym:$src)>; -def : Pat<(i32 (int_nvvm_ptr_gen_to_local (int_nvvm_ptr_local_to_gen - (MoveParam texternalsym:$src)))), - (nvvm_move_ptr32 texternalsym:$src)>; - def texsurf_handles : NVPTXInst<(outs Int64Regs:$result), (ins imem:$src), "mov.u64 \t$result, $src;", []>; diff --git a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll index 43ac246055da7b..584c0ef7cfeb78 100644 --- a/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll +++ b/llvm/test/Assembler/auto_upgrade_nvvm_intrinsics.ll @@ -35,6 +35,15 @@ declare i32 @llvm.nvvm.rotate.b32(i32, i32) declare i64 @llvm.nvvm.rotate.right.b64(i64, i32) declare i64 @llvm.nvvm.rotate.b64(i64, i32) +declare ptr addrspace(1) @llvm.nvvm.ptr.gen.to.global.p1.p0(ptr) +declare ptr addrspace(3) @llvm.nvvm.ptr.gen.to.shared.p3.p0(ptr) +declare ptr addrspace(4) @llvm.nvvm.ptr.gen.to.constant.p4.p0(ptr) +declare ptr addrspace(5) @llvm.nvvm.ptr.gen.to.local.p5.p0(ptr) +declare ptr @llvm.nvvm.ptr.global.to.gen.p0.p1(ptr addrspace(1)) +declare ptr @llvm.nvvm.ptr.shared.to.gen.p0.p3(ptr addrspace(3)) +declare ptr @llvm.nvvm.ptr.constant.to.gen.p0.p4(ptr addrspace(4)) +declare ptr @llvm.nvvm.ptr.local.to.gen.p0.p5(ptr addrspace(5)) + ; CHECK-LABEL: @simple_upgrade define void @simple_upgrade(i32 %a, i64 %b, i16 %c) { ; CHECK: call i32 @llvm.bitreverse.i32(i32 %a) @@ -156,3 +165,29 @@ define void @rotate(i32 %a, i64 %b) { %r3 = call i64 @llvm.nvvm.rotate.b64(i64 %b, i32 8) ret void } + +; CHECK-LABEL: @addrspacecast +define void @addrspacecast(ptr %p0) { +; CHECK: %1 = addrspacecast ptr %p0 to ptr addrspace(1) +; CHECK: %2 = addrspacecast ptr addrspace(1) %1 to ptr +; CHECK: %3 = addrspacecast ptr %2 to ptr addrspace(3) +; CHECK: %4 = addrspacecast ptr addrspace(3) %3 to ptr +; CHECK: %5 = addrspacecast ptr %4 to ptr addrspace(4) +; CHECK: %6 = addrspacecast ptr addrspace(4) %5 to ptr +; CHECK: %7 = addrspacecast ptr %6 to ptr addrspace(5) +; CHECK: %8 = addrspacecast ptr addrspace(5) %7 to ptr +; + %p1 = call ptr addrspace(1) @llvm.nvvm.ptr.gen.to.global.p1.p0(ptr %p0) + %p2 = call ptr @llvm.nvvm.ptr.global.to.gen.p0.p1(ptr addrspace(1) %p1) + + %p3 = call ptr addrspace(3) @llvm.nvvm.ptr.gen.to.shared.p3.p0(ptr %p2) + %p4 = call ptr @llvm.nvvm.ptr.shared.to.gen.p0.p3(ptr addrspace(3) %p3) + + %p5 = call ptr addrspace(4) @llvm.nvvm.ptr.gen.to.constant.p4.p0(ptr %p4) + %p6 = call ptr @llvm.nvvm.ptr.constant.to.gen.p0.p4(ptr addrspace(4) %p5) + + %p7 = call ptr addrspace(5) @llvm.nvvm.ptr.gen.to.local.p5.p0(ptr %p6) + %p8 = call ptr @llvm.nvvm.ptr.local.to.gen.p0.p5(ptr addrspace(5) %p7) + + ret void +} diff --git a/llvm/test/CodeGen/NVPTX/intrin-nocapture.ll b/llvm/test/CodeGen/NVPTX/intrin-nocapture.ll deleted file mode 100644 index 040bbde13800cd..00000000000000 --- a/llvm/test/CodeGen/NVPTX/intrin-nocapture.ll +++ /dev/null @@ -1,21 +0,0 @@ -; RUN: opt < %s -O3 -S | FileCheck %s - -; Address space intrinsics were erroneously marked NoCapture, leading to bad -; optimizations (such as the store below being eliminated as dead code). This -; test makes sure we don't regress. 
- -declare void @foo(ptr addrspace(1)) - -declare ptr addrspace(1) @llvm.nvvm.ptr.gen.to.global.p1.p0(ptr) - -; CHECK: @bar -define void @bar() { - %t1 = alloca i32 -; CHECK: call ptr addrspace(1) @llvm.nvvm.ptr.gen.to.global.p1.p0(ptr nonnull %t1) -; CHECK-NEXT: store i32 10, ptr %t1 - %t2 = call ptr addrspace(1) @llvm.nvvm.ptr.gen.to.global.p1.p0(ptr %t1) - store i32 10, ptr %t1 - call void @foo(ptr addrspace(1) %t2) - ret void -} - diff --git a/llvm/test/DebugInfo/NVPTX/debug-info.ll b/llvm/test/DebugInfo/NVPTX/debug-info.ll index 9948925db57c9f..922a420820f469 100644 --- a/llvm/test/DebugInfo/NVPTX/debug-info.ll +++ b/llvm/test/DebugInfo/NVPTX/debug-info.ll @@ -25,6 +25,10 @@ ; CHECK-DAG: .reg .b64 %rd<8>; ; CHECK: .loc [[DEBUG_INFO_CU:[0-9]+]] 5 0 ; CHECK: ld.param.u32 %r{{.+}}, [{{.+}}]; +; CHECK: ld.param.u64 %rd{{.+}}, [{{.+}}]; +; CHECK: cvta.to.global.u64 %rd{{.+}}, %rd{{.+}}; +; CHECK: ld.param.u64 %rd{{.+}}, [{{.+}}]; +; CHECK: cvta.to.global.u64 %rd{{.+}}, %rd{{.+}}; ; CHECK: .loc [[BUILTUIN_VARS_H:[0-9]+]] 78 180 ; CHECK: mov.u32 %r{{.+}}, %ctaid.x; ; CHECK: .loc [[BUILTUIN_VARS_H]] 89 180 @@ -38,10 +42,6 @@ ; CHECK: .loc [[DEBUG_INFO_CU]] 7 7 ; CHECK: @%p{{.+}} bra [[BB:\$L__.+]]; ; CHECK: ld.param.f32 %f{{.+}}, [{{.+}}]; -; CHECK: ld.param.u64 %rd{{.+}}, [{{.+}}]; -; CHECK: cvta.to.global.u64 %rd{{.+}}, %rd{{.+}}; -; CHECK: ld.param.u64 %rd{{.+}}, [{{.+}}]; -; CHECK: cvta.to.global.u64 %rd{{.+}}, %rd{{.+}}; ; CHECK: .loc [[DEBUG_INFO_CU]] 8 13 ; CHECK: mul.wide.u32 %rd{{.+}}, %r{{.+}}, 4; ; CHECK: add.s64 %rd{{.+}}, %rd{{.+}}, %rd{{.+}}; @@ -2661,22 +2661,22 @@ if.end: ; preds = %if.then, %entry ; CHECK-NEXT:.b32 4579 // DW_AT_type ; CHECK-NEXT:.b8 25 // Abbrev [25] 0x8aa:0x18 DW_TAG_inlined_subroutine ; CHECK-NEXT:.b32 707 // DW_AT_abstract_origin -; CHECK-NEXT:.b64 $L__tmp0 // DW_AT_low_pc -; CHECK-NEXT:.b64 $L__tmp1 // DW_AT_high_pc +; CHECK-NEXT:.b64 $L__tmp1 // DW_AT_low_pc +; CHECK-NEXT:.b64 $L__tmp2 // DW_AT_high_pc ; CHECK-NEXT:.b8 1 // DW_AT_call_file ; CHECK-NEXT:.b8 6 // DW_AT_call_line ; CHECK-NEXT:.b8 11 // DW_AT_call_column ; CHECK-NEXT:.b8 25 // Abbrev [25] 0x8c2:0x18 DW_TAG_inlined_subroutine ; CHECK-NEXT:.b32 1466 // DW_AT_abstract_origin -; CHECK-NEXT:.b64 $L__tmp1 // DW_AT_low_pc -; CHECK-NEXT:.b64 $L__tmp2 // DW_AT_high_pc +; CHECK-NEXT:.b64 $L__tmp2 // DW_AT_low_pc +; CHECK-NEXT:.b64 $L__tmp3 // DW_AT_high_pc ; CHECK-NEXT:.b8 1 // DW_AT_call_file ; CHECK-NEXT:.b8 6 // DW_AT_call_line ; CHECK-NEXT:.b8 24 // DW_AT_call_column ; CHECK-NEXT:.b8 25 // Abbrev [25] 0x8da:0x18 DW_TAG_inlined_subroutine ; CHECK-NEXT:.b32 2060 // DW_AT_abstract_origin -; CHECK-NEXT:.b64 $L__tmp2 // DW_AT_low_pc -; CHECK-NEXT:.b64 $L__tmp3 // DW_AT_high_pc +; CHECK-NEXT:.b64 $L__tmp3 // DW_AT_low_pc +; CHECK-NEXT:.b64 $L__tmp4 // DW_AT_high_pc ; CHECK-NEXT:.b8 1 // DW_AT_call_file ; CHECK-NEXT:.b8 6 // DW_AT_call_line ; CHECK-NEXT:.b8 37 // DW_AT_call_column From bcbdf7ad6b571d11c102d018c78ee0fbf71e3e2c Mon Sep 17 00:00:00 2001 From: Philip Reames Date: Tue, 24 Sep 2024 08:07:50 -0700 Subject: [PATCH 19/49] [RISCV][TTI/SLP] Add test coverage for select of constants costing Provides coverage for an upcoming change which accounts for the cost of materializing the vector constants in the vector select. 
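
Roughly, the distinction being costed (a sketch; exact costs depend on the
subtarget): a splat or a simple step pattern can be materialized cheaply on
RISC-V, while two arbitrary constant vectors are comparatively expensive,
e.g.

  %cheap  = select i1 %c, <2 x i64> <i64 128, i64 128>, <2 x i64> zeroinitializer
  %costly = select i1 %c, <2 x i64> <i64 128, i64 7>, <2 x i64> <i64 5, i64 17>

The constant values above are illustrative; see the cases added to
rvv-select.ll below.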
--- .../Analysis/CostModel/RISCV/rvv-select.ll | 25 +++++++++ .../RISCV/select-profitability.ll | 55 +++++++++++++++++++ 2 files changed, 80 insertions(+) create mode 100644 llvm/test/Transforms/SLPVectorizer/RISCV/select-profitability.ll diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-select.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-select.ll index 13994c46335dee..9eadcaca6bb55e 100644 --- a/llvm/test/Analysis/CostModel/RISCV/rvv-select.ll +++ b/llvm/test/Analysis/CostModel/RISCV/rvv-select.ll @@ -390,3 +390,28 @@ define void @select() { ret void } + +define void @select_of_constants() { +; CHECK-LABEL: 'select_of_constants' +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %1 = select i1 undef, <2 x i64> , <2 x i64> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %2 = select i1 undef, <2 x i64> , <2 x i64> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %3 = select i1 undef, <2 x i64> , <2 x i64> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %4 = select i1 undef, <2 x i64> , <2 x i64> +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = select <4 x i1> undef, <4 x i32> , <4 x i32> zeroinitializer +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + ; Splat constants + select i1 undef, <2 x i64> , <2 x i64> zeroinitializer + ; LHS is a VID patern + select i1 undef, <2 x i64> , <2 x i64> zeroinitializer + select i1 undef, <2 x i64> , <2 x i64> zeroinitializer + ; 2x general (expensive) constants + select i1 undef, <2 x i64> , <2 x i64> + + ; powers of two (still expensive) + select <4 x i1> undef, <4 x i32> , <4 x i32> zeroinitializer + + ret void +} + + diff --git a/llvm/test/Transforms/SLPVectorizer/RISCV/select-profitability.ll b/llvm/test/Transforms/SLPVectorizer/RISCV/select-profitability.ll new file mode 100644 index 00000000000000..4496b19fa200c5 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/RISCV/select-profitability.ll @@ -0,0 +1,55 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S --passes=slp-vectorizer -mtriple=riscv64-unknown-linux -mattr=+v < %s | FileCheck %s + +define i32 @pow2_zero_constant_shift(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i16 zeroext %d) { +; CHECK-LABEL: define i32 @pow2_zero_constant_shift( +; CHECK-SAME: i16 zeroext [[A:%.*]], i16 zeroext [[B:%.*]], i16 zeroext [[C:%.*]], i16 zeroext [[D:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[B]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[C]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i16> [[TMP3]], i16 [[D]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i16> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> , <4 x i32> zeroinitializer +; CHECK-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP6]]) +; CHECK-NEXT: ret i32 [[TMP7]] +; + %t39.i0 = icmp eq i16 %a, 1 + %t39.i1 = icmp eq i16 %b, 1 + %t39.i2 = icmp eq i16 %c, 1 + %t39.i3 = icmp eq i16 %d, 1 + %t40.i0 = select i1 %t39.i0, i32 65536, i32 0 + %t40.i1 = select i1 %t39.i1, i32 65536, i32 0 + %t40.i2 = select i1 %t39.i2, i32 65536, i32 0 + %t40.i3 = select i1 %t39.i3, i32 65536, i32 0 + %or.rdx0 = or i32 %t40.i0, %t40.i1 + %or.rdx1 = or i32 
%t40.i2, %t40.i3 + %or.rdx2 = or i32 %or.rdx0, %or.rdx1 + ret i32 %or.rdx2 +} + +; TODO: This case is unprofitable, and we should not be vectorizing this. +define i32 @pow2_zero_variable_shift(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i16 zeroext %d) { +; CHECK-LABEL: define i32 @pow2_zero_variable_shift( +; CHECK-SAME: i16 zeroext [[A:%.*]], i16 zeroext [[B:%.*]], i16 zeroext [[C:%.*]], i16 zeroext [[D:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i16> poison, i16 [[A]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i16> [[TMP1]], i16 [[B]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = insertelement <4 x i16> [[TMP2]], i16 [[C]], i32 2 +; CHECK-NEXT: [[TMP4:%.*]] = insertelement <4 x i16> [[TMP3]], i16 [[D]], i32 3 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq <4 x i16> [[TMP4]], +; CHECK-NEXT: [[TMP6:%.*]] = select <4 x i1> [[TMP5]], <4 x i32> , <4 x i32> zeroinitializer +; CHECK-NEXT: [[OR_RDX2:%.*]] = call i32 @llvm.vector.reduce.or.v4i32(<4 x i32> [[TMP6]]) +; CHECK-NEXT: ret i32 [[OR_RDX2]] +; + %t39.i0 = icmp eq i16 %a, 1 + %t39.i1 = icmp eq i16 %b, 1 + %t39.i2 = icmp eq i16 %c, 1 + %t39.i3 = icmp eq i16 %d, 1 + %t40.i0 = select i1 %t39.i0, i32 524288, i32 0 + %t40.i1 = select i1 %t39.i1, i32 262144, i32 0 + %t40.i2 = select i1 %t39.i2, i32 131072, i32 0 + %t40.i3 = select i1 %t39.i3, i32 65536, i32 0 + %or.rdx0 = or i32 %t40.i0, %t40.i1 + %or.rdx1 = or i32 %t40.i2, %t40.i3 + %or.rdx2 = or i32 %or.rdx0, %or.rdx1 + ret i32 %or.rdx2 +} From 31ac400e451b5dcbf11a3ece94d58dc3edf24e14 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 24 Sep 2024 16:22:02 +0100 Subject: [PATCH 20/49] [LV] Remove another reference of unrolled parts after 57f5d8f2fe. Continue to clean up some now stale references of unroll parts and related terminology as pointed out post-commit for 06c3a7d. --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index d296511b977992..09e4d0fcd31f3c 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -9463,7 +9463,7 @@ void VPReplicateRecipe::execute(VPTransformState &State) { return; } - // Generate scalar instances for all VF lanes of all UF parts. + // Generate scalar instances for all VF lanes. assert(!State.VF.isScalable() && "Can't scalarize a scalable vector"); const unsigned EndLane = State.VF.getKnownMinValue(); for (unsigned Lane = 0; Lane < EndLane; ++Lane) From fa17977c315062646d4d1e01262d68dd69313e61 Mon Sep 17 00:00:00 2001 From: lntue <35648136+lntue@users.noreply.github.com> Date: Tue, 24 Sep 2024 11:26:06 -0400 Subject: [PATCH 21/49] [libc][bazel] Remove specializations from libc_math_function. (#109802) There are no more specializations `libc/src/math/x86_64` or `libc/src/math/aarch64` anymore. All implementations are going through the generic implementation. 
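
In practice a target is now declared without a specializations list, e.g.
(taken from the BUILD.bazel changes below):

  libc_math_function(name = "ceil")

and the macro unconditionally picks up the generic source, roughly

  srcs = ["src/math/generic/" + name + ".cpp"]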
--- .../llvm-project-overlay/libc/BUILD.bazel | 63 +++---------------- .../libc/libc_build_rules.bzl | 11 +--- 2 files changed, 10 insertions(+), 64 deletions(-) diff --git a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel index dbc8c045eea996..140d48c8f96848 100644 --- a/utils/bazel/llvm-project-overlay/libc/BUILD.bazel +++ b/utils/bazel/llvm-project-overlay/libc/BUILD.bazel @@ -1825,26 +1825,11 @@ libc_math_function( ], ) -libc_math_function( - name = "ceil", - specializations = [ - "generic", - ], -) +libc_math_function(name = "ceil") -libc_math_function( - name = "ceilf", - specializations = [ - "generic", - ], -) +libc_math_function(name = "ceilf") -libc_math_function( - name = "ceill", - specializations = [ - "generic", - ], -) +libc_math_function(name = "ceill") libc_math_function(name = "ceilf128") @@ -2126,19 +2111,9 @@ libc_math_function( ], ) -libc_math_function( - name = "floor", - specializations = [ - "generic", - ], -) +libc_math_function(name = "floor") -libc_math_function( - name = "floorf", - specializations = [ - "generic", - ], -) +libc_math_function(name = "floorf") libc_math_function(name = "floorl") @@ -2639,19 +2614,9 @@ libc_math_function(name = "rintl") libc_math_function(name = "rintf128") -libc_math_function( - name = "round", - specializations = [ - "generic", - ], -) +libc_math_function(name = "round") -libc_math_function( - name = "roundf", - specializations = [ - "generic", - ], -) +libc_math_function(name = "roundf") libc_math_function(name = "roundl") @@ -2850,19 +2815,9 @@ libc_math_function(name = "totalordermagl") libc_math_function(name = "totalordermagf128") -libc_math_function( - name = "trunc", - specializations = [ - "generic", - ], -) +libc_math_function(name = "trunc") -libc_math_function( - name = "truncf", - specializations = [ - "generic", - ], -) +libc_math_function(name = "truncf") libc_math_function(name = "truncl") diff --git a/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl b/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl index ec3714407cb914..f298f817af83d7 100644 --- a/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl +++ b/utils/bazel/llvm-project-overlay/libc/libc_build_rules.bzl @@ -129,7 +129,6 @@ def libc_function( def libc_math_function( name, - specializations = None, additional_deps = None): """Add a target for a math function. @@ -142,14 +141,6 @@ def libc_math_function( math function. """ additional_deps = additional_deps or [] - specializations = specializations or ["generic"] - select_map = {} - if "generic" in specializations: - select_map["//conditions:default"] = ["src/math/generic/" + name + ".cpp"] - if "aarch64" in specializations: - select_map[PLATFORM_CPU_ARM64] = ["src/math/aarch64/" + name + ".cpp"] - if "x86_64" in specializations: - select_map[PLATFORM_CPU_X86_64] = ["src/math/x86_64/" + name + ".cpp"] #TODO(michaelrj): Fix the floating point dependencies OLD_FPUTIL_DEPS = [ @@ -166,7 +157,7 @@ def libc_math_function( ] libc_function( name = name, - srcs = selects.with_or(select_map), + srcs = ["src/math/generic/" + name + ".cpp"], hdrs = ["src/math/" + name + ".h"], deps = [":__support_common"] + OLD_FPUTIL_DEPS + additional_deps, ) From 99ade15d192db4afa897a7052a9c73dd42c2b88c Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Tue, 24 Sep 2024 11:26:37 -0400 Subject: [PATCH 22/49] Revert "[compiler-rt] prctl interception update, SECCOMP_MODE_FILTER support. (#107722)" This reverts commit b75174d05aa033a382d4c088e96e068a774f46da. 
Does not build on Android, see comments on https://github.com/llvm/llvm-project/pull/107722 --- .../sanitizer_common_interceptors.inc | 6 ----- .../sanitizer_platform_limits_posix.cpp | 24 +++++++++---------- .../sanitizer_platform_limits_posix.h | 3 +-- .../TestCases/Linux/prctl.cpp | 10 -------- 4 files changed, 12 insertions(+), 31 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc index e29cc0c8b390ab..bf5bb43018efc4 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc @@ -1289,9 +1289,6 @@ INTERCEPTOR(int, prctl, int option, unsigned long arg2, unsigned long arg3, static const int PR_SCHED_CORE = 62; static const int PR_SCHED_CORE_GET = 0; static const int PR_GET_PDEATHSIG = 2; - static const int PR_SET_SECCOMP = 22; - - static const int SECCOMP_MODE_FILTER = 2; if (option == PR_SET_VMA && arg2 == 0UL) { char *name = (char *)arg5; COMMON_INTERCEPTOR_READ_RANGE(ctx, name, internal_strlen(name) + 1); @@ -1310,9 +1307,6 @@ INTERCEPTOR(int, prctl, int option, unsigned long arg2, unsigned long arg3, COMMON_INTERCEPTOR_WRITE_RANGE(ctx, (u64 *)(arg5), sizeof(u64)); } else if (res != -1 && option == PR_GET_PDEATHSIG) { COMMON_INTERCEPTOR_WRITE_RANGE(ctx, (u64 *)(arg2), sizeof(int)); - } else if (res != -1 && option == PR_SET_SECCOMP && - arg2 == SECCOMP_MODE_FILTER) { - COMMON_INTERCEPTOR_WRITE_RANGE(ctx, (u64 *)(arg3), struct_sock_fprog_sz); } return res; } diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp index 5eeb2a89efa8c5..6d61d276d77e35 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp @@ -117,16 +117,15 @@ typedef struct user_fpregs elf_fpregset_t; #if SANITIZER_LINUX #if SANITIZER_GLIBC #include -# include -# include -# include -# include -# include -# include -# if HAVE_RPC_XDR_H -# include -# endif -# include +#include +#include +#include +#include +#include +#if HAVE_RPC_XDR_H +# include +#endif +#include #else #include #include @@ -532,10 +531,9 @@ unsigned struct_ElfW_Phdr_sz = sizeof(Elf_Phdr); unsigned struct_audio_buf_info_sz = sizeof(struct audio_buf_info); unsigned struct_ppp_stats_sz = sizeof(struct ppp_stats); - unsigned struct_sock_fprog_sz = sizeof(struct sock_fprog); -# endif // SANITIZER_GLIBC +#endif // SANITIZER_GLIBC -# if !SANITIZER_ANDROID && !SANITIZER_APPLE +#if !SANITIZER_ANDROID && !SANITIZER_APPLE unsigned struct_sioc_sg_req_sz = sizeof(struct sioc_sg_req); unsigned struct_sioc_vif_req_sz = sizeof(struct sioc_vif_req); #endif diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h index ca03841ccc1988..34bfef1f7ef456 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h @@ -1050,8 +1050,7 @@ extern unsigned struct_serial_struct_sz; extern unsigned struct_sockaddr_ax25_sz; extern unsigned struct_unimapdesc_sz; extern unsigned struct_unimapinit_sz; -extern unsigned struct_sock_fprog_sz; -# endif // SANITIZER_LINUX && !SANITIZER_ANDROID +#endif // SANITIZER_LINUX && !SANITIZER_ANDROID extern const unsigned long __sanitizer_bufsiz; diff --git 
a/compiler-rt/test/sanitizer_common/TestCases/Linux/prctl.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/prctl.cpp index dab1d1b48f8689..cbff02d66efa78 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Linux/prctl.cpp +++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/prctl.cpp @@ -4,8 +4,6 @@ #include #include -#include -#include #include #include #include @@ -80,13 +78,5 @@ int main() { } } - sock_filter f[] = {{.code = (BPF_LD | BPF_W | BPF_ABS), - .k = (uint32_t)(SKF_AD_OFF | SKF_AD_CPU)}, - {.code = (BPF_RET | BPF_A), .k = 0}}; - sock_fprog pr = {.len = 2, .filter = f}; - - res = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &pr); - assert(res == -1); - return 0; } From fda01437af339c87ecd226596dd1b5f6d2c6cbfa Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 24 Sep 2024 08:33:24 -0700 Subject: [PATCH 23/49] [Rewrite] Use SmallSetVector (NFC) (#109746) We can combine: SmallVector BlockByCopyDecls; llvm::SmallPtrSet BlockByCopyDeclsPtrSet; into: llvm::SmallSetVector BlockByCopyDecls; Likewise, we can combine: SmallVector BlockByRefDecls; llvm::SmallPtrSet BlockByRefDeclsPtrSet; into: llvm::SmallSetVector BlockByRefDecls; --- clang/lib/Frontend/Rewrite/RewriteObjC.cpp | 50 ++++++---------------- 1 file changed, 14 insertions(+), 36 deletions(-) diff --git a/clang/lib/Frontend/Rewrite/RewriteObjC.cpp b/clang/lib/Frontend/Rewrite/RewriteObjC.cpp index a1e792bf772ba2..f49ccf7be68e22 100644 --- a/clang/lib/Frontend/Rewrite/RewriteObjC.cpp +++ b/clang/lib/Frontend/Rewrite/RewriteObjC.cpp @@ -128,10 +128,8 @@ namespace { SmallVector BlockDeclRefs; // Block related declarations. - SmallVector BlockByCopyDecls; - llvm::SmallPtrSet BlockByCopyDeclsPtrSet; - SmallVector BlockByRefDecls; - llvm::SmallPtrSet BlockByRefDeclsPtrSet; + llvm::SmallSetVector BlockByCopyDecls; + llvm::SmallSetVector BlockByRefDecls; llvm::DenseMap BlockByRefDeclNo; llvm::SmallPtrSet ImportedBlockDecls; llvm::SmallPtrSet ImportedLocalExternalDecls; @@ -3357,7 +3355,7 @@ std::string RewriteObjC::SynthesizeBlockHelperFuncs(BlockExpr *CE, int i, S += VD->getNameAsString(); S += ", (void*)src->"; S += VD->getNameAsString(); - if (BlockByRefDeclsPtrSet.count(VD)) + if (BlockByRefDecls.contains(VD)) S += ", " + utostr(BLOCK_FIELD_IS_BYREF) + "/*BLOCK_FIELD_IS_BYREF*/);"; else if (VD->getType()->isBlockPointerType()) S += ", " + utostr(BLOCK_FIELD_IS_BLOCK) + "/*BLOCK_FIELD_IS_BLOCK*/);"; @@ -3374,7 +3372,7 @@ std::string RewriteObjC::SynthesizeBlockHelperFuncs(BlockExpr *CE, int i, for (ValueDecl *VD : ImportedBlockDecls) { S += "_Block_object_dispose((void*)src->"; S += VD->getNameAsString(); - if (BlockByRefDeclsPtrSet.count(VD)) + if (BlockByRefDecls.contains(VD)) S += ", " + utostr(BLOCK_FIELD_IS_BYREF) + "/*BLOCK_FIELD_IS_BYREF*/);"; else if (VD->getType()->isBlockPointerType()) S += ", " + utostr(BLOCK_FIELD_IS_BLOCK) + "/*BLOCK_FIELD_IS_BLOCK*/);"; @@ -3553,14 +3551,10 @@ void RewriteObjC::SynthesizeBlockLiterals(SourceLocation FunLocStart, DeclRefExpr *Exp = InnerDeclRefs[count++]; ValueDecl *VD = Exp->getDecl(); BlockDeclRefs.push_back(Exp); - if (!VD->hasAttr() && !BlockByCopyDeclsPtrSet.count(VD)) { - BlockByCopyDeclsPtrSet.insert(VD); - BlockByCopyDecls.push_back(VD); - } - if (VD->hasAttr() && !BlockByRefDeclsPtrSet.count(VD)) { - BlockByRefDeclsPtrSet.insert(VD); - BlockByRefDecls.push_back(VD); - } + if (VD->hasAttr()) + BlockByRefDecls.insert(VD); + else + BlockByCopyDecls.insert(VD); // imported objects in the inner blocks not used in the outer // blocks must be copied/disposed in the 
outer block as well. if (VD->hasAttr() || @@ -3590,9 +3584,7 @@ void RewriteObjC::SynthesizeBlockLiterals(SourceLocation FunLocStart, BlockDeclRefs.clear(); BlockByRefDecls.clear(); - BlockByRefDeclsPtrSet.clear(); BlockByCopyDecls.clear(); - BlockByCopyDeclsPtrSet.clear(); ImportedBlockDecls.clear(); } if (RewriteSC) { @@ -4314,20 +4306,12 @@ void RewriteObjC::CollectBlockDeclRefInfo(BlockExpr *Exp) { if (BlockDeclRefs.size()) { // Unique all "by copy" declarations. for (unsigned i = 0; i < BlockDeclRefs.size(); i++) - if (!BlockDeclRefs[i]->getDecl()->hasAttr()) { - if (!BlockByCopyDeclsPtrSet.count(BlockDeclRefs[i]->getDecl())) { - BlockByCopyDeclsPtrSet.insert(BlockDeclRefs[i]->getDecl()); - BlockByCopyDecls.push_back(BlockDeclRefs[i]->getDecl()); - } - } + if (!BlockDeclRefs[i]->getDecl()->hasAttr()) + BlockByCopyDecls.insert(BlockDeclRefs[i]->getDecl()); // Unique all "by ref" declarations. for (unsigned i = 0; i < BlockDeclRefs.size(); i++) - if (BlockDeclRefs[i]->getDecl()->hasAttr()) { - if (!BlockByRefDeclsPtrSet.count(BlockDeclRefs[i]->getDecl())) { - BlockByRefDeclsPtrSet.insert(BlockDeclRefs[i]->getDecl()); - BlockByRefDecls.push_back(BlockDeclRefs[i]->getDecl()); - } - } + if (BlockDeclRefs[i]->getDecl()->hasAttr()) + BlockByRefDecls.insert(BlockDeclRefs[i]->getDecl()); // Find any imported blocks...they will need special attention. for (unsigned i = 0; i < BlockDeclRefs.size(); i++) if (BlockDeclRefs[i]->getDecl()->hasAttr() || @@ -4358,22 +4342,18 @@ Stmt *RewriteObjC::SynthBlockInitExpr(BlockExpr *Exp, for (unsigned i = 0; i < InnerBlockDeclRefs.size(); i++) { DeclRefExpr *Exp = InnerBlockDeclRefs[i]; ValueDecl *VD = Exp->getDecl(); - if (!VD->hasAttr() && - BlockByCopyDeclsPtrSet.insert(VD).second) { + if (!VD->hasAttr() && BlockByCopyDecls.insert(VD)) { // We need to save the copied-in variables in nested // blocks because it is needed at the end for some of the API // generations. See SynthesizeBlockLiterals routine. InnerDeclRefs.push_back(Exp); countOfInnerDecls++; BlockDeclRefs.push_back(Exp); - BlockByCopyDecls.push_back(VD); } - if (VD->hasAttr() && - BlockByRefDeclsPtrSet.insert(VD).second) { + if (VD->hasAttr() && BlockByRefDecls.insert(VD)) { InnerDeclRefs.push_back(Exp); countOfInnerDecls++; BlockDeclRefs.push_back(Exp); - BlockByRefDecls.push_back(VD); } } // Find any imported blocks...they will need special attention. @@ -4534,9 +4514,7 @@ Stmt *RewriteObjC::SynthBlockInitExpr(BlockExpr *Exp, NewRep); BlockDeclRefs.clear(); BlockByRefDecls.clear(); - BlockByRefDeclsPtrSet.clear(); BlockByCopyDecls.clear(); - BlockByCopyDeclsPtrSet.clear(); ImportedBlockDecls.clear(); return NewRep; } From 36dce5091e384087f5d1ceaf3777ac14f41087e4 Mon Sep 17 00:00:00 2001 From: spupyrev Date: Tue, 24 Sep 2024 08:36:57 -0700 Subject: [PATCH 24/49] [CodeLayout][NFC] Format and minor refactoring of MBP (#109729) This PR has two (NFC) commits: - clang-format MBP - move a part of tail duplication and block aligning into helper functions for better readability. 
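For the refactoring part, the two pieces that move are the tail-duplication threshold setup, now gathered in initTailDupThreshold() (keeping the 2/4 defaults of tail-dup-placement-threshold and tail-dup-placement-aggressive-threshold), and the AlignAllBlock / AlignAllNonFallThruBlocks override handling, which now sits at the end of alignBlocks(). As a rough, hand-written illustration of the pattern only (plain C++, not the actual pass code; the names and numbers below are stand-ins):

    // Option-dependent setup is pulled out of the long driver into a named
    // helper, so the driver reads as a sequence of steps.
    struct PlacementSketch {
      unsigned TailDupSize = 0;
      bool Aggressive = false;      // stands in for the -O3 / aggressive knob

      void initTailDupThreshold() { // was ~30 inline lines in the driver
        TailDupSize = Aggressive ? 4 : 2;
      }

      void run() {
        initTailDupThreshold();
        // ... build chains, tail-duplicate, align blocks ...
      }
    };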
--- llvm/lib/CodeGen/MachineBlockPlacement.cpp | 555 ++++++++++----------- 1 file changed, 274 insertions(+), 281 deletions(-) diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp index be783bc4e29738..7dcb287b387343 100644 --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -119,10 +119,10 @@ static cl::opt LoopToColdBlockRatio( "(frequency of block) is greater than this ratio"), cl::init(5), cl::Hidden); -static cl::opt ForceLoopColdBlock( - "force-loop-cold-block", - cl::desc("Force outlining cold blocks from loops."), - cl::init(false), cl::Hidden); +static cl::opt + ForceLoopColdBlock("force-loop-cold-block", + cl::desc("Force outlining cold blocks from loops."), + cl::init(false), cl::Hidden); static cl::opt PreciseRotationCost("precise-rotation-cost", @@ -147,43 +147,43 @@ static cl::opt JumpInstCost("jump-inst-cost", cl::desc("Cost of jump instructions."), cl::init(1), cl::Hidden); static cl::opt -TailDupPlacement("tail-dup-placement", - cl::desc("Perform tail duplication during placement. " - "Creates more fallthrough opportunites in " - "outline branches."), - cl::init(true), cl::Hidden); + TailDupPlacement("tail-dup-placement", + cl::desc("Perform tail duplication during placement. " + "Creates more fallthrough opportunites in " + "outline branches."), + cl::init(true), cl::Hidden); static cl::opt -BranchFoldPlacement("branch-fold-placement", - cl::desc("Perform branch folding during placement. " - "Reduces code size."), - cl::init(true), cl::Hidden); + BranchFoldPlacement("branch-fold-placement", + cl::desc("Perform branch folding during placement. " + "Reduces code size."), + cl::init(true), cl::Hidden); // Heuristic for tail duplication. static cl::opt TailDupPlacementThreshold( "tail-dup-placement-threshold", cl::desc("Instruction cutoff for tail duplication during layout. " "Tail merging during layout is forced to have a threshold " - "that won't conflict."), cl::init(2), - cl::Hidden); + "that won't conflict."), + cl::init(2), cl::Hidden); // Heuristic for aggressive tail duplication. static cl::opt TailDupPlacementAggressiveThreshold( "tail-dup-placement-aggressive-threshold", cl::desc("Instruction cutoff for aggressive tail duplication during " "layout. Used at -O3. Tail merging during layout is forced to " - "have a threshold that won't conflict."), cl::init(4), - cl::Hidden); + "have a threshold that won't conflict."), + cl::init(4), cl::Hidden); // Heuristic for tail duplication. static cl::opt TailDupPlacementPenalty( "tail-dup-placement-penalty", - cl::desc("Cost penalty for blocks that can avoid breaking CFG by copying. " - "Copying can increase fallthrough, but it also increases icache " - "pressure. This parameter controls the penalty to account for that. " - "Percent as integer."), - cl::init(2), - cl::Hidden); + cl::desc( + "Cost penalty for blocks that can avoid breaking CFG by copying. " + "Copying can increase fallthrough, but it also increases icache " + "pressure. This parameter controls the penalty to account for that. " + "Percent as integer."), + cl::init(2), cl::Hidden); // Heuristic for tail duplication if profile count is used in cost model. static cl::opt TailDupProfilePercentThreshold( @@ -198,8 +198,7 @@ static cl::opt TriangleChainCount( "triangle-chain-count", cl::desc("Number of triangle-shaped-CFG's that need to be in a row for the " "triangle tail duplication heuristic to kick in. 
0 to disable."), - cl::init(2), - cl::Hidden); + cl::init(2), cl::Hidden); // Use case: When block layout is visualized after MBP pass, the basic blocks // are labeled in layout order; meanwhile blocks could be numbered in a @@ -292,8 +291,8 @@ class BlockChain { iterator end() { return Blocks.end(); } const_iterator end() const { return Blocks.end(); } - bool remove(MachineBasicBlock* BB) { - for(iterator i = begin(); i != end(); ++i) { + bool remove(MachineBasicBlock *BB) { + for (iterator i = begin(); i != end(); ++i) { if (*i == BB) { Blocks.erase(i); return true; @@ -405,6 +404,8 @@ class MachineBlockPlacement : public MachineFunctionPass { ProfileSummaryInfo *PSI = nullptr; + TargetPassConfig *PassConfig = nullptr; + /// Duplicator used to duplicate tails during placement. /// /// Placement decisions can open up new tail duplication opportunities, but @@ -415,6 +416,8 @@ class MachineBlockPlacement : public MachineFunctionPass { /// Partial tail duplication threshold. BlockFrequency DupThreshold; + unsigned TailDupSize; + /// True: use block profile count to compute tail duplication cost. /// False: use block frequency to compute tail duplication cost. bool UseProfileCount = false; @@ -459,26 +462,24 @@ class MachineBlockPlacement : public MachineFunctionPass { /// Scale the DupThreshold according to basic block size. BlockFrequency scaleThreshold(MachineBasicBlock *BB); - void initDupThreshold(); + void initTailDupThreshold(); /// Decrease the UnscheduledPredecessors count for all blocks in chain, and /// if the count goes to 0, add them to the appropriate work list. - void markChainSuccessors( - const BlockChain &Chain, const MachineBasicBlock *LoopHeaderBB, - const BlockFilterSet *BlockFilter = nullptr); + void markChainSuccessors(const BlockChain &Chain, + const MachineBasicBlock *LoopHeaderBB, + const BlockFilterSet *BlockFilter = nullptr); /// Decrease the UnscheduledPredecessors count for a single block, and /// if the count goes to 0, add them to the appropriate work list. 
- void markBlockSuccessors( - const BlockChain &Chain, const MachineBasicBlock *BB, - const MachineBasicBlock *LoopHeaderBB, - const BlockFilterSet *BlockFilter = nullptr); + void markBlockSuccessors(const BlockChain &Chain, const MachineBasicBlock *BB, + const MachineBasicBlock *LoopHeaderBB, + const BlockFilterSet *BlockFilter = nullptr); BranchProbability - collectViableSuccessors( - const MachineBasicBlock *BB, const BlockChain &Chain, - const BlockFilterSet *BlockFilter, - SmallVector &Successors); + collectViableSuccessors(const MachineBasicBlock *BB, const BlockChain &Chain, + const BlockFilterSet *BlockFilter, + SmallVector &Successors); bool isBestSuccessor(MachineBasicBlock *BB, MachineBasicBlock *Pred, BlockFilterSet *BlockFilter); void findDuplicateCandidates(SmallVectorImpl &Candidates, @@ -496,16 +497,19 @@ class MachineBlockPlacement : public MachineFunctionPass { MachineFunction::iterator &PrevUnplacedBlockIt, BlockFilterSet::iterator &PrevUnplacedBlockInFilterIt, bool &DuplicatedToLPred); - bool hasBetterLayoutPredecessor( - const MachineBasicBlock *BB, const MachineBasicBlock *Succ, - const BlockChain &SuccChain, BranchProbability SuccProb, - BranchProbability RealSuccProb, const BlockChain &Chain, - const BlockFilterSet *BlockFilter); - BlockAndTailDupResult selectBestSuccessor( - const MachineBasicBlock *BB, const BlockChain &Chain, - const BlockFilterSet *BlockFilter); - MachineBasicBlock *selectBestCandidateBlock( - const BlockChain &Chain, SmallVectorImpl &WorkList); + bool hasBetterLayoutPredecessor(const MachineBasicBlock *BB, + const MachineBasicBlock *Succ, + const BlockChain &SuccChain, + BranchProbability SuccProb, + BranchProbability RealSuccProb, + const BlockChain &Chain, + const BlockFilterSet *BlockFilter); + BlockAndTailDupResult selectBestSuccessor(const MachineBasicBlock *BB, + const BlockChain &Chain, + const BlockFilterSet *BlockFilter); + MachineBasicBlock * + selectBestCandidateBlock(const BlockChain &Chain, + SmallVectorImpl &WorkList); MachineBasicBlock * getFirstUnplacedBlock(const BlockChain &PlacedChain, MachineFunction::iterator &PrevUnplacedBlockIt); @@ -536,20 +540,19 @@ class MachineBlockPlacement : public MachineFunctionPass { const MachineBasicBlock *ExitBB, const BlockFilterSet &LoopBlockSet); MachineBasicBlock *findBestLoopTopHelper(MachineBasicBlock *OldTop, - const MachineLoop &L, const BlockFilterSet &LoopBlockSet); - MachineBasicBlock *findBestLoopTop( - const MachineLoop &L, const BlockFilterSet &LoopBlockSet); - MachineBasicBlock *findBestLoopExit( - const MachineLoop &L, const BlockFilterSet &LoopBlockSet, - BlockFrequency &ExitFreq); + const MachineLoop &L, + const BlockFilterSet &LoopBlockSet); + MachineBasicBlock *findBestLoopTop(const MachineLoop &L, + const BlockFilterSet &LoopBlockSet); + MachineBasicBlock *findBestLoopExit(const MachineLoop &L, + const BlockFilterSet &LoopBlockSet, + BlockFrequency &ExitFreq); BlockFilterSet collectLoopBlockSet(const MachineLoop &L); void buildLoopChains(const MachineLoop &L); - void rotateLoop( - BlockChain &LoopChain, const MachineBasicBlock *ExitingBB, - BlockFrequency ExitFreq, const BlockFilterSet &LoopBlockSet); - void rotateLoopWithProfile( - BlockChain &LoopChain, const MachineLoop &L, - const BlockFilterSet &LoopBlockSet); + void rotateLoop(BlockChain &LoopChain, const MachineBasicBlock *ExitingBB, + BlockFrequency ExitFreq, const BlockFilterSet &LoopBlockSet); + void rotateLoopWithProfile(BlockChain &LoopChain, const MachineLoop &L, + const BlockFilterSet &LoopBlockSet); void 
buildCFGChains(); void optimizeBranches(); void alignBlocks(); @@ -558,10 +561,10 @@ class MachineBlockPlacement : public MachineFunctionPass { bool shouldTailDuplicate(MachineBasicBlock *BB); /// Check the edge frequencies to see if tail duplication will increase /// fallthroughs. - bool isProfitableToTailDup( - const MachineBasicBlock *BB, const MachineBasicBlock *Succ, - BranchProbability QProb, - const BlockChain &Chain, const BlockFilterSet *BlockFilter); + bool isProfitableToTailDup(const MachineBasicBlock *BB, + const MachineBasicBlock *Succ, + BranchProbability QProb, const BlockChain &Chain, + const BlockFilterSet *BlockFilter); /// Check for a trellis layout. bool isTrellis(const MachineBasicBlock *BB, @@ -582,9 +585,10 @@ class MachineBlockPlacement : public MachineFunctionPass { /// Returns true if a block can tail duplicate into all unplaced /// predecessors. Filters based on loop. - bool canTailDuplicateUnplacedPreds( - const MachineBasicBlock *BB, MachineBasicBlock *Succ, - const BlockChain &Chain, const BlockFilterSet *BlockFilter); + bool canTailDuplicateUnplacedPreds(const MachineBasicBlock *BB, + MachineBasicBlock *Succ, + const BlockChain &Chain, + const BlockFilterSet *BlockFilter); /// Find chains of triangles to tail-duplicate where a global analysis works, /// but a local analysis would not find them. @@ -802,8 +806,8 @@ bool MachineBlockPlacement::shouldTailDuplicate(MachineBasicBlock *BB) { /// Compare 2 BlockFrequency's with a small penalty for \p A. /// In order to be conservative, we apply a X% penalty to account for /// increased icache pressure and static heuristics. For small frequencies -/// we use only the numerators to improve accuracy. For simplicity, we assume the -/// penalty is less than 100% +/// we use only the numerators to improve accuracy. For simplicity, we assume +/// the penalty is less than 100% /// TODO(iteratee): Use 64-bit fixed point edge frequencies everywhere. static bool greaterWithBias(BlockFrequency A, BlockFrequency B, BlockFrequency EntryFreq) { @@ -819,8 +823,8 @@ static bool greaterWithBias(BlockFrequency A, BlockFrequency B, /// considering duplication. bool MachineBlockPlacement::isProfitableToTailDup( const MachineBasicBlock *BB, const MachineBasicBlock *Succ, - BranchProbability QProb, - const BlockChain &Chain, const BlockFilterSet *BlockFilter) { + BranchProbability QProb, const BlockChain &Chain, + const BlockFilterSet *BlockFilter) { // We need to do a probability calculation to make sure this is profitable. // First: does succ have a successor that post-dominates? This affects the // calculation. The 2 relevant cases are: @@ -876,12 +880,12 @@ bool MachineBlockPlacement::isProfitableToTailDup( // from BB. auto SuccBestPred = BlockFrequency(0); for (MachineBasicBlock *SuccPred : Succ->predecessors()) { - if (SuccPred == Succ || SuccPred == BB - || BlockToChain[SuccPred] == &Chain - || (BlockFilter && !BlockFilter->count(SuccPred))) + if (SuccPred == Succ || SuccPred == BB || + BlockToChain[SuccPred] == &Chain || + (BlockFilter && !BlockFilter->count(SuccPred))) continue; - auto Freq = MBFI->getBlockFreq(SuccPred) - * MBPI->getEdgeProbability(SuccPred, Succ); + auto Freq = + MBFI->getBlockFreq(SuccPred) * MBPI->getEdgeProbability(SuccPred, Succ); if (Freq > SuccBestPred) SuccBestPred = Freq; } @@ -1137,7 +1141,7 @@ MachineBlockPlacement::getBestTrellisSuccessor( } // We have already computed the optimal edge for the other side of the // trellis. 
- ComputedEdges[BestB.Src] = { BestB.Dest, false }; + ComputedEdges[BestB.Src] = {BestB.Dest, false}; auto TrellisSucc = BestA.Dest; LLVM_DEBUG(BranchProbability SuccProb = getAdjustedProbability( @@ -1169,8 +1173,8 @@ bool MachineBlockPlacement::canTailDuplicateUnplacedPreds( // Make sure all unplaced and unfiltered predecessors can be // tail-duplicated into. // Skip any blocks that are already placed or not in this loop. - if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred)) - || (BlockToChain[Pred] == &Chain && !Succ->succ_empty())) + if (Pred == BB || (BlockFilter && !BlockFilter->count(Pred)) || + (BlockToChain[Pred] == &Chain && !Succ->succ_empty())) continue; if (!TailDup.canTailDuplicate(Succ, Pred)) { if (Successors.size() > 1 && hasSameSuccessors(*Pred, Successors)) @@ -1289,9 +1293,7 @@ void MachineBlockPlacement::precomputeTriangleChains() { unsigned count() const { return Edges.size() - 1; } - MachineBasicBlock *getKey() const { - return Edges.back(); - } + MachineBasicBlock *getKey() const { return Edges.back(); } }; if (TriangleChainCount == 0) @@ -1326,7 +1328,7 @@ void MachineBlockPlacement::precomputeTriangleChains() { bool CanTailDuplicate = true; // If PDom can't tail-duplicate into it's non-BB predecessors, then this // isn't the kind of triangle we're looking for. - for (MachineBasicBlock* Pred : PDom->predecessors()) { + for (MachineBasicBlock *Pred : PDom->predecessors()) { if (Pred == &BB) continue; if (!TailDup.canTailDuplicate(PDom, Pred)) { @@ -1386,8 +1388,8 @@ void MachineBlockPlacement::precomputeTriangleChains() { // When profile is not present, return the StaticLikelyProb. // When profile is available, we need to handle the triangle-shape CFG. -static BranchProbability getLayoutSuccessorProbThreshold( - const MachineBasicBlock *BB) { +static BranchProbability +getLayoutSuccessorProbThreshold(const MachineBasicBlock *BB) { if (!BB->getParent()->getFunction().hasProfileData()) return BranchProbability(StaticLikelyProb, 100); if (BB->succ_size() == 2) { @@ -1551,8 +1553,8 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor( for (MachineBasicBlock *Pred : Succ->predecessors()) { BlockChain *PredChain = BlockToChain[Pred]; if (Pred == Succ || PredChain == &SuccChain || - (BlockFilter && !BlockFilter->count(Pred)) || - PredChain == &Chain || Pred != *std::prev(PredChain->end()) || + (BlockFilter && !BlockFilter->count(Pred)) || PredChain == &Chain || + Pred != *std::prev(PredChain->end()) || // This check is redundant except for look ahead. This function is // called for lookahead by isProfitableToTailDup when BB hasn't been // placed yet. @@ -1599,12 +1601,12 @@ bool MachineBlockPlacement::hasBetterLayoutPredecessor( /// \returns The best successor block found, or null if none are viable, along /// with a boolean indicating if tail duplication is necessary. 
MachineBlockPlacement::BlockAndTailDupResult -MachineBlockPlacement::selectBestSuccessor( - const MachineBasicBlock *BB, const BlockChain &Chain, - const BlockFilterSet *BlockFilter) { +MachineBlockPlacement::selectBestSuccessor(const MachineBasicBlock *BB, + const BlockChain &Chain, + const BlockFilterSet *BlockFilter) { const BranchProbability HotProb(StaticLikelyProb, 100); - BlockAndTailDupResult BestSucc = { nullptr, false }; + BlockAndTailDupResult BestSucc = {nullptr, false}; auto BestProb = BranchProbability::getZero(); SmallVector Successors; @@ -1684,8 +1686,8 @@ MachineBlockPlacement::selectBestSuccessor( std::tie(DupProb, Succ) = Tup; if (DupProb < BestProb) break; - if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter) - && (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) { + if (canTailDuplicateUnplacedPreds(BB, Succ, Chain, BlockFilter) && + (isProfitableToTailDup(BB, Succ, BestProb, Chain, BlockFilter))) { LLVM_DEBUG(dbgs() << " Candidate: " << getBlockName(Succ) << ", probability: " << DupProb << " (Tail Duplicate)\n"); @@ -1822,8 +1824,7 @@ MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock( } void MachineBlockPlacement::fillWorkLists( - const MachineBasicBlock *MBB, - SmallPtrSetImpl &UpdatedPreds, + const MachineBasicBlock *MBB, SmallPtrSetImpl &UpdatedPreds, const BlockFilterSet *BlockFilter = nullptr) { BlockChain &Chain = *BlockToChain[MBB]; if (!UpdatedPreds.insert(&Chain).second) @@ -1854,9 +1855,9 @@ void MachineBlockPlacement::fillWorkLists( BlockWorkList.push_back(BB); } -void MachineBlockPlacement::buildChain( - const MachineBasicBlock *HeadBB, BlockChain &Chain, - BlockFilterSet *BlockFilter) { +void MachineBlockPlacement::buildChain(const MachineBasicBlock *HeadBB, + BlockChain &Chain, + BlockFilterSet *BlockFilter) { assert(HeadBB && "BB must not be null.\n"); assert(BlockToChain[HeadBB] == &Chain && "BlockToChainMap mis-match.\n"); MachineFunction::iterator PrevUnplacedBlockIt = F->begin(); @@ -1872,16 +1873,14 @@ void MachineBlockPlacement::buildChain( assert(BlockToChain[BB] == &Chain && "BlockToChainMap mis-match in loop."); assert(*std::prev(Chain.end()) == BB && "BB Not found at end of chain."); - // Look for the best viable successor if there is one to place immediately // after this block. auto Result = selectBestSuccessor(BB, Chain, BlockFilter); - MachineBasicBlock* BestSucc = Result.BB; + MachineBasicBlock *BestSucc = Result.BB; bool ShouldTailDup = Result.ShouldTailDup; if (allowTailDupPlacement()) - ShouldTailDup |= (BestSucc && canTailDuplicateUnplacedPreds(BB, BestSucc, - Chain, - BlockFilter)); + ShouldTailDup |= (BestSucc && canTailDuplicateUnplacedPreds( + BB, BestSucc, Chain, BlockFilter)); // If an immediate successor isn't available, look for the best viable // block among those we've identified as not violating the loop's CFG at @@ -1918,8 +1917,8 @@ void MachineBlockPlacement::buildChain( // Place this block, updating the datastructures to reflect its placement. BlockChain &SuccChain = *BlockToChain[BestSucc]; - // Zero out UnscheduledPredecessors for the successor we're about to merge in case - // we selected a successor that didn't fit naturally into the CFG. + // Zero out UnscheduledPredecessors for the successor we're about to merge + // in case we selected a successor that didn't fit naturally into the CFG. 
SuccChain.UnscheduledPredecessors = 0; LLVM_DEBUG(dbgs() << "Merging from " << getBlockName(BB) << " to " << getBlockName(BestSucc) << "\n"); @@ -1946,10 +1945,8 @@ void MachineBlockPlacement::buildChain( // If BB is moved before OldTop, Pred needs a taken branch to BB, and it can't // layout the other successor below it, so it can't reduce taken branch. // In this case we keep its original layout. -bool -MachineBlockPlacement::canMoveBottomBlockToTop( - const MachineBasicBlock *BottomBlock, - const MachineBasicBlock *OldTop) { +bool MachineBlockPlacement::canMoveBottomBlockToTop( + const MachineBasicBlock *BottomBlock, const MachineBasicBlock *OldTop) { if (BottomBlock->pred_size() != 1) return true; MachineBasicBlock *Pred = *BottomBlock->pred_begin(); @@ -1967,9 +1964,8 @@ MachineBlockPlacement::canMoveBottomBlockToTop( // Find out the possible fall through frequence to the top of a loop. BlockFrequency -MachineBlockPlacement::TopFallThroughFreq( - const MachineBasicBlock *Top, - const BlockFilterSet &LoopBlockSet) { +MachineBlockPlacement::TopFallThroughFreq(const MachineBasicBlock *Top, + const BlockFilterSet &LoopBlockSet) { BlockFrequency MaxFreq = BlockFrequency(0); for (MachineBasicBlock *Pred : Top->predecessors()) { BlockChain *PredChain = BlockToChain[Pred]; @@ -1991,8 +1987,8 @@ MachineBlockPlacement::TopFallThroughFreq( } } if (TopOK) { - BlockFrequency EdgeFreq = MBFI->getBlockFreq(Pred) * - MBPI->getEdgeProbability(Pred, Top); + BlockFrequency EdgeFreq = + MBFI->getBlockFreq(Pred) * MBPI->getEdgeProbability(Pred, Top); if (EdgeFreq > MaxFreq) MaxFreq = EdgeFreq; } @@ -2022,19 +2018,16 @@ MachineBlockPlacement::TopFallThroughFreq( // |- // V // -BlockFrequency -MachineBlockPlacement::FallThroughGains( - const MachineBasicBlock *NewTop, - const MachineBasicBlock *OldTop, - const MachineBasicBlock *ExitBB, - const BlockFilterSet &LoopBlockSet) { +BlockFrequency MachineBlockPlacement::FallThroughGains( + const MachineBasicBlock *NewTop, const MachineBasicBlock *OldTop, + const MachineBasicBlock *ExitBB, const BlockFilterSet &LoopBlockSet) { BlockFrequency FallThrough2Top = TopFallThroughFreq(OldTop, LoopBlockSet); BlockFrequency FallThrough2Exit = BlockFrequency(0); if (ExitBB) - FallThrough2Exit = MBFI->getBlockFreq(NewTop) * - MBPI->getEdgeProbability(NewTop, ExitBB); - BlockFrequency BackEdgeFreq = MBFI->getBlockFreq(NewTop) * - MBPI->getEdgeProbability(NewTop, OldTop); + FallThrough2Exit = + MBFI->getBlockFreq(NewTop) * MBPI->getEdgeProbability(NewTop, ExitBB); + BlockFrequency BackEdgeFreq = + MBFI->getBlockFreq(NewTop) * MBPI->getEdgeProbability(NewTop, OldTop); // Find the best Pred of NewTop. MachineBasicBlock *BestPred = nullptr; @@ -2113,10 +2106,8 @@ MachineBlockPlacement::FallThroughGains( /// At the same time, move it before old top increases the taken branch /// to loop exit block, so the reduced taken branch will be compared with /// the increased taken branch to the loop exit block. -MachineBasicBlock * -MachineBlockPlacement::findBestLoopTopHelper( - MachineBasicBlock *OldTop, - const MachineLoop &L, +MachineBasicBlock *MachineBlockPlacement::findBestLoopTopHelper( + MachineBasicBlock *OldTop, const MachineLoop &L, const BlockFilterSet &LoopBlockSet) { // Check that the header hasn't been fused with a preheader block due to // crazy branches. 
If it has, we need to start with the header at the top to @@ -2153,8 +2144,8 @@ MachineBlockPlacement::findBestLoopTopHelper( if (!canMoveBottomBlockToTop(Pred, OldTop)) continue; - BlockFrequency Gains = FallThroughGains(Pred, OldTop, OtherBB, - LoopBlockSet); + BlockFrequency Gains = + FallThroughGains(Pred, OldTop, OtherBB, LoopBlockSet); if ((Gains > BlockFrequency(0)) && (Gains > BestGains || ((Gains == BestGains) && Pred->isLayoutSuccessor(OldTop)))) { @@ -2204,7 +2195,7 @@ MachineBlockPlacement::findBestLoopTop(const MachineLoop &L, OldTop = NewTop; NewTop = findBestLoopTopHelper(OldTop, L, LoopBlockSet); if (NewTop != OldTop) - ComputedEdges[NewTop] = { OldTop, false }; + ComputedEdges[NewTop] = {OldTop, false}; } return NewTop; } @@ -2336,10 +2327,8 @@ MachineBlockPlacement::findBestLoopExit(const MachineLoop &L, /// /// 1. Look for a Pred that can be layout before Top. /// 2. Check if Top is the most possible successor of Pred. -bool -MachineBlockPlacement::hasViableTopFallthrough( - const MachineBasicBlock *Top, - const BlockFilterSet &LoopBlockSet) { +bool MachineBlockPlacement::hasViableTopFallthrough( + const MachineBasicBlock *Top, const BlockFilterSet &LoopBlockSet) { for (MachineBasicBlock *Pred : Top->predecessors()) { BlockChain *PredChain = BlockToChain[Pred]; if (!LoopBlockSet.count(Pred) && @@ -2491,7 +2480,7 @@ void MachineBlockPlacement::rotateLoopWithProfile( if (!LoopBlockSet.count(Pred) && (!PredChain || Pred == *std::prev(PredChain->end()))) { auto EdgeFreq = MBFI->getBlockFreq(Pred) * - MBPI->getEdgeProbability(Pred, ChainHeaderBB); + MBPI->getEdgeProbability(Pred, ChainHeaderBB); auto FallThruCost = ScaleBlockFrequency(EdgeFreq, MisfetchCost); // If the predecessor has only an unconditional jump to the header, we // need to consider the cost of this jump. @@ -2951,12 +2940,16 @@ void MachineBlockPlacement::alignBlocks() { // exclusively on the loop info here so that we can align backedges in // unnatural CFGs and backedges that were introduced purely because of the // loop rotations done during this layout pass. - if (F->getFunction().hasMinSize() || - (F->getFunction().hasOptSize() && !TLI->alignLoopsWithOptSize())) - return; + if (!AlignAllBlock && !AlignAllNonFallThruBlocks) { + if (F->getFunction().hasMinSize() || + (F->getFunction().hasOptSize() && !TLI->alignLoopsWithOptSize())) + return; + } + BlockChain &FunctionChain = *BlockToChain[&F->front()]; + // Empty chain. if (FunctionChain.begin() == FunctionChain.end()) - return; // Empty chain. + return; const BranchProbability ColdProb(1, 5); // 20% BlockFrequency EntryFreq = MBFI->getBlockFreq(&F->front()); @@ -3052,6 +3045,33 @@ void MachineBlockPlacement::alignBlocks() { DetermineMaxAlignmentPadding(); } } + + const bool HasMaxBytesOverride = + MaxBytesForAlignmentOverride.getNumOccurrences() > 0; + + if (AlignAllBlock) + // Align all of the blocks in the function to a specific alignment. + for (MachineBasicBlock &MBB : *F) { + if (HasMaxBytesOverride) + MBB.setAlignment(Align(1ULL << AlignAllBlock), + MaxBytesForAlignmentOverride); + else + MBB.setAlignment(Align(1ULL << AlignAllBlock)); + } + else if (AlignAllNonFallThruBlocks) { + // Align all of the blocks that have no fall-through predecessors to a + // specific alignment. 
+ for (auto MBI = std::next(F->begin()), MBE = F->end(); MBI != MBE; ++MBI) { + auto LayoutPred = std::prev(MBI); + if (!LayoutPred->isSuccessor(&*MBI)) { + if (HasMaxBytesOverride) + MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks), + MaxBytesForAlignmentOverride); + else + MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks)); + } + } + } } /// Tail duplicate \p BB into (some) predecessors if profitable, repeating if @@ -3142,67 +3162,66 @@ bool MachineBlockPlacement::maybeTailDuplicateBlock( // This has to be a callback because none of it can be done after // BB is deleted. bool Removed = false; - auto RemovalCallback = - [&](MachineBasicBlock *RemBB) { - // Signal to outer function - Removed = true; - - // Conservative default. - bool InWorkList = true; - // Remove from the Chain and Chain Map - if (BlockToChain.count(RemBB)) { - BlockChain *Chain = BlockToChain[RemBB]; - InWorkList = Chain->UnscheduledPredecessors == 0; - Chain->remove(RemBB); - BlockToChain.erase(RemBB); - } - - // Handle the unplaced block iterator - if (&(*PrevUnplacedBlockIt) == RemBB) { - PrevUnplacedBlockIt++; - } - - // Handle the Work Lists - if (InWorkList) { - SmallVectorImpl &RemoveList = BlockWorkList; - if (RemBB->isEHPad()) - RemoveList = EHPadWorkList; - llvm::erase(RemoveList, RemBB); - } - - // Handle the filter set - if (BlockFilter) { - auto It = llvm::find(*BlockFilter, RemBB); - // Erase RemBB from BlockFilter, and keep PrevUnplacedBlockInFilterIt - // pointing to the same element as before. - if (It != BlockFilter->end()) { - if (It < PrevUnplacedBlockInFilterIt) { - const MachineBasicBlock *PrevBB = *PrevUnplacedBlockInFilterIt; - // BlockFilter is a SmallVector so all elements after RemBB are - // shifted to the front by 1 after its deletion. - auto Distance = PrevUnplacedBlockInFilterIt - It - 1; - PrevUnplacedBlockInFilterIt = BlockFilter->erase(It) + Distance; - assert(*PrevUnplacedBlockInFilterIt == PrevBB); - (void)PrevBB; - } else if (It == PrevUnplacedBlockInFilterIt) - // The block pointed by PrevUnplacedBlockInFilterIt is erased, we - // have to set it to the next element. - PrevUnplacedBlockInFilterIt = BlockFilter->erase(It); - else - BlockFilter->erase(It); - } - } + auto RemovalCallback = [&](MachineBasicBlock *RemBB) { + // Signal to outer function + Removed = true; + + // Conservative default. + bool InWorkList = true; + // Remove from the Chain and Chain Map + if (BlockToChain.count(RemBB)) { + BlockChain *Chain = BlockToChain[RemBB]; + InWorkList = Chain->UnscheduledPredecessors == 0; + Chain->remove(RemBB); + BlockToChain.erase(RemBB); + } + + // Handle the unplaced block iterator + if (&(*PrevUnplacedBlockIt) == RemBB) { + PrevUnplacedBlockIt++; + } + + // Handle the Work Lists + if (InWorkList) { + SmallVectorImpl &RemoveList = BlockWorkList; + if (RemBB->isEHPad()) + RemoveList = EHPadWorkList; + llvm::erase(RemoveList, RemBB); + } + + // Handle the filter set + if (BlockFilter) { + auto It = llvm::find(*BlockFilter, RemBB); + // Erase RemBB from BlockFilter, and keep PrevUnplacedBlockInFilterIt + // pointing to the same element as before. + if (It != BlockFilter->end()) { + if (It < PrevUnplacedBlockInFilterIt) { + const MachineBasicBlock *PrevBB = *PrevUnplacedBlockInFilterIt; + // BlockFilter is a SmallVector so all elements after RemBB are + // shifted to the front by 1 after its deletion. 
+ auto Distance = PrevUnplacedBlockInFilterIt - It - 1; + PrevUnplacedBlockInFilterIt = BlockFilter->erase(It) + Distance; + assert(*PrevUnplacedBlockInFilterIt == PrevBB); + (void)PrevBB; + } else if (It == PrevUnplacedBlockInFilterIt) + // The block pointed by PrevUnplacedBlockInFilterIt is erased, we + // have to set it to the next element. + PrevUnplacedBlockInFilterIt = BlockFilter->erase(It); + else + BlockFilter->erase(It); + } + } - // Remove the block from loop info. - MLI->removeBlock(RemBB); - if (RemBB == PreferredLoopExit) - PreferredLoopExit = nullptr; + // Remove the block from loop info. + MLI->removeBlock(RemBB); + if (RemBB == PreferredLoopExit) + PreferredLoopExit = nullptr; - LLVM_DEBUG(dbgs() << "TailDuplicator deleted block: " - << getBlockName(RemBB) << "\n"); - }; + LLVM_DEBUG(dbgs() << "TailDuplicator deleted block: " << getBlockName(RemBB) + << "\n"); + }; auto RemovalCallbackRef = - function_ref(RemovalCallback); + function_ref(RemovalCallback); SmallVector DuplicatedPreds; bool IsSimple = TailDup.isSimpleBB(BB); @@ -3223,11 +3242,11 @@ bool MachineBlockPlacement::maybeTailDuplicateBlock( DuplicatedToLPred = false; for (MachineBasicBlock *Pred : DuplicatedPreds) { // We're only looking for unscheduled predecessors that match the filter. - BlockChain* PredChain = BlockToChain[Pred]; + BlockChain *PredChain = BlockToChain[Pred]; if (Pred == LPred) DuplicatedToLPred = true; - if (Pred == LPred || (BlockFilter && !BlockFilter->count(Pred)) - || PredChain == &Chain) + if (Pred == LPred || (BlockFilter && !BlockFilter->count(Pred)) || + PredChain == &Chain) continue; for (MachineBasicBlock *NewSucc : Pred->successors()) { if (BlockFilter && !BlockFilter->count(NewSucc)) @@ -3297,8 +3316,7 @@ bool MachineBlockPlacement::isBestSuccessor(MachineBasicBlock *BB, // Find out the predecessors of BB and BB can be beneficially duplicated into // them. void MachineBlockPlacement::findDuplicateCandidates( - SmallVectorImpl &Candidates, - MachineBasicBlock *BB, + SmallVectorImpl &Candidates, MachineBasicBlock *BB, BlockFilterSet *BlockFilter) { MachineBasicBlock *Fallthrough = nullptr; BranchProbability DefaultBranchProb = BranchProbability::getZero(); @@ -3407,31 +3425,53 @@ void MachineBlockPlacement::findDuplicateCandidates( } } -void MachineBlockPlacement::initDupThreshold() { +void MachineBlockPlacement::initTailDupThreshold() { DupThreshold = BlockFrequency(0); - if (!F->getFunction().hasProfileData()) - return; + if (F->getFunction().hasProfileData()) { + // We prefer to use prifile count. + uint64_t HotThreshold = PSI->getOrCompHotCountThreshold(); + if (HotThreshold != UINT64_MAX) { + UseProfileCount = true; + DupThreshold = + BlockFrequency(HotThreshold * TailDupProfilePercentThreshold / 100); + } else { + // Profile count is not available, we can use block frequency instead. + BlockFrequency MaxFreq = BlockFrequency(0); + for (MachineBasicBlock &MBB : *F) { + BlockFrequency Freq = MBFI->getBlockFreq(&MBB); + if (Freq > MaxFreq) + MaxFreq = Freq; + } - // We prefer to use prifile count. - uint64_t HotThreshold = PSI->getOrCompHotCountThreshold(); - if (HotThreshold != UINT64_MAX) { - UseProfileCount = true; - DupThreshold = - BlockFrequency(HotThreshold * TailDupProfilePercentThreshold / 100); - return; + BranchProbability ThresholdProb(TailDupPlacementPenalty, 100); + DupThreshold = BlockFrequency(MaxFreq * ThresholdProb); + UseProfileCount = false; + } } - // Profile count is not available, we can use block frequency instead. 
- BlockFrequency MaxFreq = BlockFrequency(0); - for (MachineBasicBlock &MBB : *F) { - BlockFrequency Freq = MBFI->getBlockFreq(&MBB); - if (Freq > MaxFreq) - MaxFreq = Freq; + TailDupSize = TailDupPlacementThreshold; + // If only the aggressive threshold is explicitly set, use it. + if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 && + TailDupPlacementThreshold.getNumOccurrences() == 0) + TailDupSize = TailDupPlacementAggressiveThreshold; + + // For aggressive optimization, we can adjust some thresholds to be less + // conservative. + if (PassConfig->getOptLevel() >= CodeGenOptLevel::Aggressive) { + // At O3 we should be more willing to copy blocks for tail duplication. This + // increases size pressure, so we only do it at O3 + // Do this unless only the regular threshold is explicitly set. + if (TailDupPlacementThreshold.getNumOccurrences() == 0 || + TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0) + TailDupSize = TailDupPlacementAggressiveThreshold; } - BranchProbability ThresholdProb(TailDupPlacementPenalty, 100); - DupThreshold = BlockFrequency(MaxFreq * ThresholdProb); - UseProfileCount = false; + // If there's no threshold provided through options, query the target + // information for a threshold instead. + if (TailDupPlacementThreshold.getNumOccurrences() == 0 && + (PassConfig->getOptLevel() < CodeGenOptLevel::Aggressive || + TailDupPlacementAggressiveThreshold.getNumOccurrences() == 0)) + TailDupSize = TII->getTailDuplicateSize(PassConfig->getOptLevel()); } bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { @@ -3451,8 +3491,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { TLI = MF.getSubtarget().getTargetLowering(); MPDT = nullptr; PSI = &getAnalysis().getPSI(); - - initDupThreshold(); + PassConfig = &getAnalysis(); // Initialize PreferredLoopExit to nullptr here since it may never be set if // there are no MachineLoops. @@ -3463,38 +3502,17 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { assert(ComputedEdges.empty() && "Computed Edge map should be empty before starting placement."); - unsigned TailDupSize = TailDupPlacementThreshold; - // If only the aggressive threshold is explicitly set, use it. - if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 && - TailDupPlacementThreshold.getNumOccurrences() == 0) - TailDupSize = TailDupPlacementAggressiveThreshold; - - TargetPassConfig *PassConfig = &getAnalysis(); - // For aggressive optimization, we can adjust some thresholds to be less - // conservative. - if (PassConfig->getOptLevel() >= CodeGenOptLevel::Aggressive) { - // At O3 we should be more willing to copy blocks for tail duplication. This - // increases size pressure, so we only do it at O3 - // Do this unless only the regular threshold is explicitly set. - if (TailDupPlacementThreshold.getNumOccurrences() == 0 || - TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0) - TailDupSize = TailDupPlacementAggressiveThreshold; - } - - // If there's no threshold provided through options, query the target - // information for a threshold instead. - if (TailDupPlacementThreshold.getNumOccurrences() == 0 && - (PassConfig->getOptLevel() < CodeGenOptLevel::Aggressive || - TailDupPlacementAggressiveThreshold.getNumOccurrences() == 0)) - TailDupSize = TII->getTailDuplicateSize(PassConfig->getOptLevel()); + // Initialize tail duplication thresholds. + initTailDupThreshold(); + // Apply tail duplication. 
if (allowTailDupPlacement()) { MPDT = &getAnalysis().getPostDomTree(); bool OptForSize = MF.getFunction().hasOptSize() || llvm::shouldOptimizeForSize(&MF, PSI, &MBFI->getMBFI()); if (OptForSize) TailDupSize = 1; - bool PreRegAlloc = false; + const bool PreRegAlloc = false; TailDup.initMF(MF, PreRegAlloc, MBPI, MBFI.get(), PSI, /* LayoutMode */ true, TailDupSize); precomputeTriangleChains(); @@ -3505,12 +3523,12 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { // Changing the layout can create new tail merging opportunities. // TailMerge can create jump into if branches that make CFG irreducible for // HW that requires structured CFG. - bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() && - PassConfig->getEnableTailMerge() && - BranchFoldPlacement; + const bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() && + PassConfig->getEnableTailMerge() && + BranchFoldPlacement && MF.size() > 3; // No tail merging opportunities if the block number is less than four. - if (MF.size() > 3 && EnableTailMerge) { - unsigned TailMergeSize = TailDupSize + 1; + if (EnableTailMerge) { + const unsigned TailMergeSize = TailDupSize + 1; BranchFolder BF(/*DefaultEnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI, *MBPI, PSI, TailMergeSize); @@ -3545,32 +3563,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) { ComputedEdges.clear(); ChainAllocator.DestroyAll(); - bool HasMaxBytesOverride = - MaxBytesForAlignmentOverride.getNumOccurrences() > 0; - - if (AlignAllBlock) - // Align all of the blocks in the function to a specific alignment. - for (MachineBasicBlock &MBB : MF) { - if (HasMaxBytesOverride) - MBB.setAlignment(Align(1ULL << AlignAllBlock), - MaxBytesForAlignmentOverride); - else - MBB.setAlignment(Align(1ULL << AlignAllBlock)); - } - else if (AlignAllNonFallThruBlocks) { - // Align all of the blocks that have no fall-through predecessors to a - // specific alignment. - for (auto MBI = std::next(MF.begin()), MBE = MF.end(); MBI != MBE; ++MBI) { - auto LayoutPred = std::prev(MBI); - if (!LayoutPred->isSuccessor(&*MBI)) { - if (HasMaxBytesOverride) - MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks), - MaxBytesForAlignmentOverride); - else - MBI->setAlignment(Align(1ULL << AlignAllNonFallThruBlocks)); - } - } - } + // View the function. if (ViewBlockLayoutWithBFI != GVDT_None && (ViewBlockFreqFuncName.empty() || F->getFunction().getName() == ViewBlockFreqFuncName)) { From f12d72d9c95ea2cd82d3e2a70c8cd5d5e88e2060 Mon Sep 17 00:00:00 2001 From: David Green Date: Tue, 24 Sep 2024 16:41:53 +0100 Subject: [PATCH 25/49] [PtrInfo] Use plain pointer over 'PointerIntPair'. NFC (#109772) This PtrInfo holds a pointer and whether it has been set. We can represent this sentinal value as a nullptr, as we would for most pointers. This assumes that the value is never set to nullptr, but from the uses that appears to be true and already assumed. --- llvm/include/llvm/Analysis/PtrUseVisitor.h | 33 ++++++++++------------ 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/llvm/include/llvm/Analysis/PtrUseVisitor.h b/llvm/include/llvm/Analysis/PtrUseVisitor.h index 237d328721609b..bbe2741f44fc3d 100644 --- a/llvm/include/llvm/Analysis/PtrUseVisitor.h +++ b/llvm/include/llvm/Analysis/PtrUseVisitor.h @@ -52,57 +52,54 @@ class PtrUseVisitorBase { /// analysis and whether the visit completed or aborted early. 
class PtrInfo { public: - PtrInfo() : AbortedInfo(nullptr, false), EscapedInfo(nullptr, false) {} - /// Reset the pointer info, clearing all state. void reset() { - AbortedInfo.setPointer(nullptr); - AbortedInfo.setInt(false); - EscapedInfo.setPointer(nullptr); - EscapedInfo.setInt(false); + AbortedInfo = nullptr; + EscapedInfo = nullptr; } /// Did we abort the visit early? - bool isAborted() const { return AbortedInfo.getInt(); } + bool isAborted() const { return AbortedInfo != nullptr; } /// Is the pointer escaped at some point? - bool isEscaped() const { return EscapedInfo.getInt(); } + bool isEscaped() const { return EscapedInfo != nullptr; } /// Get the instruction causing the visit to abort. /// \returns a pointer to the instruction causing the abort if one is /// available; otherwise returns null. - Instruction *getAbortingInst() const { return AbortedInfo.getPointer(); } + Instruction *getAbortingInst() const { return AbortedInfo; } /// Get the instruction causing the pointer to escape. /// \returns a pointer to the instruction which escapes the pointer if one /// is available; otherwise returns null. - Instruction *getEscapingInst() const { return EscapedInfo.getPointer(); } + Instruction *getEscapingInst() const { return EscapedInfo; } /// Mark the visit as aborted. Intended for use in a void return. /// \param I The instruction which caused the visit to abort, if available. - void setAborted(Instruction *I = nullptr) { - AbortedInfo.setInt(true); - AbortedInfo.setPointer(I); + void setAborted(Instruction *I) { + assert(I && "Expected a valid pointer in setAborted"); + AbortedInfo = I; } /// Mark the pointer as escaped. Intended for use in a void return. /// \param I The instruction which escapes the pointer, if available. - void setEscaped(Instruction *I = nullptr) { - EscapedInfo.setInt(true); - EscapedInfo.setPointer(I); + void setEscaped(Instruction *I) { + assert(I && "Expected a valid pointer in setEscaped"); + EscapedInfo = I; } /// Mark the pointer as escaped, and the visit as aborted. Intended /// for use in a void return. /// \param I The instruction which both escapes the pointer and aborts the /// visit, if available. 
- void setEscapedAndAborted(Instruction *I = nullptr) { + void setEscapedAndAborted(Instruction *I) { setEscaped(I); setAborted(I); } private: - PointerIntPair AbortedInfo, EscapedInfo; + Instruction *AbortedInfo = nullptr; + Instruction *EscapedInfo = nullptr; }; protected: From d075debc508898d5f365f8e909c54d6f4edada85 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Tue, 24 Sep 2024 16:50:46 +0100 Subject: [PATCH 26/49] [AMDGPU] Fix chain handling when lowering barrier intrinsics (#109799) Previously we would fail an assertion in RemoveNodeFromCSEMaps after lowering: t3: ch = llvm.amdgcn.s.barrier.join t0, TargetConstant:i64<2973>, Constant:i32<0> to: t6: ch = S_BARRIER_JOIN_IMM TargetConstant:i32<0> --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 5 +- .../AMDGPU/llvm.amdgcn.s.barrier.wait.ll | 221 +++++++++--------- 2 files changed, 117 insertions(+), 109 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index a9754ba357893f..08f2ff4566b674 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -9365,6 +9365,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op, Opc = AMDGPU::S_GET_BARRIER_STATE_IMM; SDValue K = DAG.getTargetConstant(BarID, DL, MVT::i32); Ops.push_back(K); + Ops.push_back(Chain); } else { Opc = AMDGPU::S_GET_BARRIER_STATE_M0; SDValue M0Val = copyToM0(DAG, Chain, DL, Op.getOperand(2)); @@ -9967,7 +9968,9 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op, 0); } Ops.push_back(copyToM0(DAG, Chain, DL, M0Val).getValue(0)); - } else if (!IsInlinableBarID) { + } else if (IsInlinableBarID) { + Ops.push_back(Chain); + } else { Ops.push_back(copyToM0(DAG, Chain, DL, BarOp).getValue(0)); } diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll index a4be9ed8c2b4af..4fb28b392c9ea9 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.s.barrier.wait.ll @@ -768,17 +768,19 @@ define void @test5_s_barrier_init_m0(i32 %arg1 ,i32 %arg2) { } define amdgpu_kernel void @test1_s_barrier_join(ptr addrspace(1) %out) #0 { +; ; GFX12-SDAG-LABEL: test1_s_barrier_join: ; GFX12-SDAG: ; %bb.0: ; %entry ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_barrier_join -1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-SDAG-NEXT: s_barrier_join -1 +; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -810,17 +812,19 @@ entry: } define amdgpu_kernel void @test2_s_barrier_join(ptr addrspace(1) %out) #0 { +; ; GFX12-SDAG-LABEL: test2_s_barrier_join: ; GFX12-SDAG: ; %bb.0: ; %entry ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_barrier_join 1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) 
| instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-SDAG-NEXT: s_barrier_join 1 +; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -852,17 +856,19 @@ entry: } define amdgpu_kernel void @test3_s_barrier_join(ptr addrspace(1) %out) #0 { +; ; GFX12-SDAG-LABEL: test3_s_barrier_join: ; GFX12-SDAG: ; %bb.0: ; %entry ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_barrier_join 0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-SDAG-NEXT: s_barrier_join 0 +; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -967,6 +973,20 @@ define void @test5_s_barrier_join_m0(i32 %arg) { ret void } +define void @test6_s_barrier_join_0() { +; GFX12-LABEL: test6_s_barrier_join_0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_barrier_join 0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + call void @llvm.amdgcn.s.barrier.join(i32 0) + ret void +} + define amdgpu_kernel void @test1_s_barrier_leave(ptr addrspace(1) %a, ptr addrspace(1) %b, ptr addrspace(1) %c, ptr addrspace(1) %out) #0 { ; GFX12-SDAG-LABEL: test1_s_barrier_leave: ; GFX12-SDAG: ; %bb.0: ; %entry @@ -1026,17 +1046,19 @@ entry: } define amdgpu_kernel void @test1_s_wakeup_barrier(ptr addrspace(1) %out) #0 { +; ; GFX12-SDAG-LABEL: test1_s_wakeup_barrier: ; GFX12-SDAG: ; %bb.0: ; %entry ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_wakeup_barrier -1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-SDAG-NEXT: s_wakeup_barrier -1 +; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -1068,17 +1090,19 @@ entry: } define 
amdgpu_kernel void @test2_s_wakeup_barrier(ptr addrspace(1) %out) #0 { +; ; GFX12-SDAG-LABEL: test2_s_wakeup_barrier: ; GFX12-SDAG: ; %bb.0: ; %entry ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_wakeup_barrier 1 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-SDAG-NEXT: s_wakeup_barrier 1 +; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -1110,17 +1134,19 @@ entry: } define amdgpu_kernel void @test3_s_wakeup_barrier(ptr addrspace(1) %out) #0 { +; ; GFX12-SDAG-LABEL: test3_s_wakeup_barrier: ; GFX12-SDAG: ; %bb.0: ; %entry ; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 ; GFX12-SDAG-NEXT: v_and_b32_e32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: s_wakeup_barrier 0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX12-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) +; GFX12-SDAG-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_lshlrev_b32 v3, 2, v0 ; GFX12-SDAG-NEXT: v_mul_u32_u24_e32 v1, v0, v0 -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX12-SDAG-NEXT: v_sub_nc_u32_e32 v0, v1, v0 ; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX12-SDAG-NEXT: global_store_b32 v3, v2, s[0:1] +; GFX12-SDAG-NEXT: s_wakeup_barrier 0 +; GFX12-SDAG-NEXT: global_store_b32 v3, v0, s[0:1] ; GFX12-SDAG-NEXT: s_nop 0 ; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX12-SDAG-NEXT: s_endpgm @@ -1226,34 +1252,21 @@ define void @test5_s_wakeup_barrier_m0(i32 %arg) { } define amdgpu_kernel void @test1_s_get_barrier_state(ptr addrspace(1) %out) #0 { -; GFX12-SDAG-LABEL: test1_s_get_barrier_state: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_get_barrier_state s4, -1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: test1_s_get_barrier_state: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-GISEL-NEXT: s_get_barrier_state s2, -1 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm +; GFX12-LABEL: test1_s_get_barrier_state: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_get_barrier_state s2, -1 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -1264,34 +1277,21 @@ entry: } define amdgpu_kernel void @test2_s_get_barrier_state(ptr addrspace(1) %out) #0 { -; GFX12-SDAG-LABEL: test2_s_get_barrier_state: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_get_barrier_state s4, 1 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: test2_s_get_barrier_state: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-GISEL-NEXT: s_get_barrier_state s2, 1 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm +; GFX12-LABEL: test2_s_get_barrier_state: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_get_barrier_state s2, 1 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -1302,34 +1302,21 @@ entry: } define amdgpu_kernel void @test3_s_get_barrier_state(ptr addrspace(1) %out) #0 { -; GFX12-SDAG-LABEL: test3_s_get_barrier_state: -; GFX12-SDAG: ; %bb.0: ; %entry -; GFX12-SDAG-NEXT: s_get_barrier_state s4, 0 -; GFX12-SDAG-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-SDAG-NEXT: s_wait_kmcnt 0x0 -; GFX12-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 0x3ff, v0 -; 
GFX12-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-SDAG-NEXT: s_nop 0 -; GFX12-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-SDAG-NEXT: s_endpgm -; -; GFX12-GISEL-LABEL: test3_s_get_barrier_state: -; GFX12-GISEL: ; %bb.0: ; %entry -; GFX12-GISEL-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX12-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 -; GFX12-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) -; GFX12-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-GISEL-NEXT: s_get_barrier_state s2, 0 -; GFX12-GISEL-NEXT: s_wait_kmcnt 0x0 -; GFX12-GISEL-NEXT: v_mov_b32_e32 v1, s2 -; GFX12-GISEL-NEXT: global_store_b32 v0, v1, s[0:1] -; GFX12-GISEL-NEXT: s_nop 0 -; GFX12-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX12-GISEL-NEXT: s_endpgm +; GFX12-LABEL: test3_s_get_barrier_state: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX12-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_and_b32 v0, 0x3ff, v0 +; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_2) +; GFX12-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_get_barrier_state s2, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_mov_b32_e32 v1, s2 +; GFX12-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX12-NEXT: s_nop 0 +; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX12-NEXT: s_endpgm entry: %tmp = call i32 @llvm.amdgcn.workitem.id.x() %tmp1 = getelementptr i32, ptr addrspace(1) %out, i32 %tmp @@ -1401,6 +1388,24 @@ define i32 @test5_s_get_barrier_state_m0(i32 %arg) { ret i32 %state } +define i32 @test6_s_get_barrier_state_0() { +; GFX12-LABEL: test6_s_get_barrier_state_0: +; GFX12: ; %bb.0: +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: s_wait_expcnt 0x0 +; GFX12-NEXT: s_wait_samplecnt 0x0 +; GFX12-NEXT: s_wait_bvhcnt 0x0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_get_barrier_state s0, 0 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_wait_alu 0xfffe +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_mov_b32_e32 v0, s0 +; GFX12-NEXT: s_setpc_b64 s[30:31] + %state = call i32 @llvm.amdgcn.s.get.barrier.state(i32 0) + ret i32 %state +} + define amdgpu_kernel void @test_barrier_convert(ptr addrspace(1) %out) #0 { ; GFX12-SDAG-LABEL: test_barrier_convert: ; GFX12-SDAG: ; %bb.0: ; %entry From d4a38c8ff5c993e14c42895b51a47272fb03a857 Mon Sep 17 00:00:00 2001 From: Usman Nadeem Date: Tue, 24 Sep 2024 08:54:36 -0700 Subject: [PATCH 27/49] =?UTF-8?q?[DFAJumpThreading]=20Handle=20select=20un?= =?UTF-8?q?folding=20when=20user=20phi=20is=20not=20a=20dir=E2=80=A6=20(#1?= =?UTF-8?q?09511)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …ect successor Previously the code assumed that the select instruction is defined in a block that is a direct predecessor of the block where the PHINode uses it. So, we were hitting an assertion when we tried to access the def block as an incoming block for the user phi node. This patch handles that case by using the correct end block and creating a new phi node that aggregates both the values of the select in that end block, and then using that new unfolded phi to overwrite the original user phi node. 
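For illustration, a reduced and hypothetical IR sketch of the CFG shape in question (the real trigger additionally involves the switch-based state machine the pass threads, as in the pr106083 tests added below): the block defining the select is not a direct predecessor of the block containing its user PHI.

  ; Hypothetical, reduced example (not one of this patch's tests).
  define i32 @sketch(i1 %c, i1 %d) {
  bb:                                            ; defines the select
    %sel = select i1 %c, i32 0, i32 1
    br label %succ                               ; unconditional terminator

  succ:                                          ; unique successor of %bb
    br i1 %d, label %merge, label %other

  other:
    br label %merge

  merge:                                         ; user PHI lives here, yet %bb
    %p = phi i32 [ %sel, %succ ], [ 0, %other ]  ; is not among its predecessors
    ret i32 %p
  }

With the fix, unfolding uses the unique successor (%succ above) as the end block, builds an aggregating phi there for the two select values, and replaces the select in the user phi with that new phi.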
Fixes #106083 Change-Id: Ie471994cca232318f74a6e6438efa21e561c2dc0 --- .../Transforms/Scalar/DFAJumpThreading.cpp | 46 ++++--- .../dfa-jump-threading-transform.ll | 123 ++++++++++++++++++ 2 files changed, 152 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp index ef9c264482a640..0e2b5c925a6a7a 100644 --- a/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/DFAJumpThreading.cpp @@ -194,7 +194,6 @@ void unfold(DomTreeUpdater *DTU, LoopInfo *LI, SelectInstToUnfold SIToUnfold, SelectInst *SI = SIToUnfold.getInst(); PHINode *SIUse = SIToUnfold.getUse(); BasicBlock *StartBlock = SI->getParent(); - BasicBlock *EndBlock = SIUse->getParent(); BranchInst *StartBlockTerm = dyn_cast(StartBlock->getTerminator()); @@ -202,6 +201,7 @@ void unfold(DomTreeUpdater *DTU, LoopInfo *LI, SelectInstToUnfold SIToUnfold, assert(SI->hasOneUse()); if (StartBlockTerm->isUnconditional()) { + BasicBlock *EndBlock = StartBlock->getUniqueSuccessor(); // Arbitrarily choose the 'false' side for a new input value to the PHI. BasicBlock *NewBlock = BasicBlock::Create( SI->getContext(), Twine(SI->getName(), ".si.unfold.false"), @@ -223,32 +223,44 @@ void unfold(DomTreeUpdater *DTU, LoopInfo *LI, SelectInstToUnfold SIToUnfold, NewBlock->getFirstInsertionPt()); NewPhi->addIncoming(SIOp2, StartBlock); - if (auto *OpSi = dyn_cast(SIOp1)) - NewSIsToUnfold->push_back(SelectInstToUnfold(OpSi, SIUse)); - if (auto *OpSi = dyn_cast(SIOp2)) - NewSIsToUnfold->push_back(SelectInstToUnfold(OpSi, NewPhi)); - - // Update the phi node of SI. - for (unsigned Idx = 0; Idx < SIUse->getNumIncomingValues(); ++Idx) { - if (SIUse->getIncomingBlock(Idx) == StartBlock) - SIUse->setIncomingValue(Idx, SIOp1); + // Update any other PHI nodes in EndBlock. + for (PHINode &Phi : EndBlock->phis()) { + if (SIUse == &Phi) + continue; + Phi.addIncoming(Phi.getIncomingValueForBlock(StartBlock), NewBlock); } - SIUse->addIncoming(NewPhi, NewBlock); - // Update any other PHI nodes in EndBlock. - for (auto II = EndBlock->begin(); PHINode *Phi = dyn_cast(II); - ++II) { - if (Phi != SIUse) - Phi->addIncoming(Phi->getIncomingValueForBlock(StartBlock), NewBlock); + // Update the phi node of SI, which is its only use. + if (EndBlock == SIUse->getParent()) { + SIUse->addIncoming(NewPhi, NewBlock); + SIUse->replaceUsesOfWith(SI, SIOp1); + } else { + PHINode *EndPhi = PHINode::Create(SIUse->getType(), pred_size(EndBlock), + Twine(SI->getName(), ".si.unfold.phi"), + EndBlock->getFirstInsertionPt()); + for (BasicBlock *Pred : predecessors(EndBlock)) { + if (Pred != StartBlock && Pred != NewBlock) + EndPhi->addIncoming(EndPhi, Pred); + } + + EndPhi->addIncoming(SIOp1, StartBlock); + EndPhi->addIncoming(NewPhi, NewBlock); + SIUse->replaceUsesOfWith(SI, EndPhi); + SIUse = EndPhi; } - StartBlockTerm->eraseFromParent(); + if (auto *OpSi = dyn_cast(SIOp1)) + NewSIsToUnfold->push_back(SelectInstToUnfold(OpSi, SIUse)); + if (auto *OpSi = dyn_cast(SIOp2)) + NewSIsToUnfold->push_back(SelectInstToUnfold(OpSi, NewPhi)); // Insert the real conditional branch based on the original condition. 
+ StartBlockTerm->eraseFromParent(); BranchInst::Create(EndBlock, NewBlock, SI->getCondition(), StartBlock); DTU->applyUpdates({{DominatorTree::Insert, StartBlock, EndBlock}, {DominatorTree::Insert, StartBlock, NewBlock}}); } else { + BasicBlock *EndBlock = SIUse->getParent(); BasicBlock *NewBlockT = BasicBlock::Create( SI->getContext(), Twine(SI->getName(), ".si.unfold.true"), EndBlock->getParent(), EndBlock); diff --git a/llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-transform.ll b/llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-transform.ll index c38f81d0f046ef..cba1ba8dde768e 100644 --- a/llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-transform.ll +++ b/llvm/test/Transforms/DFAJumpThreading/dfa-jump-threading-transform.ll @@ -300,3 +300,126 @@ define void @self-reference() { end: ret void } + +define void @pr106083_invalidBBarg_fold(i1 %cmp1, i1 %cmp2, i1 %not, ptr %d) { +; CHECK-LABEL: @pr106083_invalidBBarg_fold( +; CHECK-NEXT: bb: +; CHECK-NEXT: br i1 [[CMP1:%.*]], label [[BB1:%.*]], label [[SEL_SI_UNFOLD_FALSE:%.*]] +; CHECK: sel.si.unfold.false: +; CHECK-NEXT: [[DOTSI_UNFOLD_PHI1:%.*]] = phi i32 [ 1, [[BB:%.*]] ] +; CHECK-NEXT: br label [[BB1]] +; CHECK: BB1: +; CHECK-NEXT: [[I:%.*]] = phi i16 [ 0, [[BB1_BACKEDGE:%.*]] ], [ 0, [[BB]] ], [ 1, [[BB7:%.*]] ], [ 0, [[SEL_SI_UNFOLD_FALSE]] ], [ 1, [[BB7_JT0:%.*]] ] +; CHECK-NEXT: [[SEL_SI_UNFOLD_PHI:%.*]] = phi i32 [ [[SEL_SI_UNFOLD_PHI]], [[BB1_BACKEDGE]] ], [ [[SEL_SI_UNFOLD_PHI]], [[BB7]] ], [ 0, [[BB]] ], [ [[DOTSI_UNFOLD_PHI1]], [[SEL_SI_UNFOLD_FALSE]] ], [ [[SEL_SI_UNFOLD_PHI]], [[BB7_JT0]] ] +; CHECK-NEXT: br i1 [[NOT:%.*]], label [[BB7_JT0]], label [[BB2:%.*]] +; CHECK: BB2: +; CHECK-NEXT: store i16 0, ptr [[D:%.*]], align 2 +; CHECK-NEXT: br i1 [[CMP2:%.*]], label [[BB7]], label [[SPEC_SELECT_SI_UNFOLD_FALSE_JT0:%.*]] +; CHECK: spec.select.si.unfold.false: +; CHECK-NEXT: br label [[BB7]] +; CHECK: spec.select.si.unfold.false.jt0: +; CHECK-NEXT: [[DOTSI_UNFOLD_PHI_JT0:%.*]] = phi i32 [ 0, [[BB2]] ] +; CHECK-NEXT: br label [[BB7_JT0]] +; CHECK: BB7: +; CHECK-NEXT: [[D_PROMOTED4:%.*]] = phi i16 [ 1, [[BB2]] ], [ 1, [[SPEC_SELECT_SI_UNFOLD_FALSE:%.*]] ] +; CHECK-NEXT: [[_3:%.*]] = phi i32 [ [[SEL_SI_UNFOLD_PHI]], [[BB2]] ], [ poison, [[SPEC_SELECT_SI_UNFOLD_FALSE]] ] +; CHECK-NEXT: switch i32 [[_3]], label [[BB1_BACKEDGE]] [ +; CHECK-NEXT: i32 0, label [[BB1]] +; CHECK-NEXT: i32 1, label [[BB8:%.*]] +; CHECK-NEXT: ] +; CHECK: BB7.jt0: +; CHECK-NEXT: [[D_PROMOTED4_JT0:%.*]] = phi i16 [ 0, [[BB1]] ], [ 1, [[SPEC_SELECT_SI_UNFOLD_FALSE_JT0]] ] +; CHECK-NEXT: [[_3_JT0:%.*]] = phi i32 [ 0, [[BB1]] ], [ [[DOTSI_UNFOLD_PHI_JT0]], [[SPEC_SELECT_SI_UNFOLD_FALSE_JT0]] ] +; CHECK-NEXT: br label [[BB1]] +; CHECK: BB1.backedge: +; CHECK-NEXT: br label [[BB1]] +; CHECK: BB8: +; CHECK-NEXT: ret void +; +bb: + %sel = select i1 %cmp1, i32 0, i32 1 + br label %BB1 + +BB1: ; preds = %BB1.backedge, %BB7, %bb + %i = phi i16 [ 0, %BB1.backedge ], [ 0, %bb ], [ 1, %BB7 ] + br i1 %not, label %BB7, label %BB2 + +BB2: ; preds = %BB1 + store i16 0, ptr %d, align 2 + %spec.select = select i1 %cmp2, i32 %sel, i32 0 + br label %BB7 + +BB7: ; preds = %BB2, %BB1 + %d.promoted4 = phi i16 [ 0, %BB1 ], [ 1, %BB2 ] + %_3 = phi i32 [ 0, %BB1 ], [ %spec.select, %BB2 ] + switch i32 %_3, label %BB1.backedge [ + i32 0, label %BB1 + i32 1, label %BB8 + ] + +BB1.backedge: ; preds = %BB7 + br label %BB1 + +BB8: ; preds = %BB7 + ret void +} + +define void @pr106083_select_dead_uses(i1 %cmp1, i1 %not, ptr %p) { +; CHECK-LABEL: 
@pr106083_select_dead_uses( +; CHECK-NEXT: bb: +; CHECK-NEXT: br i1 [[CMP1:%.*]], label [[DOTLOOPEXIT6:%.*]], label [[SPEC_SELECT_SI_UNFOLD_FALSE:%.*]] +; CHECK: spec.select.si.unfold.false: +; CHECK-NEXT: [[DOTSI_UNFOLD_PHI1:%.*]] = phi i32 [ 1, [[BB:%.*]] ] +; CHECK-NEXT: br label [[DOTLOOPEXIT6]] +; CHECK: .loopexit6: +; CHECK-NEXT: [[SPEC_SELECT_SI_UNFOLD_PHI:%.*]] = phi i32 [ [[SPEC_SELECT_SI_UNFOLD_PHI]], [[SELECT_UNFOLD:%.*]] ], [ 0, [[BB]] ], [ [[DOTSI_UNFOLD_PHI1]], [[SPEC_SELECT_SI_UNFOLD_FALSE]] ] +; CHECK-NEXT: br i1 [[NOT:%.*]], label [[SELECT_UNFOLD_JT0:%.*]], label [[BB1:%.*]] +; CHECK: bb1: +; CHECK-NEXT: [[I:%.*]] = load i32, ptr [[P:%.*]], align 4 +; CHECK-NEXT: [[NOT2:%.*]] = icmp eq i32 0, 0 +; CHECK-NEXT: br i1 [[NOT2]], label [[SELECT_UNFOLD]], label [[SPEC_SELECT7_SI_UNFOLD_FALSE_JT0:%.*]] +; CHECK: spec.select7.si.unfold.false: +; CHECK-NEXT: br label [[SELECT_UNFOLD]] +; CHECK: spec.select7.si.unfold.false.jt0: +; CHECK-NEXT: [[DOTSI_UNFOLD_PHI_JT0:%.*]] = phi i32 [ 0, [[BB1]] ] +; CHECK-NEXT: br label [[SELECT_UNFOLD_JT0]] +; CHECK: select.unfold: +; CHECK-NEXT: [[_2:%.*]] = phi i32 [ [[SPEC_SELECT_SI_UNFOLD_PHI]], [[BB1]] ], [ poison, [[SPEC_SELECT7_SI_UNFOLD_FALSE:%.*]] ] +; CHECK-NEXT: switch i32 [[_2]], label [[BB2:%.*]] [ +; CHECK-NEXT: i32 0, label [[DOTPREHEADER_PREHEADER:%.*]] +; CHECK-NEXT: i32 1, label [[DOTLOOPEXIT6]] +; CHECK-NEXT: ] +; CHECK: select.unfold.jt0: +; CHECK-NEXT: [[_2_JT0:%.*]] = phi i32 [ 0, [[DOTLOOPEXIT6]] ], [ [[DOTSI_UNFOLD_PHI_JT0]], [[SPEC_SELECT7_SI_UNFOLD_FALSE_JT0]] ] +; CHECK-NEXT: br label [[DOTPREHEADER_PREHEADER]] +; CHECK: .preheader.preheader: +; CHECK-NEXT: ret void +; CHECK: bb2: +; CHECK-NEXT: unreachable +; +bb: + %spec.select = select i1 %cmp1, i32 0, i32 1 + br label %.loopexit6 + +.loopexit6: ; preds = %select.unfold, %bb + br i1 %not, label %select.unfold, label %bb1 + +bb1: ; preds = %.loopexit6 + %i = load i32, ptr %p, align 4 + %not2 = icmp eq i32 0, 0 + %spec.select7 = select i1 %not2, i32 %spec.select, i32 0 + br label %select.unfold + +select.unfold: ; preds = %bb1, %.loopexit6 + %_2 = phi i32 [ 0, %.loopexit6 ], [ %spec.select7, %bb1 ] + switch i32 %_2, label %bb2 [ + i32 0, label %.preheader.preheader + i32 1, label %.loopexit6 + ] + +.preheader.preheader: ; preds = %select.unfold + ret void + +bb2: ; preds = %select.unfold + unreachable +} From 679c9717dfc9687a3bca78b45d9fd104b67e16f9 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Tue, 24 Sep 2024 18:07:18 +0200 Subject: [PATCH 28/49] [clang][bytecode] Fix vector shifts on big-endian systems (#109800) For shifts, the LHS and RHS element types might be different. The variable naming here could probably use some love now, but I'm trying to fix this as fast as possible. See the discussion in https://github.com/llvm/llvm-project/pull/108949 --- clang/lib/AST/ByteCode/Compiler.cpp | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 754cd0db9868b7..785918846976d4 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ b/clang/lib/AST/ByteCode/Compiler.cpp @@ -1282,9 +1282,8 @@ bool Compiler::VisitVectorBinOp(const BinaryOperator *E) { ? BinaryOperator::getOpForCompoundAssignment(E->getOpcode()) : E->getOpcode(); - // The LHS and RHS of a comparison operator must have the same type. So we - // just use LHS vector element type here. 
PrimType ElemT = this->classifyVectorElementType(LHS->getType()); + PrimType RHSElemT = this->classifyVectorElementType(RHS->getType()); PrimType ResultElemT = this->classifyVectorElementType(E->getType()); // Evaluate LHS and save value to LHSOffset. @@ -1312,7 +1311,7 @@ bool Compiler::VisitVectorBinOp(const BinaryOperator *E) { PrimType PromotT = classifyPrim(PromotTy); PrimType OpT = NeedIntPromot ? PromotT : ElemT; - auto getElem = [=](unsigned Offset, unsigned Index) { + auto getElem = [=](unsigned Offset, PrimType ElemT, unsigned Index) { if (!this->emitGetLocal(PT_Ptr, Offset, E)) return false; if (!this->emitArrayElemPop(ElemT, Index, E)) @@ -1342,9 +1341,9 @@ bool Compiler::VisitVectorBinOp(const BinaryOperator *E) { } for (unsigned I = 0; I != VecTy->getNumElements(); ++I) { - if (!getElem(LHSOffset, I)) + if (!getElem(LHSOffset, ElemT, I)) return false; - if (!getElem(RHSOffset, I)) + if (!getElem(RHSOffset, RHSElemT, I)) return false; switch (Op) { case BO_Add: @@ -1372,11 +1371,11 @@ bool Compiler::VisitVectorBinOp(const BinaryOperator *E) { return false; break; case BO_Shl: - if (!this->emitShl(OpT, ElemT, E)) + if (!this->emitShl(OpT, RHSElemT, E)) return false; break; case BO_Shr: - if (!this->emitShr(OpT, ElemT, E)) + if (!this->emitShr(OpT, RHSElemT, E)) return false; break; case BO_EQ: From 206408732bca2ef464732a39c8319d47c8a1dbea Mon Sep 17 00:00:00 2001 From: Jason Molenda Date: Tue, 24 Sep 2024 09:08:08 -0700 Subject: [PATCH 29/49] [lldb][NFC] Add a missing setter method for UnwindPlans (#109751) The UnwindPlan class has getter and setter methods for specifying an abstract register location, but it doesn't have a setter for a DWARF Expression register location. This hasn't been needed for any of the UnwindPlans that we do in mainline lldb yet, but it is used in the Swift language SwiftLanguageRuntime plugin which creates UnwindPlan for async functions. While it's currently unused on main branch, this is a straightforward setter in the same form as the others, the only caveat being that it doesn't own the dwarf expression bytes, it has a pointer to them. --- lldb/include/lldb/Symbol/UnwindPlan.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/lldb/include/lldb/Symbol/UnwindPlan.h b/lldb/include/lldb/Symbol/UnwindPlan.h index a1d00f2d2c0cd1..e1567c7357d0b5 100644 --- a/lldb/include/lldb/Symbol/UnwindPlan.h +++ b/lldb/include/lldb/Symbol/UnwindPlan.h @@ -370,6 +370,13 @@ class UnwindPlan { bool SetRegisterLocationToSame(uint32_t reg_num, bool must_replace); + /// This method does not make a copy of the \a opcodes memory, it is + /// assumed to have the same lifetime as the Module this UnwindPlan will + /// be registered in. 
+ bool SetRegisterLocationToIsDWARFExpression(uint32_t reg_num, + const uint8_t *opcodes, + uint32_t len, bool can_replace); + bool SetRegisterLocationToIsConstant(uint32_t reg_num, uint64_t constant, bool can_replace); From 7773243d9916f98ba0ffce0c3a960e4aa9f03e81 Mon Sep 17 00:00:00 2001 From: Elvina Yakubova Date: Tue, 24 Sep 2024 17:10:47 +0100 Subject: [PATCH 30/49] [SLP] Move more X86 tests to common directory (#109821) Some of the tests from the X86 directory can be generalized to improve coverage for other architectures --- .../SLPVectorizer/{X86 => }/peek-through-shuffle.ll | 3 ++- .../SLPVectorizer/{X86 => }/phi-node-bitwidt-op-not.ll | 3 ++- .../Transforms/SLPVectorizer/{X86 => }/phi-undef-input.ll | 3 ++- .../Transforms/SLPVectorizer/{X86 => }/postponed_gathers.ll | 3 ++- .../SLPVectorizer/{X86 => }/pr31599-inseltpoison.ll | 3 ++- llvm/test/Transforms/SLPVectorizer/{X86 => }/pr31599.ll | 3 ++- .../{X86 => }/reduction-gather-non-scheduled-extracts.ll | 3 ++- .../SLPVectorizer/{X86 => }/reduction-modified-values.ll | 3 ++- .../SLPVectorizer/{X86 => }/reorder-clustered-node.ll | 3 ++- .../SLPVectorizer/{X86 => }/reordered-top-scalars.ll | 3 ++- .../SLPVectorizer/{X86 => }/reordering-single-phi.ll | 3 ++- .../{X86 => }/reused-buildvector-matching-vectorized-node.ll | 3 ++- .../SLPVectorizer/{X86 => }/root-trunc-extract-reuse.ll | 3 ++- .../{X86 => }/same-scalar-in-same-phi-extract.ll | 3 ++- .../Transforms/SLPVectorizer/{X86 => }/scalarazied-result.ll | 3 ++- .../SLPVectorizer/{X86 => }/scalarization-overhead.ll | 3 ++- .../SLPVectorizer/{X86 => }/shrink_after_reorder2.ll | 3 ++- .../SLPVectorizer/{X86 => }/shuffle-multivector.ll | 3 ++- .../Transforms/SLPVectorizer/{X86 => }/shufflebuilder-bug.ll | 3 ++- .../Transforms/SLPVectorizer/{X86 => }/stores-non-ordered.ll | 3 ++- .../Transforms/SLPVectorizer/{X86 => }/unknown-entries.ll | 5 ++--- .../SLPVectorizer/{X86 => }/zext-incoming-for-neg-icmp.ll | 3 ++- 22 files changed, 44 insertions(+), 24 deletions(-) rename llvm/test/Transforms/SLPVectorizer/{X86 => }/peek-through-shuffle.ll (85%) rename llvm/test/Transforms/SLPVectorizer/{X86 => }/phi-node-bitwidt-op-not.ll (94%) rename llvm/test/Transforms/SLPVectorizer/{X86 => }/phi-undef-input.ll (96%) rename llvm/test/Transforms/SLPVectorizer/{X86 => }/postponed_gathers.ll (90%) rename llvm/test/Transforms/SLPVectorizer/{X86 => }/pr31599-inseltpoison.ll (78%) rename llvm/test/Transforms/SLPVectorizer/{X86 => }/pr31599.ll (78%) rename llvm/test/Transforms/SLPVectorizer/{X86 => }/reduction-gather-non-scheduled-extracts.ll (85%) rename llvm/test/Transforms/SLPVectorizer/{X86 => }/reduction-modified-values.ll (83%) rename llvm/test/Transforms/SLPVectorizer/{X86 => }/reorder-clustered-node.ll (93%) rename llvm/test/Transforms/SLPVectorizer/{X86 => }/reordered-top-scalars.ll (83%) rename llvm/test/Transforms/SLPVectorizer/{X86 => }/reordering-single-phi.ll (93%) rename llvm/test/Transforms/SLPVectorizer/{X86 => }/reused-buildvector-matching-vectorized-node.ll (94%) rename llvm/test/Transforms/SLPVectorizer/{X86 => }/root-trunc-extract-reuse.ll (86%) rename llvm/test/Transforms/SLPVectorizer/{X86 => }/same-scalar-in-same-phi-extract.ll (88%) rename llvm/test/Transforms/SLPVectorizer/{X86 => }/scalarazied-result.ll (60%) rename llvm/test/Transforms/SLPVectorizer/{X86 => }/scalarization-overhead.ll (92%) rename llvm/test/Transforms/SLPVectorizer/{X86 => }/shrink_after_reorder2.ll (91%) rename llvm/test/Transforms/SLPVectorizer/{X86 => }/shuffle-multivector.ll (89%) rename 
llvm/test/Transforms/SLPVectorizer/{X86 => }/shufflebuilder-bug.ll (89%) rename llvm/test/Transforms/SLPVectorizer/{X86 => }/stores-non-ordered.ll (92%) rename llvm/test/Transforms/SLPVectorizer/{X86 => }/unknown-entries.ll (82%) rename llvm/test/Transforms/SLPVectorizer/{X86 => }/zext-incoming-for-neg-icmp.ll (89%) diff --git a/llvm/test/Transforms/SLPVectorizer/X86/peek-through-shuffle.ll b/llvm/test/Transforms/SLPVectorizer/peek-through-shuffle.ll similarity index 85% rename from llvm/test/Transforms/SLPVectorizer/X86/peek-through-shuffle.ll rename to llvm/test/Transforms/SLPVectorizer/peek-through-shuffle.ll index c157f6117df959..839c1ebed6bcff 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/peek-through-shuffle.ll +++ b/llvm/test/Transforms/SLPVectorizer/peek-through-shuffle.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -passes=slp-vectorizer < %s -mtriple=x86_64-unknown-linux-gnu -o - | FileCheck %s +; RUN: %if x86-registered-target %{ opt -S -passes=slp-vectorizer < %s -mtriple=x86_64-unknown-linux-gnu -o - | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer < %s -mtriple=aarch64-unknown-linux-gnu -o - | FileCheck %s %} define void @foo(ptr %0, <4 x float> %1) { ; CHECK-LABEL: @foo( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi-node-bitwidt-op-not.ll b/llvm/test/Transforms/SLPVectorizer/phi-node-bitwidt-op-not.ll similarity index 94% rename from llvm/test/Transforms/SLPVectorizer/X86/phi-node-bitwidt-op-not.ll rename to llvm/test/Transforms/SLPVectorizer/phi-node-bitwidt-op-not.ll index f376ca71c77693..2037e0d67d2f89 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/phi-node-bitwidt-op-not.ll +++ b/llvm/test/Transforms/SLPVectorizer/phi-node-bitwidt-op-not.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s +; RUN: %if x86-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} define i32 @test(ptr %b, ptr %c, i32 %0, ptr %a, i1 %tobool3.not) { ; CHECK-LABEL: define i32 @test( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/phi-undef-input.ll b/llvm/test/Transforms/SLPVectorizer/phi-undef-input.ll similarity index 96% rename from llvm/test/Transforms/SLPVectorizer/X86/phi-undef-input.ll rename to llvm/test/Transforms/SLPVectorizer/phi-undef-input.ll index 3cc32c1fc7b28e..b9802a0adb8aaf 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/phi-undef-input.ll +++ b/llvm/test/Transforms/SLPVectorizer/phi-undef-input.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=slp-vectorizer -slp-threshold=-1000 -mtriple=x86_64 -S | FileCheck %s +; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -slp-threshold=-1000 -mtriple=x86_64 -S | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -slp-threshold=-1000 -mtriple=aarch64-unknown-linux-gnu -S | FileCheck %s %} ; The inputs to vector phi should remain undef. 
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/postponed_gathers.ll b/llvm/test/Transforms/SLPVectorizer/postponed_gathers.ll similarity index 90% rename from llvm/test/Transforms/SLPVectorizer/X86/postponed_gathers.ll rename to llvm/test/Transforms/SLPVectorizer/postponed_gathers.ll index 488ca0b23cd9c5..f6bed797b9ba91 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/postponed_gathers.ll +++ b/llvm/test/Transforms/SLPVectorizer/postponed_gathers.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt < %s -passes=slp-vectorizer -slp-threshold=-10 -mtriple=x86_64-unknown-linux-gnu -S | FileCheck %s +; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -slp-threshold=-10 -mtriple=x86_64-unknown-linux-gnu -S | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -slp-threshold=-10 -mtriple=aarch64-unknown-linux-gnu -S | FileCheck %s %} define void @foo() { ; CHECK-LABEL: define void @foo() { diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr31599-inseltpoison.ll b/llvm/test/Transforms/SLPVectorizer/pr31599-inseltpoison.ll similarity index 78% rename from llvm/test/Transforms/SLPVectorizer/X86/pr31599-inseltpoison.ll rename to llvm/test/Transforms/SLPVectorizer/pr31599-inseltpoison.ll index 5506f61fe134bd..fe5871d73cd5e2 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr31599-inseltpoison.ll +++ b/llvm/test/Transforms/SLPVectorizer/pr31599-inseltpoison.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s +; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} define <2 x float> @foo() { ; CHECK-LABEL: @foo( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/pr31599.ll b/llvm/test/Transforms/SLPVectorizer/pr31599.ll similarity index 78% rename from llvm/test/Transforms/SLPVectorizer/X86/pr31599.ll rename to llvm/test/Transforms/SLPVectorizer/pr31599.ll index 348656e07c6be4..10b9b224d556e5 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/pr31599.ll +++ b/llvm/test/Transforms/SLPVectorizer/pr31599.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s +; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} define <2 x float> @foo() { ; CHECK-LABEL: @foo( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-gather-non-scheduled-extracts.ll b/llvm/test/Transforms/SLPVectorizer/reduction-gather-non-scheduled-extracts.ll similarity index 85% rename from llvm/test/Transforms/SLPVectorizer/X86/reduction-gather-non-scheduled-extracts.ll rename to llvm/test/Transforms/SLPVectorizer/reduction-gather-non-scheduled-extracts.ll index 03c8767eff327f..f1034f39711351 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-gather-non-scheduled-extracts.ll +++ b/llvm/test/Transforms/SLPVectorizer/reduction-gather-non-scheduled-extracts.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_test_checks.py UTC_ARGS: --version 3 -; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-sie-ps5 < %s | FileCheck %s +; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-sie-ps5 < %s | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} define void @tes() { ; CHECK-LABEL: define void @tes() { diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reduction-modified-values.ll b/llvm/test/Transforms/SLPVectorizer/reduction-modified-values.ll similarity index 83% rename from llvm/test/Transforms/SLPVectorizer/X86/reduction-modified-values.ll rename to llvm/test/Transforms/SLPVectorizer/reduction-modified-values.ll index dbf490c5fe6a2f..be9318e467174a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reduction-modified-values.ll +++ b/llvm/test/Transforms/SLPVectorizer/reduction-modified-values.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s +; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} define i32 @test() { ; CHECK-LABEL: @test( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reorder-clustered-node.ll b/llvm/test/Transforms/SLPVectorizer/reorder-clustered-node.ll similarity index 93% rename from llvm/test/Transforms/SLPVectorizer/X86/reorder-clustered-node.ll rename to llvm/test/Transforms/SLPVectorizer/reorder-clustered-node.ll index 1a6ff2385905b3..561182d5e4f49d 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reorder-clustered-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/reorder-clustered-node.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64 -slp-threshold=-150 | FileCheck %s +; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S < %s -mtriple=x86_64 -slp-threshold=-150 | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S < %s -mtriple=aarch64-unknown-linux-gnu -slp-threshold=-150 | FileCheck %s %} define i1 @test(ptr %arg, ptr %i233, i64 %i241, ptr %i235, ptr %i237, ptr %i227) { ; CHECK-LABEL: @test( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reordered-top-scalars.ll b/llvm/test/Transforms/SLPVectorizer/reordered-top-scalars.ll similarity index 83% rename from llvm/test/Transforms/SLPVectorizer/X86/reordered-top-scalars.ll rename to llvm/test/Transforms/SLPVectorizer/reordered-top-scalars.ll index 4517d27598b603..1de5ee2298837a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reordered-top-scalars.ll +++ b/llvm/test/Transforms/SLPVectorizer/reordered-top-scalars.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown %s -slp-threshold=-5 | FileCheck %s +; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown %s -slp-threshold=-5 | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown %s -slp-threshold=-5 | FileCheck %s %} define i32 @test(ptr %isec) { ; CHECK-LABEL: @test( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reordering-single-phi.ll 
b/llvm/test/Transforms/SLPVectorizer/reordering-single-phi.ll similarity index 93% rename from llvm/test/Transforms/SLPVectorizer/X86/reordering-single-phi.ll rename to llvm/test/Transforms/SLPVectorizer/reordering-single-phi.ll index bc1eaaac5d1bbc..a70daf9cf8d60c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reordering-single-phi.ll +++ b/llvm/test/Transforms/SLPVectorizer/reordering-single-phi.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux < %s | FileCheck %s +; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux < %s | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux < %s | FileCheck %s %} @a = external global [32000 x float], align 64 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/reused-buildvector-matching-vectorized-node.ll b/llvm/test/Transforms/SLPVectorizer/reused-buildvector-matching-vectorized-node.ll similarity index 94% rename from llvm/test/Transforms/SLPVectorizer/X86/reused-buildvector-matching-vectorized-node.ll rename to llvm/test/Transforms/SLPVectorizer/reused-buildvector-matching-vectorized-node.ll index 2b425ee624700f..3e00550a885215 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/reused-buildvector-matching-vectorized-node.ll +++ b/llvm/test/Transforms/SLPVectorizer/reused-buildvector-matching-vectorized-node.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s +; RUN: %if x86-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} define void @blam(ptr %arg, double %load2, i1 %fcmp3) { ; CHECK-LABEL: define void @blam diff --git a/llvm/test/Transforms/SLPVectorizer/X86/root-trunc-extract-reuse.ll b/llvm/test/Transforms/SLPVectorizer/root-trunc-extract-reuse.ll similarity index 86% rename from llvm/test/Transforms/SLPVectorizer/X86/root-trunc-extract-reuse.ll rename to llvm/test/Transforms/SLPVectorizer/root-trunc-extract-reuse.ll index af46b4f576234b..34c068478c5f5e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/root-trunc-extract-reuse.ll +++ b/llvm/test/Transforms/SLPVectorizer/root-trunc-extract-reuse.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-100 -mtriple=x86_64 < %s | FileCheck %s +; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S -slp-threshold=-100 -mtriple=x86_64 < %s | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S -slp-threshold=-100 -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} define i1 @test() { ; CHECK-LABEL: @test( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/same-scalar-in-same-phi-extract.ll b/llvm/test/Transforms/SLPVectorizer/same-scalar-in-same-phi-extract.ll similarity index 88% rename from llvm/test/Transforms/SLPVectorizer/X86/same-scalar-in-same-phi-extract.ll rename to llvm/test/Transforms/SLPVectorizer/same-scalar-in-same-phi-extract.ll index f1be11d0d0fc51..fe0813542f3093 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/same-scalar-in-same-phi-extract.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/same-scalar-in-same-phi-extract.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S --passes=slp-vectorizer -slp-threshold=-99999 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s +; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -slp-threshold=-99999 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -slp-threshold=-99999 -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} define void @test(i32 %arg) { ; CHECK-LABEL: define void @test( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scalarazied-result.ll b/llvm/test/Transforms/SLPVectorizer/scalarazied-result.ll similarity index 60% rename from llvm/test/Transforms/SLPVectorizer/X86/scalarazied-result.ll rename to llvm/test/Transforms/SLPVectorizer/scalarazied-result.ll index 1d6e191c6f97bf..2570cdb45e1e78 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/scalarazied-result.ll +++ b/llvm/test/Transforms/SLPVectorizer/scalarazied-result.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -S < %s | FileCheck %s +; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -S < %s | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu -S < %s | FileCheck %s %} define void @test() { ; CHECK-LABEL: @test( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/scalarization-overhead.ll b/llvm/test/Transforms/SLPVectorizer/scalarization-overhead.ll similarity index 92% rename from llvm/test/Transforms/SLPVectorizer/X86/scalarization-overhead.ll rename to llvm/test/Transforms/SLPVectorizer/scalarization-overhead.ll index 55e155840f8588..9f6b285f1ab90a 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/scalarization-overhead.ll +++ b/llvm/test/Transforms/SLPVectorizer/scalarization-overhead.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -mtriple=x86_64-- -passes=slp-vectorizer -S < %s | FileCheck %s +; RUN: %if x86-registered-target %{ opt -mtriple=x86_64-- -passes=slp-vectorizer -S < %s | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -mtriple=aarch64-- -passes=slp-vectorizer -S < %s | FileCheck %s %} ; Crash Test case reported on D134605 diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll b/llvm/test/Transforms/SLPVectorizer/shrink_after_reorder2.ll similarity index 91% rename from llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll rename to llvm/test/Transforms/SLPVectorizer/shrink_after_reorder2.ll index 9e3ba05f88da8d..2f0bd4a8f1315c 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/shrink_after_reorder2.ll +++ b/llvm/test/Transforms/SLPVectorizer/shrink_after_reorder2.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -S -o - -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s +; RUN: %if x86-registered-target %{ opt -S -o - -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -S -o - -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} %class.e = type { i32, i32 } %struct.a = type { i32, i32, i32, i32 } diff --git 
a/llvm/test/Transforms/SLPVectorizer/X86/shuffle-multivector.ll b/llvm/test/Transforms/SLPVectorizer/shuffle-multivector.ll similarity index 89% rename from llvm/test/Transforms/SLPVectorizer/X86/shuffle-multivector.ll rename to llvm/test/Transforms/SLPVectorizer/shuffle-multivector.ll index c2555889f59816..2253c70dc25015 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/shuffle-multivector.ll +++ b/llvm/test/Transforms/SLPVectorizer/shuffle-multivector.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-163 | FileCheck %s +; RUN: %if x86-registered-target %{ opt -passes=slp-vectorizer -S < %s -mtriple=x86_64-unknown-linux -slp-threshold=-163 | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -passes=slp-vectorizer -S < %s -mtriple=aarch64-unknown-linux -slp-threshold=-163 | FileCheck %s %} define void @test1(i128 %p0, i128 %p1, i128 %p2, i128 %p3, <4 x i128> %vec) { ; CHECK-LABEL: @test1( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/shufflebuilder-bug.ll b/llvm/test/Transforms/SLPVectorizer/shufflebuilder-bug.ll similarity index 89% rename from llvm/test/Transforms/SLPVectorizer/X86/shufflebuilder-bug.ll rename to llvm/test/Transforms/SLPVectorizer/shufflebuilder-bug.ll index 9db7d696c7c7eb..019c9eadd7c096 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/shufflebuilder-bug.ll +++ b/llvm/test/Transforms/SLPVectorizer/shufflebuilder-bug.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 -; RUN: opt -S -p slp-vectorizer -mtriple=x86_64-- %s | FileCheck %s +; RUN: %if x86-registered-target %{ opt -S -p slp-vectorizer -mtriple=x86_64-- %s | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -S -p slp-vectorizer -mtriple=aarch64-unknown-linux-gnu %s | FileCheck %s %} define void @foo(<4 x float> %vec, float %val, ptr %ptr) { ; CHECK-LABEL: define void @foo diff --git a/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll b/llvm/test/Transforms/SLPVectorizer/stores-non-ordered.ll similarity index 92% rename from llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll rename to llvm/test/Transforms/SLPVectorizer/stores-non-ordered.ll index a9748ca6291ae2..aaa6be73056bd4 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/stores-non-ordered.ll +++ b/llvm/test/Transforms/SLPVectorizer/stores-non-ordered.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -S -mtriple=x86_64-unknown -passes=slp-vectorizer -slp-min-reg-size=64 -slp-threshold=-1000 | FileCheck %s +; RUN: %if x86-registered-target %{ opt < %s -S -mtriple=x86_64-unknown -passes=slp-vectorizer -slp-min-reg-size=64 -slp-threshold=-1000 | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt < %s -S -mtriple=aarch64-unknown -passes=slp-vectorizer -slp-min-reg-size=64 -slp-threshold=-1000 | FileCheck %s %} define i32 @non-ordered-stores(ptr noalias nocapture %in, ptr noalias nocapture %inn, ptr noalias nocapture %out) { ; CHECK-LABEL: @non-ordered-stores( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/unknown-entries.ll b/llvm/test/Transforms/SLPVectorizer/unknown-entries.ll similarity index 82% rename from llvm/test/Transforms/SLPVectorizer/X86/unknown-entries.ll rename to llvm/test/Transforms/SLPVectorizer/unknown-entries.ll index fc22280c2b8ada..ca9aa451a9a3ae 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/unknown-entries.ll +++ 
b/llvm/test/Transforms/SLPVectorizer/unknown-entries.ll @@ -1,7 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt < %s -passes=slp-vectorizer -S | FileCheck %s - -target triple = "x86_64-unknown-linux-gnu" +; RUN: %if x86-registered-target %{ opt < %s -passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu -S | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt < %s -passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu -S | FileCheck %s %} define <3 x i64> @ahyes(i64 %position, i64 %value) { ; CHECK-LABEL: define <3 x i64> @ahyes( diff --git a/llvm/test/Transforms/SLPVectorizer/X86/zext-incoming-for-neg-icmp.ll b/llvm/test/Transforms/SLPVectorizer/zext-incoming-for-neg-icmp.ll similarity index 89% rename from llvm/test/Transforms/SLPVectorizer/X86/zext-incoming-for-neg-icmp.ll rename to llvm/test/Transforms/SLPVectorizer/zext-incoming-for-neg-icmp.ll index 7f086d17ca4c08..89fcc7e983749b 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/zext-incoming-for-neg-icmp.ll +++ b/llvm/test/Transforms/SLPVectorizer/zext-incoming-for-neg-icmp.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; RUN: opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s +; RUN: %if x86-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s %} +; RUN: %if aarch64-registered-target %{ opt -S --passes=slp-vectorizer -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s %} define i32 @test(i32 %a, i8 %b, i8 %c) { ; CHECK-LABEL: define i32 @test( From fe6a3d46aa658fdd1e9a6cbb2031a597a3e59536 Mon Sep 17 00:00:00 2001 From: Joseph Huber Date: Tue, 24 Sep 2024 09:32:42 -0700 Subject: [PATCH 31/49] [libc] Implement the 'rename' function on the GPU (#109814) Summary: Straightforward implementation like the other `stdio.h` functions. 
--- libc/config/gpu/entrypoints.txt | 1 + libc/docs/gpu/support.rst | 1 + libc/include/llvm-libc-types/rpc_opcodes_t.h | 1 + libc/src/stdio/gpu/CMakeLists.txt | 11 +++++++ libc/src/stdio/gpu/rename.cpp | 30 ++++++++++++++++++++ libc/utils/gpu/server/rpc_server.cpp | 18 ++++++++++++ 6 files changed, 62 insertions(+) create mode 100644 libc/src/stdio/gpu/rename.cpp diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt index 9fb89e6fd8d28a..b4cfe47f4505fd 100644 --- a/libc/config/gpu/entrypoints.txt +++ b/libc/config/gpu/entrypoints.txt @@ -240,6 +240,7 @@ set(TARGET_LIBC_ENTRYPOINTS libc.src.stdio.putchar libc.src.stdio.puts libc.src.stdio.remove + libc.src.stdio.rename libc.src.stdio.stderr libc.src.stdio.stdin libc.src.stdio.stdout diff --git a/libc/docs/gpu/support.rst b/libc/docs/gpu/support.rst index 44c21c7b4c1ff9..9c151a5fbac1f6 100644 --- a/libc/docs/gpu/support.rst +++ b/libc/docs/gpu/support.rst @@ -240,6 +240,7 @@ fputs |check| |check| fputc |check| |check| fwrite |check| |check| remove |check| |check| +rename |check| |check| putc |check| |check| printf |check| |check| vprintf |check| |check| diff --git a/libc/include/llvm-libc-types/rpc_opcodes_t.h b/libc/include/llvm-libc-types/rpc_opcodes_t.h index 3b388de6888c5d..1a6c0cd9bc4a14 100644 --- a/libc/include/llvm-libc-types/rpc_opcodes_t.h +++ b/libc/include/llvm-libc-types/rpc_opcodes_t.h @@ -38,6 +38,7 @@ typedef enum { RPC_PRINTF_TO_STDERR_PACKED, RPC_PRINTF_TO_STREAM_PACKED, RPC_REMOVE, + RPC_RENAME, RPC_SYSTEM, RPC_LAST = 0xFFFF, } rpc_opcode_t; diff --git a/libc/src/stdio/gpu/CMakeLists.txt b/libc/src/stdio/gpu/CMakeLists.txt index 86470b8425e956..9cac42ed71fb76 100644 --- a/libc/src/stdio/gpu/CMakeLists.txt +++ b/libc/src/stdio/gpu/CMakeLists.txt @@ -294,6 +294,17 @@ add_entrypoint_object( .vfprintf_utils ) +add_entrypoint_object( + rename + SRCS + rename.cpp + HDRS + ../rename.h + DEPENDS + libc.hdr.types.FILE + .gpu_file +) + add_entrypoint_object( stdin SRCS diff --git a/libc/src/stdio/gpu/rename.cpp b/libc/src/stdio/gpu/rename.cpp new file mode 100644 index 00000000000000..1087228835842e --- /dev/null +++ b/libc/src/stdio/gpu/rename.cpp @@ -0,0 +1,30 @@ +//===-- GPU Implementation of rename --------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/stdio/rename.h" +#include "src/__support/CPP/string_view.h" +#include "src/__support/macros/config.h" +#include "src/stdio/gpu/file.h" + +#include "hdr/types/FILE.h" + +namespace LIBC_NAMESPACE_DECL { + +LLVM_LIBC_FUNCTION(int, rename, (const char *oldpath, const char *newpath)) { + int ret; + rpc::Client::Port port = rpc::client.open(); + port.send_n(oldpath, internal::string_length(oldpath) + 1); + port.send_n(newpath, internal::string_length(newpath) + 1); + port.recv( + [&](rpc::Buffer *buffer) { ret = static_cast(buffer->data[0]); }); + port.close(); + + return ret; +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/utils/gpu/server/rpc_server.cpp b/libc/utils/gpu/server/rpc_server.cpp index 8708f946b310ee..aa65dfe69c385c 100644 --- a/libc/utils/gpu/server/rpc_server.cpp +++ b/libc/utils/gpu/server/rpc_server.cpp @@ -392,6 +392,24 @@ rpc_status_t handle_server_impl( }); break; } + case RPC_RENAME: { + uint64_t oldsizes[lane_size] = {0}; + uint64_t newsizes[lane_size] = {0}; + void *oldpath[lane_size] = {nullptr}; + void *newpath[lane_size] = {nullptr}; + port->recv_n(oldpath, oldsizes, + [&](uint64_t size) { return new char[size]; }); + port->recv_n(newpath, newsizes, + [&](uint64_t size) { return new char[size]; }); + port->send([&](rpc::Buffer *buffer, uint32_t id) { + buffer->data[0] = static_cast( + rename(reinterpret_cast(oldpath[id]), + reinterpret_cast(newpath[id]))); + delete[] reinterpret_cast(oldpath[id]); + delete[] reinterpret_cast(newpath[id]); + }); + break; + } case RPC_SYSTEM: { uint64_t sizes[lane_size] = {0}; void *args[lane_size] = {nullptr}; From 8e68a512ef568324da60c8b2705e9b087d06ebcf Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 24 Sep 2024 16:09:33 +0100 Subject: [PATCH 32/49] [X86] Add test coverage for #109790 --- llvm/test/CodeGen/X86/pmulh.ll | 72 ++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll index c2a009f06b89df..65337b8e1316f4 100644 --- a/llvm/test/CodeGen/X86/pmulh.ll +++ b/llvm/test/CodeGen/X86/pmulh.ll @@ -937,6 +937,77 @@ define <16 x i32> @zext_mulhuw_v16i16_lshr(<16 x i16> %a, <16 x i16> %b) { ret <16 x i32> %d } +; PR109790 +define <16 x i16> @zext_mulhuw_v16i16_negative_constant(<16 x i16> %a) { +; SSE-LABEL: zext_mulhuw_v16i16_negative_constant: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [32767,32767,32767,32767,32767,32767,32767,32767] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [64536,64536,64536,64536,64536,64536,64536,64536] +; SSE-NEXT: pmulhw %xmm2, %xmm0 +; SSE-NEXT: pmulhw %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX-LABEL: zext_mulhuw_v16i16_negative_constant: +; AVX: # %bb.0: +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536,64536] +; AVX-NEXT: retq + %k = and <16 x i16> %a, + %x = zext nneg <16 x i16> %k to <16 x i32> + %m = mul nsw <16 x i32> %x, + %s = lshr <16 x i32> %m, + %t = trunc nuw <16 x i32> %s to <16 x i16> + ret <16 x i16> %t +} + +; PR109790 +define <16 x i16> @zext_mulhuw_v16i16_positive_constant(<16 x i16> %a) { +; SSE-LABEL: zext_mulhuw_v16i16_positive_constant: +; SSE: # %bb.0: +; SSE-NEXT: movdqa {{.*#+}} xmm2 = 
[32767,32767,32767,32767,32767,32767,32767,32767] +; SSE-NEXT: pand %xmm2, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: movdqa {{.*#+}} xmm2 = [1000,1000,1000,1000,1000,1000,1000,1000] +; SSE-NEXT: pmulhw %xmm2, %xmm0 +; SSE-NEXT: pmulhw %xmm2, %xmm1 +; SSE-NEXT: retq +; +; AVX2-LABEL: zext_mulhuw_v16i16_positive_constant: +; AVX2: # %bb.0: +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1000,0,1000,0,1000,0,1000,0,1000,0,1000,0,1000,0,1000,0] +; AVX2-NEXT: vpmulhuw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpmulhuw %ymm2, %ymm1, %ymm1 +; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX2-NEXT: retq +; +; AVX512F-LABEL: zext_mulhuw_v16i16_positive_constant: +; AVX512F: # %bb.0: +; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000] +; AVX512F-NEXT: retq +; +; AVX512BW-LABEL: zext_mulhuw_v16i16_positive_constant: +; AVX512BW: # %bb.0: +; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; AVX512BW-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [1000,0,1000,0,1000,0,1000,0,1000,0,1000,0,1000,0,1000,0,1000,0,1000,0,1000,0,1000,0,1000,0,1000,0,1000,0,1000,0] +; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 +; AVX512BW-NEXT: retq + %k = and <16 x i16> %a, + %x = zext nneg <16 x i16> %k to <16 x i32> + %m = mul nuw nsw <16 x i32> %x, + %s = lshr <16 x i32> %m, + %t = trunc nuw nsw <16 x i32> %s to <16 x i16> + ret <16 x i16> %t +} + define <16 x i32> @mulhsw_v16i16_lshr(<16 x i16> %a, <16 x i16> %b) { ; SSE2-LABEL: mulhsw_v16i16_lshr: ; SSE2: # %bb.0: @@ -2056,3 +2127,4 @@ define <8 x i16> @sse2_pmulhu_w_const(<8 x i16> %a0, <8 x i16> %a1) { ret <8 x i16> %res } declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16>, <8 x i16>) + From 01be0252c834bc7be222057049d697f488493543 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 24 Sep 2024 16:49:20 +0100 Subject: [PATCH 33/49] [X86] combinePMULH - simplify pattern matching with SDPatternMatch. NFC. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index d7a26dc4caec6c..cf6617f49e506e 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -52983,10 +52983,7 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG, // combiner. static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, SelectionDAG &DAG, const X86Subtarget &Subtarget) { - // First instruction should be a right shift of a multiply. 
- if (Src.getOpcode() != ISD::SRL || - Src.getOperand(0).getOpcode() != ISD::MUL) - return SDValue(); + using namespace llvm::SDPatternMatch; if (!Subtarget.hasSSE2()) return SDValue(); @@ -53001,15 +52998,12 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, if (InVT.getVectorElementType().getSizeInBits() < 32) return SDValue(); - // Need a shift by 16. - APInt ShiftAmt; - if (!ISD::isConstantSplatVector(Src.getOperand(1).getNode(), ShiftAmt) || - ShiftAmt != 16) + // First instruction should be a right shift by 16 of a multiply. + SDValue LHS, RHS; + if (!sd_match(Src, + m_Srl(m_Mul(m_Value(LHS), m_Value(RHS)), m_SpecificInt(16)))) return SDValue(); - SDValue LHS = Src.getOperand(0).getOperand(0); - SDValue RHS = Src.getOperand(0).getOperand(1); - // Count leading sign/zero bits on both inputs - if there are enough then // truncation back to vXi16 will be cheap - either as a pack/shuffle // sequence or using AVX512 truncations. If the inputs are sext/zext then the From 9e60a6ad4242274c39242079d5351fcb9d6d99e6 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 24 Sep 2024 16:54:58 +0100 Subject: [PATCH 34/49] [X86] combinePMULH - pull out repeated IsTruncateFree helper. NFC. --- llvm/lib/Target/X86/X86ISelLowering.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index cf6617f49e506e..65dc107f04b4f9 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -53021,12 +53021,12 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, return SDValue(); // Check if both inputs are extensions, which will be removed by truncation. - bool IsTruncateFree = (LHS.getOpcode() == ISD::SIGN_EXTEND || - LHS.getOpcode() == ISD::ZERO_EXTEND) && - (RHS.getOpcode() == ISD::SIGN_EXTEND || - RHS.getOpcode() == ISD::ZERO_EXTEND) && - LHS.getOperand(0).getScalarValueSizeInBits() <= 16 && - RHS.getOperand(0).getScalarValueSizeInBits() <= 16; + auto isOpTruncateFree = [](SDValue Op) { + return (Op.getOpcode() == ISD::SIGN_EXTEND || + Op.getOpcode() == ISD::ZERO_EXTEND) && + Op.getOperand(0).getScalarValueSizeInBits() <= 16; + }; + bool IsTruncateFree = isOpTruncateFree(LHS) && isOpTruncateFree(RHS); // For AVX2+ targets, with the upper bits known zero, we can perform MULHU on // the (bitcasted) inputs directly, and then cheaply pack/truncate the result From 406a2128f57b1e86748cd6de6d3eab6b1e2db2ad Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 24 Sep 2024 17:37:26 +0100 Subject: [PATCH 35/49] [X86] combinePMULH - we can treat constant vectors as freely truncatable. Fixes #109790 --- llvm/lib/Target/X86/X86ISelLowering.cpp | 7 +++--- llvm/test/CodeGen/X86/pmulh.ll | 31 ++++--------------------- 2 files changed, 9 insertions(+), 29 deletions(-) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 65dc107f04b4f9..d9eedfdfd53a43 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -53022,9 +53022,10 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL, // Check if both inputs are extensions, which will be removed by truncation. 
auto isOpTruncateFree = [](SDValue Op) { - return (Op.getOpcode() == ISD::SIGN_EXTEND || - Op.getOpcode() == ISD::ZERO_EXTEND) && - Op.getOperand(0).getScalarValueSizeInBits() <= 16; + if (Op.getOpcode() == ISD::SIGN_EXTEND || + Op.getOpcode() == ISD::ZERO_EXTEND) + return Op.getOperand(0).getScalarValueSizeInBits() <= 16; + return ISD::isBuildVectorOfConstantSDNodes(Op.getNode()); }; bool IsTruncateFree = isOpTruncateFree(LHS) && isOpTruncateFree(RHS); diff --git a/llvm/test/CodeGen/X86/pmulh.ll b/llvm/test/CodeGen/X86/pmulh.ll index 65337b8e1316f4..502249a87c4892 100644 --- a/llvm/test/CodeGen/X86/pmulh.ll +++ b/llvm/test/CodeGen/X86/pmulh.ll @@ -974,32 +974,11 @@ define <16 x i16> @zext_mulhuw_v16i16_positive_constant(<16 x i16> %a) { ; SSE-NEXT: pmulhw %xmm2, %xmm1 ; SSE-NEXT: retq ; -; AVX2-LABEL: zext_mulhuw_v16i16_positive_constant: -; AVX2: # %bb.0: -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero -; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm2 = [1000,0,1000,0,1000,0,1000,0,1000,0,1000,0,1000,0,1000,0] -; AVX2-NEXT: vpmulhuw %ymm2, %ymm0, %ymm0 -; AVX2-NEXT: vpmulhuw %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpackusdw %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] -; AVX2-NEXT: retq -; -; AVX512F-LABEL: zext_mulhuw_v16i16_positive_constant: -; AVX512F: # %bb.0: -; AVX512F-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512F-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000] -; AVX512F-NEXT: retq -; -; AVX512BW-LABEL: zext_mulhuw_v16i16_positive_constant: -; AVX512BW: # %bb.0: -; AVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX512BW-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; AVX512BW-NEXT: vpmulhuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [1000,0,1000,0,1000,0,1000,0,1000,0,1000,0,1000,0,1000,0,1000,0,1000,0,1000,0,1000,0,1000,0,1000,0,1000,0,1000,0] -; AVX512BW-NEXT: vpmovdw %zmm0, %ymm0 -; AVX512BW-NEXT: retq +; AVX-LABEL: zext_mulhuw_v16i16_positive_constant: +; AVX: # %bb.0: +; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: vpmulhw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 # [1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000,1000] +; AVX-NEXT: retq %k = and <16 x i16> %a, %x = zext nneg <16 x i16> %k to <16 x i32> %m = mul nuw nsw <16 x i32> %x, From e64673d3174af941eb3c9f1ad822792154aa1d31 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 24 Sep 2024 09:49:32 -0700 Subject: [PATCH 36/49] [RISCV] Treat insert_subvector into undef with index==0 as legal. (#109745) Regardless of fixed and scalable type. We can always use subreg ops. We don't need to do any container conversion. 
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index b998a1eb11c300..bf822eb2c6eeb5 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -10152,13 +10152,15 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, unsigned OrigIdx = Op.getConstantOperandVal(2); const RISCVRegisterInfo *TRI = Subtarget.getRegisterInfo(); + if (OrigIdx == 0 && Vec.isUndef()) + return Op; + // We don't have the ability to slide mask vectors up indexed by their i1 // elements; the smallest we can do is i8. Often we are able to bitcast to // equivalent i8 vectors. Note that when inserting a fixed-length vector // into a scalable one, we might not necessarily have enough scalable // elements to safely divide by 8: nxv1i1 = insert nxv1i1, v4i1 is valid. - if (SubVecVT.getVectorElementType() == MVT::i1 && - (OrigIdx != 0 || !Vec.isUndef())) { + if (SubVecVT.getVectorElementType() == MVT::i1) { if (VecVT.getVectorMinNumElements() >= 8 && SubVecVT.getVectorMinNumElements() >= 8) { assert(OrigIdx % 8 == 0 && "Invalid index"); @@ -10196,8 +10198,6 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, // vector group up the full amount. const auto VLen = Subtarget.getRealVLen(); if (SubVecVT.isFixedLengthVector() && !VLen) { - if (OrigIdx == 0 && Vec.isUndef() && !VecVT.isFixedLengthVector()) - return Op; MVT ContainerVT = VecVT; if (VecVT.isFixedLengthVector()) { ContainerVT = getContainerForFixedLengthVector(VecVT); @@ -10208,11 +10208,6 @@ SDValue RISCVTargetLowering::lowerINSERT_SUBVECTOR(SDValue Op, DAG.getUNDEF(ContainerVT), SubVec, DAG.getVectorIdxConstant(0, DL)); - if (OrigIdx == 0 && Vec.isUndef() && VecVT.isFixedLengthVector()) { - SubVec = convertFromScalableVector(VecVT, SubVec, DAG, Subtarget); - return DAG.getBitcast(Op.getValueType(), SubVec); - } - SDValue Mask = getDefaultVLOps(VecVT, ContainerVT, DL, DAG, Subtarget).first; // Set the vector length to only the number of elements we care about. 
Note From c6bf59f26b2d74474a66182db6ebd576273bfb00 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Tue, 24 Sep 2024 17:45:10 +0100 Subject: [PATCH 37/49] [X86] Add test coverage for #109272 --- .../X86/vector-shuffle-combining-avx512vbmi.ll | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll index 9b32005927ace7..61814b48e6b3a3 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512vbmi.ll @@ -146,3 +146,18 @@ define <64 x i8> @combine_permi2q_pshufb_as_permi2d_mask(<8 x i64> %a0, <8 x i64 %res2 = call <64 x i8> @llvm.x86.avx512.mask.pshuf.b.512(<64 x i8> %res1, <64 x i8> , <64 x i8> zeroinitializer, i64 %m) ret <64 x i8> %res2 } + +; PR109272 +define <64 x i8> @combine_vpermi2var_v64i8_with_mask(<64 x i8> %a0, <64 x i8> %a1, <64 x i8> %a2) { +; CHECK-LABEL: combine_vpermi2var_v64i8_with_mask: +; CHECK: # %bb.0: +; CHECK-NEXT: vpermt2b %zmm2, %zmm1, %zmm0 +; CHECK-NEXT: vpmovb2m %zmm1, %k0 +; CHECK-NEXT: vpmovm2b %k0, %zmm1 +; CHECK-NEXT: vpandnq %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: ret{{[l|q]}} + %perm = tail call <64 x i8> @llvm.x86.avx512.vpermi2var.qi.512(<64 x i8> %a0, <64 x i8> %a1, <64 x i8> %a2) + %cmp = icmp slt <64 x i8> %a1, zeroinitializer + %sel = select <64 x i1> %cmp, <64 x i8> zeroinitializer, <64 x i8> %perm + ret <64 x i8> %sel +} From 04b443e77845cd20ab5acc4356cee509316135dd Mon Sep 17 00:00:00 2001 From: jimingham Date: Tue, 24 Sep 2024 10:00:00 -0700 Subject: [PATCH 38/49] Add the ability to define custom completers to the parsed_cmd template. (#109062) If your arguments or option values are of a type that naturally uses one of our common completion mechanisms, you will get completion for free. But if you have your own custom values or if you want to do fancy things like have `break set -s foo.dylib -n ba` only complete on symbols in foo.dylib, you can use this new mechanism to achieve that. 
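As a rough, hypothetical sketch of what that looks like (the LookupCommand class, its
--shlib option, and the find_symbols() helper below are made up for illustration, and the
import path is assumed from how the parsed_cmd template is installed):

```python
import lldb
from lldb.plugins.parsed_cmd import ParsedCommand  # assumed install path of the template

class LookupCommand(ParsedCommand):
    def setup_command_definition(self):
        ov_parser = self.get_parser()
        ov_parser.add_option(
            "s", "shlib", "Only consider symbols from this shared library.",
            dest="shlib", default=None, value_type=lldb.eArgTypeShlibName)
        ov_parser.add_argument_set(
            [ov_parser.make_argument_element(lldb.eArgTypeSymbol, "plain")])

    def handle_argument_completion(self, args, arg_pos, cursor_pos):
        # Options parsed before the cursor are already set, so the --shlib
        # value (if any) can be used to narrow the candidate symbols.
        prefix = args[arg_pos][0:cursor_pos]
        names = find_symbols(prefix, self.get_parser().shlib)  # hypothetical lookup helper
        if not names:
            return True  # handled, nothing to offer
        return {"values": names}

    def __call__(self, debugger, args_array, exe_ctx, result):
        result.AppendMessage("would look up %d symbol(s)" % args_array.GetSize())

    def get_short_help(self):
        return "Example command whose argument completion honors --shlib."
```

Returning True means the completion was handled with no matches; returning a
{"values": [...]} dictionary hands the candidate list back to the command interpreter
(see the docs and tests below for the full protocol, including the
{"completion": ..., "mode": ...} form).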
--- lldb/bindings/python/python-wrapper.swig | 73 +++++++ lldb/docs/use/python-reference.rst | 185 ++++++++++++++++- lldb/examples/python/cmdtemplate.py | 15 +- lldb/examples/python/templates/parsed_cmd.py | 97 ++++++++- .../lldb/Interpreter/ScriptInterpreter.h | 14 ++ lldb/include/lldb/Utility/CompletionRequest.h | 2 + .../source/Commands/CommandObjectCommands.cpp | 191 ++++++++++++++++++ lldb/source/Interpreter/Options.cpp | 5 +- .../Python/SWIGPythonBridge.h | 9 + .../Python/ScriptInterpreterPython.cpp | 40 ++++ .../Python/ScriptInterpreterPythonImpl.h | 8 + .../script/add/TestAddParsedCommand.py | 132 ++++++++---- .../command/script/add/test_commands.py | 69 +++++-- .../Python/PythonTestSuite.cpp | 13 ++ 14 files changed, 771 insertions(+), 82 deletions(-) diff --git a/lldb/bindings/python/python-wrapper.swig b/lldb/bindings/python/python-wrapper.swig index 961fb2d1a76178..b72a462d04643b 100644 --- a/lldb/bindings/python/python-wrapper.swig +++ b/lldb/bindings/python/python-wrapper.swig @@ -667,6 +667,79 @@ lldb_private::python::SWIGBridge::LLDBSwigPythonGetRepeatCommandForScriptedComma return result.Str().GetString().str(); } +StructuredData::DictionarySP +lldb_private::python::SWIGBridge::LLDBSwigPythonHandleArgumentCompletionForScriptedCommand(PyObject *implementor, + std::vector &args_vec, size_t args_pos, size_t pos_in_arg) { + + PyErr_Cleaner py_err_cleaner(true); + + PythonObject self(PyRefType::Borrowed, implementor); + auto pfunc = self.ResolveName("handle_argument_completion"); + // If this isn't implemented, return an empty dict to signal falling back to default completion: + if (!pfunc.IsAllocated()) + return {}; + + PythonList args_list(PyInitialValue::Empty); + for (auto elem : args_vec) + args_list.AppendItem(PythonString(elem)); + + PythonObject result = pfunc(args_list, PythonInteger(args_pos), PythonInteger(pos_in_arg)); + // Returning None means do the ordinary completion + if (result.IsNone()) + return {}; + + // Convert the return dictionary to a DictionarySP. + StructuredData::ObjectSP result_obj_sp = result.CreateStructuredObject(); + if (!result_obj_sp) + return {}; + + StructuredData::DictionarySP dict_sp(new StructuredData::Dictionary(result_obj_sp)); + if (dict_sp->GetType() == lldb::eStructuredDataTypeInvalid) + return {}; + return dict_sp; +} + +StructuredData::DictionarySP +lldb_private::python::SWIGBridge::LLDBSwigPythonHandleOptionArgumentCompletionForScriptedCommand(PyObject *implementor, + llvm::StringRef &long_option, size_t pos_in_arg) { + + PyErr_Cleaner py_err_cleaner(true); + + PythonObject self(PyRefType::Borrowed, implementor); + auto pfunc = self.ResolveName("handle_option_argument_completion"); + // If this isn't implemented, return an empty dict to signal falling back to default completion: + if (!pfunc.IsAllocated()) + return {}; + + PythonObject result = pfunc(PythonString(long_option), PythonInteger(pos_in_arg)); + // Returning None means do the ordinary completion + if (result.IsNone()) + return {}; + + // Returning a boolean: + // True means the completion was handled, but there were no completions + // False means that the completion was not handled, again, do the ordinary completion: + if (result.GetObjectType() == PyObjectType::Boolean) { + if (!result.IsTrue()) + return {}; + // Make up a completion dictionary with the right element: + StructuredData::DictionarySP dict_sp(new StructuredData::Dictionary()); + dict_sp->AddBooleanItem("no-completion", true); + return dict_sp; + } + + + // Convert the return dictionary to a DictionarySP. 
+ StructuredData::ObjectSP result_obj_sp = result.CreateStructuredObject(); + if (!result_obj_sp) + return {}; + + StructuredData::DictionarySP dict_sp(new StructuredData::Dictionary(result_obj_sp)); + if (dict_sp->GetType() == lldb::eStructuredDataTypeInvalid) + return {}; + return dict_sp; +} + #include "lldb/Interpreter/CommandReturnObject.h" bool lldb_private::python::SWIGBridge::LLDBSwigPythonCallParsedCommandObject( diff --git a/lldb/docs/use/python-reference.rst b/lldb/docs/use/python-reference.rst index b12048f1af067d..95a6020ca3e455 100644 --- a/lldb/docs/use/python-reference.rst +++ b/lldb/docs/use/python-reference.rst @@ -551,7 +551,7 @@ command definition form can't do the right thing. Since lldb 3.7, Python commands can also be implemented by means of a class which should implement the following interface: -:: +.. code-block:: python class CommandObjectType: def __init__(self, debugger, internal_dict): @@ -586,20 +586,193 @@ which should implement the following interface: As a convenience, you can treat the result object as a Python file object, and say -:: +.. code-block:: python print >>result, "my command does lots of cool stuff" SBCommandReturnObject and SBStream both support this file-like behavior by providing write() and flush() calls at the Python layer. +The commands that are added using this class definition are what lldb calls +"raw" commands. The command interpreter doesn't attempt to parse the command, +doesn't handle option values, neither generating help for them, or their +completion. Raw commands are useful when the arguments passed to the command +are unstructured, and having to protect them against lldb command parsing would +be onerous. For instance, "expr" is a raw command. + +You can also add scripted commands that implement the "parsed command", where +the options and their types are specified, as well as the argument and argument +types. These commands look and act like the majority of lldb commands, and you +can also add custom completions for the options and/or the arguments if you have +special needs. + +The easiest way to do this is to derive your new command from the lldb.ParsedCommand +class. That responds in the same way to the help & repeat command interfaces, and +provides some convenience methods, and most importantly an LLDBOptionValueParser, +accessed throught lldb.ParsedCommand.get_parser(). The parser is used to set +your command definitions, and to retrieve option values in the __call__ method. + +To set up the command definition, implement the ParsedCommand abstract method: + +.. code-block:: python + + def setup_command_definition(self): + +This is called when your command is added to lldb. In this method you add the +options and their types, the option help strings, etc. to the command using the API: + +.. code-block:: python + + def add_option(self, short_option, long_option, help, default, + dest = None, required=False, groups = None, + value_type=lldb.eArgTypeNone, completion_type=None, + enum_values=None): + """ + short_option: one character, must be unique, not required + long_option: no spaces, must be unique, required + help: a usage string for this option, will print in the command help + default: the initial value for this option (if it has a value) + dest: the name of the property that gives you access to the value for + this value. Defaults to the long option if not provided. + required: if true, this option must be provided or the command will error out + groups: Which "option groups" does this option belong to. 
This can either be + a simple list (e.g. [1, 3, 4, 5]) or you can specify ranges by sublists: + so [1, [3,5]] is the same as [1, 3, 4, 5]. + value_type: one of the lldb.eArgType enum values. Some of the common arg + types also have default completers, which will be applied automatically. + completion_type: currently these are values form the lldb.CompletionType enum. If + you need custom completions, implement handle_option_argument_completion. + enum_values: An array of duples: ["element_name", "element_help"]. If provided, + only one of the enum elements is allowed. The value will be the + element_name for the chosen enum element as a string. + """ + +Similarly, you can add argument types to the command: + +.. code-block:: python + + def make_argument_element(self, arg_type, repeat = "optional", groups = None): + """ + arg_type: The argument type, one of the lldb.eArgType enum values. + repeat: Choose from the following options: + "plain" - one value + "optional" - zero or more values + "plus" - one or more values + groups: As with add_option. + """ + +Then implement the body of the command by defining: + +.. code-block:: python + + def __call__(self, debugger, args_array, exe_ctx, result): + """This is the command callback. The option values are + provided by the 'dest' properties on the parser. + + args_array: This is the list of arguments provided. + exe_ctx: Gives the SBExecutionContext on which the + command should operate. + result: Any results of the command should be + written into this SBCommandReturnObject. + """ + +This differs from the "raw" command's __call__ in that the arguments are already +parsed into the args_array, and the option values are set in the parser, and +can be accessed using their property name. The LLDBOptionValueParser class has +a couple of other handy methods: + +.. code-block:: python + def was_set(self, long_option_name): + +returns True if the option was specified on the command line. + +.. code-block:: python + + def dest_for_option(self, long_option_name): + """ + This will return the value of the dest variable you defined for opt_name. + Mostly useful for handle_completion where you get passed the long option. + """ + +lldb will handle completing your option names, and all your enum values +automatically. If your option or argument types have associated built-in completers, +then lldb will also handle that completion for you. But if you have a need for +custom completions, either in your arguments or option values, you can handle +completion by hand as well. To handle completion of option value arguments, +your lldb.ParsedCommand subclass should implement: + +.. code-block:: python + + def handle_option_argument_completion(self, long_option, cursor_pos): + """ + long_option: The long option name of the option whose value you are + asked to complete. + cursor_pos: The cursor position in the value for that option - which + you can get from the option parser. + """ + +And to handle the completion of arguments: + +.. code-block:: python + + def handle_argument_completion(self, args, arg_pos, cursor_pos): + """ + args: A list of the arguments to the command + arg_pos: An index into the args list of the argument with the cursor + cursor_pos: The cursor position in the arg specified by arg_pos + """ + +When either of these API's is called, the command line will have been parsed up to +the word containing the cursor, and any option values set in that part of the command +string are available from the option value parser. 
That's useful for instance
+if you have a --shared-library option that would constrain the completions for,
+say, a symbol name option or argument.
+
+The return value specifies what the completion options are. You have four
+choices:
+
+- `True`: the completion was handled with no completions.
+
+- `False`: the completion was not handled, forward it to the regular
+completion machinery.
+
+- A dictionary with the key: "completion": there is one candidate,
+whose value is the value of the "completion" key. Optionally you can pass a
+"mode" key whose value is either "partial" or "complete". Return partial if
+the "completion" string is a common prefix of all the completions.
+
+For instance, if the string you are completing is "Test" and the available completions are:
+"Test1", "Test11" and "Test111", you should return the dictionary:
+
+.. code-block:: python
+
+    return {"completion": "Test1", "mode" : "partial"}
+
+and then lldb will add the "1" at the cursor and advance it after the added string,
+waiting for more completions. But if "Test1" is the only completion, return:
+
+.. code-block:: python
+
+    {"completion": "Test1", "mode": "complete"}
+
+and lldb will add "1 " at the cursor, indicating the command string is complete.
+
+The default is "complete"; you don't need to specify a "mode" in that case.
+
+- A dictionary with the key: "values" whose value is a list of candidate completion
+strings. The command interpreter will present those strings as the available choices.
+You can optionally include a "descriptions" key, whose value is a parallel array
+of description strings, and the completion will show the description next to
+each completion.
+
+
 One other handy convenience when defining lldb command-line commands is the
-command command script import which will import a module specified by file
+command "command script import" which will import a module specified by file
 path, so you don't have to change your PYTHONPATH for temporary scripts. It
 also has another convenience that if your new script module has a function
 of the form:
 
-::
+.. code-block:: python
 
     def __lldb_init_module(debugger, internal_dict):
         # Command Initialization code goes here
@@ -615,7 +788,7 @@ creating scripts that can be run from the command line. However, for command
 line scripts, the debugger instance must be created manually. Sample code would
 look like:
 
-::
+.. code-block:: python
 
     if __name__ == '__main__':
         # Initialize the debugger before making any API calls.
@@ -638,7 +811,7 @@ look like:
 Now we can create a module called ls.py in the file ~/ls.py that will implement
 a function that can be used by LLDB's python command code:
 
-::
+.. 
code-block:: python #!/usr/bin/env python diff --git a/lldb/examples/python/cmdtemplate.py b/lldb/examples/python/cmdtemplate.py index b6a21cba7113e6..a9fbe0b40e1957 100644 --- a/lldb/examples/python/cmdtemplate.py +++ b/lldb/examples/python/cmdtemplate.py @@ -29,8 +29,8 @@ def get_flags(self): return lldb.eCommandRequiresFrame | lldb.eCommandProcessMustBePaused def setup_command_definition(self): - - self.ov_parser.add_option( + ov_parser = self.get_parser() + ov_parser.add_option( "i", "in-scope", help = "in_scope_only = True", @@ -39,7 +39,7 @@ def setup_command_definition(self): default = True, ) - self.ov_parser.add_option( + ov_parser.add_option( "i", "in-scope", help = "in_scope_only = True", @@ -48,7 +48,7 @@ def setup_command_definition(self): default=True, ) - self.ov_parser.add_option( + ov_parser.add_option( "a", "arguments", help = "arguments = True", @@ -57,7 +57,7 @@ def setup_command_definition(self): default = True, ) - self.ov_parser.add_option( + ov_parser.add_option( "l", "locals", help = "locals = True", @@ -66,7 +66,7 @@ def setup_command_definition(self): default = True, ) - self.ov_parser.add_option( + ov_parser.add_option( "s", "statics", help = "statics = True", @@ -103,8 +103,9 @@ def __call__(self, debugger, command, exe_ctx, result): result.SetError("invalid frame") return + ov_parser = self.get_parser() variables_list = frame.GetVariables( - self.ov_parser.arguments, self.ov_parser.locals, self.ov_parser.statics, self.ov_parser.inscope + ov_parser.arguments, ov_parser.locals, ov_parser.statics, ov_parser.inscope ) variables_count = variables_list.GetSize() if variables_count == 0: diff --git a/lldb/examples/python/templates/parsed_cmd.py b/lldb/examples/python/templates/parsed_cmd.py index 06124adf43420a..13d6eae405c08d 100644 --- a/lldb/examples/python/templates/parsed_cmd.py +++ b/lldb/examples/python/templates/parsed_cmd.py @@ -4,7 +4,8 @@ The way to use it is to make a class for your command that inherits from ParsedCommandBase. That will make an LLDBOptionValueParser which you will use for your option definition, and to fetch option values for the current invocation -of your command. Access to the OV parser is through: +of your command. For concision, I'll call this the `OVParser`. +Access to the `OVParser` is through: ParsedCommandBase.get_parser() @@ -43,7 +44,65 @@ def __call__(self, debugger, args_list, exe_ctx, result): will return True if the user set this option, and False if it was left at its default value. -There are example commands in the lldb testsuite at: +Custom Completions: + +You can also implement custom completers for your custom command, either for the +arguments to your command or to the option values in your command. If you use enum +values or if your option/argument uses is one of the types we have completers for, +you should not need to do this. But if you have your own completeable types, or if +you want completion of one option to be conditioned by other options on the command +line, you can use this interface to take over the completion. + +You can choose to add a completion for the option values defined for your command, +or for the arguments, separately. For the option values, define: + +def handle_option_argument_completion(self, long_option, cursor_pos): + +The line to be completed will be parsed up to the option containint the cursor position, +and the values will be set in the OptionValue parser object. 
long_option will be
+the option name containing the cursor, and cursor_pos will be the position of the cursor
+in that option's value. You can call the `OVParser` method: `dest_for_option(long_option)`
+to get the value for that option. The other options that came before the cursor in the command
+line will also be set in the `OVParser` when the completion handler is called.
+
+For argument values, define:
+
+def handle_argument_completion(self, args, arg_pos, cursor_pos):
+
+Again, the command line will be parsed up to the cursor position, and all the options
+before the cursor position will be set in the `OVParser`. args is a Python list of the
+arguments, arg_pos is the index of the argument with the cursor, and cursor_pos is
+the position of the cursor in the argument.
+
+In both cases, the return value determines the completion.
+
+Return False to mean "Not Handled" - in which case lldb will fall back on the
+standard completion machinery.
+
+Return True to mean "Handled with no completions".
+
+If there is a single unique completion, return a Python dictionary with two elements:
+
+return {"completion" : "completed_value", "mode" : <"partial", "complete">}
+
+If the mode is "partial", then the completion is to a common base; if it is "complete"
+then the argument is considered done - mostly meaning lldb will put a space after the
+completion string. "complete" is the default if no "mode" is specified.
+
+If there are multiple completion options, then return:
+
+return {"values" : ["option1", "option2"]}
+
+Optionally, you can return a parallel array of "descriptions" which the completer will
+print alongside the options:
+
+return {"values" : ["option1", "option2"], "descriptions" : ["the first option", "the second option"]}
+
+The cmdtemplate example currently uses the parsed command infrastructure:
+
+llvm-project/lldb/examples/python/cmdtemplate.py
+
+There are also a few example commands in the lldb testsuite at:
 
 llvm-project/lldb/test/API/commands/command/script/add/test_commands.py
 """
@@ -226,10 +285,14 @@ def set_option_value(self, exe_ctx, opt_name, opt_value):
         return True
 
     def was_set(self, opt_name):
-        """ Call this in the __call__ method of your command to determine
-        whether this option was set on the command line. It is sometimes
-        useful to know whether an option has the default value because the
-        user set it explicitly (was_set -> True) or not. """
+        """Call this in the __call__ method of your command to determine
+        whether this option was set on the command line. It is sometimes
+        useful to know whether an option has the default value because the
+        user set it explicitly (was_set -> True) or not.
+        You can also call this in a handle_completion method, but it will
+        currently only report true values for the options mentioned
+        BEFORE the cursor point in the command line.
+        """
 
         elem = self.get_option_element(opt_name)
         if not elem:
@@ -239,6 +302,16 @@ def was_set(self, opt_name):
         except AttributeError:
             return False
 
+    def dest_for_option(self, opt_name):
+        """This will return the value of the dest variable you defined for opt_name.
+        Mostly useful for handle_completion where you get passed the long option. 
+ """ + elem = self.get_option_element(opt_name) + if not elem: + return None + value = self.__dict__[elem["dest"]] + return value + def add_option(self, short_option, long_option, help, default, dest = None, required=False, groups = None, value_type=lldb.eArgTypeNone, completion_type=None, @@ -251,14 +324,16 @@ def add_option(self, short_option, long_option, help, default, dest: the name of the property that gives you access to the value for this value. Defaults to the long option if not provided. required: if true, this option must be provided or the command will error out - groups: Which "option groups" does this option belong to + groups: Which "option groups" does this option belong to. This can either be + a simple list (e.g. [1, 3, 4, 5]) or you can specify ranges by sublists: + so [1, [3,5]] is the same as [1, 3, 4, 5]. value_type: one of the lldb.eArgType enum values. Some of the common arg types also have default completers, which will be applied automatically. - completion_type: currently these are values form the lldb.CompletionType enum, I - haven't done custom completions yet. + completion_type: currently these are values form the lldb.CompletionType enum. If + you need custom completions, implement handle_option_argument_completion. enum_values: An array of duples: ["element_name", "element_help"]. If provided, - only one of the enum elements is allowed. The value will be the - element_name for the chosen enum element as a string. + only one of the enum elements is allowed. The value will be the + element_name for the chosen enum element as a string. """ if not dest: dest = long_option diff --git a/lldb/include/lldb/Interpreter/ScriptInterpreter.h b/lldb/include/lldb/Interpreter/ScriptInterpreter.h index 901ecf3012d51d..2c2bd6f232e094 100644 --- a/lldb/include/lldb/Interpreter/ScriptInterpreter.h +++ b/lldb/include/lldb/Interpreter/ScriptInterpreter.h @@ -420,6 +420,20 @@ class ScriptInterpreter : public PluginInterface { return std::nullopt; } + virtual StructuredData::DictionarySP + HandleArgumentCompletionForScriptedCommand( + StructuredData::GenericSP impl_obj_sp, std::vector &args, + size_t args_pos, size_t char_in_arg) { + return {}; + } + + virtual StructuredData::DictionarySP + HandleOptionArgumentCompletionForScriptedCommand( + StructuredData::GenericSP impl_obj_sp, llvm::StringRef &long_name, + size_t char_in_arg) { + return {}; + } + virtual bool RunScriptFormatKeyword(const char *impl_function, Process *process, std::string &output, Status &error) { diff --git a/lldb/include/lldb/Utility/CompletionRequest.h b/lldb/include/lldb/Utility/CompletionRequest.h index 1a2b1d639950fc..650158a197dbd9 100644 --- a/lldb/include/lldb/Utility/CompletionRequest.h +++ b/lldb/include/lldb/Utility/CompletionRequest.h @@ -139,6 +139,8 @@ class CompletionRequest { return GetParsedLine()[GetCursorIndex()]; } + size_t GetCursorCharPos() const { return m_cursor_char_position; } + /// Drops the first argument from the argument list. 
void ShiftArguments() { m_cursor_index--; diff --git a/lldb/source/Commands/CommandObjectCommands.cpp b/lldb/source/Commands/CommandObjectCommands.cpp index e3291640fa9352..845b89a75b7b39 100644 --- a/lldb/source/Commands/CommandObjectCommands.cpp +++ b/lldb/source/Commands/CommandObjectCommands.cpp @@ -1637,6 +1637,129 @@ class CommandObjectScriptingObjectParsed : public CommandObjectParsed { size_t GetNumOptions() { return m_num_options; } + void PrepareOptionsForCompletion(CompletionRequest &request, + OptionElementVector &option_vec, + ExecutionContext *exe_ctx) { + // I'm not sure if we'll get into trouble doing an option parsing start + // and end in this context. If so, then I'll have to directly tell the + // scripter to do this. + OptionParsingStarting(exe_ctx); + auto opt_defs = GetDefinitions(); + + // Iterate through the options we found so far, and push them into + // the scripted side. + for (auto option_elem : option_vec) { + int cur_defs_index = option_elem.opt_defs_index; + // If we don't recognize this option we can't set it. + if (cur_defs_index == OptionArgElement::eUnrecognizedArg || + cur_defs_index == OptionArgElement::eBareDash || + cur_defs_index == OptionArgElement::eBareDoubleDash) + continue; + bool option_has_arg = opt_defs[cur_defs_index].option_has_arg; + llvm::StringRef cur_arg_value; + if (option_has_arg) { + int cur_arg_pos = option_elem.opt_arg_pos; + if (cur_arg_pos != OptionArgElement::eUnrecognizedArg && + cur_arg_pos != OptionArgElement::eBareDash && + cur_arg_pos != OptionArgElement::eBareDoubleDash) { + cur_arg_value = + request.GetParsedLine().GetArgumentAtIndex(cur_arg_pos); + } + } + SetOptionValue(cur_defs_index, cur_arg_value, exe_ctx); + } + OptionParsingFinished(exe_ctx); + } + + void + ProcessCompletionDict(CompletionRequest &request, + StructuredData::DictionarySP &completion_dict_sp) { + // We don't know how to process an empty completion dict, our callers have + // to do that. + assert(completion_dict_sp && "Must have valid completion dict"); + // First handle the case of a single completion: + llvm::StringRef completion; + // If the dictionary has one element "no-completion" then we return here + if (completion_dict_sp->GetValueForKeyAsString("no-completion", + completion)) + return; + + if (completion_dict_sp->GetValueForKeyAsString("completion", + completion)) { + llvm::StringRef mode_str; + CompletionMode mode = CompletionMode::Normal; + if (completion_dict_sp->GetValueForKeyAsString("mode", mode_str)) { + if (mode_str == "complete") + mode = CompletionMode::Normal; + else if (mode_str == "partial") + mode = CompletionMode::Partial; + else { + // FIXME - how do I report errors here? + return; + } + } + request.AddCompletion(completion, "", mode); + return; + } + // The completions are required, the descriptions are not: + StructuredData::Array *completions; + StructuredData::Array *descriptions; + if (completion_dict_sp->GetValueForKeyAsArray("values", completions)) { + completion_dict_sp->GetValueForKeyAsArray("descriptions", descriptions); + size_t num_completions = completions->GetSize(); + for (size_t idx = 0; idx < num_completions; idx++) { + auto val = completions->GetItemAtIndexAsString(idx); + if (!val) + // FIXME: How do I report this error? + return; + + if (descriptions) { + auto desc = descriptions->GetItemAtIndexAsString(idx); + request.AddCompletion(*val, desc ? 
*desc : ""); + } else + request.AddCompletion(*val); + } + } + } + + void + HandleOptionArgumentCompletion(lldb_private::CompletionRequest &request, + OptionElementVector &option_vec, + int opt_element_index, + CommandInterpreter &interpreter) override { + ScriptInterpreter *scripter = + interpreter.GetDebugger().GetScriptInterpreter(); + + if (!scripter) + return; + + ExecutionContext exe_ctx = interpreter.GetExecutionContext(); + PrepareOptionsForCompletion(request, option_vec, &exe_ctx); + + auto defs = GetDefinitions(); + + size_t defs_index = option_vec[opt_element_index].opt_defs_index; + llvm::StringRef option_name = defs[defs_index].long_option; + bool is_enum = defs[defs_index].enum_values.size() != 0; + if (option_name.empty()) + return; + // If this is an enum, we don't call the custom completer, just let the + // regular option completer handle that: + StructuredData::DictionarySP completion_dict_sp; + if (!is_enum) + completion_dict_sp = + scripter->HandleOptionArgumentCompletionForScriptedCommand( + m_cmd_obj_sp, option_name, request.GetCursorCharPos()); + + if (!completion_dict_sp) { + Options::HandleOptionArgumentCompletion(request, option_vec, + opt_element_index, interpreter); + return; + } + + ProcessCompletionDict(request, completion_dict_sp); + } + private: struct EnumValueStorage { EnumValueStorage() { @@ -1878,6 +2001,74 @@ class CommandObjectScriptingObjectParsed : public CommandObjectParsed { Status GetArgsError() { return m_args_error.Clone(); } bool WantsCompletion() override { return true; } +private: + void PrepareOptionsForCompletion(CompletionRequest &request, + OptionElementVector &option_vec) { + // First, we have to tell the Scripted side to set the values in its + // option store, then we call into the handle_completion passing in + // an array of the args, the arg index and the cursor position in the arg. + // We want the script side to have a chance to clear its state, so tell + // it argument parsing has started: + Options *options = GetOptions(); + // If there are not options, this will be nullptr, and in that case we + // can just skip setting the options on the scripted side: + if (options) + m_options.PrepareOptionsForCompletion(request, option_vec, &m_exe_ctx); + } + +public: + void HandleArgumentCompletion(CompletionRequest &request, + OptionElementVector &option_vec) override { + ScriptInterpreter *scripter = GetDebugger().GetScriptInterpreter(); + + if (!scripter) + return; + + // Set up the options values on the scripted side: + PrepareOptionsForCompletion(request, option_vec); + + // Now we have to make up the argument list. 
+ // The ParseForCompletion only identifies tokens in the m_parsed_line + // it doesn't remove the options leaving only the args as it does for + // the regular Parse, so we have to filter out the option ones using the + // option_element_vector: + + Options *options = GetOptions(); + auto defs = options->GetDefinitions(); + + std::unordered_set option_slots; + for (const auto &elem : option_vec) { + if (elem.opt_defs_index == -1) + continue; + option_slots.insert(elem.opt_pos); + if (defs[elem.opt_defs_index].option_has_arg) + option_slots.insert(elem.opt_arg_pos); + } + + std::vector args_vec; + Args &args = request.GetParsedLine(); + size_t num_args = args.GetArgumentCount(); + size_t cursor_idx = request.GetCursorIndex(); + size_t args_elem_pos = cursor_idx; + + for (size_t idx = 0; idx < num_args; idx++) { + if (option_slots.count(idx) == 0) + args_vec.push_back(args[idx].ref()); + else if (idx < cursor_idx) + args_elem_pos--; + } + StructuredData::DictionarySP completion_dict_sp = + scripter->HandleArgumentCompletionForScriptedCommand( + m_cmd_obj_sp, args_vec, args_elem_pos, request.GetCursorCharPos()); + + if (!completion_dict_sp) { + CommandObject::HandleArgumentCompletion(request, option_vec); + return; + } + + m_options.ProcessCompletionDict(request, completion_dict_sp); + } + bool IsRemovable() const override { return true; } ScriptedCommandSynchronicity GetSynchronicity() { return m_synchro; } diff --git a/lldb/source/Interpreter/Options.cpp b/lldb/source/Interpreter/Options.cpp index b8a3f68a49b1cf..3888a5812628cd 100644 --- a/lldb/source/Interpreter/Options.cpp +++ b/lldb/source/Interpreter/Options.cpp @@ -661,7 +661,9 @@ bool Options::HandleOptionCompletion(CompletionRequest &request, } else if (opt_arg_pos == request.GetCursorIndex()) { // Okay the cursor is on the completion of an argument. See if it has a - // completion, otherwise return no matches. + // completion, otherwise return no matches. Note, opt_defs_index == -1 + // means we're after an option, but that option doesn't exist. We'll + // end up treating that as an argument. Not sure we can do much better. 
if (opt_defs_index != -1) { HandleOptionArgumentCompletion(request, opt_element_vector, i, interpreter); @@ -688,7 +690,6 @@ void Options::HandleOptionArgumentCompletion( int opt_defs_index = opt_element_vector[opt_element_index].opt_defs_index; // See if this is an enumeration type option, and if so complete it here: - const auto &enum_values = opt_defs[opt_defs_index].enum_values; if (!enum_values.empty()) for (const auto &enum_value : enum_values) diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h index 81ee9ea0a2fa10..518a478af5f6a8 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/SWIGPythonBridge.h @@ -200,6 +200,15 @@ class SWIGBridge { LLDBSwigPythonGetRepeatCommandForScriptedCommand(PyObject *implementor, std::string &command); + static StructuredData::DictionarySP + LLDBSwigPythonHandleArgumentCompletionForScriptedCommand( + PyObject *implementor, std::vector &args_impl, + size_t args_pos, size_t pos_in_arg); + + static StructuredData::DictionarySP + LLDBSwigPythonHandleOptionArgumentCompletionForScriptedCommand( + PyObject *implementor, llvm::StringRef &long_option, size_t pos_in_arg); + static bool LLDBSwigPythonCallModuleInit(const char *python_module_name, const char *session_dictionary_name, lldb::DebuggerSP debugger); diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp index 155efc06eaf41a..db1a10e73a66ab 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp +++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPython.cpp @@ -2720,6 +2720,46 @@ ScriptInterpreterPythonImpl::GetRepeatCommandForScriptedCommand( return ret_val; } +StructuredData::DictionarySP +ScriptInterpreterPythonImpl::HandleArgumentCompletionForScriptedCommand( + StructuredData::GenericSP impl_obj_sp, std::vector &args, + size_t args_pos, size_t char_in_arg) { + StructuredData::DictionarySP completion_dict_sp; + if (!impl_obj_sp || !impl_obj_sp->IsValid()) + return completion_dict_sp; + + { + Locker py_lock(this, Locker::AcquireLock | Locker::NoSTDIN, + Locker::FreeLock); + + completion_dict_sp = + SWIGBridge::LLDBSwigPythonHandleArgumentCompletionForScriptedCommand( + static_cast(impl_obj_sp->GetValue()), args, args_pos, + char_in_arg); + } + return completion_dict_sp; +} + +StructuredData::DictionarySP +ScriptInterpreterPythonImpl::HandleOptionArgumentCompletionForScriptedCommand( + StructuredData::GenericSP impl_obj_sp, llvm::StringRef &long_option, + size_t char_in_arg) { + StructuredData::DictionarySP completion_dict_sp; + if (!impl_obj_sp || !impl_obj_sp->IsValid()) + return completion_dict_sp; + + { + Locker py_lock(this, Locker::AcquireLock | Locker::NoSTDIN, + Locker::FreeLock); + + completion_dict_sp = SWIGBridge:: + LLDBSwigPythonHandleOptionArgumentCompletionForScriptedCommand( + static_cast(impl_obj_sp->GetValue()), long_option, + char_in_arg); + } + return completion_dict_sp; +} + /// In Python, a special attribute __doc__ contains the docstring for an object /// (function, method, class, ...) if any is defined Otherwise, the attribute's /// value is None. 
diff --git a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h index d15e2fd76f683b..2dc784777151bb 100644 --- a/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h +++ b/lldb/source/Plugins/ScriptInterpreter/Python/ScriptInterpreterPythonImpl.h @@ -166,6 +166,14 @@ class ScriptInterpreterPythonImpl : public ScriptInterpreterPython { GetRepeatCommandForScriptedCommand(StructuredData::GenericSP impl_obj_sp, Args &args) override; + StructuredData::DictionarySP HandleArgumentCompletionForScriptedCommand( + StructuredData::GenericSP impl_obj_sp, std::vector &args, + size_t args_pos, size_t char_in_arg) override; + + StructuredData::DictionarySP HandleOptionArgumentCompletionForScriptedCommand( + StructuredData::GenericSP impl_obj_sp, llvm::StringRef &long_options, + size_t char_in_arg) override; + Status GenerateFunction(const char *signature, const StringList &input, bool is_callback) override; diff --git a/lldb/test/API/commands/command/script/add/TestAddParsedCommand.py b/lldb/test/API/commands/command/script/add/TestAddParsedCommand.py index c7680e9bb7f418..6fac1eba919bc9 100644 --- a/lldb/test/API/commands/command/script/add/TestAddParsedCommand.py +++ b/lldb/test/API/commands/command/script/add/TestAddParsedCommand.py @@ -68,6 +68,57 @@ def run_one_repeat(self, commands, expected_num_errors): return results + def handle_completion( + self, + cmd_str, + exp_num_completions, + exp_matches, + exp_descriptions, + match_description, + ): + matches = lldb.SBStringList() + descriptions = lldb.SBStringList() + + interp = self.dbg.GetCommandInterpreter() + num_completions = interp.HandleCompletionWithDescriptions( + cmd_str, len(cmd_str), 0, 1000, matches, descriptions + ) + self.assertEqual( + num_completions, exp_num_completions, "Number of completions is right." 
+ ) + num_matches = matches.GetSize() + self.assertEqual( + num_matches, + exp_matches.GetSize(), + "matches and expected matches of different lengths", + ) + num_descriptions = descriptions.GetSize() + if match_description: + self.assertEqual( + num_descriptions, + exp_descriptions.GetSize(), + "descriptions and expected of different lengths", + ) + + self.assertEqual( + matches.GetSize(), + num_completions + 1, + "The first element is the complete additional text", + ) + + for idx in range(0, num_matches): + match = matches.GetStringAtIndex(idx) + exp_match = exp_matches.GetStringAtIndex(idx) + self.assertEqual( + match, exp_match, f"{match} did not match expectation: {exp_match}" + ) + if match_description: + desc = descriptions.GetStringAtIndex(idx) + exp_desc = exp_descriptions.GetStringAtIndex(idx) + self.assertEqual( + desc, exp_desc, f"{desc} didn't match expectation: {exp_desc}" + ) + def pycmd_tests(self): source_dir = self.getSourceDir() test_file_path = os.path.join(source_dir, "test_commands.py") @@ -176,24 +227,10 @@ def cleanup(): descriptions = lldb.SBStringList() # First try an enum completion: - num_completions = interp.HandleCompletionWithDescriptions( - "no-args -e f", 12, 0, 1000, matches, descriptions - ) - self.assertEqual(num_completions, 1, "Only one completion for foo") - self.assertEqual( - matches.GetSize(), 2, "The first element is the complete additional text" - ) - self.assertEqual( - matches.GetStringAtIndex(0), "oo ", "And we got the right extra characters" - ) - self.assertEqual( - matches.GetStringAtIndex(1), "foo", "And we got the right match" - ) - self.assertEqual( - descriptions.GetSize(), 2, "descriptions matche the return length" - ) - # FIXME: we don't return descriptions for enum elements - # self.assertEqual(descriptions.GetStringAtIndex(1), "does foo things", "And we got the right description") + # Note - this is an enum so all the values are returned: + matches.AppendList(["oo ", "foo"], 2) + + self.handle_completion("no-args -e f", 1, matches, descriptions, False) # Now try an internal completer, the on disk file one is handy: partial_name = os.path.join(source_dir, "test_") @@ -201,24 +238,9 @@ def cleanup(): matches.Clear() descriptions.Clear() - num_completions = interp.HandleCompletionWithDescriptions( - cmd_str, len(cmd_str) - 1, 0, 1000, matches, descriptions - ) - self.assertEqual(num_completions, 1, "Only one completion for source file") - self.assertEqual(matches.GetSize(), 2, "The first element is the complete line") - self.assertEqual( - matches.GetStringAtIndex(0), - "commands.py' ", - "And we got the right extra characters", - ) - self.assertEqual( - matches.GetStringAtIndex(1), test_file_path, "And we got the right match" - ) - self.assertEqual( - descriptions.GetSize(), 2, "descriptions match the return length" - ) - # FIXME: we don't return descriptions for enum elements - # self.assertEqual(descriptions.GetStringAtIndex(1), "does foo things", "And we got the right description") + matches.AppendList(["commands.py' ", test_file_path], 2) + # We don't have descriptions for the file path completer: + self.handle_completion(cmd_str, 1, matches, descriptions, False) # Try a command with arguments. # FIXME: It should be enough to define an argument and it's type to get the completer @@ -231,6 +253,44 @@ def cleanup(): substrs=["0: First Argument", "1: Second Argument"], ) + # Now test custom completions - two-args has both option and arg completers. 
In both + # completers we return different values if the -p option is set, so we can test that too: + matches.Clear() + descriptions.Clear() + cmd_str = "two-args -p something -c other_" + matches.AppendString("something ") + matches.AppendString("other_something") + # This is a full match so no descriptions: + self.handle_completion(cmd_str, 1, matches, descriptions, False) + + matches.Clear() + descriptions.Clear() + cmd_str = "two-args -c other_" + matches.AppendList(["", "other_nice", "other_not_nice", "other_mediocre"], 4) + # The option doesn't return descriptions either: + self.handle_completion(cmd_str, 3, matches, descriptions, False) + + # Now try the argument - it says "no completions" if the proc_name was set: + matches.Clear() + descriptions.Clear() + cmd_str = "two-args -p something arg" + matches.AppendString("") + self.handle_completion(cmd_str, 0, matches, descriptions, False) + + cmd_str = "two-args arg_" + matches.Clear() + descriptions.Clear() + matches.AppendList(["", "arg_cool", "arg_yuck"], 3) + descriptions.AppendList(["", "good idea", "bad idea"], 3) + self.handle_completion(cmd_str, 2, matches, descriptions, True) + + # This one gets a single unique match: + cmd_str = "two-args correct_" + matches.Clear() + descriptions.Clear() + matches.AppendList(["answer ", "correct_answer"], 2) + self.handle_completion(cmd_str, 1, matches, descriptions, False) + # Now make sure get_repeat_command works properly: # no-args turns off auto-repeat diff --git a/lldb/test/API/commands/command/script/add/test_commands.py b/lldb/test/API/commands/command/script/add/test_commands.py index fcde6cd3ef6dc6..b15ea935c05867 100644 --- a/lldb/test/API/commands/command/script/add/test_commands.py +++ b/lldb/test/API/commands/command/script/add/test_commands.py @@ -18,7 +18,7 @@ def __call__(self, debugger, args_array, exe_ctx, result): for long_option, elem in opt_def.items(): dest = elem["dest"] result.AppendMessage( - f"{long_option} (set: {elem['_value_set']}): {object.__getattribute__(self.ov_parser, dest)}\n" + f"{long_option} (set: {elem['_value_set']}): {object.__getattribute__(self.get_parser(), dest)}\n" ) else: result.AppendMessage("No options\n") @@ -31,7 +31,6 @@ def __call__(self, debugger, args_array, exe_ctx, result): f"{idx}: {args_array.GetItemAtIndex(idx).GetStringValue(10000)}\n" ) - # Use these to make sure that get_repeat_command sends the right # command. 
no_args_repeat = None @@ -49,7 +48,8 @@ def register_lldb_command(cls, debugger, module_name): ParsedCommand.do_register_cmd(cls, debugger, module_name) def setup_command_definition(self): - self.ov_parser.add_option( + ov_parser = self.get_parser() + ov_parser.add_option( "b", "bool-arg", "a boolean arg, defaults to True", @@ -59,7 +59,7 @@ def setup_command_definition(self): default=True, ) - self.ov_parser.add_option( + ov_parser.add_option( "s", "shlib-name", "A shared library name.", @@ -69,7 +69,7 @@ def setup_command_definition(self): default=None, ) - self.ov_parser.add_option( + ov_parser.add_option( "d", "disk-file-name", "An on disk filename", @@ -78,7 +78,7 @@ def setup_command_definition(self): default=None, ) - self.ov_parser.add_option( + ov_parser.add_option( "l", "line-num", "A line number", @@ -88,7 +88,7 @@ def setup_command_definition(self): default=0, ) - self.ov_parser.add_option( + ov_parser.add_option( "e", "enum-option", "An enum, doesn't actually do anything", @@ -126,8 +126,9 @@ def register_lldb_command(cls, debugger, module_name): ParsedCommand.do_register_cmd(cls, debugger, module_name) def setup_command_definition(self): - self.ov_parser.add_argument_set( - [self.ov_parser.make_argument_element(lldb.eArgTypeSourceFile, "plain")] + ov_parser = self.get_parser() + ov_parser.add_argument_set( + [ov_parser.make_argument_element(lldb.eArgTypeSourceFile, "plain")] ) def get_repeat_command(self, command): @@ -154,7 +155,8 @@ def register_lldb_command(cls, debugger, module_name): ParsedCommand.do_register_cmd(cls, debugger, module_name) def setup_command_definition(self): - self.ov_parser.add_option( + ov_parser = self.get_parser() + ov_parser.add_option( "l", "language", "language defaults to None", @@ -164,7 +166,7 @@ def setup_command_definition(self): default=None, ) - self.ov_parser.add_option( + ov_parser.add_option( "c", "log-channel", "log channel - defaults to lldb", @@ -174,7 +176,7 @@ def setup_command_definition(self): default="lldb", ) - self.ov_parser.add_option( + ov_parser.add_option( "p", "process-name", "A process name, defaults to None", @@ -183,25 +185,23 @@ def setup_command_definition(self): default=None, ) - self.ov_parser.add_argument_set( + ov_parser.add_argument_set( [ - self.ov_parser.make_argument_element( + ov_parser.make_argument_element( lldb.eArgTypeClassName, "plain", [1, 2] ), - self.ov_parser.make_argument_element( + ov_parser.make_argument_element( lldb.eArgTypeOffset, "optional", [1, 2] ), ] ) - self.ov_parser.add_argument_set( + ov_parser.add_argument_set( [ - self.ov_parser.make_argument_element( + ov_parser.make_argument_element( lldb.eArgTypePythonClass, "plain", [3, 4] ), - self.ov_parser.make_argument_element( - lldb.eArgTypePid, "optional", [3, 4] - ), + ov_parser.make_argument_element(lldb.eArgTypePid, "optional", [3, 4]), ] ) @@ -210,6 +210,35 @@ def get_repeat_command(self, command): two_arg_repeat = command return command + " THIRD_ARG" + def handle_option_argument_completion(self, long_option, cursor_pos): + ov_parser = self.get_parser() + value = ov_parser.dest_for_option(long_option)[0 : cursor_pos + 1] + proc_value = ov_parser.proc_name + if proc_value != None: + new_str = value + proc_value + ret_arr = {"completion": new_str, "mode": "partial"} + return ret_arr + + ret_arr = {"values": [value + "nice", value + "not_nice", value + "mediocre"]} + return ret_arr + + def handle_argument_completion(self, args, arg_pos, cursor_pos): + ov_parser = self.get_parser() + orig_arg = args[arg_pos][0:cursor_pos] + if orig_arg == 
"correct_": + ret_arr = {"completion": "correct_answer"} + return ret_arr + + if ov_parser.was_set("process-name"): + # No completions if proc_name was set. + return True + + ret_arr = { + "values": [orig_arg + "cool", orig_arg + "yuck"], + "descriptions": ["good idea", "bad idea"], + } + return ret_arr + def get_short_help(self): return "This is my short help string" diff --git a/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp b/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp index c67a2b4bf46e64..3faeb587c3a91b 100644 --- a/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp +++ b/lldb/unittests/ScriptInterpreter/Python/PythonTestSuite.cpp @@ -211,6 +211,19 @@ LLDBSwigPythonGetRepeatCommandForScriptedCommand(PyObject *implementor, return std::nullopt; } +StructuredData::DictionarySP +LLDBSwigPythonHandleArgumentCompletionForScriptedCommand( + PyObject *implementor, std::vector &args, size_t args_pos, + size_t pos_in_arg) { + return {}; +} + +StructuredData::DictionarySP +LLDBSwigPythonHandleOptionArgumentCompletionForScriptedCommand( + PyObject *implementor, llvm::StringRef &long_options, size_t char_in_arg) { + return {}; +} + bool lldb_private::python::SWIGBridge::LLDBSwigPythonCallModuleInit( const char *python_module_name, const char *session_dictionary_name, lldb::DebuggerSP debugger) { From 090755234e5033de67be994e99e31f4d13dcdcc5 Mon Sep 17 00:00:00 2001 From: Chris Apple Date: Tue, 24 Sep 2024 10:01:12 -0700 Subject: [PATCH 39/49] [rtsan] Refactor initialization state to prevent hangs (#109830) Move to tri state (Uninitialized, Initialized, Initializing) for our init state. We currently just have 2 (Uninitialized and Initialized). --- compiler-rt/lib/rtsan/rtsan.cpp | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/compiler-rt/lib/rtsan/rtsan.cpp b/compiler-rt/lib/rtsan/rtsan.cpp index b288da64ffbe25..e6d2481b2c2a3d 100644 --- a/compiler-rt/lib/rtsan/rtsan.cpp +++ b/compiler-rt/lib/rtsan/rtsan.cpp @@ -22,11 +22,25 @@ using namespace __rtsan; using namespace __sanitizer; +namespace { +enum class InitializationState : u8 { + Uninitialized, + Initializing, + Initialized, +}; +} // namespace + static StaticSpinMutex rtsan_inited_mutex; static atomic_uint8_t rtsan_initialized = {0}; -static void SetInitialized() { - atomic_store(&rtsan_initialized, 1, memory_order_release); +static void SetInitializationState(InitializationState state) { + atomic_store(&rtsan_initialized, static_cast(state), + memory_order_release); +} + +static InitializationState GetInitializationState() { + return static_cast( + atomic_load(&rtsan_initialized, memory_order_acquire)); } static auto PrintDiagnosticsAndDieAction(DiagnosticsInfo info) { @@ -39,13 +53,14 @@ static auto PrintDiagnosticsAndDieAction(DiagnosticsInfo info) { extern "C" { SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_init() { - CHECK(!__rtsan_is_initialized()); + CHECK(GetInitializationState() == InitializationState::Uninitialized); + SetInitializationState(InitializationState::Initializing); SanitizerToolName = "RealtimeSanitizer"; InitializeFlags(); InitializeInterceptors(); - SetInitialized(); + SetInitializationState(InitializationState::Initialized); } SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_ensure_initialized() { @@ -62,7 +77,7 @@ SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_ensure_initialized() { } SANITIZER_INTERFACE_ATTRIBUTE bool __rtsan_is_initialized() { - return atomic_load(&rtsan_initialized, memory_order_acquire) == 1; + return 
GetInitializationState() == InitializationState::Initialized; } SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_realtime_enter() { @@ -83,6 +98,10 @@ SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_enable() { SANITIZER_INTERFACE_ATTRIBUTE void __rtsan_notify_intercepted_call(const char *func_name) { + // While initializing, we need all intercepted functions to behave normally + if (GetInitializationState() == InitializationState::Initializing) + return; + __rtsan_ensure_initialized(); GET_CALLER_PC_BP; ExpectNotRealtime( From f4042077e2e3946ee35c1df8cab8237de6086480 Mon Sep 17 00:00:00 2001 From: vporpo Date: Tue, 24 Sep 2024 10:12:12 -0700 Subject: [PATCH 40/49] [SandboxIR] Implement a few Instruction member functions (#109820) This patch implements a few sandboxir::Instruction predicate functions. --- llvm/include/llvm/SandboxIR/SandboxIR.h | 16 ++++++++++++++++ llvm/unittests/SandboxIR/SandboxIRTest.cpp | 13 +++++++++++++ 2 files changed, 29 insertions(+) diff --git a/llvm/include/llvm/SandboxIR/SandboxIR.h b/llvm/include/llvm/SandboxIR/SandboxIR.h index d5e239e70da613..d4c907ce8327dd 100644 --- a/llvm/include/llvm/SandboxIR/SandboxIR.h +++ b/llvm/include/llvm/SandboxIR/SandboxIR.h @@ -1935,6 +1935,22 @@ class Instruction : public sandboxir::User { /// \Returns this Instruction's opcode. Note that SandboxIR has its own opcode /// state to allow for new SandboxIR-specific instructions. Opcode getOpcode() const { return Opc; } + + // TODO: Missing function getOpcodeName(). + + bool isTerminator() const { + return cast(Val)->isTerminator(); + } + bool isUnaryOp() const { return cast(Val)->isUnaryOp(); } + bool isBinaryOp() const { return cast(Val)->isBinaryOp(); } + bool isIntDivRem() const { + return cast(Val)->isIntDivRem(); + } + bool isShift() const { return cast(Val)->isShift(); } + bool isCast() const { return cast(Val)->isCast(); } + + // TODO: More missing functions + /// Detach this from its parent BasicBlock without deleting it. void removeFromParent(); /// Detach this Value from its parent and delete it. diff --git a/llvm/unittests/SandboxIR/SandboxIRTest.cpp b/llvm/unittests/SandboxIR/SandboxIRTest.cpp index 1fcc9cbea152cd..42df09609b675c 100644 --- a/llvm/unittests/SandboxIR/SandboxIRTest.cpp +++ b/llvm/unittests/SandboxIR/SandboxIRTest.cpp @@ -1769,6 +1769,7 @@ define void @foo(i8 %v1, ptr %ptr) { store volatile i8 %ld0, ptr %ptr %atomicrmw = atomicrmw add ptr %ptr, i8 %v1 acquire %udiv = udiv i8 %ld0, %v1 + %urem = urem i8 %ld0, %v1 call void @foo() ret void } @@ -1861,6 +1862,18 @@ define void @foo(i8 %v1, ptr %ptr) { for (auto &LLVMI : *LLVMBB1) { auto &I = cast(*Ctx.getValue(&LLVMI)); + // Check isTerminator(). + EXPECT_EQ(LLVMI.isTerminator(), I.isTerminator()); + // Check isUnaryOp(). + EXPECT_EQ(LLVMI.isUnaryOp(), I.isUnaryOp()); + // Check isBinaryOp(). + EXPECT_EQ(LLVMI.isBinaryOp(), I.isBinaryOp()); + // Check isIntDivRem(). + EXPECT_EQ(LLVMI.isIntDivRem(), I.isIntDivRem()); + // Check isShift(). + EXPECT_EQ(LLVMI.isShift(), I.isShift()); + // Check isCast(). + EXPECT_EQ(LLVMI.isCast(), I.isCast()); // Check isAssociative(). EXPECT_EQ(LLVMI.isAssociative(), I.isAssociative()); // Check isCommutative(). 
From 234193bae6cf8b19703c6e543b100517bb99a9f7 Mon Sep 17 00:00:00 2001 From: Nirvedh Meshram <96096277+nirvedhmeshram@users.noreply.github.com> Date: Tue, 24 Sep 2024 12:24:59 -0500 Subject: [PATCH 41/49] [mlir][linalg] Vectorization support for convolution of i1 type (#109480) Normally convolutions present with the following linalg op region ``` ^bb0(%arg14: i4, %arg15: i4, %arg16: i4): %17 = arith.muli %arg14, %arg15 : i4 %18 = arith.addi %arg16, %17 : i4 linalg.yield %18 : i4 ``` However, for i1 due to strength reduction we get something like ``` ^bb0(%arg14: i1, %arg15: i1, %arg16: i1): %17 = arith.andi %arg14, %arg15 : i1 %18 = arith.ori %arg16, %17 : i1 linalg.yield %18 : i1 ``` This PR updates the logic to support this region for i1 types. --- .../Linalg/Transforms/Vectorization.cpp | 20 ++++-- .../Dialect/Linalg/vectorize-convolution.mlir | 65 +++++++++++++++++++ 2 files changed, 81 insertions(+), 4 deletions(-) diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index c332307da4d333..fa20001f661822 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -2987,10 +2987,15 @@ struct Conv1DGenerator if (!setOperKind(reduceOp)) return; auto maybeKind = getCombinerOpKind(reduceOp); - if (!maybeKind || (*maybeKind != vector::CombiningKind::ADD && + // Typically convolution will have a `Add` CombiningKind but for i1 type it + // can get strength reduced to `OR` which is also supported. This strength + // reduction logic is in `buildBinaryFn` helper in the Linalg dialect. + if (!maybeKind || ((*maybeKind != vector::CombiningKind::ADD && + *maybeKind != vector::CombiningKind::OR) && (oper != Pool || !isSupportedPoolKind(*maybeKind)))) { return; } + reductionKind = maybeKind.value(); auto rhsRank = rhsShapedType.getRank(); switch (oper) { @@ -3273,10 +3278,12 @@ struct Conv1DGenerator bindDims(ctx, n, w, f, c); lhs = promote(rewriter, loc, lhs, res.getType()); rhs = promote(rewriter, loc, rhs, res.getType()); - return rewriter.create( + auto contrationOp = rewriter.create( loc, lhs, rhs, res, /*indexingMaps=*/MapList{{n, w, c}, {c, f}, {n, w, f}}, /*iteratorTypes=*/ArrayRef{par, par, par, red}); + contrationOp.setKind(reductionKind); + return contrationOp; } // Create an outerproduct: lhs{w} * rhs{1} -> res{w} for single channel @@ -3666,6 +3673,7 @@ struct Conv1DGenerator int strideW, dilationW; Value lhsShaped, rhsShaped, resShaped; ShapedType lhsShapedType, rhsShapedType, resShapedType; + vector::CombiningKind reductionKind; // Sets oper, poolExtOp and isPoolExt for valid conv/pooling ops. // Returns true iff it is a valid conv/pooling op. @@ -3681,7 +3689,9 @@ struct Conv1DGenerator switch (numBlockArguments) { case 1: { // Will be convolution if feeder is a MulOp. - // Otherwise, if it can be pooling. + // A strength reduced version of MulOp for i1 type is AndOp which is also + // supported. Otherwise, it can be pooling. This strength reduction logic + // is in `buildBinaryFn` helper in the Linalg dialect. 
auto feedValIt = llvm::find_if_not(reduceOp->getOperands(), llvm::IsaPred); Operation *feedOp = (*feedValIt).getDefiningOp(); @@ -3689,7 +3699,9 @@ struct Conv1DGenerator oper = Pool; isPoolExt = true; poolExtOp = feedOp->getName().getIdentifier(); - } else if (!(isa(feedOp) && + } else if (!((isa(feedOp) || + (isa(feedOp) && + feedOp->getResultTypes()[0].isInteger(1))) && llvm::all_of(feedOp->getOperands(), [](Value v) { if (isa(v)) return true; diff --git a/mlir/test/Dialect/Linalg/vectorize-convolution.mlir b/mlir/test/Dialect/Linalg/vectorize-convolution.mlir index 93e36a69567bd5..7f4b9b986c81b4 100644 --- a/mlir/test/Dialect/Linalg/vectorize-convolution.mlir +++ b/mlir/test/Dialect/Linalg/vectorize-convolution.mlir @@ -39,6 +39,7 @@ func.func @conv1d_nwc_4x2x8_memref(%input: memref<4x6x3xf32>, %filter: memref<1x // CHECK: %[[CONTRACT_0:.+]] = vector.contract { // CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]], // CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"] +// CHECK-SAME: kind = #vector.kind // CHECK-SAME: %[[V_INPUT_0]], %[[V_FILTER]], %[[V_OUTPUT_0]] // CHECK-SAME: : vector<4x1x3xf32>, vector<3x8xf32> into vector<4x1x8xf32> @@ -46,6 +47,7 @@ func.func @conv1d_nwc_4x2x8_memref(%input: memref<4x6x3xf32>, %filter: memref<1x // CHECK: %[[CONTRACT_1:.+]] = vector.contract { // CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]], // CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"] +// CHECK-SAME: kind = #vector.kind // CHECK-SAME: %[[V_INPUT_1]], %[[V_FILTER]], %[[V_OUTPUT_1]] // CHECK-SAME: : vector<4x1x3xf32>, vector<3x8xf32> into vector<4x1x8xf32> @@ -61,6 +63,36 @@ func.func @conv1d_nwc_4x2x8_memref(%input: memref<4x6x3xf32>, %filter: memref<1x // ----- +// This test is same as above but for i1 type with the only difference being that +// the combining kind for `vector.contract` is `OR`. +func.func @conv1d_nwc_4x2x8_memref_i1(%input: memref<4x6x3xi1>, %filter: memref<1x3x8xi1>, %output: memref<4x2x8xi1>) { + linalg.conv_1d_nwc_wcf + {dilations = dense<1> : tensor<1xi64>, strides = dense<3> : tensor<1xi64>} + ins(%input, %filter : memref<4x6x3xi1>, memref<1x3x8xi1>) + outs(%output : memref<4x2x8xi1>) + return +} +// CHECK: #[[INPUT_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> +// CHECK: #[[FILTER_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d3, d2)> +// CHECK: #[[OUTPUT_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> + +// CHECK: func @conv1d_nwc_4x2x8_memref_i1 +/// w == 0, kw == 0 +// CHECK: %[[CONTRACT_0:.+]] = vector.contract { +// CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]], +// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"] +// CHECK-SAME: kind = #vector.kind +// CHECK-SAME: : vector<4x1x3xi1>, vector<3x8xi1> into vector<4x1x8xi1> + +/// w == 1, kw == 0 +// CHECK: %[[CONTRACT_1:.+]] = vector.contract { +// CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]], +// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"] +// CHECK-SAME: kind = #vector.kind +// CHECK-SAME: : vector<4x1x3xi1>, vector<3x8xi1> into vector<4x1x8xi1> + +// ----- + // The i8i8i32 case is similar to f32 case, so checking one case is enough for // test coverage. 
func.func @conv1d_nwc_4x2x8_i8i8i32_memref(%input: memref<4x6x3xi8>, %filter: memref<1x3x8xi8>, %output: memref<4x2x8xi32>) { @@ -299,6 +331,7 @@ func.func @conv1d_ncw_4x8x2_memref(%input: memref<4x3x6xf32>, %filter: memref<8x // CHECK: %[[CONTRACT_0:.+]] = vector.contract { // CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]], // CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"] +// CHECK-SAME: kind = #vector.kind // CHECK-SAME: %[[V_INPUT_0]], %[[V_FILTER]], %[[V_OUTPUT_0]] // CHECK-SAME: : vector<4x1x3xf32>, vector<3x8xf32> into vector<4x1x8xf32> @@ -306,6 +339,7 @@ func.func @conv1d_ncw_4x8x2_memref(%input: memref<4x3x6xf32>, %filter: memref<8x // CHECK: %[[CONTRACT_1:.+]] = vector.contract { // CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]], // CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"] +// CHECK-SAME: kind = #vector.kind // CHECK-SAME: %[[V_INPUT_1]], %[[V_FILTER]], %[[V_OUTPUT_1]] // CHECK-SAME: : vector<4x1x3xf32>, vector<3x8xf32> into vector<4x1x8xf32> @@ -324,6 +358,37 @@ func.func @conv1d_ncw_4x8x2_memref(%input: memref<4x3x6xf32>, %filter: memref<8x // ----- +// This test is same as above but for i1 type with the only difference being that +// the combining kind for `vector.contract` is `OR`. +func.func @conv1d_ncw_4x8x2_memref_i1(%input: memref<4x3x6xi1>, %filter: memref<8x3x1xi1>, %output: memref<4x8x2xi1>) { + linalg.conv_1d_ncw_fcw + {dilations = dense<1> : tensor<1xi64>, strides = dense<3> : tensor<1xi64>} + ins(%input, %filter : memref<4x3x6xi1>, memref<8x3x1xi1>) + outs(%output : memref<4x8x2xi1>) + return +} + +// CHECK: #[[INPUT_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d3)> +// CHECK: #[[FILTER_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d3, d2)> +// CHECK: #[[OUTPUT_MAP:.+]] = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)> + +// CHECK: func @conv1d_ncw_4x8x2_memref_i1 +/// w == 0, kw == 0 +// CHECK: vector.contract { +// CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]], +// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"] +// CHECK-SAME: kind = #vector.kind +// CHECK-SAME: : vector<4x1x3xi1>, vector<3x8xi1> into vector<4x1x8xi1> + +/// w == 1, kw == 0 +// CHECK: vector.contract { +// CHECK-SAME: indexing_maps = [#[[INPUT_MAP]], #[[FILTER_MAP]], #[[OUTPUT_MAP]]], +// CHECK-SAME: iterator_types = ["parallel", "parallel", "parallel", "reduction"] +// CHECK-SAME: kind = #vector.kind +// CHECK-SAME: : vector<4x1x3xi1>, vector<3x8xi1> into vector<4x1x8xi1> + +// ----- + func.func @conv1d_ncw_4x8x2_memref(%input: memref<4x3x6xf32>, %filter: memref<8x3x2xf32>, %output: memref<4x8x2xf32>) { linalg.conv_1d_ncw_fcw {dilations = dense<2> : tensor<1xi64>, strides = dense<3> : tensor<1xi64>} From 26029d77a57cb4aaa1479064109e985a90d0edd8 Mon Sep 17 00:00:00 2001 From: Tex Riddell Date: Tue, 24 Sep 2024 10:42:26 -0700 Subject: [PATCH 42/49] [DirectX] Add atan2 intrinsic and expand for DXIL backend (p1) (#108865) This change is part of this proposal: https://discourse.llvm.org/t/rfc-all-the-math-intrinsics/78294 This preliminary work adds the intrinsic to llvm and expands using atan intrinsic for DXIL backend, since DXIL has no atan2 op. Part 1 for Implement the atan2 HLSL Function #70096. 
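For illustration only (not code from this change, and the `atan2_expanded` name below
is made up): the expansion corresponds roughly to the following scalar logic, sketched
in C++ under the assumption of ordinary IEEE float behaviour.

```
#include <cmath>

// Minimal sketch of the quadrant correction the IR expansion performs.
float atan2_expanded(float y, float x) {
  const float pi = 3.14159265358979323846f;
  float r = std::atan(y / x);           // base value, like the atan intrinsic call
  if (x < 0.0f)
    r = (y >= 0.0f) ? r + pi : r - pi;  // rotate into the correct half-plane
  if (x == 0.0f)
    r = (y < 0.0f) ? -pi / 2 : pi / 2;  // on the y-axis the division is not meaningful
  return r;                             // x > 0 keeps the plain atan result
}
```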
--- llvm/docs/LangRef.rst | 37 ++++++++ llvm/include/llvm/IR/Intrinsics.td | 1 + .../Target/DirectX/DXILIntrinsicExpansion.cpp | 52 +++++++++++ llvm/test/CodeGen/DirectX/atan2.ll | 87 +++++++++++++++++++ llvm/test/CodeGen/DirectX/atan2_error.ll | 11 +++ 5 files changed, 188 insertions(+) create mode 100644 llvm/test/CodeGen/DirectX/atan2.ll create mode 100644 llvm/test/CodeGen/DirectX/atan2_error.ll diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 91c3e60bb0acb1..41d1efab752fd7 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -15583,6 +15583,43 @@ trapping or setting ``errno``. When specified with the fast-math-flag 'afn', the result may be approximated using a less accurate calculation. +'``llvm.atan2.*``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" + +This is an overloaded intrinsic. You can use ``llvm.atan2`` on any +floating-point or vector of floating-point type. Not all targets support +all types however. + +:: + + declare float @llvm.atan2.f32(float %X, float %Y) + declare double @llvm.atan2.f64(double %X, double %Y) + declare x86_fp80 @llvm.atan2.f80(x86_fp80 %X, x86_fp80 %Y) + declare fp128 @llvm.atan2.f128(fp128 %X, fp128 %Y) + declare ppc_fp128 @llvm.atan2.ppcf128(ppc_fp128 %X, ppc_fp128 %Y) + +Overview: +""""""""" + +The '``llvm.atan2.*``' intrinsics return the arctangent of the operand. + +Arguments: +"""""""""" + +The arguments and return value are floating-point numbers of the same type. + +Semantics: +"""""""""" + +Return the same value as a corresponding libm '``atan2``' function but without +trapping or setting ``errno``. + +When specified with the fast-math-flag 'afn', the result may be approximated +using a less accurate calculation. + '``llvm.sinh.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 0a74a217a5f010..48d57907e6d0bc 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1016,6 +1016,7 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrWillReturn] in { def int_asin : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_acos : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_atan : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; + def int_atan2 : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>]>; def int_sin : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_cos : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_tan : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp index dd73b895b14d37..926cbe97f24fda 100644 --- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp +++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp @@ -36,6 +36,7 @@ using namespace llvm; static bool isIntrinsicExpansion(Function &F) { switch (F.getIntrinsicID()) { case Intrinsic::abs: + case Intrinsic::atan2: case Intrinsic::exp: case Intrinsic::log: case Intrinsic::log10: @@ -307,6 +308,54 @@ static Value *expandNormalizeIntrinsic(CallInst *Orig) { return Builder.CreateFMul(X, MultiplicandVec); } +static Value *expandAtan2Intrinsic(CallInst *Orig) { + Value *Y = Orig->getOperand(0); + Value *X = Orig->getOperand(1); + Type *Ty = X->getType(); + IRBuilder<> Builder(Orig); + Builder.setFastMathFlags(Orig->getFastMathFlags()); + + Value *Tan = 
Builder.CreateFDiv(Y, X); + + CallInst *Atan = + Builder.CreateIntrinsic(Ty, Intrinsic::atan, {Tan}, nullptr, "Elt.Atan"); + Atan->setTailCall(Orig->isTailCall()); + Atan->setAttributes(Orig->getAttributes()); + + // Modify atan result based on https://en.wikipedia.org/wiki/Atan2. + Constant *Pi = ConstantFP::get(Ty, llvm::numbers::pi); + Constant *HalfPi = ConstantFP::get(Ty, llvm::numbers::pi / 2); + Constant *NegHalfPi = ConstantFP::get(Ty, -llvm::numbers::pi / 2); + Constant *Zero = ConstantFP::get(Ty, 0); + Value *AtanAddPi = Builder.CreateFAdd(Atan, Pi); + Value *AtanSubPi = Builder.CreateFSub(Atan, Pi); + + // x > 0 -> atan. + Value *Result = Atan; + Value *XLt0 = Builder.CreateFCmpOLT(X, Zero); + Value *XEq0 = Builder.CreateFCmpOEQ(X, Zero); + Value *YGe0 = Builder.CreateFCmpOGE(Y, Zero); + Value *YLt0 = Builder.CreateFCmpOLT(Y, Zero); + + // x < 0, y >= 0 -> atan + pi. + Value *XLt0AndYGe0 = Builder.CreateAnd(XLt0, YGe0); + Result = Builder.CreateSelect(XLt0AndYGe0, AtanAddPi, Result); + + // x < 0, y < 0 -> atan - pi. + Value *XLt0AndYLt0 = Builder.CreateAnd(XLt0, YLt0); + Result = Builder.CreateSelect(XLt0AndYLt0, AtanSubPi, Result); + + // x == 0, y < 0 -> -pi/2 + Value *XEq0AndYLt0 = Builder.CreateAnd(XEq0, YLt0); + Result = Builder.CreateSelect(XEq0AndYLt0, NegHalfPi, Result); + + // x == 0, y > 0 -> pi/2 + Value *XEq0AndYGe0 = Builder.CreateAnd(XEq0, YGe0); + Result = Builder.CreateSelect(XEq0AndYGe0, HalfPi, Result); + + return Result; +} + static Value *expandPowIntrinsic(CallInst *Orig) { Value *X = Orig->getOperand(0); @@ -418,6 +467,9 @@ static bool expandIntrinsic(Function &F, CallInst *Orig) { case Intrinsic::abs: Result = expandAbs(Orig); break; + case Intrinsic::atan2: + Result = expandAtan2Intrinsic(Orig); + break; case Intrinsic::exp: Result = expandExpIntrinsic(Orig); break; diff --git a/llvm/test/CodeGen/DirectX/atan2.ll b/llvm/test/CodeGen/DirectX/atan2.ll new file mode 100644 index 00000000000000..9d86f87f3ed50e --- /dev/null +++ b/llvm/test/CodeGen/DirectX/atan2.ll @@ -0,0 +1,87 @@ +; RUN: opt -S -dxil-intrinsic-expansion -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK +; RUN: opt -S -dxil-intrinsic-expansion -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK + +; Make sure correct dxil expansions for atan2 are generated for float and half. 
+ +define noundef float @atan2_float(float noundef %y, float noundef %x) { +entry: +; CHECK: [[DIV:%.+]] = fdiv float %y, %x +; EXPCHECK: [[ATAN:%.+]] = call float @llvm.atan.f32(float [[DIV]]) +; DOPCHECK: [[ATAN:%.+]] = call float @dx.op.unary.f32(i32 17, float [[DIV]]) +; CHECK-DAG: [[ADD_PI:%.+]] = fadd float [[ATAN]], 0x400921FB60000000 +; CHECK-DAG: [[SUB_PI:%.+]] = fsub float [[ATAN]], 0x400921FB60000000 +; CHECK-DAG: [[X_LT_0:%.+]] = fcmp olt float %x, 0.000000e+00 +; CHECK-DAG: [[X_EQ_0:%.+]] = fcmp oeq float %x, 0.000000e+00 +; CHECK-DAG: [[Y_GE_0:%.+]] = fcmp oge float %y, 0.000000e+00 +; CHECK-DAG: [[Y_LT_0:%.+]] = fcmp olt float %y, 0.000000e+00 +; CHECK: [[XLT0_AND_YGE0:%.+]] = and i1 [[X_LT_0]], [[Y_GE_0]] +; CHECK: [[SELECT_ADD_PI:%.+]] = select i1 [[XLT0_AND_YGE0]], float [[ADD_PI]], float [[ATAN]] +; CHECK: [[XLT0_AND_YLT0:%.+]] = and i1 [[X_LT_0]], [[Y_LT_0]] +; CHECK: [[SELECT_SUB_PI:%.+]] = select i1 [[XLT0_AND_YLT0]], float [[SUB_PI]], float [[SELECT_ADD_PI]] +; CHECK: [[XEQ0_AND_YLT0:%.+]] = and i1 [[X_EQ_0]], [[Y_LT_0]] +; CHECK: [[SELECT_NEGHPI:%.+]] = select i1 [[XEQ0_AND_YLT0]], float 0xBFF921FB60000000, float [[SELECT_SUB_PI]] +; CHECK: [[XEQ0_AND_YGE0:%.+]] = and i1 [[X_EQ_0]], [[Y_GE_0]] +; CHECK: [[SELECT_HPI:%.+]] = select i1 [[XEQ0_AND_YGE0]], float 0x3FF921FB60000000, float [[SELECT_NEGHPI]] +; CHECK: ret float [[SELECT_HPI]] + %elt.atan2 = call float @llvm.atan2.f32(float %y, float %x) + ret float %elt.atan2 +} + +define noundef half @atan2_half(half noundef %y, half noundef %x) { +entry: +; CHECK: [[DIV:%.+]] = fdiv half %y, %x +; EXPCHECK: [[ATAN:%.+]] = call half @llvm.atan.f16(half [[DIV]]) +; DOPCHECK: [[ATAN:%.+]] = call half @dx.op.unary.f16(i32 17, half [[DIV]]) +; CHECK-DAG: [[ADD_PI:%.+]] = fadd half [[ATAN]], 0xH4248 +; CHECK-DAG: [[SUB_PI:%.+]] = fsub half [[ATAN]], 0xH4248 +; CHECK-DAG: [[X_LT_0:%.+]] = fcmp olt half %x, 0xH0000 +; CHECK-DAG: [[X_EQ_0:%.+]] = fcmp oeq half %x, 0xH0000 +; CHECK-DAG: [[Y_GE_0:%.+]] = fcmp oge half %y, 0xH0000 +; CHECK-DAG: [[Y_LT_0:%.+]] = fcmp olt half %y, 0xH0000 +; CHECK: [[XLT0_AND_YGE0:%.+]] = and i1 [[X_LT_0]], [[Y_GE_0]] +; CHECK: [[SELECT_ADD_PI:%.+]] = select i1 [[XLT0_AND_YGE0]], half [[ADD_PI]], half [[ATAN]] +; CHECK: [[XLT0_AND_YLT0:%.+]] = and i1 [[X_LT_0]], [[Y_LT_0]] +; CHECK: [[SELECT_SUB_PI:%.+]] = select i1 [[XLT0_AND_YLT0]], half [[SUB_PI]], half [[SELECT_ADD_PI]] +; CHECK: [[XEQ0_AND_YLT0:%.+]] = and i1 [[X_EQ_0]], [[Y_LT_0]] +; CHECK: [[SELECT_NEGHPI:%.+]] = select i1 [[XEQ0_AND_YLT0]], half 0xHBE48, half [[SELECT_SUB_PI]] +; CHECK: [[XEQ0_AND_YGE0:%.+]] = and i1 [[X_EQ_0]], [[Y_GE_0]] +; CHECK: [[SELECT_HPI:%.+]] = select i1 [[XEQ0_AND_YGE0]], half 0xH3E48, half [[SELECT_NEGHPI]] +; CHECK: ret half [[SELECT_HPI]] + %elt.atan2 = call half @llvm.atan2.f16(half %y, half %x) + ret half %elt.atan2 +} + +define noundef <4 x float> @atan2_float4(<4 x float> noundef %y, <4 x float> noundef %x) { +entry: +; Just Expansion, no scalarization or lowering: +; EXPCHECK: [[DIV:%.+]] = fdiv <4 x float> %y, %x +; EXPCHECK: [[ATAN:%.+]] = call <4 x float> @llvm.atan.v4f32(<4 x float> [[DIV]]) +; EXPCHECK-DAG: [[ADD_PI:%.+]] = fadd <4 x float> [[ATAN]], +; EXPCHECK-DAG: [[SUB_PI:%.+]] = fsub <4 x float> [[ATAN]], +; EXPCHECK-DAG: [[X_LT_0:%.+]] = fcmp olt <4 x float> %x, zeroinitializer +; EXPCHECK-DAG: [[X_EQ_0:%.+]] = fcmp oeq <4 x float> %x, zeroinitializer +; EXPCHECK-DAG: [[Y_GE_0:%.+]] = fcmp oge <4 x float> %y, zeroinitializer +; EXPCHECK-DAG: [[Y_LT_0:%.+]] = fcmp olt <4 x float> %y, zeroinitializer 
+; EXPCHECK: [[XLT0_AND_YGE0:%.+]] = and <4 x i1> [[X_LT_0]], [[Y_GE_0]] +; EXPCHECK: [[SELECT_ADD_PI:%.+]] = select <4 x i1> [[XLT0_AND_YGE0]], <4 x float> [[ADD_PI]], <4 x float> [[ATAN]] +; EXPCHECK: [[XLT0_AND_YLT0:%.+]] = and <4 x i1> [[X_LT_0]], [[Y_LT_0]] +; EXPCHECK: [[SELECT_SUB_PI:%.+]] = select <4 x i1> [[XLT0_AND_YLT0]], <4 x float> [[SUB_PI]], <4 x float> [[SELECT_ADD_PI]] +; EXPCHECK: [[XEQ0_AND_YLT0:%.+]] = and <4 x i1> [[X_EQ_0]], [[Y_LT_0]] +; EXPCHECK: [[SELECT_NEGHPI:%.+]] = select <4 x i1> [[XEQ0_AND_YLT0]], <4 x float> , <4 x float> [[SELECT_SUB_PI]] +; EXPCHECK: [[XEQ0_AND_YGE0:%.+]] = and <4 x i1> [[X_EQ_0]], [[Y_GE_0]] +; EXPCHECK: [[SELECT_HPI:%.+]] = select <4 x i1> [[XEQ0_AND_YGE0]], <4 x float> , <4 x float> [[SELECT_NEGHPI]] +; EXPCHECK: ret <4 x float> [[SELECT_HPI]] + +; Scalarization occurs after expansion, so atan scalarization is tested separately. +; Expansion, scalarization and lowering: +; Just make sure this expands to exactly 4 scalar DXIL atan (OpCode=17) calls. +; DOPCHECK-COUNT-4: call float @dx.op.unary.f32(i32 17, float %{{.*}}) +; DOPCHECK-NOT: call float @dx.op.unary.f32(i32 17, + + %elt.atan2 = call <4 x float> @llvm.atan2.v4f32(<4 x float> %y, <4 x float> %x) + ret <4 x float> %elt.atan2 +} + +declare half @llvm.atan2.f16(half, half) +declare float @llvm.atan2.f32(float, float) +declare <4 x float> @llvm.atan2.v4f32(<4 x float>, <4 x float>) diff --git a/llvm/test/CodeGen/DirectX/atan2_error.ll b/llvm/test/CodeGen/DirectX/atan2_error.ll new file mode 100644 index 00000000000000..5b3077f85f5d4e --- /dev/null +++ b/llvm/test/CodeGen/DirectX/atan2_error.ll @@ -0,0 +1,11 @@ +; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s + +; DXIL operation atan does not support double overload type +; CHECK: in function atan2_double +; CHECK-SAME: Cannot create ATan operation: Invalid overload type + +define noundef double @atan2_double(double noundef %a, double noundef %b) #0 { +entry: + %1 = call double @llvm.atan2.f64(double %a, double %b) + ret double %1 +} From 12285cca5ed713dfd483bd96422a5607b8af0085 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Rodr=C3=ADguez=20Troiti=C3=B1o?= Date: Tue, 24 Sep 2024 10:55:35 -0700 Subject: [PATCH 43/49] [DWARF] Use ULEB128 and not just one byte for directory indices (#109067) According to the standard `DW_LNCT_directory_index` can be `data1`, `data2`, or `udata` (see 6.2.4.1). The code was using `data1`, but this limits the number of directories to 256, even if the variable holding the directory index is a `uint64_t`. `dsymutil` was hitting an assertion when trying to write directory indices higher than 255. Modify the classic and the parallel DWARF linkers to use `udata` and encode the directory indices as ULEB128 and provide a test that has more than 256 directories to check the changes are working as expected. For people that were using `dsymutil` with CUs that had between 128-256 directories, this will mean that for those indices 2 bytes will be used now, instead of just one. 
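As a standalone sketch of the size difference (illustrative only, not code from this
change; the helper name below is made up): ULEB128 stores 7 payload bits per byte with
a continuation bit, so indices up to 127 still take one byte, indices 128-255 take two
bytes, and larger indices remain representable at all.

```
#include <cstdint>
#include <vector>

// Encodes a value following the unsigned LEB128 rules from the DWARF spec.
std::vector<uint8_t> encodeUnsignedLEB128(uint64_t value) {
  std::vector<uint8_t> bytes;
  do {
    uint8_t byte = value & 0x7f;  // low 7 bits of the remaining value
    value >>= 7;
    if (value != 0)
      byte |= 0x80;               // continuation bit: more bytes follow
    bytes.push_back(byte);
  } while (value != 0);
  return bytes;
}

// encodeUnsignedLEB128(100) -> {0x64}       : one byte, same size as DW_FORM_data1
// encodeUnsignedLEB128(200) -> {0xC8, 0x01} : two bytes (the 128-255 range noted above)
// encodeUnsignedLEB128(300) -> {0xAC, 0x02} : representable, unlike DW_FORM_data1
```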
--- .../lib/DWARFLinker/Classic/DWARFStreamer.cpp | 5 +- .../Parallel/DebugLineSectionEmitter.h | 4 +- .../X86/dwarf5-many-include-directories.test | 213 ++++++++++++++++++ 3 files changed, 217 insertions(+), 5 deletions(-) create mode 100644 llvm/test/tools/dsymutil/X86/dwarf5-many-include-directories.test diff --git a/llvm/lib/DWARFLinker/Classic/DWARFStreamer.cpp b/llvm/lib/DWARFLinker/Classic/DWARFStreamer.cpp index bca31253683530..947db9cbcd92d0 100644 --- a/llvm/lib/DWARFLinker/Classic/DWARFStreamer.cpp +++ b/llvm/lib/DWARFLinker/Classic/DWARFStreamer.cpp @@ -933,7 +933,7 @@ void DwarfStreamer::emitLineTablePrologueV5IncludeAndFileTable( LineSectionSize += MS->emitULEB128IntValue(StrForm); LineSectionSize += MS->emitULEB128IntValue(dwarf::DW_LNCT_directory_index); - LineSectionSize += MS->emitULEB128IntValue(dwarf::DW_FORM_data1); + LineSectionSize += MS->emitULEB128IntValue(dwarf::DW_FORM_udata); if (HasChecksums) { LineSectionSize += MS->emitULEB128IntValue(dwarf::DW_LNCT_MD5); @@ -952,8 +952,7 @@ void DwarfStreamer::emitLineTablePrologueV5IncludeAndFileTable( // file_names (sequence of file name entries). for (auto File : P.FileNames) { emitLineTableString(P, File.Name, DebugStrPool, DebugLineStrPool); - MS->emitInt8(File.DirIdx); - LineSectionSize += 1; + LineSectionSize += MS->emitULEB128IntValue(File.DirIdx); if (HasChecksums) { MS->emitBinaryData( StringRef(reinterpret_cast(File.Checksum.data()), diff --git a/llvm/lib/DWARFLinker/Parallel/DebugLineSectionEmitter.h b/llvm/lib/DWARFLinker/Parallel/DebugLineSectionEmitter.h index 38357c7f97314c..b035c4b1d6c30d 100644 --- a/llvm/lib/DWARFLinker/Parallel/DebugLineSectionEmitter.h +++ b/llvm/lib/DWARFLinker/Parallel/DebugLineSectionEmitter.h @@ -215,7 +215,7 @@ class DebugLineSectionEmitter { encodeULEB128(FileNameForm, Section.OS); encodeULEB128(dwarf::DW_LNCT_directory_index, Section.OS); - encodeULEB128(dwarf::DW_FORM_data1, Section.OS); + encodeULEB128(dwarf::DW_FORM_udata, Section.OS); if (HasChecksums) { encodeULEB128(dwarf::DW_LNCT_MD5, Section.OS); @@ -242,7 +242,7 @@ class DebugLineSectionEmitter { // A null-terminated string containing the full or relative path name of a // source file. 
Section.emitString(FileNameForm, *FileNameStr); - Section.emitIntVal(File.DirIdx, 1); + encodeULEB128(File.DirIdx, Section.OS); if (HasChecksums) { assert((File.Checksum.size() == 16) && diff --git a/llvm/test/tools/dsymutil/X86/dwarf5-many-include-directories.test b/llvm/test/tools/dsymutil/X86/dwarf5-many-include-directories.test new file mode 100644 index 00000000000000..644eecd26d8afe --- /dev/null +++ b/llvm/test/tools/dsymutil/X86/dwarf5-many-include-directories.test @@ -0,0 +1,213 @@ +# RUN: rm -rf %t && mkdir -p %t +# RUN: split-file %s %t +# RUN: %python %t/all.py > %t/all.ll +# RUN: sed 's@---TEMPORARY_DIR---@%{/t:regex_replacement}@' %t/debug.map.template > %t/debug.map +# RUN: %llc_dwarf -mtriple x86_64-apple-macosx10.4.0 -o %t/all.o -filetype=obj %t/all.ll +# RUN: dsymutil -f -y %t/debug.map -o - | llvm-dwarfdump -debug-line - | FileCheck %s +# RUN: dsymutil --linker parallel -f -y %t/debug.map -o - | llvm-dwarfdump -debug-line - | tee %t/output.txt | FileCheck %s + +# CHECK: include_directories[255] = "/tmp/tmp.0HPkdttdoU/d254" +# CHECK-NEXT: include_directories[256] = "/tmp/tmp.0HPkdttdoU/d255" +# CHECK-NEXT: include_directories[257] = "/tmp/tmp.0HPkdttdoU/d256" + +# CHECK: dir_index: 255 +# CHECK: dir_index: 256 +# CHECK: dir_index: 257 + +# Original file generated doing the following (fish shell): +# - for cnt in (seq 0 256); mkdir -p d$cnt ; printf "void func$cnd() {}\n#define FUNC$cnt func$cnt()\n" >> d$cnt/f$cnt.c ; end +# - for cnt in (seq 0 256); printf "#include \"f$cnt.c\"" >> all.c ; end +# - printf "void all() {\n" >> all.c +# - for cnt in (seq 0 256); printf "FUNC$cnt;\n" >> all.c ; end +# - printf "}\n" >> all.c +# - clang -target x86_64-apple-macos -S -emit-llvm -gdwarf-5 -o all.ll all.c (for cnt in (seq 0 256); echo "-Id$cnt"; end) +# - Edit all.ll manually and change all DIFile so the directory in filename is +# moved into the directory field. +# - Transformed into Python manually. 
+ +#--- all.py +import math +import string + +PROLOGUE = string.Template("""\ +; ModuleID = 'all.c' +source_filename = "all.c" +target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.4.0" +""") + +FUNCTION = string.Template("""\ +; Function Attrs: noinline nounwind optnone uwtable +define void @func$idx() #0 !dbg !$dbg_reference_subprogram { + ret void, !dbg !$dbg_reference_location_ret +} +""") + +ALL_FUNCTION_PROLOGUE = string.Template("""\ +; Function Attrs: noinline nounwind optnone uwtable +define void @all() #0 !dbg !$dbg_reference_subprogram { +""") + +ALL_FUNCTION_CALL = string.Template("""\ + call void @func$idx(), !dbg !$dbg_reference_location_call +""") + +ALL_FUNCTION_EPILOGUE = string.Template("""\ + ret void, !dbg !$dbg_reference_location_ret +} +""") + +DWARF_PROLOGUE = string.Template("""\ +attributes #0 = { noinline nounwind optnone uwtable "frame-pointer"="all" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="core2" "target-features"="+cmov,+cx16,+cx8,+fxsr,+mmx,+sahf,+sse,+sse2,+sse3,+ssse3,+x87" "tune-cpu"="generic" } + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!2, !3, !4, !5, !6, !7} +!llvm.ident = !{!8} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1, producer: "clang version 18.1.6 (CentOS 18.1.6-3.el9)", isOptimized: false, runtimeVersion: 0, emissionKind: FullDebug, splitDebugInlining: false, nameTableKind: Apple, sysroot: "/") +!1 = !DIFile(filename: "all.c", directory: "/tmp/tmp.0HPkdttdoU", checksumkind: CSK_MD5, checksum: "8b5068f097f0c272ddc808ed2d82cb12") +!2 = !{i32 7, !"Dwarf Version", i32 5} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !{i32 1, !"wchar_size", i32 4} +!5 = !{i32 8, !"PIC Level", i32 2} +!6 = !{i32 7, !"uwtable", i32 2} +!7 = !{i32 7, !"frame-pointer", i32 2} +!8 = !{!"clang version 18.1.6 (CentOS 18.1.6-3.el9)"} +""") + +DWARF_FUNCTION_WITH_TYPE = string.Template("""\ +!$dbg_reference_subprogram = distinct !DISubprogram(name: "func$idx", scope: !$dbg_reference_file, file: !$dbg_reference_file, line: 1, type: !11, scopeLine: 1, spFlags: DISPFlagDefinition, unit: !0) +!$dbg_reference_file = !DIFile(filename: "f$idx.c", directory: "/tmp/tmp.0HPkdttdoU/d$idx", checksumkind: CSK_MD5, checksum: "01234567890123456789012345678901") +!11 = !DISubroutineType(types: !12) +!12 = !{null} +!$dbg_reference_location = !DILocation(line: 1, column: $column, scope: !$dbg_reference_subprogram) +""") + +DWARF_FUNCTION = string.Template("""\ +!$dbg_reference_subprogram = distinct !DISubprogram(name: "func$idx", scope: !$dbg_reference_file, file: !$dbg_reference_file, line: 1, type: !11, scopeLine: 1, spFlags: DISPFlagDefinition, unit: !0) +!$dbg_reference_file = !DIFile(filename: "f$idx.c", directory: "/tmp/tmp.0HPkdttdoU/d$idx", checksumkind: CSK_MD5, checksum: "01234567890123456789012345678901") +!$dbg_reference_location = !DILocation(line: 1, column: $column, scope: !$dbg_reference_subprogram) +""") + +DWARF_ALL_FUNCTION_PROLOGUE = string.Template("""\ +!$dbg_reference_subprogram = distinct !DISubprogram(name: "all", scope: !1, file: !1, line: $line_number, type: !11, scopeLine: $line_number, spFlags: DISPFlagDefinition, unit: !0) +""") + +DWARF_ALL_FUNCTION_LOCATION = string.Template("""\ +!$dbg_reference_location = !DILocation(line: $line_number, column: 1, scope: !$dbg_reference_subprogram) +""") + +NUM_FUNCS = 257 + +dbg_reference_subprogram = 9 +dbg_reference_file = 10 
+dbg_reference_location = 13 +column_base = 15 +functions = [] +dwarf_subprograms = [] + +first = True +for idx in range(NUM_FUNCS): + functions.append( + FUNCTION.substitute( + idx=idx, + dbg_reference_subprogram=dbg_reference_subprogram, + dbg_reference_location_ret=dbg_reference_location, + ) + ) + if first: + dwarf_subprograms.append( + DWARF_FUNCTION_WITH_TYPE.substitute( + idx=idx, + dbg_reference_subprogram=dbg_reference_subprogram, + dbg_reference_file=dbg_reference_file, + dbg_reference_location=dbg_reference_location, + column=column_base, + ) + ) + else: + dwarf_subprograms.append( + DWARF_FUNCTION.substitute( + idx=idx, + dbg_reference_subprogram=dbg_reference_subprogram, + dbg_reference_file=dbg_reference_file, + dbg_reference_location=dbg_reference_location, + column=column_base + math.floor(math.log10(idx)), + ) + ) + + dbg_reference_subprogram += 5 if first else 3 + dbg_reference_file += 5 if first else 3 + dbg_reference_location += 3 + first = False + +dbg_reference_location = dbg_reference_subprogram + 1 +line_number = 258 +all_function = [] +dwarf_all_subprogram = [] + +all_function.append( + ALL_FUNCTION_PROLOGUE.substitute( + dbg_reference_subprogram=dbg_reference_subprogram + ) +) +dwarf_all_subprogram.append( + DWARF_ALL_FUNCTION_PROLOGUE.substitute( + dbg_reference_subprogram=dbg_reference_subprogram, + line_number=line_number + ) +) +line_number += 1 + +for idx in range(NUM_FUNCS): + all_function.append( + ALL_FUNCTION_CALL.substitute( + idx=idx, + dbg_reference_location_call=dbg_reference_location, + ) + ) + dwarf_all_subprogram.append( + DWARF_ALL_FUNCTION_LOCATION.substitute( + dbg_reference_location=dbg_reference_location, + line_number=line_number, + dbg_reference_subprogram=dbg_reference_subprogram, + ) + ) + + dbg_reference_location += 1 + line_number += 1 + +all_function.append( + ALL_FUNCTION_EPILOGUE.substitute( + dbg_reference_location_ret=dbg_reference_location + ) +) +dwarf_all_subprogram.append( + DWARF_ALL_FUNCTION_LOCATION.substitute( + dbg_reference_location=dbg_reference_location, + line_number=line_number, + dbg_reference_subprogram=dbg_reference_subprogram, + ) +) + +print(PROLOGUE.substitute()) +for function in functions: + print(function) +for all_function_piece in all_function: + print(all_function_piece, end='') +print() +print(DWARF_PROLOGUE.substitute(), end='') +for dwarf_subprogram in dwarf_subprograms: + print(dwarf_subprogram, end='') +for dwarf_all_subprogram_piece in dwarf_all_subprogram: + print(dwarf_all_subprogram_piece, end='') +print() + +#--- debug.map.template +--- +triple: 'x86_64-apple-darwin' +objects: + - filename: ---TEMPORARY_DIR---/all.o + symbols: + - { sym: _all, objAddr: 0x0, binAddr: 0x0, size: 0x0 } +... 
From 6dfeea3f865cff0c1e3ca19aa0e77f379809b2a9 Mon Sep 17 00:00:00 2001 From: Farzon Lotfi <1802579+farzonl@users.noreply.github.com> Date: Tue, 24 Sep 2024 14:11:14 -0400 Subject: [PATCH 44/49] Revert "[DirectX] Add atan2 intrinsic and expand for DXIL backend (p1)" (#109842) Reverts llvm/llvm-project#108865 Broke the Docs build --- llvm/docs/LangRef.rst | 37 -------- llvm/include/llvm/IR/Intrinsics.td | 1 - .../Target/DirectX/DXILIntrinsicExpansion.cpp | 52 ----------- llvm/test/CodeGen/DirectX/atan2.ll | 87 ------------------- llvm/test/CodeGen/DirectX/atan2_error.ll | 11 --- 5 files changed, 188 deletions(-) delete mode 100644 llvm/test/CodeGen/DirectX/atan2.ll delete mode 100644 llvm/test/CodeGen/DirectX/atan2_error.ll diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst index 41d1efab752fd7..91c3e60bb0acb1 100644 --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -15583,43 +15583,6 @@ trapping or setting ``errno``. When specified with the fast-math-flag 'afn', the result may be approximated using a less accurate calculation. -'``llvm.atan2.*``' Intrinsic -^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -Syntax: -""""""" - -This is an overloaded intrinsic. You can use ``llvm.atan2`` on any -floating-point or vector of floating-point type. Not all targets support -all types however. - -:: - - declare float @llvm.atan2.f32(float %X, float %Y) - declare double @llvm.atan2.f64(double %X, double %Y) - declare x86_fp80 @llvm.atan2.f80(x86_fp80 %X, x86_fp80 %Y) - declare fp128 @llvm.atan2.f128(fp128 %X, fp128 %Y) - declare ppc_fp128 @llvm.atan2.ppcf128(ppc_fp128 %X, ppc_fp128 %Y) - -Overview: -""""""""" - -The '``llvm.atan2.*``' intrinsics return the arctangent of the operand. - -Arguments: -"""""""""" - -The arguments and return value are floating-point numbers of the same type. - -Semantics: -"""""""""" - -Return the same value as a corresponding libm '``atan2``' function but without -trapping or setting ``errno``. - -When specified with the fast-math-flag 'afn', the result may be approximated -using a less accurate calculation. 
- '``llvm.sinh.*``' Intrinsic ^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 48d57907e6d0bc..0a74a217a5f010 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1016,7 +1016,6 @@ let IntrProperties = [IntrNoMem, IntrSpeculatable, IntrWillReturn] in { def int_asin : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_acos : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_atan : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; - def int_atan2 : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>, LLVMMatchType<0>]>; def int_sin : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_cos : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; def int_tan : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>]>; diff --git a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp index 926cbe97f24fda..dd73b895b14d37 100644 --- a/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp +++ b/llvm/lib/Target/DirectX/DXILIntrinsicExpansion.cpp @@ -36,7 +36,6 @@ using namespace llvm; static bool isIntrinsicExpansion(Function &F) { switch (F.getIntrinsicID()) { case Intrinsic::abs: - case Intrinsic::atan2: case Intrinsic::exp: case Intrinsic::log: case Intrinsic::log10: @@ -308,54 +307,6 @@ static Value *expandNormalizeIntrinsic(CallInst *Orig) { return Builder.CreateFMul(X, MultiplicandVec); } -static Value *expandAtan2Intrinsic(CallInst *Orig) { - Value *Y = Orig->getOperand(0); - Value *X = Orig->getOperand(1); - Type *Ty = X->getType(); - IRBuilder<> Builder(Orig); - Builder.setFastMathFlags(Orig->getFastMathFlags()); - - Value *Tan = Builder.CreateFDiv(Y, X); - - CallInst *Atan = - Builder.CreateIntrinsic(Ty, Intrinsic::atan, {Tan}, nullptr, "Elt.Atan"); - Atan->setTailCall(Orig->isTailCall()); - Atan->setAttributes(Orig->getAttributes()); - - // Modify atan result based on https://en.wikipedia.org/wiki/Atan2. - Constant *Pi = ConstantFP::get(Ty, llvm::numbers::pi); - Constant *HalfPi = ConstantFP::get(Ty, llvm::numbers::pi / 2); - Constant *NegHalfPi = ConstantFP::get(Ty, -llvm::numbers::pi / 2); - Constant *Zero = ConstantFP::get(Ty, 0); - Value *AtanAddPi = Builder.CreateFAdd(Atan, Pi); - Value *AtanSubPi = Builder.CreateFSub(Atan, Pi); - - // x > 0 -> atan. - Value *Result = Atan; - Value *XLt0 = Builder.CreateFCmpOLT(X, Zero); - Value *XEq0 = Builder.CreateFCmpOEQ(X, Zero); - Value *YGe0 = Builder.CreateFCmpOGE(Y, Zero); - Value *YLt0 = Builder.CreateFCmpOLT(Y, Zero); - - // x < 0, y >= 0 -> atan + pi. - Value *XLt0AndYGe0 = Builder.CreateAnd(XLt0, YGe0); - Result = Builder.CreateSelect(XLt0AndYGe0, AtanAddPi, Result); - - // x < 0, y < 0 -> atan - pi. 
- Value *XLt0AndYLt0 = Builder.CreateAnd(XLt0, YLt0); - Result = Builder.CreateSelect(XLt0AndYLt0, AtanSubPi, Result); - - // x == 0, y < 0 -> -pi/2 - Value *XEq0AndYLt0 = Builder.CreateAnd(XEq0, YLt0); - Result = Builder.CreateSelect(XEq0AndYLt0, NegHalfPi, Result); - - // x == 0, y > 0 -> pi/2 - Value *XEq0AndYGe0 = Builder.CreateAnd(XEq0, YGe0); - Result = Builder.CreateSelect(XEq0AndYGe0, HalfPi, Result); - - return Result; -} - static Value *expandPowIntrinsic(CallInst *Orig) { Value *X = Orig->getOperand(0); @@ -467,9 +418,6 @@ static bool expandIntrinsic(Function &F, CallInst *Orig) { case Intrinsic::abs: Result = expandAbs(Orig); break; - case Intrinsic::atan2: - Result = expandAtan2Intrinsic(Orig); - break; case Intrinsic::exp: Result = expandExpIntrinsic(Orig); break; diff --git a/llvm/test/CodeGen/DirectX/atan2.ll b/llvm/test/CodeGen/DirectX/atan2.ll deleted file mode 100644 index 9d86f87f3ed50e..00000000000000 --- a/llvm/test/CodeGen/DirectX/atan2.ll +++ /dev/null @@ -1,87 +0,0 @@ -; RUN: opt -S -dxil-intrinsic-expansion -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK -; RUN: opt -S -dxil-intrinsic-expansion -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK - -; Make sure correct dxil expansions for atan2 are generated for float and half. - -define noundef float @atan2_float(float noundef %y, float noundef %x) { -entry: -; CHECK: [[DIV:%.+]] = fdiv float %y, %x -; EXPCHECK: [[ATAN:%.+]] = call float @llvm.atan.f32(float [[DIV]]) -; DOPCHECK: [[ATAN:%.+]] = call float @dx.op.unary.f32(i32 17, float [[DIV]]) -; CHECK-DAG: [[ADD_PI:%.+]] = fadd float [[ATAN]], 0x400921FB60000000 -; CHECK-DAG: [[SUB_PI:%.+]] = fsub float [[ATAN]], 0x400921FB60000000 -; CHECK-DAG: [[X_LT_0:%.+]] = fcmp olt float %x, 0.000000e+00 -; CHECK-DAG: [[X_EQ_0:%.+]] = fcmp oeq float %x, 0.000000e+00 -; CHECK-DAG: [[Y_GE_0:%.+]] = fcmp oge float %y, 0.000000e+00 -; CHECK-DAG: [[Y_LT_0:%.+]] = fcmp olt float %y, 0.000000e+00 -; CHECK: [[XLT0_AND_YGE0:%.+]] = and i1 [[X_LT_0]], [[Y_GE_0]] -; CHECK: [[SELECT_ADD_PI:%.+]] = select i1 [[XLT0_AND_YGE0]], float [[ADD_PI]], float [[ATAN]] -; CHECK: [[XLT0_AND_YLT0:%.+]] = and i1 [[X_LT_0]], [[Y_LT_0]] -; CHECK: [[SELECT_SUB_PI:%.+]] = select i1 [[XLT0_AND_YLT0]], float [[SUB_PI]], float [[SELECT_ADD_PI]] -; CHECK: [[XEQ0_AND_YLT0:%.+]] = and i1 [[X_EQ_0]], [[Y_LT_0]] -; CHECK: [[SELECT_NEGHPI:%.+]] = select i1 [[XEQ0_AND_YLT0]], float 0xBFF921FB60000000, float [[SELECT_SUB_PI]] -; CHECK: [[XEQ0_AND_YGE0:%.+]] = and i1 [[X_EQ_0]], [[Y_GE_0]] -; CHECK: [[SELECT_HPI:%.+]] = select i1 [[XEQ0_AND_YGE0]], float 0x3FF921FB60000000, float [[SELECT_NEGHPI]] -; CHECK: ret float [[SELECT_HPI]] - %elt.atan2 = call float @llvm.atan2.f32(float %y, float %x) - ret float %elt.atan2 -} - -define noundef half @atan2_half(half noundef %y, half noundef %x) { -entry: -; CHECK: [[DIV:%.+]] = fdiv half %y, %x -; EXPCHECK: [[ATAN:%.+]] = call half @llvm.atan.f16(half [[DIV]]) -; DOPCHECK: [[ATAN:%.+]] = call half @dx.op.unary.f16(i32 17, half [[DIV]]) -; CHECK-DAG: [[ADD_PI:%.+]] = fadd half [[ATAN]], 0xH4248 -; CHECK-DAG: [[SUB_PI:%.+]] = fsub half [[ATAN]], 0xH4248 -; CHECK-DAG: [[X_LT_0:%.+]] = fcmp olt half %x, 0xH0000 -; CHECK-DAG: [[X_EQ_0:%.+]] = fcmp oeq half %x, 0xH0000 -; CHECK-DAG: [[Y_GE_0:%.+]] = fcmp oge half %y, 0xH0000 -; CHECK-DAG: [[Y_LT_0:%.+]] = fcmp olt half %y, 0xH0000 -; CHECK: [[XLT0_AND_YGE0:%.+]] = and i1 [[X_LT_0]], [[Y_GE_0]] -; CHECK: 
[[SELECT_ADD_PI:%.+]] = select i1 [[XLT0_AND_YGE0]], half [[ADD_PI]], half [[ATAN]] -; CHECK: [[XLT0_AND_YLT0:%.+]] = and i1 [[X_LT_0]], [[Y_LT_0]] -; CHECK: [[SELECT_SUB_PI:%.+]] = select i1 [[XLT0_AND_YLT0]], half [[SUB_PI]], half [[SELECT_ADD_PI]] -; CHECK: [[XEQ0_AND_YLT0:%.+]] = and i1 [[X_EQ_0]], [[Y_LT_0]] -; CHECK: [[SELECT_NEGHPI:%.+]] = select i1 [[XEQ0_AND_YLT0]], half 0xHBE48, half [[SELECT_SUB_PI]] -; CHECK: [[XEQ0_AND_YGE0:%.+]] = and i1 [[X_EQ_0]], [[Y_GE_0]] -; CHECK: [[SELECT_HPI:%.+]] = select i1 [[XEQ0_AND_YGE0]], half 0xH3E48, half [[SELECT_NEGHPI]] -; CHECK: ret half [[SELECT_HPI]] - %elt.atan2 = call half @llvm.atan2.f16(half %y, half %x) - ret half %elt.atan2 -} - -define noundef <4 x float> @atan2_float4(<4 x float> noundef %y, <4 x float> noundef %x) { -entry: -; Just Expansion, no scalarization or lowering: -; EXPCHECK: [[DIV:%.+]] = fdiv <4 x float> %y, %x -; EXPCHECK: [[ATAN:%.+]] = call <4 x float> @llvm.atan.v4f32(<4 x float> [[DIV]]) -; EXPCHECK-DAG: [[ADD_PI:%.+]] = fadd <4 x float> [[ATAN]], -; EXPCHECK-DAG: [[SUB_PI:%.+]] = fsub <4 x float> [[ATAN]], -; EXPCHECK-DAG: [[X_LT_0:%.+]] = fcmp olt <4 x float> %x, zeroinitializer -; EXPCHECK-DAG: [[X_EQ_0:%.+]] = fcmp oeq <4 x float> %x, zeroinitializer -; EXPCHECK-DAG: [[Y_GE_0:%.+]] = fcmp oge <4 x float> %y, zeroinitializer -; EXPCHECK-DAG: [[Y_LT_0:%.+]] = fcmp olt <4 x float> %y, zeroinitializer -; EXPCHECK: [[XLT0_AND_YGE0:%.+]] = and <4 x i1> [[X_LT_0]], [[Y_GE_0]] -; EXPCHECK: [[SELECT_ADD_PI:%.+]] = select <4 x i1> [[XLT0_AND_YGE0]], <4 x float> [[ADD_PI]], <4 x float> [[ATAN]] -; EXPCHECK: [[XLT0_AND_YLT0:%.+]] = and <4 x i1> [[X_LT_0]], [[Y_LT_0]] -; EXPCHECK: [[SELECT_SUB_PI:%.+]] = select <4 x i1> [[XLT0_AND_YLT0]], <4 x float> [[SUB_PI]], <4 x float> [[SELECT_ADD_PI]] -; EXPCHECK: [[XEQ0_AND_YLT0:%.+]] = and <4 x i1> [[X_EQ_0]], [[Y_LT_0]] -; EXPCHECK: [[SELECT_NEGHPI:%.+]] = select <4 x i1> [[XEQ0_AND_YLT0]], <4 x float> , <4 x float> [[SELECT_SUB_PI]] -; EXPCHECK: [[XEQ0_AND_YGE0:%.+]] = and <4 x i1> [[X_EQ_0]], [[Y_GE_0]] -; EXPCHECK: [[SELECT_HPI:%.+]] = select <4 x i1> [[XEQ0_AND_YGE0]], <4 x float> , <4 x float> [[SELECT_NEGHPI]] -; EXPCHECK: ret <4 x float> [[SELECT_HPI]] - -; Scalarization occurs after expansion, so atan scalarization is tested separately. -; Expansion, scalarization and lowering: -; Just make sure this expands to exactly 4 scalar DXIL atan (OpCode=17) calls. 
-; DOPCHECK-COUNT-4: call float @dx.op.unary.f32(i32 17, float %{{.*}}) -; DOPCHECK-NOT: call float @dx.op.unary.f32(i32 17, - - %elt.atan2 = call <4 x float> @llvm.atan2.v4f32(<4 x float> %y, <4 x float> %x) - ret <4 x float> %elt.atan2 -} - -declare half @llvm.atan2.f16(half, half) -declare float @llvm.atan2.f32(float, float) -declare <4 x float> @llvm.atan2.v4f32(<4 x float>, <4 x float>) diff --git a/llvm/test/CodeGen/DirectX/atan2_error.ll b/llvm/test/CodeGen/DirectX/atan2_error.ll deleted file mode 100644 index 5b3077f85f5d4e..00000000000000 --- a/llvm/test/CodeGen/DirectX/atan2_error.ll +++ /dev/null @@ -1,11 +0,0 @@ -; RUN: not opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s - -; DXIL operation atan does not support double overload type -; CHECK: in function atan2_double -; CHECK-SAME: Cannot create ATan operation: Invalid overload type - -define noundef double @atan2_double(double noundef %a, double noundef %b) #0 { -entry: - %1 = call double @llvm.atan2.f64(double %a, double %b) - ret double %1 -} From 312f73765b29db468cd282130d1b519ed3eeae96 Mon Sep 17 00:00:00 2001 From: Sam Clegg Date: Tue, 24 Sep 2024 11:14:07 -0700 Subject: [PATCH 45/49] [lld][WebAssembly] Don't report relocation error when linking with -r/--relocatable (#109822) Followup to #104926. We ran into issues on the emscripten waterfall where relocation against `__dso_handle` were being reported as errors even though `-r/--relocatable` was being used to generate object file output rather than executable output. --- lld/test/wasm/unsupported-pic-relocations.s | 6 +++++- lld/test/wasm/unsupported-pic-relocations64.s | 6 +++++- lld/wasm/Relocations.cpp | 10 +++++----- 3 files changed, 15 insertions(+), 7 deletions(-) diff --git a/lld/test/wasm/unsupported-pic-relocations.s b/lld/test/wasm/unsupported-pic-relocations.s index ea32e8468cdb4d..2f85afa02c88b1 100644 --- a/lld/test/wasm/unsupported-pic-relocations.s +++ b/lld/test/wasm/unsupported-pic-relocations.s @@ -15,6 +15,10 @@ # RUN: not wasm-ld --experimental-pic -shared %t.o -o /dev/null --unresolved-symbols=import-dynamic 2>&1 | \ # RUN: FileCheck %s +## These errors should not be reported under -r/--relocation (i.e. when +## generating an object file) +# RUN: wasm-ld --experimental-pic -r %t.o -o /dev/null + .functype external_func () -> () use_undefined_function: @@ -23,7 +27,7 @@ use_undefined_function: # CHECK: error: {{.*}}.o: relocation R_WASM_TABLE_INDEX_REL_SLEB is not supported against an undefined symbol `external_func` drop end_function - + use_undefined_data: .functype use_undefined_data () -> () i32.const external_data@MBREL diff --git a/lld/test/wasm/unsupported-pic-relocations64.s b/lld/test/wasm/unsupported-pic-relocations64.s index db9707b7fbac5e..df885b8d75fbe8 100644 --- a/lld/test/wasm/unsupported-pic-relocations64.s +++ b/lld/test/wasm/unsupported-pic-relocations64.s @@ -15,6 +15,10 @@ # RUN: not wasm-ld -mwasm64 --experimental-pic -shared %t.o -o /dev/null --unresolved-symbols=import-dynamic 2>&1 | \ # RUN: FileCheck %s +## These errors should not be reported under -r/--relocation (i.e. 
when +## generating an object file) +# RUN: wasm-ld -mwasm64 --experimental-pic -r %t.o -o /dev/null + .functype external_func () -> () use_undefined_function: @@ -23,7 +27,7 @@ use_undefined_function: # CHECK: error: {{.*}}.o: relocation R_WASM_TABLE_INDEX_REL_SLEB64 is not supported against an undefined symbol `external_func` drop end_function - + use_undefined_data: .functype use_undefined_data () -> () i64.const external_data@MBREL diff --git a/lld/wasm/Relocations.cpp b/lld/wasm/Relocations.cpp index 2dbfe335494711..45ad32701616a1 100644 --- a/lld/wasm/Relocations.cpp +++ b/lld/wasm/Relocations.cpp @@ -173,7 +173,7 @@ void scanRelocations(InputChunk *chunk) { } } - if (sym->isUndefined()) { + if (!config->relocatable && sym->isUndefined()) { switch (reloc.Type) { case R_WASM_TABLE_INDEX_REL_SLEB: case R_WASM_TABLE_INDEX_REL_SLEB64: @@ -187,11 +187,11 @@ void scanRelocations(InputChunk *chunk) { toString(*sym) + "`"); break; } - } - if (sym->isUndefined() && !config->relocatable && !sym->isWeak()) { - // Report undefined symbols - reportUndefined(file, sym); + if (!sym->isWeak()) { + // Report undefined symbols + reportUndefined(file, sym); + } } } } From a977b9460f8f007a7dedd9197adc7d76bb95b0b6 Mon Sep 17 00:00:00 2001 From: David CARLIER Date: Tue, 24 Sep 2024 19:15:50 +0100 Subject: [PATCH 46/49] =?UTF-8?q?Reapply=20[compiler-rt]=20prctl=20interce?= =?UTF-8?q?ption=20update,=20SECCOMP=5FMODE=5FFILTER=20=E2=80=A6=20(#10983?= =?UTF-8?q?4)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …support. #107722 --- .../sanitizer_common_interceptors.inc | 10 ++++++++ .../sanitizer_platform_limits_posix.cpp | 24 ++++++++++--------- .../sanitizer_platform_limits_posix.h | 3 ++- .../TestCases/Linux/prctl.cpp | 10 ++++++++ 4 files changed, 35 insertions(+), 12 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc index bf5bb43018efc4..e3a329712ac5a3 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_interceptors.inc @@ -1289,6 +1289,11 @@ INTERCEPTOR(int, prctl, int option, unsigned long arg2, unsigned long arg3, static const int PR_SCHED_CORE = 62; static const int PR_SCHED_CORE_GET = 0; static const int PR_GET_PDEATHSIG = 2; + static const int PR_SET_SECCOMP = 22; + +# if !SANITIZER_ANDROID + static const int SECCOMP_MODE_FILTER = 2; +# endif if (option == PR_SET_VMA && arg2 == 0UL) { char *name = (char *)arg5; COMMON_INTERCEPTOR_READ_RANGE(ctx, name, internal_strlen(name) + 1); @@ -1307,6 +1312,11 @@ INTERCEPTOR(int, prctl, int option, unsigned long arg2, unsigned long arg3, COMMON_INTERCEPTOR_WRITE_RANGE(ctx, (u64 *)(arg5), sizeof(u64)); } else if (res != -1 && option == PR_GET_PDEATHSIG) { COMMON_INTERCEPTOR_WRITE_RANGE(ctx, (u64 *)(arg2), sizeof(int)); +# if !SANITIZER_ANDROID + } else if (res != -1 && option == PR_SET_SECCOMP && + arg2 == SECCOMP_MODE_FILTER) { + COMMON_INTERCEPTOR_WRITE_RANGE(ctx, (u64 *)(arg3), struct_sock_fprog_sz); +# endif } return res; } diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp index 6d61d276d77e35..5eeb2a89efa8c5 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.cpp @@ -117,15 +117,16 @@ typedef struct user_fpregs 
elf_fpregset_t; #if SANITIZER_LINUX #if SANITIZER_GLIBC #include -#include -#include -#include -#include -#include -#if HAVE_RPC_XDR_H -# include -#endif -#include +# include +# include +# include +# include +# include +# include +# if HAVE_RPC_XDR_H +# include +# endif +# include #else #include #include @@ -531,9 +532,10 @@ unsigned struct_ElfW_Phdr_sz = sizeof(Elf_Phdr); unsigned struct_audio_buf_info_sz = sizeof(struct audio_buf_info); unsigned struct_ppp_stats_sz = sizeof(struct ppp_stats); -#endif // SANITIZER_GLIBC + unsigned struct_sock_fprog_sz = sizeof(struct sock_fprog); +# endif // SANITIZER_GLIBC -#if !SANITIZER_ANDROID && !SANITIZER_APPLE +# if !SANITIZER_ANDROID && !SANITIZER_APPLE unsigned struct_sioc_sg_req_sz = sizeof(struct sioc_sg_req); unsigned struct_sioc_vif_req_sz = sizeof(struct sioc_vif_req); #endif diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h index 34bfef1f7ef456..ca03841ccc1988 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_platform_limits_posix.h @@ -1050,7 +1050,8 @@ extern unsigned struct_serial_struct_sz; extern unsigned struct_sockaddr_ax25_sz; extern unsigned struct_unimapdesc_sz; extern unsigned struct_unimapinit_sz; -#endif // SANITIZER_LINUX && !SANITIZER_ANDROID +extern unsigned struct_sock_fprog_sz; +# endif // SANITIZER_LINUX && !SANITIZER_ANDROID extern const unsigned long __sanitizer_bufsiz; diff --git a/compiler-rt/test/sanitizer_common/TestCases/Linux/prctl.cpp b/compiler-rt/test/sanitizer_common/TestCases/Linux/prctl.cpp index cbff02d66efa78..dab1d1b48f8689 100644 --- a/compiler-rt/test/sanitizer_common/TestCases/Linux/prctl.cpp +++ b/compiler-rt/test/sanitizer_common/TestCases/Linux/prctl.cpp @@ -4,6 +4,8 @@ #include #include +#include +#include #include #include #include @@ -78,5 +80,13 @@ int main() { } } + sock_filter f[] = {{.code = (BPF_LD | BPF_W | BPF_ABS), + .k = (uint32_t)(SKF_AD_OFF | SKF_AD_CPU)}, + {.code = (BPF_RET | BPF_A), .k = 0}}; + sock_fprog pr = {.len = 2, .filter = f}; + + res = prctl(PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &pr); + assert(res == -1); + return 0; } From 6d3d5f30bd58af6d16d0b4b7d32dc3ead1e098ec Mon Sep 17 00:00:00 2001 From: Han-Kuan Chen Date: Wed, 25 Sep 2024 02:23:14 +0800 Subject: [PATCH 47/49] [SLP][REVEC] getWidenedType should be used instead of FixedVectorType::get. 
(#109843) reference: https://github.com/llvm/llvm-project/issues/109835 --- .../Transforms/Vectorize/SLPVectorizer.cpp | 4 +- .../SLPVectorizer/revec-fix-109835.ll | 70 +++++++++++++++++++ 2 files changed, 72 insertions(+), 2 deletions(-) create mode 100644 llvm/test/Transforms/SLPVectorizer/revec-fix-109835.ll diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 7e3dbe6260983e..b79e964cdb1b6b 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -9986,8 +9986,8 @@ class BoUpSLP::ShuffleCostEstimator : public BaseShuffleAnalysis { } Cost += ::getShuffleCost( TTI, TTI::SK_InsertSubvector, - FixedVectorType::get(ScalarTy, CommonMask.size()), {}, CostKind, - Idx, FixedVectorType::get(ScalarTy, E->getVectorFactor())); + getWidenedType(ScalarTy, CommonMask.size()), {}, CostKind, Idx, + getWidenedType(ScalarTy, E->getVectorFactor())); if (!CommonMask.empty()) { std::iota(std::next(CommonMask.begin(), Idx), std::next(CommonMask.begin(), Idx + E->getVectorFactor()), diff --git a/llvm/test/Transforms/SLPVectorizer/revec-fix-109835.ll b/llvm/test/Transforms/SLPVectorizer/revec-fix-109835.ll new file mode 100644 index 00000000000000..965bfc7074c638 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/revec-fix-109835.ll @@ -0,0 +1,70 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -passes=slp-vectorizer -S -slp-revec %s | FileCheck %s + +@b = external dso_local local_unnamed_addr global i64, align 8 +@d = external dso_local local_unnamed_addr global i32, align 4 +@c = external dso_local local_unnamed_addr global i32, align 4 +@a = external dso_local local_unnamed_addr global i8, align 2 + +define void @e() { +; CHECK-LABEL: @e( +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: [[C_PROMOTED5:%.*]] = load i32, ptr @c, align 4 +; CHECK-NEXT: [[A_PROMOTED7:%.*]] = load i8, ptr @a, align 2 +; CHECK-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[C_PROMOTED5]], i64 0 +; CHECK-NEXT: [[DOTSPLAT:%.*]] = shufflevector <16 x i32> [[DOTSPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <16 x i8> , i8 [[A_PROMOTED7]], i64 0 +; CHECK-NEXT: [[TMP1:%.*]] = add <16 x i32> [[DOTSPLAT]], +; CHECK-NEXT: [[TMP2:%.*]] = add <16 x i32> [[DOTSPLAT]], +; CHECK-NEXT: [[TMP3:%.*]] = add <16 x i32> [[DOTSPLAT]], +; CHECK-NEXT: [[INDUCTION:%.*]] = add <16 x i32> [[DOTSPLAT]], +; CHECK-NEXT: [[TMP4:%.*]] = icmp ult <16 x i32> [[TMP1]], +; CHECK-NEXT: [[TMP5:%.*]] = icmp ult <16 x i32> [[TMP2]], +; CHECK-NEXT: [[TMP6:%.*]] = icmp ult <16 x i32> [[TMP3]], +; CHECK-NEXT: [[TMP7:%.*]] = icmp ult <16 x i32> [[INDUCTION]], +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq <16 x i32> [[DOTSPLAT]], +; CHECK-NEXT: [[TMP9:%.*]] = or <16 x i1> [[TMP4]], [[TMP5]] +; CHECK-NEXT: [[TMP10:%.*]] = or <16 x i1> [[TMP9]], [[TMP6]] +; CHECK-NEXT: [[TMP11:%.*]] = or <16 x i1> [[TMP10]], [[TMP7]] +; CHECK-NEXT: [[TMP12:%.*]] = or <16 x i1> [[TMP11]], [[TMP8]] +; CHECK-NEXT: [[TMP13:%.*]] = zext <16 x i1> [[TMP12]] to <16 x i8> +; CHECK-NEXT: [[TMP14:%.*]] = or <16 x i8> [[TMP0]], [[TMP13]] +; CHECK-NEXT: [[TMP15:%.*]] = shufflevector <16 x i8> [[TMP14]], <16 x i8> , <16 x i32> +; CHECK-NEXT: [[TMP16:%.*]] = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> [[TMP15]]) +; CHECK-NEXT: [[TMP17:%.*]] = add i32 [[C_PROMOTED5]], 81 +; CHECK-NEXT: store i64 -1, ptr @b, align 8 +; CHECK-NEXT: store i32 9, ptr @d, align 4 +; CHECK-NEXT: 
store i32 [[TMP17]], ptr @c, align 4 +; CHECK-NEXT: store i8 [[TMP16]], ptr @a, align 2 +; CHECK-NEXT: ret void +; +vector.ph: + %c.promoted5 = load i32, ptr @c, align 4 + %a.promoted7 = load i8, ptr @a, align 2 + %.splatinsert = insertelement <16 x i32> poison, i32 %c.promoted5, i64 0 + %.splat = shufflevector <16 x i32> %.splatinsert, <16 x i32> poison, <16 x i32> zeroinitializer + %0 = insertelement <16 x i8> , i8 %a.promoted7, i64 0 + %1 = add <16 x i32> %.splat, + %2 = add <16 x i32> %.splat, + %3 = add <16 x i32> %.splat, + %induction = add <16 x i32> %.splat, + %4 = icmp ult <16 x i32> %1, + %5 = icmp ult <16 x i32> %2, + %6 = icmp ult <16 x i32> %3, + %7 = icmp ult <16 x i32> %induction, + %8 = icmp eq <16 x i32> %.splat, + %9 = or <16 x i1> %4, %5 + %10 = or <16 x i1> %9, %6 + %11 = or <16 x i1> %10, %7 + %12 = or <16 x i1> %11, %8 + %13 = zext <16 x i1> %12 to <16 x i8> + %14 = or <16 x i8> %0, %13 + %15 = shufflevector <16 x i8> %14, <16 x i8> , <16 x i32> + %16 = call i8 @llvm.vector.reduce.or.v16i8(<16 x i8> %15) + %17 = add i32 %c.promoted5, 81 + store i64 -1, ptr @b, align 8 + store i32 9, ptr @d, align 4 + store i32 %17, ptr @c, align 4 + store i8 %16, ptr @a, align 2 + ret void +} From 71ca9fcb8dc9ea0e1e3a4a47820edc78c398a85e Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 24 Sep 2024 22:32:53 +0400 Subject: [PATCH 48/49] llvm-reduce: Don't print verifier failed machine functions (#109673) This produces far too much terminal output, particularly for the instruction reduction. Since it doesn't consider the liveness of of the instructions it's deleting, it produces quite a lot of verifier errors. --- llvm/include/llvm/CodeGen/MachineFunction.h | 5 +- llvm/lib/CodeGen/MIRParser/MIRParser.cpp | 2 +- llvm/lib/CodeGen/MachineBlockPlacement.cpp | 2 +- llvm/lib/CodeGen/MachineScheduler.cpp | 8 +- llvm/lib/CodeGen/MachineVerifier.cpp | 232 ++++++++++---------- llvm/lib/CodeGen/RegAllocGreedy.cpp | 10 +- llvm/lib/CodeGen/RegisterCoalescer.cpp | 4 +- llvm/tools/llvm-reduce/ReducerWorkItem.cpp | 22 +- llvm/unittests/MI/LiveIntervalTest.cpp | 4 +- 9 files changed, 159 insertions(+), 130 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachineFunction.h b/llvm/include/llvm/CodeGen/MachineFunction.h index aeb72ca24d79b8..5c1da4fa762e84 100644 --- a/llvm/include/llvm/CodeGen/MachineFunction.h +++ b/llvm/include/llvm/CodeGen/MachineFunction.h @@ -897,13 +897,14 @@ class LLVM_ABI MachineFunction { /// for debugger use. /// \returns true if no problems were found. bool verify(Pass *p = nullptr, const char *Banner = nullptr, - bool AbortOnError = true) const; + raw_ostream *OS = nullptr, bool AbortOnError = true) const; /// Run the current MachineFunction through the machine code verifier, useful /// for debugger use. /// \returns true if no problems were found. bool verify(LiveIntervals *LiveInts, SlotIndexes *Indexes, - const char *Banner = nullptr, bool AbortOnError = true) const; + const char *Banner = nullptr, raw_ostream *OS = nullptr, + bool AbortOnError = true) const; // Provide accessors for the MachineBasicBlock list... 
using iterator = BasicBlockListType::iterator; diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp index 8d6d800d761474..3a00b8ec4771dd 100644 --- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp @@ -634,7 +634,7 @@ MIRParserImpl::initializeMachineFunction(const yaml::MachineFunction &YamlMF, MF.getSubtarget().mirFileLoaded(MF); - MF.verify(); + MF.verify(nullptr, nullptr, &errs()); return false; } diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp index 7dcb287b387343..a52c82d77ca644 100644 --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -3698,7 +3698,7 @@ void MachineBlockPlacement::assignBlockOrder( #ifndef NDEBUG // Make sure we correctly constructed all branches. - F->verify(this, "After optimized block reordering"); + F->verify(this, "After optimized block reordering", &errs()); #endif } diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp index 4e6d34346b1d80..9b2862de22b690 100644 --- a/llvm/lib/CodeGen/MachineScheduler.cpp +++ b/llvm/lib/CodeGen/MachineScheduler.cpp @@ -453,7 +453,7 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) { if (VerifyScheduling) { LLVM_DEBUG(LIS->dump()); - MF->verify(this, "Before machine scheduling."); + MF->verify(this, "Before machine scheduling.", &errs()); } RegClassInfo->runOnMachineFunction(*MF); @@ -472,7 +472,7 @@ bool MachineScheduler::runOnMachineFunction(MachineFunction &mf) { LLVM_DEBUG(LIS->dump()); if (VerifyScheduling) - MF->verify(this, "After machine scheduling."); + MF->verify(this, "After machine scheduling.", &errs()); return true; } @@ -496,7 +496,7 @@ bool PostMachineScheduler::runOnMachineFunction(MachineFunction &mf) { AA = &getAnalysis().getAAResults(); if (VerifyScheduling) - MF->verify(this, "Before post machine scheduling."); + MF->verify(this, "Before post machine scheduling.", &errs()); // Instantiate the selected scheduler for this target, function, and // optimization level. @@ -512,7 +512,7 @@ bool PostMachineScheduler::runOnMachineFunction(MachineFunction &mf) { scheduleRegions(*Scheduler, true); if (VerifyScheduling) - MF->verify(this, "After post machine scheduling."); + MF->verify(this, "After post machine scheduling.", &errs()); return true; } diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp index e1295ec8ea6e9a..24a0f41775cc1d 100644 --- a/llvm/lib/CodeGen/MachineVerifier.cpp +++ b/llvm/lib/CodeGen/MachineVerifier.cpp @@ -94,21 +94,24 @@ using namespace llvm; namespace { struct MachineVerifier { - MachineVerifier(MachineFunctionAnalysisManager &MFAM, const char *b) - : MFAM(&MFAM), Banner(b) {} + MachineVerifier(MachineFunctionAnalysisManager &MFAM, const char *b, + raw_ostream *OS) + : MFAM(&MFAM), OS(OS ? *OS : nulls()), Banner(b) {} - MachineVerifier(Pass *pass, const char *b) : PASS(pass), Banner(b) {} + MachineVerifier(Pass *pass, const char *b, raw_ostream *OS) + : PASS(pass), OS(OS ? *OS : nulls()), Banner(b) {} MachineVerifier(const char *b, LiveVariables *LiveVars, LiveIntervals *LiveInts, LiveStacks *LiveStks, - SlotIndexes *Indexes) - : Banner(b), LiveVars(LiveVars), LiveInts(LiveInts), LiveStks(LiveStks), - Indexes(Indexes) {} + SlotIndexes *Indexes, raw_ostream *OS) + : OS(OS ? 
*OS : nulls()), Banner(b), LiveVars(LiveVars), + LiveInts(LiveInts), LiveStks(LiveStks), Indexes(Indexes) {} unsigned verify(const MachineFunction &MF); MachineFunctionAnalysisManager *MFAM = nullptr; Pass *const PASS = nullptr; + raw_ostream &OS; const char *Banner; const MachineFunction *MF = nullptr; const TargetMachine *TM = nullptr; @@ -334,7 +337,8 @@ namespace { MachineFunctionProperties::Property::FailsVerification)) return false; - unsigned FoundErrors = MachineVerifier(this, Banner.c_str()).verify(MF); + unsigned FoundErrors = + MachineVerifier(this, Banner.c_str(), &errs()).verify(MF); if (FoundErrors) report_fatal_error("Found "+Twine(FoundErrors)+" machine code errors."); return false; @@ -352,7 +356,8 @@ MachineVerifierPass::run(MachineFunction &MF, if (MF.getProperties().hasProperty( MachineFunctionProperties::Property::FailsVerification)) return PreservedAnalyses::all(); - unsigned FoundErrors = MachineVerifier(MFAM, Banner.c_str()).verify(MF); + unsigned FoundErrors = + MachineVerifier(MFAM, Banner.c_str(), &errs()).verify(MF); if (FoundErrors) report_fatal_error("Found " + Twine(FoundErrors) + " machine code errors."); return PreservedAnalyses::all(); @@ -374,25 +379,28 @@ void llvm::verifyMachineFunction(const std::string &Banner, // LiveIntervals *LiveInts; // LiveStacks *LiveStks; // SlotIndexes *Indexes; - unsigned FoundErrors = MachineVerifier(nullptr, Banner.c_str()).verify(MF); + unsigned FoundErrors = + MachineVerifier(nullptr, Banner.c_str(), &errs()).verify(MF); if (FoundErrors) report_fatal_error("Found " + Twine(FoundErrors) + " machine code errors."); } -bool MachineFunction::verify(Pass *p, const char *Banner, bool AbortOnErrors) - const { +bool MachineFunction::verify(Pass *p, const char *Banner, raw_ostream *OS, + bool AbortOnErrors) const { MachineFunction &MF = const_cast(*this); - unsigned FoundErrors = MachineVerifier(p, Banner).verify(MF); + unsigned FoundErrors = MachineVerifier(p, Banner, OS).verify(MF); if (AbortOnErrors && FoundErrors) report_fatal_error("Found "+Twine(FoundErrors)+" machine code errors."); return FoundErrors == 0; } bool MachineFunction::verify(LiveIntervals *LiveInts, SlotIndexes *Indexes, - const char *Banner, bool AbortOnErrors) const { + const char *Banner, raw_ostream *OS, + bool AbortOnErrors) const { MachineFunction &MF = const_cast(*this); unsigned FoundErrors = - MachineVerifier(Banner, nullptr, LiveInts, nullptr, Indexes).verify(MF); + MachineVerifier(Banner, nullptr, LiveInts, nullptr, Indexes, OS) + .verify(MF); if (AbortOnErrors && FoundErrors) report_fatal_error("Found " + Twine(FoundErrors) + " machine code errors."); return FoundErrors == 0; @@ -482,7 +490,7 @@ unsigned MachineVerifier::verify(const MachineFunction &MF) { for (const MachineInstr &MI : MBB.instrs()) { if (MI.getParent() != &MBB) { report("Bad instruction parent pointer", &MBB); - errs() << "Instruction: " << MI; + OS << "Instruction: " << MI; continue; } @@ -540,46 +548,48 @@ unsigned MachineVerifier::verify(const MachineFunction &MF) { void MachineVerifier::report(const char *msg, const MachineFunction *MF) { assert(MF); - errs() << '\n'; + OS << '\n'; if (!foundErrors++) { if (Banner) - errs() << "# " << Banner << '\n'; + OS << "# " << Banner << '\n'; + if (LiveInts != nullptr) - LiveInts->print(errs()); + LiveInts->print(OS); else - MF->print(errs(), Indexes); + MF->print(OS, Indexes); } - errs() << "*** Bad machine code: " << msg << " ***\n" - << "- function: " << MF->getName() << "\n"; + + OS << "*** Bad machine code: " << msg << " ***\n" + << 
"- function: " << MF->getName() << '\n'; } void MachineVerifier::report(const char *msg, const MachineBasicBlock *MBB) { assert(MBB); report(msg, MBB->getParent()); - errs() << "- basic block: " << printMBBReference(*MBB) << ' ' - << MBB->getName() << " (" << (const void *)MBB << ')'; + OS << "- basic block: " << printMBBReference(*MBB) << ' ' << MBB->getName() + << " (" << (const void *)MBB << ')'; if (Indexes) - errs() << " [" << Indexes->getMBBStartIdx(MBB) - << ';' << Indexes->getMBBEndIdx(MBB) << ')'; - errs() << '\n'; + OS << " [" << Indexes->getMBBStartIdx(MBB) << ';' + << Indexes->getMBBEndIdx(MBB) << ')'; + OS << '\n'; } void MachineVerifier::report(const char *msg, const MachineInstr *MI) { assert(MI); report(msg, MI->getParent()); - errs() << "- instruction: "; + OS << "- instruction: "; if (Indexes && Indexes->hasIndex(*MI)) - errs() << Indexes->getInstructionIndex(*MI) << '\t'; - MI->print(errs(), /*IsStandalone=*/true); + OS << Indexes->getInstructionIndex(*MI) << '\t'; + MI->print(OS, /*IsStandalone=*/true); } void MachineVerifier::report(const char *msg, const MachineOperand *MO, unsigned MONum, LLT MOVRegType) { assert(MO); report(msg, MO->getParent()); - errs() << "- operand " << MONum << ": "; - MO->print(errs(), MOVRegType, TRI); - errs() << "\n"; + OS << "- operand " << MONum << ": "; + MO->print(OS, MOVRegType, TRI); + OS << '\n'; } void MachineVerifier::report(const Twine &Msg, const MachineInstr *MI) { @@ -587,11 +597,11 @@ void MachineVerifier::report(const Twine &Msg, const MachineInstr *MI) { } void MachineVerifier::report_context(SlotIndex Pos) const { - errs() << "- at: " << Pos << '\n'; + OS << "- at: " << Pos << '\n'; } void MachineVerifier::report_context(const LiveInterval &LI) const { - errs() << "- interval: " << LI << '\n'; + OS << "- interval: " << LI << '\n'; } void MachineVerifier::report_context(const LiveRange &LR, Register VRegUnit, @@ -603,35 +613,35 @@ void MachineVerifier::report_context(const LiveRange &LR, Register VRegUnit, } void MachineVerifier::report_context(const LiveRange::Segment &S) const { - errs() << "- segment: " << S << '\n'; + OS << "- segment: " << S << '\n'; } void MachineVerifier::report_context(const VNInfo &VNI) const { - errs() << "- ValNo: " << VNI.id << " (def " << VNI.def << ")\n"; + OS << "- ValNo: " << VNI.id << " (def " << VNI.def << ")\n"; } void MachineVerifier::report_context_liverange(const LiveRange &LR) const { - errs() << "- liverange: " << LR << '\n'; + OS << "- liverange: " << LR << '\n'; } void MachineVerifier::report_context(MCPhysReg PReg) const { - errs() << "- p. register: " << printReg(PReg, TRI) << '\n'; + OS << "- p. register: " << printReg(PReg, TRI) << '\n'; } void MachineVerifier::report_context_vreg(Register VReg) const { - errs() << "- v. register: " << printReg(VReg, TRI) << '\n'; + OS << "- v. 
register: " << printReg(VReg, TRI) << '\n'; } void MachineVerifier::report_context_vreg_regunit(Register VRegOrUnit) const { if (VRegOrUnit.isVirtual()) { report_context_vreg(VRegOrUnit); } else { - errs() << "- regunit: " << printRegUnit(VRegOrUnit, TRI) << '\n'; + OS << "- regunit: " << printRegUnit(VRegOrUnit, TRI) << '\n'; } } void MachineVerifier::report_context_lanemask(LaneBitmask LaneMask) const { - errs() << "- lanemask: " << PrintLaneMask(LaneMask) << '\n'; + OS << "- lanemask: " << PrintLaneMask(LaneMask) << '\n'; } void MachineVerifier::markReachable(const MachineBasicBlock *MBB) { @@ -710,8 +720,8 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { report("MBB has successor that isn't part of the function.", MBB); if (!MBBInfoMap[succ].Preds.count(MBB)) { report("Inconsistent CFG", MBB); - errs() << "MBB is not in the predecessor list of the successor " - << printMBBReference(*succ) << ".\n"; + OS << "MBB is not in the predecessor list of the successor " + << printMBBReference(*succ) << ".\n"; } } @@ -721,8 +731,8 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) { report("MBB has predecessor that isn't part of the function.", MBB); if (!MBBInfoMap[Pred].Succs.count(MBB)) { report("Inconsistent CFG", MBB); - errs() << "MBB is not in the successor list of the predecessor " - << printMBBReference(*Pred) << ".\n"; + OS << "MBB is not in the successor list of the predecessor " + << printMBBReference(*Pred) << ".\n"; } } @@ -880,7 +890,7 @@ void MachineVerifier::visitMachineBundleBefore(const MachineInstr *MI) { SlotIndex idx = Indexes->getInstructionIndex(*MI); if (!(idx > lastIndex)) { report("Instruction index out of order", MI); - errs() << "Last instruction was at " << lastIndex << '\n'; + OS << "Last instruction was at " << lastIndex << '\n'; } lastIndex = idx; } @@ -894,7 +904,7 @@ void MachineVerifier::visitMachineBundleBefore(const MachineInstr *MI) { // precede non-terminators. if (FirstTerminator->getOpcode() != TargetOpcode::G_INVOKE_REGION_START) { report("Non-terminator instruction after the first terminator", MI); - errs() << "First terminator was:\t" << *FirstTerminator; + OS << "First terminator was:\t" << *FirstTerminator; } } } @@ -2185,8 +2195,8 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) { const MCInstrDesc &MCID = MI->getDesc(); if (MI->getNumOperands() < MCID.getNumOperands()) { report("Too few operands", MI); - errs() << MCID.getNumOperands() << " operands expected, but " - << MI->getNumOperands() << " given.\n"; + OS << MCID.getNumOperands() << " operands expected, but " + << MI->getNumOperands() << " given.\n"; } if (MI->getFlag(MachineInstr::NoConvergent) && !MCID.isConvergent()) @@ -2278,7 +2288,7 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) { // If both types are valid, check that the types are the same. 
if (SrcTy != DstTy) { report("Copy Instruction is illegal with mismatching types", MI); - errs() << "Def = " << DstTy << ", Src = " << SrcTy << "\n"; + OS << "Def = " << DstTy << ", Src = " << SrcTy << '\n'; } break; @@ -2322,8 +2332,7 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) { if (SrcSize.isNonZero() && DstSize.isNonZero() && SrcSize != DstSize) { if (!DstOp.getSubReg() && !SrcOp.getSubReg()) { report("Copy Instruction is illegal with mismatching sizes", MI); - errs() << "Def Size = " << DstSize << ", Src Size = " << SrcSize - << "\n"; + OS << "Def Size = " << DstSize << ", Src Size = " << SrcSize << '\n'; } } break; @@ -2554,8 +2563,8 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { TII->getRegClass(MCID, MONum, TRI, *MF)) { if (!DRC->contains(Reg)) { report("Illegal physical register for instruction", MO, MONum); - errs() << printReg(Reg, TRI) << " is not a " - << TRI->getRegClassName(DRC) << " register.\n"; + OS << printReg(Reg, TRI) << " is not a " + << TRI->getRegClassName(DRC) << " register.\n"; } } } @@ -2618,9 +2627,9 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { RBI->getMaximumSize(RegBank->getID()) < Ty.getSizeInBits()) { report("Register bank is too small for virtual register", MO, MONum); - errs() << "Register bank " << RegBank->getName() << " too small(" - << RBI->getMaximumSize(RegBank->getID()) << ") to fit " - << Ty.getSizeInBits() << "-bits\n"; + OS << "Register bank " << RegBank->getName() << " too small(" + << RBI->getMaximumSize(RegBank->getID()) << ") to fit " + << Ty.getSizeInBits() << "-bits\n"; return; } } @@ -2639,10 +2648,9 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { TII->getRegClass(MCID, MONum, TRI, *MF)) { report("Virtual register does not match instruction constraint", MO, MONum); - errs() << "Expect register class " - << TRI->getRegClassName( - TII->getRegClass(MCID, MONum, TRI, *MF)) - << " but got nothing\n"; + OS << "Expect register class " + << TRI->getRegClassName(TII->getRegClass(MCID, MONum, TRI, *MF)) + << " but got nothing\n"; return; } @@ -2653,14 +2661,14 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { TRI->getSubClassWithSubReg(RC, SubIdx); if (!SRC) { report("Invalid subregister index for virtual register", MO, MONum); - errs() << "Register class " << TRI->getRegClassName(RC) - << " does not support subreg index " << SubIdx << "\n"; + OS << "Register class " << TRI->getRegClassName(RC) + << " does not support subreg index " << SubIdx << '\n'; return; } if (RC != SRC) { report("Invalid register class for subregister index", MO, MONum); - errs() << "Register class " << TRI->getRegClassName(RC) - << " does not fully support subreg index " << SubIdx << "\n"; + OS << "Register class " << TRI->getRegClassName(RC) + << " does not fully support subreg index " << SubIdx << '\n'; return; } } @@ -2682,9 +2690,9 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { } if (!RC->hasSuperClassEq(DRC)) { report("Illegal virtual register for instruction", MO, MONum); - errs() << "Expected a " << TRI->getRegClassName(DRC) - << " register, but got a " << TRI->getRegClassName(RC) - << " register\n"; + OS << "Expected a " << TRI->getRegClassName(DRC) + << " register, but got a " << TRI->getRegClassName(RC) + << " register\n"; } } } @@ -2733,11 +2741,11 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) { } if (loads && 
!LI.liveAt(Idx.getRegSlot(true))) { report("Instruction loads from dead spill slot", MO, MONum); - errs() << "Live stack: " << LI << '\n'; + OS << "Live stack: " << LI << '\n'; } if (stores && !LI.liveAt(Idx.getRegSlot())) { report("Instruction stores to dead spill slot", MO, MONum); - errs() << "Live stack: " << LI << '\n'; + OS << "Live stack: " << LI << '\n'; } } break; @@ -3050,8 +3058,8 @@ MachineVerifier::visitMachineBasicBlockAfter(const MachineBasicBlock *MBB) { SlotIndex stop = Indexes->getMBBEndIdx(MBB); if (!(stop > lastIndex)) { report("Block ends before last instruction index", MBB); - errs() << "Block ends at " << stop - << " last instruction was at " << lastIndex << '\n'; + OS << "Block ends at " << stop << " last instruction was at " << lastIndex + << '\n'; } lastIndex = stop; } @@ -3296,8 +3304,8 @@ void MachineVerifier::checkPHIOps(const MachineBasicBlock &MBB) { for (MachineBasicBlock *Pred : MBB.predecessors()) { if (!seen.count(Pred)) { report("Missing PHI operand", &Phi); - errs() << printMBBReference(*Pred) - << " is a predecessor according to the CFG.\n"; + OS << printMBBReference(*Pred) + << " is a predecessor according to the CFG.\n"; } } } @@ -3306,9 +3314,10 @@ void MachineVerifier::checkPHIOps(const MachineBasicBlock &MBB) { static void verifyConvergenceControl(const MachineFunction &MF, MachineDominatorTree &DT, - std::function FailureCB) { + std::function FailureCB, + raw_ostream &OS) { MachineConvergenceVerifier CV; - CV.initialize(&errs(), FailureCB, MF); + CV.initialize(&OS, FailureCB, MF); for (const auto &MBB : MF) { CV.visit(MBB); @@ -3326,7 +3335,7 @@ void MachineVerifier::visitMachineFunctionAfter() { auto FailureCB = [this](const Twine &Message) { report(Message.str().c_str(), MF); }; - verifyConvergenceControl(*MF, DT, FailureCB); + verifyConvergenceControl(*MF, DT, FailureCB, OS); calcRegsPassed(); @@ -3342,8 +3351,8 @@ void MachineVerifier::visitMachineFunctionAfter() { for (Register VReg : MInfo.vregsRequired) if (MInfo.regsKilled.count(VReg)) { report("Virtual register killed in block, but needed live out.", &MBB); - errs() << "Virtual register " << printReg(VReg) - << " is used after the block.\n"; + OS << "Virtual register " << printReg(VReg) + << " is used after the block.\n"; } } @@ -3379,9 +3388,8 @@ void MachineVerifier::visitMachineFunctionAfter() { if (!PInfo.regsLiveOut.count(LiveInReg)) { report("Live in register not found to be live out from predecessor.", &MBB); - errs() << TRI->getName(LiveInReg) - << " not found to be live out from " - << printMBBReference(*Pred) << "\n"; + OS << TRI->getName(LiveInReg) << " not found to be live out from " + << printMBBReference(*Pred) << '\n'; } } } @@ -3418,14 +3426,14 @@ void MachineVerifier::verifyLiveVariables() { if (MInfo.vregsRequired.count(Reg)) { if (!VI.AliveBlocks.test(MBB.getNumber())) { report("LiveVariables: Block missing from AliveBlocks", &MBB); - errs() << "Virtual register " << printReg(Reg) - << " must be live through the block.\n"; + OS << "Virtual register " << printReg(Reg) + << " must be live through the block.\n"; } } else { if (VI.AliveBlocks.test(MBB.getNumber())) { report("LiveVariables: Block should not be in AliveBlocks", &MBB); - errs() << "Virtual register " << printReg(Reg) - << " is not needed live through the block.\n"; + OS << "Virtual register " << printReg(Reg) + << " is not needed live through the block.\n"; } } } @@ -3443,7 +3451,7 @@ void MachineVerifier::verifyLiveIntervals() { if (!LiveInts->hasInterval(Reg)) { report("Missing live interval for virtual 
register", MF); - errs() << printReg(Reg, TRI) << " still has defs or uses\n"; + OS << printReg(Reg, TRI) << " still has defs or uses\n"; continue; } @@ -3755,9 +3763,9 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, report("Register not marked live out of predecessor", Pred); report_context(LR, Reg, LaneMask); report_context(*VNI); - errs() << " live into " << printMBBReference(*MFI) << '@' - << LiveInts->getMBBStartIdx(&*MFI) << ", not live before " - << PEnd << '\n'; + OS << " live into " << printMBBReference(*MFI) << '@' + << LiveInts->getMBBStartIdx(&*MFI) << ", not live before " << PEnd + << '\n'; continue; } @@ -3765,10 +3773,10 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR, if (!IsPHI && PVNI != VNI) { report("Different value live out of predecessor", Pred); report_context(LR, Reg, LaneMask); - errs() << "Valno #" << PVNI->id << " live out of " - << printMBBReference(*Pred) << '@' << PEnd << "\nValno #" - << VNI->id << " live into " << printMBBReference(*MFI) << '@' - << LiveInts->getMBBStartIdx(&*MFI) << '\n'; + OS << "Valno #" << PVNI->id << " live out of " + << printMBBReference(*Pred) << '@' << PEnd << "\nValno #" << VNI->id + << " live into " << printMBBReference(*MFI) << '@' + << LiveInts->getMBBStartIdx(&*MFI) << '\n'; } } if (&*MFI == EndMBB) @@ -3823,11 +3831,11 @@ void MachineVerifier::verifyLiveInterval(const LiveInterval &LI) { report("Multiple connected components in live interval", MF); report_context(LI); for (unsigned comp = 0; comp != NumComp; ++comp) { - errs() << comp << ": valnos"; + OS << comp << ": valnos"; for (const VNInfo *I : LI.valnos) if (comp == ConEQ.getEqClass(I)) - errs() << ' ' << I->id; - errs() << '\n'; + OS << ' ' << I->id; + OS << '\n'; } } } @@ -3889,9 +3897,9 @@ void MachineVerifier::verifyStackFrame() { report("Call frame size on entry does not match value computed from " "predecessor", MBB); - errs() << "Call frame size on entry " << MBB->getCallFrameSize() - << " does not match value computed from predecessor " - << -BBState.EntryValue << '\n'; + OS << "Call frame size on entry " << MBB->getCallFrameSize() + << " does not match value computed from predecessor " + << -BBState.EntryValue << '\n'; } // Update stack state by checking contents of MBB. 
@@ -3914,8 +3922,8 @@ void MachineVerifier::verifyStackFrame() { BBState.ExitValue; if (BBState.ExitIsSetup && AbsSPAdj != Size) { report("FrameDestroy is after FrameSetup ", &I); - errs() << "FrameDestroy <" << Size << "> is after FrameSetup <" - << AbsSPAdj << ">.\n"; + OS << "FrameDestroy <" << Size << "> is after FrameSetup <" + << AbsSPAdj << ">.\n"; } if (!MRI->isSSA() && !MF->getFrameInfo().adjustsStack()) report("AdjustsStack not set in presence of a frame pseudo " @@ -3933,11 +3941,11 @@ void MachineVerifier::verifyStackFrame() { (SPState[Pred->getNumber()].ExitValue != BBState.EntryValue || SPState[Pred->getNumber()].ExitIsSetup != BBState.EntryIsSetup)) { report("The exit stack state of a predecessor is inconsistent.", MBB); - errs() << "Predecessor " << printMBBReference(*Pred) - << " has exit state (" << SPState[Pred->getNumber()].ExitValue - << ", " << SPState[Pred->getNumber()].ExitIsSetup << "), while " - << printMBBReference(*MBB) << " has entry state (" - << BBState.EntryValue << ", " << BBState.EntryIsSetup << ").\n"; + OS << "Predecessor " << printMBBReference(*Pred) << " has exit state (" + << SPState[Pred->getNumber()].ExitValue << ", " + << SPState[Pred->getNumber()].ExitIsSetup << "), while " + << printMBBReference(*MBB) << " has entry state (" + << BBState.EntryValue << ", " << BBState.EntryIsSetup << ").\n"; } } @@ -3948,11 +3956,11 @@ void MachineVerifier::verifyStackFrame() { (SPState[Succ->getNumber()].EntryValue != BBState.ExitValue || SPState[Succ->getNumber()].EntryIsSetup != BBState.ExitIsSetup)) { report("The entry stack state of a successor is inconsistent.", MBB); - errs() << "Successor " << printMBBReference(*Succ) - << " has entry state (" << SPState[Succ->getNumber()].EntryValue - << ", " << SPState[Succ->getNumber()].EntryIsSetup << "), while " - << printMBBReference(*MBB) << " has exit state (" - << BBState.ExitValue << ", " << BBState.ExitIsSetup << ").\n"; + OS << "Successor " << printMBBReference(*Succ) << " has entry state (" + << SPState[Succ->getNumber()].EntryValue << ", " + << SPState[Succ->getNumber()].EntryIsSetup << "), while " + << printMBBReference(*MBB) << " has exit state (" + << BBState.ExitValue << ", " << BBState.ExitIsSetup << ").\n"; } } diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp index 5001b4fec58f2e..1ad70c86d68e3d 100644 --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -1054,7 +1054,7 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit, } if (VerifyEnabled) - MF->verify(this, "After splitting live range around region"); + MF->verify(this, "After splitting live range around region", &errs()); } MCRegister RAGreedy::tryRegionSplit(const LiveInterval &VirtReg, @@ -1323,7 +1323,7 @@ unsigned RAGreedy::tryBlockSplit(const LiveInterval &VirtReg, } if (VerifyEnabled) - MF->verify(this, "After splitting live range around basic blocks"); + MF->verify(this, "After splitting live range around basic blocks", &errs()); return 0; } @@ -2507,7 +2507,7 @@ MCRegister RAGreedy::selectOrSplitImpl(const LiveInterval &VirtReg, DebugVars->splitRegister(VirtReg.reg(), LRE.regs(), *LIS); if (VerifyEnabled) - MF->verify(this, "After spilling"); + MF->verify(this, "After spilling", &errs()); } // The live virtual register requesting allocation was spilled, so tell @@ -2711,7 +2711,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { TII = MF->getSubtarget().getInstrInfo(); if (VerifyEnabled) - MF->verify(this, "Before greedy register allocator"); + 
MF->verify(this, "Before greedy register allocator", &errs()); RegAllocBase::init(getAnalysis(), getAnalysis().getLIS(), @@ -2770,7 +2770,7 @@ bool RAGreedy::runOnMachineFunction(MachineFunction &mf) { tryHintsRecoloring(); if (VerifyEnabled) - MF->verify(this, "Before post optimization"); + MF->verify(this, "Before post optimization", &errs()); postOptimization(); reportStats(); diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp index 99125200c1a4f1..2e1f498c090d1a 100644 --- a/llvm/lib/CodeGen/RegisterCoalescer.cpp +++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp @@ -4239,7 +4239,7 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) { JoinSplitEdges = EnableJoinSplits; if (VerifyCoalescing) - MF->verify(this, "Before register coalescing"); + MF->verify(this, "Before register coalescing", &errs()); DbgVRegToValues.clear(); buildVRegToDbgValueMap(fn); @@ -4299,7 +4299,7 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) { LLVM_DEBUG(dump()); if (VerifyCoalescing) - MF->verify(this, "After register coalescing"); + MF->verify(this, "After register coalescing", &errs()); return true; } diff --git a/llvm/tools/llvm-reduce/ReducerWorkItem.cpp b/llvm/tools/llvm-reduce/ReducerWorkItem.cpp index 1510e9fb32007e..5409b6dc7459d3 100644 --- a/llvm/tools/llvm-reduce/ReducerWorkItem.cpp +++ b/llvm/tools/llvm-reduce/ReducerWorkItem.cpp @@ -52,6 +52,11 @@ extern cl::OptionCategory LLVMReduceOptions; static cl::opt TargetTriple("mtriple", cl::desc("Set the target triple"), cl::cat(LLVMReduceOptions)); +static cl::opt PrintInvalidMachineReductions( + "print-invalid-reduction-machine-verifier-errors", + cl::desc( + "Print machine verifier errors on invalid reduction attempts triple"), + cl::cat(LLVMReduceOptions)); static cl::opt TmpFilesAsBitcode( "write-tmp-files-as-bitcode", @@ -417,7 +422,7 @@ static std::unique_ptr cloneMF(MachineFunction *SrcMF, DstMRI->freezeReservedRegs(); - DstMF->verify(nullptr, "", /*AbortOnError=*/true); + DstMF->verify(nullptr, "", &errs(), /*AbortOnError=*/true); return DstMF; } @@ -450,8 +455,21 @@ bool ReducerWorkItem::verify(raw_fd_ostream *OS) const { for (const Function &F : getModule()) { if (const MachineFunction *MF = MMI->getMachineFunction(F)) { - if (!MF->verify(nullptr, "", /*AbortOnError=*/false)) + // With the current state of quality, most reduction attempts fail the + // machine verifier. Avoid spamming large function dumps on nearly every + // attempt until the situation is better. + if (!MF->verify(nullptr, "", + /*OS=*/PrintInvalidMachineReductions ? 
&errs() : nullptr, + /*AbortOnError=*/false)) { + + if (!PrintInvalidMachineReductions) { + WithColor::warning(errs()) + << "reduction attempt on function '" << MF->getName() + << "' failed machine verifier (debug with " + "-print-invalid-reduction-machine-verifier-errors)\n"; + } return true; + } } } diff --git a/llvm/unittests/MI/LiveIntervalTest.cpp b/llvm/unittests/MI/LiveIntervalTest.cpp index 7dcd82f3e7aa61..f910e8e1f2c8fb 100644 --- a/llvm/unittests/MI/LiveIntervalTest.cpp +++ b/llvm/unittests/MI/LiveIntervalTest.cpp @@ -101,7 +101,9 @@ struct TestPassT : public TestPass { bool runOnMachineFunction(MachineFunction &MF) override { AnalysisType &A = getAnalysis(); T(MF, A); - EXPECT_EQ(MF.verify(this, /* Banner */ nullptr, /* AbortOnError */ false), + EXPECT_EQ(MF.verify(this, /* Banner=*/nullptr, + /*OS=*/nullptr, + /* AbortOnError=*/false), ShouldPass); return true; } From a6bdf3face377ee7ea84a02bada8a7e2ff380fe8 Mon Sep 17 00:00:00 2001 From: Charlie Barto Date: Tue, 24 Sep 2024 11:45:10 -0700 Subject: [PATCH 49/49] [asan][windows][tests] support MSVC compiler-id in asan tests (#109706) This follows up on https://github.com/llvm/llvm-project/pull/108255 to allow actually running the test suite with MSVC. Note, however, that MSVC can't yet build compiler-rt, most tests don't yet pass with msvc, and building and testing with different compilers is ill-supported. Follow ups will fix the first two issues, the third is future work. Note that `/Zi` is removed from the clang-cl command line, but lit as a whole adds `-gcodeview`, so there should still be debug info. --- compiler-rt/test/asan/lit.cfg.py | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/compiler-rt/test/asan/lit.cfg.py b/compiler-rt/test/asan/lit.cfg.py index 05ed7e8dd294e3..dac3ef00a99af8 100644 --- a/compiler-rt/test/asan/lit.cfg.py +++ b/compiler-rt/test/asan/lit.cfg.py @@ -153,12 +153,16 @@ def build_invocation(compile_flags, with_lto=False): if platform.system() == "Windows": # MSVC-specific tests might also use the clang-cl.exe driver. if target_is_msvc: - clang_cl_cxxflags = [ - "-Wno-deprecated-declarations", - "-WX", - "-D_HAS_EXCEPTIONS=0", - "-Zi", - ] + target_cflags + clang_cl_cxxflags = ( + [ + "-WX", + "-D_HAS_EXCEPTIONS=0", + ] + + config.debug_info_flags + + target_cflags + ) + if config.compiler_id != "MSVC": + clang_cl_cxxflags = ["-Wno-deprecated-declarations"] + clang_cl_cxxflags clang_cl_asan_cxxflags = ["-fsanitize=address"] + clang_cl_cxxflags if config.asan_dynamic: clang_cl_asan_cxxflags.append("-MD") @@ -286,6 +290,12 @@ def build_invocation(compile_flags, with_lto=False): [config.compiler_rt_libdir, os.environ.get("PATH", "")] ) +# msvc needs to be instructed where the compiler-rt libraries are +if config.compiler_id == "MSVC": + config.environment["LIB"] = os.path.pathsep.join( + [config.compiler_rt_libdir, config.environment.get("LIB", "")] + ) + # Default test suffixes. config.suffixes = [".c", ".cpp"]
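
Note on the llvm-reduce change (PATCH 48/49): the verbose verifier output is now opt-in. A sketch of rerunning a MIR reduction with the full diagnostics enabled, assuming a hypothetical interestingness script ./interesting.sh and input reduced.mir (only the -mtriple and -print-invalid-reduction-machine-verifier-errors options come from this patch series; the rest is illustrative):

  llvm-reduce -mtriple=x86_64-- \
    -print-invalid-reduction-machine-verifier-errors \
    --test=./interesting.sh reduced.mir

Without the flag, each reduction attempt that fails the machine verifier prints only the one-line warning added in ReducerWorkItem.cpp; with it, the full MachineVerifier report is emitted through the raw_ostream now threaded into MachineFunction::verify().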