diff --git a/clang/test/Driver/relax.s b/clang/test/Driver/relax.s index 154d4db0a31385..b4a696a328eb56 100644 --- a/clang/test/Driver/relax.s +++ b/clang/test/Driver/relax.s @@ -8,5 +8,7 @@ // RUN: llvm-readobj -r %t | FileCheck --check-prefix=REL %s // REL: R_X86_64_REX_GOTPCRELX foo +// REL: R_X86_64_REX2_GOTPCRELX foo movq foo@GOTPCREL(%rip), %rax + movq foo@GOTPCREL(%rip), %r16 diff --git a/lld/COFF/Writer.cpp b/lld/COFF/Writer.cpp index c2765453aa964e..7cf723a8cf103f 100644 --- a/lld/COFF/Writer.cpp +++ b/lld/COFF/Writer.cpp @@ -472,7 +472,7 @@ bool Writer::createThunks(OutputSection *os, int margin) { // Recheck Chunks.size() each iteration, since we can insert more // elements into it. for (size_t i = 0; i != os->chunks.size(); ++i) { - SectionChunk *sc = dyn_cast_or_null<SectionChunk>(os->chunks[i]); + SectionChunk *sc = dyn_cast<SectionChunk>(os->chunks[i]); if (!sc) continue; MachineTypes machine = sc->getMachine(); @@ -606,7 +606,7 @@ void Writer::createECCodeMap() { // Verify that all relocations are in range, with no extra margin requirements. bool Writer::verifyRanges(const std::vector<Chunk *> chunks) { for (Chunk *c : chunks) { - SectionChunk *sc = dyn_cast_or_null<SectionChunk>(c); + SectionChunk *sc = dyn_cast<SectionChunk>(c); if (!sc) continue; MachineTypes machine = sc->getMachine(); @@ -872,8 +872,8 @@ bool Writer::fixGnuImportChunks() { if (!pSec->chunks.empty()) hasIdata = true; llvm::stable_sort(pSec->chunks, [&](Chunk *s, Chunk *t) { - SectionChunk *sc1 = dyn_cast_or_null<SectionChunk>(s); - SectionChunk *sc2 = dyn_cast_or_null<SectionChunk>(t); + SectionChunk *sc1 = dyn_cast<SectionChunk>(s); + SectionChunk *sc2 = dyn_cast<SectionChunk>(t); if (!sc1 || !sc2) { // if SC1, order them ascending. If SC2 or both null, // S is not less than T. diff --git a/lldb/docs/index.rst b/lldb/docs/index.rst index d9b8e589eb2ac0..dd44a8430add80 100644 --- a/lldb/docs/index.rst +++ b/lldb/docs/index.rst @@ -163,6 +163,7 @@ interesting areas to contribute to lldb. resources/caveats resources/projects resources/lldbdap + resources/addinglanguagesupport Public C++ API Private C++ API diff --git a/lldb/docs/resources/addinglanguagesupport.md b/lldb/docs/resources/addinglanguagesupport.md new file mode 100644 index 00000000000000..28789048643d77 --- /dev/null +++ b/lldb/docs/resources/addinglanguagesupport.md @@ -0,0 +1,95 @@ +# Adding Programming Language Support + +LLDB has been architected to make it straightforward to add support for a +programming language. Only a small enum in core LLDB needs to be modified to +make LLDB aware of a new programming language. Everything else can be supplied +in derived classes that need not even be present in the core LLDB repository. +This makes it convenient for developers adding language support in downstream +repositories since it practically eliminates the potential for merge conflicts. + +The basic steps are: +* Add the language to the `LanguageType` enum. +* Add a `TypeSystem` for the language. +* Add expression evaluation support. + +Additionally, you may want to create a `Language` and `LanguageRuntime` plugin +for your language, which enables support for advanced features like dynamic +typing and data formatting. + +## Add the Language to the LanguageType enum + +The `LanguageType` enum +(see [lldb-enumerations.h](https://github.com/llvm/llvm-project/blob/main/lldb/include/lldb/lldb-enumerations.h)) +contains a list of every language known to LLDB. It is the one place in core +LLDB where support for your language must live, and therefore the one place +that needs to merge cleanly with upstream LLDB if you are developing your +language support in a separate branch. When adding support for a language +previously unknown to LLDB, start by adding an enumeration entry to +`LanguageType`.
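+For orientation, here is a sketch of the kind of entry involved. The
+identifier and value below are hypothetical; the real entries and their
+assigned values live in `lldb-enumerations.h`:
+
+```cpp
+enum LanguageType {
+  eLanguageTypeUnknown = 0x0000, // existing entries...
+  // ...
+  eLanguageTypeMyLang = 0x0051, // hypothetical: your new language
+  eNumLanguageTypes
+};
+```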
+## Add a TypeSystem for the Language + +Both [Module](https://github.com/llvm/llvm-project/blob/main/lldb/include/lldb/Core/Module.h) +and [Target](https://github.com/llvm/llvm-project/blob/main/lldb/include/lldb/Target/Target.h) +support the retrieval of a `TypeSystem` instance via `GetTypeSystemForLanguage()`. +For `Module`, this method is directly on the `Module` instance. For `Target`, +this is retrieved indirectly via the `TypeSystemMap` for the `Target` instance. + +The `TypeSystem` instance returned by the `Target` is expected to be capable of +evaluating expressions, while the `TypeSystem` instance returned by the `Module` +is not. If you want to support expression evaluation for your language, you could +consider one of the following approaches: +* Implement a single `TypeSystem` class that supports evaluation when given an + optional `Target`, implementing all the expression evaluation methods on the + `TypeSystem`. +* Create multiple `TypeSystem` classes, one for evaluation and one for static + `Module` usage. + +For Clang and Swift, the latter approach was chosen, primarily to make it clear +that evaluating expressions with the static `Module`-returned `TypeSystem` +instances makes no sense, and to have those calls error out. Either approach is +fine. + +## Creating Types + +Your `TypeSystem` will need an approach for creating types based on a set of +`Module`s. If your type information is going to come from DWARF, you will want to +subclass [DWARFASTParser](https://github.com/llvm/llvm-project/blob/main/lldb/source/Plugins/SymbolFile/DWARF/DWARFASTParser.h). + +## Add Expression Evaluation Support + +Expression evaluation support is enabled by implementing the relevant methods on +a `TypeSystem`-derived class. Search for `Expression` in the +[TypeSystem header](https://github.com/llvm/llvm-project/blob/main/lldb/include/lldb/Symbol/TypeSystem.h) +to find the methods to implement. + +## Type Completion + +There are three levels of type completion, each requiring more type information: +1. Pointer size: when all you have is a forward declaration or a reference, the + pointer size is all you need. +2. Layout info: you need the size of an instance of the type, but you still don't + need to know all the guts of the type. +3. Full type info: here you need everything, because you are working with the + internals of the type, such as modifying a member variable. + +Ensure you never complete more of a type than is needed for a given situation. +This will keep your type system from doing more work than necessary. + +## Language and LanguageRuntime Plugins + +If you followed the steps outlined above, you have already taught LLDB a great +deal about your language. If your language's runtime model and fundamental data +types don't differ much from the C model, you are pretty much done. + +However, it is likely that your language offers its own data types for things +like strings and arrays, and probably has a notion of dynamic types, where the +effective type of a variable can only be known at runtime. + +These tasks are covered by two plugins (a skeleton sketch follows this list): +* a `LanguageRuntime` plugin, which provides LLDB with a dynamic view of your + language; this plugin answers questions that require a live process to acquire + information (for example dynamic type resolution).
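+As a starting point, here is a minimal sketch of the `Language` plugin
+boilerplate. The `Language` base class and the `PluginManager` registration
+API are real; everything named `MyLang`/`MyLanguage` is hypothetical, and the
+remaining pure-virtual overrides are omitted:
+
+```cpp
+#include "lldb/Core/PluginManager.h"
+#include "lldb/Target/Language.h"
+
+class MyLanguage : public lldb_private::Language {
+public:
+  // Tie the plugin to the enum entry added in lldb-enumerations.h.
+  lldb::LanguageType GetLanguageType() const override {
+    return lldb::eLanguageTypeMyLang; // hypothetical entry
+  }
+
+  static lldb_private::Language *CreateInstance(lldb::LanguageType language) {
+    return language == lldb::eLanguageTypeMyLang ? new MyLanguage() : nullptr;
+  }
+
+  static void Initialize() {
+    lldb_private::PluginManager::RegisterPlugin(
+        "mylang", "MyLang language support", CreateInstance);
+  }
+
+  static void Terminate() {
+    lldb_private::PluginManager::UnregisterPlugin(CreateInstance);
+  }
+};
+```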
\ No newline at end of file diff --git a/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp b/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp index 256c1f828feb38..7d8d0a4d3d6711 100644 --- a/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp +++ b/lldb/source/Plugins/ABI/AArch64/ABIAArch64.cpp @@ -136,6 +136,8 @@ void ABIAArch64::AugmentRegisterInfo( std::array<std::optional<uint32_t>, 32> x_regs; std::array<std::optional<uint32_t>, 32> v_regs; + std::array<std::optional<uint32_t>, 32> z_regs; + std::optional<uint32_t> z_byte_size; for (auto it : llvm::enumerate(regs)) { lldb_private::DynamicRegisterInfo::Register &info = it.value(); @@ -157,16 +159,44 @@ x_regs[reg_num] = it.index(); else if (get_reg("v")) v_regs[reg_num] = it.index(); + else if (get_reg("z")) { + z_regs[reg_num] = it.index(); + if (!z_byte_size) + z_byte_size = info.byte_size; + } // if we have at least one subregister, abort else if (get_reg("w") || get_reg("s") || get_reg("d")) return; } - // Create aliases for partial registers: wN for xN, and sN/dN for vN. + // Create aliases for partial registers. + + // Wn for Xn. addPartialRegisters(regs, x_regs, 8, "w{0}", 4, lldb::eEncodingUint, lldb::eFormatHex); - addPartialRegisters(regs, v_regs, 16, "s{0}", 4, lldb::eEncodingIEEE754, - lldb::eFormatFloat); - addPartialRegisters(regs, v_regs, 16, "d{0}", 8, lldb::eEncodingIEEE754, - lldb::eFormatFloat); + + auto bool_predicate = [](const auto &reg_num) { return bool(reg_num); }; + bool saw_v_regs = std::any_of(v_regs.begin(), v_regs.end(), bool_predicate); + bool saw_z_regs = std::any_of(z_regs.begin(), z_regs.end(), bool_predicate); + + // Sn/Dn for Vn. + if (saw_v_regs) { + addPartialRegisters(regs, v_regs, 16, "s{0}", 4, lldb::eEncodingIEEE754, + lldb::eFormatFloat); + addPartialRegisters(regs, v_regs, 16, "d{0}", 8, lldb::eEncodingIEEE754, + lldb::eFormatFloat); + } else if (saw_z_regs && z_byte_size) { + // When SVE is enabled, some debug stubs will not describe the Neon V + // registers because they can be read from the bottom 128 bits of the SVE + // registers. + + // The size used here is the one sent by the debug server. This only needs + // to be correct right now. Later we will rely on the value of vg instead. + addPartialRegisters(regs, z_regs, *z_byte_size, "v{0}", 16, + lldb::eEncodingVector, lldb::eFormatVectorOfUInt8); + addPartialRegisters(regs, z_regs, *z_byte_size, "s{0}", 4, + lldb::eEncodingIEEE754, lldb::eFormatFloat); + addPartialRegisters(regs, z_regs, *z_byte_size, "d{0}", 8, + lldb::eEncodingIEEE754, lldb::eFormatFloat); + } }
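To make the new `else if` branch concrete, here is a rough picture of the aliasing it creates when a stub describes only Z registers (sizes assume a 256-bit `z0`; this is an illustration, not code from the patch):

```cpp
// z0 (32 bytes, as sent by the stub): bytes 0..31
// v0 = bytes 0..15 of z0  ("v{0}", byte_size 16, eFormatVectorOfUInt8)
// s0 = bytes 0..3  of z0  ("s{0}", byte_size 4,  eEncodingIEEE754)
// d0 = bytes 0..7  of z0  ("d{0}", byte_size 8,  eEncodingIEEE754)
```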
diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp index 9e8c6046179631..3e09c316d74f44 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp @@ -4716,9 +4716,14 @@ bool ParseRegisters( reg_info.encoding = eEncodingIEEE754; } else if (gdb_type == "aarch64v" || llvm::StringRef(gdb_type).starts_with("vec") || - gdb_type == "i387_ext" || gdb_type == "uint128") { + gdb_type == "i387_ext" || gdb_type == "uint128" || + reg_info.byte_size > 16) { // lldb doesn't handle 128-bit uints correctly (for ymm*h), so - // treat them as vector (similarly to xmm/ymm) + // treat them as vector (similarly to xmm/ymm). + // We can fall back to handling anything else <= 128 bits as an + // unsigned integer; anything larger than that is treated as a vector + // of bytes. This can happen if we don't recognise the type for AArch64 + // SVE registers. reg_info.format = eFormatVectorOfUInt8; reg_info.encoding = eEncodingVector; } else { diff --git a/lldb/test/API/functionalities/gdb_remote_client/TestAArch64XMLRegistersSVEOnly.py b/lldb/test/API/functionalities/gdb_remote_client/TestAArch64XMLRegistersSVEOnly.py new file mode 100644 index 00000000000000..e36013a11491b3 --- /dev/null +++ b/lldb/test/API/functionalities/gdb_remote_client/TestAArch64XMLRegistersSVEOnly.py @@ -0,0 +1,121 @@ +""" Check that when a debug server provides XML that only defines SVE Z registers, + and does not include Neon V registers, lldb creates sub-registers to represent + the V registers as the bottom 128 bits of the Z registers. + + qemu-aarch64 is one such debug server. + + This also doubles as a test that lldb has a fallback path for registers of + unknown type that are > 128 bits, as the SVE registers are here. +""" + +from textwrap import dedent +import lldb +from lldbsuite.test.lldbtest import * +from lldbsuite.test.decorators import * +from lldbsuite.test.gdbclientutils import * +from lldbsuite.test.lldbgdbclient import GDBRemoteTestBase + + +class Responder(MockGDBServerResponder): + def __init__(self): + super().__init__() + self.vg = 4 + self.pc = 0xA0A0A0A0A0A0A0A0 + + def qXferRead(self, obj, annex, offset, length): + if annex == "target.xml": + # Note that QEMU sends the current SVE size in XML and the debugger + # then reads vg to know the latest size. + return ( + dedent( + """\ + + + aarch64 + + + + + + """ + ), + False, + ) + + return (None,) + + def readRegister(self, regnum): + return "E01" + + def readRegisters(self): + return "".join( + [ + # 64 bit PC. + f"{self.pc:x}", + # 64 bit vg + f"0{self.vg}00000000000000", + # Enough data for 256 and 512 bit SVE. + "".join([f"{n:02x}" * 4 for n in range(1, 17)]), + ] + ) + + def cont(self): + # vg is expedited so that lldb can resize the SVE registers. + return f"T02thread:1ff0d;threads:1ff0d;thread-pcs:{self.pc};01:0{self.vg}00000000000000;" + + def writeRegisters(self, registers_hex): + # We get a block of data containing values in regnum order. + self.vg = int(registers_hex[16:18]) + return "OK" + + +class TestXMLRegisterFlags(GDBRemoteTestBase): + def check_regs(self, vg): + # Each 32 bit chunk repeats n.
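+ # e.g. with vg == 4, z0 is vg * 8 == 32 bytes, i.e. eight 4-byte chunks:
+ # 0x01 x4, 0x02 x4, ... up to 0x08 x4.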
+ z0_value = " ".join( + [" ".join([f"0x{n:02x}"] * 4) for n in range(1, (vg * 2) + 1)] + ) + + self.expect( + "register read vg z0 v0 s0 d0", + substrs=[ + f" vg = 0x000000000000000{vg}\n" + " z0 = {" + z0_value + "}\n" + " v0 = {0x01 0x01 0x01 0x01 0x02 0x02 0x02 0x02 0x03 0x03 0x03 0x03 0x04 0x04 0x04 0x04}\n" + " s0 = 2.36942783E-38\n" + " d0 = 5.3779407333977203E-299\n" + ], + ) + + self.expect("register read s0 --format uint32", substrs=["s0 = {0x01010101}"]) + self.expect( + "register read d0 --format uint64", + substrs=["d0 = {0x0202020201010101}"], + ) + + @skipIfXmlSupportMissing + @skipIfRemote + @skipIfLLVMTargetMissing("AArch64") + def test_v_sub_registers(self): + self.server.responder = Responder() + target = self.dbg.CreateTarget("") + + if self.TraceOn(): + self.runCmd("log enable gdb-remote packets") + self.addTearDownHook(lambda: self.runCmd("log disable gdb-remote packets")) + + process = self.connect(target) + lldbutil.expect_state_changes( + self, self.dbg.GetListener(), process, [lldb.eStateStopped] + ) + + self.check_regs(4) + + # Now increase the SVE length and continue. The mock will respond with a new + # vg and lldb will reconfigure the register defs. This should not break the + # sub-registers. + + self.runCmd("register write vg 8") + self.expect("continue", substrs=["stop reason = signal SIGINT"]) + + self.check_regs(8) diff --git a/llvm/docs/AMDGPUUsage.rst b/llvm/docs/AMDGPUUsage.rst index 13ceb89f750b57..de1d7e22652771 100644 --- a/llvm/docs/AMDGPUUsage.rst +++ b/llvm/docs/AMDGPUUsage.rst @@ -612,9 +612,7 @@ Generic processor code objects are versioned. See :ref:`amdgpu-generic-processor - ``gfx1152`` SALU floating point instructions - and single-use VGPR hint - instructions are not available - on: + are not available on: - ``gfx1150`` - ``gfx1151`` diff --git a/llvm/docs/CMake.rst b/llvm/docs/CMake.rst index 838447f483e510..b5adb22d8f33b1 100644 --- a/llvm/docs/CMake.rst +++ b/llvm/docs/CMake.rst @@ -571,10 +571,12 @@ enabled sub-projects. Nearly all of these variable names begin with Semicolon-separated list of projects to build, or *all* for building all (clang, lldb, lld, polly, etc) projects. This flag assumes that projects are checked out side-by-side and not nested, i.e. clang needs to be in - parallel of llvm instead of nested in `llvm/tools`. This feature allows + parallel of llvm instead of nested in ``llvm/tools``. This feature allows to have one build for only LLVM and another for clang+llvm using the same source checkout. + The full list is: + ``clang;clang-tools-extra;cross-project-tests;libc;libclc;lld;lldb;openmp;polly;pstl`` **LLVM_ENABLE_RTTI**:BOOL @@ -586,10 +588,16 @@ enabled sub-projects. Nearly all of these variable names begin with It will build the builtins separately from the other runtimes to preserve correct dependency ordering. If you want to build the runtimes using a system compiler, see the `libc++ documentation `_. - Note: the list should not have duplicates with `LLVM_ENABLE_PROJECTS`. + + .. note:: + The list should not have duplicates with ``LLVM_ENABLE_PROJECTS``. 
+ The full list is: + ``compiler-rt;libc;libcxx;libcxxabi;libunwind;openmp`` + To enable all of them, use: + ``LLVM_ENABLE_RUNTIMES=all`` **LLVM_ENABLE_SPHINX**:BOOL diff --git a/llvm/include/llvm/BinaryFormat/ELFRelocs/x86_64.def b/llvm/include/llvm/BinaryFormat/ELFRelocs/x86_64.def index 18fdcf9472dc48..161b1969abfeb4 100644 --- a/llvm/include/llvm/BinaryFormat/ELFRelocs/x86_64.def +++ b/llvm/include/llvm/BinaryFormat/ELFRelocs/x86_64.def @@ -43,3 +43,4 @@ ELF_RELOC(R_X86_64_TLSDESC, 36) ELF_RELOC(R_X86_64_IRELATIVE, 37) ELF_RELOC(R_X86_64_GOTPCRELX, 41) ELF_RELOC(R_X86_64_REX_GOTPCRELX, 42) +ELF_RELOC(R_X86_64_REX2_GOTPCRELX, 43) diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index 18ed60ebb124dc..da43f5be10ff3b 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -825,7 +825,7 @@ enum NodeType { /// be saturated against signed values, resulting in `S`, which will combine /// to `TRUNCATE_SSAT_S`. If the value of C ranges from `0 to 255`, it will /// be saturated against unsigned values, resulting in `U`, which will - /// combine to `TRUNATE_SSAT_U`. Similarly, in `truncate(umin(x, C))`, if + /// combine to `TRUNCATE_SSAT_U`. Similarly, in `truncate(umin(x, C))`, if /// value of C ranges from `0 to 255`, it becomes `U` because it is saturated /// for unsigned values. As a result, it combines to `TRUNCATE_USAT_U`. TRUNCATE_SSAT_S, // saturate signed input to signed result - diff --git a/llvm/include/llvm/CodeGen/MIRYamlMapping.h b/llvm/include/llvm/CodeGen/MIRYamlMapping.h index 304db57eca4994..ab8dc442e04b7b 100644 --- a/llvm/include/llvm/CodeGen/MIRYamlMapping.h +++ b/llvm/include/llvm/CodeGen/MIRYamlMapping.h @@ -730,6 +730,11 @@ struct MachineFunction { bool TracksRegLiveness = false; bool HasWinCFI = false; + // Computed properties that should be overridable + std::optional<bool> NoPHIs; + std::optional<bool> IsSSA; + std::optional<bool> NoVRegs; + bool CallsEHReturn = false; bool CallsUnwindInit = false; bool HasEHCatchret = false; @@ -770,6 +775,12 @@ template <> struct MappingTraits<MachineFunction> { YamlIO.mapOptional("tracksRegLiveness", MF.TracksRegLiveness, false); YamlIO.mapOptional("hasWinCFI", MF.HasWinCFI, false); + // PHIs must not be capitalized, since that would clash with the MIR opcode, + // leading to false-positive FileCheck hits with CHECK-NOT + YamlIO.mapOptional("noPhis", MF.NoPHIs, std::optional<bool>()); + YamlIO.mapOptional("isSSA", MF.IsSSA, std::optional<bool>()); + YamlIO.mapOptional("noVRegs", MF.NoVRegs, std::optional<bool>()); + YamlIO.mapOptional("callsEHReturn", MF.CallsEHReturn, false); YamlIO.mapOptional("callsUnwindInit", MF.CallsUnwindInit, false); YamlIO.mapOptional("hasEHCatchret", MF.HasEHCatchret, false); diff --git a/llvm/include/llvm/IR/LLVMContext.h b/llvm/include/llvm/IR/LLVMContext.h index 6ffa2bdaa319a7..558816e146587a 100644 --- a/llvm/include/llvm/IR/LLVMContext.h +++ b/llvm/include/llvm/IR/LLVMContext.h @@ -316,17 +316,6 @@ class LLVMContext { /// LLVMContext is used by compilation. void setOptPassGate(OptPassGate&); - /// Set whether opaque pointers are enabled. The method may be called multiple - /// times, but only with the same value. Note that creating a pointer type or - /// otherwise querying the opaque pointer mode performs an implicit set to - /// the default value. - [[deprecated("Opaque pointers are always enabled")]] - void setOpaquePointers(bool Enable) const; - - /// Whether typed pointers are supported. If false, all pointers are opaque.
- [[deprecated("Always returns false")]] - bool supportsTypedPointers() const; - /// Get or set the current "default" target CPU (target-cpu function /// attribute). The intent is that compiler frontends will set this to a value /// that reflects the attribute that a function would get "by default" without diff --git a/llvm/include/llvm/IR/Type.h b/llvm/include/llvm/IR/Type.h index 15ac8a1ec59777..ce0f9b289d0c6e 100644 --- a/llvm/include/llvm/IR/Type.h +++ b/llvm/include/llvm/IR/Type.h @@ -254,10 +254,6 @@ class Type { /// True if this is an instance of PointerType. bool isPointerTy() const { return getTypeID() == PointerTyID; } - /// True if this is an instance of an opaque PointerType. - LLVM_DEPRECATED("Use isPointerTy() instead", "isPointerTy") - bool isOpaquePointerTy() const { return isPointerTy(); }; - /// Return true if this is a pointer type or a vector of pointer types. bool isPtrOrPtrVectorTy() const { return getScalarType()->isPointerTy(); } @@ -411,14 +407,6 @@ class Type { inline StringRef getTargetExtName() const; - /// Only use this method in code that is not reachable with opaque pointers, - /// or part of deprecated methods that will be removed as part of the opaque - /// pointers transition. - [[deprecated("Pointers no longer have element types")]] - Type *getNonOpaquePointerElementType() const { - llvm_unreachable("Pointers no longer have element types"); - } - /// Given vector type, change the element type, /// whilst keeping the old number of elements. /// For non-vectors simply returns \p EltTy. diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp index d45d3bbefe4fd3..dbffbb8a5f81d9 100644 --- a/llvm/lib/Analysis/VectorUtils.cpp +++ b/llvm/lib/Analysis/VectorUtils.cpp @@ -1454,7 +1454,7 @@ void InterleavedAccessInfo::analyzeInterleaving( // that all the pointers in the group don't wrap. // So we check only group member 0 (which is always guaranteed to exist), // and group member Factor - 1; If the latter doesn't exist we rely on - // peeling (if it is a non-reversed accsess -- see Case 3). + // peeling (if it is a non-reversed access -- see Case 3). if (InvalidateGroupIfMemberMayWrap(Group, 0, std::string("first"))) continue; if (Group->getMember(Group->getFactor() - 1)) diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index 30b196fdf6252c..4041da68ccc643 100644 --- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -2018,7 +2018,10 @@ void AsmPrinter::emitFunctionBody() { // are automatically sized. bool EmitFunctionSize = MAI->hasDotTypeDotSizeDirective() && !TT.isWasm(); - if (EmitFunctionSize || needFuncLabels(*MF, *this)) { + // SPIR-V supports label instructions only inside a block, not after the + // function body. + if (TT.getObjectFormat() != Triple::SPIRV && + (EmitFunctionSize || needFuncLabels(*MF, *this))) { // Create a symbol for the end of function. 
CurrentFnEnd = createTempSymbol("func_end"); OutStreamer->emitLabel(CurrentFnEnd); diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp index d506cd1879648f..8d6d800d761474 100644 --- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp +++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp @@ -178,7 +178,8 @@ class MIRParserImpl { SMDiagnostic diagFromBlockStringDiag(const SMDiagnostic &Error, SMRange SourceRange); - void computeFunctionProperties(MachineFunction &MF); + bool computeFunctionProperties(MachineFunction &MF, + const yaml::MachineFunction &YamlMF); void setupDebugValueTracking(MachineFunction &MF, PerFunctionMIParsingState &PFS, const yaml::MachineFunction &YamlMF); @@ -373,7 +374,8 @@ static bool isSSA(const MachineFunction &MF) { return true; } -void MIRParserImpl::computeFunctionProperties(MachineFunction &MF) { +bool MIRParserImpl::computeFunctionProperties( + MachineFunction &MF, const yaml::MachineFunction &YamlMF) { MachineFunctionProperties &Properties = MF.getProperties(); bool HasPHI = false; @@ -398,21 +400,48 @@ void MIRParserImpl::computeFunctionProperties(MachineFunction &MF) { } } } - if (!HasPHI) - Properties.set(MachineFunctionProperties::Property::NoPHIs); + + // Helper function to sanity-check and set properties that are computed, but + // may be explicitly set from the input MIR + auto ComputedPropertyHelper = + [&Properties](std::optional<bool> ExplicitProp, bool ComputedProp, + MachineFunctionProperties::Property P) -> bool { + // Prefer explicitly given values over the computed properties + if (ExplicitProp.value_or(ComputedProp)) + Properties.set(P); + else + Properties.reset(P); + + // Check for conflict between the explicit values and the computed ones + return ExplicitProp && *ExplicitProp && !ComputedProp; + }; + + if (ComputedPropertyHelper(YamlMF.NoPHIs, !HasPHI, + MachineFunctionProperties::Property::NoPHIs)) { + return error(MF.getName() + + " has explicit property NoPhi, but contains at least one PHI"); + } + MF.setHasInlineAsm(HasInlineAsm); if (HasTiedOps && AllTiedOpsRewritten) Properties.set(MachineFunctionProperties::Property::TiedOpsRewritten); - if (isSSA(MF)) - Properties.set(MachineFunctionProperties::Property::IsSSA); - else - Properties.reset(MachineFunctionProperties::Property::IsSSA); + if (ComputedPropertyHelper(YamlMF.IsSSA, isSSA(MF), + MachineFunctionProperties::Property::IsSSA)) { + return error(MF.getName() + + " has explicit property IsSSA, but is not valid SSA"); + } const MachineRegisterInfo &MRI = MF.getRegInfo(); - if (MRI.getNumVirtRegs() == 0) - Properties.set(MachineFunctionProperties::Property::NoVRegs); + if (ComputedPropertyHelper(YamlMF.NoVRegs, MRI.getNumVirtRegs() == 0, + MachineFunctionProperties::Property::NoVRegs)) { + return error( + MF.getName() + + " has explicit property NoVRegs, but contains virtual registers"); + } + + return false; } bool MIRParserImpl::initializeCallSiteInfo( @@ -595,7 +624,8 @@ MIRParserImpl::initializeMachineFunction(const yaml::MachineFunction &YamlMF, MachineRegisterInfo &MRI = MF.getRegInfo(); MRI.freezeReservedRegs(); - computeFunctionProperties(MF); + if (computeFunctionProperties(MF, YamlMF)) + return false; if (initializeCallSiteInfo(PFS, YamlMF)) return false;
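The precedence rule implemented by `ComputedPropertyHelper` is easy to state: an explicit value in the MIR wins over the computed one, and an explicit `true` paired with a computed `false` is a conflict that the parser reports as an error. A standalone sketch of that rule (hypothetical names, not part of the patch):

```cpp
#include <cassert>
#include <optional>

// Returns the effective property value; sets Conflict when the MIR explicitly
// claims a property (e.g. noPhis) that the parsed function visibly violates.
static bool resolveProperty(std::optional<bool> Explicit, bool Computed,
                            bool &Conflict) {
  Conflict = Explicit && *Explicit && !Computed;
  return Explicit.value_or(Computed);
}

int main() {
  bool Conflict = false;
  // MIR says "noPhis: true" but the body contains a PHI (computed false):
  // the explicit value is kept, and the conflict is flagged as an error.
  assert(resolveProperty(true, false, Conflict) && Conflict);
  // No explicit value: fall back to the computed one, no conflict.
  assert(!resolveProperty(std::nullopt, false, Conflict) && !Conflict);
  return 0;
}
```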
diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp index 7de68b12045f14..cf6122bce22364 100644 --- a/llvm/lib/CodeGen/MIRPrinter.cpp +++ b/llvm/lib/CodeGen/MIRPrinter.cpp @@ -223,6 +223,13 @@ void MIRPrinter::print(const MachineFunction &MF) { YamlMF.TracksDebugUserValues = MF.getProperties().hasProperty( MachineFunctionProperties::Property::TracksDebugUserValues); + YamlMF.NoPHIs = MF.getProperties().hasProperty( MachineFunctionProperties::Property::NoPHIs); + YamlMF.IsSSA = MF.getProperties().hasProperty( MachineFunctionProperties::Property::IsSSA); + YamlMF.NoVRegs = MF.getProperties().hasProperty( MachineFunctionProperties::Property::NoVRegs); + convert(YamlMF, MF.getRegInfo(), MF.getSubtarget().getRegisterInfo()); MachineModuleSlotTracker MST(MMI, &MF); MST.incorporateFunction(MF.getFunction()); diff --git a/llvm/lib/IR/LLVMContext.cpp b/llvm/lib/IR/LLVMContext.cpp index c0fee93a233808..22e60772def43f 100644 --- a/llvm/lib/IR/LLVMContext.cpp +++ b/llvm/lib/IR/LLVMContext.cpp @@ -377,14 +377,6 @@ std::unique_ptr<DiagnosticHandler> LLVMContext::getDiagnosticHandler() { return std::move(pImpl->DiagHandler); } -void LLVMContext::setOpaquePointers(bool Enable) const { - assert(Enable && "Cannot disable opaque pointers"); -} - -bool LLVMContext::supportsTypedPointers() const { - return false; -} - StringRef LLVMContext::getDefaultTargetCPU() { return pImpl->DefaultTargetCPU; } diff --git a/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp b/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp index 1a4f7e93eeb74a..92618bdabbe519 100644 --- a/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp +++ b/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp @@ -145,8 +145,8 @@ llvm::mc::RegisterMCTargetOptionsFlags::RegisterMCTargetOptionsFlags() { static cl::opt<bool> X86RelaxRelocations( "x86-relax-relocations", - cl::desc( - "Emit GOTPCRELX/REX_GOTPCRELX instead of GOTPCREL on x86-64 ELF"), + cl::desc("Emit GOTPCRELX/REX_GOTPCRELX/REX2_GOTPCRELX instead of " + "GOTPCREL on x86-64 ELF"), cl::init(true)); MCBINDOPT(X86RelaxRelocations); diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index bdad0ce2420558..9a0a4adce17918 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1664,6 +1664,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM, setOperationAction(ISD::BITCAST, VT, Custom); setOperationAction(ISD::CONCAT_VECTORS, VT, Custom); setOperationAction(ISD::FP_EXTEND, VT, Custom); + setOperationAction(ISD::FP_ROUND, VT, Custom); setOperationAction(ISD::MLOAD, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom); setOperationAction(ISD::SPLAT_VECTOR, VT, Legal); @@ -4334,14 +4335,57 @@ SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op, SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const { EVT VT = Op.getValueType(); - if (VT.isScalableVector()) - return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU); - bool IsStrict = Op->isStrictFPOpcode(); SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0); EVT SrcVT = SrcVal.getValueType(); bool Trunc = Op.getConstantOperandVal(IsStrict ? 2 : 1) == 1; + if (VT.isScalableVector()) { + if (VT.getScalarType() != MVT::bf16) + return LowerToPredicatedOp(Op, DAG, AArch64ISD::FP_ROUND_MERGE_PASSTHRU); + + SDLoc DL(Op); + constexpr EVT I32 = MVT::nxv4i32; + auto ImmV = [&](int I) -> SDValue { return DAG.getConstant(I, DL, I32); }; + + SDValue NaN; + SDValue Narrow; + + if (SrcVT == MVT::nxv2f32 || SrcVT == MVT::nxv4f32) { + if (Subtarget->hasBF16()) + return LowerToPredicatedOp(Op, DAG, + AArch64ISD::FP_ROUND_MERGE_PASSTHRU); + + Narrow = getSVESafeBitCast(I32, SrcVal, DAG); + + // Set the quiet bit.
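+ // (In IEEE-754 binary32 the quiet bit is the top mantissa bit, i.e.
+ // 0x400000; OR-ing it in turns a signaling NaN into a quiet NaN while
+ // leaving the rest of the payload intact.)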
+ if (!DAG.isKnownNeverSNaN(SrcVal)) + NaN = DAG.getNode(ISD::OR, DL, I32, Narrow, ImmV(0x400000)); + } else + return SDValue(); + + if (!Trunc) { + SDValue Lsb = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16)); + Lsb = DAG.getNode(ISD::AND, DL, I32, Lsb, ImmV(1)); + SDValue RoundingBias = DAG.getNode(ISD::ADD, DL, I32, Lsb, ImmV(0x7fff)); + Narrow = DAG.getNode(ISD::ADD, DL, I32, Narrow, RoundingBias); + } + + // Don't round if we had a NaN, we don't want to turn 0x7fffffff into + // 0x80000000. + if (NaN) { + EVT I1 = I32.changeElementType(MVT::i1); + EVT CondVT = VT.changeElementType(MVT::i1); + SDValue IsNaN = DAG.getSetCC(DL, CondVT, SrcVal, SrcVal, ISD::SETUO); + IsNaN = DAG.getNode(AArch64ISD::REINTERPRET_CAST, DL, I1, IsNaN); + Narrow = DAG.getSelect(DL, I32, IsNaN, NaN, Narrow); + } + + // Now that we have rounded, shift the bits into position. + Narrow = DAG.getNode(ISD::SRL, DL, I32, Narrow, ImmV(16)); + return getSVESafeBitCast(VT, Narrow, DAG); + } + if (useSVEForFixedLengthVectorVT(SrcVT, !Subtarget->isNeonAvailable())) return LowerFixedLengthFPRoundToSVE(Op, DAG); diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index 1f3d63a216c6dd..7240f6a22a87bd 100644 --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -2425,7 +2425,7 @@ let Predicates = [HasBF16, HasSVEorSME] in { defm BFMLALT_ZZZ : sve2_fp_mla_long<0b101, "bfmlalt", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalt>; defm BFMLALB_ZZZI : sve2_fp_mla_long_by_indexed_elem<0b100, "bfmlalb", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalb_lane_v2>; defm BFMLALT_ZZZI : sve2_fp_mla_long_by_indexed_elem<0b101, "bfmlalt", nxv4f32, nxv8bf16, int_aarch64_sve_bfmlalt_lane_v2>; - defm BFCVT_ZPmZ : sve_bfloat_convert<0b1, "bfcvt", int_aarch64_sve_fcvt_bf16f32>; + defm BFCVT_ZPmZ : sve_bfloat_convert<0b1, "bfcvt", int_aarch64_sve_fcvt_bf16f32, AArch64fcvtr_mt>; defm BFCVTNT_ZPmZ : sve_bfloat_convert<0b0, "bfcvtnt", int_aarch64_sve_fcvtnt_bf16f32>; } // End HasBF16, HasSVEorSME diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 11a4aa4d01e123..da0798ebf79578 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -4159,6 +4159,26 @@ AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy, switch (ISD) { default: break; + case ISD::FADD: + if (Type *EltTy = ValTy->getScalarType(); + // FIXME: For half types without fullfp16 support, this could extend and + // use a fp32 faddp reduction but current codegen unrolls. + MTy.isVector() && (EltTy->isFloatTy() || EltTy->isDoubleTy() || + (EltTy->isHalfTy() && ST->hasFullFP16()))) { + const unsigned NElts = MTy.getVectorNumElements(); + if (ValTy->getElementCount().getFixedValue() >= 2 && NElts >= 2 && + isPowerOf2_32(NElts)) + // Reduction corresponding to series of fadd instructions is lowered to + // series of faddp instructions. faddp has latency/throughput that + // matches fadd instruction and hence, every faddp instruction can be + // considered to have a relative cost = 1 with + // CostKind = TCK_RecipThroughput. + // An faddp will pairwise add vector elements, so the size of input + // vector reduces by half every time, requiring + // #(faddp instructions) = log2_32(NElts). 
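+ // For example (an illustration, not part of the patch): reducing v8f32
+ // where v4f32 is the widest legal vector gives LT.first == 2 and
+ // NElts == 4, so the estimate is (2 - 1) + log2(4) = 3: one fadd to
+ // combine the two legalized halves, then two faddp steps (4 -> 2 -> 1).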
+ return (LT.first - 1) + /*No of faddp instructions*/ Log2_32(NElts); + } + break; case ISD::ADD: if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy)) return (LT.first - 1) + Entry->Cost; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td index 8119198a48aa59..0bfac6465a1f30 100644 --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -8807,9 +8807,13 @@ class sve_bfloat_convert let mayRaiseFPException = 1; } -multiclass sve_bfloat_convert { +multiclass sve_bfloat_convert { def NAME : sve_bfloat_convert; + def : SVE_3_Op_Pat(NAME)>; + def : SVE_1_Op_Passthru_Round_Pat(NAME)>; + def : SVE_1_Op_Passthru_Round_Pat(NAME)>; } //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h index aef9a972e96679..8a4b5674ebcb00 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -409,9 +409,6 @@ extern char &SIModeRegisterID; void initializeAMDGPUInsertDelayAluPass(PassRegistry &); extern char &AMDGPUInsertDelayAluID; -void initializeAMDGPUInsertSingleUseVDSTPass(PassRegistry &); -extern char &AMDGPUInsertSingleUseVDSTID; - void initializeSIInsertHardClausesPass(PassRegistry &); extern char &SIInsertHardClausesID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td index 919e698e76b33b..3626fd8bc78c15 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPU.td +++ b/llvm/lib/Target/AMDGPU/AMDGPU.td @@ -929,12 +929,6 @@ def FeatureSALUFloatInsts : SubtargetFeature<"salu-float", "Has SALU floating point instructions" >; -def FeatureVGPRSingleUseHintInsts : SubtargetFeature<"vgpr-singleuse-hint", - "HasVGPRSingleUseHintInsts", - "true", - "Has single-use VGPR hint instructions" ->; - def FeaturePseudoScalarTrans : SubtargetFeature<"pseudo-scalar-trans", "HasPseudoScalarTrans", "true", @@ -1615,14 +1609,12 @@ def FeatureISAVersion11_5_0 : FeatureSet< !listconcat(FeatureISAVersion11_Common.Features, [FeatureSALUFloatInsts, FeatureDPPSrc1SGPR, - FeatureVGPRSingleUseHintInsts, FeatureRequiredExportPriority])>; def FeatureISAVersion11_5_1 : FeatureSet< !listconcat(FeatureISAVersion11_Common.Features, [FeatureSALUFloatInsts, FeatureDPPSrc1SGPR, - FeatureVGPRSingleUseHintInsts, Feature1_5xVGPRs, FeatureRequiredExportPriority])>; @@ -1630,7 +1622,6 @@ def FeatureISAVersion11_5_2 : FeatureSet< !listconcat(FeatureISAVersion11_Common.Features, [FeatureSALUFloatInsts, FeatureDPPSrc1SGPR, - FeatureVGPRSingleUseHintInsts, FeatureRequiredExportPriority])>; def FeatureISAVersion12 : FeatureSet< @@ -1663,7 +1654,6 @@ def FeatureISAVersion12 : FeatureSet< FeatureSALUFloatInsts, FeaturePseudoScalarTrans, FeatureHasRestrictedSOffset, - FeatureVGPRSingleUseHintInsts, FeatureScalarDwordx3Loads, FeatureDPPSrc1SGPR, FeatureMaxHardClauseLength32, @@ -2271,9 +2261,6 @@ def HasNotMADIntraFwdBug : Predicate<"!Subtarget->hasMADIntraFwdBug()">; def HasSALUFloatInsts : Predicate<"Subtarget->hasSALUFloatInsts()">, AssemblerPredicate<(all_of FeatureSALUFloatInsts)>; -def HasVGPRSingleUseHintInsts : Predicate<"Subtarget->hasVGPRSingleUseHintInsts()">, - AssemblerPredicate<(all_of FeatureVGPRSingleUseHintInsts)>; - def HasPseudoScalarTrans : Predicate<"Subtarget->hasPseudoScalarTrans()">, AssemblerPredicate<(all_of FeaturePseudoScalarTrans)>; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h index 
2e02bb4271adc7..06b2f181c276cd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h +++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h @@ -77,6 +77,8 @@ struct ArgDescriptor { } unsigned getMask() const { + // None of the target SGPRs or VGPRs are expected to have a 'zero' mask. + assert(Mask && "Invalid mask."); return Mask; } diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp deleted file mode 100644 index 43b3bf43fe56db..00000000000000 --- a/llvm/lib/Target/AMDGPU/AMDGPUInsertSingleUseVDST.cpp +++ /dev/null @@ -1,245 +0,0 @@ -//===- AMDGPUInsertSingleUseVDST.cpp - Insert s_singleuse_vdst instructions ==// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file -/// Insert s_singleuse_vdst instructions on GFX11.5+ to mark regions of VALU -/// instructions that produce single-use VGPR values. If the value is forwarded -/// to the consumer instruction prior to VGPR writeback, the hardware can -/// then skip (kill) the VGPR write. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "AMDGPUGenSearchableTables.inc" -#include "GCNSubtarget.h" -#include "SIInstrInfo.h" -#include "SIRegisterInfo.h" -#include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/ADT/SmallVector.h" -#include "llvm/ADT/StringRef.h" -#include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/MachineInstr.h" -#include "llvm/CodeGen/MachineInstrBuilder.h" -#include "llvm/CodeGen/MachineOperand.h" -#include "llvm/CodeGen/Register.h" -#include "llvm/IR/DebugLoc.h" -#include "llvm/MC/MCRegister.h" -#include "llvm/MC/MCRegisterInfo.h" -#include "llvm/Pass.h" -#include - -using namespace llvm; - -#define DEBUG_TYPE "amdgpu-insert-single-use-vdst" - -namespace { -class AMDGPUInsertSingleUseVDST : public MachineFunctionPass { -private: - const SIInstrInfo *SII; - class SingleUseInstruction { - private: - static const unsigned MaxSkipRange = 0b111; - static const unsigned MaxNumberOfSkipRegions = 2; - - unsigned LastEncodedPositionEnd; - MachineInstr *ProducerInstr; - - std::array SingleUseRegions; - SmallVector SkipRegions; - - // Adds a skip region into the instruction. - void skip(const unsigned ProducerPosition) { - while (LastEncodedPositionEnd + MaxSkipRange < ProducerPosition) { - SkipRegions.push_back(MaxSkipRange); - LastEncodedPositionEnd += MaxSkipRange; - } - SkipRegions.push_back(ProducerPosition - LastEncodedPositionEnd); - LastEncodedPositionEnd = ProducerPosition; - } - - bool currentRegionHasSpace() { - const auto Region = SkipRegions.size(); - // The first region has an extra bit of encoding space. - return SingleUseRegions[Region] < - ((Region == MaxNumberOfSkipRegions) ? 0b1111U : 0b111U); - } - - unsigned encodeImm() { - // Handle the first Single Use Region separately as it has an extra bit - // of encoding space. 
- unsigned Imm = SingleUseRegions[SkipRegions.size()]; - unsigned ShiftAmount = 4; - for (unsigned i = SkipRegions.size(); i > 0; i--) { - Imm |= SkipRegions[i - 1] << ShiftAmount; - ShiftAmount += 3; - Imm |= SingleUseRegions[i - 1] << ShiftAmount; - ShiftAmount += 3; - } - return Imm; - } - - public: - SingleUseInstruction(const unsigned ProducerPosition, - MachineInstr *Producer) - : LastEncodedPositionEnd(ProducerPosition + 1), ProducerInstr(Producer), - SingleUseRegions({1, 0, 0}) {} - - // Returns false if adding a new single use producer failed. This happens - // because it could not be encoded, either because there is no room to - // encode another single use producer region or that this single use - // producer is too far away to encode the amount of instructions to skip. - bool tryAddProducer(const unsigned ProducerPosition, MachineInstr *MI) { - // Producer is too far away to encode into this instruction or another - // skip region is needed and SkipRegions.size() = 2 so there's no room for - // another skip region, therefore a new instruction is needed. - if (LastEncodedPositionEnd + - (MaxSkipRange * (MaxNumberOfSkipRegions - SkipRegions.size())) < - ProducerPosition) - return false; - - // If a skip region is needed. - if (LastEncodedPositionEnd != ProducerPosition || - !currentRegionHasSpace()) { - // If the current region is out of space therefore a skip region would - // be needed, but there is no room for another skip region. - if (SkipRegions.size() == MaxNumberOfSkipRegions) - return false; - skip(ProducerPosition); - } - - SingleUseRegions[SkipRegions.size()]++; - LastEncodedPositionEnd = ProducerPosition + 1; - ProducerInstr = MI; - return true; - } - - auto emit(const SIInstrInfo *SII) { - return BuildMI(*ProducerInstr->getParent(), ProducerInstr, DebugLoc(), - SII->get(AMDGPU::S_SINGLEUSE_VDST)) - .addImm(encodeImm()); - } - }; - -public: - static char ID; - - AMDGPUInsertSingleUseVDST() : MachineFunctionPass(ID) {} - - void insertSingleUseInstructions( - ArrayRef> SingleUseProducers) const { - SmallVector Instructions; - - for (auto &[Position, MI] : SingleUseProducers) { - // Encode this position into the last single use instruction if possible. - if (Instructions.empty() || - !Instructions.back().tryAddProducer(Position, MI)) { - // If not, add a new instruction. - Instructions.push_back(SingleUseInstruction(Position, MI)); - } - } - - for (auto &Instruction : Instructions) - Instruction.emit(SII); - } - - bool runOnMachineFunction(MachineFunction &MF) override { - const auto &ST = MF.getSubtarget(); - if (!ST.hasVGPRSingleUseHintInsts()) - return false; - - SII = ST.getInstrInfo(); - const auto *TRI = &SII->getRegisterInfo(); - bool InstructionEmitted = false; - - for (MachineBasicBlock &MBB : MF) { - DenseMap RegisterUseCount; - - // Handle boundaries at the end of basic block separately to avoid - // false positives. If they are live at the end of a basic block then - // assume it has more uses later on. - for (const auto &Liveout : MBB.liveouts()) { - for (MCRegUnitMaskIterator Units(Liveout.PhysReg, TRI); Units.isValid(); - ++Units) { - const auto [Unit, Mask] = *Units; - if ((Mask & Liveout.LaneMask).any()) - RegisterUseCount[Unit] = 2; - } - } - - SmallVector> - SingleUseProducerPositions; - - unsigned VALUInstrCount = 0; - for (MachineInstr &MI : reverse(MBB.instrs())) { - // All registers in all operands need to be single use for an - // instruction to be marked as a single use producer. 
- bool AllProducerOperandsAreSingleUse = true; - - // Gather a list of Registers used before updating use counts to avoid - // double counting registers that appear multiple times in a single - // MachineInstr. - SmallVector RegistersUsed; - - for (const auto &Operand : MI.all_defs()) { - const auto Reg = Operand.getReg(); - - const auto RegUnits = TRI->regunits(Reg); - if (any_of(RegUnits, [&RegisterUseCount](const MCRegUnit Unit) { - return RegisterUseCount[Unit] > 1; - })) - AllProducerOperandsAreSingleUse = false; - - // Reset uses count when a register is no longer live. - for (const MCRegUnit Unit : RegUnits) - RegisterUseCount.erase(Unit); - } - - for (const auto &Operand : MI.all_uses()) { - const auto Reg = Operand.getReg(); - - // Count the number of times each register is read. - for (const MCRegUnit Unit : TRI->regunits(Reg)) { - if (!is_contained(RegistersUsed, Unit)) - RegistersUsed.push_back(Unit); - } - } - for (const MCRegUnit Unit : RegistersUsed) - RegisterUseCount[Unit]++; - - // Do not attempt to optimise across exec mask changes. - if (MI.modifiesRegister(AMDGPU::EXEC, TRI) || - AMDGPU::isInvalidSingleUseConsumerInst(MI.getOpcode())) { - for (auto &UsedReg : RegisterUseCount) - UsedReg.second = 2; - } - - if (!SIInstrInfo::isVALU(MI) || - AMDGPU::isInvalidSingleUseProducerInst(MI.getOpcode())) - continue; - if (AllProducerOperandsAreSingleUse) { - SingleUseProducerPositions.push_back({VALUInstrCount, &MI}); - InstructionEmitted = true; - } - VALUInstrCount++; - } - insertSingleUseInstructions(SingleUseProducerPositions); - } - return InstructionEmitted; - } -}; -} // namespace - -char AMDGPUInsertSingleUseVDST::ID = 0; - -char &llvm::AMDGPUInsertSingleUseVDSTID = AMDGPUInsertSingleUseVDST::ID; - -INITIALIZE_PASS(AMDGPUInsertSingleUseVDST, DEBUG_TYPE, - "AMDGPU Insert SingleUseVDST", false, false) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index fdef24e5c6f171..aa0c2446a437dd 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -311,12 +311,6 @@ static cl::opt EnableSIModeRegisterPass( cl::init(true), cl::Hidden); -// Enable GFX11.5+ s_singleuse_vdst insertion -static cl::opt - EnableInsertSingleUseVDST("amdgpu-enable-single-use-vdst", - cl::desc("Enable s_singleuse_vdst insertion"), - cl::init(false), cl::Hidden); - // Enable GFX11+ s_delay_alu insertion static cl::opt EnableInsertDelayAlu("amdgpu-enable-delay-alu", @@ -450,7 +444,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() { initializeAMDGPURewriteUndefForPHILegacyPass(*PR); initializeAMDGPUUnifyMetadataPass(*PR); initializeSIAnnotateControlFlowLegacyPass(*PR); - initializeAMDGPUInsertSingleUseVDSTPass(*PR); initializeAMDGPUInsertDelayAluPass(*PR); initializeSIInsertHardClausesPass(*PR); initializeSIInsertWaitcntsPass(*PR); @@ -1553,9 +1546,6 @@ void GCNPassConfig::addPreEmitPass() { // cases. 
addPass(&PostRAHazardRecognizerID); - if (isPassEnabled(EnableInsertSingleUseVDST, CodeGenOptLevel::Less)) - addPass(&AMDGPUInsertSingleUseVDSTID); - if (isPassEnabled(EnableInsertDelayAlu, CodeGenOptLevel::Less)) addPass(&AMDGPUInsertDelayAluID); diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt index 4e1fa525f7c17d..ac5f8279c3b96a 100644 --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -82,7 +82,6 @@ add_llvm_target(AMDGPUCodeGen AMDGPUMCInstLower.cpp AMDGPUMemoryUtils.cpp AMDGPUIGroupLP.cpp - AMDGPUInsertSingleUseVDST.cpp AMDGPUMarkLastScratchLoad.cpp AMDGPUMIRFormatter.cpp AMDGPUOpenCLEnqueuedBlockLowering.cpp diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h index a4ae8a1be32258..e6b7342d5fffcf 100644 --- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h +++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h @@ -215,7 +215,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool HasPackedTID = false; bool ScalarizeGlobal = false; bool HasSALUFloatInsts = false; - bool HasVGPRSingleUseHintInsts = false; bool HasPseudoScalarTrans = false; bool HasRestrictedSOffset = false; @@ -1280,8 +1279,6 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo, bool hasSALUFloatInsts() const { return HasSALUFloatInsts; } - bool hasVGPRSingleUseHintInsts() const { return HasVGPRSingleUseHintInsts; } - bool hasPseudoScalarTrans() const { return HasPseudoScalarTrans; } bool hasRestrictedSOffset() const { return HasRestrictedSOffset; } diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp index 434336ef137ff5..46f5097c679fb3 100644 --- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp +++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp @@ -92,12 +92,12 @@ class V2SCopyInfo { SetVector SChain; // Number of SGPR to VGPR copies that are used to put the SALU computation // results back to VALU. - unsigned NumSVCopies; + unsigned NumSVCopies = 0; - unsigned Score; + unsigned Score = 0; // Actual count of v_readfirstlane_b32 // which need to be inserted to keep SChain SALU - unsigned NumReadfirstlanes; + unsigned NumReadfirstlanes = 0; // Current score state. To speedup selection V2SCopyInfos for processing bool NeedToBeConvertedToVALU = false; // Unique ID. Used as a key for mapping to keep permanent order. @@ -109,7 +109,7 @@ class V2SCopyInfo { SetVector Siblings; V2SCopyInfo() : Copy(nullptr), ID(0){}; V2SCopyInfo(unsigned Id, MachineInstr *C, unsigned Width) - : Copy(C), NumSVCopies(0), NumReadfirstlanes(Width / 32), ID(Id){}; + : Copy(C), NumReadfirstlanes(Width / 32), ID(Id){}; #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void dump() { dbgs() << ID << " : " << *Copy << "\n\tS:" << SChain.size() diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index 6be2e8fb7bc7e3..8b3da646c45f30 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -1683,10 +1683,15 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized( // the debug value instructions. We should instead, update it with the // correct register value. 
But not sure the register value alone is for (MachineInstr &MI : MBB) { - if (MI.isDebugValue() && MI.getOperand(0).isFI() && - !MFI.isFixedObjectIndex(MI.getOperand(0).getIndex()) && - SpillFIs[MI.getOperand(0).getIndex()]) { - MI.getOperand(0).ChangeToRegister(Register(), false /*isDef*/); + if (MI.isDebugValue()) { + uint32_t StackOperandIdx = MI.isDebugValueList() ? 2 : 0; + if (MI.getOperand(StackOperandIdx).isFI() && + !MFI.isFixedObjectIndex( + MI.getOperand(StackOperandIdx).getIndex()) && + SpillFIs[MI.getOperand(StackOperandIdx).getIndex()]) { + MI.getOperand(StackOperandIdx) + .ChangeToRegister(Register(), false /*isDef*/); + } } // FIXME: Need to update expression to locate lane of VGPR to which // the SGPR was spilled. diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index 6d37d781829ecf..bbcd02c6925b4c 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -9842,6 +9842,9 @@ bool SIInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg, else return false; + // A valid Mask is required to have a single bit set, hence a non-zero and + // power-of-two value. This verifies that we will not do 64-bit shift below. + assert(llvm::has_single_bit(Mask) && "Invalid mask."); unsigned BitNo = llvm::countr_zero((uint64_t)Mask); if (IsSigned && BitNo == SrcSize - 1) return false; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index c016be2fc6c0fb..087ca1f954464d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2409,8 +2409,6 @@ class VOPProfile _ArgVT, bit _EnableClamp = 0> { field bit EnableClamp = _EnableClamp; field bit IsTrue16 = 0; field bit IsRealTrue16 = 0; - field bit IsInvalidSingleUseConsumer = 0; - field bit IsInvalidSingleUseProducer = 0; field ValueType DstVT = ArgVT[0]; field ValueType Src0VT = ArgVT[1]; diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp index 201836d12564e5..8c876a3ce82112 100644 --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -414,10 +414,15 @@ bool SILowerSGPRSpills::run(MachineFunction &MF) { // correct register value. But not sure the register value alone is // adequate to lower the DIExpression. It should be worked out later. for (MachineInstr &MI : MBB) { - if (MI.isDebugValue() && MI.getOperand(0).isFI() && - !MFI.isFixedObjectIndex(MI.getOperand(0).getIndex()) && - SpillFIs[MI.getOperand(0).getIndex()]) { - MI.getOperand(0).ChangeToRegister(Register(), false /*isDef*/); + if (MI.isDebugValue()) { + uint32_t StackOperandIdx = MI.isDebugValueList() ? 2 : 0; + if (MI.getOperand(StackOperandIdx).isFI() && + !MFI.isFixedObjectIndex( + MI.getOperand(StackOperandIdx).getIndex()) && + SpillFIs[MI.getOperand(StackOperandIdx).getIndex()]) { + MI.getOperand(StackOperandIdx) + .ChangeToRegister(Register(), false /*isDef*/); + } } // FIXME: Need to update expression to locate lane of VGPR to which the // SGPR was spilled. 
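Both debug-value hunks above apply the same operand-index rule: for `DBG_VALUE` the location is operand 0, while for `DBG_VALUE_LIST` operands 0 and 1 hold the variable and the `DIExpression`, so locations (and thus frame indices) start at operand 2. A schematic of the layout assumed here (illustration only):

```cpp
// DBG_VALUE      <loc>, <offset/$noreg>, !DIVariable, !DIExpression
//                 ^-- a frame index, if present, is operand 0
// DBG_VALUE_LIST !DIVariable, !DIExpression, <loc0>, <loc1>, ...
//                 ^-- locations begin at operand 2
```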
diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.h b/llvm/lib/Target/AMDGPU/SIMachineScheduler.h index ac34a748edbc1e..f8f4b5aae338eb 100644 --- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.h +++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.h @@ -120,8 +120,8 @@ class SIScheduleBlock { ArrayRef> getSuccs() const { return Succs; } - unsigned Height; // Maximum topdown path length to block without outputs - unsigned Depth; // Maximum bottomup path length to block without inputs + unsigned Height = 0; // Maximum topdown path length to block without outputs + unsigned Depth = 0; // Maximum bottomup path length to block without inputs unsigned getNumHighLatencySuccessors() const { return NumHighLatencySuccessors; diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td index 2e73a1a15f6b32..9da27a7c7ee7d6 100644 --- a/llvm/lib/Target/AMDGPU/SOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td @@ -1752,11 +1752,6 @@ let OtherPredicates = [HasExportInsts] in "$simm16">; } // End SubtargetPredicate = isGFX11Plus -let SubtargetPredicate = HasVGPRSingleUseHintInsts in { - def S_SINGLEUSE_VDST : - SOPP_Pseudo<"s_singleuse_vdst", (ins s16imm:$simm16), "$simm16">; -} // End SubtargetPredicate = HasVGPRSingeUseHintInsts - let SubtargetPredicate = isGFX12Plus, hasSideEffects = 1 in { def S_WAIT_LOADCNT : SOPP_Pseudo<"s_wait_loadcnt", (ins s16imm:$simm16), "$simm16", @@ -2676,12 +2671,6 @@ defm S_ICACHE_INV : SOPP_Real_32_gfx11_gfx12<0x03c>; defm S_BARRIER : SOPP_Real_32_gfx11<0x03d>; -//===----------------------------------------------------------------------===// -// SOPP - GFX1150, GFX12. -//===----------------------------------------------------------------------===// - -defm S_SINGLEUSE_VDST : SOPP_Real_32_gfx11_gfx12<0x013>; - //===----------------------------------------------------------------------===// // SOPP - GFX6, GFX7, GFX8, GFX9, GFX10 //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp index 8b5ec8793d84a2..f32c82f1e4ba4c 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -379,12 +379,6 @@ struct VOPTrue16Info { bool IsTrue16; }; -struct SingleUseExceptionInfo { - uint16_t Opcode; - bool IsInvalidSingleUseConsumer; - bool IsInvalidSingleUseProducer; -}; - struct FP8DstByteSelInfo { uint16_t Opcode; bool HasFP8DstByteSel; @@ -396,8 +390,6 @@ struct FP8DstByteSelInfo { #define GET_MTBUFInfoTable_IMPL #define GET_MUBUFInfoTable_DECL #define GET_MUBUFInfoTable_IMPL -#define GET_SingleUseExceptionTable_DECL -#define GET_SingleUseExceptionTable_IMPL #define GET_SMInfoTable_DECL #define GET_SMInfoTable_IMPL #define GET_VOP1InfoTable_DECL @@ -626,16 +618,6 @@ bool isTrue16Inst(unsigned Opc) { return Info ? Info->IsTrue16 : false; } -bool isInvalidSingleUseConsumerInst(unsigned Opc) { - const SingleUseExceptionInfo *Info = getSingleUseExceptionHelper(Opc); - return Info && Info->IsInvalidSingleUseConsumer; -} - -bool isInvalidSingleUseProducerInst(unsigned Opc) { - const SingleUseExceptionInfo *Info = getSingleUseExceptionHelper(Opc); - return Info && Info->IsInvalidSingleUseProducer; -} - bool isFP8DstSelInst(unsigned Opc) { const FP8DstByteSelInfo *Info = getFP8DstByteSelHelper(Opc); return Info ? 
Info->HasFP8DstByteSel : false; diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h index 35c080d8e0bebc..da37534f2fa4ff 100644 --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -870,6 +870,8 @@ bool isInvalidSingleUseConsumerInst(unsigned Opc); LLVM_READONLY bool isInvalidSingleUseProducerInst(unsigned Opc); +bool isDPMACCInstruction(unsigned Opc); + LLVM_READONLY unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc); diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td index 33f2f9f1f5c5b9..bd805059705783 100644 --- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td @@ -252,7 +252,6 @@ def VOP_READFIRSTLANE : VOPProfile <[i32, i32, untyped, untyped]> { def V_READFIRSTLANE_B32 : VOP1_Pseudo <"v_readfirstlane_b32", VOP_READFIRSTLANE, [], 1> { let isConvergent = 1; - let IsInvalidSingleUseConsumer = 1; } foreach vt = Reg32Types.types in { @@ -375,7 +374,6 @@ defm V_CLREXCP : VOP1Inst <"v_clrexcp", VOP_NO_EXT>; def VOP_MOVRELS : VOPProfile<[i32, i32, untyped, untyped]> { let Src0RC32 = VRegSrc_32; let Src0RC64 = VRegSrc_32; - let IsInvalidSingleUseConsumer = 1; } // Special case because there are no true output operands. Hack vdst @@ -419,12 +417,8 @@ class VOP_MOVREL : VOPProfile<[untyped, i32, untyped, un let EmitDst = 1; // force vdst emission } -let IsInvalidSingleUseProducer = 1 in { - def VOP_MOVRELD : VOP_MOVREL; - def VOP_MOVRELSD : VOP_MOVREL { - let IsInvalidSingleUseConsumer = 1; - } -} +def VOP_MOVRELD : VOP_MOVREL; +def VOP_MOVRELSD : VOP_MOVREL; let SubtargetPredicate = HasMovrel, Uses = [M0, EXEC] in { // v_movreld_b32 is a special case because the destination output @@ -541,7 +535,6 @@ let SubtargetPredicate = isGFX9Plus in { let Constraints = "$vdst = $src1, $vdst1 = $src0"; let DisableEncoding = "$vdst1,$src1"; let SchedRW = [Write64Bit, Write64Bit]; - let IsInvalidSingleUseConsumer = 1; } let isReMaterializable = 1 in @@ -708,8 +701,6 @@ let SubtargetPredicate = isGFX10Plus in { let Constraints = "$vdst = $src1, $vdst1 = $src0"; let DisableEncoding = "$vdst1,$src1"; let SchedRW = [Write64Bit, Write64Bit]; - let IsInvalidSingleUseConsumer = 1; - let IsInvalidSingleUseProducer = 1; } } // End Uses = [M0] } // End SubtargetPredicate = isGFX10Plus @@ -743,10 +734,7 @@ let SubtargetPredicate = isGFX11Plus in { } // Restrict src0 to be VGPR def V_PERMLANE64_B32 : VOP1_Pseudo<"v_permlane64_b32", VOP_MOVRELS, - [], /*VOP1Only=*/ 1> { - let IsInvalidSingleUseConsumer = 1; - let IsInvalidSingleUseProducer = 1; - } + [], /*VOP1Only=*/ 1>; defm V_MOV_B16 : VOP1Inst_t16<"v_mov_b16", VOP_I16_I16>; defm V_NOT_B16 : VOP1Inst_t16<"v_not_b16", VOP_I16_I16>; defm V_CVT_I32_I16 : VOP1Inst_t16<"v_cvt_i32_i16", VOP_I32_I16>; diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index dd48607402eb0b..52f7be3b4577df 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -788,12 +788,10 @@ defm V_SUBREV_U32 : VOP2Inst <"v_subrev_u32", VOP_I32_I32_I32_ARITH, null_frag, } // End isCommutable = 1 // These are special and do not read the exec mask. 
-let isConvergent = 1, Uses = [], IsInvalidSingleUseConsumer = 1 in { +let isConvergent = 1, Uses = [] in { def V_READLANE_B32 : VOP2_Pseudo<"v_readlane_b32", VOP_READLANE, []>; let IsNeverUniform = 1, Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in { -def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, []> { - let IsInvalidSingleUseProducer = 1; - } +def V_WRITELANE_B32 : VOP2_Pseudo<"v_writelane_b32", VOP_WRITELANE, []>; } // End IsNeverUniform, $vdst = $vdst_in, DisableEncoding $vdst_in } // End isConvergent = 1 diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 466114b95f9f90..20beb41b7b58bb 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -157,12 +157,12 @@ defm V_MAX_F64 : VOP3Inst <"v_max_f64", VOP3_Profile, fmaxnum_l } // End SubtargetPredicate = isNotGFX12Plus } // End SchedRW = [WriteDoubleAdd] -let SchedRW = [WriteIntMul], IsInvalidSingleUseConsumer = 1 in { +let SchedRW = [WriteIntMul] in { defm V_MUL_LO_U32 : VOP3Inst <"v_mul_lo_u32", V_MUL_PROF, DivergentBinFrag>; defm V_MUL_HI_U32 : VOP3Inst <"v_mul_hi_u32", V_MUL_PROF, mulhu>; defm V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", V_MUL_PROF>; defm V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", V_MUL_PROF, mulhs>; -} // End SchedRW = [WriteIntMul], IsInvalidSingleUseConsumer = 1 +} // End SchedRW = [WriteIntMul] let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in { defm V_MINIMUM_F32 : VOP3Inst <"v_minimum_f32", VOP3_Profile, DivergentBinFrag>; @@ -260,9 +260,9 @@ let mayRaiseFPException = 0 in { // Seems suspicious but manual doesn't say it d let isReMaterializable = 1 in defm V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile>; -let Constraints = "@earlyclobber $vdst", IsInvalidSingleUseConsumer = 1 in { +let Constraints = "@earlyclobber $vdst" in { defm V_MQSAD_PK_U16_U8 : VOP3Inst <"v_mqsad_pk_u16_u8", VOP3_Profile>; -} // End Constraints = "@earlyclobber $vdst", IsInvalidSingleUseConsumer = 1 +} // End Constraints = "@earlyclobber $vdst" let isReMaterializable = 1 in { @@ -277,16 +277,14 @@ let SchedRW = [Write64Bit] in { defm V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile, csra_64>; } // End SubtargetPredicate = isGFX6GFX7 - let IsInvalidSingleUseConsumer = 1 in { let SubtargetPredicate = isGFX8Plus in { defm V_LSHRREV_B64 : VOP3Inst <"v_lshrrev_b64", VOP3_Profile, clshr_rev_64>; defm V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile, cashr_rev_64>; - } // End SubtargetPredicate = isGFX8Plus, , IsInvalidSingleUseConsumer = 1 + } // End SubtargetPredicate = isGFX8Plus let SubtargetPredicate = isGFX8GFX9GFX10GFX11 in { defm V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile, clshl_rev_64>; } // End SubtargetPredicate = isGFX8GFX9GFX10GFX11 - } // End IsInvalidSingleUseConsumer = 1 } // End SchedRW = [Write64Bit] } // End isReMaterializable = 1 @@ -311,14 +309,14 @@ def VOPProfileMQSAD : VOP3_Profile { let HasModifiers = 0; } -let SubtargetPredicate = isGFX7Plus, IsInvalidSingleUseConsumer = 1 in { +let SubtargetPredicate = isGFX7Plus in { let Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] in { defm V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile>; defm V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOPProfileMQSAD>; } // End Constraints = "@earlyclobber $vdst", SchedRW = [WriteQuarterRate32] -} // End SubtargetPredicate = isGFX7Plus, IsInvalidSingleUseConsumer = 1 +} // End SubtargetPredicate = isGFX7Plus -let isCommutable = 1, SchedRW = 
[WriteIntMul, WriteSALU], IsInvalidSingleUseConsumer = 1 in { +let isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU] in { let SubtargetPredicate = isGFX7Plus, OtherPredicates = [HasNotMADIntraFwdBug] in { defm V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>; defm V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>; @@ -328,7 +326,7 @@ let isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU], IsInvalidSingleUseCons defm V_MAD_U64_U32_gfx11 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>; defm V_MAD_I64_I32_gfx11 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>; } -} // End isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU], IsInvalidSingleUseConsumer = 1 +} // End isCommutable = 1, SchedRW = [WriteIntMul, WriteSALU] let FPDPRounding = 1 in { @@ -865,10 +863,10 @@ let SubtargetPredicate = isGFX10Plus in { } // End isCommutable = 1, isReMaterializable = 1 def : ThreeOp_i32_Pats; - let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in", IsInvalidSingleUseConsumer = 1, IsInvalidSingleUseProducer = 1 in { + let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in { defm V_PERMLANE16_B32 : VOP3Inst<"v_permlane16_b32", VOP3_PERMLANE_Profile>; defm V_PERMLANEX16_B32 : VOP3Inst<"v_permlanex16_b32", VOP3_PERMLANE_Profile>; - } // End $vdst = $vdst_in, DisableEncoding $vdst_in, IsInvalidSingleUseConsumer = 1, IsInvalidSingleUseProducer = 1 + } // End $vdst = $vdst_in, DisableEncoding $vdst_in foreach vt = Reg32Types.types in { def : PermlanePat; @@ -1286,12 +1284,11 @@ let AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" in { } } // End AssemblerPredicate = isGFX10Only, DecoderNamespace = "GFX10" -let IsInvalidSingleUseConsumer = 1 in { - defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx10<0x360>; - let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in), IsInvalidSingleUseProducer = 1 in { - defm V_WRITELANE_B32 : VOP3_Real_No_Suffix_gfx10<0x361>; - } // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32: $src1, VGPR_32:$vdst_in), IsInvalidSingleUseProducer = 1 -} // End IsInvalidSingleUseConsumer = 1 +defm V_READLANE_B32 : VOP3_Real_No_Suffix_gfx10<0x360>; + +let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in { + defm V_WRITELANE_B32 : VOP3_Real_No_Suffix_gfx10<0x361>; +} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) let SubtargetPredicate = isGFX10Before1030 in { defm V_MUL_LO_I32 : VOP3_Real_gfx10<0x16b>; diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td index f4d2c29158f49f..5eee71887964ad 100644 --- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td @@ -382,19 +382,15 @@ defm V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", AMDGPUfdot2, 1/*ExplicitClamp*/>; let OtherPredicates = [HasDot7Insts] in { -let IsInvalidSingleUseConsumer = 1 in { - defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", - VOP3P_Profile, int_amdgcn_udot4, 1>; -} +defm V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", + VOP3P_Profile, int_amdgcn_udot4, 1>; defm V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", VOP3P_Profile, int_amdgcn_udot8, 1>; } // End OtherPredicates = [HasDot7Insts] let OtherPredicates = [HasDot1Insts] in { -let IsInvalidSingleUseConsumer = 1 in { - defm V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", - VOP3P_Profile, int_amdgcn_sdot4, 1>; -} +defm V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", + VOP3P_Profile, int_amdgcn_sdot4, 1>; defm V_DOT8_I32_I4 : 
VOP3PInst<"v_dot8_i32_i4", VOP3P_Profile, int_amdgcn_sdot8, 1>; } // End OtherPredicates = [HasDot1Insts] diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td index be862b44917e15..d6e08dce130ced 100644 --- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td @@ -464,10 +464,9 @@ multiclass VOPC_I16 : VOPC_Pseudos ; -let IsInvalidSingleUseConsumer = 1 in { - multiclass VOPC_I64 : - VOPC_Pseudos ; -} +multiclass VOPC_I64 : + VOPC_Pseudos ; + multiclass VOPCX_F16 { let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in { @@ -502,10 +501,8 @@ multiclass VOPCX_I16 { multiclass VOPCX_I32 : VOPCX_Pseudos ; -let IsInvalidSingleUseConsumer = 1 in { - multiclass VOPCX_I64 : - VOPCX_Pseudos ; -} +multiclass VOPCX_I64 : + VOPCX_Pseudos ; //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 5a460ef0d42320..05a7d907d237ae 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -17,8 +17,6 @@ class LetDummies { bit isReMaterializable; bit isAsCheapAsAMove; bit FPDPRounding; - bit IsInvalidSingleUseConsumer; - bit IsInvalidSingleUseProducer; Predicate SubtargetPredicate; string Constraints; string DisableEncoding; @@ -67,8 +65,6 @@ class VOP_Pseudo (NAME); bit IsTrue16 = P.IsTrue16; - bit IsInvalidSingleUseConsumer = P.IsInvalidSingleUseConsumer; - bit IsInvalidSingleUseProducer = P.IsInvalidSingleUseProducer; VOPProfile Pfl = P; string AsmOperands; @@ -165,8 +161,6 @@ class VOP3P_Pseudo pattern = []> : class VOP_Real { Instruction Opcode = !cast(NAME); bit IsSingle = ps.Pfl.IsSingle; - bit IsInvalidSingleUseConsumer = ps.Pfl.IsInvalidSingleUseConsumer; - bit IsInvalidSingleUseProducer = ps.Pfl.IsInvalidSingleUseProducer; } class VOP3_Real : @@ -844,9 +838,6 @@ class VOP_DPP_Pseudo pattern=[], let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", ""); let DisableEncoding = !if(P.NumSrcArgs, P.TieRegDPP, ""); let DecoderNamespace = "GFX8"; - - let IsInvalidSingleUseConsumer = !not(VINTERP); - let IsInvalidSingleUseProducer = !not(VINTERP); } class VOP3_DPP_Pseudo : @@ -1714,13 +1705,4 @@ def VOPTrue16Table : GenericTable { let PrimaryKey = ["Opcode"]; let PrimaryKeyName = "getTrue16OpcodeHelper"; -} - -def SingleUseExceptionTable : GenericTable { - let FilterClass = "VOP_Pseudo"; - let CppTypeName = "SingleUseExceptionInfo"; - let Fields = ["Opcode", "IsInvalidSingleUseConsumer", "IsInvalidSingleUseProducer"]; - - let PrimaryKey = ["Opcode"]; - let PrimaryKeyName = "getSingleUseExceptionHelper"; -} +} \ No newline at end of file diff --git a/llvm/lib/Target/ARM/ARMFeatures.td b/llvm/lib/Target/ARM/ARMFeatures.td index 8b0ade54b46d3c..dc0e86c696f63a 100644 --- a/llvm/lib/Target/ARM/ARMFeatures.td +++ b/llvm/lib/Target/ARM/ARMFeatures.td @@ -375,6 +375,9 @@ def FeaturePref32BitThumb : SubtargetFeature<"32bit", "Prefers32BitThumb", "true def FeaturePrefLoopAlign32 : SubtargetFeature<"loop-align", "PrefLoopLogAlignment","2", "Prefer 32-bit alignment for loops">; +def FeaturePrefLoopAlign64 : SubtargetFeature<"loop-align-64", "PrefLoopLogAlignment","3", + "Prefer 64-bit alignment for loops">; + def FeatureMVEVectorCostFactor1 : SubtargetFeature<"mve1beat", "MVEVectorCostFactor", "4", "Model MVE instructions as a 1 beat per tick architecture">; diff --git a/llvm/lib/Target/ARM/ARMProcessors.td 
b/llvm/lib/Target/ARM/ARMProcessors.td index e4e122a0d1339b..a66a2c0b1981d8 100644 --- a/llvm/lib/Target/ARM/ARMProcessors.td +++ b/llvm/lib/Target/ARM/ARMProcessors.td @@ -344,6 +344,7 @@ def : ProcessorModel<"cortex-m4", CortexM4Model, [ARMv7em, def : ProcessorModel<"cortex-m7", CortexM7Model, [ARMv7em, ProcM7, FeatureFPARMv8_D16, + FeaturePrefLoopAlign64, FeatureUseMIPipeliner, FeatureUseMISched]>; @@ -385,6 +386,7 @@ def : ProcessorModel<"cortex-m85", CortexM85Model, [ARMv81mMainline, FeatureDSP, FeatureFPARMv8_D16, FeaturePACBTI, + FeaturePrefLoopAlign64, FeatureUseMISched, HasMVEFloatOps]>; diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index fcd46b5921c4de..05ba18bf8ebd88 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -2725,7 +2725,7 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base, } /// Similar to SelectAddrRegImm, except that the least significant 5 bits of -/// Offset shoule be all zeros. +/// Offset should be all zeros. bool RISCVDAGToDAGISel::SelectAddrRegImmLsb00000(SDValue Addr, SDValue &Base, SDValue &Offset) { if (SelectAddrFrameIndex(Addr, Base, Offset)) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 2aa89aca4c808d..b998a1eb11c300 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -4540,7 +4540,7 @@ static SDValue lowerScalarInsert(SDValue Scalar, SDValue VL, MVT VT, // t33: v8i8 = extract_subvector t11, Constant:i64<8> // a) t35: v8i8 = vector_shuffle<0,2,4,6,8,10,12,14> t34, t33 // b) t35: v8i8 = vector_shuffle<1,3,5,7,9,11,13,15> t34, t33 -// Returns {Src Vector, Even Elements} om success +// Returns {Src Vector, Even Elements} on success static bool isDeinterleaveShuffle(MVT VT, MVT ContainerVT, SDValue V1, SDValue V2, ArrayRef Mask, const RISCVSubtarget &Subtarget) { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td index fe7de9d7bc79aa..68182d238e7847 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoVPseudos.td @@ -232,7 +232,7 @@ class octuple_to_str { def VLOpFrag : PatFrag<(ops), (XLenVT (VLOp (XLenVT AVL:$vl)))>; // Output pattern for X0 used to represent VLMAX in the pseudo instructions. -// We can't use X0 register becuase the AVL operands use GPRNoX0. +// We can't use X0 register because the AVL operands use GPRNoX0. // This must be kept in sync with RISCV::VLMaxSentinel. 
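Bridging the comment above to the definition that follows: a minimal sketch of the invariant being described, assuming only that RISCV::VLMaxSentinel is the same all-ones XLenVT immediate (-1) that the VLMax output pattern below materializes.

```cpp
#include <cstdint>

// Hypothetical restatement of the invariant, not LLVM's actual code: an AVL
// operand equal to the sentinel means "use VLMAX", which vsetvli spells with
// the x0 source form. Ordinary AVL operands are constrained to GPRNoX0, so a
// real AVL register can never collide with the sentinel encoding.
constexpr int64_t VLMaxSentinel = -1; // must stay in sync with VLMax below

bool isVLMaxAVL(int64_t AVLImm) { return AVLImm == VLMaxSentinel; }
```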
def VLMax : OutPatFrag<(ops), (XLenVT -1)>; diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp index 1d08853faf582e..2f6b55b0d6023e 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp @@ -629,15 +629,19 @@ std::optional X86AsmBackend::getFixupKind(StringRef Name) const { const MCFixupKindInfo &X86AsmBackend::getFixupKindInfo(MCFixupKind Kind) const { const static MCFixupKindInfo Infos[X86::NumTargetFixupKinds] = { + // clang-format off {"reloc_riprel_4byte", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, {"reloc_riprel_4byte_movq_load", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, + {"reloc_riprel_4byte_movq_load_rex2", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, {"reloc_riprel_4byte_relax", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, {"reloc_riprel_4byte_relax_rex", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, + {"reloc_riprel_4byte_relax_rex2", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, {"reloc_signed_4byte", 0, 32, 0}, {"reloc_signed_4byte_relax", 0, 32, 0}, {"reloc_global_offset_table", 0, 32, 0}, {"reloc_global_offset_table8", 0, 64, 0}, {"reloc_branch_4byte_pcrel", 0, 32, MCFixupKindInfo::FKF_IsPCRel}, + // clang-format on }; // Fixup kinds from .reloc directive are like R_386_NONE/R_X86_64_NONE. They @@ -678,7 +682,9 @@ static unsigned getFixupKindSize(unsigned Kind) { case X86::reloc_riprel_4byte: case X86::reloc_riprel_4byte_relax: case X86::reloc_riprel_4byte_relax_rex: + case X86::reloc_riprel_4byte_relax_rex2: case X86::reloc_riprel_4byte_movq_load: + case X86::reloc_riprel_4byte_movq_load_rex2: case X86::reloc_signed_4byte: case X86::reloc_signed_4byte_relax: case X86::reloc_global_offset_table: diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp index 0b2efdfc16cc5d..90222278d1ad6f 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp @@ -74,7 +74,9 @@ static X86_64RelType getType64(MCFixupKind Kind, case X86::reloc_riprel_4byte: case X86::reloc_riprel_4byte_relax: case X86::reloc_riprel_4byte_relax_rex: + case X86::reloc_riprel_4byte_relax_rex2: case X86::reloc_riprel_4byte_movq_load: + case X86::reloc_riprel_4byte_movq_load_rex2: return RT64_32; case X86::reloc_branch_4byte_pcrel: Modifier = MCSymbolRefExpr::VK_PLT; @@ -205,7 +207,7 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc, case MCSymbolRefExpr::VK_GOTPCREL: checkIs32(Ctx, Loc, Type); // Older versions of ld.bfd/ld.gold/lld - // do not support GOTPCRELX/REX_GOTPCRELX, + // do not support GOTPCRELX/REX_GOTPCRELX/REX2_GOTPCRELX, // and we want to keep back-compatibility. 
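For context on the back-compatibility comment above: the *_GOTPCRELX relocation family licenses the linker to relax a GOT load of a locally binding symbol into a direct lea, skipping the GOT indirection. A hedged byte-level sketch of that rewrite, with opcode values from the x86 manual and the 0xD5 REX2 prefix from Intel APX; this is an illustration, not LLD's actual implementation:

```cpp
#include <cstdint>

// movq sym@GOTPCREL(%rip), %reg encodes as
//   REX.W 0x8B modrm(mod=00, rm=101) disp32    (classic prefix)
//   0xD5 <payload> 0x8B modrm(...) disp32      (REX2/APX prefix)
// R_X86_64_REX_GOTPCRELX / R_X86_64_REX2_GOTPCRELX tell the linker it may
// flip the opcode 0x8B (mov) to 0x8D (lea) and retarget disp32 at the
// symbol itself. Older ld.bfd/gold/lld predate these types, hence the
// plain R_X86_64_GOTPCREL fallback when relaxation is disabled.
bool relaxGotLoadToLea(uint8_t *op /* points at the opcode byte */) {
  if (op[0] != 0x8B)           // only the register-load form is eligible
    return false;
  if ((op[1] & 0xC7) != 0x05)  // require rip-relative addressing
    return false;
  op[0] = 0x8D;                // mov -> lea; caller rewrites disp32
  return true;
}
```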
if (!Ctx.getTargetOptions()->X86RelaxRelocations) return ELF::R_X86_64_GOTPCREL; @@ -217,6 +219,9 @@ static unsigned getRelocType64(MCContext &Ctx, SMLoc Loc, case X86::reloc_riprel_4byte_relax_rex: case X86::reloc_riprel_4byte_movq_load: return ELF::R_X86_64_REX_GOTPCRELX; + case X86::reloc_riprel_4byte_relax_rex2: + case X86::reloc_riprel_4byte_movq_load_rex2: + return ELF::R_X86_64_REX2_GOTPCRELX; } llvm_unreachable("unexpected relocation type!"); case MCSymbolRefExpr::VK_GOTPCREL_NORELAX: diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h b/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h index 2d5217115d07cb..29bb7eebae3f22 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h +++ b/llvm/lib/Target/X86/MCTargetDesc/X86FixupKinds.h @@ -16,10 +16,14 @@ namespace X86 { enum Fixups { reloc_riprel_4byte = FirstTargetFixupKind, // 32-bit rip-relative reloc_riprel_4byte_movq_load, // 32-bit rip-relative in movq + reloc_riprel_4byte_movq_load_rex2, // 32-bit rip-relative in movq + // with rex2 prefix reloc_riprel_4byte_relax, // 32-bit rip-relative in relaxable // instruction reloc_riprel_4byte_relax_rex, // 32-bit rip-relative in relaxable // instruction with rex prefix + reloc_riprel_4byte_relax_rex2, // 32-bit rip-relative in relaxable + // instruction with rex2 prefix reloc_signed_4byte, // 32-bit signed. Unlike FK_Data_4 // this will be sign extended at // runtime. diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp index 71d42863fd5857..206436191c2584 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp @@ -568,8 +568,10 @@ void X86MCCodeEmitter::emitImmediate(const MCOperand &DispOp, SMLoc Loc, if (FixupKind == FK_PCRel_4 || FixupKind == MCFixupKind(X86::reloc_riprel_4byte) || FixupKind == MCFixupKind(X86::reloc_riprel_4byte_movq_load) || + FixupKind == MCFixupKind(X86::reloc_riprel_4byte_movq_load_rex2) || FixupKind == MCFixupKind(X86::reloc_riprel_4byte_relax) || FixupKind == MCFixupKind(X86::reloc_riprel_4byte_relax_rex) || + FixupKind == MCFixupKind(X86::reloc_riprel_4byte_relax_rex2) || FixupKind == MCFixupKind(X86::reloc_branch_4byte_pcrel)) { ImmOffset -= 4; // If this is a pc-relative load off _GLOBAL_OFFSET_TABLE_: @@ -637,12 +639,11 @@ void X86MCCodeEmitter::emitMemModRMByte( default: return X86::reloc_riprel_4byte; case X86::MOV64rm: - // movq loads is a subset of reloc_riprel_4byte_relax_rex. It is a + // A movq load is a subset of reloc_riprel_4byte_relax_rex/rex2. It is a // special case because COFF and Mach-O don't support ELF's more - // flexible R_X86_64_REX_GOTPCRELX relaxation. - // TODO: Support new relocation for REX2. - assert(Kind == REX || Kind == REX2); - return X86::reloc_riprel_4byte_movq_load; + // flexible R_X86_64_REX_GOTPCRELX/R_X86_64_REX2_GOTPCRELX relaxation. + return Kind == REX2 ? X86::reloc_riprel_4byte_movq_load_rex2 + : X86::reloc_riprel_4byte_movq_load; case X86::ADC32rm: case X86::ADD32rm: case X86::AND32rm: @@ -665,11 +666,9 @@ case X86::SBB64rm: case X86::SUB64rm: case X86::XOR64rm: - // We haven't support relocation for REX2 prefix, so temporarily use REX - // relocation. - // TODO: Support new relocation for REX2. - return (Kind == REX || Kind == REX2) ? X86::reloc_riprel_4byte_relax_rex - : X86::reloc_riprel_4byte_relax; + return Kind == REX2 ? X86::reloc_riprel_4byte_relax_rex2 + : Kind == REX ?
X86::reloc_riprel_4byte_relax_rex + : X86::reloc_riprel_4byte_relax; } }(); diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp index ec95b1ffec387d..41ce5c9fcb82ad 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86MachObjectWriter.cpp @@ -66,8 +66,10 @@ class X86MachObjectWriter : public MCMachObjectTargetWriter { static bool isFixupKindRIPRel(unsigned Kind) { return Kind == X86::reloc_riprel_4byte || Kind == X86::reloc_riprel_4byte_movq_load || + Kind == X86::reloc_riprel_4byte_movq_load_rex2 || Kind == X86::reloc_riprel_4byte_relax || - Kind == X86::reloc_riprel_4byte_relax_rex; + Kind == X86::reloc_riprel_4byte_relax_rex || + Kind == X86::reloc_riprel_4byte_relax_rex2; } static unsigned getFixupKindLog2Size(unsigned Kind) { @@ -83,7 +85,9 @@ static unsigned getFixupKindLog2Size(unsigned Kind) { case X86::reloc_riprel_4byte: case X86::reloc_riprel_4byte_relax: case X86::reloc_riprel_4byte_relax_rex: + case X86::reloc_riprel_4byte_relax_rex2: case X86::reloc_riprel_4byte_movq_load: + case X86::reloc_riprel_4byte_movq_load_rex2: case X86::reloc_signed_4byte: case X86::reloc_signed_4byte_relax: case X86::reloc_branch_4byte_pcrel: diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp index 10fc176b59d8ab..7740500fb41830 100644 --- a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp +++ b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFObjectWriter.cpp @@ -66,8 +66,10 @@ unsigned X86WinCOFFObjectWriter::getRelocType(MCContext &Ctx, case FK_PCRel_4: case X86::reloc_riprel_4byte: case X86::reloc_riprel_4byte_movq_load: + case X86::reloc_riprel_4byte_movq_load_rex2: case X86::reloc_riprel_4byte_relax: case X86::reloc_riprel_4byte_relax_rex: + case X86::reloc_riprel_4byte_relax_rex2: case X86::reloc_branch_4byte_pcrel: return COFF::IMAGE_REL_AMD64_REL32; case FK_Data_4: diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index b2e5d727555327..020021e9d8abcb 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -43104,6 +43104,8 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode( case X86ISD::FMIN: case X86ISD::FMAXC: case X86ISD::FMINC: + case X86ISD::FRSQRT: + case X86ISD::FRCP: // Horizontal Ops. case X86ISD::HADD: case X86ISD::HSUB: diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 1d67773585d593..2f88b19a8d3902 100644 --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -638,6 +638,7 @@ bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI, if (!LI->isSimple() || !LI->hasOneUse() || LI->getParent() != SI->getParent()) return false; + BatchAAResults BAA(*AA); auto *T = LI->getType(); // Don't introduce calls to memcpy/memmove intrinsics out of thin air if // the corresponding libcalls are not available. @@ -647,19 +648,17 @@ bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI, (EnableMemCpyOptWithoutLibcalls || (TLI->has(LibFunc_memcpy) && TLI->has(LibFunc_memmove)))) { MemoryLocation LoadLoc = MemoryLocation::get(LI); - - // We use alias analysis to check if an instruction may store to - // the memory we load from in between the load and the store. 
If - // such an instruction is found, we try to promote there instead - // of at the store position. - // TODO: Can use MSSA for this. - Instruction *P = SI; - for (auto &I : make_range(++LI->getIterator(), SI->getIterator())) { - if (isModSet(AA->getModRefInfo(&I, LoadLoc))) { - P = &I; - break; - } - } + MemoryUseOrDef *LoadAccess = MSSA->getMemoryAccess(LI), + *StoreAccess = MSSA->getMemoryAccess(SI); + + // We use MSSA to check if an instruction may store to the memory we load + // from in between the load and the store. If such an instruction is found, + // we try to promote there instead of at the store position. + auto *Clobber = MSSA->getWalker()->getClobberingMemoryAccess( + StoreAccess->getDefiningAccess(), LoadLoc, BAA); + Instruction *P = MSSA->dominates(LoadAccess, Clobber) + ? cast(Clobber)->getMemoryInst() + : SI; // If we found an instruction that may write to the loaded memory, // we can try to promote at this position instead of the store @@ -707,7 +706,6 @@ bool MemCpyOptPass::processStoreOfLoad(StoreInst *SI, LoadInst *LI, // Detect cases where we're performing call slot forwarding, but // happen to be using a load-store pair to implement it, rather than // a memcpy. - BatchAAResults BAA(*AA); auto GetCall = [&]() -> CallInst * { // We defer this expensive clobber walk until the cheap checks // have been done on the source inside performCallSlotOptzn. diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 3f7ab416e877bc..318d6a8c5b8c34 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -1504,8 +1504,8 @@ void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) { VecInd->setDebugLoc(EntryVal->getDebugLoc()); State.set(this, VecInd); - Instruction *LastInduction = cast(Builder.CreateBinOp( - AddOp, VecInd, SplatVF, "vec.ind.next", EntryVal->getDebugLoc())); + Instruction *LastInduction = cast( + Builder.CreateBinOp(AddOp, VecInd, SplatVF, "vec.ind.next")); if (isa(EntryVal)) State.addMetadata(LastInduction, EntryVal); LastInduction->setDebugLoc(EntryVal->getDebugLoc()); diff --git a/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll b/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll index 58cb8c2c6a8d81..a95542f6901733 100644 --- a/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll +++ b/llvm/test/Analysis/CostModel/AArch64/reduce-fadd.ll @@ -76,49 +76,49 @@ define void @fast_fp_reductions() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %fadd_v16f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f32 = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) -; 
CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f32 = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f64 = call fast double 
@llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %fadd_v4f8 = call reassoc bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR8000, <4 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; ; FP16-LABEL: 'fast_fp_reductions' -; FP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v2f16_fast = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v2f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f16_fast = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f16 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 27 for instruction: %fadd_v8f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %fadd_v16f16 = call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %fadd_v16f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 35 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f32 = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) -; FP16-NEXT: Cost Model: 
Found an estimated cost of 8 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef) -; FP16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f16_fast = call fast half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v2f16(half 0xH0000, <2 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f16_fast = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v8f16 = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v8f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v16f16 = call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v16f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 
4 for instruction: %fadd_v11f16 = call fast half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f32 = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef) +; FP16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef) ; FP16-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %fadd_v4f8 = call reassoc bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR8000, <4 x bfloat> undef) ; FP16-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) ; FP16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void @@ -134,20 +134,20 @@ define void @fast_fp_reductions() { ; BF16-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %fadd_v16f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> undef) ; BF16-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %fadd_v11f16 = call fast 
half @llvm.vector.reduce.fadd.v11f16(half 0xH0000, <11 x half> undef) ; BF16-NEXT: Cost Model: Found an estimated cost of 38 for instruction: %fadd_v13f16_reassoc = call reassoc half @llvm.vector.reduce.fadd.v13f16(half 0xH0000, <13 x half> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f32 = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32 = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 25 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef) -; BF16-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f32 = call fast float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v2f32(float 0.000000e+00, <2 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f32 = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v8f32 = 
call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v8f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %fadd_v13f32 = call fast float @llvm.vector.reduce.fadd.v13f32(float 0.000000e+00, <13 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %fadd_v5f32_reassoc = call reassoc float @llvm.vector.reduce.fadd.v5f32(float 0.000000e+00, <5 x float> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f64 = call fast double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %fadd_v2f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v2f64(double 0.000000e+00, <2 x double> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f64 = call fast double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %fadd_v4f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v4f64(double 0.000000e+00, <4 x double> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %fadd_v7f64 = call fast double @llvm.vector.reduce.fadd.v7f64(double 0.000000e+00, <7 x double> undef) +; BF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v9f64_reassoc = call reassoc double @llvm.vector.reduce.fadd.v9f64(double 0.000000e+00, <9 x double> undef) ; BF16-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %fadd_v4f8 = call reassoc bfloat @llvm.vector.reduce.fadd.v4bf16(bfloat 0xR8000, <4 x bfloat> undef) ; BF16-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %fadd_v4f128 = call reassoc fp128 @llvm.vector.reduce.fadd.v4f128(fp128 undef, <4 x fp128> undef) ; BF16-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void diff --git a/llvm/test/CodeGen/AArch64/mlicm-stack-write-check.mir b/llvm/test/CodeGen/AArch64/mlicm-stack-write-check.mir index 51bc77d405b94b..406025c4fde302 100644 --- a/llvm/test/CodeGen/AArch64/mlicm-stack-write-check.mir +++ b/llvm/test/CodeGen/AArch64/mlicm-stack-write-check.mir @@ -3,6 +3,7 @@ --- name: test tracksRegLiveness: true +isSSA: false registers: - { id: 0, class: gpr64 } stack: @@ -30,11 +31,11 @@ body: | bb.2: liveins: $x0 %0 = COPY $x0 - %0 = COPY $x0 ; Force isSSA = false. ... --- name: test2 tracksRegLiveness: true +isSSA: false registers: - { id: 0, class: gpr64 } stack: @@ -62,5 +63,4 @@ body: | bb.2: liveins: $x0 %0 = COPY $x0 - %0 = COPY $x0 ; Force isSSA = false. ... 
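The next diff adds fptrunc coverage to sve-bf16-converts.ll. Its NOBF16 check lines encode a software f32-to-bf16 narrowing: round to nearest, ties to even, applied to the top half of the word, with NaNs forced quiet. A scalar C++ sketch of the same bit manipulation, for illustration only:

```cpp
#include <cstdint>
#include <cstring>

// Mirrors the SVE sequence in the NOBF16 checks below:
//   lsr/and #1       -> lsb of the would-be bf16 result (tie-break bit)
//   add #0x7fff+lsb  -> round-to-nearest-even bias
//   orr #0x400000    -> quiet bit, selected only for NaN inputs (fcmuo)
//   lsr #16          -> keep the high half
uint16_t truncToBF16(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  if (F != F) // NaN: quiet it, then truncate
    return uint16_t((Bits | 0x400000u) >> 16);
  uint32_t Lsb = (Bits >> 16) & 1;
  Bits += 0x7FFFu + Lsb; // round to nearest, ties to even
  return uint16_t(Bits >> 16);
}
```

The NOBF16NNAN run line omits the fcmuo/orr pair because --enable-no-nans-fp-math licenses dropping the NaN path, and the BF16 lines use the hardware bfcvt instead.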
diff --git a/llvm/test/CodeGen/AArch64/sve-bf16-converts.ll b/llvm/test/CodeGen/AArch64/sve-bf16-converts.ll index d72f92c1dac1ff..d63f7e6f3242e0 100644 --- a/llvm/test/CodeGen/AArch64/sve-bf16-converts.ll +++ b/llvm/test/CodeGen/AArch64/sve-bf16-converts.ll @@ -1,9 +1,15 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 -; RUN: llc -mattr=+sve < %s | FileCheck %s -; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s +; RUN: llc -mattr=+sve < %s | FileCheck %s --check-prefixes=CHECK,NOBF16 +; RUN: llc -mattr=+sve --enable-no-nans-fp-math < %s | FileCheck %s --check-prefixes=CHECK,NOBF16NNAN +; RUN: llc -mattr=+sve,+bf16 < %s | FileCheck %s --check-prefixes=CHECK,BF16 +; RUN: llc -mattr=+sme -force-streaming < %s | FileCheck %s --check-prefixes=CHECK,BF16 target triple = "aarch64-unknown-linux-gnu" +; NOTE: "fptrunc <# x double> to <# x bfloat>" is not supported because SVE +; lacks a down convert that rounds to odd. Such IR will trigger the usual +; failure (crash) when attempting to unroll a scalable vector. + define @fpext_nxv2bf16_to_nxv2f32( %a) { ; CHECK-LABEL: fpext_nxv2bf16_to_nxv2f32: ; CHECK: // %bb.0: @@ -87,3 +93,122 @@ define @fpext_nxv8bf16_to_nxv8f64( %a %res = fpext %a to ret %res } + +define @fptrunc_nxv2f32_to_nxv2bf16( %a) { +; NOBF16-LABEL: fptrunc_nxv2f32_to_nxv2bf16: +; NOBF16: // %bb.0: +; NOBF16-NEXT: mov z1.s, #32767 // =0x7fff +; NOBF16-NEXT: lsr z2.s, z0.s, #16 +; NOBF16-NEXT: ptrue p0.d +; NOBF16-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s +; NOBF16-NEXT: and z2.s, z2.s, #0x1 +; NOBF16-NEXT: add z1.s, z0.s, z1.s +; NOBF16-NEXT: orr z0.s, z0.s, #0x400000 +; NOBF16-NEXT: add z1.s, z2.s, z1.s +; NOBF16-NEXT: sel z0.s, p0, z0.s, z1.s +; NOBF16-NEXT: lsr z0.s, z0.s, #16 +; NOBF16-NEXT: ret +; +; NOBF16NNAN-LABEL: fptrunc_nxv2f32_to_nxv2bf16: +; NOBF16NNAN: // %bb.0: +; NOBF16NNAN-NEXT: mov z1.s, #32767 // =0x7fff +; NOBF16NNAN-NEXT: lsr z2.s, z0.s, #16 +; NOBF16NNAN-NEXT: and z2.s, z2.s, #0x1 +; NOBF16NNAN-NEXT: add z0.s, z0.s, z1.s +; NOBF16NNAN-NEXT: add z0.s, z2.s, z0.s +; NOBF16NNAN-NEXT: lsr z0.s, z0.s, #16 +; NOBF16NNAN-NEXT: ret +; +; BF16-LABEL: fptrunc_nxv2f32_to_nxv2bf16: +; BF16: // %bb.0: +; BF16-NEXT: ptrue p0.d +; BF16-NEXT: bfcvt z0.h, p0/m, z0.s +; BF16-NEXT: ret + %res = fptrunc %a to + ret %res +} + +define @fptrunc_nxv4f32_to_nxv4bf16( %a) { +; NOBF16-LABEL: fptrunc_nxv4f32_to_nxv4bf16: +; NOBF16: // %bb.0: +; NOBF16-NEXT: mov z1.s, #32767 // =0x7fff +; NOBF16-NEXT: lsr z2.s, z0.s, #16 +; NOBF16-NEXT: ptrue p0.s +; NOBF16-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s +; NOBF16-NEXT: and z2.s, z2.s, #0x1 +; NOBF16-NEXT: add z1.s, z0.s, z1.s +; NOBF16-NEXT: orr z0.s, z0.s, #0x400000 +; NOBF16-NEXT: add z1.s, z2.s, z1.s +; NOBF16-NEXT: sel z0.s, p0, z0.s, z1.s +; NOBF16-NEXT: lsr z0.s, z0.s, #16 +; NOBF16-NEXT: ret +; +; NOBF16NNAN-LABEL: fptrunc_nxv4f32_to_nxv4bf16: +; NOBF16NNAN: // %bb.0: +; NOBF16NNAN-NEXT: mov z1.s, #32767 // =0x7fff +; NOBF16NNAN-NEXT: lsr z2.s, z0.s, #16 +; NOBF16NNAN-NEXT: and z2.s, z2.s, #0x1 +; NOBF16NNAN-NEXT: add z0.s, z0.s, z1.s +; NOBF16NNAN-NEXT: add z0.s, z2.s, z0.s +; NOBF16NNAN-NEXT: lsr z0.s, z0.s, #16 +; NOBF16NNAN-NEXT: ret +; +; BF16-LABEL: fptrunc_nxv4f32_to_nxv4bf16: +; BF16: // %bb.0: +; BF16-NEXT: ptrue p0.s +; BF16-NEXT: bfcvt z0.h, p0/m, z0.s +; BF16-NEXT: ret + %res = fptrunc %a to + ret %res +} + +define @fptrunc_nxv8f32_to_nxv8bf16( %a) { +; NOBF16-LABEL: fptrunc_nxv8f32_to_nxv8bf16: +; NOBF16: // %bb.0: +; NOBF16-NEXT: mov z2.s, #32767 // =0x7fff +; NOBF16-NEXT: 
lsr z3.s, z1.s, #16 +; NOBF16-NEXT: lsr z4.s, z0.s, #16 +; NOBF16-NEXT: ptrue p0.s +; NOBF16-NEXT: and z3.s, z3.s, #0x1 +; NOBF16-NEXT: and z4.s, z4.s, #0x1 +; NOBF16-NEXT: fcmuo p1.s, p0/z, z1.s, z1.s +; NOBF16-NEXT: add z5.s, z1.s, z2.s +; NOBF16-NEXT: add z2.s, z0.s, z2.s +; NOBF16-NEXT: fcmuo p0.s, p0/z, z0.s, z0.s +; NOBF16-NEXT: orr z1.s, z1.s, #0x400000 +; NOBF16-NEXT: orr z0.s, z0.s, #0x400000 +; NOBF16-NEXT: add z3.s, z3.s, z5.s +; NOBF16-NEXT: add z2.s, z4.s, z2.s +; NOBF16-NEXT: sel z1.s, p1, z1.s, z3.s +; NOBF16-NEXT: sel z0.s, p0, z0.s, z2.s +; NOBF16-NEXT: lsr z1.s, z1.s, #16 +; NOBF16-NEXT: lsr z0.s, z0.s, #16 +; NOBF16-NEXT: uzp1 z0.h, z0.h, z1.h +; NOBF16-NEXT: ret +; +; NOBF16NNAN-LABEL: fptrunc_nxv8f32_to_nxv8bf16: +; NOBF16NNAN: // %bb.0: +; NOBF16NNAN-NEXT: mov z2.s, #32767 // =0x7fff +; NOBF16NNAN-NEXT: lsr z3.s, z1.s, #16 +; NOBF16NNAN-NEXT: lsr z4.s, z0.s, #16 +; NOBF16NNAN-NEXT: and z3.s, z3.s, #0x1 +; NOBF16NNAN-NEXT: and z4.s, z4.s, #0x1 +; NOBF16NNAN-NEXT: add z1.s, z1.s, z2.s +; NOBF16NNAN-NEXT: add z0.s, z0.s, z2.s +; NOBF16NNAN-NEXT: add z1.s, z3.s, z1.s +; NOBF16NNAN-NEXT: add z0.s, z4.s, z0.s +; NOBF16NNAN-NEXT: lsr z1.s, z1.s, #16 +; NOBF16NNAN-NEXT: lsr z0.s, z0.s, #16 +; NOBF16NNAN-NEXT: uzp1 z0.h, z0.h, z1.h +; NOBF16NNAN-NEXT: ret +; +; BF16-LABEL: fptrunc_nxv8f32_to_nxv8bf16: +; BF16: // %bb.0: +; BF16-NEXT: ptrue p0.s +; BF16-NEXT: bfcvt z1.h, p0/m, z1.s +; BF16-NEXT: bfcvt z0.h, p0/m, z0.s +; BF16-NEXT: uzp1 z0.h, z0.h, z1.h +; BF16-NEXT: ret + %res = fptrunc %a to + ret %res +} diff --git a/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir b/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir deleted file mode 100644 index 9e65ce329df431..00000000000000 --- a/llvm/test/CodeGen/AMDGPU/insert-singleuse-vdst.mir +++ /dev/null @@ -1,1420 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 -# RUN: llc -mtriple=amdgcn -mcpu=gfx1150 -verify-machineinstrs -run-pass=amdgpu-insert-single-use-vdst %s -o - | FileCheck %s - -# One single-use producer. ---- -name: one_producer -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: one_producer - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr0, $vgpr2 - bb.0: - liveins: $vgpr0 - $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec - $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - bb.1: - liveins: $vgpr0, $vgpr2 -... - -# One single-use producer of a 64-bit value. ---- -name: one_producer_64bit -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: one_producer_64bit - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0_vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr2_vgpr3 = V_LSHLREV_B64_e64 0, $vgpr0_vgpr1, implicit $exec - ; CHECK-NEXT: $vgpr4_vgpr5 = V_MOV_B64_e64 $vgpr2_vgpr3, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr4_vgpr5 - bb.0: - liveins: $vgpr0_vgpr1 - $vgpr2_vgpr3 = V_LSHLREV_B64_e64 0, $vgpr0_vgpr1, implicit $exec - $vgpr4_vgpr5 = V_MOV_B64_e64 $vgpr2_vgpr3, implicit $exec - bb.1: - liveins: $vgpr4_vgpr5 -... - -# Two consecutive single-use producers. 
---- -name: two_producers -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: two_producers - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 2 - ; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr0, $vgpr3 - bb.0: - liveins: $vgpr0 - $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec - $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - $vgpr3 = V_ADD_U32_e32 $vgpr0, $vgpr2, implicit $exec - bb.1: - liveins: $vgpr0, $vgpr3 -... - -# Redefinitions of v0. ---- -name: redefinitions -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: redefinitions - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 4 - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - bb.0: - liveins: $vgpr0 - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: -... - -# One producer with no consumers. ---- -name: no_consumer -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: no_consumer - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - bb.0: - liveins: $vgpr0 - $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec - bb.1: -... - -# One consumer with two uses of the same value. ---- -name: one_consumer_two_uses -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: one_consumer_two_uses - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr0, $vgpr2 - bb.0: - liveins: $vgpr0 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_ADD_U32_e32 $vgpr1, $vgpr1, implicit $exec - bb.1: - liveins: $vgpr0, $vgpr2 -... - -# A longer example. 
---- -name: longer_example -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: longer_example - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr3, $vgpr5, $sgpr0, $sgpr2, $sgpr4, $sgpr5, $sgpr16, $sgpr17, $sgpr18, $sgpr19 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 274 - ; CHECK-NEXT: $vgpr14 = V_MUL_F32_e32 $sgpr4, $vgpr3, implicit $exec, implicit $mode - ; CHECK-NEXT: $sgpr3 = S_MUL_F16 $sgpr0, $sgpr2, implicit $mode - ; CHECK-NEXT: $vgpr15 = V_MUL_F32_e32 $sgpr5, $vgpr3, implicit $exec, implicit $mode - ; CHECK-NEXT: $vgpr17 = V_FMA_F32_e64 0, $sgpr16, 0, $vgpr5, 0, $vgpr14, 0, 0, implicit $exec, implicit $mode - ; CHECK-NEXT: $sgpr1 = S_ADD_F16 $sgpr0, 15360, implicit $mode - ; CHECK-NEXT: $vgpr15 = V_FMA_F32_e64 0, $sgpr17, 0, $vgpr5, 0, $vgpr15, 0, 0, implicit $exec, implicit $mode - ; CHECK-NEXT: $vgpr14 = V_FMA_F32_e64 0, $sgpr18, 0, $vgpr15, 0, $vgpr17, 0, 0, implicit $exec, implicit $mode - ; CHECK-NEXT: $vgpr15 = V_FMA_F32_e64 0, $sgpr19, 0, $vgpr14, 0, $vgpr17, 0, 0, implicit $exec, implicit $mode - ; CHECK-NEXT: $vgpr16 = V_LOG_F32_e32 $vgpr15, implicit $exec, implicit $mode - ; CHECK-NEXT: $vgpr18 = V_EXP_F32_e32 $vgpr15, implicit $exec, implicit $mode - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr16, $vgpr18 - bb.0: - liveins: $vgpr3, $vgpr5, $sgpr0, $sgpr2, $sgpr4, $sgpr5, $sgpr16, $sgpr17, $sgpr18, $sgpr19 - $vgpr14 = V_MUL_F32_e32 $sgpr4, $vgpr3, implicit $exec, implicit $mode - $sgpr3 = S_MUL_F16 $sgpr0, $sgpr2, implicit $mode - $vgpr15 = V_MUL_F32_e32 $sgpr5, $vgpr3, implicit $exec, implicit $mode - $vgpr17 = V_FMA_F32_e64 0, $sgpr16, 0, $vgpr5, 0, $vgpr14, 0, 0, implicit $exec, implicit $mode - $sgpr1 = S_ADD_F16 $sgpr0, 15360, implicit $mode - $vgpr15 = V_FMA_F32_e64 0, $sgpr17, 0, $vgpr5, 0, $vgpr15, 0, 0, implicit $exec, implicit $mode - $vgpr14 = V_FMA_F32_e64 0, $sgpr18, 0, $vgpr15, 0, $vgpr17, 0, 0, implicit $exec, implicit $mode - $vgpr15 = V_FMA_F32_e64 0, $sgpr19, 0, $vgpr14, 0, $vgpr17, 0, 0, implicit $exec, implicit $mode - $vgpr16 = V_LOG_F32_e32 $vgpr15, implicit $exec, implicit $mode - $vgpr18 = V_EXP_F32_e32 $vgpr15, implicit $exec, implicit $mode - bb.1: - liveins: $vgpr16, $vgpr18 -... - -# Multiple uses of v0. ---- -name: multiple_uses_1 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: multiple_uses_1 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1, $vgpr2 - bb.0: - liveins: $vgpr0 - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr1, $vgpr2 -... - -# Multiple uses of v0 and redefinitions of v1 and v2. 
---- -name: multiple_uses_2 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: multiple_uses_2 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: S_SINGLEUSE_VDST 2 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1, $vgpr2 - bb.0: - liveins: $vgpr0 - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr1, $vgpr2 -... - -# Multiple uses of all but v1. ---- -name: multiple_uses_3 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: multiple_uses_3 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr2, $vgpr3 - bb.0: - liveins: $vgpr0 - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec - bb.1: - liveins: $vgpr2, $vgpr3 -... - -# Second use is an instruction that reads and writes v1. ---- -name: multiple_uses_4 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: multiple_uses_4 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - bb.0: - liveins: $vgpr0 - $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr0, implicit $exec - $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - bb.1: - liveins: $vgpr0, $vgpr1, $vgpr2 -... - -# Results are live-in to another basic block. 
---- -name: basic_block_1 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: basic_block_1 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: successors: %bb.2(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: liveins: $vgpr1, $vgpr2 - bb.0: - liveins: $vgpr0 - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr0, $vgpr1, $vgpr2 - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.2: - liveins: $vgpr1, $vgpr2 -... - -# Result v2 has multiple uses in another basic block. ---- -name: basic_block_2 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: basic_block_2 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr1, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: successors: %bb.2(0x80000000) - ; CHECK-NEXT: liveins: $vgpr2 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr2, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr2, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: liveins: $vgpr3 - bb.0: - liveins: $vgpr0, $vgpr1 - $vgpr2 = V_MOV_B32_e32 $vgpr1, implicit $exec - bb.1: - liveins: $vgpr2 - $vgpr3 = V_MOV_B32_e32 $vgpr2, implicit $exec - $vgpr3 = V_MOV_B32_e32 $vgpr2, implicit $exec - bb.2: - liveins: $vgpr3 -... - -# Results are redefined in another basic block. ---- -name: basic_block_3 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: basic_block_3 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: successors: %bb.2(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.2: - ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2 - bb.0: - liveins: $vgpr0 - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr0, $vgpr1 - $vgpr0 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - $vgpr1 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - $vgpr2 = V_ADD_U32_e32 $vgpr0, $vgpr1, implicit $exec - bb.2: - liveins: $vgpr0, $vgpr1, $vgpr2 -... 
- -# Exec modified between producer and consumer. ---- -name: exec_mask -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: exec_mask - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $sgpr0_sgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: $exec = COPY $sgpr0_sgpr1 - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr0 - bb.0: - liveins: $sgpr0_sgpr1 - $vgpr0 = V_MOV_B32_e32 0, implicit $exec - $exec = COPY $sgpr0_sgpr1 - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr0 -... - -# Exec_lo modified between producer and consumer. ---- -name: exec_mask_lo -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: exec_mask_lo - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $sgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: $exec_lo = COPY $sgpr0 - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr0 - bb.0: - liveins: $sgpr0 - $vgpr0 = V_MOV_B32_e32 0, implicit $exec - $exec_lo = COPY $sgpr0 - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr0 -... - -# Exec_hi modified between producer and consumer. ---- -name: exec_mask_hi -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: exec_mask_hi - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $sgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: $exec_hi = COPY $sgpr0 - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr0 - bb.0: - liveins: $sgpr0 - $vgpr0 = V_MOV_B32_e32 0, implicit $exec - $exec_hi = COPY $sgpr0 - $vgpr0 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr0 -... - -# Write 32-bit vgpr and then read from low 16 bits. ---- -name: write_full_read_lo -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: write_full_read_lo - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: $vgpr1_lo16 = V_MOV_B16_t16_e32 $vgpr0_lo16, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1_lo16 - bb.0: - $vgpr0 = V_MOV_B32_e32 0, implicit $exec - $vgpr1_lo16 = V_MOV_B16_t16_e32 $vgpr0_lo16, implicit $exec - bb.1: - liveins: $vgpr1_lo16 -... - -# Write 32-bit vgpr and then read from high 16 bits. ---- -name: write_full_read_hi -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: write_full_read_hi - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: $vgpr1_hi16 = V_MOV_B16_t16_e32 $vgpr0_hi16, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1_hi16 - bb.0: - $vgpr0 = V_MOV_B32_e32 0, implicit $exec - $vgpr1_hi16 = V_MOV_B16_t16_e32 $vgpr0_hi16, implicit $exec - bb.1: - liveins: $vgpr1_hi16 -... - -# Write 32-bit vgpr and then read from both halves. 
---- -name: write_full_read_both -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: write_full_read_both - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: $vgpr1_lo16 = V_MOV_B16_t16_e32 $vgpr0_lo16, implicit $exec - ; CHECK-NEXT: $vgpr1_hi16 = V_MOV_B16_t16_e32 $vgpr0_hi16, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1 - bb.0: - $vgpr0 = V_MOV_B32_e32 0, implicit $exec - $vgpr1_lo16 = V_MOV_B16_t16_e32 $vgpr0_lo16, implicit $exec - $vgpr1_hi16 = V_MOV_B16_t16_e32 $vgpr0_hi16, implicit $exec - bb.1: - liveins: $vgpr1 -... - -# Write 32-bit vgpr and then read from both halves in the same instruction. ---- -name: write_full_read_both_same_instruction -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: write_full_read_both_same_instruction - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr0 = V_MOV_B32_e32 0, implicit $exec - ; CHECK-NEXT: $vgpr1_lo16 = V_ADD_F16_t16_e32 $vgpr0_lo16, $vgpr0_hi16, implicit $mode, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1_lo16 - bb.0: - $vgpr0 = V_MOV_B32_e32 0, implicit $exec - $vgpr1_lo16 = V_ADD_F16_t16_e32 $vgpr0_lo16, $vgpr0_hi16, implicit $mode, implicit $exec - bb.1: - liveins: $vgpr1_lo16 -... - -# Write low 16-bits and then read 32-bit vgpr. ---- -name: write_lo_read_full -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: write_lo_read_full - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1 - bb.0: - liveins: $vgpr0 - $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr1 -... - -# Write low 16-bits and then read 32-bit vgpr twice. ---- -name: write_lo_read_full_twice -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: write_lo_read_full_twice - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1, $vgpr2 - bb.0: - liveins: $vgpr0 - $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr1, $vgpr2 -... - -# Write high 16-bits and then read 32-bit vgpr. 
---- -name: write_hi_read_full -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: write_hi_read_full - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1 - bb.0: - liveins: $vgpr0 - $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr1 -... - -# Write high 16-bits and then read 32-bit vgpr twice. ---- -name: write_hi_read_full_twice -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: write_hi_read_full_twice - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1, $vgpr2 - bb.0: - liveins: $vgpr0 - $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr1, $vgpr2 -... - -# Write low 16-bits and then write high 16-bits and then read 32-bit vgpr. ---- -name: write_both_read_full -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: write_both_read_full - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 2 - ; CHECK-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec - ; CHECK-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1 - bb.0: - $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec - $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr1 -... - -# Write low 16-bits and then write high 16-bits and then read 32-bit vgpr twice. ---- -name: write_both_read_full_twice -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: write_both_read_full_twice - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec - ; CHECK-NEXT: $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1, $vgpr2 - bb.0: - $vgpr0_lo16 = V_MOV_B16_t16_e32 0, implicit $exec - $vgpr0_hi16 = V_MOV_B16_t16_e32 0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr1, $vgpr2 -... - -# Three single use producer instructions with non single use producer -# instructions in between. 
---- -name: three_producers_with_two_skips -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: three_producers_with_two_skips - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 9361 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr2, $vgpr4 - bb.0: - liveins: $vgpr0 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr2, $vgpr4 -... - -# Six single use producer instructions with non single use producer -# instructions in between. ---- -name: six_producers_with_four_skips -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: six_producers_with_four_skips - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 145 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: S_SINGLEUSE_VDST 9362 - ; CHECK-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr2, $vgpr4, $vgpr7, $vgpr9 - bb.0: - liveins: $vgpr0 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr2, $vgpr4, $vgpr7, $vgpr9 -... - -# Five single use producer instructions, followed by -# four non single use producers, followed by -# three single use producer instructions, followed by -# two non single use producers, followed by -# one single use producer instructions. 
---- -name: immediate_order -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: immediate_order - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 10693 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr13, $vgpr14 - bb.0: - liveins: $vgpr0 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr13, $vgpr14 -... - -# Maximum number of single use producers that can be encoded in a single -# instruction. 
---- -name: maximum_producers_single_instruction -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: maximum_producers_single_instruction - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 58255 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr17 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr18 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr19 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr20 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr21 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr22 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr23 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr24 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr25 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr26 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr27 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr28 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr29 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - bb.0: - liveins: $vgpr0 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr17 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr18 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr19 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr20 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr21 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr22 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr23 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr24 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr25 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr26 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr27 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr28 = V_MOV_B32_e32 $vgpr0, implicit 
$exec - $vgpr29 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: -... - -# One more than the maximum number of single use producers that can be encoded -# in a single instruction. ---- -name: too_many_producers_single_instruction -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: too_many_producers_single_instruction - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: S_SINGLEUSE_VDST 58255 - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr17 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr18 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr19 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr20 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr21 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr22 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr23 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr24 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr25 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr26 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr27 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr28 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr29 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr30 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - - - - bb.0: - liveins: $vgpr0 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr17 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr18 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr19 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr20 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr21 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr22 = V_MOV_B32_e32 $vgpr0, implicit $exec - - 
$vgpr23 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr24 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr25 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr26 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr27 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr28 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr29 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr30 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: -... - -# Maximum distance between single use producers that can be encoded in a single -# instruction. ---- -name: maximum_skips_single_instruction -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: maximum_skips_single_instruction - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 15473 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15 - bb.0: - liveins: $vgpr0 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15 -... - -# One more than the maximum distance between single use producers that can be -# encoded in a single instruction. 
---- -name: too_many_skips_single_instruction -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: too_many_skips_single_instruction - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr17 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16 - bb.0: - liveins: $vgpr0 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr17 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16 -... 
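The immediates in these tests are consistent with a simple packed layout for the 16-bit S_SINGLEUSE_VDST operand: a 4-bit count of consecutive single-use producers in bits [3:0], followed by alternating 3-bit skip/count fields in bits [6:4], [9:7], [12:10], and [15:13]. That layout is inferred from the test values themselves rather than taken from hardware documentation; a minimal Python sketch under that assumption, checked against the immediates used above and below:

    def pack_singleuse_imm(c0, s0=0, c1=0, s1=0, c2=0):
        # c0: first run of single-use producers (4-bit field, <= 15)
        # s*/c*: alternating runs of skipped instructions and further
        #        single-use producers (3-bit fields, <= 7 each)
        assert c0 <= 15 and all(f <= 7 for f in (s0, c1, s1, c2))
        return c0 | (s0 << 4) | (c1 << 7) | (s1 << 10) | (c2 << 13)

    assert pack_singleuse_imm(1, 1, 1, 1, 1) == 9361    # three_producers_with_two_skips
    assert pack_singleuse_imm(1, 1, 1) == 145           # six_producers..., first marker
    assert pack_singleuse_imm(2, 1, 1, 1, 1) == 9362    # six_producers..., second marker
    assert pack_singleuse_imm(5, 4, 3, 2, 1) == 10693   # immediate_order
    assert pack_singleuse_imm(15, 0, 7, 0, 7) == 58255  # maximum: 15 + 7 + 7 = 29 producers
    assert pack_singleuse_imm(1, 7, 0, 7, 1) == 15473   # maximum skip distance: 7 + 7 = 14
    assert pack_singleuse_imm(15, 7, 7, 7, 7) == 65535  # all immediate bits set (next test)

This also accounts for the splitting behaviour checked in the surrounding tests: a thirtieth producer, or a fifteenth skipped instruction, no longer fits in the available fields, so a second S_SINGLEUSE_VDST is emitted.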
- - -# Maximum possible encoding value with all bits of the immediate set ---- -name: all_immediate_bits_set -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: all_immediate_bits_set - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 65535 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr6 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr17 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr18 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr19 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr20 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr21 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr22 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr23 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr24 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr25 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr26 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr27 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr28 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr29 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr30 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr31 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr32 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr33 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr34 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr35 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr36 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr37 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr38 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr39 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr40 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr41 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr42 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr43 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr30, $vgpr31, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36 - bb.0: - liveins: $vgpr0 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr3 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr4 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr5 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr6 = V_MOV_B32_e32 $vgpr0, 
implicit $exec - $vgpr7 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr8 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr9 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr10 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr11 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr12 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr13 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr14 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr15 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr16 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr17 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr18 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr19 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr20 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr21 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr22 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr23 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr24 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr25 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr26 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr27 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr28 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr29 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr30 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr31 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr32 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr33 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr34 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr35 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr36 = V_MOV_B32_e32 $vgpr0, implicit $exec - - $vgpr37 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr38 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr39 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr40 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr41 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr42 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr43 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr30, $vgpr31, $vgpr31, $vgpr32, $vgpr33, $vgpr34, $vgpr35, $vgpr36 - -... - -# Tests for multi-cycle instructions that are explicitly excluded. - -# Valid producers but invalid consumer opcodes. ---- -name: v_mul_hi_u32_e64 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: v_mul_hi_u32_e64 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr2 = V_MUL_HI_U32_e64 $vgpr0, $vgpr1, implicit $exec - ; CHECK-NEXT: $vgpr3 = V_MOV_B32_e32 $vgpr2, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr0, $vgpr3 - bb.0: - liveins: $vgpr0 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr2 = V_MUL_HI_U32_e64 $vgpr0, $vgpr1, implicit $exec - $vgpr3 = V_MOV_B32_e32 $vgpr2, implicit $exec - bb.1: - liveins: $vgpr0, $vgpr3 -... 
- ---- -name: v_cmpx_t_u64_e64 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: v_cmpx_t_u64_e64 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $sgpr0 = V_CMPX_EQ_U64_e64 $vgpr0_vgpr1, $vgpr2_vgpr3, implicit-def $exec, implicit $exec - ; CHECK-NEXT: S_BRANCH %bb.1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr0 - bb.0: - liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $sgpr0 = V_CMPX_EQ_U64_e64 $vgpr0_vgpr1, $vgpr2_vgpr3, implicit-def $exec, implicit $exec - S_BRANCH %bb.1 - bb.1: - liveins: $vgpr0 -... - ---- -name: v_lshlrev_b64_e64 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: v_lshlrev_b64_e64 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0_vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr2_vgpr3 = V_MOV_B64_e64 $vgpr0_vgpr1, implicit $exec - ; CHECK-NEXT: $vgpr4_vgpr5 = V_LSHLREV_B64_e64 0, $vgpr2_vgpr3, implicit $exec - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr6_vgpr7 = V_LSHLREV_B64_e64 0, $vgpr4_vgpr5, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr4_vgpr5 - bb.0: - liveins: $vgpr0_vgpr1 - $vgpr2_vgpr3 = V_MOV_B64_e64 $vgpr0_vgpr1, implicit $exec - $vgpr4_vgpr5 = V_LSHLREV_B64_e64 0, $vgpr2_vgpr3, implicit $exec - $vgpr6_vgpr7 = V_LSHLREV_B64_e64 0, $vgpr4_vgpr5, implicit $exec - bb.1: - liveins: $vgpr4_vgpr5 -... - -# Invalid producers but valid consumer opcodes. ---- -name: v_movereld_b32_e32 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: v_movereld_b32_e32 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0, $vgpr2 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $m0 = S_MOV_B32 0 - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: V_MOVRELD_B32_e32 $vgpr2, $vgpr1, implicit $m0, implicit $exec, implicit-def $vgpr1_vgpr2, implicit undef $vgpr1_vgpr2(tied-def 4) - ; CHECK-NEXT: $vgpr3 = V_ADD_U32_e32 $vgpr2, $vgpr1, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr3 - bb.0: - liveins: $vgpr0, $vgpr2 - $m0 = S_MOV_B32 0 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - V_MOVRELD_B32_e32 $vgpr2, $vgpr1, implicit $m0, implicit $exec, implicit-def $vgpr1_vgpr2, implicit undef $vgpr1_vgpr2(tied-def 4) - $vgpr3 = V_ADD_U32_e32 $vgpr2, $vgpr1, implicit $exec - bb.1: - liveins: $vgpr3 -... - -# Invalid producers and invalid consumer opcodes. ---- -name: v_writelane_b32 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: v_writelane_b32 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0, $sgpr0 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr0, 0, $vgpr1 - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr2 = V_MOV_B32_e32 $vgpr1, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr0 - bb.0: - liveins: $vgpr0, $sgpr0 - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - $vgpr1 = V_WRITELANE_B32 $sgpr0, 0, $vgpr1 - $vgpr2 = V_MOV_B32_e32 $vgpr1, implicit $exec - bb.1: - liveins: $vgpr0 -... 
- -# DPP instructions cannot be single use producers or consumers ---- -name: V_ADD_NC_U32_dpp -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: V_ADD_NC_U32_dpp - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: liveins: $vgpr0, $vcc - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr0, $vgpr0, 1, 15, 15, 1, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec - ; CHECK-NEXT: $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr0, $vgpr0, 1, 15, 15, 1, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec - ; CHECK-NEXT: $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr0, $vgpr0, 1, 15, 15, 1, implicit-def $vcc_lo, implicit $vcc_lo, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr0 - bb.0: - liveins: $vgpr0, $vcc - $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr0, $vgpr0, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec - $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr0, $vgpr0, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec - $vgpr0 = V_ADDC_U32_dpp $vgpr0, $vgpr0, $vgpr0, 1, 15, 15, 1, implicit-def $vcc, implicit $vcc, implicit $exec - bb.1: - liveins: $vgpr0 -... - -# Exception to the rule that dpp instructions -# cannot be single use producers or consumers ---- -name: V_INTERP_MOV_F32 -tracksRegLiveness: true -body: | - ; CHECK-LABEL: name: V_INTERP_MOV_F32 - ; CHECK: bb.0: - ; CHECK-NEXT: successors: %bb.1(0x80000000) - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: S_SINGLEUSE_VDST 1 - ; CHECK-NEXT: $vgpr0 = V_INTERP_MOV_F32 0, 0, 0, implicit $mode, implicit $m0, implicit $exec - ; CHECK-NEXT: $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: bb.1: - ; CHECK-NEXT: liveins: $vgpr1 - bb.0: - $vgpr0 = V_INTERP_MOV_F32 0, 0, 0, implicit $mode, implicit $m0, implicit $exec - $vgpr1 = V_MOV_B32_e32 $vgpr0, implicit $exec - bb.1: - liveins: $vgpr1 -... - diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value-list.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value-list.mir new file mode 100644 index 00000000000000..cdf2b41c1e5b45 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value-list.mir @@ -0,0 +1,53 @@ +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -run-pass=si-lower-sgpr-spills -o - %s | FileCheck %s + +--- | + define amdgpu_kernel void @test() { ret void } + + !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !4, producer: "llvm", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !4) + !1 = !DILocalVariable(name: "a", scope: !2, file: !4, line: 126, type: !6) + !2 = distinct !DISubprogram(name: "test", scope: !4, file: !4, line: 1, type: !3, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !5) + !3 = !DISubroutineType(types: !4) + !4 = !DIFile(filename: "dummy", directory: "/") + !5 = !{!1} + !6 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !7, size: 64, align: 32) + !7 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) + !8 = !DIExpression() + !9 = !DILocation(line: 10, column: 9, scope: !2) + +... 
+--- +name: test +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 +fixedStack: + - { id: 0, type: default, offset: 4, size: 4, alignment: 4, stack-id: default } +stack: + - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } +machineFunctionInfo: + maxKernArgAlign: 4 + isEntryFunction: true + waveLimiter: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' + hasSpilledSGPRs: true + argumentInfo: + privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } + dispatchPtr: { reg: '$sgpr4_sgpr5' } + kernargSegmentPtr: { reg: '$sgpr6_sgpr7' } + workGroupIDX: { reg: '$sgpr8' } + privateSegmentWaveByteOffset: { reg: '$sgpr9' } +body: | + ; CHECK-LABEL: name: test + ; CHECK: bb.0: + ; CHECK: DBG_VALUE_LIST <{{.*}}>, !DIExpression(), $noreg, 0, debug-location !DILocation(line: 10, column: 9, scope: <{{.*}}>) + + bb.0: + renamable $sgpr10 = IMPLICIT_DEF + SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 + DBG_VALUE_LIST !1, !8, %stack.0, 0, debug-location !9 + + bb.1: + renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 + S_ENDPGM 0 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-fi-skip-processing-stack-arg-dbg-value-list.mir b/llvm/test/CodeGen/AMDGPU/vgpr-spill-fi-skip-processing-stack-arg-dbg-value-list.mir new file mode 100644 index 00000000000000..53629cdfb932b2 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-fi-skip-processing-stack-arg-dbg-value-list.mir @@ -0,0 +1,52 @@ +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-vgpr-to-agpr=true -run-pass=prologepilog -o - %s | FileCheck %s + +--- | + define amdgpu_kernel void @test() { ret void } + + !0 = distinct !DICompileUnit(language: DW_LANG_C99, file: !4, producer: "llvm", isOptimized: true, runtimeVersion: 0, emissionKind: FullDebug, enums: !4, retainedTypes: !4) + !1 = !DILocalVariable(name: "a", scope: !2, file: !4, line: 126, type: !6) + !2 = distinct !DISubprogram(name: "test", scope: !4, file: !4, line: 1, type: !3, isLocal: false, isDefinition: true, scopeLine: 2, flags: DIFlagPrototyped, isOptimized: true, unit: !0, retainedNodes: !5) + !3 = !DISubroutineType(types: !4) + !4 = !DIFile(filename: "dummy", directory: "/") + !5 = !{!1} + !6 = !DIDerivedType(tag: DW_TAG_pointer_type, baseType: !7, size: 64, align: 32) + !7 = !DIBasicType(name: "int", size: 32, align: 32, encoding: DW_ATE_signed) + !8 = !DIExpression() + !9 = !DILocation(line: 10, column: 9, scope: !2) + +... 
+--- +name: test +tracksRegLiveness: true +frameInfo: + maxAlignment: 4 +fixedStack: + - { id: 0, type: default, offset: 4, size: 4, alignment: 4, stack-id: default } +stack: + - { id: 0, type: spill-slot, size: 4, alignment: 4 } +machineFunctionInfo: + maxKernArgAlign: 4 + isEntryFunction: true + waveLimiter: true + scratchRSrcReg: '$sgpr96_sgpr97_sgpr98_sgpr99' + stackPtrOffsetReg: '$sgpr32' + frameOffsetReg: '$sgpr33' + hasSpilledVGPRs: true + argumentInfo: + privateSegmentBuffer: { reg: '$sgpr0_sgpr1_sgpr2_sgpr3' } + dispatchPtr: { reg: '$sgpr4_sgpr5' } + kernargSegmentPtr: { reg: '$sgpr6_sgpr7' } + workGroupIDX: { reg: '$sgpr8' } + privateSegmentWaveByteOffset: { reg: '$sgpr9' } +body: | + ; CHECK-LABEL: name: test + ; CHECK: bb.0: + ; CHECK: DBG_VALUE_LIST <{{.*}}>, !DIExpression(), $noreg, 0, debug-location !DILocation(line: 10, column: 9, scope: <{{.*}}>) + bb.0: + $vgpr2 = IMPLICIT_DEF + SI_SPILL_V32_SAVE $vgpr2, %stack.0, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.0, align 4, addrspace 5) + DBG_VALUE_LIST !1, !8, %stack.0, 0, debug-location !9 + + bb.1: + renamable $vgpr2 = SI_SPILL_V32_RESTORE %stack.0, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.0, align 4, addrspace 5) + S_ENDPGM 0 diff --git a/llvm/test/CodeGen/ARM/preferred-function-alignment.ll b/llvm/test/CodeGen/ARM/preferred-function-alignment.ll index afe64a22c5e808..f3a227c4765eb8 100644 --- a/llvm/test/CodeGen/ARM/preferred-function-alignment.ll +++ b/llvm/test/CodeGen/ARM/preferred-function-alignment.ll @@ -1,14 +1,15 @@ -; RUN: llc -mtriple=arm-none-eabi -mcpu=cortex-m85 < %s | FileCheck --check-prefixes=CHECK,ALIGN-16,ALIGN-CS-16 %s +; RUN: llc -mtriple=arm-none-eabi -mcpu=cortex-m85 < %s | FileCheck --check-prefixes=CHECK,ALIGN-64,ALIGN-CS-16 %s ; RUN: llc -mtriple=arm-none-eabi -mcpu=cortex-m23 < %s | FileCheck --check-prefixes=CHECK,ALIGN-16,ALIGN-CS-16 %s ; RUN: llc -mtriple=arm-none-eabi -mcpu=cortex-a5 < %s | FileCheck --check-prefixes=CHECK,ALIGN-32,ALIGN-CS-32 %s ; RUN: llc -mtriple=arm-none-eabi -mcpu=cortex-m33 < %s | FileCheck --check-prefixes=CHECK,ALIGN-32,ALIGN-CS-16 %s ; RUN: llc -mtriple=arm-none-eabi -mcpu=cortex-m55 < %s | FileCheck --check-prefixes=CHECK,ALIGN-32,ALIGN-CS-16 %s - +; RUN: llc -mtriple=arm-none-eabi -mcpu=cortex-m7 < %s | FileCheck --check-prefixes=CHECK,ALIGN-64,ALIGN-CS-16 %s ; CHECK-LABEL: test ; ALIGN-16: .p2align 1 ; ALIGN-32: .p2align 2 +; ALIGN-64: .p2align 3 define void @test() { ret void diff --git a/llvm/test/CodeGen/Hexagon/expand-condsets-impuse2.mir b/llvm/test/CodeGen/Hexagon/expand-condsets-impuse2.mir index ae3f4ba78cd1ff..ebb361ab433cb7 100644 --- a/llvm/test/CodeGen/Hexagon/expand-condsets-impuse2.mir +++ b/llvm/test/CodeGen/Hexagon/expand-condsets-impuse2.mir @@ -6,12 +6,12 @@ name: f0 tracksRegLiveness: true +isSSA: false body: | bb.0: successors: %bb.1 liveins: $r0, $r1 %0:intregs = COPY $r0 - %0:intregs = COPY $r0 ; defeat IsSSA detection %1:intregs = COPY $r1 %2:intregs = COPY $r0 %3:intregs = M2_mpyi %2, %1 diff --git a/llvm/test/CodeGen/Hexagon/expand-condsets-phys-reg.mir b/llvm/test/CodeGen/Hexagon/expand-condsets-phys-reg.mir index e62cd1cc73609b..d252ec5fee4019 100644 --- a/llvm/test/CodeGen/Hexagon/expand-condsets-phys-reg.mir +++ b/llvm/test/CodeGen/Hexagon/expand-condsets-phys-reg.mir @@ -9,12 +9,12 @@ name: fred tracksRegLiveness: true +isSSA: false body: | bb.0: successors: %bb.1, %bb.2 liveins: $r0 - %0:intregs = A2_tfrsi 0 ;; Multiple defs to ensure IsSSA = false %0:intregs = L2_loadri_io $r0, 0 %1:predregs = C2_cmpgti %0, 
10 %2:intregs = C2_mux %1, $r31, %0 diff --git a/llvm/test/CodeGen/Hexagon/expand-condsets-rm-reg.mir b/llvm/test/CodeGen/Hexagon/expand-condsets-rm-reg.mir index 6d7b6cd72a3099..463aa9a8e7f9b1 100644 --- a/llvm/test/CodeGen/Hexagon/expand-condsets-rm-reg.mir +++ b/llvm/test/CodeGen/Hexagon/expand-condsets-rm-reg.mir @@ -20,6 +20,7 @@ name: fred tracksRegLiveness: true +isSSA: false registers: - { id: 0, class: intregs } - { id: 1, class: intregs } @@ -35,7 +36,6 @@ body: | bb.0: liveins: $r0, $r1, $p0 %0 = COPY $r0 - %0 = COPY $r0 ; Force isSSA = false. %1 = COPY $r1 %2 = COPY $p0 ; Check that %3 was coalesced into %4. diff --git a/llvm/test/CodeGen/MIR/Generic/machine-function-optionally-computed-properties-conflict.mir b/llvm/test/CodeGen/MIR/Generic/machine-function-optionally-computed-properties-conflict.mir new file mode 100644 index 00000000000000..d8d178d90ae0af --- /dev/null +++ b/llvm/test/CodeGen/MIR/Generic/machine-function-optionally-computed-properties-conflict.mir @@ -0,0 +1,35 @@ +# RUN: not llc -run-pass none -o /dev/null %s 2>&1 | FileCheck %s + +# Test that computed properties are not conflicting with explicitly set +# properties + +--- +# CHECK: error: {{.*}}: TestNoPhisOverrideConflict has explicit property NoPhi, but contains at least one PHI +name: TestNoPhisOverrideConflict +noPhis: true +tracksRegLiveness: true +body: | + bb.0: + %0:_(s32) = G_IMPLICIT_DEF + + bb.1: + %1:_(s32) = PHI %0, %bb.0, %1, %bb.1 + G_BR %bb.1 +... +--- +# CHECK: error: {{.*}}: TestIsSSAOverrideConflict has explicit property IsSSA, but is not valid SSA +name: TestIsSSAOverrideConflict +isSSA: true +body: | + bb.0: + %0:_(s32) = G_IMPLICIT_DEF + %0:_(s32) = G_IMPLICIT_DEF +... +--- +# CHECK: error: {{.*}}: TestNoVRegsOverrideConflict has explicit property NoVRegs, but contains virtual registers +name: TestNoVRegsOverrideConflict +noVRegs: true +body: | + bb.0: + %0:_(s32) = G_IMPLICIT_DEF +... diff --git a/llvm/test/CodeGen/MIR/Generic/machine-function-optionally-computed-properties.mir b/llvm/test/CodeGen/MIR/Generic/machine-function-optionally-computed-properties.mir new file mode 100644 index 00000000000000..858bbc8394bb34 --- /dev/null +++ b/llvm/test/CodeGen/MIR/Generic/machine-function-optionally-computed-properties.mir @@ -0,0 +1,64 @@ +# RUN: llc -run-pass none -o - %s | FileCheck %s + +# Test that we can disable certain properties that are normally computed + +--- +# CHECK-LABEL: name: TestNoPhis +# CHECK: noPhis: true +# CHECK: ... +name: TestNoPhis +... +--- +# CHECK-LABEL: name: TestNoPhisOverride +# CHECK: noPhis: false +# CHECK: ... +name: TestNoPhisOverride +noPhis: false +... +--- +# CHECK-LABEL: name: TestNoPhisOverrideTrue +# CHECK: noPhis: true +# CHECK: ... +name: TestNoPhisOverrideTrue +noPhis: true +... +--- +# CHECK-LABEL: name: TestIsSSA +# CHECK: isSSA: true +# CHECK: ... +name: TestIsSSA +... +--- +# CHECK-LABEL: name: TestIsSSAOverride +# CHECK: isSSA: false +# CHECK: ... +name: TestIsSSAOverride +isSSA: false +... +--- +# CHECK-LABEL: name: TestIsSSAOverrideTrue +# CHECK: isSSA: true +# CHECK: ... +name: TestIsSSAOverrideTrue +isSSA: true +... +--- +# CHECK-LABEL: name: TestNoVRegs +# CHECK: noVRegs: true +# CHECK: ... +name: TestNoVRegs +... +--- +# CHECK-LABEL: name: TestNoVRegsOverride +# CHECK: noVRegs: false +# CHECK: ... +name: TestNoVRegsOverride +noVRegs: false +... +--- +# CHECK-LABEL: name: TestNoVRegsOverrideTrue +# CHECK: noVRegs: true +# CHECK: ... +name: TestNoVRegsOverrideTrue +noVRegs: true +... 
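Together with the Hexagon expand-condsets changes above, these tests show the intended migration path: rather than inserting an artificial second definition to defeat SSA detection, a test can now state the property explicitly. A minimal sketch in the same MIR form (the function name is illustrative, and the register class and opcode are borrowed from the Hexagon tests):

    ---
    name: not_ssa_example
    isSSA: false
    tracksRegLiveness: true
    body: |
      bb.0:
        liveins: $r0
        ; A single definition suffices; the explicit isSSA key above marks
        ; the function as non-SSA without a dummy redefinition of %0.
        %0:intregs = COPY $r0
    ...

The same pattern applies to noPhis (see the sjlj-shadow-stack-liveness.mir change below, which drops its dummy PHI) and to noVRegs.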
diff --git a/llvm/test/CodeGen/SPIRV/debug-info/debug-compilation-unit.ll b/llvm/test/CodeGen/SPIRV/debug-info/debug-compilation-unit.ll index bff4660559ab82..794dcd6d9f3fb4 100644 --- a/llvm/test/CodeGen/SPIRV/debug-info/debug-compilation-unit.ll +++ b/llvm/test/CodeGen/SPIRV/debug-info/debug-compilation-unit.ll @@ -29,11 +29,13 @@ define spir_func void @foo() { entry: ret void } +; CHECK-SPIRV-NOT: Lfunc_end0: define spir_func void @bar() { entry: ret void } +; CHECK-SPIRV-NOT: Lfunc_end1: !llvm.dbg.cu = !{!0} !llvm.module.flags = !{!2, !3, !4, !5} diff --git a/llvm/test/CodeGen/X86/extractelement-fp.ll b/llvm/test/CodeGen/X86/extractelement-fp.ll index 38162f676e7ee3..944f6bbfd0bfbe 100644 --- a/llvm/test/CodeGen/X86/extractelement-fp.ll +++ b/llvm/test/CodeGen/X86/extractelement-fp.ll @@ -1310,15 +1310,14 @@ define float @rcp_v4f32(<4 x float> %x) nounwind { define float @rcp_v8f32(<8 x float> %x) nounwind { ; X64-LABEL: rcp_v8f32: ; X64: # %bb.0: -; X64-NEXT: vrcpps %ymm0, %ymm0 -; X64-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; X64-NEXT: vrcpss %xmm0, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq ; ; X86-LABEL: rcp_v8f32: ; X86: # %bb.0: ; X86-NEXT: pushl %eax -; X86-NEXT: vrcpps %ymm0, %ymm0 +; X86-NEXT: vrcpss %xmm0, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) ; X86-NEXT: popl %eax @@ -1351,15 +1350,14 @@ define float @rsqrt_v4f32(<4 x float> %x) nounwind { define float @rsqrt_v8f32(<8 x float> %x) nounwind { ; X64-LABEL: rsqrt_v8f32: ; X64: # %bb.0: -; X64-NEXT: vrsqrtps %ymm0, %ymm0 -; X64-NEXT: # kill: def $xmm0 killed $xmm0 killed $ymm0 +; X64-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 ; X64-NEXT: vzeroupper ; X64-NEXT: retq ; ; X86-LABEL: rsqrt_v8f32: ; X86: # %bb.0: ; X86-NEXT: pushl %eax -; X86-NEXT: vrsqrtps %ymm0, %ymm0 +; X86-NEXT: vrsqrtss %xmm0, %xmm0, %xmm0 ; X86-NEXT: vmovss %xmm0, (%esp) ; X86-NEXT: flds (%esp) ; X86-NEXT: popl %eax diff --git a/llvm/test/CodeGen/X86/sjlj-shadow-stack-liveness.mir b/llvm/test/CodeGen/X86/sjlj-shadow-stack-liveness.mir index 3def36f9d8ba91..83bc8ec510f646 100644 --- a/llvm/test/CodeGen/X86/sjlj-shadow-stack-liveness.mir +++ b/llvm/test/CodeGen/X86/sjlj-shadow-stack-liveness.mir @@ -14,6 +14,7 @@ name: bar # CHECK-LABEL: name: bar alignment: 16 tracksRegLiveness: true +noPhis: false body: | bb.0: %0:gr64 = IMPLICIT_DEF @@ -29,8 +30,6 @@ body: | ; CHECK-NOT: MOV64rm killed %0 ; CHECK-NEXT: MOV64rm killed %0 - ; FIXME: Dummy PHI to set the property NoPHIs to false. PR38439. bb.2: - %1:gr64 = PHI undef %1, %bb.2, undef %1, %bb.2 JMP_1 %bb.2 ... 
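The `extractelement-fp.ll` update above encodes a scalarization fact: `vrcpps`/`vrsqrtps` produce independent per-lane estimates, so when only lane 0 of the result is extracted, the scalar `vrcpss`/`vrsqrtss` forms compute the same value without touching the upper lanes. A small intrinsics sketch of the equivalence (function names are illustrative; the intrinsics are real):

#include <immintrin.h>

// Both functions return the same reciprocal estimate of lane 0.
float rcp_lane0_wide(__m256 x) {
  __m256 r = _mm256_rcp_ps(x); // vrcpps: estimates all 8 lanes
  return _mm256_cvtss_f32(r);  // then discards lanes 1-7
}

float rcp_lane0_narrow(__m256 x) {
  __m128 lo = _mm256_castps256_ps128(x); // low 128 bits, no instruction
  __m128 r = _mm_rcp_ss(lo);             // vrcpss: estimates lane 0 only
  return _mm_cvtss_f32(r);
}

The `vzeroupper` stays in the updated checks because the argument still arrives in a ymm register.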
diff --git a/llvm/test/MC/AMDGPU/gfx1150_asm_sopp.s b/llvm/test/MC/AMDGPU/gfx1150_asm_sopp.s deleted file mode 100644 index 044ce48c267846..00000000000000 --- a/llvm/test/MC/AMDGPU/gfx1150_asm_sopp.s +++ /dev/null @@ -1,10 +0,0 @@ -// RUN: llvm-mc -triple=amdgcn -mcpu=gfx1150 -show-encoding %s | FileCheck --check-prefixes=GFX1150 %s - -s_singleuse_vdst 0x0000 -// GFX1150: encoding: [0x00,0x00,0x93,0xbf] - -s_singleuse_vdst 0xffff -// GFX1150: encoding: [0xff,0xff,0x93,0xbf] - -s_singleuse_vdst 0x1234 -// GFX1150: encoding: [0x34,0x12,0x93,0xbf] diff --git a/llvm/test/MC/AMDGPU/gfx11_unsupported.s b/llvm/test/MC/AMDGPU/gfx11_unsupported.s index c9756a068890e7..c565801d275bb8 100644 --- a/llvm/test/MC/AMDGPU/gfx11_unsupported.s +++ b/llvm/test/MC/AMDGPU/gfx11_unsupported.s @@ -2014,9 +2014,6 @@ s_cmp_neq_f16 s1, s2 s_cmp_nlt_f16 s1, s2 // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU -s_singleuse_vdst 0x1234 -// CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU - buffer_atomic_sub_clamp_u32 v5, off, s[8:11], s3 offset:0 glc // CHECK: :[[@LINE-1]]:{{[0-9]+}}: error: instruction not supported on this GPU diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s b/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s index e98659208d5a9c..fdcabc4352c69b 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_sopp.s @@ -69,15 +69,6 @@ s_wait_alu depctr_va_sdst(3) s_wait_alu depctr_va_vdst(14) depctr_va_sdst(6) depctr_vm_vsrc(6) // GFX12: encoding: [0x9b,0xed,0x88,0xbf] -s_singleuse_vdst 0x0000 -// GFX12: encoding: [0x00,0x00,0x93,0xbf] - -s_singleuse_vdst 0xffff -// GFX12: encoding: [0xff,0xff,0x93,0xbf] - -s_singleuse_vdst 0x1234 -// GFX12: encoding: [0x34,0x12,0x93,0xbf] - s_barrier_wait 0xffff // GFX12: encoding: [0xff,0xff,0x94,0xbf] diff --git a/llvm/test/MC/Disassembler/AMDGPU/decode-err.txt b/llvm/test/MC/Disassembler/AMDGPU/decode-err.txt index d6e8b7ee2f01f0..f819a61949b577 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/decode-err.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/decode-err.txt @@ -1,16 +1,11 @@ # RUN: llvm-mc -triple=amdgcn -mcpu=gfx900 -disassemble -show-encoding -filetype=null < %s 2>&1 | FileCheck -check-prefix=GCN-ERR %s # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding < %s 2>&1 | FileCheck -check-prefixes=W32 %s # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize64 -disassemble -show-encoding < %s 2>&1 | FileCheck -check-prefixes=W64 %s -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1100 -disassemble -show-encoding -filetype=null < %s 2>&1 | FileCheck -check-prefix=GFX11-ERR %s # RUN: llvm-mc -triple=amdgcn -mcpu=gfx1200 -disassemble -show-encoding -filetype=null < %s 2>&1 | FileCheck -check-prefix=GFX12-ERR %s # GCN-ERR: [[@LINE+1]]:1: warning: invalid instruction encoding 0xdf,0x00,0x00,0x02 -# this is s_singleuse_vdst 0x1234, which is only valid on gfx1150 -# GFX11-ERR: [[@LINE+1]]:1: warning: invalid instruction encoding -0x34,0x12,0x93,0xbf - # this is s_waitcnt_vscnt exec_hi, 0x1234, which is valid on gfx11, but not on gfx12 # GFX12-ERR: [[@LINE+1]]:1: warning: invalid instruction encoding 0x34,0x12,0x7f,0xbc diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx1150_dasm_sopp.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx1150_dasm_sopp.txt deleted file mode 100644 index 8fa266a73ff87f..00000000000000 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx1150_dasm_sopp.txt +++ /dev/null @@ -1,10 +0,0 @@ -# RUN: llvm-mc -triple=amdgcn -mcpu=gfx1150 -disassemble -show-encoding < %s | FileCheck 
-check-prefixes=GFX1150 %s - -# GFX1150: s_singleuse_vdst 0x0 ; encoding: [0x00,0x00,0x93,0xbf] -0x00,0x00,0x93,0xbf - -# GFX1150: s_singleuse_vdst 0xffff ; encoding: [0xff,0xff,0x93,0xbf] -0xff,0xff,0x93,0xbf - -# GFX1150: s_singleuse_vdst 0x1234 ; encoding: [0x34,0x12,0x93,0xbf] -0x34,0x12,0x93,0xbf diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt index d42f920aa61dd7..d69801512c0786 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_sopp.txt @@ -60,14 +60,6 @@ # GFX12: s_wait_storecnt_dscnt 0xc1d1 ; encoding: [0xd1,0xc1,0xc9,0xbf] 0xd1,0xc1,0xc9,0xbf -# GFX12: s_singleuse_vdst 0x0 ; encoding: [0x00,0x00,0x93,0xbf] -0x00,0x00,0x93,0xbf - -# GFX12: s_singleuse_vdst 0xffff ; encoding: [0xff,0xff,0x93,0xbf] -0xff,0xff,0x93,0xbf - -# GFX12: s_singleuse_vdst 0x1234 ; encoding: [0x34,0x12,0x93,0xbf] -0x34,0x12,0x93,0xbf # GFX12: s_barrier_wait 0xffff ; encoding: [0xff,0xff,0x94,0xbf] 0xff,0xff,0x94,0xbf diff --git a/llvm/test/MC/ELF/relocation-alias.s b/llvm/test/MC/ELF/relocation-alias.s index 51fb0c37052fe7..66bf2ceea508ba 100644 --- a/llvm/test/MC/ELF/relocation-alias.s +++ b/llvm/test/MC/ELF/relocation-alias.s @@ -16,7 +16,10 @@ movabsq $memcpy+2, %rax # CHECK: movq (%rip), %rax # CHECK-NEXT: R_X86_64_REX_GOTPCRELX abs-0x4 +# CHECK: movq (%rip), %r16 +# CHECK-NEXT: R_X86_64_REX2_GOTPCRELX abs-0x4 movq abs@GOTPCREL(%rip), %rax +movq abs@GOTPCREL(%rip), %r16 abs = 42 # CHECK: movabsq $0, %rbx diff --git a/llvm/test/MC/X86/gotpcrelx.s b/llvm/test/MC/X86/gotpcrelx.s index e63e3e9a946fd1..5a8ba454bc904c 100644 --- a/llvm/test/MC/X86/gotpcrelx.s +++ b/llvm/test/MC/X86/gotpcrelx.s @@ -37,6 +37,16 @@ # CHECK-NEXT: R_X86_64_REX_GOTPCRELX sbb # CHECK-NEXT: R_X86_64_REX_GOTPCRELX sub # CHECK-NEXT: R_X86_64_REX_GOTPCRELX xor +# CHECK-NEXT: R_X86_64_REX2_GOTPCRELX mov +# CHECK-NEXT: R_X86_64_REX2_GOTPCRELX test +# CHECK-NEXT: R_X86_64_REX2_GOTPCRELX adc +# CHECK-NEXT: R_X86_64_REX2_GOTPCRELX add +# CHECK-NEXT: R_X86_64_REX2_GOTPCRELX and +# CHECK-NEXT: R_X86_64_REX2_GOTPCRELX cmp +# CHECK-NEXT: R_X86_64_REX2_GOTPCRELX or +# CHECK-NEXT: R_X86_64_REX2_GOTPCRELX sbb +# CHECK-NEXT: R_X86_64_REX2_GOTPCRELX sub +# CHECK-NEXT: R_X86_64_REX2_GOTPCRELX xor # CHECK-NEXT: } # NORELAX-NEXT: R_X86_64_GOTPCREL mov @@ -71,6 +81,16 @@ # NORELAX-NEXT: R_X86_64_GOTPCREL sbb # NORELAX-NEXT: R_X86_64_GOTPCREL sub # NORELAX-NEXT: R_X86_64_GOTPCREL xor +# NORELAX-NEXT: R_X86_64_GOTPCREL mov +# NORELAX-NEXT: R_X86_64_GOTPCREL test +# NORELAX-NEXT: R_X86_64_GOTPCREL adc +# NORELAX-NEXT: R_X86_64_GOTPCREL add +# NORELAX-NEXT: R_X86_64_GOTPCREL and +# NORELAX-NEXT: R_X86_64_GOTPCREL cmp +# NORELAX-NEXT: R_X86_64_GOTPCREL or +# NORELAX-NEXT: R_X86_64_GOTPCREL sbb +# NORELAX-NEXT: R_X86_64_GOTPCREL sub +# NORELAX-NEXT: R_X86_64_GOTPCREL xor # NORELAX-NEXT: } movl mov@GOTPCREL(%rip), %eax @@ -108,10 +128,22 @@ sbb sbb@GOTPCREL(%rip), %rax sub sub@GOTPCREL(%rip), %rax xor xor@GOTPCREL(%rip), %rax +movq mov@GOTPCREL(%rip), %r16 +test %r16, test@GOTPCREL(%rip) +adc adc@GOTPCREL(%rip), %r16 +add add@GOTPCREL(%rip), %r16 +and and@GOTPCREL(%rip), %r16 +cmp cmp@GOTPCREL(%rip), %r16 +or or@GOTPCREL(%rip), %r16 +sbb sbb@GOTPCREL(%rip), %r16 +sub sub@GOTPCREL(%rip), %r16 +xor xor@GOTPCREL(%rip), %r16 + # COMMON-NEXT: Section ({{.*}}) .rela.norelax { # COMMON-NEXT: R_X86_64_GOTPCREL mov 0x0 # COMMON-NEXT: R_X86_64_GOTPCREL mov 0xFFFFFFFFFFFFFFFD # COMMON-NEXT: R_X86_64_GOTPCREL mov 0xFFFFFFFFFFFFFFFC +# 
COMMON-NEXT: R_X86_64_GOTPCREL mov 0xFFFFFFFFFFFFFFFD # COMMON-NEXT: } # COMMON-NEXT: ] @@ -123,3 +155,5 @@ movl mov@GOTPCREL+4(%rip), %eax movq mov@GOTPCREL+1(%rip), %rax ## We could emit R_X86_64_GOTPCRELX, but it is probably unnecessary. movl mov@GOTPCREL+0(%rip), %eax +## Don't emit R_X86_64_GOTPCRELX. +movq mov@GOTPCREL+1(%rip), %r16 diff --git a/llvm/test/MC/X86/reloc-directive-elf-64.s b/llvm/test/MC/X86/reloc-directive-elf-64.s index 8f5d8c895e7d76..323603efc70618 100644 --- a/llvm/test/MC/X86/reloc-directive-elf-64.s +++ b/llvm/test/MC/X86/reloc-directive-elf-64.s @@ -9,6 +9,7 @@ # PRINT-NEXT: .reloc 0, R_X86_64_64, .data+2 # PRINT-NEXT: .reloc 0, R_X86_64_GOTPCRELX, foo+3 # PRINT-NEXT: .reloc 0, R_X86_64_REX_GOTPCRELX, 5 +# PRINT-NEXT: .reloc 0, R_X86_64_REX2_GOTPCRELX, 7 # PRINT: .reloc 0, BFD_RELOC_NONE, 9 # PRINT-NEXT: .reloc 0, BFD_RELOC_8, 9 # PRINT-NEXT: .reloc 0, BFD_RELOC_16, 9 @@ -21,6 +22,7 @@ # CHECK-NEXT: 0x0 R_X86_64_64 .data 0x2 # CHECK-NEXT: 0x0 R_X86_64_GOTPCRELX foo 0x3 # CHECK-NEXT: 0x0 R_X86_64_REX_GOTPCRELX - 0x5 +# CHECK-NEXT: 0x0 R_X86_64_REX2_GOTPCRELX - 0x7 # CHECK-NEXT: 0x0 R_X86_64_NONE - 0x9 # CHECK-NEXT: 0x0 R_X86_64_8 - 0x9 # CHECK-NEXT: 0x0 R_X86_64_16 - 0x9 @@ -37,6 +39,7 @@ .reloc 0, R_X86_64_64, .data+2 .reloc 0, R_X86_64_GOTPCRELX, foo+3 .reloc 0, R_X86_64_REX_GOTPCRELX, 5 + .reloc 0, R_X86_64_REX2_GOTPCRELX, 7 .reloc 0, BFD_RELOC_NONE, 9 .reloc 0, BFD_RELOC_8, 9 diff --git a/llvm/test/Transforms/LoopVectorize/float-induction.ll b/llvm/test/Transforms/LoopVectorize/float-induction.ll index 9091b2c80fb97c..cedaf019a958bd 100644 --- a/llvm/test/Transforms/LoopVectorize/float-induction.ll +++ b/llvm/test/Transforms/LoopVectorize/float-induction.ll @@ -1640,3 +1640,112 @@ for.inc: for.end: ret void } + +define i32 @float_induction_with_dbg_on_fadd(ptr %dst) { +; VEC4_INTERL1-LABEL: @float_induction_with_dbg_on_fadd( +; VEC4_INTERL1-NEXT: entry: +; VEC4_INTERL1-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; VEC4_INTERL1: vector.ph: +; VEC4_INTERL1-NEXT: br label [[VECTOR_BODY:%.*]] +; VEC4_INTERL1: vector.body: +; VEC4_INTERL1-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; VEC4_INTERL1-NEXT: [[TMP0:%.*]] = getelementptr float, ptr null, i64 [[INDEX]] +; VEC4_INTERL1-NEXT: store <4 x float> poison, ptr [[TMP0]], align 8 +; VEC4_INTERL1-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; VEC4_INTERL1-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 +; VEC4_INTERL1-NEXT: br i1 [[TMP1]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; VEC4_INTERL1: middle.block: +; VEC4_INTERL1-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; VEC4_INTERL1: scalar.ph: +; VEC4_INTERL1-NEXT: br label [[LOOP:%.*]] +; VEC4_INTERL1: loop: +; VEC4_INTERL1-NEXT: br i1 poison, label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP16:![0-9]+]] +; VEC4_INTERL1: exit: +; VEC4_INTERL1-NEXT: ret i32 0 +; +; VEC4_INTERL2-LABEL: @float_induction_with_dbg_on_fadd( +; VEC4_INTERL2-NEXT: entry: +; VEC4_INTERL2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; VEC4_INTERL2: vector.ph: +; VEC4_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] +; VEC4_INTERL2: vector.body: +; VEC4_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; VEC4_INTERL2-NEXT: [[TMP0:%.*]] = getelementptr float, ptr null, i64 [[INDEX]] +; VEC4_INTERL2-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i64 16 +; VEC4_INTERL2-NEXT: store <4 
x float> poison, ptr [[TMP0]], align 8 +; VEC4_INTERL2-NEXT: store <4 x float> zeroinitializer, ptr [[TMP1]], align 8 +; VEC4_INTERL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; VEC4_INTERL2-NEXT: [[TMP2:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 +; VEC4_INTERL2-NEXT: br i1 [[TMP2]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; VEC4_INTERL2: middle.block: +; VEC4_INTERL2-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; VEC4_INTERL2: scalar.ph: +; VEC4_INTERL2-NEXT: br label [[LOOP:%.*]] +; VEC4_INTERL2: loop: +; VEC4_INTERL2-NEXT: br i1 poison, label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP16:![0-9]+]] +; VEC4_INTERL2: exit: +; VEC4_INTERL2-NEXT: ret i32 0 +; +; VEC1_INTERL2-LABEL: @float_induction_with_dbg_on_fadd( +; VEC1_INTERL2-NEXT: entry: +; VEC1_INTERL2-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; VEC1_INTERL2: vector.ph: +; VEC1_INTERL2-NEXT: br label [[VECTOR_BODY:%.*]] +; VEC1_INTERL2: vector.body: +; VEC1_INTERL2-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; VEC1_INTERL2-NEXT: [[TMP0:%.*]] = or disjoint i64 [[INDEX]], 1 +; VEC1_INTERL2-NEXT: [[TMP1:%.*]] = getelementptr float, ptr null, i64 [[INDEX]] +; VEC1_INTERL2-NEXT: [[TMP2:%.*]] = getelementptr float, ptr null, i64 [[TMP0]] +; VEC1_INTERL2-NEXT: store float poison, ptr [[TMP1]], align 8 +; VEC1_INTERL2-NEXT: store float poison, ptr [[TMP2]], align 8 +; VEC1_INTERL2-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VEC1_INTERL2-NEXT: [[TMP3:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 +; VEC1_INTERL2-NEXT: br i1 [[TMP3]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; VEC1_INTERL2: middle.block: +; VEC1_INTERL2-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; VEC1_INTERL2: scalar.ph: +; VEC1_INTERL2-NEXT: br label [[LOOP:%.*]] +; VEC1_INTERL2: loop: +; VEC1_INTERL2-NEXT: br i1 poison, label [[EXIT]], label [[LOOP]], !llvm.loop [[LOOP16:![0-9]+]] +; VEC1_INTERL2: exit: +; VEC1_INTERL2-NEXT: ret i32 0 +; +; VEC2_INTERL1_PRED_STORE-LABEL: @float_induction_with_dbg_on_fadd( +; VEC2_INTERL1_PRED_STORE-NEXT: entry: +; VEC2_INTERL1_PRED_STORE-NEXT: br label [[VECTOR_BODY:%.*]] +; VEC2_INTERL1_PRED_STORE: vector.body: +; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP0:%.*]] = getelementptr float, ptr null, i64 [[INDEX]] +; VEC2_INTERL1_PRED_STORE-NEXT: store <2 x float> poison, ptr [[TMP0]], align 8 +; VEC2_INTERL1_PRED_STORE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 2 +; VEC2_INTERL1_PRED_STORE-NEXT: [[TMP1:%.*]] = icmp eq i64 [[INDEX_NEXT]], 200 +; VEC2_INTERL1_PRED_STORE-NEXT: br i1 [[TMP1]], label [[EXIT:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; VEC2_INTERL1_PRED_STORE: exit: +; VEC2_INTERL1_PRED_STORE-NEXT: ret i32 0 +; +entry: + br label %loop + +loop: + %fp.iv = phi float [ 0.000000e+00, %entry ], [ %fp.iv.next, %loop ], !dbg !4 + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ] + %fp.iv.next = fadd reassoc float %fp.iv, 0.000000e+00 + %gep = getelementptr float, ptr null, i64 %iv + store float %fp.iv.next, ptr %gep, align 8 + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 200 + br i1 %exitcond.not, label %exit, label %loop + +exit: + ret i32 0 +} + +!llvm.module.flags = !{!3} + +!0 = distinct !DICompileUnit(language: DW_LANG_C11, file: !1) +!1 = !DIFile(filename: "bbi-99425.c", directory: "/tmp") +!2 
= !{} +!3 = !{i32 2, !"Debug Info Version", i32 3} +!4 = !DILocation(line: 5, column: 12, scope: !8) +!8 = distinct !DISubprogram(name: "main", scope: !1, file: !1, line: 3, type: !9, unit: !0, retainedNodes: !2) +!9 = !DISubroutineType(types: !2) diff --git a/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll b/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll index 51fad820509393..61e349e01ed91d 100644 --- a/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll +++ b/llvm/test/Transforms/MemCpyOpt/fca2memcpy.ll @@ -141,4 +141,19 @@ define void @throwing_call(ptr noalias %src, ptr %dst) { ret void } +define void @loop_memoryphi(ptr %a, ptr %b) { +; CHECK-LABEL: @loop_memoryphi( +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: call void @llvm.memmove.p0.p0.i64(ptr align 8 [[B:%.*]], ptr align 8 [[A:%.*]], i64 16, i1 false) +; CHECK-NEXT: br label [[LOOP]] +; + br label %loop + +loop: + %v = load { i64, i64 }, ptr %a + store { i64, i64 } %v, ptr %b + br label %loop +} + declare void @call() diff --git a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll index edc0381aa3fcc2..6dceabe1d3243b 100644 --- a/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll +++ b/llvm/test/Transforms/SLPVectorizer/AArch64/reduce-fadd.ll @@ -1,8 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux \ -; RUN: -mattr=-fullfp16 | FileCheck %s --check-prefixes=CHECK,NOFP16 -; RUN: opt < %s -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux \ -; RUN: -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,FP16 +; RUN: opt < %s -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux -mattr=-fullfp16 | FileCheck %s --check-prefixes=CHECK,NOFP16 +; RUN: opt < %s -S -passes=slp-vectorizer -mtriple=aarch64-unknown-linux -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK,FULLFP16 define half @reduce_fast_half2(<2 x half> %vec2) { ; CHECK-LABEL: define half @reduce_fast_half2( @@ -79,20 +77,26 @@ entry: } define half @reduce_fast_half8(<8 x half> %vec8) { -; CHECK-LABEL: define half @reduce_fast_half8( -; CHECK-SAME: <8 x half> [[VEC8:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[ELT4:%.*]] = extractelement <8 x half> [[VEC8]], i64 4 -; CHECK-NEXT: [[ELT5:%.*]] = extractelement <8 x half> [[VEC8]], i64 5 -; CHECK-NEXT: [[ELT6:%.*]] = extractelement <8 x half> [[VEC8]], i64 6 -; CHECK-NEXT: [[ELT7:%.*]] = extractelement <8 x half> [[VEC8]], i64 7 -; CHECK-NEXT: [[TMP0:%.*]] = shufflevector <8 x half> [[VEC8]], <8 x half> poison, <4 x i32> -; CHECK-NEXT: [[TMP1:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[TMP0]]) -; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast half [[TMP1]], [[ELT4]] -; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast half [[ELT5]], [[ELT6]] -; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast half [[OP_RDX]], [[OP_RDX1]] -; CHECK-NEXT: [[OP_RDX3:%.*]] = fadd fast half [[OP_RDX2]], [[ELT7]] -; CHECK-NEXT: ret half [[OP_RDX3]] +; NOFP16-LABEL: define half @reduce_fast_half8( +; NOFP16-SAME: <8 x half> [[VEC8:%.*]]) #[[ATTR0]] { +; NOFP16-NEXT: [[ENTRY:.*:]] +; NOFP16-NEXT: [[ELT4:%.*]] = extractelement <8 x half> [[VEC8]], i64 4 +; NOFP16-NEXT: [[ELT5:%.*]] = extractelement <8 x half> [[VEC8]], i64 5 +; NOFP16-NEXT: [[ELT6:%.*]] = extractelement <8 x half> [[VEC8]], i64 6 +; NOFP16-NEXT: [[ELT7:%.*]] = extractelement <8 x half> [[VEC8]], i64 7 +; NOFP16-NEXT: [[TMP0:%.*]] = 
shufflevector <8 x half> [[VEC8]], <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; NOFP16-NEXT:    [[TMP1:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[TMP0]])
+; NOFP16-NEXT:    [[OP_RDX:%.*]] = fadd fast half [[TMP1]], [[ELT4]]
+; NOFP16-NEXT:    [[OP_RDX1:%.*]] = fadd fast half [[ELT5]], [[ELT6]]
+; NOFP16-NEXT:    [[OP_RDX2:%.*]] = fadd fast half [[OP_RDX]], [[OP_RDX1]]
+; NOFP16-NEXT:    [[OP_RDX3:%.*]] = fadd fast half [[OP_RDX2]], [[ELT7]]
+; NOFP16-NEXT:    ret half [[OP_RDX3]]
+;
+; FULLFP16-LABEL: define half @reduce_fast_half8(
+; FULLFP16-SAME: <8 x half> [[VEC8:%.*]]) #[[ATTR0]] {
+; FULLFP16-NEXT:  [[ENTRY:.*:]]
+; FULLFP16-NEXT:    [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v8f16(half 0xH0000, <8 x half> [[VEC8]])
+; FULLFP16-NEXT:    ret half [[TMP0]]
 ;
 entry:
   %elt0 = extractelement <8 x half> %vec8, i64 0
@@ -154,37 +158,11 @@ entry:
 }
 
 define half @reduce_fast_half16(<16 x half> %vec16) {
-; NOFP16-LABEL: define half @reduce_fast_half16(
-; NOFP16-SAME: <16 x half> [[VEC16:%.*]]) #[[ATTR0]] {
-; NOFP16-NEXT:  [[ENTRY:.*:]]
-; NOFP16-NEXT:    [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> [[VEC16]])
-; NOFP16-NEXT:    ret half [[TMP0]]
-;
-; FP16-LABEL: define half @reduce_fast_half16(
-; FP16-SAME: <16 x half> [[VEC16:%.*]]) #[[ATTR0]] {
-; FP16-NEXT:  [[ENTRY:.*:]]
-; FP16-NEXT:    [[ELT4:%.*]] = extractelement <16 x half> [[VEC16]], i64 4
-; FP16-NEXT:    [[ELT5:%.*]] = extractelement <16 x half> [[VEC16]], i64 5
-; FP16-NEXT:    [[ELT6:%.*]] = extractelement <16 x half> [[VEC16]], i64 6
-; FP16-NEXT:    [[ELT7:%.*]] = extractelement <16 x half> [[VEC16]], i64 7
-; FP16-NEXT:    [[ELT12:%.*]] = extractelement <16 x half> [[VEC16]], i64 12
-; FP16-NEXT:    [[ELT13:%.*]] = extractelement <16 x half> [[VEC16]], i64 13
-; FP16-NEXT:    [[ELT14:%.*]] = extractelement <16 x half> [[VEC16]], i64 14
-; FP16-NEXT:    [[ELT15:%.*]] = extractelement <16 x half> [[VEC16]], i64 15
-; FP16-NEXT:    [[TMP0:%.*]] = shufflevector <16 x half> [[VEC16]], <16 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
-; FP16-NEXT:    [[TMP1:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[TMP0]])
-; FP16-NEXT:    [[TMP2:%.*]] = shufflevector <16 x half> [[VEC16]], <16 x half> poison, <4 x i32> <i32 8, i32 9, i32 10, i32 11>
-; FP16-NEXT:    [[TMP3:%.*]] = call fast half @llvm.vector.reduce.fadd.v4f16(half 0xH0000, <4 x half> [[TMP2]])
-; FP16-NEXT:    [[OP_RDX:%.*]] = fadd fast half [[TMP1]], [[TMP3]]
-; FP16-NEXT:    [[OP_RDX1:%.*]] = fadd fast half [[OP_RDX]], [[ELT4]]
-; FP16-NEXT:    [[OP_RDX2:%.*]] = fadd fast half [[ELT5]], [[ELT6]]
-; FP16-NEXT:    [[OP_RDX3:%.*]] = fadd fast half [[ELT7]], [[ELT12]]
-; FP16-NEXT:    [[OP_RDX4:%.*]] = fadd fast half [[ELT13]], [[ELT14]]
-; FP16-NEXT:    [[OP_RDX5:%.*]] = fadd fast half [[OP_RDX1]], [[OP_RDX2]]
-; FP16-NEXT:    [[OP_RDX6:%.*]] = fadd fast half [[OP_RDX3]], [[OP_RDX4]]
-; FP16-NEXT:    [[OP_RDX7:%.*]] = fadd fast half [[OP_RDX5]], [[OP_RDX6]]
-; FP16-NEXT:    [[OP_RDX8:%.*]] = fadd fast half [[OP_RDX7]], [[ELT15]]
-; FP16-NEXT:    ret half [[OP_RDX8]]
+; CHECK-LABEL: define half @reduce_fast_half16(
+; CHECK-SAME: <16 x half> [[VEC16:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  [[ENTRY:.*:]]
+; CHECK-NEXT:    [[TMP0:%.*]] = call fast half @llvm.vector.reduce.fadd.v16f16(half 0xH0000, <16 x half> [[VEC16]])
+; CHECK-NEXT:    ret half [[TMP0]]
 ;
 entry:
   %elt0 = extractelement <16 x half> %vec16, i64 0
@@ -512,19 +490,11 @@ define float @reduce_fast_float_case1(ptr %a) {
 ; CHECK-LABEL: define float @reduce_fast_float_case1(
 ; CHECK-SAME: ptr [[A:%.*]]) #[[ATTR0]] {
 ; CHECK-NEXT:  [[ENTRY:.*:]]
-; CHECK-NEXT:    
[[LOAD:%.*]] = load float, ptr [[A]], align 4 -; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 4 -; CHECK-NEXT: [[LOAD1:%.*]] = load float, ptr [[GEP]], align 4 -; CHECK-NEXT: [[ADD1:%.*]] = fadd fast float [[LOAD1]], [[LOAD]] -; CHECK-NEXT: [[GEP2:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 8 -; CHECK-NEXT: [[LOAD2:%.*]] = load float, ptr [[GEP2]], align 4 -; CHECK-NEXT: [[ADD2:%.*]] = fadd fast float [[LOAD2]], [[ADD1]] -; CHECK-NEXT: [[GEP3:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 12 -; CHECK-NEXT: [[LOAD3:%.*]] = load float, ptr [[GEP3]], align 4 -; CHECK-NEXT: [[ADD3:%.*]] = fadd fast float [[LOAD3]], [[ADD2]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 4 ; CHECK-NEXT: [[GEP4:%.*]] = getelementptr inbounds i8, ptr [[A]], i64 16 ; CHECK-NEXT: [[LOAD4:%.*]] = load float, ptr [[GEP4]], align 4 -; CHECK-NEXT: [[ADD4:%.*]] = fadd fast float [[LOAD4]], [[ADD3]] +; CHECK-NEXT: [[TMP1:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP0]]) +; CHECK-NEXT: [[ADD4:%.*]] = fadd fast float [[TMP1]], [[LOAD4]] ; CHECK-NEXT: ret float [[ADD4]] ; entry: @@ -586,24 +556,11 @@ define float @reduce_fast_float_case2(ptr %a, ptr %b) { ; CHECK-LABEL: define float @reduce_fast_float_case2( ; CHECK-SAME: ptr [[A:%.*]], ptr [[B:%.*]]) #[[ATTR0]] { ; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[GEPA2:%.*]] = getelementptr inbounds float, ptr [[A]], i32 2 -; CHECK-NEXT: [[GEPA3:%.*]] = getelementptr inbounds float, ptr [[A]], i32 3 -; CHECK-NEXT: [[GEPB2:%.*]] = getelementptr inbounds float, ptr [[B]], i32 2 -; CHECK-NEXT: [[GEPB3:%.*]] = getelementptr inbounds float, ptr [[B]], i32 3 -; CHECK-NEXT: [[LOADA2:%.*]] = load float, ptr [[GEPA2]], align 4 -; CHECK-NEXT: [[LOADA3:%.*]] = load float, ptr [[GEPA3]], align 4 -; CHECK-NEXT: [[LOADB2:%.*]] = load float, ptr [[GEPB2]], align 4 -; CHECK-NEXT: [[LOADB3:%.*]] = load float, ptr [[GEPB3]], align 4 -; CHECK-NEXT: [[TMP0:%.*]] = load <2 x float>, ptr [[A]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load <2 x float>, ptr [[B]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = fadd fast <2 x float> [[TMP0]], [[TMP1]] -; CHECK-NEXT: [[ADD2:%.*]] = fadd fast float [[LOADA3]], [[LOADB2]] -; CHECK-NEXT: [[ADD3:%.*]] = fadd fast float [[LOADA2]], [[LOADB3]] -; CHECK-NEXT: [[TMP3:%.*]] = extractelement <2 x float> [[TMP2]], i32 0 -; CHECK-NEXT: [[TMP4:%.*]] = extractelement <2 x float> [[TMP2]], i32 1 -; CHECK-NEXT: [[RED1:%.*]] = fadd fast float [[TMP3]], [[TMP4]] -; CHECK-NEXT: [[RED2:%.*]] = fadd fast float [[ADD2]], [[RED1]] -; CHECK-NEXT: [[RED3:%.*]] = fadd fast float [[ADD3]], [[RED2]] +; CHECK-NEXT: [[TMP0:%.*]] = load <4 x float>, ptr [[A]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[B]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> poison, <4 x float> [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = call <8 x float> @llvm.vector.insert.v8f32.v4f32(<8 x float> [[TMP2]], <4 x float> [[TMP0]], i64 4) +; CHECK-NEXT: [[RED3:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP3]]) ; CHECK-NEXT: ret float [[RED3]] ; entry: diff --git a/llvm/test/tools/llvm-reduce/mir/preserve-func-info.mir b/llvm/test/tools/llvm-reduce/mir/preserve-func-info.mir index 5f11cea89d7e7b..f735dfd5cbbf01 100644 --- a/llvm/test/tools/llvm-reduce/mir/preserve-func-info.mir +++ b/llvm/test/tools/llvm-reduce/mir/preserve-func-info.mir @@ -14,6 +14,9 @@ # RESULT-NEXT: failedISel: true # RESULT-NEXT: 
tracksRegLiveness: true
 # RESULT-NEXT: hasWinCFI: true
+# RESULT-NEXT: noPhis: false
+# RESULT-NEXT: isSSA: false
+# RESULT-NEXT: noVRegs: false
 # RESULT-NEXT: callsEHReturn: true
 # RESULT-NEXT: callsUnwindInit: true
 # RESULT-NEXT: hasEHCatchret: true
@@ -41,6 +44,9 @@ selected: true
 failedISel: true
 tracksRegLiveness: true
 hasWinCFI: true
+noPhis: false
+isSSA: false
+noVRegs: false
 failsVerification: true
 tracksDebugUserValues: true
 callsEHReturn: true
diff --git a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
index dd4af4e98832f7..f83efbd3558025 100644
--- a/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
+++ b/llvm/utils/gn/secondary/llvm/lib/Target/AMDGPU/BUILD.gn
@@ -152,7 +152,6 @@ static_library("LLVMAMDGPUCodeGen") {
     "AMDGPUISelLowering.cpp",
     "AMDGPUImageIntrinsicOptimizer.cpp",
     "AMDGPUInsertDelayAlu.cpp",
-    "AMDGPUInsertSingleUseVDST.cpp",
     "AMDGPUInstCombineIntrinsic.cpp",
     "AMDGPUInstrInfo.cpp",
     "AMDGPUInstructionSelector.cpp",
diff --git a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
index cafc3d91fd1e9d..3170115883e2be 100644
--- a/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
+++ b/mlir/include/mlir/Dialect/Tensor/IR/TensorOps.td
@@ -1814,7 +1814,7 @@ def Tensor_SplatOp : Tensor_Op<"splat", [
 }
 
 //===----------------------------------------------------------------------===//
-// PackOp
+// RelayoutOp
//===----------------------------------------------------------------------===//
 
 class Tensor_RelayoutOp<string mnemonic, list<Trait> traits = []> :
@@ -1851,11 +1851,27 @@ class Tensor_RelayoutOp<string mnemonic, list<Trait> traits = []> :
     /// a sentinel `kDynamic` is introduced at that position in
     /// the returned vector.
     SmallVector<OpFoldResult> getStaticTiles();
+
+    /// Retrieve all outer dims for this Pack/UnPack Op, i.e. all the leading
+    /// dims excluding the trailing dims corresponding to `innerTiles`. Note
+    /// that this will include both tiled and non-tiled dimensions. The order
+    /// of the output dimensions is consistent with the shape of the packed
+    /// tensor.
+    ArrayRef<int64_t> getAllOuterDims();
+
+    /// Similar to `getAllOuterDims`, but only retrieve the outer dims that
+    /// have been tiled. Also, the order of the output dimensions is consistent
+    /// with `inner_dims_pos` rather than the packed tensor.
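+    ///
+    /// Worked example (illustrative, not in the upstream comment): packing
+    /// `tensor<3x20xf32>` with `inner_dims_pos = [1]` and `inner_tiles = [4]`
+    /// produces `tensor<3x5x4xf32>`. `getAllOuterDims` returns `[3, 5]`,
+    /// i.e. every leading dim of the packed tensor, whereas
+    /// `getTiledOuterDims` returns just `[5]`, the outer dim created by
+    /// tiling source dim 1.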
+    SmallVector<int64_t> getTiledOuterDims();
   }];
 
   let hasVerifier = 1;
 }
 
+//===----------------------------------------------------------------------===//
+// PackOp
+//===----------------------------------------------------------------------===//
+
 def Tensor_PackOp : Tensor_RelayoutOp<"pack", [
     AttrSizedOperandSegments]> {
   let summary = "tensor pack operation";
diff --git a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
index 77f0ea9d2236ea..e0dea8e78d55c1 100644
--- a/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
+++ b/mlir/lib/Dialect/Linalg/Transforms/Transforms.cpp
@@ -1030,11 +1030,13 @@ static Value getPackOpSourceOrPaddedSource(OpBuilder &builder,
     return input;
   }
 
+  assert(llvm::all_of(packOp.getAllOuterDims(),
+                      [](int64_t val) { return val == 1; }) &&
+         "some outer dims are != 1");
+
   Location loc = packOp.getLoc();
   ShapedType inputType = packOp.getSourceType();
   int64_t inputRank = inputType.getRank();
-  assert(llvm::all_of(packOp.getDestType().getShape().take_front(inputRank),
-                      [](int64_t val) { return val == 1; }));
 
   SmallVector<int64_t> paddedShape;
   DenseMap<int64_t, OpFoldResult> tileAndPosMapping =
@@ -1126,12 +1128,8 @@ LogicalResult GeneralizeOuterUnitDimsPackOpPattern::matchAndRewrite(
 
   // TODO: support the case that outer dimensions are not all 1s. A
   // tensor.expand_shape will be generated in this case.
-  auto innerDimsPos = packOp.getInnerDimsPos();
-  int64_t srcRank = packOp.getSourceRank();
-  auto destShape = packOp.getDestType().getShape();
-  if (llvm::any_of(innerDimsPos, [destShape](int64_t index) {
-        return destShape[index] != 1;
-      })) {
+  if (llvm::any_of(packOp.getTiledOuterDims(),
+                   [](int64_t dim) { return dim != 1; })) {
     return rewriter.notifyMatchFailure(
         packOp,
         "require the tiled outer dimensions of the result are all 1s");
@@ -1145,6 +1143,7 @@ LogicalResult GeneralizeOuterUnitDimsPackOpPattern::matchAndRewrite(
       packOp.getDimAndTileMapping();
   Attribute zeroIdxAttr = rewriter.getIndexAttr(0);
   Attribute oneIdxAttr = rewriter.getIndexAttr(1);
+  int64_t srcRank = packOp.getSourceRank();
   SmallVector<OpFoldResult> readOffsets(srcRank, zeroIdxAttr);
   SmallVector<OpFoldResult> readStrides(srcRank, oneIdxAttr);
   SmallVector<OpFoldResult> readSizes;
@@ -1173,9 +1172,8 @@ LogicalResult GeneralizeOuterUnitDimsPackOpPattern::matchAndRewrite(
       loc, readType, input, readOffsets, readSizes, readStrides);
 
   // 2. Transpose the tile to match the inner tile order.
-
   SmallVector<int64_t> perm = getPackUnpackRankReducedPerm(
-      inputShape, innerDimsPos, packOp.getOuterDimsPerm());
+      inputShape, packOp.getInnerDimsPos(), packOp.getOuterDimsPerm());
 
   LLVM_DEBUG(DBGS() << "Pack permutation: " << packOp << "\n";
              llvm::interleaveComma(perm, DBGS() << "perm: "); DBGSNL(););
@@ -1208,9 +1206,8 @@ LogicalResult GeneralizeOuterUnitDimsUnPackOpPattern::matchAndRewrite(
   int64_t destRank = unpackOp.getDestRank();
   ArrayRef<int64_t> srcShape = unpackOp.getSourceType().getShape();
   ArrayRef<int64_t> innerDimsPos = unpackOp.getInnerDimsPos();
-  if (llvm::any_of(innerDimsPos, [srcShape](int64_t index) {
-        return srcShape[index] != 1;
-      })) {
+  if (llvm::any_of(unpackOp.getTiledOuterDims(),
+                   [](int64_t dim) { return dim != 1; })) {
     return rewriter.notifyMatchFailure(
         unpackOp,
         "require the tiled outer dimensions of the result are all 1s");
diff --git a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
index 47f540e092e990..1ac96756e22b5e 100644
--- a/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
+++ b/mlir/lib/Dialect/Tensor/IR/TensorOps.cpp
@@ -3987,6 +3987,23 @@ SmallVector<OpFoldResult> PackOp::getStaticTiles() {
   return getStaticTilesImpl(*this);
 }
 
+ArrayRef<int64_t> PackOp::getAllOuterDims() {
+  ShapedType inputType = getSourceType();
+  int64_t inputRank = inputType.getRank();
+  return getDestType().getShape().take_front(inputRank);
+}
+
+SmallVector<int64_t> PackOp::getTiledOuterDims() {
+  auto innerDimsPos = getInnerDimsPos();
+  auto packedShape = getDestType().getShape();
+  SmallVector<int64_t> res;
+
+  for (auto index : innerDimsPos)
+    res.push_back(packedShape[index]);
+
+  return res;
+}
+
 bool PackOp::requirePaddingValue(ArrayRef<int64_t> inputShape,
                                  ArrayRef<int64_t> innerDimsPos,
                                  ArrayRef<int64_t> outputShape,
@@ -4411,6 +4428,23 @@ SmallVector<OpFoldResult> UnPackOp::getStaticTiles() {
   return getStaticTilesImpl(*this);
 }
 
+ArrayRef<int64_t> UnPackOp::getAllOuterDims() {
+  ShapedType destType = getDestType();
+  int64_t destRank = destType.getRank();
+  return getSourceType().getShape().take_front(destRank);
+}
+
+SmallVector<int64_t> UnPackOp::getTiledOuterDims() {
+  auto innerDimsPos = getInnerDimsPos();
+  auto packedShape = getSourceType().getShape();
+  SmallVector<int64_t> res;
+
+  for (auto index : innerDimsPos)
+    res.push_back(packedShape[index]);
+
+  return res;
+}
+
 LogicalResult UnPackOp::verify() {
   return commonVerifierPackAndUnPackOp(*this);
 }
diff --git a/utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl b/utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl
index d788705fc3e604..16845ab66dfd45 100644
--- a/utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl
+++ b/utils/bazel/llvm-project-overlay/libc/test/src/math/libc_math_test_rules.bzl
@@ -28,7 +28,9 @@ def math_test(name, hdrs = [], deps = [], **kwargs):
             "//libc:__support_cpp_algorithm",
             "//libc:__support_cpp_bit",
             "//libc:__support_cpp_limits",
+            "//libc:__support_cpp_type_traits",
             "//libc:__support_fputil_basic_operations",
+            "//libc:__support_fputil_cast",
             "//libc:__support_fputil_fenv_impl",
             "//libc:__support_fputil_fp_bits",
             "//libc:__support_fputil_manipulation_functions",
@@ -36,6 +38,7 @@ def math_test(name, hdrs = [], deps = [], **kwargs):
             "//libc:__support_fputil_normal_float",
             "//libc:__support_macros_properties_architectures",
             "//libc:__support_macros_properties_os",
+            "//libc:__support_macros_properties_types",
             "//libc:__support_math_extras",
             "//libc:__support_uint128",
             "//libc:hdr_errno_macros",